Here's my solution, for anyone interested. This is basically a hacked version of the .NET StreamReader code. It adds two public properties, LineLength and BytesRead. LineLength returns the actual length of the current record, including its line-termination characters. BytesRead returns the actual number of bytes read (regardless of the buffering mechanism). This value is suitable for passing into .BaseStream.Seek().
It could be better. The real StreamReader code calls some methods that are internal to the core .NET libraries, so I worked around those as best I could. If anyone would like to post improvements, they are welcome to do so.
using System;
using System.Text;
using System.Runtime.InteropServices;
using System.IO;
namespace TGREER
{
[Serializable()]
public class myStreamReader : TextReader
{
// Shared instance backed by NullmyStreamReader, whose overrides always report end of input.
public new static readonly myStreamReader Null = new NullmyStreamReader();
// Buffer size used by the constructor overloads that do not take a bufferSize argument.
internal const int DefaultBufferSize = 1024; // Byte buffer size
// Buffer size handed to the FileStream opened by the path-based constructor.
private const int DefaultFileStreamBufferSize = 4096;
// Init raises any smaller requested buffer size to this value.
private const int MinBufferSize = 128;
// Underlying stream; Dispose sets it to null.
private Stream stream;
// Encoding currently in effect; DetectEncoding may replace it when it finds a byte order mark.
private Encoding encoding;
// Decoder obtained from the current encoding; recreated whenever the encoding changes
// and by DiscardBufferedData.
private Decoder decoder;
// Raw bytes from the most recent stream.Read call.
private byte[] byteBuffer;
// Characters decoded from byteBuffer, waiting to be handed to the caller.
private char[] charBuffer;
// Result of encoding.GetPreamble(), captured in Init.
private byte[] _preamble;
// Index in charBuffer of the next character to return.
private int charPos;
// Number of valid characters in charBuffer.
private int charLen;
// Number of valid bytes in byteBuffer.
private int byteLen;
// encoding.GetMaxCharCount for a full byteBuffer; also the allocated size of charBuffer.
private int _maxCharsPerBuffer;
// True while byte-order-mark detection is still pending.
private bool _detectEncoding;
// True while the leading bytes still have to be compared against _preamble.
private bool _checkPreamble;
// True when the last stream.Read returned fewer bytes than the buffer could hold.
private bool _isBlocked;
private int _lineLength;

/// <summary>
/// Length recorded by the most recent <c>ReadLine</c> call: the characters of the
/// returned line plus any line-terminator characters that call consumed.
/// <c>ReadLine</c> resets it to 0 on entry, so it is 0 after a call that returned null.
/// </summary>
public int LineLength
{
    get
    {
        return _lineLength;
    }
}
private int _bytesRead;

/// <summary>
/// Running total that <c>ReadLine</c> adds to every time it returns a line.
/// No other read method in this class updates it.
/// </summary>
public int BytesRead
{
    get
    {
        return _bytesRead;
    }
}
// Leaves every field at its default value (no stream, no buffers).
// NullmyStreamReader is constructed through this constructor.
internal myStreamReader() { }
/// <summary>
/// Creates a reader over <paramref name="stream"/> using UTF-8, with byte-order-mark
/// detection enabled and the default buffer size.
/// </summary>
public myStreamReader(Stream stream)
    : this(stream, Encoding.UTF8, detectEncodingFromByteOrderMarks: true, bufferSize: DefaultBufferSize)
{
}
/// <summary>
/// Creates a reader over <paramref name="stream"/> using UTF-8 and the default buffer
/// size; the caller chooses whether byte order marks are inspected.
/// </summary>
public myStreamReader(Stream stream, bool detectEncodingFromByteOrderMarks)
    : this(stream, Encoding.UTF8, detectEncodingFromByteOrderMarks, bufferSize: DefaultBufferSize)
{
}
/// <summary>
/// Creates a reader over <paramref name="stream"/> using <paramref name="encoding"/>,
/// with byte-order-mark detection enabled and the default buffer size.
/// </summary>
public myStreamReader(Stream stream, Encoding encoding)
    : this(stream, encoding, detectEncodingFromByteOrderMarks: true, bufferSize: DefaultBufferSize)
{
}
/// <summary>
/// Creates a reader over <paramref name="stream"/> using <paramref name="encoding"/>
/// and the default buffer size; the caller chooses whether byte order marks are inspected.
/// </summary>
public myStreamReader(Stream stream, Encoding encoding, bool detectEncodingFromByteOrderMarks)
    : this(stream, encoding, detectEncodingFromByteOrderMarks, bufferSize: DefaultBufferSize)
{
}
/// <summary>
/// Creates a reader over an existing stream. All other Stream-based overloads
/// delegate to this constructor.
/// </summary>
/// <param name="stream">Stream to read from; must be readable.</param>
/// <param name="encoding">Encoding used to decode the bytes of the stream.</param>
/// <param name="detectEncodingFromByteOrderMarks">Whether a leading byte order mark may switch the encoding.</param>
/// <param name="bufferSize">Requested byte buffer size; must be positive.</param>
/// <exception cref="ArgumentNullException"><paramref name="stream"/> or <paramref name="encoding"/> is null.</exception>
/// <exception cref="ArgumentException"><paramref name="stream"/> does not support reading.</exception>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="bufferSize"/> is zero or negative.</exception>
public myStreamReader(Stream stream, Encoding encoding, bool detectEncodingFromByteOrderMarks, int bufferSize)
{
    if (stream == null)
    {
        throw new ArgumentNullException("stream");
    }
    if (encoding == null)
    {
        throw new ArgumentNullException("encoding");
    }
    // The messages are literals: looking them up as environment variables (as the
    // previous code did) yields null or whatever unrelated value the process
    // environment happens to contain.
    if (!stream.CanRead)
    {
        throw new ArgumentException("Stream was not readable.");
    }
    if (bufferSize <= 0)
    {
        throw new ArgumentOutOfRangeException("bufferSize", "Positive number required.");
    }
    Init(stream, encoding, detectEncodingFromByteOrderMarks, bufferSize);
}
/// <include file='doc\myStreamReader.uex' path='docs/doc[@for="myStreamReader.myStreamReader4"]/*' />
/// <summary>
/// Opens the file at <paramref name="path"/> using UTF-8, with byte-order-mark
/// detection enabled and the default buffer size.
/// </summary>
public myStreamReader(String path)
    : this(path, Encoding.UTF8, detectEncodingFromByteOrderMarks: true, bufferSize: DefaultBufferSize)
{
}
/// <include file='doc\myStreamReader.uex' path='docs/doc[@for="myStreamReader.myStreamReader9"]/*' />
/// <summary>
/// Opens the file at <paramref name="path"/> using UTF-8 and the default buffer size;
/// the caller chooses whether byte order marks are inspected.
/// </summary>
public myStreamReader(String path, bool detectEncodingFromByteOrderMarks)
    : this(path, Encoding.UTF8, detectEncodingFromByteOrderMarks, bufferSize: DefaultBufferSize)
{
}
/// <include file='doc\myStreamReader.uex' path='docs/doc[@for="myStreamReader.myStreamReader5"]/*' />
/// <summary>
/// Opens the file at <paramref name="path"/> using <paramref name="encoding"/>, with
/// byte-order-mark detection enabled and the default buffer size.
/// </summary>
public myStreamReader(String path, Encoding encoding)
    : this(path, encoding, detectEncodingFromByteOrderMarks: true, bufferSize: DefaultBufferSize)
{
}
/// <include file='doc\myStreamReader.uex' path='docs/doc[@for="myStreamReader.myStreamReader6"]/*' />
/// <summary>
/// Opens the file at <paramref name="path"/> using <paramref name="encoding"/> and the
/// default buffer size; the caller chooses whether byte order marks are inspected.
/// </summary>
public myStreamReader(String path, Encoding encoding, bool detectEncodingFromByteOrderMarks)
    : this(path, encoding, detectEncodingFromByteOrderMarks, bufferSize: DefaultBufferSize)
{
}
/// <include file='doc\myStreamReader.uex' path='docs/doc[@for="myStreamReader.myStreamReader7"]/*' />
/// <summary>
/// Opens the file at <paramref name="path"/> for shared read access and creates a
/// reader over it. All other path-based overloads delegate to this constructor.
/// </summary>
/// <param name="path">Path of the file to open; must be non-empty.</param>
/// <param name="encoding">Encoding used to decode the bytes of the file.</param>
/// <param name="detectEncodingFromByteOrderMarks">Whether a leading byte order mark may switch the encoding.</param>
/// <param name="bufferSize">Requested byte buffer size; must be positive.</param>
/// <exception cref="ArgumentNullException"><paramref name="path"/> or <paramref name="encoding"/> is null.</exception>
/// <exception cref="ArgumentException"><paramref name="path"/> is an empty string.</exception>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="bufferSize"/> is zero or negative.</exception>
public myStreamReader(String path, Encoding encoding, bool detectEncodingFromByteOrderMarks, int bufferSize)
{
    // Don't open a Stream before checking for invalid arguments,
    // or we'll create a FileStream on disk and we won't close it until
    // the finalizer runs, causing problems for applications.
    if (path == null)
    {
        throw new ArgumentNullException("path");
    }
    if (encoding == null)
    {
        throw new ArgumentNullException("encoding");
    }
    // The messages are literals: looking them up as environment variables (as the
    // previous code did) yields null or whatever unrelated value the process
    // environment happens to contain.
    if (path.Length == 0)
    {
        throw new ArgumentException("Empty path name is not legal.");
    }
    if (bufferSize <= 0)
    {
        throw new ArgumentOutOfRangeException("bufferSize", "Positive number required.");
    }

    Stream stream = new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.Read, DefaultFileStreamBufferSize);
    try
    {
        Init(stream, encoding, detectEncodingFromByteOrderMarks, bufferSize);
    }
    catch
    {
        // Construction failed after the file was opened (for example while
        // allocating the buffers). Nobody else holds this stream, so release
        // the file handle now instead of waiting for the finalizer.
        stream.Close();
        throw;
    }
}
// Shared constructor body: stores the stream and encoding, allocates the byte and
// char buffers, and primes the preamble / byte-order-mark detection flags.
private void Init(Stream stream, Encoding encoding, bool detectEncodingFromByteOrderMarks, int bufferSize)
{
    this.stream = stream;
    this.encoding = encoding;
    this.decoder = encoding.GetDecoder();

    // Never work with a byte buffer smaller than MinBufferSize.
    int effectiveSize = Math.Max(bufferSize, MinBufferSize);
    this.byteBuffer = new byte[effectiveSize];

    // The char buffer is sized so that one full byte buffer always fits once decoded.
    this._maxCharsPerBuffer = encoding.GetMaxCharCount(effectiveSize);
    this.charBuffer = new char[this._maxCharsPerBuffer];
    this.byteLen = 0;

    this._detectEncoding = detectEncodingFromByteOrderMarks;

    // Only look for a preamble if the encoding actually defines one.
    this._preamble = encoding.GetPreamble();
    this._checkPreamble = this._preamble.Length != 0;
    this._isBlocked = false;
}
/// <include file='doc\myStreamReader.uex' path='docs/doc[@for="myStreamReader.Close"]/*' />
/// <summary>
/// Closes the reader by running <c>Dispose(true)</c>.
/// </summary>
public override void Close()
{
    this.Dispose(true);
}
/// <include file='doc\myStreamReader.uex' path='docs/doc[@for="myStreamReader.Dispose"]/*' />
/// <summary>
/// Releases the reader's state. When <paramref name="disposing"/> is true the
/// underlying stream is closed as well; in either case the references to the stream,
/// encoding, decoder and buffers are dropped and the char position/length are reset.
/// </summary>
protected override void Dispose(bool disposing)
{
    // Nothing to release if the stream reference is already gone.
    if (stream != null)
    {
        if (disposing)
        {
            stream.Close();
        }

        stream = null;
        encoding = null;
        decoder = null;
        byteBuffer = null;
        charBuffer = null;
        charPos = 0;
        charLen = 0;
    }

    base.Dispose(disposing);
}
/// <include file='doc\myStreamReader.uex' path='docs/doc[@for="myStreamReader.CurrentEncoding"]/*' />
/// <summary>
/// Encoding currently used for decoding. This can differ from the encoding passed to
/// the constructor once byte-order-mark detection has replaced it; it is null after
/// the reader has been disposed.
/// </summary>
public virtual Encoding CurrentEncoding
{
    get
    {
        return encoding;
    }
}
/// <include file='doc\myStreamReader.uex' path='docs/doc[@for="myStreamReader.BaseStream"]/*' />
/// <summary>
/// The underlying stream, or null once the reader has been disposed.
/// </summary>
public virtual Stream BaseStream
{
    get
    {
        return stream;
    }
}
// DiscardBufferedData tells myStreamReader to throw away its internal
// buffer contents. This is useful if the user needs to seek on the
// underlying stream to a known location then wants the myStreamReader
// to start reading from this new point. This method should be called
// very sparingly, if ever, since it can lead to very poor performance.
// However, it may be the only way of handling some scenarios where
// users need to re-read the contents of a myStreamReader a second time.
/// <include file='doc\myStreamReader.uex' path='docs/doc[@for="myStreamReader.DiscardBufferedData"]/*' />
/// <summary>
/// Empties the internal byte and char buffers, replaces the decoder with a new one
/// from the current encoding, and clears the blocked flag.
/// </summary>
public void DiscardBufferedData()
{
    // Mark both buffers as empty.
    charPos = 0;
    charLen = 0;
    byteLen = 0;

    decoder = encoding.GetDecoder();
    _isBlocked = false;
}
/// <include file='doc\myStreamReader.uex' path='docs/doc[@for="myStreamReader.Peek"]/*' />
/// <summary>
/// Returns the next character without consuming it, or -1 when none is available.
/// When the char buffer is empty and the previous stream read came back short,
/// -1 is returned without touching the stream again.
/// </summary>
/// <exception cref="ObjectDisposedException">The reader has been closed.</exception>
public override int Peek()
{
    // A closed reader has no stream or buffers; fail with a descriptive exception
    // instead of the NullReferenceException the refill below would raise.
    if (stream == null)
    {
        throw new ObjectDisposedException(null, "Cannot read from a closed TextReader.");
    }
    if (charPos == charLen)
    {
        if (_isBlocked || ReadBuffer() == 0)
        {
            return -1;
        }
    }
    return charBuffer[charPos];
}
/// <summary>
/// Consumes and returns the next character, refilling the char buffer from the
/// stream when it is empty. Returns -1 at end of stream.
/// </summary>
/// <exception cref="ObjectDisposedException">The reader has been closed.</exception>
public override int Read()
{
    // A closed reader has no stream or buffers; fail with a descriptive exception
    // instead of the NullReferenceException the refill below would raise.
    if (stream == null)
    {
        throw new ObjectDisposedException(null, "Cannot read from a closed TextReader.");
    }
    if (charPos == charLen)
    {
        if (ReadBuffer() == 0)
        {
            return -1;
        }
    }
    return charBuffer[charPos++];
}
/// <summary>
/// Reads up to <paramref name="count"/> characters into <paramref name="buffer"/>
/// starting at <paramref name="index"/>.
/// </summary>
/// <returns>
/// The number of characters stored, which can be less than <paramref name="count"/>:
/// the loop stops at end of stream and also after any pass in which the last stream
/// read came back short. Returns 0 at end of stream.
/// </returns>
/// <exception cref="ObjectDisposedException">The reader has been closed.</exception>
/// <exception cref="ArgumentNullException"><paramref name="buffer"/> is null.</exception>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="index"/> or <paramref name="count"/> is negative.</exception>
/// <exception cref="ArgumentException">Fewer than <paramref name="count"/> slots remain in <paramref name="buffer"/> after <paramref name="index"/>.</exception>
public override int Read([In, Out] char[] buffer, int index, int count)
{
    // A closed reader has no stream or buffers; fail with a descriptive exception
    // instead of a NullReferenceException further down.
    if (stream == null)
    {
        throw new ObjectDisposedException(null, "Cannot read from a closed TextReader.");
    }
    // The messages below are literals: looking them up as environment variables
    // (as the previous code did) yields null or unrelated environment content.
    if (buffer == null)
    {
        throw new ArgumentNullException("buffer", "Buffer cannot be null.");
    }
    if (index < 0 || count < 0)
    {
        throw new ArgumentOutOfRangeException((index < 0 ? "index" : "count"), "Non-negative number required.");
    }
    if (buffer.Length - index < count)
    {
        throw new ArgumentException("Offset and length were out of bounds for the array or count is greater than the number of elements from index to the end of the source collection.");
    }

    int charsRead = 0;
    // As a perf optimization, if we had exactly one buffer's worth of
    // data read in, let's try writing directly to the user's buffer.
    bool readToUserBuffer = false;
    while (count > 0)
    {
        int n = charLen - charPos;
        if (n == 0)
        {
            n = ReadBuffer(buffer, index + charsRead, count, out readToUserBuffer);
        }
        if (n == 0)
        {
            break; // We're at EOF
        }
        if (n > count)
        {
            n = count;
        }
        if (!readToUserBuffer)
        {
            // Offsets and length are in bytes; a char occupies two bytes.
            Buffer.BlockCopy(charBuffer, charPos * 2, buffer, (index + charsRead) * 2, n * 2);
            charPos += n;
        }
        charsRead += n;
        count -= n;
        // This function shouldn't block for an indefinite amount of time,
        // or reading from a network stream won't work right. If we got
        // fewer bytes than we requested, then we want to break right here.
        if (_isBlocked)
        {
            break;
        }
    }
    return charsRead;
}
/// <include file='doc\myStreamReader.uex' path='docs/doc[@for="myStreamReader.ReadToEnd"]/*' />
/// <summary>
/// Reads every remaining character and returns them as one string
/// (an empty string when nothing is left).
/// </summary>
/// <exception cref="ObjectDisposedException">The reader has been closed.</exception>
public override String ReadToEnd()
{
    // A closed reader has no buffers; fail with a descriptive exception instead of
    // the NullReferenceException that charBuffer.Length would raise.
    if (stream == null)
    {
        throw new ObjectDisposedException(null, "Cannot read from a closed TextReader.");
    }
    // For performance, call Read(char[], int, int) with a buffer
    // as big as the myStreamReader's internal buffer, to get the
    // readToUserBuffer optimization.
    char[] chars = new char[charBuffer.Length];
    StringBuilder sb = new StringBuilder(charBuffer.Length);
    int len;
    while ((len = Read(chars, 0, chars.Length)) != 0)
    {
        sb.Append(chars, 0, len);
    }
    return sb.ToString();
}
// Trims n bytes from the front of the buffer.
private void CompressBuffer(int n)
{
    // Slide the bytes that follow the first n down to the start of byteBuffer,
    // then shrink the valid-byte count to match.
    int remaining = byteLen - n;
    Buffer.BlockCopy(byteBuffer, n, byteBuffer, 0, remaining);
    byteLen = remaining;
}
// returns whether the first array starts with the second array.
private static bool BytesMatch(byte[] buffer, byte[] compareTo)
{
    // Walk compareTo and stop at the first position where buffer differs.
    // An empty compareTo matches any buffer.
    int position = 0;
    while (position < compareTo.Length)
    {
        if (buffer[position] != compareTo[position])
        {
            return false;
        }
        position++;
    }
    return true;
}
// Looks at the start of byteBuffer for a UTF-16 (big or little endian) or UTF-8
// byte order mark. When one is found the reader switches to that encoding, drops
// the mark from the buffer and re-sizes the char buffer for the new encoding.
// With fewer than two bytes available nothing is decided; with exactly two bytes
// and no match, detection stays armed so it can run again.
private void DetectEncoding()
{
    if (byteLen < 2)
    {
        return;
    }

    byte first = byteBuffer[0];
    byte second = byteBuffer[1];
    Encoding detected = null;
    int markLength = 0;

    if (first == 0xFE && second == 0xFF)
    {
        // Big Endian Unicode
        detected = new UnicodeEncoding(true, true);
        markLength = 2;
    }
    else if (first == 0xFF && second == 0xFE)
    {
        // Little Endian Unicode
        detected = new UnicodeEncoding(false, true);
        markLength = 2;
    }
    else if (byteLen >= 3 && first == 0xEF && second == 0xBB && byteBuffer[2] == 0xBF)
    {
        // UTF-8
        detected = Encoding.UTF8;
        markLength = 3;
    }

    // Detection remains pending only when nothing matched and just two bytes were
    // available (too few to rule out the three-byte UTF-8 mark).
    _detectEncoding = (detected == null) && (byteLen == 2);

    if (detected == null)
    {
        return;
    }

    encoding = detected;
    decoder = encoding.GetDecoder();
    CompressBuffer(markLength);
    _maxCharsPerBuffer = encoding.GetMaxCharCount(byteBuffer.Length);
    charBuffer = new char[_maxCharsPerBuffer];
}
// Refills charBuffer from the stream. Any characters still buffered are discarded:
// charPos, charLen and byteLen are reset first. The loop keeps reading and decoding
// until at least one character has been produced or the stream returns 0 bytes.
// Returns the number of characters now available in charBuffer (0 at end of stream).
private int ReadBuffer()
{
charLen = 0;
byteLen = 0;
charPos = 0;
do
{
// Each pass reads into offset 0, replacing whatever byteBuffer held before.
byteLen = stream.Read(byteBuffer, 0, byteBuffer.Length);
if (byteLen == 0) // We're at EOF
return charLen;
// _isBlocked == whether we read fewer bytes than we asked for.
// Note we must check it here because CompressBuffer or
// DetectEncoding will screw with byteLen.
_isBlocked = (byteLen < byteBuffer.Length);
// The preamble comparison runs at most once, on the first read that returns
// at least as many bytes as the preamble is long.
if (_checkPreamble && byteLen >= _preamble.Length)
{
_checkPreamble = false;
if (BytesMatch(byteBuffer, _preamble))
{
// The expected preamble is present: skip it and stop looking for other marks.
_detectEncoding = false;
CompressBuffer(_preamble.Length);
}
}
// If we're supposed to detect the encoding and haven't done so yet,
// do it. Note this may need to be called more than once.
if (_detectEncoding && byteLen >= 2)
DetectEncoding();
// Decode the bytes that remain after any preamble/mark was removed.
charLen += decoder.GetChars(byteBuffer, 0, byteLen, charBuffer, charLen);
} while (charLen == 0);
//Console.WriteLine("ReadBuffer called. chars: "+charLen);
return charLen;
}
// This version has a perf optimization to decode data DIRECTLY into the
// user's buffer, bypassing StreamReader's own buffer.
// This gives a > 20% perf improvement for our encodings across the board,
// but only when asking for at least the number of characters that one
// buffer's worth of bytes could produce.
// This optimization, if run, will break SwitchEncoding, so we must not do
// this on the first call to ReadBuffer.
// Refill variant used by Read(char[], int, int).
//   userBuffer / userOffset - destination for direct decoding when the fast path is taken.
//   desiredChars            - how many characters the caller still wants.
//   readToUserBuffer        - set to true when the characters were decoded straight into
//                             userBuffer (charBuffer is then left empty), false when they
//                             were decoded into charBuffer and still have to be copied.
// Returns the number of characters decoded by this call, 0 at end of stream.
private int ReadBuffer(char[] userBuffer, int userOffset, int desiredChars, out bool readToUserBuffer)
{
charLen = 0;
byteLen = 0;
charPos = 0;
int charsRead = 0;
// As a perf optimization, we can decode characters DIRECTLY into a
// user's char[]. We absolutely must not write more characters
// into the user's buffer than they asked for. Calculating
// encoding.GetMaxCharCount(byteLen) each time is potentially very
// expensive - instead, cache the number of chars a full buffer's
// worth of data may produce. Yes, this makes the perf optimization
// less aggressive, in that all reads that asked for fewer than AND
// returned fewer than _maxCharsPerBuffer chars won't get the user
// buffer optimization. This affects reads where the end of the
// Stream comes in the middle somewhere, and when you ask for
// fewer chars than your buffer could produce.
readToUserBuffer = desiredChars >= _maxCharsPerBuffer;
do
{
// Each pass reads into offset 0, replacing whatever byteBuffer held before.
byteLen = stream.Read(byteBuffer, 0, byteBuffer.Length);
if (byteLen == 0) // EOF
return charsRead;
// _isBlocked == whether we read fewer bytes than we asked for.
// Note we must check it here because CompressBuffer or
// DetectEncoding will screw with byteLen.
_isBlocked = (byteLen < byteBuffer.Length);
// On the first call to ReadBuffer, if we're supposed to detect the encoding, do it.
if (_detectEncoding && byteLen >= 2)
{
DetectEncoding();
// DetectEncoding changes some buffer state. Recompute this.
readToUserBuffer = desiredChars >= _maxCharsPerBuffer;
}
// The preamble comparison runs at most once, on the first read that returns
// at least as many bytes as the preamble is long.
if (_checkPreamble && byteLen >= _preamble.Length)
{
_checkPreamble = false;
if (BytesMatch(byteBuffer, _preamble))
{
_detectEncoding = false;
CompressBuffer(_preamble.Length);
// CompressBuffer changes some buffer state. Recompute this.
readToUserBuffer = desiredChars >= _maxCharsPerBuffer;
}
}
/*
if (readToUserBuffer)
Console.Write('.');
else {
Console.WriteLine("Desired chars is wrong. byteBuffer.length: "+byteBuffer.Length+" max chars is: "+encoding.GetMaxCharCount(byteLen)+" desired: "+desiredChars);
}
*/
charPos = 0;
if (readToUserBuffer)
{
// Fast path: decode straight into the caller's array.
charsRead += decoder.GetChars(byteBuffer, 0, byteLen, userBuffer, userOffset + charsRead);
charLen = 0; // myStreamReader's buffer is empty.
}
else
{
// charsRead is still 0 here (the loop only repeats while it is 0), so the
// decoded characters land at the start of charBuffer.
charsRead = decoder.GetChars(byteBuffer, 0, byteLen, charBuffer, charsRead);
charLen += charsRead; // Number of chars in myStreamReader's buffer.
}
} while (charsRead == 0);
//Console.WriteLine("ReadBuffer: charsRead: "+charsRead+" readToUserBuffer: "+readToUserBuffer);
return charsRead;
}
// Reads a line. A line is defined as a sequence of characters followed by
// a carriage return ('\r'), a line feed ('\n'), or a carriage return
// immediately followed by a line feed. The resulting string does not
// contain the terminating carriage return and/or line feed. The returned
// value is null if the end of the input stream has been reached.
//
/// <include file='doc\myStreamReader.uex' path='docs/doc[@for="myStreamReader.ReadLine"]/*' />
/// <summary>
/// Reads the next line, without its terminator, and updates <c>LineLength</c> and
/// <c>BytesRead</c> for it.
/// </summary>
/// <remarks>
/// <c>LineLength</c> is set to the number of characters in the line plus the
/// terminator characters consumed (0 when null is returned). <c>BytesRead</c> grows
/// by the encoded size of that same text in the current encoding, so it tracks bytes
/// rather than characters for multi-byte encodings. Two limits of that byte count:
/// a preamble / byte order mark that was skipped at the start of the stream is not
/// included, and the size is obtained by re-encoding the decoded text, which is
/// assumed to match the original byte length (malformed input may differ).
/// </remarks>
/// <returns>The line, or null when the end of the stream has been reached.</returns>
/// <exception cref="ObjectDisposedException">The reader has been closed.</exception>
public override String ReadLine()
{
    // A closed reader has no stream or buffers; fail with a descriptive exception
    // instead of the NullReferenceException the refill below would raise.
    if (stream == null)
    {
        throw new ObjectDisposedException(null, "Cannot read from a closed TextReader.");
    }
    _lineLength = 0;
    if (charPos == charLen)
    {
        if (ReadBuffer() == 0)
        {
            return null;
        }
    }
    // Only allocated when a line spans more than one buffer refill.
    StringBuilder sb = null;
    do
    {
        int i = charPos;
        do
        {
            char ch = charBuffer[i];
            if (ch == '\r' || ch == '\n')
            {
                String s;
                if (sb != null)
                {
                    sb.Append(charBuffer, charPos, i - charPos);
                    s = sb.ToString();
                }
                else
                {
                    s = new String(charBuffer, charPos, i - charPos);
                }
                charPos = i + 1;
                // Track exactly which terminator was consumed so that both the
                // character count and the byte count cover it.
                String terminator = (ch == '\r') ? "\r" : "\n";
                // A '\r' may be followed by '\n'; if the '\r' was the last buffered
                // character, refill first so the following character can be inspected.
                if (ch == '\r' && (charPos < charLen || ReadBuffer() > 0))
                {
                    if (charBuffer[charPos] == '\n')
                    {
                        charPos++;
                        terminator = "\r\n";
                    }
                }
                _lineLength = s.Length + terminator.Length;
                // Count encoded bytes, not characters: the two differ whenever a
                // character takes more than one byte in the current encoding.
                _bytesRead += encoding.GetByteCount(s) + encoding.GetByteCount(terminator);
                return s;
            }
            i++;
        } while (i < charLen);
        // No terminator in what is buffered: stash it and refill.
        i = charLen - charPos;
        if (sb == null)
        {
            sb = new StringBuilder(i + 80);
        }
        sb.Append(charBuffer, charPos, i);
    } while (ReadBuffer() > 0);
    // End of stream reached without a terminator: return what was collected.
    string ss = sb.ToString();
    _lineLength = ss.Length;
    _bytesRead += encoding.GetByteCount(ss);
    return ss;
}
// No data, class doesn't need to be serializable.
// Note this class is threadsafe.
private class NullmyStreamReader : myStreamReader
{
    // Reader with nothing behind it: every read reports end of input, and the two
    // properties return fixed framework singletons instead of instance state.

    public override Encoding CurrentEncoding
    {
        get
        {
            return Encoding.Unicode;
        }
    }

    public override Stream BaseStream
    {
        get
        {
            return Stream.Null;
        }
    }

    // Single-character reads: always "no character available".
    public override int Peek()
    {
        return -1;
    }

    public override int Read()
    {
        return -1;
    }

    /// <include file='doc\myStreamReader.uex' path='docs/doc[@for="myStreamReader.NullmyStreamReader.Read"]/*' />
    public override int Read(char[] buffer, int index, int count)
    {
        // Never stores anything into buffer.
        return 0;
    }

    /// <include file='doc\myStreamReader.uex' path='docs/doc[@for="myStreamReader.NullmyStreamReader.ReadLine"]/*' />
    public override String ReadLine()
    {
        return null;
    }

    public override String ReadToEnd()
    {
        return String.Empty;
    }
}
}
}