CsvFieldIndexer: Fix calculation of offsets with Unicode characters.

Fixes #4
This commit is contained in:
2023-08-18 15:47:09 +02:00
parent fb6d8d76a1
commit 6ed9718abb
4 changed files with 79 additions and 17 deletions

View File

@@ -1,4 +1,3 @@
using System;
using System.IO;
using System.Text;
@@ -10,15 +9,29 @@ namespace CsvLib
private int _position;
private readonly StringBuilder _sbBuffer = new StringBuilder();
private readonly Encoding _currentEncoding = Encoding.Default;
/// <summary>
/// Wraps <paramref name="baseReader"/> and, when it is a <see cref="StreamReader"/>,
/// adopts that reader's current encoding in place of the default one.
/// </summary>
/// <param name="baseReader">Reader that supplies the characters.</param>
public BufferedTextReader(TextReader baseReader)
{
    _baseReader = baseReader;

    if (!(baseReader is StreamReader streamSource))
    {
        // Not stream-backed: keep the encoding assigned by the field initializer.
        return;
    }

    // NOTE(review): StreamReader documents that CurrentEncoding may change after the
    // first read (byte order mark detection) - confirm that capturing it here is intended.
    _currentEncoding = streamSource.CurrentEncoding;
}
public override int Read()
{
_position++;
int read = _baseReader.Read();
if (read > 127)
{
int count = _currentEncoding.GetByteCount(((char)read).ToString());
_position += count;
}
else
{
_position++;
}
if (read != -1)
{
_sbBuffer.Append((char)read);
@@ -26,11 +39,6 @@ namespace CsvLib
return read;
}
/// <summary>
/// Buffered reads are not supported by this reader.
/// </summary>
/// <param name="buffer">Unused.</param>
/// <param name="index">Unused.</param>
/// <param name="count">Unused.</param>
/// <returns>Never returns.</returns>
/// <exception cref="NotImplementedException">Always thrown.</exception>
public override int Read(char[] buffer, int index, int count) =>
    throw new NotImplementedException("Read buffered method on BufferedTextReader");
public override int Peek()
{
return _baseReader.Peek();

View File

@@ -9,6 +9,8 @@ namespace CsvLib
{
private bool _insideString;
private Encoding _currentEncoding = Encoding.Default;
private readonly char _separator;
private readonly char _quoteChar;
private readonly char _escapeChar;
@@ -59,6 +61,7 @@ namespace CsvLib
List<long> fieldPositions = new List<long>();
long? fieldStartPosition = null;
long? fieldEndPosition = null;
int unicodeDelta = 0;
for (int i = 0; i < line.Length; i++)
{
char c = line[i];
@@ -90,7 +93,12 @@ namespace CsvLib
}
else
{
long absolutePosition = lineOffset + i;
if (c > 127)
{
unicodeDelta += _currentEncoding.GetByteCount(c.ToString()) - 1;
}
long absolutePosition = lineOffset + i + unicodeDelta;
if (fieldStartPosition == null) { fieldStartPosition = absolutePosition; }
fieldEndPosition = absolutePosition;
}
@@ -121,6 +129,10 @@ namespace CsvLib
_index.Clear();
_index.Add(0);
int idxRow = 0;
if (textReader is StreamReader streamReader)
{
_currentEncoding = streamReader.CurrentEncoding;
}
using (BufferedTextReader reader = new BufferedTextReader(textReader))
{
string currentLine;
@@ -180,7 +192,7 @@ namespace CsvLib
public void LoadIndexOfFile(string file)
{
DateTime dtFile = File.GetCreationTime(file);
string indexFile = $"{file}.idx";
string indexFile = $"{file}.idx2";
if (File.Exists(indexFile) && File.GetCreationTime(indexFile) > dtFile)
{
_index = Index_LoadFile(indexFile);

View File

@@ -1,5 +1,5 @@
using System;
using System.IO;
using System.Text;
namespace CsvLib
{
@@ -7,21 +7,30 @@ namespace CsvLib
{
private readonly TextReader _baseReader;
private int _position;
private readonly Encoding _currentEncoding = Encoding.Default;
/// <summary>
/// Wraps <paramref name="baseReader"/> and, when it is a <see cref="StreamReader"/>,
/// adopts that reader's current encoding in place of the default one.
/// </summary>
/// <param name="baseReader">Reader that supplies the characters.</param>
public TrackingTextReader(TextReader baseReader)
{
    _baseReader = baseReader;

    if (!(baseReader is StreamReader streamSource))
    {
        // Not stream-backed: keep the encoding assigned by the field initializer.
        return;
    }

    // NOTE(review): StreamReader documents that CurrentEncoding may change after the
    // first read (byte order mark detection) - confirm that capturing it here is intended.
    _currentEncoding = streamSource.CurrentEncoding;
}
// True when the byte count of a surrogate pair was already added on its high
// surrogate, so the low surrogate that follows must not be counted again.
private bool _pairAlreadyCounted;

// Scratch buffer handed to Encoding.GetByteCount; avoids allocating a string per character.
private readonly char[] _countBuffer = new char[2];

/// <summary>
/// Reads the next character from the underlying reader and advances the tracked
/// position by the number of bytes that character occupies in the reader's encoding.
/// </summary>
/// <returns>
/// The character read, as an integer, or -1 when the underlying reader has no more characters.
/// </returns>
public override int Read()
{
    int read = _baseReader.Read();
    if (read == -1)
    {
        // Nothing was consumed at end of input, so the position must not move.
        return read;
    }

    if (_pairAlreadyCounted)
    {
        // Second half of a surrogate pair whose bytes were counted with the first half.
        _pairAlreadyCounted = false;
        return read;
    }

    _countBuffer[0] = (char)read;
    int charCount = 1;
    if (char.IsHighSurrogate(_countBuffer[0]))
    {
        // A surrogate pair is encoded as one unit; measuring each half on its own
        // would count two replacement characters instead of the real byte sequence.
        int next = _baseReader.Peek();
        if (next != -1 && char.IsLowSurrogate((char)next))
        {
            _countBuffer[1] = (char)next;
            charCount = 2;
            _pairAlreadyCounted = true;
        }
    }

    // StreamReader may switch encoding after its first read (byte order mark detection),
    // so ask it again instead of relying only on the value captured at construction.
    Encoding encoding = _baseReader is StreamReader streamReader
        ? streamReader.CurrentEncoding
        : _currentEncoding;

    // Always ask the encoding: characters up to 127 are not one byte in UTF-16 or UTF-32.
    _position += encoding.GetByteCount(_countBuffer, 0, charCount);
    return read;
}

/// <summary>
/// Buffered reads are not supported by this reader.
/// </summary>
/// <param name="buffer">Unused.</param>
/// <param name="index">Unused.</param>
/// <param name="count">Unused.</param>
/// <returns>Never returns.</returns>
/// <exception cref="NotImplementedException">Always thrown.</exception>
public override int Read(char[] buffer, int index, int count)
{
    throw new NotImplementedException("Read buffered method on TrackingTextReader");
}
public override int Peek()