CsvFieldIndexer: Fix calculation of offsets with Unicode characters.

Fixes #4
This commit is contained in:
2023-08-18 15:47:09 +02:00
parent fb6d8d76a1
commit 6ed9718abb
4 changed files with 79 additions and 17 deletions

View File

@@ -137,6 +137,39 @@ public class CsvFieldIndexerTests
Assert.Equal(29, indexer.FieldIndex[1][3]); Assert.Equal(29, indexer.FieldIndex[1][3]);
} }
// Indexes two identical rows, each holding two quoted fields that contain accented
// (non-ASCII) letters, and pins the exact row and field offsets that are reported.
// Note that the expected offsets are larger than plain character counts: row two is
// expected at 19 although row one is only 15 characters long (presumably because the
// offsets are byte positions in a multi-byte encoding — confirm against the indexer).
[Fact]
public void GenerateIndex__TwoLinesWithTwoQuotedColumnsWithUnicode__TwoRowsTwoFields()
{
    // --- Arrange
    var input = new StringReader("""
        "Hélló","Wórld"
        "Hélló","Wórld"
        """);
    var sut = new CsvFieldIndexer();

    // --- Act
    sut.GenerateIndex(input);

    // --- Assert
    // Row offsets: one entry per row start plus one trailing entry.
    var rowOffsets = sut.Index;
    Assert.Equal(3, rowOffsets.Count);
    Assert.Equal(0, rowOffsets[0]);
    Assert.Equal(19, rowOffsets[1]);
    Assert.Equal(38, rowOffsets[2]);

    // Field offsets: four entries per row (a start/end pair for each of the two fields).
    var fieldOffsets = sut.FieldIndex;
    Assert.Equal(2, fieldOffsets.Count);

    var firstRow = fieldOffsets[0];
    Assert.Equal(4, firstRow.Count);
    Assert.Equal(1, firstRow[0]);
    Assert.Equal(7, firstRow[1]);
    Assert.Equal(11, firstRow[2]);
    Assert.Equal(16, firstRow[3]);

    var secondRow = fieldOffsets[1];
    Assert.Equal(4, secondRow.Count);
    Assert.Equal(20, secondRow[0]);
    Assert.Equal(26, secondRow[1]);
    Assert.Equal(30, secondRow[2]);
    Assert.Equal(35, secondRow[3]);
}
#endregion GenerateIndex #endregion GenerateIndex
} }

View File

@@ -1,4 +1,3 @@
using System;
using System.IO; using System.IO;
using System.Text; using System.Text;
@@ -10,15 +9,29 @@ namespace CsvLib
private int _position; private int _position;
private readonly StringBuilder _sbBuffer = new StringBuilder(); private readonly StringBuilder _sbBuffer = new StringBuilder();
private readonly Encoding _currentEncoding = Encoding.Default;
/// <summary>
/// Wraps <paramref name="baseReader"/> and remembers which encoding to use when
/// measuring how many bytes a character occupies.
/// </summary>
/// <param name="baseReader">The reader that all reads are forwarded to.</param>
public BufferedTextReader(TextReader baseReader)
{
    _baseReader = baseReader;

    // Only a StreamReader exposes the encoding its characters were decoded with.
    // For every other reader the field keeps its initial value (Encoding.Default).
    var streamReader = baseReader as StreamReader;
    if (streamReader != null)
    {
        _currentEncoding = streamReader.CurrentEncoding;
    }
}
public override int Read() public override int Read()
{ {
_position++;
int read = _baseReader.Read(); int read = _baseReader.Read();
if (read > 127)
{
int count = _currentEncoding.GetByteCount(((char)read).ToString());
_position += count;
}
else
{
_position++;
}
if (read != -1) if (read != -1)
{ {
_sbBuffer.Append((char)read); _sbBuffer.Append((char)read);
@@ -26,11 +39,6 @@ namespace CsvLib
return read; return read;
} }
public override int Read(char[] buffer, int index, int count)
{
throw new NotImplementedException("Read buffered method on BufferedTextReader");
}
public override int Peek() public override int Peek()
{ {
return _baseReader.Peek(); return _baseReader.Peek();

View File

@@ -9,6 +9,8 @@ namespace CsvLib
{ {
private bool _insideString; private bool _insideString;
private Encoding _currentEncoding = Encoding.Default;
private readonly char _separator; private readonly char _separator;
private readonly char _quoteChar; private readonly char _quoteChar;
private readonly char _escapeChar; private readonly char _escapeChar;
@@ -59,6 +61,7 @@ namespace CsvLib
List<long> fieldPositions = new List<long>(); List<long> fieldPositions = new List<long>();
long? fieldStartPosition = null; long? fieldStartPosition = null;
long? fieldEndPosition = null; long? fieldEndPosition = null;
int unicodeDelta = 0;
for (int i = 0; i < line.Length; i++) for (int i = 0; i < line.Length; i++)
{ {
char c = line[i]; char c = line[i];
@@ -90,7 +93,12 @@ namespace CsvLib
} }
else else
{ {
long absolutePosition = lineOffset + i; if (c > 127)
{
unicodeDelta += _currentEncoding.GetByteCount(c.ToString()) - 1;
}
long absolutePosition = lineOffset + i + unicodeDelta;
if (fieldStartPosition == null) { fieldStartPosition = absolutePosition; } if (fieldStartPosition == null) { fieldStartPosition = absolutePosition; }
fieldEndPosition = absolutePosition; fieldEndPosition = absolutePosition;
} }
@@ -121,6 +129,10 @@ namespace CsvLib
_index.Clear(); _index.Clear();
_index.Add(0); _index.Add(0);
int idxRow = 0; int idxRow = 0;
if (textReader is StreamReader streamReader)
{
_currentEncoding = streamReader.CurrentEncoding;
}
using (BufferedTextReader reader = new BufferedTextReader(textReader)) using (BufferedTextReader reader = new BufferedTextReader(textReader))
{ {
string currentLine; string currentLine;
@@ -180,7 +192,7 @@ namespace CsvLib
public void LoadIndexOfFile(string file) public void LoadIndexOfFile(string file)
{ {
DateTime dtFile = File.GetCreationTime(file); DateTime dtFile = File.GetCreationTime(file);
string indexFile = $"{file}.idx"; string indexFile = $"{file}.idx2";
if (File.Exists(indexFile) && File.GetCreationTime(indexFile) > dtFile) if (File.Exists(indexFile) && File.GetCreationTime(indexFile) > dtFile)
{ {
_index = Index_LoadFile(indexFile); _index = Index_LoadFile(indexFile);

View File

@@ -1,5 +1,5 @@
using System;
using System.IO; using System.IO;
using System.Text;
namespace CsvLib namespace CsvLib
{ {
@@ -7,21 +7,30 @@ namespace CsvLib
{ {
private readonly TextReader _baseReader; private readonly TextReader _baseReader;
private int _position; private int _position;
private readonly Encoding _currentEncoding = Encoding.Default;
/// <summary>
/// Wraps <paramref name="baseReader"/> and remembers which encoding to use when
/// measuring how many bytes a character occupies.
/// </summary>
/// <param name="baseReader">The reader that all reads are forwarded to.</param>
public TrackingTextReader(TextReader baseReader)
{
    _baseReader = baseReader;

    // Only a StreamReader exposes the encoding its characters were decoded with.
    // For every other reader the field keeps its initial value (Encoding.Default).
    var streamReader = baseReader as StreamReader;
    if (streamReader != null)
    {
        _currentEncoding = streamReader.CurrentEncoding;
    }
}
// High surrogate that has been read but not yet counted; '\0' when none is pending.
// A surrogate pair must be measured as a whole: encoding each half on its own makes
// the encoder substitute a replacement character per half and over-count the bytes.
private char _pendingHighSurrogate;

/// <summary>
/// Reads the next character from the base reader and advances the tracked position
/// by the number of bytes that character occupies in the current encoding.
/// </summary>
/// <remarks>
/// Characters up to 127 advance the position by one. Other characters advance it by
/// <c>Encoding.GetByteCount</c>. A high surrogate is not counted until the following
/// read: if that read yields a low surrogate, the pair is counted together; otherwise
/// the unpaired high surrogate is counted on its own, as before. Between those two
/// reads the position therefore lags by the size of the pending character.
/// The position is also advanced by one when the base reader returns -1 (end of
/// input); this matches the previous behavior and is intentionally left unchanged.
/// </remarks>
/// <returns>The character read, or -1 when the base reader has no more characters.</returns>
public override int Read()
{
    int read = _baseReader.Read();

    if (_pendingHighSurrogate != '\0')
    {
        char high = _pendingHighSurrogate;
        _pendingHighSurrogate = '\0';

        if (read != -1 && char.IsLowSurrogate((char)read))
        {
            // Well-formed pair: measure both halves as one character.
            _position += _currentEncoding.GetByteCount(new[] { high, (char)read });
            return read;
        }

        // Unpaired high surrogate: count it alone, then handle the current read below.
        _position += _currentEncoding.GetByteCount(high.ToString());
    }

    if (read > 127)
    {
        char current = (char)read;
        if (char.IsHighSurrogate(current))
        {
            // Defer counting until we know whether a low surrogate follows.
            _pendingHighSurrogate = current;
        }
        else
        {
            _position += _currentEncoding.GetByteCount(current.ToString());
        }
    }
    else
    {
        // Single-byte range, and also the -1 end-of-input marker.
        _position++;
    }

    return read;
}
public override int Peek() public override int Peek()