From 6ed9718abbd10d4384fe548d5e9663f4c53791e6 Mon Sep 17 00:00:00 2001 From: "Valeriano A.R" Date: Fri, 18 Aug 2023 15:47:09 +0200 Subject: [PATCH] CsvFieldIndexer: Fix calculation of offsets with unicode characters. Fixes #4 --- CsvLib.Tests/CsvFieldIndexerTests.cs | 33 ++++++++++++++++++++++++++++ CsvLib/BufferedTextReader.cs | 22 +++++++++++++------ CsvLib/CsvFieldIndexer.cs | 16 ++++++++++++-- CsvLib/TrackingTextReader.cs | 25 ++++++++++++++------- 4 files changed, 79 insertions(+), 17 deletions(-) diff --git a/CsvLib.Tests/CsvFieldIndexerTests.cs b/CsvLib.Tests/CsvFieldIndexerTests.cs index b647e0e..e971672 100644 --- a/CsvLib.Tests/CsvFieldIndexerTests.cs +++ b/CsvLib.Tests/CsvFieldIndexerTests.cs @@ -137,6 +137,39 @@ public class CsvFieldIndexerTests Assert.Equal(29, indexer.FieldIndex[1][3]); } + [Fact] + public void GenerateIndex__TwoLinesWithTwoQuotedColumnsWithUnicode__TwoRowsTwoFields() + { + // --- Arrange + StringReader sr = new(""" + "Hélló","Wórld" + "Hélló","Wórld" + """); + + // --- Act + CsvFieldIndexer indexer = new(); + indexer.GenerateIndex(sr); + + // --- Assert + + Assert.Equal(3, indexer.Index.Count); + Assert.Equal(0, indexer.Index[0]); + Assert.Equal(19, indexer.Index[1]); + Assert.Equal(38, indexer.Index[2]); + + Assert.Equal(2, indexer.FieldIndex.Count); + Assert.Equal(4, indexer.FieldIndex[0].Count); + Assert.Equal(1, indexer.FieldIndex[0][0]); + Assert.Equal(7, indexer.FieldIndex[0][1]); + Assert.Equal(11, indexer.FieldIndex[0][2]); + Assert.Equal(16, indexer.FieldIndex[0][3]); + Assert.Equal(4, indexer.FieldIndex[1].Count); + Assert.Equal(20, indexer.FieldIndex[1][0]); + Assert.Equal(26, indexer.FieldIndex[1][1]); + Assert.Equal(30, indexer.FieldIndex[1][2]); + Assert.Equal(35, indexer.FieldIndex[1][3]); + } + #endregion GenerateIndex } diff --git a/CsvLib/BufferedTextReader.cs b/CsvLib/BufferedTextReader.cs index 9f349a0..b6f555e 100644 --- a/CsvLib/BufferedTextReader.cs +++ b/CsvLib/BufferedTextReader.cs @@ -1,4 +1,3 @@ -using System; using System.IO; using System.Text; @@ -10,15 +9,29 @@ namespace CsvLib private int _position; private readonly StringBuilder _sbBuffer = new StringBuilder(); + private readonly Encoding _currentEncoding = Encoding.Default; + public BufferedTextReader(TextReader baseReader) { _baseReader = baseReader; + if (baseReader is StreamReader streamReader) + { + _currentEncoding = streamReader.CurrentEncoding; + } } public override int Read() { - _position++; int read = _baseReader.Read(); + if (read > 127) + { + int count = _currentEncoding.GetByteCount(((char)read).ToString()); + _position += count; + } + else + { + _position++; + } if (read != -1) { _sbBuffer.Append((char)read); @@ -26,11 +39,6 @@ namespace CsvLib return read; } - public override int Read(char[] buffer, int index, int count) - { - throw new NotImplementedException("Read buffered method on BufferedTextReader"); - } - public override int Peek() { return _baseReader.Peek(); diff --git a/CsvLib/CsvFieldIndexer.cs b/CsvLib/CsvFieldIndexer.cs index a2c5d1f..df47523 100644 --- a/CsvLib/CsvFieldIndexer.cs +++ b/CsvLib/CsvFieldIndexer.cs @@ -9,6 +9,8 @@ namespace CsvLib { private bool _insideString; + private Encoding _currentEncoding = Encoding.Default; + private readonly char _separator; private readonly char _quoteChar; private readonly char _escapeChar; @@ -59,6 +61,7 @@ namespace CsvLib List fieldPositions = new List(); long? fieldStartPosition = null; long? fieldEndPosition = null; + int unicodeDelta = 0; for (int i = 0; i < line.Length; i++) { char c = line[i]; @@ -90,7 +93,12 @@ namespace CsvLib } else { - long absolutePosition = lineOffset + i; + if (c > 127) + { + unicodeDelta += _currentEncoding.GetByteCount(c.ToString()) - 1; + } + + long absolutePosition = lineOffset + i + unicodeDelta; if (fieldStartPosition == null) { fieldStartPosition = absolutePosition; } fieldEndPosition = absolutePosition; } @@ -121,6 +129,10 @@ namespace CsvLib _index.Clear(); _index.Add(0); int idxRow = 0; + if (textReader is StreamReader streamReader) + { + _currentEncoding = streamReader.CurrentEncoding; + } using (BufferedTextReader reader = new BufferedTextReader(textReader)) { string currentLine; @@ -180,7 +192,7 @@ namespace CsvLib public void LoadIndexOfFile(string file) { DateTime dtFile = File.GetCreationTime(file); - string indexFile = $"{file}.idx"; + string indexFile = $"{file}.idx2"; if (File.Exists(indexFile) && File.GetCreationTime(indexFile) > dtFile) { _index = Index_LoadFile(indexFile); diff --git a/CsvLib/TrackingTextReader.cs b/CsvLib/TrackingTextReader.cs index 6009077..95cba30 100644 --- a/CsvLib/TrackingTextReader.cs +++ b/CsvLib/TrackingTextReader.cs @@ -1,5 +1,5 @@ -using System; using System.IO; +using System.Text; namespace CsvLib { @@ -7,21 +7,30 @@ namespace CsvLib { private readonly TextReader _baseReader; private int _position; + private readonly Encoding _currentEncoding = Encoding.Default; public TrackingTextReader(TextReader baseReader) { _baseReader = baseReader; + if (baseReader is StreamReader streamReader) + { + _currentEncoding = streamReader.CurrentEncoding; + } } public override int Read() { - _position++; - return _baseReader.Read(); - } - - public override int Read(char[] buffer, int index, int count) - { - throw new NotImplementedException("Read buffered method on TrackingTextReader"); + int read = _baseReader.Read(); + if (read > 127) + { + int count = _currentEncoding.GetByteCount(((char)read).ToString()); + _position += count; + } + else + { + _position++; + } + return read; } public override int Peek()