From 6b4370b6ff0a427607c29b291e824dd2c7665254 Mon Sep 17 00:00:00 2001
From: "Valeriano A.R"
Date: Sat, 5 Aug 2023 14:15:38 +0200
Subject: [PATCH] Implement CsvFieldIndexer. Fixes #2

---
 CsvLib.Tests/CsvFieldIndexerTests.cs | 142 +++++++++++++++
 CsvLib.Tests/CsvLib.Tests.csproj     |  29 ++++
 CsvLib.Tests/Usings.cs               |   1 +
 CsvLib/BufferedTextReader.cs         |  54 ++++++
 CsvLib/CsvFieldIndexer.cs            | 249 +++++++++++++++++++++++++++
 CsvView.sln                          |   6 +
 6 files changed, 481 insertions(+)
 create mode 100644 CsvLib.Tests/CsvFieldIndexerTests.cs
 create mode 100644 CsvLib.Tests/CsvLib.Tests.csproj
 create mode 100644 CsvLib.Tests/Usings.cs
 create mode 100644 CsvLib/BufferedTextReader.cs
 create mode 100644 CsvLib/CsvFieldIndexer.cs

diff --git a/CsvLib.Tests/CsvFieldIndexerTests.cs b/CsvLib.Tests/CsvFieldIndexerTests.cs
new file mode 100644
index 0000000..3a8a48e
--- /dev/null
+++ b/CsvLib.Tests/CsvFieldIndexerTests.cs
@@ -0,0 +1,142 @@
+using CsvLib;
+
+namespace CvsLib;
+
+public class CsvFieldIndexerTests
+{
+    #region GenerateIndex
+
+    [Fact]
+    public void GenerateIndex__Empty()
+    {
+        // --- Arrange
+        StringReader sr = new(string.Empty);
+
+        // --- Act
+        CsvFieldIndexer indexer = new();
+        indexer.GenerateIndex(sr);
+
+        // --- Assert
+
+        Assert.Single(indexer.Index);
+
+        Assert.Equal(0, indexer.Index[0]);
+        Assert.Empty(indexer.FieldIndex);
+    }
+
+    [Fact]
+    public void GenerateIndex__PlainText__OneRow()
+    {
+        // --- Arrange
+        StringReader sr = new("Hello World");
+
+        // --- Act
+        CsvFieldIndexer indexer = new();
+        indexer.GenerateIndex(sr);
+
+        // --- Assert
+
+        Assert.Equal(2, indexer.Index.Count);
+        Assert.Equal(0, indexer.Index[0]);
+        Assert.Equal(12, indexer.Index[1]);
+
+        Assert.Single(indexer.FieldIndex);
+        Assert.Equal(0, indexer.FieldIndex[0][0]);
+        Assert.Equal(10, indexer.FieldIndex[0][1]);
+    }
+
+    [Fact]
+    public void GenerateIndex__TwoLinesOfPainText__TwoRows()
+    {
+        // --- Arrange
+        StringReader sr = new("""
+            Hello World
+            Hello World
+            """);
+
+        // --- Act
+        CsvFieldIndexer indexer = new();
+        indexer.GenerateIndex(sr);
+
+        // --- Assert
+
+        Assert.Equal(3, indexer.Index.Count);
+        Assert.Equal(0, indexer.Index[0]);
+        Assert.Equal(12, indexer.Index[1]);
+        Assert.Equal(24, indexer.Index[2]);
+
+        Assert.Equal(2, indexer.FieldIndex.Count);
+        Assert.Equal(2, indexer.FieldIndex[0].Count);
+        Assert.Equal(0, indexer.FieldIndex[0][0]);
+        Assert.Equal(10, indexer.FieldIndex[0][1]);
+        Assert.Equal(2, indexer.FieldIndex[1].Count);
+        Assert.Equal(12, indexer.FieldIndex[1][0]);
+        Assert.Equal(22, indexer.FieldIndex[1][1]);
+    }
+
+    [Fact]
+    public void GenerateIndex__TwoLinesOfQuotedText__TwoRows()
+    {
+        // --- Arrange
+        StringReader sr = new("""
+            "Hello World"
+            "Hello World"
+            """);
+
+        // --- Act
+        CsvFieldIndexer indexer = new();
+        indexer.GenerateIndex(sr);
+
+        // --- Assert
+
+        Assert.Equal(3, indexer.Index.Count);
+        Assert.Equal(0, indexer.Index[0]);
+        Assert.Equal(14, indexer.Index[1]);
+        Assert.Equal(28, indexer.Index[2]);
+
+        Assert.Equal(2, indexer.FieldIndex.Count);
+        Assert.Equal(2, indexer.FieldIndex[0].Count);
+        Assert.Equal(1, indexer.FieldIndex[0][0]);
+        Assert.Equal(11, indexer.FieldIndex[0][1]);
+        Assert.Equal(2, indexer.FieldIndex[1].Count);
+        Assert.Equal(15, indexer.FieldIndex[1][0]);
+        Assert.Equal(25, indexer.FieldIndex[1][1]);
+    }
+
+    [Fact]
+    public void GenerateIndex__TwoLinesWithTwoQuotedColumns__TwoRowsTwoFields()
+    {
+        // --- Arrange
+        StringReader sr = new("""
+            "Hello","World"
+            "Hello","World"
+            """);
+
+        // --- Act
+        CsvFieldIndexer indexer = new();
+        indexer.GenerateIndex(sr);
+
+        // --- Assert
+
+        Assert.Equal(3, indexer.Index.Count);
+        Assert.Equal(0, indexer.Index[0]);
+        Assert.Equal(16, indexer.Index[1]);
+        Assert.Equal(32, indexer.Index[2]);
+
+        Assert.Equal(2, indexer.FieldIndex.Count);
+        Assert.Equal(4, indexer.FieldIndex[0].Count);
+        Assert.Equal(1, indexer.FieldIndex[0][0]);
+        Assert.Equal(5, indexer.FieldIndex[0][1]);
+        Assert.Equal(9, indexer.FieldIndex[0][2]);
+        Assert.Equal(13, indexer.FieldIndex[0][3]);
+        Assert.Equal(4, indexer.FieldIndex[1].Count);
+        Assert.Equal(17, indexer.FieldIndex[1][0]);
+        Assert.Equal(21, indexer.FieldIndex[1][1]);
+        Assert.Equal(25, indexer.FieldIndex[1][2]);
+        Assert.Equal(29, indexer.FieldIndex[1][3]);
+    }
+
+
+
+    #endregion GenerateIndex
+}
\ No newline at end of file
diff --git a/CsvLib.Tests/CsvLib.Tests.csproj b/CsvLib.Tests/CsvLib.Tests.csproj
new file mode 100644
index 0000000..59b30ab
--- /dev/null
+++ b/CsvLib.Tests/CsvLib.Tests.csproj
@@ -0,0 +1,29 @@
+
+
+
+    net7.0
+    enable
+    enable
+    CvsLib
+
+    false
+
+
+
+
+
+
+      runtime; build; native; contentfiles; analyzers; buildtransitive
+      all
+
+
+      runtime; build; native; contentfiles; analyzers; buildtransitive
+      all
+
+
+
+
+
+
+
+
diff --git a/CsvLib.Tests/Usings.cs b/CsvLib.Tests/Usings.cs
new file mode 100644
index 0000000..c802f44
--- /dev/null
+++ b/CsvLib.Tests/Usings.cs
@@ -0,0 +1 @@
+global using Xunit;
diff --git a/CsvLib/BufferedTextReader.cs b/CsvLib/BufferedTextReader.cs
new file mode 100644
index 0000000..9f349a0
--- /dev/null
+++ b/CsvLib/BufferedTextReader.cs
@@ -0,0 +1,54 @@
+using System;
+using System.IO;
+using System.Text;
+
+namespace CsvLib
+{
+    public class BufferedTextReader : TextReader
+    {
+        private readonly TextReader _baseReader;
+        private int _position;
+        private readonly StringBuilder _sbBuffer = new StringBuilder();
+
+        public BufferedTextReader(TextReader baseReader)
+        {
+            _baseReader = baseReader;
+        }
+
+        public override int Read()
+        {
+            _position++;
+            int read = _baseReader.Read();
+            if (read != -1)
+            {
+                _sbBuffer.Append((char)read);
+            }
+            return read;
+        }
+
+        public override int Read(char[] buffer, int index, int count)
+        {
+            throw new NotImplementedException("Read buffered method on BufferedTextReader");
+        }
+
+        public override int Peek()
+        {
+            return _baseReader.Peek();
+        }
+
+        public int Position
+        {
+            get { return _position; }
+        }
+
+        public string GetBuffer()
+        {
+            return _sbBuffer.ToString();
+        }
+
+        public void CleanBuffer()
+        {
+            _sbBuffer.Clear();
+        }
+    }
+}
diff --git a/CsvLib/CsvFieldIndexer.cs b/CsvLib/CsvFieldIndexer.cs
new file mode 100644
index 0000000..dbd1f8b
--- /dev/null
+++ b/CsvLib/CsvFieldIndexer.cs
@@ -0,0 +1,249 @@
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Text;
+
+namespace CsvLib
+{
+    /// <summary>
+    /// Builds an index of the character offsets of the rows and fields of a CSV text.
+    /// </summary>
+    public class CsvFieldIndexer
+    {
+        // True while the parser is inside a quoted field. It is carried across
+        // physical lines so a quoted field containing line breaks stays in one row.
+        private bool _insideString;
+
+        private readonly char _separator;
+        private readonly char _quoteChar;
+        private readonly char _escapeChar;
+
+        public CsvFieldIndexer(char separator = ',', char quoteChar = '"', char escapeChar = '\\')
+        {
+            _separator = separator;
+            _quoteChar = quoteChar;
+            _escapeChar = escapeChar;
+        }
+
+        private List<long> _index = new List<long>();
+
+        /// <summary>Offset of the start of each row, followed by the reader position after the last row.</summary>
+        public List<long> Index { get { return _index; } }
+
+        private List<List<long>> _fieldIndex = new List<List<long>>();
+
+        /// <summary>For each row, the first and last character offset of every field that has content.</summary>
+        public List<List<long>> FieldIndex { get { return _fieldIndex; } }
+
+        /// <summary>
+        /// Scans one physical line only to update the quoted-field state.
+        /// </summary>
+        private void DummyParser(string line)
+        {
+            for (int i = 0; i < line.Length; i++)
+            {
+                char c = line[i];
+                if (c == _separator && _insideString == false)
+                {
+                    continue;
+                }
+                if (c == _quoteChar && _insideString == false)
+                {
+                    _insideString = true;
+                    continue;
+                }
+                if (c == _quoteChar && _insideString)
+                {
+                    _insideString = false;
+                    continue;
+                }
+                if (c == _escapeChar && _insideString)
+                {
+                    // Skip the escaped character. An escape char that ends the line
+                    // has nothing to skip; the loop condition then ends the scan
+                    // instead of indexing past the end of the string.
+                    i++;
+                }
+            }
+        }
+
+        /// <summary>
+        /// Computes the offsets of the fields found in the text of one row.
+        /// </summary>
+        /// <param name="line">Full text of the row, as accumulated by the reader.</param>
+        /// <param name="lineOffset">Offset of the first character of <paramref name="line"/>.</param>
+        /// <returns>Pairs of (first, last) offsets, one pair per field that has content.</returns>
+        private List<long> ParseLineIndex(string line, long lineOffset)
+        {
+            List<long> fieldPositions = new List<long>();
+            long? fieldStartPosition = null;
+            long? fieldEndPosition = null;
+            for (int i = 0; i < line.Length; i++)
+            {
+                char c = line[i];
+                if (c == _separator && _insideString == false)
+                {
+                    if (fieldStartPosition != null)
+                    {
+                        fieldPositions.Add((long)fieldStartPosition);
+                        fieldPositions.Add((long)fieldEndPosition);
+                    }
+                    fieldStartPosition = null;
+                    fieldEndPosition = null;
+                }
+                else if (c == _quoteChar && _insideString == false)
+                {
+                    _insideString = true;
+                }
+                else if (c == _quoteChar && _insideString)
+                {
+                    _insideString = false;
+                }
+                else if (c == _escapeChar && _insideString)
+                {
+                    // Skip the escaped character; a trailing escape char has nothing
+                    // to skip, so do not read past the end of the text.
+                    i++;
+                }
+                else if ((c == '\n' || c == '\r') && _insideString == false)
+                {
+                    break;
+                }
+                else
+                {
+                    long absolutePosition = lineOffset + i;
+                    if (fieldStartPosition == null) { fieldStartPosition = absolutePosition; }
+                    fieldEndPosition = absolutePosition;
+                }
+            }
+            if (_insideString == false)
+            {
+                if (fieldStartPosition != null)
+                {
+                    fieldPositions.Add((long)fieldStartPosition);
+                    fieldPositions.Add((long)fieldEndPosition);
+                }
+            }
+            return fieldPositions;
+        }
+
+        /// <summary>
+        /// Generates the index of the CSV file at <paramref name="file"/>.
+        /// </summary>
+        public void GenerateIndex(string file)
+        {
+            // Ask for read access only: without a FileAccess argument FileStream
+            // requests read/write access, which fails on read-only files.
+            using (FileStream stream = new FileStream(file, FileMode.Open, FileAccess.Read))
+            using (StreamReader streamReader = new StreamReader(stream, Encoding.Default, true, 4096))
+            {
+                GenerateIndex(streamReader);
+            }
+        }
+
+        /// <summary>
+        /// Generates the row and field indexes from <paramref name="textReader"/>,
+        /// replacing the result of any previous run.
+        /// </summary>
+        public void GenerateIndex(TextReader textReader)
+        {
+            _insideString = false;
+            _index.Clear();
+            _index.Add(0);
+            // Field offsets must be reset together with the row offsets; otherwise a
+            // second run appends its rows after the rows of the first one.
+            _fieldIndex.Clear();
+            int idxRow = 0;
+            using (BufferedTextReader reader = new BufferedTextReader(textReader))
+            {
+                string currentLine;
+                while ((currentLine = reader.ReadLine()) != null)
+                {
+                    DummyParser(currentLine);
+                    if (_insideString) { continue; }
+
+                    string fullLine = reader.GetBuffer();
+                    reader.CleanBuffer();
+                    List<long> fieldIndexes = ParseLineIndex(fullLine, _index[idxRow]);
+                    _fieldIndex.Add(fieldIndexes);
+
+                    _index.Add(reader.Position);
+
+                    idxRow++;
+                }
+            }
+        }
+
+        /// <summary>
+        /// Writes the row index to <paramref name="indexFile"/>: the entry count
+        /// followed by every offset.
+        /// </summary>
+        private void Index_SaveFile(string indexFile)
+        {
+            if (File.Exists(indexFile))
+            {
+                File.Delete(indexFile);
+            }
+            using (Stream streamOut = File.Open(indexFile, FileMode.Create))
+            using (BinaryWriter binWriter = new BinaryWriter(streamOut))
+            {
+                binWriter.Write(_index.Count);
+                for (int i = 0; i < _index.Count; i++)
+                {
+                    binWriter.Write(_index[i]);
+                }
+            }
+        }
+
+        /// <summary>
+        /// Reads a row index previously written by <see cref="Index_SaveFile"/>.
+        /// </summary>
+        private static List<long> Index_LoadFile(string indexFile)
+        {
+            List<long> tempIndex = new List<long>();
+
+            using (Stream streamIn = File.Open(indexFile, FileMode.Open, FileAccess.Read))
+            using (BinaryReader binReader = new BinaryReader(streamIn))
+            {
+                int numRegs = binReader.ReadInt32();
+                for (int i = 0; i < numRegs; i++)
+                {
+                    long value = binReader.ReadInt64();
+                    tempIndex.Add(value);
+                }
+            }
+            return tempIndex;
+        }
+
+        /// <summary>
+        /// Loads the row index of <paramref name="file"/> from its ".idx" companion
+        /// file when that file is newer than the data, and regenerates it otherwise.
+        /// </summary>
+        public void LoadIndexOfFile(string file)
+        {
+            string indexFile = $"{file}.idx";
+            // Compare last-write times: editing the data file does not change its
+            // creation time, so creation times cannot reveal a stale index.
+            if (File.Exists(indexFile) && File.GetLastWriteTimeUtc(indexFile) > File.GetLastWriteTimeUtc(file))
+            {
+                _index = Index_LoadFile(indexFile);
+                // The saved file holds row offsets only, so drop field offsets left
+                // over from a previous run rather than pairing them with these rows.
+                _fieldIndex.Clear();
+            }
+            else
+            {
+                // Generate index
+                DateTime dtNow = DateTime.UtcNow;
+                GenerateIndex(file);
+                TimeSpan tsGenIndex = DateTime.UtcNow - dtNow;
+
+                // Save Index if expensive generation
+                if (tsGenIndex.TotalSeconds > 2)
+                {
+                    Index_SaveFile(indexFile);
+                }
+            }
+        }
+    }
+}
diff --git a/CsvView.sln b/CsvView.sln
index 0edf147..1ccb1fb 100644
--- a/CsvView.sln
+++ b/CsvView.sln
@@ -7,6 +7,8 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "CsvView", "CsvView.csproj",
 EndProject
 Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "CsvLib", "CsvLib\CsvLib.csproj", "{EB0FDB60-8B9D-401C-85A8-4CF4105D5063}"
 EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "CsvLib.Tests", "CsvLib.Tests\CsvLib.Tests.csproj", "{EC5C84D8-1CDE-4AED-9C16-6C4086A20893}"
+EndProject
 Global
 	GlobalSection(SolutionConfigurationPlatforms) = preSolution
 		Debug|Any CPU = Debug|Any CPU
@@ -21,6 +23,10 @@ Global
 		{EB0FDB60-8B9D-401C-85A8-4CF4105D5063}.Debug|Any CPU.Build.0 = Debug|Any CPU
 		{EB0FDB60-8B9D-401C-85A8-4CF4105D5063}.Release|Any CPU.ActiveCfg = Release|Any CPU
 		{EB0FDB60-8B9D-401C-85A8-4CF4105D5063}.Release|Any CPU.Build.0 = Release|Any CPU
+		{EC5C84D8-1CDE-4AED-9C16-6C4086A20893}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+		{EC5C84D8-1CDE-4AED-9C16-6C4086A20893}.Debug|Any CPU.Build.0 = Debug|Any CPU
+		{EC5C84D8-1CDE-4AED-9C16-6C4086A20893}.Release|Any CPU.ActiveCfg = Release|Any CPU
+		{EC5C84D8-1CDE-4AED-9C16-6C4086A20893}.Release|Any CPU.Build.0 = Release|Any CPU
 	EndGlobalSection
 	GlobalSection(SolutionProperties) = preSolution
 		HideSolutionNode = FALSE