From a0010593f605f39532de6cd5ae0eeedd77597c5a Mon Sep 17 00:00:00 2001 From: "Valeriano A.R" Date: Mon, 21 Aug 2023 04:31:21 +0200 Subject: [PATCH] CsvFieldIndexer.Search: Naive search implementation. --- CsvLib.Tests/CsvFieldIndexerTests.cs | 74 ++++++++++++++++++++++++++++ CsvLib/BufferedTextReader.cs | 5 -- CsvLib/ByteArraySearcher.cs | 3 +- CsvLib/CsvFieldIndexer.cs | 63 +++++++++++++++++++++-- 4 files changed, 135 insertions(+), 10 deletions(-) diff --git a/CsvLib.Tests/CsvFieldIndexerTests.cs b/CsvLib.Tests/CsvFieldIndexerTests.cs index e971672..478a786 100644 --- a/CsvLib.Tests/CsvFieldIndexerTests.cs +++ b/CsvLib.Tests/CsvFieldIndexerTests.cs @@ -1,3 +1,4 @@ +using System.Text; using CsvLib; namespace CvsLib; @@ -172,4 +173,77 @@ public class CsvFieldIndexerTests #endregion GenerateIndex + #region Search + + [Fact] + public void Search__TwoLinesWithTwoQuotedColumns__OneIndexFirstRow() + { + // --- Arrange + string strText = """ + "Hello","test" + "Hello","World" + """; + StringReader sr = new(strText); + CsvFieldIndexer indexer = new(); + indexer.GenerateIndex(sr); + + // --- Act + byte[] bText = Encoding.UTF8.GetBytes(strText); + MemoryStream ms = new(bText); + List indexes = indexer.Search(ms, "test"); + + // --- Assert + + Assert.Single(indexes); + Assert.Equal(0, indexes[0]); + } + + [Fact] + public void Search__TwoLinesWithTwoQuotedColumns__OneIndexSecondRow() + { + // --- Arrange + string strText = """ + "Hello","World" + "Hello","test" + """; + StringReader sr = new(strText); + CsvFieldIndexer indexer = new(); + indexer.GenerateIndex(sr); + + // --- Act + byte[] bText = Encoding.UTF8.GetBytes(strText); + MemoryStream ms = new(bText); + List indexes = indexer.Search(ms, "test"); + + // --- Assert + + Assert.Single(indexes); + Assert.Equal(16, indexes[0]); + } + + [Fact] + public void Search__TwoLinesWithTwoQuotedColumnsTwoMatches__OneIndexSecondRow() + { + // --- Arrange + string strText = """ + "Hello","World" + "test","test" + """; + StringReader sr 
= new(strText); + CsvFieldIndexer indexer = new(); + indexer.GenerateIndex(sr); + + // --- Act + byte[] bText = Encoding.UTF8.GetBytes(strText); + MemoryStream ms = new(bText); + List indexes = indexer.Search(ms, "test"); + + // --- Assert + + Assert.Single(indexes); + Assert.Equal(16, indexes[0]); + } + + #endregion Search + } diff --git a/CsvLib/BufferedTextReader.cs b/CsvLib/BufferedTextReader.cs index 3da080c..ec942d6 100644 --- a/CsvLib/BufferedTextReader.cs +++ b/CsvLib/BufferedTextReader.cs @@ -39,11 +39,6 @@ namespace CsvLib return read; } - public override int Peek() - { - return _baseReader.Peek(); - } - public int Position { get { return _position; } diff --git a/CsvLib/ByteArraySearcher.cs b/CsvLib/ByteArraySearcher.cs index f6ac65c..cbe0b64 100644 --- a/CsvLib/ByteArraySearcher.cs +++ b/CsvLib/ByteArraySearcher.cs @@ -32,8 +32,7 @@ public class ByteArraySearcher } } - if (found) - return true; + if (found) { return true; } } return false; diff --git a/CsvLib/CsvFieldIndexer.cs b/CsvLib/CsvFieldIndexer.cs index 2ab2d14..1a62582 100644 --- a/CsvLib/CsvFieldIndexer.cs +++ b/CsvLib/CsvFieldIndexer.cs @@ -119,6 +119,7 @@ namespace CsvLib using FileStream stream = new(file, FileMode.Open); using StreamReader streamReader = new(stream, Encoding.Default, true, 4096); GenerateIndex(streamReader); + stream.Close(); } public void GenerateIndex(TextReader textReader) @@ -172,7 +173,7 @@ namespace CsvLib { binWriter.Write(currentIndex); } - + binWriter.Write(_fieldIndex.Count); foreach (List currentFieldIndex in _fieldIndex) { @@ -198,7 +199,7 @@ namespace CsvLib try { using BinaryReader binReader = new(streamIn); - + byte magik0 = binReader.ReadByte(); byte magik1 = binReader.ReadByte(); byte magik2 = binReader.ReadByte(); @@ -214,7 +215,7 @@ namespace CsvLib long value = binReader.ReadInt64(); tempIndex.Add(value); } - + int numFieldIndexes = binReader.ReadInt32(); tempFieldIndex = new List>(numFieldIndexes); for (int j = 0; j < numFieldIndexes; j++) @@ -263,5 
+264,81 @@ namespace CsvLib
                 SaveFile(indexFile);
             }
         }
+
+        /// <summary>
+        /// Searches the file at <paramref name="fileName"/> for rows containing <paramref name="textToSearch"/>.
+        /// </summary>
+        /// <param name="fileName">Path of the file to search; it is opened read-only.</param>
+        /// <param name="textToSearch">Text to look for; it is matched as its UTF-8 byte sequence.</param>
+        /// <param name="notifyProgress">Optional callback that receives the fraction of rows processed so far.</param>
+        /// <returns>The <c>_index</c> entry of every row with at least one matching field; never null.</returns>
+        public List<long> Search(string fileName, string textToSearch, Action<float> notifyProgress = null)
+        {
+            // Searching only reads, so do not request the write access that FileMode.Open alone implies.
+            using FileStream streamIn = File.OpenRead(fileName);
+            return Search(streamIn, textToSearch, notifyProgress);
+        }
+
+        /// <summary>
+        /// Searches every indexed field of <paramref name="streamIn"/> for <paramref name="textToSearch"/>.
+        /// </summary>
+        /// <param name="streamIn">Seekable stream holding the indexed data.</param>
+        /// <param name="textToSearch">Text to look for; it is matched as its UTF-8 byte sequence.</param>
+        /// <param name="notifyProgress">Optional callback that receives the fraction of rows processed so far, at most about every 200 ms.</param>
+        /// <returns>The <c>_index</c> entry of every row with at least one matching field, in row order.</returns>
+        /// <exception cref="ArgumentNullException"><paramref name="streamIn"/> or <paramref name="textToSearch"/> is null.</exception>
+        /// <exception cref="EndOfStreamException">The stream ends before an indexed field has been read completely.</exception>
+        public List<long> Search(Stream streamIn, string textToSearch, Action<float> notifyProgress = null)
+        {
+            if (streamIn == null) { throw new ArgumentNullException(nameof(streamIn)); }
+            if (textToSearch == null) { throw new ArgumentNullException(nameof(textToSearch)); }
+
+            // TODO: Use MemoryMappedFile for better IO performance
+            DateTime datePrevious = DateTime.UtcNow;
+            List<long> newIndexes = new();
+            ByteArraySearcher searcher = new(Encoding.UTF8.GetBytes(textToSearch));
+            byte[] buffer = new byte[1024];
+            for (int j = 0; j < _fieldIndex.Count; j++)
+            {
+                // Progress depends only on the row, so check the clock once per row.
+                if ((DateTime.UtcNow - datePrevious).TotalMilliseconds > 200)
+                {
+                    datePrevious = DateTime.UtcNow;
+                    notifyProgress?.Invoke(j / (float)_fieldIndex.Count);
+                }
+
+                // Each row stores consecutive (first byte, last byte) offset pairs, one pair per field.
+                List<long> fieldBounds = _fieldIndex[j];
+                for (int i = 0; i < fieldBounds.Count; i += 2)
+                {
+                    long offset = fieldBounds[i];
+                    int length = (int)(fieldBounds[i + 1] - offset) + 1;
+                    if (buffer.Length < length)
+                    {
+                        buffer = new byte[length];
+                    }
+
+                    // Stream.Read may return fewer bytes than requested, so keep reading until the field is complete.
+                    streamIn.Seek(offset, SeekOrigin.Begin);
+                    int read = 0;
+                    while (read < length)
+                    {
+                        int chunk = streamIn.Read(buffer, read, length - read);
+                        if (chunk == 0) { break; }
+                        read += chunk;
+                    }
+                    if (read != length) { throw new EndOfStreamException($"Search: Expected {length} bytes, but read {read}"); }
+
+                    if (searcher.Contains(buffer, length))
+                    {
+                        // One match is enough; record the row once and move on to the next row.
+                        newIndexes.Add(_index[j]);
+                        break;
+                    }
+                }
+            }
+
+            return newIndexes;
+        }
     }
 }