using System;
using System.Collections.Generic;
using System.IO;
using System.Text;

namespace CsvLib
{
    /// <summary>
    /// Builds a positional index over CSV text: one start offset per record
    /// (<see cref="Index"/>) and, per record, the start/end positions of its
    /// non-empty fields (<see cref="FieldIndex"/>).
    /// </summary>
    /// <remarks>
    /// The parser keeps its quote state in an instance field, so a single instance
    /// must not be used for two indexing runs at the same time.
    /// </remarks>
    public class CsvFieldIndexer
    {
        /// <summary>
        /// Index generation that takes longer than this many seconds is considered
        /// expensive enough to be cached in a ".idx" file next to the data file.
        /// </summary>
        private const double SaveIndexThresholdSeconds = 2;

        // True while the scanner is between an opening and a closing quote. It is
        // deliberately carried across calls so a quoted field may span several lines.
        private bool _insideString;

        private readonly char _separator;
        private readonly char _quoteChar;
        private readonly char _escapeChar;

        /// <summary>Creates an indexer for the given CSV dialect.</summary>
        /// <param name="separator">Character that separates fields.</param>
        /// <param name="quoteChar">Character that opens and closes a quoted field.</param>
        /// <param name="escapeChar">
        /// Character that, inside a quoted field, makes the scanner skip the next character.
        /// </param>
        public CsvFieldIndexer(char separator = ',', char quoteChar = '"', char escapeChar = '\\')
        {
            _separator = separator;
            _quoteChar = quoteChar;
            _escapeChar = escapeChar;
        }

        private List<long> _index = new List<long>();

        /// <summary>
        /// Record start offsets. Entry 0 is always 0 after a generation run; one more
        /// entry (the reader position after the record) is appended per indexed record.
        /// </summary>
        public List<long> Index
        {
            get { return _index; }
        }

        private readonly List<List<long>> _fieldIndex = new List<List<long>>();

        /// <summary>
        /// One list per indexed record, holding consecutive (start, end) position pairs
        /// for every non-empty field of that record. Only filled by the GenerateIndex
        /// overloads; loading a cached index file does not touch it.
        /// </summary>
        public List<List<long>> FieldIndex
        {
            get { return _fieldIndex; }
        }

        /// <summary>
        /// Scans one physical line only to keep <c>_insideString</c> up to date, so the
        /// caller can tell whether the record continues on the next line.
        /// </summary>
        /// <param name="line">A single line of text as returned by the reader.</param>
        private void DummyParser(string line)
        {
            for (int i = 0; i < line.Length; i++)
            {
                char c = line[i];

                if (!_insideString)
                {
                    if (c == _separator)
                    {
                        continue;
                    }

                    if (c == _quoteChar)
                    {
                        _insideString = true;
                    }

                    continue;
                }

                if (c == _quoteChar)
                {
                    _insideString = false;
                }
                else if (c == _escapeChar)
                {
                    // Skip the escaped character. Advancing the index is enough; reading
                    // line[i] here used to throw when the escape was the last character.
                    i++;
                }
            }
        }

        /// <summary>Appends a (start, end) pair to <paramref name="fieldPositions"/> if a field was seen.</summary>
        private static void AppendFieldRange(List<long> fieldPositions, long? start, long? end)
        {
            // start and end are always assigned together, so checking start is sufficient.
            if (start.HasValue)
            {
                fieldPositions.Add(start.Value);
                fieldPositions.Add(end.Value);
            }
        }

        /// <summary>
        /// Computes the positions of the non-empty fields of one complete record.
        /// </summary>
        /// <param name="line">Text of the record (may contain embedded line breaks inside quotes).</param>
        /// <param name="lineOffset">Value added to every character index of <paramref name="line"/>.</param>
        /// <returns>
        /// A flat list of (start, end) pairs. Quote characters, escape characters and the
        /// character following an escape are not counted as field content; empty fields
        /// produce no entry.
        /// </returns>
        private List<long> ParseLineIndex(string line, long lineOffset)
        {
            List<long> fieldPositions = new List<long>();
            long? fieldStartPosition = null;
            long? fieldEndPosition = null;

            for (int i = 0; i < line.Length; i++)
            {
                char c = line[i];

                if (c == _separator && !_insideString)
                {
                    AppendFieldRange(fieldPositions, fieldStartPosition, fieldEndPosition);
                    fieldStartPosition = null;
                    fieldEndPosition = null;
                }
                else if (c == _quoteChar && !_insideString)
                {
                    _insideString = true;
                }
                else if (c == _quoteChar && _insideString)
                {
                    _insideString = false;
                }
                else if (c == _escapeChar && _insideString)
                {
                    // Skip the escaped character; safe even when the escape ends the text.
                    i++;
                }
                else if ((c == '\n' || c == '\r') && !_insideString)
                {
                    // An unquoted line break terminates the record.
                    break;
                }
                else
                {
                    long absolutePosition = lineOffset + i;
                    if (fieldStartPosition == null)
                    {
                        fieldStartPosition = absolutePosition;
                    }
                    fieldEndPosition = absolutePosition;
                }
            }

            // The trailing field has no separator after it; emit it unless the record
            // ended inside an unterminated quote.
            if (!_insideString)
            {
                AppendFieldRange(fieldPositions, fieldStartPosition, fieldEndPosition);
            }

            return fieldPositions;
        }

        /// <summary>Generates the index for the CSV file at <paramref name="file"/>.</summary>
        /// <param name="file">Path of the file to index.</param>
        /// <exception cref="ArgumentNullException"><paramref name="file"/> is null.</exception>
        public void GenerateIndex(string file)
        {
            if (file == null)
            {
                throw new ArgumentNullException(nameof(file));
            }

            // Open read-only: the two-argument FileStream constructor asks for read/write
            // access, which fails on read-only files even though nothing is written.
            using (FileStream stream = new FileStream(file, FileMode.Open, FileAccess.Read, FileShare.Read))
            using (StreamReader streamReader = new StreamReader(stream, Encoding.Default, true, 4096))
            {
                GenerateIndex(streamReader);
            }
        }

        /// <summary>
        /// Generates the index from <paramref name="textReader"/>, replacing any previous
        /// content of <see cref="Index"/> and <see cref="FieldIndex"/>.
        /// </summary>
        /// <param name="textReader">Source of the CSV text.</param>
        /// <exception cref="ArgumentNullException"><paramref name="textReader"/> is null.</exception>
        /// <remarks>
        /// If the input ends inside an unterminated quoted field, the pending text is
        /// never handed to the field parser and produces no entry.
        /// </remarks>
        public void GenerateIndex(TextReader textReader)
        {
            if (textReader == null)
            {
                throw new ArgumentNullException(nameof(textReader));
            }

            _insideString = false;
            _index.Clear();
            // Both collections describe the same run; clearing only one of them left
            // field data from a previous run in front of the new records.
            _fieldIndex.Clear();
            _index.Add(0);

            using (BufferedTextReader reader = new BufferedTextReader(textReader))
            {
                string currentLine;
                while ((currentLine = reader.ReadLine()) != null)
                {
                    DummyParser(currentLine);
                    if (_insideString)
                    {
                        // Quoted field still open: the record continues on the next line.
                        continue;
                    }

                    // NOTE(review): GetBuffer presumably returns everything read since the
                    // last CleanBuffer, i.e. the whole (possibly multi-line) record — confirm
                    // against BufferedTextReader.
                    string fullLine = reader.GetBuffer();
                    reader.CleanBuffer();

                    long recordStart = _index[_index.Count - 1];
                    _fieldIndex.Add(ParseLineIndex(fullLine, recordStart));
                    _index.Add(reader.Position);
                }
            }
        }

        /// <summary>
        /// Writes <see cref="Index"/> to <paramref name="indexFile"/> as an Int32 count
        /// followed by that many Int64 values, replacing any existing file.
        /// </summary>
        private void Index_SaveFile(string indexFile)
        {
            // FileMode.Create truncates an existing file, so no separate delete is needed.
            using (FileStream streamOut = new FileStream(indexFile, FileMode.Create, FileAccess.Write, FileShare.None))
            using (BinaryWriter binWriter = new BinaryWriter(streamOut))
            {
                binWriter.Write(_index.Count);
                foreach (long offset in _index)
                {
                    binWriter.Write(offset);
                }
            }
        }

        /// <summary>
        /// Reads an index previously written by <see cref="Index_SaveFile"/>.
        /// </summary>
        /// <returns>The stored values, in file order.</returns>
        private static List<long> Index_LoadFile(string indexFile)
        {
            List<long> tempIndex = new List<long>();

            using (FileStream streamIn = new FileStream(indexFile, FileMode.Open, FileAccess.Read, FileShare.Read))
            using (BinaryReader binReader = new BinaryReader(streamIn))
            {
                int numRegs = binReader.ReadInt32();
                for (int i = 0; i < numRegs; i++)
                {
                    tempIndex.Add(binReader.ReadInt64());
                }
            }

            return tempIndex;
        }

        /// <summary>
        /// Loads the record index for <paramref name="file"/> from "<c>file</c>.idx" when
        /// that cache is newer than the data file; otherwise generates the index and
        /// caches it if generation was slow.
        /// </summary>
        /// <param name="file">Path of the CSV file.</param>
        /// <exception cref="ArgumentNullException"><paramref name="file"/> is null.</exception>
        /// <remarks>
        /// The cache file stores only <see cref="Index"/>; when it is used,
        /// <see cref="FieldIndex"/> is left as it was.
        /// </remarks>
        public void LoadIndexOfFile(string file)
        {
            if (file == null)
            {
                throw new ArgumentNullException(nameof(file));
            }

            string indexFile = $"{file}.idx";

            // The cache is trusted only if it was written after the data file was both
            // created and last modified. Comparing creation times alone kept serving a
            // stale index after the data file had been edited in place.
            DateTime dataCreated = File.GetCreationTimeUtc(file);
            DateTime dataWritten = File.GetLastWriteTimeUtc(file);
            DateTime dataStamp = dataCreated > dataWritten ? dataCreated : dataWritten;

            if (File.Exists(indexFile) && File.GetLastWriteTimeUtc(indexFile) > dataStamp)
            {
                _index = Index_LoadFile(indexFile);
                return;
            }

            // Generate index
            DateTime generationStart = DateTime.UtcNow;
            GenerateIndex(file);
            TimeSpan generationTime = DateTime.UtcNow - generationStart;

            // Save index only if generation was expensive
            if (generationTime.TotalSeconds > SaveIndexThresholdSeconds)
            {
                Index_SaveFile(indexFile);
            }
        }
    }
}