PdfTextExtractor: Better column and field extraction heuristics

This commit is contained in:
2016-06-24 17:37:19 +02:00
parent da908d0f36
commit 85d998a8d3

View File

@@ -262,13 +262,22 @@ namespace VAR.PdfTools
_textWidth = 0;
}
private void AddTextElement(PdfTextElement textElement)
{
if (string.IsNullOrEmpty(textElement.VisibleText.Trim()))
{
return;
}
_textElements.Add(textElement);
}
private void FlushTextElement()
{
if (_sbText.Length == 0)
{
if (_currentTextElement != null)
{
_textElements.Add(_currentTextElement);
AddTextElement(_currentTextElement);
_currentTextElement = null;
}
return;
@@ -277,24 +286,42 @@ namespace VAR.PdfTools
if (_currentTextElement != null)
{
FlushTextElementSoft();
_textElements.Add(_currentTextElement);
AddTextElement(_currentTextElement);
_currentTextElement = null;
}
else
{
PdfTextElement textElem = BuildTextElement();
_textElements.Add(textElem);
AddTextElement(textElem);
}
_sbText = new StringBuilder();
_textWidth = 0;
}
private string SimplifyText(string text)
{
StringBuilder sbResult = new StringBuilder();
foreach (char c in text)
{
if (c == '.' || c == ',' ||
c == ':' || c == ';' ||
c == '-' || c == '_' ||
c == ' ' || c == '\t')
{
continue;
}
sbResult.Append(char.ToUpper(c));
}
return sbResult.ToString();
}
private PdfTextElement FindElementByText(string text)
{
string simpleText = SimplifyText(text);
foreach (PdfTextElement elem in _textElements)
{
if (elem.VisibleText == text)
if (SimplifyText(elem.VisibleText) == simpleText)
{
return elem;
}
@@ -566,37 +593,65 @@ namespace VAR.PdfTools
return new List<string>();
}
double headY = columnHead.GetY();
double headX1 = columnHead.GetX();
double headX2 = headX1 + columnHead.VisibleWidth;
// Get all the elements that intersects vertically and sort
var columnData = new List<PdfTextElement>();
// Determine horizontal extent
double extentX1 = double.MinValue;
double extentX2 = double.MaxValue;
foreach (PdfTextElement elem in _textElements)
{
if(elem == columnHead){continue;}
if (TextElementHorizontalIntersection(columnHead, elem) == false) { continue; }
double elemX1 = elem.GetX();
double elemX2 = elemX1 + elem.VisibleWidth;
if (elemX2 < headX1)
{
if (elemX2 > extentX1)
{
extentX1 = elemX2;
}
}
if (elemX1 > headX2)
{
if (elemX1 < extentX2)
{
extentX2 = elemX1;
}
}
}
// Get all the elements that intersects vertically, are down and sort results
var columnDataRaw = new List<PdfTextElement>();
foreach (PdfTextElement elem in _textElements)
{
if (TextElementVerticalIntersection(columnHead, elem) == false) { continue; }
// Only intems down the column
double elemY = elem.GetY();
if (elemY >= headY) { continue; }
columnDataRaw.Add(elem);
}
columnDataRaw = columnDataRaw.OrderByDescending(elem => elem.GetY()).ToList();
// Only items completelly inside extents, amd break on the first element outside
var columnData = new List<PdfTextElement>();
foreach (PdfTextElement elem in columnDataRaw)
{
double elemX1 = elem.GetX();
double elemX2 = elemX1 + elem.VisibleWidth;
if (elemX1 < extentX1 || elemX2 > extentX2) { break; }
columnData.Add(elem);
}
columnData = columnData.OrderByDescending(elem => elem.GetY()).ToList();
// Filter only nearest elements
// Emit result
var result = new List<string>();
double prevY = headY;
double medDiff = 0;
bool first = true;
foreach (PdfTextElement elem in columnData)
{
double elemY = elem.GetY();
double diff = prevY - elemY;
prevY = elemY;
if (first)
{
first = false;
medDiff = diff;
}
if (diff > medDiff) { break; }
medDiff = (medDiff + diff) / 2;
result.Add(elem.VisibleText);
}
return result;