PdfTextExtractor: Better column extraction, splitting big TextElements.

This commit is contained in:
2019-10-28 02:57:42 +01:00
parent 781f212289
commit c8c7e32acc
2 changed files with 94 additions and 6 deletions

View File

@@ -100,6 +100,22 @@ namespace VAR.PdfTools
}; };
} }
/// <summary>
/// Gets the horizontal gap between the character at <paramref name="index"/>
/// and the character immediately before it.
/// </summary>
/// <param name="index">Position of the character inside <c>Characters</c>.</param>
/// <returns>
/// The displacement of the character at <paramref name="index"/> minus the end
/// (displacement plus width) of the previous character, or 0 when
/// <paramref name="index"/> is zero or negative.
/// </returns>
public double GetCharacterPreviousSpacing(int index)
{
    // The first character has no predecessor, so there is no gap to measure.
    if (index < 1)
    {
        return 0;
    }

    var previous = Characters[index - 1];
    return Characters[index].Displacement - (previous.Displacement + previous.Width);
}
/// <summary>
/// Gets the horizontal gap between the character at <paramref name="index"/>
/// and the character immediately after it.
/// </summary>
/// <remarks>
/// NOTE(review): despite "Preceding" in the name, this measures the spacing to the
/// following character (index + 1). The name is kept for compatibility with callers.
/// </remarks>
/// <param name="index">Position of the character inside <c>Characters</c>.</param>
/// <returns>
/// The displacement of the next character minus the end (displacement plus width)
/// of the character at <paramref name="index"/>, or 0 when <paramref name="index"/>
/// is the last position or beyond.
/// </returns>
public double GetCharacterPrecedingSpacing(int index)
{
    int lastIndex = Characters.Count - 1;

    // The last character has no successor, so there is no gap to measure.
    if (index >= lastIndex)
    {
        return 0;
    }

    var current = Characters[index];
    var next = Characters[index + 1];
    return next.Displacement - (current.Displacement + current.Width);
}
#endregion #endregion
} }

View File

@@ -170,6 +170,14 @@ namespace VAR.PdfTools
return list; return list;
} }
/// <summary>
/// Checks whether the horizontal span of a text element overlaps a given X range.
/// </summary>
/// <param name="elem1">Element whose span runs from <c>GetX()</c> to <c>GetX() + VisibleWidth</c>.</param>
/// <param name="elem2X1">Start of the X range to compare against.</param>
/// <param name="elem2X2">End of the X range to compare against.</param>
/// <returns>
/// <c>true</c> when the element ends at or after <paramref name="elem2X1"/> and the
/// range ends at or after the element's start; touching edges count as overlapping.
/// </returns>
private bool TextElementVerticalIntersection(PdfTextElement elem1, double elem2X1, double elem2X2)
{
    double left = elem1.GetX();
    double right = left + elem1.VisibleWidth;

    bool endsAfterRangeStart = right >= elem2X1;
    bool rangeEndsAfterStart = elem2X2 >= left;
    return endsAfterRangeStart && rangeEndsAfterStart;
}
private bool TextElementVerticalIntersection(PdfTextElement elem1, PdfTextElement elem2) private bool TextElementVerticalIntersection(PdfTextElement elem1, PdfTextElement elem2)
{ {
double elem1X1 = elem1.GetX(); double elem1X1 = elem1.GetX();
@@ -699,14 +707,20 @@ namespace VAR.PdfTools
extentX2 = elemX1; extentX2 = elemX1;
} }
} }
} }
PdfTextElementColumn columnData = GetColumn(columnHead, headY, headX1, headX2, extentX1, extentX2);
return columnData;
}
public PdfTextElementColumn GetColumn(PdfTextElement columnHead, double headY, double headX1, double headX2, double extentX1, double extentX2)
{
// Get all the elements that intersects vertically, are down and sort results // Get all the elements that intersects vertically, are down and sort results
var columnDataRaw = new List<PdfTextElement>(); var columnDataRaw = new List<PdfTextElement>();
foreach (PdfTextElement elem in _textElements) foreach (PdfTextElement elem in _textElements)
{ {
if (TextElementVerticalIntersection(columnHead, elem) == false) { continue; } if (TextElementVerticalIntersection(elem, headX1, headX2) == false) { continue; }
// Only intems down the column // Only intems down the column
double elemY = elem.GetY(); double elemY = elem.GetY();
@@ -716,19 +730,77 @@ namespace VAR.PdfTools
} }
columnDataRaw = columnDataRaw.OrderByDescending(elem => elem.GetY()).ToList(); columnDataRaw = columnDataRaw.OrderByDescending(elem => elem.GetY()).ToList();
// Only items completelly inside extents, and break on the first element outside // Only items completelly inside extents, try spliting big elements and break on big elements that can't be splitted
var columnElements = new List<PdfTextElement>(); var columnElements = new List<PdfTextElement>();
foreach (PdfTextElement elem in columnDataRaw) foreach (PdfTextElement elem in columnDataRaw)
{ {
double elemX1 = elem.GetX(); double elemX1 = elem.GetX();
double elemX2 = elemX1 + elem.VisibleWidth; double elemX2 = elemX1 + elem.VisibleWidth;
if (elemX1 < extentX1 || elemX2 > extentX2) { break; }
columnElements.Add(elem); // Add elements completely inside
if (elemX1 > extentX1 && elemX2 < extentX2)
{
columnElements.Add(elem);
continue;
}
// Try to split elements intersecting extents of the column
double maxSpacing = elem.Characters.Average(c => c.Width) / 10;
int indexStart = 0;
int indexEnd = elem.Characters.Count - 1;
bool indexStartValid = true;
bool indexEndValid = true;
if (elemX1 < extentX1)
{
// Search best start
int index = 0;
double characterPosition = elemX1 + elem.Characters[index].Displacement;
while (characterPosition < extentX1 && index < (elem.Characters.Count - 1))
{
index++;
characterPosition = elemX1 + elem.Characters[index].Displacement;
}
double spacing = elem.GetCharacterPreviousSpacing(index);
while (spacing < maxSpacing && index < (elem.Characters.Count - 1))
{
index++;
spacing = elem.GetCharacterPreviousSpacing(index);
}
if (spacing < maxSpacing) { indexStartValid = false; }
indexStart = index;
}
if (elemX2 > extentX2)
{
// Search best end
int index = elem.Characters.Count - 1;
double characterPosition = elemX1 + elem.Characters[index].Displacement + elem.Characters[index].Width;
while (characterPosition > extentX2 && index > 0)
{
index--;
characterPosition = elemX1 + elem.Characters[index].Displacement + elem.Characters[index].Width;
}
double spacing = elem.GetCharacterPrecedingSpacing(index);
while (spacing < maxSpacing && index > 0)
{
index--;
spacing = elem.GetCharacterPrecedingSpacing(index);
}
if (spacing < maxSpacing) { indexEndValid = false; }
indexEnd = index;
}
// Break when there is no good split, spaning all extent
if (indexStartValid == false && indexEndValid == false) { break; }
// Continue when only one of the sides is invalid. (outside elements intersecting extents of the column)
if (indexStartValid == false || indexEndValid == false) { continue; }
// Add splitted element
columnElements.Add(elem.SubPart(indexStart, indexEnd + 1));
} }
var columnData = new PdfTextElementColumn(columnHead, columnElements, headY, extentX1, extentX2); var columnData = new PdfTextElementColumn(columnHead, columnElements, headY, extentX1, extentX2);
return columnData; return columnData;
} }