PdfTextExtractor: Apply simple heuristics to join different text blocks by checking matrix "collinearity".
This commit is contained in:
@@ -1,4 +1,5 @@
|
||||
using System.Collections.Generic;
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using System.Text;
|
||||
|
||||
@@ -56,6 +57,11 @@ namespace VAR.PdfTools
|
||||
Idenity();
|
||||
}
|
||||
|
||||
/// <summary>
/// Creates a matrix and fills its cells from the six given coefficients.
/// </summary>
/// <param name="a">Value stored in cell [0, 0].</param>
/// <param name="b">Value stored in cell [1, 0].</param>
/// <param name="c">Value stored in cell [0, 1].</param>
/// <param name="d">Value stored in cell [1, 1].</param>
/// <param name="e">Value stored in cell [0, 2].</param>
/// <param name="f">Value stored in cell [1, 2].</param>
public Matrix3x3(double a, double b, double c, double d, double e, double f)
{
    // Cell assignment is delegated to Set so construction and re-assignment share one code path.
    this.Set(a, b, c, d, e, f);
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Public methods
|
||||
@@ -73,6 +79,19 @@ namespace VAR.PdfTools
|
||||
_matrix[2, 2] = 1.0;
|
||||
}
|
||||
|
||||
/// <summary>
/// Overwrites all nine cells of the matrix: six from the given coefficients,
/// the remaining three with the constants 0, 0 and 1.
/// </summary>
/// <param name="a">Value stored in cell [0, 0].</param>
/// <param name="b">Value stored in cell [1, 0].</param>
/// <param name="c">Value stored in cell [0, 1].</param>
/// <param name="d">Value stored in cell [1, 1].</param>
/// <param name="e">Value stored in cell [0, 2].</param>
/// <param name="f">Value stored in cell [1, 2].</param>
public void Set(double a, double b, double c, double d, double e, double f)
{
    // Cells with first index 0.
    _matrix[0, 0] = a;
    _matrix[0, 1] = c;
    _matrix[0, 2] = e;

    // Cells with first index 1.
    _matrix[1, 0] = b;
    _matrix[1, 1] = d;
    _matrix[1, 2] = f;

    // Cells with first index 2 are not caller-controlled: always 0, 0, 1.
    _matrix[2, 0] = 0.0;
    _matrix[2, 1] = 0.0;
    _matrix[2, 2] = 1.0;
}
|
||||
|
||||
public Vector3D Multiply(Vector3D vect)
|
||||
{
|
||||
Vector3D vectResult = new Vector3D();
|
||||
@@ -118,6 +137,19 @@ namespace VAR.PdfTools
|
||||
return newMatrix;
|
||||
}
|
||||
|
||||
/// <summary>
/// Tells whether <paramref name="otherMatrix"/> matches this matrix closely enough:
/// cells [0, 0], [1, 0], [0, 1] and [1, 1] must agree within a fixed tolerance of 0.00001,
/// while cells [0, 2] and [1, 2] may differ by up to the supplied deltas.
/// </summary>
/// <param name="otherMatrix">Matrix to compare against.</param>
/// <param name="horizontalDelta">Largest accepted absolute difference for cell [0, 2].</param>
/// <param name="verticalDelta">Largest accepted absolute difference for cell [1, 2].</param>
/// <returns><c>true</c> when every compared cell is within its tolerance; otherwise <c>false</c>.</returns>
public bool IsCollinear(Matrix3x3 otherMatrix, double horizontalDelta = 0.00001, double verticalDelta = 0.00001)
{
    const double linearTolerance = 0.00001;
    var other = otherMatrix.Matrix;

    // The four upper-left cells must be (nearly) identical.
    // Comparisons are written as "<=" on purpose so that a NaN difference yields false.
    bool sameLinearPart =
        Math.Abs(_matrix[0, 0] - other[0, 0]) <= linearTolerance &&
        Math.Abs(_matrix[1, 0] - other[1, 0]) <= linearTolerance &&
        Math.Abs(_matrix[0, 1] - other[0, 1]) <= linearTolerance &&
        Math.Abs(_matrix[1, 1] - other[1, 1]) <= linearTolerance;
    if (!sameLinearPart)
    {
        return false;
    }

    // The [0, 2] and [1, 2] cells get their own, caller-tunable tolerances.
    bool closeHorizontally = Math.Abs(_matrix[0, 2] - other[0, 2]) <= horizontalDelta;
    bool closeVertically = Math.Abs(_matrix[1, 2] - other[1, 2]) <= verticalDelta;
    return closeHorizontally && closeVertically;
}
|
||||
|
||||
#endregion
|
||||
}
|
||||
|
||||
@@ -367,15 +399,7 @@ namespace VAR.PdfTools
|
||||
|
||||
/// <summary>
/// Loads the six given coefficients into the current graphics matrix.
/// </summary>
/// <param name="a">Value for cell [0, 0].</param>
/// <param name="b">Value for cell [1, 0].</param>
/// <param name="c">Value for cell [0, 1].</param>
/// <param name="d">Value for cell [1, 1].</param>
/// <param name="e">Value for cell [0, 2].</param>
/// <param name="f">Value for cell [1, 2].</param>
private void OpSetGraphMatrix(double a, double b, double c, double d, double e, double f)
{
    // Set writes all nine cells (the six coefficients plus the constant 0, 0, 1 cells),
    // so the former element-by-element assignments were pure duplication and are gone.
    _graphicsMatrix.Set(a, b, c, d, e, f);
}
|
||||
|
||||
private void OpBeginText()
|
||||
@@ -418,16 +442,35 @@ namespace VAR.PdfTools
|
||||
|
||||
/// <summary>
/// Handles a request to set the text matrix to the six given coefficients,
/// applying heuristics that avoid splitting text which continues on the same line.
/// </summary>
/// <remarks>
/// Three outcomes are possible:
/// <list type="number">
/// <item>The new matrix is collinear with the current text matrix (within the current
/// text width plus one space width): the request is ignored and nothing changes.</item>
/// <item>The new matrix is collinear with the current text element's matrix (within the
/// element's visible width plus one space width): the element is soft-flushed and the
/// new matrix becomes the text matrix.</item>
/// <item>Otherwise: the element is flushed and the new matrix becomes the text matrix.</item>
/// </list>
/// </remarks>
/// <param name="a">Value for cell [0, 0].</param>
/// <param name="b">Value for cell [1, 0].</param>
/// <param name="c">Value for cell [0, 1].</param>
/// <param name="d">Value for cell [1, 1].</param>
/// <param name="e">Value for cell [0, 2].</param>
/// <param name="f">Value for cell [1, 2].</param>
private void OpSetTextMatrix(double a, double b, double c, double d, double e, double f)
{
    Matrix3x3 newMatrix = new Matrix3x3(a, b, c, d, e, f);

    // Width reported for a space character, scaled by the font size; stays 0 without a font.
    double spaceWidth = 0;
    if (_font != null)
    {
        spaceWidth = _font.GetCharWidth(' ') * _fontSize;
    }

    // Case 1: the new position is still within reach of the text written so far,
    // so the current text matrix is kept untouched.
    if (_textMatrix.IsCollinear(newMatrix, horizontalDelta: _textWidth + spaceWidth))
    {
        return;
    }

    if (_currentTextElement != null)
    {
        // Prefer the element's own font metrics; when the element has no font the
        // value computed above is intentionally reused.
        if (_currentTextElement.Font != null)
        {
            spaceWidth = _currentTextElement.Font.GetCharWidth(' ') * _currentTextElement.FontSize;
        }

        // Case 2: the new position lines up with the element being built.
        double elementDelta = _currentTextElement.VisibleWidth + spaceWidth;
        if (_currentTextElement.Matrix.IsCollinear(newMatrix, horizontalDelta: elementDelta))
        {
            FlushTextElementSoft();
            _textMatrix = newMatrix;
            return;
        }
    }

    // Case 3: unrelated position. Flush, then swap in the new matrix instance.
    // The old instance is deliberately NOT written to before being replaced: those
    // stores were dead for _textMatrix itself and would alter the old instance for
    // anything still holding a reference to it.
    FlushTextElement();
    _textMatrix = newMatrix;
}
|
||||
|
||||
private void OpTextPut(string text)
|
||||
|
||||
Reference in New Issue
Block a user