From bbafe91037b9baef1df3ad30f655f072c56ea52c Mon Sep 17 00:00:00 2001
From: "Valeriano A.R"
Date: Mon, 20 Jun 2016 18:23:45 +0200
Subject: [PATCH] PdfTextElement: Extract text elements with coordinates and size

---
Review notes (this section is ignored when the patch is applied):
 * PdfTextExtractor: the " operator now moves to the next line before showing
   its string, exactly as the ' operator already did.
 * PdfTextExtractor: PdfTextElement.TextSize is now filled in when a run is
   flushed (it was declared but never assigned).
 * PdfTextExtractor: matrix helpers and the operator dispatch were restyled
   without changing results; XML documentation was added.
 * FrmPdfInfo / PdfParser: Count(predicate) instead of Where().Count(); the
   commented-out throw was replaced by an explanatory comment.
 * Line structure, indentation and blank context lines of this patch were
   reconstructed; the blob ids on the "index" lines were not recomputed.

 VAR.PdfTools.Workbench/FrmPdfInfo.cs |  17 +-
 VAR.PdfTools/PdfParser.cs            |  12 +
 VAR.PdfTools/PdfTextExtractor.cs     | 464 +++++++++++++++++++++++++++
 VAR.PdfTools/VAR.PdfTools.csproj     |   1 +
 4 files changed, 491 insertions(+), 3 deletions(-)
 create mode 100644 VAR.PdfTools/PdfTextExtractor.cs

diff --git a/VAR.PdfTools.Workbench/FrmPdfInfo.cs b/VAR.PdfTools.Workbench/FrmPdfInfo.cs
index d82d066..044169a 100644
--- a/VAR.PdfTools.Workbench/FrmPdfInfo.cs
+++ b/VAR.PdfTools.Workbench/FrmPdfInfo.cs
@@ -33,7 +33,7 @@ namespace VAR.PdfTools.Workbench
                 txtPdfPath.Text = dlgFile.FileName;
             }
         }
-        
+
         private void btnProcess_Click(object sender, EventArgs e)
         {
             if (System.IO.File.Exists(txtPdfPath.Text) == false)
@@ -45,6 +45,7 @@ namespace VAR.PdfTools.Workbench
 
             PdfDocument doc = PdfDocument.Load(txtPdfPath.Text);
             int nObjects = doc.Objects.Count;
+            int nRootObject = doc.Objects.Count(obj => obj.UsageCount == 0);
             List<PdfStream> streams = doc.Objects
                 .Where(obj => obj.Data.Type == PdfElementTypes.Stream)
                 .Select(obj => (PdfStream)obj.Data)
@@ -55,11 +56,21 @@ namespace VAR.PdfTools.Workbench
             List<string> lines = new List<string>();
             lines.Add(string.Format("Filename : {0}", System.IO.Path.GetFileNameWithoutExtension(txtPdfPath.Text)));
             lines.Add(string.Format("Number of Objects : {0}", nObjects));
+            lines.Add(string.Format("Number of Roots : {0}", nRootObject));
             lines.Add(string.Format("Number of Streams : {0}", nStreams));
             lines.Add(string.Format("Number of Pages : {0}", nPages));
 
-            
-            txtOutput.Lines = lines.ToArray();
+            foreach (PdfDocumentPage page in doc.Pages)
+            {
+                PdfTextExtractor extractor = new PdfTextExtractor(page);
+                foreach (PdfTextElement textElement in extractor.Elements)
+                {
+                    lines.Add(string.Format("Text({0}, {1})({2}): \"{3}\"",
+                        textElement.Matrix.Matrix[0, 2], textElement.Matrix.Matrix[1, 2], textElement.VisibleWidth, textElement.VisibleText));
+                }
+            }
+
+            txtOutput.Lines = lines.ToArray();
         }
 
     }
diff --git a/VAR.PdfTools/PdfParser.cs b/VAR.PdfTools/PdfParser.cs
index 89f377c..0b91e06 100644
--- a/VAR.PdfTools/PdfParser.cs
+++ b/VAR.PdfTools/PdfParser.cs
@@ -44,6 +44,13 @@ namespace VAR.PdfTools
             return 0;
         }
 
+        private byte[] GetRawData(long start, long size)
+        {
+            byte[] newArray = new byte[size];
+            Array.Copy(_stream, start, newArray, 0, size);
+            return newArray;
+        }
+
         private byte[] GetRawData(long length)
         {
             var memStream = new MemoryStream();
@@ -907,6 +914,11 @@ namespace VAR.PdfTools
                 else
                 {
                     string token = ParseToken();
+                    if (string.IsNullOrEmpty(token))
+                    {
+                        // No operator token was read: stop here rather than record an action with an empty token.
+                        break;
+                    }
                     PdfContentAction action = new PdfContentAction(token, elems);
                     elems = new List<IPdfElement>();
                     actions.Add(action);
diff --git a/VAR.PdfTools/PdfTextExtractor.cs b/VAR.PdfTools/PdfTextExtractor.cs
new file mode 100644
index 0000000..8260048
--- /dev/null
+++ b/VAR.PdfTools/PdfTextExtractor.cs
@@ -0,0 +1,464 @@
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+
+namespace VAR.PdfTools
+{
+    /// <summary>
+    /// Three-component vector; a new instance is initialised to (0, 0, 1).
+    /// </summary>
+    public class Vector3D
+    {
+        public double[] _vector = new double[3];
+
+        public double[] Vector { get { return _vector; } }
+
+        public Vector3D()
+        {
+            Init();
+        }
+
+        /// <summary>
+        /// Resets the components to (0, 0, 1).
+        /// </summary>
+        public void Init()
+        {
+            _vector[0] = 0.0;
+            _vector[1] = 0.0;
+            _vector[2] = 1.0;
+        }
+    }
+
+    /// <summary>
+    /// 3x3 matrix of doubles indexed as [row, column]; a new instance is the identity.
+    /// </summary>
+    public class Matrix3x3
+    {
+        public double[,] _matrix = new double[3, 3];
+
+        public double[,] Matrix { get { return _matrix; } }
+
+        public Matrix3x3()
+        {
+            Idenity();
+        }
+
+        /// <summary>
+        /// Overwrites every cell so that this matrix becomes the identity.
+        /// </summary>
+        public void Idenity()
+        {
+            for (int row = 0; row < 3; row++)
+            {
+                for (int col = 0; col < 3; col++)
+                {
+                    _matrix[row, col] = (row == col) ? 1.0 : 0.0;
+                }
+            }
+        }
+
+        /// <summary>
+        /// Applies this matrix to a vector: result[i] is row i of the matrix dotted with the vector.
+        /// </summary>
+        /// <param name="vect">Vector to transform; it is not modified.</param>
+        /// <returns>A new vector holding the product.</returns>
+        public Vector3D Multiply(Vector3D vect)
+        {
+            Vector3D vectResult = new Vector3D();
+            for (int row = 0; row < 3; row++)
+            {
+                vectResult.Vector[row] =
+                    (vect.Vector[0] * _matrix[row, 0]) +
+                    (vect.Vector[1] * _matrix[row, 1]) +
+                    (vect.Vector[2] * _matrix[row, 2]);
+            }
+            return vectResult;
+        }
+
+        /// <summary>
+        /// Matrix product with <paramref name="matrix"/> as the LEFT operand:
+        /// result[i, j] = sum over k of matrix[i, k] * this[k, j].
+        /// </summary>
+        /// <param name="matrix">Left-hand operand; it is not modified.</param>
+        /// <returns>A new matrix; this instance is not modified either.</returns>
+        public Matrix3x3 Multiply(Matrix3x3 matrix)
+        {
+            Matrix3x3 newMatrix = new Matrix3x3();
+            for (int row = 0; row < 3; row++)
+            {
+                for (int col = 0; col < 3; col++)
+                {
+                    newMatrix._matrix[row, col] =
+                        (_matrix[0, col] * matrix._matrix[row, 0]) +
+                        (_matrix[1, col] * matrix._matrix[row, 1]) +
+                        (_matrix[2, col] * matrix._matrix[row, 2]);
+                }
+            }
+            return newMatrix;
+        }
+
+        /// <summary>
+        /// Creates an independent copy of this matrix.
+        /// </summary>
+        /// <returns>A new matrix holding the same nine values.</returns>
+        public Matrix3x3 Copy()
+        {
+            Matrix3x3 newMatrix = new Matrix3x3();
+            Array.Copy(_matrix, newMatrix._matrix, _matrix.Length);
+            return newMatrix;
+        }
+    }
+
+    /// <summary>
+    /// One run of text found on a page, together with the state captured when it was emitted.
+    /// </summary>
+    public class PdfTextElement
+    {
+        /// <summary>Font selected when the run was emitted; may be null.</summary>
+        public PdfFont Font { get; set; }
+
+        /// <summary>Size operand of the Tf operator in effect for the run (1 until a Tf is seen).</summary>
+        public double TextSize { get; set; }
+
+        /// <summary>Copy of the text matrix taken when the run was emitted.</summary>
+        public Matrix3x3 Matrix { get; set; }
+
+        /// <summary>Concatenated string operands as read from the content stream.</summary>
+        public string RawText { get; set; }
+
+        /// <summary>RawText mapped through the font's ToUnicode (NUL removed); RawText itself when no font is set.</summary>
+        public string VisibleText { get; set; }
+
+        /// <summary>Width accumulated by the extractor while collecting the run.</summary>
+        public double VisibleWidth { get; set; }
+    }
+
+    /// <summary>
+    /// Walks the content actions of a page and collects its text runs.
+    /// Only BT, ET, Tf, TL, Td, TD, Tm, T*, Tj, ', " and TJ are interpreted;
+    /// every other operator is ignored.
+    /// </summary>
+    public class PdfTextExtractor
+    {
+        #region Declarations
+
+        private PdfDocumentPage _page = null;
+
+        private List<PdfTextElement> _textElements = new List<PdfTextElement>();
+
+        private PdfFont _font = null;
+        private double _fontSize = 1;
+        private double _textLeading = 0;
+
+        private bool _inText = false;
+        private Matrix3x3 _matrix = new Matrix3x3();
+        private StringBuilder _sbText = new StringBuilder();
+        private double _textWidth = 0;
+
+        #endregion
+
+        #region Properties
+
+        public PdfDocumentPage Page { get { return _page; } }
+
+        public List<PdfTextElement> Elements { get { return _textElements; } }
+
+        #endregion
+
+        #region lifecycle
+
+        /// <summary>
+        /// Creates the extractor and immediately processes <paramref name="page"/>;
+        /// the runs found are then available through <see cref="Elements"/>.
+        /// </summary>
+        /// <param name="page">Page whose content actions are scanned.</param>
+        public PdfTextExtractor(PdfDocumentPage page)
+        {
+            _page = page;
+            ProcessPage();
+        }
+
+        #endregion
+
+        #region Utility methods
+
+        /// <summary>
+        /// Concatenates the values of a string element or, recursively, of the
+        /// strings inside an array; any other element yields an empty string.
+        /// </summary>
+        private string PdfElement_GetOnlyStrings(IPdfElement elem)
+        {
+            if (elem is PdfString)
+            {
+                return ((PdfString)elem).Value;
+            }
+
+            PdfArray array = elem as PdfArray;
+            if (array == null)
+            {
+                return string.Empty;
+            }
+
+            var sbText = new StringBuilder();
+            foreach (IPdfElement subElem in array.Values)
+            {
+                sbText.Append(PdfElement_GetOnlyStrings(subElem));
+            }
+            return sbText.ToString();
+        }
+
+        /// <summary>
+        /// Reads an integer or real element as a double, or returns
+        /// <paramref name="defaultValue"/> for any other element.
+        /// </summary>
+        private double PdfElement_GetReal(IPdfElement elem, double defaultValue)
+        {
+            if (elem is PdfInteger)
+            {
+                return ((PdfInteger)elem).Value;
+            }
+            if (elem is PdfReal)
+            {
+                return ((PdfReal)elem).Value;
+            }
+            return defaultValue;
+        }
+
+        /// <summary>
+        /// Maps each character through <paramref name="font"/> and drops NUL characters;
+        /// without a font the text is returned unchanged.
+        /// </summary>
+        private string PdfString_ToUnicode(string text, PdfFont font)
+        {
+            if (font == null)
+            {
+                return text;
+            }
+
+            StringBuilder sbText = new StringBuilder();
+            foreach (char c in text)
+            {
+                sbText.Append(font.ToUnicode(c).Replace("\0", ""));
+            }
+            return sbText.ToString();
+        }
+
+        /// <summary>
+        /// Emits the pending text, if any, as a new element and resets the text
+        /// buffer and the accumulated width.
+        /// </summary>
+        private void FlushTextElement()
+        {
+            if (_sbText.Length == 0)
+            {
+                return;
+            }
+
+            PdfTextElement textElem = new PdfTextElement();
+            textElem.Matrix = _matrix.Copy();
+            textElem.Font = _font;
+            // TextSize was declared but never filled in, so every element reported 0.
+            textElem.TextSize = _fontSize;
+            textElem.RawText = _sbText.ToString();
+            textElem.VisibleText = PdfString_ToUnicode(textElem.RawText, _font);
+            textElem.VisibleWidth = _textWidth;
+            _textElements.Add(textElem);
+
+            _sbText = new StringBuilder();
+            _textWidth = 0;
+        }
+
+        #endregion
+
+        #region Operations
+
+        /// <summary>BT: resets the text matrix and starts accepting text.</summary>
+        private void OpBT()
+        {
+            _matrix.Idenity();
+            _inText = true;
+        }
+
+        /// <summary>ET: emits the pending run and stops accepting text.</summary>
+        private void OpET()
+        {
+            FlushTextElement();
+            _inText = false;
+        }
+
+        /// <summary>Tf: emits the pending run, then selects a page font and size.</summary>
+        private void OpTf(string fontName, double size)
+        {
+            FlushTextElement();
+            _font = _page.Fonts[fontName];
+            _fontSize = size;
+        }
+
+        /// <summary>TL: stores the leading used by T*.</summary>
+        private void OpTL(double textLeading)
+        {
+            _textLeading = textLeading;
+        }
+
+        /// <summary>
+        /// Td: emits the pending run, then combines a translation by (x, y) with the
+        /// current text matrix.
+        /// </summary>
+        private void OpTd(double x, double y)
+        {
+            FlushTextElement();
+            var newMatrix = new Matrix3x3();
+            newMatrix.Matrix[0, 2] = x;
+            newMatrix.Matrix[1, 2] = y;
+            _matrix = newMatrix.Multiply(_matrix);
+        }
+
+        /// <summary>T*: moves by the current leading, i.e. Td(0, -leading).</summary>
+        private void OpTStar()
+        {
+            OpTd(0, -_textLeading);
+        }
+
+        /// <summary>
+        /// Tm: emits the pending run, then replaces the text matrix. The operands are
+        /// stored column-wise: (a, b), (c, d) and (e, f) fill columns 0, 1 and 2.
+        /// </summary>
+        private void OpTm(double a, double b, double c, double d, double e, double f)
+        {
+            FlushTextElement();
+            _matrix.Matrix[0, 0] = a;
+            _matrix.Matrix[1, 0] = b;
+            _matrix.Matrix[2, 0] = 0;
+            _matrix.Matrix[0, 1] = c;
+            _matrix.Matrix[1, 1] = d;
+            _matrix.Matrix[2, 1] = 0;
+            _matrix.Matrix[0, 2] = e;
+            _matrix.Matrix[1, 2] = f;
+            _matrix.Matrix[2, 2] = 1;
+        }
+
+        /// <summary>
+        /// Tj: appends text to the pending run and, when a font is selected, adds
+        /// each character width times the font size. Ignored outside BT/ET.
+        /// </summary>
+        private void OpTj(string text)
+        {
+            if (_inText == false) { return; }
+            _sbText.Append(text);
+            if (_font != null)
+            {
+                foreach (char c in text)
+                {
+                    _textWidth += _font.GetCharWidth(c) * _fontSize;
+                }
+            }
+        }
+
+        /// <summary>
+        /// TJ: shows every string of the array, recursing into nested arrays, and
+        /// adds numeric entries to the accumulated width. Ignored outside BT/ET.
+        /// </summary>
+        private void OpTJ(PdfArray array)
+        {
+            if (_inText == false) { return; }
+            foreach (IPdfElement elem in array.Values)
+            {
+                if (elem is PdfString)
+                {
+                    OpTj(((PdfString)elem).Value);
+                }
+                else if (elem is PdfInteger || elem is PdfReal)
+                {
+                    // NOTE(review): the PDF reference expresses TJ adjustments in
+                    // thousandths of text space and SUBTRACTS them; here the raw value
+                    // is added. Confirm the unit of GetCharWidth before changing this.
+                    _textWidth += PdfElement_GetReal(elem, 0);
+                }
+                else if (elem is PdfArray)
+                {
+                    OpTJ((PdfArray)elem);
+                }
+            }
+        }
+
+        #endregion
+
+        #region Private methods
+
+        /// <summary>
+        /// Dispatches every content action of the page to its operator handler and
+        /// emits whatever text is still pending at the end.
+        /// </summary>
+        private void ProcessPage()
+        {
+            foreach (PdfContentAction action in _page.ContentActions)
+            {
+                switch (action.Token)
+                {
+                    case "BT":
+                        OpBT();
+                        break;
+                    case "ET":
+                        OpET();
+                        break;
+                    case "Tf":
+                        OpTf(
+                            ((PdfName)action.Parameters[0]).Value,
+                            PdfElement_GetReal(action.Parameters[1], 0));
+                        break;
+                    case "TL":
+                        OpTL(PdfElement_GetReal(action.Parameters[0], 0));
+                        break;
+                    case "Td":
+                        OpTd(
+                            PdfElement_GetReal(action.Parameters[0], 0),
+                            PdfElement_GetReal(action.Parameters[1], 0));
+                        break;
+                    case "TD":
+                    {
+                        // TD also sets the leading to -y before moving.
+                        double x = PdfElement_GetReal(action.Parameters[0], 0);
+                        double y = PdfElement_GetReal(action.Parameters[1], 0);
+                        OpTL(-y);
+                        OpTd(x, y);
+                        break;
+                    }
+                    case "Tm":
+                        OpTm(
+                            PdfElement_GetReal(action.Parameters[0], 0),
+                            PdfElement_GetReal(action.Parameters[1], 0),
+                            PdfElement_GetReal(action.Parameters[2], 0),
+                            PdfElement_GetReal(action.Parameters[3], 0),
+                            PdfElement_GetReal(action.Parameters[4], 0),
+                            PdfElement_GetReal(action.Parameters[5], 0));
+                        break;
+                    case "T*":
+                        OpTStar();
+                        break;
+                    case "Tj":
+                        OpTj(((PdfString)action.Parameters[0]).Value);
+                        break;
+                    case "'":
+                        OpTStar();
+                        OpTj(((PdfString)action.Parameters[0]).Value);
+                        break;
+                    case "\"":
+                        // Fixed: like ', this operator moves to the next line before
+                        // showing its string (third operand). Its word and character
+                        // spacing operands are not modelled by this extractor.
+                        OpTStar();
+                        OpTj(((PdfString)action.Parameters[2]).Value);
+                        break;
+                    case "TJ":
+                        OpTJ((PdfArray)action.Parameters[0]);
+                        break;
+                }
+            }
+            FlushTextElement();
+        }
+
+        #endregion
+    }
+}
diff --git a/VAR.PdfTools/VAR.PdfTools.csproj b/VAR.PdfTools/VAR.PdfTools.csproj
index 8859b7d..6151639 100644
--- a/VAR.PdfTools/VAR.PdfTools.csproj
+++ b/VAR.PdfTools/VAR.PdfTools.csproj
@@ -46,6 +46,7 @@
+