From 44f6cb3d0c21d3c12b8d0d03fad1156de1fe9ee1 Mon Sep 17 00:00:00 2001 From: "Valeriano A.R" Date: Tue, 21 Jun 2016 11:24:39 +0200 Subject: [PATCH] PdfTextExtractor: Keep basic graphics state for correct matrix calculations --- VAR.PdfTools/PdfTextExtractor.cs | 184 +++++++++++++++++++++++-------- 1 file changed, 141 insertions(+), 43 deletions(-) diff --git a/VAR.PdfTools/PdfTextExtractor.cs b/VAR.PdfTools/PdfTextExtractor.cs index 90e0391..7c366a5 100644 --- a/VAR.PdfTools/PdfTextExtractor.cs +++ b/VAR.PdfTools/PdfTextExtractor.cs @@ -5,10 +5,20 @@ namespace VAR.PdfTools { public class Vector3D { + #region Declarations + public double[] _vector = new double[3]; + #endregion + + #region Properties + public double[] Vector { get { return _vector; } } + #endregion + + #region Creator + public Vector3D() { Init(); @@ -20,19 +30,35 @@ namespace VAR.PdfTools _vector[1] = 0.0; _vector[2] = 1.0; } + + #endregion } public class Matrix3x3 { + #region Declarations + public double[,] _matrix = new double[3, 3]; + #endregion + + #region Properties + public double[,] Matrix { get { return _matrix; } } + #endregion + + #region Creator + public Matrix3x3() { Idenity(); } + #endregion + + #region Public methods + public void Idenity() { _matrix[0, 0] = 1.0; @@ -91,8 +117,9 @@ namespace VAR.PdfTools return newMatrix; } + #endregion } - + public class PdfTextElement { public PdfFont Font { get; set; } @@ -115,13 +142,19 @@ namespace VAR.PdfTools private PdfDocumentPage _page = null; private List _textElements = new List(); - + + // Graphics state + private List _graphicsMatrixStack = new List(); + private Matrix3x3 _graphicsMatrix = new Matrix3x3(); + + // Text state private PdfFont _font = null; private double _fontSize = 1; private double _textLeading = 0; + // Text object state private bool inText = false; - private Matrix3x3 _matrix = new Matrix3x3(); + private Matrix3x3 _textMatrix = new Matrix3x3(); private StringBuilder _sbText = new StringBuilder(); private double _textWidth = 0; @@ -202,11 +235,11 @@ namespace VAR.PdfTools } PdfTextElement textElem = new PdfTextElement(); - textElem.Matrix = _matrix.Copy(); + textElem.Matrix = _textMatrix.Multiply(_graphicsMatrix); textElem.Font = _font; textElem.RawText = _sbText.ToString(); textElem.VisibleText = PdfString_ToUnicode(textElem.RawText, _font); - textElem.VisibleWidth = _textWidth; + textElem.VisibleWidth = _textWidth * textElem.Matrix.Matrix[0, 0]; _textElements.Add(textElem); _sbText = new StringBuilder(); @@ -217,59 +250,83 @@ namespace VAR.PdfTools #region Operations - private void OpBT() + private void OpPushGraphState() { - _matrix.Idenity(); + _graphicsMatrixStack.Add(_graphicsMatrix.Copy()); + } + + private void OpPopGraphState() + { + _graphicsMatrix = _graphicsMatrixStack[_graphicsMatrixStack.Count - 1]; + _graphicsMatrixStack.RemoveAt(_graphicsMatrixStack.Count - 1); + } + + private void OpSetGraphMatrix(double a, double b, double c, double d, double e, double f) + { + _graphicsMatrix.Matrix[0, 0] = a; + _graphicsMatrix.Matrix[1, 0] = b; + _graphicsMatrix.Matrix[2, 0] = 0; + _graphicsMatrix.Matrix[0, 1] = c; + _graphicsMatrix.Matrix[1, 1] = d; + _graphicsMatrix.Matrix[2, 1] = 0; + _graphicsMatrix.Matrix[0, 2] = e; + _graphicsMatrix.Matrix[1, 2] = f; + _graphicsMatrix.Matrix[2, 2] = 1; + } + + private void OpBeginText() + { + _textMatrix.Idenity(); inText = true; } - private void OpET() + private void OpEndText() { FlushTextElement(); inText = false; } - private void OpTf(string fontName, double size) + private void OpTextFont(string fontName, double size) { FlushTextElement(); _font = _page.Fonts[fontName]; _fontSize = size; } - private void OpTL(double textLeading) + private void OpTextLeading(double textLeading) { _textLeading = textLeading; } - private void OpTd(double x, double y) + private void OpTesDisplace(double x, double y) { FlushTextElement(); var newMatrix = new Matrix3x3(); newMatrix.Matrix[0, 2] = x; newMatrix.Matrix[1, 2] = y; - _matrix = newMatrix.Multiply(_matrix); + _textMatrix = newMatrix.Multiply(_textMatrix); } - private void OpTStar() + private void OpTextLineFeed() { - OpTd(0, -_textLeading); + OpTesDisplace(0, -_textLeading); } - private void OpTm(double a, double b, double c, double d, double e, double f) + private void OpSetTextMatrix(double a, double b, double c, double d, double e, double f) { FlushTextElement(); - _matrix.Matrix[0, 0] = a; - _matrix.Matrix[1, 0] = b; - _matrix.Matrix[2, 0] = 0; - _matrix.Matrix[0, 1] = c; - _matrix.Matrix[1, 1] = d; - _matrix.Matrix[2, 1] = 0; - _matrix.Matrix[0, 2] = e; - _matrix.Matrix[1, 2] = f; - _matrix.Matrix[2, 2] = 1; + _textMatrix.Matrix[0, 0] = a; + _textMatrix.Matrix[1, 0] = b; + _textMatrix.Matrix[2, 0] = 0; + _textMatrix.Matrix[0, 1] = c; + _textMatrix.Matrix[1, 1] = d; + _textMatrix.Matrix[2, 1] = 0; + _textMatrix.Matrix[0, 2] = e; + _textMatrix.Matrix[1, 2] = f; + _textMatrix.Matrix[2, 2] = 1; } - private void OpTj(string text) + private void OpTextPut(string text) { if (inText == false) { return; } _sbText.Append(text); @@ -282,14 +339,14 @@ namespace VAR.PdfTools } } - private void OpTJ(PdfArray array) + private void OpTextPutMultiple(PdfArray array) { if (inText == false) { return; } foreach (IPdfElement elem in array.Values) { if(elem is PdfString) { - OpTj(((PdfString)elem).Value); + OpTextPut(((PdfString)elem).Value); } else if(elem is PdfInteger || elem is PdfReal) { @@ -298,7 +355,7 @@ namespace VAR.PdfTools } else if(elem is PdfArray) { - OpTJ(((PdfArray)elem)); + OpTextPutMultiple(((PdfArray)elem)); } } } @@ -311,37 +368,78 @@ namespace VAR.PdfTools { foreach (PdfContentAction action in _page.ContentActions) { - if (action.Token == "BT") + // Graphics Operations + if (action.Token == "q") { - OpBT(); + OpPushGraphState(); + } + else if (action.Token == "Q") + { + OpPopGraphState(); + } + else if (action.Token == "cm") + { + double a = PdfElement_GetReal(action.Parameters[0], 0); + double b = PdfElement_GetReal(action.Parameters[1], 0); + double c = PdfElement_GetReal(action.Parameters[2], 0); + double d = PdfElement_GetReal(action.Parameters[3], 0); + double e = PdfElement_GetReal(action.Parameters[4], 0); + double f = PdfElement_GetReal(action.Parameters[5], 0); + OpSetGraphMatrix(a, b, c, d, e, f); + } + + // Text Operations + else if (action.Token == "BT") + { + OpBeginText(); } else if (action.Token == "ET") { - OpET(); + OpEndText(); + } + else if (action.Token == "Tc") + { + // FIXME: Char spacing + } + else if (action.Token == "Tw") + { + // FIXME: Word spacing + } + else if (action.Token == "Tz") + { + // FIXME: Horizontal Scale } else if (action.Token == "Tf") { string fontName = ((PdfName)action.Parameters[0]).Value; double fontSize = PdfElement_GetReal(action.Parameters[1], 0); - OpTf(fontName, fontSize); + OpTextFont(fontName, fontSize); } else if (action.Token == "TL") { double leading = PdfElement_GetReal(action.Parameters[0], 0); - OpTL(leading); + OpTextLeading(leading); + } + else if (action.Token == "Tr") + { + // FIXME: Rendering mode + } + else if (action.Token == "Ts") + { + // FIXME: Text rise } else if (action.Token == "Td") { double x = PdfElement_GetReal(action.Parameters[0], 0); double y = PdfElement_GetReal(action.Parameters[1], 0); - OpTd(x, y); + OpTesDisplace(x, y); } else if (action.Token == "TD") { double x = PdfElement_GetReal(action.Parameters[0], 0); double y = PdfElement_GetReal(action.Parameters[1], 0); - OpTL(-y); - OpTd(x, y); + OpTextLeading(-y); + OpTesDisplace(x, y); } else if (action.Token == "Tm") { @@ -351,30 +449,30 @@ namespace VAR.PdfTools double d = PdfElement_GetReal(action.Parameters[3], 0); double e = PdfElement_GetReal(action.Parameters[4], 0); double f = PdfElement_GetReal(action.Parameters[5], 0); - OpTm(a, b, c, d, e, f); + OpSetTextMatrix(a, b, c, d, e, f); } else if (action.Token == "T*") { - OpTStar(); + OpTextLineFeed(); } else if (action.Token == "Tj") { - OpTj(((PdfString)action.Parameters[0]).Value); + OpTextPut(((PdfString)action.Parameters[0]).Value); } else if (action.Token == "'") { - OpTStar(); - OpTj(((PdfString)action.Parameters[0]).Value); + OpTextLineFeed(); + OpTextPut(((PdfString)action.Parameters[0]).Value); } else if (action.Token == "\"") { double wordSpacing = PdfElement_GetReal(action.Parameters[0], 0); double charSpacing = PdfElement_GetReal(action.Parameters[1], 0); - OpTj(((PdfString)action.Parameters[2]).Value); + OpTextPut(((PdfString)action.Parameters[2]).Value); } else if (action.Token == "TJ") { - OpTJ(((PdfArray)action.Parameters[0])); + OpTextPutMultiple(((PdfArray)action.Parameters[0])); } } FlushTextElement();