diff --git a/VAR.PdfTools.Workbench/FrmPdfInfo.cs b/VAR.PdfTools.Workbench/FrmPdfInfo.cs index 609a27e..93da119 100644 --- a/VAR.PdfTools.Workbench/FrmPdfInfo.cs +++ b/VAR.PdfTools.Workbench/FrmPdfInfo.cs @@ -68,10 +68,10 @@ namespace VAR.PdfTools.Workbench { PdfArray cropBox = page.BaseData.Values["CropBox"] as PdfArray; lines.Add(string.Format("Page({0} of {1}): {2} {3} {4} {5}", pageNumber, doc.Pages.Count, - ((PdfInteger)cropBox.Values[0]).Value, - ((PdfInteger)cropBox.Values[1]).Value, - ((PdfInteger)cropBox.Values[2]).Value, - ((PdfInteger)cropBox.Values[3]).Value)); + PdfElementUtils.GetReal(cropBox.Values[0], 0), + PdfElementUtils.GetReal(cropBox.Values[1], 0), + PdfElementUtils.GetReal(cropBox.Values[2], 0), + PdfElementUtils.GetReal(cropBox.Values[3], 0))); } else { @@ -82,8 +82,9 @@ namespace VAR.PdfTools.Workbench PdfTextExtractor extractor = new PdfTextExtractor(page); foreach (PdfTextElement textElement in extractor.Elements) { - lines.Add(string.Format("Text({0}, {1})({2}): \"{3}\"", - textElement.Matrix.Matrix[0, 2], textElement.Matrix.Matrix[1, 2], textElement.VisibleWidth, textElement.VisibleText)); + lines.Add(string.Format("Text({0}, {1})({2}, {3}): \"{4}\"", + textElement.Matrix.Matrix[0, 2], textElement.Matrix.Matrix[1, 2], textElement.VisibleWidth, textElement.VisibleHeight, + textElement.VisibleText)); } } diff --git a/VAR.PdfTools/PdfElements.cs b/VAR.PdfTools/PdfElements.cs index 9e82bc7..f9b3192 100644 --- a/VAR.PdfTools/PdfElements.cs +++ b/VAR.PdfTools/PdfElements.cs @@ -163,4 +163,20 @@ namespace VAR.PdfTools public IPdfElement Data { get; set; } public int UsageCount { get; set; } = 0; } + + public static class PdfElementUtils + { + public static double GetReal(IPdfElement elem, double defaultValue) + { + if (elem is PdfInteger) + { + return ((PdfInteger)elem).Value; + } + if (elem is PdfReal) + { + return ((PdfReal)elem).Value; + } + return defaultValue; + } + } } diff --git a/VAR.PdfTools/PdfFont.cs b/VAR.PdfTools/PdfFont.cs index 
108743d..3fb3d23 100644 --- a/VAR.PdfTools/PdfFont.cs +++ b/VAR.PdfTools/PdfFont.cs @@ -13,12 +13,16 @@ namespace VAR.PdfTools private Dictionary _widths = null; + private double _height = 1.0; + #endregion #region Properties public PdfDictionary BaseData { get { return _baseData; } } + public double Height { get { return _height; } } + #endregion #region Life cycle @@ -39,13 +43,13 @@ namespace VAR.PdfTools _toUnicode = parser.ParseToUnicode(); } - if (BaseData.Values.ContainsKey("FirstChar") && baseData.Values.ContainsKey("LastChar") && baseData.Values.ContainsKey("Widths")) + if (_baseData.Values.ContainsKey("FirstChar") && _baseData.Values.ContainsKey("LastChar") && _baseData.Values.ContainsKey("Widths")) { double glyphSpaceToTextSpace = 1000.0; // FIXME: SubType:Type3 Uses a FontMatrix that may not correspond to 1/1000th _widths = new Dictionary(); - char firstChar = (char)baseData.GetParamAsInt("FirstChar"); - char lastChar = (char)baseData.GetParamAsInt("LastChar"); - PdfArray widths = baseData.Values["Widths"] as PdfArray; + char firstChar = (char)_baseData.GetParamAsInt("FirstChar"); + char lastChar = (char)_baseData.GetParamAsInt("LastChar"); + PdfArray widths = _baseData.Values["Widths"] as PdfArray; char actualChar = firstChar; foreach (IPdfElement elem in widths.Values) { @@ -64,6 +68,7 @@ namespace VAR.PdfTools continue; } } + // FIXME: Calculate real height } } diff --git a/VAR.PdfTools/PdfTextExtractor.cs b/VAR.PdfTools/PdfTextExtractor.cs index 7c366a5..cbde9b2 100644 --- a/VAR.PdfTools/PdfTextExtractor.cs +++ b/VAR.PdfTools/PdfTextExtractor.cs @@ -124,7 +124,7 @@ namespace VAR.PdfTools { public PdfFont Font { get; set; } - public double TextSize { get; set; } + public double FontSize { get; set; } public Matrix3x3 Matrix { get; set; } @@ -133,6 +133,8 @@ namespace VAR.PdfTools public string VisibleText { get; set; } public double VisibleWidth { get; set; } + + public double VisibleHeight { get; set; } } public class PdfTextExtractor @@ -180,38 
+182,6 @@ namespace VAR.PdfTools #region Utility methods - private string PdfElement_GetOnlyStrings(IPdfElement elem) - { - if (elem is PdfString) - { - return ((PdfString)elem).Value; - } - if (elem is PdfArray) - { - var sbText = new StringBuilder(); - PdfArray array = elem as PdfArray; - foreach (IPdfElement subElem in array.Values) - { - sbText.Append(PdfElement_GetOnlyStrings(subElem)); - } - return sbText.ToString(); - } - return string.Empty; - } - - private double PdfElement_GetReal(IPdfElement elem, double defaultValue) - { - if (elem is PdfInteger) - { - return ((PdfInteger)elem).Value; - } - if (elem is PdfReal) - { - return ((PdfReal)elem).Value; - } - return defaultValue; - } - private string PdfString_ToUnicode(string text, PdfFont font) { if (font == null) @@ -235,11 +205,13 @@ namespace VAR.PdfTools } PdfTextElement textElem = new PdfTextElement(); - textElem.Matrix = _textMatrix.Multiply(_graphicsMatrix); textElem.Font = _font; + textElem.FontSize = _fontSize; + textElem.Matrix = _textMatrix.Multiply(_graphicsMatrix); textElem.RawText = _sbText.ToString(); textElem.VisibleText = PdfString_ToUnicode(textElem.RawText, _font); textElem.VisibleWidth = _textWidth * textElem.Matrix.Matrix[0, 0]; + textElem.VisibleHeight = (_font.Height * _fontSize) * textElem.Matrix.Matrix[1, 1]; _textElements.Add(textElem); _sbText = new StringBuilder(); @@ -350,7 +322,7 @@ namespace VAR.PdfTools } else if(elem is PdfInteger || elem is PdfReal) { - double spacing = PdfElement_GetReal(elem, 0); + double spacing = PdfElementUtils.GetReal(elem, 0); _textWidth += spacing; } else if(elem is PdfArray) @@ -379,12 +351,12 @@ namespace VAR.PdfTools } else if (action.Token == "cm") { - double a = PdfElement_GetReal(action.Parameters[0], 0); - double b = PdfElement_GetReal(action.Parameters[1], 0); - double c = PdfElement_GetReal(action.Parameters[2], 0); - double d = PdfElement_GetReal(action.Parameters[3], 0); - double e = PdfElement_GetReal(action.Parameters[4], 0); - double f 
= PdfElement_GetReal(action.Parameters[5], 0); + double a = PdfElementUtils.GetReal(action.Parameters[0], 0); + double b = PdfElementUtils.GetReal(action.Parameters[1], 0); + double c = PdfElementUtils.GetReal(action.Parameters[2], 0); + double d = PdfElementUtils.GetReal(action.Parameters[3], 0); + double e = PdfElementUtils.GetReal(action.Parameters[4], 0); + double f = PdfElementUtils.GetReal(action.Parameters[5], 0); OpSetGraphMatrix(a, b, c, d, e, f); } @@ -412,12 +384,12 @@ namespace VAR.PdfTools else if (action.Token == "Tf") { string fontName = ((PdfName)action.Parameters[0]).Value; - double fontSize = PdfElement_GetReal(action.Parameters[1], 0); + double fontSize = PdfElementUtils.GetReal(action.Parameters[1], 0); OpTextFont(fontName, fontSize); } else if (action.Token == "TL") { - double leading = PdfElement_GetReal(action.Parameters[0], 0); + double leading = PdfElementUtils.GetReal(action.Parameters[0], 0); OpTextLeading(leading); } else if (action.Token == "Tr") @@ -430,25 +402,25 @@ namespace VAR.PdfTools } else if (action.Token == "Td") { - double x = PdfElement_GetReal(action.Parameters[0], 0); - double y = PdfElement_GetReal(action.Parameters[1], 0); + double x = PdfElementUtils.GetReal(action.Parameters[0], 0); + double y = PdfElementUtils.GetReal(action.Parameters[1], 0); OpTesDisplace(x, y); } else if (action.Token == "TD") { - double x = PdfElement_GetReal(action.Parameters[0], 0); - double y = PdfElement_GetReal(action.Parameters[1], 0); + double x = PdfElementUtils.GetReal(action.Parameters[0], 0); + double y = PdfElementUtils.GetReal(action.Parameters[1], 0); OpTextLeading(-y); OpTesDisplace(x, y); } else if (action.Token == "Tm") { - double a = PdfElement_GetReal(action.Parameters[0], 0); - double b = PdfElement_GetReal(action.Parameters[1], 0); - double c = PdfElement_GetReal(action.Parameters[2], 0); - double d = PdfElement_GetReal(action.Parameters[3], 0); - double e = PdfElement_GetReal(action.Parameters[4], 0); - double f = 
PdfElement_GetReal(action.Parameters[5], 0); + double a = PdfElementUtils.GetReal(action.Parameters[0], 0); + double b = PdfElementUtils.GetReal(action.Parameters[1], 0); + double c = PdfElementUtils.GetReal(action.Parameters[2], 0); + double d = PdfElementUtils.GetReal(action.Parameters[3], 0); + double e = PdfElementUtils.GetReal(action.Parameters[4], 0); + double f = PdfElementUtils.GetReal(action.Parameters[5], 0); OpSetTextMatrix(a, b, c, d, e, f); } else if (action.Token == "T*") @@ -466,8 +438,8 @@ namespace VAR.PdfTools } else if (action.Token == "\"") { - double wordSpacing = PdfElement_GetReal(action.Parameters[0], 0); - double charSpacing = PdfElement_GetReal(action.Parameters[1], 0); + double wordSpacing = PdfElementUtils.GetReal(action.Parameters[0], 0); + double charSpacing = PdfElementUtils.GetReal(action.Parameters[1], 0); OpTextPut(((PdfString)action.Parameters[2]).Value); } else if (action.Token == "TJ")