PdfTextExtractor: Calculate PdfTextElement height

This commit is contained in:
2016-06-21 15:19:39 +02:00
parent 44f6cb3d0c
commit baea8aabc2
4 changed files with 59 additions and 65 deletions

View File

@@ -68,10 +68,10 @@ namespace VAR.PdfTools.Workbench
{
PdfArray cropBox = page.BaseData.Values["CropBox"] as PdfArray;
lines.Add(string.Format("Page({0} of {1}): {2} {3} {4} {5}", pageNumber, doc.Pages.Count,
((PdfInteger)cropBox.Values[0]).Value,
((PdfInteger)cropBox.Values[1]).Value,
((PdfInteger)cropBox.Values[2]).Value,
((PdfInteger)cropBox.Values[3]).Value));
PdfElementUtils.GetReal(cropBox.Values[0], 0),
PdfElementUtils.GetReal(cropBox.Values[1], 0),
PdfElementUtils.GetReal(cropBox.Values[2], 0),
PdfElementUtils.GetReal(cropBox.Values[3], 0)));
}
else
{
@@ -82,8 +82,9 @@ namespace VAR.PdfTools.Workbench
PdfTextExtractor extractor = new PdfTextExtractor(page);
foreach (PdfTextElement textElement in extractor.Elements)
{
lines.Add(string.Format("Text({0}, {1})({2}): \"{3}\"",
textElement.Matrix.Matrix[0, 2], textElement.Matrix.Matrix[1, 2], textElement.VisibleWidth, textElement.VisibleText));
lines.Add(string.Format("Text({0}, {1})({2}, {3}): \"{4}\"",
textElement.Matrix.Matrix[0, 2], textElement.Matrix.Matrix[1, 2], textElement.VisibleWidth, textElement.VisibleHeight,
textElement.VisibleText));
}
}

View File

@@ -163,4 +163,20 @@ namespace VAR.PdfTools
public IPdfElement Data { get; set; }
public int UsageCount { get; set; } = 0;
}
/// <summary>
/// Shared helpers for reading values out of <see cref="IPdfElement"/> instances.
/// </summary>
public static class PdfElementUtils
{
    /// <summary>
    /// Reads a numeric PDF element as a <see cref="double"/>.
    /// </summary>
    /// <param name="elem">Element to read; expected to be a PdfInteger or a PdfReal.</param>
    /// <param name="defaultValue">Value returned when <paramref name="elem"/> is neither of those types (a null reference included).</param>
    /// <returns>The element's Value converted to double, or <paramref name="defaultValue"/>.</returns>
    public static double GetReal(IPdfElement elem, double defaultValue)
    {
        // Start from the fallback and overwrite it only for the numeric element types.
        double result = defaultValue;
        if (elem is PdfInteger)
        {
            PdfInteger integerElem = (PdfInteger)elem;
            result = integerElem.Value;
        }
        else if (elem is PdfReal)
        {
            PdfReal realElem = (PdfReal)elem;
            result = realElem.Value;
        }
        return result;
    }
}
}

View File

@@ -13,12 +13,16 @@ namespace VAR.PdfTools
private Dictionary<char, double> _widths = null;
private double _height = 1.0;
#endregion
#region Properties
public PdfDictionary BaseData { get { return _baseData; } }
public double Height { get { return _height; } }
#endregion
#region Life cycle
@@ -39,13 +43,13 @@ namespace VAR.PdfTools
_toUnicode = parser.ParseToUnicode();
}
if (BaseData.Values.ContainsKey("FirstChar") && baseData.Values.ContainsKey("LastChar") && baseData.Values.ContainsKey("Widths"))
if (_baseData.Values.ContainsKey("FirstChar") && _baseData.Values.ContainsKey("LastChar") && _baseData.Values.ContainsKey("Widths"))
{
double glyphSpaceToTextSpace = 1000.0; // FIXME: SubType:Type3 Uses a FontMatrix that may not correspond to 1/1000th
_widths = new Dictionary<char, double>();
char firstChar = (char)baseData.GetParamAsInt("FirstChar");
char lastChar = (char)baseData.GetParamAsInt("LastChar");
PdfArray widths = baseData.Values["Widths"] as PdfArray;
char firstChar = (char)_baseData.GetParamAsInt("FirstChar");
char lastChar = (char)_baseData.GetParamAsInt("LastChar");
PdfArray widths = _baseData.Values["Widths"] as PdfArray;
char actualChar = firstChar;
foreach (IPdfElement elem in widths.Values)
{
@@ -64,6 +68,7 @@ namespace VAR.PdfTools
continue;
}
}
// FIXME: Calculate real height
}
}

View File

@@ -124,7 +124,7 @@ namespace VAR.PdfTools
{
public PdfFont Font { get; set; }
public double TextSize { get; set; }
public double FontSize { get; set; }
public Matrix3x3 Matrix { get; set; }
@@ -133,6 +133,8 @@ namespace VAR.PdfTools
public string VisibleText { get; set; }
public double VisibleWidth { get; set; }
public double VisibleHeight { get; set; }
}
public class PdfTextExtractor
@@ -180,38 +182,6 @@ namespace VAR.PdfTools
#region Utility methods
/// <summary>
/// Collects the text carried by an element: the value of a PdfString, or the
/// concatenation of every string found (recursively) inside a PdfArray.
/// </summary>
/// <param name="elem">Element to inspect.</param>
/// <returns>The collected text, or an empty string for any other element type.</returns>
private string PdfElement_GetOnlyStrings(IPdfElement elem)
{
    if (elem is PdfString)
    {
        PdfString stringElem = (PdfString)elem;
        return stringElem.Value;
    }

    PdfArray arrayElem = elem as PdfArray;
    if (arrayElem == null)
    {
        // Neither a string nor an array: nothing to contribute.
        return string.Empty;
    }

    // Non-string entries contribute an empty string through the recursive call.
    StringBuilder collected = new StringBuilder();
    foreach (IPdfElement item in arrayElem.Values)
    {
        string itemText = PdfElement_GetOnlyStrings(item);
        collected.Append(itemText);
    }
    return collected.ToString();
}
/// <summary>
/// Reads a numeric PDF element as a <see cref="double"/>.
/// </summary>
/// <param name="elem">Element to read; expected to be a PdfInteger or a PdfReal.</param>
/// <param name="defaultValue">Value returned when <paramref name="elem"/> is of any other type.</param>
/// <returns>The element's Value converted to double, or <paramref name="defaultValue"/>.</returns>
private double PdfElement_GetReal(IPdfElement elem, double defaultValue)
{
    double number = defaultValue;
    if (elem is PdfInteger)
    {
        PdfInteger asInteger = (PdfInteger)elem;
        number = asInteger.Value;
    }
    else if (elem is PdfReal)
    {
        PdfReal asReal = (PdfReal)elem;
        number = asReal.Value;
    }
    return number;
}
private string PdfString_ToUnicode(string text, PdfFont font)
{
if (font == null)
@@ -235,11 +205,13 @@ namespace VAR.PdfTools
}
PdfTextElement textElem = new PdfTextElement();
textElem.Matrix = _textMatrix.Multiply(_graphicsMatrix);
textElem.Font = _font;
textElem.FontSize = _fontSize;
textElem.Matrix = _textMatrix.Multiply(_graphicsMatrix);
textElem.RawText = _sbText.ToString();
textElem.VisibleText = PdfString_ToUnicode(textElem.RawText, _font);
textElem.VisibleWidth = _textWidth * textElem.Matrix.Matrix[0, 0];
textElem.VisibleHeight = (_font.Height * _fontSize) * textElem.Matrix.Matrix[1, 1];
_textElements.Add(textElem);
_sbText = new StringBuilder();
@@ -350,7 +322,7 @@ namespace VAR.PdfTools
}
else if(elem is PdfInteger || elem is PdfReal)
{
double spacing = PdfElement_GetReal(elem, 0);
double spacing = PdfElementUtils.GetReal(elem, 0);
_textWidth += spacing;
}
else if(elem is PdfArray)
@@ -379,12 +351,12 @@ namespace VAR.PdfTools
}
else if (action.Token == "cm")
{
double a = PdfElement_GetReal(action.Parameters[0], 0);
double b = PdfElement_GetReal(action.Parameters[1], 0);
double c = PdfElement_GetReal(action.Parameters[2], 0);
double d = PdfElement_GetReal(action.Parameters[3], 0);
double e = PdfElement_GetReal(action.Parameters[4], 0);
double f = PdfElement_GetReal(action.Parameters[5], 0);
double a = PdfElementUtils.GetReal(action.Parameters[0], 0);
double b = PdfElementUtils.GetReal(action.Parameters[1], 0);
double c = PdfElementUtils.GetReal(action.Parameters[2], 0);
double d = PdfElementUtils.GetReal(action.Parameters[3], 0);
double e = PdfElementUtils.GetReal(action.Parameters[4], 0);
double f = PdfElementUtils.GetReal(action.Parameters[5], 0);
OpSetGraphMatrix(a, b, c, d, e, f);
}
@@ -412,12 +384,12 @@ namespace VAR.PdfTools
else if (action.Token == "Tf")
{
string fontName = ((PdfName)action.Parameters[0]).Value;
double fontSize = PdfElement_GetReal(action.Parameters[1], 0);
double fontSize = PdfElementUtils.GetReal(action.Parameters[1], 0);
OpTextFont(fontName, fontSize);
}
else if (action.Token == "TL")
{
double leading = PdfElement_GetReal(action.Parameters[0], 0);
double leading = PdfElementUtils.GetReal(action.Parameters[0], 0);
OpTextLeading(leading);
}
else if (action.Token == "Tr")
@@ -430,25 +402,25 @@ namespace VAR.PdfTools
}
else if (action.Token == "Td")
{
double x = PdfElement_GetReal(action.Parameters[0], 0);
double y = PdfElement_GetReal(action.Parameters[1], 0);
double x = PdfElementUtils.GetReal(action.Parameters[0], 0);
double y = PdfElementUtils.GetReal(action.Parameters[1], 0);
OpTesDisplace(x, y);
}
else if (action.Token == "TD")
{
double x = PdfElement_GetReal(action.Parameters[0], 0);
double y = PdfElement_GetReal(action.Parameters[1], 0);
double x = PdfElementUtils.GetReal(action.Parameters[0], 0);
double y = PdfElementUtils.GetReal(action.Parameters[1], 0);
OpTextLeading(-y);
OpTesDisplace(x, y);
}
else if (action.Token == "Tm")
{
double a = PdfElement_GetReal(action.Parameters[0], 0);
double b = PdfElement_GetReal(action.Parameters[1], 0);
double c = PdfElement_GetReal(action.Parameters[2], 0);
double d = PdfElement_GetReal(action.Parameters[3], 0);
double e = PdfElement_GetReal(action.Parameters[4], 0);
double f = PdfElement_GetReal(action.Parameters[5], 0);
double a = PdfElementUtils.GetReal(action.Parameters[0], 0);
double b = PdfElementUtils.GetReal(action.Parameters[1], 0);
double c = PdfElementUtils.GetReal(action.Parameters[2], 0);
double d = PdfElementUtils.GetReal(action.Parameters[3], 0);
double e = PdfElementUtils.GetReal(action.Parameters[4], 0);
double f = PdfElementUtils.GetReal(action.Parameters[5], 0);
OpSetTextMatrix(a, b, c, d, e, f);
}
else if (action.Token == "T*")
@@ -466,8 +438,8 @@ namespace VAR.PdfTools
}
else if (action.Token == "\"")
{
double wordSpacing = PdfElement_GetReal(action.Parameters[0], 0);
double charSpacing = PdfElement_GetReal(action.Parameters[1], 0);
double wordSpacing = PdfElementUtils.GetReal(action.Parameters[0], 0);
double charSpacing = PdfElementUtils.GetReal(action.Parameters[1], 0);
OpTextPut(((PdfString)action.Parameters[2]).Value);
}
else if (action.Token == "TJ")