PdfTextExtractor: Calculate PdfTextElement height
This commit is contained in:
@@ -68,10 +68,10 @@ namespace VAR.PdfTools.Workbench
|
||||
{
|
||||
PdfArray cropBox = page.BaseData.Values["CropBox"] as PdfArray;
|
||||
lines.Add(string.Format("Page({0} of {1}): {2} {3} {4} {5}", pageNumber, doc.Pages.Count,
|
||||
((PdfInteger)cropBox.Values[0]).Value,
|
||||
((PdfInteger)cropBox.Values[1]).Value,
|
||||
((PdfInteger)cropBox.Values[2]).Value,
|
||||
((PdfInteger)cropBox.Values[3]).Value));
|
||||
PdfElementUtils.GetReal(cropBox.Values[0], 0),
|
||||
PdfElementUtils.GetReal(cropBox.Values[1], 0),
|
||||
PdfElementUtils.GetReal(cropBox.Values[2], 0),
|
||||
PdfElementUtils.GetReal(cropBox.Values[3], 0)));
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -82,8 +82,9 @@ namespace VAR.PdfTools.Workbench
|
||||
PdfTextExtractor extractor = new PdfTextExtractor(page);
|
||||
foreach (PdfTextElement textElement in extractor.Elements)
|
||||
{
|
||||
lines.Add(string.Format("Text({0}, {1})({2}): \"{3}\"",
|
||||
textElement.Matrix.Matrix[0, 2], textElement.Matrix.Matrix[1, 2], textElement.VisibleWidth, textElement.VisibleText));
|
||||
lines.Add(string.Format("Text({0}, {1})({2}, {3}): \"{4}\"",
|
||||
textElement.Matrix.Matrix[0, 2], textElement.Matrix.Matrix[1, 2], textElement.VisibleWidth, textElement.VisibleHeight,
|
||||
textElement.VisibleText));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -163,4 +163,20 @@ namespace VAR.PdfTools
|
||||
public IPdfElement Data { get; set; }
|
||||
public int UsageCount { get; set; } = 0;
|
||||
}
|
||||
|
||||
// Static helpers for reading values out of IPdfElement instances.
public static class PdfElementUtils
|
||||
{
|
||||
// Reads a numeric PDF element as a double.
//   elem:         element to inspect; only PdfInteger and PdfReal are recognised.
//   defaultValue: returned when elem is neither of those types (a null elem
//                 also lands here, because the `is` checks fail for null).
//   returns:      the element's Value, converted to double by the return type,
//                 or defaultValue.
public static double GetReal(IPdfElement elem, double defaultValue)
|
||||
{
|
||||
if (elem is PdfInteger)
|
||||
{
|
||||
return ((PdfInteger)elem).Value;
|
||||
}
|
||||
if (elem is PdfReal)
|
||||
{
|
||||
return ((PdfReal)elem).Value;
|
||||
}
|
||||
// Not a recognised numeric element: fall back to the caller's default.
return defaultValue;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -13,12 +13,16 @@ namespace VAR.PdfTools
|
||||
|
||||
private Dictionary<char, double> _widths = null;
|
||||
|
||||
private double _height = 1.0;
|
||||
|
||||
#endregion
|
||||
|
||||
#region Properties
|
||||
|
||||
public PdfDictionary BaseData { get { return _baseData; } }
|
||||
|
||||
public double Height { get { return _height; } }
|
||||
|
||||
#endregion
|
||||
|
||||
#region Life cycle
|
||||
@@ -39,13 +43,13 @@ namespace VAR.PdfTools
|
||||
_toUnicode = parser.ParseToUnicode();
|
||||
}
|
||||
|
||||
if (BaseData.Values.ContainsKey("FirstChar") && baseData.Values.ContainsKey("LastChar") && baseData.Values.ContainsKey("Widths"))
|
||||
if (_baseData.Values.ContainsKey("FirstChar") && _baseData.Values.ContainsKey("LastChar") && _baseData.Values.ContainsKey("Widths"))
|
||||
{
|
||||
double glyphSpaceToTextSpace = 1000.0; // FIXME: SubType:Type3 Uses a FontMatrix that may not correspond to 1/1000th
|
||||
_widths = new Dictionary<char, double>();
|
||||
char firstChar = (char)baseData.GetParamAsInt("FirstChar");
|
||||
char lastChar = (char)baseData.GetParamAsInt("LastChar");
|
||||
PdfArray widths = baseData.Values["Widths"] as PdfArray;
|
||||
char firstChar = (char)_baseData.GetParamAsInt("FirstChar");
|
||||
char lastChar = (char)_baseData.GetParamAsInt("LastChar");
|
||||
PdfArray widths = _baseData.Values["Widths"] as PdfArray;
|
||||
char actualChar = firstChar;
|
||||
foreach (IPdfElement elem in widths.Values)
|
||||
{
|
||||
@@ -64,6 +68,7 @@ namespace VAR.PdfTools
|
||||
continue;
|
||||
}
|
||||
}
|
||||
// FIXME: Calculate real height
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -124,7 +124,7 @@ namespace VAR.PdfTools
|
||||
{
|
||||
public PdfFont Font { get; set; }
|
||||
|
||||
public double TextSize { get; set; }
|
||||
public double FontSize { get; set; }
|
||||
|
||||
public Matrix3x3 Matrix { get; set; }
|
||||
|
||||
@@ -133,6 +133,8 @@ namespace VAR.PdfTools
|
||||
public string VisibleText { get; set; }
|
||||
|
||||
public double VisibleWidth { get; set; }
|
||||
|
||||
public double VisibleHeight { get; set; }
|
||||
}
|
||||
|
||||
public class PdfTextExtractor
|
||||
@@ -180,38 +182,6 @@ namespace VAR.PdfTools
|
||||
|
||||
#region Utility methods
|
||||
|
||||
// Collects the text carried by a PDF element.
//   - PdfString: returns its Value.
//   - PdfArray:  walks array.Values in order, recursing into each entry, and
//                concatenates the results (non-string entries contribute "").
//   - anything else: returns string.Empty.
private string PdfElement_GetOnlyStrings(IPdfElement elem)
|
||||
{
|
||||
if (elem is PdfString)
|
||||
{
|
||||
return ((PdfString)elem).Value;
|
||||
}
|
||||
if (elem is PdfArray)
|
||||
{
|
||||
var sbText = new StringBuilder();
|
||||
PdfArray array = elem as PdfArray;
|
||||
foreach (IPdfElement subElem in array.Values)
|
||||
{
|
||||
// Recursive call: nested arrays are flattened into the same buffer.
sbText.Append(PdfElement_GetOnlyStrings(subElem));
|
||||
}
|
||||
return sbText.ToString();
|
||||
}
|
||||
// Neither a string nor an array: nothing to extract.
return string.Empty;
|
||||
}
|
||||
|
||||
// Reads a numeric PDF element as a double.
//   elem:         element to inspect; only PdfInteger and PdfReal are recognised.
//   defaultValue: returned when elem is neither of those types.
//   returns:      the element's Value (converted to double by the return type)
//                 or defaultValue.
private double PdfElement_GetReal(IPdfElement elem, double defaultValue)
|
||||
{
|
||||
if (elem is PdfInteger)
|
||||
{
|
||||
return ((PdfInteger)elem).Value;
|
||||
}
|
||||
if (elem is PdfReal)
|
||||
{
|
||||
return ((PdfReal)elem).Value;
|
||||
}
|
||||
// Not a recognised numeric element: fall back to the caller's default.
return defaultValue;
|
||||
}
|
||||
|
||||
private string PdfString_ToUnicode(string text, PdfFont font)
|
||||
{
|
||||
if (font == null)
|
||||
@@ -235,11 +205,13 @@ namespace VAR.PdfTools
|
||||
}
|
||||
|
||||
PdfTextElement textElem = new PdfTextElement();
|
||||
textElem.Matrix = _textMatrix.Multiply(_graphicsMatrix);
|
||||
textElem.Font = _font;
|
||||
textElem.FontSize = _fontSize;
|
||||
textElem.Matrix = _textMatrix.Multiply(_graphicsMatrix);
|
||||
textElem.RawText = _sbText.ToString();
|
||||
textElem.VisibleText = PdfString_ToUnicode(textElem.RawText, _font);
|
||||
textElem.VisibleWidth = _textWidth * textElem.Matrix.Matrix[0, 0];
|
||||
textElem.VisibleHeight = (_font.Height * _fontSize) * textElem.Matrix.Matrix[1, 1];
|
||||
_textElements.Add(textElem);
|
||||
|
||||
_sbText = new StringBuilder();
|
||||
@@ -350,7 +322,7 @@ namespace VAR.PdfTools
|
||||
}
|
||||
else if(elem is PdfInteger || elem is PdfReal)
|
||||
{
|
||||
double spacing = PdfElement_GetReal(elem, 0);
|
||||
double spacing = PdfElementUtils.GetReal(elem, 0);
|
||||
_textWidth += spacing;
|
||||
}
|
||||
else if(elem is PdfArray)
|
||||
@@ -379,12 +351,12 @@ namespace VAR.PdfTools
|
||||
}
|
||||
else if (action.Token == "cm")
|
||||
{
|
||||
double a = PdfElement_GetReal(action.Parameters[0], 0);
|
||||
double b = PdfElement_GetReal(action.Parameters[1], 0);
|
||||
double c = PdfElement_GetReal(action.Parameters[2], 0);
|
||||
double d = PdfElement_GetReal(action.Parameters[3], 0);
|
||||
double e = PdfElement_GetReal(action.Parameters[4], 0);
|
||||
double f = PdfElement_GetReal(action.Parameters[5], 0);
|
||||
double a = PdfElementUtils.GetReal(action.Parameters[0], 0);
|
||||
double b = PdfElementUtils.GetReal(action.Parameters[1], 0);
|
||||
double c = PdfElementUtils.GetReal(action.Parameters[2], 0);
|
||||
double d = PdfElementUtils.GetReal(action.Parameters[3], 0);
|
||||
double e = PdfElementUtils.GetReal(action.Parameters[4], 0);
|
||||
double f = PdfElementUtils.GetReal(action.Parameters[5], 0);
|
||||
OpSetGraphMatrix(a, b, c, d, e, f);
|
||||
}
|
||||
|
||||
@@ -412,12 +384,12 @@ namespace VAR.PdfTools
|
||||
else if (action.Token == "Tf")
|
||||
{
|
||||
string fontName = ((PdfName)action.Parameters[0]).Value;
|
||||
double fontSize = PdfElement_GetReal(action.Parameters[1], 0);
|
||||
double fontSize = PdfElementUtils.GetReal(action.Parameters[1], 0);
|
||||
OpTextFont(fontName, fontSize);
|
||||
}
|
||||
else if (action.Token == "TL")
|
||||
{
|
||||
double leading = PdfElement_GetReal(action.Parameters[0], 0);
|
||||
double leading = PdfElementUtils.GetReal(action.Parameters[0], 0);
|
||||
OpTextLeading(leading);
|
||||
}
|
||||
else if (action.Token == "Tr")
|
||||
@@ -430,25 +402,25 @@ namespace VAR.PdfTools
|
||||
}
|
||||
else if (action.Token == "Td")
|
||||
{
|
||||
double x = PdfElement_GetReal(action.Parameters[0], 0);
|
||||
double y = PdfElement_GetReal(action.Parameters[1], 0);
|
||||
double x = PdfElementUtils.GetReal(action.Parameters[0], 0);
|
||||
double y = PdfElementUtils.GetReal(action.Parameters[1], 0);
|
||||
OpTesDisplace(x, y);
|
||||
}
|
||||
else if (action.Token == "TD")
|
||||
{
|
||||
double x = PdfElement_GetReal(action.Parameters[0], 0);
|
||||
double y = PdfElement_GetReal(action.Parameters[1], 0);
|
||||
double x = PdfElementUtils.GetReal(action.Parameters[0], 0);
|
||||
double y = PdfElementUtils.GetReal(action.Parameters[1], 0);
|
||||
OpTextLeading(-y);
|
||||
OpTesDisplace(x, y);
|
||||
}
|
||||
else if (action.Token == "Tm")
|
||||
{
|
||||
double a = PdfElement_GetReal(action.Parameters[0], 0);
|
||||
double b = PdfElement_GetReal(action.Parameters[1], 0);
|
||||
double c = PdfElement_GetReal(action.Parameters[2], 0);
|
||||
double d = PdfElement_GetReal(action.Parameters[3], 0);
|
||||
double e = PdfElement_GetReal(action.Parameters[4], 0);
|
||||
double f = PdfElement_GetReal(action.Parameters[5], 0);
|
||||
double a = PdfElementUtils.GetReal(action.Parameters[0], 0);
|
||||
double b = PdfElementUtils.GetReal(action.Parameters[1], 0);
|
||||
double c = PdfElementUtils.GetReal(action.Parameters[2], 0);
|
||||
double d = PdfElementUtils.GetReal(action.Parameters[3], 0);
|
||||
double e = PdfElementUtils.GetReal(action.Parameters[4], 0);
|
||||
double f = PdfElementUtils.GetReal(action.Parameters[5], 0);
|
||||
OpSetTextMatrix(a, b, c, d, e, f);
|
||||
}
|
||||
else if (action.Token == "T*")
|
||||
@@ -466,8 +438,8 @@ namespace VAR.PdfTools
|
||||
}
|
||||
else if (action.Token == "\"")
|
||||
{
|
||||
double wordSpacing = PdfElement_GetReal(action.Parameters[0], 0);
|
||||
double charSpacing = PdfElement_GetReal(action.Parameters[1], 0);
|
||||
double wordSpacing = PdfElementUtils.GetReal(action.Parameters[0], 0);
|
||||
double charSpacing = PdfElementUtils.GetReal(action.Parameters[1], 0);
|
||||
OpTextPut(((PdfString)action.Parameters[2]).Value);
|
||||
}
|
||||
else if (action.Token == "TJ")
|
||||
|
||||
Reference in New Issue
Block a user