3 Commits
1_5_1 ... 1_5_2

Author SHA1 Message Date
beb3b931ea Bump version 1.5.2 2019-10-21 13:09:13 +02:00
8806020036 ignore ".vs" directory. 2019-10-21 13:08:44 +02:00
f3b7cd1b0d PdfTextExtractor: Better joining and splitting heuristics. 2019-10-21 13:08:19 +02:00
4 changed files with 29 additions and 15 deletions

2
.gitignore vendored
View File

@@ -27,3 +27,5 @@ obj/
_ReSharper*/ _ReSharper*/
*.userprefs *.userprefs
*.nupkg *.nupkg
.vs

View File

@@ -1,6 +1,6 @@
The MIT License (MIT) The MIT License (MIT)
Copyright (c) 2016-2017 Valeriano Alfonso Rodriguez Copyright (c) 2016-2019 Valeriano Alfonso Rodriguez
Permission is hereby granted, free of charge, to any person obtaining a copy Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal of this software and associated documentation files (the "Software"), to deal

View File

@@ -33,7 +33,7 @@ namespace VAR.PdfTools
public double VisibleHeight { get; set; } public double VisibleHeight { get; set; }
public List<PdfCharElement> Characters { get; set; } public List<PdfCharElement> Characters { get; set; }
public List<PdfTextElement> Childs { get; set; } public List<PdfTextElement> Childs { get; set; }
#endregion #endregion
@@ -85,6 +85,11 @@ namespace VAR.PdfTools
return blockElem; return blockElem;
} }
/// <summary>
/// Returns half of the average character width of this text element.
/// </summary>
/// <remarks>
/// Despite its name, this does not return the widest character: it averages
/// <c>Width</c> over <see cref="Characters"/> and halves the result. The joining
/// and splitting code reads this value where it previously used the maximum
/// character width, so the name is kept for compatibility with those callers.
/// </remarks>
/// <returns>
/// Half the mean character width, or <c>0</c> when the element has no characters.
/// </returns>
public double MaxWidth()
{
    // Enumerable.Average throws InvalidOperationException on an empty sequence
    // (and a null list would throw NullReferenceException); an element without
    // characters simply has no width to measure.
    if (Characters == null || Characters.Count == 0)
    {
        return 0;
    }

    return Characters.Average(c => c.Width) / 2;
}
#endregion #endregion
} }
@@ -134,7 +139,7 @@ namespace VAR.PdfTools
JoinTextElements(); JoinTextElements();
SplitTextElements(); SplitTextElements();
} }
#endregion #endregion
#region Utility methods #region Utility methods
@@ -290,7 +295,7 @@ namespace VAR.PdfTools
_graphicsMatrix = _graphicsMatrixStack[_graphicsMatrixStack.Count - 1]; _graphicsMatrix = _graphicsMatrixStack[_graphicsMatrixStack.Count - 1];
_graphicsMatrixStack.RemoveAt(_graphicsMatrixStack.Count - 1); _graphicsMatrixStack.RemoveAt(_graphicsMatrixStack.Count - 1);
} }
private void OpBeginText() private void OpBeginText()
{ {
_textMatrix.Idenity(); _textMatrix.Idenity();
@@ -310,7 +315,7 @@ namespace VAR.PdfTools
_font = _page.Fonts[fontName]; _font = _page.Fonts[fontName];
_fontSize = size; _fontSize = size;
} }
private void OpTextCharSpacing(double charSpacing) private void OpTextCharSpacing(double charSpacing)
{ {
_charSpacing = charSpacing; _charSpacing = charSpacing;
@@ -601,7 +606,7 @@ namespace VAR.PdfTools
} }
FlushTextElement(); FlushTextElement();
} }
private void JoinTextElements() private void JoinTextElements()
{ {
var textElementsCondensed = new List<PdfTextElement>(); var textElementsCondensed = new List<PdfTextElement>();
@@ -622,10 +627,17 @@ namespace VAR.PdfTools
while (i < _textElements.Count) while (i < _textElements.Count)
{ {
PdfTextElement neighbour = _textElements[i]; PdfTextElement neighbour = _textElements[i];
if (neighbour.Font != elem.Font || neighbour.FontSize != elem.FontSize)
{
i++;
continue;
}
double neighbourY = neighbour.GetY(); double neighbourY = neighbour.GetY();
if (Math.Abs(neighbourY - blockY) > 0.001) { i++; continue; } if (Math.Abs(neighbourY - blockY) > 0.001) { i++; continue; }
double maxWidth = neighbour.Characters.Max(c => c.Width); double maxWidth = neighbour.MaxWidth();
double neighbourXMin = neighbour.GetX(); double neighbourXMin = neighbour.GetX();
double neighbourXMax = neighbourXMin + neighbour.VisibleWidth; double neighbourXMax = neighbourXMin + neighbour.VisibleWidth;
@@ -642,8 +654,8 @@ namespace VAR.PdfTools
} }
i++; i++;
} }
if(textElementNeighbours.Count == 0) if (textElementNeighbours.Count == 1)
{ {
textElementsCondensed.Add(elem); textElementsCondensed.Add(elem);
continue; continue;
@@ -654,7 +666,7 @@ namespace VAR.PdfTools
foreach (PdfTextElement neighbour in textElementNeighbours) foreach (PdfTextElement neighbour in textElementNeighbours)
{ {
double neighbourXMin = neighbour.GetX(); double neighbourXMin = neighbour.GetX();
foreach(PdfCharElement c in neighbour.Characters) foreach (PdfCharElement c in neighbour.Characters)
{ {
chars.Add(new PdfCharElement chars.Add(new PdfCharElement
{ {
@@ -666,7 +678,7 @@ namespace VAR.PdfTools
} }
chars = chars.OrderBy(c => c.Displacement).ToList(); chars = chars.OrderBy(c => c.Displacement).ToList();
var sbText = new StringBuilder(); var sbText = new StringBuilder();
foreach(PdfCharElement c in chars) foreach (PdfCharElement c in chars)
{ {
sbText.Append(c.Char); sbText.Append(c.Char);
} }
@@ -687,7 +699,7 @@ namespace VAR.PdfTools
} }
_textElements = textElementsCondensed; _textElements = textElementsCondensed;
} }
private void SplitTextElements() private void SplitTextElements()
{ {
var textElementsSplitted = new List<PdfTextElement>(); var textElementsSplitted = new List<PdfTextElement>();
@@ -696,7 +708,7 @@ namespace VAR.PdfTools
PdfTextElement elem = _textElements[0]; PdfTextElement elem = _textElements[0];
_textElements.Remove(elem); _textElements.Remove(elem);
double maxWidth = elem.Characters.Max(c => c.Width); double maxWidth = elem.MaxWidth();
int prevBreak = 0; int prevBreak = 0;
for (int i = 1; i < elem.Characters.Count; i++) for (int i = 1; i < elem.Characters.Count; i++)

View File

@@ -6,9 +6,9 @@ using System.Runtime.InteropServices;
[assembly: AssemblyConfiguration("")] [assembly: AssemblyConfiguration("")]
[assembly: AssemblyCompany("VAR")] [assembly: AssemblyCompany("VAR")]
[assembly: AssemblyProduct("VAR.PdfTools")] [assembly: AssemblyProduct("VAR.PdfTools")]
[assembly: AssemblyCopyright("Copyright © VAR 2016-2017")] [assembly: AssemblyCopyright("Copyright © VAR 2016-2019")]
[assembly: AssemblyTrademark("")] [assembly: AssemblyTrademark("")]
[assembly: AssemblyCulture("")] [assembly: AssemblyCulture("")]
[assembly: ComVisible(false)] [assembly: ComVisible(false)]
[assembly: Guid("eb7e003a-6a95-4002-809f-926c7c8a11e9")] [assembly: Guid("eb7e003a-6a95-4002-809f-926c7c8a11e9")]
[assembly: AssemblyVersion("1.5.1.*")] [assembly: AssemblyVersion("1.5.2.*")]