From dc1b9bc7ca91bce1e1f4df131954ae589584dbdd Mon Sep 17 00:00:00 2001
From: "Valeriano A.R"
Date: Tue, 27 Jun 2017 01:09:50 +0200
Subject: [PATCH] PdfTextExtractor.JoinTextElements: Joins PdfTextElements when they are nearby.

---
 Review note: JoinTextElements in the last hunk was revised after the original
 commit (lone elements are kept untouched, RemoveAt replaces Remove, docs added);
 the index line below still names the original blobs.

 VAR.PdfTools/PdfTextExtractor.cs | 107 ++++++++++++++++++++++++++++++++++++--
 1 file changed, 103 insertions(+), 4 deletions(-)

diff --git a/VAR.PdfTools/PdfTextExtractor.cs b/VAR.PdfTools/PdfTextExtractor.cs
index 5349c11..59d1a80 100644
--- a/VAR.PdfTools/PdfTextExtractor.cs
+++ b/VAR.PdfTools/PdfTextExtractor.cs
@@ -1,4 +1,5 @@
-using System.Collections.Generic;
+using System;
+using System.Collections.Generic;
 using System.Linq;
 using System.Text;
 using VAR.PdfTools.Maths;
@@ -31,9 +32,8 @@ namespace VAR.PdfTools
         public double VisibleHeight { get; set; }
 
         public List<PdfCharElement> Characters { get; set; }
-
-        private List<PdfTextElement> _childs = new List<PdfTextElement>();
-        public List<PdfTextElement> Childs { get { return _childs; } }
+
+        public List<PdfTextElement> Childs { get; set; }
 
         #endregion
 
@@ -95,6 +95,7 @@ namespace VAR.PdfTools
         {
             _page = page;
             ProcessPageContent();
+            JoinTextElements();
         }
 
         #endregion
@@ -135,6 +136,7 @@ namespace VAR.PdfTools
                     Displacement = (c.Displacement * textElem.Matrix.Matrix[0, 0]),
                 });
             }
+            textElem.Childs = new List<PdfTextElement>();
             return textElem;
         }
 
@@ -519,6 +521,103 @@ namespace VAR.PdfTools
             FlushTextElement();
         }
 
+        /// <summary>
+        /// Joins text elements that share the same Y position and are horizontally
+        /// close to each other into single block elements, replacing
+        /// <c>_textElements</c> with the condensed list.
+        /// </summary>
+        /// <remarks>
+        /// Two elements are neighbours when their Y positions differ by at most 0.001
+        /// and their X ranges overlap once the current block has been widened by the
+        /// font size of its first element on both sides. Elements without neighbours
+        /// are kept unchanged; joined blocks keep the source elements in Childs.
+        /// </remarks>
+        private void JoinTextElements()
+        {
+            var textElementsCondensed = new List<PdfTextElement>();
+            while (_textElements.Count > 0)
+            {
+                PdfTextElement elem = _textElements[0];
+                _textElements.RemoveAt(0);
+                double blockY = elem.GetY();
+                double blockXMin = elem.GetX();
+                double blockXMax = blockXMin + elem.VisibleWidth;
+
+                // The block always contains at least the element that started it
+                var textElementNeighbours = new List<PdfTextElement> { elem };
+
+                // Search neighbours
+                int i = 0;
+                while (i < _textElements.Count)
+                {
+                    PdfTextElement neighbour = _textElements[i];
+                    double neighbourY = neighbour.GetY();
+                    if (Math.Abs(neighbourY - blockY) > 0.001) { i++; continue; }
+
+                    double neighbourXMin = neighbour.GetX();
+                    double neighbourXMax = neighbourXMin + neighbour.VisibleWidth;
+                    double auxBlockXMin = blockXMin - elem.FontSize;
+                    double auxBlockXMax = blockXMax + elem.FontSize;
+                    if (auxBlockXMax >= neighbourXMin && neighbourXMax >= auxBlockXMin)
+                    {
+                        _textElements.RemoveAt(i);
+                        textElementNeighbours.Add(neighbour);
+                        if (blockXMax < neighbourXMax) { blockXMax = neighbourXMax; }
+                        if (blockXMin > neighbourXMin) { blockXMin = neighbourXMin; }
+
+                        // The block grew, so elements skipped earlier may now be in reach
+                        i = 0;
+                        continue;
+                    }
+                    i++;
+                }
+
+                // Nothing to join: keep the element itself (and its Font) as it is
+                if (textElementNeighbours.Count == 1)
+                {
+                    textElementsCondensed.Add(elem);
+                    continue;
+                }
+
+                // Join neighbours
+                var chars = new List<PdfCharElement>();
+                foreach (PdfTextElement neighbour in textElementNeighbours)
+                {
+                    double neighbourXMin = neighbour.GetX();
+                    foreach (PdfCharElement c in neighbour.Characters)
+                    {
+                        chars.Add(new PdfCharElement
+                        {
+                            Char = c.Char,
+                            // Make the displacement relative to the left edge of the block
+                            Displacement = (c.Displacement + neighbourXMin) - blockXMin,
+                        });
+                    }
+                }
+                chars = chars.OrderBy(c => c.Displacement).ToList();
+                var sbText = new StringBuilder();
+                foreach (PdfCharElement c in chars)
+                {
+                    sbText.Append(c.Char);
+                }
+                PdfTextElement blockElem = new PdfTextElement
+                {
+                    Font = null,
+                    FontSize = elem.FontSize,
+                    Matrix = elem.Matrix.Copy(),
+                    RawText = sbText.ToString(),
+                    VisibleText = sbText.ToString(),
+                    VisibleWidth = blockXMax - blockXMin,
+                    VisibleHeight = elem.VisibleHeight,
+                    Characters = chars,
+                    Childs = textElementNeighbours,
+                };
+                blockElem.Matrix.Matrix[0, 2] = blockXMin;
+                textElementsCondensed.Add(blockElem);
+            }
+            _textElements = textElementsCondensed;
+        }
+
         #endregion
 
         #region Public methods