PdfTextExtractor: Get results as PdfTextElementColumn, for debugging purposes.

This commit is contained in:
2019-10-27 18:45:13 +01:00
parent 386b38bd21
commit 9af363529c
5 changed files with 166 additions and 55 deletions

View File

@@ -48,6 +48,7 @@
this.txtField3 = new System.Windows.Forms.TextBox(); this.txtField3 = new System.Windows.Forms.TextBox();
this.btnGetColumn3 = new System.Windows.Forms.Button(); this.btnGetColumn3 = new System.Windows.Forms.Button();
this.txtPages = new System.Windows.Forms.TextBox(); this.txtPages = new System.Windows.Forms.TextBox();
this.chkRender = new System.Windows.Forms.CheckBox();
this.SuspendLayout(); this.SuspendLayout();
// //
// lblOutputs // lblOutputs
@@ -120,7 +121,7 @@
// //
this.btnGetColumn1.Location = new System.Drawing.Point(292, 51); this.btnGetColumn1.Location = new System.Drawing.Point(292, 51);
this.btnGetColumn1.Name = "btnGetColumn1"; this.btnGetColumn1.Name = "btnGetColumn1";
this.btnGetColumn1.Size = new System.Drawing.Size(60, 23); this.btnGetColumn1.Size = new System.Drawing.Size(69, 23);
this.btnGetColumn1.TabIndex = 12; this.btnGetColumn1.TabIndex = 12;
this.btnGetColumn1.Text = "GetColumn"; this.btnGetColumn1.Text = "GetColumn";
this.btnGetColumn1.UseVisualStyleBackColor = true; this.btnGetColumn1.UseVisualStyleBackColor = true;
@@ -195,7 +196,7 @@
// //
this.btnGetColumn2.Location = new System.Drawing.Point(292, 80); this.btnGetColumn2.Location = new System.Drawing.Point(292, 80);
this.btnGetColumn2.Name = "btnGetColumn2"; this.btnGetColumn2.Name = "btnGetColumn2";
this.btnGetColumn2.Size = new System.Drawing.Size(60, 23); this.btnGetColumn2.Size = new System.Drawing.Size(69, 23);
this.btnGetColumn2.TabIndex = 19; this.btnGetColumn2.TabIndex = 19;
this.btnGetColumn2.Text = "GetColumn"; this.btnGetColumn2.Text = "GetColumn";
this.btnGetColumn2.UseVisualStyleBackColor = true; this.btnGetColumn2.UseVisualStyleBackColor = true;
@@ -232,7 +233,7 @@
// //
this.btnGetColumn3.Location = new System.Drawing.Point(292, 109); this.btnGetColumn3.Location = new System.Drawing.Point(292, 109);
this.btnGetColumn3.Name = "btnGetColumn3"; this.btnGetColumn3.Name = "btnGetColumn3";
this.btnGetColumn3.Size = new System.Drawing.Size(60, 23); this.btnGetColumn3.Size = new System.Drawing.Size(69, 23);
this.btnGetColumn3.TabIndex = 23; this.btnGetColumn3.TabIndex = 23;
this.btnGetColumn3.Text = "GetColumn"; this.btnGetColumn3.Text = "GetColumn";
this.btnGetColumn3.UseVisualStyleBackColor = true; this.btnGetColumn3.UseVisualStyleBackColor = true;
@@ -246,11 +247,22 @@
this.txtPages.Size = new System.Drawing.Size(75, 20); this.txtPages.Size = new System.Drawing.Size(75, 20);
this.txtPages.TabIndex = 27; this.txtPages.TabIndex = 27;
// //
// chkRender
//
this.chkRender.AutoSize = true;
this.chkRender.Location = new System.Drawing.Point(292, 138);
this.chkRender.Name = "chkRender";
this.chkRender.Size = new System.Drawing.Size(61, 17);
this.chkRender.TabIndex = 28;
this.chkRender.Text = "Render";
this.chkRender.UseVisualStyleBackColor = true;
//
// FrmPdfInfo // FrmPdfInfo
// //
this.AutoScaleDimensions = new System.Drawing.SizeF(6F, 13F); this.AutoScaleDimensions = new System.Drawing.SizeF(6F, 13F);
this.AutoScaleMode = System.Windows.Forms.AutoScaleMode.Font; this.AutoScaleMode = System.Windows.Forms.AutoScaleMode.Font;
this.ClientSize = new System.Drawing.Size(484, 461); this.ClientSize = new System.Drawing.Size(484, 461);
this.Controls.Add(this.chkRender);
this.Controls.Add(this.txtPages); this.Controls.Add(this.txtPages);
this.Controls.Add(this.btnHasText3); this.Controls.Add(this.btnHasText3);
this.Controls.Add(this.btnGetField3); this.Controls.Add(this.btnGetField3);
@@ -302,5 +314,6 @@
private System.Windows.Forms.TextBox txtField3; private System.Windows.Forms.TextBox txtField3;
private System.Windows.Forms.Button btnGetColumn3; private System.Windows.Forms.Button btnGetColumn3;
private System.Windows.Forms.TextBox txtPages; private System.Windows.Forms.TextBox txtPages;
private System.Windows.Forms.CheckBox chkRender;
} }
} }

View File

@@ -1,7 +1,6 @@
using System; using System;
using System.Collections.Generic; using System.Collections.Generic;
using System.Drawing; using System.Drawing;
using System.Drawing.Drawing2D;
using System.Drawing.Imaging; using System.Drawing.Imaging;
using System.IO; using System.IO;
using System.Linq; using System.Linq;
@@ -214,10 +213,11 @@ namespace VAR.PdfTools.Workbench
if (part.Contains("-")) if (part.Contains("-"))
{ {
string[] range = part.Split('-'); string[] range = part.Split('-');
if (range.Length == 2) { if (range.Length == 2)
{
int pageStart; int pageStart;
int pageEnd; int pageEnd;
if(int.TryParse(range[0], out pageStart) && int.TryParse(range[1], out pageEnd)) if (int.TryParse(range[0], out pageStart) && int.TryParse(range[1], out pageEnd))
{ {
listPages.AddRange(Enumerable.Range(pageStart, (pageEnd - pageStart) + 1)); listPages.AddRange(Enumerable.Range(pageStart, (pageEnd - pageStart) + 1));
} }
@@ -226,13 +226,13 @@ namespace VAR.PdfTools.Workbench
else else
{ {
int pageNum; int pageNum;
if(int.TryParse(part, out pageNum)) if (int.TryParse(part, out pageNum))
{ {
listPages.Add(pageNum); listPages.Add(pageNum);
} }
} }
} }
if(listPages.Count == 0) if (listPages.Count == 0)
{ {
listPages.AddRange(Enumerable.Range(1, maxPages)); listPages.AddRange(Enumerable.Range(1, maxPages));
} }
@@ -294,18 +294,29 @@ namespace VAR.PdfTools.Workbench
} }
PdfDocument doc = PdfDocument.Load(pdfPath); PdfDocument doc = PdfDocument.Load(pdfPath);
string baseDocumentPath = Path.GetDirectoryName(txtPdfPath.Text);
string baseDocumentFilename = Path.GetFileNameWithoutExtension(txtPdfPath.Text);
IEnumerable<int> selectedPages = GetSelectedPages(doc.Pages.Count); IEnumerable<int> selectedPages = GetSelectedPages(doc.Pages.Count);
var columnData = new List<string>(); var columns = new List<string>();
int pageNum = 0; int pageNum = 0;
foreach (PdfDocumentPage page in doc.Pages) foreach (PdfDocumentPage page in doc.Pages)
{ {
pageNum++; pageNum++;
if (selectedPages.Contains(pageNum) == false) { continue; } if (selectedPages.Contains(pageNum) == false) { continue; }
PdfTextExtractor extractor = new PdfTextExtractor(page); PdfTextExtractor extractor = new PdfTextExtractor(page);
columnData.AddRange(extractor.GetColumnAsStrings(column)); PdfTextElementColumn columnData = extractor.GetColumn(column);
if (chkRender.Checked)
{
var pdfPageRenderer = new PdfPageRenderer(extractor);
Bitmap bmp = pdfPageRenderer.Render();
pdfPageRenderer.RenderColumn(columnData, bmp);
string fileName = Path.Combine(baseDocumentPath, string.Format("{0}_{1:0000}.png", baseDocumentFilename, pageNum));
bmp.Save(fileName, ImageFormat.Png);
}
columns.AddRange(columnData.Elements.Select(t => t.VisibleText));
} }
txtOutput.Lines = columnData.ToArray(); txtOutput.Lines = columns.ToArray();
} }
private void btnRender_Click(object sender, EventArgs e) private void btnRender_Click(object sender, EventArgs e)
@@ -325,19 +336,19 @@ namespace VAR.PdfTools.Workbench
lines.Add(string.Format("Number of Pages : {0}", doc.Pages.Count)); lines.Add(string.Format("Number of Pages : {0}", doc.Pages.Count));
IEnumerable<int> selectedPages = GetSelectedPages(doc.Pages.Count); IEnumerable<int> selectedPages = GetSelectedPages(doc.Pages.Count);
int pageNumber = 0; int pageNum = 0;
foreach (PdfDocumentPage page in doc.Pages) foreach (PdfDocumentPage page in doc.Pages)
{ {
pageNumber++; pageNum++;
if (selectedPages.Contains(pageNumber) == false) { continue; } if (selectedPages.Contains(pageNum) == false) { continue; }
PdfPageRenderer pdfPageRenderer = new PdfPageRenderer(page); PdfPageRenderer pdfPageRenderer = new PdfPageRenderer(page);
Bitmap bmp = pdfPageRenderer.Render(); Bitmap bmp = pdfPageRenderer.Render();
lines.Add(string.Format("Page {0:0000} TextElements : {1}", pageNumber, pdfPageRenderer.Extractor.Elements.Count)); lines.Add(string.Format("Page {0:0000} TextElements : {1}", pageNum, pdfPageRenderer.Extractor.Elements.Count));
// Save image to disk // Save image to disk
string fileName = Path.Combine(baseDocumentPath, string.Format("{0}_{1:0000}.png", baseDocumentFilename, pageNumber)); string fileName = Path.Combine(baseDocumentPath, string.Format("{0}_{1:0000}.png", baseDocumentFilename, pageNum));
bmp.Save(fileName, ImageFormat.Png); bmp.Save(fileName, ImageFormat.Png);
} }

View File

@@ -10,7 +10,14 @@ namespace VAR.PdfTools
{ {
private PdfDocumentPage _page; private PdfDocumentPage _page;
private PdfTextExtractor _pdfTextExtractor; private PdfTextExtractor _pdfTextExtractor;
private Rect _pageRect;
private int _pageWidth;
private int _pageHeight;
private int _scale = 10;
private const int MaxSize = 10000;
public PdfTextExtractor Extractor { get { return _pdfTextExtractor; } } public PdfTextExtractor Extractor { get { return _pdfTextExtractor; } }
public PdfPageRenderer(PdfDocumentPage page) public PdfPageRenderer(PdfDocumentPage page)
@@ -19,45 +26,92 @@ namespace VAR.PdfTools
_pdfTextExtractor = new PdfTextExtractor(_page); _pdfTextExtractor = new PdfTextExtractor(_page);
} }
/// <summary>
/// Creates a renderer on top of an existing text extractor, reusing the
/// extractor's page and precomputing the page extents and the pixel scale
/// that the render methods use.
/// </summary>
/// <param name="pdfTextExtractor">Extractor whose text elements will be drawn.</param>
/// <exception cref="ArgumentNullException">
/// <paramref name="pdfTextExtractor"/> is null.
/// </exception>
public PdfPageRenderer(PdfTextExtractor pdfTextExtractor)
{
    if (pdfTextExtractor == null) { throw new ArgumentNullException(nameof(pdfTextExtractor)); }

    _pdfTextExtractor = pdfTextExtractor;
    _page = pdfTextExtractor.Page;

    // Calculate page size and scale.
    _pageRect = _pdfTextExtractor.GetRect();
    if (_pageRect == null)
    {
        // NOTE(review): GetRect() starts from a null rect, so it presumably
        // returns null when the page has no text elements. Leave the metrics
        // at their defaults; Render() returns a placeholder bitmap in that
        // case without touching them.
        return;
    }

    // Clamp to at least one unit so a degenerate rect cannot produce a
    // zero-sized bitmap (the Bitmap constructor rejects zero dimensions).
    _pageWidth = Math.Max(1, (int)Math.Ceiling(_pageRect.XMax - _pageRect.XMin));
    _pageHeight = Math.Max(1, (int)Math.Ceiling(_pageRect.YMax - _pageRect.YMin));

    // Shrink the scale until both bitmap dimensions fit in MaxSize, but
    // never below 1 pixel per page unit.
    while ((_pageWidth * _scale) > MaxSize && _scale > 1) { _scale--; }
    while ((_pageHeight * _scale) > MaxSize && _scale > 1) { _scale--; }
    if (_scale <= 0) { _scale = 1; }
}
/// <summary>
/// Renders every text element of the page onto a new white bitmap.
/// </summary>
/// <returns>
/// A bitmap of the page scaled by the renderer's scale factor, or a
/// 100x200 white placeholder when the extractor has no text elements.
/// The caller owns the returned bitmap and is responsible for disposing it.
/// </returns>
public Bitmap Render()
{
    if (_pdfTextExtractor.Elements.Count == 0)
    {
        // Nothing to render
        Bitmap emptyBmp = new Bitmap(100, 200, PixelFormat.Format32bppArgb);
        using (Graphics gcEmpty = Graphics.FromImage(emptyBmp))
        {
            gcEmpty.Clear(Color.White);
        }
        return emptyBmp;
    }

    // The page metrics are only computed by the extractor-based constructor.
    // When the renderer was built from a page they are still unset, which
    // would give a zero-sized bitmap and a null page rect below, so compute
    // them here on first use.
    if (_pageRect == null)
    {
        _pageRect = _pdfTextExtractor.GetRect();
        _pageWidth = (int)Math.Ceiling(_pageRect.XMax - _pageRect.XMin);
        _pageHeight = (int)Math.Ceiling(_pageRect.YMax - _pageRect.YMin);
        while ((_pageWidth * _scale) > MaxSize && _scale > 1) { _scale--; }
        while ((_pageHeight * _scale) > MaxSize && _scale > 1) { _scale--; }
        if (_scale <= 0) { _scale = 1; }
    }

    // Prepare image
    Bitmap bmp = new Bitmap(_pageWidth * _scale, _pageHeight * _scale, PixelFormat.Format32bppArgb);
    try
    {
        // "using" guarantees the Graphics and pens are released even when
        // drawing throws.
        using (Graphics gc = Graphics.FromImage(bmp))
        using (Pen penTextElem = new Pen(Color.Blue))
        using (Pen penCharElem = new Pen(Color.Navy))
        {
            gc.Clear(Color.White);

            // Draw text elements of the page
            foreach (PdfTextElement textElement in _pdfTextExtractor.Elements)
            {
                DrawTextElement(textElement, gc, penTextElem, penCharElem, _scale, _pageHeight, _pageRect.XMin, _pageRect.YMin, Brushes.Black);
            }
        }
    }
    catch
    {
        // The bitmap never reaches the caller on failure, so release it here.
        bmp.Dispose();
        throw;
    }

    return bmp;
}
/// <summary>
/// Draws the elements of a column in red tones, plus the column extents:
/// a horizontal line at the column's Y between X1 and X2, and two vertical
/// lines from that Y to the bottom edge of the bitmap.
/// </summary>
/// <param name="columnData">Column to highlight.</param>
/// <param name="bmp">
/// Bitmap to draw onto, typically the result of <see cref="Render"/>.
/// When null, a new white bitmap of the page size is created.
/// </param>
/// <returns>The bitmap that was drawn on (the one passed in, or the new one).</returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="columnData"/> is null.
/// </exception>
public Bitmap RenderColumn(PdfTextElementColumn columnData, Bitmap bmp = null)
{
    if (columnData == null) { throw new ArgumentNullException(nameof(columnData)); }

    bool createdBitmap = (bmp == null);
    if (createdBitmap)
    {
        bmp = new Bitmap(_pageWidth * _scale, _pageHeight * _scale, PixelFormat.Format32bppArgb);
    }

    try
    {
        using (Graphics gc = Graphics.FromImage(bmp))
        {
            if (createdBitmap)
            {
                gc.Clear(Color.White);
            }

            // A column that was not found (the Empty sentinel, whose
            // Elements may be null) has nothing to draw and no meaningful
            // extents; return the bitmap untouched instead of throwing.
            if (columnData.Elements == null || ReferenceEquals(columnData, PdfTextElementColumn.Empty))
            {
                return bmp;
            }

            // Draw text elements of the column
            using (Pen penTextElem = new Pen(Color.Red))
            using (Pen penCharElem = new Pen(Color.DarkRed))
            {
                foreach (PdfTextElement textElement in columnData.Elements)
                {
                    DrawTextElement(textElement, gc, penTextElem, penCharElem, _scale, _pageHeight, _pageRect.XMin, _pageRect.YMin, Brushes.OrangeRed);
                }
            }

            // Draw column extents
            using (Pen penColumn = new Pen(Color.Red))
            {
                // The Y axis is flipped: column Y is measured against the
                // page rect's YMax to get a top-down bitmap coordinate.
                float y = (float)(_pageRect.YMax - columnData.Y);
                float x1 = (float)(columnData.X1 - _pageRect.XMin);
                float x2 = (float)(columnData.X2 - _pageRect.XMin);
                gc.DrawLine(penColumn, x1 * _scale, y * _scale, x2 * _scale, y * _scale);
                gc.DrawLine(penColumn, x1 * _scale, y * _scale, x1 * _scale, _pageHeight * _scale);
                gc.DrawLine(penColumn, x2 * _scale, y * _scale, x2 * _scale, _pageHeight * _scale);
            }
        }
    }
    catch
    {
        // Only release a bitmap this method allocated; a caller-supplied
        // bitmap stays owned by the caller.
        if (createdBitmap) { bmp.Dispose(); }
        throw;
    }

    return bmp;
}
private static void DrawTextElement(PdfTextElement textElement, Graphics gc, Pen penTextElem, Pen penCharElem, int scale, int pageHeight, double pageXMin, double pageYMin, Brush brushText)
{ {
double textElementX = textElement.GetX() - pageXMin; double textElementX = textElement.GetX() - pageXMin;
double textElementY = textElement.GetY() - pageYMin; double textElementY = textElement.GetY() - pageYMin;
@@ -74,29 +128,29 @@ namespace VAR.PdfTools
if (penTextElem != null) if (penTextElem != null)
{ {
DrawRoundedRectangle(gc, penTextElem, DrawRoundedRectangle(gc, penTextElem,
(int)(textElementPageX * Scale), (int)(textElementPageX * scale),
(int)(textElementPageY * Scale), (int)(textElementPageY * scale),
(int)(textElementWidth * Scale), (int)(textElementWidth * scale),
(int)(textElementHeight * Scale), (int)(textElementHeight * scale),
5); 5);
} }
using (Font font = new Font("Arial", (int)(textElementHeight * Scale), GraphicsUnit.Pixel)) using (Font font = new Font("Arial", (int)(textElementHeight * scale), GraphicsUnit.Pixel))
{ {
foreach (PdfCharElement c in textElement.Characters) foreach (PdfCharElement c in textElement.Characters)
{ {
gc.DrawString(c.Char, gc.DrawString(c.Char,
font, font,
brushText, brushText,
(int)((textElementPageX + c.Displacement) * Scale), (int)((textElementPageX + c.Displacement) * scale),
(int)(textElementPageY * Scale)); (int)(textElementPageY * scale));
if (penCharElem != null) if (penCharElem != null)
{ {
DrawRoundedRectangle(gc, penCharElem, DrawRoundedRectangle(gc, penCharElem,
(int)((textElementPageX + c.Displacement) * Scale), (int)((textElementPageX + c.Displacement) * scale),
(int)(textElementPageY * Scale), (int)(textElementPageY * scale),
(int)(c.Width * Scale), (int)(c.Width * scale),
(int)(textElementHeight * Scale), (int)(textElementHeight * scale),
5); 5);
} }
} }

View File

@@ -103,4 +103,28 @@ namespace VAR.PdfTools
#endregion #endregion
} }
/// <summary>
/// Result of a column lookup: the header element, the elements found for
/// the column, and the extents used for the search (header Y and the
/// X1..X2 horizontal range).
/// </summary>
public class PdfTextElementColumn
{
    /// <summary>Element used as the column header; null for <see cref="Empty"/>.</summary>
    public PdfTextElement HeadTextElement { get; private set; }

    /// <summary>Elements that belong to the column. Never null.</summary>
    public IEnumerable<PdfTextElement> Elements { get; private set; }

    /// <summary>Y value supplied for the column (see the constructor).</summary>
    public double Y { get; private set; }

    /// <summary>Left horizontal extent of the column.</summary>
    public double X1 { get; private set; }

    /// <summary>Right horizontal extent of the column.</summary>
    public double X2 { get; private set; }

    /// <summary>Shared instance representing "no column": no header and no elements.</summary>
    public static PdfTextElementColumn Empty { get; } = new PdfTextElementColumn();

    private PdfTextElementColumn()
    {
        // Consumers enumerate Elements without a null check, so the empty
        // column must expose an empty sequence rather than null.
        Elements = new PdfTextElement[0];
    }

    /// <summary>Creates a column result.</summary>
    /// <param name="head">Header element of the column.</param>
    /// <param name="elements">Elements of the column; null is treated as empty.</param>
    /// <param name="y">Y value of the column header.</param>
    /// <param name="x1">Left horizontal extent.</param>
    /// <param name="x2">Right horizontal extent.</param>
    public PdfTextElementColumn(PdfTextElement head, IEnumerable<PdfTextElement> elements, double y, double x1, double x2)
    {
        HeadTextElement = head;
        Elements = elements ?? new PdfTextElement[0];
        Y = y;
        X1 = x1;
        X2 = x2;
    }
}
} }

View File

@@ -651,7 +651,7 @@ namespace VAR.PdfTools
#endregion #endregion
#region Public methods #region Public methods
public Rect GetRect() public Rect GetRect()
{ {
Rect rect = null; Rect rect = null;
@@ -664,12 +664,12 @@ namespace VAR.PdfTools
return rect; return rect;
} }
public List<string> GetColumnAsStrings(string column, bool fuzzy =true) public PdfTextElementColumn GetColumn(string column, bool fuzzy = true)
{ {
PdfTextElement columnHead = FindElementByText(column, fuzzy); PdfTextElement columnHead = FindElementByText(column, fuzzy);
if (columnHead == null) if (columnHead == null)
{ {
return new List<string>(); return PdfTextElementColumn.Empty;
} }
double headY = columnHead.GetY(); double headY = columnHead.GetY();
double headX1 = columnHead.GetX(); double headX1 = columnHead.GetX();
@@ -717,25 +717,34 @@ namespace VAR.PdfTools
columnDataRaw = columnDataRaw.OrderByDescending(elem => elem.GetY()).ToList(); columnDataRaw = columnDataRaw.OrderByDescending(elem => elem.GetY()).ToList();
// Only items completely inside extents, and break on the first element outside // Only items completely inside extents, and break on the first element outside
var columnData = new List<PdfTextElement>(); var columnElements = new List<PdfTextElement>();
foreach (PdfTextElement elem in columnDataRaw) foreach (PdfTextElement elem in columnDataRaw)
{ {
double elemX1 = elem.GetX(); double elemX1 = elem.GetX();
double elemX2 = elemX1 + elem.VisibleWidth; double elemX2 = elemX1 + elem.VisibleWidth;
if (elemX1 < extentX1 || elemX2 > extentX2) { break; } if (elemX1 < extentX1 || elemX2 > extentX2) { break; }
columnData.Add(elem); columnElements.Add(elem);
} }
var columnData = new PdfTextElementColumn(columnHead, columnElements, headY, extentX1, extentX2);
return columnData;
}
/// <summary>
/// Returns the visible text of every element in the column whose header
/// matches <paramref name="column"/>.
/// </summary>
/// <param name="column">Text of the column header to look for.</param>
/// <param name="fuzzy">Passed through to <see cref="GetColumn"/> for header matching.</param>
/// <returns>
/// The column texts in the order reported by <see cref="GetColumn"/>;
/// an empty list when the column is not found.
/// </returns>
public List<string> GetColumnAsStrings(string column, bool fuzzy = true)
{
    PdfTextElementColumn columnData = GetColumn(column, fuzzy);

    // Emit result
    var result = new List<string>();

    // A missing column is reported as PdfTextElementColumn.Empty, whose
    // Elements may be null; return an empty list in that case, as this
    // method did before it was built on GetColumn.
    if (columnData == null || columnData.Elements == null)
    {
        return result;
    }

    foreach (PdfTextElement elem in columnData.Elements)
    {
        result.Add(elem.VisibleText);
    }
    return result;
}
public string GetFieldAsString(string field, bool fuzzy = true) public string GetFieldAsString(string field, bool fuzzy = true)
{ {
PdfTextElement fieldTitle = FindElementByText(field, fuzzy); PdfTextElement fieldTitle = FindElementByText(field, fuzzy);
@@ -763,7 +772,7 @@ namespace VAR.PdfTools
return fieldData.OrderBy(elem => elem.GetX()).FirstOrDefault().VisibleText; return fieldData.OrderBy(elem => elem.GetX()).FirstOrDefault().VisibleText;
} }
public bool HasText(string text, bool fuzzy = true) public bool HasText(string text, bool fuzzy = true)
{ {
List<PdfTextElement> list = FindElementsContainingText(text, fuzzy); List<PdfTextElement> list = FindElementsContainingText(text, fuzzy);