PdfTextExtractor: Get results as PdfTextElementColumn, for debugging purposes.

This commit is contained in:
2019-10-27 18:45:13 +01:00
parent 386b38bd21
commit 9af363529c
5 changed files with 166 additions and 55 deletions

View File

@@ -48,6 +48,7 @@
this.txtField3 = new System.Windows.Forms.TextBox();
this.btnGetColumn3 = new System.Windows.Forms.Button();
this.txtPages = new System.Windows.Forms.TextBox();
this.chkRender = new System.Windows.Forms.CheckBox();
this.SuspendLayout();
//
// lblOutputs
@@ -120,7 +121,7 @@
//
this.btnGetColumn1.Location = new System.Drawing.Point(292, 51);
this.btnGetColumn1.Name = "btnGetColumn1";
this.btnGetColumn1.Size = new System.Drawing.Size(60, 23);
this.btnGetColumn1.Size = new System.Drawing.Size(69, 23);
this.btnGetColumn1.TabIndex = 12;
this.btnGetColumn1.Text = "GetColumn";
this.btnGetColumn1.UseVisualStyleBackColor = true;
@@ -195,7 +196,7 @@
//
this.btnGetColumn2.Location = new System.Drawing.Point(292, 80);
this.btnGetColumn2.Name = "btnGetColumn2";
this.btnGetColumn2.Size = new System.Drawing.Size(60, 23);
this.btnGetColumn2.Size = new System.Drawing.Size(69, 23);
this.btnGetColumn2.TabIndex = 19;
this.btnGetColumn2.Text = "GetColumn";
this.btnGetColumn2.UseVisualStyleBackColor = true;
@@ -232,7 +233,7 @@
//
this.btnGetColumn3.Location = new System.Drawing.Point(292, 109);
this.btnGetColumn3.Name = "btnGetColumn3";
this.btnGetColumn3.Size = new System.Drawing.Size(60, 23);
this.btnGetColumn3.Size = new System.Drawing.Size(69, 23);
this.btnGetColumn3.TabIndex = 23;
this.btnGetColumn3.Text = "GetColumn";
this.btnGetColumn3.UseVisualStyleBackColor = true;
@@ -246,11 +247,22 @@
this.txtPages.Size = new System.Drawing.Size(75, 20);
this.txtPages.TabIndex = 27;
//
// chkRender
//
this.chkRender.AutoSize = true;
this.chkRender.Location = new System.Drawing.Point(292, 138);
this.chkRender.Name = "chkRender";
this.chkRender.Size = new System.Drawing.Size(61, 17);
this.chkRender.TabIndex = 28;
this.chkRender.Text = "Render";
this.chkRender.UseVisualStyleBackColor = true;
//
// FrmPdfInfo
//
this.AutoScaleDimensions = new System.Drawing.SizeF(6F, 13F);
this.AutoScaleMode = System.Windows.Forms.AutoScaleMode.Font;
this.ClientSize = new System.Drawing.Size(484, 461);
this.Controls.Add(this.chkRender);
this.Controls.Add(this.txtPages);
this.Controls.Add(this.btnHasText3);
this.Controls.Add(this.btnGetField3);
@@ -302,5 +314,6 @@
private System.Windows.Forms.TextBox txtField3;
private System.Windows.Forms.Button btnGetColumn3;
private System.Windows.Forms.TextBox txtPages;
private System.Windows.Forms.CheckBox chkRender;
}
}

View File

@@ -1,7 +1,6 @@
using System;
using System.Collections.Generic;
using System.Drawing;
using System.Drawing.Drawing2D;
using System.Drawing.Imaging;
using System.IO;
using System.Linq;
@@ -214,10 +213,11 @@ namespace VAR.PdfTools.Workbench
if (part.Contains("-"))
{
string[] range = part.Split('-');
if (range.Length == 2) {
if (range.Length == 2)
{
int pageStart;
int pageEnd;
if(int.TryParse(range[0], out pageStart) && int.TryParse(range[1], out pageEnd))
if (int.TryParse(range[0], out pageStart) && int.TryParse(range[1], out pageEnd))
{
listPages.AddRange(Enumerable.Range(pageStart, (pageEnd - pageStart) + 1));
}
@@ -226,13 +226,13 @@ namespace VAR.PdfTools.Workbench
else
{
int pageNum;
if(int.TryParse(part, out pageNum))
if (int.TryParse(part, out pageNum))
{
listPages.Add(pageNum);
}
}
}
if(listPages.Count == 0)
if (listPages.Count == 0)
{
listPages.AddRange(Enumerable.Range(1, maxPages));
}
@@ -294,18 +294,29 @@ namespace VAR.PdfTools.Workbench
}
PdfDocument doc = PdfDocument.Load(pdfPath);
string baseDocumentPath = Path.GetDirectoryName(txtPdfPath.Text);
string baseDocumentFilename = Path.GetFileNameWithoutExtension(txtPdfPath.Text);
IEnumerable<int> selectedPages = GetSelectedPages(doc.Pages.Count);
var columnData = new List<string>();
var columns = new List<string>();
int pageNum = 0;
foreach (PdfDocumentPage page in doc.Pages)
{
pageNum++;
if (selectedPages.Contains(pageNum) == false) { continue; }
PdfTextExtractor extractor = new PdfTextExtractor(page);
columnData.AddRange(extractor.GetColumnAsStrings(column));
PdfTextElementColumn columnData = extractor.GetColumn(column);
if (chkRender.Checked)
{
var pdfPageRenderer = new PdfPageRenderer(extractor);
Bitmap bmp = pdfPageRenderer.Render();
pdfPageRenderer.RenderColumn(columnData, bmp);
string fileName = Path.Combine(baseDocumentPath, string.Format("{0}_{1:0000}.png", baseDocumentFilename, pageNum));
bmp.Save(fileName, ImageFormat.Png);
}
columns.AddRange(columnData.Elements.Select(t => t.VisibleText));
}
txtOutput.Lines = columnData.ToArray();
txtOutput.Lines = columns.ToArray();
}
private void btnRender_Click(object sender, EventArgs e)
@@ -325,19 +336,19 @@ namespace VAR.PdfTools.Workbench
lines.Add(string.Format("Number of Pages : {0}", doc.Pages.Count));
IEnumerable<int> selectedPages = GetSelectedPages(doc.Pages.Count);
int pageNumber = 0;
int pageNum = 0;
foreach (PdfDocumentPage page in doc.Pages)
{
pageNumber++;
if (selectedPages.Contains(pageNumber) == false) { continue; }
pageNum++;
if (selectedPages.Contains(pageNum) == false) { continue; }
PdfPageRenderer pdfPageRenderer = new PdfPageRenderer(page);
Bitmap bmp = pdfPageRenderer.Render();
lines.Add(string.Format("Page {0:0000} TextElements : {1}", pageNumber, pdfPageRenderer.Extractor.Elements.Count));
lines.Add(string.Format("Page {0:0000} TextElements : {1}", pageNum, pdfPageRenderer.Extractor.Elements.Count));
// Save image to disk
string fileName = Path.Combine(baseDocumentPath, string.Format("{0}_{1:0000}.png", baseDocumentFilename, pageNumber));
string fileName = Path.Combine(baseDocumentPath, string.Format("{0}_{1:0000}.png", baseDocumentFilename, pageNum));
bmp.Save(fileName, ImageFormat.Png);
}

View File

@@ -10,7 +10,14 @@ namespace VAR.PdfTools
{
private PdfDocumentPage _page;
private PdfTextExtractor _pdfTextExtractor;
private Rect _pageRect;
private int _pageWidth;
private int _pageHeight;
private int _scale = 10;
private const int MaxSize = 10000;
public PdfTextExtractor Extractor { get { return _pdfTextExtractor; } }
public PdfPageRenderer(PdfDocumentPage page)
@@ -19,45 +26,92 @@ namespace VAR.PdfTools
_pdfTextExtractor = new PdfTextExtractor(_page);
}
/// <summary>
/// Creates a renderer bound to an existing text extractor and precomputes
/// the page rectangle, the integer page size and the bitmap scale factor.
/// </summary>
/// <param name="pdfTextExtractor">Extractor whose page and text elements will be rendered.</param>
public PdfPageRenderer(PdfTextExtractor pdfTextExtractor)
{
    _pdfTextExtractor = pdfTextExtractor;
    _page = pdfTextExtractor.Page;

    // Calculate page size and scale
    _pageRect = _pdfTextExtractor.GetRect();
    if (_pageRect == null)
    {
        // NOTE(review): GetRect() starts from a null Rect, so it presumably
        // returns null when the extractor has no elements - confirm.
        // Leave the size at 0 and the scale at its default so that Render()
        // can still take its "nothing to render" path instead of failing here.
        return;
    }
    _pageWidth = (int)Math.Ceiling(_pageRect.XMax - _pageRect.XMin);
    _pageHeight = (int)Math.Ceiling(_pageRect.YMax - _pageRect.YMin);

    // Shrink the scale until both bitmap dimensions fit in MaxSize, never going below 1.
    while (_scale > 1 &&
        ((_pageWidth * _scale) > MaxSize || (_pageHeight * _scale) > MaxSize))
    {
        _scale--;
    }
}
public Bitmap Render()
{
if (_pdfTextExtractor.Elements.Count == 0)
{
// Nothing to render
Bitmap emptyBmp = new Bitmap(100, 200, PixelFormat.Format32bppArgb);
using (Graphics gc = Graphics.FromImage(emptyBmp))
gc.Clear(Color.White);
using (Graphics gcEmpty = Graphics.FromImage(emptyBmp))
gcEmpty.Clear(Color.White);
return emptyBmp;
}
// Calculate page size and scale
Rect pageRect = _pdfTextExtractor.GetRect();
int pageWidth = (int)Math.Ceiling(pageRect.XMax - pageRect.XMin);
int pageHeight = (int)Math.Ceiling(pageRect.YMax - pageRect.YMin);
int Scale = 10;
int MaxSize = 10000;
while ((pageWidth * Scale) > MaxSize) { Scale--; }
while ((pageHeight * Scale) > MaxSize && Scale > 1) { Scale--; }
if (Scale <= 0) { Scale = 1; }
// Prepare image
Bitmap bmp = new Bitmap(_pageWidth * _scale, _pageHeight * _scale, PixelFormat.Format32bppArgb);
Graphics gc = Graphics.FromImage(bmp);
gc.Clear(Color.White);
// Draw page image
Bitmap bmp = new Bitmap(pageWidth * Scale, pageHeight * Scale, PixelFormat.Format32bppArgb);
using (Graphics gc = Graphics.FromImage(bmp))
// Draw text elements of the page
using (Pen penTextElem = new Pen(Color.Blue))
using (Pen penCharElem = new Pen(Color.Navy))
{
gc.Clear(Color.White);
// Draw text elements
foreach (PdfTextElement textElement in _pdfTextExtractor.Elements)
{
DrawTextElement(textElement, gc, penTextElem, penCharElem, Scale, pageHeight, pageRect.XMin, pageRect.YMin, Brushes.Black);
DrawTextElement(textElement, gc, penTextElem, penCharElem, _scale, _pageHeight, _pageRect.XMin, _pageRect.YMin, Brushes.Black);
}
}
gc.Dispose();
return bmp;
}
private static void DrawTextElement(PdfTextElement textElement, Graphics gc, Pen penTextElem, Pen penCharElem, int Scale, int pageHeight, double pageXMin, double pageYMin, Brush brushText)
/// <summary>
/// Draws the text elements of a column and its extents (a horizontal line at
/// the column Y and two vertical lines at X1 and X2) on a bitmap.
/// </summary>
/// <param name="columnData">Column to draw. A null column, the Empty column or a column without elements draws nothing.</param>
/// <param name="bmp">Bitmap to draw on; when null a new white bitmap of the scaled page size is created.</param>
/// <returns>The bitmap that was drawn on (the one passed in, or the newly created one).</returns>
public Bitmap RenderColumn(PdfTextElementColumn columnData, Bitmap bmp = null)
{
    bool createdBitmap = (bmp == null);
    if (createdBitmap)
    {
        bmp = new Bitmap(_pageWidth * _scale, _pageHeight * _scale, PixelFormat.Format32bppArgb);
    }

    // "using" guarantees the Graphics is released even if drawing throws.
    using (Graphics gc = Graphics.FromImage(bmp))
    {
        if (createdBitmap)
        {
            gc.Clear(Color.White);
        }

        // A column that was not found carries no elements and no meaningful extents.
        if (columnData == null ||
            ReferenceEquals(columnData, PdfTextElementColumn.Empty) ||
            columnData.Elements == null)
        {
            return bmp;
        }

        // Draw text elements of the column
        using (Pen penTextElem = new Pen(Color.Red))
        using (Pen penCharElem = new Pen(Color.DarkRed))
        {
            foreach (PdfTextElement textElement in columnData.Elements)
            {
                DrawTextElement(textElement, gc, penTextElem, penCharElem, _scale, _pageHeight, _pageRect.XMin, _pageRect.YMin, Brushes.OrangeRed);
            }
        }

        // Draw column extents
        using (Pen penColumn = new Pen(Color.Red))
        {
            // Y is measured down from the top of the page rectangle; X from its left edge.
            float y = (float)(_pageRect.YMax - columnData.Y);
            float x1 = (float)(columnData.X1 - _pageRect.XMin);
            float x2 = (float)(columnData.X2 - _pageRect.XMin);
            gc.DrawLine(penColumn, x1 * _scale, y * _scale, x2 * _scale, y * _scale);
            gc.DrawLine(penColumn, x1 * _scale, y * _scale, x1 * _scale, _pageHeight * _scale);
            gc.DrawLine(penColumn, x2 * _scale, y * _scale, x2 * _scale, _pageHeight * _scale);
        }
    }
    return bmp;
}
private static void DrawTextElement(PdfTextElement textElement, Graphics gc, Pen penTextElem, Pen penCharElem, int scale, int pageHeight, double pageXMin, double pageYMin, Brush brushText)
{
double textElementX = textElement.GetX() - pageXMin;
double textElementY = textElement.GetY() - pageYMin;
@@ -74,29 +128,29 @@ namespace VAR.PdfTools
if (penTextElem != null)
{
DrawRoundedRectangle(gc, penTextElem,
(int)(textElementPageX * Scale),
(int)(textElementPageY * Scale),
(int)(textElementWidth * Scale),
(int)(textElementHeight * Scale),
(int)(textElementPageX * scale),
(int)(textElementPageY * scale),
(int)(textElementWidth * scale),
(int)(textElementHeight * scale),
5);
}
using (Font font = new Font("Arial", (int)(textElementHeight * Scale), GraphicsUnit.Pixel))
using (Font font = new Font("Arial", (int)(textElementHeight * scale), GraphicsUnit.Pixel))
{
foreach (PdfCharElement c in textElement.Characters)
{
gc.DrawString(c.Char,
font,
brushText,
(int)((textElementPageX + c.Displacement) * Scale),
(int)(textElementPageY * Scale));
(int)((textElementPageX + c.Displacement) * scale),
(int)(textElementPageY * scale));
if (penCharElem != null)
{
DrawRoundedRectangle(gc, penCharElem,
(int)((textElementPageX + c.Displacement) * Scale),
(int)(textElementPageY * Scale),
(int)(c.Width * Scale),
(int)(textElementHeight * Scale),
(int)((textElementPageX + c.Displacement) * scale),
(int)(textElementPageY * scale),
(int)(c.Width * scale),
(int)(textElementHeight * scale),
5);
}
}

View File

@@ -103,4 +103,28 @@ namespace VAR.PdfTools
#endregion
}
/// <summary>
/// Result of a column lookup: the head element, the elements that belong to
/// the column, and the extents (Y, X1, X2) supplied by the creator.
/// </summary>
public class PdfTextElementColumn
{
    /// <summary>Element used as the column head; null for <see cref="Empty"/>.</summary>
    public PdfTextElement HeadTextElement { get; private set; }

    /// <summary>Elements of the column. Never null, so callers can enumerate it directly.</summary>
    public IEnumerable<PdfTextElement> Elements { get; private set; }

    public double Y { get; private set; }

    public double X1 { get; private set; }

    public double X2 { get; private set; }

    /// <summary>Shared instance with no head element and no elements.</summary>
    public static PdfTextElementColumn Empty { get; } = new PdfTextElementColumn();

    private PdfTextElementColumn()
    {
        // Without this, Empty.Elements would be null and every consumer that
        // enumerates it would throw a NullReferenceException.
        Elements = new PdfTextElement[0];
    }

    /// <summary>Creates a column from its head element, its elements and its extents.</summary>
    /// <param name="head">Head element of the column.</param>
    /// <param name="elements">Elements of the column; null is stored as an empty sequence.</param>
    /// <param name="y">Value exposed as <see cref="Y"/>.</param>
    /// <param name="x1">Value exposed as <see cref="X1"/>.</param>
    /// <param name="x2">Value exposed as <see cref="X2"/>.</param>
    public PdfTextElementColumn(PdfTextElement head, IEnumerable<PdfTextElement> elements, double y, double x1, double x2)
    {
        HeadTextElement = head;
        Elements = elements ?? new PdfTextElement[0];
        Y = y;
        X1 = x1;
        X2 = x2;
    }
}
}

View File

@@ -651,7 +651,7 @@ namespace VAR.PdfTools
#endregion
#region Public methods
public Rect GetRect()
{
Rect rect = null;
@@ -664,12 +664,12 @@ namespace VAR.PdfTools
return rect;
}
public List<string> GetColumnAsStrings(string column, bool fuzzy =true)
public PdfTextElementColumn GetColumn(string column, bool fuzzy = true)
{
PdfTextElement columnHead = FindElementByText(column, fuzzy);
if (columnHead == null)
{
return new List<string>();
return PdfTextElementColumn.Empty;
}
double headY = columnHead.GetY();
double headX1 = columnHead.GetX();
@@ -717,25 +717,34 @@ namespace VAR.PdfTools
columnDataRaw = columnDataRaw.OrderByDescending(elem => elem.GetY()).ToList();
// Only items completely inside extents, and break on the first element outside
var columnData = new List<PdfTextElement>();
var columnElements = new List<PdfTextElement>();
foreach (PdfTextElement elem in columnDataRaw)
{
double elemX1 = elem.GetX();
double elemX2 = elemX1 + elem.VisibleWidth;
if (elemX1 < extentX1 || elemX2 > extentX2) { break; }
columnData.Add(elem);
columnElements.Add(elem);
}
var columnData = new PdfTextElementColumn(columnHead, columnElements, headY, extentX1, extentX2);
return columnData;
}
/// <summary>
/// Returns the visible text of every element of the column located by GetColumn.
/// </summary>
/// <param name="column">Text used to find the column head element; passed through to GetColumn.</param>
/// <param name="fuzzy">Matching flag; passed through to GetColumn.</param>
/// <returns>A new list with one VisibleText entry per column element, in enumeration order.</returns>
public List<string> GetColumnAsStrings(string column, bool fuzzy = true)
{
PdfTextElementColumn columnData = GetColumn(column, fuzzy);
// Emit result
var result = new List<string>();
foreach (PdfTextElement elem in columnData)
foreach (PdfTextElement elem in columnData.Elements)
{
result.Add(elem.VisibleText);
}
return result;
}
public string GetFieldAsString(string field, bool fuzzy = true)
{
PdfTextElement fieldTitle = FindElementByText(field, fuzzy);
@@ -763,7 +772,7 @@ namespace VAR.PdfTools
return fieldData.OrderBy(elem => elem.GetX()).FirstOrDefault().VisibleText;
}
public bool HasText(string text, bool fuzzy = true)
{
List<PdfTextElement> list = FindElementsContainingText(text, fuzzy);