PdfTextExtractor: Get results as PdfTextElementColumn, for debugging purposes.
This commit is contained in:
19
VAR.PdfTools.Workbench/FrmPdfInfo.Designer.cs
generated
19
VAR.PdfTools.Workbench/FrmPdfInfo.Designer.cs
generated
@@ -48,6 +48,7 @@
|
||||
this.txtField3 = new System.Windows.Forms.TextBox();
|
||||
this.btnGetColumn3 = new System.Windows.Forms.Button();
|
||||
this.txtPages = new System.Windows.Forms.TextBox();
|
||||
this.chkRender = new System.Windows.Forms.CheckBox();
|
||||
this.SuspendLayout();
|
||||
//
|
||||
// lblOutputs
|
||||
@@ -120,7 +121,7 @@
|
||||
//
|
||||
this.btnGetColumn1.Location = new System.Drawing.Point(292, 51);
|
||||
this.btnGetColumn1.Name = "btnGetColumn1";
|
||||
this.btnGetColumn1.Size = new System.Drawing.Size(60, 23);
|
||||
this.btnGetColumn1.Size = new System.Drawing.Size(69, 23);
|
||||
this.btnGetColumn1.TabIndex = 12;
|
||||
this.btnGetColumn1.Text = "GetColumn";
|
||||
this.btnGetColumn1.UseVisualStyleBackColor = true;
|
||||
@@ -195,7 +196,7 @@
|
||||
//
|
||||
this.btnGetColumn2.Location = new System.Drawing.Point(292, 80);
|
||||
this.btnGetColumn2.Name = "btnGetColumn2";
|
||||
this.btnGetColumn2.Size = new System.Drawing.Size(60, 23);
|
||||
this.btnGetColumn2.Size = new System.Drawing.Size(69, 23);
|
||||
this.btnGetColumn2.TabIndex = 19;
|
||||
this.btnGetColumn2.Text = "GetColumn";
|
||||
this.btnGetColumn2.UseVisualStyleBackColor = true;
|
||||
@@ -232,7 +233,7 @@
|
||||
//
|
||||
this.btnGetColumn3.Location = new System.Drawing.Point(292, 109);
|
||||
this.btnGetColumn3.Name = "btnGetColumn3";
|
||||
this.btnGetColumn3.Size = new System.Drawing.Size(60, 23);
|
||||
this.btnGetColumn3.Size = new System.Drawing.Size(69, 23);
|
||||
this.btnGetColumn3.TabIndex = 23;
|
||||
this.btnGetColumn3.Text = "GetColumn";
|
||||
this.btnGetColumn3.UseVisualStyleBackColor = true;
|
||||
@@ -246,11 +247,22 @@
|
||||
this.txtPages.Size = new System.Drawing.Size(75, 20);
|
||||
this.txtPages.TabIndex = 27;
|
||||
//
|
||||
// chkRender
|
||||
//
|
||||
this.chkRender.AutoSize = true;
|
||||
this.chkRender.Location = new System.Drawing.Point(292, 138);
|
||||
this.chkRender.Name = "chkRender";
|
||||
this.chkRender.Size = new System.Drawing.Size(61, 17);
|
||||
this.chkRender.TabIndex = 28;
|
||||
this.chkRender.Text = "Render";
|
||||
this.chkRender.UseVisualStyleBackColor = true;
|
||||
//
|
||||
// FrmPdfInfo
|
||||
//
|
||||
this.AutoScaleDimensions = new System.Drawing.SizeF(6F, 13F);
|
||||
this.AutoScaleMode = System.Windows.Forms.AutoScaleMode.Font;
|
||||
this.ClientSize = new System.Drawing.Size(484, 461);
|
||||
this.Controls.Add(this.chkRender);
|
||||
this.Controls.Add(this.txtPages);
|
||||
this.Controls.Add(this.btnHasText3);
|
||||
this.Controls.Add(this.btnGetField3);
|
||||
@@ -302,5 +314,6 @@
|
||||
private System.Windows.Forms.TextBox txtField3;
|
||||
private System.Windows.Forms.Button btnGetColumn3;
|
||||
private System.Windows.Forms.TextBox txtPages;
|
||||
private System.Windows.Forms.CheckBox chkRender;
|
||||
}
|
||||
}
|
||||
@@ -1,7 +1,6 @@
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Drawing;
|
||||
using System.Drawing.Drawing2D;
|
||||
using System.Drawing.Imaging;
|
||||
using System.IO;
|
||||
using System.Linq;
|
||||
@@ -214,10 +213,11 @@ namespace VAR.PdfTools.Workbench
|
||||
if (part.Contains("-"))
|
||||
{
|
||||
string[] range = part.Split('-');
|
||||
if (range.Length == 2) {
|
||||
if (range.Length == 2)
|
||||
{
|
||||
int pageStart;
|
||||
int pageEnd;
|
||||
if(int.TryParse(range[0], out pageStart) && int.TryParse(range[1], out pageEnd))
|
||||
if (int.TryParse(range[0], out pageStart) && int.TryParse(range[1], out pageEnd))
|
||||
{
|
||||
listPages.AddRange(Enumerable.Range(pageStart, (pageEnd - pageStart) + 1));
|
||||
}
|
||||
@@ -226,13 +226,13 @@ namespace VAR.PdfTools.Workbench
|
||||
else
|
||||
{
|
||||
int pageNum;
|
||||
if(int.TryParse(part, out pageNum))
|
||||
if (int.TryParse(part, out pageNum))
|
||||
{
|
||||
listPages.Add(pageNum);
|
||||
}
|
||||
}
|
||||
}
|
||||
if(listPages.Count == 0)
|
||||
if (listPages.Count == 0)
|
||||
{
|
||||
listPages.AddRange(Enumerable.Range(1, maxPages));
|
||||
}
|
||||
@@ -294,18 +294,29 @@ namespace VAR.PdfTools.Workbench
|
||||
}
|
||||
|
||||
PdfDocument doc = PdfDocument.Load(pdfPath);
|
||||
string baseDocumentPath = Path.GetDirectoryName(txtPdfPath.Text);
|
||||
string baseDocumentFilename = Path.GetFileNameWithoutExtension(txtPdfPath.Text);
|
||||
|
||||
IEnumerable<int> selectedPages = GetSelectedPages(doc.Pages.Count);
|
||||
var columnData = new List<string>();
|
||||
var columns = new List<string>();
|
||||
int pageNum = 0;
|
||||
foreach (PdfDocumentPage page in doc.Pages)
|
||||
{
|
||||
pageNum++;
|
||||
if (selectedPages.Contains(pageNum) == false) { continue; }
|
||||
PdfTextExtractor extractor = new PdfTextExtractor(page);
|
||||
columnData.AddRange(extractor.GetColumnAsStrings(column));
|
||||
PdfTextElementColumn columnData = extractor.GetColumn(column);
|
||||
if (chkRender.Checked)
|
||||
{
|
||||
var pdfPageRenderer = new PdfPageRenderer(extractor);
|
||||
Bitmap bmp = pdfPageRenderer.Render();
|
||||
pdfPageRenderer.RenderColumn(columnData, bmp);
|
||||
string fileName = Path.Combine(baseDocumentPath, string.Format("{0}_{1:0000}.png", baseDocumentFilename, pageNum));
|
||||
bmp.Save(fileName, ImageFormat.Png);
|
||||
}
|
||||
columns.AddRange(columnData.Elements.Select(t => t.VisibleText));
|
||||
}
|
||||
txtOutput.Lines = columnData.ToArray();
|
||||
txtOutput.Lines = columns.ToArray();
|
||||
}
|
||||
|
||||
private void btnRender_Click(object sender, EventArgs e)
|
||||
@@ -325,19 +336,19 @@ namespace VAR.PdfTools.Workbench
|
||||
lines.Add(string.Format("Number of Pages : {0}", doc.Pages.Count));
|
||||
|
||||
IEnumerable<int> selectedPages = GetSelectedPages(doc.Pages.Count);
|
||||
int pageNumber = 0;
|
||||
int pageNum = 0;
|
||||
foreach (PdfDocumentPage page in doc.Pages)
|
||||
{
|
||||
pageNumber++;
|
||||
if (selectedPages.Contains(pageNumber) == false) { continue; }
|
||||
pageNum++;
|
||||
if (selectedPages.Contains(pageNum) == false) { continue; }
|
||||
|
||||
PdfPageRenderer pdfPageRenderer = new PdfPageRenderer(page);
|
||||
Bitmap bmp = pdfPageRenderer.Render();
|
||||
|
||||
lines.Add(string.Format("Page {0:0000} TextElements : {1}", pageNumber, pdfPageRenderer.Extractor.Elements.Count));
|
||||
lines.Add(string.Format("Page {0:0000} TextElements : {1}", pageNum, pdfPageRenderer.Extractor.Elements.Count));
|
||||
|
||||
// Save image to disk
|
||||
string fileName = Path.Combine(baseDocumentPath, string.Format("{0}_{1:0000}.png", baseDocumentFilename, pageNumber));
|
||||
string fileName = Path.Combine(baseDocumentPath, string.Format("{0}_{1:0000}.png", baseDocumentFilename, pageNum));
|
||||
bmp.Save(fileName, ImageFormat.Png);
|
||||
}
|
||||
|
||||
|
||||
@@ -10,7 +10,14 @@ namespace VAR.PdfTools
|
||||
{
|
||||
private PdfDocumentPage _page;
|
||||
private PdfTextExtractor _pdfTextExtractor;
|
||||
|
||||
private Rect _pageRect;
|
||||
private int _pageWidth;
|
||||
private int _pageHeight;
|
||||
private int _scale = 10;
|
||||
|
||||
private const int MaxSize = 10000;
|
||||
|
||||
|
||||
public PdfTextExtractor Extractor { get { return _pdfTextExtractor; } }
|
||||
|
||||
public PdfPageRenderer(PdfDocumentPage page)
|
||||
@@ -19,45 +26,92 @@ namespace VAR.PdfTools
|
||||
_pdfTextExtractor = new PdfTextExtractor(_page);
|
||||
}
|
||||
|
||||
/// <summary>
/// Creates a renderer on top of an already built text extractor, so the
/// extraction work is not repeated, and precomputes the page size and the
/// drawing scale used by the render methods.
/// </summary>
/// <param name="pdfTextExtractor">Extractor whose page and elements are rendered.</param>
/// <exception cref="ArgumentNullException"><paramref name="pdfTextExtractor"/> is null.</exception>
public PdfPageRenderer(PdfTextExtractor pdfTextExtractor)
{
    if (pdfTextExtractor == null) { throw new ArgumentNullException(nameof(pdfTextExtractor)); }

    _pdfTextExtractor = pdfTextExtractor;
    _page = pdfTextExtractor.Page;

    // Calculate page size and scale
    _pageRect = _pdfTextExtractor.GetRect();
    if (_pageRect == null)
    {
        // NOTE(review): GetRect() starts from a null Rect, so it presumably
        // returns null for a page without text elements - confirm. In that
        // case keep the default size/scale; Render() returns a placeholder
        // bitmap when there are no elements.
        return;
    }
    _pageWidth = (int)Math.Ceiling(_pageRect.XMax - _pageRect.XMin);
    _pageHeight = (int)Math.Ceiling(_pageRect.YMax - _pageRect.YMin);

    // Shrink the scale until both bitmap dimensions fit into MaxSize,
    // never going below 1.
    while (_scale > 1 && (_pageWidth * _scale) > MaxSize) { _scale--; }
    while (_scale > 1 && (_pageHeight * _scale) > MaxSize) { _scale--; }
}
|
||||
|
||||
public Bitmap Render()
|
||||
{
|
||||
if (_pdfTextExtractor.Elements.Count == 0)
|
||||
{
|
||||
// Nothing to render
|
||||
Bitmap emptyBmp = new Bitmap(100, 200, PixelFormat.Format32bppArgb);
|
||||
using (Graphics gc = Graphics.FromImage(emptyBmp))
|
||||
gc.Clear(Color.White);
|
||||
using (Graphics gcEmpty = Graphics.FromImage(emptyBmp))
|
||||
gcEmpty.Clear(Color.White);
|
||||
return emptyBmp;
|
||||
}
|
||||
|
||||
// Calculate page size and scale
|
||||
Rect pageRect = _pdfTextExtractor.GetRect();
|
||||
int pageWidth = (int)Math.Ceiling(pageRect.XMax - pageRect.XMin);
|
||||
int pageHeight = (int)Math.Ceiling(pageRect.YMax - pageRect.YMin);
|
||||
int Scale = 10;
|
||||
int MaxSize = 10000;
|
||||
while ((pageWidth * Scale) > MaxSize) { Scale--; }
|
||||
while ((pageHeight * Scale) > MaxSize && Scale > 1) { Scale--; }
|
||||
if (Scale <= 0) { Scale = 1; }
|
||||
// Prepare image
|
||||
Bitmap bmp = new Bitmap(_pageWidth * _scale, _pageHeight * _scale, PixelFormat.Format32bppArgb);
|
||||
Graphics gc = Graphics.FromImage(bmp);
|
||||
gc.Clear(Color.White);
|
||||
|
||||
// Draw page image
|
||||
Bitmap bmp = new Bitmap(pageWidth * Scale, pageHeight * Scale, PixelFormat.Format32bppArgb);
|
||||
using (Graphics gc = Graphics.FromImage(bmp))
|
||||
// Draw text elements of the page
|
||||
using (Pen penTextElem = new Pen(Color.Blue))
|
||||
using (Pen penCharElem = new Pen(Color.Navy))
|
||||
{
|
||||
gc.Clear(Color.White);
|
||||
|
||||
// Draw text elements
|
||||
foreach (PdfTextElement textElement in _pdfTextExtractor.Elements)
|
||||
{
|
||||
DrawTextElement(textElement, gc, penTextElem, penCharElem, Scale, pageHeight, pageRect.XMin, pageRect.YMin, Brushes.Black);
|
||||
DrawTextElement(textElement, gc, penTextElem, penCharElem, _scale, _pageHeight, _pageRect.XMin, _pageRect.YMin, Brushes.Black);
|
||||
}
|
||||
}
|
||||
|
||||
gc.Dispose();
|
||||
return bmp;
|
||||
}
|
||||
|
||||
private static void DrawTextElement(PdfTextElement textElement, Graphics gc, Pen penTextElem, Pen penCharElem, int Scale, int pageHeight, double pageXMin, double pageYMin, Brush brushText)
|
||||
/// <summary>
/// Draws the elements of a column and the column extents, either over an
/// existing bitmap (typically the result of Render()) or over a new white
/// bitmap with the page size.
/// </summary>
/// <param name="columnData">Column to draw.</param>
/// <param name="bmp">Bitmap to draw on; when null a new one is created.</param>
/// <returns>The bitmap that was drawn on.</returns>
/// <exception cref="ArgumentNullException"><paramref name="columnData"/> is null.</exception>
public Bitmap RenderColumn(PdfTextElementColumn columnData, Bitmap bmp = null)
{
    if (columnData == null) { throw new ArgumentNullException(nameof(columnData)); }

    bool bitmapCreatedHere = (bmp == null);
    if (bitmapCreatedHere)
    {
        bmp = new Bitmap(_pageWidth * _scale, _pageHeight * _scale, PixelFormat.Format32bppArgb);
    }

    // "using" guarantees the Graphics is released even if drawing throws.
    using (Graphics gc = Graphics.FromImage(bmp))
    {
        if (bitmapCreatedHere)
        {
            gc.Clear(Color.White);
        }

        // Draw text elements of the column.
        // Elements may be null (e.g. PdfTextElementColumn.Empty).
        if (columnData.Elements != null)
        {
            using (Pen penTextElem = new Pen(Color.Red))
            using (Pen penCharElem = new Pen(Color.DarkRed))
            {
                foreach (PdfTextElement textElement in columnData.Elements)
                {
                    DrawTextElement(textElement, gc, penTextElem, penCharElem, _scale, _pageHeight, _pageRect.XMin, _pageRect.YMin, Brushes.OrangeRed);
                }
            }
        }

        // Draw column extents.
        // A column without head element (column not found) has no extents.
        if (columnData.HeadTextElement != null)
        {
            using (Pen penColumn = new Pen(Color.Red))
            {
                float y = (float)(_pageRect.YMax - columnData.Y);
                float x1 = (float)(columnData.X1 - _pageRect.XMin);
                float x2 = (float)(columnData.X2 - _pageRect.XMin);

                // Top edge of the column, then both sides down to the page bottom
                gc.DrawLine(penColumn, x1 * _scale, y * _scale, x2 * _scale, y * _scale);
                gc.DrawLine(penColumn, x1 * _scale, y * _scale, x1 * _scale, _pageHeight * _scale);
                gc.DrawLine(penColumn, x2 * _scale, y * _scale, x2 * _scale, _pageHeight * _scale);
            }
        }
    }

    return bmp;
}
|
||||
|
||||
private static void DrawTextElement(PdfTextElement textElement, Graphics gc, Pen penTextElem, Pen penCharElem, int scale, int pageHeight, double pageXMin, double pageYMin, Brush brushText)
|
||||
{
|
||||
double textElementX = textElement.GetX() - pageXMin;
|
||||
double textElementY = textElement.GetY() - pageYMin;
|
||||
@@ -74,29 +128,29 @@ namespace VAR.PdfTools
|
||||
if (penTextElem != null)
|
||||
{
|
||||
DrawRoundedRectangle(gc, penTextElem,
|
||||
(int)(textElementPageX * Scale),
|
||||
(int)(textElementPageY * Scale),
|
||||
(int)(textElementWidth * Scale),
|
||||
(int)(textElementHeight * Scale),
|
||||
(int)(textElementPageX * scale),
|
||||
(int)(textElementPageY * scale),
|
||||
(int)(textElementWidth * scale),
|
||||
(int)(textElementHeight * scale),
|
||||
5);
|
||||
}
|
||||
|
||||
using (Font font = new Font("Arial", (int)(textElementHeight * Scale), GraphicsUnit.Pixel))
|
||||
using (Font font = new Font("Arial", (int)(textElementHeight * scale), GraphicsUnit.Pixel))
|
||||
{
|
||||
foreach (PdfCharElement c in textElement.Characters)
|
||||
{
|
||||
gc.DrawString(c.Char,
|
||||
font,
|
||||
brushText,
|
||||
(int)((textElementPageX + c.Displacement) * Scale),
|
||||
(int)(textElementPageY * Scale));
|
||||
(int)((textElementPageX + c.Displacement) * scale),
|
||||
(int)(textElementPageY * scale));
|
||||
if (penCharElem != null)
|
||||
{
|
||||
DrawRoundedRectangle(gc, penCharElem,
|
||||
(int)((textElementPageX + c.Displacement) * Scale),
|
||||
(int)(textElementPageY * Scale),
|
||||
(int)(c.Width * Scale),
|
||||
(int)(textElementHeight * Scale),
|
||||
(int)((textElementPageX + c.Displacement) * scale),
|
||||
(int)(textElementPageY * scale),
|
||||
(int)(c.Width * scale),
|
||||
(int)(textElementHeight * scale),
|
||||
5);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -103,4 +103,28 @@ namespace VAR.PdfTools
|
||||
#endregion
|
||||
}
|
||||
|
||||
/// <summary>
/// Result of a column search: the head (title) element, the elements found
/// for the column, and the extents used to select them.
/// </summary>
public class PdfTextElementColumn
{
    /// <summary>Element used as column head; null for <see cref="Empty"/>.</summary>
    public PdfTextElement HeadTextElement { get; private set; }

    /// <summary>Elements of the column. Never null; empty when there are none.</summary>
    public IEnumerable<PdfTextElement> Elements { get; private set; }

    /// <summary>Y value given for the column (third constructor argument).</summary>
    public double Y { get; private set; }

    /// <summary>First X extent of the column.</summary>
    public double X1 { get; private set; }

    /// <summary>Second X extent of the column.</summary>
    public double X2 { get; private set; }

    /// <summary>Shared instance representing "no column": no head and no elements.</summary>
    public static PdfTextElementColumn Empty { get; } = new PdfTextElementColumn();

    private PdfTextElementColumn()
    {
        // Keep Elements enumerable so callers can iterate Empty without null checks.
        Elements = new PdfTextElement[0];
    }

    /// <summary>
    /// Creates a column result.
    /// </summary>
    /// <param name="head">Head (title) element of the column.</param>
    /// <param name="elements">Elements of the column; null is treated as no elements.</param>
    /// <param name="y">Y value of the column.</param>
    /// <param name="x1">First X extent.</param>
    /// <param name="x2">Second X extent.</param>
    public PdfTextElementColumn(PdfTextElement head, IEnumerable<PdfTextElement> elements, double y, double x1, double x2)
    {
        HeadTextElement = head;
        Elements = elements ?? new PdfTextElement[0];
        Y = y;
        X1 = x1;
        X2 = x2;
    }
}
|
||||
}
|
||||
|
||||
@@ -651,7 +651,7 @@ namespace VAR.PdfTools
|
||||
#endregion
|
||||
|
||||
#region Public methods
|
||||
|
||||
|
||||
public Rect GetRect()
|
||||
{
|
||||
Rect rect = null;
|
||||
@@ -664,12 +664,12 @@ namespace VAR.PdfTools
|
||||
return rect;
|
||||
}
|
||||
|
||||
public List<string> GetColumnAsStrings(string column, bool fuzzy =true)
|
||||
public PdfTextElementColumn GetColumn(string column, bool fuzzy = true)
|
||||
{
|
||||
PdfTextElement columnHead = FindElementByText(column, fuzzy);
|
||||
if (columnHead == null)
|
||||
{
|
||||
return new List<string>();
|
||||
return PdfTextElementColumn.Empty;
|
||||
}
|
||||
double headY = columnHead.GetY();
|
||||
double headX1 = columnHead.GetX();
|
||||
@@ -717,25 +717,34 @@ namespace VAR.PdfTools
|
||||
columnDataRaw = columnDataRaw.OrderByDescending(elem => elem.GetY()).ToList();
|
||||
|
||||
// Only items completely inside extents, and break on the first element outside
|
||||
var columnData = new List<PdfTextElement>();
|
||||
var columnElements = new List<PdfTextElement>();
|
||||
foreach (PdfTextElement elem in columnDataRaw)
|
||||
{
|
||||
double elemX1 = elem.GetX();
|
||||
double elemX2 = elemX1 + elem.VisibleWidth;
|
||||
if (elemX1 < extentX1 || elemX2 > extentX2) { break; }
|
||||
|
||||
columnData.Add(elem);
|
||||
columnElements.Add(elem);
|
||||
}
|
||||
|
||||
var columnData = new PdfTextElementColumn(columnHead, columnElements, headY, extentX1, extentX2);
|
||||
|
||||
return columnData;
|
||||
}
|
||||
|
||||
public List<string> GetColumnAsStrings(string column, bool fuzzy = true)
|
||||
{
|
||||
PdfTextElementColumn columnData = GetColumn(column, fuzzy);
|
||||
|
||||
// Emit result
|
||||
var result = new List<string>();
|
||||
foreach (PdfTextElement elem in columnData)
|
||||
foreach (PdfTextElement elem in columnData.Elements)
|
||||
{
|
||||
result.Add(elem.VisibleText);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
|
||||
public string GetFieldAsString(string field, bool fuzzy = true)
|
||||
{
|
||||
PdfTextElement fieldTitle = FindElementByText(field, fuzzy);
|
||||
@@ -763,7 +772,7 @@ namespace VAR.PdfTools
|
||||
|
||||
return fieldData.OrderBy(elem => elem.GetX()).FirstOrDefault().VisibleText;
|
||||
}
|
||||
|
||||
|
||||
public bool HasText(string text, bool fuzzy = true)
|
||||
{
|
||||
List<PdfTextElement> list = FindElementsContainingText(text, fuzzy);
|
||||
|
||||
Reference in New Issue
Block a user