PdfExtractor: GetColumn and GetField for easy data exploration

This commit is contained in:
2016-06-21 17:35:35 +02:00
parent 5e96ee22d8
commit 8f12b13a9f
7 changed files with 260 additions and 7 deletions

View File

@@ -13,6 +13,12 @@
<setting name="LastPdfPath" serializeAs="String">
<value />
</setting>
<setting name="LastColumnName" serializeAs="String">
<value />
</setting>
<setting name="LastFieldName" serializeAs="String">
<value />
</setting>
</VAR.PdfTools.Workbench.Properties.Settings>
</userSettings>
</configuration>

View File

@@ -34,13 +34,17 @@
this.txtPdfPath = new System.Windows.Forms.TextBox();
this.txtOutput = new System.Windows.Forms.TextBox();
this.btnProcess = new System.Windows.Forms.Button();
this.btnGetColumn = new System.Windows.Forms.Button();
this.txtColumnName = new System.Windows.Forms.TextBox();
this.txtFieldName = new System.Windows.Forms.TextBox();
this.btnGetField = new System.Windows.Forms.Button();
this.SuspendLayout();
//
// lblOutputs
//
this.lblOutputs.AutoSize = true;
this.lblOutputs.Font = new System.Drawing.Font("Microsoft Sans Serif", 8.25F, System.Drawing.FontStyle.Bold, System.Drawing.GraphicsUnit.Point, ((byte)(0)));
this.lblOutputs.Location = new System.Drawing.Point(12, 48);
this.lblOutputs.Location = new System.Drawing.Point(12, 130);
this.lblOutputs.Name = "lblOutputs";
this.lblOutputs.Size = new System.Drawing.Size(51, 13);
this.lblOutputs.TabIndex = 11;
@@ -59,7 +63,7 @@
// btnBrowse
//
this.btnBrowse.Anchor = ((System.Windows.Forms.AnchorStyles)((System.Windows.Forms.AnchorStyles.Top | System.Windows.Forms.AnchorStyles.Right)));
this.btnBrowse.Location = new System.Drawing.Point(323, 23);
this.btnBrowse.Location = new System.Drawing.Point(316, 23);
this.btnBrowse.Name = "btnBrowse";
this.btnBrowse.Size = new System.Drawing.Size(75, 23);
this.btnBrowse.TabIndex = 9;
@@ -73,7 +77,7 @@
| System.Windows.Forms.AnchorStyles.Right)));
this.txtPdfPath.Location = new System.Drawing.Point(15, 25);
this.txtPdfPath.Name = "txtPdfPath";
this.txtPdfPath.Size = new System.Drawing.Size(302, 20);
this.txtPdfPath.Size = new System.Drawing.Size(295, 20);
this.txtPdfPath.TabIndex = 8;
//
// txtOutput
@@ -84,17 +88,17 @@
| System.Windows.Forms.AnchorStyles.Left)
| System.Windows.Forms.AnchorStyles.Right)));
this.txtOutput.Font = new System.Drawing.Font("Consolas", 8.25F, System.Drawing.FontStyle.Regular, System.Drawing.GraphicsUnit.Point, ((byte)(0)));
this.txtOutput.Location = new System.Drawing.Point(15, 64);
this.txtOutput.Location = new System.Drawing.Point(15, 146);
this.txtOutput.Multiline = true;
this.txtOutput.Name = "txtOutput";
this.txtOutput.ScrollBars = System.Windows.Forms.ScrollBars.Vertical;
this.txtOutput.Size = new System.Drawing.Size(464, 355);
this.txtOutput.Size = new System.Drawing.Size(457, 303);
this.txtOutput.TabIndex = 7;
//
// btnProcess
//
this.btnProcess.Anchor = ((System.Windows.Forms.AnchorStyles)((System.Windows.Forms.AnchorStyles.Top | System.Windows.Forms.AnchorStyles.Right)));
this.btnProcess.Location = new System.Drawing.Point(404, 23);
this.btnProcess.Location = new System.Drawing.Point(397, 23);
this.btnProcess.Name = "btnProcess";
this.btnProcess.Size = new System.Drawing.Size(75, 23);
this.btnProcess.TabIndex = 6;
@@ -102,11 +106,49 @@
this.btnProcess.UseVisualStyleBackColor = true;
this.btnProcess.Click += new System.EventHandler(this.btnProcess_Click);
//
// btnGetColumn
//
this.btnGetColumn.Location = new System.Drawing.Point(163, 51);
this.btnGetColumn.Name = "btnGetColumn";
this.btnGetColumn.Size = new System.Drawing.Size(75, 23);
this.btnGetColumn.TabIndex = 12;
this.btnGetColumn.Text = "GetColumn";
this.btnGetColumn.UseVisualStyleBackColor = true;
this.btnGetColumn.Click += new System.EventHandler(this.btnGetColumn_Click);
//
// txtColumnName
//
this.txtColumnName.Location = new System.Drawing.Point(15, 53);
this.txtColumnName.Name = "txtColumnName";
this.txtColumnName.Size = new System.Drawing.Size(142, 20);
this.txtColumnName.TabIndex = 13;
//
// txtFieldName
//
this.txtFieldName.Location = new System.Drawing.Point(15, 82);
this.txtFieldName.Name = "txtFieldName";
this.txtFieldName.Size = new System.Drawing.Size(142, 20);
this.txtFieldName.TabIndex = 15;
//
// btnGetField
//
this.btnGetField.Location = new System.Drawing.Point(163, 80);
this.btnGetField.Name = "btnGetField";
this.btnGetField.Size = new System.Drawing.Size(75, 23);
this.btnGetField.TabIndex = 14;
this.btnGetField.Text = "GetField";
this.btnGetField.UseVisualStyleBackColor = true;
this.btnGetField.Click += new System.EventHandler(this.btnGetField_Click);
//
// FrmPdfInfo
//
this.AutoScaleDimensions = new System.Drawing.SizeF(6F, 13F);
this.AutoScaleMode = System.Windows.Forms.AutoScaleMode.Font;
this.ClientSize = new System.Drawing.Size(491, 431);
this.ClientSize = new System.Drawing.Size(484, 461);
this.Controls.Add(this.txtFieldName);
this.Controls.Add(this.btnGetField);
this.Controls.Add(this.txtColumnName);
this.Controls.Add(this.btnGetColumn);
this.Controls.Add(this.lblOutputs);
this.Controls.Add(this.lblInputs);
this.Controls.Add(this.btnBrowse);
@@ -130,5 +172,9 @@
private System.Windows.Forms.TextBox txtPdfPath;
private System.Windows.Forms.TextBox txtOutput;
private System.Windows.Forms.Button btnProcess;
private System.Windows.Forms.Button btnGetColumn;
private System.Windows.Forms.TextBox txtColumnName;
private System.Windows.Forms.TextBox txtFieldName;
private System.Windows.Forms.Button btnGetField;
}
}

View File

@@ -16,11 +16,15 @@ namespace VAR.PdfTools.Workbench
/// <summary>
/// Copies the persisted PDF path, column name and field name from the user
/// settings into their text boxes.
/// </summary>
private void FrmPdfInfo_Load(object sender, EventArgs e)
{
    var settings = Properties.Settings.Default;

    txtPdfPath.Text = settings.LastPdfPath;
    txtColumnName.Text = settings.LastColumnName;
    txtFieldName.Text = settings.LastFieldName;
}
/// <summary>
/// Writes the current contents of the PDF path, column name and field name
/// text boxes into the user settings and saves them.
/// </summary>
private void FrmPdfInfo_FormClosing(object sender, FormClosingEventArgs e)
{
    var settings = Properties.Settings.Default;

    settings.LastPdfPath = txtPdfPath.Text;
    settings.LastColumnName = txtColumnName.Text;
    settings.LastFieldName = txtFieldName.Text;

    // Persist all three values in one go.
    settings.Save();
}
@@ -91,5 +95,42 @@ namespace VAR.PdfTools.Workbench
txtOutput.Lines = lines.ToArray();
}
/// <summary>
/// Loads the PDF named in the path text box, asks every page for the column
/// whose header is typed in the column-name text box, and shows the
/// concatenated results in the output box. Shows a message box and does
/// nothing else when the file does not exist.
/// </summary>
private void btnGetColumn_Click(object sender, EventArgs e)
{
    string pdfPath = txtPdfPath.Text;
    if (!System.IO.File.Exists(pdfPath))
    {
        MessageBox.Show("File does not exist");
        return;
    }

    PdfDocument document = PdfDocument.Load(pdfPath);
    string columnName = txtColumnName.Text;

    // One extractor per page; the per-page results are appended in page order.
    var lines = new List<string>();
    foreach (PdfDocumentPage page in document.Pages)
    {
        List<string> pageColumn = new PdfTextExtractor(page).GetColumn(columnName);
        lines.AddRange(pageColumn);
    }

    txtOutput.Lines = lines.ToArray();
}
/// <summary>
/// Loads the PDF named in the path text box, asks every page for the field
/// whose title is typed in the field-name text box, and shows one output
/// line per page. Shows a message box and does nothing else when the file
/// does not exist.
/// </summary>
private void btnGetField_Click(object sender, EventArgs e)
{
    string pdfPath = txtPdfPath.Text;
    if (!System.IO.File.Exists(pdfPath))
    {
        MessageBox.Show("File does not exist");
        return;
    }

    PdfDocument document = PdfDocument.Load(pdfPath);
    string fieldName = txtFieldName.Text;

    // Exactly one entry is added per page, including a null entry when
    // GetField finds nothing on that page.
    var lines = new List<string>();
    foreach (PdfDocumentPage page in document.Pages)
    {
        string pageField = new PdfTextExtractor(page).GetField(fieldName);
        lines.Add(pageField);
    }

    txtOutput.Lines = lines.ToArray();
}
}
}

View File

@@ -34,5 +34,29 @@ namespace VAR.PdfTools.Workbench.Properties {
this["LastPdfPath"] = value;
}
}
/// <summary>
/// User-scoped setting stored under the key "LastColumnName"; defaults to
/// an empty string.
/// </summary>
[global::System.Configuration.UserScopedSettingAttribute()]
[global::System.Diagnostics.DebuggerNonUserCodeAttribute()]
[global::System.Configuration.DefaultSettingValueAttribute("")]
public string LastColumnName {
    get {
        object stored = this["LastColumnName"];
        return (string)stored;
    }
    set {
        this["LastColumnName"] = value;
    }
}
/// <summary>
/// User-scoped setting stored under the key "LastFieldName"; defaults to
/// an empty string.
/// </summary>
[global::System.Configuration.UserScopedSettingAttribute()]
[global::System.Diagnostics.DebuggerNonUserCodeAttribute()]
[global::System.Configuration.DefaultSettingValueAttribute("")]
public string LastFieldName {
    get {
        object stored = this["LastFieldName"];
        return (string)stored;
    }
    set {
        this["LastFieldName"] = value;
    }
}
}
}

View File

@@ -5,5 +5,11 @@
<Setting Name="LastPdfPath" Type="System.String" Scope="User">
<Value Profile="(Default)" />
</Setting>
<Setting Name="LastColumnName" Type="System.String" Scope="User">
<Value Profile="(Default)" />
</Setting>
<Setting Name="LastFieldName" Type="System.String" Scope="User">
<Value Profile="(Default)" />
</Setting>
</Settings>
</SettingsFile>

View File

@@ -32,6 +32,9 @@
<ErrorReport>prompt</ErrorReport>
<WarningLevel>4</WarningLevel>
</PropertyGroup>
<PropertyGroup>
<StartupObject />
</PropertyGroup>
<ItemGroup>
<Reference Include="System" />
<Reference Include="System.Core" />

View File

@@ -1,4 +1,5 @@
using System.Collections.Generic;
using System.Linq;
using System.Text;
namespace VAR.PdfTools
@@ -122,6 +123,8 @@ namespace VAR.PdfTools
public class PdfTextElement
{
#region Properties
public PdfFont Font { get; set; }
public double FontSize { get; set; }
@@ -135,6 +138,22 @@ namespace VAR.PdfTools
public double VisibleWidth { get; set; }
public double VisibleHeight { get; set; }
#endregion
#region Public methods
/// <summary>
/// Returns entry [0, 2] of this element's matrix, which the extractor uses
/// as the element's X position.
/// </summary>
// NOTE(review): presumably the horizontal translation of the text matrix - confirm against the matrix class.
public double GetX()
{
    double x = Matrix.Matrix[0, 2];
    return x;
}
/// <summary>
/// Returns entry [1, 2] of this element's matrix, which the extractor uses
/// as the element's Y position.
/// </summary>
// NOTE(review): presumably the vertical translation of the text matrix - confirm against the matrix class.
public double GetY()
{
    double y = Matrix.Matrix[1, 2];
    return y;
}
#endregion
}
public class PdfTextExtractor
@@ -218,6 +237,38 @@ namespace VAR.PdfTools
_textWidth = 0;
}
/// <summary>
/// Returns the first text element whose visible text equals
/// <paramref name="text"/> exactly, or null when there is none.
/// </summary>
private PdfTextElement FindElementByText(string text)
{
    PdfTextElement match = null;
    foreach (PdfTextElement candidate in _textElements)
    {
        if (candidate.VisibleText == text)
        {
            // Only the first hit in iteration order is reported.
            match = candidate;
            break;
        }
    }
    return match;
}
/// <summary>
/// Tells whether the X ranges [GetX(), GetX() + VisibleWidth] of the two
/// elements overlap. Ranges that merely touch count as overlapping.
/// </summary>
private bool TextElementVerticalIntersection(PdfTextElement elem1, PdfTextElement elem2)
{
    double startA = elem1.GetX();
    double endA = startA + elem1.VisibleWidth;
    double startB = elem2.GetX();
    double endB = startB + elem2.VisibleWidth;

    // Both conditions must hold; written as two positive tests so the
    // result for non-comparable values stays "no intersection".
    bool aReachesB = endA >= startB;
    bool bReachesA = endB >= startA;
    return aReachesB && bReachesA;
}
/// <summary>
/// Tells whether the Y ranges [GetY(), GetY() + VisibleHeight] of the two
/// elements overlap. Ranges that merely touch count as overlapping.
/// </summary>
private bool TextElementHorizontalIntersection(PdfTextElement elem1, PdfTextElement elem2)
{
    double startA = elem1.GetY();
    double endA = startA + elem1.VisibleHeight;
    double startB = elem2.GetY();
    double endB = startB + elem2.VisibleHeight;

    // Both conditions must hold; written as two positive tests so the
    // result for non-comparable values stays "no intersection".
    bool aReachesB = endA >= startB;
    bool bReachesA = endB >= startA;
    return aReachesB && bReachesA;
}
#endregion
#region Operations
@@ -451,5 +502,81 @@ namespace VAR.PdfTools
}
#endregion
#region Public methods
/// <summary>
/// Returns the texts of the elements stacked under the header element whose
/// visible text equals <paramref name="column"/>.
/// </summary>
/// <param name="column">Exact visible text of the column header.</param>
/// <returns>
/// The visible texts of the matching elements, ordered by descending Y
/// (nearest to the header first). An empty list when no header is found.
/// </returns>
public List<string> GetColumn(string column)
{
    PdfTextElement columnHead = FindElementByText(column);
    if (columnHead == null)
    {
        return new List<string>();
    }
    double headY = columnHead.GetY();

    // Candidates: elements whose X range overlaps the header's and whose Y
    // is smaller than the header's, sorted so the nearest comes first.
    var columnData = new List<PdfTextElement>();
    foreach (PdfTextElement elem in _textElements)
    {
        if (TextElementVerticalIntersection(columnHead, elem) == false) { continue; }
        if (elem.GetY() >= headY) { continue; }
        columnData.Add(elem);
    }
    columnData = columnData.OrderByDescending(elem => elem.GetY()).ToList();

    // Relative slack for the gap comparison below. Without it, evenly
    // spaced rows whose gaps differ only by floating-point rounding noise
    // would be seen as "wider than average" and end the column early.
    const double gapTolerance = 1e-6;

    // Walk away from the header and stop at the first gap that is wider
    // than the running average of the gaps accepted so far.
    var result = new List<string>();
    double prevY = headY;
    double avgDiff = 0;
    bool first = true;
    foreach (PdfTextElement elem in columnData)
    {
        double elemY = elem.GetY();
        double diff = prevY - elemY;
        prevY = elemY;

        if (first)
        {
            // The first gap (header to first row) seeds the average, so the
            // first candidate is always accepted.
            first = false;
            avgDiff = diff;
        }

        // avgDiff is positive here: the first gap is > 0 because of the
        // filter above, and later gaps are >= 0 because of the sort.
        if (diff > avgDiff * (1 + gapTolerance)) { break; }

        avgDiff = (avgDiff + diff) / 2;
        result.Add(elem.VisibleText);
    }
    return result;
}
/// <summary>
/// Returns the text of the element that sits next to the title element
/// whose visible text equals <paramref name="column"/>: among the elements
/// whose Y range overlaps the title's and whose X is greater than the
/// title's, the one with the smallest X.
/// </summary>
/// <param name="column">Exact visible text of the field title.</param>
/// <returns>
/// The visible text of the chosen element, or null when the title is not
/// found or no element qualifies.
/// </returns>
public string GetField(string column)
{
    PdfTextElement title = FindElementByText(column);
    if (title == null)
    {
        return null;
    }

    double titleX = title.GetX();
    var candidates = new List<PdfTextElement>();
    foreach (PdfTextElement other in _textElements)
    {
        bool sameRow = TextElementHorizontalIntersection(title, other);
        if (sameRow && (other.GetX() <= titleX) == false)
        {
            candidates.Add(other);
        }
    }

    // The sort is stable, so ties on X keep their original order.
    // FirstOrDefault yields null only when there are no candidates.
    PdfTextElement nearest = candidates.OrderBy(other => other.GetX()).FirstOrDefault();
    if (nearest == null)
    {
        return null;
    }
    return nearest.VisibleText;
}
#endregion
}
}