PdfExtractor: GetColumn and GetField for easy data exploration
This commit is contained in:
@@ -13,6 +13,12 @@
|
||||
<setting name="LastPdfPath" serializeAs="String">
|
||||
<value />
|
||||
</setting>
|
||||
<setting name="LastColumnName" serializeAs="String">
|
||||
<value />
|
||||
</setting>
|
||||
<setting name="LastFieldName" serializeAs="String">
|
||||
<value />
|
||||
</setting>
|
||||
</VAR.PdfTools.Workbench.Properties.Settings>
|
||||
</userSettings>
|
||||
</configuration>
|
||||
60
VAR.PdfTools.Workbench/FrmPdfInfo.Designer.cs
generated
60
VAR.PdfTools.Workbench/FrmPdfInfo.Designer.cs
generated
@@ -34,13 +34,17 @@
|
||||
this.txtPdfPath = new System.Windows.Forms.TextBox();
|
||||
this.txtOutput = new System.Windows.Forms.TextBox();
|
||||
this.btnProcess = new System.Windows.Forms.Button();
|
||||
this.btnGetColumn = new System.Windows.Forms.Button();
|
||||
this.txtColumnName = new System.Windows.Forms.TextBox();
|
||||
this.txtFieldName = new System.Windows.Forms.TextBox();
|
||||
this.btnGetField = new System.Windows.Forms.Button();
|
||||
this.SuspendLayout();
|
||||
//
|
||||
// lblOutputs
|
||||
//
|
||||
this.lblOutputs.AutoSize = true;
|
||||
this.lblOutputs.Font = new System.Drawing.Font("Microsoft Sans Serif", 8.25F, System.Drawing.FontStyle.Bold, System.Drawing.GraphicsUnit.Point, ((byte)(0)));
|
||||
this.lblOutputs.Location = new System.Drawing.Point(12, 48);
|
||||
this.lblOutputs.Location = new System.Drawing.Point(12, 130);
|
||||
this.lblOutputs.Name = "lblOutputs";
|
||||
this.lblOutputs.Size = new System.Drawing.Size(51, 13);
|
||||
this.lblOutputs.TabIndex = 11;
|
||||
@@ -59,7 +63,7 @@
|
||||
// btnBrowse
|
||||
//
|
||||
this.btnBrowse.Anchor = ((System.Windows.Forms.AnchorStyles)((System.Windows.Forms.AnchorStyles.Top | System.Windows.Forms.AnchorStyles.Right)));
|
||||
this.btnBrowse.Location = new System.Drawing.Point(323, 23);
|
||||
this.btnBrowse.Location = new System.Drawing.Point(316, 23);
|
||||
this.btnBrowse.Name = "btnBrowse";
|
||||
this.btnBrowse.Size = new System.Drawing.Size(75, 23);
|
||||
this.btnBrowse.TabIndex = 9;
|
||||
@@ -73,7 +77,7 @@
|
||||
| System.Windows.Forms.AnchorStyles.Right)));
|
||||
this.txtPdfPath.Location = new System.Drawing.Point(15, 25);
|
||||
this.txtPdfPath.Name = "txtPdfPath";
|
||||
this.txtPdfPath.Size = new System.Drawing.Size(302, 20);
|
||||
this.txtPdfPath.Size = new System.Drawing.Size(295, 20);
|
||||
this.txtPdfPath.TabIndex = 8;
|
||||
//
|
||||
// txtOutput
|
||||
@@ -84,17 +88,17 @@
|
||||
| System.Windows.Forms.AnchorStyles.Left)
|
||||
| System.Windows.Forms.AnchorStyles.Right)));
|
||||
this.txtOutput.Font = new System.Drawing.Font("Consolas", 8.25F, System.Drawing.FontStyle.Regular, System.Drawing.GraphicsUnit.Point, ((byte)(0)));
|
||||
this.txtOutput.Location = new System.Drawing.Point(15, 64);
|
||||
this.txtOutput.Location = new System.Drawing.Point(15, 146);
|
||||
this.txtOutput.Multiline = true;
|
||||
this.txtOutput.Name = "txtOutput";
|
||||
this.txtOutput.ScrollBars = System.Windows.Forms.ScrollBars.Vertical;
|
||||
this.txtOutput.Size = new System.Drawing.Size(464, 355);
|
||||
this.txtOutput.Size = new System.Drawing.Size(457, 303);
|
||||
this.txtOutput.TabIndex = 7;
|
||||
//
|
||||
// btnProcess
|
||||
//
|
||||
this.btnProcess.Anchor = ((System.Windows.Forms.AnchorStyles)((System.Windows.Forms.AnchorStyles.Top | System.Windows.Forms.AnchorStyles.Right)));
|
||||
this.btnProcess.Location = new System.Drawing.Point(404, 23);
|
||||
this.btnProcess.Location = new System.Drawing.Point(397, 23);
|
||||
this.btnProcess.Name = "btnProcess";
|
||||
this.btnProcess.Size = new System.Drawing.Size(75, 23);
|
||||
this.btnProcess.TabIndex = 6;
|
||||
@@ -102,11 +106,49 @@
|
||||
this.btnProcess.UseVisualStyleBackColor = true;
|
||||
this.btnProcess.Click += new System.EventHandler(this.btnProcess_Click);
|
||||
//
|
||||
// btnGetColumn
|
||||
//
|
||||
this.btnGetColumn.Location = new System.Drawing.Point(163, 51);
|
||||
this.btnGetColumn.Name = "btnGetColumn";
|
||||
this.btnGetColumn.Size = new System.Drawing.Size(75, 23);
|
||||
this.btnGetColumn.TabIndex = 12;
|
||||
this.btnGetColumn.Text = "GetColumn";
|
||||
this.btnGetColumn.UseVisualStyleBackColor = true;
|
||||
this.btnGetColumn.Click += new System.EventHandler(this.btnGetColumn_Click);
|
||||
//
|
||||
// txtColumnName
|
||||
//
|
||||
this.txtColumnName.Location = new System.Drawing.Point(15, 53);
|
||||
this.txtColumnName.Name = "txtColumnName";
|
||||
this.txtColumnName.Size = new System.Drawing.Size(142, 20);
|
||||
this.txtColumnName.TabIndex = 13;
|
||||
//
|
||||
// txtFieldName
|
||||
//
|
||||
this.txtFieldName.Location = new System.Drawing.Point(15, 82);
|
||||
this.txtFieldName.Name = "txtFieldName";
|
||||
this.txtFieldName.Size = new System.Drawing.Size(142, 20);
|
||||
this.txtFieldName.TabIndex = 15;
|
||||
//
|
||||
// btnGetField
|
||||
//
|
||||
this.btnGetField.Location = new System.Drawing.Point(163, 80);
|
||||
this.btnGetField.Name = "btnGetField";
|
||||
this.btnGetField.Size = new System.Drawing.Size(75, 23);
|
||||
this.btnGetField.TabIndex = 14;
|
||||
this.btnGetField.Text = "GetField";
|
||||
this.btnGetField.UseVisualStyleBackColor = true;
|
||||
this.btnGetField.Click += new System.EventHandler(this.btnGetField_Click);
|
||||
//
|
||||
// FrmPdfInfo
|
||||
//
|
||||
this.AutoScaleDimensions = new System.Drawing.SizeF(6F, 13F);
|
||||
this.AutoScaleMode = System.Windows.Forms.AutoScaleMode.Font;
|
||||
this.ClientSize = new System.Drawing.Size(491, 431);
|
||||
this.ClientSize = new System.Drawing.Size(484, 461);
|
||||
this.Controls.Add(this.txtFieldName);
|
||||
this.Controls.Add(this.btnGetField);
|
||||
this.Controls.Add(this.txtColumnName);
|
||||
this.Controls.Add(this.btnGetColumn);
|
||||
this.Controls.Add(this.lblOutputs);
|
||||
this.Controls.Add(this.lblInputs);
|
||||
this.Controls.Add(this.btnBrowse);
|
||||
@@ -130,5 +172,9 @@
|
||||
private System.Windows.Forms.TextBox txtPdfPath;
|
||||
private System.Windows.Forms.TextBox txtOutput;
|
||||
private System.Windows.Forms.Button btnProcess;
|
||||
private System.Windows.Forms.Button btnGetColumn;
|
||||
private System.Windows.Forms.TextBox txtColumnName;
|
||||
private System.Windows.Forms.TextBox txtFieldName;
|
||||
private System.Windows.Forms.Button btnGetField;
|
||||
}
|
||||
}
|
||||
@@ -16,11 +16,15 @@ namespace VAR.PdfTools.Workbench
|
||||
/// <summary>
/// Form load handler: fills the three input text boxes from the values
/// stored in the user settings (PDF path, column name, field name).
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data (not used).</param>
private void FrmPdfInfo_Load(object sender, EventArgs e)
{
    var settings = Properties.Settings.Default;

    txtPdfPath.Text = settings.LastPdfPath;
    txtColumnName.Text = settings.LastColumnName;
    txtFieldName.Text = settings.LastFieldName;
}
|
||||
|
||||
/// <summary>
/// Form closing handler: copies the current contents of the three input
/// text boxes into the user settings and saves them.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data (not used).</param>
private void FrmPdfInfo_FormClosing(object sender, FormClosingEventArgs e)
{
    var settings = Properties.Settings.Default;

    settings.LastPdfPath = txtPdfPath.Text;
    settings.LastColumnName = txtColumnName.Text;
    settings.LastFieldName = txtFieldName.Text;

    settings.Save();
}
|
||||
|
||||
@@ -91,5 +95,42 @@ namespace VAR.PdfTools.Workbench
|
||||
txtOutput.Lines = lines.ToArray();
|
||||
}
|
||||
|
||||
/// <summary>
/// "GetColumn" button handler. Loads the PDF named in the path text box,
/// asks a text extractor on every page for the column named in the column
/// text box, and shows all returned values, in page order, in the output box.
/// Shows a message box and does nothing else when the file does not exist.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data (not used).</param>
private void btnGetColumn_Click(object sender, EventArgs e)
{
    string pdfPath = txtPdfPath.Text;
    if (!System.IO.File.Exists(pdfPath))
    {
        MessageBox.Show("File does not exist");
        return;
    }

    PdfDocument document = PdfDocument.Load(pdfPath);
    string columnName = txtColumnName.Text;

    // One extractor per page; the per-page results are concatenated.
    List<string> outputLines = new List<string>();
    foreach (PdfDocumentPage currentPage in document.Pages)
    {
        List<string> pageValues = new PdfTextExtractor(currentPage).GetColumn(columnName);
        outputLines.AddRange(pageValues);
    }

    txtOutput.Lines = outputLines.ToArray();
}
|
||||
|
||||
/// <summary>
/// "GetField" button handler. Loads the PDF named in the path text box,
/// asks a text extractor on every page for the field named in the field
/// text box, and shows one result per page in the output box.
/// Shows a message box and does nothing else when the file does not exist.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data (not used).</param>
private void btnGetField_Click(object sender, EventArgs e)
{
    string pdfPath = txtPdfPath.Text;
    if (!System.IO.File.Exists(pdfPath))
    {
        MessageBox.Show("File does not exist");
        return;
    }

    PdfDocument document = PdfDocument.Load(pdfPath);
    string fieldName = txtFieldName.Text;

    // Exactly one entry is added per page, including whatever GetField
    // returns when the page has no match.
    List<string> outputLines = new List<string>();
    foreach (PdfDocumentPage currentPage in document.Pages)
    {
        string pageValue = new PdfTextExtractor(currentPage).GetField(fieldName);
        outputLines.Add(pageValue);
    }

    txtOutput.Lines = outputLines.ToArray();
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -34,5 +34,29 @@ namespace VAR.PdfTools.Workbench.Properties {
|
||||
this["LastPdfPath"] = value;
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// User-scoped setting stored under the key "LastColumnName";
/// the declared default value is the empty string.
/// </summary>
[global::System.Configuration.UserScopedSettingAttribute()]
[global::System.Diagnostics.DebuggerNonUserCodeAttribute()]
[global::System.Configuration.DefaultSettingValueAttribute("")]
public string LastColumnName {
    get {
        object stored = this["LastColumnName"];
        return (string)stored;
    }
    set {
        this["LastColumnName"] = value;
    }
}
|
||||
|
||||
/// <summary>
/// User-scoped setting stored under the key "LastFieldName";
/// the declared default value is the empty string.
/// </summary>
[global::System.Configuration.UserScopedSettingAttribute()]
[global::System.Diagnostics.DebuggerNonUserCodeAttribute()]
[global::System.Configuration.DefaultSettingValueAttribute("")]
public string LastFieldName {
    get {
        object stored = this["LastFieldName"];
        return (string)stored;
    }
    set {
        this["LastFieldName"] = value;
    }
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,5 +5,11 @@
|
||||
<Setting Name="LastPdfPath" Type="System.String" Scope="User">
|
||||
<Value Profile="(Default)" />
|
||||
</Setting>
|
||||
<Setting Name="LastColumnName" Type="System.String" Scope="User">
|
||||
<Value Profile="(Default)" />
|
||||
</Setting>
|
||||
<Setting Name="LastFieldName" Type="System.String" Scope="User">
|
||||
<Value Profile="(Default)" />
|
||||
</Setting>
|
||||
</Settings>
|
||||
</SettingsFile>
|
||||
@@ -32,6 +32,9 @@
|
||||
<ErrorReport>prompt</ErrorReport>
|
||||
<WarningLevel>4</WarningLevel>
|
||||
</PropertyGroup>
|
||||
<PropertyGroup>
|
||||
<StartupObject />
|
||||
</PropertyGroup>
|
||||
<ItemGroup>
|
||||
<Reference Include="System" />
|
||||
<Reference Include="System.Core" />
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
using System.Collections.Generic;
|
||||
using System.Linq;
|
||||
using System.Text;
|
||||
|
||||
namespace VAR.PdfTools
|
||||
@@ -122,6 +123,8 @@ namespace VAR.PdfTools
|
||||
|
||||
public class PdfTextElement
|
||||
{
|
||||
#region Properties
|
||||
|
||||
public PdfFont Font { get; set; }
|
||||
|
||||
public double FontSize { get; set; }
|
||||
@@ -135,6 +138,22 @@ namespace VAR.PdfTools
|
||||
public double VisibleWidth { get; set; }
|
||||
|
||||
public double VisibleHeight { get; set; }
|
||||
|
||||
#endregion
|
||||
|
||||
#region Public methods
|
||||
|
||||
/// <summary>
/// Returns entry [0, 2] of this element's matrix.
/// </summary>
/// <returns>
/// The value at row 0, column 2 — presumably the horizontal translation of
/// the element's transform (NOTE(review): confirm against the matrix type).
/// </returns>
public double GetX()
{
    double x = Matrix.Matrix[0, 2];
    return x;
}
|
||||
|
||||
/// <summary>
/// Returns entry [1, 2] of this element's matrix.
/// </summary>
/// <returns>
/// The value at row 1, column 2 — presumably the vertical translation of
/// the element's transform (NOTE(review): confirm against the matrix type).
/// </returns>
public double GetY()
{
    double y = Matrix.Matrix[1, 2];
    return y;
}
|
||||
|
||||
#endregion
|
||||
}
|
||||
|
||||
public class PdfTextExtractor
|
||||
@@ -218,6 +237,38 @@ namespace VAR.PdfTools
|
||||
_textWidth = 0;
|
||||
}
|
||||
|
||||
/// <summary>
/// Looks up the first text element whose visible text is equal to
/// <paramref name="text"/>.
/// </summary>
/// <param name="text">Text to match against each element's VisibleText.</param>
/// <returns>The first matching element in iteration order, or null when none matches.</returns>
private PdfTextElement FindElementByText(string text)
{
    PdfTextElement match = null;
    foreach (PdfTextElement candidate in _textElements)
    {
        if (candidate.VisibleText != text)
        {
            continue;
        }

        // Only the first hit is wanted; stop scanning.
        match = candidate;
        break;
    }
    return match;
}
|
||||
|
||||
/// <summary>
/// Tells whether the horizontal extents of two elements overlap, i.e.
/// whether [X, X + VisibleWidth] of one touches or overlaps that of the other.
/// </summary>
/// <param name="elem1">First element.</param>
/// <param name="elem2">Second element.</param>
/// <returns>True when the two X ranges share at least one point (touching edges count).</returns>
private bool TextElementVerticalIntersection(PdfTextElement elem1, PdfTextElement elem2)
{
    double firstStart = elem1.GetX();
    double firstEnd = elem1.GetX() + elem1.VisibleWidth;
    double secondStart = elem2.GetX();
    double secondEnd = elem2.GetX() + elem2.VisibleWidth;

    bool secondStartsBeforeFirstEnds = secondStart <= firstEnd;
    bool firstStartsBeforeSecondEnds = firstStart <= secondEnd;
    return secondStartsBeforeFirstEnds && firstStartsBeforeSecondEnds;
}
|
||||
|
||||
/// <summary>
/// Tells whether the vertical extents of two elements overlap, i.e.
/// whether [Y, Y + VisibleHeight] of one touches or overlaps that of the other.
/// </summary>
/// <param name="elem1">First element.</param>
/// <param name="elem2">Second element.</param>
/// <returns>True when the two Y ranges share at least one point (touching edges count).</returns>
private bool TextElementHorizontalIntersection(PdfTextElement elem1, PdfTextElement elem2)
{
    double firstStart = elem1.GetY();
    double firstEnd = elem1.GetY() + elem1.VisibleHeight;
    double secondStart = elem2.GetY();
    double secondEnd = elem2.GetY() + elem2.VisibleHeight;

    bool secondStartsBeforeFirstEnds = secondStart <= firstEnd;
    bool firstStartsBeforeSecondEnds = firstStart <= secondEnd;
    return secondStartsBeforeFirstEnds && firstStartsBeforeSecondEnds;
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Operations
|
||||
@@ -451,5 +502,81 @@ namespace VAR.PdfTools
|
||||
}
|
||||
|
||||
#endregion
|
||||
|
||||
#region Public methods
|
||||
|
||||
/// <summary>
/// Collects the texts found under a column header.
/// The header is the first element whose visible text equals
/// <paramref name="column"/>. Candidates are the elements whose X range
/// overlaps the header's and whose Y is strictly smaller than the header's.
/// They are visited from largest Y to smallest, and collection stops at the
/// first element whose Y gap to the previous one is strictly greater than
/// the running average gap.
/// </summary>
/// <param name="column">Exact visible text of the column header.</param>
/// <returns>
/// The visible texts of the accepted elements, ordered by descending Y;
/// an empty list when the header is not found.
/// </returns>
public List<string> GetColumn(string column)
{
    PdfTextElement header = FindElementByText(column);
    if (header == null)
    {
        return new List<string>();
    }
    double headerY = header.GetY();

    // Get all the elements that intersect the header vertically and lie at a smaller Y
    var candidates = new List<PdfTextElement>();
    foreach (PdfTextElement candidate in _textElements)
    {
        if (TextElementVerticalIntersection(header, candidate) && candidate.GetY() < headerY)
        {
            candidates.Add(candidate);
        }
    }

    // Stable sort, largest Y first
    List<PdfTextElement> sorted = candidates.OrderByDescending(item => item.GetY()).ToList();

    // Keep only the run of elements nearest to the header
    var texts = new List<string>();
    double lastY = headerY;
    double averageGap = 0;
    for (int i = 0; i < sorted.Count; i++)
    {
        PdfTextElement current = sorted[i];
        double currentY = current.GetY();
        double gap = lastY - currentY;
        lastY = currentY;

        // The first gap (header to first element) seeds the running average.
        if (i == 0)
        {
            averageGap = gap;
        }

        // A gap wider than the running average ends the column.
        if (gap > averageGap)
        {
            break;
        }
        averageGap = (averageGap + gap) / 2;

        texts.Add(current.VisibleText);
    }
    return texts;
}
|
||||
|
||||
/// <summary>
/// Finds the value placed next to a field title.
/// The title is the first element whose visible text equals
/// <paramref name="column"/>. Candidates are the elements whose Y range
/// overlaps the title's and whose X is not less than or equal to the
/// title's X. The candidate with the smallest X is chosen.
/// </summary>
/// <param name="column">Exact visible text of the field title.</param>
/// <returns>
/// The visible text of the chosen element, or null when the title is not
/// found or no candidate exists.
/// </returns>
public string GetField(string column)
{
    PdfTextElement title = FindElementByText(column);
    if (title == null)
    {
        return null;
    }
    double titleLeft = title.GetX();

    // Gather every element on the title's row that starts past the title's X.
    var candidates = new List<PdfTextElement>();
    foreach (PdfTextElement candidate in _textElements)
    {
        if (TextElementHorizontalIntersection(title, candidate) && !(candidate.GetX() <= titleLeft))
        {
            candidates.Add(candidate);
        }
    }

    if (candidates.Count == 0)
    {
        return null;
    }

    // Stable ordering: among equal X values the earliest element wins.
    PdfTextElement nearest = candidates.OrderBy(item => item.GetX()).First();
    return nearest.VisibleText;
}
|
||||
|
||||
#endregion
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user