From 8f12b13a9fff154e5564f05bda08b9991d0b5d8f Mon Sep 17 00:00:00 2001 From: "Valeriano A.R" Date: Tue, 21 Jun 2016 17:35:35 +0200 Subject: [PATCH] PdfExtractor: GetColumn and GetField for easy data exploration --- VAR.PdfTools.Workbench/App.config | 6 + VAR.PdfTools.Workbench/FrmPdfInfo.Designer.cs | 60 ++++++++- VAR.PdfTools.Workbench/FrmPdfInfo.cs | 41 ++++++ .../Properties/Settings.Designer.cs | 24 ++++ .../Properties/Settings.settings | 6 + .../VAR.PdfTools.Workbench.csproj | 3 + VAR.PdfTools/PdfTextExtractor.cs | 127 ++++++++++++++++++ 7 files changed, 260 insertions(+), 7 deletions(-) diff --git a/VAR.PdfTools.Workbench/App.config b/VAR.PdfTools.Workbench/App.config index 3c82f18..e2b6bd9 100644 --- a/VAR.PdfTools.Workbench/App.config +++ b/VAR.PdfTools.Workbench/App.config @@ -13,6 +13,12 @@ + + + + + + \ No newline at end of file diff --git a/VAR.PdfTools.Workbench/FrmPdfInfo.Designer.cs b/VAR.PdfTools.Workbench/FrmPdfInfo.Designer.cs index 1d7cb5e..3ebbebf 100644 --- a/VAR.PdfTools.Workbench/FrmPdfInfo.Designer.cs +++ b/VAR.PdfTools.Workbench/FrmPdfInfo.Designer.cs @@ -34,13 +34,17 @@ this.txtPdfPath = new System.Windows.Forms.TextBox(); this.txtOutput = new System.Windows.Forms.TextBox(); this.btnProcess = new System.Windows.Forms.Button(); + this.btnGetColumn = new System.Windows.Forms.Button(); + this.txtColumnName = new System.Windows.Forms.TextBox(); + this.txtFieldName = new System.Windows.Forms.TextBox(); + this.btnGetField = new System.Windows.Forms.Button(); this.SuspendLayout(); // // lblOutputs // this.lblOutputs.AutoSize = true; this.lblOutputs.Font = new System.Drawing.Font("Microsoft Sans Serif", 8.25F, System.Drawing.FontStyle.Bold, System.Drawing.GraphicsUnit.Point, ((byte)(0))); - this.lblOutputs.Location = new System.Drawing.Point(12, 48); + this.lblOutputs.Location = new System.Drawing.Point(12, 130); this.lblOutputs.Name = "lblOutputs"; this.lblOutputs.Size = new System.Drawing.Size(51, 13); this.lblOutputs.TabIndex = 11; @@ -59,7 
+63,7 @@ // btnBrowse // this.btnBrowse.Anchor = ((System.Windows.Forms.AnchorStyles)((System.Windows.Forms.AnchorStyles.Top | System.Windows.Forms.AnchorStyles.Right))); - this.btnBrowse.Location = new System.Drawing.Point(323, 23); + this.btnBrowse.Location = new System.Drawing.Point(316, 23); this.btnBrowse.Name = "btnBrowse"; this.btnBrowse.Size = new System.Drawing.Size(75, 23); this.btnBrowse.TabIndex = 9; @@ -73,7 +77,7 @@ | System.Windows.Forms.AnchorStyles.Right))); this.txtPdfPath.Location = new System.Drawing.Point(15, 25); this.txtPdfPath.Name = "txtPdfPath"; - this.txtPdfPath.Size = new System.Drawing.Size(302, 20); + this.txtPdfPath.Size = new System.Drawing.Size(295, 20); this.txtPdfPath.TabIndex = 8; // // txtOutput @@ -84,17 +88,17 @@ | System.Windows.Forms.AnchorStyles.Left) | System.Windows.Forms.AnchorStyles.Right))); this.txtOutput.Font = new System.Drawing.Font("Consolas", 8.25F, System.Drawing.FontStyle.Regular, System.Drawing.GraphicsUnit.Point, ((byte)(0))); - this.txtOutput.Location = new System.Drawing.Point(15, 64); + this.txtOutput.Location = new System.Drawing.Point(15, 146); this.txtOutput.Multiline = true; this.txtOutput.Name = "txtOutput"; this.txtOutput.ScrollBars = System.Windows.Forms.ScrollBars.Vertical; - this.txtOutput.Size = new System.Drawing.Size(464, 355); + this.txtOutput.Size = new System.Drawing.Size(457, 303); this.txtOutput.TabIndex = 7; // // btnProcess // this.btnProcess.Anchor = ((System.Windows.Forms.AnchorStyles)((System.Windows.Forms.AnchorStyles.Top | System.Windows.Forms.AnchorStyles.Right))); - this.btnProcess.Location = new System.Drawing.Point(404, 23); + this.btnProcess.Location = new System.Drawing.Point(397, 23); this.btnProcess.Name = "btnProcess"; this.btnProcess.Size = new System.Drawing.Size(75, 23); this.btnProcess.TabIndex = 6; @@ -102,11 +106,49 @@ this.btnProcess.UseVisualStyleBackColor = true; this.btnProcess.Click += new System.EventHandler(this.btnProcess_Click); // + // btnGetColumn + // + 
this.btnGetColumn.Location = new System.Drawing.Point(163, 51); + this.btnGetColumn.Name = "btnGetColumn"; + this.btnGetColumn.Size = new System.Drawing.Size(75, 23); + this.btnGetColumn.TabIndex = 12; + this.btnGetColumn.Text = "GetColumn"; + this.btnGetColumn.UseVisualStyleBackColor = true; + this.btnGetColumn.Click += new System.EventHandler(this.btnGetColumn_Click); + // + // txtColumnName + // + this.txtColumnName.Location = new System.Drawing.Point(15, 53); + this.txtColumnName.Name = "txtColumnName"; + this.txtColumnName.Size = new System.Drawing.Size(142, 20); + this.txtColumnName.TabIndex = 13; + // + // txtFieldName + // + this.txtFieldName.Location = new System.Drawing.Point(15, 82); + this.txtFieldName.Name = "txtFieldName"; + this.txtFieldName.Size = new System.Drawing.Size(142, 20); + this.txtFieldName.TabIndex = 15; + // + // btnGetField + // + this.btnGetField.Location = new System.Drawing.Point(163, 80); + this.btnGetField.Name = "btnGetField"; + this.btnGetField.Size = new System.Drawing.Size(75, 23); + this.btnGetField.TabIndex = 14; + this.btnGetField.Text = "GetField"; + this.btnGetField.UseVisualStyleBackColor = true; + this.btnGetField.Click += new System.EventHandler(this.btnGetField_Click); + // // FrmPdfInfo // this.AutoScaleDimensions = new System.Drawing.SizeF(6F, 13F); this.AutoScaleMode = System.Windows.Forms.AutoScaleMode.Font; - this.ClientSize = new System.Drawing.Size(491, 431); + this.ClientSize = new System.Drawing.Size(484, 461); + this.Controls.Add(this.txtFieldName); + this.Controls.Add(this.btnGetField); + this.Controls.Add(this.txtColumnName); + this.Controls.Add(this.btnGetColumn); this.Controls.Add(this.lblOutputs); this.Controls.Add(this.lblInputs); this.Controls.Add(this.btnBrowse); @@ -130,5 +172,9 @@ private System.Windows.Forms.TextBox txtPdfPath; private System.Windows.Forms.TextBox txtOutput; private System.Windows.Forms.Button btnProcess; + private System.Windows.Forms.Button btnGetColumn; + private 
System.Windows.Forms.TextBox txtColumnName;
+        private System.Windows.Forms.TextBox txtFieldName;
+        private System.Windows.Forms.Button btnGetField;
     }
 }
\ No newline at end of file
diff --git a/VAR.PdfTools.Workbench/FrmPdfInfo.cs b/VAR.PdfTools.Workbench/FrmPdfInfo.cs
index 93da119..e3718ad 100644
--- a/VAR.PdfTools.Workbench/FrmPdfInfo.cs
+++ b/VAR.PdfTools.Workbench/FrmPdfInfo.cs
@@ -16,11 +16,15 @@ namespace VAR.PdfTools.Workbench
         private void FrmPdfInfo_Load(object sender, EventArgs e)
         {
             txtPdfPath.Text = Properties.Settings.Default.LastPdfPath;
+            txtColumnName.Text = Properties.Settings.Default.LastColumnName;
+            txtFieldName.Text = Properties.Settings.Default.LastFieldName;
         }
 
         private void FrmPdfInfo_FormClosing(object sender, FormClosingEventArgs e)
         {
             Properties.Settings.Default.LastPdfPath = txtPdfPath.Text;
+            Properties.Settings.Default.LastColumnName = txtColumnName.Text;
+            Properties.Settings.Default.LastFieldName = txtFieldName.Text;
             Properties.Settings.Default.Save();
         }
 
@@ -91,5 +95,52 @@ namespace VAR.PdfTools.Workbench
             txtOutput.Lines = lines.ToArray();
         }
 
+        /// <summary>
+        /// Shows in the output box the values found under the column heading typed in
+        /// <c>txtColumnName</c>, gathered from every page of the selected PDF.
+        /// </summary>
+        private void btnGetColumn_Click(object sender, EventArgs e)
+        {
+            if (System.IO.File.Exists(txtPdfPath.Text) == false)
+            {
+                MessageBox.Show("File does not exist");
+                return;
+            }
+
+            PdfDocument doc = PdfDocument.Load(txtPdfPath.Text);
+
+            var columnData = new List<string>();
+            foreach (PdfDocumentPage page in doc.Pages)
+            {
+                PdfTextExtractor extractor = new PdfTextExtractor(page);
+                columnData.AddRange(extractor.GetColumn(txtColumnName.Text));
+            }
+            txtOutput.Lines = columnData.ToArray();
+        }
+
+        /// <summary>
+        /// Shows in the output box, one entry per page of the selected PDF, the value found
+        /// next to the field title typed in <c>txtFieldName</c>.
+        /// </summary>
+        private void btnGetField_Click(object sender, EventArgs e)
+        {
+            if (System.IO.File.Exists(txtPdfPath.Text) == false)
+            {
+                MessageBox.Show("File does not exist");
+                return;
+            }
+
+            PdfDocument doc = PdfDocument.Load(txtPdfPath.Text);
+
+            var fieldData = new List<string>();
+            foreach (PdfDocumentPage page in doc.Pages)
+            {
+                PdfTextExtractor extractor = new PdfTextExtractor(page);
+                // GetField returns null when the page has no such field; the null entry is
+                // added as-is (presumably rendered as an empty line - TODO confirm)
+                fieldData.Add(extractor.GetField(txtFieldName.Text));
+            }
+            txtOutput.Lines = fieldData.ToArray();
+        }
     }
 }
diff --git a/VAR.PdfTools.Workbench/Properties/Settings.Designer.cs b/VAR.PdfTools.Workbench/Properties/Settings.Designer.cs
index 15e7fc3..d6561d7 100644
--- a/VAR.PdfTools.Workbench/Properties/Settings.Designer.cs
+++ b/VAR.PdfTools.Workbench/Properties/Settings.Designer.cs
@@ -34,5 +34,29 @@ namespace VAR.PdfTools.Workbench.Properties {
                 this["LastPdfPath"] = value;
             }
         }
+
+        [global::System.Configuration.UserScopedSettingAttribute()]
+        [global::System.Diagnostics.DebuggerNonUserCodeAttribute()]
+        [global::System.Configuration.DefaultSettingValueAttribute("")]
+        public string LastColumnName {
+            get {
+                return ((string)(this["LastColumnName"]));
+            }
+            set {
+                this["LastColumnName"] = value;
+            }
+        }
+
+        [global::System.Configuration.UserScopedSettingAttribute()]
+        [global::System.Diagnostics.DebuggerNonUserCodeAttribute()]
+        [global::System.Configuration.DefaultSettingValueAttribute("")]
+        public string LastFieldName {
+            get {
+                return ((string)(this["LastFieldName"]));
+            }
+            set {
+                this["LastFieldName"] = value;
+            }
+        }
     }
 }
diff --git a/VAR.PdfTools.Workbench/Properties/Settings.settings b/VAR.PdfTools.Workbench/Properties/Settings.settings
index 3a7c7fe..a2e8457 100644
--- a/VAR.PdfTools.Workbench/Properties/Settings.settings
+++ b/VAR.PdfTools.Workbench/Properties/Settings.settings
@@ -5,5 +5,11 @@
+ + + + + +
\ No newline at end of file
diff --git a/VAR.PdfTools.Workbench/VAR.PdfTools.Workbench.csproj b/VAR.PdfTools.Workbench/VAR.PdfTools.Workbench.csproj
index 4270173..82dbb1b 100644
--- a/VAR.PdfTools.Workbench/VAR.PdfTools.Workbench.csproj
+++ b/VAR.PdfTools.Workbench/VAR.PdfTools.Workbench.csproj
@@ -32,6 +32,9 @@
prompt 4
+ + +
diff --git a/VAR.PdfTools/PdfTextExtractor.cs b/VAR.PdfTools/PdfTextExtractor.cs
index cbde9b2..e87e77d 100644
--- a/VAR.PdfTools/PdfTextExtractor.cs
+++ b/VAR.PdfTools/PdfTextExtractor.cs
@@ -1,4 +1,5 @@
using
System.Collections.Generic;
+using System.Linq;
 using System.Text;
 
 namespace VAR.PdfTools
@@ -122,6 +123,8 @@ namespace VAR.PdfTools
 
     public class PdfTextElement
     {
+        #region Properties
+
         public PdfFont Font { get; set; }
 
         public double FontSize { get; set; }
@@ -135,6 +138,28 @@ namespace VAR.PdfTools
         public double VisibleWidth { get; set; }
 
         public double VisibleHeight { get; set; }
+
+        #endregion
+
+        #region Public methods
+
+        /// <summary>
+        /// Gets the X coordinate of the element: entry [0, 2] of its matrix.
+        /// </summary>
+        public double GetX()
+        {
+            return Matrix.Matrix[0, 2];
+        }
+
+        /// <summary>
+        /// Gets the Y coordinate of the element: entry [1, 2] of its matrix.
+        /// </summary>
+        public double GetY()
+        {
+            return Matrix.Matrix[1, 2];
+        }
+
+        #endregion
     }
 
     public class PdfTextExtractor
@@ -218,6 +243,53 @@ namespace VAR.PdfTools
             _textWidth = 0;
         }
 
+        /// <summary>
+        /// Finds the first text element whose visible text is exactly <paramref name="text"/>.
+        /// </summary>
+        /// <param name="text">Text to look for (compared with the string equality operator).</param>
+        /// <returns>The first matching element, or <c>null</c> when none matches.</returns>
+        private PdfTextElement FindElementByText(string text)
+        {
+            foreach (PdfTextElement elem in _textElements)
+            {
+                if (elem.VisibleText == text)
+                {
+                    return elem;
+                }
+            }
+            return null;
+        }
+
+        /// <summary>
+        /// Tells whether two elements overlap on the X axis (from GetX to GetX + VisibleWidth),
+        /// which is how GetColumn decides that an element sits in the same column as another.
+        /// </summary>
+        private bool TextElementVerticalIntersection(PdfTextElement elem1, PdfTextElement elem2)
+        {
+            double elem1X1 = elem1.GetX();
+            double elem1X2 = elem1.GetX() + elem1.VisibleWidth;
+            double elem2X1 = elem2.GetX();
+            double elem2X2 = elem2.GetX() + elem2.VisibleWidth;
+
+            // Closed-interval overlap test: ranges that merely touch count as intersecting
+            return elem1X2 >= elem2X1 && elem2X2 >= elem1X1;
+        }
+
+        /// <summary>
+        /// Tells whether two elements overlap on the Y axis (from GetY to GetY + VisibleHeight),
+        /// which is how GetField decides that an element sits on the same row as another.
+        /// </summary>
+        private bool TextElementHorizontalIntersection(PdfTextElement elem1, PdfTextElement elem2)
+        {
+            double elem1Y1 = elem1.GetY();
+            double elem1Y2 = elem1.GetY() + elem1.VisibleHeight;
+            double elem2Y1 = elem2.GetY();
+            double elem2Y2 = elem2.GetY() + elem2.VisibleHeight;
+
+            // Closed-interval overlap test: ranges that merely touch count as intersecting
+            return elem1Y2 >= elem2Y1 && elem2Y2 >= elem1Y1;
+        }
+
         #endregion
 
         #region Operations
@@ -451,5 +523,107 @@ namespace VAR.PdfTools
         }
 
         #endregion
+
+        #region Public methods
+
+        /// <summary>
+        /// Gets the texts of the elements stacked after a column heading (overlapping it on the X axis, with a smaller Y), nearest first.
+        /// </summary>
+        /// <param name="column">Exact visible text of the column heading.</param>
+        /// <returns>
+        /// The visible texts of the column cells; an empty list when no element matches the heading.
+        /// </returns>
+        /// <remarks>
+        /// Cells are collected while the gap to the previous one does not exceed the running
+        /// average of the gaps seen so far; the first wider gap ends the column.
+        /// </remarks>
+        public List<string> GetColumn(string column)
+        {
+            // Slack so that floating-point rounding between equally spaced rows does not end the column early
+            const double gapTolerance = 1e-6;
+
+            PdfTextElement columnHead = FindElementByText(column);
+            if (columnHead == null)
+            {
+                return new List<string>();
+            }
+            double headY = columnHead.GetY();
+
+            // Get all the elements that intersect vertically with a Y smaller than the heading's, and sort
+            var columnData = new List<PdfTextElement>();
+            foreach (PdfTextElement elem in _textElements)
+            {
+                if (TextElementVerticalIntersection(columnHead, elem) == false) { continue; }
+                double elemY = elem.GetY();
+                if (elemY >= headY) { continue; }
+
+                columnData.Add(elem);
+            }
+            columnData = columnData.OrderByDescending(elem => elem.GetY()).ToList();
+
+            // Filter only nearest elements
+            var result = new List<string>();
+            double prevY = headY;
+            double medDiff = 0;
+            bool first = true;
+            foreach (PdfTextElement elem in columnData)
+            {
+                double elemY = elem.GetY();
+                double diff = prevY - elemY;
+                prevY = elemY;
+                if (first)
+                {
+                    // The first gap seeds the running average, so the first element is always kept
+                    first = false;
+                    medDiff = diff;
+                }
+                if (diff > medDiff + gapTolerance) { break; }
+                medDiff = (medDiff + diff) / 2;
+
+                result.Add(elem.VisibleText);
+            }
+            return result;
+        }
+
+        /// <summary>
+        /// Gets the text of the nearest element after a field title (overlapping it on the Y axis, with a greater X).
+        /// </summary>
+        /// <param name="column">Exact visible text of the field title.</param>
+        /// <returns>
+        /// The visible text of the candidate with the smallest X, or <c>null</c> when the
+        /// title is not found or no element qualifies.
+        /// </returns>
+        public string GetField(string column)
+        {
+            PdfTextElement fieldTitle = FindElementByText(column);
+            if (fieldTitle == null)
+            {
+                return null;
+            }
+            double titleX = fieldTitle.GetX();
+
+            // Track the candidate with the smallest X in one pass; on ties the first one found wins
+            PdfTextElement nearest = null;
+            foreach (PdfTextElement elem in _textElements)
+            {
+                if (TextElementHorizontalIntersection(fieldTitle, elem) == false) { continue; }
+                double elemX = elem.GetX();
+                if (elemX <= titleX) { continue; }
+
+                if (nearest == null || elemX < nearest.GetX())
+                {
+                    nearest = elem;
+                }
+            }
+
+            if (nearest == null)
+            {
+                return null;
+            }
+
+            return nearest.VisibleText;
+        }
+
+        #endregion
     }
 }