21 Commits
1_5 ... 1_6_0

Author SHA1 Message Date
d5d843014a Bump version 1.6.0 2019-10-28 02:58:59 +01:00
b9750745bc FrmPdfInfo: Allow raw coordinates input for GetColumn. 2019-10-28 02:58:28 +01:00
c8c7e32acc PdfTextExtractor: Better column extraction, spliting big TextElements. 2019-10-28 02:57:42 +01:00
781f212289 PdfPageRenderer: Fix Rendering of null pages. 2019-10-28 00:43:50 +01:00
8a966049f6 PdfPageRenderer: Adjust column rendering. 2019-10-27 22:40:52 +01:00
80ab9b9ff3 FrmPdfInfo: Better configuration handling with the Configuration class. 2019-10-27 22:36:54 +01:00
9af363529c PdfTextExtractor: Get results as PdfTextElementColumn, for debugging purposes. 2019-10-27 18:45:13 +01:00
386b38bd21 PdfPageRenderer: Refactor using Rect. 2019-10-27 13:12:11 +01:00
53d07db9c0 Use Rect class for size definition of TextElements and pages. 2019-10-27 13:11:40 +01:00
9bc7854b48 README.md: Adjust year on LICENSE section. 2019-10-27 12:43:32 +01:00
77a5cd1b0e PdfTextExtractor: Adjust public method names. 2019-10-27 12:40:51 +01:00
b6611b6285 Put class PdfTextElement in his own file. 2019-10-27 12:37:16 +01:00
7badc8e4b1 PdfPageRenderer: Better rendering of character size. 2019-10-27 09:59:46 +01:00
203f30e55c FrmPdfInfo: Pages selector.
A simple textbox where you can put page numbers separated by comma. And ranges joined by dash.
2019-10-27 09:59:08 +01:00
c3967dd439 Set C# lang version to 6.0. 2019-10-27 09:57:24 +01:00
da8b512c1b Move page rendering code to PdfPageRenderer. 2019-10-27 08:58:34 +01:00
beb3b931ea Bump version 1.5.2 2019-10-21 13:09:13 +02:00
8806020036 ignore ".vs" directory. 2019-10-21 13:08:44 +02:00
f3b7cd1b0d PdfTextExtractor: Better joining and splitting heuristics. 2019-10-21 13:08:19 +02:00
33f9723ac6 Bump version: 1.5.1 2017-11-14 13:34:21 +01:00
13ba41f851 PdfTextExtractor: Change Join and Split logic to use max character width of the elements. 2017-11-02 13:27:38 +01:00
18 changed files with 800 additions and 403 deletions

2
.gitignore vendored
View File

@@ -27,3 +27,5 @@ obj/
_ReSharper*/ _ReSharper*/
*.userprefs *.userprefs
*.nupkg *.nupkg
.vs

View File

@@ -1,6 +1,6 @@
The MIT License (MIT) The MIT License (MIT)
Copyright (c) 2016-2017 Valeriano Alfonso Rodriguez Copyright (c) 2016-2019 Valeriano Alfonso Rodriguez
Permission is hereby granted, free of charge, to any person obtaining a copy Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal of this software and associated documentation files (the "Software"), to deal

View File

@@ -5,27 +5,33 @@
### VAR.PdfTools ### VAR.PdfTools
Add the resulting assembly as reference in your projects, and this line on code: Add the resulting assembly as reference in your projects, and this line on code:
```csharp
using VAR.PdfTools; using VAR.PdfTools;
```
Then extract the contents of a data column using: Then extract the contents of a data column using:
```csharp
var columnData = new List<string>(); var columnData = new List<string>();
PdfDocument doc = PdfDocument.Load("document.pdf"); PdfDocument doc = PdfDocument.Load("document.pdf");
foreach (PdfDocumentPage page in doc.Pages) foreach (PdfDocumentPage page in doc.Pages)
{ {
PdfTextExtractor extractor = new PdfTextExtractor(page); PdfTextExtractor extractor = new PdfTextExtractor(page);
columnData.AddRange(extractor.GetColumn("Column")); columnData.AddRange(extractor.GetColumnAsStrings("Column"));
} }
```
Or the content of a field (text on the right of the indicated text): Or the content of a field (text on the right of the indicated text):
```csharp
var fieldData = new List<string>(); var fieldData = new List<string>();
PdfDocument doc = PdfDocument.Load("document.pdf"); PdfDocument doc = PdfDocument.Load("document.pdf");
foreach (PdfDocumentPage page in doc.Pages) foreach (PdfDocumentPage page in doc.Pages)
{ {
PdfTextExtractor extractor = new PdfTextExtractor(page); PdfTextExtractor extractor = new PdfTextExtractor(page);
fieldData.Add(extractor.GetField(txtFieldName.Text)); fieldData.Add(extractor.GetFieldAsString(txtFieldName.Text));
} }
```
### VAR.PdfTools.Workbench ### VAR.PdfTools.Workbench
It is a simple Windows.Forms application, to test basic functionality of the library. It is a simple Windows.Forms application, to test basic functionality of the library.
@@ -34,6 +40,7 @@ It is a simple Windows.Forms application, to test basic funcitionallity of the l
Visual Studio 2015 and 2010 solutions are provided. Simply click Build in the IDE. Visual Studio 2015 and 2010 solutions are provided. Simply click Build in the IDE.
A NuGet package can be built using: A NuGet package can be built using:
VAR.PdfTools\Build.NuGet.cmd VAR.PdfTools\Build.NuGet.cmd
## Contributing ## Contributing
@@ -50,7 +57,7 @@ A .nuget package can be build using:
The MIT License (MIT) The MIT License (MIT)
Copyright (c) 2016-2017 Valeriano Alfonso Rodriguez Copyright (c) 2016-2019 Valeriano Alfonso Rodriguez
Permission is hereby granted, free of charge, to any person obtaining a copy Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal of this software and associated documentation files (the "Software"), to deal

View File

@@ -0,0 +1,117 @@
using System.Collections.Generic;
using System.IO;
using System.Text;
namespace VAR.PdfTools.Workbench
{
/// <summary>
/// Minimal key/value configuration store. Entries are kept in memory and
/// persisted as a text file located next to the application's assembly,
/// one <c>key|value</c> entry per line.
/// </summary>
/// <remarks>
/// When loading, each line is split at its first '|', so keys must not contain
/// '|' and neither keys nor values may contain line breaks. Values may contain '|'.
/// </remarks>
public class Configuration
{
    private readonly Dictionary<string, string> _configItems = new Dictionary<string, string>();

    /// <summary>
    /// Builds the full path of the configuration file: same directory and base
    /// name as the application's assembly, with a ".cfg" extension.
    /// </summary>
    private static string GetConfigFileName()
    {
        // GetEntryAssembly() can return null (e.g. when hosted by unmanaged code),
        // so fall back to the assembly that contains this class.
        System.Reflection.Assembly assembly =
            System.Reflection.Assembly.GetEntryAssembly() ?? System.Reflection.Assembly.GetExecutingAssembly();
        string location = assembly.Location;
        string path = Path.GetDirectoryName(location);
        string filenameWithoutExtension = Path.GetFileNameWithoutExtension(location);
        return Path.Combine(path, filenameWithoutExtension + ".cfg");
    }

    /// <summary>
    /// Reads every line of the configuration file, or returns an empty array
    /// when the file does not exist.
    /// </summary>
    private static string[] GetConfigurationLines()
    {
        string configFile = GetConfigFileName();
        return File.Exists(configFile) ? File.ReadAllLines(configFile) : new string[0];
    }

    /// <summary>
    /// Discards the current entries and reloads them from the configuration file.
    /// Lines without a '|' separator are ignored; when a key appears more than
    /// once the last occurrence wins.
    /// </summary>
    public void Load()
    {
        _configItems.Clear();
        foreach (string configLine in GetConfigurationLines())
        {
            int idxSplit = configLine.IndexOf('|');
            if (idxSplit < 0) { continue; }

            string configName = configLine.Substring(0, idxSplit);
            string configData = configLine.Substring(idxSplit + 1);
            _configItems[configName] = configData;
        }
    }

    /// <summary>
    /// Writes all current entries to the configuration file, replacing its
    /// previous contents.
    /// </summary>
    public void Save()
    {
        StringBuilder sbConfig = new StringBuilder();
        foreach (KeyValuePair<string, string> pair in _configItems)
        {
            sbConfig.AppendFormat("{0}|{1}\n", pair.Key, pair.Value);
        }
        File.WriteAllText(GetConfigFileName(), sbConfig.ToString());
    }

    /// <summary>Gets the string stored under <paramref name="key"/>.</summary>
    /// <param name="key">Entry name.</param>
    /// <param name="defaultValue">Value returned when the key is not present.</param>
    /// <returns>The stored value, or <paramref name="defaultValue"/>.</returns>
    public string Get(string key, string defaultValue)
    {
        string value;
        return _configItems.TryGetValue(key, out value) ? value : defaultValue;
    }

    /// <summary>Gets the boolean stored under <paramref name="key"/>.</summary>
    /// <param name="key">Entry name.</param>
    /// <param name="defaultValue">Value returned when the key is not present.</param>
    /// <returns>
    /// <c>true</c> only when the stored text is exactly "true"; any other stored
    /// text yields <c>false</c>. <paramref name="defaultValue"/> when the key is missing.
    /// </returns>
    public bool Get(string key, bool defaultValue)
    {
        string value;
        return _configItems.TryGetValue(key, out value) ? (value == "true") : defaultValue;
    }

    /// <summary>Stores a string value, replacing any previous value for the key.</summary>
    /// <param name="key">Entry name.</param>
    /// <param name="value">Text to store.</param>
    public void Set(string key, string value)
    {
        _configItems[key] = value;
    }

    /// <summary>Stores a boolean value as the text "true" or "false".</summary>
    /// <param name="key">Entry name.</param>
    /// <param name="value">Flag to store.</param>
    public void Set(string key, bool value)
    {
        Set(key, value ? "true" : "false");
    }
}
}

View File

@@ -47,6 +47,8 @@
this.btnGetField3 = new System.Windows.Forms.Button(); this.btnGetField3 = new System.Windows.Forms.Button();
this.txtField3 = new System.Windows.Forms.TextBox(); this.txtField3 = new System.Windows.Forms.TextBox();
this.btnGetColumn3 = new System.Windows.Forms.Button(); this.btnGetColumn3 = new System.Windows.Forms.Button();
this.txtPages = new System.Windows.Forms.TextBox();
this.chkRender = new System.Windows.Forms.CheckBox();
this.SuspendLayout(); this.SuspendLayout();
// //
// lblOutputs // lblOutputs
@@ -119,7 +121,7 @@
// //
this.btnGetColumn1.Location = new System.Drawing.Point(292, 51); this.btnGetColumn1.Location = new System.Drawing.Point(292, 51);
this.btnGetColumn1.Name = "btnGetColumn1"; this.btnGetColumn1.Name = "btnGetColumn1";
this.btnGetColumn1.Size = new System.Drawing.Size(60, 23); this.btnGetColumn1.Size = new System.Drawing.Size(69, 23);
this.btnGetColumn1.TabIndex = 12; this.btnGetColumn1.TabIndex = 12;
this.btnGetColumn1.Text = "GetColumn"; this.btnGetColumn1.Text = "GetColumn";
this.btnGetColumn1.UseVisualStyleBackColor = true; this.btnGetColumn1.UseVisualStyleBackColor = true;
@@ -194,7 +196,7 @@
// //
this.btnGetColumn2.Location = new System.Drawing.Point(292, 80); this.btnGetColumn2.Location = new System.Drawing.Point(292, 80);
this.btnGetColumn2.Name = "btnGetColumn2"; this.btnGetColumn2.Name = "btnGetColumn2";
this.btnGetColumn2.Size = new System.Drawing.Size(60, 23); this.btnGetColumn2.Size = new System.Drawing.Size(69, 23);
this.btnGetColumn2.TabIndex = 19; this.btnGetColumn2.TabIndex = 19;
this.btnGetColumn2.Text = "GetColumn"; this.btnGetColumn2.Text = "GetColumn";
this.btnGetColumn2.UseVisualStyleBackColor = true; this.btnGetColumn2.UseVisualStyleBackColor = true;
@@ -231,17 +233,37 @@
// //
this.btnGetColumn3.Location = new System.Drawing.Point(292, 109); this.btnGetColumn3.Location = new System.Drawing.Point(292, 109);
this.btnGetColumn3.Name = "btnGetColumn3"; this.btnGetColumn3.Name = "btnGetColumn3";
this.btnGetColumn3.Size = new System.Drawing.Size(60, 23); this.btnGetColumn3.Size = new System.Drawing.Size(69, 23);
this.btnGetColumn3.TabIndex = 23; this.btnGetColumn3.TabIndex = 23;
this.btnGetColumn3.Text = "GetColumn"; this.btnGetColumn3.Text = "GetColumn";
this.btnGetColumn3.UseVisualStyleBackColor = true; this.btnGetColumn3.UseVisualStyleBackColor = true;
this.btnGetColumn3.Click += new System.EventHandler(this.btnGetColumn3_Click); this.btnGetColumn3.Click += new System.EventHandler(this.btnGetColumn3_Click);
// //
// txtPages
//
this.txtPages.Anchor = ((System.Windows.Forms.AnchorStyles)((System.Windows.Forms.AnchorStyles.Top | System.Windows.Forms.AnchorStyles.Right)));
this.txtPages.Location = new System.Drawing.Point(397, 82);
this.txtPages.Name = "txtPages";
this.txtPages.Size = new System.Drawing.Size(75, 20);
this.txtPages.TabIndex = 27;
//
// chkRender
//
this.chkRender.AutoSize = true;
this.chkRender.Location = new System.Drawing.Point(292, 138);
this.chkRender.Name = "chkRender";
this.chkRender.Size = new System.Drawing.Size(61, 17);
this.chkRender.TabIndex = 28;
this.chkRender.Text = "Render";
this.chkRender.UseVisualStyleBackColor = true;
//
// FrmPdfInfo // FrmPdfInfo
// //
this.AutoScaleDimensions = new System.Drawing.SizeF(6F, 13F); this.AutoScaleDimensions = new System.Drawing.SizeF(6F, 13F);
this.AutoScaleMode = System.Windows.Forms.AutoScaleMode.Font; this.AutoScaleMode = System.Windows.Forms.AutoScaleMode.Font;
this.ClientSize = new System.Drawing.Size(484, 461); this.ClientSize = new System.Drawing.Size(484, 461);
this.Controls.Add(this.chkRender);
this.Controls.Add(this.txtPages);
this.Controls.Add(this.btnHasText3); this.Controls.Add(this.btnHasText3);
this.Controls.Add(this.btnGetField3); this.Controls.Add(this.btnGetField3);
this.Controls.Add(this.txtField3); this.Controls.Add(this.txtField3);
@@ -291,5 +313,7 @@
private System.Windows.Forms.Button btnGetField3; private System.Windows.Forms.Button btnGetField3;
private System.Windows.Forms.TextBox txtField3; private System.Windows.Forms.TextBox txtField3;
private System.Windows.Forms.Button btnGetColumn3; private System.Windows.Forms.Button btnGetColumn3;
private System.Windows.Forms.TextBox txtPages;
private System.Windows.Forms.CheckBox chkRender;
} }
} }

View File

@@ -1,7 +1,6 @@
using System; using System;
using System.Collections.Generic; using System.Collections.Generic;
using System.Drawing; using System.Drawing;
using System.Drawing.Drawing2D;
using System.Drawing.Imaging; using System.Drawing.Imaging;
using System.IO; using System.IO;
using System.Linq; using System.Linq;
@@ -20,19 +19,27 @@ namespace VAR.PdfTools.Workbench
private void FrmPdfInfo_Load(object sender, EventArgs e) private void FrmPdfInfo_Load(object sender, EventArgs e)
{ {
txtPdfPath.Text = Properties.Settings.Default.LastPdfPath; var configuration = new Configuration();
txtField1.Text = Properties.Settings.Default.Field1; configuration.Load();
txtField2.Text = Properties.Settings.Default.Field2; txtPdfPath.Text = configuration.Get("LastPdfPath", string.Empty);
txtField3.Text = Properties.Settings.Default.Field3; txtField1.Text = configuration.Get("Field1", string.Empty);
txtField2.Text = configuration.Get("Field2", string.Empty);
txtField3.Text = configuration.Get("Field3", string.Empty);
txtPages.Text = configuration.Get("Pages", string.Empty);
chkRender.Checked = configuration.Get("Render", false);
} }
private void FrmPdfInfo_FormClosing(object sender, FormClosingEventArgs e) private void FrmPdfInfo_FormClosing(object sender, FormClosingEventArgs e)
{ {
Properties.Settings.Default.LastPdfPath = txtPdfPath.Text; var configuration = new Configuration();
Properties.Settings.Default.Field1 = txtField1.Text; var configItems = new Dictionary<string, string>();
Properties.Settings.Default.Field2 = txtField2.Text; configuration.Set("LastPdfPath", txtPdfPath.Text);
Properties.Settings.Default.Field3 = txtField3.Text; configuration.Set("Field1", txtField1.Text);
Properties.Settings.Default.Save(); configuration.Set("Field2", txtField2.Text);
configuration.Set("Field3", txtField3.Text);
configuration.Set("Pages", txtPages.Text);
configuration.Set("Render", chkRender.Checked);
configuration.Save();
} }
private void btnBrowse_Click(object sender, EventArgs e) private void btnBrowse_Click(object sender, EventArgs e)
@@ -191,6 +198,55 @@ namespace VAR.PdfTools.Workbench
Action_GetColumn(pdfPath, column); Action_GetColumn(pdfPath, column);
} }
/// <summary>
/// Parses the page selection typed in <c>txtPages</c>: page numbers separated
/// by commas, with ranges written as <c>first-last</c> (for example "1,3,5-8").
/// </summary>
/// <param name="maxPages">Number of pages of the document.</param>
/// <returns>
/// The selected page numbers. When the textbox is empty, or nothing in it could
/// be parsed, every page from 1 to <paramref name="maxPages"/> is returned.
/// </returns>
private IEnumerable<int> GetSelectedPages(int maxPages)
{
    string pages = txtPages.Text;
    if (string.IsNullOrEmpty(pages))
    {
        return Enumerable.Range(1, maxPages);
    }

    List<int> listPages = new List<int>();

    // Split yields the whole text as a single part when there is no comma.
    foreach (string part in pages.Split(','))
    {
        if (part.Contains("-"))
        {
            string[] range = part.Split('-');
            int pageStart;
            int pageEnd;

            // Reversed ranges ("5-2") are ignored like any other unparsable
            // part, instead of making Enumerable.Range throw on a negative count.
            if (range.Length == 2 &&
                int.TryParse(range[0], out pageStart) &&
                int.TryParse(range[1], out pageEnd) &&
                pageEnd >= pageStart)
            {
                // Pages beyond the document can never be matched, so stop the
                // range at maxPages to avoid building huge lists for input such
                // as "1-2000000000". The range start is always kept so that a
                // range lying completely outside the document still counts as
                // an (empty) selection rather than falling back to all pages.
                int lastPage = Math.Min(pageEnd, Math.Max(maxPages, pageStart));
                listPages.AddRange(Enumerable.Range(pageStart, (lastPage - pageStart) + 1));
            }
        }
        else
        {
            int pageNum;
            if (int.TryParse(part, out pageNum))
            {
                listPages.Add(pageNum);
            }
        }
    }

    if (listPages.Count == 0)
    {
        listPages.AddRange(Enumerable.Range(1, maxPages));
    }
    return listPages;
}
private void Action_HasText(string pdfPath, string text) private void Action_HasText(string pdfPath, string text)
{ {
if (System.IO.File.Exists(pdfPath) == false) if (System.IO.File.Exists(pdfPath) == false)
@@ -201,10 +257,13 @@ namespace VAR.PdfTools.Workbench
PdfDocument doc = PdfDocument.Load(pdfPath); PdfDocument doc = PdfDocument.Load(pdfPath);
IEnumerable<int> selectedPages = GetSelectedPages(doc.Pages.Count);
List<string> lines = new List<string>(); List<string> lines = new List<string>();
int pageNum = 1; int pageNum = 0;
foreach (PdfDocumentPage page in doc.Pages) foreach (PdfDocumentPage page in doc.Pages)
{ {
pageNum++;
if (selectedPages.Contains(pageNum) == false) { continue; }
PdfTextExtractor extractor = new PdfTextExtractor(page); PdfTextExtractor extractor = new PdfTextExtractor(page);
lines.Add(string.Format("Page({0}) : {1}", pageNum, Convert.ToString(extractor.HasText(text)))); lines.Add(string.Format("Page({0}) : {1}", pageNum, Convert.ToString(extractor.HasText(text))));
} }
@@ -221,11 +280,15 @@ namespace VAR.PdfTools.Workbench
PdfDocument doc = PdfDocument.Load(pdfPath); PdfDocument doc = PdfDocument.Load(pdfPath);
IEnumerable<int> selectedPages = GetSelectedPages(doc.Pages.Count);
var fieldData = new List<string>(); var fieldData = new List<string>();
int pageNum = 0;
foreach (PdfDocumentPage page in doc.Pages) foreach (PdfDocumentPage page in doc.Pages)
{ {
pageNum++;
if (selectedPages.Contains(pageNum) == false) { continue; }
PdfTextExtractor extractor = new PdfTextExtractor(page); PdfTextExtractor extractor = new PdfTextExtractor(page);
fieldData.Add(extractor.GetField(field)); fieldData.Add(extractor.GetFieldAsString(field));
} }
txtOutput.Lines = fieldData.ToArray(); txtOutput.Lines = fieldData.ToArray();
} }
@@ -239,14 +302,41 @@ namespace VAR.PdfTools.Workbench
} }
PdfDocument doc = PdfDocument.Load(pdfPath); PdfDocument doc = PdfDocument.Load(pdfPath);
string baseDocumentPath = Path.GetDirectoryName(txtPdfPath.Text);
string baseDocumentFilename = Path.GetFileNameWithoutExtension(txtPdfPath.Text);
var columnData = new List<string>(); IEnumerable<int> selectedPages = GetSelectedPages(doc.Pages.Count);
var columns = new List<string>();
int pageNum = 0;
foreach (PdfDocumentPage page in doc.Pages) foreach (PdfDocumentPage page in doc.Pages)
{ {
pageNum++;
if (selectedPages.Contains(pageNum) == false) { continue; }
PdfTextExtractor extractor = new PdfTextExtractor(page); PdfTextExtractor extractor = new PdfTextExtractor(page);
columnData.AddRange(extractor.GetColumn(column)); PdfTextElementColumn columnData;
if (column.StartsWith("#"))
{
string[] columnParts = column.Substring(1).Split(';');
double y = Convert.ToDouble(columnParts[0]);
double x1 = Convert.ToDouble(columnParts[1]);
double x2 = Convert.ToDouble(columnParts[2]);
columnData = extractor.GetColumn(null, y, x1, x2, x1, x2);
} }
txtOutput.Lines = columnData.ToArray(); else
{
columnData = extractor.GetColumn(column);
}
if (chkRender.Checked)
{
var pdfPageRenderer = new PdfPageRenderer(extractor);
Bitmap bmp = pdfPageRenderer.Render();
pdfPageRenderer.RenderColumn(columnData, bmp);
string fileName = Path.Combine(baseDocumentPath, string.Format("{0}_{1:0000}.png", baseDocumentFilename, pageNum));
bmp.Save(fileName, ImageFormat.Png);
}
columns.AddRange(columnData.Elements.Select(t => t.VisibleText));
}
txtOutput.Lines = columns.ToArray();
} }
private void btnRender_Click(object sender, EventArgs e) private void btnRender_Click(object sender, EventArgs e)
@@ -257,8 +347,6 @@ namespace VAR.PdfTools.Workbench
return; return;
} }
int MaxSize = 10000;
PdfDocument doc = PdfDocument.Load(txtPdfPath.Text); PdfDocument doc = PdfDocument.Load(txtPdfPath.Text);
string baseDocumentPath = Path.GetDirectoryName(txtPdfPath.Text); string baseDocumentPath = Path.GetDirectoryName(txtPdfPath.Text);
string baseDocumentFilename = Path.GetFileNameWithoutExtension(txtPdfPath.Text); string baseDocumentFilename = Path.GetFileNameWithoutExtension(txtPdfPath.Text);
@@ -267,139 +355,24 @@ namespace VAR.PdfTools.Workbench
lines.Add(string.Format("Filename : {0}", baseDocumentFilename)); lines.Add(string.Format("Filename : {0}", baseDocumentFilename));
lines.Add(string.Format("Number of Pages : {0}", doc.Pages.Count)); lines.Add(string.Format("Number of Pages : {0}", doc.Pages.Count));
int pageNumber = 1; IEnumerable<int> selectedPages = GetSelectedPages(doc.Pages.Count);
int pageNum = 0;
foreach (PdfDocumentPage page in doc.Pages) foreach (PdfDocumentPage page in doc.Pages)
{ {
double pageXMin = double.MaxValue; pageNum++;
double pageYMin = double.MaxValue; if (selectedPages.Contains(pageNum) == false) { continue; }
double pageXMax = double.MinValue;
double pageYMax = double.MinValue;
// Preprocess page to get max size PdfPageRenderer pdfPageRenderer = new PdfPageRenderer(page);
PdfTextExtractor extractor = new PdfTextExtractor(page); Bitmap bmp = pdfPageRenderer.Render();
foreach (PdfTextElement textElement in extractor.Elements)
{
double textElementXMin = textElement.GetX();
double textElementYMax = textElement.GetY();
double textElementXMax = textElementXMin + textElement.VisibleWidth;
double textElementYMin = textElementYMax - textElement.VisibleHeight;
if (textElementXMax > pageXMax) { pageXMax = textElementXMax; } lines.Add(string.Format("Page {0:0000} TextElements : {1}", pageNum, pdfPageRenderer.Extractor.Elements.Count));
if (textElementYMax > pageYMax) { pageYMax = textElementYMax; }
if (textElementXMin < pageXMin) { pageXMin = textElementXMin; }
if (textElementYMin < pageYMin) { pageYMin = textElementYMin; }
}
lines.Add(string.Format("Page {0:0000} TextElements : {1}", pageNumber, extractor.Elements.Count));
// Prepare page image
int pageWidth = (int)Math.Ceiling(pageXMax - pageXMin);
int pageHeight = (int)Math.Ceiling(pageYMax - pageYMin);
int Scale = 10;
while ((pageWidth * Scale) > MaxSize) { Scale--; }
while ((pageHeight * Scale) > MaxSize) { Scale--; }
if (Scale <= 0) { Scale = 1; }
using (Bitmap bmp = new Bitmap(pageWidth * Scale, pageHeight * Scale, PixelFormat.Format32bppArgb))
using (Graphics gc = Graphics.FromImage(bmp))
using (Pen penTextElem = new Pen(Color.Blue))
{
gc.Clear(Color.White);
// Draw text elements
foreach (PdfTextElement textElement in extractor.Elements)
{
DrawTextElement(textElement, gc, penTextElem, Scale, pageHeight, pageXMin, pageYMin);
}
// Save image to disk // Save image to disk
string fileName = Path.Combine(baseDocumentPath, string.Format("{0}_{1:0000}.png", baseDocumentFilename, pageNumber)); string fileName = Path.Combine(baseDocumentPath, string.Format("{0}_{1:0000}.png", baseDocumentFilename, pageNum));
bmp.Save(fileName, ImageFormat.Png); bmp.Save(fileName, ImageFormat.Png);
} }
pageNumber++;
}
txtOutput.Lines = lines.ToArray(); txtOutput.Lines = lines.ToArray();
} }
private static void DrawTextElement(PdfTextElement textElement, Graphics gc, Pen penTextElem, int Scale, int pageHeight, double pageXMin, double pageYMin)
{
double textElementX = textElement.GetX() - pageXMin;
double textElementY = textElement.GetY() - pageYMin;
double textElementWidth = textElement.VisibleWidth;
double textElementHeight = textElement.VisibleHeight;
string textElementText = textElement.VisibleText;
string textElementFontName = (textElement.Font == null ? string.Empty : textElement.Font.Name);
if (textElementHeight < 0.0001) { return; }
double textElementPageX = textElementX;
double textElementPageY = pageHeight - textElementY;
DrawRoundedRectangle(gc, penTextElem,
(int)(textElementPageX * Scale),
(int)(textElementPageY * Scale),
(int)(textElementWidth * Scale),
(int)(textElementHeight * Scale),
5);
using (Font font = new Font("Arial", (int)(textElementHeight * Scale), GraphicsUnit.Pixel))
{
foreach (PdfCharElement c in textElement.Characters)
{
gc.DrawString(c.Char,
font,
Brushes.Black,
(int)((textElementPageX + c.Displacement) * Scale),
(int)(textElementPageY * Scale));
gc.FillRectangle(Brushes.Red,
(int)((textElementPageX + c.Displacement) * Scale),
(int)(textElementPageY * Scale),
2, 2);
gc.FillRectangle(Brushes.Green,
(int)((textElementPageX + c.Displacement + c.Width) * Scale),
(int)(textElementPageY * Scale),
2, 2);
}
}
}
public static GraphicsPath RoundedRect(int x, int y, int width, int height, int radius)
{
int diameter = radius * 2;
Size size = new Size(diameter, diameter);
Rectangle arc = new Rectangle(x, y, diameter, diameter);
GraphicsPath path = new GraphicsPath();
// top left arc
path.AddArc(arc, 180, 90);
// top right arc
arc.X = (x + width) - diameter;
path.AddArc(arc, 270, 90);
// bottom right arc
arc.Y = (y + height) - diameter;
path.AddArc(arc, 0, 90);
// bottom left arc
arc.X = x;
path.AddArc(arc, 90, 90);
path.CloseFigure();
return path;
}
public static void DrawRoundedRectangle(Graphics graphics, Pen pen, int x, int y, int width, int height, int cornerRadius)
{
if (graphics == null)
throw new ArgumentNullException("graphics");
if (pen == null)
throw new ArgumentNullException("pen");
using (GraphicsPath path = RoundedRect(x, y, width, height, cornerRadius))
{
graphics.DrawPath(pen, path);
}
}
} }
} }

View File

@@ -11,4 +11,4 @@ using System.Runtime.InteropServices;
[assembly: AssemblyCulture("")] [assembly: AssemblyCulture("")]
[assembly: ComVisible(false)] [assembly: ComVisible(false)]
[assembly: Guid("a5825d8e-9f81-49e0-b610-8ae5e46d02ea")] [assembly: Guid("a5825d8e-9f81-49e0-b610-8ae5e46d02ea")]
[assembly: AssemblyVersion("1.5.0.*")] [assembly: AssemblyVersion("1.6.0.*")]

View File

@@ -1,74 +0,0 @@
//------------------------------------------------------------------------------
// <auto-generated>
// This code was generated by a tool.
// Runtime Version:4.0.30319.42000
//
// Changes to this file may cause incorrect behavior and will be lost if
// the code is regenerated.
// </auto-generated>
//------------------------------------------------------------------------------
namespace VAR.PdfTools.Workbench.Properties {
[global::System.Runtime.CompilerServices.CompilerGeneratedAttribute()]
[global::System.CodeDom.Compiler.GeneratedCodeAttribute("Microsoft.VisualStudio.Editors.SettingsDesigner.SettingsSingleFileGenerator", "14.0.0.0")]
internal sealed partial class Settings : global::System.Configuration.ApplicationSettingsBase {
private static Settings defaultInstance = ((Settings)(global::System.Configuration.ApplicationSettingsBase.Synchronized(new Settings())));
public static Settings Default {
get {
return defaultInstance;
}
}
[global::System.Configuration.UserScopedSettingAttribute()]
[global::System.Diagnostics.DebuggerNonUserCodeAttribute()]
[global::System.Configuration.DefaultSettingValueAttribute("")]
public string LastPdfPath {
get {
return ((string)(this["LastPdfPath"]));
}
set {
this["LastPdfPath"] = value;
}
}
[global::System.Configuration.UserScopedSettingAttribute()]
[global::System.Diagnostics.DebuggerNonUserCodeAttribute()]
[global::System.Configuration.DefaultSettingValueAttribute("")]
public string Field1 {
get {
return ((string)(this["Field1"]));
}
set {
this["Field1"] = value;
}
}
[global::System.Configuration.UserScopedSettingAttribute()]
[global::System.Diagnostics.DebuggerNonUserCodeAttribute()]
[global::System.Configuration.DefaultSettingValueAttribute("")]
public string Field2 {
get {
return ((string)(this["Field2"]));
}
set {
this["Field2"] = value;
}
}
[global::System.Configuration.UserScopedSettingAttribute()]
[global::System.Diagnostics.DebuggerNonUserCodeAttribute()]
[global::System.Configuration.DefaultSettingValueAttribute("")]
public string Field3 {
get {
return ((string)(this["Field3"]));
}
set {
this["Field3"] = value;
}
}
}
}

View File

@@ -1,18 +0,0 @@
<?xml version='1.0' encoding='utf-8'?>
<SettingsFile xmlns="http://schemas.microsoft.com/VisualStudio/2004/01/settings" CurrentProfile="(Default)" GeneratedClassNamespace="VAR.PdfTools.Workbench.Properties" GeneratedClassName="Settings">
<Profiles />
<Settings>
<Setting Name="LastPdfPath" Type="System.String" Scope="User">
<Value Profile="(Default)" />
</Setting>
<Setting Name="Field1" Type="System.String" Scope="User">
<Value Profile="(Default)" />
</Setting>
<Setting Name="Field2" Type="System.String" Scope="User">
<Value Profile="(Default)" />
</Setting>
<Setting Name="Field3" Type="System.String" Scope="User">
<Value Profile="(Default)" />
</Setting>
</Settings>
</SettingsFile>

View File

@@ -23,6 +23,7 @@
<DefineConstants>DEBUG;TRACE</DefineConstants> <DefineConstants>DEBUG;TRACE</DefineConstants>
<ErrorReport>prompt</ErrorReport> <ErrorReport>prompt</ErrorReport>
<WarningLevel>4</WarningLevel> <WarningLevel>4</WarningLevel>
<LangVersion>6</LangVersion>
</PropertyGroup> </PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' "> <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release|AnyCPU' ">
<PlatformTarget>AnyCPU</PlatformTarget> <PlatformTarget>AnyCPU</PlatformTarget>
@@ -47,6 +48,7 @@
<Reference Include="System.Xml" /> <Reference Include="System.Xml" />
</ItemGroup> </ItemGroup>
<ItemGroup> <ItemGroup>
<Compile Include="Configuration.cs" />
<Compile Include="FrmPdfInfo.cs"> <Compile Include="FrmPdfInfo.cs">
<SubType>Form</SubType> <SubType>Form</SubType>
</Compile> </Compile>
@@ -55,16 +57,6 @@
</Compile> </Compile>
<Compile Include="Program.cs" /> <Compile Include="Program.cs" />
<Compile Include="Properties\AssemblyInfo.cs" /> <Compile Include="Properties\AssemblyInfo.cs" />
<None Include="app.config" />
<None Include="Properties\Settings.settings">
<Generator>SettingsSingleFileGenerator</Generator>
<LastGenOutput>Settings.Designer.cs</LastGenOutput>
</None>
<Compile Include="Properties\Settings.Designer.cs">
<AutoGen>True</AutoGen>
<DependentUpon>Settings.settings</DependentUpon>
<DesignTimeSharedInput>True</DesignTimeSharedInput>
</Compile>
</ItemGroup> </ItemGroup>
<ItemGroup> <ItemGroup>
<ProjectReference Include="..\VAR.PdfTools\VAR.PdfTools.csproj"> <ProjectReference Include="..\VAR.PdfTools\VAR.PdfTools.csproj">

View File

@@ -1,24 +0,0 @@
<?xml version="1.0" encoding="utf-8" ?>
<configuration>
<configSections>
<sectionGroup name="userSettings" type="System.Configuration.UserSettingsGroup, System, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089" >
<section name="VAR.PdfTools.Workbench.Properties.Settings" type="System.Configuration.ClientSettingsSection, System, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089" allowExeDefinition="MachineToLocalUser" requirePermission="false" />
</sectionGroup>
</configSections>
<userSettings>
<VAR.PdfTools.Workbench.Properties.Settings>
<setting name="LastPdfPath" serializeAs="String">
<value />
</setting>
<setting name="Field1" serializeAs="String">
<value />
</setting>
<setting name="Field2" serializeAs="String">
<value />
</setting>
<setting name="Field3" serializeAs="String">
<value />
</setting>
</VAR.PdfTools.Workbench.Properties.Settings>
</userSettings>
</configuration>

View File

@@ -0,0 +1,25 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
namespace VAR.PdfTools.Maths
{
/// <summary>
/// Axis-aligned rectangle described by its minimum and maximum X and Y coordinates.
/// </summary>
public class Rect
{
    /// <summary>Smallest X coordinate.</summary>
    public double XMin { get; set; }

    /// <summary>Largest X coordinate.</summary>
    public double XMax { get; set; }

    /// <summary>Smallest Y coordinate.</summary>
    public double YMin { get; set; }

    /// <summary>Largest Y coordinate.</summary>
    public double YMax { get; set; }

    /// <summary>
    /// Grows this rectangle so that it also covers <paramref name="rect"/>:
    /// each bound is replaced only when the other rectangle extends further.
    /// </summary>
    /// <param name="rect">Rectangle to include.</param>
    public void Add(Rect rect)
    {
        XMin = rect.XMin < XMin ? rect.XMin : XMin;
        YMin = rect.YMin < YMin ? rect.YMin : YMin;
        XMax = rect.XMax > XMax ? rect.XMax : XMax;
        YMax = rect.YMax > YMax ? rect.YMax : YMax;
    }
}
}

View File

@@ -69,7 +69,8 @@ namespace VAR.PdfTools
{ {
PdfParser parser = new PdfParser(_content); PdfParser parser = new PdfParser(_content);
_contentActions = parser.ParseContent(); _contentActions = parser.ParseContent();
}else }
else
{ {
_contentActions = new List<PdfContentAction>(); _contentActions = new List<PdfContentAction>();
} }

View File

@@ -0,0 +1,210 @@
using System;
using System.Drawing;
using System.Drawing.Drawing2D;
using System.Drawing.Imaging;
using VAR.PdfTools.Maths;
namespace VAR.PdfTools
{
public class PdfPageRenderer
{
private PdfDocumentPage _page;
private PdfTextExtractor _pdfTextExtractor;
private Rect _pageRect;
private int _pageWidth;
private int _pageHeight;
private int _scale = 10;
private const int MaxSize = 10000;
/// <summary>Gets the text extractor whose elements this renderer draws.</summary>
public PdfTextExtractor Extractor { get { return _pdfTextExtractor; } }
/// <summary>
/// Creates a renderer for <paramref name="page"/>, building a new text
/// extractor for it and computing the page dimensions and scale.
/// </summary>
/// <param name="page">Page to render.</param>
public PdfPageRenderer(PdfDocumentPage page)
{
    _pdfTextExtractor = new PdfTextExtractor(page);
    _page = page;

    InitPage();
}
/// <summary>
/// Creates a renderer that reuses an existing text extractor and the page
/// it exposes, then computes the page dimensions and scale.
/// </summary>
/// <param name="pdfTextExtractor">Extractor whose elements will be drawn.</param>
public PdfPageRenderer(PdfTextExtractor pdfTextExtractor)
{
    _page = pdfTextExtractor.Page;
    _pdfTextExtractor = pdfTextExtractor;

    InitPage();
}
/// <summary>
/// Caches the extractor's bounding rectangle, derives the integer page size
/// from it and lowers the scale factor until the scaled width and height both
/// fit within <c>MaxSize</c>, never going below 1.
/// </summary>
private void InitPage()
{
    Rect bounds = _pdfTextExtractor.GetRect();
    _pageRect = bounds;
    _pageWidth = (int)Math.Ceiling(bounds.XMax - bounds.XMin);
    _pageHeight = (int)Math.Ceiling(bounds.YMax - bounds.YMin);

    int scale = _scale;
    while (_pageWidth * scale > MaxSize)
    {
        scale--;
    }
    while (_pageHeight * scale > MaxSize)
    {
        scale--;
    }

    // A page larger than MaxSize drives the scale down to zero; clamp it to 1.
    _scale = Math.Max(scale, 1);
}
/// <summary>
/// Renders every text element of the page onto a new white bitmap.
/// </summary>
/// <returns>
/// A new bitmap of the scaled page size, or a blank 100x200 bitmap when the
/// extractor has no elements. The bitmap is not disposed here, so the caller
/// is responsible for disposing it.
/// </returns>
public Bitmap Render()
{
    if (_pdfTextExtractor.Elements.Count == 0)
    {
        // Nothing to render
        Bitmap emptyBmp = new Bitmap(100, 200, PixelFormat.Format32bppArgb);
        try
        {
            using (Graphics gcEmpty = Graphics.FromImage(emptyBmp))
            {
                gcEmpty.Clear(Color.White);
            }
            return emptyBmp;
        }
        catch
        {
            emptyBmp.Dispose();
            throw;
        }
    }

    // Prepare image
    Bitmap bmp = new Bitmap(_pageWidth * _scale, _pageHeight * _scale, PixelFormat.Format32bppArgb);
    try
    {
        // The using blocks release the Graphics and Pens even when drawing throws.
        using (Graphics gc = Graphics.FromImage(bmp))
        using (Pen penTextElem = new Pen(Color.Blue))
        using (Pen penCharElem = new Pen(Color.Navy))
        {
            gc.Clear(Color.White);

            // Draw text elements of the page
            foreach (PdfTextElement textElement in _pdfTextExtractor.Elements)
            {
                DrawTextElement(textElement, gc, penTextElem, penCharElem, _scale, _pageHeight, _pageRect.XMin, _pageRect.YMin, Brushes.Black);
            }
        }
        return bmp;
    }
    catch
    {
        // The bitmap never reaches the caller on failure, so release it here.
        bmp.Dispose();
        throw;
    }
}
public Bitmap RenderColumn(PdfTextElementColumn columnData, Bitmap bmp = null)
{
Graphics gc;
if (bmp == null)
{
bmp = new Bitmap(_pageWidth * _scale, _pageHeight * _scale, PixelFormat.Format32bppArgb);
gc = Graphics.FromImage(bmp);
gc.Clear(Color.White);
}
else
{
gc = Graphics.FromImage(bmp);
}
// Draw text elements of the column header
using (Pen penTextElem = new Pen(Color.Green))
using (Pen penCharElem = new Pen(Color.DarkGreen))
{
DrawTextElement(columnData.HeadTextElement, gc, penTextElem, penCharElem, _scale, _pageHeight, _pageRect.XMin, _pageRect.YMin, Brushes.Olive);
}
// Draw text elements of the column
using (Pen penTextElem = new Pen(Color.Red))
using (Pen penCharElem = new Pen(Color.DarkRed))
{
foreach (PdfTextElement textElement in columnData.Elements)
{
DrawTextElement(textElement, gc, penTextElem, penCharElem, _scale, _pageHeight, _pageRect.XMin, _pageRect.YMin, Brushes.OrangeRed);
}
}
// Draw column extents
using (Pen penColumn = new Pen(Color.Red))
{
float y = (float)(_pageRect.YMax - columnData.Y);
float x1 = (float)(columnData.X1 - _pageRect.XMin);
float x2 = (float)(columnData.X2 - _pageRect.XMin);
gc.DrawLine(penColumn, x1 * _scale, y * _scale, x2 * _scale, y * _scale);
gc.DrawLine(penColumn, x1 * _scale, y * _scale, x1 * _scale, _pageHeight * _scale);
gc.DrawLine(penColumn, x2 * _scale, y * _scale, x2 * _scale, _pageHeight * _scale);
}
gc.Dispose();
return bmp;
}
private static void DrawTextElement(PdfTextElement textElement, Graphics gc, Pen penTextElem, Pen penCharElem, int scale, int pageHeight, double pageXMin, double pageYMin, Brush brushText)
{
if (textElement == null) { return; }
double textElementX = textElement.GetX() - pageXMin;
double textElementY = textElement.GetY() - pageYMin;
double textElementWidth = textElement.VisibleWidth;
double textElementHeight = textElement.VisibleHeight;
string textElementText = textElement.VisibleText;
string textElementFontName = (textElement.Font == null ? string.Empty : textElement.Font.Name);
if (textElementHeight < 0.0001) { return; }
double textElementPageX = textElementX;
double textElementPageY = pageHeight - textElementY;
if (penTextElem != null)
{
DrawRoundedRectangle(gc, penTextElem,
(int)(textElementPageX * scale),
(int)(textElementPageY * scale),
(int)(textElementWidth * scale),
(int)(textElementHeight * scale),
5);
}
using (Font font = new Font("Arial", (int)(textElementHeight * scale), GraphicsUnit.Pixel))
{
foreach (PdfCharElement c in textElement.Characters)
{
gc.DrawString(c.Char,
font,
brushText,
(int)((textElementPageX + c.Displacement) * scale),
(int)(textElementPageY * scale));
if (penCharElem != null)
{
DrawRoundedRectangle(gc, penCharElem,
(int)((textElementPageX + c.Displacement) * scale),
(int)(textElementPageY * scale),
(int)(c.Width * scale),
(int)(textElementHeight * scale),
5);
}
}
}
}
public static GraphicsPath RoundedRect(int x, int y, int width, int height, int radius)
{
int diameter = radius * 2;
Size size = new Size(diameter, diameter);
Rectangle arc = new Rectangle(x, y, diameter, diameter);
GraphicsPath path = new GraphicsPath();
// top left arc
path.AddArc(arc, 180, 90);
// top right arc
arc.X = (x + width) - diameter;
path.AddArc(arc, 270, 90);
// bottom right arc
arc.Y = (y + height) - diameter;
path.AddArc(arc, 0, 90);
// bottom left arc
arc.X = x;
path.AddArc(arc, 90, 90);
path.CloseFigure();
return path;
}
public static void DrawRoundedRectangle(Graphics graphics, Pen pen, int x, int y, int width, int height, int cornerRadius)
{
if (graphics == null)
throw new ArgumentNullException("graphics");
if (pen == null)
throw new ArgumentNullException("pen");
using (GraphicsPath path = RoundedRect(x, y, width, height, cornerRadius))
{
graphics.DrawPath(pen, path);
}
}
}
}

View File

@@ -0,0 +1,149 @@
using System.Collections.Generic;
using System.Linq;
using VAR.PdfTools.Maths;
namespace VAR.PdfTools
{
/// <summary>
/// One character of a <see cref="PdfTextElement"/> together with its horizontal placement
/// inside that element.
/// </summary>
public struct PdfCharElement
{
/// <summary>Text of the character, stored as a string.</summary>
public string Char;
/// <summary>Horizontal offset of the character, measured from the X position of its text element.</summary>
public double Displacement;
/// <summary>Horizontal extent of the character, in the same units as <see cref="Displacement"/>.</summary>
public double Width;
}
/// <summary>
/// A run of text placed on a page: its font data, its transformation matrix, its text
/// and the placement of every character.
/// </summary>
public class PdfTextElement
{
    #region Properties

    /// <summary>Font of the element; may be null (sub-parts are created without a font).</summary>
    public PdfFont Font { get; set; }

    /// <summary>Font size of the element.</summary>
    public double FontSize { get; set; }

    /// <summary>Transformation matrix; the element position is read from its third column.</summary>
    public Matrix3x3 Matrix { get; set; }

    /// <summary>Text as it was read, before any conversion to visible text.</summary>
    public string RawText { get; set; }

    /// <summary>Text as shown to the reader.</summary>
    public string VisibleText { get; set; }

    /// <summary>Width of the visible text.</summary>
    public double VisibleWidth { get; set; }

    /// <summary>Height of the visible text.</summary>
    public double VisibleHeight { get; set; }

    /// <summary>Placement of every character of the element.</summary>
    public List<PdfCharElement> Characters { get; set; }

    /// <summary>Child elements associated with this one.</summary>
    public List<PdfTextElement> Childs { get; set; }

    #endregion

    #region Public methods

    /// <summary>Returns the X position of the element (matrix cell [0, 2]).</summary>
    public double GetX()
    {
        return Matrix.Matrix[0, 2];
    }

    /// <summary>Returns the Y position of the element (matrix cell [1, 2]).</summary>
    public double GetY()
    {
        return Matrix.Matrix[1, 2];
    }

    /// <summary>
    /// Creates a new element with the characters in the range
    /// [<paramref name="startIndex"/>, <paramref name="endIndex"/>).
    /// The new element is shifted so that it starts where its first character was, its
    /// character displacements are made relative to that point, and its width is
    /// recomputed from its last character. All the children are carried over.
    /// </summary>
    /// <param name="startIndex">Index of the first character to keep.</param>
    /// <param name="endIndex">Index after the last character to keep.</param>
    /// <returns>The new element. Its <see cref="Font"/> is null.</returns>
    /// <exception cref="System.ArgumentOutOfRangeException">
    /// The range is empty or falls outside <see cref="Characters"/>.
    /// </exception>
    public PdfTextElement SubPart(int startIndex, int endIndex)
    {
        // Fail up front with a clear message; an empty or inverted range used to
        // surface later as an opaque index error while reading the last character.
        if (startIndex < 0 || startIndex >= Characters.Count)
        {
            throw new System.ArgumentOutOfRangeException(nameof(startIndex), startIndex, "Start index must point to an existing character.");
        }
        if (endIndex <= startIndex || endIndex > Characters.Count)
        {
            throw new System.ArgumentOutOfRangeException(nameof(endIndex), endIndex, "End index must be greater than the start index and not past the last character.");
        }

        int length = endIndex - startIndex;

        // NOTE(review): the font is deliberately not copied here, as in the original code;
        // confirm whether sub-parts should inherit Font.
        // NOTE(review): assumes RawText and VisibleText are indexed like Characters - verify.
        PdfTextElement blockElem = new PdfTextElement
        {
            Font = null,
            FontSize = FontSize,
            Matrix = Matrix.Copy(),
            RawText = RawText.Substring(startIndex, length),
            VisibleText = VisibleText.Substring(startIndex, length),
            VisibleWidth = 0,
            VisibleHeight = VisibleHeight,
            Characters = new List<PdfCharElement>(length),
            Childs = new List<PdfTextElement>(),
        };

        // Move the new element to the position of its first character.
        double displacement = Characters[startIndex].Displacement;
        blockElem.Matrix.Matrix[0, 2] += displacement;

        for (int j = startIndex; j < endIndex; j++)
        {
            PdfCharElement source = Characters[j];
            blockElem.Characters.Add(new PdfCharElement
            {
                Char = source.Char,
                Displacement = source.Displacement - displacement,
                Width = source.Width,
            });
        }

        // The width reaches the right edge of the last character kept.
        PdfCharElement lastChar = blockElem.Characters[blockElem.Characters.Count - 1];
        blockElem.VisibleWidth = lastChar.Displacement + lastChar.Width;

        if (Childs != null)
        {
            blockElem.Childs.AddRange(Childs);
        }
        return blockElem;
    }

    /// <summary>
    /// Returns the average character width of the element.
    /// </summary>
    /// <remarks>
    /// Despite its name this has always been the average, not the maximum; callers
    /// use it as a spacing threshold, so the value is kept as is.
    /// </remarks>
    /// <returns>The average width, or 0 when the element has no characters.</returns>
    public double MaxWidth()
    {
        // Average() throws on an empty sequence; an element without characters has no width.
        if (Characters == null || Characters.Count == 0) { return 0; }
        return Characters.Average(c => c.Width);
    }

    /// <summary>
    /// Returns the bounding rectangle of the element: it starts at the element position
    /// and extends <see cref="VisibleWidth"/> to the right and
    /// <see cref="VisibleHeight"/> towards lower Y values.
    /// </summary>
    public Rect GetRect()
    {
        double x = GetX();
        double y = GetY();
        return new Rect
        {
            XMin = x,
            YMax = y,
            XMax = x + VisibleWidth,
            YMin = y - VisibleHeight,
        };
    }

    /// <summary>
    /// Returns the gap between the character at <paramref name="index"/> and the end of
    /// the character before it.
    /// </summary>
    /// <param name="index">Index of the character.</param>
    /// <returns>The gap, or 0 for the first character (or a negative index).</returns>
    public double GetCharacterPreviousSpacing(int index)
    {
        if (index <= 0) { return 0; }
        PdfCharElement previous = Characters[index - 1];
        double previousEnd = previous.Displacement + previous.Width;
        return Characters[index].Displacement - previousEnd;
    }

    /// <summary>
    /// Returns the gap between the end of the character at <paramref name="index"/> and
    /// the start of the character after it (the name says "preceding", but the
    /// following character is the one measured).
    /// </summary>
    /// <param name="index">Index of the character.</param>
    /// <returns>The gap, or 0 for the last character (or an index past it).</returns>
    public double GetCharacterPrecedingSpacing(int index)
    {
        if (index >= (Characters.Count - 1)) { return 0; }
        PdfCharElement current = Characters[index];
        double currentEnd = current.Displacement + current.Width;
        return Characters[index + 1].Displacement - currentEnd;
    }

    #endregion
}
/// <summary>
/// Result of a column search: the header element, the elements found under it and
/// the extents that were used to collect them.
/// </summary>
public class PdfTextElementColumn
{
    /// <summary>Element that heads the column; null for <see cref="Empty"/>.</summary>
    public PdfTextElement HeadTextElement { get; private set; }

    /// <summary>Elements that belong to the column. Never null.</summary>
    public IEnumerable<PdfTextElement> Elements { get; private set; }

    /// <summary>Y coordinate from which the column extends.</summary>
    public double Y { get; private set; }

    /// <summary>First horizontal extent of the column.</summary>
    public double X1 { get; private set; }

    /// <summary>Second horizontal extent of the column.</summary>
    public double X2 { get; private set; }

    /// <summary>Shared instance that represents "no column found": no header and no elements.</summary>
    public static PdfTextElementColumn Empty { get; } = new PdfTextElementColumn();

    /// <summary>Creates the empty column.</summary>
    private PdfTextElementColumn()
    {
        Elements = new List<PdfTextElement>();
    }

    /// <summary>
    /// Creates a column.
    /// </summary>
    /// <param name="head">Element that heads the column.</param>
    /// <param name="elements">Elements of the column; null is treated as no elements.</param>
    /// <param name="y">Y coordinate from which the column extends.</param>
    /// <param name="x1">First horizontal extent.</param>
    /// <param name="x2">Second horizontal extent.</param>
    public PdfTextElementColumn(PdfTextElement head, IEnumerable<PdfTextElement> elements, double y, double x1, double x2)
    {
        HeadTextElement = head;

        // Consumers iterate Elements without a null check, so keep it enumerable.
        Elements = elements ?? new List<PdfTextElement>();
        Y = y;
        X1 = x1;
        X2 = x2;
    }
}
}

View File

@@ -7,87 +7,6 @@ using VAR.PdfTools.PdfElements;
namespace VAR.PdfTools namespace VAR.PdfTools
{ {
public struct PdfCharElement
{
public string Char;
public double Displacement;
public double Width;
}
public class PdfTextElement
{
#region Properties
public PdfFont Font { get; set; }
public double FontSize { get; set; }
public Matrix3x3 Matrix { get; set; }
public string RawText { get; set; }
public string VisibleText { get; set; }
public double VisibleWidth { get; set; }
public double VisibleHeight { get; set; }
public List<PdfCharElement> Characters { get; set; }
public List<PdfTextElement> Childs { get; set; }
#endregion
#region Public methods
public double GetX()
{
return Matrix.Matrix[0, 2];
}
public double GetY()
{
return Matrix.Matrix[1, 2];
}
public PdfTextElement SubPart(int startIndex, int endIndex)
{
PdfTextElement blockElem = new PdfTextElement
{
Font = null,
FontSize = FontSize,
Matrix = Matrix.Copy(),
RawText = RawText.Substring(startIndex, endIndex - startIndex),
VisibleText = VisibleText.Substring(startIndex, endIndex - startIndex),
VisibleWidth = 0,
VisibleHeight = VisibleHeight,
Characters = new List<PdfCharElement>(),
Childs = new List<PdfTextElement>(),
};
double displacement = Characters[startIndex].Displacement;
blockElem.Matrix.Matrix[0, 2] += displacement;
for (int j = startIndex; j < endIndex; j++)
{
blockElem.Characters.Add(new PdfCharElement
{
Char = Characters[j].Char,
Displacement = Characters[j].Displacement - displacement,
Width = Characters[j].Width,
});
}
PdfCharElement lastChar = blockElem.Characters[blockElem.Characters.Count - 1];
blockElem.VisibleWidth = lastChar.Displacement + lastChar.Width;
foreach (PdfTextElement elem in Childs)
{
blockElem.Childs.Add(elem);
}
return blockElem;
}
#endregion
}
public class PdfTextExtractor public class PdfTextExtractor
{ {
#region Declarations #region Declarations
@@ -251,6 +170,14 @@ namespace VAR.PdfTools
return list; return list;
} }
private bool TextElementVerticalIntersection(PdfTextElement elem1, double elem2X1, double elem2X2)
{
double elem1X1 = elem1.GetX();
double elem1X2 = elem1.GetX() + elem1.VisibleWidth;
return elem1X2 >= elem2X1 && elem2X2 >= elem1X1;
}
private bool TextElementVerticalIntersection(PdfTextElement elem1, PdfTextElement elem2) private bool TextElementVerticalIntersection(PdfTextElement elem1, PdfTextElement elem2)
{ {
double elem1X1 = elem1.GetX(); double elem1X1 = elem1.GetX();
@@ -622,13 +549,22 @@ namespace VAR.PdfTools
while (i < _textElements.Count) while (i < _textElements.Count)
{ {
PdfTextElement neighbour = _textElements[i]; PdfTextElement neighbour = _textElements[i];
if (neighbour.Font != elem.Font || neighbour.FontSize != elem.FontSize)
{
i++;
continue;
}
double neighbourY = neighbour.GetY(); double neighbourY = neighbour.GetY();
if (Math.Abs(neighbourY - blockY) > 0.001) { i++; continue; } if (Math.Abs(neighbourY - blockY) > 0.001) { i++; continue; }
double maxWidth = neighbour.MaxWidth();
double neighbourXMin = neighbour.GetX(); double neighbourXMin = neighbour.GetX();
double neighbourXMax = neighbourXMin + neighbour.VisibleWidth; double neighbourXMax = neighbourXMin + neighbour.VisibleWidth;
double auxBlockXMin = blockXMin - (elem.FontSize * elem.Font.GetCharWidth('m')); double auxBlockXMin = blockXMin - maxWidth;
double auxBlockXMax = blockXMax + (elem.FontSize * elem.Font.GetCharWidth('m')); double auxBlockXMax = blockXMax + maxWidth;
if (auxBlockXMax >= neighbourXMin && neighbourXMax >= auxBlockXMin) if (auxBlockXMax >= neighbourXMin && neighbourXMax >= auxBlockXMin)
{ {
_textElements.Remove(neighbour); _textElements.Remove(neighbour);
@@ -641,7 +577,7 @@ namespace VAR.PdfTools
i++; i++;
} }
if(textElementNeighbours.Count == 0) if (textElementNeighbours.Count == 1)
{ {
textElementsCondensed.Add(elem); textElementsCondensed.Add(elem);
continue; continue;
@@ -694,12 +630,14 @@ namespace VAR.PdfTools
PdfTextElement elem = _textElements[0]; PdfTextElement elem = _textElements[0];
_textElements.Remove(elem); _textElements.Remove(elem);
double maxWidth = elem.MaxWidth();
int prevBreak = 0; int prevBreak = 0;
for (int i = 1; i < elem.Characters.Count; i++) for (int i = 1; i < elem.Characters.Count; i++)
{ {
double prevCharEnd = elem.Characters[i - 1].Displacement + elem.Characters[i - 1].Width; double prevCharEnd = elem.Characters[i - 1].Displacement + elem.Characters[i - 1].Width;
double charSeparation = elem.Characters[i].Displacement - prevCharEnd; double charSeparation = elem.Characters[i].Displacement - prevCharEnd;
if (charSeparation > (elem.Characters[i - 1].Width * 2)) if (charSeparation > maxWidth)
{ {
PdfTextElement partElem = elem.SubPart(prevBreak, i); PdfTextElement partElem = elem.SubPart(prevBreak, i);
textElementsSplitted.Add(partElem); textElementsSplitted.Add(partElem);
@@ -722,19 +660,26 @@ namespace VAR.PdfTools
#region Public methods #region Public methods
public List<string> GetColumn(string column) public Rect GetRect()
{ {
return GetColumn(column, true); Rect rect = null;
foreach (PdfTextElement textElement in _textElements)
{
Rect elementRect = textElement.GetRect();
if (rect == null) { rect = elementRect; }
rect.Add(elementRect);
}
return rect;
} }
public List<string> GetColumn(string column, bool fuzzy) public PdfTextElementColumn GetColumn(string column, bool fuzzy = true)
{ {
PdfTextElement columnHead = FindElementByText(column, fuzzy); PdfTextElement columnHead = FindElementByText(column, fuzzy);
if (columnHead == null) if (columnHead == null)
{ {
return new List<string>(); return PdfTextElementColumn.Empty;
} }
double headY = columnHead.GetY(); double headY = columnHead.GetY() - columnHead.VisibleHeight;
double headX1 = columnHead.GetX(); double headX1 = columnHead.GetX();
double headX2 = headX1 + columnHead.VisibleWidth; double headX2 = headX1 + columnHead.VisibleWidth;
@@ -762,14 +707,20 @@ namespace VAR.PdfTools
extentX2 = elemX1; extentX2 = elemX1;
} }
} }
} }
PdfTextElementColumn columnData = GetColumn(columnHead, headY, headX1, headX2, extentX1, extentX2);
return columnData;
}
public PdfTextElementColumn GetColumn(PdfTextElement columnHead, double headY, double headX1, double headX2, double extentX1, double extentX2)
{
// Get all the elements that intersects vertically, are down and sort results // Get all the elements that intersects vertically, are down and sort results
var columnDataRaw = new List<PdfTextElement>(); var columnDataRaw = new List<PdfTextElement>();
foreach (PdfTextElement elem in _textElements) foreach (PdfTextElement elem in _textElements)
{ {
if (TextElementVerticalIntersection(columnHead, elem) == false) { continue; } if (TextElementVerticalIntersection(elem, headX1, headX2) == false) { continue; }
// Only intems down the column // Only intems down the column
double elemY = elem.GetY(); double elemY = elem.GetY();
@@ -779,32 +730,94 @@ namespace VAR.PdfTools
} }
columnDataRaw = columnDataRaw.OrderByDescending(elem => elem.GetY()).ToList(); columnDataRaw = columnDataRaw.OrderByDescending(elem => elem.GetY()).ToList();
// Only items completelly inside extents, amd break on the first element outside // Only items completelly inside extents, try spliting big elements and break on big elements that can't be splitted
var columnData = new List<PdfTextElement>(); var columnElements = new List<PdfTextElement>();
foreach (PdfTextElement elem in columnDataRaw) foreach (PdfTextElement elem in columnDataRaw)
{ {
double elemX1 = elem.GetX(); double elemX1 = elem.GetX();
double elemX2 = elemX1 + elem.VisibleWidth; double elemX2 = elemX1 + elem.VisibleWidth;
if (elemX1 < extentX1 || elemX2 > extentX2) { break; }
columnData.Add(elem); // Add elements completely inside
if (elemX1 > extentX1 && elemX2 < extentX2)
{
columnElements.Add(elem);
continue;
} }
// Try to split elements intersecting extents of the column
double maxSpacing = elem.Characters.Average(c => c.Width) / 10;
int indexStart = 0;
int indexEnd = elem.Characters.Count - 1;
bool indexStartValid = true;
bool indexEndValid = true;
if (elemX1 < extentX1)
{
// Search best start
int index = 0;
double characterPosition = elemX1 + elem.Characters[index].Displacement;
while (characterPosition < extentX1 && index < (elem.Characters.Count - 1))
{
index++;
characterPosition = elemX1 + elem.Characters[index].Displacement;
}
double spacing = elem.GetCharacterPreviousSpacing(index);
while (spacing < maxSpacing && index < (elem.Characters.Count - 1))
{
index++;
spacing = elem.GetCharacterPreviousSpacing(index);
}
if (spacing < maxSpacing) { indexStartValid = false; }
indexStart = index;
}
if (elemX2 > extentX2)
{
// Search best end
int index = elem.Characters.Count - 1;
double characterPosition = elemX1 + elem.Characters[index].Displacement + elem.Characters[index].Width;
while (characterPosition > extentX2 && index > 0)
{
index--;
characterPosition = elemX1 + elem.Characters[index].Displacement + elem.Characters[index].Width;
}
double spacing = elem.GetCharacterPrecedingSpacing(index);
while (spacing < maxSpacing && index > 0)
{
index--;
spacing = elem.GetCharacterPrecedingSpacing(index);
}
if (spacing < maxSpacing) { indexEndValid = false; }
indexEnd = index;
}
// Break when there is no good split, spaning all extent
if (indexStartValid == false && indexEndValid == false) { break; }
// Continue when only one of the sides is invalid. (outside elements intersecting extents of the column)
if (indexStartValid == false || indexEndValid == false) { continue; }
// Add splitted element
columnElements.Add(elem.SubPart(indexStart, indexEnd + 1));
}
var columnData = new PdfTextElementColumn(columnHead, columnElements, headY, extentX1, extentX2);
return columnData;
}
public List<string> GetColumnAsStrings(string column, bool fuzzy = true)
{
PdfTextElementColumn columnData = GetColumn(column, fuzzy);
// Emit result // Emit result
var result = new List<string>(); var result = new List<string>();
foreach (PdfTextElement elem in columnData) foreach (PdfTextElement elem in columnData.Elements)
{ {
result.Add(elem.VisibleText); result.Add(elem.VisibleText);
} }
return result; return result;
} }
public string GetField(string field) public string GetFieldAsString(string field, bool fuzzy = true)
{
return GetField(field, true);
}
public string GetField(string field, bool fuzzy)
{ {
PdfTextElement fieldTitle = FindElementByText(field, fuzzy); PdfTextElement fieldTitle = FindElementByText(field, fuzzy);
if (fieldTitle == null) if (fieldTitle == null)
@@ -832,12 +845,7 @@ namespace VAR.PdfTools
return fieldData.OrderBy(elem => elem.GetX()).FirstOrDefault().VisibleText; return fieldData.OrderBy(elem => elem.GetX()).FirstOrDefault().VisibleText;
} }
public bool HasText(string text) public bool HasText(string text, bool fuzzy = true)
{
return HasText(text, true);
}
public bool HasText(string text, bool fuzzy)
{ {
List<PdfTextElement> list = FindElementsContainingText(text, fuzzy); List<PdfTextElement> list = FindElementsContainingText(text, fuzzy);
return (list.Count > 0); return (list.Count > 0);

View File

@@ -6,9 +6,9 @@ using System.Runtime.InteropServices;
[assembly: AssemblyConfiguration("")] [assembly: AssemblyConfiguration("")]
[assembly: AssemblyCompany("VAR")] [assembly: AssemblyCompany("VAR")]
[assembly: AssemblyProduct("VAR.PdfTools")] [assembly: AssemblyProduct("VAR.PdfTools")]
[assembly: AssemblyCopyright("Copyright © VAR 2016-2017")] [assembly: AssemblyCopyright("Copyright © VAR 2016-2019")]
[assembly: AssemblyTrademark("")] [assembly: AssemblyTrademark("")]
[assembly: AssemblyCulture("")] [assembly: AssemblyCulture("")]
[assembly: ComVisible(false)] [assembly: ComVisible(false)]
[assembly: Guid("eb7e003a-6a95-4002-809f-926c7c8a11e9")] [assembly: Guid("eb7e003a-6a95-4002-809f-926c7c8a11e9")]
[assembly: AssemblyVersion("1.5.0.*")] [assembly: AssemblyVersion("1.6.0.*")]

View File

@@ -22,6 +22,7 @@
<ErrorReport>prompt</ErrorReport> <ErrorReport>prompt</ErrorReport>
<WarningLevel>4</WarningLevel> <WarningLevel>4</WarningLevel>
<TargetFrameworkVersion>v4.6.1</TargetFrameworkVersion> <TargetFrameworkVersion>v4.6.1</TargetFrameworkVersion>
<LangVersion>6</LangVersion>
</PropertyGroup> </PropertyGroup>
<PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release .Net 4.6.1|AnyCPU' "> <PropertyGroup Condition=" '$(Configuration)|$(Platform)' == 'Release .Net 4.6.1|AnyCPU' ">
<DebugType>pdbonly</DebugType> <DebugType>pdbonly</DebugType>
@@ -54,6 +55,7 @@
<ItemGroup> <ItemGroup>
<Reference Include="System" /> <Reference Include="System" />
<Reference Include="System.Core" /> <Reference Include="System.Core" />
<Reference Include="System.Drawing" />
<Reference Include="System.Xml.Linq" /> <Reference Include="System.Xml.Linq" />
<Reference Include="System.Data.DataSetExtensions" /> <Reference Include="System.Data.DataSetExtensions" />
<Reference Include="System.Data" /> <Reference Include="System.Data" />
@@ -61,6 +63,7 @@
</ItemGroup> </ItemGroup>
<ItemGroup> <ItemGroup>
<Compile Include="Maths\Matrix3x3.cs" /> <Compile Include="Maths\Matrix3x3.cs" />
<Compile Include="Maths\Rect.cs" />
<Compile Include="PdfContentAction.cs" /> <Compile Include="PdfContentAction.cs" />
<Compile Include="PdfDocument.cs" /> <Compile Include="PdfDocument.cs" />
<Compile Include="PdfDocumentPage.cs" /> <Compile Include="PdfDocumentPage.cs" />
@@ -81,7 +84,9 @@
<Compile Include="PdfElements\PdfStream.cs" /> <Compile Include="PdfElements\PdfStream.cs" />
<Compile Include="PdfElements\PdfString.cs" /> <Compile Include="PdfElements\PdfString.cs" />
<Compile Include="PdfParser.cs" /> <Compile Include="PdfParser.cs" />
<Compile Include="PdfPageRenderer.cs" />
<Compile Include="PdfStandar14FontMetrics.cs" /> <Compile Include="PdfStandar14FontMetrics.cs" />
<Compile Include="PdfTextElement.cs" />
<Compile Include="PdfTextExtractor.cs" /> <Compile Include="PdfTextExtractor.cs" />
<Compile Include="Properties\AssemblyInfo.cs" /> <Compile Include="Properties\AssemblyInfo.cs" />
<Compile Include="Maths\Vector3D.cs" /> <Compile Include="Maths\Vector3D.cs" />