diff --git a/VAR.PdfTools.Workbench/FrmPdfInfo.Designer.cs b/VAR.PdfTools.Workbench/FrmPdfInfo.Designer.cs index e80f53a..744b721 100644 --- a/VAR.PdfTools.Workbench/FrmPdfInfo.Designer.cs +++ b/VAR.PdfTools.Workbench/FrmPdfInfo.Designer.cs @@ -48,6 +48,7 @@ this.txtField3 = new System.Windows.Forms.TextBox(); this.btnGetColumn3 = new System.Windows.Forms.Button(); this.txtPages = new System.Windows.Forms.TextBox(); + this.chkRender = new System.Windows.Forms.CheckBox(); this.SuspendLayout(); // // lblOutputs @@ -120,7 +121,7 @@ // this.btnGetColumn1.Location = new System.Drawing.Point(292, 51); this.btnGetColumn1.Name = "btnGetColumn1"; - this.btnGetColumn1.Size = new System.Drawing.Size(60, 23); + this.btnGetColumn1.Size = new System.Drawing.Size(69, 23); this.btnGetColumn1.TabIndex = 12; this.btnGetColumn1.Text = "GetColumn"; this.btnGetColumn1.UseVisualStyleBackColor = true; @@ -195,7 +196,7 @@ // this.btnGetColumn2.Location = new System.Drawing.Point(292, 80); this.btnGetColumn2.Name = "btnGetColumn2"; - this.btnGetColumn2.Size = new System.Drawing.Size(60, 23); + this.btnGetColumn2.Size = new System.Drawing.Size(69, 23); this.btnGetColumn2.TabIndex = 19; this.btnGetColumn2.Text = "GetColumn"; this.btnGetColumn2.UseVisualStyleBackColor = true; @@ -232,7 +233,7 @@ // this.btnGetColumn3.Location = new System.Drawing.Point(292, 109); this.btnGetColumn3.Name = "btnGetColumn3"; - this.btnGetColumn3.Size = new System.Drawing.Size(60, 23); + this.btnGetColumn3.Size = new System.Drawing.Size(69, 23); this.btnGetColumn3.TabIndex = 23; this.btnGetColumn3.Text = "GetColumn"; this.btnGetColumn3.UseVisualStyleBackColor = true; @@ -246,11 +247,22 @@ this.txtPages.Size = new System.Drawing.Size(75, 20); this.txtPages.TabIndex = 27; // + // chkRender + // + this.chkRender.AutoSize = true; + this.chkRender.Location = new System.Drawing.Point(292, 138); + this.chkRender.Name = "chkRender"; + this.chkRender.Size = new System.Drawing.Size(61, 17); + this.chkRender.TabIndex = 28; 
+ this.chkRender.Text = "Render"; + this.chkRender.UseVisualStyleBackColor = true; + // // FrmPdfInfo // this.AutoScaleDimensions = new System.Drawing.SizeF(6F, 13F); this.AutoScaleMode = System.Windows.Forms.AutoScaleMode.Font; this.ClientSize = new System.Drawing.Size(484, 461); + this.Controls.Add(this.chkRender); this.Controls.Add(this.txtPages); this.Controls.Add(this.btnHasText3); this.Controls.Add(this.btnGetField3); @@ -302,5 +314,6 @@ private System.Windows.Forms.TextBox txtField3; private System.Windows.Forms.Button btnGetColumn3; private System.Windows.Forms.TextBox txtPages; + private System.Windows.Forms.CheckBox chkRender; } } \ No newline at end of file diff --git a/VAR.PdfTools.Workbench/FrmPdfInfo.cs b/VAR.PdfTools.Workbench/FrmPdfInfo.cs index 76743fb..bb34ce9 100644 --- a/VAR.PdfTools.Workbench/FrmPdfInfo.cs +++ b/VAR.PdfTools.Workbench/FrmPdfInfo.cs @@ -1,7 +1,6 @@ using System; using System.Collections.Generic; using System.Drawing; -using System.Drawing.Drawing2D; using System.Drawing.Imaging; using System.IO; using System.Linq; @@ -214,10 +213,11 @@ namespace VAR.PdfTools.Workbench if (part.Contains("-")) { string[] range = part.Split('-'); - if (range.Length == 2) { + if (range.Length == 2) + { int pageStart; int pageEnd; - if(int.TryParse(range[0], out pageStart) && int.TryParse(range[1], out pageEnd)) + if (int.TryParse(range[0], out pageStart) && int.TryParse(range[1], out pageEnd)) { listPages.AddRange(Enumerable.Range(pageStart, (pageEnd - pageStart) + 1)); } @@ -226,13 +226,13 @@ namespace VAR.PdfTools.Workbench else { int pageNum; - if(int.TryParse(part, out pageNum)) + if (int.TryParse(part, out pageNum)) { listPages.Add(pageNum); } } } - if(listPages.Count == 0) + if (listPages.Count == 0) { listPages.AddRange(Enumerable.Range(1, maxPages)); } @@ -294,18 +294,29 @@ namespace VAR.PdfTools.Workbench } PdfDocument doc = PdfDocument.Load(pdfPath); + string baseDocumentPath = Path.GetDirectoryName(txtPdfPath.Text); + string 
baseDocumentFilename = Path.GetFileNameWithoutExtension(txtPdfPath.Text); IEnumerable selectedPages = GetSelectedPages(doc.Pages.Count); - var columnData = new List(); + var columns = new List(); int pageNum = 0; foreach (PdfDocumentPage page in doc.Pages) { pageNum++; if (selectedPages.Contains(pageNum) == false) { continue; } PdfTextExtractor extractor = new PdfTextExtractor(page); - columnData.AddRange(extractor.GetColumnAsStrings(column)); + PdfTextElementColumn columnData = extractor.GetColumn(column); + if (chkRender.Checked) + { + var pdfPageRenderer = new PdfPageRenderer(extractor); + Bitmap bmp = pdfPageRenderer.Render(); + pdfPageRenderer.RenderColumn(columnData, bmp); + string fileName = Path.Combine(baseDocumentPath, string.Format("{0}_{1:0000}.png", baseDocumentFilename, pageNum)); + bmp.Save(fileName, ImageFormat.Png); + } + columns.AddRange(columnData.Elements.Select(t => t.VisibleText)); } - txtOutput.Lines = columnData.ToArray(); + txtOutput.Lines = columns.ToArray(); } private void btnRender_Click(object sender, EventArgs e) @@ -325,19 +336,19 @@ namespace VAR.PdfTools.Workbench lines.Add(string.Format("Number of Pages : {0}", doc.Pages.Count)); IEnumerable selectedPages = GetSelectedPages(doc.Pages.Count); - int pageNumber = 0; + int pageNum = 0; foreach (PdfDocumentPage page in doc.Pages) { - pageNumber++; - if (selectedPages.Contains(pageNumber) == false) { continue; } + pageNum++; + if (selectedPages.Contains(pageNum) == false) { continue; } PdfPageRenderer pdfPageRenderer = new PdfPageRenderer(page); Bitmap bmp = pdfPageRenderer.Render(); - lines.Add(string.Format("Page {0:0000} TextElements : {1}", pageNumber, pdfPageRenderer.Extractor.Elements.Count)); + lines.Add(string.Format("Page {0:0000} TextElements : {1}", pageNum, pdfPageRenderer.Extractor.Elements.Count)); // Save image to disk - string fileName = Path.Combine(baseDocumentPath, string.Format("{0}_{1:0000}.png", baseDocumentFilename, pageNumber)); + string fileName = 
Path.Combine(baseDocumentPath, string.Format("{0}_{1:0000}.png", baseDocumentFilename, pageNum)); bmp.Save(fileName, ImageFormat.Png); } diff --git a/VAR.PdfTools/PdfPageRenderer.cs b/VAR.PdfTools/PdfPageRenderer.cs index 5dd6604..d661831 100644 --- a/VAR.PdfTools/PdfPageRenderer.cs +++ b/VAR.PdfTools/PdfPageRenderer.cs @@ -10,7 +10,14 @@ namespace VAR.PdfTools { private PdfDocumentPage _page; private PdfTextExtractor _pdfTextExtractor; - + private Rect _pageRect; + private int _pageWidth; + private int _pageHeight; + private int _scale = 10; + + private const int MaxSize = 10000; + + public PdfTextExtractor Extractor { get { return _pdfTextExtractor; } } public PdfPageRenderer(PdfDocumentPage page) @@ -19,45 +26,102 @@ namespace VAR.PdfTools _pdfTextExtractor = new PdfTextExtractor(_page); + CalculateLayout(); } + public PdfPageRenderer(PdfTextExtractor pdfTextExtractor) + { + _pdfTextExtractor = pdfTextExtractor; + _page = pdfTextExtractor.Page; + CalculateLayout(); + } + + // Computes _pageRect, _pageWidth, _pageHeight and _scale; called from both constructors so Render() and RenderColumn() never run with an unset layout. + private void CalculateLayout() + { + // Calculate page size and scale + _pageRect = _pdfTextExtractor.GetRect(); + // NOTE(review): GetRect() starts from a null rect; presumably it stays null for a page without text elements (confirm). The layout is left unset in that case. + if (_pageRect == null) { return; } + _pageWidth = (int)Math.Ceiling(_pageRect.XMax - _pageRect.XMin); + _pageHeight = (int)Math.Ceiling(_pageRect.YMax - _pageRect.YMin); + while ((_pageWidth * _scale) > MaxSize) { _scale--; } + while ((_pageHeight * _scale) > MaxSize) { _scale--; } + if (_scale <= 0) { _scale = 1; } + } + public Bitmap Render() { - if (_pdfTextExtractor.Elements.Count == 0) + if (_pageRect == null || _pdfTextExtractor.Elements.Count == 0) { // Nothing to render Bitmap emptyBmp = new Bitmap(100, 200, PixelFormat.Format32bppArgb); - using (Graphics gc = Graphics.FromImage(emptyBmp)) - gc.Clear(Color.White); + using (Graphics gcEmpty = Graphics.FromImage(emptyBmp)) + gcEmpty.Clear(Color.White); return emptyBmp; } - // Calculate page size and scale - Rect pageRect = _pdfTextExtractor.GetRect(); - int pageWidth = (int)Math.Ceiling(pageRect.XMax - pageRect.XMin); - int pageHeight = (int)Math.Ceiling(pageRect.YMax - pageRect.YMin); - int Scale = 10; - int MaxSize = 10000; - while ((pageWidth * Scale) > MaxSize) { Scale--; } - while ((pageHeight * Scale) 
> MaxSize && Scale > 1) { Scale--; } - if (Scale <= 0) { Scale = 1; } + // Prepare image + Bitmap bmp = new Bitmap(_pageWidth * _scale, _pageHeight * _scale, PixelFormat.Format32bppArgb); + Graphics gc = Graphics.FromImage(bmp); + gc.Clear(Color.White); - // Draw page image - Bitmap bmp = new Bitmap(pageWidth * Scale, pageHeight * Scale, PixelFormat.Format32bppArgb); - using (Graphics gc = Graphics.FromImage(bmp)) + // Draw text elements of the page using (Pen penTextElem = new Pen(Color.Blue)) using (Pen penCharElem = new Pen(Color.Navy)) { - gc.Clear(Color.White); - - // Draw text elements foreach (PdfTextElement textElement in _pdfTextExtractor.Elements) { - DrawTextElement(textElement, gc, penTextElem, penCharElem, Scale, pageHeight, pageRect.XMin, pageRect.YMin, Brushes.Black); + DrawTextElement(textElement, gc, penTextElem, penCharElem, _scale, _pageHeight, _pageRect.XMin, _pageRect.YMin, Brushes.Black); } } + + gc.Dispose(); return bmp; } - private static void DrawTextElement(PdfTextElement textElement, Graphics gc, Pen penTextElem, Pen penCharElem, int Scale, int pageHeight, double pageXMin, double pageYMin, Brush brushText) + public Bitmap RenderColumn(PdfTextElementColumn columnData, Bitmap bmp = null) + { + // Nothing to overlay when the column carries no elements or the page has no layout: hand back the caller's bitmap, or a plain Render() result when none was supplied. + if (columnData == null || columnData.Elements == null || _pageRect == null) { return bmp ?? Render(); } + + Graphics gc; + if (bmp == null) + { + bmp = new Bitmap(_pageWidth * _scale, _pageHeight * _scale, PixelFormat.Format32bppArgb); + gc = Graphics.FromImage(bmp); + gc.Clear(Color.White); + } + else + { + gc = Graphics.FromImage(bmp); + } + + // Draw text elements of the column + using (Pen penTextElem = new Pen(Color.Red)) + using (Pen penCharElem = new Pen(Color.DarkRed)) + { + foreach (PdfTextElement textElement in columnData.Elements) + { + DrawTextElement(textElement, gc, penTextElem, penCharElem, _scale, _pageHeight, _pageRect.XMin, _pageRect.YMin, Brushes.OrangeRed); + } + } + + // Draw column extents + using (Pen penColumn = new Pen(Color.Red)) + { + float y = (float)(_pageRect.YMax - columnData.Y); + float x1 = (float)(columnData.X1 - 
_pageRect.XMin); + float x2 = (float)(columnData.X2 - _pageRect.XMin); + + gc.DrawLine(penColumn, x1 * _scale, y * _scale, x2 * _scale, y * _scale); + gc.DrawLine(penColumn, x1 * _scale, y * _scale, x1 * _scale, _pageHeight * _scale); + gc.DrawLine(penColumn, x2 * _scale, y * _scale, x2 * _scale, _pageHeight * _scale); + } + + gc.Dispose(); + return bmp; + } + + private static void DrawTextElement(PdfTextElement textElement, Graphics gc, Pen penTextElem, Pen penCharElem, int scale, int pageHeight, double pageXMin, double pageYMin, Brush brushText) { double textElementX = textElement.GetX() - pageXMin; double textElementY = textElement.GetY() - pageYMin; @@ -74,29 +128,29 @@ namespace VAR.PdfTools if (penTextElem != null) { DrawRoundedRectangle(gc, penTextElem, - (int)(textElementPageX * Scale), - (int)(textElementPageY * Scale), - (int)(textElementWidth * Scale), - (int)(textElementHeight * Scale), + (int)(textElementPageX * scale), + (int)(textElementPageY * scale), + (int)(textElementWidth * scale), + (int)(textElementHeight * scale), 5); } - using (Font font = new Font("Arial", (int)(textElementHeight * Scale), GraphicsUnit.Pixel)) + using (Font font = new Font("Arial", (int)(textElementHeight * scale), GraphicsUnit.Pixel)) { foreach (PdfCharElement c in textElement.Characters) { gc.DrawString(c.Char, font, brushText, - (int)((textElementPageX + c.Displacement) * Scale), - (int)(textElementPageY * Scale)); + (int)((textElementPageX + c.Displacement) * scale), + (int)(textElementPageY * scale)); if (penCharElem != null) { DrawRoundedRectangle(gc, penCharElem, - (int)((textElementPageX + c.Displacement) * Scale), - (int)(textElementPageY * Scale), - (int)(c.Width * Scale), - (int)(textElementHeight * Scale), + (int)((textElementPageX + c.Displacement) * scale), + (int)(textElementPageY * scale), + (int)(c.Width * scale), + (int)(textElementHeight * scale), 5); } } diff --git a/VAR.PdfTools/PdfTextElement.cs b/VAR.PdfTools/PdfTextElement.cs index c780b62..7fe8f11 
100644 --- a/VAR.PdfTools/PdfTextElement.cs +++ b/VAR.PdfTools/PdfTextElement.cs @@ -103,4 +103,29 @@ namespace VAR.PdfTools #endregion } + public class PdfTextElementColumn + { + public PdfTextElement HeadTextElement { get; private set; } + + public IEnumerable Elements { get; private set; } + + public double Y { get; private set; } + + public double X1 { get; private set; } + public double X2 { get; private set; } + + public static PdfTextElementColumn Empty { get; } = new PdfTextElementColumn(); + + // Backs Empty: Elements is an empty sequence rather than null, so a "column not found" result can be enumerated safely. + private PdfTextElementColumn() { Elements = new PdfTextElement[0]; } + + public PdfTextElementColumn(PdfTextElement head, IEnumerable elements, double y, double x1, double x2) + { + HeadTextElement = head; + Elements = elements ?? new PdfTextElement[0]; + Y = y; + X1 = x1; + X2 = x2; + } + } } diff --git a/VAR.PdfTools/PdfTextExtractor.cs b/VAR.PdfTools/PdfTextExtractor.cs index a065031..f720b45 100644 --- a/VAR.PdfTools/PdfTextExtractor.cs +++ b/VAR.PdfTools/PdfTextExtractor.cs @@ -651,7 +651,7 @@ namespace VAR.PdfTools #endregion #region Public methods - + public Rect GetRect() { Rect rect = null; @@ -664,12 +664,12 @@ namespace VAR.PdfTools return rect; } - public List GetColumnAsStrings(string column, bool fuzzy =true) + public PdfTextElementColumn GetColumn(string column, bool fuzzy = true) { PdfTextElement columnHead = FindElementByText(column, fuzzy); if (columnHead == null) { - return new List(); + return PdfTextElementColumn.Empty; } double headY = columnHead.GetY(); double headX1 = columnHead.GetX(); @@ -717,25 +717,34 @@ namespace VAR.PdfTools columnDataRaw = columnDataRaw.OrderByDescending(elem => elem.GetY()).ToList(); // Only items completelly inside extents, and break on the first element outside - var columnData = new List(); + var columnElements = new List(); foreach (PdfTextElement elem in columnDataRaw) { double elemX1 = elem.GetX(); double elemX2 = elemX1 + elem.VisibleWidth; if (elemX1 < extentX1 || elemX2 > extentX2) { break; } - columnData.Add(elem); + columnElements.Add(elem); } + var columnData = new 
PdfTextElementColumn(columnHead, columnElements, headY, extentX1, extentX2); + + return columnData; + } + + public List GetColumnAsStrings(string column, bool fuzzy = true) + { + PdfTextElementColumn columnData = GetColumn(column, fuzzy); + // Emit result var result = new List(); - foreach (PdfTextElement elem in columnData) + foreach (PdfTextElement elem in columnData.Elements) { result.Add(elem.VisibleText); } return result; } - + public string GetFieldAsString(string field, bool fuzzy = true) { PdfTextElement fieldTitle = FindElementByText(field, fuzzy); @@ -763,7 +772,7 @@ namespace VAR.PdfTools return fieldData.OrderBy(elem => elem.GetX()).FirstOrDefault().VisibleText; } - + public bool HasText(string text, bool fuzzy = true) { List list = FindElementsContainingText(text, fuzzy);