commit 973bfe3fe2da3b81b1c4ad7cb8fd21706bf457b5 Author: Valeriano A.R Date: Fri Jun 17 22:24:08 2016 +0200 Basic PDF file structure parsing diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..9b2de5b --- /dev/null +++ b/.gitignore @@ -0,0 +1,27 @@ +#ignorar miniaturas creadas por windows +Thumbs.db +#Ignorar archivos construidos por Visual Studio +*.obj +*.exe +*.pdb +*.user +*.aps +*.pch +*.vspscc +*_i.c +*_p.c +*.ncb +*.suo +*.tlb +*.tlh +*.bak +*.cache +*.ilk +*.log +[Bb]in +[Dd]ebug*/ +*.lib +*.sbr +obj/ +[Rr]elease*/ +_ReSharper*/ diff --git a/LICENSE.txt b/LICENSE.txt new file mode 100644 index 0000000..318247e --- /dev/null +++ b/LICENSE.txt @@ -0,0 +1,21 @@ +The MIT License (MIT) + +Copyright (c) 2014-2015 Valeriano Alfonso Rodriguez + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
\ No newline at end of file diff --git a/VAR.PdfTools.Workbench/App.config b/VAR.PdfTools.Workbench/App.config new file mode 100644 index 0000000..3c82f18 --- /dev/null +++ b/VAR.PdfTools.Workbench/App.config @@ -0,0 +1,18 @@ + + + + +
+ + + + + + + + + + + + + \ No newline at end of file diff --git a/VAR.PdfTools.Workbench/FrmPdfInfo.Designer.cs b/VAR.PdfTools.Workbench/FrmPdfInfo.Designer.cs new file mode 100644 index 0000000..85b5e05 --- /dev/null +++ b/VAR.PdfTools.Workbench/FrmPdfInfo.Designer.cs @@ -0,0 +1,134 @@ +namespace VAR.PdfTools.Workbench +{ + partial class FrmPdfInfo + { + /// + /// Required designer variable. + /// + private System.ComponentModel.IContainer components = null; + + /// + /// Clean up any resources being used. + /// + /// true if managed resources should be disposed; otherwise, false. + protected override void Dispose(bool disposing) + { + if (disposing && (components != null)) + { + components.Dispose(); + } + base.Dispose(disposing); + } + + #region Windows Form Designer generated code + + /// + /// Required method for Designer support - do not modify + /// the contents of this method with the code editor. + /// + private void InitializeComponent() + { + this.lblOutputs = new System.Windows.Forms.Label(); + this.lblInputs = new System.Windows.Forms.Label(); + this.btnBrowse = new System.Windows.Forms.Button(); + this.txtPdfPath = new System.Windows.Forms.TextBox(); + this.txtOutput = new System.Windows.Forms.TextBox(); + this.btnProcess = new System.Windows.Forms.Button(); + this.SuspendLayout(); + // + // lblOutputs + // + this.lblOutputs.AutoSize = true; + this.lblOutputs.Font = new System.Drawing.Font("Microsoft Sans Serif", 8.25F, System.Drawing.FontStyle.Bold, System.Drawing.GraphicsUnit.Point, ((byte)(0))); + this.lblOutputs.Location = new System.Drawing.Point(12, 48); + this.lblOutputs.Name = "lblOutputs"; + this.lblOutputs.Size = new System.Drawing.Size(51, 13); + this.lblOutputs.TabIndex = 11; + this.lblOutputs.Text = "Outputs"; + // + // lblInputs + // + this.lblInputs.AutoSize = true; + this.lblInputs.Font = new System.Drawing.Font("Microsoft Sans Serif", 8.25F, System.Drawing.FontStyle.Bold, System.Drawing.GraphicsUnit.Point, ((byte)(0))); + 
this.lblInputs.Location = new System.Drawing.Point(12, 9); + this.lblInputs.Name = "lblInputs"; + this.lblInputs.Size = new System.Drawing.Size(42, 13); + this.lblInputs.TabIndex = 10; + this.lblInputs.Text = "Inputs"; + // + // btnBrowse + // + this.btnBrowse.Anchor = ((System.Windows.Forms.AnchorStyles)((System.Windows.Forms.AnchorStyles.Top | System.Windows.Forms.AnchorStyles.Right))); + this.btnBrowse.Location = new System.Drawing.Point(323, 22); + this.btnBrowse.Name = "btnBrowse"; + this.btnBrowse.Size = new System.Drawing.Size(75, 23); + this.btnBrowse.TabIndex = 9; + this.btnBrowse.Text = "Browse"; + this.btnBrowse.UseVisualStyleBackColor = true; + this.btnBrowse.Click += new System.EventHandler(this.btnBrowse_Click); + // + // txtPdfPath + // + this.txtPdfPath.Anchor = ((System.Windows.Forms.AnchorStyles)(((System.Windows.Forms.AnchorStyles.Top | System.Windows.Forms.AnchorStyles.Left) + | System.Windows.Forms.AnchorStyles.Right))); + this.txtPdfPath.Location = new System.Drawing.Point(15, 25); + this.txtPdfPath.Name = "txtPdfPath"; + this.txtPdfPath.Size = new System.Drawing.Size(302, 20); + this.txtPdfPath.TabIndex = 8; + // + // txtOutput + // + this.txtOutput.AcceptsReturn = true; + this.txtOutput.AcceptsTab = true; + this.txtOutput.Anchor = ((System.Windows.Forms.AnchorStyles)((((System.Windows.Forms.AnchorStyles.Top | System.Windows.Forms.AnchorStyles.Bottom) + | System.Windows.Forms.AnchorStyles.Left) + | System.Windows.Forms.AnchorStyles.Right))); + this.txtOutput.Font = new System.Drawing.Font("Consolas", 8.25F, System.Drawing.FontStyle.Regular, System.Drawing.GraphicsUnit.Point, ((byte)(0))); + this.txtOutput.Location = new System.Drawing.Point(15, 64); + this.txtOutput.Multiline = true; + this.txtOutput.Name = "txtOutput"; + this.txtOutput.ScrollBars = System.Windows.Forms.ScrollBars.Vertical; + this.txtOutput.Size = new System.Drawing.Size(464, 355); + this.txtOutput.TabIndex = 7; + // + // btnProcess + // + this.btnProcess.Anchor = 
((System.Windows.Forms.AnchorStyles)((System.Windows.Forms.AnchorStyles.Top | System.Windows.Forms.AnchorStyles.Right))); + this.btnProcess.Location = new System.Drawing.Point(404, 22); + this.btnProcess.Name = "btnProcess"; + this.btnProcess.Size = new System.Drawing.Size(75, 23); + this.btnProcess.TabIndex = 6; + this.btnProcess.Text = "Process"; + this.btnProcess.UseVisualStyleBackColor = true; + this.btnProcess.Click += new System.EventHandler(this.btnProcess_Click); + // + // FrmPdfInfo + // + this.AutoScaleDimensions = new System.Drawing.SizeF(6F, 13F); + this.AutoScaleMode = System.Windows.Forms.AutoScaleMode.Font; + this.ClientSize = new System.Drawing.Size(491, 431); + this.Controls.Add(this.lblOutputs); + this.Controls.Add(this.lblInputs); + this.Controls.Add(this.btnBrowse); + this.Controls.Add(this.txtPdfPath); + this.Controls.Add(this.txtOutput); + this.Controls.Add(this.btnProcess); + this.Name = "FrmPdfInfo"; + this.Text = "PdfInfo"; + this.FormClosing += new System.Windows.Forms.FormClosingEventHandler(this.FrmPdfInfo_FormClosing); + this.Load += new System.EventHandler(this.FrmPdfInfo_Load); + this.ResumeLayout(false); + this.PerformLayout(); + + } + + #endregion + + private System.Windows.Forms.Label lblOutputs; + private System.Windows.Forms.Label lblInputs; + private System.Windows.Forms.Button btnBrowse; + private System.Windows.Forms.TextBox txtPdfPath; + private System.Windows.Forms.TextBox txtOutput; + private System.Windows.Forms.Button btnProcess; + } +} \ No newline at end of file diff --git a/VAR.PdfTools.Workbench/FrmPdfInfo.cs b/VAR.PdfTools.Workbench/FrmPdfInfo.cs new file mode 100644 index 0000000..1b4f55b --- /dev/null +++ b/VAR.PdfTools.Workbench/FrmPdfInfo.cs @@ -0,0 +1,81 @@ +using System; +using System.Collections.Generic; +using System.ComponentModel; +using System.Data; +using System.Drawing; +using System.Linq; +using System.Text; +using System.Threading.Tasks; +using System.Windows.Forms; + +namespace VAR.PdfTools.Workbench 
+{
+    /// <summary>
+    /// Workbench form: lets the user pick a PDF file and prints basic statistics
+    /// (object count, stream count, stream filter names) about it.
+    /// </summary>
+    public partial class FrmPdfInfo : Form
+    {
+        public FrmPdfInfo()
+        {
+            InitializeComponent();
+        }
+
+        /// <summary>Restores the path used in the previous session.</summary>
+        private void FrmPdfInfo_Load(object sender, EventArgs e)
+        {
+            txtPdfPath.Text = Properties.Settings.Default.LastPdfPath;
+        }
+
+        /// <summary>Persists the current path so the next session starts with it.</summary>
+        private void FrmPdfInfo_FormClosing(object sender, FormClosingEventArgs e)
+        {
+            Properties.Settings.Default.LastPdfPath = txtPdfPath.Text;
+            Properties.Settings.Default.Save();
+        }
+
+        /// <summary>Shows a file picker and copies the chosen file name into the path box.</summary>
+        private void btnBrowse_Click(object sender, EventArgs e)
+        {
+            // The dialog is disposable; release it as soon as the choice is read.
+            using (var dlgFile = new OpenFileDialog())
+            {
+                if (dlgFile.ShowDialog() == DialogResult.OK)
+                {
+                    txtPdfPath.Text = dlgFile.FileName;
+                }
+            }
+        }
+
+        /// <summary>
+        /// Loads the PDF named in the path box and writes the object count, the stream
+        /// count and the distinct stream filter names to the output box.
+        /// </summary>
+        private void btnProcess_Click(object sender, EventArgs e)
+        {
+            if (System.IO.File.Exists(txtPdfPath.Text) == false)
+            {
+                MessageBox.Show("File does not exist");
+                return;
+            }
+
+            PdfDocument doc;
+            try
+            {
+                doc = PdfDocument.Load(txtPdfPath.Text);
+            }
+            catch (Exception ex)
+            {
+                // The parser reports malformed input with plain Exception, so this UI
+                // boundary has to catch broadly to show the message instead of crashing.
+                MessageBox.Show(string.Format("Error loading PDF: {0}", ex.Message));
+                return;
+            }
+
+            int nObjects = doc.Objects.Count;
+            // OfType also skips objects whose Data could not be parsed (null).
+            List<PdfStream> streams = doc.Objects
+                .Select(obj => obj.Data)
+                .OfType<PdfStream>()
+                .ToList();
+            int nStreams = streams.Count;
+
+            var streamFilters = new List<string>();
+            foreach (PdfStream stream in streams)
+            {
+                IPdfElement filter;
+                if (stream.Dictionary == null ||
+                    stream.Dictionary.Values.TryGetValue("Filter", out filter) == false)
+                {
+                    continue;
+                }
+
+                // "Filter" is either a single name or an array of names (a filter chain).
+                PdfArray filterArray = filter as PdfArray;
+                if (filterArray != null)
+                {
+                    streamFilters.AddRange(filterArray.Values.OfType<PdfName>().Select(name => name.Value));
+                    continue;
+                }
+                PdfName filterName = filter as PdfName;
+                if (filterName != null)
+                {
+                    streamFilters.Add(filterName.Value);
+                }
+            }
+
+            txtOutput.Lines = new string[]
+            {
+                string.Format("Number of Objects: {0}", nObjects),
+                string.Format("Number of Streams: {0}", nStreams),
+                string.Format("Stream Filters: {0}", string.Join(", ", streamFilters.Distinct().ToArray())),
+            };
+
+        }
+
+    }
+}
diff --git a/VAR.PdfTools.Workbench/FrmPdfInfo.resx b/VAR.PdfTools.Workbench/FrmPdfInfo.resx
new file mode 100644
index 0000000..1af7de1
--- /dev/null
+++ b/VAR.PdfTools.Workbench/FrmPdfInfo.resx
@@ -0,0 +1,120 @@
+
+
+
+
+
+
+
+
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + text/microsoft-resx + + + 2.0 + + + System.Resources.ResXResourceReader, System.Windows.Forms, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089 + + + System.Resources.ResXResourceWriter, System.Windows.Forms, Version=4.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089 + + \ No newline at end of file diff --git a/VAR.PdfTools.Workbench/Program.cs b/VAR.PdfTools.Workbench/Program.cs new file mode 100644 index 0000000..dd6601a --- /dev/null +++ b/VAR.PdfTools.Workbench/Program.cs @@ -0,0 +1,22 @@ +using System; +using System.Collections.Generic; +using System.Linq; +using System.Threading.Tasks; +using System.Windows.Forms; + +namespace VAR.PdfTools.Workbench +{ + static class Program + { + /// + /// The main entry point for the application. + /// + [STAThread] + static void Main() + { + Application.EnableVisualStyles(); + Application.SetCompatibleTextRenderingDefault(false); + Application.Run(new FrmPdfInfo()); + } + } +} diff --git a/VAR.PdfTools.Workbench/Properties/AssemblyInfo.cs b/VAR.PdfTools.Workbench/Properties/AssemblyInfo.cs new file mode 100644 index 0000000..561f40c --- /dev/null +++ b/VAR.PdfTools.Workbench/Properties/AssemblyInfo.cs @@ -0,0 +1,14 @@ +using System.Reflection; +using System.Runtime.InteropServices; + +[assembly: AssemblyTitle("VAR.PdfTools.Workbench")] +[assembly: AssemblyDescription("PdfTools Workbench")] +[assembly: AssemblyConfiguration("")] +[assembly: AssemblyCompany("VAR")] +[assembly: AssemblyProduct("VAR.PdfTools.Workbench")] +[assembly: AssemblyCopyright("Copyright © VAR 2016")] +[assembly: AssemblyTrademark("")] +[assembly: AssemblyCulture("")] +[assembly: ComVisible(false)] +[assembly: Guid("a5825d8e-9f81-49e0-b610-8ae5e46d02ea")] +[assembly: AssemblyVersion("1.0.*")] diff --git a/VAR.PdfTools.Workbench/Properties/Resources.Designer.cs b/VAR.PdfTools.Workbench/Properties/Resources.Designer.cs new file mode 100644 index 
0000000..678e5cf --- /dev/null +++ b/VAR.PdfTools.Workbench/Properties/Resources.Designer.cs @@ -0,0 +1,71 @@ +//------------------------------------------------------------------------------ +// +// This code was generated by a tool. +// Runtime Version:4.0.30319.42000 +// +// Changes to this file may cause incorrect behavior and will be lost if +// the code is regenerated. +// +//------------------------------------------------------------------------------ + +namespace VAR.PdfTools.Workbench.Properties +{ + + + /// + /// A strongly-typed resource class, for looking up localized strings, etc. + /// + // This class was auto-generated by the StronglyTypedResourceBuilder + // class via a tool like ResGen or Visual Studio. + // To add or remove a member, edit your .ResX file then rerun ResGen + // with the /str option, or rebuild your VS project. + [global::System.CodeDom.Compiler.GeneratedCodeAttribute("System.Resources.Tools.StronglyTypedResourceBuilder", "4.0.0.0")] + [global::System.Diagnostics.DebuggerNonUserCodeAttribute()] + [global::System.Runtime.CompilerServices.CompilerGeneratedAttribute()] + internal class Resources + { + + private static global::System.Resources.ResourceManager resourceMan; + + private static global::System.Globalization.CultureInfo resourceCulture; + + [global::System.Diagnostics.CodeAnalysis.SuppressMessageAttribute("Microsoft.Performance", "CA1811:AvoidUncalledPrivateCode")] + internal Resources() + { + } + + /// + /// Returns the cached ResourceManager instance used by this class. 
+ /// + [global::System.ComponentModel.EditorBrowsableAttribute(global::System.ComponentModel.EditorBrowsableState.Advanced)] + internal static global::System.Resources.ResourceManager ResourceManager + { + get + { + if ((resourceMan == null)) + { + global::System.Resources.ResourceManager temp = new global::System.Resources.ResourceManager("VAR.PdfTools.Workbench.Properties.Resources", typeof(Resources).Assembly); + resourceMan = temp; + } + return resourceMan; + } + } + + /// + /// Overrides the current thread's CurrentUICulture property for all + /// resource lookups using this strongly typed resource class. + /// + [global::System.ComponentModel.EditorBrowsableAttribute(global::System.ComponentModel.EditorBrowsableState.Advanced)] + internal static global::System.Globalization.CultureInfo Culture + { + get + { + return resourceCulture; + } + set + { + resourceCulture = value; + } + } + } +} diff --git a/VAR.PdfTools.Workbench/Properties/Resources.resx b/VAR.PdfTools.Workbench/Properties/Resources.resx new file mode 100644 index 0000000..af7dbeb --- /dev/null +++ b/VAR.PdfTools.Workbench/Properties/Resources.resx @@ -0,0 +1,117 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + text/microsoft-resx + + + 2.0 + + + System.Resources.ResXResourceReader, System.Windows.Forms, Version=2.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089 + + + System.Resources.ResXResourceWriter, System.Windows.Forms, Version=2.0.0.0, Culture=neutral, PublicKeyToken=b77a5c561934e089 + + \ No newline at end of file diff --git a/VAR.PdfTools.Workbench/Properties/Settings.Designer.cs b/VAR.PdfTools.Workbench/Properties/Settings.Designer.cs new file mode 100644 index 0000000..15e7fc3 --- /dev/null +++ b/VAR.PdfTools.Workbench/Properties/Settings.Designer.cs @@ -0,0 +1,38 @@ +//------------------------------------------------------------------------------ +// +// This code was generated by a tool. 
+// Runtime Version:4.0.30319.42000 +// +// Changes to this file may cause incorrect behavior and will be lost if +// the code is regenerated. +// +//------------------------------------------------------------------------------ + +namespace VAR.PdfTools.Workbench.Properties { + + + [global::System.Runtime.CompilerServices.CompilerGeneratedAttribute()] + [global::System.CodeDom.Compiler.GeneratedCodeAttribute("Microsoft.VisualStudio.Editors.SettingsDesigner.SettingsSingleFileGenerator", "14.0.0.0")] + internal sealed partial class Settings : global::System.Configuration.ApplicationSettingsBase { + + private static Settings defaultInstance = ((Settings)(global::System.Configuration.ApplicationSettingsBase.Synchronized(new Settings()))); + + public static Settings Default { + get { + return defaultInstance; + } + } + + [global::System.Configuration.UserScopedSettingAttribute()] + [global::System.Diagnostics.DebuggerNonUserCodeAttribute()] + [global::System.Configuration.DefaultSettingValueAttribute("")] + public string LastPdfPath { + get { + return ((string)(this["LastPdfPath"])); + } + set { + this["LastPdfPath"] = value; + } + } + } +} diff --git a/VAR.PdfTools.Workbench/Properties/Settings.settings b/VAR.PdfTools.Workbench/Properties/Settings.settings new file mode 100644 index 0000000..3a7c7fe --- /dev/null +++ b/VAR.PdfTools.Workbench/Properties/Settings.settings @@ -0,0 +1,9 @@ + + + + + + + + + \ No newline at end of file diff --git a/VAR.PdfTools.Workbench/VAR.PdfTools.Workbench.csproj b/VAR.PdfTools.Workbench/VAR.PdfTools.Workbench.csproj new file mode 100644 index 0000000..4270173 --- /dev/null +++ b/VAR.PdfTools.Workbench/VAR.PdfTools.Workbench.csproj @@ -0,0 +1,96 @@ + + + + + Debug + AnyCPU + {A5825D8E-9F81-49E0-B610-8AE5E46D02EA} + WinExe + Properties + VAR.PdfTools.Workbench + VAR.PdfTools.Workbench + v4.6.1 + 512 + true + + + AnyCPU + true + full + false + bin\Debug\ + DEBUG;TRACE + prompt + 4 + + + AnyCPU + pdbonly + true + bin\Release\ + TRACE + 
prompt + 4 + + + + + + + + + + + + + + + + + Form + + + FrmPdfInfo.cs + + + + + FrmPdfInfo.cs + + + ResXFileCodeGenerator + Resources.Designer.cs + Designer + + + True + Resources.resx + + + SettingsSingleFileGenerator + Settings.Designer.cs + + + True + Settings.settings + True + + + + + + + + {eb7e003a-6a95-4002-809f-926c7c8a11e9} + VAR.PdfTools + + + + + \ No newline at end of file diff --git a/VAR.PdfTools.sln b/VAR.PdfTools.sln new file mode 100644 index 0000000..bf33ddc --- /dev/null +++ b/VAR.PdfTools.sln @@ -0,0 +1,28 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 14 +VisualStudioVersion = 14.0.25123.0 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "VAR.PdfTools", "VAR.PdfTools\VAR.PdfTools.csproj", "{EB7E003A-6A95-4002-809F-926C7C8A11E9}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "VAR.PdfTools.Workbench", "VAR.PdfTools.Workbench\VAR.PdfTools.Workbench.csproj", "{A5825D8E-9F81-49E0-B610-8AE5E46D02EA}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Any CPU = Debug|Any CPU + Release|Any CPU = Release|Any CPU + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {EB7E003A-6A95-4002-809F-926C7C8A11E9}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {EB7E003A-6A95-4002-809F-926C7C8A11E9}.Debug|Any CPU.Build.0 = Debug|Any CPU + {EB7E003A-6A95-4002-809F-926C7C8A11E9}.Release|Any CPU.ActiveCfg = Release|Any CPU + {EB7E003A-6A95-4002-809F-926C7C8A11E9}.Release|Any CPU.Build.0 = Release|Any CPU + {A5825D8E-9F81-49E0-B610-8AE5E46D02EA}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {A5825D8E-9F81-49E0-B610-8AE5E46D02EA}.Debug|Any CPU.Build.0 = Debug|Any CPU + {A5825D8E-9F81-49E0-B610-8AE5E46D02EA}.Release|Any CPU.ActiveCfg = Release|Any CPU + {A5825D8E-9F81-49E0-B610-8AE5E46D02EA}.Release|Any CPU.Build.0 = Release|Any CPU + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + 
HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
diff --git a/VAR.PdfTools/PdfDocument.cs b/VAR.PdfTools/PdfDocument.cs
new file mode 100644
index 0000000..c843a10
--- /dev/null
+++ b/VAR.PdfTools/PdfDocument.cs
@@ -0,0 +1,52 @@
+using System.Collections.Generic;
+using System.IO;
+
+namespace VAR.PdfTools
+{
+    /// <summary>
+    /// In-memory list of the objects found in a PDF file.
+    /// Instances are created through <see cref="Load(string)"/> or <see cref="Load(byte[])"/>.
+    /// </summary>
+    public class PdfDocument
+    {
+        #region Declarations
+
+        private List<PdfObject> _objects = new List<PdfObject>();
+
+        #endregion
+
+        #region Properties
+
+        /// <summary>Objects in the order the parser returned them.</summary>
+        public List<PdfObject> Objects { get { return _objects; } }
+
+        #endregion
+
+        #region Life cycle
+
+        private PdfDocument() { }
+
+        #endregion
+
+        #region Public methods
+
+        /// <summary>Reads the whole file into memory and parses it.</summary>
+        /// <param name="filename">Path of the PDF file to read.</param>
+        public static PdfDocument Load(string filename)
+        {
+            byte[] fileBytes = File.ReadAllBytes(filename);
+            return Load(fileBytes);
+        }
+
+        /// <summary>
+        /// Parses objects from <paramref name="data"/> until the parser reports end of stream.
+        /// Null results from the parser are skipped.
+        /// </summary>
+        /// <param name="data">Raw bytes of a PDF file.</param>
+        public static PdfDocument Load(byte[] data)
+        {
+            var parser = new PdfParser(data);
+            var doc = new PdfDocument();
+            do
+            {
+                PdfObject obj = parser.ParseObject();
+                if (obj != null)
+                {
+                    doc.Objects.Add(obj);
+                }
+            } while (parser.IsEndOfStream() == false);
+            return doc;
+        }
+
+        #endregion
+
+    }
+}
diff --git a/VAR.PdfTools/PdfElements.cs b/VAR.PdfTools/PdfElements.cs
new file mode 100644
index 0000000..6a021ff
--- /dev/null
+++ b/VAR.PdfTools/PdfElements.cs
@@ -0,0 +1,96 @@
+using System.Collections.Generic;
+
+namespace VAR.PdfTools
+{
+    /// <summary>Discriminator reported by every <see cref="IPdfElement"/>.</summary>
+    public enum PdfElementTypes
+    {
+        Undefined,
+        Boolean,
+        Integer,
+        Real,
+        String,
+        Name,
+        Array,
+        Dictionary,
+        Null,
+        ObjectReference,
+        Object,
+        Stream,
+    };
+
+    /// <summary>Common contract of all parsed PDF values.</summary>
+    public interface IPdfElement
+    {
+        PdfElementTypes Type { get; }
+    }
+
+    /// <summary>Boolean value ("true" / "false").</summary>
+    public class PdfBoolean : IPdfElement
+    {
+        public PdfElementTypes Type { get; private set; } = PdfElementTypes.Boolean;
+        public bool Value { get; set; }
+    }
+
+    /// <summary>Integer number.</summary>
+    public class PdfInteger : IPdfElement
+    {
+        public PdfElementTypes Type { get; private set; } = PdfElementTypes.Integer;
+        public int Value { get; set; }
+    }
+
+    /// <summary>Real (decimal point) number.</summary>
+    public class PdfReal : IPdfElement
+    {
+        public PdfElementTypes Type { get; private set; } = PdfElementTypes.Real;
+        public double Value { get; set; }
+    }
+
+    /// <summary>Literal or hexadecimal string, one char per decoded byte.</summary>
+    public class PdfString : IPdfElement
+    {
+        public PdfElementTypes Type { get; private set; } = PdfElementTypes.String;
+        public string Value { get; set; }
+    }
+
+    /// <summary>Name object; <see cref="Value"/> excludes the leading '/'.</summary>
+    public class PdfName : IPdfElement
+    {
+        public PdfElementTypes Type { get; private set; } = PdfElementTypes.Name;
+        public string Value { get; set; }
+    }
+
+    /// <summary>Ordered list of elements.</summary>
+    public class PdfArray : IPdfElement
+    {
+        public PdfElementTypes Type { get; private set; } = PdfElementTypes.Array;
+        private List<IPdfElement> _values = new List<IPdfElement>();
+        public List<IPdfElement> Values { get { return _values; } }
+    }
+
+    /// <summary>Dictionary keyed by name (without the leading '/').</summary>
+    public class PdfDictionary : IPdfElement
+    {
+        public PdfElementTypes Type { get; private set; } = PdfElementTypes.Dictionary;
+        private Dictionary<string, IPdfElement> _values = new Dictionary<string, IPdfElement>();
+        public Dictionary<string, IPdfElement> Values { get { return _values; } }
+    }
+
+    /// <summary>The "null" object.</summary>
+    public class PdfNull : IPdfElement
+    {
+        public PdfElementTypes Type { get; private set; } = PdfElementTypes.Null;
+    }
+
+    /// <summary>Indirect reference ("id generation R") to another object.</summary>
+    public class PdfObjectReference : IPdfElement
+    {
+        public PdfElementTypes Type { get; private set; } = PdfElementTypes.ObjectReference;
+        public int ObjectID { get; set; }
+        public int ObjectGeneration { get; set; }
+    }
+
+    /// <summary>Stream: a dictionary plus its raw (still encoded) bytes.</summary>
+    public class PdfStream : IPdfElement
+    {
+        public PdfElementTypes Type { get; private set; } = PdfElementTypes.Stream;
+        public PdfDictionary Dictionary { get; set; }
+        public byte[] Data { get; set; }
+    }
+
+    /// <summary>Indirect object ("id generation obj ... endobj") wrapping one element.</summary>
+    public class PdfObject : IPdfElement
+    {
+        public PdfElementTypes Type { get; private set; } = PdfElementTypes.Object;
+        public int ObjectID { get; set; }
+        public int ObjectGeneration { get; set; }
+        public IPdfElement Data { get; set; }
+    }
+}
diff --git a/VAR.PdfTools/PdfParser.cs b/VAR.PdfTools/PdfParser.cs
new file mode 100644
index 0000000..70a328d
--- /dev/null
+++ b/VAR.PdfTools/PdfParser.cs
@@ -0,0 +1,748 @@
+using System;
+using System.IO;
+using System.Text;
+
+namespace VAR.PdfTools
+{
+    public class PdfParser
+    {
+        #region Declarations
+
+        private byte[] _stream = null;
+        private long _streamPosition = 0;
+
+
#endregion
+
+        #region Creator
+
+        /// <summary>
+        /// Creates a parser positioned at the first byte of <paramref name="stream"/>.
+        /// The array is referenced, not copied.
+        /// </summary>
+        /// <param name="stream">Raw bytes to parse.</param>
+        /// <exception cref="ArgumentNullException"><paramref name="stream"/> is null.</exception>
+        public PdfParser(byte[] stream)
+        {
+            if (stream == null)
+            {
+                // Fail here rather than with a NullReferenceException on the first read.
+                throw new ArgumentNullException(nameof(stream));
+            }
+            _stream = stream;
+        }
+
+        #endregion
+
+        #region Utility methods
+
+        /// <summary>
+        /// Value (0-15) of an ASCII hexadecimal digit, upper or lower case.
+        /// Any other byte yields 0.
+        /// </summary>
+        private int ByteHexValue(byte character)
+        {
+            if (character >= 0x30 && character <= 0x39)
+            {
+                return (character - 0x30);
+            }
+            if (character >= 0x41 && character <= 0x46)
+            {
+                return 10 + (character - 0x41);
+            }
+            if (character >= 0x61 && character <= 0x66)
+            {
+                return 10 + (character - 0x61);
+            }
+            return 0;
+        }
+
+        /// <summary>
+        /// Returns the next <paramref name="length"/> bytes and advances the position by
+        /// that amount. Bytes requested past the end of the buffer are returned as 0,
+        /// matching what <see cref="PeekChar"/> reports there.
+        /// </summary>
+        /// <param name="length">Number of bytes to read; zero or negative yields an empty array.</param>
+        private byte[] GetRawData(long length)
+        {
+            if (length <= 0)
+            {
+                return new byte[0];
+            }
+
+            var rawData = new byte[length];
+            long available = Math.Max(0, Math.Min(length, _stream.Length - _streamPosition));
+            if (available > 0)
+            {
+                // One block copy instead of writing byte by byte through a MemoryStream.
+                Array.Copy(_stream, _streamPosition, rawData, 0, available);
+            }
+            _streamPosition += length;
+            return rawData;
+        }
+
+        /// <summary>Byte at the current position, or 0 when the position is past the end.</summary>
+        private byte PeekChar()
+        {
+            if (_streamPosition >= _stream.Length)
+            {
+                return 0;
+            }
+            return _stream[_streamPosition];
+        }
+
+        /// <summary>Byte right after the current one, or 0 past the end.</summary>
+        private byte PeekNextChar()
+        {
+            return PeekNextChar(1);
+        }
+
+        /// <summary>Byte <paramref name="offset"/> positions ahead of the current one, or 0 past the end.</summary>
+        private byte PeekNextChar(int offset)
+        {
+            if ((_streamPosition + offset) >= _stream.Length)
+            {
+                return 0;
+            }
+            return _stream[_streamPosition + offset];
+        }
+
+        /// <summary>
+        /// Advances one byte. Returns false when the new position is at or past the end;
+        /// the position is incremented in either case.
+        /// </summary>
+        private bool NextChar()
+        {
+            _streamPosition++;
+            if (_streamPosition >= _stream.Length)
+            {
+                return false;
+            }
+            return true;
+        }
+
+        /// <summary>True for NUL, HT, LF, FF, CR and space.</summary>
+        private bool IsWhitespace(byte character)
+        {
+            if (
+                character == 0x00 || // NULL
+                character == 0x09 || // Horizontal Tab (HT)
+                character == 0x0A || // Line Feed (LF)
+                character == 0x0C || // Form Feed (FF)
+                character == 0x0D || // Carriage Return (CR)
+                character == 0x20 || // Space (SP)
+                false)
+            {
+                return true;
+            }
+            return false;
+        }
+
+        /// <summary>True for LF and CR.</summary>
+        private bool IsEndOfLine(byte character)
+        {
+            if (
+                character == 0x0A || // Line Feed (LF)
+                character == 0x0D || // Carriage Return (CR)
+                false)
+            {
+                return true;
+            }
+            return false;
+        }
+
+        /// <summary>
+        /// True for the bracket-style delimiters and '%'. Note that '/' is not included;
+        /// ParseName tests for it separately.
+        /// </summary>
+        private bool IsDelimiter(byte character)
+        {
+            if (
+                character == '(' ||
+                character == ')' ||
+                character == '<' ||
+                character == '>' ||
+                character == '[' ||
+                character == ']' ||
+                character == '}' ||
+
character == '{' ||
+                character == '%' ||
+                false)
+            {
+                return true;
+            }
+            return false;
+        }
+
+        /// <summary>True for the ASCII digits '0'-'9'.</summary>
+        private bool IsDigit(byte character)
+        {
+            if (character >= '0' && character <= '9')
+            {
+                return true;
+            }
+            return false;
+        }
+
+        /// <summary>Advances past consecutive whitespace bytes, stopping at the end of the buffer.</summary>
+        private void SkipWhitespace()
+        {
+            while (IsWhitespace(PeekChar()))
+            {
+                _streamPosition++;
+                if (_streamPosition >= _stream.Length)
+                {
+                    // EOS
+                    break;
+                }
+            }
+        }
+
+        /// <summary>
+        /// Consumes one end-of-line marker at the current position: CR, LF or CR LF.
+        /// Does nothing when the current byte is not an end-of-line byte.
+        /// </summary>
+        private void SkipEndOfLine()
+        {
+            byte lineFeed = 0x0A;
+            byte carriageReturn = 0x0D;
+            if (PeekChar() == carriageReturn)
+            {
+                NextChar();
+                // CR LF counts as a single marker. PeekChar returns 0 past the end,
+                // so no explicit bounds check is needed.
+                if (PeekChar() == lineFeed)
+                {
+                    NextChar();
+                }
+                return;
+            }
+            // A lone LF must be consumed, and any other byte must be left alone.
+            if (PeekChar() == lineFeed)
+            {
+                NextChar();
+            }
+        }
+
+        /// <summary>Advances to the next CR/LF byte (not consumed) or to the end of the buffer.</summary>
+        private void SkipToEndOfLine()
+        {
+            while (IsEndOfLine(PeekChar()) == false)
+            {
+                if (NextChar() == false)
+                {
+                    break;
+                }
+            }
+        }
+
+        /// <summary>When positioned on '%', skips the rest of the line and its end-of-line marker.</summary>
+        private void SkipComment()
+        {
+            if (PeekChar() != '%') { return; }
+            SkipToEndOfLine();
+            SkipEndOfLine();
+        }
+
+        /// <summary>
+        /// When positioned on '%', returns the comment text (without '%' and without the
+        /// end-of-line marker) and consumes the line; otherwise returns an empty string.
+        /// </summary>
+        private string ParseComment()
+        {
+            if (PeekChar() != '%') { return string.Empty; }
+            NextChar();
+            StringBuilder sbComment = new StringBuilder();
+            while (IsEndOfLine(PeekChar()) == false)
+            {
+                sbComment.Append((char)PeekChar());
+                if (NextChar() == false)
+                {
+                    break;
+                }
+            }
+            SkipEndOfLine();
+            return sbComment.ToString();
+        }
+
+        /// <summary>Skips whitespace and returns the run of letters that follows (possibly empty).</summary>
+        private string ParseToken()
+        {
+            SkipWhitespace();
+            StringBuilder sbToken = new StringBuilder();
+            do
+            {
+                byte character = PeekChar();
+                if (char.IsLetter((char)character) == false)
+                {
+                    break;
+                }
+                sbToken.Append((char)character);
+                NextChar();
+            } while (IsEndOfStream() == false);
+            return sbToken.ToString();
+        }
+
+        /// <summary>
+        /// Parses the element starting at the current position, choosing the concrete
+        /// parser from the first one or two bytes. Returns null when no parser applies.
+        /// </summary>
+        private IPdfElement ParseElement()
+        {
+            IPdfElement obj = null;
+            byte character = PeekChar();
+            byte nextCharacter = PeekNextChar();
+
+            if (character == 't' || character == 'f')
+            {
+                obj = ParseBoolean();
+            }
+            else if (character == 'n')
+            {
+                obj = ParseNull();
+            }
+            else if (IsDigit(character) || character == '+' || character == 
'-' || character == '.')
+            {
+                obj = ParseNumberOrReference();
+            }
+            else if (character == '(' || (character == '<' && nextCharacter != '<'))
+            {
+                obj = ParseString();
+            }
+            else if (character == '/')
+            {
+                obj = ParseName();
+            }
+            else if (character == '[')
+            {
+                obj = ParseArray();
+            }
+            else if (character == '<' && nextCharacter == '<')
+            {
+                obj = ParseDictionary();
+            }
+            return obj;
+        }
+
+        /// <summary>
+        /// Parses the keyword "true" or "false" at the current position and consumes it.
+        /// Returns null, without consuming anything, when neither keyword is present.
+        /// </summary>
+        private PdfBoolean ParseBoolean()
+        {
+            if (
+                PeekNextChar(0) == 't' &&
+                PeekNextChar(1) == 'r' &&
+                PeekNextChar(2) == 'u' &&
+                PeekNextChar(3) == 'e'
+                )
+            {
+                NextChar();
+                NextChar();
+                NextChar();
+                NextChar();
+                return new PdfBoolean { Value = true };
+            }
+
+            if (
+                PeekNextChar(0) == 'f' &&
+                PeekNextChar(1) == 'a' &&
+                PeekNextChar(2) == 'l' &&
+                PeekNextChar(3) == 's' &&
+                PeekNextChar(4) == 'e'
+                )
+            {
+                NextChar();
+                NextChar();
+                NextChar();
+                NextChar();
+                NextChar();
+                return new PdfBoolean { Value = false };
+            }
+
+            return null;
+        }
+
+        /// <summary>
+        /// Parses the keyword "null" at the current position and consumes it.
+        /// Returns null (the C# reference), without consuming anything, when the keyword is absent.
+        /// </summary>
+        private PdfNull ParseNull()
+        {
+            // Each letter is checked at its own offset.
+            if (
+                PeekNextChar(0) == 'n' &&
+                PeekNextChar(1) == 'u' &&
+                PeekNextChar(2) == 'l' &&
+                PeekNextChar(3) == 'l'
+                )
+            {
+                NextChar();
+                NextChar();
+                NextChar();
+                NextChar();
+                return new PdfNull();
+            }
+
+            return null;
+        }
+
+        /// <summary>
+        /// Parses an optionally signed number. Without a decimal point the result is a
+        /// <see cref="PdfInteger"/>; with exactly one it is a <see cref="PdfReal"/>.
+        /// Throws when no digits or more than one decimal point are found.
+        /// </summary>
+        private IPdfElement ParseNumber()
+        {
+            long startPosition = _streamPosition;
+            bool valid = false;
+            int dotCount = 0;
+            StringBuilder sbNumber = new StringBuilder();
+            if (PeekChar() == '-')
+            {
+                sbNumber.Append('-');
+                NextChar();
+            }
+            else if (PeekChar() == '+')
+            {
+                NextChar();
+            }
+            while (IsDigit(PeekChar()) || PeekChar() == '.')
+            {
+                if (PeekChar() == '.')
+                {
+                    dotCount++;
+                }
+                sbNumber.Append((char)PeekChar());
+                NextChar();
+                valid = true;
+            }
+            if (valid && dotCount <= 1)
+            {
+                // PDF numbers always use '.' as the decimal separator, so conversion must
+                // not depend on the culture of the current thread.
+                if (dotCount == 0)
+                {
+                    PdfInteger obj = new PdfInteger();
+                    obj.Value = Convert.ToInt32(sbNumber.ToString(), System.Globalization.CultureInfo.InvariantCulture);
+                    return obj;
+                }
+                if (dotCount == 1)
+                {
+                    PdfReal obj = new PdfReal();
+                    obj.Value = Convert.ToDouble(sbNumber.ToString(), System.Globalization.CultureInfo.InvariantCulture);
+                    return obj;
+                }
+            }
+            throw new 
Exception(string.Format("Expected number at {0}, found \"{1}\"", startPosition, sbNumber.ToString()));
+        }
+
+        /// <summary>
+        /// Parses a number and, when it is an integer followed by a second integer and
+        /// the letter 'R', returns a <see cref="PdfObjectReference"/> instead.
+        /// When the look-ahead does not form a reference the position is restored to
+        /// just after the first number and that number is returned.
+        /// </summary>
+        private IPdfElement ParseNumberOrReference()
+        {
+            IPdfElement obj = ParseNumber();
+            PdfInteger number = obj as PdfInteger;
+            if (number == null)
+            {
+                return obj;
+            }
+
+            // Try to get an indirect object reference
+            long streamPosition = _streamPosition;
+            SkipWhitespace();
+            if (IsDigit(PeekChar()) == false)
+            {
+                _streamPosition = streamPosition;
+                return obj;
+            }
+            // A real number here cannot be a generation; treat it as "not a reference"
+            // instead of failing on an invalid cast.
+            PdfInteger objectGeneration = ParseNumber() as PdfInteger;
+            SkipWhitespace();
+            if (objectGeneration == null || PeekChar() != 'R')
+            {
+                _streamPosition = streamPosition;
+                return obj;
+            }
+            NextChar();
+            PdfObjectReference objRef = new PdfObjectReference();
+            objRef.ObjectID = number.Value;
+            objRef.ObjectGeneration = objectGeneration.Value;
+            return objRef;
+        }
+
+        /// <summary>
+        /// Parses a literal string "( ... )" or a hexadecimal string "&lt; ... &gt;" at the
+        /// current position, consuming the closing delimiter. Each decoded byte becomes
+        /// one char of the result. Returns null when positioned on neither '(' nor '&lt;'.
+        /// </summary>
+        private PdfString ParseString()
+        {
+            if (PeekChar() == '(')
+            {
+                StringBuilder sbString = new StringBuilder();
+                int depth = 1;
+                NextChar();
+                do
+                {
+                    byte character = PeekChar();
+                    if (character == '(')
+                    {
+                        depth++;
+                        sbString.Append((char)character);
+                    }
+                    else if (character == ')')
+                    {
+                        depth--;
+                        if (depth == 0)
+                        {
+                            // The outermost ')' is the terminator: consume it, do not store it.
+                            NextChar();
+                            break;
+                        }
+                        sbString.Append((char)character);
+                    }
+                    else if (character == '\\')
+                    {
+                        if (NextChar() == false)
+                        {
+                            throw new Exception("Unexpected end of string and file");
+                        }
+                        character = PeekChar();
+                        if (character == 'n')
+                        {
+                            sbString.Append('\n');
+                        }
+                        else if (character == 'r')
+                        {
+                            sbString.Append('\r');
+                        }
+                        else if (character == 't')
+                        {
+                            sbString.Append('\t');
+                        }
+                        else if (character == 'b')
+                        {
+                            sbString.Append('\b');
+                        }
+                        else if (character == 'f')
+                        {
+                            sbString.Append('\f');
+                        }
+                        else if (character == '(')
+                        {
+                            sbString.Append('(');
+                        }
+                        else if (character == ')')
+                        {
+                            sbString.Append(')');
+                        }
+                        else if (character == '\\')
+                        {
+                            sbString.Append('\\');
+                        }
+                        else if (IsEndOfLine(character))
+                        {
+                            // Backslash + end-of-line is a line continuation: nothing is
+                            // stored. CR LF is one marker; the NextChar at the bottom of
+                            // the loop consumes its last byte.
+                            if (character == 0x0D && PeekNextChar() == 0x0A)
+                            {
+                                NextChar();
+                            }
+                        }
+                        else if (character >= '0' && character <= '7')
+                        {
+                            // Octal escape with one to three digits.
+                            int octalValue = character - '0';
+                            int digitCount = 1;
+                            while (digitCount < 3 && PeekNextChar() >= '0' && PeekNextChar() <= '7')
+                            {
+                                NextChar();
+                                octalValue = (octalValue * 8) + (PeekChar() - '0');
+                                digitCount++;
+                            }
+                            // Only the low byte is kept; high-order overflow is ignored.
+                            sbString.Append((char)(octalValue & 0xFF));
+                        }
+                        else
+                        {
+                            // Unknown escape: the backslash is dropped, the character is kept.
+                            sbString.Append((char)character);
+                        }
+                    }
+                    else
+                    {
+                        sbString.Append((Char)character);
+                    }
+                    if (NextChar() == false)
+                    {
+                        throw new Exception("Unexpected end of string and file");
+                    }
+                } while (IsEndOfStream() == false);
+
+                PdfString obj = new PdfString();
+                obj.Value = sbString.ToString();
+                return obj;
+            }
+            else if (PeekChar() == '<')
+            {
+                StringBuilder sbString = new StringBuilder();
+                NextChar();
+                do
+                {
+                    // Whitespace between hexadecimal digits is ignored.
+                    SkipWhitespace();
+                    byte character = PeekChar();
+                    if (character == '>') { break; }
+                    NextChar();
+                    SkipWhitespace();
+                    byte nextCharacter = PeekChar();
+                    if (nextCharacter == '>')
+                    {
+                        // Odd number of digits: the missing last digit counts as 0.
+                        nextCharacter = (byte)'0';
+                    }
+                    else
+                    {
+                        // Two digits make one byte, so both are consumed per iteration.
+                        NextChar();
+                    }
+                    byte realChar = (byte)(ByteHexValue(character) * 16 + ByteHexValue(nextCharacter));
+                    sbString.Append((Char)realChar);
+                } while (IsEndOfStream() == false);
+                NextChar();
+
+                PdfString obj = new PdfString();
+                obj.Value = sbString.ToString();
+                return obj;
+            }
+            return null;
+        }
+
+        /// <summary>
+        /// Parses a name object ("/Name") at the current position. The returned value
+        /// excludes the leading '/', and "#xx" sequences are decoded to the byte they encode.
+        /// Returns null when not positioned on '/'.
+        /// </summary>
+        private PdfName ParseName()
+        {
+            if (PeekChar() != '/')
+            {
+                return null;
+            }
+            NextChar();
+            StringBuilder sbName = new StringBuilder();
+            do
+            {
+                byte character = PeekChar();
+                if (IsDelimiter(character) || character == '/')
+                {
+                    break;
+                }
+                else if (character == '#')
+                {
+                    // '#' is followed by exactly two hexadecimal digits.
+                    if (_streamPosition + 2 >= _stream.Length)
+                    {
+                        throw new Exception("Unexpected end of name and file");
+                    }
+                    byte highDigit = PeekNextChar(1);
+                    byte lowDigit = PeekNextChar(2);
+                    sbName.Append((char)(ByteHexValue(highDigit) * 16 + ByteHexValue(lowDigit)));
+                    // Leave the position on the second digit; the NextChar below moves past it.
+                    NextChar();
+                    NextChar();
+                }
+                else if (character > 0x20 && character < 0x7F)
+                {
+                    sbName.Append((char)character);
+                }
+                else
+                {
+                    break;
+                }
+                if (NextChar() == false)
+                {
+                    throw new Exception("Unexpected end of name and file");
+                }
+            } while (IsEndOfStream() == false);
+
+            PdfName obj = new PdfName();
+
obj.Value = sbName.ToString(); + return obj; + } + + private PdfArray ParseArray() + { + if (PeekChar() != '[') + { + return null; + } + NextChar(); + SkipWhitespace(); + + PdfArray array = new PdfArray(); + do + { + byte character = PeekChar(); + + if (character == ']') + { + NextChar(); + break; + } + else + { + IPdfElement obj = ParseElement(); + array.Values.Add(obj); + } + SkipWhitespace(); + } while (IsEndOfStream() == false); + return array; + } + + private PdfDictionary ParseDictionary() + { + if (PeekChar() != '<' || PeekNextChar() != '<') + { + return null; + } + NextChar(); + NextChar(); + SkipWhitespace(); + + PdfName previousName = null; + PdfDictionary dict = new PdfDictionary(); + do + { + byte character = PeekChar(); + byte nextCharacter = PeekNextChar(); + + if (character == '>' && nextCharacter == '>') + { + NextChar(); + NextChar(); + break; + } + else if (character == '/') + { + PdfName name = ParseName(); + previousName = name; + SkipWhitespace(); + IPdfElement obj = ParseElement(); + if (obj is PdfNull) + { + dict.Values.Remove(name.Value); + } + else + { + previousName = name; + if (dict.Values.ContainsKey(name.Value)) + { + dict.Values[name.Value] = obj; + } + else + { + dict.Values.Add(name.Value, obj); + } + } + SkipWhitespace(); + } + else + { + throw new Exception(string.Format("Error parsing Dictionary at: {0}", _streamPosition)); + } + + } while (IsEndOfStream() == false); + return dict; + } + + #endregion + + #region Public methods + + public PdfObject ParseObject() + { + PdfObject obj = null; + long startPosition = _streamPosition; + do + { + SkipWhitespace(); + byte character = PeekChar(); + + if (character == '%') + { + SkipComment(); + } + else if (IsDigit(character)) + { + IPdfElement objectID = ParseNumber(); + SkipWhitespace(); + IPdfElement objectGeneration = ParseNumber(); + SkipWhitespace(); + string token = ParseToken(); + if (token == "obj") + { + SkipWhitespace(); + IPdfElement element = ParseElement(); + string endToken 
= ParseToken(); + + // Intercept streams + if (endToken == "stream") + { + PdfDictionary streamDict = element as PdfDictionary; + if (streamDict == null) + { + throw new Exception(string.Format("Stream after a not dictionary element at: {0}", _streamPosition)); + } + if (streamDict.Values.ContainsKey("Length") == false) + { + throw new Exception(string.Format("Dictionary of stream does not specify Lenght at: {0}", _streamPosition)); + } + long length = ((PdfInteger)streamDict.Values["Length"]).Value; + SkipEndOfLine(); + byte[] streamBody = GetRawData(length); + SkipEndOfLine(); + endToken = ParseToken(); + if (endToken != "endstream") + { + throw new Exception(string.Format("Expected \"endstream\" token, \"{0}\" found at: {1}", token, _streamPosition)); + } + SkipWhitespace(); + endToken = ParseToken(); + PdfStream stream = new PdfStream(); + stream.Dictionary = streamDict; + stream.Data = streamBody; + element = stream; + } + + if (endToken == "endobj") + { + obj = new PdfObject(); + obj.ObjectID = ((PdfInteger)objectID).Value; + obj.ObjectGeneration = ((PdfInteger)objectGeneration).Value; + obj.Data = element; + break; + } + } + } + else + { + long streamPosition = _streamPosition; + string token = ParseToken(); + if (token == "startxref") + { + // FIXME: Ignoring xref for now + SkipEndOfLine(); + SkipToEndOfLine(); + SkipEndOfLine(); + SkipToEndOfLine(); + SkipEndOfLine(); + SkipWhitespace(); + continue; + } + throw new Exception(string.Format("Expected objectID at {0}", startPosition)); + } + } while (IsEndOfStream() == false); + return obj; + } + + public bool IsEndOfStream() + { + return _streamPosition >= _stream.Length; + } + + #endregion + } +} diff --git a/VAR.PdfTools/Properties/AssemblyInfo.cs b/VAR.PdfTools/Properties/AssemblyInfo.cs new file mode 100644 index 0000000..b0845a9 --- /dev/null +++ b/VAR.PdfTools/Properties/AssemblyInfo.cs @@ -0,0 +1,14 @@ +using System.Reflection; +using System.Runtime.InteropServices; + +[assembly: 
AssemblyTitle("VAR.PdfTools")] +[assembly: AssemblyDescription("PdfTools Library")] +[assembly: AssemblyConfiguration("")] +[assembly: AssemblyCompany("VAR")] +[assembly: AssemblyProduct("VAR.PdfTools")] +[assembly: AssemblyCopyright("Copyright © VAR 2016")] +[assembly: AssemblyTrademark("")] +[assembly: AssemblyCulture("")] +[assembly: ComVisible(false)] +[assembly: Guid("eb7e003a-6a95-4002-809f-926c7c8a11e9")] +[assembly: AssemblyVersion("1.0.*")] diff --git a/VAR.PdfTools/VAR.PdfTools.csproj b/VAR.PdfTools/VAR.PdfTools.csproj new file mode 100644 index 0000000..e970fc9 --- /dev/null +++ b/VAR.PdfTools/VAR.PdfTools.csproj @@ -0,0 +1,56 @@ + + + + + Debug + AnyCPU + {EB7E003A-6A95-4002-809F-926C7C8A11E9} + Library + Properties + VAR.PdfTools + VAR.PdfTools + v4.6.1 + 512 + + + true + full + false + bin\Debug\ + DEBUG;TRACE + prompt + 4 + + + pdbonly + true + bin\Release\ + TRACE + prompt + 4 + + + + + + + + + + + + + + + + + + + + \ No newline at end of file