Parse ObjectStreams

This commit is contained in:
2016-06-19 03:29:59 +02:00
parent af5644758c
commit 142241e791
3 changed files with 99 additions and 21 deletions

View File

@@ -32,11 +32,10 @@ namespace VAR.PdfTools
MemoryStream msOutput = new MemoryStream();
// It seems to work when skipping the first two bytes.
byte header; // 0x30 0x59
byte header;
header = (byte)msInput.ReadByte();
//Debug.Assert(header == 48);
header = (byte)msInput.ReadByte();
//Debug.Assert(header == 89);
DeflateStream zip = new DeflateStream(msInput, CompressionMode.Decompress, true);
int cbRead;
byte[] abResult = new byte[1024];
@@ -44,8 +43,10 @@ namespace VAR.PdfTools
{
cbRead = zip.Read(abResult, 0, abResult.Length);
if (cbRead > 0)
{
msOutput.Write(abResult, 0, cbRead);
}
}
while (cbRead > 0);
zip.Close();
msOutput.Flush();
@@ -57,6 +58,20 @@ namespace VAR.PdfTools
return null;
}
/// <summary>
/// Decodes the data of a stream in place when its "Filter" parameter is "FlateDecode".
/// </summary>
/// <remarks>
/// On success the encoded bytes and the filter entry are preserved in
/// <c>OriginalData</c> and <c>OriginalFilter</c>, <c>Data</c> is replaced with the
/// decoded bytes, the "Length" entry is rewritten to the decoded size and the
/// "Filter" entry is removed from the dictionary. Streams with any other filter
/// value are left untouched.
/// </remarks>
/// <param name="stream">Stream whose data should be decoded.</param>
private static void ApplyFiltersToStreams(PdfStream stream)
{
    string filter = stream.GetParamAsString("Filter");
    if (filter != "FlateDecode") { return; }

    // Decode before mutating anything, so a failed decode cannot leave the
    // stream half-updated (Data overwritten but Length/Filter still describing
    // the encoded bytes).
    byte[] decodedStreamData = DecodeFlateStreamData(stream.Data);
    if (decodedStreamData == null)
    {
        // NOTE(review): DecodeFlateStreamData appears to signal failure by
        // returning null - confirm. In that case keep the stream as it was,
        // still marked with its filter, instead of failing on a null reference.
        return;
    }

    stream.OriginalData = stream.Data;
    stream.OriginalFilter = stream.Dictionary.Values["Filter"];
    stream.Data = decodedStreamData;
    stream.Dictionary.Values["Length"] = new PdfInteger { Value = decodedStreamData.Length };
    stream.Dictionary.Values.Remove("Filter");
}
#endregion
#region Public methods
@@ -69,34 +84,43 @@ namespace VAR.PdfTools
/// <summary>
/// Parses a PDF document from raw bytes.
/// </summary>
/// <remarks>
/// Works in two passes: first every top-level object is parsed and stream
/// filters are applied as each stream object is read; then every stream whose
/// "Type" is "ObjStm" and that carries integer "N" and "First" parameters is
/// parsed as an object stream and the objects it contains are appended to the
/// document.
/// </remarks>
/// <param name="data">Raw bytes of the PDF file.</param>
/// <returns>The document holding all parsed objects.</returns>
public static PdfDocument Load(byte[] data)
{
    var doc = new PdfDocument();

    // Parse data
    var parser = new PdfParser(data);
    do
    {
        PdfObject obj = parser.ParseObject();
        if (obj != null)
        {
            // Decode the stream right away so object streams can be parsed below.
            if (obj.Data is PdfStream)
            {
                ApplyFiltersToStreams((PdfStream)obj.Data);
            }
            doc.Objects.Add(obj);
        }
    } while (parser.IsEndOfStream() == false);

    // Expand Object Streams
    // The contained objects are collected in a separate list because
    // doc.Objects cannot be modified while it is being enumerated.
    List<PdfObject> streamObjects = new List<PdfObject>();
    foreach (PdfObject obj in doc.Objects)
    {
        if (obj.Data.Type != PdfElementTypes.Stream) { continue; }
        PdfStream stream = obj.Data as PdfStream;
        string type = stream.GetParamAsString("Type");
        long? number = stream.GetParamAsInt("N");
        long? first = stream.GetParamAsInt("First");
        if (type == "ObjStm" && number != null && first != null)
        {
            PdfParser parserAux = new PdfParser(stream.Data);
            streamObjects.AddRange(parserAux.ParseObjectStream((int)number, (long)first));
        }
    }
    foreach (PdfObject obj in streamObjects)
    {
        doc.Objects.Add(obj);
    }

    return doc;
}

View File

@@ -32,7 +32,7 @@ namespace VAR.PdfTools
/// <summary>
/// PDF integer element.
/// </summary>
public class PdfInteger : IPdfElement
{
    /// <summary>Element type tag; always <c>PdfElementTypes.Integer</c>.</summary>
    public PdfElementTypes Type { get; private set; } = PdfElementTypes.Integer;

    /// <summary>
    /// Integer value. Declared once, as 64-bit: the parser narrows it with an
    /// explicit (int) cast where it assigns object IDs and generations.
    /// </summary>
    public long Value { get; set; }
}
public class PdfReal : IPdfElement
@@ -107,6 +107,22 @@ namespace VAR.PdfTools
}
return null;
}
/// <summary>
/// Reads a dictionary parameter as an integer.
/// </summary>
/// <remarks>
/// When the parameter is an array, its first element is used.
/// </remarks>
/// <param name="name">Key of the parameter in the dictionary.</param>
/// <returns>
/// The integer value, or null when the key is missing, the value (or the first
/// array element) is not an integer, or the array is empty.
/// </returns>
public long? GetParamAsInt(string name)
{
    if (Dictionary.Values.ContainsKey(name) == false) { return null; }

    IPdfElement value = Dictionary.Values[name];
    if (value is PdfArray)
    {
        // Take the first element without indexing, so an empty array yields
        // null (value stays the array, which is not an integer) instead of
        // throwing an out-of-range exception.
        foreach (IPdfElement item in ((PdfArray)value).Values)
        {
            value = item;
            break;
        }
    }
    if (value is PdfInteger)
    {
        return ((PdfInteger)value).Value;
    }
    return null;
}
}
public class PdfObject : IPdfElement

View File

@@ -1,4 +1,5 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
@@ -422,8 +423,8 @@ namespace VAR.PdfTools
}
NextChar();
PdfObjectReference objRef = new PdfObjectReference();
objRef.ObjectID = number.Value;
objRef.ObjectGeneration = ((PdfInteger)objectGeneration).Value;
objRef.ObjectID = (int)number.Value;
objRef.ObjectGeneration = (int)((PdfInteger)objectGeneration).Value;
return objRef;
}
@@ -571,7 +572,6 @@ namespace VAR.PdfTools
NextChar();
character = PeekChar();
realChar += (byte)ByteHexValue(character);
NextChar();
sbName.Append((char)realChar);
}
else if (character > 0x20 && character < 0x7F)
@@ -751,8 +751,8 @@ namespace VAR.PdfTools
if (endToken == "endobj")
{
obj = new PdfObject();
obj.ObjectID = ((PdfInteger)objectID).Value;
obj.ObjectGeneration = ((PdfInteger)objectGeneration).Value;
obj.ObjectID = (int)((PdfInteger)objectID).Value;
obj.ObjectGeneration = (int)((PdfInteger)objectGeneration).Value;
obj.Data = element;
break;
}
@@ -811,6 +811,44 @@ namespace VAR.PdfTools
return obj;
}
/// <summary>
/// Parses the content of an object stream.
/// </summary>
/// <remarks>
/// The stream starts with <paramref name="number"/> pairs of integers (object
/// ID and byte offset), followed by the objects themselves. Each offset is
/// taken as relative to <paramref name="first"/>, so every object is read from
/// <c>first + offset</c>. When an offset is not an integer or points outside
/// the stream, parsing continues from the current position instead, i.e. the
/// objects are assumed to be stored one after another.
/// </remarks>
/// <param name="number">Number of objects contained in the stream.</param>
/// <param name="first">Position inside the stream of the first object.</param>
/// <returns>The parsed objects, all with generation 0.</returns>
/// <exception cref="System.Exception">
/// An object ID in the header is not an integer, or an object could not be parsed.
/// </exception>
public List<PdfObject> ParseObjectStream(int number, long first)
{
    var streamObjects = new List<PdfObject>();
    var objectIds = new List<long>();
    var objectOffsets = new List<long?>();

    // Header: "number" pairs of <object id> <offset>.
    for (int i = 0; i < number; i++)
    {
        SkipWhitespace();
        IPdfElement objectId = ParseElement();
        if (objectId is PdfInteger)
        {
            objectIds.Add(((PdfInteger)objectId).Value);
        }
        else
        {
            throw new System.Exception(string.Format("Unexpected element parsing ObjectStream at: {0}", _streamPosition));
        }

        SkipWhitespace();
        IPdfElement objectOffset = ParseElement();
        if (objectOffset is PdfInteger)
        {
            objectOffsets.Add(((PdfInteger)objectOffset).Value);
        }
        else
        {
            // Tolerated: this object is read sequentially instead.
            objectOffsets.Add(null);
        }
    }

    // Objects.
    _streamPosition = (int)first;
    for (int i = 0; i < number; i++)
    {
        // Seek to the declared position so each ID is paired with its own
        // data even when objects are not stored contiguously in header order.
        long? offset = objectOffsets[i];
        if (offset != null && offset.Value >= 0)
        {
            long target = first + offset.Value;
            if (target >= first && target < _stream.Length)
            {
                _streamPosition = (int)target;
            }
        }

        SkipWhitespace();
        IPdfElement elem = ParseElement();
        if (elem == null)
        {
            throw new System.Exception(string.Format("Unexpected error parsing ObjectStream at: {0}", _streamPosition));
        }

        PdfObject objAux = new PdfObject();
        objAux.ObjectGeneration = 0;
        objAux.ObjectID = (int)objectIds[i];
        objAux.Data = elem;
        streamObjects.Add(objAux);
    }
    return streamObjects;
}
public bool IsEndOfStream()
{
return _streamPosition >= _stream.Length;