From 5e96ee22d817df5f0a4b2991185b8965fe38852d Mon Sep 17 00:00:00 2001
From: "Valeriano A.R"
Date: Tue, 21 Jun 2016 16:33:23 +0200
Subject: [PATCH] Robust stream parsing

---
 VAR.PdfTools/PdfDocument.cs |   2 +-
 VAR.PdfTools/PdfElements.cs |  20 +++++++
 VAR.PdfTools/PdfFont.cs     |   7 ++-
 VAR.PdfTools/PdfParser.cs   | 111 +++++++++++++++++++++++++++++++++---
 4 files changed, 130 insertions(+), 10 deletions(-)

diff --git a/VAR.PdfTools/PdfDocument.cs b/VAR.PdfTools/PdfDocument.cs
index 3f6c141..c5a7d44 100644
--- a/VAR.PdfTools/PdfDocument.cs
+++ b/VAR.PdfTools/PdfDocument.cs
@@ -210,7 +210,7 @@ namespace VAR.PdfTools
             var parser = new PdfParser(data);
             do
             {
-                PdfObject obj = parser.ParseObject();
+                PdfObject obj = parser.ParseObject(doc.Objects);
                 if (obj != null)
                 {
                     if (obj.Data is PdfStream)
diff --git a/VAR.PdfTools/PdfElements.cs b/VAR.PdfTools/PdfElements.cs
index f9b3192..c9377e0 100644
--- a/VAR.PdfTools/PdfElements.cs
+++ b/VAR.PdfTools/PdfElements.cs
@@ -168,6 +168,10 @@ namespace VAR.PdfTools
     {
         public static double GetReal(IPdfElement elem, double defaultValue)
         {
+            if(elem == null)
+            {
+                return defaultValue;
+            }
             if (elem is PdfInteger)
             {
                 return ((PdfInteger)elem).Value;
@@ -178,5 +182,21 @@ namespace VAR.PdfTools
             }
             return defaultValue;
         }
+        public static long GetInt(IPdfElement elem, long defaultValue)
+        {
+            if (elem == null)
+            {
+                return defaultValue;
+            }
+            if (elem is PdfInteger)
+            {
+                return ((PdfInteger)elem).Value;
+            }
+            if (elem is PdfReal)
+            {
+                return (long)((PdfReal)elem).Value;
+            }
+            return defaultValue;
+        }
     }
 }
diff --git a/VAR.PdfTools/PdfFont.cs b/VAR.PdfTools/PdfFont.cs
index c7796a1..c55af25 100644
--- a/VAR.PdfTools/PdfFont.cs
+++ b/VAR.PdfTools/PdfFont.cs
@@ -15,6 +15,8 @@ namespace VAR.PdfTools
 
         private double _height = 1.0;
 
+        private bool _tainted = false;
+
         #endregion
 
         #region Properties
@@ -23,6 +25,8 @@ namespace VAR.PdfTools
 
         public double Height { get { return _height; } }
 
+        public bool Tainted { get { return _tainted; } }
+
         #endregion
 
         #region Life cycle
@@ -33,7 +37,8 @@ namespace VAR.PdfTools
             string type = baseData.GetParamAsString("Type");
             if (type != "Font")
             {
-                throw new Exception(string.Format("PdfFont: Expected dictionary of type:\"Font\". Found: {0}", type));
+                // NOTE: Type="Font" is Required by the standard, continuing anyway
+                _tainted = true;
             }
 
             if (baseData.Values.ContainsKey("ToUnicode"))
diff --git a/VAR.PdfTools/PdfParser.cs b/VAR.PdfTools/PdfParser.cs
index 34ced6e..ee44d9a 100644
--- a/VAR.PdfTools/PdfParser.cs
+++ b/VAR.PdfTools/PdfParser.cs
@@ -90,7 +90,7 @@ namespace VAR.PdfTools
                 }
                 position++;
             } while (position < _stream.Length);
-            return 0;
+            return -1;
         }
 
         private byte PeekChar()
@@ -197,6 +197,30 @@ namespace VAR.PdfTools
             }
         }
 
+        private void SkipWhitespaceBack()
+        {
+            while (IsWhitespace(PeekChar()))
+            {
+                if (_streamPosition == 0)
+                {
+                    break;
+                }
+                _streamPosition--;
+            }
+        }
+
+        private void SkipDigitsBack()
+        {
+            while (IsDigit(PeekChar()))
+            {
+                if (_streamPosition == 0)
+                {
+                    break;
+                }
+                _streamPosition--;
+            }
+        }
+
         private void SkipEndOfLine()
         {
             byte lineFeed = 0x0A;
@@ -718,11 +742,23 @@ namespace VAR.PdfTools
             return Encoding.BigEndianUnicode.GetString(byteArray);
         }
 
+        public IPdfElement SearchObjectID(List<PdfObject> knownObjects, long objectID)
+        {
+            foreach (PdfObject obj in knownObjects)
+            {
+                if (obj.ObjectID == objectID)
+                {
+                    return obj.Data;
+                }
+            }
+            return null;
+        }
+
         #endregion
 
         #region Public methods
 
-        public PdfObject ParseObject()
+        public PdfObject ParseObject(List<PdfObject> knownObjects)
         {
             PdfObject obj = null;
             long startPosition = _streamPosition;
@@ -757,12 +793,19 @@ namespace VAR.PdfTools
                             throw new Exception(string.Format("Stream after a not dictionary element at: {0}", _streamPosition));
                         }
                         SkipEndOfLine();
-                        long length;
-                        if (streamDict.Values.ContainsKey("Length") && streamDict.Values["Length"] is PdfInteger)
+
+                        // Find the length of the stream
+                        long length = -1;
+                        if (streamDict.Values.ContainsKey("Length") )
                         {
-                            length = ((PdfInteger)streamDict.Values["Length"]).Value;
+                            length = PdfElementUtils.GetInt(streamDict.Values["Length"], -1);
+                            if (length == -1 && streamDict.Values["Length"] is PdfObjectReference)
+                            {
+                                IPdfElement lenghtObj = SearchObjectID(knownObjects, ((PdfObjectReference) streamDict.Values["Length"]).ObjectID);
+                                length = PdfElementUtils.GetInt(lenghtObj, -1);
+                            }
                         }
-                        else
+                        if(length == -1)
                         {
                             byte lineFeed = 0x0A;
                             byte carriageReturn = 0x0D;
@@ -773,6 +816,8 @@ namespace VAR.PdfTools
                                 new char[] {'e', 'n', 'd', 's', 't', 'r', 'e', 'a', 'm', (char)carriageReturn, (char)lineFeed},
                             });
                         }
+
+                        // Get the stream
                         byte[] streamBody = GetRawData(length);
                         SkipEndOfLine();
                         endToken = ParseToken();
@@ -857,7 +902,58 @@ namespace VAR.PdfTools
                         SkipWhitespace();
                         continue;
                     }
-                    throw new Exception(string.Format("Expected objectID at {0}", startPosition));
+
+                    // Try to find an object marker
+                    byte lineFeed = 0x0A;
+                    byte carriageReturn = 0x0D;
+                    long distToObject = MeasureToMarkers(new char[][] {
+                        new char[] {' ', 'o', 'b', 'j', (char)lineFeed},
+                        new char[] {' ', 'o', 'b', 'j', (char)carriageReturn, (char)lineFeed},
+                    });
+                    if (distToObject > 0)
+                    {
+                        // Object marker found, backtrack and retry
+                        long originalPosition = _streamPosition;
+                        _streamPosition += distToObject;
+                        long marker = _streamPosition;
+                        SkipWhitespaceBack();
+                        if (_streamPosition == marker)
+                        {
+                            // Abort backtrack, skip garbage
+                            _streamPosition = originalPosition + distToObject + 4;
+                            continue;
+                        }
+                        marker = _streamPosition;
+                        SkipDigitsBack();
+                        if (_streamPosition == marker)
+                        {
+                            // Abort backtrack, skip garbage
+                            _streamPosition = originalPosition + distToObject + 4;
+                            continue;
+                        }
+                        marker = _streamPosition;
+                        SkipWhitespaceBack();
+                        if (_streamPosition == marker)
+                        {
+                            // Abort backtrack, skip garbage
+                            _streamPosition = originalPosition + distToObject + 4;
+                            continue;
+                        }
+                        marker = _streamPosition;
+                        SkipDigitsBack();
+                        if (_streamPosition == marker)
+                        {
+                            // Abort backtrack, skip garbage
+                            _streamPosition = originalPosition + distToObject + 4;
+                            continue;
+                        }
+                        NextChar();
+                    }
+                    else
+                    {
+                        // No more obj markers found, abort all.
+                        _streamPosition = _stream.Length;
+                    }
                 }
             } while (IsEndOfStream() == false);
             return obj;
@@ -919,7 +1015,6 @@ namespace VAR.PdfTools
                 if (string.IsNullOrEmpty(token))
                 {
                     break;
-                    //throw new Exception(string.Format("ParseContet: Expected token found nothing, at: {0}", _streamPosition));
                 }
                 PdfContentAction action = new PdfContentAction(token, elems);
                 elems = new List();