Parse Fonts ToUnicode

This commit is contained in:
2016-06-20 11:40:15 +02:00
parent 6e8f58e2e1
commit 63b50a8198
4 changed files with 218 additions and 3 deletions

View File

@@ -13,7 +13,7 @@ namespace VAR.PdfTools
private PdfDictionary _resources = null;
private Dictionary<string, PdfDictionary> _fonts = new Dictionary<string, PdfDictionary>();
private Dictionary<string, PdfFont> _fonts = new Dictionary<string, PdfFont>();
private List<PdfContentAction> _contentActions = null;
@@ -25,7 +25,7 @@ namespace VAR.PdfTools
public byte[] Content { get { return _content; } }
public Dictionary<string, PdfDictionary> Fonts { get { return _fonts; } }
public Dictionary<string, PdfFont> Fonts { get { return _fonts; } }
public List<PdfContentAction> ContentActions { get { return _contentActions; } }
@@ -57,7 +57,8 @@ namespace VAR.PdfTools
PdfDictionary fonts = _resources.Values["Font"] as PdfDictionary;
foreach (KeyValuePair<string, IPdfElement> pair in fonts.Values)
{
_fonts.Add(pair.Key, pair.Value as PdfDictionary);
var font = new PdfFont(pair.Value as PdfDictionary);
_fonts.Add(pair.Key, font);
}
}

67
VAR.PdfTools/PdfFont.cs Normal file
View File

@@ -0,0 +1,67 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
namespace VAR.PdfTools
{
/// <summary>
/// Wraps a PDF dictionary of type "Font" and, when that dictionary has a
/// "ToUnicode" entry, the character map parsed from the entry's stream data.
/// </summary>
public class PdfFont
{
    #region Declarations

    // Dictionary this font was built from; assigned once in the constructor.
    private readonly PdfDictionary _baseData;

    // Character map parsed from the "ToUnicode" stream; stays null when the
    // dictionary has no such entry.
    private readonly Dictionary<char, string> _toUnicode;

    #endregion

    #region Properties

    /// <summary>The dictionary passed to the constructor.</summary>
    public PdfDictionary BaseData { get { return _baseData; } }

    #endregion

    #region Life cycle

    /// <summary>
    /// Builds a font from its dictionary and parses the "ToUnicode" stream if one is present.
    /// </summary>
    /// <param name="baseData">Dictionary whose "Type" parameter must be "Font".</param>
    /// <exception cref="ArgumentNullException"><paramref name="baseData"/> is null.</exception>
    /// <exception cref="Exception">The dictionary's "Type" is not "Font".</exception>
    public PdfFont(PdfDictionary baseData)
    {
        // Callers may pass the result of an "as" cast, so fail with a clear
        // message instead of a NullReferenceException further down.
        if (baseData == null)
        {
            throw new ArgumentNullException("baseData");
        }

        _baseData = baseData;
        string type = baseData.GetParamAsString("Type");
        if (type != "Font")
        {
            throw new Exception(string.Format("PdfFont: Expected dictionary of type:\"Font\". Found: {0}", type));
        }

        // Single lookup instead of ContainsKey followed by the indexer.
        IPdfElement toUnicodeElement;
        if (baseData.Values.TryGetValue("ToUnicode", out toUnicodeElement))
        {
            byte[] toUnicodeStream = ((PdfStream)toUnicodeElement).Data;
            PdfParser parser = new PdfParser(toUnicodeStream);
            _toUnicode = parser.ParseToUnicode();
        }
    }

    #endregion

    #region Public methods

    /// <summary>
    /// Translates a character through the font's ToUnicode map.
    /// </summary>
    /// <param name="character">Character to translate.</param>
    /// <returns>
    /// The mapped string when the map exists and contains the character;
    /// otherwise a one-character string holding <paramref name="character"/> unchanged.
    /// </returns>
    public string ToUnicode(char character)
    {
        string mapped;
        if (_toUnicode != null && _toUnicode.TryGetValue(character, out mapped))
        {
            return mapped;
        }

        // FIXME: use standard tables when the font has no ToUnicode map
        return new string(character, 1);
    }

    #endregion
}
}

View File

@@ -2,6 +2,7 @@
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Linq;
using System.Text;
namespace VAR.PdfTools
@@ -697,6 +698,19 @@ namespace VAR.PdfTools
return dict;
}
/// <summary>
/// Narrows every character of the input to a single byte and decodes the
/// resulting byte sequence as big-endian UTF-16.
/// </summary>
/// <param name="strIn">String whose characters are each cast to one byte.</param>
/// <returns>
/// The decoded string. When the byte count is odd, one 0x00 byte is placed in
/// front of the sequence before decoding.
/// </returns>
public string ReencodeStringToUTF16BE(string strIn)
{
    byte[] narrowed = strIn.Select(ch => (byte)ch).ToArray();

    bool hasOddLength = (narrowed.Length & 1) != 0;
    if (!hasOddLength)
    {
        return Encoding.BigEndianUnicode.GetString(narrowed);
    }

    // A new array is zero-filled, so slot 0 already holds the 0x00 padding
    // byte; the original bytes follow from index 1.
    byte[] padded = new byte[narrowed.Length + 1];
    narrowed.CopyTo(padded, 1);
    return Encoding.BigEndianUnicode.GetString(padded);
}
#endregion
#region Public methods
@@ -891,6 +905,138 @@ namespace VAR.PdfTools
return actions;
}
/// <summary>
/// Reads the parser's stream as a ToUnicode character map and collects the
/// mappings declared in its "bfchar" and "bfrange" sections.
/// </summary>
/// <returns>
/// Dictionary from source character to destination string. When the same
/// source character is declared more than once, the last declaration wins.
/// </returns>
/// <exception cref="Exception">
/// A section keyword is not preceded by an integer count, or a section is not
/// closed by its matching "end..." keyword.
/// </exception>
public Dictionary<char, string> ParseToUnicode()
{
    var toUnicode = new Dictionary<char, string>();

    // NOTE(review): MeasureToMarkers presumably returns the offset of the
    // "begincmap" marker from the current position - confirm.
    long skip = MeasureToMarkers(new char[][] {
        new char[] { 'b', 'e', 'g', 'i', 'n', 'c', 'm', 'a', 'p'},
    });
    _streamPosition = skip;

    // Elements seen so far; the most recent one is expected to be the entry
    // count when a "begin..." keyword shows up.
    var stack = new List<IPdfElement>();
    do
    {
        SkipWhitespace();
        IPdfElement elem = ParseElement();
        if (elem != null)
        {
            stack.Add(elem);
            continue;
        }

        string token = ParseToken();
        if (token == "begincodespacerange")
        {
            PdfInteger numCodespaces = ReadToUnicodeSectionCount(stack, token);
            for (int i = 0; i < numCodespaces.Value; i++)
            {
                // Codespace ranges are consumed but not used.
                SkipWhitespace();
                ParseString();
                SkipWhitespace();
                ParseString();
            }
            ExpectToUnicodeToken("endcodespacerange");
        }
        else if (token == "beginbfrange")
        {
            PdfInteger numRanges = ReadToUnicodeSectionCount(stack, token);
            for (int i = 0; i < numRanges.Value; i++)
            {
                SkipWhitespace();
                PdfString pdfStrStart = ParseString();
                SkipWhitespace();
                PdfString pdfStrEnd = ParseString();
                SkipWhitespace();
                IPdfElement pdfElemDest = ParseElement();
                char chStart = ReencodeStringToUTF16BE(pdfStrStart.Value)[0];
                char chEnd = ReencodeStringToUTF16BE(pdfStrEnd.Value)[0];
                if (chEnd < chStart)
                {
                    // Inverted range: nothing to map.
                    continue;
                }

                PdfString pdfStrDest = pdfElemDest as PdfString;
                if (pdfStrDest != null)
                {
                    // One destination string: each successive source code maps
                    // to the string with its last character incremented by one.
                    char[] chsDest = ReencodeStringToUTF16BE(pdfStrDest.Value).ToCharArray();

                    // An int counter is required: a char counter wraps at
                    // 0xFFFF and would never leave the loop when chEnd is 0xFFFF.
                    for (int code = chStart; code <= chEnd; code++)
                    {
                        toUnicode[(char)code] = new string(chsDest);
                        if (chsDest.Length > 0)
                        {
                            chsDest[chsDest.Length - 1]++;
                        }
                    }
                    continue;
                }

                PdfArray array = pdfElemDest as PdfArray;
                if (array != null)
                {
                    // One destination string per source code, taken in order.
                    int length = chEnd - chStart;
                    for (int j = 0; j <= length; j++)
                    {
                        char c = (char)(chStart + j);
                        string strDst = ReencodeStringToUTF16BE(((PdfString)array.Values[j]).Value);
                        toUnicode[c] = strDst;
                    }
                }
            }
            ExpectToUnicodeToken("endbfrange");
        }
        else if (token == "beginbfchar")
        {
            PdfInteger numChars = ReadToUnicodeSectionCount(stack, token);
            for (int i = 0; i < numChars.Value; i++)
            {
                SkipWhitespace();
                PdfString pdfStrOrig = ParseString();
                SkipWhitespace();
                PdfString pdfStrDest = ParseString();
                char chOrig = ReencodeStringToUTF16BE(pdfStrOrig.Value)[0];
                string strDst = ReencodeStringToUTF16BE(pdfStrDest.Value);

                // Indexer instead of Add: a repeated source code overwrites
                // the earlier entry rather than throwing.
                toUnicode[chOrig] = strDst;
            }
            ExpectToUnicodeToken("endbfchar");
        }
        // Any other token is ignored.
    } while (IsEndOfStream() == false);

    return toUnicode;
}

// Returns the integer count that must be the latest parsed element before a "begin..." section keyword.
private PdfInteger ReadToUnicodeSectionCount(List<IPdfElement> stack, string sectionToken)
{
    // LastOrDefault yields null on an empty list, so a missing count is
    // reported through the exception below.
    PdfInteger count = stack.LastOrDefault() as PdfInteger;
    if (count == null)
    {
        throw new Exception(string.Format("ParseToUnicode: \"{0}\" found without preceding count at: {1}", sectionToken, _streamPosition));
    }
    return count;
}

// Skips whitespace, reads the next token and throws unless it equals the expected section terminator.
private void ExpectToUnicodeToken(string expected)
{
    SkipWhitespace();
    string endToken = ParseToken();
    if (endToken != expected)
    {
        throw new Exception(string.Format("ParseToUnicode: Expected \"{0}\", found \"{1}\", at: {2}", expected, endToken, _streamPosition));
    }
}
public bool IsEndOfStream()
{
return _streamPosition >= _stream.Length;

View File

@@ -44,6 +44,7 @@
<Compile Include="PdfDocument.cs" />
<Compile Include="PdfDocumentPage.cs" />
<Compile Include="PdfElements.cs" />
<Compile Include="PdfFont.cs" />
<Compile Include="PdfParser.cs" />
<Compile Include="Properties\AssemblyInfo.cs" />
</ItemGroup>