しばらく時間がかかりましたが、Identity-H でエンコードされた PDF からプレーン テキストを読み取るコードがようやく完成しました。同じ問題で困っている人の助けになればと思い、ここに投稿します。ただし、改善の余地があることは承知しています。たとえば、文字マッピング (beginbfchar) には対応しておらず、範囲 (beginbfrange) も実際には範囲として展開していません。すでに 1 週間以上を費やしており、このコードで処理できないファイルに遭遇しない限り、これ以上時間をかけることは正当化できません。申し訳ありません。
使用法:
// Usage sample: extract the text of every content stream on every page.
PdfDocument inputDocument = PDFHelpers.Open(physcialFilePath, PdfDocumentOpenMode.Import);
foreach (PdfPage page in inputDocument.Pages)
{
    for (Int32 index = 0; index < page.Contents.Elements.Count; index++)
    {
        PdfDictionary.PdfStream stream = page.Contents.Elements.GetDictionary(index).Stream;

        // First try the plain (non-CID) extractor; spaces are stripped so that
        // a result consisting only of separators is recognised as "no text".
        String outputText = new PDFParser().ExtractTextFromPDFBytes(stream.Value).Replace(" ", String.Empty);

        // An empty string is also covered by this test, so no separate == "" check is needed.
        if (outputText.Replace("\n\r", "") == "")
        {
            // Identity-H encoded file: translate the glyph codes through the
            // /ToUnicode CMaps of the fonts listed in the page resources.
            string[] hierarchy = new string[] { "/Resources", "/Font", "/F*" };
            List<PdfItem> fonts = PDFHelpers.FindObjects(hierarchy, page, true);
            outputText = PDFHelpers.FromUnicode(stream, fonts);
        }

        // outputText now holds the text of this content stream; consume it here.
    }
}
以下が実際のヘルパー クラスです。この問題に取り組んでいたとき、完全なサンプルをほとんど見つけられなかったため、またすべての例で使用しているため、クラス全体を掲載します。このヘルパーは、PDFSharp と iTextSharp の両方を使用してバージョン 1.5 より前と後の PDF を開けるようにし、ExtractTextFromPDFBytes で標準的な PDF を読み取ります。さらに、ドキュメント ツリーを検索してオブジェクトを返す FindObjects と、エンコードされたテキストと変換用のフォント コレクションを受け取る FromUnicode を使用します。
using PdfSharp.Pdf;
using PdfSharp.Pdf.Content;
using PdfSharp.Pdf.Content.Objects;
using System;
using System.Collections.Generic;
using System.IO;
namespace PdfSharp.Pdf.IO
{
/// <summary>
/// uses itextsharp 4.1.6 to convert any pdf to 1.4 compatible pdf, called instead of PdfReader.open
/// </summary>
static public class PDFHelpers
{
/// <summary>
/// Opens the PDF file at the given path without a password.
/// Delegates to the (path, password, open mode) overload with a null password.
/// </summary>
/// <param name="PdfPath">Path of the PDF file to open.</param>
/// <param name="openmode">PDFsharp open mode passed through to the overload.</param>
/// <returns>The document returned by the password overload.</returns>
public static PdfDocument Open(string PdfPath, PdfDocumentOpenMode openmode)
{
    const string noPassword = null;
    PdfDocument document = Open(PdfPath, noPassword, openmode);
    return document;
}
/// <summary>
/// Reads the PDF file at <paramref name="PdfPath"/> into memory and opens it through the
/// byte-array overload, which is used in place of calling PdfReader.Open directly.
/// When the opened document reports an empty FullPath (it exists only in memory), the
/// document is re-opened in Modify mode and saved over the original file so that other
/// references to the file see the saved version, then opened once more in the requested mode.
/// </summary>
/// <param name="PdfPath">Path of the PDF file to open (and possibly overwrite).</param>
/// <param name="password">Password for the document, or null when there is none.</param>
/// <param name="openmode">PDFsharp open mode for the returned document.</param>
/// <returns>The opened document.</returns>
/// <exception cref="Exception">The memory guard decided the file is too large to load.</exception>
/// <exception cref="IOException">The file is larger than a single byte array can hold.</exception>
/// <exception cref="EndOfStreamException">The file ended before all of its bytes were read.</exception>
public static PdfDocument Open(string PdfPath, string password, PdfDocumentOpenMode openmode)
{
    byte[] fileArray;

    // The handle is scoped to the read only: it must be released before Save(PdfPath)
    // below can overwrite the same file.
    using (FileStream fileStream = new FileStream(PdfPath, FileMode.Open, FileAccess.Read))
    {
        long fileLength = fileStream.Length;
        long fileSizeMb = fileLength / 1024 / 1024;

        // TODO: Loading the whole file into one array is what causes the out of memory
        // exception, which is why we have the 70mb limit. Solve this and the limit can go.
        // NOTE(review): PrivateMemorySize64 is the private memory already allocated to this
        // process, not the memory still available, so this guard looks inverted. It is kept
        // unchanged so callers see the same exception as before - confirm the intent.
        long availableMemory;
        using (System.Diagnostics.Process proc = System.Diagnostics.Process.GetCurrentProcess())
        {
            availableMemory = proc.PrivateMemorySize64 / 1024 / 1024; // Mb
        }
        if (availableMemory < fileSizeMb)
        {
            throw new Exception("The available memory " + availableMemory + "Mb is not enough to open, split and save a file of " + fileSizeMb);
        }

        // A plain (int) cast would silently truncate the length of very large files.
        if (fileLength > int.MaxValue)
        {
            throw new IOException("The file is too large to be loaded into a single byte array: " + PdfPath);
        }

        fileArray = new byte[(int)fileLength];

        // Stream.Read may return fewer bytes than requested, so keep reading until full.
        int offset = 0;
        while (offset < fileArray.Length)
        {
            int read = fileStream.Read(fileArray, offset, fileArray.Length - offset);
            if (read == 0)
            {
                throw new EndOfStreamException("Unexpected end of file while reading " + PdfPath);
            }
            offset += read;
        }
    }

    // The password is forwarded on every call; a null password takes the same path
    // as the password-less byte-array overload.
    PdfDocument result = Open(fileArray, password, openmode);
    if (result.FullPath == "")
    {
        // The file only exists as a document in memory.
        // Save over the original file so other references to the file get the saved version.
        // TODO: It would be good if we could do this without opening every document another 2 times
        PdfDocument tempResult = Open(fileArray, password, PdfDocumentOpenMode.Modify);
        try
        {
            tempResult.Save(PdfPath);
            tempResult.Close();
        }
        finally
        {
            tempResult.Dispose();
        }
        result = Open(fileArray, password, openmode);
    }
    return result;
}
/// <summary>
/// Opens a PDF held in a byte array without a password.
/// Wraps the bytes in a MemoryStream and delegates to the
/// (stream, password, open mode) overload with a null password.
/// </summary>
/// <param name="fileArray">The complete PDF file content.</param>
/// <param name="openmode">PDFsharp open mode passed through to the overload.</param>
/// <returns>The document returned by the stream overload.</returns>
public static PdfDocument Open(byte[] fileArray, PdfDocumentOpenMode openmode)
{
    const string noPassword = null;
    MemoryStream buffer = new MemoryStream(fileArray);
    return Open(buffer, noPassword, openmode);
}
/// <summary>
/// Opens a PDF held in a byte array using the supplied password.
/// Wraps the bytes in a MemoryStream and delegates to the
/// (stream, password, open mode) overload.
/// </summary>
/// <param name="fileArray">The complete PDF file content.</param>
/// <param name="password">Password passed through to the stream overload.</param>
/// <param name="openmode">PDFsharp open mode passed through to the overload.</param>
/// <returns>The document returned by the stream overload.</returns>
public static PdfDocument Open(byte[] fileArray, string password, PdfDocumentOpenMode openmode)
{
    MemoryStream buffer = new MemoryStream(fileArray);
    PdfDocument document = Open(buffer, password, openmode);
    return document;
}
/// <summary>
/// Opens a PDF from a memory stream without a password.
/// Delegates to the (stream, password, open mode) overload with a null password.
/// </summary>
/// <param name="sourceStream">Stream holding the PDF file content.</param>
/// <param name="openmode">PDFsharp open mode passed through to the overload.</param>
/// <returns>The document returned by the password overload.</returns>
public static PdfDocument Open(MemoryStream sourceStream, PdfDocumentOpenMode openmode)
{
    const string noPassword = null;
    PdfDocument document = Open(sourceStream, noPassword, openmode);
    return document;
}
/// <summary>
/// Opens a PDF from a memory stream, used in place of calling PdfReader.Open directly.
/// PDFsharp is tried first. If it rejects the file with a PdfReaderException, the document
/// is rewritten with an iTextSharp PdfStamper (form fields flattened, PDF version set to 1.4)
/// into a new in-memory stream, and PDFsharp opens that rewritten copy instead.
/// </summary>
/// <param name="sourceStream">Stream holding the PDF file content; it is rewound before reading.</param>
/// <param name="password">Password for the document, or null when there is none.</param>
/// <param name="openmode">PDFsharp open mode for the returned document.</param>
/// <returns>The opened document.</returns>
/// <exception cref="ArgumentNullException"><paramref name="sourceStream"/> is null.</exception>
public static PdfDocument Open(MemoryStream sourceStream, string password, PdfDocumentOpenMode openmode)
{
    if (sourceStream == null)
    {
        throw new ArgumentNullException("sourceStream");
    }

    sourceStream.Position = 0;
    try
    {
        // The previous implementation also re-parsed the document with iTextSharp here and
        // threw the result away; that extra pass could only fail, so it has been removed.
        return (password == null) ?
            PdfReader.Open(sourceStream, openmode) :
            PdfReader.Open(sourceStream, password, openmode);
    }
    catch (PdfSharp.Pdf.IO.PdfReaderException)
    {
        // Workaround if PDFsharp doesn't support this pdf: rewrite it through iTextSharp.
        sourceStream.Position = 0;
        MemoryStream outputStream = new MemoryStream();
        iTextSharp.text.pdf.PdfReader reader = (password == null) ?
            new iTextSharp.text.pdf.PdfReader(sourceStream) :
            new iTextSharp.text.pdf.PdfReader(sourceStream, System.Text.ASCIIEncoding.ASCII.GetBytes(password));
        try
        {
            iTextSharp.text.pdf.PdfStamper pdfStamper = new iTextSharp.text.pdf.PdfStamper(reader, outputStream);
            pdfStamper.FormFlattening = true;
            pdfStamper.Writer.SetPdfVersion(iTextSharp.text.pdf.PdfWriter.PDF_VERSION_1_4);
            // Keep outputStream open after the stamper is closed; PDFsharp reads it next.
            pdfStamper.Writer.CloseStream = false;
            pdfStamper.Close();
        }
        finally
        {
            reader.Close();
        }

        // The stamper leaves the stream positioned at its end; rewind before re-reading.
        outputStream.Position = 0;
        return PdfReader.Open(outputStream, openmode);
    }
}
/// <summary>
/// Walks the PDF document tree recursively, starting at <paramref name="startingObject"/>,
/// and collects the objects whose key names match the requested names.
/// </summary>
/// <param name="objectHierarchy">Names of the objects to look for. Wildcards can be used in
/// element names, e.g., /F*. When followHierarchy is true the order is a top-down path, so a
/// single name must sit directly below startingObject; set followHierarchy to false to find
/// it anywhere in the tree.</param>
/// <param name="startingObject">The PDF object to search from. This will likely be a document
/// or a page, but could be any lower-level item.</param>
/// <param name="followHierarchy">If true, only the branch described by objectHierarchy is
/// searched, one name per level. If false, any item matching any of the names is collected.</param>
/// <returns>The list of matching items, filled in by the recursive overload.</returns>
public static List<PdfItem> FindObjects(string[] objectHierarchy, PdfItem startingObject, bool followHierarchy)
{
    const int rootLevel = 0;
    List<PdfItem> matches = new List<PdfItem>();
    FindObjects(objectHierarchy, startingObject, followHierarchy, ref matches, rootLevel);
    return matches;
}
/// <summary>
/// Recursive worker behind the public FindObjects: inspects every key of
/// <paramref name="startingObject"/> and descends into each matching entry.
/// </summary>
/// <param name="objectHierarchy">Names (optionally containing one '*' wildcard) to match.</param>
/// <param name="startingObject">Item to inspect; anything that is not a dictionary is ignored.</param>
/// <param name="followHierarchy">True to match only objectHierarchy[Level] at each level;
/// false to match any of the names at every level.</param>
/// <param name="results">List that receives the matching items.</param>
/// <param name="Level">Depth of this call, starting at 0.</param>
static private void FindObjects(string[] objectHierarchy, PdfItem startingObject, bool followHierarchy, ref List<PdfItem> results, int Level)
{
    // Matched entries are not always dictionaries (numbers, names, arrays, unresolved
    // references); there is nothing to descend into, and casting them would throw.
    PdfDictionary dictionary = startingObject as PdfDictionary;
    if (dictionary == null)
    {
        return;
    }

    // Below the last level of the hierarchy nothing can match any more.
    if (followHierarchy && Level >= objectHierarchy.Length)
    {
        return;
    }

    foreach (PdfName keyName in dictionary.Elements.KeyNames)
    {
        bool matchFound = false;
        if (!followHierarchy)
        {
            // We need to check all items for a match, not just the one for this level
            for (int i = 0; i < objectHierarchy.Length && !matchFound; i++)
            {
                matchFound = NameMatchesPattern(keyName.Value, objectHierarchy[i]);
            }
        }
        else
        {
            // Check the item in the hierarchy at this level for a match
            matchFound = NameMatchesPattern(keyName.Value, objectHierarchy[Level]);
        }

        if (!matchFound)
        {
            System.Diagnostics.Debug.WriteLine("Level " + Level.ToString() + " - " + keyName.ToString() + " unmatched");
            continue;
        }

        // Resolve indirect references to the object they point at.
        PdfItem item = dictionary.Elements[keyName];
        PdfSharp.Pdf.Advanced.PdfReference reference = item as PdfSharp.Pdf.Advanced.PdfReference;
        if (reference != null)
        {
            item = reference.Value;
        }
        System.Diagnostics.Debug.WriteLine("Level " + Level.ToString() + " - " + keyName.ToString() + " matched");

        // Without a hierarchy every match is a result; with one, only a match
        // at the last level of the hierarchy is the target.
        if (!followHierarchy || Level == objectHierarchy.Length - 1)
        {
            results.Add(item);
        }

        // Call back to this function to search lower levels
        FindObjects(objectHierarchy, item, followHierarchy, ref results, Level + 1);
    }
    System.Diagnostics.Debug.WriteLine("Level " + (Level - 1).ToString());
}

/// <summary>
/// Tests a PDF key name against a pattern that is either an exact name or contains a '*'
/// wildcard: the text before the first '*' must start the name and the text after it must end it.
/// </summary>
static private bool NameMatchesPattern(string name, string pattern)
{
    if (name == null || pattern == null)
    {
        return false;
    }
    if (name == pattern)
    {
        return true;
    }

    int star = pattern.IndexOf('*');
    if (star < 0)
    {
        return false;
    }

    // The prefix is everything before the '*'. (Cutting one character earlier would turn
    // "/F*" into the prefix "/", which matches every name, and would throw for "*".)
    string prefix = pattern.Substring(0, star);
    string suffix = pattern.Substring(star + 1);

    // The length check stops prefix and suffix from overlapping inside a short name.
    return name.Length >= prefix.Length + suffix.Length
        && name.StartsWith(prefix, StringComparison.Ordinal)
        && name.EndsWith(suffix, StringComparison.Ordinal);
}
/// <summary>
/// Uses the /ToUnicode CMaps of the supplied fonts to translate CID encoded text
/// (hex strings shown with "Td &lt;...&gt; Tj") into readable text.
/// </summary>
/// <remarks>
/// Only the first beginbfrange/endbfrange section of each CMap is used, and each of its
/// lines is treated as a direct "first code -> last value" mapping (ranges are not expanded).
/// NOTE(review): the CMaps are keyed as "/F" + the font's index in <paramref name="PDFFonts"/>;
/// this assumes the page's fonts are named /F0, /F1, ... in list order - confirm against the caller.
/// </remarks>
/// <param name="unreadableText">The content stream that needs to be decoded</param>
/// <param name="PDFFonts">A list of /Font dictionaries, each expected to hold a /ToUnicode entry with a CMap</param>
/// <returns>The decoded text, or an empty string when a line cannot be translated.</returns>
/// <exception cref="ArgumentNullException">An argument is null.</exception>
static public string FromUnicode(PdfDictionary.PdfStream unreadableText, List<PdfItem> PDFFonts)
{
    if (unreadableText == null)
    {
        throw new ArgumentNullException("unreadableText");
    }
    if (PDFFonts == null)
    {
        throw new ArgumentNullException("PDFFonts");
    }

    // Accept CRLF, LF and CR line ends; splitting on "\r\n" alone collapses an
    // LF-only CMap into a single unusable entry.
    string[] lineBreaks = new string[] { "\r\n", "\n", "\r" };

    Dictionary<string, string[]> fonts = new Dictionary<string, string[]>();
    // Get the CMap from each font in the passed list and store them by font name
    for (int font = 0; font < PDFFonts.Count; font++)
    {
        PdfDictionary fontDictionary = PDFFonts[font] as PdfDictionary;
        if (fontDictionary == null)
        {
            continue;
        }
        foreach (PdfName keyName in fontDictionary.Elements.KeyNames)
        {
            if (keyName.Value != "/ToUnicode")
            {
                continue;
            }
            PdfItem item = fontDictionary.Elements[keyName];
            PdfSharp.Pdf.Advanced.PdfReference reference = item as PdfSharp.Pdf.Advanced.PdfReference;
            if (reference != null)
            {
                item = reference.Value;
            }
            PdfDictionary toUnicode = item as PdfDictionary;
            if (toUnicode != null && toUnicode.Stream != null)
            {
                string FontName = "/F" + font.ToString();
                string CMap = toUnicode.Stream.ToString();
                int begin = CMap.IndexOf("beginbfrange", StringComparison.Ordinal);
                if (begin >= 0)
                {
                    CMap = CMap.Substring(begin + "beginbfrange".Length);
                    int end = CMap.IndexOf("endbfrange", StringComparison.Ordinal);
                    if (end >= 0)
                    {
                        CMap = CMap.Substring(0, end);
                        fonts.Add(FontName, CMap.Split(lineBreaks, StringSplitOptions.RemoveEmptyEntries));
                    }
                }
            }
            break;
        }
    }

    // Holds the final result to be returned
    System.Text.StringBuilder resultString = new System.Text.StringBuilder();
    // Break the input text into lines
    string[] lines = unreadableText.ToString().Split(lineBreaks, StringSplitOptions.RemoveEmptyEntries);

    // The default font is /F0. It may have no usable CMap, in which case an empty
    // table is used (codes stay untranslated) instead of failing on the lookup.
    string[] defaultFontRef;
    if (!fonts.TryGetValue("/F0", out defaultFontRef))
    {
        defaultFontRef = new string[0];
    }
    // Holds the last font reference and therefore the CMap table
    // to be used for any text found after it
    string[] currentFontRef = defaultFontRef;
    // Are we in a block of text or not? They can break across lines so we need an identifier
    bool blnInText = false;

    for (int line = 0; line < lines.Length; line++)
    {
        string rawLine = lines[line].Trim();
        string thisLine = rawLine;
        if (thisLine == "q")
        {
            // Treated as the start of a text block, where we reset to the default font
            currentFontRef = defaultFontRef;
        }
        else
        {
            int textStart = thisLine.IndexOf(" Td <", StringComparison.Ordinal);
            if (textStart != -1)
            {
                thisLine = thisLine.Substring(textStart + 5);
                blnInText = true;
            }
        }

        if (thisLine.EndsWith("Tf", StringComparison.Ordinal))
        {
            // This is a font assignment. Take note of it and use this font's ToUnicode map
            // when we find text. A line without a space carries no font name to look up.
            int space = thisLine.IndexOf(' ');
            string[] selectedFontRef;
            if (space >= 0 && fonts.TryGetValue(thisLine.Substring(0, space), out selectedFontRef))
            {
                currentFontRef = selectedFontRef;
            }
        }
        else if (thisLine.EndsWith("> Tj", StringComparison.Ordinal))
        {
            thisLine = thisLine.Substring(0, thisLine.IndexOf("> Tj", StringComparison.Ordinal));
        }

        if (blnInText)
        {
            // This is a text block
            try
            {
                // Wrap every group of 4 characters in angle brackets.
                // This will directly match the items in the CMap but also keeps the loop
                // below from translating an already translated item a second time.
                string unicodeStr = "<" + String.Join("><", thisLine.SplitInParts(4)) + ">";
                for (int transform = 0; transform < currentFontRef.Length; transform++)
                {
                    string entry = currentFontRef[transform];

                    // The first <...> group on the line is the glyph code...
                    int firstOpen = entry.IndexOf('<');
                    if (firstOpen < 0)
                    {
                        continue; // not a mapping line
                    }
                    string glyph = entry.Substring(firstOpen);
                    int firstClose = glyph.IndexOf('>');
                    if (firstClose < 0)
                    {
                        continue; // malformed mapping line
                    }
                    glyph = glyph.Substring(0, firstClose + 1);

                    // ...and the last <...> group is the unicode value of the glyph
                    string counterpart = entry.Substring(entry.LastIndexOf('<') + 1);
                    int lastClose = counterpart.LastIndexOf('>');
                    if (lastClose < 0)
                    {
                        continue; // malformed mapping line
                    }
                    counterpart = counterpart.Substring(0, lastClose);

                    // Replace each item that matches with an escaped \uXXXX sequence
                    unicodeStr = unicodeStr.Replace(glyph, "\\u" + counterpart);
                    if (unicodeStr.IndexOf('>') < 0)
                    {
                        // No bracketed code is left, so all items have been replaced
                        break;
                    }
                }
                resultString.Append(System.Text.RegularExpressions.Regex.Unescape(unicodeStr));
            }
            catch
            {
                // Best effort, as before: text that cannot be translated yields no output.
                return "";
            }
        }

        if (rawLine.EndsWith("> Tj", StringComparison.Ordinal))
        {
            blnInText = false;
            if (rawLine.IndexOf(" 0 Td <", StringComparison.Ordinal) == -1)
            {
                // The vertical coords have changed, so add a new line
                resultString.Append(Environment.NewLine);
            }
            else
            {
                resultString.Append(" ");
            }
        }
    }
    return resultString.ToString();
}
// Credit to http://stackoverflow.com/questions/4133377/
/// <summary>
/// Lazily cuts a string into consecutive pieces of <paramref name="partLength"/> characters;
/// the final piece holds whatever is left and may be shorter.
/// Because this is an iterator, the argument checks run when enumeration starts.
/// </summary>
/// <param name="s">The string to cut up.</param>
/// <param name="partLength">Length of each piece; must be positive.</param>
/// <returns>The pieces of <paramref name="s"/> in order.</returns>
private static IEnumerable<String> SplitInParts(this String s, Int32 partLength)
{
    if (s == null)
    {
        throw new ArgumentNullException("s");
    }
    if (partLength <= 0)
    {
        throw new ArgumentException("Part length has to be positive.", "partLength");
    }

    int offset = 0;
    while (offset < s.Length)
    {
        int remaining = s.Length - offset;
        int take = remaining < partLength ? remaining : partLength;
        yield return s.Substring(offset, take);
        offset += partLength;
    }
}
}
}
/// <summary>
/// Extracts plain text from an uncompressed PDF content stream by scanning it byte by byte
/// for literal strings "(...)" inside text objects (BT ... ET).
/// </summary>
public class PDFParser
{
    // Content stream operators the scanner reacts to:
    //   BT = Beginning of a text object operator
    //   ET = End of a text object operator
    //   Td / TD = move to the start of the next line   -> "\n\r" is emitted
    //   T*, ', " = next-line operators                 -> "\n" is emitted
    //   Tj = show text                                 -> " " is emitted
    //   5 Ts = superscript, -5 Ts = subscript (not interpreted)

    #region Fields
    /// <summary>
    /// The number of most recent characters kept for operator detection.
    /// </summary>
    private const int NumberOfCharsToKeep = 15;

    // Token sets are shared instead of being allocated again for every input byte.
    private static readonly string[] BeginTextTokens = { "BT" };
    private static readonly string[] EndTextTokens = { "ET" };
    private static readonly string[] MoveTokens = { "TD", "Td" };
    private static readonly string[] NextLineTokens = { "'", "T*", "\"" };
    private static readonly string[] ShowTextTokens = { "Tj" };
    #endregion

    #region ExtractTextFromPDFBytes
    /// <summary>
    /// Processes an uncompressed content stream and extracts the text of its literal strings.
    /// </summary>
    /// <param name="input">The uncompressed content stream bytes.</param>
    /// <returns>The extracted text; an empty string for null or empty input, or when
    /// extraction fails.</returns>
    public string ExtractTextFromPDFBytes(byte[] input)
    {
        if (input == null || input.Length == 0)
        {
            return "";
        }

        try
        {
            System.Text.StringBuilder resultString = new System.Text.StringBuilder();
            // Flag showing if we are currently inside a text object
            bool inTextObject = false;
            // Flag showing if the next character is literal,
            // e.g. '\\' to get a '\' character or '\(' to get '('
            bool nextLiteral = false;
            // () bracket nesting level. Text appears inside ()
            int bracketDepth = 0;
            // Sliding window over the most recent characters, used to spot operators
            char[] previousCharacters = new char[NumberOfCharsToKeep];
            for (int j = 0; j < NumberOfCharsToKeep; j++)
            {
                previousCharacters[j] = ' ';
            }

            for (int i = 0; i < input.Length; i++)
            {
                char c = (char)input[i];
                if (inTextObject)
                {
                    // Operators are only looked for outside of a string. The window does not
                    // contain c yet, so each operator is seen exactly once: on the character
                    // that follows its trailing delimiter.
                    if (bracketDepth == 0)
                    {
                        if (CheckToken(MoveTokens, previousCharacters))
                        {
                            resultString.Append("\n\r");
                        }
                        else if (CheckToken(NextLineTokens, previousCharacters))
                        {
                            resultString.Append("\n");
                        }
                        else if (CheckToken(ShowTextTokens, previousCharacters))
                        {
                            resultString.Append(" ");
                        }
                    }

                    if (bracketDepth == 0 && CheckToken(EndTextTokens, previousCharacters))
                    {
                        // End of a text object
                        inTextObject = false;
                        resultString.Append(" ");
                    }
                    else if (c == '(' && bracketDepth == 0 && !nextLiteral)
                    {
                        // Start outputting text
                        bracketDepth = 1;
                    }
                    else if (c == ')' && bracketDepth == 1 && !nextLiteral)
                    {
                        // Stop outputting text
                        bracketDepth = 0;
                    }
                    else if (bracketDepth == 1)
                    {
                        if (c == '\\' && !nextLiteral)
                        {
                            // Escape character: output the next character as is
                            nextLiteral = true;
                        }
                        else
                        {
                            // Just a normal text character; only printable ones are kept
                            if (((c >= ' ') && (c <= '~')) ||
                                ((c >= 128) && (c < 255)))
                            {
                                resultString.Append(c);
                            }
                            nextLiteral = false;
                        }
                    }
                }

                // Slide the window and store the current character
                System.Array.Copy(previousCharacters, 1, previousCharacters, 0, NumberOfCharsToKeep - 1);
                previousCharacters[NumberOfCharsToKeep - 1] = c;

                // Start of a text object
                if (!inTextObject && CheckToken(BeginTextTokens, previousCharacters))
                {
                    inTextObject = true;
                }
            }
            return resultString.ToString();
        }
        catch
        {
            // Best effort, as before: any failure yields no text rather than an exception.
            return "";
        }
    }
    #endregion

    #region CheckToken
    /// <summary>
    /// Checks whether one of the given operators just came along, i.e. whether it sits at the
    /// end of the recent-character window with a delimiter (space, CR or LF) on both sides.
    /// </summary>
    /// <param name="tokens">The operators to look for (e.g. BT, T*, ').</param>
    /// <param name="recent">The recent character window.</param>
    /// <returns>True when any of the tokens matches.</returns>
    private bool CheckToken(string[] tokens, char[] recent)
    {
        int last = NumberOfCharsToKeep - 1;
        foreach (string token in tokens)
        {
            if (string.IsNullOrEmpty(token))
            {
                continue;
            }

            // Window layout: [before] token... [last]. Tokens of any length are handled;
            // previously the first one-character token ended the whole search, so the T*
            // in { ', T*, " } was never reached.
            int before = last - token.Length - 1;
            if (before < 0 || !IsDelimiter(recent[last]) || !IsDelimiter(recent[before]))
            {
                continue;
            }

            bool matches = true;
            for (int k = 0; k < token.Length; k++)
            {
                if (recent[before + 1 + k] != token[k])
                {
                    matches = false;
                    break;
                }
            }
            if (matches)
            {
                return true;
            }
        }
        return false;
    }

    /// <summary>
    /// True for the characters that may separate operators: space, carriage return, line feed.
    /// </summary>
    private static bool IsDelimiter(char c)
    {
        return c == ' ' || c == 0x0d || c == 0x0a;
    }
    #endregion
}
最終的に実用的なソリューションをまとめることができたのは、助言やスニペットを提供してくださった皆さんのおかげです。ありがとうございました。