c# - PDF 内のテキストから書誌データを取得し、ウィンドウフォームにエクスポートする

Question

以下のコードを使用して、iText5 for .NET を使用して PDF からテキストを抽出します。

/// <summary>
/// Extracts the text of the first page of "Scharfetter1969.pdf" and shows it in textBox1.
/// </summary>
/// <param name="sender">The control that raised the click event (unused).</param>
/// <param name="e">Event data (unused).</param>
private void button1_Click(object sender, EventArgs e)
{
  const string pdfPath = "Scharfetter1969.pdf";
  //Only the first page is read, exactly as the original single-iteration loop did
  const int firstPage = 1;

  //Open the document once; previously it was opened a second time only to read a page count that was never used
  PdfReader reader = new PdfReader(pdfPath);
  try
  {
    textBox1.Text = "";

    //Nothing to extract from a document without pages
    if (reader.NumberOfPages < firstPage)
    {
      return;
    }

    ITextExtractionStrategy its = new iTextSharp.text.pdf.parser.SimpleTextExtractionStrategy();

    //GetTextFromPage already returns a .NET string. The former round trip through
    //Encoding.Default replaced every character outside the ANSI code page with '?',
    //so the text is assigned to the text box as-is.
    textBox1.Text = PdfTextExtractor.GetTextFromPage(reader, firstPage, its);
  }
  finally
  {
    //Release the file even when extraction throws
    reader.Close();
  }
}

しかし、研究論文pdfから書誌データを取得したいです。

これは、この pdf から抽出したいデータの例です (EndNote 形式)。ここにリンクがあります。

%0 Journal Article
%T Repeated temperature modulation epitaxy for p-type doping and light-emitting diode based on ZnO
%A Tsukazaki, A.
%A Ohtomo, A.
%A Onuma, T.
%A Ohtani, M.
%A Makino, T.
%A Sumiya, M.
%A Ohtani, K.
%A Chichibu, S.F.
%A Fuke, S.
%A Segawa, Y.
%J Nature Materials
%V 4
%N 1
%P 42-46
%@ 1476-1122
%D 2004
%I Nature Publishing Group

ただし、これは書誌情報であり、この pdf のメタデータからは取得できないことに注意してください。記事の種類 (%0)、タイトル (%T)、著者 (%A)、日付 (%D)、および出版社 (%I) を取得し、Windows フォーム上でそれぞれに割り当てた別々のテキストボックスに表示したいと考えています。

このためのコードをお持ちの方、またはこれを行う方法を教えていただける方がいれば助かります。C# を使用しています。

score 2 · Accepted Answer

PDF は一方向の形式です。すべてのデバイス (モニター、プリンターなど) で一貫して表示されるようにデータを入れますが、その形式はデータを取り出すことを意図したものではありません。データを取り出そうとする試みは、すべて純粋な当て推量になります。iText の PdfTextExtractor は機能しますが、独自の任意のルールセットに基づいて断片をつなぎ合わせる必要があり、これらのルールはおそらく PDF ごとに変わります。提供された PDF は InDesign によって作成されたもので、InDesign はテキストの見栄えを良くするために細かい処理を行うため、実際にはデータを解析するのがさらに難しくなっています。

とはいえ、PDF がすべて視覚的に一貫している場合は、フォーマットを保持したままデータを取り出し、フォーマットルールを使用して何が何であるかを推測することができます。その投稿により、推測できる HTML 形式が得られます。(これが実際に機能する場合は、HTML よりも具体的なものを返すことをお勧めしますが、それはあなたに任せます。)

提供された PDF に対して実行すると、タイトルが約 17pt の HelveticaNeue-LightExt フォントを使用していることがわかるので、そのフォントをそのサイズで使用しているすべての行を探して結合するルールを作成できます。著者は約 10pt の HelveticaNeue-Condensed で組まれているので、それが別のルールになります。

以下のコードは、上記のリンクの修正版です。iTextSharp 5.1.1.0 を対象とする、完全に動作する C# 2010 WinForms アプリです。提供された PDF のタイトルと作成者を引き出しますが、他の PDF とメタデータについては微調整する必要があります。具体的な実装の詳細については、コード内のコメントを参照してください。

using System;
using System.Collections.Generic;
using System.Text;
using System.Windows.Forms;
using iTextSharp.text.pdf.parser;
using iTextSharp.text.pdf;

namespace WindowsFormsApplication1
{
    public partial class Form1 : Form
    {
        /// <summary>
        /// Creates the form and runs <c>InitializeComponent</c>.
        /// </summary>
        //NOTE(review): InitializeComponent is presumably the designer-generated half of this partial class - confirm
        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Reads page 1 of "nmat4-42.pdf" from the desktop, guesses the title and the
        /// authors from the font name and font size of each extracted line, writes the
        /// result to the console and then closes the form.
        /// </summary>
        /// <param name="sender">The form that raised the Load event (unused).</param>
        /// <param name="e">Event data (unused).</param>
        private void Form1_Load(object sender, EventArgs e)
        {
            string pdfPath = System.IO.Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.Desktop), "nmat4-42.pdf");
            TextWithFontExtractionStategy S = new TextWithFontExtractionStategy();
            string F;

            //Close the reader even when extraction throws; it was previously never closed
            PdfReader reader = new PdfReader(pdfPath);
            try
            {
                F = iTextSharp.text.pdf.parser.PdfTextExtractor.GetTextFromPage(reader, 1, S);
            }
            finally
            {
                reader.Close();
            }

            //Buffers to hold various parts from the PDF
            List<string> titles = new List<string>();
            List<string> authors = new List<string>();

            //Built once instead of once per line: strips the opening and closing span tags
            System.Text.RegularExpressions.Regex spanTag = new System.Text.RegularExpressions.Regex("</?span.*?>");
            //Characters trimmed from both ends of an author name
            char[] authorTrimChars = new char[] { ' ', ',', '*' };

            //The strategy terminates its tags with AppendLine, so every span ends up on its own line
            string[] lines = F.Split(new string[] { Environment.NewLine }, StringSplitOptions.None);

            //Loop through each line in the array
            foreach (string line in lines)
            {
                //See if the line looks like a "title"
                //NOTE(review): the size literals below depend on how the strategy formats a float - confirm on the target runtime
                if (line.Contains("HelveticaNeue-LightExt") && line.Contains("font-size:17.28003"))
                {
                    //Remove the HTML tags
                    titles.Add(spanTag.Replace(line, "").Trim());
                }
                //See if the line looks like an "author"
                else if (line.Contains("HelveticaNeue-Condensed") && line.Contains("font-size:9.995972"))
                {
                    //Remove the HTML tags and trim extra characters
                    string name = spanTag.Replace(line, "").Trim(authorTrimChars);
                    //Make sure we have a valid name, probably need some more exceptions here, too
                    if (!string.IsNullOrWhiteSpace(name) && name != "AND")
                    {
                        authors.Add(name);
                    }
                }
            }

            //Write out the title to the console; a title that wraps over several lines is joined with spaces
            Console.WriteLine("Title  : {0}", string.Join(" ", titles.ToArray()));
            //Write out each author
            foreach (string author in authors)
            {
                Console.WriteLine("Author : {0}", author);
            }
            //Dump the raw HTML so new font rules can be worked out for other PDFs
            Console.WriteLine(F);

            this.Close();
        }

        /// <summary>
        /// Text extraction strategy that wraps each run of text in an HTML span tag
        /// carrying the font name and the font size, so a caller can classify text
        /// by its formatting.
        /// </summary>
        public class TextWithFontExtractionStategy : iTextSharp.text.pdf.parser.ITextExtractionStrategy
        {
            //HTML buffer
            private readonly StringBuilder result = new StringBuilder();

            //Store last used properties
            private Vector lastBaseLine;
            private string lastFont;
            private float lastFontSize;

            //http://api.itextpdf.com/itext/com/itextpdf/text/pdf/parser/TextRenderInfo.html
            private enum TextRenderMode
            {
                FillText = 0,
                StrokeText = 1,
                FillThenStrokeText = 2,
                Invisible = 3,
                FillTextAndAddToPathForClipping = 4,
                StrokeTextAndAddToPathForClipping = 5,
                FillThenStrokeTextAndAddToPathForClipping = 6,
                AddTextToPathForClipping = 7
            }

            /// <summary>
            /// Appends one chunk of text to the HTML buffer, opening a new span tag
            /// whenever the baseline, the font name or the font size differs from
            /// the previous chunk.
            /// </summary>
            /// <param name="renderInfo">The chunk of text reported by the parser.</param>
            public void RenderText(iTextSharp.text.pdf.parser.TextRenderInfo renderInfo)
            {
                string curFont = renderInfo.GetFont().PostscriptFontName;
                //Check if faux bold is used
                if ((renderInfo.GetTextRenderMode() == (int)TextRenderMode.FillThenStrokeText))
                {
                    curFont += "-Bold";
                }

                //This code assumes that if the baseline changes then we're on a newline
                Vector curBaseline = renderInfo.GetBaseline().GetStartPoint();
                Vector topRight = renderInfo.GetAscentLine().GetEndPoint();
                //The font size is taken as the height between the baseline start and the end of the ascent line
                iTextSharp.text.Rectangle rect = new iTextSharp.text.Rectangle(curBaseline[Vector.I1], curBaseline[Vector.I2], topRight[Vector.I1], topRight[Vector.I2]);
                Single curFontSize = rect.Height;

                bool isFirstChunk = (this.lastBaseLine == null);
                bool baselineChanged = !isFirstChunk && (curBaseline[Vector.I2] != lastBaseLine[Vector.I2]);

                //See if something has changed, either the baseline, the font or the font size
                if (isFirstChunk || baselineChanged || (curFontSize != lastFontSize) || (curFont != lastFont))
                {
                    //if we've put down at least one span tag close it
                    if (!isFirstChunk)
                    {
                        this.result.AppendLine("</span>");
                    }
                    //If the baseline has changed then insert a line break
                    if (baselineChanged)
                    {
                        this.result.AppendLine("<br />");
                    }
                    //Create an HTML tag with appropriate styles.
                    //The invariant culture keeps the decimal separator a '.' on every machine,
                    //so callers can match on a literal such as "font-size:17.28003".
                    this.result.AppendFormat(System.Globalization.CultureInfo.InvariantCulture, "<span style=\"font-family:{0};font-size:{1}\">", curFont, curFontSize);
                }

                //Append the current text
                //NOTE(review): the text is not HTML-escaped, so a '<' or '&' in the PDF ends up verbatim in the markup
                this.result.Append(renderInfo.GetText());

                //Set currently used properties
                this.lastBaseLine = curBaseline;
                this.lastFontSize = curFontSize;
                this.lastFont = curFont;
            }

            /// <summary>
            /// Returns the HTML collected so far, with the last span tag closed.
            /// </summary>
            /// <returns>The HTML text, or an empty string when nothing was rendered.</returns>
            public string GetResultantText()
            {
                if (result.Length == 0)
                {
                    return string.Empty;
                }
                //If we wrote anything then we'll always have a missing closing tag. Close it on
                //the returned copy only, so that calling this method twice does not stack up
                //closing tags inside the buffer.
                return result.ToString() + "</span>";
            }

            //Not needed
            public void BeginTextBlock() { }
            public void EndTextBlock() { }
            public void RenderImage(ImageRenderInfo renderInfo) { }
        }
    }
}

c# - PDF 内のテキストから書誌データを取得し、ウィンドウ フォームにエクスポートする

1 に答える 1

Related

Reference

c# - PDF 内のテキストから書誌データを取得し、ウィンドウフォームにエクスポートする