私はPDFrwとその例の1つを使用して、PDFファイル内の唯一の画像を抽出し、その画像をPNGまたはJPEGファイルに保存しようとしています。
コードが難しすぎて理解できません。find_objectsにはどのパラメータを渡す必要がありますか?
from pdfrw.objects import PdfDict, PdfArray, PdfName
from pdfrw.pdfwriter import user_fmt
def find_objects(source, valid_types=(PdfName.XObject, None),
                 valid_subtypes=(PdfName.Form, PdfName.Image),
                 no_follow=(PdfName.Parent,),
                 isinstance=isinstance, id=id, sorted=sorted,
                 reversed=reversed, PdfDict=PdfDict):
    '''
    Find all the objects of a particular kind in a document
    or array.  Defaults to looking for Form and Image XObjects.

    This is a generator: nothing is searched until it is iterated.

    Parameters:
        source -- either a single PdfDict, or an iterable of objects
            (for example a list of pages).  A file name is not
            accepted: a string is split into characters, none of which
            is a PdfDict or PdfArray, so nothing would be yielded.
            Parse the file first and pass the parsed objects.
        valid_types -- a dict is yielded only if its Type is in this
            collection.  The None entry presumably matches dicts that
            have no Type entry at all (to be confirmed against PdfDict).
        valid_subtypes -- a dict is yielded only if its Subtype is
            also in this collection.
        no_follow -- dict keys whose values are not traversed.  The
            default skips Parent.
        isinstance, id, sorted, reversed, PdfDict -- builtins and
            classes bound as defaults; callers are not expected to
            pass these.

    Yields each matching PdfDict once; objects are de-duplicated by
    identity, so shared or cyclic references are visited a single time.

    This could be done recursively, but some PDFs
    are quite deeply nested, so we do it without
    recursion.

    Note that we don't know exactly where things appear on pages,
    but we aim for a sort order that is (a) mostly in document order,
    and (b) reproducible.  For arrays, objects are processed in
    array order, and for dicts, they are processed in key order.
    '''
    container = (PdfDict, PdfArray)

    # Allow passing a list of pages, or a dict
    if isinstance(source, PdfDict):
        source = [source]
    else:
        source = list(source)

    visited = set()
    # The list is used as a stack (pop() takes from the end), so it is
    # reversed to make the first item the first one processed.
    source.reverse()
    while source:
        obj = source.pop()
        if not isinstance(obj, container):
            continue
        myid = id(obj)
        if myid in visited:
            continue
        visited.add(myid)
        if isinstance(obj, PdfDict):
            if obj.Type in valid_types and obj.Subtype in valid_subtypes:
                yield obj
            # Descend into the dict's values in key order, skipping the
            # keys listed in no_follow.
            obj = [y for (x, y) in sorted(obj.iteritems())
                   if x not in no_follow]
        else:
            # TODO: This forces resolution of any indirect objects in
            # the array.  It may not be necessary.  Don't know if
            # reversed() does any voodoo underneath the hood.
            # It's cheap enough for now, but might be removeable.
            obj and obj[0]
        # Push children reversed so that pop() returns them in order.
        source.extend(reversed(obj))
if __name__ == '__main__':
    # Imported here because only this script entry point needs the parser.
    from pdfrw import PdfReader

    # find_objects() walks parsed PDF objects, so it cannot be given a file
    # name: the characters of a string are not PdfDict/PdfArray instances and
    # nothing would be yielded.  Parse the file and pass its pages instead.
    pages = PdfReader('target.pdf').pages

    # find_objects() is a generator; it does no work until it is iterated.
    # Restrict the subtypes to Image so Form XObjects are not reported.
    for image in find_objects(pages, valid_subtypes=(PdfName.Image,)):
        # Width/Height/Filter are standard image XObject entries; any that
        # the dictionary lacks is presumably reported as None -- TODO confirm.
        print('%s x %s, filter=%s' % (image.Width, image.Height, image.Filter))