1
0
mirror of https://github.com/gryf/ebook-converter.git synced 2026-01-05 18:54:11 +01:00
Files
ebook-converter/ebook_converter/ebooks/metadata/pdf.py

181 lines
6.3 KiB
Python

"""
Read meta information from PDF files
"""
import os, subprocess, shutil, re
from functools import partial
from ebook_converter import prints
from ebook_converter.constants import iswindows, ispy3
from ebook_converter.ptempfile import TemporaryDirectory
from ebook_converter.ebooks.metadata import (
MetaInformation, string_to_authors, check_isbn, check_doi)
from ebook_converter.utils.ipc.simple_worker import fork_job, WorkerError
from ebook_converter.polyglot.builtins import iteritems
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
def get_tools():
    """Return the paths of the ``pdfinfo`` and ``pdftoppm`` executables.

    Both paths are built in the directory part of ``PDFTOHTML`` (imported
    from ``ebook_converter.ebooks.pdf.pdftohtml``); on Windows the
    ``.exe`` extension is appended.

    :return: a ``(pdfinfo, pdftoppm)`` tuple of path strings.
    """
    # Imported at call time, not at module import time.
    from ebook_converter.ebooks.pdf.pdftohtml import PDFTOHTML

    tool_dir = os.path.dirname(PDFTOHTML)
    extension = '.exe' if iswindows else ''
    pdfinfo, pdftoppm = (
        os.path.join(tool_dir, tool_name) + extension
        for tool_name in ('pdfinfo', 'pdftoppm')
    )
    return pdfinfo, pdftoppm
def read_info(outputdir, get_cover):
    """Read the Info dict, XMP packet and optionally the cover of a PDF.

    The PDF must be stored as ``src.pdf`` inside ``outputdir``. All paths
    handed to the poppler tools are built from ``outputdir``; the current
    working directory is not changed. Because only the ``src.pdf`` copy is
    passed to the tools, they never open the caller's original file.

    :param outputdir: directory that contains ``src.pdf``. When a cover is
        requested, pdftoppm is given ``<outputdir>/cover`` as its output
        file root.
    :param get_cover: when true, additionally run pdftoppm with
        ``-singlefile -jpeg -cropbox`` to render a cover image. A failure
        of that step is only reported, it does not discard the result.
    :return: ``None`` if pdfinfo exits with an error or its output is not
        valid UTF-8. Otherwise a dict mapping each non-empty Info dict
        field name to its stripped string value, plus an optional
        ``'xmp_metadata'`` key holding the raw XMP packet as ``bytes``.
    """
    pdfinfo, pdftoppm = get_tools()
    source_file = os.path.join(outputdir, 'src.pdf')
    cover_file = os.path.join(outputdir, 'cover')
    ans = {}

    try:
        raw = subprocess.check_output(
            [pdfinfo, '-enc', 'UTF-8', '-isodates', source_file])
    except subprocess.CalledProcessError as e:
        prints('pdfinfo errored out with return code: %d' % e.returncode)
        return None
    try:
        info_raw = raw.decode('utf-8')
    except UnicodeDecodeError:
        prints('pdfinfo returned no UTF-8 data')
        return None

    # pdfinfo prints one "Field: value" pair per line; only the first colon
    # separates the name from the value (values such as dates contain more).
    for line in info_raw.splitlines():
        field, separator, val = line.partition(':')
        val = val.strip()
        if separator and field and val:
            ans[field] = val

    # Now read XMP metadata
    # Versions of poppler before 0.47.0 used to print out both the Info dict
    # and XMP metadata packet together. However, since that changed in
    # https://cgit.freedesktop.org/poppler/poppler/commit/?id=c91483aceb1b640771f572cb3df9ad707e5cad0d
    # we can no longer rely on it.
    try:
        raw = subprocess.check_output([pdfinfo, '-meta', source_file]).strip()
    except subprocess.CalledProcessError as e:
        prints('pdfinfo failed to read XML metadata with return code: %d'
               % e.returncode)
    else:
        # maxsplit is passed by keyword: passing it positionally to
        # re.split() is deprecated since Python 3.13.
        parts = re.split(br'^Metadata:', raw, maxsplit=1, flags=re.MULTILINE)
        if len(parts) > 1:
            # old poppler < 0.47.0
            raw = parts[1].strip()
        if raw:
            ans['xmp_metadata'] = raw

    if get_cover:
        try:
            subprocess.check_call([pdftoppm, '-singlefile', '-jpeg',
                                   '-cropbox', source_file, cover_file])
        except subprocess.CalledProcessError as e:
            prints('pdftoppm errored out with return code: %d' % e.returncode)

    return ans
def page_images(pdfpath, outputdir='.', first=1, last=1, image_format='jpeg', prefix='page-images'):
    """Render a range of PDF pages to image files by running pdftoppm.

    :param pdfpath: path of the PDF to render.
    :param outputdir: directory for the images; made absolute before use.
    :param first: first page number, passed to pdftoppm as ``-f``.
    :param last: last page number, passed to pdftoppm as ``-l``.
    :param image_format: output format, passed to pdftoppm as
        ``-<image_format>``.
    :param prefix: joined with ``outputdir`` to form the output file root
        given to pdftoppm.
    :raises ValueError: if pdftoppm exits with a non-zero return code.
    """
    pdftoppm = get_tools()[1]
    outputdir = os.path.abspath(outputdir)

    popen_kwargs = {}
    if iswindows:
        # Windows only: high priority, and no console window for the child.
        import win32process as w
        popen_kwargs['creationflags'] = (
            w.HIGH_PRIORITY_CLASS | w.CREATE_NO_WINDOW)

    try:
        command = [pdftoppm, '-cropbox', '-' + image_format]
        command += ['-f', str(first), '-l', str(last)]
        command += [pdfpath, os.path.join(outputdir, prefix)]
        subprocess.check_call(command, **popen_kwargs)
    except subprocess.CalledProcessError as e:
        raise ValueError('Failed to render PDF, pdftoppm errorcode: %s'%e.returncode)
def is_pdf_encrypted(path_to_pdf):
    """Report whether pdfinfo describes the given PDF as encrypted.

    :param path_to_pdf: path of the PDF. When not running on Python 3 a
        non-bytes path is encoded first (``mbcs`` on Windows, ``utf-8``
        elsewhere).
    :return: ``True`` only if the pdfinfo output has an ``Encrypted:`` line
        whose first token is ``yes``; ``False`` otherwise.
    :raises subprocess.CalledProcessError: if pdfinfo exits with a non-zero
        return code (the error is not caught here).
    """
    must_encode = not (ispy3 or isinstance(path_to_pdf, bytes))
    if must_encode:
        path_to_pdf = path_to_pdf.encode('mbcs' if iswindows else 'utf-8')

    pdfinfo = get_tools()[0]
    output = subprocess.check_output([pdfinfo, path_to_pdf])

    # The output is bytes, so both the pattern and the comparison use bytes.
    match = re.search(br'^Encrypted:\s*(\S+)', output, flags=re.MULTILINE)
    if match is None:
        return False
    return match.group(1) == b'yes'
def get_metadata(stream, cover=True):
    """Build a MetaInformation object from a PDF supplied as a stream.

    The stream is copied to ``src.pdf`` in a temporary directory, which is
    then handed to ``read_info``; the poppler tools only see that copy.

    :param stream: seekable binary file-like object containing the PDF. It
        is rewound to offset 0 before being copied.
    :param cover: when true, ask ``read_info`` to render a cover and, if
        ``cover.jpg`` was produced, attach it as ``mi.cover_data``.
    :return: the populated metadata object.
    :raises ValueError: if ``read_info`` returns ``None``.
    """
    with TemporaryDirectory('_pdf_metadata_read') as workdir:
        stream.seek(0)
        with open(os.path.join(workdir, 'src.pdf'), 'wb') as pdf_copy:
            shutil.copyfileobj(stream, pdf_copy)

        info = read_info(workdir, bool(cover))
        if info is None:
            raise ValueError('Could not read info dict from PDF')

        # The cover must be read before the temporary directory goes away.
        cover_path = os.path.join(workdir, 'cover.jpg')
        cdata = None
        if cover and os.path.exists(cover_path):
            with open(cover_path, 'rb') as cover_file:
                cdata = cover_file.read()

    title = info.get('Title') or _('Unknown')
    author_string = info.get('Author')
    if author_string is not None:
        authors = string_to_authors(author_string)
    else:
        authors = [_('Unknown')]
    mi = MetaInformation(title, authors)

    creator = info.get('Creator')
    if creator:
        mi.book_producer = creator

    # Keywords are a comma separated list. The first entry that validates
    # as an ISBN becomes mi.isbn and is dropped from the tags.
    keywords = info.get('Keywords')
    mi.tags = []
    if keywords:
        mi.tags = [keyword.strip() for keyword in keywords.split(',')]
        isbn = [check_isbn(tag) for tag in mi.tags if check_isbn(tag)]
        if isbn:
            isbn = isbn[0]
            mi.isbn = isbn
        mi.tags = [tag for tag in mi.tags if check_isbn(tag) != isbn]

    subject = info.get('Subject')
    if subject:
        mi.tags.insert(0, subject)

    if 'xmp_metadata' in info:
        from ebook_converter.ebooks.metadata.xmp import consolidate_metadata
        mi = consolidate_metadata(mi, info)

    # Look for recognizable identifiers in the info dict, if they were not
    # found in the XMP metadata
    checkers = {'doi': check_doi, 'isbn': check_isbn}
    for scheme, check_func in iteritems(checkers):
        if scheme in mi.get_identifiers():
            continue
        for key, value in iteritems(info):
            if key == 'xmp_metadata':
                continue
            identifier = check_func(value)
            if identifier:
                mi.set_identifier(scheme, identifier)
                break

    if cdata:
        mi.cover_data = ('jpeg', cdata)
    return mi
# Variant of get_metadata() with cover=False pre-bound: read_info() is then
# called with get_cover=False and no cover data is attached to the result.
get_quick_metadata = partial(get_metadata, cover=False)
#from ebook_converter.utils.podofo import set_metadata as podofo_set_metadata
def set_metadata(stream, mi):
    """Placeholder for writing metadata into a PDF; it does nothing.

    The podofo-based ``set_metadata`` import is commented out in this
    module, so both arguments are ignored.

    :param stream: unused.
    :param mi: unused.
    :return: always ``None``.
    """
    return