mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-01-05 18:54:11 +01:00
181 lines
6.3 KiB
Python
181 lines
6.3 KiB
Python
"""
|
|
Read meta information from PDF files
|
|
"""
|
|
import os, subprocess, shutil, re
|
|
from functools import partial
|
|
|
|
from ebook_converter import prints
|
|
from ebook_converter.constants import iswindows, ispy3
|
|
from ebook_converter.ptempfile import TemporaryDirectory
|
|
from ebook_converter.ebooks.metadata import (
|
|
MetaInformation, string_to_authors, check_isbn, check_doi)
|
|
from ebook_converter.utils.ipc.simple_worker import fork_job, WorkerError
|
|
from ebook_converter.polyglot.builtins import iteritems
|
|
|
|
|
|
__license__ = 'GPL v3'
|
|
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
|
|
|
|
|
def get_tools():
    """Return the paths of the ``pdfinfo`` and ``pdftoppm`` tools.

    Both paths are built from the directory part of the ``PDFTOHTML`` path,
    with ``.exe`` appended when ``iswindows`` is true.

    :return: a ``(pdfinfo, pdftoppm)`` tuple of path strings. Nothing here
        checks that the files actually exist.
    """
    # Imported at call time rather than at module import time.
    from ebook_converter.ebooks.pdf.pdftohtml import PDFTOHTML

    tools_dir = os.path.dirname(PDFTOHTML)
    ext = '.exe' if iswindows else ''
    pdfinfo, pdftoppm = (os.path.join(tools_dir, tool) + ext
                         for tool in ('pdfinfo', 'pdftoppm'))
    return pdfinfo, pdftoppm
|
|
|
|
|
|
def read_info(outputdir, get_cover):
    """Read the Info dict, XMP packet and optionally the cover of a PDF.

    The PDF must already be stored as ``src.pdf`` inside *outputdir*. The
    poppler tools are run on that copy (paths are built with
    ``os.path.join(outputdir, ...)``), which avoids passing arbitrary
    unicode file names on the command line and means that, if poppler
    crashes, only ``src.pdf`` is affected rather than the original file.

    :param outputdir: directory that contains ``src.pdf``. When a cover is
        requested, pdftoppm is given ``<outputdir>/cover`` as output root.
    :param get_cover: if true, additionally run pdftoppm with
        ``-singlefile -jpeg -cropbox`` to render a cover image.
    :return: a dict mapping each non-empty ``Field: value`` line printed by
        pdfinfo to its stripped value (both ``str``). If ``pdfinfo -meta``
        yields data, it is stored as ``bytes`` under ``'xmp_metadata'``.
        Returns ``None`` when the first pdfinfo call fails or its output is
        not valid UTF-8. Failures of the XMP or cover steps are only
        reported via ``prints`` and do not affect the return value.
    """
    pdfinfo, pdftoppm = get_tools()
    source_file = os.path.join(outputdir, 'src.pdf')
    cover_file = os.path.join(outputdir, 'cover')
    ans = {}

    try:
        raw = subprocess.check_output([pdfinfo, '-enc', 'UTF-8', '-isodates',
                                       source_file])
    except subprocess.CalledProcessError as e:
        prints('pdfinfo errored out with return code: %d'%e.returncode)
        return None
    try:
        info_raw = raw.decode('utf-8')
    except UnicodeDecodeError:
        prints('pdfinfo returned no UTF-8 data')
        return None

    for line in info_raw.splitlines():
        # partition() yields an empty separator for lines without a colon;
        # only the first colon splits, so values may contain colons.
        field, sep, val = line.partition(':')
        val = val.strip()
        if sep and field and val:
            ans[field] = val

    # Now read XMP metadata
    # Versions of poppler before 0.47.0 used to print out both the Info dict
    # and XMP metadata packet together. However, since that changed in
    # https://cgit.freedesktop.org/poppler/poppler/commit/?id=c91483aceb1b640771f572cb3df9ad707e5cad0d
    # we can no longer rely on it.
    try:
        raw = subprocess.check_output([pdfinfo, '-meta', source_file]).strip()
    except subprocess.CalledProcessError as e:
        prints('pdfinfo failed to read XML metadata with return code: %d'%e.returncode)
    else:
        # maxsplit is passed by keyword: passing it positionally is
        # deprecated since Python 3.13.
        parts = re.split(br'^Metadata:', raw, maxsplit=1, flags=re.MULTILINE)
        if len(parts) > 1:
            # old poppler < 0.47.0: keep only what follows "Metadata:"
            raw = parts[1].strip()
        if raw:
            ans['xmp_metadata'] = raw

    if get_cover:
        try:
            subprocess.check_call([pdftoppm, '-singlefile', '-jpeg',
                                   '-cropbox', source_file, cover_file])
        except subprocess.CalledProcessError as e:
            prints('pdftoppm errored out with return code: %d'%e.returncode)

    return ans
|
|
|
|
|
|
def page_images(pdfpath, outputdir='.', first=1, last=1, image_format='jpeg', prefix='page-images'):
    """Render a range of pages of a PDF to image files with pdftoppm.

    :param pdfpath: path of the PDF passed to pdftoppm.
    :param outputdir: directory for the output; made absolute before use.
    :param first: value passed to pdftoppm's ``-f`` option.
    :param last: value passed to pdftoppm's ``-l`` option.
    :param image_format: output format; passed to pdftoppm as
        ``-<image_format>``.
    :param prefix: joined with *outputdir* to form pdftoppm's output root.
    :raises ValueError: if pdftoppm exits with a non-zero return code.
    """
    renderer = get_tools()[1]
    outputdir = os.path.abspath(outputdir)
    output_root = os.path.join(outputdir, prefix)

    popen_kwargs = {}
    if iswindows:
        import win32process
        popen_kwargs['creationflags'] = (
            win32process.HIGH_PRIORITY_CLASS | win32process.CREATE_NO_WINDOW)

    command = [
        renderer, '-cropbox', '-' + image_format,
        '-f', str(first), '-l', str(last),
        pdfpath, output_root,
    ]
    try:
        subprocess.check_call(command, **popen_kwargs)
    except subprocess.CalledProcessError as err:
        raise ValueError('Failed to render PDF, pdftoppm errorcode: %s'%err.returncode)
|
|
|
|
|
|
def is_pdf_encrypted(path_to_pdf):
    """Tell whether pdfinfo reports the given PDF as encrypted.

    Runs ``pdfinfo`` on *path_to_pdf* and looks for a line starting with
    ``Encrypted:`` in its output.

    :param path_to_pdf: path of the PDF. When ``ispy3`` is false and the
        path is not ``bytes``, it is encoded first (``mbcs`` on Windows,
        ``utf-8`` elsewhere).
    :return: ``True`` only if the first token after ``Encrypted:`` is
        exactly ``yes``; ``False`` otherwise, including when no such line
        is present.

    A ``subprocess.CalledProcessError`` raised by ``check_output`` is not
    caught here.
    """
    needs_encoding = not (ispy3 or isinstance(path_to_pdf, bytes))
    if needs_encoding:
        path_to_pdf = path_to_pdf.encode('mbcs' if iswindows else 'utf-8')

    pdfinfo = get_tools()[0]
    output = subprocess.check_output([pdfinfo, path_to_pdf])
    match = re.search(br'^Encrypted:\s*(\S+)', output, flags=re.MULTILINE)
    if match is None:
        return False
    return match.group(1) == b'yes'
|
|
|
|
|
|
def get_metadata(stream, cover=True):
    """Build a MetaInformation object from the PDF held in *stream*.

    The stream is rewound and copied to ``src.pdf`` in a temporary
    directory, ``read_info`` is run on that directory, and the resulting
    dict is mapped onto a ``MetaInformation`` instance.

    :param stream: readable, seekable binary file-like object with the PDF.
    :param cover: if truthy, ask ``read_info`` for a cover and, when
        ``cover.jpg`` exists afterwards, attach it as ``mi.cover_data``.
    :return: the populated ``MetaInformation`` (or whatever
        ``consolidate_metadata`` returns when XMP metadata is present).
    :raises ValueError: if ``read_info`` returns ``None``.

    NOTE(review): ``_`` is not imported in this module; presumably it is a
    translation function installed elsewhere -- confirm.
    """
    with TemporaryDirectory('_pdf_metadata_read') as workdir:
        stream.seek(0)
        with open(os.path.join(workdir, 'src.pdf'), 'wb') as dest:
            shutil.copyfileobj(stream, dest)

        info = read_info(workdir, bool(cover))
        if info is None:
            raise ValueError('Could not read info dict from PDF')

        # The cover has to be read before the temporary directory goes away.
        cover_path = os.path.join(workdir, 'cover.jpg')
        cover_bytes = None
        if cover and os.path.exists(cover_path):
            with open(cover_path, 'rb') as src:
                cover_bytes = src.read()

    title = info.get('Title') or _('Unknown')
    raw_author = info.get('Author')
    authors = ([_('Unknown')] if raw_author is None
               else string_to_authors(raw_author))
    mi = MetaInformation(title, authors)

    producer = info.get('Creator')
    if producer:
        mi.book_producer = producer

    # Keywords become tags; the first keyword that check_isbn() accepts is
    # stored as the ISBN and filtered out of the tags.
    mi.tags = []
    raw_keywords = info.get('Keywords')
    if raw_keywords:
        mi.tags = [kw.strip() for kw in raw_keywords.split(',')]
        isbn = [check_isbn(tag) for tag in mi.tags if check_isbn(tag)]
        if isbn:
            mi.isbn = isbn = isbn[0]
        mi.tags = [tag for tag in mi.tags if check_isbn(tag) != isbn]

    subject = info.get('Subject')
    if subject:
        mi.tags.insert(0, subject)

    if 'xmp_metadata' in info:
        from ebook_converter.ebooks.metadata.xmp import consolidate_metadata
        mi = consolidate_metadata(mi, info)

    # Look for recognizable identifiers in the info dict, if they were not
    # found in the XMP metadata
    checkers = {'doi': check_doi, 'isbn': check_isbn}
    for scheme, check_func in iteritems(checkers):
        if scheme in mi.get_identifiers():
            continue
        for key, value in iteritems(info):
            if key == 'xmp_metadata':
                continue
            found = check_func(value)
            if found:
                # The first matching value wins for this scheme.
                mi.set_identifier(scheme, found)
                break

    if cover_bytes:
        mi.cover_data = ('jpeg', cover_bytes)
    return mi
|
|
|
|
|
|
# Variant of get_metadata() with cover=False pre-bound, so by default no
# cover is requested from read_info().
get_quick_metadata = partial(get_metadata, cover=False)
|
|
|
|
#from ebook_converter.utils.podofo import set_metadata as podofo_set_metadata
|
|
|
|
|
|
def set_metadata(stream, mi):
    """Do nothing: writing metadata into a PDF is not implemented here.

    Neither *stream* nor *mi* is read or modified.

    :return: always ``None``.
    """
    return
|