mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-03-23 10:53:34 +01:00
Added pdf related modules
This commit is contained in:
190
ebook_converter/ebooks/metadata/pdf.py
Normal file
190
ebook_converter/ebooks/metadata/pdf.py
Normal file
@@ -0,0 +1,190 @@
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
'''Read meta information from PDF files'''
|
||||
|
||||
import os, subprocess, shutil, re
|
||||
from functools import partial
|
||||
|
||||
from calibre import prints
|
||||
from calibre.constants import iswindows, ispy3
|
||||
from calibre.ptempfile import TemporaryDirectory
|
||||
from calibre.ebooks.metadata import (
|
||||
MetaInformation, string_to_authors, check_isbn, check_doi)
|
||||
from calibre.utils.ipc.simple_worker import fork_job, WorkerError
|
||||
from polyglot.builtins import iteritems, unicode_type
|
||||
|
||||
|
||||
def get_tools():
|
||||
from calibre.ebooks.pdf.pdftohtml import PDFTOHTML
|
||||
base = os.path.dirname(PDFTOHTML)
|
||||
suffix = '.exe' if iswindows else ''
|
||||
pdfinfo = os.path.join(base, 'pdfinfo') + suffix
|
||||
pdftoppm = os.path.join(base, 'pdftoppm') + suffix
|
||||
return pdfinfo, pdftoppm
|
||||
|
||||
|
||||
def read_info(outputdir, get_cover):
|
||||
''' Read info dict and cover from a pdf file named src.pdf in outputdir.
|
||||
Note that this function changes the cwd to outputdir and is therefore not
|
||||
thread safe. Run it using fork_job. This is necessary as there is no safe
|
||||
way to pass unicode paths via command line arguments. This also ensures
|
||||
that if poppler crashes, no stale file handles are left for the original
|
||||
file, only for src.pdf.'''
|
||||
os.chdir(outputdir)
|
||||
pdfinfo, pdftoppm = get_tools()
|
||||
ans = {}
|
||||
|
||||
try:
|
||||
raw = subprocess.check_output([pdfinfo, '-enc', 'UTF-8', '-isodates', 'src.pdf'])
|
||||
except subprocess.CalledProcessError as e:
|
||||
prints('pdfinfo errored out with return code: %d'%e.returncode)
|
||||
return None
|
||||
try:
|
||||
info_raw = raw.decode('utf-8')
|
||||
except UnicodeDecodeError:
|
||||
prints('pdfinfo returned no UTF-8 data')
|
||||
return None
|
||||
|
||||
for line in info_raw.splitlines():
|
||||
if ':' not in line:
|
||||
continue
|
||||
field, val = line.partition(':')[::2]
|
||||
val = val.strip()
|
||||
if field and val:
|
||||
ans[field] = val.strip()
|
||||
|
||||
# Now read XMP metadata
|
||||
# Versions of poppler before 0.47.0 used to print out both the Info dict and
|
||||
# XMP metadata packet together. However, since that changed in
|
||||
# https://cgit.freedesktop.org/poppler/poppler/commit/?id=c91483aceb1b640771f572cb3df9ad707e5cad0d
|
||||
# we can no longer rely on it.
|
||||
try:
|
||||
raw = subprocess.check_output([pdfinfo, '-meta', 'src.pdf']).strip()
|
||||
except subprocess.CalledProcessError as e:
|
||||
prints('pdfinfo failed to read XML metadata with return code: %d'%e.returncode)
|
||||
else:
|
||||
parts = re.split(br'^Metadata:', raw, 1, flags=re.MULTILINE)
|
||||
if len(parts) > 1:
|
||||
# old poppler < 0.47.0
|
||||
raw = parts[1].strip()
|
||||
if raw:
|
||||
ans['xmp_metadata'] = raw
|
||||
|
||||
if get_cover:
|
||||
try:
|
||||
subprocess.check_call([pdftoppm, '-singlefile', '-jpeg', '-cropbox',
|
||||
'src.pdf', 'cover'])
|
||||
except subprocess.CalledProcessError as e:
|
||||
prints('pdftoppm errored out with return code: %d'%e.returncode)
|
||||
|
||||
return ans
|
||||
|
||||
|
||||
def page_images(pdfpath, outputdir='.', first=1, last=1, image_format='jpeg', prefix='page-images'):
|
||||
pdftoppm = get_tools()[1]
|
||||
outputdir = os.path.abspath(outputdir)
|
||||
args = {}
|
||||
if iswindows:
|
||||
import win32process as w
|
||||
args['creationflags'] = w.HIGH_PRIORITY_CLASS | w.CREATE_NO_WINDOW
|
||||
try:
|
||||
subprocess.check_call([
|
||||
pdftoppm, '-cropbox', '-' + image_format, '-f', unicode_type(first),
|
||||
'-l', unicode_type(last), pdfpath, os.path.join(outputdir, prefix)
|
||||
], **args)
|
||||
except subprocess.CalledProcessError as e:
|
||||
raise ValueError('Failed to render PDF, pdftoppm errorcode: %s'%e.returncode)
|
||||
|
||||
|
||||
def is_pdf_encrypted(path_to_pdf):
|
||||
if not ispy3 and not isinstance(path_to_pdf, bytes):
|
||||
path_to_pdf = path_to_pdf.encode('mbcs' if iswindows else 'utf-8')
|
||||
pdfinfo = get_tools()[0]
|
||||
raw = subprocess.check_output([pdfinfo, path_to_pdf])
|
||||
q = re.search(br'^Encrypted:\s*(\S+)', raw, flags=re.MULTILINE)
|
||||
if q is not None:
|
||||
return q.group(1) == b'yes'
|
||||
return False
|
||||
|
||||
|
||||
def get_metadata(stream, cover=True):
|
||||
with TemporaryDirectory('_pdf_metadata_read') as pdfpath:
|
||||
stream.seek(0)
|
||||
with open(os.path.join(pdfpath, 'src.pdf'), 'wb') as f:
|
||||
shutil.copyfileobj(stream, f)
|
||||
try:
|
||||
res = fork_job('calibre.ebooks.metadata.pdf', 'read_info',
|
||||
(pdfpath, bool(cover)))
|
||||
except WorkerError as e:
|
||||
prints(e.orig_tb)
|
||||
raise RuntimeError('Failed to run pdfinfo')
|
||||
info = res['result']
|
||||
with open(res['stdout_stderr'], 'rb') as f:
|
||||
raw = f.read().strip()
|
||||
if raw:
|
||||
prints(raw)
|
||||
if info is None:
|
||||
raise ValueError('Could not read info dict from PDF')
|
||||
covpath = os.path.join(pdfpath, 'cover.jpg')
|
||||
cdata = None
|
||||
if cover and os.path.exists(covpath):
|
||||
with open(covpath, 'rb') as f:
|
||||
cdata = f.read()
|
||||
|
||||
title = info.get('Title', None) or _('Unknown')
|
||||
au = info.get('Author', None)
|
||||
if au is None:
|
||||
au = [_('Unknown')]
|
||||
else:
|
||||
au = string_to_authors(au)
|
||||
mi = MetaInformation(title, au)
|
||||
# if isbn is not None:
|
||||
# mi.isbn = isbn
|
||||
|
||||
creator = info.get('Creator', None)
|
||||
if creator:
|
||||
mi.book_producer = creator
|
||||
|
||||
keywords = info.get('Keywords', None)
|
||||
mi.tags = []
|
||||
if keywords:
|
||||
mi.tags = [x.strip() for x in keywords.split(',')]
|
||||
isbn = [check_isbn(x) for x in mi.tags if check_isbn(x)]
|
||||
if isbn:
|
||||
mi.isbn = isbn = isbn[0]
|
||||
mi.tags = [x for x in mi.tags if check_isbn(x) != isbn]
|
||||
|
||||
subject = info.get('Subject', None)
|
||||
if subject:
|
||||
mi.tags.insert(0, subject)
|
||||
|
||||
if 'xmp_metadata' in info:
|
||||
from calibre.ebooks.metadata.xmp import consolidate_metadata
|
||||
mi = consolidate_metadata(mi, info)
|
||||
|
||||
# Look for recognizable identifiers in the info dict, if they were not
|
||||
# found in the XMP metadata
|
||||
for scheme, check_func in iteritems({'doi':check_doi, 'isbn':check_isbn}):
|
||||
if scheme not in mi.get_identifiers():
|
||||
for k, v in iteritems(info):
|
||||
if k != 'xmp_metadata':
|
||||
val = check_func(v)
|
||||
if val:
|
||||
mi.set_identifier(scheme, val)
|
||||
break
|
||||
|
||||
if cdata:
|
||||
mi.cover_data = ('jpeg', cdata)
|
||||
return mi
|
||||
|
||||
|
||||
get_quick_metadata = partial(get_metadata, cover=False)
|
||||
|
||||
from calibre.utils.podofo import set_metadata as podofo_set_metadata
|
||||
|
||||
|
||||
def set_metadata(stream, mi):
|
||||
stream.seek(0)
|
||||
return podofo_set_metadata(stream, mi)
|
||||
Reference in New Issue
Block a user