1
0
mirror of https://github.com/gryf/ebook-converter.git synced 2026-03-29 08:03:31 +02:00

Cleanup, cleanup

This commit is contained in:
2020-07-17 18:59:45 +02:00
parent 5ac8451668
commit 45b6bb5b2c
10 changed files with 63 additions and 167 deletions

View File

@@ -1,27 +1,17 @@
"""
Read meta information from PDF files
"""
import os, subprocess, shutil, re
from functools import partial
import functools
import os
import re
import shutil
import subprocess
from ebook_converter.ptempfile import TemporaryDirectory
from ebook_converter.ebooks.metadata import (
MetaInformation, string_to_authors, check_isbn, check_doi)
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
def get_tools():
    """Return the paths of the ``pdfinfo`` and ``pdftoppm`` tools.

    Both paths are built by joining the directory component of
    ``PDFTOHTML`` with the respective tool name.

    Returns:
        A ``(pdfinfo, pdftoppm)`` tuple.
    """
    # Imported at call time rather than at module import.
    from ebook_converter.ebooks.pdf.pdftohtml import PDFTOHTML

    tools_dir = os.path.dirname(PDFTOHTML)
    # The suffix is empty, so the tool names are used as-is.
    exe_suffix = ''
    return tuple(
        os.path.join(tools_dir, tool_name) + exe_suffix
        for tool_name in ('pdfinfo', 'pdftoppm')
    )
def read_info(outputdir, get_cover):
''' Read info dict and cover from a pdf file named src.pdf in outputdir.
Note that this function changes the cwd to outputdir and is therefore not
@@ -29,7 +19,8 @@ def read_info(outputdir, get_cover):
way to pass unicode paths via command line arguments. This also ensures
that if poppler crashes, no stale file handles are left for the original
file, only for src.pdf.'''
pdfinfo, pdftoppm = get_tools()
pdfinfo = 'pdfinfo'
pdftoppm = 'pdftoppm'
source_file = os.path.join(outputdir, 'src.pdf')
cover_file = os.path.join(outputdir, 'cover')
ans = {}
@@ -55,8 +46,8 @@ def read_info(outputdir, get_cover):
ans[field] = val.strip()
# Now read XMP metadata
# Versions of poppler before 0.47.0 used to print out both the Info dict and
# XMP metadata packet together. However, since that changed in
# Versions of poppler before 0.47.0 used to print out both the Info dict
# and XMP metadata packet together. However, since that changed in
# https://cgit.freedesktop.org/poppler/poppler/commit/?id=c91483aceb1b640771f572cb3df9ad707e5cad0d
# we can no longer rely on it.
try:
@@ -77,13 +68,14 @@ def read_info(outputdir, get_cover):
subprocess.check_call([pdftoppm, '-singlefile', '-jpeg',
'-cropbox', source_file, cover_file])
except subprocess.CalledProcessError as e:
print('pdftoppm errored out with return code: {e.returncode}')
print(f'pdftoppm errored out with return code: {e.returncode}')
return ans
def page_images(pdfpath, outputdir='.', first=1, last=1, image_format='jpeg', prefix='page-images'):
pdftoppm = get_tools()[1]
def page_images(pdfpath, outputdir='.', first=1, last=1, image_format='jpeg',
prefix='page-images'):
pdftoppm = 'pdftoppm'
outputdir = os.path.abspath(outputdir)
args = {}
try:
@@ -92,11 +84,12 @@ def page_images(pdfpath, outputdir='.', first=1, last=1, image_format='jpeg', pr
'-l', str(last), pdfpath, os.path.join(outputdir, prefix)
], **args)
except subprocess.CalledProcessError as e:
raise ValueError('Failed to render PDF, pdftoppm errorcode: %s'%e.returncode)
raise ValueError('Failed to render PDF, pdftoppm errorcode: %s' %
e.returncode)
def is_pdf_encrypted(path_to_pdf):
pdfinfo = get_tools()[0]
pdfinfo = 'pdfinfo'
raw = subprocess.check_output([pdfinfo, path_to_pdf])
q = re.search(br'^Encrypted:\s*(\S+)', raw, flags=re.MULTILINE)
if q is not None:
@@ -149,7 +142,7 @@ def get_metadata(stream, cover=True):
# Look for recognizable identifiers in the info dict, if they were not
# found in the XMP metadata
for scheme, check_func in {'doi':check_doi, 'isbn':check_isbn}.items():
for scheme, check_func in {'doi': check_doi, 'isbn': check_isbn}.items():
if scheme not in mi.get_identifiers():
for k, v in info.items():
if k != 'xmp_metadata':
@@ -163,9 +156,7 @@ def get_metadata(stream, cover=True):
return mi
get_quick_metadata = partial(get_metadata, cover=False)
#from ebook_converter.utils.podofo import set_metadata as podofo_set_metadata
# Same callable as get_metadata, but with ``cover`` pre-set to False.
get_quick_metadata = functools.partial(get_metadata, cover=False)
def set_metadata(stream, mi):