1
0
mirror of https://github.com/gryf/ebook-converter.git synced 2026-03-15 14:13:40 +01:00

Cleanup, cleanup

This commit is contained in:
2020-07-17 18:59:45 +02:00
parent 5ac8451668
commit 45b6bb5b2c
10 changed files with 63 additions and 167 deletions

View File

@@ -121,7 +121,6 @@ def render_html_data(path_to_html, width, height):
result = {}
def report_error(text=''):
__import__('pdb').set_trace()
print(f'Failed to render {path_to_html}')
# file=sys.stderr)
if text:

View File

@@ -2,10 +2,8 @@ import functools
import mimetypes
import os
import re
import tempfile
import urllib.parse
from ebook_converter.constants_old import islinux, isbsd
from ebook_converter.customize.conversion import InputFormatPlugin
from ebook_converter.customize.conversion import OptionRecommendation
from ebook_converter.utils.localization import get_lang
@@ -55,7 +53,6 @@ class HTMLInput(InputFormatPlugin):
def convert(self, stream, opts, file_ext, log,
accelerators):
self._is_case_sensitive = None
basedir = os.getcwd()
self.opts = opts
@@ -81,14 +78,6 @@ class HTMLInput(InputFormatPlugin):
return create_oebbook(log, stream.name, opts,
encoding=opts.input_encoding)
def is_case_sensitive(self, path):
    """Report whether the filesystem holding *path* looks case sensitive.

    The answer is cached on the instance as ``_is_case_sensitive`` the
    first time it is computed from an existing path; later calls return
    the cached value regardless of the path passed in.

    If *path* is empty or does not exist, nothing is cached and the
    platform default (``islinux or isbsd``) is returned instead.
    """
    cached = getattr(self, '_is_case_sensitive', None)
    if cached is not None:
        return cached

    if path and os.path.exists(path):
        # If both the lower-cased and the upper-cased spelling resolve,
        # the filesystem is treated as case insensitive.
        both_spellings_exist = (os.path.exists(path.lower()) and
                                os.path.exists(path.upper()))
        self._is_case_sensitive = not both_spellings_exist
        return self._is_case_sensitive

    # Nothing to probe: fall back to the platform default, uncached.
    return islinux or isbsd
def create_oebbook(self, htmlpath, basedir, opts, log, mi):
import uuid
from ebook_converter.ebooks.conversion.plumber import create_oebbook
@@ -154,8 +143,6 @@ class HTMLInput(InputFormatPlugin):
self.log = log
self.log('Normalizing filename cases')
for path, href in htmlfile_map.items():
if not self.is_case_sensitive(path):
path = path.lower()
self.added_resources[path] = href
self.urlnormalize, self.DirContainer = urlnormalize, DirContainer
self.urldefrag = urllib.parse.urldefrag
@@ -252,8 +239,6 @@ class HTMLInput(InputFormatPlugin):
if os.path.isdir(link):
self.log.warn(link_, 'is a link to a directory. Ignoring.')
return link_
if not self.is_case_sensitive(tempfile.gettempdir()):
link = link.lower()
if link not in self.added_resources:
bhref = os.path.basename(link)
id, href = self.oeb.manifest.generate(id='added', href=sanitize_file_name(bhref))

View File

@@ -1,27 +1,17 @@
"""
Read meta information from PDF files
"""
import os, subprocess, shutil, re
from functools import partial
import functools
import os
import re
import shutil
import subprocess
from ebook_converter.ptempfile import TemporaryDirectory
from ebook_converter.ebooks.metadata import (
MetaInformation, string_to_authors, check_isbn, check_doi)
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
def get_tools():
    """Return the ``(pdfinfo, pdftoppm)`` executable paths as a tuple.

    Both tools are looked up in the directory part of ``PDFTOHTML``. When
    ``PDFTOHTML`` has no directory component, the bare command names
    ``'pdfinfo'`` and ``'pdftoppm'`` are returned.
    """
    # Imported inside the function, exactly as before.
    from ebook_converter.ebooks.pdf.pdftohtml import PDFTOHTML

    tool_dir = os.path.dirname(PDFTOHTML)
    return tuple(os.path.join(tool_dir, tool)
                 for tool in ('pdfinfo', 'pdftoppm'))
def read_info(outputdir, get_cover):
''' Read info dict and cover from a pdf file named src.pdf in outputdir.
Note that this function changes the cwd to outputdir and is therefore not
@@ -29,7 +19,8 @@ def read_info(outputdir, get_cover):
way to pass unicode paths via command line arguments. This also ensures
that if poppler crashes, no stale file handles are left for the original
file, only for src.pdf.'''
pdfinfo, pdftoppm = get_tools()
pdfinfo = 'pdfinfo'
pdftoppm = 'pdftoppm'
source_file = os.path.join(outputdir, 'src.pdf')
cover_file = os.path.join(outputdir, 'cover')
ans = {}
@@ -55,8 +46,8 @@ def read_info(outputdir, get_cover):
ans[field] = val.strip()
# Now read XMP metadata
# Versions of poppler before 0.47.0 used to print out both the Info dict and
# XMP metadata packet together. However, since that changed in
# Versions of poppler before 0.47.0 used to print out both the Info dict
# and XMP metadata packet together. However, since that changed in
# https://cgit.freedesktop.org/poppler/poppler/commit/?id=c91483aceb1b640771f572cb3df9ad707e5cad0d
# we can no longer rely on it.
try:
@@ -77,13 +68,14 @@ def read_info(outputdir, get_cover):
subprocess.check_call([pdftoppm, '-singlefile', '-jpeg',
'-cropbox', source_file, cover_file])
except subprocess.CalledProcessError as e:
print('pdftoppm errored out with return code: {e.returncode}')
print(f'pdftoppm errored out with return code: {e.returncode}')
return ans
def page_images(pdfpath, outputdir='.', first=1, last=1, image_format='jpeg', prefix='page-images'):
pdftoppm = get_tools()[1]
def page_images(pdfpath, outputdir='.', first=1, last=1, image_format='jpeg',
prefix='page-images'):
pdftoppm = 'pdftoppm'
outputdir = os.path.abspath(outputdir)
args = {}
try:
@@ -92,11 +84,12 @@ def page_images(pdfpath, outputdir='.', first=1, last=1, image_format='jpeg', pr
'-l', str(last), pdfpath, os.path.join(outputdir, prefix)
], **args)
except subprocess.CalledProcessError as e:
raise ValueError('Failed to render PDF, pdftoppm errorcode: %s'%e.returncode)
raise ValueError('Failed to render PDF, pdftoppm errorcode: %s' %
e.returncode)
def is_pdf_encrypted(path_to_pdf):
pdfinfo = get_tools()[0]
pdfinfo = 'pdfinfo'
raw = subprocess.check_output([pdfinfo, path_to_pdf])
q = re.search(br'^Encrypted:\s*(\S+)', raw, flags=re.MULTILINE)
if q is not None:
@@ -149,7 +142,7 @@ def get_metadata(stream, cover=True):
# Look for recognizable identifiers in the info dict, if they were not
# found in the XMP metadata
for scheme, check_func in {'doi':check_doi, 'isbn':check_isbn}.items():
for scheme, check_func in {'doi': check_doi, 'isbn': check_isbn}.items():
if scheme not in mi.get_identifiers():
for k, v in info.items():
if k != 'xmp_metadata':
@@ -163,9 +156,7 @@ def get_metadata(stream, cover=True):
return mi
get_quick_metadata = partial(get_metadata, cover=False)
#from ebook_converter.utils.podofo import set_metadata as podofo_set_metadata
get_quick_metadata = functools.partial(get_metadata, cover=False)
def set_metadata(stream, mi):

View File

@@ -3,12 +3,10 @@ import os
import re
import shutil
import subprocess
import sys
from lxml import etree
from ebook_converter import CurrentDir, xml_replace_entities
from ebook_converter.constants_old import isbsd, islinux, isosx
from ebook_converter.ebooks import ConversionError, DRMError
from ebook_converter.ebooks.chardet import xml_to_unicode
from ebook_converter.ptempfile import PersistentTemporaryFile
@@ -16,21 +14,10 @@ from ebook_converter.utils.cleantext import clean_xml_chars
from ebook_converter.utils.ipc import eintr_retry_call
PDFTOHTML = 'pdftohtml'
def popen(cmd, **kw):
    """Start *cmd* as a child process and return the ``Popen`` object.

    This is a thin pass-through: every keyword argument is forwarded to
    ``subprocess.Popen`` unchanged.
    """
    process = subprocess.Popen(cmd, **kw)
    return process
# Override the default pdftohtml command with a path relative to the
# interpreter's own install locations when the relevant ``sys`` attributes
# are present.
if hasattr(sys, 'frameworks_dir') and isosx:
    # macOS: look under utils.app/Contents/MacOS next to the parent of the
    # frameworks directory.
    base = os.path.join(os.path.dirname(sys.frameworks_dir),
                        'utils.app', 'Contents', 'MacOS')
    PDFTOHTML = os.path.join(base, PDFTOHTML)
if getattr(sys, 'frozen', False) and (islinux or isbsd):
    # Frozen Linux/BSD: use bin/pdftohtml under sys.executables_location.
    PDFTOHTML = os.path.join(sys.executables_location, 'bin', 'pdftohtml')
def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):
'''
Convert the pdf into html using the pdftohtml app.
@@ -49,12 +36,9 @@ def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):
def a(x):
return os.path.basename(x)
exe = PDFTOHTML
cmd = [exe, '-enc', 'UTF-8', '-noframes', '-p', '-nomerge',
cmd = ['pdftohtml', '-enc', 'UTF-8', '-noframes', '-p', '-nomerge',
'-nodrm', a(pdfsrc), a(index)]
if isbsd:
cmd.remove('-nodrm')
if no_images:
cmd.append('-i')
if as_xml:
@@ -105,11 +89,9 @@ def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):
i.write(raw.encode('utf-8'))
cmd = [exe, '-f', '1', '-l', '1', '-xml', '-i', '-enc', 'UTF-8',
'-noframes', '-p', '-nomerge', '-nodrm', '-q', '-stdout',
a(pdfsrc)]
if isbsd:
cmd.remove('-nodrm')
cmd = ['pdftohtml', '-f', '1', '-l', '1', '-xml', '-i', '-enc',
'UTF-8', '-noframes', '-p', '-nomerge', '-nodrm', '-q',
'-stdout', a(pdfsrc)]
p = popen(cmd, stdout=subprocess.PIPE)
raw = p.stdout.read().strip()
if p.wait() == 0 and raw: