mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-01-28 08:45:46 +01:00
This is an ongoing refactor of the calibre code to make it more readable and to transform it into something more coherent. This patch changes the imports of some modules: instead of polluting each module's namespace with symbols from other modules (which were often themselves imported from yet other modules), the modules are now imported explicitly. Yuck.
188 lines
6.7 KiB
Python
188 lines
6.7 KiB
Python
import errno
|
|
import os
|
|
import re
|
|
import shutil
|
|
import subprocess
|
|
import sys
|
|
|
|
from lxml import etree
|
|
|
|
from ebook_converter import CurrentDir, xml_replace_entities, prints
|
|
from ebook_converter.constants_old import isbsd, islinux, isosx, iswindows
|
|
from ebook_converter.ebooks import ConversionError, DRMError
|
|
from ebook_converter.ebooks.chardet import xml_to_unicode
|
|
from ebook_converter.ptempfile import PersistentTemporaryFile
|
|
from ebook_converter.utils.cleantext import clean_xml_chars
|
|
from ebook_converter.utils.ipc import eintr_retry_call
|
|
|
|
|
|
# Command used to launch pdftohtml. The bare name is the default; the
# platform checks further down in this module replace it with an explicit
# path for frozen/bundled builds.
PDFTOHTML = 'pdftohtml'
|
|
|
|
|
|
def popen(cmd, **kw):
    '''
    Start cmd with subprocess.Popen, forwarding all keyword arguments.

    On Windows the creationflags keyword is always set to 0x08 (the value
    of the Win32 DETACHED_PROCESS flag), overriding any value passed in.
    Returns the subprocess.Popen object.
    '''
    if iswindows:
        kw.update(creationflags=0x08)
    process = subprocess.Popen(cmd, **kw)
    return process
|
|
|
|
|
|
# Frozen/bundled builds: resolve PDFTOHTML to an explicit path inside the
# application layout instead of the bare command name.

# macOS: the binary sits in utils.app/Contents/MacOS next to the frameworks
# directory.
if isosx and hasattr(sys, 'frameworks_dir'):
    base = os.path.join(
        os.path.dirname(sys.frameworks_dir), 'utils.app', 'Contents', 'MacOS'
    )
    PDFTOHTML = os.path.join(base, PDFTOHTML)

# Windows: pdftohtml.exe sits beside the executable, unless the interpreter
# advertises the new app layout, in which case the extensions location is
# used.
if iswindows and hasattr(sys, 'frozen'):
    base = os.path.dirname(sys.executable)
    base = sys.extensions_location if hasattr(sys, 'new_app_layout') else base
    PDFTOHTML = os.path.join(base, 'pdftohtml.exe')

# Linux/BSD: the binary is in the bin directory below the executables
# location.
if getattr(sys, 'frozen', False) and (islinux or isbsd):
    PDFTOHTML = os.path.join(sys.executables_location, 'bin', 'pdftohtml')
|
|
|
|
|
|
def pdftohtml(output_dir, pdf_path, no_images, as_xml=False):
    '''
    Convert the pdf into html using the pdftohtml app.

    This will write the html as index.html (index.xml when as_xml is set)
    into output_dir. It will also write all extracted images to the
    output_dir. For html output the generated markup is post-processed in
    place, and the first page is converted a second time as xml so that
    parse_outline() can look for an outline in it.

    :param output_dir: directory that receives the converted files.
    :param pdf_path: path of the pdf to convert; it is copied to src.pdf in
                     output_dir for the duration of the conversion.
    :param no_images: if true, pass -i to pdftohtml.
    :param as_xml: if true, pass -xml to pdftohtml and skip both the html
                   post-processing and the outline extraction.
    :raises ConversionError: if the pdftohtml executable cannot be found or
                             it exits with a non-zero return code.
    :raises DRMError: if no index file was produced or it is smaller than
                      100 bytes.
    '''
    # The conversion runs inside CurrentDir(output_dir) and the commands use
    # bare file names, so the working directory is presumably switched
    # there. Anchor output_dir to an absolute path first so that the joined
    # paths below keep resolving afterwards.
    output_dir = os.path.abspath(output_dir)
    pdfsrc = os.path.join(output_dir, 'src.pdf')
    index = os.path.join(output_dir, 'index.' + ('xml' if as_xml else 'html'))
    src_name = os.path.basename(pdfsrc)
    index_name = os.path.basename(index)

    try:
        with open(pdf_path, 'rb') as src, open(pdfsrc, 'wb') as dest:
            shutil.copyfileobj(src, dest)

        with CurrentDir(output_dir):
            exe = PDFTOHTML
            cmd = [exe, '-enc', 'UTF-8', '-noframes', '-p', '-nomerge',
                   '-nodrm', src_name, index_name]

            # On BSD the command is run without -nodrm.
            if isbsd:
                cmd.remove('-nodrm')
            if no_images:
                cmd.append('-i')
            if as_xml:
                cmd.append('-xml')

            # Both output streams of pdftohtml are collected in a log file.
            logf = PersistentTemporaryFile('pdftohtml_log')
            try:
                p = popen(cmd, stderr=logf._fd, stdout=logf._fd,
                          stdin=subprocess.PIPE)
            except OSError as err:
                if err.errno == errno.ENOENT:
                    raise ConversionError('Could not find pdftohtml, check '
                                          'it is in your PATH') from err
                raise
            ret = eintr_retry_call(p.wait)
            logf.flush()
            logf.close()
            with open(logf.name, 'rb') as log:
                out = log.read().decode('utf-8', 'replace').strip()
            if ret != 0:
                raise ConversionError('pdftohtml failed with return code: '
                                      '%d\n%s' % (ret, out))
            if out:
                prints("pdftohtml log:")
                prints(out)
            # A missing or nearly empty index file is reported as DRM.
            if not os.path.exists(index) or os.stat(index).st_size < 100:
                raise DRMError()

            if not as_xml:
                with open(index, 'r+b') as f:
                    raw = f.read().decode('utf-8', 'replace')
                    raw = flip_images(raw)
                    raw = raw.replace('<head',
                                      '<!-- created by ebook-converter\'s'
                                      ' pdftohtml -->\n <head', 1)
                    # versions of pdftohtml >= 0.20 output self closing <br>
                    # tags, this breaks the pdf heuristics regexps, so
                    # replace them
                    raw = raw.replace('<br/>', '<br>')
                    # Turn the numeric page anchors into ids prefixed with
                    # "p" and point the links to them at the same document.
                    raw = re.sub(r'<a\s+name=(\d+)', r'<a id="\1"', raw,
                                 flags=re.I)
                    raw = re.sub(r'<a id="(\d+)"', r'<a id="p\1"', raw,
                                 flags=re.I)
                    raw = re.sub(r'<a href="index.html#(\d+)"',
                                 r'<a href="#p\1"', raw, flags=re.I)
                    raw = xml_replace_entities(raw)
                    raw = raw.replace('\u00a0', ' ')

                    # Only discard the old content once the new content is
                    # ready, so a failure above does not leave an empty file.
                    f.seek(0)
                    f.truncate()
                    f.write(raw.encode('utf-8'))

                # Convert only the first page, as xml on stdout, to get at
                # the outline of the document.
                cmd = [exe, '-f', '1', '-l', '1', '-xml', '-i', '-enc',
                       'UTF-8', '-noframes', '-p', '-nomerge', '-nodrm',
                       '-q', '-stdout', src_name]
                if isbsd:
                    cmd.remove('-nodrm')
                p = popen(cmd, stdout=subprocess.PIPE)
                # communicate() drains stdout, waits for the process to exit
                # and closes the pipe.
                raw = p.communicate()[0].strip()
                if p.returncode == 0 and raw:
                    parse_outline(raw, output_dir)
    finally:
        # Best-effort removal of the temporary copy, also when the
        # conversion failed.
        try:
            os.remove(pdfsrc)
        except OSError:
            pass
|
|
|
|
|
|
def parse_outline(raw, output_dir):
    '''
    Write a toc.ncx into output_dir from the outline in pdftohtml xml output.

    raw is decoded with xml_to_unicode(), cleaned with clean_xml_chars() and
    parsed; the first <outline> element found is walked recursively. Nothing
    is written when there is no outline or when it yields two entries or
    fewer.
    '''
    text = xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0]
    document = etree.fromstring(clean_xml_chars(text))
    found = document.xpath('(//outline)[1]')
    if not found:
        return

    from ebook_converter.ebooks.oeb.polish.toc import TOC, create_ncx

    def add_entries(node, parent):
        # Adds every child with text to parent and returns how many entries
        # were added, nested outlines included.
        added = 0
        for child in node.iterchildren('*'):
            if child.tag == 'outline':
                # A nested outline hangs below the entry added last, or
                # below parent itself when it has no entries yet.
                target = parent.children[-1] if parent.children else parent
                added += add_entries(child, target)
            elif child.text:
                anchor = 'p' + child.get('page', '1')
                parent.add(child.text, 'index.html', anchor)
                added += 1
        return added

    toc = TOC()
    if add_entries(found[0], toc) <= 2:
        return

    root = create_ncx(toc, (lambda x: x), 'pdftohtml', 'en', 'pdftohtml')
    ncx = etree.tostring(root, pretty_print=True, with_tail=False,
                         encoding='utf-8', xml_declaration=True)
    with open(os.path.join(output_dir, 'toc.ncx'), 'wb') as f:
        f.write(ncx)
|
|
|
|
|
|
def flip_image(img, flip):
    '''
    Flip the image file at path img and write the result back in place.

    The image is passed to ebook_converter.utils.img.flip_image with
    horizontal set when flip contains 'x' and vertical set when flip
    contains 'y', then saved over the original file in its original format.
    '''
    # Aliased so the imported helper does not hide this function's own name.
    from ebook_converter.utils.img import flip_image as apply_flip
    from ebook_converter.utils.img import image_and_format_from_data
    from ebook_converter.utils.img import image_to_data

    with open(img, 'r+b') as stream:
        image, fmt = image_and_format_from_data(stream.read())
        flipped = apply_flip(image, horizontal='x' in flip,
                             vertical='y' in flip)
        stream.seek(0)
        stream.truncate()
        stream.write(image_to_data(flipped, fmt=fmt))
|
|
|
|
|
|
def flip_images(raw):
    '''
    Flip the image files referenced by flipped <img> tags in raw.

    Every <img> tag carrying a class of "xflip", "yflip" or "xyflip" whose
    src points to an existing file has that file flipped via flip_image().
    Returns raw with all <style>...</style> blocks (and the whitespace
    following them) removed.
    '''
    for tag_match in re.finditer('<IMG[^>]+/?>', raw, flags=re.I):
        tag = tag_match.group()
        flip_match = re.search(r'class="(x|y|xy)flip"', tag)
        if flip_match is not None:
            src_match = re.search(r'src="([^"]+)"', tag)
            if src_match is not None:
                path = src_match.group(1)
                if os.path.exists(path):
                    flip_image(path, flip_match.group(1))
    return re.sub(r'<STYLE.+?</STYLE>\s*', '', raw, flags=re.I | re.DOTALL)
|