import errno import os import re import shutil import subprocess import sys from lxml import etree from ebook_converter import CurrentDir, xml_replace_entities, prints from ebook_converter.constants_old import isbsd, islinux, isosx from ebook_converter.ebooks import ConversionError, DRMError from ebook_converter.ebooks.chardet import xml_to_unicode from ebook_converter.ptempfile import PersistentTemporaryFile from ebook_converter.utils.cleantext import clean_xml_chars from ebook_converter.utils.ipc import eintr_retry_call PDFTOHTML = 'pdftohtml' def popen(cmd, **kw): return subprocess.Popen(cmd, **kw) if isosx and hasattr(sys, 'frameworks_dir'): base = os.path.join(os.path.dirname(sys.frameworks_dir), 'utils.app', 'Contents', 'MacOS') PDFTOHTML = os.path.join(base, PDFTOHTML) if (islinux or isbsd) and getattr(sys, 'frozen', False): PDFTOHTML = os.path.join(sys.executables_location, 'bin', 'pdftohtml') def pdftohtml(output_dir, pdf_path, no_images, as_xml=False): ''' Convert the pdf into html using the pdftohtml app. This will write the html as index.html into output_dir. It will also write all extracted images to the output_dir ''' pdfsrc = os.path.join(output_dir, 'src.pdf') index = os.path.join(output_dir, 'index.'+('xml' if as_xml else 'html')) with open(pdf_path, 'rb') as src, open(pdfsrc, 'wb') as dest: shutil.copyfileobj(src, dest) with CurrentDir(output_dir): def a(x): return os.path.basename(x) exe = PDFTOHTML cmd = [exe, '-enc', 'UTF-8', '-noframes', '-p', '-nomerge', '-nodrm', a(pdfsrc), a(index)] if isbsd: cmd.remove('-nodrm') if no_images: cmd.append('-i') if as_xml: cmd.append('-xml') logf = PersistentTemporaryFile('pdftohtml_log') try: p = popen(cmd, stderr=logf._fd, stdout=logf._fd, stdin=subprocess.PIPE) except OSError as err: if err.errno == errno.ENOENT: raise ConversionError('Could not find pdftohtml, check it is ' 'in your PATH') else: raise ret = eintr_retry_call(p.wait) logf.flush() logf.close() out = open(logf.name, 'rb').read().decode('utf-8', 'replace').strip() if ret != 0: raise ConversionError('pdftohtml failed with return code: ' '%d\n%s' % (ret, out)) if out: prints("pdftohtml log:") prints(out) if not os.path.exists(index) or os.stat(index).st_size < 100: raise DRMError() if not as_xml: with open(index, 'r+b') as i: raw = i.read().decode('utf-8', 'replace') raw = flip_images(raw) raw = raw.replace('
\n = 0.20 output self closing