mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-01-01 15:32:26 +01:00
82 lines
3.2 KiB
Python
82 lines
3.2 KiB
Python
# -*- coding: utf-8 -*-
|
|
from __future__ import absolute_import, division, print_function, unicode_literals
|
|
|
|
__license__ = 'GPL 3'
|
|
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
|
__docformat__ = 'restructuredtext en'
|
|
|
|
import os
|
|
|
|
from ebook_converter.customize.conversion import InputFormatPlugin, OptionRecommendation
|
|
from ebook_converter.polyglot.builtins import as_bytes, getcwd
|
|
|
|
|
|
class PDFInput(InputFormatPlugin):
    """Input plugin that converts a PDF file to HTML plus an OPF file.

    The PDF-to-HTML step is delegated to ``pdftohtml``; this class then
    describes the files found in the current working directory in an OPF
    manifest/spine. All output is written relative to the current working
    directory.
    """

    name = 'PDF Input'
    author = 'Kovid Goyal and John Schember'
    description = 'Convert PDF files to HTML'
    file_types = {'pdf'}
    commit_name = 'pdf_input'

    # Conversion options exposed by this plugin.
    options = {
        OptionRecommendation(
            name='no_images',
            recommended_value=False,
            help=_('Do not extract images from the document'),
        ),
        OptionRecommendation(
            name='unwrap_factor',
            recommended_value=0.45,
            help=_(
                'Scale used to determine the length at which a line should '
                'be unwrapped. Valid values are a decimal between 0 and 1. The '
                'default is 0.45, just below the median line length.'
            ),
        ),
        OptionRecommendation(
            name='new_pdf_engine',
            recommended_value=False,
            help=_('Use the new PDF conversion engine. Currently not operational.'),
        ),
    }

    def convert_new(self, stream, accelerators):
        """Convert ``stream`` using the XML/reflow based engine.

        Runs ``pdftohtml`` in XML mode against the current working
        directory, reads the resulting ``index.xml``, passes its content
        through ``clean_ascii_chars`` and hands the result to
        ``PDFDocument``.

        Reads ``self.opts`` and ``self.log``, which ``convert`` assigns
        before delegating here. ``accelerators`` is accepted but unused.

        Returns the path of ``metadata.opf`` inside the current working
        directory.
        NOTE(review): this method does not write ``metadata.opf`` itself;
        presumably ``PDFDocument`` is expected to -- confirm.
        """
        from ebook_converter.ebooks.pdf.pdftohtml import pdftohtml
        from ebook_converter.utils.cleantext import clean_ascii_chars
        from ebook_converter.ebooks.pdf.reflow import PDFDocument

        pdftohtml(getcwd(), stream.name, self.opts.no_images, as_xml=True)

        with lopen('index.xml', 'rb') as xml_file:
            raw_xml = xml_file.read()
        cleaned_xml = clean_ascii_chars(raw_xml)

        PDFDocument(cleaned_xml, self.opts, self.log)
        return os.path.join(getcwd(), 'metadata.opf')

    def convert(self, stream, options, file_ext, log, accelerators):
        """Convert the PDF in ``stream`` and return the path to its OPF.

        Stores ``options`` and ``log`` on the instance, then either hands
        off to ``convert_new`` (when ``options.new_pdf_engine`` is set) or
        runs ``pdftohtml`` and writes ``metadata.opf`` listing
        ``index.html`` together with every other entry of the current
        working directory. ``file_ext`` is accepted but unused.

        Returns the path of ``metadata.opf`` inside the current working
        directory. Raises ``ValueError`` if ``index.html`` is not present
        in the working directory after ``pdftohtml`` has run.
        """
        from ebook_converter.ebooks.metadata.opf2 import OPFCreator
        from ebook_converter.ebooks.pdf.pdftohtml import pdftohtml

        log.debug('Converting file to html...')
        # convert_new() reads these two attributes.
        self.opts = options
        self.log = log
        if options.new_pdf_engine:
            return self.convert_new(stream, accelerators)

        # The main html file will be named index.html
        pdftohtml(getcwd(), stream.name, options.no_images)

        from ebook_converter.ebooks.metadata.meta import get_metadata
        log.debug('Retrieving document metadata...')
        metadata = get_metadata(stream, 'pdf')
        opf = OPFCreator(getcwd(), metadata)

        # index.html goes first; everything else found in the working
        # directory is listed after it.
        other_entries = os.listdir(getcwd())
        other_entries.remove('index.html')
        manifest = [('index.html', None)]
        manifest.extend((entry, None) for entry in other_entries)

        log.debug('Generating manifest...')
        opf.create_manifest(manifest)
        opf.create_spine(['index.html'])

        log.debug('Rendering manifest...')
        with lopen('metadata.opf', 'wb') as opf_out:
            opf.render(opf_out)

        # When a toc.ncx exists and has a manifest id, patch the rendered
        # OPF so that the <spine> element references it.
        ncx_id = None
        if os.path.exists('toc.ncx'):
            ncx_id = opf.manifest.id_for_path('toc.ncx')
        if ncx_id:
            spine_with_toc = b'<spine toc="%s"' % as_bytes(ncx_id)
            with lopen('metadata.opf', 'r+b') as opf_rw:
                patched = opf_rw.read().replace(b'<spine', spine_with_toc)
                # The patched content is never shorter than the original,
                # so overwriting from the start leaves no stale tail.
                opf_rw.seek(0)
                opf_rw.write(patched)

        return os.path.join(getcwd(), 'metadata.opf')
|