mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-01-02 16:54:12 +01:00
Here is the first batch of modules, which are needed for converting several formats to LRF. Some of the logic has been changed; more cleanups will follow.
83 lines
3.2 KiB
Python
83 lines
3.2 KiB
Python
# -*- coding: utf-8 -*-
|
|
from __future__ import absolute_import, division, print_function, unicode_literals
|
|
|
|
__license__ = 'GPL 3'
|
|
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
|
__docformat__ = 'restructuredtext en'
|
|
|
|
import os
|
|
|
|
from ebook_converter.customize.conversion import InputFormatPlugin, OptionRecommendation
|
|
from ebook_converter.polyglot.builtins import as_bytes, getcwd
|
|
|
|
|
|
class PDFInput(InputFormatPlugin):
    """Input plugin that converts a PDF file to HTML plus an OPF package.

    The conversion itself is delegated to ``pdftohtml``; this class gathers
    the produced files into a manifest and writes ``metadata.opf`` into the
    current working directory.
    """

    name = 'PDF Input'
    author = 'Kovid Goyal and John Schember'
    description = 'Convert PDF files to HTML'
    file_types = {'pdf'}
    commit_name = 'pdf_input'

    # Conversion options exposed by this plugin.
    options = {
        OptionRecommendation(
            name='no_images',
            recommended_value=False,
            help=_('Do not extract images from the document')),
        OptionRecommendation(
            name='unwrap_factor',
            recommended_value=0.45,
            help=_('Scale used to determine the length at which a line should '
                   'be unwrapped. Valid values are a decimal between 0 and 1. The '
                   'default is 0.45, just below the median line length.')),
        OptionRecommendation(
            name='new_pdf_engine',
            recommended_value=False,
            help=_('Use the new PDF conversion engine. Currently not operational.'))
    }

    def convert_new(self, stream, accelerators):
        """Convert ``stream`` using the new (reflow based) PDF engine.

        ``pdftohtml`` is run in XML mode, the resulting ``index.xml`` is
        cleaned with ``clean_ascii_chars`` and handed to ``PDFDocument``.
        Relies on ``self.opts`` and ``self.log`` having been set by
        ``convert``. ``accelerators`` is accepted but not used here.

        Returns the path of ``metadata.opf`` inside the current working
        directory.
        """
        from ebook_converter.ebooks.pdf.pdftohtml import pdftohtml
        from ebook_converter.utils.cleantext import clean_ascii_chars
        from ebook_converter.ebooks.pdf.reflow import PDFDocument

        pdftohtml(getcwd(), stream.name, self.opts.no_images, as_xml=True)

        with lopen('index.xml', 'rb') as xml_file:
            raw_xml = xml_file.read()
            cleaned_xml = clean_ascii_chars(raw_xml)

        # NOTE(review): presumably PDFDocument produces the output files
        # (including metadata.opf) as a side effect -- confirm in reflow.
        PDFDocument(cleaned_xml, self.opts, self.log)

        return os.path.join(getcwd(), 'metadata.opf')

    def convert(self, stream, options, file_ext, log,
                accelerators):
        """Convert the PDF in ``stream`` to HTML and write an OPF for it.

        ``options`` and ``log`` are stored on the instance as ``self.opts``
        and ``self.log``. When ``options.new_pdf_engine`` is set the work is
        delegated to ``convert_new``. ``file_ext`` is not used.

        Returns the path of ``metadata.opf`` inside the current working
        directory.
        """
        from ebook_converter.ebooks.metadata.opf2 import OPFCreator
        from ebook_converter.ebooks.pdf.pdftohtml import pdftohtml

        log.debug('Converting file to html...')
        self.opts = options
        self.log = log

        if options.new_pdf_engine:
            return self.convert_new(stream, accelerators)

        # The main html file will be named index.html
        pdftohtml(getcwd(), stream.name, options.no_images)

        from ebook_converter.ebooks.metadata.meta import get_metadata
        log.debug('Retrieving document metadata...')
        metadata = get_metadata(stream, 'pdf')
        opf = OPFCreator(getcwd(), metadata)

        # Everything in the working directory goes into the manifest, with
        # index.html listed first.
        entries = os.listdir(getcwd())
        entries.remove('index.html')
        manifest = [('index.html', None)]
        manifest.extend((entry, None) for entry in entries)

        log.debug('Generating manifest...')
        opf.create_manifest(manifest)
        opf.create_spine(['index.html'])

        log.debug('Rendering manifest...')
        with lopen('metadata.opf', 'wb') as opf_file:
            opf.render(opf_file)

        # If a toc.ncx is present, point the rendered spine at it by
        # patching the toc attribute into the OPF that was just written.
        if os.path.exists('toc.ncx'):
            ncx_id = opf.manifest.id_for_path('toc.ncx')
            if ncx_id:
                with lopen('metadata.opf', 'r+b') as rendered:
                    content = rendered.read()
                    patched = content.replace(
                        b'<spine', b'<spine toc="%s"' % as_bytes(ncx_id))
                    rendered.seek(0)
                    rendered.write(patched)

        return os.path.join(getcwd(), 'metadata.opf')
|