mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-02-22 18:15:49 +01:00
Initial import
This commit is contained in:
10
ebook_converter/ebooks/conversion/plugins/__init__.py
Normal file
10
ebook_converter/ebooks/conversion/plugins/__init__.py
Normal file
@@ -0,0 +1,10 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
|
||||
|
||||
29
ebook_converter/ebooks/conversion/plugins/azw4_input.py
Normal file
29
ebook_converter/ebooks/conversion/plugins/azw4_input.py
Normal file
@@ -0,0 +1,29 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin
|
||||
from polyglot.builtins import getcwd
|
||||
|
||||
|
||||
class AZW4Input(InputFormatPlugin):
|
||||
|
||||
name = 'AZW4 Input'
|
||||
author = 'John Schember'
|
||||
description = 'Convert AZW4 to HTML'
|
||||
file_types = {'azw4'}
|
||||
commit_name = 'azw4_input'
|
||||
|
||||
def convert(self, stream, options, file_ext, log,
|
||||
accelerators):
|
||||
from calibre.ebooks.pdb.header import PdbHeaderReader
|
||||
from calibre.ebooks.azw4.reader import Reader
|
||||
|
||||
header = PdbHeaderReader(stream)
|
||||
reader = Reader(header, stream, log, options)
|
||||
opf = reader.extract_content(getcwd())
|
||||
|
||||
return opf
|
||||
202
ebook_converter/ebooks/conversion/plugins/chm_input.py
Normal file
202
ebook_converter/ebooks/conversion/plugins/chm_input.py
Normal file
@@ -0,0 +1,202 @@
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
''' CHM File decoding support '''
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>,' \
|
||||
' and Alex Bramley <a.bramley at gmail.com>.'
|
||||
|
||||
import os
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin
|
||||
from calibre.ptempfile import TemporaryDirectory
|
||||
from calibre.constants import filesystem_encoding
|
||||
from polyglot.builtins import unicode_type, as_bytes
|
||||
|
||||
|
||||
class CHMInput(InputFormatPlugin):
|
||||
|
||||
name = 'CHM Input'
|
||||
author = 'Kovid Goyal and Alex Bramley'
|
||||
description = 'Convert CHM files to OEB'
|
||||
file_types = {'chm'}
|
||||
commit_name = 'chm_input'
|
||||
|
||||
def _chmtohtml(self, output_dir, chm_path, no_images, log, debug_dump=False):
|
||||
from calibre.ebooks.chm.reader import CHMReader
|
||||
log.debug('Opening CHM file')
|
||||
rdr = CHMReader(chm_path, log, input_encoding=self.opts.input_encoding)
|
||||
log.debug('Extracting CHM to %s' % output_dir)
|
||||
rdr.extract_content(output_dir, debug_dump=debug_dump)
|
||||
self._chm_reader = rdr
|
||||
return rdr.hhc_path
|
||||
|
||||
def convert(self, stream, options, file_ext, log, accelerators):
|
||||
from calibre.ebooks.chm.metadata import get_metadata_from_reader
|
||||
from calibre.customize.ui import plugin_for_input_format
|
||||
self.opts = options
|
||||
|
||||
log.debug('Processing CHM...')
|
||||
with TemporaryDirectory('_chm2oeb') as tdir:
|
||||
if not isinstance(tdir, unicode_type):
|
||||
tdir = tdir.decode(filesystem_encoding)
|
||||
html_input = plugin_for_input_format('html')
|
||||
for opt in html_input.options:
|
||||
setattr(options, opt.option.name, opt.recommended_value)
|
||||
no_images = False # options.no_images
|
||||
chm_name = stream.name
|
||||
# chm_data = stream.read()
|
||||
|
||||
# closing stream so CHM can be opened by external library
|
||||
stream.close()
|
||||
log.debug('tdir=%s' % tdir)
|
||||
log.debug('stream.name=%s' % stream.name)
|
||||
debug_dump = False
|
||||
odi = options.debug_pipeline
|
||||
if odi:
|
||||
debug_dump = os.path.join(odi, 'input')
|
||||
mainname = self._chmtohtml(tdir, chm_name, no_images, log,
|
||||
debug_dump=debug_dump)
|
||||
mainpath = os.path.join(tdir, mainname)
|
||||
|
||||
try:
|
||||
metadata = get_metadata_from_reader(self._chm_reader)
|
||||
except Exception:
|
||||
log.exception('Failed to read metadata, using filename')
|
||||
from calibre.ebooks.metadata.book.base import Metadata
|
||||
metadata = Metadata(os.path.basename(chm_name))
|
||||
encoding = self._chm_reader.get_encoding() or options.input_encoding or 'cp1252'
|
||||
self._chm_reader.CloseCHM()
|
||||
# print((tdir, mainpath))
|
||||
# from calibre import ipython
|
||||
# ipython()
|
||||
|
||||
options.debug_pipeline = None
|
||||
options.input_encoding = 'utf-8'
|
||||
uenc = encoding
|
||||
if os.path.abspath(mainpath) in self._chm_reader.re_encoded_files:
|
||||
uenc = 'utf-8'
|
||||
htmlpath, toc = self._create_html_root(mainpath, log, uenc)
|
||||
oeb = self._create_oebbook_html(htmlpath, tdir, options, log, metadata)
|
||||
options.debug_pipeline = odi
|
||||
if toc.count() > 1:
|
||||
oeb.toc = self.parse_html_toc(oeb.spine[0])
|
||||
oeb.manifest.remove(oeb.spine[0])
|
||||
oeb.auto_generated_toc = False
|
||||
return oeb
|
||||
|
||||
def parse_html_toc(self, item):
|
||||
from calibre.ebooks.oeb.base import TOC, XPath
|
||||
dx = XPath('./h:div')
|
||||
ax = XPath('./h:a[1]')
|
||||
|
||||
def do_node(parent, div):
|
||||
for child in dx(div):
|
||||
a = ax(child)[0]
|
||||
c = parent.add(a.text, a.attrib['href'])
|
||||
do_node(c, child)
|
||||
|
||||
toc = TOC()
|
||||
root = XPath('//h:div[1]')(item.data)[0]
|
||||
do_node(toc, root)
|
||||
return toc
|
||||
|
||||
def _create_oebbook_html(self, htmlpath, basedir, opts, log, mi):
|
||||
# use HTMLInput plugin to generate book
|
||||
from calibre.customize.builtins import HTMLInput
|
||||
opts.breadth_first = True
|
||||
htmlinput = HTMLInput(None)
|
||||
oeb = htmlinput.create_oebbook(htmlpath, basedir, opts, log, mi)
|
||||
return oeb
|
||||
|
||||
def _create_html_root(self, hhcpath, log, encoding):
|
||||
from lxml import html
|
||||
from polyglot.urllib import unquote as _unquote
|
||||
from calibre.ebooks.oeb.base import urlquote
|
||||
from calibre.ebooks.chardet import xml_to_unicode
|
||||
hhcdata = self._read_file(hhcpath)
|
||||
hhcdata = hhcdata.decode(encoding)
|
||||
hhcdata = xml_to_unicode(hhcdata, verbose=True,
|
||||
strip_encoding_pats=True, resolve_entities=True)[0]
|
||||
hhcroot = html.fromstring(hhcdata)
|
||||
toc = self._process_nodes(hhcroot)
|
||||
# print("=============================")
|
||||
# print("Printing hhcroot")
|
||||
# print(etree.tostring(hhcroot, pretty_print=True))
|
||||
# print("=============================")
|
||||
log.debug('Found %d section nodes' % toc.count())
|
||||
htmlpath = os.path.splitext(hhcpath)[0] + ".html"
|
||||
base = os.path.dirname(os.path.abspath(htmlpath))
|
||||
|
||||
def unquote(x):
|
||||
if isinstance(x, unicode_type):
|
||||
x = x.encode('utf-8')
|
||||
return _unquote(x).decode('utf-8')
|
||||
|
||||
def unquote_path(x):
|
||||
y = unquote(x)
|
||||
if (not os.path.exists(os.path.join(base, x)) and os.path.exists(os.path.join(base, y))):
|
||||
x = y
|
||||
return x
|
||||
|
||||
def donode(item, parent, base, subpath):
|
||||
for child in item:
|
||||
title = child.title
|
||||
if not title:
|
||||
continue
|
||||
raw = unquote_path(child.href or '')
|
||||
rsrcname = os.path.basename(raw)
|
||||
rsrcpath = os.path.join(subpath, rsrcname)
|
||||
if (not os.path.exists(os.path.join(base, rsrcpath)) and os.path.exists(os.path.join(base, raw))):
|
||||
rsrcpath = raw
|
||||
|
||||
if '%' not in rsrcpath:
|
||||
rsrcpath = urlquote(rsrcpath)
|
||||
if not raw:
|
||||
rsrcpath = ''
|
||||
c = DIV(A(title, href=rsrcpath))
|
||||
donode(child, c, base, subpath)
|
||||
parent.append(c)
|
||||
|
||||
with open(htmlpath, 'wb') as f:
|
||||
if toc.count() > 1:
|
||||
from lxml.html.builder import HTML, BODY, DIV, A
|
||||
path0 = toc[0].href
|
||||
path0 = unquote_path(path0)
|
||||
subpath = os.path.dirname(path0)
|
||||
base = os.path.dirname(f.name)
|
||||
root = DIV()
|
||||
donode(toc, root, base, subpath)
|
||||
raw = html.tostring(HTML(BODY(root)), encoding='utf-8',
|
||||
pretty_print=True)
|
||||
f.write(raw)
|
||||
else:
|
||||
f.write(as_bytes(hhcdata))
|
||||
return htmlpath, toc
|
||||
|
||||
def _read_file(self, name):
|
||||
with lopen(name, 'rb') as f:
|
||||
data = f.read()
|
||||
return data
|
||||
|
||||
def add_node(self, node, toc, ancestor_map):
|
||||
from calibre.ebooks.chm.reader import match_string
|
||||
if match_string(node.attrib.get('type', ''), 'text/sitemap'):
|
||||
p = node.xpath('ancestor::ul[1]/ancestor::li[1]/object[1]')
|
||||
parent = p[0] if p else None
|
||||
toc = ancestor_map.get(parent, toc)
|
||||
title = href = ''
|
||||
for param in node.xpath('./param'):
|
||||
if match_string(param.attrib['name'], 'name'):
|
||||
title = param.attrib['value']
|
||||
elif match_string(param.attrib['name'], 'local'):
|
||||
href = param.attrib['value']
|
||||
child = toc.add(title or _('Unknown'), href)
|
||||
ancestor_map[node] = child
|
||||
|
||||
def _process_nodes(self, root):
|
||||
from calibre.ebooks.oeb.base import TOC
|
||||
toc = TOC()
|
||||
ancestor_map = {}
|
||||
for node in root.xpath('//object'):
|
||||
self.add_node(node, toc, ancestor_map)
|
||||
return toc
|
||||
310
ebook_converter/ebooks/conversion/plugins/comic_input.py
Normal file
310
ebook_converter/ebooks/conversion/plugins/comic_input.py
Normal file
@@ -0,0 +1,310 @@
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
'''
|
||||
Based on ideas from comiclrf created by FangornUK.
|
||||
'''
|
||||
|
||||
import shutil, textwrap, codecs, os
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
|
||||
from calibre import CurrentDir
|
||||
from calibre.ptempfile import PersistentTemporaryDirectory
|
||||
from polyglot.builtins import getcwd, map
|
||||
|
||||
|
||||
class ComicInput(InputFormatPlugin):
|
||||
|
||||
name = 'Comic Input'
|
||||
author = 'Kovid Goyal'
|
||||
description = 'Optimize comic files (.cbz, .cbr, .cbc) for viewing on portable devices'
|
||||
file_types = {'cbz', 'cbr', 'cbc'}
|
||||
is_image_collection = True
|
||||
commit_name = 'comic_input'
|
||||
core_usage = -1
|
||||
|
||||
options = {
|
||||
OptionRecommendation(name='colors', recommended_value=0,
|
||||
help=_('Reduce the number of colors used in the image. This works only'
|
||||
' if you choose the PNG output format. It is useful to reduce file sizes.'
|
||||
' Set to zero to turn off. Maximum value is 256. It is off by default.')),
|
||||
OptionRecommendation(name='dont_normalize', recommended_value=False,
|
||||
help=_('Disable normalize (improve contrast) color range '
|
||||
'for pictures. Default: False')),
|
||||
OptionRecommendation(name='keep_aspect_ratio', recommended_value=False,
|
||||
help=_('Maintain picture aspect ratio. Default is to fill the screen.')),
|
||||
OptionRecommendation(name='dont_sharpen', recommended_value=False,
|
||||
help=_('Disable sharpening.')),
|
||||
OptionRecommendation(name='disable_trim', recommended_value=False,
|
||||
help=_('Disable trimming of comic pages. For some comics, '
|
||||
'trimming might remove content as well as borders.')),
|
||||
OptionRecommendation(name='landscape', recommended_value=False,
|
||||
help=_("Don't split landscape images into two portrait images")),
|
||||
OptionRecommendation(name='wide', recommended_value=False,
|
||||
help=_("Keep aspect ratio and scale image using screen height as "
|
||||
"image width for viewing in landscape mode.")),
|
||||
OptionRecommendation(name='right2left', recommended_value=False,
|
||||
help=_('Used for right-to-left publications like manga. '
|
||||
'Causes landscape pages to be split into portrait pages '
|
||||
'from right to left.')),
|
||||
OptionRecommendation(name='despeckle', recommended_value=False,
|
||||
help=_('Enable Despeckle. Reduces speckle noise. '
|
||||
'May greatly increase processing time.')),
|
||||
OptionRecommendation(name='no_sort', recommended_value=False,
|
||||
help=_("Don't sort the files found in the comic "
|
||||
"alphabetically by name. Instead use the order they were "
|
||||
"added to the comic.")),
|
||||
OptionRecommendation(name='output_format', choices=['png', 'jpg'],
|
||||
recommended_value='png', help=_('The format that images in the created e-book '
|
||||
'are converted to. You can experiment to see which format gives '
|
||||
'you optimal size and look on your device.')),
|
||||
OptionRecommendation(name='no_process', recommended_value=False,
|
||||
help=_("Apply no processing to the image")),
|
||||
OptionRecommendation(name='dont_grayscale', recommended_value=False,
|
||||
help=_('Do not convert the image to grayscale (black and white)')),
|
||||
OptionRecommendation(name='comic_image_size', recommended_value=None,
|
||||
help=_('Specify the image size as widthxheight pixels. Normally,'
|
||||
' an image size is automatically calculated from the output '
|
||||
'profile, this option overrides it.')),
|
||||
OptionRecommendation(name='dont_add_comic_pages_to_toc', recommended_value=False,
|
||||
help=_('When converting a CBC do not add links to each page to'
|
||||
' the TOC. Note this only applies if the TOC has more than one'
|
||||
' section')),
|
||||
}
|
||||
|
||||
recommendations = {
|
||||
('margin_left', 0, OptionRecommendation.HIGH),
|
||||
('margin_top', 0, OptionRecommendation.HIGH),
|
||||
('margin_right', 0, OptionRecommendation.HIGH),
|
||||
('margin_bottom', 0, OptionRecommendation.HIGH),
|
||||
('insert_blank_line', False, OptionRecommendation.HIGH),
|
||||
('remove_paragraph_spacing', False, OptionRecommendation.HIGH),
|
||||
('change_justification', 'left', OptionRecommendation.HIGH),
|
||||
('dont_split_on_pagebreaks', True, OptionRecommendation.HIGH),
|
||||
('chapter', None, OptionRecommendation.HIGH),
|
||||
('page_breaks_brefore', None, OptionRecommendation.HIGH),
|
||||
('use_auto_toc', False, OptionRecommendation.HIGH),
|
||||
('page_breaks_before', None, OptionRecommendation.HIGH),
|
||||
('disable_font_rescaling', True, OptionRecommendation.HIGH),
|
||||
('linearize_tables', False, OptionRecommendation.HIGH),
|
||||
}
|
||||
|
||||
def get_comics_from_collection(self, stream):
|
||||
from calibre.libunzip import extract as zipextract
|
||||
tdir = PersistentTemporaryDirectory('_comic_collection')
|
||||
zipextract(stream, tdir)
|
||||
comics = []
|
||||
with CurrentDir(tdir):
|
||||
if not os.path.exists('comics.txt'):
|
||||
raise ValueError((
|
||||
'%s is not a valid comic collection'
|
||||
' no comics.txt was found in the file')
|
||||
%stream.name)
|
||||
with open('comics.txt', 'rb') as f:
|
||||
raw = f.read()
|
||||
if raw.startswith(codecs.BOM_UTF16_BE):
|
||||
raw = raw.decode('utf-16-be')[1:]
|
||||
elif raw.startswith(codecs.BOM_UTF16_LE):
|
||||
raw = raw.decode('utf-16-le')[1:]
|
||||
elif raw.startswith(codecs.BOM_UTF8):
|
||||
raw = raw.decode('utf-8')[1:]
|
||||
else:
|
||||
raw = raw.decode('utf-8')
|
||||
for line in raw.splitlines():
|
||||
line = line.strip()
|
||||
if not line:
|
||||
continue
|
||||
fname, title = line.partition(':')[0], line.partition(':')[-1]
|
||||
fname = fname.replace('#', '_')
|
||||
fname = os.path.join(tdir, *fname.split('/'))
|
||||
if not title:
|
||||
title = os.path.basename(fname).rpartition('.')[0]
|
||||
if os.access(fname, os.R_OK):
|
||||
comics.append([title, fname])
|
||||
if not comics:
|
||||
raise ValueError('%s has no comics'%stream.name)
|
||||
return comics
|
||||
|
||||
def get_pages(self, comic, tdir2):
|
||||
from calibre.ebooks.comic.input import (extract_comic, process_pages,
|
||||
find_pages)
|
||||
tdir = extract_comic(comic)
|
||||
new_pages = find_pages(tdir, sort_on_mtime=self.opts.no_sort,
|
||||
verbose=self.opts.verbose)
|
||||
thumbnail = None
|
||||
if not new_pages:
|
||||
raise ValueError('Could not find any pages in the comic: %s'
|
||||
%comic)
|
||||
if self.opts.no_process:
|
||||
n2 = []
|
||||
for i, page in enumerate(new_pages):
|
||||
n2.append(os.path.join(tdir2, '{} - {}' .format(i, os.path.basename(page))))
|
||||
shutil.copyfile(page, n2[-1])
|
||||
new_pages = n2
|
||||
else:
|
||||
new_pages, failures = process_pages(new_pages, self.opts,
|
||||
self.report_progress, tdir2)
|
||||
if failures:
|
||||
self.log.warning('Could not process the following pages '
|
||||
'(run with --verbose to see why):')
|
||||
for f in failures:
|
||||
self.log.warning('\t', f)
|
||||
if not new_pages:
|
||||
raise ValueError('Could not find any valid pages in comic: %s'
|
||||
% comic)
|
||||
thumbnail = os.path.join(tdir2,
|
||||
'thumbnail.'+self.opts.output_format.lower())
|
||||
if not os.access(thumbnail, os.R_OK):
|
||||
thumbnail = None
|
||||
return new_pages
|
||||
|
||||
def get_images(self):
|
||||
return self._images
|
||||
|
||||
def convert(self, stream, opts, file_ext, log, accelerators):
|
||||
from calibre.ebooks.metadata import MetaInformation
|
||||
from calibre.ebooks.metadata.opf2 import OPFCreator
|
||||
from calibre.ebooks.metadata.toc import TOC
|
||||
|
||||
self.opts, self.log= opts, log
|
||||
if file_ext == 'cbc':
|
||||
comics_ = self.get_comics_from_collection(stream)
|
||||
else:
|
||||
comics_ = [['Comic', os.path.abspath(stream.name)]]
|
||||
stream.close()
|
||||
comics = []
|
||||
for i, x in enumerate(comics_):
|
||||
title, fname = x
|
||||
cdir = 'comic_%d'%(i+1) if len(comics_) > 1 else '.'
|
||||
cdir = os.path.abspath(cdir)
|
||||
if not os.path.exists(cdir):
|
||||
os.makedirs(cdir)
|
||||
pages = self.get_pages(fname, cdir)
|
||||
if not pages:
|
||||
continue
|
||||
if self.for_viewer:
|
||||
comics.append((title, pages, [self.create_viewer_wrapper(pages)]))
|
||||
else:
|
||||
wrappers = self.create_wrappers(pages)
|
||||
comics.append((title, pages, wrappers))
|
||||
|
||||
if not comics:
|
||||
raise ValueError('No comic pages found in %s'%stream.name)
|
||||
|
||||
mi = MetaInformation(os.path.basename(stream.name).rpartition('.')[0],
|
||||
[_('Unknown')])
|
||||
opf = OPFCreator(getcwd(), mi)
|
||||
entries = []
|
||||
|
||||
def href(x):
|
||||
if len(comics) == 1:
|
||||
return os.path.basename(x)
|
||||
return '/'.join(x.split(os.sep)[-2:])
|
||||
|
||||
cover_href = None
|
||||
for comic in comics:
|
||||
pages, wrappers = comic[1:]
|
||||
page_entries = [(x, None) for x in map(href, pages)]
|
||||
entries += [(w, None) for w in map(href, wrappers)] + page_entries
|
||||
if cover_href is None and page_entries:
|
||||
cover_href = page_entries[0][0]
|
||||
opf.create_manifest(entries)
|
||||
spine = []
|
||||
for comic in comics:
|
||||
spine.extend(map(href, comic[2]))
|
||||
self._images = []
|
||||
for comic in comics:
|
||||
self._images.extend(comic[1])
|
||||
opf.create_spine(spine)
|
||||
if self.for_viewer and cover_href:
|
||||
opf.guide.set_cover(cover_href)
|
||||
toc = TOC()
|
||||
if len(comics) == 1:
|
||||
wrappers = comics[0][2]
|
||||
for i, x in enumerate(wrappers):
|
||||
toc.add_item(href(x), None, _('Page')+' %d'%(i+1),
|
||||
play_order=i)
|
||||
else:
|
||||
po = 0
|
||||
for comic in comics:
|
||||
po += 1
|
||||
wrappers = comic[2]
|
||||
stoc = toc.add_item(href(wrappers[0]),
|
||||
None, comic[0], play_order=po)
|
||||
if not opts.dont_add_comic_pages_to_toc:
|
||||
for i, x in enumerate(wrappers):
|
||||
stoc.add_item(href(x), None,
|
||||
_('Page')+' %d'%(i+1), play_order=po)
|
||||
po += 1
|
||||
opf.set_toc(toc)
|
||||
with open('metadata.opf', 'wb') as m, open('toc.ncx', 'wb') as n:
|
||||
opf.render(m, n, 'toc.ncx')
|
||||
return os.path.abspath('metadata.opf')
|
||||
|
||||
def create_wrappers(self, pages):
|
||||
from calibre.ebooks.oeb.base import XHTML_NS
|
||||
wrappers = []
|
||||
WRAPPER = textwrap.dedent('''\
|
||||
<html xmlns="%s">
|
||||
<head>
|
||||
<meta charset="utf-8"/>
|
||||
<title>Page #%d</title>
|
||||
<style type="text/css">
|
||||
@page { margin:0pt; padding: 0pt}
|
||||
body { margin: 0pt; padding: 0pt}
|
||||
div { text-align: center }
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<div>
|
||||
<img src="%s" alt="comic page #%d" />
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
''')
|
||||
dir = os.path.dirname(pages[0])
|
||||
for i, page in enumerate(pages):
|
||||
wrapper = WRAPPER%(XHTML_NS, i+1, os.path.basename(page), i+1)
|
||||
page = os.path.join(dir, 'page_%d.xhtml'%(i+1))
|
||||
with open(page, 'wb') as f:
|
||||
f.write(wrapper.encode('utf-8'))
|
||||
wrappers.append(page)
|
||||
return wrappers
|
||||
|
||||
def create_viewer_wrapper(self, pages):
|
||||
from calibre.ebooks.oeb.base import XHTML_NS
|
||||
|
||||
def page(src):
|
||||
return '<img src="{}"></img>'.format(os.path.basename(src))
|
||||
|
||||
pages = '\n'.join(map(page, pages))
|
||||
base = os.path.dirname(pages[0])
|
||||
wrapper = '''
|
||||
<html xmlns="%s">
|
||||
<head>
|
||||
<meta charset="utf-8"/>
|
||||
<style type="text/css">
|
||||
html, body, img { height: 100vh; display: block; margin: 0; padding: 0; border-width: 0; }
|
||||
img {
|
||||
width: 100%%; height: 100%%;
|
||||
object-fit: contain;
|
||||
margin-left: auto; margin-right: auto;
|
||||
max-width: 100vw; max-height: 100vh;
|
||||
top: 50vh; transform: translateY(-50%%);
|
||||
position: relative;
|
||||
page-break-after: always;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
%s
|
||||
</body>
|
||||
</html>
|
||||
''' % (XHTML_NS, pages)
|
||||
path = os.path.join(base, 'wrapper.xhtml')
|
||||
with open(path, 'wb') as f:
|
||||
f.write(wrapper.encode('utf-8'))
|
||||
return path
|
||||
67
ebook_converter/ebooks/conversion/plugins/djvu_input.py
Normal file
67
ebook_converter/ebooks/conversion/plugins/djvu_input.py
Normal file
@@ -0,0 +1,67 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2011, Anthon van der Neut <anthon@mnt.org>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os
|
||||
from io import BytesIO
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin
|
||||
from polyglot.builtins import getcwd
|
||||
|
||||
|
||||
class DJVUInput(InputFormatPlugin):
|
||||
|
||||
name = 'DJVU Input'
|
||||
author = 'Anthon van der Neut'
|
||||
description = 'Convert OCR-ed DJVU files (.djvu) to HTML'
|
||||
file_types = {'djvu', 'djv'}
|
||||
commit_name = 'djvu_input'
|
||||
|
||||
def convert(self, stream, options, file_ext, log, accelerators):
|
||||
from calibre.ebooks.txt.processor import convert_basic
|
||||
|
||||
stdout = BytesIO()
|
||||
from calibre.ebooks.djvu.djvu import DJVUFile
|
||||
x = DJVUFile(stream)
|
||||
x.get_text(stdout)
|
||||
raw_text = stdout.getvalue()
|
||||
if not raw_text:
|
||||
raise ValueError('The DJVU file contains no text, only images, probably page scans.'
|
||||
' calibre only supports conversion of DJVU files with actual text in them.')
|
||||
|
||||
html = convert_basic(raw_text.replace(b"\n", b' ').replace(
|
||||
b'\037', b'\n\n'))
|
||||
# Run the HTMLized text through the html processing plugin.
|
||||
from calibre.customize.ui import plugin_for_input_format
|
||||
html_input = plugin_for_input_format('html')
|
||||
for opt in html_input.options:
|
||||
setattr(options, opt.option.name, opt.recommended_value)
|
||||
options.input_encoding = 'utf-8'
|
||||
base = getcwd()
|
||||
htmlfile = os.path.join(base, 'index.html')
|
||||
c = 0
|
||||
while os.path.exists(htmlfile):
|
||||
c += 1
|
||||
htmlfile = os.path.join(base, 'index%d.html'%c)
|
||||
with open(htmlfile, 'wb') as f:
|
||||
f.write(html.encode('utf-8'))
|
||||
odi = options.debug_pipeline
|
||||
options.debug_pipeline = None
|
||||
# Generate oeb from html conversion.
|
||||
with open(htmlfile, 'rb') as f:
|
||||
oeb = html_input.convert(f, options, 'html', log,
|
||||
{})
|
||||
options.debug_pipeline = odi
|
||||
os.remove(htmlfile)
|
||||
|
||||
# Set metadata from file.
|
||||
from calibre.customize.ui import get_file_type_metadata
|
||||
from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
|
||||
mi = get_file_type_metadata(stream, file_ext)
|
||||
meta_info_to_oeb_metadata(mi, oeb.metadata, log)
|
||||
|
||||
return oeb
|
||||
34
ebook_converter/ebooks/conversion/plugins/docx_input.py
Normal file
34
ebook_converter/ebooks/conversion/plugins/docx_input.py
Normal file
@@ -0,0 +1,34 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=utf-8
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
|
||||
|
||||
|
||||
class DOCXInput(InputFormatPlugin):
|
||||
name = 'DOCX Input'
|
||||
author = 'Kovid Goyal'
|
||||
description = _('Convert DOCX files (.docx and .docm) to HTML')
|
||||
file_types = {'docx', 'docm'}
|
||||
commit_name = 'docx_input'
|
||||
|
||||
options = {
|
||||
OptionRecommendation(name='docx_no_cover', recommended_value=False,
|
||||
help=_('Normally, if a large image is present at the start of the document that looks like a cover, '
|
||||
'it will be removed from the document and used as the cover for created e-book. This option '
|
||||
'turns off that behavior.')),
|
||||
OptionRecommendation(name='docx_no_pagebreaks_between_notes', recommended_value=False,
|
||||
help=_('Do not insert a page break after every endnote.')),
|
||||
OptionRecommendation(name='docx_inline_subsup', recommended_value=False,
|
||||
help=_('Render superscripts and subscripts so that they do not affect the line height.')),
|
||||
}
|
||||
|
||||
recommendations = {('page_breaks_before', '/', OptionRecommendation.MED)}
|
||||
|
||||
def convert(self, stream, options, file_ext, log, accelerators):
|
||||
from calibre.ebooks.docx.to_html import Convert
|
||||
return Convert(stream, detect_cover=not options.docx_no_cover, log=log, notes_nopb=options.docx_no_pagebreaks_between_notes,
|
||||
nosupsub=options.docx_inline_subsup)()
|
||||
93
ebook_converter/ebooks/conversion/plugins/docx_output.py
Normal file
93
ebook_converter/ebooks/conversion/plugins/docx_output.py
Normal file
@@ -0,0 +1,93 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=utf-8
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
from calibre.customize.conversion import OutputFormatPlugin, OptionRecommendation
|
||||
|
||||
PAGE_SIZES = ['a0', 'a1', 'a2', 'a3', 'a4', 'a5', 'a6', 'b0', 'b1',
|
||||
'b2', 'b3', 'b4', 'b5', 'b6', 'legal', 'letter']
|
||||
|
||||
|
||||
class DOCXOutput(OutputFormatPlugin):
|
||||
|
||||
name = 'DOCX Output'
|
||||
author = 'Kovid Goyal'
|
||||
file_type = 'docx'
|
||||
commit_name = 'docx_output'
|
||||
ui_data = {'page_sizes': PAGE_SIZES}
|
||||
|
||||
options = {
|
||||
OptionRecommendation(name='docx_page_size', recommended_value='letter',
|
||||
level=OptionRecommendation.LOW, choices=PAGE_SIZES,
|
||||
help=_('The size of the page. Default is letter. Choices '
|
||||
'are %s') % PAGE_SIZES),
|
||||
|
||||
OptionRecommendation(name='docx_custom_page_size', recommended_value=None,
|
||||
help=_('Custom size of the document. Use the form widthxheight '
|
||||
'EG. `123x321` to specify the width and height (in pts). '
|
||||
'This overrides any specified page-size.')),
|
||||
|
||||
OptionRecommendation(name='docx_no_cover', recommended_value=False,
|
||||
help=_('Do not insert the book cover as an image at the start of the document.'
|
||||
' If you use this option, the book cover will be discarded.')),
|
||||
|
||||
OptionRecommendation(name='preserve_cover_aspect_ratio', recommended_value=False,
|
||||
help=_('Preserve the aspect ratio of the cover image instead of stretching'
|
||||
' it out to cover the entire page.')),
|
||||
|
||||
OptionRecommendation(name='docx_no_toc', recommended_value=False,
|
||||
help=_('Do not insert the table of contents as a page at the start of the document.')),
|
||||
|
||||
OptionRecommendation(name='extract_to',
|
||||
help=_('Extract the contents of the generated %s file to the '
|
||||
'specified directory. The contents of the directory are first '
|
||||
'deleted, so be careful.') % 'DOCX'),
|
||||
|
||||
OptionRecommendation(name='docx_page_margin_left', recommended_value=72.0,
|
||||
level=OptionRecommendation.LOW,
|
||||
help=_('The size of the left page margin, in pts. Default is 72pt.'
|
||||
' Overrides the common left page margin setting.')
|
||||
),
|
||||
|
||||
OptionRecommendation(name='docx_page_margin_top', recommended_value=72.0,
|
||||
level=OptionRecommendation.LOW,
|
||||
help=_('The size of the top page margin, in pts. Default is 72pt.'
|
||||
' Overrides the common top page margin setting, unless set to zero.')
|
||||
),
|
||||
|
||||
OptionRecommendation(name='docx_page_margin_right', recommended_value=72.0,
|
||||
level=OptionRecommendation.LOW,
|
||||
help=_('The size of the right page margin, in pts. Default is 72pt.'
|
||||
' Overrides the common right page margin setting, unless set to zero.')
|
||||
),
|
||||
|
||||
OptionRecommendation(name='docx_page_margin_bottom', recommended_value=72.0,
|
||||
level=OptionRecommendation.LOW,
|
||||
help=_('The size of the bottom page margin, in pts. Default is 72pt.'
|
||||
' Overrides the common bottom page margin setting, unless set to zero.')
|
||||
),
|
||||
|
||||
}
|
||||
|
||||
def convert_metadata(self, oeb):
|
||||
from lxml import etree
|
||||
from calibre.ebooks.oeb.base import OPF, OPF2_NS
|
||||
from calibre.ebooks.metadata.opf2 import OPF as ReadOPF
|
||||
from io import BytesIO
|
||||
package = etree.Element(OPF('package'), attrib={'version': '2.0'}, nsmap={None: OPF2_NS})
|
||||
oeb.metadata.to_opf2(package)
|
||||
self.mi = ReadOPF(BytesIO(etree.tostring(package, encoding='utf-8')), populate_spine=False, try_to_guess_cover=False).to_book_metadata()
|
||||
|
||||
def convert(self, oeb, output_path, input_plugin, opts, log):
|
||||
from calibre.ebooks.docx.writer.container import DOCX
|
||||
from calibre.ebooks.docx.writer.from_html import Convert
|
||||
docx = DOCX(opts, log)
|
||||
self.convert_metadata(oeb)
|
||||
Convert(oeb, docx, self.mi, not opts.docx_no_cover, not opts.docx_no_toc)()
|
||||
docx.write(output_path, self.mi)
|
||||
if opts.extract_to:
|
||||
from calibre.ebooks.docx.dump import do_dump
|
||||
do_dump(output_path, opts.extract_to)
|
||||
438
ebook_converter/ebooks/conversion/plugins/epub_input.py
Normal file
438
ebook_converter/ebooks/conversion/plugins/epub_input.py
Normal file
@@ -0,0 +1,438 @@
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os, re, posixpath
|
||||
from itertools import cycle
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
|
||||
from polyglot.builtins import getcwd
|
||||
|
||||
ADOBE_OBFUSCATION = 'http://ns.adobe.com/pdf/enc#RC'
|
||||
IDPF_OBFUSCATION = 'http://www.idpf.org/2008/embedding'
|
||||
|
||||
|
||||
def decrypt_font_data(key, data, algorithm):
|
||||
is_adobe = algorithm == ADOBE_OBFUSCATION
|
||||
crypt_len = 1024 if is_adobe else 1040
|
||||
crypt = bytearray(data[:crypt_len])
|
||||
key = cycle(iter(bytearray(key)))
|
||||
decrypt = bytes(bytearray(x^next(key) for x in crypt))
|
||||
return decrypt + data[crypt_len:]
|
||||
|
||||
|
||||
def decrypt_font(key, path, algorithm):
|
||||
with lopen(path, 'r+b') as f:
|
||||
data = decrypt_font_data(key, f.read(), algorithm)
|
||||
f.seek(0), f.truncate(), f.write(data)
|
||||
|
||||
|
||||
class EPUBInput(InputFormatPlugin):
|
||||
|
||||
name = 'EPUB Input'
|
||||
author = 'Kovid Goyal'
|
||||
description = 'Convert EPUB files (.epub) to HTML'
|
||||
file_types = {'epub'}
|
||||
output_encoding = None
|
||||
commit_name = 'epub_input'
|
||||
|
||||
recommendations = {('page_breaks_before', '/', OptionRecommendation.MED)}
|
||||
|
||||
def process_encryption(self, encfile, opf, log):
|
||||
from lxml import etree
|
||||
import uuid, hashlib
|
||||
idpf_key = opf.raw_unique_identifier
|
||||
if idpf_key:
|
||||
idpf_key = re.sub('[\u0020\u0009\u000d\u000a]', '', idpf_key)
|
||||
idpf_key = hashlib.sha1(idpf_key.encode('utf-8')).digest()
|
||||
key = None
|
||||
for item in opf.identifier_iter():
|
||||
scheme = None
|
||||
for xkey in item.attrib.keys():
|
||||
if xkey.endswith('scheme'):
|
||||
scheme = item.get(xkey)
|
||||
if (scheme and scheme.lower() == 'uuid') or \
|
||||
(item.text and item.text.startswith('urn:uuid:')):
|
||||
try:
|
||||
key = item.text.rpartition(':')[-1]
|
||||
key = uuid.UUID(key).bytes
|
||||
except:
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
key = None
|
||||
|
||||
try:
|
||||
root = etree.parse(encfile)
|
||||
for em in root.xpath('descendant::*[contains(name(), "EncryptionMethod")]'):
|
||||
algorithm = em.get('Algorithm', '')
|
||||
if algorithm not in {ADOBE_OBFUSCATION, IDPF_OBFUSCATION}:
|
||||
return False
|
||||
cr = em.getparent().xpath('descendant::*[contains(name(), "CipherReference")]')[0]
|
||||
uri = cr.get('URI')
|
||||
path = os.path.abspath(os.path.join(os.path.dirname(encfile), '..', *uri.split('/')))
|
||||
tkey = (key if algorithm == ADOBE_OBFUSCATION else idpf_key)
|
||||
if (tkey and os.path.exists(path)):
|
||||
self._encrypted_font_uris.append(uri)
|
||||
decrypt_font(tkey, path, algorithm)
|
||||
return True
|
||||
except:
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
return False
|
||||
|
||||
def set_guide_type(self, opf, gtype, href=None, title=''):
|
||||
# Set the specified guide entry
|
||||
for elem in list(opf.iterguide()):
|
||||
if elem.get('type', '').lower() == gtype:
|
||||
elem.getparent().remove(elem)
|
||||
|
||||
if href is not None:
|
||||
t = opf.create_guide_item(gtype, title, href)
|
||||
for guide in opf.root.xpath('./*[local-name()="guide"]'):
|
||||
guide.append(t)
|
||||
return
|
||||
guide = opf.create_guide_element()
|
||||
opf.root.append(guide)
|
||||
guide.append(t)
|
||||
return t
|
||||
|
||||
def rationalize_cover3(self, opf, log):
|
||||
''' If there is a reference to the cover/titlepage via manifest properties, convert to
|
||||
entries in the <guide> so that the rest of the pipeline picks it up. '''
|
||||
from calibre.ebooks.metadata.opf3 import items_with_property
|
||||
removed = guide_titlepage_href = guide_titlepage_id = None
|
||||
|
||||
# Look for titlepages incorrectly marked in the <guide> as covers
|
||||
guide_cover, guide_elem = None, None
|
||||
for guide_elem in opf.iterguide():
|
||||
if guide_elem.get('type', '').lower() == 'cover':
|
||||
guide_cover = guide_elem.get('href', '').partition('#')[0]
|
||||
break
|
||||
if guide_cover:
|
||||
spine = list(opf.iterspine())
|
||||
if spine:
|
||||
idref = spine[0].get('idref', '')
|
||||
for x in opf.itermanifest():
|
||||
if x.get('id') == idref and x.get('href') == guide_cover:
|
||||
guide_titlepage_href = guide_cover
|
||||
guide_titlepage_id = idref
|
||||
break
|
||||
|
||||
raster_cover_href = opf.epub3_raster_cover or opf.raster_cover
|
||||
if raster_cover_href:
|
||||
self.set_guide_type(opf, 'cover', raster_cover_href, 'Cover Image')
|
||||
titlepage_id = titlepage_href = None
|
||||
for item in items_with_property(opf.root, 'calibre:title-page'):
|
||||
tid, href = item.get('id'), item.get('href')
|
||||
if href and tid:
|
||||
titlepage_id, titlepage_href = tid, href.partition('#')[0]
|
||||
break
|
||||
if titlepage_href is None:
|
||||
titlepage_href, titlepage_id = guide_titlepage_href, guide_titlepage_id
|
||||
if titlepage_href is not None:
|
||||
self.set_guide_type(opf, 'titlepage', titlepage_href, 'Title Page')
|
||||
spine = list(opf.iterspine())
|
||||
if len(spine) > 1:
|
||||
for item in spine:
|
||||
if item.get('idref') == titlepage_id:
|
||||
log('Found HTML cover', titlepage_href)
|
||||
if self.for_viewer:
|
||||
item.attrib.pop('linear', None)
|
||||
else:
|
||||
item.getparent().remove(item)
|
||||
removed = titlepage_href
|
||||
return removed
|
||||
|
||||
def rationalize_cover2(self, opf, log):
|
||||
''' Ensure that the cover information in the guide is correct. That
|
||||
means, at most one entry with type="cover" that points to a raster
|
||||
cover and at most one entry with type="titlepage" that points to an
|
||||
HTML titlepage. '''
|
||||
from calibre.ebooks.oeb.base import OPF
|
||||
removed = None
|
||||
from lxml import etree
|
||||
guide_cover, guide_elem = None, None
|
||||
for guide_elem in opf.iterguide():
|
||||
if guide_elem.get('type', '').lower() == 'cover':
|
||||
guide_cover = guide_elem.get('href', '').partition('#')[0]
|
||||
break
|
||||
if not guide_cover:
|
||||
raster_cover = opf.raster_cover
|
||||
if raster_cover:
|
||||
if guide_elem is None:
|
||||
g = opf.root.makeelement(OPF('guide'))
|
||||
opf.root.append(g)
|
||||
else:
|
||||
g = guide_elem.getparent()
|
||||
guide_cover = raster_cover
|
||||
guide_elem = g.makeelement(OPF('reference'), attrib={'href':raster_cover, 'type':'cover'})
|
||||
g.append(guide_elem)
|
||||
return
|
||||
spine = list(opf.iterspine())
|
||||
if not spine:
|
||||
return
|
||||
# Check if the cover specified in the guide is also
|
||||
# the first element in spine
|
||||
idref = spine[0].get('idref', '')
|
||||
manifest = list(opf.itermanifest())
|
||||
if not manifest:
|
||||
return
|
||||
elem = [x for x in manifest if x.get('id', '') == idref]
|
||||
if not elem or elem[0].get('href', None) != guide_cover:
|
||||
return
|
||||
log('Found HTML cover', guide_cover)
|
||||
|
||||
# Remove from spine as covers must be treated
|
||||
# specially
|
||||
if not self.for_viewer:
|
||||
if len(spine) == 1:
|
||||
log.warn('There is only a single spine item and it is marked as the cover. Removing cover marking.')
|
||||
for guide_elem in tuple(opf.iterguide()):
|
||||
if guide_elem.get('type', '').lower() == 'cover':
|
||||
guide_elem.getparent().remove(guide_elem)
|
||||
return
|
||||
else:
|
||||
spine[0].getparent().remove(spine[0])
|
||||
removed = guide_cover
|
||||
else:
|
||||
# Ensure the cover is displayed as the first item in the book, some
|
||||
# epub files have it set with linear='no' which causes the cover to
|
||||
# display in the end
|
||||
spine[0].attrib.pop('linear', None)
|
||||
opf.spine[0].is_linear = True
|
||||
# Ensure that the guide has a cover entry pointing to a raster cover
|
||||
# and a titlepage entry pointing to the html titlepage. The titlepage
|
||||
# entry will be used by the epub output plugin, the raster cover entry
|
||||
# by other output plugins.
|
||||
|
||||
# Search for a raster cover identified in the OPF
|
||||
raster_cover = opf.raster_cover
|
||||
|
||||
# Set the cover guide entry
|
||||
if raster_cover is not None:
|
||||
guide_elem.set('href', raster_cover)
|
||||
else:
|
||||
# Render the titlepage to create a raster cover
|
||||
from calibre.ebooks import render_html_svg_workaround
|
||||
guide_elem.set('href', 'calibre_raster_cover.jpg')
|
||||
t = etree.SubElement(
|
||||
elem[0].getparent(), OPF('item'), href=guide_elem.get('href'), id='calibre_raster_cover')
|
||||
t.set('media-type', 'image/jpeg')
|
||||
if os.path.exists(guide_cover):
|
||||
renderer = render_html_svg_workaround(guide_cover, log)
|
||||
if renderer is not None:
|
||||
with lopen('calibre_raster_cover.jpg', 'wb') as f:
|
||||
f.write(renderer)
|
||||
|
||||
# Set the titlepage guide entry
|
||||
self.set_guide_type(opf, 'titlepage', guide_cover, 'Title Page')
|
||||
return removed
|
||||
|
||||
def find_opf(self):
|
||||
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||
|
||||
def attr(n, attr):
|
||||
for k, v in n.attrib.items():
|
||||
if k.endswith(attr):
|
||||
return v
|
||||
try:
|
||||
with lopen('META-INF/container.xml', 'rb') as f:
|
||||
root = safe_xml_fromstring(f.read())
|
||||
for r in root.xpath('//*[local-name()="rootfile"]'):
|
||||
if attr(r, 'media-type') != "application/oebps-package+xml":
|
||||
continue
|
||||
path = attr(r, 'full-path')
|
||||
if not path:
|
||||
continue
|
||||
path = os.path.join(getcwd(), *path.split('/'))
|
||||
if os.path.exists(path):
|
||||
return path
|
||||
except Exception:
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
|
||||
def convert(self, stream, options, file_ext, log, accelerators):
|
||||
from calibre.utils.zipfile import ZipFile
|
||||
from calibre import walk
|
||||
from calibre.ebooks import DRMError
|
||||
from calibre.ebooks.metadata.opf2 import OPF
|
||||
try:
|
||||
zf = ZipFile(stream)
|
||||
zf.extractall(getcwd())
|
||||
except:
|
||||
log.exception('EPUB appears to be invalid ZIP file, trying a'
|
||||
' more forgiving ZIP parser')
|
||||
from calibre.utils.localunzip import extractall
|
||||
stream.seek(0)
|
||||
extractall(stream)
|
||||
encfile = os.path.abspath(os.path.join('META-INF', 'encryption.xml'))
|
||||
opf = self.find_opf()
|
||||
if opf is None:
|
||||
for f in walk('.'):
|
||||
if f.lower().endswith('.opf') and '__MACOSX' not in f and \
|
||||
not os.path.basename(f).startswith('.'):
|
||||
opf = os.path.abspath(f)
|
||||
break
|
||||
path = getattr(stream, 'name', 'stream')
|
||||
|
||||
if opf is None:
|
||||
raise ValueError('%s is not a valid EPUB file (could not find opf)'%path)
|
||||
|
||||
opf = os.path.relpath(opf, getcwd())
|
||||
parts = os.path.split(opf)
|
||||
opf = OPF(opf, os.path.dirname(os.path.abspath(opf)))
|
||||
|
||||
self._encrypted_font_uris = []
|
||||
if os.path.exists(encfile):
|
||||
if not self.process_encryption(encfile, opf, log):
|
||||
raise DRMError(os.path.basename(path))
|
||||
self.encrypted_fonts = self._encrypted_font_uris
|
||||
|
||||
if len(parts) > 1 and parts[0]:
|
||||
delta = '/'.join(parts[:-1])+'/'
|
||||
|
||||
def normpath(x):
|
||||
return posixpath.normpath(delta + elem.get('href'))
|
||||
|
||||
for elem in opf.itermanifest():
|
||||
elem.set('href', normpath(elem.get('href')))
|
||||
for elem in opf.iterguide():
|
||||
elem.set('href', normpath(elem.get('href')))
|
||||
|
||||
f = self.rationalize_cover3 if opf.package_version >= 3.0 else self.rationalize_cover2
|
||||
self.removed_cover = f(opf, log)
|
||||
if self.removed_cover:
|
||||
self.removed_items_to_ignore = (self.removed_cover,)
|
||||
epub3_nav = opf.epub3_nav
|
||||
if epub3_nav is not None:
|
||||
self.convert_epub3_nav(epub3_nav, opf, log, options)
|
||||
|
||||
for x in opf.itermanifest():
|
||||
if x.get('media-type', '') == 'application/x-dtbook+xml':
|
||||
raise ValueError(
|
||||
'EPUB files with DTBook markup are not supported')
|
||||
|
||||
not_for_spine = set()
|
||||
for y in opf.itermanifest():
|
||||
id_ = y.get('id', None)
|
||||
if id_:
|
||||
mt = y.get('media-type', None)
|
||||
if mt in {
|
||||
'application/vnd.adobe-page-template+xml',
|
||||
'application/vnd.adobe.page-template+xml',
|
||||
'application/adobe-page-template+xml',
|
||||
'application/adobe.page-template+xml',
|
||||
'application/text'
|
||||
}:
|
||||
not_for_spine.add(id_)
|
||||
ext = y.get('href', '').rpartition('.')[-1].lower()
|
||||
if mt == 'text/plain' and ext in {'otf', 'ttf'}:
|
||||
# some epub authoring software sets font mime types to
|
||||
# text/plain
|
||||
not_for_spine.add(id_)
|
||||
y.set('media-type', 'application/font')
|
||||
|
||||
seen = set()
|
||||
for x in list(opf.iterspine()):
|
||||
ref = x.get('idref', None)
|
||||
if not ref or ref in not_for_spine or ref in seen:
|
||||
x.getparent().remove(x)
|
||||
continue
|
||||
seen.add(ref)
|
||||
|
||||
if len(list(opf.iterspine())) == 0:
|
||||
raise ValueError('No valid entries in the spine of this EPUB')
|
||||
|
||||
with lopen('content.opf', 'wb') as nopf:
|
||||
nopf.write(opf.render())
|
||||
|
||||
return os.path.abspath('content.opf')
|
||||
|
||||
def convert_epub3_nav(self, nav_path, opf, log, opts):
|
||||
from lxml import etree
|
||||
from calibre.ebooks.chardet import xml_to_unicode
|
||||
from calibre.ebooks.oeb.polish.parsing import parse
|
||||
from calibre.ebooks.oeb.base import EPUB_NS, XHTML, NCX_MIME, NCX, urlnormalize, urlunquote, serialize
|
||||
from calibre.ebooks.oeb.polish.toc import first_child
|
||||
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||
from tempfile import NamedTemporaryFile
|
||||
with lopen(nav_path, 'rb') as f:
|
||||
raw = f.read()
|
||||
raw = xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0]
|
||||
root = parse(raw, log=log)
|
||||
ncx = safe_xml_fromstring('<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1" xml:lang="eng"><navMap/></ncx>')
|
||||
navmap = ncx[0]
|
||||
et = '{%s}type' % EPUB_NS
|
||||
bn = os.path.basename(nav_path)
|
||||
|
||||
def add_from_li(li, parent):
|
||||
href = text = None
|
||||
for x in li.iterchildren(XHTML('a'), XHTML('span')):
|
||||
text = etree.tostring(
|
||||
x, method='text', encoding='unicode', with_tail=False).strip() or ' '.join(
|
||||
x.xpath('descendant-or-self::*/@title')).strip()
|
||||
href = x.get('href')
|
||||
if href:
|
||||
if href.startswith('#'):
|
||||
href = bn + href
|
||||
break
|
||||
np = parent.makeelement(NCX('navPoint'))
|
||||
parent.append(np)
|
||||
np.append(np.makeelement(NCX('navLabel')))
|
||||
np[0].append(np.makeelement(NCX('text')))
|
||||
np[0][0].text = text
|
||||
if href:
|
||||
np.append(np.makeelement(NCX('content'), attrib={'src':href}))
|
||||
return np
|
||||
|
||||
def process_nav_node(node, toc_parent):
|
||||
for li in node.iterchildren(XHTML('li')):
|
||||
child = add_from_li(li, toc_parent)
|
||||
ol = first_child(li, XHTML('ol'))
|
||||
if child is not None and ol is not None:
|
||||
process_nav_node(ol, child)
|
||||
|
||||
for nav in root.iterdescendants(XHTML('nav')):
|
||||
if nav.get(et) == 'toc':
|
||||
ol = first_child(nav, XHTML('ol'))
|
||||
if ol is not None:
|
||||
process_nav_node(ol, navmap)
|
||||
break
|
||||
else:
|
||||
return
|
||||
|
||||
with NamedTemporaryFile(suffix='.ncx', dir=os.path.dirname(nav_path), delete=False) as f:
|
||||
f.write(etree.tostring(ncx, encoding='utf-8'))
|
||||
ncx_href = os.path.relpath(f.name, getcwd()).replace(os.sep, '/')
|
||||
ncx_id = opf.create_manifest_item(ncx_href, NCX_MIME, append=True).get('id')
|
||||
for spine in opf.root.xpath('//*[local-name()="spine"]'):
|
||||
spine.set('toc', ncx_id)
|
||||
opts.epub3_nav_href = urlnormalize(os.path.relpath(nav_path).replace(os.sep, '/'))
|
||||
opts.epub3_nav_parsed = root
|
||||
if getattr(self, 'removed_cover', None):
|
||||
changed = False
|
||||
base_path = os.path.dirname(nav_path)
|
||||
for elem in root.xpath('//*[@href]'):
|
||||
href, frag = elem.get('href').partition('#')[::2]
|
||||
link_path = os.path.relpath(os.path.join(base_path, urlunquote(href)), base_path)
|
||||
abs_href = urlnormalize(link_path)
|
||||
if abs_href == self.removed_cover:
|
||||
changed = True
|
||||
elem.set('data-calibre-removed-titlepage', '1')
|
||||
if changed:
|
||||
with lopen(nav_path, 'wb') as f:
|
||||
f.write(serialize(root, 'application/xhtml+xml'))
|
||||
|
||||
def postprocess_book(self, oeb, opts, log):
|
||||
rc = getattr(self, 'removed_cover', None)
|
||||
if rc:
|
||||
cover_toc_item = None
|
||||
for item in oeb.toc.iterdescendants():
|
||||
if item.href and item.href.partition('#')[0] == rc:
|
||||
cover_toc_item = item
|
||||
break
|
||||
spine = {x.href for x in oeb.spine}
|
||||
if (cover_toc_item is not None and cover_toc_item not in spine):
|
||||
oeb.toc.item_that_refers_to_cover = cover_toc_item
|
||||
548
ebook_converter/ebooks/conversion/plugins/epub_output.py
Normal file
548
ebook_converter/ebooks/conversion/plugins/epub_output.py
Normal file
@@ -0,0 +1,548 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os, shutil, re
|
||||
|
||||
from calibre.customize.conversion import (OutputFormatPlugin,
|
||||
OptionRecommendation)
|
||||
from calibre.ptempfile import TemporaryDirectory
|
||||
from calibre import CurrentDir
|
||||
from polyglot.builtins import unicode_type, filter, map, zip, range, as_bytes
|
||||
|
||||
block_level_tags = (
|
||||
'address',
|
||||
'body',
|
||||
'blockquote',
|
||||
'center',
|
||||
'dir',
|
||||
'div',
|
||||
'dl',
|
||||
'fieldset',
|
||||
'form',
|
||||
'h1',
|
||||
'h2',
|
||||
'h3',
|
||||
'h4',
|
||||
'h5',
|
||||
'h6',
|
||||
'hr',
|
||||
'isindex',
|
||||
'menu',
|
||||
'noframes',
|
||||
'noscript',
|
||||
'ol',
|
||||
'p',
|
||||
'pre',
|
||||
'table',
|
||||
'ul',
|
||||
)
|
||||
|
||||
|
||||
class EPUBOutput(OutputFormatPlugin):
|
||||
|
||||
name = 'EPUB Output'
|
||||
author = 'Kovid Goyal'
|
||||
file_type = 'epub'
|
||||
commit_name = 'epub_output'
|
||||
ui_data = {'versions': ('2', '3')}
|
||||
|
||||
options = {
|
||||
OptionRecommendation(name='extract_to',
|
||||
help=_('Extract the contents of the generated %s file to the '
|
||||
'specified directory. The contents of the directory are first '
|
||||
'deleted, so be careful.') % 'EPUB'),
|
||||
|
||||
OptionRecommendation(name='dont_split_on_page_breaks',
|
||||
recommended_value=False, level=OptionRecommendation.LOW,
|
||||
help=_('Turn off splitting at page breaks. Normally, input '
|
||||
'files are automatically split at every page break into '
|
||||
'two files. This gives an output e-book that can be '
|
||||
'parsed faster and with less resources. However, '
|
||||
'splitting is slow and if your source file contains a '
|
||||
'very large number of page breaks, you should turn off '
|
||||
'splitting on page breaks.'
|
||||
)
|
||||
),
|
||||
|
||||
OptionRecommendation(name='flow_size', recommended_value=260,
|
||||
help=_('Split all HTML files larger than this size (in KB). '
|
||||
'This is necessary as most EPUB readers cannot handle large '
|
||||
'file sizes. The default of %defaultKB is the size required '
|
||||
'for Adobe Digital Editions. Set to 0 to disable size based splitting.')
|
||||
),
|
||||
|
||||
OptionRecommendation(name='no_default_epub_cover', recommended_value=False,
|
||||
help=_('Normally, if the input file has no cover and you don\'t'
|
||||
' specify one, a default cover is generated with the title, '
|
||||
'authors, etc. This option disables the generation of this cover.')
|
||||
),
|
||||
|
||||
OptionRecommendation(name='no_svg_cover', recommended_value=False,
|
||||
help=_('Do not use SVG for the book cover. Use this option if '
|
||||
'your EPUB is going to be used on a device that does not '
|
||||
'support SVG, like the iPhone or the JetBook Lite. '
|
||||
'Without this option, such devices will display the cover '
|
||||
'as a blank page.')
|
||||
),
|
||||
|
||||
OptionRecommendation(name='preserve_cover_aspect_ratio',
|
||||
recommended_value=False, help=_(
|
||||
'When using an SVG cover, this option will cause the cover to scale '
|
||||
'to cover the available screen area, but still preserve its aspect ratio '
|
||||
'(ratio of width to height). That means there may be white borders '
|
||||
'at the sides or top and bottom of the image, but the image will '
|
||||
'never be distorted. Without this option the image may be slightly '
|
||||
'distorted, but there will be no borders.'
|
||||
)
|
||||
),
|
||||
|
||||
OptionRecommendation(name='epub_flatten', recommended_value=False,
|
||||
help=_('This option is needed only if you intend to use the EPUB'
|
||||
' with FBReaderJ. It will flatten the file system inside the'
|
||||
' EPUB, putting all files into the top level.')
|
||||
),
|
||||
|
||||
OptionRecommendation(name='epub_inline_toc', recommended_value=False,
|
||||
help=_('Insert an inline Table of Contents that will appear as part of the main book content.')
|
||||
),
|
||||
|
||||
OptionRecommendation(name='epub_toc_at_end', recommended_value=False,
|
||||
help=_('Put the inserted inline Table of Contents at the end of the book instead of the start.')
|
||||
),
|
||||
|
||||
OptionRecommendation(name='toc_title', recommended_value=None,
|
||||
help=_('Title for any generated in-line table of contents.')
|
||||
),
|
||||
|
||||
OptionRecommendation(name='epub_version', recommended_value='2', choices=ui_data['versions'],
|
||||
help=_('The version of the EPUB file to generate. EPUB 2 is the'
|
||||
' most widely compatible, only use EPUB 3 if you know you'
|
||||
' actually need it.')
|
||||
),
|
||||
|
||||
}
|
||||
|
||||
recommendations = {('pretty_print', True, OptionRecommendation.HIGH)}
|
||||
|
||||
def workaround_webkit_quirks(self): # {{{
|
||||
from calibre.ebooks.oeb.base import XPath
|
||||
for x in self.oeb.spine:
|
||||
root = x.data
|
||||
body = XPath('//h:body')(root)
|
||||
if body:
|
||||
body = body[0]
|
||||
|
||||
if not hasattr(body, 'xpath'):
|
||||
continue
|
||||
|
||||
for pre in XPath('//h:pre')(body):
|
||||
if not pre.text and len(pre) == 0:
|
||||
pre.tag = 'div'
|
||||
# }}}
|
||||
|
||||
def upshift_markup(self): # {{{
|
||||
'Upgrade markup to comply with XHTML 1.1 where possible'
|
||||
from calibre.ebooks.oeb.base import XPath, XML
|
||||
for x in self.oeb.spine:
|
||||
root = x.data
|
||||
if (not root.get(XML('lang'))) and (root.get('lang')):
|
||||
root.set(XML('lang'), root.get('lang'))
|
||||
body = XPath('//h:body')(root)
|
||||
if body:
|
||||
body = body[0]
|
||||
|
||||
if not hasattr(body, 'xpath'):
|
||||
continue
|
||||
for u in XPath('//h:u')(root):
|
||||
u.tag = 'span'
|
||||
|
||||
seen_ids, seen_names = set(), set()
|
||||
for x in XPath('//*[@id or @name]')(root):
|
||||
eid, name = x.get('id', None), x.get('name', None)
|
||||
if eid:
|
||||
if eid in seen_ids:
|
||||
del x.attrib['id']
|
||||
else:
|
||||
seen_ids.add(eid)
|
||||
if name:
|
||||
if name in seen_names:
|
||||
del x.attrib['name']
|
||||
else:
|
||||
seen_names.add(name)
|
||||
|
||||
# }}}
|
||||
|
||||
def convert(self, oeb, output_path, input_plugin, opts, log):
|
||||
self.log, self.opts, self.oeb = log, opts, oeb
|
||||
|
||||
if self.opts.epub_inline_toc:
|
||||
from calibre.ebooks.mobi.writer8.toc import TOCAdder
|
||||
opts.mobi_toc_at_start = not opts.epub_toc_at_end
|
||||
opts.mobi_passthrough = False
|
||||
opts.no_inline_toc = False
|
||||
TOCAdder(oeb, opts, replace_previous_inline_toc=True, ignore_existing_toc=True)
|
||||
|
||||
if self.opts.epub_flatten:
|
||||
from calibre.ebooks.oeb.transforms.filenames import FlatFilenames
|
||||
FlatFilenames()(oeb, opts)
|
||||
else:
|
||||
from calibre.ebooks.oeb.transforms.filenames import UniqueFilenames
|
||||
UniqueFilenames()(oeb, opts)
|
||||
|
||||
self.workaround_ade_quirks()
|
||||
self.workaround_webkit_quirks()
|
||||
self.upshift_markup()
|
||||
from calibre.ebooks.oeb.transforms.rescale import RescaleImages
|
||||
RescaleImages(check_colorspaces=True)(oeb, opts)
|
||||
|
||||
from calibre.ebooks.oeb.transforms.split import Split
|
||||
split = Split(not self.opts.dont_split_on_page_breaks,
|
||||
max_flow_size=self.opts.flow_size*1024
|
||||
)
|
||||
split(self.oeb, self.opts)
|
||||
|
||||
from calibre.ebooks.oeb.transforms.cover import CoverManager
|
||||
cm = CoverManager(
|
||||
no_default_cover=self.opts.no_default_epub_cover,
|
||||
no_svg_cover=self.opts.no_svg_cover,
|
||||
preserve_aspect_ratio=self.opts.preserve_cover_aspect_ratio)
|
||||
cm(self.oeb, self.opts, self.log)
|
||||
|
||||
self.workaround_sony_quirks()
|
||||
|
||||
if self.oeb.toc.count() == 0:
|
||||
self.log.warn('This EPUB file has no Table of Contents. '
|
||||
'Creating a default TOC')
|
||||
first = next(iter(self.oeb.spine))
|
||||
self.oeb.toc.add(_('Start'), first.href)
|
||||
|
||||
from calibre.ebooks.oeb.base import OPF
|
||||
identifiers = oeb.metadata['identifier']
|
||||
uuid = None
|
||||
for x in identifiers:
|
||||
if x.get(OPF('scheme'), None).lower() == 'uuid' or unicode_type(x).startswith('urn:uuid:'):
|
||||
uuid = unicode_type(x).split(':')[-1]
|
||||
break
|
||||
encrypted_fonts = getattr(input_plugin, 'encrypted_fonts', [])
|
||||
|
||||
if uuid is None:
|
||||
self.log.warn('No UUID identifier found')
|
||||
from uuid import uuid4
|
||||
uuid = unicode_type(uuid4())
|
||||
oeb.metadata.add('identifier', uuid, scheme='uuid', id=uuid)
|
||||
|
||||
if encrypted_fonts and not uuid.startswith('urn:uuid:'):
|
||||
# Apparently ADE requires this value to start with urn:uuid:
|
||||
# for some absurd reason, or it will throw a hissy fit and refuse
|
||||
# to use the obfuscated fonts.
|
||||
for x in identifiers:
|
||||
if unicode_type(x) == uuid:
|
||||
x.content = 'urn:uuid:'+uuid
|
||||
|
||||
with TemporaryDirectory('_epub_output') as tdir:
|
||||
from calibre.customize.ui import plugin_for_output_format
|
||||
metadata_xml = None
|
||||
extra_entries = []
|
||||
if self.is_periodical:
|
||||
if self.opts.output_profile.epub_periodical_format == 'sony':
|
||||
from calibre.ebooks.epub.periodical import sony_metadata
|
||||
metadata_xml, atom_xml = sony_metadata(oeb)
|
||||
extra_entries = [('atom.xml', 'application/atom+xml', atom_xml)]
|
||||
oeb_output = plugin_for_output_format('oeb')
|
||||
oeb_output.convert(oeb, tdir, input_plugin, opts, log)
|
||||
opf = [x for x in os.listdir(tdir) if x.endswith('.opf')][0]
|
||||
self.condense_ncx([os.path.join(tdir, x) for x in os.listdir(tdir)
|
||||
if x.endswith('.ncx')][0])
|
||||
if self.opts.epub_version == '3':
|
||||
self.upgrade_to_epub3(tdir, opf)
|
||||
encryption = None
|
||||
if encrypted_fonts:
|
||||
encryption = self.encrypt_fonts(encrypted_fonts, tdir, uuid)
|
||||
|
||||
from calibre.ebooks.epub import initialize_container
|
||||
with initialize_container(output_path, os.path.basename(opf),
|
||||
extra_entries=extra_entries) as epub:
|
||||
epub.add_dir(tdir)
|
||||
if encryption is not None:
|
||||
epub.writestr('META-INF/encryption.xml', as_bytes(encryption))
|
||||
if metadata_xml is not None:
|
||||
epub.writestr('META-INF/metadata.xml',
|
||||
metadata_xml.encode('utf-8'))
|
||||
if opts.extract_to is not None:
|
||||
from calibre.utils.zipfile import ZipFile
|
||||
if os.path.exists(opts.extract_to):
|
||||
if os.path.isdir(opts.extract_to):
|
||||
shutil.rmtree(opts.extract_to)
|
||||
else:
|
||||
os.remove(opts.extract_to)
|
||||
os.mkdir(opts.extract_to)
|
||||
with ZipFile(output_path) as zf:
|
||||
zf.extractall(path=opts.extract_to)
|
||||
self.log.info('EPUB extracted to', opts.extract_to)
|
||||
|
||||
def upgrade_to_epub3(self, tdir, opf):
|
||||
self.log.info('Upgrading to EPUB 3...')
|
||||
from calibre.ebooks.epub import simple_container_xml
|
||||
from calibre.ebooks.oeb.polish.cover import fix_conversion_titlepage_links_in_nav
|
||||
try:
|
||||
os.mkdir(os.path.join(tdir, 'META-INF'))
|
||||
except EnvironmentError:
|
||||
pass
|
||||
with open(os.path.join(tdir, 'META-INF', 'container.xml'), 'wb') as f:
|
||||
f.write(simple_container_xml(os.path.basename(opf)).encode('utf-8'))
|
||||
from calibre.ebooks.oeb.polish.container import EpubContainer
|
||||
container = EpubContainer(tdir, self.log)
|
||||
from calibre.ebooks.oeb.polish.upgrade import epub_2_to_3
|
||||
existing_nav = getattr(self.opts, 'epub3_nav_parsed', None)
|
||||
nav_href = getattr(self.opts, 'epub3_nav_href', None)
|
||||
previous_nav = (nav_href, existing_nav) if existing_nav and nav_href else None
|
||||
epub_2_to_3(container, self.log.info, previous_nav=previous_nav)
|
||||
fix_conversion_titlepage_links_in_nav(container)
|
||||
container.commit()
|
||||
os.remove(f.name)
|
||||
try:
|
||||
os.rmdir(os.path.join(tdir, 'META-INF'))
|
||||
except EnvironmentError:
|
||||
pass
|
||||
|
||||
def encrypt_fonts(self, uris, tdir, uuid): # {{{
|
||||
from polyglot.binary import from_hex_bytes
|
||||
|
||||
key = re.sub(r'[^a-fA-F0-9]', '', uuid)
|
||||
if len(key) < 16:
|
||||
raise ValueError('UUID identifier %r is invalid'%uuid)
|
||||
key = bytearray(from_hex_bytes((key + key)[:32]))
|
||||
paths = []
|
||||
with CurrentDir(tdir):
|
||||
paths = [os.path.join(*x.split('/')) for x in uris]
|
||||
uris = dict(zip(uris, paths))
|
||||
fonts = []
|
||||
for uri in list(uris.keys()):
|
||||
path = uris[uri]
|
||||
if not os.path.exists(path):
|
||||
uris.pop(uri)
|
||||
continue
|
||||
self.log.debug('Encrypting font:', uri)
|
||||
with lopen(path, 'r+b') as f:
|
||||
data = f.read(1024)
|
||||
if len(data) >= 1024:
|
||||
data = bytearray(data)
|
||||
f.seek(0)
|
||||
f.write(bytes(bytearray(data[i] ^ key[i%16] for i in range(1024))))
|
||||
else:
|
||||
self.log.warn('Font', path, 'is invalid, ignoring')
|
||||
if not isinstance(uri, unicode_type):
|
||||
uri = uri.decode('utf-8')
|
||||
fonts.append('''
|
||||
<enc:EncryptedData>
|
||||
<enc:EncryptionMethod Algorithm="http://ns.adobe.com/pdf/enc#RC"/>
|
||||
<enc:CipherData>
|
||||
<enc:CipherReference URI="%s"/>
|
||||
</enc:CipherData>
|
||||
</enc:EncryptedData>
|
||||
'''%(uri.replace('"', '\\"')))
|
||||
if fonts:
|
||||
ans = '''<encryption
|
||||
xmlns="urn:oasis:names:tc:opendocument:xmlns:container"
|
||||
xmlns:enc="http://www.w3.org/2001/04/xmlenc#"
|
||||
xmlns:deenc="http://ns.adobe.com/digitaleditions/enc">
|
||||
'''
|
||||
ans += '\n'.join(fonts)
|
||||
ans += '\n</encryption>'
|
||||
return ans
|
||||
# }}}
|
||||
|
||||
def condense_ncx(self, ncx_path): # {{{
|
||||
from lxml import etree
|
||||
if not self.opts.pretty_print:
|
||||
tree = etree.parse(ncx_path)
|
||||
for tag in tree.getroot().iter(tag=etree.Element):
|
||||
if tag.text:
|
||||
tag.text = tag.text.strip()
|
||||
if tag.tail:
|
||||
tag.tail = tag.tail.strip()
|
||||
compressed = etree.tostring(tree.getroot(), encoding='utf-8')
|
||||
with open(ncx_path, 'wb') as f:
|
||||
f.write(compressed)
|
||||
# }}}
|
||||
|
||||
def workaround_ade_quirks(self): # {{{
|
||||
'''
|
||||
Perform various markup transforms to get the output to render correctly
|
||||
in the quirky ADE.
|
||||
'''
|
||||
from calibre.ebooks.oeb.base import XPath, XHTML, barename, urlunquote
|
||||
|
||||
stylesheet = self.oeb.manifest.main_stylesheet
|
||||
|
||||
# ADE cries big wet tears when it encounters an invalid fragment
|
||||
# identifier in the NCX toc.
|
||||
frag_pat = re.compile(r'[-A-Za-z0-9_:.]+$')
|
||||
for node in self.oeb.toc.iter():
|
||||
href = getattr(node, 'href', None)
|
||||
if hasattr(href, 'partition'):
|
||||
base, _, frag = href.partition('#')
|
||||
frag = urlunquote(frag)
|
||||
if frag and frag_pat.match(frag) is None:
|
||||
self.log.warn(
|
||||
'Removing fragment identifier %r from TOC as Adobe Digital Editions cannot handle it'%frag)
|
||||
node.href = base
|
||||
|
||||
for x in self.oeb.spine:
|
||||
root = x.data
|
||||
body = XPath('//h:body')(root)
|
||||
if body:
|
||||
body = body[0]
|
||||
|
||||
if hasattr(body, 'xpath'):
|
||||
# remove <img> tags with empty src elements
|
||||
bad = []
|
||||
for x in XPath('//h:img')(body):
|
||||
src = x.get('src', '').strip()
|
||||
if src in ('', '#') or src.startswith('http:'):
|
||||
bad.append(x)
|
||||
for img in bad:
|
||||
img.getparent().remove(img)
|
||||
|
||||
# Add id attribute to <a> tags that have name
|
||||
for x in XPath('//h:a[@name]')(body):
|
||||
if not x.get('id', False):
|
||||
x.set('id', x.get('name'))
|
||||
# The delightful epubcheck has started complaining about <a> tags that
|
||||
# have name attributes.
|
||||
x.attrib.pop('name')
|
||||
|
||||
# Replace <br> that are children of <body> as ADE doesn't handle them
|
||||
for br in XPath('./h:br')(body):
|
||||
if br.getparent() is None:
|
||||
continue
|
||||
try:
|
||||
prior = next(br.itersiblings(preceding=True))
|
||||
priortag = barename(prior.tag)
|
||||
priortext = prior.tail
|
||||
except:
|
||||
priortag = 'body'
|
||||
priortext = body.text
|
||||
if priortext:
|
||||
priortext = priortext.strip()
|
||||
br.tag = XHTML('p')
|
||||
br.text = '\u00a0'
|
||||
style = br.get('style', '').split(';')
|
||||
style = list(filter(None, map(lambda x: x.strip(), style)))
|
||||
style.append('margin:0pt; border:0pt')
|
||||
# If the prior tag is a block (including a <br> we replaced)
|
||||
# then this <br> replacement should have a 1-line height.
|
||||
# Otherwise it should have no height.
|
||||
if not priortext and priortag in block_level_tags:
|
||||
style.append('height:1em')
|
||||
else:
|
||||
style.append('height:0pt')
|
||||
br.set('style', '; '.join(style))
|
||||
|
||||
for tag in XPath('//h:embed')(root):
|
||||
tag.getparent().remove(tag)
|
||||
for tag in XPath('//h:object')(root):
|
||||
if tag.get('type', '').lower().strip() in {'image/svg+xml', 'application/svg+xml'}:
|
||||
continue
|
||||
tag.getparent().remove(tag)
|
||||
|
||||
for tag in XPath('//h:title|//h:style')(root):
|
||||
if not tag.text:
|
||||
tag.getparent().remove(tag)
|
||||
for tag in XPath('//h:script')(root):
|
||||
if (not tag.text and not tag.get('src', False) and tag.get('type', None) != 'text/x-mathjax-config'):
|
||||
tag.getparent().remove(tag)
|
||||
for tag in XPath('//h:body/descendant::h:script')(root):
|
||||
tag.getparent().remove(tag)
|
||||
|
||||
formchildren = XPath('./h:input|./h:button|./h:textarea|'
|
||||
'./h:label|./h:fieldset|./h:legend')
|
||||
for tag in XPath('//h:form')(root):
|
||||
if formchildren(tag):
|
||||
tag.getparent().remove(tag)
|
||||
else:
|
||||
# Not a real form
|
||||
tag.tag = XHTML('div')
|
||||
|
||||
for tag in XPath('//h:center')(root):
|
||||
tag.tag = XHTML('div')
|
||||
tag.set('style', 'text-align:center')
|
||||
# ADE can't handle & in an img url
|
||||
for tag in XPath('//h:img[@src]')(root):
|
||||
tag.set('src', tag.get('src', '').replace('&', ''))
|
||||
|
||||
# ADE whimpers in fright when it encounters a <td> outside a
|
||||
# <table>
|
||||
in_table = XPath('ancestor::h:table')
|
||||
for tag in XPath('//h:td|//h:tr|//h:th')(root):
|
||||
if not in_table(tag):
|
||||
tag.tag = XHTML('div')
|
||||
|
||||
# ADE fails to render non breaking hyphens/soft hyphens/zero width spaces
|
||||
special_chars = re.compile('[\u200b\u00ad]')
|
||||
for elem in root.iterdescendants('*'):
|
||||
if elem.text:
|
||||
elem.text = special_chars.sub('', elem.text)
|
||||
elem.text = elem.text.replace('\u2011', '-')
|
||||
if elem.tail:
|
||||
elem.tail = special_chars.sub('', elem.tail)
|
||||
elem.tail = elem.tail.replace('\u2011', '-')
|
||||
|
||||
if stylesheet is not None:
|
||||
# ADE doesn't render lists correctly if they have left margins
|
||||
from css_parser.css import CSSRule
|
||||
for lb in XPath('//h:ul[@class]|//h:ol[@class]')(root):
|
||||
sel = '.'+lb.get('class')
|
||||
for rule in stylesheet.data.cssRules.rulesOfType(CSSRule.STYLE_RULE):
|
||||
if sel == rule.selectorList.selectorText:
|
||||
rule.style.removeProperty('margin-left')
|
||||
# padding-left breaks rendering in webkit and gecko
|
||||
rule.style.removeProperty('padding-left')
|
||||
# Change whitespace:pre to pre-wrap to accommodate readers that
|
||||
# cannot scroll horizontally
|
||||
for rule in stylesheet.data.cssRules.rulesOfType(CSSRule.STYLE_RULE):
|
||||
style = rule.style
|
||||
ws = style.getPropertyValue('white-space')
|
||||
if ws == 'pre':
|
||||
style.setProperty('white-space', 'pre-wrap')
|
||||
|
||||
# }}}
|
||||
|
||||
def workaround_sony_quirks(self): # {{{
|
||||
'''
|
||||
Perform toc link transforms to alleviate slow loading.
|
||||
'''
|
||||
from calibre.ebooks.oeb.base import urldefrag, XPath
|
||||
from calibre.ebooks.oeb.polish.toc import item_at_top
|
||||
|
||||
def frag_is_at_top(root, frag):
|
||||
elem = XPath('//*[@id="%s" or @name="%s"]'%(frag, frag))(root)
|
||||
if elem:
|
||||
elem = elem[0]
|
||||
else:
|
||||
return False
|
||||
return item_at_top(elem)
|
||||
|
||||
def simplify_toc_entry(toc):
|
||||
if toc.href:
|
||||
href, frag = urldefrag(toc.href)
|
||||
if frag:
|
||||
for x in self.oeb.spine:
|
||||
if x.href == href:
|
||||
if frag_is_at_top(x.data, frag):
|
||||
self.log.debug('Removing anchor from TOC href:',
|
||||
href+'#'+frag)
|
||||
toc.href = href
|
||||
break
|
||||
for x in toc:
|
||||
simplify_toc_entry(x)
|
||||
|
||||
if self.oeb.toc:
|
||||
simplify_toc_entry(self.oeb.toc)
|
||||
|
||||
# }}}
|
||||
179
ebook_converter/ebooks/conversion/plugins/fb2_input.py
Normal file
179
ebook_converter/ebooks/conversion/plugins/fb2_input.py
Normal file
@@ -0,0 +1,179 @@
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Anatoly Shipitsin <norguhtar at gmail.com>'
|
||||
"""
|
||||
Convert .fb2 files to .lrf
|
||||
"""
|
||||
import os, re
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
|
||||
from calibre import guess_type
|
||||
from polyglot.builtins import iteritems, getcwd
|
||||
|
||||
FB2NS = 'http://www.gribuser.ru/xml/fictionbook/2.0'
|
||||
FB21NS = 'http://www.gribuser.ru/xml/fictionbook/2.1'
|
||||
|
||||
|
||||
class FB2Input(InputFormatPlugin):
|
||||
|
||||
name = 'FB2 Input'
|
||||
author = 'Anatoly Shipitsin'
|
||||
description = 'Convert FB2 and FBZ files to HTML'
|
||||
file_types = {'fb2', 'fbz'}
|
||||
commit_name = 'fb2_input'
|
||||
|
||||
recommendations = {
|
||||
('level1_toc', '//h:h1', OptionRecommendation.MED),
|
||||
('level2_toc', '//h:h2', OptionRecommendation.MED),
|
||||
('level3_toc', '//h:h3', OptionRecommendation.MED),
|
||||
}
|
||||
|
||||
options = {
|
||||
OptionRecommendation(name='no_inline_fb2_toc',
|
||||
recommended_value=False, level=OptionRecommendation.LOW,
|
||||
help=_('Do not insert a Table of Contents at the beginning of the book.'
|
||||
)
|
||||
)}
|
||||
|
||||
def convert(self, stream, options, file_ext, log,
|
||||
accelerators):
|
||||
from lxml import etree
|
||||
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||
from calibre.ebooks.metadata.fb2 import ensure_namespace, get_fb2_data
|
||||
from calibre.ebooks.metadata.opf2 import OPFCreator
|
||||
from calibre.ebooks.metadata.meta import get_metadata
|
||||
from calibre.ebooks.oeb.base import XLINK_NS, XHTML_NS
|
||||
from calibre.ebooks.chardet import xml_to_unicode
|
||||
self.log = log
|
||||
log.debug('Parsing XML...')
|
||||
raw = get_fb2_data(stream)[0]
|
||||
raw = raw.replace(b'\0', b'')
|
||||
raw = xml_to_unicode(raw, strip_encoding_pats=True,
|
||||
assume_utf8=True, resolve_entities=True)[0]
|
||||
try:
|
||||
doc = safe_xml_fromstring(raw)
|
||||
except etree.XMLSyntaxError:
|
||||
doc = safe_xml_fromstring(raw.replace('& ', '&'))
|
||||
if doc is None:
|
||||
raise ValueError('The FB2 file is not valid XML')
|
||||
doc = ensure_namespace(doc)
|
||||
try:
|
||||
fb_ns = doc.nsmap[doc.prefix]
|
||||
except Exception:
|
||||
fb_ns = FB2NS
|
||||
|
||||
NAMESPACES = {'f':fb_ns, 'l':XLINK_NS}
|
||||
stylesheets = doc.xpath('//*[local-name() = "stylesheet" and @type="text/css"]')
|
||||
css = ''
|
||||
for s in stylesheets:
|
||||
css += etree.tostring(s, encoding='unicode', method='text',
|
||||
with_tail=False) + '\n\n'
|
||||
if css:
|
||||
import css_parser, logging
|
||||
parser = css_parser.CSSParser(fetcher=None,
|
||||
log=logging.getLogger('calibre.css'))
|
||||
|
||||
XHTML_CSS_NAMESPACE = '@namespace "%s";\n' % XHTML_NS
|
||||
text = XHTML_CSS_NAMESPACE + css
|
||||
log.debug('Parsing stylesheet...')
|
||||
stylesheet = parser.parseString(text)
|
||||
stylesheet.namespaces['h'] = XHTML_NS
|
||||
css = stylesheet.cssText
|
||||
if isinstance(css, bytes):
|
||||
css = css.decode('utf-8', 'replace')
|
||||
css = css.replace('h|style', 'h|span')
|
||||
css = re.sub(r'name\s*=\s*', 'class=', css)
|
||||
self.extract_embedded_content(doc)
|
||||
log.debug('Converting XML to HTML...')
|
||||
with open(P('templates/fb2.xsl'), 'rb') as f:
|
||||
ss = f.read().decode('utf-8')
|
||||
ss = ss.replace("__FB_NS__", fb_ns)
|
||||
if options.no_inline_fb2_toc:
|
||||
log('Disabling generation of inline FB2 TOC')
|
||||
ss = re.compile(r'<!-- BUILD TOC -->.*<!-- END BUILD TOC -->',
|
||||
re.DOTALL).sub('', ss)
|
||||
|
||||
styledoc = safe_xml_fromstring(ss)
|
||||
|
||||
transform = etree.XSLT(styledoc)
|
||||
result = transform(doc)
|
||||
|
||||
# Handle links of type note and cite
|
||||
notes = {a.get('href')[1:]: a for a in result.xpath('//a[@link_note and @href]') if a.get('href').startswith('#')}
|
||||
cites = {a.get('link_cite'): a for a in result.xpath('//a[@link_cite]') if not a.get('href', '')}
|
||||
all_ids = {x for x in result.xpath('//*/@id')}
|
||||
for cite, a in iteritems(cites):
|
||||
note = notes.get(cite, None)
|
||||
if note:
|
||||
c = 1
|
||||
while 'cite%d' % c in all_ids:
|
||||
c += 1
|
||||
if not note.get('id', None):
|
||||
note.set('id', 'cite%d' % c)
|
||||
all_ids.add(note.get('id'))
|
||||
a.set('href', '#%s' % note.get('id'))
|
||||
for x in result.xpath('//*[@link_note or @link_cite]'):
|
||||
x.attrib.pop('link_note', None)
|
||||
x.attrib.pop('link_cite', None)
|
||||
|
||||
for img in result.xpath('//img[@src]'):
|
||||
src = img.get('src')
|
||||
img.set('src', self.binary_map.get(src, src))
|
||||
index = transform.tostring(result)
|
||||
with open('index.xhtml', 'wb') as f:
|
||||
f.write(index.encode('utf-8'))
|
||||
with open('inline-styles.css', 'wb') as f:
|
||||
f.write(css.encode('utf-8'))
|
||||
stream.seek(0)
|
||||
mi = get_metadata(stream, 'fb2')
|
||||
if not mi.title:
|
||||
mi.title = _('Unknown')
|
||||
if not mi.authors:
|
||||
mi.authors = [_('Unknown')]
|
||||
cpath = None
|
||||
if mi.cover_data and mi.cover_data[1]:
|
||||
with open('fb2_cover_calibre_mi.jpg', 'wb') as f:
|
||||
f.write(mi.cover_data[1])
|
||||
cpath = os.path.abspath('fb2_cover_calibre_mi.jpg')
|
||||
else:
|
||||
for img in doc.xpath('//f:coverpage/f:image', namespaces=NAMESPACES):
|
||||
href = img.get('{%s}href'%XLINK_NS, img.get('href', None))
|
||||
if href is not None:
|
||||
if href.startswith('#'):
|
||||
href = href[1:]
|
||||
cpath = os.path.abspath(href)
|
||||
break
|
||||
|
||||
opf = OPFCreator(getcwd(), mi)
|
||||
entries = [(f2, guess_type(f2)[0]) for f2 in os.listdir(u'.')]
|
||||
opf.create_manifest(entries)
|
||||
opf.create_spine(['index.xhtml'])
|
||||
if cpath:
|
||||
opf.guide.set_cover(cpath)
|
||||
with open('metadata.opf', 'wb') as f:
|
||||
opf.render(f)
|
||||
return os.path.join(getcwd(), 'metadata.opf')
|
||||
|
||||
def extract_embedded_content(self, doc):
|
||||
from calibre.ebooks.fb2 import base64_decode
|
||||
self.binary_map = {}
|
||||
for elem in doc.xpath('./*'):
|
||||
if elem.text and 'binary' in elem.tag and 'id' in elem.attrib:
|
||||
ct = elem.get('content-type', '')
|
||||
fname = elem.attrib['id']
|
||||
ext = ct.rpartition('/')[-1].lower()
|
||||
if ext in ('png', 'jpeg', 'jpg'):
|
||||
if fname.lower().rpartition('.')[-1] not in {'jpg', 'jpeg',
|
||||
'png'}:
|
||||
fname += '.' + ext
|
||||
self.binary_map[elem.get('id')] = fname
|
||||
raw = elem.text.strip()
|
||||
try:
|
||||
data = base64_decode(raw)
|
||||
except TypeError:
|
||||
self.log.exception('Binary data with id=%s is corrupted, ignoring'%(
|
||||
elem.get('id')))
|
||||
else:
|
||||
with open(fname, 'wb') as f:
|
||||
f.write(data)
|
||||
203
ebook_converter/ebooks/conversion/plugins/fb2_output.py
Normal file
203
ebook_converter/ebooks/conversion/plugins/fb2_output.py
Normal file
@@ -0,0 +1,203 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os
|
||||
|
||||
from calibre.customize.conversion import OutputFormatPlugin, OptionRecommendation
|
||||
|
||||
|
||||
class FB2Output(OutputFormatPlugin):
|
||||
|
||||
name = 'FB2 Output'
|
||||
author = 'John Schember'
|
||||
file_type = 'fb2'
|
||||
commit_name = 'fb2_output'
|
||||
|
||||
FB2_GENRES = [
|
||||
# Science Fiction & Fantasy
|
||||
'sf_history', # Alternative history
|
||||
'sf_action', # Action
|
||||
'sf_epic', # Epic
|
||||
'sf_heroic', # Heroic
|
||||
'sf_detective', # Detective
|
||||
'sf_cyberpunk', # Cyberpunk
|
||||
'sf_space', # Space
|
||||
'sf_social', # Social#philosophical
|
||||
'sf_horror', # Horror & mystic
|
||||
'sf_humor', # Humor
|
||||
'sf_fantasy', # Fantasy
|
||||
'sf', # Science Fiction
|
||||
# Detectives & Thrillers
|
||||
'det_classic', # Classical detectives
|
||||
'det_police', # Police Stories
|
||||
'det_action', # Action
|
||||
'det_irony', # Ironical detectives
|
||||
'det_history', # Historical detectives
|
||||
'det_espionage', # Espionage detectives
|
||||
'det_crime', # Crime detectives
|
||||
'det_political', # Political detectives
|
||||
'det_maniac', # Maniacs
|
||||
'det_hard', # Hard#boiled
|
||||
'thriller', # Thrillers
|
||||
'detective', # Detectives
|
||||
# Prose
|
||||
'prose_classic', # Classics prose
|
||||
'prose_history', # Historical prose
|
||||
'prose_contemporary', # Contemporary prose
|
||||
'prose_counter', # Counterculture
|
||||
'prose_rus_classic', # Russial classics prose
|
||||
'prose_su_classics', # Soviet classics prose
|
||||
# Romance
|
||||
'love_contemporary', # Contemporary Romance
|
||||
'love_history', # Historical Romance
|
||||
'love_detective', # Detective Romance
|
||||
'love_short', # Short Romance
|
||||
'love_erotica', # Erotica
|
||||
# Adventure
|
||||
'adv_western', # Western
|
||||
'adv_history', # History
|
||||
'adv_indian', # Indians
|
||||
'adv_maritime', # Maritime Fiction
|
||||
'adv_geo', # Travel & geography
|
||||
'adv_animal', # Nature & animals
|
||||
'adventure', # Other
|
||||
# Children's
|
||||
'child_tale', # Fairy Tales
|
||||
'child_verse', # Verses
|
||||
'child_prose', # Prose
|
||||
'child_sf', # Science Fiction
|
||||
'child_det', # Detectives & Thrillers
|
||||
'child_adv', # Adventures
|
||||
'child_education', # Educational
|
||||
'children', # Other
|
||||
# Poetry & Dramaturgy
|
||||
'poetry', # Poetry
|
||||
'dramaturgy', # Dramaturgy
|
||||
# Antique literature
|
||||
'antique_ant', # Antique
|
||||
'antique_european', # European
|
||||
'antique_russian', # Old russian
|
||||
'antique_east', # Old east
|
||||
'antique_myths', # Myths. Legends. Epos
|
||||
'antique', # Other
|
||||
# Scientific#educational
|
||||
'sci_history', # History
|
||||
'sci_psychology', # Psychology
|
||||
'sci_culture', # Cultural science
|
||||
'sci_religion', # Religious studies
|
||||
'sci_philosophy', # Philosophy
|
||||
'sci_politics', # Politics
|
||||
'sci_business', # Business literature
|
||||
'sci_juris', # Jurisprudence
|
||||
'sci_linguistic', # Linguistics
|
||||
'sci_medicine', # Medicine
|
||||
'sci_phys', # Physics
|
||||
'sci_math', # Mathematics
|
||||
'sci_chem', # Chemistry
|
||||
'sci_biology', # Biology
|
||||
'sci_tech', # Technical
|
||||
'science', # Other
|
||||
# Computers & Internet
|
||||
'comp_www', # Internet
|
||||
'comp_programming', # Programming
|
||||
'comp_hard', # Hardware
|
||||
'comp_soft', # Software
|
||||
'comp_db', # Databases
|
||||
'comp_osnet', # OS & Networking
|
||||
'computers', # Other
|
||||
# Reference
|
||||
'ref_encyc', # Encyclopedias
|
||||
'ref_dict', # Dictionaries
|
||||
'ref_ref', # Reference
|
||||
'ref_guide', # Guidebooks
|
||||
'reference', # Other
|
||||
# Nonfiction
|
||||
'nonf_biography', # Biography & Memoirs
|
||||
'nonf_publicism', # Publicism
|
||||
'nonf_criticism', # Criticism
|
||||
'design', # Art & design
|
||||
'nonfiction', # Other
|
||||
# Religion & Inspiration
|
||||
'religion_rel', # Religion
|
||||
'religion_esoterics', # Esoterics
|
||||
'religion_self', # Self#improvement
|
||||
'religion', # Other
|
||||
# Humor
|
||||
'humor_anecdote', # Anecdote (funny stories)
|
||||
'humor_prose', # Prose
|
||||
'humor_verse', # Verses
|
||||
'humor', # Other
|
||||
# Home & Family
|
||||
'home_cooking', # Cooking
|
||||
'home_pets', # Pets
|
||||
'home_crafts', # Hobbies & Crafts
|
||||
'home_entertain', # Entertaining
|
||||
'home_health', # Health
|
||||
'home_garden', # Garden
|
||||
'home_diy', # Do it yourself
|
||||
'home_sport', # Sports
|
||||
'home_sex', # Erotica & sex
|
||||
'home', # Other
|
||||
]
|
||||
ui_data = {
|
||||
'sectionize': {
|
||||
'toc': _('Section per entry in the ToC'),
|
||||
'files': _('Section per file'),
|
||||
'nothing': _('A single section')
|
||||
},
|
||||
'genres': FB2_GENRES,
|
||||
}
|
||||
|
||||
options = {
|
||||
OptionRecommendation(name='sectionize',
|
||||
recommended_value='files', level=OptionRecommendation.LOW,
|
||||
choices=list(ui_data['sectionize']),
|
||||
help=_('Specify how sections are created:\n'
|
||||
' * nothing: {nothing}\n'
|
||||
' * files: {files}\n'
|
||||
' * toc: {toc}\n'
|
||||
'If ToC based generation fails, adjust the "Structure detection" and/or "Table of Contents" settings '
|
||||
'(turn on "Force use of auto-generated Table of Contents").').format(**ui_data['sectionize'])
|
||||
),
|
||||
OptionRecommendation(name='fb2_genre',
|
||||
recommended_value='antique', level=OptionRecommendation.LOW,
|
||||
choices=FB2_GENRES,
|
||||
help=(_('Genre for the book. Choices: %s\n\n See: ') % ', '.join(FB2_GENRES)
|
||||
) + 'http://www.fictionbook.org/index.php/Eng:FictionBook_2.1_genres ' + _('for a complete list with descriptions.')),
|
||||
}
|
||||
|
||||
def convert(self, oeb_book, output_path, input_plugin, opts, log):
|
||||
from calibre.ebooks.oeb.transforms.jacket import linearize_jacket
|
||||
from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer, Unavailable
|
||||
from calibre.ebooks.fb2.fb2ml import FB2MLizer
|
||||
|
||||
try:
|
||||
rasterizer = SVGRasterizer()
|
||||
rasterizer(oeb_book, opts)
|
||||
except Unavailable:
|
||||
log.warn('SVG rasterizer unavailable, SVG will not be converted')
|
||||
|
||||
linearize_jacket(oeb_book)
|
||||
|
||||
fb2mlizer = FB2MLizer(log)
|
||||
fb2_content = fb2mlizer.extract_content(oeb_book, opts)
|
||||
|
||||
close = False
|
||||
if not hasattr(output_path, 'write'):
|
||||
close = True
|
||||
if not os.path.exists(os.path.dirname(output_path)) and os.path.dirname(output_path) != '':
|
||||
os.makedirs(os.path.dirname(output_path))
|
||||
out_stream = lopen(output_path, 'wb')
|
||||
else:
|
||||
out_stream = output_path
|
||||
|
||||
out_stream.seek(0)
|
||||
out_stream.truncate()
|
||||
out_stream.write(fb2_content.encode('utf-8', 'replace'))
|
||||
|
||||
if close:
|
||||
out_stream.close()
|
||||
316
ebook_converter/ebooks/conversion/plugins/html_input.py
Normal file
316
ebook_converter/ebooks/conversion/plugins/html_input.py
Normal file
@@ -0,0 +1,316 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import re, tempfile, os
|
||||
from functools import partial
|
||||
|
||||
from calibre.constants import islinux, isbsd
|
||||
from calibre.customize.conversion import (InputFormatPlugin,
|
||||
OptionRecommendation)
|
||||
from calibre.utils.localization import get_lang
|
||||
from calibre.utils.filenames import ascii_filename
|
||||
from calibre.utils.imghdr import what
|
||||
from polyglot.builtins import unicode_type, zip, getcwd, as_unicode
|
||||
|
||||
|
||||
def sanitize_file_name(x):
|
||||
ans = re.sub(r'\s+', ' ', re.sub(r'[?&=;#]', '_', ascii_filename(x))).strip().rstrip('.')
|
||||
ans, ext = ans.rpartition('.')[::2]
|
||||
return (ans.strip() + '.' + ext.strip()).rstrip('.')
|
||||
|
||||
|
||||
class HTMLInput(InputFormatPlugin):
|
||||
|
||||
name = 'HTML Input'
|
||||
author = 'Kovid Goyal'
|
||||
description = 'Convert HTML and OPF files to an OEB'
|
||||
file_types = {'opf', 'html', 'htm', 'xhtml', 'xhtm', 'shtm', 'shtml'}
|
||||
commit_name = 'html_input'
|
||||
|
||||
options = {
|
||||
OptionRecommendation(name='breadth_first',
|
||||
recommended_value=False, level=OptionRecommendation.LOW,
|
||||
help=_('Traverse links in HTML files breadth first. Normally, '
|
||||
'they are traversed depth first.'
|
||||
)
|
||||
),
|
||||
|
||||
OptionRecommendation(name='max_levels',
|
||||
recommended_value=5, level=OptionRecommendation.LOW,
|
||||
help=_('Maximum levels of recursion when following links in '
|
||||
'HTML files. Must be non-negative. 0 implies that no '
|
||||
'links in the root HTML file are followed. Default is '
|
||||
'%default.'
|
||||
)
|
||||
),
|
||||
|
||||
OptionRecommendation(name='dont_package',
|
||||
recommended_value=False, level=OptionRecommendation.LOW,
|
||||
help=_('Normally this input plugin re-arranges all the input '
|
||||
'files into a standard folder hierarchy. Only use this option '
|
||||
'if you know what you are doing as it can result in various '
|
||||
'nasty side effects in the rest of the conversion pipeline.'
|
||||
)
|
||||
),
|
||||
|
||||
}
|
||||
|
||||
def convert(self, stream, opts, file_ext, log,
|
||||
accelerators):
|
||||
self._is_case_sensitive = None
|
||||
basedir = getcwd()
|
||||
self.opts = opts
|
||||
|
||||
fname = None
|
||||
if hasattr(stream, 'name'):
|
||||
basedir = os.path.dirname(stream.name)
|
||||
fname = os.path.basename(stream.name)
|
||||
|
||||
if file_ext != 'opf':
|
||||
if opts.dont_package:
|
||||
raise ValueError('The --dont-package option is not supported for an HTML input file')
|
||||
from calibre.ebooks.metadata.html import get_metadata
|
||||
mi = get_metadata(stream)
|
||||
if fname:
|
||||
from calibre.ebooks.metadata.meta import metadata_from_filename
|
||||
fmi = metadata_from_filename(fname)
|
||||
fmi.smart_update(mi)
|
||||
mi = fmi
|
||||
oeb = self.create_oebbook(stream.name, basedir, opts, log, mi)
|
||||
return oeb
|
||||
|
||||
from calibre.ebooks.conversion.plumber import create_oebbook
|
||||
return create_oebbook(log, stream.name, opts,
|
||||
encoding=opts.input_encoding)
|
||||
|
||||
def is_case_sensitive(self, path):
|
||||
if getattr(self, '_is_case_sensitive', None) is not None:
|
||||
return self._is_case_sensitive
|
||||
if not path or not os.path.exists(path):
|
||||
return islinux or isbsd
|
||||
self._is_case_sensitive = not (os.path.exists(path.lower()) and os.path.exists(path.upper()))
|
||||
return self._is_case_sensitive
|
||||
|
||||
def create_oebbook(self, htmlpath, basedir, opts, log, mi):
|
||||
import uuid
|
||||
from calibre.ebooks.conversion.plumber import create_oebbook
|
||||
from calibre.ebooks.oeb.base import (DirContainer,
|
||||
rewrite_links, urlnormalize, urldefrag, BINARY_MIME, OEB_STYLES,
|
||||
xpath, urlquote)
|
||||
from calibre import guess_type
|
||||
from calibre.ebooks.oeb.transforms.metadata import \
|
||||
meta_info_to_oeb_metadata
|
||||
from calibre.ebooks.html.input import get_filelist
|
||||
from calibre.ebooks.metadata import string_to_authors
|
||||
from calibre.utils.localization import canonicalize_lang
|
||||
import css_parser, logging
|
||||
css_parser.log.setLevel(logging.WARN)
|
||||
self.OEB_STYLES = OEB_STYLES
|
||||
oeb = create_oebbook(log, None, opts, self,
|
||||
encoding=opts.input_encoding, populate=False)
|
||||
self.oeb = oeb
|
||||
|
||||
metadata = oeb.metadata
|
||||
meta_info_to_oeb_metadata(mi, metadata, log)
|
||||
if not metadata.language:
|
||||
l = canonicalize_lang(getattr(opts, 'language', None))
|
||||
if not l:
|
||||
oeb.logger.warn('Language not specified')
|
||||
l = get_lang().replace('_', '-')
|
||||
metadata.add('language', l)
|
||||
if not metadata.creator:
|
||||
a = getattr(opts, 'authors', None)
|
||||
if a:
|
||||
a = string_to_authors(a)
|
||||
if not a:
|
||||
oeb.logger.warn('Creator not specified')
|
||||
a = [self.oeb.translate(__('Unknown'))]
|
||||
for aut in a:
|
||||
metadata.add('creator', aut)
|
||||
if not metadata.title:
|
||||
oeb.logger.warn('Title not specified')
|
||||
metadata.add('title', self.oeb.translate(__('Unknown')))
|
||||
bookid = unicode_type(uuid.uuid4())
|
||||
metadata.add('identifier', bookid, id='uuid_id', scheme='uuid')
|
||||
for ident in metadata.identifier:
|
||||
if 'id' in ident.attrib:
|
||||
self.oeb.uid = metadata.identifier[0]
|
||||
break
|
||||
|
||||
filelist = get_filelist(htmlpath, basedir, opts, log)
|
||||
filelist = [f for f in filelist if not f.is_binary]
|
||||
htmlfile_map = {}
|
||||
for f in filelist:
|
||||
path = f.path
|
||||
oeb.container = DirContainer(os.path.dirname(path), log,
|
||||
ignore_opf=True)
|
||||
bname = os.path.basename(path)
|
||||
id, href = oeb.manifest.generate(id='html', href=sanitize_file_name(bname))
|
||||
htmlfile_map[path] = href
|
||||
item = oeb.manifest.add(id, href, 'text/html')
|
||||
if path == htmlpath and '%' in path:
|
||||
bname = urlquote(bname)
|
||||
item.html_input_href = bname
|
||||
oeb.spine.add(item, True)
|
||||
|
||||
self.added_resources = {}
|
||||
self.log = log
|
||||
self.log('Normalizing filename cases')
|
||||
for path, href in htmlfile_map.items():
|
||||
if not self.is_case_sensitive(path):
|
||||
path = path.lower()
|
||||
self.added_resources[path] = href
|
||||
self.urlnormalize, self.DirContainer = urlnormalize, DirContainer
|
||||
self.urldefrag = urldefrag
|
||||
self.guess_type, self.BINARY_MIME = guess_type, BINARY_MIME
|
||||
|
||||
self.log('Rewriting HTML links')
|
||||
for f in filelist:
|
||||
path = f.path
|
||||
dpath = os.path.dirname(path)
|
||||
oeb.container = DirContainer(dpath, log, ignore_opf=True)
|
||||
href = htmlfile_map[path]
|
||||
try:
|
||||
item = oeb.manifest.hrefs[href]
|
||||
except KeyError:
|
||||
item = oeb.manifest.hrefs[urlnormalize(href)]
|
||||
rewrite_links(item.data, partial(self.resource_adder, base=dpath))
|
||||
|
||||
for item in oeb.manifest.values():
|
||||
if item.media_type in self.OEB_STYLES:
|
||||
dpath = None
|
||||
for path, href in self.added_resources.items():
|
||||
if href == item.href:
|
||||
dpath = os.path.dirname(path)
|
||||
break
|
||||
css_parser.replaceUrls(item.data,
|
||||
partial(self.resource_adder, base=dpath))
|
||||
|
||||
toc = self.oeb.toc
|
||||
self.oeb.auto_generated_toc = True
|
||||
titles = []
|
||||
headers = []
|
||||
for item in self.oeb.spine:
|
||||
if not item.linear:
|
||||
continue
|
||||
html = item.data
|
||||
title = ''.join(xpath(html, '/h:html/h:head/h:title/text()'))
|
||||
title = re.sub(r'\s+', ' ', title.strip())
|
||||
if title:
|
||||
titles.append(title)
|
||||
headers.append('(unlabled)')
|
||||
for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'):
|
||||
expr = '/h:html/h:body//h:%s[position()=1]/text()'
|
||||
header = ''.join(xpath(html, expr % tag))
|
||||
header = re.sub(r'\s+', ' ', header.strip())
|
||||
if header:
|
||||
headers[-1] = header
|
||||
break
|
||||
use = titles
|
||||
if len(titles) > len(set(titles)):
|
||||
use = headers
|
||||
for title, item in zip(use, self.oeb.spine):
|
||||
if not item.linear:
|
||||
continue
|
||||
toc.add(title, item.href)
|
||||
|
||||
oeb.container = DirContainer(getcwd(), oeb.log, ignore_opf=True)
|
||||
return oeb
|
||||
|
||||
def link_to_local_path(self, link_, base=None):
|
||||
from calibre.ebooks.html.input import Link
|
||||
if not isinstance(link_, unicode_type):
|
||||
try:
|
||||
link_ = link_.decode('utf-8', 'error')
|
||||
except:
|
||||
self.log.warn('Failed to decode link %r. Ignoring'%link_)
|
||||
return None, None
|
||||
try:
|
||||
l = Link(link_, base if base else getcwd())
|
||||
except:
|
||||
self.log.exception('Failed to process link: %r'%link_)
|
||||
return None, None
|
||||
if l.path is None:
|
||||
# Not a local resource
|
||||
return None, None
|
||||
link = l.path.replace('/', os.sep).strip()
|
||||
frag = l.fragment
|
||||
if not link:
|
||||
return None, None
|
||||
return link, frag
|
||||
|
||||
def resource_adder(self, link_, base=None):
|
||||
from polyglot.urllib import quote
|
||||
link, frag = self.link_to_local_path(link_, base=base)
|
||||
if link is None:
|
||||
return link_
|
||||
try:
|
||||
if base and not os.path.isabs(link):
|
||||
link = os.path.join(base, link)
|
||||
link = os.path.abspath(link)
|
||||
except:
|
||||
return link_
|
||||
if not os.access(link, os.R_OK):
|
||||
return link_
|
||||
if os.path.isdir(link):
|
||||
self.log.warn(link_, 'is a link to a directory. Ignoring.')
|
||||
return link_
|
||||
if not self.is_case_sensitive(tempfile.gettempdir()):
|
||||
link = link.lower()
|
||||
if link not in self.added_resources:
|
||||
bhref = os.path.basename(link)
|
||||
id, href = self.oeb.manifest.generate(id='added', href=sanitize_file_name(bhref))
|
||||
guessed = self.guess_type(href)[0]
|
||||
media_type = guessed or self.BINARY_MIME
|
||||
if media_type == 'text/plain':
|
||||
self.log.warn('Ignoring link to text file %r'%link_)
|
||||
return None
|
||||
if media_type == self.BINARY_MIME:
|
||||
# Check for the common case, images
|
||||
try:
|
||||
img = what(link)
|
||||
except EnvironmentError:
|
||||
pass
|
||||
else:
|
||||
if img:
|
||||
media_type = self.guess_type('dummy.'+img)[0] or self.BINARY_MIME
|
||||
|
||||
self.oeb.log.debug('Added', link)
|
||||
self.oeb.container = self.DirContainer(os.path.dirname(link),
|
||||
self.oeb.log, ignore_opf=True)
|
||||
# Load into memory
|
||||
item = self.oeb.manifest.add(id, href, media_type)
|
||||
# bhref refers to an already existing file. The read() method of
|
||||
# DirContainer will call unquote on it before trying to read the
|
||||
# file, therefore we quote it here.
|
||||
if isinstance(bhref, unicode_type):
|
||||
bhref = bhref.encode('utf-8')
|
||||
item.html_input_href = as_unicode(quote(bhref))
|
||||
if guessed in self.OEB_STYLES:
|
||||
item.override_css_fetch = partial(
|
||||
self.css_import_handler, os.path.dirname(link))
|
||||
item.data
|
||||
self.added_resources[link] = href
|
||||
|
||||
nlink = self.added_resources[link]
|
||||
if frag:
|
||||
nlink = '#'.join((nlink, frag))
|
||||
return nlink
|
||||
|
||||
def css_import_handler(self, base, href):
|
||||
link, frag = self.link_to_local_path(href, base=base)
|
||||
if link is None or not os.access(link, os.R_OK) or os.path.isdir(link):
|
||||
return (None, None)
|
||||
try:
|
||||
with open(link, 'rb') as f:
|
||||
raw = f.read().decode('utf-8', 'replace')
|
||||
raw = self.oeb.css_preprocessor(raw, add_namespace=False)
|
||||
except:
|
||||
self.log.exception('Failed to read CSS file: %r'%link)
|
||||
return (None, None)
|
||||
return (None, raw)
|
||||
226
ebook_converter/ebooks/conversion/plugins/html_output.py
Normal file
226
ebook_converter/ebooks/conversion/plugins/html_output.py
Normal file
@@ -0,0 +1,226 @@
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2010, Fabian Grassl <fg@jusmeum.de>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os, re, shutil
|
||||
from os.path import dirname, abspath, relpath as _relpath, exists, basename
|
||||
|
||||
from calibre.customize.conversion import OutputFormatPlugin, OptionRecommendation
|
||||
from calibre import CurrentDir
|
||||
from calibre.ptempfile import PersistentTemporaryDirectory
|
||||
from polyglot.builtins import unicode_type
|
||||
|
||||
|
||||
def relpath(*args):
|
||||
return _relpath(*args).replace(os.sep, '/')
|
||||
|
||||
|
||||
class HTMLOutput(OutputFormatPlugin):
|
||||
|
||||
name = 'HTML Output'
|
||||
author = 'Fabian Grassl'
|
||||
file_type = 'zip'
|
||||
commit_name = 'html_output'
|
||||
|
||||
options = {
|
||||
OptionRecommendation(name='template_css',
|
||||
help=_('CSS file used for the output instead of the default file')),
|
||||
|
||||
OptionRecommendation(name='template_html_index',
|
||||
help=_('Template used for generation of the HTML index file instead of the default file')),
|
||||
|
||||
OptionRecommendation(name='template_html',
|
||||
help=_('Template used for the generation of the HTML contents of the book instead of the default file')),
|
||||
|
||||
OptionRecommendation(name='extract_to',
|
||||
help=_('Extract the contents of the generated ZIP file to the '
|
||||
'specified directory. WARNING: The contents of the directory '
|
||||
'will be deleted.')
|
||||
),
|
||||
}
|
||||
|
||||
recommendations = {('pretty_print', True, OptionRecommendation.HIGH)}
|
||||
|
||||
def generate_toc(self, oeb_book, ref_url, output_dir):
|
||||
'''
|
||||
Generate table of contents
|
||||
'''
|
||||
from lxml import etree
|
||||
from polyglot.urllib import unquote
|
||||
|
||||
from calibre.ebooks.oeb.base import element
|
||||
from calibre.utils.cleantext import clean_xml_chars
|
||||
with CurrentDir(output_dir):
|
||||
def build_node(current_node, parent=None):
|
||||
if parent is None:
|
||||
parent = etree.Element('ul')
|
||||
elif len(current_node.nodes):
|
||||
parent = element(parent, ('ul'))
|
||||
for node in current_node.nodes:
|
||||
point = element(parent, 'li')
|
||||
href = relpath(abspath(unquote(node.href)), dirname(ref_url))
|
||||
if isinstance(href, bytes):
|
||||
href = href.decode('utf-8')
|
||||
link = element(point, 'a', href=clean_xml_chars(href))
|
||||
title = node.title
|
||||
if isinstance(title, bytes):
|
||||
title = title.decode('utf-8')
|
||||
if title:
|
||||
title = re.sub(r'\s+', ' ', title)
|
||||
link.text = clean_xml_chars(title)
|
||||
build_node(node, point)
|
||||
return parent
|
||||
wrap = etree.Element('div')
|
||||
wrap.append(build_node(oeb_book.toc))
|
||||
return wrap
|
||||
|
||||
def generate_html_toc(self, oeb_book, ref_url, output_dir):
|
||||
from lxml import etree
|
||||
|
||||
root = self.generate_toc(oeb_book, ref_url, output_dir)
|
||||
return etree.tostring(root, pretty_print=True, encoding='unicode',
|
||||
xml_declaration=False)
|
||||
|
||||
def convert(self, oeb_book, output_path, input_plugin, opts, log):
|
||||
from lxml import etree
|
||||
from calibre.utils import zipfile
|
||||
from templite import Templite
|
||||
from polyglot.urllib import unquote
|
||||
from calibre.ebooks.html.meta import EasyMeta
|
||||
|
||||
# read template files
|
||||
if opts.template_html_index is not None:
|
||||
with open(opts.template_html_index, 'rb') as f:
|
||||
template_html_index_data = f.read()
|
||||
else:
|
||||
template_html_index_data = P('templates/html_export_default_index.tmpl', data=True)
|
||||
|
||||
if opts.template_html is not None:
|
||||
with open(opts.template_html, 'rb') as f:
|
||||
template_html_data = f.read()
|
||||
else:
|
||||
template_html_data = P('templates/html_export_default.tmpl', data=True)
|
||||
|
||||
if opts.template_css is not None:
|
||||
with open(opts.template_css, 'rb') as f:
|
||||
template_css_data = f.read()
|
||||
else:
|
||||
template_css_data = P('templates/html_export_default.css', data=True)
|
||||
|
||||
template_html_index_data = template_html_index_data.decode('utf-8')
|
||||
template_html_data = template_html_data.decode('utf-8')
|
||||
template_css_data = template_css_data.decode('utf-8')
|
||||
|
||||
self.log = log
|
||||
self.opts = opts
|
||||
meta = EasyMeta(oeb_book.metadata)
|
||||
|
||||
tempdir = os.path.realpath(PersistentTemporaryDirectory())
|
||||
output_file = os.path.join(tempdir,
|
||||
basename(re.sub(r'\.zip', '', output_path)+'.html'))
|
||||
output_dir = re.sub(r'\.html', '', output_file)+'_files'
|
||||
|
||||
if not exists(output_dir):
|
||||
os.makedirs(output_dir)
|
||||
|
||||
css_path = output_dir+os.sep+'calibreHtmlOutBasicCss.css'
|
||||
with open(css_path, 'wb') as f:
|
||||
f.write(template_css_data.encode('utf-8'))
|
||||
|
||||
with open(output_file, 'wb') as f:
|
||||
html_toc = self.generate_html_toc(oeb_book, output_file, output_dir)
|
||||
templite = Templite(template_html_index_data)
|
||||
nextLink = oeb_book.spine[0].href
|
||||
nextLink = relpath(output_dir+os.sep+nextLink, dirname(output_file))
|
||||
cssLink = relpath(abspath(css_path), dirname(output_file))
|
||||
tocUrl = relpath(output_file, dirname(output_file))
|
||||
t = templite.render(has_toc=bool(oeb_book.toc.count()),
|
||||
toc=html_toc, meta=meta, nextLink=nextLink,
|
||||
tocUrl=tocUrl, cssLink=cssLink,
|
||||
firstContentPageLink=nextLink)
|
||||
if isinstance(t, unicode_type):
|
||||
t = t.encode('utf-8')
|
||||
f.write(t)
|
||||
|
||||
with CurrentDir(output_dir):
|
||||
for item in oeb_book.manifest:
|
||||
path = abspath(unquote(item.href))
|
||||
dir = dirname(path)
|
||||
if not exists(dir):
|
||||
os.makedirs(dir)
|
||||
if item.spine_position is not None:
|
||||
with open(path, 'wb') as f:
|
||||
pass
|
||||
else:
|
||||
with open(path, 'wb') as f:
|
||||
f.write(item.bytes_representation)
|
||||
item.unload_data_from_memory(memory=path)
|
||||
|
||||
for item in oeb_book.spine:
|
||||
path = abspath(unquote(item.href))
|
||||
dir = dirname(path)
|
||||
root = item.data.getroottree()
|
||||
|
||||
# get & clean HTML <HEAD>-data
|
||||
head = root.xpath('//h:head', namespaces={'h': 'http://www.w3.org/1999/xhtml'})[0]
|
||||
head_content = etree.tostring(head, pretty_print=True, encoding='unicode')
|
||||
head_content = re.sub(r'\<\/?head.*\>', '', head_content)
|
||||
head_content = re.sub(re.compile(r'\<style.*\/style\>', re.M|re.S), '', head_content)
|
||||
head_content = re.sub(r'<(title)([^>]*)/>', r'<\1\2></\1>', head_content)
|
||||
|
||||
# get & clean HTML <BODY>-data
|
||||
body = root.xpath('//h:body', namespaces={'h': 'http://www.w3.org/1999/xhtml'})[0]
|
||||
ebook_content = etree.tostring(body, pretty_print=True, encoding='unicode')
|
||||
ebook_content = re.sub(r'\<\/?body.*\>', '', ebook_content)
|
||||
ebook_content = re.sub(r'<(div|a|span)([^>]*)/>', r'<\1\2></\1>', ebook_content)
|
||||
|
||||
# generate link to next page
|
||||
if item.spine_position+1 < len(oeb_book.spine):
|
||||
nextLink = oeb_book.spine[item.spine_position+1].href
|
||||
nextLink = relpath(abspath(nextLink), dir)
|
||||
else:
|
||||
nextLink = None
|
||||
|
||||
# generate link to previous page
|
||||
if item.spine_position > 0:
|
||||
prevLink = oeb_book.spine[item.spine_position-1].href
|
||||
prevLink = relpath(abspath(prevLink), dir)
|
||||
else:
|
||||
prevLink = None
|
||||
|
||||
cssLink = relpath(abspath(css_path), dir)
|
||||
tocUrl = relpath(output_file, dir)
|
||||
firstContentPageLink = oeb_book.spine[0].href
|
||||
|
||||
# render template
|
||||
templite = Templite(template_html_data)
|
||||
toc = lambda: self.generate_html_toc(oeb_book, path, output_dir)
|
||||
t = templite.render(ebookContent=ebook_content,
|
||||
prevLink=prevLink, nextLink=nextLink,
|
||||
has_toc=bool(oeb_book.toc.count()), toc=toc,
|
||||
tocUrl=tocUrl, head_content=head_content,
|
||||
meta=meta, cssLink=cssLink,
|
||||
firstContentPageLink=firstContentPageLink)
|
||||
|
||||
# write html to file
|
||||
with open(path, 'wb') as f:
|
||||
f.write(t.encode('utf-8'))
|
||||
item.unload_data_from_memory(memory=path)
|
||||
|
||||
zfile = zipfile.ZipFile(output_path, "w")
|
||||
zfile.add_dir(output_dir, basename(output_dir))
|
||||
zfile.write(output_file, basename(output_file), zipfile.ZIP_DEFLATED)
|
||||
|
||||
if opts.extract_to:
|
||||
if os.path.exists(opts.extract_to):
|
||||
shutil.rmtree(opts.extract_to)
|
||||
os.makedirs(opts.extract_to)
|
||||
zfile.extractall(opts.extract_to)
|
||||
self.log('Zip file extracted to', opts.extract_to)
|
||||
|
||||
zfile.close()
|
||||
|
||||
# cleanup temp dir
|
||||
shutil.rmtree(tempdir)
|
||||
133
ebook_converter/ebooks/conversion/plugins/htmlz_input.py
Normal file
133
ebook_converter/ebooks/conversion/plugins/htmlz_input.py
Normal file
@@ -0,0 +1,133 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os
|
||||
|
||||
from calibre import guess_type
|
||||
from calibre.customize.conversion import InputFormatPlugin
|
||||
from polyglot.builtins import getcwd
|
||||
|
||||
|
||||
class HTMLZInput(InputFormatPlugin):
|
||||
|
||||
name = 'HTLZ Input'
|
||||
author = 'John Schember'
|
||||
description = 'Convert HTML files to HTML'
|
||||
file_types = {'htmlz'}
|
||||
commit_name = 'htmlz_input'
|
||||
|
||||
def convert(self, stream, options, file_ext, log,
|
||||
accelerators):
|
||||
from calibre.ebooks.chardet import xml_to_unicode
|
||||
from calibre.ebooks.metadata.opf2 import OPF
|
||||
from calibre.utils.zipfile import ZipFile
|
||||
|
||||
self.log = log
|
||||
html = u''
|
||||
top_levels = []
|
||||
|
||||
# Extract content from zip archive.
|
||||
zf = ZipFile(stream)
|
||||
zf.extractall()
|
||||
|
||||
# Find the HTML file in the archive. It needs to be
|
||||
# top level.
|
||||
index = u''
|
||||
multiple_html = False
|
||||
# Get a list of all top level files in the archive.
|
||||
for x in os.listdir(u'.'):
|
||||
if os.path.isfile(x):
|
||||
top_levels.append(x)
|
||||
# Try to find an index. file.
|
||||
for x in top_levels:
|
||||
if x.lower() in (u'index.html', u'index.xhtml', u'index.htm'):
|
||||
index = x
|
||||
break
|
||||
# Look for multiple HTML files in the archive. We look at the
|
||||
# top level files only as only they matter in HTMLZ.
|
||||
for x in top_levels:
|
||||
if os.path.splitext(x)[1].lower() in (u'.html', u'.xhtml', u'.htm'):
|
||||
# Set index to the first HTML file found if it's not
|
||||
# called index.
|
||||
if not index:
|
||||
index = x
|
||||
else:
|
||||
multiple_html = True
|
||||
# Warn the user if there multiple HTML file in the archive. HTMLZ
|
||||
# supports a single HTML file. A conversion with a multiple HTML file
|
||||
# HTMLZ archive probably won't turn out as the user expects. With
|
||||
# Multiple HTML files ZIP input should be used in place of HTMLZ.
|
||||
if multiple_html:
|
||||
log.warn(_('Multiple HTML files found in the archive. Only %s will be used.') % index)
|
||||
|
||||
if index:
|
||||
with open(index, 'rb') as tf:
|
||||
html = tf.read()
|
||||
else:
|
||||
raise Exception(_('No top level HTML file found.'))
|
||||
|
||||
if not html:
|
||||
raise Exception(_('Top level HTML file %s is empty') % index)
|
||||
|
||||
# Encoding
|
||||
if options.input_encoding:
|
||||
ienc = options.input_encoding
|
||||
else:
|
||||
ienc = xml_to_unicode(html[:4096])[-1]
|
||||
html = html.decode(ienc, 'replace')
|
||||
|
||||
# Run the HTML through the html processing plugin.
|
||||
from calibre.customize.ui import plugin_for_input_format
|
||||
html_input = plugin_for_input_format('html')
|
||||
for opt in html_input.options:
|
||||
setattr(options, opt.option.name, opt.recommended_value)
|
||||
options.input_encoding = 'utf-8'
|
||||
base = getcwd()
|
||||
htmlfile = os.path.join(base, u'index.html')
|
||||
c = 0
|
||||
while os.path.exists(htmlfile):
|
||||
c += 1
|
||||
htmlfile = u'index%d.html'%c
|
||||
with open(htmlfile, 'wb') as f:
|
||||
f.write(html.encode('utf-8'))
|
||||
odi = options.debug_pipeline
|
||||
options.debug_pipeline = None
|
||||
# Generate oeb from html conversion.
|
||||
with open(htmlfile, 'rb') as f:
|
||||
oeb = html_input.convert(f, options, 'html', log,
|
||||
{})
|
||||
options.debug_pipeline = odi
|
||||
os.remove(htmlfile)
|
||||
|
||||
# Set metadata from file.
|
||||
from calibre.customize.ui import get_file_type_metadata
|
||||
from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
|
||||
mi = get_file_type_metadata(stream, file_ext)
|
||||
meta_info_to_oeb_metadata(mi, oeb.metadata, log)
|
||||
|
||||
# Get the cover path from the OPF.
|
||||
cover_path = None
|
||||
opf = None
|
||||
for x in top_levels:
|
||||
if os.path.splitext(x)[1].lower() == u'.opf':
|
||||
opf = x
|
||||
break
|
||||
if opf:
|
||||
opf = OPF(opf, basedir=getcwd())
|
||||
cover_path = opf.raster_cover or opf.cover
|
||||
# Set the cover.
|
||||
if cover_path:
|
||||
cdata = None
|
||||
with open(os.path.join(getcwd(), cover_path), 'rb') as cf:
|
||||
cdata = cf.read()
|
||||
cover_name = os.path.basename(cover_path)
|
||||
id, href = oeb.manifest.generate('cover', cover_name)
|
||||
oeb.manifest.add(id, href, guess_type(cover_name)[0], data=cdata)
|
||||
oeb.guide.add('cover', 'Cover', href)
|
||||
|
||||
return oeb
|
||||
136
ebook_converter/ebooks/conversion/plugins/htmlz_output.py
Normal file
136
ebook_converter/ebooks/conversion/plugins/htmlz_output.py
Normal file
@@ -0,0 +1,136 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import io
|
||||
import os
|
||||
|
||||
from calibre.customize.conversion import OutputFormatPlugin, \
|
||||
OptionRecommendation
|
||||
from calibre.ptempfile import TemporaryDirectory
|
||||
from polyglot.builtins import unicode_type
|
||||
|
||||
|
||||
class HTMLZOutput(OutputFormatPlugin):
|
||||
|
||||
name = 'HTMLZ Output'
|
||||
author = 'John Schember'
|
||||
file_type = 'htmlz'
|
||||
commit_name = 'htmlz_output'
|
||||
ui_data = {
|
||||
'css_choices': {
|
||||
'class': _('Use CSS classes'),
|
||||
'inline': _('Use the style attribute'),
|
||||
'tag': _('Use HTML tags wherever possible')
|
||||
},
|
||||
'sheet_choices': {
|
||||
'external': _('Use an external CSS file'),
|
||||
'inline': _('Use a <style> tag in the HTML file')
|
||||
}
|
||||
}
|
||||
|
||||
options = {
|
||||
OptionRecommendation(name='htmlz_css_type', recommended_value='class',
|
||||
level=OptionRecommendation.LOW,
|
||||
choices=list(ui_data['css_choices']),
|
||||
help=_('Specify the handling of CSS. Default is class.\n'
|
||||
'class: {class}\n'
|
||||
'inline: {inline}\n'
|
||||
'tag: {tag}'
|
||||
).format(**ui_data['css_choices'])),
|
||||
OptionRecommendation(name='htmlz_class_style', recommended_value='external',
|
||||
level=OptionRecommendation.LOW,
|
||||
choices=list(ui_data['sheet_choices']),
|
||||
help=_('How to handle the CSS when using css-type = \'class\'.\n'
|
||||
'Default is external.\n'
|
||||
'external: {external}\n'
|
||||
'inline: {inline}'
|
||||
).format(**ui_data['sheet_choices'])),
|
||||
OptionRecommendation(name='htmlz_title_filename',
|
||||
recommended_value=False, level=OptionRecommendation.LOW,
|
||||
help=_('If set this option causes the file name of the HTML file'
|
||||
' inside the HTMLZ archive to be based on the book title.')
|
||||
),
|
||||
}
|
||||
|
||||
def convert(self, oeb_book, output_path, input_plugin, opts, log):
|
||||
from lxml import etree
|
||||
from calibre.ebooks.oeb.base import OEB_IMAGES, SVG_MIME
|
||||
from calibre.ebooks.metadata.opf2 import OPF, metadata_to_opf
|
||||
from calibre.utils.zipfile import ZipFile
|
||||
from calibre.utils.filenames import ascii_filename
|
||||
|
||||
# HTML
|
||||
if opts.htmlz_css_type == 'inline':
|
||||
from calibre.ebooks.htmlz.oeb2html import OEB2HTMLInlineCSSizer
|
||||
OEB2HTMLizer = OEB2HTMLInlineCSSizer
|
||||
elif opts.htmlz_css_type == 'tag':
|
||||
from calibre.ebooks.htmlz.oeb2html import OEB2HTMLNoCSSizer
|
||||
OEB2HTMLizer = OEB2HTMLNoCSSizer
|
||||
else:
|
||||
from calibre.ebooks.htmlz.oeb2html import OEB2HTMLClassCSSizer as OEB2HTMLizer
|
||||
|
||||
with TemporaryDirectory(u'_htmlz_output') as tdir:
|
||||
htmlizer = OEB2HTMLizer(log)
|
||||
html = htmlizer.oeb2html(oeb_book, opts)
|
||||
|
||||
fname = u'index'
|
||||
if opts.htmlz_title_filename:
|
||||
from calibre.utils.filenames import shorten_components_to
|
||||
fname = shorten_components_to(100, (ascii_filename(unicode_type(oeb_book.metadata.title[0])),))[0]
|
||||
with open(os.path.join(tdir, fname+u'.html'), 'wb') as tf:
|
||||
if isinstance(html, unicode_type):
|
||||
html = html.encode('utf-8')
|
||||
tf.write(html)
|
||||
|
||||
# CSS
|
||||
if opts.htmlz_css_type == 'class' and opts.htmlz_class_style == 'external':
|
||||
with open(os.path.join(tdir, u'style.css'), 'wb') as tf:
|
||||
tf.write(htmlizer.get_css(oeb_book))
|
||||
|
||||
# Images
|
||||
images = htmlizer.images
|
||||
if images:
|
||||
if not os.path.exists(os.path.join(tdir, u'images')):
|
||||
os.makedirs(os.path.join(tdir, u'images'))
|
||||
for item in oeb_book.manifest:
|
||||
if item.media_type in OEB_IMAGES and item.href in images:
|
||||
if item.media_type == SVG_MIME:
|
||||
data = etree.tostring(item.data, encoding='unicode')
|
||||
else:
|
||||
data = item.data
|
||||
fname = os.path.join(tdir, u'images', images[item.href])
|
||||
with open(fname, 'wb') as img:
|
||||
img.write(data)
|
||||
|
||||
# Cover
|
||||
cover_path = None
|
||||
try:
|
||||
cover_data = None
|
||||
if oeb_book.metadata.cover:
|
||||
term = oeb_book.metadata.cover[0].term
|
||||
cover_data = oeb_book.guide[term].item.data
|
||||
if cover_data:
|
||||
from calibre.utils.img import save_cover_data_to
|
||||
cover_path = os.path.join(tdir, u'cover.jpg')
|
||||
with lopen(cover_path, 'w') as cf:
|
||||
cf.write('')
|
||||
save_cover_data_to(cover_data, cover_path)
|
||||
except:
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
|
||||
# Metadata
|
||||
with open(os.path.join(tdir, u'metadata.opf'), 'wb') as mdataf:
|
||||
opf = OPF(io.BytesIO(etree.tostring(oeb_book.metadata.to_opf1(), encoding='UTF-8')))
|
||||
mi = opf.to_book_metadata()
|
||||
if cover_path:
|
||||
mi.cover = u'cover.jpg'
|
||||
mdataf.write(metadata_to_opf(mi))
|
||||
|
||||
htmlz = ZipFile(output_path, 'w')
|
||||
htmlz.add_dir(tdir)
|
||||
64
ebook_converter/ebooks/conversion/plugins/lit_input.py
Normal file
64
ebook_converter/ebooks/conversion/plugins/lit_input.py
Normal file
@@ -0,0 +1,64 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin
|
||||
|
||||
|
||||
class LITInput(InputFormatPlugin):
|
||||
|
||||
name = 'LIT Input'
|
||||
author = 'Marshall T. Vandegrift'
|
||||
description = 'Convert LIT files to HTML'
|
||||
file_types = {'lit'}
|
||||
commit_name = 'lit_input'
|
||||
|
||||
def convert(self, stream, options, file_ext, log,
|
||||
accelerators):
|
||||
from calibre.ebooks.lit.reader import LitReader
|
||||
from calibre.ebooks.conversion.plumber import create_oebbook
|
||||
self.log = log
|
||||
return create_oebbook(log, stream, options, reader=LitReader)
|
||||
|
||||
def postprocess_book(self, oeb, opts, log):
|
||||
from calibre.ebooks.oeb.base import XHTML_NS, XPath, XHTML
|
||||
for item in oeb.spine:
|
||||
root = item.data
|
||||
if not hasattr(root, 'xpath'):
|
||||
continue
|
||||
for bad in ('metadata', 'guide'):
|
||||
metadata = XPath('//h:'+bad)(root)
|
||||
if metadata:
|
||||
for x in metadata:
|
||||
x.getparent().remove(x)
|
||||
body = XPath('//h:body')(root)
|
||||
if body:
|
||||
body = body[0]
|
||||
if len(body) == 1 and body[0].tag == XHTML('pre'):
|
||||
pre = body[0]
|
||||
from calibre.ebooks.txt.processor import convert_basic, \
|
||||
separate_paragraphs_single_line
|
||||
from calibre.ebooks.chardet import xml_to_unicode
|
||||
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||
import copy
|
||||
self.log('LIT file with all text in singe <pre> tag detected')
|
||||
html = separate_paragraphs_single_line(pre.text)
|
||||
html = convert_basic(html).replace('<html>',
|
||||
'<html xmlns="%s">'%XHTML_NS)
|
||||
html = xml_to_unicode(html, strip_encoding_pats=True,
|
||||
resolve_entities=True)[0]
|
||||
if opts.smarten_punctuation:
|
||||
# SmartyPants skips text inside <pre> tags
|
||||
from calibre.ebooks.conversion.preprocess import smarten_punctuation
|
||||
html = smarten_punctuation(html, self.log)
|
||||
root = safe_xml_fromstring(html)
|
||||
body = XPath('//h:body')(root)
|
||||
pre.tag = XHTML('div')
|
||||
pre.text = ''
|
||||
for elem in body:
|
||||
ne = copy.deepcopy(elem)
|
||||
pre.append(ne)
|
||||
38
ebook_converter/ebooks/conversion/plugins/lit_output.py
Normal file
38
ebook_converter/ebooks/conversion/plugins/lit_output.py
Normal file
@@ -0,0 +1,38 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
|
||||
from calibre.customize.conversion import OutputFormatPlugin
|
||||
|
||||
|
||||
class LITOutput(OutputFormatPlugin):
|
||||
|
||||
name = 'LIT Output'
|
||||
author = 'Marshall T. Vandegrift'
|
||||
file_type = 'lit'
|
||||
commit_name = 'lit_output'
|
||||
|
||||
def convert(self, oeb, output_path, input_plugin, opts, log):
|
||||
self.log, self.opts, self.oeb = log, opts, oeb
|
||||
from calibre.ebooks.oeb.transforms.manglecase import CaseMangler
|
||||
from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer
|
||||
from calibre.ebooks.oeb.transforms.htmltoc import HTMLTOCAdder
|
||||
from calibre.ebooks.lit.writer import LitWriter
|
||||
from calibre.ebooks.oeb.transforms.split import Split
|
||||
split = Split(split_on_page_breaks=True, max_flow_size=0,
|
||||
remove_css_pagebreaks=False)
|
||||
split(self.oeb, self.opts)
|
||||
|
||||
tocadder = HTMLTOCAdder()
|
||||
tocadder(oeb, opts)
|
||||
mangler = CaseMangler()
|
||||
mangler(oeb, opts)
|
||||
rasterizer = SVGRasterizer()
|
||||
rasterizer(oeb, opts)
|
||||
lit = LitWriter(self.opts)
|
||||
lit(oeb, output_path)
|
||||
82
ebook_converter/ebooks/conversion/plugins/lrf_input.py
Normal file
82
ebook_converter/ebooks/conversion/plugins/lrf_input.py
Normal file
@@ -0,0 +1,82 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os, sys
|
||||
from calibre.customize.conversion import InputFormatPlugin
|
||||
|
||||
|
||||
class LRFInput(InputFormatPlugin):
|
||||
|
||||
name = 'LRF Input'
|
||||
author = 'Kovid Goyal'
|
||||
description = 'Convert LRF files to HTML'
|
||||
file_types = {'lrf'}
|
||||
commit_name = 'lrf_input'
|
||||
|
||||
def convert(self, stream, options, file_ext, log,
|
||||
accelerators):
|
||||
from calibre.ebooks.lrf.input import (MediaType, Styles, TextBlock,
|
||||
Canvas, ImageBlock, RuledLine)
|
||||
self.log = log
|
||||
self.log('Generating XML')
|
||||
from calibre.ebooks.lrf.lrfparser import LRFDocument
|
||||
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||
from lxml import etree
|
||||
d = LRFDocument(stream)
|
||||
d.parse()
|
||||
xml = d.to_xml(write_files=True)
|
||||
if options.verbose > 2:
|
||||
open(u'lrs.xml', 'wb').write(xml.encode('utf-8'))
|
||||
doc = safe_xml_fromstring(xml)
|
||||
|
||||
char_button_map = {}
|
||||
for x in doc.xpath('//CharButton[@refobj]'):
|
||||
ro = x.get('refobj')
|
||||
jump_button = doc.xpath('//*[@objid="%s"]'%ro)
|
||||
if jump_button:
|
||||
jump_to = jump_button[0].xpath('descendant::JumpTo[@refpage and @refobj]')
|
||||
if jump_to:
|
||||
char_button_map[ro] = '%s.xhtml#%s'%(jump_to[0].get('refpage'),
|
||||
jump_to[0].get('refobj'))
|
||||
plot_map = {}
|
||||
for x in doc.xpath('//Plot[@refobj]'):
|
||||
ro = x.get('refobj')
|
||||
image = doc.xpath('//Image[@objid="%s" and @refstream]'%ro)
|
||||
if image:
|
||||
imgstr = doc.xpath('//ImageStream[@objid="%s" and @file]'%
|
||||
image[0].get('refstream'))
|
||||
if imgstr:
|
||||
plot_map[ro] = imgstr[0].get('file')
|
||||
|
||||
self.log('Converting XML to HTML...')
|
||||
styledoc = safe_xml_fromstring(P('templates/lrf.xsl', data=True))
|
||||
media_type = MediaType()
|
||||
styles = Styles()
|
||||
text_block = TextBlock(styles, char_button_map, plot_map, log)
|
||||
canvas = Canvas(doc, styles, text_block, log)
|
||||
image_block = ImageBlock(canvas)
|
||||
ruled_line = RuledLine()
|
||||
extensions = {
|
||||
('calibre', 'media-type') : media_type,
|
||||
('calibre', 'text-block') : text_block,
|
||||
('calibre', 'ruled-line') : ruled_line,
|
||||
('calibre', 'styles') : styles,
|
||||
('calibre', 'canvas') : canvas,
|
||||
('calibre', 'image-block'): image_block,
|
||||
}
|
||||
transform = etree.XSLT(styledoc, extensions=extensions)
|
||||
try:
|
||||
result = transform(doc)
|
||||
except RuntimeError:
|
||||
sys.setrecursionlimit(5000)
|
||||
result = transform(doc)
|
||||
|
||||
with open('content.opf', 'wb') as f:
|
||||
f.write(result)
|
||||
styles.write()
|
||||
return os.path.abspath('content.opf')
|
||||
196
ebook_converter/ebooks/conversion/plugins/lrf_output.py
Normal file
196
ebook_converter/ebooks/conversion/plugins/lrf_output.py
Normal file
@@ -0,0 +1,196 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import sys, os
|
||||
|
||||
from calibre.customize.conversion import OutputFormatPlugin
|
||||
from calibre.customize.conversion import OptionRecommendation
|
||||
from polyglot.builtins import unicode_type
|
||||
|
||||
|
||||
class LRFOptions(object):
|
||||
|
||||
def __init__(self, output, opts, oeb):
|
||||
def f2s(f):
|
||||
try:
|
||||
return unicode_type(f[0])
|
||||
except:
|
||||
return ''
|
||||
m = oeb.metadata
|
||||
for x in ('left', 'top', 'right', 'bottom'):
|
||||
attr = 'margin_'+x
|
||||
val = getattr(opts, attr)
|
||||
if val < 0:
|
||||
setattr(opts, attr, 0)
|
||||
self.title = None
|
||||
self.author = self.publisher = _('Unknown')
|
||||
self.title_sort = self.author_sort = ''
|
||||
for x in m.creator:
|
||||
if x.role == 'aut':
|
||||
self.author = unicode_type(x)
|
||||
fa = unicode_type(getattr(x, 'file_as', ''))
|
||||
if fa:
|
||||
self.author_sort = fa
|
||||
for x in m.title:
|
||||
if unicode_type(x.file_as):
|
||||
self.title_sort = unicode_type(x.file_as)
|
||||
self.freetext = f2s(m.description)
|
||||
self.category = f2s(m.subject)
|
||||
self.cover = None
|
||||
self.use_metadata_cover = True
|
||||
self.output = output
|
||||
self.ignore_tables = opts.linearize_tables
|
||||
if opts.disable_font_rescaling:
|
||||
self.base_font_size = 0
|
||||
else:
|
||||
self.base_font_size = opts.base_font_size
|
||||
self.blank_after_para = opts.insert_blank_line
|
||||
self.use_spine = True
|
||||
self.font_delta = 0
|
||||
self.ignore_colors = False
|
||||
from calibre.ebooks.lrf import PRS500_PROFILE
|
||||
self.profile = PRS500_PROFILE
|
||||
self.link_levels = sys.maxsize
|
||||
self.link_exclude = '@'
|
||||
self.no_links_in_toc = True
|
||||
self.disable_chapter_detection = True
|
||||
self.chapter_regex = 'dsadcdswcdec'
|
||||
self.chapter_attr = '$,,$'
|
||||
self.override_css = self._override_css = ''
|
||||
self.page_break = 'h[12]'
|
||||
self.force_page_break = '$'
|
||||
self.force_page_break_attr = '$'
|
||||
self.add_chapters_to_toc = False
|
||||
self.baen = self.pdftohtml = self.book_designer = False
|
||||
self.verbose = opts.verbose
|
||||
self.encoding = 'utf-8'
|
||||
self.lrs = False
|
||||
self.minimize_memory_usage = False
|
||||
self.autorotation = opts.enable_autorotation
|
||||
self.header_separation = (self.profile.dpi/72.) * opts.header_separation
|
||||
self.headerformat = opts.header_format
|
||||
|
||||
for x in ('top', 'bottom', 'left', 'right'):
|
||||
setattr(self, x+'_margin',
|
||||
(self.profile.dpi/72.) * float(getattr(opts, 'margin_'+x)))
|
||||
|
||||
for x in ('wordspace', 'header', 'header_format',
|
||||
'minimum_indent', 'serif_family',
|
||||
'render_tables_as_images', 'sans_family', 'mono_family',
|
||||
'text_size_multiplier_for_rendered_tables'):
|
||||
setattr(self, x, getattr(opts, x))
|
||||
|
||||
|
||||
class LRFOutput(OutputFormatPlugin):
|
||||
|
||||
name = 'LRF Output'
|
||||
author = 'Kovid Goyal'
|
||||
file_type = 'lrf'
|
||||
commit_name = 'lrf_output'
|
||||
|
||||
options = {
|
||||
OptionRecommendation(name='enable_autorotation', recommended_value=False,
|
||||
help=_('Enable auto-rotation of images that are wider than the screen width.')
|
||||
),
|
||||
OptionRecommendation(name='wordspace',
|
||||
recommended_value=2.5, level=OptionRecommendation.LOW,
|
||||
help=_('Set the space between words in pts. Default is %default')
|
||||
),
|
||||
OptionRecommendation(name='header', recommended_value=False,
|
||||
help=_('Add a header to all the pages with title and author.')
|
||||
),
|
||||
OptionRecommendation(name='header_format', recommended_value="%t by %a",
|
||||
help=_('Set the format of the header. %a is replaced by the author '
|
||||
'and %t by the title. Default is %default')
|
||||
),
|
||||
OptionRecommendation(name='header_separation', recommended_value=0,
|
||||
help=_('Add extra spacing below the header. Default is %default pt.')
|
||||
),
|
||||
OptionRecommendation(name='minimum_indent', recommended_value=0,
|
||||
help=_('Minimum paragraph indent (the indent of the first line '
|
||||
'of a paragraph) in pts. Default: %default')
|
||||
),
|
||||
OptionRecommendation(name='render_tables_as_images',
|
||||
recommended_value=False,
|
||||
help=_('This option has no effect')
|
||||
),
|
||||
OptionRecommendation(name='text_size_multiplier_for_rendered_tables',
|
||||
recommended_value=1.0,
|
||||
help=_('Multiply the size of text in rendered tables by this '
|
||||
'factor. Default is %default')
|
||||
),
|
||||
OptionRecommendation(name='serif_family', recommended_value=None,
|
||||
help=_('The serif family of fonts to embed')
|
||||
),
|
||||
OptionRecommendation(name='sans_family', recommended_value=None,
|
||||
help=_('The sans-serif family of fonts to embed')
|
||||
),
|
||||
OptionRecommendation(name='mono_family', recommended_value=None,
|
||||
help=_('The monospace family of fonts to embed')
|
||||
),
|
||||
|
||||
}
|
||||
|
||||
recommendations = {
|
||||
('change_justification', 'original', OptionRecommendation.HIGH)}
|
||||
|
||||
def convert_images(self, pages, opts, wide):
|
||||
from calibre.ebooks.lrf.pylrs.pylrs import Book, BookSetting, ImageStream, ImageBlock
|
||||
from uuid import uuid4
|
||||
from calibre.constants import __appname__, __version__
|
||||
|
||||
width, height = (784, 1012) if wide else (584, 754)
|
||||
|
||||
ps = {}
|
||||
ps['topmargin'] = 0
|
||||
ps['evensidemargin'] = 0
|
||||
ps['oddsidemargin'] = 0
|
||||
ps['textwidth'] = width
|
||||
ps['textheight'] = height
|
||||
book = Book(title=opts.title, author=opts.author,
|
||||
bookid=uuid4().hex,
|
||||
publisher='%s %s'%(__appname__, __version__),
|
||||
category=_('Comic'), pagestyledefault=ps,
|
||||
booksetting=BookSetting(screenwidth=width, screenheight=height))
|
||||
for page in pages:
|
||||
imageStream = ImageStream(page)
|
||||
_page = book.create_page()
|
||||
_page.append(ImageBlock(refstream=imageStream,
|
||||
blockwidth=width, blockheight=height, xsize=width,
|
||||
ysize=height, x1=width, y1=height))
|
||||
book.append(_page)
|
||||
|
||||
book.renderLrf(open(opts.output, 'wb'))
|
||||
|
||||
def flatten_toc(self):
|
||||
from calibre.ebooks.oeb.base import TOC
|
||||
nroot = TOC()
|
||||
for x in self.oeb.toc.iterdescendants():
|
||||
nroot.add(x.title, x.href)
|
||||
self.oeb.toc = nroot
|
||||
|
||||
def convert(self, oeb, output_path, input_plugin, opts, log):
|
||||
self.log, self.opts, self.oeb = log, opts, oeb
|
||||
|
||||
lrf_opts = LRFOptions(output_path, opts, oeb)
|
||||
|
||||
if input_plugin.is_image_collection:
|
||||
self.convert_images(input_plugin.get_images(), lrf_opts,
|
||||
getattr(opts, 'wide', False))
|
||||
return
|
||||
|
||||
self.flatten_toc()
|
||||
|
||||
from calibre.ptempfile import TemporaryDirectory
|
||||
with TemporaryDirectory('_lrf_output') as tdir:
|
||||
from calibre.customize.ui import plugin_for_output_format
|
||||
oeb_output = plugin_for_output_format('oeb')
|
||||
oeb_output.convert(oeb, tdir, input_plugin, opts, log)
|
||||
opf = [x for x in os.listdir(tdir) if x.endswith('.opf')][0]
|
||||
from calibre.ebooks.lrf.html.convert_from import process_file
|
||||
process_file(os.path.join(tdir, opf), lrf_opts, self.log)
|
||||
66
ebook_converter/ebooks/conversion/plugins/mobi_input.py
Normal file
66
ebook_converter/ebooks/conversion/plugins/mobi_input.py
Normal file
@@ -0,0 +1,66 @@
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin
|
||||
from polyglot.builtins import unicode_type
|
||||
|
||||
|
||||
class MOBIInput(InputFormatPlugin):
|
||||
|
||||
name = 'MOBI Input'
|
||||
author = 'Kovid Goyal'
|
||||
description = 'Convert MOBI files (.mobi, .prc, .azw) to HTML'
|
||||
file_types = {'mobi', 'prc', 'azw', 'azw3', 'pobi'}
|
||||
commit_name = 'mobi_input'
|
||||
|
||||
def convert(self, stream, options, file_ext, log,
|
||||
accelerators):
|
||||
self.is_kf8 = False
|
||||
self.mobi_is_joint = False
|
||||
|
||||
from calibre.ebooks.mobi.reader.mobi6 import MobiReader
|
||||
from lxml import html
|
||||
parse_cache = {}
|
||||
try:
|
||||
mr = MobiReader(stream, log, options.input_encoding,
|
||||
options.debug_pipeline)
|
||||
if mr.kf8_type is None:
|
||||
mr.extract_content('.', parse_cache)
|
||||
|
||||
except:
|
||||
mr = MobiReader(stream, log, options.input_encoding,
|
||||
options.debug_pipeline, try_extra_data_fix=True)
|
||||
if mr.kf8_type is None:
|
||||
mr.extract_content('.', parse_cache)
|
||||
|
||||
if mr.kf8_type is not None:
|
||||
log('Found KF8 MOBI of type %r'%mr.kf8_type)
|
||||
if mr.kf8_type == 'joint':
|
||||
self.mobi_is_joint = True
|
||||
from calibre.ebooks.mobi.reader.mobi8 import Mobi8Reader
|
||||
mr = Mobi8Reader(mr, log)
|
||||
opf = os.path.abspath(mr())
|
||||
self.encrypted_fonts = mr.encrypted_fonts
|
||||
self.is_kf8 = True
|
||||
return opf
|
||||
|
||||
raw = parse_cache.pop('calibre_raw_mobi_markup', False)
|
||||
if raw:
|
||||
if isinstance(raw, unicode_type):
|
||||
raw = raw.encode('utf-8')
|
||||
with lopen('debug-raw.html', 'wb') as f:
|
||||
f.write(raw)
|
||||
from calibre.ebooks.oeb.base import close_self_closing_tags
|
||||
for f, root in parse_cache.items():
|
||||
raw = html.tostring(root, encoding='utf-8', method='xml',
|
||||
include_meta_content_type=False)
|
||||
raw = close_self_closing_tags(raw)
|
||||
with lopen(f, 'wb') as q:
|
||||
q.write(raw)
|
||||
accelerators['pagebreaks'] = '//h:div[@class="mbp_pagebreak"]'
|
||||
return mr.created_opf_path
|
||||
337
ebook_converter/ebooks/conversion/plugins/mobi_output.py
Normal file
337
ebook_converter/ebooks/conversion/plugins/mobi_output.py
Normal file
@@ -0,0 +1,337 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
from calibre.customize.conversion import (OutputFormatPlugin,
|
||||
OptionRecommendation)
|
||||
from polyglot.builtins import unicode_type
|
||||
|
||||
|
||||
def remove_html_cover(oeb, log):
|
||||
from calibre.ebooks.oeb.base import OEB_DOCS
|
||||
|
||||
if not oeb.metadata.cover \
|
||||
or 'cover' not in oeb.guide:
|
||||
return
|
||||
href = oeb.guide['cover'].href
|
||||
del oeb.guide['cover']
|
||||
item = oeb.manifest.hrefs[href]
|
||||
if item.spine_position is not None:
|
||||
log.warn('Found an HTML cover: ', item.href, 'removing it.',
|
||||
'If you find some content missing from the output MOBI, it '
|
||||
'is because you misidentified the HTML cover in the input '
|
||||
'document')
|
||||
oeb.spine.remove(item)
|
||||
if item.media_type in OEB_DOCS:
|
||||
oeb.manifest.remove(item)
|
||||
|
||||
|
||||
def extract_mobi(output_path, opts):
|
||||
if opts.extract_to is not None:
|
||||
from calibre.ebooks.mobi.debug.main import inspect_mobi
|
||||
ddir = opts.extract_to
|
||||
inspect_mobi(output_path, ddir=ddir)
|
||||
|
||||
|
||||
class MOBIOutput(OutputFormatPlugin):
|
||||
|
||||
name = 'MOBI Output'
|
||||
author = 'Kovid Goyal'
|
||||
file_type = 'mobi'
|
||||
commit_name = 'mobi_output'
|
||||
ui_data = {'file_types': ['old', 'both', 'new']}
|
||||
|
||||
options = {
|
||||
OptionRecommendation(name='prefer_author_sort',
|
||||
recommended_value=False, level=OptionRecommendation.LOW,
|
||||
help=_('When present, use author sort field as author.')
|
||||
),
|
||||
OptionRecommendation(name='no_inline_toc',
|
||||
recommended_value=False, level=OptionRecommendation.LOW,
|
||||
help=_('Don\'t add Table of Contents to the book. Useful if '
|
||||
'the book has its own table of contents.')),
|
||||
OptionRecommendation(name='toc_title', recommended_value=None,
|
||||
help=_('Title for any generated in-line table of contents.')
|
||||
),
|
||||
OptionRecommendation(name='dont_compress',
|
||||
recommended_value=False, level=OptionRecommendation.LOW,
|
||||
help=_('Disable compression of the file contents.')
|
||||
),
|
||||
OptionRecommendation(name='personal_doc', recommended_value='[PDOC]',
|
||||
help=_('Tag for MOBI files to be marked as personal documents.'
|
||||
' This option has no effect on the conversion. It is used'
|
||||
' only when sending MOBI files to a device. If the file'
|
||||
' being sent has the specified tag, it will be marked as'
|
||||
' a personal document when sent to the Kindle.')
|
||||
),
|
||||
OptionRecommendation(name='mobi_ignore_margins',
|
||||
recommended_value=False,
|
||||
help=_('Ignore margins in the input document. If False, then '
|
||||
'the MOBI output plugin will try to convert margins specified'
|
||||
' in the input document, otherwise it will ignore them.')
|
||||
),
|
||||
OptionRecommendation(name='mobi_toc_at_start',
|
||||
recommended_value=False,
|
||||
help=_('When adding the Table of Contents to the book, add it at the start of the '
|
||||
'book instead of the end. Not recommended.')
|
||||
),
|
||||
OptionRecommendation(name='extract_to',
|
||||
help=_('Extract the contents of the generated %s file to the '
|
||||
'specified directory. The contents of the directory are first '
|
||||
'deleted, so be careful.') % 'MOBI'
|
||||
),
|
||||
OptionRecommendation(name='share_not_sync', recommended_value=False,
|
||||
help=_('Enable sharing of book content via Facebook etc. '
|
||||
' on the Kindle. WARNING: Using this feature means that '
|
||||
' the book will not auto sync its last read position '
|
||||
' on multiple devices. Complain to Amazon.')
|
||||
),
|
||||
OptionRecommendation(name='mobi_keep_original_images',
|
||||
recommended_value=False,
|
||||
help=_('By default calibre converts all images to JPEG format '
|
||||
'in the output MOBI file. This is for maximum compatibility '
|
||||
'as some older MOBI viewers have problems with other image '
|
||||
'formats. This option tells calibre not to do this. '
|
||||
'Useful if your document contains lots of GIF/PNG images that '
|
||||
'become very large when converted to JPEG.')),
|
||||
OptionRecommendation(name='mobi_file_type', choices=ui_data['file_types'], recommended_value='old',
|
||||
help=_('By default calibre generates MOBI files that contain the '
|
||||
'old MOBI 6 format. This format is compatible with all '
|
||||
'devices. However, by changing this setting, you can tell '
|
||||
'calibre to generate MOBI files that contain both MOBI 6 and '
|
||||
'the new KF8 format, or only the new KF8 format. KF8 has '
|
||||
'more features than MOBI 6, but only works with newer Kindles. '
|
||||
'Allowed values: {}').format('old, both, new')),
|
||||
|
||||
}
|
||||
|
||||
def check_for_periodical(self):
|
||||
if self.is_periodical:
|
||||
self.periodicalize_toc()
|
||||
self.check_for_masthead()
|
||||
self.opts.mobi_periodical = True
|
||||
else:
|
||||
self.opts.mobi_periodical = False
|
||||
|
||||
def check_for_masthead(self):
|
||||
found = 'masthead' in self.oeb.guide
|
||||
if not found:
|
||||
from calibre.ebooks import generate_masthead
|
||||
self.oeb.log.debug('No masthead found in manifest, generating default mastheadImage...')
|
||||
raw = generate_masthead(unicode_type(self.oeb.metadata['title'][0]))
|
||||
id, href = self.oeb.manifest.generate('masthead', 'masthead')
|
||||
self.oeb.manifest.add(id, href, 'image/gif', data=raw)
|
||||
self.oeb.guide.add('masthead', 'Masthead Image', href)
|
||||
else:
|
||||
self.oeb.log.debug('Using mastheadImage supplied in manifest...')
|
||||
|
||||
def periodicalize_toc(self):
|
||||
from calibre.ebooks.oeb.base import TOC
|
||||
toc = self.oeb.toc
|
||||
if not toc or len(self.oeb.spine) < 3:
|
||||
return
|
||||
if toc and toc[0].klass != 'periodical':
|
||||
one, two = self.oeb.spine[0], self.oeb.spine[1]
|
||||
self.log('Converting TOC for MOBI periodical indexing...')
|
||||
|
||||
articles = {}
|
||||
if toc.depth() < 3:
|
||||
# single section periodical
|
||||
self.oeb.manifest.remove(one)
|
||||
self.oeb.manifest.remove(two)
|
||||
sections = [TOC(klass='section', title=_('All articles'),
|
||||
href=self.oeb.spine[0].href)]
|
||||
for x in toc:
|
||||
sections[0].nodes.append(x)
|
||||
else:
|
||||
# multi-section periodical
|
||||
self.oeb.manifest.remove(one)
|
||||
sections = list(toc)
|
||||
for i,x in enumerate(sections):
|
||||
x.klass = 'section'
|
||||
articles_ = list(x)
|
||||
if articles_:
|
||||
self.oeb.manifest.remove(self.oeb.manifest.hrefs[x.href])
|
||||
x.href = articles_[0].href
|
||||
|
||||
for sec in sections:
|
||||
articles[id(sec)] = []
|
||||
for a in list(sec):
|
||||
a.klass = 'article'
|
||||
articles[id(sec)].append(a)
|
||||
sec.nodes.remove(a)
|
||||
|
||||
root = TOC(klass='periodical', href=self.oeb.spine[0].href,
|
||||
title=unicode_type(self.oeb.metadata.title[0]))
|
||||
|
||||
for s in sections:
|
||||
if articles[id(s)]:
|
||||
for a in articles[id(s)]:
|
||||
s.nodes.append(a)
|
||||
root.nodes.append(s)
|
||||
|
||||
for x in list(toc.nodes):
|
||||
toc.nodes.remove(x)
|
||||
|
||||
toc.nodes.append(root)
|
||||
|
||||
# Fix up the periodical href to point to first section href
|
||||
toc.nodes[0].href = toc.nodes[0].nodes[0].href
|
||||
|
||||
def convert(self, oeb, output_path, input_plugin, opts, log):
|
||||
from calibre.ebooks.mobi.writer2.resources import Resources
|
||||
self.log, self.opts, self.oeb = log, opts, oeb
|
||||
|
||||
mobi_type = opts.mobi_file_type
|
||||
if self.is_periodical:
|
||||
mobi_type = 'old' # Amazon does not support KF8 periodicals
|
||||
create_kf8 = mobi_type in ('new', 'both')
|
||||
|
||||
remove_html_cover(self.oeb, self.log)
|
||||
resources = Resources(oeb, opts, self.is_periodical,
|
||||
add_fonts=create_kf8)
|
||||
self.check_for_periodical()
|
||||
|
||||
if create_kf8:
|
||||
from calibre.ebooks.mobi.writer8.cleanup import remove_duplicate_anchors
|
||||
remove_duplicate_anchors(self.oeb)
|
||||
# Split on pagebreaks so that the resulting KF8 is faster to load
|
||||
from calibre.ebooks.oeb.transforms.split import Split
|
||||
Split()(self.oeb, self.opts)
|
||||
|
||||
kf8 = self.create_kf8(resources, for_joint=mobi_type=='both'
|
||||
) if create_kf8 else None
|
||||
if mobi_type == 'new':
|
||||
kf8.write(output_path)
|
||||
extract_mobi(output_path, opts)
|
||||
return
|
||||
|
||||
self.log('Creating MOBI 6 output')
|
||||
self.write_mobi(input_plugin, output_path, kf8, resources)
|
||||
|
||||
def create_kf8(self, resources, for_joint=False):
|
||||
from calibre.ebooks.mobi.writer8.main import create_kf8_book
|
||||
return create_kf8_book(self.oeb, self.opts, resources,
|
||||
for_joint=for_joint)
|
||||
|
||||
def write_mobi(self, input_plugin, output_path, kf8, resources):
|
||||
from calibre.ebooks.mobi.mobiml import MobiMLizer
|
||||
from calibre.ebooks.oeb.transforms.manglecase import CaseMangler
|
||||
from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer, Unavailable
|
||||
from calibre.ebooks.oeb.transforms.htmltoc import HTMLTOCAdder
|
||||
from calibre.customize.ui import plugin_for_input_format
|
||||
|
||||
opts, oeb = self.opts, self.oeb
|
||||
if not opts.no_inline_toc:
|
||||
tocadder = HTMLTOCAdder(title=opts.toc_title, position='start' if
|
||||
opts.mobi_toc_at_start else 'end')
|
||||
tocadder(oeb, opts)
|
||||
mangler = CaseMangler()
|
||||
mangler(oeb, opts)
|
||||
try:
|
||||
rasterizer = SVGRasterizer()
|
||||
rasterizer(oeb, opts)
|
||||
except Unavailable:
|
||||
self.log.warn('SVG rasterizer unavailable, SVG will not be converted')
|
||||
else:
|
||||
# Add rasterized SVG images
|
||||
resources.add_extra_images()
|
||||
if hasattr(self.oeb, 'inserted_metadata_jacket'):
|
||||
self.workaround_fire_bugs(self.oeb.inserted_metadata_jacket)
|
||||
mobimlizer = MobiMLizer(ignore_tables=opts.linearize_tables)
|
||||
mobimlizer(oeb, opts)
|
||||
write_page_breaks_after_item = input_plugin is not plugin_for_input_format('cbz')
|
||||
from calibre.ebooks.mobi.writer2.main import MobiWriter
|
||||
writer = MobiWriter(opts, resources, kf8,
|
||||
write_page_breaks_after_item=write_page_breaks_after_item)
|
||||
writer(oeb, output_path)
|
||||
extract_mobi(output_path, opts)
|
||||
|
||||
def specialize_css_for_output(self, log, opts, item, stylizer):
|
||||
from calibre.ebooks.mobi.writer8.cleanup import CSSCleanup
|
||||
CSSCleanup(log, opts)(item, stylizer)
|
||||
|
||||
def workaround_fire_bugs(self, jacket):
|
||||
# The idiotic Fire crashes when trying to render the table used to
|
||||
# layout the jacket
|
||||
from calibre.ebooks.oeb.base import XHTML
|
||||
for table in jacket.data.xpath('//*[local-name()="table"]'):
|
||||
table.tag = XHTML('div')
|
||||
for tr in table.xpath('descendant::*[local-name()="tr"]'):
|
||||
cols = tr.xpath('descendant::*[local-name()="td"]')
|
||||
tr.tag = XHTML('div')
|
||||
for td in cols:
|
||||
td.tag = XHTML('span' if cols else 'div')
|
||||
|
||||
|
||||
class AZW3Output(OutputFormatPlugin):
|
||||
|
||||
name = 'AZW3 Output'
|
||||
author = 'Kovid Goyal'
|
||||
file_type = 'azw3'
|
||||
commit_name = 'azw3_output'
|
||||
|
||||
options = {
|
||||
OptionRecommendation(name='prefer_author_sort',
|
||||
recommended_value=False, level=OptionRecommendation.LOW,
|
||||
help=_('When present, use author sort field as author.')
|
||||
),
|
||||
OptionRecommendation(name='no_inline_toc',
|
||||
recommended_value=False, level=OptionRecommendation.LOW,
|
||||
help=_('Don\'t add Table of Contents to the book. Useful if '
|
||||
'the book has its own table of contents.')),
|
||||
OptionRecommendation(name='toc_title', recommended_value=None,
|
||||
help=_('Title for any generated in-line table of contents.')
|
||||
),
|
||||
OptionRecommendation(name='dont_compress',
|
||||
recommended_value=False, level=OptionRecommendation.LOW,
|
||||
help=_('Disable compression of the file contents.')
|
||||
),
|
||||
OptionRecommendation(name='mobi_toc_at_start',
|
||||
recommended_value=False,
|
||||
help=_('When adding the Table of Contents to the book, add it at the start of the '
|
||||
'book instead of the end. Not recommended.')
|
||||
),
|
||||
OptionRecommendation(name='extract_to',
|
||||
help=_('Extract the contents of the generated %s file to the '
|
||||
'specified directory. The contents of the directory are first '
|
||||
'deleted, so be careful.') % 'AZW3'),
|
||||
OptionRecommendation(name='share_not_sync', recommended_value=False,
|
||||
help=_('Enable sharing of book content via Facebook etc. '
|
||||
' on the Kindle. WARNING: Using this feature means that '
|
||||
' the book will not auto sync its last read position '
|
||||
' on multiple devices. Complain to Amazon.')
|
||||
),
|
||||
}
|
||||
|
||||
def convert(self, oeb, output_path, input_plugin, opts, log):
|
||||
from calibre.ebooks.mobi.writer2.resources import Resources
|
||||
from calibre.ebooks.mobi.writer8.main import create_kf8_book
|
||||
from calibre.ebooks.mobi.writer8.cleanup import remove_duplicate_anchors
|
||||
|
||||
self.oeb, self.opts, self.log = oeb, opts, log
|
||||
opts.mobi_periodical = self.is_periodical
|
||||
passthrough = getattr(opts, 'mobi_passthrough', False)
|
||||
remove_duplicate_anchors(oeb)
|
||||
|
||||
resources = Resources(self.oeb, self.opts, self.is_periodical,
|
||||
add_fonts=True, process_images=False)
|
||||
if not passthrough:
|
||||
remove_html_cover(self.oeb, self.log)
|
||||
|
||||
# Split on pagebreaks so that the resulting KF8 is faster to load
|
||||
from calibre.ebooks.oeb.transforms.split import Split
|
||||
Split()(self.oeb, self.opts)
|
||||
|
||||
kf8 = create_kf8_book(self.oeb, self.opts, resources, for_joint=False)
|
||||
|
||||
kf8.write(output_path)
|
||||
extract_mobi(output_path, opts)
|
||||
|
||||
def specialize_css_for_output(self, log, opts, item, stylizer):
|
||||
from calibre.ebooks.mobi.writer8.cleanup import CSSCleanup
|
||||
CSSCleanup(log, opts)(item, stylizer)
|
||||
25
ebook_converter/ebooks/conversion/plugins/odt_input.py
Normal file
25
ebook_converter/ebooks/conversion/plugins/odt_input.py
Normal file
@@ -0,0 +1,25 @@
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
'''
|
||||
Convert an ODT file into a Open Ebook
|
||||
'''
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin
|
||||
|
||||
|
||||
class ODTInput(InputFormatPlugin):
|
||||
|
||||
name = 'ODT Input'
|
||||
author = 'Kovid Goyal'
|
||||
description = 'Convert ODT (OpenOffice) files to HTML'
|
||||
file_types = {'odt'}
|
||||
commit_name = 'odt_input'
|
||||
|
||||
def convert(self, stream, options, file_ext, log,
|
||||
accelerators):
|
||||
from calibre.ebooks.odt.input import Extract
|
||||
return Extract()(stream, '.', log)
|
||||
122
ebook_converter/ebooks/conversion/plugins/oeb_output.py
Normal file
122
ebook_converter/ebooks/conversion/plugins/oeb_output.py
Normal file
@@ -0,0 +1,122 @@
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os, re
|
||||
|
||||
|
||||
from calibre.customize.conversion import (OutputFormatPlugin,
|
||||
OptionRecommendation)
|
||||
from calibre import CurrentDir
|
||||
|
||||
|
||||
class OEBOutput(OutputFormatPlugin):
|
||||
|
||||
name = 'OEB Output'
|
||||
author = 'Kovid Goyal'
|
||||
file_type = 'oeb'
|
||||
commit_name = 'oeb_output'
|
||||
|
||||
recommendations = {('pretty_print', True, OptionRecommendation.HIGH)}
|
||||
|
||||
def convert(self, oeb_book, output_path, input_plugin, opts, log):
|
||||
from polyglot.urllib import unquote
|
||||
from lxml import etree
|
||||
|
||||
self.log, self.opts = log, opts
|
||||
if not os.path.exists(output_path):
|
||||
os.makedirs(output_path)
|
||||
from calibre.ebooks.oeb.base import OPF_MIME, NCX_MIME, PAGE_MAP_MIME, OEB_STYLES
|
||||
from calibre.ebooks.oeb.normalize_css import condense_sheet
|
||||
with CurrentDir(output_path):
|
||||
results = oeb_book.to_opf2(page_map=True)
|
||||
for key in (OPF_MIME, NCX_MIME, PAGE_MAP_MIME):
|
||||
href, root = results.pop(key, [None, None])
|
||||
if root is not None:
|
||||
if key == OPF_MIME:
|
||||
try:
|
||||
self.workaround_nook_cover_bug(root)
|
||||
except:
|
||||
self.log.exception('Something went wrong while trying to'
|
||||
' workaround Nook cover bug, ignoring')
|
||||
try:
|
||||
self.workaround_pocketbook_cover_bug(root)
|
||||
except:
|
||||
self.log.exception('Something went wrong while trying to'
|
||||
' workaround Pocketbook cover bug, ignoring')
|
||||
self.migrate_lang_code(root)
|
||||
raw = etree.tostring(root, pretty_print=True,
|
||||
encoding='utf-8', xml_declaration=True)
|
||||
if key == OPF_MIME:
|
||||
# Needed as I can't get lxml to output opf:role and
|
||||
# not output <opf:metadata> as well
|
||||
raw = re.sub(br'(<[/]{0,1})opf:', br'\1', raw)
|
||||
with lopen(href, 'wb') as f:
|
||||
f.write(raw)
|
||||
|
||||
for item in oeb_book.manifest:
|
||||
if (
|
||||
not self.opts.expand_css and item.media_type in OEB_STYLES and hasattr(
|
||||
item.data, 'cssText') and 'nook' not in self.opts.output_profile.short_name):
|
||||
condense_sheet(item.data)
|
||||
path = os.path.abspath(unquote(item.href))
|
||||
dir = os.path.dirname(path)
|
||||
if not os.path.exists(dir):
|
||||
os.makedirs(dir)
|
||||
with lopen(path, 'wb') as f:
|
||||
f.write(item.bytes_representation)
|
||||
item.unload_data_from_memory(memory=path)
|
||||
|
||||
def workaround_nook_cover_bug(self, root): # {{{
|
||||
cov = root.xpath('//*[local-name() = "meta" and @name="cover" and'
|
||||
' @content != "cover"]')
|
||||
|
||||
def manifest_items_with_id(id_):
|
||||
return root.xpath('//*[local-name() = "manifest"]/*[local-name() = "item" '
|
||||
' and @id="%s"]'%id_)
|
||||
|
||||
if len(cov) == 1:
|
||||
cov = cov[0]
|
||||
covid = cov.get('content', '')
|
||||
|
||||
if covid:
|
||||
manifest_item = manifest_items_with_id(covid)
|
||||
if len(manifest_item) == 1 and \
|
||||
manifest_item[0].get('media-type',
|
||||
'').startswith('image/'):
|
||||
self.log.warn('The cover image has an id != "cover". Renaming'
|
||||
' to work around bug in Nook Color')
|
||||
|
||||
from calibre.ebooks.oeb.base import uuid_id
|
||||
newid = uuid_id()
|
||||
|
||||
for item in manifest_items_with_id('cover'):
|
||||
item.set('id', newid)
|
||||
|
||||
for x in root.xpath('//*[@idref="cover"]'):
|
||||
x.set('idref', newid)
|
||||
|
||||
manifest_item = manifest_item[0]
|
||||
manifest_item.set('id', 'cover')
|
||||
cov.set('content', 'cover')
|
||||
# }}}
|
||||
|
||||
def workaround_pocketbook_cover_bug(self, root): # {{{
|
||||
m = root.xpath('//*[local-name() = "manifest"]/*[local-name() = "item" '
|
||||
' and @id="cover"]')
|
||||
if len(m) == 1:
|
||||
m = m[0]
|
||||
p = m.getparent()
|
||||
p.remove(m)
|
||||
p.insert(0, m)
|
||||
# }}}
|
||||
|
||||
def migrate_lang_code(self, root): # {{{
|
||||
from calibre.utils.localization import lang_as_iso639_1
|
||||
for lang in root.xpath('//*[local-name() = "language"]'):
|
||||
clc = lang_as_iso639_1(lang.text)
|
||||
if clc:
|
||||
lang.text = clc
|
||||
# }}}
|
||||
37
ebook_converter/ebooks/conversion/plugins/pdb_input.py
Normal file
37
ebook_converter/ebooks/conversion/plugins/pdb_input.py
Normal file
@@ -0,0 +1,37 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin
|
||||
from polyglot.builtins import getcwd
|
||||
|
||||
|
||||
class PDBInput(InputFormatPlugin):
|
||||
|
||||
name = 'PDB Input'
|
||||
author = 'John Schember'
|
||||
description = 'Convert PDB to HTML'
|
||||
file_types = {'pdb', 'updb'}
|
||||
commit_name = 'pdb_input'
|
||||
|
||||
def convert(self, stream, options, file_ext, log,
|
||||
accelerators):
|
||||
from calibre.ebooks.pdb.header import PdbHeaderReader
|
||||
from calibre.ebooks.pdb import PDBError, IDENTITY_TO_NAME, get_reader
|
||||
|
||||
header = PdbHeaderReader(stream)
|
||||
Reader = get_reader(header.ident)
|
||||
|
||||
if Reader is None:
|
||||
raise PDBError('No reader available for format within container.\n Identity is %s. Book type is %s' %
|
||||
(header.ident, IDENTITY_TO_NAME.get(header.ident, _('Unknown'))))
|
||||
|
||||
log.debug('Detected ebook format as: %s with identity: %s' % (IDENTITY_TO_NAME[header.ident], header.ident))
|
||||
|
||||
reader = Reader(header, stream, log, options)
|
||||
opf = reader.extract_content(getcwd())
|
||||
|
||||
return opf
|
||||
64
ebook_converter/ebooks/conversion/plugins/pdb_output.py
Normal file
64
ebook_converter/ebooks/conversion/plugins/pdb_output.py
Normal file
@@ -0,0 +1,64 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os
|
||||
|
||||
from calibre.customize.conversion import OutputFormatPlugin, \
|
||||
OptionRecommendation
|
||||
from calibre.ebooks.pdb import PDBError, get_writer, ALL_FORMAT_WRITERS
|
||||
|
||||
|
||||
class PDBOutput(OutputFormatPlugin):
|
||||
|
||||
name = 'PDB Output'
|
||||
author = 'John Schember'
|
||||
file_type = 'pdb'
|
||||
commit_name = 'pdb_output'
|
||||
ui_data = {'formats': tuple(ALL_FORMAT_WRITERS)}
|
||||
|
||||
options = {
|
||||
OptionRecommendation(name='format', recommended_value='doc',
|
||||
level=OptionRecommendation.LOW,
|
||||
short_switch='f', choices=list(ALL_FORMAT_WRITERS),
|
||||
help=(_('Format to use inside the pdb container. Choices are:') + ' %s' % sorted(ALL_FORMAT_WRITERS))),
|
||||
OptionRecommendation(name='pdb_output_encoding', recommended_value='cp1252',
|
||||
level=OptionRecommendation.LOW,
|
||||
help=_('Specify the character encoding of the output document. '
|
||||
'The default is cp1252. Note: This option is not honored by all '
|
||||
'formats.')),
|
||||
OptionRecommendation(name='inline_toc',
|
||||
recommended_value=False, level=OptionRecommendation.LOW,
|
||||
help=_('Add Table of Contents to beginning of the book.')),
|
||||
}
|
||||
|
||||
def convert(self, oeb_book, output_path, input_plugin, opts, log):
|
||||
close = False
|
||||
if not hasattr(output_path, 'write'):
|
||||
close = True
|
||||
if not os.path.exists(os.path.dirname(output_path)) and os.path.dirname(output_path):
|
||||
os.makedirs(os.path.dirname(output_path))
|
||||
out_stream = lopen(output_path, 'wb')
|
||||
else:
|
||||
out_stream = output_path
|
||||
|
||||
Writer = get_writer(opts.format)
|
||||
|
||||
if Writer is None:
|
||||
raise PDBError('No writer available for format %s.' % format)
|
||||
|
||||
setattr(opts, 'max_line_length', 0)
|
||||
setattr(opts, 'force_max_line_length', False)
|
||||
|
||||
writer = Writer(opts, log)
|
||||
|
||||
out_stream.seek(0)
|
||||
out_stream.truncate()
|
||||
|
||||
writer.write_content(oeb_book, out_stream, oeb_book.metadata)
|
||||
|
||||
if close:
|
||||
out_stream.close()
|
||||
82
ebook_converter/ebooks/conversion/plugins/pdf_input.py
Normal file
82
ebook_converter/ebooks/conversion/plugins/pdf_input.py
Normal file
@@ -0,0 +1,82 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
|
||||
from polyglot.builtins import as_bytes, getcwd
|
||||
|
||||
|
||||
class PDFInput(InputFormatPlugin):
|
||||
|
||||
name = 'PDF Input'
|
||||
author = 'Kovid Goyal and John Schember'
|
||||
description = 'Convert PDF files to HTML'
|
||||
file_types = {'pdf'}
|
||||
commit_name = 'pdf_input'
|
||||
|
||||
options = {
|
||||
OptionRecommendation(name='no_images', recommended_value=False,
|
||||
help=_('Do not extract images from the document')),
|
||||
OptionRecommendation(name='unwrap_factor', recommended_value=0.45,
|
||||
help=_('Scale used to determine the length at which a line should '
|
||||
'be unwrapped. Valid values are a decimal between 0 and 1. The '
|
||||
'default is 0.45, just below the median line length.')),
|
||||
OptionRecommendation(name='new_pdf_engine', recommended_value=False,
|
||||
help=_('Use the new PDF conversion engine. Currently not operational.'))
|
||||
}
|
||||
|
||||
def convert_new(self, stream, accelerators):
|
||||
from calibre.ebooks.pdf.pdftohtml import pdftohtml
|
||||
from calibre.utils.cleantext import clean_ascii_chars
|
||||
from calibre.ebooks.pdf.reflow import PDFDocument
|
||||
|
||||
pdftohtml(getcwd(), stream.name, self.opts.no_images, as_xml=True)
|
||||
with lopen('index.xml', 'rb') as f:
|
||||
xml = clean_ascii_chars(f.read())
|
||||
PDFDocument(xml, self.opts, self.log)
|
||||
return os.path.join(getcwd(), 'metadata.opf')
|
||||
|
||||
def convert(self, stream, options, file_ext, log,
|
||||
accelerators):
|
||||
from calibre.ebooks.metadata.opf2 import OPFCreator
|
||||
from calibre.ebooks.pdf.pdftohtml import pdftohtml
|
||||
|
||||
log.debug('Converting file to html...')
|
||||
# The main html file will be named index.html
|
||||
self.opts, self.log = options, log
|
||||
if options.new_pdf_engine:
|
||||
return self.convert_new(stream, accelerators)
|
||||
pdftohtml(getcwd(), stream.name, options.no_images)
|
||||
|
||||
from calibre.ebooks.metadata.meta import get_metadata
|
||||
log.debug('Retrieving document metadata...')
|
||||
mi = get_metadata(stream, 'pdf')
|
||||
opf = OPFCreator(getcwd(), mi)
|
||||
|
||||
manifest = [('index.html', None)]
|
||||
|
||||
images = os.listdir(getcwd())
|
||||
images.remove('index.html')
|
||||
for i in images:
|
||||
manifest.append((i, None))
|
||||
log.debug('Generating manifest...')
|
||||
opf.create_manifest(manifest)
|
||||
|
||||
opf.create_spine(['index.html'])
|
||||
log.debug('Rendering manifest...')
|
||||
with lopen('metadata.opf', 'wb') as opffile:
|
||||
opf.render(opffile)
|
||||
if os.path.exists('toc.ncx'):
|
||||
ncxid = opf.manifest.id_for_path('toc.ncx')
|
||||
if ncxid:
|
||||
with lopen('metadata.opf', 'r+b') as f:
|
||||
raw = f.read().replace(b'<spine', b'<spine toc="%s"' % as_bytes(ncxid))
|
||||
f.seek(0)
|
||||
f.write(raw)
|
||||
|
||||
return os.path.join(getcwd(), 'metadata.opf')
|
||||
256
ebook_converter/ebooks/conversion/plugins/pdf_output.py
Normal file
256
ebook_converter/ebooks/conversion/plugins/pdf_output.py
Normal file
@@ -0,0 +1,256 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
'''
|
||||
Convert OEB ebook format to PDF.
|
||||
'''
|
||||
|
||||
import glob, os
|
||||
|
||||
from calibre.customize.conversion import (OutputFormatPlugin,
|
||||
OptionRecommendation)
|
||||
from calibre.ptempfile import TemporaryDirectory
|
||||
from polyglot.builtins import iteritems, unicode_type
|
||||
|
||||
UNITS = ('millimeter', 'centimeter', 'point', 'inch' , 'pica' , 'didot',
|
||||
'cicero', 'devicepixel')
|
||||
|
||||
PAPER_SIZES = ('a0', 'a1', 'a2', 'a3', 'a4', 'a5', 'a6', 'b0', 'b1',
|
||||
'b2', 'b3', 'b4', 'b5', 'b6', 'legal', 'letter')
|
||||
|
||||
|
||||
class PDFOutput(OutputFormatPlugin):
|
||||
|
||||
name = 'PDF Output'
|
||||
author = 'Kovid Goyal'
|
||||
file_type = 'pdf'
|
||||
commit_name = 'pdf_output'
|
||||
ui_data = {'paper_sizes': PAPER_SIZES, 'units': UNITS, 'font_types': ('serif', 'sans', 'mono')}
|
||||
|
||||
options = {
|
||||
OptionRecommendation(name='use_profile_size', recommended_value=False,
|
||||
help=_('Instead of using the paper size specified in the PDF Output options,'
|
||||
' use a paper size corresponding to the current output profile.'
|
||||
' Useful if you want to generate a PDF for viewing on a specific device.')),
|
||||
OptionRecommendation(name='unit', recommended_value='inch',
|
||||
level=OptionRecommendation.LOW, short_switch='u', choices=UNITS,
|
||||
help=_('The unit of measure for page sizes. Default is inch. Choices '
|
||||
'are {} '
|
||||
'Note: This does not override the unit for margins!').format(', '.join(UNITS))),
|
||||
OptionRecommendation(name='paper_size', recommended_value='letter',
|
||||
level=OptionRecommendation.LOW, choices=PAPER_SIZES,
|
||||
help=_('The size of the paper. This size will be overridden when a '
|
||||
'non default output profile is used. Default is letter. Choices '
|
||||
'are {}').format(', '.join(PAPER_SIZES))),
|
||||
OptionRecommendation(name='custom_size', recommended_value=None,
|
||||
help=_('Custom size of the document. Use the form widthxheight '
|
||||
'e.g. `123x321` to specify the width and height. '
|
||||
'This overrides any specified paper-size.')),
|
||||
OptionRecommendation(name='preserve_cover_aspect_ratio',
|
||||
recommended_value=False,
|
||||
help=_('Preserve the aspect ratio of the cover, instead'
|
||||
' of stretching it to fill the full first page of the'
|
||||
' generated pdf.')),
|
||||
OptionRecommendation(name='pdf_serif_family',
|
||||
recommended_value='Times', help=_(
|
||||
'The font family used to render serif fonts. Will work only if the font is available system-wide.')),
|
||||
OptionRecommendation(name='pdf_sans_family',
|
||||
recommended_value='Helvetica', help=_(
|
||||
'The font family used to render sans-serif fonts. Will work only if the font is available system-wide.')),
|
||||
OptionRecommendation(name='pdf_mono_family',
|
||||
recommended_value='Courier', help=_(
|
||||
'The font family used to render monospace fonts. Will work only if the font is available system-wide.')),
|
||||
OptionRecommendation(name='pdf_standard_font', choices=ui_data['font_types'],
|
||||
recommended_value='serif', help=_(
|
||||
'The font family used to render monospace fonts')),
|
||||
OptionRecommendation(name='pdf_default_font_size',
|
||||
recommended_value=20, help=_(
|
||||
'The default font size')),
|
||||
OptionRecommendation(name='pdf_mono_font_size',
|
||||
recommended_value=16, help=_(
|
||||
'The default font size for monospaced text')),
|
||||
OptionRecommendation(name='pdf_hyphenate', recommended_value=False,
|
||||
help=_('Break long words at the end of lines. This can give the text at the right margin a more even appearance.')),
|
||||
OptionRecommendation(name='pdf_mark_links', recommended_value=False,
|
||||
help=_('Surround all links with a red box, useful for debugging.')),
|
||||
OptionRecommendation(name='pdf_page_numbers', recommended_value=False,
|
||||
help=_('Add page numbers to the bottom of every page in the generated PDF file. If you '
|
||||
'specify a footer template, it will take precedence '
|
||||
'over this option.')),
|
||||
OptionRecommendation(name='pdf_footer_template', recommended_value=None,
|
||||
help=_('An HTML template used to generate %s on every page.'
|
||||
' The strings _PAGENUM_, _TITLE_, _AUTHOR_ and _SECTION_ will be replaced by their current values.')%_('footers')),
|
||||
OptionRecommendation(name='pdf_header_template', recommended_value=None,
|
||||
help=_('An HTML template used to generate %s on every page.'
|
||||
' The strings _PAGENUM_, _TITLE_, _AUTHOR_ and _SECTION_ will be replaced by their current values.')%_('headers')),
|
||||
OptionRecommendation(name='pdf_add_toc', recommended_value=False,
|
||||
help=_('Add a Table of Contents at the end of the PDF that lists page numbers. '
|
||||
'Useful if you want to print out the PDF. If this PDF is intended for electronic use, use the PDF Outline instead.')),
|
||||
OptionRecommendation(name='toc_title', recommended_value=None,
|
||||
help=_('Title for generated table of contents.')
|
||||
),
|
||||
|
||||
OptionRecommendation(name='pdf_page_margin_left', recommended_value=72.0,
|
||||
level=OptionRecommendation.LOW,
|
||||
help=_('The size of the left page margin, in pts. Default is 72pt.'
|
||||
' Overrides the common left page margin setting.')
|
||||
),
|
||||
|
||||
OptionRecommendation(name='pdf_page_margin_top', recommended_value=72.0,
|
||||
level=OptionRecommendation.LOW,
|
||||
help=_('The size of the top page margin, in pts. Default is 72pt.'
|
||||
' Overrides the common top page margin setting, unless set to zero.')
|
||||
),
|
||||
|
||||
OptionRecommendation(name='pdf_page_margin_right', recommended_value=72.0,
|
||||
level=OptionRecommendation.LOW,
|
||||
help=_('The size of the right page margin, in pts. Default is 72pt.'
|
||||
' Overrides the common right page margin setting, unless set to zero.')
|
||||
),
|
||||
|
||||
OptionRecommendation(name='pdf_page_margin_bottom', recommended_value=72.0,
|
||||
level=OptionRecommendation.LOW,
|
||||
help=_('The size of the bottom page margin, in pts. Default is 72pt.'
|
||||
' Overrides the common bottom page margin setting, unless set to zero.')
|
||||
),
|
||||
OptionRecommendation(name='pdf_use_document_margins', recommended_value=False,
|
||||
help=_('Use the page margins specified in the input document via @page CSS rules.'
|
||||
' This will cause the margins specified in the conversion settings to be ignored.'
|
||||
' If the document does not specify page margins, the conversion settings will be used as a fallback.')
|
||||
),
|
||||
OptionRecommendation(name='pdf_page_number_map', recommended_value=None,
|
||||
help=_('Adjust page numbers, as needed. Syntax is a JavaScript expression for the page number.'
|
||||
' For example, "if (n < 3) 0; else n - 3;", where n is current page number.')
|
||||
),
|
||||
OptionRecommendation(name='uncompressed_pdf',
|
||||
recommended_value=False, help=_(
|
||||
'Generate an uncompressed PDF, useful for debugging.')
|
||||
),
|
||||
OptionRecommendation(name='pdf_odd_even_offset', recommended_value=0.0,
|
||||
level=OptionRecommendation.LOW,
|
||||
help=_(
|
||||
'Shift the text horizontally by the specified offset (in pts).'
|
||||
' On odd numbered pages, it is shifted to the right and on even'
|
||||
' numbered pages to the left. Use negative numbers for the opposite'
|
||||
' effect. Note that this setting is ignored on pages where the margins'
|
||||
' are smaller than the specified offset. Shifting is done by setting'
|
||||
' the PDF CropBox, not all software respects the CropBox.'
|
||||
)
|
||||
),
|
||||
|
||||
}
|
||||
|
||||
def specialize_options(self, log, opts, input_fmt):
|
||||
# Ensure Qt is setup to be used with WebEngine
|
||||
# specialize_options is called early enough in the pipeline
|
||||
# that hopefully no Qt application has been constructed as yet
|
||||
from PyQt5.QtWebEngineCore import QWebEngineUrlScheme
|
||||
from PyQt5.QtWebEngineWidgets import QWebEnginePage # noqa
|
||||
from calibre.gui2 import must_use_qt
|
||||
from calibre.constants import FAKE_PROTOCOL
|
||||
scheme = QWebEngineUrlScheme(FAKE_PROTOCOL.encode('ascii'))
|
||||
scheme.setSyntax(QWebEngineUrlScheme.Syntax.Host)
|
||||
scheme.setFlags(QWebEngineUrlScheme.SecureScheme)
|
||||
QWebEngineUrlScheme.registerScheme(scheme)
|
||||
must_use_qt()
|
||||
self.input_fmt = input_fmt
|
||||
|
||||
if opts.pdf_use_document_margins:
|
||||
# Prevent the conversion pipeline from overwriting document margins
|
||||
opts.margin_left = opts.margin_right = opts.margin_top = opts.margin_bottom = -1
|
||||
|
||||
def convert(self, oeb_book, output_path, input_plugin, opts, log):
|
||||
self.stored_page_margins = getattr(opts, '_stored_page_margins', {})
|
||||
|
||||
self.oeb = oeb_book
|
||||
self.input_plugin, self.opts, self.log = input_plugin, opts, log
|
||||
self.output_path = output_path
|
||||
from calibre.ebooks.oeb.base import OPF, OPF2_NS
|
||||
from lxml import etree
|
||||
from io import BytesIO
|
||||
package = etree.Element(OPF('package'),
|
||||
attrib={'version': '2.0', 'unique-identifier': 'dummy'},
|
||||
nsmap={None: OPF2_NS})
|
||||
from calibre.ebooks.metadata.opf2 import OPF
|
||||
self.oeb.metadata.to_opf2(package)
|
||||
self.metadata = OPF(BytesIO(etree.tostring(package))).to_book_metadata()
|
||||
self.cover_data = None
|
||||
|
||||
if input_plugin.is_image_collection:
|
||||
log.debug('Converting input as an image collection...')
|
||||
self.convert_images(input_plugin.get_images())
|
||||
else:
|
||||
log.debug('Converting input as a text based book...')
|
||||
self.convert_text(oeb_book)
|
||||
|
||||
def convert_images(self, images):
|
||||
from calibre.ebooks.pdf.image_writer import convert
|
||||
convert(images, self.output_path, self.opts, self.metadata, self.report_progress)
|
||||
|
||||
def get_cover_data(self):
|
||||
oeb = self.oeb
|
||||
if (oeb.metadata.cover and unicode_type(oeb.metadata.cover[0]) in oeb.manifest.ids):
|
||||
cover_id = unicode_type(oeb.metadata.cover[0])
|
||||
item = oeb.manifest.ids[cover_id]
|
||||
self.cover_data = item.data
|
||||
|
||||
def process_fonts(self):
|
||||
''' Make sure all fonts are embeddable '''
|
||||
from calibre.ebooks.oeb.base import urlnormalize
|
||||
from calibre.utils.fonts.utils import remove_embed_restriction
|
||||
|
||||
processed = set()
|
||||
for item in list(self.oeb.manifest):
|
||||
if not hasattr(item.data, 'cssRules'):
|
||||
continue
|
||||
for i, rule in enumerate(item.data.cssRules):
|
||||
if rule.type == rule.FONT_FACE_RULE:
|
||||
try:
|
||||
s = rule.style
|
||||
src = s.getProperty('src').propertyValue[0].uri
|
||||
except:
|
||||
continue
|
||||
path = item.abshref(src)
|
||||
ff = self.oeb.manifest.hrefs.get(urlnormalize(path), None)
|
||||
if ff is None:
|
||||
continue
|
||||
|
||||
raw = nraw = ff.data
|
||||
if path not in processed:
|
||||
processed.add(path)
|
||||
try:
|
||||
nraw = remove_embed_restriction(raw)
|
||||
except:
|
||||
continue
|
||||
if nraw != raw:
|
||||
ff.data = nraw
|
||||
self.oeb.container.write(path, nraw)
|
||||
|
||||
def convert_text(self, oeb_book):
|
||||
import json
|
||||
from calibre.ebooks.pdf.html_writer import convert
|
||||
self.get_cover_data()
|
||||
self.process_fonts()
|
||||
|
||||
if self.opts.pdf_use_document_margins and self.stored_page_margins:
|
||||
for href, margins in iteritems(self.stored_page_margins):
|
||||
item = oeb_book.manifest.hrefs.get(href)
|
||||
if item is not None:
|
||||
root = item.data
|
||||
if hasattr(root, 'xpath') and margins:
|
||||
root.set('data-calibre-pdf-output-page-margins', json.dumps(margins))
|
||||
|
||||
with TemporaryDirectory('_pdf_out') as oeb_dir:
|
||||
from calibre.customize.ui import plugin_for_output_format
|
||||
oeb_dir = os.path.realpath(oeb_dir)
|
||||
oeb_output = plugin_for_output_format('oeb')
|
||||
oeb_output.convert(oeb_book, oeb_dir, self.input_plugin, self.opts, self.log)
|
||||
opfpath = glob.glob(os.path.join(oeb_dir, '*.opf'))[0]
|
||||
convert(
|
||||
opfpath, self.opts, metadata=self.metadata, output_path=self.output_path,
|
||||
log=self.log, cover_data=self.cover_data, report_progress=self.report_progress
|
||||
)
|
||||
165
ebook_converter/ebooks/conversion/plugins/pml_input.py
Normal file
165
ebook_converter/ebooks/conversion/plugins/pml_input.py
Normal file
@@ -0,0 +1,165 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import glob
|
||||
import os
|
||||
import shutil
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin
|
||||
from calibre.ptempfile import TemporaryDirectory
|
||||
from polyglot.builtins import getcwd
|
||||
|
||||
|
||||
class PMLInput(InputFormatPlugin):
|
||||
|
||||
name = 'PML Input'
|
||||
author = 'John Schember'
|
||||
description = 'Convert PML to OEB'
|
||||
# pmlz is a zip file containing pml files and png images.
|
||||
file_types = {'pml', 'pmlz'}
|
||||
commit_name = 'pml_input'
|
||||
|
||||
def process_pml(self, pml_path, html_path, close_all=False):
|
||||
from calibre.ebooks.pml.pmlconverter import PML_HTMLizer
|
||||
|
||||
pclose = False
|
||||
hclose = False
|
||||
|
||||
if not hasattr(pml_path, 'read'):
|
||||
pml_stream = lopen(pml_path, 'rb')
|
||||
pclose = True
|
||||
else:
|
||||
pml_stream = pml_path
|
||||
pml_stream.seek(0)
|
||||
|
||||
if not hasattr(html_path, 'write'):
|
||||
html_stream = lopen(html_path, 'wb')
|
||||
hclose = True
|
||||
else:
|
||||
html_stream = html_path
|
||||
|
||||
ienc = getattr(pml_stream, 'encoding', None)
|
||||
if ienc is None:
|
||||
ienc = 'cp1252'
|
||||
if self.options.input_encoding:
|
||||
ienc = self.options.input_encoding
|
||||
|
||||
self.log.debug('Converting PML to HTML...')
|
||||
hizer = PML_HTMLizer()
|
||||
html = hizer.parse_pml(pml_stream.read().decode(ienc), html_path)
|
||||
html = '<html><head><title></title></head><body>%s</body></html>'%html
|
||||
html_stream.write(html.encode('utf-8', 'replace'))
|
||||
|
||||
if pclose:
|
||||
pml_stream.close()
|
||||
if hclose:
|
||||
html_stream.close()
|
||||
|
||||
return hizer.get_toc()
|
||||
|
||||
def get_images(self, stream, tdir, top_level=False):
|
||||
images = []
|
||||
imgs = []
|
||||
|
||||
if top_level:
|
||||
imgs = glob.glob(os.path.join(tdir, '*.png'))
|
||||
# Images not in top level try bookname_img directory because
|
||||
# that's where Dropbook likes to see them.
|
||||
if not imgs:
|
||||
if hasattr(stream, 'name'):
|
||||
imgs = glob.glob(os.path.join(tdir, os.path.splitext(os.path.basename(stream.name))[0] + '_img', '*.png'))
|
||||
# No images in Dropbook location try generic images directory
|
||||
if not imgs:
|
||||
imgs = glob.glob(os.path.join(os.path.join(tdir, 'images'), '*.png'))
|
||||
if imgs:
|
||||
os.makedirs(os.path.join(getcwd(), 'images'))
|
||||
for img in imgs:
|
||||
pimg_name = os.path.basename(img)
|
||||
pimg_path = os.path.join(getcwd(), 'images', pimg_name)
|
||||
|
||||
images.append('images/' + pimg_name)
|
||||
|
||||
shutil.copy(img, pimg_path)
|
||||
|
||||
return images
|
||||
|
||||
def convert(self, stream, options, file_ext, log,
|
||||
accelerators):
|
||||
from calibre.ebooks.metadata.toc import TOC
|
||||
from calibre.ebooks.metadata.opf2 import OPFCreator
|
||||
from calibre.utils.zipfile import ZipFile
|
||||
|
||||
self.options = options
|
||||
self.log = log
|
||||
pages, images = [], []
|
||||
toc = TOC()
|
||||
|
||||
if file_ext == 'pmlz':
|
||||
log.debug('De-compressing content to temporary directory...')
|
||||
with TemporaryDirectory('_unpmlz') as tdir:
|
||||
zf = ZipFile(stream)
|
||||
zf.extractall(tdir)
|
||||
|
||||
pmls = glob.glob(os.path.join(tdir, '*.pml'))
|
||||
for pml in pmls:
|
||||
html_name = os.path.splitext(os.path.basename(pml))[0]+'.html'
|
||||
html_path = os.path.join(getcwd(), html_name)
|
||||
|
||||
pages.append(html_name)
|
||||
log.debug('Processing PML item %s...' % pml)
|
||||
ttoc = self.process_pml(pml, html_path)
|
||||
toc += ttoc
|
||||
images = self.get_images(stream, tdir, True)
|
||||
else:
|
||||
toc = self.process_pml(stream, 'index.html')
|
||||
pages.append('index.html')
|
||||
|
||||
if hasattr(stream, 'name'):
|
||||
images = self.get_images(stream, os.path.abspath(os.path.dirname(stream.name)))
|
||||
|
||||
# We want pages to be orded alphabetically.
|
||||
pages.sort()
|
||||
|
||||
manifest_items = []
|
||||
for item in pages+images:
|
||||
manifest_items.append((item, None))
|
||||
|
||||
from calibre.ebooks.metadata.meta import get_metadata
|
||||
log.debug('Reading metadata from input file...')
|
||||
mi = get_metadata(stream, 'pml')
|
||||
if 'images/cover.png' in images:
|
||||
mi.cover = 'images/cover.png'
|
||||
opf = OPFCreator(getcwd(), mi)
|
||||
log.debug('Generating manifest...')
|
||||
opf.create_manifest(manifest_items)
|
||||
opf.create_spine(pages)
|
||||
opf.set_toc(toc)
|
||||
with lopen('metadata.opf', 'wb') as opffile:
|
||||
with lopen('toc.ncx', 'wb') as tocfile:
|
||||
opf.render(opffile, tocfile, 'toc.ncx')
|
||||
|
||||
return os.path.join(getcwd(), 'metadata.opf')
|
||||
|
||||
def postprocess_book(self, oeb, opts, log):
|
||||
from calibre.ebooks.oeb.base import XHTML, barename
|
||||
for item in oeb.spine:
|
||||
if hasattr(item.data, 'xpath'):
|
||||
for heading in item.data.iterdescendants(*map(XHTML, 'h1 h2 h3 h4 h5 h6'.split())):
|
||||
if not len(heading):
|
||||
continue
|
||||
span = heading[0]
|
||||
if not heading.text and not span.text and not len(span) and barename(span.tag) == 'span':
|
||||
if not heading.get('id') and span.get('id'):
|
||||
heading.set('id', span.get('id'))
|
||||
heading.text = span.tail
|
||||
heading.remove(span)
|
||||
if len(heading) == 1 and heading[0].get('style') == 'text-align: center; margin: auto;':
|
||||
div = heading[0]
|
||||
if barename(div.tag) == 'div' and not len(div) and not div.get('id') and not heading.get('style'):
|
||||
heading.text = (heading.text or '') + (div.text or '') + (div.tail or '')
|
||||
heading.remove(div)
|
||||
heading.set('style', 'text-align: center')
|
||||
77
ebook_converter/ebooks/conversion/plugins/pml_output.py
Normal file
77
ebook_converter/ebooks/conversion/plugins/pml_output.py
Normal file
@@ -0,0 +1,77 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os, io
|
||||
|
||||
from calibre.customize.conversion import (OutputFormatPlugin,
|
||||
OptionRecommendation)
|
||||
from calibre.ptempfile import TemporaryDirectory
|
||||
from polyglot.builtins import unicode_type
|
||||
|
||||
|
||||
class PMLOutput(OutputFormatPlugin):
|
||||
|
||||
name = 'PML Output'
|
||||
author = 'John Schember'
|
||||
file_type = 'pmlz'
|
||||
commit_name = 'pml_output'
|
||||
|
||||
options = {
|
||||
OptionRecommendation(name='pml_output_encoding', recommended_value='cp1252',
|
||||
level=OptionRecommendation.LOW,
|
||||
help=_('Specify the character encoding of the output document. '
|
||||
'The default is cp1252.')),
|
||||
OptionRecommendation(name='inline_toc',
|
||||
recommended_value=False, level=OptionRecommendation.LOW,
|
||||
help=_('Add Table of Contents to beginning of the book.')),
|
||||
OptionRecommendation(name='full_image_depth',
|
||||
recommended_value=False, level=OptionRecommendation.LOW,
|
||||
help=_('Do not reduce the size or bit depth of images. Images '
|
||||
'have their size and depth reduced by default to accommodate '
|
||||
'applications that can not convert images on their '
|
||||
'own such as Dropbook.')),
|
||||
}
|
||||
|
||||
def convert(self, oeb_book, output_path, input_plugin, opts, log):
|
||||
from calibre.ebooks.pml.pmlml import PMLMLizer
|
||||
from calibre.utils.zipfile import ZipFile
|
||||
|
||||
with TemporaryDirectory('_pmlz_output') as tdir:
|
||||
pmlmlizer = PMLMLizer(log)
|
||||
pml = unicode_type(pmlmlizer.extract_content(oeb_book, opts))
|
||||
with lopen(os.path.join(tdir, 'index.pml'), 'wb') as out:
|
||||
out.write(pml.encode(opts.pml_output_encoding, 'replace'))
|
||||
|
||||
img_path = os.path.join(tdir, 'index_img')
|
||||
if not os.path.exists(img_path):
|
||||
os.makedirs(img_path)
|
||||
self.write_images(oeb_book.manifest, pmlmlizer.image_hrefs, img_path, opts)
|
||||
|
||||
log.debug('Compressing output...')
|
||||
pmlz = ZipFile(output_path, 'w')
|
||||
pmlz.add_dir(tdir)
|
||||
|
||||
def write_images(self, manifest, image_hrefs, out_dir, opts):
|
||||
from PIL import Image
|
||||
|
||||
from calibre.ebooks.oeb.base import OEB_RASTER_IMAGES
|
||||
for item in manifest:
|
||||
if item.media_type in OEB_RASTER_IMAGES and item.href in image_hrefs.keys():
|
||||
if opts.full_image_depth:
|
||||
im = Image.open(io.BytesIO(item.data))
|
||||
else:
|
||||
im = Image.open(io.BytesIO(item.data)).convert('P')
|
||||
im.thumbnail((300,300), Image.ANTIALIAS)
|
||||
|
||||
data = io.BytesIO()
|
||||
im.save(data, 'PNG')
|
||||
data = data.getvalue()
|
||||
|
||||
path = os.path.join(out_dir, image_hrefs[item.href])
|
||||
|
||||
with lopen(path, 'wb') as out:
|
||||
out.write(data)
|
||||
28
ebook_converter/ebooks/conversion/plugins/rb_input.py
Normal file
28
ebook_converter/ebooks/conversion/plugins/rb_input.py
Normal file
@@ -0,0 +1,28 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin
|
||||
from polyglot.builtins import getcwd
|
||||
|
||||
|
||||
class RBInput(InputFormatPlugin):
|
||||
|
||||
name = 'RB Input'
|
||||
author = 'John Schember'
|
||||
description = 'Convert RB files to HTML'
|
||||
file_types = {'rb'}
|
||||
commit_name = 'rb_input'
|
||||
|
||||
def convert(self, stream, options, file_ext, log,
|
||||
accelerators):
|
||||
from calibre.ebooks.rb.reader import Reader
|
||||
|
||||
reader = Reader(stream, log, options.input_encoding)
|
||||
opf = reader.extract_content(getcwd())
|
||||
|
||||
return opf
|
||||
45
ebook_converter/ebooks/conversion/plugins/rb_output.py
Normal file
45
ebook_converter/ebooks/conversion/plugins/rb_output.py
Normal file
@@ -0,0 +1,45 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os
|
||||
|
||||
from calibre.customize.conversion import OutputFormatPlugin, OptionRecommendation
|
||||
|
||||
|
||||
class RBOutput(OutputFormatPlugin):
|
||||
|
||||
name = 'RB Output'
|
||||
author = 'John Schember'
|
||||
file_type = 'rb'
|
||||
commit_name = 'rb_output'
|
||||
|
||||
options = {
|
||||
OptionRecommendation(name='inline_toc',
|
||||
recommended_value=False, level=OptionRecommendation.LOW,
|
||||
help=_('Add Table of Contents to beginning of the book.'))}
|
||||
|
||||
def convert(self, oeb_book, output_path, input_plugin, opts, log):
|
||||
from calibre.ebooks.rb.writer import RBWriter
|
||||
|
||||
close = False
|
||||
if not hasattr(output_path, 'write'):
|
||||
close = True
|
||||
if not os.path.exists(os.path.dirname(output_path)) and os.path.dirname(output_path):
|
||||
os.makedirs(os.path.dirname(output_path))
|
||||
out_stream = lopen(output_path, 'wb')
|
||||
else:
|
||||
out_stream = output_path
|
||||
|
||||
writer = RBWriter(opts, log)
|
||||
|
||||
out_stream.seek(0)
|
||||
out_stream.truncate()
|
||||
|
||||
writer.write_content(oeb_book, out_stream, oeb_book.metadata)
|
||||
|
||||
if close:
|
||||
out_stream.close()
|
||||
169
ebook_converter/ebooks/conversion/plugins/recipe_input.py
Normal file
169
ebook_converter/ebooks/conversion/plugins/recipe_input.py
Normal file
@@ -0,0 +1,169 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
|
||||
from calibre.constants import numeric_version
|
||||
from calibre import walk
|
||||
from polyglot.builtins import unicode_type
|
||||
|
||||
|
||||
class RecipeDisabled(Exception):
|
||||
pass
|
||||
|
||||
|
||||
class RecipeInput(InputFormatPlugin):
|
||||
|
||||
name = 'Recipe Input'
|
||||
author = 'Kovid Goyal'
|
||||
description = _('Download periodical content from the internet')
|
||||
file_types = {'recipe', 'downloaded_recipe'}
|
||||
commit_name = 'recipe_input'
|
||||
|
||||
recommendations = {
|
||||
('chapter', None, OptionRecommendation.HIGH),
|
||||
('dont_split_on_page_breaks', True, OptionRecommendation.HIGH),
|
||||
('use_auto_toc', False, OptionRecommendation.HIGH),
|
||||
('input_encoding', None, OptionRecommendation.HIGH),
|
||||
('input_profile', 'default', OptionRecommendation.HIGH),
|
||||
('page_breaks_before', None, OptionRecommendation.HIGH),
|
||||
('insert_metadata', False, OptionRecommendation.HIGH),
|
||||
}
|
||||
|
||||
options = {
|
||||
OptionRecommendation(name='test', recommended_value=False,
|
||||
help=_(
|
||||
'Useful for recipe development. Forces'
|
||||
' max_articles_per_feed to 2 and downloads at most 2 feeds.'
|
||||
' You can change the number of feeds and articles by supplying optional arguments.'
|
||||
' For example: --test 3 1 will download at most 3 feeds and only 1 article per feed.')),
|
||||
OptionRecommendation(name='username', recommended_value=None,
|
||||
help=_('Username for sites that require a login to access '
|
||||
'content.')),
|
||||
OptionRecommendation(name='password', recommended_value=None,
|
||||
help=_('Password for sites that require a login to access '
|
||||
'content.')),
|
||||
OptionRecommendation(name='dont_download_recipe',
|
||||
recommended_value=False,
|
||||
help=_('Do not download latest version of builtin recipes from the calibre server')),
|
||||
OptionRecommendation(name='lrf', recommended_value=False,
|
||||
help='Optimize fetching for subsequent conversion to LRF.'),
|
||||
}
|
||||
|
||||
def convert(self, recipe_or_file, opts, file_ext, log,
|
||||
accelerators):
|
||||
from calibre.web.feeds.recipes import compile_recipe
|
||||
opts.output_profile.flow_size = 0
|
||||
if file_ext == 'downloaded_recipe':
|
||||
from calibre.utils.zipfile import ZipFile
|
||||
zf = ZipFile(recipe_or_file, 'r')
|
||||
zf.extractall()
|
||||
zf.close()
|
||||
with lopen('download.recipe', 'rb') as f:
|
||||
self.recipe_source = f.read()
|
||||
recipe = compile_recipe(self.recipe_source)
|
||||
recipe.needs_subscription = False
|
||||
self.recipe_object = recipe(opts, log, self.report_progress)
|
||||
else:
|
||||
if os.environ.get('CALIBRE_RECIPE_URN'):
|
||||
from calibre.web.feeds.recipes.collection import get_custom_recipe, get_builtin_recipe_by_id
|
||||
urn = os.environ['CALIBRE_RECIPE_URN']
|
||||
log('Downloading recipe urn: ' + urn)
|
||||
rtype, recipe_id = urn.partition(':')[::2]
|
||||
if not recipe_id:
|
||||
raise ValueError('Invalid recipe urn: ' + urn)
|
||||
if rtype == 'custom':
|
||||
self.recipe_source = get_custom_recipe(recipe_id)
|
||||
else:
|
||||
self.recipe_source = get_builtin_recipe_by_id(urn, log=log, download_recipe=True)
|
||||
if not self.recipe_source:
|
||||
raise ValueError('Could not find recipe with urn: ' + urn)
|
||||
if not isinstance(self.recipe_source, bytes):
|
||||
self.recipe_source = self.recipe_source.encode('utf-8')
|
||||
recipe = compile_recipe(self.recipe_source)
|
||||
elif os.access(recipe_or_file, os.R_OK):
|
||||
with lopen(recipe_or_file, 'rb') as f:
|
||||
self.recipe_source = f.read()
|
||||
recipe = compile_recipe(self.recipe_source)
|
||||
log('Using custom recipe')
|
||||
else:
|
||||
from calibre.web.feeds.recipes.collection import (
|
||||
get_builtin_recipe_by_title, get_builtin_recipe_titles)
|
||||
title = getattr(opts, 'original_recipe_input_arg', recipe_or_file)
|
||||
title = os.path.basename(title).rpartition('.')[0]
|
||||
titles = frozenset(get_builtin_recipe_titles())
|
||||
if title not in titles:
|
||||
title = getattr(opts, 'original_recipe_input_arg', recipe_or_file)
|
||||
title = title.rpartition('.')[0]
|
||||
|
||||
raw = get_builtin_recipe_by_title(title, log=log,
|
||||
download_recipe=not opts.dont_download_recipe)
|
||||
builtin = False
|
||||
try:
|
||||
recipe = compile_recipe(raw)
|
||||
self.recipe_source = raw
|
||||
if recipe.requires_version > numeric_version:
|
||||
log.warn(
|
||||
'Downloaded recipe needs calibre version at least: %s' %
|
||||
('.'.join(recipe.requires_version)))
|
||||
builtin = True
|
||||
except:
|
||||
log.exception('Failed to compile downloaded recipe. Falling '
|
||||
'back to builtin one')
|
||||
builtin = True
|
||||
if builtin:
|
||||
log('Using bundled builtin recipe')
|
||||
raw = get_builtin_recipe_by_title(title, log=log,
|
||||
download_recipe=False)
|
||||
if raw is None:
|
||||
raise ValueError('Failed to find builtin recipe: '+title)
|
||||
recipe = compile_recipe(raw)
|
||||
self.recipe_source = raw
|
||||
else:
|
||||
log('Using downloaded builtin recipe')
|
||||
|
||||
if recipe is None:
|
||||
raise ValueError('%r is not a valid recipe file or builtin recipe' %
|
||||
recipe_or_file)
|
||||
|
||||
disabled = getattr(recipe, 'recipe_disabled', None)
|
||||
if disabled is not None:
|
||||
raise RecipeDisabled(disabled)
|
||||
ro = recipe(opts, log, self.report_progress)
|
||||
ro.download()
|
||||
self.recipe_object = ro
|
||||
|
||||
for key, val in self.recipe_object.conversion_options.items():
|
||||
setattr(opts, key, val)
|
||||
|
||||
for f in os.listdir('.'):
|
||||
if f.endswith('.opf'):
|
||||
return os.path.abspath(f)
|
||||
|
||||
for f in walk('.'):
|
||||
if f.endswith('.opf'):
|
||||
return os.path.abspath(f)
|
||||
|
||||
def postprocess_book(self, oeb, opts, log):
|
||||
if self.recipe_object is not None:
|
||||
self.recipe_object.internal_postprocess_book(oeb, opts, log)
|
||||
self.recipe_object.postprocess_book(oeb, opts, log)
|
||||
|
||||
def specialize(self, oeb, opts, log, output_fmt):
|
||||
if opts.no_inline_navbars:
|
||||
from calibre.ebooks.oeb.base import XPath
|
||||
for item in oeb.spine:
|
||||
for div in XPath('//h:div[contains(@class, "calibre_navbar")]')(item.data):
|
||||
div.getparent().remove(div)
|
||||
|
||||
def save_download(self, zf):
|
||||
raw = self.recipe_source
|
||||
if isinstance(raw, unicode_type):
|
||||
raw = raw.encode('utf-8')
|
||||
zf.writestr('download.recipe', raw)
|
||||
323
ebook_converter/ebooks/conversion/plugins/rtf_input.py
Normal file
323
ebook_converter/ebooks/conversion/plugins/rtf_input.py
Normal file
@@ -0,0 +1,323 @@
|
||||
from __future__ import with_statement, unicode_literals
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
import os, glob, re, textwrap
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
|
||||
from polyglot.builtins import iteritems, filter, getcwd, as_bytes
|
||||
|
||||
border_style_map = {
|
||||
'single' : 'solid',
|
||||
'double-thickness-border' : 'double',
|
||||
'shadowed-border': 'outset',
|
||||
'double-border': 'double',
|
||||
'dotted-border': 'dotted',
|
||||
'dashed': 'dashed',
|
||||
'hairline': 'solid',
|
||||
'inset': 'inset',
|
||||
'dash-small': 'dashed',
|
||||
'dot-dash': 'dotted',
|
||||
'dot-dot-dash': 'dotted',
|
||||
'outset': 'outset',
|
||||
'tripple': 'double',
|
||||
'triple': 'double',
|
||||
'thick-thin-small': 'solid',
|
||||
'thin-thick-small': 'solid',
|
||||
'thin-thick-thin-small': 'solid',
|
||||
'thick-thin-medium': 'solid',
|
||||
'thin-thick-medium': 'solid',
|
||||
'thin-thick-thin-medium': 'solid',
|
||||
'thick-thin-large': 'solid',
|
||||
'thin-thick-thin-large': 'solid',
|
||||
'wavy': 'ridge',
|
||||
'double-wavy': 'ridge',
|
||||
'striped': 'ridge',
|
||||
'emboss': 'inset',
|
||||
'engrave': 'inset',
|
||||
'frame': 'ridge',
|
||||
}
|
||||
|
||||
|
||||
class RTFInput(InputFormatPlugin):
|
||||
|
||||
name = 'RTF Input'
|
||||
author = 'Kovid Goyal'
|
||||
description = 'Convert RTF files to HTML'
|
||||
file_types = {'rtf'}
|
||||
commit_name = 'rtf_input'
|
||||
|
||||
options = {
|
||||
OptionRecommendation(name='ignore_wmf', recommended_value=False,
|
||||
help=_('Ignore WMF images instead of replacing them with a placeholder image.')),
|
||||
}
|
||||
|
||||
def generate_xml(self, stream):
|
||||
from calibre.ebooks.rtf2xml.ParseRtf import ParseRtf
|
||||
ofile = u'dataxml.xml'
|
||||
run_lev, debug_dir, indent_out = 1, None, 0
|
||||
if getattr(self.opts, 'debug_pipeline', None) is not None:
|
||||
try:
|
||||
os.mkdir(u'rtfdebug')
|
||||
debug_dir = u'rtfdebug'
|
||||
run_lev = 4
|
||||
indent_out = 1
|
||||
self.log('Running RTFParser in debug mode')
|
||||
except:
|
||||
self.log.warn('Impossible to run RTFParser in debug mode')
|
||||
parser = ParseRtf(
|
||||
in_file=stream,
|
||||
out_file=ofile,
|
||||
# Convert symbol fonts to unicode equivalents. Default
|
||||
# is 1
|
||||
convert_symbol=1,
|
||||
|
||||
# Convert Zapf fonts to unicode equivalents. Default
|
||||
# is 1.
|
||||
convert_zapf=1,
|
||||
|
||||
# Convert Wingding fonts to unicode equivalents.
|
||||
# Default is 1.
|
||||
convert_wingdings=1,
|
||||
|
||||
# Convert RTF caps to real caps.
|
||||
# Default is 1.
|
||||
convert_caps=1,
|
||||
|
||||
# Indent resulting XML.
|
||||
# Default is 0 (no indent).
|
||||
indent=indent_out,
|
||||
|
||||
# Form lists from RTF. Default is 1.
|
||||
form_lists=1,
|
||||
|
||||
# Convert headings to sections. Default is 0.
|
||||
headings_to_sections=1,
|
||||
|
||||
# Group paragraphs with the same style name. Default is 1.
|
||||
group_styles=1,
|
||||
|
||||
# Group borders. Default is 1.
|
||||
group_borders=1,
|
||||
|
||||
# Write or do not write paragraphs. Default is 0.
|
||||
empty_paragraphs=1,
|
||||
|
||||
# Debug
|
||||
deb_dir=debug_dir,
|
||||
|
||||
# Default encoding
|
||||
default_encoding=getattr(self.opts, 'input_encoding', 'cp1252') or 'cp1252',
|
||||
|
||||
# Run level
|
||||
run_level=run_lev,
|
||||
)
|
||||
parser.parse_rtf()
|
||||
with open(ofile, 'rb') as f:
|
||||
return f.read()
|
||||
|
||||
def extract_images(self, picts):
|
||||
from calibre.utils.imghdr import what
|
||||
from binascii import unhexlify
|
||||
self.log('Extracting images...')
|
||||
|
||||
with open(picts, 'rb') as f:
|
||||
raw = f.read()
|
||||
picts = filter(len, re.findall(br'\{\\pict([^}]+)\}', raw))
|
||||
hex_pat = re.compile(br'[^a-fA-F0-9]')
|
||||
encs = [hex_pat.sub(b'', pict) for pict in picts]
|
||||
|
||||
count = 0
|
||||
imap = {}
|
||||
for enc in encs:
|
||||
if len(enc) % 2 == 1:
|
||||
enc = enc[:-1]
|
||||
data = unhexlify(enc)
|
||||
fmt = what(None, data)
|
||||
if fmt is None:
|
||||
fmt = 'wmf'
|
||||
count += 1
|
||||
name = u'%04d.%s' % (count, fmt)
|
||||
with open(name, 'wb') as f:
|
||||
f.write(data)
|
||||
imap[count] = name
|
||||
# with open(name+'.hex', 'wb') as f:
|
||||
# f.write(enc)
|
||||
return self.convert_images(imap)
|
||||
|
||||
def convert_images(self, imap):
|
||||
self.default_img = None
|
||||
for count, val in iteritems(imap):
|
||||
try:
|
||||
imap[count] = self.convert_image(val)
|
||||
except:
|
||||
self.log.exception('Failed to convert', val)
|
||||
return imap
|
||||
|
||||
def convert_image(self, name):
|
||||
if not name.endswith('.wmf'):
|
||||
return name
|
||||
try:
|
||||
return self.rasterize_wmf(name)
|
||||
except Exception:
|
||||
self.log.exception('Failed to convert WMF image %r'%name)
|
||||
return self.replace_wmf(name)
|
||||
|
||||
def replace_wmf(self, name):
|
||||
if self.opts.ignore_wmf:
|
||||
os.remove(name)
|
||||
return '__REMOVE_ME__'
|
||||
from calibre.ebooks.covers import message_image
|
||||
if self.default_img is None:
|
||||
self.default_img = message_image('Conversion of WMF images is not supported.'
|
||||
' Use Microsoft Word or OpenOffice to save this RTF file'
|
||||
' as HTML and convert that in calibre.')
|
||||
name = name.replace('.wmf', '.jpg')
|
||||
with lopen(name, 'wb') as f:
|
||||
f.write(self.default_img)
|
||||
return name
|
||||
|
||||
def rasterize_wmf(self, name):
|
||||
from calibre.utils.wmf.parse import wmf_unwrap
|
||||
with open(name, 'rb') as f:
|
||||
data = f.read()
|
||||
data = wmf_unwrap(data)
|
||||
name = name.replace('.wmf', '.png')
|
||||
with open(name, 'wb') as f:
|
||||
f.write(data)
|
||||
return name
|
||||
|
||||
def write_inline_css(self, ic, border_styles):
|
||||
font_size_classes = ['span.fs%d { font-size: %spt }'%(i, x) for i, x in
|
||||
enumerate(ic.font_sizes)]
|
||||
color_classes = ['span.col%d { color: %s }'%(i, x) for i, x in
|
||||
enumerate(ic.colors) if x != 'false']
|
||||
css = textwrap.dedent('''
|
||||
span.none {
|
||||
text-decoration: none; font-weight: normal;
|
||||
font-style: normal; font-variant: normal
|
||||
}
|
||||
|
||||
span.italics { font-style: italic }
|
||||
|
||||
span.bold { font-weight: bold }
|
||||
|
||||
span.small-caps { font-variant: small-caps }
|
||||
|
||||
span.underlined { text-decoration: underline }
|
||||
|
||||
span.strike-through { text-decoration: line-through }
|
||||
|
||||
''')
|
||||
css += '\n'+'\n'.join(font_size_classes)
|
||||
css += '\n' +'\n'.join(color_classes)
|
||||
|
||||
for cls, val in iteritems(border_styles):
|
||||
css += '\n\n.%s {\n%s\n}'%(cls, val)
|
||||
|
||||
with open(u'styles.css', 'ab') as f:
|
||||
f.write(css.encode('utf-8'))
|
||||
|
||||
def convert_borders(self, doc):
|
||||
border_styles = []
|
||||
style_map = {}
|
||||
for elem in doc.xpath(r'//*[local-name()="cell"]'):
|
||||
style = ['border-style: hidden', 'border-width: 1px',
|
||||
'border-color: black']
|
||||
for x in ('bottom', 'top', 'left', 'right'):
|
||||
bs = elem.get('border-cell-%s-style'%x, None)
|
||||
if bs:
|
||||
cbs = border_style_map.get(bs, 'solid')
|
||||
style.append('border-%s-style: %s'%(x, cbs))
|
||||
bw = elem.get('border-cell-%s-line-width'%x, None)
|
||||
if bw:
|
||||
style.append('border-%s-width: %spt'%(x, bw))
|
||||
bc = elem.get('border-cell-%s-color'%x, None)
|
||||
if bc:
|
||||
style.append('border-%s-color: %s'%(x, bc))
|
||||
style = ';\n'.join(style)
|
||||
if style not in border_styles:
|
||||
border_styles.append(style)
|
||||
idx = border_styles.index(style)
|
||||
cls = 'border_style%d'%idx
|
||||
style_map[cls] = style
|
||||
elem.set('class', cls)
|
||||
return style_map
|
||||
|
||||
def convert(self, stream, options, file_ext, log,
|
||||
accelerators):
|
||||
from lxml import etree
|
||||
from calibre.ebooks.metadata.meta import get_metadata
|
||||
from calibre.ebooks.metadata.opf2 import OPFCreator
|
||||
from calibre.ebooks.rtf2xml.ParseRtf import RtfInvalidCodeException
|
||||
from calibre.ebooks.rtf.input import InlineClass
|
||||
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||
self.opts = options
|
||||
self.log = log
|
||||
self.log('Converting RTF to XML...')
|
||||
try:
|
||||
xml = self.generate_xml(stream.name)
|
||||
except RtfInvalidCodeException as e:
|
||||
self.log.exception('Unable to parse RTF')
|
||||
raise ValueError(_('This RTF file has a feature calibre does not '
|
||||
'support. Convert it to HTML first and then try it.\n%s')%e)
|
||||
|
||||
d = glob.glob(os.path.join('*_rtf_pict_dir', 'picts.rtf'))
|
||||
if d:
|
||||
imap = {}
|
||||
try:
|
||||
imap = self.extract_images(d[0])
|
||||
except:
|
||||
self.log.exception('Failed to extract images...')
|
||||
|
||||
self.log('Parsing XML...')
|
||||
doc = safe_xml_fromstring(xml)
|
||||
border_styles = self.convert_borders(doc)
|
||||
for pict in doc.xpath('//rtf:pict[@num]',
|
||||
namespaces={'rtf':'http://rtf2xml.sourceforge.net/'}):
|
||||
num = int(pict.get('num'))
|
||||
name = imap.get(num, None)
|
||||
if name is not None:
|
||||
pict.set('num', name)
|
||||
|
||||
self.log('Converting XML to HTML...')
|
||||
inline_class = InlineClass(self.log)
|
||||
styledoc = safe_xml_fromstring(P('templates/rtf.xsl', data=True), recover=False)
|
||||
extensions = {('calibre', 'inline-class') : inline_class}
|
||||
transform = etree.XSLT(styledoc, extensions=extensions)
|
||||
result = transform(doc)
|
||||
html = u'index.xhtml'
|
||||
with open(html, 'wb') as f:
|
||||
res = as_bytes(transform.tostring(result))
|
||||
# res = res[:100].replace('xmlns:html', 'xmlns') + res[100:]
|
||||
# clean multiple \n
|
||||
res = re.sub(b'\n+', b'\n', res)
|
||||
# Replace newlines inserted by the 'empty_paragraphs' option in rtf2xml with html blank lines
|
||||
# res = re.sub('\s*<body>', '<body>', res)
|
||||
# res = re.sub('(?<=\n)\n{2}',
|
||||
# u'<p>\u00a0</p>\n'.encode('utf-8'), res)
|
||||
f.write(res)
|
||||
self.write_inline_css(inline_class, border_styles)
|
||||
stream.seek(0)
|
||||
mi = get_metadata(stream, 'rtf')
|
||||
if not mi.title:
|
||||
mi.title = _('Unknown')
|
||||
if not mi.authors:
|
||||
mi.authors = [_('Unknown')]
|
||||
opf = OPFCreator(getcwd(), mi)
|
||||
opf.create_manifest([(u'index.xhtml', None)])
|
||||
opf.create_spine([u'index.xhtml'])
|
||||
opf.render(open(u'metadata.opf', 'wb'))
|
||||
return os.path.abspath(u'metadata.opf')
|
||||
|
||||
def postprocess_book(self, oeb, opts, log):
|
||||
for item in oeb.spine:
|
||||
for img in item.data.xpath('//*[local-name()="img" and @src="__REMOVE_ME__"]'):
|
||||
p = img.getparent()
|
||||
idx = p.index(img)
|
||||
p.remove(img)
|
||||
if img.tail:
|
||||
if idx == 0:
|
||||
p.text = (p.text or '') + img.tail
|
||||
else:
|
||||
p[idx-1].tail = (p[idx-1].tail or '') + img.tail
|
||||
40
ebook_converter/ebooks/conversion/plugins/rtf_output.py
Normal file
40
ebook_converter/ebooks/conversion/plugins/rtf_output.py
Normal file
@@ -0,0 +1,40 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os
|
||||
|
||||
from calibre.customize.conversion import OutputFormatPlugin
|
||||
|
||||
|
||||
class RTFOutput(OutputFormatPlugin):
|
||||
|
||||
name = 'RTF Output'
|
||||
author = 'John Schember'
|
||||
file_type = 'rtf'
|
||||
commit_name = 'rtf_output'
|
||||
|
||||
def convert(self, oeb_book, output_path, input_plugin, opts, log):
|
||||
from calibre.ebooks.rtf.rtfml import RTFMLizer
|
||||
|
||||
rtfmlitzer = RTFMLizer(log)
|
||||
content = rtfmlitzer.extract_content(oeb_book, opts)
|
||||
|
||||
close = False
|
||||
if not hasattr(output_path, 'write'):
|
||||
close = True
|
||||
if not os.path.exists(os.path.dirname(output_path)) and os.path.dirname(output_path) != '':
|
||||
os.makedirs(os.path.dirname(output_path))
|
||||
out_stream = lopen(output_path, 'wb')
|
||||
else:
|
||||
out_stream = output_path
|
||||
|
||||
out_stream.seek(0)
|
||||
out_stream.truncate()
|
||||
out_stream.write(content.encode('ascii', 'replace'))
|
||||
|
||||
if close:
|
||||
out_stream.close()
|
||||
122
ebook_converter/ebooks/conversion/plugins/snb_input.py
Normal file
122
ebook_converter/ebooks/conversion/plugins/snb_input.py
Normal file
@@ -0,0 +1,122 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2010, Li Fanxi <lifanxi@freemindworld.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin
|
||||
from calibre.ptempfile import TemporaryDirectory
|
||||
from calibre.utils.filenames import ascii_filename
|
||||
from polyglot.builtins import unicode_type
|
||||
|
||||
HTML_TEMPLATE = '<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/><title>%s</title></head><body>\n%s\n</body></html>'
|
||||
|
||||
|
||||
def html_encode(s):
|
||||
return s.replace('&', '&').replace('<', '<').replace('>', '>').replace('"', '"').replace("'", ''').replace('\n', '<br/>').replace(' ', ' ') # noqa
|
||||
|
||||
|
||||
class SNBInput(InputFormatPlugin):
|
||||
|
||||
name = 'SNB Input'
|
||||
author = 'Li Fanxi'
|
||||
description = 'Convert SNB files to OEB'
|
||||
file_types = {'snb'}
|
||||
commit_name = 'snb_input'
|
||||
|
||||
options = set()
|
||||
|
||||
def convert(self, stream, options, file_ext, log,
|
||||
accelerators):
|
||||
import uuid
|
||||
|
||||
from calibre.ebooks.oeb.base import DirContainer
|
||||
from calibre.ebooks.snb.snbfile import SNBFile
|
||||
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||
|
||||
log.debug("Parsing SNB file...")
|
||||
snbFile = SNBFile()
|
||||
try:
|
||||
snbFile.Parse(stream)
|
||||
except:
|
||||
raise ValueError("Invalid SNB file")
|
||||
if not snbFile.IsValid():
|
||||
log.debug("Invalid SNB file")
|
||||
raise ValueError("Invalid SNB file")
|
||||
log.debug("Handle meta data ...")
|
||||
from calibre.ebooks.conversion.plumber import create_oebbook
|
||||
oeb = create_oebbook(log, None, options,
|
||||
encoding=options.input_encoding, populate=False)
|
||||
meta = snbFile.GetFileStream('snbf/book.snbf')
|
||||
if meta is not None:
|
||||
meta = safe_xml_fromstring(meta)
|
||||
l = {'title' : './/head/name',
|
||||
'creator' : './/head/author',
|
||||
'language' : './/head/language',
|
||||
'generator': './/head/generator',
|
||||
'publisher': './/head/publisher',
|
||||
'cover' : './/head/cover', }
|
||||
d = {}
|
||||
for item in l:
|
||||
node = meta.find(l[item])
|
||||
if node is not None:
|
||||
d[item] = node.text if node.text is not None else ''
|
||||
else:
|
||||
d[item] = ''
|
||||
|
||||
oeb.metadata.add('title', d['title'])
|
||||
oeb.metadata.add('creator', d['creator'], attrib={'role':'aut'})
|
||||
oeb.metadata.add('language', d['language'].lower().replace('_', '-'))
|
||||
oeb.metadata.add('generator', d['generator'])
|
||||
oeb.metadata.add('publisher', d['publisher'])
|
||||
if d['cover'] != '':
|
||||
oeb.guide.add('cover', 'Cover', d['cover'])
|
||||
|
||||
bookid = unicode_type(uuid.uuid4())
|
||||
oeb.metadata.add('identifier', bookid, id='uuid_id', scheme='uuid')
|
||||
for ident in oeb.metadata.identifier:
|
||||
if 'id' in ident.attrib:
|
||||
oeb.uid = oeb.metadata.identifier[0]
|
||||
break
|
||||
|
||||
with TemporaryDirectory('_snb2oeb', keep=True) as tdir:
|
||||
log.debug('Process TOC ...')
|
||||
toc = snbFile.GetFileStream('snbf/toc.snbf')
|
||||
oeb.container = DirContainer(tdir, log)
|
||||
if toc is not None:
|
||||
toc = safe_xml_fromstring(toc)
|
||||
i = 1
|
||||
for ch in toc.find('.//body'):
|
||||
chapterName = ch.text
|
||||
chapterSrc = ch.get('src')
|
||||
fname = 'ch_%d.htm' % i
|
||||
data = snbFile.GetFileStream('snbc/' + chapterSrc)
|
||||
if data is None:
|
||||
continue
|
||||
snbc = safe_xml_fromstring(data)
|
||||
lines = []
|
||||
for line in snbc.find('.//body'):
|
||||
if line.tag == 'text':
|
||||
lines.append('<p>%s</p>' % html_encode(line.text))
|
||||
elif line.tag == 'img':
|
||||
lines.append('<p><img src="%s" /></p>' % html_encode(line.text))
|
||||
with open(os.path.join(tdir, fname), 'wb') as f:
|
||||
f.write((HTML_TEMPLATE % (chapterName, '\n'.join(lines))).encode('utf-8', 'replace'))
|
||||
oeb.toc.add(ch.text, fname)
|
||||
id, href = oeb.manifest.generate(id='html',
|
||||
href=ascii_filename(fname))
|
||||
item = oeb.manifest.add(id, href, 'text/html')
|
||||
item.html_input_href = fname
|
||||
oeb.spine.add(item, True)
|
||||
i = i + 1
|
||||
imageFiles = snbFile.OutputImageFiles(tdir)
|
||||
for f, m in imageFiles:
|
||||
id, href = oeb.manifest.generate(id='image',
|
||||
href=ascii_filename(f))
|
||||
item = oeb.manifest.add(id, href, m)
|
||||
item.html_input_href = f
|
||||
|
||||
return oeb
|
||||
269
ebook_converter/ebooks/conversion/plugins/snb_output.py
Normal file
269
ebook_converter/ebooks/conversion/plugins/snb_output.py
Normal file
@@ -0,0 +1,269 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2010, Li Fanxi <lifanxi@freemindworld.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os
|
||||
|
||||
from calibre.customize.conversion import OutputFormatPlugin, OptionRecommendation
|
||||
from calibre.ptempfile import TemporaryDirectory
|
||||
from calibre.constants import __appname__, __version__
|
||||
from polyglot.builtins import unicode_type
|
||||
|
||||
|
||||
class SNBOutput(OutputFormatPlugin):
|
||||
|
||||
name = 'SNB Output'
|
||||
author = 'Li Fanxi'
|
||||
file_type = 'snb'
|
||||
commit_name = 'snb_output'
|
||||
|
||||
options = {
|
||||
OptionRecommendation(name='snb_output_encoding', recommended_value='utf-8',
|
||||
level=OptionRecommendation.LOW,
|
||||
help=_('Specify the character encoding of the output document. '
|
||||
'The default is utf-8.')),
|
||||
OptionRecommendation(name='snb_max_line_length',
|
||||
recommended_value=0, level=OptionRecommendation.LOW,
|
||||
help=_('The maximum number of characters per line. This splits on '
|
||||
'the first space before the specified value. If no space is found '
|
||||
'the line will be broken at the space after and will exceed the '
|
||||
'specified value. Also, there is a minimum of 25 characters. '
|
||||
'Use 0 to disable line splitting.')),
|
||||
OptionRecommendation(name='snb_insert_empty_line',
|
||||
recommended_value=False, level=OptionRecommendation.LOW,
|
||||
help=_('Specify whether or not to insert an empty line between '
|
||||
'two paragraphs.')),
|
||||
OptionRecommendation(name='snb_dont_indent_first_line',
|
||||
recommended_value=False, level=OptionRecommendation.LOW,
|
||||
help=_('Specify whether or not to insert two space characters '
|
||||
'to indent the first line of each paragraph.')),
|
||||
OptionRecommendation(name='snb_hide_chapter_name',
|
||||
recommended_value=False, level=OptionRecommendation.LOW,
|
||||
help=_('Specify whether or not to hide the chapter title for each '
|
||||
'chapter. Useful for image-only output (eg. comics).')),
|
||||
OptionRecommendation(name='snb_full_screen',
|
||||
recommended_value=False, level=OptionRecommendation.LOW,
|
||||
help=_('Resize all the images for full screen view. ')),
|
||||
}
|
||||
|
||||
def convert(self, oeb_book, output_path, input_plugin, opts, log):
|
||||
from lxml import etree
|
||||
from calibre.ebooks.snb.snbfile import SNBFile
|
||||
from calibre.ebooks.snb.snbml import SNBMLizer, ProcessFileName
|
||||
|
||||
self.opts = opts
|
||||
from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer, Unavailable
|
||||
try:
|
||||
rasterizer = SVGRasterizer()
|
||||
rasterizer(oeb_book, opts)
|
||||
except Unavailable:
|
||||
log.warn('SVG rasterizer unavailable, SVG will not be converted')
|
||||
|
||||
# Create temp dir
|
||||
with TemporaryDirectory('_snb_output') as tdir:
|
||||
# Create stub directories
|
||||
snbfDir = os.path.join(tdir, 'snbf')
|
||||
snbcDir = os.path.join(tdir, 'snbc')
|
||||
snbiDir = os.path.join(tdir, 'snbc/images')
|
||||
os.mkdir(snbfDir)
|
||||
os.mkdir(snbcDir)
|
||||
os.mkdir(snbiDir)
|
||||
|
||||
# Process Meta data
|
||||
meta = oeb_book.metadata
|
||||
if meta.title:
|
||||
title = unicode_type(meta.title[0])
|
||||
else:
|
||||
title = ''
|
||||
authors = [unicode_type(x) for x in meta.creator if x.role == 'aut']
|
||||
if meta.publisher:
|
||||
publishers = unicode_type(meta.publisher[0])
|
||||
else:
|
||||
publishers = ''
|
||||
if meta.language:
|
||||
lang = unicode_type(meta.language[0]).upper()
|
||||
else:
|
||||
lang = ''
|
||||
if meta.description:
|
||||
abstract = unicode_type(meta.description[0])
|
||||
else:
|
||||
abstract = ''
|
||||
|
||||
# Process Cover
|
||||
g, m, s = oeb_book.guide, oeb_book.manifest, oeb_book.spine
|
||||
href = None
|
||||
if 'titlepage' not in g:
|
||||
if 'cover' in g:
|
||||
href = g['cover'].href
|
||||
|
||||
# Output book info file
|
||||
bookInfoTree = etree.Element("book-snbf", version="1.0")
|
||||
headTree = etree.SubElement(bookInfoTree, "head")
|
||||
etree.SubElement(headTree, "name").text = title
|
||||
etree.SubElement(headTree, "author").text = ' '.join(authors)
|
||||
etree.SubElement(headTree, "language").text = lang
|
||||
etree.SubElement(headTree, "rights")
|
||||
etree.SubElement(headTree, "publisher").text = publishers
|
||||
etree.SubElement(headTree, "generator").text = __appname__ + ' ' + __version__
|
||||
etree.SubElement(headTree, "created")
|
||||
etree.SubElement(headTree, "abstract").text = abstract
|
||||
if href is not None:
|
||||
etree.SubElement(headTree, "cover").text = ProcessFileName(href)
|
||||
else:
|
||||
etree.SubElement(headTree, "cover")
|
||||
with open(os.path.join(snbfDir, 'book.snbf'), 'wb') as f:
|
||||
f.write(etree.tostring(bookInfoTree, pretty_print=True, encoding='utf-8'))
|
||||
|
||||
# Output TOC
|
||||
tocInfoTree = etree.Element("toc-snbf")
|
||||
tocHead = etree.SubElement(tocInfoTree, "head")
|
||||
tocBody = etree.SubElement(tocInfoTree, "body")
|
||||
outputFiles = {}
|
||||
if oeb_book.toc.count() == 0:
|
||||
log.warn('This SNB file has no Table of Contents. '
|
||||
'Creating a default TOC')
|
||||
first = next(iter(oeb_book.spine))
|
||||
oeb_book.toc.add(_('Start page'), first.href)
|
||||
else:
|
||||
first = next(iter(oeb_book.spine))
|
||||
if oeb_book.toc[0].href != first.href:
|
||||
# The pages before the fist item in toc will be stored as
|
||||
# "Cover Pages".
|
||||
# oeb_book.toc does not support "insert", so we generate
|
||||
# the tocInfoTree directly instead of modifying the toc
|
||||
ch = etree.SubElement(tocBody, "chapter")
|
||||
ch.set("src", ProcessFileName(first.href) + ".snbc")
|
||||
ch.text = _('Cover pages')
|
||||
outputFiles[first.href] = []
|
||||
outputFiles[first.href].append(("", _("Cover pages")))
|
||||
|
||||
for tocitem in oeb_book.toc:
|
||||
if tocitem.href.find('#') != -1:
|
||||
item = tocitem.href.split('#')
|
||||
if len(item) != 2:
|
||||
log.error('Error in TOC item: %s' % tocitem)
|
||||
else:
|
||||
if item[0] in outputFiles:
|
||||
outputFiles[item[0]].append((item[1], tocitem.title))
|
||||
else:
|
||||
outputFiles[item[0]] = []
|
||||
if "" not in outputFiles[item[0]]:
|
||||
outputFiles[item[0]].append(("", tocitem.title + _(" (Preface)")))
|
||||
ch = etree.SubElement(tocBody, "chapter")
|
||||
ch.set("src", ProcessFileName(item[0]) + ".snbc")
|
||||
ch.text = tocitem.title + _(" (Preface)")
|
||||
outputFiles[item[0]].append((item[1], tocitem.title))
|
||||
else:
|
||||
if tocitem.href in outputFiles:
|
||||
outputFiles[tocitem.href].append(("", tocitem.title))
|
||||
else:
|
||||
outputFiles[tocitem.href] = []
|
||||
outputFiles[tocitem.href].append(("", tocitem.title))
|
||||
ch = etree.SubElement(tocBody, "chapter")
|
||||
ch.set("src", ProcessFileName(tocitem.href) + ".snbc")
|
||||
ch.text = tocitem.title
|
||||
|
||||
etree.SubElement(tocHead, "chapters").text = '%d' % len(tocBody)
|
||||
|
||||
with open(os.path.join(snbfDir, 'toc.snbf'), 'wb') as f:
|
||||
f.write(etree.tostring(tocInfoTree, pretty_print=True, encoding='utf-8'))
|
||||
|
||||
# Output Files
|
||||
oldTree = None
|
||||
mergeLast = False
|
||||
lastName = None
|
||||
for item in s:
|
||||
from calibre.ebooks.oeb.base import OEB_DOCS, OEB_IMAGES
|
||||
if m.hrefs[item.href].media_type in OEB_DOCS:
|
||||
if item.href not in outputFiles:
|
||||
log.debug('File %s is unused in TOC. Continue in last chapter' % item.href)
|
||||
mergeLast = True
|
||||
else:
|
||||
if oldTree is not None and mergeLast:
|
||||
log.debug('Output the modified chapter again: %s' % lastName)
|
||||
with open(os.path.join(snbcDir, lastName), 'wb') as f:
|
||||
f.write(etree.tostring(oldTree, pretty_print=True, encoding='utf-8'))
|
||||
mergeLast = False
|
||||
|
||||
log.debug('Converting %s to snbc...' % item.href)
|
||||
snbwriter = SNBMLizer(log)
|
||||
snbcTrees = None
|
||||
if not mergeLast:
|
||||
snbcTrees = snbwriter.extract_content(oeb_book, item, outputFiles[item.href], opts)
|
||||
for subName in snbcTrees:
|
||||
postfix = ''
|
||||
if subName != '':
|
||||
postfix = '_' + subName
|
||||
lastName = ProcessFileName(item.href + postfix + ".snbc")
|
||||
oldTree = snbcTrees[subName]
|
||||
with open(os.path.join(snbcDir, lastName), 'wb') as f:
|
||||
f.write(etree.tostring(oldTree, pretty_print=True, encoding='utf-8'))
|
||||
else:
|
||||
log.debug('Merge %s with last TOC item...' % item.href)
|
||||
snbwriter.merge_content(oldTree, oeb_book, item, [('', _("Start"))], opts)
|
||||
|
||||
# Output the last one if needed
|
||||
log.debug('Output the last modified chapter again: %s' % lastName)
|
||||
if oldTree is not None and mergeLast:
|
||||
with open(os.path.join(snbcDir, lastName), 'wb') as f:
|
||||
f.write(etree.tostring(oldTree, pretty_print=True, encoding='utf-8'))
|
||||
mergeLast = False
|
||||
|
||||
for item in m:
|
||||
if m.hrefs[item.href].media_type in OEB_IMAGES:
|
||||
log.debug('Converting image: %s ...' % item.href)
|
||||
content = m.hrefs[item.href].data
|
||||
# Convert & Resize image
|
||||
self.HandleImage(content, os.path.join(snbiDir, ProcessFileName(item.href)))
|
||||
|
||||
# Package as SNB File
|
||||
snbFile = SNBFile()
|
||||
snbFile.FromDir(tdir)
|
||||
snbFile.Output(output_path)
|
||||
|
||||
def HandleImage(self, imageData, imagePath):
|
||||
from calibre.utils.img import image_from_data, resize_image, image_to_data
|
||||
img = image_from_data(imageData)
|
||||
x, y = img.width(), img.height()
|
||||
if self.opts:
|
||||
if self.opts.snb_full_screen:
|
||||
SCREEN_X, SCREEN_Y = self.opts.output_profile.screen_size
|
||||
else:
|
||||
SCREEN_X, SCREEN_Y = self.opts.output_profile.comic_screen_size
|
||||
else:
|
||||
SCREEN_X = 540
|
||||
SCREEN_Y = 700
|
||||
# Handle big image only
|
||||
if x > SCREEN_X or y > SCREEN_Y:
|
||||
xScale = float(x) / SCREEN_X
|
||||
yScale = float(y) / SCREEN_Y
|
||||
scale = max(xScale, yScale)
|
||||
# TODO : intelligent image rotation
|
||||
# img = img.rotate(90)
|
||||
# x,y = y,x
|
||||
img = resize_image(img, x // scale, y // scale)
|
||||
with lopen(imagePath, 'wb') as f:
|
||||
f.write(image_to_data(img, fmt=imagePath.rpartition('.')[-1]))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
from calibre.ebooks.oeb.reader import OEBReader
|
||||
from calibre.ebooks.oeb.base import OEBBook
|
||||
from calibre.ebooks.conversion.preprocess import HTMLPreProcessor
|
||||
from calibre.customize.profiles import HanlinV3Output
|
||||
|
||||
class OptionValues(object):
|
||||
pass
|
||||
|
||||
opts = OptionValues()
|
||||
opts.output_profile = HanlinV3Output(None)
|
||||
|
||||
html_preprocessor = HTMLPreProcessor(None, None, opts)
|
||||
from calibre.utils.logging import default_log
|
||||
oeb = OEBBook(default_log, html_preprocessor)
|
||||
reader = OEBReader
|
||||
reader()(oeb, '/tmp/bbb/processed/')
|
||||
SNBOutput(None).convert(oeb, '/tmp/test.snb', None, None, default_log)
|
||||
39
ebook_converter/ebooks/conversion/plugins/tcr_input.py
Normal file
39
ebook_converter/ebooks/conversion/plugins/tcr_input.py
Normal file
@@ -0,0 +1,39 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
from io import BytesIO
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin
|
||||
|
||||
|
||||
class TCRInput(InputFormatPlugin):
|
||||
|
||||
name = 'TCR Input'
|
||||
author = 'John Schember'
|
||||
description = 'Convert TCR files to HTML'
|
||||
file_types = {'tcr'}
|
||||
commit_name = 'tcr_input'
|
||||
|
||||
def convert(self, stream, options, file_ext, log, accelerators):
|
||||
from calibre.ebooks.compression.tcr import decompress
|
||||
|
||||
log.info('Decompressing text...')
|
||||
raw_txt = decompress(stream)
|
||||
|
||||
log.info('Converting text to OEB...')
|
||||
stream = BytesIO(raw_txt)
|
||||
|
||||
from calibre.customize.ui import plugin_for_input_format
|
||||
|
||||
txt_plugin = plugin_for_input_format('txt')
|
||||
for opt in txt_plugin.options:
|
||||
if not hasattr(self.options, opt.option.name):
|
||||
setattr(options, opt.option.name, opt.recommended_value)
|
||||
|
||||
stream.seek(0)
|
||||
return txt_plugin.convert(stream, options,
|
||||
'txt', log, accelerators)
|
||||
56
ebook_converter/ebooks/conversion/plugins/tcr_output.py
Normal file
56
ebook_converter/ebooks/conversion/plugins/tcr_output.py
Normal file
@@ -0,0 +1,56 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os
|
||||
|
||||
from calibre.customize.conversion import OutputFormatPlugin, \
|
||||
OptionRecommendation
|
||||
|
||||
|
||||
class TCROutput(OutputFormatPlugin):
|
||||
|
||||
name = 'TCR Output'
|
||||
author = 'John Schember'
|
||||
file_type = 'tcr'
|
||||
commit_name = 'tcr_output'
|
||||
|
||||
options = {
|
||||
OptionRecommendation(name='tcr_output_encoding', recommended_value='utf-8',
|
||||
level=OptionRecommendation.LOW,
|
||||
help=_('Specify the character encoding of the output document. '
|
||||
'The default is utf-8.'))}
|
||||
|
||||
def convert(self, oeb_book, output_path, input_plugin, opts, log):
|
||||
from calibre.ebooks.txt.txtml import TXTMLizer
|
||||
from calibre.ebooks.compression.tcr import compress
|
||||
|
||||
close = False
|
||||
if not hasattr(output_path, 'write'):
|
||||
close = True
|
||||
if not os.path.exists(os.path.dirname(output_path)) and os.path.dirname(output_path):
|
||||
os.makedirs(os.path.dirname(output_path))
|
||||
out_stream = lopen(output_path, 'wb')
|
||||
else:
|
||||
out_stream = output_path
|
||||
|
||||
setattr(opts, 'flush_paras', False)
|
||||
setattr(opts, 'max_line_length', 0)
|
||||
setattr(opts, 'force_max_line_length', False)
|
||||
setattr(opts, 'indent_paras', False)
|
||||
|
||||
writer = TXTMLizer(log)
|
||||
txt = writer.extract_content(oeb_book, opts).encode(opts.tcr_output_encoding, 'replace')
|
||||
|
||||
log.info('Compressing text...')
|
||||
txt = compress(txt)
|
||||
|
||||
out_stream.seek(0)
|
||||
out_stream.truncate()
|
||||
out_stream.write(txt)
|
||||
|
||||
if close:
|
||||
out_stream.close()
|
||||
308
ebook_converter/ebooks/conversion/plugins/txt_input.py
Normal file
308
ebook_converter/ebooks/conversion/plugins/txt_input.py
Normal file
@@ -0,0 +1,308 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os
|
||||
|
||||
from calibre import _ent_pat, walk, xml_entity_to_unicode
|
||||
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
|
||||
from polyglot.builtins import getcwd
|
||||
|
||||
MD_EXTENSIONS = {
|
||||
'abbr': _('Abbreviations'),
|
||||
'admonition': _('Support admonitions'),
|
||||
'attr_list': _('Add attribute to HTML tags'),
|
||||
'codehilite': _('Add code highlighting via Pygments'),
|
||||
'def_list': _('Definition lists'),
|
||||
'extra': _('Enables various common extensions'),
|
||||
'fenced_code': _('Alternative code block syntax'),
|
||||
'footnotes': _('Footnotes'),
|
||||
'legacy_attrs': _('Use legacy element attributes'),
|
||||
'legacy_em': _('Use legacy underscore handling for connected words'),
|
||||
'meta': _('Metadata in the document'),
|
||||
'nl2br': _('Treat newlines as hard breaks'),
|
||||
'sane_lists': _('Do not allow mixing list types'),
|
||||
'smarty': _('Use markdown\'s internal smartypants parser'),
|
||||
'tables': _('Support tables'),
|
||||
'toc': _('Generate a table of contents'),
|
||||
'wikilinks': _('Wiki style links'),
|
||||
}
|
||||
|
||||
|
||||
class TXTInput(InputFormatPlugin):
|
||||
|
||||
name = 'TXT Input'
|
||||
author = 'John Schember'
|
||||
description = 'Convert TXT files to HTML'
|
||||
file_types = {'txt', 'txtz', 'text', 'md', 'textile', 'markdown'}
|
||||
commit_name = 'txt_input'
|
||||
ui_data = {
|
||||
'md_extensions': MD_EXTENSIONS,
|
||||
'paragraph_types': {
|
||||
'auto': _('Try to auto detect paragraph type'),
|
||||
'block': _('Treat a blank line as a paragraph break'),
|
||||
'single': _('Assume every line is a paragraph'),
|
||||
'print': _('Assume every line starting with 2+ spaces or a tab starts a paragraph'),
|
||||
'unformatted': _('Most lines have hard line breaks, few/no blank lines or indents'),
|
||||
'off': _('Don\'t modify the paragraph structure'),
|
||||
},
|
||||
'formatting_types': {
|
||||
'auto': _('Automatically decide which formatting processor to use'),
|
||||
'plain': _('No formatting'),
|
||||
'heuristic': _('Use heuristics to determine chapter headings, italics, etc.'),
|
||||
'textile': _('Use the TexTile markup language'),
|
||||
'markdown': _('Use the Markdown markup language')
|
||||
},
|
||||
}
|
||||
|
||||
options = {
|
||||
OptionRecommendation(name='formatting_type', recommended_value='auto',
|
||||
choices=list(ui_data['formatting_types']),
|
||||
help=_('Formatting used within the document.\n'
|
||||
'* auto: {auto}\n'
|
||||
'* plain: {plain}\n'
|
||||
'* heuristic: {heuristic}\n'
|
||||
'* textile: {textile}\n'
|
||||
'* markdown: {markdown}\n'
|
||||
'To learn more about markdown see {url}').format(
|
||||
url='https://daringfireball.net/projects/markdown/', **ui_data['formatting_types'])
|
||||
),
|
||||
OptionRecommendation(name='paragraph_type', recommended_value='auto',
|
||||
choices=list(ui_data['paragraph_types']),
|
||||
help=_('Paragraph structure to assume. The value of "off" is useful for formatted documents such as Markdown or Textile. '
|
||||
'Choices are:\n'
|
||||
'* auto: {auto}\n'
|
||||
'* block: {block}\n'
|
||||
'* single: {single}\n'
|
||||
'* print: {print}\n'
|
||||
'* unformatted: {unformatted}\n'
|
||||
'* off: {off}').format(**ui_data['paragraph_types'])
|
||||
),
|
||||
OptionRecommendation(name='preserve_spaces', recommended_value=False,
|
||||
help=_('Normally extra spaces are condensed into a single space. '
|
||||
'With this option all spaces will be displayed.')),
|
||||
OptionRecommendation(name='txt_in_remove_indents', recommended_value=False,
|
||||
help=_('Normally extra space at the beginning of lines is retained. '
|
||||
'With this option they will be removed.')),
|
||||
OptionRecommendation(name="markdown_extensions", recommended_value='footnotes, tables, toc',
|
||||
help=_('Enable extensions to markdown syntax. Extensions are formatting that is not part '
|
||||
'of the standard markdown format. The extensions enabled by default: %default.\n'
|
||||
'To learn more about markdown extensions, see {}\n'
|
||||
'This should be a comma separated list of extensions to enable:\n'
|
||||
).format('https://python-markdown.github.io/extensions/') + '\n'.join('* %s: %s' % (k, MD_EXTENSIONS[k]) for k in sorted(MD_EXTENSIONS))),
|
||||
}
|
||||
|
||||
def shift_file(self, fname, data):
|
||||
name, ext = os.path.splitext(fname)
|
||||
candidate = os.path.join(self.output_dir, fname)
|
||||
c = 0
|
||||
while os.path.exists(candidate):
|
||||
c += 1
|
||||
candidate = os.path.join(self.output_dir, '{}-{}{}'.format(name, c, ext))
|
||||
ans = candidate
|
||||
with open(ans, 'wb') as f:
|
||||
f.write(data)
|
||||
return f.name
|
||||
|
||||
def fix_resources(self, html, base_dir):
|
||||
from html5_parser import parse
|
||||
root = parse(html)
|
||||
changed = False
|
||||
for img in root.xpath('//img[@src]'):
|
||||
src = img.get('src')
|
||||
prefix = src.split(':', 1)[0].lower()
|
||||
if prefix not in ('file', 'http', 'https', 'ftp') and not os.path.isabs(src):
|
||||
src = os.path.join(base_dir, src)
|
||||
if os.access(src, os.R_OK):
|
||||
with open(src, 'rb') as f:
|
||||
data = f.read()
|
||||
f = self.shift_file(os.path.basename(src), data)
|
||||
changed = True
|
||||
img.set('src', os.path.basename(f))
|
||||
if changed:
|
||||
from lxml import etree
|
||||
html = etree.tostring(root, encoding='unicode')
|
||||
return html
|
||||
|
||||
def convert(self, stream, options, file_ext, log,
|
||||
accelerators):
|
||||
from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
|
||||
from calibre.ebooks.chardet import detect
|
||||
from calibre.utils.zipfile import ZipFile
|
||||
from calibre.ebooks.txt.processor import (convert_basic,
|
||||
convert_markdown_with_metadata, separate_paragraphs_single_line,
|
||||
separate_paragraphs_print_formatted, preserve_spaces,
|
||||
detect_paragraph_type, detect_formatting_type,
|
||||
normalize_line_endings, convert_textile, remove_indents,
|
||||
block_to_single_line, separate_hard_scene_breaks)
|
||||
|
||||
self.log = log
|
||||
txt = b''
|
||||
log.debug('Reading text from file...')
|
||||
length = 0
|
||||
base_dir = self.output_dir = getcwd()
|
||||
|
||||
# Extract content from zip archive.
|
||||
if file_ext == 'txtz':
|
||||
zf = ZipFile(stream)
|
||||
zf.extractall('.')
|
||||
|
||||
for x in walk('.'):
|
||||
if os.path.splitext(x)[1].lower() in ('.txt', '.text'):
|
||||
with open(x, 'rb') as tf:
|
||||
txt += tf.read() + b'\n\n'
|
||||
else:
|
||||
if getattr(stream, 'name', None):
|
||||
base_dir = os.path.dirname(stream.name)
|
||||
txt = stream.read()
|
||||
if file_ext in {'md', 'textile', 'markdown'}:
|
||||
options.formatting_type = {'md': 'markdown'}.get(file_ext, file_ext)
|
||||
log.info('File extension indicates particular formatting. '
|
||||
'Forcing formatting type to: %s'%options.formatting_type)
|
||||
options.paragraph_type = 'off'
|
||||
|
||||
# Get the encoding of the document.
|
||||
if options.input_encoding:
|
||||
ienc = options.input_encoding
|
||||
log.debug('Using user specified input encoding of %s' % ienc)
|
||||
else:
|
||||
det_encoding = detect(txt[:4096])
|
||||
det_encoding, confidence = det_encoding['encoding'], det_encoding['confidence']
|
||||
if det_encoding and det_encoding.lower().replace('_', '-').strip() in (
|
||||
'gb2312', 'chinese', 'csiso58gb231280', 'euc-cn', 'euccn',
|
||||
'eucgb2312-cn', 'gb2312-1980', 'gb2312-80', 'iso-ir-58'):
|
||||
# Microsoft Word exports to HTML with encoding incorrectly set to
|
||||
# gb2312 instead of gbk. gbk is a superset of gb2312, anyway.
|
||||
det_encoding = 'gbk'
|
||||
ienc = det_encoding
|
||||
log.debug('Detected input encoding as %s with a confidence of %s%%' % (ienc, confidence * 100))
|
||||
if not ienc:
|
||||
ienc = 'utf-8'
|
||||
log.debug('No input encoding specified and could not auto detect using %s' % ienc)
|
||||
# Remove BOM from start of txt as its presence can confuse markdown
|
||||
import codecs
|
||||
for bom in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE, codecs.BOM_UTF8, codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE):
|
||||
if txt.startswith(bom):
|
||||
txt = txt[len(bom):]
|
||||
break
|
||||
txt = txt.decode(ienc, 'replace')
|
||||
|
||||
# Replace entities
|
||||
txt = _ent_pat.sub(xml_entity_to_unicode, txt)
|
||||
|
||||
# Normalize line endings
|
||||
txt = normalize_line_endings(txt)
|
||||
|
||||
# Determine the paragraph type of the document.
|
||||
if options.paragraph_type == 'auto':
|
||||
options.paragraph_type = detect_paragraph_type(txt)
|
||||
if options.paragraph_type == 'unknown':
|
||||
log.debug('Could not reliably determine paragraph type using block')
|
||||
options.paragraph_type = 'block'
|
||||
else:
|
||||
log.debug('Auto detected paragraph type as %s' % options.paragraph_type)
|
||||
|
||||
# Detect formatting
|
||||
if options.formatting_type == 'auto':
|
||||
options.formatting_type = detect_formatting_type(txt)
|
||||
log.debug('Auto detected formatting as %s' % options.formatting_type)
|
||||
|
||||
if options.formatting_type == 'heuristic':
|
||||
setattr(options, 'enable_heuristics', True)
|
||||
setattr(options, 'unwrap_lines', False)
|
||||
setattr(options, 'smarten_punctuation', True)
|
||||
|
||||
# Reformat paragraphs to block formatting based on the detected type.
|
||||
# We don't check for block because the processor assumes block.
|
||||
# single and print at transformed to block for processing.
|
||||
if options.paragraph_type == 'single':
|
||||
txt = separate_paragraphs_single_line(txt)
|
||||
elif options.paragraph_type == 'print':
|
||||
txt = separate_hard_scene_breaks(txt)
|
||||
txt = separate_paragraphs_print_formatted(txt)
|
||||
txt = block_to_single_line(txt)
|
||||
elif options.paragraph_type == 'unformatted':
|
||||
from calibre.ebooks.conversion.utils import HeuristicProcessor
|
||||
# unwrap lines based on punctuation
|
||||
docanalysis = DocAnalysis('txt', txt)
|
||||
length = docanalysis.line_length(.5)
|
||||
preprocessor = HeuristicProcessor(options, log=getattr(self, 'log', None))
|
||||
txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
|
||||
txt = separate_paragraphs_single_line(txt)
|
||||
elif options.paragraph_type == 'block':
|
||||
txt = separate_hard_scene_breaks(txt)
|
||||
txt = block_to_single_line(txt)
|
||||
|
||||
if getattr(options, 'enable_heuristics', False) and getattr(options, 'dehyphenate', False):
|
||||
docanalysis = DocAnalysis('txt', txt)
|
||||
if not length:
|
||||
length = docanalysis.line_length(.5)
|
||||
dehyphenator = Dehyphenator(options.verbose, log=self.log)
|
||||
txt = dehyphenator(txt,'txt', length)
|
||||
|
||||
# User requested transformation on the text.
|
||||
if options.txt_in_remove_indents:
|
||||
txt = remove_indents(txt)
|
||||
|
||||
# Preserve spaces will replace multiple spaces to a space
|
||||
# followed by the entity.
|
||||
if options.preserve_spaces:
|
||||
txt = preserve_spaces(txt)
|
||||
|
||||
# Process the text using the appropriate text processor.
|
||||
self.shifted_files = []
|
||||
try:
|
||||
html = ''
|
||||
input_mi = None
|
||||
if options.formatting_type == 'markdown':
|
||||
log.debug('Running text through markdown conversion...')
|
||||
try:
|
||||
input_mi, html = convert_markdown_with_metadata(txt, extensions=[x.strip() for x in options.markdown_extensions.split(',') if x.strip()])
|
||||
except RuntimeError:
|
||||
raise ValueError('This txt file has malformed markup, it cannot be'
|
||||
' converted by calibre. See https://daringfireball.net/projects/markdown/syntax')
|
||||
html = self.fix_resources(html, base_dir)
|
||||
elif options.formatting_type == 'textile':
|
||||
log.debug('Running text through textile conversion...')
|
||||
html = convert_textile(txt)
|
||||
html = self.fix_resources(html, base_dir)
|
||||
else:
|
||||
log.debug('Running text through basic conversion...')
|
||||
flow_size = getattr(options, 'flow_size', 0)
|
||||
html = convert_basic(txt, epub_split_size_kb=flow_size)
|
||||
|
||||
# Run the HTMLized text through the html processing plugin.
|
||||
from calibre.customize.ui import plugin_for_input_format
|
||||
html_input = plugin_for_input_format('html')
|
||||
for opt in html_input.options:
|
||||
setattr(options, opt.option.name, opt.recommended_value)
|
||||
options.input_encoding = 'utf-8'
|
||||
htmlfile = self.shift_file('index.html', html.encode('utf-8'))
|
||||
odi = options.debug_pipeline
|
||||
options.debug_pipeline = None
|
||||
# Generate oeb from html conversion.
|
||||
oeb = html_input.convert(open(htmlfile, 'rb'), options, 'html', log, {})
|
||||
options.debug_pipeline = odi
|
||||
finally:
|
||||
for x in self.shifted_files:
|
||||
os.remove(x)
|
||||
|
||||
# Set metadata from file.
|
||||
if input_mi is None:
|
||||
from calibre.customize.ui import get_file_type_metadata
|
||||
input_mi = get_file_type_metadata(stream, file_ext)
|
||||
from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
|
||||
meta_info_to_oeb_metadata(input_mi, oeb.metadata, log)
|
||||
self.html_postprocess_title = input_mi.title
|
||||
|
||||
return oeb
|
||||
|
||||
def postprocess_book(self, oeb, opts, log):
|
||||
for item in oeb.spine:
|
||||
if hasattr(item.data, 'xpath'):
|
||||
for title in item.data.xpath('//*[local-name()="title"]'):
|
||||
if title.text == _('Unknown'):
|
||||
title.text = self.html_postprocess_title
|
||||
165
ebook_converter/ebooks/conversion/plugins/txt_output.py
Normal file
165
ebook_converter/ebooks/conversion/plugins/txt_output.py
Normal file
@@ -0,0 +1,165 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os
|
||||
import shutil
|
||||
|
||||
|
||||
from calibre.customize.conversion import OutputFormatPlugin, \
|
||||
OptionRecommendation
|
||||
from calibre.ptempfile import TemporaryDirectory, TemporaryFile
|
||||
|
||||
NEWLINE_TYPES = ['system', 'unix', 'old_mac', 'windows']
|
||||
|
||||
|
||||
class TXTOutput(OutputFormatPlugin):
|
||||
|
||||
name = 'TXT Output'
|
||||
author = 'John Schember'
|
||||
file_type = 'txt'
|
||||
commit_name = 'txt_output'
|
||||
ui_data = {
|
||||
'newline_types': NEWLINE_TYPES,
|
||||
'formatting_types': {
|
||||
'plain': _('Plain text'),
|
||||
'markdown': _('Markdown formatted text'),
|
||||
'textile': _('TexTile formatted text')
|
||||
},
|
||||
}
|
||||
|
||||
options = {
|
||||
OptionRecommendation(name='newline', recommended_value='system',
|
||||
level=OptionRecommendation.LOW,
|
||||
short_switch='n', choices=NEWLINE_TYPES,
|
||||
help=_('Type of newline to use. Options are %s. Default is \'system\'. '
|
||||
'Use \'old_mac\' for compatibility with Mac OS 9 and earlier. '
|
||||
'For macOS use \'unix\'. \'system\' will default to the newline '
|
||||
'type used by this OS.') % sorted(NEWLINE_TYPES)),
|
||||
OptionRecommendation(name='txt_output_encoding', recommended_value='utf-8',
|
||||
level=OptionRecommendation.LOW,
|
||||
help=_('Specify the character encoding of the output document. '
|
||||
'The default is utf-8.')),
|
||||
OptionRecommendation(name='inline_toc',
|
||||
recommended_value=False, level=OptionRecommendation.LOW,
|
||||
help=_('Add Table of Contents to beginning of the book.')),
|
||||
OptionRecommendation(name='max_line_length',
|
||||
recommended_value=0, level=OptionRecommendation.LOW,
|
||||
help=_('The maximum number of characters per line. This splits on '
|
||||
'the first space before the specified value. If no space is found '
|
||||
'the line will be broken at the space after and will exceed the '
|
||||
'specified value. Also, there is a minimum of 25 characters. '
|
||||
'Use 0 to disable line splitting.')),
|
||||
OptionRecommendation(name='force_max_line_length',
|
||||
recommended_value=False, level=OptionRecommendation.LOW,
|
||||
help=_('Force splitting on the max-line-length value when no space '
|
||||
'is present. Also allows max-line-length to be below the minimum')),
|
||||
OptionRecommendation(name='txt_output_formatting',
|
||||
recommended_value='plain',
|
||||
choices=list(ui_data['formatting_types']),
|
||||
help=_('Formatting used within the document.\n'
|
||||
'* plain: {plain}\n'
|
||||
'* markdown: {markdown}\n'
|
||||
'* textile: {textile}').format(**ui_data['formatting_types'])),
|
||||
OptionRecommendation(name='keep_links',
|
||||
recommended_value=False, level=OptionRecommendation.LOW,
|
||||
help=_('Do not remove links within the document. This is only '
|
||||
'useful when paired with a txt-output-formatting option that '
|
||||
'is not none because links are always removed with plain text output.')),
|
||||
OptionRecommendation(name='keep_image_references',
|
||||
recommended_value=False, level=OptionRecommendation.LOW,
|
||||
help=_('Do not remove image references within the document. This is only '
|
||||
'useful when paired with a txt-output-formatting option that '
|
||||
'is not none because links are always removed with plain text output.')),
|
||||
OptionRecommendation(name='keep_color',
|
||||
recommended_value=False, level=OptionRecommendation.LOW,
|
||||
help=_('Do not remove font color from output. This is only useful when '
|
||||
'txt-output-formatting is set to textile. Textile is the only '
|
||||
'formatting that supports setting font color. If this option is '
|
||||
'not specified font color will not be set and default to the '
|
||||
'color displayed by the reader (generally this is black).')),
|
||||
}
|
||||
|
||||
def convert(self, oeb_book, output_path, input_plugin, opts, log):
|
||||
from calibre.ebooks.txt.txtml import TXTMLizer
|
||||
from calibre.utils.cleantext import clean_ascii_chars
|
||||
from calibre.ebooks.txt.newlines import specified_newlines, TxtNewlines
|
||||
|
||||
if opts.txt_output_formatting.lower() == 'markdown':
|
||||
from calibre.ebooks.txt.markdownml import MarkdownMLizer
|
||||
self.writer = MarkdownMLizer(log)
|
||||
elif opts.txt_output_formatting.lower() == 'textile':
|
||||
from calibre.ebooks.txt.textileml import TextileMLizer
|
||||
self.writer = TextileMLizer(log)
|
||||
else:
|
||||
self.writer = TXTMLizer(log)
|
||||
|
||||
txt = self.writer.extract_content(oeb_book, opts)
|
||||
txt = clean_ascii_chars(txt)
|
||||
|
||||
log.debug('\tReplacing newlines with selected type...')
|
||||
txt = specified_newlines(TxtNewlines(opts.newline).newline, txt)
|
||||
|
||||
close = False
|
||||
if not hasattr(output_path, 'write'):
|
||||
close = True
|
||||
if not os.path.exists(os.path.dirname(output_path)) and os.path.dirname(output_path) != '':
|
||||
os.makedirs(os.path.dirname(output_path))
|
||||
out_stream = open(output_path, 'wb')
|
||||
else:
|
||||
out_stream = output_path
|
||||
|
||||
out_stream.seek(0)
|
||||
out_stream.truncate()
|
||||
out_stream.write(txt.encode(opts.txt_output_encoding, 'replace'))
|
||||
|
||||
if close:
|
||||
out_stream.close()
|
||||
|
||||
|
||||
class TXTZOutput(TXTOutput):
|
||||
|
||||
name = 'TXTZ Output'
|
||||
author = 'John Schember'
|
||||
file_type = 'txtz'
|
||||
|
||||
def convert(self, oeb_book, output_path, input_plugin, opts, log):
|
||||
from calibre.ebooks.oeb.base import OEB_IMAGES
|
||||
from calibre.utils.zipfile import ZipFile
|
||||
from lxml import etree
|
||||
|
||||
with TemporaryDirectory('_txtz_output') as tdir:
|
||||
# TXT
|
||||
txt_name = 'index.txt'
|
||||
if opts.txt_output_formatting.lower() == 'textile':
|
||||
txt_name = 'index.text'
|
||||
with TemporaryFile(txt_name) as tf:
|
||||
TXTOutput.convert(self, oeb_book, tf, input_plugin, opts, log)
|
||||
shutil.copy(tf, os.path.join(tdir, txt_name))
|
||||
|
||||
# Images
|
||||
for item in oeb_book.manifest:
|
||||
if item.media_type in OEB_IMAGES:
|
||||
if hasattr(self.writer, 'images'):
|
||||
path = os.path.join(tdir, 'images')
|
||||
if item.href in self.writer.images:
|
||||
href = self.writer.images[item.href]
|
||||
else:
|
||||
continue
|
||||
else:
|
||||
path = os.path.join(tdir, os.path.dirname(item.href))
|
||||
href = os.path.basename(item.href)
|
||||
if not os.path.exists(path):
|
||||
os.makedirs(path)
|
||||
with open(os.path.join(path, href), 'wb') as imgf:
|
||||
imgf.write(item.data)
|
||||
|
||||
# Metadata
|
||||
with open(os.path.join(tdir, 'metadata.opf'), 'wb') as mdataf:
|
||||
mdataf.write(etree.tostring(oeb_book.metadata.to_opf1()))
|
||||
|
||||
txtz = ZipFile(output_path, 'w')
|
||||
txtz.add_dir(tdir)
|
||||
Reference in New Issue
Block a user