1
0
mirror of https://github.com/gryf/ebook-converter.git synced 2026-02-22 18:15:49 +01:00

Initial import

This commit is contained in:
2020-03-31 17:15:23 +02:00
commit d97ea9b0bc
311 changed files with 131419 additions and 0 deletions

View File

@@ -0,0 +1,10 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

View File

@@ -0,0 +1,29 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
from calibre.customize.conversion import InputFormatPlugin
from polyglot.builtins import getcwd
class AZW4Input(InputFormatPlugin):
    """Conversion input plugin registered for the ``azw4`` file type."""

    name = 'AZW4 Input'
    author = 'John Schember'
    description = 'Convert AZW4 to HTML'
    file_types = {'azw4'}
    commit_name = 'azw4_input'

    def convert(self, stream, options, file_ext, log, accelerators):
        """Extract the AZW4 content of ``stream`` into the current directory.

        The PDB header is parsed from ``stream`` and handed, together with
        the stream itself, ``log`` and ``options``, to the AZW4 ``Reader``.

        :param stream: open file-like object containing the AZW4 data
        :param options: conversion options, passed through to the reader
        :param file_ext: unused here
        :param log: logger, passed through to the reader
        :param accelerators: unused here
        :return: the value returned by ``Reader.extract_content``
        """
        from calibre.ebooks.pdb.header import PdbHeaderReader
        from calibre.ebooks.azw4.reader import Reader

        pdb_header = PdbHeaderReader(stream)
        azw4_reader = Reader(pdb_header, stream, log, options)
        return azw4_reader.extract_content(getcwd())

View File

@@ -0,0 +1,202 @@
from __future__ import absolute_import, division, print_function, unicode_literals
''' CHM File decoding support '''
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>,' \
' and Alex Bramley <a.bramley at gmail.com>.'
import os
from calibre.customize.conversion import InputFormatPlugin
from calibre.ptempfile import TemporaryDirectory
from calibre.constants import filesystem_encoding
from polyglot.builtins import unicode_type, as_bytes
class CHMInput(InputFormatPlugin):
    """Input plugin that unpacks a CHM file and converts it via the HTML
    input plugin."""

    name = 'CHM Input'
    author = 'Kovid Goyal and Alex Bramley'
    description = 'Convert CHM files to OEB'
    file_types = {'chm'}
    commit_name = 'chm_input'

    def _chmtohtml(self, output_dir, chm_path, no_images, log, debug_dump=False):
        """Extract ``chm_path`` into ``output_dir``.

        Reads ``self.opts.input_encoding``, so ``self.opts`` must be set
        before calling. The reader is stored on ``self._chm_reader`` for
        later use by :meth:`convert`. ``no_images`` is accepted but unused.

        :return: the reader's ``hhc_path``
        """
        from calibre.ebooks.chm.reader import CHMReader
        log.debug('Opening CHM file')
        rdr = CHMReader(chm_path, log, input_encoding=self.opts.input_encoding)
        log.debug('Extracting CHM to %s' % output_dir)
        rdr.extract_content(output_dir, debug_dump=debug_dump)
        self._chm_reader = rdr
        return rdr.hhc_path

    def convert(self, stream, options, file_ext, log, accelerators):
        """Convert the CHM file behind ``stream`` into an OEB book.

        ``stream`` is closed early because the CHM is re-opened by path.
        ``options`` is mutated: the HTML input plugin's recommended values
        are applied, ``input_encoding`` is forced to ``'utf-8'`` and
        ``debug_pipeline`` is temporarily cleared while the intermediate
        HTML is converted.

        :return: the OEB book produced by the HTML input plugin
        """
        from calibre.ebooks.chm.metadata import get_metadata_from_reader
        from calibre.customize.ui import plugin_for_input_format
        self.opts = options

        log.debug('Processing CHM...')
        with TemporaryDirectory('_chm2oeb') as tdir:
            if not isinstance(tdir, unicode_type):
                tdir = tdir.decode(filesystem_encoding)
            html_input = plugin_for_input_format('html')
            for opt in html_input.options:
                setattr(options, opt.option.name, opt.recommended_value)
            no_images = False  # options.no_images
            chm_name = stream.name

            # closing stream so CHM can be opened by external library
            stream.close()
            log.debug('tdir=%s' % tdir)
            log.debug('stream.name=%s' % stream.name)
            debug_dump = False
            odi = options.debug_pipeline
            if odi:
                debug_dump = os.path.join(odi, 'input')
            mainname = self._chmtohtml(tdir, chm_name, no_images, log,
                                       debug_dump=debug_dump)
            mainpath = os.path.join(tdir, mainname)

            try:
                metadata = get_metadata_from_reader(self._chm_reader)
            except Exception:
                log.exception('Failed to read metadata, using filename')
                from calibre.ebooks.metadata.book.base import Metadata
                metadata = Metadata(os.path.basename(chm_name))
            encoding = self._chm_reader.get_encoding() or options.input_encoding or 'cp1252'
            self._chm_reader.CloseCHM()

            options.debug_pipeline = None
            options.input_encoding = 'utf-8'
            # Files listed in re_encoded_files are read back as UTF-8
            # instead of the detected CHM encoding.
            uenc = encoding
            if os.path.abspath(mainpath) in self._chm_reader.re_encoded_files:
                uenc = 'utf-8'
            htmlpath, toc = self._create_html_root(mainpath, log, uenc)
            oeb = self._create_oebbook_html(htmlpath, tdir, options, log, metadata)
            options.debug_pipeline = odi
            if toc.count() > 1:
                # The first spine item is the generated TOC page: turn it
                # into the book's TOC and drop it from the manifest.
                oeb.toc = self.parse_html_toc(oeb.spine[0])
                oeb.manifest.remove(oeb.spine[0])
                oeb.auto_generated_toc = False
        return oeb

    def parse_html_toc(self, item):
        """Build a ``TOC`` from the nested ``<div><a href>`` structure in
        ``item.data`` (the layout written by :meth:`_create_html_root`)."""
        from calibre.ebooks.oeb.base import TOC, XPath
        dx = XPath('./h:div')
        ax = XPath('./h:a[1]')

        def do_node(parent, div):
            for child in dx(div):
                a = ax(child)[0]
                c = parent.add(a.text, a.attrib['href'])
                do_node(c, child)

        toc = TOC()
        root = XPath('//h:div[1]')(item.data)[0]
        do_node(toc, root)
        return toc

    def _create_oebbook_html(self, htmlpath, basedir, opts, log, mi):
        """Run the HTML input plugin on ``htmlpath`` and return its OEB book.

        Sets ``opts.breadth_first = True`` as a side effect.
        """
        # use HTMLInput plugin to generate book
        from calibre.customize.builtins import HTMLInput
        opts.breadth_first = True
        htmlinput = HTMLInput(None)
        oeb = htmlinput.create_oebbook(htmlpath, basedir, opts, log, mi)
        return oeb

    def _create_html_root(self, hhcpath, log, encoding):
        """Write the root HTML file next to ``hhcpath``.

        The .hhc file is decoded with ``encoding`` and parsed into a TOC.
        If the TOC has more than one node, a page of nested
        ``<div><a href=...>`` links is written; otherwise the decoded .hhc
        text itself is written.

        :return: ``(htmlpath, toc)``
        """
        from lxml import html
        from lxml.html.builder import HTML, BODY, DIV, A
        from polyglot.urllib import unquote as _unquote
        from calibre.ebooks.oeb.base import urlquote
        from calibre.ebooks.chardet import xml_to_unicode
        hhcdata = self._read_file(hhcpath)
        hhcdata = hhcdata.decode(encoding)
        hhcdata = xml_to_unicode(hhcdata, verbose=True,
                                 strip_encoding_pats=True, resolve_entities=True)[0]
        hhcroot = html.fromstring(hhcdata)
        toc = self._process_nodes(hhcroot)
        log.debug('Found %d section nodes' % toc.count())
        htmlpath = os.path.splitext(hhcpath)[0] + ".html"
        base = os.path.dirname(os.path.abspath(htmlpath))

        def unquote(x):
            if isinstance(x, unicode_type):
                x = x.encode('utf-8')
            return _unquote(x).decode('utf-8')

        def unquote_path(x):
            # Prefer the path as written; use the unquoted form only when
            # that is the one that exists on disk.
            y = unquote(x)
            if (not os.path.exists(os.path.join(base, x)) and os.path.exists(os.path.join(base, y))):
                x = y
            return x

        def donode(item, parent, base, subpath):
            for child in item:
                title = child.title
                if not title:
                    continue
                raw = unquote_path(child.href or '')
                rsrcname = os.path.basename(raw)
                rsrcpath = os.path.join(subpath, rsrcname)
                if (not os.path.exists(os.path.join(base, rsrcpath)) and os.path.exists(os.path.join(base, raw))):
                    rsrcpath = raw

                if '%' not in rsrcpath:
                    rsrcpath = urlquote(rsrcpath)
                if not raw:
                    rsrcpath = ''
                c = DIV(A(title, href=rsrcpath))
                donode(child, c, base, subpath)
                parent.append(c)

        with open(htmlpath, 'wb') as f:
            if toc.count() > 1:
                path0 = toc[0].href
                path0 = unquote_path(path0)
                subpath = os.path.dirname(path0)
                # NOTE: rebinding ``base`` here also changes what
                # unquote_path() sees from this point on.
                base = os.path.dirname(f.name)
                root = DIV()
                donode(toc, root, base, subpath)
                raw = html.tostring(HTML(BODY(root)), encoding='utf-8',
                                    pretty_print=True)
                f.write(raw)
            else:
                f.write(as_bytes(hhcdata))
        return htmlpath, toc

    def _read_file(self, name):
        """Return the raw bytes of the file ``name``."""
        with lopen(name, 'rb') as f:
            data = f.read()
        return data

    def add_node(self, node, toc, ancestor_map):
        """Add one ``<object>`` node of the .hhc tree to the TOC.

        Nodes whose ``type`` does not match ``text/sitemap`` are ignored.
        The new entry is attached to the TOC entry recorded in
        ``ancestor_map`` for the nearest enclosing ``li/object``, or to
        ``toc`` when there is none, and is itself recorded in
        ``ancestor_map``.
        """
        from calibre.ebooks.chm.reader import match_string
        if not match_string(node.attrib.get('type', ''), 'text/sitemap'):
            return
        p = node.xpath('ancestor::ul[1]/ancestor::li[1]/object[1]')
        parent = p[0] if p else None
        toc = ancestor_map.get(parent, toc)
        title = href = ''
        for param in node.xpath('./param'):
            # Hand-written .hhc files may omit name/value attributes; read
            # them leniently instead of raising KeyError.
            pname = param.attrib.get('name', '')
            if match_string(pname, 'name'):
                title = param.attrib.get('value', title)
            elif match_string(pname, 'local'):
                href = param.attrib.get('value', href)
        child = toc.add(title or _('Unknown'), href)
        ancestor_map[node] = child

    def _process_nodes(self, root):
        """Build a ``TOC`` from every ``<object>`` element under ``root``."""
        from calibre.ebooks.oeb.base import TOC
        toc = TOC()
        ancestor_map = {}
        for node in root.xpath('//object'):
            self.add_node(node, toc, ancestor_map)
        return toc

View File

@@ -0,0 +1,310 @@
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
'''
Based on ideas from comiclrf created by FangornUK.
'''
import shutil, textwrap, codecs, os
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
from calibre import CurrentDir
from calibre.ptempfile import PersistentTemporaryDirectory
from polyglot.builtins import getcwd, map
class ComicInput(InputFormatPlugin):
    """Input plugin for comic archives (``cbz``, ``cbr``) and comic
    collections (``cbc``)."""

    name = 'Comic Input'
    author = 'Kovid Goyal'
    description = 'Optimize comic files (.cbz, .cbr, .cbc) for viewing on portable devices'
    file_types = {'cbz', 'cbr', 'cbc'}
    is_image_collection = True
    commit_name = 'comic_input'
    core_usage = -1

    options = {
        OptionRecommendation(name='colors', recommended_value=0,
            help=_('Reduce the number of colors used in the image. This works only'
                   ' if you choose the PNG output format. It is useful to reduce file sizes.'
                   ' Set to zero to turn off. Maximum value is 256. It is off by default.')),
        OptionRecommendation(name='dont_normalize', recommended_value=False,
            help=_('Disable normalize (improve contrast) color range '
            'for pictures. Default: False')),
        OptionRecommendation(name='keep_aspect_ratio', recommended_value=False,
            help=_('Maintain picture aspect ratio. Default is to fill the screen.')),
        OptionRecommendation(name='dont_sharpen', recommended_value=False,
            help=_('Disable sharpening.')),
        OptionRecommendation(name='disable_trim', recommended_value=False,
            help=_('Disable trimming of comic pages. For some comics, '
                     'trimming might remove content as well as borders.')),
        OptionRecommendation(name='landscape', recommended_value=False,
            help=_("Don't split landscape images into two portrait images")),
        OptionRecommendation(name='wide', recommended_value=False,
            help=_("Keep aspect ratio and scale image using screen height as "
            "image width for viewing in landscape mode.")),
        OptionRecommendation(name='right2left', recommended_value=False,
              help=_('Used for right-to-left publications like manga. '
              'Causes landscape pages to be split into portrait pages '
              'from right to left.')),
        OptionRecommendation(name='despeckle', recommended_value=False,
              help=_('Enable Despeckle. Reduces speckle noise. '
              'May greatly increase processing time.')),
        OptionRecommendation(name='no_sort', recommended_value=False,
              help=_("Don't sort the files found in the comic "
              "alphabetically by name. Instead use the order they were "
              "added to the comic.")),
        OptionRecommendation(name='output_format', choices=['png', 'jpg'],
            recommended_value='png', help=_('The format that images in the created e-book '
                'are converted to. You can experiment to see which format gives '
                'you optimal size and look on your device.')),
        OptionRecommendation(name='no_process', recommended_value=False,
              help=_("Apply no processing to the image")),
        OptionRecommendation(name='dont_grayscale', recommended_value=False,
            help=_('Do not convert the image to grayscale (black and white)')),
        OptionRecommendation(name='comic_image_size', recommended_value=None,
            help=_('Specify the image size as widthxheight pixels. Normally,'
                ' an image size is automatically calculated from the output '
                'profile, this option overrides it.')),
        OptionRecommendation(name='dont_add_comic_pages_to_toc', recommended_value=False,
            help=_('When converting a CBC do not add links to each page to'
                ' the TOC. Note this only applies if the TOC has more than one'
                ' section')),
        }

    recommendations = {
        ('margin_left', 0, OptionRecommendation.HIGH),
        ('margin_top', 0, OptionRecommendation.HIGH),
        ('margin_right', 0, OptionRecommendation.HIGH),
        ('margin_bottom', 0, OptionRecommendation.HIGH),
        ('insert_blank_line', False, OptionRecommendation.HIGH),
        ('remove_paragraph_spacing', False, OptionRecommendation.HIGH),
        ('change_justification', 'left', OptionRecommendation.HIGH),
        ('dont_split_on_pagebreaks', True, OptionRecommendation.HIGH),
        ('chapter', None, OptionRecommendation.HIGH),
        # NOTE(review): 'page_breaks_brefore' looks like a misspelling; the
        # correctly spelled entry appears below. Kept as-is to leave the
        # published recommendation set unchanged.
        ('page_breaks_brefore', None, OptionRecommendation.HIGH),
        ('use_auto_toc', False, OptionRecommendation.HIGH),
        ('page_breaks_before', None, OptionRecommendation.HIGH),
        ('disable_font_rescaling', True, OptionRecommendation.HIGH),
        ('linearize_tables', False, OptionRecommendation.HIGH),
        }

    def get_comics_from_collection(self, stream):
        """Unpack a comic collection and list the comics it declares.

        The archive is extracted into a persistent temporary directory and
        its ``comics.txt`` is read; each non-empty line has the form
        ``filename:title``. Entries whose file is not readable are skipped.

        :return: list of ``[title, absolute_path]`` pairs
        :raises ValueError: if ``comics.txt`` is missing or no listed comic
            is readable
        """
        from calibre.libunzip import extract as zipextract
        tdir = PersistentTemporaryDirectory('_comic_collection')
        zipextract(stream, tdir)
        comics = []
        with CurrentDir(tdir):
            if not os.path.exists('comics.txt'):
                raise ValueError((
                    '%s is not a valid comic collection'
                    ' no comics.txt was found in the file')
                        %stream.name)
            with open('comics.txt', 'rb') as f:
                raw = f.read()
            # Decode according to the BOM, then drop the BOM character.
            if raw.startswith(codecs.BOM_UTF16_BE):
                raw = raw.decode('utf-16-be')[1:]
            elif raw.startswith(codecs.BOM_UTF16_LE):
                raw = raw.decode('utf-16-le')[1:]
            elif raw.startswith(codecs.BOM_UTF8):
                raw = raw.decode('utf-8')[1:]
            else:
                raw = raw.decode('utf-8')
            for line in raw.splitlines():
                line = line.strip()
                if not line:
                    continue
                fname, _sep, title = line.partition(':')
                fname = fname.replace('#', '_')
                fname = os.path.join(tdir, *fname.split('/'))
                if not title:
                    title = os.path.basename(fname).rpartition('.')[0]
                if os.access(fname, os.R_OK):
                    comics.append([title, fname])
        if not comics:
            raise ValueError('%s has no comics'%stream.name)
        return comics

    def get_pages(self, comic, tdir2):
        """Extract ``comic`` and return the page image paths placed in
        ``tdir2``.

        With ``self.opts.no_process`` the pages are copied unchanged under
        an index-prefixed name; otherwise they go through
        ``process_pages`` and any failures are logged as warnings.

        :raises ValueError: if no pages are found, or none survive
            processing
        """
        from calibre.ebooks.comic.input import (extract_comic, process_pages,
                find_pages)
        tdir = extract_comic(comic)
        new_pages = find_pages(tdir, sort_on_mtime=self.opts.no_sort,
                verbose=self.opts.verbose)
        if not new_pages:
            raise ValueError('Could not find any pages in the comic: %s'
                    %comic)
        if self.opts.no_process:
            n2 = []
            for i, page in enumerate(new_pages):
                n2.append(os.path.join(tdir2, '{} - {}' .format(i, os.path.basename(page))))
                shutil.copyfile(page, n2[-1])
            new_pages = n2
        else:
            new_pages, failures = process_pages(new_pages, self.opts,
                    self.report_progress, tdir2)
            if failures:
                self.log.warning('Could not process the following pages '
                '(run with --verbose to see why):')
                for f in failures:
                    self.log.warning('\t', f)
            if not new_pages:
                raise ValueError('Could not find any valid pages in comic: %s'
                        % comic)
        return new_pages

    def get_images(self):
        """Return the page image paths collected by the last :meth:`convert`."""
        return self._images

    def convert(self, stream, opts, file_ext, log, accelerators):
        """Convert a comic or comic collection into an OPF in the current
        directory.

        Writes ``metadata.opf`` and ``toc.ncx`` plus one wrapper document
        per page (or a single wrapper per comic when ``self.for_viewer``
        is set). With more than one comic, each goes into its own
        ``comic_<n>`` sub-directory.

        :return: absolute path of the written ``metadata.opf``
        :raises ValueError: if no comic yields any pages
        """
        from calibre.ebooks.metadata import MetaInformation
        from calibre.ebooks.metadata.opf2 import OPFCreator
        from calibre.ebooks.metadata.toc import TOC

        self.opts, self.log = opts, log
        if file_ext == 'cbc':
            comics_ = self.get_comics_from_collection(stream)
        else:
            comics_ = [['Comic', os.path.abspath(stream.name)]]
        stream.close()
        comics = []
        for i, x in enumerate(comics_):
            title, fname = x
            cdir = 'comic_%d'%(i+1) if len(comics_) > 1 else '.'
            cdir = os.path.abspath(cdir)
            if not os.path.exists(cdir):
                os.makedirs(cdir)
            pages = self.get_pages(fname, cdir)
            if not pages:
                continue
            if self.for_viewer:
                comics.append((title, pages, [self.create_viewer_wrapper(pages)]))
            else:
                wrappers = self.create_wrappers(pages)
                comics.append((title, pages, wrappers))

        if not comics:
            raise ValueError('No comic pages found in %s'%stream.name)

        mi = MetaInformation(os.path.basename(stream.name).rpartition('.')[0],
            [_('Unknown')])
        opf = OPFCreator(getcwd(), mi)
        entries = []

        def href(x):
            # A single comic lives directly in the output directory; with
            # several, keep the last directory component as well.
            if len(comics) == 1:
                return os.path.basename(x)
            return '/'.join(x.split(os.sep)[-2:])

        cover_href = None
        for comic in comics:
            pages, wrappers = comic[1:]
            page_entries = [(href(x), None) for x in pages]
            entries += [(href(w), None) for w in wrappers] + page_entries
            if cover_href is None and page_entries:
                cover_href = page_entries[0][0]
        opf.create_manifest(entries)
        spine = []
        for comic in comics:
            spine.extend(href(w) for w in comic[2])
        self._images = []
        for comic in comics:
            self._images.extend(comic[1])
        opf.create_spine(spine)
        if self.for_viewer and cover_href:
            opf.guide.set_cover(cover_href)
        toc = TOC()
        if len(comics) == 1:
            wrappers = comics[0][2]
            for i, x in enumerate(wrappers):
                toc.add_item(href(x), None, _('Page')+' %d'%(i+1),
                        play_order=i)
        else:
            po = 0
            for comic in comics:
                po += 1
                wrappers = comic[2]
                stoc = toc.add_item(href(wrappers[0]),
                        None, comic[0], play_order=po)
                if not opts.dont_add_comic_pages_to_toc:
                    for i, x in enumerate(wrappers):
                        stoc.add_item(href(x), None,
                                _('Page')+' %d'%(i+1), play_order=po)
                        po += 1
        opf.set_toc(toc)
        with open('metadata.opf', 'wb') as m, open('toc.ncx', 'wb') as n:
            opf.render(m, n, 'toc.ncx')
        return os.path.abspath('metadata.opf')

    def create_wrappers(self, pages):
        """Write one ``page_<n>.xhtml`` wrapper per image, next to the first
        page, and return the wrapper paths in page order."""
        from calibre.ebooks.oeb.base import XHTML_NS
        wrappers = []
        WRAPPER = textwrap.dedent('''\
        <html xmlns="%s">
        <head>
        <meta charset="utf-8"/>
        <title>Page #%d</title>
        <style type="text/css">
        @page { margin:0pt; padding: 0pt}
        body { margin: 0pt; padding: 0pt}
        div { text-align: center }
        </style>
        </head>
        <body>
        <div>
        <img src="%s" alt="comic page #%d" />
        </div>
        </body>
        </html>
        ''')
        dir = os.path.dirname(pages[0])
        for i, page in enumerate(pages):
            wrapper = WRAPPER%(XHTML_NS, i+1, os.path.basename(page), i+1)
            page = os.path.join(dir, 'page_%d.xhtml'%(i+1))
            with open(page, 'wb') as f:
                f.write(wrapper.encode('utf-8'))
            wrappers.append(page)
        return wrappers

    def create_viewer_wrapper(self, pages):
        """Write a single ``wrapper.xhtml`` showing every page image and
        return its path.

        The wrapper is written into the directory of the first page, so
        each comic of a collection gets its own wrapper.
        """
        from calibre.ebooks.oeb.base import XHTML_NS

        # Take the directory from the page list *before* building the
        # markup. The previous code rebound ``pages`` to the joined HTML
        # string first, so ``pages[0]`` was the character '<' and the
        # wrapper always landed in the current directory.
        base = os.path.dirname(pages[0])
        images = '\n'.join(
            '<img src="{}"></img>'.format(os.path.basename(src))
            for src in pages)
        wrapper = textwrap.dedent('''
        <html xmlns="%s">
        <head>
        <meta charset="utf-8"/>
        <style type="text/css">
        html, body, img { height: 100vh; display: block; margin: 0; padding: 0; border-width: 0; }
        img {
        width: 100%%; height: 100%%;
        object-fit: contain;
        margin-left: auto; margin-right: auto;
        max-width: 100vw; max-height: 100vh;
        top: 50vh; transform: translateY(-50%%);
        position: relative;
        page-break-after: always;
        }
        </style>
        </head>
        <body>
        %s
        </body>
        </html>
        ''') % (XHTML_NS, images)
        path = os.path.join(base, 'wrapper.xhtml')
        with open(path, 'wb') as f:
            f.write(wrapper.encode('utf-8'))
        return path

View File

@@ -0,0 +1,67 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL 3'
__copyright__ = '2011, Anthon van der Neut <anthon@mnt.org>'
__docformat__ = 'restructuredtext en'
import os
from io import BytesIO
from calibre.customize.conversion import InputFormatPlugin
from polyglot.builtins import getcwd
class DJVUInput(InputFormatPlugin):
    """Input plugin that converts the text layer of DJVU files via the
    HTML input plugin."""

    name = 'DJVU Input'
    author = 'Anthon van der Neut'
    description = 'Convert OCR-ed DJVU files (.djvu) to HTML'
    file_types = {'djvu', 'djv'}
    commit_name = 'djvu_input'

    def convert(self, stream, options, file_ext, log, accelerators):
        """Convert the text of a DJVU file into an OEB book.

        The extracted text is turned into basic HTML, written to a
        temporary ``index*.html`` file in the current directory and passed
        to the HTML input plugin. ``options`` is mutated: the HTML plugin's
        recommended values are applied and ``input_encoding`` is set to
        ``'utf-8'``.

        :return: the OEB book produced by the HTML input plugin
        :raises ValueError: if the DJVU file contains no text
        """
        from calibre.ebooks.txt.processor import convert_basic
        from calibre.ebooks.djvu.djvu import DJVUFile

        stdout = BytesIO()
        x = DJVUFile(stream)
        x.get_text(stdout)
        raw_text = stdout.getvalue()
        if not raw_text:
            raise ValueError('The DJVU file contains no text, only images, probably page scans.'
                    ' calibre only supports conversion of DJVU files with actual text in them.')

        # Newlines become spaces and the 0x1F separator becomes a blank
        # line before the text is converted to HTML.
        html = convert_basic(raw_text.replace(b"\n", b' ').replace(
            b'\037', b'\n\n'))
        # Run the HTMLized text through the html processing plugin.
        from calibre.customize.ui import plugin_for_input_format
        html_input = plugin_for_input_format('html')
        for opt in html_input.options:
            setattr(options, opt.option.name, opt.recommended_value)
        options.input_encoding = 'utf-8'
        base = getcwd()
        # Pick the first index[N].html name that is not already taken.
        htmlfile = os.path.join(base, 'index.html')
        c = 0
        while os.path.exists(htmlfile):
            c += 1
            htmlfile = os.path.join(base, 'index%d.html'%c)
        with open(htmlfile, 'wb') as f:
            f.write(html.encode('utf-8'))
        odi = options.debug_pipeline
        options.debug_pipeline = None
        try:
            # Generate oeb from html conversion.
            with open(htmlfile, 'rb') as f:
                oeb = html_input.convert(f, options, 'html', log,
                    {})
        finally:
            # Restore the caller's option and remove the scratch file even
            # when the HTML conversion fails.
            options.debug_pipeline = odi
            os.remove(htmlfile)

        # Set metadata from file.
        from calibre.customize.ui import get_file_type_metadata
        from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
        mi = get_file_type_metadata(stream, file_ext)
        meta_info_to_oeb_metadata(mi, oeb.metadata, log)

        return oeb

View File

@@ -0,0 +1,34 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
class DOCXInput(InputFormatPlugin):
    """Input plugin registered for the ``docx`` and ``docm`` file types."""

    name = 'DOCX Input'
    author = 'Kovid Goyal'
    description = _('Convert DOCX files (.docx and .docm) to HTML')
    file_types = {'docx', 'docm'}
    commit_name = 'docx_input'

    options = {
        OptionRecommendation(name='docx_no_cover', recommended_value=False,
            help=_('Normally, if a large image is present at the start of the document that looks like a cover, '
                   'it will be removed from the document and used as the cover for created e-book. This option '
                   'turns off that behavior.')),
        OptionRecommendation(name='docx_no_pagebreaks_between_notes', recommended_value=False,
            help=_('Do not insert a page break after every endnote.')),
        OptionRecommendation(name='docx_inline_subsup', recommended_value=False,
            help=_('Render superscripts and subscripts so that they do not affect the line height.')),
    }

    recommendations = {('page_breaks_before', '/', OptionRecommendation.MED)}

    def convert(self, stream, options, file_ext, log, accelerators):
        """Build the DOCX-to-HTML converter for ``stream`` and return the
        result of calling it.

        The three ``docx_*`` options are forwarded to the converter;
        ``file_ext`` and ``accelerators`` are unused.
        """
        from calibre.ebooks.docx.to_html import Convert

        converter = Convert(
            stream,
            detect_cover=not options.docx_no_cover,
            log=log,
            notes_nopb=options.docx_no_pagebreaks_between_notes,
            nosupsub=options.docx_inline_subsup,
        )
        return converter()

View File

@@ -0,0 +1,93 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
from calibre.customize.conversion import OutputFormatPlugin, OptionRecommendation
# Page-size names offered for the ``docx_page_size`` option.
PAGE_SIZES = ['a0', 'a1', 'a2', 'a3', 'a4', 'a5', 'a6', 'b0', 'b1',
              'b2', 'b3', 'b4', 'b5', 'b6', 'legal', 'letter']


class DOCXOutput(OutputFormatPlugin):
    """Output plugin that writes an OEB book as a DOCX file."""

    name = 'DOCX Output'
    author = 'Kovid Goyal'
    file_type = 'docx'
    commit_name = 'docx_output'
    ui_data = {'page_sizes': PAGE_SIZES}

    options = {
        OptionRecommendation(name='docx_page_size', recommended_value='letter',
            level=OptionRecommendation.LOW, choices=PAGE_SIZES,
            help=_('The size of the page. Default is letter. Choices '
            'are %s') % PAGE_SIZES),

        OptionRecommendation(name='docx_custom_page_size', recommended_value=None,
            help=_('Custom size of the document. Use the form widthxheight '
            'EG. `123x321` to specify the width and height (in pts). '
            'This overrides any specified page-size.')),

        OptionRecommendation(name='docx_no_cover', recommended_value=False,
            help=_('Do not insert the book cover as an image at the start of the document.'
                   ' If you use this option, the book cover will be discarded.')),

        OptionRecommendation(name='preserve_cover_aspect_ratio', recommended_value=False,
            help=_('Preserve the aspect ratio of the cover image instead of stretching'
                   ' it out to cover the entire page.')),

        OptionRecommendation(name='docx_no_toc', recommended_value=False,
            help=_('Do not insert the table of contents as a page at the start of the document.')),

        OptionRecommendation(name='extract_to',
            help=_('Extract the contents of the generated %s file to the '
                'specified directory. The contents of the directory are first '
                'deleted, so be careful.') % 'DOCX'),

        OptionRecommendation(name='docx_page_margin_left', recommended_value=72.0,
            level=OptionRecommendation.LOW,
            help=_('The size of the left page margin, in pts. Default is 72pt.'
                   ' Overrides the common left page margin setting.')
        ),

        OptionRecommendation(name='docx_page_margin_top', recommended_value=72.0,
            level=OptionRecommendation.LOW,
            help=_('The size of the top page margin, in pts. Default is 72pt.'
                   ' Overrides the common top page margin setting, unless set to zero.')
        ),

        OptionRecommendation(name='docx_page_margin_right', recommended_value=72.0,
            level=OptionRecommendation.LOW,
            help=_('The size of the right page margin, in pts. Default is 72pt.'
                   ' Overrides the common right page margin setting, unless set to zero.')
        ),

        OptionRecommendation(name='docx_page_margin_bottom', recommended_value=72.0,
            level=OptionRecommendation.LOW,
            help=_('The size of the bottom page margin, in pts. Default is 72pt.'
                   ' Overrides the common bottom page margin setting, unless set to zero.')
        ),
    }

    def convert_metadata(self, oeb):
        """Round-trip ``oeb.metadata`` through an OPF 2.0 package element
        and store the resulting book metadata on ``self.mi``."""
        from lxml import etree
        from calibre.ebooks.oeb.base import OPF, OPF2_NS
        from calibre.ebooks.metadata.opf2 import OPF as ReadOPF
        from io import BytesIO

        package = etree.Element(
            OPF('package'), attrib={'version': '2.0'}, nsmap={None: OPF2_NS})
        oeb.metadata.to_opf2(package)
        serialized = BytesIO(etree.tostring(package, encoding='utf-8'))
        opf_reader = ReadOPF(
            serialized, populate_spine=False, try_to_guess_cover=False)
        self.mi = opf_reader.to_book_metadata()

    def convert(self, oeb, output_path, input_plugin, opts, log):
        """Write ``oeb`` as a DOCX file at ``output_path``.

        When ``opts.extract_to`` is set, the generated file is additionally
        dumped into that directory. ``input_plugin`` is unused.
        """
        from calibre.ebooks.docx.writer.container import DOCX
        from calibre.ebooks.docx.writer.from_html import Convert

        container = DOCX(opts, log)
        self.convert_metadata(oeb)
        with_cover = not opts.docx_no_cover
        with_toc = not opts.docx_no_toc
        run_conversion = Convert(oeb, container, self.mi, with_cover, with_toc)
        run_conversion()
        container.write(output_path, self.mi)
        if not opts.extract_to:
            return
        from calibre.ebooks.docx.dump import do_dump
        do_dump(output_path, opts.extract_to)

View File

@@ -0,0 +1,438 @@
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL 3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import os, re, posixpath
from itertools import cycle
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
from polyglot.builtins import getcwd
# Algorithm URIs of the two supported font-obfuscation schemes.
ADOBE_OBFUSCATION = 'http://ns.adobe.com/pdf/enc#RC'
IDPF_OBFUSCATION = 'http://www.idpf.org/2008/embedding'


def decrypt_font_data(key, data, algorithm):
    """Return ``data`` with its obfuscated prefix XOR-ed against ``key``.

    Only the first 1024 bytes (Adobe algorithm) or 1040 bytes (any other
    algorithm value) are transformed; the rest of ``data`` is returned
    unchanged. ``key`` is repeated cyclically over the prefix.

    :param key: byte string used as the repeating XOR key
    :param data: byte string holding the font file contents
    :param algorithm: algorithm URI; compared against ``ADOBE_OBFUSCATION``
    :return: byte string of the same length as ``data``
    """
    if algorithm == ADOBE_OBFUSCATION:
        prefix_len = 1024
    else:
        prefix_len = 1040
    obfuscated = bytearray(data[:prefix_len])
    remainder = data[prefix_len:]
    keystream = cycle(bytearray(key))
    clear = bytearray(octet ^ next(keystream) for octet in obfuscated)
    return bytes(clear) + remainder
def decrypt_font(key, path, algorithm):
    """De-obfuscate the font file at ``path`` in place.

    The file is read completely, passed through ``decrypt_font_data`` and
    then overwritten with the result.
    """
    with lopen(path, 'r+b') as font_file:
        clear = decrypt_font_data(key, font_file.read(), algorithm)
        # Rewind and empty the file before writing the new contents.
        font_file.seek(0)
        font_file.truncate()
        font_file.write(clear)
# Input plugin registered for the ``epub`` file type.
class EPUBInput(InputFormatPlugin):
# Plugin identification attributes.
name = 'EPUB Input'
author = 'Kovid Goyal'
description = 'Convert EPUB files (.epub) to HTML'
file_types = {'epub'}
# NOTE(review): presumably None means "no forced output encoding" -- confirm
# against InputFormatPlugin.
output_encoding = None
commit_name = 'epub_input'
# Recommend '/' for page_breaks_before at MED level.
recommendations = {('page_breaks_before', '/', OptionRecommendation.MED)}
def process_encryption(self, encfile, opf, log):
    """De-obfuscate the fonts listed in ``encfile`` (encryption.xml).

    Two keys are derived from ``opf``: an IDPF key (SHA-1 of the unique
    identifier with whitespace removed) and an Adobe key (bytes of the
    last UUID-style identifier found). Every ``EncryptionMethod`` entry
    must use one of the two font-obfuscation algorithms; the referenced
    file is then decrypted in place if it exists and the matching key is
    available, and its URI is appended to ``self._encrypted_font_uris``.

    :param encfile: path to ``META-INF/encryption.xml``
    :param opf: parsed OPF object supplying the identifiers
    :param log: unused here; errors are printed with ``traceback``
    :return: ``False`` if an entry uses any other algorithm or an error
        occurs, ``True`` otherwise
    """
    from lxml import etree
    import uuid, hashlib
    import traceback

    idpf_key = opf.raw_unique_identifier
    if idpf_key:
        idpf_key = re.sub('[\u0020\u0009\u000d\u000a]', '', idpf_key)
        idpf_key = hashlib.sha1(idpf_key.encode('utf-8')).digest()
    key = None
    for item in opf.identifier_iter():
        scheme = None
        for xkey in item.attrib.keys():
            if xkey.endswith('scheme'):
                scheme = item.get(xkey)
        if (scheme and scheme.lower() == 'uuid') or \
                (item.text and item.text.startswith('urn:uuid:')):
            try:
                key = item.text.rpartition(':')[-1]
                key = uuid.UUID(key).bytes
            except Exception:
                # Best effort: a malformed identifier only means there is
                # no Adobe key. Do not swallow KeyboardInterrupt/SystemExit.
                traceback.print_exc()
                key = None

    try:
        root = etree.parse(encfile)
        for em in root.xpath('descendant::*[contains(name(), "EncryptionMethod")]'):
            algorithm = em.get('Algorithm', '')
            if algorithm not in {ADOBE_OBFUSCATION, IDPF_OBFUSCATION}:
                return False
            cr = em.getparent().xpath('descendant::*[contains(name(), "CipherReference")]')[0]
            uri = cr.get('URI')
            # URIs are resolved relative to the parent of encfile's directory.
            path = os.path.abspath(os.path.join(os.path.dirname(encfile), '..', *uri.split('/')))
            tkey = (key if algorithm == ADOBE_OBFUSCATION else idpf_key)
            if (tkey and os.path.exists(path)):
                self._encrypted_font_uris.append(uri)
                decrypt_font(tkey, path, algorithm)
        return True
    except Exception:
        traceback.print_exc()
    return False
def set_guide_type(self, opf, gtype, href=None, title=''):
    """Replace the guide entries of type ``gtype`` in ``opf``.

    Every existing guide reference whose lower-cased ``type`` equals
    ``gtype`` is removed. If ``href`` is given, a new entry is created and
    appended to the first ``<guide>`` element under the OPF root, creating
    that element when none exists.

    :return: the new guide item only when a ``<guide>`` element had to be
        created; ``None`` in every other case
    """
    stale = [ref for ref in opf.iterguide()
             if ref.get('type', '').lower() == gtype]
    for ref in stale:
        ref.getparent().remove(ref)

    if href is None:
        return None

    t = opf.create_guide_item(gtype, title, href)
    existing_guides = opf.root.xpath('./*[local-name()="guide"]')
    if existing_guides:
        existing_guides[0].append(t)
        return None

    new_guide = opf.create_guide_element()
    opf.root.append(new_guide)
    new_guide.append(t)
    return t
def rationalize_cover3(self, opf, log):
''' If there is a reference to the cover/titlepage via manifest properties, convert to
entries in the <guide> so that the rest of the pipeline picks it up. '''
from calibre.ebooks.metadata.opf3 import items_with_property
# ``removed`` is the value handed back to the caller; it is set to the
# titlepage href when the titlepage item is taken out of the spine.
removed = guide_titlepage_href = guide_titlepage_id = None
# Look for titlepages incorrectly marked in the <guide> as covers
guide_cover, guide_elem = None, None
for guide_elem in opf.iterguide():
if guide_elem.get('type', '').lower() == 'cover':
# Keep only the part of the href before any '#fragment'.
guide_cover = guide_elem.get('href', '').partition('#')[0]
break
if guide_cover:
spine = list(opf.iterspine())
if spine:
# A guide "cover" that is also the first spine item is remembered as a
# titlepage candidate.
idref = spine[0].get('idref', '')
for x in opf.itermanifest():
if x.get('id') == idref and x.get('href') == guide_cover:
guide_titlepage_href = guide_cover
guide_titlepage_id = idref
break
# Prefer the EPUB 3 raster cover, falling back to opf.raster_cover.
raster_cover_href = opf.epub3_raster_cover or opf.raster_cover
if raster_cover_href:
self.set_guide_type(opf, 'cover', raster_cover_href, 'Cover Image')
titlepage_id = titlepage_href = None
# The first manifest item carrying the 'calibre:title-page' property that
# has both an id and an href is used as the titlepage.
for item in items_with_property(opf.root, 'calibre:title-page'):
tid, href = item.get('id'), item.get('href')
if href and tid:
titlepage_id, titlepage_href = tid, href.partition('#')[0]
break
# Otherwise fall back to the candidate found via the guide above.
if titlepage_href is None:
titlepage_href, titlepage_id = guide_titlepage_href, guide_titlepage_id
if titlepage_href is not None:
self.set_guide_type(opf, 'titlepage', titlepage_href, 'Title Page')
spine = list(opf.iterspine())
# The spine is only touched when it has more than one item.
if len(spine) > 1:
for item in spine:
if item.get('idref') == titlepage_id:
log('Found HTML cover', titlepage_href)
# For the viewer the item's 'linear' attribute is dropped; otherwise the
# item is removed from the spine.
if self.for_viewer:
item.attrib.pop('linear', None)
else:
item.getparent().remove(item)
removed = titlepage_href
return removed
def rationalize_cover2(self, opf, log):
''' Ensure that the cover information in the guide is correct. That
means, at most one entry with type="cover" that points to a raster
cover and at most one entry with type="titlepage" that points to an
HTML titlepage. '''
from calibre.ebooks.oeb.base import OPF
# ``removed`` is the value returned at the end; it is set to the cover href
# when the first spine item is removed.
removed = None
from lxml import etree
guide_cover, guide_elem = None, None
# Find the first guide reference of type "cover"; its href, without any
# '#fragment', becomes guide_cover.
for guide_elem in opf.iterguide():
if guide_elem.get('type', '').lower() == 'cover':
guide_cover = guide_elem.get('href', '').partition('#')[0]
break
if not guide_cover:
# No cover in the guide: if the OPF names a raster cover, add a
# type="cover" reference for it, creating <guide> when there is none.
raster_cover = opf.raster_cover
if raster_cover:
if guide_elem is None:
g = opf.root.makeelement(OPF('guide'))
opf.root.append(g)
else:
g = guide_elem.getparent()
guide_cover = raster_cover
guide_elem = g.makeelement(OPF('reference'), attrib={'href':raster_cover, 'type':'cover'})
g.append(guide_elem)
return
spine = list(opf.iterspine())
if not spine:
return
# Check if the cover specified in the guide is also
# the first element in spine
idref = spine[0].get('idref', '')
manifest = list(opf.itermanifest())
if not manifest:
return
elem = [x for x in manifest if x.get('id', '') == idref]
if not elem or elem[0].get('href', None) != guide_cover:
return
log('Found HTML cover', guide_cover)
# Remove from spine as covers must be treated
# specially
if not self.for_viewer:
if len(spine) == 1:
log.warn('There is only a single spine item and it is marked as the cover. Removing cover marking.')
# tuple() snapshots the guide so entries can be removed while looping.
for guide_elem in tuple(opf.iterguide()):
if guide_elem.get('type', '').lower() == 'cover':
guide_elem.getparent().remove(guide_elem)
return
else:
spine[0].getparent().remove(spine[0])
removed = guide_cover
else:
# Ensure the cover is displayed as the first item in the book, some
# epub files have it set with linear='no' which causes the cover to
# display in the end
spine[0].attrib.pop('linear', None)
opf.spine[0].is_linear = True
# Ensure that the guide has a cover entry pointing to a raster cover
# and a titlepage entry pointing to the html titlepage. The titlepage
# entry will be used by the epub output plugin, the raster cover entry
# by other output plugins.
# Search for a raster cover identified in the OPF
raster_cover = opf.raster_cover
# Set the cover guide entry
if raster_cover is not None:
guide_elem.set('href', raster_cover)
else:
# Render the titlepage to create a raster cover
from calibre.ebooks import render_html_svg_workaround
guide_elem.set('href', 'calibre_raster_cover.jpg')
# Register the generated image as a sibling of the matched manifest item.
t = etree.SubElement(
elem[0].getparent(), OPF('item'), href=guide_elem.get('href'), id='calibre_raster_cover')
t.set('media-type', 'image/jpeg')
# guide_cover is checked as a path relative to the current directory.
if os.path.exists(guide_cover):
renderer = render_html_svg_workaround(guide_cover, log)
if renderer is not None:
with lopen('calibre_raster_cover.jpg', 'wb') as f:
f.write(renderer)
# Set the titlepage guide entry
self.set_guide_type(opf, 'titlepage', guide_cover, 'Title Page')
return removed
def find_opf(self):
    """Return the path of the OPF named in ``META-INF/container.xml``.

    The container file is opened relative to the current directory. Its
    ``rootfile`` elements are examined in document order; the first one
    whose media type is ``application/oebps-package+xml`` and whose
    ``full-path`` (joined onto ``getcwd()``) exists on disk wins.

    Returns ``None`` when nothing matches. Any exception raised while
    reading or parsing the container is printed and swallowed, which also
    results in ``None``.
    """
    from calibre.utils.xml_parse import safe_xml_fromstring

    def suffixed_attribute(node, suffix):
        # First attribute value whose name ends with *suffix*, else None.
        return next(
            (value for key, value in node.attrib.items()
             if key.endswith(suffix)),
            None)

    try:
        with lopen('META-INF/container.xml', 'rb') as container:
            root = safe_xml_fromstring(container.read())
            for rootfile in root.xpath('//*[local-name()="rootfile"]'):
                media_type = suffixed_attribute(rootfile, 'media-type')
                if media_type == "application/oebps-package+xml":
                    declared = suffixed_attribute(rootfile, 'full-path')
                    if declared:
                        candidate = os.path.join(
                            getcwd(), *declared.split('/'))
                        if os.path.exists(candidate):
                            return candidate
    except Exception:
        import traceback
        traceback.print_exc()
    return None
def convert(self, stream, options, file_ext, log, accelerators):
    """Unpack an EPUB into the current directory and write ``content.opf``.

    :param stream: file-like object holding the EPUB (ZIP) data
    :param options: conversion options, forwarded to ``convert_epub3_nav``
    :param file_ext: not used by this method
    :param log: logger, used for diagnostics
    :param accelerators: not used by this method
    :return: absolute path of the rewritten ``content.opf``
    :raises ValueError: no OPF could be found, the book uses DTBook
        markup, or the spine ends up empty
    :raises DRMError: ``META-INF/encryption.xml`` exists and
        ``process_encryption`` reports failure
    """
    from calibre.utils.zipfile import ZipFile
    from calibre import walk
    from calibre.ebooks import DRMError
    from calibre.ebooks.metadata.opf2 import OPF
    try:
        zf = ZipFile(stream)
        zf.extractall(getcwd())
    except Exception:
        # Only real errors should trigger the fallback parser; a bare
        # ``except`` would also swallow KeyboardInterrupt/SystemExit.
        log.exception('EPUB appears to be invalid ZIP file, trying a'
                ' more forgiving ZIP parser')
        from calibre.utils.localunzip import extractall
        stream.seek(0)
        extractall(stream)
    encfile = os.path.abspath(os.path.join('META-INF', 'encryption.xml'))
    opf = self.find_opf()
    if opf is None:
        # container.xml was no help: take the first plausible .opf on disk.
        for f in walk('.'):
            if f.lower().endswith('.opf') and '__MACOSX' not in f and \
                    not os.path.basename(f).startswith('.'):
                opf = os.path.abspath(f)
                break
    path = getattr(stream, 'name', 'stream')

    if opf is None:
        raise ValueError('%s is not a valid EPUB file (could not find opf)'%path)

    opf = os.path.relpath(opf, getcwd())
    parts = os.path.split(opf)
    opf = OPF(opf, os.path.dirname(os.path.abspath(opf)))

    self._encrypted_font_uris = []
    if os.path.exists(encfile):
        if not self.process_encryption(encfile, opf, log):
            raise DRMError(os.path.basename(path))
    self.encrypted_fonts = self._encrypted_font_uris

    if len(parts) > 1 and parts[0]:
        # The OPF lives in a sub-directory: prefix every href with that
        # directory so it is expressed relative to the working directory,
        # where content.opf is written below.
        delta = '/'.join(parts[:-1])+'/'

        def normpath(x):
            return posixpath.normpath(delta + x)

        for elem in opf.itermanifest():
            href = elem.get('href')
            if href is not None:
                elem.set('href', normpath(href))
        for elem in opf.iterguide():
            href = elem.get('href')
            if href is not None:
                elem.set('href', normpath(href))

    f = self.rationalize_cover3 if opf.package_version >= 3.0 else self.rationalize_cover2
    self.removed_cover = f(opf, log)
    if self.removed_cover:
        self.removed_items_to_ignore = (self.removed_cover,)
    epub3_nav = opf.epub3_nav
    if epub3_nav is not None:
        self.convert_epub3_nav(epub3_nav, opf, log, options)

    for x in opf.itermanifest():
        if x.get('media-type', '') == 'application/x-dtbook+xml':
            raise ValueError(
                'EPUB files with DTBook markup are not supported')

    # Manifest ids that must never appear in the spine.
    not_for_spine = set()
    for y in opf.itermanifest():
        id_ = y.get('id', None)
        if id_:
            mt = y.get('media-type', None)
            if mt in {
                    'application/vnd.adobe-page-template+xml',
                    'application/vnd.adobe.page-template+xml',
                    'application/adobe-page-template+xml',
                    'application/adobe.page-template+xml',
                    'application/text'
            }:
                not_for_spine.add(id_)
            ext = y.get('href', '').rpartition('.')[-1].lower()
            if mt == 'text/plain' and ext in {'otf', 'ttf'}:
                # some epub authoring software sets font mime types to
                # text/plain
                not_for_spine.add(id_)
                y.set('media-type', 'application/font')

    # Drop spine entries that are empty, excluded above, or duplicated.
    seen = set()
    for x in list(opf.iterspine()):
        ref = x.get('idref', None)
        if not ref or ref in not_for_spine or ref in seen:
            x.getparent().remove(x)
            continue
        seen.add(ref)

    if not list(opf.iterspine()):
        raise ValueError('No valid entries in the spine of this EPUB')

    with lopen('content.opf', 'wb') as nopf:
        nopf.write(opf.render())

    return os.path.abspath('content.opf')
def convert_epub3_nav(self, nav_path, opf, log, opts):
# Build an NCX table of contents from the EPUB 3 navigation document at
# nav_path, add it to the OPF manifest and point every <spine> at it.
# Also stores the nav href and the parsed nav tree on opts
# (opts.epub3_nav_href / opts.epub3_nav_parsed).
from lxml import etree
from calibre.ebooks.chardet import xml_to_unicode
from calibre.ebooks.oeb.polish.parsing import parse
from calibre.ebooks.oeb.base import EPUB_NS, XHTML, NCX_MIME, NCX, urlnormalize, urlunquote, serialize
from calibre.ebooks.oeb.polish.toc import first_child
from calibre.utils.xml_parse import safe_xml_fromstring
from tempfile import NamedTemporaryFile
with lopen(nav_path, 'rb') as f:
raw = f.read()
raw = xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0]
root = parse(raw, log=log)
# Empty NCX skeleton; navPoints are appended to its <navMap> (ncx[0]).
ncx = safe_xml_fromstring('<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1" xml:lang="eng"><navMap/></ncx>')
navmap = ncx[0]
# Qualified name of the epub:type attribute.
et = '{%s}type' % EPUB_NS
bn = os.path.basename(nav_path)
# Create one navPoint under parent from an <li>; label text and href come
# from its <a>/<span> child.
def add_from_li(li, parent):
href = text = None
for x in li.iterchildren(XHTML('a'), XHTML('span')):
# Prefer the element's text content, falling back to title attributes.
text = etree.tostring(
x, method='text', encoding='unicode', with_tail=False).strip() or ' '.join(
x.xpath('descendant-or-self::*/@title')).strip()
href = x.get('href')
if href:
# A bare fragment refers to the nav document itself.
if href.startswith('#'):
href = bn + href
break
np = parent.makeelement(NCX('navPoint'))
parent.append(np)
np.append(np.makeelement(NCX('navLabel')))
np[0].append(np.makeelement(NCX('text')))
np[0][0].text = text
if href:
np.append(np.makeelement(NCX('content'), attrib={'src':href}))
return np
# Recursively mirror nested <ol>/<li> lists as nested navPoints.
def process_nav_node(node, toc_parent):
for li in node.iterchildren(XHTML('li')):
child = add_from_li(li, toc_parent)
ol = first_child(li, XHTML('ol'))
if child is not None and ol is not None:
process_nav_node(ol, child)
# Look for the <nav epub:type="toc"> element and convert its <ol>.
for nav in root.iterdescendants(XHTML('nav')):
if nav.get(et) == 'toc':
ol = first_child(nav, XHTML('ol'))
if ol is not None:
process_nav_node(ol, navmap)
break
# NOTE(review): presumably a for/else, i.e. give up when the loop above
# never hit its break - confirm against the indented source.
else:
return
# delete=False: the generated .ncx must outlive this method because the
# manifest now refers to it.
with NamedTemporaryFile(suffix='.ncx', dir=os.path.dirname(nav_path), delete=False) as f:
f.write(etree.tostring(ncx, encoding='utf-8'))
ncx_href = os.path.relpath(f.name, getcwd()).replace(os.sep, '/')
ncx_id = opf.create_manifest_item(ncx_href, NCX_MIME, append=True).get('id')
for spine in opf.root.xpath('//*[local-name()="spine"]'):
spine.set('toc', ncx_id)
opts.epub3_nav_href = urlnormalize(os.path.relpath(nav_path).replace(os.sep, '/'))
opts.epub3_nav_parsed = root
# When a cover page was removed earlier, tag every nav link that points at
# it and write the modified nav document back to nav_path.
if getattr(self, 'removed_cover', None):
changed = False
base_path = os.path.dirname(nav_path)
for elem in root.xpath('//*[@href]'):
# frag is unpacked but not used below.
href, frag = elem.get('href').partition('#')[::2]
# NOTE(review): link_path is computed relative to the nav document's own
# directory; verify that self.removed_cover uses the same base.
link_path = os.path.relpath(os.path.join(base_path, urlunquote(href)), base_path)
abs_href = urlnormalize(link_path)
if abs_href == self.removed_cover:
changed = True
elem.set('data-calibre-removed-titlepage', '1')
if changed:
with lopen(nav_path, 'wb') as f:
f.write(serialize(root, 'application/xhtml+xml'))
def postprocess_book(self, oeb, opts, log):
    """Remember which TOC entry pointed at a cover page that was removed.

    If ``self.removed_cover`` is set, the first TOC descendant whose href
    (ignoring any ``#fragment``) equals it is stored on
    ``oeb.toc.item_that_refers_to_cover``, provided that file is not part
    of ``oeb.spine``.

    :param oeb: book object exposing ``toc`` and ``spine``
    :param opts: not used by this method
    :param log: not used by this method
    """
    rc = getattr(self, 'removed_cover', None)
    if not rc:
        return
    cover_toc_item = next(
        (item for item in oeb.toc.iterdescendants()
         if item.href and item.href.partition('#')[0] == rc),
        None)
    if cover_toc_item is None:
        return
    spine_hrefs = {x.href for x in oeb.spine}
    # Compare the href, not the TOC node: the set holds href strings, so
    # testing the node itself for membership could never succeed.
    if rc not in spine_hrefs:
        oeb.toc.item_that_refers_to_cover = cover_toc_item

View File

@@ -0,0 +1,548 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import os, shutil, re
from calibre.customize.conversion import (OutputFormatPlugin,
OptionRecommendation)
from calibre.ptempfile import TemporaryDirectory
from calibre import CurrentDir
from polyglot.builtins import unicode_type, filter, map, zip, range, as_bytes
# Tag names regarded as block-level when deciding how tall the replacement
# for a <br> directly under <body> should be.
block_level_tags = (
    'address', 'body', 'blockquote', 'center', 'dir', 'div', 'dl',
    'fieldset', 'form',
    'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
    'hr', 'isindex', 'menu', 'noframes', 'noscript',
    'ol', 'p', 'pre', 'table', 'ul',
)
# Output plugin that writes an OEB book out as an EPUB file.
class EPUBOutput(OutputFormatPlugin):
name = 'EPUB Output'
author = 'Kovid Goyal'
file_type = 'epub'
commit_name = 'epub_output'
# The version strings double as the allowed choices of the epub_version
# option declared below.
ui_data = {'versions': ('2', '3')}
# Options declared by this plugin; the methods below read them back as
# attributes of the opts object (e.g. opts.epub_flatten, opts.flow_size).
options = {
OptionRecommendation(name='extract_to',
help=_('Extract the contents of the generated %s file to the '
'specified directory. The contents of the directory are first '
'deleted, so be careful.') % 'EPUB'),
OptionRecommendation(name='dont_split_on_page_breaks',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Turn off splitting at page breaks. Normally, input '
'files are automatically split at every page break into '
'two files. This gives an output e-book that can be '
'parsed faster and with less resources. However, '
'splitting is slow and if your source file contains a '
'very large number of page breaks, you should turn off '
'splitting on page breaks.'
)
),
# flow_size is given in KB; convert() multiplies it by 1024.
OptionRecommendation(name='flow_size', recommended_value=260,
help=_('Split all HTML files larger than this size (in KB). '
'This is necessary as most EPUB readers cannot handle large '
'file sizes. The default of %defaultKB is the size required '
'for Adobe Digital Editions. Set to 0 to disable size based splitting.')
),
OptionRecommendation(name='no_default_epub_cover', recommended_value=False,
help=_('Normally, if the input file has no cover and you don\'t'
' specify one, a default cover is generated with the title, '
'authors, etc. This option disables the generation of this cover.')
),
OptionRecommendation(name='no_svg_cover', recommended_value=False,
help=_('Do not use SVG for the book cover. Use this option if '
'your EPUB is going to be used on a device that does not '
'support SVG, like the iPhone or the JetBook Lite. '
'Without this option, such devices will display the cover '
'as a blank page.')
),
OptionRecommendation(name='preserve_cover_aspect_ratio',
recommended_value=False, help=_(
'When using an SVG cover, this option will cause the cover to scale '
'to cover the available screen area, but still preserve its aspect ratio '
'(ratio of width to height). That means there may be white borders '
'at the sides or top and bottom of the image, but the image will '
'never be distorted. Without this option the image may be slightly '
'distorted, but there will be no borders.'
)
),
OptionRecommendation(name='epub_flatten', recommended_value=False,
help=_('This option is needed only if you intend to use the EPUB'
' with FBReaderJ. It will flatten the file system inside the'
' EPUB, putting all files into the top level.')
),
OptionRecommendation(name='epub_inline_toc', recommended_value=False,
help=_('Insert an inline Table of Contents that will appear as part of the main book content.')
),
OptionRecommendation(name='epub_toc_at_end', recommended_value=False,
help=_('Put the inserted inline Table of Contents at the end of the book instead of the start.')
),
OptionRecommendation(name='toc_title', recommended_value=None,
help=_('Title for any generated in-line table of contents.')
),
OptionRecommendation(name='epub_version', recommended_value='2', choices=ui_data['versions'],
help=_('The version of the EPUB file to generate. EPUB 2 is the'
' most widely compatible, only use EPUB 3 if you know you'
' actually need it.')
),
}
# pretty_print is also consulted by condense_ncx() further down.
recommendations = {('pretty_print', True, OptionRecommendation.HIGH)}
def workaround_webkit_quirks(self):  # {{{
    """Retag every completely empty ``<pre>`` in the spine as ``div``.

    A ``<pre>`` counts as empty when it has neither text nor child
    elements. Spine items without a usable ``<body>`` are left untouched.
    """
    from calibre.ebooks.oeb.base import XPath
    for item in self.oeb.spine:
        bodies = XPath('//h:body')(item.data)
        body = bodies[0] if bodies else bodies
        if hasattr(body, 'xpath'):
            for pre in XPath('//h:pre')(body):
                if pre.text or len(pre) != 0:
                    continue
                pre.tag = 'div'
# }}}
def upshift_markup(self):  # {{{
    """Nudge spine documents towards XHTML 1.1 compliance.

    For each spine item: copy a plain ``lang`` attribute on the root to
    ``xml:lang`` when the latter is missing, retag ``<u>`` as ``span`` and
    strip ``id``/``name`` attributes whose value has already been seen
    earlier in the same document. Items without a usable ``<body>`` only
    get the ``lang`` fix.
    """
    from calibre.ebooks.oeb.base import XPath, XML

    def keep_first(elem, attr_name, seen):
        # Remove attr_name from elem if its value was already recorded.
        value = elem.get(attr_name, None)
        if not value:
            return
        if value in seen:
            del elem.attrib[attr_name]
        else:
            seen.add(value)

    for item in self.oeb.spine:
        root = item.data
        plain_lang = root.get('lang')
        if not root.get(XML('lang')) and plain_lang:
            root.set(XML('lang'), plain_lang)

        bodies = XPath('//h:body')(root)
        body = bodies[0] if bodies else bodies
        if not hasattr(body, 'xpath'):
            continue

        for underline in XPath('//h:u')(root):
            underline.tag = 'span'

        seen_ids, seen_names = set(), set()
        for elem in XPath('//*[@id or @name]')(root):
            keep_first(elem, 'id', seen_ids)
            keep_first(elem, 'name', seen_names)
# }}}
def convert(self, oeb, output_path, input_plugin, opts, log):
    """Write the OEB book *oeb* to *output_path* as an EPUB.

    The book is first massaged (optional inline TOC, filename
    normalisation, reader workarounds, image rescaling, splitting, cover
    handling), then serialised through the ``oeb`` output plugin into a
    temporary directory which is finally zipped into the EPUB container.

    :param oeb: the book to write; modified in place
    :param output_path: destination of the EPUB file
    :param input_plugin: plugin that produced *oeb*; its optional
        ``encrypted_fonts`` attribute lists fonts to obfuscate
    :param opts: conversion options (see ``options`` on this class)
    :param log: logger
    """
    self.log, self.opts, self.oeb = log, opts, oeb

    if self.opts.epub_inline_toc:
        from calibre.ebooks.mobi.writer8.toc import TOCAdder
        opts.mobi_toc_at_start = not opts.epub_toc_at_end
        opts.mobi_passthrough = False
        opts.no_inline_toc = False
        TOCAdder(oeb, opts, replace_previous_inline_toc=True, ignore_existing_toc=True)

    if self.opts.epub_flatten:
        from calibre.ebooks.oeb.transforms.filenames import FlatFilenames
        FlatFilenames()(oeb, opts)
    else:
        from calibre.ebooks.oeb.transforms.filenames import UniqueFilenames
        UniqueFilenames()(oeb, opts)

    self.workaround_ade_quirks()
    self.workaround_webkit_quirks()
    self.upshift_markup()
    from calibre.ebooks.oeb.transforms.rescale import RescaleImages
    RescaleImages(check_colorspaces=True)(oeb, opts)

    from calibre.ebooks.oeb.transforms.split import Split
    # flow_size is expressed in KB, Split takes the limit multiplied out.
    split = Split(not self.opts.dont_split_on_page_breaks,
            max_flow_size=self.opts.flow_size*1024
            )
    split(self.oeb, self.opts)

    from calibre.ebooks.oeb.transforms.cover import CoverManager
    cm = CoverManager(
            no_default_cover=self.opts.no_default_epub_cover,
            no_svg_cover=self.opts.no_svg_cover,
            preserve_aspect_ratio=self.opts.preserve_cover_aspect_ratio)
    cm(self.oeb, self.opts, self.log)

    self.workaround_sony_quirks()

    if self.oeb.toc.count() == 0:
        self.log.warn('This EPUB file has no Table of Contents. '
                'Creating a default TOC')
        first = next(iter(self.oeb.spine))
        self.oeb.toc.add(_('Start'), first.href)

    from calibre.ebooks.oeb.base import OPF
    identifiers = oeb.metadata['identifier']
    uuid = None
    for x in identifiers:
        # The scheme attribute is optional; a missing one must not crash
        # with "NoneType has no attribute lower".
        scheme = x.get(OPF('scheme'), None) or ''
        if scheme.lower() == 'uuid' or unicode_type(x).startswith('urn:uuid:'):
            uuid = unicode_type(x).split(':')[-1]
            break
    encrypted_fonts = getattr(input_plugin, 'encrypted_fonts', [])

    if uuid is None:
        self.log.warn('No UUID identifier found')
        from uuid import uuid4
        uuid = unicode_type(uuid4())
        oeb.metadata.add('identifier', uuid, scheme='uuid', id=uuid)

    if encrypted_fonts and not uuid.startswith('urn:uuid:'):
        # Apparently ADE requires this value to start with urn:uuid:
        # for some absurd reason, or it will throw a hissy fit and refuse
        # to use the obfuscated fonts.
        for x in identifiers:
            if unicode_type(x) == uuid:
                x.content = 'urn:uuid:'+uuid

    with TemporaryDirectory('_epub_output') as tdir:
        from calibre.customize.ui import plugin_for_output_format
        metadata_xml = None
        extra_entries = []
        if self.is_periodical:
            if self.opts.output_profile.epub_periodical_format == 'sony':
                from calibre.ebooks.epub.periodical import sony_metadata
                metadata_xml, atom_xml = sony_metadata(oeb)
                extra_entries = [('atom.xml', 'application/atom+xml', atom_xml)]
        oeb_output = plugin_for_output_format('oeb')
        oeb_output.convert(oeb, tdir, input_plugin, opts, log)
        opf = [x for x in os.listdir(tdir) if x.endswith('.opf')][0]
        self.condense_ncx([os.path.join(tdir, x) for x in os.listdir(tdir)
                if x.endswith('.ncx')][0])
        if self.opts.epub_version == '3':
            self.upgrade_to_epub3(tdir, opf)
        encryption = None
        if encrypted_fonts:
            encryption = self.encrypt_fonts(encrypted_fonts, tdir, uuid)

        from calibre.ebooks.epub import initialize_container
        with initialize_container(output_path, os.path.basename(opf),
                extra_entries=extra_entries) as epub:
            epub.add_dir(tdir)
            if encryption is not None:
                epub.writestr('META-INF/encryption.xml', as_bytes(encryption))
            if metadata_xml is not None:
                epub.writestr('META-INF/metadata.xml',
                        metadata_xml.encode('utf-8'))
        if opts.extract_to is not None:
            from calibre.utils.zipfile import ZipFile
            # Whatever currently sits at extract_to is deleted first.
            if os.path.exists(opts.extract_to):
                if os.path.isdir(opts.extract_to):
                    shutil.rmtree(opts.extract_to)
                else:
                    os.remove(opts.extract_to)
            os.mkdir(opts.extract_to)
            with ZipFile(output_path) as zf:
                zf.extractall(path=opts.extract_to)
            self.log.info('EPUB extracted to', opts.extract_to)
def upgrade_to_epub3(self, tdir, opf):
    """Convert the unpacked EPUB 2 book in *tdir* to EPUB 3 in place.

    A throw-away ``META-INF/container.xml`` pointing at *opf* is written so
    the directory can be opened as a container; it (and the ``META-INF``
    directory, if that can be removed) is deleted again once the upgrade
    has been committed. A nav document recorded on ``self.opts`` by the
    input side is passed on as ``previous_nav``.
    """
    self.log.info('Upgrading to EPUB 3...')
    from calibre.ebooks.epub import simple_container_xml
    from calibre.ebooks.oeb.polish.cover import fix_conversion_titlepage_links_in_nav

    meta_inf = os.path.join(tdir, 'META-INF')
    try:
        os.mkdir(meta_inf)
    except EnvironmentError:
        pass
    container_xml = os.path.join(tdir, 'META-INF', 'container.xml')
    payload = simple_container_xml(os.path.basename(opf)).encode('utf-8')
    with open(container_xml, 'wb') as handle:
        handle.write(payload)

    from calibre.ebooks.oeb.polish.container import EpubContainer
    container = EpubContainer(tdir, self.log)
    from calibre.ebooks.oeb.polish.upgrade import epub_2_to_3
    existing_nav = getattr(self.opts, 'epub3_nav_parsed', None)
    nav_href = getattr(self.opts, 'epub3_nav_href', None)
    if existing_nav and nav_href:
        previous_nav = (nav_href, existing_nav)
    else:
        previous_nav = None
    epub_2_to_3(container, self.log.info, previous_nav=previous_nav)
    fix_conversion_titlepage_links_in_nav(container)
    container.commit()

    os.remove(container_xml)
    try:
        os.rmdir(meta_inf)
    except EnvironmentError:
        pass
def encrypt_fonts(self, uris, tdir, uuid):  # {{{
    """Obfuscate the given fonts in place and build ``encryption.xml``.

    The first 1024 bytes of every font are XOR-ed with a 16 byte key
    derived from the hexadecimal digits of *uuid*.

    :param uris: '/'-separated font paths relative to *tdir*
    :param tdir: directory holding the unpacked book
    :param uuid: book identifier the key is derived from
    :return: the text of ``encryption.xml``, or ``None`` when none of the
        fonts exists
    :raises ValueError: *uuid* contains fewer than 16 hex digits
    """
    from xml.sax.saxutils import escape
    from polyglot.binary import from_hex_bytes

    key = re.sub(r'[^a-fA-F0-9]', '', uuid)
    if len(key) < 16:
        raise ValueError('UUID identifier %r is invalid'%uuid)
    key = bytearray(from_hex_bytes((key + key)[:32]))

    entry_template = (
        '\n'
        '<enc:EncryptedData>\n'
        '<enc:EncryptionMethod Algorithm="http://ns.adobe.com/pdf/enc#RC"/>\n'
        '<enc:CipherData>\n'
        '<enc:CipherReference URI="%s"/>\n'
        '</enc:CipherData>\n'
        '</enc:EncryptedData>\n'
    )

    with CurrentDir(tdir):
        paths = [os.path.join(*x.split('/')) for x in uris]
        uris = dict(zip(uris, paths))
        fonts = []
        for uri in list(uris.keys()):
            path = uris[uri]
            if not os.path.exists(path):
                uris.pop(uri)
                continue
            self.log.debug('Encrypting font:', uri)
            with lopen(path, 'r+b') as f:
                data = f.read(1024)
                if len(data) >= 1024:
                    data = bytearray(data)
                    f.seek(0)
                    f.write(bytes(bytearray(data[i] ^ key[i%16] for i in range(1024))))
                else:
                    self.log.warn('Font', path, 'is invalid, ignoring')
            if not isinstance(uri, unicode_type):
                uri = uri.decode('utf-8')
            # XML has no backslash escapes: quotes, '&' and '<' in the URI
            # have to become entities or the document is not well-formed.
            fonts.append(entry_template % escape(uri, {'"': '&quot;'}))
        if fonts:
            ans = (
                '<encryption\n'
                'xmlns="urn:oasis:names:tc:opendocument:xmlns:container"\n'
                'xmlns:enc="http://www.w3.org/2001/04/xmlenc#"\n'
                'xmlns:deenc="http://ns.adobe.com/digitaleditions/enc">\n'
            )
            ans += '\n'.join(fonts)
            ans += '\n</encryption>'
            return ans
    # Nothing was obfuscated; the caller skips encryption.xml on None.
    return None
# }}}
def condense_ncx(self, ncx_path):  # {{{
    """Strip surrounding whitespace from an NCX file unless pretty printing.

    When ``self.opts.pretty_print`` is false, the text and tail of every
    element in the file at *ncx_path* are stripped and the file is
    rewritten as UTF-8. Otherwise the file is left alone.
    """
    from lxml import etree
    if self.opts.pretty_print:
        return
    root = etree.parse(ncx_path).getroot()
    for node in root.iter(tag=etree.Element):
        for field in ('text', 'tail'):
            value = getattr(node, field)
            if value:
                setattr(node, field, value.strip())
    serialized = etree.tostring(root, encoding='utf-8')
    with open(ncx_path, 'wb') as out:
        out.write(serialized)
# }}}
def workaround_ade_quirks(self): # {{{
'''
Perform various markup transforms to get the output to render correctly
in the quirky ADE.

Works on self.oeb in place: TOC hrefs, the markup of every spine item
and, when present, rules of the main stylesheet are modified.
'''
from calibre.ebooks.oeb.base import XPath, XHTML, barename, urlunquote
# May be None; the CSS fixes at the end are skipped in that case.
stylesheet = self.oeb.manifest.main_stylesheet
# ADE cries big wet tears when it encounters an invalid fragment
# identifier in the NCX toc.
# A fragment passes only if it consists solely of ASCII letters, digits
# and the characters - _ : .
frag_pat = re.compile(r'[-A-Za-z0-9_:.]+$')
for node in self.oeb.toc.iter():
href = getattr(node, 'href', None)
if hasattr(href, 'partition'):
# '_' receives the separator and is discarded (the name is rebound
# locally for the rest of this method).
base, _, frag = href.partition('#')
frag = urlunquote(frag)
if frag and frag_pat.match(frag) is None:
self.log.warn(
'Removing fragment identifier %r from TOC as Adobe Digital Editions cannot handle it'%frag)
node.href = base
for x in self.oeb.spine:
root = x.data
body = XPath('//h:body')(root)
if body:
body = body[0]
if hasattr(body, 'xpath'):
# remove <img> tags with empty src elements
# ('#' and http: sources are treated the same way)
bad = []
for x in XPath('//h:img')(body):
src = x.get('src', '').strip()
if src in ('', '#') or src.startswith('http:'):
bad.append(x)
for img in bad:
img.getparent().remove(img)
# Add id attribute to <a> tags that have name
for x in XPath('//h:a[@name]')(body):
if not x.get('id', False):
x.set('id', x.get('name'))
# The delightful epubcheck has started complaining about <a> tags that
# have name attributes.
x.attrib.pop('name')
# Replace <br> that are children of <body> as ADE doesn't handle them
for br in XPath('./h:br')(body):
if br.getparent() is None:
continue
# Find what immediately precedes this <br>: the previous sibling and its
# tail text, or - when there is no previous sibling - the body itself.
try:
prior = next(br.itersiblings(preceding=True))
priortag = barename(prior.tag)
priortext = prior.tail
except:
priortag = 'body'
priortext = body.text
if priortext:
priortext = priortext.strip()
# The <br> becomes a <p> holding a single no-break space (U+00A0).
br.tag = XHTML('p')
br.text = '\u00a0'
# Keep any existing inline style declarations and append our own.
style = br.get('style', '').split(';')
style = list(filter(None, map(lambda x: x.strip(), style)))
style.append('margin:0pt; border:0pt')
# If the prior tag is a block (including a <br> we replaced)
# then this <br> replacement should have a 1-line height.
# Otherwise it should have no height.
if not priortext and priortag in block_level_tags:
style.append('height:1em')
else:
style.append('height:0pt')
br.set('style', '; '.join(style))
# <embed> is always removed; <object> is removed unless it is an SVG image.
for tag in XPath('//h:embed')(root):
tag.getparent().remove(tag)
for tag in XPath('//h:object')(root):
if tag.get('type', '').lower().strip() in {'image/svg+xml', 'application/svg+xml'}:
continue
tag.getparent().remove(tag)
# Drop <title>/<style> elements that have no text.
for tag in XPath('//h:title|//h:style')(root):
if not tag.text:
tag.getparent().remove(tag)
# Drop <script> elements with neither text nor src, except MathJax
# configuration blocks; scripts anywhere inside <body> always go.
for tag in XPath('//h:script')(root):
if (not tag.text and not tag.get('src', False) and tag.get('type', None) != 'text/x-mathjax-config'):
tag.getparent().remove(tag)
for tag in XPath('//h:body/descendant::h:script')(root):
tag.getparent().remove(tag)
# A <form> with direct form-control children is removed outright; any
# other <form> is merely retagged as a <div>.
formchildren = XPath('./h:input|./h:button|./h:textarea|'
'./h:label|./h:fieldset|./h:legend')
for tag in XPath('//h:form')(root):
if formchildren(tag):
tag.getparent().remove(tag)
else:
# Not a real form
tag.tag = XHTML('div')
for tag in XPath('//h:center')(root):
tag.tag = XHTML('div')
tag.set('style', 'text-align:center')
# ADE can't handle &amp; in an img url
# NOTE(review): this deletes every '&' from the src instead of escaping
# it - confirm that the resulting URL is still the intended one.
for tag in XPath('//h:img[@src]')(root):
tag.set('src', tag.get('src', '').replace('&', ''))
# ADE whimpers in fright when it encounters a <td> outside a
# <table>
in_table = XPath('ancestor::h:table')
for tag in XPath('//h:td|//h:tr|//h:th')(root):
if not in_table(tag):
tag.tag = XHTML('div')
# ADE fails to render non breaking hyphens/soft hyphens/zero width spaces
# U+200B (zero width space) and U+00AD (soft hyphen) are deleted,
# U+2011 (non-breaking hyphen) is replaced by a plain '-'.
special_chars = re.compile('[\u200b\u00ad]')
for elem in root.iterdescendants('*'):
if elem.text:
elem.text = special_chars.sub('', elem.text)
elem.text = elem.text.replace('\u2011', '-')
if elem.tail:
elem.tail = special_chars.sub('', elem.tail)
elem.tail = elem.tail.replace('\u2011', '-')
if stylesheet is not None:
# ADE doesn't render lists correctly if they have left margins
from css_parser.css import CSSRule
# Only rules whose whole selector text is exactly '.<class attribute>'
# of a list element are touched.
for lb in XPath('//h:ul[@class]|//h:ol[@class]')(root):
sel = '.'+lb.get('class')
for rule in stylesheet.data.cssRules.rulesOfType(CSSRule.STYLE_RULE):
if sel == rule.selectorList.selectorText:
rule.style.removeProperty('margin-left')
# padding-left breaks rendering in webkit and gecko
rule.style.removeProperty('padding-left')
# Change whitespace:pre to pre-wrap to accommodate readers that
# cannot scroll horizontally
for rule in stylesheet.data.cssRules.rulesOfType(CSSRule.STYLE_RULE):
style = rule.style
ws = style.getPropertyValue('white-space')
if ws == 'pre':
style.setProperty('white-space', 'pre-wrap')
# }}}
def workaround_sony_quirks(self):  # {{{
    '''
    Shorten TOC links: when the anchor a TOC entry points to sits at the
    very top of its spine document, the fragment is dropped from the href.
    The whole TOC tree is visited.
    '''
    from calibre.ebooks.oeb.base import urldefrag, XPath
    from calibre.ebooks.oeb.polish.toc import item_at_top

    def frag_is_at_top(root, frag):
        hits = XPath('//*[@id="%s" or @name="%s"]'%(frag, frag))(root)
        return item_at_top(hits[0]) if hits else False

    def simplify_toc_entry(toc):
        if toc.href:
            href, frag = urldefrag(toc.href)
            if frag:
                # Only the first spine item with this href is examined.
                target = next(
                    (item for item in self.oeb.spine if item.href == href),
                    None)
                if target is not None and frag_is_at_top(target.data, frag):
                    self.log.debug('Removing anchor from TOC href:',
                            href+'#'+frag)
                    toc.href = href
        for child in toc:
            simplify_toc_entry(child)

    if self.oeb.toc:
        simplify_toc_entry(self.oeb.toc)
# }}}

View File

@@ -0,0 +1,179 @@
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2008, Anatoly Shipitsin <norguhtar at gmail.com>'
"""
Convert .fb2 and .fbz files to HTML
"""
import os, re
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
from calibre import guess_type
from polyglot.builtins import iteritems, getcwd
# XML namespace URIs of FictionBook 2.0 and 2.1. FB2NS is the fallback used
# by FB2Input.convert when the namespace cannot be read from the document.
FB2NS = 'http://www.gribuser.ru/xml/fictionbook/2.0'
FB21NS = 'http://www.gribuser.ru/xml/fictionbook/2.1'
# Input plugin for FictionBook files (.fb2 and .fbz).
class FB2Input(InputFormatPlugin):
name = 'FB2 Input'
author = 'Anatoly Shipitsin'
description = 'Convert FB2 and FBZ files to HTML'
file_types = {'fb2', 'fbz'}
commit_name = 'fb2_input'
# Recommended XPath expressions for the three TOC levels: h1, h2 and h3
# headings respectively.
recommendations = {
('level1_toc', '//h:h1', OptionRecommendation.MED),
('level2_toc', '//h:h2', OptionRecommendation.MED),
('level3_toc', '//h:h3', OptionRecommendation.MED),
}
# Read in convert() as options.no_inline_fb2_toc.
options = {
OptionRecommendation(name='no_inline_fb2_toc',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Do not insert a Table of Contents at the beginning of the book.'
)
)}
def convert(self, stream, options, file_ext, log,
            accelerators):
    """Convert an FB2 book to XHTML plus an OPF in the current directory.

    Writes ``index.xhtml``, ``inline-styles.css``, ``metadata.opf`` and
    every embedded binary into the working directory.

    :param stream: file-like object with the FB2 data; rewound and read a
        second time for metadata
    :param options: conversion options; ``no_inline_fb2_toc`` is consulted
    :param file_ext: not used by this method
    :param log: logger, also stored on ``self.log``
    :param accelerators: not used by this method
    :return: path of the generated ``metadata.opf``
    :raises ValueError: the document parses to nothing
    """
    from lxml import etree
    from calibre.utils.xml_parse import safe_xml_fromstring
    from calibre.ebooks.metadata.fb2 import ensure_namespace, get_fb2_data
    from calibre.ebooks.metadata.opf2 import OPFCreator
    from calibre.ebooks.metadata.meta import get_metadata
    from calibre.ebooks.oeb.base import XLINK_NS, XHTML_NS
    from calibre.ebooks.chardet import xml_to_unicode
    self.log = log
    log.debug('Parsing XML...')
    raw = get_fb2_data(stream)[0]
    raw = raw.replace(b'\0', b'')
    raw = xml_to_unicode(raw, strip_encoding_pats=True,
        assume_utf8=True, resolve_entities=True)[0]
    try:
        doc = safe_xml_fromstring(raw)
    except etree.XMLSyntaxError:
        # Retry with bare ampersands escaped. The trailing space has to be
        # kept, otherwise "A & B" would come out as "A &B".
        doc = safe_xml_fromstring(raw.replace('& ', '&amp; '))
    if doc is None:
        raise ValueError('The FB2 file is not valid XML')
    doc = ensure_namespace(doc)
    try:
        fb_ns = doc.nsmap[doc.prefix]
    except Exception:
        fb_ns = FB2NS

    NAMESPACES = {'f':fb_ns, 'l':XLINK_NS}
    stylesheets = doc.xpath('//*[local-name() = "stylesheet" and @type="text/css"]')
    css = ''.join(
        etree.tostring(s, encoding='unicode', method='text',
                       with_tail=False) + '\n\n'
        for s in stylesheets)
    if css:
        import css_parser, logging
        parser = css_parser.CSSParser(fetcher=None,
                log=logging.getLogger('calibre.css'))

        XHTML_CSS_NAMESPACE = '@namespace "%s";\n' % XHTML_NS
        text = XHTML_CSS_NAMESPACE + css
        log.debug('Parsing stylesheet...')
        stylesheet = parser.parseString(text)
        stylesheet.namespaces['h'] = XHTML_NS
        css = stylesheet.cssText
        if isinstance(css, bytes):
            css = css.decode('utf-8', 'replace')
        # Rewrite selectors written for FB2 markup: <style> elements
        # become <span> and name= attribute tests become class= tests.
        css = css.replace('h|style', 'h|span')
        css = re.sub(r'name\s*=\s*', 'class=', css)
    self.extract_embedded_content(doc)
    log.debug('Converting XML to HTML...')
    with open(P('templates/fb2.xsl'), 'rb') as f:
        ss = f.read().decode('utf-8')
    ss = ss.replace("__FB_NS__", fb_ns)
    if options.no_inline_fb2_toc:
        log('Disabling generation of inline FB2 TOC')
        ss = re.compile(r'<!-- BUILD TOC -->.*<!-- END BUILD TOC -->',
                re.DOTALL).sub('', ss)

    styledoc = safe_xml_fromstring(ss)

    transform = etree.XSLT(styledoc)
    result = transform(doc)

    # Handle links of type note and cite
    notes = {a.get('href')[1:]: a for a in result.xpath('//a[@link_note and @href]') if a.get('href').startswith('#')}
    cites = {a.get('link_cite'): a for a in result.xpath('//a[@link_cite]') if not a.get('href', '')}
    all_ids = {x for x in result.xpath('//*/@id')}
    for cite, a in iteritems(cites):
        note = notes.get(cite, None)
        # Test against None explicitly: an lxml element without children
        # is falsy, so a plain truth test would skip text-only anchors.
        if note is not None:
            c = 1
            while 'cite%d' % c in all_ids:
                c += 1
            if not note.get('id', None):
                note.set('id', 'cite%d' % c)
                all_ids.add(note.get('id'))
            a.set('href', '#%s' % note.get('id'))
    for x in result.xpath('//*[@link_note or @link_cite]'):
        x.attrib.pop('link_note', None)
        x.attrib.pop('link_cite', None)

    # Point images at the files written by extract_embedded_content().
    for img in result.xpath('//img[@src]'):
        src = img.get('src')
        img.set('src', self.binary_map.get(src, src))
    index = transform.tostring(result)
    with open('index.xhtml', 'wb') as f:
        f.write(index.encode('utf-8'))
    with open('inline-styles.css', 'wb') as f:
        f.write(css.encode('utf-8'))
    stream.seek(0)
    mi = get_metadata(stream, 'fb2')
    if not mi.title:
        mi.title = _('Unknown')
    if not mi.authors:
        mi.authors = [_('Unknown')]
    cpath = None
    if mi.cover_data and mi.cover_data[1]:
        with open('fb2_cover_calibre_mi.jpg', 'wb') as f:
            f.write(mi.cover_data[1])
        cpath = os.path.abspath('fb2_cover_calibre_mi.jpg')
    else:
        for img in doc.xpath('//f:coverpage/f:image', namespaces=NAMESPACES):
            href = img.get('{%s}href'%XLINK_NS, img.get('href', None))
            if href is not None:
                if href.startswith('#'):
                    href = href[1:]
                # The binary may have been saved under a different name
                # (extension appended), exactly as for <img> sources.
                href = self.binary_map.get(href, href)
                cpath = os.path.abspath(href)
                break

    opf = OPFCreator(getcwd(), mi)
    entries = [(f2, guess_type(f2)[0]) for f2 in os.listdir(u'.')]
    opf.create_manifest(entries)
    opf.create_spine(['index.xhtml'])
    if cpath:
        opf.guide.set_cover(cpath)
    with open('metadata.opf', 'wb') as f:
        opf.render(f)
    return os.path.join(getcwd(), 'metadata.opf')
def extract_embedded_content(self, doc):
    """Write the embedded ``binary`` children of *doc* to the working dir.

    Every direct child whose tag contains ``binary`` and which has text
    and an ``id`` is base64-decoded and saved under a name derived from
    its id. ``self.binary_map`` is rebuilt and maps ids to the file name
    actually used for images (png/jpeg/jpg, where an extension may be
    appended) and for ids whose name had to be sanitised.

    :param doc: root element of the parsed FB2 document
    """
    from calibre.ebooks.fb2 import base64_decode
    image_exts = ('png', 'jpeg', 'jpg')
    self.binary_map = {}
    for elem in doc.xpath('./*'):
        if not (elem.text and 'binary' in elem.tag and 'id' in elem.attrib):
            continue
        binary_id = elem.attrib['id']
        # The id comes straight from the book file. Keep only its last
        # path component so that something like "../../x" cannot make us
        # write outside the working directory.
        fname = os.path.basename(binary_id.replace('\\', '/'))
        if not fname or fname in ('.', '..'):
            self.log.warn('Binary data with id=%s has an unusable name, ignoring'%(
                binary_id))
            continue
        if fname != binary_id:
            self.binary_map[binary_id] = fname
        ct = elem.get('content-type', '')
        ext = ct.rpartition('/')[-1].lower()
        if ext in image_exts:
            if fname.lower().rpartition('.')[-1] not in image_exts:
                fname += '.' + ext
            self.binary_map[binary_id] = fname
        raw = elem.text.strip()
        try:
            data = base64_decode(raw)
        except (TypeError, ValueError):
            # ValueError is included on the assumption that the decoder
            # may report bad input that way (as the stdlib base64 module
            # does) - TODO confirm against base64_decode.
            self.log.exception('Binary data with id=%s is corrupted, ignoring'%(
                binary_id))
        else:
            with open(fname, 'wb') as f:
                f.write(data)

View File

@@ -0,0 +1,203 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import os
from calibre.customize.conversion import OutputFormatPlugin, OptionRecommendation
# Output plugin that writes an OEB book as a FictionBook 2 (.fb2) file.
class FB2Output(OutputFormatPlugin):
name = 'FB2 Output'
author = 'John Schember'
file_type = 'fb2'
commit_name = 'fb2_output'
# Genre identifiers offered as choices for the fb2_genre option, grouped by
# category; the trailing comment on each line is its human readable name.
FB2_GENRES = [
# Science Fiction & Fantasy
'sf_history', # Alternative history
'sf_action', # Action
'sf_epic', # Epic
'sf_heroic', # Heroic
'sf_detective', # Detective
'sf_cyberpunk', # Cyberpunk
'sf_space', # Space
'sf_social', # Social-philosophical
'sf_horror', # Horror & mystic
'sf_humor', # Humor
'sf_fantasy', # Fantasy
'sf', # Science Fiction
# Detectives & Thrillers
'det_classic', # Classical detectives
'det_police', # Police Stories
'det_action', # Action
'det_irony', # Ironical detectives
'det_history', # Historical detectives
'det_espionage', # Espionage detectives
'det_crime', # Crime detectives
'det_political', # Political detectives
'det_maniac', # Maniacs
'det_hard', # Hard-boiled
'thriller', # Thrillers
'detective', # Detectives
# Prose
'prose_classic', # Classics prose
'prose_history', # Historical prose
'prose_contemporary', # Contemporary prose
'prose_counter', # Counterculture
'prose_rus_classic', # Russian classics prose
'prose_su_classics', # Soviet classics prose
# Romance
'love_contemporary', # Contemporary Romance
'love_history', # Historical Romance
'love_detective', # Detective Romance
'love_short', # Short Romance
'love_erotica', # Erotica
# Adventure
'adv_western', # Western
'adv_history', # History
'adv_indian', # Indians
'adv_maritime', # Maritime Fiction
'adv_geo', # Travel & geography
'adv_animal', # Nature & animals
'adventure', # Other
# Children's
'child_tale', # Fairy Tales
'child_verse', # Verses
'child_prose', # Prose
'child_sf', # Science Fiction
'child_det', # Detectives & Thrillers
'child_adv', # Adventures
'child_education', # Educational
'children', # Other
# Poetry & Dramaturgy
'poetry', # Poetry
'dramaturgy', # Dramaturgy
# Antique literature
'antique_ant', # Antique
'antique_european', # European
'antique_russian', # Old Russian
'antique_east', # Old east
'antique_myths', # Myths. Legends. Epos
'antique', # Other
# Scientific-educational
'sci_history', # History
'sci_psychology', # Psychology
'sci_culture', # Cultural science
'sci_religion', # Religious studies
'sci_philosophy', # Philosophy
'sci_politics', # Politics
'sci_business', # Business literature
'sci_juris', # Jurisprudence
'sci_linguistic', # Linguistics
'sci_medicine', # Medicine
'sci_phys', # Physics
'sci_math', # Mathematics
'sci_chem', # Chemistry
'sci_biology', # Biology
'sci_tech', # Technical
'science', # Other
# Computers & Internet
'comp_www', # Internet
'comp_programming', # Programming
'comp_hard', # Hardware
'comp_soft', # Software
'comp_db', # Databases
'comp_osnet', # OS & Networking
'computers', # Other
# Reference
'ref_encyc', # Encyclopedias
'ref_dict', # Dictionaries
'ref_ref', # Reference
'ref_guide', # Guidebooks
'reference', # Other
# Nonfiction
'nonf_biography', # Biography & Memoirs
'nonf_publicism', # Publicism
'nonf_criticism', # Criticism
'design', # Art & design
'nonfiction', # Other
# Religion & Inspiration
'religion_rel', # Religion
'religion_esoterics', # Esoterics
'religion_self', # Self-improvement
'religion', # Other
# Humor
'humor_anecdote', # Anecdote (funny stories)
'humor_prose', # Prose
'humor_verse', # Verses
'humor', # Other
# Home & Family
'home_cooking', # Cooking
'home_pets', # Pets
'home_crafts', # Hobbies & Crafts
'home_entertain', # Entertaining
'home_health', # Health
'home_garden', # Garden
'home_diy', # Do it yourself
'home_sport', # Sports
'home_sex', # Erotica & sex
'home', # Other
]
# The keys of 'sectionize' are the allowed values of the sectionize option;
# the values are their translated descriptions, which are also substituted
# into that option's help text below.
ui_data = {
'sectionize': {
'toc': _('Section per entry in the ToC'),
'files': _('Section per file'),
'nothing': _('A single section')
},
'genres': FB2_GENRES,
}
# Options declared by this plugin.
options = {
OptionRecommendation(name='sectionize',
recommended_value='files', level=OptionRecommendation.LOW,
choices=list(ui_data['sectionize']),
help=_('Specify how sections are created:\n'
' * nothing: {nothing}\n'
' * files: {files}\n'
' * toc: {toc}\n'
'If ToC based generation fails, adjust the "Structure detection" and/or "Table of Contents" settings '
'(turn on "Force use of auto-generated Table of Contents").').format(**ui_data['sectionize'])
),
# The help text embeds the complete genre list, joined with ', '.
OptionRecommendation(name='fb2_genre',
recommended_value='antique', level=OptionRecommendation.LOW,
choices=FB2_GENRES,
help=(_('Genre for the book. Choices: %s\n\n See: ') % ', '.join(FB2_GENRES)
) + 'http://www.fictionbook.org/index.php/Eng:FictionBook_2.1_genres ' + _('for a complete list with descriptions.')),
}
def convert(self, oeb_book, output_path, input_plugin, opts, log):
    """Serialize ``oeb_book`` as FB2 and write it to ``output_path``.

    ``output_path`` is either a filesystem path (missing parent
    directories are created and the file is opened and closed here) or
    an object with a ``write`` method, which is rewound, truncated,
    written to and left open for the caller.
    """
    from calibre.ebooks.oeb.transforms.jacket import linearize_jacket
    from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer, Unavailable
    from calibre.ebooks.fb2.fb2ml import FB2MLizer

    try:
        rasterizer = SVGRasterizer()
        rasterizer(oeb_book, opts)
    except Unavailable:
        log.warn('SVG rasterizer unavailable, SVG will not be converted')

    linearize_jacket(oeb_book)

    fb2mlizer = FB2MLizer(log)
    fb2_content = fb2mlizer.extract_content(oeb_book, opts)

    close = False
    if not hasattr(output_path, 'write'):
        close = True
        out_dir = os.path.dirname(output_path)
        if out_dir != '' and not os.path.exists(out_dir):
            os.makedirs(out_dir)
        out_stream = lopen(output_path, 'wb')
    else:
        out_stream = output_path

    # Close a stream we opened ourselves even if writing fails, so the
    # file handle is not leaked on error.
    try:
        out_stream.seek(0)
        out_stream.truncate()
        out_stream.write(fb2_content.encode('utf-8', 'replace'))
    finally:
        if close:
            out_stream.close()

View File

@@ -0,0 +1,316 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import re, tempfile, os
from functools import partial
from calibre.constants import islinux, isbsd
from calibre.customize.conversion import (InputFormatPlugin,
OptionRecommendation)
from calibre.utils.localization import get_lang
from calibre.utils.filenames import ascii_filename
from calibre.utils.imghdr import what
from polyglot.builtins import unicode_type, zip, getcwd, as_unicode
def sanitize_file_name(x):
    """Return a cleaned-up version of the file name ``x``.

    The name is passed through ``ascii_filename``, the URL-significant
    characters ``?&=;#`` become underscores, runs of whitespace collapse
    to a single space, and stray whitespace/dots around the base name
    and the extension are removed.
    """
    ans = re.sub(r'\s+', ' ', re.sub(r'[?&=;#]', '_', ascii_filename(x))).strip().rstrip('.')
    stem, dot, ext = ans.rpartition('.')
    if not dot:
        # No extension: rpartition() returns ('', '', name). Joining that
        # with '.' used to turn "README" into the hidden-looking ".README".
        return ext.strip()
    return (stem.strip() + '.' + ext.strip()).rstrip('.')
# Input plugin for HTML and OPF files: convert() below turns them into an
# OEB book.
class HTMLInput(InputFormatPlugin):
name = 'HTML Input'
author = 'Kovid Goyal'
description = 'Convert HTML and OPF files to an OEB'
# File extensions handled by this plugin.
file_types = {'opf', 'html', 'htm', 'xhtml', 'xhtm', 'shtm', 'shtml'}
commit_name = 'html_input'
# Plugin-specific conversion options.
options = {
# Link traversal order (breadth first instead of depth first).
OptionRecommendation(name='breadth_first',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Traverse links in HTML files breadth first. Normally, '
'they are traversed depth first.'
)
),
# Maximum link-following depth.
OptionRecommendation(name='max_levels',
recommended_value=5, level=OptionRecommendation.LOW,
help=_('Maximum levels of recursion when following links in '
'HTML files. Must be non-negative. 0 implies that no '
'links in the root HTML file are followed. Default is '
'%default.'
)
),
# convert() rejects this option for anything but OPF input.
OptionRecommendation(name='dont_package',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Normally this input plugin re-arranges all the input '
'files into a standard folder hierarchy. Only use this option '
'if you know what you are doing as it can result in various '
'nasty side effects in the rest of the conversion pipeline.'
)
),
}
def convert(self, stream, opts, file_ext, log,
            accelerators):
    """Convert the HTML or OPF file behind ``stream`` to an OEB book.

    OPF input is handed straight to the plumber's ``create_oebbook``.
    HTML input has its metadata read (merged over whatever can be
    derived from the file name) and is then processed by
    ``self.create_oebbook``. Raises ValueError when ``dont_package`` is
    requested for HTML input.
    """
    self._is_case_sensitive = None
    basedir = getcwd()
    self.opts = opts

    has_name = hasattr(stream, 'name')
    if has_name:
        basedir = os.path.dirname(stream.name)
    fname = os.path.basename(stream.name) if has_name else None

    if file_ext == 'opf':
        from calibre.ebooks.conversion.plumber import create_oebbook
        return create_oebbook(log, stream.name, opts,
                              encoding=opts.input_encoding)

    if opts.dont_package:
        raise ValueError('The --dont-package option is not supported for an HTML input file')

    from calibre.ebooks.metadata.html import get_metadata
    mi = get_metadata(stream)
    if fname:
        from calibre.ebooks.metadata.meta import metadata_from_filename
        from_name = metadata_from_filename(fname)
        # Values found in the HTML itself take precedence.
        from_name.smart_update(mi)
        mi = from_name
    return self.create_oebbook(stream.name, basedir, opts, log, mi)
def is_case_sensitive(self, path):
    """Report whether the filesystem holding ``path`` is case sensitive.

    The first answer obtained from an existing path is cached on the
    instance and returned for every later call. When ``path`` is empty
    or does not exist, the platform default (``islinux or isbsd``) is
    returned without being cached.
    """
    cached = getattr(self, '_is_case_sensitive', None)
    if cached is not None:
        return cached
    if not path or not os.path.exists(path):
        return islinux or isbsd
    # If both the lower- and upper-cased spellings resolve, case is ignored.
    both_spellings_exist = os.path.exists(path.lower()) and os.path.exists(path.upper())
    self._is_case_sensitive = not both_spellings_exist
    return self._is_case_sensitive
def create_oebbook(self, htmlpath, basedir, opts, log, mi):
# Build and return an OEB book for the HTML file at ``htmlpath``:
#   1. create an unpopulated book and apply the metadata in ``mi``,
#      filling in language, creator and title when they are missing;
#   2. add every non-binary entry of get_filelist() to the manifest and
#      the spine;
#   3. rewrite links in the HTML and CSS items through resource_adder();
#   4. generate a table of contents from page titles or first headings.
import uuid
from calibre.ebooks.conversion.plumber import create_oebbook
from calibre.ebooks.oeb.base import (DirContainer,
rewrite_links, urlnormalize, urldefrag, BINARY_MIME, OEB_STYLES,
xpath, urlquote)
from calibre import guess_type
from calibre.ebooks.oeb.transforms.metadata import \
meta_info_to_oeb_metadata
from calibre.ebooks.html.input import get_filelist
from calibre.ebooks.metadata import string_to_authors
from calibre.utils.localization import canonicalize_lang
import css_parser, logging
css_parser.log.setLevel(logging.WARN)
self.OEB_STYLES = OEB_STYLES
oeb = create_oebbook(log, None, opts, self,
encoding=opts.input_encoding, populate=False)
self.oeb = oeb
metadata = oeb.metadata
meta_info_to_oeb_metadata(mi, metadata, log)
# Language: fall back to the 'language' option, then to get_lang().
if not metadata.language:
l = canonicalize_lang(getattr(opts, 'language', None))
if not l:
oeb.logger.warn('Language not specified')
l = get_lang().replace('_', '-')
metadata.add('language', l)
# Creator: fall back to the 'authors' option, then to 'Unknown'.
if not metadata.creator:
a = getattr(opts, 'authors', None)
if a:
a = string_to_authors(a)
if not a:
oeb.logger.warn('Creator not specified')
a = [self.oeb.translate(__('Unknown'))]
for aut in a:
metadata.add('creator', aut)
if not metadata.title:
oeb.logger.warn('Title not specified')
metadata.add('title', self.oeb.translate(__('Unknown')))
# Always add a freshly generated UUID identifier.
bookid = unicode_type(uuid.uuid4())
metadata.add('identifier', bookid, id='uuid_id', scheme='uuid')
# NOTE(review): ``ident`` is only used for the 'id' test; the uid that
# gets assigned is always metadata.identifier[0] -- confirm intended.
for ident in metadata.identifier:
if 'id' in ident.attrib:
self.oeb.uid = metadata.identifier[0]
break
filelist = get_filelist(htmlpath, basedir, opts, log)
filelist = [f for f in filelist if not f.is_binary]
# Maps the on-disk path of each HTML file to its manifest href.
htmlfile_map = {}
for f in filelist:
path = f.path
oeb.container = DirContainer(os.path.dirname(path), log,
ignore_opf=True)
bname = os.path.basename(path)
id, href = oeb.manifest.generate(id='html', href=sanitize_file_name(bname))
htmlfile_map[path] = href
item = oeb.manifest.add(id, href, 'text/html')
# The root file's base name is URL-quoted when its path contains '%'.
if path == htmlpath and '%' in path:
bname = urlquote(bname)
item.html_input_href = bname
oeb.spine.add(item, True)
# Maps on-disk path -> manifest href for everything added so far; paths
# are lower-cased when is_case_sensitive() reports a case-insensitive
# filesystem.
self.added_resources = {}
self.log = log
self.log('Normalizing filename cases')
for path, href in htmlfile_map.items():
if not self.is_case_sensitive(path):
path = path.lower()
self.added_resources[path] = href
# Stash the lazily imported helpers on the instance so that
# resource_adder() can use them.
self.urlnormalize, self.DirContainer = urlnormalize, DirContainer
self.urldefrag = urldefrag
self.guess_type, self.BINARY_MIME = guess_type, BINARY_MIME
self.log('Rewriting HTML links')
for f in filelist:
path = f.path
dpath = os.path.dirname(path)
oeb.container = DirContainer(dpath, log, ignore_opf=True)
href = htmlfile_map[path]
try:
item = oeb.manifest.hrefs[href]
except KeyError:
item = oeb.manifest.hrefs[urlnormalize(href)]
rewrite_links(item.data, partial(self.resource_adder, base=dpath))
# Rewrite url() references in stylesheets, resolving them relative to
# the directory the stylesheet was added from (None if not found).
for item in oeb.manifest.values():
if item.media_type in self.OEB_STYLES:
dpath = None
for path, href in self.added_resources.items():
if href == item.href:
dpath = os.path.dirname(path)
break
css_parser.replaceUrls(item.data,
partial(self.resource_adder, base=dpath))
# Table of contents: collect <title> text and the first h1..h5/strong
# text of the spine items.
toc = self.oeb.toc
self.oeb.auto_generated_toc = True
titles = []
headers = []
for item in self.oeb.spine:
if not item.linear:
continue
html = item.data
title = ''.join(xpath(html, '/h:html/h:head/h:title/text()'))
title = re.sub(r'\s+', ' ', title.strip())
if title:
titles.append(title)
headers.append('(unlabled)')
for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'):
expr = '/h:html/h:body//h:%s[position()=1]/text()'
header = ''.join(xpath(html, expr % tag))
header = re.sub(r'\s+', ' ', header.strip())
if header:
headers[-1] = header
break
# Use the page titles unless some of them repeat; then use the headers.
use = titles
if len(titles) > len(set(titles)):
use = headers
# NOTE(review): ``use`` is paired positionally with the whole spine,
# while titles/headers appear to be collected only for linear items
# (and titles only when non-empty) -- confirm the pairing stays aligned.
for title, item in zip(use, self.oeb.spine):
if not item.linear:
continue
toc.add(title, item.href)
oeb.container = DirContainer(getcwd(), oeb.log, ignore_opf=True)
return oeb
def link_to_local_path(self, link_, base=None):
    """Resolve ``link_`` to a local filesystem path.

    ``link_`` may be text or UTF-8 encoded bytes; ``base`` is the
    directory the link is resolved against (the current directory when
    omitted).

    Returns a ``(path, fragment)`` tuple, or ``(None, None)`` when the
    link cannot be decoded or parsed, is not a local resource, or has an
    empty path.
    """
    if not isinstance(link_, unicode_type):
        try:
            # 'strict' is the real name of this error handler. The old
            # value 'error' is not a registered handler, so undecodable
            # input only "worked" because the resulting LookupError was
            # swallowed by a bare except.
            link_ = link_.decode('utf-8', 'strict')
        except Exception:
            self.log.warn('Failed to decode link %r. Ignoring'%link_)
            return None, None
    # Imported lazily, and only once there is a link worth parsing.
    from calibre.ebooks.html.input import Link
    try:
        l = Link(link_, base if base else getcwd())
    except Exception:
        self.log.exception('Failed to process link: %r'%link_)
        return None, None
    if l.path is None:
        # Not a local resource
        return None, None
    link = l.path.replace('/', os.sep).strip()
    frag = l.fragment
    if not link:
        return None, None
    return link, frag
def resource_adder(self, link_, base=None):
    """Add the local file that ``link_`` points to and return its new href.

    Used as the link-rewriting callback for HTML and CSS. ``link_`` is
    returned unchanged when it is not a readable local file (or points
    to a directory). Links to plain-text files are dropped by returning
    None. Otherwise the file is added to the manifest once, recorded in
    ``self.added_resources`` and its manifest href is returned, with the
    original ``#fragment`` re-attached.
    """
    link, frag = self.link_to_local_path(link_, base=base)
    if link is None:
        return link_
    try:
        if base and not os.path.isabs(link):
            link = os.path.join(base, link)
        link = os.path.abspath(link)
    except Exception:
        # Best effort: a path that cannot be normalised is left alone.
        # (Was a bare except, which also hid KeyboardInterrupt.)
        return link_
    if not os.access(link, os.R_OK):
        return link_
    if os.path.isdir(link):
        self.log.warn(link_, 'is a link to a directory. Ignoring.')
        return link_
    if not self.is_case_sensitive(tempfile.gettempdir()):
        link = link.lower()
    if link not in self.added_resources:
        from polyglot.urllib import quote
        bhref = os.path.basename(link)
        id, href = self.oeb.manifest.generate(id='added', href=sanitize_file_name(bhref))
        guessed = self.guess_type(href)[0]
        media_type = guessed or self.BINARY_MIME
        if media_type == 'text/plain':
            self.log.warn('Ignoring link to text file %r'%link_)
            return None
        if media_type == self.BINARY_MIME:
            # Check for the common case, images
            try:
                img = what(link)
            except EnvironmentError:
                pass
            else:
                if img:
                    media_type = self.guess_type('dummy.'+img)[0] or self.BINARY_MIME

        self.oeb.log.debug('Added', link)
        self.oeb.container = self.DirContainer(os.path.dirname(link),
                                               self.oeb.log, ignore_opf=True)
        # Load into memory
        item = self.oeb.manifest.add(id, href, media_type)
        # bhref refers to an already existing file. The read() method of
        # DirContainer will call unquote on it before trying to read the
        # file, therefore we quote it here.
        if isinstance(bhref, unicode_type):
            bhref = bhref.encode('utf-8')
        item.html_input_href = as_unicode(quote(bhref))
        if guessed in self.OEB_STYLES:
            item.override_css_fetch = partial(
                self.css_import_handler, os.path.dirname(link))
        # Accessed for its side effect of loading the item's data.
        item.data
        self.added_resources[link] = href

    nlink = self.added_resources[link]
    if frag:
        nlink = '#'.join((nlink, frag))
    return nlink
def css_import_handler(self, base, href):
    """Fetch the stylesheet that ``href`` refers to, relative to ``base``.

    Returns ``(None, text)`` with the preprocessed stylesheet text, or
    ``(None, None)`` when the link is not a readable local file or the
    file cannot be read or preprocessed (the failure is logged).
    """
    link, frag = self.link_to_local_path(href, base=base)
    if link is None or not os.access(link, os.R_OK) or os.path.isdir(link):
        return (None, None)
    try:
        with open(link, 'rb') as f:
            raw = f.read().decode('utf-8', 'replace')
        raw = self.oeb.css_preprocessor(raw, add_namespace=False)
    except Exception:
        # Deliberately best-effort, but no longer a bare except, so
        # KeyboardInterrupt/SystemExit propagate.
        self.log.exception('Failed to read CSS file: %r'%link)
        return (None, None)
    return (None, raw)

View File

@@ -0,0 +1,226 @@
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL 3'
__copyright__ = '2010, Fabian Grassl <fg@jusmeum.de>'
__docformat__ = 'restructuredtext en'
import os, re, shutil
from os.path import dirname, abspath, relpath as _relpath, exists, basename
from calibre.customize.conversion import OutputFormatPlugin, OptionRecommendation
from calibre import CurrentDir
from calibre.ptempfile import PersistentTemporaryDirectory
from polyglot.builtins import unicode_type
def relpath(*args):
    """Like ``os.path.relpath`` but always using '/' as the separator."""
    native = _relpath(*args)
    return '/'.join(native.split(os.sep))
# Output plugin that writes the book as a ZIP archive holding an HTML index
# file plus a "<name>_files" directory with the content (see convert()).
class HTMLOutput(OutputFormatPlugin):
name = 'HTML Output'
author = 'Fabian Grassl'
file_type = 'zip'
commit_name = 'html_output'
# Plugin-specific options. None of them sets a recommended value; convert()
# tests the template options against None and extract_to for truthiness.
options = {
OptionRecommendation(name='template_css',
help=_('CSS file used for the output instead of the default file')),
OptionRecommendation(name='template_html_index',
help=_('Template used for generation of the HTML index file instead of the default file')),
OptionRecommendation(name='template_html',
help=_('Template used for the generation of the HTML contents of the book instead of the default file')),
OptionRecommendation(name='extract_to',
help=_('Extract the contents of the generated ZIP file to the '
'specified directory. WARNING: The contents of the directory '
'will be deleted.')
),
}
# Pretty printing is recommended at HIGH level for this output format.
recommendations = {('pretty_print', True, OptionRecommendation.HIGH)}
def generate_toc(self, oeb_book, ref_url, output_dir):
    '''
    Build the table of contents as a <div> wrapping nested <ul>/<li> lists

    Every entry is a link whose href is made relative to the directory
    of ``ref_url``; hrefs are resolved while ``output_dir`` is the
    current directory.
    '''
    from lxml import etree
    from polyglot.urllib import unquote
    from calibre.ebooks.oeb.base import element
    from calibre.utils.cleantext import clean_xml_chars

    def as_text(value):
        # Byte strings are taken to be UTF-8 encoded.
        return value.decode('utf-8') if isinstance(value, bytes) else value

    def build_node(current_node, parent=None):
        if parent is None:
            parent = etree.Element('ul')
        elif len(current_node.nodes):
            parent = element(parent, ('ul'))
        for child in current_node.nodes:
            entry = element(parent, 'li')
            target = relpath(abspath(unquote(child.href)), dirname(ref_url))
            anchor = element(entry, 'a', href=clean_xml_chars(as_text(target)))
            label = as_text(child.title)
            if label:
                label = re.sub(r'\s+', ' ', label)
            anchor.text = clean_xml_chars(label)
            build_node(child, entry)
        return parent

    with CurrentDir(output_dir):
        wrapper = etree.Element('div')
        wrapper.append(build_node(oeb_book.toc))
        return wrapper
def generate_html_toc(self, oeb_book, ref_url, output_dir):
    """Return the table of contents from generate_toc() as markup text."""
    from lxml import etree
    toc_root = self.generate_toc(oeb_book, ref_url, output_dir)
    serialize_args = {
        'pretty_print': True,
        'encoding': 'unicode',
        'xml_declaration': False,
    }
    return etree.tostring(toc_root, **serialize_args)
def convert(self, oeb_book, output_path, input_plugin, opts, log):
# Render ``oeb_book`` through the index/content templates into a temporary
# directory, pack the result into the ZIP file at ``output_path`` and,
# when opts.extract_to is set, also extract it there.
from lxml import etree
from calibre.utils import zipfile
from templite import Templite
from polyglot.urllib import unquote
from calibre.ebooks.html.meta import EasyMeta
# read template files
if opts.template_html_index is not None:
with open(opts.template_html_index, 'rb') as f:
template_html_index_data = f.read()
else:
template_html_index_data = P('templates/html_export_default_index.tmpl', data=True)
if opts.template_html is not None:
with open(opts.template_html, 'rb') as f:
template_html_data = f.read()
else:
template_html_data = P('templates/html_export_default.tmpl', data=True)
if opts.template_css is not None:
with open(opts.template_css, 'rb') as f:
template_css_data = f.read()
else:
template_css_data = P('templates/html_export_default.css', data=True)
# All three templates are decoded as UTF-8 text.
template_html_index_data = template_html_index_data.decode('utf-8')
template_html_data = template_html_data.decode('utf-8')
template_css_data = template_css_data.decode('utf-8')
self.log = log
self.opts = opts
meta = EasyMeta(oeb_book.metadata)
# Layout inside the temp dir: "<name>.html" (index) + "<name>_files/".
tempdir = os.path.realpath(PersistentTemporaryDirectory())
# NOTE(review): the patterns r'\.zip' and r'\.html' are not anchored, so
# every occurrence in the path is removed, not only a trailing extension.
output_file = os.path.join(tempdir,
basename(re.sub(r'\.zip', '', output_path)+'.html'))
output_dir = re.sub(r'\.html', '', output_file)+'_files'
if not exists(output_dir):
os.makedirs(output_dir)
css_path = output_dir+os.sep+'calibreHtmlOutBasicCss.css'
with open(css_path, 'wb') as f:
f.write(template_css_data.encode('utf-8'))
# Index page: the table of contents plus a link to the first spine item.
with open(output_file, 'wb') as f:
html_toc = self.generate_html_toc(oeb_book, output_file, output_dir)
templite = Templite(template_html_index_data)
nextLink = oeb_book.spine[0].href
nextLink = relpath(output_dir+os.sep+nextLink, dirname(output_file))
cssLink = relpath(abspath(css_path), dirname(output_file))
tocUrl = relpath(output_file, dirname(output_file))
t = templite.render(has_toc=bool(oeb_book.toc.count()),
toc=html_toc, meta=meta, nextLink=nextLink,
tocUrl=tocUrl, cssLink=cssLink,
firstContentPageLink=nextLink)
if isinstance(t, unicode_type):
t = t.encode('utf-8')
f.write(t)
# Item hrefs are resolved with abspath(), i.e. relative to output_dir
# while it is the current directory.
with CurrentDir(output_dir):
# First pass over the manifest: spine items get an empty placeholder
# file (they are rendered in the second pass); other items are written
# out from item.bytes_representation.
for item in oeb_book.manifest:
path = abspath(unquote(item.href))
dir = dirname(path)
if not exists(dir):
os.makedirs(dir)
if item.spine_position is not None:
with open(path, 'wb') as f:
pass
else:
with open(path, 'wb') as f:
f.write(item.bytes_representation)
item.unload_data_from_memory(memory=path)
# Second pass: render every spine item through the content template.
for item in oeb_book.spine:
path = abspath(unquote(item.href))
dir = dirname(path)
root = item.data.getroottree()
# get & clean HTML <HEAD>-data
head = root.xpath('//h:head', namespaces={'h': 'http://www.w3.org/1999/xhtml'})[0]
head_content = etree.tostring(head, pretty_print=True, encoding='unicode')
head_content = re.sub(r'\<\/?head.*\>', '', head_content)
head_content = re.sub(re.compile(r'\<style.*\/style\>', re.M|re.S), '', head_content)
# Expand a self-closed <title/> into an explicit open/close pair.
head_content = re.sub(r'<(title)([^>]*)/>', r'<\1\2></\1>', head_content)
# get & clean HTML <BODY>-data
body = root.xpath('//h:body', namespaces={'h': 'http://www.w3.org/1999/xhtml'})[0]
ebook_content = etree.tostring(body, pretty_print=True, encoding='unicode')
ebook_content = re.sub(r'\<\/?body.*\>', '', ebook_content)
# Same expansion for self-closed <div/>, <a/> and <span/>.
ebook_content = re.sub(r'<(div|a|span)([^>]*)/>', r'<\1\2></\1>', ebook_content)
# generate link to next page
if item.spine_position+1 < len(oeb_book.spine):
nextLink = oeb_book.spine[item.spine_position+1].href
nextLink = relpath(abspath(nextLink), dir)
else:
nextLink = None
# generate link to previous page
if item.spine_position > 0:
prevLink = oeb_book.spine[item.spine_position-1].href
prevLink = relpath(abspath(prevLink), dir)
else:
prevLink = None
cssLink = relpath(abspath(css_path), dir)
tocUrl = relpath(output_file, dir)
# NOTE(review): unlike nextLink/prevLink this href is not made relative
# to ``dir`` -- confirm that is intended.
firstContentPageLink = oeb_book.spine[0].href
# render template
templite = Templite(template_html_data)
# The TOC is handed to the template as a callable, not as a string.
toc = lambda: self.generate_html_toc(oeb_book, path, output_dir)
t = templite.render(ebookContent=ebook_content,
prevLink=prevLink, nextLink=nextLink,
has_toc=bool(oeb_book.toc.count()), toc=toc,
tocUrl=tocUrl, head_content=head_content,
meta=meta, cssLink=cssLink,
firstContentPageLink=firstContentPageLink)
# write html to file
with open(path, 'wb') as f:
f.write(t.encode('utf-8'))
item.unload_data_from_memory(memory=path)
# Pack the content directory and the index file into the output archive.
zfile = zipfile.ZipFile(output_path, "w")
zfile.add_dir(output_dir, basename(output_dir))
zfile.write(output_file, basename(output_file), zipfile.ZIP_DEFLATED)
# Any existing extract_to directory is deleted before extraction.
# NOTE(review): zfile is only closed on the success path; an exception
# above leaves the archive open and the temp dir in place.
if opts.extract_to:
if os.path.exists(opts.extract_to):
shutil.rmtree(opts.extract_to)
os.makedirs(opts.extract_to)
zfile.extractall(opts.extract_to)
self.log('Zip file extracted to', opts.extract_to)
zfile.close()
# cleanup temp dir
shutil.rmtree(tempdir)

View File

@@ -0,0 +1,133 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL 3'
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import os
from calibre import guess_type
from calibre.customize.conversion import InputFormatPlugin
from polyglot.builtins import getcwd
# Input plugin for HTMLZ archives: the archive is extracted into the current
# directory and its single top-level HTML file is converted by the HTML input
# plugin.
class HTMLZInput(InputFormatPlugin):
name = 'HTLZ Input'
author = 'John Schember'
description = 'Convert HTML files to HTML'
file_types = {'htmlz'}
commit_name = 'htmlz_input'
def convert(self, stream, options, file_ext, log,
accelerators):
# Returns the OEB book produced by the HTML input plugin, with the
# metadata read from ``stream`` and, when the archive's OPF names one,
# the cover added. Raises Exception when no top level HTML file is found
# or when that file is empty.
from calibre.ebooks.chardet import xml_to_unicode
from calibre.ebooks.metadata.opf2 import OPF
from calibre.utils.zipfile import ZipFile
self.log = log
html = u''
top_levels = []
# Extract content from zip archive.
# NOTE(review): everything is extracted into the current directory and
# ``zf`` is never explicitly closed in this method.
zf = ZipFile(stream)
zf.extractall()
# Find the HTML file in the archive. It needs to be
# top level.
index = u''
multiple_html = False
# Get a list of all top level files in the archive.
for x in os.listdir(u'.'):
if os.path.isfile(x):
top_levels.append(x)
# Try to find an index file.
for x in top_levels:
if x.lower() in (u'index.html', u'index.xhtml', u'index.htm'):
index = x
break
# Look for multiple HTML files in the archive. We look at the
# top level files only as only they matter in HTMLZ.
for x in top_levels:
if os.path.splitext(x)[1].lower() in (u'.html', u'.xhtml', u'.htm'):
# Set index to the first HTML file found if it's not
# called index.
if not index:
index = x
else:
multiple_html = True
# Warn the user if there are multiple HTML files in the archive. HTMLZ
# supports a single HTML file. A conversion with a multiple HTML file
# HTMLZ archive probably won't turn out as the user expects. With
# Multiple HTML files ZIP input should be used in place of HTMLZ.
if multiple_html:
log.warn(_('Multiple HTML files found in the archive. Only %s will be used.') % index)
if index:
with open(index, 'rb') as tf:
html = tf.read()
else:
raise Exception(_('No top level HTML file found.'))
if not html:
raise Exception(_('Top level HTML file %s is empty') % index)
# Encoding
# An explicit input_encoding option wins; otherwise the encoding is
# detected from the first 4096 bytes.
if options.input_encoding:
ienc = options.input_encoding
else:
ienc = xml_to_unicode(html[:4096])[-1]
html = html.decode(ienc, 'replace')
# Run the HTML through the html processing plugin.
from calibre.customize.ui import plugin_for_input_format
html_input = plugin_for_input_format('html')
# Set every HTML input option to its recommended value.
for opt in html_input.options:
setattr(options, opt.option.name, opt.recommended_value)
# The HTML is re-written as UTF-8 below, so that is what the HTML plugin
# is told to expect.
options.input_encoding = 'utf-8'
base = getcwd()
# Pick a file name that does not exist yet: index.html, index1.html, ...
htmlfile = os.path.join(base, u'index.html')
c = 0
while os.path.exists(htmlfile):
c += 1
htmlfile = u'index%d.html'%c
with open(htmlfile, 'wb') as f:
f.write(html.encode('utf-8'))
# debug_pipeline is switched off for the nested conversion and restored
# afterwards.
odi = options.debug_pipeline
options.debug_pipeline = None
# Generate oeb from html conversion.
with open(htmlfile, 'rb') as f:
oeb = html_input.convert(f, options, 'html', log,
{})
options.debug_pipeline = odi
os.remove(htmlfile)
# Set metadata from file.
from calibre.customize.ui import get_file_type_metadata
from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
mi = get_file_type_metadata(stream, file_ext)
meta_info_to_oeb_metadata(mi, oeb.metadata, log)
# Get the cover path from the OPF.
cover_path = None
opf = None
# The first top level .opf file found is used.
for x in top_levels:
if os.path.splitext(x)[1].lower() == u'.opf':
opf = x
break
if opf:
opf = OPF(opf, basedir=getcwd())
cover_path = opf.raster_cover or opf.cover
# Set the cover.
if cover_path:
cdata = None
with open(os.path.join(getcwd(), cover_path), 'rb') as cf:
cdata = cf.read()
cover_name = os.path.basename(cover_path)
id, href = oeb.manifest.generate('cover', cover_name)
oeb.manifest.add(id, href, guess_type(cover_name)[0], data=cdata)
oeb.guide.add('cover', 'Cover', href)
return oeb

View File

@@ -0,0 +1,136 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL 3'
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import io
import os
from calibre.customize.conversion import OutputFormatPlugin, \
OptionRecommendation
from calibre.ptempfile import TemporaryDirectory
from polyglot.builtins import unicode_type
# Output plugin that writes the book as an HTMLZ archive (see convert()).
class HTMLZOutput(OutputFormatPlugin):
name = 'HTMLZ Output'
author = 'John Schember'
file_type = 'htmlz'
commit_name = 'htmlz_output'
# Human-readable labels for the allowed values of the htmlz_css_type
# ('css_choices') and htmlz_class_style ('sheet_choices') options.
ui_data = {
'css_choices': {
'class': _('Use CSS classes'),
'inline': _('Use the style attribute'),
'tag': _('Use HTML tags wherever possible')
},
'sheet_choices': {
'external': _('Use an external CSS file'),
'inline': _('Use a <style> tag in the HTML file')
}
}
# Plugin-specific conversion options; the help texts of the two choice
# options are built from the labels in ui_data.
options = {
OptionRecommendation(name='htmlz_css_type', recommended_value='class',
level=OptionRecommendation.LOW,
choices=list(ui_data['css_choices']),
help=_('Specify the handling of CSS. Default is class.\n'
'class: {class}\n'
'inline: {inline}\n'
'tag: {tag}'
).format(**ui_data['css_choices'])),
OptionRecommendation(name='htmlz_class_style', recommended_value='external',
level=OptionRecommendation.LOW,
choices=list(ui_data['sheet_choices']),
help=_('How to handle the CSS when using css-type = \'class\'.\n'
'Default is external.\n'
'external: {external}\n'
'inline: {inline}'
).format(**ui_data['sheet_choices'])),
OptionRecommendation(name='htmlz_title_filename',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('If set this option causes the file name of the HTML file'
' inside the HTMLZ archive to be based on the book title.')
),
}
def convert(self, oeb_book, output_path, input_plugin, opts, log):
    """Write ``oeb_book`` to ``output_path`` as an HTMLZ archive.

    The archive is assembled in a temporary directory: the book as one
    HTML file, an optional external ``style.css``, the referenced images
    under ``images/``, an optional ``cover.jpg`` and ``metadata.opf``.
    A failure while saving the cover is printed and otherwise ignored.
    """
    from lxml import etree
    from calibre.ebooks.oeb.base import OEB_IMAGES, SVG_MIME
    from calibre.ebooks.metadata.opf2 import OPF, metadata_to_opf
    from calibre.utils.zipfile import ZipFile
    from calibre.utils.filenames import ascii_filename

    # HTML
    if opts.htmlz_css_type == 'inline':
        from calibre.ebooks.htmlz.oeb2html import OEB2HTMLInlineCSSizer
        OEB2HTMLizer = OEB2HTMLInlineCSSizer
    elif opts.htmlz_css_type == 'tag':
        from calibre.ebooks.htmlz.oeb2html import OEB2HTMLNoCSSizer
        OEB2HTMLizer = OEB2HTMLNoCSSizer
    else:
        from calibre.ebooks.htmlz.oeb2html import OEB2HTMLClassCSSizer as OEB2HTMLizer

    with TemporaryDirectory(u'_htmlz_output') as tdir:
        htmlizer = OEB2HTMLizer(log)
        html = htmlizer.oeb2html(oeb_book, opts)

        fname = u'index'
        if opts.htmlz_title_filename:
            from calibre.utils.filenames import shorten_components_to
            fname = shorten_components_to(100, (ascii_filename(unicode_type(oeb_book.metadata.title[0])),))[0]
        with open(os.path.join(tdir, fname+u'.html'), 'wb') as tf:
            if isinstance(html, unicode_type):
                html = html.encode('utf-8')
            tf.write(html)

        # CSS
        if opts.htmlz_css_type == 'class' and opts.htmlz_class_style == 'external':
            css = htmlizer.get_css(oeb_book)
            # The file is opened in binary mode, so text must be encoded
            # (same guard as for the HTML above).
            if isinstance(css, unicode_type):
                css = css.encode('utf-8')
            with open(os.path.join(tdir, u'style.css'), 'wb') as tf:
                tf.write(css)

        # Images
        images = htmlizer.images
        if images:
            if not os.path.exists(os.path.join(tdir, u'images')):
                os.makedirs(os.path.join(tdir, u'images'))
            for item in oeb_book.manifest:
                if item.media_type in OEB_IMAGES and item.href in images:
                    if item.media_type == SVG_MIME:
                        data = etree.tostring(item.data, encoding='unicode')
                    else:
                        data = item.data
                    # SVG is serialised to text above; writing text to a
                    # file opened 'wb' fails, so encode it first.
                    if isinstance(data, unicode_type):
                        data = data.encode('utf-8')
                    img_path = os.path.join(tdir, u'images', images[item.href])
                    with open(img_path, 'wb') as img:
                        img.write(data)

        # Cover
        cover_path = None
        try:
            cover_data = None
            if oeb_book.metadata.cover:
                term = oeb_book.metadata.cover[0].term
                cover_data = oeb_book.guide[term].item.data
            if cover_data:
                from calibre.utils.img import save_cover_data_to
                cover_path = os.path.join(tdir, u'cover.jpg')
                with lopen(cover_path, 'w') as cf:
                    cf.write('')
                save_cover_data_to(cover_data, cover_path)
        except Exception:
            # Best effort: a broken cover must not abort the conversion.
            import traceback
            traceback.print_exc()

        # Metadata
        with open(os.path.join(tdir, u'metadata.opf'), 'wb') as mdataf:
            opf = OPF(io.BytesIO(etree.tostring(oeb_book.metadata.to_opf1(), encoding='UTF-8')))
            mi = opf.to_book_metadata()
            if cover_path:
                mi.cover = u'cover.jpg'
            mdataf.write(metadata_to_opf(mi))

        htmlz = ZipFile(output_path, 'w')
        try:
            htmlz.add_dir(tdir)
        finally:
            # Close explicitly so the archive is finalised before the
            # temporary directory is removed, rather than whenever the
            # object happens to be garbage collected.
            htmlz.close()

View File

@@ -0,0 +1,64 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from calibre.customize.conversion import InputFormatPlugin
# Input plugin for LIT files.
class LITInput(InputFormatPlugin):

    name = 'LIT Input'
    author = 'Marshall T. Vandegrift'
    description = 'Convert LIT files to HTML'
    file_types = {'lit'}
    commit_name = 'lit_input'

    def convert(self, stream, options, file_ext, log,
                accelerators):
        """Build an OEB book from ``stream`` using LitReader."""
        from calibre.ebooks.lit.reader import LitReader
        from calibre.ebooks.conversion.plumber import create_oebbook
        self.log = log
        return create_oebbook(log, stream, options, reader=LitReader)

    def postprocess_book(self, oeb, opts, log):
        """Clean up the spine documents after reading.

        Removes <metadata> and <guide> elements from every spine
        document. A document whose body consists of a single <pre>
        element has that text re-flowed into paragraphs; the <pre> is
        turned into a <div> holding the result.
        """
        from calibre.ebooks.oeb.base import XHTML_NS, XPath, XHTML
        find_body = XPath('//h:body')
        for item in oeb.spine:
            root = item.data
            if not hasattr(root, 'xpath'):
                continue
            for bad in ('metadata', 'guide'):
                for unwanted in XPath('//h:'+bad)(root):
                    unwanted.getparent().remove(unwanted)
            bodies = find_body(root)
            if not bodies:
                continue
            body = bodies[0]
            if len(body) != 1 or body[0].tag != XHTML('pre'):
                continue
            pre = body[0]
            from calibre.ebooks.txt.processor import convert_basic, \
                separate_paragraphs_single_line
            from calibre.ebooks.chardet import xml_to_unicode
            from calibre.utils.xml_parse import safe_xml_fromstring
            import copy
            self.log('LIT file with all text in singe <pre> tag detected')
            html = separate_paragraphs_single_line(pre.text)
            html = convert_basic(html).replace(
                '<html>', '<html xmlns="%s">'%XHTML_NS)
            html = xml_to_unicode(html, strip_encoding_pats=True,
                                  resolve_entities=True)[0]
            if opts.smarten_punctuation:
                # SmartyPants skips text inside <pre> tags
                from calibre.ebooks.conversion.preprocess import smarten_punctuation
                html = smarten_punctuation(html, self.log)
            new_root = safe_xml_fromstring(html)
            pre.tag = XHTML('div')
            pre.text = ''
            for elem in find_body(new_root):
                pre.append(copy.deepcopy(elem))

View File

@@ -0,0 +1,38 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from calibre.customize.conversion import OutputFormatPlugin
# Output plugin for LIT files.
class LITOutput(OutputFormatPlugin):

    name = 'LIT Output'
    author = 'Marshall T. Vandegrift'
    file_type = 'lit'
    commit_name = 'lit_output'

    def convert(self, oeb, output_path, input_plugin, opts, log):
        """Run the LIT-specific transforms on ``oeb`` and write the file.

        The book is split on page breaks, then passed through the HTML
        TOC adder, the case mangler and the SVG rasterizer (in that
        order) before LitWriter writes it to ``output_path``.
        """
        self.log = log
        self.opts = opts
        self.oeb = oeb
        from calibre.ebooks.oeb.transforms.manglecase import CaseMangler
        from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer
        from calibre.ebooks.oeb.transforms.htmltoc import HTMLTOCAdder
        from calibre.ebooks.lit.writer import LitWriter
        from calibre.ebooks.oeb.transforms.split import Split

        splitter = Split(split_on_page_breaks=True, max_flow_size=0,
                         remove_css_pagebreaks=False)
        splitter(self.oeb, self.opts)

        # Each transform is instantiated right before it is applied.
        for transform_class in (HTMLTOCAdder, CaseMangler, SVGRasterizer):
            transform = transform_class()
            transform(oeb, opts)

        writer = LitWriter(self.opts)
        writer(oeb, output_path)

View File

@@ -0,0 +1,82 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import os, sys
from calibre.customize.conversion import InputFormatPlugin
# Input plugin for LRF files.
class LRFInput(InputFormatPlugin):

    name = 'LRF Input'
    author = 'Kovid Goyal'
    description = 'Convert LRF files to HTML'
    file_types = {'lrf'}
    commit_name = 'lrf_input'

    def convert(self, stream, options, file_ext, log,
                accelerators):
        """Convert the LRF document in ``stream`` and return the OPF path.

        The document is dumped to XML, transformed with the ``lrf.xsl``
        stylesheet (with calibre extension elements) and the result is
        written to ``content.opf`` in the current directory; the
        absolute path of that file is returned. When ``options.verbose``
        is greater than 2 the intermediate XML is saved as ``lrs.xml``.
        """
        from calibre.ebooks.lrf.input import (MediaType, Styles, TextBlock,
                Canvas, ImageBlock, RuledLine)
        self.log = log
        self.log('Generating XML')
        from calibre.ebooks.lrf.lrfparser import LRFDocument
        from calibre.utils.xml_parse import safe_xml_fromstring
        from lxml import etree
        d = LRFDocument(stream)
        d.parse()
        xml = d.to_xml(write_files=True)
        if options.verbose > 2:
            # Use a context manager so the debug dump is flushed and the
            # handle closed deterministically.
            with open(u'lrs.xml', 'wb') as dump:
                dump.write(xml.encode('utf-8'))
        doc = safe_xml_fromstring(xml)

        # Map the refobj of each CharButton to the "<page>.xhtml#<obj>"
        # target of the JumpTo inside the object it refers to.
        char_button_map = {}
        for x in doc.xpath('//CharButton[@refobj]'):
            ro = x.get('refobj')
            jump_button = doc.xpath('//*[@objid="%s"]'%ro)
            if jump_button:
                jump_to = jump_button[0].xpath('descendant::JumpTo[@refpage and @refobj]')
                if jump_to:
                    char_button_map[ro] = '%s.xhtml#%s'%(jump_to[0].get('refpage'),
                            jump_to[0].get('refobj'))

        # Map the refobj of each Plot to the file of its image stream.
        plot_map = {}
        for x in doc.xpath('//Plot[@refobj]'):
            ro = x.get('refobj')
            image = doc.xpath('//Image[@objid="%s" and @refstream]'%ro)
            if image:
                imgstr = doc.xpath('//ImageStream[@objid="%s" and @file]'%
                    image[0].get('refstream'))
                if imgstr:
                    plot_map[ro] = imgstr[0].get('file')

        self.log('Converting XML to HTML...')
        styledoc = safe_xml_fromstring(P('templates/lrf.xsl', data=True))
        media_type = MediaType()
        styles = Styles()
        text_block = TextBlock(styles, char_button_map, plot_map, log)
        canvas = Canvas(doc, styles, text_block, log)
        image_block = ImageBlock(canvas)
        ruled_line = RuledLine()
        extensions = {
                ('calibre', 'media-type') : media_type,
                ('calibre', 'text-block') : text_block,
                ('calibre', 'ruled-line') : ruled_line,
                ('calibre', 'styles')     : styles,
                ('calibre', 'canvas')     : canvas,
                ('calibre', 'image-block'): image_block,
                }
        transform = etree.XSLT(styledoc, extensions=extensions)
        try:
            result = transform(doc)
        except RuntimeError:
            # Retry once with a higher recursion limit.
            sys.setrecursionlimit(5000)
            result = transform(doc)

        with open('content.opf', 'wb') as f:
            f.write(result)
        styles.write()
        return os.path.abspath('content.opf')

View File

@@ -0,0 +1,196 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import sys, os
from calibre.customize.conversion import OutputFormatPlugin
from calibre.customize.conversion import OptionRecommendation
from polyglot.builtins import unicode_type
class LRFOptions(object):
    """Option bag handed to the legacy HTML->LRF converter.

    Built from the conversion pipeline options (``opts``) and the OEB book's
    metadata. Most attributes are fixed values; the remainder are copied or
    derived from ``opts``.
    """

    def __init__(self, output, opts, oeb):
        """Populate the option attributes.

        :param output: Output path, stored unchanged as ``self.output``.
        :param opts: Conversion options. Negative ``margin_*`` values on this
            object are reset to 0 in place.
        :param oeb: The OEB book; only ``oeb.metadata`` is read.
        """
        def f2s(f):
            # Best effort: return the first metadata entry as text, or '' when
            # there is none. Only ordinary errors are swallowed, so
            # KeyboardInterrupt/SystemExit still propagate.
            try:
                return unicode_type(f[0])
            except Exception:
                return ''

        m = oeb.metadata
        # Clamp negative margins on the shared opts object.
        for x in ('left', 'top', 'right', 'bottom'):
            attr = 'margin_'+x
            val = getattr(opts, attr)
            if val < 0:
                setattr(opts, attr, 0)
        self.title = None
        self.author = self.publisher = _('Unknown')
        self.title_sort = self.author_sort = ''
        # The last creator with role 'aut' wins.
        for x in m.creator:
            if x.role == 'aut':
                self.author = unicode_type(x)
                fa = unicode_type(getattr(x, 'file_as', ''))
                if fa:
                    self.author_sort = fa
        for x in m.title:
            if unicode_type(x.file_as):
                self.title_sort = unicode_type(x.file_as)
        self.freetext = f2s(m.description)
        self.category = f2s(m.subject)
        self.cover = None
        self.use_metadata_cover = True
        self.output = output
        self.ignore_tables = opts.linearize_tables
        if opts.disable_font_rescaling:
            self.base_font_size = 0
        else:
            self.base_font_size = opts.base_font_size
        self.blank_after_para = opts.insert_blank_line
        self.use_spine = True
        self.font_delta = 0
        self.ignore_colors = False
        from calibre.ebooks.lrf import PRS500_PROFILE
        self.profile = PRS500_PROFILE
        self.link_levels = sys.maxsize
        self.link_exclude = '@'
        self.no_links_in_toc = True
        self.disable_chapter_detection = True
        self.chapter_regex = 'dsadcdswcdec'
        self.chapter_attr = '$,,$'
        self.override_css = self._override_css = ''
        self.page_break = 'h[12]'
        self.force_page_break = '$'
        self.force_page_break_attr = '$'
        self.add_chapters_to_toc = False
        self.baen = self.pdftohtml = self.book_designer = False
        self.verbose = opts.verbose
        self.encoding = 'utf-8'
        self.lrs = False
        self.minimize_memory_usage = False
        self.autorotation = opts.enable_autorotation
        # header_separation and the margins are scaled by profile.dpi / 72.
        self.header_separation = (self.profile.dpi/72.) * opts.header_separation
        self.headerformat = opts.header_format

        for x in ('top', 'bottom', 'left', 'right'):
            setattr(self, x+'_margin',
                    (self.profile.dpi/72.) * float(getattr(opts, 'margin_'+x)))

        # Options copied over under the same name.
        for x in ('wordspace', 'header', 'header_format',
                  'minimum_indent', 'serif_family',
                  'render_tables_as_images', 'sans_family', 'mono_family',
                  'text_size_multiplier_for_rendered_tables'):
            setattr(self, x, getattr(opts, x))
class LRFOutput(OutputFormatPlugin):
    """Output plugin that writes LRF files.

    Image collections are rendered directly, one image per page. Everything
    else is first written as OEB into a temporary directory and then passed
    to the HTML->LRF converter.
    """

    name = 'LRF Output'
    author = 'Kovid Goyal'
    file_type = 'lrf'
    commit_name = 'lrf_output'

    options = {
        OptionRecommendation(name='enable_autorotation', recommended_value=False,
            help=_('Enable auto-rotation of images that are wider than the screen width.')
        ),
        OptionRecommendation(name='wordspace',
            recommended_value=2.5, level=OptionRecommendation.LOW,
            help=_('Set the space between words in pts. Default is %default')
        ),
        OptionRecommendation(name='header', recommended_value=False,
            help=_('Add a header to all the pages with title and author.')
        ),
        OptionRecommendation(name='header_format', recommended_value="%t by %a",
            help=_('Set the format of the header. %a is replaced by the author '
            'and %t by the title. Default is %default')
        ),
        OptionRecommendation(name='header_separation', recommended_value=0,
            help=_('Add extra spacing below the header. Default is %default pt.')
        ),
        OptionRecommendation(name='minimum_indent', recommended_value=0,
            help=_('Minimum paragraph indent (the indent of the first line '
            'of a paragraph) in pts. Default: %default')
        ),
        OptionRecommendation(name='render_tables_as_images',
            recommended_value=False,
            help=_('This option has no effect')
        ),
        OptionRecommendation(name='text_size_multiplier_for_rendered_tables',
            recommended_value=1.0,
            help=_('Multiply the size of text in rendered tables by this '
            'factor. Default is %default')
        ),
        OptionRecommendation(name='serif_family', recommended_value=None,
            help=_('The serif family of fonts to embed')
        ),
        OptionRecommendation(name='sans_family', recommended_value=None,
            help=_('The sans-serif family of fonts to embed')
        ),
        OptionRecommendation(name='mono_family', recommended_value=None,
            help=_('The monospace family of fonts to embed')
        ),

    }

    recommendations = {
        ('change_justification', 'original', OptionRecommendation.HIGH)}

    def convert_images(self, pages, opts, wide):
        """Write an LRF book with one full-page image block per entry.

        :param pages: Iterable of images; each is passed to ``ImageStream``.
        :param opts: Object providing ``title``, ``author`` and ``output``.
        :param wide: Use a 784x1012 page instead of 584x754.
        """
        from calibre.ebooks.lrf.pylrs.pylrs import Book, BookSetting, ImageStream, ImageBlock
        from uuid import uuid4
        from calibre.constants import __appname__, __version__

        width, height = (784, 1012) if wide else (584, 754)

        ps = {}
        ps['topmargin']      = 0
        ps['evensidemargin'] = 0
        ps['oddsidemargin']  = 0
        ps['textwidth']      = width
        ps['textheight']     = height
        book = Book(title=opts.title, author=opts.author,
                bookid=uuid4().hex,
                publisher='%s %s'%(__appname__, __version__),
                category=_('Comic'), pagestyledefault=ps,
                booksetting=BookSetting(screenwidth=width, screenheight=height))
        for page in pages:
            imageStream = ImageStream(page)
            _page = book.create_page()
            _page.append(ImageBlock(refstream=imageStream,
                        blockwidth=width, blockheight=height, xsize=width,
                        ysize=height, x1=width, y1=height))
            book.append(_page)

        # Close the output file deterministically, even if rendering fails,
        # instead of leaving the handle to the garbage collector.
        with open(opts.output, 'wb') as lrf_file:
            book.renderLrf(lrf_file)

    def flatten_toc(self):
        """Replace the book's TOC with a single-level copy of its descendants."""
        from calibre.ebooks.oeb.base import TOC
        nroot = TOC()
        for x in self.oeb.toc.iterdescendants():
            nroot.add(x.title, x.href)
        self.oeb.toc = nroot

    def convert(self, oeb, output_path, input_plugin, opts, log):
        """Convert ``oeb`` to an LRF file at ``output_path``.

        Image collections go through :meth:`convert_images`. Otherwise the TOC
        is flattened, the book is written as OEB into a temporary directory,
        and the first ``.opf`` found there is given to ``process_file``.
        """
        self.log, self.opts, self.oeb = log, opts, oeb

        lrf_opts = LRFOptions(output_path, opts, oeb)

        if input_plugin.is_image_collection:
            self.convert_images(input_plugin.get_images(), lrf_opts,
                    getattr(opts, 'wide', False))
            return

        self.flatten_toc()

        from calibre.ptempfile import TemporaryDirectory
        with TemporaryDirectory('_lrf_output') as tdir:
            from calibre.customize.ui import plugin_for_output_format
            oeb_output = plugin_for_output_format('oeb')
            oeb_output.convert(oeb, tdir, input_plugin, opts, log)
            opf = [x for x in os.listdir(tdir) if x.endswith('.opf')][0]
            from calibre.ebooks.lrf.html.convert_from import process_file
            process_file(os.path.join(tdir, opf), lrf_opts, self.log)

View File

@@ -0,0 +1,66 @@
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL 3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import os
from calibre.customize.conversion import InputFormatPlugin
from polyglot.builtins import unicode_type
class MOBIInput(InputFormatPlugin):
    """Input plugin for MOBI-family files (MOBI 6 and KF8)."""

    name = 'MOBI Input'
    author = 'Kovid Goyal'
    description = 'Convert MOBI files (.mobi, .prc, .azw) to HTML'
    file_types = {'mobi', 'prc', 'azw', 'azw3', 'pobi'}
    commit_name = 'mobi_input'

    def convert(self, stream, options, file_ext, log,
                accelerators):
        """Extract the book into the current directory and return its OPF path.

        Sets ``self.is_kf8`` and ``self.mobi_is_joint``, plus
        ``self.encrypted_fonts`` for KF8 input. For MOBI 6 input the parsed
        HTML trees are written to disk and ``accelerators['pagebreaks']`` is
        set.
        """
        self.is_kf8 = False
        self.mobi_is_joint = False

        from calibre.ebooks.mobi.reader.mobi6 import MobiReader
        from lxml import html
        parse_cache = {}
        try:
            mr = MobiReader(stream, log, options.input_encoding,
                        options.debug_pipeline)
            if mr.kf8_type is None:
                mr.extract_content('.', parse_cache)

        except Exception:
            # Retry once with the extra-data fix enabled. Only ordinary errors
            # trigger the retry, so KeyboardInterrupt/SystemExit abort the
            # conversion instead of silently starting a second attempt.
            mr = MobiReader(stream, log, options.input_encoding,
                        options.debug_pipeline, try_extra_data_fix=True)
            if mr.kf8_type is None:
                mr.extract_content('.', parse_cache)

        if mr.kf8_type is not None:
            log('Found KF8 MOBI of type %r'%mr.kf8_type)
            if mr.kf8_type == 'joint':
                self.mobi_is_joint = True
            from calibre.ebooks.mobi.reader.mobi8 import Mobi8Reader
            mr = Mobi8Reader(mr, log)
            opf = os.path.abspath(mr())
            self.encrypted_fonts = mr.encrypted_fonts
            self.is_kf8 = True
            return opf

        # The raw markup, when present in the cache, is dumped for debugging.
        raw = parse_cache.pop('calibre_raw_mobi_markup', False)
        if raw:
            if isinstance(raw, unicode_type):
                raw = raw.encode('utf-8')
            with lopen('debug-raw.html', 'wb') as f:
                f.write(raw)
        from calibre.ebooks.oeb.base import close_self_closing_tags
        # Remaining cache entries map an output file name to a parsed tree.
        for f, root in parse_cache.items():
            raw = html.tostring(root, encoding='utf-8', method='xml',
                    include_meta_content_type=False)
            raw = close_self_closing_tags(raw)
            with lopen(f, 'wb') as q:
                q.write(raw)
        accelerators['pagebreaks'] = '//h:div[@class="mbp_pagebreak"]'
        return mr.created_opf_path

View File

@@ -0,0 +1,337 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from calibre.customize.conversion import (OutputFormatPlugin,
OptionRecommendation)
from polyglot.builtins import unicode_type
def remove_html_cover(oeb, log):
    """Drop the guide's 'cover' entry and take its item out of the spine.

    Does nothing unless the book has cover metadata and the guide has a
    'cover' reference. The guide entry is always deleted in that case; the
    item is removed from the spine only if it is in it, and from the manifest
    too only if its media type is one of ``OEB_DOCS``.
    """
    from calibre.ebooks.oeb.base import OEB_DOCS

    if not (oeb.metadata.cover and 'cover' in oeb.guide):
        return

    cover_href = oeb.guide['cover'].href
    del oeb.guide['cover']
    cover_item = oeb.manifest.hrefs[cover_href]

    if cover_item.spine_position is None:
        return

    log.warn('Found an HTML cover: ', cover_item.href, 'removing it.',
            'If you find some content missing from the output MOBI, it '
            'is because you misidentified the HTML cover in the input '
            'document')
    oeb.spine.remove(cover_item)
    if cover_item.media_type in OEB_DOCS:
        oeb.manifest.remove(cover_item)
def extract_mobi(output_path, opts):
    """Run ``inspect_mobi`` on the written file when ``opts.extract_to`` is set."""
    if opts.extract_to is None:
        return
    from calibre.ebooks.mobi.debug.main import inspect_mobi
    target_dir = opts.extract_to
    inspect_mobi(output_path, ddir=target_dir)
# Output plugin that writes MOBI files. Depending on the mobi_file_type option
# (see convert()), the result is MOBI 6 only ('old'), KF8 only ('new') or a
# file carrying both ('both').
class MOBIOutput(OutputFormatPlugin):
name = 'MOBI Output'
author = 'Kovid Goyal'
file_type = 'mobi'
commit_name = 'mobi_output'
# Allowed values for the mobi_file_type option declared below.
ui_data = {'file_types': ['old', 'both', 'new']}
options = {
OptionRecommendation(name='prefer_author_sort',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('When present, use author sort field as author.')
),
OptionRecommendation(name='no_inline_toc',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Don\'t add Table of Contents to the book. Useful if '
'the book has its own table of contents.')),
OptionRecommendation(name='toc_title', recommended_value=None,
help=_('Title for any generated in-line table of contents.')
),
OptionRecommendation(name='dont_compress',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Disable compression of the file contents.')
),
OptionRecommendation(name='personal_doc', recommended_value='[PDOC]',
help=_('Tag for MOBI files to be marked as personal documents.'
' This option has no effect on the conversion. It is used'
' only when sending MOBI files to a device. If the file'
' being sent has the specified tag, it will be marked as'
' a personal document when sent to the Kindle.')
),
OptionRecommendation(name='mobi_ignore_margins',
recommended_value=False,
help=_('Ignore margins in the input document. If False, then '
'the MOBI output plugin will try to convert margins specified'
' in the input document, otherwise it will ignore them.')
),
OptionRecommendation(name='mobi_toc_at_start',
recommended_value=False,
help=_('When adding the Table of Contents to the book, add it at the start of the '
'book instead of the end. Not recommended.')
),
OptionRecommendation(name='extract_to',
help=_('Extract the contents of the generated %s file to the '
'specified directory. The contents of the directory are first '
'deleted, so be careful.') % 'MOBI'
),
OptionRecommendation(name='share_not_sync', recommended_value=False,
help=_('Enable sharing of book content via Facebook etc. '
' on the Kindle. WARNING: Using this feature means that '
' the book will not auto sync its last read position '
' on multiple devices. Complain to Amazon.')
),
OptionRecommendation(name='mobi_keep_original_images',
recommended_value=False,
help=_('By default calibre converts all images to JPEG format '
'in the output MOBI file. This is for maximum compatibility '
'as some older MOBI viewers have problems with other image '
'formats. This option tells calibre not to do this. '
'Useful if your document contains lots of GIF/PNG images that '
'become very large when converted to JPEG.')),
OptionRecommendation(name='mobi_file_type', choices=ui_data['file_types'], recommended_value='old',
help=_('By default calibre generates MOBI files that contain the '
'old MOBI 6 format. This format is compatible with all '
'devices. However, by changing this setting, you can tell '
'calibre to generate MOBI files that contain both MOBI 6 and '
'the new KF8 format, or only the new KF8 format. KF8 has '
'more features than MOBI 6, but only works with newer Kindles. '
'Allowed values: {}').format('old, both, new')),
}
# For periodicals: restructure the TOC and make sure a masthead exists.
# opts.mobi_periodical is set to True or False accordingly.
def check_for_periodical(self):
if self.is_periodical:
self.periodicalize_toc()
self.check_for_masthead()
self.opts.mobi_periodical = True
else:
self.opts.mobi_periodical = False
# Make sure the guide has a 'masthead' entry. When it is missing, an image is
# generated from the book title, added to the manifest as image/gif and
# referenced from the guide.
def check_for_masthead(self):
found = 'masthead' in self.oeb.guide
if not found:
from calibre.ebooks import generate_masthead
self.oeb.log.debug('No masthead found in manifest, generating default mastheadImage...')
raw = generate_masthead(unicode_type(self.oeb.metadata['title'][0]))
id, href = self.oeb.manifest.generate('masthead', 'masthead')
self.oeb.manifest.add(id, href, 'image/gif', data=raw)
self.oeb.guide.add('masthead', 'Masthead Image', href)
else:
self.oeb.log.debug('Using mastheadImage supplied in manifest...')
# Rebuild the TOC as periodical -> section -> article nodes (klass values
# 'periodical', 'section', 'article'). Returns early for an empty TOC or a
# spine with fewer than 3 items; the rebuild is guarded by the first TOC node
# not already being of klass 'periodical'.
# NOTE(review): statement order matters here -- spine items are captured before
# manifest.remove() calls and spine[0] is read again afterwards; presumably
# removing a manifest item also changes the spine. Confirm before reordering.
def periodicalize_toc(self):
from calibre.ebooks.oeb.base import TOC
toc = self.oeb.toc
if not toc or len(self.oeb.spine) < 3:
return
if toc and toc[0].klass != 'periodical':
one, two = self.oeb.spine[0], self.oeb.spine[1]
self.log('Converting TOC for MOBI periodical indexing...')
# Maps id(section node) -> list of its article nodes.
articles = {}
if toc.depth() < 3:
# single section periodical
self.oeb.manifest.remove(one)
self.oeb.manifest.remove(two)
sections = [TOC(klass='section', title=_('All articles'),
href=self.oeb.spine[0].href)]
for x in toc:
sections[0].nodes.append(x)
else:
# multi-section periodical
self.oeb.manifest.remove(one)
sections = list(toc)
for i,x in enumerate(sections):
x.klass = 'section'
articles_ = list(x)
if articles_:
self.oeb.manifest.remove(self.oeb.manifest.hrefs[x.href])
x.href = articles_[0].href
# Detach every child of each section, marking it as an article.
for sec in sections:
articles[id(sec)] = []
for a in list(sec):
a.klass = 'article'
articles[id(sec)].append(a)
sec.nodes.remove(a)
root = TOC(klass='periodical', href=self.oeb.spine[0].href,
title=unicode_type(self.oeb.metadata.title[0]))
# Re-attach the articles and hang sections that have any under the new root.
for s in sections:
if articles[id(s)]:
for a in articles[id(s)]:
s.nodes.append(a)
root.nodes.append(s)
# Empty the old TOC and make the periodical root its only child.
for x in list(toc.nodes):
toc.nodes.remove(x)
toc.nodes.append(root)
# Fix up the periodical href to point to first section href
toc.nodes[0].href = toc.nodes[0].nodes[0].href
# Plugin entry point. Periodicals are always written as 'old' (MOBI 6). For
# 'new' and 'both' a KF8 book is created; 'new' writes only that KF8 book,
# otherwise write_mobi() produces the MOBI 6 output (receiving the KF8 book,
# or None, as an argument).
def convert(self, oeb, output_path, input_plugin, opts, log):
from calibre.ebooks.mobi.writer2.resources import Resources
self.log, self.opts, self.oeb = log, opts, oeb
mobi_type = opts.mobi_file_type
if self.is_periodical:
mobi_type = 'old' # Amazon does not support KF8 periodicals
create_kf8 = mobi_type in ('new', 'both')
remove_html_cover(self.oeb, self.log)
# Resources is constructed before check_for_periodical() runs.
resources = Resources(oeb, opts, self.is_periodical,
add_fonts=create_kf8)
self.check_for_periodical()
if create_kf8:
from calibre.ebooks.mobi.writer8.cleanup import remove_duplicate_anchors
remove_duplicate_anchors(self.oeb)
# Split on pagebreaks so that the resulting KF8 is faster to load
from calibre.ebooks.oeb.transforms.split import Split
Split()(self.oeb, self.opts)
kf8 = self.create_kf8(resources, for_joint=mobi_type=='both'
) if create_kf8 else None
if mobi_type == 'new':
kf8.write(output_path)
extract_mobi(output_path, opts)
return
self.log('Creating MOBI 6 output')
self.write_mobi(input_plugin, output_path, kf8, resources)
# Thin wrapper around create_kf8_book for the current book and options.
def create_kf8(self, resources, for_joint=False):
from calibre.ebooks.mobi.writer8.main import create_kf8_book
return create_kf8_book(self.oeb, self.opts, resources,
for_joint=for_joint)
# Run the MOBI 6 transforms in order (inline TOC unless disabled, case
# mangling, SVG rasterization, MobiML conversion), then write the file with
# MobiWriter and run the optional extract_to step.
def write_mobi(self, input_plugin, output_path, kf8, resources):
from calibre.ebooks.mobi.mobiml import MobiMLizer
from calibre.ebooks.oeb.transforms.manglecase import CaseMangler
from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer, Unavailable
from calibre.ebooks.oeb.transforms.htmltoc import HTMLTOCAdder
from calibre.customize.ui import plugin_for_input_format
opts, oeb = self.opts, self.oeb
if not opts.no_inline_toc:
tocadder = HTMLTOCAdder(title=opts.toc_title, position='start' if
opts.mobi_toc_at_start else 'end')
tocadder(oeb, opts)
mangler = CaseMangler()
mangler(oeb, opts)
try:
rasterizer = SVGRasterizer()
rasterizer(oeb, opts)
except Unavailable:
self.log.warn('SVG rasterizer unavailable, SVG will not be converted')
else:
# Add rasterized SVG images
resources.add_extra_images()
if hasattr(self.oeb, 'inserted_metadata_jacket'):
self.workaround_fire_bugs(self.oeb.inserted_metadata_jacket)
mobimlizer = MobiMLizer(ignore_tables=opts.linearize_tables)
mobimlizer(oeb, opts)
# True unless the input plugin is the one registered for 'cbz'.
write_page_breaks_after_item = input_plugin is not plugin_for_input_format('cbz')
from calibre.ebooks.mobi.writer2.main import MobiWriter
writer = MobiWriter(opts, resources, kf8,
write_page_breaks_after_item=write_page_breaks_after_item)
writer(oeb, output_path)
extract_mobi(output_path, opts)
# Apply the KF8 CSSCleanup to a single item and its stylizer.
def specialize_css_for_output(self, log, opts, item, stylizer):
from calibre.ebooks.mobi.writer8.cleanup import CSSCleanup
CSSCleanup(log, opts)(item, stylizer)
# Retag the jacket's table markup: table and tr elements become div, td
# elements become span.
def workaround_fire_bugs(self, jacket):
# The idiotic Fire crashes when trying to render the table used to
# layout the jacket
from calibre.ebooks.oeb.base import XHTML
for table in jacket.data.xpath('//*[local-name()="table"]'):
table.tag = XHTML('div')
for tr in table.xpath('descendant::*[local-name()="tr"]'):
cols = tr.xpath('descendant::*[local-name()="td"]')
tr.tag = XHTML('div')
for td in cols:
# cols is non-empty whenever this loop body runs, so 'span' is always chosen.
td.tag = XHTML('span' if cols else 'div')
class AZW3Output(OutputFormatPlugin):
    """Output plugin that writes a KF8-only AZW3 file."""

    name = 'AZW3 Output'
    author = 'Kovid Goyal'
    file_type = 'azw3'
    commit_name = 'azw3_output'

    options = {
        OptionRecommendation(name='prefer_author_sort',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('When present, use author sort field as author.')
        ),
        OptionRecommendation(name='no_inline_toc',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Don\'t add Table of Contents to the book. Useful if '
                'the book has its own table of contents.')),
        OptionRecommendation(name='toc_title', recommended_value=None,
            help=_('Title for any generated in-line table of contents.')
        ),
        OptionRecommendation(name='dont_compress',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Disable compression of the file contents.')
        ),
        OptionRecommendation(name='mobi_toc_at_start',
            recommended_value=False,
            help=_('When adding the Table of Contents to the book, add it at the start of the '
                'book instead of the end. Not recommended.')
        ),
        OptionRecommendation(name='extract_to',
            help=_('Extract the contents of the generated %s file to the '
                'specified directory. The contents of the directory are first '
                'deleted, so be careful.') % 'AZW3'),
        OptionRecommendation(name='share_not_sync', recommended_value=False,
            help=_('Enable sharing of book content via Facebook etc. '
                ' on the Kindle. WARNING: Using this feature means that '
                ' the book will not auto sync its last read position '
                ' on multiple devices. Complain to Amazon.')
        ),
    }

    def convert(self, oeb, output_path, input_plugin, opts, log):
        """Write ``oeb`` to ``output_path`` as a standalone KF8 book.

        Unless ``opts.mobi_passthrough`` is set, the HTML cover is removed and
        the book is split on page breaks before the KF8 book is created.
        """
        from calibre.ebooks.mobi.writer2.resources import Resources
        from calibre.ebooks.mobi.writer8.main import create_kf8_book
        from calibre.ebooks.mobi.writer8.cleanup import remove_duplicate_anchors

        self.oeb = oeb
        self.opts = opts
        self.log = log
        opts.mobi_periodical = self.is_periodical
        is_passthrough = getattr(opts, 'mobi_passthrough', False)

        remove_duplicate_anchors(oeb)
        book_resources = Resources(
            self.oeb, self.opts, self.is_periodical,
            add_fonts=True, process_images=False)

        if not is_passthrough:
            remove_html_cover(self.oeb, self.log)

            # Split on pagebreaks so that the resulting KF8 is faster to load
            from calibre.ebooks.oeb.transforms.split import Split
            splitter = Split()
            splitter(self.oeb, self.opts)

        kf8_book = create_kf8_book(
            self.oeb, self.opts, book_resources, for_joint=False)
        kf8_book.write(output_path)
        extract_mobi(output_path, opts)

    def specialize_css_for_output(self, log, opts, item, stylizer):
        """Run the KF8 ``CSSCleanup`` over one item and its stylizer."""
        from calibre.ebooks.mobi.writer8.cleanup import CSSCleanup
        cleanup = CSSCleanup(log, opts)
        cleanup(item, stylizer)

View File

@@ -0,0 +1,25 @@
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
'''
Convert an ODT file into an Open Ebook
'''
from calibre.customize.conversion import InputFormatPlugin
class ODTInput(InputFormatPlugin):
    """Input plugin that hands ODT files to the ODT ``Extract`` helper."""

    name = 'ODT Input'
    author = 'Kovid Goyal'
    description = 'Convert ODT (OpenOffice) files to HTML'
    file_types = {'odt'}
    commit_name = 'odt_input'

    def convert(self, stream, options, file_ext, log,
                accelerators):
        """Extract ``stream`` into the current directory; return Extract's result."""
        from calibre.ebooks.odt.input import Extract
        extractor = Extract()
        return extractor(stream, '.', log)

View File

@@ -0,0 +1,122 @@
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL 3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import os, re
from calibre.customize.conversion import (OutputFormatPlugin,
OptionRecommendation)
from calibre import CurrentDir
class OEBOutput(OutputFormatPlugin):
    """Output plugin that writes the book as an OEB directory tree."""

    name = 'OEB Output'
    author = 'Kovid Goyal'
    file_type = 'oeb'
    commit_name = 'oeb_output'

    recommendations = {('pretty_print', True, OptionRecommendation.HIGH)}

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        """Write the OPF, NCX and page map, then every manifest item.

        ``output_path`` is a directory and is created when missing. All files
        are written relative to it. The OPF gets the device workarounds and
        language-code migration before serialization.
        """
        from polyglot.urllib import unquote
        from lxml import etree

        self.log, self.opts = log, opts
        if not os.path.exists(output_path):
            os.makedirs(output_path)
        from calibre.ebooks.oeb.base import OPF_MIME, NCX_MIME, PAGE_MAP_MIME, OEB_STYLES
        from calibre.ebooks.oeb.normalize_css import condense_sheet
        with CurrentDir(output_path):
            results = oeb_book.to_opf2(page_map=True)
            for key in (OPF_MIME, NCX_MIME, PAGE_MAP_MIME):
                href, root = results.pop(key, [None, None])
                if root is not None:
                    if key == OPF_MIME:
                        # The workarounds are best effort: ordinary failures
                        # are logged and ignored, while KeyboardInterrupt and
                        # SystemExit still propagate.
                        try:
                            self.workaround_nook_cover_bug(root)
                        except Exception:
                            self.log.exception('Something went wrong while trying to'
                                    ' workaround Nook cover bug, ignoring')
                        try:
                            self.workaround_pocketbook_cover_bug(root)
                        except Exception:
                            self.log.exception('Something went wrong while trying to'
                                    ' workaround Pocketbook cover bug, ignoring')
                        self.migrate_lang_code(root)
                    raw = etree.tostring(root, pretty_print=True,
                            encoding='utf-8', xml_declaration=True)
                    if key == OPF_MIME:
                        # Needed as I can't get lxml to output opf:role and
                        # not output <opf:metadata> as well
                        raw = re.sub(br'(<[/]{0,1})opf:', br'\1', raw)
                    with lopen(href, 'wb') as f:
                        f.write(raw)

            for item in oeb_book.manifest:
                if (
                        not self.opts.expand_css and item.media_type in OEB_STYLES and hasattr(
                            item.data, 'cssText') and 'nook' not in self.opts.output_profile.short_name):
                    condense_sheet(item.data)
                path = os.path.abspath(unquote(item.href))
                # Named dirpath so the builtin dir() is not shadowed.
                dirpath = os.path.dirname(path)
                if not os.path.exists(dirpath):
                    os.makedirs(dirpath)
                with lopen(path, 'wb') as f:
                    f.write(item.bytes_representation)
                item.unload_data_from_memory(memory=path)

    def workaround_nook_cover_bug(self, root):  # {{{
        """Give the cover image the manifest id 'cover'.

        Acts only when exactly one ``<meta name="cover">`` has a content other
        than "cover" and that content names exactly one manifest item with an
        ``image/*`` media type. Any item already using the id 'cover' gets a
        fresh id first, and its idrefs are updated to match.
        """
        cov = root.xpath('//*[local-name() = "meta" and @name="cover" and'
                ' @content != "cover"]')

        def manifest_items_with_id(id_):
            return root.xpath('//*[local-name() = "manifest"]/*[local-name() = "item" '
                ' and @id="%s"]'%id_)

        if len(cov) == 1:
            cov = cov[0]
            covid = cov.get('content', '')

            if covid:
                manifest_item = manifest_items_with_id(covid)
                if len(manifest_item) == 1 and \
                        manifest_item[0].get('media-type',
                                '').startswith('image/'):
                    self.log.warn('The cover image has an id != "cover". Renaming'
                            ' to work around bug in Nook Color')

                    from calibre.ebooks.oeb.base import uuid_id
                    newid = uuid_id()

                    for item in manifest_items_with_id('cover'):
                        item.set('id', newid)

                    for x in root.xpath('//*[@idref="cover"]'):
                        x.set('idref', newid)

                    manifest_item = manifest_item[0]
                    manifest_item.set('id', 'cover')
                    cov.set('content', 'cover')
    # }}}

    def workaround_pocketbook_cover_bug(self, root):  # {{{
        """Move the single manifest item with id 'cover' to the front of its parent."""
        m = root.xpath('//*[local-name() = "manifest"]/*[local-name() = "item" '
                ' and @id="cover"]')
        if len(m) == 1:
            m = m[0]
            p = m.getparent()
            p.remove(m)
            p.insert(0, m)
    # }}}

    def migrate_lang_code(self, root):  # {{{
        """Replace each language element's text with ``lang_as_iso639_1``'s result, when truthy."""
        from calibre.utils.localization import lang_as_iso639_1
        for lang in root.xpath('//*[local-name() = "language"]'):
            clc = lang_as_iso639_1(lang.text)
            if clc:
                lang.text = clc
    # }}}

View File

@@ -0,0 +1,37 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
from calibre.customize.conversion import InputFormatPlugin
from polyglot.builtins import getcwd
class PDBInput(InputFormatPlugin):
    """Input plugin for PDB containers; dispatches on the header identity."""

    name = 'PDB Input'
    author = 'John Schember'
    description = 'Convert PDB to HTML'
    file_types = {'pdb', 'updb'}
    commit_name = 'pdb_input'

    def convert(self, stream, options, file_ext, log,
                accelerators):
        """Extract the book into the current directory and return the OPF path.

        :raises PDBError: when no reader is registered for the identity found
            in the PDB header.
        """
        from calibre.ebooks.pdb.header import PdbHeaderReader
        from calibre.ebooks.pdb import PDBError, IDENTITY_TO_NAME, get_reader

        header = PdbHeaderReader(stream)
        Reader = get_reader(header.ident)
        # One tolerant lookup shared by the error and the debug message, so a
        # missing display name can never abort the conversion with a KeyError
        # raised from a log line.
        book_type = IDENTITY_TO_NAME.get(header.ident, _('Unknown'))

        if Reader is None:
            raise PDBError('No reader available for format within container.\n Identity is %s. Book type is %s' %
                        (header.ident, book_type))

        log.debug('Detected ebook format as: %s with identity: %s' % (book_type, header.ident))

        reader = Reader(header, stream, log, options)
        opf = reader.extract_content(getcwd())

        return opf

View File

@@ -0,0 +1,64 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import os
from calibre.customize.conversion import OutputFormatPlugin, \
OptionRecommendation
from calibre.ebooks.pdb import PDBError, get_writer, ALL_FORMAT_WRITERS
class PDBOutput(OutputFormatPlugin):
    """Output plugin that writes a PDB container in one of several formats."""

    name = 'PDB Output'
    author = 'John Schember'
    file_type = 'pdb'
    commit_name = 'pdb_output'
    ui_data = {'formats': tuple(ALL_FORMAT_WRITERS)}

    options = {
        OptionRecommendation(name='format', recommended_value='doc',
            level=OptionRecommendation.LOW,
            short_switch='f', choices=list(ALL_FORMAT_WRITERS),
            help=(_('Format to use inside the pdb container. Choices are:') + ' %s' % sorted(ALL_FORMAT_WRITERS))),
        OptionRecommendation(name='pdb_output_encoding', recommended_value='cp1252',
            level=OptionRecommendation.LOW,
            help=_('Specify the character encoding of the output document. '
            'The default is cp1252. Note: This option is not honored by all '
            'formats.')),
        OptionRecommendation(name='inline_toc',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Add Table of Contents to beginning of the book.')),
    }

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        """Write ``oeb_book`` using the writer selected by ``opts.format``.

        :param output_path: A filesystem path, or an object with a ``write``
            attribute that is used as the output stream directly. A stream
            passed in by the caller is left open; a file opened here is
            always closed.
        :raises PDBError: when no writer exists for ``opts.format``.
        """
        # Resolve the writer before touching the filesystem, so an unsupported
        # format neither creates nor truncates the output file.
        Writer = get_writer(opts.format)

        if Writer is None:
            # Report the requested format; the previous message formatted the
            # builtin ``format`` function instead.
            raise PDBError('No writer available for format %s.' % opts.format)

        close = False
        if not hasattr(output_path, 'write'):
            close = True
            if not os.path.exists(os.path.dirname(output_path)) and os.path.dirname(output_path):
                os.makedirs(os.path.dirname(output_path))
            out_stream = lopen(output_path, 'wb')
        else:
            out_stream = output_path

        try:
            setattr(opts, 'max_line_length', 0)
            setattr(opts, 'force_max_line_length', False)

            writer = Writer(opts, log)

            out_stream.seek(0)
            out_stream.truncate()

            writer.write_content(oeb_book, out_stream, oeb_book.metadata)
        finally:
            # Do not leak the handle we opened if the writer raises.
            if close:
                out_stream.close()

View File

@@ -0,0 +1,82 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import os
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
from polyglot.builtins import as_bytes, getcwd
class PDFInput(InputFormatPlugin):
    """Input plugin that converts PDF files through ``pdftohtml``."""

    name = 'PDF Input'
    author = 'Kovid Goyal and John Schember'
    description = 'Convert PDF files to HTML'
    file_types = {'pdf'}
    commit_name = 'pdf_input'

    options = {
        OptionRecommendation(name='no_images', recommended_value=False,
            help=_('Do not extract images from the document')),
        OptionRecommendation(name='unwrap_factor', recommended_value=0.45,
            help=_('Scale used to determine the length at which a line should '
            'be unwrapped. Valid values are a decimal between 0 and 1. The '
            'default is 0.45, just below the median line length.')),
        OptionRecommendation(name='new_pdf_engine', recommended_value=False,
            help=_('Use the new PDF conversion engine. Currently not operational.'))
    }

    def convert_new(self, stream, accelerators):
        """Convert via the XML output of pdftohtml and ``PDFDocument``.

        Returns the path of ``metadata.opf`` in the current directory.
        """
        from calibre.ebooks.pdf.pdftohtml import pdftohtml
        from calibre.utils.cleantext import clean_ascii_chars
        from calibre.ebooks.pdf.reflow import PDFDocument

        pdftohtml(getcwd(), stream.name, self.opts.no_images, as_xml=True)
        with lopen('index.xml', 'rb') as xml_file:
            cleaned_xml = clean_ascii_chars(xml_file.read())
        PDFDocument(cleaned_xml, self.opts, self.log)
        return os.path.join(getcwd(), 'metadata.opf')

    def convert(self, stream, options, file_ext, log,
                accelerators):
        """Convert the PDF to ``index.html`` plus a generated ``metadata.opf``.

        Every other file found in the current directory after pdftohtml runs
        is listed in the manifest. When ``toc.ncx`` exists and has a manifest
        id, the spine element is rewritten to reference it. Returns the path
        of ``metadata.opf``.
        """
        from calibre.ebooks.metadata.opf2 import OPFCreator
        from calibre.ebooks.pdf.pdftohtml import pdftohtml

        log.debug('Converting file to html...')
        # The main html file will be named index.html
        self.opts = options
        self.log = log
        if options.new_pdf_engine:
            return self.convert_new(stream, accelerators)
        pdftohtml(getcwd(), stream.name, options.no_images)

        from calibre.ebooks.metadata.meta import get_metadata
        log.debug('Retrieving document metadata...')
        metadata = get_metadata(stream, 'pdf')
        opf = OPFCreator(getcwd(), metadata)

        # index.html goes first; everything else pdftohtml produced follows.
        extracted = os.listdir(getcwd())
        extracted.remove('index.html')
        manifest = [('index.html', None)]
        manifest.extend((entry, None) for entry in extracted)
        log.debug('Generating manifest...')
        opf.create_manifest(manifest)

        opf.create_spine(['index.html'])
        log.debug('Rendering manifest...')
        with lopen('metadata.opf', 'wb') as opf_file:
            opf.render(opf_file)

        ncxid = None
        if os.path.exists('toc.ncx'):
            ncxid = opf.manifest.id_for_path('toc.ncx')
        if ncxid:
            # Patch the rendered OPF in place so the spine points at the NCX.
            with lopen('metadata.opf', 'r+b') as opf_file:
                spine_tag = b'<spine toc="%s"' % as_bytes(ncxid)
                patched = opf_file.read().replace(b'<spine', spine_tag)
                opf_file.seek(0)
                opf_file.write(patched)

        return os.path.join(getcwd(), 'metadata.opf')

View File

@@ -0,0 +1,256 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL 3'
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
'''
Convert OEB ebook format to PDF.
'''
import glob, os
from calibre.customize.conversion import (OutputFormatPlugin,
OptionRecommendation)
from calibre.ptempfile import TemporaryDirectory
from polyglot.builtins import iteritems, unicode_type
# Units of measure offered as choices for the PDF output 'unit' option.
UNITS = ('millimeter', 'centimeter', 'point', 'inch' , 'pica' , 'didot',
'cicero', 'devicepixel')
# Named paper sizes offered as choices for the PDF output 'paper_size' option.
PAPER_SIZES = ('a0', 'a1', 'a2', 'a3', 'a4', 'a5', 'a6', 'b0', 'b1',
'b2', 'b3', 'b4', 'b5', 'b6', 'legal', 'letter')
class PDFOutput(OutputFormatPlugin):
name = 'PDF Output'
author = 'Kovid Goyal'
file_type = 'pdf'
commit_name = 'pdf_output'
ui_data = {'paper_sizes': PAPER_SIZES, 'units': UNITS, 'font_types': ('serif', 'sans', 'mono')}
# Conversion options exposed by the PDF output plugin.  Each entry names the
# option, its recommended (default) value and the help text shown to users.
options = {
    OptionRecommendation(name='use_profile_size', recommended_value=False,
        help=_('Instead of using the paper size specified in the PDF Output options,'
               ' use a paper size corresponding to the current output profile.'
               ' Useful if you want to generate a PDF for viewing on a specific device.')),

    OptionRecommendation(name='unit', recommended_value='inch',
        level=OptionRecommendation.LOW, short_switch='u', choices=UNITS,
        help=_('The unit of measure for page sizes. Default is inch. Choices '
               'are {} '
               'Note: This does not override the unit for margins!').format(', '.join(UNITS))),

    OptionRecommendation(name='paper_size', recommended_value='letter',
        level=OptionRecommendation.LOW, choices=PAPER_SIZES,
        help=_('The size of the paper. This size will be overridden when a '
               'non default output profile is used. Default is letter. Choices '
               'are {}').format(', '.join(PAPER_SIZES))),

    OptionRecommendation(name='custom_size', recommended_value=None,
        help=_('Custom size of the document. Use the form widthxheight '
               'e.g. `123x321` to specify the width and height. '
               'This overrides any specified paper-size.')),

    OptionRecommendation(name='preserve_cover_aspect_ratio',
        recommended_value=False,
        help=_('Preserve the aspect ratio of the cover, instead'
               ' of stretching it to fill the full first page of the'
               ' generated pdf.')),

    OptionRecommendation(name='pdf_serif_family',
        recommended_value='Times', help=_(
            'The font family used to render serif fonts. Will work only if the font is available system-wide.')),

    OptionRecommendation(name='pdf_sans_family',
        recommended_value='Helvetica', help=_(
            'The font family used to render sans-serif fonts. Will work only if the font is available system-wide.')),

    OptionRecommendation(name='pdf_mono_family',
        recommended_value='Courier', help=_(
            'The font family used to render monospace fonts. Will work only if the font is available system-wide.')),

    # The help text used to repeat the monospace description of
    # pdf_mono_family; this option picks one of ui_data['font_types'].
    OptionRecommendation(name='pdf_standard_font', choices=ui_data['font_types'],
        recommended_value='serif', help=_(
            'The type of font to use as the standard font')),

    OptionRecommendation(name='pdf_default_font_size',
        recommended_value=20, help=_(
            'The default font size')),

    OptionRecommendation(name='pdf_mono_font_size',
        recommended_value=16, help=_(
            'The default font size for monospaced text')),

    OptionRecommendation(name='pdf_hyphenate', recommended_value=False,
        help=_('Break long words at the end of lines. This can give the text at the right margin a more even appearance.')),

    OptionRecommendation(name='pdf_mark_links', recommended_value=False,
        help=_('Surround all links with a red box, useful for debugging.')),

    OptionRecommendation(name='pdf_page_numbers', recommended_value=False,
        help=_('Add page numbers to the bottom of every page in the generated PDF file. If you '
               'specify a footer template, it will take precedence '
               'over this option.')),

    OptionRecommendation(name='pdf_footer_template', recommended_value=None,
        help=_('An HTML template used to generate %s on every page.'
               ' The strings _PAGENUM_, _TITLE_, _AUTHOR_ and _SECTION_ will be replaced by their current values.')%_('footers')),

    OptionRecommendation(name='pdf_header_template', recommended_value=None,
        help=_('An HTML template used to generate %s on every page.'
               ' The strings _PAGENUM_, _TITLE_, _AUTHOR_ and _SECTION_ will be replaced by their current values.')%_('headers')),

    OptionRecommendation(name='pdf_add_toc', recommended_value=False,
        help=_('Add a Table of Contents at the end of the PDF that lists page numbers. '
               'Useful if you want to print out the PDF. If this PDF is intended for electronic use, use the PDF Outline instead.')),

    OptionRecommendation(name='toc_title', recommended_value=None,
        help=_('Title for generated table of contents.')
    ),

    OptionRecommendation(name='pdf_page_margin_left', recommended_value=72.0,
        level=OptionRecommendation.LOW,
        help=_('The size of the left page margin, in pts. Default is 72pt.'
               ' Overrides the common left page margin setting.')
    ),

    OptionRecommendation(name='pdf_page_margin_top', recommended_value=72.0,
        level=OptionRecommendation.LOW,
        help=_('The size of the top page margin, in pts. Default is 72pt.'
               ' Overrides the common top page margin setting, unless set to zero.')
    ),

    OptionRecommendation(name='pdf_page_margin_right', recommended_value=72.0,
        level=OptionRecommendation.LOW,
        help=_('The size of the right page margin, in pts. Default is 72pt.'
               ' Overrides the common right page margin setting, unless set to zero.')
    ),

    OptionRecommendation(name='pdf_page_margin_bottom', recommended_value=72.0,
        level=OptionRecommendation.LOW,
        help=_('The size of the bottom page margin, in pts. Default is 72pt.'
               ' Overrides the common bottom page margin setting, unless set to zero.')
    ),

    OptionRecommendation(name='pdf_use_document_margins', recommended_value=False,
        help=_('Use the page margins specified in the input document via @page CSS rules.'
               ' This will cause the margins specified in the conversion settings to be ignored.'
               ' If the document does not specify page margins, the conversion settings will be used as a fallback.')
    ),

    OptionRecommendation(name='pdf_page_number_map', recommended_value=None,
        help=_('Adjust page numbers, as needed. Syntax is a JavaScript expression for the page number.'
               ' For example, "if (n < 3) 0; else n - 3;", where n is current page number.')
    ),

    OptionRecommendation(name='uncompressed_pdf',
        recommended_value=False, help=_(
            'Generate an uncompressed PDF, useful for debugging.')
    ),

    OptionRecommendation(name='pdf_odd_even_offset', recommended_value=0.0,
        level=OptionRecommendation.LOW,
        help=_(
            'Shift the text horizontally by the specified offset (in pts).'
            ' On odd numbered pages, it is shifted to the right and on even'
            ' numbered pages to the left. Use negative numbers for the opposite'
            ' effect. Note that this setting is ignored on pages where the margins'
            ' are smaller than the specified offset. Shifting is done by setting'
            ' the PDF CropBox, not all software respects the CropBox.'
        )
    ),
}
def specialize_options(self, log, opts, input_fmt):
    """Register the fake URL scheme with Qt WebEngine and record the input format.

    When ``opts.pdf_use_document_margins`` is set, the four common margin
    options are set to -1.
    """
    # Ensure Qt is setup to be used with WebEngine
    # specialize_options is called early enough in the pipeline
    # that hopefully no Qt application has been constructed as yet
    from PyQt5.QtWebEngineCore import QWebEngineUrlScheme
    from PyQt5.QtWebEngineWidgets import QWebEnginePage  # noqa
    from calibre.gui2 import must_use_qt
    from calibre.constants import FAKE_PROTOCOL

    fake_scheme = QWebEngineUrlScheme(FAKE_PROTOCOL.encode('ascii'))
    fake_scheme.setSyntax(QWebEngineUrlScheme.Syntax.Host)
    fake_scheme.setFlags(QWebEngineUrlScheme.SecureScheme)
    QWebEngineUrlScheme.registerScheme(fake_scheme)
    must_use_qt()

    self.input_fmt = input_fmt
    if not opts.pdf_use_document_margins:
        return
    # Prevent the conversion pipeline from overwriting document margins
    for side in ('left', 'right', 'top', 'bottom'):
        setattr(opts, 'margin_' + side, -1)
def convert(self, oeb_book, output_path, input_plugin, opts, log):
    """Convert ``oeb_book`` to a PDF at ``output_path``.

    The book metadata is serialised to a throw-away OPF 2.0 package and
    read back as a book-metadata object; the book is then converted either
    as an image collection or as text, depending on
    ``input_plugin.is_image_collection``.
    """
    self.stored_page_margins = getattr(opts, '_stored_page_margins', {})
    self.oeb = oeb_book
    self.input_plugin = input_plugin
    self.opts = opts
    self.log = log
    self.output_path = output_path

    # Two unrelated objects are both called OPF upstream: the tag-name
    # helper from oeb.base and the OPF reader from metadata.opf2.
    from calibre.ebooks.oeb.base import OPF as opf_tag, OPF2_NS
    from lxml import etree
    from io import BytesIO
    package = etree.Element(
        opf_tag('package'),
        attrib={'version': '2.0', 'unique-identifier': 'dummy'},
        nsmap={None: OPF2_NS})
    from calibre.ebooks.metadata.opf2 import OPF as OPFReader
    self.oeb.metadata.to_opf2(package)
    serialized = BytesIO(etree.tostring(package))
    self.metadata = OPFReader(serialized).to_book_metadata()
    self.cover_data = None

    if not input_plugin.is_image_collection:
        log.debug('Converting input as a text based book...')
        self.convert_text(oeb_book)
    else:
        log.debug('Converting input as an image collection...')
        self.convert_images(input_plugin.get_images())
def convert_images(self, images):
    """Pass ``images`` to ``calibre.ebooks.pdf.image_writer.convert``.

    The output path, options, metadata and progress callback stored on the
    plugin are forwarded unchanged.
    """
    from calibre.ebooks.pdf.image_writer import convert as write_images_pdf
    write_images_pdf(
        images, self.output_path, self.opts, self.metadata,
        self.report_progress)
def get_cover_data(self):
    """Set ``self.cover_data`` from the manifest item named as the cover.

    Nothing is changed when the book metadata lists no cover, or when the
    first cover id is not present in the manifest.
    """
    book = self.oeb
    if not book.metadata.cover:
        return
    cover_id = unicode_type(book.metadata.cover[0])
    if cover_id not in book.manifest.ids:
        return
    self.cover_data = book.manifest.ids[cover_id].data
def process_fonts(self):
    ''' Make sure all fonts are embeddable

    Every @font-face rule of every stylesheet in the manifest is inspected.
    The font file it points to is passed through
    ``remove_embed_restriction`` once per path; if that changes the data,
    the manifest item is updated and the new data written to the container.
    Rules or fonts that cannot be processed are skipped.
    '''
    from calibre.ebooks.oeb.base import urlnormalize
    from calibre.utils.fonts.utils import remove_embed_restriction

    processed = set()
    for item in list(self.oeb.manifest):
        if not hasattr(item.data, 'cssRules'):
            continue
        for rule in item.data.cssRules:
            if rule.type != rule.FONT_FACE_RULE:
                continue
            try:
                src = rule.style.getProperty('src').propertyValue[0].uri
            except Exception:
                # No usable src on this rule: best effort, move on.  Only
                # ordinary errors are swallowed, never KeyboardInterrupt
                # or SystemExit.
                continue
            path = item.abshref(src)
            ff = self.oeb.manifest.hrefs.get(urlnormalize(path), None)
            if ff is None or path in processed:
                continue
            processed.add(path)

            raw = ff.data
            try:
                nraw = remove_embed_restriction(raw)
            except Exception:
                # Font could not be processed; leave it untouched.
                continue
            if nraw != raw:
                ff.data = nraw
                self.oeb.container.write(path, nraw)
def convert_text(self, oeb_book):
    """Convert a text based book to PDF.

    The book is first written out with the 'oeb' output plugin into a
    temporary directory; the first ``*.opf`` found there is then handed to
    ``calibre.ebooks.pdf.html_writer.convert``.
    """
    import json
    from calibre.ebooks.pdf.html_writer import convert as html_to_pdf
    self.get_cover_data()
    self.process_fonts()

    if self.opts.pdf_use_document_margins and self.stored_page_margins:
        # Attach the stored per-file margins to the root element of each
        # matching manifest item as a JSON attribute.
        for href, margins in iteritems(self.stored_page_margins):
            item = oeb_book.manifest.hrefs.get(href)
            if item is None:
                continue
            root = item.data
            if hasattr(root, 'xpath') and margins:
                root.set('data-calibre-pdf-output-page-margins', json.dumps(margins))

    with TemporaryDirectory('_pdf_out') as tmp_dir:
        from calibre.customize.ui import plugin_for_output_format
        oeb_dir = os.path.realpath(tmp_dir)
        oeb_writer = plugin_for_output_format('oeb')
        oeb_writer.convert(oeb_book, oeb_dir, self.input_plugin, self.opts, self.log)
        opf_candidates = glob.glob(os.path.join(oeb_dir, '*.opf'))
        html_to_pdf(
            opf_candidates[0], self.opts,
            metadata=self.metadata,
            output_path=self.output_path,
            log=self.log,
            cover_data=self.cover_data,
            report_progress=self.report_progress,
        )

View File

@@ -0,0 +1,165 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import glob
import os
import shutil
from calibre.customize.conversion import InputFormatPlugin
from calibre.ptempfile import TemporaryDirectory
from polyglot.builtins import getcwd
class PMLInput(InputFormatPlugin):
name = 'PML Input'
author = 'John Schember'
description = 'Convert PML to OEB'
# pmlz is a zip file containing pml files and png images.
file_types = {'pml', 'pmlz'}
commit_name = 'pml_input'
def process_pml(self, pml_path, html_path, close_all=False):
    """Convert one PML document to HTML and return its table of contents.

    ``pml_path`` and ``html_path`` may each be a path or an already open
    binary stream (an object with ``read`` / ``write``).  Streams opened
    here are closed here, also when the conversion raises; streams passed
    in by the caller are left open.  ``close_all`` is accepted but unused.

    The input is decoded with ``self.options.input_encoding`` when set,
    else with the stream's ``encoding`` attribute, else with cp1252.
    """
    from calibre.ebooks.pml.pmlconverter import PML_HTMLizer

    pml_stream = html_stream = None
    pclose = hclose = False
    try:
        if not hasattr(pml_path, 'read'):
            pml_stream = lopen(pml_path, 'rb')
            pclose = True
        else:
            pml_stream = pml_path
            pml_stream.seek(0)

        if not hasattr(html_path, 'write'):
            html_stream = lopen(html_path, 'wb')
            hclose = True
        else:
            html_stream = html_path

        ienc = getattr(pml_stream, 'encoding', None)
        if ienc is None:
            ienc = 'cp1252'
        if self.options.input_encoding:
            ienc = self.options.input_encoding

        self.log.debug('Converting PML to HTML...')
        hizer = PML_HTMLizer()
        html = hizer.parse_pml(pml_stream.read().decode(ienc), html_path)
        html = '<html><head><title></title></head><body>%s</body></html>'%html
        html_stream.write(html.encode('utf-8', 'replace'))
    finally:
        # Release only what this method opened, even on failure.
        if pclose:
            pml_stream.close()
        if hclose:
            html_stream.close()

    return hizer.get_toc()
def get_images(self, stream, tdir, top_level=False):
    """Copy the PNG images belonging to a PML book into ``./images``.

    Locations are tried in order and the first one that yields files wins:
    ``tdir`` itself (only when ``top_level`` is true), ``tdir/<book>_img``
    (only when ``stream`` has a ``name``), then ``tdir/images``.

    Returns the list of copied images as ``'images/<file name>'`` strings;
    the list is empty when no PNG was found.
    """
    images = []
    imgs = []

    if top_level:
        imgs = glob.glob(os.path.join(tdir, '*.png'))
    # Images not in top level try bookname_img directory because
    # that's where Dropbook likes to see them.
    if not imgs:
        if hasattr(stream, 'name'):
            imgs = glob.glob(os.path.join(tdir, os.path.splitext(os.path.basename(stream.name))[0] + '_img', '*.png'))
    # No images in Dropbook location try generic images directory
    if not imgs:
        imgs = glob.glob(os.path.join(tdir, 'images', '*.png'))

    if imgs:
        images_dir = os.path.join(getcwd(), 'images')
        # An already existing images directory must not abort the
        # conversion; os.makedirs raises when the target exists.
        if not os.path.isdir(images_dir):
            os.makedirs(images_dir)
        for img in imgs:
            pimg_name = os.path.basename(img)
            images.append('images/' + pimg_name)
            shutil.copy(img, os.path.join(images_dir, pimg_name))

    return images
def convert(self, stream, options, file_ext, log,
            accelerators):
    """Convert a PML or PMLZ book to HTML pages plus OPF and NCX files.

    For ``pmlz`` input the archive is unpacked into a temporary directory
    and every top-level ``*.pml`` file is converted to its own HTML page.
    Any other input is treated as a single PML document and written to
    ``index.html``.  Images are collected with ``get_images``.

    Returns the path of the generated ``metadata.opf``.
    """
    from calibre.ebooks.metadata.toc import TOC
    from calibre.ebooks.metadata.opf2 import OPFCreator
    from calibre.utils.zipfile import ZipFile

    self.options = options
    self.log = log
    pages, images = [], []
    toc = TOC()

    if file_ext == 'pmlz':
        log.debug('De-compressing content to temporary directory...')
        with TemporaryDirectory('_unpmlz') as tdir:
            zf = ZipFile(stream)
            try:
                zf.extractall(tdir)
            finally:
                # The archive object was never closed before.
                zf.close()

            pmls = glob.glob(os.path.join(tdir, '*.pml'))
            for pml in pmls:
                html_name = os.path.splitext(os.path.basename(pml))[0]+'.html'
                html_path = os.path.join(getcwd(), html_name)

                pages.append(html_name)
                log.debug('Processing PML item %s...' % pml)
                ttoc = self.process_pml(pml, html_path)
                toc += ttoc
            images = self.get_images(stream, tdir, True)
    else:
        toc = self.process_pml(stream, 'index.html')
        pages.append('index.html')

        if hasattr(stream, 'name'):
            images = self.get_images(stream, os.path.abspath(os.path.dirname(stream.name)))

    # We want pages to be ordered alphabetically.
    pages.sort()

    manifest_items = [(item, None) for item in pages + images]

    from calibre.ebooks.metadata.meta import get_metadata
    log.debug('Reading metadata from input file...')
    mi = get_metadata(stream, 'pml')
    if 'images/cover.png' in images:
        mi.cover = 'images/cover.png'
    opf = OPFCreator(getcwd(), mi)
    log.debug('Generating manifest...')
    opf.create_manifest(manifest_items)
    opf.create_spine(pages)
    opf.set_toc(toc)
    with lopen('metadata.opf', 'wb') as opffile:
        with lopen('toc.ncx', 'wb') as tocfile:
            opf.render(opffile, tocfile, 'toc.ncx')

    return os.path.join(getcwd(), 'metadata.opf')
def postprocess_book(self, oeb, opts, log):
# Tidy the h1-h6 elements of every spine item whose data exposes an xpath
# attribute.  opts and log are accepted but not used in this method.
from calibre.ebooks.oeb.base import XHTML, barename
for item in oeb.spine:
if hasattr(item.data, 'xpath'):
for heading in item.data.iterdescendants(*map(XHTML, 'h1 h2 h3 h4 h5 h6'.split())):
# Headings without child elements need no clean-up.
if not len(heading):
continue
span = heading[0]
# First child is an empty <span> (no text, no children) and the heading has
# no text before it: the span's id can be moved to the heading, its tail
# becomes the heading text and the span is removed.
if not heading.text and not span.text and not len(span) and barename(span.tag) == 'span':
if not heading.get('id') and span.get('id'):
heading.set('id', span.get('id'))
heading.text = span.tail
heading.remove(span)
# Only child carries exactly the centering style below: if it is a childless
# <div> without id and the heading has no style of its own, its text is
# merged into the heading and the heading gets 'text-align: center'.
if len(heading) == 1 and heading[0].get('style') == 'text-align: center; margin: auto;':
div = heading[0]
if barename(div.tag) == 'div' and not len(div) and not div.get('id') and not heading.get('style'):
heading.text = (heading.text or '') + (div.text or '') + (div.tail or '')
heading.remove(div)
heading.set('style', 'text-align: center')

View File

@@ -0,0 +1,77 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import os, io
from calibre.customize.conversion import (OutputFormatPlugin,
OptionRecommendation)
from calibre.ptempfile import TemporaryDirectory
from polyglot.builtins import unicode_type
class PMLOutput(OutputFormatPlugin):
    """Output plugin that writes a book as a PMLZ archive.

    The archive holds ``index.pml`` and an ``index_img`` directory with the
    referenced raster images saved as PNG.
    """

    name = 'PML Output'
    author = 'John Schember'
    file_type = 'pmlz'
    commit_name = 'pml_output'

    options = {
        OptionRecommendation(name='pml_output_encoding', recommended_value='cp1252',
            level=OptionRecommendation.LOW,
            help=_('Specify the character encoding of the output document. '
                   'The default is cp1252.')),
        OptionRecommendation(name='inline_toc',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Add Table of Contents to beginning of the book.')),
        OptionRecommendation(name='full_image_depth',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Do not reduce the size or bit depth of images. Images '
                   'have their size and depth reduced by default to accommodate '
                   'applications that can not convert images on their '
                   'own such as Dropbook.')),
    }

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        """Write ``oeb_book`` as PML plus images and zip them to ``output_path``."""
        from calibre.ebooks.pml.pmlml import PMLMLizer
        from calibre.utils.zipfile import ZipFile

        with TemporaryDirectory('_pmlz_output') as tdir:
            pmlmlizer = PMLMLizer(log)
            pml = unicode_type(pmlmlizer.extract_content(oeb_book, opts))
            with lopen(os.path.join(tdir, 'index.pml'), 'wb') as out:
                out.write(pml.encode(opts.pml_output_encoding, 'replace'))

            img_path = os.path.join(tdir, 'index_img')
            if not os.path.exists(img_path):
                os.makedirs(img_path)
            self.write_images(oeb_book.manifest, pmlmlizer.image_hrefs, img_path, opts)

            log.debug('Compressing output...')
            pmlz = ZipFile(output_path, 'w')
            try:
                pmlz.add_dir(tdir)
            finally:
                # Close explicitly so the archive is finalised before the
                # temporary directory is removed, rather than whenever the
                # object happens to be collected.
                pmlz.close()

    def write_images(self, manifest, image_hrefs, out_dir, opts):
        """Save every referenced raster image of ``manifest`` as PNG in ``out_dir``.

        ``image_hrefs`` maps manifest hrefs to output file names.  Unless
        ``opts.full_image_depth`` is set, images are converted to palette
        mode and shrunk to fit within 300x300 pixels.
        """
        from PIL import Image
        from calibre.ebooks.oeb.base import OEB_RASTER_IMAGES

        # Image.ANTIALIAS was an alias of Image.LANCZOS and has been removed
        # from current Pillow; fall back to the old name only if needed.
        resample = getattr(Image, 'LANCZOS', None)
        if resample is None:
            resample = Image.ANTIALIAS

        for item in manifest:
            if item.media_type not in OEB_RASTER_IMAGES or item.href not in image_hrefs:
                continue
            if opts.full_image_depth:
                im = Image.open(io.BytesIO(item.data))
            else:
                im = Image.open(io.BytesIO(item.data)).convert('P')
                im.thumbnail((300, 300), resample)

            data = io.BytesIO()
            im.save(data, 'PNG')
            data = data.getvalue()

            path = os.path.join(out_dir, image_hrefs[item.href])
            with lopen(path, 'wb') as out:
                out.write(data)

View File

@@ -0,0 +1,28 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
from calibre.customize.conversion import InputFormatPlugin
from polyglot.builtins import getcwd
class RBInput(InputFormatPlugin):
    """Input plugin for ``.rb`` files; the work is done by ``calibre.ebooks.rb.reader.Reader``."""

    name = 'RB Input'
    author = 'John Schember'
    description = 'Convert RB files to HTML'
    file_types = {'rb'}
    commit_name = 'rb_input'

    def convert(self, stream, options, file_ext, log,
                accelerators):
        """Extract the book into the working directory and return what the reader returns."""
        from calibre.ebooks.rb.reader import Reader

        rb_reader = Reader(stream, log, options.input_encoding)
        return rb_reader.extract_content(getcwd())

View File

@@ -0,0 +1,45 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import os
from calibre.customize.conversion import OutputFormatPlugin, OptionRecommendation
class RBOutput(OutputFormatPlugin):
    """Output plugin that writes a book in RB format via ``RBWriter``."""

    name = 'RB Output'
    author = 'John Schember'
    file_type = 'rb'
    commit_name = 'rb_output'

    options = {
        OptionRecommendation(name='inline_toc',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Add Table of Contents to beginning of the book.'))}

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        """Write ``oeb_book`` to ``output_path``.

        ``output_path`` may be a path or a writable stream.  For a path,
        missing parent directories are created and the file opened here is
        closed again, also when writing fails.  A stream passed in by the
        caller is rewound and truncated but left open.
        """
        from calibre.ebooks.rb.writer import RBWriter

        close = not hasattr(output_path, 'write')
        if close:
            out_dir = os.path.dirname(output_path)
            if out_dir and not os.path.exists(out_dir):
                os.makedirs(out_dir)
            out_stream = lopen(output_path, 'wb')
        else:
            out_stream = output_path

        try:
            writer = RBWriter(opts, log)

            out_stream.seek(0)
            out_stream.truncate()

            writer.write_content(oeb_book, out_stream, oeb_book.metadata)
        finally:
            # Do not leak the file handle when the writer raises.
            if close:
                out_stream.close()

View File

@@ -0,0 +1,169 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import os
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
from calibre.constants import numeric_version
from calibre import walk
from polyglot.builtins import unicode_type
class RecipeDisabled(Exception):
    """Raised when a recipe has a ``recipe_disabled`` attribute that is not None."""
class RecipeInput(InputFormatPlugin):
name = 'Recipe Input'
author = 'Kovid Goyal'
description = _('Download periodical content from the internet')
file_types = {'recipe', 'downloaded_recipe'}
commit_name = 'recipe_input'
recommendations = {
('chapter', None, OptionRecommendation.HIGH),
('dont_split_on_page_breaks', True, OptionRecommendation.HIGH),
('use_auto_toc', False, OptionRecommendation.HIGH),
('input_encoding', None, OptionRecommendation.HIGH),
('input_profile', 'default', OptionRecommendation.HIGH),
('page_breaks_before', None, OptionRecommendation.HIGH),
('insert_metadata', False, OptionRecommendation.HIGH),
}
options = {
OptionRecommendation(name='test', recommended_value=False,
help=_(
'Useful for recipe development. Forces'
' max_articles_per_feed to 2 and downloads at most 2 feeds.'
' You can change the number of feeds and articles by supplying optional arguments.'
' For example: --test 3 1 will download at most 3 feeds and only 1 article per feed.')),
OptionRecommendation(name='username', recommended_value=None,
help=_('Username for sites that require a login to access '
'content.')),
OptionRecommendation(name='password', recommended_value=None,
help=_('Password for sites that require a login to access '
'content.')),
OptionRecommendation(name='dont_download_recipe',
recommended_value=False,
help=_('Do not download latest version of builtin recipes from the calibre server')),
OptionRecommendation(name='lrf', recommended_value=False,
help='Optimize fetching for subsequent conversion to LRF.'),
}
def convert(self, recipe_or_file, opts, file_ext, log,
            accelerators):
    """Run a recipe and return the path of the OPF file it produced.

    The recipe source is taken from, in order of precedence:

    * a ``downloaded_recipe`` archive (it is unpacked and its
      ``download.recipe`` compiled; nothing is downloaded again);
    * the recipe named by the ``CALIBRE_RECIPE_URN`` environment variable;
    * ``recipe_or_file`` when it is a readable file;
    * a builtin recipe looked up by title, falling back to the bundled
      copy when the downloaded one cannot be compiled or requires a newer
      calibre version.

    Raises ``ValueError`` when no recipe can be found and
    ``RecipeDisabled`` when the recipe is marked as disabled.  Returns
    ``None`` if no ``.opf`` file is found after the download.
    """
    from calibre.web.feeds.recipes import compile_recipe
    opts.output_profile.flow_size = 0
    if file_ext == 'downloaded_recipe':
        from calibre.utils.zipfile import ZipFile
        zf = ZipFile(recipe_or_file, 'r')
        try:
            zf.extractall()
        finally:
            zf.close()
        with lopen('download.recipe', 'rb') as f:
            self.recipe_source = f.read()
        recipe = compile_recipe(self.recipe_source)
        recipe.needs_subscription = False
        self.recipe_object = recipe(opts, log, self.report_progress)
    else:
        if os.environ.get('CALIBRE_RECIPE_URN'):
            from calibre.web.feeds.recipes.collection import get_custom_recipe, get_builtin_recipe_by_id
            urn = os.environ['CALIBRE_RECIPE_URN']
            log('Downloading recipe urn: ' + urn)
            rtype, recipe_id = urn.partition(':')[::2]
            if not recipe_id:
                raise ValueError('Invalid recipe urn: ' + urn)
            if rtype == 'custom':
                self.recipe_source = get_custom_recipe(recipe_id)
            else:
                self.recipe_source = get_builtin_recipe_by_id(urn, log=log, download_recipe=True)
            if not self.recipe_source:
                raise ValueError('Could not find recipe with urn: ' + urn)
            if not isinstance(self.recipe_source, bytes):
                self.recipe_source = self.recipe_source.encode('utf-8')
            recipe = compile_recipe(self.recipe_source)
        elif os.access(recipe_or_file, os.R_OK):
            with lopen(recipe_or_file, 'rb') as f:
                self.recipe_source = f.read()
            recipe = compile_recipe(self.recipe_source)
            log('Using custom recipe')
        else:
            from calibre.web.feeds.recipes.collection import (
                get_builtin_recipe_by_title, get_builtin_recipe_titles)
            title = getattr(opts, 'original_recipe_input_arg', recipe_or_file)
            title = os.path.basename(title).rpartition('.')[0]
            titles = frozenset(get_builtin_recipe_titles())
            if title not in titles:
                title = getattr(opts, 'original_recipe_input_arg', recipe_or_file)
                title = title.rpartition('.')[0]

            raw = get_builtin_recipe_by_title(title, log=log,
                    download_recipe=not opts.dont_download_recipe)
            builtin = False
            try:
                recipe = compile_recipe(raw)
                self.recipe_source = raw
                if recipe.requires_version > numeric_version:
                    # The version is compared against numeric_version, so
                    # its parts are not strings; convert them before
                    # joining.  Joining them directly raised TypeError,
                    # which was then reported as a compile failure.
                    log.warn(
                        'Downloaded recipe needs calibre version at least: %s' %
                        ('.'.join(map(str, recipe.requires_version))))
                    builtin = True
            except Exception:
                log.exception('Failed to compile downloaded recipe. Falling '
                              'back to builtin one')
                builtin = True
            if builtin:
                log('Using bundled builtin recipe')
                raw = get_builtin_recipe_by_title(title, log=log,
                        download_recipe=False)
                if raw is None:
                    raise ValueError('Failed to find builtin recipe: '+title)
                recipe = compile_recipe(raw)
                self.recipe_source = raw
            else:
                log('Using downloaded builtin recipe')

        if recipe is None:
            raise ValueError('%r is not a valid recipe file or builtin recipe' %
                    recipe_or_file)

        disabled = getattr(recipe, 'recipe_disabled', None)
        if disabled is not None:
            raise RecipeDisabled(disabled)
        ro = recipe(opts, log, self.report_progress)
        ro.download()
        self.recipe_object = ro

    for key, val in self.recipe_object.conversion_options.items():
        setattr(opts, key, val)

    # Prefer an OPF in the working directory itself, then search below it.
    for f in os.listdir('.'):
        if f.endswith('.opf'):
            return os.path.abspath(f)

    for f in walk('.'):
        if f.endswith('.opf'):
            return os.path.abspath(f)
def postprocess_book(self, oeb, opts, log):
    """Run the recipe's internal and public post-processing hooks, in that order.

    Does nothing when ``self.recipe_object`` is None.
    """
    recipe = self.recipe_object
    if recipe is None:
        return
    recipe.internal_postprocess_book(oeb, opts, log)
    recipe.postprocess_book(oeb, opts, log)
def specialize(self, oeb, opts, log, output_fmt):
    """Remove every ``calibre_navbar`` div from the spine when ``opts.no_inline_navbars`` is set."""
    if not opts.no_inline_navbars:
        return
    from calibre.ebooks.oeb.base import XPath
    for item in oeb.spine:
        navbars = XPath('//h:div[contains(@class, "calibre_navbar")]')(item.data)
        for navbar in navbars:
            navbar.getparent().remove(navbar)
def save_download(self, zf):
    """Store the recipe source in ``zf`` as ``download.recipe``, UTF-8 encoded if it is text."""
    source = self.recipe_source
    payload = source.encode('utf-8') if isinstance(source, unicode_type) else source
    zf.writestr('download.recipe', payload)

View File

@@ -0,0 +1,323 @@
from __future__ import with_statement, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
import os, glob, re, textwrap
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
from polyglot.builtins import iteritems, filter, getcwd, as_bytes
# Maps the border style names read from table cell attributes to CSS
# border-style keywords.  convert_borders() falls back to 'solid' for any
# name that is not listed here.
# NOTE(review): 'tripple' sits next to 'triple', presumably to accept a
# misspelling found in the input -- confirm before removing either key.
border_style_map = {
'single' : 'solid',
'double-thickness-border' : 'double',
'shadowed-border': 'outset',
'double-border': 'double',
'dotted-border': 'dotted',
'dashed': 'dashed',
'hairline': 'solid',
'inset': 'inset',
'dash-small': 'dashed',
'dot-dash': 'dotted',
'dot-dot-dash': 'dotted',
'outset': 'outset',
'tripple': 'double',
'triple': 'double',
'thick-thin-small': 'solid',
'thin-thick-small': 'solid',
'thin-thick-thin-small': 'solid',
'thick-thin-medium': 'solid',
'thin-thick-medium': 'solid',
'thin-thick-thin-medium': 'solid',
'thick-thin-large': 'solid',
'thin-thick-thin-large': 'solid',
'wavy': 'ridge',
'double-wavy': 'ridge',
'striped': 'ridge',
'emboss': 'inset',
'engrave': 'inset',
'frame': 'ridge',
}
class RTFInput(InputFormatPlugin):
name = 'RTF Input'
author = 'Kovid Goyal'
description = 'Convert RTF files to HTML'
file_types = {'rtf'}
commit_name = 'rtf_input'
options = {
OptionRecommendation(name='ignore_wmf', recommended_value=False,
help=_('Ignore WMF images instead of replacing them with a placeholder image.')),
}
def generate_xml(self, stream):
    """Run rtf2xml on ``stream`` and return the resulting XML as bytes.

    The parser writes to ``dataxml.xml`` in the working directory.  When
    the ``debug_pipeline`` option is set, an ``rtfdebug`` directory is
    created and the parser runs at a higher run level with indented
    output; if that set-up fails, a warning is logged and the normal mode
    is used.
    """
    from calibre.ebooks.rtf2xml.ParseRtf import ParseRtf
    ofile = u'dataxml.xml'
    run_lev, debug_dir, indent_out = 1, None, 0
    if getattr(self.opts, 'debug_pipeline', None) is not None:
        try:
            os.mkdir(u'rtfdebug')
            debug_dir = u'rtfdebug'
            run_lev = 4
            indent_out = 1
            self.log('Running RTFParser in debug mode')
        except Exception:
            # Best effort only; do not swallow KeyboardInterrupt/SystemExit.
            self.log.warn('Impossible to run RTFParser in debug mode')
    parser = ParseRtf(
        in_file=stream,
        out_file=ofile,
        # Convert symbol fonts to unicode equivalents. Default
        # is 1
        convert_symbol=1,

        # Convert Zapf fonts to unicode equivalents. Default
        # is 1.
        convert_zapf=1,

        # Convert Wingding fonts to unicode equivalents.
        # Default is 1.
        convert_wingdings=1,

        # Convert RTF caps to real caps.
        # Default is 1.
        convert_caps=1,

        # Indent resulting XML.
        # Default is 0 (no indent).
        indent=indent_out,

        # Form lists from RTF. Default is 1.
        form_lists=1,

        # Convert headings to sections. Default is 0.
        headings_to_sections=1,

        # Group paragraphs with the same style name. Default is 1.
        group_styles=1,

        # Group borders. Default is 1.
        group_borders=1,

        # Write or do not write paragraphs. Default is 0.
        empty_paragraphs=1,

        # Debug
        deb_dir=debug_dir,

        # Default encoding
        default_encoding=getattr(self.opts, 'input_encoding', 'cp1252') or 'cp1252',

        # Run level
        run_level=run_lev,
    )
    parser.parse_rtf()
    with open(ofile, 'rb') as f:
        return f.read()
def extract_images(self, picts):
    """Write every ``\\pict`` group of the file ``picts`` to its own image file.

    The hex payload of each group is decoded (a trailing odd digit is
    dropped) and saved as ``NNNN.<fmt>``, numbered from 1; ``wmf`` is used
    when the image type is not recognised.  Returns the number-to-name map
    after passing it through ``convert_images``.
    """
    from calibre.utils.imghdr import what
    from binascii import unhexlify
    self.log('Extracting images...')

    with open(picts, 'rb') as f:
        raw = f.read()
    non_hex = re.compile(br'[^a-fA-F0-9]')
    groups = filter(len, re.findall(br'\{\\pict([^}]+)\}', raw))
    hex_payloads = [non_hex.sub(b'', group) for group in groups]

    image_map = {}
    for number, payload in enumerate(hex_payloads, 1):
        if len(payload) % 2 == 1:
            payload = payload[:-1]
        data = unhexlify(payload)
        fmt = what(None, data)
        if fmt is None:
            fmt = 'wmf'
        name = u'%04d.%s' % (number, fmt)
        with open(name, 'wb') as f:
            f.write(data)
        image_map[number] = name
    return self.convert_images(image_map)
def convert_images(self, imap):
    """Replace each file name in ``imap`` with the result of ``convert_image``.

    ``imap`` is updated in place and returned.  An entry whose conversion
    raises keeps its original name and the failure is logged.
    ``self.default_img`` is reset to None first.
    """
    self.default_img = None
    # Iterate over a snapshot because the dict is updated inside the loop.
    for count, val in list(imap.items()):
        try:
            imap[count] = self.convert_image(val)
        except Exception:
            # Best effort per image, but let KeyboardInterrupt and
            # SystemExit through.
            self.log.exception('Failed to convert', val)
    return imap
def convert_image(self, name):
    """Return the file name to use for image ``name``.

    Anything not ending in ``.wmf`` is returned unchanged.  WMF files are
    rasterized; if that raises, the failure is logged and the result of
    ``replace_wmf`` is returned instead.
    """
    if name.endswith('.wmf'):
        try:
            return self.rasterize_wmf(name)
        except Exception:
            self.log.exception('Failed to convert WMF image %r'%name)
        return self.replace_wmf(name)
    return name
def replace_wmf(self, name):
    """Handle a WMF file that could not be converted.

    With the ``ignore_wmf`` option the file is deleted and the marker
    ``'__REMOVE_ME__'`` is returned.  Otherwise a message image, created
    once and kept in ``self.default_img``, is written under the same name
    with ``.wmf`` replaced by ``.jpg`` and that new name is returned.
    """
    if self.opts.ignore_wmf:
        os.remove(name)
        return '__REMOVE_ME__'

    from calibre.ebooks.covers import message_image
    if self.default_img is None:
        self.default_img = message_image('Conversion of WMF images is not supported.'
        ' Use Microsoft Word or OpenOffice to save this RTF file'
        ' as HTML and convert that in calibre.')
    placeholder_name = name.replace('.wmf', '.jpg')
    with lopen(placeholder_name, 'wb') as out:
        out.write(self.default_img)
    return placeholder_name
def rasterize_wmf(self, name):
    """Pass the WMF file ``name`` through ``wmf_unwrap`` and save the result.

    The output is written under the same name with ``.wmf`` replaced by
    ``.png``; that new name is returned.
    """
    from calibre.utils.wmf.parse import wmf_unwrap
    with open(name, 'rb') as src:
        unwrapped = wmf_unwrap(src.read())
    png_name = name.replace('.wmf', '.png')
    with open(png_name, 'wb') as dest:
        dest.write(unwrapped)
    return png_name
def write_inline_css(self, ic, border_styles):
# Append the inline-formatting CSS to styles.css in the working directory.
# ic supplies font_sizes and colors; border_styles maps a class name to the
# CSS declarations written for it.
# One 'span.fsN' rule per entry of ic.font_sizes, numbered by position.
font_size_classes = ['span.fs%d { font-size: %spt }'%(i, x) for i, x in
enumerate(ic.font_sizes)]
# One 'span.colN' rule per entry of ic.colors, skipping entries equal to 'false'.
color_classes = ['span.col%d { color: %s }'%(i, x) for i, x in
enumerate(ic.colors) if x != 'false']
css = textwrap.dedent('''
span.none {
text-decoration: none; font-weight: normal;
font-style: normal; font-variant: normal
}
span.italics { font-style: italic }
span.bold { font-weight: bold }
span.small-caps { font-variant: small-caps }
span.underlined { text-decoration: underline }
span.strike-through { text-decoration: line-through }
''')
css += '\n'+'\n'.join(font_size_classes)
css += '\n' +'\n'.join(color_classes)
for cls, val in iteritems(border_styles):
css += '\n\n.%s {\n%s\n}'%(cls, val)
# Opened in append mode, so existing content of styles.css is kept.
with open(u'styles.css', 'ab') as f:
f.write(css.encode('utf-8'))
def convert_borders(self, doc):
    """Give every ``cell`` element of ``doc`` a CSS class describing its borders.

    For each cell a style text is built from its ``border-cell-<side>-*``
    attributes on top of hidden/1px/black defaults.  Cells with identical
    style text share one class, named ``border_style<N>`` with N counting
    distinct styles in order of first appearance.

    Returns a dict mapping each class name to its style text.
    """
    # style text -> index; replaces the former list with its linear
    # ``in`` / ``.index`` lookups per cell.
    style_index = {}
    style_map = {}
    for elem in doc.xpath(r'//*[local-name()="cell"]'):
        style = ['border-style: hidden', 'border-width: 1px',
                 'border-color: black']
        for x in ('bottom', 'top', 'left', 'right'):
            bs = elem.get('border-cell-%s-style'%x, None)
            if bs:
                cbs = border_style_map.get(bs, 'solid')
                style.append('border-%s-style: %s'%(x, cbs))
            bw = elem.get('border-cell-%s-line-width'%x, None)
            if bw:
                style.append('border-%s-width: %spt'%(x, bw))
            bc = elem.get('border-cell-%s-color'%x, None)
            if bc:
                style.append('border-%s-color: %s'%(x, bc))
        style = ';\n'.join(style)
        idx = style_index.setdefault(style, len(style_index))
        cls = 'border_style%d'%idx
        style_map[cls] = style
        elem.set('class', cls)
    return style_map
def convert(self, stream, options, file_ext, log,
            accelerators):
    """Convert the RTF file behind ``stream`` to XHTML plus an OPF.

    The file is turned into XML by rtf2xml, embedded pictures are
    extracted and referenced by file name, and the XML is transformed to
    ``index.xhtml`` with the ``templates/rtf.xsl`` stylesheet.  Inline and
    border CSS is appended to ``styles.css``.

    Raises ``ValueError`` when rtf2xml reports an invalid code.
    Returns the absolute path of the generated ``metadata.opf``.
    """
    from lxml import etree
    from calibre.ebooks.metadata.meta import get_metadata
    from calibre.ebooks.metadata.opf2 import OPFCreator
    from calibre.ebooks.rtf2xml.ParseRtf import RtfInvalidCodeException
    from calibre.ebooks.rtf.input import InlineClass
    from calibre.utils.xml_parse import safe_xml_fromstring
    self.opts = options
    self.log = log
    self.log('Converting RTF to XML...')
    try:
        xml = self.generate_xml(stream.name)
    except RtfInvalidCodeException as e:
        self.log.exception('Unable to parse RTF')
        raise ValueError(_('This RTF file has a feature calibre does not '
                           'support. Convert it to HTML first and then try it.\n%s')%e)

    # Defined up front: it is consulted below even when no picture
    # directory was produced, which used to raise NameError.
    imap = {}
    d = glob.glob(os.path.join('*_rtf_pict_dir', 'picts.rtf'))
    if d:
        try:
            imap = self.extract_images(d[0])
        except Exception:
            self.log.exception('Failed to extract images...')

    self.log('Parsing XML...')
    doc = safe_xml_fromstring(xml)
    border_styles = self.convert_borders(doc)
    # Replace picture numbers by the names of the extracted image files.
    for pict in doc.xpath('//rtf:pict[@num]',
            namespaces={'rtf':'http://rtf2xml.sourceforge.net/'}):
        num = int(pict.get('num'))
        name = imap.get(num, None)
        if name is not None:
            pict.set('num', name)

    self.log('Converting XML to HTML...')
    inline_class = InlineClass(self.log)
    styledoc = safe_xml_fromstring(P('templates/rtf.xsl', data=True), recover=False)
    extensions = {('calibre', 'inline-class') : inline_class}
    transform = etree.XSLT(styledoc, extensions=extensions)
    result = transform(doc)
    html = u'index.xhtml'
    with open(html, 'wb') as f:
        res = as_bytes(transform.tostring(result))
        # res = res[:100].replace('xmlns:html', 'xmlns') + res[100:]
        # clean multiple \n
        res = re.sub(b'\n+', b'\n', res)
        # Replace newlines inserted by the 'empty_paragraphs' option in rtf2xml with html blank lines
        # res = re.sub('\s*<body>', '<body>', res)
        # res = re.sub('(?<=\n)\n{2}',
        # u'<p>\u00a0</p>\n'.encode('utf-8'), res)
        f.write(res)
    self.write_inline_css(inline_class, border_styles)
    stream.seek(0)
    mi = get_metadata(stream, 'rtf')
    if not mi.title:
        mi.title = _('Unknown')
    if not mi.authors:
        mi.authors = [_('Unknown')]
    opf = OPFCreator(getcwd(), mi)
    opf.create_manifest([(u'index.xhtml', None)])
    opf.create_spine([u'index.xhtml'])
    # Close the OPF file deterministically instead of leaking the handle.
    with open(u'metadata.opf', 'wb') as opf_file:
        opf.render(opf_file)
    return os.path.abspath(u'metadata.opf')
def postprocess_book(self, oeb, opts, log):
    """Remove placeholder images from every spine document.

    ``img`` elements whose ``src`` is ``__REMOVE_ME__`` are dropped; any
    text that followed such an image is re-attached to the preceding
    sibling, or to the parent's text when the image was the first child.
    """
    marker = '//*[local-name()="img" and @src="__REMOVE_ME__"]'
    for item in oeb.spine:
        for img in item.data.xpath(marker):
            parent = img.getparent()
            position = parent.index(img)
            parent.remove(img)
            trailing = img.tail
            if not trailing:
                continue
            if position == 0:
                parent.text = (parent.text or '') + trailing
            else:
                previous = parent[position - 1]
                previous.tail = (previous.tail or '') + trailing

View File

@@ -0,0 +1,40 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import os
from calibre.customize.conversion import OutputFormatPlugin
class RTFOutput(OutputFormatPlugin):
    """Output plugin that writes an OEB book as an RTF document."""

    name = 'RTF Output'
    author = 'John Schember'
    file_type = 'rtf'
    commit_name = 'rtf_output'

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        """Render ``oeb_book`` as RTF and write it to ``output_path``.

        :param oeb_book: the book to serialise.
        :param output_path: a filesystem path, or an object with a ``write``
            method.  A path is opened (creating its parent directory if
            needed) and closed here; a stream is left open for the caller.
        :param input_plugin: not used by this method.
        :param opts: conversion options passed to the RTF generator.
        :param log: log object passed to the RTF generator.
        """
        from calibre.ebooks.rtf.rtfml import RTFMLizer

        rtfmlitzer = RTFMLizer(log)
        content = rtfmlitzer.extract_content(oeb_book, opts)

        close = not hasattr(output_path, 'write')
        if close:
            out_dir = os.path.dirname(output_path)
            if out_dir != '' and not os.path.exists(out_dir):
                os.makedirs(out_dir)
            out_stream = lopen(output_path, 'wb')
        else:
            out_stream = output_path

        try:
            out_stream.seek(0)
            out_stream.truncate()
            # Characters outside ASCII are replaced with '?' by 'replace'.
            out_stream.write(content.encode('ascii', 'replace'))
        finally:
            # Only close streams opened here, even when writing fails.
            if close:
                out_stream.close()

View File

@@ -0,0 +1,122 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL 3'
__copyright__ = '2010, Li Fanxi <lifanxi@freemindworld.com>'
__docformat__ = 'restructuredtext en'
import os
from calibre.customize.conversion import InputFormatPlugin
from calibre.ptempfile import TemporaryDirectory
from calibre.utils.filenames import ascii_filename
from polyglot.builtins import unicode_type
HTML_TEMPLATE = '<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/><title>%s</title></head><body>\n%s\n</body></html>'

# Replacements applied in order; '&' must come first so the entities
# produced by the later replacements are not escaped a second time.
_HTML_ENCODE_TABLE = (
    ('&', '&amp;'),
    ('<', '&lt;'),
    ('>', '&gt;'),
    ('"', '&quot;'),
    ("'", '&apos;'),
    ('\n', '<br/>'),
    (' ', '&nbsp;'),
)


def html_encode(s):
    """Escape ``s`` for embedding in the generated chapter HTML.

    Besides the five XML special characters, newlines become ``<br/>`` and
    every space becomes ``&nbsp;``.

    :param s: text to escape; ``None`` (the text of an empty XML element)
        is treated as the empty string instead of raising.
    :return: the escaped text.
    """
    if s is None:
        return ''
    for raw, escaped in _HTML_ENCODE_TABLE:
        s = s.replace(raw, escaped)
    return s
class SNBInput(InputFormatPlugin):
    """Input plugin that unpacks an SNB container into an OEB book."""

    name = 'SNB Input'
    author = 'Li Fanxi'
    description = 'Convert SNB files to OEB'
    file_types = {'snb'}
    commit_name = 'snb_input'

    options = set()

    def convert(self, stream, options, file_ext, log,
                accelerators):
        """Build an OEB book from the SNB file in ``stream``.

        Metadata is read from ``snbf/book.snbf``; each chapter listed in
        ``snbf/toc.snbf`` is rendered to a ``ch_N.htm`` file in a kept
        temporary directory and added to the manifest, spine and TOC.
        Images are extracted into the same directory.

        :param stream: open SNB file.
        :param options: conversion options; ``input_encoding`` is read.
        :param file_ext: not used by this method.
        :param log: log object.
        :param accelerators: not used by this method.
        :return: the populated OEB book.
        :raises ValueError: if the file cannot be parsed as SNB.
        """
        import uuid

        from calibre.ebooks.conversion.plumber import create_oebbook
        from calibre.ebooks.oeb.base import DirContainer
        from calibre.ebooks.snb.snbfile import SNBFile
        from calibre.utils.xml_parse import safe_xml_fromstring

        log.debug("Parsing SNB file...")
        snbFile = SNBFile()
        try:
            snbFile.Parse(stream)
        except Exception:
            # Record the real cause; the caller only sees the ValueError.
            log.exception("Failed to parse SNB file")
            raise ValueError("Invalid SNB file")
        if not snbFile.IsValid():
            log.debug("Invalid SNB file")
            raise ValueError("Invalid SNB file")

        log.debug("Handle meta data ...")
        oeb = create_oebbook(log, None, options,
                encoding=options.input_encoding, populate=False)
        meta = snbFile.GetFileStream('snbf/book.snbf')
        if meta is not None:
            meta = safe_xml_fromstring(meta)
            field_paths = {
                'title': './/head/name',
                'creator': './/head/author',
                'language': './/head/language',
                'generator': './/head/generator',
                'publisher': './/head/publisher',
                'cover': './/head/cover',
            }
            d = {}
            for field, path in field_paths.items():
                node = meta.find(path)
                # Missing elements and empty elements both map to ''.
                if node is not None and node.text is not None:
                    d[field] = node.text
                else:
                    d[field] = ''

            oeb.metadata.add('title', d['title'])
            oeb.metadata.add('creator', d['creator'], attrib={'role': 'aut'})
            oeb.metadata.add('language', d['language'].lower().replace('_', '-'))
            oeb.metadata.add('generator', d['generator'])
            oeb.metadata.add('publisher', d['publisher'])
            if d['cover'] != '':
                oeb.guide.add('cover', 'Cover', d['cover'])

        bookid = unicode_type(uuid.uuid4())
        oeb.metadata.add('identifier', bookid, id='uuid_id', scheme='uuid')
        for ident in oeb.metadata.identifier:
            if 'id' in ident.attrib:
                # Use the identifier that actually carries the id attribute.
                oeb.uid = ident
                break

        # keep=True: the OEB container keeps reading from this directory
        # after convert() returns.
        with TemporaryDirectory('_snb2oeb', keep=True) as tdir:
            log.debug('Process TOC ...')
            toc = snbFile.GetFileStream('snbf/toc.snbf')
            oeb.container = DirContainer(tdir, log)
            if toc is not None:
                toc = safe_xml_fromstring(toc)
                toc_body = toc.find('.//body')
                chapters = toc_body if toc_body is not None else ()
                i = 1
                for ch in chapters:
                    chapterName = ch.text
                    chapterSrc = ch.get('src')
                    if not chapterSrc:
                        # A chapter without a source file cannot be loaded.
                        continue
                    fname = 'ch_%d.htm' % i
                    data = snbFile.GetFileStream('snbc/' + chapterSrc)
                    if data is None:
                        continue
                    snbc = safe_xml_fromstring(data)
                    chapter_body = snbc.find('.//body')
                    lines = []
                    for line in (chapter_body if chapter_body is not None else ()):
                        # Empty elements have text None; encode them as ''.
                        text = html_encode(line.text or '')
                        if line.tag == 'text':
                            lines.append('<p>%s</p>' % text)
                        elif line.tag == 'img':
                            lines.append('<p><img src="%s" /></p>' % text)
                    with open(os.path.join(tdir, fname), 'wb') as f:
                        f.write((HTML_TEMPLATE % (chapterName, '\n'.join(lines))).encode('utf-8', 'replace'))
                    oeb.toc.add(ch.text, fname)
                    item_id, href = oeb.manifest.generate(id='html',
                        href=ascii_filename(fname))
                    item = oeb.manifest.add(item_id, href, 'text/html')
                    item.html_input_href = fname
                    oeb.spine.add(item, True)
                    i = i + 1
                imageFiles = snbFile.OutputImageFiles(tdir)
                for image_name, media_type in imageFiles:
                    item_id, href = oeb.manifest.generate(id='image',
                        href=ascii_filename(image_name))
                    item = oeb.manifest.add(item_id, href, media_type)
                    item.html_input_href = image_name

        return oeb

View File

@@ -0,0 +1,269 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL 3'
__copyright__ = '2010, Li Fanxi <lifanxi@freemindworld.com>'
__docformat__ = 'restructuredtext en'
import os
from calibre.customize.conversion import OutputFormatPlugin, OptionRecommendation
from calibre.ptempfile import TemporaryDirectory
from calibre.constants import __appname__, __version__
from polyglot.builtins import unicode_type
class SNBOutput(OutputFormatPlugin):
# Output plugin that writes an OEB book as an SNB archive (file type 'snb').
name = 'SNB Output'
author = 'Li Fanxi'
file_type = 'snb'
commit_name = 'snb_output'
# Conversion options contributed by this plugin.  Every one of them is
# registered at OptionRecommendation.LOW.
options = {
# Character encoding of the generated document (default utf-8).
OptionRecommendation(name='snb_output_encoding', recommended_value='utf-8',
level=OptionRecommendation.LOW,
help=_('Specify the character encoding of the output document. '
'The default is utf-8.')),
# Line wrapping limit; 0 disables splitting.
OptionRecommendation(name='snb_max_line_length',
recommended_value=0, level=OptionRecommendation.LOW,
help=_('The maximum number of characters per line. This splits on '
'the first space before the specified value. If no space is found '
'the line will be broken at the space after and will exceed the '
'specified value. Also, there is a minimum of 25 characters. '
'Use 0 to disable line splitting.')),
# Blank line between paragraphs, off by default.
OptionRecommendation(name='snb_insert_empty_line',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Specify whether or not to insert an empty line between '
'two paragraphs.')),
# NOTE(review): the option name says "dont indent" while the help text
# describes inserting the indent - confirm which polarity the consumer uses.
OptionRecommendation(name='snb_dont_indent_first_line',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Specify whether or not to insert two space characters '
'to indent the first line of each paragraph.')),
# Suppress per-chapter titles, off by default.
OptionRecommendation(name='snb_hide_chapter_name',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Specify whether or not to hide the chapter title for each '
'chapter. Useful for image-only output (eg. comics).')),
# Read by HandleImage to choose between the output profile's screen_size
# and comic_screen_size as the image size limit.
OptionRecommendation(name='snb_full_screen',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Resize all the images for full screen view. ')),
}
def convert(self, oeb_book, output_path, input_plugin, opts, log):
# Serialise ``oeb_book`` as an SNB archive written to ``output_path``.
#
# Everything is staged in a temporary directory: snbf/book.snbf (metadata),
# snbf/toc.snbf (chapter list), one .snbc file per chapter under snbc/, and
# converted images under snbc/images.  The directory is then packed with
# SNBFile.  ``input_plugin`` is not used in this method.
from lxml import etree
from calibre.ebooks.snb.snbfile import SNBFile
from calibre.ebooks.snb.snbml import SNBMLizer, ProcessFileName
# Kept on the instance because HandleImage reads self.opts.
self.opts = opts
from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer, Unavailable
try:
rasterizer = SVGRasterizer()
rasterizer(oeb_book, opts)
except Unavailable:
log.warn('SVG rasterizer unavailable, SVG will not be converted')
# Create temp dir
with TemporaryDirectory('_snb_output') as tdir:
# Create stub directories
snbfDir = os.path.join(tdir, 'snbf')
snbcDir = os.path.join(tdir, 'snbc')
snbiDir = os.path.join(tdir, 'snbc/images')
os.mkdir(snbfDir)
os.mkdir(snbcDir)
os.mkdir(snbiDir)
# Process Meta data
# Each field falls back to '' when the book has no such metadata entry.
meta = oeb_book.metadata
if meta.title:
title = unicode_type(meta.title[0])
else:
title = ''
# Only creators whose role is 'aut' are listed as authors.
authors = [unicode_type(x) for x in meta.creator if x.role == 'aut']
if meta.publisher:
publishers = unicode_type(meta.publisher[0])
else:
publishers = ''
if meta.language:
lang = unicode_type(meta.language[0]).upper()
else:
lang = ''
if meta.description:
abstract = unicode_type(meta.description[0])
else:
abstract = ''
# Process Cover
# The guide's 'cover' entry is only used when there is no 'titlepage' entry.
g, m, s = oeb_book.guide, oeb_book.manifest, oeb_book.spine
href = None
if 'titlepage' not in g:
if 'cover' in g:
href = g['cover'].href
# Output book info file
bookInfoTree = etree.Element("book-snbf", version="1.0")
headTree = etree.SubElement(bookInfoTree, "head")
etree.SubElement(headTree, "name").text = title
etree.SubElement(headTree, "author").text = ' '.join(authors)
etree.SubElement(headTree, "language").text = lang
etree.SubElement(headTree, "rights")
etree.SubElement(headTree, "publisher").text = publishers
etree.SubElement(headTree, "generator").text = __appname__ + ' ' + __version__
etree.SubElement(headTree, "created")
etree.SubElement(headTree, "abstract").text = abstract
if href is not None:
etree.SubElement(headTree, "cover").text = ProcessFileName(href)
else:
etree.SubElement(headTree, "cover")
with open(os.path.join(snbfDir, 'book.snbf'), 'wb') as f:
f.write(etree.tostring(bookInfoTree, pretty_print=True, encoding='utf-8'))
# Output TOC
tocInfoTree = etree.Element("toc-snbf")
tocHead = etree.SubElement(tocInfoTree, "head")
tocBody = etree.SubElement(tocInfoTree, "body")
# outputFiles maps a document href (without fragment) to a list of
# (anchor, title) pairs; the anchor '' stands for the start of the document.
outputFiles = {}
if oeb_book.toc.count() == 0:
log.warn('This SNB file has no Table of Contents. '
'Creating a default TOC')
first = next(iter(oeb_book.spine))
oeb_book.toc.add(_('Start page'), first.href)
else:
first = next(iter(oeb_book.spine))
if oeb_book.toc[0].href != first.href:
# The pages before the first item in toc will be stored as
# "Cover Pages".
# oeb_book.toc does not support "insert", so we generate
# the tocInfoTree directly instead of modifying the toc
ch = etree.SubElement(tocBody, "chapter")
ch.set("src", ProcessFileName(first.href) + ".snbc")
ch.text = _('Cover pages')
outputFiles[first.href] = []
outputFiles[first.href].append(("", _("Cover pages")))
# Register every TOC entry.  A <chapter> element is only emitted the first
# time a document href is seen; later entries for the same document are
# just recorded in outputFiles.
for tocitem in oeb_book.toc:
if tocitem.href.find('#') != -1:
item = tocitem.href.split('#')
if len(item) != 2:
log.error('Error in TOC item: %s' % tocitem)
else:
if item[0] in outputFiles:
outputFiles[item[0]].append((item[1], tocitem.title))
else:
outputFiles[item[0]] = []
# The list was created on the previous line, so this test always passes:
# the text before the first anchor is registered as a "(Preface)" chapter.
if "" not in outputFiles[item[0]]:
outputFiles[item[0]].append(("", tocitem.title + _(" (Preface)")))
ch = etree.SubElement(tocBody, "chapter")
ch.set("src", ProcessFileName(item[0]) + ".snbc")
ch.text = tocitem.title + _(" (Preface)")
outputFiles[item[0]].append((item[1], tocitem.title))
else:
if tocitem.href in outputFiles:
outputFiles[tocitem.href].append(("", tocitem.title))
else:
outputFiles[tocitem.href] = []
outputFiles[tocitem.href].append(("", tocitem.title))
ch = etree.SubElement(tocBody, "chapter")
ch.set("src", ProcessFileName(tocitem.href) + ".snbc")
ch.text = tocitem.title
# The chapter count is the number of <chapter> children emitted above.
etree.SubElement(tocHead, "chapters").text = '%d' % len(tocBody)
with open(os.path.join(snbfDir, 'toc.snbf'), 'wb') as f:
f.write(etree.tostring(tocInfoTree, pretty_print=True, encoding='utf-8'))
# Output Files
# State for merging documents that no TOC entry points at:
#   oldTree   - tree of the most recently written chapter
#   lastName  - file name that tree was written under
#   mergeLast - True while documents are being merged into oldTree and the
#               merged tree has not been rewritten to disk yet
oldTree = None
mergeLast = False
lastName = None
for item in s:
from calibre.ebooks.oeb.base import OEB_DOCS, OEB_IMAGES
if m.hrefs[item.href].media_type in OEB_DOCS:
if item.href not in outputFiles:
log.debug('File %s is unused in TOC. Continue in last chapter' % item.href)
mergeLast = True
else:
# A TOC-listed document ends any pending merge: rewrite the merged
# chapter before starting the new one.
if oldTree is not None and mergeLast:
log.debug('Output the modified chapter again: %s' % lastName)
with open(os.path.join(snbcDir, lastName), 'wb') as f:
f.write(etree.tostring(oldTree, pretty_print=True, encoding='utf-8'))
mergeLast = False
log.debug('Converting %s to snbc...' % item.href)
snbwriter = SNBMLizer(log)
snbcTrees = None
if not mergeLast:
snbcTrees = snbwriter.extract_content(oeb_book, item, outputFiles[item.href], opts)
# One output file per returned sub-tree; a non-empty key is appended to
# the file name after an underscore.
for subName in snbcTrees:
postfix = ''
if subName != '':
postfix = '_' + subName
lastName = ProcessFileName(item.href + postfix + ".snbc")
oldTree = snbcTrees[subName]
with open(os.path.join(snbcDir, lastName), 'wb') as f:
f.write(etree.tostring(oldTree, pretty_print=True, encoding='utf-8'))
else:
log.debug('Merge %s with last TOC item...' % item.href)
snbwriter.merge_content(oldTree, oeb_book, item, [('', _("Start"))], opts)
# Output the last one if needed
# NOTE(review): this message is logged even when nothing is rewritten below.
log.debug('Output the last modified chapter again: %s' % lastName)
if oldTree is not None and mergeLast:
with open(os.path.join(snbcDir, lastName), 'wb') as f:
f.write(etree.tostring(oldTree, pretty_print=True, encoding='utf-8'))
mergeLast = False
# OEB_IMAGES was bound by the import inside the spine loop above.
for item in m:
if m.hrefs[item.href].media_type in OEB_IMAGES:
log.debug('Converting image: %s ...' % item.href)
content = m.hrefs[item.href].data
# Convert & Resize image
self.HandleImage(content, os.path.join(snbiDir, ProcessFileName(item.href)))
# Package as SNB File
snbFile = SNBFile()
snbFile.FromDir(tdir)
snbFile.Output(output_path)
def HandleImage(self, imageData, imagePath):
    """Write ``imageData`` to ``imagePath``, shrinking it to fit the screen.

    The size limit comes from the output profile: ``screen_size`` when the
    ``snb_full_screen`` option is set, ``comic_screen_size`` otherwise, and
    540x700 when no options are available.  Images that already fit are
    written without resizing.  The output format is taken from the text
    after the last ``.`` in ``imagePath``.

    :param imageData: raw image data.
    :param imagePath: destination file path.
    """
    from calibre.utils.img import image_from_data, resize_image, image_to_data
    img = image_from_data(imageData)
    x, y = img.width(), img.height()
    if self.opts:
        profile = self.opts.output_profile
        if self.opts.snb_full_screen:
            SCREEN_X, SCREEN_Y = profile.screen_size
        else:
            SCREEN_X, SCREEN_Y = profile.comic_screen_size
    else:
        SCREEN_X, SCREEN_Y = 540, 700
    # Handle big image only
    if x > SCREEN_X or y > SCREEN_Y:
        # One common factor for both axes keeps the aspect ratio.
        scale = max(float(x) / SCREEN_X, float(y) / SCREEN_Y)
        # Floor division by a float yields a float and can reach 0 for very
        # elongated images; pass whole pixel counts of at least 1.
        new_x = max(1, int(x // scale))
        new_y = max(1, int(y // scale))
        # TODO : intelligent image rotation
        img = resize_image(img, new_x, new_y)
    with lopen(imagePath, 'wb') as f:
        f.write(image_to_data(img, fmt=imagePath.rpartition('.')[-1]))
if __name__ == '__main__':
# Manual driver: reads an already-processed OEB directory from a hard-coded
# path and converts it to /tmp/test.snb using the Hanlin V3 output profile.
from calibre.ebooks.oeb.reader import OEBReader
from calibre.ebooks.oeb.base import OEBBook
from calibre.ebooks.conversion.preprocess import HTMLPreProcessor
from calibre.customize.profiles import HanlinV3Output
# Minimal stand-in for the options object; only output_profile is set.
class OptionValues(object):
pass
opts = OptionValues()
opts.output_profile = HanlinV3Output(None)
html_preprocessor = HTMLPreProcessor(None, None, opts)
from calibre.utils.logging import default_log
oeb = OEBBook(default_log, html_preprocessor)
reader = OEBReader
reader()(oeb, '/tmp/bbb/processed/')
# NOTE(review): convert() receives None as its options here, not the
# OptionValues instance built above - confirm this is intended.
SNBOutput(None).convert(oeb, '/tmp/test.snb', None, None, default_log)

View File

@@ -0,0 +1,39 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
from io import BytesIO
from calibre.customize.conversion import InputFormatPlugin
class TCRInput(InputFormatPlugin):
    """Input plugin that decompresses TCR files and delegates to TXT input."""

    name = 'TCR Input'
    author = 'John Schember'
    description = 'Convert TCR files to HTML'
    file_types = {'tcr'}
    commit_name = 'tcr_input'

    def convert(self, stream, options, file_ext, log, accelerators):
        """Decompress ``stream`` and convert the text with the TXT plugin.

        :param stream: open TCR file.
        :param options: conversion options; TXT input options missing from
            it are filled in with their recommended values.
        :param file_ext: not used by this method.
        :param log: log object.
        :param accelerators: passed through to the TXT plugin.
        :return: whatever the TXT input plugin's ``convert`` returns.
        """
        from calibre.customize.ui import plugin_for_input_format
        from calibre.ebooks.compression.tcr import decompress

        log.info('Decompressing text...')
        raw_txt = decompress(stream)

        log.info('Converting text to OEB...')
        stream = BytesIO(raw_txt)

        txt_plugin = plugin_for_input_format('txt')
        for opt in txt_plugin.options:
            # Test the options object being filled in.  The previous check
            # looked at self.options (this plugin's own option set), so it
            # never found the attribute and always overwrote the value.
            if not hasattr(options, opt.option.name):
                setattr(options, opt.option.name, opt.recommended_value)

        stream.seek(0)
        return txt_plugin.convert(stream, options,
                'txt', log, accelerators)

View File

@@ -0,0 +1,56 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import os
from calibre.customize.conversion import OutputFormatPlugin, \
OptionRecommendation
class TCROutput(OutputFormatPlugin):
    """Output plugin that writes an OEB book as TCR-compressed text."""

    name = 'TCR Output'
    author = 'John Schember'
    file_type = 'tcr'
    commit_name = 'tcr_output'

    options = {
        OptionRecommendation(name='tcr_output_encoding', recommended_value='utf-8',
            level=OptionRecommendation.LOW,
            help=_('Specify the character encoding of the output document. '
            'The default is utf-8.'))}

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        """Render ``oeb_book`` as text, compress it and write it out.

        :param oeb_book: the book to serialise.
        :param output_path: a filesystem path, or an object with a ``write``
            method.  A path is opened (creating its parent directory if
            needed) and closed here; a stream is left open for the caller.
        :param input_plugin: not used by this method.
        :param opts: conversion options; ``tcr_output_encoding`` is read and
            four text-layout attributes are overwritten.
        :param log: log object.
        """
        from calibre.ebooks.txt.txtml import TXTMLizer
        from calibre.ebooks.compression.tcr import compress

        # Fixed text-layout settings for the text generator.
        opts.flush_paras = False
        opts.max_line_length = 0
        opts.force_max_line_length = False
        opts.indent_paras = False

        # Produce the payload before touching the destination, so a failed
        # conversion does not leave a truncated or unclosed output file.
        writer = TXTMLizer(log)
        txt = writer.extract_content(oeb_book, opts).encode(opts.tcr_output_encoding, 'replace')

        log.info('Compressing text...')
        txt = compress(txt)

        close = not hasattr(output_path, 'write')
        if close:
            out_dir = os.path.dirname(output_path)
            if out_dir and not os.path.exists(out_dir):
                os.makedirs(out_dir)
            out_stream = lopen(output_path, 'wb')
        else:
            out_stream = output_path

        try:
            out_stream.seek(0)
            out_stream.truncate()
            out_stream.write(txt)
        finally:
            # Only close streams opened here, even when writing fails.
            if close:
                out_stream.close()

View File

@@ -0,0 +1,308 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import os
from calibre import _ent_pat, walk, xml_entity_to_unicode
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
from polyglot.builtins import getcwd
# Markdown extension names mapped to a translated one-line description.
# Used as the 'md_extensions' entry of TXTInput.ui_data and to build the help
# text of the 'markdown_extensions' option.
MD_EXTENSIONS = {
'abbr': _('Abbreviations'),
'admonition': _('Support admonitions'),
'attr_list': _('Add attribute to HTML tags'),
'codehilite': _('Add code highlighting via Pygments'),
'def_list': _('Definition lists'),
'extra': _('Enables various common extensions'),
'fenced_code': _('Alternative code block syntax'),
'footnotes': _('Footnotes'),
'legacy_attrs': _('Use legacy element attributes'),
'legacy_em': _('Use legacy underscore handling for connected words'),
'meta': _('Metadata in the document'),
'nl2br': _('Treat newlines as hard breaks'),
'sane_lists': _('Do not allow mixing list types'),
'smarty': _('Use markdown\'s internal smartypants parser'),
'tables': _('Support tables'),
'toc': _('Generate a table of contents'),
'wikilinks': _('Wiki style links'),
}
class TXTInput(InputFormatPlugin):
# Input plugin for plain-text style sources.  convert() turns the text into
# HTML and hands that HTML to the 'html' input plugin.
name = 'TXT Input'
author = 'John Schember'
description = 'Convert TXT files to HTML'
file_types = {'txt', 'txtz', 'text', 'md', 'textile', 'markdown'}
commit_name = 'txt_input'
# Translated descriptions keyed by option value.  The keys of
# 'formatting_types' and 'paragraph_types' are also used as the `choices`
# of the matching options below.
ui_data = {
'md_extensions': MD_EXTENSIONS,
'paragraph_types': {
'auto': _('Try to auto detect paragraph type'),
'block': _('Treat a blank line as a paragraph break'),
'single': _('Assume every line is a paragraph'),
'print': _('Assume every line starting with 2+ spaces or a tab starts a paragraph'),
'unformatted': _('Most lines have hard line breaks, few/no blank lines or indents'),
'off': _('Don\'t modify the paragraph structure'),
},
'formatting_types': {
'auto': _('Automatically decide which formatting processor to use'),
'plain': _('No formatting'),
'heuristic': _('Use heuristics to determine chapter headings, italics, etc.'),
'textile': _('Use the TexTile markup language'),
'markdown': _('Use the Markdown markup language')
},
}
# Conversion options contributed by this plugin.
options = {
# Which text processor convert() runs; 'auto' triggers detection.
OptionRecommendation(name='formatting_type', recommended_value='auto',
choices=list(ui_data['formatting_types']),
help=_('Formatting used within the document.\n'
'* auto: {auto}\n'
'* plain: {plain}\n'
'* heuristic: {heuristic}\n'
'* textile: {textile}\n'
'* markdown: {markdown}\n'
'To learn more about markdown see {url}').format(
url='https://daringfireball.net/projects/markdown/', **ui_data['formatting_types'])
),
# How paragraphs are recognised before conversion; 'auto' triggers detection.
OptionRecommendation(name='paragraph_type', recommended_value='auto',
choices=list(ui_data['paragraph_types']),
help=_('Paragraph structure to assume. The value of "off" is useful for formatted documents such as Markdown or Textile. '
'Choices are:\n'
'* auto: {auto}\n'
'* block: {block}\n'
'* single: {single}\n'
'* print: {print}\n'
'* unformatted: {unformatted}\n'
'* off: {off}').format(**ui_data['paragraph_types'])
),
OptionRecommendation(name='preserve_spaces', recommended_value=False,
help=_('Normally extra spaces are condensed into a single space. '
'With this option all spaces will be displayed.')),
OptionRecommendation(name='txt_in_remove_indents', recommended_value=False,
help=_('Normally extra space at the beginning of lines is retained. '
'With this option they will be removed.')),
# Comma separated extension names; convert() splits on ',' and strips each.
# The help text ends with one bullet per MD_EXTENSIONS entry, sorted by name.
# NOTE(review): '%default' is presumably expanded by the option parser - confirm.
OptionRecommendation(name="markdown_extensions", recommended_value='footnotes, tables, toc',
help=_('Enable extensions to markdown syntax. Extensions are formatting that is not part '
'of the standard markdown format. The extensions enabled by default: %default.\n'
'To learn more about markdown extensions, see {}\n'
'This should be a comma separated list of extensions to enable:\n'
).format('https://python-markdown.github.io/extensions/') + '\n'.join('* %s: %s' % (k, MD_EXTENSIONS[k]) for k in sorted(MD_EXTENSIONS))),
}
def shift_file(self, fname, data):
    """Write ``data`` into ``self.output_dir`` without overwriting a file.

    ``fname`` is used when it is free; otherwise ``-1``, ``-2``, ... is
    inserted before the extension until an unused name is found.

    :param fname: desired file name.
    :param data: bytes to write.
    :return: path of the file that was written.
    """
    stem, suffix = os.path.splitext(fname)
    target = os.path.join(self.output_dir, fname)
    attempt = 0
    while os.path.exists(target):
        attempt += 1
        numbered = '{}-{}{}'.format(stem, attempt, suffix)
        target = os.path.join(self.output_dir, numbered)
    with open(target, 'wb') as out:
        out.write(data)
    return target
def fix_resources(self, html, base_dir):
    """Copy locally referenced images next to the output and relink them.

    Every ``img`` whose ``src`` is a relative path (no ``file``, ``http``,
    ``https`` or ``ftp`` prefix, not absolute) is resolved against
    ``base_dir``.  If it names a readable regular file, the file is copied
    into the output directory via ``shift_file`` and ``src`` is rewritten
    to the copied file's base name.

    :param html: HTML markup to scan.
    :param base_dir: directory that relative image paths are resolved from.
    :return: the re-serialised markup if any image was relinked, otherwise
        ``html`` unchanged.
    """
    from html5_parser import parse
    root = parse(html)
    changed = False
    for img in root.xpath('//img[@src]'):
        src = img.get('src')
        prefix = src.split(':', 1)[0].lower()
        if prefix in ('file', 'http', 'https', 'ftp') or os.path.isabs(src):
            continue
        src = os.path.join(base_dir, src)
        # os.access() alone also accepts directories (e.g. src=""), which
        # would make open() fail; only regular files are copied.
        if not (os.path.isfile(src) and os.access(src, os.R_OK)):
            continue
        with open(src, 'rb') as f:
            data = f.read()
        copied = self.shift_file(os.path.basename(src), data)
        img.set('src', os.path.basename(copied))
        changed = True
    if changed:
        from lxml import etree
        html = etree.tostring(root, encoding='unicode')
    return html
def convert(self, stream, options, file_ext, log,
            accelerators):
    """Convert a text-like document into an OEB book via the HTML plugin.

    The text is read (from the archive members for ``txtz``), decoded,
    normalised and reflowed according to ``options.paragraph_type``.  It is
    then rendered to HTML by the markdown, textile or basic processor and
    passed to the ``html`` input plugin.

    :param stream: open input file.
    :param options: conversion options; several attributes are read and
        updated in place (``formatting_type``, ``paragraph_type``,
        ``input_encoding``, the HTML plugin's recommended values, ...).
    :param file_ext: input extension; ``txtz``, ``md``, ``markdown`` and
        ``textile`` select special handling.
    :param log: log object, stored on ``self.log``.
    :param accelerators: not used by this method.
    :return: the OEB book produced by the HTML input plugin.
    :raises ValueError: if the markdown processor raises ``RuntimeError``.
    """
    import codecs

    from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
    from calibre.ebooks.chardet import detect
    from calibre.utils.zipfile import ZipFile
    from calibre.ebooks.txt.processor import (convert_basic,
            convert_markdown_with_metadata, separate_paragraphs_single_line,
            separate_paragraphs_print_formatted, preserve_spaces,
            detect_paragraph_type, detect_formatting_type,
            normalize_line_endings, convert_textile, remove_indents,
            block_to_single_line, separate_hard_scene_breaks)

    self.log = log
    txt = b''
    log.debug('Reading text from file...')
    length = 0
    base_dir = self.output_dir = getcwd()

    # Extract content from zip archive.
    if file_ext == 'txtz':
        zf = ZipFile(stream)
        zf.extractall('.')

        for x in walk('.'):
            if os.path.splitext(x)[1].lower() in ('.txt', '.text'):
                with open(x, 'rb') as tf:
                    txt += tf.read() + b'\n\n'
    else:
        if getattr(stream, 'name', None):
            # Relative image paths are resolved next to the source file.
            base_dir = os.path.dirname(stream.name)
        txt = stream.read()
        if file_ext in {'md', 'textile', 'markdown'}:
            options.formatting_type = {'md': 'markdown'}.get(file_ext, file_ext)
            log.info('File extension indicates particular formatting. '
                    'Forcing formatting type to: %s'%options.formatting_type)
            options.paragraph_type = 'off'

    # Get the encoding of the document.
    if options.input_encoding:
        ienc = options.input_encoding
        log.debug('Using user specified input encoding of %s' % ienc)
    else:
        det_encoding = detect(txt[:4096])
        det_encoding, confidence = det_encoding['encoding'], det_encoding['confidence']
        if det_encoding and det_encoding.lower().replace('_', '-').strip() in (
                'gb2312', 'chinese', 'csiso58gb231280', 'euc-cn', 'euccn',
                'eucgb2312-cn', 'gb2312-1980', 'gb2312-80', 'iso-ir-58'):
            # Microsoft Word exports to HTML with encoding incorrectly set to
            # gb2312 instead of gbk. gbk is a superset of gb2312, anyway.
            det_encoding = 'gbk'
        ienc = det_encoding
        log.debug('Detected input encoding as %s with a confidence of %s%%' % (ienc, confidence * 100))
    if not ienc:
        ienc = 'utf-8'
        log.debug('No input encoding specified and could not auto detect using %s' % ienc)

    # Remove BOM from start of txt as its presence can confuse markdown.
    # The UTF-32 marks must be tested before the UTF-16 ones: the UTF-32-LE
    # BOM starts with the UTF-16-LE BOM, so the shorter mark would otherwise
    # match first and leave two stray NUL bytes behind.
    for bom in (codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE, codecs.BOM_UTF8,
                codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE):
        if txt.startswith(bom):
            txt = txt[len(bom):]
            break
    txt = txt.decode(ienc, 'replace')

    # Replace entities
    txt = _ent_pat.sub(xml_entity_to_unicode, txt)

    # Normalize line endings
    txt = normalize_line_endings(txt)

    # Determine the paragraph type of the document.
    if options.paragraph_type == 'auto':
        options.paragraph_type = detect_paragraph_type(txt)
        if options.paragraph_type == 'unknown':
            log.debug('Could not reliably determine paragraph type using block')
            options.paragraph_type = 'block'
        else:
            log.debug('Auto detected paragraph type as %s' % options.paragraph_type)

    # Detect formatting
    if options.formatting_type == 'auto':
        options.formatting_type = detect_formatting_type(txt)
        log.debug('Auto detected formatting as %s' % options.formatting_type)

    if options.formatting_type == 'heuristic':
        options.enable_heuristics = True
        options.unwrap_lines = False
        options.smarten_punctuation = True

    # Reformat paragraphs to block formatting based on the detected type.
    # We don't check for block because the processor assumes block.
    # single and print are transformed to block for processing.
    if options.paragraph_type == 'single':
        txt = separate_paragraphs_single_line(txt)
    elif options.paragraph_type == 'print':
        txt = separate_hard_scene_breaks(txt)
        txt = separate_paragraphs_print_formatted(txt)
        txt = block_to_single_line(txt)
    elif options.paragraph_type == 'unformatted':
        from calibre.ebooks.conversion.utils import HeuristicProcessor
        # unwrap lines based on punctuation
        docanalysis = DocAnalysis('txt', txt)
        length = docanalysis.line_length(.5)
        preprocessor = HeuristicProcessor(options, log=getattr(self, 'log', None))
        txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
        txt = separate_paragraphs_single_line(txt)
    elif options.paragraph_type == 'block':
        txt = separate_hard_scene_breaks(txt)
        txt = block_to_single_line(txt)

    if getattr(options, 'enable_heuristics', False) and getattr(options, 'dehyphenate', False):
        docanalysis = DocAnalysis('txt', txt)
        # Reuse the line length computed for 'unformatted' text, if any.
        if not length:
            length = docanalysis.line_length(.5)
        dehyphenator = Dehyphenator(options.verbose, log=self.log)
        txt = dehyphenator(txt, 'txt', length)

    # User requested transformation on the text.
    if options.txt_in_remove_indents:
        txt = remove_indents(txt)

    # Preserve spaces will replace multiple spaces to a space
    # followed by the &nbsp; entity.
    if options.preserve_spaces:
        txt = preserve_spaces(txt)

    # Process the text using the appropriate text processor.
    self.shifted_files = []
    input_mi = None
    try:
        html = ''
        if options.formatting_type == 'markdown':
            log.debug('Running text through markdown conversion...')
            try:
                input_mi, html = convert_markdown_with_metadata(
                    txt, extensions=[x.strip() for x in options.markdown_extensions.split(',') if x.strip()])
            except RuntimeError:
                raise ValueError('This txt file has malformed markup, it cannot be'
                    ' converted by calibre. See https://daringfireball.net/projects/markdown/syntax')
            html = self.fix_resources(html, base_dir)
        elif options.formatting_type == 'textile':
            log.debug('Running text through textile conversion...')
            html = convert_textile(txt)
            html = self.fix_resources(html, base_dir)
        else:
            log.debug('Running text through basic conversion...')
            flow_size = getattr(options, 'flow_size', 0)
            html = convert_basic(txt, epub_split_size_kb=flow_size)

        # Run the HTMLized text through the html processing plugin.
        from calibre.customize.ui import plugin_for_input_format
        html_input = plugin_for_input_format('html')
        for opt in html_input.options:
            setattr(options, opt.option.name, opt.recommended_value)
        options.input_encoding = 'utf-8'
        htmlfile = self.shift_file('index.html', html.encode('utf-8'))
        odi = options.debug_pipeline
        options.debug_pipeline = None
        try:
            # Generate oeb from html conversion.  The handle is closed and
            # debug_pipeline is restored even if the HTML plugin raises.
            with open(htmlfile, 'rb') as html_stream:
                oeb = html_input.convert(html_stream, options, 'html', log, {})
        finally:
            options.debug_pipeline = odi
    finally:
        # Nothing in this method adds to self.shifted_files; the loop only
        # removes entries that other code may have registered.
        for x in self.shifted_files:
            os.remove(x)

    # Set metadata from file.
    if input_mi is None:
        from calibre.customize.ui import get_file_type_metadata
        input_mi = get_file_type_metadata(stream, file_ext)
    from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
    meta_info_to_oeb_metadata(input_mi, oeb.metadata, log)
    # Read back by postprocess_book to replace placeholder titles.
    self.html_postprocess_title = input_mi.title

    return oeb
def postprocess_book(self, oeb, opts, log):
    """Replace placeholder ``title`` text with the title stored by convert().

    Spine items whose data has no ``xpath`` method are skipped.  Any
    ``title`` element whose text equals the translated ``'Unknown'`` gets
    ``self.html_postprocess_title`` instead.
    """
    for item in oeb.spine:
        doc = item.data
        if not hasattr(doc, 'xpath'):
            continue
        for node in doc.xpath('//*[local-name()="title"]'):
            if node.text != _('Unknown'):
                continue
            node.text = self.html_postprocess_title

View File

@@ -0,0 +1,165 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import os
import shutil
from calibre.customize.conversion import OutputFormatPlugin, \
OptionRecommendation
from calibre.ptempfile import TemporaryDirectory, TemporaryFile
# Accepted values of the TXT output 'newline' option ('system' is its default).
NEWLINE_TYPES = ['system', 'unix', 'old_mac', 'windows']
class TXTOutput(OutputFormatPlugin):
    """Output plugin that writes an OEB book as a single text file.

    The ``txt_output_formatting`` option selects the writer: Markdown,
    Textile, or (for any other value) the plain text writer.
    """

    name = 'TXT Output'
    author = 'John Schember'
    file_type = 'txt'
    commit_name = 'txt_output'

    # Choice lists; ``formatting_types`` also feeds the
    # ``txt_output_formatting`` option defined below.
    ui_data = {
        'newline_types': NEWLINE_TYPES,
        'formatting_types': {
            'plain': _('Plain text'),
            'markdown': _('Markdown formatted text'),
            'textile': _('TexTile formatted text')
        },
    }

    options = {
        OptionRecommendation(name='newline', recommended_value='system',
            level=OptionRecommendation.LOW,
            short_switch='n', choices=NEWLINE_TYPES,
            help=_('Type of newline to use. Options are %s. Default is \'system\'. '
                'Use \'old_mac\' for compatibility with Mac OS 9 and earlier. '
                'For macOS use \'unix\'. \'system\' will default to the newline '
                'type used by this OS.') % sorted(NEWLINE_TYPES)),
        OptionRecommendation(name='txt_output_encoding', recommended_value='utf-8',
            level=OptionRecommendation.LOW,
            help=_('Specify the character encoding of the output document. '
                'The default is utf-8.')),
        OptionRecommendation(name='inline_toc',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Add Table of Contents to beginning of the book.')),
        OptionRecommendation(name='max_line_length',
            recommended_value=0, level=OptionRecommendation.LOW,
            help=_('The maximum number of characters per line. This splits on '
                'the first space before the specified value. If no space is found '
                'the line will be broken at the space after and will exceed the '
                'specified value. Also, there is a minimum of 25 characters. '
                'Use 0 to disable line splitting.')),
        OptionRecommendation(name='force_max_line_length',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Force splitting on the max-line-length value when no space '
                'is present. Also allows max-line-length to be below the minimum')),
        OptionRecommendation(name='txt_output_formatting',
            recommended_value='plain',
            choices=list(ui_data['formatting_types']),
            help=_('Formatting used within the document.\n'
                '* plain: {plain}\n'
                '* markdown: {markdown}\n'
                '* textile: {textile}').format(**ui_data['formatting_types'])),
        OptionRecommendation(name='keep_links',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Do not remove links within the document. This is only '
                'useful when paired with a txt-output-formatting option that '
                'is not none because links are always removed with plain text output.')),
        OptionRecommendation(name='keep_image_references',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Do not remove image references within the document. This is only '
                'useful when paired with a txt-output-formatting option that '
                'is not none because links are always removed with plain text output.')),
        OptionRecommendation(name='keep_color',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Do not remove font color from output. This is only useful when '
                'txt-output-formatting is set to textile. Textile is the only '
                'formatting that supports setting font color. If this option is '
                'not specified font color will not be set and default to the '
                'color displayed by the reader (generally this is black).')),
    }

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        """Render *oeb_book* as text and write it to *output_path*.

        :param oeb_book: Book handed to the selected writer's
            ``extract_content``.
        :param output_path: Either a filesystem path or an object with a
            ``write`` method.  For a path, missing parent directories are
            created and the file is opened and closed here.  A stream is
            rewound, truncated and written, and is left open for the caller.
        :param input_plugin: Unused by this method.
        :param opts: Conversion options; ``txt_output_formatting``,
            ``newline`` and ``txt_output_encoding`` are read here and the
            whole object is forwarded to the writer.
        :param log: Logger passed to the writer and used for debug output.
        """
        from calibre.ebooks.txt.txtml import TXTMLizer
        from calibre.utils.cleantext import clean_ascii_chars
        from calibre.ebooks.txt.newlines import specified_newlines, TxtNewlines

        # The writer is stored on the instance because TXTZOutput.convert
        # inspects ``self.writer`` after delegating to this method.
        formatting = opts.txt_output_formatting.lower()
        if formatting == 'markdown':
            from calibre.ebooks.txt.markdownml import MarkdownMLizer
            self.writer = MarkdownMLizer(log)
        elif formatting == 'textile':
            from calibre.ebooks.txt.textileml import TextileMLizer
            self.writer = TextileMLizer(log)
        else:
            self.writer = TXTMLizer(log)

        txt = self.writer.extract_content(oeb_book, opts)
        txt = clean_ascii_chars(txt)

        log.debug('\tReplacing newlines with selected type...')
        txt = specified_newlines(TxtNewlines(opts.newline).newline, txt)

        # Encode before touching the destination: an invalid encoding name
        # must not leave behind a freshly truncated (empty) output file.
        data = txt.encode(opts.txt_output_encoding, 'replace')

        if hasattr(output_path, 'write'):
            out_stream = output_path
            close = False
        else:
            parent = os.path.dirname(output_path)
            if parent != '' and not os.path.exists(parent):
                os.makedirs(parent)
            out_stream = open(output_path, 'wb')
            close = True

        try:
            out_stream.seek(0)
            out_stream.truncate()
            out_stream.write(data)
        finally:
            # Only close handles opened here, and do so even when the write
            # fails so the file descriptor is not leaked.
            if close:
                out_stream.close()
class TXTZOutput(TXTOutput):
    """Output plugin that packs the text rendering of a book into an archive.

    The archive receives the text file produced by ``TXTOutput.convert``,
    the book's images and a ``metadata.opf`` file.
    """

    name = 'TXTZ Output'
    author = 'John Schember'
    file_type = 'txtz'

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        """Build the archive for *oeb_book* at *output_path*.

        :param oeb_book: Book to convert; its ``manifest`` and ``metadata``
            are read here.
        :param output_path: Destination passed to ``ZipFile`` in ``'w'`` mode.
        :param input_plugin: Forwarded unchanged to ``TXTOutput.convert``.
        :param opts: Conversion options; ``txt_output_formatting`` selects
            the name of the text file inside the archive.
        :param log: Logger forwarded to ``TXTOutput.convert``.
        """
        from calibre.ebooks.oeb.base import OEB_IMAGES
        from calibre.utils.zipfile import ZipFile
        from lxml import etree

        with TemporaryDirectory('_txtz_output') as tdir:
            # TXT
            txt_name = 'index.txt'
            if opts.txt_output_formatting.lower() == 'textile':
                txt_name = 'index.text'
            with TemporaryFile(txt_name) as tf:
                # tf is given to both convert() and shutil.copy(), so it is
                # presumably a filesystem path -- confirm against ptempfile.
                TXTOutput.convert(self, oeb_book, tf, input_plugin, opts, log)
                shutil.copy(tf, os.path.join(tdir, txt_name))

            # Images
            for item in oeb_book.manifest:
                if item.media_type not in OEB_IMAGES:
                    continue
                if hasattr(self.writer, 'images'):
                    # The writer keeps its own href -> file name mapping;
                    # images it does not list are left out of the archive.
                    if item.href not in self.writer.images:
                        continue
                    path = os.path.join(tdir, 'images')
                    href = self.writer.images[item.href]
                else:
                    path = os.path.join(tdir, os.path.dirname(item.href))
                    href = os.path.basename(item.href)
                if not os.path.exists(path):
                    os.makedirs(path)
                with open(os.path.join(path, href), 'wb') as imgf:
                    imgf.write(item.data)

            # Metadata
            with open(os.path.join(tdir, 'metadata.opf'), 'wb') as mdataf:
                mdataf.write(etree.tostring(oeb_book.metadata.to_opf1()))

            txtz = ZipFile(output_path, 'w')
            try:
                txtz.add_dir(tdir)
            finally:
                # Close explicitly so the archive is finalised here rather
                # than whenever the object happens to be garbage collected.
                txtz.close()