1
0
mirror of https://github.com/gryf/ebook-converter.git synced 2026-03-15 22:23:42 +01:00

Initial import

This commit is contained in:
2020-03-31 17:15:23 +02:00
commit d97ea9b0bc
311 changed files with 131419 additions and 0 deletions

View File

@@ -0,0 +1,30 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from polyglot.builtins import native_string_type
class ConversionUserFeedBack(Exception):
def __init__(self, title, msg, level='info', det_msg=''):
''' Show a simple message to the user
:param title: The title (very short description)
:param msg: The message to show the user
:param level: Must be one of 'info', 'warn' or 'error'
:param det_msg: Optional detailed message to show the user
'''
import json
Exception.__init__(self, json.dumps({'msg':msg, 'level':level,
'det_msg':det_msg, 'title':title}))
self.title, self.msg, self.det_msg = title, msg, det_msg
self.level = level
# Ensure exception uses fully qualified name as this is used to detect it in
# the GUI.
ConversionUserFeedBack.__name__ = native_string_type('calibre.ebooks.conversion.ConversionUserFeedBack')

View File

@@ -0,0 +1,428 @@
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL 3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
'''
Command line interface to conversion sub-system
'''
import sys, os, numbers
from optparse import OptionGroup, Option
from collections import OrderedDict
from calibre.utils.config import OptionParser
from calibre.utils.logging import Log
from calibre.customize.conversion import OptionRecommendation
from calibre import patheq
from calibre.ebooks.conversion import ConversionUserFeedBack
from calibre.utils.localization import localize_user_manual_link
from polyglot.builtins import iteritems
USAGE = '%prog ' + _('''\
input_file output_file [options]
Convert an e-book from one format to another.
input_file is the input and output_file is the output. Both must be \
specified as the first two arguments to the command.
The output e-book format is guessed from the file extension of \
output_file. output_file can also be of the special format .EXT where \
EXT is the output file extension. In this case, the name of the output \
file is derived from the name of the input file. Note that the filenames must \
not start with a hyphen. Finally, if output_file has no extension, then \
it is treated as a directory and an "open e-book" (OEB) consisting of HTML \
files is written to that directory. These files are the files that would \
normally have been passed to the output plugin.
After specifying the input \
and output file you can customize the conversion by specifying various \
options. The available options depend on the input and output file types. \
To get help on them specify the input and output file and then use the -h \
option.
For full documentation of the conversion system see
''') + localize_user_manual_link('https://manual.calibre-ebook.com/conversion.html')
HEURISTIC_OPTIONS = ['markup_chapter_headings',
'italicize_common_cases', 'fix_indents',
'html_unwrap_factor', 'unwrap_lines',
'delete_blank_paragraphs', 'format_scene_breaks',
'dehyphenate', 'renumber_headings',
'replace_scene_breaks']
DEFAULT_TRUE_OPTIONS = HEURISTIC_OPTIONS + ['remove_fake_margins']
def print_help(parser, log):
parser.print_help()
def check_command_line_options(parser, args, log):
if len(args) < 3 or args[1].startswith('-') or args[2].startswith('-'):
print_help(parser, log)
log.error('\n\nYou must specify the input AND output files')
raise SystemExit(1)
input = os.path.abspath(args[1])
if not input.endswith('.recipe') and not os.access(input, os.R_OK) and not \
('-h' in args or '--help' in args):
log.error('Cannot read from', input)
raise SystemExit(1)
if input.endswith('.recipe') and not os.access(input, os.R_OK):
input = args[1]
output = args[2]
if (output.startswith('.') and output[:2] not in {'..', '.'} and '/' not in
output and '\\' not in output):
output = os.path.splitext(os.path.basename(input))[0]+output
output = os.path.abspath(output)
return input, output
def option_recommendation_to_cli_option(add_option, rec):
opt = rec.option
switches = ['-'+opt.short_switch] if opt.short_switch else []
switches.append('--'+opt.long_switch)
attrs = dict(dest=opt.name, help=opt.help,
choices=opt.choices, default=rec.recommended_value)
if isinstance(rec.recommended_value, type(True)):
attrs['action'] = 'store_false' if rec.recommended_value else \
'store_true'
else:
if isinstance(rec.recommended_value, numbers.Integral):
attrs['type'] = 'int'
if isinstance(rec.recommended_value, numbers.Real):
attrs['type'] = 'float'
if opt.long_switch == 'verbose':
attrs['action'] = 'count'
attrs.pop('type', '')
if opt.name == 'read_metadata_from_opf':
switches.append('--from-opf')
if opt.name == 'transform_css_rules':
attrs['help'] = _(
'Path to a file containing rules to transform the CSS styles'
' in this book. The easiest way to create such a file is to'
' use the wizard for creating rules in the calibre GUI. Access'
' it in the "Look & feel->Transform styles" section of the conversion'
' dialog. Once you create the rules, you can use the "Export" button'
' to save them to a file.'
)
if opt.name in DEFAULT_TRUE_OPTIONS and rec.recommended_value is True:
switches = ['--disable-'+opt.long_switch]
add_option(Option(*switches, **attrs))
def group_titles():
return _('INPUT OPTIONS'), _('OUTPUT OPTIONS')
def recipe_test(option, opt_str, value, parser):
assert value is None
value = []
def floatable(s):
try:
float(s)
return True
except ValueError:
return False
for arg in parser.rargs:
# stop on --foo like options
if arg[:2] == "--":
break
# stop on -a, but not on -3 or -3.0
if arg[:1] == "-" and len(arg) > 1 and not floatable(arg):
break
try:
value.append(int(arg))
except (TypeError, ValueError, AttributeError):
break
if len(value) == 2:
break
del parser.rargs[:len(value)]
while len(value) < 2:
value.append(2)
setattr(parser.values, option.dest, tuple(value))
def add_input_output_options(parser, plumber):
input_options, output_options = \
plumber.input_options, plumber.output_options
def add_options(group, options):
for opt in options:
if plumber.input_fmt == 'recipe' and opt.option.long_switch == 'test':
group(Option('--test', dest='test', action='callback', callback=recipe_test))
else:
option_recommendation_to_cli_option(group, opt)
if input_options:
title = group_titles()[0]
io = OptionGroup(parser, title, _('Options to control the processing'
' of the input %s file')%plumber.input_fmt)
add_options(io.add_option, input_options)
parser.add_option_group(io)
if output_options:
title = group_titles()[1]
oo = OptionGroup(parser, title, _('Options to control the processing'
' of the output %s')%plumber.output_fmt)
add_options(oo.add_option, output_options)
parser.add_option_group(oo)
def add_pipeline_options(parser, plumber):
groups = OrderedDict((
('' , ('',
[
'input_profile',
'output_profile',
]
)),
(_('LOOK AND FEEL') , (
_('Options to control the look and feel of the output'),
[
'base_font_size', 'disable_font_rescaling',
'font_size_mapping', 'embed_font_family',
'subset_embedded_fonts', 'embed_all_fonts',
'line_height', 'minimum_line_height',
'linearize_tables',
'extra_css', 'filter_css', 'transform_css_rules', 'expand_css',
'smarten_punctuation', 'unsmarten_punctuation',
'margin_top', 'margin_left', 'margin_right',
'margin_bottom', 'change_justification',
'insert_blank_line', 'insert_blank_line_size',
'remove_paragraph_spacing',
'remove_paragraph_spacing_indent_size',
'asciiize', 'keep_ligatures',
]
)),
(_('HEURISTIC PROCESSING') , (
_('Modify the document text and structure using common'
' patterns. Disabled by default. Use %(en)s to enable. '
' Individual actions can be disabled with the %(dis)s options.')
% dict(en='--enable-heuristics', dis='--disable-*'),
['enable_heuristics'] + HEURISTIC_OPTIONS
)),
(_('SEARCH AND REPLACE') , (
_('Modify the document text and structure using user defined patterns.'),
[
'sr1_search', 'sr1_replace',
'sr2_search', 'sr2_replace',
'sr3_search', 'sr3_replace',
'search_replace',
]
)),
(_('STRUCTURE DETECTION') , (
_('Control auto-detection of document structure.'),
[
'chapter', 'chapter_mark',
'prefer_metadata_cover', 'remove_first_image',
'insert_metadata', 'page_breaks_before',
'remove_fake_margins', 'start_reading_at',
]
)),
(_('TABLE OF CONTENTS') , (
_('Control the automatic generation of a Table of Contents. By '
'default, if the source file has a Table of Contents, it will '
'be used in preference to the automatically generated one.'),
[
'level1_toc', 'level2_toc', 'level3_toc',
'toc_threshold', 'max_toc_links', 'no_chapters_in_toc',
'use_auto_toc', 'toc_filter', 'duplicate_links_in_toc',
]
)),
(_('METADATA') , (_('Options to set metadata in the output'),
plumber.metadata_option_names + ['read_metadata_from_opf'],
)),
(_('DEBUG'), (_('Options to help with debugging the conversion'),
[
'verbose',
'debug_pipeline',
])),
))
for group, (desc, options) in iteritems(groups):
if group:
group = OptionGroup(parser, group, desc)
parser.add_option_group(group)
add_option = group.add_option if group != '' else parser.add_option
for name in options:
rec = plumber.get_option_by_name(name)
if rec.level < rec.HIGH:
option_recommendation_to_cli_option(add_option, rec)
def option_parser():
parser = OptionParser(usage=USAGE)
parser.add_option('--list-recipes', default=False, action='store_true',
help=_('List builtin recipe names. You can create an e-book from '
'a builtin recipe like this: ebook-convert "Recipe Name.recipe" '
'output.epub'))
return parser
class ProgressBar(object):
def __init__(self, log):
self.log = log
def __call__(self, frac, msg=''):
if msg:
percent = int(frac*100)
self.log('%d%% %s'%(percent, msg))
def create_option_parser(args, log):
if '--version' in args:
from calibre.constants import __appname__, __version__, __author__
log(os.path.basename(args[0]), '('+__appname__, __version__+')')
log('Created by:', __author__)
raise SystemExit(0)
if '--list-recipes' in args:
from calibre.web.feeds.recipes.collection import get_builtin_recipe_titles
log('Available recipes:')
titles = sorted(get_builtin_recipe_titles())
for title in titles:
try:
log('\t'+title)
except:
log('\t'+repr(title))
log('%d recipes available'%len(titles))
raise SystemExit(0)
parser = option_parser()
if len(args) < 3:
print_help(parser, log)
if any(x in args for x in ('-h', '--help')):
raise SystemExit(0)
else:
raise SystemExit(1)
input, output = check_command_line_options(parser, args, log)
from calibre.ebooks.conversion.plumber import Plumber
reporter = ProgressBar(log)
if patheq(input, output):
raise ValueError('Input file is the same as the output file')
plumber = Plumber(input, output, log, reporter)
add_input_output_options(parser, plumber)
add_pipeline_options(parser, plumber)
return parser, plumber
def abspath(x):
if x.startswith('http:') or x.startswith('https:'):
return x
return os.path.abspath(os.path.expanduser(x))
def escape_sr_pattern(exp):
return exp.replace('\n', '\ue123')
def read_sr_patterns(path, log=None):
import json, re
pats = []
with open(path, 'rb') as f:
lines = f.read().decode('utf-8').splitlines()
pat = None
for line in lines:
if pat is None:
if not line.strip():
continue
line = line.replace('\ue123', '\n')
try:
re.compile(line)
except:
msg = 'Invalid regular expression: %r from file: %r'%(
line, path)
if log is not None:
log.error(msg)
raise SystemExit(1)
else:
raise ValueError(msg)
pat = line
else:
pats.append((pat, line))
pat = None
return json.dumps(pats)
def main(args=sys.argv):
log = Log()
parser, plumber = create_option_parser(args, log)
opts, leftover_args = parser.parse_args(args)
if len(leftover_args) > 3:
log.error('Extra arguments not understood:', u', '.join(leftover_args[3:]))
return 1
for x in ('read_metadata_from_opf', 'cover'):
if getattr(opts, x, None) is not None:
setattr(opts, x, abspath(getattr(opts, x)))
if opts.search_replace:
opts.search_replace = read_sr_patterns(opts.search_replace, log)
if opts.transform_css_rules:
from calibre.ebooks.css_transform_rules import import_rules, validate_rule
with open(opts.transform_css_rules, 'rb') as tcr:
opts.transform_css_rules = rules = list(import_rules(tcr.read()))
for rule in rules:
title, msg = validate_rule(rule)
if title and msg:
log.error('Failed to parse CSS transform rules')
log.error(title)
log.error(msg)
return 1
recommendations = [(n.dest, getattr(opts, n.dest),
OptionRecommendation.HIGH)
for n in parser.options_iter()
if n.dest]
plumber.merge_ui_recommendations(recommendations)
try:
plumber.run()
except ConversionUserFeedBack as e:
ll = {'info': log.info, 'warn': log.warn,
'error':log.error}.get(e.level, log.info)
ll(e.title)
if e.det_msg:
log.debug(e.detmsg)
ll(e.msg)
raise SystemExit(1)
log(_('Output saved to'), ' ', plumber.output)
return 0
def manual_index_strings():
return _('''\
The options and default values for the options change depending on both the
input and output formats, so you should always check with::
%s
Below are the options that are common to all conversion, followed by the
options specific to every input and output format.''')
if __name__ == '__main__':
sys.exit(main())

View File

@@ -0,0 +1,10 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

View File

@@ -0,0 +1,29 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
from calibre.customize.conversion import InputFormatPlugin
from polyglot.builtins import getcwd
class AZW4Input(InputFormatPlugin):
name = 'AZW4 Input'
author = 'John Schember'
description = 'Convert AZW4 to HTML'
file_types = {'azw4'}
commit_name = 'azw4_input'
def convert(self, stream, options, file_ext, log,
accelerators):
from calibre.ebooks.pdb.header import PdbHeaderReader
from calibre.ebooks.azw4.reader import Reader
header = PdbHeaderReader(stream)
reader = Reader(header, stream, log, options)
opf = reader.extract_content(getcwd())
return opf

View File

@@ -0,0 +1,202 @@
from __future__ import absolute_import, division, print_function, unicode_literals
''' CHM File decoding support '''
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>,' \
' and Alex Bramley <a.bramley at gmail.com>.'
import os
from calibre.customize.conversion import InputFormatPlugin
from calibre.ptempfile import TemporaryDirectory
from calibre.constants import filesystem_encoding
from polyglot.builtins import unicode_type, as_bytes
class CHMInput(InputFormatPlugin):
name = 'CHM Input'
author = 'Kovid Goyal and Alex Bramley'
description = 'Convert CHM files to OEB'
file_types = {'chm'}
commit_name = 'chm_input'
def _chmtohtml(self, output_dir, chm_path, no_images, log, debug_dump=False):
from calibre.ebooks.chm.reader import CHMReader
log.debug('Opening CHM file')
rdr = CHMReader(chm_path, log, input_encoding=self.opts.input_encoding)
log.debug('Extracting CHM to %s' % output_dir)
rdr.extract_content(output_dir, debug_dump=debug_dump)
self._chm_reader = rdr
return rdr.hhc_path
def convert(self, stream, options, file_ext, log, accelerators):
from calibre.ebooks.chm.metadata import get_metadata_from_reader
from calibre.customize.ui import plugin_for_input_format
self.opts = options
log.debug('Processing CHM...')
with TemporaryDirectory('_chm2oeb') as tdir:
if not isinstance(tdir, unicode_type):
tdir = tdir.decode(filesystem_encoding)
html_input = plugin_for_input_format('html')
for opt in html_input.options:
setattr(options, opt.option.name, opt.recommended_value)
no_images = False # options.no_images
chm_name = stream.name
# chm_data = stream.read()
# closing stream so CHM can be opened by external library
stream.close()
log.debug('tdir=%s' % tdir)
log.debug('stream.name=%s' % stream.name)
debug_dump = False
odi = options.debug_pipeline
if odi:
debug_dump = os.path.join(odi, 'input')
mainname = self._chmtohtml(tdir, chm_name, no_images, log,
debug_dump=debug_dump)
mainpath = os.path.join(tdir, mainname)
try:
metadata = get_metadata_from_reader(self._chm_reader)
except Exception:
log.exception('Failed to read metadata, using filename')
from calibre.ebooks.metadata.book.base import Metadata
metadata = Metadata(os.path.basename(chm_name))
encoding = self._chm_reader.get_encoding() or options.input_encoding or 'cp1252'
self._chm_reader.CloseCHM()
# print((tdir, mainpath))
# from calibre import ipython
# ipython()
options.debug_pipeline = None
options.input_encoding = 'utf-8'
uenc = encoding
if os.path.abspath(mainpath) in self._chm_reader.re_encoded_files:
uenc = 'utf-8'
htmlpath, toc = self._create_html_root(mainpath, log, uenc)
oeb = self._create_oebbook_html(htmlpath, tdir, options, log, metadata)
options.debug_pipeline = odi
if toc.count() > 1:
oeb.toc = self.parse_html_toc(oeb.spine[0])
oeb.manifest.remove(oeb.spine[0])
oeb.auto_generated_toc = False
return oeb
def parse_html_toc(self, item):
from calibre.ebooks.oeb.base import TOC, XPath
dx = XPath('./h:div')
ax = XPath('./h:a[1]')
def do_node(parent, div):
for child in dx(div):
a = ax(child)[0]
c = parent.add(a.text, a.attrib['href'])
do_node(c, child)
toc = TOC()
root = XPath('//h:div[1]')(item.data)[0]
do_node(toc, root)
return toc
def _create_oebbook_html(self, htmlpath, basedir, opts, log, mi):
# use HTMLInput plugin to generate book
from calibre.customize.builtins import HTMLInput
opts.breadth_first = True
htmlinput = HTMLInput(None)
oeb = htmlinput.create_oebbook(htmlpath, basedir, opts, log, mi)
return oeb
def _create_html_root(self, hhcpath, log, encoding):
from lxml import html
from polyglot.urllib import unquote as _unquote
from calibre.ebooks.oeb.base import urlquote
from calibre.ebooks.chardet import xml_to_unicode
hhcdata = self._read_file(hhcpath)
hhcdata = hhcdata.decode(encoding)
hhcdata = xml_to_unicode(hhcdata, verbose=True,
strip_encoding_pats=True, resolve_entities=True)[0]
hhcroot = html.fromstring(hhcdata)
toc = self._process_nodes(hhcroot)
# print("=============================")
# print("Printing hhcroot")
# print(etree.tostring(hhcroot, pretty_print=True))
# print("=============================")
log.debug('Found %d section nodes' % toc.count())
htmlpath = os.path.splitext(hhcpath)[0] + ".html"
base = os.path.dirname(os.path.abspath(htmlpath))
def unquote(x):
if isinstance(x, unicode_type):
x = x.encode('utf-8')
return _unquote(x).decode('utf-8')
def unquote_path(x):
y = unquote(x)
if (not os.path.exists(os.path.join(base, x)) and os.path.exists(os.path.join(base, y))):
x = y
return x
def donode(item, parent, base, subpath):
for child in item:
title = child.title
if not title:
continue
raw = unquote_path(child.href or '')
rsrcname = os.path.basename(raw)
rsrcpath = os.path.join(subpath, rsrcname)
if (not os.path.exists(os.path.join(base, rsrcpath)) and os.path.exists(os.path.join(base, raw))):
rsrcpath = raw
if '%' not in rsrcpath:
rsrcpath = urlquote(rsrcpath)
if not raw:
rsrcpath = ''
c = DIV(A(title, href=rsrcpath))
donode(child, c, base, subpath)
parent.append(c)
with open(htmlpath, 'wb') as f:
if toc.count() > 1:
from lxml.html.builder import HTML, BODY, DIV, A
path0 = toc[0].href
path0 = unquote_path(path0)
subpath = os.path.dirname(path0)
base = os.path.dirname(f.name)
root = DIV()
donode(toc, root, base, subpath)
raw = html.tostring(HTML(BODY(root)), encoding='utf-8',
pretty_print=True)
f.write(raw)
else:
f.write(as_bytes(hhcdata))
return htmlpath, toc
def _read_file(self, name):
with lopen(name, 'rb') as f:
data = f.read()
return data
def add_node(self, node, toc, ancestor_map):
from calibre.ebooks.chm.reader import match_string
if match_string(node.attrib.get('type', ''), 'text/sitemap'):
p = node.xpath('ancestor::ul[1]/ancestor::li[1]/object[1]')
parent = p[0] if p else None
toc = ancestor_map.get(parent, toc)
title = href = ''
for param in node.xpath('./param'):
if match_string(param.attrib['name'], 'name'):
title = param.attrib['value']
elif match_string(param.attrib['name'], 'local'):
href = param.attrib['value']
child = toc.add(title or _('Unknown'), href)
ancestor_map[node] = child
def _process_nodes(self, root):
from calibre.ebooks.oeb.base import TOC
toc = TOC()
ancestor_map = {}
for node in root.xpath('//object'):
self.add_node(node, toc, ancestor_map)
return toc

View File

@@ -0,0 +1,310 @@
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
'''
Based on ideas from comiclrf created by FangornUK.
'''
import shutil, textwrap, codecs, os
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
from calibre import CurrentDir
from calibre.ptempfile import PersistentTemporaryDirectory
from polyglot.builtins import getcwd, map
class ComicInput(InputFormatPlugin):
name = 'Comic Input'
author = 'Kovid Goyal'
description = 'Optimize comic files (.cbz, .cbr, .cbc) for viewing on portable devices'
file_types = {'cbz', 'cbr', 'cbc'}
is_image_collection = True
commit_name = 'comic_input'
core_usage = -1
options = {
OptionRecommendation(name='colors', recommended_value=0,
help=_('Reduce the number of colors used in the image. This works only'
' if you choose the PNG output format. It is useful to reduce file sizes.'
' Set to zero to turn off. Maximum value is 256. It is off by default.')),
OptionRecommendation(name='dont_normalize', recommended_value=False,
help=_('Disable normalize (improve contrast) color range '
'for pictures. Default: False')),
OptionRecommendation(name='keep_aspect_ratio', recommended_value=False,
help=_('Maintain picture aspect ratio. Default is to fill the screen.')),
OptionRecommendation(name='dont_sharpen', recommended_value=False,
help=_('Disable sharpening.')),
OptionRecommendation(name='disable_trim', recommended_value=False,
help=_('Disable trimming of comic pages. For some comics, '
'trimming might remove content as well as borders.')),
OptionRecommendation(name='landscape', recommended_value=False,
help=_("Don't split landscape images into two portrait images")),
OptionRecommendation(name='wide', recommended_value=False,
help=_("Keep aspect ratio and scale image using screen height as "
"image width for viewing in landscape mode.")),
OptionRecommendation(name='right2left', recommended_value=False,
help=_('Used for right-to-left publications like manga. '
'Causes landscape pages to be split into portrait pages '
'from right to left.')),
OptionRecommendation(name='despeckle', recommended_value=False,
help=_('Enable Despeckle. Reduces speckle noise. '
'May greatly increase processing time.')),
OptionRecommendation(name='no_sort', recommended_value=False,
help=_("Don't sort the files found in the comic "
"alphabetically by name. Instead use the order they were "
"added to the comic.")),
OptionRecommendation(name='output_format', choices=['png', 'jpg'],
recommended_value='png', help=_('The format that images in the created e-book '
'are converted to. You can experiment to see which format gives '
'you optimal size and look on your device.')),
OptionRecommendation(name='no_process', recommended_value=False,
help=_("Apply no processing to the image")),
OptionRecommendation(name='dont_grayscale', recommended_value=False,
help=_('Do not convert the image to grayscale (black and white)')),
OptionRecommendation(name='comic_image_size', recommended_value=None,
help=_('Specify the image size as widthxheight pixels. Normally,'
' an image size is automatically calculated from the output '
'profile, this option overrides it.')),
OptionRecommendation(name='dont_add_comic_pages_to_toc', recommended_value=False,
help=_('When converting a CBC do not add links to each page to'
' the TOC. Note this only applies if the TOC has more than one'
' section')),
}
recommendations = {
('margin_left', 0, OptionRecommendation.HIGH),
('margin_top', 0, OptionRecommendation.HIGH),
('margin_right', 0, OptionRecommendation.HIGH),
('margin_bottom', 0, OptionRecommendation.HIGH),
('insert_blank_line', False, OptionRecommendation.HIGH),
('remove_paragraph_spacing', False, OptionRecommendation.HIGH),
('change_justification', 'left', OptionRecommendation.HIGH),
('dont_split_on_pagebreaks', True, OptionRecommendation.HIGH),
('chapter', None, OptionRecommendation.HIGH),
('page_breaks_brefore', None, OptionRecommendation.HIGH),
('use_auto_toc', False, OptionRecommendation.HIGH),
('page_breaks_before', None, OptionRecommendation.HIGH),
('disable_font_rescaling', True, OptionRecommendation.HIGH),
('linearize_tables', False, OptionRecommendation.HIGH),
}
def get_comics_from_collection(self, stream):
from calibre.libunzip import extract as zipextract
tdir = PersistentTemporaryDirectory('_comic_collection')
zipextract(stream, tdir)
comics = []
with CurrentDir(tdir):
if not os.path.exists('comics.txt'):
raise ValueError((
'%s is not a valid comic collection'
' no comics.txt was found in the file')
%stream.name)
with open('comics.txt', 'rb') as f:
raw = f.read()
if raw.startswith(codecs.BOM_UTF16_BE):
raw = raw.decode('utf-16-be')[1:]
elif raw.startswith(codecs.BOM_UTF16_LE):
raw = raw.decode('utf-16-le')[1:]
elif raw.startswith(codecs.BOM_UTF8):
raw = raw.decode('utf-8')[1:]
else:
raw = raw.decode('utf-8')
for line in raw.splitlines():
line = line.strip()
if not line:
continue
fname, title = line.partition(':')[0], line.partition(':')[-1]
fname = fname.replace('#', '_')
fname = os.path.join(tdir, *fname.split('/'))
if not title:
title = os.path.basename(fname).rpartition('.')[0]
if os.access(fname, os.R_OK):
comics.append([title, fname])
if not comics:
raise ValueError('%s has no comics'%stream.name)
return comics
def get_pages(self, comic, tdir2):
from calibre.ebooks.comic.input import (extract_comic, process_pages,
find_pages)
tdir = extract_comic(comic)
new_pages = find_pages(tdir, sort_on_mtime=self.opts.no_sort,
verbose=self.opts.verbose)
thumbnail = None
if not new_pages:
raise ValueError('Could not find any pages in the comic: %s'
%comic)
if self.opts.no_process:
n2 = []
for i, page in enumerate(new_pages):
n2.append(os.path.join(tdir2, '{} - {}' .format(i, os.path.basename(page))))
shutil.copyfile(page, n2[-1])
new_pages = n2
else:
new_pages, failures = process_pages(new_pages, self.opts,
self.report_progress, tdir2)
if failures:
self.log.warning('Could not process the following pages '
'(run with --verbose to see why):')
for f in failures:
self.log.warning('\t', f)
if not new_pages:
raise ValueError('Could not find any valid pages in comic: %s'
% comic)
thumbnail = os.path.join(tdir2,
'thumbnail.'+self.opts.output_format.lower())
if not os.access(thumbnail, os.R_OK):
thumbnail = None
return new_pages
def get_images(self):
return self._images
def convert(self, stream, opts, file_ext, log, accelerators):
from calibre.ebooks.metadata import MetaInformation
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.metadata.toc import TOC
self.opts, self.log= opts, log
if file_ext == 'cbc':
comics_ = self.get_comics_from_collection(stream)
else:
comics_ = [['Comic', os.path.abspath(stream.name)]]
stream.close()
comics = []
for i, x in enumerate(comics_):
title, fname = x
cdir = 'comic_%d'%(i+1) if len(comics_) > 1 else '.'
cdir = os.path.abspath(cdir)
if not os.path.exists(cdir):
os.makedirs(cdir)
pages = self.get_pages(fname, cdir)
if not pages:
continue
if self.for_viewer:
comics.append((title, pages, [self.create_viewer_wrapper(pages)]))
else:
wrappers = self.create_wrappers(pages)
comics.append((title, pages, wrappers))
if not comics:
raise ValueError('No comic pages found in %s'%stream.name)
mi = MetaInformation(os.path.basename(stream.name).rpartition('.')[0],
[_('Unknown')])
opf = OPFCreator(getcwd(), mi)
entries = []
def href(x):
if len(comics) == 1:
return os.path.basename(x)
return '/'.join(x.split(os.sep)[-2:])
cover_href = None
for comic in comics:
pages, wrappers = comic[1:]
page_entries = [(x, None) for x in map(href, pages)]
entries += [(w, None) for w in map(href, wrappers)] + page_entries
if cover_href is None and page_entries:
cover_href = page_entries[0][0]
opf.create_manifest(entries)
spine = []
for comic in comics:
spine.extend(map(href, comic[2]))
self._images = []
for comic in comics:
self._images.extend(comic[1])
opf.create_spine(spine)
if self.for_viewer and cover_href:
opf.guide.set_cover(cover_href)
toc = TOC()
if len(comics) == 1:
wrappers = comics[0][2]
for i, x in enumerate(wrappers):
toc.add_item(href(x), None, _('Page')+' %d'%(i+1),
play_order=i)
else:
po = 0
for comic in comics:
po += 1
wrappers = comic[2]
stoc = toc.add_item(href(wrappers[0]),
None, comic[0], play_order=po)
if not opts.dont_add_comic_pages_to_toc:
for i, x in enumerate(wrappers):
stoc.add_item(href(x), None,
_('Page')+' %d'%(i+1), play_order=po)
po += 1
opf.set_toc(toc)
with open('metadata.opf', 'wb') as m, open('toc.ncx', 'wb') as n:
opf.render(m, n, 'toc.ncx')
return os.path.abspath('metadata.opf')
def create_wrappers(self, pages):
from calibre.ebooks.oeb.base import XHTML_NS
wrappers = []
WRAPPER = textwrap.dedent('''\
<html xmlns="%s">
<head>
<meta charset="utf-8"/>
<title>Page #%d</title>
<style type="text/css">
@page { margin:0pt; padding: 0pt}
body { margin: 0pt; padding: 0pt}
div { text-align: center }
</style>
</head>
<body>
<div>
<img src="%s" alt="comic page #%d" />
</div>
</body>
</html>
''')
dir = os.path.dirname(pages[0])
for i, page in enumerate(pages):
wrapper = WRAPPER%(XHTML_NS, i+1, os.path.basename(page), i+1)
page = os.path.join(dir, 'page_%d.xhtml'%(i+1))
with open(page, 'wb') as f:
f.write(wrapper.encode('utf-8'))
wrappers.append(page)
return wrappers
def create_viewer_wrapper(self, pages):
from calibre.ebooks.oeb.base import XHTML_NS
def page(src):
return '<img src="{}"></img>'.format(os.path.basename(src))
pages = '\n'.join(map(page, pages))
base = os.path.dirname(pages[0])
wrapper = '''
<html xmlns="%s">
<head>
<meta charset="utf-8"/>
<style type="text/css">
html, body, img { height: 100vh; display: block; margin: 0; padding: 0; border-width: 0; }
img {
width: 100%%; height: 100%%;
object-fit: contain;
margin-left: auto; margin-right: auto;
max-width: 100vw; max-height: 100vh;
top: 50vh; transform: translateY(-50%%);
position: relative;
page-break-after: always;
}
</style>
</head>
<body>
%s
</body>
</html>
''' % (XHTML_NS, pages)
path = os.path.join(base, 'wrapper.xhtml')
with open(path, 'wb') as f:
f.write(wrapper.encode('utf-8'))
return path

View File

@@ -0,0 +1,67 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL 3'
__copyright__ = '2011, Anthon van der Neut <anthon@mnt.org>'
__docformat__ = 'restructuredtext en'
import os
from io import BytesIO
from calibre.customize.conversion import InputFormatPlugin
from polyglot.builtins import getcwd
class DJVUInput(InputFormatPlugin):
name = 'DJVU Input'
author = 'Anthon van der Neut'
description = 'Convert OCR-ed DJVU files (.djvu) to HTML'
file_types = {'djvu', 'djv'}
commit_name = 'djvu_input'
def convert(self, stream, options, file_ext, log, accelerators):
from calibre.ebooks.txt.processor import convert_basic
stdout = BytesIO()
from calibre.ebooks.djvu.djvu import DJVUFile
x = DJVUFile(stream)
x.get_text(stdout)
raw_text = stdout.getvalue()
if not raw_text:
raise ValueError('The DJVU file contains no text, only images, probably page scans.'
' calibre only supports conversion of DJVU files with actual text in them.')
html = convert_basic(raw_text.replace(b"\n", b' ').replace(
b'\037', b'\n\n'))
# Run the HTMLized text through the html processing plugin.
from calibre.customize.ui import plugin_for_input_format
html_input = plugin_for_input_format('html')
for opt in html_input.options:
setattr(options, opt.option.name, opt.recommended_value)
options.input_encoding = 'utf-8'
base = getcwd()
htmlfile = os.path.join(base, 'index.html')
c = 0
while os.path.exists(htmlfile):
c += 1
htmlfile = os.path.join(base, 'index%d.html'%c)
with open(htmlfile, 'wb') as f:
f.write(html.encode('utf-8'))
odi = options.debug_pipeline
options.debug_pipeline = None
# Generate oeb from html conversion.
with open(htmlfile, 'rb') as f:
oeb = html_input.convert(f, options, 'html', log,
{})
options.debug_pipeline = odi
os.remove(htmlfile)
# Set metadata from file.
from calibre.customize.ui import get_file_type_metadata
from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
mi = get_file_type_metadata(stream, file_ext)
meta_info_to_oeb_metadata(mi, oeb.metadata, log)
return oeb

View File

@@ -0,0 +1,34 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
class DOCXInput(InputFormatPlugin):
name = 'DOCX Input'
author = 'Kovid Goyal'
description = _('Convert DOCX files (.docx and .docm) to HTML')
file_types = {'docx', 'docm'}
commit_name = 'docx_input'
options = {
OptionRecommendation(name='docx_no_cover', recommended_value=False,
help=_('Normally, if a large image is present at the start of the document that looks like a cover, '
'it will be removed from the document and used as the cover for created e-book. This option '
'turns off that behavior.')),
OptionRecommendation(name='docx_no_pagebreaks_between_notes', recommended_value=False,
help=_('Do not insert a page break after every endnote.')),
OptionRecommendation(name='docx_inline_subsup', recommended_value=False,
help=_('Render superscripts and subscripts so that they do not affect the line height.')),
}
recommendations = {('page_breaks_before', '/', OptionRecommendation.MED)}
def convert(self, stream, options, file_ext, log, accelerators):
from calibre.ebooks.docx.to_html import Convert
return Convert(stream, detect_cover=not options.docx_no_cover, log=log, notes_nopb=options.docx_no_pagebreaks_between_notes,
nosupsub=options.docx_inline_subsup)()

View File

@@ -0,0 +1,93 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
from calibre.customize.conversion import OutputFormatPlugin, OptionRecommendation
PAGE_SIZES = ['a0', 'a1', 'a2', 'a3', 'a4', 'a5', 'a6', 'b0', 'b1',
'b2', 'b3', 'b4', 'b5', 'b6', 'legal', 'letter']
class DOCXOutput(OutputFormatPlugin):
name = 'DOCX Output'
author = 'Kovid Goyal'
file_type = 'docx'
commit_name = 'docx_output'
ui_data = {'page_sizes': PAGE_SIZES}
options = {
OptionRecommendation(name='docx_page_size', recommended_value='letter',
level=OptionRecommendation.LOW, choices=PAGE_SIZES,
help=_('The size of the page. Default is letter. Choices '
'are %s') % PAGE_SIZES),
OptionRecommendation(name='docx_custom_page_size', recommended_value=None,
help=_('Custom size of the document. Use the form widthxheight '
'EG. `123x321` to specify the width and height (in pts). '
'This overrides any specified page-size.')),
OptionRecommendation(name='docx_no_cover', recommended_value=False,
help=_('Do not insert the book cover as an image at the start of the document.'
' If you use this option, the book cover will be discarded.')),
OptionRecommendation(name='preserve_cover_aspect_ratio', recommended_value=False,
help=_('Preserve the aspect ratio of the cover image instead of stretching'
' it out to cover the entire page.')),
OptionRecommendation(name='docx_no_toc', recommended_value=False,
help=_('Do not insert the table of contents as a page at the start of the document.')),
OptionRecommendation(name='extract_to',
help=_('Extract the contents of the generated %s file to the '
'specified directory. The contents of the directory are first '
'deleted, so be careful.') % 'DOCX'),
OptionRecommendation(name='docx_page_margin_left', recommended_value=72.0,
level=OptionRecommendation.LOW,
help=_('The size of the left page margin, in pts. Default is 72pt.'
' Overrides the common left page margin setting.')
),
OptionRecommendation(name='docx_page_margin_top', recommended_value=72.0,
level=OptionRecommendation.LOW,
help=_('The size of the top page margin, in pts. Default is 72pt.'
' Overrides the common top page margin setting, unless set to zero.')
),
OptionRecommendation(name='docx_page_margin_right', recommended_value=72.0,
level=OptionRecommendation.LOW,
help=_('The size of the right page margin, in pts. Default is 72pt.'
' Overrides the common right page margin setting, unless set to zero.')
),
OptionRecommendation(name='docx_page_margin_bottom', recommended_value=72.0,
level=OptionRecommendation.LOW,
help=_('The size of the bottom page margin, in pts. Default is 72pt.'
' Overrides the common bottom page margin setting, unless set to zero.')
),
}
def convert_metadata(self, oeb):
from lxml import etree
from calibre.ebooks.oeb.base import OPF, OPF2_NS
from calibre.ebooks.metadata.opf2 import OPF as ReadOPF
from io import BytesIO
package = etree.Element(OPF('package'), attrib={'version': '2.0'}, nsmap={None: OPF2_NS})
oeb.metadata.to_opf2(package)
self.mi = ReadOPF(BytesIO(etree.tostring(package, encoding='utf-8')), populate_spine=False, try_to_guess_cover=False).to_book_metadata()
def convert(self, oeb, output_path, input_plugin, opts, log):
from calibre.ebooks.docx.writer.container import DOCX
from calibre.ebooks.docx.writer.from_html import Convert
docx = DOCX(opts, log)
self.convert_metadata(oeb)
Convert(oeb, docx, self.mi, not opts.docx_no_cover, not opts.docx_no_toc)()
docx.write(output_path, self.mi)
if opts.extract_to:
from calibre.ebooks.docx.dump import do_dump
do_dump(output_path, opts.extract_to)

View File

@@ -0,0 +1,438 @@
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL 3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import os, re, posixpath
from itertools import cycle
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
from polyglot.builtins import getcwd
ADOBE_OBFUSCATION = 'http://ns.adobe.com/pdf/enc#RC'
IDPF_OBFUSCATION = 'http://www.idpf.org/2008/embedding'
def decrypt_font_data(key, data, algorithm):
is_adobe = algorithm == ADOBE_OBFUSCATION
crypt_len = 1024 if is_adobe else 1040
crypt = bytearray(data[:crypt_len])
key = cycle(iter(bytearray(key)))
decrypt = bytes(bytearray(x^next(key) for x in crypt))
return decrypt + data[crypt_len:]
def decrypt_font(key, path, algorithm):
with lopen(path, 'r+b') as f:
data = decrypt_font_data(key, f.read(), algorithm)
f.seek(0), f.truncate(), f.write(data)
class EPUBInput(InputFormatPlugin):
name = 'EPUB Input'
author = 'Kovid Goyal'
description = 'Convert EPUB files (.epub) to HTML'
file_types = {'epub'}
output_encoding = None
commit_name = 'epub_input'
recommendations = {('page_breaks_before', '/', OptionRecommendation.MED)}
def process_encryption(self, encfile, opf, log):
from lxml import etree
import uuid, hashlib
idpf_key = opf.raw_unique_identifier
if idpf_key:
idpf_key = re.sub('[\u0020\u0009\u000d\u000a]', '', idpf_key)
idpf_key = hashlib.sha1(idpf_key.encode('utf-8')).digest()
key = None
for item in opf.identifier_iter():
scheme = None
for xkey in item.attrib.keys():
if xkey.endswith('scheme'):
scheme = item.get(xkey)
if (scheme and scheme.lower() == 'uuid') or \
(item.text and item.text.startswith('urn:uuid:')):
try:
key = item.text.rpartition(':')[-1]
key = uuid.UUID(key).bytes
except:
import traceback
traceback.print_exc()
key = None
try:
root = etree.parse(encfile)
for em in root.xpath('descendant::*[contains(name(), "EncryptionMethod")]'):
algorithm = em.get('Algorithm', '')
if algorithm not in {ADOBE_OBFUSCATION, IDPF_OBFUSCATION}:
return False
cr = em.getparent().xpath('descendant::*[contains(name(), "CipherReference")]')[0]
uri = cr.get('URI')
path = os.path.abspath(os.path.join(os.path.dirname(encfile), '..', *uri.split('/')))
tkey = (key if algorithm == ADOBE_OBFUSCATION else idpf_key)
if (tkey and os.path.exists(path)):
self._encrypted_font_uris.append(uri)
decrypt_font(tkey, path, algorithm)
return True
except:
import traceback
traceback.print_exc()
return False
def set_guide_type(self, opf, gtype, href=None, title=''):
# Set the specified guide entry
for elem in list(opf.iterguide()):
if elem.get('type', '').lower() == gtype:
elem.getparent().remove(elem)
if href is not None:
t = opf.create_guide_item(gtype, title, href)
for guide in opf.root.xpath('./*[local-name()="guide"]'):
guide.append(t)
return
guide = opf.create_guide_element()
opf.root.append(guide)
guide.append(t)
return t
def rationalize_cover3(self, opf, log):
''' If there is a reference to the cover/titlepage via manifest properties, convert to
entries in the <guide> so that the rest of the pipeline picks it up. '''
from calibre.ebooks.metadata.opf3 import items_with_property
removed = guide_titlepage_href = guide_titlepage_id = None
# Look for titlepages incorrectly marked in the <guide> as covers
guide_cover, guide_elem = None, None
for guide_elem in opf.iterguide():
if guide_elem.get('type', '').lower() == 'cover':
guide_cover = guide_elem.get('href', '').partition('#')[0]
break
if guide_cover:
spine = list(opf.iterspine())
if spine:
idref = spine[0].get('idref', '')
for x in opf.itermanifest():
if x.get('id') == idref and x.get('href') == guide_cover:
guide_titlepage_href = guide_cover
guide_titlepage_id = idref
break
raster_cover_href = opf.epub3_raster_cover or opf.raster_cover
if raster_cover_href:
self.set_guide_type(opf, 'cover', raster_cover_href, 'Cover Image')
titlepage_id = titlepage_href = None
for item in items_with_property(opf.root, 'calibre:title-page'):
tid, href = item.get('id'), item.get('href')
if href and tid:
titlepage_id, titlepage_href = tid, href.partition('#')[0]
break
if titlepage_href is None:
titlepage_href, titlepage_id = guide_titlepage_href, guide_titlepage_id
if titlepage_href is not None:
self.set_guide_type(opf, 'titlepage', titlepage_href, 'Title Page')
spine = list(opf.iterspine())
if len(spine) > 1:
for item in spine:
if item.get('idref') == titlepage_id:
log('Found HTML cover', titlepage_href)
if self.for_viewer:
item.attrib.pop('linear', None)
else:
item.getparent().remove(item)
removed = titlepage_href
return removed
def rationalize_cover2(self, opf, log):
''' Ensure that the cover information in the guide is correct. That
means, at most one entry with type="cover" that points to a raster
cover and at most one entry with type="titlepage" that points to an
HTML titlepage. '''
from calibre.ebooks.oeb.base import OPF
removed = None
from lxml import etree
guide_cover, guide_elem = None, None
for guide_elem in opf.iterguide():
if guide_elem.get('type', '').lower() == 'cover':
guide_cover = guide_elem.get('href', '').partition('#')[0]
break
if not guide_cover:
raster_cover = opf.raster_cover
if raster_cover:
if guide_elem is None:
g = opf.root.makeelement(OPF('guide'))
opf.root.append(g)
else:
g = guide_elem.getparent()
guide_cover = raster_cover
guide_elem = g.makeelement(OPF('reference'), attrib={'href':raster_cover, 'type':'cover'})
g.append(guide_elem)
return
spine = list(opf.iterspine())
if not spine:
return
# Check if the cover specified in the guide is also
# the first element in spine
idref = spine[0].get('idref', '')
manifest = list(opf.itermanifest())
if not manifest:
return
elem = [x for x in manifest if x.get('id', '') == idref]
if not elem or elem[0].get('href', None) != guide_cover:
return
log('Found HTML cover', guide_cover)
# Remove from spine as covers must be treated
# specially
if not self.for_viewer:
if len(spine) == 1:
log.warn('There is only a single spine item and it is marked as the cover. Removing cover marking.')
for guide_elem in tuple(opf.iterguide()):
if guide_elem.get('type', '').lower() == 'cover':
guide_elem.getparent().remove(guide_elem)
return
else:
spine[0].getparent().remove(spine[0])
removed = guide_cover
else:
# Ensure the cover is displayed as the first item in the book, some
# epub files have it set with linear='no' which causes the cover to
# display in the end
spine[0].attrib.pop('linear', None)
opf.spine[0].is_linear = True
# Ensure that the guide has a cover entry pointing to a raster cover
# and a titlepage entry pointing to the html titlepage. The titlepage
# entry will be used by the epub output plugin, the raster cover entry
# by other output plugins.
# Search for a raster cover identified in the OPF
raster_cover = opf.raster_cover
# Set the cover guide entry
if raster_cover is not None:
guide_elem.set('href', raster_cover)
else:
# Render the titlepage to create a raster cover
from calibre.ebooks import render_html_svg_workaround
guide_elem.set('href', 'calibre_raster_cover.jpg')
t = etree.SubElement(
elem[0].getparent(), OPF('item'), href=guide_elem.get('href'), id='calibre_raster_cover')
t.set('media-type', 'image/jpeg')
if os.path.exists(guide_cover):
renderer = render_html_svg_workaround(guide_cover, log)
if renderer is not None:
with lopen('calibre_raster_cover.jpg', 'wb') as f:
f.write(renderer)
# Set the titlepage guide entry
self.set_guide_type(opf, 'titlepage', guide_cover, 'Title Page')
return removed
def find_opf(self):
from calibre.utils.xml_parse import safe_xml_fromstring
def attr(n, attr):
for k, v in n.attrib.items():
if k.endswith(attr):
return v
try:
with lopen('META-INF/container.xml', 'rb') as f:
root = safe_xml_fromstring(f.read())
for r in root.xpath('//*[local-name()="rootfile"]'):
if attr(r, 'media-type') != "application/oebps-package+xml":
continue
path = attr(r, 'full-path')
if not path:
continue
path = os.path.join(getcwd(), *path.split('/'))
if os.path.exists(path):
return path
except Exception:
import traceback
traceback.print_exc()
def convert(self, stream, options, file_ext, log, accelerators):
from calibre.utils.zipfile import ZipFile
from calibre import walk
from calibre.ebooks import DRMError
from calibre.ebooks.metadata.opf2 import OPF
try:
zf = ZipFile(stream)
zf.extractall(getcwd())
except:
log.exception('EPUB appears to be invalid ZIP file, trying a'
' more forgiving ZIP parser')
from calibre.utils.localunzip import extractall
stream.seek(0)
extractall(stream)
encfile = os.path.abspath(os.path.join('META-INF', 'encryption.xml'))
opf = self.find_opf()
if opf is None:
for f in walk('.'):
if f.lower().endswith('.opf') and '__MACOSX' not in f and \
not os.path.basename(f).startswith('.'):
opf = os.path.abspath(f)
break
path = getattr(stream, 'name', 'stream')
if opf is None:
raise ValueError('%s is not a valid EPUB file (could not find opf)'%path)
opf = os.path.relpath(opf, getcwd())
parts = os.path.split(opf)
opf = OPF(opf, os.path.dirname(os.path.abspath(opf)))
self._encrypted_font_uris = []
if os.path.exists(encfile):
if not self.process_encryption(encfile, opf, log):
raise DRMError(os.path.basename(path))
self.encrypted_fonts = self._encrypted_font_uris
if len(parts) > 1 and parts[0]:
delta = '/'.join(parts[:-1])+'/'
def normpath(x):
return posixpath.normpath(delta + elem.get('href'))
for elem in opf.itermanifest():
elem.set('href', normpath(elem.get('href')))
for elem in opf.iterguide():
elem.set('href', normpath(elem.get('href')))
f = self.rationalize_cover3 if opf.package_version >= 3.0 else self.rationalize_cover2
self.removed_cover = f(opf, log)
if self.removed_cover:
self.removed_items_to_ignore = (self.removed_cover,)
epub3_nav = opf.epub3_nav
if epub3_nav is not None:
self.convert_epub3_nav(epub3_nav, opf, log, options)
for x in opf.itermanifest():
if x.get('media-type', '') == 'application/x-dtbook+xml':
raise ValueError(
'EPUB files with DTBook markup are not supported')
not_for_spine = set()
for y in opf.itermanifest():
id_ = y.get('id', None)
if id_:
mt = y.get('media-type', None)
if mt in {
'application/vnd.adobe-page-template+xml',
'application/vnd.adobe.page-template+xml',
'application/adobe-page-template+xml',
'application/adobe.page-template+xml',
'application/text'
}:
not_for_spine.add(id_)
ext = y.get('href', '').rpartition('.')[-1].lower()
if mt == 'text/plain' and ext in {'otf', 'ttf'}:
# some epub authoring software sets font mime types to
# text/plain
not_for_spine.add(id_)
y.set('media-type', 'application/font')
seen = set()
for x in list(opf.iterspine()):
ref = x.get('idref', None)
if not ref or ref in not_for_spine or ref in seen:
x.getparent().remove(x)
continue
seen.add(ref)
if len(list(opf.iterspine())) == 0:
raise ValueError('No valid entries in the spine of this EPUB')
with lopen('content.opf', 'wb') as nopf:
nopf.write(opf.render())
return os.path.abspath('content.opf')
def convert_epub3_nav(self, nav_path, opf, log, opts):
from lxml import etree
from calibre.ebooks.chardet import xml_to_unicode
from calibre.ebooks.oeb.polish.parsing import parse
from calibre.ebooks.oeb.base import EPUB_NS, XHTML, NCX_MIME, NCX, urlnormalize, urlunquote, serialize
from calibre.ebooks.oeb.polish.toc import first_child
from calibre.utils.xml_parse import safe_xml_fromstring
from tempfile import NamedTemporaryFile
with lopen(nav_path, 'rb') as f:
raw = f.read()
raw = xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0]
root = parse(raw, log=log)
ncx = safe_xml_fromstring('<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1" xml:lang="eng"><navMap/></ncx>')
navmap = ncx[0]
et = '{%s}type' % EPUB_NS
bn = os.path.basename(nav_path)
def add_from_li(li, parent):
href = text = None
for x in li.iterchildren(XHTML('a'), XHTML('span')):
text = etree.tostring(
x, method='text', encoding='unicode', with_tail=False).strip() or ' '.join(
x.xpath('descendant-or-self::*/@title')).strip()
href = x.get('href')
if href:
if href.startswith('#'):
href = bn + href
break
np = parent.makeelement(NCX('navPoint'))
parent.append(np)
np.append(np.makeelement(NCX('navLabel')))
np[0].append(np.makeelement(NCX('text')))
np[0][0].text = text
if href:
np.append(np.makeelement(NCX('content'), attrib={'src':href}))
return np
def process_nav_node(node, toc_parent):
for li in node.iterchildren(XHTML('li')):
child = add_from_li(li, toc_parent)
ol = first_child(li, XHTML('ol'))
if child is not None and ol is not None:
process_nav_node(ol, child)
for nav in root.iterdescendants(XHTML('nav')):
if nav.get(et) == 'toc':
ol = first_child(nav, XHTML('ol'))
if ol is not None:
process_nav_node(ol, navmap)
break
else:
return
with NamedTemporaryFile(suffix='.ncx', dir=os.path.dirname(nav_path), delete=False) as f:
f.write(etree.tostring(ncx, encoding='utf-8'))
ncx_href = os.path.relpath(f.name, getcwd()).replace(os.sep, '/')
ncx_id = opf.create_manifest_item(ncx_href, NCX_MIME, append=True).get('id')
for spine in opf.root.xpath('//*[local-name()="spine"]'):
spine.set('toc', ncx_id)
opts.epub3_nav_href = urlnormalize(os.path.relpath(nav_path).replace(os.sep, '/'))
opts.epub3_nav_parsed = root
if getattr(self, 'removed_cover', None):
changed = False
base_path = os.path.dirname(nav_path)
for elem in root.xpath('//*[@href]'):
href, frag = elem.get('href').partition('#')[::2]
link_path = os.path.relpath(os.path.join(base_path, urlunquote(href)), base_path)
abs_href = urlnormalize(link_path)
if abs_href == self.removed_cover:
changed = True
elem.set('data-calibre-removed-titlepage', '1')
if changed:
with lopen(nav_path, 'wb') as f:
f.write(serialize(root, 'application/xhtml+xml'))
def postprocess_book(self, oeb, opts, log):
rc = getattr(self, 'removed_cover', None)
if rc:
cover_toc_item = None
for item in oeb.toc.iterdescendants():
if item.href and item.href.partition('#')[0] == rc:
cover_toc_item = item
break
spine = {x.href for x in oeb.spine}
if (cover_toc_item is not None and cover_toc_item not in spine):
oeb.toc.item_that_refers_to_cover = cover_toc_item

View File

@@ -0,0 +1,548 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import os, shutil, re
from calibre.customize.conversion import (OutputFormatPlugin,
OptionRecommendation)
from calibre.ptempfile import TemporaryDirectory
from calibre import CurrentDir
from polyglot.builtins import unicode_type, filter, map, zip, range, as_bytes
block_level_tags = (
'address',
'body',
'blockquote',
'center',
'dir',
'div',
'dl',
'fieldset',
'form',
'h1',
'h2',
'h3',
'h4',
'h5',
'h6',
'hr',
'isindex',
'menu',
'noframes',
'noscript',
'ol',
'p',
'pre',
'table',
'ul',
)
class EPUBOutput(OutputFormatPlugin):
name = 'EPUB Output'
author = 'Kovid Goyal'
file_type = 'epub'
commit_name = 'epub_output'
ui_data = {'versions': ('2', '3')}
options = {
OptionRecommendation(name='extract_to',
help=_('Extract the contents of the generated %s file to the '
'specified directory. The contents of the directory are first '
'deleted, so be careful.') % 'EPUB'),
OptionRecommendation(name='dont_split_on_page_breaks',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Turn off splitting at page breaks. Normally, input '
'files are automatically split at every page break into '
'two files. This gives an output e-book that can be '
'parsed faster and with less resources. However, '
'splitting is slow and if your source file contains a '
'very large number of page breaks, you should turn off '
'splitting on page breaks.'
)
),
OptionRecommendation(name='flow_size', recommended_value=260,
help=_('Split all HTML files larger than this size (in KB). '
'This is necessary as most EPUB readers cannot handle large '
'file sizes. The default of %defaultKB is the size required '
'for Adobe Digital Editions. Set to 0 to disable size based splitting.')
),
OptionRecommendation(name='no_default_epub_cover', recommended_value=False,
help=_('Normally, if the input file has no cover and you don\'t'
' specify one, a default cover is generated with the title, '
'authors, etc. This option disables the generation of this cover.')
),
OptionRecommendation(name='no_svg_cover', recommended_value=False,
help=_('Do not use SVG for the book cover. Use this option if '
'your EPUB is going to be used on a device that does not '
'support SVG, like the iPhone or the JetBook Lite. '
'Without this option, such devices will display the cover '
'as a blank page.')
),
OptionRecommendation(name='preserve_cover_aspect_ratio',
recommended_value=False, help=_(
'When using an SVG cover, this option will cause the cover to scale '
'to cover the available screen area, but still preserve its aspect ratio '
'(ratio of width to height). That means there may be white borders '
'at the sides or top and bottom of the image, but the image will '
'never be distorted. Without this option the image may be slightly '
'distorted, but there will be no borders.'
)
),
OptionRecommendation(name='epub_flatten', recommended_value=False,
help=_('This option is needed only if you intend to use the EPUB'
' with FBReaderJ. It will flatten the file system inside the'
' EPUB, putting all files into the top level.')
),
OptionRecommendation(name='epub_inline_toc', recommended_value=False,
help=_('Insert an inline Table of Contents that will appear as part of the main book content.')
),
OptionRecommendation(name='epub_toc_at_end', recommended_value=False,
help=_('Put the inserted inline Table of Contents at the end of the book instead of the start.')
),
OptionRecommendation(name='toc_title', recommended_value=None,
help=_('Title for any generated in-line table of contents.')
),
OptionRecommendation(name='epub_version', recommended_value='2', choices=ui_data['versions'],
help=_('The version of the EPUB file to generate. EPUB 2 is the'
' most widely compatible, only use EPUB 3 if you know you'
' actually need it.')
),
}
recommendations = {('pretty_print', True, OptionRecommendation.HIGH)}
def workaround_webkit_quirks(self): # {{{
from calibre.ebooks.oeb.base import XPath
for x in self.oeb.spine:
root = x.data
body = XPath('//h:body')(root)
if body:
body = body[0]
if not hasattr(body, 'xpath'):
continue
for pre in XPath('//h:pre')(body):
if not pre.text and len(pre) == 0:
pre.tag = 'div'
# }}}
def upshift_markup(self): # {{{
'Upgrade markup to comply with XHTML 1.1 where possible'
from calibre.ebooks.oeb.base import XPath, XML
for x in self.oeb.spine:
root = x.data
if (not root.get(XML('lang'))) and (root.get('lang')):
root.set(XML('lang'), root.get('lang'))
body = XPath('//h:body')(root)
if body:
body = body[0]
if not hasattr(body, 'xpath'):
continue
for u in XPath('//h:u')(root):
u.tag = 'span'
seen_ids, seen_names = set(), set()
for x in XPath('//*[@id or @name]')(root):
eid, name = x.get('id', None), x.get('name', None)
if eid:
if eid in seen_ids:
del x.attrib['id']
else:
seen_ids.add(eid)
if name:
if name in seen_names:
del x.attrib['name']
else:
seen_names.add(name)
# }}}
def convert(self, oeb, output_path, input_plugin, opts, log):
self.log, self.opts, self.oeb = log, opts, oeb
if self.opts.epub_inline_toc:
from calibre.ebooks.mobi.writer8.toc import TOCAdder
opts.mobi_toc_at_start = not opts.epub_toc_at_end
opts.mobi_passthrough = False
opts.no_inline_toc = False
TOCAdder(oeb, opts, replace_previous_inline_toc=True, ignore_existing_toc=True)
if self.opts.epub_flatten:
from calibre.ebooks.oeb.transforms.filenames import FlatFilenames
FlatFilenames()(oeb, opts)
else:
from calibre.ebooks.oeb.transforms.filenames import UniqueFilenames
UniqueFilenames()(oeb, opts)
self.workaround_ade_quirks()
self.workaround_webkit_quirks()
self.upshift_markup()
from calibre.ebooks.oeb.transforms.rescale import RescaleImages
RescaleImages(check_colorspaces=True)(oeb, opts)
from calibre.ebooks.oeb.transforms.split import Split
split = Split(not self.opts.dont_split_on_page_breaks,
max_flow_size=self.opts.flow_size*1024
)
split(self.oeb, self.opts)
from calibre.ebooks.oeb.transforms.cover import CoverManager
cm = CoverManager(
no_default_cover=self.opts.no_default_epub_cover,
no_svg_cover=self.opts.no_svg_cover,
preserve_aspect_ratio=self.opts.preserve_cover_aspect_ratio)
cm(self.oeb, self.opts, self.log)
self.workaround_sony_quirks()
if self.oeb.toc.count() == 0:
self.log.warn('This EPUB file has no Table of Contents. '
'Creating a default TOC')
first = next(iter(self.oeb.spine))
self.oeb.toc.add(_('Start'), first.href)
from calibre.ebooks.oeb.base import OPF
identifiers = oeb.metadata['identifier']
uuid = None
for x in identifiers:
if x.get(OPF('scheme'), None).lower() == 'uuid' or unicode_type(x).startswith('urn:uuid:'):
uuid = unicode_type(x).split(':')[-1]
break
encrypted_fonts = getattr(input_plugin, 'encrypted_fonts', [])
if uuid is None:
self.log.warn('No UUID identifier found')
from uuid import uuid4
uuid = unicode_type(uuid4())
oeb.metadata.add('identifier', uuid, scheme='uuid', id=uuid)
if encrypted_fonts and not uuid.startswith('urn:uuid:'):
# Apparently ADE requires this value to start with urn:uuid:
# for some absurd reason, or it will throw a hissy fit and refuse
# to use the obfuscated fonts.
for x in identifiers:
if unicode_type(x) == uuid:
x.content = 'urn:uuid:'+uuid
with TemporaryDirectory('_epub_output') as tdir:
from calibre.customize.ui import plugin_for_output_format
metadata_xml = None
extra_entries = []
if self.is_periodical:
if self.opts.output_profile.epub_periodical_format == 'sony':
from calibre.ebooks.epub.periodical import sony_metadata
metadata_xml, atom_xml = sony_metadata(oeb)
extra_entries = [('atom.xml', 'application/atom+xml', atom_xml)]
oeb_output = plugin_for_output_format('oeb')
oeb_output.convert(oeb, tdir, input_plugin, opts, log)
opf = [x for x in os.listdir(tdir) if x.endswith('.opf')][0]
self.condense_ncx([os.path.join(tdir, x) for x in os.listdir(tdir)
if x.endswith('.ncx')][0])
if self.opts.epub_version == '3':
self.upgrade_to_epub3(tdir, opf)
encryption = None
if encrypted_fonts:
encryption = self.encrypt_fonts(encrypted_fonts, tdir, uuid)
from calibre.ebooks.epub import initialize_container
with initialize_container(output_path, os.path.basename(opf),
extra_entries=extra_entries) as epub:
epub.add_dir(tdir)
if encryption is not None:
epub.writestr('META-INF/encryption.xml', as_bytes(encryption))
if metadata_xml is not None:
epub.writestr('META-INF/metadata.xml',
metadata_xml.encode('utf-8'))
if opts.extract_to is not None:
from calibre.utils.zipfile import ZipFile
if os.path.exists(opts.extract_to):
if os.path.isdir(opts.extract_to):
shutil.rmtree(opts.extract_to)
else:
os.remove(opts.extract_to)
os.mkdir(opts.extract_to)
with ZipFile(output_path) as zf:
zf.extractall(path=opts.extract_to)
self.log.info('EPUB extracted to', opts.extract_to)
def upgrade_to_epub3(self, tdir, opf):
self.log.info('Upgrading to EPUB 3...')
from calibre.ebooks.epub import simple_container_xml
from calibre.ebooks.oeb.polish.cover import fix_conversion_titlepage_links_in_nav
try:
os.mkdir(os.path.join(tdir, 'META-INF'))
except EnvironmentError:
pass
with open(os.path.join(tdir, 'META-INF', 'container.xml'), 'wb') as f:
f.write(simple_container_xml(os.path.basename(opf)).encode('utf-8'))
from calibre.ebooks.oeb.polish.container import EpubContainer
container = EpubContainer(tdir, self.log)
from calibre.ebooks.oeb.polish.upgrade import epub_2_to_3
existing_nav = getattr(self.opts, 'epub3_nav_parsed', None)
nav_href = getattr(self.opts, 'epub3_nav_href', None)
previous_nav = (nav_href, existing_nav) if existing_nav and nav_href else None
epub_2_to_3(container, self.log.info, previous_nav=previous_nav)
fix_conversion_titlepage_links_in_nav(container)
container.commit()
os.remove(f.name)
try:
os.rmdir(os.path.join(tdir, 'META-INF'))
except EnvironmentError:
pass
def encrypt_fonts(self, uris, tdir, uuid): # {{{
from polyglot.binary import from_hex_bytes
key = re.sub(r'[^a-fA-F0-9]', '', uuid)
if len(key) < 16:
raise ValueError('UUID identifier %r is invalid'%uuid)
key = bytearray(from_hex_bytes((key + key)[:32]))
paths = []
with CurrentDir(tdir):
paths = [os.path.join(*x.split('/')) for x in uris]
uris = dict(zip(uris, paths))
fonts = []
for uri in list(uris.keys()):
path = uris[uri]
if not os.path.exists(path):
uris.pop(uri)
continue
self.log.debug('Encrypting font:', uri)
with lopen(path, 'r+b') as f:
data = f.read(1024)
if len(data) >= 1024:
data = bytearray(data)
f.seek(0)
f.write(bytes(bytearray(data[i] ^ key[i%16] for i in range(1024))))
else:
self.log.warn('Font', path, 'is invalid, ignoring')
if not isinstance(uri, unicode_type):
uri = uri.decode('utf-8')
fonts.append('''
<enc:EncryptedData>
<enc:EncryptionMethod Algorithm="http://ns.adobe.com/pdf/enc#RC"/>
<enc:CipherData>
<enc:CipherReference URI="%s"/>
</enc:CipherData>
</enc:EncryptedData>
'''%(uri.replace('"', '\\"')))
if fonts:
ans = '''<encryption
xmlns="urn:oasis:names:tc:opendocument:xmlns:container"
xmlns:enc="http://www.w3.org/2001/04/xmlenc#"
xmlns:deenc="http://ns.adobe.com/digitaleditions/enc">
'''
ans += '\n'.join(fonts)
ans += '\n</encryption>'
return ans
# }}}
def condense_ncx(self, ncx_path): # {{{
from lxml import etree
if not self.opts.pretty_print:
tree = etree.parse(ncx_path)
for tag in tree.getroot().iter(tag=etree.Element):
if tag.text:
tag.text = tag.text.strip()
if tag.tail:
tag.tail = tag.tail.strip()
compressed = etree.tostring(tree.getroot(), encoding='utf-8')
with open(ncx_path, 'wb') as f:
f.write(compressed)
# }}}
def workaround_ade_quirks(self): # {{{
'''
Perform various markup transforms to get the output to render correctly
in the quirky ADE.
'''
from calibre.ebooks.oeb.base import XPath, XHTML, barename, urlunquote
stylesheet = self.oeb.manifest.main_stylesheet
# ADE cries big wet tears when it encounters an invalid fragment
# identifier in the NCX toc.
frag_pat = re.compile(r'[-A-Za-z0-9_:.]+$')
for node in self.oeb.toc.iter():
href = getattr(node, 'href', None)
if hasattr(href, 'partition'):
base, _, frag = href.partition('#')
frag = urlunquote(frag)
if frag and frag_pat.match(frag) is None:
self.log.warn(
'Removing fragment identifier %r from TOC as Adobe Digital Editions cannot handle it'%frag)
node.href = base
for x in self.oeb.spine:
root = x.data
body = XPath('//h:body')(root)
if body:
body = body[0]
if hasattr(body, 'xpath'):
# remove <img> tags with empty src elements
bad = []
for x in XPath('//h:img')(body):
src = x.get('src', '').strip()
if src in ('', '#') or src.startswith('http:'):
bad.append(x)
for img in bad:
img.getparent().remove(img)
# Add id attribute to <a> tags that have name
for x in XPath('//h:a[@name]')(body):
if not x.get('id', False):
x.set('id', x.get('name'))
# The delightful epubcheck has started complaining about <a> tags that
# have name attributes.
x.attrib.pop('name')
# Replace <br> that are children of <body> as ADE doesn't handle them
for br in XPath('./h:br')(body):
if br.getparent() is None:
continue
try:
prior = next(br.itersiblings(preceding=True))
priortag = barename(prior.tag)
priortext = prior.tail
except:
priortag = 'body'
priortext = body.text
if priortext:
priortext = priortext.strip()
br.tag = XHTML('p')
br.text = '\u00a0'
style = br.get('style', '').split(';')
style = list(filter(None, map(lambda x: x.strip(), style)))
style.append('margin:0pt; border:0pt')
# If the prior tag is a block (including a <br> we replaced)
# then this <br> replacement should have a 1-line height.
# Otherwise it should have no height.
if not priortext and priortag in block_level_tags:
style.append('height:1em')
else:
style.append('height:0pt')
br.set('style', '; '.join(style))
for tag in XPath('//h:embed')(root):
tag.getparent().remove(tag)
for tag in XPath('//h:object')(root):
if tag.get('type', '').lower().strip() in {'image/svg+xml', 'application/svg+xml'}:
continue
tag.getparent().remove(tag)
for tag in XPath('//h:title|//h:style')(root):
if not tag.text:
tag.getparent().remove(tag)
for tag in XPath('//h:script')(root):
if (not tag.text and not tag.get('src', False) and tag.get('type', None) != 'text/x-mathjax-config'):
tag.getparent().remove(tag)
for tag in XPath('//h:body/descendant::h:script')(root):
tag.getparent().remove(tag)
formchildren = XPath('./h:input|./h:button|./h:textarea|'
'./h:label|./h:fieldset|./h:legend')
for tag in XPath('//h:form')(root):
if formchildren(tag):
tag.getparent().remove(tag)
else:
# Not a real form
tag.tag = XHTML('div')
for tag in XPath('//h:center')(root):
tag.tag = XHTML('div')
tag.set('style', 'text-align:center')
# ADE can't handle &amp; in an img url
for tag in XPath('//h:img[@src]')(root):
tag.set('src', tag.get('src', '').replace('&', ''))
# ADE whimpers in fright when it encounters a <td> outside a
# <table>
in_table = XPath('ancestor::h:table')
for tag in XPath('//h:td|//h:tr|//h:th')(root):
if not in_table(tag):
tag.tag = XHTML('div')
# ADE fails to render non breaking hyphens/soft hyphens/zero width spaces
special_chars = re.compile('[\u200b\u00ad]')
for elem in root.iterdescendants('*'):
if elem.text:
elem.text = special_chars.sub('', elem.text)
elem.text = elem.text.replace('\u2011', '-')
if elem.tail:
elem.tail = special_chars.sub('', elem.tail)
elem.tail = elem.tail.replace('\u2011', '-')
if stylesheet is not None:
# ADE doesn't render lists correctly if they have left margins
from css_parser.css import CSSRule
for lb in XPath('//h:ul[@class]|//h:ol[@class]')(root):
sel = '.'+lb.get('class')
for rule in stylesheet.data.cssRules.rulesOfType(CSSRule.STYLE_RULE):
if sel == rule.selectorList.selectorText:
rule.style.removeProperty('margin-left')
# padding-left breaks rendering in webkit and gecko
rule.style.removeProperty('padding-left')
# Change whitespace:pre to pre-wrap to accommodate readers that
# cannot scroll horizontally
for rule in stylesheet.data.cssRules.rulesOfType(CSSRule.STYLE_RULE):
style = rule.style
ws = style.getPropertyValue('white-space')
if ws == 'pre':
style.setProperty('white-space', 'pre-wrap')
# }}}
def workaround_sony_quirks(self): # {{{
'''
Perform toc link transforms to alleviate slow loading.
'''
from calibre.ebooks.oeb.base import urldefrag, XPath
from calibre.ebooks.oeb.polish.toc import item_at_top
def frag_is_at_top(root, frag):
elem = XPath('//*[@id="%s" or @name="%s"]'%(frag, frag))(root)
if elem:
elem = elem[0]
else:
return False
return item_at_top(elem)
def simplify_toc_entry(toc):
if toc.href:
href, frag = urldefrag(toc.href)
if frag:
for x in self.oeb.spine:
if x.href == href:
if frag_is_at_top(x.data, frag):
self.log.debug('Removing anchor from TOC href:',
href+'#'+frag)
toc.href = href
break
for x in toc:
simplify_toc_entry(x)
if self.oeb.toc:
simplify_toc_entry(self.oeb.toc)
# }}}

View File

@@ -0,0 +1,179 @@
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2008, Anatoly Shipitsin <norguhtar at gmail.com>'
"""
Convert .fb2 files to .lrf
"""
import os, re
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
from calibre import guess_type
from polyglot.builtins import iteritems, getcwd
FB2NS = 'http://www.gribuser.ru/xml/fictionbook/2.0'
FB21NS = 'http://www.gribuser.ru/xml/fictionbook/2.1'
class FB2Input(InputFormatPlugin):
name = 'FB2 Input'
author = 'Anatoly Shipitsin'
description = 'Convert FB2 and FBZ files to HTML'
file_types = {'fb2', 'fbz'}
commit_name = 'fb2_input'
recommendations = {
('level1_toc', '//h:h1', OptionRecommendation.MED),
('level2_toc', '//h:h2', OptionRecommendation.MED),
('level3_toc', '//h:h3', OptionRecommendation.MED),
}
options = {
OptionRecommendation(name='no_inline_fb2_toc',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Do not insert a Table of Contents at the beginning of the book.'
)
)}
def convert(self, stream, options, file_ext, log,
accelerators):
from lxml import etree
from calibre.utils.xml_parse import safe_xml_fromstring
from calibre.ebooks.metadata.fb2 import ensure_namespace, get_fb2_data
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.metadata.meta import get_metadata
from calibre.ebooks.oeb.base import XLINK_NS, XHTML_NS
from calibre.ebooks.chardet import xml_to_unicode
self.log = log
log.debug('Parsing XML...')
raw = get_fb2_data(stream)[0]
raw = raw.replace(b'\0', b'')
raw = xml_to_unicode(raw, strip_encoding_pats=True,
assume_utf8=True, resolve_entities=True)[0]
try:
doc = safe_xml_fromstring(raw)
except etree.XMLSyntaxError:
doc = safe_xml_fromstring(raw.replace('& ', '&amp;'))
if doc is None:
raise ValueError('The FB2 file is not valid XML')
doc = ensure_namespace(doc)
try:
fb_ns = doc.nsmap[doc.prefix]
except Exception:
fb_ns = FB2NS
NAMESPACES = {'f':fb_ns, 'l':XLINK_NS}
stylesheets = doc.xpath('//*[local-name() = "stylesheet" and @type="text/css"]')
css = ''
for s in stylesheets:
css += etree.tostring(s, encoding='unicode', method='text',
with_tail=False) + '\n\n'
if css:
import css_parser, logging
parser = css_parser.CSSParser(fetcher=None,
log=logging.getLogger('calibre.css'))
XHTML_CSS_NAMESPACE = '@namespace "%s";\n' % XHTML_NS
text = XHTML_CSS_NAMESPACE + css
log.debug('Parsing stylesheet...')
stylesheet = parser.parseString(text)
stylesheet.namespaces['h'] = XHTML_NS
css = stylesheet.cssText
if isinstance(css, bytes):
css = css.decode('utf-8', 'replace')
css = css.replace('h|style', 'h|span')
css = re.sub(r'name\s*=\s*', 'class=', css)
self.extract_embedded_content(doc)
log.debug('Converting XML to HTML...')
with open(P('templates/fb2.xsl'), 'rb') as f:
ss = f.read().decode('utf-8')
ss = ss.replace("__FB_NS__", fb_ns)
if options.no_inline_fb2_toc:
log('Disabling generation of inline FB2 TOC')
ss = re.compile(r'<!-- BUILD TOC -->.*<!-- END BUILD TOC -->',
re.DOTALL).sub('', ss)
styledoc = safe_xml_fromstring(ss)
transform = etree.XSLT(styledoc)
result = transform(doc)
# Handle links of type note and cite
notes = {a.get('href')[1:]: a for a in result.xpath('//a[@link_note and @href]') if a.get('href').startswith('#')}
cites = {a.get('link_cite'): a for a in result.xpath('//a[@link_cite]') if not a.get('href', '')}
all_ids = {x for x in result.xpath('//*/@id')}
for cite, a in iteritems(cites):
note = notes.get(cite, None)
if note:
c = 1
while 'cite%d' % c in all_ids:
c += 1
if not note.get('id', None):
note.set('id', 'cite%d' % c)
all_ids.add(note.get('id'))
a.set('href', '#%s' % note.get('id'))
for x in result.xpath('//*[@link_note or @link_cite]'):
x.attrib.pop('link_note', None)
x.attrib.pop('link_cite', None)
for img in result.xpath('//img[@src]'):
src = img.get('src')
img.set('src', self.binary_map.get(src, src))
index = transform.tostring(result)
with open('index.xhtml', 'wb') as f:
f.write(index.encode('utf-8'))
with open('inline-styles.css', 'wb') as f:
f.write(css.encode('utf-8'))
stream.seek(0)
mi = get_metadata(stream, 'fb2')
if not mi.title:
mi.title = _('Unknown')
if not mi.authors:
mi.authors = [_('Unknown')]
cpath = None
if mi.cover_data and mi.cover_data[1]:
with open('fb2_cover_calibre_mi.jpg', 'wb') as f:
f.write(mi.cover_data[1])
cpath = os.path.abspath('fb2_cover_calibre_mi.jpg')
else:
for img in doc.xpath('//f:coverpage/f:image', namespaces=NAMESPACES):
href = img.get('{%s}href'%XLINK_NS, img.get('href', None))
if href is not None:
if href.startswith('#'):
href = href[1:]
cpath = os.path.abspath(href)
break
opf = OPFCreator(getcwd(), mi)
entries = [(f2, guess_type(f2)[0]) for f2 in os.listdir(u'.')]
opf.create_manifest(entries)
opf.create_spine(['index.xhtml'])
if cpath:
opf.guide.set_cover(cpath)
with open('metadata.opf', 'wb') as f:
opf.render(f)
return os.path.join(getcwd(), 'metadata.opf')
def extract_embedded_content(self, doc):
from calibre.ebooks.fb2 import base64_decode
self.binary_map = {}
for elem in doc.xpath('./*'):
if elem.text and 'binary' in elem.tag and 'id' in elem.attrib:
ct = elem.get('content-type', '')
fname = elem.attrib['id']
ext = ct.rpartition('/')[-1].lower()
if ext in ('png', 'jpeg', 'jpg'):
if fname.lower().rpartition('.')[-1] not in {'jpg', 'jpeg',
'png'}:
fname += '.' + ext
self.binary_map[elem.get('id')] = fname
raw = elem.text.strip()
try:
data = base64_decode(raw)
except TypeError:
self.log.exception('Binary data with id=%s is corrupted, ignoring'%(
elem.get('id')))
else:
with open(fname, 'wb') as f:
f.write(data)

View File

@@ -0,0 +1,203 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import os
from calibre.customize.conversion import OutputFormatPlugin, OptionRecommendation
class FB2Output(OutputFormatPlugin):
name = 'FB2 Output'
author = 'John Schember'
file_type = 'fb2'
commit_name = 'fb2_output'
FB2_GENRES = [
# Science Fiction & Fantasy
'sf_history', # Alternative history
'sf_action', # Action
'sf_epic', # Epic
'sf_heroic', # Heroic
'sf_detective', # Detective
'sf_cyberpunk', # Cyberpunk
'sf_space', # Space
'sf_social', # Social#philosophical
'sf_horror', # Horror & mystic
'sf_humor', # Humor
'sf_fantasy', # Fantasy
'sf', # Science Fiction
# Detectives & Thrillers
'det_classic', # Classical detectives
'det_police', # Police Stories
'det_action', # Action
'det_irony', # Ironical detectives
'det_history', # Historical detectives
'det_espionage', # Espionage detectives
'det_crime', # Crime detectives
'det_political', # Political detectives
'det_maniac', # Maniacs
'det_hard', # Hard#boiled
'thriller', # Thrillers
'detective', # Detectives
# Prose
'prose_classic', # Classics prose
'prose_history', # Historical prose
'prose_contemporary', # Contemporary prose
'prose_counter', # Counterculture
'prose_rus_classic', # Russial classics prose
'prose_su_classics', # Soviet classics prose
# Romance
'love_contemporary', # Contemporary Romance
'love_history', # Historical Romance
'love_detective', # Detective Romance
'love_short', # Short Romance
'love_erotica', # Erotica
# Adventure
'adv_western', # Western
'adv_history', # History
'adv_indian', # Indians
'adv_maritime', # Maritime Fiction
'adv_geo', # Travel & geography
'adv_animal', # Nature & animals
'adventure', # Other
# Children's
'child_tale', # Fairy Tales
'child_verse', # Verses
'child_prose', # Prose
'child_sf', # Science Fiction
'child_det', # Detectives & Thrillers
'child_adv', # Adventures
'child_education', # Educational
'children', # Other
# Poetry & Dramaturgy
'poetry', # Poetry
'dramaturgy', # Dramaturgy
# Antique literature
'antique_ant', # Antique
'antique_european', # European
'antique_russian', # Old russian
'antique_east', # Old east
'antique_myths', # Myths. Legends. Epos
'antique', # Other
# Scientific#educational
'sci_history', # History
'sci_psychology', # Psychology
'sci_culture', # Cultural science
'sci_religion', # Religious studies
'sci_philosophy', # Philosophy
'sci_politics', # Politics
'sci_business', # Business literature
'sci_juris', # Jurisprudence
'sci_linguistic', # Linguistics
'sci_medicine', # Medicine
'sci_phys', # Physics
'sci_math', # Mathematics
'sci_chem', # Chemistry
'sci_biology', # Biology
'sci_tech', # Technical
'science', # Other
# Computers & Internet
'comp_www', # Internet
'comp_programming', # Programming
'comp_hard', # Hardware
'comp_soft', # Software
'comp_db', # Databases
'comp_osnet', # OS & Networking
'computers', # Other
# Reference
'ref_encyc', # Encyclopedias
'ref_dict', # Dictionaries
'ref_ref', # Reference
'ref_guide', # Guidebooks
'reference', # Other
# Nonfiction
'nonf_biography', # Biography & Memoirs
'nonf_publicism', # Publicism
'nonf_criticism', # Criticism
'design', # Art & design
'nonfiction', # Other
# Religion & Inspiration
'religion_rel', # Religion
'religion_esoterics', # Esoterics
'religion_self', # Self#improvement
'religion', # Other
# Humor
'humor_anecdote', # Anecdote (funny stories)
'humor_prose', # Prose
'humor_verse', # Verses
'humor', # Other
# Home & Family
'home_cooking', # Cooking
'home_pets', # Pets
'home_crafts', # Hobbies & Crafts
'home_entertain', # Entertaining
'home_health', # Health
'home_garden', # Garden
'home_diy', # Do it yourself
'home_sport', # Sports
'home_sex', # Erotica & sex
'home', # Other
]
ui_data = {
'sectionize': {
'toc': _('Section per entry in the ToC'),
'files': _('Section per file'),
'nothing': _('A single section')
},
'genres': FB2_GENRES,
}
options = {
OptionRecommendation(name='sectionize',
recommended_value='files', level=OptionRecommendation.LOW,
choices=list(ui_data['sectionize']),
help=_('Specify how sections are created:\n'
' * nothing: {nothing}\n'
' * files: {files}\n'
' * toc: {toc}\n'
'If ToC based generation fails, adjust the "Structure detection" and/or "Table of Contents" settings '
'(turn on "Force use of auto-generated Table of Contents").').format(**ui_data['sectionize'])
),
OptionRecommendation(name='fb2_genre',
recommended_value='antique', level=OptionRecommendation.LOW,
choices=FB2_GENRES,
help=(_('Genre for the book. Choices: %s\n\n See: ') % ', '.join(FB2_GENRES)
) + 'http://www.fictionbook.org/index.php/Eng:FictionBook_2.1_genres ' + _('for a complete list with descriptions.')),
}
def convert(self, oeb_book, output_path, input_plugin, opts, log):
from calibre.ebooks.oeb.transforms.jacket import linearize_jacket
from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer, Unavailable
from calibre.ebooks.fb2.fb2ml import FB2MLizer
try:
rasterizer = SVGRasterizer()
rasterizer(oeb_book, opts)
except Unavailable:
log.warn('SVG rasterizer unavailable, SVG will not be converted')
linearize_jacket(oeb_book)
fb2mlizer = FB2MLizer(log)
fb2_content = fb2mlizer.extract_content(oeb_book, opts)
close = False
if not hasattr(output_path, 'write'):
close = True
if not os.path.exists(os.path.dirname(output_path)) and os.path.dirname(output_path) != '':
os.makedirs(os.path.dirname(output_path))
out_stream = lopen(output_path, 'wb')
else:
out_stream = output_path
out_stream.seek(0)
out_stream.truncate()
out_stream.write(fb2_content.encode('utf-8', 'replace'))
if close:
out_stream.close()

View File

@@ -0,0 +1,316 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import re, tempfile, os
from functools import partial
from calibre.constants import islinux, isbsd
from calibre.customize.conversion import (InputFormatPlugin,
OptionRecommendation)
from calibre.utils.localization import get_lang
from calibre.utils.filenames import ascii_filename
from calibre.utils.imghdr import what
from polyglot.builtins import unicode_type, zip, getcwd, as_unicode
def sanitize_file_name(x):
ans = re.sub(r'\s+', ' ', re.sub(r'[?&=;#]', '_', ascii_filename(x))).strip().rstrip('.')
ans, ext = ans.rpartition('.')[::2]
return (ans.strip() + '.' + ext.strip()).rstrip('.')
class HTMLInput(InputFormatPlugin):
name = 'HTML Input'
author = 'Kovid Goyal'
description = 'Convert HTML and OPF files to an OEB'
file_types = {'opf', 'html', 'htm', 'xhtml', 'xhtm', 'shtm', 'shtml'}
commit_name = 'html_input'
options = {
OptionRecommendation(name='breadth_first',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Traverse links in HTML files breadth first. Normally, '
'they are traversed depth first.'
)
),
OptionRecommendation(name='max_levels',
recommended_value=5, level=OptionRecommendation.LOW,
help=_('Maximum levels of recursion when following links in '
'HTML files. Must be non-negative. 0 implies that no '
'links in the root HTML file are followed. Default is '
'%default.'
)
),
OptionRecommendation(name='dont_package',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Normally this input plugin re-arranges all the input '
'files into a standard folder hierarchy. Only use this option '
'if you know what you are doing as it can result in various '
'nasty side effects in the rest of the conversion pipeline.'
)
),
}
def convert(self, stream, opts, file_ext, log,
accelerators):
self._is_case_sensitive = None
basedir = getcwd()
self.opts = opts
fname = None
if hasattr(stream, 'name'):
basedir = os.path.dirname(stream.name)
fname = os.path.basename(stream.name)
if file_ext != 'opf':
if opts.dont_package:
raise ValueError('The --dont-package option is not supported for an HTML input file')
from calibre.ebooks.metadata.html import get_metadata
mi = get_metadata(stream)
if fname:
from calibre.ebooks.metadata.meta import metadata_from_filename
fmi = metadata_from_filename(fname)
fmi.smart_update(mi)
mi = fmi
oeb = self.create_oebbook(stream.name, basedir, opts, log, mi)
return oeb
from calibre.ebooks.conversion.plumber import create_oebbook
return create_oebbook(log, stream.name, opts,
encoding=opts.input_encoding)
def is_case_sensitive(self, path):
if getattr(self, '_is_case_sensitive', None) is not None:
return self._is_case_sensitive
if not path or not os.path.exists(path):
return islinux or isbsd
self._is_case_sensitive = not (os.path.exists(path.lower()) and os.path.exists(path.upper()))
return self._is_case_sensitive
def create_oebbook(self, htmlpath, basedir, opts, log, mi):
import uuid
from calibre.ebooks.conversion.plumber import create_oebbook
from calibre.ebooks.oeb.base import (DirContainer,
rewrite_links, urlnormalize, urldefrag, BINARY_MIME, OEB_STYLES,
xpath, urlquote)
from calibre import guess_type
from calibre.ebooks.oeb.transforms.metadata import \
meta_info_to_oeb_metadata
from calibre.ebooks.html.input import get_filelist
from calibre.ebooks.metadata import string_to_authors
from calibre.utils.localization import canonicalize_lang
import css_parser, logging
css_parser.log.setLevel(logging.WARN)
self.OEB_STYLES = OEB_STYLES
oeb = create_oebbook(log, None, opts, self,
encoding=opts.input_encoding, populate=False)
self.oeb = oeb
metadata = oeb.metadata
meta_info_to_oeb_metadata(mi, metadata, log)
if not metadata.language:
l = canonicalize_lang(getattr(opts, 'language', None))
if not l:
oeb.logger.warn('Language not specified')
l = get_lang().replace('_', '-')
metadata.add('language', l)
if not metadata.creator:
a = getattr(opts, 'authors', None)
if a:
a = string_to_authors(a)
if not a:
oeb.logger.warn('Creator not specified')
a = [self.oeb.translate(__('Unknown'))]
for aut in a:
metadata.add('creator', aut)
if not metadata.title:
oeb.logger.warn('Title not specified')
metadata.add('title', self.oeb.translate(__('Unknown')))
bookid = unicode_type(uuid.uuid4())
metadata.add('identifier', bookid, id='uuid_id', scheme='uuid')
for ident in metadata.identifier:
if 'id' in ident.attrib:
self.oeb.uid = metadata.identifier[0]
break
filelist = get_filelist(htmlpath, basedir, opts, log)
filelist = [f for f in filelist if not f.is_binary]
htmlfile_map = {}
for f in filelist:
path = f.path
oeb.container = DirContainer(os.path.dirname(path), log,
ignore_opf=True)
bname = os.path.basename(path)
id, href = oeb.manifest.generate(id='html', href=sanitize_file_name(bname))
htmlfile_map[path] = href
item = oeb.manifest.add(id, href, 'text/html')
if path == htmlpath and '%' in path:
bname = urlquote(bname)
item.html_input_href = bname
oeb.spine.add(item, True)
self.added_resources = {}
self.log = log
self.log('Normalizing filename cases')
for path, href in htmlfile_map.items():
if not self.is_case_sensitive(path):
path = path.lower()
self.added_resources[path] = href
self.urlnormalize, self.DirContainer = urlnormalize, DirContainer
self.urldefrag = urldefrag
self.guess_type, self.BINARY_MIME = guess_type, BINARY_MIME
self.log('Rewriting HTML links')
for f in filelist:
path = f.path
dpath = os.path.dirname(path)
oeb.container = DirContainer(dpath, log, ignore_opf=True)
href = htmlfile_map[path]
try:
item = oeb.manifest.hrefs[href]
except KeyError:
item = oeb.manifest.hrefs[urlnormalize(href)]
rewrite_links(item.data, partial(self.resource_adder, base=dpath))
for item in oeb.manifest.values():
if item.media_type in self.OEB_STYLES:
dpath = None
for path, href in self.added_resources.items():
if href == item.href:
dpath = os.path.dirname(path)
break
css_parser.replaceUrls(item.data,
partial(self.resource_adder, base=dpath))
toc = self.oeb.toc
self.oeb.auto_generated_toc = True
titles = []
headers = []
for item in self.oeb.spine:
if not item.linear:
continue
html = item.data
title = ''.join(xpath(html, '/h:html/h:head/h:title/text()'))
title = re.sub(r'\s+', ' ', title.strip())
if title:
titles.append(title)
headers.append('(unlabled)')
for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'):
expr = '/h:html/h:body//h:%s[position()=1]/text()'
header = ''.join(xpath(html, expr % tag))
header = re.sub(r'\s+', ' ', header.strip())
if header:
headers[-1] = header
break
use = titles
if len(titles) > len(set(titles)):
use = headers
for title, item in zip(use, self.oeb.spine):
if not item.linear:
continue
toc.add(title, item.href)
oeb.container = DirContainer(getcwd(), oeb.log, ignore_opf=True)
return oeb
def link_to_local_path(self, link_, base=None):
from calibre.ebooks.html.input import Link
if not isinstance(link_, unicode_type):
try:
link_ = link_.decode('utf-8', 'error')
except:
self.log.warn('Failed to decode link %r. Ignoring'%link_)
return None, None
try:
l = Link(link_, base if base else getcwd())
except:
self.log.exception('Failed to process link: %r'%link_)
return None, None
if l.path is None:
# Not a local resource
return None, None
link = l.path.replace('/', os.sep).strip()
frag = l.fragment
if not link:
return None, None
return link, frag
def resource_adder(self, link_, base=None):
from polyglot.urllib import quote
link, frag = self.link_to_local_path(link_, base=base)
if link is None:
return link_
try:
if base and not os.path.isabs(link):
link = os.path.join(base, link)
link = os.path.abspath(link)
except:
return link_
if not os.access(link, os.R_OK):
return link_
if os.path.isdir(link):
self.log.warn(link_, 'is a link to a directory. Ignoring.')
return link_
if not self.is_case_sensitive(tempfile.gettempdir()):
link = link.lower()
if link not in self.added_resources:
bhref = os.path.basename(link)
id, href = self.oeb.manifest.generate(id='added', href=sanitize_file_name(bhref))
guessed = self.guess_type(href)[0]
media_type = guessed or self.BINARY_MIME
if media_type == 'text/plain':
self.log.warn('Ignoring link to text file %r'%link_)
return None
if media_type == self.BINARY_MIME:
# Check for the common case, images
try:
img = what(link)
except EnvironmentError:
pass
else:
if img:
media_type = self.guess_type('dummy.'+img)[0] or self.BINARY_MIME
self.oeb.log.debug('Added', link)
self.oeb.container = self.DirContainer(os.path.dirname(link),
self.oeb.log, ignore_opf=True)
# Load into memory
item = self.oeb.manifest.add(id, href, media_type)
# bhref refers to an already existing file. The read() method of
# DirContainer will call unquote on it before trying to read the
# file, therefore we quote it here.
if isinstance(bhref, unicode_type):
bhref = bhref.encode('utf-8')
item.html_input_href = as_unicode(quote(bhref))
if guessed in self.OEB_STYLES:
item.override_css_fetch = partial(
self.css_import_handler, os.path.dirname(link))
item.data
self.added_resources[link] = href
nlink = self.added_resources[link]
if frag:
nlink = '#'.join((nlink, frag))
return nlink
def css_import_handler(self, base, href):
link, frag = self.link_to_local_path(href, base=base)
if link is None or not os.access(link, os.R_OK) or os.path.isdir(link):
return (None, None)
try:
with open(link, 'rb') as f:
raw = f.read().decode('utf-8', 'replace')
raw = self.oeb.css_preprocessor(raw, add_namespace=False)
except:
self.log.exception('Failed to read CSS file: %r'%link)
return (None, None)
return (None, raw)

View File

@@ -0,0 +1,226 @@
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL 3'
__copyright__ = '2010, Fabian Grassl <fg@jusmeum.de>'
__docformat__ = 'restructuredtext en'
import os, re, shutil
from os.path import dirname, abspath, relpath as _relpath, exists, basename
from calibre.customize.conversion import OutputFormatPlugin, OptionRecommendation
from calibre import CurrentDir
from calibre.ptempfile import PersistentTemporaryDirectory
from polyglot.builtins import unicode_type
def relpath(*args):
return _relpath(*args).replace(os.sep, '/')
class HTMLOutput(OutputFormatPlugin):
name = 'HTML Output'
author = 'Fabian Grassl'
file_type = 'zip'
commit_name = 'html_output'
options = {
OptionRecommendation(name='template_css',
help=_('CSS file used for the output instead of the default file')),
OptionRecommendation(name='template_html_index',
help=_('Template used for generation of the HTML index file instead of the default file')),
OptionRecommendation(name='template_html',
help=_('Template used for the generation of the HTML contents of the book instead of the default file')),
OptionRecommendation(name='extract_to',
help=_('Extract the contents of the generated ZIP file to the '
'specified directory. WARNING: The contents of the directory '
'will be deleted.')
),
}
recommendations = {('pretty_print', True, OptionRecommendation.HIGH)}
def generate_toc(self, oeb_book, ref_url, output_dir):
'''
Generate table of contents
'''
from lxml import etree
from polyglot.urllib import unquote
from calibre.ebooks.oeb.base import element
from calibre.utils.cleantext import clean_xml_chars
with CurrentDir(output_dir):
def build_node(current_node, parent=None):
if parent is None:
parent = etree.Element('ul')
elif len(current_node.nodes):
parent = element(parent, ('ul'))
for node in current_node.nodes:
point = element(parent, 'li')
href = relpath(abspath(unquote(node.href)), dirname(ref_url))
if isinstance(href, bytes):
href = href.decode('utf-8')
link = element(point, 'a', href=clean_xml_chars(href))
title = node.title
if isinstance(title, bytes):
title = title.decode('utf-8')
if title:
title = re.sub(r'\s+', ' ', title)
link.text = clean_xml_chars(title)
build_node(node, point)
return parent
wrap = etree.Element('div')
wrap.append(build_node(oeb_book.toc))
return wrap
def generate_html_toc(self, oeb_book, ref_url, output_dir):
from lxml import etree
root = self.generate_toc(oeb_book, ref_url, output_dir)
return etree.tostring(root, pretty_print=True, encoding='unicode',
xml_declaration=False)
def convert(self, oeb_book, output_path, input_plugin, opts, log):
from lxml import etree
from calibre.utils import zipfile
from templite import Templite
from polyglot.urllib import unquote
from calibre.ebooks.html.meta import EasyMeta
# read template files
if opts.template_html_index is not None:
with open(opts.template_html_index, 'rb') as f:
template_html_index_data = f.read()
else:
template_html_index_data = P('templates/html_export_default_index.tmpl', data=True)
if opts.template_html is not None:
with open(opts.template_html, 'rb') as f:
template_html_data = f.read()
else:
template_html_data = P('templates/html_export_default.tmpl', data=True)
if opts.template_css is not None:
with open(opts.template_css, 'rb') as f:
template_css_data = f.read()
else:
template_css_data = P('templates/html_export_default.css', data=True)
template_html_index_data = template_html_index_data.decode('utf-8')
template_html_data = template_html_data.decode('utf-8')
template_css_data = template_css_data.decode('utf-8')
self.log = log
self.opts = opts
meta = EasyMeta(oeb_book.metadata)
tempdir = os.path.realpath(PersistentTemporaryDirectory())
output_file = os.path.join(tempdir,
basename(re.sub(r'\.zip', '', output_path)+'.html'))
output_dir = re.sub(r'\.html', '', output_file)+'_files'
if not exists(output_dir):
os.makedirs(output_dir)
css_path = output_dir+os.sep+'calibreHtmlOutBasicCss.css'
with open(css_path, 'wb') as f:
f.write(template_css_data.encode('utf-8'))
with open(output_file, 'wb') as f:
html_toc = self.generate_html_toc(oeb_book, output_file, output_dir)
templite = Templite(template_html_index_data)
nextLink = oeb_book.spine[0].href
nextLink = relpath(output_dir+os.sep+nextLink, dirname(output_file))
cssLink = relpath(abspath(css_path), dirname(output_file))
tocUrl = relpath(output_file, dirname(output_file))
t = templite.render(has_toc=bool(oeb_book.toc.count()),
toc=html_toc, meta=meta, nextLink=nextLink,
tocUrl=tocUrl, cssLink=cssLink,
firstContentPageLink=nextLink)
if isinstance(t, unicode_type):
t = t.encode('utf-8')
f.write(t)
with CurrentDir(output_dir):
for item in oeb_book.manifest:
path = abspath(unquote(item.href))
dir = dirname(path)
if not exists(dir):
os.makedirs(dir)
if item.spine_position is not None:
with open(path, 'wb') as f:
pass
else:
with open(path, 'wb') as f:
f.write(item.bytes_representation)
item.unload_data_from_memory(memory=path)
for item in oeb_book.spine:
path = abspath(unquote(item.href))
dir = dirname(path)
root = item.data.getroottree()
# get & clean HTML <HEAD>-data
head = root.xpath('//h:head', namespaces={'h': 'http://www.w3.org/1999/xhtml'})[0]
head_content = etree.tostring(head, pretty_print=True, encoding='unicode')
head_content = re.sub(r'\<\/?head.*\>', '', head_content)
head_content = re.sub(re.compile(r'\<style.*\/style\>', re.M|re.S), '', head_content)
head_content = re.sub(r'<(title)([^>]*)/>', r'<\1\2></\1>', head_content)
# get & clean HTML <BODY>-data
body = root.xpath('//h:body', namespaces={'h': 'http://www.w3.org/1999/xhtml'})[0]
ebook_content = etree.tostring(body, pretty_print=True, encoding='unicode')
ebook_content = re.sub(r'\<\/?body.*\>', '', ebook_content)
ebook_content = re.sub(r'<(div|a|span)([^>]*)/>', r'<\1\2></\1>', ebook_content)
# generate link to next page
if item.spine_position+1 < len(oeb_book.spine):
nextLink = oeb_book.spine[item.spine_position+1].href
nextLink = relpath(abspath(nextLink), dir)
else:
nextLink = None
# generate link to previous page
if item.spine_position > 0:
prevLink = oeb_book.spine[item.spine_position-1].href
prevLink = relpath(abspath(prevLink), dir)
else:
prevLink = None
cssLink = relpath(abspath(css_path), dir)
tocUrl = relpath(output_file, dir)
firstContentPageLink = oeb_book.spine[0].href
# render template
templite = Templite(template_html_data)
toc = lambda: self.generate_html_toc(oeb_book, path, output_dir)
t = templite.render(ebookContent=ebook_content,
prevLink=prevLink, nextLink=nextLink,
has_toc=bool(oeb_book.toc.count()), toc=toc,
tocUrl=tocUrl, head_content=head_content,
meta=meta, cssLink=cssLink,
firstContentPageLink=firstContentPageLink)
# write html to file
with open(path, 'wb') as f:
f.write(t.encode('utf-8'))
item.unload_data_from_memory(memory=path)
zfile = zipfile.ZipFile(output_path, "w")
zfile.add_dir(output_dir, basename(output_dir))
zfile.write(output_file, basename(output_file), zipfile.ZIP_DEFLATED)
if opts.extract_to:
if os.path.exists(opts.extract_to):
shutil.rmtree(opts.extract_to)
os.makedirs(opts.extract_to)
zfile.extractall(opts.extract_to)
self.log('Zip file extracted to', opts.extract_to)
zfile.close()
# cleanup temp dir
shutil.rmtree(tempdir)

View File

@@ -0,0 +1,133 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL 3'
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import os
from calibre import guess_type
from calibre.customize.conversion import InputFormatPlugin
from polyglot.builtins import getcwd
class HTMLZInput(InputFormatPlugin):
name = 'HTLZ Input'
author = 'John Schember'
description = 'Convert HTML files to HTML'
file_types = {'htmlz'}
commit_name = 'htmlz_input'
def convert(self, stream, options, file_ext, log,
accelerators):
from calibre.ebooks.chardet import xml_to_unicode
from calibre.ebooks.metadata.opf2 import OPF
from calibre.utils.zipfile import ZipFile
self.log = log
html = u''
top_levels = []
# Extract content from zip archive.
zf = ZipFile(stream)
zf.extractall()
# Find the HTML file in the archive. It needs to be
# top level.
index = u''
multiple_html = False
# Get a list of all top level files in the archive.
for x in os.listdir(u'.'):
if os.path.isfile(x):
top_levels.append(x)
# Try to find an index. file.
for x in top_levels:
if x.lower() in (u'index.html', u'index.xhtml', u'index.htm'):
index = x
break
# Look for multiple HTML files in the archive. We look at the
# top level files only as only they matter in HTMLZ.
for x in top_levels:
if os.path.splitext(x)[1].lower() in (u'.html', u'.xhtml', u'.htm'):
# Set index to the first HTML file found if it's not
# called index.
if not index:
index = x
else:
multiple_html = True
# Warn the user if there multiple HTML file in the archive. HTMLZ
# supports a single HTML file. A conversion with a multiple HTML file
# HTMLZ archive probably won't turn out as the user expects. With
# Multiple HTML files ZIP input should be used in place of HTMLZ.
if multiple_html:
log.warn(_('Multiple HTML files found in the archive. Only %s will be used.') % index)
if index:
with open(index, 'rb') as tf:
html = tf.read()
else:
raise Exception(_('No top level HTML file found.'))
if not html:
raise Exception(_('Top level HTML file %s is empty') % index)
# Encoding
if options.input_encoding:
ienc = options.input_encoding
else:
ienc = xml_to_unicode(html[:4096])[-1]
html = html.decode(ienc, 'replace')
# Run the HTML through the html processing plugin.
from calibre.customize.ui import plugin_for_input_format
html_input = plugin_for_input_format('html')
for opt in html_input.options:
setattr(options, opt.option.name, opt.recommended_value)
options.input_encoding = 'utf-8'
base = getcwd()
htmlfile = os.path.join(base, u'index.html')
c = 0
while os.path.exists(htmlfile):
c += 1
htmlfile = u'index%d.html'%c
with open(htmlfile, 'wb') as f:
f.write(html.encode('utf-8'))
odi = options.debug_pipeline
options.debug_pipeline = None
# Generate oeb from html conversion.
with open(htmlfile, 'rb') as f:
oeb = html_input.convert(f, options, 'html', log,
{})
options.debug_pipeline = odi
os.remove(htmlfile)
# Set metadata from file.
from calibre.customize.ui import get_file_type_metadata
from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
mi = get_file_type_metadata(stream, file_ext)
meta_info_to_oeb_metadata(mi, oeb.metadata, log)
# Get the cover path from the OPF.
cover_path = None
opf = None
for x in top_levels:
if os.path.splitext(x)[1].lower() == u'.opf':
opf = x
break
if opf:
opf = OPF(opf, basedir=getcwd())
cover_path = opf.raster_cover or opf.cover
# Set the cover.
if cover_path:
cdata = None
with open(os.path.join(getcwd(), cover_path), 'rb') as cf:
cdata = cf.read()
cover_name = os.path.basename(cover_path)
id, href = oeb.manifest.generate('cover', cover_name)
oeb.manifest.add(id, href, guess_type(cover_name)[0], data=cdata)
oeb.guide.add('cover', 'Cover', href)
return oeb

View File

@@ -0,0 +1,136 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL 3'
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import io
import os
from calibre.customize.conversion import OutputFormatPlugin, \
OptionRecommendation
from calibre.ptempfile import TemporaryDirectory
from polyglot.builtins import unicode_type
class HTMLZOutput(OutputFormatPlugin):
name = 'HTMLZ Output'
author = 'John Schember'
file_type = 'htmlz'
commit_name = 'htmlz_output'
ui_data = {
'css_choices': {
'class': _('Use CSS classes'),
'inline': _('Use the style attribute'),
'tag': _('Use HTML tags wherever possible')
},
'sheet_choices': {
'external': _('Use an external CSS file'),
'inline': _('Use a <style> tag in the HTML file')
}
}
options = {
OptionRecommendation(name='htmlz_css_type', recommended_value='class',
level=OptionRecommendation.LOW,
choices=list(ui_data['css_choices']),
help=_('Specify the handling of CSS. Default is class.\n'
'class: {class}\n'
'inline: {inline}\n'
'tag: {tag}'
).format(**ui_data['css_choices'])),
OptionRecommendation(name='htmlz_class_style', recommended_value='external',
level=OptionRecommendation.LOW,
choices=list(ui_data['sheet_choices']),
help=_('How to handle the CSS when using css-type = \'class\'.\n'
'Default is external.\n'
'external: {external}\n'
'inline: {inline}'
).format(**ui_data['sheet_choices'])),
OptionRecommendation(name='htmlz_title_filename',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('If set this option causes the file name of the HTML file'
' inside the HTMLZ archive to be based on the book title.')
),
}
def convert(self, oeb_book, output_path, input_plugin, opts, log):
from lxml import etree
from calibre.ebooks.oeb.base import OEB_IMAGES, SVG_MIME
from calibre.ebooks.metadata.opf2 import OPF, metadata_to_opf
from calibre.utils.zipfile import ZipFile
from calibre.utils.filenames import ascii_filename
# HTML
if opts.htmlz_css_type == 'inline':
from calibre.ebooks.htmlz.oeb2html import OEB2HTMLInlineCSSizer
OEB2HTMLizer = OEB2HTMLInlineCSSizer
elif opts.htmlz_css_type == 'tag':
from calibre.ebooks.htmlz.oeb2html import OEB2HTMLNoCSSizer
OEB2HTMLizer = OEB2HTMLNoCSSizer
else:
from calibre.ebooks.htmlz.oeb2html import OEB2HTMLClassCSSizer as OEB2HTMLizer
with TemporaryDirectory(u'_htmlz_output') as tdir:
htmlizer = OEB2HTMLizer(log)
html = htmlizer.oeb2html(oeb_book, opts)
fname = u'index'
if opts.htmlz_title_filename:
from calibre.utils.filenames import shorten_components_to
fname = shorten_components_to(100, (ascii_filename(unicode_type(oeb_book.metadata.title[0])),))[0]
with open(os.path.join(tdir, fname+u'.html'), 'wb') as tf:
if isinstance(html, unicode_type):
html = html.encode('utf-8')
tf.write(html)
# CSS
if opts.htmlz_css_type == 'class' and opts.htmlz_class_style == 'external':
with open(os.path.join(tdir, u'style.css'), 'wb') as tf:
tf.write(htmlizer.get_css(oeb_book))
# Images
images = htmlizer.images
if images:
if not os.path.exists(os.path.join(tdir, u'images')):
os.makedirs(os.path.join(tdir, u'images'))
for item in oeb_book.manifest:
if item.media_type in OEB_IMAGES and item.href in images:
if item.media_type == SVG_MIME:
data = etree.tostring(item.data, encoding='unicode')
else:
data = item.data
fname = os.path.join(tdir, u'images', images[item.href])
with open(fname, 'wb') as img:
img.write(data)
# Cover
cover_path = None
try:
cover_data = None
if oeb_book.metadata.cover:
term = oeb_book.metadata.cover[0].term
cover_data = oeb_book.guide[term].item.data
if cover_data:
from calibre.utils.img import save_cover_data_to
cover_path = os.path.join(tdir, u'cover.jpg')
with lopen(cover_path, 'w') as cf:
cf.write('')
save_cover_data_to(cover_data, cover_path)
except:
import traceback
traceback.print_exc()
# Metadata
with open(os.path.join(tdir, u'metadata.opf'), 'wb') as mdataf:
opf = OPF(io.BytesIO(etree.tostring(oeb_book.metadata.to_opf1(), encoding='UTF-8')))
mi = opf.to_book_metadata()
if cover_path:
mi.cover = u'cover.jpg'
mdataf.write(metadata_to_opf(mi))
htmlz = ZipFile(output_path, 'w')
htmlz.add_dir(tdir)

View File

@@ -0,0 +1,64 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from calibre.customize.conversion import InputFormatPlugin
class LITInput(InputFormatPlugin):
name = 'LIT Input'
author = 'Marshall T. Vandegrift'
description = 'Convert LIT files to HTML'
file_types = {'lit'}
commit_name = 'lit_input'
def convert(self, stream, options, file_ext, log,
accelerators):
from calibre.ebooks.lit.reader import LitReader
from calibre.ebooks.conversion.plumber import create_oebbook
self.log = log
return create_oebbook(log, stream, options, reader=LitReader)
def postprocess_book(self, oeb, opts, log):
from calibre.ebooks.oeb.base import XHTML_NS, XPath, XHTML
for item in oeb.spine:
root = item.data
if not hasattr(root, 'xpath'):
continue
for bad in ('metadata', 'guide'):
metadata = XPath('//h:'+bad)(root)
if metadata:
for x in metadata:
x.getparent().remove(x)
body = XPath('//h:body')(root)
if body:
body = body[0]
if len(body) == 1 and body[0].tag == XHTML('pre'):
pre = body[0]
from calibre.ebooks.txt.processor import convert_basic, \
separate_paragraphs_single_line
from calibre.ebooks.chardet import xml_to_unicode
from calibre.utils.xml_parse import safe_xml_fromstring
import copy
self.log('LIT file with all text in singe <pre> tag detected')
html = separate_paragraphs_single_line(pre.text)
html = convert_basic(html).replace('<html>',
'<html xmlns="%s">'%XHTML_NS)
html = xml_to_unicode(html, strip_encoding_pats=True,
resolve_entities=True)[0]
if opts.smarten_punctuation:
# SmartyPants skips text inside <pre> tags
from calibre.ebooks.conversion.preprocess import smarten_punctuation
html = smarten_punctuation(html, self.log)
root = safe_xml_fromstring(html)
body = XPath('//h:body')(root)
pre.tag = XHTML('div')
pre.text = ''
for elem in body:
ne = copy.deepcopy(elem)
pre.append(ne)

View File

@@ -0,0 +1,38 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from calibre.customize.conversion import OutputFormatPlugin
class LITOutput(OutputFormatPlugin):
name = 'LIT Output'
author = 'Marshall T. Vandegrift'
file_type = 'lit'
commit_name = 'lit_output'
def convert(self, oeb, output_path, input_plugin, opts, log):
self.log, self.opts, self.oeb = log, opts, oeb
from calibre.ebooks.oeb.transforms.manglecase import CaseMangler
from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer
from calibre.ebooks.oeb.transforms.htmltoc import HTMLTOCAdder
from calibre.ebooks.lit.writer import LitWriter
from calibre.ebooks.oeb.transforms.split import Split
split = Split(split_on_page_breaks=True, max_flow_size=0,
remove_css_pagebreaks=False)
split(self.oeb, self.opts)
tocadder = HTMLTOCAdder()
tocadder(oeb, opts)
mangler = CaseMangler()
mangler(oeb, opts)
rasterizer = SVGRasterizer()
rasterizer(oeb, opts)
lit = LitWriter(self.opts)
lit(oeb, output_path)

View File

@@ -0,0 +1,82 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import os, sys
from calibre.customize.conversion import InputFormatPlugin
class LRFInput(InputFormatPlugin):
name = 'LRF Input'
author = 'Kovid Goyal'
description = 'Convert LRF files to HTML'
file_types = {'lrf'}
commit_name = 'lrf_input'
def convert(self, stream, options, file_ext, log,
accelerators):
from calibre.ebooks.lrf.input import (MediaType, Styles, TextBlock,
Canvas, ImageBlock, RuledLine)
self.log = log
self.log('Generating XML')
from calibre.ebooks.lrf.lrfparser import LRFDocument
from calibre.utils.xml_parse import safe_xml_fromstring
from lxml import etree
d = LRFDocument(stream)
d.parse()
xml = d.to_xml(write_files=True)
if options.verbose > 2:
open(u'lrs.xml', 'wb').write(xml.encode('utf-8'))
doc = safe_xml_fromstring(xml)
char_button_map = {}
for x in doc.xpath('//CharButton[@refobj]'):
ro = x.get('refobj')
jump_button = doc.xpath('//*[@objid="%s"]'%ro)
if jump_button:
jump_to = jump_button[0].xpath('descendant::JumpTo[@refpage and @refobj]')
if jump_to:
char_button_map[ro] = '%s.xhtml#%s'%(jump_to[0].get('refpage'),
jump_to[0].get('refobj'))
plot_map = {}
for x in doc.xpath('//Plot[@refobj]'):
ro = x.get('refobj')
image = doc.xpath('//Image[@objid="%s" and @refstream]'%ro)
if image:
imgstr = doc.xpath('//ImageStream[@objid="%s" and @file]'%
image[0].get('refstream'))
if imgstr:
plot_map[ro] = imgstr[0].get('file')
self.log('Converting XML to HTML...')
styledoc = safe_xml_fromstring(P('templates/lrf.xsl', data=True))
media_type = MediaType()
styles = Styles()
text_block = TextBlock(styles, char_button_map, plot_map, log)
canvas = Canvas(doc, styles, text_block, log)
image_block = ImageBlock(canvas)
ruled_line = RuledLine()
extensions = {
('calibre', 'media-type') : media_type,
('calibre', 'text-block') : text_block,
('calibre', 'ruled-line') : ruled_line,
('calibre', 'styles') : styles,
('calibre', 'canvas') : canvas,
('calibre', 'image-block'): image_block,
}
transform = etree.XSLT(styledoc, extensions=extensions)
try:
result = transform(doc)
except RuntimeError:
sys.setrecursionlimit(5000)
result = transform(doc)
with open('content.opf', 'wb') as f:
f.write(result)
styles.write()
return os.path.abspath('content.opf')

View File

@@ -0,0 +1,196 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import sys, os
from calibre.customize.conversion import OutputFormatPlugin
from calibre.customize.conversion import OptionRecommendation
from polyglot.builtins import unicode_type
class LRFOptions(object):
def __init__(self, output, opts, oeb):
def f2s(f):
try:
return unicode_type(f[0])
except:
return ''
m = oeb.metadata
for x in ('left', 'top', 'right', 'bottom'):
attr = 'margin_'+x
val = getattr(opts, attr)
if val < 0:
setattr(opts, attr, 0)
self.title = None
self.author = self.publisher = _('Unknown')
self.title_sort = self.author_sort = ''
for x in m.creator:
if x.role == 'aut':
self.author = unicode_type(x)
fa = unicode_type(getattr(x, 'file_as', ''))
if fa:
self.author_sort = fa
for x in m.title:
if unicode_type(x.file_as):
self.title_sort = unicode_type(x.file_as)
self.freetext = f2s(m.description)
self.category = f2s(m.subject)
self.cover = None
self.use_metadata_cover = True
self.output = output
self.ignore_tables = opts.linearize_tables
if opts.disable_font_rescaling:
self.base_font_size = 0
else:
self.base_font_size = opts.base_font_size
self.blank_after_para = opts.insert_blank_line
self.use_spine = True
self.font_delta = 0
self.ignore_colors = False
from calibre.ebooks.lrf import PRS500_PROFILE
self.profile = PRS500_PROFILE
self.link_levels = sys.maxsize
self.link_exclude = '@'
self.no_links_in_toc = True
self.disable_chapter_detection = True
self.chapter_regex = 'dsadcdswcdec'
self.chapter_attr = '$,,$'
self.override_css = self._override_css = ''
self.page_break = 'h[12]'
self.force_page_break = '$'
self.force_page_break_attr = '$'
self.add_chapters_to_toc = False
self.baen = self.pdftohtml = self.book_designer = False
self.verbose = opts.verbose
self.encoding = 'utf-8'
self.lrs = False
self.minimize_memory_usage = False
self.autorotation = opts.enable_autorotation
self.header_separation = (self.profile.dpi/72.) * opts.header_separation
self.headerformat = opts.header_format
for x in ('top', 'bottom', 'left', 'right'):
setattr(self, x+'_margin',
(self.profile.dpi/72.) * float(getattr(opts, 'margin_'+x)))
for x in ('wordspace', 'header', 'header_format',
'minimum_indent', 'serif_family',
'render_tables_as_images', 'sans_family', 'mono_family',
'text_size_multiplier_for_rendered_tables'):
setattr(self, x, getattr(opts, x))
class LRFOutput(OutputFormatPlugin):
name = 'LRF Output'
author = 'Kovid Goyal'
file_type = 'lrf'
commit_name = 'lrf_output'
options = {
OptionRecommendation(name='enable_autorotation', recommended_value=False,
help=_('Enable auto-rotation of images that are wider than the screen width.')
),
OptionRecommendation(name='wordspace',
recommended_value=2.5, level=OptionRecommendation.LOW,
help=_('Set the space between words in pts. Default is %default')
),
OptionRecommendation(name='header', recommended_value=False,
help=_('Add a header to all the pages with title and author.')
),
OptionRecommendation(name='header_format', recommended_value="%t by %a",
help=_('Set the format of the header. %a is replaced by the author '
'and %t by the title. Default is %default')
),
OptionRecommendation(name='header_separation', recommended_value=0,
help=_('Add extra spacing below the header. Default is %default pt.')
),
OptionRecommendation(name='minimum_indent', recommended_value=0,
help=_('Minimum paragraph indent (the indent of the first line '
'of a paragraph) in pts. Default: %default')
),
OptionRecommendation(name='render_tables_as_images',
recommended_value=False,
help=_('This option has no effect')
),
OptionRecommendation(name='text_size_multiplier_for_rendered_tables',
recommended_value=1.0,
help=_('Multiply the size of text in rendered tables by this '
'factor. Default is %default')
),
OptionRecommendation(name='serif_family', recommended_value=None,
help=_('The serif family of fonts to embed')
),
OptionRecommendation(name='sans_family', recommended_value=None,
help=_('The sans-serif family of fonts to embed')
),
OptionRecommendation(name='mono_family', recommended_value=None,
help=_('The monospace family of fonts to embed')
),
}
recommendations = {
('change_justification', 'original', OptionRecommendation.HIGH)}
def convert_images(self, pages, opts, wide):
from calibre.ebooks.lrf.pylrs.pylrs import Book, BookSetting, ImageStream, ImageBlock
from uuid import uuid4
from calibre.constants import __appname__, __version__
width, height = (784, 1012) if wide else (584, 754)
ps = {}
ps['topmargin'] = 0
ps['evensidemargin'] = 0
ps['oddsidemargin'] = 0
ps['textwidth'] = width
ps['textheight'] = height
book = Book(title=opts.title, author=opts.author,
bookid=uuid4().hex,
publisher='%s %s'%(__appname__, __version__),
category=_('Comic'), pagestyledefault=ps,
booksetting=BookSetting(screenwidth=width, screenheight=height))
for page in pages:
imageStream = ImageStream(page)
_page = book.create_page()
_page.append(ImageBlock(refstream=imageStream,
blockwidth=width, blockheight=height, xsize=width,
ysize=height, x1=width, y1=height))
book.append(_page)
book.renderLrf(open(opts.output, 'wb'))
def flatten_toc(self):
from calibre.ebooks.oeb.base import TOC
nroot = TOC()
for x in self.oeb.toc.iterdescendants():
nroot.add(x.title, x.href)
self.oeb.toc = nroot
def convert(self, oeb, output_path, input_plugin, opts, log):
self.log, self.opts, self.oeb = log, opts, oeb
lrf_opts = LRFOptions(output_path, opts, oeb)
if input_plugin.is_image_collection:
self.convert_images(input_plugin.get_images(), lrf_opts,
getattr(opts, 'wide', False))
return
self.flatten_toc()
from calibre.ptempfile import TemporaryDirectory
with TemporaryDirectory('_lrf_output') as tdir:
from calibre.customize.ui import plugin_for_output_format
oeb_output = plugin_for_output_format('oeb')
oeb_output.convert(oeb, tdir, input_plugin, opts, log)
opf = [x for x in os.listdir(tdir) if x.endswith('.opf')][0]
from calibre.ebooks.lrf.html.convert_from import process_file
process_file(os.path.join(tdir, opf), lrf_opts, self.log)

View File

@@ -0,0 +1,66 @@
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL 3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import os
from calibre.customize.conversion import InputFormatPlugin
from polyglot.builtins import unicode_type
class MOBIInput(InputFormatPlugin):
name = 'MOBI Input'
author = 'Kovid Goyal'
description = 'Convert MOBI files (.mobi, .prc, .azw) to HTML'
file_types = {'mobi', 'prc', 'azw', 'azw3', 'pobi'}
commit_name = 'mobi_input'
def convert(self, stream, options, file_ext, log,
accelerators):
self.is_kf8 = False
self.mobi_is_joint = False
from calibre.ebooks.mobi.reader.mobi6 import MobiReader
from lxml import html
parse_cache = {}
try:
mr = MobiReader(stream, log, options.input_encoding,
options.debug_pipeline)
if mr.kf8_type is None:
mr.extract_content('.', parse_cache)
except:
mr = MobiReader(stream, log, options.input_encoding,
options.debug_pipeline, try_extra_data_fix=True)
if mr.kf8_type is None:
mr.extract_content('.', parse_cache)
if mr.kf8_type is not None:
log('Found KF8 MOBI of type %r'%mr.kf8_type)
if mr.kf8_type == 'joint':
self.mobi_is_joint = True
from calibre.ebooks.mobi.reader.mobi8 import Mobi8Reader
mr = Mobi8Reader(mr, log)
opf = os.path.abspath(mr())
self.encrypted_fonts = mr.encrypted_fonts
self.is_kf8 = True
return opf
raw = parse_cache.pop('calibre_raw_mobi_markup', False)
if raw:
if isinstance(raw, unicode_type):
raw = raw.encode('utf-8')
with lopen('debug-raw.html', 'wb') as f:
f.write(raw)
from calibre.ebooks.oeb.base import close_self_closing_tags
for f, root in parse_cache.items():
raw = html.tostring(root, encoding='utf-8', method='xml',
include_meta_content_type=False)
raw = close_self_closing_tags(raw)
with lopen(f, 'wb') as q:
q.write(raw)
accelerators['pagebreaks'] = '//h:div[@class="mbp_pagebreak"]'
return mr.created_opf_path

View File

@@ -0,0 +1,337 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from calibre.customize.conversion import (OutputFormatPlugin,
OptionRecommendation)
from polyglot.builtins import unicode_type
def remove_html_cover(oeb, log):
from calibre.ebooks.oeb.base import OEB_DOCS
if not oeb.metadata.cover \
or 'cover' not in oeb.guide:
return
href = oeb.guide['cover'].href
del oeb.guide['cover']
item = oeb.manifest.hrefs[href]
if item.spine_position is not None:
log.warn('Found an HTML cover: ', item.href, 'removing it.',
'If you find some content missing from the output MOBI, it '
'is because you misidentified the HTML cover in the input '
'document')
oeb.spine.remove(item)
if item.media_type in OEB_DOCS:
oeb.manifest.remove(item)
def extract_mobi(output_path, opts):
if opts.extract_to is not None:
from calibre.ebooks.mobi.debug.main import inspect_mobi
ddir = opts.extract_to
inspect_mobi(output_path, ddir=ddir)
class MOBIOutput(OutputFormatPlugin):
name = 'MOBI Output'
author = 'Kovid Goyal'
file_type = 'mobi'
commit_name = 'mobi_output'
ui_data = {'file_types': ['old', 'both', 'new']}
options = {
OptionRecommendation(name='prefer_author_sort',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('When present, use author sort field as author.')
),
OptionRecommendation(name='no_inline_toc',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Don\'t add Table of Contents to the book. Useful if '
'the book has its own table of contents.')),
OptionRecommendation(name='toc_title', recommended_value=None,
help=_('Title for any generated in-line table of contents.')
),
OptionRecommendation(name='dont_compress',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Disable compression of the file contents.')
),
OptionRecommendation(name='personal_doc', recommended_value='[PDOC]',
help=_('Tag for MOBI files to be marked as personal documents.'
' This option has no effect on the conversion. It is used'
' only when sending MOBI files to a device. If the file'
' being sent has the specified tag, it will be marked as'
' a personal document when sent to the Kindle.')
),
OptionRecommendation(name='mobi_ignore_margins',
recommended_value=False,
help=_('Ignore margins in the input document. If False, then '
'the MOBI output plugin will try to convert margins specified'
' in the input document, otherwise it will ignore them.')
),
OptionRecommendation(name='mobi_toc_at_start',
recommended_value=False,
help=_('When adding the Table of Contents to the book, add it at the start of the '
'book instead of the end. Not recommended.')
),
OptionRecommendation(name='extract_to',
help=_('Extract the contents of the generated %s file to the '
'specified directory. The contents of the directory are first '
'deleted, so be careful.') % 'MOBI'
),
OptionRecommendation(name='share_not_sync', recommended_value=False,
help=_('Enable sharing of book content via Facebook etc. '
' on the Kindle. WARNING: Using this feature means that '
' the book will not auto sync its last read position '
' on multiple devices. Complain to Amazon.')
),
OptionRecommendation(name='mobi_keep_original_images',
recommended_value=False,
help=_('By default calibre converts all images to JPEG format '
'in the output MOBI file. This is for maximum compatibility '
'as some older MOBI viewers have problems with other image '
'formats. This option tells calibre not to do this. '
'Useful if your document contains lots of GIF/PNG images that '
'become very large when converted to JPEG.')),
OptionRecommendation(name='mobi_file_type', choices=ui_data['file_types'], recommended_value='old',
help=_('By default calibre generates MOBI files that contain the '
'old MOBI 6 format. This format is compatible with all '
'devices. However, by changing this setting, you can tell '
'calibre to generate MOBI files that contain both MOBI 6 and '
'the new KF8 format, or only the new KF8 format. KF8 has '
'more features than MOBI 6, but only works with newer Kindles. '
'Allowed values: {}').format('old, both, new')),
}
def check_for_periodical(self):
if self.is_periodical:
self.periodicalize_toc()
self.check_for_masthead()
self.opts.mobi_periodical = True
else:
self.opts.mobi_periodical = False
def check_for_masthead(self):
found = 'masthead' in self.oeb.guide
if not found:
from calibre.ebooks import generate_masthead
self.oeb.log.debug('No masthead found in manifest, generating default mastheadImage...')
raw = generate_masthead(unicode_type(self.oeb.metadata['title'][0]))
id, href = self.oeb.manifest.generate('masthead', 'masthead')
self.oeb.manifest.add(id, href, 'image/gif', data=raw)
self.oeb.guide.add('masthead', 'Masthead Image', href)
else:
self.oeb.log.debug('Using mastheadImage supplied in manifest...')
def periodicalize_toc(self):
from calibre.ebooks.oeb.base import TOC
toc = self.oeb.toc
if not toc or len(self.oeb.spine) < 3:
return
if toc and toc[0].klass != 'periodical':
one, two = self.oeb.spine[0], self.oeb.spine[1]
self.log('Converting TOC for MOBI periodical indexing...')
articles = {}
if toc.depth() < 3:
# single section periodical
self.oeb.manifest.remove(one)
self.oeb.manifest.remove(two)
sections = [TOC(klass='section', title=_('All articles'),
href=self.oeb.spine[0].href)]
for x in toc:
sections[0].nodes.append(x)
else:
# multi-section periodical
self.oeb.manifest.remove(one)
sections = list(toc)
for i,x in enumerate(sections):
x.klass = 'section'
articles_ = list(x)
if articles_:
self.oeb.manifest.remove(self.oeb.manifest.hrefs[x.href])
x.href = articles_[0].href
for sec in sections:
articles[id(sec)] = []
for a in list(sec):
a.klass = 'article'
articles[id(sec)].append(a)
sec.nodes.remove(a)
root = TOC(klass='periodical', href=self.oeb.spine[0].href,
title=unicode_type(self.oeb.metadata.title[0]))
for s in sections:
if articles[id(s)]:
for a in articles[id(s)]:
s.nodes.append(a)
root.nodes.append(s)
for x in list(toc.nodes):
toc.nodes.remove(x)
toc.nodes.append(root)
# Fix up the periodical href to point to first section href
toc.nodes[0].href = toc.nodes[0].nodes[0].href
def convert(self, oeb, output_path, input_plugin, opts, log):
from calibre.ebooks.mobi.writer2.resources import Resources
self.log, self.opts, self.oeb = log, opts, oeb
mobi_type = opts.mobi_file_type
if self.is_periodical:
mobi_type = 'old' # Amazon does not support KF8 periodicals
create_kf8 = mobi_type in ('new', 'both')
remove_html_cover(self.oeb, self.log)
resources = Resources(oeb, opts, self.is_periodical,
add_fonts=create_kf8)
self.check_for_periodical()
if create_kf8:
from calibre.ebooks.mobi.writer8.cleanup import remove_duplicate_anchors
remove_duplicate_anchors(self.oeb)
# Split on pagebreaks so that the resulting KF8 is faster to load
from calibre.ebooks.oeb.transforms.split import Split
Split()(self.oeb, self.opts)
kf8 = self.create_kf8(resources, for_joint=mobi_type=='both'
) if create_kf8 else None
if mobi_type == 'new':
kf8.write(output_path)
extract_mobi(output_path, opts)
return
self.log('Creating MOBI 6 output')
self.write_mobi(input_plugin, output_path, kf8, resources)
def create_kf8(self, resources, for_joint=False):
from calibre.ebooks.mobi.writer8.main import create_kf8_book
return create_kf8_book(self.oeb, self.opts, resources,
for_joint=for_joint)
def write_mobi(self, input_plugin, output_path, kf8, resources):
from calibre.ebooks.mobi.mobiml import MobiMLizer
from calibre.ebooks.oeb.transforms.manglecase import CaseMangler
from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer, Unavailable
from calibre.ebooks.oeb.transforms.htmltoc import HTMLTOCAdder
from calibre.customize.ui import plugin_for_input_format
opts, oeb = self.opts, self.oeb
if not opts.no_inline_toc:
tocadder = HTMLTOCAdder(title=opts.toc_title, position='start' if
opts.mobi_toc_at_start else 'end')
tocadder(oeb, opts)
mangler = CaseMangler()
mangler(oeb, opts)
try:
rasterizer = SVGRasterizer()
rasterizer(oeb, opts)
except Unavailable:
self.log.warn('SVG rasterizer unavailable, SVG will not be converted')
else:
# Add rasterized SVG images
resources.add_extra_images()
if hasattr(self.oeb, 'inserted_metadata_jacket'):
self.workaround_fire_bugs(self.oeb.inserted_metadata_jacket)
mobimlizer = MobiMLizer(ignore_tables=opts.linearize_tables)
mobimlizer(oeb, opts)
write_page_breaks_after_item = input_plugin is not plugin_for_input_format('cbz')
from calibre.ebooks.mobi.writer2.main import MobiWriter
writer = MobiWriter(opts, resources, kf8,
write_page_breaks_after_item=write_page_breaks_after_item)
writer(oeb, output_path)
extract_mobi(output_path, opts)
def specialize_css_for_output(self, log, opts, item, stylizer):
from calibre.ebooks.mobi.writer8.cleanup import CSSCleanup
CSSCleanup(log, opts)(item, stylizer)
def workaround_fire_bugs(self, jacket):
# The idiotic Fire crashes when trying to render the table used to
# layout the jacket
from calibre.ebooks.oeb.base import XHTML
for table in jacket.data.xpath('//*[local-name()="table"]'):
table.tag = XHTML('div')
for tr in table.xpath('descendant::*[local-name()="tr"]'):
cols = tr.xpath('descendant::*[local-name()="td"]')
tr.tag = XHTML('div')
for td in cols:
td.tag = XHTML('span' if cols else 'div')
class AZW3Output(OutputFormatPlugin):
name = 'AZW3 Output'
author = 'Kovid Goyal'
file_type = 'azw3'
commit_name = 'azw3_output'
options = {
OptionRecommendation(name='prefer_author_sort',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('When present, use author sort field as author.')
),
OptionRecommendation(name='no_inline_toc',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Don\'t add Table of Contents to the book. Useful if '
'the book has its own table of contents.')),
OptionRecommendation(name='toc_title', recommended_value=None,
help=_('Title for any generated in-line table of contents.')
),
OptionRecommendation(name='dont_compress',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Disable compression of the file contents.')
),
OptionRecommendation(name='mobi_toc_at_start',
recommended_value=False,
help=_('When adding the Table of Contents to the book, add it at the start of the '
'book instead of the end. Not recommended.')
),
OptionRecommendation(name='extract_to',
help=_('Extract the contents of the generated %s file to the '
'specified directory. The contents of the directory are first '
'deleted, so be careful.') % 'AZW3'),
OptionRecommendation(name='share_not_sync', recommended_value=False,
help=_('Enable sharing of book content via Facebook etc. '
' on the Kindle. WARNING: Using this feature means that '
' the book will not auto sync its last read position '
' on multiple devices. Complain to Amazon.')
),
}
def convert(self, oeb, output_path, input_plugin, opts, log):
from calibre.ebooks.mobi.writer2.resources import Resources
from calibre.ebooks.mobi.writer8.main import create_kf8_book
from calibre.ebooks.mobi.writer8.cleanup import remove_duplicate_anchors
self.oeb, self.opts, self.log = oeb, opts, log
opts.mobi_periodical = self.is_periodical
passthrough = getattr(opts, 'mobi_passthrough', False)
remove_duplicate_anchors(oeb)
resources = Resources(self.oeb, self.opts, self.is_periodical,
add_fonts=True, process_images=False)
if not passthrough:
remove_html_cover(self.oeb, self.log)
# Split on pagebreaks so that the resulting KF8 is faster to load
from calibre.ebooks.oeb.transforms.split import Split
Split()(self.oeb, self.opts)
kf8 = create_kf8_book(self.oeb, self.opts, resources, for_joint=False)
kf8.write(output_path)
extract_mobi(output_path, opts)
def specialize_css_for_output(self, log, opts, item, stylizer):
from calibre.ebooks.mobi.writer8.cleanup import CSSCleanup
CSSCleanup(log, opts)(item, stylizer)

View File

@@ -0,0 +1,25 @@
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
'''
Convert an ODT file into a Open Ebook
'''
from calibre.customize.conversion import InputFormatPlugin
class ODTInput(InputFormatPlugin):
name = 'ODT Input'
author = 'Kovid Goyal'
description = 'Convert ODT (OpenOffice) files to HTML'
file_types = {'odt'}
commit_name = 'odt_input'
def convert(self, stream, options, file_ext, log,
accelerators):
from calibre.ebooks.odt.input import Extract
return Extract()(stream, '.', log)

View File

@@ -0,0 +1,122 @@
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL 3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import os, re
from calibre.customize.conversion import (OutputFormatPlugin,
OptionRecommendation)
from calibre import CurrentDir
class OEBOutput(OutputFormatPlugin):
name = 'OEB Output'
author = 'Kovid Goyal'
file_type = 'oeb'
commit_name = 'oeb_output'
recommendations = {('pretty_print', True, OptionRecommendation.HIGH)}
def convert(self, oeb_book, output_path, input_plugin, opts, log):
from polyglot.urllib import unquote
from lxml import etree
self.log, self.opts = log, opts
if not os.path.exists(output_path):
os.makedirs(output_path)
from calibre.ebooks.oeb.base import OPF_MIME, NCX_MIME, PAGE_MAP_MIME, OEB_STYLES
from calibre.ebooks.oeb.normalize_css import condense_sheet
with CurrentDir(output_path):
results = oeb_book.to_opf2(page_map=True)
for key in (OPF_MIME, NCX_MIME, PAGE_MAP_MIME):
href, root = results.pop(key, [None, None])
if root is not None:
if key == OPF_MIME:
try:
self.workaround_nook_cover_bug(root)
except:
self.log.exception('Something went wrong while trying to'
' workaround Nook cover bug, ignoring')
try:
self.workaround_pocketbook_cover_bug(root)
except:
self.log.exception('Something went wrong while trying to'
' workaround Pocketbook cover bug, ignoring')
self.migrate_lang_code(root)
raw = etree.tostring(root, pretty_print=True,
encoding='utf-8', xml_declaration=True)
if key == OPF_MIME:
# Needed as I can't get lxml to output opf:role and
# not output <opf:metadata> as well
raw = re.sub(br'(<[/]{0,1})opf:', br'\1', raw)
with lopen(href, 'wb') as f:
f.write(raw)
for item in oeb_book.manifest:
if (
not self.opts.expand_css and item.media_type in OEB_STYLES and hasattr(
item.data, 'cssText') and 'nook' not in self.opts.output_profile.short_name):
condense_sheet(item.data)
path = os.path.abspath(unquote(item.href))
dir = os.path.dirname(path)
if not os.path.exists(dir):
os.makedirs(dir)
with lopen(path, 'wb') as f:
f.write(item.bytes_representation)
item.unload_data_from_memory(memory=path)
def workaround_nook_cover_bug(self, root): # {{{
cov = root.xpath('//*[local-name() = "meta" and @name="cover" and'
' @content != "cover"]')
def manifest_items_with_id(id_):
return root.xpath('//*[local-name() = "manifest"]/*[local-name() = "item" '
' and @id="%s"]'%id_)
if len(cov) == 1:
cov = cov[0]
covid = cov.get('content', '')
if covid:
manifest_item = manifest_items_with_id(covid)
if len(manifest_item) == 1 and \
manifest_item[0].get('media-type',
'').startswith('image/'):
self.log.warn('The cover image has an id != "cover". Renaming'
' to work around bug in Nook Color')
from calibre.ebooks.oeb.base import uuid_id
newid = uuid_id()
for item in manifest_items_with_id('cover'):
item.set('id', newid)
for x in root.xpath('//*[@idref="cover"]'):
x.set('idref', newid)
manifest_item = manifest_item[0]
manifest_item.set('id', 'cover')
cov.set('content', 'cover')
# }}}
def workaround_pocketbook_cover_bug(self, root): # {{{
m = root.xpath('//*[local-name() = "manifest"]/*[local-name() = "item" '
' and @id="cover"]')
if len(m) == 1:
m = m[0]
p = m.getparent()
p.remove(m)
p.insert(0, m)
# }}}
def migrate_lang_code(self, root): # {{{
from calibre.utils.localization import lang_as_iso639_1
for lang in root.xpath('//*[local-name() = "language"]'):
clc = lang_as_iso639_1(lang.text)
if clc:
lang.text = clc
# }}}

View File

@@ -0,0 +1,37 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
from calibre.customize.conversion import InputFormatPlugin
from polyglot.builtins import getcwd
class PDBInput(InputFormatPlugin):
name = 'PDB Input'
author = 'John Schember'
description = 'Convert PDB to HTML'
file_types = {'pdb', 'updb'}
commit_name = 'pdb_input'
def convert(self, stream, options, file_ext, log,
accelerators):
from calibre.ebooks.pdb.header import PdbHeaderReader
from calibre.ebooks.pdb import PDBError, IDENTITY_TO_NAME, get_reader
header = PdbHeaderReader(stream)
Reader = get_reader(header.ident)
if Reader is None:
raise PDBError('No reader available for format within container.\n Identity is %s. Book type is %s' %
(header.ident, IDENTITY_TO_NAME.get(header.ident, _('Unknown'))))
log.debug('Detected ebook format as: %s with identity: %s' % (IDENTITY_TO_NAME[header.ident], header.ident))
reader = Reader(header, stream, log, options)
opf = reader.extract_content(getcwd())
return opf

View File

@@ -0,0 +1,64 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import os
from calibre.customize.conversion import OutputFormatPlugin, \
OptionRecommendation
from calibre.ebooks.pdb import PDBError, get_writer, ALL_FORMAT_WRITERS
class PDBOutput(OutputFormatPlugin):
name = 'PDB Output'
author = 'John Schember'
file_type = 'pdb'
commit_name = 'pdb_output'
ui_data = {'formats': tuple(ALL_FORMAT_WRITERS)}
options = {
OptionRecommendation(name='format', recommended_value='doc',
level=OptionRecommendation.LOW,
short_switch='f', choices=list(ALL_FORMAT_WRITERS),
help=(_('Format to use inside the pdb container. Choices are:') + ' %s' % sorted(ALL_FORMAT_WRITERS))),
OptionRecommendation(name='pdb_output_encoding', recommended_value='cp1252',
level=OptionRecommendation.LOW,
help=_('Specify the character encoding of the output document. '
'The default is cp1252. Note: This option is not honored by all '
'formats.')),
OptionRecommendation(name='inline_toc',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Add Table of Contents to beginning of the book.')),
}
def convert(self, oeb_book, output_path, input_plugin, opts, log):
close = False
if not hasattr(output_path, 'write'):
close = True
if not os.path.exists(os.path.dirname(output_path)) and os.path.dirname(output_path):
os.makedirs(os.path.dirname(output_path))
out_stream = lopen(output_path, 'wb')
else:
out_stream = output_path
Writer = get_writer(opts.format)
if Writer is None:
raise PDBError('No writer available for format %s.' % format)
setattr(opts, 'max_line_length', 0)
setattr(opts, 'force_max_line_length', False)
writer = Writer(opts, log)
out_stream.seek(0)
out_stream.truncate()
writer.write_content(oeb_book, out_stream, oeb_book.metadata)
if close:
out_stream.close()

View File

@@ -0,0 +1,82 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import os
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
from polyglot.builtins import as_bytes, getcwd
class PDFInput(InputFormatPlugin):
name = 'PDF Input'
author = 'Kovid Goyal and John Schember'
description = 'Convert PDF files to HTML'
file_types = {'pdf'}
commit_name = 'pdf_input'
options = {
OptionRecommendation(name='no_images', recommended_value=False,
help=_('Do not extract images from the document')),
OptionRecommendation(name='unwrap_factor', recommended_value=0.45,
help=_('Scale used to determine the length at which a line should '
'be unwrapped. Valid values are a decimal between 0 and 1. The '
'default is 0.45, just below the median line length.')),
OptionRecommendation(name='new_pdf_engine', recommended_value=False,
help=_('Use the new PDF conversion engine. Currently not operational.'))
}
def convert_new(self, stream, accelerators):
from calibre.ebooks.pdf.pdftohtml import pdftohtml
from calibre.utils.cleantext import clean_ascii_chars
from calibre.ebooks.pdf.reflow import PDFDocument
pdftohtml(getcwd(), stream.name, self.opts.no_images, as_xml=True)
with lopen('index.xml', 'rb') as f:
xml = clean_ascii_chars(f.read())
PDFDocument(xml, self.opts, self.log)
return os.path.join(getcwd(), 'metadata.opf')
def convert(self, stream, options, file_ext, log,
accelerators):
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.pdf.pdftohtml import pdftohtml
log.debug('Converting file to html...')
# The main html file will be named index.html
self.opts, self.log = options, log
if options.new_pdf_engine:
return self.convert_new(stream, accelerators)
pdftohtml(getcwd(), stream.name, options.no_images)
from calibre.ebooks.metadata.meta import get_metadata
log.debug('Retrieving document metadata...')
mi = get_metadata(stream, 'pdf')
opf = OPFCreator(getcwd(), mi)
manifest = [('index.html', None)]
images = os.listdir(getcwd())
images.remove('index.html')
for i in images:
manifest.append((i, None))
log.debug('Generating manifest...')
opf.create_manifest(manifest)
opf.create_spine(['index.html'])
log.debug('Rendering manifest...')
with lopen('metadata.opf', 'wb') as opffile:
opf.render(opffile)
if os.path.exists('toc.ncx'):
ncxid = opf.manifest.id_for_path('toc.ncx')
if ncxid:
with lopen('metadata.opf', 'r+b') as f:
raw = f.read().replace(b'<spine', b'<spine toc="%s"' % as_bytes(ncxid))
f.seek(0)
f.write(raw)
return os.path.join(getcwd(), 'metadata.opf')

View File

@@ -0,0 +1,256 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL 3'
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
'''
Convert OEB ebook format to PDF.
'''
import glob, os
from calibre.customize.conversion import (OutputFormatPlugin,
OptionRecommendation)
from calibre.ptempfile import TemporaryDirectory
from polyglot.builtins import iteritems, unicode_type
UNITS = ('millimeter', 'centimeter', 'point', 'inch' , 'pica' , 'didot',
'cicero', 'devicepixel')
PAPER_SIZES = ('a0', 'a1', 'a2', 'a3', 'a4', 'a5', 'a6', 'b0', 'b1',
'b2', 'b3', 'b4', 'b5', 'b6', 'legal', 'letter')
class PDFOutput(OutputFormatPlugin):
name = 'PDF Output'
author = 'Kovid Goyal'
file_type = 'pdf'
commit_name = 'pdf_output'
ui_data = {'paper_sizes': PAPER_SIZES, 'units': UNITS, 'font_types': ('serif', 'sans', 'mono')}
options = {
OptionRecommendation(name='use_profile_size', recommended_value=False,
help=_('Instead of using the paper size specified in the PDF Output options,'
' use a paper size corresponding to the current output profile.'
' Useful if you want to generate a PDF for viewing on a specific device.')),
OptionRecommendation(name='unit', recommended_value='inch',
level=OptionRecommendation.LOW, short_switch='u', choices=UNITS,
help=_('The unit of measure for page sizes. Default is inch. Choices '
'are {} '
'Note: This does not override the unit for margins!').format(', '.join(UNITS))),
OptionRecommendation(name='paper_size', recommended_value='letter',
level=OptionRecommendation.LOW, choices=PAPER_SIZES,
help=_('The size of the paper. This size will be overridden when a '
'non default output profile is used. Default is letter. Choices '
'are {}').format(', '.join(PAPER_SIZES))),
OptionRecommendation(name='custom_size', recommended_value=None,
help=_('Custom size of the document. Use the form widthxheight '
'e.g. `123x321` to specify the width and height. '
'This overrides any specified paper-size.')),
OptionRecommendation(name='preserve_cover_aspect_ratio',
recommended_value=False,
help=_('Preserve the aspect ratio of the cover, instead'
' of stretching it to fill the full first page of the'
' generated pdf.')),
OptionRecommendation(name='pdf_serif_family',
recommended_value='Times', help=_(
'The font family used to render serif fonts. Will work only if the font is available system-wide.')),
OptionRecommendation(name='pdf_sans_family',
recommended_value='Helvetica', help=_(
'The font family used to render sans-serif fonts. Will work only if the font is available system-wide.')),
OptionRecommendation(name='pdf_mono_family',
recommended_value='Courier', help=_(
'The font family used to render monospace fonts. Will work only if the font is available system-wide.')),
OptionRecommendation(name='pdf_standard_font', choices=ui_data['font_types'],
recommended_value='serif', help=_(
'The font family used to render monospace fonts')),
OptionRecommendation(name='pdf_default_font_size',
recommended_value=20, help=_(
'The default font size')),
OptionRecommendation(name='pdf_mono_font_size',
recommended_value=16, help=_(
'The default font size for monospaced text')),
OptionRecommendation(name='pdf_hyphenate', recommended_value=False,
help=_('Break long words at the end of lines. This can give the text at the right margin a more even appearance.')),
OptionRecommendation(name='pdf_mark_links', recommended_value=False,
help=_('Surround all links with a red box, useful for debugging.')),
OptionRecommendation(name='pdf_page_numbers', recommended_value=False,
help=_('Add page numbers to the bottom of every page in the generated PDF file. If you '
'specify a footer template, it will take precedence '
'over this option.')),
OptionRecommendation(name='pdf_footer_template', recommended_value=None,
help=_('An HTML template used to generate %s on every page.'
' The strings _PAGENUM_, _TITLE_, _AUTHOR_ and _SECTION_ will be replaced by their current values.')%_('footers')),
OptionRecommendation(name='pdf_header_template', recommended_value=None,
help=_('An HTML template used to generate %s on every page.'
' The strings _PAGENUM_, _TITLE_, _AUTHOR_ and _SECTION_ will be replaced by their current values.')%_('headers')),
OptionRecommendation(name='pdf_add_toc', recommended_value=False,
help=_('Add a Table of Contents at the end of the PDF that lists page numbers. '
'Useful if you want to print out the PDF. If this PDF is intended for electronic use, use the PDF Outline instead.')),
OptionRecommendation(name='toc_title', recommended_value=None,
help=_('Title for generated table of contents.')
),
OptionRecommendation(name='pdf_page_margin_left', recommended_value=72.0,
level=OptionRecommendation.LOW,
help=_('The size of the left page margin, in pts. Default is 72pt.'
' Overrides the common left page margin setting.')
),
OptionRecommendation(name='pdf_page_margin_top', recommended_value=72.0,
level=OptionRecommendation.LOW,
help=_('The size of the top page margin, in pts. Default is 72pt.'
' Overrides the common top page margin setting, unless set to zero.')
),
OptionRecommendation(name='pdf_page_margin_right', recommended_value=72.0,
level=OptionRecommendation.LOW,
help=_('The size of the right page margin, in pts. Default is 72pt.'
' Overrides the common right page margin setting, unless set to zero.')
),
OptionRecommendation(name='pdf_page_margin_bottom', recommended_value=72.0,
level=OptionRecommendation.LOW,
help=_('The size of the bottom page margin, in pts. Default is 72pt.'
' Overrides the common bottom page margin setting, unless set to zero.')
),
OptionRecommendation(name='pdf_use_document_margins', recommended_value=False,
help=_('Use the page margins specified in the input document via @page CSS rules.'
' This will cause the margins specified in the conversion settings to be ignored.'
' If the document does not specify page margins, the conversion settings will be used as a fallback.')
),
OptionRecommendation(name='pdf_page_number_map', recommended_value=None,
help=_('Adjust page numbers, as needed. Syntax is a JavaScript expression for the page number.'
' For example, "if (n < 3) 0; else n - 3;", where n is current page number.')
),
OptionRecommendation(name='uncompressed_pdf',
recommended_value=False, help=_(
'Generate an uncompressed PDF, useful for debugging.')
),
OptionRecommendation(name='pdf_odd_even_offset', recommended_value=0.0,
level=OptionRecommendation.LOW,
help=_(
'Shift the text horizontally by the specified offset (in pts).'
' On odd numbered pages, it is shifted to the right and on even'
' numbered pages to the left. Use negative numbers for the opposite'
' effect. Note that this setting is ignored on pages where the margins'
' are smaller than the specified offset. Shifting is done by setting'
' the PDF CropBox, not all software respects the CropBox.'
)
),
}
def specialize_options(self, log, opts, input_fmt):
# Ensure Qt is setup to be used with WebEngine
# specialize_options is called early enough in the pipeline
# that hopefully no Qt application has been constructed as yet
from PyQt5.QtWebEngineCore import QWebEngineUrlScheme
from PyQt5.QtWebEngineWidgets import QWebEnginePage # noqa
from calibre.gui2 import must_use_qt
from calibre.constants import FAKE_PROTOCOL
scheme = QWebEngineUrlScheme(FAKE_PROTOCOL.encode('ascii'))
scheme.setSyntax(QWebEngineUrlScheme.Syntax.Host)
scheme.setFlags(QWebEngineUrlScheme.SecureScheme)
QWebEngineUrlScheme.registerScheme(scheme)
must_use_qt()
self.input_fmt = input_fmt
if opts.pdf_use_document_margins:
# Prevent the conversion pipeline from overwriting document margins
opts.margin_left = opts.margin_right = opts.margin_top = opts.margin_bottom = -1
def convert(self, oeb_book, output_path, input_plugin, opts, log):
self.stored_page_margins = getattr(opts, '_stored_page_margins', {})
self.oeb = oeb_book
self.input_plugin, self.opts, self.log = input_plugin, opts, log
self.output_path = output_path
from calibre.ebooks.oeb.base import OPF, OPF2_NS
from lxml import etree
from io import BytesIO
package = etree.Element(OPF('package'),
attrib={'version': '2.0', 'unique-identifier': 'dummy'},
nsmap={None: OPF2_NS})
from calibre.ebooks.metadata.opf2 import OPF
self.oeb.metadata.to_opf2(package)
self.metadata = OPF(BytesIO(etree.tostring(package))).to_book_metadata()
self.cover_data = None
if input_plugin.is_image_collection:
log.debug('Converting input as an image collection...')
self.convert_images(input_plugin.get_images())
else:
log.debug('Converting input as a text based book...')
self.convert_text(oeb_book)
def convert_images(self, images):
from calibre.ebooks.pdf.image_writer import convert
convert(images, self.output_path, self.opts, self.metadata, self.report_progress)
def get_cover_data(self):
oeb = self.oeb
if (oeb.metadata.cover and unicode_type(oeb.metadata.cover[0]) in oeb.manifest.ids):
cover_id = unicode_type(oeb.metadata.cover[0])
item = oeb.manifest.ids[cover_id]
self.cover_data = item.data
def process_fonts(self):
''' Make sure all fonts are embeddable '''
from calibre.ebooks.oeb.base import urlnormalize
from calibre.utils.fonts.utils import remove_embed_restriction
processed = set()
for item in list(self.oeb.manifest):
if not hasattr(item.data, 'cssRules'):
continue
for i, rule in enumerate(item.data.cssRules):
if rule.type == rule.FONT_FACE_RULE:
try:
s = rule.style
src = s.getProperty('src').propertyValue[0].uri
except:
continue
path = item.abshref(src)
ff = self.oeb.manifest.hrefs.get(urlnormalize(path), None)
if ff is None:
continue
raw = nraw = ff.data
if path not in processed:
processed.add(path)
try:
nraw = remove_embed_restriction(raw)
except:
continue
if nraw != raw:
ff.data = nraw
self.oeb.container.write(path, nraw)
def convert_text(self, oeb_book):
import json
from calibre.ebooks.pdf.html_writer import convert
self.get_cover_data()
self.process_fonts()
if self.opts.pdf_use_document_margins and self.stored_page_margins:
for href, margins in iteritems(self.stored_page_margins):
item = oeb_book.manifest.hrefs.get(href)
if item is not None:
root = item.data
if hasattr(root, 'xpath') and margins:
root.set('data-calibre-pdf-output-page-margins', json.dumps(margins))
with TemporaryDirectory('_pdf_out') as oeb_dir:
from calibre.customize.ui import plugin_for_output_format
oeb_dir = os.path.realpath(oeb_dir)
oeb_output = plugin_for_output_format('oeb')
oeb_output.convert(oeb_book, oeb_dir, self.input_plugin, self.opts, self.log)
opfpath = glob.glob(os.path.join(oeb_dir, '*.opf'))[0]
convert(
opfpath, self.opts, metadata=self.metadata, output_path=self.output_path,
log=self.log, cover_data=self.cover_data, report_progress=self.report_progress
)

View File

@@ -0,0 +1,165 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import glob
import os
import shutil
from calibre.customize.conversion import InputFormatPlugin
from calibre.ptempfile import TemporaryDirectory
from polyglot.builtins import getcwd
class PMLInput(InputFormatPlugin):
name = 'PML Input'
author = 'John Schember'
description = 'Convert PML to OEB'
# pmlz is a zip file containing pml files and png images.
file_types = {'pml', 'pmlz'}
commit_name = 'pml_input'
def process_pml(self, pml_path, html_path, close_all=False):
from calibre.ebooks.pml.pmlconverter import PML_HTMLizer
pclose = False
hclose = False
if not hasattr(pml_path, 'read'):
pml_stream = lopen(pml_path, 'rb')
pclose = True
else:
pml_stream = pml_path
pml_stream.seek(0)
if not hasattr(html_path, 'write'):
html_stream = lopen(html_path, 'wb')
hclose = True
else:
html_stream = html_path
ienc = getattr(pml_stream, 'encoding', None)
if ienc is None:
ienc = 'cp1252'
if self.options.input_encoding:
ienc = self.options.input_encoding
self.log.debug('Converting PML to HTML...')
hizer = PML_HTMLizer()
html = hizer.parse_pml(pml_stream.read().decode(ienc), html_path)
html = '<html><head><title></title></head><body>%s</body></html>'%html
html_stream.write(html.encode('utf-8', 'replace'))
if pclose:
pml_stream.close()
if hclose:
html_stream.close()
return hizer.get_toc()
def get_images(self, stream, tdir, top_level=False):
images = []
imgs = []
if top_level:
imgs = glob.glob(os.path.join(tdir, '*.png'))
# Images not in top level try bookname_img directory because
# that's where Dropbook likes to see them.
if not imgs:
if hasattr(stream, 'name'):
imgs = glob.glob(os.path.join(tdir, os.path.splitext(os.path.basename(stream.name))[0] + '_img', '*.png'))
# No images in Dropbook location try generic images directory
if not imgs:
imgs = glob.glob(os.path.join(os.path.join(tdir, 'images'), '*.png'))
if imgs:
os.makedirs(os.path.join(getcwd(), 'images'))
for img in imgs:
pimg_name = os.path.basename(img)
pimg_path = os.path.join(getcwd(), 'images', pimg_name)
images.append('images/' + pimg_name)
shutil.copy(img, pimg_path)
return images
def convert(self, stream, options, file_ext, log,
accelerators):
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.utils.zipfile import ZipFile
self.options = options
self.log = log
pages, images = [], []
toc = TOC()
if file_ext == 'pmlz':
log.debug('De-compressing content to temporary directory...')
with TemporaryDirectory('_unpmlz') as tdir:
zf = ZipFile(stream)
zf.extractall(tdir)
pmls = glob.glob(os.path.join(tdir, '*.pml'))
for pml in pmls:
html_name = os.path.splitext(os.path.basename(pml))[0]+'.html'
html_path = os.path.join(getcwd(), html_name)
pages.append(html_name)
log.debug('Processing PML item %s...' % pml)
ttoc = self.process_pml(pml, html_path)
toc += ttoc
images = self.get_images(stream, tdir, True)
else:
toc = self.process_pml(stream, 'index.html')
pages.append('index.html')
if hasattr(stream, 'name'):
images = self.get_images(stream, os.path.abspath(os.path.dirname(stream.name)))
# We want pages to be orded alphabetically.
pages.sort()
manifest_items = []
for item in pages+images:
manifest_items.append((item, None))
from calibre.ebooks.metadata.meta import get_metadata
log.debug('Reading metadata from input file...')
mi = get_metadata(stream, 'pml')
if 'images/cover.png' in images:
mi.cover = 'images/cover.png'
opf = OPFCreator(getcwd(), mi)
log.debug('Generating manifest...')
opf.create_manifest(manifest_items)
opf.create_spine(pages)
opf.set_toc(toc)
with lopen('metadata.opf', 'wb') as opffile:
with lopen('toc.ncx', 'wb') as tocfile:
opf.render(opffile, tocfile, 'toc.ncx')
return os.path.join(getcwd(), 'metadata.opf')
def postprocess_book(self, oeb, opts, log):
from calibre.ebooks.oeb.base import XHTML, barename
for item in oeb.spine:
if hasattr(item.data, 'xpath'):
for heading in item.data.iterdescendants(*map(XHTML, 'h1 h2 h3 h4 h5 h6'.split())):
if not len(heading):
continue
span = heading[0]
if not heading.text and not span.text and not len(span) and barename(span.tag) == 'span':
if not heading.get('id') and span.get('id'):
heading.set('id', span.get('id'))
heading.text = span.tail
heading.remove(span)
if len(heading) == 1 and heading[0].get('style') == 'text-align: center; margin: auto;':
div = heading[0]
if barename(div.tag) == 'div' and not len(div) and not div.get('id') and not heading.get('style'):
heading.text = (heading.text or '') + (div.text or '') + (div.tail or '')
heading.remove(div)
heading.set('style', 'text-align: center')

View File

@@ -0,0 +1,77 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import os, io
from calibre.customize.conversion import (OutputFormatPlugin,
OptionRecommendation)
from calibre.ptempfile import TemporaryDirectory
from polyglot.builtins import unicode_type
class PMLOutput(OutputFormatPlugin):
name = 'PML Output'
author = 'John Schember'
file_type = 'pmlz'
commit_name = 'pml_output'
options = {
OptionRecommendation(name='pml_output_encoding', recommended_value='cp1252',
level=OptionRecommendation.LOW,
help=_('Specify the character encoding of the output document. '
'The default is cp1252.')),
OptionRecommendation(name='inline_toc',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Add Table of Contents to beginning of the book.')),
OptionRecommendation(name='full_image_depth',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Do not reduce the size or bit depth of images. Images '
'have their size and depth reduced by default to accommodate '
'applications that can not convert images on their '
'own such as Dropbook.')),
}
def convert(self, oeb_book, output_path, input_plugin, opts, log):
from calibre.ebooks.pml.pmlml import PMLMLizer
from calibre.utils.zipfile import ZipFile
with TemporaryDirectory('_pmlz_output') as tdir:
pmlmlizer = PMLMLizer(log)
pml = unicode_type(pmlmlizer.extract_content(oeb_book, opts))
with lopen(os.path.join(tdir, 'index.pml'), 'wb') as out:
out.write(pml.encode(opts.pml_output_encoding, 'replace'))
img_path = os.path.join(tdir, 'index_img')
if not os.path.exists(img_path):
os.makedirs(img_path)
self.write_images(oeb_book.manifest, pmlmlizer.image_hrefs, img_path, opts)
log.debug('Compressing output...')
pmlz = ZipFile(output_path, 'w')
pmlz.add_dir(tdir)
def write_images(self, manifest, image_hrefs, out_dir, opts):
from PIL import Image
from calibre.ebooks.oeb.base import OEB_RASTER_IMAGES
for item in manifest:
if item.media_type in OEB_RASTER_IMAGES and item.href in image_hrefs.keys():
if opts.full_image_depth:
im = Image.open(io.BytesIO(item.data))
else:
im = Image.open(io.BytesIO(item.data)).convert('P')
im.thumbnail((300,300), Image.ANTIALIAS)
data = io.BytesIO()
im.save(data, 'PNG')
data = data.getvalue()
path = os.path.join(out_dir, image_hrefs[item.href])
with lopen(path, 'wb') as out:
out.write(data)

View File

@@ -0,0 +1,28 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
from calibre.customize.conversion import InputFormatPlugin
from polyglot.builtins import getcwd
class RBInput(InputFormatPlugin):
name = 'RB Input'
author = 'John Schember'
description = 'Convert RB files to HTML'
file_types = {'rb'}
commit_name = 'rb_input'
def convert(self, stream, options, file_ext, log,
accelerators):
from calibre.ebooks.rb.reader import Reader
reader = Reader(stream, log, options.input_encoding)
opf = reader.extract_content(getcwd())
return opf

View File

@@ -0,0 +1,45 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import os
from calibre.customize.conversion import OutputFormatPlugin, OptionRecommendation
class RBOutput(OutputFormatPlugin):
name = 'RB Output'
author = 'John Schember'
file_type = 'rb'
commit_name = 'rb_output'
options = {
OptionRecommendation(name='inline_toc',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Add Table of Contents to beginning of the book.'))}
def convert(self, oeb_book, output_path, input_plugin, opts, log):
from calibre.ebooks.rb.writer import RBWriter
close = False
if not hasattr(output_path, 'write'):
close = True
if not os.path.exists(os.path.dirname(output_path)) and os.path.dirname(output_path):
os.makedirs(os.path.dirname(output_path))
out_stream = lopen(output_path, 'wb')
else:
out_stream = output_path
writer = RBWriter(opts, log)
out_stream.seek(0)
out_stream.truncate()
writer.write_content(oeb_book, out_stream, oeb_book.metadata)
if close:
out_stream.close()

View File

@@ -0,0 +1,169 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import os
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
from calibre.constants import numeric_version
from calibre import walk
from polyglot.builtins import unicode_type
class RecipeDisabled(Exception):
pass
class RecipeInput(InputFormatPlugin):
name = 'Recipe Input'
author = 'Kovid Goyal'
description = _('Download periodical content from the internet')
file_types = {'recipe', 'downloaded_recipe'}
commit_name = 'recipe_input'
recommendations = {
('chapter', None, OptionRecommendation.HIGH),
('dont_split_on_page_breaks', True, OptionRecommendation.HIGH),
('use_auto_toc', False, OptionRecommendation.HIGH),
('input_encoding', None, OptionRecommendation.HIGH),
('input_profile', 'default', OptionRecommendation.HIGH),
('page_breaks_before', None, OptionRecommendation.HIGH),
('insert_metadata', False, OptionRecommendation.HIGH),
}
options = {
OptionRecommendation(name='test', recommended_value=False,
help=_(
'Useful for recipe development. Forces'
' max_articles_per_feed to 2 and downloads at most 2 feeds.'
' You can change the number of feeds and articles by supplying optional arguments.'
' For example: --test 3 1 will download at most 3 feeds and only 1 article per feed.')),
OptionRecommendation(name='username', recommended_value=None,
help=_('Username for sites that require a login to access '
'content.')),
OptionRecommendation(name='password', recommended_value=None,
help=_('Password for sites that require a login to access '
'content.')),
OptionRecommendation(name='dont_download_recipe',
recommended_value=False,
help=_('Do not download latest version of builtin recipes from the calibre server')),
OptionRecommendation(name='lrf', recommended_value=False,
help='Optimize fetching for subsequent conversion to LRF.'),
}
def convert(self, recipe_or_file, opts, file_ext, log,
accelerators):
from calibre.web.feeds.recipes import compile_recipe
opts.output_profile.flow_size = 0
if file_ext == 'downloaded_recipe':
from calibre.utils.zipfile import ZipFile
zf = ZipFile(recipe_or_file, 'r')
zf.extractall()
zf.close()
with lopen('download.recipe', 'rb') as f:
self.recipe_source = f.read()
recipe = compile_recipe(self.recipe_source)
recipe.needs_subscription = False
self.recipe_object = recipe(opts, log, self.report_progress)
else:
if os.environ.get('CALIBRE_RECIPE_URN'):
from calibre.web.feeds.recipes.collection import get_custom_recipe, get_builtin_recipe_by_id
urn = os.environ['CALIBRE_RECIPE_URN']
log('Downloading recipe urn: ' + urn)
rtype, recipe_id = urn.partition(':')[::2]
if not recipe_id:
raise ValueError('Invalid recipe urn: ' + urn)
if rtype == 'custom':
self.recipe_source = get_custom_recipe(recipe_id)
else:
self.recipe_source = get_builtin_recipe_by_id(urn, log=log, download_recipe=True)
if not self.recipe_source:
raise ValueError('Could not find recipe with urn: ' + urn)
if not isinstance(self.recipe_source, bytes):
self.recipe_source = self.recipe_source.encode('utf-8')
recipe = compile_recipe(self.recipe_source)
elif os.access(recipe_or_file, os.R_OK):
with lopen(recipe_or_file, 'rb') as f:
self.recipe_source = f.read()
recipe = compile_recipe(self.recipe_source)
log('Using custom recipe')
else:
from calibre.web.feeds.recipes.collection import (
get_builtin_recipe_by_title, get_builtin_recipe_titles)
title = getattr(opts, 'original_recipe_input_arg', recipe_or_file)
title = os.path.basename(title).rpartition('.')[0]
titles = frozenset(get_builtin_recipe_titles())
if title not in titles:
title = getattr(opts, 'original_recipe_input_arg', recipe_or_file)
title = title.rpartition('.')[0]
raw = get_builtin_recipe_by_title(title, log=log,
download_recipe=not opts.dont_download_recipe)
builtin = False
try:
recipe = compile_recipe(raw)
self.recipe_source = raw
if recipe.requires_version > numeric_version:
log.warn(
'Downloaded recipe needs calibre version at least: %s' %
('.'.join(recipe.requires_version)))
builtin = True
except:
log.exception('Failed to compile downloaded recipe. Falling '
'back to builtin one')
builtin = True
if builtin:
log('Using bundled builtin recipe')
raw = get_builtin_recipe_by_title(title, log=log,
download_recipe=False)
if raw is None:
raise ValueError('Failed to find builtin recipe: '+title)
recipe = compile_recipe(raw)
self.recipe_source = raw
else:
log('Using downloaded builtin recipe')
if recipe is None:
raise ValueError('%r is not a valid recipe file or builtin recipe' %
recipe_or_file)
disabled = getattr(recipe, 'recipe_disabled', None)
if disabled is not None:
raise RecipeDisabled(disabled)
ro = recipe(opts, log, self.report_progress)
ro.download()
self.recipe_object = ro
for key, val in self.recipe_object.conversion_options.items():
setattr(opts, key, val)
for f in os.listdir('.'):
if f.endswith('.opf'):
return os.path.abspath(f)
for f in walk('.'):
if f.endswith('.opf'):
return os.path.abspath(f)
def postprocess_book(self, oeb, opts, log):
if self.recipe_object is not None:
self.recipe_object.internal_postprocess_book(oeb, opts, log)
self.recipe_object.postprocess_book(oeb, opts, log)
def specialize(self, oeb, opts, log, output_fmt):
if opts.no_inline_navbars:
from calibre.ebooks.oeb.base import XPath
for item in oeb.spine:
for div in XPath('//h:div[contains(@class, "calibre_navbar")]')(item.data):
div.getparent().remove(div)
def save_download(self, zf):
raw = self.recipe_source
if isinstance(raw, unicode_type):
raw = raw.encode('utf-8')
zf.writestr('download.recipe', raw)

View File

@@ -0,0 +1,323 @@
from __future__ import with_statement, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
import os, glob, re, textwrap
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
from polyglot.builtins import iteritems, filter, getcwd, as_bytes
border_style_map = {
'single' : 'solid',
'double-thickness-border' : 'double',
'shadowed-border': 'outset',
'double-border': 'double',
'dotted-border': 'dotted',
'dashed': 'dashed',
'hairline': 'solid',
'inset': 'inset',
'dash-small': 'dashed',
'dot-dash': 'dotted',
'dot-dot-dash': 'dotted',
'outset': 'outset',
'tripple': 'double',
'triple': 'double',
'thick-thin-small': 'solid',
'thin-thick-small': 'solid',
'thin-thick-thin-small': 'solid',
'thick-thin-medium': 'solid',
'thin-thick-medium': 'solid',
'thin-thick-thin-medium': 'solid',
'thick-thin-large': 'solid',
'thin-thick-thin-large': 'solid',
'wavy': 'ridge',
'double-wavy': 'ridge',
'striped': 'ridge',
'emboss': 'inset',
'engrave': 'inset',
'frame': 'ridge',
}
class RTFInput(InputFormatPlugin):
name = 'RTF Input'
author = 'Kovid Goyal'
description = 'Convert RTF files to HTML'
file_types = {'rtf'}
commit_name = 'rtf_input'
options = {
OptionRecommendation(name='ignore_wmf', recommended_value=False,
help=_('Ignore WMF images instead of replacing them with a placeholder image.')),
}
def generate_xml(self, stream):
from calibre.ebooks.rtf2xml.ParseRtf import ParseRtf
ofile = u'dataxml.xml'
run_lev, debug_dir, indent_out = 1, None, 0
if getattr(self.opts, 'debug_pipeline', None) is not None:
try:
os.mkdir(u'rtfdebug')
debug_dir = u'rtfdebug'
run_lev = 4
indent_out = 1
self.log('Running RTFParser in debug mode')
except:
self.log.warn('Impossible to run RTFParser in debug mode')
parser = ParseRtf(
in_file=stream,
out_file=ofile,
# Convert symbol fonts to unicode equivalents. Default
# is 1
convert_symbol=1,
# Convert Zapf fonts to unicode equivalents. Default
# is 1.
convert_zapf=1,
# Convert Wingding fonts to unicode equivalents.
# Default is 1.
convert_wingdings=1,
# Convert RTF caps to real caps.
# Default is 1.
convert_caps=1,
# Indent resulting XML.
# Default is 0 (no indent).
indent=indent_out,
# Form lists from RTF. Default is 1.
form_lists=1,
# Convert headings to sections. Default is 0.
headings_to_sections=1,
# Group paragraphs with the same style name. Default is 1.
group_styles=1,
# Group borders. Default is 1.
group_borders=1,
# Write or do not write paragraphs. Default is 0.
empty_paragraphs=1,
# Debug
deb_dir=debug_dir,
# Default encoding
default_encoding=getattr(self.opts, 'input_encoding', 'cp1252') or 'cp1252',
# Run level
run_level=run_lev,
)
parser.parse_rtf()
with open(ofile, 'rb') as f:
return f.read()
def extract_images(self, picts):
from calibre.utils.imghdr import what
from binascii import unhexlify
self.log('Extracting images...')
with open(picts, 'rb') as f:
raw = f.read()
picts = filter(len, re.findall(br'\{\\pict([^}]+)\}', raw))
hex_pat = re.compile(br'[^a-fA-F0-9]')
encs = [hex_pat.sub(b'', pict) for pict in picts]
count = 0
imap = {}
for enc in encs:
if len(enc) % 2 == 1:
enc = enc[:-1]
data = unhexlify(enc)
fmt = what(None, data)
if fmt is None:
fmt = 'wmf'
count += 1
name = u'%04d.%s' % (count, fmt)
with open(name, 'wb') as f:
f.write(data)
imap[count] = name
# with open(name+'.hex', 'wb') as f:
# f.write(enc)
return self.convert_images(imap)
def convert_images(self, imap):
self.default_img = None
for count, val in iteritems(imap):
try:
imap[count] = self.convert_image(val)
except:
self.log.exception('Failed to convert', val)
return imap
def convert_image(self, name):
if not name.endswith('.wmf'):
return name
try:
return self.rasterize_wmf(name)
except Exception:
self.log.exception('Failed to convert WMF image %r'%name)
return self.replace_wmf(name)
def replace_wmf(self, name):
if self.opts.ignore_wmf:
os.remove(name)
return '__REMOVE_ME__'
from calibre.ebooks.covers import message_image
if self.default_img is None:
self.default_img = message_image('Conversion of WMF images is not supported.'
' Use Microsoft Word or OpenOffice to save this RTF file'
' as HTML and convert that in calibre.')
name = name.replace('.wmf', '.jpg')
with lopen(name, 'wb') as f:
f.write(self.default_img)
return name
def rasterize_wmf(self, name):
from calibre.utils.wmf.parse import wmf_unwrap
with open(name, 'rb') as f:
data = f.read()
data = wmf_unwrap(data)
name = name.replace('.wmf', '.png')
with open(name, 'wb') as f:
f.write(data)
return name
def write_inline_css(self, ic, border_styles):
font_size_classes = ['span.fs%d { font-size: %spt }'%(i, x) for i, x in
enumerate(ic.font_sizes)]
color_classes = ['span.col%d { color: %s }'%(i, x) for i, x in
enumerate(ic.colors) if x != 'false']
css = textwrap.dedent('''
span.none {
text-decoration: none; font-weight: normal;
font-style: normal; font-variant: normal
}
span.italics { font-style: italic }
span.bold { font-weight: bold }
span.small-caps { font-variant: small-caps }
span.underlined { text-decoration: underline }
span.strike-through { text-decoration: line-through }
''')
css += '\n'+'\n'.join(font_size_classes)
css += '\n' +'\n'.join(color_classes)
for cls, val in iteritems(border_styles):
css += '\n\n.%s {\n%s\n}'%(cls, val)
with open(u'styles.css', 'ab') as f:
f.write(css.encode('utf-8'))
def convert_borders(self, doc):
border_styles = []
style_map = {}
for elem in doc.xpath(r'//*[local-name()="cell"]'):
style = ['border-style: hidden', 'border-width: 1px',
'border-color: black']
for x in ('bottom', 'top', 'left', 'right'):
bs = elem.get('border-cell-%s-style'%x, None)
if bs:
cbs = border_style_map.get(bs, 'solid')
style.append('border-%s-style: %s'%(x, cbs))
bw = elem.get('border-cell-%s-line-width'%x, None)
if bw:
style.append('border-%s-width: %spt'%(x, bw))
bc = elem.get('border-cell-%s-color'%x, None)
if bc:
style.append('border-%s-color: %s'%(x, bc))
style = ';\n'.join(style)
if style not in border_styles:
border_styles.append(style)
idx = border_styles.index(style)
cls = 'border_style%d'%idx
style_map[cls] = style
elem.set('class', cls)
return style_map
def convert(self, stream, options, file_ext, log,
accelerators):
from lxml import etree
from calibre.ebooks.metadata.meta import get_metadata
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.ebooks.rtf2xml.ParseRtf import RtfInvalidCodeException
from calibre.ebooks.rtf.input import InlineClass
from calibre.utils.xml_parse import safe_xml_fromstring
self.opts = options
self.log = log
self.log('Converting RTF to XML...')
try:
xml = self.generate_xml(stream.name)
except RtfInvalidCodeException as e:
self.log.exception('Unable to parse RTF')
raise ValueError(_('This RTF file has a feature calibre does not '
'support. Convert it to HTML first and then try it.\n%s')%e)
d = glob.glob(os.path.join('*_rtf_pict_dir', 'picts.rtf'))
if d:
imap = {}
try:
imap = self.extract_images(d[0])
except:
self.log.exception('Failed to extract images...')
self.log('Parsing XML...')
doc = safe_xml_fromstring(xml)
border_styles = self.convert_borders(doc)
for pict in doc.xpath('//rtf:pict[@num]',
namespaces={'rtf':'http://rtf2xml.sourceforge.net/'}):
num = int(pict.get('num'))
name = imap.get(num, None)
if name is not None:
pict.set('num', name)
self.log('Converting XML to HTML...')
inline_class = InlineClass(self.log)
styledoc = safe_xml_fromstring(P('templates/rtf.xsl', data=True), recover=False)
extensions = {('calibre', 'inline-class') : inline_class}
transform = etree.XSLT(styledoc, extensions=extensions)
result = transform(doc)
html = u'index.xhtml'
with open(html, 'wb') as f:
res = as_bytes(transform.tostring(result))
# res = res[:100].replace('xmlns:html', 'xmlns') + res[100:]
# clean multiple \n
res = re.sub(b'\n+', b'\n', res)
# Replace newlines inserted by the 'empty_paragraphs' option in rtf2xml with html blank lines
# res = re.sub('\s*<body>', '<body>', res)
# res = re.sub('(?<=\n)\n{2}',
# u'<p>\u00a0</p>\n'.encode('utf-8'), res)
f.write(res)
self.write_inline_css(inline_class, border_styles)
stream.seek(0)
mi = get_metadata(stream, 'rtf')
if not mi.title:
mi.title = _('Unknown')
if not mi.authors:
mi.authors = [_('Unknown')]
opf = OPFCreator(getcwd(), mi)
opf.create_manifest([(u'index.xhtml', None)])
opf.create_spine([u'index.xhtml'])
opf.render(open(u'metadata.opf', 'wb'))
return os.path.abspath(u'metadata.opf')
def postprocess_book(self, oeb, opts, log):
for item in oeb.spine:
for img in item.data.xpath('//*[local-name()="img" and @src="__REMOVE_ME__"]'):
p = img.getparent()
idx = p.index(img)
p.remove(img)
if img.tail:
if idx == 0:
p.text = (p.text or '') + img.tail
else:
p[idx-1].tail = (p[idx-1].tail or '') + img.tail

View File

@@ -0,0 +1,40 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import os
from calibre.customize.conversion import OutputFormatPlugin
class RTFOutput(OutputFormatPlugin):
name = 'RTF Output'
author = 'John Schember'
file_type = 'rtf'
commit_name = 'rtf_output'
def convert(self, oeb_book, output_path, input_plugin, opts, log):
from calibre.ebooks.rtf.rtfml import RTFMLizer
rtfmlitzer = RTFMLizer(log)
content = rtfmlitzer.extract_content(oeb_book, opts)
close = False
if not hasattr(output_path, 'write'):
close = True
if not os.path.exists(os.path.dirname(output_path)) and os.path.dirname(output_path) != '':
os.makedirs(os.path.dirname(output_path))
out_stream = lopen(output_path, 'wb')
else:
out_stream = output_path
out_stream.seek(0)
out_stream.truncate()
out_stream.write(content.encode('ascii', 'replace'))
if close:
out_stream.close()

View File

@@ -0,0 +1,122 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL 3'
__copyright__ = '2010, Li Fanxi <lifanxi@freemindworld.com>'
__docformat__ = 'restructuredtext en'
import os
from calibre.customize.conversion import InputFormatPlugin
from calibre.ptempfile import TemporaryDirectory
from calibre.utils.filenames import ascii_filename
from polyglot.builtins import unicode_type
HTML_TEMPLATE = '<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/><title>%s</title></head><body>\n%s\n</body></html>'
def html_encode(s):
return s.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;').replace('"', '&quot;').replace("'", '&apos;').replace('\n', '<br/>').replace(' ', '&nbsp;') # noqa
class SNBInput(InputFormatPlugin):
name = 'SNB Input'
author = 'Li Fanxi'
description = 'Convert SNB files to OEB'
file_types = {'snb'}
commit_name = 'snb_input'
options = set()
def convert(self, stream, options, file_ext, log,
accelerators):
import uuid
from calibre.ebooks.oeb.base import DirContainer
from calibre.ebooks.snb.snbfile import SNBFile
from calibre.utils.xml_parse import safe_xml_fromstring
log.debug("Parsing SNB file...")
snbFile = SNBFile()
try:
snbFile.Parse(stream)
except:
raise ValueError("Invalid SNB file")
if not snbFile.IsValid():
log.debug("Invalid SNB file")
raise ValueError("Invalid SNB file")
log.debug("Handle meta data ...")
from calibre.ebooks.conversion.plumber import create_oebbook
oeb = create_oebbook(log, None, options,
encoding=options.input_encoding, populate=False)
meta = snbFile.GetFileStream('snbf/book.snbf')
if meta is not None:
meta = safe_xml_fromstring(meta)
l = {'title' : './/head/name',
'creator' : './/head/author',
'language' : './/head/language',
'generator': './/head/generator',
'publisher': './/head/publisher',
'cover' : './/head/cover', }
d = {}
for item in l:
node = meta.find(l[item])
if node is not None:
d[item] = node.text if node.text is not None else ''
else:
d[item] = ''
oeb.metadata.add('title', d['title'])
oeb.metadata.add('creator', d['creator'], attrib={'role':'aut'})
oeb.metadata.add('language', d['language'].lower().replace('_', '-'))
oeb.metadata.add('generator', d['generator'])
oeb.metadata.add('publisher', d['publisher'])
if d['cover'] != '':
oeb.guide.add('cover', 'Cover', d['cover'])
bookid = unicode_type(uuid.uuid4())
oeb.metadata.add('identifier', bookid, id='uuid_id', scheme='uuid')
for ident in oeb.metadata.identifier:
if 'id' in ident.attrib:
oeb.uid = oeb.metadata.identifier[0]
break
with TemporaryDirectory('_snb2oeb', keep=True) as tdir:
log.debug('Process TOC ...')
toc = snbFile.GetFileStream('snbf/toc.snbf')
oeb.container = DirContainer(tdir, log)
if toc is not None:
toc = safe_xml_fromstring(toc)
i = 1
for ch in toc.find('.//body'):
chapterName = ch.text
chapterSrc = ch.get('src')
fname = 'ch_%d.htm' % i
data = snbFile.GetFileStream('snbc/' + chapterSrc)
if data is None:
continue
snbc = safe_xml_fromstring(data)
lines = []
for line in snbc.find('.//body'):
if line.tag == 'text':
lines.append('<p>%s</p>' % html_encode(line.text))
elif line.tag == 'img':
lines.append('<p><img src="%s" /></p>' % html_encode(line.text))
with open(os.path.join(tdir, fname), 'wb') as f:
f.write((HTML_TEMPLATE % (chapterName, '\n'.join(lines))).encode('utf-8', 'replace'))
oeb.toc.add(ch.text, fname)
id, href = oeb.manifest.generate(id='html',
href=ascii_filename(fname))
item = oeb.manifest.add(id, href, 'text/html')
item.html_input_href = fname
oeb.spine.add(item, True)
i = i + 1
imageFiles = snbFile.OutputImageFiles(tdir)
for f, m in imageFiles:
id, href = oeb.manifest.generate(id='image',
href=ascii_filename(f))
item = oeb.manifest.add(id, href, m)
item.html_input_href = f
return oeb

View File

@@ -0,0 +1,269 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL 3'
__copyright__ = '2010, Li Fanxi <lifanxi@freemindworld.com>'
__docformat__ = 'restructuredtext en'
import os
from calibre.customize.conversion import OutputFormatPlugin, OptionRecommendation
from calibre.ptempfile import TemporaryDirectory
from calibre.constants import __appname__, __version__
from polyglot.builtins import unicode_type
class SNBOutput(OutputFormatPlugin):
name = 'SNB Output'
author = 'Li Fanxi'
file_type = 'snb'
commit_name = 'snb_output'
options = {
OptionRecommendation(name='snb_output_encoding', recommended_value='utf-8',
level=OptionRecommendation.LOW,
help=_('Specify the character encoding of the output document. '
'The default is utf-8.')),
OptionRecommendation(name='snb_max_line_length',
recommended_value=0, level=OptionRecommendation.LOW,
help=_('The maximum number of characters per line. This splits on '
'the first space before the specified value. If no space is found '
'the line will be broken at the space after and will exceed the '
'specified value. Also, there is a minimum of 25 characters. '
'Use 0 to disable line splitting.')),
OptionRecommendation(name='snb_insert_empty_line',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Specify whether or not to insert an empty line between '
'two paragraphs.')),
OptionRecommendation(name='snb_dont_indent_first_line',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Specify whether or not to insert two space characters '
'to indent the first line of each paragraph.')),
OptionRecommendation(name='snb_hide_chapter_name',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Specify whether or not to hide the chapter title for each '
'chapter. Useful for image-only output (eg. comics).')),
OptionRecommendation(name='snb_full_screen',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Resize all the images for full screen view. ')),
}
def convert(self, oeb_book, output_path, input_plugin, opts, log):
from lxml import etree
from calibre.ebooks.snb.snbfile import SNBFile
from calibre.ebooks.snb.snbml import SNBMLizer, ProcessFileName
self.opts = opts
from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer, Unavailable
try:
rasterizer = SVGRasterizer()
rasterizer(oeb_book, opts)
except Unavailable:
log.warn('SVG rasterizer unavailable, SVG will not be converted')
# Create temp dir
with TemporaryDirectory('_snb_output') as tdir:
# Create stub directories
snbfDir = os.path.join(tdir, 'snbf')
snbcDir = os.path.join(tdir, 'snbc')
snbiDir = os.path.join(tdir, 'snbc/images')
os.mkdir(snbfDir)
os.mkdir(snbcDir)
os.mkdir(snbiDir)
# Process Meta data
meta = oeb_book.metadata
if meta.title:
title = unicode_type(meta.title[0])
else:
title = ''
authors = [unicode_type(x) for x in meta.creator if x.role == 'aut']
if meta.publisher:
publishers = unicode_type(meta.publisher[0])
else:
publishers = ''
if meta.language:
lang = unicode_type(meta.language[0]).upper()
else:
lang = ''
if meta.description:
abstract = unicode_type(meta.description[0])
else:
abstract = ''
# Process Cover
g, m, s = oeb_book.guide, oeb_book.manifest, oeb_book.spine
href = None
if 'titlepage' not in g:
if 'cover' in g:
href = g['cover'].href
# Output book info file
bookInfoTree = etree.Element("book-snbf", version="1.0")
headTree = etree.SubElement(bookInfoTree, "head")
etree.SubElement(headTree, "name").text = title
etree.SubElement(headTree, "author").text = ' '.join(authors)
etree.SubElement(headTree, "language").text = lang
etree.SubElement(headTree, "rights")
etree.SubElement(headTree, "publisher").text = publishers
etree.SubElement(headTree, "generator").text = __appname__ + ' ' + __version__
etree.SubElement(headTree, "created")
etree.SubElement(headTree, "abstract").text = abstract
if href is not None:
etree.SubElement(headTree, "cover").text = ProcessFileName(href)
else:
etree.SubElement(headTree, "cover")
with open(os.path.join(snbfDir, 'book.snbf'), 'wb') as f:
f.write(etree.tostring(bookInfoTree, pretty_print=True, encoding='utf-8'))
# Output TOC
tocInfoTree = etree.Element("toc-snbf")
tocHead = etree.SubElement(tocInfoTree, "head")
tocBody = etree.SubElement(tocInfoTree, "body")
outputFiles = {}
if oeb_book.toc.count() == 0:
log.warn('This SNB file has no Table of Contents. '
'Creating a default TOC')
first = next(iter(oeb_book.spine))
oeb_book.toc.add(_('Start page'), first.href)
else:
first = next(iter(oeb_book.spine))
if oeb_book.toc[0].href != first.href:
# The pages before the fist item in toc will be stored as
# "Cover Pages".
# oeb_book.toc does not support "insert", so we generate
# the tocInfoTree directly instead of modifying the toc
ch = etree.SubElement(tocBody, "chapter")
ch.set("src", ProcessFileName(first.href) + ".snbc")
ch.text = _('Cover pages')
outputFiles[first.href] = []
outputFiles[first.href].append(("", _("Cover pages")))
for tocitem in oeb_book.toc:
if tocitem.href.find('#') != -1:
item = tocitem.href.split('#')
if len(item) != 2:
log.error('Error in TOC item: %s' % tocitem)
else:
if item[0] in outputFiles:
outputFiles[item[0]].append((item[1], tocitem.title))
else:
outputFiles[item[0]] = []
if "" not in outputFiles[item[0]]:
outputFiles[item[0]].append(("", tocitem.title + _(" (Preface)")))
ch = etree.SubElement(tocBody, "chapter")
ch.set("src", ProcessFileName(item[0]) + ".snbc")
ch.text = tocitem.title + _(" (Preface)")
outputFiles[item[0]].append((item[1], tocitem.title))
else:
if tocitem.href in outputFiles:
outputFiles[tocitem.href].append(("", tocitem.title))
else:
outputFiles[tocitem.href] = []
outputFiles[tocitem.href].append(("", tocitem.title))
ch = etree.SubElement(tocBody, "chapter")
ch.set("src", ProcessFileName(tocitem.href) + ".snbc")
ch.text = tocitem.title
etree.SubElement(tocHead, "chapters").text = '%d' % len(tocBody)
with open(os.path.join(snbfDir, 'toc.snbf'), 'wb') as f:
f.write(etree.tostring(tocInfoTree, pretty_print=True, encoding='utf-8'))
# Output Files
oldTree = None
mergeLast = False
lastName = None
for item in s:
from calibre.ebooks.oeb.base import OEB_DOCS, OEB_IMAGES
if m.hrefs[item.href].media_type in OEB_DOCS:
if item.href not in outputFiles:
log.debug('File %s is unused in TOC. Continue in last chapter' % item.href)
mergeLast = True
else:
if oldTree is not None and mergeLast:
log.debug('Output the modified chapter again: %s' % lastName)
with open(os.path.join(snbcDir, lastName), 'wb') as f:
f.write(etree.tostring(oldTree, pretty_print=True, encoding='utf-8'))
mergeLast = False
log.debug('Converting %s to snbc...' % item.href)
snbwriter = SNBMLizer(log)
snbcTrees = None
if not mergeLast:
snbcTrees = snbwriter.extract_content(oeb_book, item, outputFiles[item.href], opts)
for subName in snbcTrees:
postfix = ''
if subName != '':
postfix = '_' + subName
lastName = ProcessFileName(item.href + postfix + ".snbc")
oldTree = snbcTrees[subName]
with open(os.path.join(snbcDir, lastName), 'wb') as f:
f.write(etree.tostring(oldTree, pretty_print=True, encoding='utf-8'))
else:
log.debug('Merge %s with last TOC item...' % item.href)
snbwriter.merge_content(oldTree, oeb_book, item, [('', _("Start"))], opts)
# Output the last one if needed
log.debug('Output the last modified chapter again: %s' % lastName)
if oldTree is not None and mergeLast:
with open(os.path.join(snbcDir, lastName), 'wb') as f:
f.write(etree.tostring(oldTree, pretty_print=True, encoding='utf-8'))
mergeLast = False
for item in m:
if m.hrefs[item.href].media_type in OEB_IMAGES:
log.debug('Converting image: %s ...' % item.href)
content = m.hrefs[item.href].data
# Convert & Resize image
self.HandleImage(content, os.path.join(snbiDir, ProcessFileName(item.href)))
# Package as SNB File
snbFile = SNBFile()
snbFile.FromDir(tdir)
snbFile.Output(output_path)
def HandleImage(self, imageData, imagePath):
from calibre.utils.img import image_from_data, resize_image, image_to_data
img = image_from_data(imageData)
x, y = img.width(), img.height()
if self.opts:
if self.opts.snb_full_screen:
SCREEN_X, SCREEN_Y = self.opts.output_profile.screen_size
else:
SCREEN_X, SCREEN_Y = self.opts.output_profile.comic_screen_size
else:
SCREEN_X = 540
SCREEN_Y = 700
# Handle big image only
if x > SCREEN_X or y > SCREEN_Y:
xScale = float(x) / SCREEN_X
yScale = float(y) / SCREEN_Y
scale = max(xScale, yScale)
# TODO : intelligent image rotation
# img = img.rotate(90)
# x,y = y,x
img = resize_image(img, x // scale, y // scale)
with lopen(imagePath, 'wb') as f:
f.write(image_to_data(img, fmt=imagePath.rpartition('.')[-1]))
if __name__ == '__main__':
from calibre.ebooks.oeb.reader import OEBReader
from calibre.ebooks.oeb.base import OEBBook
from calibre.ebooks.conversion.preprocess import HTMLPreProcessor
from calibre.customize.profiles import HanlinV3Output
class OptionValues(object):
pass
opts = OptionValues()
opts.output_profile = HanlinV3Output(None)
html_preprocessor = HTMLPreProcessor(None, None, opts)
from calibre.utils.logging import default_log
oeb = OEBBook(default_log, html_preprocessor)
reader = OEBReader
reader()(oeb, '/tmp/bbb/processed/')
SNBOutput(None).convert(oeb, '/tmp/test.snb', None, None, default_log)

View File

@@ -0,0 +1,39 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
from io import BytesIO
from calibre.customize.conversion import InputFormatPlugin
class TCRInput(InputFormatPlugin):
name = 'TCR Input'
author = 'John Schember'
description = 'Convert TCR files to HTML'
file_types = {'tcr'}
commit_name = 'tcr_input'
def convert(self, stream, options, file_ext, log, accelerators):
from calibre.ebooks.compression.tcr import decompress
log.info('Decompressing text...')
raw_txt = decompress(stream)
log.info('Converting text to OEB...')
stream = BytesIO(raw_txt)
from calibre.customize.ui import plugin_for_input_format
txt_plugin = plugin_for_input_format('txt')
for opt in txt_plugin.options:
if not hasattr(self.options, opt.option.name):
setattr(options, opt.option.name, opt.recommended_value)
stream.seek(0)
return txt_plugin.convert(stream, options,
'txt', log, accelerators)

View File

@@ -0,0 +1,56 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import os
from calibre.customize.conversion import OutputFormatPlugin, \
OptionRecommendation
class TCROutput(OutputFormatPlugin):
name = 'TCR Output'
author = 'John Schember'
file_type = 'tcr'
commit_name = 'tcr_output'
options = {
OptionRecommendation(name='tcr_output_encoding', recommended_value='utf-8',
level=OptionRecommendation.LOW,
help=_('Specify the character encoding of the output document. '
'The default is utf-8.'))}
def convert(self, oeb_book, output_path, input_plugin, opts, log):
from calibre.ebooks.txt.txtml import TXTMLizer
from calibre.ebooks.compression.tcr import compress
close = False
if not hasattr(output_path, 'write'):
close = True
if not os.path.exists(os.path.dirname(output_path)) and os.path.dirname(output_path):
os.makedirs(os.path.dirname(output_path))
out_stream = lopen(output_path, 'wb')
else:
out_stream = output_path
setattr(opts, 'flush_paras', False)
setattr(opts, 'max_line_length', 0)
setattr(opts, 'force_max_line_length', False)
setattr(opts, 'indent_paras', False)
writer = TXTMLizer(log)
txt = writer.extract_content(oeb_book, opts).encode(opts.tcr_output_encoding, 'replace')
log.info('Compressing text...')
txt = compress(txt)
out_stream.seek(0)
out_stream.truncate()
out_stream.write(txt)
if close:
out_stream.close()

View File

@@ -0,0 +1,308 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import os
from calibre import _ent_pat, walk, xml_entity_to_unicode
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
from polyglot.builtins import getcwd
MD_EXTENSIONS = {
'abbr': _('Abbreviations'),
'admonition': _('Support admonitions'),
'attr_list': _('Add attribute to HTML tags'),
'codehilite': _('Add code highlighting via Pygments'),
'def_list': _('Definition lists'),
'extra': _('Enables various common extensions'),
'fenced_code': _('Alternative code block syntax'),
'footnotes': _('Footnotes'),
'legacy_attrs': _('Use legacy element attributes'),
'legacy_em': _('Use legacy underscore handling for connected words'),
'meta': _('Metadata in the document'),
'nl2br': _('Treat newlines as hard breaks'),
'sane_lists': _('Do not allow mixing list types'),
'smarty': _('Use markdown\'s internal smartypants parser'),
'tables': _('Support tables'),
'toc': _('Generate a table of contents'),
'wikilinks': _('Wiki style links'),
}
class TXTInput(InputFormatPlugin):
name = 'TXT Input'
author = 'John Schember'
description = 'Convert TXT files to HTML'
file_types = {'txt', 'txtz', 'text', 'md', 'textile', 'markdown'}
commit_name = 'txt_input'
ui_data = {
'md_extensions': MD_EXTENSIONS,
'paragraph_types': {
'auto': _('Try to auto detect paragraph type'),
'block': _('Treat a blank line as a paragraph break'),
'single': _('Assume every line is a paragraph'),
'print': _('Assume every line starting with 2+ spaces or a tab starts a paragraph'),
'unformatted': _('Most lines have hard line breaks, few/no blank lines or indents'),
'off': _('Don\'t modify the paragraph structure'),
},
'formatting_types': {
'auto': _('Automatically decide which formatting processor to use'),
'plain': _('No formatting'),
'heuristic': _('Use heuristics to determine chapter headings, italics, etc.'),
'textile': _('Use the TexTile markup language'),
'markdown': _('Use the Markdown markup language')
},
}
options = {
OptionRecommendation(name='formatting_type', recommended_value='auto',
choices=list(ui_data['formatting_types']),
help=_('Formatting used within the document.\n'
'* auto: {auto}\n'
'* plain: {plain}\n'
'* heuristic: {heuristic}\n'
'* textile: {textile}\n'
'* markdown: {markdown}\n'
'To learn more about markdown see {url}').format(
url='https://daringfireball.net/projects/markdown/', **ui_data['formatting_types'])
),
OptionRecommendation(name='paragraph_type', recommended_value='auto',
choices=list(ui_data['paragraph_types']),
help=_('Paragraph structure to assume. The value of "off" is useful for formatted documents such as Markdown or Textile. '
'Choices are:\n'
'* auto: {auto}\n'
'* block: {block}\n'
'* single: {single}\n'
'* print: {print}\n'
'* unformatted: {unformatted}\n'
'* off: {off}').format(**ui_data['paragraph_types'])
),
OptionRecommendation(name='preserve_spaces', recommended_value=False,
help=_('Normally extra spaces are condensed into a single space. '
'With this option all spaces will be displayed.')),
OptionRecommendation(name='txt_in_remove_indents', recommended_value=False,
help=_('Normally extra space at the beginning of lines is retained. '
'With this option they will be removed.')),
OptionRecommendation(name="markdown_extensions", recommended_value='footnotes, tables, toc',
help=_('Enable extensions to markdown syntax. Extensions are formatting that is not part '
'of the standard markdown format. The extensions enabled by default: %default.\n'
'To learn more about markdown extensions, see {}\n'
'This should be a comma separated list of extensions to enable:\n'
).format('https://python-markdown.github.io/extensions/') + '\n'.join('* %s: %s' % (k, MD_EXTENSIONS[k]) for k in sorted(MD_EXTENSIONS))),
}
def shift_file(self, fname, data):
name, ext = os.path.splitext(fname)
candidate = os.path.join(self.output_dir, fname)
c = 0
while os.path.exists(candidate):
c += 1
candidate = os.path.join(self.output_dir, '{}-{}{}'.format(name, c, ext))
ans = candidate
with open(ans, 'wb') as f:
f.write(data)
return f.name
def fix_resources(self, html, base_dir):
from html5_parser import parse
root = parse(html)
changed = False
for img in root.xpath('//img[@src]'):
src = img.get('src')
prefix = src.split(':', 1)[0].lower()
if prefix not in ('file', 'http', 'https', 'ftp') and not os.path.isabs(src):
src = os.path.join(base_dir, src)
if os.access(src, os.R_OK):
with open(src, 'rb') as f:
data = f.read()
f = self.shift_file(os.path.basename(src), data)
changed = True
img.set('src', os.path.basename(f))
if changed:
from lxml import etree
html = etree.tostring(root, encoding='unicode')
return html
def convert(self, stream, options, file_ext, log,
accelerators):
from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
from calibre.ebooks.chardet import detect
from calibre.utils.zipfile import ZipFile
from calibre.ebooks.txt.processor import (convert_basic,
convert_markdown_with_metadata, separate_paragraphs_single_line,
separate_paragraphs_print_formatted, preserve_spaces,
detect_paragraph_type, detect_formatting_type,
normalize_line_endings, convert_textile, remove_indents,
block_to_single_line, separate_hard_scene_breaks)
self.log = log
txt = b''
log.debug('Reading text from file...')
length = 0
base_dir = self.output_dir = getcwd()
# Extract content from zip archive.
if file_ext == 'txtz':
zf = ZipFile(stream)
zf.extractall('.')
for x in walk('.'):
if os.path.splitext(x)[1].lower() in ('.txt', '.text'):
with open(x, 'rb') as tf:
txt += tf.read() + b'\n\n'
else:
if getattr(stream, 'name', None):
base_dir = os.path.dirname(stream.name)
txt = stream.read()
if file_ext in {'md', 'textile', 'markdown'}:
options.formatting_type = {'md': 'markdown'}.get(file_ext, file_ext)
log.info('File extension indicates particular formatting. '
'Forcing formatting type to: %s'%options.formatting_type)
options.paragraph_type = 'off'
# Get the encoding of the document.
if options.input_encoding:
ienc = options.input_encoding
log.debug('Using user specified input encoding of %s' % ienc)
else:
det_encoding = detect(txt[:4096])
det_encoding, confidence = det_encoding['encoding'], det_encoding['confidence']
if det_encoding and det_encoding.lower().replace('_', '-').strip() in (
'gb2312', 'chinese', 'csiso58gb231280', 'euc-cn', 'euccn',
'eucgb2312-cn', 'gb2312-1980', 'gb2312-80', 'iso-ir-58'):
# Microsoft Word exports to HTML with encoding incorrectly set to
# gb2312 instead of gbk. gbk is a superset of gb2312, anyway.
det_encoding = 'gbk'
ienc = det_encoding
log.debug('Detected input encoding as %s with a confidence of %s%%' % (ienc, confidence * 100))
if not ienc:
ienc = 'utf-8'
log.debug('No input encoding specified and could not auto detect using %s' % ienc)
# Remove BOM from start of txt as its presence can confuse markdown
import codecs
for bom in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE, codecs.BOM_UTF8, codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE):
if txt.startswith(bom):
txt = txt[len(bom):]
break
txt = txt.decode(ienc, 'replace')
# Replace entities
txt = _ent_pat.sub(xml_entity_to_unicode, txt)
# Normalize line endings
txt = normalize_line_endings(txt)
# Determine the paragraph type of the document.
if options.paragraph_type == 'auto':
options.paragraph_type = detect_paragraph_type(txt)
if options.paragraph_type == 'unknown':
log.debug('Could not reliably determine paragraph type using block')
options.paragraph_type = 'block'
else:
log.debug('Auto detected paragraph type as %s' % options.paragraph_type)
# Detect formatting
if options.formatting_type == 'auto':
options.formatting_type = detect_formatting_type(txt)
log.debug('Auto detected formatting as %s' % options.formatting_type)
if options.formatting_type == 'heuristic':
setattr(options, 'enable_heuristics', True)
setattr(options, 'unwrap_lines', False)
setattr(options, 'smarten_punctuation', True)
# Reformat paragraphs to block formatting based on the detected type.
# We don't check for block because the processor assumes block.
# single and print at transformed to block for processing.
if options.paragraph_type == 'single':
txt = separate_paragraphs_single_line(txt)
elif options.paragraph_type == 'print':
txt = separate_hard_scene_breaks(txt)
txt = separate_paragraphs_print_formatted(txt)
txt = block_to_single_line(txt)
elif options.paragraph_type == 'unformatted':
from calibre.ebooks.conversion.utils import HeuristicProcessor
# unwrap lines based on punctuation
docanalysis = DocAnalysis('txt', txt)
length = docanalysis.line_length(.5)
preprocessor = HeuristicProcessor(options, log=getattr(self, 'log', None))
txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
txt = separate_paragraphs_single_line(txt)
elif options.paragraph_type == 'block':
txt = separate_hard_scene_breaks(txt)
txt = block_to_single_line(txt)
if getattr(options, 'enable_heuristics', False) and getattr(options, 'dehyphenate', False):
docanalysis = DocAnalysis('txt', txt)
if not length:
length = docanalysis.line_length(.5)
dehyphenator = Dehyphenator(options.verbose, log=self.log)
txt = dehyphenator(txt,'txt', length)
# User requested transformation on the text.
if options.txt_in_remove_indents:
txt = remove_indents(txt)
# Preserve spaces will replace multiple spaces to a space
# followed by the &nbsp; entity.
if options.preserve_spaces:
txt = preserve_spaces(txt)
# Process the text using the appropriate text processor.
self.shifted_files = []
try:
html = ''
input_mi = None
if options.formatting_type == 'markdown':
log.debug('Running text through markdown conversion...')
try:
input_mi, html = convert_markdown_with_metadata(txt, extensions=[x.strip() for x in options.markdown_extensions.split(',') if x.strip()])
except RuntimeError:
raise ValueError('This txt file has malformed markup, it cannot be'
' converted by calibre. See https://daringfireball.net/projects/markdown/syntax')
html = self.fix_resources(html, base_dir)
elif options.formatting_type == 'textile':
log.debug('Running text through textile conversion...')
html = convert_textile(txt)
html = self.fix_resources(html, base_dir)
else:
log.debug('Running text through basic conversion...')
flow_size = getattr(options, 'flow_size', 0)
html = convert_basic(txt, epub_split_size_kb=flow_size)
# Run the HTMLized text through the html processing plugin.
from calibre.customize.ui import plugin_for_input_format
html_input = plugin_for_input_format('html')
for opt in html_input.options:
setattr(options, opt.option.name, opt.recommended_value)
options.input_encoding = 'utf-8'
htmlfile = self.shift_file('index.html', html.encode('utf-8'))
odi = options.debug_pipeline
options.debug_pipeline = None
# Generate oeb from html conversion.
oeb = html_input.convert(open(htmlfile, 'rb'), options, 'html', log, {})
options.debug_pipeline = odi
finally:
for x in self.shifted_files:
os.remove(x)
# Set metadata from file.
if input_mi is None:
from calibre.customize.ui import get_file_type_metadata
input_mi = get_file_type_metadata(stream, file_ext)
from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
meta_info_to_oeb_metadata(input_mi, oeb.metadata, log)
self.html_postprocess_title = input_mi.title
return oeb
def postprocess_book(self, oeb, opts, log):
for item in oeb.spine:
if hasattr(item.data, 'xpath'):
for title in item.data.xpath('//*[local-name()="title"]'):
if title.text == _('Unknown'):
title.text = self.html_postprocess_title

View File

@@ -0,0 +1,165 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import os
import shutil
from calibre.customize.conversion import OutputFormatPlugin, \
OptionRecommendation
from calibre.ptempfile import TemporaryDirectory, TemporaryFile
NEWLINE_TYPES = ['system', 'unix', 'old_mac', 'windows']
class TXTOutput(OutputFormatPlugin):
name = 'TXT Output'
author = 'John Schember'
file_type = 'txt'
commit_name = 'txt_output'
ui_data = {
'newline_types': NEWLINE_TYPES,
'formatting_types': {
'plain': _('Plain text'),
'markdown': _('Markdown formatted text'),
'textile': _('TexTile formatted text')
},
}
options = {
OptionRecommendation(name='newline', recommended_value='system',
level=OptionRecommendation.LOW,
short_switch='n', choices=NEWLINE_TYPES,
help=_('Type of newline to use. Options are %s. Default is \'system\'. '
'Use \'old_mac\' for compatibility with Mac OS 9 and earlier. '
'For macOS use \'unix\'. \'system\' will default to the newline '
'type used by this OS.') % sorted(NEWLINE_TYPES)),
OptionRecommendation(name='txt_output_encoding', recommended_value='utf-8',
level=OptionRecommendation.LOW,
help=_('Specify the character encoding of the output document. '
'The default is utf-8.')),
OptionRecommendation(name='inline_toc',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Add Table of Contents to beginning of the book.')),
OptionRecommendation(name='max_line_length',
recommended_value=0, level=OptionRecommendation.LOW,
help=_('The maximum number of characters per line. This splits on '
'the first space before the specified value. If no space is found '
'the line will be broken at the space after and will exceed the '
'specified value. Also, there is a minimum of 25 characters. '
'Use 0 to disable line splitting.')),
OptionRecommendation(name='force_max_line_length',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Force splitting on the max-line-length value when no space '
'is present. Also allows max-line-length to be below the minimum')),
OptionRecommendation(name='txt_output_formatting',
recommended_value='plain',
choices=list(ui_data['formatting_types']),
help=_('Formatting used within the document.\n'
'* plain: {plain}\n'
'* markdown: {markdown}\n'
'* textile: {textile}').format(**ui_data['formatting_types'])),
OptionRecommendation(name='keep_links',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Do not remove links within the document. This is only '
'useful when paired with a txt-output-formatting option that '
'is not none because links are always removed with plain text output.')),
OptionRecommendation(name='keep_image_references',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Do not remove image references within the document. This is only '
'useful when paired with a txt-output-formatting option that '
'is not none because links are always removed with plain text output.')),
OptionRecommendation(name='keep_color',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Do not remove font color from output. This is only useful when '
'txt-output-formatting is set to textile. Textile is the only '
'formatting that supports setting font color. If this option is '
'not specified font color will not be set and default to the '
'color displayed by the reader (generally this is black).')),
}
def convert(self, oeb_book, output_path, input_plugin, opts, log):
from calibre.ebooks.txt.txtml import TXTMLizer
from calibre.utils.cleantext import clean_ascii_chars
from calibre.ebooks.txt.newlines import specified_newlines, TxtNewlines
if opts.txt_output_formatting.lower() == 'markdown':
from calibre.ebooks.txt.markdownml import MarkdownMLizer
self.writer = MarkdownMLizer(log)
elif opts.txt_output_formatting.lower() == 'textile':
from calibre.ebooks.txt.textileml import TextileMLizer
self.writer = TextileMLizer(log)
else:
self.writer = TXTMLizer(log)
txt = self.writer.extract_content(oeb_book, opts)
txt = clean_ascii_chars(txt)
log.debug('\tReplacing newlines with selected type...')
txt = specified_newlines(TxtNewlines(opts.newline).newline, txt)
close = False
if not hasattr(output_path, 'write'):
close = True
if not os.path.exists(os.path.dirname(output_path)) and os.path.dirname(output_path) != '':
os.makedirs(os.path.dirname(output_path))
out_stream = open(output_path, 'wb')
else:
out_stream = output_path
out_stream.seek(0)
out_stream.truncate()
out_stream.write(txt.encode(opts.txt_output_encoding, 'replace'))
if close:
out_stream.close()
class TXTZOutput(TXTOutput):
name = 'TXTZ Output'
author = 'John Schember'
file_type = 'txtz'
def convert(self, oeb_book, output_path, input_plugin, opts, log):
from calibre.ebooks.oeb.base import OEB_IMAGES
from calibre.utils.zipfile import ZipFile
from lxml import etree
with TemporaryDirectory('_txtz_output') as tdir:
# TXT
txt_name = 'index.txt'
if opts.txt_output_formatting.lower() == 'textile':
txt_name = 'index.text'
with TemporaryFile(txt_name) as tf:
TXTOutput.convert(self, oeb_book, tf, input_plugin, opts, log)
shutil.copy(tf, os.path.join(tdir, txt_name))
# Images
for item in oeb_book.manifest:
if item.media_type in OEB_IMAGES:
if hasattr(self.writer, 'images'):
path = os.path.join(tdir, 'images')
if item.href in self.writer.images:
href = self.writer.images[item.href]
else:
continue
else:
path = os.path.join(tdir, os.path.dirname(item.href))
href = os.path.basename(item.href)
if not os.path.exists(path):
os.makedirs(path)
with open(os.path.join(path, href), 'wb') as imgf:
imgf.write(item.data)
# Metadata
with open(os.path.join(tdir, 'metadata.opf'), 'wb') as mdataf:
mdataf.write(etree.tostring(oeb_book.metadata.to_opf1()))
txtz = ZipFile(output_path, 'w')
txtz.add_dir(tdir)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,646 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import functools, re, json
from math import ceil
from calibre import entity_to_unicode, as_unicode
from polyglot.builtins import unicode_type, range
XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>')
SVG_NS = 'http://www.w3.org/2000/svg'
XLINK_NS = 'http://www.w3.org/1999/xlink'
convert_entities = functools.partial(entity_to_unicode,
result_exceptions={
'<' : '&lt;',
'>' : '&gt;',
"'" : '&apos;',
'"' : '&quot;',
'&' : '&amp;',
})
_span_pat = re.compile('<span.*?</span>', re.DOTALL|re.IGNORECASE)
LIGATURES = {
# '\u00c6': 'AE',
# '\u00e6': 'ae',
# '\u0152': 'OE',
# '\u0153': 'oe',
# '\u0132': 'IJ',
# '\u0133': 'ij',
# '\u1D6B': 'ue',
'\uFB00': 'ff',
'\uFB01': 'fi',
'\uFB02': 'fl',
'\uFB03': 'ffi',
'\uFB04': 'ffl',
'\uFB05': 'ft',
'\uFB06': 'st',
}
_ligpat = re.compile('|'.join(LIGATURES))
def sanitize_head(match):
x = match.group(1)
x = _span_pat.sub('', x)
return '<head>\n%s\n</head>' % x
def chap_head(match):
chap = match.group('chap')
title = match.group('title')
if not title:
return '<h1>'+chap+'</h1><br/>\n'
else:
return '<h1>'+chap+'</h1>\n<h3>'+title+'</h3>\n'
def wrap_lines(match):
ital = match.group('ital')
if not ital:
return ' '
else:
return ital+' '
def smarten_punctuation(html, log=None):
from calibre.utils.smartypants import smartyPants
from calibre.ebooks.chardet import substitute_entites
from calibre.ebooks.conversion.utils import HeuristicProcessor
preprocessor = HeuristicProcessor(log=log)
from uuid import uuid4
start = 'calibre-smartypants-'+unicode_type(uuid4())
stop = 'calibre-smartypants-'+unicode_type(uuid4())
html = html.replace('<!--', start)
html = html.replace('-->', stop)
html = preprocessor.fix_nbsp_indents(html)
html = smartyPants(html)
html = html.replace(start, '<!--')
html = html.replace(stop, '-->')
return substitute_entites(html)
class DocAnalysis(object):
'''
Provides various text analysis functions to determine how the document is structured.
format is the type of document analysis will be done against.
raw is the raw text to determine the line length to use for wrapping.
Blank lines are excluded from analysis
'''
def __init__(self, format='html', raw=''):
raw = raw.replace('&nbsp;', ' ')
if format == 'html':
linere = re.compile(r'(?<=<p)(?![^>]*>\s*</p>).*?(?=</p>)', re.DOTALL)
elif format == 'pdf':
linere = re.compile(r'(?<=<br>)(?!\s*<br>).*?(?=<br>)', re.DOTALL)
elif format == 'spanned_html':
linere = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL)
elif format == 'txt':
linere = re.compile('.*?\n')
self.lines = linere.findall(raw)
def line_length(self, percent):
'''
Analyses the document to find the median line length.
percentage is a decimal number, 0 - 1 which is used to determine
how far in the list of line lengths to use. The list of line lengths is
ordered smallest to largest and does not include duplicates. 0.5 is the
median value.
'''
lengths = []
for line in self.lines:
if len(line) > 0:
lengths.append(len(line))
if not lengths:
return 0
lengths = list(set(lengths))
total = sum(lengths)
avg = total / len(lengths)
max_line = ceil(avg * 2)
lengths = sorted(lengths)
for i in range(len(lengths) - 1, -1, -1):
if lengths[i] > max_line:
del lengths[i]
if percent > 1:
percent = 1
if percent < 0:
percent = 0
index = int(len(lengths) * percent) - 1
return lengths[index]
def line_histogram(self, percent):
'''
Creates a broad histogram of the document to determine whether it incorporates hard
line breaks. Lines are sorted into 20 'buckets' based on length.
percent is the percentage of lines that should be in a single bucket to return true
The majority of the lines will exist in 1-2 buckets in typical docs with hard line breaks
'''
minLineLength=20 # Ignore lines under 20 chars (typical of spaces)
maxLineLength=1900 # Discard larger than this to stay in range
buckets=20 # Each line is divided into a bucket based on length
# print("there are "+unicode_type(len(lines))+" lines")
# max = 0
# for line in self.lines:
# l = len(line)
# if l > max:
# max = l
# print("max line found is "+unicode_type(max))
# Build the line length histogram
hRaw = [0 for i in range(0,buckets)]
for line in self.lines:
l = len(line)
if l > minLineLength and l < maxLineLength:
l = int(l // 100)
# print("adding "+unicode_type(l))
hRaw[l]+=1
# Normalize the histogram into percents
totalLines = len(self.lines)
if totalLines > 0:
h = [float(count)/totalLines for count in hRaw]
else:
h = []
# print("\nhRaw histogram lengths are: "+unicode_type(hRaw))
# print(" percents are: "+unicode_type(h)+"\n")
# Find the biggest bucket
maxValue = 0
for i in range(0,len(h)):
if h[i] > maxValue:
maxValue = h[i]
if maxValue < percent:
# print("Line lengths are too variable. Not unwrapping.")
return False
else:
# print(unicode_type(maxValue)+" of the lines were in one bucket")
return True
class Dehyphenator(object):
'''
Analyzes words to determine whether hyphens should be retained/removed. Uses the document
itself is as a dictionary. This method handles all languages along with uncommon, made-up, and
scientific words. The primary disadvantage is that words appearing only once in the document
retain hyphens.
'''
def __init__(self, verbose=0, log=None):
self.log = log
self.verbose = verbose
# Add common suffixes to the regex below to increase the likelihood of a match -
# don't add suffixes which are also complete words, such as 'able' or 'sex'
# only remove if it's not already the point of hyphenation
self.suffix_string = (
"((ed)?ly|'?e?s||a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|"
"(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|m?ents?|ism|ated|"
"(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex|ian)$")
self.suffixes = re.compile(r"^%s" % self.suffix_string, re.IGNORECASE)
self.removesuffixes = re.compile(r"%s" % self.suffix_string, re.IGNORECASE)
# remove prefixes if the prefix was not already the point of hyphenation
self.prefix_string = '^(dis|re|un|in|ex)'
self.prefixes = re.compile(r'%s$' % self.prefix_string, re.IGNORECASE)
self.removeprefix = re.compile(r'%s' % self.prefix_string, re.IGNORECASE)
def dehyphenate(self, match):
firsthalf = match.group('firstpart')
secondhalf = match.group('secondpart')
try:
wraptags = match.group('wraptags')
except:
wraptags = ''
hyphenated = unicode_type(firsthalf) + "-" + unicode_type(secondhalf)
dehyphenated = unicode_type(firsthalf) + unicode_type(secondhalf)
if self.suffixes.match(secondhalf) is None:
lookupword = self.removesuffixes.sub('', dehyphenated)
else:
lookupword = dehyphenated
if len(firsthalf) > 4 and self.prefixes.match(firsthalf) is None:
lookupword = self.removeprefix.sub('', lookupword)
if self.verbose > 2:
self.log("lookup word is: "+lookupword+", orig is: " + hyphenated)
try:
searchresult = self.html.find(lookupword.lower())
except:
return hyphenated
if self.format == 'html_cleanup' or self.format == 'txt_cleanup':
if self.html.find(lookupword) != -1 or searchresult != -1:
if self.verbose > 2:
self.log(" Cleanup:returned dehyphenated word: " + dehyphenated)
return dehyphenated
elif self.html.find(hyphenated) != -1:
if self.verbose > 2:
self.log(" Cleanup:returned hyphenated word: " + hyphenated)
return hyphenated
else:
if self.verbose > 2:
self.log(" Cleanup:returning original text "+firsthalf+" + linefeed "+secondhalf)
return firsthalf+'\u2014'+wraptags+secondhalf
else:
if self.format == 'individual_words' and len(firsthalf) + len(secondhalf) <= 6:
if self.verbose > 2:
self.log("too short, returned hyphenated word: " + hyphenated)
return hyphenated
if len(firsthalf) <= 2 and len(secondhalf) <= 2:
if self.verbose > 2:
self.log("too short, returned hyphenated word: " + hyphenated)
return hyphenated
if self.html.find(lookupword) != -1 or searchresult != -1:
if self.verbose > 2:
self.log(" returned dehyphenated word: " + dehyphenated)
return dehyphenated
else:
if self.verbose > 2:
self.log(" returned hyphenated word: " + hyphenated)
return hyphenated
def __call__(self, html, format, length=1):
self.html = html
self.format = format
if format == 'html':
intextmatch = re.compile((
r'(?<=.{%i})(?P<firstpart>[^\W\-]+)(-|)\s*(?=<)(?P<wraptags>(</span>)?'
r'\s*(</[iubp]>\s*){1,2}(?P<up2threeblanks><(p|div)[^>]*>\s*(<p[^>]*>\s*</p>\s*)'
r'?</(p|div)>\s+){0,3}\s*(<[iubp][^>]*>\s*){1,2}(<span[^>]*>)?)\s*(?P<secondpart>[\w\d]+)') % length)
elif format == 'pdf':
intextmatch = re.compile((
r'(?<=.{%i})(?P<firstpart>[^\W\-]+)(-|)\s*(?P<wraptags><p>|'
r'</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)')% length)
elif format == 'txt':
intextmatch = re.compile(
'(?<=.{%i})(?P<firstpart>[^\\W\\-]+)(-|)(\u0020|\u0009)*(?P<wraptags>(\n(\u0020|\u0009)*)+)(?P<secondpart>[\\w\\d]+)'% length)
elif format == 'individual_words':
intextmatch = re.compile(
r'(?!<)(?P<firstpart>[^\W\-]+)(-|)\s*(?P<secondpart>\w+)(?![^<]*?>)', re.UNICODE)
elif format == 'html_cleanup':
intextmatch = re.compile(
r'(?P<firstpart>[^\W\-]+)(-|)\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>'
r'\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)')
elif format == 'txt_cleanup':
intextmatch = re.compile(
r'(?P<firstpart>[^\W\-]+)(-|)(?P<wraptags>\s+)(?P<secondpart>[\w\d]+)')
html = intextmatch.sub(self.dehyphenate, html)
return html
class CSSPreProcessor(object):
# Remove some of the broken CSS Microsoft products
# create
MS_PAT = re.compile(r'''
(?P<start>^|;|\{)\s* # The end of the previous rule or block start
(%s).+? # The invalid selectors
(?P<end>$|;|\}) # The end of the declaration
'''%'mso-|panose-|text-underline|tab-interval',
re.MULTILINE|re.IGNORECASE|re.VERBOSE)
def ms_sub(self, match):
end = match.group('end')
try:
start = match.group('start')
except:
start = ''
if end == ';':
end = ''
return start + end
def __call__(self, data, add_namespace=False):
from calibre.ebooks.oeb.base import XHTML_CSS_NAMESPACE
data = self.MS_PAT.sub(self.ms_sub, data)
if not add_namespace:
return data
# Remove comments as the following namespace logic will break if there
# are commented lines before the first @import or @charset rule. Since
# the conversion will remove all stylesheets anyway, we don't lose
# anything
data = re.sub(unicode_type(r'/\*.*?\*/'), '', data, flags=re.DOTALL)
ans, namespaced = [], False
for line in data.splitlines():
ll = line.lstrip()
if not (namespaced or ll.startswith('@import') or not ll or
ll.startswith('@charset')):
ans.append(XHTML_CSS_NAMESPACE.strip())
namespaced = True
ans.append(line)
return '\n'.join(ans)
def accent_regex(accent_maps, letter_before=False):
accent_cat = set()
letters = set()
for accent in tuple(accent_maps):
accent_cat.add(accent)
k, v = accent_maps[accent].split(':', 1)
if len(k) != len(v):
raise ValueError('Invalid mapping for: {} -> {}'.format(k, v))
accent_maps[accent] = lmap = dict(zip(k, v))
letters |= set(lmap)
if letter_before:
args = ''.join(letters), ''.join(accent_cat)
accent_group, letter_group = 2, 1
else:
args = ''.join(accent_cat), ''.join(letters)
accent_group, letter_group = 1, 2
pat = re.compile(r'([{}])\s*(?:<br[^>]*>){{0,1}}\s*([{}])'.format(*args), re.UNICODE)
def sub(m):
lmap = accent_maps[m.group(accent_group)]
return lmap.get(m.group(letter_group)) or m.group()
return pat, sub
def html_preprocess_rules():
ans = getattr(html_preprocess_rules, 'ans', None)
if ans is None:
ans = html_preprocess_rules.ans = [
# Remove huge block of contiguous spaces as they slow down
# the following regexes pretty badly
(re.compile(r'\s{10000,}'), ''),
# Some idiotic HTML generators (Frontpage I'm looking at you)
# Put all sorts of crap into <head>. This messes up lxml
(re.compile(r'<head[^>]*>\n*(.*?)\n*</head>', re.IGNORECASE|re.DOTALL),
sanitize_head),
# Convert all entities, since lxml doesn't handle them well
(re.compile(r'&(\S+?);'), convert_entities),
# Remove the <![if/endif tags inserted by everybody's darling, MS Word
(re.compile(r'</{0,1}!\[(end){0,1}if\]{0,1}>', re.IGNORECASE), ''),
]
return ans
def pdftohtml_rules():
ans = getattr(pdftohtml_rules, 'ans', None)
if ans is None:
ans = pdftohtml_rules.ans = [
accent_regex({
'¨': 'aAeEiIoOuU:äÄëËïÏöÖüÜ',
'`': 'aAeEiIoOuU:àÀèÈìÌòÒùÙ',
'´': 'aAcCeEiIlLoOnNrRsSuUzZ:áÁćĆéÉíÍĺĹóÓńŃŕŔśŚúÚźŹ',
'ˆ': 'aAeEiIoOuU:âÂêÊîÎôÔûÛ',
'¸': 'cC:çÇ',
'˛': 'aAeE:ąĄęĘ',
'˙': 'zZ:żŻ',
'ˇ': 'cCdDeElLnNrRsStTzZ:čČďĎěĚľĽňŇřŘšŠťŤžŽ',
'°': 'uU:ůŮ',
}),
accent_regex({'`': 'aAeEiIoOuU:àÀèÈìÌòÒùÙ'}, letter_before=True),
# If pdf printed from a browser then the header/footer has a reliable pattern
(re.compile(r'((?<=</a>)\s*file:/{2,4}[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''),
# Center separator lines
(re.compile(r'<br>\s*(?P<break>([*#•✦=] *){3,})\s*<br>'), lambda match: '<p>\n<p style="text-align:center">' + match.group('break') + '</p>'),
# Remove <hr> tags
(re.compile(r'<hr.*?>', re.IGNORECASE), ''),
# Remove gray background
(re.compile(r'<BODY[^<>]+>'), '<BODY>'),
# Convert line breaks to paragraphs
(re.compile(r'<br[^>]*>\s*'), '</p>\n<p>'),
(re.compile(r'<body[^>]*>\s*'), '<body>\n<p>'),
(re.compile(r'\s*</body>'), '</p>\n</body>'),
# Clean up spaces
(re.compile(r'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), ' '),
# Add space before and after italics
(re.compile(r'(?<!“)<i>'), ' <i>'),
(re.compile(r'</i>(?=\w)'), '</i> '),
]
return ans
def book_designer_rules():
ans = getattr(book_designer_rules, 'ans', None)
if ans is None:
ans = book_designer_rules.ans = [
# HR
(re.compile('<hr>', re.IGNORECASE),
lambda match : '<span style="page-break-after:always"> </span>'),
# Create header tags
(re.compile(r'<h2[^><]*?id=BookTitle[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?</h2>', re.IGNORECASE),
lambda match : '<h1 id="BookTitle" align="%s">%s</h1>'%(match.group(2) if match.group(2) else 'center', match.group(3))),
(re.compile(r'<h2[^><]*?id=BookAuthor[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?</h2>', re.IGNORECASE),
lambda match : '<h2 id="BookAuthor" align="%s">%s</h2>'%(match.group(2) if match.group(2) else 'center', match.group(3))),
(re.compile('<span[^><]*?id=title[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
lambda match : '<h2 class="title">%s</h2>'%(match.group(1),)),
(re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)),
]
return None
class HTMLPreProcessor(object):
def __init__(self, log=None, extra_opts=None, regex_wizard_callback=None):
self.log = log
self.extra_opts = extra_opts
self.regex_wizard_callback = regex_wizard_callback
self.current_href = None
def is_baen(self, src):
return re.compile(r'<meta\s+name="Publisher"\s+content=".*?Baen.*?"',
re.IGNORECASE).search(src) is not None
def is_book_designer(self, raw):
return re.search('<H2[^><]*id=BookTitle', raw) is not None
def is_pdftohtml(self, src):
return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]
def __call__(self, html, remove_special_chars=None,
get_preprocess_html=False):
if remove_special_chars is not None:
html = remove_special_chars.sub('', html)
html = html.replace('\0', '')
is_pdftohtml = self.is_pdftohtml(html)
if self.is_baen(html):
rules = []
elif self.is_book_designer(html):
rules = book_designer_rules()
elif is_pdftohtml:
rules = pdftohtml_rules()
else:
rules = []
start_rules = []
if not getattr(self.extra_opts, 'keep_ligatures', False):
html = _ligpat.sub(lambda m:LIGATURES[m.group()], html)
user_sr_rules = {}
# Function for processing search and replace
def do_search_replace(search_pattern, replace_txt):
from calibre.ebooks.conversion.search_replace import compile_regular_expression
try:
search_re = compile_regular_expression(search_pattern)
if not replace_txt:
replace_txt = ''
rules.insert(0, (search_re, replace_txt))
user_sr_rules[(search_re, replace_txt)] = search_pattern
except Exception as e:
self.log.error('Failed to parse %r regexp because %s' %
(search, as_unicode(e)))
# search / replace using the sr?_search / sr?_replace options
for i in range(1, 4):
search, replace = 'sr%d_search'%i, 'sr%d_replace'%i
search_pattern = getattr(self.extra_opts, search, '')
replace_txt = getattr(self.extra_opts, replace, '')
if search_pattern:
do_search_replace(search_pattern, replace_txt)
# multi-search / replace using the search_replace option
search_replace = getattr(self.extra_opts, 'search_replace', None)
if search_replace:
search_replace = json.loads(search_replace)
for search_pattern, replace_txt in reversed(search_replace):
do_search_replace(search_pattern, replace_txt)
end_rules = []
# delete soft hyphens - moved here so it's executed after header/footer removal
if is_pdftohtml:
# unwrap/delete soft hyphens
end_rules.append((re.compile(
r'[­](</p>\s*<p>\s*)+\s*(?=[\[a-z\d])'), lambda match: ''))
# unwrap/delete soft hyphens with formatting
end_rules.append((re.compile(
r'[­]\s*(</(i|u|b)>)+(</p>\s*<p>\s*)+\s*(<(i|u|b)>)+\s*(?=[\[a-z\d])'), lambda match: ''))
length = -1
if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
docanalysis = DocAnalysis('pdf', html)
length = docanalysis.line_length(getattr(self.extra_opts, 'unwrap_factor'))
if length:
# print("The pdf line length returned is " + unicode_type(length))
# unwrap em/en dashes
end_rules.append((re.compile(
r'(?<=.{%i}[–—])\s*<p>\s*(?=[\[a-z\d])' % length), lambda match: ''))
end_rules.append(
# Un wrap using punctuation
(re.compile((
r'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\\IAß]'
r'|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?'
r'\s*[\w\d$(])') % length, re.UNICODE), wrap_lines),
)
for rule in html_preprocess_rules() + start_rules:
html = rule[0].sub(rule[1], html)
if self.regex_wizard_callback is not None:
self.regex_wizard_callback(self.current_href, html)
if get_preprocess_html:
return html
def dump(raw, where):
import os
dp = getattr(self.extra_opts, 'debug_pipeline', None)
if dp and os.path.exists(dp):
odir = os.path.join(dp, 'input')
if os.path.exists(odir):
odir = os.path.join(odir, where)
if not os.path.exists(odir):
os.makedirs(odir)
name, i = None, 0
while not name or os.path.exists(os.path.join(odir, name)):
i += 1
name = '%04d.html'%i
with open(os.path.join(odir, name), 'wb') as f:
f.write(raw.encode('utf-8'))
# dump(html, 'pre-preprocess')
for rule in rules + end_rules:
try:
html = rule[0].sub(rule[1], html)
except Exception as e:
if rule in user_sr_rules:
self.log.error(
'User supplied search & replace rule: %s -> %s '
'failed with error: %s, ignoring.'%(
user_sr_rules[rule], rule[1], e))
else:
raise
if is_pdftohtml and length > -1:
# Dehyphenate
dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
html = dehyphenator(html,'html', length)
if is_pdftohtml:
from calibre.ebooks.conversion.utils import HeuristicProcessor
pdf_markup = HeuristicProcessor(self.extra_opts, None)
totalwords = 0
if pdf_markup.get_word_count(html) > 7000:
html = pdf_markup.markup_chapters(html, totalwords, True)
# dump(html, 'post-preprocess')
# Handle broken XHTML w/ SVG (ugh)
if 'svg:' in html and SVG_NS not in html:
html = html.replace(
'<html', '<html xmlns:svg="%s"' % SVG_NS, 1)
if 'xlink:' in html and XLINK_NS not in html:
html = html.replace(
'<html', '<html xmlns:xlink="%s"' % XLINK_NS, 1)
html = XMLDECL_RE.sub('', html)
if getattr(self.extra_opts, 'asciiize', False):
from calibre.utils.localization import get_udc
from calibre.utils.mreplace import MReplace
unihandecoder = get_udc()
mr = MReplace(data={'«':'&lt;'*3, '»':'&gt;'*3})
html = mr.mreplace(html)
html = unihandecoder.decode(html)
if getattr(self.extra_opts, 'enable_heuristics', False):
from calibre.ebooks.conversion.utils import HeuristicProcessor
preprocessor = HeuristicProcessor(self.extra_opts, self.log)
html = preprocessor(html)
if is_pdftohtml:
html = html.replace('<!-- created by calibre\'s pdftohtml -->', '')
if getattr(self.extra_opts, 'smarten_punctuation', False):
html = smarten_punctuation(html, self.log)
try:
unsupported_unicode_chars = self.extra_opts.output_profile.unsupported_unicode_chars
except AttributeError:
unsupported_unicode_chars = ''
if unsupported_unicode_chars:
from calibre.utils.localization import get_udc
unihandecoder = get_udc()
for char in unsupported_unicode_chars:
asciichar = unihandecoder.decode(char)
html = html.replace(char, asciichar)
return html

View File

@@ -0,0 +1,881 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import re
from math import ceil
from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
from calibre.utils.logging import default_log
from calibre.utils.wordcount import get_wordcount_obj
from polyglot.builtins import unicode_type
class HeuristicProcessor(object):
def __init__(self, extra_opts=None, log=None):
self.log = default_log if log is None else log
self.html_preprocess_sections = 0
self.found_indents = 0
self.extra_opts = extra_opts
self.deleted_nbsps = False
self.totalwords = 0
self.min_chapters = 1
self.chapters_no_title = 0
self.chapters_with_title = 0
self.blanks_deleted = False
self.blanks_between_paragraphs = False
self.linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
self.blankreg = re.compile(r'\s*(?P<openline><p(?!\sclass=\"(softbreak|whitespace)\")[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
self.anyblank = re.compile(r'\s*(?P<openline><p[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
self.multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>(\s*<div[^>]*>\s*</div>\s*)*){2,}(?!\s*<h\d)', re.IGNORECASE)
self.any_multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>(\s*<div[^>]*>\s*</div>\s*)*){2,}', re.IGNORECASE)
self.line_open = (
r"<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*"
r"(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*")
self.line_close = "(</(?P=inner3)>)?\\s*(</(?P=inner2)>)?\\s*(</(?P=inner1)>)?\\s*</(?P=outer)>"
self.single_blank = re.compile(r'(\s*<(p|div)[^>]*>\s*</(p|div)>)', re.IGNORECASE)
self.scene_break_open = '<p class="scenebreak" style="text-align:center; text-indent:0%; margin-top:1em; margin-bottom:1em; page-break-before:avoid">'
self.common_in_text_endings = '[\"\'—’”,\\.!\\?\\\\)„\\w]'
self.common_in_text_beginnings = '[\\w\'\"“‘‛]'
def is_pdftohtml(self, src):
return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]
def is_abbyy(self, src):
return '<meta name="generator" content="ABBYY FineReader' in src[:1000]
def chapter_head(self, match):
from calibre.utils.html2text import html2text
chap = match.group('chap')
title = match.group('title')
if not title:
self.html_preprocess_sections = self.html_preprocess_sections + 1
self.log.debug("marked " + unicode_type(self.html_preprocess_sections) +
" chapters. - " + unicode_type(chap))
return '<h2>'+chap+'</h2>\n'
else:
delete_whitespace = re.compile('^\\s*(?P<c>.*?)\\s*$')
delete_quotes = re.compile('\'\"')
txt_chap = delete_quotes.sub('', delete_whitespace.sub('\\g<c>', html2text(chap)))
txt_title = delete_quotes.sub('', delete_whitespace.sub('\\g<c>', html2text(title)))
self.html_preprocess_sections = self.html_preprocess_sections + 1
self.log.debug("marked " + unicode_type(self.html_preprocess_sections) +
" chapters & titles. - " + unicode_type(chap) + ", " + unicode_type(title))
return '<h2 title="'+txt_chap+', '+txt_title+'">'+chap+'</h2>\n<h3 class="sigilNotInTOC">'+title+'</h3>\n'
def chapter_break(self, match):
chap = match.group('section')
styles = match.group('styles')
self.html_preprocess_sections = self.html_preprocess_sections + 1
self.log.debug("marked " + unicode_type(self.html_preprocess_sections) +
" section markers based on punctuation. - " + unicode_type(chap))
return '<'+styles+' style="page-break-before:always">'+chap
def analyze_title_matches(self, match):
# chap = match.group('chap')
title = match.group('title')
if not title:
self.chapters_no_title = self.chapters_no_title + 1
else:
self.chapters_with_title = self.chapters_with_title + 1
def insert_indent(self, match):
pstyle = match.group('formatting')
tag = match.group('tagtype')
span = match.group('span')
self.found_indents = self.found_indents + 1
if pstyle:
if pstyle.lower().find('style') != -1:
pstyle = re.sub(r'"$', '; text-indent:3%"', pstyle)
else:
pstyle = pstyle+' style="text-indent:3%"'
if not span:
return '<'+tag+' '+pstyle+'>'
else:
return '<'+tag+' '+pstyle+'>'+span
else:
if not span:
return '<'+tag+' style="text-indent:3%">'
else:
return '<'+tag+' style="text-indent:3%">'+span
def no_markup(self, raw, percent):
'''
Detects total marked up line endings in the file. raw is the text to
inspect. Percent is the minimum percent of line endings which should
be marked up to return true.
'''
htm_end_ere = re.compile('</(p|div)>', re.DOTALL)
line_end_ere = re.compile('(\n|\r|\r\n)', re.DOTALL)
htm_end = htm_end_ere.findall(raw)
line_end = line_end_ere.findall(raw)
tot_htm_ends = len(htm_end)
tot_ln_fds = len(line_end)
# self.log.debug("There are " + unicode_type(tot_ln_fds) + " total Line feeds, and " +
# unicode_type(tot_htm_ends) + " marked up endings")
if percent > 1:
percent = 1
if percent < 0:
percent = 0
min_lns = tot_ln_fds * percent
# self.log.debug("There must be fewer than " + unicode_type(min_lns) + " unmarked lines to add markup")
return min_lns > tot_htm_ends
def dump(self, raw, where):
import os
dp = getattr(self.extra_opts, 'debug_pipeline', None)
if dp and os.path.exists(dp):
odir = os.path.join(dp, 'preprocess')
if not os.path.exists(odir):
os.makedirs(odir)
if os.path.exists(odir):
odir = os.path.join(odir, where)
if not os.path.exists(odir):
os.makedirs(odir)
name, i = None, 0
while not name or os.path.exists(os.path.join(odir, name)):
i += 1
name = '%04d.html'%i
with open(os.path.join(odir, name), 'wb') as f:
f.write(raw.encode('utf-8'))
def get_word_count(self, html):
word_count_text = re.sub(r'(?s)<head[^>]*>.*?</head>', '', html)
word_count_text = re.sub(r'<[^>]*>', '', word_count_text)
wordcount = get_wordcount_obj(word_count_text)
return wordcount.words
def markup_italicis(self, html):
# self.log.debug("\n\n\nitalicize debugging \n\n\n")
ITALICIZE_WORDS = [
'Etc.', 'etc.', 'viz.', 'ie.', 'i.e.', 'Ie.', 'I.e.', 'eg.',
'e.g.', 'Eg.', 'E.g.', 'et al.', 'et cetera', 'n.b.', 'N.b.',
'nota bene', 'Nota bene', 'Ste.', 'Mme.', 'Mdme.',
'Mlle.', 'Mons.', 'PS.', 'PPS.',
]
ITALICIZE_STYLE_PATS = [
unicode_type(r'(?msu)(?<=[\s>"\'])_\*/(?P<words>[^\*_]+)/\*_'),
unicode_type(r'(?msu)(?<=[\s>"\'])~~(?P<words>[^~]+)~~'),
unicode_type(r'(?msu)(?<=[\s>"\'])_/(?P<words>[^/_]+)/_'),
unicode_type(r'(?msu)(?<=[\s>"\'])_\*(?P<words>[^\*_]+)\*_'),
unicode_type(r'(?msu)(?<=[\s>"\'])\*/(?P<words>[^/\*]+)/\*'),
unicode_type(r'(?msu)(?<=[\s>"\'])/:(?P<words>[^:/]+):/'),
unicode_type(r'(?msu)(?<=[\s>"\'])\|:(?P<words>[^:\|]+):\|'),
unicode_type(r'(?msu)(?<=[\s>"\'])\*(?P<words>[^\*]+)\*'),
unicode_type(r'(?msu)(?<=[\s>"\'])~(?P<words>[^~]+)~'),
unicode_type(r'(?msu)(?<=[\s>"\'])/(?P<words>[^/\*><]+)/'),
unicode_type(r'(?msu)(?<=[\s>"\'])_(?P<words>[^_]+)_'),
]
for word in ITALICIZE_WORDS:
html = re.sub(r'(?<=\s|>)' + re.escape(word) + r'(?=\s|<)', '<i>%s</i>' % word, html)
search_text = re.sub(r'(?s)<head[^>]*>.*?</head>', '', html)
search_text = re.sub(r'<[^>]*>', '', search_text)
for pat in ITALICIZE_STYLE_PATS:
for match in re.finditer(pat, search_text):
ital_string = unicode_type(match.group('words'))
# self.log.debug("italicising "+unicode_type(match.group(0))+" with <i>"+ital_string+"</i>")
try:
html = re.sub(re.escape(unicode_type(match.group(0))), '<i>%s</i>' % ital_string, html)
except OverflowError:
# match.group(0) was too large to be compiled into a regex
continue
except re.error:
# the match was not a valid regular expression
continue
return html
def markup_chapters(self, html, wordcount, blanks_between_paragraphs):
'''
Searches for common chapter headings throughout the document
attempts multiple patterns based on likelihood of a match
with minimum false positives. Exits after finding a successful pattern
'''
# Typical chapters are between 2000 and 7000 words, use the larger number to decide the
# minimum of chapters to search for. A max limit is calculated to prevent things like OCR
# or pdf page numbers from being treated as TOC markers
max_chapters = 150
typical_chapters = 7000.
if wordcount > 7000:
if wordcount > 200000:
typical_chapters = 15000.
self.min_chapters = int(ceil(wordcount / typical_chapters))
self.log.debug("minimum chapters required are: "+unicode_type(self.min_chapters))
heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
self.html_preprocess_sections = len(heading.findall(html))
self.log.debug("found " + unicode_type(self.html_preprocess_sections) + " pre-existing headings")
# Build the Regular Expressions in pieces
init_lookahead = "(?=<(p|div))"
chapter_line_open = self.line_open
title_line_open = (r"<(?P<outer2>p|div)[^>]*>\s*(<(?P<inner4>font|span|[ibu])[^>]*>)?"
r"\s*(<(?P<inner5>font|span|[ibu])[^>]*>)?\s*(<(?P<inner6>font|span|[ibu])[^>]*>)?\s*")
chapter_header_open = r"(?P<chap>"
title_header_open = r"(?P<title>"
chapter_header_close = ")\\s*"
title_header_close = ")"
chapter_line_close = self.line_close
title_line_close = "(</(?P=inner6)>)?\\s*(</(?P=inner5)>)?\\s*(</(?P=inner4)>)?\\s*</(?P=outer2)>"
is_pdftohtml = self.is_pdftohtml(html)
if is_pdftohtml:
title_line_open = "<(?P<outer2>p)[^>]*>\\s*"
title_line_close = "\\s*</(?P=outer2)>"
if blanks_between_paragraphs:
blank_lines = "(\\s*<p[^>]*>\\s*</p>){0,2}\\s*"
else:
blank_lines = ""
opt_title_open = "("
opt_title_close = ")?"
n_lookahead_open = "(?!\\s*"
n_lookahead_close = ")\\s*"
default_title = r"(<[ibu][^>]*>)?\s{0,3}(?!Chapter)([\w\:\'\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)"
simple_title = r"(<[ibu][^>]*>)?\s{0,3}(?!(Chapter|\s+<)).{0,65}?(</[ibu][^>]*>)?(?=<)"
analysis_result = []
chapter_types = [
[(
r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Epilogue|CHAPTER|Kapitel|Volume\b|Prologue|Book\b|Part\b|Dedication|Preface)"
r"\s*([\d\w-]+\:?\'?\s*){0,5}"), True, True, True, False, "Searching for common section headings", 'common'],
# Highest frequency headings which include titles
[r"[^'\"]?(CHAPTER|Kapitel)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, True, True, False, "Searching for most common chapter headings", 'chapter'],
[r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•=]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)(</span>)?\s*</b>",
True, True, True, False, "Searching for emphasized lines", 'emphasized'], # Emphasized lines
[r"[^'\"]?(\d+(\.|:))\s*([\w\-\'\"#,]+\s*){0,7}\s*", True, True, True, False,
"Searching for numeric chapter headings", 'numeric'], # Numeric Chapters
[r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, True, True, False, "Searching for letter spaced headings", 'letter_spaced'], # Spaced Lettering
[r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, True, True, False,
"Searching for numeric chapters with titles", 'numeric_title'], # Numeric Titles
[r"[^'\"]?(\d+)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, True, True, False,
"Searching for simple numeric headings", 'plain_number'], # Numeric Chapters, no dot or colon
[r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, True, False, False,
"Searching for chapters with Uppercase Characters", 'uppercase'] # Uppercase Chapters
]
def recurse_patterns(html, analyze):
# Start with most typical chapter headings, get more aggressive until one works
for [chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name] in chapter_types:
n_lookahead = ''
hits = 0
self.chapters_no_title = 0
self.chapters_with_title = 0
if n_lookahead_req:
lp_n_lookahead_open = n_lookahead_open
lp_n_lookahead_close = n_lookahead_close
else:
lp_n_lookahead_open = ''
lp_n_lookahead_close = ''
if strict_title:
lp_title = default_title
else:
lp_title = simple_title
if ignorecase:
arg_ignorecase = r'(?i)'
else:
arg_ignorecase = ''
if title_req:
lp_opt_title_open = ''
lp_opt_title_close = ''
else:
lp_opt_title_open = opt_title_open
lp_opt_title_close = opt_title_close
if self.html_preprocess_sections >= self.min_chapters:
break
full_chapter_line = chapter_line_open+chapter_header_open+chapter_type+chapter_header_close+chapter_line_close
if n_lookahead_req:
n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
if not analyze:
self.log.debug("Marked " + unicode_type(self.html_preprocess_sections) + " headings, " + log_message)
chapter_marker = arg_ignorecase+init_lookahead+full_chapter_line+blank_lines+lp_n_lookahead_open+n_lookahead+lp_n_lookahead_close+ \
lp_opt_title_open+title_line_open+title_header_open+lp_title+title_header_close+title_line_close+lp_opt_title_close
chapdetect = re.compile(r'%s' % chapter_marker)
if analyze:
hits = len(chapdetect.findall(html))
if hits:
chapdetect.sub(self.analyze_title_matches, html)
if float(self.chapters_with_title) / float(hits) > .5:
title_req = True
strict_title = False
self.log.debug(
unicode_type(type_name)+" had "+unicode_type(hits)+
" hits - "+unicode_type(self.chapters_no_title)+" chapters with no title, "+
unicode_type(self.chapters_with_title)+" chapters with titles, "+
unicode_type(float(self.chapters_with_title) / float(hits))+" percent. ")
if type_name == 'common':
analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name])
elif self.min_chapters <= hits < max_chapters or self.min_chapters < 3 > hits:
analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name])
break
else:
html = chapdetect.sub(self.chapter_head, html)
return html
recurse_patterns(html, True)
chapter_types = analysis_result
html = recurse_patterns(html, False)
words_per_chptr = wordcount
if words_per_chptr > 0 and self.html_preprocess_sections > 0:
words_per_chptr = wordcount // self.html_preprocess_sections
self.log.debug("Total wordcount is: "+ unicode_type(wordcount)+", Average words per section is: "+
unicode_type(words_per_chptr)+", Marked up "+unicode_type(self.html_preprocess_sections)+" chapters")
return html
def punctuation_unwrap(self, length, content, format):
'''
Unwraps lines based on line length and punctuation
supports a range of html markup and text files
the lookahead regex below is meant look for any non-full stop characters - punctuation
characters which can be used as a full stop should *not* be added below - e.g. ?!“”. etc
the reason for this is to prevent false positive wrapping. False positives are more
difficult to detect than false negatives during a manual review of the doc
This function intentionally leaves hyphenated content alone as that is handled by the
dehyphenate routine in a separate step
'''
def style_unwrap(match):
style_close = match.group('style_close')
style_open = match.group('style_open')
if style_open and style_close:
return style_close+' '+style_open
elif style_open and not style_close:
return ' '+style_open
elif not style_open and style_close:
return style_close+' '
else:
return ' '
# define the pieces of the regex
# (?<!\&\w{4});) is a semicolon not part of an entity
lookahead = "(?<=.{"+unicode_type(length)+r"}([a-zა-ჰäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\\IAß]|(?<!\&\w{4});))"
em_en_lookahead = "(?<=.{"+unicode_type(length)+"}[\u2013\u2014])"
soft_hyphen = "\xad"
line_ending = "\\s*(?P<style_close></(span|[iub])>)?\\s*(</(p|div)>)?"
blanklines = "\\s*(?P<up2threeblanks><(p|span|div)[^>]*>\\s*(<(p|span|div)[^>]*>\\s*</(span|p|div)>\\s*)</(span|p|div)>\\s*){0,3}\\s*"
line_opening = "<(p|div)[^>]*>\\s*(?P<style_open><(span|[iub])[^>]*>)?\\s*"
txt_line_wrap = "((\u0020|\u0009)*\n){1,4}"
if format == 'txt':
unwrap_regex = lookahead+txt_line_wrap
em_en_unwrap_regex = em_en_lookahead+txt_line_wrap
shy_unwrap_regex = soft_hyphen+txt_line_wrap
else:
unwrap_regex = lookahead+line_ending+blanklines+line_opening
em_en_unwrap_regex = em_en_lookahead+line_ending+blanklines+line_opening
shy_unwrap_regex = soft_hyphen+line_ending+blanklines+line_opening
unwrap = re.compile("%s" % unwrap_regex, re.UNICODE)
em_en_unwrap = re.compile("%s" % em_en_unwrap_regex, re.UNICODE)
shy_unwrap = re.compile("%s" % shy_unwrap_regex, re.UNICODE)
if format == 'txt':
content = unwrap.sub(' ', content)
content = em_en_unwrap.sub('', content)
content = shy_unwrap.sub('', content)
else:
content = unwrap.sub(style_unwrap, content)
content = em_en_unwrap.sub(style_unwrap, content)
content = shy_unwrap.sub(style_unwrap, content)
return content
def txt_process(self, match):
from calibre.ebooks.txt.processor import convert_basic, separate_paragraphs_single_line
content = match.group('text')
content = separate_paragraphs_single_line(content)
content = convert_basic(content, epub_split_size_kb=0)
return content
def markup_pre(self, html):
pre = re.compile(r'<pre>', re.IGNORECASE)
if len(pre.findall(html)) >= 1:
self.log.debug("Running Text Processing")
outerhtml = re.compile(r'.*?(?<=<pre>)(?P<text>.*?)</pre>', re.IGNORECASE|re.DOTALL)
html = outerhtml.sub(self.txt_process, html)
from calibre.ebooks.conversion.preprocess import convert_entities
html = re.sub(r'&(\S+?);', convert_entities, html)
else:
# Add markup naively
# TODO - find out if there are cases where there are more than one <pre> tag or
# other types of unmarked html and handle them in some better fashion
add_markup = re.compile('(?<!>)(\n)')
html = add_markup.sub('</p>\n<p>', html)
return html
def arrange_htm_line_endings(self, html):
html = re.sub(r"\s*</(?P<tag>p|div)>", "</"+"\\g<tag>"+">\n", html)
html = re.sub(r"\s*<(?P<tag>p|div)(?P<style>[^>]*)>\s*", "\n<"+"\\g<tag>"+"\\g<style>"+">", html)
return html
def fix_nbsp_indents(self, html):
txtindent = re.compile(unicode_type(r'<(?P<tagtype>p|div)(?P<formatting>[^>]*)>\s*(?P<span>(<span[^>]*>\s*)+)?\s*(\u00a0){2,}'), re.IGNORECASE)
html = txtindent.sub(self.insert_indent, html)
if self.found_indents > 1:
self.log.debug("replaced "+unicode_type(self.found_indents)+ " nbsp indents with inline styles")
return html
def cleanup_markup(self, html):
# remove remaining non-breaking spaces
html = re.sub(unicode_type(r'\u00a0'), ' ', html)
# Get rid of various common microsoft specific tags which can cause issues later
# Get rid of empty <o:p> tags to simplify other processing
html = re.sub(unicode_type(r'\s*<o:p>\s*</o:p>'), ' ', html)
# Delete microsoft 'smart' tags
html = re.sub('(?i)</?st1:\\w+>', '', html)
# Re-open self closing paragraph tags
html = re.sub('<p[^>/]*/>', '<p> </p>', html)
# Get rid of empty span, bold, font, em, & italics tags
fmt_tags = 'font|[ibu]|em|strong'
open_fmt_pat, close_fmt_pat = r'<(?:{})(?:\s[^>]*)?>'.format(fmt_tags), '</(?:{})>'.format(fmt_tags)
for i in range(2):
html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>){0,2}\s*</span>\s*", " ", html)
html = re.sub(
r"\s*{open}\s*({open}\s*{close}\s*){{0,2}}\s*{close}".format(open=open_fmt_pat, close=close_fmt_pat) , " ", html)
# delete surrounding divs from empty paragraphs
html = re.sub('<div[^>]*>\\s*<p[^>]*>\\s*</p>\\s*</div>', '<p> </p>', html)
# Empty heading tags
html = re.sub(r'(?i)<h\d+>\s*</h\d+>', '', html)
self.deleted_nbsps = True
return html
def analyze_line_endings(self, html):
'''
determines the type of html line ending used most commonly in a document
use before calling docanalysis functions
'''
paras_reg = re.compile('<p[^>]*>', re.IGNORECASE)
spans_reg = re.compile('<span[^>]*>', re.IGNORECASE)
paras = len(paras_reg.findall(html))
spans = len(spans_reg.findall(html))
if spans > 1:
if float(paras) / float(spans) < 0.75:
return 'spanned_html'
else:
return 'html'
else:
return 'html'
def analyze_blanks(self, html):
blanklines = self.blankreg.findall(html)
lines = self.linereg.findall(html)
if len(lines) > 1:
self.log.debug("There are " + unicode_type(len(blanklines)) + " blank lines. " +
unicode_type(float(len(blanklines)) / float(len(lines))) + " percent blank")
if float(len(blanklines)) / float(len(lines)) > 0.40:
return True
else:
return False
def cleanup_required(self):
for option in ['unwrap_lines', 'markup_chapter_headings', 'format_scene_breaks', 'delete_blank_paragraphs']:
if getattr(self.extra_opts, option, False):
return True
return False
def merge_blanks(self, html, blanks_count=None):
base_em = .5 # Baseline is 1.5em per blank line, 1st line is .5 em css and 1em for the nbsp
em_per_line = 1.5 # Add another 1.5 em for each additional blank
def merge_matches(match):
to_merge = match.group(0)
lines = float(len(self.single_blank.findall(to_merge))) - 1.
em = base_em + (em_per_line * lines)
if to_merge.find('whitespace'):
newline = self.any_multi_blank.sub('\n<p class="whitespace'+unicode_type(int(em * 10))+
'" style="text-align:center; margin-top:'+unicode_type(em)+'em"> </p>', match.group(0))
else:
newline = self.any_multi_blank.sub('\n<p class="softbreak'+unicode_type(int(em * 10))+
'" style="text-align:center; margin-top:'+unicode_type(em)+'em"> </p>', match.group(0))
return newline
html = self.any_multi_blank.sub(merge_matches, html)
return html
def detect_whitespace(self, html):
blanks_around_headings = re.compile(
r'(?P<initparas>(<(p|div)[^>]*>\s*</(p|div)>\s*){1,}\s*)?'
r'(?P<content><h(?P<hnum>\d+)[^>]*>.*?</h(?P=hnum)>)(?P<endparas>\s*(<(p|div)[^>]*>\s*</(p|div)>\s*){1,})?', re.IGNORECASE|re.DOTALL)
blanks_around_scene_breaks = re.compile(
r'(?P<initparas>(<(p|div)[^>]*>\s*</(p|div)>\s*){1,}\s*)?'
r'(?P<content><p class="scenebreak"[^>]*>.*?</p>)(?P<endparas>\s*(<(p|div)[^>]*>\s*</(p|div)>\s*){1,})?', re.IGNORECASE|re.DOTALL)
blanks_n_nopunct = re.compile(
r'(?P<initparas>(<p[^>]*>\s*</p>\s*){1,}\s*)?<p[^>]*>\s*(<(span|[ibu]|em|strong|font)[^>]*>\s*)*'
r'.{1,100}?[^\W](</(span|[ibu]|em|strong|font)>\s*)*</p>(?P<endparas>\s*(<p[^>]*>\s*</p>\s*){1,})?', re.IGNORECASE|re.DOTALL)
def merge_header_whitespace(match):
initblanks = match.group('initparas')
endblanks = match.group('endparas')
content = match.group('content')
top_margin = ''
bottom_margin = ''
if initblanks is not None:
top_margin = 'margin-top:'+unicode_type(len(self.single_blank.findall(initblanks)))+'em;'
if endblanks is not None:
bottom_margin = 'margin-bottom:'+unicode_type(len(self.single_blank.findall(endblanks)))+'em;'
if initblanks is None and endblanks is None:
return content
elif content.find('scenebreak') != -1:
return content
else:
content = re.sub('(?i)<h(?P<hnum>\\d+)[^>]*>', '\n\n<h'+'\\g<hnum>'+' style="'+top_margin+bottom_margin+'">', content)
return content
html = blanks_around_headings.sub(merge_header_whitespace, html)
html = blanks_around_scene_breaks.sub(merge_header_whitespace, html)
def markup_whitespaces(match):
blanks = match.group(0)
blanks = self.blankreg.sub('\n<p class="whitespace" style="text-align:center; margin-top:0em; margin-bottom:0em"> </p>', blanks)
return blanks
html = blanks_n_nopunct.sub(markup_whitespaces, html)
if self.html_preprocess_sections > self.min_chapters:
html = re.sub('(?si)^.*?(?=<h\\d)', markup_whitespaces, html)
return html
def detect_soft_breaks(self, html):
line = '(?P<initline>'+self.line_open+'\\s*(?P<init_content>.*?)'+self.line_close+')'
line_two = '(?P<line_two>'+re.sub('(ou|in|cha)', 'linetwo_', self.line_open)+ \
'\\s*(?P<line_two_content>.*?)'+re.sub('(ou|in|cha)', 'linetwo_', self.line_close)+')'
div_break_candidate_pattern = line+'\\s*<div[^>]*>\\s*</div>\\s*'+line_two
div_break_candidate = re.compile(r'%s' % div_break_candidate_pattern, re.IGNORECASE|re.UNICODE)
def convert_div_softbreaks(match):
init_is_paragraph = self.check_paragraph(match.group('init_content'))
line_two_is_paragraph = self.check_paragraph(match.group('line_two_content'))
if init_is_paragraph and line_two_is_paragraph:
return (match.group('initline')+
'\n<p class="softbreak" style="margin-top:.5em; page-break-before:avoid; text-align:center"> </p>\n'+
match.group('line_two'))
else:
return match.group(0)
html = div_break_candidate.sub(convert_div_softbreaks, html)
if not self.blanks_deleted and self.blanks_between_paragraphs:
html = self.multi_blank.sub('\n<p class="softbreak" style="margin-top:1em; page-break-before:avoid; text-align:center"> </p>', html)
else:
html = self.blankreg.sub('\n<p class="softbreak" style="margin-top:.5em; page-break-before:avoid; text-align:center"> </p>', html)
return html
def detect_scene_breaks(self, html):
scene_break_regex = self.line_open+'(?!('+self.common_in_text_beginnings+'|.*?'+self.common_in_text_endings+ \
'<))(?P<break>((?P<break_char>((?!\\s)\\W))\\s*(?P=break_char)?)+)\\s*'+self.line_close
scene_breaks = re.compile(r'%s' % scene_break_regex, re.IGNORECASE|re.UNICODE)
html = scene_breaks.sub(self.scene_break_open+'\\g<break>'+'</p>', html)
return html
def markup_user_break(self, replacement_break):
'''
Takes string a user supplies and wraps it in markup that will be centered with
appropriate margins. <hr> and <img> tags are allowed. If the user specifies
a style with width attributes in the <hr> tag then the appropriate margins are
applied to wrapping divs. This is because many ebook devices don't support margin:auto
All other html is converted to text.
'''
hr_open = '<div id="scenebreak" style="margin-left: 45%; margin-right: 45%; margin-top:1.5em; margin-bottom:1.5em; page-break-before:avoid">'
if re.findall('(<|>)', replacement_break):
if re.match('^<hr', replacement_break):
if replacement_break.find('width') != -1:
try:
width = int(re.sub('.*?width(:|=)(?P<wnum>\\d+).*', '\\g<wnum>', replacement_break))
except:
scene_break = hr_open+'<hr style="height: 3px; background:#505050" /></div>'
self.log.warn('Invalid replacement scene break'
' expression, using default')
else:
replacement_break = re.sub('(?i)(width=\\d+\\%?|width:\\s*\\d+(\\%|px|pt|em)?;?)', '', replacement_break)
divpercent = (100 - width) // 2
hr_open = re.sub('45', unicode_type(divpercent), hr_open)
scene_break = hr_open+replacement_break+'</div>'
else:
scene_break = hr_open+'<hr style="height: 3px; background:#505050" /></div>'
elif re.match('^<img', replacement_break):
scene_break = self.scene_break_open+replacement_break+'</p>'
else:
from calibre.utils.html2text import html2text
replacement_break = html2text(replacement_break)
replacement_break = re.sub('\\s', '&nbsp;', replacement_break)
scene_break = self.scene_break_open+replacement_break+'</p>'
else:
replacement_break = re.sub('\\s', '&nbsp;', replacement_break)
scene_break = self.scene_break_open+replacement_break+'</p>'
return scene_break
def check_paragraph(self, content):
content = re.sub('\\s*</?span[^>]*>\\s*', '', content)
if re.match('.*[\"\'.!?:]$', content):
# print "detected this as a paragraph"
return True
else:
return False
def abbyy_processor(self, html):
abbyy_line = re.compile('((?P<linestart><p\\sstyle="(?P<styles>[^\"]*?);?">)(?P<content>.*?)(?P<lineend></p>)|(?P<image><img[^>]*>))', re.IGNORECASE)
empty_paragraph = '\n<p> </p>\n'
self.in_blockquote = False
self.previous_was_paragraph = False
html = re.sub('</?a[^>]*>', '', html)
def convert_styles(match):
# print "raw styles are: "+match.group('styles')
content = match.group('content')
# print "raw content is: "+match.group('content')
image = match.group('image')
is_paragraph = False
text_align = ''
text_indent = ''
paragraph_before = ''
paragraph_after = ''
blockquote_open = '\n<blockquote>\n'
blockquote_close = '</blockquote>\n'
indented_text = 'text-indent:3%;'
blockquote_open_loop = ''
blockquote_close_loop = ''
debugabby = False
if image:
debugabby = True
if self.in_blockquote:
self.in_blockquote = False
blockquote_close_loop = blockquote_close
self.previous_was_paragraph = False
return blockquote_close_loop+'\n'+image+'\n'
else:
styles = match.group('styles').split(';')
is_paragraph = self.check_paragraph(content)
# print "styles for this line are: "+unicode_type(styles)
split_styles = []
for style in styles:
# print "style is: "+unicode_type(style)
newstyle = style.split(':')
# print "newstyle is: "+unicode_type(newstyle)
split_styles.append(newstyle)
styles = split_styles
for style, setting in styles:
if style == 'text-align' and setting != 'left':
text_align = style+':'+setting+';'
if style == 'text-indent':
setting = int(re.sub('\\s*pt\\s*', '', setting))
if 9 < setting < 14:
text_indent = indented_text
else:
text_indent = style+':'+unicode_type(setting)+'pt;'
if style == 'padding':
setting = re.sub('pt', '', setting).split(' ')
if int(setting[1]) < 16 and int(setting[3]) < 16:
if self.in_blockquote:
debugabby = True
if is_paragraph:
self.in_blockquote = False
blockquote_close_loop = blockquote_close
if int(setting[3]) > 8 and text_indent == '':
text_indent = indented_text
if int(setting[0]) > 5:
paragraph_before = empty_paragraph
if int(setting[2]) > 5:
paragraph_after = empty_paragraph
elif not self.in_blockquote and self.previous_was_paragraph:
debugabby = True
self.in_blockquote = True
blockquote_open_loop = blockquote_open
if debugabby:
self.log.debug('\n\n******\n')
self.log.debug('padding top is: '+unicode_type(setting[0]))
self.log.debug('padding right is:' +unicode_type(setting[1]))
self.log.debug('padding bottom is: ' + unicode_type(setting[2]))
self.log.debug('padding left is: ' +unicode_type(setting[3]))
# print "text-align is: "+unicode_type(text_align)
# print "\n***\nline is:\n "+unicode_type(match.group(0))+'\n'
if debugabby:
# print "this line is a paragraph = "+unicode_type(is_paragraph)+", previous line was "+unicode_type(self.previous_was_paragraph)
self.log.debug("styles for this line were:", styles)
self.log.debug('newline is:')
self.log.debug(blockquote_open_loop+blockquote_close_loop+
paragraph_before+'<p style="'+text_indent+text_align+
'">'+content+'</p>'+paragraph_after+'\n\n\n\n\n')
# print "is_paragraph is "+unicode_type(is_paragraph)+", previous_was_paragraph is "+unicode_type(self.previous_was_paragraph)
self.previous_was_paragraph = is_paragraph
# print "previous_was_paragraph is now set to "+unicode_type(self.previous_was_paragraph)+"\n\n\n"
return blockquote_open_loop+blockquote_close_loop+paragraph_before+'<p style="'+text_indent+text_align+'">'+content+'</p>'+paragraph_after
html = abbyy_line.sub(convert_styles, html)
return html
def __call__(self, html):
self.log.debug("********* Heuristic processing HTML *********")
# Count the words in the document to estimate how many chapters to look for and whether
# other types of processing are attempted
try:
self.totalwords = self.get_word_count(html)
except:
self.log.warn("Can't get wordcount")
if self.totalwords < 50:
self.log.warn("flow is too short, not running heuristics")
return html
is_abbyy = self.is_abbyy(html)
if is_abbyy:
html = self.abbyy_processor(html)
# Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
html = self.arrange_htm_line_endings(html)
# self.dump(html, 'after_arrange_line_endings')
if self.cleanup_required():
# ##### Check Markup ######
#
# some lit files don't have any <p> tags or equivalent (generally just plain text between
# <pre> tags), check and mark up line endings if required before proceeding
# fix indents must run after this step
if self.no_markup(html, 0.1):
self.log.debug("not enough paragraph markers, adding now")
# markup using text processing
html = self.markup_pre(html)
# Replace series of non-breaking spaces with text-indent
if getattr(self.extra_opts, 'fix_indents', False):
html = self.fix_nbsp_indents(html)
if self.cleanup_required():
# fix indents must run before this step, as it removes non-breaking spaces
html = self.cleanup_markup(html)
is_pdftohtml = self.is_pdftohtml(html)
if is_pdftohtml:
self.line_open = "<(?P<outer>p)[^>]*>(\\s*<[ibu][^>]*>)?\\s*"
self.line_close = "\\s*(</[ibu][^>]*>\\s*)?</(?P=outer)>"
# ADE doesn't render <br />, change to empty paragraphs
# html = re.sub('<br[^>]*>', u'<p>\u00a0</p>', html)
# Determine whether the document uses interleaved blank lines
self.blanks_between_paragraphs = self.analyze_blanks(html)
# detect chapters/sections to match xpath or splitting logic
if getattr(self.extra_opts, 'markup_chapter_headings', False):
html = self.markup_chapters(html, self.totalwords, self.blanks_between_paragraphs)
# self.dump(html, 'after_chapter_markup')
if getattr(self.extra_opts, 'italicize_common_cases', False):
html = self.markup_italicis(html)
# If more than 40% of the lines are empty paragraphs and the user has enabled delete
# blank paragraphs then delete blank lines to clean up spacing
if self.blanks_between_paragraphs and getattr(self.extra_opts, 'delete_blank_paragraphs', False):
self.log.debug("deleting blank lines")
self.blanks_deleted = True
html = self.multi_blank.sub('\n<p class="softbreak" style="margin-top:.5em; page-break-before:avoid; text-align:center"> </p>', html)
html = self.blankreg.sub('', html)
# Determine line ending type
# Some OCR sourced files have line breaks in the html using a combination of span & p tags
# span are used for hard line breaks, p for new paragraphs. Determine which is used so
# that lines can be un-wrapped across page boundaries
format = self.analyze_line_endings(html)
# Check Line histogram to determine if the document uses hard line breaks, If 50% or
# more of the lines break in the same region of the document then unwrapping is required
docanalysis = DocAnalysis(format, html)
hardbreaks = docanalysis.line_histogram(.50)
self.log.debug("Hard line breaks check returned "+unicode_type(hardbreaks))
# Calculate Length
unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4)
length = docanalysis.line_length(unwrap_factor)
self.log.debug("Median line length is " + unicode_type(length) + ", calculated with " + format + " format")
# ##### Unwrap lines ######
if getattr(self.extra_opts, 'unwrap_lines', False):
# only go through unwrapping code if the histogram shows unwrapping is required or if the user decreased the default unwrap_factor
if hardbreaks or unwrap_factor < 0.4:
self.log.debug("Unwrapping required, unwrapping Lines")
# Dehyphenate with line length limiters
dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
html = dehyphenator(html,'html', length)
html = self.punctuation_unwrap(length, html, 'html')
if getattr(self.extra_opts, 'dehyphenate', False):
# dehyphenate in cleanup mode to fix anything previous conversions/editing missed
self.log.debug("Fixing hyphenated content")
dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
html = dehyphenator(html,'html_cleanup', length)
html = dehyphenator(html, 'individual_words', length)
# If still no sections after unwrapping mark split points on lines with no punctuation
if self.html_preprocess_sections < self.min_chapters and getattr(self.extra_opts, 'markup_chapter_headings', False):
self.log.debug("Looking for more split points based on punctuation,"
" currently have " + unicode_type(self.html_preprocess_sections))
chapdetect3 = re.compile(
r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(?!([\W]+\s*)+)'
r'(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*'
r'.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*'
r'(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
html = chapdetect3.sub(self.chapter_break, html)
if getattr(self.extra_opts, 'renumber_headings', False):
# search for places where a first or second level heading is immediately followed by another
# top level heading. demote the second heading to h3 to prevent splitting between chapter
# headings and titles, images, etc
doubleheading = re.compile(
r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
html = doubleheading.sub('\\g<firsthead>'+'\n<h3'+'\\g<secondhead>'+'</h3>', html)
# If scene break formatting is enabled, find all blank paragraphs that definitely aren't scenebreaks,
# style it with the 'whitespace' class. All remaining blank lines are styled as softbreaks.
# Multiple sequential blank paragraphs are merged with appropriate margins
# If non-blank scene breaks exist they are center aligned and styled with appropriate margins.
if getattr(self.extra_opts, 'format_scene_breaks', False):
self.log.debug('Formatting scene breaks')
html = re.sub('(?i)<div[^>]*>\\s*<br(\\s?/)?>\\s*</div>', '<p></p>', html)
html = self.detect_scene_breaks(html)
html = self.detect_whitespace(html)
html = self.detect_soft_breaks(html)
blanks_count = len(self.any_multi_blank.findall(html))
if blanks_count >= 1:
html = self.merge_blanks(html, blanks_count)
detected_scene_break = re.compile(r'<p class="scenebreak"[^>]*>.*?</p>')
scene_break_count = len(detected_scene_break.findall(html))
# If the user has enabled scene break replacement, then either softbreaks
# or 'hard' scene breaks are replaced, depending on which is in use
# Otherwise separator lines are centered, use a bit larger margin in this case
replacement_break = getattr(self.extra_opts, 'replace_scene_breaks', None)
if replacement_break:
replacement_break = self.markup_user_break(replacement_break)
if scene_break_count >= 1:
html = detected_scene_break.sub(replacement_break, html)
html = re.sub('<p\\s+class="softbreak"[^>]*>\\s*</p>', replacement_break, html)
else:
html = re.sub('<p\\s+class="softbreak"[^>]*>\\s*</p>', replacement_break, html)
if self.deleted_nbsps:
# put back non-breaking spaces in empty paragraphs so they render correctly
html = self.anyblank.sub('\n'+r'\g<openline>'+'\u00a0'+r'\g<closeline>', html)
return html