mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-03-15 22:23:42 +01:00
Initial import
This commit is contained in:
30
ebook_converter/ebooks/conversion/__init__.py
Normal file
30
ebook_converter/ebooks/conversion/__init__.py
Normal file
@@ -0,0 +1,30 @@
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
from polyglot.builtins import native_string_type
|
||||
|
||||
|
||||
class ConversionUserFeedBack(Exception):
    """Exception carrying a short message intended for the user.

    The exception's single argument is a JSON object with the keys ``msg``,
    ``level``, ``det_msg`` and ``title``; the same values are also stored as
    attributes on the instance.
    """

    def __init__(self, title, msg, level='info', det_msg=''):
        """Show a simple message to the user.

        :param title: The title (very short description)
        :param msg: The message to show the user
        :param level: Expected to be one of 'info', 'warn' or 'error'
                      (not validated here)
        :param det_msg: Optional detailed message to show the user
        """
        import json

        # Key order is kept stable because it determines the serialized text.
        payload = {'msg': msg, 'level': level,
                   'det_msg': det_msg, 'title': title}
        Exception.__init__(self, json.dumps(payload))
        self.title = title
        self.msg = msg
        self.det_msg = det_msg
        self.level = level
|
||||
|
||||
|
||||
# The GUI detects this exception by its fully qualified name, so make the
# class report that name.
ConversionUserFeedBack.__name__ = native_string_type(
    'calibre.ebooks.conversion.ConversionUserFeedBack')
|
||||
428
ebook_converter/ebooks/conversion/cli.py
Normal file
428
ebook_converter/ebooks/conversion/cli.py
Normal file
@@ -0,0 +1,428 @@
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
'''
|
||||
Command line interface to conversion sub-system
|
||||
'''
|
||||
|
||||
import sys, os, numbers
|
||||
from optparse import OptionGroup, Option
|
||||
from collections import OrderedDict
|
||||
|
||||
from calibre.utils.config import OptionParser
|
||||
from calibre.utils.logging import Log
|
||||
from calibre.customize.conversion import OptionRecommendation
|
||||
from calibre import patheq
|
||||
from calibre.ebooks.conversion import ConversionUserFeedBack
|
||||
from calibre.utils.localization import localize_user_manual_link
|
||||
from polyglot.builtins import iteritems
|
||||
|
||||
# Usage text handed to the option parser: program-name placeholder, the
# translated description, then the (localized) link to the conversion manual.
USAGE = (
    '%prog '
    + _('''\
input_file output_file [options]

Convert an e-book from one format to another.

input_file is the input and output_file is the output. Both must be \
specified as the first two arguments to the command.

The output e-book format is guessed from the file extension of \
output_file. output_file can also be of the special format .EXT where \
EXT is the output file extension. In this case, the name of the output \
file is derived from the name of the input file. Note that the filenames must \
not start with a hyphen. Finally, if output_file has no extension, then \
it is treated as a directory and an "open e-book" (OEB) consisting of HTML \
files is written to that directory. These files are the files that would \
normally have been passed to the output plugin.

After specifying the input \
and output file you can customize the conversion by specifying various \
options. The available options depend on the input and output file types. \
To get help on them specify the input and output file and then use the -h \
option.

For full documentation of the conversion system see
''')
    + localize_user_manual_link(
        'https://manual.calibre-ebook.com/conversion.html')
)
|
||||
|
||||
# Names of the individual heuristic-processing options.
HEURISTIC_OPTIONS = [
    'markup_chapter_headings',
    'italicize_common_cases',
    'fix_indents',
    'html_unwrap_factor',
    'unwrap_lines',
    'delete_blank_paragraphs',
    'format_scene_breaks',
    'dehyphenate',
    'renumber_headings',
    'replace_scene_breaks',
]

# The heuristic options plus 'remove_fake_margins'.
DEFAULT_TRUE_OPTIONS = HEURISTIC_OPTIONS + ['remove_fake_margins']
|
||||
|
||||
|
||||
def print_help(parser, log):
    """Print the parser's help text.

    :param parser: Object providing a ``print_help()`` method
    :param log: Accepted for signature compatibility; not used
    """
    parser.print_help()
|
||||
|
||||
|
||||
def check_command_line_options(parser, args, log):
    """Validate the positional arguments and resolve the input/output paths.

    :param parser: Option parser, used only to print help on bad usage
    :param args: Full argument list; ``args[1]`` is the input and
                 ``args[2]`` the output
    :param log: Logger used to report errors
    :return: ``(input, output)``
    :raises SystemExit: With code 1 when the two files are not given, or when
                        a non-recipe input is unreadable and no help switch
                        was passed
    """
    if len(args) < 3 or any(a.startswith('-') for a in args[1:3]):
        print_help(parser, log)
        log.error('\n\nYou must specify the input AND output files')
        raise SystemExit(1)

    input_path = os.path.abspath(args[1])
    if not os.access(input_path, os.R_OK):
        if input_path.endswith('.recipe'):
            # An unreadable .recipe argument is passed through verbatim
            # instead of as an absolute path.
            input_path = args[1]
        elif not ('-h' in args or '--help' in args):
            log.error('Cannot read from', input_path)
            raise SystemExit(1)

    output = args[2]
    # ".EXT" form: a leading dot, not "." or "..", and no path separators.
    is_bare_extension = (
        output.startswith('.')
        and output[:2] not in {'..', '.'}
        and not any(sep in output for sep in ('/', '\\'))
    )
    if is_bare_extension:
        stem = os.path.splitext(os.path.basename(input_path))[0]
        output = stem + output

    return input_path, os.path.abspath(output)
|
||||
|
||||
|
||||
def option_recommendation_to_cli_option(add_option, rec):
    """Build an optparse ``Option`` from a recommendation and register it.

    :param add_option: Callable that receives the constructed ``Option``
    :param rec: Recommendation object; its ``option`` attribute supplies the
                switches, name, help and choices, and its
                ``recommended_value`` supplies the default
    """
    opt = rec.option
    default = rec.recommended_value

    switches = []
    if opt.short_switch:
        switches.append('-' + opt.short_switch)
    switches.append('--' + opt.long_switch)

    attrs = {
        'dest': opt.name,
        'help': opt.help,
        'choices': opt.choices,
        'default': default,
    }

    if isinstance(default, bool):
        # Boolean options become flags that flip the default.
        attrs['action'] = 'store_false' if default else 'store_true'
    else:
        # NOTE(review): integers also satisfy numbers.Real, so the second
        # check overrides 'int' and every numeric default is parsed as
        # 'float'. Kept as is; changing it would alter which values the
        # command line accepts.
        if isinstance(default, numbers.Integral):
            attrs['type'] = 'int'
        if isinstance(default, numbers.Real):
            attrs['type'] = 'float'

    if opt.long_switch == 'verbose':
        # Verbosity is counted (-v -v ...), so it must not carry a type.
        attrs['action'] = 'count'
        attrs.pop('type', None)

    if opt.name == 'read_metadata_from_opf':
        switches.append('--from-opf')

    if opt.name == 'transform_css_rules':
        attrs['help'] = _(
            'Path to a file containing rules to transform the CSS styles'
            ' in this book. The easiest way to create such a file is to'
            ' use the wizard for creating rules in the calibre GUI. Access'
            ' it in the "Look & feel->Transform styles" section of the conversion'
            ' dialog. Once you create the rules, you can use the "Export" button'
            ' to save them to a file.'
        )

    if opt.name in DEFAULT_TRUE_OPTIONS and default is True:
        # Options that default to on are exposed only as --disable-<switch>.
        switches = ['--disable-' + opt.long_switch]

    add_option(Option(*switches, **attrs))
|
||||
|
||||
|
||||
def group_titles():
    """Return the translated titles of the input and output option groups.

    :return: ``(input_title, output_title)``
    """
    input_title = _('INPUT OPTIONS')
    output_title = _('OUTPUT OPTIONS')
    return input_title, output_title
|
||||
|
||||
|
||||
def recipe_test(option, opt_str, value, parser):
    """optparse callback collecting up to two integers that follow the switch.

    Leading integer arguments (at most two) are consumed from
    ``parser.rargs``; missing values are filled in with 2. The resulting
    2-tuple is stored on ``parser.values`` under ``option.dest``.

    :param option: The option being processed; only ``dest`` is read
    :param opt_str: The switch as seen on the command line (unused)
    :param value: Must be ``None`` (the option takes no declared argument)
    :param parser: The parser; ``rargs`` and ``values`` are modified
    """
    assert value is None

    def looks_like_number(text):
        try:
            float(text)
        except ValueError:
            return False
        return True

    collected = []
    for token in parser.rargs:
        # stop on --foo like options
        if token.startswith('--'):
            break
        # stop on -a, but not on -3 or -3.0
        if token.startswith('-') and len(token) > 1 and not looks_like_number(token):
            break
        try:
            collected.append(int(token))
        except (TypeError, ValueError, AttributeError):
            break
        if len(collected) == 2:
            break

    # Remove what was consumed so the parser does not see it again.
    del parser.rargs[:len(collected)]

    padded = collected + [2] * (2 - len(collected))
    setattr(parser.values, option.dest, tuple(padded))
|
||||
|
||||
|
||||
def add_input_output_options(parser, plumber):
    """Add the plumber's input and output options to *parser* as two groups.

    A group is created only when the corresponding option list is non-empty.

    :param parser: Option parser that receives the groups
    :param plumber: Provides ``input_options``, ``output_options``,
                    ``input_fmt`` and ``output_fmt``
    """

    def register(adder, recommendations):
        for rec in recommendations:
            # For recipe input, --test is handled by a callback
            # (recipe_test) instead of a plain option.
            if plumber.input_fmt == 'recipe' and rec.option.long_switch == 'test':
                adder(Option('--test', dest='test', action='callback',
                             callback=recipe_test))
                continue
            option_recommendation_to_cli_option(adder, rec)

    in_opts = plumber.input_options
    out_opts = plumber.output_options

    if in_opts:
        description = _('Options to control the processing'
                        ' of the input %s file') % plumber.input_fmt
        in_group = OptionGroup(parser, group_titles()[0], description)
        register(in_group.add_option, in_opts)
        parser.add_option_group(in_group)

    if out_opts:
        description = _('Options to control the processing'
                        ' of the output %s') % plumber.output_fmt
        out_group = OptionGroup(parser, group_titles()[1], description)
        register(out_group.add_option, out_opts)
        parser.add_option_group(out_group)
|
||||
|
||||
|
||||
def add_pipeline_options(parser, plumber):
    """Add the pipeline options to *parser*, grouped by topic.

    Options listed under the empty title go directly on the parser; every
    other title becomes an option group. An option is added only when its
    recommendation level is below ``HIGH``.

    :param parser: Option parser that receives the options and groups
    :param plumber: Provides ``get_option_by_name()`` and
                    ``metadata_option_names``
    """
    profile_names = ['input_profile', 'output_profile']
    look_and_feel_names = [
        'base_font_size', 'disable_font_rescaling',
        'font_size_mapping', 'embed_font_family',
        'subset_embedded_fonts', 'embed_all_fonts',
        'line_height', 'minimum_line_height',
        'linearize_tables',
        'extra_css', 'filter_css', 'transform_css_rules', 'expand_css',
        'smarten_punctuation', 'unsmarten_punctuation',
        'margin_top', 'margin_left', 'margin_right',
        'margin_bottom', 'change_justification',
        'insert_blank_line', 'insert_blank_line_size',
        'remove_paragraph_spacing',
        'remove_paragraph_spacing_indent_size',
        'asciiize', 'keep_ligatures',
    ]
    search_replace_names = [
        'sr1_search', 'sr1_replace',
        'sr2_search', 'sr2_replace',
        'sr3_search', 'sr3_replace',
        'search_replace',
    ]
    structure_names = [
        'chapter', 'chapter_mark',
        'prefer_metadata_cover', 'remove_first_image',
        'insert_metadata', 'page_breaks_before',
        'remove_fake_margins', 'start_reading_at',
    ]
    toc_names = [
        'level1_toc', 'level2_toc', 'level3_toc',
        'toc_threshold', 'max_toc_links', 'no_chapters_in_toc',
        'use_auto_toc', 'toc_filter', 'duplicate_links_in_toc',
    ]
    debug_names = ['verbose', 'debug_pipeline']

    # title -> (description, option names); insertion order is display order.
    groups = OrderedDict((
        ('', ('', profile_names)),

        (_('LOOK AND FEEL'), (
            _('Options to control the look and feel of the output'),
            look_and_feel_names,
        )),

        (_('HEURISTIC PROCESSING'), (
            _('Modify the document text and structure using common'
              ' patterns. Disabled by default. Use %(en)s to enable. '
              ' Individual actions can be disabled with the %(dis)s options.')
            % dict(en='--enable-heuristics', dis='--disable-*'),
            ['enable_heuristics'] + HEURISTIC_OPTIONS,
        )),

        (_('SEARCH AND REPLACE'), (
            _('Modify the document text and structure using user defined patterns.'),
            search_replace_names,
        )),

        (_('STRUCTURE DETECTION'), (
            _('Control auto-detection of document structure.'),
            structure_names,
        )),

        (_('TABLE OF CONTENTS'), (
            _('Control the automatic generation of a Table of Contents. By '
              'default, if the source file has a Table of Contents, it will '
              'be used in preference to the automatically generated one.'),
            toc_names,
        )),

        (_('METADATA'), (
            _('Options to set metadata in the output'),
            plumber.metadata_option_names + ['read_metadata_from_opf'],
        )),

        (_('DEBUG'), (
            _('Options to help with debugging the conversion'),
            debug_names,
        )),
    ))

    for title, (description, option_names) in groups.items():
        if title:
            section = OptionGroup(parser, title, description)
            parser.add_option_group(section)
            add_option = section.add_option
        else:
            # Untitled entries are added to the parser itself.
            add_option = parser.add_option

        for option_name in option_names:
            rec = plumber.get_option_by_name(option_name)
            if rec.level < rec.HIGH:
                option_recommendation_to_cli_option(add_option, rec)
|
||||
|
||||
|
||||
def option_parser():
    """Create the base option parser with only ``--list-recipes`` defined.

    :return: An ``OptionParser`` whose usage text is ``USAGE``
    """
    parser = OptionParser(usage=USAGE)
    recipes_help = _(
        'List builtin recipe names. You can create an e-book from '
        'a builtin recipe like this: ebook-convert "Recipe Name.recipe" '
        'output.epub')
    parser.add_option('--list-recipes', default=False, action='store_true',
                      help=recipes_help)
    return parser
|
||||
|
||||
|
||||
class ProgressBar(object):
    """Progress callback that writes "<percent>% <message>" lines to a log."""

    def __init__(self, log):
        """
        :param log: Callable invoked with each formatted progress line
        """
        self.log = log

    def __call__(self, frac, msg=''):
        """Report progress; calls without a message are ignored.

        :param frac: Fraction completed; multiplied by 100 and truncated
        :param msg: Text to show after the percentage
        """
        if not msg:
            return
        self.log('%d%% %s' % (int(frac * 100), msg))
|
||||
|
||||
|
||||
def create_option_parser(args, log):
    """Build the option parser and the Plumber for the given command line.

    ``--version`` and ``--list-recipes`` are answered immediately and end
    the program with exit code 0.

    :param args: Full argument list (program name first)
    :param log: Logger; also called directly to print informational lines
    :return: ``(parser, plumber)``
    :raises SystemExit: After ``--version``/``--list-recipes`` (code 0), or
                        when fewer than three arguments are given (code 0 if
                        a help switch is present, otherwise 1)
    :raises ValueError: If input and output refer to the same file
    """
    if '--version' in args:
        from calibre.constants import __appname__, __version__, __author__
        log(os.path.basename(args[0]), '('+__appname__, __version__+')')
        log('Created by:', __author__)
        raise SystemExit(0)

    if '--list-recipes' in args:
        from calibre.web.feeds.recipes.collection import get_builtin_recipe_titles
        log('Available recipes:')
        titles = sorted(get_builtin_recipe_titles())
        for title in titles:
            try:
                log('\t'+title)
            except Exception:
                # Best effort: if the title cannot be logged as is, log its
                # repr instead. Catching Exception (not everything) lets
                # Ctrl-C and SystemExit propagate.
                log('\t'+repr(title))
        log('%d recipes available' % len(titles))
        raise SystemExit(0)

    parser = option_parser()
    if len(args) < 3:
        print_help(parser, log)
        help_requested = any(x in args for x in ('-h', '--help'))
        raise SystemExit(0 if help_requested else 1)

    input, output = check_command_line_options(parser, args, log)

    from calibre.ebooks.conversion.plumber import Plumber

    reporter = ProgressBar(log)
    if patheq(input, output):
        raise ValueError('Input file is the same as the output file')

    plumber = Plumber(input, output, log, reporter)
    add_input_output_options(parser, plumber)
    add_pipeline_options(parser, plumber)

    return parser, plumber
|
||||
|
||||
|
||||
def abspath(x):
    """Return *x* as an absolute path, leaving http(s) URLs untouched.

    :param x: A path (``~`` is expanded) or a string starting with
              ``http:`` / ``https:``
    :return: The URL unchanged, or the expanded absolute path
    """
    if x.startswith(('http:', 'https:')):
        return x
    expanded = os.path.expanduser(x)
    return os.path.abspath(expanded)
|
||||
|
||||
|
||||
def escape_sr_pattern(exp):
    """Replace every newline in *exp* with the placeholder U+E123.

    ``read_sr_patterns`` performs the reverse substitution on search
    expressions it reads.
    """
    return '\ue123'.join(exp.split('\n'))
|
||||
|
||||
|
||||
def read_sr_patterns(path, log=None):
    """Read search/replace pairs from a UTF-8 text file.

    The file alternates lines: a search expression followed by its
    replacement. Blank lines are skipped while a search expression is
    expected; U+E123 in a search expression stands for a newline. A trailing
    search expression with no replacement line is dropped.

    :param path: File to read
    :param log: Optional logger. When given, an invalid expression is logged
                and the program exits; otherwise ``ValueError`` is raised
    :return: JSON text encoding a list of ``[search, replacement]`` pairs
    :raises ValueError: Invalid regular expression and *log* is ``None``
    :raises SystemExit: Invalid regular expression and *log* is given
    """
    import json, re

    with open(path, 'rb') as f:
        lines = f.read().decode('utf-8').splitlines()

    pats = []
    pat = None
    for line in lines:
        if pat is not None:
            # This line is the replacement for the pending expression.
            pats.append((pat, line))
            pat = None
            continue
        if not line.strip():
            continue
        line = line.replace('\ue123', '\n')
        try:
            re.compile(line)
        except Exception:
            # Any compile failure counts as an invalid expression, but
            # KeyboardInterrupt/SystemExit are no longer intercepted.
            msg = 'Invalid regular expression: %r from file: %r' % (
                line, path)
            if log is None:
                raise ValueError(msg)
            log.error(msg)
            raise SystemExit(1)
        pat = line
    return json.dumps(pats)
|
||||
|
||||
|
||||
def main(args=sys.argv):
    """Run a conversion from the command line.

    :param args: Full argument list (program name first)
    :return: 0 on success, 1 when extra positional arguments are given or
             the CSS transform rules fail validation
    :raises SystemExit: With code 1 when the conversion raises
                        ``ConversionUserFeedBack`` (and from the helper
                        functions for usage errors)
    """
    log = Log()
    parser, plumber = create_option_parser(args, log)
    opts, leftover_args = parser.parse_args(args)
    if len(leftover_args) > 3:
        log.error('Extra arguments not understood:', u', '.join(leftover_args[3:]))
        return 1

    # Path-valued options are made absolute (URLs are left alone by abspath).
    for x in ('read_metadata_from_opf', 'cover'):
        current = getattr(opts, x, None)
        if current is not None:
            setattr(opts, x, abspath(current))

    if opts.search_replace:
        opts.search_replace = read_sr_patterns(opts.search_replace, log)

    if opts.transform_css_rules:
        from calibre.ebooks.css_transform_rules import import_rules, validate_rule
        with open(opts.transform_css_rules, 'rb') as tcr:
            opts.transform_css_rules = rules = list(import_rules(tcr.read()))
            for rule in rules:
                title, msg = validate_rule(rule)
                if title and msg:
                    log.error('Failed to parse CSS transform rules')
                    log.error(title)
                    log.error(msg)
                    return 1

    # Everything set on the command line is passed on at HIGH priority.
    recommendations = [(n.dest, getattr(opts, n.dest),
                        OptionRecommendation.HIGH)
                       for n in parser.options_iter()
                       if n.dest]
    plumber.merge_ui_recommendations(recommendations)

    try:
        plumber.run()
    except ConversionUserFeedBack as e:
        ll = {'info': log.info, 'warn': log.warn,
              'error': log.error}.get(e.level, log.info)
        ll(e.title)
        if e.det_msg:
            # The attribute is ``det_msg``; the previous ``e.detmsg`` raised
            # AttributeError whenever a detailed message was present.
            log.debug(e.det_msg)
        ll(e.msg)
        raise SystemExit(1)

    log(_('Output saved to'), ' ', plumber.output)

    return 0
|
||||
|
||||
|
||||
# Returns the translated template text below; it contains one %s placeholder
# that is left unformatted here. NOTE(review): judging by the name, this is
# presumably consumed when building the user manual index - confirm against
# the caller.
def manual_index_strings():
|
||||
return _('''\
|
||||
The options and default values for the options change depending on both the
|
||||
input and output formats, so you should always check with::
|
||||
|
||||
|
||||
%s
|
||||
|
||||
|
||||
Below are the options that are common to all conversion, followed by the
|
||||
options specific to every input and output format.''')
|
||||
|
||||
|
||||
# Script entry point: the return value of main() becomes the exit status.
if __name__ == '__main__':
    exit_status = main()
    sys.exit(exit_status)
|
||||
10
ebook_converter/ebooks/conversion/plugins/__init__.py
Normal file
10
ebook_converter/ebooks/conversion/plugins/__init__.py
Normal file
@@ -0,0 +1,10 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
|
||||
|
||||
29
ebook_converter/ebooks/conversion/plugins/azw4_input.py
Normal file
29
ebook_converter/ebooks/conversion/plugins/azw4_input.py
Normal file
@@ -0,0 +1,29 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin
|
||||
from polyglot.builtins import getcwd
|
||||
|
||||
|
||||
class AZW4Input(InputFormatPlugin):
    """Input plugin for AZW4 files."""

    name = 'AZW4 Input'
    author = 'John Schember'
    description = 'Convert AZW4 to HTML'
    file_types = {'azw4'}
    commit_name = 'azw4_input'

    def convert(self, stream, options, file_ext, log, accelerators):
        """Extract the AZW4 content into the current working directory.

        :param stream: Open input stream positioned at the PDB header
        :param options: Conversion options, passed to the reader
        :param file_ext: Unused
        :param log: Logger, passed to the reader
        :param accelerators: Unused
        :return: Whatever the reader's ``extract_content()`` returns
        """
        from calibre.ebooks.pdb.header import PdbHeaderReader
        from calibre.ebooks.azw4.reader import Reader

        pdb_header = PdbHeaderReader(stream)
        azw4_reader = Reader(pdb_header, stream, log, options)
        return azw4_reader.extract_content(getcwd())
|
||||
202
ebook_converter/ebooks/conversion/plugins/chm_input.py
Normal file
202
ebook_converter/ebooks/conversion/plugins/chm_input.py
Normal file
@@ -0,0 +1,202 @@
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
''' CHM File decoding support '''
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>,' \
|
||||
' and Alex Bramley <a.bramley at gmail.com>.'
|
||||
|
||||
import os
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin
|
||||
from calibre.ptempfile import TemporaryDirectory
|
||||
from calibre.constants import filesystem_encoding
|
||||
from polyglot.builtins import unicode_type, as_bytes
|
||||
|
||||
|
||||
class CHMInput(InputFormatPlugin):
    """Input plugin for CHM files."""

    name = 'CHM Input'
    author = 'Kovid Goyal and Alex Bramley'
    description = 'Convert CHM files to OEB'
    file_types = {'chm'}
    commit_name = 'chm_input'

    def _chmtohtml(self, output_dir, chm_path, no_images, log, debug_dump=False):
        """Extract the CHM at *chm_path* into *output_dir*.

        The reader is kept on ``self._chm_reader`` for later use.
        *no_images* is accepted but not used.

        :return: The reader's ``hhc_path``
        """
        from calibre.ebooks.chm.reader import CHMReader
        log.debug('Opening CHM file')
        reader = CHMReader(chm_path, log,
                           input_encoding=self.opts.input_encoding)
        log.debug('Extracting CHM to %s' % output_dir)
        reader.extract_content(output_dir, debug_dump=debug_dump)
        self._chm_reader = reader
        return reader.hhc_path

    def convert(self, stream, options, file_ext, log, accelerators):
        """Convert the CHM in *stream* and return the resulting OEB book.

        The CHM is extracted to a temporary directory, an HTML root file is
        generated from its contents file, and the HTML input plugin builds
        the book. *options* is modified along the way: the HTML input
        plugin's recommended values are applied to it and ``input_encoding``
        is set to ``'utf-8'``.
        """
        from calibre.ebooks.chm.metadata import get_metadata_from_reader
        from calibre.customize.ui import plugin_for_input_format
        self.opts = options

        log.debug('Processing CHM...')
        with TemporaryDirectory('_chm2oeb') as tdir:
            if not isinstance(tdir, unicode_type):
                tdir = tdir.decode(filesystem_encoding)

            # Start from the HTML input plugin's recommended option values.
            html_plugin = plugin_for_input_format('html')
            for rec in html_plugin.options:
                setattr(options, rec.option.name, rec.recommended_value)

            no_images = False  # options.no_images
            chm_name = stream.name

            # closing stream so CHM can be opened by external library
            stream.close()
            log.debug('tdir=%s' % tdir)
            log.debug('stream.name=%s' % stream.name)

            saved_debug_pipeline = options.debug_pipeline
            debug_dump = False
            if saved_debug_pipeline:
                debug_dump = os.path.join(saved_debug_pipeline, 'input')

            mainname = self._chmtohtml(tdir, chm_name, no_images, log,
                                       debug_dump=debug_dump)
            mainpath = os.path.join(tdir, mainname)

            try:
                metadata = get_metadata_from_reader(self._chm_reader)
            except Exception:
                log.exception('Failed to read metadata, using filename')
                from calibre.ebooks.metadata.book.base import Metadata
                metadata = Metadata(os.path.basename(chm_name))

            encoding = (self._chm_reader.get_encoding()
                        or options.input_encoding or 'cp1252')
            self._chm_reader.CloseCHM()

            # debug_pipeline is cleared while the book is built and put back
            # afterwards.
            options.debug_pipeline = None
            options.input_encoding = 'utf-8'

            # Files the reader recorded as re-encoded are read as UTF-8.
            root_encoding = encoding
            if os.path.abspath(mainpath) in self._chm_reader.re_encoded_files:
                root_encoding = 'utf-8'

            htmlpath, toc = self._create_html_root(mainpath, log, root_encoding)
            oeb = self._create_oebbook_html(htmlpath, tdir, options, log, metadata)
            options.debug_pipeline = saved_debug_pipeline

            if toc.count() > 1:
                # The generated root file serves only as the TOC source.
                oeb.toc = self.parse_html_toc(oeb.spine[0])
                oeb.manifest.remove(oeb.spine[0])
                oeb.auto_generated_toc = False
        return oeb

    def parse_html_toc(self, item):
        """Build a TOC from the nested ``div``/``a`` markup in ``item.data``.

        Each ``div`` child contributes an entry taken from its first ``a``
        element (text and ``href``); nested ``div`` elements become child
        entries.
        """
        from calibre.ebooks.oeb.base import TOC, XPath
        child_divs = XPath('./h:div')
        first_link = XPath('./h:a[1]')

        def walk(toc_parent, div):
            for sub_div in child_divs(div):
                link = first_link(sub_div)[0]
                entry = toc_parent.add(link.text, link.attrib['href'])
                walk(entry, sub_div)

        toc = TOC()
        top_div = XPath('//h:div[1]')(item.data)[0]
        walk(toc, top_div)
        return toc

    def _create_oebbook_html(self, htmlpath, basedir, opts, log, mi):
        """Create the OEB book from *htmlpath* via the HTML input plugin.

        Sets ``opts.breadth_first`` to ``True`` before delegating.
        """
        # use HTMLInput plugin to generate book
        from calibre.customize.builtins import HTMLInput
        opts.breadth_first = True
        return HTMLInput(None).create_oebbook(htmlpath, basedir, opts, log, mi)

    def _create_html_root(self, hhcpath, log, encoding):
        """Write an HTML root file next to *hhcpath* and return its path.

        With more than one TOC node, the root is a generated tree of nested
        ``div``/``a`` elements; otherwise the decoded contents data is
        written as is.

        :return: ``(htmlpath, toc)``
        """
        from lxml import html
        from polyglot.urllib import unquote as _unquote
        from calibre.ebooks.oeb.base import urlquote
        from calibre.ebooks.chardet import xml_to_unicode

        hhcdata = self._read_file(hhcpath).decode(encoding)
        hhcdata = xml_to_unicode(hhcdata, verbose=True,
                                 strip_encoding_pats=True,
                                 resolve_entities=True)[0]
        hhcroot = html.fromstring(hhcdata)
        toc = self._process_nodes(hhcroot)
        log.debug('Found %d section nodes' % toc.count())
        htmlpath = os.path.splitext(hhcpath)[0] + ".html"
        # NOTE: ``base`` is read by unquote_path() at call time and is
        # rebound below, so the order of the statements matters.
        base = os.path.dirname(os.path.abspath(htmlpath))

        def unquote(x):
            if isinstance(x, unicode_type):
                x = x.encode('utf-8')
            return _unquote(x).decode('utf-8')

        def unquote_path(x):
            # Prefer the unquoted form only when it exists and the quoted
            # form does not.
            y = unquote(x)
            if (not os.path.exists(os.path.join(base, x))
                    and os.path.exists(os.path.join(base, y))):
                x = y
            return x

        def donode(item, parent, base, subpath):
            for child in item:
                title = child.title
                if not title:
                    continue
                raw = unquote_path(child.href or '')
                rsrcname = os.path.basename(raw)
                rsrcpath = os.path.join(subpath, rsrcname)
                if (not os.path.exists(os.path.join(base, rsrcpath))
                        and os.path.exists(os.path.join(base, raw))):
                    rsrcpath = raw

                if '%' not in rsrcpath:
                    rsrcpath = urlquote(rsrcpath)
                if not raw:
                    rsrcpath = ''
                c = DIV(A(title, href=rsrcpath))
                donode(child, c, base, subpath)
                parent.append(c)

        with open(htmlpath, 'wb') as f:
            if toc.count() > 1:
                # DIV and A are also used by donode() above (closure).
                from lxml.html.builder import HTML, BODY, DIV, A
                path0 = toc[0].href
                path0 = unquote_path(path0)
                subpath = os.path.dirname(path0)
                base = os.path.dirname(f.name)
                root = DIV()
                donode(toc, root, base, subpath)
                raw = html.tostring(HTML(BODY(root)), encoding='utf-8',
                                    pretty_print=True)
                f.write(raw)
            else:
                f.write(as_bytes(hhcdata))
        return htmlpath, toc

    def _read_file(self, name):
        """Return the raw bytes of the file *name*."""
        with lopen(name, 'rb') as f:
            return f.read()

    def add_node(self, node, toc, ancestor_map):
        """Add a TOC entry for *node* if its type matches 'text/sitemap'.

        The entry is attached to the TOC node recorded in *ancestor_map* for
        the enclosing list item's object, falling back to *toc*. The new
        entry is recorded in *ancestor_map* under *node*.
        """
        from calibre.ebooks.chm.reader import match_string
        if not match_string(node.attrib.get('type', ''), 'text/sitemap'):
            return

        enclosing = node.xpath('ancestor::ul[1]/ancestor::li[1]/object[1]')
        parent = enclosing[0] if enclosing else None
        toc = ancestor_map.get(parent, toc)

        title = href = ''
        for param in node.xpath('./param'):
            if match_string(param.attrib['name'], 'name'):
                title = param.attrib['value']
            elif match_string(param.attrib['name'], 'local'):
                href = param.attrib['value']

        ancestor_map[node] = toc.add(title or _('Unknown'), href)

    def _process_nodes(self, root):
        """Build a TOC from every ``object`` element under *root*."""
        from calibre.ebooks.oeb.base import TOC
        toc = TOC()
        ancestor_map = {}
        for obj in root.xpath('//object'):
            self.add_node(obj, toc, ancestor_map)
        return toc
|
||||
310
ebook_converter/ebooks/conversion/plugins/comic_input.py
Normal file
310
ebook_converter/ebooks/conversion/plugins/comic_input.py
Normal file
@@ -0,0 +1,310 @@
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
'''
|
||||
Based on ideas from comiclrf created by FangornUK.
|
||||
'''
|
||||
|
||||
import shutil, textwrap, codecs, os
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
|
||||
from calibre import CurrentDir
|
||||
from calibre.ptempfile import PersistentTemporaryDirectory
|
||||
from polyglot.builtins import getcwd, map
|
||||
|
||||
|
||||
class ComicInput(InputFormatPlugin):
|
||||
|
||||
name = 'Comic Input'
|
||||
author = 'Kovid Goyal'
|
||||
description = 'Optimize comic files (.cbz, .cbr, .cbc) for viewing on portable devices'
|
||||
file_types = {'cbz', 'cbr', 'cbc'}
|
||||
is_image_collection = True
|
||||
commit_name = 'comic_input'
|
||||
core_usage = -1
|
||||
|
||||
options = {
|
||||
OptionRecommendation(name='colors', recommended_value=0,
|
||||
help=_('Reduce the number of colors used in the image. This works only'
|
||||
' if you choose the PNG output format. It is useful to reduce file sizes.'
|
||||
' Set to zero to turn off. Maximum value is 256. It is off by default.')),
|
||||
OptionRecommendation(name='dont_normalize', recommended_value=False,
|
||||
help=_('Disable normalize (improve contrast) color range '
|
||||
'for pictures. Default: False')),
|
||||
OptionRecommendation(name='keep_aspect_ratio', recommended_value=False,
|
||||
help=_('Maintain picture aspect ratio. Default is to fill the screen.')),
|
||||
OptionRecommendation(name='dont_sharpen', recommended_value=False,
|
||||
help=_('Disable sharpening.')),
|
||||
OptionRecommendation(name='disable_trim', recommended_value=False,
|
||||
help=_('Disable trimming of comic pages. For some comics, '
|
||||
'trimming might remove content as well as borders.')),
|
||||
OptionRecommendation(name='landscape', recommended_value=False,
|
||||
help=_("Don't split landscape images into two portrait images")),
|
||||
OptionRecommendation(name='wide', recommended_value=False,
|
||||
help=_("Keep aspect ratio and scale image using screen height as "
|
||||
"image width for viewing in landscape mode.")),
|
||||
OptionRecommendation(name='right2left', recommended_value=False,
|
||||
help=_('Used for right-to-left publications like manga. '
|
||||
'Causes landscape pages to be split into portrait pages '
|
||||
'from right to left.')),
|
||||
OptionRecommendation(name='despeckle', recommended_value=False,
|
||||
help=_('Enable Despeckle. Reduces speckle noise. '
|
||||
'May greatly increase processing time.')),
|
||||
OptionRecommendation(name='no_sort', recommended_value=False,
|
||||
help=_("Don't sort the files found in the comic "
|
||||
"alphabetically by name. Instead use the order they were "
|
||||
"added to the comic.")),
|
||||
OptionRecommendation(name='output_format', choices=['png', 'jpg'],
|
||||
recommended_value='png', help=_('The format that images in the created e-book '
|
||||
'are converted to. You can experiment to see which format gives '
|
||||
'you optimal size and look on your device.')),
|
||||
OptionRecommendation(name='no_process', recommended_value=False,
|
||||
help=_("Apply no processing to the image")),
|
||||
OptionRecommendation(name='dont_grayscale', recommended_value=False,
|
||||
help=_('Do not convert the image to grayscale (black and white)')),
|
||||
OptionRecommendation(name='comic_image_size', recommended_value=None,
|
||||
help=_('Specify the image size as widthxheight pixels. Normally,'
|
||||
' an image size is automatically calculated from the output '
|
||||
'profile, this option overrides it.')),
|
||||
OptionRecommendation(name='dont_add_comic_pages_to_toc', recommended_value=False,
|
||||
help=_('When converting a CBC do not add links to each page to'
|
||||
' the TOC. Note this only applies if the TOC has more than one'
|
||||
' section')),
|
||||
}
|
||||
|
||||
# Option values recommended at HIGH level for comic input. The misspelled
# 'page_breaks_brefore' entry was removed: it duplicated the
# 'page_breaks_before' entry below under a name that matches no option.
recommendations = {
    ('margin_left', 0, OptionRecommendation.HIGH),
    ('margin_top', 0, OptionRecommendation.HIGH),
    ('margin_right', 0, OptionRecommendation.HIGH),
    ('margin_bottom', 0, OptionRecommendation.HIGH),
    ('insert_blank_line', False, OptionRecommendation.HIGH),
    ('remove_paragraph_spacing', False, OptionRecommendation.HIGH),
    ('change_justification', 'left', OptionRecommendation.HIGH),
    ('dont_split_on_pagebreaks', True, OptionRecommendation.HIGH),
    ('chapter', None, OptionRecommendation.HIGH),
    ('use_auto_toc', False, OptionRecommendation.HIGH),
    ('page_breaks_before', None, OptionRecommendation.HIGH),
    ('disable_font_rescaling', True, OptionRecommendation.HIGH),
    ('linearize_tables', False, OptionRecommendation.HIGH),
}
|
||||
|
||||
def get_comics_from_collection(self, stream):
    """Unpack a comic collection and list the comics named in comics.txt.

    The archive is extracted to a persistent temporary directory. Each
    non-blank line of ``comics.txt`` has the form ``filename:title``; the
    title defaults to the file name without its extension. Only readable
    files are returned.

    :param stream: Archive stream; ``stream.name`` is used in error messages
    :return: List of ``[title, path]`` entries
    :raises ValueError: If ``comics.txt`` is missing or no listed comic is
                        readable
    """
    from calibre.libunzip import extract as zipextract
    tdir = PersistentTemporaryDirectory('_comic_collection')
    zipextract(stream, tdir)

    # BOMs that comics.txt may start with and the codec each one implies.
    bom_codecs = (
        (codecs.BOM_UTF16_BE, 'utf-16-be'),
        (codecs.BOM_UTF16_LE, 'utf-16-le'),
        (codecs.BOM_UTF8, 'utf-8'),
    )

    comics = []
    with CurrentDir(tdir):
        if not os.path.exists('comics.txt'):
            raise ValueError((
                '%s is not a valid comic collection'
                ' no comics.txt was found in the file')
                % stream.name)
        with open('comics.txt', 'rb') as f:
            data = f.read()

        for bom, codec in bom_codecs:
            if data.startswith(bom):
                # Drop the decoded BOM character.
                text = data.decode(codec)[1:]
                break
        else:
            text = data.decode('utf-8')

        for entry in text.splitlines():
            entry = entry.strip()
            if not entry:
                continue
            fname, sep, title = entry.partition(':')
            fname = fname.replace('#', '_')
            fname = os.path.join(tdir, *fname.split('/'))
            if not title:
                title = os.path.basename(fname).rpartition('.')[0]
            if os.access(fname, os.R_OK):
                comics.append([title, fname])

    if not comics:
        raise ValueError('%s has no comics' % stream.name)
    return comics
|
||||
|
||||
def get_pages(self, comic, tdir2):
    '''
    Extract a comic archive and return the page images to put in the book.

    When ``self.opts.no_process`` is set the pages are copied unchanged
    into ``tdir2`` under an index-prefixed name; otherwise they are handed
    to ``process_pages`` and any failures are logged as warnings.

    :param comic: Path of the comic archive to extract.
    :param tdir2: Directory that receives the copied/processed pages.
    :return: List of page paths.
    :raises ValueError: If the comic contains no pages, or processing
                        leaves no valid pages.
    '''
    from calibre.ebooks.comic.input import (extract_comic, process_pages,
            find_pages)
    tdir = extract_comic(comic)
    new_pages = find_pages(tdir, sort_on_mtime=self.opts.no_sort,
            verbose=self.opts.verbose)
    if not new_pages:
        raise ValueError('Could not find any pages in the comic: %s'
                %comic)
    if self.opts.no_process:
        copied = []
        for i, page in enumerate(new_pages):
            target = os.path.join(
                tdir2, '{} - {}' .format(i, os.path.basename(page)))
            shutil.copyfile(page, target)
            copied.append(target)
        new_pages = copied
    else:
        new_pages, failures = process_pages(new_pages, self.opts,
                self.report_progress, tdir2)
        if failures:
            self.log.warning('Could not process the following pages '
                '(run with --verbose to see why):')
            for f in failures:
                self.log.warning('\t', f)
        if not new_pages:
            raise ValueError('Could not find any valid pages in comic: %s'
                    % comic)
    # The thumbnail path that used to be computed here was never used or
    # returned, so that dead code has been dropped.
    return new_pages
|
||||
|
||||
def get_images(self):
    '''
    Return the list of page image paths stored in ``self._images``.
    '''
    images = self._images
    return images
|
||||
|
||||
def convert(self, stream, opts, file_ext, log, accelerators):
    '''
    Turn a comic (or a ``cbc`` collection of comics) into an OPF book made
    of page images and XHTML wrappers, written to the current directory.

    :param stream: Open input file; closed here once the comic paths are
                   known. ``stream.name`` is used for the title and errors.
    :param opts: Conversion options; stored on ``self.opts``.
    :param file_ext: Input extension; ``'cbc'`` selects collection handling.
    :param log: Logger; stored on ``self.log``.
    :param accelerators: Unused by this method.
    :return: Absolute path of the generated ``metadata.opf``.
    :raises ValueError: If no comic yields any pages.
    '''
    from calibre.ebooks.metadata import MetaInformation
    from calibre.ebooks.metadata.opf2 import OPFCreator
    from calibre.ebooks.metadata.toc import TOC

    self.opts = opts
    self.log = log
    if file_ext == 'cbc':
        sources = self.get_comics_from_collection(stream)
    else:
        sources = [['Comic', os.path.abspath(stream.name)]]
    stream.close()

    # A lone comic is unpacked into the current directory; each member of
    # a collection gets its own comic_N sub-directory.
    several = len(sources) > 1
    comics = []
    for number, (title, fname) in enumerate(sources, 1):
        cdir = os.path.abspath('comic_%d' % number if several else '.')
        if not os.path.exists(cdir):
            os.makedirs(cdir)
        pages = self.get_pages(fname, cdir)
        if not pages:
            continue
        if self.for_viewer:
            wrappers = [self.create_viewer_wrapper(pages)]
        else:
            wrappers = self.create_wrappers(pages)
        comics.append((title, pages, wrappers))

    if not comics:
        raise ValueError('No comic pages found in %s'%stream.name)

    book_title = os.path.basename(stream.name).rpartition('.')[0]
    mi = MetaInformation(book_title, [_('Unknown')])
    opf = OPFCreator(getcwd(), mi)
    single = len(comics) == 1

    def href(x):
        # With one comic, files are referenced by bare name; otherwise
        # keep the last directory component as well.
        if single:
            return os.path.basename(x)
        return '/'.join(x.split(os.sep)[-2:])

    entries, spine, images = [], [], []
    cover_href = None
    for title, pages, wrappers in comics:
        page_hrefs = [href(p) for p in pages]
        wrapper_hrefs = [href(w) for w in wrappers]
        entries += [(h, None) for h in wrapper_hrefs] + [
            (h, None) for h in page_hrefs]
        if cover_href is None and page_hrefs:
            cover_href = page_hrefs[0]
        spine.extend(wrapper_hrefs)
        images.extend(pages)
    opf.create_manifest(entries)
    self._images = images
    opf.create_spine(spine)
    if self.for_viewer and cover_href:
        opf.guide.set_cover(cover_href)

    toc = TOC()
    if single:
        for i, x in enumerate(comics[0][2]):
            toc.add_item(href(x), None, _('Page')+' %d'%(i+1),
                    play_order=i)
    else:
        po = 0
        for title, pages, wrappers in comics:
            po += 1
            stoc = toc.add_item(href(wrappers[0]),
                    None, title, play_order=po)
            if not opts.dont_add_comic_pages_to_toc:
                for i, x in enumerate(wrappers):
                    stoc.add_item(href(x), None,
                            _('Page')+' %d'%(i+1), play_order=po)
                    po += 1
    opf.set_toc(toc)
    with open('metadata.opf', 'wb') as m, open('toc.ncx', 'wb') as n:
        opf.render(m, n, 'toc.ncx')
    return os.path.abspath('metadata.opf')
|
||||
|
||||
def create_wrappers(self, pages):
    '''
    Write one ``page_N.xhtml`` file per image, each embedding that image,
    into the directory of the first page.

    :param pages: Non-empty list of image paths.
    :return: List of the wrapper file paths, in page order.
    '''
    from calibre.ebooks.oeb.base import XHTML_NS
    template = textwrap.dedent('''\
    <html xmlns="%s">
    <head>
    <meta charset="utf-8"/>
    <title>Page #%d</title>
    <style type="text/css">
    @page { margin:0pt; padding: 0pt}
    body { margin: 0pt; padding: 0pt}
    div { text-align: center }
    </style>
    </head>
    <body>
    <div>
    <img src="%s" alt="comic page #%d" />
    </div>
    </body>
    </html>
    ''')
    out_dir = os.path.dirname(pages[0])
    wrappers = []
    for number, image in enumerate(pages, 1):
        markup = template % (
            XHTML_NS, number, os.path.basename(image), number)
        target = os.path.join(out_dir, 'page_%d.xhtml' % number)
        with open(target, 'wb') as out:
            out.write(markup.encode('utf-8'))
        wrappers.append(target)
    return wrappers
|
||||
|
||||
def create_viewer_wrapper(self, pages):
    '''
    Write a single ``wrapper.xhtml`` that shows every page image in order,
    placed in the same directory as the first page.

    :param pages: Non-empty list of image paths.
    :return: Path of the written wrapper file.
    '''
    from calibre.ebooks.oeb.base import XHTML_NS

    def page(src):
        return '<img src="{}"></img>'.format(os.path.basename(src))

    # Take the directory from the page list *before* building the markup.
    # The previous code rebound ``pages`` to the joined HTML string first,
    # so ``pages[0]`` was the character '<' and the wrapper was always
    # written to the current directory instead of next to the images.
    base = os.path.dirname(pages[0])
    body = '\n'.join(map(page, pages))
    wrapper = '''
<html xmlns="%s">
<head>
<meta charset="utf-8"/>
<style type="text/css">
html, body, img { height: 100vh; display: block; margin: 0; padding: 0; border-width: 0; }
img {
width: 100%%; height: 100%%;
object-fit: contain;
margin-left: auto; margin-right: auto;
max-width: 100vw; max-height: 100vh;
top: 50vh; transform: translateY(-50%%);
position: relative;
page-break-after: always;
}
</style>
</head>
<body>
%s
</body>
</html>
''' % (XHTML_NS, body)
    path = os.path.join(base, 'wrapper.xhtml')
    with open(path, 'wb') as f:
        f.write(wrapper.encode('utf-8'))
    return path
|
||||
67
ebook_converter/ebooks/conversion/plugins/djvu_input.py
Normal file
67
ebook_converter/ebooks/conversion/plugins/djvu_input.py
Normal file
@@ -0,0 +1,67 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2011, Anthon van der Neut <anthon@mnt.org>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os
|
||||
from io import BytesIO
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin
|
||||
from polyglot.builtins import getcwd
|
||||
|
||||
|
||||
class DJVUInput(InputFormatPlugin):
    '''
    Input plugin that converts the text layer of a DJVU file to HTML and
    then delegates to the HTML input plugin.
    '''

    name = 'DJVU Input'
    author = 'Anthon van der Neut'
    description = 'Convert OCR-ed DJVU files (.djvu) to HTML'
    file_types = {'djvu', 'djv'}
    commit_name = 'djvu_input'

    def convert(self, stream, options, file_ext, log, accelerators):
        '''
        Extract the text of ``stream``, turn it into a temporary HTML file
        and run that through the HTML input plugin.

        :param stream: Open DJVU file.
        :param options: Conversion options. The HTML plugin's recommended
                        option values and ``input_encoding`` are set on it.
        :param file_ext: Extension used to look up the metadata reader.
        :param log: Logger passed to the nested conversion.
        :param accelerators: Unused by this method.
        :return: The OEB book produced by the HTML input plugin, with
                 metadata read from the DJVU file applied.
        :raises ValueError: If the DJVU file yields no text.
        '''
        from calibre.ebooks.txt.processor import convert_basic

        stdout = BytesIO()
        from calibre.ebooks.djvu.djvu import DJVUFile
        x = DJVUFile(stream)
        x.get_text(stdout)
        raw_text = stdout.getvalue()
        if not raw_text:
            raise ValueError('The DJVU file contains no text, only images, probably page scans.'
                    ' calibre only supports conversion of DJVU files with actual text in them.')

        # Newlines become spaces and byte 0x1f becomes a blank line
        # (presumably 0x1f separates blocks in the extracted text -- confirm
        # against the DJVU text extractor).
        html = convert_basic(raw_text.replace(b"\n", b' ').replace(
            b'\037', b'\n\n'))
        # Run the HTMLized text through the html processing plugin.
        from calibre.customize.ui import plugin_for_input_format
        html_input = plugin_for_input_format('html')
        for opt in html_input.options:
            setattr(options, opt.option.name, opt.recommended_value)
        options.input_encoding = 'utf-8'
        base = getcwd()
        # Pick a file name that does not clash with anything already there.
        htmlfile = os.path.join(base, 'index.html')
        c = 0
        while os.path.exists(htmlfile):
            c += 1
            htmlfile = os.path.join(base, 'index%d.html'%c)
        odi = options.debug_pipeline
        options.debug_pipeline = None
        try:
            with open(htmlfile, 'wb') as f:
                f.write(html.encode('utf-8'))
            # Generate oeb from html conversion.
            with open(htmlfile, 'rb') as f:
                oeb = html_input.convert(f, options, 'html', log,
                        {})
        finally:
            # Always restore the caller's setting and remove the scratch
            # file, even when the nested conversion fails.
            options.debug_pipeline = odi
            if os.path.exists(htmlfile):
                os.remove(htmlfile)

        # Set metadata from file.
        from calibre.customize.ui import get_file_type_metadata
        from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
        mi = get_file_type_metadata(stream, file_ext)
        meta_info_to_oeb_metadata(mi, oeb.metadata, log)

        return oeb
|
||||
34
ebook_converter/ebooks/conversion/plugins/docx_input.py
Normal file
34
ebook_converter/ebooks/conversion/plugins/docx_input.py
Normal file
@@ -0,0 +1,34 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=utf-8
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
|
||||
|
||||
|
||||
class DOCXInput(InputFormatPlugin):
    '''
    Input plugin that hands DOCX/DOCM files to the DOCX-to-HTML converter.
    '''

    name = 'DOCX Input'
    author = 'Kovid Goyal'
    description = _('Convert DOCX files (.docx and .docm) to HTML')
    file_types = {'docx', 'docm'}
    commit_name = 'docx_input'

    options = {
        OptionRecommendation(
            name='docx_no_cover', recommended_value=False,
            help=_('Normally, if a large image is present at the start of the document that looks like a cover, '
                   'it will be removed from the document and used as the cover for created e-book. This option '
                   'turns off that behavior.')),
        OptionRecommendation(
            name='docx_no_pagebreaks_between_notes', recommended_value=False,
            help=_('Do not insert a page break after every endnote.')),
        OptionRecommendation(
            name='docx_inline_subsup', recommended_value=False,
            help=_('Render superscripts and subscripts so that they do not affect the line height.')),
    }

    recommendations = {('page_breaks_before', '/', OptionRecommendation.MED)}

    def convert(self, stream, options, file_ext, log, accelerators):
        '''
        Build a ``Convert`` object from the plugin options and return the
        result of calling it.

        :param stream: Open DOCX file.
        :param options: Conversion options; the three ``docx_*`` options
                        declared by this plugin are read.
        :param file_ext: Unused by this method.
        :param log: Logger passed to the converter.
        :param accelerators: Unused by this method.
        '''
        from calibre.ebooks.docx.to_html import Convert
        converter = Convert(
            stream,
            detect_cover=not options.docx_no_cover,
            log=log,
            notes_nopb=options.docx_no_pagebreaks_between_notes,
            nosupsub=options.docx_inline_subsup)
        return converter()
|
||||
93
ebook_converter/ebooks/conversion/plugins/docx_output.py
Normal file
93
ebook_converter/ebooks/conversion/plugins/docx_output.py
Normal file
@@ -0,0 +1,93 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=utf-8
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
from calibre.customize.conversion import OutputFormatPlugin, OptionRecommendation
|
||||
|
||||
# Page size names offered for DOCX output: a0-a6, then b0-b6, then the two
# named sizes.
PAGE_SIZES = ['%s%d' % (series, number)
              for series in ('a', 'b')
              for number in range(7)] + ['legal', 'letter']
|
||||
|
||||
|
||||
class DOCXOutput(OutputFormatPlugin):
    '''
    Output plugin that writes an OEB book as a DOCX file.
    '''

    name = 'DOCX Output'
    author = 'Kovid Goyal'
    file_type = 'docx'
    commit_name = 'docx_output'
    ui_data = {'page_sizes': PAGE_SIZES}

    options = {
        OptionRecommendation(
            name='docx_page_size', recommended_value='letter',
            level=OptionRecommendation.LOW, choices=PAGE_SIZES,
            help=_('The size of the page. Default is letter. Choices '
                   'are %s') % PAGE_SIZES),

        OptionRecommendation(
            name='docx_custom_page_size', recommended_value=None,
            help=_('Custom size of the document. Use the form widthxheight '
                   'EG. `123x321` to specify the width and height (in pts). '
                   'This overrides any specified page-size.')),

        OptionRecommendation(
            name='docx_no_cover', recommended_value=False,
            help=_('Do not insert the book cover as an image at the start of the document.'
                   ' If you use this option, the book cover will be discarded.')),

        OptionRecommendation(
            name='preserve_cover_aspect_ratio', recommended_value=False,
            help=_('Preserve the aspect ratio of the cover image instead of stretching'
                   ' it out to cover the entire page.')),

        OptionRecommendation(
            name='docx_no_toc', recommended_value=False,
            help=_('Do not insert the table of contents as a page at the start of the document.')),

        OptionRecommendation(
            name='extract_to',
            help=_('Extract the contents of the generated %s file to the '
                   'specified directory. The contents of the directory are first '
                   'deleted, so be careful.') % 'DOCX'),

        OptionRecommendation(
            name='docx_page_margin_left', recommended_value=72.0,
            level=OptionRecommendation.LOW,
            help=_('The size of the left page margin, in pts. Default is 72pt.'
                   ' Overrides the common left page margin setting.')
        ),

        OptionRecommendation(
            name='docx_page_margin_top', recommended_value=72.0,
            level=OptionRecommendation.LOW,
            help=_('The size of the top page margin, in pts. Default is 72pt.'
                   ' Overrides the common top page margin setting, unless set to zero.')
        ),

        OptionRecommendation(
            name='docx_page_margin_right', recommended_value=72.0,
            level=OptionRecommendation.LOW,
            help=_('The size of the right page margin, in pts. Default is 72pt.'
                   ' Overrides the common right page margin setting, unless set to zero.')
        ),

        OptionRecommendation(
            name='docx_page_margin_bottom', recommended_value=72.0,
            level=OptionRecommendation.LOW,
            help=_('The size of the bottom page margin, in pts. Default is 72pt.'
                   ' Overrides the common bottom page margin setting, unless set to zero.')
        ),
    }

    def convert_metadata(self, oeb):
        '''
        Serialise the book's metadata into an OPF 2 ``package`` element,
        parse it back and store the resulting book metadata on ``self.mi``.

        :param oeb: The OEB book whose ``metadata`` is converted.
        '''
        from lxml import etree
        from calibre.ebooks.oeb.base import OPF, OPF2_NS
        from calibre.ebooks.metadata.opf2 import OPF as ReadOPF
        from io import BytesIO
        package = etree.Element(
            OPF('package'), attrib={'version': '2.0'}, nsmap={None: OPF2_NS})
        oeb.metadata.to_opf2(package)
        raw_opf = etree.tostring(package, encoding='utf-8')
        reader = ReadOPF(
            BytesIO(raw_opf), populate_spine=False, try_to_guess_cover=False)
        self.mi = reader.to_book_metadata()

    def convert(self, oeb, output_path, input_plugin, opts, log):
        '''
        Convert ``oeb`` to DOCX at ``output_path`` and, when the
        ``extract_to`` option is set, dump the generated file's contents
        into that directory.

        :param oeb: The OEB book to convert.
        :param output_path: Destination path of the DOCX file.
        :param input_plugin: Unused by this method.
        :param opts: Conversion options.
        :param log: Logger passed to the DOCX container.
        '''
        from calibre.ebooks.docx.writer.container import DOCX
        from calibre.ebooks.docx.writer.from_html import Convert
        docx = DOCX(opts, log)
        self.convert_metadata(oeb)
        with_cover = not opts.docx_no_cover
        with_toc = not opts.docx_no_toc
        Convert(oeb, docx, self.mi, with_cover, with_toc)()
        docx.write(output_path, self.mi)
        if opts.extract_to:
            from calibre.ebooks.docx.dump import do_dump
            do_dump(output_path, opts.extract_to)
|
||||
438
ebook_converter/ebooks/conversion/plugins/epub_input.py
Normal file
438
ebook_converter/ebooks/conversion/plugins/epub_input.py
Normal file
@@ -0,0 +1,438 @@
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os, re, posixpath
|
||||
from itertools import cycle
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
|
||||
from polyglot.builtins import getcwd
|
||||
|
||||
# Algorithm URIs identifying the two supported font obfuscation schemes.
ADOBE_OBFUSCATION = 'http://ns.adobe.com/pdf/enc#RC'
IDPF_OBFUSCATION = 'http://www.idpf.org/2008/embedding'


def decrypt_font_data(key, data, algorithm):
    '''
    XOR the leading bytes of ``data`` with ``key`` (repeated as needed) and
    return the result followed by the untouched remainder.

    :param key: Key bytes; cycled over the obfuscated prefix.
    :param data: Raw font bytes.
    :param algorithm: Algorithm URI. ``ADOBE_OBFUSCATION`` selects a
                      1024-byte prefix, anything else a 1040-byte prefix.
    :return: The de-obfuscated font data.
    '''
    prefix_len = 1040
    if algorithm == ADOBE_OBFUSCATION:
        prefix_len = 1024
    prefix = bytearray(data[:prefix_len])
    key_stream = cycle(bytearray(key))
    plain = bytearray(byte ^ next(key_stream) for byte in prefix)
    return bytes(plain) + data[prefix_len:]
|
||||
|
||||
|
||||
def decrypt_font(key, path, algorithm):
    '''
    De-obfuscate the font file at ``path`` in place.

    :param key: Key bytes passed to ``decrypt_font_data``.
    :param path: Path of the font file; it is overwritten with the result.
    :param algorithm: Algorithm URI passed to ``decrypt_font_data``.
    '''
    with lopen(path, 'r+b') as font:
        plain = decrypt_font_data(key, font.read(), algorithm)
        # Rewind and clear the file before writing the decoded bytes.
        font.seek(0)
        font.truncate()
        font.write(plain)
|
||||
|
||||
|
||||
class EPUBInput(InputFormatPlugin):
|
||||
|
||||
name = 'EPUB Input'
|
||||
author = 'Kovid Goyal'
|
||||
description = 'Convert EPUB files (.epub) to HTML'
|
||||
file_types = {'epub'}
|
||||
output_encoding = None
|
||||
commit_name = 'epub_input'
|
||||
|
||||
recommendations = {('page_breaks_before', '/', OptionRecommendation.MED)}
|
||||
|
||||
def process_encryption(self, encfile, opf, log):
    '''
    De-obfuscate every font listed in ``encryption.xml``.

    Two keys are prepared: the SHA-1 digest of the OPF unique identifier
    with whitespace removed, used for ``IDPF_OBFUSCATION``, and the raw
    bytes of a UUID identifier, used for ``ADOBE_OBFUSCATION``. Each
    de-obfuscated file's URI is appended to ``self._encrypted_font_uris``.

    :param encfile: Path to ``META-INF/encryption.xml``.
    :param opf: Parsed OPF of the book.
    :param log: Unused by this method.
    :return: ``True`` if every encryption method found is one of the two
             font obfuscation schemes; ``False`` if any other algorithm is
             present or an error occurs while processing the file.
    '''
    from lxml import etree
    import uuid, hashlib
    idpf_key = opf.raw_unique_identifier
    if idpf_key:
        idpf_key = re.sub('[\u0020\u0009\u000d\u000a]', '', idpf_key)
        idpf_key = hashlib.sha1(idpf_key.encode('utf-8')).digest()
    key = None
    for item in opf.identifier_iter():
        scheme = None
        for xkey in item.attrib.keys():
            if xkey.endswith('scheme'):
                scheme = item.get(xkey)
        if (scheme and scheme.lower() == 'uuid') or \
                (item.text and item.text.startswith('urn:uuid:')):
            try:
                key = item.text.rpartition(':')[-1]
                key = uuid.UUID(key).bytes
            except Exception:
                # A malformed UUID only disables the Adobe key; narrowed
                # from a bare except so KeyboardInterrupt/SystemExit are
                # no longer swallowed.
                import traceback
                traceback.print_exc()
                key = None

    try:
        root = etree.parse(encfile)
        for em in root.xpath('descendant::*[contains(name(), "EncryptionMethod")]'):
            algorithm = em.get('Algorithm', '')
            if algorithm not in {ADOBE_OBFUSCATION, IDPF_OBFUSCATION}:
                # Anything other than font obfuscation is reported as
                # unsupported to the caller.
                return False
            cr = em.getparent().xpath('descendant::*[contains(name(), "CipherReference")]')[0]
            uri = cr.get('URI')
            path = os.path.abspath(os.path.join(os.path.dirname(encfile), '..', *uri.split('/')))
            tkey = (key if algorithm == ADOBE_OBFUSCATION else idpf_key)
            if (tkey and os.path.exists(path)):
                self._encrypted_font_uris.append(uri)
                decrypt_font(tkey, path, algorithm)
        return True
    except Exception:
        import traceback
        traceback.print_exc()
    return False
|
||||
|
||||
def set_guide_type(self, opf, gtype, href=None, title=''):
    '''
    Replace the guide entries of type ``gtype``.

    All existing guide entries whose lower-cased type equals ``gtype`` are
    removed. If ``href`` is given, a new entry is created and appended to
    the first ``<guide>`` child of the OPF root, creating that element
    when there is none.

    :param opf: Parsed OPF to modify.
    :param gtype: Guide entry type to replace.
    :param href: Target of the new entry, or ``None`` to only remove.
    :param title: Title of the new entry.
    :return: The new entry when a ``<guide>`` element had to be created;
             ``None`` in every other case.
    '''
    stale = [ref for ref in opf.iterguide()
             if ref.get('type', '').lower() == gtype]
    for ref in stale:
        ref.getparent().remove(ref)

    if href is None:
        return
    t = opf.create_guide_item(gtype, title, href)
    guides = opf.root.xpath('./*[local-name()="guide"]')
    if guides:
        guides[0].append(t)
        return
    guide = opf.create_guide_element()
    opf.root.append(guide)
    guide.append(t)
    return t
|
||||
|
||||
def rationalize_cover3(self, opf, log):
    ''' If there is a reference to the cover/titlepage via manifest properties, convert to
    entries in the <guide> so that the rest of the pipeline picks it up.

    :return: The href of the titlepage if its spine item was removed,
             otherwise ``None``.
    '''
    from calibre.ebooks.metadata.opf3 import items_with_property

    # Look for titlepages incorrectly marked in the <guide> as covers
    guide_cover = None
    for ref in opf.iterguide():
        if ref.get('type', '').lower() == 'cover':
            guide_cover = ref.get('href', '').partition('#')[0]
            break
    fallback_href = fallback_id = None
    if guide_cover:
        spine = list(opf.iterspine())
        if spine:
            first_idref = spine[0].get('idref', '')
            for entry in opf.itermanifest():
                if entry.get('id') == first_idref and entry.get('href') == guide_cover:
                    fallback_href, fallback_id = guide_cover, first_idref
                    break

    raster_cover_href = opf.epub3_raster_cover or opf.raster_cover
    if raster_cover_href:
        self.set_guide_type(opf, 'cover', raster_cover_href, 'Cover Image')

    # Prefer a manifest item explicitly flagged as the title page; fall
    # back to the guide cover found above.
    titlepage_id = titlepage_href = None
    for item in items_with_property(opf.root, 'calibre:title-page'):
        tid, href = item.get('id'), item.get('href')
        if href and tid:
            titlepage_id, titlepage_href = tid, href.partition('#')[0]
            break
    if titlepage_href is None:
        titlepage_href, titlepage_id = fallback_href, fallback_id
    if titlepage_href is None:
        return None

    self.set_guide_type(opf, 'titlepage', titlepage_href, 'Title Page')
    spine = list(opf.iterspine())
    if len(spine) <= 1:
        return None
    for item in spine:
        if item.get('idref') != titlepage_id:
            continue
        log('Found HTML cover', titlepage_href)
        if self.for_viewer:
            # Keep the page but drop any linear attribute.
            item.attrib.pop('linear', None)
            return None
        item.getparent().remove(item)
        return titlepage_href
    return None
|
||||
|
||||
def rationalize_cover2(self, opf, log):
    ''' Ensure that the cover information in the guide is correct. That
    means, at most one entry with type="cover" that points to a raster
    cover and at most one entry with type="titlepage" that points to an
    HTML titlepage.

    :return: The href of the HTML cover if its spine item was removed,
             otherwise ``None``.
    '''
    from calibre.ebooks.oeb.base import OPF
    from lxml import etree
    removed = None

    # After this loop guide_elem is the cover entry when one was found,
    # otherwise the last guide entry (or None if the guide is empty).
    guide_cover, guide_elem = None, None
    for guide_elem in opf.iterguide():
        if guide_elem.get('type', '').lower() == 'cover':
            guide_cover = guide_elem.get('href', '').partition('#')[0]
            break

    if not guide_cover:
        # No usable cover entry: add one for the raster cover, if any.
        raster_cover = opf.raster_cover
        if raster_cover:
            if guide_elem is None:
                g = opf.root.makeelement(OPF('guide'))
                opf.root.append(g)
            else:
                g = guide_elem.getparent()
            reference = g.makeelement(
                OPF('reference'), attrib={'href':raster_cover, 'type':'cover'})
            g.append(reference)
        return

    spine = list(opf.iterspine())
    if not spine:
        return
    # Check if the cover specified in the guide is also
    # the first element in spine
    idref = spine[0].get('idref', '')
    manifest = list(opf.itermanifest())
    if not manifest:
        return
    elem = [x for x in manifest if x.get('id', '') == idref]
    if not elem or elem[0].get('href', None) != guide_cover:
        return
    log('Found HTML cover', guide_cover)

    if self.for_viewer:
        # Ensure the cover is displayed as the first item in the book, some
        # epub files have it set with linear='no' which causes the cover to
        # display in the end
        spine[0].attrib.pop('linear', None)
        opf.spine[0].is_linear = True
    elif len(spine) == 1:
        log.warn('There is only a single spine item and it is marked as the cover. Removing cover marking.')
        for marked in tuple(opf.iterguide()):
            if marked.get('type', '').lower() == 'cover':
                marked.getparent().remove(marked)
        return
    else:
        # Remove from spine as covers must be treated
        # specially
        spine[0].getparent().remove(spine[0])
        removed = guide_cover

    # Ensure that the guide has a cover entry pointing to a raster cover
    # and a titlepage entry pointing to the html titlepage. The titlepage
    # entry will be used by the epub output plugin, the raster cover entry
    # by other output plugins.

    # Search for a raster cover identified in the OPF
    raster_cover = opf.raster_cover

    # Set the cover guide entry
    if raster_cover is not None:
        guide_elem.set('href', raster_cover)
    else:
        # Render the titlepage to create a raster cover
        from calibre.ebooks import render_html_svg_workaround
        guide_elem.set('href', 'calibre_raster_cover.jpg')
        t = etree.SubElement(
            elem[0].getparent(), OPF('item'), href=guide_elem.get('href'), id='calibre_raster_cover')
        t.set('media-type', 'image/jpeg')
        if os.path.exists(guide_cover):
            renderer = render_html_svg_workaround(guide_cover, log)
            if renderer is not None:
                with lopen('calibre_raster_cover.jpg', 'wb') as f:
                    f.write(renderer)

    # Set the titlepage guide entry
    self.set_guide_type(opf, 'titlepage', guide_cover, 'Title Page')
    return removed
|
||||
|
||||
def find_opf(self):
    '''
    Locate the OPF named in ``META-INF/container.xml``.

    :return: Path (under the current directory) of the first ``rootfile``
             with the OPF media type that exists on disk; ``None`` when
             there is none or the container file cannot be read/parsed.
    '''
    from calibre.utils.xml_parse import safe_xml_fromstring

    def attr(node, suffix):
        # First attribute value whose (possibly namespaced) name ends with
        # suffix; None when there is no such attribute.
        return next(
            (v for k, v in node.attrib.items() if k.endswith(suffix)), None)

    try:
        with lopen('META-INF/container.xml', 'rb') as f:
            root = safe_xml_fromstring(f.read())
        for rootfile in root.xpath('//*[local-name()="rootfile"]'):
            if attr(rootfile, 'media-type') != "application/oebps-package+xml":
                continue
            full_path = attr(rootfile, 'full-path')
            if not full_path:
                continue
            candidate = os.path.join(getcwd(), *full_path.split('/'))
            if os.path.exists(candidate):
                return candidate
    except Exception:
        import traceback
        traceback.print_exc()
|
||||
|
||||
def convert(self, stream, options, file_ext, log, accelerators):
    '''
    Unpack an EPUB into the current directory and normalise its OPF.

    The OPF is located, obfuscated fonts are decoded, manifest and guide
    hrefs are rebased onto the current directory, the cover is
    rationalised, an EPUB 3 nav document is converted to NCX, and invalid
    or duplicate spine entries are dropped. The result is written to
    ``content.opf``.

    :param stream: Open EPUB file.
    :param options: Conversion options, passed to ``convert_epub3_nav``.
    :param file_ext: Unused by this method.
    :param log: Logger.
    :param accelerators: Unused by this method.
    :return: Absolute path of the written ``content.opf``.
    :raises ValueError: If no OPF is found, the book uses DTBook markup,
                        or no valid spine entries remain.
    :raises DRMError: If ``encryption.xml`` exists and could not be
                      processed as font obfuscation.
    '''
    from calibre.utils.zipfile import ZipFile
    from calibre import walk
    from calibre.ebooks import DRMError
    from calibre.ebooks.metadata.opf2 import OPF
    try:
        zf = ZipFile(stream)
        zf.extractall(getcwd())
    except Exception:
        # Narrowed from a bare except so that KeyboardInterrupt and
        # SystemExit abort the conversion instead of starting the
        # fallback parser.
        log.exception('EPUB appears to be invalid ZIP file, trying a'
                ' more forgiving ZIP parser')
        from calibre.utils.localunzip import extractall
        stream.seek(0)
        extractall(stream)
    encfile = os.path.abspath(os.path.join('META-INF', 'encryption.xml'))
    opf = self.find_opf()
    if opf is None:
        # Fall back to the first .opf in the tree, skipping hidden files
        # and anything under a __MACOSX directory.
        for f in walk('.'):
            if f.lower().endswith('.opf') and '__MACOSX' not in f and \
                    not os.path.basename(f).startswith('.'):
                opf = os.path.abspath(f)
                break
    path = getattr(stream, 'name', 'stream')

    if opf is None:
        raise ValueError('%s is not a valid EPUB file (could not find opf)'%path)

    opf = os.path.relpath(opf, getcwd())
    parts = os.path.split(opf)
    opf = OPF(opf, os.path.dirname(os.path.abspath(opf)))

    self._encrypted_font_uris = []
    if os.path.exists(encfile):
        if not self.process_encryption(encfile, opf, log):
            raise DRMError(os.path.basename(path))
    self.encrypted_fonts = self._encrypted_font_uris

    if len(parts) > 1 and parts[0]:
        # The OPF lives in a sub-directory: prefix hrefs with it so they
        # resolve against the current directory, where content.opf goes.
        delta = '/'.join(parts[:-1])+'/'

        def normpath(href):
            # Uses its argument; the old version ignored it and read the
            # enclosing loop variable instead.
            return posixpath.normpath(delta + href)

        for elem in opf.itermanifest():
            elem.set('href', normpath(elem.get('href')))
        for elem in opf.iterguide():
            elem.set('href', normpath(elem.get('href')))

    f = self.rationalize_cover3 if opf.package_version >= 3.0 else self.rationalize_cover2
    self.removed_cover = f(opf, log)
    if self.removed_cover:
        self.removed_items_to_ignore = (self.removed_cover,)
    epub3_nav = opf.epub3_nav
    if epub3_nav is not None:
        self.convert_epub3_nav(epub3_nav, opf, log, options)

    for x in opf.itermanifest():
        if x.get('media-type', '') == 'application/x-dtbook+xml':
            raise ValueError(
                'EPUB files with DTBook markup are not supported')

    # Manifest ids that must not appear in the spine.
    not_for_spine = set()
    for y in opf.itermanifest():
        id_ = y.get('id', None)
        if id_:
            mt = y.get('media-type', None)
            if mt in {
                    'application/vnd.adobe-page-template+xml',
                    'application/vnd.adobe.page-template+xml',
                    'application/adobe-page-template+xml',
                    'application/adobe.page-template+xml',
                    'application/text'
            }:
                not_for_spine.add(id_)
            ext = y.get('href', '').rpartition('.')[-1].lower()
            if mt == 'text/plain' and ext in {'otf', 'ttf'}:
                # some epub authoring software sets font mime types to
                # text/plain
                not_for_spine.add(id_)
                y.set('media-type', 'application/font')

    # Drop spine entries with no idref, an excluded idref, or a repeat.
    seen = set()
    for x in list(opf.iterspine()):
        ref = x.get('idref', None)
        if not ref or ref in not_for_spine or ref in seen:
            x.getparent().remove(x)
            continue
        seen.add(ref)

    if len(list(opf.iterspine())) == 0:
        raise ValueError('No valid entries in the spine of this EPUB')

    with lopen('content.opf', 'wb') as nopf:
        nopf.write(opf.render())

    return os.path.abspath('content.opf')
|
||||
|
||||
def convert_epub3_nav(self, nav_path, opf, log, opts):
    '''
    Build an NCX table of contents from an EPUB 3 navigation document and
    register it in the OPF.

    The first ``<nav epub:type="toc">`` that has an ``<ol>`` is walked;
    if there is none, nothing is changed. The NCX is written next to the
    nav document, added to the manifest and set as the ``toc`` of every
    spine element. ``opts.epub3_nav_href`` and ``opts.epub3_nav_parsed``
    are set. If a cover was removed earlier, nav links pointing at it are
    tagged and the nav document is rewritten.

    :param nav_path: Path of the navigation document.
    :param opf: Parsed OPF to update.
    :param log: Logger passed to the HTML parser.
    :param opts: Conversion options object that receives the nav details.
    '''
    from lxml import etree
    from calibre.ebooks.chardet import xml_to_unicode
    from calibre.ebooks.oeb.polish.parsing import parse
    from calibre.ebooks.oeb.base import EPUB_NS, XHTML, NCX_MIME, NCX, urlnormalize, urlunquote, serialize
    from calibre.ebooks.oeb.polish.toc import first_child
    from calibre.utils.xml_parse import safe_xml_fromstring
    from tempfile import NamedTemporaryFile
    with lopen(nav_path, 'rb') as f:
        raw = f.read()
    raw = xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0]
    root = parse(raw, log=log)
    ncx = safe_xml_fromstring('<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1" xml:lang="eng"><navMap/></ncx>')
    navmap = ncx[0]
    et = '{%s}type' % EPUB_NS
    bn = os.path.basename(nav_path)

    def add_from_li(li, parent):
        # Create a navPoint under parent from the first <a>/<span> child
        # of li, if any.
        href = text = None
        for x in li.iterchildren(XHTML('a'), XHTML('span')):
            text = etree.tostring(
                x, method='text', encoding='unicode', with_tail=False).strip()
            if not text:
                # No visible text: use title attributes instead.
                text = ' '.join(
                    x.xpath('descendant-or-self::*/@title')).strip()
            href = x.get('href')
            if href and href.startswith('#'):
                # Fragment-only links refer to the nav document itself.
                href = bn + href
            break
        np = parent.makeelement(NCX('navPoint'))
        parent.append(np)
        label = np.makeelement(NCX('navLabel'))
        np.append(label)
        label_text = np.makeelement(NCX('text'))
        label.append(label_text)
        label_text.text = text
        if href:
            np.append(np.makeelement(NCX('content'), attrib={'src':href}))
        return np

    def process_nav_node(node, toc_parent):
        # Mirror the nested <ol>/<li> structure as nested navPoints.
        for li in node.iterchildren(XHTML('li')):
            child = add_from_li(li, toc_parent)
            ol = first_child(li, XHTML('ol'))
            if child is not None and ol is not None:
                process_nav_node(ol, child)

    for nav in root.iterdescendants(XHTML('nav')):
        if nav.get(et) == 'toc':
            ol = first_child(nav, XHTML('ol'))
            if ol is not None:
                process_nav_node(ol, navmap)
                break
    else:
        return

    with NamedTemporaryFile(suffix='.ncx', dir=os.path.dirname(nav_path), delete=False) as f:
        f.write(etree.tostring(ncx, encoding='utf-8'))
    ncx_href = os.path.relpath(f.name, getcwd()).replace(os.sep, '/')
    ncx_id = opf.create_manifest_item(ncx_href, NCX_MIME, append=True).get('id')
    for spine in opf.root.xpath('//*[local-name()="spine"]'):
        spine.set('toc', ncx_id)
    opts.epub3_nav_href = urlnormalize(os.path.relpath(nav_path).replace(os.sep, '/'))
    opts.epub3_nav_parsed = root

    if not getattr(self, 'removed_cover', None):
        return
    changed = False
    base_path = os.path.dirname(nav_path)
    for elem in root.xpath('//*[@href]'):
        href = elem.get('href').partition('#')[0]
        link_path = os.path.relpath(os.path.join(base_path, urlunquote(href)), base_path)
        abs_href = urlnormalize(link_path)
        if abs_href == self.removed_cover:
            changed = True
            elem.set('data-calibre-removed-titlepage', '1')
    if changed:
        with lopen(nav_path, 'wb') as f:
            f.write(serialize(root, 'application/xhtml+xml'))
|
||||
|
||||
def postprocess_book(self, oeb, opts, log):
    '''
    Remember the ToC entry that points at the removed cover page.

    If ``self.removed_cover`` is set, the first ToC entry whose href
    (ignoring any fragment) equals it is looked up. When that href is not
    the href of any spine item, the entry is stored as
    ``oeb.toc.item_that_refers_to_cover``.

    :param oeb: The OEB book; ``oeb.toc`` and ``oeb.spine`` are read
    :param opts: Unused
    :param log: Unused
    '''
    rc = getattr(self, 'removed_cover', None)
    if not rc:
        return
    cover_toc_item = next(
        (item for item in oeb.toc.iterdescendants()
         if item.href and item.href.partition('#')[0] == rc), None)
    if cover_toc_item is None:
        return
    spine_hrefs = {x.href for x in oeb.spine}
    # Compare hrefs with hrefs: testing the ToC node itself against a set
    # of href strings could never match. The entry's href without the
    # fragment is rc by construction.
    if rc not in spine_hrefs:
        oeb.toc.item_that_refers_to_cover = cover_toc_item
|
||||
548
ebook_converter/ebooks/conversion/plugins/epub_output.py
Normal file
548
ebook_converter/ebooks/conversion/plugins/epub_output.py
Normal file
@@ -0,0 +1,548 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os, shutil, re
|
||||
|
||||
from calibre.customize.conversion import (OutputFormatPlugin,
|
||||
OptionRecommendation)
|
||||
from calibre.ptempfile import TemporaryDirectory
|
||||
from calibre import CurrentDir
|
||||
from polyglot.builtins import unicode_type, filter, map, zip, range, as_bytes
|
||||
|
||||
# Tag names treated as block level when deciding how tall the replacement
# for a <br> that is a direct child of <body> should be.
block_level_tags = (
    'address', 'body', 'blockquote', 'center', 'dir', 'div', 'dl',
    'fieldset', 'form',
    'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
    'hr', 'isindex', 'menu', 'noframes', 'noscript',
    'ol', 'p', 'pre', 'table', 'ul',
)
|
||||
|
||||
|
||||
class EPUBOutput(OutputFormatPlugin):

    # Plugin identification
    name = 'EPUB Output'
    author = 'Kovid Goyal'
    file_type = 'epub'
    commit_name = 'epub_output'
    # EPUB versions offered as choices for the epub_version option
    ui_data = {'versions': ('2', '3')}

    # Conversion options specific to EPUB output
    options = {
        OptionRecommendation(
            name='extract_to',
            help=_('Extract the contents of the generated %s file to the '
                   'specified directory. The contents of the directory are first '
                   'deleted, so be careful.') % 'EPUB'),

        OptionRecommendation(
            name='dont_split_on_page_breaks',
            recommended_value=False,
            level=OptionRecommendation.LOW,
            help=_('Turn off splitting at page breaks. Normally, input '
                   'files are automatically split at every page break into '
                   'two files. This gives an output e-book that can be '
                   'parsed faster and with less resources. However, '
                   'splitting is slow and if your source file contains a '
                   'very large number of page breaks, you should turn off '
                   'splitting on page breaks.')),

        OptionRecommendation(
            name='flow_size',
            recommended_value=260,
            help=_('Split all HTML files larger than this size (in KB). '
                   'This is necessary as most EPUB readers cannot handle large '
                   'file sizes. The default of %defaultKB is the size required '
                   'for Adobe Digital Editions. Set to 0 to disable size based splitting.')),

        OptionRecommendation(
            name='no_default_epub_cover',
            recommended_value=False,
            help=_('Normally, if the input file has no cover and you don\'t'
                   ' specify one, a default cover is generated with the title, '
                   'authors, etc. This option disables the generation of this cover.')),

        OptionRecommendation(
            name='no_svg_cover',
            recommended_value=False,
            help=_('Do not use SVG for the book cover. Use this option if '
                   'your EPUB is going to be used on a device that does not '
                   'support SVG, like the iPhone or the JetBook Lite. '
                   'Without this option, such devices will display the cover '
                   'as a blank page.')),

        OptionRecommendation(
            name='preserve_cover_aspect_ratio',
            recommended_value=False,
            help=_('When using an SVG cover, this option will cause the cover to scale '
                   'to cover the available screen area, but still preserve its aspect ratio '
                   '(ratio of width to height). That means there may be white borders '
                   'at the sides or top and bottom of the image, but the image will '
                   'never be distorted. Without this option the image may be slightly '
                   'distorted, but there will be no borders.')),

        OptionRecommendation(
            name='epub_flatten',
            recommended_value=False,
            help=_('This option is needed only if you intend to use the EPUB'
                   ' with FBReaderJ. It will flatten the file system inside the'
                   ' EPUB, putting all files into the top level.')),

        OptionRecommendation(
            name='epub_inline_toc',
            recommended_value=False,
            help=_('Insert an inline Table of Contents that will appear as part of the main book content.')),

        OptionRecommendation(
            name='epub_toc_at_end',
            recommended_value=False,
            help=_('Put the inserted inline Table of Contents at the end of the book instead of the start.')),

        OptionRecommendation(
            name='toc_title',
            recommended_value=None,
            help=_('Title for any generated in-line table of contents.')),

        OptionRecommendation(
            name='epub_version',
            recommended_value='2',
            choices=ui_data['versions'],
            help=_('The version of the EPUB file to generate. EPUB 2 is the'
                   ' most widely compatible, only use EPUB 3 if you know you'
                   ' actually need it.')),
    }

    # Values this plugin recommends for options defined elsewhere
    recommendations = {('pretty_print', True, OptionRecommendation.HIGH)}
|
||||
|
||||
def workaround_webkit_quirks(self):  # {{{
    '''
    Turn every ``<pre>`` element that has neither text nor children into a
    ``div`` in all spine documents that have a ``<body>``.
    '''
    from calibre.ebooks.oeb.base import XPath
    for item in self.oeb.spine:
        found = XPath('//h:body')(item.data)
        body = found[0] if found else found
        if not hasattr(body, 'xpath'):
            # No <body> element in this document
            continue
        for pre in XPath('//h:pre')(body):
            if pre.text or len(pre) != 0:
                continue
            pre.tag = 'div'
# }}}
|
||||
|
||||
def upshift_markup(self):  # {{{
    'Upgrade markup to comply with XHTML 1.1 where possible'
    from calibre.ebooks.oeb.base import XPath, XML
    for item in self.oeb.spine:
        root = item.data
        # Copy a plain lang attribute to xml:lang when the latter is missing
        plain_lang = root.get('lang')
        if plain_lang and not root.get(XML('lang')):
            root.set(XML('lang'), plain_lang)

        found = XPath('//h:body')(root)
        body = found[0] if found else found
        if not hasattr(body, 'xpath'):
            # No <body> element in this document
            continue

        for underline in XPath('//h:u')(root):
            underline.tag = 'span'

        # Drop id/name attributes whose value was already used earlier in
        # document order, keeping only the first occurrence of each value.
        seen = {'id': set(), 'name': set()}
        for elem in XPath('//*[@id or @name]')(root):
            values = [(attr, elem.get(attr, None)) for attr in ('id', 'name')]
            for attr, value in values:
                if not value:
                    continue
                if value in seen[attr]:
                    del elem.attrib[attr]
                else:
                    seen[attr].add(value)

# }}}
|
||||
|
||||
def convert(self, oeb, output_path, input_plugin, opts, log):
    '''
    Write the OEB book ``oeb`` to ``output_path`` as an EPUB file.

    The book is transformed in place (optional inline ToC, filename
    normalisation, reader workarounds, image rescaling, splitting, cover
    handling), serialised into a temporary directory by the ``oeb`` output
    plugin and then packed into the EPUB container. If ``opts.extract_to``
    is set, the finished EPUB is additionally unpacked there, after
    deleting whatever exists at that path.

    :param oeb: The OEB book to convert
    :param output_path: Path of the EPUB file to create
    :param input_plugin: Input plugin; its optional ``encrypted_fonts``
                         attribute lists fonts to obfuscate
    :param opts: Conversion options
    :param log: Logger
    '''
    self.log, self.opts, self.oeb = log, opts, oeb

    if self.opts.epub_inline_toc:
        from calibre.ebooks.mobi.writer8.toc import TOCAdder
        opts.mobi_toc_at_start = not opts.epub_toc_at_end
        opts.mobi_passthrough = False
        opts.no_inline_toc = False
        TOCAdder(oeb, opts, replace_previous_inline_toc=True, ignore_existing_toc=True)

    if self.opts.epub_flatten:
        from calibre.ebooks.oeb.transforms.filenames import FlatFilenames
        FlatFilenames()(oeb, opts)
    else:
        from calibre.ebooks.oeb.transforms.filenames import UniqueFilenames
        UniqueFilenames()(oeb, opts)

    self.workaround_ade_quirks()
    self.workaround_webkit_quirks()
    self.upshift_markup()
    from calibre.ebooks.oeb.transforms.rescale import RescaleImages
    RescaleImages(check_colorspaces=True)(oeb, opts)

    from calibre.ebooks.oeb.transforms.split import Split
    split = Split(not self.opts.dont_split_on_page_breaks,
                  max_flow_size=self.opts.flow_size*1024)
    split(self.oeb, self.opts)

    from calibre.ebooks.oeb.transforms.cover import CoverManager
    cm = CoverManager(
        no_default_cover=self.opts.no_default_epub_cover,
        no_svg_cover=self.opts.no_svg_cover,
        preserve_aspect_ratio=self.opts.preserve_cover_aspect_ratio)
    cm(self.oeb, self.opts, self.log)

    self.workaround_sony_quirks()

    if self.oeb.toc.count() == 0:
        self.log.warn('This EPUB file has no Table of Contents. '
                      'Creating a default TOC')
        first = next(iter(self.oeb.spine))
        self.oeb.toc.add(_('Start'), first.href)

    from calibre.ebooks.oeb.base import OPF
    identifiers = oeb.metadata['identifier']
    uuid = None
    for x in identifiers:
        # The scheme attribute may be absent, in which case get() returns
        # None; treat that as an empty scheme instead of calling .lower()
        # on None.
        scheme = x.get(OPF('scheme'), None) or ''
        if scheme.lower() == 'uuid' or unicode_type(x).startswith('urn:uuid:'):
            uuid = unicode_type(x).split(':')[-1]
            break
    encrypted_fonts = getattr(input_plugin, 'encrypted_fonts', [])

    if uuid is None:
        self.log.warn('No UUID identifier found')
        from uuid import uuid4
        uuid = unicode_type(uuid4())
        oeb.metadata.add('identifier', uuid, scheme='uuid', id=uuid)

    if encrypted_fonts and not uuid.startswith('urn:uuid:'):
        # Apparently ADE requires this value to start with urn:uuid:
        # for some absurd reason, or it will throw a hissy fit and refuse
        # to use the obfuscated fonts.
        for x in identifiers:
            if unicode_type(x) == uuid:
                x.content = 'urn:uuid:'+uuid

    with TemporaryDirectory('_epub_output') as tdir:
        from calibre.customize.ui import plugin_for_output_format
        metadata_xml = None
        extra_entries = []
        if self.is_periodical:
            if self.opts.output_profile.epub_periodical_format == 'sony':
                from calibre.ebooks.epub.periodical import sony_metadata
                metadata_xml, atom_xml = sony_metadata(oeb)
                extra_entries = [('atom.xml', 'application/atom+xml', atom_xml)]
        oeb_output = plugin_for_output_format('oeb')
        oeb_output.convert(oeb, tdir, input_plugin, opts, log)
        opf = [x for x in os.listdir(tdir) if x.endswith('.opf')][0]
        self.condense_ncx([os.path.join(tdir, x) for x in os.listdir(tdir)
                           if x.endswith('.ncx')][0])
        if self.opts.epub_version == '3':
            self.upgrade_to_epub3(tdir, opf)
        encryption = None
        if encrypted_fonts:
            encryption = self.encrypt_fonts(encrypted_fonts, tdir, uuid)

        from calibre.ebooks.epub import initialize_container
        with initialize_container(output_path, os.path.basename(opf),
                                  extra_entries=extra_entries) as epub:
            epub.add_dir(tdir)
            if encryption is not None:
                epub.writestr('META-INF/encryption.xml', as_bytes(encryption))
            if metadata_xml is not None:
                epub.writestr('META-INF/metadata.xml',
                              metadata_xml.encode('utf-8'))
        if opts.extract_to is not None:
            from calibre.utils.zipfile import ZipFile
            # Whatever exists at extract_to is removed first
            if os.path.exists(opts.extract_to):
                if os.path.isdir(opts.extract_to):
                    shutil.rmtree(opts.extract_to)
                else:
                    os.remove(opts.extract_to)
            os.mkdir(opts.extract_to)
            with ZipFile(output_path) as zf:
                zf.extractall(path=opts.extract_to)
            self.log.info('EPUB extracted to', opts.extract_to)
|
||||
|
||||
def upgrade_to_epub3(self, tdir, opf):
    '''
    Upgrade the book serialised in ``tdir`` from EPUB 2 to EPUB 3 in place.

    A ``META-INF/container.xml`` is written so that the directory can be
    opened as an EPUB container, and is removed again once the upgrade
    has been committed.

    :param tdir: Directory containing the serialised book
    :param opf: Name of the OPF file; only its basename is used
    '''
    self.log.info('Upgrading to EPUB 3...')
    from calibre.ebooks.epub import simple_container_xml
    from calibre.ebooks.oeb.polish.cover import fix_conversion_titlepage_links_in_nav
    try:
        os.mkdir(os.path.join(tdir, 'META-INF'))
    except EnvironmentError:
        # Best effort: the directory may already exist
        pass
    with open(os.path.join(tdir, 'META-INF', 'container.xml'), 'wb') as f:
        f.write(simple_container_xml(os.path.basename(opf)).encode('utf-8'))
    from calibre.ebooks.oeb.polish.container import EpubContainer
    container = EpubContainer(tdir, self.log)
    from calibre.ebooks.oeb.polish.upgrade import epub_2_to_3
    existing_nav = getattr(self.opts, 'epub3_nav_parsed', None)
    nav_href = getattr(self.opts, 'epub3_nav_href', None)
    # existing_nav is a parsed element tree; the truth value of an lxml
    # element reflects whether it has children, so test against None
    # explicitly.
    previous_nav = (nav_href, existing_nav) if existing_nav is not None and nav_href else None
    epub_2_to_3(container, self.log.info, previous_nav=previous_nav)
    fix_conversion_titlepage_links_in_nav(container)
    container.commit()
    # Remove the temporary container.xml again; the META-INF directory is
    # removed too if that leaves it empty.
    os.remove(f.name)
    try:
        os.rmdir(os.path.join(tdir, 'META-INF'))
    except EnvironmentError:
        pass
|
||||
|
||||
def encrypt_fonts(self, uris, tdir, uuid):  # {{{
    '''
    Obfuscate font files in place and describe them in an encryption.xml.

    The first 1024 bytes of every existing font are XORed with a 16 byte
    key derived from the hexadecimal digits of ``uuid``. Fonts that do not
    exist are skipped. Fonts shorter than 1024 bytes are left untouched
    and are not listed in the result.

    :param uris: URIs of the fonts ('/' separated, relative to ``tdir``)
    :param tdir: Directory containing the serialised book
    :param uuid: The UUID identifier of the book
    :return: The text of encryption.xml, or None if no font was obfuscated
    :raises ValueError: If ``uuid`` has fewer than 16 hexadecimal digits
    '''
    from xml.sax.saxutils import escape
    from polyglot.binary import from_hex_bytes

    key = re.sub(r'[^a-fA-F0-9]', '', uuid)
    if len(key) < 16:
        raise ValueError('UUID identifier %r is invalid'%uuid)
    key = bytearray(from_hex_bytes((key + key)[:32]))
    with CurrentDir(tdir):
        paths = [os.path.join(*x.split('/')) for x in uris]
        # Going through a dict collapses duplicate URIs, so that no font
        # is XORed twice (which would restore the original bytes).
        uris = dict(zip(uris, paths))
        fonts = []
        for uri, path in list(uris.items()):
            if not os.path.exists(path):
                continue
            self.log.debug('Encrypting font:', uri)
            with lopen(path, 'r+b') as f:
                data = f.read(1024)
                if len(data) < 1024:
                    self.log.warn('Font', path, 'is invalid, ignoring')
                    # The file was not modified, so it must not be
                    # declared as encrypted either.
                    continue
                data = bytearray(data)
                f.seek(0)
                f.write(bytes(bytearray(data[i] ^ key[i%16] for i in range(1024))))
            if not isinstance(uri, unicode_type):
                uri = uri.decode('utf-8')
            # Escape the URI for use inside a double quoted XML attribute
            fonts.append('''
                <enc:EncryptedData>
                    <enc:EncryptionMethod Algorithm="http://ns.adobe.com/pdf/enc#RC"/>
                    <enc:CipherData>
                    <enc:CipherReference URI="%s"/>
                    </enc:CipherData>
                </enc:EncryptedData>
                '''%(escape(uri, {'"': '&quot;'})))
        if not fonts:
            return None
        ans = '''<encryption
                xmlns="urn:oasis:names:tc:opendocument:xmlns:container"
                xmlns:enc="http://www.w3.org/2001/04/xmlenc#"
                xmlns:deenc="http://ns.adobe.com/digitaleditions/enc">
                '''
        ans += '\n'.join(fonts)
        ans += '\n</encryption>'
        return ans
# }}}
|
||||
|
||||
def condense_ncx(self, ncx_path):  # {{{
    '''
    Strip leading and trailing whitespace from the text and tail of every
    element of the NCX file at ``ncx_path`` and rewrite the file, unless
    the ``pretty_print`` option is set.
    '''
    from lxml import etree
    if self.opts.pretty_print:
        return
    tree = etree.parse(ncx_path)
    root = tree.getroot()
    for elem in root.iter(tag=etree.Element):
        for attr in ('text', 'tail'):
            value = getattr(elem, attr)
            if value:
                setattr(elem, attr, value.strip())
    compressed = etree.tostring(root, encoding='utf-8')
    with open(ncx_path, 'wb') as out:
        out.write(compressed)
# }}}
|
||||
|
||||
def workaround_ade_quirks(self):  # {{{
    '''
    Perform various markup transforms to get the output to render correctly
    in the quirky ADE.

    The ToC, every spine document and the main stylesheet (if any) of
    ``self.oeb`` are modified in place.
    '''
    from calibre.ebooks.oeb.base import XPath, XHTML, barename, urlunquote

    stylesheet = self.oeb.manifest.main_stylesheet

    # ADE cries big wet tears when it encounters an invalid fragment
    # identifier in the NCX toc.
    frag_pat = re.compile(r'[-A-Za-z0-9_:.]+$')
    for node in self.oeb.toc.iter():
        href = getattr(node, 'href', None)
        if hasattr(href, 'partition'):
            base, _sep, frag = href.partition('#')
            frag = urlunquote(frag)
            if frag and frag_pat.match(frag) is None:
                self.log.warn(
                    'Removing fragment identifier %r from TOC as Adobe Digital Editions cannot handle it'%frag)
                node.href = base

    # Compiled once, used for every spine document below
    special_chars = re.compile('[\u200b\u00ad]')

    for item in self.oeb.spine:
        root = item.data
        body = XPath('//h:body')(root)
        if body:
            body = body[0]

        if hasattr(body, 'xpath'):
            # remove <img> tags with empty src elements
            bad = []
            for img in XPath('//h:img')(body):
                src = img.get('src', '').strip()
                if src in ('', '#') or src.startswith('http:'):
                    bad.append(img)
            for img in bad:
                img.getparent().remove(img)

            # Add id attribute to <a> tags that have name
            for a in XPath('//h:a[@name]')(body):
                if not a.get('id', False):
                    a.set('id', a.get('name'))
                # The delightful epubcheck has started complaining about <a> tags that
                # have name attributes.
                a.attrib.pop('name')

            # Replace <br> that are children of <body> as ADE doesn't handle them
            for br in XPath('./h:br')(body):
                if br.getparent() is None:
                    continue
                try:
                    prior = next(br.itersiblings(preceding=True))
                    priortag = barename(prior.tag)
                    priortext = prior.tail
                except Exception:
                    # No usable preceding sibling: fall back to the body.
                    # Kept broad on purpose (best effort), but no longer
                    # swallows KeyboardInterrupt/SystemExit.
                    priortag = 'body'
                    priortext = body.text
                if priortext:
                    priortext = priortext.strip()
                br.tag = XHTML('p')
                br.text = '\u00a0'
                style = br.get('style', '').split(';')
                style = list(filter(None, map(lambda x: x.strip(), style)))
                style.append('margin:0pt; border:0pt')
                # If the prior tag is a block (including a <br> we replaced)
                # then this <br> replacement should have a 1-line height.
                # Otherwise it should have no height.
                if not priortext and priortag in block_level_tags:
                    style.append('height:1em')
                else:
                    style.append('height:0pt')
                br.set('style', '; '.join(style))

        for tag in XPath('//h:embed')(root):
            tag.getparent().remove(tag)
        for tag in XPath('//h:object')(root):
            # SVG objects are kept, all other objects are removed
            if tag.get('type', '').lower().strip() in {'image/svg+xml', 'application/svg+xml'}:
                continue
            tag.getparent().remove(tag)

        for tag in XPath('//h:title|//h:style')(root):
            if not tag.text:
                tag.getparent().remove(tag)
        for tag in XPath('//h:script')(root):
            if (not tag.text and not tag.get('src', False) and tag.get('type', None) != 'text/x-mathjax-config'):
                tag.getparent().remove(tag)
        for tag in XPath('//h:body/descendant::h:script')(root):
            tag.getparent().remove(tag)

        formchildren = XPath('./h:input|./h:button|./h:textarea|'
                             './h:label|./h:fieldset|./h:legend')
        for tag in XPath('//h:form')(root):
            if formchildren(tag):
                tag.getparent().remove(tag)
            else:
                # Not a real form
                tag.tag = XHTML('div')

        for tag in XPath('//h:center')(root):
            tag.tag = XHTML('div')
            tag.set('style', 'text-align:center')
        # ADE can't handle & in an img url
        for tag in XPath('//h:img[@src]')(root):
            tag.set('src', tag.get('src', '').replace('&', ''))

        # ADE whimpers in fright when it encounters a <td> outside a
        # <table>
        in_table = XPath('ancestor::h:table')
        for tag in XPath('//h:td|//h:tr|//h:th')(root):
            if not in_table(tag):
                tag.tag = XHTML('div')

        # ADE fails to render non breaking hyphens/soft hyphens/zero width spaces
        for elem in root.iterdescendants('*'):
            if elem.text:
                elem.text = special_chars.sub('', elem.text)
                elem.text = elem.text.replace('\u2011', '-')
            if elem.tail:
                elem.tail = special_chars.sub('', elem.tail)
                elem.tail = elem.tail.replace('\u2011', '-')

        if stylesheet is not None:
            # ADE doesn't render lists correctly if they have left margins
            from css_parser.css import CSSRule
            for lb in XPath('//h:ul[@class]|//h:ol[@class]')(root):
                sel = '.'+lb.get('class')
                for rule in stylesheet.data.cssRules.rulesOfType(CSSRule.STYLE_RULE):
                    if sel == rule.selectorList.selectorText:
                        rule.style.removeProperty('margin-left')
                        # padding-left breaks rendering in webkit and gecko
                        rule.style.removeProperty('padding-left')
            # Change whitespace:pre to pre-wrap to accommodate readers that
            # cannot scroll horizontally
            for rule in stylesheet.data.cssRules.rulesOfType(CSSRule.STYLE_RULE):
                style = rule.style
                ws = style.getPropertyValue('white-space')
                if ws == 'pre':
                    style.setProperty('white-space', 'pre-wrap')

# }}}
|
||||
|
||||
def workaround_sony_quirks(self):  # {{{
    '''
    Perform toc link transforms to alleviate slow loading.

    The fragment is dropped from every ToC href whose target element is
    reported by ``item_at_top`` to be at the top of its spine document.
    '''
    from calibre.ebooks.oeb.base import urldefrag, XPath
    from calibre.ebooks.oeb.polish.toc import item_at_top

    def frag_is_at_top(root, frag):
        # Locate the element carrying the fragment as its id or name
        matches = XPath('//*[@id="%s" or @name="%s"]'%(frag, frag))(root)
        if not matches:
            return False
        return item_at_top(matches[0])

    def simplify_toc_entry(toc):
        if toc.href:
            href, frag = urldefrag(toc.href)
            if frag:
                for item in self.oeb.spine:
                    if item.href != href:
                        continue
                    if frag_is_at_top(item.data, frag):
                        self.log.debug('Removing anchor from TOC href:',
                                       href+'#'+frag)
                        toc.href = href
                    # Only the first spine item with this href is examined
                    break
        for child in toc:
            simplify_toc_entry(child)

    if self.oeb.toc:
        simplify_toc_entry(self.oeb.toc)

# }}}
|
||||
179
ebook_converter/ebooks/conversion/plugins/fb2_input.py
Normal file
179
ebook_converter/ebooks/conversion/plugins/fb2_input.py
Normal file
@@ -0,0 +1,179 @@
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Anatoly Shipitsin <norguhtar at gmail.com>'
|
||||
"""
|
||||
Convert .fb2 and .fbz files to HTML
|
||||
"""
|
||||
import os, re
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
|
||||
from calibre import guess_type
|
||||
from polyglot.builtins import iteritems, getcwd
|
||||
|
||||
# XML namespace URIs of FictionBook 2.0 and 2.1 documents
FB2NS = 'http://www.gribuser.ru/xml/fictionbook/2.0'
FB21NS = 'http://www.gribuser.ru/xml/fictionbook/2.1'
|
||||
|
||||
|
||||
class FB2Input(InputFormatPlugin):

    # Plugin identification
    name = 'FB2 Input'
    author = 'Anatoly Shipitsin'
    description = 'Convert FB2 and FBZ files to HTML'
    file_types = {'fb2', 'fbz'}
    commit_name = 'fb2_input'

    # Headings h1-h3 are recommended as ToC levels 1-3
    recommendations = {
        ('level1_toc', '//h:h1', OptionRecommendation.MED),
        ('level2_toc', '//h:h2', OptionRecommendation.MED),
        ('level3_toc', '//h:h3', OptionRecommendation.MED),
    }

    # Conversion options specific to FB2 input
    options = {
        OptionRecommendation(
            name='no_inline_fb2_toc',
            recommended_value=False,
            level=OptionRecommendation.LOW,
            help=_('Do not insert a Table of Contents at the beginning of the book.')),
    }
|
||||
|
||||
def convert(self, stream, options, file_ext, log,
            accelerators):
    '''
    Convert the FB2 book in ``stream`` to XHTML plus an OPF file.

    ``index.xhtml``, ``inline-styles.css``, the embedded binaries and
    ``metadata.opf`` are written to the current working directory.

    :param stream: File-like object containing the FB2 data
    :param options: Conversion options; ``no_inline_fb2_toc`` is read
    :param file_ext: Unused
    :param log: Logger
    :param accelerators: Unused
    :return: Absolute path of the generated ``metadata.opf``
    :raises ValueError: If parsing the XML yields no document
    '''
    from lxml import etree
    from calibre.utils.xml_parse import safe_xml_fromstring
    from calibre.ebooks.metadata.fb2 import ensure_namespace, get_fb2_data
    from calibre.ebooks.metadata.opf2 import OPFCreator
    from calibre.ebooks.metadata.meta import get_metadata
    from calibre.ebooks.oeb.base import XLINK_NS, XHTML_NS
    from calibre.ebooks.chardet import xml_to_unicode
    self.log = log
    log.debug('Parsing XML...')
    raw = get_fb2_data(stream)[0]
    raw = raw.replace(b'\0', b'')
    raw = xml_to_unicode(raw, strip_encoding_pats=True,
                         assume_utf8=True, resolve_entities=True)[0]
    try:
        doc = safe_xml_fromstring(raw)
    except etree.XMLSyntaxError:
        # Retry with bare ampersands (an "&" followed by a space) escaped
        # as entities; they are a common cause of malformed FB2 files.
        doc = safe_xml_fromstring(raw.replace('& ', '&amp; '))
    if doc is None:
        raise ValueError('The FB2 file is not valid XML')
    doc = ensure_namespace(doc)
    try:
        fb_ns = doc.nsmap[doc.prefix]
    except Exception:
        fb_ns = FB2NS

    NAMESPACES = {'f':fb_ns, 'l':XLINK_NS}
    stylesheets = doc.xpath('//*[local-name() = "stylesheet" and @type="text/css"]')
    css = ''
    for s in stylesheets:
        css += etree.tostring(s, encoding='unicode', method='text',
                              with_tail=False) + '\n\n'
    if css:
        import css_parser, logging
        parser = css_parser.CSSParser(fetcher=None,
                                      log=logging.getLogger('calibre.css'))

        XHTML_CSS_NAMESPACE = '@namespace "%s";\n' % XHTML_NS
        text = XHTML_CSS_NAMESPACE + css
        log.debug('Parsing stylesheet...')
        stylesheet = parser.parseString(text)
        stylesheet.namespaces['h'] = XHTML_NS
        css = stylesheet.cssText
        if isinstance(css, bytes):
            css = css.decode('utf-8', 'replace')
        css = css.replace('h|style', 'h|span')
        css = re.sub(r'name\s*=\s*', 'class=', css)
    self.extract_embedded_content(doc)
    log.debug('Converting XML to HTML...')
    with open(P('templates/fb2.xsl'), 'rb') as f:
        ss = f.read().decode('utf-8')
    ss = ss.replace("__FB_NS__", fb_ns)
    if options.no_inline_fb2_toc:
        log('Disabling generation of inline FB2 TOC')
        ss = re.compile(r'<!-- BUILD TOC -->.*<!-- END BUILD TOC -->',
                        re.DOTALL).sub('', ss)

    styledoc = safe_xml_fromstring(ss)

    transform = etree.XSLT(styledoc)
    result = transform(doc)

    # Handle links of type note and cite
    notes = {a.get('href')[1:]: a for a in result.xpath('//a[@link_note and @href]') if a.get('href').startswith('#')}
    cites = {a.get('link_cite'): a for a in result.xpath('//a[@link_cite]') if not a.get('href', '')}
    all_ids = set(result.xpath('//*/@id'))
    for cite, a in iteritems(cites):
        note = notes.get(cite, None)
        # Test against None explicitly: the truth value of an lxml element
        # depends on whether it has children.
        if note is not None:
            c = 1
            while 'cite%d' % c in all_ids:
                c += 1
            if not note.get('id', None):
                note.set('id', 'cite%d' % c)
                all_ids.add(note.get('id'))
            a.set('href', '#%s' % note.get('id'))
    for x in result.xpath('//*[@link_note or @link_cite]'):
        x.attrib.pop('link_note', None)
        x.attrib.pop('link_cite', None)

    # Point images at the files written by extract_embedded_content()
    for img in result.xpath('//img[@src]'):
        src = img.get('src')
        img.set('src', self.binary_map.get(src, src))
    index = transform.tostring(result)
    with open('index.xhtml', 'wb') as f:
        f.write(index.encode('utf-8'))
    with open('inline-styles.css', 'wb') as f:
        f.write(css.encode('utf-8'))
    stream.seek(0)
    mi = get_metadata(stream, 'fb2')
    if not mi.title:
        mi.title = _('Unknown')
    if not mi.authors:
        mi.authors = [_('Unknown')]
    cpath = None
    if mi.cover_data and mi.cover_data[1]:
        with open('fb2_cover_calibre_mi.jpg', 'wb') as f:
            f.write(mi.cover_data[1])
        cpath = os.path.abspath('fb2_cover_calibre_mi.jpg')
    else:
        # Fall back to the first cover page image that has a link
        for img in doc.xpath('//f:coverpage/f:image', namespaces=NAMESPACES):
            href = img.get('{%s}href'%XLINK_NS, img.get('href', None))
            if href is not None:
                if href.startswith('#'):
                    href = href[1:]
                cpath = os.path.abspath(href)
                break

    opf = OPFCreator(getcwd(), mi)
    entries = [(f2, guess_type(f2)[0]) for f2 in os.listdir(u'.')]
    opf.create_manifest(entries)
    opf.create_spine(['index.xhtml'])
    if cpath:
        opf.guide.set_cover(cpath)
    with open('metadata.opf', 'wb') as f:
        opf.render(f)
    return os.path.join(getcwd(), 'metadata.opf')
|
||||
|
||||
def extract_embedded_content(self, doc):
    '''
    Write the binaries embedded in ``doc`` to files in the current
    working directory.

    Every child of ``doc`` that has text, an ``id`` attribute and
    ``binary`` in its tag name is base64 decoded and written to a file
    named after its id. For PNG/JPEG content types the matching extension
    is appended when the id has no image extension, and the id to file
    name mapping is recorded in ``self.binary_map``.
    '''
    from calibre.ebooks.fb2 import base64_decode
    self.binary_map = {}
    for elem in doc.xpath('./*'):
        is_binary = elem.text and 'binary' in elem.tag and 'id' in elem.attrib
        if not is_binary:
            continue
        content_type = elem.get('content-type', '')
        fname = elem.attrib['id']
        ext = content_type.rpartition('/')[-1].lower()
        if ext in ('png', 'jpeg', 'jpg'):
            current_ext = fname.lower().rpartition('.')[-1]
            if current_ext not in {'jpg', 'jpeg', 'png'}:
                fname += '.' + ext
            self.binary_map[elem.get('id')] = fname
        raw = elem.text.strip()
        try:
            data = base64_decode(raw)
        except TypeError:
            self.log.exception('Binary data with id=%s is corrupted, ignoring'%(
                elem.get('id')))
            continue
        with open(fname, 'wb') as out:
            out.write(data)
|
||||
203
ebook_converter/ebooks/conversion/plugins/fb2_output.py
Normal file
203
ebook_converter/ebooks/conversion/plugins/fb2_output.py
Normal file
@@ -0,0 +1,203 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os
|
||||
|
||||
from calibre.customize.conversion import OutputFormatPlugin, OptionRecommendation
|
||||
|
||||
|
||||
class FB2Output(OutputFormatPlugin):
    '''Output plugin that writes an OEB book as a FictionBook 2 (.fb2) file.'''

    name = 'FB2 Output'
    author = 'John Schember'
    file_type = 'fb2'
    commit_name = 'fb2_output'

    # Genre identifiers accepted by the ``fb2_genre`` option.
    FB2_GENRES = [
        # Science Fiction & Fantasy
        'sf_history',  # Alternative history
        'sf_action',  # Action
        'sf_epic',  # Epic
        'sf_heroic',  # Heroic
        'sf_detective',  # Detective
        'sf_cyberpunk',  # Cyberpunk
        'sf_space',  # Space
        'sf_social',  # Social-philosophical
        'sf_horror',  # Horror & mystic
        'sf_humor',  # Humor
        'sf_fantasy',  # Fantasy
        'sf',  # Science Fiction
        # Detectives & Thrillers
        'det_classic',  # Classical detectives
        'det_police',  # Police Stories
        'det_action',  # Action
        'det_irony',  # Ironical detectives
        'det_history',  # Historical detectives
        'det_espionage',  # Espionage detectives
        'det_crime',  # Crime detectives
        'det_political',  # Political detectives
        'det_maniac',  # Maniacs
        'det_hard',  # Hard-boiled
        'thriller',  # Thrillers
        'detective',  # Detectives
        # Prose
        'prose_classic',  # Classics prose
        'prose_history',  # Historical prose
        'prose_contemporary',  # Contemporary prose
        'prose_counter',  # Counterculture
        'prose_rus_classic',  # Russian classics prose
        'prose_su_classics',  # Soviet classics prose
        # Romance
        'love_contemporary',  # Contemporary Romance
        'love_history',  # Historical Romance
        'love_detective',  # Detective Romance
        'love_short',  # Short Romance
        'love_erotica',  # Erotica
        # Adventure
        'adv_western',  # Western
        'adv_history',  # History
        'adv_indian',  # Indians
        'adv_maritime',  # Maritime Fiction
        'adv_geo',  # Travel & geography
        'adv_animal',  # Nature & animals
        'adventure',  # Other
        # Children's
        'child_tale',  # Fairy Tales
        'child_verse',  # Verses
        'child_prose',  # Prose
        'child_sf',  # Science Fiction
        'child_det',  # Detectives & Thrillers
        'child_adv',  # Adventures
        'child_education',  # Educational
        'children',  # Other
        # Poetry & Dramaturgy
        'poetry',  # Poetry
        'dramaturgy',  # Dramaturgy
        # Antique literature
        'antique_ant',  # Antique
        'antique_european',  # European
        'antique_russian',  # Old russian
        'antique_east',  # Old east
        'antique_myths',  # Myths. Legends. Epos
        'antique',  # Other
        # Scientific-educational
        'sci_history',  # History
        'sci_psychology',  # Psychology
        'sci_culture',  # Cultural science
        'sci_religion',  # Religious studies
        'sci_philosophy',  # Philosophy
        'sci_politics',  # Politics
        'sci_business',  # Business literature
        'sci_juris',  # Jurisprudence
        'sci_linguistic',  # Linguistics
        'sci_medicine',  # Medicine
        'sci_phys',  # Physics
        'sci_math',  # Mathematics
        'sci_chem',  # Chemistry
        'sci_biology',  # Biology
        'sci_tech',  # Technical
        'science',  # Other
        # Computers & Internet
        'comp_www',  # Internet
        'comp_programming',  # Programming
        'comp_hard',  # Hardware
        'comp_soft',  # Software
        'comp_db',  # Databases
        'comp_osnet',  # OS & Networking
        'computers',  # Other
        # Reference
        'ref_encyc',  # Encyclopedias
        'ref_dict',  # Dictionaries
        'ref_ref',  # Reference
        'ref_guide',  # Guidebooks
        'reference',  # Other
        # Nonfiction
        'nonf_biography',  # Biography & Memoirs
        'nonf_publicism',  # Publicism
        'nonf_criticism',  # Criticism
        'design',  # Art & design
        'nonfiction',  # Other
        # Religion & Inspiration
        'religion_rel',  # Religion
        'religion_esoterics',  # Esoterics
        'religion_self',  # Self-improvement
        'religion',  # Other
        # Humor
        'humor_anecdote',  # Anecdote (funny stories)
        'humor_prose',  # Prose
        'humor_verse',  # Verses
        'humor',  # Other
        # Home & Family
        'home_cooking',  # Cooking
        'home_pets',  # Pets
        'home_crafts',  # Hobbies & Crafts
        'home_entertain',  # Entertaining
        'home_health',  # Health
        'home_garden',  # Garden
        'home_diy',  # Do it yourself
        'home_sport',  # Sports
        'home_sex',  # Erotica & sex
        'home',  # Other
    ]

    # Human readable labels for the option values, plus the genre list.
    ui_data = {
        'sectionize': {
            'toc': _('Section per entry in the ToC'),
            'files': _('Section per file'),
            'nothing': _('A single section')
        },
        'genres': FB2_GENRES,
    }

    options = {
        OptionRecommendation(name='sectionize',
            recommended_value='files', level=OptionRecommendation.LOW,
            choices=list(ui_data['sectionize']),
            help=_('Specify how sections are created:\n'
                ' * nothing: {nothing}\n'
                ' * files: {files}\n'
                ' * toc: {toc}\n'
                'If ToC based generation fails, adjust the "Structure detection" and/or "Table of Contents" settings '
                '(turn on "Force use of auto-generated Table of Contents").').format(**ui_data['sectionize'])
        ),
        OptionRecommendation(name='fb2_genre',
            recommended_value='antique', level=OptionRecommendation.LOW,
            choices=FB2_GENRES,
            help=(_('Genre for the book. Choices: %s\n\n See: ') % ', '.join(FB2_GENRES)
            ) + 'http://www.fictionbook.org/index.php/Eng:FictionBook_2.1_genres ' + _('for a complete list with descriptions.')),
    }

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        '''
        Serialize ``oeb_book`` as FB2 and write it to ``output_path``.

        :param oeb_book: The book to convert.
        :param output_path: Either a path, or an object with a ``write``
            method; such an object is written to but left open.
        :param input_plugin: Unused here.
        :param opts: Conversion options, passed to the rasterizer and the
            FB2 serializer.
        :param log: Logger used for warnings.
        '''
        from calibre.ebooks.oeb.transforms.jacket import linearize_jacket
        from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer, Unavailable
        from calibre.ebooks.fb2.fb2ml import FB2MLizer

        try:
            rasterizer = SVGRasterizer()
            rasterizer(oeb_book, opts)
        except Unavailable:
            log.warn('SVG rasterizer unavailable, SVG will not be converted')

        linearize_jacket(oeb_book)

        fb2mlizer = FB2MLizer(log)
        fb2_content = fb2mlizer.extract_content(oeb_book, opts)

        # Only close streams that were opened here.
        close = False
        if not hasattr(output_path, 'write'):
            close = True
            out_dir = os.path.dirname(output_path)
            if out_dir != '' and not os.path.exists(out_dir):
                os.makedirs(out_dir)
            out_stream = lopen(output_path, 'wb')
        else:
            out_stream = output_path

        try:
            out_stream.seek(0)
            out_stream.truncate()
            out_stream.write(fb2_content.encode('utf-8', 'replace'))
        finally:
            # Release the file handle even if writing fails.
            if close:
                out_stream.close()
|
||||
316
ebook_converter/ebooks/conversion/plugins/html_input.py
Normal file
316
ebook_converter/ebooks/conversion/plugins/html_input.py
Normal file
@@ -0,0 +1,316 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import re, tempfile, os
|
||||
from functools import partial
|
||||
|
||||
from calibre.constants import islinux, isbsd
|
||||
from calibre.customize.conversion import (InputFormatPlugin,
|
||||
OptionRecommendation)
|
||||
from calibre.utils.localization import get_lang
|
||||
from calibre.utils.filenames import ascii_filename
|
||||
from calibre.utils.imghdr import what
|
||||
from polyglot.builtins import unicode_type, zip, getcwd, as_unicode
|
||||
|
||||
|
||||
def sanitize_file_name(x):
    '''
    Return a cleaned up version of the file name ``x``.

    The name is passed through ``ascii_filename``, the characters
    ``?&=;#`` are replaced by underscores, runs of whitespace are
    collapsed to a single space and whitespace around the base name and
    the extension, as well as trailing dots, is removed.
    '''
    ans = re.sub(r'\s+', ' ', re.sub(r'[?&=;#]', '_', ascii_filename(x))).strip().rstrip('.')
    base, dot, ext = ans.rpartition('.')
    if not dot:
        # No extension: rejoining the empty base with '.' would turn a
        # name such as "README" into the hidden file ".README".
        return ans
    return (base.strip() + '.' + ext.strip()).rstrip('.')
|
||||
|
||||
|
||||
class HTMLInput(InputFormatPlugin):
    '''
    Input plugin for HTML and OPF files.

    For HTML input the linked local files are collected, added to the
    manifest and all links are rewritten to the generated manifest hrefs.
    '''

    name = 'HTML Input'
    author = 'Kovid Goyal'
    description = 'Convert HTML and OPF files to an OEB'
    file_types = {'opf', 'html', 'htm', 'xhtml', 'xhtm', 'shtm', 'shtml'}
    commit_name = 'html_input'

    options = {
        OptionRecommendation(name='breadth_first',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Traverse links in HTML files breadth first. Normally, '
                    'they are traversed depth first.'
                   )
        ),

        OptionRecommendation(name='max_levels',
            recommended_value=5, level=OptionRecommendation.LOW,
            help=_('Maximum levels of recursion when following links in '
                   'HTML files. Must be non-negative. 0 implies that no '
                   'links in the root HTML file are followed. Default is '
                   '%default.'
                   )
        ),

        OptionRecommendation(name='dont_package',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Normally this input plugin re-arranges all the input '
                'files into a standard folder hierarchy. Only use this option '
                'if you know what you are doing as it can result in various '
                'nasty side effects in the rest of the conversion pipeline.'
                )
        ),

    }

    def convert(self, stream, opts, file_ext, log,
                accelerators):
        '''
        Convert the file behind ``stream`` into an OEB book.

        OPF input is handed directly to the plumber's ``create_oebbook``;
        any other extension is treated as HTML and goes through
        :meth:`create_oebbook` with metadata read from the stream and,
        when available, from the file name.

        :raises ValueError: If ``opts.dont_package`` is set for HTML input.
        '''
        self._is_case_sensitive = None
        basedir = getcwd()
        self.opts = opts

        fname = None
        if hasattr(stream, 'name'):
            basedir = os.path.dirname(stream.name)
            fname = os.path.basename(stream.name)

        if file_ext != 'opf':
            if opts.dont_package:
                raise ValueError('The --dont-package option is not supported for an HTML input file')
            from calibre.ebooks.metadata.html import get_metadata
            mi = get_metadata(stream)
            if fname:
                from calibre.ebooks.metadata.meta import metadata_from_filename
                fmi = metadata_from_filename(fname)
                fmi.smart_update(mi)
                mi = fmi
            oeb = self.create_oebbook(stream.name, basedir, opts, log, mi)
            return oeb

        from calibre.ebooks.conversion.plumber import create_oebbook
        return create_oebbook(log, stream.name, opts,
                encoding=opts.input_encoding)

    def is_case_sensitive(self, path):
        '''
        Report whether the file system holding ``path`` looks case sensitive.

        The first answer obtained from an existing path is cached on the
        instance. For a missing path the answer is ``islinux or isbsd``
        and nothing is cached.
        '''
        if getattr(self, '_is_case_sensitive', None) is not None:
            return self._is_case_sensitive
        if not path or not os.path.exists(path):
            return islinux or isbsd
        # If both the lower and the upper cased spelling exist, the file
        # system is treated as case insensitive.
        self._is_case_sensitive = not (os.path.exists(path.lower()) and os.path.exists(path.upper()))
        return self._is_case_sensitive

    def create_oebbook(self, htmlpath, basedir, opts, log, mi):
        '''
        Build an OEB book from the HTML file ``htmlpath`` and the files it
        links to.

        :param htmlpath: Path of the root HTML file.
        :param basedir: Directory passed to ``get_filelist``.
        :param opts: Conversion options.
        :param log: Logger.
        :param mi: Metadata copied into the book's OEB metadata.
        :return: The populated OEB book.
        '''
        import uuid
        from calibre.ebooks.conversion.plumber import create_oebbook
        from calibre.ebooks.oeb.base import (DirContainer,
            rewrite_links, urlnormalize, urldefrag, BINARY_MIME, OEB_STYLES,
            xpath, urlquote)
        from calibre import guess_type
        from calibre.ebooks.oeb.transforms.metadata import \
            meta_info_to_oeb_metadata
        from calibre.ebooks.html.input import get_filelist
        from calibre.ebooks.metadata import string_to_authors
        from calibre.utils.localization import canonicalize_lang
        import css_parser, logging
        css_parser.log.setLevel(logging.WARN)
        self.OEB_STYLES = OEB_STYLES
        oeb = create_oebbook(log, None, opts, self,
                encoding=opts.input_encoding, populate=False)
        self.oeb = oeb

        # Fill in fallbacks for language, creator and title.
        metadata = oeb.metadata
        meta_info_to_oeb_metadata(mi, metadata, log)
        if not metadata.language:
            lang = canonicalize_lang(getattr(opts, 'language', None))
            if not lang:
                oeb.logger.warn('Language not specified')
                lang = get_lang().replace('_', '-')
            metadata.add('language', lang)
        if not metadata.creator:
            a = getattr(opts, 'authors', None)
            if a:
                a = string_to_authors(a)
            if not a:
                oeb.logger.warn('Creator not specified')
                a = [self.oeb.translate(__('Unknown'))]
            for aut in a:
                metadata.add('creator', aut)
        if not metadata.title:
            oeb.logger.warn('Title not specified')
            metadata.add('title', self.oeb.translate(__('Unknown')))
        bookid = unicode_type(uuid.uuid4())
        metadata.add('identifier', bookid, id='uuid_id', scheme='uuid')
        for ident in metadata.identifier:
            if 'id' in ident.attrib:
                # NOTE(review): this stores the first identifier rather
                # than ``ident`` itself - confirm that is intended.
                self.oeb.uid = metadata.identifier[0]
                break

        # Add every non-binary file of the file list to manifest and spine.
        filelist = get_filelist(htmlpath, basedir, opts, log)
        filelist = [f for f in filelist if not f.is_binary]
        htmlfile_map = {}
        for f in filelist:
            path = f.path
            oeb.container = DirContainer(os.path.dirname(path), log,
                    ignore_opf=True)
            bname = os.path.basename(path)
            item_id, href = oeb.manifest.generate(id='html', href=sanitize_file_name(bname))
            htmlfile_map[path] = href
            item = oeb.manifest.add(item_id, href, 'text/html')
            if path == htmlpath and '%' in path:
                bname = urlquote(bname)
            item.html_input_href = bname
            oeb.spine.add(item, True)

        self.added_resources = {}
        self.log = log
        self.log('Normalizing filename cases')
        for path, href in htmlfile_map.items():
            if not self.is_case_sensitive(path):
                path = path.lower()
            self.added_resources[path] = href
        # resource_adder() reaches these helpers through the instance.
        self.urlnormalize, self.DirContainer = urlnormalize, DirContainer
        self.urldefrag = urldefrag
        self.guess_type, self.BINARY_MIME = guess_type, BINARY_MIME

        self.log('Rewriting HTML links')
        for f in filelist:
            path = f.path
            dpath = os.path.dirname(path)
            oeb.container = DirContainer(dpath, log, ignore_opf=True)
            href = htmlfile_map[path]
            try:
                item = oeb.manifest.hrefs[href]
            except KeyError:
                item = oeb.manifest.hrefs[urlnormalize(href)]
            rewrite_links(item.data, partial(self.resource_adder, base=dpath))

        # Rewrite the URLs inside the stylesheets, relative to the
        # directory each stylesheet was added from.
        for item in oeb.manifest.values():
            if item.media_type in self.OEB_STYLES:
                dpath = None
                for path, href in self.added_resources.items():
                    if href == item.href:
                        dpath = os.path.dirname(path)
                        break
                css_parser.replaceUrls(item.data,
                        partial(self.resource_adder, base=dpath))

        # Build a ToC from the document titles, falling back to the first
        # heading of each file when the titles are not unique.
        toc = self.oeb.toc
        self.oeb.auto_generated_toc = True
        titles = []
        headers = []
        for item in self.oeb.spine:
            if not item.linear:
                continue
            html = item.data
            title = ''.join(xpath(html, '/h:html/h:head/h:title/text()'))
            title = re.sub(r'\s+', ' ', title.strip())
            if title:
                titles.append(title)
            headers.append('(unlabled)')
            for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'):
                expr = '/h:html/h:body//h:%s[position()=1]/text()'
                header = ''.join(xpath(html, expr % tag))
                header = re.sub(r'\s+', ' ', header.strip())
                if header:
                    headers[-1] = header
                    break
        use = titles
        if len(titles) > len(set(titles)):
            use = headers
        for title, item in zip(use, self.oeb.spine):
            if not item.linear:
                continue
            toc.add(title, item.href)

        oeb.container = DirContainer(getcwd(), oeb.log, ignore_opf=True)
        return oeb

    def link_to_local_path(self, link_, base=None):
        '''
        Resolve ``link_`` to a local path.

        :param link_: The link, as text or UTF-8 encoded bytes.
        :param base: Base directory for the link; the current directory
            is used when it is empty.
        :return: ``(path, fragment)``, or ``(None, None)`` when the link
            cannot be decoded or processed, or has no local path.
        '''
        from calibre.ebooks.html.input import Link
        if not isinstance(link_, unicode_type):
            try:
                # 'strict' is the real name of the raising error handler;
                # the previous 'error' is not a registered handler.
                link_ = link_.decode('utf-8', 'strict')
            except Exception:
                self.log.warn('Failed to decode link %r. Ignoring'%link_)
                return None, None
        try:
            l = Link(link_, base if base else getcwd())
        except Exception:
            self.log.exception('Failed to process link: %r'%link_)
            return None, None
        if l.path is None:
            # Not a local resource
            return None, None
        link = l.path.replace('/', os.sep).strip()
        frag = l.fragment
        if not link:
            return None, None
        return link, frag

    def resource_adder(self, link_, base=None):
        '''
        Link rewriting callback: add the local file ``link_`` points to
        to the manifest and return the href to use instead.

        :return: The manifest href (plus fragment, if any); ``link_``
            unchanged when it is not a readable local file; ``None`` for
            links to plain text files.
        '''
        from polyglot.urllib import quote
        link, frag = self.link_to_local_path(link_, base=base)
        if link is None:
            return link_
        try:
            if base and not os.path.isabs(link):
                link = os.path.join(base, link)
            link = os.path.abspath(link)
        except Exception:
            return link_
        if not os.access(link, os.R_OK):
            return link_
        if os.path.isdir(link):
            self.log.warn(link_, 'is a link to a directory. Ignoring.')
            return link_
        if not self.is_case_sensitive(tempfile.gettempdir()):
            link = link.lower()
        if link not in self.added_resources:
            bhref = os.path.basename(link)
            item_id, href = self.oeb.manifest.generate(id='added', href=sanitize_file_name(bhref))
            guessed = self.guess_type(href)[0]
            media_type = guessed or self.BINARY_MIME
            if media_type == 'text/plain':
                self.log.warn('Ignoring link to text file %r'%link_)
                return None
            if media_type == self.BINARY_MIME:
                # Check for the common case, images
                try:
                    img = what(link)
                except EnvironmentError:
                    pass
                else:
                    if img:
                        media_type = self.guess_type('dummy.'+img)[0] or self.BINARY_MIME

            self.oeb.log.debug('Added', link)
            self.oeb.container = self.DirContainer(os.path.dirname(link),
                    self.oeb.log, ignore_opf=True)
            # Load into memory
            item = self.oeb.manifest.add(item_id, href, media_type)
            # bhref refers to an already existing file. The read() method of
            # DirContainer will call unquote on it before trying to read the
            # file, therefore we quote it here.
            if isinstance(bhref, unicode_type):
                bhref = bhref.encode('utf-8')
            item.html_input_href = as_unicode(quote(bhref))
            if guessed in self.OEB_STYLES:
                item.override_css_fetch = partial(
                    self.css_import_handler, os.path.dirname(link))
            item.data
            self.added_resources[link] = href

        nlink = self.added_resources[link]
        if frag:
            nlink = '#'.join((nlink, frag))
        return nlink

    def css_import_handler(self, base, href):
        '''
        Fetch callback for stylesheets imported from a CSS file.

        :param base: Directory of the importing stylesheet.
        :param href: The imported URL.
        :return: ``(None, css_text)``, or ``(None, None)`` when the file
            is not a readable local file or cannot be processed.
        '''
        link, frag = self.link_to_local_path(href, base=base)
        if link is None or not os.access(link, os.R_OK) or os.path.isdir(link):
            return (None, None)
        try:
            with open(link, 'rb') as f:
                raw = f.read().decode('utf-8', 'replace')
            raw = self.oeb.css_preprocessor(raw, add_namespace=False)
        except Exception:
            self.log.exception('Failed to read CSS file: %r'%link)
            return (None, None)
        return (None, raw)
|
||||
226
ebook_converter/ebooks/conversion/plugins/html_output.py
Normal file
226
ebook_converter/ebooks/conversion/plugins/html_output.py
Normal file
@@ -0,0 +1,226 @@
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2010, Fabian Grassl <fg@jusmeum.de>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os, re, shutil
|
||||
from os.path import dirname, abspath, relpath as _relpath, exists, basename
|
||||
|
||||
from calibre.customize.conversion import OutputFormatPlugin, OptionRecommendation
|
||||
from calibre import CurrentDir
|
||||
from calibre.ptempfile import PersistentTemporaryDirectory
|
||||
from polyglot.builtins import unicode_type
|
||||
|
||||
|
||||
def relpath(*args):
    '''Like ``os.path.relpath`` but always with ``/`` as the separator.'''
    native = _relpath(*args)
    return '/'.join(native.split(os.sep))
|
||||
|
||||
|
||||
class HTMLOutput(OutputFormatPlugin):
    '''
    Output plugin that renders the book through HTML templates and packs
    the result (index page plus a ``*_files`` directory) into a ZIP file.
    '''

    name = 'HTML Output'
    author = 'Fabian Grassl'
    file_type = 'zip'
    commit_name = 'html_output'

    options = {
        OptionRecommendation(name='template_css',
            help=_('CSS file used for the output instead of the default file')),

        OptionRecommendation(name='template_html_index',
            help=_('Template used for generation of the HTML index file instead of the default file')),

        OptionRecommendation(name='template_html',
            help=_('Template used for the generation of the HTML contents of the book instead of the default file')),

        OptionRecommendation(name='extract_to',
            help=_('Extract the contents of the generated ZIP file to the '
                'specified directory. WARNING: The contents of the directory '
                'will be deleted.')
        ),
    }

    recommendations = {('pretty_print', True, OptionRecommendation.HIGH)}

    def generate_toc(self, oeb_book, ref_url, output_dir):
        '''
        Generate table of contents

        :param oeb_book: Book whose ``toc`` is rendered.
        :param ref_url: Path of the page the ToC is embedded in; links
            are made relative to its directory.
        :param output_dir: Directory the ToC hrefs are resolved against.
        :return: A ``div`` element wrapping nested ``ul``/``li`` lists.
        '''
        from lxml import etree
        from polyglot.urllib import unquote

        from calibre.ebooks.oeb.base import element
        from calibre.utils.cleantext import clean_xml_chars
        with CurrentDir(output_dir):
            def build_node(current_node, parent=None):
                # The top level call creates the root list; nested calls
                # only open a new list when there are children to put in.
                if parent is None:
                    parent = etree.Element('ul')
                elif len(current_node.nodes):
                    parent = element(parent, ('ul'))
                for node in current_node.nodes:
                    point = element(parent, 'li')
                    href = relpath(abspath(unquote(node.href)), dirname(ref_url))
                    if isinstance(href, bytes):
                        href = href.decode('utf-8')
                    link = element(point, 'a', href=clean_xml_chars(href))
                    title = node.title
                    if isinstance(title, bytes):
                        title = title.decode('utf-8')
                    if title:
                        title = re.sub(r'\s+', ' ', title)
                    link.text = clean_xml_chars(title)
                    build_node(node, point)
                return parent
            wrap = etree.Element('div')
            wrap.append(build_node(oeb_book.toc))
            return wrap

    def generate_html_toc(self, oeb_book, ref_url, output_dir):
        '''Return the markup of :meth:`generate_toc` as a unicode string.'''
        from lxml import etree

        root = self.generate_toc(oeb_book, ref_url, output_dir)
        return etree.tostring(root, pretty_print=True, encoding='unicode',
                xml_declaration=False)

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        '''
        Render ``oeb_book`` as HTML pages and write them, zipped, to
        ``output_path``.

        The pages are built in a temporary directory that is removed at
        the end. When ``opts.extract_to`` is set, that directory is
        deleted, recreated and the archive is extracted into it.
        '''
        from lxml import etree
        from calibre.utils import zipfile
        from templite import Templite
        from polyglot.urllib import unquote
        from calibre.ebooks.html.meta import EasyMeta

        # read template files
        if opts.template_html_index is not None:
            with open(opts.template_html_index, 'rb') as f:
                template_html_index_data = f.read()
        else:
            template_html_index_data = P('templates/html_export_default_index.tmpl', data=True)

        if opts.template_html is not None:
            with open(opts.template_html, 'rb') as f:
                template_html_data = f.read()
        else:
            template_html_data = P('templates/html_export_default.tmpl', data=True)

        if opts.template_css is not None:
            with open(opts.template_css, 'rb') as f:
                template_css_data = f.read()
        else:
            template_css_data = P('templates/html_export_default.css', data=True)

        template_html_index_data = template_html_index_data.decode('utf-8')
        template_html_data = template_html_data.decode('utf-8')
        template_css_data = template_css_data.decode('utf-8')

        self.log = log
        self.opts = opts
        meta = EasyMeta(oeb_book.metadata)

        tempdir = os.path.realpath(PersistentTemporaryDirectory())
        # Only strip the trailing extension; an unanchored pattern would
        # also remove ".zip"/".html" from the middle of the name or path.
        output_file = os.path.join(tempdir,
                basename(re.sub(r'\.zip$', '', output_path)+'.html'))
        output_dir = re.sub(r'\.html$', '', output_file)+'_files'

        if not exists(output_dir):
            os.makedirs(output_dir)

        css_path = output_dir+os.sep+'calibreHtmlOutBasicCss.css'
        with open(css_path, 'wb') as f:
            f.write(template_css_data.encode('utf-8'))

        # index page
        with open(output_file, 'wb') as f:
            html_toc = self.generate_html_toc(oeb_book, output_file, output_dir)
            templite = Templite(template_html_index_data)
            nextLink = oeb_book.spine[0].href
            nextLink = relpath(output_dir+os.sep+nextLink, dirname(output_file))
            cssLink = relpath(abspath(css_path), dirname(output_file))
            tocUrl = relpath(output_file, dirname(output_file))
            t = templite.render(has_toc=bool(oeb_book.toc.count()),
                    toc=html_toc, meta=meta, nextLink=nextLink,
                    tocUrl=tocUrl, cssLink=cssLink,
                    firstContentPageLink=nextLink)
            if isinstance(t, unicode_type):
                t = t.encode('utf-8')
            f.write(t)

        with CurrentDir(output_dir):
            for item in oeb_book.manifest:
                path = abspath(unquote(item.href))
                dir = dirname(path)
                if not exists(dir):
                    os.makedirs(dir)
                if item.spine_position is not None:
                    # Spine items are rendered below; just create the file.
                    with open(path, 'wb') as f:
                        pass
                else:
                    with open(path, 'wb') as f:
                        f.write(item.bytes_representation)
                    item.unload_data_from_memory(memory=path)

            for item in oeb_book.spine:
                path = abspath(unquote(item.href))
                dir = dirname(path)
                root = item.data.getroottree()

                # get & clean HTML <HEAD>-data
                head = root.xpath('//h:head', namespaces={'h': 'http://www.w3.org/1999/xhtml'})[0]
                head_content = etree.tostring(head, pretty_print=True, encoding='unicode')
                head_content = re.sub(r'\<\/?head.*\>', '', head_content)
                head_content = re.sub(re.compile(r'\<style.*\/style\>', re.M|re.S), '', head_content)
                head_content = re.sub(r'<(title)([^>]*)/>', r'<\1\2></\1>', head_content)

                # get & clean HTML <BODY>-data
                body = root.xpath('//h:body', namespaces={'h': 'http://www.w3.org/1999/xhtml'})[0]
                ebook_content = etree.tostring(body, pretty_print=True, encoding='unicode')
                ebook_content = re.sub(r'\<\/?body.*\>', '', ebook_content)
                ebook_content = re.sub(r'<(div|a|span)([^>]*)/>', r'<\1\2></\1>', ebook_content)

                # generate link to next page
                if item.spine_position+1 < len(oeb_book.spine):
                    nextLink = oeb_book.spine[item.spine_position+1].href
                    nextLink = relpath(abspath(nextLink), dir)
                else:
                    nextLink = None

                # generate link to previous page
                if item.spine_position > 0:
                    prevLink = oeb_book.spine[item.spine_position-1].href
                    prevLink = relpath(abspath(prevLink), dir)
                else:
                    prevLink = None

                cssLink = relpath(abspath(css_path), dir)
                tocUrl = relpath(output_file, dir)
                firstContentPageLink = oeb_book.spine[0].href

                # render template
                templite = Templite(template_html_data)
                toc = lambda: self.generate_html_toc(oeb_book, path, output_dir)
                t = templite.render(ebookContent=ebook_content,
                        prevLink=prevLink, nextLink=nextLink,
                        has_toc=bool(oeb_book.toc.count()), toc=toc,
                        tocUrl=tocUrl, head_content=head_content,
                        meta=meta, cssLink=cssLink,
                        firstContentPageLink=firstContentPageLink)

                # write html to file
                with open(path, 'wb') as f:
                    f.write(t.encode('utf-8'))
                item.unload_data_from_memory(memory=path)

        zfile = zipfile.ZipFile(output_path, "w")
        try:
            zfile.add_dir(output_dir, basename(output_dir))
            zfile.write(output_file, basename(output_file), zipfile.ZIP_DEFLATED)

            if opts.extract_to:
                if os.path.exists(opts.extract_to):
                    shutil.rmtree(opts.extract_to)
                os.makedirs(opts.extract_to)
                zfile.extractall(opts.extract_to)
                self.log('Zip file extracted to', opts.extract_to)
        finally:
            # Do not leave the archive handle open when packing fails.
            zfile.close()

        # cleanup temp dir
        shutil.rmtree(tempdir)
|
||||
133
ebook_converter/ebooks/conversion/plugins/htmlz_input.py
Normal file
133
ebook_converter/ebooks/conversion/plugins/htmlz_input.py
Normal file
@@ -0,0 +1,133 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os
|
||||
|
||||
from calibre import guess_type
|
||||
from calibre.customize.conversion import InputFormatPlugin
|
||||
from polyglot.builtins import getcwd
|
||||
|
||||
|
||||
class HTMLZInput(InputFormatPlugin):
    '''
    Input plugin for HTMLZ archives: a ZIP file with a single top level
    HTML file and, optionally, an OPF file naming the cover.
    '''

    # NOTE(review): 'HTLZ' looks like a typo for 'HTMLZ'; left unchanged
    # because the plugin name may be used for lookups elsewhere.
    name = 'HTLZ Input'
    author = 'John Schember'
    description = 'Convert HTML files to HTML'
    file_types = {'htmlz'}
    commit_name = 'htmlz_input'

    def convert(self, stream, options, file_ext, log,
                accelerators):
        '''
        Extract the archive in ``stream`` into the current directory and
        convert its top level HTML file with the HTML input plugin.

        :return: The OEB book, with metadata read from ``stream`` and the
            cover named by a top level OPF file, if any.
        :raises Exception: If there is no top level HTML file or it is
            empty.
        '''
        from calibre.ebooks.chardet import xml_to_unicode
        from calibre.ebooks.metadata.opf2 import OPF
        from calibre.utils.zipfile import ZipFile

        self.log = log
        html = u''
        top_levels = []

        # Extract content from zip archive.
        zf = ZipFile(stream)
        zf.extractall()

        # Find the HTML file in the archive. It needs to be
        # top level.
        index = u''
        multiple_html = False
        # Get a list of all top level files in the archive.
        for x in os.listdir(u'.'):
            if os.path.isfile(x):
                top_levels.append(x)
        # Try to find an index file.
        for x in top_levels:
            if x.lower() in (u'index.html', u'index.xhtml', u'index.htm'):
                index = x
                break
        # Look for multiple HTML files in the archive. We look at the
        # top level files only as only they matter in HTMLZ.
        for x in top_levels:
            if os.path.splitext(x)[1].lower() in (u'.html', u'.xhtml', u'.htm'):
                # Set index to the first HTML file found if it's not
                # called index.
                if not index:
                    index = x
                elif x != index:
                    # The index file itself does not count as a second
                    # HTML file.
                    multiple_html = True
        # Warn the user if there are multiple HTML files in the archive.
        # HTMLZ supports a single HTML file. A conversion with a multiple
        # HTML file HTMLZ archive probably won't turn out as the user
        # expects. With multiple HTML files ZIP input should be used in
        # place of HTMLZ.
        if multiple_html:
            log.warn(_('Multiple HTML files found in the archive. Only %s will be used.') % index)

        if index:
            with open(index, 'rb') as tf:
                html = tf.read()
        else:
            raise Exception(_('No top level HTML file found.'))

        if not html:
            raise Exception(_('Top level HTML file %s is empty') % index)

        # Encoding
        if options.input_encoding:
            ienc = options.input_encoding
        else:
            ienc = xml_to_unicode(html[:4096])[-1]
        html = html.decode(ienc, 'replace')

        # Run the HTML through the html processing plugin.
        from calibre.customize.ui import plugin_for_input_format
        html_input = plugin_for_input_format('html')
        for opt in html_input.options:
            setattr(options, opt.option.name, opt.recommended_value)
        options.input_encoding = 'utf-8'
        # The decoded HTML is written, as UTF-8, to a file name that is
        # not yet taken and converted from there.
        base = getcwd()
        htmlfile = os.path.join(base, u'index.html')
        c = 0
        while os.path.exists(htmlfile):
            c += 1
            htmlfile = u'index%d.html'%c
        with open(htmlfile, 'wb') as f:
            f.write(html.encode('utf-8'))
        odi = options.debug_pipeline
        options.debug_pipeline = None
        # Generate oeb from html conversion.
        try:
            with open(htmlfile, 'rb') as f:
                oeb = html_input.convert(f, options, 'html', log,
                        {})
        finally:
            # Restore the option and drop the temporary file even when
            # the nested conversion fails.
            options.debug_pipeline = odi
            os.remove(htmlfile)

        # Set metadata from file.
        from calibre.customize.ui import get_file_type_metadata
        from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
        mi = get_file_type_metadata(stream, file_ext)
        meta_info_to_oeb_metadata(mi, oeb.metadata, log)

        # Get the cover path from the OPF.
        cover_path = None
        opf = None
        for x in top_levels:
            if os.path.splitext(x)[1].lower() == u'.opf':
                opf = x
                break
        if opf:
            opf = OPF(opf, basedir=getcwd())
            cover_path = opf.raster_cover or opf.cover
        # Set the cover.
        if cover_path:
            cdata = None
            with open(os.path.join(getcwd(), cover_path), 'rb') as cf:
                cdata = cf.read()
            cover_name = os.path.basename(cover_path)
            id, href = oeb.manifest.generate('cover', cover_name)
            oeb.manifest.add(id, href, guess_type(cover_name)[0], data=cdata)
            oeb.guide.add('cover', 'Cover', href)

        return oeb
|
||||
136
ebook_converter/ebooks/conversion/plugins/htmlz_output.py
Normal file
136
ebook_converter/ebooks/conversion/plugins/htmlz_output.py
Normal file
@@ -0,0 +1,136 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import io
|
||||
import os
|
||||
|
||||
from calibre.customize.conversion import OutputFormatPlugin, \
|
||||
OptionRecommendation
|
||||
from calibre.ptempfile import TemporaryDirectory
|
||||
from polyglot.builtins import unicode_type
|
||||
|
||||
|
||||
class HTMLZOutput(OutputFormatPlugin):
    '''
    Output plugin that writes an OEB book as an HTMLZ archive.

    The archive is a ZIP file holding a single HTML file, an optional
    ``style.css``, an ``images`` directory, an optional ``cover.jpg`` and a
    ``metadata.opf``.
    '''

    name = 'HTMLZ Output'
    author = 'John Schember'
    file_type = 'htmlz'
    commit_name = 'htmlz_output'
    # Human readable descriptions of the allowed option values; the keys are
    # also used as the ``choices`` of the corresponding options below.
    ui_data = {
        'css_choices': {
            'class': _('Use CSS classes'),
            'inline': _('Use the style attribute'),
            'tag': _('Use HTML tags wherever possible')
        },
        'sheet_choices': {
            'external': _('Use an external CSS file'),
            'inline': _('Use a <style> tag in the HTML file')
        }
    }

    options = {
        OptionRecommendation(name='htmlz_css_type', recommended_value='class',
            level=OptionRecommendation.LOW,
            choices=list(ui_data['css_choices']),
            help=_('Specify the handling of CSS. Default is class.\n'
                   'class: {class}\n'
                   'inline: {inline}\n'
                   'tag: {tag}'
            ).format(**ui_data['css_choices'])),
        OptionRecommendation(name='htmlz_class_style', recommended_value='external',
            level=OptionRecommendation.LOW,
            choices=list(ui_data['sheet_choices']),
            help=_('How to handle the CSS when using css-type = \'class\'.\n'
                   'Default is external.\n'
                   'external: {external}\n'
                   'inline: {inline}'
            ).format(**ui_data['sheet_choices'])),
        OptionRecommendation(name='htmlz_title_filename',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('If set this option causes the file name of the HTML file'
                   ' inside the HTMLZ archive to be based on the book title.')
        ),
    }

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        '''
        Serialise ``oeb_book`` into the HTMLZ archive at ``output_path``.

        :param oeb_book: The OEB book to convert
        :param output_path: Path of the HTMLZ (ZIP) file to create
        :param input_plugin: The input plugin used for the conversion (unused)
        :param opts: Conversion options; the ``htmlz_*`` options are read
        :param log: Logger handed to the HTML generator
        '''
        from lxml import etree
        from calibre.ebooks.oeb.base import OEB_IMAGES, SVG_MIME
        from calibre.ebooks.metadata.opf2 import OPF, metadata_to_opf
        from calibre.utils.zipfile import ZipFile
        from calibre.utils.filenames import ascii_filename

        # HTML
        if opts.htmlz_css_type == 'inline':
            from calibre.ebooks.htmlz.oeb2html import OEB2HTMLInlineCSSizer
            OEB2HTMLizer = OEB2HTMLInlineCSSizer
        elif opts.htmlz_css_type == 'tag':
            from calibre.ebooks.htmlz.oeb2html import OEB2HTMLNoCSSizer
            OEB2HTMLizer = OEB2HTMLNoCSSizer
        else:
            from calibre.ebooks.htmlz.oeb2html import OEB2HTMLClassCSSizer as OEB2HTMLizer

        with TemporaryDirectory(u'_htmlz_output') as tdir:
            htmlizer = OEB2HTMLizer(log)
            html = htmlizer.oeb2html(oeb_book, opts)

            fname = u'index'
            if opts.htmlz_title_filename:
                from calibre.utils.filenames import shorten_components_to
                fname = shorten_components_to(100, (ascii_filename(unicode_type(oeb_book.metadata.title[0])),))[0]
            with open(os.path.join(tdir, fname+u'.html'), 'wb') as tf:
                if isinstance(html, unicode_type):
                    html = html.encode('utf-8')
                tf.write(html)

            # CSS
            if opts.htmlz_css_type == 'class' and opts.htmlz_class_style == 'external':
                css = htmlizer.get_css(oeb_book)
                # The file is opened in binary mode, so text has to be
                # encoded first (same handling as for the HTML above).
                if isinstance(css, unicode_type):
                    css = css.encode('utf-8')
                with open(os.path.join(tdir, u'style.css'), 'wb') as tf:
                    tf.write(css)

            # Images
            images = htmlizer.images
            if images:
                if not os.path.exists(os.path.join(tdir, u'images')):
                    os.makedirs(os.path.join(tdir, u'images'))
                for item in oeb_book.manifest:
                    if item.media_type in OEB_IMAGES and item.href in images:
                        if item.media_type == SVG_MIME:
                            data = etree.tostring(item.data, encoding='unicode')
                        else:
                            data = item.data
                        # Serialised SVG is text; binary files need bytes.
                        if isinstance(data, unicode_type):
                            data = data.encode('utf-8')
                        fname = os.path.join(tdir, u'images', images[item.href])
                        with open(fname, 'wb') as img:
                            img.write(data)

            # Cover
            cover_path = None
            try:
                cover_data = None
                if oeb_book.metadata.cover:
                    term = oeb_book.metadata.cover[0].term
                    cover_data = oeb_book.guide[term].item.data
                if cover_data:
                    from calibre.utils.img import save_cover_data_to
                    cover_path = os.path.join(tdir, u'cover.jpg')
                    with lopen(cover_path, 'w') as cf:
                        cf.write('')
                    save_cover_data_to(cover_data, cover_path)
            except Exception:
                # A missing or broken cover must not abort the conversion;
                # report it and carry on.
                import traceback
                traceback.print_exc()

            # Metadata
            with open(os.path.join(tdir, u'metadata.opf'), 'wb') as mdataf:
                opf = OPF(io.BytesIO(etree.tostring(oeb_book.metadata.to_opf1(), encoding='UTF-8')))
                mi = opf.to_book_metadata()
                if cover_path:
                    mi.cover = u'cover.jpg'
                mdataf.write(metadata_to_opf(mi))

            htmlz = ZipFile(output_path, 'w')
            try:
                htmlz.add_dir(tdir)
            finally:
                # Close explicitly so the archive is finalised before the
                # temporary directory goes away, rather than relying on
                # garbage collection.
                htmlz.close()
|
||||
64
ebook_converter/ebooks/conversion/plugins/lit_input.py
Normal file
64
ebook_converter/ebooks/conversion/plugins/lit_input.py
Normal file
@@ -0,0 +1,64 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin
|
||||
|
||||
|
||||
class LITInput(InputFormatPlugin):
    '''Input plugin for Microsoft Reader (LIT) files.'''

    name = 'LIT Input'
    author = 'Marshall T. Vandegrift'
    description = 'Convert LIT files to HTML'
    file_types = {'lit'}
    commit_name = 'lit_input'

    def convert(self, stream, options, file_ext, log,
                accelerators):
        '''
        Build an OEB book from the LIT ``stream`` using ``LitReader``.

        The logger is remembered on the plugin for use in
        :meth:`postprocess_book`.
        '''
        from calibre.ebooks.lit.reader import LitReader
        from calibre.ebooks.conversion.plumber import create_oebbook
        self.log = log
        return create_oebbook(log, stream, options, reader=LitReader)

    def postprocess_book(self, oeb, opts, log):
        '''
        Clean up spine documents after conversion.

        Removes ``metadata`` and ``guide`` elements found in the XHTML
        namespace, and re-flows documents whose body consists of a single
        ``<pre>`` element into regular markup.
        '''
        from calibre.ebooks.oeb.base import XHTML_NS, XPath, XHTML
        for spine_item in oeb.spine:
            tree = spine_item.data
            if not hasattr(tree, 'xpath'):
                # Not a parsed XML document, nothing to clean.
                continue
            for tag_name in ('metadata', 'guide'):
                for stray in XPath('//h:'+tag_name)(tree):
                    stray.getparent().remove(stray)
            bodies = XPath('//h:body')(tree)
            if not bodies:
                continue
            first_body = bodies[0]
            if len(first_body) != 1 or first_body[0].tag != XHTML('pre'):
                continue

            pre = first_body[0]
            from calibre.ebooks.txt.processor import convert_basic, \
                separate_paragraphs_single_line
            from calibre.ebooks.chardet import xml_to_unicode
            from calibre.utils.xml_parse import safe_xml_fromstring
            import copy
            self.log('LIT file with all text in singe <pre> tag detected')
            markup = separate_paragraphs_single_line(pre.text)
            markup = convert_basic(markup).replace(
                '<html>', '<html xmlns="%s">'%XHTML_NS)
            markup = xml_to_unicode(markup, strip_encoding_pats=True,
                                    resolve_entities=True)[0]
            if opts.smarten_punctuation:
                # SmartyPants skips text inside <pre> tags
                from calibre.ebooks.conversion.preprocess import smarten_punctuation
                markup = smarten_punctuation(markup, self.log)
            converted = safe_xml_fromstring(markup)
            converted_bodies = XPath('//h:body')(converted)
            # Turn the <pre> into a <div> and hang the converted markup
            # underneath it.
            pre.tag = XHTML('div')
            pre.text = ''
            for node in converted_bodies:
                pre.append(copy.deepcopy(node))
|
||||
38
ebook_converter/ebooks/conversion/plugins/lit_output.py
Normal file
38
ebook_converter/ebooks/conversion/plugins/lit_output.py
Normal file
@@ -0,0 +1,38 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
|
||||
from calibre.customize.conversion import OutputFormatPlugin
|
||||
|
||||
|
||||
class LITOutput(OutputFormatPlugin):
    '''Output plugin that writes an OEB book as a LIT file.'''

    name = 'LIT Output'
    author = 'Marshall T. Vandegrift'
    file_type = 'lit'
    commit_name = 'lit_output'

    def convert(self, oeb, output_path, input_plugin, opts, log):
        '''
        Run the LIT specific transforms on ``oeb`` and write it to
        ``output_path``.

        The book is split on page breaks, then an HTML table of contents is
        added, case is mangled and SVG is rasterized before ``LitWriter``
        produces the file.
        '''
        self.log = log
        self.opts = opts
        self.oeb = oeb
        from calibre.ebooks.oeb.transforms.manglecase import CaseMangler
        from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer
        from calibre.ebooks.oeb.transforms.htmltoc import HTMLTOCAdder
        from calibre.ebooks.lit.writer import LitWriter
        from calibre.ebooks.oeb.transforms.split import Split

        splitter = Split(split_on_page_breaks=True, max_flow_size=0,
                         remove_css_pagebreaks=False)
        splitter(self.oeb, self.opts)

        # Each transform is created right before it is applied, in order.
        for transform_class in (HTMLTOCAdder, CaseMangler, SVGRasterizer):
            transform_class()(oeb, opts)

        writer = LitWriter(self.opts)
        writer(oeb, output_path)
|
||||
82
ebook_converter/ebooks/conversion/plugins/lrf_input.py
Normal file
82
ebook_converter/ebooks/conversion/plugins/lrf_input.py
Normal file
@@ -0,0 +1,82 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os, sys
|
||||
from calibre.customize.conversion import InputFormatPlugin
|
||||
|
||||
|
||||
class LRFInput(InputFormatPlugin):
    '''Input plugin for Sony LRF files.'''

    name = 'LRF Input'
    author = 'Kovid Goyal'
    description = 'Convert LRF files to HTML'
    file_types = {'lrf'}
    commit_name = 'lrf_input'

    def convert(self, stream, options, file_ext, log,
                accelerators):
        '''
        Convert the LRF ``stream`` into files in the current directory.

        The document is dumped to XML, which is then run through the
        ``templates/lrf.xsl`` stylesheet. The transformation result is
        written to ``content.opf``.

        :return: Absolute path of the written ``content.opf``
        '''
        from calibre.ebooks.lrf.input import (MediaType, Styles, TextBlock,
                Canvas, ImageBlock, RuledLine)
        self.log = log
        self.log('Generating XML')
        from calibre.ebooks.lrf.lrfparser import LRFDocument
        from calibre.utils.xml_parse import safe_xml_fromstring
        from lxml import etree
        d = LRFDocument(stream)
        d.parse()
        xml = d.to_xml(write_files=True)
        if options.verbose > 2:
            # Keep the intermediate XML around for debugging.
            with open(u'lrs.xml', 'wb') as debug_file:
                debug_file.write(xml.encode('utf-8'))
        doc = safe_xml_fromstring(xml)

        # Map CharButton refobj -> link target ("<refpage>.xhtml#<refobj>")
        char_button_map = {}
        for x in doc.xpath('//CharButton[@refobj]'):
            ro = x.get('refobj')
            jump_button = doc.xpath('//*[@objid="%s"]'%ro)
            if jump_button:
                jump_to = jump_button[0].xpath('descendant::JumpTo[@refpage and @refobj]')
                if jump_to:
                    char_button_map[ro] = '%s.xhtml#%s'%(jump_to[0].get('refpage'),
                            jump_to[0].get('refobj'))
        # Map Plot refobj -> file name of the referenced image stream
        plot_map = {}
        for x in doc.xpath('//Plot[@refobj]'):
            ro = x.get('refobj')
            image = doc.xpath('//Image[@objid="%s" and @refstream]'%ro)
            if image:
                imgstr = doc.xpath('//ImageStream[@objid="%s" and @file]'%
                    image[0].get('refstream'))
                if imgstr:
                    plot_map[ro] = imgstr[0].get('file')

        self.log('Converting XML to HTML...')
        styledoc = safe_xml_fromstring(P('templates/lrf.xsl', data=True))
        media_type = MediaType()
        styles = Styles()
        text_block = TextBlock(styles, char_button_map, plot_map, log)
        canvas = Canvas(doc, styles, text_block, log)
        image_block = ImageBlock(canvas)
        ruled_line = RuledLine()
        extensions = {
            ('calibre', 'media-type'): media_type,
            ('calibre', 'text-block'): text_block,
            ('calibre', 'ruled-line'): ruled_line,
            ('calibre', 'styles'): styles,
            ('calibre', 'canvas'): canvas,
            ('calibre', 'image-block'): image_block,
        }
        transform = etree.XSLT(styledoc, extensions=extensions)
        try:
            result = transform(doc)
        except RuntimeError:
            # Retry once with a larger recursion limit, taking care never to
            # lower a limit that is already above 5000.
            sys.setrecursionlimit(max(5000, sys.getrecursionlimit()))
            result = transform(doc)

        with open('content.opf', 'wb') as f:
            f.write(result)
        styles.write()
        return os.path.abspath('content.opf')
|
||||
196
ebook_converter/ebooks/conversion/plugins/lrf_output.py
Normal file
196
ebook_converter/ebooks/conversion/plugins/lrf_output.py
Normal file
@@ -0,0 +1,196 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import sys, os
|
||||
|
||||
from calibre.customize.conversion import OutputFormatPlugin
|
||||
from calibre.customize.conversion import OptionRecommendation
|
||||
from polyglot.builtins import unicode_type
|
||||
|
||||
|
||||
class LRFOptions(object):
    '''
    Plain attribute container with the settings used for LRF generation.

    Values are derived from the conversion options and the OEB metadata;
    everything else is set to a fixed value.
    '''

    def __init__(self, output, opts, oeb):
        '''
        :param output: Path of the LRF file to generate
        :param opts: Conversion options. Negative ``margin_*`` values are
                     reset to 0 on this object as a side effect.
        :param oeb: The OEB book whose metadata is read
        '''
        def f2s(f):
            # First entry of a metadata field as text, '' if unavailable.
            try:
                return unicode_type(f[0])
            except Exception:
                return ''
        m = oeb.metadata
        for x in ('left', 'top', 'right', 'bottom'):
            attr = 'margin_'+x
            val = getattr(opts, attr)
            if val < 0:
                setattr(opts, attr, 0)
        self.title = None
        self.author = self.publisher = _('Unknown')
        self.title_sort = self.author_sort = ''
        # The last creator with role 'aut' wins.
        for x in m.creator:
            if x.role == 'aut':
                self.author = unicode_type(x)
                fa = unicode_type(getattr(x, 'file_as', ''))
                if fa:
                    self.author_sort = fa
        for x in m.title:
            if unicode_type(x.file_as):
                self.title_sort = unicode_type(x.file_as)
        self.freetext = f2s(m.description)
        self.category = f2s(m.subject)
        self.cover = None
        self.use_metadata_cover = True
        self.output = output
        self.ignore_tables = opts.linearize_tables
        if opts.disable_font_rescaling:
            self.base_font_size = 0
        else:
            self.base_font_size = opts.base_font_size
        self.blank_after_para = opts.insert_blank_line
        self.use_spine = True
        self.font_delta = 0
        self.ignore_colors = False
        from calibre.ebooks.lrf import PRS500_PROFILE
        self.profile = PRS500_PROFILE
        self.link_levels = sys.maxsize
        self.link_exclude = '@'
        self.no_links_in_toc = True
        self.disable_chapter_detection = True
        # NOTE(review): the patterns below look like placeholders that are
        # not meant to match anything, chapter detection being disabled
        # above - confirm against the LRF converter.
        self.chapter_regex = 'dsadcdswcdec'
        self.chapter_attr = '$,,$'
        self.override_css = self._override_css = ''
        self.page_break = 'h[12]'
        self.force_page_break = '$'
        self.force_page_break_attr = '$'
        self.add_chapters_to_toc = False
        self.baen = self.pdftohtml = self.book_designer = False
        self.verbose = opts.verbose
        self.encoding = 'utf-8'
        self.lrs = False
        self.minimize_memory_usage = False
        self.autorotation = opts.enable_autorotation
        # Lengths are scaled by dpi/72 (presumably points -> device pixels).
        self.header_separation = (self.profile.dpi/72.) * opts.header_separation
        self.headerformat = opts.header_format

        for x in ('top', 'bottom', 'left', 'right'):
            setattr(self, x+'_margin',
                (self.profile.dpi/72.) * float(getattr(opts, 'margin_'+x)))

        # Options that are copied over unchanged.
        for x in ('wordspace', 'header', 'header_format',
                  'minimum_indent', 'serif_family',
                  'render_tables_as_images', 'sans_family', 'mono_family',
                  'text_size_multiplier_for_rendered_tables'):
            setattr(self, x, getattr(opts, x))
|
||||
|
||||
|
||||
class LRFOutput(OutputFormatPlugin):
    '''Output plugin that writes an OEB book as a Sony LRF file.'''

    name = 'LRF Output'
    author = 'Kovid Goyal'
    file_type = 'lrf'
    commit_name = 'lrf_output'

    options = {
        OptionRecommendation(name='enable_autorotation', recommended_value=False,
            help=_('Enable auto-rotation of images that are wider than the screen width.')
        ),
        OptionRecommendation(name='wordspace',
            recommended_value=2.5, level=OptionRecommendation.LOW,
            help=_('Set the space between words in pts. Default is %default')
        ),
        OptionRecommendation(name='header', recommended_value=False,
            help=_('Add a header to all the pages with title and author.')
        ),
        OptionRecommendation(name='header_format', recommended_value="%t by %a",
            help=_('Set the format of the header. %a is replaced by the author '
            'and %t by the title. Default is %default')
        ),
        OptionRecommendation(name='header_separation', recommended_value=0,
            help=_('Add extra spacing below the header. Default is %default pt.')
        ),
        OptionRecommendation(name='minimum_indent', recommended_value=0,
            help=_('Minimum paragraph indent (the indent of the first line '
            'of a paragraph) in pts. Default: %default')
        ),
        OptionRecommendation(name='render_tables_as_images',
            recommended_value=False,
            help=_('This option has no effect')
        ),
        OptionRecommendation(name='text_size_multiplier_for_rendered_tables',
            recommended_value=1.0,
            help=_('Multiply the size of text in rendered tables by this '
            'factor. Default is %default')
        ),
        OptionRecommendation(name='serif_family', recommended_value=None,
            help=_('The serif family of fonts to embed')
        ),
        OptionRecommendation(name='sans_family', recommended_value=None,
            help=_('The sans-serif family of fonts to embed')
        ),
        OptionRecommendation(name='mono_family', recommended_value=None,
            help=_('The monospace family of fonts to embed')
        ),

    }

    recommendations = {
        ('change_justification', 'original', OptionRecommendation.HIGH)}

    def convert_images(self, pages, opts, wide):
        '''
        Write an LRF book that consists of one full page image per page.

        :param pages: Iterable of images, each is passed to ``ImageStream``
        :param opts: Object providing ``title``, ``author`` and ``output``
        :param wide: If true a 784x1012 page is used, otherwise 584x754
        '''
        from calibre.ebooks.lrf.pylrs.pylrs import Book, BookSetting, ImageStream, ImageBlock
        from uuid import uuid4
        from calibre.constants import __appname__, __version__

        width, height = (784, 1012) if wide else (584, 754)

        ps = {}
        ps['topmargin'] = 0
        ps['evensidemargin'] = 0
        ps['oddsidemargin'] = 0
        ps['textwidth'] = width
        ps['textheight'] = height
        book = Book(title=opts.title, author=opts.author,
                bookid=uuid4().hex,
                publisher='%s %s'%(__appname__, __version__),
                category=_('Comic'), pagestyledefault=ps,
                booksetting=BookSetting(screenwidth=width, screenheight=height))
        for page in pages:
            imageStream = ImageStream(page)
            _page = book.create_page()
            _page.append(ImageBlock(refstream=imageStream,
                        blockwidth=width, blockheight=height, xsize=width,
                        ysize=height, x1=width, y1=height))
            book.append(_page)

        # The context manager guarantees the output file is closed even if
        # rendering raises.
        with open(opts.output, 'wb') as lrf_file:
            book.renderLrf(lrf_file)

    def flatten_toc(self):
        '''Replace the book's TOC with a flat list of all its descendants.'''
        from calibre.ebooks.oeb.base import TOC
        nroot = TOC()
        for x in self.oeb.toc.iterdescendants():
            nroot.add(x.title, x.href)
        self.oeb.toc = nroot

    def convert(self, oeb, output_path, input_plugin, opts, log):
        '''
        Write ``oeb`` as an LRF file at ``output_path``.

        Image collections are written directly as one image per page. All
        other books are first written with the OEB output plugin into a
        temporary directory; the resulting OPF is then converted to LRF.
        '''
        self.log, self.opts, self.oeb = log, opts, oeb

        lrf_opts = LRFOptions(output_path, opts, oeb)

        if input_plugin.is_image_collection:
            self.convert_images(input_plugin.get_images(), lrf_opts,
                    getattr(opts, 'wide', False))
            return

        self.flatten_toc()

        from calibre.ptempfile import TemporaryDirectory
        with TemporaryDirectory('_lrf_output') as tdir:
            from calibre.customize.ui import plugin_for_output_format
            oeb_output = plugin_for_output_format('oeb')
            oeb_output.convert(oeb, tdir, input_plugin, opts, log)
            opf = [x for x in os.listdir(tdir) if x.endswith('.opf')][0]
            from calibre.ebooks.lrf.html.convert_from import process_file
            process_file(os.path.join(tdir, opf), lrf_opts, self.log)
|
||||
66
ebook_converter/ebooks/conversion/plugins/mobi_input.py
Normal file
66
ebook_converter/ebooks/conversion/plugins/mobi_input.py
Normal file
@@ -0,0 +1,66 @@
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin
|
||||
from polyglot.builtins import unicode_type
|
||||
|
||||
|
||||
class MOBIInput(InputFormatPlugin):
    '''Input plugin for MOBI files, both MOBI 6 and KF8.'''

    name = 'MOBI Input'
    author = 'Kovid Goyal'
    description = 'Convert MOBI files (.mobi, .prc, .azw) to HTML'
    file_types = {'mobi', 'prc', 'azw', 'azw3', 'pobi'}
    commit_name = 'mobi_input'

    def convert(self, stream, options, file_ext, log,
                accelerators):
        '''
        Extract the MOBI ``stream`` into the current directory.

        Sets ``self.is_kf8`` and ``self.mobi_is_joint`` to describe the
        file. For MOBI 6 content a ``pagebreaks`` expression is stored in
        ``accelerators``.

        :return: Path of the generated OPF file
        '''
        self.is_kf8 = False
        self.mobi_is_joint = False

        from calibre.ebooks.mobi.reader.mobi6 import MobiReader
        from lxml import html
        parse_cache = {}
        try:
            mr = MobiReader(stream, log, options.input_encoding,
                        options.debug_pipeline)
            if mr.kf8_type is None:
                mr.extract_content('.', parse_cache)

        except Exception:
            # Second attempt with the extra data fix enabled. Only real
            # errors get here; KeyboardInterrupt and SystemExit propagate.
            mr = MobiReader(stream, log, options.input_encoding,
                        options.debug_pipeline, try_extra_data_fix=True)
            if mr.kf8_type is None:
                mr.extract_content('.', parse_cache)

        if mr.kf8_type is not None:
            log('Found KF8 MOBI of type %r'%mr.kf8_type)
            if mr.kf8_type == 'joint':
                self.mobi_is_joint = True
            from calibre.ebooks.mobi.reader.mobi8 import Mobi8Reader
            mr = Mobi8Reader(mr, log)
            opf = os.path.abspath(mr())
            self.encrypted_fonts = mr.encrypted_fonts
            self.is_kf8 = True
            return opf

        # MOBI 6: dump the raw markup (if the reader kept it) and write out
        # every parsed document from the cache.
        raw = parse_cache.pop('calibre_raw_mobi_markup', False)
        if raw:
            if isinstance(raw, unicode_type):
                raw = raw.encode('utf-8')
            with lopen('debug-raw.html', 'wb') as f:
                f.write(raw)
        from calibre.ebooks.oeb.base import close_self_closing_tags
        for f, root in parse_cache.items():
            raw = html.tostring(root, encoding='utf-8', method='xml',
                    include_meta_content_type=False)
            raw = close_self_closing_tags(raw)
            with lopen(f, 'wb') as q:
                q.write(raw)
        accelerators['pagebreaks'] = '//h:div[@class="mbp_pagebreak"]'
        return mr.created_opf_path
|
||||
337
ebook_converter/ebooks/conversion/plugins/mobi_output.py
Normal file
337
ebook_converter/ebooks/conversion/plugins/mobi_output.py
Normal file
@@ -0,0 +1,337 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
from calibre.customize.conversion import (OutputFormatPlugin,
|
||||
OptionRecommendation)
|
||||
from polyglot.builtins import unicode_type
|
||||
|
||||
|
||||
def remove_html_cover(oeb, log):
    '''
    Drop the guide's ``cover`` entry and, if it points at a spine item,
    remove that item from the spine.

    Nothing happens unless the book has cover metadata and a ``cover``
    reference in its guide. The item is also removed from the manifest when
    it was in the spine and its media type is one of ``OEB_DOCS``.
    '''
    from calibre.ebooks.oeb.base import OEB_DOCS

    has_cover = oeb.metadata.cover and 'cover' in oeb.guide
    if not has_cover:
        return

    cover_href = oeb.guide['cover'].href
    del oeb.guide['cover']
    cover_item = oeb.manifest.hrefs[cover_href]
    if cover_item.spine_position is None:
        # The cover is not part of the reading order, leave it alone.
        return

    log.warn('Found an HTML cover: ', cover_item.href, 'removing it.',
             'If you find some content missing from the output MOBI, it '
             'is because you misidentified the HTML cover in the input '
             'document')
    oeb.spine.remove(cover_item)
    if cover_item.media_type in OEB_DOCS:
        oeb.manifest.remove(cover_item)
|
||||
|
||||
|
||||
def extract_mobi(output_path, opts):
    '''
    Unpack the generated MOBI file into ``opts.extract_to``.

    Does nothing when ``opts.extract_to`` is None.
    '''
    if opts.extract_to is None:
        return
    from calibre.ebooks.mobi.debug.main import inspect_mobi
    target_dir = opts.extract_to
    inspect_mobi(output_path, ddir=target_dir)
|
||||
|
||||
|
||||
class MOBIOutput(OutputFormatPlugin):
|
||||
|
||||
name = 'MOBI Output'
|
||||
author = 'Kovid Goyal'
|
||||
file_type = 'mobi'
|
||||
commit_name = 'mobi_output'
|
||||
ui_data = {'file_types': ['old', 'both', 'new']}
|
||||
|
||||
options = {
|
||||
OptionRecommendation(name='prefer_author_sort',
|
||||
recommended_value=False, level=OptionRecommendation.LOW,
|
||||
help=_('When present, use author sort field as author.')
|
||||
),
|
||||
OptionRecommendation(name='no_inline_toc',
|
||||
recommended_value=False, level=OptionRecommendation.LOW,
|
||||
help=_('Don\'t add Table of Contents to the book. Useful if '
|
||||
'the book has its own table of contents.')),
|
||||
OptionRecommendation(name='toc_title', recommended_value=None,
|
||||
help=_('Title for any generated in-line table of contents.')
|
||||
),
|
||||
OptionRecommendation(name='dont_compress',
|
||||
recommended_value=False, level=OptionRecommendation.LOW,
|
||||
help=_('Disable compression of the file contents.')
|
||||
),
|
||||
OptionRecommendation(name='personal_doc', recommended_value='[PDOC]',
|
||||
help=_('Tag for MOBI files to be marked as personal documents.'
|
||||
' This option has no effect on the conversion. It is used'
|
||||
' only when sending MOBI files to a device. If the file'
|
||||
' being sent has the specified tag, it will be marked as'
|
||||
' a personal document when sent to the Kindle.')
|
||||
),
|
||||
OptionRecommendation(name='mobi_ignore_margins',
|
||||
recommended_value=False,
|
||||
help=_('Ignore margins in the input document. If False, then '
|
||||
'the MOBI output plugin will try to convert margins specified'
|
||||
' in the input document, otherwise it will ignore them.')
|
||||
),
|
||||
OptionRecommendation(name='mobi_toc_at_start',
|
||||
recommended_value=False,
|
||||
help=_('When adding the Table of Contents to the book, add it at the start of the '
|
||||
'book instead of the end. Not recommended.')
|
||||
),
|
||||
OptionRecommendation(name='extract_to',
|
||||
help=_('Extract the contents of the generated %s file to the '
|
||||
'specified directory. The contents of the directory are first '
|
||||
'deleted, so be careful.') % 'MOBI'
|
||||
),
|
||||
OptionRecommendation(name='share_not_sync', recommended_value=False,
|
||||
help=_('Enable sharing of book content via Facebook etc. '
|
||||
' on the Kindle. WARNING: Using this feature means that '
|
||||
' the book will not auto sync its last read position '
|
||||
' on multiple devices. Complain to Amazon.')
|
||||
),
|
||||
OptionRecommendation(name='mobi_keep_original_images',
|
||||
recommended_value=False,
|
||||
help=_('By default calibre converts all images to JPEG format '
|
||||
'in the output MOBI file. This is for maximum compatibility '
|
||||
'as some older MOBI viewers have problems with other image '
|
||||
'formats. This option tells calibre not to do this. '
|
||||
'Useful if your document contains lots of GIF/PNG images that '
|
||||
'become very large when converted to JPEG.')),
|
||||
OptionRecommendation(name='mobi_file_type', choices=ui_data['file_types'], recommended_value='old',
|
||||
help=_('By default calibre generates MOBI files that contain the '
|
||||
'old MOBI 6 format. This format is compatible with all '
|
||||
'devices. However, by changing this setting, you can tell '
|
||||
'calibre to generate MOBI files that contain both MOBI 6 and '
|
||||
'the new KF8 format, or only the new KF8 format. KF8 has '
|
||||
'more features than MOBI 6, but only works with newer Kindles. '
|
||||
'Allowed values: {}').format('old, both, new')),
|
||||
|
||||
}
|
||||
|
||||
def check_for_periodical(self):
    '''
    Record on the options whether a periodical is being generated.

    For periodicals the TOC is converted and a masthead is ensured
    before ``opts.mobi_periodical`` is set.
    '''
    if not self.is_periodical:
        self.opts.mobi_periodical = False
        return
    self.periodicalize_toc()
    self.check_for_masthead()
    self.opts.mobi_periodical = True
|
||||
|
||||
def check_for_masthead(self):
    '''
    Make sure the book's guide has a ``masthead`` entry.

    If there is none, a masthead image is generated from the first title
    and added to both the manifest and the guide.
    '''
    book = self.oeb
    if 'masthead' in book.guide:
        book.log.debug('Using mastheadImage supplied in manifest...')
        return

    from calibre.ebooks import generate_masthead
    book.log.debug('No masthead found in manifest, generating default mastheadImage...')
    image_data = generate_masthead(unicode_type(book.metadata['title'][0]))
    item_id, item_href = book.manifest.generate('masthead', 'masthead')
    book.manifest.add(item_id, item_href, 'image/gif', data=image_data)
    book.guide.add('masthead', 'Masthead Image', item_href)
|
||||
|
||||
def periodicalize_toc(self):
    '''
    Restructure the TOC into periodical -> section -> article nodes.

    Nothing is done when the TOC is empty, the spine has fewer than three
    items, or the first TOC node already has the class ``periodical``.
    '''
    from calibre.ebooks.oeb.base import TOC
    toc = self.oeb.toc
    if not toc or len(self.oeb.spine) < 3:
        return
    if toc and toc[0].klass != 'periodical':
        one, two = self.oeb.spine[0], self.oeb.spine[1]
        self.log('Converting TOC for MOBI periodical indexing...')

        # Articles of every section, keyed by the id() of the section node
        articles = {}
        if toc.depth() < 3:
            # Single section periodical: drop the first two spine items
            # from the manifest and put all existing top level nodes into
            # one synthetic section.
            self.oeb.manifest.remove(one)
            self.oeb.manifest.remove(two)
            # spine[0] is looked up again after the removals above
            sections = [TOC(klass='section', title=_('All articles'),
                href=self.oeb.spine[0].href)]
            for x in toc:
                sections[0].nodes.append(x)
        else:
            # Multi-section periodical: the top level nodes are the
            # sections.
            self.oeb.manifest.remove(one)
            sections = list(toc)
            for i,x in enumerate(sections):
                x.klass = 'section'
                articles_ = list(x)
                if articles_:
                    # Remove the section's own document and point the
                    # section at its first article instead.
                    self.oeb.manifest.remove(self.oeb.manifest.hrefs[x.href])
                    x.href = articles_[0].href

        # Detach the articles from their sections, marking them as such
        for sec in sections:
            articles[id(sec)] = []
            for a in list(sec):
                a.klass = 'article'
                articles[id(sec)].append(a)
                sec.nodes.remove(a)

        root = TOC(klass='periodical', href=self.oeb.spine[0].href,
                title=unicode_type(self.oeb.metadata.title[0]))

        # Re-attach the articles; sections without articles are left out
        for s in sections:
            if articles[id(s)]:
                for a in articles[id(s)]:
                    s.nodes.append(a)
                root.nodes.append(s)

        # Replace the old top level nodes with the periodical root
        for x in list(toc.nodes):
            toc.nodes.remove(x)

        toc.nodes.append(root)

        # Fix up the periodical href to point to first section href
        # NOTE(review): raises IndexError if no section had any articles,
        # as root.nodes is empty in that case.
        toc.nodes[0].href = toc.nodes[0].nodes[0].href
|
||||
|
||||
def convert(self, oeb, output_path, input_plugin, opts, log):
    '''
    Write ``oeb`` as a MOBI file at ``output_path``.

    Depending on ``opts.mobi_file_type`` the file contains MOBI 6 ('old'),
    KF8 ('new') or both ('both'). Periodicals are always written as
    MOBI 6.
    '''
    from calibre.ebooks.mobi.writer2.resources import Resources
    self.log = log
    self.opts = opts
    self.oeb = oeb

    mobi_type = opts.mobi_file_type
    if self.is_periodical:
        # Amazon does not support KF8 periodicals
        mobi_type = 'old'
    wants_kf8 = mobi_type in ('new', 'both')

    remove_html_cover(self.oeb, self.log)
    resources = Resources(oeb, opts, self.is_periodical,
                          add_fonts=wants_kf8)
    self.check_for_periodical()

    kf8 = None
    if wants_kf8:
        from calibre.ebooks.mobi.writer8.cleanup import remove_duplicate_anchors
        remove_duplicate_anchors(self.oeb)
        # Split on pagebreaks so that the resulting KF8 is faster to load
        from calibre.ebooks.oeb.transforms.split import Split
        Split()(self.oeb, self.opts)
        kf8 = self.create_kf8(resources, for_joint=(mobi_type == 'both'))

    if mobi_type == 'new':
        # KF8 only: no MOBI 6 part is written.
        kf8.write(output_path)
        extract_mobi(output_path, opts)
    else:
        self.log('Creating MOBI 6 output')
        self.write_mobi(input_plugin, output_path, kf8, resources)
|
||||
|
||||
def create_kf8(self, resources, for_joint=False):
    '''
    Build a KF8 book from ``self.oeb`` and ``self.opts``.

    :param resources: Resources object passed through to the KF8 writer
    :param for_joint: Forwarded unchanged to ``create_kf8_book``
    :return: Whatever ``create_kf8_book`` returns
    '''
    from calibre.ebooks.mobi.writer8.main import create_kf8_book
    book = create_kf8_book(
        self.oeb, self.opts, resources, for_joint=for_joint)
    return book
|
||||
|
||||
def write_mobi(self, input_plugin, output_path, kf8, resources):
    '''
    Run the MOBI 6 specific transforms on ``self.oeb`` and write the result.

    In order: optionally add an inline HTML TOC, run the case mangler,
    rasterize SVG (when the rasterizer is available), apply the jacket
    workaround, convert to MobiML and finally invoke ``MobiWriter``.

    :param input_plugin: Input plugin; compared with the 'cbz' input plugin
        to decide whether page breaks are written after each item
    :param output_path: Destination of the generated file
    :param kf8: KF8 book passed to ``MobiWriter`` (may be ``None``)
    :param resources: Resources object shared with the writer
    '''
    from calibre.ebooks.mobi.mobiml import MobiMLizer
    from calibre.ebooks.oeb.transforms.manglecase import CaseMangler
    from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer, Unavailable
    from calibre.ebooks.oeb.transforms.htmltoc import HTMLTOCAdder
    from calibre.customize.ui import plugin_for_input_format

    opts = self.opts
    oeb = self.oeb

    if not opts.no_inline_toc:
        add_toc = HTMLTOCAdder(
            title=opts.toc_title,
            position='start' if opts.mobi_toc_at_start else 'end')
        add_toc(oeb, opts)

    CaseMangler()(oeb, opts)

    try:
        SVGRasterizer()(oeb, opts)
    except Unavailable:
        self.log.warn('SVG rasterizer unavailable, SVG will not be converted')
    else:
        # Add rasterized SVG images
        resources.add_extra_images()

    if hasattr(self.oeb, 'inserted_metadata_jacket'):
        self.workaround_fire_bugs(self.oeb.inserted_metadata_jacket)

    MobiMLizer(ignore_tables=opts.linearize_tables)(oeb, opts)

    # Identity comparison against the plugin registered for 'cbz' input
    breaks_after_item = input_plugin is not plugin_for_input_format('cbz')

    from calibre.ebooks.mobi.writer2.main import MobiWriter
    write_book = MobiWriter(opts, resources, kf8,
                            write_page_breaks_after_item=breaks_after_item)
    write_book(oeb, output_path)
    extract_mobi(output_path, opts)
|
||||
|
||||
def specialize_css_for_output(self, log, opts, item, stylizer):
    '''
    Run the KF8 ``CSSCleanup`` pass over ``item`` using ``stylizer``.

    :param log: Logger handed to ``CSSCleanup``
    :param opts: Conversion options handed to ``CSSCleanup``
    :param item: The item to clean up
    :param stylizer: The stylizer associated with ``item``
    '''
    from calibre.ebooks.mobi.writer8.cleanup import CSSCleanup
    cleanup = CSSCleanup(log, opts)
    cleanup(item, stylizer)
|
||||
|
||||
def workaround_fire_bugs(self, jacket):
    '''
    Replace the table markup in the metadata jacket with div/span elements.

    Every ``table`` and ``tr`` element becomes an XHTML ``div`` and every
    ``td`` below a row becomes an XHTML ``span``. The tree in
    ``jacket.data`` is modified in place.

    :param jacket: Item whose ``data`` attribute supports ``xpath()``
    '''
    # The idiotic Fire crashes when trying to render the table used to
    # layout the jacket
    from calibre.ebooks.oeb.base import XHTML
    for table in jacket.data.xpath('//*[local-name()="table"]'):
        table.tag = XHTML('div')
        for tr in table.xpath('descendant::*[local-name()="tr"]'):
            cols = tr.xpath('descendant::*[local-name()="td"]')
            tr.tag = XHTML('div')
            for td in cols:
                # Previously written as "'span' if cols else 'div'", but
                # cols is never empty inside this loop, so the cell always
                # became a span; say so directly.
                td.tag = XHTML('span')
|
||||
|
||||
|
||||
class AZW3Output(OutputFormatPlugin):
    '''Output plugin that writes a KF8-only book with the ``azw3`` file type.'''

    name = 'AZW3 Output'
    author = 'Kovid Goyal'
    file_type = 'azw3'
    commit_name = 'azw3_output'

    # Options specific to this output format
    options = {
        OptionRecommendation(
            name='prefer_author_sort',
            recommended_value=False,
            level=OptionRecommendation.LOW,
            help=_('When present, use author sort field as author.'),
        ),
        OptionRecommendation(
            name='no_inline_toc',
            recommended_value=False,
            level=OptionRecommendation.LOW,
            help=_('Don\'t add Table of Contents to the book. Useful if '
                   'the book has its own table of contents.'),
        ),
        OptionRecommendation(
            name='toc_title',
            recommended_value=None,
            help=_('Title for any generated in-line table of contents.'),
        ),
        OptionRecommendation(
            name='dont_compress',
            recommended_value=False,
            level=OptionRecommendation.LOW,
            help=_('Disable compression of the file contents.'),
        ),
        OptionRecommendation(
            name='mobi_toc_at_start',
            recommended_value=False,
            help=_('When adding the Table of Contents to the book, add it at the start of the '
                   'book instead of the end. Not recommended.'),
        ),
        OptionRecommendation(
            name='extract_to',
            help=_('Extract the contents of the generated %s file to the '
                   'specified directory. The contents of the directory are first '
                   'deleted, so be careful.') % 'AZW3',
        ),
        OptionRecommendation(
            name='share_not_sync',
            recommended_value=False,
            help=_('Enable sharing of book content via Facebook etc. '
                   ' on the Kindle. WARNING: Using this feature means that '
                   ' the book will not auto sync its last read position '
                   ' on multiple devices. Complain to Amazon.'),
        ),
    }

    def convert(self, oeb, output_path, input_plugin, opts, log):
        '''
        Write ``oeb`` to ``output_path`` as a KF8 book.

        Cover removal and splitting on page breaks are skipped when
        ``opts.mobi_passthrough`` is set.

        :param oeb: The OEB book to convert
        :param output_path: Destination of the generated file
        :param input_plugin: Unused here
        :param opts: Conversion options; ``mobi_periodical`` is set on it
        :param log: Logger, stored as ``self.log``
        '''
        from calibre.ebooks.mobi.writer2.resources import Resources
        from calibre.ebooks.mobi.writer8.main import create_kf8_book
        from calibre.ebooks.mobi.writer8.cleanup import remove_duplicate_anchors

        self.oeb = oeb
        self.opts = opts
        self.log = log
        opts.mobi_periodical = self.is_periodical
        passthrough = getattr(opts, 'mobi_passthrough', False)
        remove_duplicate_anchors(oeb)

        resources = Resources(
            self.oeb, self.opts, self.is_periodical,
            add_fonts=True, process_images=False)

        if not passthrough:
            remove_html_cover(self.oeb, self.log)

            # Split on pagebreaks so that the resulting KF8 is faster to load
            from calibre.ebooks.oeb.transforms.split import Split
            splitter = Split()
            splitter(self.oeb, self.opts)

        book = create_kf8_book(self.oeb, self.opts, resources, for_joint=False)
        book.write(output_path)
        extract_mobi(output_path, opts)

    def specialize_css_for_output(self, log, opts, item, stylizer):
        '''Run the KF8 ``CSSCleanup`` pass over ``item`` using ``stylizer``.'''
        from calibre.ebooks.mobi.writer8.cleanup import CSSCleanup
        cleanup = CSSCleanup(log, opts)
        cleanup(item, stylizer)
|
||||
25
ebook_converter/ebooks/conversion/plugins/odt_input.py
Normal file
25
ebook_converter/ebooks/conversion/plugins/odt_input.py
Normal file
@@ -0,0 +1,25 @@
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
'''
|
||||
Convert an ODT file into an Open Ebook
|
||||
'''
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin
|
||||
|
||||
|
||||
class ODTInput(InputFormatPlugin):
    '''Input plugin for ``odt`` files.'''

    name = 'ODT Input'
    author = 'Kovid Goyal'
    description = 'Convert ODT (OpenOffice) files to HTML'
    file_types = {'odt'}
    commit_name = 'odt_input'

    def convert(self, stream, options, file_ext, log, accelerators):
        '''
        Extract ``stream`` into the current directory.

        :param stream: The ODT input stream
        :param options: Unused here
        :param file_ext: Unused here
        :param log: Logger handed to the extractor
        :param accelerators: Unused here
        :return: Whatever the ``Extract`` callable returns
        '''
        from calibre.ebooks.odt.input import Extract
        extractor = Extract()
        return extractor(stream, '.', log)
|
||||
122
ebook_converter/ebooks/conversion/plugins/oeb_output.py
Normal file
122
ebook_converter/ebooks/conversion/plugins/oeb_output.py
Normal file
@@ -0,0 +1,122 @@
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os, re
|
||||
|
||||
|
||||
from calibre.customize.conversion import (OutputFormatPlugin,
|
||||
OptionRecommendation)
|
||||
from calibre import CurrentDir
|
||||
|
||||
|
||||
class OEBOutput(OutputFormatPlugin):
    '''Output plugin that writes the book as a directory of OEB files.'''

    name = 'OEB Output'
    author = 'Kovid Goyal'
    file_type = 'oeb'
    commit_name = 'oeb_output'

    recommendations = {('pretty_print', True, OptionRecommendation.HIGH)}

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        '''
        Serialize ``oeb_book`` into the directory ``output_path``.

        The OPF, NCX and page-map documents returned by ``to_opf2`` are
        written first, followed by every manifest item. The directory is
        created if it does not exist.

        :param oeb_book: The OEB book to write
        :param output_path: Directory that receives the files
        :param input_plugin: Unused here
        :param opts: Conversion options, stored as ``self.opts``
        :param log: Logger, stored as ``self.log``
        '''
        from polyglot.urllib import unquote
        from lxml import etree

        self.log, self.opts = log, opts
        if not os.path.exists(output_path):
            os.makedirs(output_path)
        from calibre.ebooks.oeb.base import OPF_MIME, NCX_MIME, PAGE_MAP_MIME, OEB_STYLES
        from calibre.ebooks.oeb.normalize_css import condense_sheet
        with CurrentDir(output_path):
            results = oeb_book.to_opf2(page_map=True)
            for key in (OPF_MIME, NCX_MIME, PAGE_MAP_MIME):
                href, root = results.pop(key, [None, None])
                if root is None:
                    continue
                if key == OPF_MIME:
                    # The cover workarounds are best effort: log and carry
                    # on, but let KeyboardInterrupt/SystemExit propagate.
                    try:
                        self.workaround_nook_cover_bug(root)
                    except Exception:
                        self.log.exception('Something went wrong while trying to'
                                ' workaround Nook cover bug, ignoring')
                    try:
                        self.workaround_pocketbook_cover_bug(root)
                    except Exception:
                        self.log.exception('Something went wrong while trying to'
                                ' workaround Pocketbook cover bug, ignoring')
                    self.migrate_lang_code(root)
                raw = etree.tostring(root, pretty_print=True,
                        encoding='utf-8', xml_declaration=True)
                if key == OPF_MIME:
                    # Needed as I can't get lxml to output opf:role and
                    # not output <opf:metadata> as well
                    raw = re.sub(br'(<[/]{0,1})opf:', br'\1', raw)
                with lopen(href, 'wb') as f:
                    f.write(raw)

            for item in oeb_book.manifest:
                if (
                        not self.opts.expand_css and item.media_type in OEB_STYLES and hasattr(
                            item.data, 'cssText') and 'nook' not in self.opts.output_profile.short_name):
                    condense_sheet(item.data)
                path = os.path.abspath(unquote(item.href))
                dest_dir = os.path.dirname(path)
                if not os.path.exists(dest_dir):
                    os.makedirs(dest_dir)
                with lopen(path, 'wb') as f:
                    f.write(item.bytes_representation)
                item.unload_data_from_memory(memory=path)

    def workaround_nook_cover_bug(self, root):  # {{{
        '''
        Make sure the manifest item referenced by the cover ``meta`` element
        has the id "cover".

        Acts only when there is exactly one cover ``meta`` whose content is
        not "cover" and it points at exactly one manifest item with an
        ``image/`` media type. Any item already using the id "cover" is
        given a fresh id and references to it are updated.

        :param root: Root element of the OPF tree (modified in place)
        '''
        cov = root.xpath('//*[local-name() = "meta" and @name="cover" and'
                ' @content != "cover"]')

        def manifest_items_with_id(id_):
            # Pass the id as an XPath variable rather than interpolating it
            # into the expression, so ids containing quotes cannot break it
            return root.xpath('//*[local-name() = "manifest"]/*[local-name() = "item" '
                    ' and @id=$item_id]', item_id=id_)

        if len(cov) != 1:
            return
        cov = cov[0]
        covid = cov.get('content', '')
        if not covid:
            return

        manifest_item = manifest_items_with_id(covid)
        if len(manifest_item) != 1 or not manifest_item[0].get(
                'media-type', '').startswith('image/'):
            return

        self.log.warn('The cover image has an id != "cover". Renaming'
                ' to work around bug in Nook Color')

        from calibre.ebooks.oeb.base import uuid_id
        newid = uuid_id()

        # Free up the id "cover" if something else is already using it
        for item in manifest_items_with_id('cover'):
            item.set('id', newid)

        for x in root.xpath('//*[@idref="cover"]'):
            x.set('idref', newid)

        manifest_item = manifest_item[0]
        manifest_item.set('id', 'cover')
        cov.set('content', 'cover')
    # }}}

    def workaround_pocketbook_cover_bug(self, root):  # {{{
        '''
        Move the manifest item with id "cover" to the front of its parent.

        Does nothing unless exactly one such item exists.

        :param root: Root element of the OPF tree (modified in place)
        '''
        m = root.xpath('//*[local-name() = "manifest"]/*[local-name() = "item" '
                ' and @id="cover"]')
        if len(m) == 1:
            m = m[0]
            p = m.getparent()
            p.remove(m)
            p.insert(0, m)
    # }}}

    def migrate_lang_code(self, root):  # {{{
        '''
        Rewrite the text of every ``language`` element with the value
        returned by ``lang_as_iso639_1``, when that value is truthy.

        :param root: Root element of the OPF tree (modified in place)
        '''
        from calibre.utils.localization import lang_as_iso639_1
        for lang in root.xpath('//*[local-name() = "language"]'):
            clc = lang_as_iso639_1(lang.text)
            if clc:
                lang.text = clc
    # }}}
|
||||
37
ebook_converter/ebooks/conversion/plugins/pdb_input.py
Normal file
37
ebook_converter/ebooks/conversion/plugins/pdb_input.py
Normal file
@@ -0,0 +1,37 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin
|
||||
from polyglot.builtins import getcwd
|
||||
|
||||
|
||||
class PDBInput(InputFormatPlugin):
    '''Input plugin for ``pdb`` and ``updb`` container files.'''

    name = 'PDB Input'
    author = 'John Schember'
    description = 'Convert PDB to HTML'
    file_types = {'pdb', 'updb'}
    commit_name = 'pdb_input'

    def convert(self, stream, options, file_ext, log,
                accelerators):
        '''
        Pick a reader based on the identity in the PDB header and let it
        extract the content into the current directory.

        :param stream: The PDB input stream
        :param options: Conversion options handed to the reader
        :param file_ext: Unused here
        :param log: Logger
        :param accelerators: Unused here
        :return: Whatever the reader's ``extract_content`` returns
        :raises PDBError: If no reader exists for the header identity
        '''
        from calibre.ebooks.pdb.header import PdbHeaderReader
        from calibre.ebooks.pdb import PDBError, IDENTITY_TO_NAME, get_reader

        header = PdbHeaderReader(stream)
        Reader = get_reader(header.ident)
        # One tolerant lookup shared by the error and the debug message, so
        # that a missing name can never turn a log call into a KeyError
        book_type = IDENTITY_TO_NAME.get(header.ident, _('Unknown'))

        if Reader is None:
            raise PDBError('No reader available for format within container.\n Identity is %s. Book type is %s' %
                        (header.ident, book_type))

        log.debug('Detected ebook format as: %s with identity: %s' % (book_type, header.ident))

        reader = Reader(header, stream, log, options)
        opf = reader.extract_content(getcwd())

        return opf
|
||||
64
ebook_converter/ebooks/conversion/plugins/pdb_output.py
Normal file
64
ebook_converter/ebooks/conversion/plugins/pdb_output.py
Normal file
@@ -0,0 +1,64 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os
|
||||
|
||||
from calibre.customize.conversion import OutputFormatPlugin, \
|
||||
OptionRecommendation
|
||||
from calibre.ebooks.pdb import PDBError, get_writer, ALL_FORMAT_WRITERS
|
||||
|
||||
|
||||
class PDBOutput(OutputFormatPlugin):
    '''Output plugin that writes the book into a ``pdb`` container.'''

    name = 'PDB Output'
    author = 'John Schember'
    file_type = 'pdb'
    commit_name = 'pdb_output'
    ui_data = {'formats': tuple(ALL_FORMAT_WRITERS)}

    options = {
        OptionRecommendation(name='format', recommended_value='doc',
            level=OptionRecommendation.LOW,
            short_switch='f', choices=list(ALL_FORMAT_WRITERS),
            help=(_('Format to use inside the pdb container. Choices are:') + ' %s' % sorted(ALL_FORMAT_WRITERS))),
        OptionRecommendation(name='pdb_output_encoding', recommended_value='cp1252',
            level=OptionRecommendation.LOW,
            help=_('Specify the character encoding of the output document. '
                'The default is cp1252. Note: This option is not honored by all '
                'formats.')),
        OptionRecommendation(name='inline_toc',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Add Table of Contents to beginning of the book.')),
    }

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        '''
        Write ``oeb_book`` using the writer registered for ``opts.format``.

        :param oeb_book: The OEB book to write
        :param output_path: A filesystem path, or an object with a ``write``
            method that is used as the output stream directly
        :param input_plugin: Unused here
        :param opts: Conversion options; ``max_line_length`` and
            ``force_max_line_length`` are overwritten on it
        :param log: Logger handed to the writer
        :raises PDBError: If no writer exists for ``opts.format``
        '''
        # Resolve the writer before touching the filesystem, so that an
        # unsupported format neither truncates an existing output file nor
        # leaves an open handle behind.
        Writer = get_writer(opts.format)
        if Writer is None:
            # Report the requested format, not the ``format`` builtin
            raise PDBError('No writer available for format %s.' % opts.format)

        close = not hasattr(output_path, 'write')
        if close:
            out_dir = os.path.dirname(output_path)
            if out_dir and not os.path.exists(out_dir):
                os.makedirs(out_dir)
            out_stream = lopen(output_path, 'wb')
        else:
            out_stream = output_path

        try:
            setattr(opts, 'max_line_length', 0)
            setattr(opts, 'force_max_line_length', False)

            writer = Writer(opts, log)

            out_stream.seek(0)
            out_stream.truncate()

            writer.write_content(oeb_book, out_stream, oeb_book.metadata)
        finally:
            # Only close streams this method opened itself
            if close:
                out_stream.close()
|
||||
82
ebook_converter/ebooks/conversion/plugins/pdf_input.py
Normal file
82
ebook_converter/ebooks/conversion/plugins/pdf_input.py
Normal file
@@ -0,0 +1,82 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
|
||||
from polyglot.builtins import as_bytes, getcwd
|
||||
|
||||
|
||||
class PDFInput(InputFormatPlugin):
    '''Input plugin for ``pdf`` files, built on the ``pdftohtml`` helper.'''

    name = 'PDF Input'
    author = 'Kovid Goyal and John Schember'
    description = 'Convert PDF files to HTML'
    file_types = {'pdf'}
    commit_name = 'pdf_input'

    options = {
        OptionRecommendation(
            name='no_images',
            recommended_value=False,
            help=_('Do not extract images from the document'),
        ),
        OptionRecommendation(
            name='unwrap_factor',
            recommended_value=0.45,
            help=_('Scale used to determine the length at which a line should '
                   'be unwrapped. Valid values are a decimal between 0 and 1. The '
                   'default is 0.45, just below the median line length.'),
        ),
        OptionRecommendation(
            name='new_pdf_engine',
            recommended_value=False,
            help=_('Use the new PDF conversion engine. Currently not operational.'),
        ),
    }

    def convert_new(self, stream, accelerators):
        '''
        Convert via the XML output of ``pdftohtml`` and ``PDFDocument``.

        :param stream: The PDF input stream; only ``stream.name`` is used
        :param accelerators: Unused here
        :return: Path of ``metadata.opf`` in the current directory
        '''
        from calibre.ebooks.pdf.pdftohtml import pdftohtml
        from calibre.utils.cleantext import clean_ascii_chars
        from calibre.ebooks.pdf.reflow import PDFDocument

        pdftohtml(getcwd(), stream.name, self.opts.no_images, as_xml=True)
        with lopen('index.xml', 'rb') as src:
            raw_xml = src.read()
        PDFDocument(clean_ascii_chars(raw_xml), self.opts, self.log)
        return os.path.join(getcwd(), 'metadata.opf')

    def convert(self, stream, options, file_ext, log,
                accelerators):
        '''
        Convert the PDF to ``index.html`` in the current directory and
        write a ``metadata.opf`` describing it.

        Every other entry found in the current directory after the
        conversion is added to the manifest. If a ``toc.ncx`` exists and
        has a manifest id, that id is set as the spine's ``toc`` attribute.

        :param stream: The PDF input stream
        :param options: Conversion options, stored as ``self.opts``
        :param file_ext: Unused here
        :param log: Logger, stored as ``self.log``
        :param accelerators: Passed on to :meth:`convert_new`
        :return: Path of ``metadata.opf`` in the current directory
        '''
        from calibre.ebooks.metadata.opf2 import OPFCreator
        from calibre.ebooks.pdf.pdftohtml import pdftohtml

        log.debug('Converting file to html...')
        # The main html file will be named index.html
        self.opts = options
        self.log = log
        if options.new_pdf_engine:
            return self.convert_new(stream, accelerators)
        pdftohtml(getcwd(), stream.name, options.no_images)

        from calibre.ebooks.metadata.meta import get_metadata
        log.debug('Retrieving document metadata...')
        mi = get_metadata(stream, 'pdf')
        opf = OPFCreator(getcwd(), mi)

        # index.html goes first; everything else in the directory follows
        other_files = os.listdir(getcwd())
        other_files.remove('index.html')
        manifest = [('index.html', None)]
        manifest.extend((fname, None) for fname in other_files)
        log.debug('Generating manifest...')
        opf.create_manifest(manifest)

        opf.create_spine(['index.html'])
        log.debug('Rendering manifest...')
        with lopen('metadata.opf', 'wb') as opffile:
            opf.render(opffile)

        ncxid = None
        if os.path.exists('toc.ncx'):
            ncxid = opf.manifest.id_for_path('toc.ncx')
        if ncxid:
            with lopen('metadata.opf', 'r+b') as f:
                rendered = f.read()
                patched = rendered.replace(
                    b'<spine', b'<spine toc="%s"' % as_bytes(ncxid))
                f.seek(0)
                f.write(patched)

        return os.path.join(getcwd(), 'metadata.opf')
|
||||
256
ebook_converter/ebooks/conversion/plugins/pdf_output.py
Normal file
256
ebook_converter/ebooks/conversion/plugins/pdf_output.py
Normal file
@@ -0,0 +1,256 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
'''
|
||||
Convert OEB ebook format to PDF.
|
||||
'''
|
||||
|
||||
import glob, os
|
||||
|
||||
from calibre.customize.conversion import (OutputFormatPlugin,
|
||||
OptionRecommendation)
|
||||
from calibre.ptempfile import TemporaryDirectory
|
||||
from polyglot.builtins import iteritems, unicode_type
|
||||
|
||||
UNITS = ('millimeter', 'centimeter', 'point', 'inch' , 'pica' , 'didot',
|
||||
'cicero', 'devicepixel')
|
||||
|
||||
PAPER_SIZES = ('a0', 'a1', 'a2', 'a3', 'a4', 'a5', 'a6', 'b0', 'b1',
|
||||
'b2', 'b3', 'b4', 'b5', 'b6', 'legal', 'letter')
|
||||
|
||||
|
||||
class PDFOutput(OutputFormatPlugin):
    '''Output plugin that renders the book to PDF.'''

    name = 'PDF Output'
    author = 'Kovid Goyal'
    file_type = 'pdf'
    commit_name = 'pdf_output'
    ui_data = {'paper_sizes': PAPER_SIZES, 'units': UNITS, 'font_types': ('serif', 'sans', 'mono')}

    options = {
        OptionRecommendation(name='use_profile_size', recommended_value=False,
            help=_('Instead of using the paper size specified in the PDF Output options,'
                   ' use a paper size corresponding to the current output profile.'
                   ' Useful if you want to generate a PDF for viewing on a specific device.')),
        OptionRecommendation(name='unit', recommended_value='inch',
            level=OptionRecommendation.LOW, short_switch='u', choices=UNITS,
            help=_('The unit of measure for page sizes. Default is inch. Choices '
                   'are {} '
                   'Note: This does not override the unit for margins!').format(', '.join(UNITS))),
        OptionRecommendation(name='paper_size', recommended_value='letter',
            level=OptionRecommendation.LOW, choices=PAPER_SIZES,
            help=_('The size of the paper. This size will be overridden when a '
                   'non default output profile is used. Default is letter. Choices '
                   'are {}').format(', '.join(PAPER_SIZES))),
        OptionRecommendation(name='custom_size', recommended_value=None,
            help=_('Custom size of the document. Use the form widthxheight '
                   'e.g. `123x321` to specify the width and height. '
                   'This overrides any specified paper-size.')),
        OptionRecommendation(name='preserve_cover_aspect_ratio',
            recommended_value=False,
            help=_('Preserve the aspect ratio of the cover, instead'
                   ' of stretching it to fill the full first page of the'
                   ' generated pdf.')),
        OptionRecommendation(name='pdf_serif_family',
            recommended_value='Times', help=_(
                'The font family used to render serif fonts. Will work only if the font is available system-wide.')),
        OptionRecommendation(name='pdf_sans_family',
            recommended_value='Helvetica', help=_(
                'The font family used to render sans-serif fonts. Will work only if the font is available system-wide.')),
        OptionRecommendation(name='pdf_mono_family',
            recommended_value='Courier', help=_(
                'The font family used to render monospace fonts. Will work only if the font is available system-wide.')),
        # The help text here used to be a copy of the monospace-family help
        # above, which does not describe this option (its choices are the
        # font types serif/sans/mono).
        OptionRecommendation(name='pdf_standard_font', choices=ui_data['font_types'],
            recommended_value='serif', help=_(
                'The font type to use as the standard font')),
        OptionRecommendation(name='pdf_default_font_size',
            recommended_value=20, help=_(
                'The default font size')),
        OptionRecommendation(name='pdf_mono_font_size',
            recommended_value=16, help=_(
                'The default font size for monospaced text')),
        OptionRecommendation(name='pdf_hyphenate', recommended_value=False,
            help=_('Break long words at the end of lines. This can give the text at the right margin a more even appearance.')),
        OptionRecommendation(name='pdf_mark_links', recommended_value=False,
            help=_('Surround all links with a red box, useful for debugging.')),
        OptionRecommendation(name='pdf_page_numbers', recommended_value=False,
            help=_('Add page numbers to the bottom of every page in the generated PDF file. If you '
                   'specify a footer template, it will take precedence '
                   'over this option.')),
        OptionRecommendation(name='pdf_footer_template', recommended_value=None,
            help=_('An HTML template used to generate %s on every page.'
                   ' The strings _PAGENUM_, _TITLE_, _AUTHOR_ and _SECTION_ will be replaced by their current values.')%_('footers')),
        OptionRecommendation(name='pdf_header_template', recommended_value=None,
            help=_('An HTML template used to generate %s on every page.'
                   ' The strings _PAGENUM_, _TITLE_, _AUTHOR_ and _SECTION_ will be replaced by their current values.')%_('headers')),
        OptionRecommendation(name='pdf_add_toc', recommended_value=False,
            help=_('Add a Table of Contents at the end of the PDF that lists page numbers. '
                   'Useful if you want to print out the PDF. If this PDF is intended for electronic use, use the PDF Outline instead.')),
        OptionRecommendation(name='toc_title', recommended_value=None,
            help=_('Title for generated table of contents.')
        ),

        OptionRecommendation(name='pdf_page_margin_left', recommended_value=72.0,
            level=OptionRecommendation.LOW,
            help=_('The size of the left page margin, in pts. Default is 72pt.'
                   ' Overrides the common left page margin setting.')
        ),

        OptionRecommendation(name='pdf_page_margin_top', recommended_value=72.0,
            level=OptionRecommendation.LOW,
            help=_('The size of the top page margin, in pts. Default is 72pt.'
                   ' Overrides the common top page margin setting, unless set to zero.')
        ),

        OptionRecommendation(name='pdf_page_margin_right', recommended_value=72.0,
            level=OptionRecommendation.LOW,
            help=_('The size of the right page margin, in pts. Default is 72pt.'
                   ' Overrides the common right page margin setting, unless set to zero.')
        ),

        OptionRecommendation(name='pdf_page_margin_bottom', recommended_value=72.0,
            level=OptionRecommendation.LOW,
            help=_('The size of the bottom page margin, in pts. Default is 72pt.'
                   ' Overrides the common bottom page margin setting, unless set to zero.')
        ),
        OptionRecommendation(name='pdf_use_document_margins', recommended_value=False,
            help=_('Use the page margins specified in the input document via @page CSS rules.'
                   ' This will cause the margins specified in the conversion settings to be ignored.'
                   ' If the document does not specify page margins, the conversion settings will be used as a fallback.')
        ),
        OptionRecommendation(name='pdf_page_number_map', recommended_value=None,
            help=_('Adjust page numbers, as needed. Syntax is a JavaScript expression for the page number.'
                   ' For example, "if (n < 3) 0; else n - 3;", where n is current page number.')
        ),
        OptionRecommendation(name='uncompressed_pdf',
            recommended_value=False, help=_(
                'Generate an uncompressed PDF, useful for debugging.')
        ),
        OptionRecommendation(name='pdf_odd_even_offset', recommended_value=0.0,
            level=OptionRecommendation.LOW,
            help=_(
                'Shift the text horizontally by the specified offset (in pts).'
                ' On odd numbered pages, it is shifted to the right and on even'
                ' numbered pages to the left. Use negative numbers for the opposite'
                ' effect. Note that this setting is ignored on pages where the margins'
                ' are smaller than the specified offset. Shifting is done by setting'
                ' the PDF CropBox, not all software respects the CropBox.'
            )
        ),

    }

    def specialize_options(self, log, opts, input_fmt):
        '''
        Register the fake-protocol URL scheme with Qt WebEngine, force the
        use of Qt and remember the input format.

        When ``opts.pdf_use_document_margins`` is set, all four
        ``opts.margin_*`` values are set to -1.

        :param log: Unused here
        :param opts: Conversion options (may be modified)
        :param input_fmt: Input format, stored as ``self.input_fmt``
        '''
        # Ensure Qt is setup to be used with WebEngine
        # specialize_options is called early enough in the pipeline
        # that hopefully no Qt application has been constructed as yet
        from PyQt5.QtWebEngineCore import QWebEngineUrlScheme
        from PyQt5.QtWebEngineWidgets import QWebEnginePage  # noqa
        from calibre.gui2 import must_use_qt
        from calibre.constants import FAKE_PROTOCOL
        scheme = QWebEngineUrlScheme(FAKE_PROTOCOL.encode('ascii'))
        scheme.setSyntax(QWebEngineUrlScheme.Syntax.Host)
        scheme.setFlags(QWebEngineUrlScheme.SecureScheme)
        QWebEngineUrlScheme.registerScheme(scheme)
        must_use_qt()
        self.input_fmt = input_fmt

        if opts.pdf_use_document_margins:
            # Prevent the conversion pipeline from overwriting document margins
            opts.margin_left = opts.margin_right = opts.margin_top = opts.margin_bottom = -1

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        '''
        Render ``oeb_book`` to a PDF at ``output_path``.

        Book metadata is obtained by serializing the OEB metadata into a
        throwaway OPF package and reading it back. Image collections go
        through :meth:`convert_images`, everything else through
        :meth:`convert_text`.

        :param oeb_book: The OEB book to convert
        :param output_path: Destination of the generated PDF
        :param input_plugin: Input plugin; its ``is_image_collection`` flag
            selects the conversion path
        :param opts: Conversion options
        :param log: Logger
        '''
        self.stored_page_margins = getattr(opts, '_stored_page_margins', {})

        self.oeb = oeb_book
        self.input_plugin, self.opts, self.log = input_plugin, opts, log
        self.output_path = output_path
        # Both modules export a name ``OPF``; alias them so that neither
        # import shadows the other within this method
        from calibre.ebooks.oeb.base import OPF as opf_tag, OPF2_NS
        from calibre.ebooks.metadata.opf2 import OPF as OPFReader
        from lxml import etree
        from io import BytesIO
        package = etree.Element(opf_tag('package'),
            attrib={'version': '2.0', 'unique-identifier': 'dummy'},
            nsmap={None: OPF2_NS})
        self.oeb.metadata.to_opf2(package)
        self.metadata = OPFReader(BytesIO(etree.tostring(package))).to_book_metadata()
        self.cover_data = None

        if input_plugin.is_image_collection:
            log.debug('Converting input as an image collection...')
            self.convert_images(input_plugin.get_images())
        else:
            log.debug('Converting input as a text based book...')
            self.convert_text(oeb_book)

    def convert_images(self, images):
        '''Write ``images`` to ``self.output_path`` with the image PDF writer.'''
        from calibre.ebooks.pdf.image_writer import convert
        convert(images, self.output_path, self.opts, self.metadata, self.report_progress)

    def get_cover_data(self):
        '''
        Store the data of the manifest item named by the first cover
        metadata entry in ``self.cover_data``, if such an item exists.
        '''
        oeb = self.oeb
        if not oeb.metadata.cover:
            return
        cover_id = unicode_type(oeb.metadata.cover[0])
        if cover_id in oeb.manifest.ids:
            self.cover_data = oeb.manifest.ids[cover_id].data

    def process_fonts(self):
        ''' Make sure all fonts are embeddable '''
        from calibre.ebooks.oeb.base import urlnormalize
        from calibre.utils.fonts.utils import remove_embed_restriction

        processed = set()
        for item in list(self.oeb.manifest):
            if not hasattr(item.data, 'cssRules'):
                continue
            for rule in item.data.cssRules:
                if rule.type != rule.FONT_FACE_RULE:
                    continue
                try:
                    src = rule.style.getProperty('src').propertyValue[0].uri
                except Exception:
                    # @font-face rule without a usable src: nothing to do
                    continue
                path = item.abshref(src)
                ff = self.oeb.manifest.hrefs.get(urlnormalize(path), None)
                if ff is None or path in processed:
                    continue

                # Each font file is handled at most once, even when several
                # rules reference it
                processed.add(path)
                raw = ff.data
                try:
                    nraw = remove_embed_restriction(raw)
                except Exception:
                    # Best effort: leave fonts that cannot be processed alone
                    continue
                if nraw != raw:
                    ff.data = nraw
                    self.oeb.container.write(path, nraw)

    def convert_text(self, oeb_book):
        '''
        Write ``oeb_book`` out as OEB into a temporary directory and render
        the resulting OPF to PDF.

        When document margins are in use, the stored per-file margins are
        attached to the root element of the matching manifest items as a
        JSON encoded ``data-calibre-pdf-output-page-margins`` attribute.

        :param oeb_book: The OEB book to convert
        '''
        import json
        from calibre.ebooks.pdf.html_writer import convert
        self.get_cover_data()
        self.process_fonts()

        if self.opts.pdf_use_document_margins and self.stored_page_margins:
            for href, margins in iteritems(self.stored_page_margins):
                item = oeb_book.manifest.hrefs.get(href)
                if item is not None:
                    root = item.data
                    if hasattr(root, 'xpath') and margins:
                        root.set('data-calibre-pdf-output-page-margins', json.dumps(margins))

        with TemporaryDirectory('_pdf_out') as oeb_dir:
            from calibre.customize.ui import plugin_for_output_format
            oeb_dir = os.path.realpath(oeb_dir)
            oeb_output = plugin_for_output_format('oeb')
            oeb_output.convert(oeb_book, oeb_dir, self.input_plugin, self.opts, self.log)
            opfpath = glob.glob(os.path.join(oeb_dir, '*.opf'))[0]
            convert(
                opfpath, self.opts, metadata=self.metadata, output_path=self.output_path,
                log=self.log, cover_data=self.cover_data, report_progress=self.report_progress
            )
|
||||
165
ebook_converter/ebooks/conversion/plugins/pml_input.py
Normal file
165
ebook_converter/ebooks/conversion/plugins/pml_input.py
Normal file
@@ -0,0 +1,165 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import glob
|
||||
import os
|
||||
import shutil
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin
|
||||
from calibre.ptempfile import TemporaryDirectory
|
||||
from polyglot.builtins import getcwd
|
||||
|
||||
|
||||
class PMLInput(InputFormatPlugin):
    '''
    Input plugin for PML (``.pml``) and zipped PML (``.pmlz``) books.

    Each PML document is converted to an HTML page in the current working
    directory, PNG images are copied into an ``images`` sub-directory and
    an OPF plus NCX describing the result is written next to them.
    '''

    name = 'PML Input'
    author = 'John Schember'
    description = 'Convert PML to OEB'
    # pmlz is a zip file containing pml files and png images.
    file_types = {'pml', 'pmlz'}
    commit_name = 'pml_input'

    def process_pml(self, pml_path, html_path, close_all=False):
        '''
        Convert a single PML document to HTML.

        :param pml_path: Path to the PML file, or an object with ``read``
            (it is rewound with ``seek(0)`` before being read)
        :param html_path: Path of the HTML file to create, or an object
            with ``write``
        :param close_all: Unused; kept for interface compatibility
        :return: The table of contents reported by the PML converter
        '''
        from calibre.ebooks.pml.pmlconverter import PML_HTMLizer

        pclose = False
        hclose = False
        pml_stream = html_stream = None

        try:
            if not hasattr(pml_path, 'read'):
                pml_stream = lopen(pml_path, 'rb')
                pclose = True
            else:
                pml_stream = pml_path
                pml_stream.seek(0)

            if not hasattr(html_path, 'write'):
                html_stream = lopen(html_path, 'wb')
                hclose = True
            else:
                html_stream = html_path

            # Encoding priority: explicit user option, then whatever the
            # stream advertises, then cp1252.
            ienc = getattr(pml_stream, 'encoding', None)
            if ienc is None:
                ienc = 'cp1252'
            if self.options.input_encoding:
                ienc = self.options.input_encoding

            self.log.debug('Converting PML to HTML...')
            hizer = PML_HTMLizer()
            html = hizer.parse_pml(pml_stream.read().decode(ienc), html_path)
            html = '<html><head><title></title></head><body>%s</body></html>'%html
            html_stream.write(html.encode('utf-8', 'replace'))
        finally:
            # Only close what this method opened, and do so even when the
            # conversion fails part way through.
            if pclose:
                pml_stream.close()
            if hclose:
                html_stream.close()

        return hizer.get_toc()

    def get_images(self, stream, tdir, top_level=False):
        '''
        Copy the PNG images belonging to the book into ``images/`` under
        the current working directory.

        Locations are tried in order and the first one that yields any
        ``*.png`` file wins: ``tdir`` itself (only when ``top_level``),
        ``tdir/<book name>_img`` (only when ``stream`` has a ``name``) and
        finally ``tdir/images``.

        :return: List of ``images/<file name>`` hrefs of the copied images
        '''
        images = []
        imgs = []

        if top_level:
            imgs = glob.glob(os.path.join(tdir, '*.png'))
        # Images not in top level try bookname_img directory because
        # that's where Dropbook likes to see them.
        if not imgs:
            if hasattr(stream, 'name'):
                book_name = os.path.splitext(os.path.basename(stream.name))[0]
                imgs = glob.glob(os.path.join(tdir, book_name + '_img', '*.png'))
        # No images in Dropbook location try generic images directory
        if not imgs:
            imgs = glob.glob(os.path.join(os.path.join(tdir, 'images'), '*.png'))

        images_dir = os.path.join(getcwd(), 'images')
        # Do not fail when the directory is already there.
        if imgs and not os.path.isdir(images_dir):
            os.makedirs(images_dir)

        for img in imgs:
            pimg_name = os.path.basename(img)
            pimg_path = os.path.join(getcwd(), 'images', pimg_name)

            images.append('images/' + pimg_name)

            shutil.copy(img, pimg_path)

        return images

    def convert(self, stream, options, file_ext, log,
            accelerators):
        '''
        Convert the PML/PMLZ ``stream`` and return the absolute path of the
        generated ``metadata.opf``.

        For ``pmlz`` input the archive is unpacked into a temporary
        directory and every top level ``*.pml`` file becomes one HTML page;
        otherwise the stream itself is converted to ``index.html``.
        '''
        from calibre.ebooks.metadata.toc import TOC
        from calibre.ebooks.metadata.opf2 import OPFCreator
        from calibre.utils.zipfile import ZipFile

        self.options = options
        self.log = log
        pages, images = [], []
        toc = TOC()

        if file_ext == 'pmlz':
            log.debug('De-compressing content to temporary directory...')
            with TemporaryDirectory('_unpmlz') as tdir:
                zf = ZipFile(stream)
                try:
                    zf.extractall(tdir)
                finally:
                    # NOTE(review): assumes close() leaves a caller supplied
                    # stream open, as the stdlib ZipFile does; stream is
                    # read again below for the metadata.
                    zf.close()

                pmls = glob.glob(os.path.join(tdir, '*.pml'))
                for pml in pmls:
                    html_name = os.path.splitext(os.path.basename(pml))[0]+'.html'
                    html_path = os.path.join(getcwd(), html_name)

                    pages.append(html_name)
                    log.debug('Processing PML item %s...' % pml)
                    ttoc = self.process_pml(pml, html_path)
                    toc += ttoc
                # Must happen while the temporary directory still exists.
                images = self.get_images(stream, tdir, True)
        else:
            toc = self.process_pml(stream, 'index.html')
            pages.append('index.html')

            if hasattr(stream, 'name'):
                images = self.get_images(stream, os.path.abspath(os.path.dirname(stream.name)))

        # We want pages to be ordered alphabetically.
        pages.sort()

        manifest_items = [(item, None) for item in pages+images]

        from calibre.ebooks.metadata.meta import get_metadata
        log.debug('Reading metadata from input file...')
        mi = get_metadata(stream, 'pml')
        if 'images/cover.png' in images:
            mi.cover = 'images/cover.png'
        opf = OPFCreator(getcwd(), mi)
        log.debug('Generating manifest...')
        opf.create_manifest(manifest_items)
        opf.create_spine(pages)
        opf.set_toc(toc)
        with lopen('metadata.opf', 'wb') as opffile:
            with lopen('toc.ncx', 'wb') as tocfile:
                opf.render(opffile, tocfile, 'toc.ncx')

        return os.path.join(getcwd(), 'metadata.opf')

    def postprocess_book(self, oeb, opts, log):
        '''
        Simplify the headings of every spine document.

        An empty leading ``<span>`` that only carries an id is folded into
        the heading, and a single centered ``<div>`` child is replaced by a
        ``text-align: center`` style on the heading itself.
        '''
        from calibre.ebooks.oeb.base import XHTML, barename
        for item in oeb.spine:
            if hasattr(item.data, 'xpath'):
                for heading in item.data.iterdescendants(*map(XHTML, 'h1 h2 h3 h4 h5 h6'.split())):
                    if not len(heading):
                        continue
                    span = heading[0]
                    if not heading.text and not span.text and not len(span) and barename(span.tag) == 'span':
                        if not heading.get('id') and span.get('id'):
                            heading.set('id', span.get('id'))
                            # Take over the tail before removing the span,
                            # otherwise the text after it is lost.
                            heading.text = span.tail
                            heading.remove(span)
                    if len(heading) == 1 and heading[0].get('style') == 'text-align: center; margin: auto;':
                        div = heading[0]
                        if barename(div.tag) == 'div' and not len(div) and not div.get('id') and not heading.get('style'):
                            heading.text = (heading.text or '') + (div.text or '') + (div.tail or '')
                            heading.remove(div)
                            heading.set('style', 'text-align: center')
|
||||
77
ebook_converter/ebooks/conversion/plugins/pml_output.py
Normal file
77
ebook_converter/ebooks/conversion/plugins/pml_output.py
Normal file
@@ -0,0 +1,77 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os, io
|
||||
|
||||
from calibre.customize.conversion import (OutputFormatPlugin,
|
||||
OptionRecommendation)
|
||||
from calibre.ptempfile import TemporaryDirectory
|
||||
from polyglot.builtins import unicode_type
|
||||
|
||||
|
||||
class PMLOutput(OutputFormatPlugin):
    '''
    Output plugin producing PMLZ: a zip archive holding ``index.pml`` and
    an ``index_img`` directory with the book's images as PNG files.
    '''

    name = 'PML Output'
    author = 'John Schember'
    file_type = 'pmlz'
    commit_name = 'pml_output'

    options = {
        OptionRecommendation(name='pml_output_encoding', recommended_value='cp1252',
            level=OptionRecommendation.LOW,
            help=_('Specify the character encoding of the output document. '
            'The default is cp1252.')),
        OptionRecommendation(name='inline_toc',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Add Table of Contents to beginning of the book.')),
        OptionRecommendation(name='full_image_depth',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Do not reduce the size or bit depth of images. Images '
                   'have their size and depth reduced by default to accommodate '
                   'applications that can not convert images on their '
                   'own such as Dropbook.')),
    }

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        '''
        Write ``oeb_book`` as a PMLZ archive to ``output_path``.

        The PML text is encoded with ``opts.pml_output_encoding``;
        characters that cannot be encoded are replaced.
        '''
        from calibre.ebooks.pml.pmlml import PMLMLizer
        from calibre.utils.zipfile import ZipFile

        with TemporaryDirectory('_pmlz_output') as tdir:
            pmlmlizer = PMLMLizer(log)
            pml = unicode_type(pmlmlizer.extract_content(oeb_book, opts))
            with lopen(os.path.join(tdir, 'index.pml'), 'wb') as out:
                out.write(pml.encode(opts.pml_output_encoding, 'replace'))

            img_path = os.path.join(tdir, 'index_img')
            if not os.path.exists(img_path):
                os.makedirs(img_path)
            self.write_images(oeb_book.manifest, pmlmlizer.image_hrefs, img_path, opts)

            log.debug('Compressing output...')
            pmlz = ZipFile(output_path, 'w')
            try:
                pmlz.add_dir(tdir)
            finally:
                # Close explicitly so the archive is finalized here and not
                # whenever the object happens to be garbage collected.
                pmlz.close()

    def write_images(self, manifest, image_hrefs, out_dir, opts):
        '''
        Save every raster image of ``manifest`` that is listed in
        ``image_hrefs`` as a PNG file in ``out_dir``.

        Unless ``opts.full_image_depth`` is set, images are converted to
        palette mode and shrunk to fit within 300x300 pixels.

        :param image_hrefs: Mapping of manifest href to output file name
        '''
        from PIL import Image

        from calibre.ebooks.oeb.base import OEB_RASTER_IMAGES
        for item in manifest:
            if item.media_type not in OEB_RASTER_IMAGES or item.href not in image_hrefs:
                continue

            im = Image.open(io.BytesIO(item.data))
            if not opts.full_image_depth:
                im = im.convert('P')
                im.thumbnail((300,300), Image.ANTIALIAS)

            data = io.BytesIO()
            im.save(data, 'PNG')
            data = data.getvalue()

            path = os.path.join(out_dir, image_hrefs[item.href])

            with lopen(path, 'wb') as out:
                out.write(data)
|
||||
28
ebook_converter/ebooks/conversion/plugins/rb_input.py
Normal file
28
ebook_converter/ebooks/conversion/plugins/rb_input.py
Normal file
@@ -0,0 +1,28 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin
|
||||
from polyglot.builtins import getcwd
|
||||
|
||||
|
||||
class RBInput(InputFormatPlugin):
    '''
    Input plugin for RB files; the actual work is done by the RB reader.
    '''

    name = 'RB Input'
    author = 'John Schember'
    description = 'Convert RB files to HTML'
    file_types = {'rb'}
    commit_name = 'rb_input'

    def convert(self, stream, options, file_ext, log,
            accelerators):
        '''
        Extract the content of the RB ``stream`` into the current working
        directory and return whatever the reader's ``extract_content``
        returns.
        '''
        from calibre.ebooks.rb.reader import Reader

        rb_reader = Reader(stream, log, options.input_encoding)
        return rb_reader.extract_content(getcwd())
|
||||
45
ebook_converter/ebooks/conversion/plugins/rb_output.py
Normal file
45
ebook_converter/ebooks/conversion/plugins/rb_output.py
Normal file
@@ -0,0 +1,45 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os
|
||||
|
||||
from calibre.customize.conversion import OutputFormatPlugin, OptionRecommendation
|
||||
|
||||
|
||||
class RBOutput(OutputFormatPlugin):
    '''
    Output plugin that writes an OEB book as an RB file.
    '''

    name = 'RB Output'
    author = 'John Schember'
    file_type = 'rb'
    commit_name = 'rb_output'

    options = {
        OptionRecommendation(name='inline_toc',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Add Table of Contents to beginning of the book.'))}

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        '''
        Write ``oeb_book`` to ``output_path``.

        :param output_path: Either a file system path (missing parent
            directories are created) or an object with ``write``. A passed
            in stream is rewound and truncated but left open.
        '''
        from calibre.ebooks.rb.writer import RBWriter

        close = False
        if not hasattr(output_path, 'write'):
            out_dir = os.path.dirname(output_path)
            if out_dir and not os.path.exists(out_dir):
                os.makedirs(out_dir)
            out_stream = lopen(output_path, 'wb')
            close = True
        else:
            out_stream = output_path

        try:
            writer = RBWriter(opts, log)

            out_stream.seek(0)
            out_stream.truncate()

            writer.write_content(oeb_book, out_stream, oeb_book.metadata)
        finally:
            # Only close streams opened here, also when writing fails.
            if close:
                out_stream.close()
|
||||
169
ebook_converter/ebooks/conversion/plugins/recipe_input.py
Normal file
169
ebook_converter/ebooks/conversion/plugins/recipe_input.py
Normal file
@@ -0,0 +1,169 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
|
||||
from calibre.constants import numeric_version
|
||||
from calibre import walk
|
||||
from polyglot.builtins import unicode_type
|
||||
|
||||
|
||||
class RecipeDisabled(Exception):
    '''
    Raised when a recipe has a ``recipe_disabled`` attribute that is not
    None; the value of that attribute is the exception argument.
    '''
|
||||
|
||||
|
||||
class RecipeInput(InputFormatPlugin):
    '''
    Input plugin that runs a news download recipe (``.recipe``) or reuses
    an already downloaded one (``.downloaded_recipe``) and hands the
    resulting OPF to the conversion pipeline.
    '''

    name = 'Recipe Input'
    author = 'Kovid Goyal'
    description = _('Download periodical content from the internet')
    file_types = {'recipe', 'downloaded_recipe'}
    commit_name = 'recipe_input'

    recommendations = {
        ('chapter', None, OptionRecommendation.HIGH),
        ('dont_split_on_page_breaks', True, OptionRecommendation.HIGH),
        ('use_auto_toc', False, OptionRecommendation.HIGH),
        ('input_encoding', None, OptionRecommendation.HIGH),
        ('input_profile', 'default', OptionRecommendation.HIGH),
        ('page_breaks_before', None, OptionRecommendation.HIGH),
        ('insert_metadata', False, OptionRecommendation.HIGH),
        }

    options = {
        OptionRecommendation(name='test', recommended_value=False,
            help=_(
            'Useful for recipe development. Forces'
            ' max_articles_per_feed to 2 and downloads at most 2 feeds.'
            ' You can change the number of feeds and articles by supplying optional arguments.'
            ' For example: --test 3 1 will download at most 3 feeds and only 1 article per feed.')),
        OptionRecommendation(name='username', recommended_value=None,
            help=_('Username for sites that require a login to access '
                'content.')),
        OptionRecommendation(name='password', recommended_value=None,
            help=_('Password for sites that require a login to access '
                'content.')),
        OptionRecommendation(name='dont_download_recipe',
            recommended_value=False,
            help=_('Do not download latest version of builtin recipes from the calibre server')),
        OptionRecommendation(name='lrf', recommended_value=False,
            help='Optimize fetching for subsequent conversion to LRF.'),
        }

    def convert(self, recipe_or_file, opts, file_ext, log,
            accelerators):
        '''
        Obtain the recipe, run it and return the path of the OPF it made.

        The recipe source comes from, in order of precedence: the archive
        itself for ``downloaded_recipe`` input (nothing is downloaded
        again), the recipe named by the ``CALIBRE_RECIPE_URN`` environment
        variable, the readable file ``recipe_or_file``, or the builtin
        recipe whose title matches the input name.

        :return: Absolute path of the first ``.opf`` file found in the
            current directory or, failing that, anywhere below it; None if
            there is none
        :raises ValueError: If no usable recipe can be found
        :raises RecipeDisabled: If the recipe is marked as disabled
        '''
        from calibre.web.feeds.recipes import compile_recipe
        opts.output_profile.flow_size = 0
        if file_ext == 'downloaded_recipe':
            from calibre.utils.zipfile import ZipFile
            zf = ZipFile(recipe_or_file, 'r')
            try:
                zf.extractall()
            finally:
                zf.close()
            with lopen('download.recipe', 'rb') as f:
                self.recipe_source = f.read()
            recipe = compile_recipe(self.recipe_source)
            recipe.needs_subscription = False
            self.recipe_object = recipe(opts, log, self.report_progress)
        else:
            if os.environ.get('CALIBRE_RECIPE_URN'):
                from calibre.web.feeds.recipes.collection import get_custom_recipe, get_builtin_recipe_by_id
                urn = os.environ['CALIBRE_RECIPE_URN']
                log('Downloading recipe urn: ' + urn)
                rtype, recipe_id = urn.partition(':')[::2]
                if not recipe_id:
                    raise ValueError('Invalid recipe urn: ' + urn)
                if rtype == 'custom':
                    self.recipe_source = get_custom_recipe(recipe_id)
                else:
                    self.recipe_source = get_builtin_recipe_by_id(urn, log=log, download_recipe=True)
                if not self.recipe_source:
                    raise ValueError('Could not find recipe with urn: ' + urn)
                if not isinstance(self.recipe_source, bytes):
                    self.recipe_source = self.recipe_source.encode('utf-8')
                recipe = compile_recipe(self.recipe_source)
            elif os.access(recipe_or_file, os.R_OK):
                with lopen(recipe_or_file, 'rb') as f:
                    self.recipe_source = f.read()
                recipe = compile_recipe(self.recipe_source)
                log('Using custom recipe')
            else:
                from calibre.web.feeds.recipes.collection import (
                        get_builtin_recipe_by_title, get_builtin_recipe_titles)
                title = getattr(opts, 'original_recipe_input_arg', recipe_or_file)
                title = os.path.basename(title).rpartition('.')[0]
                titles = frozenset(get_builtin_recipe_titles())
                if title not in titles:
                    # Retry without stripping the directory part.
                    title = getattr(opts, 'original_recipe_input_arg', recipe_or_file)
                    title = title.rpartition('.')[0]

                raw = get_builtin_recipe_by_title(title, log=log,
                        download_recipe=not opts.dont_download_recipe)
                builtin = False
                try:
                    recipe = compile_recipe(raw)
                    self.recipe_source = raw
                    if recipe.requires_version > numeric_version:
                        # The version is ordered against numeric_version, so
                        # its items may not be strings: convert them before
                        # joining instead of failing inside the warning.
                        log.warn(
                        'Downloaded recipe needs calibre version at least: %s' %
                        ('.'.join(map(unicode_type, recipe.requires_version))))
                        builtin = True
                except Exception:
                    log.exception('Failed to compile downloaded recipe. Falling '
                            'back to builtin one')
                    builtin = True
                if builtin:
                    log('Using bundled builtin recipe')
                    raw = get_builtin_recipe_by_title(title, log=log,
                            download_recipe=False)
                    if raw is None:
                        raise ValueError('Failed to find builtin recipe: '+title)
                    recipe = compile_recipe(raw)
                    self.recipe_source = raw
                else:
                    log('Using downloaded builtin recipe')

            if recipe is None:
                raise ValueError('%r is not a valid recipe file or builtin recipe' %
                        recipe_or_file)

            disabled = getattr(recipe, 'recipe_disabled', None)
            if disabled is not None:
                raise RecipeDisabled(disabled)
            ro = recipe(opts, log, self.report_progress)
            ro.download()
            self.recipe_object = ro

        # Conversion options requested by the recipe override opts.
        for key, val in self.recipe_object.conversion_options.items():
            setattr(opts, key, val)

        for f in os.listdir('.'):
            if f.endswith('.opf'):
                return os.path.abspath(f)

        for f in walk('.'):
            if f.endswith('.opf'):
                return os.path.abspath(f)

    def postprocess_book(self, oeb, opts, log):
        '''
        Let the recipe object, if there is one, post-process the book.
        '''
        if self.recipe_object is not None:
            self.recipe_object.internal_postprocess_book(oeb, opts, log)
            self.recipe_object.postprocess_book(oeb, opts, log)

    def specialize(self, oeb, opts, log, output_fmt):
        '''
        Remove every ``calibre_navbar`` div from the spine documents when
        ``opts.no_inline_navbars`` is set.
        '''
        if opts.no_inline_navbars:
            from calibre.ebooks.oeb.base import XPath
            for item in oeb.spine:
                for div in XPath('//h:div[contains(@class, "calibre_navbar")]')(item.data):
                    div.getparent().remove(div)

    def save_download(self, zf):
        '''
        Store the recipe source as ``download.recipe`` in the zip file
        ``zf``, UTF-8 encoding it first if it is text.
        '''
        raw = self.recipe_source
        if isinstance(raw, unicode_type):
            raw = raw.encode('utf-8')
        zf.writestr('download.recipe', raw)
|
||||
323
ebook_converter/ebooks/conversion/plugins/rtf_input.py
Normal file
323
ebook_converter/ebooks/conversion/plugins/rtf_input.py
Normal file
@@ -0,0 +1,323 @@
|
||||
from __future__ import with_statement, unicode_literals
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
import os, glob, re, textwrap
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
|
||||
from polyglot.builtins import iteritems, filter, getcwd, as_bytes
|
||||
|
||||
# Maps the cell border style names found in the intermediate XML to CSS
# border-style keywords. Names missing from this table are rendered as
# 'solid' by the lookup in RTFInput.convert_borders.
border_style_map = {
    'single': 'solid',
    'double-thickness-border': 'double',
    'shadowed-border': 'outset',
    'double-border': 'double',
    'dotted-border': 'dotted',
    'dashed': 'dashed',
    'hairline': 'solid',
    'inset': 'inset',
    'dash-small': 'dashed',
    'dot-dash': 'dotted',
    'dot-dot-dash': 'dotted',
    'outset': 'outset',
    # Both spellings are mapped.
    'tripple': 'double',
    'triple': 'double',
    'thick-thin-small': 'solid',
    'thin-thick-small': 'solid',
    'thin-thick-thin-small': 'solid',
    'thick-thin-medium': 'solid',
    'thin-thick-medium': 'solid',
    'thin-thick-thin-medium': 'solid',
    'thick-thin-large': 'solid',
    'thin-thick-thin-large': 'solid',
    'wavy': 'ridge',
    'double-wavy': 'ridge',
    'striped': 'ridge',
    'emboss': 'inset',
    'engrave': 'inset',
    'frame': 'ridge',
}
|
||||
|
||||
|
||||
class RTFInput(InputFormatPlugin):
    '''
    Input plugin for RTF files.

    The RTF is converted to XML by rtf2xml, embedded pictures are written
    out as image files and the XML is transformed to XHTML with the
    ``templates/rtf.xsl`` stylesheet. All files are created in the current
    working directory.
    '''

    name = 'RTF Input'
    author = 'Kovid Goyal'
    description = 'Convert RTF files to HTML'
    file_types = {'rtf'}
    commit_name = 'rtf_input'

    options = {
        OptionRecommendation(name='ignore_wmf', recommended_value=False,
            help=_('Ignore WMF images instead of replacing them with a placeholder image.')),
    }

    def generate_xml(self, stream):
        '''
        Run rtf2xml on ``stream`` and return the produced XML as bytes.

        When the ``debug_pipeline`` option is set the parser is run in
        debug mode with ``rtfdebug`` as its debug directory, provided that
        directory can be created.

        :param stream: Passed to the parser as its input file; ``convert``
            passes the path of the RTF file
        '''
        from calibre.ebooks.rtf2xml.ParseRtf import ParseRtf
        ofile = u'dataxml.xml'
        run_lev, debug_dir, indent_out = 1, None, 0
        if getattr(self.opts, 'debug_pipeline', None) is not None:
            try:
                os.mkdir(u'rtfdebug')
                debug_dir = u'rtfdebug'
                run_lev = 4
                indent_out = 1
                self.log('Running RTFParser in debug mode')
            except Exception:
                self.log.warn('Impossible to run RTFParser in debug mode')
        parser = ParseRtf(
            in_file=stream,
            out_file=ofile,
            # Convert symbol fonts to unicode equivalents. Default
            # is 1
            convert_symbol=1,

            # Convert Zapf fonts to unicode equivalents. Default
            # is 1.
            convert_zapf=1,

            # Convert Wingding fonts to unicode equivalents.
            # Default is 1.
            convert_wingdings=1,

            # Convert RTF caps to real caps.
            # Default is 1.
            convert_caps=1,

            # Indent resulting XML.
            # Default is 0 (no indent).
            indent=indent_out,

            # Form lists from RTF. Default is 1.
            form_lists=1,

            # Convert headings to sections. Default is 0.
            headings_to_sections=1,

            # Group paragraphs with the same style name. Default is 1.
            group_styles=1,

            # Group borders. Default is 1.
            group_borders=1,

            # Write or do not write paragraphs. Default is 0.
            empty_paragraphs=1,

            # Debug
            deb_dir=debug_dir,

            # Default encoding
            default_encoding=getattr(self.opts, 'input_encoding', 'cp1252') or 'cp1252',

            # Run level
            run_level=run_lev,
        )
        parser.parse_rtf()
        with open(ofile, 'rb') as f:
            return f.read()

    def extract_images(self, picts):
        '''
        Write the hex encoded pictures found in the file ``picts`` to
        numbered image files.

        Data whose format is not recognized is saved with a ``.wmf``
        extension.

        :return: Dict mapping the 1-based picture number to the name of the
            image file, after WMF conversion
        '''
        from calibre.utils.imghdr import what
        from binascii import unhexlify
        self.log('Extracting images...')

        with open(picts, 'rb') as f:
            raw = f.read()
        picts = filter(len, re.findall(br'\{\\pict([^}]+)\}', raw))
        hex_pat = re.compile(br'[^a-fA-F0-9]')
        encs = [hex_pat.sub(b'', pict) for pict in picts]

        count = 0
        imap = {}
        for enc in encs:
            # unhexlify needs an even number of digits
            if len(enc) % 2 == 1:
                enc = enc[:-1]
            data = unhexlify(enc)
            fmt = what(None, data)
            if fmt is None:
                fmt = 'wmf'
            count += 1
            name = u'%04d.%s' % (count, fmt)
            with open(name, 'wb') as f:
                f.write(data)
            imap[count] = name
        return self.convert_images(imap)

    def convert_images(self, imap):
        '''
        Run ``convert_image`` on every value of ``imap`` in place and
        return ``imap``. Entries that fail to convert are logged and left
        unchanged.
        '''
        self.default_img = None
        for count, val in iteritems(imap):
            try:
                imap[count] = self.convert_image(val)
            except Exception:
                self.log.exception('Failed to convert', val)
        return imap

    def convert_image(self, name):
        '''
        Return the name of a usable image for ``name``: the name itself for
        anything but WMF, otherwise the rasterized image or, if that fails,
        the result of ``replace_wmf``.
        '''
        if not name.endswith('.wmf'):
            return name
        try:
            return self.rasterize_wmf(name)
        except Exception:
            self.log.exception('Failed to convert WMF image %r'%name)
        return self.replace_wmf(name)

    def replace_wmf(self, name):
        '''
        Handle a WMF image that could not be rasterized.

        With the ``ignore_wmf`` option the file is deleted and the marker
        ``'__REMOVE_ME__'`` is returned; otherwise a placeholder image is
        written under the same name with a ``.jpg`` extension and that name
        is returned.
        '''
        if self.opts.ignore_wmf:
            os.remove(name)
            return '__REMOVE_ME__'
        from calibre.ebooks.covers import message_image
        if self.default_img is None:
            # Generated once and reused for every unsupported image.
            self.default_img = message_image('Conversion of WMF images is not supported.'
            ' Use Microsoft Word or OpenOffice to save this RTF file'
            ' as HTML and convert that in calibre.')
        name = name.replace('.wmf', '.jpg')
        with lopen(name, 'wb') as f:
            f.write(self.default_img)
        return name

    def rasterize_wmf(self, name):
        '''
        Write the data ``wmf_unwrap`` extracts from the WMF file ``name``
        to a file of the same name with a ``.png`` extension and return
        that name.
        '''
        from calibre.utils.wmf.parse import wmf_unwrap
        with open(name, 'rb') as f:
            data = f.read()
        data = wmf_unwrap(data)
        name = name.replace('.wmf', '.png')
        with open(name, 'wb') as f:
            f.write(data)
        return name

    def write_inline_css(self, ic, border_styles):
        '''
        Append the CSS for inline formatting to ``styles.css``.

        :param ic: Object whose ``font_sizes`` and ``colors`` sequences
            define the ``span.fsN`` and ``span.colN`` classes
        :param border_styles: Dict of class name to CSS declarations, as
            returned by ``convert_borders``
        '''
        font_size_classes = ['span.fs%d { font-size: %spt }'%(i, x) for i, x in
            enumerate(ic.font_sizes)]
        color_classes = ['span.col%d { color: %s }'%(i, x) for i, x in
            enumerate(ic.colors) if x != 'false']
        css = textwrap.dedent('''
        span.none {
            text-decoration: none; font-weight: normal;
            font-style: normal; font-variant: normal
        }

        span.italics { font-style: italic }

        span.bold { font-weight: bold }

        span.small-caps { font-variant: small-caps }

        span.underlined { text-decoration: underline }

        span.strike-through { text-decoration: line-through }

        ''')
        css += '\n'+'\n'.join(font_size_classes)
        css += '\n' +'\n'.join(color_classes)

        for cls, val in iteritems(border_styles):
            css += '\n\n.%s {\n%s\n}'%(cls, val)

        with open(u'styles.css', 'ab') as f:
            f.write(css.encode('utf-8'))

    def convert_borders(self, doc):
        '''
        Give every ``cell`` element of ``doc`` a ``border_styleN`` class
        describing its borders.

        Cells with identical border declarations share a class.

        :return: Dict mapping each class name to its CSS declarations
        '''
        style_index = {}  # declarations -> number, in order of appearance
        style_map = {}
        for elem in doc.xpath(r'//*[local-name()="cell"]'):
            style = ['border-style: hidden', 'border-width: 1px',
                    'border-color: black']
            for x in ('bottom', 'top', 'left', 'right'):
                bs = elem.get('border-cell-%s-style'%x, None)
                if bs:
                    cbs = border_style_map.get(bs, 'solid')
                    style.append('border-%s-style: %s'%(x, cbs))
                bw = elem.get('border-cell-%s-line-width'%x, None)
                if bw:
                    style.append('border-%s-width: %spt'%(x, bw))
                bc = elem.get('border-cell-%s-color'%x, None)
                if bc:
                    style.append('border-%s-color: %s'%(x, bc))
            style = ';\n'.join(style)
            idx = style_index.setdefault(style, len(style_index))
            cls = 'border_style%d'%idx
            style_map[cls] = style
            elem.set('class', cls)
        return style_map

    def convert(self, stream, options, file_ext, log,
            accelerators):
        '''
        Convert the RTF file behind ``stream`` and return the absolute
        path of the generated ``metadata.opf``.

        :param stream: Open RTF file; its ``name`` is given to the parser
            and the stream itself is used to read the metadata
        :raises ValueError: If the RTF contains code the parser rejects
        '''
        from lxml import etree
        from calibre.ebooks.metadata.meta import get_metadata
        from calibre.ebooks.metadata.opf2 import OPFCreator
        from calibre.ebooks.rtf2xml.ParseRtf import RtfInvalidCodeException
        from calibre.ebooks.rtf.input import InlineClass
        from calibre.utils.xml_parse import safe_xml_fromstring
        self.opts = options
        self.log = log
        self.log('Converting RTF to XML...')
        try:
            xml = self.generate_xml(stream.name)
        except RtfInvalidCodeException as e:
            self.log.exception('Unable to parse RTF')
            raise ValueError(_('This RTF file has a feature calibre does not '
            'support. Convert it to HTML first and then try it.\n%s')%e)

        # Always bound, so that the lookup below also works for documents
        # without a picts file or whose images could not be extracted.
        imap = {}
        d = glob.glob(os.path.join('*_rtf_pict_dir', 'picts.rtf'))
        if d:
            try:
                imap = self.extract_images(d[0])
            except Exception:
                self.log.exception('Failed to extract images...')

        self.log('Parsing XML...')
        doc = safe_xml_fromstring(xml)
        border_styles = self.convert_borders(doc)
        # Replace picture numbers by the names of the extracted files.
        for pict in doc.xpath('//rtf:pict[@num]',
                namespaces={'rtf':'http://rtf2xml.sourceforge.net/'}):
            num = int(pict.get('num'))
            name = imap.get(num, None)
            if name is not None:
                pict.set('num', name)

        self.log('Converting XML to HTML...')
        inline_class = InlineClass(self.log)
        styledoc = safe_xml_fromstring(P('templates/rtf.xsl', data=True), recover=False)
        extensions = {('calibre', 'inline-class') : inline_class}
        transform = etree.XSLT(styledoc, extensions=extensions)
        result = transform(doc)
        html = u'index.xhtml'
        with open(html, 'wb') as f:
            res = as_bytes(transform.tostring(result))
            # clean multiple \n
            res = re.sub(b'\n+', b'\n', res)
            f.write(res)
        self.write_inline_css(inline_class, border_styles)
        stream.seek(0)
        mi = get_metadata(stream, 'rtf')
        if not mi.title:
            mi.title = _('Unknown')
        if not mi.authors:
            mi.authors = [_('Unknown')]
        opf = OPFCreator(getcwd(), mi)
        opf.create_manifest([(u'index.xhtml', None)])
        opf.create_spine([u'index.xhtml'])
        with open(u'metadata.opf', 'wb') as opf_file:
            opf.render(opf_file)
        return os.path.abspath(u'metadata.opf')

    def postprocess_book(self, oeb, opts, log):
        '''
        Remove the images marked with ``__REMOVE_ME__`` by ``replace_wmf``
        from all spine documents, keeping the text that followed them.
        '''
        for item in oeb.spine:
            for img in item.data.xpath('//*[local-name()="img" and @src="__REMOVE_ME__"]'):
                p = img.getparent()
                idx = p.index(img)
                p.remove(img)
                if img.tail:
                    # Attach the orphaned tail text to whatever now
                    # precedes the position of the removed image.
                    if idx == 0:
                        p.text = (p.text or '') + img.tail
                    else:
                        p[idx-1].tail = (p[idx-1].tail or '') + img.tail
|
||||
40
ebook_converter/ebooks/conversion/plugins/rtf_output.py
Normal file
40
ebook_converter/ebooks/conversion/plugins/rtf_output.py
Normal file
@@ -0,0 +1,40 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os
|
||||
|
||||
from calibre.customize.conversion import OutputFormatPlugin
|
||||
|
||||
|
||||
class RTFOutput(OutputFormatPlugin):
    '''
    Output plugin that writes an OEB book as an RTF file.
    '''

    name = 'RTF Output'
    author = 'John Schember'
    file_type = 'rtf'
    commit_name = 'rtf_output'

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        '''
        Write ``oeb_book`` as RTF to ``output_path``.

        The content is written ASCII encoded; characters that cannot be
        encoded are replaced.

        :param output_path: Either a file system path (missing parent
            directories are created) or an object with ``write``. A passed
            in stream is rewound and truncated but left open.
        '''
        from calibre.ebooks.rtf.rtfml import RTFMLizer

        rtfmlitzer = RTFMLizer(log)
        content = rtfmlitzer.extract_content(oeb_book, opts)

        close = False
        if not hasattr(output_path, 'write'):
            out_dir = os.path.dirname(output_path)
            if out_dir != '' and not os.path.exists(out_dir):
                os.makedirs(out_dir)
            out_stream = lopen(output_path, 'wb')
            close = True
        else:
            out_stream = output_path

        try:
            out_stream.seek(0)
            out_stream.truncate()
            out_stream.write(content.encode('ascii', 'replace'))
        finally:
            # Only close streams opened here, also when writing fails.
            if close:
                out_stream.close()
|
||||
122
ebook_converter/ebooks/conversion/plugins/snb_input.py
Normal file
122
ebook_converter/ebooks/conversion/plugins/snb_input.py
Normal file
@@ -0,0 +1,122 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2010, Li Fanxi <lifanxi@freemindworld.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin
|
||||
from calibre.ptempfile import TemporaryDirectory
|
||||
from calibre.utils.filenames import ascii_filename
|
||||
from polyglot.builtins import unicode_type
|
||||
|
||||
# Skeleton of each generated chapter page: the first %s receives the chapter
# name (used as <title>), the second %s receives the chapter body markup.
HTML_TEMPLATE = '<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/><title>%s</title></head><body>\n%s\n</body></html>'
|
||||
|
||||
|
||||
def html_encode(s):
    """Escape ``s`` so it can be embedded in the generated chapter HTML.

    The five XML special characters are replaced by their entities, newlines
    become ``<br/>`` and spaces become ``&nbsp;``.

    :param s: The text to escape.
    :return: The escaped text.
    """
    # '&' must be handled first so the entities added below are not
    # escaped a second time.
    s = s.replace('&', '&amp;')
    s = s.replace('<', '&lt;').replace('>', '&gt;')
    s = s.replace('"', '&quot;').replace("'", '&apos;')
    # Done after '<' and '>' are escaped so the inserted tag survives.
    s = s.replace('\n', '<br/>')
    return s.replace(' ', '&nbsp;')
|
||||
|
||||
|
||||
class SNBInput(InputFormatPlugin):
    """Input plugin that converts an SNB archive into an OEB book."""

    name = 'SNB Input'
    author = 'Li Fanxi'
    description = 'Convert SNB files to OEB'
    file_types = {'snb'}
    commit_name = 'snb_input'

    options = set()

    def convert(self, stream, options, file_ext, log,
                accelerators):
        """Parse the SNB archive in ``stream`` and return an OEB book.

        Metadata is read from ``snbf/book.snbf`` and the chapter list from
        ``snbf/toc.snbf``. Each chapter found under ``snbc/`` is written as
        a ``ch_<n>.htm`` file into a temporary directory that becomes the
        book's container.

        :param stream: The SNB data, handed to ``SNBFile.Parse``.
        :param options: Conversion options; ``input_encoding`` is read here.
        :param file_ext: Not used by this method.
        :param log: Logger used for progress messages.
        :param accelerators: Not used by this method.
        :return: The populated OEB book.
        :raises ValueError: If the file cannot be parsed or is not valid.
        """
        import uuid

        from calibre.ebooks.oeb.base import DirContainer
        from calibre.ebooks.snb.snbfile import SNBFile
        from calibre.utils.xml_parse import safe_xml_fromstring

        log.debug("Parsing SNB file...")
        snbFile = SNBFile()
        try:
            snbFile.Parse(stream)
        except Exception:
            # Any parse failure is reported as an invalid file; interpreter
            # level signals (KeyboardInterrupt, SystemExit) still propagate.
            raise ValueError("Invalid SNB file")
        if not snbFile.IsValid():
            log.debug("Invalid SNB file")
            raise ValueError("Invalid SNB file")
        log.debug("Handle meta data ...")
        from calibre.ebooks.conversion.plumber import create_oebbook
        oeb = create_oebbook(log, None, options,
                encoding=options.input_encoding, populate=False)
        meta = snbFile.GetFileStream('snbf/book.snbf')
        if meta is not None:
            meta = safe_xml_fromstring(meta)
            xpaths = (
                ('title', './/head/name'),
                ('creator', './/head/author'),
                ('language', './/head/language'),
                ('generator', './/head/generator'),
                ('publisher', './/head/publisher'),
                ('cover', './/head/cover'),
            )
            d = {}
            for key, xpath in xpaths:
                node = meta.find(xpath)
                # A missing element and an empty element both yield ''.
                if node is not None and node.text is not None:
                    d[key] = node.text
                else:
                    d[key] = ''

            oeb.metadata.add('title', d['title'])
            oeb.metadata.add('creator', d['creator'], attrib={'role':'aut'})
            oeb.metadata.add('language', d['language'].lower().replace('_', '-'))
            oeb.metadata.add('generator', d['generator'])
            oeb.metadata.add('publisher', d['publisher'])
            if d['cover'] != '':
                oeb.guide.add('cover', 'Cover', d['cover'])

        bookid = unicode_type(uuid.uuid4())
        oeb.metadata.add('identifier', bookid, id='uuid_id', scheme='uuid')
        for ident in oeb.metadata.identifier:
            if 'id' in ident.attrib:
                oeb.uid = oeb.metadata.identifier[0]
                break

        with TemporaryDirectory('_snb2oeb', keep=True) as tdir:
            log.debug('Process TOC ...')
            toc = snbFile.GetFileStream('snbf/toc.snbf')
            oeb.container = DirContainer(tdir, log)
            if toc is not None:
                toc = safe_xml_fromstring(toc)
                toc_body = toc.find('.//body')
                # A TOC without a <body> is treated as having no chapters.
                chapters = toc_body if toc_body is not None else ()
                # Only chapters whose data exists consume a file number.
                i = 1
                for ch in chapters:
                    chapterName = ch.text
                    chapterSrc = ch.get('src')
                    fname = 'ch_%d.htm' % i
                    data = snbFile.GetFileStream('snbc/' + chapterSrc)
                    if data is None:
                        continue
                    snbc = safe_xml_fromstring(data)
                    chapter_body = snbc.find('.//body')
                    lines = []
                    for line in (chapter_body if chapter_body is not None else ()):
                        # An empty element has text None; emit it as ''.
                        text = line.text if line.text is not None else ''
                        if line.tag == 'text':
                            lines.append('<p>%s</p>' % html_encode(text))
                        elif line.tag == 'img':
                            lines.append('<p><img src="%s" /></p>' % html_encode(text))
                    with open(os.path.join(tdir, fname), 'wb') as f:
                        f.write((HTML_TEMPLATE % (chapterName, '\n'.join(lines))).encode('utf-8', 'replace'))
                    oeb.toc.add(ch.text, fname)
                    item_id, href = oeb.manifest.generate(id='html',
                        href=ascii_filename(fname))
                    item = oeb.manifest.add(item_id, href, 'text/html')
                    item.html_input_href = fname
                    oeb.spine.add(item, True)
                    i = i + 1
                imageFiles = snbFile.OutputImageFiles(tdir)
                for image_name, media_type in imageFiles:
                    item_id, href = oeb.manifest.generate(id='image',
                        href=ascii_filename(image_name))
                    item = oeb.manifest.add(item_id, href, media_type)
                    item.html_input_href = image_name

        return oeb
|
||||
269
ebook_converter/ebooks/conversion/plugins/snb_output.py
Normal file
269
ebook_converter/ebooks/conversion/plugins/snb_output.py
Normal file
@@ -0,0 +1,269 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2010, Li Fanxi <lifanxi@freemindworld.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os
|
||||
|
||||
from calibre.customize.conversion import OutputFormatPlugin, OptionRecommendation
|
||||
from calibre.ptempfile import TemporaryDirectory
|
||||
from calibre.constants import __appname__, __version__
|
||||
from polyglot.builtins import unicode_type
|
||||
|
||||
|
||||
class SNBOutput(OutputFormatPlugin):
    """Output plugin that packages an OEB book as an SNB file."""

    name = 'SNB Output'
    author = 'Li Fanxi'
    file_type = 'snb'
    commit_name = 'snb_output'

    options = {
        OptionRecommendation(name='snb_output_encoding', recommended_value='utf-8',
            level=OptionRecommendation.LOW,
            help=_('Specify the character encoding of the output document. '
            'The default is utf-8.')),
        OptionRecommendation(name='snb_max_line_length',
            recommended_value=0, level=OptionRecommendation.LOW,
            help=_('The maximum number of characters per line. This splits on '
            'the first space before the specified value. If no space is found '
            'the line will be broken at the space after and will exceed the '
            'specified value. Also, there is a minimum of 25 characters. '
            'Use 0 to disable line splitting.')),
        OptionRecommendation(name='snb_insert_empty_line',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Specify whether or not to insert an empty line between '
            'two paragraphs.')),
        OptionRecommendation(name='snb_dont_indent_first_line',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Specify whether or not to insert two space characters '
            'to indent the first line of each paragraph.')),
        OptionRecommendation(name='snb_hide_chapter_name',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Specify whether or not to hide the chapter title for each '
            'chapter. Useful for image-only output (eg. comics).')),
        OptionRecommendation(name='snb_full_screen',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Resize all the images for full screen view. ')),
    }

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        """Write ``oeb_book`` to ``output_path`` as an SNB file.

        The book is first laid out in a temporary directory (``snbf`` for
        the book info and TOC, ``snbc`` for chapters, ``snbc/images`` for
        images) and then packed with ``SNBFile``.

        :param oeb_book: The book to convert.
        :param output_path: Destination passed to ``SNBFile.Output``.
        :param input_plugin: Not used by this method.
        :param opts: Conversion options; stored on ``self.opts`` for
            ``HandleImage`` and passed to the rasterizer and ``SNBMLizer``.
        :param log: Logger used for progress messages.
        """
        from lxml import etree
        from calibre.ebooks.snb.snbfile import SNBFile
        from calibre.ebooks.snb.snbml import SNBMLizer, ProcessFileName
        from calibre.ebooks.oeb.base import OEB_DOCS, OEB_IMAGES

        def write_tree(path, tree):
            # Serialise an lxml tree to ``path`` as pretty-printed UTF-8.
            with open(path, 'wb') as f:
                f.write(etree.tostring(tree, pretty_print=True, encoding='utf-8'))

        self.opts = opts
        from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer, Unavailable
        try:
            rasterizer = SVGRasterizer()
            rasterizer(oeb_book, opts)
        except Unavailable:
            log.warn('SVG rasterizer unavailable, SVG will not be converted')

        # Create temp dir
        with TemporaryDirectory('_snb_output') as tdir:
            # Create stub directories
            snbfDir = os.path.join(tdir, 'snbf')
            snbcDir = os.path.join(tdir, 'snbc')
            snbiDir = os.path.join(tdir, 'snbc/images')
            os.mkdir(snbfDir)
            os.mkdir(snbcDir)
            os.mkdir(snbiDir)

            # Process Meta data
            meta = oeb_book.metadata
            title = unicode_type(meta.title[0]) if meta.title else ''
            authors = [unicode_type(x) for x in meta.creator if x.role == 'aut']
            publishers = unicode_type(meta.publisher[0]) if meta.publisher else ''
            lang = unicode_type(meta.language[0]).upper() if meta.language else ''
            abstract = unicode_type(meta.description[0]) if meta.description else ''

            # Process Cover
            g, m, s = oeb_book.guide, oeb_book.manifest, oeb_book.spine
            href = None
            # The guide's cover is only used when there is no title page.
            if 'titlepage' not in g and 'cover' in g:
                href = g['cover'].href

            # Output book info file
            bookInfoTree = etree.Element("book-snbf", version="1.0")
            headTree = etree.SubElement(bookInfoTree, "head")
            etree.SubElement(headTree, "name").text = title
            etree.SubElement(headTree, "author").text = ' '.join(authors)
            etree.SubElement(headTree, "language").text = lang
            etree.SubElement(headTree, "rights")
            etree.SubElement(headTree, "publisher").text = publishers
            etree.SubElement(headTree, "generator").text = __appname__ + ' ' + __version__
            etree.SubElement(headTree, "created")
            etree.SubElement(headTree, "abstract").text = abstract
            if href is not None:
                etree.SubElement(headTree, "cover").text = ProcessFileName(href)
            else:
                etree.SubElement(headTree, "cover")
            write_tree(os.path.join(snbfDir, 'book.snbf'), bookInfoTree)

            # Output TOC
            tocInfoTree = etree.Element("toc-snbf")
            tocHead = etree.SubElement(tocInfoTree, "head")
            tocBody = etree.SubElement(tocInfoTree, "body")
            # Maps a spine href to its list of (anchor, title) pairs.
            outputFiles = {}
            if oeb_book.toc.count() == 0:
                log.warn('This SNB file has no Table of Contents. '
                    'Creating a default TOC')
                first = next(iter(oeb_book.spine))
                oeb_book.toc.add(_('Start page'), first.href)
            else:
                first = next(iter(oeb_book.spine))
                if oeb_book.toc[0].href != first.href:
                    # The pages before the first item in toc will be stored
                    # as "Cover Pages".
                    # oeb_book.toc does not support "insert", so we generate
                    # the tocInfoTree directly instead of modifying the toc
                    ch = etree.SubElement(tocBody, "chapter")
                    ch.set("src", ProcessFileName(first.href) + ".snbc")
                    ch.text = _('Cover pages')
                    outputFiles[first.href] = []
                    outputFiles[first.href].append(("", _("Cover pages")))

            for tocitem in oeb_book.toc:
                if tocitem.href.find('#') != -1:
                    item = tocitem.href.split('#')
                    if len(item) != 2:
                        log.error('Error in TOC item: %s' % tocitem)
                    else:
                        if item[0] in outputFiles:
                            outputFiles[item[0]].append((item[1], tocitem.title))
                        else:
                            # First reference to this file is an anchor, so
                            # a "(Preface)" chapter covers the text that
                            # precedes the anchor.
                            outputFiles[item[0]] = []
                            outputFiles[item[0]].append(("", tocitem.title + _(" (Preface)")))
                            ch = etree.SubElement(tocBody, "chapter")
                            ch.set("src", ProcessFileName(item[0]) + ".snbc")
                            ch.text = tocitem.title + _(" (Preface)")
                            outputFiles[item[0]].append((item[1], tocitem.title))
                else:
                    if tocitem.href in outputFiles:
                        outputFiles[tocitem.href].append(("", tocitem.title))
                    else:
                        outputFiles[tocitem.href] = []
                        outputFiles[tocitem.href].append(("", tocitem.title))
                        ch = etree.SubElement(tocBody, "chapter")
                        ch.set("src", ProcessFileName(tocitem.href) + ".snbc")
                        ch.text = tocitem.title

            etree.SubElement(tocHead, "chapters").text = '%d' % len(tocBody)

            write_tree(os.path.join(snbfDir, 'toc.snbf'), tocInfoTree)

            # Output Files
            oldTree = None
            mergeLast = False
            lastName = None
            for item in s:
                if m.hrefs[item.href].media_type in OEB_DOCS:
                    if item.href not in outputFiles:
                        log.debug('File %s is unused in TOC. Continue in last chapter' % item.href)
                        mergeLast = True
                    else:
                        if oldTree is not None and mergeLast:
                            # The previous chapter had content merged into
                            # it after it was written; write it again.
                            log.debug('Output the modified chapter again: %s' % lastName)
                            write_tree(os.path.join(snbcDir, lastName), oldTree)
                            mergeLast = False

                    log.debug('Converting %s to snbc...' % item.href)
                    snbwriter = SNBMLizer(log)
                    snbcTrees = None
                    if not mergeLast:
                        snbcTrees = snbwriter.extract_content(oeb_book, item, outputFiles[item.href], opts)
                        for subName in snbcTrees:
                            postfix = ''
                            if subName != '':
                                postfix = '_' + subName
                            lastName = ProcessFileName(item.href + postfix + ".snbc")
                            oldTree = snbcTrees[subName]
                            write_tree(os.path.join(snbcDir, lastName), oldTree)
                    else:
                        log.debug('Merge %s with last TOC item...' % item.href)
                        snbwriter.merge_content(oldTree, oeb_book, item, [('', _("Start"))], opts)

            # Output the last one if needed
            if oldTree is not None and mergeLast:
                log.debug('Output the last modified chapter again: %s' % lastName)
                write_tree(os.path.join(snbcDir, lastName), oldTree)
                mergeLast = False

            for item in m:
                if m.hrefs[item.href].media_type in OEB_IMAGES:
                    log.debug('Converting image: %s ...' % item.href)
                    content = m.hrefs[item.href].data
                    # Convert & Resize image
                    self.HandleImage(content, os.path.join(snbiDir, ProcessFileName(item.href)))

            # Package as SNB File
            snbFile = SNBFile()
            snbFile.FromDir(tdir)
            snbFile.Output(output_path)

    def HandleImage(self, imageData, imagePath):
        """Write ``imageData`` to ``imagePath``, shrinking it to the screen.

        The target size comes from the output profile (``screen_size`` when
        ``snb_full_screen`` is set, ``comic_screen_size`` otherwise), or
        540x700 when no options are available. Images that already fit are
        re-encoded without resizing.

        :param imageData: Raw image data, decoded with ``image_from_data``.
        :param imagePath: Destination path; the text after its last ``.``
            selects the output format.
        """
        from calibre.utils.img import image_from_data, resize_image, image_to_data
        img = image_from_data(imageData)
        x, y = img.width(), img.height()
        if self.opts:
            if self.opts.snb_full_screen:
                SCREEN_X, SCREEN_Y = self.opts.output_profile.screen_size
            else:
                SCREEN_X, SCREEN_Y = self.opts.output_profile.comic_screen_size
        else:
            SCREEN_X = 540
            SCREEN_Y = 700
        # Handle big image only
        if x > SCREEN_X or y > SCREEN_Y:
            # Use the larger ratio so both dimensions fit the screen.
            xScale = float(x) / SCREEN_X
            yScale = float(y) / SCREEN_Y
            scale = max(xScale, yScale)
            # TODO : intelligent image rotation
            #     img = img.rotate(90)
            #     x,y = y,x
            img = resize_image(img, x // scale, y // scale)
        # NOTE(review): ``lopen`` is not defined or imported in this module;
        # presumably the host application provides it as a builtin -- confirm.
        with lopen(imagePath, 'wb') as f:
            f.write(image_to_data(img, fmt=imagePath.rpartition('.')[-1]))
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Ad-hoc manual check: read an already-processed OEB directory and write
    # it out as an SNB file. Paths are hard-coded.
    from calibre.ebooks.oeb.reader import OEBReader
    from calibre.ebooks.oeb.base import OEBBook
    from calibre.ebooks.conversion.preprocess import HTMLPreProcessor
    from calibre.customize.profiles import HanlinV3Output

    class OptionValues(object):
        pass

    profile_opts = OptionValues()
    profile_opts.output_profile = HanlinV3Output(None)

    preprocessor = HTMLPreProcessor(None, None, profile_opts)
    from calibre.utils.logging import default_log
    book = OEBBook(default_log, preprocessor)
    OEBReader()(book, '/tmp/bbb/processed/')
    SNBOutput(None).convert(book, '/tmp/test.snb', None, None, default_log)
|
||||
39
ebook_converter/ebooks/conversion/plugins/tcr_input.py
Normal file
39
ebook_converter/ebooks/conversion/plugins/tcr_input.py
Normal file
@@ -0,0 +1,39 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
from io import BytesIO
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin
|
||||
|
||||
|
||||
class TCRInput(InputFormatPlugin):
    """Input plugin that decompresses TCR files and converts them as text."""

    name = 'TCR Input'
    author = 'John Schember'
    description = 'Convert TCR files to HTML'
    file_types = {'tcr'}
    commit_name = 'tcr_input'

    def convert(self, stream, options, file_ext, log, accelerators):
        """Decompress ``stream`` and delegate to the TXT input plugin.

        :param stream: The TCR data, passed to ``decompress``.
        :param options: Conversion options; any TXT plugin option missing
            from it is filled in with that option's recommended value.
        :param file_ext: Not used; the TXT plugin is always called with
            ``'txt'``.
        :param log: Logger used for progress messages.
        :param accelerators: Passed through to the TXT plugin.
        :return: Whatever the TXT plugin's ``convert`` returns.
        """
        from calibre.ebooks.compression.tcr import decompress

        log.info('Decompressing text...')
        raw_txt = decompress(stream)

        log.info('Converting text to OEB...')
        stream = BytesIO(raw_txt)

        from calibre.customize.ui import plugin_for_input_format

        txt_plugin = plugin_for_input_format('txt')
        for opt in txt_plugin.options:
            # Test the options object being populated, not this plugin's own
            # ``options`` set, so values already present are not overwritten.
            if not hasattr(options, opt.option.name):
                setattr(options, opt.option.name, opt.recommended_value)

        stream.seek(0)
        return txt_plugin.convert(stream, options,
                'txt', log, accelerators)
|
||||
56
ebook_converter/ebooks/conversion/plugins/tcr_output.py
Normal file
56
ebook_converter/ebooks/conversion/plugins/tcr_output.py
Normal file
@@ -0,0 +1,56 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os
|
||||
|
||||
from calibre.customize.conversion import OutputFormatPlugin, \
|
||||
OptionRecommendation
|
||||
|
||||
|
||||
class TCROutput(OutputFormatPlugin):
    """Output plugin that writes an OEB book as TCR-compressed text."""

    name = 'TCR Output'
    author = 'John Schember'
    file_type = 'tcr'
    commit_name = 'tcr_output'

    options = {
        OptionRecommendation(name='tcr_output_encoding', recommended_value='utf-8',
            level=OptionRecommendation.LOW,
            help=_('Specify the character encoding of the output document. '
            'The default is utf-8.'))}

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        """Render ``oeb_book`` as text, compress it and write the result.

        :param oeb_book: The book to convert; passed to ``TXTMLizer``.
        :param output_path: Either a filesystem path or an object with a
            ``write`` method. A path is opened (and closed) here; a
            file-like object is written to but left open for the caller.
        :param input_plugin: Not used by this method.
        :param opts: Conversion options. ``tcr_output_encoding`` is read,
            and four text-layout attributes are overwritten on it.
        :param log: Logger used for progress messages.
        """
        from calibre.ebooks.txt.txtml import TXTMLizer
        from calibre.ebooks.compression.tcr import compress

        close = False
        if not hasattr(output_path, 'write'):
            close = True
            out_dir = os.path.dirname(output_path)
            if out_dir and not os.path.exists(out_dir):
                os.makedirs(out_dir)
            # NOTE(review): ``lopen`` is not defined or imported in this
            # module; presumably the host application provides it as a
            # builtin -- confirm.
            out_stream = lopen(output_path, 'wb')
        else:
            out_stream = output_path

        try:
            # Fixed text-layout settings for the text writer.
            opts.flush_paras = False
            opts.max_line_length = 0
            opts.force_max_line_length = False
            opts.indent_paras = False

            writer = TXTMLizer(log)
            txt = writer.extract_content(oeb_book, opts).encode(opts.tcr_output_encoding, 'replace')

            log.info('Compressing text...')
            txt = compress(txt)

            out_stream.seek(0)
            out_stream.truncate()
            out_stream.write(txt)
        finally:
            # Only close streams opened here, even if conversion failed.
            if close:
                out_stream.close()
|
||||
308
ebook_converter/ebooks/conversion/plugins/txt_input.py
Normal file
308
ebook_converter/ebooks/conversion/plugins/txt_input.py
Normal file
@@ -0,0 +1,308 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os
|
||||
|
||||
from calibre import _ent_pat, walk, xml_entity_to_unicode
|
||||
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
|
||||
from polyglot.builtins import getcwd
|
||||
|
||||
# Markdown extension names mapped to their translated descriptions. Used for
# the plugin's ``ui_data`` and for the ``markdown_extensions`` option help.
MD_EXTENSIONS = {
    'abbr': _('Abbreviations'),
    'admonition': _('Support admonitions'),
    'attr_list': _('Add attribute to HTML tags'),
    'codehilite': _('Add code highlighting via Pygments'),
    'def_list': _('Definition lists'),
    'extra': _('Enables various common extensions'),
    'fenced_code': _('Alternative code block syntax'),
    'footnotes': _('Footnotes'),
    'legacy_attrs': _('Use legacy element attributes'),
    'legacy_em': _('Use legacy underscore handling for connected words'),
    'meta': _('Metadata in the document'),
    'nl2br': _('Treat newlines as hard breaks'),
    'sane_lists': _('Do not allow mixing list types'),
    'smarty': _('Use markdown\'s internal smartypants parser'),
    'tables': _('Support tables'),
    'toc': _('Generate a table of contents'),
    'wikilinks': _('Wiki style links'),
}
|
||||
|
||||
|
||||
class TXTInput(InputFormatPlugin):
|
||||
|
||||
name = 'TXT Input'
|
||||
author = 'John Schember'
|
||||
description = 'Convert TXT files to HTML'
|
||||
file_types = {'txt', 'txtz', 'text', 'md', 'textile', 'markdown'}
|
||||
commit_name = 'txt_input'
|
||||
ui_data = {
|
||||
'md_extensions': MD_EXTENSIONS,
|
||||
'paragraph_types': {
|
||||
'auto': _('Try to auto detect paragraph type'),
|
||||
'block': _('Treat a blank line as a paragraph break'),
|
||||
'single': _('Assume every line is a paragraph'),
|
||||
'print': _('Assume every line starting with 2+ spaces or a tab starts a paragraph'),
|
||||
'unformatted': _('Most lines have hard line breaks, few/no blank lines or indents'),
|
||||
'off': _('Don\'t modify the paragraph structure'),
|
||||
},
|
||||
'formatting_types': {
|
||||
'auto': _('Automatically decide which formatting processor to use'),
|
||||
'plain': _('No formatting'),
|
||||
'heuristic': _('Use heuristics to determine chapter headings, italics, etc.'),
|
||||
'textile': _('Use the TexTile markup language'),
|
||||
'markdown': _('Use the Markdown markup language')
|
||||
},
|
||||
}
|
||||
|
||||
options = {
|
||||
OptionRecommendation(name='formatting_type', recommended_value='auto',
|
||||
choices=list(ui_data['formatting_types']),
|
||||
help=_('Formatting used within the document.\n'
|
||||
'* auto: {auto}\n'
|
||||
'* plain: {plain}\n'
|
||||
'* heuristic: {heuristic}\n'
|
||||
'* textile: {textile}\n'
|
||||
'* markdown: {markdown}\n'
|
||||
'To learn more about markdown see {url}').format(
|
||||
url='https://daringfireball.net/projects/markdown/', **ui_data['formatting_types'])
|
||||
),
|
||||
OptionRecommendation(name='paragraph_type', recommended_value='auto',
|
||||
choices=list(ui_data['paragraph_types']),
|
||||
help=_('Paragraph structure to assume. The value of "off" is useful for formatted documents such as Markdown or Textile. '
|
||||
'Choices are:\n'
|
||||
'* auto: {auto}\n'
|
||||
'* block: {block}\n'
|
||||
'* single: {single}\n'
|
||||
'* print: {print}\n'
|
||||
'* unformatted: {unformatted}\n'
|
||||
'* off: {off}').format(**ui_data['paragraph_types'])
|
||||
),
|
||||
OptionRecommendation(name='preserve_spaces', recommended_value=False,
|
||||
help=_('Normally extra spaces are condensed into a single space. '
|
||||
'With this option all spaces will be displayed.')),
|
||||
OptionRecommendation(name='txt_in_remove_indents', recommended_value=False,
|
||||
help=_('Normally extra space at the beginning of lines is retained. '
|
||||
'With this option they will be removed.')),
|
||||
OptionRecommendation(name="markdown_extensions", recommended_value='footnotes, tables, toc',
|
||||
help=_('Enable extensions to markdown syntax. Extensions are formatting that is not part '
|
||||
'of the standard markdown format. The extensions enabled by default: %default.\n'
|
||||
'To learn more about markdown extensions, see {}\n'
|
||||
'This should be a comma separated list of extensions to enable:\n'
|
||||
).format('https://python-markdown.github.io/extensions/') + '\n'.join('* %s: %s' % (k, MD_EXTENSIONS[k]) for k in sorted(MD_EXTENSIONS))),
|
||||
}
|
||||
|
||||
def shift_file(self, fname, data):
|
||||
name, ext = os.path.splitext(fname)
|
||||
candidate = os.path.join(self.output_dir, fname)
|
||||
c = 0
|
||||
while os.path.exists(candidate):
|
||||
c += 1
|
||||
candidate = os.path.join(self.output_dir, '{}-{}{}'.format(name, c, ext))
|
||||
ans = candidate
|
||||
with open(ans, 'wb') as f:
|
||||
f.write(data)
|
||||
return f.name
|
||||
|
||||
def fix_resources(self, html, base_dir):
|
||||
from html5_parser import parse
|
||||
root = parse(html)
|
||||
changed = False
|
||||
for img in root.xpath('//img[@src]'):
|
||||
src = img.get('src')
|
||||
prefix = src.split(':', 1)[0].lower()
|
||||
if prefix not in ('file', 'http', 'https', 'ftp') and not os.path.isabs(src):
|
||||
src = os.path.join(base_dir, src)
|
||||
if os.access(src, os.R_OK):
|
||||
with open(src, 'rb') as f:
|
||||
data = f.read()
|
||||
f = self.shift_file(os.path.basename(src), data)
|
||||
changed = True
|
||||
img.set('src', os.path.basename(f))
|
||||
if changed:
|
||||
from lxml import etree
|
||||
html = etree.tostring(root, encoding='unicode')
|
||||
return html
|
||||
|
||||
def convert(self, stream, options, file_ext, log,
|
||||
accelerators):
|
||||
from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
|
||||
from calibre.ebooks.chardet import detect
|
||||
from calibre.utils.zipfile import ZipFile
|
||||
from calibre.ebooks.txt.processor import (convert_basic,
|
||||
convert_markdown_with_metadata, separate_paragraphs_single_line,
|
||||
separate_paragraphs_print_formatted, preserve_spaces,
|
||||
detect_paragraph_type, detect_formatting_type,
|
||||
normalize_line_endings, convert_textile, remove_indents,
|
||||
block_to_single_line, separate_hard_scene_breaks)
|
||||
|
||||
self.log = log
|
||||
txt = b''
|
||||
log.debug('Reading text from file...')
|
||||
length = 0
|
||||
base_dir = self.output_dir = getcwd()
|
||||
|
||||
# Extract content from zip archive.
|
||||
if file_ext == 'txtz':
|
||||
zf = ZipFile(stream)
|
||||
zf.extractall('.')
|
||||
|
||||
for x in walk('.'):
|
||||
if os.path.splitext(x)[1].lower() in ('.txt', '.text'):
|
||||
with open(x, 'rb') as tf:
|
||||
txt += tf.read() + b'\n\n'
|
||||
else:
|
||||
if getattr(stream, 'name', None):
|
||||
base_dir = os.path.dirname(stream.name)
|
||||
txt = stream.read()
|
||||
if file_ext in {'md', 'textile', 'markdown'}:
|
||||
options.formatting_type = {'md': 'markdown'}.get(file_ext, file_ext)
|
||||
log.info('File extension indicates particular formatting. '
|
||||
'Forcing formatting type to: %s'%options.formatting_type)
|
||||
options.paragraph_type = 'off'
|
||||
|
||||
# Get the encoding of the document.
|
||||
if options.input_encoding:
|
||||
ienc = options.input_encoding
|
||||
log.debug('Using user specified input encoding of %s' % ienc)
|
||||
else:
|
||||
det_encoding = detect(txt[:4096])
|
||||
det_encoding, confidence = det_encoding['encoding'], det_encoding['confidence']
|
||||
if det_encoding and det_encoding.lower().replace('_', '-').strip() in (
|
||||
'gb2312', 'chinese', 'csiso58gb231280', 'euc-cn', 'euccn',
|
||||
'eucgb2312-cn', 'gb2312-1980', 'gb2312-80', 'iso-ir-58'):
|
||||
# Microsoft Word exports to HTML with encoding incorrectly set to
|
||||
# gb2312 instead of gbk. gbk is a superset of gb2312, anyway.
|
||||
det_encoding = 'gbk'
|
||||
ienc = det_encoding
|
||||
log.debug('Detected input encoding as %s with a confidence of %s%%' % (ienc, confidence * 100))
|
||||
if not ienc:
|
||||
ienc = 'utf-8'
|
||||
log.debug('No input encoding specified and could not auto detect using %s' % ienc)
|
||||
# Remove BOM from start of txt as its presence can confuse markdown
|
||||
import codecs
|
||||
for bom in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE, codecs.BOM_UTF8, codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE):
|
||||
if txt.startswith(bom):
|
||||
txt = txt[len(bom):]
|
||||
break
|
||||
txt = txt.decode(ienc, 'replace')
|
||||
|
||||
# Replace entities
|
||||
txt = _ent_pat.sub(xml_entity_to_unicode, txt)
|
||||
|
||||
# Normalize line endings
|
||||
txt = normalize_line_endings(txt)
|
||||
|
||||
# Determine the paragraph type of the document.
|
||||
if options.paragraph_type == 'auto':
|
||||
options.paragraph_type = detect_paragraph_type(txt)
|
||||
if options.paragraph_type == 'unknown':
|
||||
log.debug('Could not reliably determine paragraph type using block')
|
||||
options.paragraph_type = 'block'
|
||||
else:
|
||||
log.debug('Auto detected paragraph type as %s' % options.paragraph_type)
|
||||
|
||||
# Detect formatting
|
||||
if options.formatting_type == 'auto':
|
||||
options.formatting_type = detect_formatting_type(txt)
|
||||
log.debug('Auto detected formatting as %s' % options.formatting_type)
|
||||
|
||||
if options.formatting_type == 'heuristic':
|
||||
setattr(options, 'enable_heuristics', True)
|
||||
setattr(options, 'unwrap_lines', False)
|
||||
setattr(options, 'smarten_punctuation', True)
|
||||
|
||||
# Reformat paragraphs to block formatting based on the detected type.
|
||||
# We don't check for block because the processor assumes block.
|
||||
# single and print at transformed to block for processing.
|
||||
if options.paragraph_type == 'single':
|
||||
txt = separate_paragraphs_single_line(txt)
|
||||
elif options.paragraph_type == 'print':
|
||||
txt = separate_hard_scene_breaks(txt)
|
||||
txt = separate_paragraphs_print_formatted(txt)
|
||||
txt = block_to_single_line(txt)
|
||||
elif options.paragraph_type == 'unformatted':
|
||||
from calibre.ebooks.conversion.utils import HeuristicProcessor
|
||||
# unwrap lines based on punctuation
|
||||
docanalysis = DocAnalysis('txt', txt)
|
||||
length = docanalysis.line_length(.5)
|
||||
preprocessor = HeuristicProcessor(options, log=getattr(self, 'log', None))
|
||||
txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
|
||||
txt = separate_paragraphs_single_line(txt)
|
||||
elif options.paragraph_type == 'block':
|
||||
txt = separate_hard_scene_breaks(txt)
|
||||
txt = block_to_single_line(txt)
|
||||
|
||||
if getattr(options, 'enable_heuristics', False) and getattr(options, 'dehyphenate', False):
|
||||
docanalysis = DocAnalysis('txt', txt)
|
||||
if not length:
|
||||
length = docanalysis.line_length(.5)
|
||||
dehyphenator = Dehyphenator(options.verbose, log=self.log)
|
||||
txt = dehyphenator(txt,'txt', length)
|
||||
|
||||
# User requested transformation on the text.
|
||||
if options.txt_in_remove_indents:
|
||||
txt = remove_indents(txt)
|
||||
|
||||
# Preserve spaces will replace multiple spaces to a space
|
||||
# followed by the entity.
|
||||
if options.preserve_spaces:
|
||||
txt = preserve_spaces(txt)
|
||||
|
||||
# Process the text using the appropriate text processor.
|
||||
self.shifted_files = []
|
||||
try:
|
||||
html = ''
|
||||
input_mi = None
|
||||
if options.formatting_type == 'markdown':
|
||||
log.debug('Running text through markdown conversion...')
|
||||
try:
|
||||
input_mi, html = convert_markdown_with_metadata(txt, extensions=[x.strip() for x in options.markdown_extensions.split(',') if x.strip()])
|
||||
except RuntimeError:
|
||||
raise ValueError('This txt file has malformed markup, it cannot be'
|
||||
' converted by calibre. See https://daringfireball.net/projects/markdown/syntax')
|
||||
html = self.fix_resources(html, base_dir)
|
||||
elif options.formatting_type == 'textile':
|
||||
log.debug('Running text through textile conversion...')
|
||||
html = convert_textile(txt)
|
||||
html = self.fix_resources(html, base_dir)
|
||||
else:
|
||||
log.debug('Running text through basic conversion...')
|
||||
flow_size = getattr(options, 'flow_size', 0)
|
||||
html = convert_basic(txt, epub_split_size_kb=flow_size)
|
||||
|
||||
# Run the HTMLized text through the html processing plugin.
|
||||
from calibre.customize.ui import plugin_for_input_format
|
||||
html_input = plugin_for_input_format('html')
|
||||
for opt in html_input.options:
|
||||
setattr(options, opt.option.name, opt.recommended_value)
|
||||
options.input_encoding = 'utf-8'
|
||||
htmlfile = self.shift_file('index.html', html.encode('utf-8'))
|
||||
odi = options.debug_pipeline
|
||||
options.debug_pipeline = None
|
||||
# Generate oeb from html conversion.
|
||||
oeb = html_input.convert(open(htmlfile, 'rb'), options, 'html', log, {})
|
||||
options.debug_pipeline = odi
|
||||
finally:
|
||||
for x in self.shifted_files:
|
||||
os.remove(x)
|
||||
|
||||
# Set metadata from file.
|
||||
if input_mi is None:
|
||||
from calibre.customize.ui import get_file_type_metadata
|
||||
input_mi = get_file_type_metadata(stream, file_ext)
|
||||
from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
|
||||
meta_info_to_oeb_metadata(input_mi, oeb.metadata, log)
|
||||
self.html_postprocess_title = input_mi.title
|
||||
|
||||
return oeb
|
||||
|
||||
def postprocess_book(self, oeb, opts, log):
    '''
    Replace placeholder titles in the spine documents.

    Every element whose local name is ``title`` and whose text equals the
    translated string 'Unknown' gets ``self.html_postprocess_title`` as
    its new text. Spine items whose data has no ``xpath`` attribute are
    left alone.

    :param oeb: The book whose ``spine`` items are inspected
    :param opts: Unused here
    :param log: Unused here
    '''
    for spine_item in oeb.spine:
        if not hasattr(spine_item.data, 'xpath'):
            continue
        for node in spine_item.data.xpath('//*[local-name()="title"]'):
            if node.text != _('Unknown'):
                continue
            node.text = self.html_postprocess_title
|
||||
165
ebook_converter/ebooks/conversion/plugins/txt_output.py
Normal file
165
ebook_converter/ebooks/conversion/plugins/txt_output.py
Normal file
@@ -0,0 +1,165 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os
|
||||
import shutil
|
||||
|
||||
|
||||
from calibre.customize.conversion import OutputFormatPlugin, \
|
||||
OptionRecommendation
|
||||
from calibre.ptempfile import TemporaryDirectory, TemporaryFile
|
||||
|
||||
NEWLINE_TYPES = ['system', 'unix', 'old_mac', 'windows']
|
||||
|
||||
|
||||
class TXTOutput(OutputFormatPlugin):
    '''
    Output plugin that writes the book as a single text file, either as
    plain text or as Markdown / Textile markup depending on the
    ``txt_output_formatting`` option.
    '''

    name = 'TXT Output'
    author = 'John Schember'
    file_type = 'txt'
    commit_name = 'txt_output'
    ui_data = {
            'newline_types': NEWLINE_TYPES,
            'formatting_types': {
                'plain': _('Plain text'),
                'markdown': _('Markdown formatted text'),
                'textile': _('TexTile formatted text')
            },
    }

    options = {
        OptionRecommendation(name='newline', recommended_value='system',
            level=OptionRecommendation.LOW,
            short_switch='n', choices=NEWLINE_TYPES,
            help=_('Type of newline to use. Options are %s. Default is \'system\'. '
                'Use \'old_mac\' for compatibility with Mac OS 9 and earlier. '
                'For macOS use \'unix\'. \'system\' will default to the newline '
                'type used by this OS.') % sorted(NEWLINE_TYPES)),
        OptionRecommendation(name='txt_output_encoding', recommended_value='utf-8',
            level=OptionRecommendation.LOW,
            help=_('Specify the character encoding of the output document. '
            'The default is utf-8.')),
        OptionRecommendation(name='inline_toc',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Add Table of Contents to beginning of the book.')),
        OptionRecommendation(name='max_line_length',
            recommended_value=0, level=OptionRecommendation.LOW,
            help=_('The maximum number of characters per line. This splits on '
            'the first space before the specified value. If no space is found '
            'the line will be broken at the space after and will exceed the '
            'specified value. Also, there is a minimum of 25 characters. '
            'Use 0 to disable line splitting.')),
        OptionRecommendation(name='force_max_line_length',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Force splitting on the max-line-length value when no space '
            'is present. Also allows max-line-length to be below the minimum')),
        OptionRecommendation(name='txt_output_formatting',
             recommended_value='plain',
             choices=list(ui_data['formatting_types']),
             help=_('Formatting used within the document.\n'
                    '* plain: {plain}\n'
                    '* markdown: {markdown}\n'
                    '* textile: {textile}').format(**ui_data['formatting_types'])),
        OptionRecommendation(name='keep_links',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Do not remove links within the document. This is only '
            'useful when paired with a txt-output-formatting option that '
            'is not none because links are always removed with plain text output.')),
        OptionRecommendation(name='keep_image_references',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Do not remove image references within the document. This is only '
            'useful when paired with a txt-output-formatting option that '
            'is not none because links are always removed with plain text output.')),
        OptionRecommendation(name='keep_color',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Do not remove font color from output. This is only useful when '
                   'txt-output-formatting is set to textile. Textile is the only '
                   'formatting that supports setting font color. If this option is '
                   'not specified font color will not be set and default to the '
                   'color displayed by the reader (generally this is black).')),
     }

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        '''
        Render ``oeb_book`` as text and write it to ``output_path``.

        :param oeb_book: The book to convert
        :param output_path: Either a filesystem path or an object with a
            ``write`` method. A stream is rewound and truncated before
            writing and is left open; a path is opened, written and closed
            here, creating its parent directory if needed.
        :param input_plugin: Unused here
        :param opts: Conversion options; reads ``txt_output_formatting``,
            ``newline`` and ``txt_output_encoding``
        :param log: Logger handed to the writer
        '''
        from calibre.ebooks.txt.txtml import TXTMLizer
        from calibre.utils.cleantext import clean_ascii_chars
        from calibre.ebooks.txt.newlines import specified_newlines, TxtNewlines

        formatting = opts.txt_output_formatting.lower()
        if formatting == 'markdown':
            from calibre.ebooks.txt.markdownml import MarkdownMLizer
            self.writer = MarkdownMLizer(log)
        elif formatting == 'textile':
            from calibre.ebooks.txt.textileml import TextileMLizer
            self.writer = TextileMLizer(log)
        else:
            self.writer = TXTMLizer(log)

        txt = self.writer.extract_content(oeb_book, opts)
        txt = clean_ascii_chars(txt)

        log.debug('\tReplacing newlines with selected type...')
        txt = specified_newlines(TxtNewlines(opts.newline).newline, txt)

        # Encode before touching the destination so that a bad encoding
        # name cannot leave behind an empty or truncated output file.
        data = txt.encode(opts.txt_output_encoding, 'replace')

        def emit(stream):
            stream.seek(0)
            stream.truncate()
            stream.write(data)

        if hasattr(output_path, 'write'):
            # Caller owns the stream; do not close it.
            emit(output_path)
            return

        out_dir = os.path.dirname(output_path)
        if out_dir != '' and not os.path.exists(out_dir):
            os.makedirs(out_dir)
        # The with block guarantees the file is closed even if writing fails.
        with open(output_path, 'wb') as out_stream:
            emit(out_stream)
|
||||
|
||||
|
||||
class TXTZOutput(TXTOutput):
    '''
    Output plugin that packs the text rendering of the book, its images
    and an OPF metadata file into a single zip archive.
    '''

    name = 'TXTZ Output'
    author = 'John Schember'
    file_type = 'txtz'

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        '''
        Build the archive contents in a temporary directory and zip them
        into ``output_path``.

        :param oeb_book: The book to convert
        :param output_path: Destination handed to ``ZipFile``
        :param input_plugin: Passed through to ``TXTOutput.convert``
        :param opts: Conversion options; ``txt_output_formatting`` selects
            the name of the text file inside the archive
        :param log: Logger passed through to ``TXTOutput.convert``
        '''
        from calibre.ebooks.oeb.base import OEB_IMAGES
        from calibre.utils.zipfile import ZipFile
        from lxml import etree

        with TemporaryDirectory('_txtz_output') as tdir:
            # TXT
            txt_name = 'index.txt'
            if opts.txt_output_formatting.lower() == 'textile':
                txt_name = 'index.text'
            # NOTE(review): tf is used as a path by shutil.copy below, so
            # TemporaryFile presumably yields a file name - confirm.
            with TemporaryFile(txt_name) as tf:
                TXTOutput.convert(self, oeb_book, tf, input_plugin, opts, log)
                shutil.copy(tf, os.path.join(tdir, txt_name))

            # Images
            for item in oeb_book.manifest:
                if item.media_type in OEB_IMAGES:
                    if hasattr(self.writer, 'images'):
                        # The writer tracks which images it referenced and
                        # under what name; skip everything else.
                        path = os.path.join(tdir, 'images')
                        if item.href in self.writer.images:
                            href = self.writer.images[item.href]
                        else:
                            continue
                    else:
                        path = os.path.join(tdir, os.path.dirname(item.href))
                        href = os.path.basename(item.href)
                    if not os.path.exists(path):
                        os.makedirs(path)
                    with open(os.path.join(path, href), 'wb') as imgf:
                        imgf.write(item.data)

            # Metadata
            with open(os.path.join(tdir, 'metadata.opf'), 'wb') as mdataf:
                mdataf.write(etree.tostring(oeb_book.metadata.to_opf1()))

            # Close the archive explicitly instead of relying on garbage
            # collection to finalise it.
            txtz = ZipFile(output_path, 'w')
            try:
                txtz.add_dir(tdir)
            finally:
                txtz.close()
|
||||
1330
ebook_converter/ebooks/conversion/plumber.py
Normal file
1330
ebook_converter/ebooks/conversion/plumber.py
Normal file
File diff suppressed because it is too large
Load Diff
646
ebook_converter/ebooks/conversion/preprocess.py
Normal file
646
ebook_converter/ebooks/conversion/preprocess.py
Normal file
@@ -0,0 +1,646 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import functools, re, json
|
||||
from math import ceil
|
||||
|
||||
from calibre import entity_to_unicode, as_unicode
|
||||
from polyglot.builtins import unicode_type, range
|
||||
|
||||
# Matches an XML declaration at the very start of a document, optionally
# preceded by whitespace.
XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>')
SVG_NS = 'http://www.w3.org/2000/svg'
XLINK_NS = 'http://www.w3.org/1999/xlink'

# Entity converter used as a regex substitution callback. The five
# characters that are significant in markup map back to their entity
# form rather than to the bare character.
# NOTE(review): the exact semantics of result_exceptions are defined by
# calibre's entity_to_unicode - confirm against that function.
convert_entities = functools.partial(entity_to_unicode,
        result_exceptions={
            '<' : '&lt;',
            '>' : '&gt;',
            "'" : '&apos;',
            '"' : '&quot;',
            '&' : '&amp;',
})
_span_pat = re.compile('<span.*?</span>', re.DOTALL|re.IGNORECASE)

# Typographic ligature characters and the plain letters they expand to.
# The commented-out entries are currently disabled.
LIGATURES = {
#        '\u00c6': 'AE',
#        '\u00e6': 'ae',
#        '\u0152': 'OE',
#        '\u0153': 'oe',
#        '\u0132': 'IJ',
#        '\u0133': 'ij',
#        '\u1D6B': 'ue',
        '\uFB00': 'ff',
        '\uFB01': 'fi',
        '\uFB02': 'fl',
        '\uFB03': 'ffi',
        '\uFB04': 'ffl',
        '\uFB05': 'ft',
        '\uFB06': 'st',
        }

# Alternation of every enabled ligature character.
_ligpat = re.compile('|'.join(LIGATURES))
|
||||
|
||||
|
||||
def sanitize_head(match):
    '''
    Regex substitution callback that rebuilds a ``<head>`` element.

    Takes the first captured group, removes every ``<span ...</span>``
    run from it and wraps what is left in a plain ``<head>`` tag pair.
    '''
    cleaned = _span_pat.sub('', match.group(1))
    return '<head>\n%s\n</head>' % cleaned
|
||||
|
||||
|
||||
def chap_head(match):
    '''
    Regex substitution callback that marks up a chapter heading.

    The ``chap`` group becomes an ``<h1>``. When the ``title`` group
    matched something non-empty it follows as an ``<h3>``; otherwise the
    ``<h1>`` is followed by a line break.
    '''
    chapter, heading = match.group('chap'), match.group('title')
    if heading:
        return ''.join(('<h1>', chapter, '</h1>\n<h3>', heading, '</h3>\n'))
    return ''.join(('<h1>', chapter, '</h1><br/>\n'))
|
||||
|
||||
|
||||
def wrap_lines(match):
    '''
    Regex substitution callback that joins two wrapped lines.

    Returns a single space, preceded by the text of the ``ital`` group
    when that group matched something non-empty.
    '''
    closing_tag = match.group('ital')
    return closing_tag + ' ' if closing_tag else ' '
|
||||
|
||||
|
||||
def smarten_punctuation(html, log=None):
    '''
    Run ``html`` through smartyPants and return the result with entities
    substituted.

    HTML comment delimiters are swapped for unique placeholder tokens
    before processing and restored afterwards, so smartyPants never sees
    ``<!--`` or ``-->``.

    :param html: The markup to process
    :param log: Logger passed to ``HeuristicProcessor``
    '''
    from calibre.utils.smartypants import smartyPants
    from calibre.ebooks.chardet import substitute_entites
    from calibre.ebooks.conversion.utils import HeuristicProcessor
    preprocessor = HeuristicProcessor(log=log)
    from uuid import uuid4

    comment_open = 'calibre-smartypants-' + unicode_type(uuid4())
    comment_close = 'calibre-smartypants-' + unicode_type(uuid4())

    protected = html.replace('<!--', comment_open).replace('-->', comment_close)
    smartened = smartyPants(preprocessor.fix_nbsp_indents(protected))
    restored = smartened.replace(comment_open, '<!--').replace(comment_close, '-->')
    return substitute_entites(restored)
|
||||
|
||||
|
||||
class DocAnalysis(object):
    '''
    Provides various text analysis functions to determine how the document is structured.
    format is the type of document analysis will be done against.
    raw is the raw text to determine the line length to use for wrapping.
    Blank lines are excluded from analysis
    '''

    # One "line" extractor per supported format, compiled once.
    _LINE_PATTERNS = {
        'html': re.compile(r'(?<=<p)(?![^>]*>\s*</p>).*?(?=</p>)', re.DOTALL),
        'pdf': re.compile(r'(?<=<br>)(?!\s*<br>).*?(?=<br>)', re.DOTALL),
        'spanned_html': re.compile('(?<=<span).*?(?=</span>)', re.DOTALL),
        'txt': re.compile('.*?\n'),
    }

    def __init__(self, format='html', raw=''):
        '''
        Split ``raw`` into lines according to ``format``.

        :param format: One of 'html', 'pdf', 'spanned_html' or 'txt'
        :param raw: The document text
        :raises ValueError: If ``format`` is not one of the supported values
        '''
        try:
            linere = self._LINE_PATTERNS[format]
        except KeyError:
            raise ValueError('Unknown document format: %r' % (format,))
        # A non-breaking space entity is six characters long but displays
        # as one, so collapse it before measuring line lengths.
        raw = raw.replace('&nbsp;', ' ')
        self.lines = linere.findall(raw)

    def line_length(self, percent):
        '''
        Analyses the document to pick a representative line length.
        percentage is a decimal number, 0 - 1 which is used to determine
        how far in the list of line lengths to use. The list of line lengths is
        ordered smallest to largest and does not include duplicates. 0.5 is the
        median value.

        Lengths more than twice the average distinct length are discarded
        first. Returns 0 when the document has no non-empty lines.
        '''
        lengths = {len(line) for line in self.lines if line}
        if not lengths:
            return 0

        avg = sum(lengths) / len(lengths)
        max_line = ceil(avg * 2)
        # The shortest length never exceeds the average, so this list is
        # never empty.
        lengths = sorted(l for l in lengths if l <= max_line)

        percent = min(max(percent, 0), 1)

        # Clamp at 0: without it a small percent yields index -1, which
        # wraps around and selects the *longest* line instead of the
        # shortest.
        index = max(int(len(lengths) * percent) - 1, 0)
        return lengths[index]

    def line_histogram(self, percent):
        '''
        Creates a broad histogram of the document to determine whether it incorporates hard
        line breaks. Lines are sorted into 20 'buckets' based on length.
        percent is the percentage of lines that should be in a single bucket to return true
        The majority of the lines will exist in 1-2 buckets in typical docs with hard line breaks
        '''
        minLineLength = 20  # Ignore lines under 20 chars (typical of spaces)
        maxLineLength = 1900  # Discard larger than this to stay in range
        buckets = 20  # Each line is divided into a bucket based on length

        # Build the line length histogram; each bucket is 100 chars wide.
        hRaw = [0] * buckets
        for line in self.lines:
            l = len(line)
            if minLineLength < l < maxLineLength:
                hRaw[l // 100] += 1

        # Share of *all* lines (including the ignored ones) that fall in
        # the fullest bucket.
        totalLines = len(self.lines)
        maxValue = float(max(hRaw)) / totalLines if totalLines > 0 else 0

        return not maxValue < percent
|
||||
|
||||
|
||||
class Dehyphenator(object):
    '''
    Analyzes words to determine whether hyphens should be retained/removed.  Uses the document
    itself is as a dictionary. This method handles all languages along with uncommon, made-up, and
    scientific words. The primary disadvantage is that words appearing only once in the document
    retain hyphens.
    '''

    def __init__(self, verbose=0, log=None):
        '''
        :param verbose: Verbosity level; lookups are logged when it is above 2
        :param log: Callable used for those log messages
        '''
        self.log = log
        self.verbose = verbose
        # Add common suffixes to the regex below to increase the likelihood of a match -
        # don't add suffixes which are also complete words, such as 'able' or 'sex'
        # only remove if it's not already the point of hyphenation
        self.suffix_string = (
            "((ed)?ly|'?e?s||a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|"
            "(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|m?ents?|ism|ated|"
            "(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex|ian)$")
        self.suffixes = re.compile(r"^%s" % self.suffix_string, re.IGNORECASE)
        self.removesuffixes = re.compile(r"%s" % self.suffix_string, re.IGNORECASE)
        # remove prefixes if the prefix was not already the point of hyphenation
        self.prefix_string = '^(dis|re|un|in|ex)'
        self.prefixes = re.compile(r'%s$' % self.prefix_string, re.IGNORECASE)
        self.removeprefix = re.compile(r'%s' % self.prefix_string, re.IGNORECASE)

    def dehyphenate(self, match):
        '''
        Substitution callback deciding the fate of one hyphenated word.

        Strips a common suffix/prefix from the joined word and looks the
        remainder up in the document text stored by ``__call__``. Returns
        the joined word when it is found and otherwise the hyphenated
        form; in the cleanup formats an em dash replaces the hyphen when
        neither form occurs in the document.
        '''
        firsthalf = match.group('firstpart')
        secondhalf = match.group('secondpart')
        try:
            wraptags = match.group('wraptags')
        except IndexError:
            # Not every pattern built in __call__ defines this group.
            wraptags = ''
        if wraptags is None:
            # Group is defined but optional and did not participate.
            wraptags = ''
        hyphenated = unicode_type(firsthalf) + "-" + unicode_type(secondhalf)
        dehyphenated = unicode_type(firsthalf) + unicode_type(secondhalf)
        if self.suffixes.match(secondhalf) is None:
            lookupword = self.removesuffixes.sub('', dehyphenated)
        else:
            lookupword = dehyphenated
        if len(firsthalf) > 4 and self.prefixes.match(firsthalf) is None:
            lookupword = self.removeprefix.sub('', lookupword)
        if self.verbose > 2:
            self.log("lookup word is: "+lookupword+", orig is: " + hyphenated)
        try:
            searchresult = self.html.find(lookupword.lower())
        except Exception:
            # Best effort: if the lookup itself fails, keep the hyphen.
            return hyphenated
        if self.format == 'html_cleanup' or self.format == 'txt_cleanup':
            if self.html.find(lookupword) != -1 or searchresult != -1:
                if self.verbose > 2:
                    self.log(" Cleanup:returned dehyphenated word: " + dehyphenated)
                return dehyphenated
            elif self.html.find(hyphenated) != -1:
                if self.verbose > 2:
                    self.log(" Cleanup:returned hyphenated word: " + hyphenated)
                return hyphenated
            else:
                if self.verbose > 2:
                    self.log(" Cleanup:returning original text "+firsthalf+" + linefeed "+secondhalf)
                return firsthalf+'\u2014'+wraptags+secondhalf

        else:
            if self.format == 'individual_words' and len(firsthalf) + len(secondhalf) <= 6:
                if self.verbose > 2:
                    self.log("too short, returned hyphenated word: " + hyphenated)
                return hyphenated
            if len(firsthalf) <= 2 and len(secondhalf) <= 2:
                if self.verbose > 2:
                    self.log("too short, returned hyphenated word: " + hyphenated)
                return hyphenated
            if self.html.find(lookupword) != -1 or searchresult != -1:
                if self.verbose > 2:
                    self.log(" returned dehyphenated word: " + dehyphenated)
                return dehyphenated
            else:
                if self.verbose > 2:
                    self.log(" returned hyphenated word: " + hyphenated)
                return hyphenated

    def __call__(self, html, format, length=1):
        '''
        Dehyphenate ``html`` and return the result.

        :param html: The document text; it also serves as the dictionary
            for word lookups
        :param format: One of 'html', 'pdf', 'txt', 'individual_words',
            'html_cleanup' or 'txt_cleanup'; selects the pattern used to
            find hyphenated words
        :param length: Minimum number of characters that must precede a
            hyphenated word (used by the 'html', 'pdf' and 'txt' patterns)
        :raises ValueError: If ``format`` is not one of the values above
        '''
        self.html = html
        self.format = format
        if format == 'html':
            intextmatch = re.compile((
                r'(?<=.{%i})(?P<firstpart>[^\W\-]+)(-|‐)\s*(?=<)(?P<wraptags>(</span>)?'
                r'\s*(</[iubp]>\s*){1,2}(?P<up2threeblanks><(p|div)[^>]*>\s*(<p[^>]*>\s*</p>\s*)'
                r'?</(p|div)>\s+){0,3}\s*(<[iubp][^>]*>\s*){1,2}(<span[^>]*>)?)\s*(?P<secondpart>[\w\d]+)') % length)
        elif format == 'pdf':
            intextmatch = re.compile((
                r'(?<=.{%i})(?P<firstpart>[^\W\-]+)(-|‐)\s*(?P<wraptags><p>|'
                r'</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)')% length)
        elif format == 'txt':
            intextmatch = re.compile(
                '(?<=.{%i})(?P<firstpart>[^\\W\\-]+)(-|‐)(\u0020|\u0009)*(?P<wraptags>(\n(\u0020|\u0009)*)+)(?P<secondpart>[\\w\\d]+)'% length)
        elif format == 'individual_words':
            intextmatch = re.compile(
                r'(?!<)(?P<firstpart>[^\W\-]+)(-|‐)\s*(?P<secondpart>\w+)(?![^<]*?>)', re.UNICODE)
        elif format == 'html_cleanup':
            intextmatch = re.compile(
                r'(?P<firstpart>[^\W\-]+)(-|‐)\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>'
                r'\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)')
        elif format == 'txt_cleanup':
            intextmatch = re.compile(
                r'(?P<firstpart>[^\W\-]+)(-|‐)(?P<wraptags>\s+)(?P<secondpart>[\w\d]+)')
        else:
            raise ValueError('Unknown dehyphenation format: %r' % (format,))

        html = intextmatch.sub(self.dehyphenate, html)
        return html
|
||||
|
||||
|
||||
class CSSPreProcessor(object):
    '''
    Cleans up CSS text: strips Microsoft-specific declarations and can
    optionally insert the XHTML namespace rule.
    '''

    # Remove some of the broken CSS Microsoft products
    # create
    MS_PAT = re.compile(r'''
        (?P<start>^|;|\{)\s*    # The end of the previous rule or block start
        (%s).+?                 # The invalid selectors
        (?P<end>$|;|\})         # The end of the declaration
        '''%'mso-|panose-|text-underline|tab-interval',
        re.MULTILINE|re.IGNORECASE|re.VERBOSE)

    def ms_sub(self, match):
        '''
        Substitution callback for ``MS_PAT``: drops the matched
        declaration, keeping the delimiter in front of it and a closing
        brace behind it (a trailing ``;`` is dropped).
        '''
        end = match.group('end')
        try:
            start = match.group('start')
        except IndexError:
            start = ''
        if end == ';':
            end = ''
        return start + end

    def __call__(self, data, add_namespace=False):
        '''
        Return ``data`` with Microsoft-specific declarations removed.

        :param data: The CSS text
        :param add_namespace: When true, comments are stripped as well and
            ``XHTML_CSS_NAMESPACE`` is inserted before the first line that
            is neither blank nor an ``@import`` / ``@charset`` rule
        '''
        data = self.MS_PAT.sub(self.ms_sub, data)
        if not add_namespace:
            return data

        # Only needed on this path, so import it only here.
        from calibre.ebooks.oeb.base import XHTML_CSS_NAMESPACE

        # Remove comments as the following namespace logic will break if there
        # are commented lines before the first @import or @charset rule. Since
        # the conversion will remove all stylesheets anyway, we don't lose
        # anything
        data = re.sub(unicode_type(r'/\*.*?\*/'), '', data, flags=re.DOTALL)

        ans, namespaced = [], False
        for line in data.splitlines():
            ll = line.lstrip()
            if not namespaced and ll and not ll.startswith(('@import', '@charset')):
                ans.append(XHTML_CSS_NAMESPACE.strip())
                namespaced = True
            ans.append(line)

        return '\n'.join(ans)
|
||||
|
||||
|
||||
def accent_regex(accent_maps, letter_before=False):
    '''
    Build a (pattern, substitution function) pair that merges a
    free-standing accent character with an adjacent letter.

    :param accent_maps: Mapping of accent character to a string of the
        form ``'plain:accented'``, where both halves have the same length
        and are paired up position by position. It is not modified.
    :param letter_before: If true the letter is expected before the
        accent, otherwise after it
    :return: ``(pat, sub)``. ``pat`` matches an accent and a letter
        separated by optional whitespace and at most one ``<br>`` tag;
        ``sub`` returns the combined letter, or the matched text
        unchanged when that accent has no mapping for that letter.
    :raises ValueError: If the two halves of a mapping differ in length
    '''
    # Work on a private copy: overwriting the caller's values would make a
    # second call with the same dict fail.
    letter_maps = {}
    letters = set()

    for accent, spec in accent_maps.items():
        k, v = spec.split(':', 1)
        if len(k) != len(v):
            raise ValueError('Invalid mapping for: {} -> {}'.format(k, v))
        letter_maps[accent] = lmap = dict(zip(k, v))
        letters |= set(lmap)

    # Escape each character so that ones with a special meaning inside a
    # character class (^, ], \, -) are matched literally.
    accent_cls = ''.join(re.escape(c) for c in sorted(letter_maps))
    letter_cls = ''.join(re.escape(c) for c in sorted(letters))

    if letter_before:
        args = letter_cls, accent_cls
        accent_group, letter_group = 2, 1
    else:
        args = accent_cls, letter_cls
        accent_group, letter_group = 1, 2

    pat = re.compile(r'([{}])\s*(?:<br[^>]*>){{0,1}}\s*([{}])'.format(*args), re.UNICODE)

    def sub(m):
        lmap = letter_maps[m.group(accent_group)]
        return lmap.get(m.group(letter_group)) or m.group()

    return pat, sub
|
||||
|
||||
|
||||
def html_preprocess_rules():
    '''
    Return the list of (pattern, replacement) rules applied to every HTML
    document. The list is built on first use and cached as an attribute
    of this function.
    '''
    cached = getattr(html_preprocess_rules, 'ans', None)
    if cached is not None:
        return cached

    # Remove huge block of contiguous spaces as they slow down
    # the following regexes pretty badly
    strip_space_runs = (re.compile(r'\s{10000,}'), '')
    # Some idiotic HTML generators (Frontpage I'm looking at you)
    # Put all sorts of crap into <head>. This messes up lxml
    clean_head = (
        re.compile(r'<head[^>]*>\n*(.*?)\n*</head>', re.IGNORECASE|re.DOTALL),
        sanitize_head)
    # Convert all entities, since lxml doesn't handle them well
    decode_entities = (re.compile(r'&(\S+?);'), convert_entities)
    # Remove the <![if/endif tags inserted by everybody's darling, MS Word
    strip_word_conditionals = (
        re.compile(r'</{0,1}!\[(end){0,1}if\]{0,1}>', re.IGNORECASE), '')

    rules = [
        strip_space_runs,
        clean_head,
        decode_entities,
        strip_word_conditionals,
    ]
    html_preprocess_rules.ans = rules
    return rules
|
||||
|
||||
|
||||
def pdftohtml_rules():
    '''
    Return the list of (pattern, replacement) rules applied to HTML that
    was produced by pdftohtml. The list is built on first use and cached
    as an attribute of this function.
    '''
    cached = getattr(pdftohtml_rules, 'ans', None)
    if cached is not None:
        return cached

    rules = []

    # Merge free-standing accents with the letter that follows them
    rules.append(accent_regex({
        '¨': 'aAeEiIoOuU:äÄëËïÏöÖüÜ',
        '`': 'aAeEiIoOuU:àÀèÈìÌòÒùÙ',
        '´': 'aAcCeEiIlLoOnNrRsSuUzZ:áÁćĆéÉíÍĺĹóÓńŃŕŔśŚúÚźŹ',
        'ˆ': 'aAeEiIoOuU:âÂêÊîÎôÔûÛ',
        '¸': 'cC:çÇ',
        '˛': 'aAeE:ąĄęĘ',
        '˙': 'zZ:żŻ',
        'ˇ': 'cCdDeElLnNrRsStTzZ:čČďĎěĚľĽňŇřŘšŠťŤžŽ',
        '°': 'uU:ůŮ',
    }))

    # Same for a grave accent that follows its letter
    rules.append(accent_regex({'`': 'aAeEiIoOuU:àÀèÈìÌòÒùÙ'}, letter_before=True))

    # If pdf printed from a browser then the header/footer has a reliable pattern
    rules.append((re.compile(r'((?<=</a>)\s*file:/{2,4}[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''))

    # Center separator lines
    rules.append((re.compile(r'<br>\s*(?P<break>([*#•✦=] *){3,})\s*<br>'), lambda match: '<p>\n<p style="text-align:center">' + match.group('break') + '</p>'))

    # Remove <hr> tags
    rules.append((re.compile(r'<hr.*?>', re.IGNORECASE), ''))

    # Remove gray background
    rules.append((re.compile(r'<BODY[^<>]+>'), '<BODY>'))

    # Convert line breaks to paragraphs
    rules.append((re.compile(r'<br[^>]*>\s*'), '</p>\n<p>'))
    rules.append((re.compile(r'<body[^>]*>\s*'), '<body>\n<p>'))
    rules.append((re.compile(r'\s*</body>'), '</p>\n</body>'))

    # Clean up spaces
    rules.append((re.compile(r'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), ' '))
    # Add space before and after italics
    rules.append((re.compile(r'(?<!“)<i>'), ' <i>'))
    rules.append((re.compile(r'</i>(?=\w)'), '</i> '))

    pdftohtml_rules.ans = rules
    return rules
|
||||
|
||||
|
||||
def book_designer_rules():
    '''
    Return the list of (pattern, replacement) rules applied to HTML
    produced by Book Designer. The list is built on first use and cached
    as an attribute of this function.
    '''
    ans = getattr(book_designer_rules, 'ans', None)
    if ans is None:
        ans = book_designer_rules.ans = [
            # HR
            (re.compile('<hr>', re.IGNORECASE),
             lambda match : '<span style="page-break-after:always"> </span>'),
            # Create header tags
            # Group 3 captures the heading text; the replacement needs it
            # to re-emit the heading.
            (re.compile(r'<h2[^><]*?id=BookTitle[^><]*?(align=)*(?(1)(\w+))*[^><]*?>([^><]*?)</h2>', re.IGNORECASE),
             lambda match : '<h1 id="BookTitle" align="%s">%s</h1>'%(match.group(2) if match.group(2) else 'center', match.group(3))),
            (re.compile(r'<h2[^><]*?id=BookAuthor[^><]*?(align=)*(?(1)(\w+))*[^><]*?>([^><]*?)</h2>', re.IGNORECASE),
             lambda match : '<h2 id="BookAuthor" align="%s">%s</h2>'%(match.group(2) if match.group(2) else 'center', match.group(3))),
            (re.compile('<span[^><]*?id=title[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
             lambda match : '<h2 class="title">%s</h2>'%(match.group(1),)),
            (re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
             lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)),
        ]
    # Hand the rules back to the caller; returning None here made callers
    # fail as soon as they tried to use the result as a list.
    return ans
|
||||
|
||||
|
||||
class HTMLPreProcessor(object):
    '''
    Applies regex based clean-up rules, user supplied search & replace
    rules and a number of optional transformations to raw HTML before it
    is parsed.
    '''

    _BAEN_PAT = re.compile(r'<meta\s+name="Publisher"\s+content=".*?Baen.*?"',
                           re.IGNORECASE)

    def __init__(self, log=None, extra_opts=None, regex_wizard_callback=None):
        '''
        :param log: Logger used to report failing search & replace rules
        :param extra_opts: Object whose attributes switch the optional
            processing steps on and off; may be None
        :param regex_wizard_callback: If not None, called with
            ``(self.current_href, html)`` after the generic rules ran
        '''
        self.log = log
        self.extra_opts = extra_opts
        self.regex_wizard_callback = regex_wizard_callback
        self.current_href = None

    def is_baen(self, src):
        '''Return True if a Publisher meta tag in ``src`` mentions Baen.'''
        return self._BAEN_PAT.search(src) is not None

    def is_book_designer(self, raw):
        '''Return True if ``raw`` contains a Book Designer title heading.'''
        return re.search('<H2[^><]*id=BookTitle', raw) is not None

    def is_pdftohtml(self, src):
        '''Return True if the pdftohtml marker comment is in the first 1000 characters.'''
        return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]

    def __call__(self, html, remove_special_chars=None,
            get_preprocess_html=False):
        '''
        Preprocess ``html`` and return the result.

        :param html: The markup to process
        :param remove_special_chars: Optional compiled pattern; everything
            it matches is deleted first
        :param get_preprocess_html: If true, return right after the
            generic rules, before the source specific and user supplied
            rules are applied
        '''
        if remove_special_chars is not None:
            html = remove_special_chars.sub('', html)
        html = html.replace('\0', '')
        is_pdftohtml = self.is_pdftohtml(html)
        # The rule functions cache and return one shared list each. Copy
        # it, because user rules are inserted below and must not leak into
        # later calls. ``or ()`` tolerates a rule function returning None.
        if self.is_baen(html):
            rules = []
        elif self.is_book_designer(html):
            rules = list(book_designer_rules() or ())
        elif is_pdftohtml:
            rules = list(pdftohtml_rules() or ())
        else:
            rules = []

        if not getattr(self.extra_opts, 'keep_ligatures', False):
            html = _ligpat.sub(lambda m:LIGATURES[m.group()], html)

        # Maps each user rule to the pattern text it was compiled from, so
        # a failing rule can be reported and skipped.
        user_sr_rules = {}

        # Function for processing search and replace
        def do_search_replace(search_pattern, replace_txt):
            from calibre.ebooks.conversion.search_replace import compile_regular_expression
            try:
                search_re = compile_regular_expression(search_pattern)
                if not replace_txt:
                    replace_txt = ''
                rules.insert(0, (search_re, replace_txt))
                user_sr_rules[(search_re, replace_txt)] = search_pattern
            except Exception as e:
                self.log.error('Failed to parse %r regexp because %s' %
                        (search_pattern, as_unicode(e)))

        # search / replace using the sr?_search / sr?_replace options
        for i in range(1, 4):
            search, replace = 'sr%d_search'%i, 'sr%d_replace'%i
            search_pattern = getattr(self.extra_opts, search, '')
            replace_txt = getattr(self.extra_opts, replace, '')
            if search_pattern:
                do_search_replace(search_pattern, replace_txt)

        # multi-search / replace using the search_replace option
        search_replace = getattr(self.extra_opts, 'search_replace', None)
        if search_replace:
            search_replace = json.loads(search_replace)
            # Each rule is inserted at the front, so iterate in reverse to
            # keep the user's order.
            for search_pattern, replace_txt in reversed(search_replace):
                do_search_replace(search_pattern, replace_txt)

        end_rules = []
        # delete soft hyphens - moved here so it's executed after header/footer removal
        if is_pdftohtml:
            # unwrap/delete soft hyphens
            # (U+00AD is written as an escape because the literal
            # character is invisible)
            end_rules.append((re.compile(
                r'[\u00ad](</p>\s*<p>\s*)+\s*(?=[\[a-z\d])'), lambda match: ''))
            # unwrap/delete soft hyphens with formatting
            end_rules.append((re.compile(
                r'[\u00ad]\s*(</(i|u|b)>)+(</p>\s*<p>\s*)+\s*(<(i|u|b)>)+\s*(?=[\[a-z\d])'), lambda match: ''))

        length = -1
        if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
            docanalysis = DocAnalysis('pdf', html)
            length = docanalysis.line_length(getattr(self.extra_opts, 'unwrap_factor'))
            if length:
                # unwrap em/en dashes
                end_rules.append((re.compile(
                    r'(?<=.{%i}[–—])\s*<p>\s*(?=[\[a-z\d])' % length), lambda match: ''))
                end_rules.append(
                    # Un wrap using punctuation
                    (re.compile((
                        r'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\\IAß]'
                        r'|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?'
                        r'\s*[\w\d$(])') % length, re.UNICODE), wrap_lines),
                )

        for rule in html_preprocess_rules():
            html = rule[0].sub(rule[1], html)

        if self.regex_wizard_callback is not None:
            self.regex_wizard_callback(self.current_href, html)

        if get_preprocess_html:
            return html

        # Debugging aid; the calls below are normally commented out.
        def dump(raw, where):
            import os
            dp = getattr(self.extra_opts, 'debug_pipeline', None)
            if dp and os.path.exists(dp):
                odir = os.path.join(dp, 'input')
                if os.path.exists(odir):
                    odir = os.path.join(odir, where)
                    if not os.path.exists(odir):
                        os.makedirs(odir)
                    name, i = None, 0
                    while not name or os.path.exists(os.path.join(odir, name)):
                        i += 1
                        name = '%04d.html'%i
                    with open(os.path.join(odir, name), 'wb') as f:
                        f.write(raw.encode('utf-8'))

        # dump(html, 'pre-preprocess')

        for rule in rules + end_rules:
            try:
                html = rule[0].sub(rule[1], html)
            except Exception as e:
                # A broken user rule is reported and skipped; a broken
                # built-in rule is a bug and propagates.
                if rule in user_sr_rules:
                    self.log.error(
                        'User supplied search & replace rule: %s -> %s '
                        'failed with error: %s, ignoring.'%(
                            user_sr_rules[rule], rule[1], e))
                else:
                    raise

        if is_pdftohtml and length > -1:
            # Dehyphenate
            dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
            html = dehyphenator(html,'html', length)

        if is_pdftohtml:
            from calibre.ebooks.conversion.utils import HeuristicProcessor
            pdf_markup = HeuristicProcessor(self.extra_opts, None)
            totalwords = 0
            if pdf_markup.get_word_count(html) > 7000:
                html = pdf_markup.markup_chapters(html, totalwords, True)

        # dump(html, 'post-preprocess')

        # Handle broken XHTML w/ SVG (ugh)
        if 'svg:' in html and SVG_NS not in html:
            html = html.replace(
                '<html', '<html xmlns:svg="%s"' % SVG_NS, 1)
        if 'xlink:' in html and XLINK_NS not in html:
            html = html.replace(
                '<html', '<html xmlns:xlink="%s"' % XLINK_NS, 1)

        html = XMLDECL_RE.sub('', html)

        if getattr(self.extra_opts, 'asciiize', False):
            from calibre.utils.localization import get_udc
            from calibre.utils.mreplace import MReplace
            unihandecoder = get_udc()
            mr = MReplace(data={'«':'<'*3, '»':'>'*3})
            html = mr.mreplace(html)
            html = unihandecoder.decode(html)

        if getattr(self.extra_opts, 'enable_heuristics', False):
            from calibre.ebooks.conversion.utils import HeuristicProcessor
            preprocessor = HeuristicProcessor(self.extra_opts, self.log)
            html = preprocessor(html)

        if is_pdftohtml:
            html = html.replace('<!-- created by calibre\'s pdftohtml -->', '')

        if getattr(self.extra_opts, 'smarten_punctuation', False):
            html = smarten_punctuation(html, self.log)

        try:
            unsupported_unicode_chars = self.extra_opts.output_profile.unsupported_unicode_chars
        except AttributeError:
            unsupported_unicode_chars = ''
        if unsupported_unicode_chars:
            from calibre.utils.localization import get_udc
            unihandecoder = get_udc()
            for char in unsupported_unicode_chars:
                asciichar = unihandecoder.decode(char)
                html = html.replace(char, asciichar)

        return html
|
||||
881
ebook_converter/ebooks/conversion/utils.py
Normal file
881
ebook_converter/ebooks/conversion/utils.py
Normal file
@@ -0,0 +1,881 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import re
|
||||
from math import ceil
|
||||
from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
|
||||
from calibre.utils.logging import default_log
|
||||
from calibre.utils.wordcount import get_wordcount_obj
|
||||
from polyglot.builtins import unicode_type
|
||||
|
||||
|
||||
class HeuristicProcessor(object):
|
||||
|
||||
def __init__(self, extra_opts=None, log=None):
    '''
    Initialise the counters, state flags and regular expression building
    blocks shared by the processing methods of this class.

    :param extra_opts: options object, stored as-is on the instance
    :param log: logger to use; ``default_log`` is used when None
    '''
    if log is None:
        log = default_log
    self.log = log
    self.extra_opts = extra_opts

    # Counters updated while a document is being processed
    self.html_preprocess_sections = 0
    self.found_indents = 0
    self.totalwords = 0
    self.min_chapters = 1
    self.chapters_no_title = 0
    self.chapters_with_title = 0

    # State flags
    self.deleted_nbsps = False
    self.blanks_deleted = False
    self.blanks_between_paragraphs = False

    # Compiled patterns for paragraph contents and empty paragraphs
    ignore_case = re.IGNORECASE
    self.linereg = re.compile('(?<=<p).*?(?=</p>)', ignore_case|re.DOTALL)
    self.blankreg = re.compile(r'\s*(?P<openline><p(?!\sclass=\"(softbreak|whitespace)\")[^>]*>)\s*(?P<closeline></p>)', ignore_case)
    self.anyblank = re.compile(r'\s*(?P<openline><p[^>]*>)\s*(?P<closeline></p>)', ignore_case)
    self.multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>(\s*<div[^>]*>\s*</div>\s*)*){2,}(?!\s*<h\d)', ignore_case)
    self.any_multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>(\s*<div[^>]*>\s*</div>\s*)*){2,}', ignore_case)
    self.single_blank = re.compile(r'(\s*<(p|div)[^>]*>\s*</(p|div)>)', ignore_case)

    # Uncompiled pattern fragments, concatenated into larger expressions by
    # the methods that use them
    self.line_open = (
        r"<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*"
        r"(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*")
    self.line_close = "(</(?P=inner3)>)?\\s*(</(?P=inner2)>)?\\s*(</(?P=inner1)>)?\\s*</(?P=outer)>"
    self.common_in_text_endings = '[\"\'—’”,\\.!\\?\\…\\)„\\w]'
    self.common_in_text_beginnings = '[\\w\'\"“‘‛]'

    # Opening tag used for every scene break paragraph that gets generated
    self.scene_break_open = '<p class="scenebreak" style="text-align:center; text-indent:0%; margin-top:1em; margin-bottom:1em; page-break-before:avoid">'
|
||||
|
||||
def is_pdftohtml(self, src):
    '''
    Return True when the marker comment written by calibre's pdftohtml is
    present in the first 1000 characters of src.
    '''
    marker = '<!-- created by calibre\'s pdftohtml -->'
    head = src[:1000]
    return head.find(marker) != -1
|
||||
|
||||
def is_abbyy(self, src):
    '''
    Return True when an ABBYY FineReader generator <meta> tag is present in
    the first 1000 characters of src.
    '''
    marker = '<meta name="generator" content="ABBYY FineReader'
    head = src[:1000]
    return head.find(marker) != -1
|
||||
|
||||
def chapter_head(self, match):
    '''
    re.sub callback which turns a detected chapter line (and its optional
    title line) into heading markup.

    :param match: match object providing the named groups ``chap`` and
                  ``title``
    :return: ``<h2>`` markup for the chapter; when a title was matched the
             h2 gets a ``title`` attribute built from the plain text of
             both lines and is followed by an ``<h3>`` holding the title

    Each call increments ``self.html_preprocess_sections``.
    '''
    from calibre.utils.html2text import html2text
    chap = match.group('chap')
    title = match.group('title')
    if not title:
        self.html_preprocess_sections = self.html_preprocess_sections + 1
        self.log.debug("marked " + unicode_type(self.html_preprocess_sections) +
                " chapters. - " + unicode_type(chap))
        return '<h2>'+chap+'</h2>\n'

    delete_whitespace = re.compile('^\\s*(?P<c>.*?)\\s*$')
    # Character class: strip every single or double quote. The text ends up
    # inside a double quoted title="..." attribute, so a stray quote would
    # break the generated tag.
    delete_quotes = re.compile('[\'\"]')
    txt_chap = delete_quotes.sub('', delete_whitespace.sub('\\g<c>', html2text(chap)))
    txt_title = delete_quotes.sub('', delete_whitespace.sub('\\g<c>', html2text(title)))
    self.html_preprocess_sections = self.html_preprocess_sections + 1
    self.log.debug("marked " + unicode_type(self.html_preprocess_sections) +
            " chapters & titles. - " + unicode_type(chap) + ", " + unicode_type(title))
    return '<h2 title="'+txt_chap+', '+txt_title+'">'+chap+'</h2>\n<h3 class="sigilNotInTOC">'+title+'</h3>\n'
|
||||
|
||||
def chapter_break(self, match):
    '''
    re.sub callback which re-emits the opening tag captured in the
    ``styles`` group with a forced page break, followed by the text of the
    ``section`` group. Increments ``self.html_preprocess_sections``.
    '''
    section = match.group('section')
    opening = match.group('styles')
    self.html_preprocess_sections += 1
    message = ("marked " + unicode_type(self.html_preprocess_sections) +
               " section markers based on punctuation. - " + unicode_type(section))
    self.log.debug(message)
    return ''.join(('<', opening, ' style="page-break-before:always">', section))
|
||||
|
||||
def analyze_title_matches(self, match):
    '''
    re.sub callback used purely for counting: bumps
    ``self.chapters_with_title`` when the ``title`` group matched some
    text and ``self.chapters_no_title`` otherwise. Returns None.
    '''
    if match.group('title'):
        self.chapters_with_title += 1
    else:
        self.chapters_no_title += 1
|
||||
|
||||
def insert_indent(self, match):
    '''
    re.sub callback which rebuilds an opening tag with a 3% text indent.

    Reads the named groups ``tagtype``, ``formatting`` (the existing
    attributes, may be empty) and ``span`` (optional opening span tags
    which are re-appended after the tag). Increments
    ``self.found_indents`` on every call.
    '''
    attrs = match.group('formatting')
    tag = match.group('tagtype')
    span = match.group('span')
    self.found_indents += 1

    if not attrs:
        attrs = 'style="text-indent:3%"'
    elif 'style' in attrs.lower():
        # extend the existing declaration, which is expected to end the
        # attribute text with a double quote
        attrs = re.sub(r'"$', '; text-indent:3%"', attrs)
    else:
        attrs = attrs + ' style="text-indent:3%"'

    opening = '<' + tag + ' ' + attrs + '>'
    if span:
        return opening + span
    return opening
|
||||
|
||||
def no_markup(self, raw, percent):
    '''
    Report whether raw has too few marked up line endings.

    The closing </p> and </div> tags in raw are counted and compared with
    the number of plain line breaks.

    :param raw: the text to inspect
    :param percent: fraction of the line breaks which must be matched by
                    closing tags; values outside 0..1 are clamped
    :return: True when the number of closing tags is lower than
             ``percent`` times the number of line breaks
    '''
    tot_htm_ends = len(re.findall('</(p|div)>', raw))
    # \r\n has to be tried first, otherwise a Windows line ending matches as
    # \r and then again as \n and is counted twice
    tot_ln_fds = len(re.findall('\r\n|\n|\r', raw))

    if percent > 1:
        percent = 1
    if percent < 0:
        percent = 0

    min_lns = tot_ln_fds * percent
    return min_lns > tot_htm_ends
|
||||
|
||||
def dump(self, raw, where):
    '''
    Debugging aid: write raw, UTF-8 encoded, to the next free
    ``NNNN.html`` file inside ``<debug_pipeline>/preprocess/<where>``.

    Nothing is written unless ``self.extra_opts.debug_pipeline`` is set
    and points to an existing path.
    '''
    import os
    debug_root = getattr(self.extra_opts, 'debug_pipeline', None)
    if not debug_root or not os.path.exists(debug_root):
        return

    target = os.path.join(debug_root, 'preprocess')
    if not os.path.exists(target):
        os.makedirs(target)
    if not os.path.exists(target):
        return

    target = os.path.join(target, where)
    if not os.path.exists(target):
        os.makedirs(target)

    # Find the first numbered file name which is not taken yet
    counter = 1
    fname = '%04d.html' % counter
    while os.path.exists(os.path.join(target, fname)):
        counter += 1
        fname = '%04d.html' % counter

    with open(os.path.join(target, fname), 'wb') as out:
        out.write(raw.encode('utf-8'))
|
||||
|
||||
def get_word_count(self, html):
    '''
    Return the ``words`` value reported by get_wordcount_obj() for html
    after the <head> section and all remaining tags have been removed.
    '''
    without_head = re.sub(r'(?s)<head[^>]*>.*?</head>', '', html)
    text_only = re.sub(r'<[^>]*>', '', without_head)
    return get_wordcount_obj(text_only).words
|
||||
|
||||
def markup_italicis(self, html):
    '''
    Wrap common abbreviations and plain-text emphasis in <i> tags.

    First every entry of ITALICIZE_WORDS which is delimited by whitespace
    or a tag boundary is italicized. Then the tag-free text of the document
    is searched for plain-text emphasis conventions (``*word*``,
    ``_word_``, ``/word/`` ...) and each occurrence found is replaced
    in html by ``<i>word</i>``.

    :param html: the markup to process
    :return: the processed markup
    '''
    ITALICIZE_WORDS = [
        'Etc.', 'etc.', 'viz.', 'ie.', 'i.e.', 'Ie.', 'I.e.', 'eg.',
        'e.g.', 'Eg.', 'E.g.', 'et al.', 'et cetera', 'n.b.', 'N.b.',
        'nota bene', 'Nota bene', 'Ste.', 'Mme.', 'Mdme.',
        'Mlle.', 'Mons.', 'PS.', 'PPS.',
    ]

    # Ordered from the most to the least specific delimiter so that e.g.
    # _*/text/*_ is consumed before the plain *text* pattern can see it
    ITALICIZE_STYLE_PATS = [
        r'(?msu)(?<=[\s>"“\'‘])_\*/(?P<words>[^\*_]+)/\*_',
        r'(?msu)(?<=[\s>"“\'‘])~~(?P<words>[^~]+)~~',
        r'(?msu)(?<=[\s>"“\'‘])_/(?P<words>[^/_]+)/_',
        r'(?msu)(?<=[\s>"“\'‘])_\*(?P<words>[^\*_]+)\*_',
        r'(?msu)(?<=[\s>"“\'‘])\*/(?P<words>[^/\*]+)/\*',
        r'(?msu)(?<=[\s>"“\'‘])/:(?P<words>[^:/]+):/',
        r'(?msu)(?<=[\s>"“\'‘])\|:(?P<words>[^:\|]+):\|',
        r'(?msu)(?<=[\s>"“\'‘])\*(?P<words>[^\*]+)\*',
        r'(?msu)(?<=[\s>"“\'‘])~(?P<words>[^~]+)~',
        r'(?msu)(?<=[\s>"“\'‘])/(?P<words>[^/\*><]+)/',
        r'(?msu)(?<=[\s>"“\'‘])_(?P<words>[^_]+)_',
    ]

    for word in ITALICIZE_WORDS:
        html = re.sub(r'(?<=\s|>)' + re.escape(word) + r'(?=\s|<)', '<i>%s</i>' % word, html)

    # Candidates are looked up in the text without any markup, the
    # replacement is then done on the real html
    search_text = re.sub(r'(?s)<head[^>]*>.*?</head>', '', html)
    search_text = re.sub(r'<[^>]*>', '', search_text)
    for pat in ITALICIZE_STYLE_PATS:
        for match in re.finditer(pat, search_text):
            # A literal str.replace is used on purpose: feeding the matched
            # text to re.sub as a replacement template would expand any
            # backslash sequence it contains (or fail on an invalid one).
            html = html.replace(match.group(0), '<i>%s</i>' % match.group('words'))

    return html
|
||||
|
||||
def markup_chapters(self, html, wordcount, blanks_between_paragraphs):
    '''
    Searches for common chapter headings throughout the document
    attempts multiple patterns based on likelihood of a match
    with minimum false positives. Exits after finding a successful pattern

    :param html: the markup to search
    :param wordcount: number of words in the document, used to derive the
                      minimum number of chapters to look for
    :param blanks_between_paragraphs: when true, up to two empty paragraphs
                      are allowed between a chapter line and its title line
    :return: html with the detected headings replaced by the output of
             chapter_head()

    Updates self.min_chapters and self.html_preprocess_sections.
    '''
    # Typical chapters are between 2000 and 7000 words, use the larger number to decide the
    # minimum of chapters to search for. A max limit is calculated to prevent things like OCR
    # or pdf page numbers from being treated as TOC markers
    max_chapters = 150
    typical_chapters = 7000.
    if wordcount > 7000:
        if wordcount > 200000:
            typical_chapters = 15000.
        self.min_chapters = int(ceil(wordcount / typical_chapters))
        self.log.debug("minimum chapters required are: "+unicode_type(self.min_chapters))
    # Headings which already exist count towards the number of sections
    heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
    self.html_preprocess_sections = len(heading.findall(html))
    self.log.debug("found " + unicode_type(self.html_preprocess_sections) + " pre-existing headings")

    # Build the Regular Expressions in pieces
    init_lookahead = "(?=<(p|div))"
    chapter_line_open = self.line_open
    title_line_open = (r"<(?P<outer2>p|div)[^>]*>\s*(<(?P<inner4>font|span|[ibu])[^>]*>)?"
                       r"\s*(<(?P<inner5>font|span|[ibu])[^>]*>)?\s*(<(?P<inner6>font|span|[ibu])[^>]*>)?\s*")
    chapter_header_open = r"(?P<chap>"
    title_header_open = r"(?P<title>"
    chapter_header_close = ")\\s*"
    title_header_close = ")"
    chapter_line_close = self.line_close
    title_line_close = "(</(?P=inner6)>)?\\s*(</(?P=inner5)>)?\\s*(</(?P=inner4)>)?\\s*</(?P=outer2)>"

    # For pdftohtml output the title line is matched as a bare <p>
    is_pdftohtml = self.is_pdftohtml(html)
    if is_pdftohtml:
        title_line_open = "<(?P<outer2>p)[^>]*>\\s*"
        title_line_close = "\\s*</(?P=outer2)>"

    if blanks_between_paragraphs:
        blank_lines = "(\\s*<p[^>]*>\\s*</p>){0,2}\\s*"
    else:
        blank_lines = ""
    opt_title_open = "("
    opt_title_close = ")?"
    n_lookahead_open = "(?!\\s*"
    n_lookahead_close = ")\\s*"

    default_title = r"(<[ibu][^>]*>)?\s{0,3}(?!Chapter)([\w\:\'’\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)"
    simple_title = r"(<[ibu][^>]*>)?\s{0,3}(?!(Chapter|\s+<)).{0,65}?(</[ibu][^>]*>)?(?=<)"

    # Filled by the analysis pass of recurse_patterns() below
    analysis_result = []

    # Each entry is:
    # [pattern, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name]
    chapter_types = [
        [(
            r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Epilogue|CHAPTER|Kapitel|Volume\b|Prologue|Book\b|Part\b|Dedication|Preface)"
            r"\s*([\d\w-]+\:?\'?\s*){0,5}"), True, True, True, False, "Searching for common section headings", 'common'],
        # Highest frequency headings which include titles
        [r"[^'\"]?(CHAPTER|Kapitel)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, True, True, False, "Searching for most common chapter headings", 'chapter'],
        # NOTE(review): in the next pattern `#-*` inside the character class is
        # parsed as a range from '#' to '*' - confirm that this is intended
        [r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•=]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)(</span>)?\s*</b>",
            True, True, True, False, "Searching for emphasized lines", 'emphasized'],  # Emphasized lines
        [r"[^'\"]?(\d+(\.|:))\s*([\w\-\'\"#,]+\s*){0,7}\s*", True, True, True, False,
            "Searching for numeric chapter headings", 'numeric'],  # Numeric Chapters
        [r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, True, True, False, "Searching for letter spaced headings", 'letter_spaced'],  # Spaced Lettering
        [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, True, True, False,
            "Searching for numeric chapters with titles", 'numeric_title'],  # Numeric Titles
        [r"[^'\"]?(\d+)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, True, True, False,
            "Searching for simple numeric headings", 'plain_number'],  # Numeric Chapters, no dot or colon
        [r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, True, False, False,
            "Searching for chapters with Uppercase Characters", 'uppercase']  # Uppercase Chapters
    ]

    def recurse_patterns(html, analyze):
        # Walks over chapter_types (looked up in the enclosing scope at call
        # time, so the second call below sees the filtered list).
        # analyze=True: only count the hits of each pattern and record the
        #               promising ones in analysis_result, html is not changed
        # analyze=False: replace the matches using self.chapter_head
        # Start with most typical chapter headings, get more aggressive until one works
        for [chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name] in chapter_types:
            n_lookahead = ''
            hits = 0
            self.chapters_no_title = 0
            self.chapters_with_title = 0

            if n_lookahead_req:
                lp_n_lookahead_open = n_lookahead_open
                lp_n_lookahead_close = n_lookahead_close
            else:
                lp_n_lookahead_open = ''
                lp_n_lookahead_close = ''

            if strict_title:
                lp_title = default_title
            else:
                lp_title = simple_title

            if ignorecase:
                arg_ignorecase = r'(?i)'
            else:
                arg_ignorecase = ''

            if title_req:
                lp_opt_title_open = ''
                lp_opt_title_close = ''
            else:
                lp_opt_title_open = opt_title_open
                lp_opt_title_close = opt_title_close

            # Stop as soon as enough sections have been marked up
            if self.html_preprocess_sections >= self.min_chapters:
                break
            full_chapter_line = chapter_line_open+chapter_header_open+chapter_type+chapter_header_close+chapter_line_close
            if n_lookahead_req:
                # Reuse the chapter line inside the negative lookahead; the
                # substitution rewrites its group names (outer, inner1..3,
                # chap) so they do not clash with the original ones
                n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
            if not analyze:
                self.log.debug("Marked " + unicode_type(self.html_preprocess_sections) + " headings, " + log_message)

            chapter_marker = arg_ignorecase+init_lookahead+full_chapter_line+blank_lines+lp_n_lookahead_open+n_lookahead+lp_n_lookahead_close+ \
                             lp_opt_title_open+title_line_open+title_header_open+lp_title+title_header_close+title_line_close+lp_opt_title_close
            chapdetect = re.compile(r'%s' % chapter_marker)

            if analyze:
                hits = len(chapdetect.findall(html))
                if hits:
                    # The result of sub() is discarded, the callback only
                    # updates the with/without title counters
                    chapdetect.sub(self.analyze_title_matches, html)
                    if float(self.chapters_with_title) / float(hits) > .5:
                        # Most hits are followed by a title line: record the
                        # pattern with a required, loosely matched title
                        title_req = True
                        strict_title = False
                    self.log.debug(
                            unicode_type(type_name)+" had "+unicode_type(hits)+
                            " hits - "+unicode_type(self.chapters_no_title)+" chapters with no title, "+
                            unicode_type(self.chapters_with_title)+" chapters with titles, "+
                            unicode_type(float(self.chapters_with_title) / float(hits))+" percent. ")
                    if type_name == 'common':
                        analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name])
                    # The chained comparison after `or` reads:
                    # self.min_chapters < 3 and hits < 3
                    elif self.min_chapters <= hits < max_chapters or self.min_chapters < 3 > hits:
                        analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name])
                        break
            else:
                html = chapdetect.sub(self.chapter_head, html)
        return html

    # First pass collects the patterns worth applying, the second pass
    # applies only those
    recurse_patterns(html, True)
    chapter_types = analysis_result
    html = recurse_patterns(html, False)

    words_per_chptr = wordcount
    if words_per_chptr > 0 and self.html_preprocess_sections > 0:
        words_per_chptr = wordcount // self.html_preprocess_sections
    self.log.debug("Total wordcount is: "+ unicode_type(wordcount)+", Average words per section is: "+
            unicode_type(words_per_chptr)+", Marked up "+unicode_type(self.html_preprocess_sections)+" chapters")
    return html
|
||||
|
||||
def punctuation_unwrap(self, length, content, format):
    '''
    Join lines which were hard wrapped in the middle of a sentence.

    A line break is removed when it is preceded by at least ``length``
    characters followed by a character which does not end a sentence, by
    an en/em dash, or by a soft hyphen. Works on html markup and, when
    format is 'txt', on plain text.

    Characters which can act as a full stop (?!“”. etc) must *not* be added
    to the lookbehind below: a wrongly joined paragraph is much harder to
    spot during a manual review than a line which was left wrapped.

    Hyphenated line endings are intentionally left alone, they are handled
    by the dehyphenate routine in a separate step.

    :param length: minimum number of characters before the line break
    :param content: the text to unwrap
    :param format: 'txt' for plain text, anything else is treated as html
    :return: the unwrapped text
    '''
    def style_unwrap(match):
        # Keep the inline style tags which surrounded the removed break and
        # put a single space between them
        closing = match.group('style_close') or ''
        opening = match.group('style_open') or ''
        return closing + ' ' + opening

    # The pieces of the regexes
    # (?<!\&\w{4});) is a semicolon not part of an entity
    lookahead = "(?<=.{"+unicode_type(length)+r"}([a-zა-ჰäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\\IAß]|(?<!\&\w{4});))"
    em_en_lookahead = "(?<=.{"+unicode_type(length)+"}[\u2013\u2014])"
    soft_hyphen = "\xad"
    line_ending = "\\s*(?P<style_close></(span|[iub])>)?\\s*(</(p|div)>)?"
    blanklines = "\\s*(?P<up2threeblanks><(p|span|div)[^>]*>\\s*(<(p|span|div)[^>]*>\\s*</(span|p|div)>\\s*)</(span|p|div)>\\s*){0,3}\\s*"
    line_opening = "<(p|div)[^>]*>\\s*(?P<style_open><(span|[iub])[^>]*>)?\\s*"
    txt_line_wrap = "((\u0020|\u0009)*\n){1,4}"

    is_txt = format == 'txt'
    # What follows the trigger character: a bare line break for text, the
    # closing/blank/opening paragraph markup for html
    if is_txt:
        wrap = txt_line_wrap
    else:
        wrap = line_ending+blanklines+line_opening

    unwrap = re.compile(lookahead+wrap, re.UNICODE)
    em_en_unwrap = re.compile(em_en_lookahead+wrap, re.UNICODE)
    shy_unwrap = re.compile(soft_hyphen+wrap, re.UNICODE)

    if is_txt:
        steps = ((unwrap, ' '), (em_en_unwrap, ''), (shy_unwrap, ''))
    else:
        steps = ((unwrap, style_unwrap), (em_en_unwrap, style_unwrap), (shy_unwrap, style_unwrap))
    for regex, replacement in steps:
        content = regex.sub(replacement, content)

    return content
|
||||
|
||||
def txt_process(self, match):
    '''
    re.sub callback: pass the ``text`` group of match through
    separate_paragraphs_single_line() and then convert_basic() and return
    the result.
    '''
    from calibre.ebooks.txt.processor import convert_basic, separate_paragraphs_single_line
    paragraphs = separate_paragraphs_single_line(match.group('text'))
    return convert_basic(paragraphs, epub_split_size_kb=0)
|
||||
|
||||
def markup_pre(self, html):
    '''
    Add paragraph markup to unformatted text.

    When html contains a <pre> tag, everything up to and including each
    <pre>...</pre> section is replaced by the output of txt_process() for
    the text inside it, and entities are converted afterwards. Without a
    <pre> tag every newline which does not directly follow a '>' is turned
    into a paragraph boundary.
    '''
    if re.search(r'<pre>', html, re.IGNORECASE) is None:
        # Add markup naively
        # TODO - find out if there are cases where there are more than one <pre> tag or
        # other types of unmarked html and handle them in some better fashion
        return re.sub('(?<!>)(\n)', '</p>\n<p>', html)

    self.log.debug("Running Text Processing")
    pre_section = re.compile(r'.*?(?<=<pre>)(?P<text>.*?)</pre>', re.IGNORECASE|re.DOTALL)
    html = pre_section.sub(self.txt_process, html)
    from calibre.ebooks.conversion.preprocess import convert_entities
    return re.sub(r'&(\S+?);', convert_entities, html)
|
||||
|
||||
def arrange_htm_line_endings(self, html):
    '''
    Normalise the whitespace around <p> and <div> tags: whitespace before
    a closing tag is dropped and a newline is put after it, whitespace
    around an opening tag is replaced by a single newline in front of it.
    '''
    closing_tag = re.compile(r"\s*</(?P<tag>p|div)>")
    opening_tag = re.compile(r"\s*<(?P<tag>p|div)(?P<style>[^>]*)>\s*")
    html = closing_tag.sub("</\\g<tag>>\n", html)
    return opening_tag.sub("\n<\\g<tag>\\g<style>>", html)
|
||||
|
||||
def fix_nbsp_indents(self, html):
    '''
    Replace runs of two or more non-breaking spaces at the start of a <p>
    or <div> (optionally inside opening spans) by an inline text-indent
    style, using insert_indent() as the substitution callback.
    '''
    indent_pattern = unicode_type(r'<(?P<tagtype>p|div)(?P<formatting>[^>]*)>\s*(?P<span>(<span[^>]*>\s*)+)?\s*(\u00a0){2,}')
    html = re.sub(indent_pattern, self.insert_indent, html, flags=re.IGNORECASE)
    # found_indents is maintained by insert_indent()
    if self.found_indents > 1:
        self.log.debug("replaced "+unicode_type(self.found_indents)+ " nbsp indents with inline styles")
    return html
|
||||
|
||||
def cleanup_markup(self, html):
    '''
    Remove markup which gets in the way of the later heuristics:
    non-breaking spaces, Microsoft specific tags, self closing paragraphs,
    empty inline formatting tags, divs wrapped around empty paragraphs and
    empty headings. Sets ``self.deleted_nbsps`` to True.
    '''
    simple_fixes = (
        # remove remaining non-breaking spaces
        (unicode_type(r'\u00a0'), ' '),
        # Get rid of various common microsoft specific tags which can cause issues later
        # Get rid of empty <o:p> tags to simplify other processing
        (unicode_type(r'\s*<o:p>\s*</o:p>'), ' '),
        # Delete microsoft 'smart' tags
        ('(?i)</?st1:\\w+>', ''),
        # Re-open self closing paragraph tags
        ('<p[^>/]*/>', '<p> </p>'),
    )
    for pattern, replacement in simple_fixes:
        html = re.sub(pattern, replacement, html)

    # Get rid of empty span, bold, font, em, & italics tags
    fmt_tags = 'font|[ibu]|em|strong'
    open_fmt_pat = r'<(?:{})(?:\s[^>]*)?>'.format(fmt_tags)
    close_fmt_pat = '</(?:{})>'.format(fmt_tags)
    empty_span = r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>){0,2}\s*</span>\s*"
    empty_fmt = r"\s*{open}\s*({open}\s*{close}\s*){{0,2}}\s*{close}".format(open=open_fmt_pat, close=close_fmt_pat)
    # Two rounds, so that tags which only become empty in the first round
    # are removed as well
    for _round in range(2):
        html = re.sub(empty_span, " ", html)
        html = re.sub(empty_fmt, " ", html)

    # delete surrounding divs from empty paragraphs
    html = re.sub('<div[^>]*>\\s*<p[^>]*>\\s*</p>\\s*</div>', '<p> </p>', html)
    # Empty heading tags
    html = re.sub(r'(?i)<h\d+>\s*</h\d+>', '', html)
    self.deleted_nbsps = True
    return html
|
||||
|
||||
def analyze_line_endings(self, html):
    '''
    Determine which kind of tag is most commonly used for lines in html;
    meant to be called before the docanalysis functions.

    :return: 'spanned_html' when there is more than one <span> tag and
             fewer than 0.75 <p> tags per <span>, 'html' otherwise
    '''
    para_count = len(re.findall('<p[^>]*>', html, re.IGNORECASE))
    span_count = len(re.findall('<span[^>]*>', html, re.IGNORECASE))
    if span_count > 1 and float(para_count) / float(span_count) < 0.75:
        return 'spanned_html'
    return 'html'
|
||||
|
||||
def analyze_blanks(self, html):
    '''
    Decide whether blank paragraphs are used to separate paragraphs.

    :param html: the markup to inspect
    :return: True when more than 40% of the paragraphs found by
             ``self.linereg`` are blank according to ``self.blankreg``.
             False otherwise, including for documents with fewer than two
             paragraphs, for which no ratio is computed.
    '''
    blank_count = len(self.blankreg.findall(html))
    line_count = len(self.linereg.findall(html))
    if line_count <= 1:
        # Nothing to compare against; always hand a real boolean back
        return False

    ratio = float(blank_count) / float(line_count)
    self.log.debug("There are %d blank lines. %s percent blank" % (blank_count, ratio))
    return ratio > 0.40
|
||||
|
||||
def cleanup_required(self):
    '''
    Return True when at least one of the options which need cleaned up
    markup is enabled on ``self.extra_opts``.
    '''
    options = ('unwrap_lines', 'markup_chapter_headings', 'format_scene_breaks', 'delete_blank_paragraphs')
    return any(getattr(self.extra_opts, name, False) for name in options)
|
||||
|
||||
def merge_blanks(self, html, blanks_count=None):
    '''
    Collapse every run of two or more blank paragraphs into a single
    paragraph whose top margin grows with the number of blanks merged.

    The generated paragraph gets the class ``whitespaceNN`` when the run
    contained the text 'whitespace' and ``softbreakNN`` otherwise, where
    NN is ten times the margin in em.

    :param html: the markup to process
    :param blanks_count: unused, kept for backwards compatibility
    :return: the processed markup
    '''
    base_em = .5  # Baseline is 1.5em per blank line, 1st line is .5 em css and 1em for the nbsp
    em_per_line = 1.5  # Add another 1.5 em for each additional blank

    def merge_matches(match):
        to_merge = match.group(0)
        lines = float(len(self.single_blank.findall(to_merge))) - 1.
        em = base_em + (em_per_line * lines)
        # A membership test is required here: str.find() returns -1, which
        # is truthy, when the text is absent
        if 'whitespace' in to_merge:
            css_class = 'whitespace'
        else:
            css_class = 'softbreak'
        merged = '\n<p class="%s%d" style="text-align:center; margin-top:%sem"> </p>' % (
            css_class, int(em * 10), em)
        return self.any_multi_blank.sub(merged, to_merge)

    html = self.any_multi_blank.sub(merge_matches, html)
    return html
|
||||
|
||||
def detect_whitespace(self, html):
    '''
    Process the blank paragraphs found around headings, scene breaks and
    short lines.

    Blank paragraphs next to a heading are replaced by margins on the
    heading, blank paragraphs next to a scene break paragraph are dropped,
    and blank paragraphs next to the lines matched by blanks_n_nopunct are
    marked with class="whitespace".

    :param html: the markup to process
    :return: the processed markup
    '''
    # A heading, optionally preceded and/or followed by blank <p>/<div> tags
    blanks_around_headings = re.compile(
        r'(?P<initparas>(<(p|div)[^>]*>\s*</(p|div)>\s*){1,}\s*)?'
        r'(?P<content><h(?P<hnum>\d+)[^>]*>.*?</h(?P=hnum)>)(?P<endparas>\s*(<(p|div)[^>]*>\s*</(p|div)>\s*){1,})?', re.IGNORECASE|re.DOTALL)
    # The same for paragraphs with class="scenebreak"
    blanks_around_scene_breaks = re.compile(
        r'(?P<initparas>(<(p|div)[^>]*>\s*</(p|div)>\s*){1,}\s*)?'
        r'(?P<content><p class="scenebreak"[^>]*>.*?</p>)(?P<endparas>\s*(<(p|div)[^>]*>\s*</(p|div)>\s*){1,})?', re.IGNORECASE|re.DOTALL)
    # A paragraph of at most 100 characters whose text ends in a word
    # character ([^\W]), with optional blank paragraphs on either side
    blanks_n_nopunct = re.compile(
        r'(?P<initparas>(<p[^>]*>\s*</p>\s*){1,}\s*)?<p[^>]*>\s*(<(span|[ibu]|em|strong|font)[^>]*>\s*)*'
        r'.{1,100}?[^\W](</(span|[ibu]|em|strong|font)>\s*)*</p>(?P<endparas>\s*(<p[^>]*>\s*</p>\s*){1,})?', re.IGNORECASE|re.DOTALL)

    def merge_header_whitespace(match):
        # Substitution callback: only the 'content' group is returned, so
        # the blank paragraphs which were part of the match disappear
        initblanks = match.group('initparas')
        endblanks = match.group('endparas')
        content = match.group('content')
        top_margin = ''
        bottom_margin = ''
        # One em of margin per blank paragraph found on that side
        if initblanks is not None:
            top_margin = 'margin-top:'+unicode_type(len(self.single_blank.findall(initblanks)))+'em;'
        if endblanks is not None:
            bottom_margin = 'margin-bottom:'+unicode_type(len(self.single_blank.findall(endblanks)))+'em;'

        if initblanks is None and endblanks is None:
            return content
        elif content.find('scenebreak') != -1:
            # Scene breaks are returned unchanged, no margins are added
            return content
        else:
            # Rewrite the opening heading tag; the attributes it had are
            # replaced by the computed style
            content = re.sub('(?i)<h(?P<hnum>\\d+)[^>]*>', '\n\n<h'+'\\g<hnum>'+' style="'+top_margin+bottom_margin+'">', content)
        return content

    html = blanks_around_headings.sub(merge_header_whitespace, html)
    html = blanks_around_scene_breaks.sub(merge_header_whitespace, html)

    def markup_whitespaces(match):
        # Substitution callback: every blank paragraph inside the matched
        # text is replaced by a paragraph with class="whitespace"
        blanks = match.group(0)
        blanks = self.blankreg.sub('\n<p class="whitespace" style="text-align:center; margin-top:0em; margin-bottom:0em"> </p>', blanks)
        return blanks

    html = blanks_n_nopunct.sub(markup_whitespaces, html)
    if self.html_preprocess_sections > self.min_chapters:
        # Also mark the blanks in everything which precedes the first heading
        html = re.sub('(?si)^.*?(?=<h\\d)', markup_whitespaces, html)

    return html
|
||||
|
||||
def detect_soft_breaks(self, html):
    '''
    Replace blank lines by explicit "softbreak" paragraphs.

    An empty <div> between two lines which both look like complete
    paragraphs (see check_paragraph) gets a softbreak paragraph inserted
    in front of the second line. Afterwards the blank paragraphs
    themselves are replaced: runs of them when blanks are used between
    paragraphs and have not been deleted, single ones otherwise.
    '''
    half_em_break = '\n<p class="softbreak" style="margin-top:.5em; page-break-before:avoid; text-align:center"> </p>'
    one_em_break = '\n<p class="softbreak" style="margin-top:1em; page-break-before:avoid; text-align:center"> </p>'

    def second_line(fragment):
        # Rewrite the group names so the fragment can be used a second time
        # within the same expression
        return re.sub('(ou|in|cha)', 'linetwo_', fragment)

    first = '(?P<initline>'+self.line_open+'\\s*(?P<init_content>.*?)'+self.line_close+')'
    second = ('(?P<line_two>'+second_line(self.line_open)+
              '\\s*(?P<line_two_content>.*?)'+second_line(self.line_close)+')')
    div_break_candidate = re.compile(
        first+'\\s*<div[^>]*>\\s*</div>\\s*'+second, re.IGNORECASE|re.UNICODE)

    def convert_div_softbreaks(match):
        init_is_paragraph = self.check_paragraph(match.group('init_content'))
        line_two_is_paragraph = self.check_paragraph(match.group('line_two_content'))
        if not (init_is_paragraph and line_two_is_paragraph):
            return match.group(0)
        return match.group('initline') + half_em_break + '\n' + match.group('line_two')

    html = div_break_candidate.sub(convert_div_softbreaks, html)

    if self.blanks_between_paragraphs and not self.blanks_deleted:
        return self.multi_blank.sub(one_em_break, html)
    return self.blankreg.sub(half_em_break, html)
|
||||
|
||||
def detect_scene_breaks(self, html):
    '''
    Replace lines which consist only of repeated non-word characters
    (e.g. ``***``) by a scene break paragraph holding the same characters.
    '''
    # The line may neither start like regular text nor contain a typical
    # end-of-text character directly in front of a tag
    not_text = '(?!('+self.common_in_text_beginnings+'|.*?'+self.common_in_text_endings+'<))'
    break_chars = '(?P<break>((?P<break_char>((?!\\s)\\W))\\s*(?P=break_char)?)+)'
    pattern = self.line_open + not_text + break_chars + '\\s*' + self.line_close
    scene_breaks = re.compile(pattern, re.IGNORECASE|re.UNICODE)
    replacement = self.scene_break_open+'\\g<break>'+'</p>'
    return scene_breaks.sub(replacement, html)
|
||||
|
||||
def markup_user_break(self, replacement_break):
    '''
    Wrap the user supplied scene break replacement in markup which centers
    it with appropriate margins.

    <hr> and <img> tags are allowed. If an <hr> tag specifies a width, the
    width is removed from the tag and applied as left/right margins of a
    wrapping div instead, because many ebook devices don't support
    margin:auto. An <hr> without a usable width is replaced by a default
    rule. All other html is converted to text.

    :param replacement_break: the string supplied by the user
    :return: the markup to use for scene breaks
    '''
    hr_open = '<div id="scenebreak" style="margin-left: 45%; margin-right: 45%; margin-top:1.5em; margin-bottom:1.5em; page-break-before:avoid">'
    default_hr = hr_open+'<hr style="height: 3px; background:#505050" /></div>'

    if not re.search('(<|>)', replacement_break):
        # Plain text
        replacement_break = re.sub('\\s', ' ', replacement_break)
        return self.scene_break_open+replacement_break+'</p>'

    if re.match('^<img', replacement_break):
        return self.scene_break_open+replacement_break+'</p>'

    if not re.match('^<hr', replacement_break):
        from calibre.utils.html2text import html2text
        replacement_break = html2text(replacement_break)
        replacement_break = re.sub('\\s', ' ', replacement_break)
        return self.scene_break_open+replacement_break+'</p>'

    if replacement_break.find('width') == -1:
        return default_hr

    try:
        # When the pattern does not match, re.sub hands the tag back
        # unchanged and int() rejects it
        width = int(re.sub('.*?width(:|=)(?P<wnum>\\d+).*', '\\g<wnum>', replacement_break))
    except ValueError:
        self.log.warn('Invalid replacement scene break'
                ' expression, using default')
        return default_hr

    replacement_break = re.sub('(?i)(width=\\d+\\%?|width:\\s*\\d+(\\%|px|pt|em)?;?)', '', replacement_break)
    # Split what is left of the page width evenly between both margins
    divpercent = (100 - width) // 2
    hr_open = hr_open.replace('45', '%d' % divpercent)
    return hr_open+replacement_break+'</div>'
|
||||
|
||||
def check_paragraph(self, content):
    '''
    Return True when content, with its span tags and the whitespace around
    them removed, ends in a quote, full stop, exclamation mark, question
    mark or colon.
    '''
    without_spans = re.sub('\\s*</?span[^>]*>\\s*', '', content)
    return re.match('.*[\"\'.!?:]$', without_spans) is not None
|
||||
|
||||
def abbyy_processor(self, html):
    """Rewrite ABBYY-style ``<p style="...">`` lines into simpler markup.

    Anchor tags are removed, then every ``<p style="...">...</p>`` line and
    every ``<img>`` tag is rewritten:

    * ``text-align`` is kept unless it is ``left``;
    * a ``text-indent`` of 10pt to 13pt becomes ``text-indent:3%;``, any
      other indent is kept;
    * the four ``padding`` values (top, right, bottom, left) decide whether
      an empty paragraph is added before/after the line and whether a
      ``<blockquote>`` is opened or closed;
    * an image closes any open blockquote.

    Declarations that cannot be interpreted (empty ``style`` attribute, no
    colon, non-integer padding, fewer than four padding values) are skipped
    instead of raising, so a single odd paragraph cannot abort the run. A
    ``text-indent`` that is not a whole number of points is passed through
    unchanged.

    ``self.in_blockquote`` and ``self.previous_was_paragraph`` are reset at
    the start and updated while the lines are processed.

    :param html: The markup to process
    :return: The processed markup
    '''
    """
    abbyy_line = re.compile('((?P<linestart><p\\sstyle="(?P<styles>[^\"]*?);?">)(?P<content>.*?)(?P<lineend></p>)|(?P<image><img[^>]*>))', re.IGNORECASE)
    empty_paragraph = '\n<p> </p>\n'
    blockquote_open = '\n<blockquote>\n'
    blockquote_close = '</blockquote>\n'
    indented_text = 'text-indent:3%;'
    self.in_blockquote = False
    self.previous_was_paragraph = False
    html = re.sub('</?a[^>]*>', '', html)

    def to_int(value):
        # Lengths are compared as whole numbers; anything else is reported
        # as None so the caller can skip it instead of crashing.
        try:
            return int(value)
        except ValueError:
            return None

    def convert_styles(match):
        image = match.group('image')
        if image:
            # An image always ends the current blockquote.
            blockquote_close_loop = ''
            if self.in_blockquote:
                self.in_blockquote = False
                blockquote_close_loop = blockquote_close
            self.previous_was_paragraph = False
            return blockquote_close_loop+'\n'+image+'\n'

        content = match.group('content')
        is_paragraph = self.check_paragraph(content)
        text_align = ''
        text_indent = ''
        paragraph_before = ''
        paragraph_after = ''
        blockquote_open_loop = ''
        blockquote_close_loop = ''
        debugabby = False

        # Split "name:value;name:value" into [name, value] pairs. Entries
        # without a colon (including the single empty entry produced by
        # style="") are dropped.
        styles = []
        for declaration in match.group('styles').split(';'):
            style, colon, setting = declaration.partition(':')
            if colon:
                styles.append([style, setting])

        for style, setting in styles:
            if style == 'text-align' and setting != 'left':
                text_align = style+':'+setting+';'
            elif style == 'text-indent':
                points = to_int(re.sub('\\s*pt\\s*', '', setting))
                if points is None:
                    # Not a whole number of points: keep it as written.
                    text_indent = style+':'+setting+';'
                elif 9 < points < 14:
                    text_indent = indented_text
                else:
                    text_indent = '%s:%dpt;' % (style, points)
            elif style == 'padding':
                # Order is top, right, bottom, left.
                padding = re.sub('pt', '', setting).split(' ')
                if len(padding) < 4:
                    continue
                right = to_int(padding[1])
                left = to_int(padding[3])
                if right is None or left is None:
                    continue
                if right < 16 and left < 16:
                    # Narrow side padding: ordinary text, which ends a
                    # blockquote once a full paragraph is seen.
                    if self.in_blockquote:
                        debugabby = True
                        if is_paragraph:
                            self.in_blockquote = False
                            blockquote_close_loop = blockquote_close
                    if left > 8 and text_indent == '':
                        text_indent = indented_text
                    top = to_int(padding[0])
                    bottom = to_int(padding[2])
                    if top is not None and top > 5:
                        paragraph_before = empty_paragraph
                    if bottom is not None and bottom > 5:
                        paragraph_after = empty_paragraph
                elif not self.in_blockquote and self.previous_was_paragraph:
                    # Wide side padding right after a paragraph opens a
                    # blockquote.
                    debugabby = True
                    self.in_blockquote = True
                    blockquote_open_loop = blockquote_open
                if debugabby:
                    self.log.debug('\n\n******\n')
                    self.log.debug('padding top is: '+padding[0])
                    self.log.debug('padding right is:' +padding[1])
                    self.log.debug('padding bottom is: ' + padding[2])
                    self.log.debug('padding left is: ' +padding[3])

        newline = (blockquote_open_loop+blockquote_close_loop+paragraph_before+
                   '<p style="'+text_indent+text_align+'">'+content+'</p>'+
                   paragraph_after)
        if debugabby:
            self.log.debug("styles for this line were:", styles)
            self.log.debug('newline is:')
            self.log.debug(newline+'\n\n\n\n\n')
        self.previous_was_paragraph = is_paragraph
        return newline

    return abbyy_line.sub(convert_styles, html)
|
||||
|
||||
def __call__(self, html):
    """Run the enabled heuristic clean-up passes over *html*.

    The passes run in a fixed order because later ones rely on the markup
    produced by earlier ones: ABBYY clean-up, line-ending arrangement,
    paragraph markup, indent fixing, markup clean-up, chapter and italics
    markup, blank paragraph removal, line unwrapping, dehyphenation,
    heading renumbering and scene break formatting. Most passes are
    switched on by attributes of ``self.extra_opts``.

    :param html: The markup of one flow
    :return: The processed markup; *html* is returned untouched when the
             recorded word count is below 50
    """
    self.log.debug("********* Heuristic processing HTML *********")
    # Count the words in the document to estimate how many chapters to look for and whether
    # other types of processing are attempted
    try:
        self.totalwords = self.get_word_count(html)
    except Exception:
        # Best effort: on failure self.totalwords keeps whatever value it
        # already had (presumably initialised elsewhere in the class --
        # not visible from here).
        self.log.warn("Can't get wordcount")

    if self.totalwords < 50:
        self.log.warn("flow is too short, not running heuristics")
        return html

    is_abbyy = self.is_abbyy(html)
    if is_abbyy:
        html = self.abbyy_processor(html)

    # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
    html = self.arrange_htm_line_endings(html)
    if self.cleanup_required():
        # ##### Check Markup ######
        #
        # some lit files don't have any <p> tags or equivalent (generally just plain text between
        # <pre> tags), check and mark up line endings if required before proceeding
        # fix indents must run after this step
        if self.no_markup(html, 0.1):
            self.log.debug("not enough paragraph markers, adding now")
            # markup using text processing
            html = self.markup_pre(html)

    # Replace series of non-breaking spaces with text-indent
    if getattr(self.extra_opts, 'fix_indents', False):
        html = self.fix_nbsp_indents(html)

    if self.cleanup_required():
        # fix indents must run before this step, as it removes non-breaking spaces
        html = self.cleanup_markup(html)

    is_pdftohtml = self.is_pdftohtml(html)
    if is_pdftohtml:
        self.line_open = "<(?P<outer>p)[^>]*>(\\s*<[ibu][^>]*>)?\\s*"
        self.line_close = "\\s*(</[ibu][^>]*>\\s*)?</(?P=outer)>"

    # Determine whether the document uses interleaved blank lines
    self.blanks_between_paragraphs = self.analyze_blanks(html)

    # detect chapters/sections to match xpath or splitting logic
    if getattr(self.extra_opts, 'markup_chapter_headings', False):
        html = self.markup_chapters(html, self.totalwords, self.blanks_between_paragraphs)

    if getattr(self.extra_opts, 'italicize_common_cases', False):
        html = self.markup_italicis(html)

    # If more than 40% of the lines are empty paragraphs and the user has enabled delete
    # blank paragraphs then delete blank lines to clean up spacing
    if self.blanks_between_paragraphs and getattr(self.extra_opts, 'delete_blank_paragraphs', False):
        self.log.debug("deleting blank lines")
        self.blanks_deleted = True
        html = self.multi_blank.sub('\n<p class="softbreak" style="margin-top:.5em; page-break-before:avoid; text-align:center"> </p>', html)
        html = self.blankreg.sub('', html)

    # Determine line ending type
    # Some OCR sourced files have line breaks in the html using a combination of span & p tags
    # span are used for hard line breaks, p for new paragraphs. Determine which is used so
    # that lines can be un-wrapped across page boundaries
    line_format = self.analyze_line_endings(html)

    # Check Line histogram to determine if the document uses hard line breaks, If 50% or
    # more of the lines break in the same region of the document then unwrapping is required
    docanalysis = DocAnalysis(line_format, html)
    hardbreaks = docanalysis.line_histogram(.50)
    self.log.debug("Hard line breaks check returned "+unicode_type(hardbreaks))

    # Calculate Length
    unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4)
    length = docanalysis.line_length(unwrap_factor)
    self.log.debug("Median line length is " + unicode_type(length) + ", calculated with " + line_format + " format")

    # ##### Unwrap lines ######
    if getattr(self.extra_opts, 'unwrap_lines', False):
        # only go through unwrapping code if the histogram shows unwrapping is required or if the user decreased the default unwrap_factor
        if hardbreaks or unwrap_factor < 0.4:
            self.log.debug("Unwrapping required, unwrapping Lines")
            # Dehyphenate with line length limiters
            dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
            html = dehyphenator(html,'html', length)
            html = self.punctuation_unwrap(length, html, 'html')

    if getattr(self.extra_opts, 'dehyphenate', False):
        # dehyphenate in cleanup mode to fix anything previous conversions/editing missed
        self.log.debug("Fixing hyphenated content")
        dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
        html = dehyphenator(html,'html_cleanup', length)
        html = dehyphenator(html, 'individual_words', length)

    # If still no sections after unwrapping mark split points on lines with no punctuation
    if self.html_preprocess_sections < self.min_chapters and getattr(self.extra_opts, 'markup_chapter_headings', False):
        self.log.debug("Looking for more split points based on punctuation,"
                       " currently have " + unicode_type(self.html_preprocess_sections))
        chapdetect3 = re.compile(
            r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(?!([\W]+\s*)+)'
            r'(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*'
            r'.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*'
            r'(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
        html = chapdetect3.sub(self.chapter_break, html)

    if getattr(self.extra_opts, 'renumber_headings', False):
        # search for places where a first or second level heading is immediately followed by another
        # top level heading. demote the second heading to h3 to prevent splitting between chapter
        # headings and titles, images, etc
        doubleheading = re.compile(
            r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
        html = doubleheading.sub('\\g<firsthead>'+'\n<h3'+'\\g<secondhead>'+'</h3>', html)

    # If scene break formatting is enabled, find all blank paragraphs that definitely aren't scenebreaks,
    # style it with the 'whitespace' class. All remaining blank lines are styled as softbreaks.
    # Multiple sequential blank paragraphs are merged with appropriate margins
    # If non-blank scene breaks exist they are center aligned and styled with appropriate margins.
    if getattr(self.extra_opts, 'format_scene_breaks', False):
        self.log.debug('Formatting scene breaks')
        html = re.sub('(?i)<div[^>]*>\\s*<br(\\s?/)?>\\s*</div>', '<p></p>', html)
        html = self.detect_scene_breaks(html)
        html = self.detect_whitespace(html)
        html = self.detect_soft_breaks(html)
        blanks_count = len(self.any_multi_blank.findall(html))
        if blanks_count >= 1:
            html = self.merge_blanks(html, blanks_count)
        detected_scene_break = re.compile(r'<p class="scenebreak"[^>]*>.*?</p>')
        scene_break_count = len(detected_scene_break.findall(html))
        # If the user has enabled scene break replacement, then either softbreaks
        # or 'hard' scene breaks are replaced, depending on which is in use
        # Otherwise separator lines are centered, use a bit larger margin in this case
        replacement_break = getattr(self.extra_opts, 'replace_scene_breaks', None)
        if replacement_break:
            user_break = self.markup_user_break(replacement_break)

            def insert_user_break(match):
                # Substitute through a function so the user's markup is
                # inserted literally; as a replacement template any
                # backslash in it would be read as an escape or a group
                # reference.
                return user_break

            if scene_break_count >= 1:
                html = detected_scene_break.sub(insert_user_break, html)
            # Softbreaks are replaced whether or not hard scene breaks exist.
            html = re.sub('<p\\s+class="softbreak"[^>]*>\\s*</p>', insert_user_break, html)

    if self.deleted_nbsps:
        # put back non-breaking spaces in empty paragraphs so they render correctly
        html = self.anyblank.sub('\n'+r'\g<openline>'+'\u00a0'+r'\g<closeline>', html)
    return html
|
||||
Reference in New Issue
Block a user