mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-02-17 06:35:45 +01:00
Every MIME-related function in the main __init__.py has a flag check to see whether initialization has already been done. This is nonsense, since initialization should be done implicitly, early on, when the converter is starting. This commit straightens things out, and initialization is now done in the cli module. Also, the function guess_type was removed, since it is just a proxy for the mimetypes.guess_type function.
1283 lines
57 KiB
Python
1283 lines
57 KiB
Python
import functools
|
|
import json
|
|
import os
|
|
import pprint
|
|
import shutil
|
|
import sys
|
|
|
|
from ebook_converter.customize.conversion import OptionRecommendation, DummyReporter
|
|
from ebook_converter.customize.ui import input_profiles, output_profiles, \
|
|
plugin_for_input_format, plugin_for_output_format, \
|
|
available_input_formats, available_output_formats, \
|
|
run_plugins_on_preprocess, run_plugins_on_postprocess
|
|
from ebook_converter.ebooks.conversion.preprocess import HTMLPreProcessor
|
|
from ebook_converter.ptempfile import PersistentTemporaryDirectory
|
|
from ebook_converter.utils.date import parse_date
|
|
from ebook_converter.utils.zipfile import ZipFile
|
|
from ebook_converter import extract, walk, filesystem_encoding
|
|
from ebook_converter.constants_old import __version__
|
|
|
|
|
|
DEBUG_README=b'''
|
|
This debug directory contains snapshots of the e-book as it passes through the
|
|
various stages of conversion. The stages are:
|
|
|
|
1. input - This is the result of running the input plugin on the source
|
|
file. Use this directory to debug the input plugin.
|
|
|
|
2. parsed - This is the result of preprocessing and parsing the output of
|
|
the input plugin. Note that for some input plugins this will be identical to
|
|
the input sub-directory. Use this directory to debug structure detection,
|
|
etc.
|
|
|
|
3. structure - This corresponds to the stage in the pipeline when structure
|
|
detection has run, but before the CSS is flattened. Use this directory to
|
|
debug the CSS flattening, font size conversion, etc.
|
|
|
|
4. processed - This corresponds to the e-book as it is passed to the output
|
|
plugin. Use this directory to debug the output plugin.
|
|
|
|
'''
|
|
|
|
|
|
def supported_input_formats():
    """Return the available input formats plus the archive containers.

    The object returned by ``available_input_formats()`` is extended in
    place with ``'zip'``, ``'rar'`` and ``'oebzip'`` and then handed back
    to the caller.
    """
    formats = available_input_formats()
    formats.add('zip')
    formats.add('rar')
    formats.add('oebzip')
    return formats
|
|
|
|
|
|
class OptionValues:
    """Empty namespace class with no behaviour of its own.

    NOTE(review): presumably instances carry option values as plain
    attributes -- the code that populates them is outside this view.
    """
|
|
|
|
|
|
class CompositeProgressReporter:
    """Forward a local progress fraction to a reporter on a wider scale.

    A call with ``fraction`` is translated linearly into the interval
    ``[global_min, global_max]`` and passed, together with the message,
    to ``global_reporter``.
    """

    def __init__(self, global_min, global_max, global_reporter):
        self.global_min = global_min
        self.global_max = global_max
        self.global_reporter = global_reporter

    def __call__(self, fraction, msg=''):
        # Width of the slice of the global scale this reporter covers.
        span = self.global_max - self.global_min
        self.global_reporter(self.global_min + fraction * span, msg)
|
|
|
|
|
|
# Input extensions handled as archives: Plumber.__init__ unpacks such inputs
# through Plumber.unarchive(), and unarchive() leaves these formats out when
# it searches the extracted files for the actual e-book.
ARCHIVE_FMTS = ('zip', 'rar', 'oebzip')
|
|
|
|
|
|
class Plumber(object):
|
|
|
|
'''
|
|
The `Plumber` manages the conversion pipeline. An UI should call the methods
|
|
:method:`merge_ui_recommendations` and then :method:`run`. The plumber will
|
|
take care of the rest.
|
|
'''
|
|
|
|
# Names of the book-metadata options; each one has a matching
# OptionRecommendation in the pipeline options built by __init__.
metadata_option_names = [
    'title', 'authors', 'title_sort', 'author_sort', 'cover', 'comments',
    'publisher', 'series', 'series_index', 'rating', 'isbn',
    'tags', 'book_producer', 'language', 'pubdate', 'timestamp'
]
|
|
|
|
def __init__(self, input, output, log, report_progress=DummyReporter(),
        dummy=False, merge_plugin_recs=True, abort_after_input_dump=False,
        override_input_metadata=False, for_regex_wizard=False, view_kepub=False):
    '''
    Set up the conversion pipeline for one input/output pair: declare the
    format independent pipeline options, work out the input and output
    formats from the file extensions, look up the matching plugins and
    collect their options.

    :param input: Path to input file. ``bytes`` are decoded with
                  ``filesystem_encoding``; the path is made absolute.
    :param output: Path to output file/directory. An existing directory
                   or a path without an extension selects the ``oeb``
                   output format.
    :param log: Callable used for log messages.
    :param report_progress: Stored as ``self.ui_reporter``.
    :param dummy: If true, the options of every available input and
                  output plugin are gathered in ``all_format_options``
                  and ``input_options``/``output_options`` stay empty.
    :param merge_plugin_recs: If true,
                  :method:`merge_plugin_recommendations` is called at
                  the end.
    :param abort_after_input_dump: Stored on the instance.
    :param override_input_metadata: Stored on the instance.
    :param for_regex_wizard: Stored on the instance.
    :param view_kepub: If true, a ``kepub`` input is treated as ``epub``.
    :raises ValueError: If the input file has no extension, or no plugin
                  handles the input or the output format.
    '''
    # Paths may arrive as bytes; work with text from here on.
    if isinstance(input, bytes):
        input = input.decode(filesystem_encoding)
    if isinstance(output, bytes):
        output = output.decode(filesystem_encoding)
    # Keep the caller's original argument; self.input may later be replaced
    # by an extracted or preprocessed file.
    self.original_input_arg = input
    self.for_regex_wizard = for_regex_wizard
    self.input = os.path.abspath(input)
    self.output = os.path.abspath(output)
    self.log = log
    self.ui_reporter = report_progress
    self.abort_after_input_dump = abort_after_input_dump
    self.override_input_metadata = override_input_metadata

    # Pipeline options {{{
    # Initialize the conversion options that are independent of input and
    # output formats. The input and output plugins can still disable these
    # options via recommendations.
    self.pipeline_options = [

        OptionRecommendation(name='verbose',
            recommended_value=0, level=OptionRecommendation.LOW,
            short_switch='v',
            help='Level of verbosity. Specify multiple times for greater '
                 'verbosity. Specifying it twice will result in full '
                 'verbosity, once medium verbosity and zero times least verbosity.'
            ),

        OptionRecommendation(name='debug_pipeline',
            recommended_value=None, level=OptionRecommendation.LOW,
            short_switch='d',
            help='Save the output from different stages of the conversion '
                 'pipeline to the specified '
                 'directory. Useful if you are unsure at which stage '
                 'of the conversion process a bug is occurring.'
            ),

        OptionRecommendation(name='input_profile',
            recommended_value='default', level=OptionRecommendation.LOW,
            choices=[x.short_name for x in input_profiles()],
            help='Specify the input profile. The input profile gives the '
                 'conversion system information on how to interpret '
                 'various information in the input document. For '
                 'example resolution dependent lengths (i.e. lengths in '
                 'pixels). Choices are:'+ ', '.join([
                     x.short_name for x in input_profiles()])
            ),

        OptionRecommendation(name='output_profile',
            recommended_value='default', level=OptionRecommendation.LOW,
            choices=[x.short_name for x in output_profiles()],
            help='Specify the output profile. The output profile '
                 'tells the conversion system how to optimize the '
                 'created document for the specified device (such as by resizing images for the device screen size). In some cases, '
                 'an output profile can be used to optimize the output for a particular device, but this is rarely necessary. '
                 'Choices are:' + ', '.join([
                     x.short_name for x in output_profiles()])
            ),

        OptionRecommendation(name='base_font_size',
            recommended_value=0, level=OptionRecommendation.LOW,
            help='The base font size in pts. All font sizes in the produced book '
                 'will be rescaled based on this size. By choosing a larger '
                 'size you can make the fonts in the output bigger and vice '
                 'versa. By default, when the value is zero, the base font size is chosen based on '
                 'the output profile you chose.'
            ),

        OptionRecommendation(name='font_size_mapping',
            recommended_value=None, level=OptionRecommendation.LOW,
            help='Mapping from CSS font names to font sizes in pts. '
                 'An example setting is 12,12,14,16,18,20,22,24. '
                 'These are the mappings for the sizes xx-small to xx-large, '
                 'with the final size being for huge fonts. The font '
                 'rescaling algorithm uses these sizes to intelligently '
                 'rescale fonts. The default is to use a mapping based on '
                 'the output profile you chose.'
            ),

        OptionRecommendation(name='disable_font_rescaling',
            recommended_value=False, level=OptionRecommendation.LOW,
            help='Disable all rescaling of font sizes.'
            ),

        OptionRecommendation(name='minimum_line_height',
            recommended_value=120.0, level=OptionRecommendation.LOW,
            help='The minimum line height, as a percentage of the element\'s '
                 'calculated font size. calibre will ensure that every element '
                 'has a line height of at least this setting, irrespective of '
                 'what the input document specifies. Set to zero to disable. '
                 'Default is 120%. Use this setting in preference to '
                 'the direct line height specification, unless you know what '
                 'you are doing. For example, you can achieve "double spaced" '
                 'text by setting this to 240.'
            ),


        OptionRecommendation(name='line_height',
            recommended_value=0, level=OptionRecommendation.LOW,
            help='The line height in pts. Controls spacing between consecutive '
                 'lines of text. Only applies to elements that do not define '
                 'their own line height. In most cases, the minimum line height '
                 'option is more useful. '
                 'By default no line height manipulation is performed.'
            ),

        OptionRecommendation(name='embed_font_family',
            recommended_value=None, level=OptionRecommendation.LOW,
            help='Embed the specified font family into the book. This specifies '
                 'the "base" font used for the book. If the input document '
                 'specifies its own fonts, they may override this base font. '
                 'You can use the filter style information option to remove fonts from the '
                 'input document. Note that font embedding only works '
                 'with some output formats, principally EPUB, AZW3 and DOCX.'
            ),

        OptionRecommendation(name='embed_all_fonts',
            recommended_value=False, level=OptionRecommendation.LOW,
            help='Embed every font that is referenced in the input document '
                 'but not already embedded. This will search your system for the '
                 'fonts, and if found, they will be embedded. Embedding will only work '
                 'if the format you are converting to supports embedded fonts, such as '
                 'EPUB, AZW3, DOCX or PDF. Please ensure that you have the proper license for embedding '
                 'the fonts used in this document.'
            ),

        OptionRecommendation(name='subset_embedded_fonts',
            recommended_value=False, level=OptionRecommendation.LOW,
            help='Subset all embedded fonts. Every embedded font is reduced '
                 'to contain only the glyphs used in this document. This decreases '
                 'the size of the font files. Useful if you are embedding a '
                 'particularly large font with lots of unused glyphs.'
            ),

        OptionRecommendation(name='linearize_tables',
            recommended_value=False, level=OptionRecommendation.LOW,
            help='Some badly designed documents use tables to control the '
                 'layout of text on the page. When converted these documents '
                 'often have text that runs off the page and other artifacts. '
                 'This option will extract the content from the tables and '
                 'present it in a linear fashion.'
            ),

        OptionRecommendation(name='level1_toc',
            recommended_value=None, level=OptionRecommendation.LOW,
            help='XPath expression that specifies all tags that '
                 'should be added to the Table of Contents at level one. If '
                 'this is specified, it takes precedence over other forms '
                 'of auto-detection.'
                 ' See the XPath Tutorial in the calibre User Manual for examples.'
            ),

        OptionRecommendation(name='level2_toc',
            recommended_value=None, level=OptionRecommendation.LOW,
            help='XPath expression that specifies all tags that should be '
                 'added to the Table of Contents at level two. Each entry is added '
                 'under the previous level one entry.'
                 ' See the XPath Tutorial in the calibre User Manual for examples.'
            ),

        OptionRecommendation(name='level3_toc',
            recommended_value=None, level=OptionRecommendation.LOW,
            help='XPath expression that specifies all tags that should be '
                 'added to the Table of Contents at level three. Each entry '
                 'is added under the previous level two entry.'
                 ' See the XPath Tutorial in the calibre User Manual for examples.'
            ),

        OptionRecommendation(name='use_auto_toc',
            recommended_value=False, level=OptionRecommendation.LOW,
            help='Normally, if the source file already has a Table of '
                 'Contents, it is used in preference to the auto-generated one. '
                 'With this option, the auto-generated one is always used.'
            ),

        OptionRecommendation(name='no_chapters_in_toc',
            recommended_value=False, level=OptionRecommendation.LOW,
            help="Don't add auto-detected chapters to the Table of "
                 'Contents.'
            ),

        OptionRecommendation(name='toc_threshold',
            recommended_value=6, level=OptionRecommendation.LOW,
            help='If fewer than this number of chapters is detected, then links '
                 'are added to the Table of Contents. Default: %default'
            ),

        OptionRecommendation(name='max_toc_links',
            recommended_value=50, level=OptionRecommendation.LOW,
            help='Maximum number of links to insert into the TOC. Set to 0 '
                 'to disable. Default is: %default. Links are only added to the '
                 'TOC if less than the threshold number of chapters were detected.'
            ),

        OptionRecommendation(name='toc_filter',
            recommended_value=None, level=OptionRecommendation.LOW,
            help='Remove entries from the Table of Contents whose titles '
                 'match the specified regular expression. Matching entries and all '
                 'their children are removed.'
            ),

        OptionRecommendation(name='duplicate_links_in_toc',
            recommended_value=False, level=OptionRecommendation.LOW,
            help='When creating a TOC from links in the input document, '
                 'allow duplicate entries, i.e. allow more than one entry '
                 'with the same text, provided that they point to a '
                 'different location.'
            ),


        OptionRecommendation(name='chapter',
            recommended_value="//*[((name()='h1' or name()='h2') and "
                r"re:test(., '\s*((chapter|book|section|part)\s+)|((prolog|prologue|epilogue)(\s+|$))', 'i')) or @class "
                "= 'chapter']", level=OptionRecommendation.LOW,
            help='An XPath expression to detect chapter titles. The default '
                 'is to consider <h1> or <h2> tags that contain the words '
                 '"chapter", "book", "section", "prologue", "epilogue" or "part" as chapter titles as '
                 'well as any tags that have class="chapter". The expression '
                 'used must evaluate to a list of elements. To disable chapter '
                 'detection, use the expression "/". See the XPath Tutorial '
                 'in the calibre User Manual for further help on using this '
                 'feature.'
            ),

        OptionRecommendation(name='chapter_mark',
            recommended_value='pagebreak', level=OptionRecommendation.LOW,
            choices=['pagebreak', 'rule', 'both', 'none'],
            help='Specify how to mark detected chapters. A value of '
                 '"pagebreak" will insert page breaks before chapters. '
                 'A value of "rule" will insert a line before chapters. '
                 'A value of "none" will disable chapter marking and a '
                 'value of "both" will use both page breaks and lines '
                 'to mark chapters.'
            ),

        OptionRecommendation(name='start_reading_at',
            recommended_value=None, level=OptionRecommendation.LOW,
            help='An XPath expression to detect the location in the document'
                 ' at which to start reading. Some e-book reading programs'
                 ' (most prominently the Kindle) use this location as the'
                 ' position at which to open the book. See the XPath tutorial'
                 ' in the calibre User Manual for further help using this'
                 ' feature.'
            ),

        OptionRecommendation(name='extra_css',
            recommended_value=None, level=OptionRecommendation.LOW,
            help='Either the path to a CSS stylesheet or raw CSS. '
                 'This CSS will be appended to the style rules from '
                 'the source file, so it can be used to override those '
                 'rules.'
            ),

        OptionRecommendation(name='transform_css_rules',
            recommended_value=None, level=OptionRecommendation.LOW,
            help='Rules for transforming the styles in this book. These'
                 ' rules are applied after all other CSS processing is done.'
            ),

        OptionRecommendation(name='filter_css',
            recommended_value=None, level=OptionRecommendation.LOW,
            help='A comma separated list of CSS properties that '
                 'will be removed from all CSS style rules. This is useful '
                 'if the presence of some style information prevents it '
                 'from being overridden on your device. '
                 'For example: '
                 'font-family,color,margin-left,margin-right'
            ),

        OptionRecommendation(name='expand_css',
            recommended_value=False, level=OptionRecommendation.LOW,
            help='By default, calibre will use the shorthand form for various'
                 ' CSS properties such as margin, padding, border, etc. This'
                 ' option will cause it to use the full expanded form instead.'
                 ' Note that CSS is always expanded when generating EPUB files'
                 ' with the output profile set to one of the Nook profiles'
                 ' as the Nook cannot handle shorthand CSS.'
            ),

        OptionRecommendation(name='page_breaks_before',
            recommended_value="//*[name()='h1' or name()='h2']",
            level=OptionRecommendation.LOW,
            help='An XPath expression. Page breaks are inserted '
                 'before the specified elements. To disable use the expression: /'
            ),

        OptionRecommendation(name='remove_fake_margins',
            recommended_value=True, level=OptionRecommendation.LOW,
            help='Some documents specify page margins by '
                 'specifying a left and right margin on each individual '
                 'paragraph. calibre will try to detect and remove these '
                 'margins. Sometimes, this can cause the removal of '
                 'margins that should not have been removed. In this '
                 'case you can disable the removal.'
            ),


        OptionRecommendation(name='margin_top',
            recommended_value=5.0, level=OptionRecommendation.LOW,
            help='Set the top margin in pts. Default is %default. '
                 'Setting this to less than zero will cause no margin to be set '
                 '(the margin setting in the original document will be preserved). '
                 'Note: Page oriented formats such as PDF and DOCX have their own'
                 ' margin settings that take precedence.'),

        OptionRecommendation(name='margin_bottom',
            recommended_value=5.0, level=OptionRecommendation.LOW,
            help='Set the bottom margin in pts. Default is %default. '
                 'Setting this to less than zero will cause no margin to be set '
                 '(the margin setting in the original document will be preserved). '
                 'Note: Page oriented formats such as PDF and DOCX have their own'
                 ' margin settings that take precedence.'),

        OptionRecommendation(name='margin_left',
            recommended_value=5.0, level=OptionRecommendation.LOW,
            help='Set the left margin in pts. Default is %default. '
                 'Setting this to less than zero will cause no margin to be set '
                 '(the margin setting in the original document will be preserved). '
                 'Note: Page oriented formats such as PDF and DOCX have their own'
                 ' margin settings that take precedence.'),

        OptionRecommendation(name='margin_right',
            recommended_value=5.0, level=OptionRecommendation.LOW,
            help='Set the right margin in pts. Default is %default. '
                 'Setting this to less than zero will cause no margin to be set '
                 '(the margin setting in the original document will be preserved). '
                 'Note: Page oriented formats such as PDF and DOCX have their own'
                 ' margin settings that take precedence.'),

        OptionRecommendation(name='change_justification',
            recommended_value='original', level=OptionRecommendation.LOW,
            choices=['left','justify','original'],
            help='Change text justification. A value of "left" converts all'
                 ' justified text in the source to left aligned (i.e. '
                 'unjustified) text. A value of "justify" converts all '
                 'unjustified text to justified. A value of "original" '
                 '(the default) does not change justification in the '
                 'source file. Note that only some output formats support '
                 'justification.'),

        OptionRecommendation(name='remove_paragraph_spacing',
            recommended_value=False, level=OptionRecommendation.LOW,
            help='Remove spacing between paragraphs. Also sets an indent on '
                 'paragraphs of 1.5em. Spacing removal will not work '
                 'if the source file does not use paragraphs (<p> or <div> tags).'
            ),

        OptionRecommendation(name='remove_paragraph_spacing_indent_size',
            recommended_value=1.5, level=OptionRecommendation.LOW,
            help='When calibre removes blank lines between paragraphs, it automatically '
                 'sets a paragraph indent, to ensure that paragraphs can be easily '
                 'distinguished. This option controls the width of that indent (in em). '
                 'If you set this value negative, then the indent specified in the input '
                 'document is used, that is, calibre does not change the indentation.'
            ),

        OptionRecommendation(name='prefer_metadata_cover',
            recommended_value=False, level=OptionRecommendation.LOW,
            help='Use the cover detected from the source file in preference '
                 'to the specified cover.'
            ),

        OptionRecommendation(name='insert_blank_line',
            recommended_value=False, level=OptionRecommendation.LOW,
            help='Insert a blank line between paragraphs. Will not work '
                 'if the source file does not use paragraphs (<p> or <div> tags).'
            ),

        OptionRecommendation(name='insert_blank_line_size',
            recommended_value=0.5, level=OptionRecommendation.LOW,
            help='Set the height of the inserted blank lines (in em).'
                 ' The height of the lines between paragraphs will be twice the value'
                 ' set here.'
            ),

        OptionRecommendation(name='remove_first_image',
            recommended_value=False, level=OptionRecommendation.LOW,
            help='Remove the first image from the input e-book. Useful if the '
                 'input document has a cover image that is not identified as a cover. '
                 'In this case, if you set a cover in calibre, the output document will '
                 'end up with two cover images if you do not specify this option.'
            ),

        OptionRecommendation(name='insert_metadata',
            recommended_value=False, level=OptionRecommendation.LOW,
            help='Insert the book metadata at the start of '
                 'the book. This is useful if your e-book reader does not support '
                 'displaying/searching metadata directly.'
            ),

        OptionRecommendation(name='smarten_punctuation',
            recommended_value=False, level=OptionRecommendation.LOW,
            help='Convert plain quotes, dashes and ellipsis to their '
                 'typographically correct equivalents. For details, see '
                 'https://daringfireball.net/projects/smartypants'
            ),

        OptionRecommendation(name='unsmarten_punctuation',
            recommended_value=False, level=OptionRecommendation.LOW,
            help='Convert fancy quotes, dashes and ellipsis to their '
                 'plain equivalents.'
            ),

        OptionRecommendation(name='read_metadata_from_opf',
            recommended_value=None, level=OptionRecommendation.LOW,
            short_switch='m',
            help='Read metadata from the specified OPF file. Metadata read '
                 'from this file will override any metadata in the source '
                 'file.'
            ),

        # The %s in this help text is filled with the Cyrillic example name
        # spelled out below as \u escapes.
        OptionRecommendation(name='asciiize',
            recommended_value=False, level=OptionRecommendation.LOW,
            help='Transliterate unicode characters to an ASCII '
                 'representation. Use with care because this will replace '
                 'unicode characters with ASCII. For instance it will replace "%s" '
                 'with "Mikhail Gorbachiov". Also, note that in '
                 'cases where there are multiple representations of a character '
                 '(characters shared by Chinese and Japanese for instance) the '
                 'representation based on the current calibre interface language will be '
                 'used.' %
                 '\u041c\u0438\u0445\u0430\u0438\u043b '
                 '\u0413\u043e\u0440\u0431\u0430\u0447\u0451\u0432'
            ),

        OptionRecommendation(name='keep_ligatures',
            recommended_value=False, level=OptionRecommendation.LOW,
            help='Preserve ligatures present in the input document. '
                 'A ligature is a special rendering of a pair of '
                 'characters like ff, fi, fl et cetera. '
                 'Most readers do not have support for '
                 'ligatures in their default fonts, so they are '
                 'unlikely to render correctly. By default, calibre '
                 'will turn a ligature into the corresponding pair of normal '
                 'characters. This option will preserve them instead.'
            ),

        OptionRecommendation(name='title',
            recommended_value=None, level=OptionRecommendation.LOW,
            help='Set the title.'),

        OptionRecommendation(name='authors',
            recommended_value=None, level=OptionRecommendation.LOW,
            help='Set the authors. Multiple authors should be separated by '
                 'ampersands.'),

        OptionRecommendation(name='title_sort',
            recommended_value=None, level=OptionRecommendation.LOW,
            help='The version of the title to be used for sorting. '),

        OptionRecommendation(name='author_sort',
            recommended_value=None, level=OptionRecommendation.LOW,
            help='String to be used when sorting by author. '),

        OptionRecommendation(name='cover',
            recommended_value=None, level=OptionRecommendation.LOW,
            help='Set the cover to the specified file or URL'),

        OptionRecommendation(name='comments',
            recommended_value=None, level=OptionRecommendation.LOW,
            help='Set the e-book description.'),

        OptionRecommendation(name='publisher',
            recommended_value=None, level=OptionRecommendation.LOW,
            help='Set the e-book publisher.'),

        OptionRecommendation(name='series',
            recommended_value=None, level=OptionRecommendation.LOW,
            help='Set the series this e-book belongs to.'),

        OptionRecommendation(name='series_index',
            recommended_value=None, level=OptionRecommendation.LOW,
            help='Set the index of the book in this series.'),

        OptionRecommendation(name='rating',
            recommended_value=None, level=OptionRecommendation.LOW,
            help='Set the rating. Should be a number between 1 and 5.'),

        OptionRecommendation(name='isbn',
            recommended_value=None, level=OptionRecommendation.LOW,
            help='Set the ISBN of the book.'),

        OptionRecommendation(name='tags',
            recommended_value=None, level=OptionRecommendation.LOW,
            help='Set the tags for the book. Should be a comma separated list.'),

        OptionRecommendation(name='book_producer',
            recommended_value=None, level=OptionRecommendation.LOW,
            help='Set the book producer.'),

        OptionRecommendation(name='language',
            recommended_value=None, level=OptionRecommendation.LOW,
            help='Set the language.'),

        OptionRecommendation(name='pubdate',
            recommended_value=None, level=OptionRecommendation.LOW,
            help='Set the publication date (assumed to be in the local timezone, unless the timezone is explicitly specified)'),

        OptionRecommendation(name='timestamp',
            recommended_value=None, level=OptionRecommendation.LOW,
            help='Set the book timestamp (no longer used anywhere)'),

        OptionRecommendation(name='enable_heuristics',
            recommended_value=False, level=OptionRecommendation.LOW,
            help='Enable heuristic processing. This option must be set for any '
                 'heuristic processing to take place.'),

        OptionRecommendation(name='markup_chapter_headings',
            recommended_value=True, level=OptionRecommendation.LOW,
            help='Detect unformatted chapter headings and sub headings. Change '
                 'them to h2 and h3 tags. This setting will not create a TOC, '
                 'but can be used in conjunction with structure detection to create '
                 'one.'),

        OptionRecommendation(name='italicize_common_cases',
            recommended_value=True, level=OptionRecommendation.LOW,
            help='Look for common words and patterns that denote '
                 'italics and italicize them.'),

        OptionRecommendation(name='fix_indents',
            recommended_value=True, level=OptionRecommendation.LOW,
            help='Turn indentation created from multiple non-breaking space entities '
                 'into CSS indents.'),

        OptionRecommendation(name='html_unwrap_factor',
            recommended_value=0.40, level=OptionRecommendation.LOW,
            help='Scale used to determine the length at which a line should '
                 'be unwrapped. Valid values are a decimal between 0 and 1. The '
                 'default is 0.4, just below the median line length. If only a '
                 'few lines in the document require unwrapping this value should '
                 'be reduced'),

        OptionRecommendation(name='unwrap_lines',
            recommended_value=True, level=OptionRecommendation.LOW,
            help='Unwrap lines using punctuation and other formatting clues.'),

        OptionRecommendation(name='delete_blank_paragraphs',
            recommended_value=True, level=OptionRecommendation.LOW,
            help='Remove empty paragraphs from the document when they exist between '
                 'every other paragraph'),

        OptionRecommendation(name='format_scene_breaks',
            recommended_value=True, level=OptionRecommendation.LOW,
            help='Left aligned scene break markers are center aligned. Replace soft '
                 'scene breaks that use multiple blank lines with horizontal rules.'),

        OptionRecommendation(name='replace_scene_breaks',
            recommended_value='', level=OptionRecommendation.LOW,
            help='Replace scene breaks with the specified text. By default, the text '
                 'from the input document is used.'),

        OptionRecommendation(name='dehyphenate',
            recommended_value=True, level=OptionRecommendation.LOW,
            help='Analyze hyphenated words throughout the document. The document '
                 'itself is used as a dictionary to determine whether hyphens should '
                 'be retained or removed.'),

        OptionRecommendation(name='renumber_headings',
            recommended_value=True, level=OptionRecommendation.LOW,
            help='Looks for occurrences of sequential <h1> or <h2> tags. The tags are '
                 'renumbered to prevent splitting in the middle of chapter headings.'),

        OptionRecommendation(name='sr1_search',
            recommended_value='', level=OptionRecommendation.LOW,
            help='Search pattern (regular expression) to be replaced with '
                 'sr1-replace.'),

        OptionRecommendation(name='sr1_replace',
            recommended_value='', level=OptionRecommendation.LOW,
            help='Replacement to replace the text found with sr1-search.'),

        OptionRecommendation(name='sr2_search',
            recommended_value='', level=OptionRecommendation.LOW,
            help='Search pattern (regular expression) to be replaced with '
                 'sr2-replace.'),

        OptionRecommendation(name='sr2_replace',
            recommended_value='', level=OptionRecommendation.LOW,
            help='Replacement to replace the text found with sr2-search.'),

        OptionRecommendation(name='sr3_search',
            recommended_value='', level=OptionRecommendation.LOW,
            help='Search pattern (regular expression) to be replaced with '
                 'sr3-replace.'),

        OptionRecommendation(name='sr3_replace',
            recommended_value='', level=OptionRecommendation.LOW,
            help='Replacement to replace the text found with sr3-search.'),

        OptionRecommendation(name='search_replace',
            recommended_value=None, level=OptionRecommendation.LOW,
            help='Path to a file containing search and replace regular expressions. '
                 'The file must contain alternating lines of regular expression '
                 'followed by replacement pattern (which can be an empty line). '
                 'The regular expression must be in the Python regex syntax and '
                 'the file must be UTF-8 encoded.'),
    ]
    # }}}

    # The input format is taken from the file extension: leading dot
    # dropped, lower-cased, and any 'original_' marker removed.
    input_fmt = os.path.splitext(self.input)[1]
    if not input_fmt:
        raise ValueError('Input file must have an extension')
    input_fmt = input_fmt[1:].lower().replace('original_', '')
    if view_kepub and input_fmt.lower() == 'kepub':
        input_fmt = 'epub'
    self.archive_input_tdir = None
    self.changed_options = set()
    # Archive inputs are unpacked into a persistent temporary directory;
    # the e-book found inside becomes the effective input file and format.
    if input_fmt in ARCHIVE_FMTS:
        self.log('Processing archive...')
        tdir = PersistentTemporaryDirectory('_pl_arc')
        self.input, input_fmt = self.unarchive(self.input, tdir)
        self.archive_input_tdir = tdir
    # Preprocess plugins only run on a readable input. If they hand back a
    # different path, switch to it and derive the format again from its
    # extension.
    if os.access(self.input, os.R_OK):
        nfp = run_plugins_on_preprocess(self.input, input_fmt)
        if nfp != self.input:
            self.input = nfp
            input_fmt = os.path.splitext(self.input)[1]
            if not input_fmt:
                raise ValueError('Input file must have an extension')
            input_fmt = input_fmt[1:].lower()

    # An existing output directory, or an output path without extension,
    # selects the 'oeb' output format.
    if os.path.exists(self.output) and os.path.isdir(self.output):
        output_fmt = 'oeb'
    else:
        output_fmt = os.path.splitext(self.output)[1]
        if not output_fmt:
            output_fmt = '.oeb'
        output_fmt = output_fmt[1:].lower()

    self.input_plugin = plugin_for_input_format(input_fmt)
    self.output_plugin = plugin_for_output_format(output_fmt)

    if self.input_plugin is None:
        raise ValueError('No plugin to handle input format: '+input_fmt)

    if self.output_plugin is None:
        raise ValueError('No plugin to handle output format: '+output_fmt)

    self.input_fmt = input_fmt
    self.output_fmt = output_fmt

    self.all_format_options = set()
    self.input_options = set()
    self.output_options = set()
    # Build set of all possible options. Two options are equal if their
    # names are the same.
    if not dummy:
        self.input_options = self.input_plugin.options.union(
            self.input_plugin.common_options)
        self.output_options = self.output_plugin.options.union(
            self.output_plugin.common_options)
    else:
        # Dummy mode: gather the options of every available input and
        # output plugin instead of only the two selected ones.
        for fmt in available_input_formats():
            input_plugin = plugin_for_input_format(fmt)
            if input_plugin:
                self.all_format_options = self.all_format_options.union(
                    input_plugin.options.union(input_plugin.common_options))
        for fmt in available_output_formats():
            output_plugin = plugin_for_output_format(fmt)
            if output_plugin:
                self.all_format_options = self.all_format_options.union(
                    output_plugin.options.union(output_plugin.common_options))

    # Remove the options that have been disabled by recommendations from the
    # plugins.
    # Each option set is first replaced by a set of clones.
    # NOTE(review): presumably so that later changes do not touch the
    # option objects owned by the plugins -- confirm against clone().
    for w in ('input_options', 'output_options',
            'all_format_options'):
        temp = set()
        for x in getattr(self, w):
            temp.add(x.clone())
        setattr(self, w, temp)
    if merge_plugin_recs:
        self.merge_plugin_recommendations()
|
|
|
|
@classmethod
def unarchive(self, path, tdir):
    '''
    Extract the archive at `path` into `tdir` and pick the e-book inside it.

    Every available input format except the HTML family and the archive
    formats themselves is tried first; `txt` and `rtf` files smaller than
    2048 bytes are skipped. When nothing matches, the choice is delegated
    to `find_html_index`.

    Returns a `(file_path, extension)` tuple.
    '''
    from ebook_converter.customize.ui import available_input_formats

    extract(path, tdir)
    extracted = [
        entry if isinstance(entry, str) else entry.decode(filesystem_encoding)
        for entry in walk(tdir)
    ]

    candidates = set(available_input_formats())
    candidates.difference_update({'htm', 'html', 'xhtm', 'xhtml'})
    candidates.difference_update(set(ARCHIVE_FMTS))

    for ext in candidates:
        suffix = '.' + ext
        size_matters = ext in ['txt', 'rtf']
        for entry in extracted:
            if not entry.lower().endswith(suffix):
                continue
            if size_matters and os.stat(entry).st_size < 2048:
                # Too small to be the book itself, keep looking.
                continue
            return entry, ext
    return self.find_html_index(extracted)
|
|
|
|
@classmethod
def find_html_index(self, files):
    '''
    Given a list of files, find the most likely root HTML file in the
    list.

    Only paths ending in .htm, .html, .xhtm or .xhtml (case insensitive)
    are considered. A file whose base name is `toc` is preferred, then one
    named `index`; otherwise the largest HTML file wins.

    Returns a `(path, extension)` tuple, the extension being lower case
    and without the leading dot. Raises ValueError when `files` contains
    no HTML file at all.
    '''
    # `re` is not among the imports at the top of this module, so import it
    # locally to keep this method self-contained.
    import re

    html_pat = re.compile(r'\.(x){0,1}htm(l){0,1}$', re.IGNORECASE)
    html_files = [f for f in files if html_pat.search(f) is not None]
    if not html_files:
        raise ValueError('Could not find an e-book inside the archive')

    # Stable sort by size, smallest first: the last entry is the largest.
    html_files.sort(key=lambda f: os.stat(f).st_size)

    for q in ('toc', 'index'):
        for f in html_files:
            if os.path.splitext(os.path.basename(f))[0].lower() == q:
                return f, os.path.splitext(f)[1].lower()[1:]

    largest = html_files[-1]
    return largest, os.path.splitext(largest)[1].lower()[1:]
|
|
|
|
def get_all_options(self):
    '''
    Return a dict mapping every known option to its recommended value.

    Groups are visited in the order input, pipeline, output, all-format;
    a later group overrides the value of an equal option seen earlier.
    '''
    groups = (self.input_options, self.pipeline_options,
              self.output_options, self.all_format_options)
    return {rec.option: rec.recommended_value
            for group in groups
            for rec in group}
|
|
|
|
def get_option_by_name(self, name):
    '''
    Return the first recommendation whose option compares equal to `name`.

    The input, pipeline, output and all-format groups are searched in that
    order; None is returned when nothing matches.
    '''
    groups = (self.input_options, self.pipeline_options,
              self.output_options, self.all_format_options)
    matching = (rec for group in groups for rec in group
                if rec.option == name)
    return next(matching, None)
|
|
|
|
def get_option_help(self, name):
    '''
    Return the help text of the option called `name`, with the `%default`
    placeholder replaced by the current recommended value.

    Returns None when the option is unknown or carries no help text.
    '''
    rec = self.get_option_by_name(name)
    text = getattr(rec, 'help', None)
    if text is None:
        return None
    return text.replace('%default', str(rec.recommended_value))
|
|
|
|
def get_all_help(self):
    '''
    Return a dict mapping option names to their raw help text.

    Recommendations without a `help` attribute, or whose help is None, are
    left out. Later groups override earlier ones for the same name.
    '''
    groups = (self.input_options, self.pipeline_options,
              self.output_options, self.all_format_options)
    collected = {}
    for rec in (r for group in groups for r in group):
        text = getattr(rec, 'help', None)
        if text is None:
            continue
        collected[rec.option.name] = text
    return collected
|
|
|
|
def merge_plugin_recs(self, plugin):
    '''
    Apply the `(name, value, level)` recommendations of `plugin`.

    A recommendation is applied only when the option exists and its
    current level does not exceed the level the plugin recommends.
    '''
    for name, val, level in plugin.recommendations:
        rec = self.get_option_by_name(name)
        if rec is None:
            continue
        if rec.level > level:
            # The existing recommendation is stronger, leave it alone.
            continue
        rec.recommended_value = val
        rec.level = level
|
|
|
|
def merge_plugin_recommendations(self):
    '''
    Merge the recommendations of the input plugin, then those of the
    output plugin, into the known options.
    '''
    first, second = self.input_plugin, self.output_plugin
    self.merge_plugin_recs(first)
    self.merge_plugin_recs(second)
|
|
|
|
def merge_ui_recommendations(self, recommendations):
    '''
    Merge `(name, value, level)` recommendations coming from the UI.

    A UI value replaces the baseline one when its level is at least the
    baseline level, unless the baseline level is already `HIGH`. Options
    whose value effectively changed are recorded in
    `self.changed_options`.
    '''
    # For these options any two "empty" values count as equal.
    blank_is_equal = {'sr1_search', 'sr1_replace', 'sr2_search', 'sr2_replace', 'sr3_search', 'sr3_replace', 'filter_css', 'comments'}
    # For these options the UI sends '[]' where the baseline has None.
    empty_list_is_none = {'transform_css_rules', 'search_replace'}

    def differs(name, current, new):
        if name in blank_is_equal and not (current or new):
            return False
        if name in empty_list_is_none and new == '[]':
            new = None
        return not (current == new)

    for name, val, level in recommendations:
        rec = self.get_option_by_name(name)
        if rec is None:
            continue
        if rec.level > level or rec.level >= rec.HIGH:
            continue
        changed = differs(name, rec.recommended_value, val)
        rec.recommended_value = val
        rec.level = level
        if changed:
            self.changed_options.add(rec)
|
|
|
|
def opts_to_mi(self, mi):
    '''
    Copy the metadata related options from `self.opts` onto `mi`.

    Every name in `self.metadata_option_names` whose option value is not
    None is converted and set as an attribute on `mi`:

    * `authors` goes through `string_to_authors`,
    * `tags` is split on commas and stripped,
    * `rating` and `series_index` are converted to float,
    * `timestamp` and `pubdate` are parsed with `parse_date` (only
      `timestamp` is assumed to be UTC).

    Values that fail to convert are logged and skipped, they never abort
    the conversion.
    '''
    from ebook_converter.ebooks.metadata import string_to_authors

    for x in self.metadata_option_names:
        val = getattr(self.opts, x, None)
        if val is None:
            continue
        if x == 'authors':
            val = string_to_authors(val)
        elif x == 'tags':
            val = [i.strip() for i in val.split(',')]
        elif x in ('rating', 'series_index'):
            try:
                val = float(val)
            except (TypeError, ValueError):
                self.log.warn('Values of series index and rating must'
                              ' be numbers. Ignoring', val)
                continue
        elif x in ('timestamp', 'pubdate'):
            try:
                val = parse_date(val, assume_utc=(x == 'timestamp'))
            except Exception:
                # Interpolate here: the logger is used print-style in this
                # file, so a separate argument would leave a literal '%s'
                # in the message.
                self.log.exception(
                    'Failed to parse date/time %s' % (val,))
                continue
        setattr(mi, x, val)
|
|
|
|
def read_user_metadata(self):
    '''
    Read all metadata specified by the user. Command line options override
    metadata from a specified OPF file.

    The result is stored in `self.user_metadata`. When a local cover path
    is given, the image is read into `mi.cover_data` as an
    `(extension, bytes)` tuple; `mi.cover` is reset to None afterwards.
    '''
    from ebook_converter.ebooks.metadata import MetaInformation
    from ebook_converter.ebooks.metadata.opf2 import OPF

    mi = MetaInformation(None, [])
    if self.opts.read_metadata_from_opf is not None:
        self.opts.read_metadata_from_opf = os.path.abspath(
            self.opts.read_metadata_from_opf)
        with open(self.opts.read_metadata_from_opf, 'rb') as stream:
            opf = OPF(stream,
                      os.path.dirname(self.opts.read_metadata_from_opf))
        mi = opf.to_book_metadata()
    self.opts_to_mi(mi)

    if mi.cover:
        if mi.cover.startswith(('http:', 'https:')):
            # Downloading is not implemented. Trying to open() the URL as
            # a local file would only crash, so the cover is skipped.
            self.log.warn("TODO: Cover image is on remote server, "
                          "implement downloading using requests")
        else:
            ext = mi.cover.rpartition('.')[-1].lower().strip()
            if ext not in ('png', 'jpg', 'jpeg', 'gif'):
                ext = 'jpg'
            with open(mi.cover, 'rb') as stream:
                mi.cover_data = (ext, stream.read())
        mi.cover = None
    self.user_metadata = mi
|
|
|
|
def setup_options(self):
    '''
    Setup the `self.opts` object.

    Every known option is copied onto a fresh `OptionValues` instance,
    the input/output profile names are resolved to profile objects, user
    metadata is read and, depending on verbosity, the effective options
    are logged.
    '''
    self.opts = OptionValues()
    for group in (self.input_options, self.pipeline_options,
                  self.output_options, self.all_format_options):
        for rec in group:
            setattr(self.opts, rec.option.name, rec.recommended_value)

    def set_profile(profiles, which):
        # Replace the profile short name stored in the options with the
        # matching profile object, falling back to the 'default' profile.
        attr = which + '_profile'
        sval = getattr(self.opts, attr)
        for x in profiles():
            if x.short_name == sval:
                setattr(self.opts, attr, x)
                return
        self.log.warn(
            'Profile (%s) %r is no longer available, using default'%(which, sval))
        for x in profiles():
            if x.short_name == 'default':
                setattr(self.opts, attr, x)
                break

    set_profile(input_profiles, 'input')
    set_profile(output_profiles, 'output')

    self.read_user_metadata()
    self.opts.no_inline_navbars = (
        self.opts.output_profile.supports_mobi_indexing
        and self.output_fmt == 'mobi')
    if self.opts.verbose:
        self.log.filter_level = self.log.DEBUG
    if self.changed_options:
        self.log('Conversion options changed from defaults:')
        for rec in self.changed_options:
            # Never write credentials to the log.
            if rec.option.name not in ('username', 'password'):
                self.log(' ', '%s:' % rec.option.name, repr(rec.recommended_value))
    if self.opts.verbose > 1:
        self.log.debug('Resolved conversion options')
        try:
            self.log.debug('ebook_converter version:', __version__)
            odict = dict(self.opts.__dict__)
            for x in ('username', 'password'):
                odict.pop(x, None)
            self.log.debug(pprint.pformat(odict))
        except Exception:
            # Purely diagnostic output: report the failure and carry on,
            # but let KeyboardInterrupt/SystemExit propagate.
            self.log.exception('Failed to get resolved conversion options')
|
|
|
|
def flush(self):
    '''
    Best-effort flush of stdout followed by stderr; any error raised
    while flushing is ignored.
    '''
    try:
        for stream_name in ('stdout', 'stderr'):
            getattr(sys, stream_name).flush()
    except Exception:
        pass
|
|
|
|
def dump_oeb(self, oeb, out_dir):
    '''
    Write `oeb` into `out_dir` using an `OEBWriter` configured with the
    current `pretty_print` option.
    '''
    from ebook_converter.ebooks.oeb.writer import OEBWriter

    OEBWriter(pretty_print=self.opts.pretty_print)(oeb, out_dir)
|
|
|
|
def dump_input(self, ret, output_dir):
    '''
    Save the result of the input plugin to the `input` sub-directory of
    the debug pipeline directory.

    ret        -- what the input plugin returned. For a `str`/`bytes`
                  value, `output_dir` is copied as is; anything else is
                  written out with `dump_oeb`.
    output_dir -- directory the input plugin worked in.

    For the `recipe` input format the dump is additionally zipped into
    `periodical.downloaded_recipe` in the debug directory.
    '''
    out_dir = os.path.join(self.opts.debug_pipeline, 'input')
    if isinstance(ret, (str, bytes)):
        shutil.copytree(output_dir, out_dir)
    else:
        os.makedirs(out_dir, exist_ok=True)
        self.dump_oeb(ret, out_dir)

    if self.input_fmt == 'recipe':
        zf = ZipFile(os.path.join(self.opts.debug_pipeline,
                                  'periodical.downloaded_recipe'), 'w')
        try:
            zf.add_dir(out_dir)
            with self.input_plugin:
                self.input_plugin.save_download(zf)
        finally:
            # Close the archive even when saving the download fails.
            zf.close()

    self.log.info('Input debug saved to:', out_dir)
|
|
|
|
def run(self):
    '''
    Run the conversion pipeline

    The stages, in order: resolve the options, optionally prepare the
    debug directory, run the preprocess plugins, let the input plugin
    turn the source into an OEB book, apply the OEB transforms, let the
    output plugin write the result and finally run the postprocess
    plugins.

    Returns None. The method returns early when an AZW4 input is
    converted to PDF (the embedded PDF is simply unwrapped), when
    `self.abort_after_input_dump` is set while a debug pipeline is
    active, or when `self.for_regex_wizard` is set.
    '''
    # Setup baseline option values
    self.setup_options()
    if self.opts.verbose:
        self.log.filter_level = self.log.DEBUG
    if self.for_regex_wizard and hasattr(self.opts, 'no_process'):
        self.opts.no_process = True
    self.flush()
    if self.opts.embed_all_fonts or self.opts.embed_font_family:
        # Start the threaded font scanner now, for performance
        from ebook_converter.utils.fonts.scanner import font_scanner # noqa
    import css_parser, logging
    # Only warnings and above from css_parser.
    css_parser.log.setLevel(logging.WARN)

    if self.opts.debug_pipeline is not None:
        # Debugging implies a verbosity of at least 4.
        self.opts.verbose = max(self.opts.verbose, 4)
        self.opts.debug_pipeline = os.path.abspath(self.opts.debug_pipeline)
        if not os.path.exists(self.opts.debug_pipeline):
            os.makedirs(self.opts.debug_pipeline)
        with open(os.path.join(self.opts.debug_pipeline, 'README.txt'), 'wb') as f:
            f.write(DEBUG_README)
        # Remove the stage directories left over from a previous run.
        for x in ('input', 'parsed', 'structure', 'processed'):
            x = os.path.join(self.opts.debug_pipeline, x)
            if os.path.exists(x):
                shutil.rmtree(x)

    # Run any preprocess plugins
    from ebook_converter.customize.ui import run_plugins_on_preprocess
    self.input = run_plugins_on_preprocess(self.input)

    self.flush()
    # Create an OEBBook from the input file. The input plugin does all the
    # heavy lifting.
    accelerators = {}

    tdir = PersistentTemporaryDirectory('_plumber')
    # For recipes the input itself is handed to the plugin, otherwise an
    # open binary file object.
    # NOTE(review): the file object opened here is never explicitly closed
    # in this method - confirm whether the input plugin takes care of it.
    stream = self.input if self.input_fmt == 'recipe' else \
        open(self.input, 'rb')
    if self.input_fmt == 'recipe':
        self.opts.original_recipe_input_arg = self.original_input_arg

    if hasattr(self.opts, 'lrf') and self.output_plugin.file_type == 'lrf':
        self.opts.lrf = True
    if self.input_fmt == 'azw4' and self.output_plugin.file_type == 'pdf':
        # Shortcut: no real conversion, just unwrap the embedded PDF.
        self.ui_reporter(0.01, 'AZW4 files are simply wrappers around PDF files.'
                         ' Skipping the conversion and unwrapping the embedded PDF instead')
        from ebook_converter.ebooks.azw4.reader import unwrap
        unwrap(stream, self.output)
        self.ui_reporter(1.)
        self.log(self.output_fmt.upper(), 'output written to', self.output)
        self.flush()
        return

    self.ui_reporter(0.01, 'Converting input to HTML...')
    # Progress reporter for the input stage; presumably maps the plugin's
    # progress onto the 0.01-0.34 slice of the overall progress.
    ir = CompositeProgressReporter(0.01, 0.34, self.ui_reporter)
    self.input_plugin.report_progress = ir
    if self.for_regex_wizard:
        self.input_plugin.for_viewer = True
    self.output_plugin.specialize_options(self.log, self.opts, self.input_fmt)
    with self.input_plugin:
        self.oeb = self.input_plugin(stream, self.opts,
                                     self.input_fmt, self.log,
                                     accelerators, tdir)
        if self.opts.debug_pipeline is not None:
            self.dump_input(self.oeb, tdir)
            if self.abort_after_input_dump:
                return
        if self.input_fmt in ('recipe', 'downloaded_recipe'):
            self.opts_to_mi(self.user_metadata)
        # A plugin result without a `manifest` attribute is not an OEB
        # book yet, so it is passed to create_oebbook() to build one.
        if not hasattr(self.oeb, 'manifest'):
            self.oeb = create_oebbook(
                self.log, self.oeb, self.opts,
                encoding=self.input_plugin.output_encoding,
                for_regex_wizard=self.for_regex_wizard, removed_items=getattr(self.input_plugin, 'removed_items_to_ignore', ()))
        if self.for_regex_wizard:
            return
        self.input_plugin.postprocess_book(self.oeb, self.opts, self.log)
        self.opts.is_image_collection = self.input_plugin.is_image_collection
        # Progress reporter for the transform stage (0.34-0.67).
        pr = CompositeProgressReporter(0.34, 0.67, self.ui_reporter)
        self.flush()
        if self.opts.debug_pipeline is not None:
            out_dir = os.path.join(self.opts.debug_pipeline, 'parsed')
            self.dump_oeb(self.oeb, out_dir)
            self.log('Parsed HTML written to:', out_dir)
        self.input_plugin.specialize(self.oeb, self.opts, self.log,
                                     self.output_fmt)

    pr(0., 'Running transforms on e-book...')

    self.oeb.plumber_output_format = self.output_fmt or ''

    from ebook_converter.ebooks.oeb.transforms.data_url import DataURL
    DataURL()(self.oeb, self.opts)
    from ebook_converter.ebooks.oeb.transforms.guide import Clean
    Clean()(self.oeb, self.opts)
    pr(0.1)
    self.flush()

    # The profiles are also exposed under the names `source` and `dest`.
    self.opts.source = self.opts.input_profile
    self.opts.dest = self.opts.output_profile

    from ebook_converter.ebooks.oeb.transforms.jacket import RemoveFirstImage
    RemoveFirstImage()(self.oeb, self.opts, self.user_metadata)
    from ebook_converter.ebooks.oeb.transforms.metadata import MergeMetadata
    MergeMetadata()(self.oeb, self.user_metadata, self.opts,
                    override_input_metadata=self.override_input_metadata)
    pr(0.2)
    self.flush()

    from ebook_converter.ebooks.oeb.transforms.structure import DetectStructure
    DetectStructure()(self.oeb, self.opts)
    pr(0.35)
    self.flush()

    if self.output_plugin.file_type not in ('epub', 'kepub'):
        # Remove the toc reference to the html cover, if any, except for
        # epub, as the epub output plugin will do the right thing with it.
        item = getattr(self.oeb.toc, 'item_that_refers_to_cover', None)
        if item is not None and item.count() == 0:
            self.oeb.toc.remove(item)

    from ebook_converter.ebooks.oeb.transforms.flatcss import CSSFlattener
    # A (near) zero base font size means "use the output profile's value".
    fbase = self.opts.base_font_size
    if fbase < 1e-4:
        fbase = float(self.opts.dest.fbase)
    # The font size mapping is a comma separated list of floats; fall back
    # to the output profile's key when it is missing or malformed.
    fkey = self.opts.font_size_mapping
    if fkey is None:
        fkey = self.opts.dest.fkey
    else:
        try:
            fkey = list(map(float, fkey.split(',')))
        except Exception:
            self.log.error('Invalid font size key: %r ignoring'%fkey)
            fkey = self.opts.dest.fkey

    from ebook_converter.ebooks.oeb.transforms.jacket import Jacket
    Jacket()(self.oeb, self.opts, self.user_metadata)
    pr(0.4)
    self.flush()

    if self.opts.debug_pipeline is not None:
        out_dir = os.path.join(self.opts.debug_pipeline, 'structure')
        self.dump_oeb(self.oeb, out_dir)
        self.log('Structured HTML written to:', out_dir)

    # `extra_css` may name a file, in which case its content is used.
    if self.opts.extra_css and os.path.exists(self.opts.extra_css):
        with open(self.opts.extra_css, 'rb') as f:
            self.opts.extra_css = f.read()

    # Remember these two options: they are switched off for LRF output
    # while the CSS is flattened and restored right afterwards.
    oibl = self.opts.insert_blank_line
    orps = self.opts.remove_paragraph_spacing
    if self.output_plugin.file_type == 'lrf':
        self.opts.insert_blank_line = False
        self.opts.remove_paragraph_spacing = False
    # A (near) zero line height is passed to the flattener as None.
    line_height = self.opts.line_height
    if line_height < 1e-4:
        line_height = None

    if self.opts.linearize_tables and \
            self.output_plugin.file_type not in ('mobi', 'lrf'):
        from ebook_converter.ebooks.oeb.transforms.linearize_tables import LinearizeTables
        LinearizeTables()(self.oeb, self.opts)

    if self.opts.unsmarten_punctuation:
        from ebook_converter.ebooks.oeb.transforms.unsmarten import UnsmartenPunctuation
        UnsmartenPunctuation()(self.oeb, self.opts)

    # LIT output and old-style MOBI output get the untable/unfloat
    # treatment from the flattener.
    mobi_file_type = getattr(self.opts, 'mobi_file_type', 'old')
    needs_old_markup = (self.output_plugin.file_type == 'lit' or (
        self.output_plugin.file_type == 'mobi' and mobi_file_type == 'old'))
    # The CSS transform rules may arrive serialized as JSON.
    transform_css_rules = ()
    if self.opts.transform_css_rules:
        transform_css_rules = self.opts.transform_css_rules
        if isinstance(transform_css_rules, (str, bytes)):
            transform_css_rules = json.loads(transform_css_rules)
    flattener = CSSFlattener(fbase=fbase, fkey=fkey,
                             lineh=line_height,
                             untable=needs_old_markup,
                             unfloat=needs_old_markup,
                             page_break_on_body=self.output_plugin.file_type in ('mobi',
                                                                                 'lit'),
                             transform_css_rules=transform_css_rules,
                             specializer=functools.partial(self.output_plugin.specialize_css_for_output,
                                                           self.log, self.opts))
    flattener(self.oeb, self.opts)
    self.opts._final_base_font_size = fbase

    self.opts.insert_blank_line = oibl
    self.opts.remove_paragraph_spacing = orps

    from ebook_converter.ebooks.oeb.transforms.page_margin import \
        RemoveFakeMargins, RemoveAdobeMargins
    RemoveFakeMargins()(self.oeb, self.log, self.opts)
    RemoveAdobeMargins()(self.oeb, self.log, self.opts)

    if self.opts.embed_all_fonts:
        from ebook_converter.ebooks.oeb.transforms.embed_fonts import EmbedFonts
        EmbedFonts()(self.oeb, self.log, self.opts)

    # Font subsetting is skipped for PDF output.
    if self.opts.subset_embedded_fonts and self.output_plugin.file_type != 'pdf':
        from ebook_converter.ebooks.oeb.transforms.subset import SubsetFonts
        SubsetFonts()(self.oeb, self.log, self.opts)

    pr(0.9)
    self.flush()

    from ebook_converter.ebooks.oeb.transforms.trimmanifest import ManifestTrimmer

    self.log.info('Cleaning up manifest...')
    trimmer = ManifestTrimmer()
    trimmer(self.oeb, self.opts)

    self.oeb.toc.rationalize_play_orders()
    pr(1.)
    self.flush()

    if self.opts.debug_pipeline is not None:
        out_dir = os.path.join(self.opts.debug_pipeline, 'processed')
        self.dump_oeb(self.oeb, out_dir)
        self.log('Processed HTML written to:', out_dir)

    self.log.info('Creating %s...'%self.output_plugin.name)
    # Progress reporter for the output stage (0.67-1.0).
    our = CompositeProgressReporter(0.67, 1., self.ui_reporter)
    self.output_plugin.report_progress = our
    our(0., 'Running %s plugin' % self.output_plugin.name)
    with self.output_plugin:
        self.output_plugin.convert(self.oeb, self.output, self.input_plugin,
                                   self.opts, self.log)
    self.oeb.clean_temp_files()
    self.ui_reporter(1.)
    run_plugins_on_postprocess(self.output, self.output_fmt)

    self.log(self.output_fmt.upper(), 'output written to', self.output)
    self.flush()
|
|
|
|
|
|
# Callback handed to HTMLPreProcessor by create_oebbook(); replace it with
# set_regex_wizard_callback(). It has to be a module-level global because
# create_oebbook() can be called from other locations (for example from the
# HTML input plugin).
regex_wizard_callback = None
|
|
|
|
|
|
def set_regex_wizard_callback(f):
    '''
    Install `f` as the module-wide regex wizard callback.
    '''
    globals()['regex_wizard_callback'] = f
|
|
|
|
|
|
def create_oebbook(log, path_or_stream, opts, reader=None,
                   encoding='utf-8', populate=True, for_regex_wizard=False,
                   specialize=None, removed_items=()):
    '''
    Create an OEBBook.

    log            -- logger, also called directly to report progress.
    path_or_stream -- source handed to the reader.
    opts           -- conversion options; `opts.pretty_print` is used.
    reader         -- reader class, `OEBReader` when None.
    encoding       -- input encoding; any false value is passed as None.
    populate       -- when false, the empty book is returned right away.
    for_regex_wizard -- accepted but not used by this function.
    specialize     -- optional callable given the empty book; a true
                      return value replaces the book.
    removed_items  -- stored on the book as `removed_items_to_ignore`.
    '''
    from ebook_converter.ebooks.oeb.base import OEBBook

    preprocessor = HTMLPreProcessor(
        log, opts, regex_wizard_callback=regex_wizard_callback)
    book = OEBBook(log, preprocessor,
                   pretty_print=opts.pretty_print,
                   input_encoding=encoding or None)
    if populate:
        if specialize is not None:
            replacement = specialize(book)
            if replacement:
                book = replacement
        # Read OEB Book into OEBBook
        log('Parsing all content...')
        book.removed_items_to_ignore = removed_items
        if reader is None:
            from ebook_converter.ebooks.oeb.reader import OEBReader
            reader = OEBReader
        reader()(book, path_or_stream)
    return book
|
|
|
|
|
|
def create_dummy_plumber(input_format, output_format):
    '''
    Build a Plumber for placeholder `dummy.<format>` file names.

    Both formats are lower-cased. For an archive input format the input
    file name is `dummy.html`. The plumber gets a `Log` whose `outputs`
    list is empty.
    '''
    from ebook_converter.utils.logging import Log

    in_fmt = input_format.lower()
    out_fmt = output_format.lower()
    quiet_log = Log()
    quiet_log.outputs = []
    source = 'dummy.html' if in_fmt in ARCHIVE_FMTS else 'dummy.' + in_fmt
    return Plumber(source, 'dummy.' + out_fmt, quiet_log)
|