mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-01-05 18:54:11 +01:00
Here is the first batch of modules, which are needed for converting several formats to LRF. Some of the logic has been changed; more cleanups will follow.
309 lines
14 KiB
Python
309 lines
14 KiB
Python
# -*- coding: utf-8 -*-
|
|
from __future__ import absolute_import, division, print_function, unicode_literals
|
|
|
|
__license__ = 'GPL 3'
|
|
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
|
__docformat__ = 'restructuredtext en'
|
|
|
|
import os
|
|
|
|
from ebook_converter import _ent_pat, walk, xml_entity_to_unicode
|
|
from ebook_converter.customize.conversion import InputFormatPlugin, OptionRecommendation
|
|
from ebook_converter.polyglot.builtins import getcwd
|
|
|
|
# Maps Python-Markdown extension names to translated, human-readable
# descriptions.  Used to build the help text of the "markdown_extensions"
# option and exposed to the GUI through TXTInput.ui_data.
MD_EXTENSIONS = {
    'abbr': _('Abbreviations'),
    'admonition': _('Support admonitions'),
    'attr_list': _('Add attribute to HTML tags'),
    'codehilite': _('Add code highlighting via Pygments'),
    'def_list': _('Definition lists'),
    'extra': _('Enables various common extensions'),
    'fenced_code': _('Alternative code block syntax'),
    'footnotes': _('Footnotes'),
    'legacy_attrs': _('Use legacy element attributes'),
    'legacy_em': _('Use legacy underscore handling for connected words'),
    'meta': _('Metadata in the document'),
    'nl2br': _('Treat newlines as hard breaks'),
    'sane_lists': _('Do not allow mixing list types'),
    'smarty': _('Use markdown\'s internal smartypants parser'),
    'tables': _('Support tables'),
    'toc': _('Generate a table of contents'),
    'wikilinks': _('Wiki style links'),
}
|
|
|
|
|
|
class TXTInput(InputFormatPlugin):
    """Input plugin converting plain-text based formats to HTML.

    Handles raw text (txt/text), zipped text archives (txtz) and the
    lightweight markup languages Markdown (md/markdown) and Textile.
    The produced HTML is then fed through the regular HTML input plugin
    in convert().
    """

    name = 'TXT Input'
    author = 'John Schember'
    description = 'Convert TXT files to HTML'
    # txtz is a zip archive containing text files plus their resources.
    file_types = {'txt', 'txtz', 'text', 'md', 'textile', 'markdown'}
    commit_name = 'txt_input'
    # Data consumed by the GUI to build the conversion options dialog;
    # the keys of the nested dicts double as the valid option choices.
    ui_data = {
        'md_extensions': MD_EXTENSIONS,
        'paragraph_types': {
            'auto': _('Try to auto detect paragraph type'),
            'block': _('Treat a blank line as a paragraph break'),
            'single': _('Assume every line is a paragraph'),
            'print': _('Assume every line starting with 2+ spaces or a tab starts a paragraph'),
            'unformatted': _('Most lines have hard line breaks, few/no blank lines or indents'),
            'off': _('Don\'t modify the paragraph structure'),
        },
        'formatting_types': {
            'auto': _('Automatically decide which formatting processor to use'),
            'plain': _('No formatting'),
            'heuristic': _('Use heuristics to determine chapter headings, italics, etc.'),
            'textile': _('Use the TexTile markup language'),
            'markdown': _('Use the Markdown markup language')
        },
    }

    # Conversion options recognised by this plugin; the pipeline reads
    # them from the `options` object passed to convert().
    options = {
        OptionRecommendation(name='formatting_type', recommended_value='auto',
            choices=list(ui_data['formatting_types']),
            help=_('Formatting used within the document.\n'
                   '* auto: {auto}\n'
                   '* plain: {plain}\n'
                   '* heuristic: {heuristic}\n'
                   '* textile: {textile}\n'
                   '* markdown: {markdown}\n'
                   'To learn more about markdown see {url}').format(
                url='https://daringfireball.net/projects/markdown/', **ui_data['formatting_types'])
        ),
        OptionRecommendation(name='paragraph_type', recommended_value='auto',
            choices=list(ui_data['paragraph_types']),
            help=_('Paragraph structure to assume. The value of "off" is useful for formatted documents such as Markdown or Textile. '
                   'Choices are:\n'
                   '* auto: {auto}\n'
                   '* block: {block}\n'
                   '* single: {single}\n'
                   '* print: {print}\n'
                   '* unformatted: {unformatted}\n'
                   '* off: {off}').format(**ui_data['paragraph_types'])
        ),
        OptionRecommendation(name='preserve_spaces', recommended_value=False,
            help=_('Normally extra spaces are condensed into a single space. '
                   'With this option all spaces will be displayed.')),
        OptionRecommendation(name='txt_in_remove_indents', recommended_value=False,
            help=_('Normally extra space at the beginning of lines is retained. '
                   'With this option they will be removed.')),
        OptionRecommendation(name="markdown_extensions", recommended_value='footnotes, tables, toc',
            help=_('Enable extensions to markdown syntax. Extensions are formatting that is not part '
                   'of the standard markdown format. The extensions enabled by default: %default.\n'
                   'To learn more about markdown extensions, see {}\n'
                   'This should be a comma separated list of extensions to enable:\n'
                   ).format('https://python-markdown.github.io/extensions/') + '\n'.join('* %s: %s' % (k, MD_EXTENSIONS[k]) for k in sorted(MD_EXTENSIONS))),
    }
|
|
|
|
def shift_file(self, fname, data):
|
|
name, ext = os.path.splitext(fname)
|
|
candidate = os.path.join(self.output_dir, fname)
|
|
c = 0
|
|
while os.path.exists(candidate):
|
|
c += 1
|
|
candidate = os.path.join(self.output_dir, '{}-{}{}'.format(name, c, ext))
|
|
ans = candidate
|
|
with open(ans, 'wb') as f:
|
|
f.write(data)
|
|
return f.name
|
|
|
|
def fix_resources(self, html, base_dir):
|
|
from html5_parser import parse
|
|
root = parse(html)
|
|
changed = False
|
|
for img in root.xpath('//img[@src]'):
|
|
src = img.get('src')
|
|
prefix = src.split(':', 1)[0].lower()
|
|
if prefix not in ('file', 'http', 'https', 'ftp') and not os.path.isabs(src):
|
|
src = os.path.join(base_dir, src)
|
|
if os.access(src, os.R_OK):
|
|
with open(src, 'rb') as f:
|
|
data = f.read()
|
|
f = self.shift_file(os.path.basename(src), data)
|
|
changed = True
|
|
img.set('src', os.path.basename(f))
|
|
if changed:
|
|
from lxml import etree
|
|
html = etree.tostring(root, encoding='unicode')
|
|
return html
|
|
|
|
    def convert(self, stream, options, file_ext, log,
                accelerators):
        """Convert a plain-text based input file to an OEB book.

        Pipeline: read the raw bytes (unpacking a txtz zip archive if
        needed), determine the text encoding, auto-detect paragraph and
        formatting types where requested, reshape paragraphs to block
        form, render the text to HTML with the markdown/textile/basic
        processor, and finally hand the HTML to the HTML input plugin to
        build the OEB result.  Mutates *options* in place (formatting and
        paragraph types, heuristics flags, HTML plugin defaults).
        """
        from ebook_converter.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
        from ebook_converter.ebooks.chardet import detect
        from ebook_converter.utils.zipfile import ZipFile
        from ebook_converter.ebooks.txt.processor import (convert_basic,
                convert_markdown_with_metadata, separate_paragraphs_single_line,
                separate_paragraphs_print_formatted, preserve_spaces,
                detect_paragraph_type, detect_formatting_type,
                normalize_line_endings, convert_textile, remove_indents,
                block_to_single_line, separate_hard_scene_breaks)

        self.log = log
        txt = b''
        log.debug('Reading text from file...')
        length = 0
        # Temporary files produced by shift_file() go to the current
        # working directory.
        base_dir = self.output_dir = getcwd()

        # Extract content from zip archive.
        if file_ext == 'txtz':
            zf = ZipFile(stream)
            zf.extractall('.')

            # Concatenate every text file found in the archive.
            for x in walk('.'):
                if os.path.splitext(x)[1].lower() in ('.txt', '.text'):
                    with open(x, 'rb') as tf:
                        txt += tf.read() + b'\n\n'
        else:
            if getattr(stream, 'name', None):
                base_dir = os.path.dirname(stream.name)
            txt = stream.read()
            if file_ext in {'md', 'textile', 'markdown'}:
                options.formatting_type = {'md': 'markdown'}.get(file_ext, file_ext)
                log.info('File extension indicates particular formatting. '
                         'Forcing formatting type to: %s'%options.formatting_type)
                # Formatted documents carry their own paragraph structure.
                options.paragraph_type = 'off'

        # Get the encoding of the document.
        if options.input_encoding:
            ienc = options.input_encoding
            log.debug('Using user specified input encoding of %s' % ienc)
        else:
            # Sniff the encoding from the first 4 KiB only.
            det_encoding = detect(txt[:4096])
            det_encoding, confidence = det_encoding['encoding'], det_encoding['confidence']
            if det_encoding and det_encoding.lower().replace('_', '-').strip() in (
                    'gb2312', 'chinese', 'csiso58gb231280', 'euc-cn', 'euccn',
                    'eucgb2312-cn', 'gb2312-1980', 'gb2312-80', 'iso-ir-58'):
                # Microsoft Word exports to HTML with encoding incorrectly set to
                # gb2312 instead of gbk. gbk is a superset of gb2312, anyway.
                det_encoding = 'gbk'
            ienc = det_encoding
            log.debug('Detected input encoding as %s with a confidence of %s%%' % (ienc, confidence * 100))
        if not ienc:
            ienc = 'utf-8'
            log.debug('No input encoding specified and could not auto detect using %s' % ienc)
        # Remove BOM from start of txt as its presence can confuse markdown
        import codecs
        for bom in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE, codecs.BOM_UTF8, codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE):
            if txt.startswith(bom):
                txt = txt[len(bom):]
                break
        # 'replace' so that undecodable bytes never abort the conversion.
        txt = txt.decode(ienc, 'replace')

        # Replace entities
        txt = _ent_pat.sub(xml_entity_to_unicode, txt)

        # Normalize line endings
        txt = normalize_line_endings(txt)

        # Determine the paragraph type of the document.
        if options.paragraph_type == 'auto':
            options.paragraph_type = detect_paragraph_type(txt)
            if options.paragraph_type == 'unknown':
                log.debug('Could not reliably determine paragraph type using block')
                options.paragraph_type = 'block'
            else:
                log.debug('Auto detected paragraph type as %s' % options.paragraph_type)

        # Detect formatting
        if options.formatting_type == 'auto':
            options.formatting_type = detect_formatting_type(txt)
            log.debug('Auto detected formatting as %s' % options.formatting_type)

        if options.formatting_type == 'heuristic':
            setattr(options, 'enable_heuristics', True)
            setattr(options, 'unwrap_lines', False)
            setattr(options, 'smarten_punctuation', True)

        # Reformat paragraphs to block formatting based on the detected type.
        # We don't check for block because the processor assumes block.
        # single and print are transformed to block for processing.
        if options.paragraph_type == 'single':
            txt = separate_paragraphs_single_line(txt)
        elif options.paragraph_type == 'print':
            txt = separate_hard_scene_breaks(txt)
            txt = separate_paragraphs_print_formatted(txt)
            txt = block_to_single_line(txt)
        elif options.paragraph_type == 'unformatted':
            from ebook_converter.ebooks.conversion.utils import HeuristicProcessor
            # unwrap lines based on punctuation
            docanalysis = DocAnalysis('txt', txt)
            length = docanalysis.line_length(.5)
            preprocessor = HeuristicProcessor(options, log=getattr(self, 'log', None))
            txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
            txt = separate_paragraphs_single_line(txt)
        elif options.paragraph_type == 'block':
            txt = separate_hard_scene_breaks(txt)
            txt = block_to_single_line(txt)

        if getattr(options, 'enable_heuristics', False) and getattr(options, 'dehyphenate', False):
            docanalysis = DocAnalysis('txt', txt)
            if not length:
                length = docanalysis.line_length(.5)
            dehyphenator = Dehyphenator(options.verbose, log=self.log)
            txt = dehyphenator(txt,'txt', length)

        # User requested transformation on the text.
        if options.txt_in_remove_indents:
            txt = remove_indents(txt)

        # Preserve spaces will replace multiple spaces to a space
        # followed by the entity.
        if options.preserve_spaces:
            txt = preserve_spaces(txt)

        # Process the text using the appropriate text processor.
        # Files written via shift_file() are tracked in shifted_files and
        # removed once the HTML input plugin has consumed them.
        self.shifted_files = []
        try:
            html = ''
            input_mi = None
            if options.formatting_type == 'markdown':
                log.debug('Running text through markdown conversion...')
                try:
                    input_mi, html = convert_markdown_with_metadata(txt, extensions=[x.strip() for x in options.markdown_extensions.split(',') if x.strip()])
                except RuntimeError:
                    raise ValueError('This txt file has malformed markup, it cannot be'
                        ' converted by calibre. See https://daringfireball.net/projects/markdown/syntax')
                html = self.fix_resources(html, base_dir)
            elif options.formatting_type == 'textile':
                log.debug('Running text through textile conversion...')
                html = convert_textile(txt)
                html = self.fix_resources(html, base_dir)
            else:
                log.debug('Running text through basic conversion...')
                flow_size = getattr(options, 'flow_size', 0)
                html = convert_basic(txt, epub_split_size_kb=flow_size)

            # Run the HTMLized text through the html processing plugin.
            from ebook_converter.customize.ui import plugin_for_input_format
            html_input = plugin_for_input_format('html')
            # Seed the HTML plugin's options with its recommended values.
            for opt in html_input.options:
                setattr(options, opt.option.name, opt.recommended_value)
            options.input_encoding = 'utf-8'
            htmlfile = self.shift_file('index.html', html.encode('utf-8'))
            # Temporarily disable debug dumping for the nested conversion.
            odi = options.debug_pipeline
            options.debug_pipeline = None
            # Generate oeb from html conversion.
            oeb = html_input.convert(open(htmlfile, 'rb'), options, 'html', log, {})
            options.debug_pipeline = odi
        finally:
            for x in self.shifted_files:
                os.remove(x)

        # Set metadata from file.
        if input_mi is None:
            from ebook_converter.customize.ui import get_file_type_metadata
            input_mi = get_file_type_metadata(stream, file_ext)
        from ebook_converter.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
        meta_info_to_oeb_metadata(input_mi, oeb.metadata, log)
        # Remember the title so postprocess_book() can replace placeholder
        # <title> elements in the generated HTML.
        self.html_postprocess_title = input_mi.title

        return oeb
|
|
|
|
def postprocess_book(self, oeb, opts, log):
|
|
for item in oeb.spine:
|
|
if hasattr(item.data, 'xpath'):
|
|
for title in item.data.xpath('//*[local-name()="title"]'):
|
|
if title.text == _('Unknown'):
|
|
title.text = self.html_postprocess_title
|