mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-03-15 22:23:42 +01:00
Initial import
This commit is contained in:
308
ebook_converter/ebooks/conversion/plugins/txt_input.py
Normal file
308
ebook_converter/ebooks/conversion/plugins/txt_input.py
Normal file
@@ -0,0 +1,308 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os
|
||||
|
||||
from calibre import _ent_pat, walk, xml_entity_to_unicode
|
||||
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
|
||||
from polyglot.builtins import getcwd
|
||||
|
||||
# Python-Markdown extension name -> short description shown to the user.
# Keys are passed verbatim to the markdown converter (see the
# 'markdown_extensions' option on TXTInput); values are wrapped in _()
# which is presumably the gettext translation hook injected by the
# application -- TODO confirm.
MD_EXTENSIONS = {
    'abbr': _('Abbreviations'),
    'admonition': _('Support admonitions'),
    'attr_list': _('Add attribute to HTML tags'),
    'codehilite': _('Add code highlighting via Pygments'),
    'def_list': _('Definition lists'),
    'extra': _('Enables various common extensions'),
    'fenced_code': _('Alternative code block syntax'),
    'footnotes': _('Footnotes'),
    'legacy_attrs': _('Use legacy element attributes'),
    'legacy_em': _('Use legacy underscore handling for connected words'),
    'meta': _('Metadata in the document'),
    'nl2br': _('Treat newlines as hard breaks'),
    'sane_lists': _('Do not allow mixing list types'),
    'smarty': _('Use markdown\'s internal smartypants parser'),
    'tables': _('Support tables'),
    'toc': _('Generate a table of contents'),
    'wikilinks': _('Wiki style links'),
}
|
||||
|
||||
|
||||
class TXTInput(InputFormatPlugin):
    """Input plugin that converts plain-text based formats (txt, txtz,
    markdown, textile) to HTML and then hands the HTML to the regular
    HTML input plugin to produce an OEB book.
    """

    name = 'TXT Input'
    author = 'John Schember'
    description = 'Convert TXT files to HTML'
    file_types = {'txt', 'txtz', 'text', 'md', 'textile', 'markdown'}
    commit_name = 'txt_input'

    # Data consumed by the GUI to build the conversion options widgets.
    ui_data = {
        'md_extensions': MD_EXTENSIONS,
        'paragraph_types': {
            'auto': _('Try to auto detect paragraph type'),
            'block': _('Treat a blank line as a paragraph break'),
            'single': _('Assume every line is a paragraph'),
            'print': _('Assume every line starting with 2+ spaces or a tab starts a paragraph'),
            'unformatted': _('Most lines have hard line breaks, few/no blank lines or indents'),
            'off': _('Don\'t modify the paragraph structure'),
        },
        'formatting_types': {
            'auto': _('Automatically decide which formatting processor to use'),
            'plain': _('No formatting'),
            'heuristic': _('Use heuristics to determine chapter headings, italics, etc.'),
            'textile': _('Use the TexTile markup language'),
            'markdown': _('Use the Markdown markup language')
        },
    }

    options = {
        OptionRecommendation(name='formatting_type', recommended_value='auto',
            choices=list(ui_data['formatting_types']),
            help=_('Formatting used within the document.\n'
                   '* auto: {auto}\n'
                   '* plain: {plain}\n'
                   '* heuristic: {heuristic}\n'
                   '* textile: {textile}\n'
                   '* markdown: {markdown}\n'
                   'To learn more about markdown see {url}').format(
                url='https://daringfireball.net/projects/markdown/', **ui_data['formatting_types'])
        ),
        OptionRecommendation(name='paragraph_type', recommended_value='auto',
            choices=list(ui_data['paragraph_types']),
            help=_('Paragraph structure to assume. The value of "off" is useful for formatted documents such as Markdown or Textile. '
                   'Choices are:\n'
                   '* auto: {auto}\n'
                   '* block: {block}\n'
                   '* single: {single}\n'
                   '* print: {print}\n'
                   '* unformatted: {unformatted}\n'
                   '* off: {off}').format(**ui_data['paragraph_types'])
        ),
        OptionRecommendation(name='preserve_spaces', recommended_value=False,
            help=_('Normally extra spaces are condensed into a single space. '
                   'With this option all spaces will be displayed.')),
        OptionRecommendation(name='txt_in_remove_indents', recommended_value=False,
            help=_('Normally extra space at the beginning of lines is retained. '
                   'With this option they will be removed.')),
        OptionRecommendation(name='markdown_extensions', recommended_value='footnotes, tables, toc',
            help=_('Enable extensions to markdown syntax. Extensions are formatting that is not part '
                   'of the standard markdown format. The extensions enabled by default: %default.\n'
                   'To learn more about markdown extensions, see {}\n'
                   'This should be a comma separated list of extensions to enable:\n'
                   ).format('https://python-markdown.github.io/extensions/') + '\n'.join('* %s: %s' % (k, MD_EXTENSIONS[k]) for k in sorted(MD_EXTENSIONS))),
    }

    def shift_file(self, fname, data):
        """Write *data* into ``self.output_dir`` under a non-clashing name.

        If ``fname`` already exists, a ``name-1.ext``, ``name-2.ext``, ...
        variant is chosen instead. The created path is recorded in
        ``self.shifted_files`` so that convert() can remove it when the
        conversion is finished. Returns the path of the written file.
        """
        name, ext = os.path.splitext(fname)
        candidate = os.path.join(self.output_dir, fname)
        c = 0
        while os.path.exists(candidate):
            c += 1
            candidate = os.path.join(self.output_dir, '{}-{}{}'.format(name, c, ext))
        ans = candidate
        with open(ans, 'wb') as f:
            f.write(data)
        # BUGFIX: record the file for cleanup. Previously nothing was ever
        # appended to self.shifted_files, so the removal loop in convert()'s
        # finally block was dead code and the copied files leaked into the
        # working directory.
        self.shifted_files.append(ans)
        return f.name

    def fix_resources(self, html, base_dir):
        """Copy local images referenced by *html* next to the output.

        Relative, non-URL ``<img src>`` values are resolved against
        *base_dir*; readable files are copied into the output directory via
        shift_file() and the src attribute is rewritten to the new basename.
        Returns the (possibly re-serialized) HTML string.
        """
        from html5_parser import parse
        root = parse(html)
        changed = False
        for img in root.xpath('//img[@src]'):
            src = img.get('src')
            # Skip absolute paths and anything that looks like a URL scheme.
            prefix = src.split(':', 1)[0].lower()
            if prefix not in ('file', 'http', 'https', 'ftp') and not os.path.isabs(src):
                src = os.path.join(base_dir, src)
                if os.access(src, os.R_OK):
                    with open(src, 'rb') as f:
                        data = f.read()
                    f = self.shift_file(os.path.basename(src), data)
                    changed = True
                    img.set('src', os.path.basename(f))
        if changed:
            from lxml import etree
            html = etree.tostring(root, encoding='unicode')
        return html

    def convert(self, stream, options, file_ext, log,
                accelerators):
        """Convert the text in *stream* to an OEB book.

        The pipeline is: read bytes (unpacking txtz archives) -> detect
        encoding -> normalize entities/line endings -> detect or apply the
        requested paragraph/formatting type -> render to HTML (markdown,
        textile or basic) -> run the HTML input plugin -> attach metadata.
        """
        from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
        from calibre.ebooks.chardet import detect
        from calibre.utils.zipfile import ZipFile
        from calibre.ebooks.txt.processor import (convert_basic,
                convert_markdown_with_metadata, separate_paragraphs_single_line,
                separate_paragraphs_print_formatted, preserve_spaces,
                detect_paragraph_type, detect_formatting_type,
                normalize_line_endings, convert_textile, remove_indents,
                block_to_single_line, separate_hard_scene_breaks)

        self.log = log
        txt = b''
        log.debug('Reading text from file...')
        length = 0
        base_dir = self.output_dir = getcwd()

        # Extract content from zip archive.
        if file_ext == 'txtz':
            zf = ZipFile(stream)
            zf.extractall('.')

            # Concatenate every text file found in the archive.
            for x in walk('.'):
                if os.path.splitext(x)[1].lower() in ('.txt', '.text'):
                    with open(x, 'rb') as tf:
                        txt += tf.read() + b'\n\n'
        else:
            if getattr(stream, 'name', None):
                base_dir = os.path.dirname(stream.name)
            txt = stream.read()
            if file_ext in {'md', 'textile', 'markdown'}:
                # The extension fully determines the markup; don't let the
                # paragraph heuristics mangle an already formatted document.
                options.formatting_type = {'md': 'markdown'}.get(file_ext, file_ext)
                log.info('File extension indicates particular formatting. '
                         'Forcing formatting type to: %s'%options.formatting_type)
                options.paragraph_type = 'off'

        # Get the encoding of the document.
        if options.input_encoding:
            ienc = options.input_encoding
            log.debug('Using user specified input encoding of %s' % ienc)
        else:
            det_encoding = detect(txt[:4096])
            det_encoding, confidence = det_encoding['encoding'], det_encoding['confidence']
            if det_encoding and det_encoding.lower().replace('_', '-').strip() in (
                    'gb2312', 'chinese', 'csiso58gb231280', 'euc-cn', 'euccn',
                    'eucgb2312-cn', 'gb2312-1980', 'gb2312-80', 'iso-ir-58'):
                # Microsoft Word exports to HTML with encoding incorrectly set to
                # gb2312 instead of gbk. gbk is a superset of gb2312, anyway.
                det_encoding = 'gbk'
            ienc = det_encoding
            log.debug('Detected input encoding as %s with a confidence of %s%%' % (ienc, confidence * 100))
        if not ienc:
            ienc = 'utf-8'
            log.debug('No input encoding specified and could not auto detect using %s' % ienc)
        # Remove BOM from start of txt as its presence can confuse markdown
        import codecs
        for bom in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE, codecs.BOM_UTF8, codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE):
            if txt.startswith(bom):
                txt = txt[len(bom):]
                break
        txt = txt.decode(ienc, 'replace')

        # Replace entities
        txt = _ent_pat.sub(xml_entity_to_unicode, txt)

        # Normalize line endings
        txt = normalize_line_endings(txt)

        # Determine the paragraph type of the document.
        if options.paragraph_type == 'auto':
            options.paragraph_type = detect_paragraph_type(txt)
            if options.paragraph_type == 'unknown':
                log.debug('Could not reliably determine paragraph type using block')
                options.paragraph_type = 'block'
            else:
                log.debug('Auto detected paragraph type as %s' % options.paragraph_type)

        # Detect formatting
        if options.formatting_type == 'auto':
            options.formatting_type = detect_formatting_type(txt)
            log.debug('Auto detected formatting as %s' % options.formatting_type)

        if options.formatting_type == 'heuristic':
            setattr(options, 'enable_heuristics', True)
            setattr(options, 'unwrap_lines', False)
            setattr(options, 'smarten_punctuation', True)

        # Reformat paragraphs to block formatting based on the detected type.
        # We don't check for block because the processor assumes block.
        # single and print at transformed to block for processing.
        if options.paragraph_type == 'single':
            txt = separate_paragraphs_single_line(txt)
        elif options.paragraph_type == 'print':
            txt = separate_hard_scene_breaks(txt)
            txt = separate_paragraphs_print_formatted(txt)
            txt = block_to_single_line(txt)
        elif options.paragraph_type == 'unformatted':
            from calibre.ebooks.conversion.utils import HeuristicProcessor
            # unwrap lines based on punctuation
            docanalysis = DocAnalysis('txt', txt)
            length = docanalysis.line_length(.5)
            preprocessor = HeuristicProcessor(options, log=getattr(self, 'log', None))
            txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
            txt = separate_paragraphs_single_line(txt)
        elif options.paragraph_type == 'block':
            txt = separate_hard_scene_breaks(txt)
            txt = block_to_single_line(txt)

        if getattr(options, 'enable_heuristics', False) and getattr(options, 'dehyphenate', False):
            docanalysis = DocAnalysis('txt', txt)
            if not length:
                length = docanalysis.line_length(.5)
            dehyphenator = Dehyphenator(options.verbose, log=self.log)
            txt = dehyphenator(txt,'txt', length)

        # User requested transformation on the text.
        if options.txt_in_remove_indents:
            txt = remove_indents(txt)

        # Preserve spaces will replace multiple spaces to a space
        # followed by the entity.
        if options.preserve_spaces:
            txt = preserve_spaces(txt)

        # Process the text using the appropriate text processor.
        self.shifted_files = []
        try:
            html = ''
            input_mi = None
            if options.formatting_type == 'markdown':
                log.debug('Running text through markdown conversion...')
                try:
                    input_mi, html = convert_markdown_with_metadata(txt, extensions=[x.strip() for x in options.markdown_extensions.split(',') if x.strip()])
                except RuntimeError:
                    raise ValueError('This txt file has malformed markup, it cannot be'
                        ' converted by calibre. See https://daringfireball.net/projects/markdown/syntax')
                html = self.fix_resources(html, base_dir)
            elif options.formatting_type == 'textile':
                log.debug('Running text through textile conversion...')
                html = convert_textile(txt)
                html = self.fix_resources(html, base_dir)
            else:
                log.debug('Running text through basic conversion...')
                flow_size = getattr(options, 'flow_size', 0)
                html = convert_basic(txt, epub_split_size_kb=flow_size)

            # Run the HTMLized text through the html processing plugin.
            from calibre.customize.ui import plugin_for_input_format
            html_input = plugin_for_input_format('html')
            for opt in html_input.options:
                setattr(options, opt.option.name, opt.recommended_value)
            options.input_encoding = 'utf-8'
            htmlfile = self.shift_file('index.html', html.encode('utf-8'))
            # Debug-pipeline dumping is suppressed for the nested HTML
            # conversion and restored afterwards.
            odi = options.debug_pipeline
            options.debug_pipeline = None
            # Generate oeb from html conversion.
            # NOTE(review): the file object passed here is never explicitly
            # closed -- presumably the HTML input plugin takes ownership;
            # confirm before restructuring.
            oeb = html_input.convert(open(htmlfile, 'rb'), options, 'html', log, {})
            options.debug_pipeline = odi
        finally:
            # Remove the temporary copies created by shift_file(); their
            # content now lives inside the oeb book.
            for x in self.shifted_files:
                os.remove(x)

        # Set metadata from file.
        if input_mi is None:
            from calibre.customize.ui import get_file_type_metadata
            input_mi = get_file_type_metadata(stream, file_ext)
        from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
        meta_info_to_oeb_metadata(input_mi, oeb.metadata, log)
        self.html_postprocess_title = input_mi.title

        return oeb

    def postprocess_book(self, oeb, opts, log):
        """Replace placeholder 'Unknown' titles in the spine documents with
        the title extracted from the input file's metadata."""
        for item in oeb.spine:
            if hasattr(item.data, 'xpath'):
                for title in item.data.xpath('//*[local-name()="title"]'):
                    if title.text == _('Unknown'):
                        title.text = self.html_postprocess_title
|
||||
Reference in New Issue
Block a user