mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-01-30 02:05:45 +01:00
Here is the first batch of modules, which are needed for converting several formats to LRF. Some of the logic has been change, more cleanups will follow.
346 lines
12 KiB
Python
346 lines
12 KiB
Python
# -*- coding: utf-8 -*-
|
|
from __future__ import absolute_import, division, print_function, unicode_literals
|
|
|
|
__license__ = 'GPL v3'
|
|
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
|
__docformat__ = 'restructuredtext en'
|
|
|
|
|
|
'''
|
|
Read content from txt file.
|
|
'''
|
|
|
|
import os, re
|
|
|
|
from ebook_converter import prepare_string_for_xml, isbytestring
|
|
from ebook_converter.ebooks.metadata.opf2 import OPFCreator
|
|
|
|
from ebook_converter.ebooks.conversion.preprocess import DocAnalysis
|
|
from ebook_converter.utils.cleantext import clean_ascii_chars
|
|
from ebook_converter.polyglot.builtins import iteritems, unicode_type, map, range, long_type
|
|
|
|
HTML_TEMPLATE = '<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/><title>%s </title></head><body>\n%s\n</body></html>'
|
|
|
|
|
|
def clean_txt(txt):
|
|
'''
|
|
Run transformations on the text to put it into
|
|
consistent state.
|
|
'''
|
|
if isbytestring(txt):
|
|
txt = txt.decode('utf-8', 'replace')
|
|
# Strip whitespace from the end of the line. Also replace
|
|
# all line breaks with \n.
|
|
txt = '\n'.join([line.rstrip() for line in txt.splitlines()])
|
|
|
|
# Replace whitespace at the beginning of the line with
|
|
txt = re.sub('(?m)(?<=^)([ ]{2,}|\t+)(?=.)', ' ' * 4, txt)
|
|
|
|
# Condense redundant spaces
|
|
txt = re.sub('[ ]{2,}', ' ', txt)
|
|
|
|
# Remove blank space from the beginning and end of the document.
|
|
txt = re.sub(r'^\s+(?=.)', '', txt)
|
|
txt = re.sub(r'(?<=.)\s+$', '', txt)
|
|
# Remove excessive line breaks.
|
|
txt = re.sub('\n{5,}', '\n\n\n\n', txt)
|
|
# remove ASCII invalid chars : 0 to 8 and 11-14 to 24
|
|
txt = clean_ascii_chars(txt)
|
|
|
|
return txt
|
|
|
|
|
|
def split_txt(txt, epub_split_size_kb=0):
|
|
'''
|
|
Ensure there are split points for converting
|
|
to EPUB. A misdetected paragraph type can
|
|
result in the entire document being one giant
|
|
paragraph. In this case the EPUB parser will not
|
|
be able to determine where to split the file
|
|
to accommodate the EPUB file size limitation
|
|
and will fail.
|
|
'''
|
|
# Takes care if there is no point to split
|
|
if epub_split_size_kb > 0:
|
|
if isinstance(txt, unicode_type):
|
|
txt = txt.encode('utf-8')
|
|
length_byte = len(txt)
|
|
# Calculating the average chunk value for easy splitting as EPUB (+2 as a safe margin)
|
|
chunk_size = long_type(length_byte / (int(length_byte / (epub_split_size_kb * 1024)) + 2))
|
|
# if there are chunks with a superior size then go and break
|
|
parts = txt.split(b'\n\n')
|
|
lengths = tuple(map(len, parts))
|
|
if lengths and max(lengths) > chunk_size:
|
|
txt = b'\n\n'.join([
|
|
split_string_separator(line, chunk_size) for line in parts
|
|
])
|
|
if isbytestring(txt):
|
|
txt = txt.decode('utf-8')
|
|
|
|
return txt
|
|
|
|
|
|
def convert_basic(txt, title='', epub_split_size_kb=0):
|
|
'''
|
|
Converts plain text to html by putting all paragraphs in
|
|
<p> tags. It condense and retains blank lines when necessary.
|
|
|
|
Requires paragraphs to be in single line format.
|
|
'''
|
|
txt = clean_txt(txt)
|
|
txt = split_txt(txt, epub_split_size_kb)
|
|
|
|
lines = []
|
|
blank_count = 0
|
|
# Split into paragraphs based on having a blank line between text.
|
|
for line in txt.split('\n'):
|
|
if line.strip():
|
|
blank_count = 0
|
|
lines.append(u'<p>%s</p>' % prepare_string_for_xml(line.replace('\n', ' ')))
|
|
else:
|
|
blank_count += 1
|
|
if blank_count == 2:
|
|
lines.append(u'<p> </p>')
|
|
|
|
return HTML_TEMPLATE % (title, u'\n'.join(lines))
|
|
|
|
|
|
DEFAULT_MD_EXTENSIONS = ('footnotes', 'tables', 'toc')
|
|
|
|
|
|
def create_markdown_object(extensions):
|
|
# Need to load markdown extensions without relying on pkg_resources
|
|
import importlib
|
|
from ebook_converter.ebooks.markdown import Markdown
|
|
from markdown import Extension
|
|
|
|
class NotBrainDeadMarkdown(Markdown):
|
|
def build_extension(self, ext_name, configs):
|
|
if '.' in ext_name or ':' in ext_name:
|
|
return Markdown.build_extension(self, ext_name, configs)
|
|
ext_name = 'markdown.extensions.' + ext_name
|
|
module = importlib.import_module(ext_name)
|
|
if hasattr(module, 'makeExtension'):
|
|
return module.makeExtension(**configs)
|
|
for name, x in vars(module).items():
|
|
if type(x) is type and issubclass(x, Extension) and x is not Extension:
|
|
return x(**configs)
|
|
raise ImportError('No extension class in {}'.format(ext_name))
|
|
|
|
from ebook_converter.ebooks.conversion.plugins.txt_input import MD_EXTENSIONS
|
|
extensions = [x.lower() for x in extensions]
|
|
extensions = [x for x in extensions if x in MD_EXTENSIONS]
|
|
md = NotBrainDeadMarkdown(extensions=extensions)
|
|
return md
|
|
|
|
|
|
def convert_markdown(txt, title='', extensions=DEFAULT_MD_EXTENSIONS):
|
|
md = create_markdown_object(extensions)
|
|
return HTML_TEMPLATE % (title, md.convert(txt))
|
|
|
|
|
|
def convert_markdown_with_metadata(txt, title='', extensions=DEFAULT_MD_EXTENSIONS):
|
|
from ebook_converter.ebooks.metadata.book.base import Metadata
|
|
from ebook_converter.utils.date import parse_only_date
|
|
from ebook_converter.db.write import get_series_values
|
|
if 'meta' not in extensions:
|
|
extensions.append('meta')
|
|
md = create_markdown_object(extensions)
|
|
html = md.convert(txt)
|
|
mi = Metadata(title or _('Unknown'))
|
|
m = md.Meta
|
|
for k, v in iteritems({'date':'pubdate', 'summary':'comments'}):
|
|
if v not in m and k in m:
|
|
m[v] = m.pop(k)
|
|
for k in 'title authors series tags pubdate comments publisher rating'.split():
|
|
val = m.get(k)
|
|
if val:
|
|
mf = mi.metadata_for_field(k)
|
|
if not mf.get('is_multiple'):
|
|
val = val[0]
|
|
if k == 'series':
|
|
val, si = get_series_values(val)
|
|
mi.series_index = 1 if si is None else si
|
|
if k == 'rating':
|
|
try:
|
|
val = max(0, min(int(float(val)), 10))
|
|
except Exception:
|
|
continue
|
|
if mf.get('datatype') == 'datetime':
|
|
try:
|
|
val = parse_only_date(val, assume_utc=False)
|
|
except Exception:
|
|
continue
|
|
setattr(mi, k, val)
|
|
return mi, HTML_TEMPLATE % (mi.title, html)
|
|
|
|
|
|
def convert_textile(txt, title=''):
|
|
from ebook_converter.ebooks.textile import textile
|
|
html = textile(txt, encoding='utf-8')
|
|
return HTML_TEMPLATE % (title, html)
|
|
|
|
|
|
def normalize_line_endings(txt):
|
|
txt = txt.replace('\r\n', '\n')
|
|
txt = txt.replace('\r', '\n')
|
|
return txt
|
|
|
|
|
|
def separate_paragraphs_single_line(txt):
|
|
txt = txt.replace('\n', '\n\n')
|
|
return txt
|
|
|
|
|
|
def separate_paragraphs_print_formatted(txt):
|
|
txt = re.sub(u'(?miu)^(?P<indent>\t+|[ ]{2,})(?=.)', lambda mo: '\n%s' % mo.group('indent'), txt)
|
|
return txt
|
|
|
|
|
|
def separate_hard_scene_breaks(txt):
|
|
def sep_break(line):
|
|
if len(line.strip()) > 0:
|
|
return '\n%s\n' % line
|
|
else:
|
|
return line
|
|
txt = re.sub(r'(?miu)^[ \t-=~\/_]+$', lambda mo: sep_break(mo.group()), txt)
|
|
return txt
|
|
|
|
|
|
def block_to_single_line(txt):
|
|
txt = re.sub(r'(?<=.)\n(?=.)', ' ', txt)
|
|
return txt
|
|
|
|
|
|
def preserve_spaces(txt):
|
|
'''
|
|
Replaces spaces multiple spaces with entities.
|
|
'''
|
|
txt = re.sub('(?P<space>[ ]{2,})', lambda mo: ' ' + (' ' * (len(mo.group('space')) - 1)), txt)
|
|
txt = txt.replace('\t', ' ')
|
|
return txt
|
|
|
|
|
|
def remove_indents(txt):
|
|
'''
|
|
Remove whitespace at the beginning of each line.
|
|
'''
|
|
return '\n'.join([l.lstrip() for l in txt.splitlines()])
|
|
|
|
|
|
def opf_writer(path, opf_name, manifest, spine, mi):
|
|
opf = OPFCreator(path, mi)
|
|
opf.create_manifest(manifest)
|
|
opf.create_spine(spine)
|
|
with lopen(os.path.join(path, opf_name), 'wb') as opffile:
|
|
opf.render(opffile)
|
|
|
|
|
|
def split_string_separator(txt, size):
|
|
'''
|
|
Splits the text by putting \n\n at the point size.
|
|
'''
|
|
if len(txt) > size and size > 2:
|
|
size -= 2
|
|
txt = []
|
|
for part in (txt[i * size: (i + 1) * size] for i in range(0, len(txt), size)):
|
|
idx = part.rfind(b'.')
|
|
if idx == -1:
|
|
part += b'\n\n'
|
|
else:
|
|
part = part[:idx + 1] + b'\n\n' + part[idx:]
|
|
txt.append(part)
|
|
txt = b''.join(txt)
|
|
return txt
|
|
|
|
|
|
def detect_paragraph_type(txt):
|
|
'''
|
|
Tries to determine the paragraph type of the document.
|
|
|
|
block: Paragraphs are separated by a blank line.
|
|
single: Each line is a paragraph.
|
|
print: Each paragraph starts with a 2+ spaces or a tab
|
|
and ends when a new paragraph is reached.
|
|
unformatted: most lines have hard line breaks, few/no blank lines or indents
|
|
|
|
returns block, single, print, unformatted
|
|
'''
|
|
txt = txt.replace('\r\n', '\n')
|
|
txt = txt.replace('\r', '\n')
|
|
txt_line_count = len(re.findall(r'(?mu)^\s*.+$', txt))
|
|
|
|
# Check for hard line breaks - true if 55% of the doc breaks in the same region
|
|
docanalysis = DocAnalysis('txt', txt)
|
|
hardbreaks = docanalysis.line_histogram(.55)
|
|
|
|
if hardbreaks:
|
|
# Determine print percentage
|
|
tab_line_count = len(re.findall(r'(?mu)^(\t|\s{2,}).+$', txt))
|
|
print_percent = tab_line_count / float(txt_line_count)
|
|
|
|
# Determine block percentage
|
|
empty_line_count = len(re.findall(r'(?mu)^\s*$', txt))
|
|
block_percent = empty_line_count / float(txt_line_count)
|
|
|
|
# Compare the two types - the type with the larger number of instances wins
|
|
# in cases where only one or the other represents the vast majority of the document neither wins
|
|
if print_percent >= block_percent:
|
|
if .15 <= print_percent <= .75:
|
|
return 'print'
|
|
elif .15 <= block_percent <= .75:
|
|
return 'block'
|
|
|
|
# Assume unformatted text with hardbreaks if nothing else matches
|
|
return 'unformatted'
|
|
|
|
# return single if hardbreaks is false
|
|
return 'single'
|
|
|
|
|
|
def detect_formatting_type(txt):
|
|
'''
|
|
Tries to determine the formatting of the document.
|
|
|
|
markdown: Markdown formatting is used.
|
|
textile: Textile formatting is used.
|
|
heuristic: When none of the above formatting types are
|
|
detected heuristic is returned.
|
|
'''
|
|
# Keep a count of the number of format specific object
|
|
# that are found in the text.
|
|
markdown_count = 0
|
|
textile_count = 0
|
|
|
|
# Check for markdown
|
|
# Headings
|
|
markdown_count += len(re.findall('(?mu)^#+', txt))
|
|
markdown_count += len(re.findall('(?mu)^=+$', txt))
|
|
markdown_count += len(re.findall('(?mu)^-+$', txt))
|
|
# Images
|
|
markdown_count += len(re.findall(r'(?u)!\[.*?\](\[|\()', txt))
|
|
# Links
|
|
markdown_count += len(re.findall(r'(?u)^|[^!]\[.*?\](\[|\()', txt))
|
|
|
|
# Check for textile
|
|
# Headings
|
|
textile_count += len(re.findall(r'(?mu)^h[1-6]\.', txt))
|
|
# Block quote.
|
|
textile_count += len(re.findall(r'(?mu)^bq\.', txt))
|
|
# Images
|
|
textile_count += len(re.findall(r'(?mu)(?<=\!)\S+(?=\!)', txt))
|
|
# Links
|
|
textile_count += len(re.findall(r'"[^"]*":\S+', txt))
|
|
# paragraph blocks
|
|
textile_count += len(re.findall(r'(?mu)^p(<|<>|=|>)?\. ', txt))
|
|
|
|
# Decide if either markdown or textile is used in the text
|
|
# based on the number of unique formatting elements found.
|
|
if markdown_count > 5 or textile_count > 5:
|
|
if markdown_count > textile_count:
|
|
return 'markdown'
|
|
else:
|
|
return 'textile'
|
|
|
|
return 'heuristic'
|