mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-01-29 17:55:45 +01:00
337 lines
11 KiB
Python
"""
Read content from txt file.
"""
|
|
import os, re
|
|
|
|
from ebook_converter import prepare_string_for_xml
|
|
from ebook_converter.ebooks.metadata.opf2 import OPFCreator
|
|
|
|
from ebook_converter.ebooks.conversion.preprocess import DocAnalysis
|
|
from ebook_converter.utils.cleantext import clean_ascii_chars
|
|
|
|
|
|
# Skeleton for every generated HTML document; filled with (title, body).
HTML_TEMPLATE = '<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/><title>%s </title></head><body>\n%s\n</body></html>'
|
|
|
|
|
|
def clean_txt(txt):
    """Normalize raw text into a consistent state.

    Decodes bytes input, unifies line endings, collapses whitespace,
    trims surrounding blank space and strips invalid control characters.
    """
    if isinstance(txt, bytes):
        txt = txt.decode('utf-8', 'replace')

    # Normalize all line breaks to \n and drop trailing whitespace
    # from every line.
    txt = '\n'.join(line.rstrip() for line in txt.splitlines())

    # Replace leading runs of tabs/spaces with a uniform 4-space indent.
    txt = re.sub('(?m)(?<=^)([ ]{2,}|\t+)(?=.)', ' ' * 4, txt)

    # Collapse runs of spaces into a single space.
    txt = re.sub('[ ]{2,}', ' ', txt)

    # Trim blank space from the beginning and end of the document.
    txt = re.sub(r'^\s+(?=.)', '', txt)
    txt = re.sub(r'(?<=.)\s+$', '', txt)

    # Cap consecutive line breaks at four.
    txt = re.sub('\n{5,}', '\n\n\n\n', txt)

    # Remove invalid ASCII control chars (0-8 and 11-24 ranges).
    txt = clean_ascii_chars(txt)

    return txt
|
|
|
|
|
|
def split_txt(txt, epub_split_size_kb=0):
    """Guarantee split points exist when converting to EPUB.

    A misdetected paragraph type can leave the whole document as one
    giant paragraph; the EPUB parser then cannot find a place to split
    the file to satisfy the EPUB size limitation and fails.
    """
    # Nothing to do unless a split size was requested.
    if epub_split_size_kb > 0:
        if isinstance(txt, str):
            txt = txt.encode('utf-8')
        total = len(txt)
        # Average chunk size for easy splitting (+2 as a safety margin).
        chunk_size = int(total / (int(total / (epub_split_size_kb * 1024)) + 2))
        # Break up any paragraph that exceeds the chunk size.
        paragraphs = txt.split(b'\n\n')
        sizes = tuple(len(p) for p in paragraphs)
        if sizes and max(sizes) > chunk_size:
            txt = b'\n\n'.join(
                split_string_separator(p, chunk_size) for p in paragraphs
            )
    if isinstance(txt, bytes):
        txt = txt.decode('utf-8')

    return txt
|
|
|
|
|
|
def convert_basic(txt, title='', epub_split_size_kb=0):
    """Convert plain text to HTML by putting all paragraphs in <p> tags.

    Blank lines between text delimit paragraphs; runs of blank lines are
    condensed, but one empty paragraph is kept so intentional vertical
    space survives. Requires paragraphs to be in single-line format.

    :param txt: plain text (str or bytes) to convert.
    :param title: value for the HTML <title> element.
    :param epub_split_size_kb: when > 0, guarantee EPUB split points.
    :return: a complete HTML document as a string.
    """
    txt = clean_txt(txt)
    txt = split_txt(txt, epub_split_size_kb)

    lines = []
    blank_count = 0
    # Split into paragraphs based on having a blank line between text.
    for line in txt.split('\n'):
        if line.strip():
            blank_count = 0
            lines.append(u'<p>%s</p>' % prepare_string_for_xml(line.replace('\n', ' ')))
        else:
            blank_count += 1
            if blank_count == 2:
                # BUG FIX: use a non-breaking-space entity; a literal
                # space inside <p> collapses and renders no gap at all.
                lines.append(u'<p>&nbsp;</p>')

    return HTML_TEMPLATE % (title, u'\n'.join(lines))
|
|
|
|
|
|
# Markdown extensions enabled by default for markdown conversions.
DEFAULT_MD_EXTENSIONS = ('footnotes', 'tables', 'toc')
|
|
|
|
|
|
def create_markdown_object(extensions):
    """Build a Markdown converter restricted to known extensions.

    Extension names are resolved by importing ``markdown.extensions.<name>``
    directly instead of going through pkg_resources entry points.

    :param extensions: iterable of extension names (case-insensitive);
        names not listed in MD_EXTENSIONS are silently dropped.
    :return: a configured Markdown instance.
    """
    # Need to load markdown extensions without relying on pkg_resources
    import importlib
    from ebook_converter.ebooks.markdown import Markdown
    from markdown import Extension

    class NotBrainDeadMarkdown(Markdown):
        # Overrides extension loading to avoid pkg_resources lookups.
        def build_extension(self, ext_name, configs):
            # Dotted/colon names are full import paths; defer to the
            # stock implementation for those.
            if '.' in ext_name or ':' in ext_name:
                return Markdown.build_extension(self, ext_name, configs)
            # Bare names are assumed to live under markdown.extensions.
            ext_name = 'markdown.extensions.' + ext_name
            module = importlib.import_module(ext_name)
            if hasattr(module, 'makeExtension'):
                return module.makeExtension(**configs)
            # Fall back to the first Extension subclass in the module.
            for name, x in vars(module).items():
                if type(x) is type and issubclass(x, Extension) and x is not Extension:
                    return x(**configs)
            raise ImportError('No extension class in {}'.format(ext_name))

    from ebook_converter.ebooks.conversion.plugins.txt_input import MD_EXTENSIONS
    # Normalize and whitelist the requested extensions.
    extensions = [x.lower() for x in extensions]
    extensions = [x for x in extensions if x in MD_EXTENSIONS]
    md = NotBrainDeadMarkdown(extensions=extensions)
    return md
|
|
|
|
|
|
def convert_markdown(txt, title='', extensions=DEFAULT_MD_EXTENSIONS):
    """Render markdown *txt* into a complete HTML document titled *title*."""
    converter = create_markdown_object(extensions)
    body = converter.convert(txt)
    return HTML_TEMPLATE % (title, body)
|
|
|
|
|
|
def convert_markdown_with_metadata(txt, title='', extensions=DEFAULT_MD_EXTENSIONS):
    """Render markdown to HTML and extract metadata embedded in the text.

    Enables the 'meta' markdown extension and maps the parsed meta keys
    onto a Metadata object.

    :param txt: the markdown source text.
    :param title: fallback title when none is found in the metadata.
    :param extensions: markdown extensions to enable (copied, not mutated).
    :return: tuple of (Metadata instance, complete HTML document string).
    """
    from ebook_converter.ebooks.metadata.book.base import Metadata
    from ebook_converter.utils.date import parse_only_date
    from ebook_converter.db.write import get_series_values
    # BUG FIX: the default DEFAULT_MD_EXTENSIONS is a tuple, and tuples
    # have no append(); copying to a list also avoids mutating a
    # caller-supplied list.
    extensions = list(extensions)
    if 'meta' not in extensions:
        extensions.append('meta')
    md = create_markdown_object(extensions)
    html = md.convert(txt)
    mi = Metadata(title or 'Unknown')
    m = md.Meta
    # Map markdown meta key aliases onto metadata field names.
    for k, v in {'date':'pubdate', 'summary':'comments'}.items():
        if v not in m and k in m:
            m[v] = m.pop(k)
    for k in 'title authors series tags pubdate comments publisher rating'.split():
        val = m.get(k)
        if val:
            mf = mi.metadata_for_field(k)
            # Single-valued fields take the first parsed value only.
            if not mf.get('is_multiple'):
                val = val[0]
            if k == 'series':
                val, si = get_series_values(val)
                mi.series_index = 1 if si is None else si
            if k == 'rating':
                try:
                    # Clamp the rating to the 0-10 range.
                    val = max(0, min(int(float(val)), 10))
                except Exception:
                    continue
            if mf.get('datatype') == 'datetime':
                try:
                    val = parse_only_date(val, assume_utc=False)
                except Exception:
                    continue
            setattr(mi, k, val)
    return mi, HTML_TEMPLATE % (mi.title, html)
|
|
|
|
|
|
def convert_textile(txt, title=''):
    """Render textile markup *txt* into a complete HTML document."""
    from ebook_converter.ebooks.textile.functions import textile
    return HTML_TEMPLATE % (title, textile(txt, encoding='utf-8'))
|
|
|
|
|
|
def normalize_line_endings(txt):
    """Convert CRLF (Windows) and lone CR (old Mac) line endings to LF."""
    return txt.replace('\r\n', '\n').replace('\r', '\n')
|
|
|
|
|
|
def separate_paragraphs_single_line(txt):
    """Turn every line break into a paragraph break (a blank line)."""
    return txt.replace('\n', '\n\n')
|
|
|
|
|
|
def separate_paragraphs_print_formatted(txt):
    """Insert a blank line before each print-style indented paragraph.

    A paragraph is a line starting with one or more tabs or two or more
    spaces followed by text; the indent itself is preserved.
    """
    return re.sub(u'(?miu)^(?P<indent>\t+|[ ]{2,})(?=.)', '\n\\g<indent>', txt)
|
|
|
|
|
|
def separate_hard_scene_breaks(txt):
    """Surround hard scene-break lines (e.g. '---', '* * *') with blank lines.

    NOTE(review): the character class contains the range tab-through-'=',
    which also matches digits and most punctuation -- possibly intentional
    (it catches '* * *' and '. . .' breaks); confirm before tightening.
    """
    def _pad(line):
        # Non-blank break lines get wrapped in newlines; lines that are
        # only whitespace are left untouched.
        return '\n%s\n' % line if line.strip() else line

    return re.sub(r'(?miu)^[ \t-=~\/_]+$', lambda mo: _pad(mo.group()), txt)
|
|
|
|
|
|
def block_to_single_line(txt):
    """Join hard-wrapped lines into single-line paragraphs.

    A newline between two non-newline characters becomes a space, so
    blank-line paragraph separators are left intact.
    """
    return re.sub(r'(?<=.)\n(?=.)', ' ', txt)
|
|
|
|
|
|
def preserve_spaces(txt):
    """Replace runs of multiple spaces (and tabs) with &nbsp; entities.

    HTML collapses consecutive literal spaces, so all but the first space
    of each run become non-breaking-space entities; tabs become four.
    """
    # BUG FIX: the entity text had been mangled into literal spaces,
    # turning the substitution into a no-op (docstring says "entities").
    txt = re.sub('(?P<space>[ ]{2,})',
                 lambda mo: ' ' + ('&nbsp;' * (len(mo.group('space')) - 1)),
                 txt)
    txt = txt.replace('\t', '&nbsp;&nbsp;&nbsp;&nbsp;')
    return txt
|
|
|
|
|
|
def remove_indents(txt):
    """Strip whitespace from the beginning of every line."""
    return '\n'.join(line.lstrip() for line in txt.splitlines())
|
|
|
|
|
|
def opf_writer(path, opf_name, manifest, spine, mi):
    """Write an OPF file named *opf_name* under *path*.

    Builds the manifest and spine from the given sequences and renders
    the result with the supplied metadata *mi*.
    """
    creator = OPFCreator(path, mi)
    creator.create_manifest(manifest)
    creator.create_spine(spine)
    target = os.path.join(path, opf_name)
    with open(target, 'wb') as out:
        creator.render(out)
|
|
|
|
|
|
def split_string_separator(txt, size):
    r"""Insert b'\n\n' separators into *txt* roughly every *size* bytes.

    Each chunk is split just after its last period when one exists;
    otherwise the separator is appended to the chunk.

    :param txt: bytes to split.
    :param size: target chunk size in bytes; must be > 2 to take effect.
    :return: *txt* with separators inserted, or unchanged if it already
        fits in *size* bytes.
    """
    if len(txt) > size and size > 2:
        size -= 2
        # BUG FIX: the original rebound ``txt`` to [] before creating the
        # generator that sliced ``txt``, so ``range(0, len(txt), size)``
        # was empty, the loop never ran, and the whole document was
        # silently replaced by b''. It also sliced txt[i*size:(i+1)*size]
        # with ``i`` already stepping by ``size`` (skipping data) and
        # duplicated the period via part[idx:]. Accumulate into a
        # separate list over consecutive chunks instead.
        pieces = []
        for part in (txt[i:i + size] for i in range(0, len(txt), size)):
            idx = part.rfind(b'.')
            if idx == -1:
                part += b'\n\n'
            else:
                part = part[:idx + 1] + b'\n\n' + part[idx + 1:]
            pieces.append(part)
        txt = b''.join(pieces)
    return txt
|
|
|
|
|
|
def detect_paragraph_type(txt):
    """Try to determine the paragraph type of the document.

    Returns one of:
      'block'       -- paragraphs are separated by a blank line
      'single'      -- each line is a paragraph
      'print'       -- each paragraph starts with 2+ spaces or a tab and
                       ends when a new paragraph is reached
      'unformatted' -- most lines have hard line breaks, few/no blank
                       lines or indents
    """
    txt = txt.replace('\r\n', '\n').replace('\r', '\n')
    total_lines = len(re.findall(r'(?mu)^\s*.+$', txt))

    # Hard line breaks are assumed when 55% of the document wraps in the
    # same region of the line-length histogram.
    analysis = DocAnalysis('txt', txt)
    if not analysis.line_histogram(.55):
        # No hard breaks: every line is its own paragraph.
        return 'single'

    # Fraction of lines that look print-formatted (leading tab/spaces).
    indented_lines = len(re.findall(r'(?mu)^(\t|\s{2,}).+$', txt))
    print_percent = indented_lines / float(total_lines)

    # Fraction of blank lines (block-style separators).
    blank_lines = len(re.findall(r'(?mu)^\s*$', txt))
    block_percent = blank_lines / float(total_lines)

    # The more common style wins, but only when it covers a plausible
    # share of the document (15%-75%); when one style dominates the whole
    # document, neither wins.
    if print_percent >= block_percent:
        if .15 <= print_percent <= .75:
            return 'print'
    elif .15 <= block_percent <= .75:
        return 'block'

    # Assume unformatted text with hard breaks if nothing else matched.
    return 'unformatted'
|
|
|
|
|
|
def detect_formatting_type(txt):
    """Try to determine the formatting of the document.

    Returns 'markdown' when markdown formatting is used, 'textile' when
    textile formatting is used, and 'heuristic' when neither is detected
    with enough confidence.
    """
    # Count format-specific constructs found in the text.
    markdown_hits = sum(len(re.findall(pat, txt)) for pat in (
        '(?mu)^#+',                      # ATX headings
        '(?mu)^=+$',                     # setext heading underline
        '(?mu)^-+$',                     # setext heading underline
        r'(?u)!\[.*?\](\[|\()',          # images
        r'(?u)^|[^!]\[.*?\](\[|\()',     # links
    ))

    textile_hits = sum(len(re.findall(pat, txt)) for pat in (
        r'(?mu)^h[1-6]\.',               # headings
        r'(?mu)^bq\.',                   # block quotes
        r'(?mu)(?<=\!)\S+(?=\!)',        # images
        r'"[^"]*":\S+',                  # links
        r'(?mu)^p(<|<>|=|>)?\. ',        # paragraph blocks
    ))

    # Only commit to a markup when enough unique formatting elements
    # were found; otherwise fall back to heuristic processing.
    if markdown_hits > 5 or textile_hits > 5:
        return 'markdown' if markdown_hits > textile_hits else 'textile'

    return 'heuristic'
|