ebook-converter/ebook_converter/ebooks/txt/processor.py

# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals

__license__   = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'


'''
Read content from txt file.
'''

import os, re

from ebook_converter import prepare_string_for_xml, isbytestring
from ebook_converter.ebooks.metadata.opf2 import OPFCreator

from ebook_converter.ebooks.conversion.preprocess import DocAnalysis
from ebook_converter.utils.cleantext import clean_ascii_chars
from ebook_converter.polyglot.builtins import iteritems, unicode_type, map, range, long_type

# Skeleton of the generated HTML document. The two %s slots are filled, in
# order, with the document title and the body markup (see the convert_*
# functions in this module, which all use ``HTML_TEMPLATE % (title, body)``).
HTML_TEMPLATE = '<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/><title>%s </title></head><body>\n%s\n</body></html>'


def clean_txt(txt):
    '''
    Normalise raw text so later processing sees a consistent layout.

    Byte strings are decoded as UTF-8 (undecodable bytes are replaced).
    Returns the cleaned text.
    '''
    if isbytestring(txt):
        txt = txt.decode('utf-8', 'replace')

    # Drop trailing whitespace on every line and unify all line breaks to \n.
    txt = '\n'.join([row.rstrip() for row in txt.splitlines()])

    # Ordered (pattern, replacement) pairs; the order matters because each
    # substitution works on the result of the previous one.
    substitutions = (
        # Leading indentation (2+ spaces or tabs) becomes four &nbsp;.
        ('(?m)(?<=^)([ ]{2,}|\t+)(?=.)', '&nbsp;' * 4),
        # Collapse remaining runs of spaces into a single space.
        ('[ ]{2,}', ' '),
        # Trim blank space at the very start of the document...
        (r'^\s+(?=.)', ''),
        # ...and at the very end.
        (r'(?<=.)\s+$', ''),
        # Never keep more than four consecutive line breaks.
        ('\n{5,}', '\n\n\n\n'),
    )
    for pattern, replacement in substitutions:
        txt = re.sub(pattern, replacement, txt)

    # Finally strip invalid ASCII control characters (delegated to
    # clean_ascii_chars).
    return clean_ascii_chars(txt)


def split_txt(txt, epub_split_size_kb=0):
    '''
    Make sure the text contains enough split points for EPUB output.

    When the paragraph type was misdetected the whole document can end up
    as a single huge paragraph; the EPUB splitter then has nowhere to cut
    the file to honour the EPUB size limit and fails. If
    epub_split_size_kb is positive, over-long paragraphs (delimited by a
    blank line) are therefore broken up with split_string_separator.

    Always returns text (byte strings are decoded as UTF-8).
    '''
    if epub_split_size_kb > 0:
        # Work on UTF-8 bytes so that every length below is a byte count.
        if isinstance(txt, unicode_type):
            txt = txt.encode('utf-8')
        total_bytes = len(txt)
        # Average chunk size for an easy EPUB split (+2 as a safe margin).
        pieces_needed = int(total_bytes / (epub_split_size_kb * 1024)) + 2
        chunk_size = long_type(total_bytes / pieces_needed)
        paragraphs = txt.split(b'\n\n')
        # Only rewrite the text when at least one paragraph is too long.
        if paragraphs and max(len(p) for p in paragraphs) > chunk_size:
            txt = b'\n\n'.join(
                split_string_separator(p, chunk_size) for p in paragraphs
            )

    return txt.decode('utf-8') if isbytestring(txt) else txt


def convert_basic(txt, title='', epub_split_size_kb=0):
    '''
    Turn plain text into HTML by wrapping every non-blank line in a <p>
    tag. Runs of blank lines are condensed: the second consecutive blank
    line yields one empty (&nbsp;) paragraph, further ones are dropped.

    The input is expected to have one paragraph per line.
    '''
    body = split_txt(clean_txt(txt), epub_split_size_kb)

    paragraphs = []
    consecutive_blanks = 0
    for raw_line in body.split('\n'):
        if not raw_line.strip():
            consecutive_blanks += 1
            if consecutive_blanks == 2:
                paragraphs.append(u'<p>&nbsp;</p>')
            continue
        # A line with content resets the blank-line counter.
        consecutive_blanks = 0
        escaped = prepare_string_for_xml(raw_line.replace('\n', ' '))
        paragraphs.append(u'<p>%s</p>' % escaped)

    return HTML_TEMPLATE % (title, u'\n'.join(paragraphs))


# Markdown extension names used when convert_markdown() or
# convert_markdown_with_metadata() is called without an explicit list.
DEFAULT_MD_EXTENSIONS = ('footnotes', 'tables', 'toc')


def create_markdown_object(extensions):
    '''
    Build a Markdown converter with the given extensions enabled.

    Extension names are lower-cased and any name not listed in
    MD_EXTENSIONS is silently dropped.
    '''
    # Need to load markdown extensions without relying on pkg_resources
    import importlib
    from ebook_converter.ebooks.markdown import Markdown
    from markdown import Extension

    class NotBrainDeadMarkdown(Markdown):
        def build_extension(self, ext_name, configs):
            # Dotted or "module:Class" names go through the stock loader.
            if '.' in ext_name or ':' in ext_name:
                return Markdown.build_extension(self, ext_name, configs)

            # Short names are resolved inside the bundled extensions package.
            qualified = 'markdown.extensions.' + ext_name
            ext_module = importlib.import_module(qualified)
            if hasattr(ext_module, 'makeExtension'):
                return ext_module.makeExtension(**configs)

            # No factory function: use the first Extension subclass found.
            for candidate in vars(ext_module).values():
                if (type(candidate) is type
                        and issubclass(candidate, Extension)
                        and candidate is not Extension):
                    return candidate(**configs)
            raise ImportError('No extension class in {}'.format(qualified))

    from ebook_converter.ebooks.conversion.plugins.txt_input import MD_EXTENSIONS
    lowered = [name.lower() for name in extensions]
    enabled = [name for name in lowered if name in MD_EXTENSIONS]
    return NotBrainDeadMarkdown(extensions=enabled)


def convert_markdown(txt, title='', extensions=DEFAULT_MD_EXTENSIONS):
    '''
    Render Markdown text to a complete HTML document with the given title.
    '''
    body = create_markdown_object(extensions).convert(txt)
    return HTML_TEMPLATE % (title, body)


def convert_markdown_with_metadata(txt, title='', extensions=DEFAULT_MD_EXTENSIONS):
    '''
    Render Markdown text to HTML and collect the metadata declared in it.

    The ``meta`` Markdown extension is always enabled so that the header
    fields of the document end up in ``md.Meta``.

    :param txt: Markdown source text.
    :param title: Title used when the document does not declare its own.
    :param extensions: Iterable of Markdown extension names. It is copied,
                       the caller's object is never modified.
    :return: ``(mi, html)`` where ``mi`` is the Metadata object built from
             the document and ``html`` the complete HTML document.
    '''
    from ebook_converter.ebooks.metadata.book.base import Metadata
    from ebook_converter.utils.date import parse_only_date
    from ebook_converter.db.write import get_series_values
    # Work on a private list: the default value is a tuple (which has no
    # .append()), and a list passed in by the caller must not be changed
    # behind its back.
    extensions = list(extensions)
    if 'meta' not in extensions:
        extensions.append('meta')
    md = create_markdown_object(extensions)
    html = md.convert(txt)
    mi = Metadata(title or _('Unknown'))
    m = md.Meta
    # Accept 'date' and 'summary' as aliases, unless the canonical key is
    # already present.
    for k, v in iteritems({'date':'pubdate', 'summary':'comments'}):
        if v not in m and k in m:
            m[v] = m.pop(k)
    for k in 'title authors series tags pubdate comments publisher rating'.split():
        val = m.get(k)
        if val:
            mf = mi.metadata_for_field(k)
            # Meta values are sequences; single-valued fields use the first
            # entry only.
            if not mf.get('is_multiple'):
                val = val[0]
            if k == 'series':
                val, si = get_series_values(val)
                mi.series_index = 1 if si is None else si
            if k == 'rating':
                # Clamp to the 0..10 range; skip values that are not numbers.
                try:
                    val = max(0, min(int(float(val)), 10))
                except Exception:
                    continue
            if mf.get('datatype') == 'datetime':
                # Skip dates that cannot be parsed instead of failing.
                try:
                    val = parse_only_date(val, assume_utc=False)
                except Exception:
                    continue
            setattr(mi, k, val)
    return mi, HTML_TEMPLATE % (mi.title, html)


def convert_textile(txt, title=''):
    '''
    Render Textile text to a complete HTML document with the given title.
    '''
    from ebook_converter.ebooks.textile import textile
    return HTML_TEMPLATE % (title, textile(txt, encoding='utf-8'))


def normalize_line_endings(txt):
    '''
    Convert Windows (\\r\\n) and old Mac (\\r) line endings to \\n.
    '''
    # \r\n must be handled first, otherwise it would become two newlines.
    return txt.replace('\r\n', '\n').replace('\r', '\n')


def separate_paragraphs_single_line(txt):
    '''
    Put a blank line between paragraphs by doubling every line break.
    '''
    return '\n\n'.join(txt.split('\n'))


def separate_paragraphs_print_formatted(txt):
    '''
    Insert a line break in front of every indented line (tabs or 2+
    spaces followed by text), keeping the indentation itself.
    '''
    def _prepend_break(match):
        return '\n%s' % match.group('indent')

    indented_line_start = re.compile(u'(?miu)^(?P<indent>\t+|[ ]{2,})(?=.)')
    return indented_line_start.sub(_prepend_break, txt)


def separate_hard_scene_breaks(txt):
    '''
    Surround scene-break lines (lines made up only of separator
    characters) with line breaks so they stand on their own. Matches
    consisting purely of whitespace are left untouched.
    '''
    # NOTE(review): inside the character class, "\t-=" is parsed as a range
    # from tab to "=", so the class also accepts digits, "*", "#", "." and
    # newlines. Confirm whether that is intended before tightening it.
    def _pad_break(match):
        candidate = match.group()
        if candidate.strip():
            return '\n%s\n' % candidate
        return candidate

    return re.sub(r'(?miu)^[ \t-=~\/_]+$', _pad_break, txt)


def block_to_single_line(txt):
    '''
    Join the lines of each block into one line: a line break that sits
    directly between two non-newline characters becomes a space, while
    blank lines are kept.
    '''
    soft_break = re.compile(r'(?<=.)\n(?=.)')
    return soft_break.sub(' ', txt)


def preserve_spaces(txt):
    '''
    Keep runs of spaces visible in HTML: a run of N (2 or more) spaces
    becomes one space followed by N-1 &nbsp; entities, and every tab
    becomes four &nbsp; entities.
    '''
    def _pad(match):
        run_length = len(match.group('space'))
        return ' ' + ('&nbsp;' * (run_length - 1))

    spaced = re.sub('(?P<space>[ ]{2,})', _pad, txt)
    return spaced.replace('\t', '&nbsp;&nbsp;&nbsp;&nbsp;')


def remove_indents(txt):
    '''
    Strip leading whitespace from every line and rejoin the lines
    with \\n.
    '''
    unindented = []
    for row in txt.splitlines():
        unindented.append(row.lstrip())
    return '\n'.join(unindented)


def opf_writer(path, opf_name, manifest, spine, mi):
    '''
    Create an OPF file named opf_name inside the directory path, built
    from the given manifest, spine and metadata (mi).
    '''
    creator = OPFCreator(path, mi)
    creator.create_manifest(manifest)
    creator.create_spine(spine)
    target = os.path.join(path, opf_name)
    # NOTE(review): lopen is neither defined nor imported in this module;
    # presumably it is provided as a builtin by the application - verify.
    with lopen(target, 'wb') as stream:
        creator.render(stream)


def split_string_separator(txt, size):
    '''
    Break an over-long byte string into chunks by inserting a blank line
    (two newline bytes) roughly every ``size`` bytes.

    Inside each chunk the separator is placed right after the last period
    so that the break falls at the end of a sentence. A chunk without a
    period gets the separator at its end; in that case the chunk is
    shortened by a few bytes if needed so that the separator never lands
    in the middle of a multi-byte UTF-8 character.

    :param txt: Byte string to split.
    :param size: Target chunk size in bytes, separator included.
    :return: ``txt`` unchanged when it is not longer than ``size`` or when
             ``size`` is 2 or less, otherwise the byte string with the
             separators inserted. No other bytes are added or removed.
    '''
    if len(txt) <= size or size <= 2:
        return txt

    # Reserve room for the two separator bytes added to every chunk.
    size -= 2
    total = len(txt)
    pieces = []
    start = 0
    while start < total:
        end = min(start + size, total)
        part = txt[start:end]
        idx = part.rfind(b'.')
        if idx == -1:
            # No sentence end in this chunk: break at the chunk end, but
            # back up past UTF-8 continuation bytes (0x80-0xBF) so the
            # separator is not inserted inside a character.
            cut = end
            while start < cut < total and 0x80 <= ord(txt[cut:cut + 1]) <= 0xBF:
                cut -= 1
            if cut > start:
                end = cut
                part = txt[start:end]
            part += b'\n\n'
        else:
            # Break right after the last period of the chunk.
            part = part[:idx + 1] + b'\n\n' + part[idx + 1:]
        pieces.append(part)
        start = end
    return b''.join(pieces)


def detect_paragraph_type(txt):
    '''
    Guess how paragraphs are laid out in the document.

    block: Paragraphs are separated by a blank line.
    single: Each line is a paragraph.
    print: Each paragraph starts with 2+ spaces or a tab
           and ends when a new paragraph is reached.
    unformatted: most lines have hard line breaks, few/no blank lines or indents

    returns block, single, print, unformatted
    '''
    txt = txt.replace('\r\n', '\n').replace('\r', '\n')
    txt_line_count = len(re.findall(r'(?mu)^\s*.+$', txt))

    # Hard line breaks: line_histogram(.55) is expected to be true when
    # 55% of the document breaks in the same region.
    docanalysis = DocAnalysis('txt', txt)
    hardbreaks = docanalysis.line_histogram(.55)

    # Without hard line breaks every line is taken as a paragraph.
    if not hardbreaks:
        return 'single'

    total = float(txt_line_count)
    # Share of lines that start with a tab or 2+ whitespace characters.
    print_percent = len(re.findall(r'(?mu)^(\t|\s{2,}).+$', txt)) / total
    # Share of empty (whitespace-only) lines.
    block_percent = len(re.findall(r'(?mu)^\s*$', txt)) / total

    # The layout with the larger share is the candidate; it only wins if
    # its share is neither marginal nor the vast majority of the document.
    if print_percent >= block_percent:
        candidate, share = 'print', print_percent
    else:
        candidate, share = 'block', block_percent
    if .15 <= share <= .75:
        return candidate

    # Hard line breaks but no recognisable paragraph layout.
    return 'unformatted'


def detect_formatting_type(txt):
    '''
    Tries to determine the formatting of the document.

    Counts constructs typical of Markdown and of Textile. If either count
    is greater than 5 the format with the higher count is returned (on a
    tie: textile).

    markdown: Markdown formatting is used.
    textile: Textile formatting is used.
    heuristic: When none of the above formatting types are
               detected heuristic is returned.
    '''
    markdown_patterns = (
        # Headings: "# Title" style and "====" / "----" underlines.
        '(?mu)^#+',
        '(?mu)^=+$',
        '(?mu)^-+$',
        # Images: ![alt](url) or ![alt][ref]
        r'(?u)!\[.*?\](\[|\()',
        # Links: [text](url) or [text][ref] not preceded by "!". The
        # alternation has to be grouped; a bare "^|..." matches the empty
        # string at the start of every text and adds a bogus hit.
        r'(?u)(?:^|[^!])\[.*?\](\[|\()',
    )
    textile_patterns = (
        # Headings
        r'(?mu)^h[1-6]\.',
        # Block quote.
        r'(?mu)^bq\.',
        # Images
        r'(?mu)(?<=\!)\S+(?=\!)',
        # Links
        r'"[^"]*":\S+',
        # paragraph blocks
        r'(?mu)^p(<|<>|=|>)?\. ',
    )

    # Number of format specific objects found in the text.
    markdown_count = sum(len(re.findall(p, txt)) for p in markdown_patterns)
    textile_count = sum(len(re.findall(p, txt)) for p in textile_patterns)

    # Decide if either markdown or textile is used in the text
    # based on the number of formatting elements found.
    if markdown_count > 5 or textile_count > 5:
        if markdown_count > textile_count:
            return 'markdown'
        return 'textile'

    return 'heuristic'