ebook-converter/ebook_converter/ebooks/txt/txtml.py

"""
Transform OEB content into plain text
"""
import re

from lxml import etree

from ebook_converter import constants as const
from ebook_converter.ebooks.oeb import base
from ebook_converter.ebooks.oeb import parse_utils
from ebook_converter.ebooks.oeb.stylizer import Stylizer


# Heading elements. TXTMLizer.dump_text surrounds these with extra newlines.
HEADING_TAGS = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']

# Elements that TXTMLizer.dump_text always treats as a paragraph block
# (every heading element is one of them).
BLOCK_TAGS = ['div', 'p'] + HEADING_TAGS + ['li', 'tr']

# CSS ``display`` values that make any element count as a paragraph block.
BLOCK_STYLES = ['block']

# Elements for which TXTMLizer.dump_text emits a single space.
SPACE_TAGS = ['td', 'br']


class TXTMLizer(object):
    """Convert the XHTML spine of an OEB book into one plain-text string.

    Entry point is :meth:`extract_content`; the remaining methods are the
    individual steps it runs.
    """

    def __init__(self, log):
        """Remember *log*, the logger used for info/debug messages."""
        self.log = log

    def extract_content(self, oeb_book, opts):
        """Return the text of *oeb_book* converted according to *opts*.

        :param oeb_book: book object; its ``toc`` and ``spine`` are read.
        :param opts: conversion options. This class reads ``inline_toc``,
            ``remove_paragraph_spacing``, ``max_line_length``,
            ``force_max_line_length`` and ``output_profile``.
        :return: the converted book as a single string.
        """
        self.log.info('Converting XHTML to TXT...')
        self.oeb_book = oeb_book
        self.opts = opts
        self.toc_titles = []
        self.toc_ids = []
        # Tracks whether the previously closed element was a heading so that
        # consecutive headings are not each preceded by a big gap.
        self.last_was_heading = False

        self.create_flat_toc(self.oeb_book.toc)

        return self.mlize_spine()

    def mlize_spine(self):
        """Convert every spine item to text and return the cleaned result.

        Note that comments inside each item's tree are modified in place
        (``--`` becomes ``__``) before the item is serialized.
        """
        output = [u'']
        output.append(self.get_toc())
        for item in self.oeb_book.spine:
            self.log.debug('Converting %s to TXT...', item.href)
            # XML does not allow '--' inside a comment; presumably this keeps
            # the serialized markup re-parseable by fromstring() below.
            for x in item.data.iterdescendants(etree.Comment):
                if x.text and '--' in x.text:
                    x.text = x.text.replace('--', '__')
            # Round-trip through a string so newlines in the markup can be
            # flattened before the tree is walked.
            content = etree.tostring(item.data, encoding='unicode')
            content = self.remove_newlines(content)
            content = etree.fromstring(content)
            stylizer = Stylizer(content, item.href, self.oeb_book, self.opts,
                                self.opts.output_profile)
            output += self.dump_text(content.find(base.tag('xhtml', 'body')),
                                     stylizer, item)
            # Separate spine items with a run of six newlines.
            output.append('\n\n\n\n\n\n')
        output = ''.join(output)
        # Drop trailing whitespace from every line.
        output = '\n'.join(line.rstrip() for line in output.splitlines())
        output = self.cleanup_text(output)

        return output

    def remove_newlines(self, text):
        """Return *text* with every line break replaced by a single space.

        Runs of two or more spaces (including those created by the
        replacement) are condensed to one space.
        """
        self.log.debug('\tRemove newlines for processing...')
        text = text.replace('\r\n', ' ')
        text = text.replace('\n', ' ')
        text = text.replace('\r', ' ')
        # Condense redundant spaces created by replacing newlines with spaces.
        text = re.sub(r'[ ]{2,}', ' ', text)

        return text

    def get_toc(self):
        """Return the inline table of contents, or ``''`` when disabled.

        The table is only generated when ``opts.inline_toc`` is truthy; it
        lists the titles collected by :meth:`create_flat_toc`.
        """
        toc = ['']
        if getattr(self.opts, 'inline_toc', None):
            self.log.debug('Generating table of contents...')
            toc.append('Table of Contents:\n\n')
            for item in self.toc_titles:
                toc.append('* %s\n\n' % item)
        return ''.join(toc)

    def create_flat_toc(self, nodes):
        """Flatten the hierarchical TOC *nodes* depth-first.

        Each node's ``title`` is appended to ``self.toc_titles`` and its
        ``href`` to ``self.toc_ids``; children (``node.nodes``) follow their
        parent.
        """
        for item in nodes:
            self.toc_titles.append(item.title)
            self.toc_ids.append(item.href)
            self.create_flat_toc(item.nodes)

    def cleanup_text(self, text):
        """Normalize whitespace in *text* and optionally wrap long lines.

        Reads ``remove_paragraph_spacing``, ``max_line_length`` and
        ``force_max_line_length`` from ``self.opts``.

        :param text: raw text produced by :meth:`dump_text`.
        :return: the cleaned text.
        """
        self.log.debug('\tClean up text...')
        # Replace bad characters.
        text = text.replace(u'\xa0', ' ')

        # Replace runs of vertical tabs and form feeds with a single space.
        # Tabs are deliberately left untouched: dump_text() emits them as the
        # paragraph indent when remove_paragraph_spacing is enabled and the
        # regexes further down rely on them.
        text = re.sub(r'[\v\f]+', ' ', text)

        # Single line paragraph: a lone newline with text on both sides is a
        # line break inside a paragraph, so join the two lines.
        text = re.sub('(?<=.)\n(?=.)', ' ', text)

        # Remove multiple spaces.
        text = re.sub('[ ]{2,}', ' ', text)

        # Remove excessive newlines.
        text = re.sub('\n[ ]+\n', '\n\n', text)
        if self.opts.remove_paragraph_spacing:
            text = re.sub('\n{2,}', '\n', text)
            # Lines that contain no tab (i.e. that did not get the paragraph
            # indent) are followed by an empty line...
            text = re.sub(r'(?msu)^(?P<t>[^\t\n]+?)$', lambda mo: u'%s\n\n' %
                          mo.group('t'), text)
            # ...and separated from the text before them by six newlines.
            text = re.sub(r'(?msu)(?P<b>[^\n])\n+(?P<t>[^\t\n]+?)(?=\n)',
                          lambda mo: '%s\n\n\n\n\n\n%s' % (mo.group('b'),
                                                           mo.group('t')),
                          text)
        else:
            # Cap any run of newlines at six.
            text = re.sub('\n{7,}', '\n\n\n\n\n\n', text)

        # Replace spaces at the beginning and end of lines
        # We don't replace tabs because those are only added
        # when remove paragraph spacing is enabled.
        text = re.sub('(?imu)^[ ]+', '', text)
        text = re.sub('(?imu)[ ]+$', '', text)

        # Remove empty space and newlines at the beginning of the document.
        text = re.sub(r'(?u)^[ \n]+', '', text)

        if self.opts.max_line_length:
            max_length = self.opts.max_line_length
            # Lengths below 25 are only honoured when explicitly forced.
            if (self.opts.max_line_length < 25 and not
                    self.opts.force_max_line_length):
                max_length = 25
            short_lines = []
            for line in text.splitlines():
                while len(line) > max_length:
                    # Prefer breaking at the last space before the limit.
                    space = line.rfind(' ', 0, max_length)
                    if space != -1:
                        # Space was found.
                        short_lines.append(line[:space])
                        line = line[space + 1:]
                    elif self.opts.force_max_line_length:
                        # Force breaking at max_length.
                        short_lines.append(line[:max_length])
                        line = line[max_length:]
                    else:
                        # Look for the first space after max_length.
                        space = line.find(' ', max_length, len(line))
                        if space == -1:
                            # No space was found, cannot break the line. Leave
                            # it intact; it is emitted once, right after the
                            # loop, without a trailing empty line.
                            break
                        short_lines.append(line[:space])
                        line = line[space + 1:]
                # Add the remainder (short enough or unbreakable) to the list.
                short_lines.append(line)
            text = '\n'.join(short_lines)

        return text

    def dump_text(self, elem, stylizer, page):
        """Recursively convert *elem* and its children to text fragments.

        :param elem: the element in the etree that is being converted.
        :param stylizer: provides the computed style of each element via
            ``stylizer.style(elem)``.
        :param page: spine item being converted; its ``href`` is combined
            with element ids to recognise TOC targets.
        :return: list of strings that, joined, form the text of *elem*
            followed by its tail.
        """

        # Nodes whose tag is not a string (e.g. lxml comments and processing
        # instructions) or that are outside the XHTML namespace contribute no
        # text of their own; only their tail is kept, and only when the
        # parent is an XHTML element.
        if not isinstance(elem.tag, (str, bytes)) \
           or parse_utils.namespace(elem.tag) != const.XHTML_NS:
            p = elem.getparent()
            if (p is not None and isinstance(p.tag, (str, bytes)) and
                    parse_utils.namespace(p.tag) == const.XHTML_NS and
                    elem.tail):
                return [elem.tail]
            return ['']

        text = ['']
        style = stylizer.style(elem)

        # Invisible elements are skipped entirely, but their tail text
        # belongs to the parent and is preserved.
        if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
           or style['visibility'] == 'hidden':
            if hasattr(elem, 'tail') and elem.tail:
                return [elem.tail]
            return ['']

        tag = parse_utils.barename(elem.tag)
        tag_id = elem.attrib.get('id', None)
        in_block = False
        in_heading = False

        # Are we in a heading?
        # This can either be a heading tag or a TOC item.
        if tag in HEADING_TAGS or '%s#%s' % (page.href,
                                             tag_id) in self.toc_ids:
            in_heading = True
            # Only the first of several consecutive headings gets the gap.
            if not self.last_was_heading:
                text.append('\n\n\n\n\n\n')

        # Are we in a paragraph block?
        if tag in BLOCK_TAGS or style['display'] in BLOCK_STYLES:
            # The tab is the paragraph indent; cleanup_text() relies on it to
            # tell paragraphs from headings.
            if self.opts.remove_paragraph_spacing and not in_heading:
                text.append('\t')
            in_block = True

        if tag in SPACE_TAGS:
            text.append(' ')

        # Hard scene breaks.
        if tag == 'hr':
            text.append('\n\n* * *\n\n')
        # Soft scene breaks: a top margin of two font sizes or more adds one
        # newline per font size beyond the first (after rounding).
        try:
            ems = int(round((float(style.marginTop) / style.fontSize) - 1))
            if ems >= 1:
                text.append('\n' * ems)
        except Exception:
            # Best effort only: if the margin cannot be computed for this
            # element, no soft scene break is emitted.
            pass

        # Process tags that contain text.
        if hasattr(elem, 'text') and elem.text:
            text.append(elem.text)

        # Recurse down into tags within the tag we are in.
        for item in elem:
            text += self.dump_text(item, stylizer, page)

        if in_block:
            text.append('\n\n')
        if in_heading:
            text.append('\n')
            self.last_was_heading = True
        else:
            self.last_was_heading = False

        if hasattr(elem, 'tail') and elem.tail:
            text.append(elem.tail)

        return text