Added txt related modules

2020-04-19 13:50:42 +02:00
parent 0f628900f3
commit 69d2e536c5
7 changed files with 2311 additions and 0 deletions
@@ -0,0 +1,264 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL 3'
+__copyright__ = '2009, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+'''
+Transform OEB content into plain text
+'''
+
+import re
+
+from lxml import etree
+from polyglot.builtins import string_or_bytes
+
+
+BLOCK_TAGS = [
+    'div',
+    'p',
+    'h1',
+    'h2',
+    'h3',
+    'h4',
+    'h5',
+    'h6',
+    'li',
+    'tr',
+]
+
+BLOCK_STYLES = [
+    'block',
+]
+
+HEADING_TAGS = [
+    'h1',
+    'h2',
+    'h3',
+    'h4',
+    'h5',
+    'h6',
+]
+
+SPACE_TAGS = [
+    'td',
+    'br',
+]
+
+
+class TXTMLizer(object):
+
+    def __init__(self, log):
+        self.log = log
+
+    def extract_content(self, oeb_book, opts):
+        self.log.info('Converting XHTML to TXT...')
+        self.oeb_book = oeb_book
+        self.opts = opts
+        self.toc_titles = []
+        self.toc_ids = []
+        self.last_was_heading = False
+
+        self.create_flat_toc(self.oeb_book.toc)
+
+        return self.mlize_spine()
+
+    def mlize_spine(self):
+        from calibre.ebooks.oeb.base import XHTML
+        from calibre.ebooks.oeb.stylizer import Stylizer
+        from calibre.utils.xml_parse import safe_xml_fromstring
+        output = [u'']
+        output.append(self.get_toc())
+        for item in self.oeb_book.spine:
+            self.log.debug('Converting %s to TXT...' % item.href)
+            for x in item.data.iterdescendants(etree.Comment):
+                if x.text and '--' in x.text:
+                    x.text = x.text.replace('--', '__')
+            content = etree.tostring(item.data, encoding='unicode')
+            content = self.remove_newlines(content)
+            content = safe_xml_fromstring(content)
+            stylizer = Stylizer(content, item.href, self.oeb_book, self.opts, self.opts.output_profile)
+            output += self.dump_text(content.find(XHTML('body')), stylizer, item)
+            output += '\n\n\n\n\n\n'
+        output = ''.join(output)
+        output = '\n'.join(l.rstrip() for l in output.splitlines())
+        output = self.cleanup_text(output)
+
+        return output
+
+    def remove_newlines(self, text):
+        self.log.debug('\tRemove newlines for processing...')
+        text = text.replace('\r\n', ' ')
+        text = text.replace('\n', ' ')
+        text = text.replace('\r', ' ')
+        # Condense redundant spaces created by replacing newlines with spaces.
+        text = re.sub(r'[ ]{2,}', ' ', text)
+
+        return text
+
+    def get_toc(self):
+        toc = ['']
+        if getattr(self.opts, 'inline_toc', None):
+            self.log.debug('Generating table of contents...')
+            toc.append('%s\n\n' % _('Table of Contents:'))
+            for item in self.toc_titles:
+                toc.append('* %s\n\n' % item)
+        return ''.join(toc)
+
+    def create_flat_toc(self, nodes):
+        '''
+        Turns a hierarchical list of TOC href's into a flat list.
+        '''
+        for item in nodes:
+            self.toc_titles.append(item.title)
+            self.toc_ids.append(item.href)
+            self.create_flat_toc(item.nodes)
+
+    def cleanup_text(self, text):
+        self.log.debug('\tClean up text...')
+        # Replace bad characters.
+        text = text.replace(u'\xa0', ' ')
+
+        # Replace tabs, vertical tags and form feeds with single space.
+        text = text.replace('\t+', ' ')
+        text = text.replace('\v+', ' ')
+        text = text.replace('\f+', ' ')
+
+        # Single line paragraph.
+        text = re.sub('(?<=.)\n(?=.)', ' ', text)
+
+        # Remove multiple spaces.
+        text = re.sub('[ ]{2,}', ' ', text)
+
+        # Remove excessive newlines.
+        text = re.sub('\n[ ]+\n', '\n\n', text)
+        if self.opts.remove_paragraph_spacing:
+            text = re.sub('\n{2,}', '\n', text)
+            text = re.sub(r'(?msu)^(?P<t>[^\t\n]+?)$', lambda mo: u'%s\n\n' % mo.group('t'), text)
+            text = re.sub(r'(?msu)(?P<b>[^\n])\n+(?P<t>[^\t\n]+?)(?=\n)', lambda mo: '%s\n\n\n\n\n\n%s' % (mo.group('b'), mo.group('t')), text)
+        else:
+            text = re.sub('\n{7,}', '\n\n\n\n\n\n', text)
+
+        # Replace spaces at the beginning and end of lines
+        # We don't replace tabs because those are only added
+        # when remove paragraph spacing is enabled.
+        text = re.sub('(?imu)^[ ]+', '', text)
+        text = re.sub('(?imu)[ ]+$', '', text)
+
+        # Remove empty space and newlines at the beginning of the document.
+        text = re.sub(r'(?u)^[ \n]+', '', text)
+
+        if self.opts.max_line_length:
+            max_length = self.opts.max_line_length
+            if self.opts.max_line_length < 25 and not self.opts.force_max_line_length:
+                max_length = 25
+            short_lines = []
+            lines = text.splitlines()
+            for line in lines:
+                while len(line) > max_length:
+                    space = line.rfind(' ', 0, max_length)
+                    if space != -1:
+                        # Space was found.
+                        short_lines.append(line[:space])
+                        line = line[space + 1:]
+                    else:
+                        # Space was not found.
+                        if self.opts.force_max_line_length:
+                            # Force breaking at max_lenght.
+                            short_lines.append(line[:max_length])
+                            line = line[max_length:]
+                        else:
+                            # Look for the first space after max_length.
+                            space = line.find(' ', max_length, len(line))
+                            if space != -1:
+                                # Space was found.
+                                short_lines.append(line[:space])
+                                line = line[space + 1:]
+                            else:
+                                # No space was found cannot break line.
+                                short_lines.append(line)
+                                line = ''
+                # Add the text that was less than max_lengh to the list
+                short_lines.append(line)
+            text = '\n'.join(short_lines)
+
+        return text
+
+    def dump_text(self, elem, stylizer, page):
+        '''
+        @elem: The element in the etree that we are working on.
+        @stylizer: The style information attached to the element.
+        @page: OEB page used to determine absolute urls.
+        '''
+        from calibre.ebooks.oeb.base import XHTML_NS, barename, namespace
+
+        if not isinstance(elem.tag, string_or_bytes) \
+           or namespace(elem.tag) != XHTML_NS:
+            p = elem.getparent()
+            if p is not None and isinstance(p.tag, string_or_bytes) and namespace(p.tag) == XHTML_NS \
+                    and elem.tail:
+                return [elem.tail]
+            return ['']
+
+        text = ['']
+        style = stylizer.style(elem)
+
+        if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
+           or style['visibility'] == 'hidden':
+            if hasattr(elem, 'tail') and elem.tail:
+                return [elem.tail]
+            return ['']
+
+        tag = barename(elem.tag)
+        tag_id = elem.attrib.get('id', None)
+        in_block = False
+        in_heading = False
+
+        # Are we in a heading?
+        # This can either be a heading tag or a TOC item.
+        if tag in HEADING_TAGS or '%s#%s' % (page.href, tag_id) in self.toc_ids:
+            in_heading = True
+            if not self.last_was_heading:
+                text.append('\n\n\n\n\n\n')
+
+        # Are we in a paragraph block?
+        if tag in BLOCK_TAGS or style['display'] in BLOCK_STYLES:
+            if self.opts.remove_paragraph_spacing and not in_heading:
+                text.append('\t')
+            in_block = True
+
+        if tag in SPACE_TAGS:
+            text.append(' ')
+
+        # Hard scene breaks.
+        if tag == 'hr':
+            text.append('\n\n* * *\n\n')
+        # Soft scene breaks.
+        try:
+            ems = int(round((float(style.marginTop) / style.fontSize) - 1))
+            if ems >= 1:
+                text.append('\n' * ems)
+        except:
+            pass
+
+        # Process tags that contain text.
+        if hasattr(elem, 'text') and elem.text:
+            text.append(elem.text)
+
+        # Recurse down into tags within the tag we are in.
+        for item in elem:
+            text += self.dump_text(item, stylizer, page)
+
+        if in_block:
+            text.append('\n\n')
+        if in_heading:
+            text.append('\n')
+            self.last_was_heading = True
+        else:
+            self.last_was_heading = False
+
+        if hasattr(elem, 'tail') and elem.tail:
+            text.append(elem.tail)
+
+        return text