Added txt related modules

2020-04-19 13:50:42 +02:00
parent 0f628900f3
commit 69d2e536c5
7 changed files with 2311 additions and 0 deletions
@@ -0,0 +1,286 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL 3'
+__copyright__ = '''2011, John Schember <john@nachtimwald.com>
+2011, Leigh Parry <leighparry@blueyonder.co.uk>'''
+__docformat__ = 'restructuredtext en'
+
+'''
+Transform OEB content into Markdown formatted plain text
+'''
+import re
+
+from functools import partial
+
+from calibre.ebooks.htmlz.oeb2html import OEB2HTML
+from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace, rewrite_links
+from calibre.ebooks.oeb.stylizer import Stylizer
+from polyglot.builtins import unicode_type, string_or_bytes
+
+
+class MarkdownMLizer(OEB2HTML):
+
+    def extract_content(self, oeb_book, opts):
+        '''
+        Convert the whole book to Markdown and return it as one string.
+
+        @oeb_book: The book whose spine is converted.
+        @opts: The conversion options; stored on the instance as self.opts.
+
+        All per-conversion state kept on the instance is reset before the
+        spine is walked, and the result is passed through tidy_up.
+        '''
+        self.log.info('Converting XHTML to Markdown formatted TXT...')
+        self.opts = opts
+        # State that is consulted and updated while the tree is walked.
+        self.in_code = self.in_pre = False
+        self.list = []
+        self.blockquotes = 0
+        self.remove_space_after_newline = False
+        self.base_hrefs = [spine_item.href for spine_item in oeb_book.spine]
+        self.map_resources(oeb_book)
+
+        # Emphasis markers that are currently open.
+        self.style_bold = self.style_italic = False
+
+        # Convert first, then do some tidying up.
+        return self.tidy_up(self.mlize_spine(oeb_book))
+
+    def mlize_spine(self, oeb_book):
+        '''
+        Convert every spine item, in order, and return the joined text.
+
+        Ids and links of each item are rewritten before it is handed to
+        dump_text; two newlines are appended after every item.
+        '''
+        chunks = ['']
+        for spine_item in oeb_book.spine:
+            self.log.debug('Converting %s to Markdown formatted TXT...' % spine_item.href)
+            self.rewrite_ids(spine_item.data, spine_item)
+            link_rewriter = partial(self.rewrite_link, page=spine_item)
+            rewrite_links(spine_item.data, link_rewriter)
+            stylizer = Stylizer(spine_item.data, spine_item.href, oeb_book, self.opts, self.opts.output_profile)
+            body = spine_item.data.find(XHTML('body'))
+            chunks.extend(self.dump_text(body, stylizer))
+            chunks.append('\n\n')
+        return ''.join(chunks)
+
+    def tidy_up(self, text):
+        '''
+        Normalise the whitespace of the generated Markdown.
+
+        @text: The raw text produced by mlize_spine.
+
+        Returns the text with one to three leading spaces stripped from every
+        line, a leading space that is still left expanded to four spaces,
+        tabs removed everywhere except at the start of a line, runs of
+        newlines capped at six, no leading whitespace and exactly two
+        trailing newlines.
+        '''
+        # Remove blank space from the beginning of a paragraph.
+        text = re.sub('(?msu)^[ ]{1,3}', '', text)
+        # pre has 4 spaces. We trimmed 3 so anything with a space left is a pre.
+        text = re.sub('(?msu)^[ ]', '    ', text)
+
+        # Remove tabs that aren't at the beginning of a line.
+        new_text = []
+        for line in text.splitlines():
+            leading = re.match('\t+', line)
+            prefix = leading.group() if leading else ''
+            new_text.append(prefix + line.replace('\t', ''))
+        text = '\n'.join(new_text)
+
+        # Remove spaces from blank lines.
+        text = re.sub('(?msu)^[ ]+$', '', text)
+
+        # Reduce blank lines
+        text = re.sub('(?msu)\n{7,}', '\n' * 6, text)
+
+        # Remove blank lines at beginning and end of document.
+        # strip() is used instead of re.sub(r'\s*$', '\n\n', text): since
+        # Python 3.7 that substitution also replaces the empty match that
+        # follows the trailing whitespace, which left four newlines at the
+        # end of the document instead of two.
+        text = text.strip() + '\n\n'
+
+        return text
+
+    def remove_newlines(self, text):
+        '''
+        Collapse text onto a single line.
+
+        @text: The string to flatten.
+
+        Every line break becomes a space, runs of spaces are condensed to a
+        single space and tabs are dropped. When
+        self.remove_space_after_newline is set, leading spaces are stripped
+        as well and the flag is cleared. Returns the flattened string.
+        '''
+        # \r\n is listed first so that it is replaced as a unit.
+        text = re.sub(r'\r\n|\n|\r', ' ', text)
+        # Condense redundant spaces created by replacing newlines with spaces.
+        text = re.sub(r'[ ]{2,}', ' ', text)
+        text = re.sub(r'\t+', '', text)
+        if self.remove_space_after_newline:
+            text = text.lstrip(' ')
+            self.remove_space_after_newline = False
+        return text
+
+    def prepare_string_for_markdown(self, txt):
+        '''
+        Return txt with a backslash placed in front of every character that
+        the pattern below lists (backslash, backtick, asterisk, underscore,
+        braces, brackets, parentheses, hash, plus and exclamation mark).
+        '''
+        return re.sub(r'([\\`*_{}\[\]()#+!])', lambda found: '\\' + found.group(1), txt)
+
+    def prepare_string_for_pre(self, txt):
+        '''
+        Return txt with every line indented by four spaces.
+
+        The lines are split with splitlines() and rejoined with a single
+        newline character, so a trailing line break is not kept.
+        '''
+        return '\n'.join('    ' + line for line in txt.splitlines())
+
+    def dump_text(self, elem, stylizer):
+        '''
+        Convert elem and everything below it to Markdown.
+
+        @elem: The element in the etree that we are working on.
+        @stylizer: The style information attached to the element.
+
+        Returns a list of strings. The list holds, in order, the opening
+        markup of the element, its text, the output of its children (handled
+        recursively), the closing markup and finally the tail text that
+        follows the element.
+        '''
+
+        # We can only process tags. If there isn't a tag return any text.
+        if not isinstance(elem.tag, string_or_bytes) \
+           or namespace(elem.tag) != XHTML_NS:
+            p = elem.getparent()
+            if p is not None and isinstance(p.tag, string_or_bytes) and namespace(p.tag) == XHTML_NS \
+                    and elem.tail:
+                return [elem.tail]
+            return ['']
+
+        # Setup our variables.
+        # text collects the output; tags collects the closing markers that
+        # are written (in reverse order) once the children are done.
+        text = []
+        style = stylizer.style(elem)
+        tags = []
+        tag = barename(elem.tag)
+        attribs = elem.attrib
+
+        # Ignore anything that is set to not be displayed.
+        # The tail is still returned because it lies outside the element.
+        if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
+           or style['visibility'] == 'hidden':
+            if hasattr(elem, 'tail') and elem.tail:
+                return [elem.tail]
+            return ['']
+
+        # Soft scene breaks.
+        # A top margin of more than one font size is rendered as blank lines.
+        if 'margin-top' in style.cssdict() and style['margin-top'] != 'auto':
+            ems = int(round(float(style.marginTop) / style.fontSize) - 1)
+            if ems >= 1:
+                text.append(u'\n\n' * ems)
+
+        # One '> ' per blockquote that is currently open.
+        bq = '> ' * self.blockquotes
+        # Block level elements
+        if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div'):
+            h_tag = ''
+            if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
+                # The digit of the tag name gives the number of '#' signs.
+                h_tag = '#' * int(tag[1]) + ' '
+            text.append('\n' + bq + h_tag)
+            tags.append('\n')
+            self.remove_space_after_newline = True
+
+        # Emphasis is only opened when it is not already open, so nested
+        # italic/bold elements do not emit a second marker.
+        if style['font-style'] == 'italic' or tag in ('i', 'em'):
+            if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'cite'):
+                if self.style_italic == False:  # noqa
+                    text.append('*')
+                    tags.append('*')
+                    self.style_italic = True
+        if style['font-weight'] in ('bold', 'bolder') or tag in ('b', 'strong'):
+            if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'th'):
+                if self.style_bold == False:  # noqa
+                    text.append('**')
+                    tags.append('**')
+                    self.style_bold = True
+        if tag == 'br':
+            # Two spaces followed by a newline.
+            text.append('  \n')
+            self.remove_space_after_newline = True
+        if tag == 'blockquote':
+            self.blockquotes += 1
+            # '>' is only a marker; closing it decrements the counter.
+            tags.append('>')
+            text.append('> ' * self.blockquotes)
+        elif tag == 'code':
+            if not self.in_pre and not self.in_code:
+                text.append('`')
+                tags.append('`')
+                self.in_code = True
+        elif tag == 'pre':
+            if not self.in_pre:
+                text.append('\n')
+                tags.append('pre')
+                self.in_pre = True
+        elif tag == 'hr':
+            text.append('\n* * *')
+            tags.append('\n')
+        elif tag == 'a':
+            # Only write links with absolute (external) urls.
+            if self.opts.keep_links and 'href' in attribs and '://' in attribs['href']:
+                title = ''
+                if 'title' in attribs:
+                    title = ' "' + attribs['title'] + '"'
+                    # remove_newlines clears the flag, so it is saved and
+                    # restored around the call.
+                    remove_space = self.remove_space_after_newline
+                    title = self.remove_newlines(title)
+                    self.remove_space_after_newline = remove_space
+                text.append('[')
+                tags.append('](' + attribs['href'] + title + ')')
+        elif tag == 'img':
+            if self.opts.keep_image_references:
+                txt = '!'
+                if 'alt' in attribs:
+                    # Same save/restore of the flag as for link titles.
+                    remove_space = self.remove_space_after_newline
+                    txt += '[' + self.remove_newlines(attribs['alt']) + ']'
+                    self.remove_space_after_newline = remove_space
+                txt += '(' + attribs['src'] + ')'
+                text.append(txt)
+        elif tag in ('ol', 'ul'):
+            tags.append(tag)
+            # Add the list to our lists of lists so we can track
+            # nested lists.
+            self.list.append({'name': tag, 'num': 0})
+        elif tag == 'li':
+            # Get the last list from our list of lists
+            if self.list:
+                li = self.list[-1]
+            else:
+                # An item outside any list is treated as an unordered item.
+                li = {'name': 'ul', 'num': 0}
+            # Add a new line to start the item
+            text.append('\n')
+            # Add indent if we have nested lists.
+            list_count = len(self.list)
+            # We only care about indenting nested lists.
+            if (list_count - 1) > 0:
+                text.append('\t' * (list_count - 1))
+            # Add blockquote if we have a blockquote in a list item.
+            text.append(bq)
+            # Write the proper sign for ordered and unordered lists.
+            if li['name'] == 'ul':
+                text.append('+ ')
+            elif li['name'] == 'ol':
+                li['num'] += 1
+                text.append(unicode_type(li['num']) + '. ')
+
+        # Process tags that contain text.
+        if hasattr(elem, 'text') and elem.text:
+            txt = elem.text
+            if self.in_pre:
+                txt = self.prepare_string_for_pre(txt)
+            elif self.in_code:
+                txt = self.remove_newlines(txt)
+            else:
+                txt = self.prepare_string_for_markdown(self.remove_newlines(txt))
+            text.append(txt)
+
+        # Recurse down into tags within the tag we are in.
+        for item in elem:
+            text += self.dump_text(item, stylizer)
+
+        # Close all open tags.
+        # Markers were pushed in opening order, so they are closed reversed.
+        tags.reverse()
+        for t in tags:
+            if t in ('pre', 'ul', 'ol', '>'):
+                # These markers only update state; they are not written out.
+                if t == 'pre':
+                    self.in_pre = False
+                    text.append('\n')
+                elif t == '>':
+                    self.blockquotes -= 1
+                elif t in ('ul', 'ol'):
+                    if self.list:
+                        self.list.pop()
+                    text.append('\n')
+            else:
+                if t == '**':
+                    self.style_bold = False
+                elif t == '*':
+                    self.style_italic = False
+                elif t == '`':
+                    self.in_code = False
+                text.append('%s' % t)
+
+        # Soft scene breaks.
+        if 'margin-bottom' in style.cssdict() and style['margin-bottom'] != 'auto':
+            ems = int(round((float(style.marginBottom) / style.fontSize) - 1))
+            if ems >= 1:
+                text.append(u'\n\n' * ems)
+
+        # Add the text that is outside of the tag.
+        if hasattr(elem, 'tail') and elem.tail:
+            tail = elem.tail
+            if self.in_pre:
+                tail = self.prepare_string_for_pre(tail)
+            elif self.in_code:
+                tail = self.remove_newlines(tail)
+            else:
+                tail = self.prepare_string_for_markdown(self.remove_newlines(tail))
+            text.append(tail)
+
+        return text
@@ -0,0 +1,32 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL 3'
+__copyright__ = '2009, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+import os
+
+
+class TxtNewlines(object):
+    '''
+    Resolve the name of a newline style to the characters it stands for.
+
+    The resolved sequence is stored in self.newline. The name is lowercased
+    before the lookup; a name that is not in NEWLINE_TYPES gives os.linesep.
+    '''
+
+    NEWLINE_TYPES = {
+        'system': os.linesep,
+        'unix': '\n',
+        'old_mac': '\r',
+        'windows': '\r\n',
+    }
+
+    def __init__(self, newline_type):
+        key = newline_type.lower()
+        if key in self.NEWLINE_TYPES:
+            self.newline = self.NEWLINE_TYPES[key]
+        else:
+            self.newline = os.linesep
+
+
+def specified_newlines(newline, text):
+    '''
+    Return text with every line break replaced by newline.
+
+    Windows and old Mac line breaks are first normalised to a line feed, so
+    mixed input ends up with one consistent line ending.
+    '''
+    normalized = text.replace('\r\n', '\n').replace('\r', '\n')
+    # Nothing more to do when the requested ending is the line feed itself.
+    return normalized if newline == '\n' else normalized.replace('\n', newline)
@@ -0,0 +1,502 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL 3'
+__copyright__ = '2011, Leigh Parry <leighparry@blueyonder.co.uk>'
+__docformat__ = 'restructuredtext en'
+
+'''
+Transform OEB content into Textile formatted plain text
+'''
+import re
+
+from functools import partial
+
+from calibre.ebooks.htmlz.oeb2html import OEB2HTML
+from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace, rewrite_links
+from calibre.ebooks.oeb.stylizer import Stylizer
+from calibre.ebooks import unit_convert
+from calibre.ebooks.textile.unsmarten import unsmarten
+from polyglot.builtins import string_or_bytes
+
+
+class TextileMLizer(OEB2HTML):
+
+    MAX_EM = 10
+
+    def extract_content(self, oeb_book, opts):
+        '''
+        Convert the whole book to Textile and return it as one string.
+
+        @oeb_book: The book whose spine is converted.
+        @opts: The conversion options; stored on the instance as self.opts.
+
+        All per-conversion state kept on the instance is reset first. The
+        converted text is passed through unsmarten when
+        opts.unsmarten_punctuation is set, and always through tidy_up.
+        '''
+        self.log.info('Converting XHTML to Textile formatted TXT...')
+        self.opts = opts
+        # State that is consulted and updated while the tree is walked.
+        self.in_pre = self.in_table = False
+        self.links = {}
+        self.list = []
+        self.our_links = []
+        self.in_a_link = False
+        self.our_ids = []
+        self.images = {}
+        self.id_no_text = ''
+        self.style_embed = []
+        self.remove_space_after_newline = False
+        self.base_hrefs = [spine_item.href for spine_item in oeb_book.spine]
+        self.map_resources(oeb_book)
+
+        # Inline styles that are currently open.
+        self.style_bold = self.style_italic = False
+        self.style_under = self.style_strike = self.style_smallcap = False
+
+        converted = self.mlize_spine(oeb_book)
+        if self.opts.unsmarten_punctuation:
+            converted = unsmarten(converted)
+
+        # Do some tidying up
+        return self.tidy_up(converted)
+
+    def mlize_spine(self, oeb_book):
+        '''
+        Convert every spine item, in order, and return the joined text.
+
+        Ids and links of each item are rewritten before it is handed to
+        dump_text; two newlines are appended after every item.
+        '''
+        chunks = ['']
+        for spine_item in oeb_book.spine:
+            self.log.debug('Converting %s to Textile formatted TXT...' % spine_item.href)
+            self.rewrite_ids(spine_item.data, spine_item)
+            link_rewriter = partial(self.rewrite_link, page=spine_item)
+            rewrite_links(spine_item.data, link_rewriter)
+            stylizer = Stylizer(spine_item.data, spine_item.href, oeb_book, self.opts, self.opts.output_profile)
+            body = spine_item.data.find(XHTML('body'))
+            chunks.extend(self.dump_text(body, stylizer))
+            chunks.append('\n\n')
+        return ''.join(chunks)
+
+    def tidy_up(self, text):
+        '''
+        Post-process the raw Textile produced by mlize_spine.
+
+        @text: The concatenated output of dump_text for the whole spine.
+
+        When self.opts.keep_links is set, internal links whose target id was
+        never written and ids that no link points to are removed. After that
+        a fixed sequence of substitutions drops escaping and spans that are
+        not needed and normalises blank paragraphs and blank lines. The
+        order of the substitutions matters. Returns the cleaned text.
+        '''
+        # May need tweaking and finetuning
+        def check_escaping(text, tests):
+            for t in tests:
+                # I'm not checking for duplicated spans '%' as any that follow each other were being incorrectly merged
+                txt = '%s' % t
+                if txt != '%':
+                    text = re.sub(r'([^'+t+'|^\n])'+t+r'\]\['+t+'([^'+t+'])', r'\1\2', text)
+                    text = re.sub(r'([^'+t+'|^\n])'+t+t+'([^'+t+'])', r'\1\2', text)
+                text = re.sub(r'(\s|[*_\'"])\[('+t+'[a-zA-Z0-9 \'",.*_]+'+t+r')\](\s|[*_\'"?!,.])', r'\1\2\3', text)
+            return text
+
+        # Now tidy up links and ids - remove ones that don't have a corresponding opposite
+        if self.opts.keep_links:
+            # Sets keep the membership tests below cheap for large books.
+            known_ids = set(self.our_ids)
+            known_links = set(self.our_links)
+            for i in self.our_links:
+                # startswith() copes with an empty href, where i[0] raised
+                # an IndexError.
+                if i.startswith('#') and i not in known_ids:
+                    # The href is literal text, so it is escaped before it
+                    # becomes part of a pattern.
+                    text = re.sub(r'"(.+)":'+re.escape(i)+r'(\s)', r'\1\2', text)
+            for i in self.our_ids:
+                if i not in known_links:
+                    text = re.sub(r'%?\('+re.escape(i)+'\\)\xa0?%?', r'', text)
+
+        # Remove obvious non-needed escaping, add sub/sup-script ones
+        # NOTE(review): the asterisk pattern is listed twice; the second pass
+        # runs after the underscore pass, so it is kept as it was.
+        text = check_escaping(text, [r'\*', '_', r'\*'])
+        # escape the super/sub-scripts if needed
+        text = re.sub(r'(\w)([~^]\w+[~^])', r'\1[\2]', text)
+        # escape the super/sub-scripts if needed
+        text = re.sub(r'([~^]\w+[~^])(\w)', r'[\1]\2', text)
+
+        # remove empty spans
+        text = re.sub(r'%\xa0+', r'%', text)
+        # remove empty spans - MAY MERGE SOME ?
+        text = re.sub(r'%%', r'', text)
+        # remove spans from tagged output
+        text = re.sub(r'%([_+*-]+)%', r'\1', text)
+        # remove spaces before a newline
+        text = re.sub(r' +\n', r'\n', text)
+        # remove newlines at top of file
+        text = re.sub(r'^\n+', r'', text)
+        # correct blockcode paras
+        text = re.sub(r'\npre\.\n?\nbc\.', r'\nbc.', text)
+        # correct blockquote paras
+        text = re.sub(r'\nbq\.\n?\np.*?\. ', r'\nbq. ', text)
+
+        # reduce blank lines
+        text = re.sub(r'\n{3}', r'\n\np. \n\n', text)
+        text = re.sub(u'%\n(p[<>=]{1,2}\\.|p\\.)', r'%\n\n\1', text)
+        # Check span following blank para
+        text = re.sub(r'\n+ +%', r' %', text)
+        text = re.sub(u'p[<>=]{1,2}\\.\n\n?', r'', text)
+        # blank paragraph
+        text = re.sub(r'\n(p.*\.)\n', r'\n\1 \n\n', text)
+        # blank paragraph
+        text = re.sub(u'\n\xa0', r'\np. ', text)
+        # blank paragraph
+        text = re.sub(u'\np[<>=]{1,2}?\\. \xa0', r'\np. ', text)
+        text = re.sub(r'(^|\n)(p.*\. ?\n)(p.*\.)', r'\1\3', text)
+        text = re.sub(r'\n(p\. \n)(p.*\.|h.*\.)', r'\n\2', text)
+        # sort out spaces in tables
+        text = re.sub(r' {2,}\|', r' |', text)
+
+        # Now put back spaces removed earlier as they're needed here
+        text = re.sub(r'\np\.\n', r'\np. \n', text)
+        # reduce blank lines
+        text = re.sub(r' \n\n\n', r' \n\n', text)
+
+        return text
+
+    def remove_newlines(self, text):
+        '''
+        Collapse text onto a single line.
+
+        @text: The string to flatten.
+
+        Every line break becomes a space, runs of spaces are condensed to a
+        single space and tabs are dropped. When
+        self.remove_space_after_newline is set, leading spaces are stripped
+        as well and the flag is cleared. Returns the flattened string.
+        '''
+        # \r\n is listed first so that it is replaced as a unit.
+        text = re.sub(r'\r\n|\n|\r', ' ', text)
+        # Condense redundant spaces created by replacing newlines with spaces.
+        text = re.sub(r'[ ]{2,}', ' ', text)
+        text = re.sub(r'\t+', '', text)
+        if self.remove_space_after_newline:
+            text = text.lstrip(' ')
+            self.remove_space_after_newline = False
+        return text
+
+    def check_styles(self, style):
+        '''
+        Build the Textile style block for style.
+
+        Only when self.opts.keep_color is set, a color other than 'black'
+        and a background are written as 'name:value;' pairs inside braces.
+        Returns '' when there is nothing to write.
+        '''
+        declarations = []
+        if self.opts.keep_color:
+            if 'color' in style.cssdict() and style['color'] != 'black':
+                declarations.append('color:' + style['color'] + ';')
+            if 'background' in style.cssdict():
+                declarations.append('background:' + style['background'] + ';')
+        if not declarations:
+            return ''
+        return '{' + ''.join(declarations) + '}'
+
+    def check_halign(self, style):
+        '''
+        Return the Textile marker for the text-align value of style.
+
+        left, justify, center and right give '<', '<>', '=' and '>'; every
+        other value gives ''.
+        '''
+        markers = {'left': '<', 'justify': '<>', 'center': '=', 'right': '>'}
+        align = style['text-align']
+        return next((marker for name, marker in markers.items() if align == name), '')
+
+    def check_valign(self, style):
+        '''
+        Return the Textile marker for the vertical-align value of style.
+
+        top gives '^' and bottom gives '~'; every other value gives ''.
+        '''
+        # 'middle': '-' is not mapped.
+        markers = {'top': '^', 'bottom': '~'}
+        align = style['vertical-align']
+        return next((marker for name, marker in markers.items() if align == name), '')
+
+    def check_padding(self, style, stylizer):
+        '''
+        Translate horizontal padding and margin into Textile indent markers.
+
+        @style: The style of the element.
+        @stylizer: Supplies profile.dpi and profile.fbase.
+
+        For each side, padding and margin (missing or 'auto' values count
+        as 0) are converted with unit_convert and summed. The sum is divided
+        by profile.fbase, rounded and capped at MAX_EM. Returns that many
+        '(' for the left side followed by that many ')' for the right side.
+        '''
+        def ems_for(side):
+            # Padding is looked up before margin, for the left side first.
+            amounts = []
+            for prop in ('padding-' + side, 'margin-' + side):
+                amount = 0
+                if prop in style.cssdict() and style[prop] != 'auto':
+                    amount = unit_convert(style[prop], style.width, style.fontSize, stylizer.profile.dpi)
+                amounts.append(amount)
+            padding_pts, margin_pts = amounts
+            total = margin_pts + padding_pts
+            return min(int(round(total / stylizer.profile.fbase)), self.MAX_EM)
+
+        markers = []
+        for side, symbol in (('left', '('), ('right', ')')):
+            ems = ems_for(side)
+            if ems >= 1:
+                markers.append(symbol * ems)
+
+        return ''.join(markers)
+
+    def check_id_tag(self, attribs):
+        '''
+        Return the Textile id markup '(#id)' for attribs, or '' without an id.
+
+        When an id is present it is also recorded in self.our_ids (with the
+        leading '#') and self.id_no_text is set to a non-breaking space.
+        '''
+        if 'id' not in attribs:
+            return ''
+        anchor = '#' + attribs['id']
+        self.our_ids.append(anchor)
+        self.id_no_text = u'\xa0'
+        return '(' + anchor + ')'
+
+    def build_block(self, tag, style, attribs, stylizer):
+        '''
+        Build the opening of a Textile block for tag.
+
+        The result is a newline and the tag followed by the id markup (only
+        when self.opts.keep_links is set), the padding markers, the
+        horizontal alignment marker and the style block, in that order.
+        '''
+        parts = ['\n', tag]
+        if self.opts.keep_links:
+            parts.append(self.check_id_tag(attribs))
+        parts.append(self.check_padding(style, stylizer))
+        parts.append(self.check_halign(style))
+        parts.append(self.check_styles(style))
+        return ''.join(parts)
+
+    def prepare_string_for_textile(self, txt):
+        '''
+        Wrap txt in ' ==...== ' when it contains one of the characters
+        listed in the pattern below (or '??') with whitespace on exactly one
+        side; otherwise return txt unchanged.
+        '''
+        found = re.search(r'(\s([*&_+\-~@%|]|\?{2})\S)|(\S([*&_+\-~@%|]|\?{2})\s)', txt)
+        if found is None:
+            return txt
+        return ' ==%s== ' % txt
+
+    def dump_text(self, elem, stylizer):
+        '''
+        Convert elem and everything below it to Textile.
+
+        @elem: The element in the etree that we are working on.
+        @stylizer: The style information attached to the element.
+
+        Returns a list of strings. The list holds, in order, the opening
+        markup of the element, its text, the output of its children (handled
+        recursively), the closing markup and finally the tail text that
+        follows the element.
+        '''
+
+        # We can only process tags. If there isn't a tag return any text.
+        if not isinstance(elem.tag, string_or_bytes) \
+           or namespace(elem.tag) != XHTML_NS:
+            p = elem.getparent()
+            if p is not None and isinstance(p.tag, string_or_bytes) and namespace(p.tag) == XHTML_NS \
+                    and elem.tail:
+                return [elem.tail]
+            return ['']
+
+        # Setup our variables.
+        # text collects the output; tags collects the closing markers that
+        # are written (in reverse order) once the children are done.
+        text = ['']
+        style = stylizer.style(elem)
+        tags = []
+        tag = barename(elem.tag)
+        attribs = elem.attrib
+
+        # Ignore anything that is set to not be displayed.
+        # The tail is still returned because it lies outside the element.
+        if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
+           or style['visibility'] == 'hidden':
+            if hasattr(elem, 'tail') and elem.tail:
+                return [elem.tail]
+            return ['']
+
+        # Soft scene breaks.
+        if 'margin-top' in style.cssdict() and style['margin-top'] != 'auto':
+            ems = min(int(round(float(style.marginTop) / style.fontSize) - 1), self.MAX_EM)
+            if ems >= 1:
+                text.append(u'\n\n\xa0' * ems)
+
+        if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div'):
+            # A div is written as a paragraph.
+            if tag == 'div':
+                tag = 'p'
+            text.append(self.build_block(tag, style, attribs, stylizer))
+            text.append('. ')
+            tags.append('\n')
+
+        # Inline styles are only opened when they are not already open.
+        # Every opened style is also pushed on style_embed so that a br can
+        # close and reopen it around the line break.
+        if style['font-style'] == 'italic' or tag in ('i', 'em'):
+            if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'cite'):
+                if not self.style_italic:
+                    if self.in_a_link:
+                        text.append('_')
+                        tags.append('_')
+                    else:
+                        text.append('[_')
+                        tags.append('_]')
+                    self.style_embed.append('_')
+                    self.style_italic = True
+        if style['font-weight'] in ('bold', 'bolder') or tag in ('b', 'strong'):
+            if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'th'):
+                if not self.style_bold:
+                    if self.in_a_link:
+                        text.append('*')
+                        tags.append('*')
+                    else:
+                        text.append('[*')
+                        tags.append('*]')
+                    self.style_embed.append('*')
+                    self.style_bold = True
+        if style['text-decoration'] == 'underline' or tag in ('u', 'ins'):
+            if tag != 'a':
+                if not self.style_under:
+                    text.append('[+')
+                    tags.append('+]')
+                    self.style_embed.append('+')
+                    self.style_under = True
+        if style['text-decoration'] == 'line-through' or tag in ('strike', 'del', 's'):
+            if not self.style_strike:
+                text.append('[-')
+                tags.append('-]')
+                self.style_embed.append('-')
+                self.style_strike = True
+        if tag == 'br':
+            # Close the open styles, break the line, then reopen them.
+            text.extend(reversed(self.style_embed))
+            text.append('\n')
+            text.extend(self.style_embed)
+            tags.append('')
+            self.remove_space_after_newline = True
+        if tag == 'blockquote':
+            text.append('\nbq. ')
+            tags.append('\n')
+        elif tag in ('abbr', 'acronym'):
+            text.append('')
+            # A missing title used to raise a KeyError; now nothing is
+            # written after the abbreviation in that case.
+            title = attribs.get('title')
+            tags.append('(' + title + ')' if title is not None else '')
+        elif tag == 'sup':
+            text.append('^')
+            tags.append('^')
+        elif tag == 'sub':
+            text.append('~')
+            tags.append('~')
+        elif tag == 'code':
+            if self.in_pre:
+                text.append('\nbc. ')
+                tags.append('')
+            else:
+                text.append('@')
+                tags.append('@')
+        elif tag == 'cite':
+            text.append('??')
+            tags.append('??')
+        elif tag == 'hr':
+            text.append('\n***')
+            tags.append('\n')
+        elif tag == 'pre':
+            self.in_pre = True
+            text.append('\npre. ')
+            # The marker has to be exactly 'pre' for the close loop below to
+            # recognise it. It used to be 'pre\n', which was never matched:
+            # in_pre stayed set for the rest of the book and the word 'pre'
+            # was written into the output.
+            tags.append('pre')
+        elif tag == 'a':
+            if self.opts.keep_links:
+                if 'href' in attribs:
+                    text.append('"')
+                    # 'a' is only a marker that resets in_a_link on close.
+                    tags.append('a')
+                    tags.append('":' + attribs['href'])
+                    self.our_links.append(attribs['href'])
+                    if 'title' in attribs:
+                        tags.append('(' + attribs['title'] + ')')
+                    self.in_a_link = True
+                else:
+                    text.append('%')
+                    tags.append('%')
+        elif tag == 'img':
+            if self.opts.keep_image_references:
+                txt = '!' + self.check_halign(style)
+                txt += self.check_valign(style)
+                txt += attribs['src']
+                text.append(txt)
+                if 'alt' in attribs:
+                    txt = attribs['alt']
+                    if txt != '':
+                        text.append('(' + txt + ')')
+                tags.append('!')
+        elif tag in ('ol', 'ul'):
+            self.list.append({'name': tag, 'num': 0})
+            text.append('')
+            tags.append(tag)
+        elif tag == 'li':
+            if self.list:
+                li = self.list[-1]
+            else:
+                # An item outside any list is treated as an unordered item.
+                li = {'name': 'ul', 'num': 0}
+            text.append('\n')
+            # The nesting depth gives the number of list signs.
+            if li['name'] == 'ul':
+                text.append('*' * len(self.list) + ' ')
+            elif li['name'] == 'ol':
+                text.append('#' * len(self.list) + ' ')
+            tags.append('')
+        elif tag == 'dl':
+            text.append('\n')
+            tags.append('')
+        elif tag == 'dt':
+            text.append('')
+            tags.append('\n')
+        elif tag == 'dd':
+            # A second, unreachable 'dd' branch that followed this one has
+            # been removed; this branch is the one that always ran.
+            text.append('    ')
+            tags.append('')
+        elif tag == 'table':
+            txt = self.build_block(tag, style, attribs, stylizer)
+            txt += '. \n'
+            # A table without any attributes needs no 'table.' line.
+            if txt != '\ntable. \n':
+                text.append(txt)
+            else:
+                text.append('\n')
+            tags.append('')
+        elif tag == 'tr':
+            txt = self.build_block('', style, attribs, stylizer)
+            txt += '. '
+            if txt != '\n. ':
+                txt = re.sub('\n', '', txt)
+                text.append(txt)
+            tags.append('|\n')
+        elif tag == 'td':
+            text.append('|')
+            txt = ''
+            txt += self.check_halign(style)
+            txt += self.check_valign(style)
+            if 'colspan' in attribs:
+                txt += '\\' + attribs['colspan']
+            if 'rowspan' in attribs:
+                txt += '/' + attribs['rowspan']
+            txt += self.check_styles(style)
+            if txt != '':
+                text.append(txt + '. ')
+            tags.append('')
+        elif tag == 'th':
+            text.append('|_. ')
+            tags.append('')
+        elif tag == 'span':
+            if style['font-variant'] == 'small-caps':
+                if not self.style_smallcap:
+                    text.append('&')
+                    tags.append('&')
+                    self.style_smallcap = True
+            else:
+                if not self.in_a_link:
+                    txt = '%'
+                    if self.opts.keep_links:
+                        txt += self.check_id_tag(attribs)
+                        txt += self.check_styles(style)
+                    # A span that carries neither an id nor a style is
+                    # not written at all.
+                    if txt != '%':
+                        text.append(txt)
+                        tags.append('%')
+
+        # Ids of the tags listed here were already handled above.
+        if self.opts.keep_links and 'id' in attribs:
+            if tag not in ('body', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'span', 'table'):
+                text.append(self.check_id_tag(attribs))
+
+        # Process the styles for any that we want to keep
+        if tag not in ('body', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'hr', 'a', 'img',
+                'span', 'table', 'tr', 'td'):
+            if not self.in_a_link:
+                text.append(self.check_styles(style))
+
+        # Process tags that contain text.
+        if hasattr(elem, 'text') and elem.text:
+            txt = elem.text
+            if not self.in_pre:
+                txt = self.prepare_string_for_textile(self.remove_newlines(txt))
+            text.append(txt)
+            self.id_no_text = u''
+
+        # Recurse down into tags within the tag we are in.
+        for item in elem:
+            text += self.dump_text(item, stylizer)
+
+        # Close all open tags.
+        # Markers were pushed in opening order, so they are closed reversed.
+        tags.reverse()
+        for t in tags:
+            if t in ('pre', 'ul', 'ol', 'li', 'table'):
+                # These markers only update state; they are not written out.
+                if t == 'pre':
+                    self.in_pre = False
+                    # End the pre block with a line break.
+                    text.append('\n')
+                elif t in ('ul', 'ol'):
+                    if self.list:
+                        self.list.pop()
+                    # Only the outermost list is followed by a line break.
+                    if not self.list:
+                        text.append('\n')
+            else:
+                if t == 'a':
+                    self.in_a_link = False
+                    t = ''
+                # Write the pending placeholder for an id without text.
+                text.append(self.id_no_text)
+                self.id_no_text = u''
+                if t in ('*]', '*'):
+                    self.style_bold = False
+                elif t in ('_]', '_'):
+                    self.style_italic = False
+                elif t == '+]':
+                    self.style_under = False
+                elif t == '-]':
+                    self.style_strike = False
+                elif t == '&':
+                    self.style_smallcap = False
+                if t in ('*]', '_]', '+]', '-]', '*', '_'):
+                    # The style is closed, so it is dropped from the stack.
+                    self.style_embed.pop()
+                text.append('%s' % t)
+
+        # Soft scene breaks.
+        if 'margin-bottom' in style.cssdict() and style['margin-bottom'] != 'auto':
+            ems = min(int(round((float(style.marginBottom) / style.fontSize) - 1)), self.MAX_EM)
+            if ems >= 1:
+                text.append(u'\n\n\xa0' * ems)
+
+        # Add the text that is outside of the tag.
+        if hasattr(elem, 'tail') and elem.tail:
+            tail = elem.tail
+            if not self.in_pre:
+                tail = self.prepare_string_for_textile(self.remove_newlines(tail))
+            text.append(tail)
+
+        return text
@@ -0,0 +1,264 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+__license__ = 'GPL 3'
+__copyright__ = '2009, John Schember <john@nachtimwald.com>'
+__docformat__ = 'restructuredtext en'
+
+'''
+Transform OEB content into plain text
+'''
+
+import re
+
+from lxml import etree
+from polyglot.builtins import string_or_bytes
+
+
+# Tags that dump_text treats as paragraph-like blocks: their content is
+# followed by a blank line in the output.
+BLOCK_TAGS = ['div', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'tr']
+
+# CSS display values that make any other tag count as a block as well.
+BLOCK_STYLES = ['block']
+
+# Tags that dump_text always treats as headings.
+HEADING_TAGS = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
+
+# Tags for which dump_text emits a single space before their content.
+SPACE_TAGS = ['td', 'br']
+
+
+class TXTMLizer(object):
+    '''
+    Transform the spine of an OEB book into plain text.
+
+    Call extract_content to run the conversion; the remaining methods are
+    the individual steps it is built from.
+    '''
+
+    def __init__(self, log):
+        '''
+        @log: Logger used for the info and debug messages of the conversion.
+        '''
+        self.log = log
+
+    def extract_content(self, oeb_book, opts):
+        '''
+        Convert a whole book to a single plain text string.
+
+        @oeb_book: The OEB book to convert. Its toc and spine are read.
+        @opts: Conversion options. The attributes read by this class are
+        inline_toc, output_profile, remove_paragraph_spacing,
+        max_line_length and force_max_line_length.
+
+        Returns the converted and cleaned up text.
+        '''
+        self.log.info('Converting XHTML to TXT...')
+        self.oeb_book = oeb_book
+        self.opts = opts
+        self.toc_titles = []
+        self.toc_ids = []
+        # Tracks whether the previously dumped element was a heading so that
+        # consecutive headings are not separated by an extra gap.
+        self.last_was_heading = False
+
+        self.create_flat_toc(self.oeb_book.toc)
+
+        return self.mlize_spine()
+
+    def mlize_spine(self):
+        '''
+        Convert every spine item to text and join the results.
+
+        Returns the cleaned up text of the whole book, preceded by the
+        inline table of contents when that option is enabled.
+        '''
+        from calibre.ebooks.oeb.base import XHTML
+        from calibre.ebooks.oeb.stylizer import Stylizer
+        from calibre.utils.xml_parse import safe_xml_fromstring
+        output = [self.get_toc()]
+        for item in self.oeb_book.spine:
+            self.log.debug('Converting %s to TXT...' % item.href)
+            # '--' is not allowed inside an XML comment, so neutralise it
+            # before the tree is serialised and parsed again below.
+            for x in item.data.iterdescendants(etree.Comment):
+                if x.text and '--' in x.text:
+                    x.text = x.text.replace('--', '__')
+            content = etree.tostring(item.data, encoding='unicode')
+            content = self.remove_newlines(content)
+            content = safe_xml_fromstring(content)
+            stylizer = Stylizer(content, item.href, self.oeb_book, self.opts, self.opts.output_profile)
+            output.extend(self.dump_text(content.find(XHTML('body')), stylizer, item))
+            # Gap between spine items; cleanup_text trims excessive runs.
+            output.append('\n\n\n\n\n\n')
+        output = ''.join(output)
+        # Drop trailing whitespace from every line.
+        output = '\n'.join(line.rstrip() for line in output.splitlines())
+        output = self.cleanup_text(output)
+
+        return output
+
+    def remove_newlines(self, text):
+        '''
+        Replace every line break in text with a space.
+
+        @text: The string to process.
+
+        Returns the text on a single line, with the runs of spaces condensed
+        to one space.
+        '''
+        self.log.debug('\tRemove newlines for processing...')
+        text = text.replace('\r\n', ' ')
+        text = text.replace('\n', ' ')
+        text = text.replace('\r', ' ')
+        # Condense redundant spaces created by replacing newlines with spaces.
+        text = re.sub(r'[ ]{2,}', ' ', text)
+
+        return text
+
+    def get_toc(self):
+        '''
+        Build the inline table of contents.
+
+        Returns an empty string unless the inline_toc option is set, in which
+        case a heading line followed by one bullet per TOC title is returned.
+        '''
+        if not getattr(self.opts, 'inline_toc', None):
+            return ''
+        self.log.debug('Generating table of contents...')
+        toc = ['%s\n\n' % _('Table of Contents:')]
+        toc.extend('* %s\n\n' % title for title in self.toc_titles)
+        return ''.join(toc)
+
+    def create_flat_toc(self, nodes):
+        '''
+        Turns a hierarchical TOC into flat lists.
+
+        @nodes: Iterable of TOC nodes, each with title, href and nodes
+        (its children).
+
+        The titles are appended to self.toc_titles and the hrefs to
+        self.toc_ids, parents before their children.
+        '''
+        for item in nodes:
+            self.toc_titles.append(item.title)
+            self.toc_ids.append(item.href)
+            self.create_flat_toc(item.nodes)
+
+    def cleanup_text(self, text):
+        '''
+        Normalise the whitespace of the converted text and, when the
+        max_line_length option is set, wrap long lines.
+
+        @text: The text to clean up.
+
+        Returns the cleaned up text.
+        '''
+        self.log.debug('\tClean up text...')
+        # Replace bad characters.
+        text = text.replace(u'\xa0', ' ')
+
+        # Replace runs of vertical tabs and form feeds with a single space.
+        # Tabs are deliberately left alone: dump_text adds them to mark
+        # paragraphs when remove paragraph spacing is enabled and the
+        # expressions below depend on them.
+        text = re.sub(r'\v+', ' ', text)
+        text = re.sub(r'\f+', ' ', text)
+
+        # Single line paragraph.
+        text = re.sub('(?<=.)\n(?=.)', ' ', text)
+
+        # Remove multiple spaces.
+        text = re.sub('[ ]{2,}', ' ', text)
+
+        # Remove excessive newlines.
+        text = re.sub('\n[ ]+\n', '\n\n', text)
+        if self.opts.remove_paragraph_spacing:
+            text = re.sub('\n{2,}', '\n', text)
+            # Lines without a tab are the ones dump_text did not mark as
+            # paragraphs (headings); put a blank line after them ...
+            text = re.sub(r'(?msu)^(?P<t>[^\t\n]+?)$', lambda mo: u'%s\n\n' % mo.group('t'), text)
+            # ... and a large gap before them.
+            text = re.sub(r'(?msu)(?P<b>[^\n])\n+(?P<t>[^\t\n]+?)(?=\n)', lambda mo: '%s\n\n\n\n\n\n%s' % (mo.group('b'), mo.group('t')), text)
+        else:
+            text = re.sub('\n{7,}', '\n\n\n\n\n\n', text)
+
+        # Replace spaces at the beginning and end of lines
+        # We don't replace tabs because those are only added
+        # when remove paragraph spacing is enabled.
+        text = re.sub('(?imu)^[ ]+', '', text)
+        text = re.sub('(?imu)[ ]+$', '', text)
+
+        # Remove empty space and newlines at the beginning of the document.
+        text = re.sub(r'(?u)^[ \n]+', '', text)
+
+        if self.opts.max_line_length:
+            max_length = self.opts.max_line_length
+            # Lengths below 25 are only honoured when they are forced.
+            if max_length < 25 and not self.opts.force_max_line_length:
+                max_length = 25
+            short_lines = []
+            for line in text.splitlines():
+                short_lines.extend(self._wrap_line(line, max_length))
+            text = '\n'.join(short_lines)
+
+        return text
+
+    def _wrap_line(self, line, max_length):
+        '''
+        Split a single line into pieces of at most max_length characters.
+
+        @line: The line to wrap; it must not contain line breaks.
+        @max_length: The maximum length of a piece.
+
+        Returns the list of pieces. A piece can be longer than max_length
+        when it contains no space to break at and the force_max_line_length
+        option is not set.
+        '''
+        pieces = []
+        while len(line) > max_length:
+            # Prefer the last space that still fits into max_length.
+            space = line.rfind(' ', 0, max_length)
+            if space == -1:
+                if self.opts.force_max_line_length:
+                    # Force breaking at max_length.
+                    pieces.append(line[:max_length])
+                    line = line[max_length:]
+                    continue
+                # Look for the first space after max_length.
+                space = line.find(' ', max_length)
+                if space == -1:
+                    # No space was found, the rest cannot be broken. Return
+                    # here so that no empty piece is added after it.
+                    pieces.append(line)
+                    return pieces
+            pieces.append(line[:space])
+            line = line[space + 1:]
+        # Add the text that was less than max_length to the list.
+        pieces.append(line)
+        return pieces
+
+    def dump_text(self, elem, stylizer, page):
+        '''
+        Recursively convert an element and its children to text.
+
+        @elem: The element in the etree that we are working on.
+        @stylizer: The style information attached to the element.
+        @page: OEB page used to determine absolute urls.
+
+        Returns a list of strings which, joined together, are the text of
+        the element followed by its tail.
+        '''
+        from calibre.ebooks.oeb.base import XHTML_NS, barename, namespace
+
+        # Comments, processing instructions and elements from other
+        # namespaces contribute nothing but their tail text, and that only
+        # when they sit inside an XHTML element.
+        if not isinstance(elem.tag, string_or_bytes) \
+           or namespace(elem.tag) != XHTML_NS:
+            p = elem.getparent()
+            if p is not None and isinstance(p.tag, string_or_bytes) and namespace(p.tag) == XHTML_NS \
+                    and elem.tail:
+                return [elem.tail]
+            return ['']
+
+        text = ['']
+        style = stylizer.style(elem)
+
+        # Elements that are not displayed are skipped, their tail is kept.
+        if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
+           or style['visibility'] == 'hidden':
+            if hasattr(elem, 'tail') and elem.tail:
+                return [elem.tail]
+            return ['']
+
+        tag = barename(elem.tag)
+        tag_id = elem.attrib.get('id', None)
+        in_block = False
+        in_heading = False
+
+        # Are we in a heading?
+        # This can either be a heading tag or a TOC item.
+        if tag in HEADING_TAGS or '%s#%s' % (page.href, tag_id) in self.toc_ids:
+            in_heading = True
+            if not self.last_was_heading:
+                text.append('\n\n\n\n\n\n')
+
+        # Are we in a paragraph block?
+        if tag in BLOCK_TAGS or style['display'] in BLOCK_STYLES:
+            # The tab marks the line as a paragraph for cleanup_text.
+            if self.opts.remove_paragraph_spacing and not in_heading:
+                text.append('\t')
+            in_block = True
+
+        if tag in SPACE_TAGS:
+            text.append(' ')
+
+        # Hard scene breaks.
+        if tag == 'hr':
+            text.append('\n\n* * *\n\n')
+        # Soft scene breaks.
+        try:
+            ems = int(round((float(style.marginTop) / style.fontSize) - 1))
+            if ems >= 1:
+                text.append('\n' * ems)
+        except Exception:
+            # Best effort only: a margin that cannot be evaluated simply
+            # produces no scene break.
+            pass
+
+        # Process tags that contain text.
+        if hasattr(elem, 'text') and elem.text:
+            text.append(elem.text)
+
+        # Recurse down into tags within the tag we are in.
+        for item in elem:
+            text += self.dump_text(item, stylizer, page)
+
+        if in_block:
+            text.append('\n\n')
+        if in_heading:
+            text.append('\n')
+            self.last_was_heading = True
+        else:
+            self.last_was_heading = False
+
+        # Add the text that is outside of the tag.
+        if hasattr(elem, 'tail') and elem.tail:
+            text.append(elem.tail)
+
+        return text