mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-02-05 06:05:45 +01:00
Added txt related modules
This commit is contained in:
7
ebook_converter/ebooks/textile/__init__.py
Normal file
7
ebook_converter/ebooks/textile/__init__.py
Normal file
@@ -0,0 +1,7 @@
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
from .functions import textile, textile_restricted, Textile
|
||||
|
||||
# No-op expression that merely names the three imported objects; it never
# executes. Presumably present so static checkers do not flag the import
# above as unused -- TODO confirm.
if False:
    textile, textile_restricted, Textile

# Names exported by a star-import of this package.
# NOTE(review): Textile is imported above but is not listed here -- confirm
# whether that omission is intentional.
__all__ = ['textile', 'textile_restricted']
|
||||
1091
ebook_converter/ebooks/textile/functions.py
Normal file
1091
ebook_converter/ebooks/textile/functions.py
Normal file
File diff suppressed because it is too large
Load Diff
129
ebook_converter/ebooks/textile/unsmarten.py
Normal file
129
ebook_converter/ebooks/textile/unsmarten.py
Normal file
@@ -0,0 +1,129 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2011, Leigh Parry <leighparry@blueyonder.co.uk>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import re
|
||||
|
||||
|
||||
# (character, replacement token) pairs used by unsmarten(). Each token is the
# brace-delimited ASCII-style spelling that replaces the character.
# NOTE(review): the cent token '{c\}' is kept exactly as the original code
# emitted it -- confirm it matches what the Textile parser expects.
_UNSMARTEN_GLYPHS = (
    ('¢', '{c\\}'),      # cent
    ('£', '{L-}'),       # pound
    ('¥', '{Y=}'),       # yen
    ('©', '{(c)}'),      # copyright
    ('®', '{(r)}'),      # registered
    ('¼', '{1/4}'),      # quarter
    ('½', '{1/2}'),      # half
    ('¾', '{3/4}'),      # three-quarter
    ('À', '{A`}'),       # A-grave
    ('Á', "{A'}"),       # A-acute
    ('Â', '{A^}'),       # A-circumflex
    ('Ã', '{A~}'),       # A-tilde
    ('Ä', '{A"}'),       # A-umlaut
    ('Å', '{Ao}'),       # A-ring
    ('Æ', '{AE}'),       # AE
    ('Ç', '{C,}'),       # C-cedilla
    ('È', '{E`}'),       # E-grave
    ('É', "{E'}"),       # E-acute
    ('Ê', '{E^}'),       # E-circumflex
    ('Ë', '{E"}'),       # E-umlaut
    ('Ì', '{I`}'),       # I-grave
    ('Í', "{I'}"),       # I-acute
    ('Î', '{I^}'),       # I-circumflex
    ('Ï', '{I"}'),       # I-umlaut
    ('Ð', '{D-}'),       # ETH
    ('Ñ', '{N~}'),       # N-tilde
    ('Ò', '{O`}'),       # O-grave
    ('Ó', "{O'}"),       # O-acute
    ('Ô', '{O^}'),       # O-circumflex
    ('Õ', '{O~}'),       # O-tilde
    ('Ö', '{O"}'),       # O-umlaut
    ('×', '{x}'),        # dimension
    ('Ø', '{O/}'),       # O-slash
    ('Ù', '{U`}'),       # U-grave
    ('Ú', "{U'}"),       # U-acute
    ('Û', '{U^}'),       # U-circumflex
    ('Ü', '{U"}'),       # U-umlaut
    ('Ý', "{Y'}"),       # Y-acute
    ('ß', '{sz}'),       # sharp-s
    ('à', '{a`}'),       # a-grave
    ('á', "{a'}"),       # a-acute
    ('â', '{a^}'),       # a-circumflex
    ('ã', '{a~}'),       # a-tilde
    ('ä', '{a"}'),       # a-umlaut
    ('å', '{ao}'),       # a-ring
    ('æ', '{ae}'),       # ae
    ('ç', '{c,}'),       # c-cedilla
    ('è', '{e`}'),       # e-grave
    ('é', "{e'}"),       # e-acute
    ('ê', '{e^}'),       # e-circumflex
    ('ë', '{e"}'),       # e-umlaut
    ('ì', '{i`}'),       # i-grave
    ('í', "{i'}"),       # i-acute
    ('î', '{i^}'),       # i-circumflex
    ('ï', '{i"}'),       # i-umlaut
    ('ð', '{d-}'),       # eth
    ('ñ', '{n~}'),       # n-tilde
    ('ò', '{o`}'),       # o-grave
    ('ó', "{o'}"),       # o-acute
    ('ô', '{o^}'),       # o-circumflex
    ('õ', '{o~}'),       # o-tilde
    ('ö', '{o"}'),       # o-umlaut
    ('ø', '{o/}'),       # o-stroke
    ('ù', '{u`}'),       # u-grave
    ('ú', "{u'}"),       # u-acute
    ('û', '{u^}'),       # u-circumflex
    ('ü', '{u"}'),       # u-umlaut
    ('ý', "{y'}"),       # y-acute
    ('ÿ', '{y"}'),       # y-umlaut

    ('Č', '{Cˇ}'),       # C-caron
    ('č', '{cˇ}'),       # c-caron
    ('Ď', '{Dˇ}'),       # D-caron
    ('ď', '{dˇ}'),       # d-caron
    ('Ě', '{Eˇ}'),       # E-caron
    ('ě', '{eˇ}'),       # e-caron
    ('Ĺ', "{L'}"),       # L-acute
    ('ĺ', "{l'}"),       # l-acute
    ('Ľ', '{Lˇ}'),       # L-caron
    ('ľ', '{lˇ}'),       # l-caron
    ('Ň', '{Nˇ}'),       # N-caron
    ('ň', '{nˇ}'),       # n-caron

    ('Œ', '{OE}'),       # OE
    ('œ', '{oe}'),       # oe

    ('Ŕ', "{R'}"),       # R-acute
    ('ŕ', "{r'}"),       # r-acute
    ('Ř', '{Rˇ}'),       # R-caron
    ('ř', '{rˇ}'),       # r-caron
    ('Ŝ', '{S^}'),       # S-circumflex
    ('ŝ', '{s^}'),       # s-circumflex
    ('Š', '{Sˇ}'),       # S-caron
    ('š', '{sˇ}'),       # s-caron
    ('Ť', '{Tˇ}'),       # T-caron
    ('ť', '{tˇ}'),       # t-caron
    ('Ů', '{U°}'),       # U-ring
    ('ů', '{u°}'),       # u-ring
    ('Ž', '{Zˇ}'),       # Z-caron
    ('ž', '{zˇ}'),       # z-caron

    ('•', '{*}'),        # bullet
    ('₣', '{Fr}'),       # Franc
    ('₤', '{L=}'),       # Lira
    ('₨', '{Rs}'),       # Rupee
    ('€', '{C=}'),       # euro
    ('™', '{tm}'),       # trademark
    ('♠', '{spade}'),    # spade
    ('♣', '{club}'),     # club
    ('♥', '{heart}'),    # heart
    ('♦', '{diamond}'),  # diamond
)

# Code point -> replacement token, in the form str.translate() expects.
_UNSMARTEN_TABLE = {ord(char): token for char, token in _UNSMARTEN_GLYPHS}


def unsmarten(txt):
    """Replace accented letters and special symbols with brace tokens.

    Every character listed in ``_UNSMARTEN_GLYPHS`` is replaced by its
    token (for example ``é`` becomes ``{e'}``); all other characters are
    left untouched.

    The original implementation ran one ``re.sub`` per character, each with
    the same character repeated in an alternation (``X|X|X``). Because no
    replacement token contains a character that is itself replaced, a
    single translate pass yields the same text.

    The A-grave token is ``{A`}``; the previous ``{A`)}`` carried a stray
    ``)`` that no other grave token has.

    :param txt: text to convert.
    :return: the converted text.
    """
    # Move into main code?
    # txt = re.sub(u'\xa0', r'p. ', txt)  # blank paragraph
    # txt = re.sub(u'\n\n\n\n', r'\n\np. \n\n', txt)  # blank paragraph
    # txt = re.sub(u'\n \n', r'\n<br />\n', txt)  # blank paragraph - br tag
    return txt.translate(_UNSMARTEN_TABLE)
|
||||
286
ebook_converter/ebooks/txt/markdownml.py
Normal file
286
ebook_converter/ebooks/txt/markdownml.py
Normal file
@@ -0,0 +1,286 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '''2011, John Schember <john@nachtimwald.com>
|
||||
2011, Leigh Parry <leighparry@blueyonder.co.uk>'''
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
'''
|
||||
Transform OEB content into Markdown formatted plain text
|
||||
'''
|
||||
import re
|
||||
|
||||
from functools import partial
|
||||
|
||||
from calibre.ebooks.htmlz.oeb2html import OEB2HTML
|
||||
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace, rewrite_links
|
||||
from calibre.ebooks.oeb.stylizer import Stylizer
|
||||
from polyglot.builtins import unicode_type, string_or_bytes
|
||||
|
||||
|
||||
class MarkdownMLizer(OEB2HTML):
|
||||
|
||||
def extract_content(self, oeb_book, opts):
    '''
    Convert the whole book to Markdown text.

    Resets the per-conversion state on the instance, converts the spine
    with mlize_spine() and returns the result of tidy_up() on that text.

    @oeb_book: The book to convert; its spine items provide the hrefs.
    @opts: The conversion options, stored on the instance as self.opts.
    '''
    self.log.info('Converting XHTML to Markdown formatted TXT...')
    self.opts = opts

    # Fresh parser state for this run.
    self.in_code = False
    self.in_pre = False
    self.list = []
    self.blockquotes = 0
    self.remove_space_after_newline = False
    self.base_hrefs = [spine_item.href for spine_item in oeb_book.spine]
    self.map_resources(oeb_book)

    # Inline style flags start switched off.
    self.style_bold = False
    self.style_italic = False

    raw_markdown = self.mlize_spine(oeb_book)

    # Do some tidying up
    return self.tidy_up(raw_markdown)
|
||||
|
||||
def mlize_spine(self, oeb_book):
    '''
    Convert every spine item to Markdown and return the joined text.

    For each item the ids and links are rewritten in place, a Stylizer is
    built for it, and the item's XHTML body is passed to dump_text(). Two
    newlines are appended after every item.

    @oeb_book: The book whose spine is converted.
    '''
    chunks = ['']
    for spine_item in oeb_book.spine:
        self.log.debug('Converting %s to Markdown formatted TXT...' % spine_item.href)
        self.rewrite_ids(spine_item.data, spine_item)
        link_rewriter = partial(self.rewrite_link, page=spine_item)
        rewrite_links(spine_item.data, link_rewriter)
        stylizer = Stylizer(spine_item.data, spine_item.href, oeb_book,
                            self.opts, self.opts.output_profile)
        body = spine_item.data.find(XHTML('body'))
        chunks.extend(self.dump_text(body, stylizer))
        chunks.append('\n\n')
    return ''.join(chunks)
|
||||
|
||||
def tidy_up(self, text):
    '''
    Normalise whitespace in the generated Markdown.

    Strips up to three leading spaces per line, restores the four space
    indent on lines that still start with a space (pre blocks), drops
    tabs that are not at the start of a line, empties space-only lines,
    caps runs of newlines at six, and makes the text start without
    leading whitespace and end with exactly two newlines.

    @text: The Markdown text to clean.
    '''
    # Remove blank space from beginning of paragraph.
    text = re.sub('(?msu)^[ ]{1,3}', '', text)
    # pre has 4 spaces. We trimmed 3 so anything with a space left is a pre.
    # Put the full four space indent back (replacing the space with a single
    # space, as before, left pre lines without their code-block indent).
    text = re.sub('(?msu)^[ ]', '    ', text)

    # Remove tabs that aren't at the beginning of a line
    cleaned = []
    for line in text.splitlines():
        leading = re.match('\t+', line)
        prefix = leading.group() if leading else ''
        cleaned.append(prefix + line.replace('\t', ''))
    text = '\n'.join(cleaned)

    # Remove spaces from blank lines.
    text = re.sub('(?msu)^[ ]+$', '', text)

    # Reduce blank lines
    text = re.sub('(?msu)\n{7,}', '\n' * 6, text)

    # Remove blank lines at beginning and end of document.
    text = re.sub(r'^\s*', '', text)
    # count=1: since Python 3.7 an empty match directly after the trailing
    # whitespace match is also replaced, which appended the terminator twice.
    text = re.sub(r'\s*$', '\n\n', text, count=1)

    return text
|
||||
|
||||
def remove_newlines(self, text):
    '''
    Flatten text onto one line.

    Newlines (\\r\\n, \\n, \\r) become spaces, runs of spaces collapse to
    one, and tabs are removed. When self.remove_space_after_newline is
    set, leading spaces are stripped as well and the flag is cleared.

    @text: The text to flatten.
    '''
    text = text.replace('\r\n', ' ').replace('\n', ' ').replace('\r', ' ')
    # Condense redundant spaces created by replacing newlines with spaces.
    text = re.sub(r'[ ]{2,}', ' ', text)
    text = re.sub(r'\t+', '', text)
    # The flag is only ever assigned True/False, so test it directly
    # instead of comparing with == True.
    if self.remove_space_after_newline:
        text = text.lstrip(' ')
        self.remove_space_after_newline = False
    return text
|
||||
|
||||
def prepare_string_for_markdown(self, txt):
    '''
    Put a backslash in front of each of the characters
    \\ ` * _ { } [ ] ( ) # + ! found in txt and return the result.

    @txt: The text to escape.
    '''
    special_chars = re.compile(r'([\\`*_{}\[\]()#+!])')
    return special_chars.sub(lambda hit: '\\' + hit.group(1), txt)
|
||||
|
||||
def prepare_string_for_pre(self, txt):
    '''
    Indent every line of txt by four spaces and return the lines joined
    with newlines.

    Four spaces is the indent Markdown uses for code blocks; a single
    space, as emitted before, falls into the 1-3 leading spaces that
    tidy_up() strips from every line.

    @txt: The preformatted text.
    '''
    return '\n'.join('    ' + line for line in txt.splitlines())
|
||||
|
||||
def dump_text(self, elem, stylizer):
    '''
    Convert one element and everything below it to Markdown fragments.

    Returns a list of strings: the opening markup for the element, its
    text, the output of its children (by recursion), the closing markup
    and finally the element's tail text.

    @elem: The element in the etree that we are working on.
    @stylizer: The style information attached to the element.
    '''

    # We can only process tags. If there isn't a tag return any text.
    if not isinstance(elem.tag, string_or_bytes) \
            or namespace(elem.tag) != XHTML_NS:
        p = elem.getparent()
        if p is not None and isinstance(p.tag, string_or_bytes) and namespace(p.tag) == XHTML_NS \
                and elem.tail:
            return [elem.tail]
        return ['']

    # Setup our variables.
    text = []
    style = stylizer.style(elem)
    tags = []  # closing markers, emitted in reverse order further down
    tag = barename(elem.tag)
    attribs = elem.attrib

    # Ignore anything that is set to not be displayed.
    if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
            or style['visibility'] == 'hidden':
        if hasattr(elem, 'tail') and elem.tail:
            return [elem.tail]
        return ['']

    # Soft scene breaks.
    if 'margin-top' in style.cssdict() and style['margin-top'] != 'auto':
        ems = int(round(float(style.marginTop) / style.fontSize) - 1)
        if ems >= 1:
            text.append(u'\n\n' * ems)

    bq = '> ' * self.blockquotes
    # Block level elements
    if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div'):
        h_tag = ''
        if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
            h_tag = '#' * int(tag[1]) + ' '
        text.append('\n' + bq + h_tag)
        tags.append('\n')
        self.remove_space_after_newline = True

    if style['font-style'] == 'italic' or tag in ('i', 'em'):
        if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'cite'):
            # Only open emphasis once; nested italics reuse the open one.
            if not self.style_italic:
                text.append('*')
                tags.append('*')
                self.style_italic = True
    if style['font-weight'] in ('bold', 'bolder') or tag in ('b', 'strong'):
        if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'th'):
            if not self.style_bold:
                text.append('**')
                tags.append('**')
                self.style_bold = True
    if tag == 'br':
        # A Markdown hard line break needs two trailing spaces before the
        # newline; a single space renders as an ordinary soft wrap.
        text.append('  \n')
        self.remove_space_after_newline = True
    if tag == 'blockquote':
        self.blockquotes += 1
        tags.append('>')
        text.append('> ' * self.blockquotes)
    elif tag == 'code':
        if not self.in_pre and not self.in_code:
            text.append('`')
            tags.append('`')
            self.in_code = True
    elif tag == 'pre':
        if not self.in_pre:
            text.append('\n')
            tags.append('pre')
            self.in_pre = True
    elif tag == 'hr':
        text.append('\n* * *')
        tags.append('\n')
    elif tag == 'a':
        # Only write links with absolute (external) urls.
        if self.opts.keep_links and 'href' in attribs and '://' in attribs['href']:
            title = ''
            if 'title' in attribs:
                title = ' "' + attribs['title'] + '"'
                # remove_newlines() clears the flag; keep its current value.
                remove_space = self.remove_space_after_newline
                title = self.remove_newlines(title)
                self.remove_space_after_newline = remove_space
            text.append('[')
            tags.append('](' + attribs['href'] + title + ')')
    elif tag == 'img':
        if self.opts.keep_image_references:
            txt = '!'
            if 'alt' in attribs:
                remove_space = self.remove_space_after_newline
                txt += '[' + self.remove_newlines(attribs['alt']) + ']'
                self.remove_space_after_newline = remove_space
            # An <img> without src must not abort the conversion.
            txt += '(' + attribs.get('src', '') + ')'
            text.append(txt)
    elif tag in ('ol', 'ul'):
        tags.append(tag)
        # Add the list to our lists of lists so we can track
        # nested lists.
        self.list.append({'name': tag, 'num': 0})
    elif tag == 'li':
        # Get the last list from our list of lists
        if self.list:
            li = self.list[-1]
        else:
            li = {'name': 'ul', 'num': 0}
        # Add a new line to start the item
        text.append('\n')
        # Add indent if we have nested lists.
        list_count = len(self.list)
        # We only care about indenting nested lists.
        if (list_count - 1) > 0:
            text.append('\t' * (list_count - 1))
        # Add blockquote if we have a blockquote in a list item.
        text.append(bq)
        # Write the proper sign for ordered and unordered lists.
        if li['name'] == 'ul':
            text.append('+ ')
        elif li['name'] == 'ol':
            li['num'] += 1
            text.append(unicode_type(li['num']) + '. ')

    # Process tags that contain text.
    if hasattr(elem, 'text') and elem.text:
        txt = elem.text
        if self.in_pre:
            txt = self.prepare_string_for_pre(txt)
        elif self.in_code:
            txt = self.remove_newlines(txt)
        else:
            txt = self.prepare_string_for_markdown(self.remove_newlines(txt))
        text.append(txt)

    # Recurse down into tags within the tag we are in.
    for item in elem:
        text += self.dump_text(item, stylizer)

    # Close all open tags.
    tags.reverse()
    for t in tags:
        if t in ('pre', 'ul', 'ol', '>'):
            # State-only markers: update the state, emit at most a newline.
            if t == 'pre':
                self.in_pre = False
                text.append('\n')
            elif t == '>':
                self.blockquotes -= 1
            elif t in ('ul', 'ol'):
                if self.list:
                    self.list.pop()
                text.append('\n')
        else:
            if t == '**':
                self.style_bold = False
            elif t == '*':
                self.style_italic = False
            elif t == '`':
                self.in_code = False
            text.append('%s' % t)

    # Soft scene breaks.
    if 'margin-bottom' in style.cssdict() and style['margin-bottom'] != 'auto':
        ems = int(round((float(style.marginBottom) / style.fontSize) - 1))
        if ems >= 1:
            text.append(u'\n\n' * ems)

    # Add the text that is outside of the tag.
    if hasattr(elem, 'tail') and elem.tail:
        tail = elem.tail
        if self.in_pre:
            tail = self.prepare_string_for_pre(tail)
        elif self.in_code:
            tail = self.remove_newlines(tail)
        else:
            tail = self.prepare_string_for_markdown(self.remove_newlines(tail))
        text.append(tail)

    return text
|
||||
32
ebook_converter/ebooks/txt/newlines.py
Normal file
32
ebook_converter/ebooks/txt/newlines.py
Normal file
@@ -0,0 +1,32 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os
|
||||
|
||||
|
||||
class TxtNewlines(object):
    """Resolve a newline type name to the newline string it stands for.

    The chosen string is stored in ``self.newline``. The name is matched
    case-insensitively against ``NEWLINE_TYPES``; an unknown name falls
    back to ``os.linesep``.
    """

    # Accepted type names and the line terminator each one selects.
    NEWLINE_TYPES = {
        'system': os.linesep,
        'unix': '\n',
        'old_mac': '\r',
        'windows': '\r\n',
    }

    def __init__(self, newline_type):
        wanted = newline_type.lower()
        if wanted in self.NEWLINE_TYPES:
            self.newline = self.NEWLINE_TYPES[wanted]
        else:
            self.newline = os.linesep
|
||||
|
||||
|
||||
def specified_newlines(newline, text):
    """Return *text* with every line ending replaced by *newline*.

    All ``\\r\\n`` and ``\\r`` endings are first normalised to ``\\n``; when
    *newline* is ``\\n`` that normalised text is returned as is.
    """
    # Convert all newlines to \n
    normalised = text.replace('\r\n', '\n').replace('\r', '\n')
    if newline != '\n':
        return newline.join(normalised.split('\n'))
    return normalised
|
||||
502
ebook_converter/ebooks/txt/textileml.py
Normal file
502
ebook_converter/ebooks/txt/textileml.py
Normal file
@@ -0,0 +1,502 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2011, Leigh Parry <leighparry@blueyonder.co.uk>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
'''
|
||||
Transform OEB content into Textile formatted plain text
|
||||
'''
|
||||
import re
|
||||
|
||||
from functools import partial
|
||||
|
||||
from calibre.ebooks.htmlz.oeb2html import OEB2HTML
|
||||
from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace, rewrite_links
|
||||
from calibre.ebooks.oeb.stylizer import Stylizer
|
||||
from calibre.ebooks import unit_convert
|
||||
from calibre.ebooks.textile.unsmarten import unsmarten
|
||||
from polyglot.builtins import string_or_bytes
|
||||
|
||||
|
||||
class TextileMLizer(OEB2HTML):
|
||||
|
||||
MAX_EM = 10
|
||||
|
||||
def extract_content(self, oeb_book, opts):
    '''
    Convert the whole book to Textile text.

    Resets the per-conversion state on the instance, converts the spine
    with mlize_spine(), passes the text through unsmarten() when
    opts.unsmarten_punctuation is set, and returns the result of
    tidy_up() on it.

    @oeb_book: The book to convert; its spine items provide the hrefs.
    @opts: The conversion options, stored on the instance as self.opts.
    '''
    self.log.info('Converting XHTML to Textile formatted TXT...')
    self.opts = opts

    # Fresh parser state for this run.
    self.in_pre = False
    self.in_table = False
    self.links = {}
    self.list = []
    self.our_links = []
    self.in_a_link = False
    self.our_ids = []
    self.images = {}
    self.id_no_text = ''
    self.style_embed = []
    self.remove_space_after_newline = False
    self.base_hrefs = [spine_item.href for spine_item in oeb_book.spine]
    self.map_resources(oeb_book)

    # Inline style flags start switched off.
    for flag in ('style_bold', 'style_italic', 'style_under',
                 'style_strike', 'style_smallcap'):
        setattr(self, flag, False)

    converted = self.mlize_spine(oeb_book)
    if self.opts.unsmarten_punctuation:
        converted = unsmarten(converted)

    # Do some tidying up
    return self.tidy_up(converted)
|
||||
|
||||
def mlize_spine(self, oeb_book):
    '''
    Convert every spine item to Textile and return the joined text.

    For each item the ids and links are rewritten in place, a Stylizer is
    built for it, and the item's XHTML body is passed to dump_text(). Two
    newlines are appended after every item.

    @oeb_book: The book whose spine is converted.
    '''
    chunks = ['']
    for spine_item in oeb_book.spine:
        self.log.debug('Converting %s to Textile formatted TXT...' % spine_item.href)
        self.rewrite_ids(spine_item.data, spine_item)
        link_rewriter = partial(self.rewrite_link, page=spine_item)
        rewrite_links(spine_item.data, link_rewriter)
        stylizer = Stylizer(spine_item.data, spine_item.href, oeb_book,
                            self.opts, self.opts.output_profile)
        body = spine_item.data.find(XHTML('body'))
        chunks.extend(self.dump_text(body, stylizer))
        chunks.append('\n\n')
    return ''.join(chunks)
|
||||
|
||||
def tidy_up(self, text):
    '''
    Clean up the generated Textile text.

    When links are kept, internal links without a matching id and ids
    without a matching link are removed first. A fixed sequence of
    regular expression substitutions is then applied; their order
    matters, as later ones rely on the output of earlier ones.

    Reads self.opts.keep_links and, when it is set, self.our_links and
    self.our_ids.

    @text: The Textile text to clean.
    '''
    # May need tweaking and finetuning
    def check_escaping(text, tests):
        for t in tests:
            # I'm not checking for duplicated spans '%' as any that follow each other were being incorrectly merged
            txt = '%s' % t
            if txt != '%':
                text = re.sub(r'([^'+t+'|^\n])'+t+r'\]\['+t+'([^'+t+'])', r'\1\2', text)
                text = re.sub(r'([^'+t+'|^\n])'+t+t+'([^'+t+'])', r'\1\2', text)
            text = re.sub(r'(\s|[*_\'"])\[('+t+'[a-zA-Z0-9 \'",.*_]+'+t+r')\](\s|[*_\'"?!,.])', r'\1\2\3', text)
        return text

    # Now tidyup links and ids - remove ones that don't have a corresponding opposite
    if self.opts.keep_links:
        # Sets make the membership tests below constant time.
        known_ids = set(self.our_ids)
        known_links = set(self.our_links)
        for link in self.our_links:
            # startswith() also copes with an empty href, where link[0]
            # raised IndexError.
            if link.startswith('#') and link not in known_ids:
                # re.escape: the href is literal text, not a pattern.
                text = re.sub(r'"(.+)":' + re.escape(link) + r'(\s)', r'\1\2', text)
        for anchor in self.our_ids:
            if anchor not in known_links:
                text = re.sub(r'%?\(' + re.escape(anchor) + '\\)\xa0?%?', r'', text)

    # Remove obvious non-needed escaping, add sub/sup-script ones
    text = check_escaping(text, [r'\*', '_', r'\*'])
    # escape the super/sub-scripts if needed
    text = re.sub(r'(\w)([~^]\w+[~^])', r'\1[\2]', text)
    # escape the super/sub-scripts if needed
    text = re.sub(r'([~^]\w+[~^])(\w)', r'[\1]\2', text)

    # remove empty spans
    text = re.sub(r'%\xa0+', r'%', text)
    # remove empty spans - MAY MERGE SOME ?
    text = re.sub(r'%%', r'', text)
    # remove spans from tagged output
    text = re.sub(r'%([_+*-]+)%', r'\1', text)
    # remove spaces before a newline
    text = re.sub(r' +\n', r'\n', text)
    # remove newlines at top of file
    text = re.sub(r'^\n+', r'', text)
    # correct blockcode paras
    text = re.sub(r'\npre\.\n?\nbc\.', r'\nbc.', text)
    # correct blockquote paras
    text = re.sub(r'\nbq\.\n?\np.*?\. ', r'\nbq. ', text)

    # reduce blank lines
    text = re.sub(r'\n{3}', r'\n\np. \n\n', text)
    text = re.sub(u'%\n(p[<>=]{1,2}\\.|p\\.)', r'%\n\n\1', text)
    # Check span following blank para
    text = re.sub(r'\n+ +%', r' %', text)
    text = re.sub(u'p[<>=]{1,2}\\.\n\n?', r'', text)
    # blank paragraph
    text = re.sub(r'\n(p.*\.)\n', r'\n\1 \n\n', text)
    # blank paragraph
    text = re.sub(u'\n\xa0', r'\np. ', text)
    # blank paragraph
    text = re.sub(u'\np[<>=]{1,2}?\\. \xa0', r'\np. ', text)
    text = re.sub(r'(^|\n)(p.*\. ?\n)(p.*\.)', r'\1\3', text)
    text = re.sub(r'\n(p\. \n)(p.*\.|h.*\.)', r'\n\2', text)
    # sort out spaces in tables
    text = re.sub(r' {2,}\|', r' |', text)

    # Now put back spaces removed earlier as they're needed here
    text = re.sub(r'\np\.\n', r'\np. \n', text)
    # reduce blank lines
    text = re.sub(r' \n\n\n', r' \n\n', text)

    return text
|
||||
|
||||
def remove_newlines(self, text):
    '''
    Flatten text onto one line.

    Newlines (\\r\\n, \\n, \\r) become spaces, runs of spaces collapse to
    one, and tabs are removed. When self.remove_space_after_newline is
    set, leading spaces are stripped as well and the flag is cleared.

    @text: The text to flatten.
    '''
    text = text.replace('\r\n', ' ').replace('\n', ' ').replace('\r', ' ')
    # Condense redundant spaces created by replacing newlines with spaces.
    text = re.sub(r'[ ]{2,}', ' ', text)
    text = re.sub(r'\t+', '', text)
    # The flag is only ever assigned True/False, so test it directly
    # instead of comparing with == True.
    if self.remove_space_after_newline:
        text = text.lstrip(' ')
        self.remove_space_after_newline = False
    return text
|
||||
|
||||
def check_styles(self, style):
    '''
    Build the Textile {...} style attribute for an element.

    Colour information is only emitted when self.opts.keep_color is set:
    a color other than 'black' and any background present in the
    element's css. Returns '' when nothing is emitted.

    @style: The style of the element; needs cssdict() and item access.
    '''
    declarations = []
    if self.opts.keep_color:
        if 'color' in style.cssdict() and style['color'] != 'black':
            declarations.append('color:' + style['color'] + ';')
        if 'background' in style.cssdict():
            declarations.append('background:' + style['background'] + ';')
    if not declarations:
        return ''
    return '{' + ''.join(declarations) + '}'
|
||||
|
||||
def check_halign(self, style):
    '''
    Return the Textile alignment marker for the element's text-align
    value ('<', '<>', '=' or '>'), or '' for any other value.

    @style: The style of the element; read through style['text-align'].
    '''
    markers = {
        'left': '<',
        'justify': '<>',
        'center': '=',
        'right': '>',
    }
    return markers.get(style['text-align'], '')
|
||||
|
||||
def check_valign(self, style):
    '''
    Return the Textile marker for the element's vertical-align value:
    '^' for 'top', '~' for 'bottom', '' for anything else.

    @style: The style of the element; read through style['vertical-align'].
    '''
    markers = {'top': '^', 'bottom': '~'}  # , 'middle':'-'}
    return markers.get(style['vertical-align'], '')
|
||||
|
||||
def check_padding(self, style, stylizer):
    '''
    Build the Textile indent markers for an element.

    For each side the padding and margin (when declared and not 'auto')
    are converted with unit_convert(), added together, divided by the
    profile's fbase, rounded and capped at MAX_EM. The result is that
    many '(' for the left side followed by that many ')' for the right.

    @style: The style of the element.
    @stylizer: Supplies profile.dpi and profile.fbase.
    '''
    def side_ems(side):
        # Undeclared or 'auto' values count as zero.
        amounts = {}
        for kind in ('padding', 'margin'):
            prop = kind + '-' + side
            if prop in style.cssdict() and style[prop] != 'auto':
                amounts[kind] = unit_convert(style[prop], style.width,
                                             style.fontSize, stylizer.profile.dpi)
            else:
                amounts[kind] = 0
        total = amounts['margin'] + amounts['padding']
        return min(int(round(total / stylizer.profile.fbase)), self.MAX_EM)

    markers = ''
    for side, marker in (('left', '('), ('right', ')')):
        ems = side_ems(side)
        if ems >= 1:
            markers += marker * ems

    return markers
|
||||
|
||||
def check_id_tag(self, attribs):
    '''
    Return the Textile id marker '(#<id>)' for an element that has an id
    attribute, or '' when it has none.

    When an id is present, '#<id>' is also appended to self.our_ids and
    self.id_no_text is set to a non-breaking space.

    @attribs: The attributes of the element.
    '''
    if 'id' not in attribs:
        return ''
    anchor = '#' + attribs['id']
    self.our_ids.append(anchor)
    self.id_no_text = u'\xa0'
    return '(' + anchor + ')'
|
||||
|
||||
def build_block(self, tag, style, attribs, stylizer):
    '''
    Assemble the opening of a Textile block.

    The result is a newline, the tag, the id marker (only when
    self.opts.keep_links is set), then the padding, horizontal alignment
    and style markers, in that order.

    @tag: The block signature to start with, e.g. 'p' or 'h1'.
    @style: The style of the element.
    @attribs: The attributes of the element.
    @stylizer: Passed through to check_padding().
    '''
    pieces = ['\n' + tag]
    if self.opts.keep_links:
        pieces.append(self.check_id_tag(attribs))
    pieces.append(self.check_padding(style, stylizer))
    pieces.append(self.check_halign(style))
    pieces.append(self.check_styles(style))
    return ''.join(pieces)
|
||||
|
||||
def prepare_string_for_textile(self, txt):
    '''
    Wrap txt in ' ==...== ' when it contains one of the characters
    * & _ + - ~ @ % | (or '??') with whitespace on one side and
    non-whitespace on the other; otherwise return txt unchanged.

    @txt: The text to check.
    '''
    markup_like = r'(\s([*&_+\-~@%|]|\?{2})\S)|(\S([*&_+\-~@%|]|\?{2})\s)'
    if re.search(markup_like, txt) is None:
        return txt
    return ' ==%s== ' % txt
|
||||
|
||||
def dump_text(self, elem, stylizer):
|
||||
'''
|
||||
@elem: The element in the etree that we are working on.
|
||||
@stylizer: The style information attached to the element.
|
||||
'''
|
||||
|
||||
# We can only processes tags. If there isn't a tag return any text.
|
||||
if not isinstance(elem.tag, string_or_bytes) \
|
||||
or namespace(elem.tag) != XHTML_NS:
|
||||
p = elem.getparent()
|
||||
if p is not None and isinstance(p.tag, string_or_bytes) and namespace(p.tag) == XHTML_NS \
|
||||
and elem.tail:
|
||||
return [elem.tail]
|
||||
return ['']
|
||||
|
||||
# Setup our variables.
|
||||
text = ['']
|
||||
style = stylizer.style(elem)
|
||||
tags = []
|
||||
tag = barename(elem.tag)
|
||||
attribs = elem.attrib
|
||||
|
||||
# Ignore anything that is set to not be displayed.
|
||||
if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
|
||||
or style['visibility'] == 'hidden':
|
||||
if hasattr(elem, 'tail') and elem.tail:
|
||||
return [elem.tail]
|
||||
return ['']
|
||||
|
||||
# Soft scene breaks.
|
||||
if 'margin-top' in style.cssdict() and style['margin-top'] != 'auto':
|
||||
ems = min(int(round(float(style.marginTop) / style.fontSize) - 1), self.MAX_EM)
|
||||
if ems >= 1:
|
||||
text.append(u'\n\n\xa0' * ems)
|
||||
|
||||
if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div'):
|
||||
if tag == 'div':
|
||||
tag = 'p'
|
||||
text.append(self.build_block(tag, style, attribs, stylizer))
|
||||
text.append('. ')
|
||||
tags.append('\n')
|
||||
|
||||
if style['font-style'] == 'italic' or tag in ('i', 'em'):
|
||||
if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'cite'):
|
||||
if self.style_italic == False: # noqa
|
||||
if self.in_a_link:
|
||||
text.append('_')
|
||||
tags.append('_')
|
||||
else:
|
||||
text.append('[_')
|
||||
tags.append('_]')
|
||||
self.style_embed.append('_')
|
||||
self.style_italic = True
|
||||
if style['font-weight'] in ('bold', 'bolder') or tag in ('b', 'strong'):
|
||||
if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'th'):
|
||||
if self.style_bold == False: # noqa
|
||||
if self.in_a_link:
|
||||
text.append('*')
|
||||
tags.append('*')
|
||||
else:
|
||||
text.append('[*')
|
||||
tags.append('*]')
|
||||
self.style_embed.append('*')
|
||||
self.style_bold = True
|
||||
if style['text-decoration'] == 'underline' or tag in ('u', 'ins'):
|
||||
if tag != 'a':
|
||||
if self.style_under == False: # noqa
|
||||
text.append('[+')
|
||||
tags.append('+]')
|
||||
self.style_embed.append('+')
|
||||
self.style_under = True
|
||||
if style['text-decoration'] == 'line-through' or tag in ('strike', 'del', 's'):
|
||||
if self.style_strike == False: # noqa
|
||||
text.append('[-')
|
||||
tags.append('-]')
|
||||
self.style_embed.append('-')
|
||||
self.style_strike = True
|
||||
if tag == 'br':
|
||||
for i in reversed(self.style_embed):
|
||||
text.append(i)
|
||||
text.append('\n')
|
||||
for i in self.style_embed:
|
||||
text.append(i)
|
||||
tags.append('')
|
||||
self.remove_space_after_newline = True
|
||||
if tag == 'blockquote':
|
||||
text.append('\nbq. ')
|
||||
tags.append('\n')
|
||||
elif tag in ('abbr', 'acronym'):
|
||||
text.append('')
|
||||
txt = attribs['title']
|
||||
tags.append('(' + txt + ')')
|
||||
elif tag == 'sup':
|
||||
text.append('^')
|
||||
tags.append('^')
|
||||
elif tag == 'sub':
|
||||
text.append('~')
|
||||
tags.append('~')
|
||||
elif tag == 'code':
|
||||
if self.in_pre:
|
||||
text.append('\nbc. ')
|
||||
tags.append('')
|
||||
else:
|
||||
text.append('@')
|
||||
tags.append('@')
|
||||
elif tag == 'cite':
|
||||
text.append('??')
|
||||
tags.append('??')
|
||||
elif tag == 'hr':
|
||||
text.append('\n***')
|
||||
tags.append('\n')
|
||||
elif tag == 'pre':
|
||||
self.in_pre = True
|
||||
text.append('\npre. ')
|
||||
tags.append('pre\n')
|
||||
elif tag == 'a':
|
||||
if self.opts.keep_links:
|
||||
if 'href' in attribs:
|
||||
text.append('"')
|
||||
tags.append('a')
|
||||
tags.append('":' + attribs['href'])
|
||||
self.our_links.append(attribs['href'])
|
||||
if 'title' in attribs:
|
||||
tags.append('(' + attribs['title'] + ')')
|
||||
self.in_a_link = True
|
||||
else:
|
||||
text.append('%')
|
||||
tags.append('%')
|
||||
elif tag == 'img':
|
||||
if self.opts.keep_image_references:
|
||||
txt = '!' + self.check_halign(style)
|
||||
txt += self.check_valign(style)
|
||||
txt += attribs['src']
|
||||
text.append(txt)
|
||||
if 'alt' in attribs:
|
||||
txt = attribs['alt']
|
||||
if txt != '':
|
||||
text.append('(' + txt + ')')
|
||||
tags.append('!')
|
||||
elif tag in ('ol', 'ul'):
|
||||
self.list.append({'name': tag, 'num': 0})
|
||||
text.append('')
|
||||
tags.append(tag)
|
||||
elif tag == 'li':
|
||||
if self.list:
|
||||
li = self.list[-1]
|
||||
else:
|
||||
li = {'name': 'ul', 'num': 0}
|
||||
text.append('\n')
|
||||
if li['name'] == 'ul':
|
||||
text.append('*' * len(self.list) + ' ')
|
||||
elif li['name'] == 'ol':
|
||||
text.append('#' * len(self.list) + ' ')
|
||||
tags.append('')
|
||||
elif tag == 'dl':
|
||||
text.append('\n')
|
||||
tags.append('')
|
||||
elif tag == 'dt':
|
||||
text.append('')
|
||||
tags.append('\n')
|
||||
elif tag == 'dd':
|
||||
text.append(' ')
|
||||
tags.append('')
|
||||
elif tag == 'dd':
|
||||
text.append('')
|
||||
tags.append('\n')
|
||||
elif tag == 'table':
|
||||
txt = self.build_block(tag, style, attribs, stylizer)
|
||||
txt += '. \n'
|
||||
if txt != '\ntable. \n':
|
||||
text.append(txt)
|
||||
else:
|
||||
text.append('\n')
|
||||
tags.append('')
|
||||
elif tag == 'tr':
|
||||
txt = self.build_block('', style, attribs, stylizer)
|
||||
txt += '. '
|
||||
if txt != '\n. ':
|
||||
txt = re.sub('\n', '', txt)
|
||||
text.append(txt)
|
||||
tags.append('|\n')
|
||||
elif tag == 'td':
|
||||
text.append('|')
|
||||
txt = ''
|
||||
txt += self.check_halign(style)
|
||||
txt += self.check_valign(style)
|
||||
if 'colspan' in attribs:
|
||||
txt += '\\' + attribs['colspan']
|
||||
if 'rowspan' in attribs:
|
||||
txt += '/' + attribs['rowspan']
|
||||
txt += self.check_styles(style)
|
||||
if txt != '':
|
||||
text.append(txt + '. ')
|
||||
tags.append('')
|
||||
elif tag == 'th':
|
||||
text.append('|_. ')
|
||||
tags.append('')
|
||||
elif tag == 'span':
|
||||
if style['font-variant'] == 'small-caps':
|
||||
if self.style_smallcap == False: # noqa
|
||||
text.append('&')
|
||||
tags.append('&')
|
||||
self.style_smallcap = True
|
||||
else:
|
||||
if self.in_a_link == False: # noqa
|
||||
txt = '%'
|
||||
if self.opts.keep_links:
|
||||
txt += self.check_id_tag(attribs)
|
||||
txt += self.check_styles(style)
|
||||
if txt != '%':
|
||||
text.append(txt)
|
||||
tags.append('%')
|
||||
|
||||
if self.opts.keep_links and 'id' in attribs:
|
||||
if tag not in ('body', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'span', 'table'):
|
||||
text.append(self.check_id_tag(attribs))
|
||||
|
||||
# Process the styles for any that we want to keep
|
||||
if tag not in ('body', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'hr', 'a', 'img',
|
||||
'span', 'table', 'tr', 'td'):
|
||||
if not self.in_a_link:
|
||||
text.append(self.check_styles(style))
|
||||
|
||||
# Process tags that contain text.
|
||||
if hasattr(elem, 'text') and elem.text:
|
||||
txt = elem.text
|
||||
if not self.in_pre:
|
||||
txt = self.prepare_string_for_textile(self.remove_newlines(txt))
|
||||
text.append(txt)
|
||||
self.id_no_text = u''
|
||||
|
||||
# Recurse down into tags within the tag we are in.
|
||||
for item in elem:
|
||||
text += self.dump_text(item, stylizer)
|
||||
|
||||
# Close all open tags.
|
||||
tags.reverse()
|
||||
for t in tags:
|
||||
if t in ('pre', 'ul', 'ol', 'li', 'table'):
|
||||
if t == 'pre':
|
||||
self.in_pre = False
|
||||
elif t in ('ul', 'ol'):
|
||||
if self.list:
|
||||
self.list.pop()
|
||||
if not self.list:
|
||||
text.append('\n')
|
||||
else:
|
||||
if t == 'a':
|
||||
self.in_a_link = False
|
||||
t = ''
|
||||
text.append(self.id_no_text)
|
||||
self.id_no_text = u''
|
||||
if t in ('*]', '*'):
|
||||
self.style_bold = False
|
||||
elif t in ('_]', '_'):
|
||||
self.style_italic = False
|
||||
elif t == '+]':
|
||||
self.style_under = False
|
||||
elif t == '-]':
|
||||
self.style_strike = False
|
||||
elif t == '&':
|
||||
self.style_smallcap = False
|
||||
if t in ('*]', '_]', '+]', '-]', '*', '_'):
|
||||
txt = self.style_embed.pop()
|
||||
text.append('%s' % t)
|
||||
|
||||
# Soft scene breaks.
|
||||
if 'margin-bottom' in style.cssdict() and style['margin-bottom'] != 'auto':
|
||||
ems = min(int(round((float(style.marginBottom) / style.fontSize) - 1)), self.MAX_EM)
|
||||
if ems >= 1:
|
||||
text.append(u'\n\n\xa0' * ems)
|
||||
|
||||
# Add the text that is outside of the tag.
|
||||
if hasattr(elem, 'tail') and elem.tail:
|
||||
tail = elem.tail
|
||||
if not self.in_pre:
|
||||
tail = self.prepare_string_for_textile(self.remove_newlines(tail))
|
||||
text.append(tail)
|
||||
|
||||
return text
|
||||
264
ebook_converter/ebooks/txt/txtml.py
Normal file
264
ebook_converter/ebooks/txt/txtml.py
Normal file
@@ -0,0 +1,264 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
'''
|
||||
Transform OEB content into plain text
|
||||
'''
|
||||
|
||||
import re
|
||||
|
||||
from lxml import etree
|
||||
from polyglot.builtins import string_or_bytes
|
||||
|
||||
|
||||
# Tags that always produce a heading in the text output.
HEADING_TAGS = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']

# Tags that are always treated as a paragraph-like block.
BLOCK_TAGS = ['div', 'p'] + HEADING_TAGS + ['li', 'tr']

# Values of the CSS ``display`` property that mark an element as a block.
BLOCK_STYLES = ['block']

# Tags that get a single space inserted in front of their content.
SPACE_TAGS = ['td', 'br']
|
||||
|
||||
|
||||
class TXTMLizer(object):
    '''
    Transform the XHTML content of an OEB book into plain text.

    The only public entry point is extract_content(); the remaining methods
    are the individual stages of the conversion.
    '''

    def __init__(self, log):
        '''
        @log: Logger object; its info() and debug() methods are used.
        '''
        self.log = log

    def extract_content(self, oeb_book, opts):
        '''
        Convert oeb_book to a single plain text string.

        @oeb_book: The book to convert. Its toc and spine are read.
        @opts: Conversion options, see cleanup_text(), get_toc() and
               dump_text() for the options that are consulted.
        '''
        self.log.info('Converting XHTML to TXT...')
        self.oeb_book = oeb_book
        self.opts = opts
        self.toc_titles = []
        self.toc_ids = []
        self.last_was_heading = False

        self.create_flat_toc(self.oeb_book.toc)

        return self.mlize_spine()

    def mlize_spine(self):
        '''
        Convert every item of the spine to text and return the cleaned up,
        concatenated result.
        '''
        from calibre.ebooks.oeb.base import XHTML
        from calibre.ebooks.oeb.stylizer import Stylizer
        from calibre.utils.xml_parse import safe_xml_fromstring
        output = [u'']
        output.append(self.get_toc())
        for item in self.oeb_book.spine:
            self.log.debug('Converting %s to TXT...' % item.href)
            # Rewrite '--' inside comments before the tree is serialized and
            # parsed again below.
            for x in item.data.iterdescendants(etree.Comment):
                if x.text and '--' in x.text:
                    x.text = x.text.replace('--', '__')
            content = etree.tostring(item.data, encoding='unicode')
            content = self.remove_newlines(content)
            content = safe_xml_fromstring(content)
            stylizer = Stylizer(content, item.href, self.oeb_book, self.opts, self.opts.output_profile)
            output += self.dump_text(content.find(XHTML('body')), stylizer, item)
            # Separate the spine items from each other.
            output.append('\n\n\n\n\n\n')
        output = ''.join(output)
        output = '\n'.join(line.rstrip() for line in output.splitlines())
        output = self.cleanup_text(output)

        return output

    def remove_newlines(self, text):
        '''
        Return text with every line break replaced by a space and the runs
        of spaces condensed to a single space.
        '''
        self.log.debug('\tRemove newlines for processing...')
        text = text.replace('\r\n', ' ')
        text = text.replace('\n', ' ')
        text = text.replace('\r', ' ')
        # Condense redundant spaces created by replacing newlines with spaces.
        text = re.sub(r'[ ]{2,}', ' ', text)

        return text

    def get_toc(self):
        '''
        Return the inline table of contents built from the collected TOC
        titles, or an empty string when the inline_toc option is not set.
        '''
        toc = ['']
        if getattr(self.opts, 'inline_toc', None):
            self.log.debug('Generating table of contents...')
            toc.append('%s\n\n' % _('Table of Contents:'))
            for item in self.toc_titles:
                toc.append('* %s\n\n' % item)
        return ''.join(toc)

    def create_flat_toc(self, nodes):
        '''
        Turns a hierarchical list of TOC href's into a flat list.

        The titles are collected in self.toc_titles and the hrefs in
        self.toc_ids, parents before their children.
        '''
        for item in nodes:
            self.toc_titles.append(item.title)
            self.toc_ids.append(item.href)
            self.create_flat_toc(item.nodes)

    def cleanup_text(self, text):
        '''
        Normalize the whitespace of the converted text and return it.

        Honors the remove_paragraph_spacing, max_line_length and
        force_max_line_length options.
        '''
        self.log.debug('\tClean up text...')
        # Replace bad characters.
        text = text.replace(u'\xa0', ' ')

        # Replace vertical tabs and form feeds with a single space.
        # Real tabs are left alone: dump_text() inserts them as paragraph
        # markers when remove paragraph spacing is enabled and the
        # expressions below depend on them.
        # NOTE(review): the literal replace of a tab followed by '+' is kept
        # unchanged for backward compatibility; a regex was presumably
        # intended, but enabling it would strip the paragraph markers.
        text = text.replace('\t+', ' ')
        text = re.sub('[\v\f]+', ' ', text)

        # Single line paragraph.
        text = re.sub('(?<=.)\n(?=.)', ' ', text)

        # Remove multiple spaces.
        text = re.sub('[ ]{2,}', ' ', text)

        # Remove excessive newlines.
        text = re.sub('\n[ ]+\n', '\n\n', text)
        if self.opts.remove_paragraph_spacing:
            text = re.sub('\n{2,}', '\n', text)
            # Lines without a leading tab are not paragraphs; give them
            # room below and above.
            text = re.sub(r'(?msu)^(?P<t>[^\t\n]+?)$', lambda mo: u'%s\n\n' % mo.group('t'), text)
            text = re.sub(r'(?msu)(?P<b>[^\n])\n+(?P<t>[^\t\n]+?)(?=\n)', lambda mo: '%s\n\n\n\n\n\n%s' % (mo.group('b'), mo.group('t')), text)
        else:
            text = re.sub('\n{7,}', '\n\n\n\n\n\n', text)

        # Replace spaces at the beginning and end of lines
        # We don't replace tabs because those are only added
        # when remove paragraph spacing is enabled.
        text = re.sub('(?imu)^[ ]+', '', text)
        text = re.sub('(?imu)[ ]+$', '', text)

        # Remove empty space and newlines at the beginning of the document.
        text = re.sub(r'(?u)^[ \n]+', '', text)

        if self.opts.max_line_length:
            max_length = self.opts.max_line_length
            # Lengths below 25 are only honored when they are forced.
            if self.opts.max_line_length < 25 and not self.opts.force_max_line_length:
                max_length = 25
            short_lines = []
            lines = text.splitlines()
            for line in lines:
                while len(line) > max_length:
                    space = line.rfind(' ', 0, max_length)
                    if space != -1:
                        # Space was found.
                        short_lines.append(line[:space])
                        line = line[space + 1:]
                    else:
                        # Space was not found.
                        if self.opts.force_max_line_length:
                            # Force breaking at max_length.
                            short_lines.append(line[:max_length])
                            line = line[max_length:]
                        else:
                            # Look for the first space after max_length.
                            space = line.find(' ', max_length, len(line))
                            if space != -1:
                                # Space was found.
                                short_lines.append(line[:space])
                                line = line[space + 1:]
                            else:
                                # No space was found cannot break line.
                                short_lines.append(line)
                                line = ''
                # Add the text that was less than max_length to the list
                short_lines.append(line)
            text = '\n'.join(short_lines)

        return text

    def dump_text(self, elem, stylizer, page):
        '''
        Return the text of elem and its descendants as a list of strings.

        @elem: The element in the etree that we are working on.
        @stylizer: The style information attached to the element.
        @page: OEB page used to determine absolute urls.
        '''
        from calibre.ebooks.oeb.base import XHTML_NS, barename, namespace

        # Anything that is not an XHTML element (comments, foreign
        # namespaces, ...) only contributes its tail, and only when its
        # parent is an XHTML element.
        if not isinstance(elem.tag, string_or_bytes) \
           or namespace(elem.tag) != XHTML_NS:
            p = elem.getparent()
            if p is not None and isinstance(p.tag, string_or_bytes) and namespace(p.tag) == XHTML_NS \
                    and elem.tail:
                return [elem.tail]
            return ['']

        text = ['']
        style = stylizer.style(elem)

        # Hidden elements are dropped, the text following them is kept.
        if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
           or style['visibility'] == 'hidden':
            if hasattr(elem, 'tail') and elem.tail:
                return [elem.tail]
            return ['']

        tag = barename(elem.tag)
        tag_id = elem.attrib.get('id', None)
        in_block = False
        in_heading = False

        # Are we in a heading?
        # This can either be a heading tag or a TOC item.
        if tag in HEADING_TAGS or '%s#%s' % (page.href, tag_id) in self.toc_ids:
            in_heading = True
            if not self.last_was_heading:
                text.append('\n\n\n\n\n\n')

        # Are we in a paragraph block?
        if tag in BLOCK_TAGS or style['display'] in BLOCK_STYLES:
            if self.opts.remove_paragraph_spacing and not in_heading:
                # The tab marks the start of a paragraph for cleanup_text().
                text.append('\t')
            in_block = True

        if tag in SPACE_TAGS:
            text.append(' ')

        # Hard scene breaks.
        if tag == 'hr':
            text.append('\n\n* * *\n\n')
        # Soft scene breaks.
        try:
            ems = int(round((float(style.marginTop) / style.fontSize) - 1))
            if ems >= 1:
                text.append('\n' * ems)
        except Exception:
            # Best effort only: a margin that cannot be evaluated simply
            # produces no scene break.
            pass

        # Process tags that contain text.
        if hasattr(elem, 'text') and elem.text:
            text.append(elem.text)

        # Recurse down into tags within the tag we are in.
        for item in elem:
            text += self.dump_text(item, stylizer, page)

        if in_block:
            text.append('\n\n')
        if in_heading:
            text.append('\n')
            self.last_was_heading = True
        else:
            self.last_was_heading = False

        # Add the text that is outside of the tag.
        if hasattr(elem, 'tail') and elem.tail:
            text.append(elem.tail)

        return text
|
||||
Reference in New Issue
Block a user