diff --git a/ebook_converter/ebooks/textile/__init__.py b/ebook_converter/ebooks/textile/__init__.py new file mode 100644 index 0000000..5a0a767 --- /dev/null +++ b/ebook_converter/ebooks/textile/__init__.py @@ -0,0 +1,7 @@ +from __future__ import absolute_import, division, print_function, unicode_literals +from .functions import textile, textile_restricted, Textile + +if False: + textile, textile_restricted, Textile + +__all__ = ['textile', 'textile_restricted'] diff --git a/ebook_converter/ebooks/textile/functions.py b/ebook_converter/ebooks/textile/functions.py new file mode 100644 index 0000000..5029a2a --- /dev/null +++ b/ebook_converter/ebooks/textile/functions.py @@ -0,0 +1,1091 @@ +#!/usr/bin/env python2 +# -*- coding: utf-8 -*- +from __future__ import absolute_import, division, print_function, unicode_literals + +""" +PyTextile + +A Humane Web Text Generator +""" + +# Last upstream version basis +# __version__ = '2.1.4' +# __date__ = '2009/12/04' + +__copyright__ = """ +Copyright (c) 2011, Leigh Parry +Copyright (c) 2011, John Schember +Copyright (c) 2009, Jason Samsa, http://jsamsa.com/ +Copyright (c) 2004, Roberto A. F. De Almeida, http://dealmeida.net/ +Copyright (c) 2003, Mark Pilgrim, http://diveintomark.org/ + +Original PHP Version: +Copyright (c) 2003-2004, Dean Allen +All rights reserved. + +Thanks to Carlo Zottmann for refactoring +Textile's procedural code into a class framework + +Additions and fixes Copyright (c) 2006 Alex Shiels http://thresholdstate.com/ + +""" + +__license__ = """ +L I C E N S E +============= +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name Textile nor the names of its contributors may be used to + endorse or promote products derived from this software without specific + prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +""" + +import re +import uuid + +from calibre.utils.smartypants import smartyPants +from polyglot.builtins import unicode_type +from polyglot.urllib import urlopen, urlparse + + +def _normalize_newlines(string): + out = re.sub(r'\r\n', '\n', string) + out = re.sub(r'\n{3,}', '\n\n', out) + out = re.sub(r'\n\s*\n', '\n\n', out) + out = re.sub(r'"$', '" ', out) + return out + + +def getimagesize(url): + """ + Attempts to determine an image's width and height, and returns a string + suitable for use in an tag, or None in case of failure. 
+ Requires that PIL is installed. + + >>> getimagesize("http://www.google.com/intl/en_ALL/images/logo.gif") + ... #doctest: +ELLIPSIS, +SKIP + 'width="..." height="..."' + + """ + + from PIL import ImageFile + + try: + p = ImageFile.Parser() + f = urlopen(url) + while True: + s = f.read(1024) + if not s: + break + p.feed(s) + if p.image: + return 'width="%i" height="%i"' % p.image.size + except (IOError, ValueError): + return None + + +class Textile(object): + hlgn = r'(?:\<(?!>)|(?|\<\>|\=|[()]+(?! ))' + vlgn = r'[\-^~]' + clas = r'(?:\([^)]+\))' + lnge = r'(?:\[[^\]]+\])' + styl = r'(?:\{[^}]+\})' + cspn = r'(?:\\\d+)' + rspn = r'(?:\/\d+)' + a = r'(?:%s|%s)*' % (hlgn, vlgn) + s = r'(?:%s|%s)*' % (cspn, rspn) + c = r'(?:%s)*' % '|'.join([clas, styl, lnge, hlgn]) + + pnct = r'[-!"#$%&()*+,/:;<=>?@\'\[\\\]\.^_`{|}~]' + # urlch = r'[\w"$\-_.+!*\'(),";/?:@=&%#{}|\\^~\[\]`]' + urlch = r'[\w"$\-_.+*\'(),";\/?:@=&%#{}|\\^~\[\]`]' + + url_schemes = ('http', 'https', 'ftp', 'mailto') + + btag = ('bq', 'bc', 'notextile', 'pre', 'h[1-6]', r'fn\d+', 'p') + btag_lite = ('bq', 'bc', 'p') + + macro_defaults = [ + (re.compile(r'{(c\||\|c)}'), r'¢'), # cent + (re.compile(r'{(L-|-L)}'), r'£'), # pound + (re.compile(r'{(Y=|=Y)}'), r'¥'), # yen + (re.compile(r'{\(c\)}'), r'©'), # copyright + (re.compile(r'{\(r\)}'), r'®'), # registered + (re.compile(r'{(\+_|_\+)}'), r'±'), # plus-minus + (re.compile(r'{1/4}'), r'¼'), # quarter + (re.compile(r'{1/2}'), r'½'), # half + (re.compile(r'{3/4}'), r'¾'), # three-quarter + (re.compile(r'{(A`|`A)}'), r'À'), # A-acute + (re.compile(r'{(A\'|\'A)}'), r'Á'), # A-grave + (re.compile(r'{(A\^|\^A)}'), r'Â'), # A-circumflex + (re.compile(r'{(A~|~A)}'), r'Ã'), # A-tilde + (re.compile(r'{(A\"|\"A)}'), r'Ä'), # A-diaeresis + (re.compile(r'{(Ao|oA)}'), r'Å'), # A-ring + (re.compile(r'{(AE)}'), r'Æ'), # AE + (re.compile(r'{(C,|,C)}'), r'Ç'), # C-cedilla + (re.compile(r'{(E`|`E)}'), r'È'), # E-acute + (re.compile(r'{(E\'|\'E)}'), r'É'), # E-grave + (re.compile(r'{(E\^|\^E)}'), r'Ê'), # E-circumflex + (re.compile(r'{(E\"|\"E)}'), r'Ë'), # E-diaeresis + (re.compile(r'{(I`|`I)}'), r'Ì'), # I-acute + (re.compile(r'{(I\'|\'I)}'), r'Í'), # I-grave + (re.compile(r'{(I\^|\^I)}'), r'Î'), # I-circumflex + (re.compile(r'{(I\"|\"I)}'), r'Ï'), # I-diaeresis + (re.compile(r'{(D-|-D)}'), r'Ð'), # ETH + (re.compile(r'{(N~|~N)}'), r'Ñ'), # N-tilde + (re.compile(r'{(O`|`O)}'), r'Ò'), # O-acute + (re.compile(r'{(O\'|\'O)}'), r'Ó'), # O-grave + (re.compile(r'{(O\^|\^O)}'), r'Ô'), # O-circumflex + (re.compile(r'{(O~|~O)}'), r'Õ'), # O-tilde + (re.compile(r'{(O\"|\"O)}'), r'Ö'), # O-diaeresis + (re.compile(r'{x}'), r'×'), # dimension + (re.compile(r'{(O\/|\/O)}'), r'Ø'), # O-slash + (re.compile(r'{(U`|`U)}'), r'Ù'), # U-acute + (re.compile(r'{(U\'|\'U)}'), r'Ú'), # U-grave + (re.compile(r'{(U\^|\^U)}'), r'Û'), # U-circumflex + (re.compile(r'{(U\"|\"U)}'), r'Ü'), # U-diaeresis + (re.compile(r'{(Y\'|\'Y)}'), r'Ý'), # Y-grave + (re.compile(r'{sz}'), r'ß'), # sharp-s + (re.compile(r'{(a`|`a)}'), r'à'), # a-grave + (re.compile(r'{(a\'|\'a)}'), r'á'), # a-acute + (re.compile(r'{(a\^|\^a)}'), r'â'), # a-circumflex + (re.compile(r'{(a~|~a)}'), r'ã'), # a-tilde + (re.compile(r'{(a\"|\"a)}'), r'ä'), # a-diaeresis + (re.compile(r'{(ao|oa)}'), r'å'), # a-ring + (re.compile(r'{ae}'), r'æ'), # ae + (re.compile(r'{(c,|,c)}'), r'ç'), # c-cedilla + (re.compile(r'{(e`|`e)}'), r'è'), # e-grave + (re.compile(r'{(e\'|\'e)}'), r'é'), # e-acute + (re.compile(r'{(e\^|\^e)}'), r'ê'), # e-circumflex + 
(re.compile(r'{(e\"|\"e)}'), r'ë'), # e-diaeresis + (re.compile(r'{(i`|`i)}'), r'ì'), # i-grave + (re.compile(r'{(i\'|\'i)}'), r'í'), # i-acute + (re.compile(r'{(i\^|\^i)}'), r'î'), # i-circumflex + (re.compile(r'{(i\"|\"i)}'), r'ï'), # i-diaeresis + (re.compile(r'{(d-|-d)}'), r'ð'), # eth + (re.compile(r'{(n~|~n)}'), r'ñ'), # n-tilde + (re.compile(r'{(o`|`o)}'), r'ò'), # o-grave + (re.compile(r'{(o\'|\'o)}'), r'ó'), # o-acute + (re.compile(r'{(o\^|\^o)}'), r'ô'), # o-circumflex + (re.compile(r'{(o~|~o)}'), r'õ'), # o-tilde + (re.compile(r'{(o\"|\"o)}'), r'ö'), # o-diaeresis + (re.compile(r'{(o\/|\/o)}'), r'ø'), # o-stroke + (re.compile(r'{(u`|`u)}'), r'ù'), # u-grave + (re.compile(r'{(u\'|\'u)}'), r'ú'), # u-acute + (re.compile(r'{(u\^|\^u)}'), r'û'), # u-circumflex + (re.compile(r'{(u\"|\"u)}'), r'ü'), # u-diaeresis + (re.compile(r'{(y\'|\'y)}'), r'ý'), # y-acute + (re.compile(r'{(y\"|\"y)}'), r'ÿ'), # y-diaeresis + + (re.compile(r'{(C\ˇ|\ˇC)}'), r'Č'), # C-caron + (re.compile(r'{(c\ˇ|\ˇc)}'), r'č'), # c-caron + (re.compile(r'{(D\ˇ|\ˇD)}'), r'Ď'), # D-caron + (re.compile(r'{(d\ˇ|\ˇd)}'), r'ď'), # d-caron + (re.compile(r'{(E\ˇ|\ˇE)}'), r'Ě'), # E-caron + (re.compile(r'{(e\ˇ|\ˇe)}'), r'ě'), # e-caron + (re.compile(r'{(L\'|\'L)}'), r'Ĺ'), # L-acute + (re.compile(r'{(l\'|\'l)}'), r'ĺ'), # l-acute + (re.compile(r'{(L\ˇ|\ˇL)}'), r'Ľ'), # L-caron + (re.compile(r'{(l\ˇ|\ˇl)}'), r'ľ'), # l-caron + (re.compile(r'{(N\ˇ|\ˇN)}'), r'Ň'), # N-caron + (re.compile(r'{(n\ˇ|\ˇn)}'), r'ň'), # n-caron + + (re.compile(r'{OE}'), r'Œ'), # OE + (re.compile(r'{oe}'), r'œ'), # oe + + (re.compile(r'{(R\'|\'R)}'), r'Ŕ'), # R-acute + (re.compile(r'{(r\'|\'r)}'), r'ŕ'), # r-acute + (re.compile(r'{(R\ˇ|\ˇR)}'), r'Ř'), # R-caron + (re.compile(r'{(r\ˇ|\ˇr)}'), r'ř'), # r-caron + + (re.compile(r'{(S\^|\^S)}'), r'Ŝ'), # S-circumflex + (re.compile(r'{(s\^|\^s)}'), r'ŝ'), # s-circumflex + + (re.compile(r'{(S\ˇ|\ˇS)}'), r'Š'), # S-caron + (re.compile(r'{(s\ˇ|\ˇs)}'), r'š'), # s-caron + (re.compile(r'{(T\ˇ|\ˇT)}'), r'Ť'), # T-caron + (re.compile(r'{(t\ˇ|\ˇt)}'), r'ť'), # t-caron + (re.compile(r'{(U\°|\°U)}'), r'Ů'), # U-ring + (re.compile(r'{(u\°|\°u)}'), r'ů'), # u-ring + (re.compile(r'{(Z\ˇ|\ˇZ)}'), r'Ž'), # Z-caron + (re.compile(r'{(z\ˇ|\ˇz)}'), r'ž'), # z-caron + + (re.compile(r'{\*}'), r'•'), # bullet + (re.compile(r'{Fr}'), r'₣'), # Franc + (re.compile(r'{(L=|=L)}'), r'₤'), # Lira + (re.compile(r'{Rs}'), r'₨'), # Rupee + (re.compile(r'{(C=|=C)}'), r'€'), # euro + (re.compile(r'{tm}'), r'™'), # trademark + (re.compile(r'{spades?}'), r'♠'), # spade + (re.compile(r'{clubs?}'), r'♣'), # club + (re.compile(r'{hearts?}'), r'♥'), # heart + (re.compile(r'{diam(onds?|s)}'), r'♦'), # diamond + (re.compile(r'{"}'), r'"'), # double-quote + (re.compile(r"{'}"), r'''), # single-quote + (re.compile(r"{(’|'/|/')}"), r'’'), # closing-single-quote - apostrophe + (re.compile(r"{(‘|\\'|'\\)}"), r'‘'), # opening-single-quote + (re.compile(r'{(”|"/|/")}'), r'”'), # closing-double-quote + (re.compile(r'{(“|\\"|"\\)}'), r'“'), # opening-double-quote + ] + glyph_defaults = [ + (re.compile(r'(\d+\'?\"?)( ?)x( ?)(?=\d+)'), r'\1\2×\3'), # dimension sign + (re.compile(r'(\d+)\'(\s)', re.I), r'\1′\2'), # prime + (re.compile(r'(\d+)\"(\s)', re.I), r'\1″\2'), # prime-double + (re.compile(r'\b([A-Z][A-Z0-9]{2,})\b(?:[(]([^)]*)[)])'), r'\1'), # 3+ uppercase acronym + (re.compile(r'\b([A-Z][A-Z\'\-]+[A-Z])(?=[\s.,\)>])'), r'\1'), # 3+ uppercase + (re.compile(r'\b(\s{0,1})?\.{3}'), r'\1…'), # ellipsis + (re.compile(r'^[\*_-]{3,}$', re.M), r'
<hr/>'), #
scene-break + (re.compile(r'(^|[^-])--([^-]|$)'), r'\1—\2'), # em dash + (re.compile(r'\s-(?:\s|$)'), r' – '), # en dash + (re.compile(r'\b( ?)[([]TM[])]', re.I), r'\1™'), # trademark + (re.compile(r'\b( ?)[([]R[])]', re.I), r'\1®'), # registered + (re.compile(r'\b( ?)[([]C[])]', re.I), r'\1©'), # copyright + ] + + def __init__(self, restricted=False, lite=False, noimage=False): + """docstring for __init__""" + self.restricted = restricted + self.lite = lite + self.noimage = noimage + self.get_sizes = False + self.fn = {} + self.urlrefs = {} + self.shelf = {} + self.rel = '' + self.html_type = 'xhtml' + + def textile(self, text, rel=None, head_offset=0, html_type='xhtml'): + """ + >>> import textile + >>> textile.textile('some textile') + u'\\t

<p>some textile</p>
' + """ + self.html_type = html_type + + # text = type(u'')(text) + text = _normalize_newlines(text) + + if self.restricted: + text = self.encode_html(text, quotes=False) + + if rel: + self.rel = ' rel="%s"' % rel + + text = self.getRefs(text) + text = self.block(text, int(head_offset)) + text = self.retrieve(text) + text = smartyPants(text, 'q') + + return text + + def pba(self, input, element=None): + """ + Parse block attributes. + + >>> t = Textile() + >>> t.pba(r'\3') + '' + >>> t.pba(r'\\3', element='td') + ' colspan="3"' + >>> t.pba(r'/4', element='td') + ' rowspan="4"' + >>> t.pba(r'\\3/4', element='td') + ' colspan="3" rowspan="4"' + + >>> t.vAlign('^') + 'top' + + >>> t.pba('^', element='td') + ' style="vertical-align:top;"' + + >>> t.pba('{line-height:18px}') + ' style="line-height:18px;"' + + >>> t.pba('(foo-bar)') + ' class="foo-bar"' + + >>> t.pba('(#myid)') + ' id="myid"' + + >>> t.pba('(foo-bar#myid)') + ' class="foo-bar" id="myid"' + + >>> t.pba('((((') + ' style="padding-left:4em;"' + + >>> t.pba(')))') + ' style="padding-right:3em;"' + + >>> t.pba('[fr]') + ' lang="fr"' + + """ + style = [] + aclass = '' + lang = '' + colspan = '' + rowspan = '' + id = '' + + if not input: + return '' + + matched = input + if element == 'td': + m = re.search(r'\\(\d+)', matched) + if m: + colspan = m.group(1) + + m = re.search(r'/(\d+)', matched) + if m: + rowspan = m.group(1) + + if element == 'td' or element == 'tr': + m = re.search(r'(%s)' % self.vlgn, matched) + if m: + style.append("vertical-align:%s;" % self.vAlign(m.group(1))) + + m = re.search(r'\{([^}]*)\}', matched) + if m: + style.append(m.group(1).rstrip(';') + ';') + matched = matched.replace(m.group(0), '') + + m = re.search(r'\[([^\]]+)\]', matched, re.U) + if m: + lang = m.group(1) + matched = matched.replace(m.group(0), '') + + m = re.search(r'\(([^()]+)\)', matched, re.U) + if m: + aclass = m.group(1) + matched = matched.replace(m.group(0), '') + + m = re.search(r'([(]+)', matched) + if m: + style.append("padding-left:%sem;" % len(m.group(1))) + matched = matched.replace(m.group(0), '') + + m = re.search(r'([)]+)', matched) + if m: + style.append("padding-right:%sem;" % len(m.group(1))) + matched = matched.replace(m.group(0), '') + + m = re.search(r'(%s)' % self.hlgn, matched) + if m: + style.append("text-align:%s;" % self.hAlign(m.group(1))) + + m = re.search(r'^(.*)#(.*)$', aclass) + if m: + id = m.group(2) + aclass = m.group(1) + + if self.restricted: + if lang: + return ' lang="%s"' + else: + return '' + + result = [] + if style: + result.append(' style="%s"' % "".join(style)) + if aclass: + result.append(' class="%s"' % aclass) + if lang: + result.append(' lang="%s"' % lang) + if id: + result.append(' id="%s"' % id) + if colspan: + result.append(' colspan="%s"' % colspan) + if rowspan: + result.append(' rowspan="%s"' % rowspan) + return ''.join(result) + + def hasRawText(self, text): + """ + checks whether the text has text not already enclosed by a block tag + + >>> t = Textile() + >>> t.hasRawText('

<p>foo bar biz baz</p>
') + False + + >>> t.hasRawText(' why yes, yes it does') + True + + """ + r = re.compile(r'<(p|blockquote|div|form|table|ul|ol|pre|h\d)[^>]*?>.*', re.S).sub('', text.strip()).strip() + r = re.compile(r'<(hr|br)[^>]*?/>').sub('', r) + return '' != r + + def table(self, text): + r""" + >>> t = Textile() + >>> t.table('|one|two|three|\n|a|b|c|') + '\t\n\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\n\t\t\n\t\t\t\n\t\t\t\n\t\t\t\n\t\t\n\t
onetwothree
abc
\n\n' + """ + text = text + "\n\n" + pattern = re.compile(r'^(?:table(_?%(s)s%(a)s%(c)s)\. ?\n)?^(%(a)s%(c)s\.? ?\|.*\|)\n\n' % {'s':self.s, 'a':self.a, 'c':self.c}, re.S|re.M|re.U) + return pattern.sub(self.fTable, text) + + def fTable(self, match): + tatts = self.pba(match.group(1), 'table') + rows = [] + for row in [x for x in match.group(2).split('\n') if x]: + rmtch = re.search(r'^(%s%s\. )(.*)' % (self.a, self.c), row.lstrip()) + if rmtch: + ratts = self.pba(rmtch.group(1), 'tr') + row = rmtch.group(2) + else: + ratts = '' + + cells = [] + for cell in row.split('|')[1:-1]: + ctyp = 'd' + if re.search(r'^_', cell): + ctyp = "h" + cmtch = re.search(r'^(_?%s%s%s\. )(.*)' % (self.s, self.a, self.c), cell) + if cmtch: + catts = self.pba(cmtch.group(1), 'td') + cell = cmtch.group(2) + else: + catts = '' + + cell = self.graf(self.span(cell)) + cells.append('\t\t\t%s' % (ctyp, catts, cell, ctyp)) + rows.append("\t\t\n%s\n\t\t" % (ratts, '\n'.join(cells))) + cells = [] + catts = None + return "\t\n%s\n\t\n\n" % (tatts, '\n'.join(rows)) + + def lists(self, text): + """ + >>> t = Textile() + >>> t.lists("* one\\n* two\\n* three") + '\\t
<ul>\\n\\t\\t<li>one</li>\\n\\t\\t<li>two</li>\\n\\t\\t<li>three</li>\\n\\t</ul>
' + """ + pattern = re.compile(r'^([#*]+%s .*)$(?![^#*])' % self.c, re.U|re.M|re.S) + return pattern.sub(self.fList, text) + + def fList(self, match): + text = match.group(0).split("\n") + result = [] + lists = [] + for i, line in enumerate(text): + try: + nextline = text[i+1] + except IndexError: + nextline = '' + + m = re.search(r"^([#*]+)(%s%s) (.*)$" % (self.a, self.c), line, re.S) + if m: + tl, atts, content = m.groups() + nl = '' + nm = re.search(r'^([#*]+)\s.*', nextline) + if nm: + nl = nm.group(1) + if tl not in lists: + lists.append(tl) + atts = self.pba(atts) + line = "\t<%sl%s>\n\t\t
  • %s" % (self.lT(tl), atts, self.graf(content)) + else: + line = "\t\t
  • " + self.graf(content) + + if len(nl) <= len(tl): + line = line + "
  • " + for k in reversed(lists): + if len(k) > len(nl): + line = line + "\n\t" % self.lT(k) + if len(k) > 1: + line = line + "" + lists.remove(k) + + result.append(line) + return "\n".join(result) + + def lT(self, input): + if re.search(r'^#+', input): + return 'o' + else: + return 'u' + + def doPBr(self, in_): + return re.compile(r'<(p)([^>]*?)>(.*)()', re.S).sub(self.doBr, in_) + + def doBr(self, match): + if self.html_type == 'html': + content = re.sub(r'(.+)(?:(?)|(?))\n(?![#*\s|])', '\\1
    ', match.group(3)) + else: + content = re.sub(r'(.+)(?:(?<!<br>)|(?<!<br />))\n(?![#*\s|])', '\\1<br />
    ', match.group(3)) + return '<%s%s>%s%s' % (match.group(1), match.group(2), content, match.group(4)) + + def block(self, text, head_offset=0): + """ + >>> t = Textile() + >>> t.block('h1. foobar baby') + '\\t

<h1>foobar baby</h1>
    ' + """ + if not self.lite: + tre = '|'.join(self.btag) + else: + tre = '|'.join(self.btag_lite) + text = text.split('\n\n') + + tag = 'p' + atts = cite = graf = ext = c1 = '' + + out = [] + + anon = False + for line in text: + pattern = r'^(%s)(%s%s)\.(\.?)(?::(\S+))? (.*)$' % (tre, self.a, self.c) + match = re.search(pattern, line, re.S) + if match: + if ext: + out.append(out.pop() + c1) + + tag, atts, ext, cite, graf = match.groups() + h_match = re.search(r'h([1-6])', tag) + if h_match: + head_level, = h_match.groups() + tag = 'h%i' % max(1, + min(int(head_level) + head_offset, + 6)) + o1, o2, content, c2, c1 = self.fBlock(tag, atts, ext, + cite, graf) + # leave off c1 if this block is extended, + # we'll close it at the start of the next block + + if ext: + line = "%s%s%s%s" % (o1, o2, content, c2) + else: + line = "%s%s%s%s%s" % (o1, o2, content, c2, c1) + + else: + anon = True + if ext or not re.search(r'^\s', line): + o1, o2, content, c2, c1 = self.fBlock(tag, atts, ext, + cite, line) + # skip $o1/$c1 because this is part of a continuing + # extended block + if tag == 'p' and not self.hasRawText(content): + line = content + else: + line = "%s%s%s" % (o2, content, c2) + else: + line = self.graf(line) + + line = self.doPBr(line) + if self.html_type == 'xhtml': + line = re.sub(r'
<br>', '<br />
    ', line) + + if ext and anon: + out.append(out.pop() + "\n" + line) + else: + out.append(line) + + if not ext: + tag = 'p' + atts = '' + cite = '' + graf = '' + + if ext: + out.append(out.pop() + c1) + return '\n\n'.join(out) + + def fBlock(self, tag, atts, ext, cite, content): + """ + >>> t = Textile() + >>> t.fBlock("bq", "", None, "", "Hello BlockQuote") + ('\\t
<blockquote>\\n', '\\t\\t<p>', 'Hello BlockQuote', '</p>', '\\n\\t</blockquote>
    ') + + >>> t.fBlock("bq", "", None, "http://google.com", "Hello BlockQuote") + ('\\t
<blockquote cite="http://google.com">\\n', '\\t\\t<p>', 'Hello BlockQuote', '</p>', '\\n\\t</blockquote>
    ') + + >>> t.fBlock("bc", "", None, "", 'printf "Hello, World";') # doctest: +ELLIPSIS + ('
<pre>', '<code>', ..., '</code>', '</pre>
    ') + + >>> t.fBlock("h1", "", None, "", "foobar") + ('', '\\t

<h1>', 'foobar', '</h1>
    ', '') + """ + atts = self.pba(atts) + o1 = o2 = c2 = c1 = '' + + m = re.search(r'fn(\d+)', tag) + if m: + tag = 'p' + if m.group(1) in self.fn: + fnid = self.fn[m.group(1)] + else: + fnid = m.group(1) + atts = atts + ' id="fn%s"' % fnid + if atts.find('class=') < 0: + atts = atts + ' class="footnote"' + content = ('%s' % m.group(1)) + content + + if tag == 'bq': + cite = self.checkRefs(cite) + if cite: + cite = ' cite="%s"' % cite + else: + cite = '' + o1 = "\t\n" % (cite, atts) + o2 = "\t\t" % atts + c2 = "

    " + c1 = "\n\t" + + elif tag == 'bc': + o1 = "" % atts + o2 = "" % atts + c2 = "" + c1 = "" + content = self.shelve(self.encode_html(content.rstrip("\n") + "\n")) + + elif tag == 'notextile': + content = self.shelve(content) + o1 = o2 = '' + c1 = c2 = '' + + elif tag == 'pre': + content = self.shelve(self.encode_html(content.rstrip("\n") + "\n")) + o1 = "" % atts + o2 = c2 = '' + c1 = '' + + else: + o2 = "\t<%s%s>" % (tag, atts) + c2 = "" % tag + + content = self.graf(content) + return o1, o2, content, c2, c1 + + def footnoteRef(self, text): + """ + >>> t = Textile() + >>> t.footnoteRef('foo[1] ') # doctest: +ELLIPSIS + 'foo1 ' + """ + return re.sub(r'\b\[([0-9]+)\](\s)?', self.footnoteID, text) + + def footnoteID(self, match): + id, t = match.groups() + if id not in self.fn: + self.fn[id] = unicode_type(uuid.uuid4()) + fnid = self.fn[id] + if not t: + t = '' + return '%s%s' % (fnid, id, t) + + def glyphs(self, text): + """ + >>> t = Textile() + + >>> t.glyphs("apostrophe's") + 'apostrophe’s' + + >>> t.glyphs("back in '88") + 'back in ’88' + + >>> t.glyphs('foo ...') + 'foo …' + + >>> t.glyphs('--') + '—' + + >>> t.glyphs('FooBar[tm]') + 'FooBar™' + + >>> t.glyphs("

<p><cite>Cat's Cradle</cite> by Vonnegut</p>
    ") + '

<p><cite>Cat’s Cradle</cite> by Vonnegut</p>
    ' + + """ + # fix: hackish + text = re.sub(r'"\Z', '\" ', text) + + result = [] + for line in re.compile(r'(<.*?>)', re.U).split(text): + if not re.search(r'<.*>', line): + rules = [] + if re.search(r'{.+?}', line): + rules = self.macro_defaults + self.glyph_defaults + else: + rules = self.glyph_defaults + for s, r in rules: + line = s.sub(r, line) + result.append(line) + return ''.join(result) + + def macros_only(self, text): + # fix: hackish + text = re.sub(r'"\Z', '\" ', text) + + result = [] + for line in re.compile(r'(<.*?>)', re.U).split(text): + if not re.search(r'<.*>', line): + rules = [] + if re.search(r'{.+?}', line): + rules = self.macro_defaults + for s, r in rules: + line = s.sub(r, line) + result.append(line) + return ''.join(result) + + def vAlign(self, input): + d = {'^':'top', '-':'middle', '~':'bottom'} + return d.get(input, '') + + def hAlign(self, input): + d = {'<':'left', '=':'center', '>':'right', '<>': 'justify'} + return d.get(input, '') + + def getRefs(self, text): + """ + what is this for? + """ + pattern = re.compile(r'(?:(?<=^)|(?<=\s))\[(.+)\]((?:http(?:s?):\/\/|\/)\S+)(?=\s|$)', re.U) + text = pattern.sub(self.refs, text) + return text + + def refs(self, match): + flag, url = match.groups() + self.urlrefs[flag] = url + return '' + + def checkRefs(self, url): + return self.urlrefs.get(url, url) + + def isRelURL(self, url): + """ + Identify relative urls. + + >>> t = Textile() + >>> t.isRelURL("http://www.google.com/") + False + >>> t.isRelURL("/foo") + True + + """ + (scheme, netloc) = urlparse(url)[0:2] + return not scheme and not netloc + + def relURL(self, url): + scheme = urlparse(url)[0] + if self.restricted and scheme and scheme not in self.url_schemes: + return '#' + return url + + def shelve(self, text): + id = unicode_type(uuid.uuid4()) + 'c' + self.shelf[id] = text + return id + + def retrieve(self, text): + """ + >>> t = Textile() + >>> id = t.shelve("foobar") + >>> t.retrieve(id) + 'foobar' + """ + while True: + old = text + for k, v in self.shelf.items(): + text = text.replace(k, v) + if text == old: + break + return text + + def encode_html(self, text, quotes=True): + a = ( + ('&', '&'), + ('<', '<'), + ('>', '>') + ) + + if quotes: + a = a + ( + ("'", '''), + ('"', '"') + ) + + for k, v in a: + text = text.replace(k, v) + return text + + def graf(self, text): + if not self.lite: + text = self.noTextile(text) + text = self.code(text) + + text = self.links(text) + + if not self.noimage: + text = self.image(text) + + if not self.lite: + text = self.lists(text) + text = self.table(text) + + text = self.span(text) + text = self.footnoteRef(text) + text = self.glyphs(text) + + return text.rstrip('\n') + + def links(self, text): + """ + >>> t = Textile() + >>> t.links('fooobar "Google":http://google.com/foobar/ and hello world "flickr":http://flickr.com/photos/jsamsa/ ') # doctest: +ELLIPSIS + 'fooobar ... and hello world ...' + """ + + text = self.macros_only(text) + punct = '!"#$%&\'*+,-./:;=?@\\^_`|~' + + pattern = r''' + (?P
<pre>    [\s\[{(]|[%s]   )?
    +            "                          # start
    +            (?P<atts>   %s       )
    +            (?P<text>   [^"]+?   )
    +            \s?
    +            (?:   \(([^)]+?)\)(?=")   )?     # $title
    +            ":
    +            (?P<url>    (?:ftp|https?)? (?: :// )? [-A-Za-z0-9+&@#/?=~_()|!:,.;]*[-A-Za-z0-9+&@#/=~_()|]   )
    +            (?P<post>   [^\w\/;]*?   )
    +            (?=<|\s|$)
    +        ''' % (re.escape(punct), self.c)
    +
    +        text = re.compile(pattern, re.X).sub(self.fLink, text)
    +
    +        return text
    +
    +    def fLink(self, match):
    +        pre, atts, text, title, url, post = match.groups()
    +
    +        if pre is None:
    +            pre = ''
    +
    +        # assume ) at the end of the url is not actually part of the url
    +        # unless the url also contains a (
    +        if url.endswith(')') and not url.find('(') > -1:
    +            post = url[-1] + post
    +            url = url[:-1]
    +
    +        url = self.checkRefs(url)
    +
    +        atts = self.pba(atts)
    +        if title:
    +            atts = atts +  ' title="%s"' % self.encode_html(title)
    +
    +        if not self.noimage:
    +            text = self.image(text)
    +
    +        text = self.span(text)
    +        text = self.glyphs(text)
    +
    +        url = self.relURL(url)
    +        out = '<a href="%s"%s%s>%s</a>' % (self.encode_html(url), atts, self.rel, text)
    +        out = self.shelve(out)
    +        return ''.join([pre, out, post])
    +
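A minimal usage sketch of the link handling above (an illustration only; it assumes the package imports cleanly as wired up in __init__.py, and the exact output also passes through smartyPants):

    # '"text":url' markup is matched by links() and rendered by fLink() into
    # an anchor; textile() runs the full pipeline, so the shelved link is
    # retrieved again at the end.
    from ebook_converter.ebooks.textile import textile
    print(textile('"Example":http://example.com/'))
    # roughly: '\t<p><a href="http://example.com/">Example</a></p>'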
    +    def span(self, text):
    +        """
    +        >>> t = Textile()
    +        >>> t.span(r"hello %(bob)span *strong* and **bold**% goodbye")
    +        'hello <span class="bob">span <strong>strong</strong> and <b>bold</b></span> goodbye'
    +        """
    +        qtags = (r'\*\*', r'\*', r'\?\?', r'\-', r'__', r'_', r'%', r'\+', r'~', r'\^')
    +        pnct = ".,\"'?!;:"
    +
    +        for qtag in qtags:
    +            pattern = re.compile(r"""
    +                (?:^|(?<=[\s>%(pnct)s\(])|\[|([\]}]))
    +                (%(qtag)s)(?!%(qtag)s)
    +                (%(c)s)
    +                (?::(\S+))?
    +                ([^\s%(qtag)s]+|\S[^%(qtag)s\n]*[^\s%(qtag)s\n])
    +                ([%(pnct)s]*)
    +                %(qtag)s
    +                (?:$|([\]}])|(?=%(selfpnct)s{1,2}|\s))
    +            """ % {'qtag':qtag, 'c':self.c, 'pnct':pnct,
    +                   'selfpnct':self.pnct}, re.X)
    +            text = pattern.sub(self.fSpan, text)
    +        return text
    +
    +    def fSpan(self, match):
    +        _, tag, atts, cite, content, end, _ = match.groups()
    +
    +        qtags = {
    +            '*': 'strong',
    +            '**': 'b',
    +            '??': 'cite',
    +            '_' : 'em',
    +            '__': 'i',
    +            '-' : 'del',
    +            '%' : 'span',
    +            '+' : 'ins',
    +            '~' : 'sub',
    +            '^' : 'sup'
    +        }
    +        tag = qtags[tag]
    +        atts = self.pba(atts)
    +        if cite:
    +            atts = atts + 'cite="%s"' % cite
    +
    +        content = self.span(content)
    +
    +        out = "<%s%s>%s%s</%s>" % (tag, atts, content, end, tag)
    +        return out
    +
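A quick sketch of the qtags mapping used by fSpan above (assumes a plain Textile() instance with default options):

    from ebook_converter.ebooks.textile import Textile
    t = Textile()
    # '*' maps to <strong>, '??' to <cite>, '%' to <span>, per the qtags dict.
    print(t.span("a *strong* word and a ??citation??"))
    # roughly: 'a <strong>strong</strong> word and a <cite>citation</cite>'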
    +    def image(self, text):
    +        """
    +        >>> t = Textile()
    +        >>> t.image('!/imgs/myphoto.jpg!:http://jsamsa.com')
    +        '<a href="http://jsamsa.com" class="img"><img src="/imgs/myphoto.jpg" alt="" /></a>'
    +        """
    +        pattern = re.compile(r"""
    +            (?:[\[{])?          # pre
    +            \!                 # opening !
    +            (%s)               # optional style,class atts
    +            (?:\. )?           # optional dot-space
    +            ([^\s(!]+)         # presume this is the src
    +            \s?                # optional space
    +            (?:\(([^\)]+)\))?  # optional title
    +            \!                 # closing
    +            (?::(\S+))?        # optional href
    +            (?:[\]}]|(?=\s|$)) # lookahead: space or end of string
    +        """ % self.c, re.U|re.X)
    +        return pattern.sub(self.fImage, text)
    +
    +    def fImage(self, match):
    +        # (None, '', '/imgs/myphoto.jpg', None, None)
    +        atts, url, title, href = match.groups()
    +        atts  = self.pba(atts)
    +
    +        if title:
    +            atts = atts + ' title="%s" alt="%s"' % (title, title)
    +        else:
    +            atts = atts + ' alt=""'
    +
    +        if not self.isRelURL(url) and self.get_sizes:
    +            size = getimagesize(url)
    +            if (size):
    +                atts += " %s" % size
    +
    +        if href:
    +            href = self.checkRefs(href)
    +
    +        url = self.checkRefs(url)
    +        url = self.relURL(url)
    +
    +        out = []
    +        if href:
    +            out.append('<a href="%s" class="img">' % href)
    +        if self.html_type == 'html':
    +            out.append('<img src="%s"%s>' % (url, atts))
    +        else:
    +            out.append('<img src="%s"%s />' % (url, atts))
    +        if href:
    +            out.append('</a>')
    +
    +        return ''.join(out)
    +
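A short sketch of the image markup handled by image()/fImage() above (assumes the default xhtml html_type and no remote size probing):

    from ebook_converter.ebooks.textile import Textile
    t = Textile()
    # '!src(title)!' becomes an <img>; an optional ':href' wraps it in a link.
    print(t.textile('!/imgs/photo.jpg(My photo)!'))
    # roughly: '\t<p><img src="/imgs/photo.jpg" title="My photo" alt="My photo" /></p>'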
    +    def code(self, text):
    +        text = self.doSpecial(text, '<code>', '</code>', self.fCode)
    +        text = self.doSpecial(text, '@', '@', self.fCode)
    +        text = self.doSpecial(text, '<pre>', '</pre>
    ', self.fPre) + return text + + def fCode(self, match): + before, text, after = match.groups() + if after is None: + after = '' + # text needs to be escaped + if not self.restricted: + text = self.encode_html(text) + return ''.join([before, self.shelve('%s' % text), after]) + + def fPre(self, match): + before, text, after = match.groups() + if after is None: + after = '' + # text needs to be escapedd + if not self.restricted: + text = self.encode_html(text) + return ''.join([before, '
    <pre>', self.shelve(text), '</pre>
    ', after]) + + def doSpecial(self, text, start, end, method=None): + if method is None: + method = self.fSpecial + pattern = re.compile(r'(^|\s|[\[({>])%s(.*?)%s(\s|$|[\])}])?' % (re.escape(start), re.escape(end)), re.M|re.S) + return pattern.sub(method, text) + + def fSpecial(self, match): + """ + special blocks like notextile or code + """ + before, text, after = match.groups() + if after is None: + after = '' + return ''.join([before, self.shelve(self.encode_html(text)), after]) + + def noTextile(self, text): + text = self.doSpecial(text, '', '', self.fTextile) + return self.doSpecial(text, '==', '==', self.fTextile) + + def fTextile(self, match): + before, notextile, after = match.groups() + if after is None: + after = '' + return ''.join([before, self.shelve(notextile), after]) + + +def textile(text, head_offset=0, html_type='xhtml', encoding=None, output=None): + """ + this function takes additional parameters: + head_offset - offset to apply to heading levels (default: 0) + html_type - 'xhtml' or 'html' style tags (default: 'xhtml') + """ + return Textile().textile(text, head_offset=head_offset, + html_type=html_type) + + +def textile_restricted(text, lite=True, noimage=True, html_type='xhtml'): + """ + Restricted version of Textile designed for weblog comments and other + untrusted input. + + Raw HTML is escaped. + Style attributes are disabled. + rel='nofollow' is added to external links. + + When lite=True is set (the default): + Block tags are restricted to p, bq, and bc. + Lists and tables are disabled. + + When noimage=True is set (the default): + Image tags are disabled. + + """ + return Textile(restricted=True, lite=lite, + noimage=noimage).textile(text, rel='nofollow', + html_type=html_type) diff --git a/ebook_converter/ebooks/textile/unsmarten.py b/ebook_converter/ebooks/textile/unsmarten.py new file mode 100644 index 0000000..ff05227 --- /dev/null +++ b/ebook_converter/ebooks/textile/unsmarten.py @@ -0,0 +1,129 @@ +# -*- coding: utf-8 -*- +from __future__ import absolute_import, division, print_function, unicode_literals + +__license__ = 'GPL 3' +__copyright__ = '2011, Leigh Parry ' +__docformat__ = 'restructuredtext en' + +import re + + +def unsmarten(txt): + txt = re.sub(u'¢|¢|¢', r'{c\}', txt) # cent + txt = re.sub(u'£|£|£', r'{L-}', txt) # pound + txt = re.sub(u'¥|¥|¥', r'{Y=}', txt) # yen + txt = re.sub(u'©|©|©', r'{(c)}', txt) # copyright + txt = re.sub(u'®|®|®', r'{(r)}', txt) # registered + txt = re.sub(u'¼|¼|¼', r'{1/4}', txt) # quarter + txt = re.sub(u'½|½|½', r'{1/2}', txt) # half + txt = re.sub(u'¾|¾|¾', r'{3/4}', txt) # three-quarter + txt = re.sub(u'À|À|À', r'{A`)}', txt) # A-grave + txt = re.sub(u'Á|Á|Á', r"{A'}", txt) # A-acute + txt = re.sub(u'Â|Â|Â', r'{A^}', txt) # A-circumflex + txt = re.sub(u'Ã|Ã|Ã', r'{A~}', txt) # A-tilde + txt = re.sub(u'Ä|Ä|Ä', r'{A"}', txt) # A-umlaut + txt = re.sub(u'Å|Å|Å', r'{Ao}', txt) # A-ring + txt = re.sub(u'Æ|Æ|Æ', r'{AE}', txt) # AE + txt = re.sub(u'Ç|Ç|Ç', r'{C,}', txt) # C-cedilla + txt = re.sub(u'È|È|È', r'{E`}', txt) # E-grave + txt = re.sub(u'É|É|É', r"{E'}", txt) # E-acute + txt = re.sub(u'Ê|Ê|Ê', r'{E^}', txt) # E-circumflex + txt = re.sub(u'Ë|Ë|Ë', r'{E"}', txt) # E-umlaut + txt = re.sub(u'Ì|Ì|Ì', r'{I`}', txt) # I-grave + txt = re.sub(u'Í|Í|Í', r"{I'}", txt) # I-acute + txt = re.sub(u'Î|Î|Î', r'{I^}', txt) # I-circumflex + txt = re.sub(u'Ï|Ï|Ï', r'{I"}', txt) # I-umlaut + txt = re.sub(u'Ð|Ð|Ð', r'{D-}', txt) # ETH + txt = re.sub(u'Ñ|Ñ|Ñ', r'{N~}', txt) # N-tilde + txt = re.sub(u'Ò|Ò|Ò', r'{O`}', txt) # 
O-grave + txt = re.sub(u'Ó|Ó|Ó', r"{O'}", txt) # O-acute + txt = re.sub(u'Ô|Ô|Ô', r'{O^}', txt) # O-circumflex + txt = re.sub(u'Õ|Õ|Õ', r'{O~}', txt) # O-tilde + txt = re.sub(u'Ö|Ö|Ö', r'{O"}', txt) # O-umlaut + txt = re.sub(u'×|×|×', r'{x}', txt) # dimension + txt = re.sub(u'Ø|Ø|Ø', r'{O/}', txt) # O-slash + txt = re.sub(u'Ù|Ù|Ù', r"{U`}", txt) # U-grave + txt = re.sub(u'Ú|Ú|Ú', r"{U'}", txt) # U-acute + txt = re.sub(u'Û|Û|Û', r'{U^}', txt) # U-circumflex + txt = re.sub(u'Ü|Ü|Ü', r'{U"}', txt) # U-umlaut + txt = re.sub(u'Ý|Ý|Ý', r"{Y'}", txt) # Y-grave + txt = re.sub(u'ß|ß|ß', r'{sz}', txt) # sharp-s + txt = re.sub(u'à|à|à', r'{a`}', txt) # a-grave + txt = re.sub(u'á|á|á', r"{a'}", txt) # a-acute + txt = re.sub(u'â|â|â', r'{a^}', txt) # a-circumflex + txt = re.sub(u'ã|ã|ã', r'{a~}', txt) # a-tilde + txt = re.sub(u'ä|ä|ä', r'{a"}', txt) # a-umlaut + txt = re.sub(u'å|å|å', r'{ao}', txt) # a-ring + txt = re.sub(u'æ|æ|æ', r'{ae}', txt) # ae + txt = re.sub(u'ç|ç|ç', r'{c,}', txt) # c-cedilla + txt = re.sub(u'è|è|è', r'{e`}', txt) # e-grave + txt = re.sub(u'é|é|é', r"{e'}", txt) # e-acute + txt = re.sub(u'ê|ê|ê', r'{e^}', txt) # e-circumflex + txt = re.sub(u'ë|ë|ë', r'{e"}', txt) # e-umlaut + txt = re.sub(u'ì|ì|ì', r'{i`}', txt) # i-grave + txt = re.sub(u'í|í|í', r"{i'}", txt) # i-acute + txt = re.sub(u'î|î|î', r'{i^}', txt) # i-circumflex + txt = re.sub(u'ï|ï|ï', r'{i"}', txt) # i-umlaut + txt = re.sub(u'ð|ð|ð', r'{d-}', txt) # eth + txt = re.sub(u'ñ|ñ|ñ', r'{n~}', txt) # n-tilde + txt = re.sub(u'ò|ò|ò', r'{o`}', txt) # o-grave + txt = re.sub(u'ó|ó|ó', r"{o'}", txt) # o-acute + txt = re.sub(u'ô|ô|ô', r'{o^}', txt) # o-circumflex + txt = re.sub(u'õ|õ|õ', r'{o~}', txt) # o-tilde + txt = re.sub(u'ö|ö|ö', r'{o"}', txt) # o-umlaut + txt = re.sub(u'ø|ø|ø', r'{o/}', txt) # o-stroke + txt = re.sub(u'ù|ù|ù', r'{u`}', txt) # u-grave + txt = re.sub(u'ú|ú|ú', r"{u'}", txt) # u-acute + txt = re.sub(u'û|û|û', r'{u^}', txt) # u-circumflex + txt = re.sub(u'ü|ü|ü', r'{u"}', txt) # u-umlaut + txt = re.sub(u'ý|ý|ý', r"{y'}", txt) # y-acute + txt = re.sub(u'ÿ|ÿ|ÿ', r'{y"}', txt) # y-umlaut + + txt = re.sub(u'Č|Č|Č', r'{Cˇ}', txt) # C-caron + txt = re.sub(u'č|č|č', r'{cˇ}', txt) # c-caron + txt = re.sub(u'Ď|Ď|Ď', r'{Dˇ}', txt) # D-caron + txt = re.sub(u'ď|ď|ď', r'{dˇ}', txt) # d-caron + txt = re.sub(u'Ě|Ě|Ě', r'{Eˇ}', txt) # E-caron + txt = re.sub(u'ě|ě|ě', r'{eˇ}', txt) # e-caron + txt = re.sub(u'Ĺ|Ĺ|Ĺ', r"{L'}", txt) # L-acute + txt = re.sub(u'ĺ|ĺ|ĺ', r"{l'}", txt) # l-acute + txt = re.sub(u'Ľ|Ľ|Ľ', r'{Lˇ}', txt) # L-caron + txt = re.sub(u'ľ|ľ|ľ', r'{lˇ}', txt) # l-caron + txt = re.sub(u'Ň|Ň|Ň', r'{Nˇ}', txt) # N-caron + txt = re.sub(u'ň|ň|ň', r'{nˇ}', txt) # n-caron + + txt = re.sub(u'Œ|Œ|Œ', r'{OE}', txt) # OE + txt = re.sub(u'œ|œ|œ', r'{oe}', txt) # oe + + txt = re.sub(u'Ŕ|Ŕ|Ŕ', r"{R'}", txt) # R-acute + txt = re.sub(u'ŕ|ŕ|ŕ', r"{r'}", txt) # r-acute + txt = re.sub(u'Ř|Ř|Ř', r'{Rˇ}', txt) # R-caron + txt = re.sub(u'ř|ř|ř', r'{rˇ}', txt) # r-caron + txt = re.sub(u'Ŝ|Ŝ', r'{S^}', txt) # S-circumflex + txt = re.sub(u'ŝ|ŝ', r'{s^}', txt) # s-circumflex + txt = re.sub(u'Š|Š|Š', r'{Sˇ}', txt) # S-caron + txt = re.sub(u'š|š|š', r'{sˇ}', txt) # s-caron + txt = re.sub(u'Ť|Ť|Ť', r'{Tˇ}', txt) # T-caron + txt = re.sub(u'ť|ť|ť', r'{tˇ}', txt) # t-caron + txt = re.sub(u'Ů|Ů|Ů', r'{U°}', txt) # U-ring + txt = re.sub(u'ů|ů|ů', r'{u°}', txt) # u-ring + txt = re.sub(u'Ž|Ž|Ž', r'{Zˇ}', txt) # Z-caron + txt = re.sub(u'ž|ž|ž', r'{zˇ}', txt) # z-caron + + txt = re.sub(u'•|•|•', r'{*}', txt) # bullet + txt = re.sub(u'₣|₣', 
r'{Fr}', txt) # Franc + txt = re.sub(u'₤|₤', r'{L=}', txt) # Lira + txt = re.sub(u'₨|₨', r'{Rs}', txt) # Rupee + txt = re.sub(u'€|€|€', r'{C=}', txt) # euro + txt = re.sub(u'™|™|™', r'{tm}', txt) # trademark + txt = re.sub(u'♠|♠|♠', r'{spade}', txt) # spade + txt = re.sub(u'♣|♣|♣', r'{club}', txt) # club + txt = re.sub(u'♥|♥|♥', r'{heart}', txt) # heart + txt = re.sub(u'♦|♦|♦', r'{diamond}', txt) # diamond + + # Move into main code? + # txt = re.sub(u'\xa0', r'p. ', txt) # blank paragraph + # txt = re.sub(u'\n\n\n\n', r'\n\np. \n\n', txt) # blank paragraph + # txt = re.sub(u'\n \n', r'\n
    \n', txt) # blank paragraph - br tag + + return txt diff --git a/ebook_converter/ebooks/txt/markdownml.py b/ebook_converter/ebooks/txt/markdownml.py new file mode 100644 index 0000000..4c25c47 --- /dev/null +++ b/ebook_converter/ebooks/txt/markdownml.py @@ -0,0 +1,286 @@ +# -*- coding: utf-8 -*- +from __future__ import absolute_import, division, print_function, unicode_literals + +__license__ = 'GPL 3' +__copyright__ = '''2011, John Schember +2011, Leigh Parry ''' +__docformat__ = 'restructuredtext en' + +''' +Transform OEB content into Textile formatted plain text +''' +import re + +from functools import partial + +from calibre.ebooks.htmlz.oeb2html import OEB2HTML +from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace, rewrite_links +from calibre.ebooks.oeb.stylizer import Stylizer +from polyglot.builtins import unicode_type, string_or_bytes + + +class MarkdownMLizer(OEB2HTML): + + def extract_content(self, oeb_book, opts): + self.log.info('Converting XHTML to Markdown formatted TXT...') + self.opts = opts + self.in_code = False + self.in_pre = False + self.list = [] + self.blockquotes = 0 + self.remove_space_after_newline = False + self.base_hrefs = [item.href for item in oeb_book.spine] + self.map_resources(oeb_book) + + self.style_bold = False + self.style_italic = False + + txt = self.mlize_spine(oeb_book) + + # Do some tidying up + txt = self.tidy_up(txt) + + return txt + + def mlize_spine(self, oeb_book): + output = [''] + for item in oeb_book.spine: + self.log.debug('Converting %s to Markdown formatted TXT...' % item.href) + self.rewrite_ids(item.data, item) + rewrite_links(item.data, partial(self.rewrite_link, page=item)) + stylizer = Stylizer(item.data, item.href, oeb_book, self.opts, self.opts.output_profile) + output += self.dump_text(item.data.find(XHTML('body')), stylizer) + output.append('\n\n') + return ''.join(output) + + def tidy_up(self, text): + # Remove blank space form beginning of paragraph. + text = re.sub('(?msu)^[ ]{1,3}', '', text) + # pre has 4 spaces. We trimmed 3 so anything with a space left is a pre. + text = re.sub('(?msu)^[ ]', ' ', text) + + # Remove tabs that aren't at the beinning of a line + new_text = [] + for l in text.splitlines(): + start = re.match('\t+', l) + if start: + start = start.group() + else: + start = '' + l = re.sub('\t', '', l) + new_text.append(start + l) + text = '\n'.join(new_text) + + # Remove spaces from blank lines. + text = re.sub('(?msu)^[ ]+$', '', text) + + # Reduce blank lines + text = re.sub('(?msu)\n{7,}', '\n' * 6, text) + + # Remove blank lines at beginning and end of document. + text = re.sub(r'^\s*', '', text) + text = re.sub(r'\s*$', '\n\n', text) + + return text + + def remove_newlines(self, text): + text = text.replace('\r\n', ' ') + text = text.replace('\n', ' ') + text = text.replace('\r', ' ') + # Condense redundant spaces created by replacing newlines with spaces. + text = re.sub(r'[ ]{2,}', ' ', text) + text = re.sub(r'\t+', '', text) + if self.remove_space_after_newline == True: # noqa + text = re.sub(r'^ +', '', text) + self.remove_space_after_newline = False + return text + + def prepare_string_for_markdown(self, txt): + txt = re.sub(r'([\\`*_{}\[\]()#+!])', r'\\\1', txt) + return txt + + def prepare_string_for_pre(self, txt): + new_text = [] + for l in txt.splitlines(): + new_text.append(' ' + l) + return '\n'.join(new_text) + + def dump_text(self, elem, stylizer): + ''' + @elem: The element in the etree that we are working on. 
+ @stylizer: The style information attached to the element. + ''' + + # We can only processes tags. If there isn't a tag return any text. + if not isinstance(elem.tag, string_or_bytes) \ + or namespace(elem.tag) != XHTML_NS: + p = elem.getparent() + if p is not None and isinstance(p.tag, string_or_bytes) and namespace(p.tag) == XHTML_NS \ + and elem.tail: + return [elem.tail] + return [''] + + # Setup our variables. + text = [] + style = stylizer.style(elem) + tags = [] + tag = barename(elem.tag) + attribs = elem.attrib + + # Ignore anything that is set to not be displayed. + if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \ + or style['visibility'] == 'hidden': + if hasattr(elem, 'tail') and elem.tail: + return [elem.tail] + return [''] + + # Soft scene breaks. + if 'margin-top' in style.cssdict() and style['margin-top'] != 'auto': + ems = int(round(float(style.marginTop) / style.fontSize) - 1) + if ems >= 1: + text.append(u'\n\n' * ems) + + bq = '> ' * self.blockquotes + # Block level elements + if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div'): + h_tag = '' + if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'): + h_tag = '#' * int(tag[1]) + ' ' + text.append('\n' + bq + h_tag) + tags.append('\n') + self.remove_space_after_newline = True + + if style['font-style'] == 'italic' or tag in ('i', 'em'): + if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'cite'): + if self.style_italic == False: # noqa + text.append('*') + tags.append('*') + self.style_italic = True + if style['font-weight'] in ('bold', 'bolder') or tag in ('b', 'strong'): + if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'th'): + if self.style_bold == False: # noqa + text.append('**') + tags.append('**') + self.style_bold = True + if tag == 'br': + text.append(' \n') + self.remove_space_after_newline = True + if tag == 'blockquote': + self.blockquotes += 1 + tags.append('>') + text.append('> ' * self.blockquotes) + elif tag == 'code': + if not self.in_pre and not self.in_code: + text.append('`') + tags.append('`') + self.in_code = True + elif tag == 'pre': + if not self.in_pre: + text.append('\n') + tags.append('pre') + self.in_pre = True + elif tag == 'hr': + text.append('\n* * *') + tags.append('\n') + elif tag == 'a': + # Only write links with absolute (external) urls. + if self.opts.keep_links and 'href' in attribs and '://' in attribs['href']: + title = '' + if 'title' in attribs: + title = ' "' + attribs['title'] + '"' + remove_space = self.remove_space_after_newline + title = self.remove_newlines(title) + self.remove_space_after_newline = remove_space + text.append('[') + tags.append('](' + attribs['href'] + title + ')') + elif tag == 'img': + if self.opts.keep_image_references: + txt = '!' + if 'alt' in attribs: + remove_space = self.remove_space_after_newline + txt += '[' + self.remove_newlines(attribs['alt']) + ']' + self.remove_space_after_newline = remove_space + txt += '(' + attribs['src'] + ')' + text.append(txt) + elif tag in ('ol', 'ul'): + tags.append(tag) + # Add the list to our lists of lists so we can track + # nested lists. + self.list.append({'name': tag, 'num': 0}) + elif tag == 'li': + # Get the last list from our list of lists + if self.list: + li = self.list[-1] + else: + li = {'name': 'ul', 'num': 0} + # Add a new line to start the item + text.append('\n') + # Add indent if we have nested lists. + list_count = len(self.list) + # We only care about indenting nested lists. 
+ if (list_count - 1) > 0: + text.append('\t' * (list_count - 1)) + # Add blockquote if we have a blockquote in a list item. + text.append(bq) + # Write the proper sign for ordered and unorded lists. + if li['name'] == 'ul': + text.append('+ ') + elif li['name'] == 'ol': + li['num'] += 1 + text.append(unicode_type(li['num']) + '. ') + + # Process tags that contain text. + if hasattr(elem, 'text') and elem.text: + txt = elem.text + if self.in_pre: + txt = self.prepare_string_for_pre(txt) + elif self.in_code: + txt = self.remove_newlines(txt) + else: + txt = self.prepare_string_for_markdown(self.remove_newlines(txt)) + text.append(txt) + + # Recurse down into tags within the tag we are in. + for item in elem: + text += self.dump_text(item, stylizer) + + # Close all open tags. + tags.reverse() + for t in tags: + if t in ('pre', 'ul', 'ol', '>'): + if t == 'pre': + self.in_pre = False + text.append('\n') + elif t == '>': + self.blockquotes -= 1 + elif t in ('ul', 'ol'): + if self.list: + self.list.pop() + text.append('\n') + else: + if t == '**': + self.style_bold = False + elif t == '*': + self.style_italic = False + elif t == '`': + self.in_code = False + text.append('%s' % t) + + # Soft scene breaks. + if 'margin-bottom' in style.cssdict() and style['margin-bottom'] != 'auto': + ems = int(round((float(style.marginBottom) / style.fontSize) - 1)) + if ems >= 1: + text.append(u'\n\n' * ems) + + # Add the text that is outside of the tag. + if hasattr(elem, 'tail') and elem.tail: + tail = elem.tail + if self.in_pre: + tail = self.prepare_string_for_pre(tail) + elif self.in_code: + tail = self.remove_newlines(tail) + else: + tail = self.prepare_string_for_markdown(self.remove_newlines(tail)) + text.append(tail) + + return text diff --git a/ebook_converter/ebooks/txt/newlines.py b/ebook_converter/ebooks/txt/newlines.py new file mode 100644 index 0000000..56a890c --- /dev/null +++ b/ebook_converter/ebooks/txt/newlines.py @@ -0,0 +1,32 @@ +# -*- coding: utf-8 -*- +from __future__ import absolute_import, division, print_function, unicode_literals + +__license__ = 'GPL 3' +__copyright__ = '2009, John Schember ' +__docformat__ = 'restructuredtext en' + +import os + + +class TxtNewlines(object): + + NEWLINE_TYPES = { + 'system' : os.linesep, + 'unix' : '\n', + 'old_mac' : '\r', + 'windows' : '\r\n' + } + + def __init__(self, newline_type): + self.newline = self.NEWLINE_TYPES.get(newline_type.lower(), os.linesep) + + +def specified_newlines(newline, text): + # Convert all newlines to \n + text = text.replace('\r\n', '\n') + text = text.replace('\r', '\n') + + if newline == '\n': + return text + + return text.replace('\n', newline) diff --git a/ebook_converter/ebooks/txt/textileml.py b/ebook_converter/ebooks/txt/textileml.py new file mode 100644 index 0000000..e9eecfc --- /dev/null +++ b/ebook_converter/ebooks/txt/textileml.py @@ -0,0 +1,502 @@ +# -*- coding: utf-8 -*- +from __future__ import absolute_import, division, print_function, unicode_literals + +__license__ = 'GPL 3' +__copyright__ = '2011, Leigh Parry ' +__docformat__ = 'restructuredtext en' + +''' +Transform OEB content into Textile formatted plain text +''' +import re + +from functools import partial + +from calibre.ebooks.htmlz.oeb2html import OEB2HTML +from calibre.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace, rewrite_links +from calibre.ebooks.oeb.stylizer import Stylizer +from calibre.ebooks import unit_convert +from calibre.ebooks.textile.unsmarten import unsmarten +from polyglot.builtins import string_or_bytes + + +class 
TextileMLizer(OEB2HTML): + + MAX_EM = 10 + + def extract_content(self, oeb_book, opts): + self.log.info('Converting XHTML to Textile formatted TXT...') + self.opts = opts + self.in_pre = False + self.in_table = False + self.links = {} + self.list = [] + self.our_links = [] + self.in_a_link = False + self.our_ids = [] + self.images = {} + self.id_no_text = '' + self.style_embed = [] + self.remove_space_after_newline = False + self.base_hrefs = [item.href for item in oeb_book.spine] + self.map_resources(oeb_book) + + self.style_bold = False + self.style_italic = False + self.style_under = False + self.style_strike = False + self.style_smallcap = False + + txt = self.mlize_spine(oeb_book) + if self.opts.unsmarten_punctuation: + txt = unsmarten(txt) + + # Do some tidying up + txt = self.tidy_up(txt) + + return txt + + def mlize_spine(self, oeb_book): + output = [''] + for item in oeb_book.spine: + self.log.debug('Converting %s to Textile formatted TXT...' % item.href) + self.rewrite_ids(item.data, item) + rewrite_links(item.data, partial(self.rewrite_link, page=item)) + stylizer = Stylizer(item.data, item.href, oeb_book, self.opts, self.opts.output_profile) + output += self.dump_text(item.data.find(XHTML('body')), stylizer) + output.append('\n\n') + return ''.join(output) + + def tidy_up(self, text): + # May need tweaking and finetuning + def check_escaping(text, tests): + for t in tests: + # I'm not checking for duplicated spans '%' as any that follow each other were being incorrectly merged + txt = '%s' % t + if txt != '%': + text = re.sub(r'([^'+t+'|^\n])'+t+r'\]\['+t+'([^'+t+'])', r'\1\2', text) + text = re.sub(r'([^'+t+'|^\n])'+t+t+'([^'+t+'])', r'\1\2', text) + text = re.sub(r'(\s|[*_\'"])\[('+t+'[a-zA-Z0-9 \'",.*_]+'+t+r')\](\s|[*_\'"?!,.])', r'\1\2\3', text) + return text + + # Now tidyup links and ids - remove ones that don't have a correponding opposite + if self.opts.keep_links: + for i in self.our_links: + if i[0] == '#': + if i not in self.our_ids: + text = re.sub(r'"(.+)":'+i+r'(\s)', r'\1\2', text) + for i in self.our_ids: + if i not in self.our_links: + text = re.sub(r'%?\('+i+'\\)\xa0?%?', r'', text) + + # Remove obvious non-needed escaping, add sub/sup-script ones + text = check_escaping(text, [r'\*', '_', r'\*']) + # escape the super/sub-scripts if needed + text = re.sub(r'(\w)([~^]\w+[~^])', r'\1[\2]', text) + # escape the super/sub-scripts if needed + text = re.sub(r'([~^]\w+[~^])(\w)', r'[\1]\2', text) + + # remove empty spans + text = re.sub(r'%\xa0+', r'%', text) + # remove empty spans - MAY MERGE SOME ? + text = re.sub(r'%%', r'', text) + # remove spans from tagged output + text = re.sub(r'%([_+*-]+)%', r'\1', text) + # remove spaces before a newline + text = re.sub(r' +\n', r'\n', text) + # remove newlines at top of file + text = re.sub(r'^\n+', r'', text) + # correct blockcode paras + text = re.sub(r'\npre\.\n?\nbc\.', r'\nbc.', text) + # correct blockquote paras + text = re.sub(r'\nbq\.\n?\np.*?\. ', r'\nbq. ', text) + + # reduce blank lines + text = re.sub(r'\n{3}', r'\n\np. \n\n', text) + text = re.sub(u'%\n(p[<>=]{1,2}\\.|p\\.)', r'%\n\n\1', text) + # Check span following blank para + text = re.sub(r'\n+ +%', r' %', text) + text = re.sub(u'p[<>=]{1,2}\\.\n\n?', r'', text) + # blank paragraph + text = re.sub(r'\n(p.*\.)\n', r'\n\1 \n\n', text) + # blank paragraph + text = re.sub(u'\n\xa0', r'\np. ', text) + # blank paragraph + text = re.sub(u'\np[<>=]{1,2}?\\. \xa0', r'\np. ', text) + text = re.sub(r'(^|\n)(p.*\. 
?\n)(p.*\.)', r'\1\3', text) + text = re.sub(r'\n(p\. \n)(p.*\.|h.*\.)', r'\n\2', text) + # sort out spaces in tables + text = re.sub(r' {2,}\|', r' |', text) + + # Now put back spaces removed earlier as they're needed here + text = re.sub(r'\np\.\n', r'\np. \n', text) + # reduce blank lines + text = re.sub(r' \n\n\n', r' \n\n', text) + + return text + + def remove_newlines(self, text): + text = text.replace('\r\n', ' ') + text = text.replace('\n', ' ') + text = text.replace('\r', ' ') + # Condense redundant spaces created by replacing newlines with spaces. + text = re.sub(r'[ ]{2,}', ' ', text) + text = re.sub(r'\t+', '', text) + if self.remove_space_after_newline == True: # noqa + text = re.sub(r'^ +', '', text) + self.remove_space_after_newline = False + return text + + def check_styles(self, style): + txt = '{' + if self.opts.keep_color: + if 'color' in style.cssdict() and style['color'] != 'black': + txt += 'color:'+style['color']+';' + if 'background' in style.cssdict(): + txt += 'background:'+style['background']+';' + txt += '}' + if txt == '{}': + txt = '' + return txt + + def check_halign(self, style): + tests = {'left':'<','justify':'<>','center':'=','right':'>'} + for i in tests: + if style['text-align'] == i: + return tests[i] + return '' + + def check_valign(self, style): + tests = {'top':'^','bottom':'~'} # , 'middle':'-'} + for i in tests: + if style['vertical-align'] == i: + return tests[i] + return '' + + def check_padding(self, style, stylizer): + txt = '' + left_padding_pts = 0 + left_margin_pts = 0 + if 'padding-left' in style.cssdict() and style['padding-left'] != 'auto': + left_padding_pts = unit_convert(style['padding-left'], style.width, style.fontSize, stylizer.profile.dpi) + if 'margin-left' in style.cssdict() and style['margin-left'] != 'auto': + left_margin_pts = unit_convert(style['margin-left'], style.width, style.fontSize, stylizer.profile.dpi) + left = left_margin_pts + left_padding_pts + emleft = min(int(round(left / stylizer.profile.fbase)), self.MAX_EM) + if emleft >= 1: + txt += '(' * emleft + right_padding_pts = 0 + right_margin_pts = 0 + if 'padding-right' in style.cssdict() and style['padding-right'] != 'auto': + right_padding_pts = unit_convert(style['padding-right'], style.width, style.fontSize, stylizer.profile.dpi) + if 'margin-right' in style.cssdict() and style['margin-right'] != 'auto': + right_margin_pts = unit_convert(style['margin-right'], style.width, style.fontSize, stylizer.profile.dpi) + right = right_margin_pts + right_padding_pts + emright = min(int(round(right / stylizer.profile.fbase)), self.MAX_EM) + if emright >= 1: + txt += ')' * emright + + return txt + + def check_id_tag(self, attribs): + txt = '' + if 'id' in attribs: + txt = '(#'+attribs['id']+ ')' + self.our_ids.append('#'+attribs['id']) + self.id_no_text = u'\xa0' + return txt + + def build_block(self, tag, style, attribs, stylizer): + txt = '\n' + tag + if self.opts.keep_links: + txt += self.check_id_tag(attribs) + txt += self.check_padding(style, stylizer) + txt += self.check_halign(style) + txt += self.check_styles(style) + return txt + + def prepare_string_for_textile(self, txt): + if re.search(r'(\s([*&_+\-~@%|]|\?{2})\S)|(\S([*&_+\-~@%|]|\?{2})\s)', txt): + return ' ==%s== ' % txt + return txt + + def dump_text(self, elem, stylizer): + ''' + @elem: The element in the etree that we are working on. + @stylizer: The style information attached to the element. + ''' + + # We can only processes tags. If there isn't a tag return any text. 
+ if not isinstance(elem.tag, string_or_bytes) \ + or namespace(elem.tag) != XHTML_NS: + p = elem.getparent() + if p is not None and isinstance(p.tag, string_or_bytes) and namespace(p.tag) == XHTML_NS \ + and elem.tail: + return [elem.tail] + return [''] + + # Setup our variables. + text = [''] + style = stylizer.style(elem) + tags = [] + tag = barename(elem.tag) + attribs = elem.attrib + + # Ignore anything that is set to not be displayed. + if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \ + or style['visibility'] == 'hidden': + if hasattr(elem, 'tail') and elem.tail: + return [elem.tail] + return [''] + + # Soft scene breaks. + if 'margin-top' in style.cssdict() and style['margin-top'] != 'auto': + ems = min(int(round(float(style.marginTop) / style.fontSize) - 1), self.MAX_EM) + if ems >= 1: + text.append(u'\n\n\xa0' * ems) + + if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div'): + if tag == 'div': + tag = 'p' + text.append(self.build_block(tag, style, attribs, stylizer)) + text.append('. ') + tags.append('\n') + + if style['font-style'] == 'italic' or tag in ('i', 'em'): + if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'cite'): + if self.style_italic == False: # noqa + if self.in_a_link: + text.append('_') + tags.append('_') + else: + text.append('[_') + tags.append('_]') + self.style_embed.append('_') + self.style_italic = True + if style['font-weight'] in ('bold', 'bolder') or tag in ('b', 'strong'): + if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'th'): + if self.style_bold == False: # noqa + if self.in_a_link: + text.append('*') + tags.append('*') + else: + text.append('[*') + tags.append('*]') + self.style_embed.append('*') + self.style_bold = True + if style['text-decoration'] == 'underline' or tag in ('u', 'ins'): + if tag != 'a': + if self.style_under == False: # noqa + text.append('[+') + tags.append('+]') + self.style_embed.append('+') + self.style_under = True + if style['text-decoration'] == 'line-through' or tag in ('strike', 'del', 's'): + if self.style_strike == False: # noqa + text.append('[-') + tags.append('-]') + self.style_embed.append('-') + self.style_strike = True + if tag == 'br': + for i in reversed(self.style_embed): + text.append(i) + text.append('\n') + for i in self.style_embed: + text.append(i) + tags.append('') + self.remove_space_after_newline = True + if tag == 'blockquote': + text.append('\nbq. ') + tags.append('\n') + elif tag in ('abbr', 'acronym'): + text.append('') + txt = attribs['title'] + tags.append('(' + txt + ')') + elif tag == 'sup': + text.append('^') + tags.append('^') + elif tag == 'sub': + text.append('~') + tags.append('~') + elif tag == 'code': + if self.in_pre: + text.append('\nbc. ') + tags.append('') + else: + text.append('@') + tags.append('@') + elif tag == 'cite': + text.append('??') + tags.append('??') + elif tag == 'hr': + text.append('\n***') + tags.append('\n') + elif tag == 'pre': + self.in_pre = True + text.append('\npre. ') + tags.append('pre\n') + elif tag == 'a': + if self.opts.keep_links: + if 'href' in attribs: + text.append('"') + tags.append('a') + tags.append('":' + attribs['href']) + self.our_links.append(attribs['href']) + if 'title' in attribs: + tags.append('(' + attribs['title'] + ')') + self.in_a_link = True + else: + text.append('%') + tags.append('%') + elif tag == 'img': + if self.opts.keep_image_references: + txt = '!' 
+ self.check_halign(style) + txt += self.check_valign(style) + txt += attribs['src'] + text.append(txt) + if 'alt' in attribs: + txt = attribs['alt'] + if txt != '': + text.append('(' + txt + ')') + tags.append('!') + elif tag in ('ol', 'ul'): + self.list.append({'name': tag, 'num': 0}) + text.append('') + tags.append(tag) + elif tag == 'li': + if self.list: + li = self.list[-1] + else: + li = {'name': 'ul', 'num': 0} + text.append('\n') + if li['name'] == 'ul': + text.append('*' * len(self.list) + ' ') + elif li['name'] == 'ol': + text.append('#' * len(self.list) + ' ') + tags.append('') + elif tag == 'dl': + text.append('\n') + tags.append('') + elif tag == 'dt': + text.append('') + tags.append('\n') + elif tag == 'dd': + text.append(' ') + tags.append('') + elif tag == 'dd': + text.append('') + tags.append('\n') + elif tag == 'table': + txt = self.build_block(tag, style, attribs, stylizer) + txt += '. \n' + if txt != '\ntable. \n': + text.append(txt) + else: + text.append('\n') + tags.append('') + elif tag == 'tr': + txt = self.build_block('', style, attribs, stylizer) + txt += '. ' + if txt != '\n. ': + txt = re.sub('\n', '', txt) + text.append(txt) + tags.append('|\n') + elif tag == 'td': + text.append('|') + txt = '' + txt += self.check_halign(style) + txt += self.check_valign(style) + if 'colspan' in attribs: + txt += '\\' + attribs['colspan'] + if 'rowspan' in attribs: + txt += '/' + attribs['rowspan'] + txt += self.check_styles(style) + if txt != '': + text.append(txt + '. ') + tags.append('') + elif tag == 'th': + text.append('|_. ') + tags.append('') + elif tag == 'span': + if style['font-variant'] == 'small-caps': + if self.style_smallcap == False: # noqa + text.append('&') + tags.append('&') + self.style_smallcap = True + else: + if self.in_a_link == False: # noqa + txt = '%' + if self.opts.keep_links: + txt += self.check_id_tag(attribs) + txt += self.check_styles(style) + if txt != '%': + text.append(txt) + tags.append('%') + + if self.opts.keep_links and 'id' in attribs: + if tag not in ('body', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'span', 'table'): + text.append(self.check_id_tag(attribs)) + + # Process the styles for any that we want to keep + if tag not in ('body', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'hr', 'a', 'img', + 'span', 'table', 'tr', 'td'): + if not self.in_a_link: + text.append(self.check_styles(style)) + + # Process tags that contain text. + if hasattr(elem, 'text') and elem.text: + txt = elem.text + if not self.in_pre: + txt = self.prepare_string_for_textile(self.remove_newlines(txt)) + text.append(txt) + self.id_no_text = u'' + + # Recurse down into tags within the tag we are in. + for item in elem: + text += self.dump_text(item, stylizer) + + # Close all open tags. + tags.reverse() + for t in tags: + if t in ('pre', 'ul', 'ol', 'li', 'table'): + if t == 'pre': + self.in_pre = False + elif t in ('ul', 'ol'): + if self.list: + self.list.pop() + if not self.list: + text.append('\n') + else: + if t == 'a': + self.in_a_link = False + t = '' + text.append(self.id_no_text) + self.id_no_text = u'' + if t in ('*]', '*'): + self.style_bold = False + elif t in ('_]', '_'): + self.style_italic = False + elif t == '+]': + self.style_under = False + elif t == '-]': + self.style_strike = False + elif t == '&': + self.style_smallcap = False + if t in ('*]', '_]', '+]', '-]', '*', '_'): + txt = self.style_embed.pop() + text.append('%s' % t) + + # Soft scene breaks. 
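+        # This mirrors the margin-top handling at the top of this method: a
+        # bottom margin of roughly N em (capped at MAX_EM) becomes N empty
+        # "\n\n\xa0" paragraphs so the spacing survives in the Textile output.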
+ if 'margin-bottom' in style.cssdict() and style['margin-bottom'] != 'auto': + ems = min(int(round((float(style.marginBottom) / style.fontSize) - 1)), self.MAX_EM) + if ems >= 1: + text.append(u'\n\n\xa0' * ems) + + # Add the text that is outside of the tag. + if hasattr(elem, 'tail') and elem.tail: + tail = elem.tail + if not self.in_pre: + tail = self.prepare_string_for_textile(self.remove_newlines(tail)) + text.append(tail) + + return text diff --git a/ebook_converter/ebooks/txt/txtml.py b/ebook_converter/ebooks/txt/txtml.py new file mode 100644 index 0000000..14cc7d8 --- /dev/null +++ b/ebook_converter/ebooks/txt/txtml.py @@ -0,0 +1,264 @@ +# -*- coding: utf-8 -*- +from __future__ import absolute_import, division, print_function, unicode_literals + +__license__ = 'GPL 3' +__copyright__ = '2009, John Schember ' +__docformat__ = 'restructuredtext en' + +''' +Transform OEB content into plain text +''' + +import re + +from lxml import etree +from polyglot.builtins import string_or_bytes + + +BLOCK_TAGS = [ + 'div', + 'p', + 'h1', + 'h2', + 'h3', + 'h4', + 'h5', + 'h6', + 'li', + 'tr', +] + +BLOCK_STYLES = [ + 'block', +] + +HEADING_TAGS = [ + 'h1', + 'h2', + 'h3', + 'h4', + 'h5', + 'h6', +] + +SPACE_TAGS = [ + 'td', + 'br', +] + + +class TXTMLizer(object): + + def __init__(self, log): + self.log = log + + def extract_content(self, oeb_book, opts): + self.log.info('Converting XHTML to TXT...') + self.oeb_book = oeb_book + self.opts = opts + self.toc_titles = [] + self.toc_ids = [] + self.last_was_heading = False + + self.create_flat_toc(self.oeb_book.toc) + + return self.mlize_spine() + + def mlize_spine(self): + from calibre.ebooks.oeb.base import XHTML + from calibre.ebooks.oeb.stylizer import Stylizer + from calibre.utils.xml_parse import safe_xml_fromstring + output = [u''] + output.append(self.get_toc()) + for item in self.oeb_book.spine: + self.log.debug('Converting %s to TXT...' % item.href) + for x in item.data.iterdescendants(etree.Comment): + if x.text and '--' in x.text: + x.text = x.text.replace('--', '__') + content = etree.tostring(item.data, encoding='unicode') + content = self.remove_newlines(content) + content = safe_xml_fromstring(content) + stylizer = Stylizer(content, item.href, self.oeb_book, self.opts, self.opts.output_profile) + output += self.dump_text(content.find(XHTML('body')), stylizer, item) + output += '\n\n\n\n\n\n' + output = ''.join(output) + output = '\n'.join(l.rstrip() for l in output.splitlines()) + output = self.cleanup_text(output) + + return output + + def remove_newlines(self, text): + self.log.debug('\tRemove newlines for processing...') + text = text.replace('\r\n', ' ') + text = text.replace('\n', ' ') + text = text.replace('\r', ' ') + # Condense redundant spaces created by replacing newlines with spaces. + text = re.sub(r'[ ]{2,}', ' ', text) + + return text + + def get_toc(self): + toc = [''] + if getattr(self.opts, 'inline_toc', None): + self.log.debug('Generating table of contents...') + toc.append('%s\n\n' % _('Table of Contents:')) + for item in self.toc_titles: + toc.append('* %s\n\n' % item) + return ''.join(toc) + + def create_flat_toc(self, nodes): + ''' + Turns a hierarchical list of TOC href's into a flat list. + ''' + for item in nodes: + self.toc_titles.append(item.title) + self.toc_ids.append(item.href) + self.create_flat_toc(item.nodes) + + def cleanup_text(self, text): + self.log.debug('\tClean up text...') + # Replace bad characters. 
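+        # u'\xa0' is a non-breaking space, typically left over from
+        # non-breaking-space entities; converting it to a plain space lets the
+        # line-wrapping code below break on it.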
text = text.replace(u'\xa0', ' ')
+
+        # Replace vertical tabs and form feeds with a single space. Tabs are
+        # left alone: they are the paragraph indents added when
+        # remove_paragraph_spacing is enabled and are handled below.
+        text = text.replace('\v', ' ')
+        text = text.replace('\f', ' ')
+
+        # Put each paragraph on a single line.
+        text = re.sub('(?<=.)\n(?=.)', ' ', text)
+
+        # Remove multiple spaces.
+        text = re.sub('[ ]{2,}', ' ', text)
+
+        # Remove excessive newlines.
+        text = re.sub('\n[ ]+\n', '\n\n', text)
+        if self.opts.remove_paragraph_spacing:
+            text = re.sub('\n{2,}', '\n', text)
+            text = re.sub(r'(?msu)^(?P<t>[^\t\n]+?)$', lambda mo: u'%s\n\n' % mo.group('t'), text)
+            text = re.sub(r'(?msu)(?P<b>[^\n])\n+(?P<t>[^\t\n]+?)(?=\n)', lambda mo: '%s\n\n\n\n\n\n%s' % (mo.group('b'), mo.group('t')), text)
+        else:
+            text = re.sub('\n{7,}', '\n\n\n\n\n\n', text)
+
+        # Remove spaces at the beginning and end of lines.
+        # We don't remove tabs because those are only added
+        # when remove paragraph spacing is enabled.
+        text = re.sub('(?imu)^[ ]+', '', text)
+        text = re.sub('(?imu)[ ]+$', '', text)
+
+        # Remove empty space and newlines at the beginning of the document.
+        text = re.sub(r'(?u)^[ \n]+', '', text)
+
+        if self.opts.max_line_length:
+            max_length = self.opts.max_line_length
+            if self.opts.max_line_length < 25 and not self.opts.force_max_line_length:
+                max_length = 25
+            short_lines = []
+            lines = text.splitlines()
+            for line in lines:
+                while len(line) > max_length:
+                    space = line.rfind(' ', 0, max_length)
+                    if space != -1:
+                        # Space was found.
+                        short_lines.append(line[:space])
+                        line = line[space + 1:]
+                    else:
+                        # Space was not found.
+                        if self.opts.force_max_line_length:
+                            # Force a break at max_length.
+                            short_lines.append(line[:max_length])
+                            line = line[max_length:]
+                        else:
+                            # Look for the first space after max_length.
+                            space = line.find(' ', max_length, len(line))
+                            if space != -1:
+                                # Space was found.
+                                short_lines.append(line[:space])
+                                line = line[space + 1:]
+                            else:
+                                # No space was found; the line cannot be broken.
+                                short_lines.append(line)
+                                line = ''
+                # Add the remaining text, which is shorter than max_length.
+                short_lines.append(line)
+            text = '\n'.join(short_lines)
+
+        return text
+
+    def dump_text(self, elem, stylizer, page):
+        '''
+        @elem: The element in the etree that we are working on.
+        @stylizer: The style information attached to the element.
+        @page: OEB page used to determine absolute URLs.
+        '''
+        from calibre.ebooks.oeb.base import XHTML_NS, barename, namespace
+
+        if not isinstance(elem.tag, string_or_bytes) \
+           or namespace(elem.tag) != XHTML_NS:
+            p = elem.getparent()
+            if p is not None and isinstance(p.tag, string_or_bytes) and namespace(p.tag) == XHTML_NS \
+                    and elem.tail:
+                return [elem.tail]
+            return ['']
+
+        text = ['']
+        style = stylizer.style(elem)
+
+        if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
+           or style['visibility'] == 'hidden':
+            if hasattr(elem, 'tail') and elem.tail:
+                return [elem.tail]
+            return ['']
+
+        tag = barename(elem.tag)
+        tag_id = elem.attrib.get('id', None)
+        in_block = False
+        in_heading = False
+
+        # Are we in a heading?
+        # This can either be a heading tag or a TOC item.
+        if tag in HEADING_TAGS or '%s#%s' % (page.href, tag_id) in self.toc_ids:
+            in_heading = True
+            if not self.last_was_heading:
+                text.append('\n\n\n\n\n\n')
+
+        # Are we in a paragraph block?
+        if tag in BLOCK_TAGS or style['display'] in BLOCK_STYLES:
+            if self.opts.remove_paragraph_spacing and not in_heading:
+                text.append('\t')
+            in_block = True
+
+        if tag in SPACE_TAGS:
+            text.append(' ')
+
+        # Hard scene breaks.
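+        # An hr element becomes a visible "* * *" separator; large top margins
+        # are approximated with blank lines just below.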
if tag == 'hr':
+            text.append('\n\n* * *\n\n')
+        # Soft scene breaks.
+        try:
+            # style.marginTop or style.fontSize may be missing or non-numeric.
+            ems = int(round((float(style.marginTop) / style.fontSize) - 1))
+            if ems >= 1:
+                text.append('\n' * ems)
+        except Exception:
+            pass
+
+        # Process tags that contain text.
+        if hasattr(elem, 'text') and elem.text:
+            text.append(elem.text)
+
+        # Recurse down into tags within the tag we are in.
+        for item in elem:
+            text += self.dump_text(item, stylizer, page)
+
+        if in_block:
+            text.append('\n\n')
+        if in_heading:
+            text.append('\n')
+            self.last_was_heading = True
+        else:
+            self.last_was_heading = False
+
+        if hasattr(elem, 'tail') and elem.tail:
+            text.append(elem.tail)
+
+        return text
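+
+# A minimal usage sketch, for reference only. It assumes `oeb_book` is an
+# already-parsed OEB book and `opts` is an options object providing the
+# attributes referenced above (inline_toc, remove_paragraph_spacing,
+# max_line_length, force_max_line_length, output_profile); the logger comes
+# from calibre's standard logging module.
+#
+#     from calibre.utils.logging import default_log
+#
+#     writer = TXTMLizer(default_log)
+#     txt = writer.extract_content(oeb_book, opts)
+#     with open('out.txt', 'wb') as f:
+#         f.write(txt.encode('utf-8'))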