mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-02-16 22:25:54 +01:00
This is progressing refactor of the calibre code to make it more readable, and transform it to something more coherent. In this patch, there are changes regarding imports for some modules, instead of polluting namespace of each module with some other modules symbols, which often were imported from other modules. Yuck.
502 lines
19 KiB
Python
502 lines
19 KiB
Python
"""
|
|
Transform OEB content into Textile formatted plain text
|
|
"""
|
|
import re
|
|
|
|
from functools import partial
|
|
|
|
from ebook_converter import constants as const
|
|
from ebook_converter.ebooks.htmlz.oeb2html import OEB2HTML
|
|
from ebook_converter.ebooks.oeb.base import XHTML, barename, namespace, \
|
|
rewrite_links
|
|
from ebook_converter.ebooks.oeb.stylizer import Stylizer
|
|
from ebook_converter.ebooks import unit_convert
|
|
from ebook_converter.ebooks.textile.unsmarten import unsmarten
|
|
|
|
|
|
__license__ = 'GPL 3'
|
|
__copyright__ = '2011, Leigh Parry <leighparry@blueyonder.co.uk>'
|
|
__docformat__ = 'restructuredtext en'
|
|
|
|
|
|
class TextileMLizer(OEB2HTML):
|
|
|
|
MAX_EM = 10
|
|
|
|
def extract_content(self, oeb_book, opts):
|
|
self.log.info('Converting XHTML to Textile formatted TXT...')
|
|
self.opts = opts
|
|
self.in_pre = False
|
|
self.in_table = False
|
|
self.links = {}
|
|
self.list = []
|
|
self.our_links = []
|
|
self.in_a_link = False
|
|
self.our_ids = []
|
|
self.images = {}
|
|
self.id_no_text = ''
|
|
self.style_embed = []
|
|
self.remove_space_after_newline = False
|
|
self.base_hrefs = [item.href for item in oeb_book.spine]
|
|
self.map_resources(oeb_book)
|
|
|
|
self.style_bold = False
|
|
self.style_italic = False
|
|
self.style_under = False
|
|
self.style_strike = False
|
|
self.style_smallcap = False
|
|
|
|
txt = self.mlize_spine(oeb_book)
|
|
if self.opts.unsmarten_punctuation:
|
|
txt = unsmarten(txt)
|
|
|
|
# Do some tidying up
|
|
txt = self.tidy_up(txt)
|
|
|
|
return txt
|
|
|
|
def mlize_spine(self, oeb_book):
|
|
output = ['']
|
|
for item in oeb_book.spine:
|
|
self.log.debug('Converting %s to Textile formatted TXT...' % item.href)
|
|
self.rewrite_ids(item.data, item)
|
|
rewrite_links(item.data, partial(self.rewrite_link, page=item))
|
|
stylizer = Stylizer(item.data, item.href, oeb_book, self.opts, self.opts.output_profile)
|
|
output += self.dump_text(item.data.find(XHTML('body')), stylizer)
|
|
output.append('\n\n')
|
|
return ''.join(output)
|
|
|
|
def tidy_up(self, text):
|
|
# May need tweaking and finetuning
|
|
def check_escaping(text, tests):
|
|
for t in tests:
|
|
# I'm not checking for duplicated spans '%' as any that follow each other were being incorrectly merged
|
|
txt = '%s' % t
|
|
if txt != '%':
|
|
text = re.sub(r'([^'+t+'|^\n])'+t+r'\]\['+t+'([^'+t+'])', r'\1\2', text)
|
|
text = re.sub(r'([^'+t+'|^\n])'+t+t+'([^'+t+'])', r'\1\2', text)
|
|
text = re.sub(r'(\s|[*_\'"])\[('+t+'[a-zA-Z0-9 \'",.*_]+'+t+r')\](\s|[*_\'"?!,.])', r'\1\2\3', text)
|
|
return text
|
|
|
|
# Now tidyup links and ids - remove ones that don't have a correponding opposite
|
|
if self.opts.keep_links:
|
|
for i in self.our_links:
|
|
if i[0] == '#':
|
|
if i not in self.our_ids:
|
|
text = re.sub(r'"(.+)":'+i+r'(\s)', r'\1\2', text)
|
|
for i in self.our_ids:
|
|
if i not in self.our_links:
|
|
text = re.sub(r'%?\('+i+'\\)\xa0?%?', r'', text)
|
|
|
|
# Remove obvious non-needed escaping, add sub/sup-script ones
|
|
text = check_escaping(text, [r'\*', '_', r'\*'])
|
|
# escape the super/sub-scripts if needed
|
|
text = re.sub(r'(\w)([~^]\w+[~^])', r'\1[\2]', text)
|
|
# escape the super/sub-scripts if needed
|
|
text = re.sub(r'([~^]\w+[~^])(\w)', r'[\1]\2', text)
|
|
|
|
# remove empty spans
|
|
text = re.sub(r'%\xa0+', r'%', text)
|
|
# remove empty spans - MAY MERGE SOME ?
|
|
text = re.sub(r'%%', r'', text)
|
|
# remove spans from tagged output
|
|
text = re.sub(r'%([_+*-]+)%', r'\1', text)
|
|
# remove spaces before a newline
|
|
text = re.sub(r' +\n', r'\n', text)
|
|
# remove newlines at top of file
|
|
text = re.sub(r'^\n+', r'', text)
|
|
# correct blockcode paras
|
|
text = re.sub(r'\npre\.\n?\nbc\.', r'\nbc.', text)
|
|
# correct blockquote paras
|
|
text = re.sub(r'\nbq\.\n?\np.*?\. ', r'\nbq. ', text)
|
|
|
|
# reduce blank lines
|
|
text = re.sub(r'\n{3}', r'\n\np. \n\n', text)
|
|
text = re.sub(u'%\n(p[<>=]{1,2}\\.|p\\.)', r'%\n\n\1', text)
|
|
# Check span following blank para
|
|
text = re.sub(r'\n+ +%', r' %', text)
|
|
text = re.sub(u'p[<>=]{1,2}\\.\n\n?', r'', text)
|
|
# blank paragraph
|
|
text = re.sub(r'\n(p.*\.)\n', r'\n\1 \n\n', text)
|
|
# blank paragraph
|
|
text = re.sub(u'\n\xa0', r'\np. ', text)
|
|
# blank paragraph
|
|
text = re.sub(u'\np[<>=]{1,2}?\\. \xa0', r'\np. ', text)
|
|
text = re.sub(r'(^|\n)(p.*\. ?\n)(p.*\.)', r'\1\3', text)
|
|
text = re.sub(r'\n(p\. \n)(p.*\.|h.*\.)', r'\n\2', text)
|
|
# sort out spaces in tables
|
|
text = re.sub(r' {2,}\|', r' |', text)
|
|
|
|
# Now put back spaces removed earlier as they're needed here
|
|
text = re.sub(r'\np\.\n', r'\np. \n', text)
|
|
# reduce blank lines
|
|
text = re.sub(r' \n\n\n', r' \n\n', text)
|
|
|
|
return text
|
|
|
|
def remove_newlines(self, text):
|
|
text = text.replace('\r\n', ' ')
|
|
text = text.replace('\n', ' ')
|
|
text = text.replace('\r', ' ')
|
|
# Condense redundant spaces created by replacing newlines with spaces.
|
|
text = re.sub(r'[ ]{2,}', ' ', text)
|
|
text = re.sub(r'\t+', '', text)
|
|
if self.remove_space_after_newline == True: # noqa
|
|
text = re.sub(r'^ +', '', text)
|
|
self.remove_space_after_newline = False
|
|
return text
|
|
|
|
def check_styles(self, style):
|
|
txt = '{'
|
|
if self.opts.keep_color:
|
|
if 'color' in style.cssdict() and style['color'] != 'black':
|
|
txt += 'color:'+style['color']+';'
|
|
if 'background' in style.cssdict():
|
|
txt += 'background:'+style['background']+';'
|
|
txt += '}'
|
|
if txt == '{}':
|
|
txt = ''
|
|
return txt
|
|
|
|
def check_halign(self, style):
|
|
tests = {'left':'<','justify':'<>','center':'=','right':'>'}
|
|
for i in tests:
|
|
if style['text-align'] == i:
|
|
return tests[i]
|
|
return ''
|
|
|
|
def check_valign(self, style):
|
|
tests = {'top':'^','bottom':'~'} # , 'middle':'-'}
|
|
for i in tests:
|
|
if style['vertical-align'] == i:
|
|
return tests[i]
|
|
return ''
|
|
|
|
def check_padding(self, style, stylizer):
|
|
txt = ''
|
|
left_padding_pts = 0
|
|
left_margin_pts = 0
|
|
if 'padding-left' in style.cssdict() and style['padding-left'] != 'auto':
|
|
left_padding_pts = unit_convert(style['padding-left'], style.width, style.fontSize, stylizer.profile.dpi)
|
|
if 'margin-left' in style.cssdict() and style['margin-left'] != 'auto':
|
|
left_margin_pts = unit_convert(style['margin-left'], style.width, style.fontSize, stylizer.profile.dpi)
|
|
left = left_margin_pts + left_padding_pts
|
|
emleft = min(int(round(left / stylizer.profile.fbase)), self.MAX_EM)
|
|
if emleft >= 1:
|
|
txt += '(' * emleft
|
|
right_padding_pts = 0
|
|
right_margin_pts = 0
|
|
if 'padding-right' in style.cssdict() and style['padding-right'] != 'auto':
|
|
right_padding_pts = unit_convert(style['padding-right'], style.width, style.fontSize, stylizer.profile.dpi)
|
|
if 'margin-right' in style.cssdict() and style['margin-right'] != 'auto':
|
|
right_margin_pts = unit_convert(style['margin-right'], style.width, style.fontSize, stylizer.profile.dpi)
|
|
right = right_margin_pts + right_padding_pts
|
|
emright = min(int(round(right / stylizer.profile.fbase)), self.MAX_EM)
|
|
if emright >= 1:
|
|
txt += ')' * emright
|
|
|
|
return txt
|
|
|
|
def check_id_tag(self, attribs):
|
|
txt = ''
|
|
if 'id' in attribs:
|
|
txt = '(#'+attribs['id']+ ')'
|
|
self.our_ids.append('#'+attribs['id'])
|
|
self.id_no_text = u'\xa0'
|
|
return txt
|
|
|
|
def build_block(self, tag, style, attribs, stylizer):
|
|
txt = '\n' + tag
|
|
if self.opts.keep_links:
|
|
txt += self.check_id_tag(attribs)
|
|
txt += self.check_padding(style, stylizer)
|
|
txt += self.check_halign(style)
|
|
txt += self.check_styles(style)
|
|
return txt
|
|
|
|
def prepare_string_for_textile(self, txt):
|
|
if re.search(r'(\s([*&_+\-~@%|]|\?{2})\S)|(\S([*&_+\-~@%|]|\?{2})\s)', txt):
|
|
return ' ==%s== ' % txt
|
|
return txt
|
|
|
|
def dump_text(self, elem, stylizer):
|
|
'''
|
|
@elem: The element in the etree that we are working on.
|
|
@stylizer: The style information attached to the element.
|
|
'''
|
|
|
|
# We can only processes tags. If there isn't a tag return any text.
|
|
if not isinstance(elem.tag, (str, bytes)) \
|
|
or namespace(elem.tag) != const.XHTML_NS:
|
|
p = elem.getparent()
|
|
if p is not None and isinstance(p.tag, (str, bytes)) and namespace(p.tag) == const.XHTML_NS \
|
|
and elem.tail:
|
|
return [elem.tail]
|
|
return ['']
|
|
|
|
# Setup our variables.
|
|
text = ['']
|
|
style = stylizer.style(elem)
|
|
tags = []
|
|
tag = barename(elem.tag)
|
|
attribs = elem.attrib
|
|
|
|
# Ignore anything that is set to not be displayed.
|
|
if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
|
|
or style['visibility'] == 'hidden':
|
|
if hasattr(elem, 'tail') and elem.tail:
|
|
return [elem.tail]
|
|
return ['']
|
|
|
|
# Soft scene breaks.
|
|
if 'margin-top' in style.cssdict() and style['margin-top'] != 'auto':
|
|
ems = min(int(round(float(style.marginTop) / style.fontSize) - 1), self.MAX_EM)
|
|
if ems >= 1:
|
|
text.append(u'\n\n\xa0' * ems)
|
|
|
|
if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div'):
|
|
if tag == 'div':
|
|
tag = 'p'
|
|
text.append(self.build_block(tag, style, attribs, stylizer))
|
|
text.append('. ')
|
|
tags.append('\n')
|
|
|
|
if style['font-style'] == 'italic' or tag in ('i', 'em'):
|
|
if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'cite'):
|
|
if self.style_italic == False: # noqa
|
|
if self.in_a_link:
|
|
text.append('_')
|
|
tags.append('_')
|
|
else:
|
|
text.append('[_')
|
|
tags.append('_]')
|
|
self.style_embed.append('_')
|
|
self.style_italic = True
|
|
if style['font-weight'] in ('bold', 'bolder') or tag in ('b', 'strong'):
|
|
if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'th'):
|
|
if self.style_bold == False: # noqa
|
|
if self.in_a_link:
|
|
text.append('*')
|
|
tags.append('*')
|
|
else:
|
|
text.append('[*')
|
|
tags.append('*]')
|
|
self.style_embed.append('*')
|
|
self.style_bold = True
|
|
if style['text-decoration'] == 'underline' or tag in ('u', 'ins'):
|
|
if tag != 'a':
|
|
if self.style_under == False: # noqa
|
|
text.append('[+')
|
|
tags.append('+]')
|
|
self.style_embed.append('+')
|
|
self.style_under = True
|
|
if style['text-decoration'] == 'line-through' or tag in ('strike', 'del', 's'):
|
|
if self.style_strike == False: # noqa
|
|
text.append('[-')
|
|
tags.append('-]')
|
|
self.style_embed.append('-')
|
|
self.style_strike = True
|
|
if tag == 'br':
|
|
for i in reversed(self.style_embed):
|
|
text.append(i)
|
|
text.append('\n')
|
|
for i in self.style_embed:
|
|
text.append(i)
|
|
tags.append('')
|
|
self.remove_space_after_newline = True
|
|
if tag == 'blockquote':
|
|
text.append('\nbq. ')
|
|
tags.append('\n')
|
|
elif tag in ('abbr', 'acronym'):
|
|
text.append('')
|
|
txt = attribs['title']
|
|
tags.append('(' + txt + ')')
|
|
elif tag == 'sup':
|
|
text.append('^')
|
|
tags.append('^')
|
|
elif tag == 'sub':
|
|
text.append('~')
|
|
tags.append('~')
|
|
elif tag == 'code':
|
|
if self.in_pre:
|
|
text.append('\nbc. ')
|
|
tags.append('')
|
|
else:
|
|
text.append('@')
|
|
tags.append('@')
|
|
elif tag == 'cite':
|
|
text.append('??')
|
|
tags.append('??')
|
|
elif tag == 'hr':
|
|
text.append('\n***')
|
|
tags.append('\n')
|
|
elif tag == 'pre':
|
|
self.in_pre = True
|
|
text.append('\npre. ')
|
|
tags.append('pre\n')
|
|
elif tag == 'a':
|
|
if self.opts.keep_links:
|
|
if 'href' in attribs:
|
|
text.append('"')
|
|
tags.append('a')
|
|
tags.append('":' + attribs['href'])
|
|
self.our_links.append(attribs['href'])
|
|
if 'title' in attribs:
|
|
tags.append('(' + attribs['title'] + ')')
|
|
self.in_a_link = True
|
|
else:
|
|
text.append('%')
|
|
tags.append('%')
|
|
elif tag == 'img':
|
|
if self.opts.keep_image_references:
|
|
txt = '!' + self.check_halign(style)
|
|
txt += self.check_valign(style)
|
|
txt += attribs['src']
|
|
text.append(txt)
|
|
if 'alt' in attribs:
|
|
txt = attribs['alt']
|
|
if txt != '':
|
|
text.append('(' + txt + ')')
|
|
tags.append('!')
|
|
elif tag in ('ol', 'ul'):
|
|
self.list.append({'name': tag, 'num': 0})
|
|
text.append('')
|
|
tags.append(tag)
|
|
elif tag == 'li':
|
|
if self.list:
|
|
li = self.list[-1]
|
|
else:
|
|
li = {'name': 'ul', 'num': 0}
|
|
text.append('\n')
|
|
if li['name'] == 'ul':
|
|
text.append('*' * len(self.list) + ' ')
|
|
elif li['name'] == 'ol':
|
|
text.append('#' * len(self.list) + ' ')
|
|
tags.append('')
|
|
elif tag == 'dl':
|
|
text.append('\n')
|
|
tags.append('')
|
|
elif tag == 'dt':
|
|
text.append('')
|
|
tags.append('\n')
|
|
elif tag == 'dd':
|
|
text.append(' ')
|
|
tags.append('')
|
|
elif tag == 'dd':
|
|
text.append('')
|
|
tags.append('\n')
|
|
elif tag == 'table':
|
|
txt = self.build_block(tag, style, attribs, stylizer)
|
|
txt += '. \n'
|
|
if txt != '\ntable. \n':
|
|
text.append(txt)
|
|
else:
|
|
text.append('\n')
|
|
tags.append('')
|
|
elif tag == 'tr':
|
|
txt = self.build_block('', style, attribs, stylizer)
|
|
txt += '. '
|
|
if txt != '\n. ':
|
|
txt = re.sub('\n', '', txt)
|
|
text.append(txt)
|
|
tags.append('|\n')
|
|
elif tag == 'td':
|
|
text.append('|')
|
|
txt = ''
|
|
txt += self.check_halign(style)
|
|
txt += self.check_valign(style)
|
|
if 'colspan' in attribs:
|
|
txt += '\\' + attribs['colspan']
|
|
if 'rowspan' in attribs:
|
|
txt += '/' + attribs['rowspan']
|
|
txt += self.check_styles(style)
|
|
if txt != '':
|
|
text.append(txt + '. ')
|
|
tags.append('')
|
|
elif tag == 'th':
|
|
text.append('|_. ')
|
|
tags.append('')
|
|
elif tag == 'span':
|
|
if style['font-variant'] == 'small-caps':
|
|
if self.style_smallcap == False: # noqa
|
|
text.append('&')
|
|
tags.append('&')
|
|
self.style_smallcap = True
|
|
else:
|
|
if self.in_a_link == False: # noqa
|
|
txt = '%'
|
|
if self.opts.keep_links:
|
|
txt += self.check_id_tag(attribs)
|
|
txt += self.check_styles(style)
|
|
if txt != '%':
|
|
text.append(txt)
|
|
tags.append('%')
|
|
|
|
if self.opts.keep_links and 'id' in attribs:
|
|
if tag not in ('body', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'span', 'table'):
|
|
text.append(self.check_id_tag(attribs))
|
|
|
|
# Process the styles for any that we want to keep
|
|
if tag not in ('body', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'hr', 'a', 'img',
|
|
'span', 'table', 'tr', 'td'):
|
|
if not self.in_a_link:
|
|
text.append(self.check_styles(style))
|
|
|
|
# Process tags that contain text.
|
|
if hasattr(elem, 'text') and elem.text:
|
|
txt = elem.text
|
|
if not self.in_pre:
|
|
txt = self.prepare_string_for_textile(self.remove_newlines(txt))
|
|
text.append(txt)
|
|
self.id_no_text = u''
|
|
|
|
# Recurse down into tags within the tag we are in.
|
|
for item in elem:
|
|
text += self.dump_text(item, stylizer)
|
|
|
|
# Close all open tags.
|
|
tags.reverse()
|
|
for t in tags:
|
|
if t in ('pre', 'ul', 'ol', 'li', 'table'):
|
|
if t == 'pre':
|
|
self.in_pre = False
|
|
elif t in ('ul', 'ol'):
|
|
if self.list:
|
|
self.list.pop()
|
|
if not self.list:
|
|
text.append('\n')
|
|
else:
|
|
if t == 'a':
|
|
self.in_a_link = False
|
|
t = ''
|
|
text.append(self.id_no_text)
|
|
self.id_no_text = u''
|
|
if t in ('*]', '*'):
|
|
self.style_bold = False
|
|
elif t in ('_]', '_'):
|
|
self.style_italic = False
|
|
elif t == '+]':
|
|
self.style_under = False
|
|
elif t == '-]':
|
|
self.style_strike = False
|
|
elif t == '&':
|
|
self.style_smallcap = False
|
|
if t in ('*]', '_]', '+]', '-]', '*', '_'):
|
|
txt = self.style_embed.pop()
|
|
text.append('%s' % t)
|
|
|
|
# Soft scene breaks.
|
|
if 'margin-bottom' in style.cssdict() and style['margin-bottom'] != 'auto':
|
|
ems = min(int(round((float(style.marginBottom) / style.fontSize) - 1)), self.MAX_EM)
|
|
if ems >= 1:
|
|
text.append(u'\n\n\xa0' * ems)
|
|
|
|
# Add the text that is outside of the tag.
|
|
if hasattr(elem, 'tail') and elem.tail:
|
|
tail = elem.tail
|
|
if not self.in_pre:
|
|
tail = self.prepare_string_for_textile(self.remove_newlines(tail))
|
|
text.append(tail)
|
|
|
|
return text
|