mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-01-31 02:25:45 +01:00
This is progressing refactor of the calibre code to make it more readable, and transform it to something more coherent. In this patch, there are changes regarding imports for some modules, instead of polluting namespace of each module with some other modules symbols, which often were imported from other modules. Yuck.
285 lines
10 KiB
Python
285 lines
10 KiB
Python
"""
|
|
Transform OEB content into Textile formatted plain text
|
|
"""
|
|
import re
|
|
|
|
from functools import partial
|
|
|
|
from ebook_converter import constants as const
|
|
from ebook_converter.ebooks.htmlz.oeb2html import OEB2HTML
|
|
from ebook_converter.ebooks.oeb.base import XHTML, barename, namespace, rewrite_links
|
|
from ebook_converter.ebooks.oeb.stylizer import Stylizer
|
|
|
|
|
|
__license__ = 'GPL 3'
|
|
__copyright__ = ('2011, John Schember <john@nachtimwald.com> 2011, '
|
|
'Leigh Parry <leighparry@blueyonder.co.uk>')
|
|
__docformat__ = 'restructuredtext en'
|
|
|
|
|
|
class MarkdownMLizer(OEB2HTML):
|
|
|
|
def extract_content(self, oeb_book, opts):
|
|
self.log.info('Converting XHTML to Markdown formatted TXT...')
|
|
self.opts = opts
|
|
self.in_code = False
|
|
self.in_pre = False
|
|
self.list = []
|
|
self.blockquotes = 0
|
|
self.remove_space_after_newline = False
|
|
self.base_hrefs = [item.href for item in oeb_book.spine]
|
|
self.map_resources(oeb_book)
|
|
|
|
self.style_bold = False
|
|
self.style_italic = False
|
|
|
|
txt = self.mlize_spine(oeb_book)
|
|
|
|
# Do some tidying up
|
|
txt = self.tidy_up(txt)
|
|
|
|
return txt
|
|
|
|
def mlize_spine(self, oeb_book):
|
|
output = ['']
|
|
for item in oeb_book.spine:
|
|
self.log.debug('Converting %s to Markdown formatted TXT...' % item.href)
|
|
self.rewrite_ids(item.data, item)
|
|
rewrite_links(item.data, partial(self.rewrite_link, page=item))
|
|
stylizer = Stylizer(item.data, item.href, oeb_book, self.opts, self.opts.output_profile)
|
|
output += self.dump_text(item.data.find(XHTML('body')), stylizer)
|
|
output.append('\n\n')
|
|
return ''.join(output)
|
|
|
|
def tidy_up(self, text):
|
|
# Remove blank space form beginning of paragraph.
|
|
text = re.sub('(?msu)^[ ]{1,3}', '', text)
|
|
# pre has 4 spaces. We trimmed 3 so anything with a space left is a pre.
|
|
text = re.sub('(?msu)^[ ]', ' ', text)
|
|
|
|
# Remove tabs that aren't at the beinning of a line
|
|
new_text = []
|
|
for l in text.splitlines():
|
|
start = re.match('\t+', l)
|
|
if start:
|
|
start = start.group()
|
|
else:
|
|
start = ''
|
|
l = re.sub('\t', '', l)
|
|
new_text.append(start + l)
|
|
text = '\n'.join(new_text)
|
|
|
|
# Remove spaces from blank lines.
|
|
text = re.sub('(?msu)^[ ]+$', '', text)
|
|
|
|
# Reduce blank lines
|
|
text = re.sub('(?msu)\n{7,}', '\n' * 6, text)
|
|
|
|
# Remove blank lines at beginning and end of document.
|
|
text = re.sub(r'^\s*', '', text)
|
|
text = re.sub(r'\s*$', '\n\n', text)
|
|
|
|
return text
|
|
|
|
def remove_newlines(self, text):
|
|
text = text.replace('\r\n', ' ')
|
|
text = text.replace('\n', ' ')
|
|
text = text.replace('\r', ' ')
|
|
# Condense redundant spaces created by replacing newlines with spaces.
|
|
text = re.sub(r'[ ]{2,}', ' ', text)
|
|
text = re.sub(r'\t+', '', text)
|
|
if self.remove_space_after_newline == True: # noqa
|
|
text = re.sub(r'^ +', '', text)
|
|
self.remove_space_after_newline = False
|
|
return text
|
|
|
|
def prepare_string_for_markdown(self, txt):
|
|
txt = re.sub(r'([\\`*_{}\[\]()#+!])', r'\\\1', txt)
|
|
return txt
|
|
|
|
def prepare_string_for_pre(self, txt):
|
|
new_text = []
|
|
for l in txt.splitlines():
|
|
new_text.append(' ' + l)
|
|
return '\n'.join(new_text)
|
|
|
|
def dump_text(self, elem, stylizer):
|
|
'''
|
|
@elem: The element in the etree that we are working on.
|
|
@stylizer: The style information attached to the element.
|
|
'''
|
|
|
|
# We can only processes tags. If there isn't a tag return any text.
|
|
if not isinstance(elem.tag, (str, bytes)) \
|
|
or namespace(elem.tag) != const.XHTML_NS:
|
|
p = elem.getparent()
|
|
if p is not None and isinstance(p.tag, (str, bytes)) and namespace(p.tag) == const.XHTML_NS \
|
|
and elem.tail:
|
|
return [elem.tail]
|
|
return ['']
|
|
|
|
# Setup our variables.
|
|
text = []
|
|
style = stylizer.style(elem)
|
|
tags = []
|
|
tag = barename(elem.tag)
|
|
attribs = elem.attrib
|
|
|
|
# Ignore anything that is set to not be displayed.
|
|
if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
|
|
or style['visibility'] == 'hidden':
|
|
if hasattr(elem, 'tail') and elem.tail:
|
|
return [elem.tail]
|
|
return ['']
|
|
|
|
# Soft scene breaks.
|
|
if 'margin-top' in style.cssdict() and style['margin-top'] != 'auto':
|
|
ems = int(round(float(style.marginTop) / style.fontSize) - 1)
|
|
if ems >= 1:
|
|
text.append(u'\n\n' * ems)
|
|
|
|
bq = '> ' * self.blockquotes
|
|
# Block level elements
|
|
if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div'):
|
|
h_tag = ''
|
|
if tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6'):
|
|
h_tag = '#' * int(tag[1]) + ' '
|
|
text.append('\n' + bq + h_tag)
|
|
tags.append('\n')
|
|
self.remove_space_after_newline = True
|
|
|
|
if style['font-style'] == 'italic' or tag in ('i', 'em'):
|
|
if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'cite'):
|
|
if self.style_italic == False: # noqa
|
|
text.append('*')
|
|
tags.append('*')
|
|
self.style_italic = True
|
|
if style['font-weight'] in ('bold', 'bolder') or tag in ('b', 'strong'):
|
|
if tag not in ('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'th'):
|
|
if self.style_bold == False: # noqa
|
|
text.append('**')
|
|
tags.append('**')
|
|
self.style_bold = True
|
|
if tag == 'br':
|
|
text.append(' \n')
|
|
self.remove_space_after_newline = True
|
|
if tag == 'blockquote':
|
|
self.blockquotes += 1
|
|
tags.append('>')
|
|
text.append('> ' * self.blockquotes)
|
|
elif tag == 'code':
|
|
if not self.in_pre and not self.in_code:
|
|
text.append('`')
|
|
tags.append('`')
|
|
self.in_code = True
|
|
elif tag == 'pre':
|
|
if not self.in_pre:
|
|
text.append('\n')
|
|
tags.append('pre')
|
|
self.in_pre = True
|
|
elif tag == 'hr':
|
|
text.append('\n* * *')
|
|
tags.append('\n')
|
|
elif tag == 'a':
|
|
# Only write links with absolute (external) urls.
|
|
if self.opts.keep_links and 'href' in attribs and '://' in attribs['href']:
|
|
title = ''
|
|
if 'title' in attribs:
|
|
title = ' "' + attribs['title'] + '"'
|
|
remove_space = self.remove_space_after_newline
|
|
title = self.remove_newlines(title)
|
|
self.remove_space_after_newline = remove_space
|
|
text.append('[')
|
|
tags.append('](' + attribs['href'] + title + ')')
|
|
elif tag == 'img':
|
|
if self.opts.keep_image_references:
|
|
txt = '!'
|
|
if 'alt' in attribs:
|
|
remove_space = self.remove_space_after_newline
|
|
txt += '[' + self.remove_newlines(attribs['alt']) + ']'
|
|
self.remove_space_after_newline = remove_space
|
|
txt += '(' + attribs['src'] + ')'
|
|
text.append(txt)
|
|
elif tag in ('ol', 'ul'):
|
|
tags.append(tag)
|
|
# Add the list to our lists of lists so we can track
|
|
# nested lists.
|
|
self.list.append({'name': tag, 'num': 0})
|
|
elif tag == 'li':
|
|
# Get the last list from our list of lists
|
|
if self.list:
|
|
li = self.list[-1]
|
|
else:
|
|
li = {'name': 'ul', 'num': 0}
|
|
# Add a new line to start the item
|
|
text.append('\n')
|
|
# Add indent if we have nested lists.
|
|
list_count = len(self.list)
|
|
# We only care about indenting nested lists.
|
|
if (list_count - 1) > 0:
|
|
text.append('\t' * (list_count - 1))
|
|
# Add blockquote if we have a blockquote in a list item.
|
|
text.append(bq)
|
|
# Write the proper sign for ordered and unorded lists.
|
|
if li['name'] == 'ul':
|
|
text.append('+ ')
|
|
elif li['name'] == 'ol':
|
|
li['num'] += 1
|
|
text.append(str(li['num']) + '. ')
|
|
|
|
# Process tags that contain text.
|
|
if hasattr(elem, 'text') and elem.text:
|
|
txt = elem.text
|
|
if self.in_pre:
|
|
txt = self.prepare_string_for_pre(txt)
|
|
elif self.in_code:
|
|
txt = self.remove_newlines(txt)
|
|
else:
|
|
txt = self.prepare_string_for_markdown(self.remove_newlines(txt))
|
|
text.append(txt)
|
|
|
|
# Recurse down into tags within the tag we are in.
|
|
for item in elem:
|
|
text += self.dump_text(item, stylizer)
|
|
|
|
# Close all open tags.
|
|
tags.reverse()
|
|
for t in tags:
|
|
if t in ('pre', 'ul', 'ol', '>'):
|
|
if t == 'pre':
|
|
self.in_pre = False
|
|
text.append('\n')
|
|
elif t == '>':
|
|
self.blockquotes -= 1
|
|
elif t in ('ul', 'ol'):
|
|
if self.list:
|
|
self.list.pop()
|
|
text.append('\n')
|
|
else:
|
|
if t == '**':
|
|
self.style_bold = False
|
|
elif t == '*':
|
|
self.style_italic = False
|
|
elif t == '`':
|
|
self.in_code = False
|
|
text.append('%s' % t)
|
|
|
|
# Soft scene breaks.
|
|
if 'margin-bottom' in style.cssdict() and style['margin-bottom'] != 'auto':
|
|
ems = int(round((float(style.marginBottom) / style.fontSize) - 1))
|
|
if ems >= 1:
|
|
text.append(u'\n\n' * ems)
|
|
|
|
# Add the text that is outside of the tag.
|
|
if hasattr(elem, 'tail') and elem.tail:
|
|
tail = elem.tail
|
|
if self.in_pre:
|
|
tail = self.prepare_string_for_pre(tail)
|
|
elif self.in_code:
|
|
tail = self.remove_newlines(tail)
|
|
else:
|
|
tail = self.prepare_string_for_markdown(self.remove_newlines(tail))
|
|
text.append(tail)
|
|
|
|
return text
|