mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-03-10 19:45:51 +01:00
This is an ongoing refactor of the calibre code to make it more readable and to transform it into something more coherent. This patch changes the imports of some modules: instead of polluting each module's namespace with symbols from other modules (which were often themselves imported from yet other modules), the modules are now imported explicitly. Yuck.
266 lines
8.6 KiB
Python
266 lines
8.6 KiB
Python
"""
|
|
Transform OEB content into plain text
|
|
"""
|
|
import re
|
|
|
|
from lxml import etree
|
|
|
|
from ebook_converter import constants as const
|
|
from ebook_converter.ebooks.oeb import base
|
|
from ebook_converter.ebooks.oeb import parse_utils
|
|
from ebook_converter.ebooks.oeb.stylizer import Stylizer
|
|
|
|
|
|
# Heading elements; TXTMLizer.dump_text surrounds these with extra blank
# lines.
HEADING_TAGS = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']

# Elements that TXTMLizer.dump_text treats as paragraph-like blocks (a
# blank line is emitted after them). Headings are blocks as well.
BLOCK_TAGS = ['div', 'p'] + HEADING_TAGS + ['li', 'tr']

# CSS ``display`` values that make any element count as a block.
BLOCK_STYLES = ['block']

# Elements for which TXTMLizer.dump_text emits a single space.
SPACE_TAGS = ['td', 'br']
|
|
|
|
|
|
class TXTMLizer(object):
    """
    Convert the XHTML documents of an OEB book into a single plain text
    string.

    Typical use is ``TXTMLizer(log).extract_content(oeb_book, opts)``.
    """

    def __init__(self, log):
        """
        @log: Logger object; its ``info`` and ``debug`` methods are used.
        """
        self.log = log

    def extract_content(self, oeb_book, opts):
        """
        Convert the whole book to text and return it as one string.

        @oeb_book: Book object; its ``toc`` and ``spine`` are read.
        @opts: Conversion options. The attributes read by this class are
               ``inline_toc`` (optional), ``remove_paragraph_spacing``,
               ``max_line_length``, ``force_max_line_length`` and
               ``output_profile``.
        """
        self.log.info('Converting XHTML to TXT...')
        self.oeb_book = oeb_book
        self.opts = opts
        # Per-conversion state, reset on every call.
        self.toc_titles = []
        self.toc_ids = []
        self.last_was_heading = False

        self.create_flat_toc(self.oeb_book.toc)

        return self.mlize_spine()

    def mlize_spine(self):
        """
        Render the optional inline TOC followed by every spine item and
        return the cleaned up text.
        """
        output = ['', self.get_toc()]
        for item in self.oeb_book.spine:
            self.log.debug('Converting %s to TXT...' % item.href)
            # Neutralise '--' inside comments before the tree is
            # serialised and parsed again below (presumably because '--'
            # is not allowed inside an XML comment and would make the
            # re-parse fail).
            for comment in item.data.iterdescendants(etree.Comment):
                if comment.text and '--' in comment.text:
                    comment.text = comment.text.replace('--', '__')
            # Round trip through a string so that newlines in the markup
            # can be collapsed before the text is extracted.
            content = etree.tostring(item.data, encoding='unicode')
            content = self.remove_newlines(content)
            content = etree.fromstring(content)
            stylizer = Stylizer(content, item.href, self.oeb_book, self.opts,
                                self.opts.output_profile)
            output.extend(self.dump_text(
                content.find(base.tag('xhtml', 'body')), stylizer, item))
            # Separate consecutive spine items.
            output.append('\n\n\n\n\n\n')
        output = ''.join(output)
        # Strip trailing whitespace from every line.
        output = '\n'.join(line.rstrip() for line in output.splitlines())

        return self.cleanup_text(output)

    def remove_newlines(self, text):
        """
        Replace every line break (``\\r\\n``, ``\\n``, ``\\r``) in @text
        with a space and collapse the resulting runs of spaces.
        """
        self.log.debug('\tRemove newlines for processing...')
        text = text.replace('\r\n', ' ')
        text = text.replace('\n', ' ')
        text = text.replace('\r', ' ')
        # Condense redundant spaces created by replacing newlines with spaces.
        text = re.sub(r'[ ]{2,}', ' ', text)

        return text

    def get_toc(self):
        """
        Return the inline table of contents as text, or an empty string
        when the ``inline_toc`` option is missing or false.
        """
        toc = ['']
        if getattr(self.opts, 'inline_toc', None):
            self.log.debug('Generating table of contents...')
            toc.append('%s\n\n' % 'Table of Contents:')
            toc.extend('* %s\n\n' % title for title in self.toc_titles)
        return ''.join(toc)

    def create_flat_toc(self, nodes):
        """
        Turn a hierarchical list of TOC entries into flat lists.

        Titles are appended to ``self.toc_titles`` and hrefs to
        ``self.toc_ids``; a parent is always recorded before its children.
        """
        for item in nodes:
            self.toc_titles.append(item.title)
            self.toc_ids.append(item.href)
            self.create_flat_toc(item.nodes)

    def cleanup_text(self, text):
        """
        Normalise whitespace in @text and optionally wrap long lines.

        Non-breaking spaces, vertical tabs and form feeds become spaces,
        single newlines inside a paragraph are joined, runs of spaces and
        newlines are condensed and, when ``opts.max_line_length`` is set,
        the lines are wrapped. Returns the cleaned text.
        """
        self.log.debug('\tClean up text...')
        # Replace bad characters.
        text = text.replace(u'\xa0', ' ')

        # Replace vertical tabs and form feeds with a single space.
        # This used to be str.replace() with the regex-looking patterns
        # '\v+' and '\f+', which only matched a control character followed
        # by a literal plus sign, so nothing was ever cleaned up.
        # Tabs are deliberately left alone: dump_text() emits them as
        # paragraph indents when remove_paragraph_spacing is enabled and
        # the regular expressions below depend on them. The former
        # replace('\t+', ' ') call swallowed the indent together with the
        # plus sign of any paragraph that started with '+'.
        text = re.sub(r'[\v\f]+', ' ', text)

        # Single line paragraph.
        text = re.sub('(?<=.)\n(?=.)', ' ', text)

        # Remove multiple spaces.
        text = re.sub('[ ]{2,}', ' ', text)

        # Remove excessive newlines.
        text = re.sub('\n[ ]+\n', '\n\n', text)
        if self.opts.remove_paragraph_spacing:
            text = re.sub('\n{2,}', '\n', text)
            # Lines without a tab (i.e. not indented paragraphs) get a
            # blank line after them ...
            text = re.sub(r'(?msu)^(?P<t>[^\t\n]+?)$',
                          lambda mo: u'%s\n\n' % mo.group('t'), text)
            # ... and extra blank lines before them.
            text = re.sub(r'(?msu)(?P<b>[^\n])\n+(?P<t>[^\t\n]+?)(?=\n)',
                          lambda mo: '%s\n\n\n\n\n\n%s' % (mo.group('b'),
                                                           mo.group('t')),
                          text)
        else:
            text = re.sub('\n{7,}', '\n\n\n\n\n\n', text)

        # Replace spaces at the beginning and end of lines
        # We don't replace tabs because those are only added
        # when remove paragraph spacing is enabled.
        text = re.sub('(?imu)^[ ]+', '', text)
        text = re.sub('(?imu)[ ]+$', '', text)

        # Remove empty space and newlines at the beginning of the document.
        text = re.sub(r'(?u)^[ \n]+', '', text)

        if self.opts.max_line_length:
            text = self._wrap_lines(text)

        return text

    def _wrap_lines(self, text):
        """
        Wrap the lines of @text at ``opts.max_line_length`` characters.

        Limits below 25 are raised to 25 unless
        ``opts.force_max_line_length`` is set. Lines are broken at spaces;
        a stretch without any space is only cut when the limit is forced.
        """
        max_length = self.opts.max_line_length
        if max_length < 25 and not self.opts.force_max_line_length:
            max_length = 25

        short_lines = []
        for line in text.splitlines():
            while len(line) > max_length:
                # Prefer the last space in front of the limit.
                space = line.rfind(' ', 0, max_length)
                if space == -1 and not self.opts.force_max_line_length:
                    # Look for the first space after max_length.
                    space = line.find(' ', max_length)
                if space != -1:
                    # Space was found.
                    short_lines.append(line[:space])
                    line = line[space + 1:]
                elif self.opts.force_max_line_length:
                    # Force breaking at max_length.
                    short_lines.append(line[:max_length])
                    line = line[max_length:]
                else:
                    # No space was found, cannot break the line.
                    short_lines.append(line)
                    line = ''
            # Add the text that was shorter than max_length to the list.
            short_lines.append(line)
        return '\n'.join(short_lines)

    def dump_text(self, elem, stylizer, page):
        """
        Return the text of @elem and its descendants as a list of strings.

        @elem: The element in the etree that we are working on.
        @stylizer: The style information attached to the element.
        @page: OEB page; its href is combined with element ids to detect
               TOC targets.

        Updates ``self.last_was_heading`` as a side effect.
        """
        # Comments, processing instructions and elements outside the XHTML
        # namespace contribute only their tail text, and only when the
        # parent is an XHTML element.
        if not isinstance(elem.tag, (str, bytes)) \
                or parse_utils.namespace(elem.tag) != const.XHTML_NS:
            p = elem.getparent()
            if (p is not None and isinstance(p.tag, (str, bytes)) and
                    parse_utils.namespace(p.tag) == const.XHTML_NS and
                    elem.tail):
                return [elem.tail]
            return ['']

        text = ['']
        style = stylizer.style(elem)

        # Invisible elements are skipped, their tail text is kept.
        if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
                or style['visibility'] == 'hidden':
            if hasattr(elem, 'tail') and elem.tail:
                return [elem.tail]
            return ['']

        tag = parse_utils.barename(elem.tag)
        tag_id = elem.attrib.get('id', None)
        in_block = False
        in_heading = False

        # Are we in a heading?
        # This can either be a heading tag or a TOC item.
        if tag in HEADING_TAGS or '%s#%s' % (page.href,
                                             tag_id) in self.toc_ids:
            in_heading = True
            if not self.last_was_heading:
                text.append('\n\n\n\n\n\n')

        # Are we in a paragraph block?
        if tag in BLOCK_TAGS or style['display'] in BLOCK_STYLES:
            if self.opts.remove_paragraph_spacing and not in_heading:
                # Indent the paragraph; cleanup_text() relies on this tab.
                text.append('\t')
            in_block = True

        if tag in SPACE_TAGS:
            text.append(' ')

        # Hard scene breaks.
        if tag == 'hr':
            text.append('\n\n* * *\n\n')
        # Soft scene breaks.
        try:
            ems = int(round((float(style.marginTop) / style.fontSize) - 1))
            if ems >= 1:
                text.append('\n' * ems)
        except Exception:
            # Best effort only: an unusable margin or font size simply
            # means that no soft scene break is emitted.
            pass

        # Process tags that contain text.
        if hasattr(elem, 'text') and elem.text:
            text.append(elem.text)

        # Recurse down into tags within the tag we are in.
        for item in elem:
            text += self.dump_text(item, stylizer, page)

        if in_block:
            text.append('\n\n')
        if in_heading:
            text.append('\n')
        # Remembered so that consecutive headings are not pushed apart.
        self.last_was_heading = in_heading

        if hasattr(elem, 'tail') and elem.tail:
            text.append(elem.tail)

        return text
|