1
0
mirror of https://github.com/gryf/ebook-converter.git synced 2026-03-10 19:45:51 +01:00
Files
ebook-converter/ebook_converter/ebooks/txt/txtml.py
gryf ce89f5c9d1 Use the real constants module.
This is part of an ongoing refactor of the calibre code to make it more
readable and to transform it into something more coherent.

This patch changes how some modules are imported: modules are now imported
explicitly instead of polluting each module's namespace with symbols from
other modules, which were often themselves imported from elsewhere. Yuck.
2020-05-29 17:04:53 +02:00

266 lines
8.6 KiB
Python

"""
Transform OEB content into plain text
"""
import re
from lxml import etree
from ebook_converter import constants as const
from ebook_converter.ebooks.oeb import base
from ebook_converter.ebooks.oeb import parse_utils
from ebook_converter.ebooks.oeb.stylizer import Stylizer
# Tags that dump_text() treats as headings: they are preceded by a run of
# blank lines and followed by an extra newline.
HEADING_TAGS = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']

# Tags that dump_text() treats as paragraph-like blocks (a blank line is
# emitted after their content). Every heading tag is also a block tag.
BLOCK_TAGS = ['div', 'p', *HEADING_TAGS, 'li', 'tr']

# CSS ``display`` values that make any element count as a block.
BLOCK_STYLES = ['block']

# Tags for which dump_text() emits a single space before their content.
SPACE_TAGS = ['td', 'br']
class TXTMLizer:
    """Render the XHTML content of an OEB book as plain text."""

    def __init__(self, log):
        """Store *log*, the object used for ``info``/``debug`` messages."""
        self.log = log

    def extract_content(self, oeb_book, opts):
        """Convert every spine item of *oeb_book* to one plain-text string.

        @oeb_book: The book to convert; its ``toc`` and ``spine`` are read.
        @opts: Conversion options (``inline_toc``, ``remove_paragraph_spacing``,
               ``max_line_length``, ``force_max_line_length``,
               ``output_profile``).

        Returns the cleaned-up text produced by mlize_spine().
        """
        self.log.info('Converting XHTML to TXT...')
        self.oeb_book = oeb_book
        self.opts = opts
        self.toc_titles = []
        self.toc_ids = []
        self.last_was_heading = False
        self.create_flat_toc(self.oeb_book.toc)

        return self.mlize_spine()

    def mlize_spine(self):
        """Dump the text of every spine item and return the cleaned result."""
        output = [self.get_toc()]
        body_tag = base.tag('xhtml', 'body')

        for item in self.oeb_book.spine:
            self.log.debug('Converting %s to TXT...' % item.href)

            # Neutralise '--' inside comments before the tree is serialised
            # and parsed again below (presumably because '--' is not allowed
            # inside an XML comment and would break the re-parse).
            for comment in item.data.iterdescendants(etree.Comment):
                if comment.text and '--' in comment.text:
                    comment.text = comment.text.replace('--', '__')

            # Round-trip through a string so newlines in the markup can be
            # flattened to spaces before the text is extracted.
            content = etree.tostring(item.data, encoding='unicode')
            content = self.remove_newlines(content)
            content = etree.fromstring(content)

            stylizer = Stylizer(content, item.href, self.oeb_book, self.opts,
                                self.opts.output_profile)
            output.extend(self.dump_text(content.find(body_tag), stylizer,
                                         item))
            # Separate consecutive spine items with a run of blank lines.
            output.append('\n\n\n\n\n\n')

        text = ''.join(output)
        text = '\n'.join(line.rstrip() for line in text.splitlines())
        return self.cleanup_text(text)

    def remove_newlines(self, text):
        """Replace every line break in *text* with a single space."""
        self.log.debug('\tRemove newlines for processing...')
        text = re.sub(r'\r\n|\r|\n', ' ', text)
        # Condense redundant spaces created by replacing newlines with spaces.
        return re.sub(r'[ ]{2,}', ' ', text)

    def get_toc(self):
        """Return the inline table of contents, or '' when it is disabled."""
        if not getattr(self.opts, 'inline_toc', None):
            return ''

        self.log.debug('Generating table of contents...')
        toc = ['Table of Contents:\n\n']
        toc.extend('* %s\n\n' % title for title in self.toc_titles)
        return ''.join(toc)

    def create_flat_toc(self, nodes):
        """
        Turn a hierarchical TOC into flat lists of titles and hrefs.

        Titles are appended to ``self.toc_titles`` and hrefs to
        ``self.toc_ids``, parents before their children.
        """
        for item in nodes:
            self.toc_titles.append(item.title)
            self.toc_ids.append(item.href)
            self.create_flat_toc(item.nodes)

    def cleanup_text(self, text):
        """Normalise whitespace in *text* and optionally wrap long lines.

        Reads ``remove_paragraph_spacing``, ``max_line_length`` and
        ``force_max_line_length`` from ``self.opts``.
        """
        self.log.debug('\tClean up text...')
        # Replace bad characters.
        text = text.replace('\xa0', ' ')

        # Replace vertical tabs and form feeds with a single space.
        # (str.replace() is literal, so patterns like '\v+' never matched.)
        text = re.sub(r'[\v\f]+', ' ', text)
        # Tabs are paragraph indent markers added by dump_text() when
        # remove_paragraph_spacing is enabled, so only collapse them when
        # that option is off.
        if not self.opts.remove_paragraph_spacing:
            text = re.sub(r'\t+', ' ', text)

        # Single line paragraph.
        text = re.sub(r'(?<=.)\n(?=.)', ' ', text)

        # Remove multiple spaces.
        text = re.sub(r'[ ]{2,}', ' ', text)

        # Remove excessive newlines.
        text = re.sub(r'\n[ ]+\n', '\n\n', text)
        if self.opts.remove_paragraph_spacing:
            text = re.sub(r'\n{2,}', '\n', text)
            # Lines without a tab are not indented paragraphs: give them a
            # blank line after, and a run of blank lines before.
            text = re.sub(r'(?msu)^(?P<t>[^\t\n]+?)$',
                          lambda mo: '%s\n\n' % mo.group('t'), text)
            text = re.sub(r'(?msu)(?P<b>[^\n])\n+(?P<t>[^\t\n]+?)(?=\n)',
                          lambda mo: '%s\n\n\n\n\n\n%s' % (mo.group('b'),
                                                           mo.group('t')),
                          text)
        else:
            text = re.sub(r'\n{7,}', '\n\n\n\n\n\n', text)

        # Replace spaces at the beginning and end of lines
        # We don't replace tabs because those are only added
        # when remove paragraph spacing is enabled.
        text = re.sub(r'(?imu)^[ ]+', '', text)
        text = re.sub(r'(?imu)[ ]+$', '', text)

        # Remove empty space and newlines at the beginning of the document.
        text = re.sub(r'(?u)^[ \n]+', '', text)

        if self.opts.max_line_length:
            max_length = self.opts.max_line_length
            # Widths below 25 are only honoured when explicitly forced.
            if max_length < 25 and not self.opts.force_max_line_length:
                max_length = 25
            text = self._wrap_text(text, max_length)

        return text

    def _wrap_text(self, text, max_length):
        """Break the lines of *text* so they fit within *max_length*.

        Lines are broken at the last space that keeps the piece within
        *max_length*. When there is no such space the line is either cut at
        *max_length* (``force_max_line_length``) or broken at the first
        later space; a line with no space at all is left untouched.
        """
        force = self.opts.force_max_line_length
        wrapped = []

        for line in text.splitlines():
            while len(line) > max_length:
                # A space at index max_length still leaves a piece of
                # exactly max_length characters, so include it in the search.
                cut = line.rfind(' ', 0, max_length + 1)
                if cut == -1:
                    if force:
                        # Force breaking at max_length.
                        wrapped.append(line[:max_length])
                        line = line[max_length:]
                        continue
                    # Look for the first space after max_length.
                    cut = line.find(' ', max_length)
                    if cut == -1:
                        # No space at all: the line cannot be broken.
                        break
                wrapped.append(line[:cut])
                line = line[cut + 1:]
            # Add the remainder (or the unbreakable line) to the list.
            wrapped.append(line)

        return '\n'.join(wrapped)

    def dump_text(self, elem, stylizer, page):
        """
        Recursively collect the text fragments of *elem*.

        @elem: The element in the etree that we are working on.
        @stylizer: The style information attached to the element.
        @page: OEB page used to determine absolute urls.

        Returns a list of strings; joining them gives the element's text.
        Updates ``self.last_was_heading`` as a side effect.
        """
        # Nodes whose tag is not a string (e.g. comments or processing
        # instructions in lxml) and non-XHTML elements contribute no text of
        # their own; keep only a tail that belongs to an XHTML parent.
        if (not isinstance(elem.tag, (str, bytes))
                or parse_utils.namespace(elem.tag) != const.XHTML_NS):
            parent = elem.getparent()
            if (parent is not None
                    and isinstance(parent.tag, (str, bytes))
                    and parse_utils.namespace(parent.tag) == const.XHTML_NS
                    and elem.tail):
                return [elem.tail]
            return ['']

        text = ['']
        style = stylizer.style(elem)

        # Hidden elements are skipped, but their tail text is still visible.
        if (style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot')
                or style['visibility'] == 'hidden'):
            if hasattr(elem, 'tail') and elem.tail:
                return [elem.tail]
            return ['']

        tag = parse_utils.barename(elem.tag)
        tag_id = elem.attrib.get('id', None)
        in_block = False
        in_heading = False

        # Are we in a heading?
        # This can either be a heading tag or a TOC item.
        if (tag in HEADING_TAGS
                or '%s#%s' % (page.href, tag_id) in self.toc_ids):
            in_heading = True
            # Consecutive headings share one leading run of blank lines.
            if not self.last_was_heading:
                text.append('\n\n\n\n\n\n')

        # Are we in a paragraph block?
        if tag in BLOCK_TAGS or style['display'] in BLOCK_STYLES:
            if self.opts.remove_paragraph_spacing and not in_heading:
                # Indent marker; cleanup_text() relies on it to tell
                # paragraphs from headings.
                text.append('\t')
            in_block = True

        if tag in SPACE_TAGS:
            text.append(' ')

        # Hard scene breaks.
        if tag == 'hr':
            text.append('\n\n* * *\n\n')

        # Soft scene breaks.
        # Deliberately best-effort: any failure reading or computing the
        # margin simply means no extra blank lines are added.
        try:
            ems = int(round((float(style.marginTop) / style.fontSize) - 1))
            if ems >= 1:
                text.append('\n' * ems)
        except Exception:
            pass

        # Process tags that contain text.
        if hasattr(elem, 'text') and elem.text:
            text.append(elem.text)

        # Recurse down into tags within the tag we are in.
        for item in elem:
            text += self.dump_text(item, stylizer, page)

        if in_block:
            text.append('\n\n')
        if in_heading:
            text.append('\n')
            self.last_was_heading = True
        else:
            self.last_was_heading = False

        if hasattr(elem, 'tail') and elem.tail:
            text.append(elem.tail)

        return text