mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-01-23 05:15:45 +01:00
This is an ongoing refactor of the calibre code to make it more readable and to transform it into something more coherent. This patch changes the imports in some modules: instead of polluting each module's namespace with symbols from other modules, which were often themselves imported from yet other modules (yuck), the modules are imported and their symbols referenced explicitly.
426 lines
15 KiB
Python
426 lines
15 KiB
Python
"""
|
|
Transform OEB content into a single (more or less) HTML file.
|
|
"""
|
|
import os
|
|
import re
|
|
import urllib.parse
|
|
|
|
from functools import partial
|
|
from lxml import html
|
|
|
|
from ebook_converter import prepare_string_for_xml
|
|
from ebook_converter import constants as const
|
|
from ebook_converter.ebooks.oeb import base
|
|
from ebook_converter.ebooks.oeb import parse_utils
|
|
from ebook_converter.ebooks.oeb.stylizer import Stylizer
|
|
from ebook_converter.utils.logging import default_log
|
|
from ebook_converter.polyglot.builtins import as_bytes
|
|
|
|
|
|
__license__ = 'GPL 3'
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'

# Tags that the converters in this module write in self-closing form
# (e.g. ``<br />``) and for which no separate closing tag is emitted.
SELF_CLOSING_TAGS = {
    'area',
    'base',
    'basefont',
    'br',
    'hr',
    'input',
    'img',
    'link',
    'meta',
}
|
|
|
|
|
|
class OEB2HTML(object):
    '''
    Base class. All subclasses should implement dump_text to actually transform
    content. Also, callers should use oeb2html to get the transformed html.
    links and images can be retrieved after calling oeb2html to get the mapping
    of OEB links and images to the new names used in the html returned by
    oeb2html. Images will always be referenced as if they are in an images
    directory.

    Use get_css to get the CSS of the OEB document as bytes.
    '''

    # Characters that are written as named HTML entities instead of as raw
    # code points: soft hyphen, em dash, en dash and no-break space.
    _HTML_ENTITIES = {
        0x00ad: '&shy;',
        0x2014: '&mdash;',
        0x2013: '&ndash;',
        0x00a0: '&nbsp;',
    }

    def __init__(self, log=None):
        '''
        @log: Logger used for progress messages; default_log when omitted.
        '''
        self.log = default_log if log is None else log
        # href (optionally with '#fragment') -> '#calibre_link-N' anchor.
        self.links = {}
        # image href -> zero padded file name used in the generated html.
        self.images = {}
        # hrefs of the spine items; filled in by oeb2html and read by
        # map_resources.
        self.base_hrefs = []

    def oeb2html(self, oeb_book, opts):
        '''
        Convert oeb_book to a single html string.

        @oeb_book: The book to convert.
        @opts: Conversion options, stored on the instance for later use.

        Resets and repopulates self.links and self.images as a side effect.
        '''
        self.log.info('Converting OEB book to HTML...')
        self.opts = opts
        try:
            self.book_title = str(oeb_book.metadata.title[0])
        except Exception:
            # Best effort: a missing or unusable title must not abort the
            # conversion.
            self.book_title = 'Unknown'
        self.links = {}
        self.images = {}
        self.base_hrefs = [item.href for item in oeb_book.spine]
        self.map_resources(oeb_book)

        return self.mlize_spine(oeb_book)

    def mlize_spine(self, oeb_book):
        '''
        Dump every spine item through dump_text and wrap the result in a
        single html document whose title is the book title.
        '''
        output = [
            u'<html><head><meta http-equiv="Content-Type" content="text/html;charset=utf-8" /><title>%s</title></head><body>' % (
                prepare_string_for_xml(self.book_title))
        ]
        for item in oeb_book.spine:
            self.log.debug('Converting %s to HTML...' % item.href)
            # ids and links have to be rewritten before dumping, as the
            # dumped markup is built from the (modified) element attributes.
            self.rewrite_ids(item.data, item)
            base.rewrite_links(item.data, partial(self.rewrite_link, page=item))
            stylizer = Stylizer(item.data, item.href, oeb_book, self.opts)
            output += self.dump_text(item.data.find(base.tag('xhtml', 'body')), stylizer, item)
            output.append('\n\n')
        output.append('</body></html>')
        return ''.join(output)

    def dump_text(self, elem, stylizer, page):
        '''
        Return a list of html string fragments for elem. Must be implemented
        by subclasses.
        '''
        raise NotImplementedError

    def get_link_id(self, href, id=''):
        '''
        Return the '#calibre_link-N' anchor for href (plus the optional
        fragment id), registering a new one in self.links on first use.
        '''
        if id:
            href += '#%s' % id
        if href not in self.links:
            self.links[href] = '#calibre_link-%s' % len(self.links)
        return self.links[href]

    def map_resources(self, oeb_book):
        '''
        Populate self.images with a new file name for every image in the
        manifest, and self.links with an anchor for every spine item and for
        every link found in a spine item that points at a spine item.
        '''
        # These are the same for every manifest item, so build them once.
        link_attrs = set(html.defs.link_attrs)
        link_attrs.add(base.tag('xlink', 'href'))
        body_tag = base.tag('xhtml', 'body')
        spine_hrefs = set(self.base_hrefs)

        for item in oeb_book.manifest:
            if item.media_type in base.OEB_IMAGES:
                if item.href not in self.images:
                    ext = os.path.splitext(item.href)[1]
                    fname = '%s%s' % (len(self.images), ext)
                    self.images[item.href] = fname.zfill(10)
            if item in oeb_book.spine:
                self.get_link_id(item.href)
                root = item.data.find(body_tag)
                for el in root.iter():
                    try:
                        is_element = isinstance(el.tag, (str, bytes))
                    except Exception:
                        # Best effort: a node whose tag cannot be read is
                        # skipped rather than aborting the conversion.
                        continue
                    if not is_element:
                        # Comments, processing instructions and the like.
                        continue
                    for attr, value in el.attrib.items():
                        if attr not in link_attrs:
                            continue
                        href, frag = urllib.parse.urldefrag(item.abshref(value))
                        if href in spine_hrefs:
                            self.get_link_id(href, frag)

    def rewrite_link(self, url, page=None):
        '''
        Map url, resolved relative to page, to its name in the generated
        html: 'images/<name>' for known images, the registered anchor for
        known links. Anything else (or any url when page is not given) is
        returned unchanged.
        '''
        if not page:
            return url
        abs_url = page.abshref(base.urlnormalize(url))
        if abs_url in self.images:
            return 'images/%s' % self.images[abs_url]
        if abs_url in self.links:
            return self.links[abs_url]
        return url

    def rewrite_ids(self, root, page):
        '''
        Replace the id of every element below root with the anchor name
        registered for it; the body element gets the anchor of the page
        itself.
        '''
        body_tag = base.tag('xhtml', 'body')
        for el in root.iter():
            try:
                tag = el.tag
            except UnicodeDecodeError:
                continue
            if tag == body_tag:
                # [1:] strips the leading '#' of the anchor.
                el.attrib['id'] = self.get_link_id(page.href)[1:]
                continue
            if 'id' in el.attrib:
                el.attrib['id'] = self.get_link_id(page.href, el.attrib['id'])[1:]

    def get_css(self, oeb_book):
        '''
        Return the concatenated text of all text/css manifest items as
        bytes, each followed by a blank line.
        '''
        return b''.join(
            as_bytes(item.data.cssText) + b'\n\n'
            for item in oeb_book.manifest
            if item.media_type == 'text/css'
        )

    def prepare_string_for_html(self, raw):
        '''
        Escape raw for use as html text and write soft hyphens, em/en dashes
        and no-break spaces as named entities.
        '''
        # Escape first: the entities added afterwards must keep their '&'.
        raw = prepare_string_for_xml(raw)
        return raw.translate(self._HTML_ENTITIES)
|
|
|
|
|
|
class OEB2HTMLNoCSSizer(OEB2HTML):
    '''
    This will remap a small number of CSS styles to equivalent HTML tags.
    '''

    # (CSS property, values that trigger the mapping, HTML tag to emit).
    _STYLE_TAGS = (
        ('font-weight', ('bold', 'bolder'), 'b'),
        ('font-style', ('italic',), 'i'),
        ('text-decoration', ('underline',), 'u'),
        ('text-decoration', ('line-through',), 's'),
    )

    def dump_text(self, elem, stylizer, page):
        '''
        Return elem, its children and its tail text as a list of html
        string fragments.

        @elem: The element in the etree that we are working on.
        @stylizer: The style information attached to the element.
        @page: The spine item elem belongs to; passed on when recursing.

        Note that the class and style attributes are removed from elem
        itself.
        '''
        namespaces = (const.XHTML_NS, const.SVG_NS)

        # We can only process tags. If there isn't a tag return any text.
        if not isinstance(elem.tag, (str, bytes)) \
                or parse_utils.namespace(elem.tag) not in namespaces:
            p = elem.getparent()
            if p is not None and isinstance(p.tag, (str, bytes)) \
                    and parse_utils.namespace(p.tag) in namespaces \
                    and elem.tail:
                # The tail is plain text, so it has to be escaped like any
                # other text written to the output.
                return [self.prepare_string_for_html(elem.tail)]
            return ['']

        # Setup our variables.
        text = ['']
        style = stylizer.style(elem)
        tags = []
        tag = parse_utils.barename(elem.tag)
        attribs = elem.attrib

        if tag == 'body':
            tag = 'div'
        tags.append(tag)

        # Ignore anything that is set to not be displayed. The tail is not
        # part of the hidden element (it is text of the parent), so keep it.
        if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
                or style['visibility'] == 'hidden':
            if elem.tail:
                return [self.prepare_string_for_html(elem.tail)]
            return ['']

        # Remove attributes we won't want.
        for unwanted in ('class', 'style'):
            if unwanted in attribs:
                del attribs[unwanted]

        # Turn the rest of the attributes into a string we can write with
        # the tag.
        at = ''.join(
            ' %s="%s"' % (k, prepare_string_for_xml(v, attribute=True))
            for k, v in attribs.items()
        )

        # Write the tag.
        text.append('<%s%s' % (tag, at))
        text.append(' />' if tag in SELF_CLOSING_TAGS else '>')

        # Turn styles into tags.
        for prop, values, html_tag in self._STYLE_TAGS:
            if style[prop] in values:
                text.append('<%s>' % html_tag)
                tags.append(html_tag)

        # Process tags that contain text.
        if elem.text:
            text.append(self.prepare_string_for_html(elem.text))

        # Recurse down into tags within the tag we are in.
        for item in elem:
            text += self.dump_text(item, stylizer, page)

        # Close all open tags, innermost first.
        for t in reversed(tags):
            if t not in SELF_CLOSING_TAGS:
                text.append('</%s>' % t)

        # Add the text that is outside of the tag.
        if elem.tail:
            text.append(self.prepare_string_for_html(elem.tail))

        return text
|
|
|
|
|
|
class OEB2HTMLInlineCSSizer(OEB2HTML):
    '''
    Turns external CSS classes into inline style attributes.
    '''

    # Any page-break-* declaration, including its trailing semicolon.
    _PAGE_BREAK_RE = re.compile('page-break-[^:]+:[^;]+;?')
    # Runs of two or more whitespace characters.
    _MULTI_SPACE_RE = re.compile(r'\s{2,}')

    def dump_text(self, elem, stylizer, page):
        '''
        Return elem, its children and its tail text as a list of html
        string fragments, with the computed style of each element written
        to its style attribute.

        @elem: The element in the etree that we are working on.
        @stylizer: The style information attached to the element.
        @page: The spine item elem belongs to; passed on when recursing.

        Note that the class and style attributes are removed from elem
        itself.
        '''
        namespaces = (const.XHTML_NS, const.SVG_NS)

        # We can only process tags. If there isn't a tag return any text.
        if not isinstance(elem.tag, (str, bytes)) \
                or parse_utils.namespace(elem.tag) not in namespaces:
            p = elem.getparent()
            if p is not None and isinstance(p.tag, (str, bytes)) \
                    and parse_utils.namespace(p.tag) in namespaces \
                    and elem.tail:
                # The tail is plain text, so it has to be escaped like any
                # other text written to the output.
                return [self.prepare_string_for_html(elem.tail)]
            return ['']

        # Setup our variables.
        text = ['']
        style = stylizer.style(elem)
        tags = []
        tag = parse_utils.barename(elem.tag)
        attribs = elem.attrib

        style_a = '%s' % (style,)
        if tag == 'body':
            # Change the body to a div so we can merge multiple files.
            tag = 'div'
            # Add page-break-before: always because renderers typically
            # treat a new file (we're merging files) as a page break, and
            # remove all other page break types that might be set.
            style_a = 'page-break-before: always; %s' % self._PAGE_BREAK_RE.sub('', style_a)
            # Remove unnecessary spaces.
            style_a = self._MULTI_SPACE_RE.sub(' ', style_a).strip()
        tags.append(tag)

        # Remove attributes we won't want.
        for unwanted in ('class', 'style'):
            if unwanted in attribs:
                del attribs[unwanted]

        # Turn the rest of the attributes into a string we can write with
        # the tag.
        at = ''.join(
            ' %s="%s"' % (k, prepare_string_for_xml(v, attribute=True))
            for k, v in attribs.items()
        )

        # Turn style into a string for putting in the tag. Double quotes
        # would end the attribute value early, so they become single quotes.
        style_t = ''
        if style_a:
            style_t = ' style="%s"' % style_a.replace('"', "'")

        # Write the tag.
        text.append('<%s%s%s' % (tag, at, style_t))
        text.append(' />' if tag in SELF_CLOSING_TAGS else '>')

        # Process tags that contain text.
        if elem.text:
            text.append(self.prepare_string_for_html(elem.text))

        # Recurse down into tags within the tag we are in.
        for item in elem:
            text += self.dump_text(item, stylizer, page)

        # Close all open tags, innermost first.
        for t in reversed(tags):
            if t not in SELF_CLOSING_TAGS:
                text.append('</%s>' % t)

        # Add the text that is outside of the tag.
        if elem.tail:
            text.append(self.prepare_string_for_html(elem.tail))

        return text
|
|
|
|
|
|
class OEB2HTMLClassCSSizer(OEB2HTML):
    '''
    Use CSS classes. The htmlz_class_style option can specify whether to use
    inline classes (style tag in the head) or reference an external
    CSS file called style.css.
    '''

    def mlize_spine(self, oeb_book):
        '''
        Dump every spine item and wrap the result in a single html document
        whose head either links to style.css (htmlz_class_style is
        'external') or embeds the CSS of the book in a style tag.
        '''
        output = []
        for item in oeb_book.spine:
            self.log.debug('Converting %s to HTML...' % item.href)
            self.rewrite_ids(item.data, item)
            base.rewrite_links(item.data, partial(self.rewrite_link, page=item))
            stylizer = Stylizer(item.data, item.href, oeb_book, self.opts)
            output += self.dump_text(item.data.find(base.tag('xhtml', 'body')), stylizer, item)
            output.append('\n\n')

        if self.opts.htmlz_class_style == 'external':
            css = u'<link href="style.css" rel="stylesheet" type="text/css" />'
        else:
            raw_css = self.get_css(oeb_book)
            # get_css returns bytes, which cannot be concatenated with str.
            # NOTE(review): assumes as_bytes encodes as UTF-8, matching the
            # charset declared in the head - confirm.
            if isinstance(raw_css, bytes):
                raw_css = raw_css.decode('utf-8')
            css = u'<style type="text/css">' + raw_css + u'</style>'

        title = u'<title>%s</title>' % prepare_string_for_xml(self.book_title)
        head = [
            u'<html><head><meta http-equiv="Content-Type" content="text/html;charset=utf-8" />',
            css,
            title,
            u'</head><body>',
        ]
        return ''.join(head + output + [u'</body></html>'])

    def dump_text(self, elem, stylizer, page):
        '''
        Return elem, its children and its tail text as a list of html
        string fragments. The class attributes are kept, so the styling
        comes from the CSS of the book.

        @elem: The element in the etree that we are working on.
        @stylizer: The style information attached to the element. It is
                   only passed on when recursing.
        @page: The spine item elem belongs to; passed on when recursing.

        Note that the style attribute is removed from elem itself.
        '''
        namespaces = (const.XHTML_NS, const.SVG_NS)

        # We can only process tags. If there isn't a tag return any text.
        if not isinstance(elem.tag, (str, bytes)) \
                or parse_utils.namespace(elem.tag) not in namespaces:
            p = elem.getparent()
            if p is not None and isinstance(p.tag, (str, bytes)) \
                    and parse_utils.namespace(p.tag) in namespaces \
                    and elem.tail:
                # The tail is plain text, so it has to be escaped like any
                # other text written to the output.
                return [self.prepare_string_for_html(elem.tail)]
            return ['']

        # Setup our variables.
        text = ['']
        tags = []
        tag = parse_utils.barename(elem.tag)
        attribs = elem.attrib

        if tag == 'body':
            tag = 'div'
        tags.append(tag)

        # Remove attributes we won't want.
        if 'style' in attribs:
            del attribs['style']

        # Turn the rest of the attributes into a string we can write with
        # the tag.
        at = ''.join(
            ' %s="%s"' % (k, prepare_string_for_xml(v, attribute=True))
            for k, v in attribs.items()
        )

        # Write the tag.
        text.append('<%s%s' % (tag, at))
        text.append(' />' if tag in SELF_CLOSING_TAGS else '>')

        # Process tags that contain text.
        if elem.text:
            text.append(self.prepare_string_for_html(elem.text))

        # Recurse down into tags within the tag we are in.
        for item in elem:
            text += self.dump_text(item, stylizer, page)

        # Close all open tags, innermost first.
        for t in reversed(tags):
            if t not in SELF_CLOSING_TAGS:
                text.append('</%s>' % t)

        # Add the text that is outside of the tag.
        if elem.tail:
            text.append(self.prepare_string_for_html(elem.tail))

        return text
|
|
|
|
|
|
def oeb2html_no_css(oeb_book, log, opts):
    '''
    Convert oeb_book with OEB2HTMLNoCSSizer.

    Returns a tuple of the generated html and the mapping of image hrefs to
    the image names used in that html.
    '''
    converter = OEB2HTMLNoCSSizer(log)
    markup = converter.oeb2html(oeb_book, opts)
    # The image mapping is only complete once the conversion has run.
    return (markup, converter.images)
|
|
|
|
|
|
def oeb2html_inline_css(oeb_book, log, opts):
    '''
    Convert oeb_book with OEB2HTMLInlineCSSizer.

    Returns a tuple of the generated html and the mapping of image hrefs to
    the image names used in that html.
    '''
    converter = OEB2HTMLInlineCSSizer(log)
    markup = converter.oeb2html(oeb_book, opts)
    # The image mapping is only complete once the conversion has run.
    return (markup, converter.images)
|
|
|
|
|
|
def oeb2html_class_css(oeb_book, log, opts):
    '''
    Convert oeb_book with OEB2HTMLClassCSSizer.

    Returns a tuple of the generated html and the mapping of image hrefs to
    the image names used in that html.

    Sets class_style on opts, and htmlz_class_style when opts does not
    provide it.
    '''
    izer = OEB2HTMLClassCSSizer(log)
    setattr(opts, 'class_style', 'inline')
    # The converter reads htmlz_class_style, not class_style. Default it to
    # inline as well, so that options without it do not raise an
    # AttributeError; a value chosen by the caller is left alone.
    if getattr(opts, 'htmlz_class_style', None) is None:
        setattr(opts, 'htmlz_class_style', 'inline')
    markup = izer.oeb2html(oeb_book, opts)
    images = izer.images
    return (markup, images)
|