From 5ff0f058d39eddb65de44924dbf20d889a07d0a9 Mon Sep 17 00:00:00 2001 From: gryf Date: Tue, 14 Apr 2020 19:01:19 +0200 Subject: [PATCH] Added hmtlz module --- ebook_converter/ebooks/htmlz/__init__.py | 0 ebook_converter/ebooks/htmlz/oeb2html.py | 428 +++++++++++++++++++++++ 2 files changed, 428 insertions(+) create mode 100644 ebook_converter/ebooks/htmlz/__init__.py create mode 100644 ebook_converter/ebooks/htmlz/oeb2html.py diff --git a/ebook_converter/ebooks/htmlz/__init__.py b/ebook_converter/ebooks/htmlz/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/ebook_converter/ebooks/htmlz/oeb2html.py b/ebook_converter/ebooks/htmlz/oeb2html.py new file mode 100644 index 0000000..9684723 --- /dev/null +++ b/ebook_converter/ebooks/htmlz/oeb2html.py @@ -0,0 +1,428 @@ +# -*- coding: utf-8 -*- +from __future__ import absolute_import, division, print_function, unicode_literals + + +__license__ = 'GPL 3' +__copyright__ = '2011, John Schember ' +__docformat__ = 'restructuredtext en' + +''' +Transform OEB content into a single (more or less) HTML file. +''' + +import os +import re + +from functools import partial +from lxml import html + +from calibre import prepare_string_for_xml +from calibre.ebooks.oeb.base import ( + XHTML, XHTML_NS, SVG_NS, barename, namespace, OEB_IMAGES, XLINK, rewrite_links, urlnormalize) +from calibre.ebooks.oeb.stylizer import Stylizer +from calibre.utils.logging import default_log +from polyglot.builtins import unicode_type, string_or_bytes, as_bytes +from polyglot.urllib import urldefrag + +SELF_CLOSING_TAGS = {'area', 'base', 'basefont', 'br', 'hr', 'input', 'img', 'link', 'meta'} + + +class OEB2HTML(object): + ''' + Base class. All subclasses should implement dump_text to actually transform + content. Also, callers should use oeb2html to get the transformed html. + links and images can be retrieved after calling oeb2html to get the mapping + of OEB links and images to the new names used in the html returned by oeb2html. + Images will always be referenced as if they are in an images directory. + + Use get_css to get the CSS classes for the OEB document as a string. + ''' + + def __init__(self, log=None): + self.log = default_log if log is None else log + self.links = {} + self.images = {} + + def oeb2html(self, oeb_book, opts): + self.log.info('Converting OEB book to HTML...') + self.opts = opts + try: + self.book_title = unicode_type(oeb_book.metadata.title[0]) + except Exception: + self.book_title = _('Unknown') + self.links = {} + self.images = {} + self.base_hrefs = [item.href for item in oeb_book.spine] + self.map_resources(oeb_book) + + return self.mlize_spine(oeb_book) + + def mlize_spine(self, oeb_book): + output = [ + u'%s' % ( + prepare_string_for_xml(self.book_title)) + ] + for item in oeb_book.spine: + self.log.debug('Converting %s to HTML...' % item.href) + self.rewrite_ids(item.data, item) + rewrite_links(item.data, partial(self.rewrite_link, page=item)) + stylizer = Stylizer(item.data, item.href, oeb_book, self.opts) + output += self.dump_text(item.data.find(XHTML('body')), stylizer, item) + output.append('\n\n') + output.append('') + return ''.join(output) + + def dump_text(self, elem, stylizer, page): + raise NotImplementedError + + def get_link_id(self, href, id=''): + if id: + href += '#%s' % id + if href not in self.links: + self.links[href] = '#calibre_link-%s' % len(self.links.keys()) + return self.links[href] + + def map_resources(self, oeb_book): + for item in oeb_book.manifest: + if item.media_type in OEB_IMAGES: + if item.href not in self.images: + ext = os.path.splitext(item.href)[1] + fname = '%s%s' % (len(self.images), ext) + fname = fname.zfill(10) + self.images[item.href] = fname + if item in oeb_book.spine: + self.get_link_id(item.href) + root = item.data.find(XHTML('body')) + link_attrs = set(html.defs.link_attrs) + link_attrs.add(XLINK('href')) + for el in root.iter(): + attribs = el.attrib + try: + if not isinstance(el.tag, string_or_bytes): + continue + except: + continue + for attr in attribs: + if attr in link_attrs: + href = item.abshref(attribs[attr]) + href, id = urldefrag(href) + if href in self.base_hrefs: + self.get_link_id(href, id) + + def rewrite_link(self, url, page=None): + if not page: + return url + abs_url = page.abshref(urlnormalize(url)) + if abs_url in self.images: + return 'images/%s' % self.images[abs_url] + if abs_url in self.links: + return self.links[abs_url] + return url + + def rewrite_ids(self, root, page): + for el in root.iter(): + try: + tag = el.tag + except UnicodeDecodeError: + continue + if tag == XHTML('body'): + el.attrib['id'] = self.get_link_id(page.href)[1:] + continue + if 'id' in el.attrib: + el.attrib['id'] = self.get_link_id(page.href, el.attrib['id'])[1:] + + def get_css(self, oeb_book): + css = b'' + for item in oeb_book.manifest: + if item.media_type == 'text/css': + css += as_bytes(item.data.cssText) + b'\n\n' + return css + + def prepare_string_for_html(self, raw): + raw = prepare_string_for_xml(raw) + raw = raw.replace(u'\u00ad', '­') + raw = raw.replace(u'\u2014', '—') + raw = raw.replace(u'\u2013', '–') + raw = raw.replace(u'\u00a0', ' ') + return raw + + +class OEB2HTMLNoCSSizer(OEB2HTML): + ''' + This will remap a small number of CSS styles to equivalent HTML tags. + ''' + + def dump_text(self, elem, stylizer, page): + ''' + @elem: The element in the etree that we are working on. + @stylizer: The style information attached to the element. + ''' + + # We can only processes tags. If there isn't a tag return any text. + if not isinstance(elem.tag, string_or_bytes) \ + or namespace(elem.tag) not in (XHTML_NS, SVG_NS): + p = elem.getparent() + if p is not None and isinstance(p.tag, string_or_bytes) and namespace(p.tag) in (XHTML_NS, SVG_NS) \ + and elem.tail: + return [elem.tail] + return [''] + + # Setup our variables. + text = [''] + style = stylizer.style(elem) + tags = [] + tag = barename(elem.tag) + attribs = elem.attrib + + if tag == 'body': + tag = 'div' + tags.append(tag) + + # Ignore anything that is set to not be displayed. + if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \ + or style['visibility'] == 'hidden': + return [''] + + # Remove attributes we won't want. + if 'class' in attribs: + del attribs['class'] + if 'style' in attribs: + del attribs['style'] + + # Turn the rest of the attributes into a string we can write with the tag. + at = '' + for k, v in attribs.items(): + at += ' %s="%s"' % (k, prepare_string_for_xml(v, attribute=True)) + + # Write the tag. + text.append('<%s%s' % (tag, at)) + if tag in SELF_CLOSING_TAGS: + text.append(' />') + else: + text.append('>') + + # Turn styles into tags. + if style['font-weight'] in ('bold', 'bolder'): + text.append('') + tags.append('b') + if style['font-style'] == 'italic': + text.append('') + tags.append('i') + if style['text-decoration'] == 'underline': + text.append('') + tags.append('u') + if style['text-decoration'] == 'line-through': + text.append('') + tags.append('s') + + # Process tags that contain text. + if hasattr(elem, 'text') and elem.text: + text.append(self.prepare_string_for_html(elem.text)) + + # Recurse down into tags within the tag we are in. + for item in elem: + text += self.dump_text(item, stylizer, page) + + # Close all open tags. + tags.reverse() + for t in tags: + if t not in SELF_CLOSING_TAGS: + text.append('' % t) + + # Add the text that is outside of the tag. + if hasattr(elem, 'tail') and elem.tail: + text.append(self.prepare_string_for_html(elem.tail)) + + return text + + +class OEB2HTMLInlineCSSizer(OEB2HTML): + ''' + Turns external CSS classes into inline style attributes. + ''' + + def dump_text(self, elem, stylizer, page): + ''' + @elem: The element in the etree that we are working on. + @stylizer: The style information attached to the element. + ''' + + # We can only processes tags. If there isn't a tag return any text. + if not isinstance(elem.tag, string_or_bytes) \ + or namespace(elem.tag) not in (XHTML_NS, SVG_NS): + p = elem.getparent() + if p is not None and isinstance(p.tag, string_or_bytes) and namespace(p.tag) in (XHTML_NS, SVG_NS) \ + and elem.tail: + return [elem.tail] + return [''] + + # Setup our variables. + text = [''] + style = stylizer.style(elem) + tags = [] + tag = barename(elem.tag) + attribs = elem.attrib + + style_a = '%s' % style + style_a = style_a if style_a else '' + if tag == 'body': + # Change the body to a div so we can merge multiple files. + tag = 'div' + # Add page-break-brefore: always because renders typically treat a new file (we're merging files) + # as a page break and remove all other page break types that might be set. + style_a = 'page-break-before: always; %s' % re.sub('page-break-[^:]+:[^;]+;?', '', style_a) + # Remove unnecessary spaces. + style_a = re.sub(r'\s{2,}', ' ', style_a).strip() + tags.append(tag) + + # Remove attributes we won't want. + if 'class' in attribs: + del attribs['class'] + if 'style' in attribs: + del attribs['style'] + + # Turn the rest of the attributes into a string we can write with the tag. + at = '' + for k, v in attribs.items(): + at += ' %s="%s"' % (k, prepare_string_for_xml(v, attribute=True)) + + # Turn style into strings for putting in the tag. + style_t = '' + if style_a: + style_t = ' style="%s"' % style_a.replace('"', "'") + + # Write the tag. + text.append('<%s%s%s' % (tag, at, style_t)) + if tag in SELF_CLOSING_TAGS: + text.append(' />') + else: + text.append('>') + + # Process tags that contain text. + if hasattr(elem, 'text') and elem.text: + text.append(self.prepare_string_for_html(elem.text)) + + # Recurse down into tags within the tag we are in. + for item in elem: + text += self.dump_text(item, stylizer, page) + + # Close all open tags. + tags.reverse() + for t in tags: + if t not in SELF_CLOSING_TAGS: + text.append('' % t) + + # Add the text that is outside of the tag. + if hasattr(elem, 'tail') and elem.tail: + text.append(self.prepare_string_for_html(elem.tail)) + + return text + + +class OEB2HTMLClassCSSizer(OEB2HTML): + ''' + Use CSS classes. css_style option can specify whether to use + inline classes (style tag in the head) or reference an external + CSS file called style.css. + ''' + + def mlize_spine(self, oeb_book): + output = [] + for item in oeb_book.spine: + self.log.debug('Converting %s to HTML...' % item.href) + self.rewrite_ids(item.data, item) + rewrite_links(item.data, partial(self.rewrite_link, page=item)) + stylizer = Stylizer(item.data, item.href, oeb_book, self.opts) + output += self.dump_text(item.data.find(XHTML('body')), stylizer, item) + output.append('\n\n') + if self.opts.htmlz_class_style == 'external': + css = u'' + else: + css = u'' + title = u'%s' % prepare_string_for_xml(self.book_title) + output = [u''] + \ + [css] + [title, u''] + output + [u''] + return ''.join(output) + + def dump_text(self, elem, stylizer, page): + ''' + @elem: The element in the etree that we are working on. + @stylizer: The style information attached to the element. + ''' + + # We can only processes tags. If there isn't a tag return any text. + if not isinstance(elem.tag, string_or_bytes) \ + or namespace(elem.tag) not in (XHTML_NS, SVG_NS): + p = elem.getparent() + if p is not None and isinstance(p.tag, string_or_bytes) and namespace(p.tag) in (XHTML_NS, SVG_NS) \ + and elem.tail: + return [elem.tail] + return [''] + + # Setup our variables. + text = [''] + tags = [] + tag = barename(elem.tag) + attribs = elem.attrib + + if tag == 'body': + tag = 'div' + tags.append(tag) + + # Remove attributes we won't want. + if 'style' in attribs: + del attribs['style'] + + # Turn the rest of the attributes into a string we can write with the tag. + at = '' + for k, v in attribs.items(): + at += ' %s="%s"' % (k, prepare_string_for_xml(v, attribute=True)) + + # Write the tag. + text.append('<%s%s' % (tag, at)) + if tag in SELF_CLOSING_TAGS: + text.append(' />') + else: + text.append('>') + + # Process tags that contain text. + if hasattr(elem, 'text') and elem.text: + text.append(self.prepare_string_for_html(elem.text)) + + # Recurse down into tags within the tag we are in. + for item in elem: + text += self.dump_text(item, stylizer, page) + + # Close all open tags. + tags.reverse() + for t in tags: + if t not in SELF_CLOSING_TAGS: + text.append('' % t) + + # Add the text that is outside of the tag. + if hasattr(elem, 'tail') and elem.tail: + text.append(self.prepare_string_for_html(elem.tail)) + + return text + + +def oeb2html_no_css(oeb_book, log, opts): + izer = OEB2HTMLNoCSSizer(log) + html = izer.oeb2html(oeb_book, opts) + images = izer.images + return (html, images) + + +def oeb2html_inline_css(oeb_book, log, opts): + izer = OEB2HTMLInlineCSSizer(log) + html = izer.oeb2html(oeb_book, opts) + images = izer.images + return (html, images) + + +def oeb2html_class_css(oeb_book, log, opts): + izer = OEB2HTMLClassCSSizer(log) + setattr(opts, 'class_style', 'inline') + html = izer.oeb2html(oeb_book, opts) + images = izer.images + return (html, images)