diff --git a/ebook_converter/ebooks/htmlz/__init__.py b/ebook_converter/ebooks/htmlz/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/ebook_converter/ebooks/htmlz/oeb2html.py b/ebook_converter/ebooks/htmlz/oeb2html.py
new file mode 100644
index 0000000..9684723
--- /dev/null
+++ b/ebook_converter/ebooks/htmlz/oeb2html.py
@@ -0,0 +1,428 @@
+# -*- coding: utf-8 -*-
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+
+__license__ = 'GPL 3'
+__copyright__ = '2011, John Schember '
+__docformat__ = 'restructuredtext en'
+
+'''
+Transform OEB content into a single (more or less) HTML file.
+'''
+
+import os
+import re
+
+from functools import partial
+from lxml import html
+
+from calibre import prepare_string_for_xml
+from calibre.ebooks.oeb.base import (
+ XHTML, XHTML_NS, SVG_NS, barename, namespace, OEB_IMAGES, XLINK, rewrite_links, urlnormalize)
+from calibre.ebooks.oeb.stylizer import Stylizer
+from calibre.utils.logging import default_log
+from polyglot.builtins import unicode_type, string_or_bytes, as_bytes
+from polyglot.urllib import urldefrag
+
# Elements that are written as ``<tag ... />`` and never get a closing tag.
# Besides the original entries this lists the remaining HTML void elements
# (current and obsolete) so that e.g. ``col`` or ``wbr`` are not serialised
# with a closing tag.
SELF_CLOSING_TAGS = {
    'area', 'base', 'basefont', 'bgsound', 'br', 'col', 'embed', 'frame',
    'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source',
    'track', 'wbr',
}
+
+
class OEB2HTML(object):
    '''
    Base class. All subclasses should implement dump_text to actually transform
    content. Callers should use oeb2html to get the transformed HTML.

    After oeb2html has run, ``links`` maps the original hrefs (optionally with
    a ``#fragment``) to the ``#calibre_link-N`` anchors used in the output and
    ``images`` maps image hrefs to the new file names. Images are always
    referenced as if they are in an ``images`` directory.

    Use get_css to get the CSS of the OEB document.
    '''

    def __init__(self, log=None):
        '''
        @log: Logger used for progress messages; default_log when None.
        '''
        self.log = default_log if log is None else log
        self.links = {}
        self.images = {}

    def oeb2html(self, oeb_book, opts):
        '''
        Convert oeb_book and return the resulting HTML as one string.

        @oeb_book: The book to convert; its metadata, manifest and spine are read.
        @opts: Conversion options; stored on the instance and passed to Stylizer.
        '''
        self.log.info('Converting OEB book to HTML...')
        self.opts = opts
        try:
            self.book_title = unicode_type(oeb_book.metadata.title[0])
        except Exception:
            # Best effort: any problem reading the title yields a placeholder.
            self.book_title = _('Unknown')
        # Start from empty mappings so an instance can convert several books.
        self.links = {}
        self.images = {}
        self.base_hrefs = [item.href for item in oeb_book.spine]
        self.map_resources(oeb_book)

        return self.mlize_spine(oeb_book)

    def mlize_spine(self, oeb_book):
        '''
        Serialise every spine item, in order, into one HTML document.

        The title is XML-escaped and placed in the document head; each spine
        item's body is rendered by dump_text and followed by a blank line.
        '''
        # NOTE(review): the charset declaration assumes the caller writes the
        # returned text out as UTF-8 -- confirm against the output plugin.
        output = [
            u'<html><head>'
            u'<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />'
            u'<title>%s</title></head><body>' % prepare_string_for_xml(self.book_title)
        ]
        for item in oeb_book.spine:
            self.log.debug('Converting %s to HTML...' % item.href)
            # Ids and links must be rewritten before serialising so that the
            # merged document uses the unified #calibre_link-N anchors.
            self.rewrite_ids(item.data, item)
            rewrite_links(item.data, partial(self.rewrite_link, page=item))
            body = item.data.find(XHTML('body'))
            if body is None:
                # Nothing to serialise; dump_text would fail on None.
                self.log.debug('No body found in %s, skipping' % item.href)
                continue
            stylizer = Stylizer(item.data, item.href, oeb_book, self.opts)
            output += self.dump_text(body, stylizer, item)
            output.append('\n\n')
        output.append('</body></html>')
        return ''.join(output)

    def dump_text(self, elem, stylizer, page):
        '''
        Serialise elem into a list of string fragments. Must be overridden.
        '''
        raise NotImplementedError

    def get_link_id(self, href, id=''):
        '''
        Return the ``#calibre_link-N`` anchor for href (plus ``#id`` when id is
        given), allocating the next number the first time a target is seen.
        '''
        if id:
            href += '#%s' % id
        if href not in self.links:
            self.links[href] = '#calibre_link-%s' % len(self.links)
        return self.links[href]

    def map_resources(self, oeb_book):
        '''
        Fill ``images`` and ``links`` from the manifest of oeb_book.

        Every image gets a new zero-padded, numbered file name. Every spine
        item gets an anchor, as does every link target inside a spine item's
        body that points at another spine item.
        '''
        # The set of link-carrying attributes does not depend on the item.
        link_attrs = set(html.defs.link_attrs)
        link_attrs.add(XLINK('href'))
        for item in oeb_book.manifest:
            if item.media_type in OEB_IMAGES:
                if item.href not in self.images:
                    ext = os.path.splitext(item.href)[1]
                    fname = '%s%s' % (len(self.images), ext)
                    self.images[item.href] = fname.zfill(10)
            if item in oeb_book.spine:
                self.get_link_id(item.href)
                root = item.data.find(XHTML('body'))
                if root is None:
                    # No body means there are no links to collect.
                    continue
                for el in root.iter():
                    attribs = el.attrib
                    try:
                        if not isinstance(el.tag, string_or_bytes):
                            continue
                    except UnicodeDecodeError:
                        # Same guard as rewrite_ids: skip tags that cannot be read.
                        continue
                    for attr in attribs:
                        if attr in link_attrs:
                            target, fragment = urldefrag(item.abshref(attribs[attr]))
                            # Only links into the spine become internal anchors.
                            if target in self.base_hrefs:
                                self.get_link_id(target, fragment)

    def rewrite_link(self, url, page=None):
        '''
        Map url, resolved relative to page, to its new image path or internal
        anchor. Unknown urls, and any url when page is not given, are returned
        unchanged.
        '''
        if not page:
            return url
        abs_url = page.abshref(urlnormalize(url))
        if abs_url in self.images:
            return 'images/%s' % self.images[abs_url]
        if abs_url in self.links:
            return self.links[abs_url]
        return url

    def rewrite_ids(self, root, page):
        '''
        Replace the id attributes below root with the anchor names allocated by
        get_link_id; the body element gets the anchor of the page itself.
        '''
        for el in root.iter():
            try:
                tag = el.tag
            except UnicodeDecodeError:
                continue
            # [1:] drops the leading '#' of the anchor to get a plain id.
            if tag == XHTML('body'):
                el.attrib['id'] = self.get_link_id(page.href)[1:]
                continue
            if 'id' in el.attrib:
                el.attrib['id'] = self.get_link_id(page.href, el.attrib['id'])[1:]

    def get_css(self, oeb_book):
        '''
        Return the text of all ``text/css`` manifest items as bytes, each
        followed by a blank line.
        '''
        return b''.join(
            as_bytes(item.data.cssText) + b'\n\n'
            for item in oeb_book.manifest
            if item.media_type == 'text/css'
        )

    def prepare_string_for_html(self, raw):
        '''
        XML-escape raw and write soft hyphens, dashes and non-breaking spaces
        as named HTML entities.
        '''
        raw = prepare_string_for_xml(raw)
        # Done after escaping so the ampersands of the entities stay intact.
        raw = raw.replace(u'\u00ad', '&shy;')
        raw = raw.replace(u'\u2014', '&mdash;')
        raw = raw.replace(u'\u2013', '&ndash;')
        raw = raw.replace(u'\u00a0', '&nbsp;')
        return raw
+
+
class OEB2HTMLNoCSSizer(OEB2HTML):
    '''
    This will remap a small number of CSS styles to equivalent HTML tags.
    '''

    def dump_text(self, elem, stylizer, page):
        '''
        Serialise elem, its children and its tail text without any CSS.

        @elem: The element in the etree that we are working on.
        @stylizer: The style information attached to the element.
        @page: The spine item elem belongs to; passed on unchanged when recursing.

        Returns a list of string fragments for the caller to join.
        '''

        # We can only process tags. If there isn't a tag return any text.
        if not isinstance(elem.tag, string_or_bytes) \
                or namespace(elem.tag) not in (XHTML_NS, SVG_NS):
            p = elem.getparent()
            if p is not None and isinstance(p.tag, string_or_bytes) \
                    and namespace(p.tag) in (XHTML_NS, SVG_NS) and elem.tail:
                # The tail is ordinary document text and needs the same
                # escaping as every other piece of text that is written.
                return [self.prepare_string_for_html(elem.tail)]
            return ['']

        style = stylizer.style(elem)

        # Ignore anything that is set to not be displayed.
        if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
                or style['visibility'] == 'hidden':
            return ['']

        tag = barename(elem.tag)
        if tag == 'body':
            # Several files are merged into one document, so a body becomes a div.
            tag = 'div'

        # Remove attributes we won't want.
        attribs = elem.attrib
        for unwanted in ('class', 'style'):
            if unwanted in attribs:
                del attribs[unwanted]

        # Turn the rest of the attributes into a string we can write with the tag.
        at = ''.join(
            ' %s="%s"' % (k, prepare_string_for_xml(v, attribute=True))
            for k, v in attribs.items()
        )

        # Write the tag.
        text = ['<%s%s' % (tag, at)]
        text.append(' />' if tag in SELF_CLOSING_TAGS else '>')

        # Turn styles into tags. ``tags`` records everything that was opened,
        # in order, so that it can be closed again in reverse.
        tags = [tag]
        if style['font-weight'] in ('bold', 'bolder'):
            tags.append('b')
        if style['font-style'] == 'italic':
            tags.append('i')
        if style['text-decoration'] == 'underline':
            tags.append('u')
        if style['text-decoration'] == 'line-through':
            tags.append('s')
        text.extend('<%s>' % t for t in tags[1:])

        # Process tags that contain text.
        if elem.text:
            text.append(self.prepare_string_for_html(elem.text))

        # Recurse down into tags within the tag we are in.
        for item in elem:
            text += self.dump_text(item, stylizer, page)

        # Close all open tags, innermost first.
        for t in reversed(tags):
            if t not in SELF_CLOSING_TAGS:
                text.append('</%s>' % t)

        # Add the text that is outside of the tag.
        if elem.tail:
            text.append(self.prepare_string_for_html(elem.tail))

        return text
+
+
class OEB2HTMLInlineCSSizer(OEB2HTML):
    '''
    Turns external CSS classes into inline style attributes.
    '''

    def dump_text(self, elem, stylizer, page):
        '''
        Serialise elem, its children and its tail text, writing the computed
        style of each element into a style attribute.

        @elem: The element in the etree that we are working on.
        @stylizer: The style information attached to the element.
        @page: The spine item elem belongs to; passed on unchanged when recursing.

        Returns a list of string fragments for the caller to join.
        '''

        # We can only process tags. If there isn't a tag return any text.
        if not isinstance(elem.tag, string_or_bytes) \
                or namespace(elem.tag) not in (XHTML_NS, SVG_NS):
            p = elem.getparent()
            if p is not None and isinstance(p.tag, string_or_bytes) \
                    and namespace(p.tag) in (XHTML_NS, SVG_NS) and elem.tail:
                # The tail is ordinary document text and needs the same
                # escaping as every other piece of text that is written.
                return [self.prepare_string_for_html(elem.tail)]
            return ['']

        tag = barename(elem.tag)
        style_a = '%s' % stylizer.style(elem)
        if tag == 'body':
            # Change the body to a div so we can merge multiple files.
            tag = 'div'
            # Add page-break-before: always because renderers typically treat
            # a new file (we're merging files) as a page break, and remove all
            # other page break types that might be set.
            style_a = 'page-break-before: always; %s' % re.sub(
                r'page-break-[^:]+:[^;]+;?', '', style_a)
        # Remove unnecessary spaces.
        style_a = re.sub(r'\s{2,}', ' ', style_a).strip()

        # Remove attributes we won't want.
        attribs = elem.attrib
        for unwanted in ('class', 'style'):
            if unwanted in attribs:
                del attribs[unwanted]

        # Turn the rest of the attributes into a string we can write with the tag.
        at = ''.join(
            ' %s="%s"' % (k, prepare_string_for_xml(v, attribute=True))
            for k, v in attribs.items()
        )

        # Turn style into a string for putting in the tag. Double quotes would
        # end the attribute early, so they are swapped for single quotes.
        style_t = ''
        if style_a:
            style_t = ' style="%s"' % style_a.replace('"', "'")

        # Write the tag.
        text = ['<%s%s%s' % (tag, at, style_t)]
        text.append(' />' if tag in SELF_CLOSING_TAGS else '>')

        # Process tags that contain text.
        if elem.text:
            text.append(self.prepare_string_for_html(elem.text))

        # Recurse down into tags within the tag we are in.
        for item in elem:
            text += self.dump_text(item, stylizer, page)

        # Close the tag.
        if tag not in SELF_CLOSING_TAGS:
            text.append('</%s>' % tag)

        # Add the text that is outside of the tag.
        if elem.tail:
            text.append(self.prepare_string_for_html(elem.tail))

        return text
+
+
class OEB2HTMLClassCSSizer(OEB2HTML):
    '''
    Use CSS classes. The htmlz_class_style option can specify whether to use
    inline classes (style tag in the head) or reference an external
    CSS file called style.css.
    '''

    def mlize_spine(self, oeb_book):
        '''
        Serialise every spine item, in order, into one HTML document whose
        head either links to style.css or embeds the CSS of the book.
        '''
        output = []
        for item in oeb_book.spine:
            self.log.debug('Converting %s to HTML...' % item.href)
            # Ids and links must be rewritten before serialising so that the
            # merged document uses the unified #calibre_link-N anchors.
            self.rewrite_ids(item.data, item)
            rewrite_links(item.data, partial(self.rewrite_link, page=item))
            body = item.data.find(XHTML('body'))
            if body is None:
                # Nothing to serialise; dump_text would fail on None.
                self.log.debug('No body found in %s, skipping' % item.href)
                continue
            stylizer = Stylizer(item.data, item.href, oeb_book, self.opts)
            output += self.dump_text(body, stylizer, item)
            output.append('\n\n')
        if self.opts.htmlz_class_style == 'external':
            css = u'<link href="style.css" rel="stylesheet" type="text/css" />'
        else:
            raw_css = self.get_css(oeb_book)
            if isinstance(raw_css, bytes):
                # get_css builds its result with as_bytes; assumed to be
                # UTF-8 -- TODO confirm against polyglot.builtins.as_bytes.
                raw_css = raw_css.decode('utf-8')
            css = u'<style type="text/css">' + raw_css + u'</style>'
        title = u'<title>%s</title>' % prepare_string_for_xml(self.book_title)
        # NOTE(review): the charset declaration assumes the caller writes the
        # returned text out as UTF-8 -- confirm against the output plugin.
        output = [u'<html><head><meta http-equiv="Content-Type" content="text/html;charset=utf-8" />'] + \
            [css] + [title, u'</head><body>'] + output + [u'</body></html>']
        return ''.join(output)

    def dump_text(self, elem, stylizer, page):
        '''
        Serialise elem, its children and its tail text, keeping the class
        attributes so that the CSS of the book still applies.

        @elem: The element in the etree that we are working on.
        @stylizer: The style information attached to the element.
        @page: The spine item elem belongs to; passed on unchanged when recursing.

        Returns a list of string fragments for the caller to join.
        '''

        # We can only process tags. If there isn't a tag return any text.
        if not isinstance(elem.tag, string_or_bytes) \
                or namespace(elem.tag) not in (XHTML_NS, SVG_NS):
            p = elem.getparent()
            if p is not None and isinstance(p.tag, string_or_bytes) \
                    and namespace(p.tag) in (XHTML_NS, SVG_NS) and elem.tail:
                # The tail is ordinary document text and needs the same
                # escaping as every other piece of text that is written.
                return [self.prepare_string_for_html(elem.tail)]
            return ['']

        tag = barename(elem.tag)
        if tag == 'body':
            # Several files are merged into one document, so a body becomes a div.
            tag = 'div'

        # Remove attributes we won't want.
        attribs = elem.attrib
        if 'style' in attribs:
            del attribs['style']

        # Turn the rest of the attributes into a string we can write with the tag.
        at = ''.join(
            ' %s="%s"' % (k, prepare_string_for_xml(v, attribute=True))
            for k, v in attribs.items()
        )

        # Write the tag.
        text = ['<%s%s' % (tag, at)]
        text.append(' />' if tag in SELF_CLOSING_TAGS else '>')

        # Process tags that contain text.
        if elem.text:
            text.append(self.prepare_string_for_html(elem.text))

        # Recurse down into tags within the tag we are in.
        for item in elem:
            text += self.dump_text(item, stylizer, page)

        # Close the tag.
        if tag not in SELF_CLOSING_TAGS:
            text.append('</%s>' % tag)

        # Add the text that is outside of the tag.
        if elem.tail:
            text.append(self.prepare_string_for_html(elem.tail))

        return text
+
+
def oeb2html_no_css(oeb_book, log, opts):
    '''
    Convert oeb_book with OEB2HTMLNoCSSizer.

    Returns a tuple of the generated HTML and the converter's ``images``
    mapping as it is after the conversion.
    '''
    converter = OEB2HTMLNoCSSizer(log)
    markup = converter.oeb2html(oeb_book, opts)
    return (markup, converter.images)
+
+
def oeb2html_inline_css(oeb_book, log, opts):
    '''
    Convert oeb_book with OEB2HTMLInlineCSSizer.

    Returns a tuple of the generated HTML and the converter's ``images``
    mapping as it is after the conversion.
    '''
    converter = OEB2HTMLInlineCSSizer(log)
    markup = converter.oeb2html(oeb_book, opts)
    return (markup, converter.images)
+
+
def oeb2html_class_css(oeb_book, log, opts):
    '''
    Convert oeb_book with OEB2HTMLClassCSSizer, forcing the CSS to be embedded
    in the document, since only the HTML and the images are returned.

    Note that opts is modified in place.

    Returns a tuple of the generated HTML and the converter's ``images``
    mapping as it is after the conversion.
    '''
    izer = OEB2HTMLClassCSSizer(log)
    # OEB2HTMLClassCSSizer.mlize_spine reads opts.htmlz_class_style, so that is
    # the attribute that has to be set for the inline choice to take effect.
    setattr(opts, 'htmlz_class_style', 'inline')
    # Still set the previously used name for anything that inspects it.
    setattr(opts, 'class_style', 'inline')
    html = izer.oeb2html(oeb_book, opts)
    images = izer.images
    return (html, images)