import logging
import re

from lxml import etree
from lxml import html

from ebook_converter import constants as const
from ebook_converter import xml_replace_entities, force_unicode
from ebook_converter.constants_old import filesystem_encoding
from ebook_converter.ebooks.chardet import (xml_to_unicode,
                                            strip_encoding_declarations)


RECOVER_PARSER = etree.XMLParser(recover=True, no_network=True,
                                 resolve_entities=False)
# TODO(gryf): replace this with real logging setup.
LOG = logging


class NotHTML(Exception):

    def __init__(self, root_tag):
        Exception.__init__(self, 'Data is not HTML')
        self.root_tag = root_tag


def barename(name):
    return name.rpartition('}')[-1]


def namespace(name):
    return name.rpartition('}')[0][1:]


def XHTML(name):
    return '{%s}%s' % (const.XHTML_NS, name)


def xpath(elem, expr):
    return elem.xpath(expr, namespaces={'h': const.XHTML_NS})


def XPath(expr):
    return etree.XPath(expr, namespaces={'h': const.XHTML_NS})


META_XP = XPath('/h:html/h:head/h:meta[@http-equiv="Content-Type"]')


def merge_multiple_html_heads_and_bodies(root, log=None):
    heads, bodies = xpath(root, '//h:head'), xpath(root, '//h:body')
    if not (len(heads) > 1 or len(bodies) > 1):
        return root
    for child in root:
        root.remove(child)
    head = root.makeelement(XHTML('head'))
    body = root.makeelement(XHTML('body'))
    for h in heads:
        for x in h:
            head.append(x)
    for b in bodies:
        for x in b:
            body.append(x)
    tuple(map(root.append, (head, body)))
    if log is not None:
        log.warn('Merging multiple <head> and <body> sections')
    return root


def clone_element(elem, nsmap={}, in_context=True):
    if in_context:
        maker = elem.getroottree().getroot().makeelement
    else:
        maker = etree.Element
    nelem = maker(elem.tag, attrib=elem.attrib, nsmap=nsmap)
    nelem.text, nelem.tail = elem.text, elem.tail
    nelem.extend(elem)
    return nelem


def node_depth(node):
    ans = 0
    p = node.getparent()
    while p is not None:
        ans += 1
        p = p.getparent()
    return ans


def html5_parse(data, max_nesting_depth=100):
    from html5_parser import parse
    from ebook_converter.utils.cleantext import clean_xml_chars
    data = parse(clean_xml_chars(data), maybe_xhtml=True, keep_doctype=False,
                 sanitize_names=True)
    # Check that the asinine HTML 5 algorithm did not result in a tree with
    # insane nesting depths
    for x in data.iterdescendants():
        if isinstance(x.tag, (str, bytes)) and not len(x):  # Leaf node
            depth = node_depth(x)
            if depth > max_nesting_depth:
                raise ValueError('HTML 5 parsing resulted in a tree with '
                                 'nesting depth > %d' % max_nesting_depth)
    return data


def _html4_parse(data):
    data = html.fromstring(data)
    data.attrib.pop('xmlns', None)
    for elem in data.iter(tag=etree.Comment):
        if elem.text:
            elem.text = elem.text.strip('-')
    data = etree.tostring(data, encoding='unicode')
    data = etree.fromstring(data)
    return data


def clean_word_doc(data, log):
    prefixes = []
    for match in re.finditer(r'xmlns:(\S+?)=".*?microsoft.*?"', data):
        prefixes.append(match.group(1))
    if prefixes:
        log.warn('Found microsoft markup, cleaning...')
        # Remove empty tags as they are not rendered by browsers
        # but can become renderable HTML tags like <p/> if the
        # document is parsed by an HTML parser
        pat = re.compile(
            r'<(%s):([a-zA-Z0-9]+)[^>/]*?></\1:\2>' % ('|'.join(prefixes)),
            re.DOTALL)
        data = pat.sub('', data)
        pat = re.compile(
            r'<(%s):([a-zA-Z0-9]+)[^>/]*?/>' % ('|'.join(prefixes)))
        data = pat.sub('', data)
    return data


def ensure_namespace_prefixes(node, nsmap):
    namespace_uris = frozenset(nsmap.values())
    fnsmap = {k: v for k, v in node.nsmap.items() if v not in namespace_uris}
    fnsmap.update(nsmap)
    if fnsmap != dict(node.nsmap):
        node = clone_element(node, nsmap=fnsmap, in_context=False)
    return node


class HTML5Doc(ValueError):
    pass


def check_for_html5(prefix, root):
    if re.search(r'<!DOCTYPE\s+html\s*>', prefix, re.IGNORECASE) is not None:
        if root.xpath('//svg'):
            raise HTML5Doc('This document appears to be un-namespaced HTML '
                           '5, should be parsed by the HTML 5 parser')


def parse_html(data, log=None, decoder=None, preprocessor=None,
               filename='<string>', non_html_file_tags=frozenset()):
    if log is None:
        log = LOG

    filename = force_unicode(filename, enc=filesystem_encoding)

    if not isinstance(data, str):
        if decoder is not None:
            data = decoder(data)
        else:
            data = xml_to_unicode(data)[0]

    data = strip_encoding_declarations(data)

    # Remove the DOCTYPE declaration as it messes up parsing. In particular,
    # it causes tostring to insert xmlns declarations, which mess up the
    # namespace coercing logic.
    pre = ''
    idx = data.find('<html')
    if idx == -1:
        idx = data.find('<HTML')
    if idx > -1:
        pre = data[:idx]
        data = data[idx:]
        if '<!DOCTYPE' in pre:
            # Process user defined entities declared in the DOCTYPE
            user_entities = {}
            for match in re.finditer(r'<!ENTITY\s+(\S+)\s+([^>]+)', pre):
                val = match.group(2)
                if val.startswith('"') and val.endswith('"'):
                    val = val[1:-1]
                user_entities[match.group(1)] = val
            if user_entities:
                pat = re.compile(r'&(%s);' % ('|'.join(user_entities)))
                data = pat.sub(lambda m: user_entities[m.group(1)], data)

    if preprocessor is not None:
        data = preprocessor(data)

    # There could be null bytes in data if it had &#0; entities in it
    data = data.replace('\0', '')
    data = raw = clean_word_doc(data, log)

    # Try with more and more drastic measures to parse
    try:
        data = etree.fromstring(data)
        check_for_html5(pre, data)
    except (HTML5Doc, etree.XMLSyntaxError):
        log.debug('Initial parse failed, using more forgiving parsers')
        raw = data = xml_replace_entities(raw)
        try:
            data = etree.fromstring(data)
            check_for_html5(pre, data)
        except (HTML5Doc, etree.XMLSyntaxError):
            log.debug('Parsing %s as HTML' % filename)
            try:
                data = html5_parse(raw)
            except Exception:
                log.exception('HTML 5 parsing failed, falling back to older '
                              'parsers')
                data = _html4_parse(raw)

    if barename(data.tag) != 'html':
        if barename(data.tag) in non_html_file_tags:
            raise NotHTML(data.tag)
        log.warn('File %r does not appear to be (X)HTML' % filename)
        nroot = etree.fromstring('<html><body/></html>')
        nroot[0].append(data)
        data = nroot

    # Force into the XHTML namespace
    if not namespace(data.tag):
        log.warn('Forcing %s into XHTML namespace' % filename)
        data.attrib['xmlns'] = const.XHTML_NS
        data = etree.tostring(data, encoding='unicode')
        try:
            data = etree.fromstring(data, parser=RECOVER_PARSER)
        except etree.XMLSyntaxError:
            log.warn('Stripping comments from %s' % filename)
            data = re.sub(r'<!--.*?-->', '', data, flags=re.DOTALL)
            data = etree.fromstring(data, parser=RECOVER_PARSER)

    # Remove non-default prefixes referring to the XHTML namespace
    data = ensure_namespace_prefixes(data, {None: const.XHTML_NS})

    data = merge_multiple_html_heads_and_bodies(data, log)

    # Ensure has a <head/>
    head = xpath(data, '/h:html/h:head')
    head = head[0] if head else None
    if head is None:
        log.warn('File %s missing <head/> element' % filename)
        head = etree.Element(XHTML('head'))
        data.insert(0, head)
        title = etree.SubElement(head, XHTML('title'))
        title.text = 'Unknown'
    elif not xpath(data, '/h:html/h:head/h:title'):
        title = etree.SubElement(head, XHTML('title'))
        title.text = 'Unknown'

    # Ensure has a <body/>
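    # A <body> found deeper in the tree (malformed markup) is hoisted to
    # be a direct child of <html>; otherwise an empty one is created.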
    if not xpath(data, '/h:html/h:body'):
        body = xpath(data, '//h:body')
        if body:
            body = body[0]
            body.getparent().remove(body)
            data.append(body)
        else:
            log.warn('File %s missing <body/> element' % filename)
            etree.SubElement(data, XHTML('body'))
    # Remove microsoft office markup
    r = [x for x in data.iterdescendants(etree.Element)
         if 'microsoft-com' in x.tag]
    for x in r:
        x.tag = XHTML('span')
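    # remove_elem() drops an element from the tree while preserving its
    # tail text, which is appended to the tail of the preceding sibling,
    # or to the parent's text when the element is the first child.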
    def remove_elem(a):
        p = a.getparent()
        idx = p.index(a) - 1
        p.remove(a)
        if a.tail:
            if idx < 0:
                if p.text is None:
                    p.text = ''
                p.text += a.tail
            else:
                if p[idx].tail is None:
                    p[idx].tail = ''
                p[idx].tail += a.tail
    # Remove hyperlinks with no content as they cause rendering
    # artifacts in browser based renderers
    # Also remove empty <i>, <b> and <u> tags
    for a in xpath(data, '//h:a[@href]|//h:i|//h:b|//h:u'):
        if a.get('id', None) is None and a.get('name', None) is None \
                and len(a) == 0 and not a.text:
            remove_elem(a)
    # Convert <br>s with content into paragraphs as ADE can't handle
    # them
    for br in xpath(data, '//h:br'):
        if len(br) > 0 or br.text:
            br.tag = XHTML('div')
    # Remove any stray text in the <html> section and format it nicely
    data.text = '\n  '
    head = xpath(data, '//h:head')
    if head:
        head = head[0]
        head.text = '\n    '
        head.tail = '\n  '
        for child in head:
            child.tail = '\n    '
        child.tail = '\n  '

    return data
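
# A minimal usage sketch (the file name 'page.html' is hypothetical):
#
#     with open('page.html', 'rb') as f:
#         root = parse_html(f.read(), filename='page.html')
#     markup = etree.tostring(root, encoding='unicode')
#
# parse_html() accepts bytes or str; raw bytes are decoded with
# xml_to_unicode() unless an explicit decoder callable is supplied.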