Here is the first batch of modules, which are needed for converting several formats to LRF. Some of the logic has been changed; more cleanups will follow.
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
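"""Utilities for parsing arbitrary (X)HTML into well-formed XHTML lxml
trees for the rest of the conversion pipeline."""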
from __future__ import absolute_import, division, print_function, unicode_literals

__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

import re

from lxml import etree, html

from ebook_converter import xml_replace_entities, force_unicode
from ebook_converter.utils.xml_parse import safe_xml_fromstring
from ebook_converter.constants import filesystem_encoding
from ebook_converter.ebooks.chardet import xml_to_unicode, strip_encoding_declarations
from ebook_converter.polyglot.builtins import iteritems, itervalues, unicode_type, string_or_bytes, map

RECOVER_PARSER = etree.XMLParser(recover=True, no_network=True, resolve_entities=False)
XHTML_NS = 'http://www.w3.org/1999/xhtml'
XMLNS_NS = 'http://www.w3.org/2000/xmlns/'


class NotHTML(Exception):

    def __init__(self, root_tag):
        Exception.__init__(self, 'Data is not HTML')
        self.root_tag = root_tag


def barename(name):
    return name.rpartition('}')[-1]


def namespace(name):
    return name.rpartition('}')[0][1:]
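# Example: for a Clark-notation name such as
# '{http://www.w3.org/1999/xhtml}body', barename() returns 'body' and
# namespace() returns 'http://www.w3.org/1999/xhtml'; for an
# un-namespaced name they return the name itself and '' respectively.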


def XHTML(name):
    return '{%s}%s' % (XHTML_NS, name)


def xpath(elem, expr):
    return elem.xpath(expr, namespaces={'h': XHTML_NS})


def XPath(expr):
    return etree.XPath(expr, namespaces={'h': XHTML_NS})


META_XP = XPath('/h:html/h:head/h:meta[@http-equiv="Content-Type"]')


def merge_multiple_html_heads_and_bodies(root, log=None):
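    """Collapse multiple <head> and <body> sections into a single
    <head> followed by a single <body>; a no-op for documents that
    already have at most one of each."""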
    heads, bodies = xpath(root, '//h:head'), xpath(root, '//h:body')
    if not (len(heads) > 1 or len(bodies) > 1):
        return root
    for child in root:
        root.remove(child)
    head = root.makeelement(XHTML('head'))
    body = root.makeelement(XHTML('body'))
    for h in heads:
        for x in h:
            head.append(x)
    for b in bodies:
        for x in b:
            body.append(x)
    tuple(map(root.append, (head, body)))
    if log is not None:
        log.warn('Merging multiple <head> and <body> sections')
    return root


def clone_element(elem, nsmap={}, in_context=True):
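    """Recreate elem with the given nsmap: the new element copies the
    tag, attributes, text and tail, and elem's children are moved into
    it. With in_context=True the element is created via the tree root's
    makeelement(), so it stays in the same parser context."""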
    if in_context:
        maker = elem.getroottree().getroot().makeelement
    else:
        maker = etree.Element
    nelem = maker(elem.tag, attrib=elem.attrib, nsmap=nsmap)
    nelem.text, nelem.tail = elem.text, elem.tail
    nelem.extend(elem)
    return nelem


def node_depth(node):
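    """Return the number of ancestors of node in its tree."""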
    ans = 0
    p = node.getparent()
    while p is not None:
        ans += 1
        p = p.getparent()
    return ans


def html5_parse(data, max_nesting_depth=100):
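    """Parse data with the html5-parser library, raising ValueError if
    the resulting tree is nested more than max_nesting_depth deep."""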
    from html5_parser import parse
    from ebook_converter.utils.cleantext import clean_xml_chars
    data = parse(clean_xml_chars(data), maybe_xhtml=True, keep_doctype=False, sanitize_names=True)
    # Check that the asinine HTML 5 algorithm did not result in a tree with
    # insane nesting depths
    for x in data.iterdescendants():
        if isinstance(x.tag, string_or_bytes) and not len(x):  # Leaf node
            depth = node_depth(x)
            if depth > max_nesting_depth:
                raise ValueError('HTML 5 parsing resulted in a tree with nesting'
                                 ' depth > %d' % max_nesting_depth)
    return data


def _html4_parse(data):
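    """Parse data with lxml's forgiving HTML parser, then serialise and
    re-parse it as XML so downstream code gets a well-formed tree."""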
    data = html.fromstring(data)
    data.attrib.pop('xmlns', None)
    # XML comments may not end with a dash, so strip stray leading and
    # trailing dashes that the HTML parser let through
    for elem in data.iter(tag=etree.Comment):
        if elem.text:
            elem.text = elem.text.strip('-')
    data = etree.tostring(data, encoding='unicode')

    data = safe_xml_fromstring(data)
    return data


def clean_word_doc(data, log):
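    """Strip empty tags in Microsoft Office namespaces from the raw
    markup in data, logging a warning when any are found."""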
    prefixes = []
    for match in re.finditer(r'xmlns:(\S+?)=".*?microsoft.*?"', data):
        prefixes.append(match.group(1))
    if prefixes:
        log.warn('Found microsoft markup, cleaning...')
        # Remove empty tags as they are not rendered by browsers
        # but can become renderable HTML tags like <p/> if the
        # document is parsed by an HTML parser
        pat = re.compile(
            r'<(%s):([a-zA-Z0-9]+)[^>/]*?></\1:\2>' % ('|'.join(prefixes)),
            re.DOTALL)
        data = pat.sub('', data)
        pat = re.compile(
            r'<(%s):([a-zA-Z0-9]+)[^>/]*?/>' % ('|'.join(prefixes)))
        data = pat.sub('', data)
    return data


def ensure_namespace_prefixes(node, nsmap):
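    """Return node, cloned if necessary, such that its nsmap contains
    every prefix -> URI mapping from nsmap, dropping any existing
    prefixes that point at those URIs."""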
    namespace_uris = frozenset(itervalues(nsmap))
    fnsmap = {k: v for k, v in iteritems(node.nsmap) if v not in namespace_uris}
    fnsmap.update(nsmap)
    if fnsmap != dict(node.nsmap):
        node = clone_element(node, nsmap=fnsmap, in_context=False)
    return node


class HTML5Doc(ValueError):
    pass


def check_for_html5(prefix, root):
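    """Raise HTML5Doc if prefix carries an HTML 5 doctype and the tree
    contains un-namespaced <svg> tags, signalling that the document
    should be handed to the HTML 5 parser instead."""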
    if re.search(r'<!DOCTYPE\s+html\s*>', prefix, re.IGNORECASE) is not None:
        if root.xpath('//svg'):
            raise HTML5Doc('This document appears to be un-namespaced HTML 5, should be parsed by the HTML 5 parser')


def parse_html(data, log=None, decoder=None, preprocessor=None,
               filename='<string>', non_html_file_tags=frozenset()):
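    """Parse data (bytes or unicode) into an lxml tree in the XHTML
    namespace, trying progressively more forgiving parsers, then
    normalise the result: guaranteed <head>, <title> and <body>
    elements, no encoding-declaring <meta> tags and no empty inline
    elements."""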
    if log is None:
        from ebook_converter.utils.logging import default_log
        log = default_log

    filename = force_unicode(filename, enc=filesystem_encoding)

    if not isinstance(data, unicode_type):
        if decoder is not None:
            data = decoder(data)
        else:
            data = xml_to_unicode(data)[0]

    data = strip_encoding_declarations(data)
    # Remove DOCTYPE declaration as it messes up parsing
    # In particular, it causes tostring to insert xmlns
    # declarations, which messes up the coercing logic
    pre = ''
    idx = data.find('<html')
    if idx == -1:
        idx = data.find('<HTML')
    has_html4_doctype = False
    if idx > -1:
        pre = data[:idx]
        data = data[idx:]
        if '<!DOCTYPE' in pre:  # Handle user defined entities
            # kindlegen produces invalid xhtml with uppercase attribute names
            # if fed HTML 4 with uppercase attribute names, so try to detect
            # and compensate for that.
            has_html4_doctype = re.search(r'<!DOCTYPE\s+[^>]+HTML\s+4.0[^.]+>', pre) is not None
            # Process private entities
            user_entities = {}
            for match in re.finditer(r'<!ENTITY\s+(\S+)\s+([^>]+)', pre):
                val = match.group(2)
                if val.startswith('"') and val.endswith('"'):
                    val = val[1:-1]
                user_entities[match.group(1)] = val
            if user_entities:
                pat = re.compile(r'&(%s);' % ('|'.join(list(user_entities.keys()))))
                data = pat.sub(lambda m: user_entities[m.group(1)], data)

    if preprocessor is not None:
        data = preprocessor(data)

    # There could be null bytes in data if it had &#0; entities in it
    data = data.replace('\0', '')
    data = raw = clean_word_doc(data, log)

    # Try with more and more drastic measures to parse
    try:
        data = safe_xml_fromstring(data, recover=False)
        check_for_html5(pre, data)
    except (HTML5Doc, etree.XMLSyntaxError):
        log.debug('Initial parse failed, using more forgiving parsers')
        raw = data = xml_replace_entities(raw)
        try:
            data = safe_xml_fromstring(data, recover=False)
            check_for_html5(pre, data)
        except (HTML5Doc, etree.XMLSyntaxError):
            log.debug('Parsing %s as HTML' % filename)
            data = raw
            try:
                data = html5_parse(data)
            except Exception:
                log.exception(
                    'HTML 5 parsing failed, falling back to older parsers')
                data = _html4_parse(data)

    if has_html4_doctype or data.tag == 'HTML' or (len(data) and (data[-1].get('LANG') or data[-1].get('DIR'))):
        # Lower case all tag and attribute names
        data.tag = data.tag.lower()
        for x in data.iterdescendants():
            try:
                x.tag = x.tag.lower()
                for key, val in list(iteritems(x.attrib)):
                    del x.attrib[key]
                    key = key.lower()
                    x.attrib[key] = val
            except Exception:
                pass

    if barename(data.tag) != 'html':
        if barename(data.tag) in non_html_file_tags:
            raise NotHTML(data.tag)
        log.warn('File %r does not appear to be (X)HTML' % filename)
        nroot = safe_xml_fromstring('<html></html>')
        has_body = False
        for child in list(data):
            if isinstance(child.tag, (unicode_type, bytes)) and barename(child.tag) == 'body':
                has_body = True
                break
        parent = nroot
        if not has_body:
            log.warn('File %r appears to be an HTML fragment' % filename)
            nroot = safe_xml_fromstring('<html><body/></html>')
            parent = nroot[0]
        for child in list(data.iter()):
            oparent = child.getparent()
            if oparent is not None:
                oparent.remove(child)
            parent.append(child)
        data = nroot

    # Force into the XHTML namespace
    if not namespace(data.tag):
        log.warn('Forcing', filename, 'into XHTML namespace')
        data.attrib['xmlns'] = XHTML_NS
        data = etree.tostring(data, encoding='unicode')

        try:
            data = safe_xml_fromstring(data, recover=False)
        except Exception:
            data = data.replace(':=', '=').replace(':>', '>')
            data = data.replace('<http:/>', '')
            try:
                data = safe_xml_fromstring(data, recover=False)
            except etree.XMLSyntaxError:
                log.warn('Stripping comments from %s' % filename)
                data = re.compile(r'<!--.*?-->', re.DOTALL).sub('', data)
                data = data.replace(
                    "<?xml version='1.0' encoding='utf-8'?><o:p></o:p>", '')
                data = data.replace("<?xml version='1.0' encoding='utf-8'??>", '')
                try:
                    data = safe_xml_fromstring(data)
                except etree.XMLSyntaxError:
                    log.warn('Stripping meta tags from %s' % filename)
                    data = re.sub(r'<meta\s+[^>]+?>', '', data)
                    data = safe_xml_fromstring(data)
    elif namespace(data.tag) != XHTML_NS:
        # OEB_DOC_NS, but possibly others
        ns = namespace(data.tag)
        attrib = dict(data.attrib)
        nroot = etree.Element(XHTML('html'),
                              nsmap={None: XHTML_NS}, attrib=attrib)
        for elem in data.iterdescendants():
            if isinstance(elem.tag, string_or_bytes) and \
                    namespace(elem.tag) == ns:
                elem.tag = XHTML(barename(elem.tag))
        for elem in data:
            nroot.append(elem)
        data = nroot

    # Remove non-default prefixes referring to the XHTML namespace
    data = ensure_namespace_prefixes(data, {None: XHTML_NS})

    data = merge_multiple_html_heads_and_bodies(data, log)
    # Ensure the document has a <head/>
    head = xpath(data, '/h:html/h:head')
    head = head[0] if head else None
    if head is None:
        log.warn('File %s missing <head/> element' % filename)
        head = etree.Element(XHTML('head'))
        data.insert(0, head)
        title = etree.SubElement(head, XHTML('title'))
        title.text = _('Unknown')
    elif not xpath(data, '/h:html/h:head/h:title'):
        title = etree.SubElement(head, XHTML('title'))
        title.text = _('Unknown')
    # Ensure <title> is not empty
    title = xpath(data, '/h:html/h:head/h:title')[0]
    if not title.text or not title.text.strip():
        title.text = _('Unknown')
    # Remove any encoding-specifying <meta/> elements
    for meta in META_XP(data):
        meta.getparent().remove(meta)
    meta = etree.SubElement(head, XHTML('meta'),
                            attrib={'http-equiv': 'Content-Type'})
    meta.set('content', 'text/html; charset=utf-8')  # Ensure content is second attribute

    # Ensure the document has a <body/>
    if not xpath(data, '/h:html/h:body'):
        body = xpath(data, '//h:body')
        if body:
            body = body[0]
            body.getparent().remove(body)
            data.append(body)
        else:
            log.warn('File %s missing <body/> element' % filename)
            etree.SubElement(data, XHTML('body'))

    # Remove Microsoft Office markup
    r = [x for x in data.iterdescendants(etree.Element) if 'microsoft-com' in x.tag]
    for x in r:
        x.tag = XHTML('span')

    def remove_elem(a):
        p = a.getparent()
        idx = p.index(a) - 1
        p.remove(a)
        if a.tail:
            if idx < 0:
                if p.text is None:
                    p.text = ''
                p.text += a.tail
            else:
                if p[idx].tail is None:
                    p[idx].tail = ''
                p[idx].tail += a.tail

    # Remove hyperlinks with no content, as they cause rendering
    # artifacts in browser-based renderers; also remove empty <b>, <u>
    # and <i> tags
    for a in xpath(data, '//h:a[@href]|//h:i|//h:b|//h:u'):
        if a.get('id', None) is None and a.get('name', None) is None \
                and len(a) == 0 and not a.text:
            remove_elem(a)

    # Convert <br>s with content into paragraphs as ADE can't handle them
    for br in xpath(data, '//h:br'):
        if len(br) > 0 or br.text:
            br.tag = XHTML('div')

    # Remove any stray text in the <head> section and format it nicely
    data.text = '\n  '
    head = xpath(data, '//h:head')
    if head:
        head = head[0]
        head.text = '\n    '
        head.tail = '\n  '
        for child in head:
            child.tail = '\n    '
        child.tail = '\n  '

    return data
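
# A minimal usage sketch (an illustration only; it assumes this file is
# importable as ebook_converter.ebooks.oeb.parse_utils and that the
# package has installed the _() translation function used above):
#
#     from ebook_converter.ebooks.oeb.parse_utils import parse_html
#     root = parse_html(b'<p>Hello, world</p>', filename='example.html')
#     markup = etree.tostring(root, encoding='unicode')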