mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-04-14 01:23:32 +02:00
Here is the first batch of modules, which are needed for converting several formats to LRF. Some of the logic has been changed; more cleanups will follow.
100 lines
4.3 KiB
Python
100 lines
4.3 KiB
Python
#!/usr/bin/env python2
|
|
# vim:fileencoding=utf-8
|
|
from __future__ import absolute_import, division, print_function, unicode_literals
|
|
|
|
__license__ = 'GPL v3'
|
|
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
|
|
|
import re
|
|
|
|
from lxml.etree import Element as LxmlElement
|
|
import html5_parser
|
|
|
|
from ebook_converter import xml_replace_entities
|
|
from ebook_converter.utils.xml_parse import safe_xml_fromstring
|
|
from ebook_converter.ebooks.chardet import xml_to_unicode, strip_encoding_declarations
|
|
from ebook_converter.utils.cleantext import clean_xml_chars
|
|
from ebook_converter.polyglot.builtins import unicode_type
|
|
|
|
XHTML_NS = 'http://www.w3.org/1999/xhtml'
|
|
|
|
|
|
def parse_html5(raw, decoder=None, log=None, discard_namespaces=False, line_numbers=True, linenumber_attribute=None, replace_entities=True, fix_newlines=True):
    """Parse markup with ``html5_parser`` and return the root element.

    :param raw: Markup as text or bytes. Bytes are decoded with ``decoder``
        when one is given, otherwise with ``xml_to_unicode`` (the first
        item of its result is used).
    :param decoder: Optional callable turning bytes into text.
    :param log: Accepted for interface compatibility; not used here.
    :param discard_namespaces: When true the parser is called with
        ``maybe_xhtml=False`` and the root tag must be plain ``html``;
        otherwise the root must be ``html`` in the XHTML namespace and
        carry no prefix.
    :param line_numbers: Accepted for interface compatibility; not used
        here.
    :param linenumber_attribute: Forwarded to the parser as
        ``line_number_attr``.
    :param replace_entities: Run ``xml_replace_entities`` on the text
        before parsing.
    :param fix_newlines: Convert CRLF and lone CR line endings to LF
        before parsing.
    :raises ValueError: If the parsed root does not have the expected tag
        (or has a prefix when namespaces are kept).
    """
    if isinstance(raw, bytes):
        if decoder is None:
            raw = xml_to_unicode(raw)[0]
        else:
            raw = decoder(raw)

    if replace_entities:
        raw = xml_replace_entities(raw)

    if fix_newlines:
        # CRLF must be handled first so that it does not become two LFs.
        raw = raw.replace('\r\n', '\n')
        raw = raw.replace('\r', '\n')

    cleaned = clean_xml_chars(raw)
    root = html5_parser.parse(
        cleaned,
        maybe_xhtml=not discard_namespaces,
        line_number_attr=linenumber_attribute,
        keep_doctype=False,
        sanitize_names=True,
    )

    # Validate the root according to the namespace mode that was requested.
    if discard_namespaces:
        root_is_valid = root.tag == 'html'
    else:
        root_is_valid = root.tag == '{%s}%s' % (XHTML_NS, 'html') and not root.prefix

    if not root_is_valid:
        raise ValueError('Failed to parse correctly, root has tag: %s and prefix: %s' % (root.tag, root.prefix))
    return root
|
|
|
|
|
|
def handle_private_entities(data):
    """Expand entities declared in the document's own DOCTYPE.

    The text preceding the first ``<html`` (or, failing that, ``<HTML``)
    is scanned for ``<!ENTITY name value`` declarations, but only when
    that text contains ``<!DOCTYPE``. If at least one declaration is
    found, the preamble is replaced by as many line feeds as it contained
    (so line numbers of the remaining markup are preserved) and every
    ``&name;`` reference is replaced by the declared value.

    :param data: Markup as text.
    :return: ``data`` unchanged when there is no opening html tag, no
        DOCTYPE in the preamble or no entity declaration; otherwise the
        rewritten text.
    """
    idx = data.find('<html')
    if idx == -1:
        idx = data.find('<HTML')
    if idx == -1:
        return data

    pre = data[:idx]
    if '<!DOCTYPE' not in pre:
        return data

    # Collect user defined entities from the internal DTD subset.
    user_entities = {}
    for match in re.finditer(r'<!ENTITY\s+(\S+)\s+([^>]+)', pre):
        val = match.group(2)
        # XML allows entity values in either double or single quotes.
        for quote in ('"', "'"):
            if val.startswith(quote) and val.endswith(quote):
                val = val[1:-1]
                break
        user_entities[match.group(1)] = val
    if not user_entities:
        return data

    data = ('\n' * pre.count('\n')) + data[idx:]
    # Escape the names: they may contain regex metacharacters such as '.',
    # which would otherwise match undeclared references and make the
    # lookup below fail with KeyError.
    pat = re.compile(r'&(%s);' % '|'.join(map(re.escape, user_entities)))
    return pat.sub(lambda m: user_entities[m.group(1)], data)
|
|
|
|
|
|
def parse(raw, decoder=None, log=None, line_numbers=True, linenumber_attribute=None, replace_entities=True, force_html5_parse=False):
    """Parse markup into an element tree, trying strict XML first.

    The input is prepared as follows: bytes are decoded (with ``decoder``
    when given, otherwise via ``xml_to_unicode``), entities declared in the
    DOCTYPE are expanded by ``handle_private_entities``, other entities are
    optionally replaced by ``xml_replace_entities`` (NUL characters are
    dropped afterwards), line endings are normalised to LF, everything
    before the first ``<html`` found within the first 2048 characters is
    replaced by an equal number of line feeds, and encoding declarations
    are removed by ``strip_encoding_declarations``.

    The prepared text is then parsed with ``safe_xml_fromstring``. If that
    raises, or the root is not ``html`` in the XHTML namespace, or setting
    the line number attributes raises, the text is parsed with
    ``parse_html5`` instead. With ``force_html5_parse`` the XML attempt is
    skipped altogether.

    :param raw: Markup as text or bytes.
    :param decoder: Optional callable turning bytes into text.
    :param log: Optional logger; its ``exception`` method is called when
        the XML parse fails. Also forwarded to ``parse_html5``.
    :param line_numbers: Only forwarded to ``parse_html5``.
    :param linenumber_attribute: When set, each element that has a source
        line gets this attribute set to that line number; also forwarded
        to ``parse_html5``.
    :param replace_entities: Replace entities before parsing.
    :param force_html5_parse: Go straight to ``parse_html5``.
    """
    if isinstance(raw, bytes):
        if decoder is None:
            raw = xml_to_unicode(raw)[0]
        else:
            raw = decoder(raw)

    raw = handle_private_entities(raw)
    if replace_entities:
        raw = xml_replace_entities(raw)
        raw = raw.replace('\0', '')  # Handle &#0;
    raw = raw.replace('\r\n', '\n').replace('\r', '\n')

    # A preamble before the opening html tag (doctypes in particular) can
    # cause problems, so drop it. Line numbers are kept intact by putting
    # one line feed at the start for every line feed that was removed.
    html_start = re.search(r'<\s*html', raw[:2048], flags=re.I)
    if html_start is not None:
        cut = html_start.start()
        raw = ('\n' * raw.count('\n', 0, cut)) + raw[cut:]

    raw = strip_encoding_declarations(raw, limit=10*1024, preserve_newlines=True)

    # The text is already fully prepared, so the html5 path must not
    # replace entities or touch line endings again.
    html5_options = dict(
        log=log,
        line_numbers=line_numbers,
        linenumber_attribute=linenumber_attribute,
        replace_entities=False,
        fix_newlines=False,
    )
    if force_html5_parse:
        return parse_html5(raw, **html5_options)

    try:
        tree = safe_xml_fromstring(raw, recover=False)
        if ans_tag_mismatch(tree):
            raise ValueError('Root tag is not <html> in the XHTML namespace')
        if linenumber_attribute:
            for node in tree.iter(LxmlElement):
                line = node.sourceline
                if line is not None:
                    node.set(linenumber_attribute, unicode_type(line))
        return tree
    except Exception:
        if log is not None:
            log.exception('Failed to parse as XML, parsing as tag soup')
        return parse_html5(raw, **html5_options)


def ans_tag_mismatch(root):
    """Return True when ``root`` is not ``html`` in the XHTML namespace."""
    return root.tag != '{%s}html' % XHTML_NS
|
|
|
|
|
|
if __name__ == '__main__':
    # Manual smoke test: run a small tag-soup snippet through parse_html5
    # with namespaces kept and print the serialised tree.
    from lxml import etree

    sample = '\n<html><head><title>a\n</title><p b=1 c=2 a=0> \n<b>b<svg ass="wipe" viewbox="0">'
    root = parse_html5(sample, discard_namespaces=False)
    serialised = etree.tostring(root, encoding='utf-8')
    print(serialised)
    print()
|