mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-03-13 04:55:49 +01:00
This is an ongoing refactor of the calibre code to make it more readable and to transform it into something more coherent. This patch changes how some modules are imported: instead of polluting each module's namespace with symbols from other modules (which were often themselves imported from yet other modules), the modules are imported and referenced explicitly. Yuck.
105 lines
4.3 KiB
Python
105 lines
4.3 KiB
Python
import re
|
|
|
|
from lxml import etree
|
|
import html5_parser
|
|
|
|
from ebook_converter import constants as const
|
|
from ebook_converter import xml_replace_entities
|
|
from ebook_converter.ebooks.chardet import strip_encoding_declarations
|
|
from ebook_converter.ebooks.chardet import xml_to_unicode
|
|
from ebook_converter.utils.cleantext import clean_xml_chars
|
|
|
|
|
|
def parse_html5(raw, decoder=None, log=None, discard_namespaces=False,
                line_numbers=True, linenumber_attribute=None,
                replace_entities=True, fix_newlines=True):
    """Parse markup with html5_parser and return the resulting root element.

    :param raw: Markup as text or bytes. Bytes are decoded with ``decoder``
        when one is given, otherwise with ``xml_to_unicode``.
    :param decoder: Optional callable turning bytes into text.
    :param log: Accepted for signature compatibility; not used here.
    :param discard_namespaces: When true the root is expected to be a plain
        ``html`` tag; otherwise it must be ``html`` in the XHTML namespace
        with no prefix.
    :param line_numbers: Accepted for signature compatibility; not used here.
    :param linenumber_attribute: Forwarded to html5_parser as
        ``line_number_attr``.
    :param replace_entities: Run ``xml_replace_entities`` on the text first.
    :param fix_newlines: Normalise ``\\r\\n`` and ``\\r`` to ``\\n`` first.
    :raises ValueError: If the parsed root is not the expected ``html`` tag.
    """
    if isinstance(raw, bytes):
        if decoder is None:
            raw = xml_to_unicode(raw)[0]
        else:
            raw = decoder(raw)
    if replace_entities:
        raw = xml_replace_entities(raw)
    if fix_newlines:
        raw = raw.replace('\r\n', '\n').replace('\r', '\n')
    raw = clean_xml_chars(raw)

    keep_namespaces = not discard_namespaces
    root = html5_parser.parse(raw, maybe_xhtml=keep_namespaces,
                              line_number_attr=linenumber_attribute,
                              keep_doctype=False, sanitize_names=True)

    # Validate the root: a bare "html" when namespaces were discarded,
    # otherwise an unprefixed "html" in the XHTML namespace.
    if discard_namespaces:
        root_is_html = root.tag == 'html'
    else:
        root_is_html = (root.tag == '{%s}%s' % (const.XHTML_NS, 'html') and
                        not root.prefix)
    if not root_is_html:
        raise ValueError('Failed to parse correctly, root has tag: %s and '
                         'prefix: %s' % (root.tag, root.prefix))
    return root
|
|
|
|
|
|
def handle_private_entities(data):
    """Expand entities declared in the document's own DOCTYPE.

    Everything before the first ``<html`` (or ``<HTML``) tag is scanned for
    ``<!ENTITY name value>`` declarations. If any are found, that preamble
    is replaced by the same number of newlines it contained (so line numbers
    of the remaining markup are preserved) and every ``&name;`` reference in
    the rest of the text is replaced with the declared value.

    :param data: The document as text.
    :return: The text with private entities expanded, or ``data`` unchanged
        when there is no html tag, no ``<!DOCTYPE`` before it, or no entity
        declarations.
    """
    idx = data.find('<html')
    if idx == -1:
        idx = data.find('<HTML')
    if idx == -1:
        return data

    pre = data[:idx]
    if '<!DOCTYPE' not in pre:
        return data

    # Collect user defined entities from the preamble.
    user_entities = {}
    for match in re.finditer(r'<!ENTITY\s+(\S+)\s+([^>]+)', pre):
        val = match.group(2)
        # XML permits entity values delimited by either quote character.
        if val[:1] in ('"', "'") and val.endswith(val[0]):
            val = val[1:-1]
        user_entities[match.group(1)] = val
    if not user_entities:
        return data

    data = ('\n' * pre.count('\n')) + data[idx:]
    # Entity names may legally contain regex metacharacters such as "."; they
    # must be escaped, otherwise the pattern can match a reference that is
    # not in user_entities and the lookup below raises KeyError.
    pat = re.compile('&(%s);' % '|'.join(map(re.escape, user_entities)))
    return pat.sub(lambda m: user_entities[m.group(1)], data)
|
|
|
|
|
|
def parse(raw, decoder=None, log=None, line_numbers=True,
          linenumber_attribute=None, replace_entities=True,
          force_html5_parse=False):
    """Parse markup as XHTML, falling back to the HTML5 parser on failure.

    The text is first normalised: private entities are expanded, entities
    are optionally replaced, newlines are unified, any preamble before the
    opening html tag is blanked out and encoding declarations are stripped.
    It is then parsed with ``etree.fromstring``; if that raises, or the root
    is not ``html`` in the XHTML namespace, ``parse_html5`` is used instead.

    :param raw: Markup as text or bytes. Bytes are decoded with ``decoder``
        when one is given, otherwise with ``xml_to_unicode``.
    :param decoder: Optional callable turning bytes into text.
    :param log: Optional logger; ``log.exception`` is called when the XML
        parse fails.
    :param line_numbers: Forwarded to ``parse_html5``.
    :param linenumber_attribute: If set, name of the attribute that receives
        each element's source line number.
    :param replace_entities: Run ``xml_replace_entities`` on the text.
    :param force_html5_parse: Skip the XML attempt and use ``parse_html5``.
    :return: The root element of the parsed document.
    """
    if isinstance(raw, bytes):
        raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
    raw = handle_private_entities(raw)
    if replace_entities:
        raw = xml_replace_entities(raw).replace('\0', '')  # Handle &#0;
    raw = raw.replace('\r\n', '\n').replace('\r', '\n')

    # Remove any preamble before the opening html tag as it can cause
    # problems, especially doctypes, preserve the original linenumbers by
    # inserting newlines at the start
    match = re.search(r'<\s*html', raw[:2048], flags=re.I)
    if match is not None:
        newlines = raw.count('\n', 0, match.start())
        raw = ('\n' * newlines) + raw[match.start():]

    raw = strip_encoding_declarations(raw, limit=10*1024,
                                      preserve_newlines=True)
    if force_html5_parse:
        return parse_html5(raw, log=log, line_numbers=line_numbers,
                           linenumber_attribute=linenumber_attribute,
                           replace_entities=False, fix_newlines=False)
    try:
        ans = etree.fromstring(raw)
        if ans.tag != '{%s}html' % const.XHTML_NS:
            raise ValueError('Root tag is not <html> in the XHTML namespace')
        if linenumber_attribute:
            # Passing the Element factory restricts iteration to real
            # elements (no comments or processing instructions). The factory
            # is etree.Element; a lowercase "etree.element" does not exist
            # and the resulting AttributeError would be swallowed by the
            # handler below, silently discarding a successful XML parse.
            for elem in ans.iter(etree.Element):
                if elem.sourceline is not None:
                    elem.set(linenumber_attribute, str(elem.sourceline))
        return ans
    except Exception:
        # Deliberately broad: any failure of the strict XML parse means the
        # input is treated as tag soup.
        if log is not None:
            log.exception('Failed to parse as XML, parsing as tag soup')
        return parse_html5(raw, log=log, line_numbers=line_numbers,
                           linenumber_attribute=linenumber_attribute,
                           replace_entities=False, fix_newlines=False)
|
|
|
|
|
|
if __name__ == '__main__':
    # Manual smoke test: parse a small, deliberately sloppy fragment while
    # keeping namespaces, then dump the serialised tree.
    sample = ('\n<html><head><title>a\n</title><p b=1 c=2 a=0> '
              '\n<b>b<svg ass="wipe" viewbox="0">')
    root = parse_html5(sample, discard_namespaces=False)
    serialized = etree.tostring(root, encoding='utf-8')
    print(serialized)
    print()
|