mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-04-21 13:41:30 +02:00
Use the real constants module.
This is part of the ongoing refactor of the calibre code, to make it more readable and transform it into something more coherent. This patch changes the imports in some modules, so that they no longer pollute their namespaces with other modules' symbols, which were often themselves imported from yet other modules. Yuck.
This commit is contained in:
@@ -1,21 +1,18 @@
|
||||
import re
|
||||
|
||||
from lxml.etree import Element as LxmlElement
|
||||
from lxml import etree
|
||||
import html5_parser
|
||||
|
||||
from ebook_converter import constants as const
|
||||
from ebook_converter import xml_replace_entities
|
||||
from ebook_converter.utils.xml_parse import safe_xml_fromstring
|
||||
from ebook_converter.ebooks.chardet import xml_to_unicode, strip_encoding_declarations
|
||||
from ebook_converter.ebooks.chardet import strip_encoding_declarations
|
||||
from ebook_converter.ebooks.chardet import xml_to_unicode
|
||||
from ebook_converter.utils.cleantext import clean_xml_chars
|
||||
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
XHTML_NS = 'http://www.w3.org/1999/xhtml'
|
||||
|
||||
|
||||
def parse_html5(raw, decoder=None, log=None, discard_namespaces=False, line_numbers=True, linenumber_attribute=None, replace_entities=True, fix_newlines=True):
|
||||
def parse_html5(raw, decoder=None, log=None, discard_namespaces=False,
|
||||
line_numbers=True, linenumber_attribute=None,
|
||||
replace_entities=True, fix_newlines=True):
|
||||
if isinstance(raw, bytes):
|
||||
raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
|
||||
if replace_entities:
|
||||
@@ -23,10 +20,14 @@ def parse_html5(raw, decoder=None, log=None, discard_namespaces=False, line_numb
|
||||
if fix_newlines:
|
||||
raw = raw.replace('\r\n', '\n').replace('\r', '\n')
|
||||
raw = clean_xml_chars(raw)
|
||||
root = html5_parser.parse(raw, maybe_xhtml=not discard_namespaces, line_number_attr=linenumber_attribute, keep_doctype=False, sanitize_names=True)
|
||||
if (discard_namespaces and root.tag != 'html') or (
|
||||
not discard_namespaces and (root.tag != '{%s}%s' % (XHTML_NS, 'html') or root.prefix)):
|
||||
raise ValueError('Failed to parse correctly, root has tag: %s and prefix: %s' % (root.tag, root.prefix))
|
||||
root = html5_parser.parse(raw, maybe_xhtml=not discard_namespaces,
|
||||
line_number_attr=linenumber_attribute,
|
||||
keep_doctype=False, sanitize_names=True)
|
||||
if ((discard_namespaces and root.tag != 'html') or
|
||||
(not discard_namespaces and
|
||||
(root.tag != '{%s}%s' % (const.XHTML_NS, 'html') or root.prefix))):
|
||||
raise ValueError('Failed to parse correctly, root has tag: %s and '
|
||||
'prefix: %s' % (root.tag, root.prefix))
|
||||
return root
|
||||
|
||||
|
||||
@@ -48,12 +49,14 @@ def handle_private_entities(data):
|
||||
user_entities[match.group(1)] = val
|
||||
if user_entities:
|
||||
data = ('\n' * num_of_nl_in_pre) + data[idx:]
|
||||
pat = re.compile(r'&(%s);'%('|'.join(user_entities.keys())))
|
||||
data = pat.sub(lambda m:user_entities[m.group(1)], data)
|
||||
pat = re.compile(r'&(%s);' % ('|'.join(user_entities.keys())))
|
||||
data = pat.sub(lambda m: user_entities[m.group(1)], data)
|
||||
return data
|
||||
|
||||
|
||||
def parse(raw, decoder=None, log=None, line_numbers=True, linenumber_attribute=None, replace_entities=True, force_html5_parse=False):
|
||||
def parse(raw, decoder=None, log=None, line_numbers=True,
|
||||
linenumber_attribute=None, replace_entities=True,
|
||||
force_html5_parse=False):
|
||||
if isinstance(raw, bytes):
|
||||
raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
|
||||
raw = handle_private_entities(raw)
|
||||
@@ -70,26 +73,32 @@ def parse(raw, decoder=None, log=None, line_numbers=True, linenumber_attribute=N
|
||||
raw = ('\n' * newlines) + raw[match.start():]
|
||||
break
|
||||
|
||||
raw = strip_encoding_declarations(raw, limit=10*1024, preserve_newlines=True)
|
||||
raw = strip_encoding_declarations(raw, limit=10*1024,
|
||||
preserve_newlines=True)
|
||||
if force_html5_parse:
|
||||
return parse_html5(raw, log=log, line_numbers=line_numbers, linenumber_attribute=linenumber_attribute, replace_entities=False, fix_newlines=False)
|
||||
return parse_html5(raw, log=log, line_numbers=line_numbers,
|
||||
linenumber_attribute=linenumber_attribute,
|
||||
replace_entities=False, fix_newlines=False)
|
||||
try:
|
||||
ans = safe_xml_fromstring(raw, recover=False)
|
||||
if ans.tag != '{%s}html' % XHTML_NS:
|
||||
ans = etree.fromstring(raw)
|
||||
if ans.tag != '{%s}html' % const.XHTML_NS:
|
||||
raise ValueError('Root tag is not <html> in the XHTML namespace')
|
||||
if linenumber_attribute:
|
||||
for elem in ans.iter(LxmlElement):
|
||||
for elem in ans.iter(etree.element):
|
||||
if elem.sourceline is not None:
|
||||
elem.set(linenumber_attribute, str(elem.sourceline))
|
||||
return ans
|
||||
except Exception:
|
||||
if log is not None:
|
||||
log.exception('Failed to parse as XML, parsing as tag soup')
|
||||
return parse_html5(raw, log=log, line_numbers=line_numbers, linenumber_attribute=linenumber_attribute, replace_entities=False, fix_newlines=False)
|
||||
return parse_html5(raw, log=log, line_numbers=line_numbers,
|
||||
linenumber_attribute=linenumber_attribute,
|
||||
replace_entities=False, fix_newlines=False)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
from lxml import etree
|
||||
root = parse_html5('\n<html><head><title>a\n</title><p b=1 c=2 a=0> \n<b>b<svg ass="wipe" viewbox="0">', discard_namespaces=False)
|
||||
root = parse_html5('\n<html><head><title>a\n</title><p b=1 c=2 a=0> '
|
||||
'\n<b>b<svg ass="wipe" viewbox="0">',
|
||||
discard_namespaces=False)
|
||||
print(etree.tostring(root, encoding='utf-8'))
|
||||
print()
|
||||
|
||||
Reference in New Issue
Block a user