1
0
mirror of https://github.com/gryf/ebook-converter.git synced 2026-04-21 13:41:30 +02:00

Use the real constants module.

This is part of the ongoing refactoring of the calibre code to make it more
readable and to transform it into something more coherent.

This patch changes how some modules do their imports: modules are now
imported as such, instead of polluting each module's namespace with other
modules' symbols, which were often themselves imported from yet other modules. Yuck.
This commit is contained in:
2020-05-29 17:04:53 +02:00
parent ee4801228f
commit ce89f5c9d1
54 changed files with 2383 additions and 2081 deletions
+34 -25
View File
@@ -1,21 +1,18 @@
import re
from lxml.etree import Element as LxmlElement
from lxml import etree
import html5_parser
from ebook_converter import constants as const
from ebook_converter import xml_replace_entities
from ebook_converter.utils.xml_parse import safe_xml_fromstring
from ebook_converter.ebooks.chardet import xml_to_unicode, strip_encoding_declarations
from ebook_converter.ebooks.chardet import strip_encoding_declarations
from ebook_converter.ebooks.chardet import xml_to_unicode
from ebook_converter.utils.cleantext import clean_xml_chars
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
XHTML_NS = 'http://www.w3.org/1999/xhtml'
def parse_html5(raw, decoder=None, log=None, discard_namespaces=False, line_numbers=True, linenumber_attribute=None, replace_entities=True, fix_newlines=True):
def parse_html5(raw, decoder=None, log=None, discard_namespaces=False,
line_numbers=True, linenumber_attribute=None,
replace_entities=True, fix_newlines=True):
if isinstance(raw, bytes):
raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
if replace_entities:
@@ -23,10 +20,14 @@ def parse_html5(raw, decoder=None, log=None, discard_namespaces=False, line_numb
if fix_newlines:
raw = raw.replace('\r\n', '\n').replace('\r', '\n')
raw = clean_xml_chars(raw)
root = html5_parser.parse(raw, maybe_xhtml=not discard_namespaces, line_number_attr=linenumber_attribute, keep_doctype=False, sanitize_names=True)
if (discard_namespaces and root.tag != 'html') or (
not discard_namespaces and (root.tag != '{%s}%s' % (XHTML_NS, 'html') or root.prefix)):
raise ValueError('Failed to parse correctly, root has tag: %s and prefix: %s' % (root.tag, root.prefix))
root = html5_parser.parse(raw, maybe_xhtml=not discard_namespaces,
line_number_attr=linenumber_attribute,
keep_doctype=False, sanitize_names=True)
if ((discard_namespaces and root.tag != 'html') or
(not discard_namespaces and
(root.tag != '{%s}%s' % (const.XHTML_NS, 'html') or root.prefix))):
raise ValueError('Failed to parse correctly, root has tag: %s and '
'prefix: %s' % (root.tag, root.prefix))
return root
@@ -48,12 +49,14 @@ def handle_private_entities(data):
user_entities[match.group(1)] = val
if user_entities:
data = ('\n' * num_of_nl_in_pre) + data[idx:]
pat = re.compile(r'&(%s);'%('|'.join(user_entities.keys())))
data = pat.sub(lambda m:user_entities[m.group(1)], data)
pat = re.compile(r'&(%s);' % ('|'.join(user_entities.keys())))
data = pat.sub(lambda m: user_entities[m.group(1)], data)
return data
def parse(raw, decoder=None, log=None, line_numbers=True, linenumber_attribute=None, replace_entities=True, force_html5_parse=False):
def parse(raw, decoder=None, log=None, line_numbers=True,
linenumber_attribute=None, replace_entities=True,
force_html5_parse=False):
if isinstance(raw, bytes):
raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
raw = handle_private_entities(raw)
@@ -70,26 +73,32 @@ def parse(raw, decoder=None, log=None, line_numbers=True, linenumber_attribute=N
raw = ('\n' * newlines) + raw[match.start():]
break
raw = strip_encoding_declarations(raw, limit=10*1024, preserve_newlines=True)
raw = strip_encoding_declarations(raw, limit=10*1024,
preserve_newlines=True)
if force_html5_parse:
return parse_html5(raw, log=log, line_numbers=line_numbers, linenumber_attribute=linenumber_attribute, replace_entities=False, fix_newlines=False)
return parse_html5(raw, log=log, line_numbers=line_numbers,
linenumber_attribute=linenumber_attribute,
replace_entities=False, fix_newlines=False)
try:
ans = safe_xml_fromstring(raw, recover=False)
if ans.tag != '{%s}html' % XHTML_NS:
ans = etree.fromstring(raw)
if ans.tag != '{%s}html' % const.XHTML_NS:
raise ValueError('Root tag is not <html> in the XHTML namespace')
if linenumber_attribute:
for elem in ans.iter(LxmlElement):
for elem in ans.iter(etree.element):
if elem.sourceline is not None:
elem.set(linenumber_attribute, str(elem.sourceline))
return ans
except Exception:
if log is not None:
log.exception('Failed to parse as XML, parsing as tag soup')
return parse_html5(raw, log=log, line_numbers=line_numbers, linenumber_attribute=linenumber_attribute, replace_entities=False, fix_newlines=False)
return parse_html5(raw, log=log, line_numbers=line_numbers,
linenumber_attribute=linenumber_attribute,
replace_entities=False, fix_newlines=False)
if __name__ == '__main__':
from lxml import etree
root = parse_html5('\n<html><head><title>a\n</title><p b=1 c=2 a=0>&nbsp;\n<b>b<svg ass="wipe" viewbox="0">', discard_namespaces=False)
root = parse_html5('\n<html><head><title>a\n</title><p b=1 c=2 a=0>&nbsp;'
'\n<b>b<svg ass="wipe" viewbox="0">',
discard_namespaces=False)
print(etree.tostring(root, encoding='utf-8'))
print()