1
0
mirror of https://github.com/gryf/ebook-converter.git synced 2026-04-04 20:03:34 +02:00

Use the real constants module.

This is an ongoing refactor of the calibre code to make it more
readable, and to transform it into something more coherent.

In this patch, there are changes regarding imports for some modules:
instead of polluting the namespace of each module with other modules'
symbols — which were often themselves imported from yet other modules —
the real modules are imported directly. Yuck.
This commit is contained in:
2020-05-29 17:04:53 +02:00
parent ee4801228f
commit ce89f5c9d1
54 changed files with 2383 additions and 2081 deletions

View File

@@ -1,20 +1,16 @@
import re
from lxml import etree, html
from lxml import etree
from lxml import html
from ebook_converter import constants as const
from ebook_converter import xml_replace_entities, force_unicode
from ebook_converter.utils.xml_parse import safe_xml_fromstring
from ebook_converter.constants_old import filesystem_encoding
from ebook_converter.ebooks.chardet import xml_to_unicode, strip_encoding_declarations
__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
RECOVER_PARSER = etree.XMLParser(recover=True, no_network=True, resolve_entities=False)
XHTML_NS = 'http://www.w3.org/1999/xhtml'
XMLNS_NS = 'http://www.w3.org/2000/xmlns/'
RECOVER_PARSER = etree.XMLParser(recover=True, no_network=True,
resolve_entities=False)
class NotHTML(Exception):
@@ -33,15 +29,15 @@ def namespace(name):
def XHTML(name):
return '{%s}%s' % (XHTML_NS, name)
return '{%s}%s' % (const.XHTML_NS, name)
def xpath(elem, expr):
return elem.xpath(expr, namespaces={'h':XHTML_NS})
return elem.xpath(expr, namespaces={'h':const.XHTML_NS})
def XPath(expr):
return etree.XPath(expr, namespaces={'h':XHTML_NS})
return etree.XPath(expr, namespaces={'h':const.XHTML_NS})
META_XP = XPath('/h:html/h:head/h:meta[@http-equiv="Content-Type"]')
@@ -111,7 +107,7 @@ def _html4_parse(data):
elem.text = elem.text.strip('-')
data = etree.tostring(data, encoding='unicode')
data = safe_xml_fromstring(data)
data = etree.fromstring(data)
return data
@@ -204,14 +200,14 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
# Try with more & more drastic measures to parse
try:
data = safe_xml_fromstring(data, recover=False)
data = etree.fromstring(data)
check_for_html5(pre, data)
except (HTML5Doc, etree.XMLSyntaxError):
log.debug('Initial parse failed, using more'
' forgiving parsers')
raw = data = xml_replace_entities(raw)
try:
data = safe_xml_fromstring(data, recover=False)
data = etree.fromstring(data)
check_for_html5(pre, data)
except (HTML5Doc, etree.XMLSyntaxError):
log.debug('Parsing %s as HTML' % filename)
@@ -240,7 +236,7 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
if barename(data.tag) in non_html_file_tags:
raise NotHTML(data.tag)
log.warn('File %r does not appear to be (X)HTML'%filename)
nroot = safe_xml_fromstring('<html></html>')
nroot = etree.fromstring('<html></html>')
has_body = False
for child in list(data):
if isinstance(child.tag, (str, bytes)) and barename(child.tag) == 'body':
@@ -249,7 +245,7 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
parent = nroot
if not has_body:
log.warn('File %r appears to be a HTML fragment'%filename)
nroot = safe_xml_fromstring('<html><body/></html>')
nroot = etree.fromstring('<html><body/></html>')
parent = nroot[0]
for child in list(data.iter()):
oparent = child.getparent()
@@ -261,16 +257,16 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
# Force into the XHTML namespace
if not namespace(data.tag):
log.warn('Forcing', filename, 'into XHTML namespace')
data.attrib['xmlns'] = XHTML_NS
data.attrib['xmlns'] = const.XHTML_NS
data = etree.tostring(data, encoding='unicode')
try:
data = safe_xml_fromstring(data, recover=False)
data = etree.fromstring(data)
except:
data = data.replace(':=', '=').replace(':>', '>')
data = data.replace('<http:/>', '')
try:
data = safe_xml_fromstring(data, recover=False)
data = etree.fromstring(data)
except etree.XMLSyntaxError:
log.warn('Stripping comments from %s'%
filename)
@@ -281,17 +277,17 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
'')
data = data.replace("<?xml version='1.0' encoding='utf-8'??>", '')
try:
data = safe_xml_fromstring(data)
data = etree.fromstring(data)
except etree.XMLSyntaxError:
log.warn('Stripping meta tags from %s'% filename)
data = re.sub(r'<meta\s+[^>]+?>', '', data)
data = safe_xml_fromstring(data)
elif namespace(data.tag) != XHTML_NS:
data = etree.fromstring(data)
elif namespace(data.tag) != const.XHTML_NS:
# OEB_DOC_NS, but possibly others
ns = namespace(data.tag)
attrib = dict(data.attrib)
nroot = etree.Element(XHTML('html'),
nsmap={None: XHTML_NS}, attrib=attrib)
nsmap={None: const.XHTML_NS}, attrib=attrib)
for elem in data.iterdescendants():
if isinstance(elem.tag, (str, bytes)) and \
namespace(elem.tag) == ns:
@@ -301,7 +297,7 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
data = nroot
# Remove non default prefixes referring to the XHTML namespace
data = ensure_namespace_prefixes(data, {None: XHTML_NS})
data = ensure_namespace_prefixes(data, {None: const.XHTML_NS})
data = merge_multiple_html_heads_and_bodies(data, log)
# Ensure has a <head/>