1
0
mirror of https://github.com/gryf/ebook-converter.git synced 2026-04-24 07:01:30 +02:00

Fixing leftovers from first concept of constants

This commit is contained in:
2020-06-07 11:59:00 +02:00
parent 7419954e0c
commit a69884d724
9 changed files with 652 additions and 464 deletions
+70 -43
View File
@@ -1,18 +1,13 @@
import textwrap
# from lxml.etree import Element
from ebook_converter import constants as const
from ebook_converter import force_unicode
from ebook_converter.ebooks.oeb import parse_utils
from ebook_converter.ebooks.oeb.base import serialize, OEB_DOCS, OEB_STYLES
from ebook_converter.ebooks.oeb import base
from ebook_converter.ebooks.oeb.polish.utils import guess_type
from ebook_converter.utils.icu import sort_key
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
def isspace(x):
return not x.strip('\u0009\u000a\u000c\u000d\u0020')
@@ -28,37 +23,40 @@ def pretty_xml_tree(elem, level=0, indent=' '):
for i, child in enumerate(elem):
pretty_xml_tree(child, level=level+1, indent=indent)
if not child.tail or isspace(child.tail):
l = level + 1
new_level = level + 1
if i == len(elem) - 1:
l -= 1
child.tail = '\n' + (indent * l)
new_level -= 1
child.tail = '\n' + (indent * new_level)
def pretty_opf(root):
# Put all dc: tags first starting with title and author. Preserve order for
# the rest.
def dckey(x):
return {'title':0, 'creator':1}.get(parse_utils.barename(x.tag), 2)
for metadata in root.xpath('//opf:metadata', namespaces=const.OPF_NAMESPACES):
return {'title': 0, 'creator': 1}.get(parse_utils.barename(x.tag), 2)
for metadata in root.xpath('//opf:metadata',
namespaces=const.OPF_NAMESPACES):
dc_tags = metadata.xpath('./*[namespace-uri()="%s"]' % const.DC11_NS)
dc_tags.sort(key=dckey)
for x in reversed(dc_tags):
metadata.insert(0, x)
# Group items in the manifest
spine_ids = root.xpath('//opf:spine/opf:itemref/@idref', namespaces=const.OPF_NAMESPACES)
spine_ids = {x:i for i, x in enumerate(spine_ids)}
spine_ids = root.xpath('//opf:spine/opf:itemref/@idref',
namespaces=const.OPF_NAMESPACES)
spine_ids = {x: i for i, x in enumerate(spine_ids)}
def manifest_key(x):
mt = x.get('media-type', '')
href = x.get('href', '')
ext = href.rpartition('.')[-1].lower()
cat = 1000
if mt in OEB_DOCS:
if mt in base.OEB_DOCS:
cat = 0
elif mt == guess_type('a.ncx'):
cat = 1
elif mt in OEB_STYLES:
elif mt in base.OEB_STYLES:
cat = 2
elif mt.startswith('image/'):
cat = 3
@@ -75,20 +73,23 @@ def pretty_opf(root):
i = sort_key(href)
return (cat, i)
for manifest in root.xpath('//opf:manifest', namespaces=const.OPF_NAMESPACES):
for manifest in root.xpath('//opf:manifest',
namespaces=const.OPF_NAMESPACES):
try:
children = sorted(manifest, key=manifest_key)
except AttributeError:
continue # There are comments so dont sort since that would mess up the comments
# There are comments so dont sort since that would mess up the
# comments.
continue
for x in reversed(children):
manifest.insert(0, x)
def isblock(x):
if callable(x.tag) or not x.tag:
return True
if x.tag in const.XHTML_BLOCK_TAGS | {const.SVG_SVG}:
if x.tag in const.XHTML_BLOCK_TAGS | {base.tag('svg', 'svg')}:
return True
return False
@@ -133,28 +134,34 @@ def pretty_block(parent, level=1, indent=' '):
that contain only other block tags '''
if not parent.text or isspace(parent.text):
parent.text = ''
nn = '\n' if hasattr(parent.tag, 'strip') and parse_utils.barename(parent.tag) in {'tr', 'td', 'th'} else '\n\n'
if (hasattr(parent.tag, 'strip') and
parse_utils.barename(parent.tag) in {'tr', 'td', 'th'}):
nn = '\n'
else:
nn = '\n\n'
parent.text = parent.text + nn + (indent * level)
for i, child in enumerate(parent):
if isblock(child) and has_only_blocks(child):
pretty_block(child, level=level+1, indent=indent)
elif child.tag == const.SVG_SVG:
elif child.tag == base.tag('svg', 'svg'):
pretty_xml_tree(child, level=level, indent=indent)
l = level
new_level = level
if i == len(parent) - 1:
l -= 1
new_level -= 1
if not child.tail or isspace(child.tail):
child.tail = ''
child.tail = child.tail + nn + (indent * l)
child.tail = child.tail + nn + (indent * new_level)
def pretty_script_or_style(container, child):
if child.text:
indent = indent_for_tag(child)
if child.tag.endswith('style'):
child.text = force_unicode(pretty_css(container, '', child.text), 'utf-8')
child.text = force_unicode(pretty_css(container, '', child.text),
'utf-8')
child.text = textwrap.dedent(child.text)
child.text = '\n' + '\n'.join([(indent + x) if x else '' for x in child.text.splitlines()])
child.text = '\n' + '\n'.join([(indent + x) if x else ''
for x in child.text.splitlines()])
set_indent(child, 'text', indent)
@@ -169,62 +176,82 @@ def pretty_html_tree(container, root):
# Special case the handling of a body that contains a single block tag
# with all content. In this case we prettify the containing block tag
# even if it has non block children.
if (len(body) == 1 and not callable(body[0].tag) and isblock(body[0]) and not has_only_blocks(
body[0]) and parse_utils.barename(body[0].tag) not in (
'pre', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6') and len(body[0]) > 0):
if (len(body) == 1 and
not callable(body[0].tag) and
isblock(body[0]) and
not has_only_blocks(body[0]) and
parse_utils.barename(body[0].tag) not in ('pre', 'p', 'h1',
'h2', 'h3', 'h4',
'h5', 'h6') and
len(body[0]) > 0):
pretty_block(body[0], level=2)
if container is not None:
# Handle <script> and <style> tags
for child in root.xpath('//*[local-name()="script" or local-name()="style"]'):
for child in root.xpath('//*[local-name()="script" or local-name()='
'"style"]'):
pretty_script_or_style(container, child)
def fix_html(container, raw):
' Fix any parsing errors in the HTML represented as a string in raw. Fixing is done using the HTML5 parsing algorithm. '
"""
Fix any parsing errors in the HTML represented as a string in raw. Fixing
is done using the HTML5 parsing algorithm.
"""
root = container.parse_xhtml(raw)
return serialize(root, 'text/html')
return base.serialize(root, 'text/html')
def pretty_html(container, name, raw):
' Pretty print the HTML represented as a string in raw '
"""
Pretty print the HTML represented as a string in raw
"""
root = container.parse_xhtml(raw)
pretty_html_tree(container, root)
return serialize(root, 'text/html')
return base.serialize(root, 'text/html')
def pretty_css(container, name, raw):
' Pretty print the CSS represented as a string in raw '
"""
Pretty print the CSS represented as a string in raw
"""
sheet = container.parse_css(raw)
return serialize(sheet, 'text/css')
return base.serialize(sheet, 'text/css')
def pretty_xml(container, name, raw):
' Pretty print the XML represented as a string in raw. If ``name`` is the name of the OPF, extra OPF-specific prettying is performed. '
"""
Pretty print the XML represented as a string in raw. If ``name`` is the
name of the OPF, extra OPF-specific prettying is performed.
"""
root = container.parse_xml(raw)
if name == container.opf_name:
pretty_opf(root)
pretty_xml_tree(root)
return serialize(root, 'text/xml')
return base.serialize(root, 'text/xml')
def fix_all_html(container):
' Fix any parsing errors in all HTML files in the container. Fixing is done using the HTML5 parsing algorithm. '
"""
Fix any parsing errors in all HTML files in the container. Fixing is done
using the HTML5 parsing algorithm. """
for name, mt in container.mime_map.items():
if mt in OEB_DOCS:
if mt in base.OEB_DOCS:
container.parsed(name)
container.dirty(name)
def pretty_all(container):
' Pretty print all HTML/CSS/XML files in the container '
"""
Pretty print all HTML/CSS/XML files in the container
"""
xml_types = {guess_type('a.ncx'), guess_type('a.xml'), guess_type('a.svg')}
for name, mt in container.mime_map.items():
prettied = False
if mt in OEB_DOCS:
if mt in base.OEB_DOCS:
pretty_html_tree(container, container.parsed(name))
prettied = True
elif mt in OEB_STYLES:
elif mt in base.OEB_STYLES:
container.parsed(name)
prettied = True
elif name == container.opf_name: