1
0
mirror of https://github.com/gryf/ebook-converter.git synced 2026-04-24 15:11:30 +02:00

Fixing leftovers from first concept of constants

This commit is contained in:
2020-06-07 11:59:00 +02:00
parent 7419954e0c
commit a69884d724
9 changed files with 652 additions and 464 deletions
+33 -36
View File
@@ -1,5 +1,8 @@
import collections
import errno
import hashlib
import io
import itertools
import logging
import os
import re
@@ -7,13 +10,10 @@ import shutil
import sys
import time
import unicodedata
import uuid
from collections import defaultdict
from io import BytesIO
from itertools import count
import urllib.parse
import uuid
from css_parser import getUrls, replaceUrls
import css_parser
from lxml import etree
from ebook_converter import constants as const
@@ -35,10 +35,7 @@ from ebook_converter.ebooks.metadata.utils import parse_opf_version
from ebook_converter.ebooks.mobi import MobiError
from ebook_converter.ebooks.mobi.reader.headers import MetadataHeader
from ebook_converter.ebooks.mobi.tweak import set_cover
from ebook_converter.ebooks.oeb.base import (
OEB_DOCS, OEB_STYLES, Manifest, itercsslinks, iterlinks,
rewrite_links, serialize, urlquote, urlunquote
)
from ebook_converter.ebooks.oeb import base as oeb_base
from ebook_converter.ebooks.oeb.parse_utils import NotHTML, parse_html
from ebook_converter.ebooks.oeb.polish.errors import DRMError, InvalidBook
from ebook_converter.ebooks.oeb.polish.parsing import parse as parse_html_tweak
@@ -96,7 +93,7 @@ def abspath_to_name(path, root):
return relpath(os.path.abspath(path), root).replace(os.sep, '/')
def name_to_href(name, root, base=None, quote=urlquote):
def name_to_href(name, root, base=None, quote=oeb_base.urlquote):
fullpath = name_to_abspath(name, root)
basepath = root if base is None else os.path.dirname(name_to_abspath(base, root))
path = relpath(fullpath, basepath).replace(os.sep, '/')
@@ -111,7 +108,7 @@ def href_to_name(href, root, base=None):
return None
if purl.scheme or not purl.path:
return None
href = urlunquote(purl.path)
href = oeb_base.urlunquote(purl.path)
if iswindows and ':' in href:
# path manipulations on windows fail for paths with : in them, so we
# assume all such paths are invalid/absolute paths.
@@ -324,7 +321,7 @@ class Container(ContainerBase): # {{{
item_id = 'id' + '%d'%c
manifest = self.opf_xpath('//opf:manifest')[0]
href = self.name_to_href(name, self.opf_name)
item = manifest.makeelement(const.OPF_ITEM,
item = manifest.makeelement(oeb_base.tag('opf', 'item'),
id=item_id, href=href)
item.set('media-type', self.mime_map[name])
self.insert_into_xml(manifest, item)
@@ -340,7 +337,7 @@ class Container(ContainerBase): # {{{
def make_name_unique(self, name):
''' Ensure that `name` does not already exist in this book. If it does, return a modified version that does not exist. '''
counter = count()
counter = itertools.count()
while self.has_name_case_insensitive(name) or self.manifest_has_name(name):
c = next(counter) + 1
base, ext = name.rpartition('.')[::2]
@@ -377,10 +374,10 @@ class Container(ContainerBase): # {{{
if self.ok_to_be_unmanifested(name):
return name
item_id = self.add_name_to_manifest(name, process_manifest_item=process_manifest_item)
if mt in OEB_DOCS:
if mt in oeb_base.OEB_DOCS:
manifest = self.opf_xpath('//opf:manifest')[0]
spine = self.opf_xpath('//opf:spine')[0]
si = manifest.makeelement(const.OPF_ITEMREF, idref=item_id)
si = manifest.makeelement(oeb_base.tag('opf', 'itemref'), idref=item_id)
self.insert_into_xml(spine, si, index=spine_index)
return name
@@ -442,12 +439,12 @@ class Container(ContainerBase): # {{{
replace_func.file_type = 'opf'
for elem in self.opf_xpath('//*[@href]'):
elem.set('href', replace_func(elem.get('href')))
elif media_type.lower() in OEB_DOCS:
elif media_type.lower() in oeb_base.OEB_DOCS:
replace_func.file_type = 'text'
rewrite_links(self.parsed(name), replace_func)
elif media_type.lower() in OEB_STYLES:
oeb_base.rewrite_links(self.parsed(name), replace_func)
elif media_type.lower() in oeb_base.OEB_STYLES:
replace_func.file_type = 'style'
replaceUrls(self.parsed(name), replace_func)
css_parser.replaceUrls(self.parsed(name), replace_func)
elif media_type.lower() == guess_type('toc.ncx'):
replace_func.file_type = 'ncx'
for elem in self.parsed(name).xpath('//*[@src]'):
@@ -467,21 +464,21 @@ class Container(ContainerBase): # {{{
if name == self.opf_name:
for elem in self.opf_xpath('//*[@href]'):
yield (elem.get('href'), elem.sourceline, 0) if get_line_numbers else elem.get('href')
elif media_type.lower() in OEB_DOCS:
for el, attr, link, pos in iterlinks(self.parsed(name)):
elif media_type.lower() in oeb_base.OEB_DOCS:
for el, attr, link, pos in oeb_base.iterlinks(self.parsed(name)):
yield (link, el.sourceline, pos) if get_line_numbers else link
elif media_type.lower() in OEB_STYLES:
elif media_type.lower() in oeb_base.OEB_STYLES:
if get_line_numbers:
with self.open(name, 'rb') as f:
raw = self.decode(f.read()).replace('\r\n', '\n').replace('\r', '\n')
position = PositionFinder(raw)
is_in_comment = CommentFinder(raw)
for link, offset in itercsslinks(raw):
for link, offset in oeb_base.itercsslinks(raw):
if not is_in_comment(offset):
lnum, col = position(offset)
yield link, lnum, col
else:
for link in getUrls(self.parsed(name)):
for link in css_parser.getUrls(self.parsed(name)):
yield link
elif media_type.lower() == guess_type('toc.ncx'):
for elem in self.parsed(name).xpath('//*[@src]'):
@@ -533,7 +530,7 @@ class Container(ContainerBase): # {{{
def opf_xpath(self, expr):
    """
    Convenience method to evaluate an XPath expression on the OPF file, has
    the opf: and dc: namespace prefixes pre-defined.

    :param expr: The XPath expression to evaluate against ``self.opf``.
    :return: Whatever ``self.opf.xpath`` returns for ``expr``.
    """
    # ``namespaces`` has to be the prefix mapping. ``oeb_base.tag()`` is used
    # elsewhere in this file to build element tag names for ``makeelement``,
    # so it is not a substitute for the OPF_NAMESPACES constant here.
    return self.opf.xpath(expr, namespaces=const.OPF_NAMESPACES)
def has_name(self, name):
''' Return True iff a file with the same canonical name as that specified exists. Unlike :meth:`exists` this method is always case-sensitive. '''
@@ -580,11 +577,11 @@ class Container(ContainerBase): # {{{
def parse(self, path, mime):
    """
    Read the file at ``path`` and parse it according to ``mime``.

    :param path: Filesystem path of the file to read (opened in binary
                 mode).
    :param mime: Media type used to pick the parser.
    :return: The result of ``parse_xhtml`` for OEB document types, of
             ``parse_xml`` for types ending in ``+xml`` or ``/xml``, of
             ``parse_css`` for OEB style types, and the raw bytes otherwise.
    """
    with open(path, 'rb') as stream:
        raw = stream.read()

    # The checks are ordered: document types win over the generic XML
    # suffix test, which in turn wins over stylesheet types.
    if mime in oeb_base.OEB_DOCS:
        return self.parse_xhtml(raw, self.relpath(path))
    if mime[-4:] in {'+xml', '/xml'}:
        return self.parse_xml(raw)
    if mime in oeb_base.OEB_STYLES:
        return self.parse_css(raw, self.relpath(path))
    return raw
@@ -597,7 +594,7 @@ class Container(ContainerBase): # {{{
'''
ans = self.open(name).read()
mime = self.mime_map.get(name, guess_type(name))
if decode and (mime in OEB_STYLES or mime in OEB_DOCS or mime == 'text/plain' or mime[-4:] in {'+xml', '/xml'}):
if decode and (mime in oeb_base.OEB_STYLES or mime in oeb_base.OEB_DOCS or mime == 'text/plain' or mime[-4:] in {'+xml', '/xml'}):
ans = self.decode(ans, normalize_to_nfc=normalize_to_nfc)
return ans
@@ -637,7 +634,7 @@ class Container(ContainerBase): # {{{
so use it sparingly. '''
from ebook_converter.ebooks.metadata.opf2 import OPF as O
mi = self.serialize_item(self.opf_name)
return O(BytesIO(mi), basedir=self.opf_dir, unquote_urls=False,
return O(io.BytesIO(mi), basedir=self.opf_dir, unquote_urls=False,
populate_spine=False).to_book_metadata()
@property
@@ -662,7 +659,7 @@ class Container(ContainerBase): # {{{
@property
def manifest_type_map(self):
' Mapping of manifest media-type to list of canonical names of that media-type '
ans = defaultdict(list)
ans = collections.defaultdict(list)
for item in self.opf_xpath('//opf:manifest/opf:item[@href and @media-type]'):
ans[item.get('media-type').lower()].append(self.href_to_name(
item.get('href'), self.opf_name))
@@ -813,7 +810,7 @@ class Container(ContainerBase): # {{{
spine = self.opf_xpath('//opf:spine')[0]
spine.text = tail
for name, linear in spine_items:
i = spine.makeelement(const.OPF_ITEMREF,
i = spine.makeelement(oeb_base.tag('opf', 'itemref'),
nsmap={'opf': const.OPF2_NS})
i.tail = tail
i.set('idref', imap[name])
@@ -922,7 +919,7 @@ class Container(ContainerBase): # {{{
return ans[0]
self.dirty(self.opf_name)
package = self.opf_xpath('//opf:package')[0]
item = package.makeelement(OPF(name))
item = package.makeelement(oeb_base.tag('opf', name))
item.tail = '\n'
package.append(item)
return item
@@ -945,7 +942,7 @@ class Container(ContainerBase): # {{{
item_id = id_prefix + '%d'%c
manifest = self.opf_xpath('//opf:manifest')[0]
item = manifest.makeelement(const.OPF_ITEM,
item = manifest.makeelement(oeb_base.tag('opf', 'item'),
id=item_id, href=href)
item.set('media-type', media_type)
self.insert_into_xml(manifest, item)
@@ -992,7 +989,7 @@ class Container(ContainerBase): # {{{
data = root = self.parsed(name)
if name == self.opf_name:
self.format_opf()
data = serialize(data, self.mime_map[name], pretty_print=name in
data = oeb_base.serialize(data, self.mime_map[name], pretty_print=name in
self.pretty_print)
if name == self.opf_name and root.nsmap.get(None) == const.OPF2_NS:
# Needed as I can't get lxml to output opf:role and
@@ -1181,7 +1178,7 @@ class EpubContainer(Container):
)
if not opf_files:
raise InvalidEpub('META-INF/container.xml contains no link to OPF file')
opf_path = os.path.join(self.root, *(urlunquote(opf_files[0].get('full-path')).split('/')))
opf_path = os.path.join(self.root, *(oeb_base.urlunquote(opf_files[0].get('full-path')).split('/')))
if not exists(opf_path):
raise InvalidEpub('OPF file does not exist at location pointed to'
' by META-INF/container.xml')
@@ -1412,7 +1409,7 @@ def do_explode(path, dest):
def opf_to_azw3(opf, outpath, container):
from ebook_converter.ebooks.conversion.plumber import Plumber, create_oebbook
class Item(Manifest.Item):
class Item(oeb_base.Manifest.Item):
def _parse_css(self, data):
# The default CSS parser used by oeb.base inserts the h namespace
+93 -59
View File
@@ -1,22 +1,16 @@
from collections import defaultdict
from functools import partial
import collections
import functools
from css_parser.css import CSSRule, CSSStyleDeclaration
from ebook_converter import constants as const
from ebook_converter import force_unicode
from ebook_converter.css_selectors import parse, SelectorSyntaxError
from ebook_converter.ebooks.oeb.base import OEB_STYLES, OEB_DOCS, css_text
from ebook_converter.ebooks.oeb.normalize_css import normalize_filter_css, normalizers
from ebook_converter.ebooks.oeb.polish.pretty import pretty_script_or_style, pretty_xml_tree, serialize
from ebook_converter.ebooks.oeb import base
from ebook_converter.ebooks.oeb.polish import pretty
from ebook_converter.utils.icu import numeric_sort_key
from ebook_converter.css_selectors import Select, SelectorError
__license__ = 'GPL v3'
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
def filter_used_rules(rules, log, select):
for rule in rules:
used = False
@@ -34,7 +28,8 @@ def filter_used_rules(rules, log, select):
yield rule
def get_imported_sheets(name, container, sheets, recursion_level=10, sheet=None):
def get_imported_sheets(name, container, sheets, recursion_level=10,
sheet=None):
ans = set()
sheet = sheet or sheets[name]
for rule in sheet.cssRules.rulesOfType(CSSRule.IMPORT_RULE):
@@ -44,7 +39,8 @@ def get_imported_sheets(name, container, sheets, recursion_level=10, sheet=None)
ans.add(iname)
if recursion_level > 0:
for imported_sheet in tuple(ans):
ans |= get_imported_sheets(imported_sheet, container, sheets, recursion_level=recursion_level-1)
ans |= get_imported_sheets(imported_sheet, container, sheets,
recursion_level=recursion_level-1)
ans.discard(name)
return ans
@@ -56,7 +52,7 @@ def merge_declarations(first, second):
def merge_identical_selectors(sheet):
' Merge rules that have identical selectors '
selector_map = defaultdict(list)
selector_map = collections.defaultdict(list)
for rule in sheet.cssRules.rulesOfType(CSSRule.STYLE_RULE):
selector_map[rule.selectorText].append(rule)
remove = []
@@ -70,23 +66,29 @@ def merge_identical_selectors(sheet):
return len(remove)
def remove_unused_css(container, report=None, remove_unused_classes=False, merge_rules=False):
'''
Remove all unused CSS rules from the book. An unused CSS rule is one that does not match any actual content.
def remove_unused_css(container, report=None, remove_unused_classes=False,
merge_rules=False):
"""
Remove all unused CSS rules from the book. An unused CSS rule is one that
does not match any actual content.
:param report: An optional callable that takes a single argument. It is called with information about the operations being performed.
:param remove_unused_classes: If True, class attributes in the HTML that do not match any CSS rules are also removed.
:param report: An optional callable that takes a single argument. It is
called with information about the operations being
performed.
:param remove_unused_classes: If True, class attributes in the HTML that
do not match any CSS rules are also removed.
:param merge_rules: If True, rules with identical selectors are merged.
'''
report = report or (lambda x:x)
"""
report = report or (lambda x: x)
def safe_parse(name):
try:
return container.parsed(name)
except TypeError:
pass
sheets = {name:safe_parse(name) for name, mt in container.mime_map.items() if mt in OEB_STYLES}
sheets = {k:v for k, v in sheets.items() if v is not None}
sheets = {name: safe_parse(name) for name, mt in container.mime_map.items()
if mt in base.OEB_STYLES and safe_parse(name) is not None}
num_merged = 0
if merge_rules:
for name, sheet in sheets.items():
@@ -106,7 +108,7 @@ def remove_unused_css(container, report=None, remove_unused_classes=False, merge
num_of_removed_rules = num_of_removed_classes = 0
for name, mt in container.mime_map.items():
if mt not in OEB_DOCS:
if mt not in base.OEB_DOCS:
continue
root = container.parsed(name)
select = Select(root, ignore_inappropriate_pseudo_classes=True)
@@ -120,31 +122,39 @@ def remove_unused_css(container, report=None, remove_unused_classes=False, merge
num_merged += num
container.dirty(name)
if remove_unused_classes:
used_classes |= {x.lower() for x in classes_in_rule_list(sheet.cssRules)}
imports = get_imported_sheets(name, container, sheets, sheet=sheet)
used_classes |= {x.lower() for x in
classes_in_rule_list(sheet.cssRules)}
imports = get_imported_sheets(name, container, sheets,
sheet=sheet)
for imported_sheet in imports:
style_rules[imported_sheet] = tuple(filter_used_rules(style_rules[imported_sheet], container.log, select))
style_rules[imported_sheet] = tuple(filter_used_rules(
style_rules[imported_sheet], container.log, select))
if remove_unused_classes:
used_classes |= class_map[imported_sheet]
rules = tuple(sheet.cssRules.rulesOfType(CSSRule.STYLE_RULE))
unused_rules = tuple(filter_used_rules(rules, container.log, select))
unused_rules = tuple(filter_used_rules(rules, container.log,
select))
if unused_rules:
num_of_removed_rules += len(unused_rules)
[sheet.cssRules.remove(r) for r in unused_rules]
style.text = force_unicode(sheet.cssText, 'utf-8')
pretty_script_or_style(container, style)
pretty.pretty_script_or_style(container, style)
container.dirty(name)
for link in root.xpath('//*[local-name()="link" and @href]'):
sname = container.href_to_name(link.get('href'), name)
if sname not in sheets:
continue
style_rules[sname] = tuple(filter_used_rules(style_rules[sname], container.log, select))
style_rules[sname] = tuple(filter_used_rules(style_rules[sname],
container.log,
select))
if remove_unused_classes:
used_classes |= class_map[sname]
for iname in import_map[sname]:
style_rules[iname] = tuple(filter_used_rules(style_rules[iname], container.log, select))
style_rules[iname] = tuple(
filter_used_rules(style_rules[iname], container.log,
select))
if remove_unused_classes:
used_classes |= class_map[iname]
@@ -159,7 +169,8 @@ def remove_unused_css(container, report=None, remove_unused_classes=False, merge
elem.set('class', ' '.join(classes))
else:
del elem.attrib['class']
num_of_removed_classes += len(original_classes) - len(classes)
num_of_removed_classes += (len(original_classes) -
len(classes))
container.dirty(name)
for name, sheet in sheets.items():
@@ -195,7 +206,7 @@ def filter_declaration(style, properties=()):
changed = True
all_props = set(style.keys())
for prop in style.getProperties():
n = normalizers.get(prop.name, None)
n = base.normalize_css.normalizers.get(prop.name, None)
if n is not None:
normalized = n(prop.name, prop.propertyValue)
removed = properties.intersection(set(normalized))
@@ -225,12 +236,13 @@ def transform_inline_styles(container, name, transform_sheet, transform_style):
root = container.parsed(name)
changed = False
for style in root.xpath('//*[local-name()="style"]'):
if style.text and (style.get('type') or 'text/css').lower() == 'text/css':
if style.text and (style.get('type') or
'text/css').lower() == 'text/css':
sheet = container.parse_css(style.text)
if transform_sheet(sheet):
changed = True
style.text = force_unicode(sheet.cssText, 'utf-8')
pretty_script_or_style(container, style)
pretty.pretty_script_or_style(container, style)
for elem in root.xpath('//*[@style]'):
text = elem.get('style', None)
if text:
@@ -240,13 +252,16 @@ def transform_inline_styles(container, name, transform_sheet, transform_style):
if style.length == 0:
del elem.attrib['style']
else:
elem.set('style', force_unicode(style.getCssText(separator=' '), 'utf-8'))
elem.set('style',
force_unicode(style.getCssText(separator=' '),
'utf-8'))
return changed
def transform_css(container, transform_sheet=None, transform_style=None, names=()):
def transform_css(container, transform_sheet=None, transform_style=None,
names=()):
if not names:
types = OEB_STYLES | OEB_DOCS
types = base.OEB_STYLES | base.OEB_DOCS
names = []
for name, mt in container.mime_map.items():
if mt in types:
@@ -256,13 +271,14 @@ def transform_css(container, transform_sheet=None, transform_style=None, names=(
for name in names:
mt = container.mime_map[name]
if mt in OEB_STYLES:
if mt in base.OEB_STYLES:
sheet = container.parsed(name)
if transform_sheet(sheet):
container.dirty(name)
doc_changed = True
elif mt in OEB_DOCS:
if transform_inline_styles(container, name, transform_sheet, transform_style):
elif mt in base.OEB_DOCS:
if transform_inline_styles(container, name, transform_sheet,
transform_style):
container.dirty(name)
doc_changed = True
@@ -270,15 +286,21 @@ def transform_css(container, transform_sheet=None, transform_style=None, names=(
def filter_css(container, properties, names=()):
    """
    Remove the specified CSS properties from all CSS rules in the book.

    :param container: The book container whose files are processed.
    :param properties: Set of properties to remove. For example:
                       :code:`{'font-family', 'color'}`.
    :param names: The files from which to remove the properties. Defaults to
                  all HTML and CSS files in the book.
    :return: The result of :func:`transform_css` for the given files.
    """
    # Import from the module that defines the helper instead of reaching it
    # through ``base.normalize_css``; that attribute only exists if oeb.base
    # happens to bind the name, which this module cannot rely on.
    from ebook_converter.ebooks.oeb.normalize_css import normalize_filter_css

    properties = normalize_filter_css(properties)
    sheet_filter = functools.partial(filter_sheet, properties=properties)
    style_filter = functools.partial(filter_declaration,
                                     properties=properties)
    return transform_css(container, transform_sheet=sheet_filter,
                         transform_style=style_filter, names=names)
def _classes_in_selector(selector, classes):
@@ -331,21 +353,29 @@ def remove_property_value(prop, predicate):
if len(removed_vals) == len(prop.propertyValue):
prop.parent.removeProperty(prop.name)
else:
x = css_text(prop.propertyValue)
x = base.css_text(prop.propertyValue)
for v in removed_vals:
x = x.replace(css_text(v), '').strip()
x = x.replace(base.css_text(v), '').strip()
prop.propertyValue.cssText = x
return bool(removed_vals)
# Maps each of these rule types to its position in the tuple below
# (COMMENT -> 0 ... NAMESPACE_RULE -> 3).
# NOTE(review): presumably consumed by sort_sheet as a sort rank - confirm.
RULE_PRIORITIES = dict(
    (rule_type, rank)
    for rank, rule_type in enumerate((
        CSSRule.COMMENT,
        CSSRule.CHARSET_RULE,
        CSSRule.IMPORT_RULE,
        CSSRule.NAMESPACE_RULE,
    ))
)
def sort_sheet(container, sheet_or_text):
''' Sort the rules in a stylesheet. Note that in the general case this can
change the effective styles, but for most common sheets, it should be safe.
'''
sheet = container.parse_css(sheet_or_text) if isinstance(sheet_or_text, str) else sheet_or_text
"""
Sort the rules in a stylesheet. Note that in the general case this can
change the effective styles, but for most common sheets, it should be
safe.
"""
if isinstance(sheet_or_text, str):
sheet = container.parse_css(sheet_or_text)
else:
sheet = sheet_or_text
def text_sort_key(x):
return numeric_sort_key(str(x or ''))
@@ -364,7 +394,8 @@ def sort_sheet(container, sheet_or_text):
rule.selectorText = ', '.join(s.selectorText for s in selectors)
elif rule.type == CSSRule.FONT_FACE_RULE:
try:
tertiary = text_sort_key(rule.style.getPropertyValue('font-family'))
tertiary = text_sort_key(rule.style.getPropertyValue('font-'
'family'))
except Exception:
pass
@@ -379,11 +410,14 @@ def add_stylesheet_links(container, name, text):
if not head:
return
head = head[0]
sheets = tuple(container.manifest_items_of_type(lambda mt: mt in OEB_STYLES))
sheets = tuple(container.manifest_items_of_type(lambda mt:
mt in base.OEB_STYLES))
if not sheets:
return
for sname in sheets:
link = head.makeelement(const.XHTML_LINK, type='text/css', rel='stylesheet', href=container.name_to_href(sname, name))
link = head.makeelement(base.tag('xhtml', 'link'), type='text/css',
rel='stylesheet',
href=container.name_to_href(sname, name))
head.append(link)
pretty_xml_tree(head)
return serialize(root, 'text/html')
pretty.pretty_xml_tree(head)
return pretty.serialize(root, 'text/html')
+3 -2
View File
@@ -1,6 +1,7 @@
from lxml import etree
from ebook_converter import constants as const
from ebook_converter.ebooks.oeb import base
from ebook_converter.utils.localization import canonicalize_lang
@@ -14,7 +15,7 @@ def get_book_language(container):
def set_guide_item(container, item_type, title, name, frag=None):
ref_tag = const.OPF_REFERENCE
ref_tag = base.tag('opf', 'reference')
href = None
if name:
href = container.name_to_href(name, container.opf_name)
@@ -23,7 +24,7 @@ def set_guide_item(container, item_type, title, name, frag=None):
guides = container.opf_xpath('//opf:guide')
if not guides and href:
g = container.opf.makeelement(const.OPF_GUIDE,
g = container.opf.makeelement(base.tag('opf', 'guide'),
nsmap={'opf': const.OPF2_NS})
container.insert_into_xml(container.opf, g)
guides = [g]
+70 -43
View File
@@ -1,18 +1,13 @@
import textwrap
# from lxml.etree import Element
from ebook_converter import constants as const
from ebook_converter import force_unicode
from ebook_converter.ebooks.oeb import parse_utils
from ebook_converter.ebooks.oeb.base import serialize, OEB_DOCS, OEB_STYLES
from ebook_converter.ebooks.oeb import base
from ebook_converter.ebooks.oeb.polish.utils import guess_type
from ebook_converter.utils.icu import sort_key
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
def isspace(x):
    """
    Return True if ``x`` is empty or consists only of tab, line feed, form
    feed, carriage return and space characters.
    """
    whitespace = '\u0009\u000a\u000c\u000d\u0020'
    # Stripping both ends leaves nothing exactly when every character of
    # ``x`` belongs to the whitespace set above.
    return len(x.strip(whitespace)) == 0
@@ -28,37 +23,40 @@ def pretty_xml_tree(elem, level=0, indent=' '):
for i, child in enumerate(elem):
pretty_xml_tree(child, level=level+1, indent=indent)
if not child.tail or isspace(child.tail):
l = level + 1
new_level = level + 1
if i == len(elem) - 1:
l -= 1
child.tail = '\n' + (indent * l)
new_level -= 1
child.tail = '\n' + (indent * new_level)
def pretty_opf(root):
# Put all dc: tags first starting with title and author. Preserve order for
# the rest.
def dckey(x):
return {'title':0, 'creator':1}.get(parse_utils.barename(x.tag), 2)
for metadata in root.xpath('//opf:metadata', namespaces=const.OPF_NAMESPACES):
return {'title': 0, 'creator': 1}.get(parse_utils.barename(x.tag), 2)
for metadata in root.xpath('//opf:metadata',
namespaces=const.OPF_NAMESPACES):
dc_tags = metadata.xpath('./*[namespace-uri()="%s"]' % const.DC11_NS)
dc_tags.sort(key=dckey)
for x in reversed(dc_tags):
metadata.insert(0, x)
# Group items in the manifest
spine_ids = root.xpath('//opf:spine/opf:itemref/@idref', namespaces=const.OPF_NAMESPACES)
spine_ids = {x:i for i, x in enumerate(spine_ids)}
spine_ids = root.xpath('//opf:spine/opf:itemref/@idref',
namespaces=const.OPF_NAMESPACES)
spine_ids = {x: i for i, x in enumerate(spine_ids)}
def manifest_key(x):
mt = x.get('media-type', '')
href = x.get('href', '')
ext = href.rpartition('.')[-1].lower()
cat = 1000
if mt in OEB_DOCS:
if mt in base.OEB_DOCS:
cat = 0
elif mt == guess_type('a.ncx'):
cat = 1
elif mt in OEB_STYLES:
elif mt in base.OEB_STYLES:
cat = 2
elif mt.startswith('image/'):
cat = 3
@@ -75,20 +73,23 @@ def pretty_opf(root):
i = sort_key(href)
return (cat, i)
for manifest in root.xpath('//opf:manifest', namespaces=const.OPF_NAMESPACES):
for manifest in root.xpath('//opf:manifest',
namespaces=const.OPF_NAMESPACES):
try:
children = sorted(manifest, key=manifest_key)
except AttributeError:
continue # There are comments so dont sort since that would mess up the comments
# There are comments so dont sort since that would mess up the
# comments.
continue
for x in reversed(children):
manifest.insert(0, x)
def isblock(x):
    """
    Return True if the node ``x`` should be treated as a block.

    A node counts as a block when its tag is callable or empty, or when the
    tag is one of ``const.XHTML_BLOCK_TAGS`` or the SVG root tag.
    """
    tag = x.tag
    # NOTE(review): callable tags are presumably lxml comments/processing
    # instructions - confirm; they are treated as blocks unconditionally.
    if callable(tag) or not tag:
        return True
    return tag in const.XHTML_BLOCK_TAGS | {base.tag('svg', 'svg')}
@@ -133,28 +134,34 @@ def pretty_block(parent, level=1, indent=' '):
that contain only other block tags '''
if not parent.text or isspace(parent.text):
parent.text = ''
nn = '\n' if hasattr(parent.tag, 'strip') and parse_utils.barename(parent.tag) in {'tr', 'td', 'th'} else '\n\n'
if (hasattr(parent.tag, 'strip') and
parse_utils.barename(parent.tag) in {'tr', 'td', 'th'}):
nn = '\n'
else:
nn = '\n\n'
parent.text = parent.text + nn + (indent * level)
for i, child in enumerate(parent):
if isblock(child) and has_only_blocks(child):
pretty_block(child, level=level+1, indent=indent)
elif child.tag == const.SVG_SVG:
elif child.tag == base.tag('svg', 'svg'):
pretty_xml_tree(child, level=level, indent=indent)
l = level
new_level = level
if i == len(parent) - 1:
l -= 1
new_level -= 1
if not child.tail or isspace(child.tail):
child.tail = ''
child.tail = child.tail + nn + (indent * l)
child.tail = child.tail + nn + (indent * new_level)
def pretty_script_or_style(container, child):
if child.text:
indent = indent_for_tag(child)
if child.tag.endswith('style'):
child.text = force_unicode(pretty_css(container, '', child.text), 'utf-8')
child.text = force_unicode(pretty_css(container, '', child.text),
'utf-8')
child.text = textwrap.dedent(child.text)
child.text = '\n' + '\n'.join([(indent + x) if x else '' for x in child.text.splitlines()])
child.text = '\n' + '\n'.join([(indent + x) if x else ''
for x in child.text.splitlines()])
set_indent(child, 'text', indent)
@@ -169,62 +176,82 @@ def pretty_html_tree(container, root):
# Special case the handling of a body that contains a single block tag
# with all content. In this case we prettify the containing block tag
# even if it has non block children.
if (len(body) == 1 and not callable(body[0].tag) and isblock(body[0]) and not has_only_blocks(
body[0]) and parse_utils.barename(body[0].tag) not in (
'pre', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6') and len(body[0]) > 0):
if (len(body) == 1 and
not callable(body[0].tag) and
isblock(body[0]) and
not has_only_blocks(body[0]) and
parse_utils.barename(body[0].tag) not in ('pre', 'p', 'h1',
'h2', 'h3', 'h4',
'h5', 'h6') and
len(body[0]) > 0):
pretty_block(body[0], level=2)
if container is not None:
# Handle <script> and <style> tags
for child in root.xpath('//*[local-name()="script" or local-name()="style"]'):
for child in root.xpath('//*[local-name()="script" or local-name()='
'"style"]'):
pretty_script_or_style(container, child)
def fix_html(container, raw):
    """
    Fix any parsing errors in the HTML represented as a string in raw. Fixing
    is done using the HTML5 parsing algorithm.

    :param container: Object providing ``parse_xhtml``.
    :param raw: The HTML source to repair.
    :return: ``raw`` re-serialized as ``text/html`` after parsing.
    """
    return base.serialize(container.parse_xhtml(raw), 'text/html')
def pretty_html(container, name, raw):
    """
    Pretty print the HTML represented as a string in raw.

    :param container: Object providing ``parse_xhtml``; also handed on to
                      ``pretty_html_tree``.
    :param name: Not used by this function.
    :param raw: The HTML source to pretty print.
    :return: The prettified tree serialized as ``text/html``.
    """
    tree = container.parse_xhtml(raw)
    # pretty_html_tree works on the tree in place; its result is not used.
    pretty_html_tree(container, tree)
    return base.serialize(tree, 'text/html')
def pretty_css(container, name, raw):
    """
    Pretty print the CSS represented as a string in raw.

    :param container: Object providing ``parse_css``.
    :param name: Not used by this function.
    :param raw: The CSS source to pretty print.
    :return: The parsed sheet serialized as ``text/css``.
    """
    return base.serialize(container.parse_css(raw), 'text/css')
def pretty_xml(container, name, raw):
    """
    Pretty print the XML represented as a string in raw. If ``name`` is the
    name of the OPF, extra OPF-specific prettying is performed.

    :param container: Object providing ``parse_xml`` and ``opf_name``.
    :param name: Name of the file ``raw`` belongs to.
    :param raw: The XML source to pretty print.
    :return: The prettified tree serialized as ``text/xml``.
    """
    tree = container.parse_xml(raw)
    is_opf = name == container.opf_name
    if is_opf:
        # OPF-specific reordering runs before the generic indentation pass.
        pretty_opf(tree)
    pretty_xml_tree(tree)
    return base.serialize(tree, 'text/xml')
def fix_all_html(container):
    """
    Fix any parsing errors in all HTML files in the container. Fixing is done
    using the HTML5 parsing algorithm.

    :param container: Object providing ``mime_map``, ``parsed`` and
                      ``dirty``.
    """
    for name, mt in container.mime_map.items():
        if mt not in base.OEB_DOCS:
            continue
        # Parse the file, then mark it dirty.
        container.parsed(name)
        container.dirty(name)
def pretty_all(container):
' Pretty print all HTML/CSS/XML files in the container '
"""
Pretty print all HTML/CSS/XML files in the container
"""
xml_types = {guess_type('a.ncx'), guess_type('a.xml'), guess_type('a.svg')}
for name, mt in container.mime_map.items():
prettied = False
if mt in OEB_DOCS:
if mt in base.OEB_DOCS:
pretty_html_tree(container, container.parsed(name))
prettied = True
elif mt in OEB_STYLES:
elif mt in base.OEB_STYLES:
container.parsed(name)
prettied = True
elif name == container.opf_name: