Mirror of https://github.com/gryf/ebook-converter.git
Use the real constants module.
This continues the refactoring of the calibre code to make it more readable and more coherent. This patch changes how several modules import their dependencies: instead of polluting each module's namespace with symbols that were often themselves re-imported from other modules (yuck), the code now imports the owning module and references its symbols explicitly.
+273 -277
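The pattern repeated across every file below: a module-level import of the real constants module (from ebook_converter import constants as const) or of a sibling module (from ebook_converter.ebooks.oeb import base), with every use qualified at the call site. A minimal runnable sketch of the idea; const and tag() here are illustrative stand-ins, not the actual ebook_converter API:

    # Stand-ins for ebook_converter.constants and base.tag(), for illustration
    # only: import the owning module once and qualify every use, instead of
    # re-importing symbols that another module merely re-exported.
    from lxml import etree

    class const:
        # mirrors the role of ebook_converter.constants
        XHTML_NS = 'http://www.w3.org/1999/xhtml'

    def tag(prefix, name):
        # stand-in for base.tag(): builds lxml's Clark notation '{namespace}local'
        return '{%s}%s' % ({'xhtml': const.XHTML_NS}[prefix], name)

    root = etree.Element(tag('xhtml', 'html'), nsmap={None: const.XHTML_NS})
    head = etree.SubElement(root, tag('xhtml', 'head'))
    head.append(head.makeelement(tag('xhtml', 'link'), rel='stylesheet'))
    print(etree.tostring(root, pretty_print=True).decode())

A reader can now grep for const.XHTML_NS or base.tag and land on the single defining module, which is the point of the refactor.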
@@ -1,20 +1,16 @@
import re

from lxml import etree, html
from lxml import etree
from lxml import html

from ebook_converter import constants as const
from ebook_converter import xml_replace_entities, force_unicode
from ebook_converter.utils.xml_parse import safe_xml_fromstring
from ebook_converter.constants_old import filesystem_encoding
from ebook_converter.ebooks.chardet import xml_to_unicode, strip_encoding_declarations


__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

RECOVER_PARSER = etree.XMLParser(recover=True, no_network=True, resolve_entities=False)
XHTML_NS = 'http://www.w3.org/1999/xhtml'
XMLNS_NS = 'http://www.w3.org/2000/xmlns/'
RECOVER_PARSER = etree.XMLParser(recover=True, no_network=True,
resolve_entities=False)


class NotHTML(Exception):
@@ -33,15 +29,15 @@ def namespace(name):


def XHTML(name):
return '{%s}%s' % (XHTML_NS, name)
return '{%s}%s' % (const.XHTML_NS, name)


def xpath(elem, expr):
return elem.xpath(expr, namespaces={'h':XHTML_NS})
return elem.xpath(expr, namespaces={'h':const.XHTML_NS})


def XPath(expr):
return etree.XPath(expr, namespaces={'h':XHTML_NS})
return etree.XPath(expr, namespaces={'h':const.XHTML_NS})


META_XP = XPath('/h:html/h:head/h:meta[@http-equiv="Content-Type"]')
@@ -111,7 +107,7 @@ def _html4_parse(data):
elem.text = elem.text.strip('-')
data = etree.tostring(data, encoding='unicode')

data = safe_xml_fromstring(data)
data = etree.fromstring(data)
return data


@@ -204,14 +200,14 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,

# Try with more & more drastic measures to parse
try:
data = safe_xml_fromstring(data, recover=False)
data = etree.fromstring(data)
check_for_html5(pre, data)
except (HTML5Doc, etree.XMLSyntaxError):
log.debug('Initial parse failed, using more'
' forgiving parsers')
raw = data = xml_replace_entities(raw)
try:
data = safe_xml_fromstring(data, recover=False)
data = etree.fromstring(data)
check_for_html5(pre, data)
except (HTML5Doc, etree.XMLSyntaxError):
log.debug('Parsing %s as HTML' % filename)
@@ -240,7 +236,7 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
if barename(data.tag) in non_html_file_tags:
raise NotHTML(data.tag)
log.warn('File %r does not appear to be (X)HTML'%filename)
nroot = safe_xml_fromstring('<html></html>')
nroot = etree.fromstring('<html></html>')
has_body = False
for child in list(data):
if isinstance(child.tag, (str, bytes)) and barename(child.tag) == 'body':
@@ -249,7 +245,7 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
parent = nroot
if not has_body:
log.warn('File %r appears to be a HTML fragment'%filename)
nroot = safe_xml_fromstring('<html><body/></html>')
nroot = etree.fromstring('<html><body/></html>')
parent = nroot[0]
for child in list(data.iter()):
oparent = child.getparent()
@@ -261,16 +257,16 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
# Force into the XHTML namespace
if not namespace(data.tag):
log.warn('Forcing', filename, 'into XHTML namespace')
data.attrib['xmlns'] = XHTML_NS
data.attrib['xmlns'] = const.XHTML_NS
data = etree.tostring(data, encoding='unicode')

try:
data = safe_xml_fromstring(data, recover=False)
data = etree.fromstring(data)
except:
data = data.replace(':=', '=').replace(':>', '>')
data = data.replace('<http:/>', '')
try:
data = safe_xml_fromstring(data, recover=False)
data = etree.fromstring(data)
except etree.XMLSyntaxError:
log.warn('Stripping comments from %s'%
filename)
@@ -281,17 +277,17 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
'')
data = data.replace("<?xml version='1.0' encoding='utf-8'??>", '')
try:
data = safe_xml_fromstring(data)
data = etree.fromstring(data)
except etree.XMLSyntaxError:
log.warn('Stripping meta tags from %s'% filename)
data = re.sub(r'<meta\s+[^>]+?>', '', data)
data = safe_xml_fromstring(data)
elif namespace(data.tag) != XHTML_NS:
data = etree.fromstring(data)
elif namespace(data.tag) != const.XHTML_NS:
# OEB_DOC_NS, but possibly others
ns = namespace(data.tag)
attrib = dict(data.attrib)
nroot = etree.Element(XHTML('html'),
nsmap={None: XHTML_NS}, attrib=attrib)
nsmap={None: const.XHTML_NS}, attrib=attrib)
for elem in data.iterdescendants():
if isinstance(elem.tag, (str, bytes)) and \
namespace(elem.tag) == ns:
@@ -301,7 +297,7 @@ def parse_html(data, log=None, decoder=None, preprocessor=None,
data = nroot

# Remove non default prefixes referring to the XHTML namespace
data = ensure_namespace_prefixes(data, {None: XHTML_NS})
data = ensure_namespace_prefixes(data, {None: const.XHTML_NS})

data = merge_multiple_html_heads_and_bodies(data, log)
# Ensure has a <head/>

@@ -14,7 +14,9 @@ from itertools import count
import urllib.parse

from css_parser import getUrls, replaceUrls
from lxml import etree

from ebook_converter import constants as const
from ebook_converter import CurrentDir, walk
from ebook_converter.constants_old import iswindows
from ebook_converter.customize.ui import plugin_for_input_format, plugin_for_output_format
@@ -34,7 +36,7 @@ from ebook_converter.ebooks.mobi import MobiError
from ebook_converter.ebooks.mobi.reader.headers import MetadataHeader
from ebook_converter.ebooks.mobi.tweak import set_cover
from ebook_converter.ebooks.oeb.base import (
DC11_NS, OEB_DOCS, OEB_STYLES, OPF, OPF2_NS, Manifest, itercsslinks, iterlinks,
OEB_DOCS, OEB_STYLES, Manifest, itercsslinks, iterlinks,
rewrite_links, serialize, urlquote, urlunquote
)
from ebook_converter.ebooks.oeb.parse_utils import NotHTML, parse_html
@@ -47,13 +49,11 @@ from ebook_converter.ptempfile import PersistentTemporaryDirectory, PersistentTe
from ebook_converter.utils.filenames import hardlink_file, nlinks_file
from ebook_converter.utils.ipc.simple_worker import WorkerError, fork_job
from ebook_converter.utils.logging import default_log
from ebook_converter.utils.xml_parse import safe_xml_fromstring
from ebook_converter.utils.zipfile import ZipFile

exists, join, relpath = os.path.exists, os.path.join, os.path.relpath

OEB_FONTS = {guess_type('a.ttf'), guess_type('b.otf'), guess_type('a.woff'), 'application/x-font-ttf', 'application/x-font-otf', 'application/font-sfnt'}
OPF_NAMESPACES = {'opf':OPF2_NS, 'dc':DC11_NS}
null = object()


@@ -195,7 +195,7 @@ class ContainerBase(object): # {{{
data, self.used_encoding = xml_to_unicode(
data, strip_encoding_pats=True, assume_utf8=True, resolve_entities=True)
data = unicodedata.normalize('NFC', data)
return safe_xml_fromstring(data)
return etree.fromstring(data)

def parse_xhtml(self, data, fname='<string>', force_html5_parse=False):
if self.tweak_mode:
@@ -324,7 +324,7 @@ class Container(ContainerBase): # {{{
item_id = 'id' + '%d'%c
manifest = self.opf_xpath('//opf:manifest')[0]
href = self.name_to_href(name, self.opf_name)
item = manifest.makeelement(OPF('item'),
item = manifest.makeelement(const.OPF_ITEM,
id=item_id, href=href)
item.set('media-type', self.mime_map[name])
self.insert_into_xml(manifest, item)
@@ -380,7 +380,7 @@ class Container(ContainerBase): # {{{
if mt in OEB_DOCS:
manifest = self.opf_xpath('//opf:manifest')[0]
spine = self.opf_xpath('//opf:spine')[0]
si = manifest.makeelement(OPF('itemref'), idref=item_id)
si = manifest.makeelement(const.OPF_ITEMREF, idref=item_id)
self.insert_into_xml(spine, si, index=spine_index)
return name

@@ -533,7 +533,7 @@ class Container(ContainerBase): # {{{

def opf_xpath(self, expr):
' Convenience method to evaluate an XPath expression on the OPF file, has the opf: and dc: namespace prefixes pre-defined. '
return self.opf.xpath(expr, namespaces=OPF_NAMESPACES)
return self.opf.xpath(expr, namespaces=const.OPF_NAMESPACES)

def has_name(self, name):
''' Return True iff a file with the same canonical name as that specified exists. Unlike :meth:`exists` this method is always case-sensitive. '''
@@ -813,7 +813,8 @@ class Container(ContainerBase): # {{{
spine = self.opf_xpath('//opf:spine')[0]
spine.text = tail
for name, linear in spine_items:
i = spine.makeelement('{%s}itemref' % OPF_NAMESPACES['opf'], nsmap={'opf':OPF_NAMESPACES['opf']})
i = spine.makeelement(const.OPF_ITEMREF,
nsmap={'opf': const.OPF2_NS})
i.tail = tail
i.set('idref', imap[name])
spine.append(i)
@@ -944,7 +945,7 @@ class Container(ContainerBase): # {{{
item_id = id_prefix + '%d'%c

manifest = self.opf_xpath('//opf:manifest')[0]
item = manifest.makeelement(OPF('item'),
item = manifest.makeelement(const.OPF_ITEM,
id=item_id, href=href)
item.set('media-type', media_type)
self.insert_into_xml(manifest, item)
@@ -993,7 +994,7 @@ class Container(ContainerBase): # {{{
self.format_opf()
data = serialize(data, self.mime_map[name], pretty_print=name in
self.pretty_print)
if name == self.opf_name and root.nsmap.get(None) == OPF2_NS:
if name == self.opf_name and root.nsmap.get(None) == const.OPF2_NS:
# Needed as I can't get lxml to output opf:role and
# not output <opf:metadata> as well
data = re.sub(br'(<[/]{0,1})opf:', r'\1', data)
@@ -1172,7 +1173,7 @@ class EpubContainer(Container):
container_path = join(self.root, 'META-INF', 'container.xml')
if not exists(container_path):
raise InvalidEpub('No META-INF/container.xml in epub')
container = safe_xml_fromstring(open(container_path, 'rb').read())
container = etree.fromstring(open(container_path, 'rb').read())
opf_files = container.xpath((
r'child::ocf:rootfiles/ocf:rootfile'
'[@media-type="%s" and @full-path]'%guess_type('a.opf')

@@ -2,10 +2,11 @@ from collections import defaultdict
from functools import partial

from css_parser.css import CSSRule, CSSStyleDeclaration
from ebook_converter.css_selectors import parse, SelectorSyntaxError

from ebook_converter import constants as const
from ebook_converter import force_unicode
from ebook_converter.ebooks.oeb.base import OEB_STYLES, OEB_DOCS, XHTML, css_text
from ebook_converter.css_selectors import parse, SelectorSyntaxError
from ebook_converter.ebooks.oeb.base import OEB_STYLES, OEB_DOCS, css_text
from ebook_converter.ebooks.oeb.normalize_css import normalize_filter_css, normalizers
from ebook_converter.ebooks.oeb.polish.pretty import pretty_script_or_style, pretty_xml_tree, serialize
from ebook_converter.utils.icu import numeric_sort_key
@@ -382,7 +383,7 @@ def add_stylesheet_links(container, name, text):
if not sheets:
return
for sname in sheets:
link = head.makeelement(XHTML('link'), type='text/css', rel='stylesheet', href=container.name_to_href(sname, name))
link = head.makeelement(const.XHTML_LINK, type='text/css', rel='stylesheet', href=container.name_to_href(sname, name))
head.append(link)
pretty_xml_tree(head)
return serialize(root, 'text/html')

@@ -1,13 +1,9 @@
from lxml import etree

from ebook_converter.ebooks.oeb.polish.container import OPF_NAMESPACES
from ebook_converter import constants as const
from ebook_converter.utils.localization import canonicalize_lang


__license__ = 'GPL v3'
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'


def get_book_language(container):
for lang in container.opf_xpath('//dc:language'):
raw = lang.text
@@ -18,7 +14,7 @@ def get_book_language(container):


def set_guide_item(container, item_type, title, name, frag=None):
ref_tag = '{%s}reference' % OPF_NAMESPACES['opf']
ref_tag = const.OPF_REFERENCE
href = None
if name:
href = container.name_to_href(name, container.opf_name)
@@ -27,23 +23,27 @@ def set_guide_item(container, item_type, title, name, frag=None):

guides = container.opf_xpath('//opf:guide')
if not guides and href:
g = container.opf.makeelement('{%s}guide' % OPF_NAMESPACES['opf'], nsmap={'opf':OPF_NAMESPACES['opf']})
g = container.opf.makeelement(const.OPF_GUIDE,
nsmap={'opf': const.OPF2_NS})
container.insert_into_xml(container.opf, g)
guides = [g]

for guide in guides:
matches = []
for child in guide.iterchildren(etree.Element):
if child.tag == ref_tag and child.get('type', '').lower() == item_type.lower():
if (child.tag == ref_tag and
child.get('type', '').lower() == item_type.lower()):
matches.append(child)
if not matches and href:
r = guide.makeelement(ref_tag, type=item_type, nsmap={'opf':OPF_NAMESPACES['opf']})
r = guide.makeelement(ref_tag, type=item_type,
nsmap={'opf': const.OPF2_NS})
container.insert_into_xml(guide, r)
matches.append(r)
for m in matches:
if href:
m.set('title', title), m.set('href', href), m.set('type', item_type)
m.set('title', title)
m.set('href', href)
m.set('type', item_type)
else:
container.remove_from_xml(m)
container.dirty(container.opf_name)


@@ -1,21 +1,18 @@
import re

from lxml.etree import Element as LxmlElement
from lxml import etree
import html5_parser

from ebook_converter import constants as const
from ebook_converter import xml_replace_entities
from ebook_converter.utils.xml_parse import safe_xml_fromstring
from ebook_converter.ebooks.chardet import xml_to_unicode, strip_encoding_declarations
from ebook_converter.ebooks.chardet import strip_encoding_declarations
from ebook_converter.ebooks.chardet import xml_to_unicode
from ebook_converter.utils.cleantext import clean_xml_chars


__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'

XHTML_NS = 'http://www.w3.org/1999/xhtml'


def parse_html5(raw, decoder=None, log=None, discard_namespaces=False, line_numbers=True, linenumber_attribute=None, replace_entities=True, fix_newlines=True):
def parse_html5(raw, decoder=None, log=None, discard_namespaces=False,
line_numbers=True, linenumber_attribute=None,
replace_entities=True, fix_newlines=True):
if isinstance(raw, bytes):
raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
if replace_entities:
@@ -23,10 +20,14 @@ def parse_html5(raw, decoder=None, log=None, discard_namespaces=False, line_numb
if fix_newlines:
raw = raw.replace('\r\n', '\n').replace('\r', '\n')
raw = clean_xml_chars(raw)
root = html5_parser.parse(raw, maybe_xhtml=not discard_namespaces, line_number_attr=linenumber_attribute, keep_doctype=False, sanitize_names=True)
if (discard_namespaces and root.tag != 'html') or (
not discard_namespaces and (root.tag != '{%s}%s' % (XHTML_NS, 'html') or root.prefix)):
raise ValueError('Failed to parse correctly, root has tag: %s and prefix: %s' % (root.tag, root.prefix))
root = html5_parser.parse(raw, maybe_xhtml=not discard_namespaces,
line_number_attr=linenumber_attribute,
keep_doctype=False, sanitize_names=True)
if ((discard_namespaces and root.tag != 'html') or
(not discard_namespaces and
(root.tag != '{%s}%s' % (const.XHTML_NS, 'html') or root.prefix))):
raise ValueError('Failed to parse correctly, root has tag: %s and '
'prefix: %s' % (root.tag, root.prefix))
return root


@@ -48,12 +49,14 @@ def handle_private_entities(data):
user_entities[match.group(1)] = val
if user_entities:
data = ('\n' * num_of_nl_in_pre) + data[idx:]
pat = re.compile(r'&(%s);'%('|'.join(user_entities.keys())))
data = pat.sub(lambda m:user_entities[m.group(1)], data)
pat = re.compile(r'&(%s);' % ('|'.join(user_entities.keys())))
data = pat.sub(lambda m: user_entities[m.group(1)], data)
return data


def parse(raw, decoder=None, log=None, line_numbers=True, linenumber_attribute=None, replace_entities=True, force_html5_parse=False):
def parse(raw, decoder=None, log=None, line_numbers=True,
linenumber_attribute=None, replace_entities=True,
force_html5_parse=False):
if isinstance(raw, bytes):
raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
raw = handle_private_entities(raw)
@@ -70,26 +73,32 @@ def parse(raw, decoder=None, log=None, line_numbers=True, linenumber_attribute=N
raw = ('\n' * newlines) + raw[match.start():]
break

raw = strip_encoding_declarations(raw, limit=10*1024, preserve_newlines=True)
raw = strip_encoding_declarations(raw, limit=10*1024,
preserve_newlines=True)
if force_html5_parse:
return parse_html5(raw, log=log, line_numbers=line_numbers, linenumber_attribute=linenumber_attribute, replace_entities=False, fix_newlines=False)
return parse_html5(raw, log=log, line_numbers=line_numbers,
linenumber_attribute=linenumber_attribute,
replace_entities=False, fix_newlines=False)
try:
ans = safe_xml_fromstring(raw, recover=False)
if ans.tag != '{%s}html' % XHTML_NS:
ans = etree.fromstring(raw)
if ans.tag != '{%s}html' % const.XHTML_NS:
raise ValueError('Root tag is not <html> in the XHTML namespace')
if linenumber_attribute:
for elem in ans.iter(LxmlElement):
for elem in ans.iter(etree.Element):
if elem.sourceline is not None:
elem.set(linenumber_attribute, str(elem.sourceline))
return ans
except Exception:
if log is not None:
log.exception('Failed to parse as XML, parsing as tag soup')
return parse_html5(raw, log=log, line_numbers=line_numbers, linenumber_attribute=linenumber_attribute, replace_entities=False, fix_newlines=False)
return parse_html5(raw, log=log, line_numbers=line_numbers,
linenumber_attribute=linenumber_attribute,
replace_entities=False, fix_newlines=False)


if __name__ == '__main__':
from lxml import etree
root = parse_html5('\n<html><head><title>a\n</title><p b=1 c=2 a=0> \n<b>b<svg ass="wipe" viewbox="0">', discard_namespaces=False)
root = parse_html5('\n<html><head><title>a\n</title><p b=1 c=2 a=0> '
'\n<b>b<svg ass="wipe" viewbox="0">',
discard_namespaces=False)
print(etree.tostring(root, encoding='utf-8'))
print()

@@ -2,10 +2,10 @@ import textwrap

# from lxml.etree import Element

from ebook_converter import constants as const
from ebook_converter import force_unicode
from ebook_converter.ebooks.oeb.base import (
serialize, OEB_DOCS, barename, OEB_STYLES, XPNSMAP, XHTML, SVG)
from ebook_converter.ebooks.oeb.polish.container import OPF_NAMESPACES
from ebook_converter.ebooks.oeb import parse_utils
from ebook_converter.ebooks.oeb.base import serialize, OEB_DOCS, OEB_STYLES
from ebook_converter.ebooks.oeb.polish.utils import guess_type
from ebook_converter.utils.icu import sort_key

@@ -38,15 +38,15 @@ def pretty_opf(root):
# Put all dc: tags first starting with title and author. Preserve order for
# the rest.
def dckey(x):
return {'title':0, 'creator':1}.get(barename(x.tag), 2)
for metadata in root.xpath('//opf:metadata', namespaces=OPF_NAMESPACES):
dc_tags = metadata.xpath('./*[namespace-uri()="%s"]' % OPF_NAMESPACES['dc'])
return {'title':0, 'creator':1}.get(parse_utils.barename(x.tag), 2)
for metadata in root.xpath('//opf:metadata', namespaces=const.OPF_NAMESPACES):
dc_tags = metadata.xpath('./*[namespace-uri()="%s"]' % const.DC11_NS)
dc_tags.sort(key=dckey)
for x in reversed(dc_tags):
metadata.insert(0, x)

# Group items in the manifest
spine_ids = root.xpath('//opf:spine/opf:itemref/@idref', namespaces=OPF_NAMESPACES)
spine_ids = root.xpath('//opf:spine/opf:itemref/@idref', namespaces=const.OPF_NAMESPACES)
spine_ids = {x:i for i, x in enumerate(spine_ids)}

def manifest_key(x):
@@ -75,7 +75,7 @@ def pretty_opf(root):
i = sort_key(href)
return (cat, i)

for manifest in root.xpath('//opf:manifest', namespaces=OPF_NAMESPACES):
for manifest in root.xpath('//opf:manifest', namespaces=const.OPF_NAMESPACES):
try:
children = sorted(manifest, key=manifest_key)
except AttributeError:
@@ -84,19 +84,11 @@ def pretty_opf(root):
manifest.insert(0, x)


SVG_TAG = SVG('svg')
BLOCK_TAGS = frozenset(map(XHTML, (
'address', 'article', 'aside', 'audio', 'blockquote', 'body', 'canvas', 'col', 'colgroup', 'dd',
'div', 'dl', 'dt', 'fieldset', 'figcaption', 'figure', 'footer', 'form',
'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hgroup', 'hr', 'li',
'noscript', 'ol', 'output', 'p', 'pre', 'script', 'section', 'style', 'table', 'tbody', 'td',
'tfoot', 'th', 'thead', 'tr', 'ul', 'video', 'img'))) | {SVG_TAG}


def isblock(x):
if callable(x.tag) or not x.tag:
return True
if x.tag in BLOCK_TAGS:
if x.tag in const.XHTML_BLOCK_TAGS | {const.SVG_SVG}:
return True
return False

@@ -141,12 +133,12 @@ def pretty_block(parent, level=1, indent=' '):
that contain only other block tags '''
if not parent.text or isspace(parent.text):
parent.text = ''
nn = '\n' if hasattr(parent.tag, 'strip') and barename(parent.tag) in {'tr', 'td', 'th'} else '\n\n'
nn = '\n' if hasattr(parent.tag, 'strip') and parse_utils.barename(parent.tag) in {'tr', 'td', 'th'} else '\n\n'
parent.text = parent.text + nn + (indent * level)
for i, child in enumerate(parent):
if isblock(child) and has_only_blocks(child):
pretty_block(child, level=level+1, indent=indent)
elif child.tag == SVG_TAG:
elif child.tag == const.SVG_SVG:
pretty_xml_tree(child, level=level, indent=indent)
l = level
if i == len(parent) - 1:
@@ -172,13 +164,13 @@ def pretty_html_tree(container, root):
child.tail = '\n\n'
if hasattr(child.tag, 'endswith') and child.tag.endswith('}head'):
pretty_xml_tree(child)
for body in root.findall('h:body', namespaces=XPNSMAP):
for body in root.findall('h:body', namespaces=const.XPNSMAP):
pretty_block(body)
# Special case the handling of a body that contains a single block tag
# with all content. In this case we prettify the containing block tag
# even if it has non block children.
if (len(body) == 1 and not callable(body[0].tag) and isblock(body[0]) and not has_only_blocks(
body[0]) and barename(body[0].tag) not in (
body[0]) and parse_utils.barename(body[0].tag) not in (
'pre', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6') and len(body[0]) > 0):
pretty_block(body[0], level=2)


@@ -1,7 +1,11 @@
import copy, os, re
import copy
import os
import re
import urllib.parse

from ebook_converter.ebooks.oeb.base import barename, XPNSMAP, XPath, OPF, XHTML, OEB_DOCS
from ebook_converter import constants as const
from ebook_converter.ebooks.oeb import base
from ebook_converter.ebooks.oeb import parse_utils
from ebook_converter.ebooks.oeb.polish.errors import MalformedMarkup
from ebook_converter.ebooks.oeb.polish.toc import node_from_loc
from ebook_converter.ebooks.oeb.polish.replace import LinkRebaser
@@ -35,7 +39,7 @@ def adjust_split_point(split_point, log):
parent = sp.getparent()
if (
parent is None or
barename(parent.tag) in {'body', 'html'} or
parse_utils.barename(parent.tag) in {'body', 'html'} or
(parent.text and parent.text.strip()) or
parent.index(sp) > 0
):
@@ -49,7 +53,7 @@ def adjust_split_point(split_point, log):


def get_body(root):
return root.find('h:body', namespaces=XPNSMAP)
return root.find('h:body', namespaces=const.XPNSMAP)


def do_split(split_point, log, before=True):
@@ -113,7 +117,7 @@ def do_split(split_point, log, before=True):
nix_element(elem)

# Tree 2
ancestors = frozenset(XPath('ancestor::*')(split_point2))
ancestors = frozenset(base.XPath('ancestor::*')(split_point2))
for elem in tuple(body2.iterdescendants()):
if elem is split_point2:
if not before:
@@ -251,7 +255,7 @@ def split(container, name, loc_or_xpath, before=True, totals=None):
break
index = spine.index(spine_item) + 1

si = spine.makeelement(OPF('itemref'), idref=manifest_item.get('id'))
si = spine.makeelement(base.tag('opf', 'itemref'), idref=manifest_item.get('id'))
if not linear:
si.set('linear', 'no')
container.insert_into_xml(spine, si, index=index)
@@ -268,7 +272,7 @@ def multisplit(container, name, xpath, before=True):
:param before: If True the splits occur before the identified element otherwise after it.
'''
root = container.parsed(name)
nodes = root.xpath(xpath, namespaces=XPNSMAP)
nodes = root.xpath(xpath, namespaces=const.XPNSMAP)
if not nodes:
raise AbortError('The expression %s did not match any nodes' % xpath)
for split_point in nodes:
@@ -329,7 +333,7 @@ def all_anchors(root):


def all_stylesheets(container, name):
for link in XPath('//h:head/h:link[@href]')(container.parsed(name)):
for link in base.XPath('//h:head/h:link[@href]')(container.parsed(name)):
name = container.href_to_name(link.get('href'), name)
typ = link.get('type', 'text/css')
if typ == 'text/css':
@@ -358,14 +362,14 @@ def merge_html(container, names, master, insert_page_breaks=False):
root = p(master)

# Ensure master has a <head>
head = root.find('h:head', namespaces=XPNSMAP)
head = root.find('h:head', namespaces=const.XPNSMAP)
if head is None:
head = root.makeelement(XHTML('head'))
head = root.makeelement(base.tag('xhtml', 'head'))
container.insert_into_xml(root, head, 0)

seen_anchors = all_anchors(root)
seen_stylesheets = set(all_stylesheets(container, master))
master_body = p(master).findall('h:body', namespaces=XPNSMAP)[-1]
master_body = p(master).findall('h:body', namespaces=const.XPNSMAP)[-1]
master_base = os.path.dirname(master)
anchor_map = {n:{} for n in names if n != master}
first_anchor_map = {}
@@ -377,7 +381,7 @@ def merge_html(container, names, master, insert_page_breaks=False):
for sheet in all_stylesheets(container, name):
if sheet not in seen_stylesheets:
seen_stylesheets.add(sheet)
link = head.makeelement(XHTML('link'), rel='stylesheet', type='text/css', href=container.name_to_href(sheet, master))
link = head.makeelement(base.tag('xhtml', 'link'), rel='stylesheet', type='text/css', href=container.name_to_href(sheet, master))
container.insert_into_xml(head, link)

# Rebase links if master is in a different directory
@@ -386,7 +390,7 @@ def merge_html(container, names, master, insert_page_breaks=False):

root = p(name)
children = []
for body in p(name).findall('h:body', namespaces=XPNSMAP):
for body in p(name).findall('h:body', namespaces=const.XPNSMAP):
children.append(body.text if body.text and body.text.strip() else '\n\n')
children.extend(body)

@@ -396,7 +400,7 @@ def merge_html(container, names, master, insert_page_breaks=False):
break
if isinstance(first_child, (str, bytes)):
# body contained only text, no tags
first_child = body.makeelement(XHTML('p'))
first_child = body.makeelement(base.tag('xhtml', 'p'))
first_child.text, children[0] = children[0], first_child

amap = anchor_map[name]
@@ -424,7 +428,7 @@ def merge_html(container, names, master, insert_page_breaks=False):
amap[''] = first_child.get('id')

# Fix links that point to local changed anchors
for a in XPath('//h:a[starts-with(@href, "#")]')(root):
for a in base.XPath('//h:a[starts-with(@href, "#")]')(root):
q = a.get('href')[1:]
if q in amap:
a.set('href', '#' + amap[q])
@@ -472,10 +476,10 @@ def merge_css(container, names, master):
# Remove links to merged stylesheets in the html files, replacing with a
# link to the master sheet
for name, mt in container.mime_map.items():
if mt in OEB_DOCS:
if mt in base.OEB_DOCS:
removed = False
root = p(name)
for link in XPath('//h:link[@href]')(root):
for link in base.XPath('//h:link[@href]')(root):
q = container.href_to_name(link.get('href'), name)
if q in merged:
container.remove_from_xml(link)
@@ -483,9 +487,9 @@ def merge_css(container, names, master):
if removed:
container.dirty(name)
if removed and master not in set(all_stylesheets(container, name)):
head = root.find('h:head', namespaces=XPNSMAP)
head = root.find('h:head', namespaces=const.XPNSMAP)
if head is not None:
link = head.makeelement(XHTML('link'), type='text/css', rel='stylesheet', href=container.name_to_href(master, name))
link = head.makeelement(base.tag('xhtml', 'link'), type='text/css', rel='stylesheet', href=container.name_to_href(master, name))
container.insert_into_xml(head, link)


@@ -1,16 +1,16 @@
import re
from collections import Counter, OrderedDict
from functools import partial
from operator import itemgetter
import collections
import functools
import operator
import pkg_resources
import re
import urllib.parse

from lxml import etree
from lxml.builder import ElementMaker

from ebook_converter import __version__
from ebook_converter.ebooks.oeb.base import (
XPath, uuid_id, xml2text, NCX, NCX_NS, XML, XHTML, XHTML_NS, serialize, EPUB_NS, XML_NS, OEB_DOCS)
from ebook_converter import constants as const
from ebook_converter.ebooks.oeb import base
from ebook_converter.ebooks.oeb.polish.errors import MalformedMarkup
from ebook_converter.ebooks.oeb.polish.utils import guess_type, extract
from ebook_converter.ebooks.oeb.polish.opf import set_guide_item, get_book_language
@@ -18,10 +18,6 @@ from ebook_converter.ebooks.oeb.polish.pretty import pretty_html_tree
from ebook_converter.utils.localization import get_lang, canonicalize_lang, lang_as_iso639_1


__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

ns = etree.FunctionNamespace('calibre_xpath_extensions')
ns.prefix = 'calibre'
ns['lower-case'] = lambda c, x: x.lower() if hasattr(x, 'lower') else x
@@ -81,7 +77,8 @@ class TOC(object):
seen = set()
remove = []
for child in self:
key = child.title if only_text else (child.title, child.dest, (child.frag or None))
key = child.title if only_text else (child.title, child.dest,
(child.frag or None))
if key in seen:
remove.append(child)
else:
@@ -104,7 +101,7 @@ class TOC(object):

def get_lines(self, lvl=0):
frag = ('#'+self.frag) if self.frag else ''
ans = [('\t'*lvl) + 'TOC: %s --> %s%s'%(self.title, self.dest, frag)]
ans = [('\t'*lvl) + 'TOC: %s --> %s%s' % (self.title, self.dest, frag)]
for child in self:
ans.extend(child.get_lines(lvl+1))
return ans
@@ -113,10 +110,8 @@ class TOC(object):
return '\n'.join(self.get_lines())

def to_dict(self, node_counter=None):
ans = {
'title':self.title, 'dest':self.dest, 'frag':self.frag,
'children':[c.to_dict(node_counter) for c in self.children]
}
ans = {'title': self.title, 'dest': self.dest, 'frag': self.frag,
'children': [c.to_dict(node_counter) for c in self.children]}
if self.dest_exists is not None:
ans['dest_exists'] = self.dest_exists
if self.dest_error is not None:
@@ -131,7 +126,7 @@ class TOC(object):


def child_xpath(tag, name):
return tag.xpath('./*[calibre:lower-case(local-name()) = "%s"]'%name)
return tag.xpath('./*[calibre:lower-case(local-name()) = "%s"]' % name)


def add_from_navpoint(container, navpoint, parent, ncx_name):
@@ -142,7 +137,7 @@ def add_from_navpoint(container, navpoint, parent, ncx_name):
text = ''
for txt in child_xpath(nl, 'text'):
text += etree.tostring(txt, method='text',
encoding='unicode', with_tail=False)
encoding='unicode', with_tail=False)
content = child_xpath(navpoint, 'content')
if content:
content = content[0]
@@ -154,7 +149,8 @@ def add_from_navpoint(container, navpoint, parent, ncx_name):


def process_ncx_node(container, node, toc_parent, ncx_name):
for navpoint in node.xpath('./*[calibre:lower-case(local-name()) = "navpoint"]'):
for navpoint in node.xpath('./*[calibre:lower-case(local-name()) '
'= "navpoint"]'):
child = add_from_navpoint(container, navpoint, toc_parent, ncx_name)
if child is not None:
process_ncx_node(container, navpoint, child, ncx_name)
@@ -171,29 +167,38 @@ def parse_ncx(container, ncx_name):
if attr.endswith('lang'):
toc_root.lang = str(val)
break
for uid in root.xpath('//*[calibre:lower-case(local-name()) = "meta" and @name="dtb:uid"]/@content'):
for uid in root.xpath('//*[calibre:lower-case(local-name()) = "meta" and '
'@name="dtb:uid"]/@content'):
if uid:
toc_root.uid = str(uid)
break
for pl in root.xpath('//*[calibre:lower-case(local-name()) = "pagelist"]'):
for pt in pl.xpath('descendant::*[calibre:lower-case(local-name()) = "pagetarget"]'):
for pt in pl.xpath('descendant::*[calibre:lower-case(local-name()) = '
'"pagetarget"]'):
pagenum = pt.get('value')
if pagenum:
href = pt.xpath('descendant::*[calibre:lower-case(local-name()) = "content"]/@src')
href = pt.xpath('descendant::*[calibre:lower-case(local-name()'
') = "content"]/@src')
if href:
dest = container.href_to_name(href[0], base=ncx_name)
frag = urllib.parse.urlparse(href[0]).fragment or None
toc_root.page_list.append({'dest': dest, 'pagenum': pagenum, 'frag': frag})
toc_root.page_list.append({'dest': dest,
'pagenum': pagenum,
'frag': frag})
return toc_root


def add_from_li(container, li, parent, nav_name):
dest = frag = text = None
for x in li.iterchildren(XHTML('a'), XHTML('span')):
text = etree.tostring(x, method='text', encoding='unicode', with_tail=False).strip() or ' '.join(x.xpath('descendant-or-self::*/@title')).strip()
for x in li.iterchildren(base.tag('xhtml', 'a'),
base.tag('xhtml', 'span')):
text = (etree.tostring(x, method='text', encoding='unicode',
with_tail=False).strip() or
' '.join(x.xpath('descendant-or-self::*/@title')).strip())
href = x.get('href')
if href:
dest = nav_name if href.startswith('#') else container.href_to_name(href, base=nav_name)
dest = (nav_name if href.startswith('#') else
container.href_to_name(href, base=nav_name))
frag = urllib.parse.urlparse(href).fragment or None
break
return parent.add(text or None, dest or None, frag or None)
@@ -207,9 +212,9 @@ def first_child(parent, tagname):


def process_nav_node(container, node, toc_parent, nav_name):
for li in node.iterchildren(XHTML('li')):
for li in node.iterchildren(base.tag('xhtml', 'li')):
child = add_from_li(container, li, toc_parent, nav_name)
ol = first_child(li, XHTML('ol'))
ol = first_child(li, base.tag('xhtml', 'ol'))
if child is not None and ol is not None:
process_nav_node(container, ol, child, nav_name)

@@ -218,14 +223,16 @@ def parse_nav(container, nav_name):
root = container.parsed(nav_name)
toc_root = TOC()
toc_root.lang = toc_root.uid = None
et = '{%s}type' % EPUB_NS
for nav in root.iterdescendants(XHTML('nav')):
if nav.get(et) == 'toc':
ol = first_child(nav, XHTML('ol'))
xhtml = functools.partial(base.tag, 'xhtml')
for nav in root.iterdescendants(base.tag('xhtml', 'nav')):
if nav.get(base.tag('epub', 'type')) == 'toc':
ol = first_child(nav, base.tag('xhtml', 'ol'))
if ol is not None:
process_nav_node(container, ol, toc_root, nav_name)
for h in nav.iterchildren(*map(XHTML, 'h1 h2 h3 h4 h5 h6'.split())):
text = etree.tostring(h, method='text', encoding='unicode', with_tail=False) or h.get('title')
for h in nav.iterchildren(*map(xhtml,
'h1 h2 h3 h4 h5 h6'.split())):
text = etree.tostring(h, method='text', encoding='unicode',
with_tail=False) or h.get('title')
if text:
toc_root.toc_title = text
break
@@ -235,7 +242,7 @@ def parse_nav(container, nav_name):

def verify_toc_destinations(container, toc):
anchor_map = {}
anchor_xpath = XPath('//*/@id|//h:a/@name')
anchor_xpath = base.XPath('//*/@id|//h:a/@name')
for item in toc.iterdescendants():
name = item.dest
if not name:
@@ -284,7 +291,8 @@ def get_x_toc(container, find_toc, parse_toc, verify_destinations=True):
ans.lang = ans.uid = None
return ans
toc = find_toc(container)
ans = empty_toc() if toc is None or not container.has_name(toc) else parse_toc(container, toc)
ans = (empty_toc() if toc is None or not container.has_name(toc) else
parse_toc(container, toc))
ans.toc_file_name = toc if toc and container.has_name(toc) else None
if verify_destinations:
verify_toc_destinations(container, ans)
@@ -294,11 +302,14 @@ def get_x_toc(container, find_toc, parse_toc, verify_destinations=True):
def get_toc(container, verify_destinations=True):
ver = container.opf_version_parsed
if ver.major < 3:
return get_x_toc(container, find_existing_ncx_toc, parse_ncx, verify_destinations=verify_destinations)
return get_x_toc(container, find_existing_ncx_toc, parse_ncx,
verify_destinations=verify_destinations)
else:
ans = get_x_toc(container, find_existing_nav_toc, parse_nav, verify_destinations=verify_destinations)
ans = get_x_toc(container, find_existing_nav_toc, parse_nav,
verify_destinations=verify_destinations)
if len(ans) == 0:
ans = get_x_toc(container, find_existing_ncx_toc, parse_ncx, verify_destinations=verify_destinations)
ans = get_x_toc(container, find_existing_ncx_toc, parse_ncx,
verify_destinations=verify_destinations)
return ans


@@ -308,25 +319,33 @@ def get_guide_landmarks(container):
href, frag = href.partition('#')[::2]
name = container.href_to_name(href, container.opf_name)
if container.has_name(name):
yield {'dest':name, 'frag':frag, 'title':title or '', 'type':rtype or ''}
yield {'dest': name,
'frag': frag,
'title': title or '',
'type': rtype or ''}


def get_nav_landmarks(container):
nav = find_existing_nav_toc(container)
if nav and container.has_name(nav):
root = container.parsed(nav)
et = '{%s}type' % EPUB_NS
for elem in root.iterdescendants(XHTML('nav')):
et = base.tag('epub', 'type')
for elem in root.iterdescendants(base.tag('xhtml', 'nav')):
if elem.get(et) == 'landmarks':
for li in elem.iterdescendants(XHTML('li')):
for a in li.iterdescendants(XHTML('a')):
for li in elem.iterdescendants(base.tag('xhtml', 'li')):
for a in li.iterdescendants(base.tag('xhtml', 'a')):
href, rtype = a.get('href'), a.get(et)
if href:
title = etree.tostring(a, method='text', encoding='unicode', with_tail=False).strip()
title = etree.tostring(a, method='text',
encoding='unicode',
with_tail=False).strip()
href, frag = href.partition('#')[::2]
name = container.href_to_name(href, nav)
if container.has_name(name):
yield {'dest':name, 'frag':frag, 'title':title or '', 'type':rtype or ''}
yield {'dest': name,
'frag': frag,
'title': title or '',
'type': rtype or ''}
break


@@ -344,7 +363,7 @@ def ensure_id(elem, all_ids):
elem_id = elem.get('id')
if elem_id:
return False, elem_id
if elem.tag == XHTML('a'):
if elem.tag == base.tag('xhtml', 'a'):
anchor = elem.get('name', None)
if anchor:
elem.set('id', anchor)
@@ -361,7 +380,7 @@ def ensure_id(elem, all_ids):


def elem_to_toc_text(elem):
text = xml2text(elem).strip()
text = base.xml2text(elem).strip()
if not text:
text = elem.get('title', '')
if not text:
@@ -375,7 +394,7 @@ def elem_to_toc_text(elem):

def item_at_top(elem):
try:
body = XPath('//h:body')(elem.getroottree().getroot())[0]
body = base.XPath('//h:body')(elem.getroottree().getroot())[0]
except (TypeError, IndexError, KeyError, AttributeError):
return False
tree = body.getroottree()
@@ -387,7 +406,7 @@ def item_at_top(elem):
try:
if el.tag.endswith('}img') or (el.text and el.text.strip()):
return False
except:
except Exception:
return False
if not path.startswith(epath):
# Only check tail of non-parent elements
@@ -404,24 +423,26 @@ def from_xpaths(container, xpaths):
Table of Contents from the ``<h1>``, ``<h2>`` and ``<h3>`` tags.
'''
tocroot = TOC()
xpaths = [XPath(xp) for xp in xpaths]
xpaths = [base.XPath(xp) for xp in xpaths]

# Find those levels that have no elements in all spine items
maps = OrderedDict()
maps = collections.OrderedDict()
empty_levels = {i+1 for i, xp in enumerate(xpaths)}
for spinepath in container.spine_items:
name = container.abspath_to_name(spinepath)
root = container.parsed(name)
level_item_map = maps[name] = {i+1:frozenset(xp(root)) for i, xp in enumerate(xpaths)}
level_item_map = maps[name] = {i + 1: frozenset(xp(root))
for i, xp in enumerate(xpaths)}
for lvl, elems in level_item_map.items():
if elems:
empty_levels.discard(lvl)
# Remove empty levels from all level_maps
if empty_levels:
for name, lmap in tuple(maps.items()):
lmap = {lvl:items for lvl, items in lmap.items() if lvl not in empty_levels}
lmap = sorted(lmap.items(), key=itemgetter(0))
lmap = {i+1:items for i, (l, items) in enumerate(lmap)}
lmap = {lvl: items for lvl, items in lmap.items()
if lvl not in empty_levels}
lmap = sorted(lmap.items(), key=operator.itemgetter(0))
lmap = {i + 1: items for i, (l, items) in enumerate(lmap)}
maps[name] = lmap

node_level_map = {tocroot: 0}
@@ -434,13 +455,15 @@ def from_xpaths(container, xpaths):
if child is None:
return node
lvl = node_level_map[child]
return node if lvl > limit else child if lvl == limit else process_node(child)
return (node if lvl > limit else
child if lvl == limit else process_node(child))

return process_node(tocroot)

for name, level_item_map in maps.items():
root = container.parsed(name)
item_level_map = {e:i for i, elems in level_item_map.items() for e in elems}
item_level_map = {e: i for i, elems in level_item_map.items()
for e in elems}
item_dirtied = False
all_ids = set(root.xpath('//*/@id'))

@@ -470,7 +493,7 @@ def from_links(container):
Generate a Table of Contents from links in the book.
'''
toc = TOC()
link_path = XPath('//h:a[@href]')
link_path = base.XPath('//h:a[@href]')
seen_titles, seen_dests = set(), set()
for name, is_linear in container.spine_names:
root = container.parsed(name)
@@ -506,7 +529,7 @@ def find_text(node):
pat = re.compile(r'\s+')
for child in node:
if isinstance(child, etree._Element):
text = xml2text(child).strip()
text = base.xml2text(child).strip()
text = pat.sub(' ', text)
if len(text) < 1:
continue
@@ -526,7 +549,7 @@ def from_files(container):
for i, spinepath in enumerate(container.spine_items):
name = container.abspath_to_name(spinepath)
root = container.parsed(name)
body = XPath('//h:body')(root)
body = base.XPath('//h:body')(root)
if not body:
continue
text = find_text(body[0])
@@ -576,42 +599,46 @@ def add_id(container, name, loc, totals=None):

def create_ncx(toc, to_href, btitle, lang, uid):
lang = lang.replace('_', '-')
ncx = etree.Element(NCX('ncx'),
attrib={'version': '2005-1', XML('lang'): lang},
nsmap={None: NCX_NS})
head = etree.SubElement(ncx, NCX('head'))
etree.SubElement(head, NCX('meta'),
name='dtb:uid', content=str(uid))
etree.SubElement(head, NCX('meta'),
name='dtb:depth', content=str(toc.depth))
ncx = etree.Element(base.tag('ncx', 'ncx'),
attrib={'version': '2005-1',
base.tag('xml', 'lang'): lang},
nsmap={None: const.NCX_NS})
head = etree.SubElement(ncx, base.tag('ncx', 'head'))
etree.SubElement(head, base.tag('ncx', 'meta'),
name='dtb:uid', content=str(uid))
etree.SubElement(head, base.tag('ncx', 'meta'),
name='dtb:depth', content=str(toc.depth))
generator = ''.join(['calibre (', __version__, ')'])
etree.SubElement(head, NCX('meta'),
name='dtb:generator', content=generator)
etree.SubElement(head, NCX('meta'), name='dtb:totalPageCount', content='0')
etree.SubElement(head, NCX('meta'), name='dtb:maxPageNumber', content='0')
title = etree.SubElement(ncx, NCX('docTitle'))
text = etree.SubElement(title, NCX('text'))
etree.SubElement(head, base.tag('ncx', 'meta'),
name='dtb:generator', content=generator)
etree.SubElement(head, base.tag('ncx', 'meta'), name='dtb:totalPageCount',
content='0')
etree.SubElement(head, base.tag('ncx', 'meta'), name='dtb:maxPageNumber',
content='0')
title = etree.SubElement(ncx, base.tag('ncx', 'docTitle'))
text = etree.SubElement(title, base.tag('ncx', 'text'))
text.text = btitle
navmap = etree.SubElement(ncx, NCX('navMap'))
navmap = etree.SubElement(ncx, base.tag('ncx', 'navMap'))
spat = re.compile(r'\s+')

play_order = Counter()
play_order = collections.Counter()

def process_node(xml_parent, toc_parent):
for child in toc_parent:
play_order['c'] += 1
point = etree.SubElement(xml_parent, NCX('navPoint'), id='num_%d' % play_order['c'],
playOrder=str(play_order['c']))
label = etree.SubElement(point, NCX('navLabel'))
point = etree.SubElement(xml_parent, base.tag('ncx', 'navPoint'),
id='num_%d' % play_order['c'],
playOrder=str(play_order['c']))
label = etree.SubElement(point, base.tag('ncx', 'navLabel'))
title = child.title
if title:
title = spat.sub(' ', title)
etree.SubElement(label, NCX('text')).text = title
etree.SubElement(label, base.tag('ncx', 'text')).text = title
if child.dest:
href = to_href(child.dest)
if child.frag:
href += '#'+child.frag
etree.SubElement(point, NCX('content'), src=href)
etree.SubElement(point, base.tag('ncx', 'content'), src=href)
process_node(point, child)

process_node(navmap, toc)
@@ -622,41 +649,43 @@ def commit_ncx_toc(container, toc, lang=None, uid=None):
tocname = find_existing_ncx_toc(container)
if tocname is None:
item = container.generate_item('toc.ncx', id_prefix='toc')
tocname = container.href_to_name(item.get('href'), base=container.opf_name)
tocname = container.href_to_name(item.get('href'),
base=container.opf_name)
ncx_id = item.get('id')
[s.set('toc', ncx_id) for s in container.opf_xpath('//opf:spine')]
if not lang:
lang = get_lang()
for l in container.opf_xpath('//dc:language'):
l = canonicalize_lang(xml2text(l).strip())
if l:
lang = l
lang = lang_as_iso639_1(l) or l
for _l in container.opf_xpath('//dc:language'):
_l = canonicalize_lang(base.xml2text(_l).strip())
if _l:
lang = _l
lang = lang_as_iso639_1(_l) or _l
break
lang = lang_as_iso639_1(lang) or lang
if not uid:
uid = uuid_id()
uid = base.uuid_id()
eid = container.opf.get('unique-identifier', None)
if eid:
m = container.opf_xpath('//*[@id="%s"]'%eid)
m = container.opf_xpath('//*[@id="%s"]' % eid)
if m:
uid = xml2text(m[0])
uid = base.xml2text(m[0])

title = 'Table of Contents'
m = container.opf_xpath('//dc:title')
if m:
x = xml2text(m[0]).strip()
x = base.xml2text(m[0]).strip()
title = x or title

to_href = partial(container.name_to_href, base=tocname)
to_href = functools.partial(container.name_to_href, base=tocname)
root = create_ncx(toc, to_href, title, lang, uid)
container.replace(tocname, root)
container.pretty_print.add(tocname)


def ensure_single_nav_of_type(root, ntype='toc'):
et = '{%s}type' % EPUB_NS
navs = [n for n in root.iterdescendants(XHTML('nav')) if n.get(et) == ntype]
et = base.tag('epub', 'type')
navs = [n for n in root.iterdescendants(base.tag('xhtml', 'nav'))
if n.get(et) == ntype]
for x in navs[1:]:
extract(x)
if navs:
@@ -667,13 +696,14 @@ def ensure_single_nav_of_type(root, ntype='toc'):
nav.attrib.update(attrib)
nav.tail = tail
else:
nav = root.makeelement(XHTML('nav'))
first_child(root, XHTML('body')).append(nav)
nav.set('{%s}type' % EPUB_NS, ntype)
nav = root.makeelement(base.tag('xhtml', 'nav'))
first_child(root, base.tag('xhtml', 'body')).append(nav)
nav.set(et, ntype)
return nav


def commit_nav_toc(container, toc, lang=None, landmarks=None, previous_nav=None):
def commit_nav_toc(container, toc, lang=None, landmarks=None,
previous_nav=None):
from ebook_converter.ebooks.oeb.polish.pretty import pretty_xml_tree
tocname = find_existing_nav_toc(container)
if previous_nav is not None:
@@ -684,7 +714,8 @@ def commit_nav_toc(container, toc, lang=None, landmarks=None, previous_nav=None)
if tocname is None:
item = container.generate_item('nav.xhtml', id_prefix='nav')
item.set('properties', 'nav')
tocname = container.href_to_name(item.get('href'), base=container.opf_name)
tocname = container.href_to_name(item.get('href'),
base=container.opf_name)
if previous_nav is not None:
root = previous_nav[1]
else:
@@ -698,24 +729,25 @@ def commit_nav_toc(container, toc, lang=None, landmarks=None, previous_nav=None)
if lang:
lang = lang_as_iso639_1(lang) or lang
root.set('lang', lang)
root.set('{%s}lang' % XML_NS, lang)
root.set(base.tag('xml', 'lang'), lang)
nav = ensure_single_nav_of_type(root, 'toc')
if toc.toc_title:
nav.append(nav.makeelement(XHTML('h1')))
nav.append(nav.makeelement(base.tag('xhtml', 'h1')))
nav[-1].text = toc.toc_title

rnode = nav.makeelement(XHTML('ol'))
rnode = nav.makeelement(base.tag('xhtml', 'ol'))
nav.append(rnode)
to_href = partial(container.name_to_href, base=tocname)
to_href = functools.partial(container.name_to_href, base=tocname)
spat = re.compile(r'\s+')

def process_node(xml_parent, toc_parent):
for child in toc_parent:
li = xml_parent.makeelement(XHTML('li'))
li = xml_parent.makeelement(base.tag('xhtml', 'li'))
xml_parent.append(li)
title = child.title or ''
title = spat.sub(' ', title).strip()
a = li.makeelement(XHTML('a' if child.dest else 'span'))
a = li.makeelement(base.tag('xhtml', 'a'
if child.dest else 'span'))
a.text = title
li.append(a)
if child.dest:
@@ -724,14 +756,14 @@ def commit_nav_toc(container, toc, lang=None, landmarks=None, previous_nav=None)
href += '#'+child.frag
a.set('href', href)
if len(child):
ol = li.makeelement(XHTML('ol'))
ol = li.makeelement(base.tag('xhtml', 'ol'))
li.append(ol)
process_node(ol, child)
process_node(rnode, toc)
pretty_xml_tree(nav)

def collapse_li(parent):
for li in parent.iterdescendants(XHTML('li')):
for li in parent.iterdescendants(base.tag('xhtml', 'li')):
if len(li) == 1:
li.text = None
li[0].tail = None
@@ -739,9 +771,9 @@ def commit_nav_toc(container, toc, lang=None, landmarks=None, previous_nav=None)
nav.tail = '\n'

def create_li(ol, entry):
li = ol.makeelement(XHTML('li'))
li = ol.makeelement(base.tag('xhtml', 'li'))
ol.append(li)
a = li.makeelement(XHTML('a'))
a = li.makeelement(base.tag('xhtml', 'a'))
li.append(a)
href = container.name_to_href(entry['dest'], tocname)
if entry['frag']:
@@ -752,12 +784,13 @@ def commit_nav_toc(container, toc, lang=None, landmarks=None, previous_nav=None)
if landmarks is not None:
nav = ensure_single_nav_of_type(root, 'landmarks')
nav.set('hidden', '')
ol = nav.makeelement(XHTML('ol'))
ol = nav.makeelement(base.tag('xhtml', 'ol'))
nav.append(ol)
for entry in landmarks:
if entry['type'] and container.has_name(entry['dest']) and container.mime_map[entry['dest']] in OEB_DOCS:
if (entry['type'] and container.has_name(entry['dest']) and
container.mime_map[entry['dest']] in base.OEB_DOCS):
a = create_li(ol, entry)
a.set('{%s}type' % EPUB_NS, entry['type'])
a.set(base.tag('epub', 'type'), entry['type'])
a.text = entry['title'] or None
pretty_xml_tree(nav)
collapse_li(nav)
@@ -765,10 +798,11 @@ def commit_nav_toc(container, toc, lang=None, landmarks=None, previous_nav=None)
if toc.page_list:
nav = ensure_single_nav_of_type(root, 'page-list')
nav.set('hidden', '')
ol = nav.makeelement(XHTML('ol'))
ol = nav.makeelement(base.tag('xhtml', 'ol'))
nav.append(ol)
for entry in toc.page_list:
if container.has_name(entry['dest']) and container.mime_map[entry['dest']] in OEB_DOCS:
if (container.has_name(entry['dest']) and
container.mime_map[entry['dest']] in base.OEB_DOCS):
a = create_li(ol, entry)
a.text = str(entry['pagenum'])
pretty_xml_tree(nav)
@@ -785,11 +819,12 @@ def commit_toc(container, toc, lang=None, uid=None):
def remove_names_from_toc(container, names):
changed = []
names = frozenset(names)
for find_toc, parse_toc, commit_toc in (
(find_existing_ncx_toc, parse_ncx, commit_ncx_toc),
(find_existing_nav_toc, parse_nav, commit_nav_toc),
):
toc = get_x_toc(container, find_toc, parse_toc, verify_destinations=False)
for find_toc, parse_toc, commit_toc in ((find_existing_ncx_toc,
parse_ncx, commit_ncx_toc),
(find_existing_nav_toc,
parse_nav, commit_nav_toc)):
toc = get_x_toc(container, find_toc, parse_toc,
verify_destinations=False)
if len(toc) > 0:
remove = []
for node in toc.iterdescendants():
@@ -805,15 +840,16 @@ def remove_names_from_toc(container, names):

def find_inline_toc(container):
for name, linear in container.spine_names:
if container.parsed(name).xpath('//*[local-name()="body" and @id="calibre_generated_inline_toc"]'):
if container.parsed(name).xpath('//*[local-name()="body" and @id='
'"calibre_generated_inline_toc"]'):
return name


def toc_to_html(toc, container, toc_name, title, lang=None):

def process_node(html_parent, toc, level=1, indent=' ', style_level=2):
li = html_parent.makeelement(XHTML('li'))
li.tail = '\n'+ (indent*level)
li = html_parent.makeelement(base.tag('xhtml', 'li'))
li.tail = '\n' + (indent * level)
html_parent.append(li)
name, frag = toc.dest, toc.frag
href = '#'
@@ -821,32 +857,29 @@ def toc_to_html(toc, container, toc_name, title, lang=None):
href = container.name_to_href(name, toc_name)
if frag:
href += '#' + frag
a = li.makeelement(XHTML('a'), href=href)
a = li.makeelement(base.tag('xhtml', 'a'), href=href)
|
||||
a.text = toc.title
|
||||
li.append(a)
|
||||
if len(toc) > 0:
|
||||
parent = li.makeelement(XHTML('ul'))
|
||||
parent = li.makeelement(base.tag('xhtml', 'ul'))
|
||||
parent.set('class', 'level%d' % (style_level))
|
||||
li.append(parent)
|
||||
a.tail = '\n\n' + (indent*(level+2))
|
||||
parent.text = '\n'+(indent*(level+3))
|
||||
parent.tail = '\n\n' + (indent*(level+1))
|
||||
for child in toc:
|
||||
process_node(parent, child, level+3, style_level=style_level + 1)
|
||||
process_node(parent, child, level+3,
|
||||
style_level=style_level + 1)
|
||||
parent[-1].tail = '\n' + (indent*(level+2))
|
||||
|
||||
E = ElementMaker(namespace=XHTML_NS, nsmap={None:XHTML_NS})
|
||||
html = E.html(
|
||||
E.head(
|
||||
E.title(title),
|
||||
E.style(P('templates/inline_toc_styles.css', data=True), type='text/css'),
|
||||
),
|
||||
E.body(
|
||||
E.h2(title),
|
||||
E.ul(),
|
||||
id="calibre_generated_inline_toc",
|
||||
)
|
||||
)
|
||||
E = ElementMaker(namespace=const.XHTML_NS, nsmap={None: const.XHTML_NS})
|
||||
# TODO(gryf): revisit lack of css.
|
||||
css_f = pkg_resources.resource_filename('ebook_converter',
|
||||
'data/inline_toc_styles.css')
|
||||
html = E.html(E.head(E.title(title),
|
||||
E.style(css_f, type='text/css')),
|
||||
E.body(E.h2(title), E.ul(),
|
||||
id="calibre_generated_inline_toc"))
|
||||
|
||||
ul = html[1][1]
|
||||
ul.set('class', 'level1')
|
||||
@@ -859,11 +892,12 @@ def toc_to_html(toc, container, toc_name, title, lang=None):
|
||||
|
||||
|
||||
def create_inline_toc(container, title=None):
|
||||
'''
|
||||
Create an inline (HTML) Table of Contents from an existing NCX Table of Contents.
|
||||
"""
|
||||
Create an inline (HTML) Table of Contents from an existing NCX Table of
|
||||
Contents.
|
||||
|
||||
:param title: The title for this table of contents.
|
||||
'''
|
||||
"""
|
||||
lang = get_book_language(container)
|
||||
default_title = 'Table of Contents'
|
||||
title = title or default_title
|
||||
@@ -874,7 +908,7 @@ def create_inline_toc(container, title=None):
|
||||
|
||||
name = toc_name
|
||||
html = toc_to_html(toc, container, name, title, lang)
|
||||
raw = serialize(html, 'text/html')
|
||||
raw = base.serialize(html, 'text/html')
|
||||
if name is None:
|
||||
name, c = 'toc.xhtml', 0
|
||||
while container.has_name(name):
|
||||
@@ -884,5 +918,6 @@ def create_inline_toc(container, title=None):
|
||||
else:
|
||||
with container.open(name, 'wb') as f:
|
||||
f.write(raw)
|
||||
set_guide_item(container, 'toc', title, name, frag='calibre_generated_inline_toc')
|
||||
set_guide_item(container, 'toc', title, name,
|
||||
frag='calibre_generated_inline_toc')
|
||||
return name
|
||||
|
||||
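The recurring edit in this file replaces the XHTML()/EPUB_NS wrappers with a shared base.tag() helper. As a rough sketch of what such a helper amounts to (the alias table below is an assumption for illustration, not something this diff shows; only the URIs themselves are the standard ones):

# Hedged sketch of a tag() helper: expand a namespace alias plus a local
# name into lxml's Clark notation. The dict is illustrative; the real
# constants module may organize these URIs differently.
NAMESPACES = {
    'xhtml': 'http://www.w3.org/1999/xhtml',
    'xml': 'http://www.w3.org/XML/1998/namespace',
    'epub': 'http://www.idpf.org/2007/ops',
    'opf': 'http://www.idpf.org/2007/opf',
}


def tag(alias, name):
    """Return '{namespace-uri}name', the form lxml expects for element tags."""
    return '{%s}%s' % (NAMESPACES[alias], name)


# tag('xhtml', 'nav') -> '{http://www.w3.org/1999/xhtml}nav', so a call like
# nav.makeelement(base.tag('xhtml', 'ol')) builds a properly namespaced <ol>.
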
@@ -1,21 +1,21 @@
"""
Container-/OPF-based input OEBBook reader.
"""
import sys, os, uuid, copy, re, io
from collections import defaultdict
import collections
import copy
import io
import os
import re
import sys
import urllib.parse
import uuid

from lxml import etree

from ebook_converter.ebooks.oeb.base import OPF1_NS, OPF2_NS, OPF2_NSMAP, DC11_NS, \
    DC_NSES, OPF, xml2text, XHTML_MIME
from ebook_converter.ebooks.oeb.base import OEB_DOCS, OEB_STYLES, OEB_IMAGES, \
    PAGE_MAP_MIME, JPEG_MIME, NCX_MIME, SVG_MIME
from ebook_converter.ebooks.oeb.base import XMLDECL_RE, COLLAPSE_RE, \
    MS_COVER_TYPE, iterlinks
from ebook_converter.ebooks.oeb.base import namespace, barename, XPath, xpath, \
    urlnormalize, BINARY_MIME, \
    OEBError, OEBBook, DirContainer
from ebook_converter import constants as const
from ebook_converter.ebooks.oeb import base
from ebook_converter.ebooks.oeb import parse_utils
from ebook_converter.ebooks.metadata import opf2 as opf_meta
from ebook_converter.ebooks.oeb.writer import OEBWriter
from ebook_converter.utils.xml_parse import safe_xml_fromstring
from ebook_converter.utils.cleantext import clean_xml_chars
@@ -26,18 +26,13 @@ from ebook_converter import guess_type, xml_replace_entities
from ebook_converter.polyglot.urllib import unquote


__all__ = ['OEBReader']
__license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'


class OEBReader(object):
    """Read an OEBPS 1.x or OPF/OPS 2.0 file collection."""

    COVER_SVG_XP = XPath('h:body//svg:svg[position() = 1]')
    COVER_OBJECT_XP = XPath('h:body//h:object[@data][position() = 1]')
    COVER_SVG_XP = base.XPath('h:body//svg:svg[position() = 1]')
    COVER_OBJECT_XP = base.XPath('h:body//h:object[@data][position() = 1]')

    Container = DirContainer
    Container = base.DirContainer
    """Container type used to access book files. Override in sub-classes."""

    DEFAULT_PROFILE = 'PRS505'
@@ -75,61 +70,67 @@ class OEBReader(object):
        for elem in opf.iter(tag=etree.Element):
            nsmap.update(elem.nsmap)
        for elem in opf.iter(tag=etree.Element):
            if namespace(elem.tag) in ('', OPF1_NS) and ':' not in barename(elem.tag):
                elem.tag = OPF(barename(elem.tag))
        nsmap.update(OPF2_NSMAP)
            if (parse_utils.namespace(elem.tag) in ('', const.OPF1_NS) and
                    ':' not in parse_utils.barename(elem.tag)):
                elem.tag = base.tag('opf', parse_utils.barename(elem.tag))
        nsmap.update(const.OPF2_NSMAP)
        attrib = dict(opf.attrib)
        nroot = etree.Element(OPF('package'),
                              nsmap={None: OPF2_NS}, attrib=attrib)
        metadata = etree.SubElement(nroot, OPF('metadata'), nsmap=nsmap)
        ignored = (OPF('dc-metadata'), OPF('x-metadata'))
        for elem in xpath(opf, 'o2:metadata//*'):
        nroot = etree.Element(base.tag('opf', 'package'),
                              nsmap={None: const.OPF2_NS}, attrib=attrib)
        metadata = etree.SubElement(nroot, base.tag('opf', 'metadata'),
                                    nsmap=nsmap)
        ignored = (base.tag('opf', 'dc-metadata'), base.tag('opf', 'x-metadata'))
        for elem in base.xpath(opf, 'o2:metadata//*'):
            if elem.tag in ignored:
                continue
            if namespace(elem.tag) in DC_NSES:
                tag = barename(elem.tag).lower()
                elem.tag = '{%s}%s' % (DC11_NS, tag)
            if parse_utils.namespace(elem.tag) in const.DC_NSES:
                tag = parse_utils.barename(elem.tag).lower()
                elem.tag = '{%s}%s' % (const.DC11_NS, tag)
            if elem.tag.startswith('dc:'):
                tag = elem.tag.partition(':')[-1].lower()
                elem.tag = '{%s}%s' % (DC11_NS, tag)
                elem.tag = '{%s}%s' % (const.DC11_NS, tag)
            metadata.append(elem)
        for element in xpath(opf, 'o2:metadata//o2:meta'):
        for element in base.xpath(opf, 'o2:metadata//o2:meta'):
            metadata.append(element)
        for tag in ('o2:manifest', 'o2:spine', 'o2:tours', 'o2:guide'):
            for element in xpath(opf, tag):
            for element in base.xpath(opf, tag):
                nroot.append(element)
        return nroot

    def _read_opf(self):
        data = self.oeb.container.read(None)
        data = self.oeb.decode(data)
        data = XMLDECL_RE.sub('', data)
        data = base.XMLDECL_RE.sub('', data)
        data = re.sub(r'http://openebook.org/namespaces/oeb-package/1.0(/*)',
                      OPF1_NS, data)
                      const.OPF1_NS, data)
        try:
            opf = safe_xml_fromstring(data)
            opf = etree.fromstring(data)
        except etree.XMLSyntaxError:
            data = xml_replace_entities(clean_xml_chars(data), encoding=None)
            try:
                opf = safe_xml_fromstring(data)
                opf = etree.fromstring(data)
                self.logger.warn('OPF contains invalid HTML named entities')
            except etree.XMLSyntaxError:
                data = re.sub(r'(?is)<tours>.+</tours>', '', data)
                data = data.replace('<dc-metadata>',
                                    '<dc-metadata xmlns:dc="http://purl.org/metadata/dublin_core">')
                opf = safe_xml_fromstring(data)
                                    '<dc-metadata xmlns:dc="'
                                    'http://purl.org/metadata/dublin_core">')
                opf = etree.fromstring(data)
                self.logger.warn('OPF contains invalid tours section')

        ns = namespace(opf.tag)
        if ns not in ('', OPF1_NS, OPF2_NS):
            raise OEBError('Invalid namespace %r for OPF document' % ns)
        ns = parse_utils.namespace(opf.tag)
        if ns not in ('', const.OPF1_NS, const.OPF2_NS):
            raise base.OEBError('Invalid namespace %r for OPF document' % ns)
        opf = self._clean_opf(opf)
        return opf

    def _metadata_from_opf(self, opf):
        from ebook_converter.ebooks.metadata.opf2 import OPF
        from ebook_converter.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
        stream = io.BytesIO(etree.tostring(opf, xml_declaration=True, encoding='utf-8'))
        from ebook_converter.ebooks.oeb.transforms.metadata import \
            meta_info_to_oeb_metadata
        stream = io.BytesIO(etree.tostring(opf, xml_declaration=True,
                                           encoding='utf-8'))
        # o = opf_meta.OPF(stream)
        o = OPF(stream)
        pwm = o.primary_writing_mode
        if pwm:
@@ -139,8 +140,8 @@ class OEBReader(object):
            mi.language = get_lang().replace('_', '-')
            self.oeb.metadata.add('language', mi.language)
        if not mi.book_producer:
            mi.book_producer = '%(a)s (%(v)s) [http://%(a)s-ebook.com]'%\
                dict(a=__appname__, v=__version__)
            mi.book_producer = ('%(a)s (%(v)s) [http://%(a)s-ebook.com]' %
                                dict(a=__appname__, v=__version__))
        meta_info_to_oeb_metadata(mi, self.oeb.metadata, self.logger)
        m = self.oeb.metadata
        m.add('identifier', str(uuid.uuid4()), id='uuid_id', scheme='uuid')
@@ -162,16 +163,16 @@ class OEBReader(object):
        data.
        '''
        bad = []
        check = OEB_DOCS.union(OEB_STYLES)
        check = base.OEB_DOCS.union(base.OEB_STYLES)
        for item in list(self.oeb.manifest.values()):
            if item.media_type in check:
                try:
                    item.data
                except KeyboardInterrupt:
                    raise
                except:
                    self.logger.exception('Failed to parse content in %s'%
                                          item.href)
                except Exception:
                    self.logger.exception('Failed to parse content in %s' %
                                          item.href)
                    bad.append(item)
                    self.oeb.manifest.remove(item)
        return bad
@@ -181,25 +182,28 @@ class OEBReader(object):
        manifest = self.oeb.manifest
        known = set(manifest.hrefs)
        unchecked = set(manifest.values())
        cdoc = OEB_DOCS|OEB_STYLES
        cdoc = base.OEB_DOCS | base.OEB_STYLES
        invalid = set()
        while unchecked:
            new = set()
            for item in unchecked:
                data = None
                if (item.media_type in cdoc or item.media_type[-4:] in ('/xml', '+xml')):
                if (item.media_type in cdoc or
                        item.media_type[-4:] in ('/xml', '+xml')):
                    try:
                        data = item.data
                    except:
                    except Exception:
                        self.oeb.log.exception('Failed to read from manifest '
                                               'entry with id: %s, ignoring'%item.id)
                                               'entry with id: %s, ignoring' %
                                               item.id)
                        invalid.add(item)
                        continue
                if data is None:
                    continue

                if (item.media_type in OEB_DOCS or item.media_type[-4:] in ('/xml', '+xml')):
                    hrefs = [r[2] for r in iterlinks(data)]
                if (item.media_type in base.OEB_DOCS or
                        item.media_type[-4:] in ('/xml', '+xml')):
                    hrefs = [r[2] for r in base.iterlinks(data)]
                    for href in hrefs:
                        if isinstance(href, bytes):
                            href = href.decode('utf-8')
@@ -207,22 +211,22 @@ class OEBReader(object):
                        if not href:
                            continue
                        try:
                            href = item.abshref(urlnormalize(href))
                            href = item.abshref(base.urlnormalize(href))
                            scheme = urllib.parse.urlparse(href).scheme
                        except:
                            self.oeb.log.exception(
                                'Skipping invalid href: %r'%href)
                        except Exception:
                            self.oeb.log.exception('Skipping invalid href: '
                                                   '%r' % href)
                            continue
                        if not scheme and href not in known:
                            new.add(href)
                elif item.media_type in OEB_STYLES:
                elif item.media_type in base.OEB_STYLES:
                    try:
                        urls = list(css_parser.getUrls(data))
                    except:
                    except Exception:
                        urls = []
                    for url in urls:
                        href, _ = urllib.parse.urldefrag(url)
                        href = item.abshref(urlnormalize(href))
                        href = item.abshref(base.urlnormalize(href))
                        scheme = urllib.parse.urlparse(href).scheme
                        if not scheme and href not in known:
                            new.add(href)
@@ -232,7 +236,7 @@ class OEBReader(object):
                known.add(href)
                is_invalid = False
                for item in invalid:
                    if href == item.abshref(urlnormalize(href)):
                    if href == item.abshref(base.urlnormalize(href)):
                        is_invalid = True
                        break
                if is_invalid:
@@ -243,11 +247,12 @@ class OEBReader(object):
                    warned.add(href)
                    continue
                if href not in warned:
                    self.logger.warn('Referenced file %r not in manifest' % href)
                    self.logger.warn('Referenced file %r not in manifest' %
                                     href)
                    warned.add(href)
                id, _ = manifest.generate(id='added')
                guessed = guess_type(href)[0]
                media_type = guessed or BINARY_MIME
                media_type = guessed or base.BINARY_MIME
                added = manifest.add(id, href, media_type)
                unchecked.add(added)

@@ -256,7 +261,7 @@ class OEBReader(object):

    def _manifest_from_opf(self, opf):
        manifest = self.oeb.manifest
        for elem in xpath(opf, '/o2:package/o2:manifest/o2:item'):
        for elem in base.xpath(opf, '/o2:package/o2:manifest/o2:item'):
            id = elem.get('id')
            href = elem.get('href')
            media_type = elem.get('media-type', None)
@@ -264,7 +269,7 @@ class OEBReader(object):
                media_type = elem.get('mediatype', None)
            if not media_type or media_type == 'text/xml':
                guessed = guess_type(href)[0]
                media_type = guessed or media_type or BINARY_MIME
                media_type = guessed or media_type or base.BINARY_MIME
            if hasattr(media_type, 'lower'):
                media_type = media_type.lower()
            fallback = elem.get('fallback')
@@ -285,12 +290,12 @@ class OEBReader(object):
        manifest = self.oeb.manifest
        spine = self.oeb.spine
        unchecked = set(spine)
        selector = XPath('h:body//h:a/@href')
        selector = base.XPath('h:body//h:a/@href')
        extras = set()
        while unchecked:
            new = set()
            for item in unchecked:
                if item.media_type not in OEB_DOCS:
                if item.media_type not in base.OEB_DOCS:
                    # TODO: handle fallback chains
                    continue
                for href in selector(item.data):
@@ -298,20 +303,21 @@ class OEBReader(object):
                    if not href:
                        continue
                    try:
                        href = item.abshref(urlnormalize(href))
                        href = item.abshref(base.urlnormalize(href))
                    except ValueError: # Malformed URL
                        continue
                    if href not in manifest.hrefs:
                        continue
                    found = manifest.hrefs[href]
                    if found.media_type not in OEB_DOCS or \
                    if found.media_type not in base.OEB_DOCS or \
                            found in spine or found in extras:
                        continue
                    new.add(found)
            extras.update(new)
            unchecked = new
        version = int(self.oeb.version[0])
        removed_items_to_ignore = getattr(self.oeb, 'removed_items_to_ignore', ())
        removed_items_to_ignore = getattr(self.oeb, 'removed_items_to_ignore',
                                          ())
        for item in sorted(extras):
            if item.href in removed_items_to_ignore:
                continue
@@ -323,34 +329,38 @@ class OEBReader(object):
    def _spine_from_opf(self, opf):
        spine = self.oeb.spine
        manifest = self.oeb.manifest
        for elem in xpath(opf, '/o2:package/o2:spine/o2:itemref'):
        for elem in base.xpath(opf, '/o2:package/o2:spine/o2:itemref'):
            idref = elem.get('idref')
            if idref not in manifest.ids:
                self.logger.warn('Spine item %r not found' % idref)
                continue
            item = manifest.ids[idref]
            if item.media_type.lower() in OEB_DOCS and hasattr(item.data, 'xpath') and not getattr(item.data, 'tag', '').endswith('}ncx'):
            if (item.media_type.lower() in base.OEB_DOCS and
                    hasattr(item.data, 'xpath') and not
                    getattr(item.data, 'tag', '').endswith('}ncx')):
                spine.add(item, elem.get('linear'))
            else:
                if hasattr(item.data, 'tag') and item.data.tag and item.data.tag.endswith('}html'):
                    item.media_type = XHTML_MIME
                if (hasattr(item.data, 'tag') and
                        item.data.tag and item.data.tag.endswith('}html')):
                    item.media_type = base.XHTML_MIME
                    spine.add(item, elem.get('linear'))
                else:
                    self.oeb.log.warn('The item %s is not a XML document.'
                                      ' Removing it from spine.'%item.href)
                                      ' Removing it from spine.' % item.href)
        if len(spine) == 0:
            raise OEBError("Spine is empty")
            raise base.OEBError("Spine is empty")
        self._spine_add_extra()
        for val in xpath(opf, '/o2:package/o2:spine/@page-progression-direction'):
        for val in base.xpath(opf,
                              '/o2:package/o2:spine/@page-progression-direction'):
            if val in {'ltr', 'rtl'}:
                spine.page_progression_direction = val

    def _guide_from_opf(self, opf):
        guide = self.oeb.guide
        manifest = self.oeb.manifest
        for elem in xpath(opf, '/o2:package/o2:guide/o2:reference'):
        for elem in base.xpath(opf, '/o2:package/o2:guide/o2:reference'):
            ref_href = elem.get('href')
            path = urlnormalize(urllib.parse.urldefrag(ref_href)[0])
            path = base.urlnormalize(urllib.parse.urldefrag(ref_href)[0])
            if path not in manifest.hrefs:
                corrected_href = None
                for href in manifest.hrefs:
@@ -366,7 +376,7 @@ class OEBReader(object):
            guide.add(typ, elem.get('title'), ref_href)

    def _find_ncx(self, opf):
        result = xpath(opf, '/o2:package/o2:spine/@toc')
        result = base.xpath(opf, '/o2:package/o2:spine/@toc')
        if result:
            id = result[0]
            if id not in self.oeb.manifest.ids:
@@ -375,30 +385,33 @@ class OEBReader(object):
            self.oeb.manifest.remove(item)
            return item
        for item in self.oeb.manifest.values():
            if item.media_type == NCX_MIME:
            if item.media_type == base.NCX_MIME:
                self.oeb.manifest.remove(item)
                return item
        return None

    def _toc_from_navpoint(self, item, toc, navpoint):
        children = xpath(navpoint, 'ncx:navPoint')
        children = base.xpath(navpoint, 'ncx:navPoint')
        for child in children:
            title = ''.join(xpath(child, 'ncx:navLabel/ncx:text/text()'))
            title = COLLAPSE_RE.sub(' ', title.strip())
            href = xpath(child, 'ncx:content/@src')
            title = ''.join(base.xpath(child, 'ncx:navLabel/ncx:text/text()'))
            title = base.COLLAPSE_RE.sub(' ', title.strip())
            href = base.xpath(child, 'ncx:content/@src')
            if not title:
                self._toc_from_navpoint(item, toc, child)
                continue
            if (not href or not href[0]) and not xpath(child, 'ncx:navPoint'):
            if (not href or not href[0]) and not base.xpath(child, 'ncx:navPoint'):
                # This node is useless
                continue
            href = item.abshref(urlnormalize(href[0])) if href and href[0] else ''
            if href and href[0]:
                href = item.abshref(base.urlnormalize(href[0]))
            else:
                href = ''
            path, _ = urllib.parse.urldefrag(href)
            if path and path not in self.oeb.manifest.hrefs:
                path = urlnormalize(path)
                path = base.urlnormalize(path)
            if href and path not in self.oeb.manifest.hrefs:
                self.logger.warn('TOC reference %r not found' % href)
                gc = xpath(child, 'ncx:navPoint')
                gc = base.xpath(child, 'ncx:navPoint')
                if not gc:
                    # This node is useless
                    continue
@@ -406,36 +419,40 @@ class OEBReader(object):
            klass = child.get('class', 'chapter')

            try:
                po = int(child.get('playOrder', self.oeb.toc.next_play_order()))
            except:
                po = int(child.get('playOrder',
                                   self.oeb.toc.next_play_order()))
            except Exception:
                po = self.oeb.toc.next_play_order()

            authorElement = xpath(child,
                'descendant::calibre:meta[@name = "author"]')
            authorElement = base.xpath(child,
                                       'descendant::calibre:meta[@name = "author"]')
            if authorElement:
                author = authorElement[0].text
            else:
                author = None

            descriptionElement = xpath(child,
                'descendant::calibre:meta[@name = "description"]')
            descriptionElement = base.xpath(child,
                                            'descendant::calibre:meta[@name = '
                                            '"description"]')
            if descriptionElement:
                description = etree.tostring(descriptionElement[0],
                                             method='text', encoding='unicode').strip()
                                             method='text',
                                             encoding='unicode').strip()
                if not description:
                    description = None
            else:
                description = None

            index_image = xpath(child,
                'descendant::calibre:meta[@name = "toc_thumbnail"]')
            index_image = base.xpath(child,
                                     'descendant::calibre:meta[@name = '
                                     '"toc_thumbnail"]')
            toc_thumbnail = (index_image[0].text if index_image else None)
            if not toc_thumbnail or not toc_thumbnail.strip():
                toc_thumbnail = None

            node = toc.add(title, href, id=id, klass=klass,
                           play_order=po, description=description, author=author,
                           toc_thumbnail=toc_thumbnail)
                           play_order=po, description=description,
                           author=author, toc_thumbnail=toc_thumbnail)

            self._toc_from_navpoint(item, node, child)

@@ -444,31 +461,31 @@ class OEBReader(object):
            return False
        self.log.debug('Reading TOC from NCX...')
        ncx = item.data
        title = ''.join(xpath(ncx, 'ncx:docTitle/ncx:text/text()'))
        title = COLLAPSE_RE.sub(' ', title.strip())
        title = ''.join(base.xpath(ncx, 'ncx:docTitle/ncx:text/text()'))
        title = base.COLLAPSE_RE.sub(' ', title.strip())
        title = title or str(self.oeb.metadata.title[0])
        toc = self.oeb.toc
        toc.title = title
        navmaps = xpath(ncx, 'ncx:navMap')
        navmaps = base.xpath(ncx, 'ncx:navMap')
        for navmap in navmaps:
            self._toc_from_navpoint(item, toc, navmap)
        return True

    def _toc_from_tour(self, opf):
        result = xpath(opf, 'o2:tours/o2:tour')
        result = base.xpath(opf, 'o2:tours/o2:tour')
        if not result:
            return False
        self.log.debug('Reading TOC from tour...')
        tour = result[0]
        toc = self.oeb.toc
        toc.title = tour.get('title')
        sites = xpath(tour, 'o2:site')
        sites = base.xpath(tour, 'o2:site')
        for site in sites:
            title = site.get('title')
            href = site.get('href')
            if not title or not href:
                continue
            path, _ = urllib.parse.urldefrag(urlnormalize(href))
            path, _ = urllib.parse.urldefrag(base.urlnormalize(href))
            if path not in self.oeb.manifest.hrefs:
                self.logger.warn('TOC reference %r not found' % href)
                continue
@@ -484,23 +501,23 @@ class OEBReader(object):
        item = self.oeb.manifest.hrefs[itempath]
        html = item.data
        if frag:
            elems = xpath(html, './/*[@id="%s"]' % frag)
            elems = base.xpath(html, './/*[@id="%s"]' % frag)
            if not elems:
                elems = xpath(html, './/*[@name="%s"]' % frag)
                elems = base.xpath(html, './/*[@name="%s"]' % frag)
            elem = elems[0] if elems else html
            while elem != html and not xpath(elem, './/h:a[@href]'):
            while elem != html and not base.xpath(elem, './/h:a[@href]'):
                elem = elem.getparent()
            html = elem
        titles = defaultdict(list)
        titles = collections.defaultdict(list)
        order = []
        for anchor in xpath(html, './/h:a[@href]'):
        for anchor in base.xpath(html, './/h:a[@href]'):
            href = anchor.attrib['href']
            href = item.abshref(urlnormalize(href))
            href = item.abshref(base.urlnormalize(href))
            path, frag = urllib.parse.urldefrag(href)
            if path not in self.oeb.manifest.hrefs:
                continue
            title = xml2text(anchor)
            title = COLLAPSE_RE.sub(' ', title.strip())
            title = base.xml2text(anchor)
            title = base.COLLAPSE_RE.sub(' ', title.strip())
            if href not in titles:
                order.append(href)
            titles[href].append(title)
@@ -518,15 +535,15 @@ class OEBReader(object):
            if not item.linear:
                continue
            html = item.data
            title = ''.join(xpath(html, '/h:html/h:head/h:title/text()'))
            title = COLLAPSE_RE.sub(' ', title.strip())
            title = ''.join(base.xpath(html, '/h:html/h:head/h:title/text()'))
            title = base.COLLAPSE_RE.sub(' ', title.strip())
            if title:
                titles.append(title)
            headers.append('(unlabled)')
            for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'):
                expr = '/h:html/h:body//h:%s[position()=1]/text()'
                header = ''.join(xpath(html, expr % tag))
                header = COLLAPSE_RE.sub(' ', header.strip())
                header = ''.join(base.xpath(html, expr % tag))
                header = base.COLLAPSE_RE.sub(' ', header.strip())
                if header:
                    headers[-1] = header
                    break
@@ -558,17 +575,17 @@ class OEBReader(object):
        ncx = item.data
        if ncx is None:
            return False
        ptargets = xpath(ncx, 'ncx:pageList/ncx:pageTarget')
        ptargets = base.xpath(ncx, 'ncx:pageList/ncx:pageTarget')
        if not ptargets:
            return False
        pages = self.oeb.pages
        for ptarget in ptargets:
            name = ''.join(xpath(ptarget, 'ncx:navLabel/ncx:text/text()'))
            name = COLLAPSE_RE.sub(' ', name.strip())
            href = xpath(ptarget, 'ncx:content/@src')
            name = ''.join(base.xpath(ptarget, 'ncx:navLabel/ncx:text/text()'))
            name = base.COLLAPSE_RE.sub(' ', name.strip())
            href = base.xpath(ptarget, 'ncx:content/@src')
            if not href:
                continue
            href = item.abshref(urlnormalize(href[0]))
            href = item.abshref(base.urlnormalize(href[0]))
            id = ptarget.get('id')
            type = ptarget.get('type', 'normal')
            klass = ptarget.get('class')
@@ -576,7 +593,7 @@ class OEBReader(object):
        return True

    def _find_page_map(self, opf):
        result = xpath(opf, '/o2:package/o2:spine/@page-map')
        result = base.xpath(opf, '/o2:package/o2:spine/@page-map')
        if result:
            id = result[0]
            if id not in self.oeb.manifest.ids:
@@ -585,7 +602,7 @@ class OEBReader(object):
            self.oeb.manifest.remove(item)
            return item
        for item in self.oeb.manifest.values():
            if item.media_type == PAGE_MAP_MIME:
            if item.media_type == base.PAGE_MAP_MIME:
                self.oeb.manifest.remove(item)
                return item
        return None
@@ -596,13 +613,13 @@ class OEBReader(object):
            return False
        pmap = item.data
        pages = self.oeb.pages
        for page in xpath(pmap, 'o2:page'):
        for page in base.xpath(pmap, 'o2:page'):
            name = page.get('name', '')
            href = page.get('href')
            if not href:
                continue
            name = COLLAPSE_RE.sub(' ', name.strip())
            href = item.abshref(urlnormalize(href))
            name = base.COLLAPSE_RE.sub(' ', name.strip())
            href = item.abshref(base.urlnormalize(href))
            type = 'normal'
            if not name:
                type = 'special'
@@ -628,14 +645,14 @@ class OEBReader(object):
            if not data:
                data = b''
        id, href = self.oeb.manifest.generate('cover', 'cover.jpg')
        item = self.oeb.manifest.add(id, href, JPEG_MIME, data=data)
        item = self.oeb.manifest.add(id, href, base.JPEG_MIME, data=data)
        return item

    def _locate_cover_image(self):
        if self.oeb.metadata.cover:
            id = str(self.oeb.metadata.cover[0])
            item = self.oeb.manifest.ids.get(id, None)
            if item is not None and item.media_type in OEB_IMAGES:
            if item is not None and item.media_type in base.OEB_IMAGES:
                return item
            else:
                self.logger.warn('Invalid cover image @id %r' % id)
@@ -644,27 +661,27 @@ class OEBReader(object):
            href = self.oeb.guide['cover'].href
            item = self.oeb.manifest.hrefs[href]
            media_type = item.media_type
            if media_type in OEB_IMAGES:
            if media_type in base.OEB_IMAGES:
                return item
            elif media_type in OEB_DOCS:
            elif media_type in base.OEB_DOCS:
                hcover = item
        html = hcover.data
        if MS_COVER_TYPE in self.oeb.guide:
            href = self.oeb.guide[MS_COVER_TYPE].href
        if base.MS_COVER_TYPE in self.oeb.guide:
            href = self.oeb.guide[base.MS_COVER_TYPE].href
            item = self.oeb.manifest.hrefs.get(href, None)
            if item is not None and item.media_type in OEB_IMAGES:
            if item is not None and item.media_type in base.OEB_IMAGES:
                return item
        if self.COVER_SVG_XP(html):
            svg = copy.deepcopy(self.COVER_SVG_XP(html)[0])
            href = os.path.splitext(hcover.href)[0] + '.svg'
            id, href = self.oeb.manifest.generate(hcover.id, href)
            item = self.oeb.manifest.add(id, href, SVG_MIME, data=svg)
            item = self.oeb.manifest.add(id, href, base.SVG_MIME, data=svg)
            return item
        if self.COVER_OBJECT_XP(html):
            object = self.COVER_OBJECT_XP(html)[0]
            href = hcover.abshref(object.get('data'))
            item = self.oeb.manifest.hrefs.get(href, None)
            if item is not None and item.media_type in OEB_IMAGES:
            if item is not None and item.media_type in base.OEB_IMAGES:
                return item
        return self._cover_from_html(hcover)

@@ -687,7 +704,8 @@ class OEBReader(object):
        items = [x for x in self.oeb.manifest if x.href == href]
        for x in items:
            if x not in self.oeb.spine:
                self.oeb.log.warn('Removing duplicate manifest item with id:', x.id)
                self.oeb.log.warn('Removing duplicate manifest item with '
                                  'id:', x.id)
                self.oeb.manifest.remove_duplicate_item(x)

    def _all_from_opf(self, opf):
@@ -706,7 +724,7 @@ class OEBReader(object):
def main(argv=sys.argv):
    reader = OEBReader()
    for arg in argv[1:]:
        oeb = reader(OEBBook(), arg)
        oeb = reader(base.OEBBook(), arg)
        for name, doc in oeb.to_opf1().values():
            print(etree.tostring(doc, pretty_print=True))
        for name, doc in oeb.to_opf2(page_map=True).values():

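The reader now routes namespace handling through parse_utils.namespace() and parse_utils.barename(). Assuming those helpers split lxml's Clark notation in the usual way (a guess consistent with how they are called above, not confirmed by this diff), they would look roughly like:

# Hedged sketch of the parse_utils helpers used by _clean_opf/_read_opf.
def namespace(name):
    """Return the namespace URI of a '{uri}local' tag, or '' if none."""
    if name.startswith('{'):
        return name[1:].rpartition('}')[0]
    return ''


def barename(name):
    """Return the local part of a '{uri}local' tag."""
    return name.rpartition('}')[2]


# namespace('{http://www.idpf.org/2007/opf}package') -> 'http://www.idpf.org/2007/opf'
# barename('{http://www.idpf.org/2007/opf}package') -> 'package'
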
@@ -10,17 +10,16 @@ from css_parser.css import (CSSStyleRule, CSSPageRule, CSSFontFaceRule,
                            cssproperties)
from css_parser import (profile as cssprofiles, parseString, parseStyle, log as
                        css_parser_log, CSSParser, profiles, replaceUrls)

from ebook_converter import constants as const
from ebook_converter import force_unicode, as_unicode
from ebook_converter.ebooks import unit_convert
from ebook_converter.ebooks.oeb.base import XHTML, XHTML_NS, CSS_MIME, OEB_STYLES, xpath, urlnormalize
from ebook_converter.ebooks.oeb import base
from ebook_converter.ebooks.oeb.normalize_css import DEFAULTS, normalizers
from ebook_converter.css_selectors import Select, SelectorError, INAPPROPRIATE_PSEUDO_CLASSES
from ebook_converter.tinycss.media3 import CSSMedia3Parser


__license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'

css_parser_log.setLevel(logging.WARN)

_html_css_stylesheet = None
@@ -208,7 +207,7 @@ class Stylizer(object):
        stylesheets = [html_css_stylesheet()]
        if base_css:
            stylesheets.append(parseString(base_css, validate=False))
        style_tags = xpath(tree, '//*[local-name()="style" or local-name()="link"]')
        style_tags = base.xpath(tree, '//*[local-name()="style" or local-name()="link"]')

        # Add css_parser parsing profiles from output_profile
        for profile in self.opts.output_profile.extra_css_modules:
@@ -219,7 +218,7 @@ class Stylizer(object):
        parser = CSSParser(fetcher=self._fetch_css_file,
                           log=logging.getLogger('calibre.css'))
        for elem in style_tags:
            if (elem.tag == XHTML('style') and elem.get('type', CSS_MIME) in OEB_STYLES and media_ok(elem.get('media'))):
            if (elem.tag == base.tag('xhtml', 'style') and elem.get('type', base.CSS_MIME) in base.OEB_STYLES and media_ok(elem.get('media'))):
                text = elem.text if elem.text else ''
                for x in elem:
                    t = getattr(x, 'text', None)
@@ -245,7 +244,7 @@ class Stylizer(object):
                        self.logger.warn('Ignoring missing stylesheet in @import rule:', rule.href)
                        continue
                    sitem = hrefs[ihref]
                    if sitem.media_type not in OEB_STYLES:
                    if sitem.media_type not in base.OEB_STYLES:
                        self.logger.warn('CSS @import of non-CSS file %r' % rule.href)
                        continue
                    stylesheets.append(sitem.data)
@@ -254,11 +253,11 @@ class Stylizer(object):
                    replaceUrls(stylesheet, item.abshref,
                                ignoreImportRules=True)
                    stylesheets.append(stylesheet)
            elif (elem.tag == XHTML('link') and elem.get('href') and elem.get(
            elif (elem.tag == base.tag('xhtml', 'link') and elem.get('href') and elem.get(
                    'rel', 'stylesheet').lower() == 'stylesheet' and elem.get(
                    'type', CSS_MIME).lower() in OEB_STYLES and media_ok(elem.get('media'))
                    'type', base.CSS_MIME).lower() in base.OEB_STYLES and media_ok(elem.get('media'))
                    ):
                href = urlnormalize(elem.attrib['href'])
                href = base.urlnormalize(elem.attrib['href'])
                path = item.abshref(href)
                sitem = oeb.manifest.hrefs.get(path, None)
                if sitem is None:
@@ -326,7 +325,8 @@ class Stylizer(object):

                        special_text = ''.join(punctuation_chars) + \
                            (text[0] if text else '')
                        span = x.makeelement('{%s}span' % XHTML_NS)
                        span = x.makeelement('{%s}span' %
                                             const.XHTML_NS)
                        span.text = special_text
                        span.set('data-fake-first-letter', '1')
                        span.tail = text[1:]
@@ -340,10 +340,10 @@ class Stylizer(object):
            else:
                for elem in matches:
                    self.style(elem)._update_cssdict(cssdict)
        for elem in xpath(tree, '//h:*[@style]'):
        for elem in base.xpath(tree, '//h:*[@style]'):
            self.style(elem)._apply_style_attr(url_replacer=item.abshref)
        num_pat = re.compile(r'[0-9.]+$')
        for elem in xpath(tree, '//h:img[@width or @height]'):
        for elem in base.xpath(tree, '//h:img[@width or @height]'):
            style = self.style(elem)
            # Check if either height or width is not default
            is_styled = style._style.get('width', 'auto') != 'auto' or \
@@ -370,7 +370,7 @@ class Stylizer(object):
            self.logger.warn('CSS import of missing file %r' % path)
            return (None, None)
        item = hrefs[path]
        if item.media_type not in OEB_STYLES:
        if item.media_type not in base.OEB_STYLES:
            self.logger.warn('CSS import of non-CSS file %r' % path)
            return (None, None)
        data = item.data.cssText

@@ -1,66 +1,61 @@
import textwrap
import urllib.parse

from lxml import etree

from ebook_converter import guess_type
from ebook_converter.utils.imghdr import identify
from ebook_converter.utils.xml_parse import safe_xml_fromstring
from ebook_converter.polyglot.urllib import unquote


__license__ = 'GPL v3'
__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'


class CoverManager(object):

    SVG_TEMPLATE = textwrap.dedent('''\
        <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
            <head>
                <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
                <meta name="calibre:cover" content="true" />
                <title>Cover</title>
                <style type="text/css" title="override_css">
                    @page {padding: 0pt; margin:0pt}
                    body { text-align: center; padding:0pt; margin: 0pt; }
                </style>
            </head>
            <body>
                <div>
                    <svg version="1.1" xmlns="http://www.w3.org/2000/svg"
                        xmlns:xlink="http://www.w3.org/1999/xlink"
                        width="100%%" height="100%%" viewBox="__viewbox__"
                        preserveAspectRatio="__ar__">
                        <image width="__width__" height="__height__" xlink:href="%s"/>
                    </svg>
                </div>
            </body>
        </html>
        ''')
        <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
            <head>
                <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
                <meta name="calibre:cover" content="true" />
                <title>Cover</title>
                <style type="text/css" title="override_css">
                    @page {padding: 0pt; margin:0pt}
                    body { text-align: center; padding:0pt; margin: 0pt; }
                </style>
            </head>
            <body>
                <div>
                    <svg version="1.1" xmlns="http://www.w3.org/2000/svg"
                        xmlns:xlink="http://www.w3.org/1999/xlink"
                        width="100%%" height="100%%" viewBox="__viewbox__"
                        preserveAspectRatio="__ar__">
                        <image width="__width__" height="__height__" xlink:href="%s"/>
                    </svg>
                </div>
            </body>
        </html>''')

    NONSVG_TEMPLATE = textwrap.dedent('''\
        <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
            <head>
                <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
                <meta name="calibre:cover" content="true" />
                <title>Cover</title>
                <style type="text/css" title="override_css">
                    @page {padding: 0pt; margin:0pt}
                    body { text-align: center; padding:0pt; margin: 0pt }
                    div { padding:0pt; margin: 0pt }
                    img { padding:0pt; margin: 0pt }
                </style>
            </head>
            <body>
                <div>
                    <img src="%s" alt="cover" __style__ />
                </div>
            </body>
        </html>
        <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
            <head>
                <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
                <meta name="calibre:cover" content="true" />
                <title>Cover</title>
                <style type="text/css" title="override_css">
                    @page {padding: 0pt; margin:0pt}
                    body { text-align: center; padding:0pt; margin: 0pt }
                    div { padding:0pt; margin: 0pt }
                    img { padding:0pt; margin: 0pt }
                </style>
            </head>
            <body>
                <div>
                    <img src="%s" alt="cover" __style__ />
                </div>
            </body>
        </html>
        ''')

    def __init__(self, no_default_cover=False, no_svg_cover=False,
            preserve_aspect_ratio=False, fixed_size=None):
                 preserve_aspect_ratio=False, fixed_size=None):
        self.no_default_cover = no_default_cover
        self.no_svg_cover = no_svg_cover
        self.preserve_aspect_ratio = preserve_aspect_ratio
@@ -72,9 +67,9 @@ class CoverManager(object):
            style = 'style="height: 100%%"'
        else:
            width, height = fixed_size
            style = 'style="height: %s; width: %s"'%(height, width)
            style = 'style="height: %s; width: %s"' % (height, width)
        self.non_svg_template = self.NONSVG_TEMPLATE.replace('__style__',
                style)
                                                             style)

    def __call__(self, oeb, opts, log):
        self.oeb = oeb
@@ -108,22 +103,23 @@ class CoverManager(object):
        # if self.preserve_aspect_ratio:
        # width, height = 600, 800
        self.svg_template = self.svg_template.replace('__viewbox__',
                '0 0 %d %d'%(width, height))
                                                      '0 0 %d %d' %
                                                      (width, height))
        self.svg_template = self.svg_template.replace('__width__',
                str(width))
                                                      str(width))
        self.svg_template = self.svg_template.replace('__height__',
                str(height))
                                                      str(height))

        if href is not None:
            templ = self.non_svg_template if self.no_svg_cover \
                    else self.svg_template
            tp = templ%unquote(href)
            tp = templ % unquote(href)
            id, href = m.generate('titlepage', 'titlepage.xhtml')
            item = m.add(id, href, guess_type('t.xhtml')[0],
                         data=safe_xml_fromstring(tp))
                         data=etree.fromstring(tp))
        else:
            item = self.oeb.manifest.hrefs[
                urllib.parse.urldefrag(self.oeb.guide['titlepage'].href)[0]]
            key = urllib.parse.urldefrag(self.oeb.guide['titlepage'].href)[0]
            item = self.oeb.manifest.hrefs[key]
        if item is not None:
            self.oeb.spine.insert(0, item, True)
            if 'cover' not in self.oeb.guide.refs:

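CoverManager builds its titlepage markup by plain string substitution on the templates above: the __viewbox__/__width__/__height__ placeholders are replaced as the hunk shows, and the single %s slot then takes the unquoted cover href. A worked illustration with invented values:

# Illustration only; 600x800, the __ar__ value and 'cover.jpg' are made up.
svg = CoverManager.SVG_TEMPLATE
svg = svg.replace('__viewbox__', '0 0 600 800')
svg = svg.replace('__width__', '600')
svg = svg.replace('__height__', '800')
svg = svg.replace('__ar__', 'xMidYMid meet')  # assumed preserveAspectRatio value
markup = svg % 'cover.jpg'  # '%%' in the template survives as a literal '%'
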
@@ -1,26 +1,27 @@
"""
CSS flattening transform.
"""
import re, operator, math, numbers
from collections import defaultdict
from xml.dom import SyntaxErr
import collections
import math
import numbers
import operator
import re
from xml import dom

from lxml import etree
import css_parser
from css_parser.css import Property
from css_parser import css as cp_css

from ebook_converter import constants as const
from ebook_converter import guess_type
from ebook_converter.ebooks import unit_convert
from ebook_converter.ebooks.oeb.base import (XHTML, XHTML_NS, CSS_MIME, OEB_STYLES,
                                             namespace, barename, XPath, css_text)
from ebook_converter.ebooks.oeb import base
from ebook_converter.ebooks.oeb import parse_utils

from ebook_converter.ebooks.oeb.stylizer import Stylizer
from ebook_converter.utils.filenames import ascii_filename, ascii_text
from ebook_converter.utils.icu import numeric_sort_key


__license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'

COLLAPSE = re.compile(r'[ \t\r\n\v]+')
STRIPNUM = re.compile(r'[-0-9]+$')

@@ -121,7 +122,7 @@ class EmbedFontsCSSRules(object):
            return None
        if not self.href:
            iid, href = oeb.manifest.generate('page_styles', 'page_styles.css')
            rules = [css_text(x) for x in self.rules]
            rules = [base.css_text(x) for x in self.rules]
            rules = '\n\n'.join(rules)
            sheet = css_parser.parseString(rules, validate=False)
            self.href = oeb.manifest.add(iid, href, guess_type(href)[0],
@@ -186,7 +187,7 @@ class CSSFlattener(object):
        for item in oeb.manifest.values():
            # Make all links to resources absolute, as these sheets will be
            # consolidated into a single stylesheet at the root of the document
            if item.media_type in OEB_STYLES:
            if item.media_type in base.OEB_STYLES:
                css_parser.replaceUrls(item.data, item.abshref,
                                       ignoreImportRules=True)

@@ -273,7 +274,7 @@ class CSSFlattener(object):
            css = ''
        for item in self.items:
            html = item.data
            body = html.find(XHTML('body'))
            body = html.find(base.tag('xhtml', 'body'))
            if 'style' in html.attrib:
                b = body.attrib.get('style', '')
                body.set('style', html.get('style') + ';' + b)
@@ -310,11 +311,11 @@ class CSSFlattener(object):
                sizes[csize] += len(COLLAPSE.sub(' ', child.tail))

    def baseline_spine(self):
        sizes = defaultdict(float)
        sizes = collections.defaultdict(float)
        for item in self.items:
            html = item.data
            stylizer = self.stylizers[item]
            body = html.find(XHTML('body'))
            body = html.find(base.tag('xhtml', 'body'))
            fsize = self.context.source.fbase
            self.baseline_node(body, stylizer, sizes, fsize)
        try:
@@ -351,9 +352,9 @@ class CSSFlattener(object):

    def flatten_node(self, node, stylizer, names, styles, pseudo_styles, psize, item_id, recurse=True):
        if not isinstance(node.tag, (str, bytes)) \
                or namespace(node.tag) != XHTML_NS:
                or parse_utils.namespace(node.tag) != const.XHTML_NS:
            return
        tag = barename(node.tag)
        tag = parse_utils.barename(node.tag)
        style = stylizer.style(node)
        cssdict = style.cssdict()
        try:
@@ -375,7 +376,7 @@ class CSSFlattener(object):
                if 'margin-left' not in cssdict and 'margin-right' not in cssdict:
                    cssdict['margin-left'] = cssdict['margin-right'] = 'auto'
            else:
                for table in node.iterchildren(XHTML("table")):
                for table in node.iterchildren(base.tag('xhtml', "table")):
                    ts = stylizer.style(table)
                    if ts.get('margin-left') is None and ts.get('margin-right') is None:
                        ts.set('margin-left', 'auto')
@@ -391,11 +392,12 @@ class CSSFlattener(object):
            if cssdict.get('vertical-align') == 'inherit':
                cssdict['vertical-align'] = node.attrib['valign']
            del node.attrib['valign']
        if node.tag == XHTML('font'):
        if node.tag == base.tag('xhtml', 'font'):
            tags = ['descendant::h:%s'%x for x in ('p', 'div', 'table', 'h1',
                'h2', 'h3', 'h4', 'h5', 'h6', 'ol', 'ul', 'dl', 'blockquote')]
            tag = 'div' if XPath('|'.join(tags))(node) else 'span'
            node.tag = XHTML(tag)
            # TODO(gryf): this will override tag from line 355. On purpose?
            tag = 'div' if base.XPath('|'.join(tags))(node) else 'span'
            node.tag = base.tag('xhtml', tag)
            if 'size' in node.attrib:
                def force_int(raw):
                    return int(re.search(r'([0-9+-]+)', raw).group(1))
@@ -425,14 +427,14 @@ class CSSFlattener(object):
                del node.attrib['face']
            if 'color' in node.attrib:
                try:
                    cssdict['color'] = Property('color', node.attrib['color']).value
                except (ValueError, SyntaxErr):
                    cssdict['color'] = cp_css.Property('color', node.attrib['color']).value
                except (ValueError, dom.SyntaxErr):
                    pass
                del node.attrib['color']
            if 'bgcolor' in node.attrib:
                try:
                    cssdict['background-color'] = Property('background-color', node.attrib['bgcolor']).value
                except (ValueError, SyntaxErr):
                    cssdict['background-color'] = cp_css.Property('background-color', node.attrib['bgcolor']).value
                except (ValueError, dom.SyntaxErr):
                    pass
                del node.attrib['bgcolor']
            if tag == 'ol' and 'type' in node.attrib:
@@ -573,7 +575,7 @@ class CSSFlattener(object):

    def flatten_head(self, item, href, global_href):
        html = item.data
        head = html.find(XHTML('head'))
        head = html.find(base.tag('xhtml', 'head'))

        def safe_lower(x):
            try:
@@ -583,39 +585,39 @@ class CSSFlattener(object):
            return x

        for node in html.xpath('//*[local-name()="style" or local-name()="link"]'):
            if node.tag == XHTML('link') \
            if node.tag == base.tag('xhtml', 'link') \
                    and safe_lower(node.get('rel', 'stylesheet')) == 'stylesheet' \
                    and safe_lower(node.get('type', CSS_MIME)) in OEB_STYLES:
                    and safe_lower(node.get('type', base.CSS_MIME)) in base.OEB_STYLES:
                node.getparent().remove(node)
            elif node.tag == XHTML('style') \
                    and node.get('type', CSS_MIME) in OEB_STYLES:
            elif node.tag == base.tag('xhtml', 'style') \
                    and node.get('type', base.CSS_MIME) in base.OEB_STYLES:
                node.getparent().remove(node)
        href = item.relhref(href)
        l = etree.SubElement(head, XHTML('link'),
            rel='stylesheet', type=CSS_MIME, href=href)
        l = etree.SubElement(head, base.tag('xhtml', 'link'),
                             rel='stylesheet', type=base.CSS_MIME, href=href)
        l.tail='\n'
        if global_href:
            href = item.relhref(global_href)
            l = etree.SubElement(head, XHTML('link'),
                rel='stylesheet', type=CSS_MIME, href=href)
            l = etree.SubElement(head, base.tag('xhtml', 'link'),
                                 rel='stylesheet', type=base.CSS_MIME, href=href)
            l.tail = '\n'

    def replace_css(self, css):
        manifest = self.oeb.manifest
        for item in manifest.values():
            if item.media_type in OEB_STYLES:
            if item.media_type in base.OEB_STYLES:
                manifest.remove(item)
        id, href = manifest.generate('css', 'stylesheet.css')
        sheet = css_parser.parseString(css, validate=False)
        if self.transform_css_rules:
            from ebook_converter.ebooks.css_transform_rules import transform_sheet
            transform_sheet(self.transform_css_rules, sheet)
        item = manifest.add(id, href, CSS_MIME, data=sheet)
        item = manifest.add(id, href, base.CSS_MIME, data=sheet)
        self.oeb.manifest.main_stylesheet = item
        return href

    def collect_global_css(self):
        global_css = defaultdict(list)
        global_css = collections.defaultdict(list)
        for item in self.items:
            stylizer = self.stylizers[item]
            if float(self.context.margin_top) >= 0:
@@ -627,7 +629,7 @@ class CSSFlattener(object):
            items = sorted(stylizer.page_rule.items())
            css = ';\n'.join("%s: %s" % (key, val) for key, val in items)
            css = ('@page {\n%s\n}\n'%css) if items else ''
            rules = [css_text(r) for r in stylizer.font_face_rules + self.embed_font_rules]
            rules = [base.css_text(r) for r in stylizer.font_face_rules + self.embed_font_rules]
            raw = '\n\n'.join(rules)
            css += '\n\n' + raw
            global_css[css].append(item)
@@ -642,7 +644,7 @@ class CSSFlattener(object):
            if self.transform_css_rules:
                from ebook_converter.ebooks.css_transform_rules import transform_sheet
                transform_sheet(self.transform_css_rules, sheet)
            manifest.add(id_, href, CSS_MIME, data=sheet)
            manifest.add(id_, href, base.CSS_MIME, data=sheet)
            gc_map[css] = href

        ans = {}
@@ -652,8 +654,8 @@ class CSSFlattener(object):
        return ans

    def flatten_spine(self):
        names = defaultdict(int)
        styles, pseudo_styles = {}, defaultdict(dict)
        names = collections.defaultdict(int)
        styles, pseudo_styles = {}, collections.defaultdict(dict)
        for item in self.items:
            html = item.data
            stylizer = self.stylizers[item]
@@ -661,7 +663,7 @@ class CSSFlattener(object):
                self.specializer(item, stylizer)
            fsize = self.context.dest.fbase
            self.flatten_node(html, stylizer, names, styles, pseudo_styles, fsize, item.id, recurse=False)
            self.flatten_node(html.find(XHTML('body')), stylizer, names, styles, pseudo_styles, fsize, item.id)
            self.flatten_node(html.find(base.tag('xhtml', 'body')), stylizer, names, styles, pseudo_styles, fsize, item.id)
        items = sorted(((key, val) for (val, key) in styles.items()))
        # :hover must come after link and :active must come after :hover
        psels = sorted(pseudo_styles, key=lambda x :

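collect_global_css, shown above, keys a defaultdict(list) on the full page-level CSS string, so spine items with byte-identical page rules end up sharing one generated stylesheet. The grouping idea in isolation (item ids and css strings below are invented for illustration):

import collections

# Invented data: three chapters, two distinct @page rules.
page_css_by_item = [('ch1', '@page {\nmargin-top: 0pt\n}\n'),
                    ('ch2', '@page {\nmargin-top: 0pt\n}\n'),
                    ('ch3', '@page {\nmargin-top: 12pt\n}\n')]

global_css = collections.defaultdict(list)
for item_id, css in page_css_by_item:
    global_css[css].append(item_id)

# Two distinct css strings -> two stylesheets, one shared by ch1 and ch2.
for css, items in global_css.items():
    print(sorted(items), '->', repr(css))
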
@@ -1,46 +1,20 @@
"""
HTML-TOC-adding transform.
"""
from ebook_converter.ebooks.oeb.base import XML, XHTML, XHTML_NS
from ebook_converter.ebooks.oeb.base import XHTML_MIME, CSS_MIME
from ebook_converter.ebooks.oeb.base import element, XPath
from ebook_converter import constants as const
from ebook_converter.ebooks.oeb import base


__all__ = ['HTMLTOCAdder']
__license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'

DEFAULT_TITLE = 'Table of Contents'
STYLE_CSS = {'nested': '.calibre_toc_header {\n text-align: center;\n}\n'
                       '.calibre_toc_block {\n margin-left: 1.2em;\n text-indent: '
                       '-1.2em;\n}\n.calibre_toc_block .calibre_toc_block {\n '
                       'margin-left: 2.4em;\n}\n.calibre_toc_block .calibre_toc_block '
                       '.calibre_toc_block {\n margin-left: 3.6em;\n}\n',

STYLE_CSS = {
    'nested': """
.calibre_toc_header {
    text-align: center;
}
.calibre_toc_block {
    margin-left: 1.2em;
    text-indent: -1.2em;
}
.calibre_toc_block .calibre_toc_block {
    margin-left: 2.4em;
}
.calibre_toc_block .calibre_toc_block .calibre_toc_block {
    margin-left: 3.6em;
}
""",

    'centered': """
.calibre_toc_header {
    text-align: center;
}
.calibre_toc_block {
    text-align: center;
}
body > .calibre_toc_block {
    margin-top: 1.2em;
}
"""
}
             'centered': '.calibre_toc_header {\n text-align: center;\n}\n'
                         '.calibre_toc_block {\n text-align: center;\n}\nbody > '
                         '.calibre_toc_block {\n margin-top: 1.2em;\n}\n'}


class HTMLTOCAdder(object):
@@ -71,7 +45,7 @@ class HTMLTOCAdder(object):
        if href in oeb.manifest.hrefs:
            item = oeb.manifest.hrefs[href]
            if (hasattr(item.data, 'xpath') and
                    XPath('//h:a[@href]')(item.data)):
                    base.XPath('//h:a[@href]')(item.data)):
                if oeb.spine.index(item) < 0:
                    if self.position == 'end':
                        oeb.spine.add(item, linear=False)
@@ -91,23 +65,24 @@ class HTMLTOCAdder(object):
            oeb.logger.error('Unknown TOC style %r' % style)
            style = 'nested'
        id, css_href = oeb.manifest.generate('tocstyle', 'tocstyle.css')
        oeb.manifest.add(id, css_href, CSS_MIME, data=STYLE_CSS[style])
        oeb.manifest.add(id, css_href, base.CSS_MIME, data=STYLE_CSS[style])
        language = str(oeb.metadata.language[0])
        contents = element(None, XHTML('html'), nsmap={None: XHTML_NS},
                           attrib={XML('lang'): language})
        head = element(contents, XHTML('head'))
        htitle = element(head, XHTML('title'))
        contents = base.element(None, base.tag('xhtml', 'html'),
                                nsmap={None: const.XHTML_NS},
                                attrib={base.tag('xml', 'lang'): language})
        head = base.element(contents, base.tag('xhtml', 'head'))
        htitle = base.element(head, base.tag('xhtml', 'title'))
        htitle.text = title
        element(head, XHTML('link'), rel='stylesheet', type=CSS_MIME,
                href=css_href)
        body = element(contents, XHTML('body'),
                       attrib={'class': 'calibre_toc'})
        h1 = element(body, XHTML('h2'),
                     attrib={'class': 'calibre_toc_header'})
        base.element(head, base.tag('xhtml', 'link'), rel='stylesheet',
                     type=base.CSS_MIME, href=css_href)
        body = base.element(contents, base.tag('xhtml', 'body'),
                            attrib={'class': 'calibre_toc'})
        h1 = base.element(body, base.tag('xhtml', 'h2'),
                          attrib={'class': 'calibre_toc_header'})
        h1.text = title
        self.add_toc_level(body, oeb.toc)
        id, href = oeb.manifest.generate('contents', 'contents.xhtml')
        item = oeb.manifest.add(id, href, XHTML_MIME, data=contents)
        item = oeb.manifest.add(id, href, base.XHTML_MIME, data=contents)
        if self.position == 'end':
            oeb.spine.add(item, linear=False)
        else:
@@ -116,10 +91,10 @@ class HTMLTOCAdder(object):

    def add_toc_level(self, elem, toc):
        for node in toc:
            block = element(elem, XHTML('div'),
                            attrib={'class': 'calibre_toc_block'})
            line = element(block, XHTML('a'),
                           attrib={'href': node.href,
                                   'class': 'calibre_toc_line'})
            block = base.element(elem, base.tag('xhtml', 'div'),
                                 attrib={'class': 'calibre_toc_block'})
            line = base.element(block, base.tag('xhtml', 'a'),
                                attrib={'href': node.href,
                                        'class': 'calibre_toc_line'})
            line.text = node.title
            self.add_toc_level(block, node)

@@ -4,9 +4,10 @@ from string import Formatter
|
||||
import pkg_resources
|
||||
import urllib.parse
|
||||
|
||||
from ebook_converter import constants as const
|
||||
from ebook_converter import guess_type, strftime
|
||||
from ebook_converter.constants_old import iswindows
|
||||
from ebook_converter.ebooks.oeb.base import XPath, XHTML_NS, XHTML, xml2text, urlnormalize
|
||||
from ebook_converter.ebooks.oeb.base import XPath, xml2text, urlnormalize
|
||||
from ebook_converter.library.comments import comments_to_html, markdown
|
||||
from ebook_converter.utils.date import is_date_undefined, as_local_time
|
||||
from ebook_converter.ebooks.chardet import strip_encoding_declarations
|
||||
@@ -303,7 +304,7 @@ def render_jacket(mi, output_profile,
|
||||
'tags_label': 'Tags',
|
||||
'title': title,
|
||||
'title_str': title_str,
|
||||
'xmlns': XHTML_NS}
|
||||
'xmlns': const.XHTML_NS}
|
||||
|
||||
for key in mi.custom_field_keys():
|
||||
m = mi.get_user_metadata(key, False) or {}
|
||||
@@ -370,7 +371,7 @@ def render_jacket(mi, output_profile,
|
||||
# We cannot use data-calibre-rescale 100 on the body tag as that will just
|
||||
# give the body tag a font size of 1em, which is useless.
|
||||
for body in root.xpath('//*[local-name()="body"]'):
|
||||
fw = body.makeelement(XHTML('div'))
|
||||
fw = body.makeelement(const.XHTML_DIV)
|
||||
fw.set('data-calibre-rescale', '100')
|
||||
for child in body:
|
||||
fw.append(child)
|
||||
@@ -387,9 +388,9 @@ def linearize_jacket(oeb):
|
||||
for x in oeb.spine[:4]:
|
||||
if XPath(JACKET_XPATH)(x.data):
|
||||
for e in XPath('//h:table|//h:tr|//h:th')(x.data):
|
||||
e.tag = XHTML('div')
|
||||
e.tag = const.XHTML_DIV
|
||||
for e in XPath('//h:td')(x.data):
|
||||
e.tag = XHTML('span')
|
||||
e.tag = const.XHTML_SPAN
|
||||
break
|
||||
|
||||
|
||||
|
||||
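The jacket hunks above illustrate the companion change: tags that are fixed at import time now come from precomputed Clark-notation constants (const.XHTML_DIV, const.XHTML_SPAN) rather than repeated XHTML(...) calls. A plausible sketch of how the constants module might define them (assumed definitions, inferred only from the names this diff uses):

    # Assumed layout of the constants module; not the actual file.
    XHTML_NS = 'http://www.w3.org/1999/xhtml'

    # One precomputed constant per commonly rewritten tag.
    XHTML_DIV = '{%s}div' % XHTML_NS
    XHTML_SPAN = '{%s}span' % XHTML_NS
    XHTML_HR = '{%s}hr' % XHTML_NS

Precomputing the strings avoids re-formatting the same tag on every element rewrite and gives assignments like e.tag = const.XHTML_DIV a single canonical definition.
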
@@ -5,9 +5,9 @@ import string

from lxml import etree

from ebook_converter.ebooks.oeb.base import XHTML, XHTML_NS
from ebook_converter.ebooks.oeb.base import CSS_MIME
from ebook_converter.ebooks.oeb.base import namespace
from ebook_converter import constants as const
from ebook_converter.ebooks.oeb import base
from ebook_converter.ebooks.oeb import parse_utils
from ebook_converter.ebooks.oeb.stylizer import Stylizer


@@ -43,15 +43,16 @@ class CaseMangler(object):

def mangle_spine(self):
id, href = self.oeb.manifest.generate('manglecase', 'manglecase.css')
self.oeb.manifest.add(id, href, CSS_MIME, data=CASE_MANGLER_CSS)
self.oeb.manifest.add(id, href, base.CSS_MIME, data=CASE_MANGLER_CSS)
for item in self.oeb.spine:
html = item.data
relhref = item.relhref(href)
etree.SubElement(html.find(XHTML('head')), XHTML('link'),
rel='stylesheet', href=relhref, type=CSS_MIME)
etree.SubElement(html.find(base.tag('xhtml', 'head')),
base.tag('xhtml', 'link'), rel='stylesheet',
href=relhref, type=base.CSS_MIME)
stylizer = Stylizer(html, item.href, self.oeb, self.opts,
self.profile)
self.mangle_elem(html.find(XHTML('body')), stylizer)
self.mangle_elem(html.find(base.tag('xhtml', 'body')), stylizer)

def text_transform(self, transform, text):
if transform == 'capitalize':
@@ -85,7 +86,8 @@ class CaseMangler(object):
else:
last.tail = text
else:
child = elem.makeelement(XHTML('span'), attrib=attrib)
child = elem.makeelement(base.tag('xhtml', 'span'),
attrib=attrib)
child.text = text.upper()
if last is None:
elem.insert(0, child)
@@ -99,7 +101,7 @@ class CaseMangler(object):

def mangle_elem(self, elem, stylizer):
if not isinstance(elem.tag, (str, bytes)) or \
namespace(elem.tag) != XHTML_NS:
parse_utils.namespace(elem.tag) != const.XHTML_NS:
return
children = list(elem)
style = stylizer.style(elem)

@@ -1,15 +1,12 @@
import os, re
import os
import re

from ebook_converter.ebooks.oeb import base
from ebook_converter.utils.date import isoformat, now
from ebook_converter import guess_type


__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'


def meta_info_to_oeb_metadata(mi, m, log, override_input_metadata=False):
from ebook_converter.ebooks.oeb.base import OPF
if not mi.is_null('title'):
m.clear('title')
m.add('title', mi.title)
@@ -19,17 +16,17 @@ def meta_info_to_oeb_metadata(mi, m, log, override_input_metadata=False):
m.clear('title_sort')
m.add('title_sort', mi.title_sort)
if not mi.is_null('authors'):
m.filter('creator', lambda x : x.role.lower() in ['aut', ''])
m.filter('creator', lambda x: x.role.lower() in ['aut', ''])
for a in mi.authors:
attrib = {'role':'aut'}
attrib = {'role': 'aut'}
if mi.author_sort:
attrib[OPF('file-as')] = mi.author_sort
attrib[base.tag('opf', 'file-as')] = mi.author_sort
m.add('creator', a, attrib=attrib)
if not mi.is_null('book_producer'):
m.filter('contributor', lambda x : x.role.lower() == 'bkp')
m.filter('contributor', lambda x: x.role.lower() == 'bkp')
m.add('contributor', mi.book_producer, role='bkp')
elif override_input_metadata:
m.filter('contributor', lambda x : x.role.lower() == 'bkp')
m.filter('contributor', lambda x: x.role.lower() == 'bkp')
if not mi.is_null('comments'):
m.clear('description')
m.add('description', mi.comments)
@@ -71,7 +68,7 @@ def meta_info_to_oeb_metadata(mi, m, log, override_input_metadata=False):
m.clear('series_index')
if not mi.is_null('rating'):
m.clear('rating')
m.add('rating', '%.2f'%mi.rating)
m.add('rating', '%.2f' % mi.rating)
elif override_input_metadata:
m.clear('rating')
if not mi.is_null('tags'):
@@ -101,23 +98,25 @@ class MergeMetadata(object):
'Merge in user metadata, including cover'

def __call__(self, oeb, mi, opts, override_input_metadata=False):
_oim = override_input_metadata
self.oeb, self.log = oeb, oeb.log
m = self.oeb.metadata
self.log('Merging user specified metadata...')
meta_info_to_oeb_metadata(mi, m, oeb.log,
override_input_metadata=override_input_metadata)
override_input_metadata=_oim)
cover_id = self.set_cover(mi, opts.prefer_metadata_cover)
m.clear('cover')
if cover_id is not None:
m.add('cover', cover_id)
if mi.uuid is not None:
m.filter('identifier', lambda x:x.id=='uuid_id')
m.filter('identifier', lambda x: x.id == 'uuid_id')
self.oeb.metadata.add('identifier', mi.uuid, id='uuid_id',
scheme='uuid')
self.oeb.uid = self.oeb.metadata.identifier[-1]
if mi.application_id is not None:
m.filter('identifier', lambda x:x.scheme=='calibre')
self.oeb.metadata.add('identifier', mi.application_id, scheme='calibre')
m.filter('identifier', lambda x: x.scheme == 'calibre')
self.oeb.metadata.add('identifier', mi.application_id,
scheme='calibre')

def set_cover(self, mi, prefer_metadata_cover):
cdata, ext = b'', 'jpg'
@@ -138,7 +137,8 @@ class MergeMetadata(object):
if cdata:
self.oeb.guide.remove('cover')
self.oeb.guide.remove('titlepage')
elif self.oeb.plumber_output_format in {'mobi', 'azw3'} and old_cover is not None:
elif (self.oeb.plumber_output_format in {'mobi', 'azw3'} and
old_cover is not None):
# The amazon formats dont support html cover pages, so remove them
# even if no cover was specified.
self.oeb.guide.remove('titlepage')
@@ -156,7 +156,9 @@ class MergeMetadata(object):
new_cover_item = None
if cdata:
id, href = self.oeb.manifest.generate('cover', 'cover.'+ext)
new_cover_item = self.oeb.manifest.add(id, href, guess_type('cover.'+ext)[0], data=cdata)
new_cover_item = self.oeb.manifest.add(id, href,
guess_type('cover.'+ext)[0],
data=cdata)
self.oeb.guide.add('cover', 'Cover', href)
if do_remove_old_cover:
self.remove_old_cover(item, new_cover_item.href)
@@ -186,7 +188,8 @@ class MergeMetadata(object):
if href == cover_item.href:
if new_cover_href is not None:
replacement_href = item.relhref(new_cover_href)
attr = 'src' if img.tag.endswith('img') else XLINK('href')
attr = ('src' if img.tag.endswith('img')
else XLINK('href'))
img.set(attr, replacement_href)
else:
p = img.getparent()
@@ -202,13 +205,14 @@ class MergeMetadata(object):
for item in affected_items:
body = XPath('//h:body')(item.data)
if body:
text = etree.tostring(body[0], method='text', encoding='unicode')
text = etree.tostring(body[0], method='text',
encoding='unicode')
else:
text = ''
text = re.sub(r'\s+', '', text)
if not text and not XPath('//h:img|//svg:svg')(item.data):
self.log('Removing %s as it is a wrapper around'
' the cover image'%item.href)
self.log('Removing %s as it is a wrapper around the cover '
'image' % item.href)
self.oeb.spine.remove(item)
self.oeb.manifest.remove(item)
self.oeb.guide.remove_by_href(item.href)

@@ -1,7 +1,8 @@
import numbers
from collections import Counter

from ebook_converter.ebooks.oeb.base import barename, XPath
from ebook_converter.ebooks.oeb import parse_utils
from ebook_converter.ebooks.oeb.base import XPath


__license__ = 'GPL v3'
@@ -142,7 +143,7 @@ class RemoveFakeMargins(object):

for p in paras(body):
level = level_of(p, body)
level = '%s_%d'%(barename(p.tag), level)
level = '%s_%d' % (parse_utils.barename(p.tag), level)
if level not in self.levels:
self.levels[level] = []
self.levels[level].append(p)

@@ -5,10 +5,8 @@ import os
import re
import urllib.parse

# from PyQt5.Qt import (
# Qt, QByteArray, QBuffer, QIODevice, QColor, QImage, QPainter, QSvgRenderer)

from ebook_converter.ebooks.oeb.base import XHTML, XLINK
from ebook_converter import constants as const
from ebook_converter.ebooks.oeb import base
from ebook_converter.ebooks.oeb.base import SVG_MIME, PNG_MIME
from ebook_converter.ebooks.oeb.base import xml2str, xpath
from ebook_converter.ebooks.oeb.base import urlnormalize
@@ -17,10 +15,7 @@ from ebook_converter.ptempfile import PersistentTemporaryFile
from ebook_converter.utils.imghdr import what


__license__ = 'GPL v3'
__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'

IMAGE_TAGS = {XHTML('img'), XHTML('object')}
IMAGE_TAGS = {base.tag('xhtml', 'img'), base.tag('xhtml', 'object')}
KEEP_ATTRS = {'class', 'style', 'width', 'height', 'align'}


@@ -113,7 +108,7 @@ class SVGRasterizer(object):
svg = item.data
hrefs = self.oeb.manifest.hrefs
for elem in xpath(svg, '//svg:*[@xl:href]'):
href = urlnormalize(elem.attrib[XLINK('href')])
href = urlnormalize(elem.attrib[base.tag('xlink', 'href')])
path = urllib.parse.urldefrag(href)[0]
if not path:
continue
@@ -126,7 +121,7 @@ class SVGRasterizer(object):
with PersistentTemporaryFile(suffix='.'+ext) as pt:
pt.write(data)
self.temp_files.append(pt.name)
elem.attrib[XLINK('href')] = pt.name
elem.attrib[base.tag('xlink', 'href')] = pt.name
return svg

def stylizer(self, item):
@@ -171,7 +166,7 @@ class SVGRasterizer(object):
href = os.path.splitext(item.href)[0] + '.png'
id, href = manifest.generate(item.id, href)
manifest.add(id, href, PNG_MIME, data=data)
img = elem.makeelement(XHTML('img'), src=item.relhref(href))
img = elem.makeelement(base.tag('xhtml', 'img'), src=item.relhref(href))
elem.getparent().replace(elem, img)
for prop in ('width', 'height'):
if prop in elem.attrib:
@@ -208,7 +203,7 @@ class SVGRasterizer(object):
id, href = manifest.generate(svgitem.id, href)
manifest.add(id, href, PNG_MIME, data=data)
self.images[key] = href
elem.tag = XHTML('img')
elem.tag = base.tag('xhtml', 'img')
for attr in elem.attrib:
if attr not in KEEP_ATTRS:
del elem.attrib[attr]

@@ -10,10 +10,11 @@ import urllib.parse
from lxml.etree import XPath as _XPath
from lxml import etree

from ebook_converter import constants as const
from ebook_converter import as_unicode, force_unicode
from ebook_converter.ebooks.epub import rules
from ebook_converter.ebooks.oeb.base import (OEB_STYLES, XPNSMAP as NAMESPACES,
rewrite_links, XHTML, urlnormalize)
from ebook_converter.ebooks.oeb.base import \
OEB_STYLES, rewrite_links, urlnormalize
from ebook_converter.ebooks.oeb.polish.split import do_split
from ebook_converter.polyglot.urllib import unquote
from ebook_converter.css_selectors import Select, SelectorError
@@ -22,7 +23,7 @@ from ebook_converter.css_selectors import Select, SelectorError
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
XPath = functools.partial(_XPath, namespaces=NAMESPACES)
XPath = functools.partial(_XPath, namespaces=const.XPNSMAP)

SPLIT_POINT_ATTR = 'csp'

@@ -104,7 +105,7 @@ class Split(object):
select = Select(item.data)
if not self.page_break_selectors:
return [], []
body = item.data.xpath('//h:body', namespaces=NAMESPACES)
body = item.data.xpath('//h:body', namespaces=const.XPNSMAP)
if not body:
return [], []
descendants = frozenset(body[0].iterdescendants('*'))
@@ -268,13 +269,13 @@ class FlowSplitter(object):
if body is not None:
existing_ids = frozenset(body.xpath('//*/@id'))
for x in ids - existing_ids:
body.insert(0, body.makeelement(XHTML('div'), id=x, style='height:0pt'))
body.insert(0, body.makeelement(const.XHTML_DIV, id=x, style='height:0pt'))
ids = set()
trees.append(tree)
self.trees = trees

def get_body(self, root):
body = root.xpath('//h:body', namespaces=NAMESPACES)
body = root.xpath('//h:body', namespaces=const.XPNSMAP)
if not body:
return None
return body[0]
@@ -296,7 +297,7 @@ class FlowSplitter(object):
etree.tostring(body, method='text', encoding='unicode'))
if len(txt) > 1:
return False
for img in root.xpath('//h:img', namespaces=NAMESPACES):
for img in root.xpath('//h:img', namespaces=const.XPNSMAP):
if img.get('style', '') != 'display:none':
return False
if root.xpath('//*[local-name() = "svg"]'):
@@ -401,7 +402,7 @@ class FlowSplitter(object):
'//h:br',
'//h:li',
):
elems = root.xpath(path, namespaces=NAMESPACES)
elems = root.xpath(path, namespaces=const.XPNSMAP)
elem = pick_elem(elems)
if elem is not None:
try:
@@ -436,7 +437,7 @@ class FlowSplitter(object):
spine_pos = self.item.spine_position

for current, tree in zip(*map(reversed, (self.files, self.trees))):
for a in tree.getroot().xpath('//h:a[@href]', namespaces=NAMESPACES):
for a in tree.getroot().xpath('//h:a[@href]', namespaces=const.XPNSMAP):
href = a.get('href').strip()
if href.startswith('#'):
anchor = href[1:]

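In the splitting code above, every locally imported XPNSMAP alias is replaced by the shared const.XPNSMAP prefix map handed to lxml's xpath(). A short sketch of the idiom, with the map's contents inferred only from the prefixes this diff actually uses (h: for XHTML, svg: for SVG; the real map may define more):

    from lxml import etree

    XPNSMAP = {'h': 'http://www.w3.org/1999/xhtml',
               'svg': 'http://www.w3.org/2000/svg'}  # assumed contents

    def get_body(root):
        # Prefix-qualified XPath evaluated against the shared namespace map,
        # mirroring FlowSplitter.get_body in the hunk above.
        body = root.xpath('//h:body', namespaces=XPNSMAP)
        return body[0] if body else None

    root = etree.fromstring(
        b'<html xmlns="http://www.w3.org/1999/xhtml"><body/></html>')
    assert get_body(root) is not None

Keeping one map in the constants module means the h: prefix resolves identically in every module that queries documents.
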
@@ -1,22 +1,19 @@
import collections
import re
import uuid
import urllib.parse
import uuid

from lxml import etree
from collections import OrderedDict, Counter

from ebook_converter.ebooks.oeb.base import XPNSMAP, TOC, XHTML, xml2text, barename
from ebook_converter import constants as const
from ebook_converter.ebooks.oeb import parse_utils
from ebook_converter.ebooks.oeb.base import TOC, xml2text
from ebook_converter.ebooks import ConversionError


__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'


def XPath(x):
try:
return etree.XPath(x, namespaces=XPNSMAP)
return etree.XPath(x, namespaces=const.XPNSMAP)
except etree.XPathSyntaxError:
raise ConversionError(
'The syntax of the XPath expression %s is invalid.' % repr(x))
@@ -84,7 +81,7 @@ class DetectStructure(object):
try:
prev = next(elem.itersiblings(tag=etree.Element,
preceding=True))
if (barename(elem.tag) in {'h1', 'h2'} and barename(
if (parse_utils.barename(elem.tag) in {'h1', 'h2'} and parse_utils.barename(
prev.tag) in {'h1', 'h2'} and (not prev.tail or
not prev.tail.split())):
# We have two adjacent headings, do not put a page
@@ -165,7 +162,7 @@ class DetectStructure(object):
chapter_mark = self.opts.chapter_mark
page_break_before = 'display: block; page-break-before: always'
page_break_after = 'display: block; page-break-after: always'
c = Counter()
c = collections.Counter()
for item, elem in self.detected_chapters:
c[item] += 1
text = xml2text(elem).strip()
@@ -174,7 +171,7 @@ class DetectStructure(object):
if chapter_mark == 'none':
continue
if chapter_mark == 'rule':
mark = elem.makeelement(XHTML('hr'))
mark = elem.makeelement(const.XHTML_HR)
elif chapter_mark == 'pagebreak':
if c[item] < 3 and at_start(elem):
# For the first two elements in this item, check if they
@@ -184,9 +181,9 @@ class DetectStructure(object):
# feedbooks epubs match both a heading tag and its
# containing div with the default chapter expression.
continue
mark = elem.makeelement(XHTML('div'), style=page_break_after)
mark = elem.makeelement(const.XHTML_DIV, style=page_break_after)
else: # chapter_mark == 'both':
mark = elem.makeelement(XHTML('hr'), style=page_break_before)
mark = elem.makeelement(const.XHTML_HR, style=page_break_before)
try:
elem.addprevious(mark)
except TypeError:
@@ -254,8 +251,8 @@ class DetectStructure(object):
return text, href

def add_leveled_toc_items(self):
added = OrderedDict()
added2 = OrderedDict()
added = collections.OrderedDict()
added2 = collections.OrderedDict()
counter = 1

def find_matches(expr, doc):