mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-03-23 10:53:34 +01:00
Here is the first batch of modules, which are needed for converting several formats to LRF. Some of the logic has been changed; more cleanups will follow.
253 lines
8.9 KiB
Python
253 lines
8.9 KiB
Python
#!/usr/bin/env python2
|
|
# vim:fileencoding=utf-8
|
|
from __future__ import absolute_import, division, print_function, unicode_literals
|
|
|
|
__license__ = 'GPL v3'
|
|
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
|
|
|
import textwrap
|
|
from ebook_converter.polyglot.builtins import iteritems, map
|
|
|
|
# from lxml.etree import Element
|
|
|
|
from ebook_converter import force_unicode
|
|
from ebook_converter.ebooks.oeb.base import (
|
|
serialize, OEB_DOCS, barename, OEB_STYLES, XPNSMAP, XHTML, SVG)
|
|
from ebook_converter.ebooks.oeb.polish.container import OPF_NAMESPACES
|
|
from ebook_converter.ebooks.oeb.polish.utils import guess_type
|
|
from ebook_converter.utils.icu import sort_key
|
|
|
|
|
|
def isspace(x):
    ''' Return True if ``x`` is empty or is made up only of tab, line feed,
    form feed, carriage return and space characters. '''
    whitespace = '\u0009\u000a\u000c\u000d\u0020'
    return len(x.strip(whitespace)) == 0
|
|
|
|
|
|
def pretty_xml_tree(elem, level=0, indent=' '):
    ''' Re-indent the XML tree rooted at ``elem`` in place.

    The whitespace stored in ``text`` and ``tail`` is rewritten so that every
    child starts on its own line, indented by ``indent`` repeated once per
    nesting level. Text and tails that contain anything other than whitespace
    are left alone, so no data is lost, but the result is only tidy when
    elements with children carry no text of their own and nothing follows a
    closing tag (as in OPF, NCX and container.xml files).

    :param elem: The element to prettify (modified in place)
    :param level: The nesting depth of ``elem``
    :param indent: The string used for one level of indentation
    '''
    text = elem.text
    child_count = len(elem)
    if text:
        # Existing text is only replaced when it is pure whitespace
        needs_indent = isspace(text)
    else:
        # No text at all: only add indentation if there are children
        needs_indent = child_count > 0
    if needs_indent:
        elem.text = '\n' + indent * (level + 1)

    last = child_count - 1
    for position, child in enumerate(elem):
        pretty_xml_tree(child, level=level + 1, indent=indent)
        tail = child.tail
        if tail and not isspace(tail):
            continue
        # The tail of the last child positions the closing tag of elem, so it
        # is indented one level less than the tails between siblings
        depth = level if position == last else level + 1
        child.tail = '\n' + indent * depth
|
|
|
|
|
|
def pretty_opf(root):
    ''' Reorder the metadata and manifest of the OPF tree ``root`` in place.

    In every ``<metadata>`` element the dc: children are moved to the front:
    title first, then creator, then the remaining dc: elements in their
    existing order.

    In every ``<manifest>`` element the children are grouped by kind:
    documents (in spine order), the NCX, stylesheets, images, fonts, audio,
    video and finally everything else. Apart from documents, the entries of a
    group are ordered by ``sort_key()`` of their href. A manifest whose
    children cannot be keyed (an AttributeError is raised) is left untouched.
    '''
    dc_priority = {'title': 0, 'creator': 1}

    def dc_rank(tag):
        return dc_priority.get(barename(tag.tag), 2)

    for metadata in root.xpath('//opf:metadata', namespaces=OPF_NAMESPACES):
        dc_tags = sorted(
            metadata.xpath('./*[namespace-uri()="%s"]' % OPF_NAMESPACES['dc']),
            key=dc_rank)
        # Inserting at index 0 in reverse keeps the sorted order
        for tag in reversed(dc_tags):
            metadata.insert(0, tag)

    # Map every spine idref to its position in the spine
    spine_position = {}
    for position, idref in enumerate(root.xpath(
            '//opf:spine/opf:itemref/@idref', namespaces=OPF_NAMESPACES)):
        spine_position[idref] = position

    def category(media_type, extension):
        if media_type in OEB_DOCS:
            return 0
        if media_type == guess_type('a.ncx'):
            return 1
        if media_type in OEB_STYLES:
            return 2
        if media_type.startswith('image/'):
            return 3
        if extension in {'otf', 'ttf', 'woff'}:
            return 4
        if media_type.startswith('audio/'):
            return 5
        if media_type.startswith('video/'):
            return 6
        return 1000

    def manifest_key(item):
        media_type = item.get('media-type', '')
        href = item.get('href', '')
        extension = href.rpartition('.')[-1].lower()
        group = category(media_type, extension)
        if group == 0:
            # Documents follow the spine, those not in the spine come last
            order = spine_position.get(item.get('id', None), 1000000000)
        else:
            order = sort_key(href)
        return (group, order)

    for manifest in root.xpath('//opf:manifest', namespaces=OPF_NAMESPACES):
        try:
            ordered = sorted(manifest, key=manifest_key)
        except AttributeError:
            # There are comments, so do not sort, since that would mess up
            # the comments
            continue
        for item in reversed(ordered):
            manifest.insert(0, item)
|
|
|
|
|
|
# The tag of the <svg> element, as produced by the SVG() helper
SVG_TAG = SVG('svg')
# The set of tags that isblock() treats as block level: the XHTML tags listed
# below plus <svg>
BLOCK_TAGS = frozenset([SVG_TAG]).union(
    XHTML(name) for name in (
        'address', 'article', 'aside', 'audio', 'blockquote', 'body',
        'canvas', 'col', 'colgroup', 'dd', 'div', 'dl', 'dt', 'fieldset',
        'figcaption', 'figure', 'footer', 'form', 'h1', 'h2', 'h3', 'h4',
        'h5', 'h6', 'header', 'hgroup', 'hr', 'li', 'noscript', 'ol',
        'output', 'p', 'pre', 'script', 'section', 'style', 'table', 'tbody',
        'td', 'tfoot', 'th', 'thead', 'tr', 'ul', 'video', 'img',
    )
)
|
|
|
|
|
|
def isblock(x):
    ''' Return True if the node ``x`` should be treated as a block.

    A node counts as a block when its tag is in BLOCK_TAGS. Nodes whose tag is
    callable or empty are also treated as blocks (presumably these are
    comments and processing instructions, to be confirmed against the
    parser in use). '''
    tag = x.tag
    return callable(tag) or not tag or tag in BLOCK_TAGS
|
|
|
|
|
|
def has_only_blocks(x):
    ''' Return True if ``x`` contains nothing but block children separated by
    whitespace.

    A node with a string tag and no children is never considered to contain
    only blocks. Otherwise the text of ``x`` and the tail of every child must
    be empty or whitespace and every child must satisfy isblock(). '''
    has_string_tag = hasattr(x.tag, 'split')
    if has_string_tag and not len(x):
        # Tag with no children
        return False
    if x.text and not isspace(x.text):
        return False
    return all(
        isblock(child) and not (child.tail and not isspace(child.tail))
        for child in x)
|
|
|
|
|
|
def indent_for_tag(x):
    ''' Return the whitespace that precedes the opening tag of ``x`` on its
    line.

    The text that precedes ``x`` is the tail of its previous sibling or, when
    there is no previous sibling, the text of its parent. Whatever follows the
    last newline of that text is returned if it is only whitespace, otherwise
    the empty string is returned. '''
    sibling = x.getprevious()
    if sibling is None:
        leading = x.getparent().text
    else:
        leading = sibling.tail
    if not leading:
        return ''
    _, _, last_line = leading.rpartition('\n')
    if isspace(last_line):
        return last_line
    return ''
|
|
|
|
|
|
def set_indent(elem, attr, indent):
    ''' Make the last line of the string attribute ``attr`` of ``elem`` equal
    to ``indent``.

    An empty attribute is simply set to ``indent``. Otherwise, the last line
    is replaced by ``indent`` when it is only whitespace and ``indent`` is
    appended as a new line when it is not.

    :param elem: The object whose attribute is updated
    :param attr: The name of the attribute, for example ``'text'``
    :param indent: The whitespace to end the attribute with
    '''
    current = getattr(elem, attr)
    if current:
        lines = current.splitlines()
        if isspace(lines[-1]):
            lines[-1] = indent
        else:
            lines.append(indent)
        updated = '\n'.join(lines)
    else:
        updated = indent
    setattr(elem, attr, updated)
|
|
|
|
|
|
def pretty_block(parent, level=1, indent=' '):
    ''' Put every child of ``parent`` on its own indented line, separated by
    blank lines, modifying the tree in place.

    Children of tr, td and th tags are separated by a single newline instead
    of a blank line. Block children that contain only other blocks are
    processed recursively and svg children are handed to pretty_xml_tree().
    Existing text and tails that are not pure whitespace are preserved, the
    separator and indentation are appended to them.

    :param parent: The element whose children are laid out
    :param level: The nesting depth used for the children of ``parent``
    :param indent: The string used for one level of indentation
    '''
    lead = parent.text
    if not lead or isspace(lead):
        lead = ''
    if hasattr(parent.tag, 'strip') and barename(parent.tag) in {'tr', 'td', 'th'}:
        separator = '\n'
    else:
        separator = '\n\n'
    parent.text = lead + separator + indent * level

    last_index = len(parent) - 1
    for index, child in enumerate(parent):
        if isblock(child) and has_only_blocks(child):
            pretty_block(child, level=level + 1, indent=indent)
        elif child.tag == SVG_TAG:
            pretty_xml_tree(child, level=level, indent=indent)
        trailing = child.tail
        if not trailing or isspace(trailing):
            trailing = ''
        # The tail of the last child positions the closing tag of parent, so
        # it is indented one level less
        closing_level = level - 1 if index == last_index else level
        child.tail = trailing + separator + indent * closing_level
|
|
|
|
|
|
def pretty_script_or_style(container, child):
    ''' Re-indent the contents of the <script> or <style> tag ``child`` so
    that they line up with the tag itself.

    The contents of tags whose name ends with ``style`` are first passed
    through pretty_css(). The text is then dedented, every non-empty line is
    prefixed with the indentation of the tag and the closing tag is placed on
    a line of its own. Tags without text are left untouched. '''
    if not child.text:
        return
    indent = indent_for_tag(child)
    if child.tag.endswith('style'):
        child.text = force_unicode(pretty_css(container, '', child.text), 'utf-8')
    dedented = textwrap.dedent(child.text)
    child.text = '\n' + '\n'.join(
        (indent + line) if line else '' for line in dedented.splitlines())
    set_indent(child, 'text', indent)
|
|
|
|
|
|
def pretty_html_tree(container, root):
    ''' Pretty print the parsed HTML tree ``root`` in place.

    The direct children of ``root`` are separated by blank lines, <head> is
    indented with pretty_xml_tree() and every <body> with pretty_block().
    When ``container`` is not None the contents of <script> and <style> tags
    are re-indented as well.

    :param container: The container the tree belongs to, or None to skip the
        handling of <script> and <style> tags
    :param root: The root element of the parsed HTML document
    '''
    root.text = '\n\n'
    for child in root:
        child.tail = '\n\n'
        tag = child.tag
        if hasattr(tag, 'endswith') and tag.endswith('}head'):
            pretty_xml_tree(child)

    text_tags = ('pre', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6')
    for body in root.findall('h:body', namespaces=XPNSMAP):
        pretty_block(body)
        # Special case the handling of a body that contains a single block tag
        # with all content. In this case we prettify the containing block tag
        # even if it has non block children.
        if len(body) != 1:
            continue
        wrapper = body[0]
        if callable(wrapper.tag) or not isblock(wrapper):
            continue
        if has_only_blocks(wrapper):
            continue
        if barename(wrapper.tag) in text_tags or len(wrapper) == 0:
            continue
        pretty_block(wrapper, level=2)

    if container is None:
        return
    # Handle <script> and <style> tags
    for node in root.xpath('//*[local-name()="script" or local-name()="style"]'):
        pretty_script_or_style(container, node)
|
|
|
|
|
|
def fix_html(container, raw):
    ''' Fix any parsing errors in the HTML represented as a string in
    ``raw``. Fixing is done using the HTML5 parsing algorithm.

    :param container: The container whose ``parse_xhtml()`` is used
    :param raw: The HTML markup to fix
    :return: The parsed document serialized as ``text/html``
    '''
    return serialize(container.parse_xhtml(raw), 'text/html')
|
|
|
|
|
|
def pretty_html(container, name, raw):
    ''' Pretty print the HTML represented as a string in ``raw``.

    :param container: The container used to parse ``raw``
    :param name: Not used by this function
    :param raw: The HTML markup to pretty print
    :return: The prettified document serialized as ``text/html``
    '''
    tree = container.parse_xhtml(raw)
    pretty_html_tree(container, tree)
    html = serialize(tree, 'text/html')
    return html
|
|
|
|
|
|
def pretty_css(container, name, raw):
    ''' Pretty print the CSS represented as a string in ``raw``.

    :param container: The container whose ``parse_css()`` is used
    :param name: Not used by this function
    :param raw: The CSS to pretty print
    :return: The parsed stylesheet serialized as ``text/css``
    '''
    return serialize(container.parse_css(raw), 'text/css')
|
|
|
|
|
|
def pretty_xml(container, name, raw):
    ''' Pretty print the XML represented as a string in ``raw``. If ``name``
    is the name of the OPF, extra OPF-specific prettying is performed.

    :param container: The container used to parse ``raw``
    :param name: The name of the file, compared against ``container.opf_name``
    :param raw: The XML to pretty print
    :return: The prettified document serialized as ``text/xml``
    '''
    tree = container.parse_xml(raw)
    is_opf = name == container.opf_name
    if is_opf:
        pretty_opf(tree)
    pretty_xml_tree(tree)
    return serialize(tree, 'text/xml')
|
|
|
|
|
|
def fix_all_html(container):
    ''' Fix any parsing errors in all HTML files in the container. Fixing is
    done using the HTML5 parsing algorithm.

    Every file whose media type is in OEB_DOCS is parsed and then marked as
    dirty. '''
    for name, media_type in iteritems(container.mime_map):
        if media_type not in OEB_DOCS:
            continue
        container.parsed(name)
        container.dirty(name)
|
|
|
|
|
|
def pretty_all(container):
    ''' Pretty print all HTML/CSS/XML files in the container.

    HTML files are processed with pretty_html_tree(), stylesheets are only
    parsed, the OPF gets pretty_opf() followed by pretty_xml_tree() and NCX,
    XML and SVG files get pretty_xml_tree(). Every file handled this way is
    marked as dirty, all other files are ignored. '''
    xml_types = {guess_type(sample) for sample in ('a.ncx', 'a.xml', 'a.svg')}
    for name, media_type in iteritems(container.mime_map):
        if media_type in OEB_DOCS:
            pretty_html_tree(container, container.parsed(name))
        elif media_type in OEB_STYLES:
            container.parsed(name)
        elif name == container.opf_name:
            opf = container.parsed(name)
            pretty_opf(opf)
            pretty_xml_tree(opf)
        elif media_type in xml_types:
            pretty_xml_tree(container.parsed(name))
        else:
            # Not a type that can be pretty printed
            continue
        container.dirty(name)
|