mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-04-26 16:41:29 +02:00
Added epub write support
This commit is contained in:
@@ -0,0 +1,172 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import textwrap
|
||||
|
||||
from calibre import guess_type
|
||||
from calibre.utils.imghdr import identify
|
||||
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||
from polyglot.builtins import unicode_type
|
||||
from polyglot.urllib import unquote
|
||||
|
||||
|
||||
class CoverManager(object):

    '''
    Insert a title page showing the cover image at the start of the spine.

    The title page is built from one of two XHTML templates: one that wraps
    the image in an <svg> element and one that uses a plain <img> tag. An
    instance is called as ``manager(oeb, opts, log)``.
    '''

    # Placeholders: __viewbox__, __ar__, __width__, __height__ are filled with
    # str.replace(); the single %s (the image href) is filled with the %
    # operator, which is why literal percent signs are written as %%.
    SVG_TEMPLATE = textwrap.dedent('''\
            <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
                <head>
                    <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
                    <meta name="calibre:cover" content="true" />
                    <title>Cover</title>
                    <style type="text/css" title="override_css">
                        @page {padding: 0pt; margin:0pt}
                        body { text-align: center; padding:0pt; margin: 0pt; }
                    </style>
                </head>
                <body>
                    <div>
                        <svg version="1.1" xmlns="http://www.w3.org/2000/svg"
                            xmlns:xlink="http://www.w3.org/1999/xlink"
                            width="100%%" height="100%%" viewBox="__viewbox__"
                            preserveAspectRatio="__ar__">
                            <image width="__width__" height="__height__" xlink:href="%s"/>
                        </svg>
                    </div>
                </body>
            </html>
            ''')

    # Placeholders: __style__ is filled with str.replace(), %s with the image
    # href via the % operator.
    NONSVG_TEMPLATE = textwrap.dedent('''\
            <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
                <head>
                    <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
                    <meta name="calibre:cover" content="true" />
                    <title>Cover</title>
                    <style type="text/css" title="override_css">
                        @page {padding: 0pt; margin:0pt}
                        body { text-align: center; padding:0pt; margin: 0pt }
                        div { padding:0pt; margin: 0pt }
                        img { padding:0pt; margin: 0pt }
                    </style>
                </head>
                <body>
                    <div>
                        <img src="%s" alt="cover" __style__ />
                    </div>
                </body>
            </html>
            ''')

    def __init__(self, no_default_cover=False, no_svg_cover=False,
            preserve_aspect_ratio=False, fixed_size=None):
        '''
        :param no_default_cover: If True, never generate a cover for books
            that do not have one.
        :param no_svg_cover: If True, use the <img> based template instead of
            the <svg> based one.
        :param preserve_aspect_ratio: Selects the preserveAspectRatio value of
            the SVG template ('xMidYMid meet' if True, otherwise 'none').
        :param fixed_size: Optional ``(width, height)`` pair written verbatim
            into the style attribute of the <img> template.
        '''
        self.no_default_cover = no_default_cover
        self.no_svg_cover = no_svg_cover
        self.preserve_aspect_ratio = preserve_aspect_ratio

        ar = 'xMidYMid meet' if preserve_aspect_ratio else 'none'
        self.svg_template = self.SVG_TEMPLATE.replace('__ar__', ar)

        if fixed_size is None:
            # Doubled percent sign: the template is %-formatted later on
            style = 'style="height: 100%%"'
        else:
            width, height = fixed_size
            style = 'style="height: %s; width: %s"'%(height, width)
        self.non_svg_template = self.NONSVG_TEMPLATE.replace('__style__',
                style)

    def __call__(self, oeb, opts, log):
        '''Run the transform on ``oeb``, logging to ``log``.'''
        self.oeb = oeb
        self.log = log
        self.insert_cover()

    def default_cover(self):
        '''
        Create a generic cover for books that don't have a cover.

        :return: The href of the generated cover image, or None if default
            covers are disabled or generation failed.
        '''
        if self.no_default_cover:
            return None
        self.log('Generating default cover')
        m = self.oeb.metadata
        title = unicode_type(m.title[0])
        authors = [unicode_type(x) for x in m.creator if x.role == 'aut']
        try:
            from calibre.ebooks.covers import create_cover
            series = series_index = None
            if m.series:
                try:
                    series, series_index = unicode_type(m.series[0]), m.series_index[0]
                except IndexError:
                    pass
            img_data = create_cover(title, authors, series, series_index)
            item_id, href = self.oeb.manifest.generate('cover',
                    'cover_image.jpg')
            item = self.oeb.manifest.add(item_id, href, guess_type('t.jpg')[0],
                    data=img_data)
            m.clear('cover')
            m.add('cover', item.id)

            return item.href
        except Exception:
            # Best effort: a book without a cover is still a valid book
            self.log.exception('Failed to generate default cover')
        return None

    def inspect_cover(self, href):
        '''
        Return ``(width, height)`` of the manifest item whose href matches
        ``href``, or ``(-1, -1)`` if no item matches or it cannot be read.
        '''
        from calibre.ebooks.oeb.base import urlnormalize
        wanted = urlnormalize(href)
        for x in self.oeb.manifest:
            if x.href == wanted:
                try:
                    raw = x.data
                    return identify(raw)[1:]
                except Exception:
                    self.log.exception('Failed to read cover image dimensions')
        return -1, -1

    def _sized_svg_template(self, width, height):
        '''
        Return the SVG template with the viewBox and image size placeholders
        filled in. ``self.svg_template`` itself is left untouched so that the
        placeholders are still present if this manager is used again.
        '''
        templ = self.svg_template.replace('__viewbox__',
                '0 0 %d %d'%(width, height))
        templ = templ.replace('__width__', '%s'%(width,))
        return templ.replace('__height__', '%s'%(height,))

    def insert_cover(self):
        '''
        Put a title page at position 0 of the spine and point the 'cover'
        guide reference (and the 'titlepage' one, if present) at it.

        If the guide already has a 'titlepage' entry that item is reused.
        Otherwise a new titlepage.xhtml is generated around the cover image
        from the guide or, failing that, a default cover. Nothing is inserted
        when no cover image is available.
        '''
        from calibre.ebooks.oeb.base import urldefrag
        g, m = self.oeb.guide, self.oeb.manifest
        item = None
        if 'titlepage' not in g:
            if 'cover' in g:
                href = g['cover'].href
            else:
                href = self.default_cover()
            if href is None:
                return
            width, height = self.inspect_cover(href)
            if width == -1 or height == -1:
                self.log.warning('Failed to read cover dimensions')
                width, height = 600, 800

            if self.no_svg_cover:
                templ = self.non_svg_template
            else:
                templ = self._sized_svg_template(width, height)
            tp = templ%unquote(href)
            item_id, href = m.generate('titlepage', 'titlepage.xhtml')
            item = m.add(item_id, href, guess_type('t.xhtml')[0],
                    data=safe_xml_fromstring(tp))
        else:
            item = self.oeb.manifest.hrefs[
                    urldefrag(self.oeb.guide['titlepage'].href)[0]]
        if item is not None:
            self.oeb.spine.insert(0, item, True)
            if 'cover' not in self.oeb.guide.refs:
                self.oeb.guide.add('cover', 'Title Page', 'a')
            self.oeb.guide.refs['cover'].href = item.href
            if 'titlepage' in self.oeb.guide.refs:
                self.oeb.guide.refs['titlepage'].href = item.href
            titem = getattr(self.oeb.toc, 'item_that_refers_to_cover', None)
            if titem is not None:
                titem.href = item.href
|
||||
@@ -0,0 +1,187 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import posixpath
|
||||
|
||||
from lxml import etree
|
||||
|
||||
from calibre.ebooks.oeb.base import rewrite_links, urlnormalize
|
||||
from polyglot.urllib import urldefrag, urlparse
|
||||
|
||||
|
||||
class RenameFiles(object):  # {{{

    '''
    Point every link at the new name of a renamed file. Only links (in
    content, stylesheets, the guide and the ToC) are adjusted; the spine and
    the manifest themselves are not touched by this transform.
    '''

    def __init__(self, rename_map, renamed_items_map=None):
        # rename_map: old href -> new href
        # renamed_items_map: new href -> the item as it was before renaming
        self.renamed_items_map = renamed_items_map
        self.rename_map = rename_map

    def __call__(self, oeb, opts):
        import css_parser
        self.log = oeb.logger
        self.opts = opts
        self.oeb = oeb

        # Links inside parsed markup and parsed stylesheets
        for item in oeb.manifest.items:
            self.current_item = item
            if etree.iselement(item.data):
                rewrite_links(self.current_item.data, self.url_replacer)
            elif hasattr(item.data, 'cssText'):
                css_parser.replaceUrls(item.data, self.url_replacer)

        # Guide references
        if self.oeb.guide:
            for ref in self.oeb.guide.values():
                target, frag = urldefrag(urlnormalize(ref.href))
                renamed = self.rename_map.get(target, None)
                if renamed is None:
                    continue
                ref.href = (renamed + '#' + frag) if frag else renamed

        # Table of contents, recursively
        if self.oeb.toc:
            self.fix_toc_entry(self.oeb.toc)

    def fix_toc_entry(self, toc):
        '''Apply the rename map to ``toc`` and then to each of its children.'''
        if toc.href:
            target, frag = urldefrag(urlnormalize(toc.href))
            renamed = self.rename_map.get(target, None)
            if renamed is not None:
                toc.href = '#'.join((renamed, frag)) if frag else renamed

        for child in toc:
            self.fix_toc_entry(child)

    def url_replacer(self, orig_url):
        '''
        Link rewriting callback: return the replacement for ``orig_url`` as
        seen from ``self.current_item``. URLs with a scheme are returned
        unchanged.
        '''
        url = urlnormalize(orig_url)
        if urlparse(url).scheme:
            # Only rewrite local URLs
            return orig_url
        path, frag = urldefrag(url)

        # Resolve the link relative to the item this content came from
        orig_item = self.current_item
        if self.renamed_items_map:
            orig_item = self.renamed_items_map.get(self.current_item.href, self.current_item)

        href = orig_item.abshref(path)
        replacement = self.current_item.relhref(self.rename_map.get(href, href))
        if not frag:
            return replacement
        return replacement + '#' + frag

# }}}
|
||||
|
||||
|
||||
class UniqueFilenames(object):  # {{{

    'Ensure that every item in the manifest has a unique filename'

    def __call__(self, oeb, opts):
        '''
        Rename every manifest item whose basename was already used by an
        earlier item, then rewrite links via RenameFiles.
        '''
        self.log = oeb.logger
        self.opts = opts
        self.oeb = oeb

        self.seen_filenames = set()
        self.rename_map = {}

        for item in list(oeb.manifest.items):
            fname = posixpath.basename(item.href)
            if fname not in self.seen_filenames:
                self.seen_filenames.add(fname)
                continue

            # Basename collision: re-add the item under a suffixed href
            suffix = self.unique_suffix(fname)
            data = item.data
            stem, extension = posixpath.splitext(item.href)
            nhref = oeb.manifest.generate(href=stem + suffix + extension)[1]
            spine_pos = item.spine_position
            oeb.manifest.remove(item)
            nitem = oeb.manifest.add(item.id, nhref, item.media_type, data=data,
                    fallback=item.fallback)
            self.seen_filenames.add(posixpath.basename(nhref))
            self.rename_map[item.href] = nhref
            if spine_pos is not None:
                oeb.spine.insert(spine_pos, nitem, item.linear)

        if not self.rename_map:
            return

        self.log('Found non-unique filenames, renaming to support broken'
                ' EPUB readers like FBReader, Aldiko and Stanza...')
        from pprint import pformat
        self.log.debug(pformat(self.rename_map))

        renamer = RenameFiles(self.rename_map)
        renamer(oeb, opts)

    def unique_suffix(self, fname):
        '''
        Return the first suffix ``_u1``, ``_u2``, ... for which
        ``<base><suffix><ext>`` is not in ``self.seen_filenames``.
        '''
        stem, extension = posixpath.splitext(fname)
        counter = 1
        while (stem + '_u%d'%counter + extension) in self.seen_filenames:
            counter += 1
        return '_u%d'%counter
# }}}
|
||||
|
||||
|
||||
class FlatFilenames(object):  # {{{

    'Ensure that every item in the manifest has a unique filename without subdirectories.'

    def __call__(self, oeb, opts):
        '''
        Re-add every manifest item whose href contains a "/" under a flattened
        href, then rewrite links via RenameFiles.
        '''
        self.log = oeb.logger
        self.opts = opts
        self.oeb = oeb

        self.rename_map = {}
        self.renamed_items_map = {}

        for item in list(oeb.manifest.items):
            # Flatten URL by removing directories.
            # Example: a/b/c/index.html -> a_b_c_index.html
            flattened = item.href.replace("/", "_")
            if flattened == item.href:
                # URL hasn't changed, skip item.
                continue

            data = item.data
            isp = item.spine_position
            nhref = oeb.manifest.generate(href=flattened)[1]
            if isp is not None:
                oeb.spine.remove(item)
            oeb.manifest.remove(item)

            nitem = oeb.manifest.add(item.id, nhref, item.media_type, data=data,
                    fallback=item.fallback)
            self.rename_map[item.href] = nhref
            # Keep the old item so links can still be resolved against its
            # original location
            self.renamed_items_map[nhref] = item
            if isp is not None:
                oeb.spine.insert(isp, nitem, item.linear)

        if not self.rename_map:
            return

        self.log('Found non-flat filenames, renaming to support broken'
                ' EPUB readers like FBReader...')
        from pprint import pformat
        self.log.debug(pformat(self.rename_map))
        self.log.debug(pformat(self.renamed_items_map))

        renamer = RenameFiles(self.rename_map, self.renamed_items_map)
        renamer(oeb, opts)
# }}}
|
||||
@@ -0,0 +1,81 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
from calibre import fit_image
|
||||
|
||||
|
||||
class RescaleImages(object):

    'Rescale all images to fit inside given screen size'

    def __init__(self, check_colorspaces=False):
        # When True, CMYK images are converted to RGB
        self.check_colorspaces = check_colorspaces

    def __call__(self, oeb, opts):
        '''Run the transform on ``oeb`` using the conversion options ``opts``.'''
        self.oeb, self.opts, self.log = oeb, opts, oeb.log
        self.rescale()

    def rescale(self):
        '''
        Re-encode every raster image in the manifest that is larger than the
        target page, or that had to be converted from CMYK to RGB.

        The target page is ``opts.dest.comic_screen_size`` for image
        collections, otherwise the destination width/height minus the margins
        (margins are scaled by dpi/72). Images that cannot be opened,
        resized or saved are left unchanged.
        '''
        from PIL import Image
        from io import BytesIO

        is_image_collection = getattr(self.opts, 'is_image_collection', False)

        if is_image_collection:
            page_width, page_height = self.opts.dest.comic_screen_size
        else:
            page_width, page_height = self.opts.dest.width, self.opts.dest.height
            page_width -= (self.opts.margin_left + self.opts.margin_right) * self.opts.dest.dpi/72
            page_height -= (self.opts.margin_top + self.opts.margin_bottom) * self.opts.dest.dpi/72

        for item in self.oeb.manifest:
            if not item.media_type.startswith('image'):
                continue
            # Output format is derived from the media type; anything that is
            # not PNG/JPEG/GIF is written as JPEG
            ext = item.media_type.split('/')[-1].upper()
            if ext == 'JPG':
                ext = 'JPEG'
            if ext not in ('PNG', 'JPEG', 'GIF'):
                ext = 'JPEG'

            raw = item.data
            if hasattr(raw, 'xpath') or not raw:
                # Probably an svg image
                continue
            try:
                img = Image.open(BytesIO(raw))
            except Exception:
                continue
            width, height = img.size

            converted = False
            try:
                if self.check_colorspaces and img.mode == 'CMYK':
                    self.log.warn(
                        'The image %s is in the CMYK colorspace, converting it '
                        'to RGB as Adobe Digital Editions cannot display CMYK' % item.href)
                    img = img.convert('RGB')
                    converted = True
            except Exception:
                self.log.exception('Failed to convert image %s from CMYK to RGB' % item.href)

            scaled, new_width, new_height = fit_image(width, height, page_width, page_height)
            if scaled:
                new_width = max(1, new_width)
                new_height = max(1, new_height)
                self.log('Rescaling image from %dx%d to %dx%d'%(
                    width, height, new_width, new_height), item.href)
                try:
                    img = img.resize((new_width, new_height))
                except Exception:
                    self.log.exception('Failed to rescale image: %s' % item.href)
                    continue
            elif not converted:
                # Neither resized nor converted: keep the original bytes
                continue

            # Write back whenever the image changed, so that a colorspace
            # conversion is not lost for images that already fit the page
            buf = BytesIO()
            try:
                img.save(buf, ext)
            except Exception:
                if scaled:
                    self.log.exception('Failed to rescale image: %s' % item.href)
                else:
                    self.log.exception('Failed to convert image %s from CMYK to RGB' % item.href)
            else:
                item.data = buf.getvalue()
                item.unload_data_from_memory()
|
||||
@@ -0,0 +1,488 @@
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
'''
|
||||
Splitting of the XHTML flows. Splitting can happen on page boundaries or can be
|
||||
forced at "likely" locations to conform to size limitations. This transform
|
||||
assumes a prior call to the flatcss transform.
|
||||
'''
|
||||
|
||||
import os, functools, collections, re, copy
|
||||
from collections import OrderedDict
|
||||
|
||||
from lxml.etree import XPath as _XPath
|
||||
from lxml import etree
|
||||
|
||||
from calibre import as_unicode, force_unicode
|
||||
from calibre.ebooks.epub import rules
|
||||
from calibre.ebooks.oeb.base import (OEB_STYLES, XPNSMAP as NAMESPACES,
|
||||
urldefrag, rewrite_links, XHTML, urlnormalize)
|
||||
from calibre.ebooks.oeb.polish.split import do_split
|
||||
from polyglot.builtins import iteritems, range, map, unicode_type
|
||||
from polyglot.urllib import unquote
|
||||
from css_selectors import Select, SelectorError
|
||||
|
||||
# lxml XPath constructor with the OEB namespace map (imported above as
# NAMESPACES) pre-bound, so expressions can use prefixes such as h:
XPath = functools.partial(_XPath, namespaces=NAMESPACES)

# Attribute set to '1' on elements that have already been picked as a split
# point; it is stripped again when the split files are committed
SPLIT_POINT_ATTR = 'csp'
|
||||
|
||||
|
||||
def tostring(root):
    '''Serialize ``root`` with lxml, requesting the utf-8 encoding.'''
    serialized = etree.tostring(root, encoding='utf-8')
    return serialized
|
||||
|
||||
|
||||
class SplitError(ValueError):

    '''
    Raised when no acceptable split point can be found. The message names
    ``path`` and the serialized size of ``root`` in KB.
    '''

    def __init__(self, path, root):
        size_kb = len(tostring(root))/1024.
        template = _('Could not find reasonable point at which to split: '
                     '%(path)s Sub-tree size: %(size)d KB')
        message = template%{'path': path, 'size': size_kb}
        ValueError.__init__(self, message)
|
||||
|
||||
|
||||
class Split(object):

    '''
    Split the XHTML items of the spine on page breaks and/or so that no
    resulting file is larger than ``max_flow_size``, then fix up all links
    that pointed into the files that were split.
    '''

    def __init__(self, split_on_page_breaks=True, page_breaks_xpath=None,
            max_flow_size=0, remove_css_pagebreaks=True):
        '''
        :param split_on_page_breaks: If True, split at page breaks.
        :param page_breaks_xpath: Optional XPath expression selecting the
            page break elements. If None, page breaks are collected from the
            page-break-before/after properties of the book's stylesheets.
        :param max_flow_size: Maximum serialized size of a file; values <= 0
            disable size based splitting.
        :param remove_css_pagebreaks: If True, the page-break properties that
            were used are removed from the stylesheets.
        '''
        self.split_on_page_breaks = split_on_page_breaks
        self.page_breaks_xpath = page_breaks_xpath
        self.max_flow_size = max_flow_size
        self.page_break_selectors = None
        self.remove_css_pagebreaks = remove_css_pagebreaks
        if self.page_breaks_xpath is not None:
            # NOTE(review): entries created here hold a compiled XPath, while
            # the entries collected from CSS in find_page_breaks() hold
            # selector strings; both are later passed to Select -- confirm
            # that Select accepts compiled XPath objects.
            self.page_break_selectors = [(XPath(self.page_breaks_xpath), False)]

    def __call__(self, oeb, opts):
        '''Run the transform on ``oeb``.'''
        self.oeb = oeb
        self.log = oeb.log
        self.log('Splitting markup on page breaks and flow limits, if any...')
        self.opts = opts
        # href of a split item -> anchor map of the files it was split into
        self.map = {}
        for item in list(self.oeb.manifest.items):
            if item.spine_position is not None and etree.iselement(item.data):
                self.split_item(item)

        self.fix_links()

    def split_item(self, item):
        '''Split ``item`` and, if it was split, record its anchor map.'''
        page_breaks, page_break_ids = [], []
        if self.split_on_page_breaks:
            page_breaks, page_break_ids = self.find_page_breaks(item)

        splitter = FlowSplitter(item, page_breaks, page_break_ids,
                self.max_flow_size, self.oeb, self.opts)
        if splitter.was_split:
            am = splitter.anchor_map
            self.map[item.href] = collections.defaultdict(
                    am.default_factory, am)

    def find_page_breaks(self, item):
        '''
        Locate the page break elements inside the <body> of ``item``.

        Every page break element is given an id if it lacks one.

        :return: ``(page_breaks, page_break_ids)``: a list of
            ``(xpath, before)`` pairs in document order, where ``xpath``
            matches the element by id and ``before`` says whether to break
            before it, plus the parallel list of ids.
        '''
        if self.page_break_selectors is None:
            # First call: collect the selectors of all CSS rules that force a
            # page break. The result is reused for every subsequent item.
            self.page_break_selectors = set()
            stylesheets = [x.data for x in self.oeb.manifest if x.media_type in
                    OEB_STYLES]
            for rule in rules(stylesheets):
                before = force_unicode(getattr(rule.style.getPropertyCSSValue(
                    'page-break-before'), 'cssText', '').strip().lower())
                after = force_unicode(getattr(rule.style.getPropertyCSSValue(
                    'page-break-after'), 'cssText', '').strip().lower())
                try:
                    if before and before not in {'avoid', 'auto', 'inherit'}:
                        self.page_break_selectors.add((rule.selectorText, True))
                        if self.remove_css_pagebreaks:
                            rule.style.removeProperty('page-break-before')
                except Exception:
                    pass
                try:
                    if after and after not in {'avoid', 'auto', 'inherit'}:
                        self.page_break_selectors.add((rule.selectorText, False))
                        if self.remove_css_pagebreaks:
                            rule.style.removeProperty('page-break-after')
                except Exception:
                    pass
        if not self.page_break_selectors:
            return [], []
        body = item.data.xpath('//h:body', namespaces=NAMESPACES)
        if not body:
            return [], []
        descendants = frozenset(body[0].iterdescendants('*'))
        page_breaks = set()
        select = Select(item.data)

        for selector, before in self.page_break_selectors:
            try:
                for elem in select(selector):
                    if elem in descendants and elem.tag.rpartition('}')[2].lower() not in {'html', 'body', 'head', 'style', 'script', 'meta', 'link'}:
                        elem.set('pb_before', '1' if before else '0')
                        page_breaks.add(elem)
            except SelectorError as err:
                self.log.warn('Ignoring page breaks specified with invalid CSS selector: %r (%s)' % (selector, as_unicode(err)))

        # Number all elements so the page breaks can be put in document order
        for i, elem in enumerate(item.data.iter('*')):
            try:
                elem.set('pb_order', unicode_type(i))
            except TypeError:  # Can't set attributes on comment nodes etc.
                continue

        page_breaks = list(page_breaks)
        page_breaks.sort(key=lambda x:int(x.get('pb_order')))
        page_break_ids, page_breaks_ = [], []
        for i, x in enumerate(page_breaks):
            x.set('id', x.get('id', 'calibre_pb_%d'%i))
            pb_id = x.get('id')
            try:
                xp = XPath('//*[@id="%s"]'%pb_id)
            except Exception:
                try:
                    xp = XPath("//*[@id='%s']"%pb_id)
                except Exception:
                    # The id contains both a quote and an apostrophe or is
                    # otherwise unusable in an XPath expression. Just replace
                    # it since it is unlikely to work anywhere else either.
                    pb_id = 'calibre_pb_%d'%i
                    x.set('id', pb_id)
                    # The generated id contains no quotes, so plain
                    # interpolation is safe (%r would yield u'...' on
                    # python 2, which is not valid XPath)
                    xp = XPath('//*[@id="%s"]'%pb_id)
            page_breaks_.append((xp, x.get('pb_before', '0') == '1'))
            page_break_ids.append(pb_id)

        # Remove the temporary bookkeeping attributes again
        for elem in item.data.iter(etree.Element):
            elem.attrib.pop('pb_order', False)
            elem.attrib.pop('pb_before', False)

        return page_breaks_, page_break_ids

    def fix_links(self):
        '''
        Fix references to the split files in other content files.
        '''
        for item in self.oeb.manifest:
            if etree.iselement(item.data):
                self.current_item = item
                rewrite_links(item.data, self.rewrite_links)

    def rewrite_links(self, url):
        '''
        Link rewriting callback: if ``url`` points into a file that was
        split, return the href of the split file containing its target
        (relative to ``self.current_item``), otherwise return ``url``.
        '''
        href, frag = urldefrag(url)
        try:
            href = self.current_item.abshref(href)
        except ValueError:
            # Unparseable URL
            return url
        try:
            href = urlnormalize(href)
        except ValueError:
            # href has non utf-8 quoting
            return url
        if href in self.map:
            anchor_map = self.map[href]
            # Unknown anchors fall back to the map's default value
            nhref = anchor_map[frag if frag else None]
            nhref = self.current_item.relhref(nhref)
            if frag:
                nhref = '#'.join((unquote(nhref), frag))

            return nhref
        return url
|
||||
|
||||
|
||||
class FlowSplitter(object):
    'The actual splitting logic'

    def __init__(self, item, page_breaks, page_break_ids, max_flow_size, oeb,
            opts):
        '''
        Split ``item`` and commit the result to ``oeb``. Afterwards
        ``was_split`` tells whether the item was split and, if so,
        ``anchor_map`` maps each anchor to the file that now contains it.

        :param page_breaks: List of ``(xpath, before)`` pairs.
        :param page_break_ids: Ids of the page break elements, parallel to
            ``page_breaks``.
        :param max_flow_size: Maximum serialized size of a file; values <= 0
            disable size based splitting.
        '''
        self.item = item
        self.oeb = oeb
        self.opts = opts
        self.log = oeb.log
        self.page_breaks = page_breaks
        self.page_break_ids = page_break_ids
        self.max_flow_size = max_flow_size
        self.base = item.href
        self.csp_counter = 0

        # Template for the names of the split files: <base>_split_NNN<ext>.
        # Literal percent signs in the href are escaped as it is %-formatted.
        base, ext = os.path.splitext(self.base)
        self.base = base.replace('%', '%%')+'_split_%.3d'+ext

        self.trees = [self.item.data.getroottree()]
        self.splitting_on_page_breaks = True
        if self.page_breaks:
            self.split_on_page_breaks(self.trees[0])
        self.splitting_on_page_breaks = False

        if self.max_flow_size > 0:
            lt_found = False
            self.log('\tLooking for large trees in %s...'%item.href)
            trees = list(self.trees)
            self.tree_map = {}
            for i, tree in enumerate(trees):
                size = len(tostring(tree.getroot()))
                if size > self.max_flow_size:
                    self.log('\tFound large tree #%d'%i)
                    lt_found = True
                    self.split_trees = []
                    self.split_to_size(tree)
                    self.tree_map[tree] = self.split_trees
            if not lt_found:
                self.log('\tNo large trees found')
            # Replace every large tree by the trees it was split into
            self.trees = []
            for x in trees:
                self.trees.extend(self.tree_map.get(x, [x]))

        self.was_split = len(self.trees) > 1
        if self.was_split:
            self.log('\tSplit into %d parts'%len(self.trees))
        self.commit()

    def split_on_page_breaks(self, orig_tree):
        '''
        Split ``orig_tree`` at every page break, storing the resulting
        non-empty trees in ``self.trees``.
        '''
        # Page breaks keyed by id, in the order the ids occur in the document
        ordered_ids = OrderedDict()
        all_page_break_ids = frozenset(self.page_break_ids)
        for elem_id in orig_tree.xpath('//*/@id'):
            if elem_id in all_page_break_ids:
                ordered_ids[elem_id] = self.page_breaks[
                    self.page_break_ids.index(elem_id)]

        self.trees = [orig_tree]
        while ordered_ids:
            pb_id, (pattern, before) = next(iteritems(ordered_ids))
            del ordered_ids[pb_id]
            # Search from the last tree backwards for the one that contains
            # this page break
            for i in range(len(self.trees)-1, -1, -1):
                tree = self.trees[i]
                elem = pattern(tree)
                if elem:
                    self.log.debug('\t\tSplitting on page-break at id=%s'%
                                elem[0].get('id'))
                    before_tree, after_tree = self.do_split(tree, elem[0], before)
                    self.trees[i:i+1] = [before_tree, after_tree]
                    break

        # Drop empty pages, re-creating the ids they contained in the next
        # non-empty page so that links to them keep working
        trees, ids = [], set()
        for tree in self.trees:
            root = tree.getroot()
            if self.is_page_empty(root):
                discarded_ids = root.xpath('//*[@id]')
                for x in discarded_ids:
                    x = x.get('id')
                    if not x.startswith('calibre_'):
                        ids.add(x)
            else:
                if ids:
                    body = self.get_body(root)
                    if body is not None:
                        existing_ids = frozenset(body.xpath('//*/@id'))
                        for x in ids - existing_ids:
                            body.insert(0, body.makeelement(XHTML('div'), id=x, style='height:0pt'))
                ids = set()
                trees.append(tree)
        self.trees = trees

    def get_body(self, root):
        '''Return the first <body> element found from ``root``, or None.'''
        body = root.xpath('//h:body', namespaces=NAMESPACES)
        if not body:
            return None
        return body[0]

    def do_split(self, tree, split_point, before):
        '''
        Split ``tree`` into a *before* and *after* tree at ``split_point``.

        :param before: If True tree is split before split_point, otherwise after split_point
        :return: before_tree, after_tree
        '''
        return do_split(split_point, self.log, before=before)

    def is_page_empty(self, root):
        '''
        Return True if the page has a <body> with at most one character of
        text other than whitespace/nbsp, no <img> (other than ones whose
        style is exactly ``display:none``) and no <svg>.
        '''
        body = self.get_body(root)
        if body is None:
            return False
        txt = re.sub(r'\s+|\xa0', '',
                etree.tostring(body, method='text', encoding='unicode'))
        if len(txt) > 1:
            return False
        for img in root.xpath('//h:img', namespaces=NAMESPACES):
            if img.get('style', '') != 'display:none':
                return False
        if root.xpath('//*[local-name() = "svg"]'):
            return False
        return True

    def split_text(self, text, root, size):
        '''
        Split ``text`` at blank lines into chunks, greedily packing
        consecutive paragraphs into one chunk while it stays below ``size``.

        :param root: Only used for the error that is raised.
        :return: Non-empty list of chunks; every paragraph of ``text`` is in
            exactly one chunk. Paragraphs inside a chunk are separated by a
            blank line.
        :raises SplitError: If a single paragraph is longer than ``size``.
        '''
        self.log.debug('\t\t\tSplitting text of length: %d'%len(text))
        rest = text.replace('\r', '')
        parts = re.split('\n\n', rest)
        self.log.debug('\t\t\t\tFound %d parts'%len(parts))
        if max(map(len, parts)) > size:
            raise SplitError('Cannot split as file contains a <pre> tag '
                'with a very large paragraph', root)
        ans = []
        buf = None  # None: no paragraph has been placed in the chunk yet
        for part in parts:
            if buf is None:
                buf = part
            elif len(buf) + len(part) < size:
                buf += '\n\n'+part
            else:
                ans.append(buf)
                buf = part
        if buf is not None:
            # The chunk still being filled must not be dropped
            ans.append(buf)
        return ans

    def split_to_size(self, tree):
        '''
        Recursively split ``tree`` until every part is no larger than
        ``self.max_flow_size``, appending the parts to ``self.split_trees``.

        :raises SplitError: If no split point can be found.
        '''
        self.log.debug('\t\tSplitting...')
        root = tree.getroot()
        # Split large <pre> tags if they contain only text
        for pre in XPath('//h:pre')(root):
            if len(tuple(pre.iterchildren(etree.Element))) > 0:
                continue
            if pre.text and len(pre.text) > self.max_flow_size*0.5:
                self.log.debug('\t\tSplitting large <pre> tag')
                frags = self.split_text(pre.text, root, int(0.2*self.max_flow_size))
                new_pres = []
                for frag in frags:
                    pre2 = copy.copy(pre)
                    pre2.text = frag
                    pre2.tail = ''
                    new_pres.append(pre2)
                new_pres[-1].tail = pre.tail
                p = pre.getparent()
                i = p.index(pre)
                p[i:i+1] = new_pres

        split_point, before = self.find_split_point(root)
        if split_point is None:
            raise SplitError(self.item.href, root)
        self.log.debug('\t\t\tSplit point:', split_point.tag, tree.getpath(split_point))

        trees = self.do_split(tree, split_point, before)
        sizes = [len(tostring(t.getroot())) for t in trees]
        if min(sizes) < 5*1024:
            # Retry: find_split_point() has marked this split point, so the
            # next attempt picks a different element
            self.log.debug('\t\t\tSplit tree too small')
            self.split_to_size(tree)
            return

        for t, size in zip(trees, sizes):
            r = t.getroot()
            if self.is_page_empty(r):
                continue
            elif size <= self.max_flow_size:
                self.split_trees.append(t)
                self.log.debug(
                    '\t\t\tCommitted sub-tree #%d (%d KB)'%(
                               len(self.split_trees), size/1024.))
            else:
                self.log.debug(
                        '\t\t\tSplit tree still too large: %d KB' % (size/1024.))
                self.split_to_size(t)

    def find_split_point(self, root):
        '''
        Find the tag at which to split the tree rooted at `root`.
        Search order is:
            * Heading tags
            * <div> tags that are direct children of <body>
            * <pre> tags
            * <hr> tags
            * <p> tags
            * <div> tags
            * <br> tags
            * <li> tags

        We try to split in the "middle" of the file (as defined by tag counts).

        :return: ``(element, True)`` or ``(None, True)`` if nothing suitable
            was found.
        '''
        def pick_elem(elems):
            # Middle element among those not already used as a split point
            if elems:
                elems = [i for i in elems if i.get(SPLIT_POINT_ATTR, '0') !=
                        '1']
                if elems:
                    i = int(len(elems)//2)
                    elems[i].set(SPLIT_POINT_ATTR, '1')
                    return elems[i]

        for path in (
                '//*[re:match(name(), "h[1-6]", "i")]',
                '/h:html/h:body/h:div',
                '//h:pre',
                '//h:hr',
                '//h:p',
                '//h:div',
                '//h:br',
                '//h:li',
        ):
            elems = root.xpath(path, namespaces=NAMESPACES)
            elem = pick_elem(elems)
            if elem is not None:
                try:
                    # Skip elements whose path does not compile as an XPath
                    XPath(elem.getroottree().getpath(elem))
                except Exception:
                    continue
                return elem, True

        return None, True

    def _redirect(self, target):
        '''
        Return the href ``target`` should have after the split, or None if
        ``target`` does not point into the item being split.
        '''
        href, frag = urldefrag(target)
        if href != self.item.href:
            return None
        # Unknown anchors fall back to the first split file
        nhref = self.anchor_map[frag if frag else None]
        if frag:
            nhref = '#'.join((nhref, frag))
        return nhref

    def commit(self):
        '''
        Commit all changes caused by the split. Calculates an *anchor_map* for
        all anchors in the original tree. Internal links are re-directed. The
        original file is deleted and the split files are saved.
        '''
        if not self.was_split:
            return
        self.anchor_map = collections.defaultdict(lambda :self.base%0)
        self.files = []

        for i, tree in enumerate(self.trees):
            root = tree.getroot()
            self.files.append(self.base%i)
            # The first file containing an anchor wins
            for elem in root.xpath('//*[@id or @name]'):
                for anchor in elem.get('id', ''), elem.get('name', ''):
                    if anchor != '' and anchor not in self.anchor_map:
                        self.anchor_map[anchor] = self.files[-1]
            for elem in root.xpath('//*[@%s]'%SPLIT_POINT_ATTR):
                elem.attrib.pop(SPLIT_POINT_ATTR, '0')

        spine_pos = self.item.spine_position

        # Every file is inserted at the same spine position, so going through
        # them in reverse leaves them in the right order
        for current, tree in zip(*map(reversed, (self.files, self.trees))):
            for a in tree.getroot().xpath('//h:a[@href]', namespaces=NAMESPACES):
                href = a.get('href').strip()
                if href.startswith('#'):
                    anchor = href[1:]
                    target = self.anchor_map[anchor]
                    target = self.item.relhref(target)
                    if target != current:
                        a.set('href', target+href)

            new_id = self.oeb.manifest.generate(id=self.item.id)[0]
            new_item = self.oeb.manifest.add(new_id, current,
                    self.item.media_type, data=tree.getroot())
            self.oeb.spine.insert(spine_pos, new_item, self.item.linear)

        if self.oeb.guide:
            for ref in self.oeb.guide.values():
                nhref = self._redirect(ref.href)
                if nhref is not None:
                    ref.href = nhref

        def fix_toc_entry(toc):
            if toc.href:
                nhref = self._redirect(toc.href)
                if nhref is not None:
                    toc.href = nhref
            for x in toc:
                fix_toc_entry(x)

        if self.oeb.toc:
            fix_toc_entry(self.oeb.toc)

        if self.oeb.pages:
            for page in self.oeb.pages:
                nhref = self._redirect(page.href)
                if nhref is not None:
                    page.href = nhref

        self.oeb.manifest.remove(self.item)
|
||||
Reference in New Issue
Block a user