1
0
mirror of https://github.com/gryf/ebook-converter.git synced 2025-12-28 04:02:27 +01:00

Added epub write support

This commit is contained in:
2020-04-13 12:46:37 +02:00
parent 9f18513787
commit 79cad46732
9 changed files with 3049 additions and 0 deletions

View File

@@ -0,0 +1,762 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
import re, random, unicodedata, numbers
from collections import namedtuple
from contextlib import contextmanager
from math import ceil, sqrt, cos, sin, atan2
from polyglot.builtins import iteritems, itervalues, map, zip, string_or_bytes
from itertools import chain
from PyQt5.Qt import (
QImage, Qt, QFont, QPainter, QPointF, QTextLayout, QTextOption,
QFontMetrics, QTextCharFormat, QColor, QRect, QBrush, QLinearGradient,
QPainterPath, QPen, QRectF, QTransform, QRadialGradient
)
from calibre import force_unicode, fit_image
from calibre.constants import __appname__, __version__
from calibre.ebooks.metadata import fmt_sidx
from calibre.ebooks.metadata.book.base import Metadata
from calibre.ebooks.metadata.book.formatter import SafeFormat
from calibre.gui2 import ensure_app, config, load_builtin_fonts, pixmap_to_data
from calibre.utils.cleantext import clean_ascii_chars, clean_xml_chars
from calibre.utils.config import JSONConfig
# Default settings {{{
# Persistent preferences for generated covers. The values assigned to
# ``cprefs.defaults`` below are the built-in fallbacks for each key.
cprefs = JSONConfig('cover_generation')
# Font sizes and cover dimensions are all expressed in pixels.
cprefs.defaults['title_font_size'] = 120 # px
cprefs.defaults['subtitle_font_size'] = 80 # px
cprefs.defaults['footer_font_size'] = 80 # px
cprefs.defaults['cover_width'] = 1200 # px
cprefs.defaults['cover_height'] = 1600 # px
# A font family of None makes layout_text() fall back to a Liberation font.
cprefs.defaults['title_font_family'] = None
cprefs.defaults['subtitle_font_family'] = None
cprefs.defaults['footer_font_family'] = None
# Extra color themes (name -> theme mapping); load_color_themes() merges
# these over the built-in themes.
cprefs.defaults['color_themes'] = {}
# Names of color themes / styles that must not be picked when generating.
cprefs.defaults['disabled_color_themes'] = []
cprefs.defaults['disabled_styles'] = []
# Templates rendered against the book metadata by format_fields(). The
# markup they emit (<b>, <i>, <br>) is interpreted by the text layout code.
cprefs.defaults['title_template'] = '<b>{title}'
cprefs.defaults['subtitle_template'] = '''{series:'test($, strcat("<i>", $, "</i> - ", raw_field("formatted_series_index")), "")'}'''
cprefs.defaults['footer_template'] = r'''program:
# Show at most two authors, on separate lines.
authors = field('authors');
num = count(authors, ' &amp; ');
authors = sublist(authors, 0, 2, ' &amp; ');
authors = list_re(authors, ' &amp; ', '(.+)', '<b>\1');
authors = re(authors, ' &amp; ', '<br>');
re(authors, '&amp;&amp;', '&amp;')
'''
# Immutable snapshot type with one field per preference key (sorted by name).
Prefs = namedtuple('Prefs', ' '.join(sorted(cprefs.defaults)))
# Explicit override for roman-numeral series numbering. ``None`` means no
# override has been set, so the GUI configuration decides.
_use_roman = None


def get_use_roman():
    """Return whether series numbers should be shown as roman numerals.

    An override installed with :func:`set_use_roman` takes precedence;
    otherwise the ``use_roman_numerals_for_series_number`` GUI setting is
    returned.
    """
    override = _use_roman
    if override is not None:
        return override
    return config['use_roman_numerals_for_series_number']


def set_use_roman(val):
    """Install an override for :func:`get_use_roman`; *val* is coerced to bool."""
    global _use_roman
    _use_roman = True if val else False
# }}}
# Draw text {{{
# Lightweight (x, y) pair used to record where a text block is positioned.
Point = namedtuple('Point', 'x y')
def parse_text_formatting(text):
    """Strip simple HTML-like tags from *text* and compute formatting ranges.

    Every ``<tag>``, ``</tag>`` or ``<tag/>`` is removed from the text. Only
    ``b``/``strong`` (bold) and ``i``/``em`` (italic) produce formatting; all
    other tags are dropped without any effect on the formatting ranges.

    :param text: Text that may contain tags and the entities ``&amp;``,
        ``&lt;`` and ``&gt;``.
    :return: ``(text, formats)`` where ``text`` has the tags removed (the
        entities are left in place for the caller to unescape) and
        ``formats`` is a list of ``QTextLayout.FormatRange`` objects whose
        offsets refer to the text *after* entity unescaping.
    """
    bold_tags = frozenset(('b', 'strong'))
    italic_tags = frozenset(('i', 'em'))
    formatting_tags = bold_tags | italic_tags

    def laid_out_length(chunk):
        # Ranges are applied to the unescaped text, so every entity has to be
        # counted as the single character it becomes. The replacement order
        # mirrors unescape_formatting().
        return len(chunk.replace('&lt;', '<').replace('&gt;', '>').replace('&amp;', '&'))

    # Pass 1: split the input into (is_tag, payload) tokens
    pos = 0
    tokens = []
    for m in re.finditer(r'</?([a-zA-Z1-6]+)/?>', text):
        q = text[pos:m.start()]
        if q:
            tokens.append((False, q))
        tokens.append((True, (m.group(1).lower(), '/' in m.group()[:2])))
        pos = m.end()
    if tokens:
        if text[pos:]:
            tokens.append((False, text[pos:]))
    else:
        tokens = [(False, text)]

    # Pass 2: rebuild the tag-free text while tracking [tag, start, length]
    ranges, open_ranges, pieces = [], [], []
    offset = 0
    for is_tag, tok in tokens:
        if not is_tag:
            offset += laid_out_length(tok)
            pieces.append(tok)
            continue
        tag, closing = tok
        if tag not in formatting_tags:
            # An unknown tag must neither open nor close a formatting range,
            # otherwise e.g. </u> would terminate an enclosing <b>.
            continue
        if closing:
            if open_ranges:
                r = open_ranges.pop()
                r[-1] = offset - r[-2]
                if r[-1] > 0:
                    ranges.append(r)
        else:
            open_ranges.append([tag, offset, -1])
    text = ''.join(pieces)
    total_length = offset

    # Pass 3: convert the ranges into Qt format ranges
    formats = []
    for tag, start, length in chain(ranges, open_ranges):
        if length == -1:
            # Never closed: the format runs to the end of the text
            length = total_length - start
        if length <= 0:
            continue
        fmt = QTextCharFormat()
        if tag in bold_tags:
            fmt.setFontWeight(QFont.Bold)
        else:
            fmt.setFontItalic(True)
        r = QTextLayout.FormatRange()
        r.format = fmt
        r.start, r.length = start, length
        formats.append(r)
    return text, formats
class Block(object):
    """A vertically stacked group of laid-out text paragraphs.

    The input text is split on ``<br>``; each piece becomes its own
    ``QTextLayout``. ``self.layouts`` holds those layouts interleaved with
    plain numbers, which are vertical gaps (the font leading) between them.
    """

    def __init__(self, text='', width=0, font=None, img=None, max_height=100, align=Qt.AlignCenter):
        """Lay out *text* in lines of *width*, using at most *max_height*.

        :param text: Markup text; ``<br>`` separates paragraphs.
        :param width: Line width passed to every created line.
        :param font: Font for the text; when None, leading and line spacing stay 0.
        :param img: Paint device handed to ``QFontMetrics`` and ``QTextLayout``.
        :param max_height: Vertical budget shared by all paragraphs.
        :param align: Alignment flags used for the text option.
        """
        self.layouts = []
        self._position = Point(0, 0)
        self.leading = self.line_spacing = 0
        if font is not None:
            fm = QFontMetrics(font, img)
            self.leading = fm.leading()
            self.line_spacing = fm.lineSpacing()
        for text in text.split('<br>') if text else ():
            text, formats = parse_text_formatting(sanitize(text))
            l = QTextLayout(unescape_formatting(text), font, img)
            l.setAdditionalFormats(formats)
            to = QTextOption(align)
            to.setWrapMode(QTextOption.WrapAtWordBoundaryOrAnywhere)
            l.setTextOption(to)

            l.beginLayout()
            height = 0
            # Keep adding lines while at least three leadings of budget remain
            while height + 3*self.leading < max_height:
                line = l.createLine()
                if not line.isValid():
                    break
                line.setLineWidth(width)
                height += self.leading
                line.setPosition(QPointF(0, height))
                height += line.height()
            # The height used by this paragraph is no longer available to the next
            max_height -= height
            l.endLayout()
            if self.layouts:
                # Gap between this paragraph and the previous one
                self.layouts.append(self.leading)
            else:
                self._position = Point(l.position().x(), l.position().y())
            self.layouts.append(l)
        if self.layouts:
            # Trailing gap after the last paragraph
            self.layouts.append(self.leading)

    @property
    def height(self):
        """Total height: layout bounding rects plus the numeric gaps, rounded up."""
        return int(ceil(sum(l if isinstance(l, numbers.Number) else l.boundingRect().height() for l in self.layouts)))

    @property
    def position(self):
        """Top-left position of the block as a ``Point``."""
        return self._position

    @position.setter
    def position(self, new_pos):
        """Move the block to ``(x, y)`` and stack the layouts downwards from there."""
        (x, y) = new_pos
        self._position = Point(x, y)
        if self.layouts:
            self.layouts[0].setPosition(QPointF(x, y))
            y += self.layouts[0].boundingRect().height()
            for l in self.layouts[1:]:
                if isinstance(l, numbers.Number):
                    # A gap entry: just advance the cursor
                    y += l
                else:
                    l.setPosition(QPointF(x, y))
                    y += l.boundingRect().height()

    def draw(self, painter):
        """Paint every layout with *painter*, using its current pen for the text."""
        for l in self.layouts:
            # Numeric gap entries have no draw() and are skipped
            if hasattr(l, 'draw'):
                # Etch effect for the text
                painter.save()
                painter.setRenderHints(QPainter.TextAntialiasing | QPainter.Antialiasing)
                painter.save()
                # Translucent white copy, offset by one pixel, under the real text
                painter.setPen(QColor(255, 255, 255, 125))
                l.draw(painter, QPointF(1, 1))
                painter.restore()
                l.draw(painter, QPointF())
                painter.restore()
def layout_text(prefs, img, title, subtitle, footer, max_height, style):
    """Create and position the title, subtitle and footer text blocks.

    The title starts at the style's top margin, the subtitle (if any) sits
    below the title, and the footer is anchored to the bottom margin.

    :return: ``(title_block, subtitle_block, footer_block)``; the subtitle
        block is an empty ``Block`` when *subtitle* is falsy.
    """
    def make_font(family, pixel_size):
        font = QFont(family)
        font.setPixelSize(pixel_size)
        font.setStyleStrategy(QFont.PreferAntialias)
        return font

    width = img.width() - 2 * style.hmargin
    left, top = style.hmargin, style.vmargin

    title_font = make_font(prefs.title_font_family or 'Liberation Serif', prefs.title_font_size)
    title_block = Block(title, width, title_font, img, max_height, style.TITLE_ALIGN)
    title_block.position = left, top

    subtitle_block = Block()
    if subtitle:
        subtitle_font = make_font(prefs.subtitle_font_family or 'Liberation Sans', prefs.subtitle_font_size)
        gap = 2 * title_block.leading
        # The subtitle only gets whatever height the title left over
        remaining = max_height - title_block.height - gap
        subtitle_block = Block(subtitle, width, subtitle_font, img, remaining, style.SUBTITLE_ALIGN)
        subtitle_block.position = left, title_block.position.y + title_block.height + gap

    footer_font = make_font(prefs.footer_font_family or 'Liberation Serif', prefs.footer_font_size)
    footer_block = Block(footer, width, footer_font, img, max_height, style.FOOTER_ALIGN)
    footer_block.position = left, img.height() - top - footer_block.height

    return title_block, subtitle_block, footer_block
# }}}
# Format text using templates {{{
def sanitize(s):
    """Return *s* as NFC-normalized unicode with unsafe characters removed.

    A falsy *s* is treated as the empty string.
    """
    cleaned = force_unicode(s or '')
    cleaned = clean_ascii_chars(cleaned)
    cleaned = clean_xml_chars(cleaned)
    return unicodedata.normalize('NFC', cleaned)
# Lazily created Formatter singleton; see formatter().
_formatter = None
# Cache handed to safe_format() as its template_cache argument.
_template_cache = {}
def escape_formatting(val):
    """Escape ``&``, ``<`` and ``>`` in *val* so it cannot be read as markup."""
    escaped = val
    # '&' has to go first so the entities produced below are not re-escaped
    for char, entity in (('&', '&amp;'), ('<', '&lt;'), ('>', '&gt;')):
        escaped = escaped.replace(char, entity)
    return escaped
def unescape_formatting(val):
    """Reverse :func:`escape_formatting`: turn the three entities back into characters."""
    plain = val
    # '&amp;' has to go last so that '&amp;lt;' yields '&lt;' rather than '<'
    for entity, char in (('&lt;', '<'), ('&gt;', '>'), ('&amp;', '&')):
        plain = plain.replace(entity, char)
    return plain
class Formatter(SafeFormat):
    """Template formatter whose field values are escaped before use.

    Escaping keeps metadata containing ``<``, ``>`` or ``&`` from being
    interpreted as formatting markup.
    """

    def get_value(self, orig_key, args, kwargs):
        raw_value = SafeFormat.get_value(self, orig_key, args, kwargs)
        return escape_formatting(raw_value)
def formatter():
    """Return the shared :class:`Formatter`, creating it on first use."""
    global _formatter
    instance = _formatter
    if instance is None:
        instance = _formatter = Formatter()
    return instance
def format_fields(mi, prefs):
    """Render the title, subtitle and footer templates from *prefs* against *mi*.

    :return: The result of ``map`` over the three templates, in the order
        title, subtitle, footer.
    """
    fmt = formatter()

    def render(pref_name):
        template = getattr(prefs, pref_name)
        return fmt.safe_format(
            template, mi, _('Template error'), mi, template_cache=_template_cache
        )

    template_prefs = ('title_template', 'subtitle_template', 'footer_template')
    return map(render, template_prefs)
@contextmanager
def preserve_fields(obj, fields):
    """Context manager that restores the given attributes of *obj* on exit.

    :param obj: Object whose attributes may be changed inside the ``with`` body.
    :param fields: Attribute names, as an iterable or as a single
        whitespace-separated string.

    On exit, attributes that existed beforehand get their old value back
    and attributes that did not exist are removed again.
    """
    if isinstance(fields, string_or_bytes):
        fields = fields.split()
    null = object()  # marks "attribute was absent", since None is a valid value
    mem = {f: getattr(obj, f, null) for f in fields}
    try:
        yield
    finally:
        for f, val in iteritems(mem):
            if val is null:
                # The body may never have created the attribute. Deleting a
                # missing attribute must not raise here, because that would
                # mask any exception propagating out of the body.
                try:
                    delattr(obj, f)
                except AttributeError:
                    pass
            else:
                setattr(obj, f, val)
def format_text(mi, prefs):
    """Return the rendered ``(title, subtitle, footer)`` strings for *mi*.

    ``mi.authors`` and ``mi.formatted_series_index`` are changed only for
    the duration of the rendering and restored afterwards.
    """
    with preserve_fields(mi, 'authors formatted_series_index'):
        unknown = _('Unknown')
        # Placeholder authors are left off the cover
        mi.authors = [author for author in mi.authors if author != unknown]
        series_index = mi.series_index or 0
        mi.formatted_series_index = fmt_sidx(series_index, use_roman=get_use_roman())
        rendered = format_fields(mi, prefs)
        return tuple(rendered)
# }}}
# Colors {{{
# The four colors that make up a theme: two base colors and a contrasting
# color for each of them.
ColorTheme = namedtuple('ColorTheme', 'color1 color2 contrast_color1 contrast_color2')


def to_theme(x):
    """Convert a whitespace-separated string of colors into a theme dict.

    The values are paired, in order, with the ``ColorTheme`` field names.
    """
    return dict(zip(ColorTheme._fields[:4], x.split()))


# Used by color() whenever a theme color turns out to be invalid
fallback_colors = to_theme('ffffff 000000 000000 ffffff')

default_color_themes = {
    name: to_theme(spec) for name, spec in (
        ('Earth', 'e8d9ac c7b07b 564628 382d1a'),
        ('Grass', 'd8edb5 abc8a4 375d3b 183128'),
        ('Water', 'd3dcf2 829fe4 00448d 00305a'),
        ('Silver', 'e6f1f5 aab3b6 6e7476 3b3e40'),
    )
}
def theme_to_colors(theme):
    """Build a ``ColorTheme`` of ``QColor`` objects from a theme dict of hex strings."""
    qcolors = [QColor('#' + theme[field]) for field in ColorTheme._fields]
    return ColorTheme(*qcolors)
def load_color_themes(prefs):
    """Return the list of usable color themes as ``ColorTheme`` objects.

    Themes from *prefs* are merged over the built-in ones and disabled
    themes are left out. If that leaves nothing, all built-in themes are
    returned regardless of the disabled list.
    """
    themes = dict(default_color_themes)
    themes.update(prefs.color_themes)
    disabled = frozenset(prefs.disabled_color_themes)
    enabled = [theme_to_colors(theme) for name, theme in iteritems(themes) if name not in disabled]
    if enabled:
        return enabled
    # Ignore disabled and return only the builtin color themes
    return [theme_to_colors(theme) for theme in itervalues(default_color_themes)]
def color(color_theme, name):
    """Return the color called *name* from *color_theme*.

    If the theme's color is not valid, the matching fallback color is
    returned instead.
    """
    themed = getattr(color_theme, name)
    if themed.isValid():
        return themed
    return QColor('#' + fallback_colors[name])
# }}}
# Styles {{{
class Style(object):
    """Base class for cover styles.

    Instances hold the resolved theme colors (``color1``, ``color2``,
    ``ccolor1``, ``ccolor2``) and the page margins (``hmargin``,
    ``vmargin``). Subclasses paint the cover background in ``__call__``.
    """

    TITLE_ALIGN = SUBTITLE_ALIGN = FOOTER_ALIGN = Qt.AlignHCenter | Qt.AlignTop

    def __init__(self, color_theme, prefs):
        self.load_colors(color_theme)
        self.calculate_margins(prefs)

    def calculate_margins(self, prefs):
        """Set margins proportional to the cover size (50px on a 600x800 cover)."""
        horizontal_fraction = 50 / 600
        vertical_fraction = 50 / 800
        self.hmargin = int(horizontal_fraction * prefs.cover_width)
        self.vmargin = int(vertical_fraction * prefs.cover_height)

    def load_colors(self, color_theme):
        """Resolve the four theme colors, substituting fallbacks for invalid ones."""
        attribute_to_field = (
            ('color1', 'color1'),
            ('color2', 'color2'),
            ('ccolor1', 'contrast_color1'),
            ('ccolor2', 'contrast_color2'),
        )
        for attribute, field in attribute_to_field:
            setattr(self, attribute, color(color_theme, field))
class Cross(Style):
    """Style with a horizontal band behind the title and a vertical stripe on the left."""

    NAME = 'The Cross'
    GUI_NAME = _('The Cross')

    def __call__(self, painter, rect, color_theme, title_block, subtitle_block, footer_block):
        """Paint the background and return the pen colors for title, subtitle and footer."""
        painter.fillRect(rect, self.color1)

        # Horizontal band covering the title and subtitle, with rounded corners
        band_height = title_block.height + subtitle_block.height + subtitle_block.line_spacing // 2 + title_block.leading
        band = QRect(0, int(title_block.position.y), rect.width(), band_height)
        painter.save()
        clip = QPainterPath()
        clip.addRoundedRect(QRectF(band), 10, 10 * band.width()/band.height(), Qt.RelativeSize)
        painter.setClipPath(clip)
        painter.setRenderHint(QPainter.Antialiasing)
        painter.fillRect(band, self.color2)
        painter.restore()

        # Vertical stripe filling the left margin
        stripe = QRect(0, 0, int(title_block.position.x), rect.height())
        painter.fillRect(stripe, self.color2)
        return self.ccolor2, self.ccolor2, self.ccolor1
class Half(Style):
    """Style that fills the cover with a vertical three-stop gradient."""

    NAME = 'Half and Half'
    GUI_NAME = _('Half and half')

    def __call__(self, painter, rect, color_theme, title_block, subtitle_block, footer_block):
        """Paint the background and return the pen colors for title, subtitle and footer."""
        top, bottom = QPointF(0, 0), QPointF(0, rect.height())
        gradient = QLinearGradient(top, bottom)
        stops = [(0, self.color1), (0.7, self.color2), (1, self.color1)]
        gradient.setStops(stops)
        painter.fillRect(rect, QBrush(gradient))
        return self.ccolor1, self.ccolor1, self.ccolor1
def rotate_vector(angle, x, y):
    """Rotate the vector ``(x, y)`` by *angle* radians and return ``(x', y')``."""
    cos_a = cos(angle)
    sin_a = sin(angle)
    rotated_x = x * cos_a - y * sin_a
    rotated_y = x * sin_a + y * cos_a
    return rotated_x, rotated_y
def draw_curved_line(painter_path, dx, dy, c1_frac, c1_amp, c2_frac, c2_amp):
    """Append a cubic curve from the path's current position to ``+(dx, dy)``.

    Each control point is given relative to the straight line between the
    end points: ``*_frac`` is the distance along the line and ``*_amp`` the
    distance perpendicular to it, both as fractions of the line's length.
    """
    length = sqrt(dx * dx + dy * dy)
    angle = atan2(dy, dx)

    def control_point(frac, amp):
        return QPointF(*rotate_vector(angle, frac * length, amp * length))

    first_control = control_point(c1_frac, c1_amp)
    second_control = control_point(c2_frac, c2_amp)
    start = painter_path.currentPosition()
    end = start + QPointF(dx, dy)
    painter_path.cubicTo(start + first_control, start + second_control, end)
class Banner(Style):
    """Style that draws a ribbon-like banner, with folded ends, behind the title."""

    NAME = 'Banner'
    GUI_NAME = _('Banner')
    # Horizontal shift per unit of banner height, used to slant the banner's sides
    GRADE = 0.07

    def calculate_margins(self, prefs):
        """Use a wider horizontal margin and record the width of the folds."""
        Style.calculate_margins(self, prefs)
        self.hmargin = int(0.15 * prefs.cover_width)
        self.fold_width = int(0.1 * prefs.cover_width)

    def __call__(self, painter, rect, color_theme, title_block, subtitle_block, footer_block):
        """Paint the background and banner; return pen colors for title, subtitle, footer."""
        painter.fillRect(rect, self.color1)
        top = title_block.position.y + 2
        # With no subtitle its line spacing is 0, so pad using the title's instead
        extra_spacing = subtitle_block.line_spacing // 2 if subtitle_block.line_spacing else title_block.line_spacing // 3
        height = title_block.height + subtitle_block.height + extra_spacing + title_block.leading
        right = rect.right() - self.hmargin
        width = right - self.hmargin

        # Draw main banner
        p = main = QPainterPath(QPointF(self.hmargin, top))
        draw_curved_line(p, rect.width() - 2 * self.hmargin, 0, 0.1, -0.1, 0.9, -0.1)
        deltax = self.GRADE * height
        p.lineTo(right + deltax, top + height)
        right_corner = p.currentPosition()
        draw_curved_line(p, - width - 2 * deltax, 0, 0.1, 0.05, 0.9, 0.05)
        left_corner = p.currentPosition()
        p.closeSubpath()

        # Draw fold rectangles
        rwidth = self.fold_width
        yfrac = 0.1
        width23 = int(0.67 * rwidth)
        rtop = top + height * yfrac

        def draw_fold(x, m=1, corner=left_corner):
            # Build one fold starting at x. m is 1 for the left fold and -1
            # for the mirrored right fold; corner is the banner corner that
            # the inner fold starts from.
            ans = p = QPainterPath(QPointF(x, rtop))
            draw_curved_line(p, rwidth*m, 0, 0.1, 0.1*m, 0.5, -0.2*m)
            fold_upper = p.currentPosition()
            p.lineTo(p.currentPosition() + QPointF(-deltax*m, height))
            fold_corner = p.currentPosition()
            draw_curved_line(p, -rwidth*m, 0, 0.2, -0.1*m, 0.8, -0.1*m)
            draw_curved_line(p, deltax*m, -height, 0.2, 0.1*m, 0.8, 0.1*m)
            # Inner fold: runs from the banner corner to the fold's lower
            # corner and back up to the fold's upper edge
            p = inner_fold = QPainterPath(corner)
            dp = fold_corner - p.currentPosition()
            draw_curved_line(p, dp.x(), dp.y(), 0.5, 0.3*m, 1, 0*m)
            p.lineTo(fold_upper), p.closeSubpath()
            return ans, inner_fold

        left_fold, left_inner = draw_fold(self.hmargin - width23)
        right_fold, right_inner = draw_fold(right + width23, m=-1, corner=right_corner)

        painter.save()
        painter.setRenderHint(QPainter.Antialiasing)
        pen = QPen(self.ccolor2)
        pen.setWidth(3)
        pen.setJoinStyle(Qt.RoundJoin)
        painter.setPen(pen)
        # Paint order matters: folds first, then the darker inner folds, and
        # the main banner last so that it overlaps them
        for r in (left_fold, right_fold):
            painter.fillPath(r, QBrush(self.color2))
            painter.drawPath(r)
        for r in (left_inner, right_inner):
            painter.fillPath(r, QBrush(self.color2.darker()))
            painter.drawPath(r)
        painter.fillPath(main, QBrush(self.color2))
        painter.drawPath(main)
        painter.restore()
        return self.ccolor2, self.ccolor2, self.ccolor1
class Ornamental(Style):
NAME = 'Ornamental'
GUI_NAME = _('Ornamental')
# SVG vectors {{{
CORNER_VECTOR = "m 67.791903,64.260958 c -4.308097,-2.07925 -4.086719,-8.29575 0.334943,-9.40552 4.119758,-1.03399 8.732363,5.05239 5.393055,7.1162 -0.55,0.33992 -1,1.04147 -1,1.55902 0,1.59332 2.597425,1.04548 5.365141,-1.1316 1.999416,-1.57274 2.634859,-2.96609 2.634859,-5.7775 0,-9.55787 -9.827495,-13.42961 -24.43221,-9.62556 -3.218823,0.83839 -5.905663,1.40089 -5.970755,1.25 -0.06509,-0.1509 -0.887601,-1.19493 -1.827799,-2.32007 -1.672708,-2.00174 -1.636693,-2.03722 1.675668,-1.65052 1.861815,0.21736 6.685863,-0.35719 10.720107,-1.27678 12.280767,-2.79934 20.195487,-0.0248 22.846932,8.0092 3.187273,9.65753 -6.423297,17.7497 -15.739941,13.25313 z m 49.881417,-20.53932 c -3.19204,-2.701 -3.72967,-6.67376 -1.24009,-9.16334 2.48236,-2.48236 5.35141,-2.67905 7.51523,-0.51523 1.85966,1.85966 2.07045,6.52954 0.37143,8.22857 -2.04025,2.04024 3.28436,1.44595 6.92316,-0.77272 9.66959,-5.89579 0.88581,-18.22422 -13.0777,-18.35516 -5.28594,-0.0496 -10.31098,1.88721 -14.26764,5.4991 -1.98835,1.81509 -2.16454,1.82692 -2.7936,0.18763 -0.40973,-1.06774 0.12141,-2.82197 1.3628,-4.50104 2.46349,-3.33205 1.67564,-4.01299 -2.891784,-2.49938 -2.85998,0.94777 -3.81038,2.05378 -5.59837,6.51495 -1.184469,2.95536 -3.346819,6.86882 -4.805219,8.69657 -1.4584,1.82776 -2.65164,4.02223 -2.65164,4.87662 0,3.24694 -4.442667,0.59094 -5.872557,-3.51085 -1.361274,-3.90495 0.408198,-8.63869 4.404043,-11.78183 5.155844,-4.05558 1.612374,-3.42079 -9.235926,1.65457 -12.882907,6.02725 -16.864953,7.18038 -24.795556,7.18038 -8.471637,0 -13.38802,-1.64157 -17.634617,-5.88816 -2.832233,-2.83224 -3.849773,-4.81378 -4.418121,-8.6038 -1.946289,-12.9787795 8.03227,-20.91713135 19.767685,-15.7259993 5.547225,2.4538018 6.993631,6.1265383 3.999564,10.1557393 -5.468513,7.35914 -15.917883,-0.19431 -10.657807,-7.7041155 1.486298,-2.1219878 1.441784,-2.2225068 -0.984223,-2.2225068 -1.397511,0 -4.010527,1.3130878 -5.806704,2.9179718 -2.773359,2.4779995 -3.265777,3.5977995 -3.265777,7.4266705 0,5.10943 
2.254112,8.84197 7.492986,12.40748 8.921325,6.07175 19.286666,5.61396 37.12088,-1.63946 15.35037,-6.24321 21.294999,-7.42408 34.886123,-6.92999 11.77046,0.4279 19.35803,3.05537 24.34054,8.42878 4.97758,5.3681 2.53939,13.58271 -4.86733,16.39873 -4.17361,1.58681 -11.00702,1.19681 -13.31978,-0.76018 z m 26.50156,-0.0787 c -2.26347,-2.50111 -2.07852,-7.36311 0.39995,-10.51398 2.68134,-3.40877 10.49035,-5.69409 18.87656,-5.52426 l 6.5685,0.13301 -7.84029,0.82767 c -8.47925,0.89511 -12.76997,2.82233 -16.03465,7.20213 -1.92294,2.57976 -1.96722,3.00481 -0.57298,5.5 1.00296,1.79495 2.50427,2.81821 4.46514,3.04333 2.92852,0.33623 2.93789,0.32121 1.08045,-1.73124 -1.53602,-1.69728 -1.64654,-2.34411 -0.61324,-3.58916 2.84565,-3.4288 7.14497,-0.49759 5.03976,3.43603 -1.86726,3.48903 -8.65528,4.21532 -11.3692,1.21647 z m -4.17462,-14.20302 c -0.38836,-0.62838 -0.23556,-1.61305 0.33954,-2.18816 1.3439,-1.34389 4.47714,-0.17168 3.93038,1.47045 -0.5566,1.67168 -3.38637,2.14732 -4.26992,0.71771 z m -8.48037,-9.1829 c -12.462,-4.1101 -12.53952,-4.12156 -25.49998,-3.7694 -24.020921,0.65269 -32.338219,0.31756 -37.082166,-1.49417 -5.113999,-1.95305 -8.192504,-6.3647405 -6.485463,-9.2940713 0.566827,-0.972691 1.020091,-1.181447 1.037211,-0.477701 0.01685,0.692606 1.268676,1.2499998 2.807321,1.2499998 1.685814,0 4.868609,1.571672 8.10041,4.0000015 4.221481,3.171961 6.182506,3.999221 9.473089,3.996261 l 4.149585,-0.004 -3.249996,-1.98156 c -3.056252,-1.863441 -4.051566,-3.8760635 -2.623216,-5.3044145 0.794,-0.794 6.188222,1.901516 9.064482,4.5295635 1.858669,1.698271 3.461409,1.980521 10.559493,1.859621 11.30984,-0.19266 20.89052,1.29095 31.97905,4.95208 7.63881,2.52213 11.51931,3.16471 22.05074,3.65141 7.02931,0.32486 13.01836,0.97543 13.30902,1.44571 0.29065,0.47029 -5.2356,0.83436 -12.28056,0.80906 -12.25942,-0.044 -13.34537,-0.2229 -25.30902,-4.16865 z" # noqa
# }}}
PATH_CACHE = {}
VIEWPORT = (400, 500)
def calculate_margins(self, prefs):
    """Scale the ornament's margins (51 x 83 in VIEWPORT units) to the cover size."""
    viewport_width, viewport_height = self.VIEWPORT
    self.hmargin = int((51 / viewport_width) * prefs.cover_width)
    self.vmargin = int((83 / viewport_height) * prefs.cover_height)
def __call__(self, painter, rect, color_theme, title_block, subtitle_block, footer_block):
    """Paint a radial gradient with ornamental corners and border lines.

    Returns the pen colors for the title, subtitle and footer blocks.
    """
    if not self.PATH_CACHE:
        # Parse the SVG corner ornament once and cache it on the class. If
        # parsing fails the traceback is printed and the cache stays empty.
        from calibre.utils.speedups import svg_path_to_painter_path
        try:
            self.__class__.PATH_CACHE['corner'] = svg_path_to_painter_path(self.CORNER_VECTOR)
        except Exception:
            import traceback
            traceback.print_exc()
    p = painter
    painter.setRenderHint(QPainter.Antialiasing)
    g = QRadialGradient(QPointF(rect.center()), rect.width())
    g.setColorAt(0, self.color1), g.setColorAt(1, self.color2)
    painter.fillRect(rect, QBrush(g))
    painter.save()
    # Everything below is drawn in VIEWPORT coordinates
    painter.setWindow(0, 0, *self.VIEWPORT)
    try:
        path = self.PATH_CACHE['corner']
    except KeyError:
        # No ornament available: an empty path is used in its place
        path = QPainterPath()
    pen = p.pen()
    pen.setColor(self.ccolor1)
    p.setPen(pen)

    def corner():
        # Fill the ornament, then a transformed second copy of it, and
        # finally reset the world transform for the next corner
        b = QBrush(self.ccolor1)
        p.fillPath(path, b)
        p.rotate(90), p.translate(100, -100), p.scale(1, -1), p.translate(-103, -97)
        p.fillPath(path, b)
        p.setWorldTransform(QTransform())
    # Top-left corner
    corner()
    # Top right corner
    p.scale(-1, 1), p.translate(-400, 0), corner()
    # Bottom left corner
    p.scale(1, -1), p.translate(0, -500), corner()
    # Bottom right corner
    p.scale(-1, -1), p.translate(-400, -500), corner()
    # Border lines along the four edges, drawn with the painter's original pen width
    for y in (28.4, 471.7):
        p.drawLine(QPointF(160, y), QPointF(240, y))
    for x in (31.3, 368.7):
        p.drawLine(QPointF(x, 155), QPointF(x, 345))
    # A second set of border lines with a pen width of 1.8
    pen.setWidthF(1.8)
    p.setPen(pen)
    for y in (23.8, 476.7):
        p.drawLine(QPointF(160, y), QPointF(240, y))
    for x in (26.3, 373.7):
        p.drawLine(QPointF(x, 155), QPointF(x, 345))
    painter.restore()
    return self.ccolor2, self.ccolor2, self.ccolor1
class Blocks(Style):
    """Style with two solid blocks: ``color1`` on top, ``color2`` in the bottom third."""

    NAME = 'Blocks'
    GUI_NAME = _('Blocks')
    FOOTER_ALIGN = Qt.AlignRight | Qt.AlignTop

    def __call__(self, painter, rect, color_theme, title_block, subtitle_block, footer_block):
        """Paint the background and return the pen colors for title, subtitle and footer."""
        # Fill the whole cover once; the lower block is then painted over it.
        # (A rectangle for the upper block used to be built and discarded
        # before the full fill was repeated; neither changed the result.)
        painter.fillRect(rect, self.color1)
        lower = QRect(rect)
        lower.setTop(rect.height() - rect.height() // 3)
        painter.fillRect(lower, self.color2)
        return self.ccolor1, self.ccolor1, self.ccolor2
def all_styles():
    """Return the set of ``NAME`` values of every ``Style`` subclass in this module."""
    names = set()
    for candidate in itervalues(globals()):
        if not isinstance(candidate, type):
            continue
        if issubclass(candidate, Style) and candidate is not Style:
            names.add(candidate.NAME)
    return names
def load_styles(prefs, respect_disabled=True):
    """Return a tuple of the ``Style`` subclasses that may be used.

    :param prefs: Preferences providing ``disabled_styles``.
    :param respect_disabled: When False, the disabled list is ignored.
    """
    if respect_disabled:
        disabled = frozenset(prefs.disabled_styles)
    else:
        disabled = ()

    def is_usable_style(candidate):
        return (
            isinstance(candidate, type) and issubclass(candidate, Style) and
            candidate is not Style and candidate.NAME not in disabled)

    styles = tuple(candidate for candidate in itervalues(globals()) if is_usable_style(candidate))
    if styles or not disabled:
        return styles
    # If all styles have been disabled, ignore the disabling and return all
    # the styles
    return load_styles(prefs, respect_disabled=False)
# }}}
def init_environment():
    """Prepare for rendering: ensure a Qt application exists and load the bundled fonts."""
    ensure_app()
    load_builtin_fonts()
def generate_cover(mi, prefs=None, as_qimage=False):
    """Render a cover for the metadata *mi* with a random theme and style.

    :param mi: Book metadata used to fill in the text templates.
    :param prefs: Mapping of preferences; the global ``cprefs`` when falsy.
    :param as_qimage: Return the ``QImage`` itself instead of encoded data.
    """
    init_environment()
    source = prefs or cprefs
    prefs = Prefs(**{key: source.get(key) for key in cprefs.defaults})

    color_theme = random.choice(load_color_themes(prefs))
    style_class = random.choice(load_styles(prefs))
    style = style_class(color_theme, prefs)

    title, subtitle, footer = format_text(mi, prefs)
    img = QImage(prefs.cover_width, prefs.cover_height, QImage.Format_ARGB32)
    # The title gets at most a third of the cover height
    blocks = layout_text(prefs, img, title, subtitle, footer, img.height() // 3, style)
    title_block, subtitle_block, footer_block = blocks

    painter = QPainter(img)
    rect = QRect(0, 0, img.width(), img.height())
    # The style paints the background and says which pen color each block gets
    pen_colors = style(painter, rect, color_theme, title_block, subtitle_block, footer_block)
    for block, pen_color in zip(blocks, pen_colors):
        painter.setPen(pen_color)
        block.draw(painter)
    painter.end()

    img.setText('Generated cover', '%s %s' % (__appname__, __version__))
    return img if as_qimage else pixmap_to_data(img)
def override_prefs(base_prefs, **overrides):
    """Return a dict of preferences: *base_prefs* with *overrides* applied.

    Two extra keywords are understood: ``override_color_theme`` and
    ``override_style``. When one names a known theme or style, every other
    theme or style is disabled so that the named one is always picked.
    """
    result = {key: overrides.get(key, base_prefs[key]) for key in cprefs.defaults}

    forced_theme = overrides.get('override_color_theme')
    if forced_theme is not None:
        known_themes = set(default_color_themes) | set(result['color_themes'])
        if forced_theme in known_themes:
            known_themes.discard(forced_theme)
            result['disabled_color_themes'] = known_themes

    forced_style = overrides.get('override_style')
    if forced_style is not None:
        known_styles = all_styles()
        if forced_style in known_styles:
            known_styles.discard(forced_style)
            result['disabled_styles'] = known_styles

    return result
def create_cover(title, authors, series=None, series_index=1, prefs=None, as_qimage=False):
    """Create a cover from the specified title, author and series.

    Any user set templates are ignored, to ensure that the specified
    metadata is used.

    :param title: Book title.
    :param authors: Authors, passed through to ``Metadata``.
    :param series: Optional series name; when falsy no series is set.
    :param series_index: Index within the series, used only with *series*.
    :param prefs: Preferences to start from; the global ``cprefs`` when falsy.
    :param as_qimage: Return a ``QImage`` instead of encoded image data.
    """
    mi = Metadata(title, authors)
    if series:
        mi.series, mi.series_index = series, series_index
    d = cprefs.defaults
    # Force the built-in templates so user customizations cannot hide the
    # metadata that was passed in
    prefs = override_prefs(
        prefs or cprefs, title_template=d['title_template'], subtitle_template=d['subtitle_template'], footer_template=d['footer_template'])
    return generate_cover(mi, prefs=prefs, as_qimage=as_qimage)
def calibre_cover2(title, author_string='', series_string='', prefs=None, as_qimage=False, logo_path=None):
    """Render a plain white cover with a logo between the titles and the footer.

    :param title: Shown in bold at the top.
    :param author_string: Shown in bold in the footer.
    :param series_string: Shown in italics below the title.
    :param prefs: Preferences mapping; the global ``cprefs`` when falsy.
    :param as_qimage: Return a ``QImage`` instead of encoded image data.
    :param logo_path: Image to draw; the ``library.png`` resource when falsy.
    """
    init_environment()
    title = '<b>' + escape_formatting(title)
    subtitle = '<i>' + escape_formatting(series_string)
    footer = '<b>' + escape_formatting(author_string)

    source = prefs or cprefs
    values = {key: source.get(key) for key in cprefs.defaults}
    # Rescale everything so that the cover comes out 800px high
    scale_cover(values, 800. / values['cover_height'])
    prefs = Prefs(**values)

    img = QImage(prefs.cover_width, prefs.cover_height, QImage.Format_ARGB32)
    img.fill(Qt.white)
    color_theme = theme_to_colors(fallback_colors)

    class CalibeLogoStyle(Style):
        NAME = GUI_NAME = 'calibre'

        def __call__(self, painter, rect, color_theme, title_block, subtitle_block, footer_block):
            if subtitle_block.line_spacing:
                extra_spacing = subtitle_block.line_spacing // 2
            else:
                extra_spacing = title_block.line_spacing // 3
            text_height = title_block.height + subtitle_block.height + extra_spacing + title_block.leading
            # The logo goes in the space between the titles and the footer
            top = title_block.position.y + 10
            top += text_height + 25
            bottom = footer_block.position.y - 50
            logo = QImage(logo_path or I('library.png'))
            avail_width, avail_height = rect.width(), bottom - top
            scaled, logo_width, logo_height = fit_image(logo.width(), logo.height(), avail_width, avail_height)
            # Center the (possibly scaled) logo in the available space
            x = (avail_width - logo_width) // 2
            y = (avail_height - logo_height) // 2
            target = QRect(x, top + y, logo_width, logo_height)
            painter.setRenderHint(QPainter.SmoothPixmapTransform)
            painter.drawImage(target, logo)
            return self.ccolor1, self.ccolor1, self.ccolor1

    style = CalibeLogoStyle(color_theme, prefs)
    blocks = layout_text(prefs, img, title, subtitle, footer, img.height() // 3, style)
    title_block, subtitle_block, footer_block = blocks

    painter = QPainter(img)
    rect = QRect(0, 0, img.width(), img.height())
    pen_colors = style(painter, rect, color_theme, title_block, subtitle_block, footer_block)
    for block, pen_color in zip(blocks, pen_colors):
        painter.setPen(pen_color)
        block.draw(painter)
    painter.end()

    img.setText('Generated cover', '%s %s' % (__appname__, __version__))
    return img if as_qimage else pixmap_to_data(img)
def message_image(text, width=500, height=400, font_size=20):
    """Return encoded image data for a white image showing *text*.

    The text is word-wrapped, justified and vertically centered, with a
    10px inset from every edge.
    """
    init_environment()
    canvas = QImage(width, height, QImage.Format_ARGB32)
    canvas.fill(Qt.white)

    font = QFont()
    font.setPixelSize(font_size)

    painter = QPainter(canvas)
    painter.setFont(font)
    text_area = canvas.rect().adjusted(10, 10, -10, -10)
    flags = Qt.AlignJustify | Qt.AlignVCenter | Qt.TextWordWrap
    painter.drawText(text_area, flags, text)
    painter.end()
    return pixmap_to_data(canvas)
def scale_cover(prefs, scale):
    """Multiply the cover dimensions and font sizes in *prefs* by *scale*, in place.

    Results are truncated to integers; other keys are left untouched.
    """
    size_keys = (
        'cover_width', 'cover_height',
        'title_font_size', 'subtitle_font_size', 'footer_font_size',
    )
    for key in size_keys:
        scaled = scale * prefs[key]
        prefs[key] = int(scaled)
def generate_masthead(title, output_path=None, width=600, height=60, as_qimage=False, font_family=None):
    """Render *title* in bold, left aligned, on a white masthead image.

    :param output_path: When given (and *as_qimage* is false) the encoded
        image is written to this file and nothing is returned.
    :param as_qimage: Return the ``QImage`` itself.
    :param font_family: Overrides the configured title font family.
    :return: The ``QImage``, the encoded image data, or None (see above).
    """
    init_environment()
    font_family = font_family or cprefs['title_font_family'] or 'Liberation Serif'

    img = QImage(width, height, QImage.Format_ARGB32)
    img.fill(Qt.white)

    font = QFont(font_family)
    font.setStyleStrategy(QFont.PreferAntialias)
    # The text occupies three quarters of the masthead's height
    font.setPixelSize((height * 3) // 4)
    font.setBold(True)

    painter = QPainter(img)
    painter.setRenderHints(QPainter.Antialiasing | QPainter.TextAntialiasing)
    painter.setFont(font)
    painter.drawText(img.rect(), Qt.AlignLeft | Qt.AlignVCenter, sanitize(title))
    painter.end()

    if as_qimage:
        return img
    data = pixmap_to_data(img)
    if output_path is None:
        return data
    with open(output_path, 'wb') as out:
        out.write(data)
def test(scale=0.25):
    """Show a window with one generated cover per (color theme, style) pair.

    :param scale: Factor applied to the cover preferences before rendering;
        it is additionally multiplied by the widget's device pixel ratio.
    """
    from PyQt5.Qt import QLabel, QPixmap, QMainWindow, QWidget, QScrollArea, QGridLayout
    from calibre.gui2 import Application
    app = Application([])
    # Sample metadata; it deliberately contains '&' characters
    mi = Metadata('Unknown', ['Kovid Goyal', 'John & Doe', 'Author'])
    mi.series = 'A series & styles'
    m = QMainWindow()
    sa = QScrollArea(m)
    w = QWidget(m)
    sa.setWidget(w)
    l = QGridLayout(w)
    w.setLayout(l), l.setSpacing(30)
    scale *= w.devicePixelRatioF()
    labels = []
    # Grid: one row per built-in color theme, one column per style
    for r, color in enumerate(sorted(default_color_themes)):
        for c, style in enumerate(sorted(all_styles())):
            mi.series_index = c + 1
            mi.title = 'An algorithmic cover [%s]' % color
            # Force exactly this theme and style for the cover
            prefs = override_prefs(cprefs, override_color_theme=color, override_style=style)
            scale_cover(prefs, scale)
            img = generate_cover(mi, prefs=prefs, as_qimage=True)
            img.setDevicePixelRatio(w.devicePixelRatioF())
            la = QLabel()
            la.setPixmap(QPixmap.fromImage(img))
            l.addWidget(la, r, c)
            labels.append(la)
    m.setCentralWidget(sa)
    w.resize(w.sizeHint())
    m.show()
    app.exec_()


# Running the module directly opens the demo window
if __name__ == '__main__':
    test()

View File

@@ -0,0 +1,49 @@
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
'''
Conversion to EPUB.
'''
from calibre.utils.zipfile import ZipFile, ZIP_STORED
def rules(stylesheets):
    """Yield every style rule found in *stylesheets*.

    Entries without a ``cssText`` attribute are skipped.
    """
    for sheet in stylesheets:
        if not hasattr(sheet, 'cssText'):
            continue
        for rule in sheet:
            if rule.type == rule.STYLE_RULE:
                yield rule
def simple_container_xml(opf_path, extra_entries=''):
    """Return the text of a ``META-INF/container.xml`` pointing at *opf_path*.

    :param opf_path: Path of the OPF package file inside the container. It
        is escaped for use as an XML attribute value, so paths containing
        ``&``, ``<``, ``>`` or ``"`` still give well-formed XML.
    :param extra_entries: Raw XML (already well formed) inserted after the
        main ``<rootfile>`` element.
    """
    from xml.sax.saxutils import escape
    full_path = escape(opf_path, {'"': '&quot;'})
    return '''\
<?xml version="1.0"?>
<container version="1.0" xmlns="urn:oasis:names:tc:opendocument:xmlns:container">
<rootfiles>
<rootfile full-path="{0}" media-type="application/oebps-package+xml"/>
{extra_entries}
</rootfiles>
</container>
'''.format(full_path, extra_entries=extra_entries)
def initialize_container(path_to_container, opf_name='metadata.opf',
        extra_entries=()):
    '''
    Create an empty EPUB document, with a default skeleton.

    :param path_to_container: Where the EPUB (ZIP) file is created.
    :param opf_name: Path of the OPF file recorded in ``container.xml``.
    :param extra_entries: Iterable of ``(path, mimetype, data)`` tuples. Each
        one is listed as an extra ``<rootfile>`` and its data is written into
        the container.
    :return: The still-open ``ZipFile``; the caller adds the remaining files
        and closes it.
    '''
    # The entries are walked twice below, so materialize them first in case
    # a one-shot iterator was passed
    extra_entries = tuple(extra_entries)
    rootfiles = ''.join(
        '<rootfile full-path="{0}" media-type="{1}"/>'.format(path, mimetype)
        for path, mimetype, _data in extra_entries)
    container_xml = simple_container_xml(opf_name, rootfiles).encode('utf-8')
    zf = ZipFile(path_to_container, 'w')
    # The mimetype entry is written first and stored uncompressed
    zf.writestr('mimetype', b'application/epub+zip', compression=ZIP_STORED)
    zf.writestr('META-INF/', b'', 0o755)
    zf.writestr('META-INF/container.xml', container_xml)
    for path, _mimetype, data in extra_entries:
        zf.writestr(path, data)
    return zf

View File

@@ -0,0 +1,389 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
from collections import defaultdict
from functools import partial
from css_parser.css import CSSRule, CSSStyleDeclaration
from css_selectors import parse, SelectorSyntaxError
from calibre import force_unicode
from calibre.ebooks.oeb.base import OEB_STYLES, OEB_DOCS, XHTML, css_text
from calibre.ebooks.oeb.normalize_css import normalize_filter_css, normalizers
from calibre.ebooks.oeb.polish.pretty import pretty_script_or_style, pretty_xml_tree, serialize
from calibre.utils.icu import numeric_sort_key
from css_selectors import Select, SelectorError
from polyglot.builtins import iteritems, itervalues, unicode_type, filter
def filter_used_rules(rules, log, select):
    """Yield the rules from *rules* that match nothing, i.e. the unused ones.

    A rule counts as used as soon as one of its selectors has matches
    according to *select*. *log* is accepted but not used.
    """
    for rule in rules:
        try:
            used = any(select.has_matches(selector.selectorText) for selector in rule.selectorList)
        except SelectorError:
            # Cannot parse/execute this selector, be safe and assume it
            # matches something
            used = True
        if not used:
            yield rule
def get_imported_sheets(name, container, sheets, recursion_level=10, sheet=None):
    """Return the names of all sheets imported, directly or not, by a sheet.

    :param name: Name of the importing file; used to resolve hrefs and
        always excluded from the result.
    :param container: Provides ``href_to_name`` to resolve import hrefs.
    :param sheets: Mapping of sheet name to parsed stylesheet; imports that
        resolve to names outside it are ignored.
    :param recursion_level: How many further levels of ``@import`` to follow.
    :param sheet: Parsed sheet to inspect; ``sheets[name]`` when falsy.
    """
    sheet = sheet or sheets[name]
    direct = set()
    for rule in sheet.cssRules.rulesOfType(CSSRule.IMPORT_RULE):
        if not rule.href:
            continue
        imported_name = container.href_to_name(rule.href, name)
        if imported_name in sheets:
            direct.add(imported_name)

    ans = set(direct)
    if recursion_level > 0:
        for imported_sheet in tuple(direct):
            ans |= get_imported_sheets(imported_sheet, container, sheets, recursion_level=recursion_level-1)
    # A cycle of imports may lead back to the sheet itself
    ans.discard(name)
    return ans
def merge_declarations(first, second):
    """Set every property of the declaration *second* on *first*."""
    incoming = second.getProperties()
    for declaration in incoming:
        first.setProperty(declaration)
def merge_identical_selectors(sheet):
    ' Merge rules that have identical selectors '
    # Group the style rules by their exact selector text
    rules_by_selector = defaultdict(list)
    for rule in sheet.cssRules.rulesOfType(CSSRule.STYLE_RULE):
        rules_by_selector[rule.selectorText].append(rule)

    duplicates = []
    for group in itervalues(rules_by_selector):
        keeper = group[0]
        # Later rules are folded into the first one, in order
        for duplicate in group[1:]:
            merge_declarations(keeper.style, duplicate.style)
            duplicates.append(duplicate)

    for duplicate in duplicates:
        sheet.cssRules.remove(duplicate)
    return len(duplicates)
def remove_unused_css(container, report=None, remove_unused_classes=False, merge_rules=False):
    '''
    Remove all unused CSS rules from the book. An unused CSS rule is one that does not match any actual content.

    :param report: An optional callable that takes a single argument. It is called with information about the operations being performed.
    :param remove_unused_classes: If True, class attributes in the HTML that do not match any CSS rules are also removed.
    :param merge_rules: If True, rules with identical selectors are merged.
    :return: True if anything was changed (rules removed, rules merged or classes removed)
    '''
    report = report or (lambda x:x)

    def safe_parse(name):
        # Best effort: a sheet whose parsing raises TypeError is skipped
        # (None is returned and filtered out below)
        try:
            return container.parsed(name)
        except TypeError:
            pass

    sheets = {name:safe_parse(name) for name, mt in iteritems(container.mime_map) if mt in OEB_STYLES}
    sheets = {k:v for k, v in iteritems(sheets) if v is not None}
    num_merged = 0
    if merge_rules:
        for name, sheet in iteritems(sheets):
            num = merge_identical_selectors(sheet)
            if num:
                container.dirty(name)
                num_merged += num
    import_map = {name:get_imported_sheets(name, container, sheets) for name in sheets}
    if remove_unused_classes:
        class_map = {name:{icu_lower(x) for x in classes_in_rule_list(sheet.cssRules)} for name, sheet in iteritems(sheets)}
    # For every sheet, the style rules not (yet) found to be used by any
    # document. Each document that references the sheet narrows this down.
    style_rules = {name:tuple(sheet.cssRules.rulesOfType(CSSRule.STYLE_RULE)) for name, sheet in iteritems(sheets)}

    num_of_removed_rules = num_of_removed_classes = 0

    for name, mt in iteritems(container.mime_map):
        if mt not in OEB_DOCS:
            continue
        root = container.parsed(name)
        select = Select(root, ignore_inappropriate_pseudo_classes=True)
        used_classes = set()
        # Inline <style> elements
        for style in root.xpath('//*[local-name()="style"]'):
            if style.get('type', 'text/css') == 'text/css' and style.text:
                sheet = container.parse_css(style.text)
                # sheet is a throw-away parse of style.text, so any change to
                # it has to be serialized back into the <style> element
                sheet_changed = False
                if merge_rules:
                    num = merge_identical_selectors(sheet)
                    if num:
                        num_merged += num
                        sheet_changed = True
                if remove_unused_classes:
                    used_classes |= {icu_lower(x) for x in classes_in_rule_list(sheet.cssRules)}
                imports = get_imported_sheets(name, container, sheets, sheet=sheet)
                for imported_sheet in imports:
                    style_rules[imported_sheet] = tuple(filter_used_rules(style_rules[imported_sheet], container.log, select))
                    if remove_unused_classes:
                        used_classes |= class_map[imported_sheet]
                rules = tuple(sheet.cssRules.rulesOfType(CSSRule.STYLE_RULE))
                unused_rules = tuple(filter_used_rules(rules, container.log, select))
                if unused_rules:
                    num_of_removed_rules += len(unused_rules)
                    for r in unused_rules:
                        sheet.cssRules.remove(r)
                    sheet_changed = True
                if sheet_changed:
                    # Previously the text was only written back when unused
                    # rules were removed, which silently lost merged rules
                    style.text = force_unicode(sheet.cssText, 'utf-8')
                    pretty_script_or_style(container, style)
                    container.dirty(name)

        # Linked stylesheets and the sheets they import
        for link in root.xpath('//*[local-name()="link" and @href]'):
            sname = container.href_to_name(link.get('href'), name)
            if sname not in sheets:
                continue
            style_rules[sname] = tuple(filter_used_rules(style_rules[sname], container.log, select))
            if remove_unused_classes:
                used_classes |= class_map[sname]

            for iname in import_map[sname]:
                style_rules[iname] = tuple(filter_used_rules(style_rules[iname], container.log, select))
                if remove_unused_classes:
                    used_classes |= class_map[iname]

        if remove_unused_classes:
            for elem in root.xpath('//*[@class]'):
                original_classes, classes = elem.get('class', '').split(), []
                for x in original_classes:
                    if icu_lower(x) in used_classes:
                        classes.append(x)
                if len(classes) != len(original_classes):
                    if classes:
                        elem.set('class', ' '.join(classes))
                    else:
                        del elem.attrib['class']
                    num_of_removed_classes += len(original_classes) - len(classes)
                    container.dirty(name)

    # Whatever is left in style_rules was not matched by any document
    for name, sheet in iteritems(sheets):
        unused_rules = style_rules[name]
        if unused_rules:
            num_of_removed_rules += len(unused_rules)
            for r in unused_rules:
                sheet.cssRules.remove(r)
            container.dirty(name)

    num_changes = num_of_removed_rules + num_merged + num_of_removed_classes
    if num_changes > 0:
        if num_of_removed_rules > 0:
            report(ngettext('Removed one unused CSS style rule', 'Removed {} unused CSS style rules',
                            num_of_removed_rules).format(num_of_removed_rules))
        if num_of_removed_classes > 0:
            report(ngettext('Removed one unused class from the HTML', 'Removed {} unused classes from the HTML',
                   num_of_removed_classes).format(num_of_removed_classes))
        if num_merged > 0:
            report(ngettext('Merged one CSS style rule', 'Merged {} CSS style rules',
                            num_merged).format(num_merged))
    if num_of_removed_rules == 0:
        report(_('No unused CSS style rules found'))
    if remove_unused_classes and num_of_removed_classes == 0:
        report(_('No unused class attributes found'))
    if merge_rules and num_merged == 0:
        report(_('No style rules that could be merged found'))
    return num_changes > 0
def filter_declaration(style, properties=()):
    '''
    Remove the specified properties from the declaration ``style``. Shorthand
    properties for which a normalizer exists are expanded: if any of the
    expanded properties is to be removed, the shorthand is removed and the
    remaining expanded properties that were not already present are set
    individually.

    :param properties: Iterable of property names to remove.
    :return: True if the declaration was changed
    '''
    # Accept any iterable: the default is a tuple, which has no
    # intersection() method
    properties = frozenset(properties)
    changed = False
    for prop in properties:
        if style.removeProperty(prop) != '':
            changed = True
    all_props = set(style.keys())
    for prop in style.getProperties():
        n = normalizers.get(prop.name, None)
        if n is not None:
            normalized = n(prop.name, prop.propertyValue)
            removed = properties.intersection(set(normalized))
            if removed:
                changed = True
                style.removeProperty(prop.name)
                for expanded_name in set(normalized) - removed - all_props:
                    style.setProperty(expanded_name, normalized[expanded_name])
    return changed
def filter_sheet(sheet, properties=()):
    '''
    Apply :func:`filter_declaration` to every style rule in ``sheet`` and
    remove the rules that are left without any properties.

    :return: True if any declaration was changed
    '''
    from css_parser.css import CSSRule
    changed = False
    emptied = []
    for style_rule in sheet.cssRules.rulesOfType(CSSRule.STYLE_RULE):
        if not filter_declaration(style_rule.style, properties):
            continue
        changed = True
        if style_rule.style.length == 0:
            emptied.append(style_rule)
    for style_rule in emptied:
        sheet.cssRules.remove(style_rule)
    return changed
def transform_inline_styles(container, name, transform_sheet, transform_style):
    '''
    Apply ``transform_sheet`` to the contents of every text/css <style>
    element and ``transform_style`` to every non-empty style attribute in the
    file ``name``. Both callables must return True if they changed their
    argument. A style attribute that is left empty is removed.

    :return: True if anything was changed. The file is not marked dirty here.
    '''
    root = container.parsed(name)
    modified = False
    for style_tag in root.xpath('//*[local-name()="style"]'):
        if not style_tag.text:
            continue
        if (style_tag.get('type') or 'text/css').lower() != 'text/css':
            continue
        sheet = container.parse_css(style_tag.text)
        if transform_sheet(sheet):
            modified = True
            style_tag.text = force_unicode(sheet.cssText, 'utf-8')
            pretty_script_or_style(container, style_tag)
    for elem in root.xpath('//*[@style]'):
        raw = elem.get('style', None)
        if not raw:
            continue
        declaration = container.parse_css(raw, is_declaration=True)
        if not transform_style(declaration):
            continue
        modified = True
        if declaration.length == 0:
            del elem.attrib['style']
        else:
            elem.set('style', force_unicode(declaration.getCssText(separator=' '), 'utf-8'))
    return modified
def transform_css(container, transform_sheet=None, transform_style=None, names=()):
    '''
    Apply the specified transformations to the CSS in the book: stylesheets,
    <style> elements and style attributes.

    :param transform_sheet: Callable that is passed a parsed stylesheet and
        must return True if it changed the sheet. If None, sheets are left
        unchanged.
    :param transform_style: Callable that is passed a parsed style
        declaration and must return True if it changed the declaration. If
        None, declarations are left unchanged.
    :param names: The files to process. Defaults to all HTML and CSS files in the book.
    :return: True if any file was changed. Changed files are marked dirty.
    '''
    # Both callables default to None but are called unconditionally below,
    # so substitute a transform that reports "nothing changed"
    def leave_unchanged(x):
        return False

    if transform_sheet is None:
        transform_sheet = leave_unchanged
    if transform_style is None:
        transform_style = leave_unchanged

    if not names:
        types = OEB_STYLES | OEB_DOCS
        names = [name for name, mt in iteritems(container.mime_map) if mt in types]

    doc_changed = False

    for name in names:
        mt = container.mime_map[name]
        if mt in OEB_STYLES:
            sheet = container.parsed(name)
            if transform_sheet(sheet):
                container.dirty(name)
                doc_changed = True
        elif mt in OEB_DOCS:
            if transform_inline_styles(container, name, transform_sheet, transform_style):
                container.dirty(name)
                doc_changed = True

    return doc_changed
def filter_css(container, properties, names=()):
    '''
    Remove the specified CSS properties from all CSS rules in the book.

    :param properties: Set of properties to remove. For example: :code:`{'font-family', 'color'}`.
    :param names: The files from which to remove the properties. Defaults to all HTML and CSS files in the book.
    :return: True if any file was changed
    '''
    normalized = normalize_filter_css(properties)
    sheet_filter = partial(filter_sheet, properties=normalized)
    declaration_filter = partial(filter_declaration, properties=normalized)
    return transform_css(
        container, transform_sheet=sheet_filter, transform_style=declaration_filter, names=names)
def _classes_in_selector(selector, classes):
for attr in ('selector', 'subselector', 'parsed_tree'):
s = getattr(selector, attr, None)
if s is not None:
_classes_in_selector(s, classes)
cn = getattr(selector, 'class_name', None)
if cn is not None:
classes.add(cn)
def classes_in_selector(text):
    '''
    Return the set of class names used in the selector ``text``. If the
    selector cannot be parsed, the classes collected before the syntax error
    (normally none) are returned.
    '''
    found = set()
    try:
        for parsed_selector in parse(text):
            _classes_in_selector(parsed_selector, found)
    except SelectorSyntaxError:
        pass
    return found
def classes_in_rule_list(css_rules):
    '''
    Return the set of class names used in the selectors of all style rules
    in ``css_rules``, recursing into rules that have their own ``cssRules``.
    '''
    found = set()
    for rule in css_rules:
        if rule.type == rule.STYLE_RULE:
            found.update(classes_in_selector(rule.selectorText))
            continue
        if hasattr(rule, 'cssRules'):
            found.update(classes_in_rule_list(rule.cssRules))
    return found
def iter_declarations(sheet_or_rule):
    '''
    Yield all the style declarations found in ``sheet_or_rule``, in document
    order. Objects with ``cssRules`` are recursed into, objects with a
    ``style`` yield that style and a CSSStyleDeclaration yields itself.
    '''
    if hasattr(sheet_or_rule, 'cssRules'):
        for nested_rule in sheet_or_rule.cssRules:
            for declaration in iter_declarations(nested_rule):
                yield declaration
        return
    if hasattr(sheet_or_rule, 'style'):
        yield sheet_or_rule.style
        return
    if isinstance(sheet_or_rule, CSSStyleDeclaration):
        yield sheet_or_rule
def remove_property_value(prop, predicate):
    ''' Remove the Values that match the predicate from this property. If all
    values of the property would be removed, the property is removed from its
    parent instead. Note that this means the property must have a parent (a
    CSSStyleDeclaration).

    :return: True if at least one value matched the predicate '''
    doomed = list(filter(predicate, prop.propertyValue))
    if len(doomed) != len(prop.propertyValue):
        # Only some values match: cut their text out of the serialized value
        remaining = css_text(prop.propertyValue)
        for value in doomed:
            remaining = remaining.replace(css_text(value), '').strip()
        prop.propertyValue.cssText = remaining
    else:
        prop.parent.removeProperty(prop.name)
    return len(doomed) > 0
# Maps the rule types listed here to their index in this tuple. Used by
# sort_sheet() as the primary sort key; rule types not listed get
# len(RULE_PRIORITIES) there.
RULE_PRIORITIES = {t:i for i, t in enumerate((CSSRule.COMMENT, CSSRule.CHARSET_RULE, CSSRule.IMPORT_RULE, CSSRule.NAMESPACE_RULE))}
def sort_sheet(container, sheet_or_text):
    ''' Sort the rules in a stylesheet. Note that in the general case this can
    change the effective styles, but for most common sheets, it should be safe.

    ``sheet_or_text`` is either a parsed sheet, which is sorted in place, or
    text, which is parsed first. The selectors inside every style rule are
    sorted as well. The sorted sheet is returned.
    '''
    if isinstance(sheet_or_text, unicode_type):
        sheet = container.parse_css(sheet_or_text)
    else:
        sheet = sheet_or_text

    def text_key(text):
        return numeric_sort_key(unicode_type(text or ''))

    def selector_key(selector):
        return (selector.specificity, text_key(selector.selectorText))

    def rule_key(rule):
        primary = RULE_PRIORITIES.get(rule.type, len(RULE_PRIORITIES))
        secondary = text_key(getattr(rule, 'atkeyword', '') or '')
        if rule.type == CSSRule.STYLE_RULE:
            # Style rules go after all other unlisted rule types and sort by
            # their first selector; computing the key also rewrites the
            # rule's selector list in sorted order
            ordered = sorted(rule.selectorList, key=selector_key)
            tertiary = selector_key(ordered[0])
            rule.selectorText = ', '.join(s.selectorText for s in ordered)
            return primary + 1, secondary, tertiary
        tertiary = None
        if rule.type == CSSRule.FONT_FACE_RULE:
            try:
                tertiary = text_key(rule.style.getPropertyValue('font-family'))
            except Exception:
                pass
        return primary, secondary, tertiary

    sheet.cssRules.sort(key=rule_key)
    return sheet
def add_stylesheet_links(container, name, text):
    '''
    Parse ``text`` as the XHTML of the file ``name`` and append a <link> to
    its <head> for every stylesheet in the manifest.

    :return: The serialized markup, or None if there is no <head> or there
        are no stylesheets in the book.
    '''
    root = container.parse_xhtml(text, name)
    heads = root.xpath('//*[local-name() = "head"]')
    if not heads:
        return
    head = heads[0]
    sheet_names = tuple(container.manifest_items_of_type(lambda mt: mt in OEB_STYLES))
    if not sheet_names:
        return
    for sheet_name in sheet_names:
        head.append(head.makeelement(
            XHTML('link'), type='text/css', rel='stylesheet', href=container.name_to_href(sheet_name, name)))
    pretty_xml_tree(head)
    return serialize(root, 'text/html')

View File

@@ -0,0 +1,404 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import codecs, shutil, os, posixpath
from polyglot.builtins import iteritems, itervalues, map
from functools import partial
from collections import Counter, defaultdict
from calibre import sanitize_file_name
from calibre.ebooks.chardet import strip_encoding_declarations
from calibre.ebooks.oeb.base import css_text
from calibre.ebooks.oeb.polish.css import iter_declarations, remove_property_value
from calibre.ebooks.oeb.polish.utils import extract
from polyglot.urllib import urlparse, urlunparse
class LinkReplacer(object):
    '''
    Callable that maps a URL found in the file ``base`` to its replacement,
    using ``link_map`` (old name -> new name) and ``frag_map`` (a callable
    ``(name, anchor) -> new anchor``). ``replaced`` is set to True once any
    URL has actually been changed.
    '''

    def __init__(self, base, container, link_map, frag_map):
        self.base, self.container = base, container
        self.link_map, self.frag_map = link_map, frag_map
        self.replaced = False

    def __call__(self, url):
        if url and url.startswith('#'):
            # A link to an anchor inside base itself
            old_frag = url[1:]
            new_frag = self.frag_map(self.base, old_frag)
            if new_frag and new_frag != old_frag:
                self.replaced = True
                return '#' + new_frag
            return url
        name = self.container.href_to_name(url, self.base)
        new_name = self.link_map.get(name, None) if name else None
        if not new_name:
            return url
        fragment = urlparse(url).fragment
        href = self.container.name_to_href(new_name, self.base)
        if fragment:
            new_frag = self.frag_map(name, fragment)
            if new_frag:
                href += '#%s'%new_frag
        if href != url:
            self.replaced = True
        return href
class IdReplacer(object):
    '''
    Callable that rewrites the fragment of URLs found in the file ``base``
    according to ``id_map``, a mapping of ``{name: {old_id: new_id}}``.
    ``replaced`` is set to True once any URL has actually been changed.
    '''

    def __init__(self, base, container, id_map):
        self.base = base
        self.container = container
        self.id_map = id_map
        self.replaced = False

    def __call__(self, url):
        if url and url.startswith('#'):
            # A link to an anchor inside base itself
            old_id = url[1:]
            new_id = self.id_map.get(self.base, {}).get(old_id)
            if new_id is None or new_id == old_id:
                return url
            self.replaced = True
            return '#' + new_id
        name = self.container.href_to_name(url, self.base)
        if not name:
            return url
        ids_for_target = self.id_map.get(name)
        if ids_for_target is None:
            return url
        parsed = urlparse(url)
        new_fragment = ids_for_target.get(parsed.fragment)
        if new_fragment is None:
            return url
        href = urlunparse(parsed._replace(fragment=new_fragment))
        if href != url:
            self.replaced = True
        return href
class LinkRebaser(object):
    '''
    Callable that rewrites URLs that were relative to ``old_name`` so that
    they are relative to ``new_name`` instead. Links to ``old_name`` itself
    become links to ``new_name``. ``replaced`` is set to True once any URL
    has actually been changed.
    '''

    def __init__(self, container, old_name, new_name):
        self.container = container
        self.old_name = old_name
        self.new_name = new_name
        self.replaced = False

    def __call__(self, url):
        if url and url.startswith('#'):
            return url
        fragment = urlparse(url).fragment
        target = self.container.href_to_name(url, self.old_name)
        if not target:
            return url
        if target == self.old_name:
            target = self.new_name
        href = self.container.name_to_href(target, self.new_name)
        if fragment:
            href += '#' + fragment
        if href != url:
            self.replaced = True
        return href
def replace_links(container, link_map, frag_map=lambda name, frag:frag, replace_in_opf=False):
    '''
    Replace links to files in the container. Will iterate over all files in the container and change the specified links in them.

    :param link_map: A mapping of old canonical name to new canonical name. For example: :code:`{'images/old.png': 'images/new.png'}`
    :param frag_map: A callable that takes two arguments ``(name, anchor)`` and
        returns a new anchor. This is useful if you need to change the anchors in
        HTML files. By default, it does nothing.
    :param replace_in_opf: If False, links are not replaced in the OPF file.

    '''
    for name, media_type in iteritems(container.mime_map):
        is_opf = name == container.opf_name
        if replace_in_opf or not is_opf:
            container.replace_links(name, LinkReplacer(name, container, link_map, frag_map))
def replace_ids(container, id_map):
    '''
    Replace all links in the container that pointed to the changed ids.

    :param id_map: A mapping of {name:id_map} where each id_map is a mapping of {old_id:new_id}
    :return: True iff at least one link was changed

    '''
    changed = False
    for name, media_type in iteritems(container.mime_map):
        repl = IdReplacer(name, container, id_map)
        container.replace_links(name, repl)
        if name == container.opf_name:
            # idref attributes are not links, so they are not seen by
            # container.replace_links() and have to be updated here
            imap = id_map.get(name, {})
            idrefs_changed = False
            for item in container.opf_xpath('//*[@idref]'):
                old_id = item.get('idref')
                if old_id is not None:
                    new_id = imap.get(old_id)
                    if new_id is not None and new_id != old_id:
                        item.set('idref', new_id)
                        idrefs_changed = True
            if idrefs_changed:
                # The OPF tree was edited directly, mark it as modified and
                # report the change to the caller
                container.dirty(name)
                changed = True
        if repl.replaced:
            changed = True
    return changed
def smarten_punctuation(container, report):
    '''
    Smarten the punctuation in all spine items of the book. Changed files
    are rewritten as UTF-8 with a BOM and marked dirty.

    :param report: Callable that is called with a message describing what was done.
    :return: True if the punctuation in at least one file was changed
    '''
    from calibre.ebooks.conversion.preprocess import smarten_punctuation as smarten_html
    any_smartened = False
    for path in container.spine_items:
        name = container.abspath_to_name(path)
        file_changed = False
        with container.open(name, 'r+b') as f:
            html = container.decode(f.read())
            newhtml = smarten_html(html, container.log)
            if newhtml != html:
                file_changed = True
                report(_('Smartened punctuation in: %s')%name)
                newhtml = strip_encoding_declarations(newhtml)
                f.seek(0)
                f.truncate()
                f.write(codecs.BOM_UTF8 + newhtml.encode('utf-8'))
        if not file_changed:
            continue
        # Add an encoding declaration (it will be added automatically when
        # serialized)
        root = container.parsed(name)
        for meta in root.xpath('descendant::*[local-name()="meta" and @http-equiv]'):
            meta.getparent().remove(meta)
        container.dirty(name)
        any_smartened = True
    if not any_smartened:
        report(_('No punctuation that could be smartened found'))
    return any_smartened
def rename_files(container, file_map):
    '''
    Rename files in the container, automatically updating all links to them.

    :param file_map: A mapping of old canonical name to new canonical name, for
        example: :code:`{'text/chapter1.html': 'chapter1.html'}`.
    :raises ValueError: If a name is both a source and a destination, if a
        destination already exists or if the destinations are not unique.
    '''
    destinations = tuple(itervalues(file_map))
    overlap = set(file_map).intersection(set(destinations))
    if overlap:
        raise ValueError('Circular rename detected. The files %s are both rename targets and destinations' % ', '.join(overlap))
    for name, dest in iteritems(file_map):
        if not container.exists(dest):
            continue
        if name != dest and name.lower() == dest.lower():
            # A case change on an OS with a case insensitive file-system.
            continue
        raise ValueError('Cannot rename {0} to {1} as {1} already exists'.format(name, dest))
    if len(destinations) != len(set(destinations)):
        raise ValueError('Cannot rename, the set of destination files contains duplicates')
    link_map = {}
    for current_name, new_name in iteritems(file_map):
        container.rename(current_name, new_name)
        if new_name != container.opf_name:  # OPF is handled by the container
            link_map[current_name] = new_name
    replace_links(container, link_map, replace_in_opf=True)
def replace_file(container, name, path, basename, force_mt=None):
    '''
    Replace the contents of the file ``name`` in the container with the
    contents of the file at ``path``. The file is renamed to the sanitized
    ``basename`` (kept in the same folder, made unique with a numeric suffix
    if needed) and its media type in the OPF manifest is updated.

    :param force_mt: Media type to use instead of the one guessed from the new name.
    '''
    dirname = name.rpartition('/')[0]
    nname = sanitize_file_name(basename)
    if dirname:
        nname = dirname + '/' + nname
    with open(path, 'rb') as src:
        if name != nname:
            stem, ext = nname.rpartition('.')[0::2]
            suffix = 0
            while container.exists(nname):
                suffix += 1
                nname = stem + ('_%d.%s' % (suffix, ext))
            rename_files(container, {name:nname})
            mt = force_mt or container.guess_type(nname)
            container.mime_map[nname] = mt
            for itemid, item_name in iteritems(container.manifest_id_map):
                if item_name != nname:
                    continue
                for item in container.opf_xpath('//opf:manifest/opf:item[@href and @id="%s"]' % itemid):
                    item.set('media-type', mt)
            container.dirty(container.opf_name)
        with container.open(nname, 'wb') as dest:
            shutil.copyfileobj(src, dest)
def mt_to_category(container, mt):
    '''
    Return the file category for the media type ``mt``: one of ``text``,
    ``style``, ``font``, ``opf``, ``toc`` or, for anything else, the part of
    the media type before the slash. ``container`` is not used.
    '''
    from calibre.ebooks.oeb.polish.utils import guess_type
    from calibre.ebooks.oeb.polish.container import OEB_FONTS
    from calibre.ebooks.oeb.base import OEB_DOCS, OEB_STYLES
    if mt in OEB_DOCS:
        return 'text'
    if mt in OEB_STYLES:
        return 'style'
    if mt in OEB_FONTS:
        return 'font'
    if mt == guess_type('a.opf'):
        return 'opf'
    if mt == guess_type('a.ncx'):
        return 'toc'
    return mt.partition('/')[0]
def get_recommended_folders(container, names):
    ''' Return the folders that are recommended for the given filenames. The
    recommendation is based on where the majority of files of the same type are
    located in the container. If no files of a particular type are present, the
    recommended folder is assumed to be the folder containing the OPF file.

    :param names: Iterable of file names
    :return: A mapping of each name to its recommended folder ('' for the root) '''
    from calibre.ebooks.oeb.polish.utils import guess_type
    counts = defaultdict(Counter)
    for name, mt in iteritems(container.mime_map):
        # rpartition() gives '' for names that have no folder component
        folder = name.rpartition('/')[0]
        counts[mt_to_category(container, mt)][folder] += 1

    # counts is a defaultdict, so counts['opf'] would never raise KeyError:
    # it would insert an empty Counter, whose most_common(1) is empty. Use
    # get() to leave counts untouched when there is no OPF entry.
    opf_counter = counts.get('opf')
    opf_folder = opf_counter.most_common(1)[0][0] if opf_counter else ''

    recommendations = {category:counter.most_common(1)[0][0] for category, counter in iteritems(counts)}
    return {n:recommendations.get(mt_to_category(container, guess_type(os.path.basename(n))), opf_folder) for n in names}
def normalize_case(container, val):
    '''
    Return the name ``val`` with every path component replaced by an entry
    from the corresponding folder on disk that differs from the component
    only by case, if such an entry exists.
    '''
    def listdir_or_empty(folder):
        try:
            return os.listdir(folder)
        except EnvironmentError:
            return ()

    parts = val.split('/')
    normalized = []
    for depth, part in enumerate(parts, 1):
        abspath = container.name_to_abspath('/'.join(parts[:depth]))
        lowered = part.lower()
        siblings = listdir_or_empty(os.path.dirname(abspath))
        alternative = next((entry for entry in siblings if entry != part and entry.lower() == lowered), None)
        normalized.append(part if alternative is None else alternative)
    return '/'.join(normalized)
def rationalize_folders(container, folder_type_map):
    '''
    Compute the renames needed to move every file (except those in
    META-INF/) into the folder that ``folder_type_map`` specifies for its
    category. Name clashes are resolved with a numeric suffix.

    Note that the values of ``folder_type_map`` are case-normalized in place.

    :param folder_type_map: Mapping of category (see :func:`mt_to_category`) to folder
    :return: A mapping of current name to new name
    '''
    all_names = set(container.mime_map)
    new_names = set()
    name_map = {}
    for key in tuple(folder_type_map):
        folder_type_map[key] = normalize_case(container, folder_type_map[key])
    for name in all_names:
        if name.startswith('META-INF/'):
            continue
        category = mt_to_category(container, container.mime_map[name])
        folder = folder_type_map.get(category, None)
        if folder is None:
            continue
        bn = posixpath.basename(name)
        new_name = posixpath.join(folder, bn)
        if new_name == name:
            continue
        stem, ext = bn.rpartition('.')[0::2]
        clash_count = 0
        while new_name in all_names or new_name in new_names:
            clash_count += 1
            new_name = posixpath.join(folder, '%s_%d.%s' % (stem, clash_count, ext))
        name_map[name] = new_name
        new_names.add(new_name)
    return name_map
def remove_links_in_sheet(href_to_name, sheet, predicate):
    '''
    Remove from ``sheet`` the @import rules and the URI values in
    declarations for which ``predicate(name, href, None)`` is true.

    :param href_to_name: Callable that converts an href into a file name
    :return: True if the sheet was changed
    '''
    doomed_indices = [
        idx for idx, rule in enumerate(sheet)
        if rule.type == rule.IMPORT_RULE and predicate(href_to_name(rule.href), rule.href, None)
    ]
    changed = bool(doomed_indices)
    # Delete from the end so that the remaining indices stay valid
    for idx in reversed(doomed_indices):
        sheet.deleteRule(idx)

    for declaration in iter_declarations(sheet):
        if remove_links_in_declaration(href_to_name, declaration, predicate):
            changed = True
    return changed
def remove_links_in_declaration(href_to_name, style, predicate):
    '''
    Remove from the declaration ``style`` all URI values for which
    ``predicate(name, uri, None)`` is true.

    :return: True if the declaration was changed
    '''
    def is_unwanted_uri(value):
        if value.type != value.URI:
            return False
        return predicate(href_to_name(value.uri), value.uri, None)

    changed = False
    # Iterate over a snapshot as properties can be removed from style
    for prop in tuple(style.getProperties(all=True)):
        if remove_property_value(prop, is_unwanted_uri):
            changed = True
    return changed
def remove_links_to(container, predicate):
    ''' predicate must be a function that takes the arguments (name, href,
    fragment=None) and returns True iff the link should be removed

    Links are removed from the markup, the <style> elements and the style
    attributes of HTML files and from stylesheets. Returns the set of names
    of the files that were changed, all of which are marked dirty. '''
    from calibre.ebooks.oeb.base import iterlinks, OEB_DOCS, OEB_STYLES, XPath, XHTML
    stylepath = XPath('//h:style')
    styleattrpath = XPath('//*[@style]')
    # Elements that are removed entirely, instead of only losing the attribute
    extractable_tags = (XHTML('link'), XHTML('img'))
    changed = set()
    for name, mt in iteritems(container.mime_map):
        removed = False
        if mt in OEB_DOCS:
            to_name = partial(container.href_to_name, base=name)
            root = container.parsed(name)
            for el, attr, href, pos in iterlinks(root, find_links_in_css=False):
                hname = container.href_to_name(href, name)
                frag = href.partition('#')[-1]
                if not predicate(hname, href, frag):
                    continue
                if attr is None:
                    el.text = None
                elif el.tag in extractable_tags:
                    extract(el)
                else:
                    del el.attrib[attr]
                removed = True
            for tag in stylepath(root):
                if tag.text and (tag.get('type') or 'text/css').lower() == 'text/css':
                    sheet = container.parse_css(tag.text)
                    if remove_links_in_sheet(to_name, sheet, predicate):
                        tag.text = css_text(sheet)
                        removed = True
            for tag in styleattrpath(root):
                raw = tag.get('style')
                if raw:
                    declaration = container.parse_css(raw, is_declaration=True)
                    if remove_links_in_declaration(to_name, declaration, predicate):
                        removed = True
                        tag.set('style', css_text(declaration))
        elif mt in OEB_STYLES:
            removed = remove_links_in_sheet(partial(container.href_to_name, base=name), container.parsed(name), predicate)
        if removed:
            changed.add(name)
    for changed_name in changed:
        container.dirty(changed_name)
    return changed
def get_spine_order_for_all_files(container):
    '''
    Return a mapping of file name to ``(spine_position, link_position)``.
    Spine files (linear ones first, then non-linear ones) get a link position
    of -1. A file that is not in the spine gets the position of the first
    spine file that links to it and the index of that link in the file.
    '''
    linear, non_linear = [], []
    for name, is_linear in container.spine_names:
        bucket = linear if is_linear else non_linear
        bucket.append(name)
    ordered = linear + non_linear
    in_spine = frozenset(ordered)
    order = {}
    for spine_pos, name in enumerate(ordered):
        if name not in order:
            order[name] = (spine_pos, -1)
        for link_pos, href in enumerate(container.iterlinks(name, get_line_numbers=False)):
            target = container.href_to_name(href, name)
            if target not in in_spine and target not in order:
                order[target] = (spine_pos, link_pos)
    return order

View File

@@ -0,0 +1,517 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import copy, os, re
from polyglot.builtins import map, string_or_bytes, range
from calibre.ebooks.oeb.base import barename, XPNSMAP, XPath, OPF, XHTML, OEB_DOCS
from calibre.ebooks.oeb.polish.errors import MalformedMarkup
from calibre.ebooks.oeb.polish.toc import node_from_loc
from calibre.ebooks.oeb.polish.replace import LinkRebaser
from polyglot.builtins import iteritems, unicode_type
from polyglot.urllib import urlparse
class AbortError(ValueError):
    # Raised by split() and multisplit() when a split cannot be performed:
    # the split point is inside a table or is the <body> tag, or (in
    # multisplit) the expression matches no nodes.
    pass
def in_table(node):
    ' Return True iff node or one of its ancestors is a table element '
    def self_and_ancestors(start):
        while start is not None:
            yield start
            start = start.getparent()

    return any(candidate.tag.endswith('}table') for candidate in self_and_ancestors(node))
def adjust_split_point(split_point, log):
    '''
    Move the split point up its ancestor chain if it has no content
    before it. This handles the common case:
    <div id="chapter1"><h2>Chapter 1</h2>...</div> with a page break on the
    h2.

    :return: The (possibly adjusted) split point
    '''
    sp = split_point
    while True:
        parent = sp.getparent()
        if parent is None:
            break
        if barename(parent.tag) in {'body', 'html'}:
            break
        if parent.text and parent.text.strip():
            # The parent has text before sp
            break
        if parent.index(sp) > 0:
            # The parent has elements before sp
            break
        sp = parent
    if sp is not split_point:
        log.debug('Adjusted split point to ancestor')
    return sp
def get_body(root):
    ' Return the result of looking up the path h:body (using XPNSMAP) in root '
    body = root.find('h:body', namespaces=XPNSMAP)
    return body
def do_split(split_point, log, before=True):
    '''
    Split tree into a *before* and an *after* tree at ``split_point``.

    The tree containing ``split_point`` is not modified: two deep copies of
    it are made and pruned. When splitting before the split point, the split
    point is first moved up to an ancestor if it has no content before it
    (see :func:`adjust_split_point`).

    :param split_point: The Element at which to split
    :param log: Logger, passed on to :func:`adjust_split_point`
    :param before: If True tree is split before split_point, otherwise after split_point
    :return: before_tree, after_tree
    '''
    if before:
        # We cannot adjust for after since moving an after split point to a
        # parent will cause breakage if the parent contains any content
        # after the original split point
        split_point = adjust_split_point(split_point, log)
    tree = split_point.getroottree()
    # The path is used to find the copies of split_point in the copied trees
    path = tree.getpath(split_point)

    tree, tree2 = copy.deepcopy(tree), copy.deepcopy(tree)
    root, root2 = tree.getroot(), tree2.getroot()
    body, body2 = map(get_body, (root, root2))
    split_point = root.xpath(path)[0]
    split_point2 = root2.xpath(path)[0]

    def nix_element(elem, top=True):
        # Remove elem unless top is False in which case replace elem by its
        # children
        parent = elem.getparent()
        if top:
            parent.remove(elem)
        else:
            index = parent.index(elem)
            parent[index:index+1] = list(elem.iterchildren())

    # Tree 1
    # Keep everything up to the split point and remove everything after it.
    # The descendants are snapshotted into a tuple as the tree is modified
    # during the iteration.
    hit_split_point = False
    keep_descendants = False
    split_point_descendants = frozenset(split_point.iterdescendants())
    for elem in tuple(body.iterdescendants()):
        if elem is split_point:
            hit_split_point = True
            if before:
                nix_element(elem)
            else:
                # We want to keep the descendants of the split point in
                # Tree 1
                keep_descendants = True
                # We want the split point element, but not its tail
                elem.tail = '\n'

            continue
        if hit_split_point:
            if keep_descendants:
                if elem in split_point_descendants:
                    # elem is a descendant keep it
                    continue
                else:
                    # We are out of split_point, so prevent further set
                    # lookups of split_point_descendants
                    keep_descendants = False
            nix_element(elem)

    # Tree 2
    # Remove everything before the split point. Ancestors of the split point
    # are kept (emptied of text), everything else is replaced by its
    # children, which are themselves processed by later iterations.
    ancestors = frozenset(XPath('ancestor::*')(split_point2))
    for elem in tuple(body2.iterdescendants()):
        if elem is split_point2:
            if not before:
                # Keep the split point element's tail, if it contains non-whitespace
                # text
                tail = elem.tail
                if tail and not tail.isspace():
                    parent = elem.getparent()
                    idx = parent.index(elem)
                    if idx == 0:
                        parent.text = (parent.text or '') + tail
                    else:
                        sib = parent[idx-1]
                        sib.tail = (sib.tail or '') + tail
                # Remove the element itself
                nix_element(elem)
            # Everything from the split point onwards stays in Tree 2
            break
        if elem in ancestors:
            # We have to preserve the ancestors as they could have CSS
            # styles that are inherited/applicable, like font or
            # width. So we only remove the text, if any.
            elem.text = '\n'
        else:
            nix_element(elem, top=False)

    body2.text = '\n'

    return tree, tree2
class SplitLinkReplacer(object):
    '''
    Callable that redirects links found in the file ``base`` which point to
    an anchor of ``top_name`` that is listed in ``bottom_anchors`` to
    ``bottom_name``. ``replaced`` is set to True once any URL has been
    redirected.
    '''

    def __init__(self, base, bottom_anchors, top_name, bottom_name, container):
        self.base = base
        self.container = container
        self.top_name = top_name
        self.bottom_name = bottom_name
        self.bottom_anchors = bottom_anchors
        self.replaced = False

    def __call__(self, url):
        if url and url.startswith('#'):
            return url
        if self.container.href_to_name(url, self.base) != self.top_name:
            return url
        fragment = urlparse(url).fragment
        if not fragment or fragment not in self.bottom_anchors:
            return url
        redirected = self.container.name_to_href(self.bottom_name, self.base) + '#' + fragment
        self.replaced = True
        return redirected
def split(container, name, loc_or_xpath, before=True, totals=None):
    '''
    Split the file specified by name at the position specified by loc_or_xpath.
    Splitting automatically migrates all links and references to the affected
    files.

    The content before the split stays in ``name``, the rest is placed in a
    newly generated file that is inserted into the spine right after ``name``.

    :param loc_or_xpath: Should be an XPath expression such as
        //h:div[@id="split_here"]. Can also be a *loc* which is used internally to
        implement splitting in the preview panel.
    :param before: If True the split occurs before the identified element otherwise after it.
    :param totals: Used internally
    :return: The name of the newly created file
    :raises AbortError: If the split point is inside a table or is the <body> tag
    '''

    root = container.parsed(name)
    if isinstance(loc_or_xpath, unicode_type):
        split_point = root.xpath(loc_or_xpath)[0]
    else:
        try:
            split_point = node_from_loc(root, loc_or_xpath, totals=totals)
        except MalformedMarkup:
            # The webkit HTML parser and the container parser have yielded
            # different node counts, this can happen if the file is valid XML
            # but contains constructs like nested <p> tags. So force parse it
            # with the HTML 5 parser and try again.
            raw = container.raw_data(name)
            root = container.parse_xhtml(raw, fname=name, force_html5_parse=True)
            try:
                split_point = node_from_loc(root, loc_or_xpath, totals=totals)
            except MalformedMarkup:
                raise MalformedMarkup(_('The file %s has malformed markup. Try running the Fix HTML tool'
                                        ' before splitting') % name)
            # The re-parse worked, so use the re-parsed tree from now on
            container.replace(name, root)
    if in_table(split_point):
        raise AbortError('Cannot split inside tables')
    if split_point.tag.endswith('}body'):
        raise AbortError('Cannot split on the <body> tag')
    tree1, tree2 = do_split(split_point, container.log, before=before)
    root1, root2 = tree1.getroot(), tree2.getroot()
    # The empty anchor stands for links to the file itself, these keep
    # pointing at the top file
    anchors_in_top = frozenset(root1.xpath('//*/@id')) | frozenset(root1.xpath('//*/@name')) | {''}
    anchors_in_bottom = frozenset(root2.xpath('//*/@id')) | frozenset(root2.xpath('//*/@name'))
    base, ext = name.rpartition('.')[0::2]
    # Strip the suffix of a previous split, so that splitting an already
    # split file continues the numbering instead of nesting suffixes
    base = re.sub(r'_split\d+$', '', base)
    # Find the first unused name of the form base_splitN.ext
    nname, s = None, 0
    while not nname or container.exists(nname):
        s += 1
        nname = '%s_split%d.%s' % (base, s, ext)
    manifest_item = container.generate_item(nname, media_type=container.mime_map[name])
    bottom_name = container.href_to_name(manifest_item.get('href'), container.opf_name)

    # Fix links in the split trees
    for r in (root1, root2):
        for a in r.xpath('//*[@href]'):
            url = a.get('href')
            if url.startswith('#'):
                fname = name
            else:
                fname = container.href_to_name(url, name)
            if fname == name:
                # A link to the file being split: point it at whichever half
                # now contains the anchor, using a bare #fragment when that
                # is the half the link itself is in
                purl = urlparse(url)
                if purl.fragment in anchors_in_top:
                    if r is root2:
                        a.set('href', '%s#%s' % (container.name_to_href(name, bottom_name), purl.fragment))
                    else:
                        a.set('href', '#' + purl.fragment)
                elif purl.fragment in anchors_in_bottom:
                    if r is root1:
                        a.set('href', '%s#%s' % (container.name_to_href(bottom_name, name), purl.fragment))
                    else:
                        a.set('href', '#' + purl.fragment)

    # Fix all links in the container that point to anchors in the bottom tree
    for fname, media_type in iteritems(container.mime_map):
        if fname not in {name, bottom_name}:
            repl = SplitLinkReplacer(fname, anchors_in_bottom, name, bottom_name, container)
            container.replace_links(fname, repl)

    container.replace(name, root1)
    container.replace(bottom_name, root2)

    spine = container.opf_xpath('//opf:spine')[0]
    # Find the spine entry of the file that was split. spine_item and linear
    # are read after the loop: if no entry matches, they refer to the last
    # spine entry.
    for spine_item, spine_name, linear in container.spine_iter:
        if spine_name == name:
            break
    index = spine.index(spine_item) + 1

    # The new file goes right after the original and inherits its linearity
    si = spine.makeelement(OPF('itemref'), idref=manifest_item.get('id'))
    if not linear:
        si.set('linear', 'no')
    container.insert_into_xml(spine, si, index=index)
    container.dirty(container.opf_name)
    return bottom_name
def multisplit(container, name, xpath, before=True):
    '''
    Split the specified file at multiple locations (all tags that match the specified XPath expression). See also: :func:`split`.
    Splitting automatically migrates all links and references to the affected
    files.

    :param before: If True the splits occur before the identified element otherwise after it.
    :return: The names of the newly created files, in order
    :raises AbortError: If nothing matches or a match is inside a table or is the <body> tag
    '''
    root = container.parsed(name)
    nodes = root.xpath(xpath, namespaces=XPNSMAP)
    if not nodes:
        raise AbortError(_('The expression %s did not match any nodes') % xpath)
    for node in nodes:
        if in_table(node):
            raise AbortError('Cannot split inside tables')
        if node.tag.endswith('}body'):
            raise AbortError('Cannot split on the <body> tag')

    # Mark the split points with a temporary attribute, so that they can be
    # found again in the files created by the successive splits
    for marker_num, node in enumerate(nodes):
        node.set('calibre-split-point', unicode_type(marker_num))

    all_names = [name]
    for marker_num in range(len(nodes)):
        # Each split point is now in the file created by the previous split
        all_names.append(split(
            container, all_names[-1], '//*[@calibre-split-point="%d"]' % marker_num, before=before))

    # Remove the temporary markers again
    for x in all_names:
        for tag in container.parsed(x).xpath('//*[@calibre-split-point]'):
            tag.attrib.pop('calibre-split-point')
        container.dirty(x)

    return all_names[1:]
class MergeLinkReplacer(object):
    '''
    Callable that redirects links found in the file ``base`` which point to
    one of the files in ``anchor_map`` to the file ``master``. ``anchor_map``
    maps file names to mappings of ``{old_anchor: new_anchor}``. ``replaced``
    is set to True once any URL has been redirected.
    '''

    def __init__(self, base, anchor_map, master, container):
        self.base = base
        self.master = master
        self.container = container
        self.anchor_map = anchor_map
        self.replaced = False

    def __call__(self, url):
        if url and url.startswith('#'):
            return url
        target = self.container.href_to_name(url, self.base)
        renamed_anchors = self.anchor_map.get(target, None)
        if renamed_anchors is None:
            return url
        old_anchor = urlparse(url).fragment or ''
        new_anchor = renamed_anchors.get(old_anchor, old_anchor)
        redirected = self.container.name_to_href(self.master, self.base) + '#' + new_anchor
        self.replaced = True
        return redirected
def add_text(body, text):
    '''
    Append ``text`` at the end of ``body``: as the tail of its last child if
    it has children, otherwise as its own text.
    '''
    if len(body) == 0:
        body.text = (body.text or '') + text
        return
    last = body[-1]
    last.tail = (last.tail or '') + text
def all_anchors(root):
    ''' Return the set of all ``id`` and ``name`` attribute values in ``root`` '''
    anchors = set(root.xpath('//*/@id'))
    anchors.update(root.xpath('//*/@name'))
    return anchors
def all_stylesheets(container, name):
    '''
    Yield the container names of the stylesheets linked from the <head> of
    the HTML file ``name``.

    Only <link> elements that have an href and whose ``type`` is
    ``text/css`` (the default when the attribute is missing) are reported.
    '''
    for link in XPath('//h:head/h:link[@href]')(container.parsed(name)):
        # Every href must be resolved against the HTML file itself. Re-binding
        # ``name`` here would resolve the second and later links relative to
        # the previously found stylesheet.
        sheet = container.href_to_name(link.get('href'), name)
        if link.get('type', 'text/css') == 'text/css':
            yield sheet
def unique_anchor(seen_anchors, current):
    '''
    Return ``current`` if it is not in ``seen_anchors``, otherwise the first
    of ``current_1``, ``current_2``, ... that is not in ``seen_anchors``.
    '''
    if current not in seen_anchors:
        return current
    suffix = 1
    while True:
        candidate = '%s_%d' % (current, suffix)
        if candidate not in seen_anchors:
            return candidate
        suffix += 1
def remove_name_attributes(root):
    '''
    Remove all name attributes, replacing them with id attributes. An
    element that already has an id keeps it and simply loses its name.
    '''
    for elem in root.xpath('//*[@name]'):
        name = elem.attrib.pop('name')
        if 'id' not in elem.attrib:
            elem.set('id', name)
def merge_html(container, names, master, insert_page_breaks=False):
    '''
    Merge the HTML files in ``names`` into the file ``master``.

    The body content of every other file is appended to the last <body> of
    the master, stylesheets linked only from the merged files are linked from
    the master, clashing anchors are renamed and all links in the container
    that pointed at a merged file are redirected to the master. The merged
    files are removed from the container.

    :param insert_page_breaks: If True a ``page-break-before: always`` style is
        added to the first element coming from each merged file
    :return: A mapping of merged file name to the id of the first element
        that file contributed to the master
    '''
    p = container.parsed
    root = p(master)

    # Ensure master has a <head>
    head = root.find('h:head', namespaces=XPNSMAP)
    if head is None:
        head = root.makeelement(XHTML('head'))
        container.insert_into_xml(root, head, 0)

    seen_anchors = all_anchors(root)
    seen_stylesheets = set(all_stylesheets(container, master))
    master_body = p(master).findall('h:body', namespaces=XPNSMAP)[-1]
    master_base = os.path.dirname(master)
    # Per merged file: old anchor -> new anchor. The '' key maps links that
    # have no fragment to the first element of that file.
    anchor_map = {n:{} for n in names if n != master}
    first_anchor_map = {}

    for name in names:
        if name == master:
            continue
        # Insert new stylesheets into master
        for sheet in all_stylesheets(container, name):
            if sheet not in seen_stylesheets:
                seen_stylesheets.add(sheet)
                link = head.makeelement(XHTML('link'), rel='stylesheet', type='text/css', href=container.name_to_href(sheet, master))
                container.insert_into_xml(head, link)

        # Rebase links if master is in a different directory
        if os.path.dirname(name) != master_base:
            container.replace_links(name, LinkRebaser(container, name, master))

        root = p(name)
        # Flat list of everything to move: leading body text (or a blank
        # separator) followed by the body's child elements
        children = []
        for body in p(name).findall('h:body', namespaces=XPNSMAP):
            children.append(body.text if body.text and body.text.strip() else '\n\n')
            children.extend(body)

        # Find the first element (as opposed to text) among the children
        first_child = ''
        for first_child in children:
            if not isinstance(first_child, string_or_bytes):
                break
        if isinstance(first_child, string_or_bytes):
            # body contained only text, no tags: wrap the text in a <p> so
            # that there is an element to carry the anchor
            first_child = body.makeelement(XHTML('p'))
            first_child.text, children[0] = children[0], first_child

        amap = anchor_map[name]
        remove_name_attributes(root)

        for elem in root.xpath('//*[@id]'):
            val = elem.get('id')
            if not val:
                continue
            if val in seen_anchors:
                nval = unique_anchor(seen_anchors, val)
                elem.set('id', nval)
                amap[val] = nval
                # Record the replacement as well, otherwise the same clash in
                # a later file would be given the identical replacement id
                seen_anchors.add(nval)
            else:
                seen_anchors.add(val)

        if 'id' not in first_child.attrib:
            first_child.set('id', unique_anchor(seen_anchors, 'top'))
            seen_anchors.add(first_child.get('id'))
        first_anchor_map[name] = first_child.get('id')

        if insert_page_breaks:
            first_child.set('style', first_child.get('style', '') + '; page-break-before: always')

        amap[''] = first_child.get('id')

        # Fix links that point to local changed anchors
        for a in XPath('//h:a[starts-with(@href, "#")]')(root):
            q = a.get('href')[1:]
            if q in amap:
                a.set('href', '#' + amap[q])

        for child in children:
            if isinstance(child, string_or_bytes):
                add_text(master_body, child)
            else:
                master_body.append(copy.deepcopy(child))

        container.remove_item(name, remove_from_guide=False)

    # Fix all links in the container that point to merged files
    for fname, media_type in iteritems(container.mime_map):
        repl = MergeLinkReplacer(fname, anchor_map, master, container)
        container.replace_links(fname, repl)

    return first_anchor_map
def merge_css(container, names, master):
    '''
    Merge the stylesheets in ``names`` into the stylesheet ``master``.

    The rules of every other sheet (minus @charset rules) are added to the
    master, the merged sheets are removed from the container and HTML files
    that linked to a merged sheet are made to link to the master instead.
    '''
    parsed = container.parsed
    master_sheet = parsed(master)
    master_dir = os.path.dirname(master)
    merged = set()

    for name in names:
        if name == master:
            continue
        # Rebase links if master is in a different directory
        if os.path.dirname(name) != master_dir:
            container.replace_links(name, LinkRebaser(container, name, master))

        sheet = parsed(name)
        # Remove charset rules
        charset_rules = [r for r in sheet.cssRules if r.type == r.CHARSET_RULE]
        for charset_rule in charset_rules:
            sheet.deleteRule(sheet.cssRules.index(charset_rule))
        for rule in sheet.cssRules:
            master_sheet.add(rule)

        container.remove_item(name)
        merged.add(name)

    # Remove links to merged stylesheets in the html files, replacing with a
    # link to the master sheet
    for name, mt in iteritems(container.mime_map):
        if mt not in OEB_DOCS:
            continue
        removed = False
        root = parsed(name)
        for link in XPath('//h:link[@href]')(root):
            if container.href_to_name(link.get('href'), name) in merged:
                container.remove_from_xml(link)
                removed = True
        if not removed:
            continue
        container.dirty(name)
        if master not in set(all_stylesheets(container, name)):
            head = root.find('h:head', namespaces=XPNSMAP)
            if head is not None:
                link = head.makeelement(XHTML('link'), type='text/css', rel='stylesheet', href=container.name_to_href(master, name))
                container.insert_into_xml(head, link)
def merge(container, category, names, master):
    '''
    Merge the specified files into a single file, automatically migrating all
    links and references to the affected files. The files must all be either
    HTML or CSS files.

    :param category: Must be either ``'text'`` for HTML files or ``'styles'`` for CSS files
    :param names: The list of files to be merged
    :param master: Which of the merged files is the *master* file, that is, the file that will remain after merging.
    '''
    if category not in {'text', 'styles'}:
        raise AbortError('Cannot merge files of type: %s' % category)
    if len(names) < 2:
        raise AbortError('Must specify at least two files to be merged')
    if master not in names:
        raise AbortError('The master file (%s) must be one of the files being merged' % master)

    # category was validated above, so anything that is not 'text' is 'styles'
    merger = merge_html if category == 'text' else merge_css
    merger(container, names, master)

    container.dirty(master)

View File

@@ -0,0 +1,172 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import textwrap
from calibre import guess_type
from calibre.utils.imghdr import identify
from calibre.utils.xml_parse import safe_xml_fromstring
from polyglot.builtins import unicode_type
from polyglot.urllib import unquote
class CoverManager(object):
    '''
    Transform that makes sure the book starts with a title page showing the
    cover image, generating a default cover image when the book has none.
    '''

    # Title page that wraps the cover image in an <svg> element. The __xxx__
    # placeholders are filled in with str.replace(), the single %s with the
    # cover href, which is why literal percent signs are doubled.
    SVG_TEMPLATE = textwrap.dedent('''\
    <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
    <head>
    <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
    <meta name="calibre:cover" content="true" />
    <title>Cover</title>
    <style type="text/css" title="override_css">
    @page {padding: 0pt; margin:0pt}
    body { text-align: center; padding:0pt; margin: 0pt; }
    </style>
    </head>
    <body>
    <div>
    <svg version="1.1" xmlns="http://www.w3.org/2000/svg"
    xmlns:xlink="http://www.w3.org/1999/xlink"
    width="100%%" height="100%%" viewBox="__viewbox__"
    preserveAspectRatio="__ar__">
    <image width="__width__" height="__height__" xlink:href="%s"/>
    </svg>
    </div>
    </body>
    </html>
    ''')

    # Title page that shows the cover image with a plain <img> element
    NONSVG_TEMPLATE = textwrap.dedent('''\
    <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en">
    <head>
    <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
    <meta name="calibre:cover" content="true" />
    <title>Cover</title>
    <style type="text/css" title="override_css">
    @page {padding: 0pt; margin:0pt}
    body { text-align: center; padding:0pt; margin: 0pt }
    div { padding:0pt; margin: 0pt }
    img { padding:0pt; margin: 0pt }
    </style>
    </head>
    <body>
    <div>
    <img src="%s" alt="cover" __style__ />
    </div>
    </body>
    </html>
    ''')

    def __init__(self, no_default_cover=False, no_svg_cover=False,
            preserve_aspect_ratio=False, fixed_size=None):
        '''
        :param no_default_cover: If True no cover is generated for books without one
        :param no_svg_cover: If True the <img> based title page is used instead of the SVG one
        :param preserve_aspect_ratio: If True the SVG title page keeps the aspect ratio of the image
        :param fixed_size: Optional ``(width, height)`` pair of CSS lengths used for the <img> element
        '''
        self.no_default_cover = no_default_cover
        self.no_svg_cover = no_svg_cover
        self.preserve_aspect_ratio = preserve_aspect_ratio

        ar = 'xMidYMid meet' if preserve_aspect_ratio else 'none'
        self.svg_template = self.SVG_TEMPLATE.replace('__ar__', ar)

        if fixed_size is None:
            style = 'style="height: 100%%"'
        else:
            width, height = fixed_size
            style = 'style="height: %s; width: %s"'%(height, width)
            # The template is %-formatted with the cover href later on, so
            # percent signs in the sizes (e.g. "100%") must be escaped
            style = style.replace('%', '%%')
        self.non_svg_template = self.NONSVG_TEMPLATE.replace('__style__',
                style)

    def __call__(self, oeb, opts, log):
        self.oeb = oeb
        self.log = log
        self.insert_cover()

    def default_cover(self):
        '''
        Create a generic cover for books that don't have a cover.

        :return: The href of the generated cover image, or None if default
            covers are disabled or generating the cover failed
        '''
        if self.no_default_cover:
            return None
        self.log('Generating default cover')
        m = self.oeb.metadata
        title = unicode_type(m.title[0])
        authors = [unicode_type(x) for x in m.creator if x.role == 'aut']
        try:
            from calibre.ebooks.covers import create_cover
            series = series_index = None
            if m.series:
                try:
                    series, series_index = unicode_type(m.series[0]), m.series_index[0]
                except IndexError:
                    pass
            img_data = create_cover(title, authors, series, series_index)
            item_id, href = self.oeb.manifest.generate('cover',
                    'cover_image.jpg')
            item = self.oeb.manifest.add(item_id, href, guess_type('t.jpg')[0],
                        data=img_data)
            m.clear('cover')
            m.add('cover', item.id)
            return item.href
        except Exception:
            # Best effort: a failure here only means no cover is inserted.
            # KeyboardInterrupt and SystemExit are deliberately not caught.
            self.log.exception('Failed to generate default cover')
        return None

    def inspect_cover(self, href):
        '''
        Return ``(width, height)`` of the manifest image at ``href``, or
        ``(-1, -1)`` if there is no such item or it could not be read.
        '''
        from calibre.ebooks.oeb.base import urlnormalize
        wanted = urlnormalize(href)
        for x in self.oeb.manifest:
            if x.href == wanted:
                try:
                    raw = x.data
                    return identify(raw)[1:]
                except Exception:
                    self.log.exception('Failed to read cover image dimensions')
        return -1, -1

    def insert_cover(self):
        '''
        Put a title page at the start of the spine.

        If the guide has no ``titlepage`` entry, a new title page is created
        from the cover image referenced by the guide (or from a generated
        default cover). Otherwise the existing title page item is used. The
        ``cover`` and ``titlepage`` guide references and the TOC entry that
        refers to the cover, if any, are pointed at the title page.
        '''
        from calibre.ebooks.oeb.base import urldefrag
        g, m = self.oeb.guide, self.oeb.manifest
        item = None
        if 'titlepage' not in g:
            if 'cover' in g:
                href = g['cover'].href
            else:
                href = self.default_cover()
            if href is None:
                return
            width, height = self.inspect_cover(href)
            if width == -1 or height == -1:
                self.log.warning('Failed to read cover dimensions')
                width, height = 600, 800
            self.svg_template = self.svg_template.replace('__viewbox__',
                    '0 0 %d %d'%(width, height))
            self.svg_template = self.svg_template.replace('__width__',
                    unicode_type(width))
            self.svg_template = self.svg_template.replace('__height__',
                    unicode_type(height))

            templ = self.non_svg_template if self.no_svg_cover \
                    else self.svg_template
            tp = templ%unquote(href)
            item_id, href = m.generate('titlepage', 'titlepage.xhtml')
            item = m.add(item_id, href, guess_type('t.xhtml')[0],
                    data=safe_xml_fromstring(tp))
        else:
            item = self.oeb.manifest.hrefs[
                    urldefrag(self.oeb.guide['titlepage'].href)[0]]
        if item is not None:
            self.oeb.spine.insert(0, item, True)
            if 'cover' not in self.oeb.guide.refs:
                self.oeb.guide.add('cover', 'Title Page', 'a')
            self.oeb.guide.refs['cover'].href = item.href
            if 'titlepage' in self.oeb.guide.refs:
                self.oeb.guide.refs['titlepage'].href = item.href
            titem = getattr(self.oeb.toc, 'item_that_refers_to_cover', None)
            if titem is not None:
                titem.href = item.href

View File

@@ -0,0 +1,187 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import posixpath
from lxml import etree
from calibre.ebooks.oeb.base import rewrite_links, urlnormalize
from polyglot.urllib import urldefrag, urlparse
class RenameFiles(object):  # {{{

    '''
    Rename files and adjust all links pointing to them. Note that the spine
    and manifest are not touched by this transform.

    ``rename_map`` maps old hrefs to new hrefs. ``renamed_items_map``, when
    given, maps the new href of an item to the item it replaced; it is used
    to resolve relative links against the old location of the item.
    '''

    def __init__(self, rename_map, renamed_items_map=None):
        self.rename_map = rename_map
        self.renamed_items_map = renamed_items_map

    def __call__(self, oeb, opts):
        import css_parser
        self.log = oeb.logger
        self.opts = opts
        self.oeb = oeb

        # Links inside markup and stylesheets
        for item in oeb.manifest.items:
            self.current_item = item
            if etree.iselement(item.data):
                rewrite_links(self.current_item.data, self.url_replacer)
            elif hasattr(item.data, 'cssText'):
                css_parser.replaceUrls(item.data, self.url_replacer)

        # Guide references
        if self.oeb.guide:
            for ref in self.oeb.guide.values():
                nhref = self._renamed(ref.href)
                if nhref is not None:
                    ref.href = nhref

        # Table of contents
        if self.oeb.toc:
            self.fix_toc_entry(self.oeb.toc)

    def _renamed(self, href):
        ' Return the new value for href (fragment preserved) or None if it was not renamed '
        base, frag = urldefrag(urlnormalize(href))
        replacement = self.rename_map.get(base, None)
        if replacement is None:
            return None
        if frag:
            return replacement + '#' + frag
        return replacement

    def fix_toc_entry(self, toc):
        ' Recursively update the href of toc and all of its children '
        if toc.href:
            nhref = self._renamed(toc.href)
            if nhref is not None:
                toc.href = nhref

        for child in toc:
            self.fix_toc_entry(child)

    def url_replacer(self, orig_url):
        ' Link rewriting callback, operates on links found in self.current_item '
        url = urlnormalize(orig_url)
        if urlparse(url).scheme:
            # Only rewrite local URLs
            return orig_url
        path, frag = urldefrag(url)
        orig_item = self.current_item
        if self.renamed_items_map:
            orig_item = self.renamed_items_map.get(self.current_item.href, self.current_item)

        href = orig_item.abshref(path)
        replacement = self.current_item.relhref(self.rename_map.get(href, href))
        if frag:
            replacement += '#' + frag
        return replacement

# }}}
class UniqueFilenames(object):  # {{{

    'Ensure that every item in the manifest has a unique filename'

    def __call__(self, oeb, opts):
        self.log = oeb.logger
        self.opts = opts
        self.oeb = oeb

        self.seen_filenames = set()
        self.rename_map = {}

        # Iterate over a copy, the manifest is modified inside the loop
        for item in list(oeb.manifest.items):
            fname = posixpath.basename(item.href)
            if fname not in self.seen_filenames:
                self.seen_filenames.add(fname)
                continue

            # Duplicate basename: re-add the item under a new href
            suffix = self.unique_suffix(fname)
            data = item.data
            base, ext = posixpath.splitext(item.href)
            nhref = oeb.manifest.generate(href=base + suffix + ext)[1]
            spine_pos = item.spine_position
            oeb.manifest.remove(item)
            nitem = oeb.manifest.add(item.id, nhref, item.media_type, data=data,
                    fallback=item.fallback)
            self.seen_filenames.add(posixpath.basename(nhref))
            self.rename_map[item.href] = nhref
            if spine_pos is not None:
                oeb.spine.insert(spine_pos, nitem, item.linear)

        if not self.rename_map:
            return

        self.log('Found non-unique filenames, renaming to support broken'
                ' EPUB readers like FBReader, Aldiko and Stanza...')
        from pprint import pformat
        self.log.debug(pformat(self.rename_map))

        renamer = RenameFiles(self.rename_map)
        renamer(oeb, opts)

    def unique_suffix(self, fname):
        ' Return the first suffix _u1, _u2, ... that makes fname unique among the seen filenames '
        base, ext = posixpath.splitext(fname)
        c = 1
        while base + '_u%d'%c + ext in self.seen_filenames:
            c += 1
        return '_u%d'%c
# }}}
class FlatFilenames(object):  # {{{

    'Ensure that every item in the manifest has a unique filename without subdirectories.'

    def __call__(self, oeb, opts):
        self.log = oeb.logger
        self.opts = opts
        self.oeb = oeb

        self.rename_map = {}
        self.renamed_items_map = {}

        # Iterate over a copy, the manifest is modified inside the loop
        for item in list(oeb.manifest.items):
            # Flatten URL by removing directories.
            # Example: a/b/c/index.html -> a_b_c_index.html
            flat = item.href.replace("/", "_")
            if flat == item.href:
                # URL hasn't changed, skip item.
                continue

            data = item.data
            isp = item.spine_position
            nhref = oeb.manifest.generate(href=flat)[1]
            if isp is not None:
                oeb.spine.remove(item)
            oeb.manifest.remove(item)

            nitem = oeb.manifest.add(item.id, nhref, item.media_type, data=data,
                    fallback=item.fallback)
            self.rename_map[item.href] = nhref
            # Remember the replaced item, keyed by the new href
            self.renamed_items_map[nhref] = item
            if isp is not None:
                oeb.spine.insert(isp, nitem, item.linear)

        if not self.rename_map:
            return

        self.log('Found non-flat filenames, renaming to support broken'
                ' EPUB readers like FBReader...')
        from pprint import pformat
        self.log.debug(pformat(self.rename_map))
        self.log.debug(pformat(self.renamed_items_map))

        renamer = RenameFiles(self.rename_map, self.renamed_items_map)
        renamer(oeb, opts)
# }}}

View File

@@ -0,0 +1,81 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from calibre import fit_image
class RescaleImages(object):

    'Rescale all images to fit inside given screen size'

    def __init__(self, check_colorspaces=False):
        # If True, images in the CMYK colorspace are converted to RGB
        self.check_colorspaces = check_colorspaces

    def __call__(self, oeb, opts):
        self.oeb, self.opts, self.log = oeb, opts, oeb.log
        self.rescale()

    def rescale(self):
        '''
        Shrink every raster image in the manifest that does not fit on the
        page of the output profile and, if ``check_colorspaces`` is set,
        convert CMYK images to RGB. Images that cannot be opened or processed
        are left untouched.
        '''
        from PIL import Image
        from io import BytesIO

        is_image_collection = getattr(self.opts, 'is_image_collection', False)

        if is_image_collection:
            page_width, page_height = self.opts.dest.comic_screen_size
        else:
            page_width, page_height = self.opts.dest.width, self.opts.dest.height
            # NOTE(review): margins are presumably in points (1/72 inch),
            # hence the dpi/72 scaling -- confirm against the option docs
            page_width -= (self.opts.margin_left + self.opts.margin_right) * self.opts.dest.dpi/72
            page_height -= (self.opts.margin_top + self.opts.margin_bottom) * self.opts.dest.dpi/72

        for item in self.oeb.manifest:
            if not item.media_type.startswith('image'):
                continue

            # Output format: derived from the media type, anything that is
            # not PNG/JPEG/GIF is written as JPEG
            ext = item.media_type.split('/')[-1].upper()
            if ext == 'JPG':
                ext = 'JPEG'
            if ext not in ('PNG', 'JPEG', 'GIF'):
                ext = 'JPEG'

            raw = item.data
            if hasattr(raw, 'xpath') or not raw:
                # Probably an svg image
                continue
            try:
                img = Image.open(BytesIO(raw))
            except Exception:
                continue
            width, height = img.size

            converted = False
            try:
                if self.check_colorspaces and img.mode == 'CMYK':
                    self.log.warn(
                        'The image %s is in the CMYK colorspace, converting it '
                        'to RGB as Adobe Digital Editions cannot display CMYK' % item.href)
                    img = img.convert('RGB')
                    converted = True
            except Exception:
                self.log.exception('Failed to convert image %s from CMYK to RGB' % item.href)

            scaled, new_width, new_height = fit_image(width, height, page_width, page_height)
            if scaled:
                new_width = max(1, new_width)
                new_height = max(1, new_height)
                self.log('Rescaling image from %dx%d to %dx%d'%(
                    width, height, new_width, new_height), item.href)
                try:
                    img = img.resize((new_width, new_height))
                except Exception:
                    self.log.exception('Failed to rescale image: %s' % item.href)
                    continue

            if not (scaled or converted):
                # Nothing changed, keep the original data
                continue

            # Write the image back. This must also happen for images that
            # were only converted, otherwise the conversion is lost.
            buf = BytesIO()
            try:
                img.save(buf, ext)
            except Exception:
                if scaled:
                    self.log.exception('Failed to rescale image: %s' % item.href)
                else:
                    self.log.exception('Failed to save converted image: %s' % item.href)
            else:
                item.data = buf.getvalue()
                item.unload_data_from_memory()

View File

@@ -0,0 +1,488 @@
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
'''
Splitting of the XHTML flows. Splitting can happen on page boundaries or can be
forced at "likely" locations to conform to size limitations. This transform
assumes a prior call to the flatcss transform.
'''
import os, functools, collections, re, copy
from collections import OrderedDict
from lxml.etree import XPath as _XPath
from lxml import etree
from calibre import as_unicode, force_unicode
from calibre.ebooks.epub import rules
from calibre.ebooks.oeb.base import (OEB_STYLES, XPNSMAP as NAMESPACES,
urldefrag, rewrite_links, XHTML, urlnormalize)
from calibre.ebooks.oeb.polish.split import do_split
from polyglot.builtins import iteritems, range, map, unicode_type
from polyglot.urllib import unquote
from css_selectors import Select, SelectorError
# XPath compiler with the namespace prefix map (NAMESPACES) pre-bound
XPath = functools.partial(_XPath, namespaces=NAMESPACES)

# Attribute set on an element once it has been chosen as a split point, so
# that it is not chosen again; removed when the split files are committed
SPLIT_POINT_ATTR = 'csp'
def tostring(root):
    ' Serialize root as UTF-8 encoded bytes '
    serialized = etree.tostring(root, encoding='utf-8')
    return serialized
class SplitError(ValueError):

    ' Raised when no reasonable point at which to split a tree could be found '

    def __init__(self, path, root):
        # Size of the serialized sub-tree in KB, reported in the message
        size = len(tostring(root))/1024.
        template = _('Could not find reasonable point at which to split: '
                     '%(path)s Sub-tree size: %(size)d KB')
        ValueError.__init__(self, template % {'path': path, 'size': size})
class Split(object):

    '''
    Transform that splits the XHTML files in the spine at page breaks and/or
    so that no file is larger than a size limit, and then fixes all links
    that pointed into the split files.
    '''

    def __init__(self, split_on_page_breaks=True, page_breaks_xpath=None,
            max_flow_size=0, remove_css_pagebreaks=True):
        '''
        :param split_on_page_breaks: If True files are split at page breaks
        :param page_breaks_xpath: Optional XPath expression identifying page
            breaks; when None page breaks are detected from the CSS
        :param max_flow_size: Maximum size of a file in bytes, 0 for no limit
        :param remove_css_pagebreaks: If True the page-break-before/after
            properties that caused a split are removed from the stylesheets
        '''
        self.split_on_page_breaks = split_on_page_breaks
        self.page_breaks_xpath = page_breaks_xpath
        self.max_flow_size = max_flow_size
        self.page_break_selectors = None
        self.remove_css_pagebreaks = remove_css_pagebreaks
        if self.page_breaks_xpath is not None:
            # NOTE(review): find_page_breaks() passes every selector to a CSS
            # Select object; confirm that a compiled XPath is accepted there
            self.page_break_selectors = [(XPath(self.page_breaks_xpath), False)]

    def __call__(self, oeb, opts):
        self.oeb = oeb
        self.log = oeb.log
        self.log('Splitting markup on page breaks and flow limits, if any...')
        self.opts = opts
        # href of a split file -> anchor map of its FlowSplitter
        self.map = {}
        for item in list(self.oeb.manifest.items):
            if item.spine_position is not None and etree.iselement(item.data):
                self.split_item(item)

        self.fix_links()

    def split_item(self, item):
        ' Split a single manifest item, recording its anchor map if it was split '
        page_breaks, page_break_ids = [], []
        if self.split_on_page_breaks:
            page_breaks, page_break_ids = self.find_page_breaks(item)

        splitter = FlowSplitter(item, page_breaks, page_break_ids,
                self.max_flow_size, self.oeb, self.opts)
        if splitter.was_split:
            am = splitter.anchor_map
            self.map[item.href] = collections.defaultdict(
                    am.default_factory, am)

    def find_page_breaks(self, item):
        '''
        Find the elements in ``item`` at which a page break occurs.

        Every such element is given an id if it has none.

        :return: ``(page_breaks, page_break_ids)``: a list of
            ``(compiled XPath matching the element, split before?)`` tuples
            in document order and the list of the corresponding element ids
        '''
        if self.page_break_selectors is None:
            # Collect the selectors of all CSS rules that force a page break.
            # This is done only once, for the first item processed.
            self.page_break_selectors = set()
            stylesheets = [x.data for x in self.oeb.manifest if x.media_type in
                    OEB_STYLES]
            for rule in rules(stylesheets):
                before = force_unicode(getattr(rule.style.getPropertyCSSValue(
                    'page-break-before'), 'cssText', '').strip().lower())
                after = force_unicode(getattr(rule.style.getPropertyCSSValue(
                    'page-break-after'), 'cssText', '').strip().lower())
                try:
                    if before and before not in {'avoid', 'auto', 'inherit'}:
                        self.page_break_selectors.add((rule.selectorText, True))
                        if self.remove_css_pagebreaks:
                            rule.style.removeProperty('page-break-before')
                except Exception:
                    # Best effort, a rule that cannot be handled is ignored
                    pass
                try:
                    if after and after not in {'avoid', 'auto', 'inherit'}:
                        self.page_break_selectors.add((rule.selectorText, False))
                        if self.remove_css_pagebreaks:
                            rule.style.removeProperty('page-break-after')
                except Exception:
                    pass
        page_breaks = set()
        select = Select(item.data)
        if not self.page_break_selectors:
            return [], []
        body = item.data.xpath('//h:body', namespaces=NAMESPACES)
        if not body:
            return [], []
        descendants = frozenset(body[0].iterdescendants('*'))

        for selector, before in self.page_break_selectors:
            try:
                for elem in select(selector):
                    # Only elements inside <body> can be split points
                    if elem in descendants and elem.tag.rpartition('}')[2].lower() not in {'html', 'body', 'head', 'style', 'script', 'meta', 'link'}:
                        elem.set('pb_before', '1' if before else '0')
                        page_breaks.add(elem)
            except SelectorError as err:
                self.log.warn('Ignoring page breaks specified with invalid CSS selector: %r (%s)' % (selector, as_unicode(err)))

        # Number all elements so that the page breaks (collected in a set)
        # can be sorted into document order
        for i, elem in enumerate(item.data.iter('*')):
            try:
                elem.set('pb_order', unicode_type(i))
            except TypeError:  # Can't set attributes on comment nodes etc.
                continue

        page_breaks = list(page_breaks)
        page_breaks.sort(key=lambda x:int(x.get('pb_order')))
        page_break_ids, page_breaks_ = [], []
        for i, x in enumerate(page_breaks):
            x.set('id', x.get('id', 'calibre_pb_%d'%i))
            id = x.get('id')
            try:
                xp = XPath('//*[@id="%s"]'%id)
            except Exception:
                try:
                    xp = XPath("//*[@id='%s']"%id)
                except Exception:
                    # The id contains both a quote and an apostrophe or is
                    # otherwise unusable in an XPath expression. Just replace
                    # it, since it is unlikely to work anywhere else either
                    id = 'calibre_pb_%d'%i
                    x.set('id', id)
                    xp = XPath('//*[@id=%r]'%id)
            page_breaks_.append((xp, x.get('pb_before', '0') == '1'))
            page_break_ids.append(id)

        # Remove the temporary marker attributes again
        for elem in item.data.iter(etree.Element):
            elem.attrib.pop('pb_order', False)
            elem.attrib.pop('pb_before', False)

        return page_breaks_, page_break_ids

    def fix_links(self):
        '''
        Fix references to the split files in other content files.
        '''
        for item in self.oeb.manifest:
            if etree.iselement(item.data):
                self.current_item = item
                rewrite_links(item.data, self.rewrite_links)

    def rewrite_links(self, url):
        ' Link rewriting callback, operates on links found in self.current_item '
        href, frag = urldefrag(url)
        try:
            href = self.current_item.abshref(href)
        except ValueError:
            # Unparseable URL
            return url
        try:
            href = urlnormalize(href)
        except ValueError:
            # href has non utf-8 quoting
            return url
        if href in self.map:
            anchor_map = self.map[href]
            # The None key stands for a link without a fragment
            nhref = anchor_map[frag if frag else None]
            nhref = self.current_item.relhref(nhref)
            if frag:
                nhref = '#'.join((unquote(nhref), frag))

            return nhref
        return url
class FlowSplitter(object):
'The actual splitting logic'
def __init__(self, item, page_breaks, page_break_ids, max_flow_size, oeb,
        opts):
    # Split ``item`` first at its page breaks and then, if max_flow_size is
    # positive, split every resulting tree that is still larger than
    # max_flow_size. The result is committed to ``oeb`` before returning.
    #
    # page_breaks is a list of (compiled XPath, before) tuples and
    # page_break_ids the list of the corresponding element ids, see
    # split_on_page_breaks() for how they are used.
    self.item = item
    self.oeb = oeb
    self.opts = opts
    self.log = oeb.log
    self.page_breaks = page_breaks
    self.page_break_ids = page_break_ids
    self.max_flow_size = max_flow_size
    self.base = item.href
    self.csp_counter = 0
    base, ext = os.path.splitext(self.base)
    # Template for the names of the split files; it is filled in with
    # self.base % index, so literal percent signs in the href are escaped
    self.base = base.replace('%', '%%')+'_split_%.3d'+ext
    self.trees = [self.item.data.getroottree()]
    self.splitting_on_page_breaks = True
    if self.page_breaks:
        self.split_on_page_breaks(self.trees[0])
    self.splitting_on_page_breaks = False
    if self.max_flow_size > 0:
        lt_found = False
        self.log('\tLooking for large trees in %s...'%item.href)
        trees = list(self.trees)
        # large tree -> list of the smaller trees it was split into
        self.tree_map = {}
        for i, tree in enumerate(trees):
            size = len(tostring(tree.getroot()))
            if size > self.max_flow_size:
                self.log('\tFound large tree #%d'%i)
                lt_found = True
                # split_to_size() appends its results to self.split_trees
                self.split_trees = []
                self.split_to_size(tree)
                self.tree_map[tree] = self.split_trees
        if not lt_found:
            self.log('\tNo large trees found')
        # Rebuild the list of trees, replacing each large tree by its parts
        self.trees = []
        for x in trees:
            self.trees.extend(self.tree_map.get(x, [x]))
    self.was_split = len(self.trees) > 1
    if self.was_split:
        self.log('\tSplit into %d parts'%len(self.trees))
    # commit() returns immediately when nothing was split
    self.commit()
def split_on_page_breaks(self, orig_tree):
    # Split orig_tree at every page break and store the resulting trees,
    # minus the ones that turn out to be empty pages, in self.trees.

    # Page breaks keyed by element id, in the order the ids occur in the tree
    ordered_ids = OrderedDict()
    all_page_break_ids = frozenset(self.page_break_ids)
    for elem_id in orig_tree.xpath('//*/@id'):
        if elem_id in all_page_break_ids:
            ordered_ids[elem_id] = self.page_breaks[
                self.page_break_ids.index(elem_id)]
    self.trees = [orig_tree]
    while ordered_ids:
        # Take (and remove) the first remaining page break
        pb_id, (pattern, before) = next(iteritems(ordered_ids))
        del ordered_ids[pb_id]
        # Search the trees from last to first for the one that contains the
        # page break element and replace it by its two halves
        for i in range(len(self.trees)-1, -1, -1):
            tree = self.trees[i]
            elem = pattern(tree)
            if elem:
                self.log.debug('\t\tSplitting on page-break at id=%s'%
                            elem[0].get('id'))
                before_tree, after_tree = self.do_split(tree, elem[0], before)
                self.trees[i:i+1] = [before_tree, after_tree]
                break
    # Drop empty pages. The ids found in a dropped page are re-created as
    # empty <div> elements at the start of the next page that is kept, unless
    # that page already has an element with the same id.
    trees, ids = [], set()
    for tree in self.trees:
        root = tree.getroot()
        if self.is_page_empty(root):
            discarded_ids = root.xpath('//*[@id]')
            for x in discarded_ids:
                x = x.get('id')
                # ids starting with calibre_ are not carried over
                if not x.startswith('calibre_'):
                    ids.add(x)
        else:
            if ids:
                body = self.get_body(root)
                if body is not None:
                    existing_ids = frozenset(body.xpath('//*/@id'))
                    for x in ids - existing_ids:
                        body.insert(0, body.makeelement(XHTML('div'), id=x, style='height:0pt'))
                ids = set()
            trees.append(tree)
    self.trees = trees
def get_body(self, root):
    ' Return the first <body> element found in root, or None if there is none '
    matches = root.xpath('//h:body', namespaces=NAMESPACES)
    return matches[0] if matches else None
def do_split(self, tree, split_point, before):
    '''
    Split ``tree`` into a *before* and *after* tree at ``split_point``.

    :param tree: Not used here, only ``split_point`` is passed on
    :param before: If True tree is split before split_point, otherwise after split_point
    :return: before_tree, after_tree
    '''
    # Delegates to the module level do_split() imported from the polish code
    return do_split(split_point, self.log, before=before)
def is_page_empty(self, root):
    '''
    Return True if the page has a <body> that contains at most one
    character of non-whitespace text, no <img> other than ones styled with
    exactly ``display:none`` and no <svg>. A page without a <body> is not
    considered empty.
    '''
    body = self.get_body(root)
    if body is None:
        return False
    text = etree.tostring(body, method='text', encoding='unicode')
    if len(re.sub(r'\s+|\xa0', '', text)) > 1:
        return False
    has_visible_img = any(
        img.get('style', '') != 'display:none'
        for img in root.xpath('//h:img', namespaces=NAMESPACES))
    if has_visible_img:
        return False
    return not root.xpath('//*[local-name() = "svg"]')
def split_text(self, text, root, size):
self.log.debug('\t\t\tSplitting text of length: %d'%len(text))
rest = text.replace('\r', '')
parts = re.split('\n\n', rest)
self.log.debug('\t\t\t\tFound %d parts'%len(parts))
if max(map(len, parts)) > size:
raise SplitError('Cannot split as file contains a <pre> tag '
'with a very large paragraph', root)
ans = []
buf = ''
for part in parts:
if len(buf) + len(part) < size:
buf += '\n\n'+part
else:
ans.append(buf)
buf = part
return ans
def split_to_size(self, tree):
    # Recursively split tree until every part is no larger than
    # self.max_flow_size. The parts are appended to self.split_trees, which
    # must have been initialized by the caller.
    self.log.debug('\t\tSplitting...')
    root = tree.getroot()
    # Split large <pre> tags if they contain only text
    for pre in XPath('//h:pre')(root):
        if len(tuple(pre.iterchildren(etree.Element))) > 0:
            continue
        if pre.text and len(pre.text) > self.max_flow_size*0.5:
            self.log.debug('\t\tSplitting large <pre> tag')
            frags = self.split_text(pre.text, root, int(0.2*self.max_flow_size))
            # Replace the <pre> by one copy of itself per text fragment; only
            # the last copy keeps the original tail text
            new_pres = []
            for frag in frags:
                pre2 = copy.copy(pre)
                pre2.text = frag
                pre2.tail = ''
                new_pres.append(pre2)
            new_pres[-1].tail = pre.tail
            p = pre.getparent()
            i = p.index(pre)
            p[i:i+1] = new_pres
    split_point, before = self.find_split_point(root)
    if split_point is None:
        raise SplitError(self.item.href, root)
    self.log.debug('\t\t\tSplit point:', split_point.tag, tree.getpath(split_point))
    trees = self.do_split(tree, split_point, before)
    sizes = [len(tostring(t.getroot())) for t in trees]
    if min(sizes) < 5*1024:
        # One of the halves is smaller than 5KB: retry on the same tree.
        # find_split_point() has marked the rejected split point, so a
        # different one is chosen this time.
        self.log.debug('\t\t\tSplit tree too small')
        self.split_to_size(tree)
        return
    for t, size in zip(trees, sizes):
        r = t.getroot()
        if self.is_page_empty(r):
            # Empty parts are discarded
            continue
        elif size <= self.max_flow_size:
            self.split_trees.append(t)
            self.log.debug(
                '\t\t\tCommitted sub-tree #%d (%d KB)'%(
                            len(self.split_trees), size/1024.))
        else:
            self.log.debug(
                    '\t\t\tSplit tree still too large: %d KB' % (size/1024.))
            self.split_to_size(t)
def find_split_point(self, root):
    '''
    Find the tag at which to split the tree rooted at `root`.
    Search order is:
        * Heading tags
        * <div> tags that are direct children of <body>
        * <pre> tags
        * <hr> tags
        * <p> tags
        * <div> tags anywhere
        * <br> tags
        * <li> tags

    We try to split in the "middle" of the file (as defined by tag counts).
    Tags that were already chosen by a previous call are skipped.

    :return: ``(element, True)``, or ``(None, True)`` if no split point was
        found. The second value means that the split happens before the
        element.
    '''
    def pick_elem(elems):
        # Choose the middle one of the not yet used elements and mark it as
        # used. Returns None if there is no such element.
        if elems:
            elems = [i for i in elems if i.get(SPLIT_POINT_ATTR, '0') !=
                    '1']
            if elems:
                i = len(elems) // 2
                elems[i].set(SPLIT_POINT_ATTR, '1')
                return elems[i]

    for path in (
                    '//*[re:match(name(), "h[1-6]", "i")]',
                    '/h:html/h:body/h:div',
                    '//h:pre',
                    '//h:hr',
                    '//h:p',
                    '//h:div',
                    '//h:br',
                    '//h:li',
                    ):
        elems = root.xpath(path, namespaces=NAMESPACES)
        elem = pick_elem(elems)
        if elem is not None:
            try:
                # Only accept elements whose path is a valid XPath expression
                XPath(elem.getroottree().getpath(elem))
            except Exception:
                continue
            return elem, True

    return None, True
def commit(self):
    '''
    Commit all changes caused by the split. Calculates an *anchor_map* for
    all anchors in the original tree. Internal links are re-directed. The
    original file is deleted and the split files are saved.
    '''
    if not self.was_split:
        return
    # anchor -> name of the split file that contains it. Unknown anchors, and
    # the None key used for links without a fragment, map to the first file.
    self.anchor_map = collections.defaultdict(lambda :self.base%0)
    self.files = []
    for i, tree in enumerate(self.trees):
        root = tree.getroot()
        self.files.append(self.base%i)
        for elem in root.xpath('//*[@id or @name]'):
            for anchor in elem.get('id', ''), elem.get('name', ''):
                # The first file that defines an anchor wins
                if anchor != '' and anchor not in self.anchor_map:
                    self.anchor_map[anchor] = self.files[-1]
        # Remove the markers set by find_split_point()
        for elem in root.xpath('//*[@%s]'%SPLIT_POINT_ATTR):
            elem.attrib.pop(SPLIT_POINT_ATTR, '0')
    spine_pos = self.item.spine_position
    # Files are processed last to first and each one is inserted at the same
    # spine position (presumably so that they end up in their original order)
    for current, tree in zip(*map(reversed, (self.files, self.trees))):
        # Redirect internal links whose target is now in another file
        for a in tree.getroot().xpath('//h:a[@href]', namespaces=NAMESPACES):
            href = a.get('href').strip()
            if href.startswith('#'):
                anchor = href[1:]
                file = self.anchor_map[anchor]
                file = self.item.relhref(file)
                if file != current:
                    a.set('href', file+href)
        new_id = self.oeb.manifest.generate(id=self.item.id)[0]
        new_item = self.oeb.manifest.add(new_id, current,
                self.item.media_type, data=tree.getroot())
        self.oeb.spine.insert(spine_pos, new_item, self.item.linear)
    # Point guide references to the original file at the right split file
    if self.oeb.guide:
        for ref in self.oeb.guide.values():
            href, frag = urldefrag(ref.href)
            if href == self.item.href:
                nhref = self.anchor_map[frag if frag else None]
                if frag:
                    nhref = '#'.join((nhref, frag))
                ref.href = nhref

    def fix_toc_entry(toc):
        # Same as above for a TOC node and, recursively, its children
        if toc.href:
            href, frag = urldefrag(toc.href)
            if href == self.item.href:
                nhref = self.anchor_map[frag if frag else None]
                if frag:
                    nhref = '#'.join((nhref, frag))
                toc.href = nhref
        for x in toc:
            fix_toc_entry(x)

    if self.oeb.toc:
        fix_toc_entry(self.oeb.toc)
    # Same as above for the page list
    if self.oeb.pages:
        for page in self.oeb.pages:
            href, frag = urldefrag(page.href)
            if href == self.item.href:
                nhref = self.anchor_map[frag if frag else None]
                if frag:
                    nhref = '#'.join((nhref, frag))
                page.href = nhref
    # Finally remove the original, unsplit file
    self.oeb.manifest.remove(self.item)