diff --git a/ebook_converter/ebooks/covers.py b/ebook_converter/ebooks/covers.py new file mode 100644 index 0000000..62ccad0 --- /dev/null +++ b/ebook_converter/ebooks/covers.py @@ -0,0 +1,762 @@ +#!/usr/bin/env python2 +# vim:fileencoding=utf-8 +from __future__ import absolute_import, division, print_function, unicode_literals + +__license__ = 'GPL v3' +__copyright__ = '2014, Kovid Goyal ' + +import re, random, unicodedata, numbers +from collections import namedtuple +from contextlib import contextmanager +from math import ceil, sqrt, cos, sin, atan2 +from polyglot.builtins import iteritems, itervalues, map, zip, string_or_bytes +from itertools import chain + +from PyQt5.Qt import ( + QImage, Qt, QFont, QPainter, QPointF, QTextLayout, QTextOption, + QFontMetrics, QTextCharFormat, QColor, QRect, QBrush, QLinearGradient, + QPainterPath, QPen, QRectF, QTransform, QRadialGradient +) + +from calibre import force_unicode, fit_image +from calibre.constants import __appname__, __version__ +from calibre.ebooks.metadata import fmt_sidx +from calibre.ebooks.metadata.book.base import Metadata +from calibre.ebooks.metadata.book.formatter import SafeFormat +from calibre.gui2 import ensure_app, config, load_builtin_fonts, pixmap_to_data +from calibre.utils.cleantext import clean_ascii_chars, clean_xml_chars +from calibre.utils.config import JSONConfig + +# Default settings {{{ +cprefs = JSONConfig('cover_generation') +cprefs.defaults['title_font_size'] = 120 # px +cprefs.defaults['subtitle_font_size'] = 80 # px +cprefs.defaults['footer_font_size'] = 80 # px +cprefs.defaults['cover_width'] = 1200 # px +cprefs.defaults['cover_height'] = 1600 # px +cprefs.defaults['title_font_family'] = None +cprefs.defaults['subtitle_font_family'] = None +cprefs.defaults['footer_font_family'] = None +cprefs.defaults['color_themes'] = {} +cprefs.defaults['disabled_color_themes'] = [] +cprefs.defaults['disabled_styles'] = [] +cprefs.defaults['title_template'] = '{title}' 
+cprefs.defaults['subtitle_template'] = '''{series:'test($, strcat("", $, " - ", raw_field("formatted_series_index")), "")'}''' +cprefs.defaults['footer_template'] = r'''program: +# Show at most two authors, on separate lines. +authors = field('authors'); +num = count(authors, ' & '); +authors = sublist(authors, 0, 2, ' & '); +authors = list_re(authors, ' & ', '(.+)', '\1'); +authors = re(authors, ' & ', '
'); +re(authors, '&&', '&') +''' +Prefs = namedtuple('Prefs', ' '.join(sorted(cprefs.defaults))) + +_use_roman = None + + +def get_use_roman(): + global _use_roman + if _use_roman is None: + return config['use_roman_numerals_for_series_number'] + return _use_roman + + +def set_use_roman(val): + global _use_roman + _use_roman = bool(val) + +# }}} + + +# Draw text {{{ +Point = namedtuple('Point', 'x y') + + +def parse_text_formatting(text): + pos = 0 + tokens = [] + for m in re.finditer(r'', text): + q = text[pos:m.start()] + if q: + tokens.append((False, q)) + tokens.append((True, (m.group(1).lower(), '/' in m.group()[:2]))) + pos = m.end() + if tokens: + if text[pos:]: + tokens.append((False, text[pos:])) + else: + tokens = [(False, text)] + + ranges, open_ranges, text = [], [], [] + offset = 0 + for is_tag, tok in tokens: + if is_tag: + tag, closing = tok + if closing: + if open_ranges: + r = open_ranges.pop() + r[-1] = offset - r[-2] + if r[-1] > 0: + ranges.append(r) + else: + if tag in {'b', 'strong', 'i', 'em'}: + open_ranges.append([tag, offset, -1]) + else: + offset += len(tok.replace('&', '&')) + text.append(tok) + text = ''.join(text) + formats = [] + for tag, start, length in chain(ranges, open_ranges): + fmt = QTextCharFormat() + if tag in {'b', 'strong'}: + fmt.setFontWeight(QFont.Bold) + elif tag in {'i', 'em'}: + fmt.setFontItalic(True) + else: + continue + if length == -1: + length = len(text) - start + if length > 0: + r = QTextLayout.FormatRange() + r.format = fmt + r.start, r.length = start, length + formats.append(r) + return text, formats + + +class Block(object): + + def __init__(self, text='', width=0, font=None, img=None, max_height=100, align=Qt.AlignCenter): + self.layouts = [] + self._position = Point(0, 0) + self.leading = self.line_spacing = 0 + if font is not None: + fm = QFontMetrics(font, img) + self.leading = fm.leading() + self.line_spacing = fm.lineSpacing() + for text in text.split('
') if text else (): + text, formats = parse_text_formatting(sanitize(text)) + l = QTextLayout(unescape_formatting(text), font, img) + l.setAdditionalFormats(formats) + to = QTextOption(align) + to.setWrapMode(QTextOption.WrapAtWordBoundaryOrAnywhere) + l.setTextOption(to) + + l.beginLayout() + height = 0 + while height + 3*self.leading < max_height: + line = l.createLine() + if not line.isValid(): + break + line.setLineWidth(width) + height += self.leading + line.setPosition(QPointF(0, height)) + height += line.height() + max_height -= height + l.endLayout() + if self.layouts: + self.layouts.append(self.leading) + else: + self._position = Point(l.position().x(), l.position().y()) + self.layouts.append(l) + if self.layouts: + self.layouts.append(self.leading) + + @property + def height(self): + return int(ceil(sum(l if isinstance(l, numbers.Number) else l.boundingRect().height() for l in self.layouts))) + + @property + def position(self): + return self._position + + @position.setter + def position(self, new_pos): + (x, y) = new_pos + self._position = Point(x, y) + if self.layouts: + self.layouts[0].setPosition(QPointF(x, y)) + y += self.layouts[0].boundingRect().height() + for l in self.layouts[1:]: + if isinstance(l, numbers.Number): + y += l + else: + l.setPosition(QPointF(x, y)) + y += l.boundingRect().height() + + def draw(self, painter): + for l in self.layouts: + if hasattr(l, 'draw'): + # Etch effect for the text + painter.save() + painter.setRenderHints(QPainter.TextAntialiasing | QPainter.Antialiasing) + painter.save() + painter.setPen(QColor(255, 255, 255, 125)) + l.draw(painter, QPointF(1, 1)) + painter.restore() + l.draw(painter, QPointF()) + painter.restore() + + +def layout_text(prefs, img, title, subtitle, footer, max_height, style): + width = img.width() - 2 * style.hmargin + title, subtitle, footer = title, subtitle, footer + title_font = QFont(prefs.title_font_family or 'Liberation Serif') + title_font.setPixelSize(prefs.title_font_size) + 
title_font.setStyleStrategy(QFont.PreferAntialias) + title_block = Block(title, width, title_font, img, max_height, style.TITLE_ALIGN) + title_block.position = style.hmargin, style.vmargin + subtitle_block = Block() + if subtitle: + subtitle_font = QFont(prefs.subtitle_font_family or 'Liberation Sans') + subtitle_font.setPixelSize(prefs.subtitle_font_size) + subtitle_font.setStyleStrategy(QFont.PreferAntialias) + gap = 2 * title_block.leading + mh = max_height - title_block.height - gap + subtitle_block = Block(subtitle, width, subtitle_font, img, mh, style.SUBTITLE_ALIGN) + subtitle_block.position = style.hmargin, title_block.position.y + title_block.height + gap + + footer_font = QFont(prefs.footer_font_family or 'Liberation Serif') + footer_font.setStyleStrategy(QFont.PreferAntialias) + footer_font.setPixelSize(prefs.footer_font_size) + footer_block = Block(footer, width, footer_font, img, max_height, style.FOOTER_ALIGN) + footer_block.position = style.hmargin, img.height() - style.vmargin - footer_block.height + + return title_block, subtitle_block, footer_block + +# }}} + +# Format text using templates {{{ + + +def sanitize(s): + return unicodedata.normalize('NFC', clean_xml_chars(clean_ascii_chars(force_unicode(s or '')))) + + +_formatter = None +_template_cache = {} + + +def escape_formatting(val): + return val.replace('&', '&').replace('<', '<').replace('>', '>') + + +def unescape_formatting(val): + return val.replace('<', '<').replace('>', '>').replace('&', '&') + + +class Formatter(SafeFormat): + + def get_value(self, orig_key, args, kwargs): + ans = SafeFormat.get_value(self, orig_key, args, kwargs) + return escape_formatting(ans) + + +def formatter(): + global _formatter + if _formatter is None: + _formatter = Formatter() + return _formatter + + +def format_fields(mi, prefs): + f = formatter() + + def safe_format(field): + return f.safe_format( + getattr(prefs, field), mi, _('Template error'), mi, template_cache=_template_cache + ) + return 
map(safe_format, ('title_template', 'subtitle_template', 'footer_template')) + + +@contextmanager +def preserve_fields(obj, fields): + if isinstance(fields, string_or_bytes): + fields = fields.split() + null = object() + mem = {f:getattr(obj, f, null) for f in fields} + try: + yield + finally: + for f, val in iteritems(mem): + if val is null: + delattr(obj, f) + else: + setattr(obj, f, val) + + +def format_text(mi, prefs): + with preserve_fields(mi, 'authors formatted_series_index'): + mi.authors = [a for a in mi.authors if a != _('Unknown')] + mi.formatted_series_index = fmt_sidx(mi.series_index or 0, use_roman=get_use_roman()) + return tuple(format_fields(mi, prefs)) +# }}} + + +# Colors {{{ +ColorTheme = namedtuple('ColorTheme', 'color1 color2 contrast_color1 contrast_color2') + + +def to_theme(x): + return {k:v for k, v in zip(ColorTheme._fields[:4], x.split())} + + +fallback_colors = to_theme('ffffff 000000 000000 ffffff') + +default_color_themes = { + 'Earth' : to_theme('e8d9ac c7b07b 564628 382d1a'), + 'Grass' : to_theme('d8edb5 abc8a4 375d3b 183128'), + 'Water' : to_theme('d3dcf2 829fe4 00448d 00305a'), + 'Silver': to_theme('e6f1f5 aab3b6 6e7476 3b3e40'), +} + + +def theme_to_colors(theme): + colors = {k:QColor('#' + theme[k]) for k in ColorTheme._fields} + return ColorTheme(**colors) + + +def load_color_themes(prefs): + t = default_color_themes.copy() + t.update(prefs.color_themes) + disabled = frozenset(prefs.disabled_color_themes) + ans = [theme_to_colors(v) for k, v in iteritems(t) if k not in disabled] + if not ans: + # Ignore disabled and return only the builtin color themes + ans = [theme_to_colors(v) for k, v in iteritems(default_color_themes)] + return ans + + +def color(color_theme, name): + ans = getattr(color_theme, name) + if not ans.isValid(): + ans = QColor('#' + fallback_colors[name]) + return ans + +# }}} + +# Styles {{{ + + +class Style(object): + + TITLE_ALIGN = SUBTITLE_ALIGN = FOOTER_ALIGN = Qt.AlignHCenter | Qt.AlignTop + + def 
__init__(self, color_theme, prefs): + self.load_colors(color_theme) + self.calculate_margins(prefs) + + def calculate_margins(self, prefs): + self.hmargin = int((50 / 600) * prefs.cover_width) + self.vmargin = int((50 / 800) * prefs.cover_height) + + def load_colors(self, color_theme): + self.color1 = color(color_theme, 'color1') + self.color2 = color(color_theme, 'color2') + self.ccolor1 = color(color_theme, 'contrast_color1') + self.ccolor2 = color(color_theme, 'contrast_color2') + + +class Cross(Style): + + NAME = 'The Cross' + GUI_NAME = _('The Cross') + + def __call__(self, painter, rect, color_theme, title_block, subtitle_block, footer_block): + painter.fillRect(rect, self.color1) + r = QRect(0, int(title_block.position.y), rect.width(), + title_block.height + subtitle_block.height + subtitle_block.line_spacing // 2 + title_block.leading) + painter.save() + p = QPainterPath() + p.addRoundedRect(QRectF(r), 10, 10 * r.width()/r.height(), Qt.RelativeSize) + painter.setClipPath(p) + painter.setRenderHint(QPainter.Antialiasing) + painter.fillRect(r, self.color2) + painter.restore() + r = QRect(0, 0, int(title_block.position.x), rect.height()) + painter.fillRect(r, self.color2) + return self.ccolor2, self.ccolor2, self.ccolor1 + + +class Half(Style): + + NAME = 'Half and Half' + GUI_NAME = _('Half and half') + + def __call__(self, painter, rect, color_theme, title_block, subtitle_block, footer_block): + g = QLinearGradient(QPointF(0, 0), QPointF(0, rect.height())) + g.setStops([(0, self.color1), (0.7, self.color2), (1, self.color1)]) + painter.fillRect(rect, QBrush(g)) + return self.ccolor1, self.ccolor1, self.ccolor1 + + +def rotate_vector(angle, x, y): + return x * cos(angle) - y * sin(angle), x * sin(angle) + y * cos(angle) + + +def draw_curved_line(painter_path, dx, dy, c1_frac, c1_amp, c2_frac, c2_amp): + length = sqrt(dx * dx + dy * dy) + angle = atan2(dy, dx) + c1 = QPointF(*rotate_vector(angle, c1_frac * length, c1_amp * length)) + c2 = 
QPointF(*rotate_vector(angle, c2_frac * length, c2_amp * length)) + pos = painter_path.currentPosition() + painter_path.cubicTo(pos + c1, pos + c2, pos + QPointF(dx, dy)) + + +class Banner(Style): + + NAME = 'Banner' + GUI_NAME = _('Banner') + GRADE = 0.07 + + def calculate_margins(self, prefs): + Style.calculate_margins(self, prefs) + self.hmargin = int(0.15 * prefs.cover_width) + self.fold_width = int(0.1 * prefs.cover_width) + + def __call__(self, painter, rect, color_theme, title_block, subtitle_block, footer_block): + painter.fillRect(rect, self.color1) + top = title_block.position.y + 2 + extra_spacing = subtitle_block.line_spacing // 2 if subtitle_block.line_spacing else title_block.line_spacing // 3 + height = title_block.height + subtitle_block.height + extra_spacing + title_block.leading + right = rect.right() - self.hmargin + width = right - self.hmargin + + # Draw main banner + p = main = QPainterPath(QPointF(self.hmargin, top)) + draw_curved_line(p, rect.width() - 2 * self.hmargin, 0, 0.1, -0.1, 0.9, -0.1) + deltax = self.GRADE * height + p.lineTo(right + deltax, top + height) + right_corner = p.currentPosition() + draw_curved_line(p, - width - 2 * deltax, 0, 0.1, 0.05, 0.9, 0.05) + left_corner = p.currentPosition() + p.closeSubpath() + + # Draw fold rectangles + rwidth = self.fold_width + yfrac = 0.1 + width23 = int(0.67 * rwidth) + rtop = top + height * yfrac + + def draw_fold(x, m=1, corner=left_corner): + ans = p = QPainterPath(QPointF(x, rtop)) + draw_curved_line(p, rwidth*m, 0, 0.1, 0.1*m, 0.5, -0.2*m) + fold_upper = p.currentPosition() + p.lineTo(p.currentPosition() + QPointF(-deltax*m, height)) + fold_corner = p.currentPosition() + draw_curved_line(p, -rwidth*m, 0, 0.2, -0.1*m, 0.8, -0.1*m) + draw_curved_line(p, deltax*m, -height, 0.2, 0.1*m, 0.8, 0.1*m) + p = inner_fold = QPainterPath(corner) + dp = fold_corner - p.currentPosition() + draw_curved_line(p, dp.x(), dp.y(), 0.5, 0.3*m, 1, 0*m) + p.lineTo(fold_upper), p.closeSubpath() + return ans, 
inner_fold + + left_fold, left_inner = draw_fold(self.hmargin - width23) + right_fold, right_inner = draw_fold(right + width23, m=-1, corner=right_corner) + + painter.save() + painter.setRenderHint(QPainter.Antialiasing) + pen = QPen(self.ccolor2) + pen.setWidth(3) + pen.setJoinStyle(Qt.RoundJoin) + painter.setPen(pen) + for r in (left_fold, right_fold): + painter.fillPath(r, QBrush(self.color2)) + painter.drawPath(r) + for r in (left_inner, right_inner): + painter.fillPath(r, QBrush(self.color2.darker())) + painter.drawPath(r) + painter.fillPath(main, QBrush(self.color2)) + painter.drawPath(main) + painter.restore() + return self.ccolor2, self.ccolor2, self.ccolor1 + + +class Ornamental(Style): + + NAME = 'Ornamental' + GUI_NAME = _('Ornamental') + + # SVG vectors {{{ + CORNER_VECTOR = "m 67.791903,64.260958 c -4.308097,-2.07925 -4.086719,-8.29575 0.334943,-9.40552 4.119758,-1.03399 8.732363,5.05239 5.393055,7.1162 -0.55,0.33992 -1,1.04147 -1,1.55902 0,1.59332 2.597425,1.04548 5.365141,-1.1316 1.999416,-1.57274 2.634859,-2.96609 2.634859,-5.7775 0,-9.55787 -9.827495,-13.42961 -24.43221,-9.62556 -3.218823,0.83839 -5.905663,1.40089 -5.970755,1.25 -0.06509,-0.1509 -0.887601,-1.19493 -1.827799,-2.32007 -1.672708,-2.00174 -1.636693,-2.03722 1.675668,-1.65052 1.861815,0.21736 6.685863,-0.35719 10.720107,-1.27678 12.280767,-2.79934 20.195487,-0.0248 22.846932,8.0092 3.187273,9.65753 -6.423297,17.7497 -15.739941,13.25313 z m 49.881417,-20.53932 c -3.19204,-2.701 -3.72967,-6.67376 -1.24009,-9.16334 2.48236,-2.48236 5.35141,-2.67905 7.51523,-0.51523 1.85966,1.85966 2.07045,6.52954 0.37143,8.22857 -2.04025,2.04024 3.28436,1.44595 6.92316,-0.77272 9.66959,-5.89579 0.88581,-18.22422 -13.0777,-18.35516 -5.28594,-0.0496 -10.31098,1.88721 -14.26764,5.4991 -1.98835,1.81509 -2.16454,1.82692 -2.7936,0.18763 -0.40973,-1.06774 0.12141,-2.82197 1.3628,-4.50104 2.46349,-3.33205 1.67564,-4.01299 -2.891784,-2.49938 -2.85998,0.94777 -3.81038,2.05378 -5.59837,6.51495 -1.184469,2.95536 
-3.346819,6.86882 -4.805219,8.69657 -1.4584,1.82776 -2.65164,4.02223 -2.65164,4.87662 0,3.24694 -4.442667,0.59094 -5.872557,-3.51085 -1.361274,-3.90495 0.408198,-8.63869 4.404043,-11.78183 5.155844,-4.05558 1.612374,-3.42079 -9.235926,1.65457 -12.882907,6.02725 -16.864953,7.18038 -24.795556,7.18038 -8.471637,0 -13.38802,-1.64157 -17.634617,-5.88816 -2.832233,-2.83224 -3.849773,-4.81378 -4.418121,-8.6038 -1.946289,-12.9787795 8.03227,-20.91713135 19.767685,-15.7259993 5.547225,2.4538018 6.993631,6.1265383 3.999564,10.1557393 -5.468513,7.35914 -15.917883,-0.19431 -10.657807,-7.7041155 1.486298,-2.1219878 1.441784,-2.2225068 -0.984223,-2.2225068 -1.397511,0 -4.010527,1.3130878 -5.806704,2.9179718 -2.773359,2.4779995 -3.265777,3.5977995 -3.265777,7.4266705 0,5.10943 2.254112,8.84197 7.492986,12.40748 8.921325,6.07175 19.286666,5.61396 37.12088,-1.63946 15.35037,-6.24321 21.294999,-7.42408 34.886123,-6.92999 11.77046,0.4279 19.35803,3.05537 24.34054,8.42878 4.97758,5.3681 2.53939,13.58271 -4.86733,16.39873 -4.17361,1.58681 -11.00702,1.19681 -13.31978,-0.76018 z m 26.50156,-0.0787 c -2.26347,-2.50111 -2.07852,-7.36311 0.39995,-10.51398 2.68134,-3.40877 10.49035,-5.69409 18.87656,-5.52426 l 6.5685,0.13301 -7.84029,0.82767 c -8.47925,0.89511 -12.76997,2.82233 -16.03465,7.20213 -1.92294,2.57976 -1.96722,3.00481 -0.57298,5.5 1.00296,1.79495 2.50427,2.81821 4.46514,3.04333 2.92852,0.33623 2.93789,0.32121 1.08045,-1.73124 -1.53602,-1.69728 -1.64654,-2.34411 -0.61324,-3.58916 2.84565,-3.4288 7.14497,-0.49759 5.03976,3.43603 -1.86726,3.48903 -8.65528,4.21532 -11.3692,1.21647 z m -4.17462,-14.20302 c -0.38836,-0.62838 -0.23556,-1.61305 0.33954,-2.18816 1.3439,-1.34389 4.47714,-0.17168 3.93038,1.47045 -0.5566,1.67168 -3.38637,2.14732 -4.26992,0.71771 z m -8.48037,-9.1829 c -12.462,-4.1101 -12.53952,-4.12156 -25.49998,-3.7694 -24.020921,0.65269 -32.338219,0.31756 -37.082166,-1.49417 -5.113999,-1.95305 -8.192504,-6.3647405 -6.485463,-9.2940713 0.566827,-0.972691 1.020091,-1.181447 
1.037211,-0.477701 0.01685,0.692606 1.268676,1.2499998 2.807321,1.2499998 1.685814,0 4.868609,1.571672 8.10041,4.0000015 4.221481,3.171961 6.182506,3.999221 9.473089,3.996261 l 4.149585,-0.004 -3.249996,-1.98156 c -3.056252,-1.863441 -4.051566,-3.8760635 -2.623216,-5.3044145 0.794,-0.794 6.188222,1.901516 9.064482,4.5295635 1.858669,1.698271 3.461409,1.980521 10.559493,1.859621 11.30984,-0.19266 20.89052,1.29095 31.97905,4.95208 7.63881,2.52213 11.51931,3.16471 22.05074,3.65141 7.02931,0.32486 13.01836,0.97543 13.30902,1.44571 0.29065,0.47029 -5.2356,0.83436 -12.28056,0.80906 -12.25942,-0.044 -13.34537,-0.2229 -25.30902,-4.16865 z" # noqa + # }}} + PATH_CACHE = {} + VIEWPORT = (400, 500) + + def calculate_margins(self, prefs): + self.hmargin = int((51 / self.VIEWPORT[0]) * prefs.cover_width) + self.vmargin = int((83 / self.VIEWPORT[1]) * prefs.cover_height) + + def __call__(self, painter, rect, color_theme, title_block, subtitle_block, footer_block): + if not self.PATH_CACHE: + from calibre.utils.speedups import svg_path_to_painter_path + try: + self.__class__.PATH_CACHE['corner'] = svg_path_to_painter_path(self.CORNER_VECTOR) + except Exception: + import traceback + traceback.print_exc() + p = painter + painter.setRenderHint(QPainter.Antialiasing) + g = QRadialGradient(QPointF(rect.center()), rect.width()) + g.setColorAt(0, self.color1), g.setColorAt(1, self.color2) + painter.fillRect(rect, QBrush(g)) + painter.save() + painter.setWindow(0, 0, *self.VIEWPORT) + try: + path = self.PATH_CACHE['corner'] + except KeyError: + path = QPainterPath() + pen = p.pen() + pen.setColor(self.ccolor1) + p.setPen(pen) + + def corner(): + b = QBrush(self.ccolor1) + p.fillPath(path, b) + p.rotate(90), p.translate(100, -100), p.scale(1, -1), p.translate(-103, -97) + p.fillPath(path, b) + p.setWorldTransform(QTransform()) + # Top-left corner + corner() + # Top right corner + p.scale(-1, 1), p.translate(-400, 0), corner() + # Bottom left corner + p.scale(1, -1), p.translate(0, -500), 
corner() + # Bottom right corner + p.scale(-1, -1), p.translate(-400, -500), corner() + for y in (28.4, 471.7): + p.drawLine(QPointF(160, y), QPointF(240, y)) + for x in (31.3, 368.7): + p.drawLine(QPointF(x, 155), QPointF(x, 345)) + pen.setWidthF(1.8) + p.setPen(pen) + for y in (23.8, 476.7): + p.drawLine(QPointF(160, y), QPointF(240, y)) + for x in (26.3, 373.7): + p.drawLine(QPointF(x, 155), QPointF(x, 345)) + painter.restore() + + return self.ccolor2, self.ccolor2, self.ccolor1 + + +class Blocks(Style): + + NAME = 'Blocks' + GUI_NAME = _('Blocks') + FOOTER_ALIGN = Qt.AlignRight | Qt.AlignTop + + def __call__(self, painter, rect, color_theme, title_block, subtitle_block, footer_block): + painter.fillRect(rect, self.color1) + y = rect.height() - rect.height() // 3 + r = QRect(rect) + r.setBottom(y) + painter.fillRect(rect, self.color1) + r = QRect(rect) + r.setTop(y) + painter.fillRect(r, self.color2) + return self.ccolor1, self.ccolor1, self.ccolor2 + + +def all_styles(): + return set( + x.NAME for x in itervalues(globals()) if + isinstance(x, type) and issubclass(x, Style) and x is not Style + ) + + +def load_styles(prefs, respect_disabled=True): + disabled = frozenset(prefs.disabled_styles) if respect_disabled else () + ans = tuple(x for x in itervalues(globals()) if + isinstance(x, type) and issubclass(x, Style) and x is not Style and x.NAME not in disabled) + if not ans and disabled: + # If all styles have been disabled, ignore the disabling and return all + # the styles + ans = load_styles(prefs, respect_disabled=False) + return ans + +# }}} + + +def init_environment(): + ensure_app() + load_builtin_fonts() + + +def generate_cover(mi, prefs=None, as_qimage=False): + init_environment() + prefs = prefs or cprefs + prefs = {k:prefs.get(k) for k in cprefs.defaults} + prefs = Prefs(**prefs) + color_theme = random.choice(load_color_themes(prefs)) + style = random.choice(load_styles(prefs))(color_theme, prefs) + title, subtitle, footer = format_text(mi, prefs) + 
img = QImage(prefs.cover_width, prefs.cover_height, QImage.Format_ARGB32) + title_block, subtitle_block, footer_block = layout_text( + prefs, img, title, subtitle, footer, img.height() // 3, style) + p = QPainter(img) + rect = QRect(0, 0, img.width(), img.height()) + colors = style(p, rect, color_theme, title_block, subtitle_block, footer_block) + for block, color in zip((title_block, subtitle_block, footer_block), colors): + p.setPen(color) + block.draw(p) + p.end() + img.setText('Generated cover', '%s %s' % (__appname__, __version__)) + if as_qimage: + return img + return pixmap_to_data(img) + + +def override_prefs(base_prefs, **overrides): + ans = {k:overrides.get(k, base_prefs[k]) for k in cprefs.defaults} + override_color_theme = overrides.get('override_color_theme') + if override_color_theme is not None: + all_themes = set(default_color_themes) | set(ans['color_themes']) + if override_color_theme in all_themes: + all_themes.discard(override_color_theme) + ans['disabled_color_themes'] = all_themes + override_style = overrides.get('override_style') + if override_style is not None: + styles = all_styles() + if override_style in styles: + styles.discard(override_style) + ans['disabled_styles'] = styles + + return ans + + +def create_cover(title, authors, series=None, series_index=1, prefs=None, as_qimage=False): + ' Create a cover from the specified title, author and series. Any user set' + ' templates are ignored, to ensure that the specified metadata is used. 
' + mi = Metadata(title, authors) + if series: + mi.series, mi.series_index = series, series_index + d = cprefs.defaults + prefs = override_prefs( + prefs or cprefs, title_template=d['title_template'], subtitle_template=d['subtitle_template'], footer_template=d['footer_template']) + return generate_cover(mi, prefs=prefs, as_qimage=as_qimage) + + +def calibre_cover2(title, author_string='', series_string='', prefs=None, as_qimage=False, logo_path=None): + init_environment() + title, subtitle, footer = '' + escape_formatting(title), '' + escape_formatting(series_string), '' + escape_formatting(author_string) + prefs = prefs or cprefs + prefs = {k:prefs.get(k) for k in cprefs.defaults} + scale = 800. / prefs['cover_height'] + scale_cover(prefs, scale) + prefs = Prefs(**prefs) + img = QImage(prefs.cover_width, prefs.cover_height, QImage.Format_ARGB32) + img.fill(Qt.white) + # colors = to_theme('ffffff ffffff 000000 000000') + color_theme = theme_to_colors(fallback_colors) + + class CalibeLogoStyle(Style): + NAME = GUI_NAME = 'calibre' + + def __call__(self, painter, rect, color_theme, title_block, subtitle_block, footer_block): + top = title_block.position.y + 10 + extra_spacing = subtitle_block.line_spacing // 2 if subtitle_block.line_spacing else title_block.line_spacing // 3 + height = title_block.height + subtitle_block.height + extra_spacing + title_block.leading + top += height + 25 + bottom = footer_block.position.y - 50 + logo = QImage(logo_path or I('library.png')) + pwidth, pheight = rect.width(), bottom - top + scaled, width, height = fit_image(logo.width(), logo.height(), pwidth, pheight) + x, y = (pwidth - width) // 2, (pheight - height) // 2 + rect = QRect(x, top + y, width, height) + painter.setRenderHint(QPainter.SmoothPixmapTransform) + painter.drawImage(rect, logo) + return self.ccolor1, self.ccolor1, self.ccolor1 + style = CalibeLogoStyle(color_theme, prefs) + title_block, subtitle_block, footer_block = layout_text( + prefs, img, title, subtitle, 
footer, img.height() // 3, style) + p = QPainter(img) + rect = QRect(0, 0, img.width(), img.height()) + colors = style(p, rect, color_theme, title_block, subtitle_block, footer_block) + for block, color in zip((title_block, subtitle_block, footer_block), colors): + p.setPen(color) + block.draw(p) + p.end() + img.setText('Generated cover', '%s %s' % (__appname__, __version__)) + if as_qimage: + return img + return pixmap_to_data(img) + + +def message_image(text, width=500, height=400, font_size=20): + init_environment() + img = QImage(width, height, QImage.Format_ARGB32) + img.fill(Qt.white) + p = QPainter(img) + f = QFont() + f.setPixelSize(font_size) + p.setFont(f) + r = img.rect().adjusted(10, 10, -10, -10) + p.drawText(r, Qt.AlignJustify | Qt.AlignVCenter | Qt.TextWordWrap, text) + p.end() + return pixmap_to_data(img) + + +def scale_cover(prefs, scale): + for x in ('cover_width', 'cover_height', 'title_font_size', 'subtitle_font_size', 'footer_font_size'): + prefs[x] = int(scale * prefs[x]) + + +def generate_masthead(title, output_path=None, width=600, height=60, as_qimage=False, font_family=None): + init_environment() + font_family = font_family or cprefs['title_font_family'] or 'Liberation Serif' + img = QImage(width, height, QImage.Format_ARGB32) + img.fill(Qt.white) + p = QPainter(img) + p.setRenderHints(QPainter.Antialiasing | QPainter.TextAntialiasing) + f = QFont(font_family) + f.setStyleStrategy(QFont.PreferAntialias) + f.setPixelSize((height * 3) // 4), f.setBold(True) + p.setFont(f) + p.drawText(img.rect(), Qt.AlignLeft | Qt.AlignVCenter, sanitize(title)) + p.end() + if as_qimage: + return img + data = pixmap_to_data(img) + if output_path is None: + return data + with open(output_path, 'wb') as f: + f.write(data) + + +def test(scale=0.25): + from PyQt5.Qt import QLabel, QPixmap, QMainWindow, QWidget, QScrollArea, QGridLayout + from calibre.gui2 import Application + app = Application([]) + mi = Metadata('Unknown', ['Kovid Goyal', 'John & Doe', 
'Author']) + mi.series = 'A series & styles' + m = QMainWindow() + sa = QScrollArea(m) + w = QWidget(m) + sa.setWidget(w) + l = QGridLayout(w) + w.setLayout(l), l.setSpacing(30) + scale *= w.devicePixelRatioF() + labels = [] + for r, color in enumerate(sorted(default_color_themes)): + for c, style in enumerate(sorted(all_styles())): + mi.series_index = c + 1 + mi.title = 'An algorithmic cover [%s]' % color + prefs = override_prefs(cprefs, override_color_theme=color, override_style=style) + scale_cover(prefs, scale) + img = generate_cover(mi, prefs=prefs, as_qimage=True) + img.setDevicePixelRatio(w.devicePixelRatioF()) + la = QLabel() + la.setPixmap(QPixmap.fromImage(img)) + l.addWidget(la, r, c) + labels.append(la) + m.setCentralWidget(sa) + w.resize(w.sizeHint()) + m.show() + app.exec_() + + +if __name__ == '__main__': + test() diff --git a/ebook_converter/ebooks/epub/__init__.py b/ebook_converter/ebooks/epub/__init__.py new file mode 100644 index 0000000..4629218 --- /dev/null +++ b/ebook_converter/ebooks/epub/__init__.py @@ -0,0 +1,49 @@ +from __future__ import absolute_import, division, print_function, unicode_literals + +__license__ = 'GPL v3' +__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' +__docformat__ = 'restructuredtext en' + +''' +Conversion to EPUB. +''' +from calibre.utils.zipfile import ZipFile, ZIP_STORED + + +def rules(stylesheets): + for s in stylesheets: + if hasattr(s, 'cssText'): + for r in s: + if r.type == r.STYLE_RULE: + yield r + + +def simple_container_xml(opf_path, extra_entries=''): + return '''\ + + + + + {extra_entries} + + + '''.format(opf_path, extra_entries=extra_entries) + + +def initialize_container(path_to_container, opf_name='metadata.opf', + extra_entries=[]): + ''' + Create an empty EPUB document, with a default skeleton. 
+ ''' + rootfiles = '' + for path, mimetype, _ in extra_entries: + rootfiles += ''.format( + path, mimetype) + CONTAINER = simple_container_xml(opf_name, rootfiles).encode('utf-8') + zf = ZipFile(path_to_container, 'w') + zf.writestr('mimetype', b'application/epub+zip', compression=ZIP_STORED) + zf.writestr('META-INF/', b'', 0o755) + zf.writestr('META-INF/container.xml', CONTAINER) + for path, _, data in extra_entries: + zf.writestr(path, data) + return zf diff --git a/ebook_converter/ebooks/oeb/polish/css.py b/ebook_converter/ebooks/oeb/polish/css.py new file mode 100644 index 0000000..cb84f20 --- /dev/null +++ b/ebook_converter/ebooks/oeb/polish/css.py @@ -0,0 +1,389 @@ +#!/usr/bin/env python2 +# vim:fileencoding=utf-8 +from __future__ import absolute_import, division, print_function, unicode_literals + +__license__ = 'GPL v3' +__copyright__ = '2014, Kovid Goyal ' + +from collections import defaultdict +from functools import partial + +from css_parser.css import CSSRule, CSSStyleDeclaration +from css_selectors import parse, SelectorSyntaxError + +from calibre import force_unicode +from calibre.ebooks.oeb.base import OEB_STYLES, OEB_DOCS, XHTML, css_text +from calibre.ebooks.oeb.normalize_css import normalize_filter_css, normalizers +from calibre.ebooks.oeb.polish.pretty import pretty_script_or_style, pretty_xml_tree, serialize +from calibre.utils.icu import numeric_sort_key +from css_selectors import Select, SelectorError +from polyglot.builtins import iteritems, itervalues, unicode_type, filter + + +def filter_used_rules(rules, log, select): + for rule in rules: + used = False + for selector in rule.selectorList: + try: + if select.has_matches(selector.selectorText): + used = True + break + except SelectorError: + # Cannot parse/execute this selector, be safe and assume it + # matches something + used = True + break + if not used: + yield rule + + +def get_imported_sheets(name, container, sheets, recursion_level=10, sheet=None): + ans = set() + sheet = sheet or 
sheets[name] + for rule in sheet.cssRules.rulesOfType(CSSRule.IMPORT_RULE): + if rule.href: + iname = container.href_to_name(rule.href, name) + if iname in sheets: + ans.add(iname) + if recursion_level > 0: + for imported_sheet in tuple(ans): + ans |= get_imported_sheets(imported_sheet, container, sheets, recursion_level=recursion_level-1) + ans.discard(name) + return ans + + +def merge_declarations(first, second): + for prop in second.getProperties(): + first.setProperty(prop) + + +def merge_identical_selectors(sheet): + ' Merge rules that have identical selectors ' + selector_map = defaultdict(list) + for rule in sheet.cssRules.rulesOfType(CSSRule.STYLE_RULE): + selector_map[rule.selectorText].append(rule) + remove = [] + for rule_group in itervalues(selector_map): + if len(rule_group) > 1: + for i in range(1, len(rule_group)): + merge_declarations(rule_group[0].style, rule_group[i].style) + remove.append(rule_group[i]) + for rule in remove: + sheet.cssRules.remove(rule) + return len(remove) + + +def remove_unused_css(container, report=None, remove_unused_classes=False, merge_rules=False): + ''' + Remove all unused CSS rules from the book. An unused CSS rule is one that does not match any actual content. + + :param report: An optional callable that takes a single argument. It is called with information about the operations being performed. + :param remove_unused_classes: If True, class attributes in the HTML that do not match any CSS rules are also removed. + :param merge_rules: If True, rules with identical selectors are merged. 
+ ''' + report = report or (lambda x:x) + + def safe_parse(name): + try: + return container.parsed(name) + except TypeError: + pass + sheets = {name:safe_parse(name) for name, mt in iteritems(container.mime_map) if mt in OEB_STYLES} + sheets = {k:v for k, v in iteritems(sheets) if v is not None} + num_merged = 0 + if merge_rules: + for name, sheet in iteritems(sheets): + num = merge_identical_selectors(sheet) + if num: + container.dirty(name) + num_merged += num + import_map = {name:get_imported_sheets(name, container, sheets) for name in sheets} + if remove_unused_classes: + class_map = {name:{icu_lower(x) for x in classes_in_rule_list(sheet.cssRules)} for name, sheet in iteritems(sheets)} + style_rules = {name:tuple(sheet.cssRules.rulesOfType(CSSRule.STYLE_RULE)) for name, sheet in iteritems(sheets)} + + num_of_removed_rules = num_of_removed_classes = 0 + + for name, mt in iteritems(container.mime_map): + if mt not in OEB_DOCS: + continue + root = container.parsed(name) + select = Select(root, ignore_inappropriate_pseudo_classes=True) + used_classes = set() + for style in root.xpath('//*[local-name()="style"]'): + if style.get('type', 'text/css') == 'text/css' and style.text: + sheet = container.parse_css(style.text) + if merge_rules: + num = merge_identical_selectors(sheet) + if num: + num_merged += num + container.dirty(name) + if remove_unused_classes: + used_classes |= {icu_lower(x) for x in classes_in_rule_list(sheet.cssRules)} + imports = get_imported_sheets(name, container, sheets, sheet=sheet) + for imported_sheet in imports: + style_rules[imported_sheet] = tuple(filter_used_rules(style_rules[imported_sheet], container.log, select)) + if remove_unused_classes: + used_classes |= class_map[imported_sheet] + rules = tuple(sheet.cssRules.rulesOfType(CSSRule.STYLE_RULE)) + unused_rules = tuple(filter_used_rules(rules, container.log, select)) + if unused_rules: + num_of_removed_rules += len(unused_rules) + [sheet.cssRules.remove(r) for r in unused_rules] + 
style.text = force_unicode(sheet.cssText, 'utf-8') + pretty_script_or_style(container, style) + container.dirty(name) + + for link in root.xpath('//*[local-name()="link" and @href]'): + sname = container.href_to_name(link.get('href'), name) + if sname not in sheets: + continue + style_rules[sname] = tuple(filter_used_rules(style_rules[sname], container.log, select)) + if remove_unused_classes: + used_classes |= class_map[sname] + + for iname in import_map[sname]: + style_rules[iname] = tuple(filter_used_rules(style_rules[iname], container.log, select)) + if remove_unused_classes: + used_classes |= class_map[iname] + + if remove_unused_classes: + for elem in root.xpath('//*[@class]'): + original_classes, classes = elem.get('class', '').split(), [] + for x in original_classes: + if icu_lower(x) in used_classes: + classes.append(x) + if len(classes) != len(original_classes): + if classes: + elem.set('class', ' '.join(classes)) + else: + del elem.attrib['class'] + num_of_removed_classes += len(original_classes) - len(classes) + container.dirty(name) + + for name, sheet in iteritems(sheets): + unused_rules = style_rules[name] + if unused_rules: + num_of_removed_rules += len(unused_rules) + [sheet.cssRules.remove(r) for r in unused_rules] + container.dirty(name) + + num_changes = num_of_removed_rules + num_merged + num_of_removed_classes + if num_changes > 0: + if num_of_removed_rules > 0: + report(ngettext('Removed one unused CSS style rule', 'Removed {} unused CSS style rules', + num_of_removed_rules).format(num_of_removed_rules)) + if num_of_removed_classes > 0: + report(ngettext('Removed one unused class from the HTML', 'Removed {} unused classes from the HTML', + num_of_removed_classes).format(num_of_removed_classes)) + if num_merged > 0: + report(ngettext('Merged one CSS style rule', 'Merged {} CSS style rules', + num_merged).format(num_merged)) + if num_of_removed_rules == 0: + report(_('No unused CSS style rules found')) + if remove_unused_classes and 
num_of_removed_classes == 0: + report(_('No unused class attributes found')) + if merge_rules and num_merged == 0: + report(_('No style rules that could be merged found')) + return num_changes > 0 + + +def filter_declaration(style, properties=()): + changed = False + for prop in properties: + if style.removeProperty(prop) != '': + changed = True + all_props = set(style.keys()) + for prop in style.getProperties(): + n = normalizers.get(prop.name, None) + if n is not None: + normalized = n(prop.name, prop.propertyValue) + removed = properties.intersection(set(normalized)) + if removed: + changed = True + style.removeProperty(prop.name) + for prop in set(normalized) - removed - all_props: + style.setProperty(prop, normalized[prop]) + return changed + + +def filter_sheet(sheet, properties=()): + from css_parser.css import CSSRule + changed = False + remove = [] + for rule in sheet.cssRules.rulesOfType(CSSRule.STYLE_RULE): + if filter_declaration(rule.style, properties): + changed = True + if rule.style.length == 0: + remove.append(rule) + for rule in remove: + sheet.cssRules.remove(rule) + return changed + + +def transform_inline_styles(container, name, transform_sheet, transform_style): + root = container.parsed(name) + changed = False + for style in root.xpath('//*[local-name()="style"]'): + if style.text and (style.get('type') or 'text/css').lower() == 'text/css': + sheet = container.parse_css(style.text) + if transform_sheet(sheet): + changed = True + style.text = force_unicode(sheet.cssText, 'utf-8') + pretty_script_or_style(container, style) + for elem in root.xpath('//*[@style]'): + text = elem.get('style', None) + if text: + style = container.parse_css(text, is_declaration=True) + if transform_style(style): + changed = True + if style.length == 0: + del elem.attrib['style'] + else: + elem.set('style', force_unicode(style.getCssText(separator=' '), 'utf-8')) + return changed + + +def transform_css(container, transform_sheet=None, transform_style=None, names=()): 
+ if not names: + types = OEB_STYLES | OEB_DOCS + names = [] + for name, mt in iteritems(container.mime_map): + if mt in types: + names.append(name) + + doc_changed = False + + for name in names: + mt = container.mime_map[name] + if mt in OEB_STYLES: + sheet = container.parsed(name) + if transform_sheet(sheet): + container.dirty(name) + doc_changed = True + elif mt in OEB_DOCS: + if transform_inline_styles(container, name, transform_sheet, transform_style): + container.dirty(name) + doc_changed = True + + return doc_changed + + +def filter_css(container, properties, names=()): + ''' + Remove the specified CSS properties from all CSS rules in the book. + + :param properties: Set of properties to remove. For example: :code:`{'font-family', 'color'}`. + :param names: The files from which to remove the properties. Defaults to all HTML and CSS files in the book. + ''' + properties = normalize_filter_css(properties) + return transform_css(container, transform_sheet=partial(filter_sheet, properties=properties), + transform_style=partial(filter_declaration, properties=properties), names=names) + + +def _classes_in_selector(selector, classes): + for attr in ('selector', 'subselector', 'parsed_tree'): + s = getattr(selector, attr, None) + if s is not None: + _classes_in_selector(s, classes) + cn = getattr(selector, 'class_name', None) + if cn is not None: + classes.add(cn) + + +def classes_in_selector(text): + classes = set() + try: + for selector in parse(text): + _classes_in_selector(selector, classes) + except SelectorSyntaxError: + pass + return classes + + +def classes_in_rule_list(css_rules): + classes = set() + for rule in css_rules: + if rule.type == rule.STYLE_RULE: + classes |= classes_in_selector(rule.selectorText) + elif hasattr(rule, 'cssRules'): + classes |= classes_in_rule_list(rule.cssRules) + return classes + + +def iter_declarations(sheet_or_rule): + if hasattr(sheet_or_rule, 'cssRules'): + for rule in sheet_or_rule.cssRules: + for x in 
iter_declarations(rule): + yield x + elif hasattr(sheet_or_rule, 'style'): + yield sheet_or_rule.style + elif isinstance(sheet_or_rule, CSSStyleDeclaration): + yield sheet_or_rule + + +def remove_property_value(prop, predicate): + ''' Remove the Values that match the predicate from this property. If all + values of the property would be removed, the property is removed from its + parent instead. Note that this means the property must have a parent (a + CSSStyleDeclaration). ''' + removed_vals = list(filter(predicate, prop.propertyValue)) + if len(removed_vals) == len(prop.propertyValue): + prop.parent.removeProperty(prop.name) + else: + x = css_text(prop.propertyValue) + for v in removed_vals: + x = x.replace(css_text(v), '').strip() + prop.propertyValue.cssText = x + return bool(removed_vals) + + +RULE_PRIORITIES = {t:i for i, t in enumerate((CSSRule.COMMENT, CSSRule.CHARSET_RULE, CSSRule.IMPORT_RULE, CSSRule.NAMESPACE_RULE))} + + +def sort_sheet(container, sheet_or_text): + ''' Sort the rules in a stylesheet. Note that in the general case this can + change the effective styles, but for most common sheets, it should be safe. 
+ ''' + sheet = container.parse_css(sheet_or_text) if isinstance(sheet_or_text, unicode_type) else sheet_or_text + + def text_sort_key(x): + return numeric_sort_key(unicode_type(x or '')) + + def selector_sort_key(x): + return (x.specificity, text_sort_key(x.selectorText)) + + def rule_sort_key(rule): + primary = RULE_PRIORITIES.get(rule.type, len(RULE_PRIORITIES)) + secondary = text_sort_key(getattr(rule, 'atkeyword', '') or '') + tertiary = None + if rule.type == CSSRule.STYLE_RULE: + primary += 1 + selectors = sorted(rule.selectorList, key=selector_sort_key) + tertiary = selector_sort_key(selectors[0]) + rule.selectorText = ', '.join(s.selectorText for s in selectors) + elif rule.type == CSSRule.FONT_FACE_RULE: + try: + tertiary = text_sort_key(rule.style.getPropertyValue('font-family')) + except Exception: + pass + + return primary, secondary, tertiary + sheet.cssRules.sort(key=rule_sort_key) + return sheet + + +def add_stylesheet_links(container, name, text): + root = container.parse_xhtml(text, name) + head = root.xpath('//*[local-name() = "head"]') + if not head: + return + head = head[0] + sheets = tuple(container.manifest_items_of_type(lambda mt: mt in OEB_STYLES)) + if not sheets: + return + for sname in sheets: + link = head.makeelement(XHTML('link'), type='text/css', rel='stylesheet', href=container.name_to_href(sname, name)) + head.append(link) + pretty_xml_tree(head) + return serialize(root, 'text/html') diff --git a/ebook_converter/ebooks/oeb/polish/replace.py b/ebook_converter/ebooks/oeb/polish/replace.py new file mode 100644 index 0000000..08fdbaa --- /dev/null +++ b/ebook_converter/ebooks/oeb/polish/replace.py @@ -0,0 +1,404 @@ +#!/usr/bin/env python2 +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai +from __future__ import absolute_import, division, print_function, unicode_literals + +__license__ = 'GPL v3' +__copyright__ = '2013, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +import codecs, shutil, os, posixpath +from 
polyglot.builtins import iteritems, itervalues, map +from functools import partial +from collections import Counter, defaultdict + +from calibre import sanitize_file_name +from calibre.ebooks.chardet import strip_encoding_declarations +from calibre.ebooks.oeb.base import css_text +from calibre.ebooks.oeb.polish.css import iter_declarations, remove_property_value +from calibre.ebooks.oeb.polish.utils import extract +from polyglot.urllib import urlparse, urlunparse + + +class LinkReplacer(object): + + def __init__(self, base, container, link_map, frag_map): + self.base = base + self.frag_map = frag_map + self.link_map = link_map + self.container = container + self.replaced = False + + def __call__(self, url): + if url and url.startswith('#'): + repl = self.frag_map(self.base, url[1:]) + if not repl or repl == url[1:]: + return url + self.replaced = True + return '#' + repl + name = self.container.href_to_name(url, self.base) + if not name: + return url + nname = self.link_map.get(name, None) + if not nname: + return url + purl = urlparse(url) + href = self.container.name_to_href(nname, self.base) + if purl.fragment: + nfrag = self.frag_map(name, purl.fragment) + if nfrag: + href += '#%s'%nfrag + if href != url: + self.replaced = True + return href + + +class IdReplacer(object): + + def __init__(self, base, container, id_map): + self.base, self.container, self.replaced = base, container, False + self.id_map = id_map + + def __call__(self, url): + if url and url.startswith('#'): + repl = self.id_map.get(self.base, {}).get(url[1:]) + if repl is None or repl == url[1:]: + return url + self.replaced = True + return '#' + repl + name = self.container.href_to_name(url, self.base) + if not name: + return url + id_map = self.id_map.get(name) + if id_map is None: + return url + purl = urlparse(url) + nfrag = id_map.get(purl.fragment) + if nfrag is None: + return url + purl = purl._replace(fragment=nfrag) + href = urlunparse(purl) + if href != url: + self.replaced = True + 
return href + + +class LinkRebaser(object): + + def __init__(self, container, old_name, new_name): + self.old_name, self.new_name = old_name, new_name + self.container = container + self.replaced = False + + def __call__(self, url): + if url and url.startswith('#'): + return url + purl = urlparse(url) + frag = purl.fragment + name = self.container.href_to_name(url, self.old_name) + if not name: + return url + if name == self.old_name: + name = self.new_name + href = self.container.name_to_href(name, self.new_name) + if frag: + href += '#' + frag + if href != url: + self.replaced = True + return href + + +def replace_links(container, link_map, frag_map=lambda name, frag:frag, replace_in_opf=False): + ''' + Replace links to files in the container. Will iterate over all files in the container and change the specified links in them. + + :param link_map: A mapping of old canonical name to new canonical name. For example: :code:`{'images/old.png': 'images/new.png'}` + :param frag_map: A callable that takes two arguments ``(name, anchor)`` and + returns a new anchor. This is useful if you need to change the anchors in + HTML files. By default, it does nothing. + :param replace_in_opf: If False, links are not replaced in the OPF file. + + ''' + for name, media_type in iteritems(container.mime_map): + if name == container.opf_name and not replace_in_opf: + continue + repl = LinkReplacer(name, container, link_map, frag_map) + container.replace_links(name, repl) + + +def replace_ids(container, id_map): + ''' + Replace all links in the container that pointed to the changed ids. 
+ + :param id_map: A mapping of {name:id_map} where each id_map is a mapping of {old_id:new_id} + :return: True iff at least one link was changed + + ''' + changed = False + for name, media_type in iteritems(container.mime_map): + repl = IdReplacer(name, container, id_map) + container.replace_links(name, repl) + if name == container.opf_name: + imap = id_map.get(name, {}) + for item in container.opf_xpath('//*[@idref]'): + old_id = item.get('idref') + if old_id is not None: + new_id = imap.get(old_id) + if new_id is not None: + item.set('idref', new_id) + if repl.replaced: + changed = True + return changed + + +def smarten_punctuation(container, report): + from calibre.ebooks.conversion.preprocess import smarten_punctuation + smartened = False + for path in container.spine_items: + name = container.abspath_to_name(path) + changed = False + with container.open(name, 'r+b') as f: + html = container.decode(f.read()) + newhtml = smarten_punctuation(html, container.log) + if newhtml != html: + changed = True + report(_('Smartened punctuation in: %s')%name) + newhtml = strip_encoding_declarations(newhtml) + f.seek(0) + f.truncate() + f.write(codecs.BOM_UTF8 + newhtml.encode('utf-8')) + if changed: + # Add an encoding declaration (it will be added automatically when + # serialized) + root = container.parsed(name) + for m in root.xpath('descendant::*[local-name()="meta" and @http-equiv]'): + m.getparent().remove(m) + container.dirty(name) + smartened = True + if not smartened: + report(_('No punctuation that could be smartened found')) + return smartened + + +def rename_files(container, file_map): + ''' + Rename files in the container, automatically updating all links to them. + + :param file_map: A mapping of old canonical name to new canonical name, for + example: :code:`{'text/chapter1.html': 'chapter1.html'}`. + ''' + overlap = set(file_map).intersection(set(itervalues(file_map))) + if overlap: + raise ValueError('Circular rename detected. 
The files %s are both rename targets and destinations' % ', '.join(overlap)) + for name, dest in iteritems(file_map): + if container.exists(dest): + if name != dest and name.lower() == dest.lower(): + # A case change on an OS with a case insensitive file-system. + continue + raise ValueError('Cannot rename {0} to {1} as {1} already exists'.format(name, dest)) + if len(tuple(itervalues(file_map))) != len(set(itervalues(file_map))): + raise ValueError('Cannot rename, the set of destination files contains duplicates') + link_map = {} + for current_name, new_name in iteritems(file_map): + container.rename(current_name, new_name) + if new_name != container.opf_name: # OPF is handled by the container + link_map[current_name] = new_name + replace_links(container, link_map, replace_in_opf=True) + + +def replace_file(container, name, path, basename, force_mt=None): + dirname, base = name.rpartition('/')[0::2] + nname = sanitize_file_name(basename) + if dirname: + nname = dirname + '/' + nname + with open(path, 'rb') as src: + if name != nname: + count = 0 + b, e = nname.rpartition('.')[0::2] + while container.exists(nname): + count += 1 + nname = b + ('_%d.%s' % (count, e)) + rename_files(container, {name:nname}) + mt = force_mt or container.guess_type(nname) + container.mime_map[nname] = mt + for itemid, q in iteritems(container.manifest_id_map): + if q == nname: + for item in container.opf_xpath('//opf:manifest/opf:item[@href and @id="%s"]' % itemid): + item.set('media-type', mt) + container.dirty(container.opf_name) + with container.open(nname, 'wb') as dest: + shutil.copyfileobj(src, dest) + + +def mt_to_category(container, mt): + from calibre.ebooks.oeb.polish.utils import guess_type + from calibre.ebooks.oeb.polish.container import OEB_FONTS + from calibre.ebooks.oeb.base import OEB_DOCS, OEB_STYLES + if mt in OEB_DOCS: + category = 'text' + elif mt in OEB_STYLES: + category = 'style' + elif mt in OEB_FONTS: + category = 'font' + elif mt == guess_type('a.opf'): + 
category = 'opf' + elif mt == guess_type('a.ncx'): + category = 'toc' + else: + category = mt.partition('/')[0] + return category + + +def get_recommended_folders(container, names): + ''' Return the folders that are recommended for the given filenames. The + recommendation is based on where the majority of files of the same type are + located in the container. If no files of a particular type are present, the + recommended folder is assumed to be the folder containing the OPF file. ''' + from calibre.ebooks.oeb.polish.utils import guess_type + counts = defaultdict(Counter) + for name, mt in iteritems(container.mime_map): + folder = name.rpartition('/')[0] if '/' in name else '' + counts[mt_to_category(container, mt)][folder] += 1 + + try: + opf_folder = counts['opf'].most_common(1)[0][0] + except KeyError: + opf_folder = '' + + recommendations = {category:counter.most_common(1)[0][0] for category, counter in iteritems(counts)} + return {n:recommendations.get(mt_to_category(container, guess_type(os.path.basename(n))), opf_folder) for n in names} + + +def normalize_case(container, val): + + def safe_listdir(x): + try: + return os.listdir(x) + except EnvironmentError: + return () + + parts = val.split('/') + ans = [] + for i in range(len(parts)): + q = '/'.join(parts[:i+1]) + x = container.name_to_abspath(q) + xl = parts[i].lower() + candidates = [c for c in safe_listdir(os.path.dirname(x)) if c != parts[i] and c.lower() == xl] + ans.append(candidates[0] if candidates else parts[i]) + return '/'.join(ans) + + +def rationalize_folders(container, folder_type_map): + all_names = set(container.mime_map) + new_names = set() + name_map = {} + for key in tuple(folder_type_map): + val = folder_type_map[key] + folder_type_map[key] = normalize_case(container, val) + for name in all_names: + if name.startswith('META-INF/'): + continue + category = mt_to_category(container, container.mime_map[name]) + folder = folder_type_map.get(category, None) + if folder is not None: + bn = 
posixpath.basename(name) + new_name = posixpath.join(folder, bn) + if new_name != name: + c = 0 + while new_name in all_names or new_name in new_names: + c += 1 + n, ext = bn.rpartition('.')[0::2] + new_name = posixpath.join(folder, '%s_%d.%s' % (n, c, ext)) + name_map[name] = new_name + new_names.add(new_name) + return name_map + + +def remove_links_in_sheet(href_to_name, sheet, predicate): + import_rules_to_remove = [] + changed = False + for i, r in enumerate(sheet): + if r.type == r.IMPORT_RULE: + name = href_to_name(r.href) + if predicate(name, r.href, None): + import_rules_to_remove.append(i) + for i in sorted(import_rules_to_remove, reverse=True): + sheet.deleteRule(i) + changed = True + + for dec in iter_declarations(sheet): + changed = remove_links_in_declaration(href_to_name, dec, predicate) or changed + return changed + + +def remove_links_in_declaration(href_to_name, style, predicate): + def check_pval(v): + if v.type == v.URI: + name = href_to_name(v.uri) + return predicate(name, v.uri, None) + return False + + changed = False + + for p in tuple(style.getProperties(all=True)): + changed = remove_property_value(p, check_pval) or changed + return changed + + +def remove_links_to(container, predicate): + ''' predicate must be a function that takes the arguments (name, href, + fragment=None) and returns True iff the link should be removed ''' + from calibre.ebooks.oeb.base import iterlinks, OEB_DOCS, OEB_STYLES, XPath, XHTML + stylepath = XPath('//h:style') + styleattrpath = XPath('//*[@style]') + changed = set() + for name, mt in iteritems(container.mime_map): + removed = False + if mt in OEB_DOCS: + root = container.parsed(name) + for el, attr, href, pos in iterlinks(root, find_links_in_css=False): + hname = container.href_to_name(href, name) + frag = href.partition('#')[-1] + if predicate(hname, href, frag): + if attr is None: + el.text = None + else: + if el.tag == XHTML('link') or el.tag == XHTML('img'): + extract(el) + else: + del el.attrib[attr] + 
removed = True + for tag in stylepath(root): + if tag.text and (tag.get('type') or 'text/css').lower() == 'text/css': + sheet = container.parse_css(tag.text) + if remove_links_in_sheet(partial(container.href_to_name, base=name), sheet, predicate): + tag.text = css_text(sheet) + removed = True + for tag in styleattrpath(root): + style = tag.get('style') + if style: + style = container.parse_css(style, is_declaration=True) + if remove_links_in_declaration(partial(container.href_to_name, base=name), style, predicate): + removed = True + tag.set('style', css_text(style)) + elif mt in OEB_STYLES: + removed = remove_links_in_sheet(partial(container.href_to_name, base=name), container.parsed(name), predicate) + if removed: + changed.add(name) + tuple(map(container.dirty, changed)) + return changed + + +def get_spine_order_for_all_files(container): + linear_names, non_linear_names = [], [] + for name, is_linear in container.spine_names: + (linear_names if is_linear else non_linear_names).append(name) + all_names = linear_names + non_linear_names + spine_names = frozenset(all_names) + ans = {} + for spine_pos, name in enumerate(all_names): + ans.setdefault(name, (spine_pos, -1)) + for i, href in enumerate(container.iterlinks(name, get_line_numbers=False)): + lname = container.href_to_name(href, name) + if lname not in spine_names: + ans.setdefault(lname, (spine_pos, i)) + return ans diff --git a/ebook_converter/ebooks/oeb/polish/split.py b/ebook_converter/ebooks/oeb/polish/split.py new file mode 100644 index 0000000..e24440a --- /dev/null +++ b/ebook_converter/ebooks/oeb/polish/split.py @@ -0,0 +1,517 @@ +#!/usr/bin/env python2 +# vim:fileencoding=utf-8 +from __future__ import absolute_import, division, print_function, unicode_literals + +__license__ = 'GPL v3' +__copyright__ = '2013, Kovid Goyal ' + +import copy, os, re +from polyglot.builtins import map, string_or_bytes, range + +from calibre.ebooks.oeb.base import barename, XPNSMAP, XPath, OPF, XHTML, OEB_DOCS +from 
calibre.ebooks.oeb.polish.errors import MalformedMarkup +from calibre.ebooks.oeb.polish.toc import node_from_loc +from calibre.ebooks.oeb.polish.replace import LinkRebaser +from polyglot.builtins import iteritems, unicode_type +from polyglot.urllib import urlparse + + +class AbortError(ValueError): + pass + + +def in_table(node): + while node is not None: + if node.tag.endswith('}table'): + return True + node = node.getparent() + return False + + +def adjust_split_point(split_point, log): + ''' + Move the split point up its ancestor chain if it has no content + before it. This handles the common case: +

Chapter 1

...
with a page break on the + h2. + ''' + sp = split_point + while True: + parent = sp.getparent() + if ( + parent is None or + barename(parent.tag) in {'body', 'html'} or + (parent.text and parent.text.strip()) or + parent.index(sp) > 0 + ): + break + sp = parent + + if sp is not split_point: + log.debug('Adjusted split point to ancestor') + + return sp + + +def get_body(root): + return root.find('h:body', namespaces=XPNSMAP) + + +def do_split(split_point, log, before=True): + ''' + Split tree into a *before* and an *after* tree at ``split_point``. + + :param split_point: The Element at which to split + :param before: If True tree is split before split_point, otherwise after split_point + :return: before_tree, after_tree + ''' + if before: + # We cannot adjust for after since moving an after split point to a + # parent will cause breakage if the parent contains any content + # after the original split point + split_point = adjust_split_point(split_point, log) + tree = split_point.getroottree() + path = tree.getpath(split_point) + + tree, tree2 = copy.deepcopy(tree), copy.deepcopy(tree) + root, root2 = tree.getroot(), tree2.getroot() + body, body2 = map(get_body, (root, root2)) + split_point = root.xpath(path)[0] + split_point2 = root2.xpath(path)[0] + + def nix_element(elem, top=True): + # Remove elem unless top is False in which case replace elem by its + # children + parent = elem.getparent() + if top: + parent.remove(elem) + else: + index = parent.index(elem) + parent[index:index+1] = list(elem.iterchildren()) + + # Tree 1 + hit_split_point = False + keep_descendants = False + split_point_descendants = frozenset(split_point.iterdescendants()) + for elem in tuple(body.iterdescendants()): + if elem is split_point: + hit_split_point = True + if before: + nix_element(elem) + else: + # We want to keep the descendants of the split point in + # Tree 1 + keep_descendants = True + # We want the split point element, but not its tail + elem.tail = '\n' + + continue + if 
hit_split_point: + if keep_descendants: + if elem in split_point_descendants: + # elem is a descendant keep it + continue + else: + # We are out of split_point, so prevent further set + # lookups of split_point_descendants + keep_descendants = False + nix_element(elem) + + # Tree 2 + ancestors = frozenset(XPath('ancestor::*')(split_point2)) + for elem in tuple(body2.iterdescendants()): + if elem is split_point2: + if not before: + # Keep the split point element's tail, if it contains non-whitespace + # text + tail = elem.tail + if tail and not tail.isspace(): + parent = elem.getparent() + idx = parent.index(elem) + if idx == 0: + parent.text = (parent.text or '') + tail + else: + sib = parent[idx-1] + sib.tail = (sib.tail or '') + tail + # Remove the element itself + nix_element(elem) + break + if elem in ancestors: + # We have to preserve the ancestors as they could have CSS + # styles that are inherited/applicable, like font or + # width. So we only remove the text, if any. + elem.text = '\n' + else: + nix_element(elem, top=False) + + body2.text = '\n' + + return tree, tree2 + + +class SplitLinkReplacer(object): + + def __init__(self, base, bottom_anchors, top_name, bottom_name, container): + self.bottom_anchors, self.bottom_name = bottom_anchors, bottom_name + self.container, self.top_name = container, top_name + self.base = base + self.replaced = False + + def __call__(self, url): + if url and url.startswith('#'): + return url + name = self.container.href_to_name(url, self.base) + if name != self.top_name: + return url + purl = urlparse(url) + if purl.fragment and purl.fragment in self.bottom_anchors: + url = self.container.name_to_href(self.bottom_name, self.base) + '#' + purl.fragment + self.replaced = True + return url + + +def split(container, name, loc_or_xpath, before=True, totals=None): + ''' + Split the file specified by name at the position specified by loc_or_xpath. + Splitting automatically migrates all links and references to the affected + files. 
+ + :param loc_or_xpath: Should be an XPath expression such as + //h:div[@id="split_here"]. Can also be a *loc* which is used internally to + implement splitting in the preview panel. + :param before: If True the split occurs before the identified element otherwise after it. + :param totals: Used internally + ''' + + root = container.parsed(name) + if isinstance(loc_or_xpath, unicode_type): + split_point = root.xpath(loc_or_xpath)[0] + else: + try: + split_point = node_from_loc(root, loc_or_xpath, totals=totals) + except MalformedMarkup: + # The webkit HTML parser and the container parser have yielded + # different node counts, this can happen if the file is valid XML + # but contains constructs like nested

tags. So force parse it + # with the HTML 5 parser and try again. + raw = container.raw_data(name) + root = container.parse_xhtml(raw, fname=name, force_html5_parse=True) + try: + split_point = node_from_loc(root, loc_or_xpath, totals=totals) + except MalformedMarkup: + raise MalformedMarkup(_('The file %s has malformed markup. Try running the Fix HTML tool' + ' before splitting') % name) + container.replace(name, root) + if in_table(split_point): + raise AbortError('Cannot split inside tables') + if split_point.tag.endswith('}body'): + raise AbortError('Cannot split on the tag') + tree1, tree2 = do_split(split_point, container.log, before=before) + root1, root2 = tree1.getroot(), tree2.getroot() + anchors_in_top = frozenset(root1.xpath('//*/@id')) | frozenset(root1.xpath('//*/@name')) | {''} + anchors_in_bottom = frozenset(root2.xpath('//*/@id')) | frozenset(root2.xpath('//*/@name')) + base, ext = name.rpartition('.')[0::2] + base = re.sub(r'_split\d+$', '', base) + nname, s = None, 0 + while not nname or container.exists(nname): + s += 1 + nname = '%s_split%d.%s' % (base, s, ext) + manifest_item = container.generate_item(nname, media_type=container.mime_map[name]) + bottom_name = container.href_to_name(manifest_item.get('href'), container.opf_name) + + # Fix links in the split trees + for r in (root1, root2): + for a in r.xpath('//*[@href]'): + url = a.get('href') + if url.startswith('#'): + fname = name + else: + fname = container.href_to_name(url, name) + if fname == name: + purl = urlparse(url) + if purl.fragment in anchors_in_top: + if r is root2: + a.set('href', '%s#%s' % (container.name_to_href(name, bottom_name), purl.fragment)) + else: + a.set('href', '#' + purl.fragment) + elif purl.fragment in anchors_in_bottom: + if r is root1: + a.set('href', '%s#%s' % (container.name_to_href(bottom_name, name), purl.fragment)) + else: + a.set('href', '#' + purl.fragment) + + # Fix all links in the container that point to anchors in the bottom tree + for fname, 
media_type in iteritems(container.mime_map): + if fname not in {name, bottom_name}: + repl = SplitLinkReplacer(fname, anchors_in_bottom, name, bottom_name, container) + container.replace_links(fname, repl) + + container.replace(name, root1) + container.replace(bottom_name, root2) + + spine = container.opf_xpath('//opf:spine')[0] + for spine_item, spine_name, linear in container.spine_iter: + if spine_name == name: + break + index = spine.index(spine_item) + 1 + + si = spine.makeelement(OPF('itemref'), idref=manifest_item.get('id')) + if not linear: + si.set('linear', 'no') + container.insert_into_xml(spine, si, index=index) + container.dirty(container.opf_name) + return bottom_name + + +def multisplit(container, name, xpath, before=True): + ''' + Split the specified file at multiple locations (all tags that match the specified XPath expression). See also: :func:`split`. + Splitting automatically migrates all links and references to the affected + files. + + :param before: If True the splits occur before the identified element otherwise after it. 
+ ''' + root = container.parsed(name) + nodes = root.xpath(xpath, namespaces=XPNSMAP) + if not nodes: + raise AbortError(_('The expression %s did not match any nodes') % xpath) + for split_point in nodes: + if in_table(split_point): + raise AbortError('Cannot split inside tables') + if split_point.tag.endswith('}body'): + raise AbortError('Cannot split on the tag') + + for i, tag in enumerate(nodes): + tag.set('calibre-split-point', unicode_type(i)) + + current = name + all_names = [name] + for i in range(len(nodes)): + current = split(container, current, '//*[@calibre-split-point="%d"]' % i, before=before) + all_names.append(current) + + for x in all_names: + for tag in container.parsed(x).xpath('//*[@calibre-split-point]'): + tag.attrib.pop('calibre-split-point') + container.dirty(x) + + return all_names[1:] + + +class MergeLinkReplacer(object): + + def __init__(self, base, anchor_map, master, container): + self.container, self.anchor_map = container, anchor_map + self.master = master + self.base = base + self.replaced = False + + def __call__(self, url): + if url and url.startswith('#'): + return url + name = self.container.href_to_name(url, self.base) + amap = self.anchor_map.get(name, None) + if amap is None: + return url + purl = urlparse(url) + frag = purl.fragment or '' + frag = amap.get(frag, frag) + url = self.container.name_to_href(self.master, self.base) + '#' + frag + self.replaced = True + return url + + +def add_text(body, text): + if len(body) > 0: + body[-1].tail = (body[-1].tail or '') + text + else: + body.text = (body.text or '') + text + + +def all_anchors(root): + return set(root.xpath('//*/@id')) | set(root.xpath('//*/@name')) + + +def all_stylesheets(container, name): + for link in XPath('//h:head/h:link[@href]')(container.parsed(name)): + name = container.href_to_name(link.get('href'), name) + typ = link.get('type', 'text/css') + if typ == 'text/css': + yield name + + +def unique_anchor(seen_anchors, current): + c = 0 + ans = current + 
while ans in seen_anchors: + c += 1 + ans = '%s_%d' % (current, c) + return ans + + +def remove_name_attributes(root): + # Remove all name attributes, replacing them with id attributes + for elem in root.xpath('//*[@id and @name]'): + del elem.attrib['name'] + for elem in root.xpath('//*[@name]'): + elem.set('id', elem.attrib.pop('name')) + + +def merge_html(container, names, master, insert_page_breaks=False): + p = container.parsed + root = p(master) + + # Ensure master has a + head = root.find('h:head', namespaces=XPNSMAP) + if head is None: + head = root.makeelement(XHTML('head')) + container.insert_into_xml(root, head, 0) + + seen_anchors = all_anchors(root) + seen_stylesheets = set(all_stylesheets(container, master)) + master_body = p(master).findall('h:body', namespaces=XPNSMAP)[-1] + master_base = os.path.dirname(master) + anchor_map = {n:{} for n in names if n != master} + first_anchor_map = {} + + for name in names: + if name == master: + continue + # Insert new stylesheets into master + for sheet in all_stylesheets(container, name): + if sheet not in seen_stylesheets: + seen_stylesheets.add(sheet) + link = head.makeelement(XHTML('link'), rel='stylesheet', type='text/css', href=container.name_to_href(sheet, master)) + container.insert_into_xml(head, link) + + # Rebase links if master is in a different directory + if os.path.dirname(name) != master_base: + container.replace_links(name, LinkRebaser(container, name, master)) + + root = p(name) + children = [] + for body in p(name).findall('h:body', namespaces=XPNSMAP): + children.append(body.text if body.text and body.text.strip() else '\n\n') + children.extend(body) + + first_child = '' + for first_child in children: + if not isinstance(first_child, string_or_bytes): + break + if isinstance(first_child, string_or_bytes): + # body contained only text, no tags + first_child = body.makeelement(XHTML('p')) + first_child.text, children[0] = children[0], first_child + + amap = anchor_map[name] + 
remove_name_attributes(root) + + for elem in root.xpath('//*[@id]'): + val = elem.get('id') + if not val: + continue + if val in seen_anchors: + nval = unique_anchor(seen_anchors, val) + elem.set('id', nval) + amap[val] = nval + else: + seen_anchors.add(val) + + if 'id' not in first_child.attrib: + first_child.set('id', unique_anchor(seen_anchors, 'top')) + seen_anchors.add(first_child.get('id')) + first_anchor_map[name] = first_child.get('id') + + if insert_page_breaks: + first_child.set('style', first_child.get('style', '') + '; page-break-before: always') + + amap[''] = first_child.get('id') + + # Fix links that point to local changed anchors + for a in XPath('//h:a[starts-with(@href, "#")]')(root): + q = a.get('href')[1:] + if q in amap: + a.set('href', '#' + amap[q]) + + for child in children: + if isinstance(child, string_or_bytes): + add_text(master_body, child) + else: + master_body.append(copy.deepcopy(child)) + + container.remove_item(name, remove_from_guide=False) + + # Fix all links in the container that point to merged files + for fname, media_type in iteritems(container.mime_map): + repl = MergeLinkReplacer(fname, anchor_map, master, container) + container.replace_links(fname, repl) + + return first_anchor_map + + +def merge_css(container, names, master): + p = container.parsed + msheet = p(master) + master_base = os.path.dirname(master) + merged = set() + + for name in names: + if name == master: + continue + # Rebase links if master is in a different directory + if os.path.dirname(name) != master_base: + container.replace_links(name, LinkRebaser(container, name, master)) + + sheet = p(name) + + # Remove charset rules + cr = [r for r in sheet.cssRules if r.type == r.CHARSET_RULE] + [sheet.deleteRule(sheet.cssRules.index(r)) for r in cr] + for rule in sheet.cssRules: + msheet.add(rule) + + container.remove_item(name) + merged.add(name) + + # Remove links to merged stylesheets in the html files, replacing with a + # link to the master sheet + for name, 
mt in iteritems(container.mime_map): + if mt in OEB_DOCS: + removed = False + root = p(name) + for link in XPath('//h:link[@href]')(root): + q = container.href_to_name(link.get('href'), name) + if q in merged: + container.remove_from_xml(link) + removed = True + if removed: + container.dirty(name) + if removed and master not in set(all_stylesheets(container, name)): + head = root.find('h:head', namespaces=XPNSMAP) + if head is not None: + link = head.makeelement(XHTML('link'), type='text/css', rel='stylesheet', href=container.name_to_href(master, name)) + container.insert_into_xml(head, link) + + +def merge(container, category, names, master): + ''' + Merge the specified files into a single file, automatically migrating all + links and references to the affected files. The file must all either be HTML or CSS files. + + :param category: Must be either ``'text'`` for HTML files or ``'styles'`` for CSS files + :param names: The list of files to be merged + :param master: Which of the merged files is the *master* file, that is, the file that will remain after merging. 
+ ''' + if category not in {'text', 'styles'}: + raise AbortError('Cannot merge files of type: %s' % category) + if len(names) < 2: + raise AbortError('Must specify at least two files to be merged') + if master not in names: + raise AbortError('The master file (%s) must be one of the files being merged' % master) + + if category == 'text': + merge_html(container, names, master) + elif category == 'styles': + merge_css(container, names, master) + + container.dirty(master) diff --git a/ebook_converter/ebooks/oeb/transforms/cover.py b/ebook_converter/ebooks/oeb/transforms/cover.py new file mode 100644 index 0000000..4409f0c --- /dev/null +++ b/ebook_converter/ebooks/oeb/transforms/cover.py @@ -0,0 +1,172 @@ +#!/usr/bin/env python2 +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import absolute_import, division, print_function, unicode_literals + +__license__ = 'GPL v3' +__copyright__ = '2010, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +import textwrap + +from calibre import guess_type +from calibre.utils.imghdr import identify +from calibre.utils.xml_parse import safe_xml_fromstring +from polyglot.builtins import unicode_type +from polyglot.urllib import unquote + + +class CoverManager(object): + + SVG_TEMPLATE = textwrap.dedent('''\ + + + + + Cover + + + +

+ + + +
+ + + ''') + + NONSVG_TEMPLATE = textwrap.dedent('''\ + + + + + Cover + + + +
+ cover +
+ + + ''') + + def __init__(self, no_default_cover=False, no_svg_cover=False, + preserve_aspect_ratio=False, fixed_size=None): + self.no_default_cover = no_default_cover + self.no_svg_cover = no_svg_cover + self.preserve_aspect_ratio = preserve_aspect_ratio + + ar = 'xMidYMid meet' if preserve_aspect_ratio else 'none' + self.svg_template = self.SVG_TEMPLATE.replace('__ar__', ar) + + if fixed_size is None: + style = 'style="height: 100%%"' + else: + width, height = fixed_size + style = 'style="height: %s; width: %s"'%(height, width) + self.non_svg_template = self.NONSVG_TEMPLATE.replace('__style__', + style) + + def __call__(self, oeb, opts, log): + self.oeb = oeb + self.log = log + self.insert_cover() + + def default_cover(self): + ''' + Create a generic cover for books that dont have a cover + ''' + if self.no_default_cover: + return None + self.log('Generating default cover') + m = self.oeb.metadata + title = unicode_type(m.title[0]) + authors = [unicode_type(x) for x in m.creator if x.role == 'aut'] + try: + from calibre.ebooks.covers import create_cover + series = series_index = None + if m.series: + try: + series, series_index = unicode_type(m.series[0]), m.series_index[0] + except IndexError: + pass + img_data = create_cover(title, authors, series, series_index) + id, href = self.oeb.manifest.generate('cover', + 'cover_image.jpg') + item = self.oeb.manifest.add(id, href, guess_type('t.jpg')[0], + data=img_data) + m.clear('cover') + m.add('cover', item.id) + + return item.href + except: + self.log.exception('Failed to generate default cover') + return None + + def inspect_cover(self, href): + from calibre.ebooks.oeb.base import urlnormalize + for x in self.oeb.manifest: + if x.href == urlnormalize(href): + try: + raw = x.data + return identify(raw)[1:] + except Exception: + self.log.exception('Failed to read cover image dimensions') + return -1, -1 + + def insert_cover(self): + from calibre.ebooks.oeb.base import urldefrag + g, m = self.oeb.guide, 
self.oeb.manifest + item = None + if 'titlepage' not in g: + if 'cover' in g: + href = g['cover'].href + else: + href = self.default_cover() + if href is None: + return + width, height = self.inspect_cover(href) + if width == -1 or height == -1: + self.log.warning('Failed to read cover dimensions') + width, height = 600, 800 + # if self.preserve_aspect_ratio: + # width, height = 600, 800 + self.svg_template = self.svg_template.replace('__viewbox__', + '0 0 %d %d'%(width, height)) + self.svg_template = self.svg_template.replace('__width__', + unicode_type(width)) + self.svg_template = self.svg_template.replace('__height__', + unicode_type(height)) + + if href is not None: + templ = self.non_svg_template if self.no_svg_cover \ + else self.svg_template + tp = templ%unquote(href) + id, href = m.generate('titlepage', 'titlepage.xhtml') + item = m.add(id, href, guess_type('t.xhtml')[0], + data=safe_xml_fromstring(tp)) + else: + item = self.oeb.manifest.hrefs[ + urldefrag(self.oeb.guide['titlepage'].href)[0]] + if item is not None: + self.oeb.spine.insert(0, item, True) + if 'cover' not in self.oeb.guide.refs: + self.oeb.guide.add('cover', 'Title Page', 'a') + self.oeb.guide.refs['cover'].href = item.href + if 'titlepage' in self.oeb.guide.refs: + self.oeb.guide.refs['titlepage'].href = item.href + titem = getattr(self.oeb.toc, 'item_that_refers_to_cover', None) + if titem is not None: + titem.href = item.href diff --git a/ebook_converter/ebooks/oeb/transforms/filenames.py b/ebook_converter/ebooks/oeb/transforms/filenames.py new file mode 100644 index 0000000..1539663 --- /dev/null +++ b/ebook_converter/ebooks/oeb/transforms/filenames.py @@ -0,0 +1,187 @@ +#!/usr/bin/env python2 +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import absolute_import, division, print_function, unicode_literals + +__license__ = 'GPL v3' +__copyright__ = '2010, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +import posixpath + +from lxml import etree + +from 
calibre.ebooks.oeb.base import rewrite_links, urlnormalize +from polyglot.urllib import urldefrag, urlparse + + +class RenameFiles(object): # {{{ + + ''' + Rename files and adjust all links pointing to them. Note that the spine + and manifest are not touched by this transform. + ''' + + def __init__(self, rename_map, renamed_items_map=None): + self.rename_map = rename_map + self.renamed_items_map = renamed_items_map + + def __call__(self, oeb, opts): + import css_parser + self.log = oeb.logger + self.opts = opts + self.oeb = oeb + + for item in oeb.manifest.items: + self.current_item = item + if etree.iselement(item.data): + rewrite_links(self.current_item.data, self.url_replacer) + elif hasattr(item.data, 'cssText'): + css_parser.replaceUrls(item.data, self.url_replacer) + + if self.oeb.guide: + for ref in self.oeb.guide.values(): + href = urlnormalize(ref.href) + href, frag = urldefrag(href) + replacement = self.rename_map.get(href, None) + if replacement is not None: + nhref = replacement + if frag: + nhref += '#' + frag + ref.href = nhref + + if self.oeb.toc: + self.fix_toc_entry(self.oeb.toc) + + def fix_toc_entry(self, toc): + if toc.href: + href = urlnormalize(toc.href) + href, frag = urldefrag(href) + replacement = self.rename_map.get(href, None) + + if replacement is not None: + nhref = replacement + if frag: + nhref = '#'.join((nhref, frag)) + toc.href = nhref + + for x in toc: + self.fix_toc_entry(x) + + def url_replacer(self, orig_url): + url = urlnormalize(orig_url) + parts = urlparse(url) + if parts.scheme: + # Only rewrite local URLs + return orig_url + path, frag = urldefrag(url) + if self.renamed_items_map: + orig_item = self.renamed_items_map.get(self.current_item.href, self.current_item) + else: + orig_item = self.current_item + + href = orig_item.abshref(path) + replacement = self.current_item.relhref(self.rename_map.get(href, href)) + if frag: + replacement += '#' + frag + return replacement + +# }}} + + +class UniqueFilenames(object): # {{{ + 
+ 'Ensure that every item in the manifest has a unique filename' + + def __call__(self, oeb, opts): + self.log = oeb.logger + self.opts = opts + self.oeb = oeb + + self.seen_filenames = set() + self.rename_map = {} + + for item in list(oeb.manifest.items): + fname = posixpath.basename(item.href) + if fname in self.seen_filenames: + suffix = self.unique_suffix(fname) + data = item.data + base, ext = posixpath.splitext(item.href) + nhref = base + suffix + ext + nhref = oeb.manifest.generate(href=nhref)[1] + spine_pos = item.spine_position + oeb.manifest.remove(item) + nitem = oeb.manifest.add(item.id, nhref, item.media_type, data=data, + fallback=item.fallback) + self.seen_filenames.add(posixpath.basename(nhref)) + self.rename_map[item.href] = nhref + if spine_pos is not None: + oeb.spine.insert(spine_pos, nitem, item.linear) + else: + self.seen_filenames.add(fname) + + if self.rename_map: + self.log('Found non-unique filenames, renaming to support broken' + ' EPUB readers like FBReader, Aldiko and Stanza...') + from pprint import pformat + self.log.debug(pformat(self.rename_map)) + + renamer = RenameFiles(self.rename_map) + renamer(oeb, opts) + + def unique_suffix(self, fname): + base, ext = posixpath.splitext(fname) + c = 0 + while True: + c += 1 + suffix = '_u%d'%c + candidate = base + suffix + ext + if candidate not in self.seen_filenames: + return suffix +# }}} + + +class FlatFilenames(object): # {{{ + + 'Ensure that every item in the manifest has a unique filename without subdirectories.' + + def __call__(self, oeb, opts): + self.log = oeb.logger + self.opts = opts + self.oeb = oeb + + self.rename_map = {} + self.renamed_items_map = {} + + for item in list(oeb.manifest.items): + # Flatten URL by removing directories. + # Example: a/b/c/index.html -> a_b_c_index.html + nhref = item.href.replace("/", "_") + + if item.href == nhref: + # URL hasn't changed, skip item. 
+ continue + + data = item.data + isp = item.spine_position + nhref = oeb.manifest.generate(href=nhref)[1] + if isp is not None: + oeb.spine.remove(item) + oeb.manifest.remove(item) + + nitem = oeb.manifest.add(item.id, nhref, item.media_type, data=data, + fallback=item.fallback) + self.rename_map[item.href] = nhref + self.renamed_items_map[nhref] = item + if isp is not None: + oeb.spine.insert(isp, nitem, item.linear) + + if self.rename_map: + self.log('Found non-flat filenames, renaming to support broken' + ' EPUB readers like FBReader...') + from pprint import pformat + self.log.debug(pformat(self.rename_map)) + self.log.debug(pformat(self.renamed_items_map)) + + renamer = RenameFiles(self.rename_map, self.renamed_items_map) + renamer(oeb, opts) +# }}} diff --git a/ebook_converter/ebooks/oeb/transforms/rescale.py b/ebook_converter/ebooks/oeb/transforms/rescale.py new file mode 100644 index 0000000..214c895 --- /dev/null +++ b/ebook_converter/ebooks/oeb/transforms/rescale.py @@ -0,0 +1,81 @@ +#!/usr/bin/env python2 +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import absolute_import, division, print_function, unicode_literals + +__license__ = 'GPL v3' +__copyright__ = '2009, Kovid Goyal ' +__docformat__ = 'restructuredtext en' + +from calibre import fit_image + + +class RescaleImages(object): + + 'Rescale all images to fit inside given screen size' + + def __init__(self, check_colorspaces=False): + self.check_colorspaces = check_colorspaces + + def __call__(self, oeb, opts): + self.oeb, self.opts, self.log = oeb, opts, oeb.log + self.rescale() + + def rescale(self): + from PIL import Image + from io import BytesIO + + is_image_collection = getattr(self.opts, 'is_image_collection', False) + + if is_image_collection: + page_width, page_height = self.opts.dest.comic_screen_size + else: + page_width, page_height = self.opts.dest.width, self.opts.dest.height + page_width -= (self.opts.margin_left + self.opts.margin_right) * self.opts.dest.dpi/72 
+ page_height -= (self.opts.margin_top + self.opts.margin_bottom) * self.opts.dest.dpi/72 + + for item in self.oeb.manifest: + if item.media_type.startswith('image'): + ext = item.media_type.split('/')[-1].upper() + if ext == 'JPG': + ext = 'JPEG' + if ext not in ('PNG', 'JPEG', 'GIF'): + ext = 'JPEG' + + raw = item.data + if hasattr(raw, 'xpath') or not raw: + # Probably an svg image + continue + try: + img = Image.open(BytesIO(raw)) + except Exception: + continue + width, height = img.size + + try: + if self.check_colorspaces and img.mode == 'CMYK': + self.log.warn( + 'The image %s is in the CMYK colorspace, converting it ' + 'to RGB as Adobe Digital Editions cannot display CMYK' % item.href) + img = img.convert('RGB') + except Exception: + self.log.exception('Failed to convert image %s from CMYK to RGB' % item.href) + + scaled, new_width, new_height = fit_image(width, height, page_width, page_height) + if scaled: + new_width = max(1, new_width) + new_height = max(1, new_height) + self.log('Rescaling image from %dx%d to %dx%d'%( + width, height, new_width, new_height), item.href) + try: + img = img.resize((new_width, new_height)) + except Exception: + self.log.exception('Failed to rescale image: %s' % item.href) + continue + buf = BytesIO() + try: + img.save(buf, ext) + except Exception: + self.log.exception('Failed to rescale image: %s' % item.href) + else: + item.data = buf.getvalue() + item.unload_data_from_memory() diff --git a/ebook_converter/ebooks/oeb/transforms/split.py b/ebook_converter/ebooks/oeb/transforms/split.py new file mode 100644 index 0000000..3124561 --- /dev/null +++ b/ebook_converter/ebooks/oeb/transforms/split.py @@ -0,0 +1,488 @@ +from __future__ import absolute_import, division, print_function, unicode_literals +__license__ = 'GPL v3' +__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' +__docformat__ = 'restructuredtext en' + +''' +Splitting of the XHTML flows. 
Splitting can happen on page boundaries or can be +forced at "likely" locations to conform to size limitations. This transform +assumes a prior call to the flatcss transform. +''' + +import os, functools, collections, re, copy +from collections import OrderedDict + +from lxml.etree import XPath as _XPath +from lxml import etree + +from calibre import as_unicode, force_unicode +from calibre.ebooks.epub import rules +from calibre.ebooks.oeb.base import (OEB_STYLES, XPNSMAP as NAMESPACES, + urldefrag, rewrite_links, XHTML, urlnormalize) +from calibre.ebooks.oeb.polish.split import do_split +from polyglot.builtins import iteritems, range, map, unicode_type +from polyglot.urllib import unquote +from css_selectors import Select, SelectorError + +XPath = functools.partial(_XPath, namespaces=NAMESPACES) + +SPLIT_POINT_ATTR = 'csp' + + +def tostring(root): + return etree.tostring(root, encoding='utf-8') + + +class SplitError(ValueError): + + def __init__(self, path, root): + size = len(tostring(root))/1024. 
+ ValueError.__init__(self, + _('Could not find reasonable point at which to split: ' + '%(path)s Sub-tree size: %(size)d KB')%dict( + path=path, size=size)) + + +class Split(object): + + def __init__(self, split_on_page_breaks=True, page_breaks_xpath=None, + max_flow_size=0, remove_css_pagebreaks=True): + self.split_on_page_breaks = split_on_page_breaks + self.page_breaks_xpath = page_breaks_xpath + self.max_flow_size = max_flow_size + self.page_break_selectors = None + self.remove_css_pagebreaks = remove_css_pagebreaks + if self.page_breaks_xpath is not None: + self.page_break_selectors = [(XPath(self.page_breaks_xpath), False)] + + def __call__(self, oeb, opts): + self.oeb = oeb + self.log = oeb.log + self.log('Splitting markup on page breaks and flow limits, if any...') + self.opts = opts + self.map = {} + for item in list(self.oeb.manifest.items): + if item.spine_position is not None and etree.iselement(item.data): + self.split_item(item) + + self.fix_links() + + def split_item(self, item): + page_breaks, page_break_ids = [], [] + if self.split_on_page_breaks: + page_breaks, page_break_ids = self.find_page_breaks(item) + + splitter = FlowSplitter(item, page_breaks, page_break_ids, + self.max_flow_size, self.oeb, self.opts) + if splitter.was_split: + am = splitter.anchor_map + self.map[item.href] = collections.defaultdict( + am.default_factory, am) + + def find_page_breaks(self, item): + if self.page_break_selectors is None: + self.page_break_selectors = set() + stylesheets = [x.data for x in self.oeb.manifest if x.media_type in + OEB_STYLES] + for rule in rules(stylesheets): + before = force_unicode(getattr(rule.style.getPropertyCSSValue( + 'page-break-before'), 'cssText', '').strip().lower()) + after = force_unicode(getattr(rule.style.getPropertyCSSValue( + 'page-break-after'), 'cssText', '').strip().lower()) + try: + if before and before not in {'avoid', 'auto', 'inherit'}: + self.page_break_selectors.add((rule.selectorText, True)) + if 
self.remove_css_pagebreaks: + rule.style.removeProperty('page-break-before') + except: + pass + try: + if after and after not in {'avoid', 'auto', 'inherit'}: + self.page_break_selectors.add((rule.selectorText, False)) + if self.remove_css_pagebreaks: + rule.style.removeProperty('page-break-after') + except: + pass + page_breaks = set() + select = Select(item.data) + if not self.page_break_selectors: + return [], [] + body = item.data.xpath('//h:body', namespaces=NAMESPACES) + if not body: + return [], [] + descendants = frozenset(body[0].iterdescendants('*')) + + for selector, before in self.page_break_selectors: + try: + for elem in select(selector): + if elem in descendants and elem.tag.rpartition('}')[2].lower() not in {'html', 'body', 'head', 'style', 'script', 'meta', 'link'}: + elem.set('pb_before', '1' if before else '0') + page_breaks.add(elem) + except SelectorError as err: + self.log.warn('Ignoring page breaks specified with invalid CSS selector: %r (%s)' % (selector, as_unicode(err))) + + for i, elem in enumerate(item.data.iter('*')): + try: + elem.set('pb_order', unicode_type(i)) + except TypeError: # Cant set attributes on comment nodes etc. 
+ continue + + page_breaks = list(page_breaks) + page_breaks.sort(key=lambda x:int(x.get('pb_order'))) + page_break_ids, page_breaks_ = [], [] + for i, x in enumerate(page_breaks): + x.set('id', x.get('id', 'calibre_pb_%d'%i)) + id = x.get('id') + try: + xp = XPath('//*[@id="%s"]'%id) + except: + try: + xp = XPath("//*[@id='%s']"%id) + except: + # The id has both a quote and an apostrophe or some other + # Just replace it since I doubt its going to work anywhere else + # either + id = 'calibre_pb_%d'%i + x.set('id', id) + xp = XPath('//*[@id=%r]'%id) + page_breaks_.append((xp, x.get('pb_before', '0') == '1')) + page_break_ids.append(id) + + for elem in item.data.iter(etree.Element): + elem.attrib.pop('pb_order', False) + elem.attrib.pop('pb_before', False) + + return page_breaks_, page_break_ids + + def fix_links(self): + ''' + Fix references to the split files in other content files. + ''' + for item in self.oeb.manifest: + if etree.iselement(item.data): + self.current_item = item + rewrite_links(item.data, self.rewrite_links) + + def rewrite_links(self, url): + href, frag = urldefrag(url) + try: + href = self.current_item.abshref(href) + except ValueError: + # Unparseable URL + return url + try: + href = urlnormalize(href) + except ValueError: + # href has non utf-8 quoting + return url + if href in self.map: + anchor_map = self.map[href] + nhref = anchor_map[frag if frag else None] + nhref = self.current_item.relhref(nhref) + if frag: + nhref = '#'.join((unquote(nhref), frag)) + + return nhref + return url + + +class FlowSplitter(object): + 'The actual splitting logic' + + def __init__(self, item, page_breaks, page_break_ids, max_flow_size, oeb, + opts): + self.item = item + self.oeb = oeb + self.opts = opts + self.log = oeb.log + self.page_breaks = page_breaks + self.page_break_ids = page_break_ids + self.max_flow_size = max_flow_size + self.base = item.href + self.csp_counter = 0 + + base, ext = os.path.splitext(self.base) + self.base = base.replace('%', 
'%%')+'_split_%.3d'+ext + + self.trees = [self.item.data.getroottree()] + self.splitting_on_page_breaks = True + if self.page_breaks: + self.split_on_page_breaks(self.trees[0]) + self.splitting_on_page_breaks = False + + if self.max_flow_size > 0: + lt_found = False + self.log('\tLooking for large trees in %s...'%item.href) + trees = list(self.trees) + self.tree_map = {} + for i, tree in enumerate(trees): + size = len(tostring(tree.getroot())) + if size > self.max_flow_size: + self.log('\tFound large tree #%d'%i) + lt_found = True + self.split_trees = [] + self.split_to_size(tree) + self.tree_map[tree] = self.split_trees + if not lt_found: + self.log('\tNo large trees found') + self.trees = [] + for x in trees: + self.trees.extend(self.tree_map.get(x, [x])) + + self.was_split = len(self.trees) > 1 + if self.was_split: + self.log('\tSplit into %d parts'%len(self.trees)) + self.commit() + + def split_on_page_breaks(self, orig_tree): + ordered_ids = OrderedDict() + all_page_break_ids = frozenset(self.page_break_ids) + for elem_id in orig_tree.xpath('//*/@id'): + if elem_id in all_page_break_ids: + ordered_ids[elem_id] = self.page_breaks[ + self.page_break_ids.index(elem_id)] + + self.trees = [orig_tree] + while ordered_ids: + pb_id, (pattern, before) = next(iteritems(ordered_ids)) + del ordered_ids[pb_id] + for i in range(len(self.trees)-1, -1, -1): + tree = self.trees[i] + elem = pattern(tree) + if elem: + self.log.debug('\t\tSplitting on page-break at id=%s'% + elem[0].get('id')) + before_tree, after_tree = self.do_split(tree, elem[0], before) + self.trees[i:i+1] = [before_tree, after_tree] + break + + trees, ids = [], set() + for tree in self.trees: + root = tree.getroot() + if self.is_page_empty(root): + discarded_ids = root.xpath('//*[@id]') + for x in discarded_ids: + x = x.get('id') + if not x.startswith('calibre_'): + ids.add(x) + else: + if ids: + body = self.get_body(root) + if body is not None: + existing_ids = frozenset(body.xpath('//*/@id')) + for x in 
ids - existing_ids: + body.insert(0, body.makeelement(XHTML('div'), id=x, style='height:0pt')) + ids = set() + trees.append(tree) + self.trees = trees + + def get_body(self, root): + body = root.xpath('//h:body', namespaces=NAMESPACES) + if not body: + return None + return body[0] + + def do_split(self, tree, split_point, before): + ''' + Split ``tree`` into a *before* and *after* tree at ``split_point``. + + :param before: If True tree is split before split_point, otherwise after split_point + :return: before_tree, after_tree + ''' + return do_split(split_point, self.log, before=before) + + def is_page_empty(self, root): + body = self.get_body(root) + if body is None: + return False + txt = re.sub(r'\s+|\xa0', '', + etree.tostring(body, method='text', encoding='unicode')) + if len(txt) > 1: + return False + for img in root.xpath('//h:img', namespaces=NAMESPACES): + if img.get('style', '') != 'display:none': + return False + if root.xpath('//*[local-name() = "svg"]'): + return False + return True + + def split_text(self, text, root, size): + self.log.debug('\t\t\tSplitting text of length: %d'%len(text)) + rest = text.replace('\r', '') + parts = re.split('\n\n', rest) + self.log.debug('\t\t\t\tFound %d parts'%len(parts)) + if max(map(len, parts)) > size: + raise SplitError('Cannot split as file contains a
 tag '
+                'with a very large paragraph', root)
+        ans = []
+        buf = ''
+        for part in parts:
+            if len(buf) + len(part) < size:
+                buf += '\n\n'+part
+            else:
+                ans.append(buf)
+                buf = part
+        return ans
+
+    def split_to_size(self, tree):
+        self.log.debug('\t\tSplitting...')
+        root = tree.getroot()
+        # Split large 
 tags if they contain only text
+        for pre in XPath('//h:pre')(root):
+            if len(tuple(pre.iterchildren(etree.Element))) > 0:
+                continue
+            if pre.text and len(pre.text) > self.max_flow_size*0.5:
+                self.log.debug('\t\tSplitting large 
 tag')
+                frags = self.split_text(pre.text, root, int(0.2*self.max_flow_size))
+                new_pres = []
+                for frag in frags:
+                    pre2 = copy.copy(pre)
+                    pre2.text = frag
+                    pre2.tail = ''
+                    new_pres.append(pre2)
+                new_pres[-1].tail = pre.tail
+                p = pre.getparent()
+                i = p.index(pre)
+                p[i:i+1] = new_pres
+
+        split_point, before = self.find_split_point(root)
+        if split_point is None:
+            raise SplitError(self.item.href, root)
+        self.log.debug('\t\t\tSplit point:', split_point.tag, tree.getpath(split_point))
+
+        trees = self.do_split(tree, split_point, before)
+        sizes = [len(tostring(t.getroot())) for t in trees]
+        if min(sizes) < 5*1024:
+            self.log.debug('\t\t\tSplit tree too small')
+            self.split_to_size(tree)
+            return
+
+        for t, size in zip(trees, sizes):
+            r = t.getroot()
+            if self.is_page_empty(r):
+                continue
+            elif size <= self.max_flow_size:
+                self.split_trees.append(t)
+                self.log.debug(
+                    '\t\t\tCommitted sub-tree #%d (%d KB)'%(
+                               len(self.split_trees), size/1024.))
+            else:
+                self.log.debug(
+                        '\t\t\tSplit tree still too large: %d KB' % (size/1024.))
+                self.split_to_size(t)
+
    def find_split_point(self, root):
        '''
        Find the tag at which to split the tree rooted at `root`.
        Search order is:
            * Heading tags (h1-h6)
            * Top-level <div> tags
            * <pre> tags
            * <hr> tags
            * <p> tags
            * <div> tags
            * <br> tags
            * <li> tags

        We try to split in the "middle" of the file (as defined by tag
        counts).

        :return: A 2-tuple ``(elem, True)``; ``elem`` is ``None`` when no
            candidate tag survives the checks below.
        '''
        def pick_elem(elems):
            # Pick the middle candidate that has not already been tried as a
            # split point; mark it so recursive retries choose a new tag.
            if elems:
                elems = [i for i in elems if i.get(SPLIT_POINT_ATTR, '0') !=
                        '1']
                if elems:
                    i = int(len(elems)//2)
                    elems[i].set(SPLIT_POINT_ATTR, '1')
                    return elems[i]

        for path in (
                '//*[re:match(name(), "h[1-6]", "i")]',
                '/h:html/h:body/h:div',
                '//h:pre',
                '//h:hr',
                '//h:p',
                '//h:div',
                '//h:br',
                '//h:li',
        ):
            elems = root.xpath(path, namespaces=NAMESPACES)
            elem = pick_elem(elems)
            if elem is not None:
                try:
                    # Sanity check: the chosen element must be addressable by
                    # a compilable XPath so it can be located again later.
                    XPath(elem.getroottree().getpath(elem))
                except:
                    # NOTE(review): bare except also swallows KeyboardInterrupt
                    continue
                return elem, True

        return None, True

    def commit(self):
        '''
        Commit all changes caused by the split. Calculates an *anchor_map* for
        all anchors in the original tree. Internal links are re-directed. The
        original file is deleted and the split files are saved.
        '''
        if not self.was_split:
            return
        # Anchors not seen below fall back to the first split file (base%0).
        self.anchor_map = collections.defaultdict(lambda :self.base%0)
        self.files = []

        for i, tree in enumerate(self.trees):
            root = tree.getroot()
            self.files.append(self.base%i)
            # First occurrence wins: an anchor maps to the earliest split
            # file that contains it.
            for elem in root.xpath('//*[@id or @name]'):
                for anchor in elem.get('id', ''), elem.get('name', ''):
                    if anchor != '' and anchor not in self.anchor_map:
                        self.anchor_map[anchor] = self.files[-1]
            # Strip the bookkeeping attribute added by find_split_point()
            for elem in root.xpath('//*[@%s]'%SPLIT_POINT_ATTR):
                elem.attrib.pop(SPLIT_POINT_ATTR, '0')

        spine_pos = self.item.spine_position

        # Iterate in reverse so that repeated inserts at the same spine_pos
        # leave the split files in their original order.
        for current, tree in zip(*map(reversed, (self.files, self.trees))):
            # Redirect fragment-only links (#anchor) to whichever split file
            # now holds their target.
            for a in tree.getroot().xpath('//h:a[@href]', namespaces=NAMESPACES):
                href = a.get('href').strip()
                if href.startswith('#'):
                    anchor = href[1:]
                    file = self.anchor_map[anchor]
                    file = self.item.relhref(file)
                    if file != current:
                        a.set('href', file+href)

            new_id = self.oeb.manifest.generate(id=self.item.id)[0]
            new_item = self.oeb.manifest.add(new_id, current,
                    self.item.media_type, data=tree.getroot())
            self.oeb.spine.insert(spine_pos, new_item, self.item.linear)

        # Retarget guide references that pointed into the original file
        if self.oeb.guide:
            for ref in self.oeb.guide.values():
                href, frag = urldefrag(ref.href)
                if href == self.item.href:
                    nhref = self.anchor_map[frag if frag else None]
                    if frag:
                        nhref = '#'.join((nhref, frag))
                    ref.href = nhref

        def fix_toc_entry(toc):
            # Recursively retarget TOC nodes that referenced the split file
            if toc.href:
                href, frag = urldefrag(toc.href)
                if href == self.item.href:
                    nhref = self.anchor_map[frag if frag else None]
                    if frag:
                        nhref = '#'.join((nhref, frag))
                    toc.href = nhref
            for x in toc:
                fix_toc_entry(x)

        if self.oeb.toc:
            fix_toc_entry(self.oeb.toc)

        # Retarget page-map entries the same way
        if self.oeb.pages:
            for page in self.oeb.pages:
                href, frag = urldefrag(page.href)
                if href == self.item.href:
                    nhref = self.anchor_map[frag if frag else None]
                    if frag:
                        nhref = '#'.join((nhref, frag))
                    page.href = nhref

        self.oeb.manifest.remove(self.item)