1
0
mirror of https://github.com/gryf/ebook-converter.git synced 2026-03-28 22:53:43 +01:00

Fixed flake8 issues in several modules

This commit is contained in:
2020-06-14 15:49:11 +02:00
parent 1d4f75ceba
commit 9891d02694
10 changed files with 505 additions and 335 deletions

View File

@@ -15,17 +15,18 @@ from ebook_converter.polyglot.builtins import as_unicode
def sanitize_file_name(x): def sanitize_file_name(x):
ans = re.sub(r'\s+', ' ', re.sub(r'[?&=;#]', '_', ascii_filename(x))).strip().rstrip('.') ans = re.sub(r'\s+', ' ', re.sub(r'[?&=;#]', '_',
ascii_filename(x))).strip().rstrip('.')
ans, ext = ans.rpartition('.')[::2] ans, ext = ans.rpartition('.')[::2]
return (ans.strip() + '.' + ext.strip()).rstrip('.') return (ans.strip() + '.' + ext.strip()).rstrip('.')
class HTMLInput(InputFormatPlugin): class HTMLInput(InputFormatPlugin):
name = 'HTML Input' name = 'HTML Input'
author = 'Kovid Goyal' author = 'Kovid Goyal'
description = 'Convert HTML and OPF files to an OEB' description = 'Convert HTML and OPF files to an OEB'
file_types = {'opf', 'html', 'htm', 'xhtml', 'xhtm', 'shtm', 'shtml'} file_types = {'opf', 'html', 'htm', 'xhtml', 'xhtm', 'shtm', 'shtml'}
commit_name = 'html_input' commit_name = 'html_input'
options = { options = {

View File

@@ -6,10 +6,10 @@ from ebook_converter.customize.conversion import InputFormatPlugin
class HTMLZInput(InputFormatPlugin): class HTMLZInput(InputFormatPlugin):
name = 'HTLZ Input' name = 'HTLZ Input'
author = 'John Schember' author = 'John Schember'
description = 'Convert HTML files to HTML' description = 'Convert HTML files to HTML'
file_types = {'htmlz'} file_types = {'htmlz'}
commit_name = 'htmlz_input' commit_name = 'htmlz_input'
def convert(self, stream, options, file_ext, log, def convert(self, stream, options, file_ext, log,
@@ -36,13 +36,14 @@ class HTMLZInput(InputFormatPlugin):
top_levels.append(x) top_levels.append(x)
# Try to find an index file.
for x in top_levels: for x in top_levels:
if x.lower() in (u'index.html', u'index.xhtml', u'index.htm'): if x.lower() in ('index.html', 'index.xhtml', 'index.htm'):
index = x index = x
break break
# Look for multiple HTML files in the archive. We look at the # Look for multiple HTML files in the archive. We look at the
# top level files only as only they matter in HTMLZ. # top level files only as only they matter in HTMLZ.
for x in top_levels: for x in top_levels:
if os.path.splitext(x)[1].lower() in (u'.html', u'.xhtml', u'.htm'): if os.path.splitext(x)[1].lower() in ('.html', '.xhtml',
'.htm'):
# Set index to the first HTML file found if it's not # Set index to the first HTML file found if it's not
# called index. # called index.
if not index: if not index:
@@ -84,15 +85,14 @@ class HTMLZInput(InputFormatPlugin):
# Keep bumping the numeric suffix until the candidate name does not exist
# on disk, so the file written next does not clobber an existing one.
c = 0
while os.path.exists(htmlfile):
    c += 1
    htmlfile = 'index%d.html' % c
with open(htmlfile, 'wb') as f: with open(htmlfile, 'wb') as f:
f.write(html.encode('utf-8')) f.write(html.encode('utf-8'))
odi = options.debug_pipeline odi = options.debug_pipeline
options.debug_pipeline = None options.debug_pipeline = None
# Generate oeb from html conversion. # Generate oeb from html conversion.
with open(htmlfile, 'rb') as f: with open(htmlfile, 'rb') as f:
oeb = html_input.convert(f, options, 'html', log, oeb = html_input.convert(f, options, 'html', log, {})
{})
options.debug_pipeline = odi options.debug_pipeline = odi
os.remove(htmlfile) os.remove(htmlfile)

View File

@@ -1,5 +1,11 @@
import sys, os, re, math, errno, uuid, numbers import sys
from collections import OrderedDict, defaultdict import os
import re
import math
import errno
import uuid
import numbers
import collections
import mimetypes import mimetypes
from lxml import etree from lxml import etree
@@ -7,23 +13,24 @@ from lxml import html
from lxml.html.builder import ( from lxml.html.builder import (
HTML, HEAD, TITLE, BODY, LINK, META, P, SPAN, BR, DIV, A, DT, DL, DD, H1) HTML, HEAD, TITLE, BODY, LINK, META, P, SPAN, BR, DIV, A, DT, DL, DD, H1)
from ebook_converter import guess_type
from ebook_converter.ebooks.docx.container import DOCX
from ebook_converter.ebooks.docx.names import XML, generate_anchor
from ebook_converter.ebooks.docx.styles import Styles, inherit, PageProperties
from ebook_converter.ebooks.docx.numbering import Numbering
from ebook_converter.ebooks.docx.fonts import Fonts, is_symbol_font, map_symbol_text
from ebook_converter.ebooks.docx.images import Images
from ebook_converter.ebooks.docx.tables import Tables
from ebook_converter.ebooks.docx.footnotes import Footnotes
from ebook_converter.ebooks.docx.cleanup import cleanup_markup from ebook_converter.ebooks.docx.cleanup import cleanup_markup
from ebook_converter.ebooks.docx.container import DOCX
from ebook_converter.ebooks.docx.fields import Fields
from ebook_converter.ebooks.docx.fonts import Fonts
from ebook_converter.ebooks.docx.fonts import is_symbol_font
from ebook_converter.ebooks.docx.fonts import map_symbol_text
from ebook_converter.ebooks.docx.footnotes import Footnotes
from ebook_converter.ebooks.docx.images import Images
from ebook_converter.ebooks.docx.names import XML, generate_anchor
from ebook_converter.ebooks.docx.numbering import Numbering
from ebook_converter.ebooks.docx.settings import Settings
from ebook_converter.ebooks.docx.styles import Styles, inherit, PageProperties
from ebook_converter.ebooks.docx.tables import Tables
from ebook_converter.ebooks.docx.theme import Theme from ebook_converter.ebooks.docx.theme import Theme
from ebook_converter.ebooks.docx.toc import create_toc from ebook_converter.ebooks.docx.toc import create_toc
from ebook_converter.ebooks.docx.fields import Fields
from ebook_converter.ebooks.docx.settings import Settings
from ebook_converter.ebooks.metadata.opf2 import OPFCreator from ebook_converter.ebooks.metadata.opf2 import OPFCreator
from ebook_converter.utils.localization import canonicalize_lang, lang_as_iso639_1 from ebook_converter.utils.localization import canonicalize_lang
from ebook_converter.utils.localization import lang_as_iso639_1
NBSP = '\xa0' NBSP = '\xa0'
@@ -54,7 +61,9 @@ def html_lang(docx_lang):
class Convert(object): class Convert(object):
def __init__(self, path_or_stream, dest_dir=None, log=None, detect_cover=True, notes_text=None, notes_nopb=False, nosupsub=False): def __init__(self, path_or_stream, dest_dir=None, log=None,
detect_cover=True, notes_text=None, notes_nopb=False,
nosupsub=False):
self.docx = DOCX(path_or_stream, log=log) self.docx = DOCX(path_or_stream, log=log)
self.namespace = self.docx.namespace self.namespace = self.docx.namespace
self.ms_pat = re.compile(r'\s{2,}') self.ms_pat = re.compile(r'\s{2,}')
@@ -73,7 +82,7 @@ class Convert(object):
self.fields = Fields(self.namespace) self.fields = Fields(self.namespace)
self.styles = Styles(self.namespace, self.tables) self.styles = Styles(self.namespace, self.tables)
self.images = Images(self.namespace, self.log) self.images = Images(self.namespace, self.log)
self.object_map = OrderedDict() self.object_map = collections.OrderedDict()
self.html = HTML( self.html = HTML(
HEAD( HEAD(
META(charset='utf-8'), META(charset='utf-8'),
@@ -82,9 +91,9 @@ class Convert(object):
), ),
self.body self.body
) )
self.html.text='\n\t' self.html.text = '\n\t'
self.html[0].text='\n\t\t' self.html[0].text = '\n\t\t'
self.html[0].tail='\n' self.html[0].tail = '\n'
for child in self.html[0]: for child in self.html[0]:
child.tail = '\n\t\t' child.tail = '\n\t\t'
self.html[0][-1].tail = '\n\t' self.html[0][-1].tail = '\n\t'
@@ -98,17 +107,18 @@ class Convert(object):
def __call__(self): def __call__(self):
doc = self.docx.document doc = self.docx.document
relationships_by_id, relationships_by_type = self.docx.document_relationships (relationships_by_id,
relationships_by_type) = self.docx.document_relationships
self.resolve_alternate_content(doc) self.resolve_alternate_content(doc)
self.fields(doc, self.log) self.fields(doc, self.log)
self.read_styles(relationships_by_type) self.read_styles(relationships_by_type)
self.images(relationships_by_id) self.images(relationships_by_id)
self.layers = OrderedDict() self.layers = collections.OrderedDict()
self.framed = [[]] self.framed = [[]]
self.frame_map = {} self.frame_map = {}
self.framed_map = {} self.framed_map = {}
self.anchor_map = {} self.anchor_map = {}
self.link_map = defaultdict(list) self.link_map = collections.defaultdict(list)
self.link_source_map = {} self.link_source_map = {}
self.toc_anchor = None self.toc_anchor = None
self.block_runs = [] self.block_runs = []
@@ -142,7 +152,8 @@ class Convert(object):
dl = DL(id=anchor) dl = DL(id=anchor)
dl.set('class', 'footnote') dl.set('class', 'footnote')
self.body.append(dl) self.body.append(dl)
dl.append(DT('[', A('' + text, href='#back_%s' % anchor, title=text))) dl.append(DT('[', A('' + text, href='#back_%s' % anchor,
title=text)))
dl[-1][0].tail = ']' dl[-1][0].tail = ']'
dl.append(DD()) dl.append(DD())
paras = [] paras = []
@@ -159,7 +170,8 @@ class Convert(object):
self.mark_block_runs(paras) self.mark_block_runs(paras)
for p, wp in self.object_map.items(): for p, wp in self.object_map.items():
if len(p) > 0 and not p.text and len(p[0]) > 0 and not p[0].text and p[0][0].get('class', None) == 'tab': if (len(p) > 0 and not p.text and len(p[0]) > 0 and
not p[0].text and p[0][0].get('class', None) == 'tab'):
# Paragraph uses tabs for indentation, convert to text-indent # Paragraph uses tabs for indentation, convert to text-indent
parent = p[0] parent = p[0]
tabs = [] tabs = []
@@ -172,7 +184,9 @@ class Convert(object):
break break
indent = len(tabs) * self.settings.default_tab_stop indent = len(tabs) * self.settings.default_tab_stop
style = self.styles.resolve(wp) style = self.styles.resolve(wp)
if style.text_indent is inherit or (hasattr(style.text_indent, 'endswith') and style.text_indent.endswith('pt')): if (style.text_indent is inherit or
(hasattr(style.text_indent, 'endswith') and
style.text_indent.endswith('pt'))):
if style.text_indent is not inherit: if style.text_indent is not inherit:
indent = float(style.text_indent[:-2]) + indent indent = float(style.text_indent[:-2]) + indent
style.text_indent = '%.3gpt' % indent style.text_indent = '%.3gpt' % indent
@@ -197,7 +211,8 @@ class Convert(object):
except (TypeError, ValueError): except (TypeError, ValueError):
lvl = 0 lvl = 0
numbered.append((html_obj, num_id, lvl)) numbered.append((html_obj, num_id, lvl))
self.numbering.apply_markup(numbered, self.body, self.styles, self.object_map, self.images) self.numbering.apply_markup(numbered, self.body, self.styles,
self.object_map, self.images)
self.apply_frames() self.apply_frames()
if len(self.body) > 0: if len(self.body) > 0:
@@ -232,13 +247,15 @@ class Convert(object):
self.fields.polish_markup(self.object_map) self.fields.polish_markup(self.object_map)
self.log.debug('Cleaning up redundant markup generated by Word') self.log.debug('Cleaning up redundant markup generated by Word')
self.cover_image = cleanup_markup(self.log, self.html, self.styles, self.dest_dir, self.detect_cover, self.namespace.XPath) self.cover_image = cleanup_markup(self.log, self.html, self.styles,
self.dest_dir, self.detect_cover,
self.namespace.XPath)
return self.write(doc) return self.write(doc)
def read_page_properties(self, doc): def read_page_properties(self, doc):
current = [] current = []
self.page_map = OrderedDict() self.page_map = collections.OrderedDict()
self.section_starts = [] self.section_starts = []
for p in self.namespace.descendants(doc, 'w:p', 'w:tbl'): for p in self.namespace.descendants(doc, 'w:p', 'w:tbl'):
@@ -267,7 +284,8 @@ class Convert(object):
def resolve_alternate_content(self, doc): def resolve_alternate_content(self, doc):
# For proprietary extensions in Word documents use the fallback, spec
# compliant form
# See https://wiki.openoffice.org/wiki/OOXML/Markup_Compatibility_and_Extensibility  # noqa: E501
for ac in self.namespace.descendants(doc, 'mc:AlternateContent'): for ac in self.namespace.descendants(doc, 'mc:AlternateContent'):
choices = self.namespace.XPath('./mc:Choice')(ac) choices = self.namespace.XPath('./mc:Choice')(ac)
fallbacks = self.namespace.XPath('./mc:Fallback')(ac) fallbacks = self.namespace.XPath('./mc:Fallback')(ac)
@@ -284,7 +302,8 @@ class Convert(object):
cname[-1] = defname cname[-1] = defname
if self.docx.exists('/'.join(cname)): if self.docx.exists('/'.join(cname)):
name = name name = name
if name and name.startswith('word/word') and not self.docx.exists(name): if (name and name.startswith('word/word') and
not self.docx.exists(name)):
name = name.partition('/')[2] name = name.partition('/')[2]
return name return name
@@ -327,7 +346,8 @@ class Convert(object):
self.log.warn('Endnotes %s do not exist' % enname) self.log.warn('Endnotes %s do not exist' % enname)
else: else:
enrel = self.docx.get_relationships(enname) enrel = self.docx.get_relationships(enname)
footnotes(etree.fromstring(foraw) if foraw else None, forel, etree.fromstring(enraw) if enraw else None, enrel) footnotes(etree.fromstring(foraw) if foraw else None, forel,
etree.fromstring(enraw) if enraw else None, enrel)
if fname is not None: if fname is not None:
embed_relationships = self.docx.get_relationships(fname)[0] embed_relationships = self.docx.get_relationships(fname)[0]
@@ -336,7 +356,8 @@ class Convert(object):
except KeyError: except KeyError:
self.log.warn('Fonts table %s does not exist' % fname) self.log.warn('Fonts table %s does not exist' % fname)
else: else:
fonts(etree.fromstring(raw), embed_relationships, self.docx, self.dest_dir) fonts(etree.fromstring(raw), embed_relationships, self.docx,
self.dest_dir)
if tname is not None: if tname is not None:
try: try:
@@ -364,16 +385,20 @@ class Convert(object):
except KeyError: except KeyError:
self.log.warn('Numbering styles %s do not exist' % nname) self.log.warn('Numbering styles %s do not exist' % nname)
else: else:
numbering(etree.fromstring(raw), self.styles, self.docx.get_relationships(nname)[0]) numbering(etree.fromstring(raw), self.styles,
self.docx.get_relationships(nname)[0])
self.styles.resolve_numbering(numbering) self.styles.resolve_numbering(numbering)
def write(self, doc): def write(self, doc):
toc = create_toc(doc, self.body, self.resolved_link_map, self.styles, self.object_map, self.log, self.namespace) toc = create_toc(doc, self.body, self.resolved_link_map, self.styles,
raw = html.tostring(self.html, encoding='utf-8', doctype='<!DOCTYPE html>') self.object_map, self.log, self.namespace)
raw = html.tostring(self.html, encoding='utf-8',
doctype='<!DOCTYPE html>')
with open(os.path.join(self.dest_dir, 'index.html'), 'wb') as f: with open(os.path.join(self.dest_dir, 'index.html'), 'wb') as f:
f.write(raw) f.write(raw)
css = self.styles.generate_css(self.dest_dir, self.docx, self.notes_nopb, self.nosupsub) css = self.styles.generate_css(self.dest_dir, self.docx,
self.notes_nopb, self.nosupsub)
if css: if css:
with open(os.path.join(self.dest_dir, 'docx.css'), 'wb') as f: with open(os.path.join(self.dest_dir, 'docx.css'), 'wb') as f:
f.write(css.encode('utf-8')) f.write(css.encode('utf-8'))
@@ -394,23 +419,29 @@ class Convert(object):
title='Table of Contents', title='Table of Contents',
type='toc')) type='toc'))
toc_file = os.path.join(self.dest_dir, 'toc.ncx') toc_file = os.path.join(self.dest_dir, 'toc.ncx')
with open(os.path.join(self.dest_dir, 'metadata.opf'), 'wb') as of, open(toc_file, 'wb') as ncx: with open(os.path.join(self.dest_dir,
'metadata.opf'), 'wb') as of, open(toc_file,
'wb') as ncx:
opf.render(of, ncx, 'toc.ncx', process_guide=process_guide) opf.render(of, ncx, 'toc.ncx', process_guide=process_guide)
if os.path.getsize(toc_file) == 0: if os.path.getsize(toc_file) == 0:
os.remove(toc_file) os.remove(toc_file)
return os.path.join(self.dest_dir, 'metadata.opf') return os.path.join(self.dest_dir, 'metadata.opf')
def read_block_anchors(self, doc): def read_block_anchors(self, doc):
doc_anchors = frozenset(self.namespace.XPath('./w:body/w:bookmarkStart[@w:name]')(doc)) doc_anchors = frozenset(self.namespace.XPath('./w:body/w:bookmarkStart'
'[@w:name]')(doc))
if doc_anchors: if doc_anchors:
current_bm = set() current_bm = set()
rmap = {v:k for k, v in self.object_map.items()} rmap = {v: k for k, v in self.object_map.items()}
for p in self.namespace.descendants(doc, 'w:p', 'w:bookmarkStart[@w:name]'): for p in self.namespace.descendants(doc, 'w:p',
'w:bookmarkStart[@w:name]'):
if p.tag.endswith('}p'): if p.tag.endswith('}p'):
if current_bm and p in rmap: if current_bm and p in rmap:
para = rmap[p] para = rmap[p]
if 'id' not in para.attrib: if 'id' not in para.attrib:
para.set('id', generate_anchor(next(iter(current_bm)), frozenset(self.anchor_map.values()))) _bm = next(iter(current_bm))
_am = frozenset(self.anchor_map.values())
para.set('id', generate_anchor(_bm, _am))
for name in current_bm: for name in current_bm:
self.anchor_map[name] = para.get('id') self.anchor_map[name] = para.get('id')
current_bm = set() current_bm = set()
@@ -442,13 +473,15 @@ class Convert(object):
except AttributeError: except AttributeError:
break break
for x in self.namespace.descendants(p, 'w:r', 'w:bookmarkStart', 'w:hyperlink', 'w:instrText'): for x in self.namespace.descendants(p, 'w:r', 'w:bookmarkStart',
'w:hyperlink', 'w:instrText'):
if p_parent(x) is not p: if p_parent(x) is not p:
continue continue
if x.tag.endswith('}r'): if x.tag.endswith('}r'):
span = self.convert_run(x) span = self.convert_run(x)
if current_anchor is not None: if current_anchor is not None:
(dest if len(dest) == 0 else span).set('id', current_anchor) (dest if len(dest) == 0 else span).set('id',
current_anchor)
current_anchor = None current_anchor = None
if current_hyperlink is not None: if current_hyperlink is not None:
try: try:
@@ -462,11 +495,14 @@ class Convert(object):
self.layers[p].append(x) self.layers[p].append(x)
elif x.tag.endswith('}bookmarkStart'): elif x.tag.endswith('}bookmarkStart'):
anchor = self.namespace.get(x, 'w:name') anchor = self.namespace.get(x, 'w:name')
if anchor and anchor not in self.anchor_map and anchor != '_GoBack': if (anchor and anchor not in self.anchor_map and
anchor != '_GoBack'):
# _GoBack is a special bookmark inserted by Word 2010 for # _GoBack is a special bookmark inserted by Word 2010 for
# the return to previous edit feature, we ignore it # the return to previous edit feature, we ignore it
old_anchor = current_anchor old_anchor = current_anchor
# Build the id from the anchors recorded so far, then register it under
# this bookmark's name. The set of existing ids is taken *before* the new
# entry is added (presumably so generate_anchor can avoid collisions --
# confirm against ebooks.docx.names).
current_anchor = generate_anchor(
    anchor, frozenset(self.anchor_map.values()))
self.anchor_map[anchor] = current_anchor
if old_anchor is not None: if old_anchor is not None:
# The previous anchor was not applied to any element # The previous anchor was not applied to any element
for a, t in tuple(self.anchor_map.items()): for a, t in tuple(self.anchor_map.items()):
@@ -474,10 +510,13 @@ class Convert(object):
self.anchor_map[a] = current_anchor self.anchor_map[a] = current_anchor
elif x.tag.endswith('}hyperlink'): elif x.tag.endswith('}hyperlink'):
current_hyperlink = x current_hyperlink = x
elif x.tag.endswith('}instrText') and x.text and x.text.strip().startswith('TOC '): elif (x.tag.endswith('}instrText') and x.text and
x.text.strip().startswith('TOC ')):
old_anchor = current_anchor old_anchor = current_anchor
anchor = str(uuid.uuid4()) anchor = str(uuid.uuid4())
self.anchor_map[anchor] = current_anchor = generate_anchor('toc', frozenset(self.anchor_map.values())) current_anchor = generate_anchor(
'toc', frozenset(self.anchor_map.values()))
self.anchor_map[anchor] = current_anchor
self.toc_anchor = current_anchor self.toc_anchor = current_anchor
if old_anchor is not None: if old_anchor is not None:
# The previous anchor was not applied to any element # The previous anchor was not applied to any element
@@ -489,7 +528,8 @@ class Convert(object):
dest.set('id', current_anchor) dest.set('id', current_anchor)
current_anchor = None current_anchor = None
m = re.match(r'heading\s+(\d+)$', style.style_name or '', re.IGNORECASE) m = re.match(r'heading\s+(\d+)$', style.style_name or '',
re.IGNORECASE)
if m is not None: if m is not None:
n = min(6, max(1, int(m.group(1)))) n = min(6, max(1, int(m.group(1))))
dest.tag = 'h%d' % n dest.tag = 'h%d' % n
@@ -533,7 +573,8 @@ class Convert(object):
if len(dest) > 0 and not dest[-1].tail: if len(dest) > 0 and not dest[-1].tail:
if dest[-1].tag == 'br': if dest[-1].tag == 'br':
dest[-1].tail = NBSP dest[-1].tail = NBSP
elif len(dest[-1]) > 0 and dest[-1][-1].tag == 'br' and not dest[-1][-1].tail: elif (len(dest[-1]) > 0 and dest[-1][-1].tag == 'br' and
not dest[-1][-1].tail):
dest[-1][-1].tail = NBSP dest[-1][-1].tail = NBSP
return dest return dest
@@ -578,12 +619,12 @@ class Convert(object):
if anchor and anchor in self.anchor_map: if anchor and anchor in self.anchor_map:
span.set('href', '#' + self.anchor_map[anchor]) span.set('href', '#' + self.anchor_map[anchor])
continue continue
self.log.warn('Hyperlink with unknown target (rid=%s, anchor=%s), ignoring' % self.log.warn('Hyperlink with unknown target (rid=%s, anchor=%s), '
(rid, anchor)) 'ignoring' % (rid, anchor))
# hrefs that point nowhere give epubcheck a hernia. The element # hrefs that point nowhere give epubcheck a hernia. The element
# should be styled explicitly by Word anyway. # should be styled explicitly by Word anyway.
# span.set('href', '#') # span.set('href', '#')
rmap = {v:k for k, v in self.object_map.items()} rmap = {v: k for k, v in self.object_map.items()}
for hyperlink, runs in self.fields.hyperlink_fields: for hyperlink, runs in self.fields.hyperlink_fields:
spans = [rmap[r] for r in runs if r in rmap] spans = [rmap[r] for r in runs if r in rmap]
if not spans: if not spans:
@@ -604,7 +645,8 @@ class Convert(object):
if anchor in self.anchor_map: if anchor in self.anchor_map:
span.set('href', '#' + self.anchor_map[anchor]) span.set('href', '#' + self.anchor_map[anchor])
continue continue
self.log.warn('Hyperlink field with unknown anchor: %s' % anchor) self.log.warn('Hyperlink field with unknown anchor: %s' %
anchor)
else: else:
if url in self.anchor_map: if url in self.anchor_map:
span.set('href', '#' + self.anchor_map[url]) span.set('href', '#' + self.anchor_map[url])
@@ -652,7 +694,8 @@ class Convert(object):
# actually needs it, i.e. if it has more than one # actually needs it, i.e. if it has more than one
# consecutive space or it has newlines or tabs. # consecutive space or it has newlines or tabs.
multi_spaces = self.ms_pat.search(ctext) is not None multi_spaces = self.ms_pat.search(ctext) is not None
preserve = multi_spaces or self.ws_pat.search(ctext) is not None preserve = (multi_spaces or
self.ws_pat.search(ctext) is not None)
if preserve: if preserve:
text.add_elem(SPAN(ctext, style="white-space:pre-wrap")) text.add_elem(SPAN(ctext, style="white-space:pre-wrap"))
ans.append(text.elem) ans.append(text.elem)
@@ -668,24 +711,30 @@ class Convert(object):
else: else:
clear = child.get('clear', None) clear = child.get('clear', None)
if clear in {'all', 'left', 'right'}: if clear in {'all', 'left', 'right'}:
br = BR(style='clear:%s'%('both' if clear == 'all' else clear)) br = BR(style='clear:%s' % ('both' if clear == 'all'
else clear))
else: else:
br = BR() br = BR()
text.add_elem(br) text.add_elem(br)
ans.append(text.elem) ans.append(text.elem)
elif self.namespace.is_tag(child, 'w:drawing') or self.namespace.is_tag(child, 'w:pict'): elif (self.namespace.is_tag(child, 'w:drawing') or
for img in self.images.to_html(child, self.current_page, self.docx, self.dest_dir): self.namespace.is_tag(child, 'w:pict')):
for img in self.images.to_html(child, self.current_page,
self.docx, self.dest_dir):
text.add_elem(img) text.add_elem(img)
ans.append(text.elem) ans.append(text.elem)
elif self.namespace.is_tag(child, 'w:footnoteReference') or self.namespace.is_tag(child, 'w:endnoteReference'): elif (self.namespace.is_tag(child, 'w:footnoteReference') or
self.namespace.is_tag(child, 'w:endnoteReference')):
anchor, name = self.footnotes.get_ref(child) anchor, name = self.footnotes.get_ref(child)
if anchor and name: if anchor and name:
l = A(name, id='back_%s' % anchor, href='#' + anchor, title=name) _l = A(name, id='back_%s' % anchor, href='#' + anchor,
l.set('class', 'noteref') title=name)
text.add_elem(l) _l.set('class', 'noteref')
text.add_elem(_l)
ans.append(text.elem) ans.append(text.elem)
elif self.namespace.is_tag(child, 'w:tab'): elif self.namespace.is_tag(child, 'w:tab'):
spaces = int(math.ceil((self.settings.default_tab_stop / 36) * 6)) spaces = int(math.ceil((self.settings.default_tab_stop / 36) *
6))
text.add_elem(SPAN(NBSP * spaces)) text.add_elem(SPAN(NBSP * spaces))
ans.append(text.elem) ans.append(text.elem)
ans[-1].set('class', 'tab') ans[-1].set('class', 'tab')
@@ -699,7 +748,8 @@ class Convert(object):
style = self.styles.resolve_run(run) style = self.styles.resolve_run(run)
if style.vert_align in {'superscript', 'subscript'}: if style.vert_align in {'superscript', 'subscript'}:
if ans.text or len(ans): if ans.text or len(ans):
ans.set('data-docx-vert', 'sup' if style.vert_align == 'superscript' else 'sub') ans.set('data-docx-vert',
'sup' if style.vert_align == 'superscript' else 'sub')
if style.lang is not inherit: if style.lang is not inherit:
lang = html_lang(style.lang) lang = html_lang(style.lang)
if lang is not None and lang != self.doc_lang: if lang is not None and lang != self.doc_lang:
@@ -738,12 +788,14 @@ class Convert(object):
idx = parent.index(paras[0]) idx = parent.index(paras[0])
frame = DIV(*paras) frame = DIV(*paras)
parent.insert(idx, frame) parent.insert(idx, frame)
self.framed_map[frame] = css = style.css(self.page_map[self.object_map[paras[0]]]) self.framed_map[frame] = css = style.css(
self.page_map[self.object_map[paras[0]]])
self.styles.register(css, 'frame') self.styles.register(css, 'frame')
if not self.block_runs: if not self.block_runs:
return return
rmap = {v:k for k, v in self.object_map.items()}
rmap = {v: k for k, v in self.object_map.items()}
for border_style, blocks in self.block_runs: for border_style, blocks in self.block_runs:
paras = tuple(rmap[p] for p in blocks) paras = tuple(rmap[p] for p in blocks)
for p in paras: for p in paras:
@@ -796,17 +848,20 @@ class Convert(object):
else: else:
border_style = style.clone_border_styles() border_style = style.clone_border_styles()
if has_visible_border:
    # Hand the paragraph's top margin over to the border wrapper *before*
    # resetting it on the paragraph. Doing the reset first would copy
    # `inherit` into border_style and silently drop the real margin.
    border_style.margin_top = style.margin_top
    style.margin_top = inherit
if p is not run[-1]: if p is not run[-1]:
style.padding_bottom = 0 style.padding_bottom = 0
else: else:
if has_visible_border:
    # Hand the paragraph's bottom margin over to the border wrapper
    # *before* resetting it on the paragraph. Doing the reset first would
    # copy `inherit` into border_style and silently drop the real margin.
    border_style.margin_bottom = style.margin_bottom
    style.margin_bottom = inherit
style.clear_borders() style.clear_borders()
if p is not run[-1]: if p is not run[-1]:
style.apply_between_border() style.apply_between_border()
if has_visible_border: if has_visible_border:
border_style.margin_left, border_style.margin_right = max_left,max_right border_style.margin_left = max_left
border_style.margin_right = max_right
self.block_runs.append((border_style, run)) self.block_runs.append((border_style, run))
run = [] run = []

View File

@@ -1,5 +1,6 @@
import mimetypes import mimetypes
import textwrap, os import os
import textwrap
from lxml import etree from lxml import etree
from lxml.builder import ElementMaker from lxml.builder import ElementMaker
@@ -9,22 +10,48 @@ from ebook_converter.ebooks.docx.names import DOCXNamespace
from ebook_converter.ebooks.metadata import authors_to_string from ebook_converter.ebooks.metadata import authors_to_string
from ebook_converter.ebooks.pdf.render.common import PAPER_SIZES from ebook_converter.ebooks.pdf.render.common import PAPER_SIZES
from ebook_converter.utils.date import utcnow from ebook_converter.utils.date import utcnow
from ebook_converter.utils.localization import canonicalize_lang, lang_as_iso639_1 from ebook_converter.utils.localization import canonicalize_lang
from ebook_converter.utils.localization import lang_as_iso639_1
from ebook_converter.utils.zipfile import ZipFile from ebook_converter.utils.zipfile import ZipFile
WORD_TYPES = {"/word/footnotes.xml": "application/vnd.openxmlformats-"
"officedocument.wordprocessingml.footnotes+xml",
"/word/document.xml": "application/vnd.openxmlformats-"
"officedocument.wordprocessingml.document.main+xml",
"/word/numbering.xml": "application/vnd.openxmlformats-"
"officedocument.wordprocessingml.numbering+xml",
"/word/styles.xml": "application/vnd.openxmlformats-"
"officedocument.wordprocessingml.styles+xml",
"/word/endnotes.xml": "application/vnd.openxmlformats-"
"officedocument.wordprocessingml.endnotes+xml",
"/word/settings.xml": "application/vnd.openxmlformats-"
"officedocument.wordprocessingml.settings+xml",
"/word/theme/theme1.xml": "application/vnd.openxmlformats-"
"officedocument.theme+xml",
"/word/fontTable.xml": "application/vnd.openxmlformats-"
"officedocument.wordprocessingml.fontTable+xml",
"/word/webSettings.xml": "application/vnd.openxmlformats-"
"officedocument.wordprocessingml.webSettings+xml",
"/docProps/core.xml": "application/vnd.openxmlformats-package."
"core-properties+xml",
"/docProps/app.xml": "application/vnd.openxmlformats-"
"officedocument.extended-properties+xml"}
def xml2str(root, pretty_print=False, with_tail=False): def xml2str(root, pretty_print=False, with_tail=False):
if hasattr(etree, 'cleanup_namespaces'): if hasattr(etree, 'cleanup_namespaces'):
etree.cleanup_namespaces(root) etree.cleanup_namespaces(root)
ans = etree.tostring(root, encoding='utf-8', xml_declaration=True, ans = etree.tostring(root, encoding='utf-8', xml_declaration=True,
pretty_print=pretty_print, with_tail=with_tail) pretty_print=pretty_print, with_tail=with_tail)
return ans return ans
def page_size(opts): def page_size(opts):
width, height = PAPER_SIZES[opts.docx_page_size] width, height = PAPER_SIZES[opts.docx_page_size]
if opts.docx_custom_page_size is not None: if opts.docx_custom_page_size is not None:
width, height = map(float, opts.docx_custom_page_size.partition('x')[0::2]) width, height = map(float,
opts.docx_custom_page_size.partition('x')[0::2])
return width, height return width, height
@@ -47,7 +74,9 @@ def create_skeleton(opts, namespaces=None):
def w(x): def w(x):
return '{%s}%s' % (namespaces['w'], x) return '{%s}%s' % (namespaces['w'], x)
dn = {k:v for k, v in namespaces.items() if k in {'w', 'r', 'm', 've', 'o', 'wp', 'w10', 'wne', 'a', 'pic'}} dn = {k: v for k, v in namespaces.items() if k in {'w', 'r', 'm', 've',
'o', 'wp', 'w10', 'wne',
'a', 'pic'}}
E = ElementMaker(namespace=dn['w'], nsmap=dn) E = ElementMaker(namespace=dn['w'], nsmap=dn)
doc = E.document() doc = E.document()
body = E.body() body = E.body()
@@ -59,27 +88,32 @@ def create_skeleton(opts, namespaces=None):
val = page_margin(opts, which) val = page_margin(opts, which)
return w(which), str(int(val * 20)) return w(which), str(int(val * 20))
body.append(E.sectPr( body.append(E.sectPr(
E.pgSz(**{w('w'):str(width), w('h'):str(height)}), E.pgSz(**{w('w'): str(width), w('h'): str(height)}),
E.pgMar(**dict(map(margin, 'left top right bottom'.split()))), E.pgMar(**dict(map(margin, 'left top right bottom'.split()))),
E.cols(**{w('space'):'720'}), E.cols(**{w('space'): '720'}),
E.docGrid(**{w('linePitch'):"360"}), E.docGrid(**{w('linePitch'): "360"}),
)) ))
dn = {k:v for k, v in namespaces.items() if k in tuple('wra') + ('wp',)} dn = {k: v for k, v in namespaces.items() if k in tuple('wra') + ('wp',)}
E = ElementMaker(namespace=dn['w'], nsmap=dn) E = ElementMaker(namespace=dn['w'], nsmap=dn)
styles = E.styles( styles = E.styles(
E.docDefaults( E.docDefaults(
E.rPrDefault( E.rPrDefault(
E.rPr( E.rPr(
E.rFonts(**{w('asciiTheme'):"minorHAnsi", w('eastAsiaTheme'):"minorEastAsia", w('hAnsiTheme'):"minorHAnsi", w('cstheme'):"minorBidi"}), E.rFonts(**{w('asciiTheme'): "minorHAnsi",
E.sz(**{w('val'):'22'}), w('eastAsiaTheme'): "minorEastAsia",
E.szCs(**{w('val'):'22'}), w('hAnsiTheme'): "minorHAnsi",
E.lang(**{w('val'):'en-US', w('eastAsia'):"en-US", w('bidi'):"ar-SA"}) w('cstheme'): "minorBidi"}),
E.sz(**{w('val'): '22'}),
E.szCs(**{w('val'): '22'}),
E.lang(**{w('val'): 'en-US', w('eastAsia'): "en-US",
w('bidi'): "ar-SA"})
) )
), ),
E.pPrDefault( E.pPrDefault(
E.pPr( E.pPr(
E.spacing(**{w('after'):"0", w('line'):"276", w('lineRule'):"auto"}) E.spacing(**{w('after'): "0", w('line'): "276",
w('lineRule'): "auto"})
) )
) )
) )
@@ -103,8 +137,8 @@ def update_doc_props(root, mi, namespace):
if mi.comments: if mi.comments:
setm('description', mi.comments) setm('description', mi.comments)
if mi.languages: if mi.languages:
l = canonicalize_lang(mi.languages[0]) _l = canonicalize_lang(mi.languages[0])
setm('language', lang_as_iso639_1(l) or l) setm('language', lang_as_iso639_1(_l) or _l)
class DocumentRelationships(object): class DocumentRelationships(object):
@@ -115,8 +149,7 @@ class DocumentRelationships(object):
for typ, target in {namespace.names['STYLES']: 'styles.xml', for typ, target in {namespace.names['STYLES']: 'styles.xml',
namespace.names['NUMBERING']: 'numbering.xml', namespace.names['NUMBERING']: 'numbering.xml',
namespace.names['WEB_SETTINGS']: 'webSettings.xml', namespace.names['WEB_SETTINGS']: 'webSettings.xml',
namespace.names['FONTS']: 'fontTable.xml', namespace.names['FONTS']: 'fontTable.xml'}.items():
}.items():
self.add_relationship(target, typ) self.add_relationship(target, typ)
def get_relationship_id(self, target, rtype, target_mode=None): def get_relationship_id(self, target, rtype, target_mode=None):
@@ -134,7 +167,8 @@ class DocumentRelationships(object):
def serialize(self): def serialize(self):
namespaces = self.namespace.namespaces namespaces = self.namespace.namespaces
E = ElementMaker(namespace=namespaces['pr'], nsmap={None:namespaces['pr']}) E = ElementMaker(namespace=namespaces['pr'],
nsmap={None: namespaces['pr']})
relationships = E.Relationships() relationships = E.Relationships()
for (target, rtype, target_mode), rid in self.rmap.items(): for (target, rtype, target_mode), rid in self.rmap.items():
r = E.Relationship(Id=rid, Type=rtype, Target=target) r = E.Relationship(Id=rid, Type=rtype, Target=target)
@@ -151,9 +185,12 @@ class DOCX(object):
namespaces = self.namespace.namespaces namespaces = self.namespace.namespaces
self.opts, self.log = opts, log self.opts, self.log = opts, log
self.document_relationships = DocumentRelationships(self.namespace) self.document_relationships = DocumentRelationships(self.namespace)
self.font_table = etree.Element('{%s}fonts' % namespaces['w'], nsmap={k:namespaces[k] for k in 'wr'}) self.font_table = etree.Element('{%s}fonts' % namespaces['w'],
self.numbering = etree.Element('{%s}numbering' % namespaces['w'], nsmap={k:namespaces[k] for k in 'wr'}) nsmap={k: namespaces[k] for k in 'wr'})
E = ElementMaker(namespace=namespaces['pr'], nsmap={None:namespaces['pr']}) self.numbering = etree.Element('{%s}numbering' % namespaces['w'],
nsmap={k: namespaces[k] for k in 'wr'})
E = ElementMaker(namespace=namespaces['pr'],
nsmap={None: namespaces['pr']})
self.embedded_fonts = E.Relationships() self.embedded_fonts = E.Relationships()
self.fonts = {} self.fonts = {}
self.images = {} self.images = {}
@@ -161,21 +198,10 @@ class DOCX(object):
# Boilerplate {{{ # Boilerplate {{{
@property @property
def contenttypes(self): def contenttypes(self):
E = ElementMaker(namespace=self.namespace.namespaces['ct'], nsmap={None:self.namespace.namespaces['ct']}) E = ElementMaker(namespace=self.namespace.namespaces['ct'],
nsmap={None: self.namespace.namespaces['ct']})
types = E.Types() types = E.Types()
for partname, mt in { for partname, mt in WORD_TYPES.items():
"/word/footnotes.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.footnotes+xml",
"/word/document.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml",
"/word/numbering.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.numbering+xml",
"/word/styles.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.styles+xml",
"/word/endnotes.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.endnotes+xml",
"/word/settings.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.settings+xml",
"/word/theme/theme1.xml": "application/vnd.openxmlformats-officedocument.theme+xml",
"/word/fontTable.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.fontTable+xml",
"/word/webSettings.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.webSettings+xml",
"/docProps/core.xml": "application/vnd.openxmlformats-package.core-properties+xml",
"/docProps/app.xml": "application/vnd.openxmlformats-officedocument.extended-properties+xml",
}.items():
types.append(E.Override(PartName=partname, ContentType=mt)) types.append(E.Override(PartName=partname, ContentType=mt))
added = {'png', 'gif', 'jpeg', 'jpg', 'svg', 'xml'} added = {'png', 'gif', 'jpeg', 'jpg', 'svg', 'xml'}
for ext in added: for ext in added:
@@ -199,7 +225,8 @@ class DOCX(object):
@property @property
def appproperties(self): def appproperties(self):
E = ElementMaker(namespace=self.namespace.namespaces['ep'], nsmap={None:self.namespace.namespaces['ep']}) E = ElementMaker(namespace=self.namespace.namespaces['ep'],
nsmap={None: self.namespace.namespaces['ep']})
props = E.Properties( props = E.Properties(
E.Application(__appname__), E.Application(__appname__),
E.AppVersion('%02d.%04d' % numeric_version[:2]), E.AppVersion('%02d.%04d' % numeric_version[:2]),
@@ -216,16 +243,17 @@ class DOCX(object):
@property @property
def containerrels(self): def containerrels(self):
return textwrap.dedent('''\ return textwrap.dedent('''\
<?xml version='1.0' encoding='utf-8'?> <?xml version='1.0' encoding='utf-8'?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships"> <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId3" Type="{APPPROPS}" Target="docProps/app.xml"/> <Relationship Id="rId3" Type="{APPPROPS}" Target="docProps/app.xml"/>
<Relationship Id="rId2" Type="{DOCPROPS}" Target="docProps/core.xml"/> <Relationship Id="rId2" Type="{DOCPROPS}" Target="docProps/core.xml"/>
<Relationship Id="rId1" Type="{DOCUMENT}" Target="word/document.xml"/> <Relationship Id="rId1" Type="{DOCUMENT}" Target="word/document.xml"/>
</Relationships>'''.format(**self.namespace.names)).encode('utf-8') </Relationships>'''.format(**self.namespace.names)).encode('utf-8') # noqa
@property @property
def websettings(self): def websettings(self):
E = ElementMaker(namespace=self.namespace.namespaces['w'], nsmap={'w':self.namespace.namespaces['w']}) E = ElementMaker(namespace=self.namespace.namespaces['w'],
nsmap={'w': self.namespace.namespaces['w']})
ws = E.webSettings( ws = E.webSettings(
E.optimizeForBrowser, E.allowPNG, E.doNotSaveAsSingleFile) E.optimizeForBrowser, E.allowPNG, E.doNotSaveAsSingleFile)
return xml2str(ws) return xml2str(ws)
@@ -234,11 +262,15 @@ class DOCX(object):
def convert_metadata(self, mi): def convert_metadata(self, mi):
namespaces = self.namespace.namespaces namespaces = self.namespace.namespaces
E = ElementMaker(namespace=namespaces['cp'], nsmap={x:namespaces[x] for x in 'cp dc dcterms xsi'.split()}) E = ElementMaker(namespace=namespaces['cp'],
nsmap={x: namespaces[x]
for x in 'cp dc dcterms xsi'.split()})
cp = E.coreProperties(E.revision("1"), E.lastModifiedBy('calibre')) cp = E.coreProperties(E.revision("1"), E.lastModifiedBy('calibre'))
ts = utcnow().isoformat('T').rpartition('.')[0] + 'Z' ts = utcnow().isoformat('T').rpartition('.')[0] + 'Z'
for x in 'created modified'.split(): for x in 'created modified'.split():
x = cp.makeelement('{%s}%s' % (namespaces['dcterms'], x), **{'{%s}type' % namespaces['xsi']:'dcterms:W3CDTF'}) x = cp.makeelement('{%s}%s' % (namespaces['dcterms'], x),
**{'{%s}type' %
namespaces['xsi']: 'dcterms:W3CDTF'})
x.text = ts x.text = ts
cp.append(x) cp.append(x)
self.mi = mi self.mi = mi
@@ -261,8 +293,10 @@ class DOCX(object):
zf.writestr('word/styles.xml', xml2str(self.styles)) zf.writestr('word/styles.xml', xml2str(self.styles))
zf.writestr('word/numbering.xml', xml2str(self.numbering)) zf.writestr('word/numbering.xml', xml2str(self.numbering))
zf.writestr('word/fontTable.xml', xml2str(self.font_table)) zf.writestr('word/fontTable.xml', xml2str(self.font_table))
zf.writestr('word/_rels/document.xml.rels', self.document_relationships.serialize()) zf.writestr('word/_rels/document.xml.rels',
zf.writestr('word/_rels/fontTable.xml.rels', xml2str(self.embedded_fonts)) self.document_relationships.serialize())
zf.writestr('word/_rels/fontTable.xml.rels',
xml2str(self.embedded_fonts))
for fname, data_getter in self.images.items(): for fname, data_getter in self.images.items():
zf.writestr(fname, data_getter()) zf.writestr(fname, data_getter())
for fname, data in self.fonts.items(): for fname, data in self.fonts.items():

View File

@@ -18,7 +18,7 @@ try:
_author_pat = re.compile(tweaks['authors_split_regex']) _author_pat = re.compile(tweaks['authors_split_regex'])
except Exception: except Exception:
prints('Author split regexp:', tweaks['authors_split_regex'], prints('Author split regexp:', tweaks['authors_split_regex'],
'is invalid, using default') 'is invalid, using default')
_author_pat = re.compile(r'(?i),?\s+(and|with)\s+') _author_pat = re.compile(r'(?i),?\s+(and|with)\s+')
@@ -76,7 +76,8 @@ def author_to_author_sort(author, method=None):
if method == 'copy': if method == 'copy':
return author return author
prefixes = {force_unicode(y).lower() for y in tweaks['author_name_prefixes']} prefixes = {force_unicode(y).lower()
for y in tweaks['author_name_prefixes']}
prefixes |= {y+'.' for y in prefixes} prefixes |= {y+'.' for y in prefixes}
while True: while True:
if not tokens: if not tokens:
@@ -87,7 +88,8 @@ def author_to_author_sort(author, method=None):
else: else:
break break
suffixes = {force_unicode(y).lower() for y in tweaks['author_name_suffixes']} suffixes = {force_unicode(y).lower()
for y in tweaks['author_name_suffixes']}
suffixes |= {y+'.' for y in suffixes} suffixes |= {y+'.' for y in suffixes}
suffix = '' suffix = ''
@@ -144,7 +146,7 @@ def get_title_sort_pat(lang=None):
except: except:
ans = frozenset((r'A\s+', r'The\s+', r'An\s+')) ans = frozenset((r'A\s+', r'The\s+', r'An\s+'))
ans = '|'.join(ans) ans = '|'.join(ans)
ans = '^(%s)'%ans ans = '^(%s)' % ans
try: try:
ans = re.compile(ans, re.IGNORECASE) ans = re.compile(ans, re.IGNORECASE)
except: except:
@@ -154,7 +156,7 @@ def get_title_sort_pat(lang=None):
_ignore_starts = '\'"'+''.join(chr(x) for x in _ignore_starts = '\'"'+''.join(chr(x) for x in
list(range(0x2018, 0x201e))+[0x2032, 0x2033]) list(range(0x2018, 0x201e))+[0x2032, 0x2033])
def title_sort(title, order=None, lang=None): def title_sort(title, order=None, lang=None):

View File

@@ -12,8 +12,7 @@ from lxml import etree
from ebook_converter.utils.date import parse_only_date from ebook_converter.utils.date import parse_only_date
from ebook_converter.utils.img import save_cover_data_to from ebook_converter.utils.img import save_cover_data_to
from ebook_converter.utils.imghdr import identify from ebook_converter.utils.imghdr import identify
from ebook_converter import guess_type, guess_all_extensions, prints, \ from ebook_converter import guess_all_extensions, prints, force_unicode
force_unicode
from ebook_converter.ebooks.metadata import MetaInformation, check_isbn from ebook_converter.ebooks.metadata import MetaInformation, check_isbn
from ebook_converter.ebooks.chardet import xml_to_unicode from ebook_converter.ebooks.chardet import xml_to_unicode
from ebook_converter.polyglot.binary import as_base64_unicode from ebook_converter.polyglot.binary import as_base64_unicode

View File

@@ -10,11 +10,11 @@ import mimetypes
import os import os
import re import re
import sys import sys
import textwrap
import traceback
import unittest import unittest
import urllib.parse import urllib.parse
import uuid import uuid
import traceback
import textwrap
from lxml import etree from lxml import etree
from lxml.builder import ElementMaker from lxml.builder import ElementMaker
@@ -32,7 +32,7 @@ from ebook_converter.ebooks.metadata import string_to_authors, \
from ebook_converter.ebooks.metadata.book.base import Metadata from ebook_converter.ebooks.metadata.book.base import Metadata
from ebook_converter.utils.date import parse_date, isoformat from ebook_converter.utils.date import parse_date, isoformat
from ebook_converter.utils.localization import get_lang, canonicalize_lang from ebook_converter.utils.localization import get_lang, canonicalize_lang
from ebook_converter import prints, guess_type from ebook_converter import prints
from ebook_converter.utils.cleantext import clean_ascii_chars, clean_xml_chars from ebook_converter.utils.cleantext import clean_ascii_chars, clean_xml_chars
from ebook_converter.utils.config import tweaks from ebook_converter.utils.config import tweaks
from ebook_converter.polyglot.urllib import unquote from ebook_converter.polyglot.urllib import unquote
@@ -1807,8 +1807,7 @@ def test_m2o():
class OPFTest(unittest.TestCase): class OPFTest(unittest.TestCase):
def setUp(self): def setUp(self):
self.stream = io.BytesIO( self.stream = io.BytesIO(b'''\
b'''\
<?xml version="1.0" encoding="UTF-8"?> <?xml version="1.0" encoding="UTF-8"?>
<package version="2.0" xmlns="http://www.idpf.org/2007/opf" > <package version="2.0" xmlns="http://www.idpf.org/2007/opf" >
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf"> <metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">
@@ -1827,8 +1826,7 @@ b'''\
<item id="1" href="a%20%7E%20b" media-type="text/txt" /> <item id="1" href="a%20%7E%20b" media-type="text/txt" />
</manifest> </manifest>
</package> </package>
''' ''')
)
self.opf = OPF(self.stream, os.getcwd()) self.opf = OPF(self.stream, os.getcwd())
def testReading(self, opf=None): def testReading(self, opf=None):

View File

@@ -1,10 +1,15 @@
import shutil, os, re, struct, textwrap, io import io
import logging import logging
import mimetypes import mimetypes
import os
import re
import shutil
import struct
import textwrap
from lxml import html, etree from lxml import html, etree
from ebook_converter import xml_entity_to_unicode, entity_to_unicode, guess_type from ebook_converter import xml_entity_to_unicode, entity_to_unicode
from ebook_converter.utils.cleantext import clean_ascii_chars, clean_xml_chars from ebook_converter.utils.cleantext import clean_ascii_chars, clean_xml_chars
from ebook_converter.ebooks import DRMError, unit_convert from ebook_converter.ebooks import DRMError, unit_convert
from ebook_converter.ebooks.chardet import strip_encoding_declarations from ebook_converter.ebooks.chardet import strip_encoding_declarations
@@ -15,15 +20,11 @@ from ebook_converter.ebooks.metadata import MetaInformation
from ebook_converter.ebooks.metadata.opf2 import OPFCreator, OPF from ebook_converter.ebooks.metadata.opf2 import OPFCreator, OPF
from ebook_converter.ebooks.metadata.toc import TOC from ebook_converter.ebooks.metadata.toc import TOC
from ebook_converter.ebooks.mobi.reader.headers import BookHeader from ebook_converter.ebooks.mobi.reader.headers import BookHeader
from ebook_converter.utils.img import save_cover_data_to, gif_data_to_png_data, AnimatedGIF from ebook_converter.utils.img import save_cover_data_to, gif_data_to_png_data
from ebook_converter.utils.img import AnimatedGIF
from ebook_converter.utils.imghdr import what from ebook_converter.utils.imghdr import what
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
class TopazError(ValueError): class TopazError(ValueError):
pass pass
@@ -38,13 +39,14 @@ class KFXError(ValueError):
class MobiReader(object): class MobiReader(object):
PAGE_BREAK_PAT = re.compile( PAGE_BREAK_PAT = re.compile(r'<\s*/{0,1}\s*mbp:pagebreak((?:\s+[^/>]*)'
r'<\s*/{0,1}\s*mbp:pagebreak((?:\s+[^/>]*){0,1})/{0,1}\s*>\s*(?:<\s*/{0,1}\s*mbp:pagebreak\s*/{0,1}\s*>)*', r'{0,1})/{0,1}\s*>\s*(?:<\s*/{0,1}'
re.IGNORECASE) r'\s*mbp:pagebreak\s*/{0,1}\s*>)*',
re.IGNORECASE)
IMAGE_ATTRS = ('lowrecindex', 'recindex', 'hirecindex') IMAGE_ATTRS = ('lowrecindex', 'recindex', 'hirecindex')
def __init__(self, filename_or_stream, log, user_encoding=None, debug=None, def __init__(self, filename_or_stream, log, user_encoding=None, debug=None,
try_extra_data_fix=False): try_extra_data_fix=False):
self.log = log self.log = log
self.debug = debug self.debug = debug
self.embedded_mi = None self.embedded_mi = None
@@ -83,8 +85,8 @@ class MobiReader(object):
if raw.startswith(b'\xeaDRMION\xee'): if raw.startswith(b'\xeaDRMION\xee'):
raise KFXError() raise KFXError()
self.header = raw[0:72] self.header = raw[0:72]
self.name = self.header[:32].replace(b'\x00', b'') self.name = self.header[:32].replace(b'\x00', b'')
self.num_sections, = struct.unpack('>H', raw[76:78]) self.num_sections, = struct.unpack('>H', raw[76:78])
self.ident = self.header[0x3C:0x3C + 8].upper() self.ident = self.header[0x3C:0x3C + 8].upper()
@@ -94,7 +96,9 @@ class MobiReader(object):
self.sections = [] self.sections = []
self.section_headers = [] self.section_headers = []
for i in range(self.num_sections): for i in range(self.num_sections):
offset, a1, a2, a3, a4 = struct.unpack('>LBBBB', raw[78 + i * 8:78 + i * 8 + 8]) offset, a1, a2, a3, a4 = struct.unpack('>LBBBB',
raw[78 + i * 8:78 +
i * 8 + 8])
flags, val = a1, a2 << 16 | a3 << 8 | a4 flags, val = a1, a2 << 16 | a3 << 8 | a4
self.section_headers.append((offset, flags, val)) self.section_headers.append((offset, flags, val))
@@ -109,8 +113,9 @@ class MobiReader(object):
for i in range(self.num_sections): for i in range(self.num_sections):
self.sections.append((section(i), self.section_headers[i])) self.sections.append((section(i), self.section_headers[i]))
self.book_header = bh = BookHeader(self.sections[0][0], self.ident, bh = BookHeader(self.sections[0][0], self.ident, user_encoding,
user_encoding, self.log, try_extra_data_fix=try_extra_data_fix) self.log, try_extra_data_fix=try_extra_data_fix)
self.book_header = bh
self.name = self.name.decode(self.book_header.codec, 'replace') self.name = self.name.decode(self.book_header.codec, 'replace')
self.kf8_type = None self.kf8_type = None
k8i = getattr(self.book_header.exth, 'kf8_header', None) k8i = getattr(self.book_header.exth, 'kf8_header', None)
@@ -118,18 +123,20 @@ class MobiReader(object):
# Ancient PRC files from Baen can have random values for # Ancient PRC files from Baen can have random values for
# mobi_version, so be conservative # mobi_version, so be conservative
if (self.book_header.mobi_version == 8 and hasattr(self.book_header, if (self.book_header.mobi_version == 8 and hasattr(self.book_header,
'skelidx')): 'skelidx')):
self.kf8_type = 'standalone' self.kf8_type = 'standalone'
elif k8i is not None: # Check for joint mobi 6 and kf 8 file elif k8i is not None: # Check for joint mobi 6 and kf 8 file
try: try:
raw = self.sections[k8i-1][0] raw = self.sections[k8i-1][0]
except: except Exception:
raw = None raw = None
if raw == b'BOUNDARY': if raw == b'BOUNDARY':
try: try:
self.book_header = BookHeader(self.sections[k8i][0], self.book_header = BookHeader(self.sections[k8i][0],
self.ident, user_encoding, self.log) self.ident, user_encoding,
self.book_header.kf8_first_image_index = self.book_header.first_image_index + k8i self.log)
_kfii = self.book_header.first_image_index + k8i
self.book_header.kf8_first_image_index = _kfii
self.book_header.mobi6_records = bh.records self.book_header.mobi6_records = bh.records
# Need the first_image_index from the mobi 6 header as well # Need the first_image_index from the mobi 6 header as well
@@ -143,14 +150,14 @@ class MobiReader(object):
self.kf8_type = 'joint' self.kf8_type = 'joint'
self.kf8_boundary = k8i-1 self.kf8_boundary = k8i-1
except: except Exception:
self.book_header = bh self.book_header = bh
def check_for_drm(self): def check_for_drm(self):
if self.book_header.encryption_type != 0: if self.book_header.encryption_type != 0:
try: try:
name = self.book_header.exth.mi.title name = self.book_header.exth.mi.title
except: except Exception:
name = self.name name = self.name
if not name: if not name:
name = self.name name = self.name
@@ -163,20 +170,20 @@ class MobiReader(object):
if self.debug is not None: if self.debug is not None:
parse_cache['calibre_raw_mobi_markup'] = self.mobi_html parse_cache['calibre_raw_mobi_markup'] = self.mobi_html
self.add_anchors() self.add_anchors()
self.processed_html = self.processed_html.decode(self.book_header.codec, self.processed_html = self.processed_html.decode(
'ignore') self.book_header.codec, 'ignore')
self.processed_html = self.processed_html.replace('</</', '</') self.processed_html = self.processed_html.replace('</</', '</')
self.processed_html = re.sub(r'</([a-zA-Z]+)<', r'</\1><', self.processed_html = re.sub(r'</([a-zA-Z]+)<', r'</\1><',
self.processed_html) self.processed_html)
self.processed_html = self.processed_html.replace('\ufeff', '') self.processed_html = self.processed_html.replace('\ufeff', '')
# Remove tags of the form <xyz: ...> as they can cause issues further # Remove tags of the form <xyz: ...> as they can cause issues further
# along the pipeline # along the pipeline
self.processed_html = re.sub(r'</{0,1}[a-zA-Z]+:\s+[^>]*>', '', self.processed_html = re.sub(r'</{0,1}[a-zA-Z]+:\s+[^>]*>', '',
self.processed_html) self.processed_html)
self.processed_html = strip_encoding_declarations(self.processed_html) self.processed_html = strip_encoding_declarations(self.processed_html)
self.processed_html = re.sub(r'&(\S+?);', xml_entity_to_unicode, self.processed_html = re.sub(r'&(\S+?);', xml_entity_to_unicode,
self.processed_html) self.processed_html)
image_name_map = self.extract_images(processed_records, output_dir) image_name_map = self.extract_images(processed_records, output_dir)
self.replace_page_breaks() self.replace_page_breaks()
self.cleanup_html() self.cleanup_html()
@@ -186,31 +193,41 @@ class MobiReader(object):
try: try:
root = html.fromstring(self.processed_html) root = html.fromstring(self.processed_html)
if len(root.xpath('//html')) > 5: if len(root.xpath('//html')) > 5:
root = html.fromstring(self.processed_html.replace('\x0c', root = html.fromstring(self.processed_html
'').replace('\x14', '')) .replace('\x0c', '')
.replace('\x14', ''))
except Exception: except Exception:
self.log.warning('MOBI markup appears to contain random bytes. Stripping.') self.log.warning('MOBI markup appears to contain random bytes. '
'Stripping.')
self.processed_html = self.remove_random_bytes(self.processed_html) self.processed_html = self.remove_random_bytes(self.processed_html)
root = html.fromstring(self.processed_html) root = html.fromstring(self.processed_html)
if root.xpath('descendant::p/descendant::p'): if root.xpath('descendant::p/descendant::p'):
from html5_parser import parse from html5_parser import parse
self.log.warning('Malformed markup, parsing using html5-parser') self.log.warning('Malformed markup, parsing using html5-parser')
self.processed_html = strip_encoding_declarations(self.processed_html) self.processed_html = strip_encoding_declarations(
self.processed_html)
# These trip up the html5 parser causing all content to be placed # These trip up the html5 parser causing all content to be placed
# under the <guide> tag # under the <guide> tag
self.processed_html = re.sub(r'<metadata>.+?</metadata>', '', self.processed_html, flags=re.I) self.processed_html = re.sub(r'<metadata>.+?</metadata>', '',
self.processed_html = re.sub(r'<guide>.+?</guide>', '', self.processed_html, flags=re.I) self.processed_html, flags=re.I)
self.processed_html = re.sub(r'<guide>.+?</guide>', '',
self.processed_html, flags=re.I)
try: try:
root = parse(self.processed_html, maybe_xhtml=False, keep_doctype=False, sanitize_names=True) root = parse(self.processed_html, maybe_xhtml=False,
keep_doctype=False, sanitize_names=True)
except Exception: except Exception:
self.log.warning('MOBI markup appears to contain random bytes. Stripping.') self.log.warning('MOBI markup appears to contain random '
self.processed_html = self.remove_random_bytes(self.processed_html) 'bytes. Stripping.')
root = parse(self.processed_html, maybe_xhtml=False, keep_doctype=False, sanitize_names=True) self.processed_html = self.remove_random_bytes(
self.processed_html)
root = parse(self.processed_html, maybe_xhtml=False,
keep_doctype=False, sanitize_names=True)
if len(root.xpath('body/descendant::*')) < 1: if len(root.xpath('body/descendant::*')) < 1:
# There are probably stray </html>s in the markup # There are probably stray </html>s in the markup
self.processed_html = self.processed_html.replace('</html>', self.processed_html = self.processed_html.replace('</html>',
'') '')
root = parse(self.processed_html, maybe_xhtml=False, keep_doctype=False, sanitize_names=True) root = parse(self.processed_html, maybe_xhtml=False,
keep_doctype=False, sanitize_names=True)
if root.tag != 'html': if root.tag != 'html':
self.log.warn('File does not have opening <html> tag') self.log.warn('File does not have opening <html> tag')
@@ -253,13 +270,14 @@ class MobiReader(object):
head = root.makeelement('head', {}) head = root.makeelement('head', {})
root.insert(0, head) root.insert(0, head)
head.text = '\n\t' head.text = '\n\t'
link = head.makeelement('link', {'type':'text/css', link = head.makeelement('link', {'type': 'text/css',
'href':'styles.css', 'rel':'stylesheet'}) 'href': 'styles.css',
'rel': 'stylesheet'})
head.insert(0, link) head.insert(0, link)
link.tail = '\n\t' link.tail = '\n\t'
title = head.xpath('descendant::title') title = head.xpath('descendant::title')
m = head.makeelement('meta', {'http-equiv':'Content-Type', m = head.makeelement('meta', {'http-equiv': 'Content-Type',
'content':'text/html; charset=utf-8'}) 'content': 'text/html; charset=utf-8'})
head.insert(0, m) head.insert(0, m)
if not title: if not title:
title = head.makeelement('title', {}) title = head.makeelement('title', {})
@@ -283,7 +301,8 @@ class MobiReader(object):
try: try:
for ref in guide.xpath('descendant::reference'): for ref in guide.xpath('descendant::reference'):
if 'href' in ref.attrib: if 'href' in ref.attrib:
ref.attrib['href'] = os.path.basename(htmlfile) + ref.attrib['href'] ref.attrib['href'] = (os.path.basename(htmlfile) +
ref.attrib['href'])
except AttributeError: except AttributeError:
pass pass
@@ -299,7 +318,7 @@ class MobiReader(object):
opf, ncx_manifest_entry = self.create_opf(htmlfile, guide, root) opf, ncx_manifest_entry = self.create_opf(htmlfile, guide, root)
self.created_opf_path = os.path.splitext(htmlfile)[0] + '.opf' self.created_opf_path = os.path.splitext(htmlfile)[0] + '.opf'
opf.render(open(self.created_opf_path, 'wb'), ncx, opf.render(open(self.created_opf_path, 'wb'), ncx,
ncx_manifest_entry=ncx_manifest_entry) ncx_manifest_entry=ncx_manifest_entry)
ncx = ncx.getvalue() ncx = ncx.getvalue()
if ncx: if ncx:
ncx_path = os.path.join(os.path.dirname(htmlfile), 'toc.ncx') ncx_path = os.path.join(os.path.dirname(htmlfile), 'toc.ncx')
@@ -313,9 +332,9 @@ class MobiReader(object):
if self.book_header.exth is not None or self.embedded_mi is not None: if self.book_header.exth is not None or self.embedded_mi is not None:
self.log.debug('Creating OPF...') self.log.debug('Creating OPF...')
ncx = io.BytesIO() ncx = io.BytesIO()
opf, ncx_manifest_entry = self.create_opf(htmlfile, guide, root) opf, ncx_manifest_entry = self.create_opf(htmlfile, guide, root)
opf.render(open(os.path.splitext(htmlfile)[0] + '.opf', 'wb'), ncx, opf.render(open(os.path.splitext(htmlfile)[0] + '.opf', 'wb'), ncx,
ncx_manifest_entry) ncx_manifest_entry)
ncx = ncx.getvalue() ncx = ncx.getvalue()
if ncx: if ncx:
write_as_utf8(os.path.splitext(htmlfile)[0] + '.ncx', ncx) write_as_utf8(os.path.splitext(htmlfile)[0] + '.ncx', ncx)
@@ -348,28 +367,46 @@ class MobiReader(object):
def cleanup_html(self): def cleanup_html(self):
self.log.debug('Cleaning up HTML...') self.log.debug('Cleaning up HTML...')
self.processed_html = re.sub(r'<div height="0(pt|px|ex|em|%){0,1}"></div>', '', self.processed_html) self.processed_html = re.sub(r'<div height="0(pt|px|ex|em|%){0,1}">'
if self.book_header.ancient and b'<html' not in self.mobi_html[:300].lower(): '</div>', '', self.processed_html)
self.processed_html = '<html><p>' + self.processed_html.replace('\n\n', '<p>') + '</html>' if (self.book_header.ancient and
b'<html' not in self.mobi_html[:300].lower()):
self.processed_html = ('<html><p>' +
self.processed_html.replace('\n\n', '<p>') +
'</html>')
self.processed_html = self.processed_html.replace('\r\n', '\n') self.processed_html = self.processed_html.replace('\r\n', '\n')
self.processed_html = self.processed_html.replace('> <', '>\n<') self.processed_html = self.processed_html.replace('> <', '>\n<')
self.processed_html = self.processed_html.replace('<mbp: ', '<mbp:') self.processed_html = self.processed_html.replace('<mbp: ', '<mbp:')
self.processed_html = re.sub(r'<\?xml[^>]*>', '', self.processed_html) self.processed_html = re.sub(r'<\?xml[^>]*>', '', self.processed_html)
self.processed_html = re.sub(r'<\s*(/?)\s*o:p[^>]*>', r'', self.processed_html) self.processed_html = re.sub(r'<\s*(/?)\s*o:p[^>]*>', r'',
# Swap inline and block level elements, and order block level elements according to priority self.processed_html)
# - lxml and beautifulsoup expect/assume a specific order based on xhtml spec # Swap inline and block level elements, and order block level elements
self.processed_html = re.sub( # according to priority
r'(?i)(?P<styletags>(<(h\d+|i|b|u|em|small|big|strong|tt)>\s*){1,})(?P<para><p[^>]*>)', r'\g<para>'+r'\g<styletags>', self.processed_html) # - lxml and beautifulsoup expect/assume a specific order based on
self.processed_html = re.sub( # xhtml spec
r'(?i)(?P<para></p[^>]*>)\s*(?P<styletags>(</(h\d+|i|b|u|em|small|big|strong|tt)>\s*){1,})', r'\g<styletags>'+r'\g<para>', self.processed_html) self.processed_html = re.sub(r'(?i)(?P<styletags>(<(h\d+|i|b|u|em|'
self.processed_html = re.sub( r'small|big|strong|tt)>\s*){1,})'
r'(?i)(?P<blockquote>(</(blockquote|div)[^>]*>\s*){1,})(?P<para></p[^>]*>)', r'\g<para>'+r'\g<blockquote>', self.processed_html) r'(?P<para><p[^>]*>)',
self.processed_html = re.sub( r'\g<para>' + r'\g<styletags>',
r'(?i)(?P<para><p[^>]*>)\s*(?P<blockquote>(<(blockquote|div)[^>]*>\s*){1,})', r'\g<blockquote>'+r'\g<para>', self.processed_html) self.processed_html)
self.processed_html = re.sub(r'(?i)(?P<para></p[^>]*>)\s*'
r'(?P<styletags>(</(h\d+|i|b|u|em|small|'
r'big|strong|tt)>\s*){1,})',
r'\g<styletags>' + r'\g<para>',
self.processed_html)
self.processed_html = re.sub(r'(?i)(?P<blockquote>(</(blockquote|div)'
r'[^>]*>\s*){1,})(?P<para></p[^>]*>)',
r'\g<para>' + r'\g<blockquote>',
self.processed_html)
self.processed_html = re.sub(r'(?i)(?P<para><p[^>]*>)\s*'
r'(?P<blockquote>(<(blockquote|div)[^>]*>'
r'\s*){1,})',
r'\g<blockquote>' + r'\g<para>',
self.processed_html)
bods = htmls = 0 bods = htmls = 0
for x in re.finditer('</body>|</html>', self.processed_html): for x in re.finditer('</body>|</html>', self.processed_html):
if x == '</body>': if x == '</body>':
bods +=1 bods += 1
else: else:
htmls += 1 htmls += 1
if bods > 1 and htmls > 1: if bods > 1 and htmls > 1:
@@ -380,8 +417,8 @@ class MobiReader(object):
self.processed_html = self.processed_html.replace('</html>', '') self.processed_html = self.processed_html.replace('</html>', '')
def remove_random_bytes(self, html): def remove_random_bytes(self, html):
return re.sub('\x14|\x15|\x19|\x1c|\x1d|\xef|\x12|\x13|\xec|\x08|\x01|\x02|\x03|\x04|\x05|\x06|\x07', return re.sub('\x14|\x15|\x19|\x1c|\x1d|\xef|\x12|\x13|\xec|\x08|\x01'
'', html) '|\x02|\x03|\x04|\x05|\x06|\x07', '', html)
def ensure_unit(self, raw, unit='px'): def ensure_unit(self, raw, unit='px'):
if re.search(r'\d+$', raw) is not None: if re.search(r'\d+$', raw) is not None:
@@ -448,9 +485,10 @@ class MobiReader(object):
# discarded by a renderer # discarded by a renderer
tag.text = '\u00a0' # nbsp tag.text = '\u00a0' # nbsp
styles.append('height: %s' % styles.append('height: %s' %
self.ensure_unit(height)) self.ensure_unit(height))
else: else:
styles.append('margin-top: %s' % self.ensure_unit(height)) styles.append('margin-top: %s' %
self.ensure_unit(height))
if 'width' in attrib: if 'width' in attrib:
width = attrib.pop('width').strip() width = attrib.pop('width').strip()
if width and re.search(r'\d+', width): if width and re.search(r'\d+', width):
@@ -464,14 +502,16 @@ class MobiReader(object):
try: try:
ewidth_val = unit_convert(ewidth, 12, 500, 166) ewidth_val = unit_convert(ewidth, 12, 500, 166)
self.text_indents[tag] = ewidth_val self.text_indents[tag] = ewidth_val
except: except Exception:
pass pass
if width.startswith('-'): if width.startswith('-'):
styles.append('margin-left: %s' % self.ensure_unit(width[1:])) styles.append('margin-left: %s' %
self.ensure_unit(width[1:]))
try: try:
ewidth_val = unit_convert(ewidth[1:], 12, 500, 166) ewidth_val = unit_convert(ewidth[1:],
12, 500, 166)
self.left_margins[tag] = ewidth_val self.left_margins[tag] = ewidth_val
except: except Exception:
pass pass
if 'align' in attrib: if 'align' in attrib:
@@ -514,16 +554,20 @@ class MobiReader(object):
except Exception: except Exception:
pass pass
else: else:
attrib['src'] = 'images/' + image_name_map.get(recindex, '%05d.jpg' % recindex) attrib['src'] = ('images/' +
image_name_map.get(recindex,
'%05d.jpg' %
recindex))
for attr in ('width', 'height'): for attr in ('width', 'height'):
if attr in attrib: if attr in attrib:
val = attrib[attr] val = attrib[attr]
if val.lower().endswith('em'): if val.lower().endswith('em'):
try: try:
nval = float(val[:-2]) nval = float(val[:-2])
nval *= 16 * (168.451/72) # Assume this was set using the Kindle profile # Assume this was set using the Kindle profile
attrib[attr] = "%dpx"%int(nval) nval *= 16 * (168.451/72)
except: attrib[attr] = "%dpx" % int(nval)
except Exception:
del attrib[attr] del attrib[attr]
elif val.lower().endswith('%'): elif val.lower().endswith('%'):
del attrib[attr] del attrib[attr]
@@ -550,10 +594,12 @@ class MobiReader(object):
attrib['href'] = "#filepos%d" % int(filepos) attrib['href'] = "#filepos%d" % int(filepos)
except ValueError: except ValueError:
pass pass
if (tag.tag == 'a' and attrib.get('id', '').startswith('filepos') and if (tag.tag == 'a' and
not tag.text and len(tag) == 0 and (tag.tail is None or not attrib.get('id', '').startswith('filepos') and
tag.tail.strip()) and getattr(tag.getnext(), 'tag', not tag.text and len(tag) == 0 and
None) in BLOCK_TAGS): (tag.tail is None or
not tag.tail.strip()) and
getattr(tag.getnext(), 'tag', None) in BLOCK_TAGS):
# This is an empty anchor immediately before a block tag, move # This is an empty anchor immediately before a block tag, move
# the id onto the block tag instead # the id onto the block tag instead
forwardable_anchors.append(tag) forwardable_anchors.append(tag)
@@ -625,11 +671,11 @@ class MobiReader(object):
ti = self.text_indents.get(tag, ti) ti = self.text_indents.get(tag, ti)
try: try:
lm = float(lm) lm = float(lm)
except: except Exception:
lm = 0.0 lm = 0.0
try: try:
ti = float(ti) ti = float(ti)
except: except Exception:
ti = 0.0 ti = 0.0
return lm + ti return lm + ti
@@ -647,13 +693,14 @@ class MobiReader(object):
mi = MetaInformation(self.book_header.title, ['Unknown']) mi = MetaInformation(self.book_header.title, ['Unknown'])
opf = OPFCreator(os.path.dirname(htmlfile), mi) opf = OPFCreator(os.path.dirname(htmlfile), mi)
if hasattr(self.book_header.exth, 'cover_offset'): if hasattr(self.book_header.exth, 'cover_offset'):
opf.cover = 'images/%05d.jpg' % (self.book_header.exth.cover_offset + 1) opf.cover = 'images/%05d.jpg' % (self.book_header
.exth.cover_offset + 1)
elif mi.cover is not None: elif mi.cover is not None:
opf.cover = mi.cover opf.cover = mi.cover
else: else:
opf.cover = 'images/%05d.jpg' % 1 opf.cover = 'images/%05d.jpg' % 1
if not os.path.exists(os.path.join(os.path.dirname(htmlfile), if not os.path.exists(os.path.join(os.path.dirname(htmlfile),
* opf.cover.split('/'))): * opf.cover.split('/'))):
opf.cover = None opf.cover = None
cover = opf.cover cover = opf.cover
@@ -669,7 +716,7 @@ class MobiReader(object):
opf.cover = ncover.replace(os.sep, '/') opf.cover = ncover.replace(os.sep, '/')
manifest = [(htmlfile, 'application/xhtml+xml'), manifest = [(htmlfile, 'application/xhtml+xml'),
(os.path.abspath('styles.css'), 'text/css')] (os.path.abspath('styles.css'), 'text/css')]
bp = os.path.dirname(htmlfile) bp = os.path.dirname(htmlfile)
added = set() added = set()
for i in getattr(self, 'image_names', []): for i in getattr(self, 'image_names', []):
@@ -708,15 +755,17 @@ class MobiReader(object):
if href and re.match(r'\w+://', href) is None: if href and re.match(r'\w+://', href) is None:
try: try:
text = ' '.join([t.strip() for t in text = ' '.join([t.strip() for t in
x.xpath('descendant::text()')]) x.xpath('descendant:'
except: ':text()')])
except Exception:
text = '' text = ''
text = ent_pat.sub(entity_to_unicode, text) text = ent_pat.sub(entity_to_unicode, text)
item = tocobj.add_item(toc.partition('#')[0], href[1:], item = tocobj.add_item(toc.partition('#')[0],
text) href[1:], text)
item.left_space = int(self.get_left_whitespace(x)) item.left_space = int(self.get_left_whitespace(x))
found = True found = True
if reached and found and x.get('class', None) == 'mbp_pagebreak': if (reached and found and
x.get('class', None) == 'mbp_pagebreak'):
break break
if tocobj is not None: if tocobj is not None:
tocobj = self.structure_toc(tocobj) tocobj = self.structure_toc(tocobj)
@@ -748,7 +797,7 @@ class MobiReader(object):
level = indent_vals.index(item.left_space) level = indent_vals.index(item.left_space)
parent = find_parent(level) parent = find_parent(level)
last_found[level] = parent.add_item(item.href, item.fragment, last_found[level] = parent.add_item(item.href, item.fragment,
item.text) item.text)
return newtoc return newtoc
@@ -782,7 +831,9 @@ class MobiReader(object):
def warn_about_trailing_entry_corruption(self): def warn_about_trailing_entry_corruption(self):
if not self.warned_about_trailing_entry_corruption: if not self.warned_about_trailing_entry_corruption:
self.warned_about_trailing_entry_corruption = True self.warned_about_trailing_entry_corruption = True
self.log.warn('The trailing data entries in this MOBI file are corrupted, you might see corrupted text in the output') self.log.warn('The trailing data entries in this MOBI file are '
'corrupted, you might see corrupted text in the '
'output')
def text_section(self, index): def text_section(self, index):
data = self.sections[index][0] data = self.sections[index][0]
@@ -791,19 +842,23 @@ class MobiReader(object):
def extract_text(self, offset=1): def extract_text(self, offset=1):
self.log.debug('Extracting text...') self.log.debug('Extracting text...')
text_sections = [self.text_section(i) for i in range(offset, text_sections = [self.text_section(i)
min(self.book_header.records + offset, len(self.sections)))] for i in range(offset, min(self.book_header.records
+ offset,
len(self.sections)))]
processed_records = list(range(offset-1, self.book_header.records + processed_records = list(range(offset-1, self.book_header.records +
offset)) offset))
self.mobi_html = b'' self.mobi_html = b''
if self.book_header.compression_type == b'DH': if self.book_header.compression_type == b'DH':
huffs = [self.sections[i][0] for i in huffs = [self.sections[i][0]
range(self.book_header.huff_offset, for i in range(self.book_header.huff_offset,
self.book_header.huff_offset + self.book_header.huff_number)] self.book_header.huff_offset +
self.book_header.huff_number)]
processed_records += list(range(self.book_header.huff_offset, processed_records += list(range(self.book_header.huff_offset,
self.book_header.huff_offset + self.book_header.huff_number)) self.book_header.huff_offset +
self.book_header.huff_number))
huff = HuffReader(huffs) huff = HuffReader(huffs)
unpack = huff.unpack unpack = huff.unpack
@@ -811,19 +866,23 @@ class MobiReader(object):
unpack = decompress_doc unpack = decompress_doc
elif self.book_header.compression_type == b'\x00\x01': elif self.book_header.compression_type == b'\x00\x01':
unpack = lambda x: x unpack = lambda x: x # noqa
else: else:
raise MobiError('Unknown compression algorithm: %r' % self.book_header.compression_type) raise MobiError('Unknown compression algorithm: %r' %
self.book_header.compression_type)
self.mobi_html = b''.join(map(unpack, text_sections)) self.mobi_html = b''.join(map(unpack, text_sections))
if self.mobi_html.endswith(b'#'): if self.mobi_html.endswith(b'#'):
self.mobi_html = self.mobi_html[:-1] self.mobi_html = self.mobi_html[:-1]
if self.book_header.ancient and b'<html' not in self.mobi_html[:300].lower(): if (self.book_header.ancient and
b'<html' not in self.mobi_html[:300].lower()):
self.mobi_html = self.mobi_html.replace(b'\r ', b'\n\n ') self.mobi_html = self.mobi_html.replace(b'\r ', b'\n\n ')
self.mobi_html = self.mobi_html.replace(b'\0', b'') self.mobi_html = self.mobi_html.replace(b'\0', b'')
if self.book_header.codec == 'cp1252': if self.book_header.codec == 'cp1252':
self.mobi_html = self.mobi_html.replace(b'\x1e', b'') # record separator # record separator
self.mobi_html = self.mobi_html.replace(b'\x02', b'') # start of text self.mobi_html = self.mobi_html.replace(b'\x1e', b'')
# start of text
self.mobi_html = self.mobi_html.replace(b'\x02', b'')
return processed_records return processed_records
def replace_page_breaks(self): def replace_page_breaks(self):
@@ -835,7 +894,7 @@ class MobiReader(object):
self.log.debug('Adding anchors...') self.log.debug('Adding anchors...')
positions = set() positions = set()
link_pattern = re.compile(br'''<[^<>]+filepos=['"]{0,1}(\d+)[^<>]*>''', link_pattern = re.compile(br'''<[^<>]+filepos=['"]{0,1}(\d+)[^<>]*>''',
re.IGNORECASE) re.IGNORECASE)
for match in link_pattern.finditer(self.mobi_html): for match in link_pattern.finditer(self.mobi_html):
positions.add(int(match.group(1))) positions.add(int(match.group(1)))
pos = 0 pos = 0
@@ -845,12 +904,13 @@ class MobiReader(object):
if end == 0: if end == 0:
continue continue
oend = end oend = end
l = self.mobi_html.find(b'<', end) _l = self.mobi_html.find(b'<', end)
r = self.mobi_html.find(b'>', end) r = self.mobi_html.find(b'>', end)
anchor = b'<a id="filepos%d"></a>' anchor = b'<a id="filepos%d"></a>'
if r > -1 and (r < l or l == end or l == -1): if r > -1 and (r < _l or _l == end or _l == -1):
p = self.mobi_html.rfind(b'<', 0, end + 1) p = self.mobi_html.rfind(b'<', 0, end + 1)
if (pos < end and p > -1 and not end_tag_re.match(self.mobi_html[p:r]) and if (pos < end and p > -1 and
not end_tag_re.match(self.mobi_html[p:r]) and
not self.mobi_html[p:r + 1].endswith(b'/>')): not self.mobi_html[p:r + 1].endswith(b'/>')):
anchor = b' filepos-id="filepos%d"' anchor = b' filepos-id="filepos%d"'
end = r end = r
@@ -862,8 +922,9 @@ class MobiReader(object):
processed_html = b''.join(processed_html) processed_html = b''.join(processed_html)
# Remove anchors placed inside entities # Remove anchors placed inside entities
self.processed_html = re.sub(br'&([^;]*?)(<a id="filepos\d+"></a>)([^;]*);', self.processed_html = re.sub(br'&([^;]*?)(<a id="filepos\d+"></a>)'
br'&\1\3;\2', processed_html) br'([^;]*);', br'&\1\3;\2',
processed_html)
def extract_images(self, processed_records, output_dir): def extract_images(self, processed_records, output_dir):
self.log.debug('Extracting images...') self.log.debug('Extracting images...')
@@ -881,10 +942,11 @@ class MobiReader(object):
if i in processed_records: if i in processed_records:
continue continue
processed_records.append(i) processed_records.append(i)
data = self.sections[i][0] data = self.sections[i][0]
image_index += 1 image_index += 1
if data[:4] in {b'FLIS', b'FCIS', b'SRCS', b'\xe9\x8e\r\n', if data[:4] in {b'FLIS', b'FCIS', b'SRCS', b'\xe9\x8e\r\n',
b'RESC', b'BOUN', b'FDST', b'DATP', b'AUDI', b'VIDE'}: b'RESC', b'BOUN', b'FDST', b'DATP', b'AUDI',
b'VIDE'}:
# This record is a known non image type, no need to try to # This record is a known non image type, no need to try to
# load the image # load the image
continue continue
@@ -920,16 +982,17 @@ class MobiReader(object):
def test_mbp_regex(): def test_mbp_regex():
for raw, m in {'<mbp:pagebreak></mbp:pagebreak>':'', for raw, m in {'<mbp:pagebreak></mbp:pagebreak>': '',
'<mbp:pagebreak xxx></mbp:pagebreak>yyy':' xxxyyy', '<mbp:pagebreak xxx></mbp:pagebreak>yyy': ' xxxyyy',
'<mbp:pagebreak> </mbp:pagebreak>':'', '<mbp:pagebreak> </mbp:pagebreak>': '',
'<mbp:pagebreak>xxx':'xxx', '<mbp:pagebreak>xxx': 'xxx',
'<mbp:pagebreak/>xxx':'xxx', '<mbp:pagebreak/>xxx': 'xxx',
'<mbp:pagebreak sdf/ >xxx':' sdfxxx', '<mbp:pagebreak sdf/ >xxx': ' sdfxxx',
'<mbp:pagebreak / >':' ', '<mbp:pagebreak / >': ' ',
'</mbp:pagebreak>':'', '</mbp:pagebreak>': '',
'</mbp:pagebreak sdf>':' sdf', '</mbp:pagebreak sdf>': ' sdf',
'</mbp:pagebreak><mbp:pagebreak></mbp:pagebreak>xxx':'xxx'}.items(): '</mbp:pagebreak><mbp:pagebreak></mbp:pagebreak>xxx':
'xxx'}.items():
ans = MobiReader.PAGE_BREAK_PAT.sub(r'\1', raw) ans = MobiReader.PAGE_BREAK_PAT.sub(r'\1', raw)
if ans != m: if ans != m:
raise Exception('%r != %r for %r'%(ans, m, raw)) raise Exception('%r != %r for %r' % (ans, m, raw))

View File

@@ -1,13 +1,11 @@
import mimetypes import mimetypes
import re import re
from ebook_converter.ebooks.oeb.base import XPath, urlunquote from ebook_converter.ebooks.oeb.base import XPath, urlunquote
from ebook_converter.polyglot.binary import from_base64_bytes
from ebook_converter.polyglot.builtins import as_bytes from ebook_converter.polyglot.builtins import as_bytes
__license__ = 'GPL v3'
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
class DataURL(object): class DataURL(object):
def __call__(self, oeb, opts): def __call__(self, oeb, opts):
@@ -27,25 +25,29 @@ class DataURL(object):
continue continue
if ';base64' in header: if ';base64' in header:
data = re.sub(r'\s+', '', data) data = re.sub(r'\s+', '', data)
from ebook_converter.polyglot.binary import from_base64_bytes
try: try:
data = from_base64_bytes(data) data = from_base64_bytes(data)
except Exception: except Exception:
self.log.error('Found invalid base64 encoded data URI, ignoring it') self.log.error('Found invalid base64 encoded data '
'URI, ignoring it')
continue continue
else: else:
data = urlunquote(data) data = urlunquote(data)
data = as_bytes(data) data = as_bytes(data)
fmt = what(None, data) fmt = what(None, data)
if not fmt: if not fmt:
self.log.warn('Image encoded as data URL has unknown format, ignoring') self.log.warn('Image encoded as data URL has unknown '
'format, ignoring')
continue continue
img.set('src', item.relhref(self.convert_image_data_uri(data, fmt, oeb))) img.set('src',
item.relhref(self.convert_image_data_uri(data, fmt,
oeb)))
def convert_image_data_uri(self, data, fmt, oeb): def convert_image_data_uri(self, data, fmt, oeb):
self.log('Found image encoded as data URI converting it to normal image') self.log('Found image encoded as data URI converting it to normal '
from ebook_converter import guess_type 'image')
item_id, item_href = oeb.manifest.generate('data-url-image', 'data-url-image.' + fmt) item_id, item_href = oeb.manifest.generate('data-url-image',
'data-url-image.' + fmt)
oeb.manifest.add(item_id, item_href, oeb.manifest.add(item_id, item_href,
mimetypes.guess_type(item_href)[0], data=data) mimetypes.guess_type(item_href)[0], data=data)
return item_href return item_href

View File

@@ -1,9 +1,11 @@
import mimetypes import mimetypes
import sys, os, re import os
from xml.sax.saxutils import escape
from string import Formatter
import pkg_resources import pkg_resources
import re
import string
import sys
import urllib.parse import urllib.parse
from xml.sax import saxutils
from ebook_converter import constants as const from ebook_converter import constants as const
from ebook_converter import strftime from ebook_converter import strftime
@@ -16,18 +18,14 @@ from ebook_converter.ebooks.chardet import strip_encoding_declarations
from ebook_converter.ebooks.metadata import fmt_sidx, rating_to_stars from ebook_converter.ebooks.metadata import fmt_sidx, rating_to_stars
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
JACKET_XPATH = '//h:meta[@name="calibre-content" and @content="jacket"]' JACKET_XPATH = '//h:meta[@name="calibre-content" and @content="jacket"]'
class SafeFormatter(Formatter): class SafeFormatter(string.Formatter):
def get_value(self, *args, **kwargs): def get_value(self, *args, **kwargs):
try: try:
return Formatter.get_value(self, *args, **kwargs) return string.Formatter.get_value(self, *args, **kwargs)
except KeyError: except KeyError:
return '' return ''
@@ -40,7 +38,7 @@ class Base(object):
for img in path(item.data): for img in path(item.data):
if removed >= limit: if removed >= limit:
break break
href = item.abshref(img.get('src')) href = item.abshref(img.get('src'))
image = self.oeb.manifest.hrefs.get(href) image = self.oeb.manifest.hrefs.get(href)
if image is None: if image is None:
href = urlnormalize(href) href = urlnormalize(href)
@@ -68,7 +66,8 @@ class RemoveFirstImage(Base):
raw = xml2text(body[0]).strip() raw = xml2text(body[0]).strip()
imgs = XPath('//h:img|//svg:svg')(item.data) imgs = XPath('//h:img|//svg:svg')(item.data)
if not raw and not imgs: if not raw and not imgs:
self.log('Removing %s as it has no content'%item.href) self.log('Removing %s as it has no content' %
item.href)
self.oeb.manifest.remove(item) self.oeb.manifest.remove(item)
deleted_item = item deleted_item = item
break break
@@ -82,20 +81,20 @@ class RemoveFirstImage(Base):
self.oeb.guide.remove_by_href(deleted_item.href) self.oeb.guide.remove_by_href(deleted_item.href)
def __call__(self, oeb, opts, metadata): def __call__(self, oeb, opts, metadata):
''' """
Add metadata in jacket.xhtml if specified in opts Add metadata in jacket.xhtml if specified in opts
If not specified, remove previous jacket instance If not specified, remove previous jacket instance
''' """
self.oeb, self.opts, self.log = oeb, opts, oeb.log self.oeb, self.opts, self.log = oeb, opts, oeb.log
if opts.remove_first_image: if opts.remove_first_image:
self.remove_first_image() self.remove_first_image()
class Jacket(Base): class Jacket(Base):
''' """
Book jacket manipulation. Remove first image and insert comments at start of Book jacket manipulation. Remove first image and insert comments at start
book. of book.
''' """
def insert_metadata(self, mi): def insert_metadata(self, mi):
self.log('Inserting metadata into book...') self.log('Inserting metadata into book...')
@@ -107,22 +106,24 @@ class Jacket(Base):
try: try:
comments = str(self.oeb.metadata.description[0]) comments = str(self.oeb.metadata.description[0])
except: except Exception:
comments = '' comments = ''
try: try:
title = str(self.oeb.metadata.title[0]) title = str(self.oeb.metadata.title[0])
except: except Exception:
title = 'Unknown' title = 'Unknown'
try: try:
authors = list(map(str, self.oeb.metadata.creator)) authors = list(map(str, self.oeb.metadata.creator))
except: except Exception:
authors = ['Unknown'] authors = ['Unknown']
root = render_jacket(mi, self.opts.output_profile, root = render_jacket(mi, self.opts.output_profile,
alt_title=title, alt_tags=tags, alt_authors=authors, alt_title=title, alt_tags=tags,
alt_comments=comments, rescale_fonts=True) alt_authors=authors,
alt_comments=comments,
rescale_fonts=True)
id, href = self.oeb.manifest.generate('calibre_jacket', 'jacket.xhtml') id, href = self.oeb.manifest.generate('calibre_jacket', 'jacket.xhtml')
jacket = self.oeb.manifest.add(id, href, mimetypes.guess_type(href)[0], jacket = self.oeb.manifest.add(id, href, mimetypes.guess_type(href)[0],
@@ -132,7 +133,8 @@ class Jacket(Base):
for img, path in referenced_images(root): for img, path in referenced_images(root):
self.oeb.log('Embedding referenced image %s into jacket' % path) self.oeb.log('Embedding referenced image %s into jacket' % path)
ext = path.rpartition('.')[-1].lower() ext = path.rpartition('.')[-1].lower()
item_id, href = self.oeb.manifest.generate('jacket_image', 'jacket_img.'+ext) item_id, href = self.oeb.manifest.generate('jacket_image',
'jacket_img.' + ext)
with open(path, 'rb') as f: with open(path, 'rb') as f:
item = self.oeb.manifest.add( item = self.oeb.manifest.add(
item_id, href, mimetypes.guess_type(href)[0], item_id, href, mimetypes.guess_type(href)[0],
@@ -149,10 +151,10 @@ class Jacket(Base):
break break
def __call__(self, oeb, opts, metadata): def __call__(self, oeb, opts, metadata):
''' """
Add metadata in jacket.xhtml if specified in opts Add metadata in jacket.xhtml if specified in opts
If not specified, remove previous jacket instance If not specified, remove previous jacket instance
''' """
self.oeb, self.opts, self.log = oeb, opts, oeb.log self.oeb, self.opts, self.log = oeb, opts, oeb.log
self.remove_existing_jacket() self.remove_existing_jacket()
if opts.insert_metadata: if opts.insert_metadata:
@@ -164,8 +166,8 @@ class Jacket(Base):
def get_rating(rating, rchar, e_rchar): def get_rating(rating, rchar, e_rchar):
ans = '' ans = ''
try: try:
num = float(rating)/2 num = float(rating) / 2
except: except Exception:
return ans return ans
num = max(0, num) num = max(0, num)
num = min(num, 5) num = min(num, 5)
@@ -180,25 +182,29 @@ class Series(str):
def __new__(self, series, series_index): def __new__(self, series, series_index):
if series and series_index is not None: if series and series_index is not None:
roman = '{1} of <em>{0}</em>'.format( _roman = saxutils.escape(fmt_sidx(series_index, use_roman=True))
escape(series), escape(fmt_sidx(series_index, use_roman=True))) _no_roman = saxutils.escape(fmt_sidx(series_index,
combined = '{1} of <em>{0}</em>'.format( use_roman=False))
escape(series), escape(fmt_sidx(series_index, roman = '{1} of <em>{0}</em>'.format(saxutils.escape(series),
use_roman=False))) _roman)
combined = '{1} of <em>{0}</em>'.format(saxutils.escape(series),
_no_roman)
else: else:
combined = roman = escape(series or u'') combined = roman = saxutils.escape(series or u'')
s = str.__new__(self, combined) s = str.__new__(self, combined)
s.roman = roman s.roman = roman
s.name = escape(series or '') s.name = saxutils.escape(series or '')
s.number = escape(fmt_sidx(series_index or 1.0, use_roman=False)) s.number = saxutils.escape(fmt_sidx(series_index or 1.0,
s.roman_number = escape(fmt_sidx(series_index or 1.0, use_roman=True)) use_roman=False))
s.roman_number = saxutils.escape(fmt_sidx(series_index or 1.0,
use_roman=True))
return s return s
class Tags(str): class Tags(str):
def __new__(self, tags, output_profile): def __new__(self, tags, output_profile):
tags = [escape(x) for x in tags or ()] tags = [saxutils.escape(x) for x in tags or ()]
t = str.__new__(self, ', '.join(tags)) t = str.__new__(self, ', '.join(tags))
t.alphabetical = ', '.join(sorted(tags)) t.alphabetical = ', '.join(sorted(tags))
t.tags_list = tags t.tags_list = tags
@@ -233,9 +239,9 @@ def postprocess_jacket(root, output_profile, has_data):
extract_class('cbj_kindle_banner_hr') extract_class('cbj_kindle_banner_hr')
def render_jacket(mi, output_profile, def render_jacket(mi, output_profile, alt_title='Unknown', alt_tags=[],
alt_title='Unknown', alt_tags=[], alt_comments='', alt_comments='', alt_publisher='', rescale_fonts=False,
alt_publisher='', rescale_fonts=False, alt_authors=None): alt_authors=None):
with open(pkg_resources.resource_filename('ebook_converter', with open(pkg_resources.resource_filename('ebook_converter',
'data/jacket/stylesheet.css'), 'data/jacket/stylesheet.css'),
'rb') as fobj: 'rb') as fobj:
@@ -250,17 +256,20 @@ def render_jacket(mi, output_profile,
try: try:
title_str = alt_title if mi.is_null('title') else mi.title title_str = alt_title if mi.is_null('title') else mi.title
except: except Exception:
title_str = 'Unknown' title_str = 'Unknown'
title_str = escape(title_str) title_str = saxutils.escape(title_str)
title = '<span class="title">%s</span>' % title_str title = '<span class="title">%s</span>' % title_str
series = Series(mi.series, mi.series_index) series = Series(mi.series, mi.series_index)
try: try:
publisher = mi.publisher if not mi.is_null('publisher') else alt_publisher if not mi.is_null('publisher'):
except: publisher = mi.publisher
else:
publisher = alt_publisher
except Exception:
publisher = '' publisher = ''
publisher = escape(publisher) publisher = saxutils.escape(publisher)
try: try:
if is_date_undefined(mi.pubdate): if is_date_undefined(mi.pubdate):
@@ -268,10 +277,11 @@ def render_jacket(mi, output_profile,
else: else:
dt = as_local_time(mi.pubdate) dt = as_local_time(mi.pubdate)
pubdate = strftime('%Y', dt.timetuple()) pubdate = strftime('%Y', dt.timetuple())
except: except Exception:
pubdate = '' pubdate = ''
rating = get_rating(mi.rating, output_profile.ratings_char, output_profile.empty_ratings_char) rating = get_rating(mi.rating, output_profile.ratings_char,
output_profile.empty_ratings_char)
tags = Tags((mi.tags if mi.tags else alt_tags), output_profile) tags = Tags((mi.tags if mi.tags else alt_tags), output_profile)
@@ -285,10 +295,10 @@ def render_jacket(mi, output_profile,
mi.authors = list(alt_authors or ('Unknown',)) mi.authors = list(alt_authors or ('Unknown',))
try: try:
author = mi.format_authors() author = mi.format_authors()
except: except Exception:
author = '' author = ''
mi.authors = orig mi.authors = orig
author = escape(author) author = saxutils.escape(author)
has_data = {} has_data = {}
def generate_html(comments): def generate_html(comments):
@@ -301,7 +311,7 @@ def render_jacket(mi, output_profile,
'publisher': publisher, 'publisher': publisher,
'rating': rating, 'rating': rating,
'rating_label': 'Rating', 'rating_label': 'Rating',
'searchable_tags': ' '.join(escape(t) + 'ttt' 'searchable_tags': ' '.join(saxutils.escape(t) + 'ttt'
for t in tags.tags_list), for t in tags.tags_list),
'series': series, 'series': series,
'series_label': 'Series', 'series_label': 'Series',
@@ -320,25 +330,30 @@ def render_jacket(mi, output_profile,
if dt == 'series': if dt == 'series':
args[dkey] = Series(mi.get(key), mi.get(key + '_index')) args[dkey] = Series(mi.get(key), mi.get(key + '_index'))
elif dt == 'rating': elif dt == 'rating':
args[dkey] = rating_to_stars(mi.get(key), m.get('display', {}).get('allow_half_stars', False)) args[dkey] = rating_to_stars(mi.get(key),
m.get('display', {})
.get('allow_half_stars',
False))
elif dt == 'comments': elif dt == 'comments':
val = val or '' val = val or ''
display = m.get('display', {}) display = m.get('display', {})
ctype = display.get('interpret_as') or 'html' ctype = display.get('interpret_as') or 'html'
if ctype == 'long-text': if ctype == 'long-text':
val = '<pre style="white-space:pre-wrap">%s</pre>' % escape(val) val = ('<pre style="white-space:pre-wrap">%s</pre>' %
saxutils.escape(val))
elif ctype == 'short-text': elif ctype == 'short-text':
val = '<span>%s</span>' % escape(val) val = '<span>%s</span>' % saxutils.escape(val)
elif ctype == 'markdown': elif ctype == 'markdown':
val = markdown(val) val = markdown(val)
else: else:
val = comments_to_html(val) val = comments_to_html(val)
args[dkey] = val args[dkey] = val
else: else:
args[dkey] = escape(val) args[dkey] = saxutils.escape(val)
args[dkey+'_label'] = escape(display_name) args[dkey+'_label'] = saxutils.escape(display_name)
except Exception: except Exception:
# if the val (custom column contents) is None, don't add to args # if the val (custom column contents) is None, don't add to
# args
pass pass
if False: if False:
@@ -371,10 +386,11 @@ def render_jacket(mi, output_profile,
# the text in the book. That means that as long as the jacket uses # the text in the book. That means that as long as the jacket uses
# relative font sizes (em or %), the post conversion font size will be # relative font sizes (em or %), the post conversion font size will be
# the same as for text in the main book. So text with size x em will # the same as for text in the main book. So text with size x em will
# be rescaled to the same value in both the jacket and the main content. # be rescaled to the same value in both the jacket and the main
# content.
# #
# We cannot use data-calibre-rescale 100 on the body tag as that will just # We cannot use data-calibre-rescale 100 on the body tag as that will
# give the body tag a font size of 1em, which is useless. # just give the body tag a font size of 1em, which is useless.
for body in root.xpath('//*[local-name()="body"]'): for body in root.xpath('//*[local-name()="body"]'):
fw = body.makeelement(base.tag('xhtml', 'div')) fw = body.makeelement(base.tag('xhtml', 'div'))
fw.set('data-calibre-rescale', '100') fw.set('data-calibre-rescale', '100')