1
0
mirror of https://github.com/gryf/ebook-converter.git synced 2026-01-19 19:14:11 +01:00

Fixed flake8 issues in several modules

This commit is contained in:
2020-06-14 15:49:11 +02:00
parent 1d4f75ceba
commit 9891d02694
10 changed files with 505 additions and 335 deletions

View File

@@ -15,17 +15,18 @@ from ebook_converter.polyglot.builtins import as_unicode
def sanitize_file_name(x):
    """Return a cleaned-up version of the file name *x*.

    The name is first passed through ``ascii_filename``. Every ``?``, ``&``,
    ``=``, ``;`` and ``#`` is then replaced by ``_``, each run of whitespace
    collapses to a single space, and surrounding whitespace and trailing dots
    are removed. Finally the text before and after the last dot is stripped
    of surrounding whitespace separately and re-joined with a dot.
    """
    cleaned = re.sub(r'[?&=;#]', '_', ascii_filename(x))
    cleaned = re.sub(r'\s+', ' ', cleaned).strip().rstrip('.')
    # rpartition('.')[::2] drops the separator, leaving (stem, extension).
    stem, ext = cleaned.rpartition('.')[::2]
    # NOTE(review): when the name contains no dot the stem is empty, so the
    # result gains a leading dot ('abc' -> '.abc') - confirm this is intended.
    return (stem.strip() + '.' + ext.strip()).rstrip('.')
class HTMLInput(InputFormatPlugin):
name = 'HTML Input'
author = 'Kovid Goyal'
name = 'HTML Input'
author = 'Kovid Goyal'
description = 'Convert HTML and OPF files to an OEB'
file_types = {'opf', 'html', 'htm', 'xhtml', 'xhtm', 'shtm', 'shtml'}
file_types = {'opf', 'html', 'htm', 'xhtml', 'xhtm', 'shtm', 'shtml'}
commit_name = 'html_input'
options = {

View File

@@ -6,10 +6,10 @@ from ebook_converter.customize.conversion import InputFormatPlugin
class HTMLZInput(InputFormatPlugin):
name = 'HTLZ Input'
author = 'John Schember'
name = 'HTLZ Input'
author = 'John Schember'
description = 'Convert HTML files to HTML'
file_types = {'htmlz'}
file_types = {'htmlz'}
commit_name = 'htmlz_input'
def convert(self, stream, options, file_ext, log,
@@ -36,13 +36,14 @@ class HTMLZInput(InputFormatPlugin):
top_levels.append(x)
# Try to find an index file.
for x in top_levels:
if x.lower() in (u'index.html', u'index.xhtml', u'index.htm'):
if x.lower() in ('index.html', 'index.xhtml', 'index.htm'):
index = x
break
# Look for multiple HTML files in the archive. We look at the
# top level files only as only they matter in HTMLZ.
for x in top_levels:
if os.path.splitext(x)[1].lower() in (u'.html', u'.xhtml', u'.htm'):
if os.path.splitext(x)[1].lower() in ('.html', '.xhtml',
'.htm'):
# Set index to the first HTML file found if it's not
# called index.
if not index:
@@ -84,15 +85,14 @@ class HTMLZInput(InputFormatPlugin):
c = 0
while os.path.exists(htmlfile):
c += 1
htmlfile = u'index%d.html'%c
htmlfile = u'index%d.html' % c
with open(htmlfile, 'wb') as f:
f.write(html.encode('utf-8'))
odi = options.debug_pipeline
options.debug_pipeline = None
# Generate oeb from html conversion.
with open(htmlfile, 'rb') as f:
oeb = html_input.convert(f, options, 'html', log,
{})
oeb = html_input.convert(f, options, 'html', log, {})
options.debug_pipeline = odi
os.remove(htmlfile)

View File

@@ -1,5 +1,11 @@
import sys, os, re, math, errno, uuid, numbers
from collections import OrderedDict, defaultdict
import sys
import os
import re
import math
import errno
import uuid
import numbers
import collections
import mimetypes
from lxml import etree
@@ -7,23 +13,24 @@ from lxml import html
from lxml.html.builder import (
HTML, HEAD, TITLE, BODY, LINK, META, P, SPAN, BR, DIV, A, DT, DL, DD, H1)
from ebook_converter import guess_type
from ebook_converter.ebooks.docx.container import DOCX
from ebook_converter.ebooks.docx.names import XML, generate_anchor
from ebook_converter.ebooks.docx.styles import Styles, inherit, PageProperties
from ebook_converter.ebooks.docx.numbering import Numbering
from ebook_converter.ebooks.docx.fonts import Fonts, is_symbol_font, map_symbol_text
from ebook_converter.ebooks.docx.images import Images
from ebook_converter.ebooks.docx.tables import Tables
from ebook_converter.ebooks.docx.footnotes import Footnotes
from ebook_converter.ebooks.docx.cleanup import cleanup_markup
from ebook_converter.ebooks.docx.container import DOCX
from ebook_converter.ebooks.docx.fields import Fields
from ebook_converter.ebooks.docx.fonts import Fonts
from ebook_converter.ebooks.docx.fonts import is_symbol_font
from ebook_converter.ebooks.docx.fonts import map_symbol_text
from ebook_converter.ebooks.docx.footnotes import Footnotes
from ebook_converter.ebooks.docx.images import Images
from ebook_converter.ebooks.docx.names import XML, generate_anchor
from ebook_converter.ebooks.docx.numbering import Numbering
from ebook_converter.ebooks.docx.settings import Settings
from ebook_converter.ebooks.docx.styles import Styles, inherit, PageProperties
from ebook_converter.ebooks.docx.tables import Tables
from ebook_converter.ebooks.docx.theme import Theme
from ebook_converter.ebooks.docx.toc import create_toc
from ebook_converter.ebooks.docx.fields import Fields
from ebook_converter.ebooks.docx.settings import Settings
from ebook_converter.ebooks.metadata.opf2 import OPFCreator
from ebook_converter.utils.localization import canonicalize_lang, lang_as_iso639_1
from ebook_converter.utils.localization import canonicalize_lang
from ebook_converter.utils.localization import lang_as_iso639_1
NBSP = '\xa0'
@@ -54,7 +61,9 @@ def html_lang(docx_lang):
class Convert(object):
def __init__(self, path_or_stream, dest_dir=None, log=None, detect_cover=True, notes_text=None, notes_nopb=False, nosupsub=False):
def __init__(self, path_or_stream, dest_dir=None, log=None,
detect_cover=True, notes_text=None, notes_nopb=False,
nosupsub=False):
self.docx = DOCX(path_or_stream, log=log)
self.namespace = self.docx.namespace
self.ms_pat = re.compile(r'\s{2,}')
@@ -73,7 +82,7 @@ class Convert(object):
self.fields = Fields(self.namespace)
self.styles = Styles(self.namespace, self.tables)
self.images = Images(self.namespace, self.log)
self.object_map = OrderedDict()
self.object_map = collections.OrderedDict()
self.html = HTML(
HEAD(
META(charset='utf-8'),
@@ -82,9 +91,9 @@ class Convert(object):
),
self.body
)
self.html.text='\n\t'
self.html[0].text='\n\t\t'
self.html[0].tail='\n'
self.html.text = '\n\t'
self.html[0].text = '\n\t\t'
self.html[0].tail = '\n'
for child in self.html[0]:
child.tail = '\n\t\t'
self.html[0][-1].tail = '\n\t'
@@ -98,17 +107,18 @@ class Convert(object):
def __call__(self):
doc = self.docx.document
relationships_by_id, relationships_by_type = self.docx.document_relationships
(relationships_by_id,
relationships_by_type) = self.docx.document_relationships
self.resolve_alternate_content(doc)
self.fields(doc, self.log)
self.read_styles(relationships_by_type)
self.images(relationships_by_id)
self.layers = OrderedDict()
self.layers = collections.OrderedDict()
self.framed = [[]]
self.frame_map = {}
self.framed_map = {}
self.anchor_map = {}
self.link_map = defaultdict(list)
self.link_map = collections.defaultdict(list)
self.link_source_map = {}
self.toc_anchor = None
self.block_runs = []
@@ -142,7 +152,8 @@ class Convert(object):
dl = DL(id=anchor)
dl.set('class', 'footnote')
self.body.append(dl)
dl.append(DT('[', A('' + text, href='#back_%s' % anchor, title=text)))
dl.append(DT('[', A('' + text, href='#back_%s' % anchor,
title=text)))
dl[-1][0].tail = ']'
dl.append(DD())
paras = []
@@ -159,7 +170,8 @@ class Convert(object):
self.mark_block_runs(paras)
for p, wp in self.object_map.items():
if len(p) > 0 and not p.text and len(p[0]) > 0 and not p[0].text and p[0][0].get('class', None) == 'tab':
if (len(p) > 0 and not p.text and len(p[0]) > 0 and
not p[0].text and p[0][0].get('class', None) == 'tab'):
# Paragraph uses tabs for indentation, convert to text-indent
parent = p[0]
tabs = []
@@ -172,7 +184,9 @@ class Convert(object):
break
indent = len(tabs) * self.settings.default_tab_stop
style = self.styles.resolve(wp)
if style.text_indent is inherit or (hasattr(style.text_indent, 'endswith') and style.text_indent.endswith('pt')):
if (style.text_indent is inherit or
(hasattr(style.text_indent, 'endswith') and
style.text_indent.endswith('pt'))):
if style.text_indent is not inherit:
indent = float(style.text_indent[:-2]) + indent
style.text_indent = '%.3gpt' % indent
@@ -197,7 +211,8 @@ class Convert(object):
except (TypeError, ValueError):
lvl = 0
numbered.append((html_obj, num_id, lvl))
self.numbering.apply_markup(numbered, self.body, self.styles, self.object_map, self.images)
self.numbering.apply_markup(numbered, self.body, self.styles,
self.object_map, self.images)
self.apply_frames()
if len(self.body) > 0:
@@ -232,13 +247,15 @@ class Convert(object):
self.fields.polish_markup(self.object_map)
self.log.debug('Cleaning up redundant markup generated by Word')
self.cover_image = cleanup_markup(self.log, self.html, self.styles, self.dest_dir, self.detect_cover, self.namespace.XPath)
self.cover_image = cleanup_markup(self.log, self.html, self.styles,
self.dest_dir, self.detect_cover,
self.namespace.XPath)
return self.write(doc)
def read_page_properties(self, doc):
current = []
self.page_map = OrderedDict()
self.page_map = collections.OrderedDict()
self.section_starts = []
for p in self.namespace.descendants(doc, 'w:p', 'w:tbl'):
@@ -267,7 +284,8 @@ class Convert(object):
def resolve_alternate_content(self, doc):
# For proprietary extensions in Word documents use the fallback, spec
# compliant form
# See https://wiki.openoffice.org/wiki/OOXML/Markup_Compatibility_and_Extensibility
# See https://wiki.openoffice.org/wiki/OOXML/Markup_Compatibility_and_Extensibility  # noqa: E501
for ac in self.namespace.descendants(doc, 'mc:AlternateContent'):
choices = self.namespace.XPath('./mc:Choice')(ac)
fallbacks = self.namespace.XPath('./mc:Fallback')(ac)
@@ -284,7 +302,8 @@ class Convert(object):
cname[-1] = defname
if self.docx.exists('/'.join(cname)):
name = name
if name and name.startswith('word/word') and not self.docx.exists(name):
if (name and name.startswith('word/word') and
not self.docx.exists(name)):
name = name.partition('/')[2]
return name
@@ -327,7 +346,8 @@ class Convert(object):
self.log.warn('Endnotes %s do not exist' % enname)
else:
enrel = self.docx.get_relationships(enname)
footnotes(etree.fromstring(foraw) if foraw else None, forel, etree.fromstring(enraw) if enraw else None, enrel)
footnotes(etree.fromstring(foraw) if foraw else None, forel,
etree.fromstring(enraw) if enraw else None, enrel)
if fname is not None:
embed_relationships = self.docx.get_relationships(fname)[0]
@@ -336,7 +356,8 @@ class Convert(object):
except KeyError:
self.log.warn('Fonts table %s does not exist' % fname)
else:
fonts(etree.fromstring(raw), embed_relationships, self.docx, self.dest_dir)
fonts(etree.fromstring(raw), embed_relationships, self.docx,
self.dest_dir)
if tname is not None:
try:
@@ -364,16 +385,20 @@ class Convert(object):
except KeyError:
self.log.warn('Numbering styles %s do not exist' % nname)
else:
numbering(etree.fromstring(raw), self.styles, self.docx.get_relationships(nname)[0])
numbering(etree.fromstring(raw), self.styles,
self.docx.get_relationships(nname)[0])
self.styles.resolve_numbering(numbering)
def write(self, doc):
toc = create_toc(doc, self.body, self.resolved_link_map, self.styles, self.object_map, self.log, self.namespace)
raw = html.tostring(self.html, encoding='utf-8', doctype='<!DOCTYPE html>')
toc = create_toc(doc, self.body, self.resolved_link_map, self.styles,
self.object_map, self.log, self.namespace)
raw = html.tostring(self.html, encoding='utf-8',
doctype='<!DOCTYPE html>')
with open(os.path.join(self.dest_dir, 'index.html'), 'wb') as f:
f.write(raw)
css = self.styles.generate_css(self.dest_dir, self.docx, self.notes_nopb, self.nosupsub)
css = self.styles.generate_css(self.dest_dir, self.docx,
self.notes_nopb, self.nosupsub)
if css:
with open(os.path.join(self.dest_dir, 'docx.css'), 'wb') as f:
f.write(css.encode('utf-8'))
@@ -394,23 +419,29 @@ class Convert(object):
title='Table of Contents',
type='toc'))
toc_file = os.path.join(self.dest_dir, 'toc.ncx')
with open(os.path.join(self.dest_dir, 'metadata.opf'), 'wb') as of, open(toc_file, 'wb') as ncx:
with open(os.path.join(self.dest_dir,
'metadata.opf'), 'wb') as of, open(toc_file,
'wb') as ncx:
opf.render(of, ncx, 'toc.ncx', process_guide=process_guide)
if os.path.getsize(toc_file) == 0:
os.remove(toc_file)
return os.path.join(self.dest_dir, 'metadata.opf')
def read_block_anchors(self, doc):
doc_anchors = frozenset(self.namespace.XPath('./w:body/w:bookmarkStart[@w:name]')(doc))
doc_anchors = frozenset(self.namespace.XPath('./w:body/w:bookmarkStart'
'[@w:name]')(doc))
if doc_anchors:
current_bm = set()
rmap = {v:k for k, v in self.object_map.items()}
for p in self.namespace.descendants(doc, 'w:p', 'w:bookmarkStart[@w:name]'):
rmap = {v: k for k, v in self.object_map.items()}
for p in self.namespace.descendants(doc, 'w:p',
'w:bookmarkStart[@w:name]'):
if p.tag.endswith('}p'):
if current_bm and p in rmap:
para = rmap[p]
if 'id' not in para.attrib:
para.set('id', generate_anchor(next(iter(current_bm)), frozenset(self.anchor_map.values())))
_bm = next(iter(current_bm))
_am = frozenset(self.anchor_map.values())
para.set('id', generate_anchor(_bm, _am))
for name in current_bm:
self.anchor_map[name] = para.get('id')
current_bm = set()
@@ -442,13 +473,15 @@ class Convert(object):
except AttributeError:
break
for x in self.namespace.descendants(p, 'w:r', 'w:bookmarkStart', 'w:hyperlink', 'w:instrText'):
for x in self.namespace.descendants(p, 'w:r', 'w:bookmarkStart',
'w:hyperlink', 'w:instrText'):
if p_parent(x) is not p:
continue
if x.tag.endswith('}r'):
span = self.convert_run(x)
if current_anchor is not None:
(dest if len(dest) == 0 else span).set('id', current_anchor)
(dest if len(dest) == 0 else span).set('id',
current_anchor)
current_anchor = None
if current_hyperlink is not None:
try:
@@ -462,11 +495,14 @@ class Convert(object):
self.layers[p].append(x)
elif x.tag.endswith('}bookmarkStart'):
anchor = self.namespace.get(x, 'w:name')
if anchor and anchor not in self.anchor_map and anchor != '_GoBack':
if (anchor and anchor not in self.anchor_map and
anchor != '_GoBack'):
# _GoBack is a special bookmark inserted by Word 2010 for
# the return to previous edit feature, we ignore it
old_anchor = current_anchor
self.anchor_map[anchor] = current_anchor = generate_anchor(anchor, frozenset(self.anchor_map.values()))
current_anchor = generate_anchor(
anchor, frozenset(self .anchor_map.values()))
self.anchor_map[anchor] = current_anchor
if old_anchor is not None:
# The previous anchor was not applied to any element
for a, t in tuple(self.anchor_map.items()):
@@ -474,10 +510,13 @@ class Convert(object):
self.anchor_map[a] = current_anchor
elif x.tag.endswith('}hyperlink'):
current_hyperlink = x
elif x.tag.endswith('}instrText') and x.text and x.text.strip().startswith('TOC '):
elif (x.tag.endswith('}instrText') and x.text and
x.text.strip().startswith('TOC ')):
old_anchor = current_anchor
anchor = str(uuid.uuid4())
self.anchor_map[anchor] = current_anchor = generate_anchor('toc', frozenset(self.anchor_map.values()))
current_anchor = generate_anchor(
'toc', frozenset(self.anchor_map.values()))
self.anchor_map[anchor] = current_anchor
self.toc_anchor = current_anchor
if old_anchor is not None:
# The previous anchor was not applied to any element
@@ -489,7 +528,8 @@ class Convert(object):
dest.set('id', current_anchor)
current_anchor = None
m = re.match(r'heading\s+(\d+)$', style.style_name or '', re.IGNORECASE)
m = re.match(r'heading\s+(\d+)$', style.style_name or '',
re.IGNORECASE)
if m is not None:
n = min(6, max(1, int(m.group(1))))
dest.tag = 'h%d' % n
@@ -533,7 +573,8 @@ class Convert(object):
if len(dest) > 0 and not dest[-1].tail:
if dest[-1].tag == 'br':
dest[-1].tail = NBSP
elif len(dest[-1]) > 0 and dest[-1][-1].tag == 'br' and not dest[-1][-1].tail:
elif (len(dest[-1]) > 0 and dest[-1][-1].tag == 'br' and
not dest[-1][-1].tail):
dest[-1][-1].tail = NBSP
return dest
@@ -578,12 +619,12 @@ class Convert(object):
if anchor and anchor in self.anchor_map:
span.set('href', '#' + self.anchor_map[anchor])
continue
self.log.warn('Hyperlink with unknown target (rid=%s, anchor=%s), ignoring' %
(rid, anchor))
self.log.warn('Hyperlink with unknown target (rid=%s, anchor=%s), '
'ignoring' % (rid, anchor))
# hrefs that point nowhere give epubcheck a hernia. The element
# should be styled explicitly by Word anyway.
# span.set('href', '#')
rmap = {v:k for k, v in self.object_map.items()}
rmap = {v: k for k, v in self.object_map.items()}
for hyperlink, runs in self.fields.hyperlink_fields:
spans = [rmap[r] for r in runs if r in rmap]
if not spans:
@@ -604,7 +645,8 @@ class Convert(object):
if anchor in self.anchor_map:
span.set('href', '#' + self.anchor_map[anchor])
continue
self.log.warn('Hyperlink field with unknown anchor: %s' % anchor)
self.log.warn('Hyperlink field with unknown anchor: %s' %
anchor)
else:
if url in self.anchor_map:
span.set('href', '#' + self.anchor_map[url])
@@ -652,7 +694,8 @@ class Convert(object):
# actually needs it, i.e. if it has more than one
# consecutive space or it has newlines or tabs.
multi_spaces = self.ms_pat.search(ctext) is not None
preserve = multi_spaces or self.ws_pat.search(ctext) is not None
preserve = (multi_spaces or
self.ws_pat.search(ctext) is not None)
if preserve:
text.add_elem(SPAN(ctext, style="white-space:pre-wrap"))
ans.append(text.elem)
@@ -668,24 +711,30 @@ class Convert(object):
else:
clear = child.get('clear', None)
if clear in {'all', 'left', 'right'}:
br = BR(style='clear:%s'%('both' if clear == 'all' else clear))
br = BR(style='clear:%s' % ('both' if clear == 'all'
else clear))
else:
br = BR()
text.add_elem(br)
ans.append(text.elem)
elif self.namespace.is_tag(child, 'w:drawing') or self.namespace.is_tag(child, 'w:pict'):
for img in self.images.to_html(child, self.current_page, self.docx, self.dest_dir):
elif (self.namespace.is_tag(child, 'w:drawing') or
self.namespace.is_tag(child, 'w:pict')):
for img in self.images.to_html(child, self.current_page,
self.docx, self.dest_dir):
text.add_elem(img)
ans.append(text.elem)
elif self.namespace.is_tag(child, 'w:footnoteReference') or self.namespace.is_tag(child, 'w:endnoteReference'):
elif (self.namespace.is_tag(child, 'w:footnoteReference') or
self.namespace.is_tag(child, 'w:endnoteReference')):
anchor, name = self.footnotes.get_ref(child)
if anchor and name:
l = A(name, id='back_%s' % anchor, href='#' + anchor, title=name)
l.set('class', 'noteref')
text.add_elem(l)
_l = A(name, id='back_%s' % anchor, href='#' + anchor,
title=name)
_l.set('class', 'noteref')
text.add_elem(_l)
ans.append(text.elem)
elif self.namespace.is_tag(child, 'w:tab'):
spaces = int(math.ceil((self.settings.default_tab_stop / 36) * 6))
spaces = int(math.ceil((self.settings.default_tab_stop / 36) *
6))
text.add_elem(SPAN(NBSP * spaces))
ans.append(text.elem)
ans[-1].set('class', 'tab')
@@ -699,7 +748,8 @@ class Convert(object):
style = self.styles.resolve_run(run)
if style.vert_align in {'superscript', 'subscript'}:
if ans.text or len(ans):
ans.set('data-docx-vert', 'sup' if style.vert_align == 'superscript' else 'sub')
ans.set('data-docx-vert',
'sup' if style.vert_align == 'superscript' else 'sub')
if style.lang is not inherit:
lang = html_lang(style.lang)
if lang is not None and lang != self.doc_lang:
@@ -738,12 +788,14 @@ class Convert(object):
idx = parent.index(paras[0])
frame = DIV(*paras)
parent.insert(idx, frame)
self.framed_map[frame] = css = style.css(self.page_map[self.object_map[paras[0]]])
self.framed_map[frame] = css = style.css(
self.page_map[self.object_map[paras[0]]])
self.styles.register(css, 'frame')
if not self.block_runs:
return
rmap = {v:k for k, v in self.object_map.items()}
rmap = {v: k for k, v in self.object_map.items()}
for border_style, blocks in self.block_runs:
paras = tuple(rmap[p] for p in blocks)
for p in paras:
@@ -796,17 +848,20 @@ class Convert(object):
else:
border_style = style.clone_border_styles()
if has_visible_border:
# Copy the paragraph's top margin onto the border style *before* resetting
# the paragraph's own margin; the reverse order would copy `inherit`.
border_style.margin_top = style.margin_top
style.margin_top = inherit
if p is not run[-1]:
style.padding_bottom = 0
else:
if has_visible_border:
# Copy the paragraph's bottom margin onto the border style *before*
# resetting the paragraph's own margin; the reverse order would copy
# `inherit`.
border_style.margin_bottom = style.margin_bottom
style.margin_bottom = inherit
style.clear_borders()
if p is not run[-1]:
style.apply_between_border()
if has_visible_border:
border_style.margin_left, border_style.margin_right = max_left,max_right
border_style.margin_left = max_left
border_style.margin_right = max_right
self.block_runs.append((border_style, run))
run = []

View File

@@ -1,5 +1,6 @@
import mimetypes
import textwrap, os
import os
import textwrap
from lxml import etree
from lxml.builder import ElementMaker
@@ -9,22 +10,48 @@ from ebook_converter.ebooks.docx.names import DOCXNamespace
from ebook_converter.ebooks.metadata import authors_to_string
from ebook_converter.ebooks.pdf.render.common import PAPER_SIZES
from ebook_converter.utils.date import utcnow
from ebook_converter.utils.localization import canonicalize_lang, lang_as_iso639_1
from ebook_converter.utils.localization import canonicalize_lang
from ebook_converter.utils.localization import lang_as_iso639_1
from ebook_converter.utils.zipfile import ZipFile
# Content type for each listed DOCX package part, keyed by part name.
# DOCX.contenttypes emits one Override element per entry.
WORD_TYPES = {
    "/word/footnotes.xml": (
        "application/vnd.openxmlformats-officedocument"
        ".wordprocessingml.footnotes+xml"),
    "/word/document.xml": (
        "application/vnd.openxmlformats-officedocument"
        ".wordprocessingml.document.main+xml"),
    "/word/numbering.xml": (
        "application/vnd.openxmlformats-officedocument"
        ".wordprocessingml.numbering+xml"),
    "/word/styles.xml": (
        "application/vnd.openxmlformats-officedocument"
        ".wordprocessingml.styles+xml"),
    "/word/endnotes.xml": (
        "application/vnd.openxmlformats-officedocument"
        ".wordprocessingml.endnotes+xml"),
    "/word/settings.xml": (
        "application/vnd.openxmlformats-officedocument"
        ".wordprocessingml.settings+xml"),
    "/word/theme/theme1.xml": (
        "application/vnd.openxmlformats-officedocument"
        ".theme+xml"),
    "/word/fontTable.xml": (
        "application/vnd.openxmlformats-officedocument"
        ".wordprocessingml.fontTable+xml"),
    "/word/webSettings.xml": (
        "application/vnd.openxmlformats-officedocument"
        ".wordprocessingml.webSettings+xml"),
    "/docProps/core.xml": (
        "application/vnd.openxmlformats-package"
        ".core-properties+xml"),
    "/docProps/app.xml": (
        "application/vnd.openxmlformats-officedocument"
        ".extended-properties+xml"),
}
def xml2str(root, pretty_print=False, with_tail=False):
    """Serialize the element *root* to UTF-8 encoded XML.

    The output carries an XML declaration. If the installed ``etree`` module
    provides ``cleanup_namespaces`` it is called on *root* before
    serializing. *pretty_print* and *with_tail* are forwarded unchanged to
    ``etree.tostring``.
    """
    # Guarded with hasattr so an etree without cleanup_namespaces still works.
    if hasattr(etree, 'cleanup_namespaces'):
        etree.cleanup_namespaces(root)
    return etree.tostring(root, encoding='utf-8', xml_declaration=True,
                          pretty_print=pretty_print, with_tail=with_tail)
def page_size(opts):
    """Return the ``(width, height)`` page size selected by *opts*.

    The preset named by ``opts.docx_page_size`` is looked up in
    ``PAPER_SIZES``. If ``opts.docx_custom_page_size`` is not ``None`` it
    takes precedence: it is split at the first ``x`` and both halves are
    converted with ``float``.
    """
    # The preset lookup always runs, so a failing lookup propagates even when
    # a custom size is supplied.
    width, height = PAPER_SIZES[opts.docx_page_size]
    custom = opts.docx_custom_page_size
    if custom is not None:
        # partition('x')[0::2] -> (text before the first 'x', text after it)
        width, height = map(float, custom.partition('x')[0::2])
    return width, height
@@ -47,7 +74,9 @@ def create_skeleton(opts, namespaces=None):
def w(x):
return '{%s}%s' % (namespaces['w'], x)
dn = {k:v for k, v in namespaces.items() if k in {'w', 'r', 'm', 've', 'o', 'wp', 'w10', 'wne', 'a', 'pic'}}
dn = {k: v for k, v in namespaces.items() if k in {'w', 'r', 'm', 've',
'o', 'wp', 'w10', 'wne',
'a', 'pic'}}
E = ElementMaker(namespace=dn['w'], nsmap=dn)
doc = E.document()
body = E.body()
@@ -59,27 +88,32 @@ def create_skeleton(opts, namespaces=None):
val = page_margin(opts, which)
return w(which), str(int(val * 20))
body.append(E.sectPr(
E.pgSz(**{w('w'):str(width), w('h'):str(height)}),
E.pgSz(**{w('w'): str(width), w('h'): str(height)}),
E.pgMar(**dict(map(margin, 'left top right bottom'.split()))),
E.cols(**{w('space'):'720'}),
E.docGrid(**{w('linePitch'):"360"}),
E.cols(**{w('space'): '720'}),
E.docGrid(**{w('linePitch'): "360"}),
))
dn = {k:v for k, v in namespaces.items() if k in tuple('wra') + ('wp',)}
dn = {k: v for k, v in namespaces.items() if k in tuple('wra') + ('wp',)}
E = ElementMaker(namespace=dn['w'], nsmap=dn)
styles = E.styles(
E.docDefaults(
E.rPrDefault(
E.rPr(
E.rFonts(**{w('asciiTheme'):"minorHAnsi", w('eastAsiaTheme'):"minorEastAsia", w('hAnsiTheme'):"minorHAnsi", w('cstheme'):"minorBidi"}),
E.sz(**{w('val'):'22'}),
E.szCs(**{w('val'):'22'}),
E.lang(**{w('val'):'en-US', w('eastAsia'):"en-US", w('bidi'):"ar-SA"})
E.rFonts(**{w('asciiTheme'): "minorHAnsi",
w('eastAsiaTheme'): "minorEastAsia",
w('hAnsiTheme'): "minorHAnsi",
w('cstheme'): "minorBidi"}),
E.sz(**{w('val'): '22'}),
E.szCs(**{w('val'): '22'}),
E.lang(**{w('val'): 'en-US', w('eastAsia'): "en-US",
w('bidi'): "ar-SA"})
)
),
E.pPrDefault(
E.pPr(
E.spacing(**{w('after'):"0", w('line'):"276", w('lineRule'):"auto"})
E.spacing(**{w('after'): "0", w('line'): "276",
w('lineRule'): "auto"})
)
)
)
@@ -103,8 +137,8 @@ def update_doc_props(root, mi, namespace):
if mi.comments:
setm('description', mi.comments)
if mi.languages:
l = canonicalize_lang(mi.languages[0])
setm('language', lang_as_iso639_1(l) or l)
_l = canonicalize_lang(mi.languages[0])
setm('language', lang_as_iso639_1(_l) or _l)
class DocumentRelationships(object):
@@ -115,8 +149,7 @@ class DocumentRelationships(object):
for typ, target in {namespace.names['STYLES']: 'styles.xml',
namespace.names['NUMBERING']: 'numbering.xml',
namespace.names['WEB_SETTINGS']: 'webSettings.xml',
namespace.names['FONTS']: 'fontTable.xml',
}.items():
namespace.names['FONTS']: 'fontTable.xml'}.items():
self.add_relationship(target, typ)
def get_relationship_id(self, target, rtype, target_mode=None):
@@ -134,7 +167,8 @@ class DocumentRelationships(object):
def serialize(self):
namespaces = self.namespace.namespaces
E = ElementMaker(namespace=namespaces['pr'], nsmap={None:namespaces['pr']})
E = ElementMaker(namespace=namespaces['pr'],
nsmap={None: namespaces['pr']})
relationships = E.Relationships()
for (target, rtype, target_mode), rid in self.rmap.items():
r = E.Relationship(Id=rid, Type=rtype, Target=target)
@@ -151,9 +185,12 @@ class DOCX(object):
namespaces = self.namespace.namespaces
self.opts, self.log = opts, log
self.document_relationships = DocumentRelationships(self.namespace)
self.font_table = etree.Element('{%s}fonts' % namespaces['w'], nsmap={k:namespaces[k] for k in 'wr'})
self.numbering = etree.Element('{%s}numbering' % namespaces['w'], nsmap={k:namespaces[k] for k in 'wr'})
E = ElementMaker(namespace=namespaces['pr'], nsmap={None:namespaces['pr']})
self.font_table = etree.Element('{%s}fonts' % namespaces['w'],
nsmap={k: namespaces[k] for k in 'wr'})
self.numbering = etree.Element('{%s}numbering' % namespaces['w'],
nsmap={k: namespaces[k] for k in 'wr'})
E = ElementMaker(namespace=namespaces['pr'],
nsmap={None: namespaces['pr']})
self.embedded_fonts = E.Relationships()
self.fonts = {}
self.images = {}
@@ -161,21 +198,10 @@ class DOCX(object):
# Boilerplate {{{
@property
def contenttypes(self):
E = ElementMaker(namespace=self.namespace.namespaces['ct'], nsmap={None:self.namespace.namespaces['ct']})
E = ElementMaker(namespace=self.namespace.namespaces['ct'],
nsmap={None: self.namespace.namespaces['ct']})
types = E.Types()
for partname, mt in {
"/word/footnotes.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.footnotes+xml",
"/word/document.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml",
"/word/numbering.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.numbering+xml",
"/word/styles.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.styles+xml",
"/word/endnotes.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.endnotes+xml",
"/word/settings.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.settings+xml",
"/word/theme/theme1.xml": "application/vnd.openxmlformats-officedocument.theme+xml",
"/word/fontTable.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.fontTable+xml",
"/word/webSettings.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.webSettings+xml",
"/docProps/core.xml": "application/vnd.openxmlformats-package.core-properties+xml",
"/docProps/app.xml": "application/vnd.openxmlformats-officedocument.extended-properties+xml",
}.items():
for partname, mt in WORD_TYPES.items():
types.append(E.Override(PartName=partname, ContentType=mt))
added = {'png', 'gif', 'jpeg', 'jpg', 'svg', 'xml'}
for ext in added:
@@ -199,7 +225,8 @@ class DOCX(object):
@property
def appproperties(self):
E = ElementMaker(namespace=self.namespace.namespaces['ep'], nsmap={None:self.namespace.namespaces['ep']})
E = ElementMaker(namespace=self.namespace.namespaces['ep'],
nsmap={None: self.namespace.namespaces['ep']})
props = E.Properties(
E.Application(__appname__),
E.AppVersion('%02d.%04d' % numeric_version[:2]),
@@ -216,16 +243,17 @@ class DOCX(object):
@property
def containerrels(self):
return textwrap.dedent('''\
<?xml version='1.0' encoding='utf-8'?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId3" Type="{APPPROPS}" Target="docProps/app.xml"/>
<Relationship Id="rId2" Type="{DOCPROPS}" Target="docProps/core.xml"/>
<Relationship Id="rId1" Type="{DOCUMENT}" Target="word/document.xml"/>
</Relationships>'''.format(**self.namespace.names)).encode('utf-8')
<?xml version='1.0' encoding='utf-8'?>
<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
<Relationship Id="rId3" Type="{APPPROPS}" Target="docProps/app.xml"/>
<Relationship Id="rId2" Type="{DOCPROPS}" Target="docProps/core.xml"/>
<Relationship Id="rId1" Type="{DOCUMENT}" Target="word/document.xml"/>
</Relationships>'''.format(**self.namespace.names)).encode('utf-8') # noqa
@property
def websettings(self):
E = ElementMaker(namespace=self.namespace.namespaces['w'], nsmap={'w':self.namespace.namespaces['w']})
E = ElementMaker(namespace=self.namespace.namespaces['w'],
nsmap={'w': self.namespace.namespaces['w']})
ws = E.webSettings(
E.optimizeForBrowser, E.allowPNG, E.doNotSaveAsSingleFile)
return xml2str(ws)
@@ -234,11 +262,15 @@ class DOCX(object):
def convert_metadata(self, mi):
namespaces = self.namespace.namespaces
E = ElementMaker(namespace=namespaces['cp'], nsmap={x:namespaces[x] for x in 'cp dc dcterms xsi'.split()})
E = ElementMaker(namespace=namespaces['cp'],
nsmap={x: namespaces[x]
for x in 'cp dc dcterms xsi'.split()})
cp = E.coreProperties(E.revision("1"), E.lastModifiedBy('calibre'))
ts = utcnow().isoformat('T').rpartition('.')[0] + 'Z'
for x in 'created modified'.split():
x = cp.makeelement('{%s}%s' % (namespaces['dcterms'], x), **{'{%s}type' % namespaces['xsi']:'dcterms:W3CDTF'})
x = cp.makeelement('{%s}%s' % (namespaces['dcterms'], x),
**{'{%s}type' %
namespaces['xsi']: 'dcterms:W3CDTF'})
x.text = ts
cp.append(x)
self.mi = mi
@@ -261,8 +293,10 @@ class DOCX(object):
zf.writestr('word/styles.xml', xml2str(self.styles))
zf.writestr('word/numbering.xml', xml2str(self.numbering))
zf.writestr('word/fontTable.xml', xml2str(self.font_table))
zf.writestr('word/_rels/document.xml.rels', self.document_relationships.serialize())
zf.writestr('word/_rels/fontTable.xml.rels', xml2str(self.embedded_fonts))
zf.writestr('word/_rels/document.xml.rels',
self.document_relationships.serialize())
zf.writestr('word/_rels/fontTable.xml.rels',
xml2str(self.embedded_fonts))
for fname, data_getter in self.images.items():
zf.writestr(fname, data_getter())
for fname, data in self.fonts.items():

View File

@@ -18,7 +18,7 @@ try:
_author_pat = re.compile(tweaks['authors_split_regex'])
except Exception:
prints('Author split regexp:', tweaks['authors_split_regex'],
'is invalid, using default')
'is invalid, using default')
_author_pat = re.compile(r'(?i),?\s+(and|with)\s+')
@@ -76,7 +76,8 @@ def author_to_author_sort(author, method=None):
if method == 'copy':
return author
prefixes = {force_unicode(y).lower() for y in tweaks['author_name_prefixes']}
prefixes = {force_unicode(y).lower()
for y in tweaks['author_name_prefixes']}
prefixes |= {y+'.' for y in prefixes}
while True:
if not tokens:
@@ -87,7 +88,8 @@ def author_to_author_sort(author, method=None):
else:
break
suffixes = {force_unicode(y).lower() for y in tweaks['author_name_suffixes']}
suffixes = {force_unicode(y).lower()
for y in tweaks['author_name_suffixes']}
suffixes |= {y+'.' for y in suffixes}
suffix = ''
@@ -144,7 +146,7 @@ def get_title_sort_pat(lang=None):
except:
ans = frozenset((r'A\s+', r'The\s+', r'An\s+'))
ans = '|'.join(ans)
ans = '^(%s)'%ans
ans = '^(%s)' % ans
try:
ans = re.compile(ans, re.IGNORECASE)
except:
@@ -154,7 +156,7 @@ def get_title_sort_pat(lang=None):
_ignore_starts = '\'"'+''.join(chr(x) for x in
list(range(0x2018, 0x201e))+[0x2032, 0x2033])
list(range(0x2018, 0x201e))+[0x2032, 0x2033])
def title_sort(title, order=None, lang=None):

View File

@@ -12,8 +12,7 @@ from lxml import etree
from ebook_converter.utils.date import parse_only_date
from ebook_converter.utils.img import save_cover_data_to
from ebook_converter.utils.imghdr import identify
from ebook_converter import guess_type, guess_all_extensions, prints, \
force_unicode
from ebook_converter import guess_all_extensions, prints, force_unicode
from ebook_converter.ebooks.metadata import MetaInformation, check_isbn
from ebook_converter.ebooks.chardet import xml_to_unicode
from ebook_converter.polyglot.binary import as_base64_unicode

View File

@@ -10,11 +10,11 @@ import mimetypes
import os
import re
import sys
import textwrap
import traceback
import unittest
import urllib.parse
import uuid
import traceback
import textwrap
from lxml import etree
from lxml.builder import ElementMaker
@@ -32,7 +32,7 @@ from ebook_converter.ebooks.metadata import string_to_authors, \
from ebook_converter.ebooks.metadata.book.base import Metadata
from ebook_converter.utils.date import parse_date, isoformat
from ebook_converter.utils.localization import get_lang, canonicalize_lang
from ebook_converter import prints, guess_type
from ebook_converter import prints
from ebook_converter.utils.cleantext import clean_ascii_chars, clean_xml_chars
from ebook_converter.utils.config import tweaks
from ebook_converter.polyglot.urllib import unquote
@@ -1807,8 +1807,7 @@ def test_m2o():
class OPFTest(unittest.TestCase):
def setUp(self):
self.stream = io.BytesIO(
b'''\
self.stream = io.BytesIO(b'''\
<?xml version="1.0" encoding="UTF-8"?>
<package version="2.0" xmlns="http://www.idpf.org/2007/opf" >
<metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">
@@ -1827,8 +1826,7 @@ b'''\
<item id="1" href="a%20%7E%20b" media-type="text/txt" />
</manifest>
</package>
'''
)
''')
self.opf = OPF(self.stream, os.getcwd())
def testReading(self, opf=None):

View File

@@ -1,10 +1,15 @@
import shutil, os, re, struct, textwrap, io
import io
import logging
import mimetypes
import os
import re
import shutil
import struct
import textwrap
from lxml import html, etree
from ebook_converter import xml_entity_to_unicode, entity_to_unicode, guess_type
from ebook_converter import xml_entity_to_unicode, entity_to_unicode
from ebook_converter.utils.cleantext import clean_ascii_chars, clean_xml_chars
from ebook_converter.ebooks import DRMError, unit_convert
from ebook_converter.ebooks.chardet import strip_encoding_declarations
@@ -15,15 +20,11 @@ from ebook_converter.ebooks.metadata import MetaInformation
from ebook_converter.ebooks.metadata.opf2 import OPFCreator, OPF
from ebook_converter.ebooks.metadata.toc import TOC
from ebook_converter.ebooks.mobi.reader.headers import BookHeader
from ebook_converter.utils.img import save_cover_data_to, gif_data_to_png_data, AnimatedGIF
from ebook_converter.utils.img import save_cover_data_to, gif_data_to_png_data
from ebook_converter.utils.img import AnimatedGIF
from ebook_converter.utils.imghdr import what
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
class TopazError(ValueError):
    """ValueError subclass used as a distinct error type by this module.

    Because it derives from ValueError, callers that catch ValueError
    also catch this exception.
    NOTE(review): presumably raised when the input is a Topaz file --
    confirm at the raise site, which is not visible here.
    """
@@ -38,13 +39,14 @@ class KFXError(ValueError):
class MobiReader(object):
PAGE_BREAK_PAT = re.compile(
r'<\s*/{0,1}\s*mbp:pagebreak((?:\s+[^/>]*){0,1})/{0,1}\s*>\s*(?:<\s*/{0,1}\s*mbp:pagebreak\s*/{0,1}\s*>)*',
re.IGNORECASE)
PAGE_BREAK_PAT = re.compile(r'<\s*/{0,1}\s*mbp:pagebreak((?:\s+[^/>]*)'
r'{0,1})/{0,1}\s*>\s*(?:<\s*/{0,1}'
r'\s*mbp:pagebreak\s*/{0,1}\s*>)*',
re.IGNORECASE)
IMAGE_ATTRS = ('lowrecindex', 'recindex', 'hirecindex')
def __init__(self, filename_or_stream, log, user_encoding=None, debug=None,
try_extra_data_fix=False):
try_extra_data_fix=False):
self.log = log
self.debug = debug
self.embedded_mi = None
@@ -83,8 +85,8 @@ class MobiReader(object):
if raw.startswith(b'\xeaDRMION\xee'):
raise KFXError()
self.header = raw[0:72]
self.name = self.header[:32].replace(b'\x00', b'')
self.header = raw[0:72]
self.name = self.header[:32].replace(b'\x00', b'')
self.num_sections, = struct.unpack('>H', raw[76:78])
self.ident = self.header[0x3C:0x3C + 8].upper()
@@ -94,7 +96,9 @@ class MobiReader(object):
self.sections = []
self.section_headers = []
for i in range(self.num_sections):
offset, a1, a2, a3, a4 = struct.unpack('>LBBBB', raw[78 + i * 8:78 + i * 8 + 8])
offset, a1, a2, a3, a4 = struct.unpack('>LBBBB',
raw[78 + i * 8:78 +
i * 8 + 8])
flags, val = a1, a2 << 16 | a3 << 8 | a4
self.section_headers.append((offset, flags, val))
@@ -109,8 +113,9 @@ class MobiReader(object):
for i in range(self.num_sections):
self.sections.append((section(i), self.section_headers[i]))
self.book_header = bh = BookHeader(self.sections[0][0], self.ident,
user_encoding, self.log, try_extra_data_fix=try_extra_data_fix)
bh = BookHeader(self.sections[0][0], self.ident, user_encoding,
self.log, try_extra_data_fix=try_extra_data_fix)
self.book_header = bh
self.name = self.name.decode(self.book_header.codec, 'replace')
self.kf8_type = None
k8i = getattr(self.book_header.exth, 'kf8_header', None)
@@ -118,18 +123,20 @@ class MobiReader(object):
# Ancient PRC files from Baen can have random values for
# mobi_version, so be conservative
if (self.book_header.mobi_version == 8 and hasattr(self.book_header,
'skelidx')):
'skelidx')):
self.kf8_type = 'standalone'
elif k8i is not None: # Check for joint mobi 6 and kf 8 file
try:
raw = self.sections[k8i-1][0]
except:
except Exception:
raw = None
if raw == b'BOUNDARY':
try:
self.book_header = BookHeader(self.sections[k8i][0],
self.ident, user_encoding, self.log)
self.book_header.kf8_first_image_index = self.book_header.first_image_index + k8i
self.ident, user_encoding,
self.log)
_kfii = self.book_header.first_image_index + k8i
self.book_header.kf8_first_image_index = _kfii
self.book_header.mobi6_records = bh.records
# Need the first_image_index from the mobi 6 header as well
@@ -143,14 +150,14 @@ class MobiReader(object):
self.kf8_type = 'joint'
self.kf8_boundary = k8i-1
except:
except Exception:
self.book_header = bh
def check_for_drm(self):
if self.book_header.encryption_type != 0:
try:
name = self.book_header.exth.mi.title
except:
except Exception:
name = self.name
if not name:
name = self.name
@@ -163,20 +170,20 @@ class MobiReader(object):
if self.debug is not None:
parse_cache['calibre_raw_mobi_markup'] = self.mobi_html
self.add_anchors()
self.processed_html = self.processed_html.decode(self.book_header.codec,
'ignore')
self.processed_html = self.processed_html.decode(
self.book_header.codec, 'ignore')
self.processed_html = self.processed_html.replace('</</', '</')
self.processed_html = re.sub(r'</([a-zA-Z]+)<', r'</\1><',
self.processed_html)
self.processed_html)
self.processed_html = self.processed_html.replace('\ufeff', '')
# Remove tags of the form <xyz: ...> as they can cause issues further
# along the pipeline
self.processed_html = re.sub(r'</{0,1}[a-zA-Z]+:\s+[^>]*>', '',
self.processed_html)
self.processed_html)
self.processed_html = strip_encoding_declarations(self.processed_html)
self.processed_html = re.sub(r'&(\S+?);', xml_entity_to_unicode,
self.processed_html)
self.processed_html)
image_name_map = self.extract_images(processed_records, output_dir)
self.replace_page_breaks()
self.cleanup_html()
@@ -186,31 +193,41 @@ class MobiReader(object):
try:
root = html.fromstring(self.processed_html)
if len(root.xpath('//html')) > 5:
root = html.fromstring(self.processed_html.replace('\x0c',
'').replace('\x14', ''))
root = html.fromstring(self.processed_html
.replace('\x0c', '')
.replace('\x14', ''))
except Exception:
self.log.warning('MOBI markup appears to contain random bytes. Stripping.')
self.log.warning('MOBI markup appears to contain random bytes. '
'Stripping.')
self.processed_html = self.remove_random_bytes(self.processed_html)
root = html.fromstring(self.processed_html)
if root.xpath('descendant::p/descendant::p'):
from html5_parser import parse
self.log.warning('Malformed markup, parsing using html5-parser')
self.processed_html = strip_encoding_declarations(self.processed_html)
self.processed_html = strip_encoding_declarations(
self.processed_html)
# These trip up the html5 parser causing all content to be placed
# under the <guide> tag
self.processed_html = re.sub(r'<metadata>.+?</metadata>', '', self.processed_html, flags=re.I)
self.processed_html = re.sub(r'<guide>.+?</guide>', '', self.processed_html, flags=re.I)
self.processed_html = re.sub(r'<metadata>.+?</metadata>', '',
self.processed_html, flags=re.I)
self.processed_html = re.sub(r'<guide>.+?</guide>', '',
self.processed_html, flags=re.I)
try:
root = parse(self.processed_html, maybe_xhtml=False, keep_doctype=False, sanitize_names=True)
root = parse(self.processed_html, maybe_xhtml=False,
keep_doctype=False, sanitize_names=True)
except Exception:
self.log.warning('MOBI markup appears to contain random bytes. Stripping.')
self.processed_html = self.remove_random_bytes(self.processed_html)
root = parse(self.processed_html, maybe_xhtml=False, keep_doctype=False, sanitize_names=True)
self.log.warning('MOBI markup appears to contain random '
'bytes. Stripping.')
self.processed_html = self.remove_random_bytes(
self.processed_html)
root = parse(self.processed_html, maybe_xhtml=False,
keep_doctype=False, sanitize_names=True)
if len(root.xpath('body/descendant::*')) < 1:
# There are probably stray </html>s in the markup
self.processed_html = self.processed_html.replace('</html>',
'')
root = parse(self.processed_html, maybe_xhtml=False, keep_doctype=False, sanitize_names=True)
'')
root = parse(self.processed_html, maybe_xhtml=False,
keep_doctype=False, sanitize_names=True)
if root.tag != 'html':
self.log.warn('File does not have opening <html> tag')
@@ -253,13 +270,14 @@ class MobiReader(object):
head = root.makeelement('head', {})
root.insert(0, head)
head.text = '\n\t'
link = head.makeelement('link', {'type':'text/css',
'href':'styles.css', 'rel':'stylesheet'})
link = head.makeelement('link', {'type': 'text/css',
'href': 'styles.css',
'rel': 'stylesheet'})
head.insert(0, link)
link.tail = '\n\t'
title = head.xpath('descendant::title')
m = head.makeelement('meta', {'http-equiv':'Content-Type',
'content':'text/html; charset=utf-8'})
m = head.makeelement('meta', {'http-equiv': 'Content-Type',
'content': 'text/html; charset=utf-8'})
head.insert(0, m)
if not title:
title = head.makeelement('title', {})
@@ -283,7 +301,8 @@ class MobiReader(object):
try:
for ref in guide.xpath('descendant::reference'):
if 'href' in ref.attrib:
ref.attrib['href'] = os.path.basename(htmlfile) + ref.attrib['href']
ref.attrib['href'] = (os.path.basename(htmlfile) +
ref.attrib['href'])
except AttributeError:
pass
@@ -299,7 +318,7 @@ class MobiReader(object):
opf, ncx_manifest_entry = self.create_opf(htmlfile, guide, root)
self.created_opf_path = os.path.splitext(htmlfile)[0] + '.opf'
opf.render(open(self.created_opf_path, 'wb'), ncx,
ncx_manifest_entry=ncx_manifest_entry)
ncx_manifest_entry=ncx_manifest_entry)
ncx = ncx.getvalue()
if ncx:
ncx_path = os.path.join(os.path.dirname(htmlfile), 'toc.ncx')
@@ -313,9 +332,9 @@ class MobiReader(object):
if self.book_header.exth is not None or self.embedded_mi is not None:
self.log.debug('Creating OPF...')
ncx = io.BytesIO()
opf, ncx_manifest_entry = self.create_opf(htmlfile, guide, root)
opf, ncx_manifest_entry = self.create_opf(htmlfile, guide, root)
opf.render(open(os.path.splitext(htmlfile)[0] + '.opf', 'wb'), ncx,
ncx_manifest_entry)
ncx_manifest_entry)
ncx = ncx.getvalue()
if ncx:
write_as_utf8(os.path.splitext(htmlfile)[0] + '.ncx', ncx)
@@ -348,28 +367,46 @@ class MobiReader(object):
def cleanup_html(self):
self.log.debug('Cleaning up HTML...')
self.processed_html = re.sub(r'<div height="0(pt|px|ex|em|%){0,1}"></div>', '', self.processed_html)
if self.book_header.ancient and b'<html' not in self.mobi_html[:300].lower():
self.processed_html = '<html><p>' + self.processed_html.replace('\n\n', '<p>') + '</html>'
self.processed_html = re.sub(r'<div height="0(pt|px|ex|em|%){0,1}">'
'</div>', '', self.processed_html)
if (self.book_header.ancient and
b'<html' not in self.mobi_html[:300].lower()):
self.processed_html = ('<html><p>' +
self.processed_html.replace('\n\n', '<p>') +
'</html>')
self.processed_html = self.processed_html.replace('\r\n', '\n')
self.processed_html = self.processed_html.replace('> <', '>\n<')
self.processed_html = self.processed_html.replace('<mbp: ', '<mbp:')
self.processed_html = re.sub(r'<\?xml[^>]*>', '', self.processed_html)
self.processed_html = re.sub(r'<\s*(/?)\s*o:p[^>]*>', r'', self.processed_html)
# Swap inline and block level elements, and order block level elements according to priority
# - lxml and beautifulsoup expect/assume a specific order based on xhtml spec
self.processed_html = re.sub(
r'(?i)(?P<styletags>(<(h\d+|i|b|u|em|small|big|strong|tt)>\s*){1,})(?P<para><p[^>]*>)', r'\g<para>'+r'\g<styletags>', self.processed_html)
self.processed_html = re.sub(
r'(?i)(?P<para></p[^>]*>)\s*(?P<styletags>(</(h\d+|i|b|u|em|small|big|strong|tt)>\s*){1,})', r'\g<styletags>'+r'\g<para>', self.processed_html)
self.processed_html = re.sub(
r'(?i)(?P<blockquote>(</(blockquote|div)[^>]*>\s*){1,})(?P<para></p[^>]*>)', r'\g<para>'+r'\g<blockquote>', self.processed_html)
self.processed_html = re.sub(
r'(?i)(?P<para><p[^>]*>)\s*(?P<blockquote>(<(blockquote|div)[^>]*>\s*){1,})', r'\g<blockquote>'+r'\g<para>', self.processed_html)
self.processed_html = re.sub(r'<\s*(/?)\s*o:p[^>]*>', r'',
self.processed_html)
# Swap inline and block level elements, and order block level elements
# according to priority
# - lxml and beautifulsoup expect/assume a specific order based on
# xhtml spec
self.processed_html = re.sub(r'(?i)(?P<styletags>(<(h\d+|i|b|u|em|'
r'small|big|strong|tt)>\s*){1,})'
r'(?P<para><p[^>]*>)',
r'\g<para>' + r'\g<styletags>',
self.processed_html)
self.processed_html = re.sub(r'(?i)(?P<para></p[^>]*>)\s*'
r'(?P<styletags>(</(h\d+|i|b|u|em|small|'
r'big|strong|tt)>\s*){1,})',
r'\g<styletags>' + r'\g<para>',
self.processed_html)
self.processed_html = re.sub(r'(?i)(?P<blockquote>(</(blockquote|div)'
r'[^>]*>\s*){1,})(?P<para></p[^>]*>)',
r'\g<para>' + r'\g<blockquote>',
self.processed_html)
self.processed_html = re.sub(r'(?i)(?P<para><p[^>]*>)\s*'
r'(?P<blockquote>(<(blockquote|div)[^>]*>'
r'\s*){1,})',
r'\g<blockquote>' + r'\g<para>',
self.processed_html)
bods = htmls = 0
for x in re.finditer('</body>|</html>', self.processed_html):
if x == '</body>':
bods +=1
bods += 1
else:
htmls += 1
if bods > 1 and htmls > 1:
@@ -380,8 +417,8 @@ class MobiReader(object):
self.processed_html = self.processed_html.replace('</html>', '')
def remove_random_bytes(self, html):
    """Return ``html`` with a fixed set of stray characters removed.

    The characters dropped are U+0001..U+0008, U+0012..U+0015, U+0019,
    U+001C, U+001D, U+00EC and U+00EF; everything else is kept as is.

    :param html: markup as ``str`` (the pattern is a ``str`` pattern).
    :return: a new string without the characters listed above.
    """
    # One character class instead of a 17-way alternation; it matches
    # exactly the same set of single characters.
    # NOTE(review): with a str pattern, \xec and \xef match the letters
    # U+00EC and U+00EF rather than raw bytes -- confirm this is intended.
    return re.sub(r'[\x01-\x08\x12-\x15\x19\x1c\x1d\xec\xef]', '', html)
def ensure_unit(self, raw, unit='px'):
if re.search(r'\d+$', raw) is not None:
@@ -448,9 +485,10 @@ class MobiReader(object):
# discarded by a renderer
tag.text = '\u00a0' # nbsp
styles.append('height: %s' %
self.ensure_unit(height))
self.ensure_unit(height))
else:
styles.append('margin-top: %s' % self.ensure_unit(height))
styles.append('margin-top: %s' %
self.ensure_unit(height))
if 'width' in attrib:
width = attrib.pop('width').strip()
if width and re.search(r'\d+', width):
@@ -464,14 +502,16 @@ class MobiReader(object):
try:
ewidth_val = unit_convert(ewidth, 12, 500, 166)
self.text_indents[tag] = ewidth_val
except:
except Exception:
pass
if width.startswith('-'):
styles.append('margin-left: %s' % self.ensure_unit(width[1:]))
styles.append('margin-left: %s' %
self.ensure_unit(width[1:]))
try:
ewidth_val = unit_convert(ewidth[1:], 12, 500, 166)
ewidth_val = unit_convert(ewidth[1:],
12, 500, 166)
self.left_margins[tag] = ewidth_val
except:
except Exception:
pass
if 'align' in attrib:
@@ -514,16 +554,20 @@ class MobiReader(object):
except Exception:
pass
else:
attrib['src'] = 'images/' + image_name_map.get(recindex, '%05d.jpg' % recindex)
attrib['src'] = ('images/' +
image_name_map.get(recindex,
'%05d.jpg' %
recindex))
for attr in ('width', 'height'):
if attr in attrib:
val = attrib[attr]
if val.lower().endswith('em'):
try:
nval = float(val[:-2])
nval *= 16 * (168.451/72) # Assume this was set using the Kindle profile
attrib[attr] = "%dpx"%int(nval)
except:
# Assume this was set using the Kindle profile
nval *= 16 * (168.451/72)
attrib[attr] = "%dpx" % int(nval)
except Exception:
del attrib[attr]
elif val.lower().endswith('%'):
del attrib[attr]
@@ -550,10 +594,12 @@ class MobiReader(object):
attrib['href'] = "#filepos%d" % int(filepos)
except ValueError:
pass
if (tag.tag == 'a' and attrib.get('id', '').startswith('filepos') and
not tag.text and len(tag) == 0 and (tag.tail is None or not
tag.tail.strip()) and getattr(tag.getnext(), 'tag',
None) in BLOCK_TAGS):
if (tag.tag == 'a' and
attrib.get('id', '').startswith('filepos') and
not tag.text and len(tag) == 0 and
(tag.tail is None or
not tag.tail.strip()) and
getattr(tag.getnext(), 'tag', None) in BLOCK_TAGS):
# This is an empty anchor immediately before a block tag, move
# the id onto the block tag instead
forwardable_anchors.append(tag)
@@ -625,11 +671,11 @@ class MobiReader(object):
ti = self.text_indents.get(tag, ti)
try:
lm = float(lm)
except:
except Exception:
lm = 0.0
try:
ti = float(ti)
except:
except Exception:
ti = 0.0
return lm + ti
@@ -647,13 +693,14 @@ class MobiReader(object):
mi = MetaInformation(self.book_header.title, ['Unknown'])
opf = OPFCreator(os.path.dirname(htmlfile), mi)
if hasattr(self.book_header.exth, 'cover_offset'):
opf.cover = 'images/%05d.jpg' % (self.book_header.exth.cover_offset + 1)
opf.cover = 'images/%05d.jpg' % (self.book_header
.exth.cover_offset + 1)
elif mi.cover is not None:
opf.cover = mi.cover
else:
opf.cover = 'images/%05d.jpg' % 1
if not os.path.exists(os.path.join(os.path.dirname(htmlfile),
* opf.cover.split('/'))):
* opf.cover.split('/'))):
opf.cover = None
cover = opf.cover
@@ -669,7 +716,7 @@ class MobiReader(object):
opf.cover = ncover.replace(os.sep, '/')
manifest = [(htmlfile, 'application/xhtml+xml'),
(os.path.abspath('styles.css'), 'text/css')]
(os.path.abspath('styles.css'), 'text/css')]
bp = os.path.dirname(htmlfile)
added = set()
for i in getattr(self, 'image_names', []):
@@ -708,15 +755,17 @@ class MobiReader(object):
if href and re.match(r'\w+://', href) is None:
try:
text = ' '.join([t.strip() for t in
x.xpath('descendant::text()')])
except:
x.xpath('descendant:'
':text()')])
except Exception:
text = ''
text = ent_pat.sub(entity_to_unicode, text)
item = tocobj.add_item(toc.partition('#')[0], href[1:],
text)
item = tocobj.add_item(toc.partition('#')[0],
href[1:], text)
item.left_space = int(self.get_left_whitespace(x))
found = True
if reached and found and x.get('class', None) == 'mbp_pagebreak':
if (reached and found and
x.get('class', None) == 'mbp_pagebreak'):
break
if tocobj is not None:
tocobj = self.structure_toc(tocobj)
@@ -748,7 +797,7 @@ class MobiReader(object):
level = indent_vals.index(item.left_space)
parent = find_parent(level)
last_found[level] = parent.add_item(item.href, item.fragment,
item.text)
item.text)
return newtoc
@@ -782,7 +831,9 @@ class MobiReader(object):
def warn_about_trailing_entry_corruption(self):
    """Log the trailing-entry corruption warning at most once.

    The first call sets ``self.warned_about_trailing_entry_corruption``
    and emits a single warning through ``self.log.warn``; later calls do
    nothing.
    """
    if self.warned_about_trailing_entry_corruption:
        return
    # Set the flag before logging so the message is emitted only once.
    self.warned_about_trailing_entry_corruption = True
    self.log.warn('The trailing data entries in this MOBI file are '
                  'corrupted, you might see corrupted text in the '
                  'output')
def text_section(self, index):
data = self.sections[index][0]
@@ -791,19 +842,23 @@ class MobiReader(object):
def extract_text(self, offset=1):
self.log.debug('Extracting text...')
text_sections = [self.text_section(i) for i in range(offset,
min(self.book_header.records + offset, len(self.sections)))]
text_sections = [self.text_section(i)
for i in range(offset, min(self.book_header.records
+ offset,
len(self.sections)))]
processed_records = list(range(offset-1, self.book_header.records +
offset))
offset))
self.mobi_html = b''
if self.book_header.compression_type == b'DH':
huffs = [self.sections[i][0] for i in
range(self.book_header.huff_offset,
self.book_header.huff_offset + self.book_header.huff_number)]
huffs = [self.sections[i][0]
for i in range(self.book_header.huff_offset,
self.book_header.huff_offset +
self.book_header.huff_number)]
processed_records += list(range(self.book_header.huff_offset,
self.book_header.huff_offset + self.book_header.huff_number))
self.book_header.huff_offset +
self.book_header.huff_number))
huff = HuffReader(huffs)
unpack = huff.unpack
@@ -811,19 +866,23 @@ class MobiReader(object):
unpack = decompress_doc
elif self.book_header.compression_type == b'\x00\x01':
unpack = lambda x: x
unpack = lambda x: x # noqa
else:
raise MobiError('Unknown compression algorithm: %r' % self.book_header.compression_type)
raise MobiError('Unknown compression algorithm: %r' %
self.book_header.compression_type)
self.mobi_html = b''.join(map(unpack, text_sections))
if self.mobi_html.endswith(b'#'):
self.mobi_html = self.mobi_html[:-1]
if self.book_header.ancient and b'<html' not in self.mobi_html[:300].lower():
if (self.book_header.ancient and
b'<html' not in self.mobi_html[:300].lower()):
self.mobi_html = self.mobi_html.replace(b'\r ', b'\n\n ')
self.mobi_html = self.mobi_html.replace(b'\0', b'')
if self.book_header.codec == 'cp1252':
self.mobi_html = self.mobi_html.replace(b'\x1e', b'') # record separator
self.mobi_html = self.mobi_html.replace(b'\x02', b'') # start of text
# record separator
self.mobi_html = self.mobi_html.replace(b'\x1e', b'')
# start of text
self.mobi_html = self.mobi_html.replace(b'\x02', b'')
return processed_records
def replace_page_breaks(self):
@@ -835,7 +894,7 @@ class MobiReader(object):
self.log.debug('Adding anchors...')
positions = set()
link_pattern = re.compile(br'''<[^<>]+filepos=['"]{0,1}(\d+)[^<>]*>''',
re.IGNORECASE)
re.IGNORECASE)
for match in link_pattern.finditer(self.mobi_html):
positions.add(int(match.group(1)))
pos = 0
@@ -845,12 +904,13 @@ class MobiReader(object):
if end == 0:
continue
oend = end
l = self.mobi_html.find(b'<', end)
_l = self.mobi_html.find(b'<', end)
r = self.mobi_html.find(b'>', end)
anchor = b'<a id="filepos%d"></a>'
if r > -1 and (r < l or l == end or l == -1):
if r > -1 and (r < _l or _l == end or _l == -1):
p = self.mobi_html.rfind(b'<', 0, end + 1)
if (pos < end and p > -1 and not end_tag_re.match(self.mobi_html[p:r]) and
if (pos < end and p > -1 and
not end_tag_re.match(self.mobi_html[p:r]) and
not self.mobi_html[p:r + 1].endswith(b'/>')):
anchor = b' filepos-id="filepos%d"'
end = r
@@ -862,8 +922,9 @@ class MobiReader(object):
processed_html = b''.join(processed_html)
# Remove anchors placed inside entities
self.processed_html = re.sub(br'&([^;]*?)(<a id="filepos\d+"></a>)([^;]*);',
br'&\1\3;\2', processed_html)
self.processed_html = re.sub(br'&([^;]*?)(<a id="filepos\d+"></a>)'
br'([^;]*);', br'&\1\3;\2',
processed_html)
def extract_images(self, processed_records, output_dir):
self.log.debug('Extracting images...')
@@ -881,10 +942,11 @@ class MobiReader(object):
if i in processed_records:
continue
processed_records.append(i)
data = self.sections[i][0]
data = self.sections[i][0]
image_index += 1
if data[:4] in {b'FLIS', b'FCIS', b'SRCS', b'\xe9\x8e\r\n',
b'RESC', b'BOUN', b'FDST', b'DATP', b'AUDI', b'VIDE'}:
b'RESC', b'BOUN', b'FDST', b'DATP', b'AUDI',
b'VIDE'}:
# This record is a known non image type, no need to try to
# load the image
continue
@@ -920,16 +982,17 @@ class MobiReader(object):
def test_mbp_regex():
    """Check ``MobiReader.PAGE_BREAK_PAT`` against known inputs.

    Every sample is substituted with the pattern's first group (``\\1``)
    and compared with the expected result.

    :raises Exception: on the first sample whose result differs.
    """
    cases = {
        '<mbp:pagebreak></mbp:pagebreak>': '',
        '<mbp:pagebreak xxx></mbp:pagebreak>yyy': ' xxxyyy',
        '<mbp:pagebreak> </mbp:pagebreak>': '',
        '<mbp:pagebreak>xxx': 'xxx',
        '<mbp:pagebreak/>xxx': 'xxx',
        '<mbp:pagebreak sdf/ >xxx': ' sdfxxx',
        '<mbp:pagebreak / >': ' ',
        '</mbp:pagebreak>': '',
        '</mbp:pagebreak sdf>': ' sdf',
        '</mbp:pagebreak><mbp:pagebreak></mbp:pagebreak>xxx': 'xxx',
    }
    for raw, expected in cases.items():
        ans = MobiReader.PAGE_BREAK_PAT.sub(r'\1', raw)
        if ans != expected:
            raise Exception('%r != %r for %r' % (ans, expected, raw))

View File

@@ -1,13 +1,11 @@
import mimetypes
import re
from ebook_converter.ebooks.oeb.base import XPath, urlunquote
from ebook_converter.polyglot.binary import from_base64_bytes
from ebook_converter.polyglot.builtins import as_bytes
__license__ = 'GPL v3'
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
class DataURL(object):
def __call__(self, oeb, opts):
@@ -27,25 +25,29 @@ class DataURL(object):
continue
if ';base64' in header:
data = re.sub(r'\s+', '', data)
from ebook_converter.polyglot.binary import from_base64_bytes
try:
data = from_base64_bytes(data)
except Exception:
self.log.error('Found invalid base64 encoded data URI, ignoring it')
self.log.error('Found invalid base64 encoded data '
'URI, ignoring it')
continue
else:
data = urlunquote(data)
data = as_bytes(data)
fmt = what(None, data)
if not fmt:
self.log.warn('Image encoded as data URL has unknown format, ignoring')
self.log.warn('Image encoded as data URL has unknown '
'format, ignoring')
continue
img.set('src', item.relhref(self.convert_image_data_uri(data, fmt, oeb)))
img.set('src',
item.relhref(self.convert_image_data_uri(data, fmt,
oeb)))
def convert_image_data_uri(self, data, fmt, oeb):
    """Add image data taken from a data URI to the manifest.

    :param data: the image payload, passed to ``oeb.manifest.add`` as
        ``data``.
    :param fmt: image format name, used as the file extension of the
        generated href.
    :param oeb: object whose ``manifest`` provides ``generate`` and
        ``add``.
    :return: the href generated for the new manifest item.
    """
    self.log('Found image encoded as data URI converting it to normal '
             'image')
    # Ask the manifest for an id/href pair exactly once; a second call
    # would allocate a second pair.
    item_id, item_href = oeb.manifest.generate('data-url-image',
                                               'data-url-image.' + fmt)
    # The media type is guessed from the extension of the generated href.
    oeb.manifest.add(item_id, item_href,
                     mimetypes.guess_type(item_href)[0], data=data)
    return item_href

View File

@@ -1,9 +1,11 @@
import mimetypes
import sys, os, re
from xml.sax.saxutils import escape
from string import Formatter
import os
import pkg_resources
import re
import string
import sys
import urllib.parse
from xml.sax import saxutils
from ebook_converter import constants as const
from ebook_converter import strftime
@@ -16,18 +18,14 @@ from ebook_converter.ebooks.chardet import strip_encoding_declarations
from ebook_converter.ebooks.metadata import fmt_sidx, rating_to_stars
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
JACKET_XPATH = '//h:meta[@name="calibre-content" and @content="jacket"]'
class SafeFormatter(string.Formatter):
    """``string.Formatter`` that renders unknown named fields as ''.

    Only ``KeyError`` is handled; any other exception raised by the
    base class lookup (e.g. ``IndexError`` for a missing positional
    argument) still propagates.
    """

    def get_value(self, *args, **kwargs):
        """Return the field value, or '' if the lookup raises KeyError."""
        try:
            return super().get_value(*args, **kwargs)
        except KeyError:
            return ''
@@ -40,7 +38,7 @@ class Base(object):
for img in path(item.data):
if removed >= limit:
break
href = item.abshref(img.get('src'))
href = item.abshref(img.get('src'))
image = self.oeb.manifest.hrefs.get(href)
if image is None:
href = urlnormalize(href)
@@ -68,7 +66,8 @@ class RemoveFirstImage(Base):
raw = xml2text(body[0]).strip()
imgs = XPath('//h:img|//svg:svg')(item.data)
if not raw and not imgs:
self.log('Removing %s as it has no content'%item.href)
self.log('Removing %s as it has no content' %
item.href)
self.oeb.manifest.remove(item)
deleted_item = item
break
@@ -82,20 +81,20 @@ class RemoveFirstImage(Base):
self.oeb.guide.remove_by_href(deleted_item.href)
def __call__(self, oeb, opts, metadata):
    """
    Store the conversion context and remove the first image on request.

    ``oeb``, ``opts`` and ``oeb.log`` are saved on the instance;
    ``self.remove_first_image()`` is called only when
    ``opts.remove_first_image`` is truthy. ``metadata`` is accepted but
    not used here.
    """
    self.oeb, self.opts, self.log = oeb, opts, oeb.log
    if opts.remove_first_image:
        self.remove_first_image()
class Jacket(Base):
'''
Book jacket manipulation. Remove first image and insert comments at start of
book.
'''
"""
Book jacket manipulation. Remove first image and insert comments at start
of book.
"""
def insert_metadata(self, mi):
self.log('Inserting metadata into book...')
@@ -107,22 +106,24 @@ class Jacket(Base):
try:
comments = str(self.oeb.metadata.description[0])
except:
except Exception:
comments = ''
try:
title = str(self.oeb.metadata.title[0])
except:
except Exception:
title = 'Unknown'
try:
authors = list(map(str, self.oeb.metadata.creator))
except:
except Exception:
authors = ['Unknown']
root = render_jacket(mi, self.opts.output_profile,
alt_title=title, alt_tags=tags, alt_authors=authors,
alt_comments=comments, rescale_fonts=True)
alt_title=title, alt_tags=tags,
alt_authors=authors,
alt_comments=comments,
rescale_fonts=True)
id, href = self.oeb.manifest.generate('calibre_jacket', 'jacket.xhtml')
jacket = self.oeb.manifest.add(id, href, mimetypes.guess_type(href)[0],
@@ -132,7 +133,8 @@ class Jacket(Base):
for img, path in referenced_images(root):
self.oeb.log('Embedding referenced image %s into jacket' % path)
ext = path.rpartition('.')[-1].lower()
item_id, href = self.oeb.manifest.generate('jacket_image', 'jacket_img.'+ext)
item_id, href = self.oeb.manifest.generate('jacket_image',
'jacket_img.' + ext)
with open(path, 'rb') as f:
item = self.oeb.manifest.add(
item_id, href, mimetypes.guess_type(href)[0],
@@ -149,10 +151,10 @@ class Jacket(Base):
break
def __call__(self, oeb, opts, metadata):
'''
"""
Add metadata in jacket.xhtml if specified in opts
If not specified, remove previous jacket instance
'''
"""
self.oeb, self.opts, self.log = oeb, opts, oeb.log
self.remove_existing_jacket()
if opts.insert_metadata:
@@ -164,8 +166,8 @@ class Jacket(Base):
def get_rating(rating, rchar, e_rchar):
ans = ''
try:
num = float(rating)/2
except:
num = float(rating) / 2
except Exception:
return ans
num = max(0, num)
num = min(num, 5)
@@ -180,25 +182,29 @@ class Series(str):
def __new__(self, series, series_index):
    """Build the XML-escaped series string and its helper attributes.

    The string value is ``'<index> of <em><name></em>'`` when ``series``
    is truthy and ``series_index`` is not None; otherwise it is just the
    escaped series name ('' when ``series`` is falsy).

    Attributes set on the result:

    * ``roman`` - same text as the string value, but with the index
      formatted by ``fmt_sidx(..., use_roman=True)``
    * ``name`` - escaped series name
    * ``number`` / ``roman_number`` - escaped index with
      ``use_roman`` False / True; a falsy index falls back to 1.0

    Note that ``self`` is the class being instantiated, as for any
    ``__new__``.
    """
    # Escape the name once; in the first branch ``series`` is truthy, so
    # this equals escaping ``series`` itself.
    name = saxutils.escape(series or '')
    if series and series_index is not None:
        template = '{1} of <em>{0}</em>'
        roman = template.format(
            name, saxutils.escape(fmt_sidx(series_index, use_roman=True)))
        combined = template.format(
            name, saxutils.escape(fmt_sidx(series_index, use_roman=False)))
    else:
        combined = roman = name
    s = str.__new__(self, combined)
    s.roman = roman
    s.name = name
    s.number = saxutils.escape(fmt_sidx(series_index or 1.0,
                                        use_roman=False))
    s.roman_number = saxutils.escape(fmt_sidx(series_index or 1.0,
                                              use_roman=True))
    return s
class Tags(str):
def __new__(self, tags, output_profile):
tags = [escape(x) for x in tags or ()]
tags = [saxutils.escape(x) for x in tags or ()]
t = str.__new__(self, ', '.join(tags))
t.alphabetical = ', '.join(sorted(tags))
t.tags_list = tags
@@ -233,9 +239,9 @@ def postprocess_jacket(root, output_profile, has_data):
extract_class('cbj_kindle_banner_hr')
def render_jacket(mi, output_profile,
alt_title='Unknown', alt_tags=[], alt_comments='',
alt_publisher='', rescale_fonts=False, alt_authors=None):
def render_jacket(mi, output_profile, alt_title='Unknown', alt_tags=[],
alt_comments='', alt_publisher='', rescale_fonts=False,
alt_authors=None):
with open(pkg_resources.resource_filename('ebook_converter',
'data/jacket/stylesheet.css'),
'rb') as fobj:
@@ -250,17 +256,20 @@ def render_jacket(mi, output_profile,
try:
title_str = alt_title if mi.is_null('title') else mi.title
except:
except Exception:
title_str = 'Unknown'
title_str = escape(title_str)
title_str = saxutils.escape(title_str)
title = '<span class="title">%s</span>' % title_str
series = Series(mi.series, mi.series_index)
try:
publisher = mi.publisher if not mi.is_null('publisher') else alt_publisher
except:
if not mi.is_null('publisher'):
publisher = mi.publisher
else:
publisher = alt_publisher
except Exception:
publisher = ''
publisher = escape(publisher)
publisher = saxutils.escape(publisher)
try:
if is_date_undefined(mi.pubdate):
@@ -268,10 +277,11 @@ def render_jacket(mi, output_profile,
else:
dt = as_local_time(mi.pubdate)
pubdate = strftime('%Y', dt.timetuple())
except:
except Exception:
pubdate = ''
rating = get_rating(mi.rating, output_profile.ratings_char, output_profile.empty_ratings_char)
rating = get_rating(mi.rating, output_profile.ratings_char,
output_profile.empty_ratings_char)
tags = Tags((mi.tags if mi.tags else alt_tags), output_profile)
@@ -285,10 +295,10 @@ def render_jacket(mi, output_profile,
mi.authors = list(alt_authors or ('Unknown',))
try:
author = mi.format_authors()
except:
except Exception:
author = ''
mi.authors = orig
author = escape(author)
author = saxutils.escape(author)
has_data = {}
def generate_html(comments):
@@ -301,7 +311,7 @@ def render_jacket(mi, output_profile,
'publisher': publisher,
'rating': rating,
'rating_label': 'Rating',
'searchable_tags': ' '.join(escape(t) + 'ttt'
'searchable_tags': ' '.join(saxutils.escape(t) + 'ttt'
for t in tags.tags_list),
'series': series,
'series_label': 'Series',
@@ -320,25 +330,30 @@ def render_jacket(mi, output_profile,
if dt == 'series':
args[dkey] = Series(mi.get(key), mi.get(key + '_index'))
elif dt == 'rating':
args[dkey] = rating_to_stars(mi.get(key), m.get('display', {}).get('allow_half_stars', False))
args[dkey] = rating_to_stars(mi.get(key),
m.get('display', {})
.get('allow_half_stars',
False))
elif dt == 'comments':
val = val or ''
display = m.get('display', {})
ctype = display.get('interpret_as') or 'html'
if ctype == 'long-text':
val = '<pre style="white-space:pre-wrap">%s</pre>' % escape(val)
val = ('<pre style="white-space:pre-wrap">%s</pre>' %
saxutils.escape(val))
elif ctype == 'short-text':
val = '<span>%s</span>' % escape(val)
val = '<span>%s</span>' % saxutils.escape(val)
elif ctype == 'markdown':
val = markdown(val)
else:
val = comments_to_html(val)
args[dkey] = val
else:
args[dkey] = escape(val)
args[dkey+'_label'] = escape(display_name)
args[dkey] = saxutils.escape(val)
args[dkey+'_label'] = saxutils.escape(display_name)
except Exception:
# if the val (custom column contents) is None, don't add to args
# if the val (custom column contents) is None, don't add to
# args
pass
if False:
@@ -371,10 +386,11 @@ def render_jacket(mi, output_profile,
# the text in the book. That means that as long as the jacket uses
# relative font sizes (em or %), the post conversion font size will be
# the same as for text in the main book. So text with size x em will
# be rescaled to the same value in both the jacket and the main content.
# be rescaled to the same value in both the jacket and the main
# content.
#
# We cannot use data-calibre-rescale 100 on the body tag as that will just
# give the body tag a font size of 1em, which is useless.
# We cannot use data-calibre-rescale 100 on the body tag as that will
# just give the body tag a font size of 1em, which is useless.
for body in root.xpath('//*[local-name()="body"]'):
fw = body.makeelement(base.tag('xhtml', 'div'))
fw.set('data-calibre-rescale', '100')