Fixed flake8 issues in several modules
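Every hunk below applies the same few mechanical flake8 cleanups; no behaviour change is intended. As an illustrative sketch of the recurring patterns (this snippet is not taken from the diff; names like `root`, `raw` and `n` are made up for the example):

    from lxml import html

    root = html.fromstring('<p>hi</p>')
    raw, n = '3.5', 2

    # E501: long calls wrapped, continuation aligned under the open paren.
    ans = html.tostring(root, encoding='utf-8',
                        doctype='<!DOCTYPE html>')

    # E231/E228: spaces after ':' in dict literals and around '%'.
    opts = {'w': 720, 'h': 1280}
    label = 'page %d' % n

    # E722: bare 'except:' replaced with 'except Exception:'.
    try:
        value = float(raw)
    except Exception:
        value = 0.0

Other recurring fixes in the hunks: one import per line, long regexes split across adjacent string literals, and aligned (multi-space) assignments collapsed to single spaces.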
@@ -15,17 +15,18 @@ from ebook_converter.polyglot.builtins import as_unicode
 def sanitize_file_name(x):
-    ans = re.sub(r'\s+', ' ', re.sub(r'[?&=;#]', '_', ascii_filename(x))).strip().rstrip('.')
+    ans = re.sub(r'\s+', ' ', re.sub(r'[?&=;#]', '_',
+                 ascii_filename(x))).strip().rstrip('.')
     ans, ext = ans.rpartition('.')[::2]
     return (ans.strip() + '.' + ext.strip()).rstrip('.')


 class HTMLInput(InputFormatPlugin):

-    name        = 'HTML Input'
-    author      = 'Kovid Goyal'
+    name = 'HTML Input'
+    author = 'Kovid Goyal'
     description = 'Convert HTML and OPF files to an OEB'
-    file_types  = {'opf', 'html', 'htm', 'xhtml', 'xhtm', 'shtm', 'shtml'}
+    file_types = {'opf', 'html', 'htm', 'xhtml', 'xhtm', 'shtm', 'shtml'}
     commit_name = 'html_input'

     options = {

@@ -6,10 +6,10 @@ from ebook_converter.customize.conversion import InputFormatPlugin


 class HTMLZInput(InputFormatPlugin):

-    name        = 'HTLZ Input'
-    author      = 'John Schember'
+    name = 'HTLZ Input'
+    author = 'John Schember'
     description = 'Convert HTML files to HTML'
-    file_types  = {'htmlz'}
+    file_types = {'htmlz'}
     commit_name = 'htmlz_input'

     def convert(self, stream, options, file_ext, log,

@@ -36,13 +36,14 @@ class HTMLZInput(InputFormatPlugin):
                 top_levels.append(x)
            # Try to find an index. file.
            for x in top_levels:
-                if x.lower() in (u'index.html', u'index.xhtml', u'index.htm'):
+                if x.lower() in ('index.html', 'index.xhtml', 'index.htm'):
                    index = x
                    break
            # Look for multiple HTML files in the archive. We look at the
            # top level files only as only they matter in HTMLZ.
            for x in top_levels:
-                if os.path.splitext(x)[1].lower() in (u'.html', u'.xhtml', u'.htm'):
+                if os.path.splitext(x)[1].lower() in ('.html', '.xhtml',
+                                                      '.htm'):
                    # Set index to the first HTML file found if it's not
                    # called index.
                    if not index:

@@ -84,15 +85,14 @@ class HTMLZInput(InputFormatPlugin):
         c = 0
         while os.path.exists(htmlfile):
             c += 1
-            htmlfile = u'index%d.html'%c
+            htmlfile = u'index%d.html' % c
         with open(htmlfile, 'wb') as f:
             f.write(html.encode('utf-8'))
         odi = options.debug_pipeline
         options.debug_pipeline = None
         # Generate oeb from html conversion.
         with open(htmlfile, 'rb') as f:
-            oeb = html_input.convert(f, options, 'html', log,
-                                     {})
+            oeb = html_input.convert(f, options, 'html', log, {})
         options.debug_pipeline = odi
         os.remove(htmlfile)
@@ -1,5 +1,11 @@
-import sys, os, re, math, errno, uuid, numbers
-from collections import OrderedDict, defaultdict
+import sys
+import os
+import re
+import math
+import errno
+import uuid
+import numbers
+import collections
+import mimetypes

 from lxml import etree

@@ -7,23 +13,24 @@ from lxml import html
 from lxml.html.builder import (
     HTML, HEAD, TITLE, BODY, LINK, META, P, SPAN, BR, DIV, A, DT, DL, DD, H1)

 from ebook_converter import guess_type
-from ebook_converter.ebooks.docx.container import DOCX
-from ebook_converter.ebooks.docx.names import XML, generate_anchor
-from ebook_converter.ebooks.docx.styles import Styles, inherit, PageProperties
-from ebook_converter.ebooks.docx.numbering import Numbering
-from ebook_converter.ebooks.docx.fonts import Fonts, is_symbol_font, map_symbol_text
-from ebook_converter.ebooks.docx.images import Images
-from ebook_converter.ebooks.docx.tables import Tables
-from ebook_converter.ebooks.docx.footnotes import Footnotes
 from ebook_converter.ebooks.docx.cleanup import cleanup_markup
+from ebook_converter.ebooks.docx.container import DOCX
+from ebook_converter.ebooks.docx.fields import Fields
+from ebook_converter.ebooks.docx.fonts import Fonts
+from ebook_converter.ebooks.docx.fonts import is_symbol_font
+from ebook_converter.ebooks.docx.fonts import map_symbol_text
+from ebook_converter.ebooks.docx.footnotes import Footnotes
+from ebook_converter.ebooks.docx.images import Images
+from ebook_converter.ebooks.docx.names import XML, generate_anchor
+from ebook_converter.ebooks.docx.numbering import Numbering
+from ebook_converter.ebooks.docx.settings import Settings
+from ebook_converter.ebooks.docx.styles import Styles, inherit, PageProperties
+from ebook_converter.ebooks.docx.tables import Tables
 from ebook_converter.ebooks.docx.theme import Theme
 from ebook_converter.ebooks.docx.toc import create_toc
-from ebook_converter.ebooks.docx.fields import Fields
-from ebook_converter.ebooks.docx.settings import Settings
 from ebook_converter.ebooks.metadata.opf2 import OPFCreator
-from ebook_converter.utils.localization import canonicalize_lang, lang_as_iso639_1
+from ebook_converter.utils.localization import canonicalize_lang
+from ebook_converter.utils.localization import lang_as_iso639_1


 NBSP = '\xa0'
@@ -54,7 +61,9 @@ def html_lang(docx_lang):

 class Convert(object):

-    def __init__(self, path_or_stream, dest_dir=None, log=None, detect_cover=True, notes_text=None, notes_nopb=False, nosupsub=False):
+    def __init__(self, path_or_stream, dest_dir=None, log=None,
+                 detect_cover=True, notes_text=None, notes_nopb=False,
+                 nosupsub=False):
         self.docx = DOCX(path_or_stream, log=log)
         self.namespace = self.docx.namespace
         self.ms_pat = re.compile(r'\s{2,}')

@@ -73,7 +82,7 @@ class Convert(object):
         self.fields = Fields(self.namespace)
         self.styles = Styles(self.namespace, self.tables)
         self.images = Images(self.namespace, self.log)
-        self.object_map = OrderedDict()
+        self.object_map = collections.OrderedDict()
         self.html = HTML(
             HEAD(
                 META(charset='utf-8'),

@@ -82,9 +91,9 @@ class Convert(object):
             ),
             self.body
         )
-        self.html.text='\n\t'
-        self.html[0].text='\n\t\t'
-        self.html[0].tail='\n'
+        self.html.text = '\n\t'
+        self.html[0].text = '\n\t\t'
+        self.html[0].tail = '\n'
         for child in self.html[0]:
             child.tail = '\n\t\t'
         self.html[0][-1].tail = '\n\t'

@@ -98,17 +107,18 @@ class Convert(object):

     def __call__(self):
         doc = self.docx.document
-        relationships_by_id, relationships_by_type = self.docx.document_relationships
+        (relationships_by_id,
+         relationships_by_type) = self.docx.document_relationships
         self.resolve_alternate_content(doc)
         self.fields(doc, self.log)
         self.read_styles(relationships_by_type)
         self.images(relationships_by_id)
-        self.layers = OrderedDict()
+        self.layers = collections.OrderedDict()
         self.framed = [[]]
         self.frame_map = {}
         self.framed_map = {}
         self.anchor_map = {}
-        self.link_map = defaultdict(list)
+        self.link_map = collections.defaultdict(list)
         self.link_source_map = {}
         self.toc_anchor = None
         self.block_runs = []

@@ -142,7 +152,8 @@ class Convert(object):
             dl = DL(id=anchor)
             dl.set('class', 'footnote')
             self.body.append(dl)
-            dl.append(DT('[', A('←' + text, href='#back_%s' % anchor, title=text)))
+            dl.append(DT('[', A('←' + text, href='#back_%s' % anchor,
+                                title=text)))
             dl[-1][0].tail = ']'
             dl.append(DD())
             paras = []

@@ -159,7 +170,8 @@ class Convert(object):
         self.mark_block_runs(paras)

         for p, wp in self.object_map.items():
-            if len(p) > 0 and not p.text and len(p[0]) > 0 and not p[0].text and p[0][0].get('class', None) == 'tab':
+            if (len(p) > 0 and not p.text and len(p[0]) > 0 and
+                    not p[0].text and p[0][0].get('class', None) == 'tab'):
                 # Paragraph uses tabs for indentation, convert to text-indent
                 parent = p[0]
                 tabs = []

@@ -172,7 +184,9 @@ class Convert(object):
                     break
             indent = len(tabs) * self.settings.default_tab_stop
             style = self.styles.resolve(wp)
-            if style.text_indent is inherit or (hasattr(style.text_indent, 'endswith') and style.text_indent.endswith('pt')):
+            if (style.text_indent is inherit or
+                    (hasattr(style.text_indent, 'endswith') and
+                     style.text_indent.endswith('pt'))):
                 if style.text_indent is not inherit:
                     indent = float(style.text_indent[:-2]) + indent
                 style.text_indent = '%.3gpt' % indent

@@ -197,7 +211,8 @@ class Convert(object):
             except (TypeError, ValueError):
                 lvl = 0
             numbered.append((html_obj, num_id, lvl))
-        self.numbering.apply_markup(numbered, self.body, self.styles, self.object_map, self.images)
+        self.numbering.apply_markup(numbered, self.body, self.styles,
+                                    self.object_map, self.images)
         self.apply_frames()

         if len(self.body) > 0:
@@ -232,13 +247,15 @@ class Convert(object):
         self.fields.polish_markup(self.object_map)

         self.log.debug('Cleaning up redundant markup generated by Word')
-        self.cover_image = cleanup_markup(self.log, self.html, self.styles, self.dest_dir, self.detect_cover, self.namespace.XPath)
+        self.cover_image = cleanup_markup(self.log, self.html, self.styles,
+                                          self.dest_dir, self.detect_cover,
+                                          self.namespace.XPath)

         return self.write(doc)

     def read_page_properties(self, doc):
         current = []
-        self.page_map = OrderedDict()
+        self.page_map = collections.OrderedDict()
         self.section_starts = []

         for p in self.namespace.descendants(doc, 'w:p', 'w:tbl'):

@@ -267,7 +284,8 @@ class Convert(object):
     def resolve_alternate_content(self, doc):
         # For proprietary extensions in Word documents use the fallback, spec
         # compliant form
-        # See https://wiki.openoffice.org/wiki/OOXML/Markup_Compatibility_and_Extensibility
+        # See https://wiki.openoffice.org/wiki/
+        # OOXML/Markup_Compatibility_and_Extensibility
         for ac in self.namespace.descendants(doc, 'mc:AlternateContent'):
             choices = self.namespace.XPath('./mc:Choice')(ac)
             fallbacks = self.namespace.XPath('./mc:Fallback')(ac)

@@ -284,7 +302,8 @@ class Convert(object):
                 cname[-1] = defname
                 if self.docx.exists('/'.join(cname)):
                     name = name
-        if name and name.startswith('word/word') and not self.docx.exists(name):
+        if (name and name.startswith('word/word') and
+                not self.docx.exists(name)):
             name = name.partition('/')[2]
         return name

@@ -327,7 +346,8 @@ class Convert(object):
                 self.log.warn('Endnotes %s do not exist' % enname)
             else:
                 enrel = self.docx.get_relationships(enname)
-        footnotes(etree.fromstring(foraw) if foraw else None, forel, etree.fromstring(enraw) if enraw else None, enrel)
+        footnotes(etree.fromstring(foraw) if foraw else None, forel,
+                  etree.fromstring(enraw) if enraw else None, enrel)

         if fname is not None:
             embed_relationships = self.docx.get_relationships(fname)[0]

@@ -336,7 +356,8 @@ class Convert(object):
             except KeyError:
                 self.log.warn('Fonts table %s does not exist' % fname)
             else:
-                fonts(etree.fromstring(raw), embed_relationships, self.docx, self.dest_dir)
+                fonts(etree.fromstring(raw), embed_relationships, self.docx,
+                      self.dest_dir)

         if tname is not None:
             try:

@@ -364,16 +385,20 @@ class Convert(object):
             except KeyError:
                 self.log.warn('Numbering styles %s do not exist' % nname)
             else:
-                numbering(etree.fromstring(raw), self.styles, self.docx.get_relationships(nname)[0])
+                numbering(etree.fromstring(raw), self.styles,
+                          self.docx.get_relationships(nname)[0])

         self.styles.resolve_numbering(numbering)

     def write(self, doc):
-        toc = create_toc(doc, self.body, self.resolved_link_map, self.styles, self.object_map, self.log, self.namespace)
-        raw = html.tostring(self.html, encoding='utf-8', doctype='<!DOCTYPE html>')
+        toc = create_toc(doc, self.body, self.resolved_link_map, self.styles,
+                         self.object_map, self.log, self.namespace)
+        raw = html.tostring(self.html, encoding='utf-8',
+                            doctype='<!DOCTYPE html>')
         with open(os.path.join(self.dest_dir, 'index.html'), 'wb') as f:
             f.write(raw)
-        css = self.styles.generate_css(self.dest_dir, self.docx, self.notes_nopb, self.nosupsub)
+        css = self.styles.generate_css(self.dest_dir, self.docx,
+                                       self.notes_nopb, self.nosupsub)
         if css:
             with open(os.path.join(self.dest_dir, 'docx.css'), 'wb') as f:
                 f.write(css.encode('utf-8'))

@@ -394,23 +419,29 @@ class Convert(object):
                 title='Table of Contents',
                 type='toc'))
         toc_file = os.path.join(self.dest_dir, 'toc.ncx')
-        with open(os.path.join(self.dest_dir, 'metadata.opf'), 'wb') as of, open(toc_file, 'wb') as ncx:
+        with open(os.path.join(self.dest_dir,
+                               'metadata.opf'), 'wb') as of, open(toc_file,
+                                                                  'wb') as ncx:
             opf.render(of, ncx, 'toc.ncx', process_guide=process_guide)
         if os.path.getsize(toc_file) == 0:
             os.remove(toc_file)
         return os.path.join(self.dest_dir, 'metadata.opf')

     def read_block_anchors(self, doc):
-        doc_anchors = frozenset(self.namespace.XPath('./w:body/w:bookmarkStart[@w:name]')(doc))
+        doc_anchors = frozenset(self.namespace.XPath('./w:body/w:bookmarkStart'
+                                                     '[@w:name]')(doc))
         if doc_anchors:
             current_bm = set()
-            rmap = {v:k for k, v in self.object_map.items()}
-            for p in self.namespace.descendants(doc, 'w:p', 'w:bookmarkStart[@w:name]'):
+            rmap = {v: k for k, v in self.object_map.items()}
+            for p in self.namespace.descendants(doc, 'w:p',
+                                                'w:bookmarkStart[@w:name]'):
                 if p.tag.endswith('}p'):
                     if current_bm and p in rmap:
                         para = rmap[p]
                         if 'id' not in para.attrib:
-                            para.set('id', generate_anchor(next(iter(current_bm)), frozenset(self.anchor_map.values())))
+                            _bm = next(iter(current_bm))
+                            _am = frozenset(self.anchor_map.values())
+                            para.set('id', generate_anchor(_bm, _am))
                         for name in current_bm:
                             self.anchor_map[name] = para.get('id')
                         current_bm = set()
@@ -442,13 +473,15 @@ class Convert(object):
                 except AttributeError:
                     break

-        for x in self.namespace.descendants(p, 'w:r', 'w:bookmarkStart', 'w:hyperlink', 'w:instrText'):
+        for x in self.namespace.descendants(p, 'w:r', 'w:bookmarkStart',
+                                            'w:hyperlink', 'w:instrText'):
             if p_parent(x) is not p:
                 continue
             if x.tag.endswith('}r'):
                 span = self.convert_run(x)
                 if current_anchor is not None:
-                    (dest if len(dest) == 0 else span).set('id', current_anchor)
+                    (dest if len(dest) == 0 else span).set('id',
+                                                           current_anchor)
                     current_anchor = None
                 if current_hyperlink is not None:
                     try:

@@ -462,11 +495,14 @@ class Convert(object):
                     self.layers[p].append(x)
             elif x.tag.endswith('}bookmarkStart'):
                 anchor = self.namespace.get(x, 'w:name')
-                if anchor and anchor not in self.anchor_map and anchor != '_GoBack':
+                if (anchor and anchor not in self.anchor_map and
+                        anchor != '_GoBack'):
                     # _GoBack is a special bookmark inserted by Word 2010 for
                     # the return to previous edit feature, we ignore it
                     old_anchor = current_anchor
-                    self.anchor_map[anchor] = current_anchor = generate_anchor(anchor, frozenset(self.anchor_map.values()))
+                    current_anchor = generate_anchor(
+                        anchor, frozenset(self.anchor_map.values()))
+                    self.anchor_map[anchor] = current_anchor
                     if old_anchor is not None:
                         # The previous anchor was not applied to any element
                         for a, t in tuple(self.anchor_map.items()):

@@ -474,10 +510,13 @@ class Convert(object):
                             self.anchor_map[a] = current_anchor
             elif x.tag.endswith('}hyperlink'):
                 current_hyperlink = x
-            elif x.tag.endswith('}instrText') and x.text and x.text.strip().startswith('TOC '):
+            elif (x.tag.endswith('}instrText') and x.text and
+                    x.text.strip().startswith('TOC ')):
                 old_anchor = current_anchor
                 anchor = str(uuid.uuid4())
-                self.anchor_map[anchor] = current_anchor = generate_anchor('toc', frozenset(self.anchor_map.values()))
+                current_anchor = generate_anchor(
+                    'toc', frozenset(self.anchor_map.values()))
+                self.anchor_map[anchor] = current_anchor
                 self.toc_anchor = current_anchor
                 if old_anchor is not None:
                     # The previous anchor was not applied to any element

@@ -489,7 +528,8 @@ class Convert(object):
             dest.set('id', current_anchor)
             current_anchor = None

-        m = re.match(r'heading\s+(\d+)$', style.style_name or '', re.IGNORECASE)
+        m = re.match(r'heading\s+(\d+)$', style.style_name or '',
+                     re.IGNORECASE)
         if m is not None:
             n = min(6, max(1, int(m.group(1))))
             dest.tag = 'h%d' % n

@@ -533,7 +573,8 @@ class Convert(object):
         if len(dest) > 0 and not dest[-1].tail:
             if dest[-1].tag == 'br':
                 dest[-1].tail = NBSP
-            elif len(dest[-1]) > 0 and dest[-1][-1].tag == 'br' and not dest[-1][-1].tail:
+            elif (len(dest[-1]) > 0 and dest[-1][-1].tag == 'br' and
+                    not dest[-1][-1].tail):
                 dest[-1][-1].tail = NBSP

         return dest

@@ -578,12 +619,12 @@ class Convert(object):
             if anchor and anchor in self.anchor_map:
                 span.set('href', '#' + self.anchor_map[anchor])
                 continue
-            self.log.warn('Hyperlink with unknown target (rid=%s, anchor=%s), ignoring' %
-                          (rid, anchor))
+            self.log.warn('Hyperlink with unknown target (rid=%s, anchor=%s), '
+                          'ignoring' % (rid, anchor))
             # hrefs that point nowhere give epubcheck a hernia. The element
             # should be styled explicitly by Word anyway.
             # span.set('href', '#')
-        rmap = {v:k for k, v in self.object_map.items()}
+        rmap = {v: k for k, v in self.object_map.items()}
         for hyperlink, runs in self.fields.hyperlink_fields:
             spans = [rmap[r] for r in runs if r in rmap]
             if not spans:

@@ -604,7 +645,8 @@ class Convert(object):
                 if anchor in self.anchor_map:
                     span.set('href', '#' + self.anchor_map[anchor])
                     continue
-                self.log.warn('Hyperlink field with unknown anchor: %s' % anchor)
+                self.log.warn('Hyperlink field with unknown anchor: %s' %
+                              anchor)
             else:
                 if url in self.anchor_map:
                     span.set('href', '#' + self.anchor_map[url])

@@ -652,7 +694,8 @@ class Convert(object):
                     # actually needs it, i.e. if it has more than one
                     # consecutive space or it has newlines or tabs.
                     multi_spaces = self.ms_pat.search(ctext) is not None
-                    preserve = multi_spaces or self.ws_pat.search(ctext) is not None
+                    preserve = (multi_spaces or
+                                self.ws_pat.search(ctext) is not None)
                     if preserve:
                         text.add_elem(SPAN(ctext, style="white-space:pre-wrap"))
                         ans.append(text.elem)
@@ -668,24 +711,30 @@ class Convert(object):
                 else:
                     clear = child.get('clear', None)
                     if clear in {'all', 'left', 'right'}:
-                        br = BR(style='clear:%s'%('both' if clear == 'all' else clear))
+                        br = BR(style='clear:%s' % ('both' if clear == 'all'
+                                                    else clear))
                     else:
                         br = BR()
                     text.add_elem(br)
                     ans.append(text.elem)
-            elif self.namespace.is_tag(child, 'w:drawing') or self.namespace.is_tag(child, 'w:pict'):
-                for img in self.images.to_html(child, self.current_page, self.docx, self.dest_dir):
+            elif (self.namespace.is_tag(child, 'w:drawing') or
+                    self.namespace.is_tag(child, 'w:pict')):
+                for img in self.images.to_html(child, self.current_page,
+                                               self.docx, self.dest_dir):
                     text.add_elem(img)
                     ans.append(text.elem)
-            elif self.namespace.is_tag(child, 'w:footnoteReference') or self.namespace.is_tag(child, 'w:endnoteReference'):
+            elif (self.namespace.is_tag(child, 'w:footnoteReference') or
+                    self.namespace.is_tag(child, 'w:endnoteReference')):
                 anchor, name = self.footnotes.get_ref(child)
                 if anchor and name:
-                    l = A(name, id='back_%s' % anchor, href='#' + anchor, title=name)
-                    l.set('class', 'noteref')
-                    text.add_elem(l)
+                    _l = A(name, id='back_%s' % anchor, href='#' + anchor,
+                           title=name)
+                    _l.set('class', 'noteref')
+                    text.add_elem(_l)
                     ans.append(text.elem)
             elif self.namespace.is_tag(child, 'w:tab'):
-                spaces = int(math.ceil((self.settings.default_tab_stop / 36) * 6))
+                spaces = int(math.ceil((self.settings.default_tab_stop / 36) *
+                                       6))
                 text.add_elem(SPAN(NBSP * spaces))
                 ans.append(text.elem)
                 ans[-1].set('class', 'tab')

@@ -699,7 +748,8 @@ class Convert(object):
         style = self.styles.resolve_run(run)
         if style.vert_align in {'superscript', 'subscript'}:
             if ans.text or len(ans):
-                ans.set('data-docx-vert', 'sup' if style.vert_align == 'superscript' else 'sub')
+                ans.set('data-docx-vert',
+                        'sup' if style.vert_align == 'superscript' else 'sub')
         if style.lang is not inherit:
             lang = html_lang(style.lang)
             if lang is not None and lang != self.doc_lang:

@@ -738,12 +788,14 @@ class Convert(object):
             idx = parent.index(paras[0])
             frame = DIV(*paras)
             parent.insert(idx, frame)
-            self.framed_map[frame] = css = style.css(self.page_map[self.object_map[paras[0]]])
+            self.framed_map[frame] = css = style.css(
+                self.page_map[self.object_map[paras[0]]])
             self.styles.register(css, 'frame')

         if not self.block_runs:
             return
-        rmap = {v:k for k, v in self.object_map.items()}
+
+        rmap = {v: k for k, v in self.object_map.items()}
         for border_style, blocks in self.block_runs:
             paras = tuple(rmap[p] for p in blocks)
             for p in paras:

@@ -796,17 +848,20 @@ class Convert(object):
                 else:
                     border_style = style.clone_border_styles()
                 if has_visible_border:
-                    border_style.margin_top, style.margin_top = style.margin_top, inherit
+                    style.margin_top = inherit
+                    border_style.margin_top = style.margin_top
                 if p is not run[-1]:
                     style.padding_bottom = 0
                 else:
                     if has_visible_border:
-                        border_style.margin_bottom, style.margin_bottom = style.margin_bottom, inherit
+                        style.margin_bottom = inherit
+                        border_style.margin_bottom = style.margin_bottom
                     style.clear_borders()
                     if p is not run[-1]:
                         style.apply_between_border()
                 if has_visible_border:
-                    border_style.margin_left, border_style.margin_right = max_left,max_right
+                    border_style.margin_left = max_left
+                    border_style.margin_right = max_right
                 self.block_runs.append((border_style, run))

             run = []
@@ -1,5 +1,6 @@
 import mimetypes
-import textwrap, os
+import os
+import textwrap

 from lxml import etree
 from lxml.builder import ElementMaker

@@ -9,22 +10,48 @@ from ebook_converter.ebooks.docx.names import DOCXNamespace
 from ebook_converter.ebooks.metadata import authors_to_string
 from ebook_converter.ebooks.pdf.render.common import PAPER_SIZES
 from ebook_converter.utils.date import utcnow
-from ebook_converter.utils.localization import canonicalize_lang, lang_as_iso639_1
+from ebook_converter.utils.localization import canonicalize_lang
+from ebook_converter.utils.localization import lang_as_iso639_1
 from ebook_converter.utils.zipfile import ZipFile


+WORD_TYPES = {"/word/footnotes.xml": "application/vnd.openxmlformats-"
+              "officedocument.wordprocessingml.footnotes+xml",
+              "/word/document.xml": "application/vnd.openxmlformats-"
+              "officedocument.wordprocessingml.document.main+xml",
+              "/word/numbering.xml": "application/vnd.openxmlformats-"
+              "officedocument.wordprocessingml.numbering+xml",
+              "/word/styles.xml": "application/vnd.openxmlformats-"
+              "officedocument.wordprocessingml.styles+xml",
+              "/word/endnotes.xml": "application/vnd.openxmlformats-"
+              "officedocument.wordprocessingml.endnotes+xml",
+              "/word/settings.xml": "application/vnd.openxmlformats-"
+              "officedocument.wordprocessingml.settings+xml",
+              "/word/theme/theme1.xml": "application/vnd.openxmlformats-"
+              "officedocument.theme+xml",
+              "/word/fontTable.xml": "application/vnd.openxmlformats-"
+              "officedocument.wordprocessingml.fontTable+xml",
+              "/word/webSettings.xml": "application/vnd.openxmlformats-"
+              "officedocument.wordprocessingml.webSettings+xml",
+              "/docProps/core.xml": "application/vnd.openxmlformats-package."
+              "core-properties+xml",
+              "/docProps/app.xml": "application/vnd.openxmlformats-"
+              "officedocument.extended-properties+xml"}
+
+
 def xml2str(root, pretty_print=False, with_tail=False):
     if hasattr(etree, 'cleanup_namespaces'):
         etree.cleanup_namespaces(root)
     ans = etree.tostring(root, encoding='utf-8', xml_declaration=True,
-            pretty_print=pretty_print, with_tail=with_tail)
+                         pretty_print=pretty_print, with_tail=with_tail)
     return ans


 def page_size(opts):
     width, height = PAPER_SIZES[opts.docx_page_size]
     if opts.docx_custom_page_size is not None:
-        width, height = map(float, opts.docx_custom_page_size.partition('x')[0::2])
+        width, height = map(float,
+                            opts.docx_custom_page_size.partition('x')[0::2])
     return width, height

@@ -47,7 +74,9 @@ def create_skeleton(opts, namespaces=None):

     def w(x):
         return '{%s}%s' % (namespaces['w'], x)
-    dn = {k:v for k, v in namespaces.items() if k in {'w', 'r', 'm', 've', 'o', 'wp', 'w10', 'wne', 'a', 'pic'}}
+    dn = {k: v for k, v in namespaces.items() if k in {'w', 'r', 'm', 've',
+                                                       'o', 'wp', 'w10', 'wne',
+                                                       'a', 'pic'}}
     E = ElementMaker(namespace=dn['w'], nsmap=dn)
     doc = E.document()
     body = E.body()

@@ -59,27 +88,32 @@ def create_skeleton(opts, namespaces=None):
         val = page_margin(opts, which)
         return w(which), str(int(val * 20))
     body.append(E.sectPr(
-        E.pgSz(**{w('w'):str(width), w('h'):str(height)}),
+        E.pgSz(**{w('w'): str(width), w('h'): str(height)}),
         E.pgMar(**dict(map(margin, 'left top right bottom'.split()))),
-        E.cols(**{w('space'):'720'}),
-        E.docGrid(**{w('linePitch'):"360"}),
+        E.cols(**{w('space'): '720'}),
+        E.docGrid(**{w('linePitch'): "360"}),
     ))

-    dn = {k:v for k, v in namespaces.items() if k in tuple('wra') + ('wp',)}
+    dn = {k: v for k, v in namespaces.items() if k in tuple('wra') + ('wp',)}
     E = ElementMaker(namespace=dn['w'], nsmap=dn)
     styles = E.styles(
         E.docDefaults(
             E.rPrDefault(
                 E.rPr(
-                    E.rFonts(**{w('asciiTheme'):"minorHAnsi", w('eastAsiaTheme'):"minorEastAsia", w('hAnsiTheme'):"minorHAnsi", w('cstheme'):"minorBidi"}),
-                    E.sz(**{w('val'):'22'}),
-                    E.szCs(**{w('val'):'22'}),
-                    E.lang(**{w('val'):'en-US', w('eastAsia'):"en-US", w('bidi'):"ar-SA"})
+                    E.rFonts(**{w('asciiTheme'): "minorHAnsi",
+                                w('eastAsiaTheme'): "minorEastAsia",
+                                w('hAnsiTheme'): "minorHAnsi",
+                                w('cstheme'): "minorBidi"}),
+                    E.sz(**{w('val'): '22'}),
+                    E.szCs(**{w('val'): '22'}),
+                    E.lang(**{w('val'): 'en-US', w('eastAsia'): "en-US",
+                              w('bidi'): "ar-SA"})
                 )
             ),
             E.pPrDefault(
                 E.pPr(
-                    E.spacing(**{w('after'):"0", w('line'):"276", w('lineRule'):"auto"})
+                    E.spacing(**{w('after'): "0", w('line'): "276",
+                                 w('lineRule'): "auto"})
                )
            )
        )
@@ -103,8 +137,8 @@ def update_doc_props(root, mi, namespace):
     if mi.comments:
         setm('description', mi.comments)
     if mi.languages:
-        l = canonicalize_lang(mi.languages[0])
-        setm('language', lang_as_iso639_1(l) or l)
+        _l = canonicalize_lang(mi.languages[0])
+        setm('language', lang_as_iso639_1(_l) or _l)


 class DocumentRelationships(object):

@@ -115,8 +149,7 @@ class DocumentRelationships(object):
         for typ, target in {namespace.names['STYLES']: 'styles.xml',
                             namespace.names['NUMBERING']: 'numbering.xml',
                             namespace.names['WEB_SETTINGS']: 'webSettings.xml',
-                            namespace.names['FONTS']: 'fontTable.xml',
-                            }.items():
+                            namespace.names['FONTS']: 'fontTable.xml'}.items():
             self.add_relationship(target, typ)

     def get_relationship_id(self, target, rtype, target_mode=None):

@@ -134,7 +167,8 @@ class DocumentRelationships(object):

     def serialize(self):
         namespaces = self.namespace.namespaces
-        E = ElementMaker(namespace=namespaces['pr'], nsmap={None:namespaces['pr']})
+        E = ElementMaker(namespace=namespaces['pr'],
+                         nsmap={None: namespaces['pr']})
         relationships = E.Relationships()
         for (target, rtype, target_mode), rid in self.rmap.items():
             r = E.Relationship(Id=rid, Type=rtype, Target=target)

@@ -151,9 +185,12 @@ class DOCX(object):
         namespaces = self.namespace.namespaces
         self.opts, self.log = opts, log
         self.document_relationships = DocumentRelationships(self.namespace)
-        self.font_table = etree.Element('{%s}fonts' % namespaces['w'], nsmap={k:namespaces[k] for k in 'wr'})
-        self.numbering = etree.Element('{%s}numbering' % namespaces['w'], nsmap={k:namespaces[k] for k in 'wr'})
-        E = ElementMaker(namespace=namespaces['pr'], nsmap={None:namespaces['pr']})
+        self.font_table = etree.Element('{%s}fonts' % namespaces['w'],
+                                        nsmap={k: namespaces[k] for k in 'wr'})
+        self.numbering = etree.Element('{%s}numbering' % namespaces['w'],
+                                       nsmap={k: namespaces[k] for k in 'wr'})
+        E = ElementMaker(namespace=namespaces['pr'],
+                         nsmap={None: namespaces['pr']})
         self.embedded_fonts = E.Relationships()
         self.fonts = {}
         self.images = {}

@@ -161,21 +198,10 @@ class DOCX(object):
     # Boilerplate {{{
     @property
     def contenttypes(self):
-        E = ElementMaker(namespace=self.namespace.namespaces['ct'], nsmap={None:self.namespace.namespaces['ct']})
+        E = ElementMaker(namespace=self.namespace.namespaces['ct'],
+                         nsmap={None: self.namespace.namespaces['ct']})
         types = E.Types()
-        for partname, mt in {
-                "/word/footnotes.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.footnotes+xml",
-                "/word/document.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml",
-                "/word/numbering.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.numbering+xml",
-                "/word/styles.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.styles+xml",
-                "/word/endnotes.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.endnotes+xml",
-                "/word/settings.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.settings+xml",
-                "/word/theme/theme1.xml": "application/vnd.openxmlformats-officedocument.theme+xml",
-                "/word/fontTable.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.fontTable+xml",
-                "/word/webSettings.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.webSettings+xml",
-                "/docProps/core.xml": "application/vnd.openxmlformats-package.core-properties+xml",
-                "/docProps/app.xml": "application/vnd.openxmlformats-officedocument.extended-properties+xml",
-                }.items():
+        for partname, mt in WORD_TYPES.items():
             types.append(E.Override(PartName=partname, ContentType=mt))
         added = {'png', 'gif', 'jpeg', 'jpg', 'svg', 'xml'}
         for ext in added:

@@ -199,7 +225,8 @@ class DOCX(object):

     @property
     def appproperties(self):
-        E = ElementMaker(namespace=self.namespace.namespaces['ep'], nsmap={None:self.namespace.namespaces['ep']})
+        E = ElementMaker(namespace=self.namespace.namespaces['ep'],
+                         nsmap={None: self.namespace.namespaces['ep']})
         props = E.Properties(
             E.Application(__appname__),
             E.AppVersion('%02d.%04d' % numeric_version[:2]),

@@ -216,16 +243,17 @@ class DOCX(object):
     @property
     def containerrels(self):
         return textwrap.dedent('''\
-        <?xml version='1.0' encoding='utf-8'?>
-        <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
-            <Relationship Id="rId3" Type="{APPPROPS}" Target="docProps/app.xml"/>
-            <Relationship Id="rId2" Type="{DOCPROPS}" Target="docProps/core.xml"/>
-            <Relationship Id="rId1" Type="{DOCUMENT}" Target="word/document.xml"/>
-        </Relationships>'''.format(**self.namespace.names)).encode('utf-8')
+        <?xml version='1.0' encoding='utf-8'?>
+        <Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">
+            <Relationship Id="rId3" Type="{APPPROPS}" Target="docProps/app.xml"/>
+            <Relationship Id="rId2" Type="{DOCPROPS}" Target="docProps/core.xml"/>
+            <Relationship Id="rId1" Type="{DOCUMENT}" Target="word/document.xml"/>
+        </Relationships>'''.format(**self.namespace.names)).encode('utf-8')  # noqa

     @property
     def websettings(self):
-        E = ElementMaker(namespace=self.namespace.namespaces['w'], nsmap={'w':self.namespace.namespaces['w']})
+        E = ElementMaker(namespace=self.namespace.namespaces['w'],
+                         nsmap={'w': self.namespace.namespaces['w']})
         ws = E.webSettings(
             E.optimizeForBrowser, E.allowPNG, E.doNotSaveAsSingleFile)
         return xml2str(ws)

@@ -234,11 +262,15 @@ class DOCX(object):

     def convert_metadata(self, mi):
         namespaces = self.namespace.namespaces
-        E = ElementMaker(namespace=namespaces['cp'], nsmap={x:namespaces[x] for x in 'cp dc dcterms xsi'.split()})
+        E = ElementMaker(namespace=namespaces['cp'],
+                         nsmap={x: namespaces[x]
+                                for x in 'cp dc dcterms xsi'.split()})
         cp = E.coreProperties(E.revision("1"), E.lastModifiedBy('calibre'))
         ts = utcnow().isoformat('T').rpartition('.')[0] + 'Z'
         for x in 'created modified'.split():
-            x = cp.makeelement('{%s}%s' % (namespaces['dcterms'], x), **{'{%s}type' % namespaces['xsi']:'dcterms:W3CDTF'})
+            x = cp.makeelement('{%s}%s' % (namespaces['dcterms'], x),
+                               **{'{%s}type' %
+                                  namespaces['xsi']: 'dcterms:W3CDTF'})
             x.text = ts
             cp.append(x)
         self.mi = mi

@@ -261,8 +293,10 @@ class DOCX(object):
             zf.writestr('word/styles.xml', xml2str(self.styles))
             zf.writestr('word/numbering.xml', xml2str(self.numbering))
             zf.writestr('word/fontTable.xml', xml2str(self.font_table))
-            zf.writestr('word/_rels/document.xml.rels', self.document_relationships.serialize())
-            zf.writestr('word/_rels/fontTable.xml.rels', xml2str(self.embedded_fonts))
+            zf.writestr('word/_rels/document.xml.rels',
+                        self.document_relationships.serialize())
+            zf.writestr('word/_rels/fontTable.xml.rels',
+                        xml2str(self.embedded_fonts))
             for fname, data_getter in self.images.items():
                 zf.writestr(fname, data_getter())
             for fname, data in self.fonts.items():
@@ -18,7 +18,7 @@ try:
     _author_pat = re.compile(tweaks['authors_split_regex'])
 except Exception:
     prints('Author split regexp:', tweaks['authors_split_regex'],
-            'is invalid, using default')
+           'is invalid, using default')
     _author_pat = re.compile(r'(?i),?\s+(and|with)\s+')

@@ -76,7 +76,8 @@ def author_to_author_sort(author, method=None):
     if method == 'copy':
         return author

-    prefixes = {force_unicode(y).lower() for y in tweaks['author_name_prefixes']}
+    prefixes = {force_unicode(y).lower()
+                for y in tweaks['author_name_prefixes']}
     prefixes |= {y+'.' for y in prefixes}
     while True:
         if not tokens:

@@ -87,7+88,8 @@ def author_to_author_sort(author, method=None):
         else:
             break

-    suffixes = {force_unicode(y).lower() for y in tweaks['author_name_suffixes']}
+    suffixes = {force_unicode(y).lower()
+                for y in tweaks['author_name_suffixes']}
     suffixes |= {y+'.' for y in suffixes}

     suffix = ''

@@ -144,7 +146,7 @@ def get_title_sort_pat(lang=None):
     except:
         ans = frozenset((r'A\s+', r'The\s+', r'An\s+'))
     ans = '|'.join(ans)
-    ans = '^(%s)'%ans
+    ans = '^(%s)' % ans
     try:
         ans = re.compile(ans, re.IGNORECASE)
     except:

@@ -154,7 +156,7 @@ def get_title_sort_pat(lang=None):


 _ignore_starts = '\'"'+''.join(chr(x) for x in
-        list(range(0x2018, 0x201e))+[0x2032, 0x2033])
+                               list(range(0x2018, 0x201e))+[0x2032, 0x2033])


 def title_sort(title, order=None, lang=None):

@@ -12,8 +12,7 @@ from lxml import etree
 from ebook_converter.utils.date import parse_only_date
 from ebook_converter.utils.img import save_cover_data_to
 from ebook_converter.utils.imghdr import identify
-from ebook_converter import guess_type, guess_all_extensions, prints, \
-    force_unicode
+from ebook_converter import guess_all_extensions, prints, force_unicode
 from ebook_converter.ebooks.metadata import MetaInformation, check_isbn
 from ebook_converter.ebooks.chardet import xml_to_unicode
 from ebook_converter.polyglot.binary import as_base64_unicode

@@ -10,11 +10,11 @@ import mimetypes
 import os
 import re
 import sys
+import textwrap
+import traceback
 import unittest
 import urllib.parse
 import uuid
-import traceback
-import textwrap

 from lxml import etree
 from lxml.builder import ElementMaker

@@ -32,7 +32,7 @@ from ebook_converter.ebooks.metadata import string_to_authors, \
 from ebook_converter.ebooks.metadata.book.base import Metadata
 from ebook_converter.utils.date import parse_date, isoformat
 from ebook_converter.utils.localization import get_lang, canonicalize_lang
-from ebook_converter import prints, guess_type
+from ebook_converter import prints
 from ebook_converter.utils.cleantext import clean_ascii_chars, clean_xml_chars
 from ebook_converter.utils.config import tweaks
 from ebook_converter.polyglot.urllib import unquote

@@ -1807,8 +1807,7 @@ def test_m2o():
 class OPFTest(unittest.TestCase):

     def setUp(self):
-        self.stream = io.BytesIO(
-b'''\
+        self.stream = io.BytesIO(b'''\
 <?xml version="1.0" encoding="UTF-8"?>
 <package version="2.0" xmlns="http://www.idpf.org/2007/opf" >
     <metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf">

@@ -1827,8 +1826,7 @@ b'''\
         <item id="1" href="a%20%7E%20b" media-type="text/txt" />
     </manifest>
 </package>
-'''
-        )
+''')
         self.opf = OPF(self.stream, os.getcwd())

     def testReading(self, opf=None):
@@ -1,10 +1,15 @@
-import shutil, os, re, struct, textwrap, io
+import io
+import logging
+import mimetypes
+import os
+import re
+import shutil
+import struct
+import textwrap

 from lxml import html, etree

-from ebook_converter import xml_entity_to_unicode, entity_to_unicode, guess_type
+from ebook_converter import xml_entity_to_unicode, entity_to_unicode
 from ebook_converter.utils.cleantext import clean_ascii_chars, clean_xml_chars
 from ebook_converter.ebooks import DRMError, unit_convert
 from ebook_converter.ebooks.chardet import strip_encoding_declarations

@@ -15,15 +20,11 @@ from ebook_converter.ebooks.metadata import MetaInformation
 from ebook_converter.ebooks.metadata.opf2 import OPFCreator, OPF
 from ebook_converter.ebooks.metadata.toc import TOC
 from ebook_converter.ebooks.mobi.reader.headers import BookHeader
-from ebook_converter.utils.img import save_cover_data_to, gif_data_to_png_data, AnimatedGIF
+from ebook_converter.utils.img import save_cover_data_to, gif_data_to_png_data
+from ebook_converter.utils.img import AnimatedGIF
 from ebook_converter.utils.imghdr import what


-__license__ = 'GPL v3'
-__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
-__docformat__ = 'restructuredtext en'
-
-
 class TopazError(ValueError):
     pass

@@ -38,13 +39,14 @@ class KFXError(ValueError):


 class MobiReader(object):
-    PAGE_BREAK_PAT = re.compile(
-        r'<\s*/{0,1}\s*mbp:pagebreak((?:\s+[^/>]*){0,1})/{0,1}\s*>\s*(?:<\s*/{0,1}\s*mbp:pagebreak\s*/{0,1}\s*>)*',
-        re.IGNORECASE)
+    PAGE_BREAK_PAT = re.compile(r'<\s*/{0,1}\s*mbp:pagebreak((?:\s+[^/>]*)'
+                                r'{0,1})/{0,1}\s*>\s*(?:<\s*/{0,1}'
+                                r'\s*mbp:pagebreak\s*/{0,1}\s*>)*',
+                                re.IGNORECASE)
     IMAGE_ATTRS = ('lowrecindex', 'recindex', 'hirecindex')

     def __init__(self, filename_or_stream, log, user_encoding=None, debug=None,
-            try_extra_data_fix=False):
+                 try_extra_data_fix=False):
         self.log = log
         self.debug = debug
         self.embedded_mi = None

@@ -83,8 +85,8 @@ class MobiReader(object):
         if raw.startswith(b'\xeaDRMION\xee'):
             raise KFXError()

-        self.header   = raw[0:72]
-        self.name     = self.header[:32].replace(b'\x00', b'')
+        self.header = raw[0:72]
+        self.name = self.header[:32].replace(b'\x00', b'')
         self.num_sections, = struct.unpack('>H', raw[76:78])

         self.ident = self.header[0x3C:0x3C + 8].upper()

@@ -94,7 +96,9 @@ class MobiReader(object):
         self.sections = []
         self.section_headers = []
         for i in range(self.num_sections):
-            offset, a1, a2, a3, a4 = struct.unpack('>LBBBB', raw[78 + i * 8:78 + i * 8 + 8])
+            offset, a1, a2, a3, a4 = struct.unpack('>LBBBB',
+                                                   raw[78 + i * 8:78 +
+                                                       i * 8 + 8])
             flags, val = a1, a2 << 16 | a3 << 8 | a4
             self.section_headers.append((offset, flags, val))

@@ -109,8 +113,9 @@ class MobiReader(object):
         for i in range(self.num_sections):
             self.sections.append((section(i), self.section_headers[i]))

-        self.book_header = bh = BookHeader(self.sections[0][0], self.ident,
-            user_encoding, self.log, try_extra_data_fix=try_extra_data_fix)
+        bh = BookHeader(self.sections[0][0], self.ident, user_encoding,
+                        self.log, try_extra_data_fix=try_extra_data_fix)
+        self.book_header = bh
         self.name = self.name.decode(self.book_header.codec, 'replace')
         self.kf8_type = None
         k8i = getattr(self.book_header.exth, 'kf8_header', None)

@@ -118,18 +123,20 @@ class MobiReader(object):
             # Ancient PRC files from Baen can have random values for
             # mobi_version, so be conservative
             if (self.book_header.mobi_version == 8 and hasattr(self.book_header,
-                'skelidx')):
+                                                               'skelidx')):
                 self.kf8_type = 'standalone'
             elif k8i is not None:  # Check for joint mobi 6 and kf 8 file
                 try:
                     raw = self.sections[k8i-1][0]
-                except:
+                except Exception:
                     raw = None
                 if raw == b'BOUNDARY':
                     try:
                         self.book_header = BookHeader(self.sections[k8i][0],
-                                self.ident, user_encoding, self.log)
-                        self.book_header.kf8_first_image_index = self.book_header.first_image_index + k8i
+                                                      self.ident, user_encoding,
+                                                      self.log)
+                        _kfii = self.book_header.first_image_index + k8i
+                        self.book_header.kf8_first_image_index = _kfii
                         self.book_header.mobi6_records = bh.records

                         # Need the first_image_index from the mobi 6 header as well

@@ -143,14 +150,14 @@ class MobiReader(object):
                         self.kf8_type = 'joint'
                         self.kf8_boundary = k8i-1
-                    except:
+                    except Exception:
                         self.book_header = bh

     def check_for_drm(self):
         if self.book_header.encryption_type != 0:
             try:
                 name = self.book_header.exth.mi.title
-            except:
+            except Exception:
                 name = self.name
             if not name:
                 name = self.name
@@ -163,20 +170,20 @@ class MobiReader(object):
         if self.debug is not None:
             parse_cache['calibre_raw_mobi_markup'] = self.mobi_html
         self.add_anchors()
-        self.processed_html = self.processed_html.decode(self.book_header.codec,
-            'ignore')
+        self.processed_html = self.processed_html.decode(
+            self.book_header.codec, 'ignore')
         self.processed_html = self.processed_html.replace('</</', '</')
         self.processed_html = re.sub(r'</([a-zA-Z]+)<', r'</\1><',
-                self.processed_html)
+                                     self.processed_html)
         self.processed_html = self.processed_html.replace('\ufeff', '')
         # Remove tags of the form <xyz: ...> as they can cause issues further
         # along the pipeline
         self.processed_html = re.sub(r'</{0,1}[a-zA-Z]+:\s+[^>]*>', '',
-                self.processed_html)
+                                     self.processed_html)

         self.processed_html = strip_encoding_declarations(self.processed_html)
         self.processed_html = re.sub(r'&(\S+?);', xml_entity_to_unicode,
-                self.processed_html)
+                                     self.processed_html)
         image_name_map = self.extract_images(processed_records, output_dir)
         self.replace_page_breaks()
         self.cleanup_html()

@@ -186,31 +193,41 @@ class MobiReader(object):
         try:
             root = html.fromstring(self.processed_html)
             if len(root.xpath('//html')) > 5:
-                root = html.fromstring(self.processed_html.replace('\x0c',
-                    '').replace('\x14', ''))
+                root = html.fromstring(self.processed_html
+                                       .replace('\x0c', '')
+                                       .replace('\x14', ''))
         except Exception:
-            self.log.warning('MOBI markup appears to contain random bytes. Stripping.')
+            self.log.warning('MOBI markup appears to contain random bytes. '
+                             'Stripping.')
             self.processed_html = self.remove_random_bytes(self.processed_html)
             root = html.fromstring(self.processed_html)
         if root.xpath('descendant::p/descendant::p'):
             from html5_parser import parse
             self.log.warning('Malformed markup, parsing using html5-parser')
-            self.processed_html = strip_encoding_declarations(self.processed_html)
+            self.processed_html = strip_encoding_declarations(
+                self.processed_html)
            # These trip up the html5 parser causing all content to be placed
            # under the <guide> tag
-            self.processed_html = re.sub(r'<metadata>.+?</metadata>', '', self.processed_html, flags=re.I)
-            self.processed_html = re.sub(r'<guide>.+?</guide>', '', self.processed_html, flags=re.I)
+            self.processed_html = re.sub(r'<metadata>.+?</metadata>', '',
+                                         self.processed_html, flags=re.I)
+            self.processed_html = re.sub(r'<guide>.+?</guide>', '',
+                                         self.processed_html, flags=re.I)
            try:
-                root = parse(self.processed_html, maybe_xhtml=False, keep_doctype=False, sanitize_names=True)
+                root = parse(self.processed_html, maybe_xhtml=False,
+                             keep_doctype=False, sanitize_names=True)
            except Exception:
-                self.log.warning('MOBI markup appears to contain random bytes. Stripping.')
-                self.processed_html = self.remove_random_bytes(self.processed_html)
-                root = parse(self.processed_html, maybe_xhtml=False, keep_doctype=False, sanitize_names=True)
+                self.log.warning('MOBI markup appears to contain random '
+                                 'bytes. Stripping.')
+                self.processed_html = self.remove_random_bytes(
+                    self.processed_html)
+                root = parse(self.processed_html, maybe_xhtml=False,
+                             keep_doctype=False, sanitize_names=True)
            if len(root.xpath('body/descendant::*')) < 1:
                # There are probably stray </html>s in the markup
                self.processed_html = self.processed_html.replace('</html>',
-                        '')
-                root = parse(self.processed_html, maybe_xhtml=False, keep_doctype=False, sanitize_names=True)
+                                                                  '')
+                root = parse(self.processed_html, maybe_xhtml=False,
+                             keep_doctype=False, sanitize_names=True)

         if root.tag != 'html':
             self.log.warn('File does not have opening <html> tag')

@@ -253,13 +270,14 @@ class MobiReader(object):
             head = root.makeelement('head', {})
             root.insert(0, head)
             head.text = '\n\t'
-            link = head.makeelement('link', {'type':'text/css',
-                'href':'styles.css', 'rel':'stylesheet'})
+            link = head.makeelement('link', {'type': 'text/css',
+                                             'href': 'styles.css',
+                                             'rel': 'stylesheet'})
             head.insert(0, link)
             link.tail = '\n\t'
             title = head.xpath('descendant::title')
-            m = head.makeelement('meta', {'http-equiv':'Content-Type',
-                'content':'text/html; charset=utf-8'})
+            m = head.makeelement('meta', {'http-equiv': 'Content-Type',
+                                          'content': 'text/html; charset=utf-8'})
             head.insert(0, m)
             if not title:
                 title = head.makeelement('title', {})

@@ -283,7 +301,8 @@ class MobiReader(object):
         try:
             for ref in guide.xpath('descendant::reference'):
                 if 'href' in ref.attrib:
-                    ref.attrib['href'] = os.path.basename(htmlfile) + ref.attrib['href']
+                    ref.attrib['href'] = (os.path.basename(htmlfile) +
+                                          ref.attrib['href'])
         except AttributeError:
             pass

@@ -299,7 +318,7 @@ class MobiReader(object):
             opf, ncx_manifest_entry = self.create_opf(htmlfile, guide, root)
             self.created_opf_path = os.path.splitext(htmlfile)[0] + '.opf'
             opf.render(open(self.created_opf_path, 'wb'), ncx,
-                    ncx_manifest_entry=ncx_manifest_entry)
+                       ncx_manifest_entry=ncx_manifest_entry)
             ncx = ncx.getvalue()
             if ncx:
                 ncx_path = os.path.join(os.path.dirname(htmlfile), 'toc.ncx')

@@ -313,9 +332,9 @@ class MobiReader(object):
         if self.book_header.exth is not None or self.embedded_mi is not None:
             self.log.debug('Creating OPF...')
             ncx = io.BytesIO()
-            opf, ncx_manifest_entry = self.create_opf(htmlfile, guide, root)
+            opf, ncx_manifest_entry = self.create_opf(htmlfile, guide, root)
             opf.render(open(os.path.splitext(htmlfile)[0] + '.opf', 'wb'), ncx,
-                ncx_manifest_entry)
+                       ncx_manifest_entry)
             ncx = ncx.getvalue()
             if ncx:
                 write_as_utf8(os.path.splitext(htmlfile)[0] + '.ncx', ncx)
@@ -348,28 +367,46 @@ class MobiReader(object):
|
||||
|
||||
def cleanup_html(self):
|
||||
self.log.debug('Cleaning up HTML...')
|
||||
self.processed_html = re.sub(r'<div height="0(pt|px|ex|em|%){0,1}"></div>', '', self.processed_html)
|
||||
if self.book_header.ancient and b'<html' not in self.mobi_html[:300].lower():
|
||||
self.processed_html = '<html><p>' + self.processed_html.replace('\n\n', '<p>') + '</html>'
|
||||
self.processed_html = re.sub(r'<div height="0(pt|px|ex|em|%){0,1}">'
|
||||
'</div>', '', self.processed_html)
|
||||
if (self.book_header.ancient and
|
||||
b'<html' not in self.mobi_html[:300].lower()):
|
||||
self.processed_html = ('<html><p>' +
|
||||
self.processed_html.replace('\n\n', '<p>') +
|
||||
'</html>')
|
||||
self.processed_html = self.processed_html.replace('\r\n', '\n')
|
||||
self.processed_html = self.processed_html.replace('> <', '>\n<')
|
||||
self.processed_html = self.processed_html.replace('<mbp: ', '<mbp:')
|
||||
self.processed_html = re.sub(r'<\?xml[^>]*>', '', self.processed_html)
|
||||
self.processed_html = re.sub(r'<\s*(/?)\s*o:p[^>]*>', r'', self.processed_html)
|
||||
# Swap inline and block level elements, and order block level elements according to priority
|
||||
# - lxml and beautifulsoup expect/assume a specific order based on xhtml spec
|
||||
self.processed_html = re.sub(
|
||||
r'(?i)(?P<styletags>(<(h\d+|i|b|u|em|small|big|strong|tt)>\s*){1,})(?P<para><p[^>]*>)', r'\g<para>'+r'\g<styletags>', self.processed_html)
|
||||
self.processed_html = re.sub(
|
||||
r'(?i)(?P<para></p[^>]*>)\s*(?P<styletags>(</(h\d+|i|b|u|em|small|big|strong|tt)>\s*){1,})', r'\g<styletags>'+r'\g<para>', self.processed_html)
|
||||
self.processed_html = re.sub(
|
||||
r'(?i)(?P<blockquote>(</(blockquote|div)[^>]*>\s*){1,})(?P<para></p[^>]*>)', r'\g<para>'+r'\g<blockquote>', self.processed_html)
|
||||
self.processed_html = re.sub(
|
||||
r'(?i)(?P<para><p[^>]*>)\s*(?P<blockquote>(<(blockquote|div)[^>]*>\s*){1,})', r'\g<blockquote>'+r'\g<para>', self.processed_html)
|
||||
self.processed_html = re.sub(r'<\s*(/?)\s*o:p[^>]*>', r'',
|
||||
self.processed_html)
|
||||
# Swap inline and block level elements, and order block level elements
|
||||
# according to priority
|
||||
# - lxml and beautifulsoup expect/assume a specific order based on
|
||||
# xhtml spec
|
||||
self.processed_html = re.sub(r'(?i)(?P<styletags>(<(h\d+|i|b|u|em|'
|
||||
r'small|big|strong|tt)>\s*){1,})'
|
||||
r'(?P<para><p[^>]*>)',
|
||||
r'\g<para>' + r'\g<styletags>',
|
||||
self.processed_html)
|
||||
self.processed_html = re.sub(r'(?i)(?P<para></p[^>]*>)\s*'
|
||||
r'(?P<styletags>(</(h\d+|i|b|u|em|small|'
|
||||
r'big|strong|tt)>\s*){1,})',
|
||||
r'\g<styletags>' + r'\g<para>',
|
||||
self.processed_html)
|
||||
self.processed_html = re.sub(r'(?i)(?P<blockquote>(</(blockquote|div)'
|
||||
r'[^>]*>\s*){1,})(?P<para></p[^>]*>)',
|
||||
r'\g<para>' + r'\g<blockquote>',
|
||||
self.processed_html)
|
||||
self.processed_html = re.sub(r'(?i)(?P<para><p[^>]*>)\s*'
|
||||
r'(?P<blockquote>(<(blockquote|div)[^>]*>'
|
||||
r'\s*){1,})',
|
||||
r'\g<blockquote>' + r'\g<para>',
|
||||
self.processed_html)
|
||||
bods = htmls = 0
|
||||
for x in re.finditer('</body>|</html>', self.processed_html):
|
||||
if x == '</body>':
|
||||
bods +=1
|
||||
bods += 1
|
||||
else:
|
||||
htmls += 1
|
||||
if bods > 1 and htmls > 1:
|
||||
@@ -380,8 +417,8 @@ class MobiReader(object):
|
||||
self.processed_html = self.processed_html.replace('</html>', '')
|
||||
|
||||
def remove_random_bytes(self, html):
|
||||
return re.sub('\x14|\x15|\x19|\x1c|\x1d|\xef|\x12|\x13|\xec|\x08|\x01|\x02|\x03|\x04|\x05|\x06|\x07',
|
||||
'', html)
|
||||
return re.sub('\x14|\x15|\x19|\x1c|\x1d|\xef|\x12|\x13|\xec|\x08|\x01'
|
||||
'|\x02|\x03|\x04|\x05|\x06|\x07', '', html)
|
||||
|
||||
def ensure_unit(self, raw, unit='px'):
|
||||
if re.search(r'\d+$', raw) is not None:
|
||||
@@ -448,9 +485,10 @@ class MobiReader(object):
                        # discarded by a renderer
                        tag.text = '\u00a0'  # nbsp
                        styles.append('height: %s' %
                                self.ensure_unit(height))
                                      self.ensure_unit(height))
                    else:
                        styles.append('margin-top: %s' % self.ensure_unit(height))
                        styles.append('margin-top: %s' %
                                      self.ensure_unit(height))
            if 'width' in attrib:
                width = attrib.pop('width').strip()
                if width and re.search(r'\d+', width):
@@ -464,14 +502,16 @@ class MobiReader(object):
                    try:
                        ewidth_val = unit_convert(ewidth, 12, 500, 166)
                        self.text_indents[tag] = ewidth_val
                    except:
                    except Exception:
                        pass
                if width.startswith('-'):
                    styles.append('margin-left: %s' % self.ensure_unit(width[1:]))
                    styles.append('margin-left: %s' %
                                  self.ensure_unit(width[1:]))
                    try:
                        ewidth_val = unit_convert(ewidth[1:], 12, 500, 166)
                        ewidth_val = unit_convert(ewidth[1:],
                                                  12, 500, 166)
                        self.left_margins[tag] = ewidth_val
                    except:
                    except Exception:
                        pass

            if 'align' in attrib:
@@ -514,16 +554,20 @@ class MobiReader(object):
                except Exception:
                    pass
                else:
                    attrib['src'] = 'images/' + image_name_map.get(recindex, '%05d.jpg' % recindex)
                    attrib['src'] = ('images/' +
                                     image_name_map.get(recindex,
                                                        '%05d.jpg' %
                                                        recindex))
            for attr in ('width', 'height'):
                if attr in attrib:
                    val = attrib[attr]
                    if val.lower().endswith('em'):
                        try:
                            nval = float(val[:-2])
                            nval *= 16 * (168.451/72) # Assume this was set using the Kindle profile
                            attrib[attr] = "%dpx"%int(nval)
                        except:
                            # Assume this was set using the Kindle profile
                            nval *= 16 * (168.451/72)
                            attrib[attr] = "%dpx" % int(nval)
                        except Exception:
                            del attrib[attr]
                    elif val.lower().endswith('%'):
                        del attrib[attr]
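
For reference, the em-to-pixel conversion in this hunk scales a CSS 'em' length by 16 px per em and by the 168.451 DPI figure of the Kindle profile it assumes; a worked sketch (not part of the commit):

def em_to_px(val):
    # e.g. val == '1.5em'
    nval = float(val[:-2])       # 1.5
    nval *= 16 * (168.451 / 72)  # 16 px per em, scaled to 168.451 DPI
    return '%dpx' % int(nval)

assert em_to_px('1.5em') == '56px'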
@@ -550,10 +594,12 @@ class MobiReader(object):
                    attrib['href'] = "#filepos%d" % int(filepos)
                except ValueError:
                    pass
            if (tag.tag == 'a' and attrib.get('id', '').startswith('filepos') and
                    not tag.text and len(tag) == 0 and (tag.tail is None or not
                        tag.tail.strip()) and getattr(tag.getnext(), 'tag',
                            None) in BLOCK_TAGS):
            if (tag.tag == 'a' and
                    attrib.get('id', '').startswith('filepos') and
                    not tag.text and len(tag) == 0 and
                    (tag.tail is None or
                     not tag.tail.strip()) and
                    getattr(tag.getnext(), 'tag', None) in BLOCK_TAGS):
                # This is an empty anchor immediately before a block tag, move
                # the id onto the block tag instead
                forwardable_anchors.append(tag)
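
The condition above targets markup like

    <a id="filepos12345"></a><p>Chapter text ...</p>

where the empty anchor exists only to carry a link target. Forwarding moves the id onto the following block tag instead, so the target survives without an empty inline element:

    <p id="filepos12345">Chapter text ...</p>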
@@ -625,11 +671,11 @@ class MobiReader(object):
        ti = self.text_indents.get(tag, ti)
        try:
            lm = float(lm)
        except:
        except Exception:
            lm = 0.0
        try:
            ti = float(ti)
        except:
        except Exception:
            ti = 0.0
        return lm + ti

@@ -647,13 +693,14 @@ class MobiReader(object):
            mi = MetaInformation(self.book_header.title, ['Unknown'])
        opf = OPFCreator(os.path.dirname(htmlfile), mi)
        if hasattr(self.book_header.exth, 'cover_offset'):
            opf.cover = 'images/%05d.jpg' % (self.book_header.exth.cover_offset + 1)
            opf.cover = 'images/%05d.jpg' % (self.book_header
                                             .exth.cover_offset + 1)
        elif mi.cover is not None:
            opf.cover = mi.cover
        else:
            opf.cover = 'images/%05d.jpg' % 1
            if not os.path.exists(os.path.join(os.path.dirname(htmlfile),
                * opf.cover.split('/'))):
                                               * opf.cover.split('/'))):
                opf.cover = None

        cover = opf.cover

@@ -669,7 +716,7 @@ class MobiReader(object):
                opf.cover = ncover.replace(os.sep, '/')

        manifest = [(htmlfile, 'application/xhtml+xml'),
            (os.path.abspath('styles.css'), 'text/css')]
                    (os.path.abspath('styles.css'), 'text/css')]
        bp = os.path.dirname(htmlfile)
        added = set()
        for i in getattr(self, 'image_names', []):
@@ -708,15 +755,17 @@ class MobiReader(object):
                if href and re.match(r'\w+://', href) is None:
                    try:
                        text = ' '.join([t.strip() for t in
                            x.xpath('descendant::text()')])
                    except:
                                         x.xpath('descendant:'
                                                 ':text()')])
                    except Exception:
                        text = ''
                    text = ent_pat.sub(entity_to_unicode, text)
                    item = tocobj.add_item(toc.partition('#')[0], href[1:],
                        text)
                    item = tocobj.add_item(toc.partition('#')[0],
                                           href[1:], text)
                    item.left_space = int(self.get_left_whitespace(x))
                    found = True
            if reached and found and x.get('class', None) == 'mbp_pagebreak':
            if (reached and found and
                    x.get('class', None) == 'mbp_pagebreak'):
                break
        if tocobj is not None:
            tocobj = self.structure_toc(tocobj)
@@ -748,7 +797,7 @@ class MobiReader(object):
            level = indent_vals.index(item.left_space)
            parent = find_parent(level)
            last_found[level] = parent.add_item(item.href, item.fragment,
                    item.text)
                                                item.text)

        return newtoc

@@ -782,7 +831,9 @@ class MobiReader(object):
    def warn_about_trailing_entry_corruption(self):
        if not self.warned_about_trailing_entry_corruption:
            self.warned_about_trailing_entry_corruption = True
            self.log.warn('The trailing data entries in this MOBI file are corrupted, you might see corrupted text in the output')
            self.log.warn('The trailing data entries in this MOBI file are '
                          'corrupted, you might see corrupted text in the '
                          'output')

    def text_section(self, index):
        data = self.sections[index][0]
@@ -791,19 +842,23 @@ class MobiReader(object):

    def extract_text(self, offset=1):
        self.log.debug('Extracting text...')
        text_sections = [self.text_section(i) for i in range(offset,
            min(self.book_header.records + offset, len(self.sections)))]
        text_sections = [self.text_section(i)
                         for i in range(offset, min(self.book_header.records
                                                    + offset,
                                                    len(self.sections)))]
        processed_records = list(range(offset-1, self.book_header.records +
            offset))
                                       offset))

        self.mobi_html = b''

        if self.book_header.compression_type == b'DH':
            huffs = [self.sections[i][0] for i in
                range(self.book_header.huff_offset,
                    self.book_header.huff_offset + self.book_header.huff_number)]
            huffs = [self.sections[i][0]
                     for i in range(self.book_header.huff_offset,
                                    self.book_header.huff_offset +
                                    self.book_header.huff_number)]
            processed_records += list(range(self.book_header.huff_offset,
                self.book_header.huff_offset + self.book_header.huff_number))
                                            self.book_header.huff_offset +
                                            self.book_header.huff_number))
            huff = HuffReader(huffs)
            unpack = huff.unpack

@@ -811,19 +866,23 @@ class MobiReader(object):
            unpack = decompress_doc

        elif self.book_header.compression_type == b'\x00\x01':
            unpack = lambda x: x
            unpack = lambda x: x  # noqa
        else:
            raise MobiError('Unknown compression algorithm: %r' % self.book_header.compression_type)
            raise MobiError('Unknown compression algorithm: %r' %
                            self.book_header.compression_type)
        self.mobi_html = b''.join(map(unpack, text_sections))
        if self.mobi_html.endswith(b'#'):
            self.mobi_html = self.mobi_html[:-1]

        if self.book_header.ancient and b'<html' not in self.mobi_html[:300].lower():
        if (self.book_header.ancient and
                b'<html' not in self.mobi_html[:300].lower()):
            self.mobi_html = self.mobi_html.replace(b'\r ', b'\n\n ')
        self.mobi_html = self.mobi_html.replace(b'\0', b'')
        if self.book_header.codec == 'cp1252':
            self.mobi_html = self.mobi_html.replace(b'\x1e', b'')  # record separator
            self.mobi_html = self.mobi_html.replace(b'\x02', b'')  # start of text
            # record separator
            self.mobi_html = self.mobi_html.replace(b'\x1e', b'')
            # start of text
            self.mobi_html = self.mobi_html.replace(b'\x02', b'')
        return processed_records

    def replace_page_breaks(self):
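
To summarize the dispatch in this hunk: extract_text() picks an unpack function from the record compression type and then joins the unpacked text records. A minimal sketch (not part of the commit; the guard for the PalmDOC branch falls outside the hunk, and b'\x00\x02' is the conventional value for it):

def pick_unpack(compression_type, huff_unpack):
    if compression_type == b'DH':        # HUFF/CDIC compression
        return huff_unpack
    if compression_type == b'\x00\x02':  # PalmDOC compression
        return decompress_doc
    if compression_type == b'\x00\x01':  # no compression
        return lambda x: x
    raise MobiError('Unknown compression algorithm: %r' % compression_type)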
@@ -835,7 +894,7 @@ class MobiReader(object):
        self.log.debug('Adding anchors...')
        positions = set()
        link_pattern = re.compile(br'''<[^<>]+filepos=['"]{0,1}(\d+)[^<>]*>''',
                re.IGNORECASE)
                                  re.IGNORECASE)
        for match in link_pattern.finditer(self.mobi_html):
            positions.add(int(match.group(1)))
        pos = 0
@@ -845,12 +904,13 @@ class MobiReader(object):
            if end == 0:
                continue
            oend = end
            l = self.mobi_html.find(b'<', end)
            _l = self.mobi_html.find(b'<', end)
            r = self.mobi_html.find(b'>', end)
            anchor = b'<a id="filepos%d"></a>'
            if r > -1 and (r < l or l == end or l == -1):
            if r > -1 and (r < _l or _l == end or _l == -1):
                p = self.mobi_html.rfind(b'<', 0, end + 1)
                if (pos < end and p > -1 and not end_tag_re.match(self.mobi_html[p:r]) and
                if (pos < end and p > -1 and
                        not end_tag_re.match(self.mobi_html[p:r]) and
                        not self.mobi_html[p:r + 1].endswith(b'/>')):
                    anchor = b' filepos-id="filepos%d"'
                    end = r
@@ -862,8 +922,9 @@ class MobiReader(object):
        processed_html = b''.join(processed_html)

        # Remove anchors placed inside entities
        self.processed_html = re.sub(br'&([^;]*?)(<a id="filepos\d+"></a>)([^;]*);',
                br'&\1\3;\2', processed_html)
        self.processed_html = re.sub(br'&([^;]*?)(<a id="filepos\d+"></a>)'
                                     br'([^;]*);', br'&\1\3;\2',
                                     processed_html)
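
The substitution above repairs cases where an anchor landed in the middle of a character entity; a minimal demonstration (not part of the commit):

import re

raw = b'&am<a id="filepos123"></a>p;'
fixed = re.sub(br'&([^;]*?)(<a id="filepos\d+"></a>)([^;]*);',
               br'&\1\3;\2', raw)
assert fixed == b'&amp;<a id="filepos123"></a>'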
    def extract_images(self, processed_records, output_dir):
        self.log.debug('Extracting images...')
@@ -881,10 +942,11 @@ class MobiReader(object):
            if i in processed_records:
                continue
            processed_records.append(i)
            data  = self.sections[i][0]
            data = self.sections[i][0]
            image_index += 1
            if data[:4] in {b'FLIS', b'FCIS', b'SRCS', b'\xe9\x8e\r\n',
                    b'RESC', b'BOUN', b'FDST', b'DATP', b'AUDI', b'VIDE'}:
                            b'RESC', b'BOUN', b'FDST', b'DATP', b'AUDI',
                            b'VIDE'}:
                # This record is a known non image type, no need to try to
                # load the image
                continue
@@ -920,16 +982,17 @@ class MobiReader(object):


def test_mbp_regex():
    for raw, m in {'<mbp:pagebreak></mbp:pagebreak>':'',
        '<mbp:pagebreak xxx></mbp:pagebreak>yyy':' xxxyyy',
        '<mbp:pagebreak> </mbp:pagebreak>':'',
        '<mbp:pagebreak>xxx':'xxx',
        '<mbp:pagebreak/>xxx':'xxx',
        '<mbp:pagebreak sdf/ >xxx':' sdfxxx',
        '<mbp:pagebreak / >':' ',
        '</mbp:pagebreak>':'',
        '</mbp:pagebreak sdf>':' sdf',
        '</mbp:pagebreak><mbp:pagebreak></mbp:pagebreak>xxx':'xxx'}.items():
    for raw, m in {'<mbp:pagebreak></mbp:pagebreak>': '',
                   '<mbp:pagebreak xxx></mbp:pagebreak>yyy': ' xxxyyy',
                   '<mbp:pagebreak> </mbp:pagebreak>': '',
                   '<mbp:pagebreak>xxx': 'xxx',
                   '<mbp:pagebreak/>xxx': 'xxx',
                   '<mbp:pagebreak sdf/ >xxx': ' sdfxxx',
                   '<mbp:pagebreak / >': ' ',
                   '</mbp:pagebreak>': '',
                   '</mbp:pagebreak sdf>': ' sdf',
                   '</mbp:pagebreak><mbp:pagebreak></mbp:pagebreak>xxx':
                       'xxx'}.items():
        ans = MobiReader.PAGE_BREAK_PAT.sub(r'\1', raw)
        if ans != m:
            raise Exception('%r != %r for %r'%(ans, m, raw))
            raise Exception('%r != %r for %r' % (ans, m, raw))
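
test_mbp_regex() is a self-check: it raises on the first sample whose reduction by PAGE_BREAK_PAT differs from the expected value and is silent otherwise. Illustrative invocation (not part of the commit):

if __name__ == '__main__':
    test_mbp_regex()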
@@ -1,13 +1,11 @@
import mimetypes
import re

from ebook_converter.ebooks.oeb.base import XPath, urlunquote
from ebook_converter.polyglot.binary import from_base64_bytes
from ebook_converter.polyglot.builtins import as_bytes


__license__ = 'GPL v3'
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'


class DataURL(object):

    def __call__(self, oeb, opts):
@@ -27,25 +25,29 @@ class DataURL(object):
                    continue
                if ';base64' in header:
                    data = re.sub(r'\s+', '', data)
                    from ebook_converter.polyglot.binary import from_base64_bytes
                    try:
                        data = from_base64_bytes(data)
                    except Exception:
                        self.log.error('Found invalid base64 encoded data URI, ignoring it')
                        self.log.error('Found invalid base64 encoded data '
                                       'URI, ignoring it')
                        continue
                else:
                    data = urlunquote(data)
                data = as_bytes(data)
                fmt = what(None, data)
                if not fmt:
                    self.log.warn('Image encoded as data URL has unknown format, ignoring')
                    self.log.warn('Image encoded as data URL has unknown '
                                  'format, ignoring')
                    continue
                img.set('src', item.relhref(self.convert_image_data_uri(data, fmt, oeb)))
                img.set('src',
                        item.relhref(self.convert_image_data_uri(data, fmt,
                                                                 oeb)))

    def convert_image_data_uri(self, data, fmt, oeb):
        self.log('Found image encoded as data URI converting it to normal image')
        from ebook_converter import guess_type
        item_id, item_href = oeb.manifest.generate('data-url-image', 'data-url-image.' + fmt)
        self.log('Found image encoded as data URI converting it to normal '
                 'image')
        item_id, item_href = oeb.manifest.generate('data-url-image',
                                                   'data-url-image.' + fmt)
        oeb.manifest.add(item_id, item_href,
                         mimetypes.guess_type(item_href)[0], data=data)
        return item_href
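
For context, the branch above handles src attributes of the form data:<mediatype>[;base64],<payload>. A minimal sketch of decoding one such value (not part of the commit; the header/data split itself happens earlier in __call__, outside this hunk):

import re
from ebook_converter.polyglot.binary import from_base64_bytes

src = 'data:image/png;base64,iVBORw0KGgo='
header, data = src[len('data:'):].split(',', 1)
if ';base64' in header:
    data = re.sub(r'\s+', '', data)  # strip embedded whitespace first
    data = from_base64_bytes(data)   # raises on invalid base64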
@@ -1,9 +1,11 @@
import mimetypes
import sys, os, re
from xml.sax.saxutils import escape
from string import Formatter
import os
import pkg_resources
import re
import string
import sys
import urllib.parse
from xml.sax import saxutils

from ebook_converter import constants as const
from ebook_converter import strftime
@@ -16,18 +18,14 @@ from ebook_converter.ebooks.chardet import strip_encoding_declarations
from ebook_converter.ebooks.metadata import fmt_sidx, rating_to_stars


__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

JACKET_XPATH = '//h:meta[@name="calibre-content" and @content="jacket"]'


class SafeFormatter(Formatter):
class SafeFormatter(string.Formatter):

    def get_value(self, *args, **kwargs):
        try:
            return Formatter.get_value(self, *args, **kwargs)
            return string.Formatter.get_value(self, *args, **kwargs)
        except KeyError:
            return ''
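
Illustrative usage of SafeFormatter (not part of the commit): missing fields render as empty strings instead of raising KeyError, which keeps jacket templates robust against absent metadata:

fmt = SafeFormatter()
fmt.format('{title} by {author}', title='Example')  # -> 'Example by '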
@@ -40,7 +38,7 @@ class Base(object):
        for img in path(item.data):
            if removed >= limit:
                break
            href  = item.abshref(img.get('src'))
            href = item.abshref(img.get('src'))
            image = self.oeb.manifest.hrefs.get(href)
            if image is None:
                href = urlnormalize(href)
@@ -68,7 +66,8 @@ class RemoveFirstImage(Base):
                raw = xml2text(body[0]).strip()
                imgs = XPath('//h:img|//svg:svg')(item.data)
                if not raw and not imgs:
                    self.log('Removing %s as it has no content'%item.href)
                    self.log('Removing %s as it has no content' %
                             item.href)
                    self.oeb.manifest.remove(item)
                    deleted_item = item
                    break
@@ -82,20 +81,20 @@ class RemoveFirstImage(Base):
                self.oeb.guide.remove_by_href(deleted_item.href)

    def __call__(self, oeb, opts, metadata):
        '''
        """
        Add metadata in jacket.xhtml if specified in opts
        If not specified, remove previous jacket instance
        '''
        """
        self.oeb, self.opts, self.log = oeb, opts, oeb.log
        if opts.remove_first_image:
            self.remove_first_image()


class Jacket(Base):
    '''
    Book jacket manipulation. Remove first image and insert comments at start of
    book.
    '''
    """
    Book jacket manipulation. Remove first image and insert comments at start
    of book.
    """

    def insert_metadata(self, mi):
        self.log('Inserting metadata into book...')
@@ -107,22 +106,24 @@ class Jacket(Base):

        try:
            comments = str(self.oeb.metadata.description[0])
        except:
        except Exception:
            comments = ''

        try:
            title = str(self.oeb.metadata.title[0])
        except:
        except Exception:
            title = 'Unknown'

        try:
            authors = list(map(str, self.oeb.metadata.creator))
        except:
        except Exception:
            authors = ['Unknown']

        root = render_jacket(mi, self.opts.output_profile,
                alt_title=title, alt_tags=tags, alt_authors=authors,
                alt_comments=comments, rescale_fonts=True)
                             alt_title=title, alt_tags=tags,
                             alt_authors=authors,
                             alt_comments=comments,
                             rescale_fonts=True)
        id, href = self.oeb.manifest.generate('calibre_jacket', 'jacket.xhtml')

        jacket = self.oeb.manifest.add(id, href, mimetypes.guess_type(href)[0],
@@ -132,7 +133,8 @@ class Jacket(Base):
        for img, path in referenced_images(root):
            self.oeb.log('Embedding referenced image %s into jacket' % path)
            ext = path.rpartition('.')[-1].lower()
            item_id, href = self.oeb.manifest.generate('jacket_image', 'jacket_img.'+ext)
            item_id, href = self.oeb.manifest.generate('jacket_image',
                                                       'jacket_img.' + ext)
            with open(path, 'rb') as f:
                item = self.oeb.manifest.add(
                    item_id, href, mimetypes.guess_type(href)[0],
@@ -149,10 +151,10 @@ class Jacket(Base):
                break

    def __call__(self, oeb, opts, metadata):
        '''
        """
        Add metadata in jacket.xhtml if specified in opts
        If not specified, remove previous jacket instance
        '''
        """
        self.oeb, self.opts, self.log = oeb, opts, oeb.log
        self.remove_existing_jacket()
        if opts.insert_metadata:
@@ -164,8 +166,8 @@ class Jacket(Base):
def get_rating(rating, rchar, e_rchar):
    ans = ''
    try:
        num = float(rating)/2
    except:
        num = float(rating) / 2
    except Exception:
        return ans
    num = max(0, num)
    num = min(num, 5)
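
Since the division by two suggests ratings are stored on a 0-10 scale, a worked sketch of the arithmetic above (not part of the commit): a stored rating of 7 clamps into the 0-5 star range as

num = float(7) / 2  # 3.5
num = max(0, num)   # still 3.5
num = min(num, 5)   # 3.5 stars out of 5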
@@ -180,25 +182,29 @@ class Series(str):

    def __new__(self, series, series_index):
        if series and series_index is not None:
            roman = '{1} of <em>{0}</em>'.format(
                escape(series), escape(fmt_sidx(series_index, use_roman=True)))
            combined = '{1} of <em>{0}</em>'.format(
                escape(series), escape(fmt_sidx(series_index,
                    use_roman=False)))
            _roman = saxutils.escape(fmt_sidx(series_index, use_roman=True))
            _no_roman = saxutils.escape(fmt_sidx(series_index,
                                                 use_roman=False))
            roman = '{1} of <em>{0}</em>'.format(saxutils.escape(series),
                                                 _roman)
            combined = '{1} of <em>{0}</em>'.format(saxutils.escape(series),
                                                    _no_roman)
        else:
            combined = roman = escape(series or u'')
            combined = roman = saxutils.escape(series or u'')
        s = str.__new__(self, combined)
        s.roman = roman
        s.name = escape(series or '')
        s.number = escape(fmt_sidx(series_index or 1.0, use_roman=False))
        s.roman_number = escape(fmt_sidx(series_index or 1.0, use_roman=True))
        s.name = saxutils.escape(series or '')
        s.number = saxutils.escape(fmt_sidx(series_index or 1.0,
                                            use_roman=False))
        s.roman_number = saxutils.escape(fmt_sidx(series_index or 1.0,
                                                  use_roman=True))
        return s


class Tags(str):

    def __new__(self, tags, output_profile):
        tags = [escape(x) for x in tags or ()]
        tags = [saxutils.escape(x) for x in tags or ()]
        t = str.__new__(self, ', '.join(tags))
        t.alphabetical = ', '.join(sorted(tags))
        t.tags_list = tags
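
Illustrative usage of Series (not part of the commit; exact output depends on fmt_sidx formatting):

s = Series('Discworld & Co', 3.0)
str(s)   # roughly: '3 of <em>Discworld &amp; Co</em>'
s.roman  # roughly: 'III of <em>Discworld &amp; Co</em>'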
@@ -233,9 +239,9 @@ def postprocess_jacket(root, output_profile, has_data):
    extract_class('cbj_kindle_banner_hr')


def render_jacket(mi, output_profile,
        alt_title='Unknown', alt_tags=[], alt_comments='',
        alt_publisher='', rescale_fonts=False, alt_authors=None):
def render_jacket(mi, output_profile, alt_title='Unknown', alt_tags=[],
                  alt_comments='', alt_publisher='', rescale_fonts=False,
                  alt_authors=None):
    with open(pkg_resources.resource_filename('ebook_converter',
                                              'data/jacket/stylesheet.css'),
              'rb') as fobj:
@@ -250,17 +256,20 @@ def render_jacket(mi, output_profile,

    try:
        title_str = alt_title if mi.is_null('title') else mi.title
    except:
    except Exception:
        title_str = 'Unknown'
    title_str = escape(title_str)
    title_str = saxutils.escape(title_str)
    title = '<span class="title">%s</span>' % title_str

    series = Series(mi.series, mi.series_index)
    try:
        publisher = mi.publisher if not mi.is_null('publisher') else alt_publisher
    except:
        if not mi.is_null('publisher'):
            publisher = mi.publisher
        else:
            publisher = alt_publisher
    except Exception:
        publisher = ''
    publisher = escape(publisher)
    publisher = saxutils.escape(publisher)

    try:
        if is_date_undefined(mi.pubdate):
@@ -268,10 +277,11 @@ def render_jacket(mi, output_profile,
        else:
            dt = as_local_time(mi.pubdate)
            pubdate = strftime('%Y', dt.timetuple())
    except:
    except Exception:
        pubdate = ''

    rating = get_rating(mi.rating, output_profile.ratings_char, output_profile.empty_ratings_char)
    rating = get_rating(mi.rating, output_profile.ratings_char,
                        output_profile.empty_ratings_char)

    tags = Tags((mi.tags if mi.tags else alt_tags), output_profile)

@@ -285,10 +295,10 @@ def render_jacket(mi, output_profile,
    mi.authors = list(alt_authors or ('Unknown',))
    try:
        author = mi.format_authors()
    except:
    except Exception:
        author = ''
    mi.authors = orig
    author = escape(author)
    author = saxutils.escape(author)
    has_data = {}

    def generate_html(comments):
@@ -301,7 +311,7 @@ def render_jacket(mi, output_profile,
            'publisher': publisher,
            'rating': rating,
            'rating_label': 'Rating',
            'searchable_tags': ' '.join(escape(t) + 'ttt'
            'searchable_tags': ' '.join(saxutils.escape(t) + 'ttt'
                                        for t in tags.tags_list),
            'series': series,
            'series_label': 'Series',
@@ -320,25 +330,30 @@ def render_jacket(mi, output_profile,
                if dt == 'series':
                    args[dkey] = Series(mi.get(key), mi.get(key + '_index'))
                elif dt == 'rating':
                    args[dkey] = rating_to_stars(mi.get(key), m.get('display', {}).get('allow_half_stars', False))
                    args[dkey] = rating_to_stars(mi.get(key),
                                                 m.get('display', {})
                                                 .get('allow_half_stars',
                                                      False))
                elif dt == 'comments':
                    val = val or ''
                    display = m.get('display', {})
                    ctype = display.get('interpret_as') or 'html'
                    if ctype == 'long-text':
                        val = '<pre style="white-space:pre-wrap">%s</pre>' % escape(val)
                        val = ('<pre style="white-space:pre-wrap">%s</pre>' %
                               saxutils.escape(val))
                    elif ctype == 'short-text':
                        val = '<span>%s</span>' % escape(val)
                        val = '<span>%s</span>' % saxutils.escape(val)
                    elif ctype == 'markdown':
                        val = markdown(val)
                    else:
                        val = comments_to_html(val)
                    args[dkey] = val
                else:
                    args[dkey] = escape(val)
                    args[dkey+'_label'] = escape(display_name)
                    args[dkey] = saxutils.escape(val)
                    args[dkey+'_label'] = saxutils.escape(display_name)
            except Exception:
                # if the val (custom column contents) is None, don't add to args
                # if the val (custom column contents) is None, don't add to
                # args
                pass

    if False:
@@ -371,10 +386,11 @@ def render_jacket(mi, output_profile,
    # the text in the book. That means that as long as the jacket uses
    # relative font sizes (em or %), the post conversion font size will be
    # the same as for text in the main book. So text with size x em will
    # be rescaled to the same value in both the jacket and the main content.
    # be rescaled to the same value in both the jacket and the main
    # content.
    #
    # We cannot use data-calibre-rescale 100 on the body tag as that will just
    # give the body tag a font size of 1em, which is useless.
    # We cannot use data-calibre-rescale 100 on the body tag as that will
    # just give the body tag a font size of 1em, which is useless.
    for body in root.xpath('//*[local-name()="body"]'):
        fw = body.makeelement(base.tag('xhtml', 'div'))
        fw.set('data-calibre-rescale', '100')