diff --git a/ebook_converter/ebooks/conversion/plugins/html_input.py b/ebook_converter/ebooks/conversion/plugins/html_input.py index 97c553f..76eab84 100644 --- a/ebook_converter/ebooks/conversion/plugins/html_input.py +++ b/ebook_converter/ebooks/conversion/plugins/html_input.py @@ -15,17 +15,18 @@ from ebook_converter.polyglot.builtins import as_unicode def sanitize_file_name(x): - ans = re.sub(r'\s+', ' ', re.sub(r'[?&=;#]', '_', ascii_filename(x))).strip().rstrip('.') + ans = re.sub(r'\s+', ' ', re.sub(r'[?&=;#]', '_', + ascii_filename(x))).strip().rstrip('.') ans, ext = ans.rpartition('.')[::2] return (ans.strip() + '.' + ext.strip()).rstrip('.') class HTMLInput(InputFormatPlugin): - name = 'HTML Input' - author = 'Kovid Goyal' + name = 'HTML Input' + author = 'Kovid Goyal' description = 'Convert HTML and OPF files to an OEB' - file_types = {'opf', 'html', 'htm', 'xhtml', 'xhtm', 'shtm', 'shtml'} + file_types = {'opf', 'html', 'htm', 'xhtml', 'xhtm', 'shtm', 'shtml'} commit_name = 'html_input' options = { diff --git a/ebook_converter/ebooks/conversion/plugins/htmlz_input.py b/ebook_converter/ebooks/conversion/plugins/htmlz_input.py index e8d7765..6f465f0 100644 --- a/ebook_converter/ebooks/conversion/plugins/htmlz_input.py +++ b/ebook_converter/ebooks/conversion/plugins/htmlz_input.py @@ -6,10 +6,10 @@ from ebook_converter.customize.conversion import InputFormatPlugin class HTMLZInput(InputFormatPlugin): - name = 'HTLZ Input' - author = 'John Schember' + name = 'HTLZ Input' + author = 'John Schember' description = 'Convert HTML files to HTML' - file_types = {'htmlz'} + file_types = {'htmlz'} commit_name = 'htmlz_input' def convert(self, stream, options, file_ext, log, @@ -36,13 +36,14 @@ class HTMLZInput(InputFormatPlugin): top_levels.append(x) # Try to find an index. file. 
for x in top_levels: - if x.lower() in (u'index.html', u'index.xhtml', u'index.htm'): + if x.lower() in ('index.html', 'index.xhtml', 'index.htm'): index = x break # Look for multiple HTML files in the archive. We look at the # top level files only as only they matter in HTMLZ. for x in top_levels: - if os.path.splitext(x)[1].lower() in (u'.html', u'.xhtml', u'.htm'): + if os.path.splitext(x)[1].lower() in ('.html', '.xhtml', + '.htm'): # Set index to the first HTML file found if it's not # called index. if not index: @@ -84,15 +85,14 @@ class HTMLZInput(InputFormatPlugin): c = 0 while os.path.exists(htmlfile): c += 1 - htmlfile = u'index%d.html'%c + htmlfile = u'index%d.html' % c with open(htmlfile, 'wb') as f: f.write(html.encode('utf-8')) odi = options.debug_pipeline options.debug_pipeline = None # Generate oeb from html conversion. with open(htmlfile, 'rb') as f: - oeb = html_input.convert(f, options, 'html', log, - {}) + oeb = html_input.convert(f, options, 'html', log, {}) options.debug_pipeline = odi os.remove(htmlfile) diff --git a/ebook_converter/ebooks/docx/to_html.py b/ebook_converter/ebooks/docx/to_html.py index d1c4ed8..dc79f2c 100644 --- a/ebook_converter/ebooks/docx/to_html.py +++ b/ebook_converter/ebooks/docx/to_html.py @@ -1,5 +1,11 @@ -import sys, os, re, math, errno, uuid, numbers -from collections import OrderedDict, defaultdict +import sys +import os +import re +import math +import errno +import uuid +import numbers +import collections import mimetypes from lxml import etree @@ -7,23 +13,24 @@ from lxml import html from lxml.html.builder import ( HTML, HEAD, TITLE, BODY, LINK, META, P, SPAN, BR, DIV, A, DT, DL, DD, H1) -from ebook_converter import guess_type -from ebook_converter.ebooks.docx.container import DOCX -from ebook_converter.ebooks.docx.names import XML, generate_anchor -from ebook_converter.ebooks.docx.styles import Styles, inherit, PageProperties -from ebook_converter.ebooks.docx.numbering import Numbering -from 
ebook_converter.ebooks.docx.fonts import Fonts, is_symbol_font, map_symbol_text -from ebook_converter.ebooks.docx.images import Images -from ebook_converter.ebooks.docx.tables import Tables -from ebook_converter.ebooks.docx.footnotes import Footnotes from ebook_converter.ebooks.docx.cleanup import cleanup_markup +from ebook_converter.ebooks.docx.container import DOCX +from ebook_converter.ebooks.docx.fields import Fields +from ebook_converter.ebooks.docx.fonts import Fonts +from ebook_converter.ebooks.docx.fonts import is_symbol_font +from ebook_converter.ebooks.docx.fonts import map_symbol_text +from ebook_converter.ebooks.docx.footnotes import Footnotes +from ebook_converter.ebooks.docx.images import Images +from ebook_converter.ebooks.docx.names import XML, generate_anchor +from ebook_converter.ebooks.docx.numbering import Numbering +from ebook_converter.ebooks.docx.settings import Settings +from ebook_converter.ebooks.docx.styles import Styles, inherit, PageProperties +from ebook_converter.ebooks.docx.tables import Tables from ebook_converter.ebooks.docx.theme import Theme from ebook_converter.ebooks.docx.toc import create_toc -from ebook_converter.ebooks.docx.fields import Fields -from ebook_converter.ebooks.docx.settings import Settings from ebook_converter.ebooks.metadata.opf2 import OPFCreator -from ebook_converter.utils.localization import canonicalize_lang, lang_as_iso639_1 - +from ebook_converter.utils.localization import canonicalize_lang +from ebook_converter.utils.localization import lang_as_iso639_1 NBSP = '\xa0' @@ -54,7 +61,9 @@ def html_lang(docx_lang): class Convert(object): - def __init__(self, path_or_stream, dest_dir=None, log=None, detect_cover=True, notes_text=None, notes_nopb=False, nosupsub=False): + def __init__(self, path_or_stream, dest_dir=None, log=None, + detect_cover=True, notes_text=None, notes_nopb=False, + nosupsub=False): self.docx = DOCX(path_or_stream, log=log) self.namespace = self.docx.namespace self.ms_pat = 
re.compile(r'\s{2,}') @@ -73,7 +82,7 @@ class Convert(object): self.fields = Fields(self.namespace) self.styles = Styles(self.namespace, self.tables) self.images = Images(self.namespace, self.log) - self.object_map = OrderedDict() + self.object_map = collections.OrderedDict() self.html = HTML( HEAD( META(charset='utf-8'), @@ -82,9 +91,9 @@ class Convert(object): ), self.body ) - self.html.text='\n\t' - self.html[0].text='\n\t\t' - self.html[0].tail='\n' + self.html.text = '\n\t' + self.html[0].text = '\n\t\t' + self.html[0].tail = '\n' for child in self.html[0]: child.tail = '\n\t\t' self.html[0][-1].tail = '\n\t' @@ -98,17 +107,18 @@ class Convert(object): def __call__(self): doc = self.docx.document - relationships_by_id, relationships_by_type = self.docx.document_relationships + (relationships_by_id, + relationships_by_type) = self.docx.document_relationships self.resolve_alternate_content(doc) self.fields(doc, self.log) self.read_styles(relationships_by_type) self.images(relationships_by_id) - self.layers = OrderedDict() + self.layers = collections.OrderedDict() self.framed = [[]] self.frame_map = {} self.framed_map = {} self.anchor_map = {} - self.link_map = defaultdict(list) + self.link_map = collections.defaultdict(list) self.link_source_map = {} self.toc_anchor = None self.block_runs = [] @@ -142,7 +152,8 @@ class Convert(object): dl = DL(id=anchor) dl.set('class', 'footnote') self.body.append(dl) - dl.append(DT('[', A('←' + text, href='#back_%s' % anchor, title=text))) + dl.append(DT('[', A('←' + text, href='#back_%s' % anchor, + title=text))) dl[-1][0].tail = ']' dl.append(DD()) paras = [] @@ -159,7 +170,8 @@ class Convert(object): self.mark_block_runs(paras) for p, wp in self.object_map.items(): - if len(p) > 0 and not p.text and len(p[0]) > 0 and not p[0].text and p[0][0].get('class', None) == 'tab': + if (len(p) > 0 and not p.text and len(p[0]) > 0 and + not p[0].text and p[0][0].get('class', None) == 'tab'): # Paragraph uses tabs for indentation, 
convert to text-indent parent = p[0] tabs = [] @@ -172,7 +184,9 @@ class Convert(object): break indent = len(tabs) * self.settings.default_tab_stop style = self.styles.resolve(wp) - if style.text_indent is inherit or (hasattr(style.text_indent, 'endswith') and style.text_indent.endswith('pt')): + if (style.text_indent is inherit or + (hasattr(style.text_indent, 'endswith') and + style.text_indent.endswith('pt'))): if style.text_indent is not inherit: indent = float(style.text_indent[:-2]) + indent style.text_indent = '%.3gpt' % indent @@ -197,7 +211,8 @@ class Convert(object): except (TypeError, ValueError): lvl = 0 numbered.append((html_obj, num_id, lvl)) - self.numbering.apply_markup(numbered, self.body, self.styles, self.object_map, self.images) + self.numbering.apply_markup(numbered, self.body, self.styles, + self.object_map, self.images) self.apply_frames() if len(self.body) > 0: @@ -232,13 +247,15 @@ class Convert(object): self.fields.polish_markup(self.object_map) self.log.debug('Cleaning up redundant markup generated by Word') - self.cover_image = cleanup_markup(self.log, self.html, self.styles, self.dest_dir, self.detect_cover, self.namespace.XPath) + self.cover_image = cleanup_markup(self.log, self.html, self.styles, + self.dest_dir, self.detect_cover, + self.namespace.XPath) return self.write(doc) def read_page_properties(self, doc): current = [] - self.page_map = OrderedDict() + self.page_map = collections.OrderedDict() self.section_starts = [] for p in self.namespace.descendants(doc, 'w:p', 'w:tbl'): @@ -267,7 +284,8 @@ class Convert(object): def resolve_alternate_content(self, doc): # For proprietary extensions in Word documents use the fallback, spec # compliant form - # See https://wiki.openoffice.org/wiki/OOXML/Markup_Compatibility_and_Extensibility + # See https://wiki.openoffice.org/wiki/ + # OOXML/Markup_Compatibility_and_Extensibility for ac in self.namespace.descendants(doc, 'mc:AlternateContent'): choices = 
self.namespace.XPath('./mc:Choice')(ac) fallbacks = self.namespace.XPath('./mc:Fallback')(ac) @@ -284,7 +302,8 @@ class Convert(object): cname[-1] = defname if self.docx.exists('/'.join(cname)): name = name - if name and name.startswith('word/word') and not self.docx.exists(name): + if (name and name.startswith('word/word') and + not self.docx.exists(name)): name = name.partition('/')[2] return name @@ -327,7 +346,8 @@ class Convert(object): self.log.warn('Endnotes %s do not exist' % enname) else: enrel = self.docx.get_relationships(enname) - footnotes(etree.fromstring(foraw) if foraw else None, forel, etree.fromstring(enraw) if enraw else None, enrel) + footnotes(etree.fromstring(foraw) if foraw else None, forel, + etree.fromstring(enraw) if enraw else None, enrel) if fname is not None: embed_relationships = self.docx.get_relationships(fname)[0] @@ -336,7 +356,8 @@ class Convert(object): except KeyError: self.log.warn('Fonts table %s does not exist' % fname) else: - fonts(etree.fromstring(raw), embed_relationships, self.docx, self.dest_dir) + fonts(etree.fromstring(raw), embed_relationships, self.docx, + self.dest_dir) if tname is not None: try: @@ -364,16 +385,20 @@ class Convert(object): except KeyError: self.log.warn('Numbering styles %s do not exist' % nname) else: - numbering(etree.fromstring(raw), self.styles, self.docx.get_relationships(nname)[0]) + numbering(etree.fromstring(raw), self.styles, + self.docx.get_relationships(nname)[0]) self.styles.resolve_numbering(numbering) def write(self, doc): - toc = create_toc(doc, self.body, self.resolved_link_map, self.styles, self.object_map, self.log, self.namespace) - raw = html.tostring(self.html, encoding='utf-8', doctype='') + toc = create_toc(doc, self.body, self.resolved_link_map, self.styles, + self.object_map, self.log, self.namespace) + raw = html.tostring(self.html, encoding='utf-8', + doctype='') with open(os.path.join(self.dest_dir, 'index.html'), 'wb') as f: f.write(raw) - css = 
self.styles.generate_css(self.dest_dir, self.docx, self.notes_nopb, self.nosupsub) + css = self.styles.generate_css(self.dest_dir, self.docx, + self.notes_nopb, self.nosupsub) if css: with open(os.path.join(self.dest_dir, 'docx.css'), 'wb') as f: f.write(css.encode('utf-8')) @@ -394,23 +419,29 @@ class Convert(object): title='Table of Contents', type='toc')) toc_file = os.path.join(self.dest_dir, 'toc.ncx') - with open(os.path.join(self.dest_dir, 'metadata.opf'), 'wb') as of, open(toc_file, 'wb') as ncx: + with open(os.path.join(self.dest_dir, + 'metadata.opf'), 'wb') as of, open(toc_file, + 'wb') as ncx: opf.render(of, ncx, 'toc.ncx', process_guide=process_guide) if os.path.getsize(toc_file) == 0: os.remove(toc_file) return os.path.join(self.dest_dir, 'metadata.opf') def read_block_anchors(self, doc): - doc_anchors = frozenset(self.namespace.XPath('./w:body/w:bookmarkStart[@w:name]')(doc)) + doc_anchors = frozenset(self.namespace.XPath('./w:body/w:bookmarkStart' + '[@w:name]')(doc)) if doc_anchors: current_bm = set() - rmap = {v:k for k, v in self.object_map.items()} - for p in self.namespace.descendants(doc, 'w:p', 'w:bookmarkStart[@w:name]'): + rmap = {v: k for k, v in self.object_map.items()} + for p in self.namespace.descendants(doc, 'w:p', + 'w:bookmarkStart[@w:name]'): if p.tag.endswith('}p'): if current_bm and p in rmap: para = rmap[p] if 'id' not in para.attrib: - para.set('id', generate_anchor(next(iter(current_bm)), frozenset(self.anchor_map.values()))) + _bm = next(iter(current_bm)) + _am = frozenset(self.anchor_map.values()) + para.set('id', generate_anchor(_bm, _am)) for name in current_bm: self.anchor_map[name] = para.get('id') current_bm = set() @@ -442,13 +473,15 @@ class Convert(object): except AttributeError: break - for x in self.namespace.descendants(p, 'w:r', 'w:bookmarkStart', 'w:hyperlink', 'w:instrText'): + for x in self.namespace.descendants(p, 'w:r', 'w:bookmarkStart', + 'w:hyperlink', 'w:instrText'): if p_parent(x) is not p: continue if 
x.tag.endswith('}r'): span = self.convert_run(x) if current_anchor is not None: - (dest if len(dest) == 0 else span).set('id', current_anchor) + (dest if len(dest) == 0 else span).set('id', + current_anchor) current_anchor = None if current_hyperlink is not None: try: @@ -462,11 +495,14 @@ class Convert(object): self.layers[p].append(x) elif x.tag.endswith('}bookmarkStart'): anchor = self.namespace.get(x, 'w:name') - if anchor and anchor not in self.anchor_map and anchor != '_GoBack': + if (anchor and anchor not in self.anchor_map and + anchor != '_GoBack'): # _GoBack is a special bookmark inserted by Word 2010 for # the return to previous edit feature, we ignore it old_anchor = current_anchor - self.anchor_map[anchor] = current_anchor = generate_anchor(anchor, frozenset(self.anchor_map.values())) + current_anchor = generate_anchor( + anchor, frozenset(self .anchor_map.values())) + self.anchor_map[anchor] = current_anchor if old_anchor is not None: # The previous anchor was not applied to any element for a, t in tuple(self.anchor_map.items()): @@ -474,10 +510,13 @@ class Convert(object): self.anchor_map[a] = current_anchor elif x.tag.endswith('}hyperlink'): current_hyperlink = x - elif x.tag.endswith('}instrText') and x.text and x.text.strip().startswith('TOC '): + elif (x.tag.endswith('}instrText') and x.text and + x.text.strip().startswith('TOC ')): old_anchor = current_anchor anchor = str(uuid.uuid4()) - self.anchor_map[anchor] = current_anchor = generate_anchor('toc', frozenset(self.anchor_map.values())) + current_anchor = generate_anchor( + 'toc', frozenset(self.anchor_map.values())) + self.anchor_map[anchor] = current_anchor self.toc_anchor = current_anchor if old_anchor is not None: # The previous anchor was not applied to any element @@ -489,7 +528,8 @@ class Convert(object): dest.set('id', current_anchor) current_anchor = None - m = re.match(r'heading\s+(\d+)$', style.style_name or '', re.IGNORECASE) + m = re.match(r'heading\s+(\d+)$', style.style_name or 
'', + re.IGNORECASE) if m is not None: n = min(6, max(1, int(m.group(1)))) dest.tag = 'h%d' % n @@ -533,7 +573,8 @@ class Convert(object): if len(dest) > 0 and not dest[-1].tail: if dest[-1].tag == 'br': dest[-1].tail = NBSP - elif len(dest[-1]) > 0 and dest[-1][-1].tag == 'br' and not dest[-1][-1].tail: + elif (len(dest[-1]) > 0 and dest[-1][-1].tag == 'br' and + not dest[-1][-1].tail): dest[-1][-1].tail = NBSP return dest @@ -578,12 +619,12 @@ class Convert(object): if anchor and anchor in self.anchor_map: span.set('href', '#' + self.anchor_map[anchor]) continue - self.log.warn('Hyperlink with unknown target (rid=%s, anchor=%s), ignoring' % - (rid, anchor)) + self.log.warn('Hyperlink with unknown target (rid=%s, anchor=%s), ' + 'ignoring' % (rid, anchor)) # hrefs that point nowhere give epubcheck a hernia. The element # should be styled explicitly by Word anyway. # span.set('href', '#') - rmap = {v:k for k, v in self.object_map.items()} + rmap = {v: k for k, v in self.object_map.items()} for hyperlink, runs in self.fields.hyperlink_fields: spans = [rmap[r] for r in runs if r in rmap] if not spans: @@ -604,7 +645,8 @@ class Convert(object): if anchor in self.anchor_map: span.set('href', '#' + self.anchor_map[anchor]) continue - self.log.warn('Hyperlink field with unknown anchor: %s' % anchor) + self.log.warn('Hyperlink field with unknown anchor: %s' % + anchor) else: if url in self.anchor_map: span.set('href', '#' + self.anchor_map[url]) @@ -652,7 +694,8 @@ class Convert(object): # actually needs it, i.e. if it has more than one # consecutive space or it has newlines or tabs. 
multi_spaces = self.ms_pat.search(ctext) is not None - preserve = multi_spaces or self.ws_pat.search(ctext) is not None + preserve = (multi_spaces or + self.ws_pat.search(ctext) is not None) if preserve: text.add_elem(SPAN(ctext, style="white-space:pre-wrap")) ans.append(text.elem) @@ -668,24 +711,30 @@ class Convert(object): else: clear = child.get('clear', None) if clear in {'all', 'left', 'right'}: - br = BR(style='clear:%s'%('both' if clear == 'all' else clear)) + br = BR(style='clear:%s' % ('both' if clear == 'all' + else clear)) else: br = BR() text.add_elem(br) ans.append(text.elem) - elif self.namespace.is_tag(child, 'w:drawing') or self.namespace.is_tag(child, 'w:pict'): - for img in self.images.to_html(child, self.current_page, self.docx, self.dest_dir): + elif (self.namespace.is_tag(child, 'w:drawing') or + self.namespace.is_tag(child, 'w:pict')): + for img in self.images.to_html(child, self.current_page, + self.docx, self.dest_dir): text.add_elem(img) ans.append(text.elem) - elif self.namespace.is_tag(child, 'w:footnoteReference') or self.namespace.is_tag(child, 'w:endnoteReference'): + elif (self.namespace.is_tag(child, 'w:footnoteReference') or + self.namespace.is_tag(child, 'w:endnoteReference')): anchor, name = self.footnotes.get_ref(child) if anchor and name: - l = A(name, id='back_%s' % anchor, href='#' + anchor, title=name) - l.set('class', 'noteref') - text.add_elem(l) + _l = A(name, id='back_%s' % anchor, href='#' + anchor, + title=name) + _l.set('class', 'noteref') + text.add_elem(_l) ans.append(text.elem) elif self.namespace.is_tag(child, 'w:tab'): - spaces = int(math.ceil((self.settings.default_tab_stop / 36) * 6)) + spaces = int(math.ceil((self.settings.default_tab_stop / 36) * + 6)) text.add_elem(SPAN(NBSP * spaces)) ans.append(text.elem) ans[-1].set('class', 'tab') @@ -699,7 +748,8 @@ class Convert(object): style = self.styles.resolve_run(run) if style.vert_align in {'superscript', 'subscript'}: if ans.text or len(ans): - 
ans.set('data-docx-vert', 'sup' if style.vert_align == 'superscript' else 'sub') + ans.set('data-docx-vert', + 'sup' if style.vert_align == 'superscript' else 'sub') if style.lang is not inherit: lang = html_lang(style.lang) if lang is not None and lang != self.doc_lang: @@ -738,12 +788,14 @@ class Convert(object): idx = parent.index(paras[0]) frame = DIV(*paras) parent.insert(idx, frame) - self.framed_map[frame] = css = style.css(self.page_map[self.object_map[paras[0]]]) + self.framed_map[frame] = css = style.css( + self.page_map[self.object_map[paras[0]]]) self.styles.register(css, 'frame') if not self.block_runs: return - rmap = {v:k for k, v in self.object_map.items()} + + rmap = {v: k for k, v in self.object_map.items()} for border_style, blocks in self.block_runs: paras = tuple(rmap[p] for p in blocks) for p in paras: @@ -796,17 +848,20 @@ class Convert(object): else: border_style = style.clone_border_styles() if has_visible_border: - border_style.margin_top, style.margin_top = style.margin_top, inherit + border_style.margin_top = style.margin_top + style.margin_top = inherit if p is not run[-1]: style.padding_bottom = 0 else: if has_visible_border: - border_style.margin_bottom, style.margin_bottom = style.margin_bottom, inherit + border_style.margin_bottom = style.margin_bottom + style.margin_bottom = inherit style.clear_borders() if p is not run[-1]: style.apply_between_border() if has_visible_border: - border_style.margin_left, border_style.margin_right = max_left,max_right + border_style.margin_left = max_left + border_style.margin_right = max_right self.block_runs.append((border_style, run)) run = [] diff --git a/ebook_converter/ebooks/docx/writer/container.py b/ebook_converter/ebooks/docx/writer/container.py index 9af3b0a..47042b4 100644 --- a/ebook_converter/ebooks/docx/writer/container.py +++ b/ebook_converter/ebooks/docx/writer/container.py @@ -1,5 +1,6 @@ import mimetypes -import textwrap, os +import os +import textwrap from lxml import etree from
lxml.builder import ElementMaker @@ -9,22 +10,48 @@ from ebook_converter.ebooks.docx.names import DOCXNamespace from ebook_converter.ebooks.metadata import authors_to_string from ebook_converter.ebooks.pdf.render.common import PAPER_SIZES from ebook_converter.utils.date import utcnow -from ebook_converter.utils.localization import canonicalize_lang, lang_as_iso639_1 +from ebook_converter.utils.localization import canonicalize_lang +from ebook_converter.utils.localization import lang_as_iso639_1 from ebook_converter.utils.zipfile import ZipFile +WORD_TYPES = {"/word/footnotes.xml": "application/vnd.openxmlformats-" + "officedocument.wordprocessingml.footnotes+xml", + "/word/document.xml": "application/vnd.openxmlformats-" + "officedocument.wordprocessingml.document.main+xml", + "/word/numbering.xml": "application/vnd.openxmlformats-" + "officedocument.wordprocessingml.numbering+xml", + "/word/styles.xml": "application/vnd.openxmlformats-" + "officedocument.wordprocessingml.styles+xml", + "/word/endnotes.xml": "application/vnd.openxmlformats-" + "officedocument.wordprocessingml.endnotes+xml", + "/word/settings.xml": "application/vnd.openxmlformats-" + "officedocument.wordprocessingml.settings+xml", + "/word/theme/theme1.xml": "application/vnd.openxmlformats-" + "officedocument.theme+xml", + "/word/fontTable.xml": "application/vnd.openxmlformats-" + "officedocument.wordprocessingml.fontTable+xml", + "/word/webSettings.xml": "application/vnd.openxmlformats-" + "officedocument.wordprocessingml.webSettings+xml", + "/docProps/core.xml": "application/vnd.openxmlformats-package." 
+ "core-properties+xml", + "/docProps/app.xml": "application/vnd.openxmlformats-" + "officedocument.extended-properties+xml"} + + def xml2str(root, pretty_print=False, with_tail=False): if hasattr(etree, 'cleanup_namespaces'): etree.cleanup_namespaces(root) ans = etree.tostring(root, encoding='utf-8', xml_declaration=True, - pretty_print=pretty_print, with_tail=with_tail) + pretty_print=pretty_print, with_tail=with_tail) return ans def page_size(opts): width, height = PAPER_SIZES[opts.docx_page_size] if opts.docx_custom_page_size is not None: - width, height = map(float, opts.docx_custom_page_size.partition('x')[0::2]) + width, height = map(float, + opts.docx_custom_page_size.partition('x')[0::2]) return width, height @@ -47,7 +74,9 @@ def create_skeleton(opts, namespaces=None): def w(x): return '{%s}%s' % (namespaces['w'], x) - dn = {k:v for k, v in namespaces.items() if k in {'w', 'r', 'm', 've', 'o', 'wp', 'w10', 'wne', 'a', 'pic'}} + dn = {k: v for k, v in namespaces.items() if k in {'w', 'r', 'm', 've', + 'o', 'wp', 'w10', 'wne', + 'a', 'pic'}} E = ElementMaker(namespace=dn['w'], nsmap=dn) doc = E.document() body = E.body() @@ -59,27 +88,32 @@ def create_skeleton(opts, namespaces=None): val = page_margin(opts, which) return w(which), str(int(val * 20)) body.append(E.sectPr( - E.pgSz(**{w('w'):str(width), w('h'):str(height)}), + E.pgSz(**{w('w'): str(width), w('h'): str(height)}), E.pgMar(**dict(map(margin, 'left top right bottom'.split()))), - E.cols(**{w('space'):'720'}), - E.docGrid(**{w('linePitch'):"360"}), + E.cols(**{w('space'): '720'}), + E.docGrid(**{w('linePitch'): "360"}), )) - dn = {k:v for k, v in namespaces.items() if k in tuple('wra') + ('wp',)} + dn = {k: v for k, v in namespaces.items() if k in tuple('wra') + ('wp',)} E = ElementMaker(namespace=dn['w'], nsmap=dn) styles = E.styles( E.docDefaults( E.rPrDefault( E.rPr( - E.rFonts(**{w('asciiTheme'):"minorHAnsi", w('eastAsiaTheme'):"minorEastAsia", w('hAnsiTheme'):"minorHAnsi", 
w('cstheme'):"minorBidi"}), - E.sz(**{w('val'):'22'}), - E.szCs(**{w('val'):'22'}), - E.lang(**{w('val'):'en-US', w('eastAsia'):"en-US", w('bidi'):"ar-SA"}) + E.rFonts(**{w('asciiTheme'): "minorHAnsi", + w('eastAsiaTheme'): "minorEastAsia", + w('hAnsiTheme'): "minorHAnsi", + w('cstheme'): "minorBidi"}), + E.sz(**{w('val'): '22'}), + E.szCs(**{w('val'): '22'}), + E.lang(**{w('val'): 'en-US', w('eastAsia'): "en-US", + w('bidi'): "ar-SA"}) ) ), E.pPrDefault( E.pPr( - E.spacing(**{w('after'):"0", w('line'):"276", w('lineRule'):"auto"}) + E.spacing(**{w('after'): "0", w('line'): "276", + w('lineRule'): "auto"}) ) ) ) @@ -103,8 +137,8 @@ def update_doc_props(root, mi, namespace): if mi.comments: setm('description', mi.comments) if mi.languages: - l = canonicalize_lang(mi.languages[0]) - setm('language', lang_as_iso639_1(l) or l) + _l = canonicalize_lang(mi.languages[0]) + setm('language', lang_as_iso639_1(_l) or _l) class DocumentRelationships(object): @@ -115,8 +149,7 @@ class DocumentRelationships(object): for typ, target in {namespace.names['STYLES']: 'styles.xml', namespace.names['NUMBERING']: 'numbering.xml', namespace.names['WEB_SETTINGS']: 'webSettings.xml', - namespace.names['FONTS']: 'fontTable.xml', - }.items(): + namespace.names['FONTS']: 'fontTable.xml'}.items(): self.add_relationship(target, typ) def get_relationship_id(self, target, rtype, target_mode=None): @@ -134,7 +167,8 @@ class DocumentRelationships(object): def serialize(self): namespaces = self.namespace.namespaces - E = ElementMaker(namespace=namespaces['pr'], nsmap={None:namespaces['pr']}) + E = ElementMaker(namespace=namespaces['pr'], + nsmap={None: namespaces['pr']}) relationships = E.Relationships() for (target, rtype, target_mode), rid in self.rmap.items(): r = E.Relationship(Id=rid, Type=rtype, Target=target) @@ -151,9 +185,12 @@ class DOCX(object): namespaces = self.namespace.namespaces self.opts, self.log = opts, log self.document_relationships = DocumentRelationships(self.namespace) - 
self.font_table = etree.Element('{%s}fonts' % namespaces['w'], nsmap={k:namespaces[k] for k in 'wr'}) - self.numbering = etree.Element('{%s}numbering' % namespaces['w'], nsmap={k:namespaces[k] for k in 'wr'}) - E = ElementMaker(namespace=namespaces['pr'], nsmap={None:namespaces['pr']}) + self.font_table = etree.Element('{%s}fonts' % namespaces['w'], + nsmap={k: namespaces[k] for k in 'wr'}) + self.numbering = etree.Element('{%s}numbering' % namespaces['w'], + nsmap={k: namespaces[k] for k in 'wr'}) + E = ElementMaker(namespace=namespaces['pr'], + nsmap={None: namespaces['pr']}) self.embedded_fonts = E.Relationships() self.fonts = {} self.images = {} @@ -161,21 +198,10 @@ class DOCX(object): # Boilerplate {{{ @property def contenttypes(self): - E = ElementMaker(namespace=self.namespace.namespaces['ct'], nsmap={None:self.namespace.namespaces['ct']}) + E = ElementMaker(namespace=self.namespace.namespaces['ct'], + nsmap={None: self.namespace.namespaces['ct']}) types = E.Types() - for partname, mt in { - "/word/footnotes.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.footnotes+xml", - "/word/document.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml", - "/word/numbering.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.numbering+xml", - "/word/styles.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.styles+xml", - "/word/endnotes.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.endnotes+xml", - "/word/settings.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.settings+xml", - "/word/theme/theme1.xml": "application/vnd.openxmlformats-officedocument.theme+xml", - "/word/fontTable.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.fontTable+xml", - "/word/webSettings.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.webSettings+xml", - "/docProps/core.xml": 
"application/vnd.openxmlformats-package.core-properties+xml", - "/docProps/app.xml": "application/vnd.openxmlformats-officedocument.extended-properties+xml", - }.items(): + for partname, mt in WORD_TYPES.items(): types.append(E.Override(PartName=partname, ContentType=mt)) added = {'png', 'gif', 'jpeg', 'jpg', 'svg', 'xml'} for ext in added: @@ -199,7 +225,8 @@ class DOCX(object): @property def appproperties(self): - E = ElementMaker(namespace=self.namespace.namespaces['ep'], nsmap={None:self.namespace.namespaces['ep']}) + E = ElementMaker(namespace=self.namespace.namespaces['ep'], + nsmap={None: self.namespace.namespaces['ep']}) props = E.Properties( E.Application(__appname__), E.AppVersion('%02d.%04d' % numeric_version[:2]), @@ -216,16 +243,17 @@ class DOCX(object): @property def containerrels(self): return textwrap.dedent('''\ - - - - - - '''.format(**self.namespace.names)).encode('utf-8') + + + + + +'''.format(**self.namespace.names)).encode('utf-8') # noqa @property def websettings(self): - E = ElementMaker(namespace=self.namespace.namespaces['w'], nsmap={'w':self.namespace.namespaces['w']}) + E = ElementMaker(namespace=self.namespace.namespaces['w'], + nsmap={'w': self.namespace.namespaces['w']}) ws = E.webSettings( E.optimizeForBrowser, E.allowPNG, E.doNotSaveAsSingleFile) return xml2str(ws) @@ -234,11 +262,15 @@ class DOCX(object): def convert_metadata(self, mi): namespaces = self.namespace.namespaces - E = ElementMaker(namespace=namespaces['cp'], nsmap={x:namespaces[x] for x in 'cp dc dcterms xsi'.split()}) + E = ElementMaker(namespace=namespaces['cp'], + nsmap={x: namespaces[x] + for x in 'cp dc dcterms xsi'.split()}) cp = E.coreProperties(E.revision("1"), E.lastModifiedBy('calibre')) ts = utcnow().isoformat('T').rpartition('.')[0] + 'Z' for x in 'created modified'.split(): - x = cp.makeelement('{%s}%s' % (namespaces['dcterms'], x), **{'{%s}type' % namespaces['xsi']:'dcterms:W3CDTF'}) + x = cp.makeelement('{%s}%s' % (namespaces['dcterms'], x), + 
**{'{%s}type' % + namespaces['xsi']: 'dcterms:W3CDTF'}) x.text = ts cp.append(x) self.mi = mi @@ -261,8 +293,10 @@ class DOCX(object): zf.writestr('word/styles.xml', xml2str(self.styles)) zf.writestr('word/numbering.xml', xml2str(self.numbering)) zf.writestr('word/fontTable.xml', xml2str(self.font_table)) - zf.writestr('word/_rels/document.xml.rels', self.document_relationships.serialize()) - zf.writestr('word/_rels/fontTable.xml.rels', xml2str(self.embedded_fonts)) + zf.writestr('word/_rels/document.xml.rels', + self.document_relationships.serialize()) + zf.writestr('word/_rels/fontTable.xml.rels', + xml2str(self.embedded_fonts)) for fname, data_getter in self.images.items(): zf.writestr(fname, data_getter()) for fname, data in self.fonts.items(): diff --git a/ebook_converter/ebooks/metadata/__init__.py b/ebook_converter/ebooks/metadata/__init__.py index 3196056..918b764 100644 --- a/ebook_converter/ebooks/metadata/__init__.py +++ b/ebook_converter/ebooks/metadata/__init__.py @@ -18,7 +18,7 @@ try: _author_pat = re.compile(tweaks['authors_split_regex']) except Exception: prints('Author split regexp:', tweaks['authors_split_regex'], - 'is invalid, using default') + 'is invalid, using default') _author_pat = re.compile(r'(?i),?\s+(and|with)\s+') @@ -76,7 +76,8 @@ def author_to_author_sort(author, method=None): if method == 'copy': return author - prefixes = {force_unicode(y).lower() for y in tweaks['author_name_prefixes']} + prefixes = {force_unicode(y).lower() + for y in tweaks['author_name_prefixes']} prefixes |= {y+'.' for y in prefixes} while True: if not tokens: @@ -87,7 +88,8 @@ def author_to_author_sort(author, method=None): else: break - suffixes = {force_unicode(y).lower() for y in tweaks['author_name_suffixes']} + suffixes = {force_unicode(y).lower() + for y in tweaks['author_name_suffixes']} suffixes |= {y+'.' 
for y in suffixes} suffix = '' @@ -144,7 +146,7 @@ def get_title_sort_pat(lang=None): except: ans = frozenset((r'A\s+', r'The\s+', r'An\s+')) ans = '|'.join(ans) - ans = '^(%s)'%ans + ans = '^(%s)' % ans try: ans = re.compile(ans, re.IGNORECASE) except: @@ -154,7 +156,7 @@ def get_title_sort_pat(lang=None): _ignore_starts = '\'"'+''.join(chr(x) for x in - list(range(0x2018, 0x201e))+[0x2032, 0x2033]) + list(range(0x2018, 0x201e))+[0x2032, 0x2033]) def title_sort(title, order=None, lang=None): diff --git a/ebook_converter/ebooks/metadata/fb2.py b/ebook_converter/ebooks/metadata/fb2.py index 6e72e42..b32e7f8 100644 --- a/ebook_converter/ebooks/metadata/fb2.py +++ b/ebook_converter/ebooks/metadata/fb2.py @@ -12,8 +12,7 @@ from lxml import etree from ebook_converter.utils.date import parse_only_date from ebook_converter.utils.img import save_cover_data_to from ebook_converter.utils.imghdr import identify -from ebook_converter import guess_type, guess_all_extensions, prints, \ - force_unicode +from ebook_converter import guess_all_extensions, prints, force_unicode from ebook_converter.ebooks.metadata import MetaInformation, check_isbn from ebook_converter.ebooks.chardet import xml_to_unicode from ebook_converter.polyglot.binary import as_base64_unicode diff --git a/ebook_converter/ebooks/metadata/opf2.py b/ebook_converter/ebooks/metadata/opf2.py index 64b9fa7..a907ad8 100644 --- a/ebook_converter/ebooks/metadata/opf2.py +++ b/ebook_converter/ebooks/metadata/opf2.py @@ -10,11 +10,11 @@ import mimetypes import os import re import sys +import textwrap +import traceback import unittest import urllib.parse import uuid -import traceback -import textwrap from lxml import etree from lxml.builder import ElementMaker @@ -32,7 +32,7 @@ from ebook_converter.ebooks.metadata import string_to_authors, \ from ebook_converter.ebooks.metadata.book.base import Metadata from ebook_converter.utils.date import parse_date, isoformat from ebook_converter.utils.localization import get_lang, 
canonicalize_lang -from ebook_converter import prints, guess_type +from ebook_converter import prints from ebook_converter.utils.cleantext import clean_ascii_chars, clean_xml_chars from ebook_converter.utils.config import tweaks from ebook_converter.polyglot.urllib import unquote @@ -1807,8 +1807,7 @@ def test_m2o(): class OPFTest(unittest.TestCase): def setUp(self): - self.stream = io.BytesIO( -b'''\ + self.stream = io.BytesIO(b'''\ @@ -1827,8 +1826,7 @@ b'''\ -''' - ) +''') self.opf = OPF(self.stream, os.getcwd()) def testReading(self, opf=None): diff --git a/ebook_converter/ebooks/mobi/reader/mobi6.py b/ebook_converter/ebooks/mobi/reader/mobi6.py index 6087a67..014b846 100644 --- a/ebook_converter/ebooks/mobi/reader/mobi6.py +++ b/ebook_converter/ebooks/mobi/reader/mobi6.py @@ -1,10 +1,15 @@ -import shutil, os, re, struct, textwrap, io +import io import logging import mimetypes +import os +import re +import shutil +import struct +import textwrap from lxml import html, etree -from ebook_converter import xml_entity_to_unicode, entity_to_unicode, guess_type +from ebook_converter import xml_entity_to_unicode, entity_to_unicode from ebook_converter.utils.cleantext import clean_ascii_chars, clean_xml_chars from ebook_converter.ebooks import DRMError, unit_convert from ebook_converter.ebooks.chardet import strip_encoding_declarations @@ -15,15 +20,11 @@ from ebook_converter.ebooks.metadata import MetaInformation from ebook_converter.ebooks.metadata.opf2 import OPFCreator, OPF from ebook_converter.ebooks.metadata.toc import TOC from ebook_converter.ebooks.mobi.reader.headers import BookHeader -from ebook_converter.utils.img import save_cover_data_to, gif_data_to_png_data, AnimatedGIF +from ebook_converter.utils.img import save_cover_data_to, gif_data_to_png_data +from ebook_converter.utils.img import AnimatedGIF from ebook_converter.utils.imghdr import what -__license__ = 'GPL v3' -__copyright__ = '2012, Kovid Goyal ' -__docformat__ = 'restructuredtext en' - - class 
TopazError(ValueError): pass @@ -38,13 +39,14 @@ class KFXError(ValueError): class MobiReader(object): - PAGE_BREAK_PAT = re.compile( - r'<\s*/{0,1}\s*mbp:pagebreak((?:\s+[^/>]*){0,1})/{0,1}\s*>\s*(?:<\s*/{0,1}\s*mbp:pagebreak\s*/{0,1}\s*>)*', - re.IGNORECASE) + PAGE_BREAK_PAT = re.compile(r'<\s*/{0,1}\s*mbp:pagebreak((?:\s+[^/>]*)' + r'{0,1})/{0,1}\s*>\s*(?:<\s*/{0,1}' + r'\s*mbp:pagebreak\s*/{0,1}\s*>)*', + re.IGNORECASE) IMAGE_ATTRS = ('lowrecindex', 'recindex', 'hirecindex') def __init__(self, filename_or_stream, log, user_encoding=None, debug=None, - try_extra_data_fix=False): + try_extra_data_fix=False): self.log = log self.debug = debug self.embedded_mi = None @@ -83,8 +85,8 @@ class MobiReader(object): if raw.startswith(b'\xeaDRMION\xee'): raise KFXError() - self.header = raw[0:72] - self.name = self.header[:32].replace(b'\x00', b'') + self.header = raw[0:72] + self.name = self.header[:32].replace(b'\x00', b'') self.num_sections, = struct.unpack('>H', raw[76:78]) self.ident = self.header[0x3C:0x3C + 8].upper() @@ -94,7 +96,9 @@ class MobiReader(object): self.sections = [] self.section_headers = [] for i in range(self.num_sections): - offset, a1, a2, a3, a4 = struct.unpack('>LBBBB', raw[78 + i * 8:78 + i * 8 + 8]) + offset, a1, a2, a3, a4 = struct.unpack('>LBBBB', + raw[78 + i * 8:78 + + i * 8 + 8]) flags, val = a1, a2 << 16 | a3 << 8 | a4 self.section_headers.append((offset, flags, val)) @@ -109,8 +113,9 @@ class MobiReader(object): for i in range(self.num_sections): self.sections.append((section(i), self.section_headers[i])) - self.book_header = bh = BookHeader(self.sections[0][0], self.ident, - user_encoding, self.log, try_extra_data_fix=try_extra_data_fix) + bh = BookHeader(self.sections[0][0], self.ident, user_encoding, + self.log, try_extra_data_fix=try_extra_data_fix) + self.book_header = bh self.name = self.name.decode(self.book_header.codec, 'replace') self.kf8_type = None k8i = getattr(self.book_header.exth, 'kf8_header', None) @@ -118,18 +123,20 
@@ class MobiReader(object): # Ancient PRC files from Baen can have random values for # mobi_version, so be conservative if (self.book_header.mobi_version == 8 and hasattr(self.book_header, - 'skelidx')): + 'skelidx')): self.kf8_type = 'standalone' elif k8i is not None: # Check for joint mobi 6 and kf 8 file try: raw = self.sections[k8i-1][0] - except: + except Exception: raw = None if raw == b'BOUNDARY': try: self.book_header = BookHeader(self.sections[k8i][0], - self.ident, user_encoding, self.log) - self.book_header.kf8_first_image_index = self.book_header.first_image_index + k8i + self.ident, user_encoding, + self.log) + _kfii = self.book_header.first_image_index + k8i + self.book_header.kf8_first_image_index = _kfii self.book_header.mobi6_records = bh.records # Need the first_image_index from the mobi 6 header as well @@ -143,14 +150,14 @@ class MobiReader(object): self.kf8_type = 'joint' self.kf8_boundary = k8i-1 - except: + except Exception: self.book_header = bh def check_for_drm(self): if self.book_header.encryption_type != 0: try: name = self.book_header.exth.mi.title - except: + except Exception: name = self.name if not name: name = self.name @@ -163,20 +170,20 @@ class MobiReader(object): if self.debug is not None: parse_cache['calibre_raw_mobi_markup'] = self.mobi_html self.add_anchors() - self.processed_html = self.processed_html.decode(self.book_header.codec, - 'ignore') + self.processed_html = self.processed_html.decode( + self.book_header.codec, 'ignore') self.processed_html = self.processed_html.replace('<', - self.processed_html) + self.processed_html) self.processed_html = self.processed_html.replace('\ufeff', '') # Remove tags of the form as they can cause issues further # along the pipeline self.processed_html = re.sub(r']*>', '', - self.processed_html) + self.processed_html) self.processed_html = strip_encoding_declarations(self.processed_html) self.processed_html = re.sub(r'&(\S+?);', xml_entity_to_unicode, - self.processed_html) + 
self.processed_html) image_name_map = self.extract_images(processed_records, output_dir) self.replace_page_breaks() self.cleanup_html() @@ -186,31 +193,41 @@ class MobiReader(object): try: root = html.fromstring(self.processed_html) if len(root.xpath('//html')) > 5: - root = html.fromstring(self.processed_html.replace('\x0c', - '').replace('\x14', '')) + root = html.fromstring(self.processed_html + .replace('\x0c', '') + .replace('\x14', '')) except Exception: - self.log.warning('MOBI markup appears to contain random bytes. Stripping.') + self.log.warning('MOBI markup appears to contain random bytes. ' + 'Stripping.') self.processed_html = self.remove_random_bytes(self.processed_html) root = html.fromstring(self.processed_html) if root.xpath('descendant::p/descendant::p'): from html5_parser import parse self.log.warning('Malformed markup, parsing using html5-parser') - self.processed_html = strip_encoding_declarations(self.processed_html) + self.processed_html = strip_encoding_declarations( + self.processed_html) # These trip up the html5 parser causing all content to be placed # under the tag - self.processed_html = re.sub(r'.+?', '', self.processed_html, flags=re.I) - self.processed_html = re.sub(r'.+?', '', self.processed_html, flags=re.I) + self.processed_html = re.sub(r'.+?', '', + self.processed_html, flags=re.I) + self.processed_html = re.sub(r'.+?', '', + self.processed_html, flags=re.I) try: - root = parse(self.processed_html, maybe_xhtml=False, keep_doctype=False, sanitize_names=True) + root = parse(self.processed_html, maybe_xhtml=False, + keep_doctype=False, sanitize_names=True) except Exception: - self.log.warning('MOBI markup appears to contain random bytes. Stripping.') - self.processed_html = self.remove_random_bytes(self.processed_html) - root = parse(self.processed_html, maybe_xhtml=False, keep_doctype=False, sanitize_names=True) + self.log.warning('MOBI markup appears to contain random ' + 'bytes. 
Stripping.') + self.processed_html = self.remove_random_bytes( + self.processed_html) + root = parse(self.processed_html, maybe_xhtml=False, + keep_doctype=False, sanitize_names=True) if len(root.xpath('body/descendant::*')) < 1: # There are probably stray s in the markup self.processed_html = self.processed_html.replace('', - '') - root = parse(self.processed_html, maybe_xhtml=False, keep_doctype=False, sanitize_names=True) + '') + root = parse(self.processed_html, maybe_xhtml=False, + keep_doctype=False, sanitize_names=True) if root.tag != 'html': self.log.warn('File does not have opening tag') @@ -253,13 +270,14 @@ class MobiReader(object): head = root.makeelement('head', {}) root.insert(0, head) head.text = '\n\t' - link = head.makeelement('link', {'type':'text/css', - 'href':'styles.css', 'rel':'stylesheet'}) + link = head.makeelement('link', {'type': 'text/css', + 'href': 'styles.css', + 'rel': 'stylesheet'}) head.insert(0, link) link.tail = '\n\t' title = head.xpath('descendant::title') - m = head.makeelement('meta', {'http-equiv':'Content-Type', - 'content':'text/html; charset=utf-8'}) + m = head.makeelement('meta', {'http-equiv': 'Content-Type', + 'content': 'text/html; charset=utf-8'}) head.insert(0, m) if not title: title = head.makeelement('title', {}) @@ -283,7 +301,8 @@ class MobiReader(object): try: for ref in guide.xpath('descendant::reference'): if 'href' in ref.attrib: - ref.attrib['href'] = os.path.basename(htmlfile) + ref.attrib['href'] + ref.attrib['href'] = (os.path.basename(htmlfile) + + ref.attrib['href']) except AttributeError: pass @@ -299,7 +318,7 @@ class MobiReader(object): opf, ncx_manifest_entry = self.create_opf(htmlfile, guide, root) self.created_opf_path = os.path.splitext(htmlfile)[0] + '.opf' opf.render(open(self.created_opf_path, 'wb'), ncx, - ncx_manifest_entry=ncx_manifest_entry) + ncx_manifest_entry=ncx_manifest_entry) ncx = ncx.getvalue() if ncx: ncx_path = os.path.join(os.path.dirname(htmlfile), 'toc.ncx') @@ -313,9 +332,9 
@@ class MobiReader(object): if self.book_header.exth is not None or self.embedded_mi is not None: self.log.debug('Creating OPF...') ncx = io.BytesIO() - opf, ncx_manifest_entry = self.create_opf(htmlfile, guide, root) + opf, ncx_manifest_entry = self.create_opf(htmlfile, guide, root) opf.render(open(os.path.splitext(htmlfile)[0] + '.opf', 'wb'), ncx, - ncx_manifest_entry) + ncx_manifest_entry) ncx = ncx.getvalue() if ncx: write_as_utf8(os.path.splitext(htmlfile)[0] + '.ncx', ncx) @@ -348,28 +367,46 @@ class MobiReader(object): def cleanup_html(self): self.log.debug('Cleaning up HTML...') - self.processed_html = re.sub(r'
', '', self.processed_html) - if self.book_header.ancient and b'') + '' + self.processed_html = re.sub(r'
' + '
', '', self.processed_html) + if (self.book_header.ancient and + b'

' + + self.processed_html.replace('\n\n', '

') + + '') self.processed_html = self.processed_html.replace('\r\n', '\n') self.processed_html = self.processed_html.replace('> <', '>\n<') self.processed_html = self.processed_html.replace(']*>', '', self.processed_html) - self.processed_html = re.sub(r'<\s*(/?)\s*o:p[^>]*>', r'', self.processed_html) - # Swap inline and block level elements, and order block level elements according to priority - # - lxml and beautifulsoup expect/assume a specific order based on xhtml spec - self.processed_html = re.sub( - r'(?i)(?P(<(h\d+|i|b|u|em|small|big|strong|tt)>\s*){1,})(?P]*>)', r'\g'+r'\g', self.processed_html) - self.processed_html = re.sub( - r'(?i)(?P]*>)\s*(?P(\s*){1,})', r'\g'+r'\g', self.processed_html) - self.processed_html = re.sub( - r'(?i)(?P

(]*>\s*){1,})(?P]*>)', r'\g'+r'\g
', self.processed_html) - self.processed_html = re.sub( - r'(?i)(?P]*>)\s*(?P
(<(blockquote|div)[^>]*>\s*){1,})', r'\g
'+r'\g', self.processed_html) + self.processed_html = re.sub(r'<\s*(/?)\s*o:p[^>]*>', r'', + self.processed_html) + # Swap inline and block level elements, and order block level elements + # according to priority + # - lxml and beautifulsoup expect/assume a specific order based on + # xhtml spec + self.processed_html = re.sub(r'(?i)(?P(<(h\d+|i|b|u|em|' + r'small|big|strong|tt)>\s*){1,})' + r'(?P]*>)', + r'\g' + r'\g', + self.processed_html) + self.processed_html = re.sub(r'(?i)(?P]*>)\s*' + r'(?P(\s*){1,})', + r'\g' + r'\g', + self.processed_html) + self.processed_html = re.sub(r'(?i)(?P
(]*>\s*){1,})(?P]*>)', + r'\g' + r'\g
', + self.processed_html) + self.processed_html = re.sub(r'(?i)(?P]*>)\s*' + r'(?P
(<(blockquote|div)[^>]*>' + r'\s*){1,})', + r'\g
' + r'\g', + self.processed_html) bods = htmls = 0 for x in re.finditer('|', self.processed_html): if x == '': - bods +=1 + bods += 1 else: htmls += 1 if bods > 1 and htmls > 1: @@ -380,8 +417,8 @@ class MobiReader(object): self.processed_html = self.processed_html.replace('', '') def remove_random_bytes(self, html): - return re.sub('\x14|\x15|\x19|\x1c|\x1d|\xef|\x12|\x13|\xec|\x08|\x01|\x02|\x03|\x04|\x05|\x06|\x07', - '', html) + return re.sub('\x14|\x15|\x19|\x1c|\x1d|\xef|\x12|\x13|\xec|\x08|\x01' + '|\x02|\x03|\x04|\x05|\x06|\x07', '', html) def ensure_unit(self, raw, unit='px'): if re.search(r'\d+$', raw) is not None: @@ -448,9 +485,10 @@ class MobiReader(object): # discarded by a renderer tag.text = '\u00a0' # nbsp styles.append('height: %s' % - self.ensure_unit(height)) + self.ensure_unit(height)) else: - styles.append('margin-top: %s' % self.ensure_unit(height)) + styles.append('margin-top: %s' % + self.ensure_unit(height)) if 'width' in attrib: width = attrib.pop('width').strip() if width and re.search(r'\d+', width): @@ -464,14 +502,16 @@ class MobiReader(object): try: ewidth_val = unit_convert(ewidth, 12, 500, 166) self.text_indents[tag] = ewidth_val - except: + except Exception: pass if width.startswith('-'): - styles.append('margin-left: %s' % self.ensure_unit(width[1:])) + styles.append('margin-left: %s' % + self.ensure_unit(width[1:])) try: - ewidth_val = unit_convert(ewidth[1:], 12, 500, 166) + ewidth_val = unit_convert(ewidth[1:], + 12, 500, 166) self.left_margins[tag] = ewidth_val - except: + except Exception: pass if 'align' in attrib: @@ -514,16 +554,20 @@ class MobiReader(object): except Exception: pass else: - attrib['src'] = 'images/' + image_name_map.get(recindex, '%05d.jpg' % recindex) + attrib['src'] = ('images/' + + image_name_map.get(recindex, + '%05d.jpg' % + recindex)) for attr in ('width', 'height'): if attr in attrib: val = attrib[attr] if val.lower().endswith('em'): try: nval = float(val[:-2]) - nval *= 16 * (168.451/72) # Assume 
this was set using the Kindle profile - attrib[attr] = "%dpx"%int(nval) - except: + # Assume this was set using the Kindle profile + nval *= 16 * (168.451/72) + attrib[attr] = "%dpx" % int(nval) + except Exception: del attrib[attr] elif val.lower().endswith('%'): del attrib[attr] @@ -550,10 +594,12 @@ class MobiReader(object): attrib['href'] = "#filepos%d" % int(filepos) except ValueError: pass - if (tag.tag == 'a' and attrib.get('id', '').startswith('filepos') and - not tag.text and len(tag) == 0 and (tag.tail is None or not - tag.tail.strip()) and getattr(tag.getnext(), 'tag', - None) in BLOCK_TAGS): + if (tag.tag == 'a' and + attrib.get('id', '').startswith('filepos') and + not tag.text and len(tag) == 0 and + (tag.tail is None or + not tag.tail.strip()) and + getattr(tag.getnext(), 'tag', None) in BLOCK_TAGS): # This is an empty anchor immediately before a block tag, move # the id onto the block tag instead forwardable_anchors.append(tag) @@ -625,11 +671,11 @@ class MobiReader(object): ti = self.text_indents.get(tag, ti) try: lm = float(lm) - except: + except Exception: lm = 0.0 try: ti = float(ti) - except: + except Exception: ti = 0.0 return lm + ti @@ -647,13 +693,14 @@ class MobiReader(object): mi = MetaInformation(self.book_header.title, ['Unknown']) opf = OPFCreator(os.path.dirname(htmlfile), mi) if hasattr(self.book_header.exth, 'cover_offset'): - opf.cover = 'images/%05d.jpg' % (self.book_header.exth.cover_offset + 1) + opf.cover = 'images/%05d.jpg' % (self.book_header + .exth.cover_offset + 1) elif mi.cover is not None: opf.cover = mi.cover else: opf.cover = 'images/%05d.jpg' % 1 if not os.path.exists(os.path.join(os.path.dirname(htmlfile), - * opf.cover.split('/'))): + * opf.cover.split('/'))): opf.cover = None cover = opf.cover @@ -669,7 +716,7 @@ class MobiReader(object): opf.cover = ncover.replace(os.sep, '/') manifest = [(htmlfile, 'application/xhtml+xml'), - (os.path.abspath('styles.css'), 'text/css')] + (os.path.abspath('styles.css'), 
'text/css')] bp = os.path.dirname(htmlfile) added = set() for i in getattr(self, 'image_names', []): @@ -708,15 +755,17 @@ class MobiReader(object): if href and re.match(r'\w+://', href) is None: try: text = ' '.join([t.strip() for t in - x.xpath('descendant::text()')]) - except: + x.xpath('descendant:' + ':text()')]) + except Exception: text = '' text = ent_pat.sub(entity_to_unicode, text) - item = tocobj.add_item(toc.partition('#')[0], href[1:], - text) + item = tocobj.add_item(toc.partition('#')[0], + href[1:], text) item.left_space = int(self.get_left_whitespace(x)) found = True - if reached and found and x.get('class', None) == 'mbp_pagebreak': + if (reached and found and + x.get('class', None) == 'mbp_pagebreak'): break if tocobj is not None: tocobj = self.structure_toc(tocobj) @@ -748,7 +797,7 @@ class MobiReader(object): level = indent_vals.index(item.left_space) parent = find_parent(level) last_found[level] = parent.add_item(item.href, item.fragment, - item.text) + item.text) return newtoc @@ -782,7 +831,9 @@ class MobiReader(object): def warn_about_trailing_entry_corruption(self): if not self.warned_about_trailing_entry_corruption: self.warned_about_trailing_entry_corruption = True - self.log.warn('The trailing data entries in this MOBI file are corrupted, you might see corrupted text in the output') + self.log.warn('The trailing data entries in this MOBI file are ' + 'corrupted, you might see corrupted text in the ' + 'output') def text_section(self, index): data = self.sections[index][0] @@ -791,19 +842,23 @@ class MobiReader(object): def extract_text(self, offset=1): self.log.debug('Extracting text...') - text_sections = [self.text_section(i) for i in range(offset, - min(self.book_header.records + offset, len(self.sections)))] + text_sections = [self.text_section(i) + for i in range(offset, min(self.book_header.records + + offset, + len(self.sections)))] processed_records = list(range(offset-1, self.book_header.records + - offset)) + offset)) 
self.mobi_html = b'' if self.book_header.compression_type == b'DH': - huffs = [self.sections[i][0] for i in - range(self.book_header.huff_offset, - self.book_header.huff_offset + self.book_header.huff_number)] + huffs = [self.sections[i][0] + for i in range(self.book_header.huff_offset, + self.book_header.huff_offset + + self.book_header.huff_number)] processed_records += list(range(self.book_header.huff_offset, - self.book_header.huff_offset + self.book_header.huff_number)) + self.book_header.huff_offset + + self.book_header.huff_number)) huff = HuffReader(huffs) unpack = huff.unpack @@ -811,19 +866,23 @@ class MobiReader(object): unpack = decompress_doc elif self.book_header.compression_type == b'\x00\x01': - unpack = lambda x: x + unpack = lambda x: x # noqa else: - raise MobiError('Unknown compression algorithm: %r' % self.book_header.compression_type) + raise MobiError('Unknown compression algorithm: %r' % + self.book_header.compression_type) self.mobi_html = b''.join(map(unpack, text_sections)) if self.mobi_html.endswith(b'#'): self.mobi_html = self.mobi_html[:-1] - if self.book_header.ancient and b']+filepos=['"]{0,1}(\d+)[^<>]*>''', - re.IGNORECASE) + re.IGNORECASE) for match in link_pattern.finditer(self.mobi_html): positions.add(int(match.group(1))) pos = 0 @@ -845,12 +904,13 @@ class MobiReader(object): if end == 0: continue oend = end - l = self.mobi_html.find(b'<', end) + _l = self.mobi_html.find(b'<', end) r = self.mobi_html.find(b'>', end) anchor = b'' - if r > -1 and (r < l or l == end or l == -1): + if r > -1 and (r < _l or _l == end or _l == -1): p = self.mobi_html.rfind(b'<', 0, end + 1) - if (pos < end and p > -1 and not end_tag_re.match(self.mobi_html[p:r]) and + if (pos < end and p > -1 and + not end_tag_re.match(self.mobi_html[p:r]) and not self.mobi_html[p:r + 1].endswith(b'/>')): anchor = b' filepos-id="filepos%d"' end = r @@ -862,8 +922,9 @@ class MobiReader(object): processed_html = b''.join(processed_html) # Remove anchors placed inside 
entities - self.processed_html = re.sub(br'&([^;]*?)()([^;]*);', - br'&\1\3;\2', processed_html) + self.processed_html = re.sub(br'&([^;]*?)()' + br'([^;]*);', br'&\1\3;\2', + processed_html) def extract_images(self, processed_records, output_dir): self.log.debug('Extracting images...') @@ -881,10 +942,11 @@ class MobiReader(object): if i in processed_records: continue processed_records.append(i) - data = self.sections[i][0] + data = self.sections[i][0] image_index += 1 if data[:4] in {b'FLIS', b'FCIS', b'SRCS', b'\xe9\x8e\r\n', - b'RESC', b'BOUN', b'FDST', b'DATP', b'AUDI', b'VIDE'}: + b'RESC', b'BOUN', b'FDST', b'DATP', b'AUDI', + b'VIDE'}: # This record is a known non image type, no need to try to # load the image continue @@ -920,16 +982,17 @@ class MobiReader(object): def test_mbp_regex(): - for raw, m in {'':'', - 'yyy':' xxxyyy', - ' ':'', - 'xxx':'xxx', - 'xxx':'xxx', - 'xxx':' sdfxxx', - '':' ', - '':'', - '':' sdf', - 'xxx':'xxx'}.items(): + for raw, m in {'': '', + 'yyy': ' xxxyyy', + ' ': '', + 'xxx': 'xxx', + 'xxx': 'xxx', + 'xxx': ' sdfxxx', + '': ' ', + '': '', + '': ' sdf', + 'xxx': + 'xxx'}.items(): ans = MobiReader.PAGE_BREAK_PAT.sub(r'\1', raw) if ans != m: - raise Exception('%r != %r for %r'%(ans, m, raw)) + raise Exception('%r != %r for %r' % (ans, m, raw)) diff --git a/ebook_converter/ebooks/oeb/transforms/data_url.py b/ebook_converter/ebooks/oeb/transforms/data_url.py index 07c0363..30b9697 100644 --- a/ebook_converter/ebooks/oeb/transforms/data_url.py +++ b/ebook_converter/ebooks/oeb/transforms/data_url.py @@ -1,13 +1,11 @@ import mimetypes import re + from ebook_converter.ebooks.oeb.base import XPath, urlunquote +from ebook_converter.polyglot.binary import from_base64_bytes from ebook_converter.polyglot.builtins import as_bytes -__license__ = 'GPL v3' -__copyright__ = '2014, Kovid Goyal ' - - class DataURL(object): def __call__(self, oeb, opts): @@ -27,25 +25,29 @@ class DataURL(object): continue if ';base64' in header: data = 
re.sub(r'\s+', '', data) - from ebook_converter.polyglot.binary import from_base64_bytes try: data = from_base64_bytes(data) except Exception: - self.log.error('Found invalid base64 encoded data URI, ignoring it') + self.log.error('Found invalid base64 encoded data ' + 'URI, ignoring it') continue else: data = urlunquote(data) data = as_bytes(data) fmt = what(None, data) if not fmt: - self.log.warn('Image encoded as data URL has unknown format, ignoring') + self.log.warn('Image encoded as data URL has unknown ' + 'format, ignoring') continue - img.set('src', item.relhref(self.convert_image_data_uri(data, fmt, oeb))) + img.set('src', + item.relhref(self.convert_image_data_uri(data, fmt, + oeb))) def convert_image_data_uri(self, data, fmt, oeb): - self.log('Found image encoded as data URI converting it to normal image') - from ebook_converter import guess_type - item_id, item_href = oeb.manifest.generate('data-url-image', 'data-url-image.' + fmt) + self.log('Found image encoded as data URI converting it to normal ' + 'image') + item_id, item_href = oeb.manifest.generate('data-url-image', + 'data-url-image.' 
+ fmt) oeb.manifest.add(item_id, item_href, mimetypes.guess_type(item_href)[0], data=data) return item_href diff --git a/ebook_converter/ebooks/oeb/transforms/jacket.py b/ebook_converter/ebooks/oeb/transforms/jacket.py index 7f14d66..716e138 100644 --- a/ebook_converter/ebooks/oeb/transforms/jacket.py +++ b/ebook_converter/ebooks/oeb/transforms/jacket.py @@ -1,9 +1,11 @@ import mimetypes -import sys, os, re -from xml.sax.saxutils import escape -from string import Formatter +import os import pkg_resources +import re +import string +import sys import urllib.parse +from xml.sax import saxutils from ebook_converter import constants as const from ebook_converter import strftime @@ -16,18 +18,14 @@ from ebook_converter.ebooks.chardet import strip_encoding_declarations from ebook_converter.ebooks.metadata import fmt_sidx, rating_to_stars -__license__ = 'GPL v3' -__copyright__ = '2009, Kovid Goyal ' -__docformat__ = 'restructuredtext en' - JACKET_XPATH = '//h:meta[@name="calibre-content" and @content="jacket"]' -class SafeFormatter(Formatter): +class SafeFormatter(string.Formatter): def get_value(self, *args, **kwargs): try: - return Formatter.get_value(self, *args, **kwargs) + return string.Formatter.get_value(self, *args, **kwargs) except KeyError: return '' @@ -40,7 +38,7 @@ class Base(object): for img in path(item.data): if removed >= limit: break - href = item.abshref(img.get('src')) + href = item.abshref(img.get('src')) image = self.oeb.manifest.hrefs.get(href) if image is None: href = urlnormalize(href) @@ -68,7 +66,8 @@ class RemoveFirstImage(Base): raw = xml2text(body[0]).strip() imgs = XPath('//h:img|//svg:svg')(item.data) if not raw and not imgs: - self.log('Removing %s as it has no content'%item.href) + self.log('Removing %s as it has no content' % + item.href) self.oeb.manifest.remove(item) deleted_item = item break @@ -82,20 +81,20 @@ class RemoveFirstImage(Base): self.oeb.guide.remove_by_href(deleted_item.href) def __call__(self, oeb, opts, metadata): - ''' 
+ """ Add metadata in jacket.xhtml if specified in opts If not specified, remove previous jacket instance - ''' + """ self.oeb, self.opts, self.log = oeb, opts, oeb.log if opts.remove_first_image: self.remove_first_image() class Jacket(Base): - ''' - Book jacket manipulation. Remove first image and insert comments at start of - book. - ''' + """ + Book jacket manipulation. Remove first image and insert comments at start + of book. + """ def insert_metadata(self, mi): self.log('Inserting metadata into book...') @@ -107,22 +106,24 @@ class Jacket(Base): try: comments = str(self.oeb.metadata.description[0]) - except: + except Exception: comments = '' try: title = str(self.oeb.metadata.title[0]) - except: + except Exception: title = 'Unknown' try: authors = list(map(str, self.oeb.metadata.creator)) - except: + except Exception: authors = ['Unknown'] root = render_jacket(mi, self.opts.output_profile, - alt_title=title, alt_tags=tags, alt_authors=authors, - alt_comments=comments, rescale_fonts=True) + alt_title=title, alt_tags=tags, + alt_authors=authors, + alt_comments=comments, + rescale_fonts=True) id, href = self.oeb.manifest.generate('calibre_jacket', 'jacket.xhtml') jacket = self.oeb.manifest.add(id, href, mimetypes.guess_type(href)[0], @@ -132,7 +133,8 @@ class Jacket(Base): for img, path in referenced_images(root): self.oeb.log('Embedding referenced image %s into jacket' % path) ext = path.rpartition('.')[-1].lower() - item_id, href = self.oeb.manifest.generate('jacket_image', 'jacket_img.'+ext) + item_id, href = self.oeb.manifest.generate('jacket_image', + 'jacket_img.' 
+ ext) with open(path, 'rb') as f: item = self.oeb.manifest.add( item_id, href, mimetypes.guess_type(href)[0], @@ -149,10 +151,10 @@ class Jacket(Base): break def __call__(self, oeb, opts, metadata): - ''' + """ Add metadata in jacket.xhtml if specified in opts If not specified, remove previous jacket instance - ''' + """ self.oeb, self.opts, self.log = oeb, opts, oeb.log self.remove_existing_jacket() if opts.insert_metadata: @@ -164,8 +166,8 @@ class Jacket(Base): def get_rating(rating, rchar, e_rchar): ans = '' try: - num = float(rating)/2 - except: + num = float(rating) / 2 + except Exception: return ans num = max(0, num) num = min(num, 5) @@ -180,25 +182,29 @@ class Series(str): def __new__(self, series, series_index): if series and series_index is not None: - roman = '{1} of {0}'.format( - escape(series), escape(fmt_sidx(series_index, use_roman=True))) - combined = '{1} of {0}'.format( - escape(series), escape(fmt_sidx(series_index, - use_roman=False))) + _roman = saxutils.escape(fmt_sidx(series_index, use_roman=True)) + _no_roman = saxutils.escape(fmt_sidx(series_index, + use_roman=False)) + roman = '{1} of {0}'.format(saxutils.escape(series), + _roman) + combined = '{1} of {0}'.format(saxutils.escape(series), + _no_roman) else: - combined = roman = escape(series or u'') + combined = roman = saxutils.escape(series or u'') s = str.__new__(self, combined) s.roman = roman - s.name = escape(series or '') - s.number = escape(fmt_sidx(series_index or 1.0, use_roman=False)) - s.roman_number = escape(fmt_sidx(series_index or 1.0, use_roman=True)) + s.name = saxutils.escape(series or '') + s.number = saxutils.escape(fmt_sidx(series_index or 1.0, + use_roman=False)) + s.roman_number = saxutils.escape(fmt_sidx(series_index or 1.0, + use_roman=True)) return s class Tags(str): def __new__(self, tags, output_profile): - tags = [escape(x) for x in tags or ()] + tags = [saxutils.escape(x) for x in tags or ()] t = str.__new__(self, ', '.join(tags)) t.alphabetical = ', 
'.join(sorted(tags)) t.tags_list = tags @@ -233,9 +239,9 @@ def postprocess_jacket(root, output_profile, has_data): extract_class('cbj_kindle_banner_hr') -def render_jacket(mi, output_profile, - alt_title='Unknown', alt_tags=[], alt_comments='', - alt_publisher='', rescale_fonts=False, alt_authors=None): +def render_jacket(mi, output_profile, alt_title='Unknown', alt_tags=[], + alt_comments='', alt_publisher='', rescale_fonts=False, + alt_authors=None): with open(pkg_resources.resource_filename('ebook_converter', 'data/jacket/stylesheet.css'), 'rb') as fobj: @@ -250,17 +256,20 @@ def render_jacket(mi, output_profile, try: title_str = alt_title if mi.is_null('title') else mi.title - except: + except Exception: title_str = 'Unknown' - title_str = escape(title_str) + title_str = saxutils.escape(title_str) title = '%s' % title_str series = Series(mi.series, mi.series_index) try: - publisher = mi.publisher if not mi.is_null('publisher') else alt_publisher - except: + if not mi.is_null('publisher'): + publisher = mi.publisher + else: + publisher = alt_publisher + except Exception: publisher = '' - publisher = escape(publisher) + publisher = saxutils.escape(publisher) try: if is_date_undefined(mi.pubdate): @@ -268,10 +277,11 @@ def render_jacket(mi, output_profile, else: dt = as_local_time(mi.pubdate) pubdate = strftime('%Y', dt.timetuple()) - except: + except Exception: pubdate = '' - rating = get_rating(mi.rating, output_profile.ratings_char, output_profile.empty_ratings_char) + rating = get_rating(mi.rating, output_profile.ratings_char, + output_profile.empty_ratings_char) tags = Tags((mi.tags if mi.tags else alt_tags), output_profile) @@ -285,10 +295,10 @@ def render_jacket(mi, output_profile, mi.authors = list(alt_authors or ('Unknown',)) try: author = mi.format_authors() - except: + except Exception: author = '' mi.authors = orig - author = escape(author) + author = saxutils.escape(author) has_data = {} def generate_html(comments): @@ -301,7 +311,7 @@ def 
render_jacket(mi, output_profile, 'publisher': publisher, 'rating': rating, 'rating_label': 'Rating', - 'searchable_tags': ' '.join(escape(t) + 'ttt' + 'searchable_tags': ' '.join(saxutils.escape(t) + 'ttt' for t in tags.tags_list), 'series': series, 'series_label': 'Series', @@ -320,25 +330,30 @@ def render_jacket(mi, output_profile, if dt == 'series': args[dkey] = Series(mi.get(key), mi.get(key + '_index')) elif dt == 'rating': - args[dkey] = rating_to_stars(mi.get(key), m.get('display', {}).get('allow_half_stars', False)) + args[dkey] = rating_to_stars(mi.get(key), + m.get('display', {}) + .get('allow_half_stars', + False)) elif dt == 'comments': val = val or '' display = m.get('display', {}) ctype = display.get('interpret_as') or 'html' if ctype == 'long-text': - val = '
%s
' % escape(val) + val = ('
%s
' % + saxutils.escape(val)) elif ctype == 'short-text': - val = '%s' % escape(val) + val = '%s' % saxutils.escape(val) elif ctype == 'markdown': val = markdown(val) else: val = comments_to_html(val) args[dkey] = val else: - args[dkey] = escape(val) - args[dkey+'_label'] = escape(display_name) + args[dkey] = saxutils.escape(val) + args[dkey+'_label'] = saxutils.escape(display_name) except Exception: - # if the val (custom column contents) is None, don't add to args + # if the val (custom column contents) is None, don't add to + # args pass if False: @@ -371,10 +386,11 @@ def render_jacket(mi, output_profile, # the text in the book. That means that as long as the jacket uses # relative font sizes (em or %), the post conversion font size will be # the same as for text in the main book. So text with size x em will - # be rescaled to the same value in both the jacket and the main content. + # be rescaled to the same value in both the jacket and the main + # content. # - # We cannot use data-calibre-rescale 100 on the body tag as that will just - # give the body tag a font size of 1em, which is useless. + # We cannot use data-calibre-rescale 100 on the body tag as that will + # just give the body tag a font size of 1em, which is useless. for body in root.xpath('//*[local-name()="body"]'): fw = body.makeelement(base.tag('xhtml', 'div')) fw.set('data-calibre-rescale', '100')