diff --git a/ebook_converter/ebooks/conversion/plugins/html_input.py b/ebook_converter/ebooks/conversion/plugins/html_input.py
index 97c553f..76eab84 100644
--- a/ebook_converter/ebooks/conversion/plugins/html_input.py
+++ b/ebook_converter/ebooks/conversion/plugins/html_input.py
@@ -15,17 +15,18 @@ from ebook_converter.polyglot.builtins import as_unicode
def sanitize_file_name(x):
- ans = re.sub(r'\s+', ' ', re.sub(r'[?&=;#]', '_', ascii_filename(x))).strip().rstrip('.')
+ ans = re.sub(r'\s+', ' ', re.sub(r'[?&=;#]', '_',
+ ascii_filename(x))).strip().rstrip('.')
ans, ext = ans.rpartition('.')[::2]
return (ans.strip() + '.' + ext.strip()).rstrip('.')
class HTMLInput(InputFormatPlugin):
- name = 'HTML Input'
- author = 'Kovid Goyal'
+ name = 'HTML Input'
+ author = 'Kovid Goyal'
description = 'Convert HTML and OPF files to an OEB'
- file_types = {'opf', 'html', 'htm', 'xhtml', 'xhtm', 'shtm', 'shtml'}
+ file_types = {'opf', 'html', 'htm', 'xhtml', 'xhtm', 'shtm', 'shtml'}
commit_name = 'html_input'
options = {
diff --git a/ebook_converter/ebooks/conversion/plugins/htmlz_input.py b/ebook_converter/ebooks/conversion/plugins/htmlz_input.py
index e8d7765..6f465f0 100644
--- a/ebook_converter/ebooks/conversion/plugins/htmlz_input.py
+++ b/ebook_converter/ebooks/conversion/plugins/htmlz_input.py
@@ -6,10 +6,10 @@ from ebook_converter.customize.conversion import InputFormatPlugin
class HTMLZInput(InputFormatPlugin):
- name = 'HTLZ Input'
- author = 'John Schember'
+ name = 'HTLZ Input'
+ author = 'John Schember'
description = 'Convert HTML files to HTML'
- file_types = {'htmlz'}
+ file_types = {'htmlz'}
commit_name = 'htmlz_input'
def convert(self, stream, options, file_ext, log,
@@ -36,13 +36,14 @@ class HTMLZInput(InputFormatPlugin):
top_levels.append(x)
# Try to find an index. file.
for x in top_levels:
- if x.lower() in (u'index.html', u'index.xhtml', u'index.htm'):
+ if x.lower() in ('index.html', 'index.xhtml', 'index.htm'):
index = x
break
# Look for multiple HTML files in the archive. We look at the
# top level files only as only they matter in HTMLZ.
for x in top_levels:
- if os.path.splitext(x)[1].lower() in (u'.html', u'.xhtml', u'.htm'):
+ if os.path.splitext(x)[1].lower() in ('.html', '.xhtml',
+ '.htm'):
# Set index to the first HTML file found if it's not
# called index.
if not index:
@@ -84,15 +85,14 @@ class HTMLZInput(InputFormatPlugin):
c = 0
while os.path.exists(htmlfile):
c += 1
- htmlfile = u'index%d.html'%c
+ htmlfile = u'index%d.html' % c
with open(htmlfile, 'wb') as f:
f.write(html.encode('utf-8'))
odi = options.debug_pipeline
options.debug_pipeline = None
# Generate oeb from html conversion.
with open(htmlfile, 'rb') as f:
- oeb = html_input.convert(f, options, 'html', log,
- {})
+ oeb = html_input.convert(f, options, 'html', log, {})
options.debug_pipeline = odi
os.remove(htmlfile)
diff --git a/ebook_converter/ebooks/docx/to_html.py b/ebook_converter/ebooks/docx/to_html.py
index d1c4ed8..dc79f2c 100644
--- a/ebook_converter/ebooks/docx/to_html.py
+++ b/ebook_converter/ebooks/docx/to_html.py
@@ -1,5 +1,11 @@
-import sys, os, re, math, errno, uuid, numbers
-from collections import OrderedDict, defaultdict
+import sys
+import os
+import re
+import math
+import errno
+import uuid
+import numbers
+import collections
import mimetypes
from lxml import etree
@@ -7,23 +13,24 @@ from lxml import html
from lxml.html.builder import (
HTML, HEAD, TITLE, BODY, LINK, META, P, SPAN, BR, DIV, A, DT, DL, DD, H1)
-from ebook_converter import guess_type
-from ebook_converter.ebooks.docx.container import DOCX
-from ebook_converter.ebooks.docx.names import XML, generate_anchor
-from ebook_converter.ebooks.docx.styles import Styles, inherit, PageProperties
-from ebook_converter.ebooks.docx.numbering import Numbering
-from ebook_converter.ebooks.docx.fonts import Fonts, is_symbol_font, map_symbol_text
-from ebook_converter.ebooks.docx.images import Images
-from ebook_converter.ebooks.docx.tables import Tables
-from ebook_converter.ebooks.docx.footnotes import Footnotes
from ebook_converter.ebooks.docx.cleanup import cleanup_markup
+from ebook_converter.ebooks.docx.container import DOCX
+from ebook_converter.ebooks.docx.fields import Fields
+from ebook_converter.ebooks.docx.fonts import Fonts
+from ebook_converter.ebooks.docx.fonts import is_symbol_font
+from ebook_converter.ebooks.docx.fonts import map_symbol_text
+from ebook_converter.ebooks.docx.footnotes import Footnotes
+from ebook_converter.ebooks.docx.images import Images
+from ebook_converter.ebooks.docx.names import XML, generate_anchor
+from ebook_converter.ebooks.docx.numbering import Numbering
+from ebook_converter.ebooks.docx.settings import Settings
+from ebook_converter.ebooks.docx.styles import Styles, inherit, PageProperties
+from ebook_converter.ebooks.docx.tables import Tables
from ebook_converter.ebooks.docx.theme import Theme
from ebook_converter.ebooks.docx.toc import create_toc
-from ebook_converter.ebooks.docx.fields import Fields
-from ebook_converter.ebooks.docx.settings import Settings
from ebook_converter.ebooks.metadata.opf2 import OPFCreator
-from ebook_converter.utils.localization import canonicalize_lang, lang_as_iso639_1
-
+from ebook_converter.utils.localization import canonicalize_lang
+from ebook_converter.utils.localization import lang_as_iso639_1
NBSP = '\xa0'
@@ -54,7 +61,9 @@ def html_lang(docx_lang):
class Convert(object):
- def __init__(self, path_or_stream, dest_dir=None, log=None, detect_cover=True, notes_text=None, notes_nopb=False, nosupsub=False):
+ def __init__(self, path_or_stream, dest_dir=None, log=None,
+ detect_cover=True, notes_text=None, notes_nopb=False,
+ nosupsub=False):
self.docx = DOCX(path_or_stream, log=log)
self.namespace = self.docx.namespace
self.ms_pat = re.compile(r'\s{2,}')
@@ -73,7 +82,7 @@ class Convert(object):
self.fields = Fields(self.namespace)
self.styles = Styles(self.namespace, self.tables)
self.images = Images(self.namespace, self.log)
- self.object_map = OrderedDict()
+ self.object_map = collections.OrderedDict()
self.html = HTML(
HEAD(
META(charset='utf-8'),
@@ -82,9 +91,9 @@ class Convert(object):
),
self.body
)
- self.html.text='\n\t'
- self.html[0].text='\n\t\t'
- self.html[0].tail='\n'
+ self.html.text = '\n\t'
+ self.html[0].text = '\n\t\t'
+ self.html[0].tail = '\n'
for child in self.html[0]:
child.tail = '\n\t\t'
self.html[0][-1].tail = '\n\t'
@@ -98,17 +107,18 @@ class Convert(object):
def __call__(self):
doc = self.docx.document
- relationships_by_id, relationships_by_type = self.docx.document_relationships
+ (relationships_by_id,
+ relationships_by_type) = self.docx.document_relationships
self.resolve_alternate_content(doc)
self.fields(doc, self.log)
self.read_styles(relationships_by_type)
self.images(relationships_by_id)
- self.layers = OrderedDict()
+ self.layers = collections.OrderedDict()
self.framed = [[]]
self.frame_map = {}
self.framed_map = {}
self.anchor_map = {}
- self.link_map = defaultdict(list)
+ self.link_map = collections.defaultdict(list)
self.link_source_map = {}
self.toc_anchor = None
self.block_runs = []
@@ -142,7 +152,8 @@ class Convert(object):
dl = DL(id=anchor)
dl.set('class', 'footnote')
self.body.append(dl)
- dl.append(DT('[', A('←' + text, href='#back_%s' % anchor, title=text)))
+ dl.append(DT('[', A('←' + text, href='#back_%s' % anchor,
+ title=text)))
dl[-1][0].tail = ']'
dl.append(DD())
paras = []
@@ -159,7 +170,8 @@ class Convert(object):
self.mark_block_runs(paras)
for p, wp in self.object_map.items():
- if len(p) > 0 and not p.text and len(p[0]) > 0 and not p[0].text and p[0][0].get('class', None) == 'tab':
+ if (len(p) > 0 and not p.text and len(p[0]) > 0 and
+ not p[0].text and p[0][0].get('class', None) == 'tab'):
# Paragraph uses tabs for indentation, convert to text-indent
parent = p[0]
tabs = []
@@ -172,7 +184,9 @@ class Convert(object):
break
indent = len(tabs) * self.settings.default_tab_stop
style = self.styles.resolve(wp)
- if style.text_indent is inherit or (hasattr(style.text_indent, 'endswith') and style.text_indent.endswith('pt')):
+ if (style.text_indent is inherit or
+ (hasattr(style.text_indent, 'endswith') and
+ style.text_indent.endswith('pt'))):
if style.text_indent is not inherit:
indent = float(style.text_indent[:-2]) + indent
style.text_indent = '%.3gpt' % indent
@@ -197,7 +211,8 @@ class Convert(object):
except (TypeError, ValueError):
lvl = 0
numbered.append((html_obj, num_id, lvl))
- self.numbering.apply_markup(numbered, self.body, self.styles, self.object_map, self.images)
+ self.numbering.apply_markup(numbered, self.body, self.styles,
+ self.object_map, self.images)
self.apply_frames()
if len(self.body) > 0:
@@ -232,13 +247,15 @@ class Convert(object):
self.fields.polish_markup(self.object_map)
self.log.debug('Cleaning up redundant markup generated by Word')
- self.cover_image = cleanup_markup(self.log, self.html, self.styles, self.dest_dir, self.detect_cover, self.namespace.XPath)
+ self.cover_image = cleanup_markup(self.log, self.html, self.styles,
+ self.dest_dir, self.detect_cover,
+ self.namespace.XPath)
return self.write(doc)
def read_page_properties(self, doc):
current = []
- self.page_map = OrderedDict()
+ self.page_map = collections.OrderedDict()
self.section_starts = []
for p in self.namespace.descendants(doc, 'w:p', 'w:tbl'):
@@ -267,7 +284,8 @@ class Convert(object):
def resolve_alternate_content(self, doc):
# For proprietary extensions in Word documents use the fallback, spec
# compliant form
- # See https://wiki.openoffice.org/wiki/OOXML/Markup_Compatibility_and_Extensibility
+ # See https://wiki.openoffice.org/wiki/
+ # OOXML/Markup_Compatibility_and_Extensibility
for ac in self.namespace.descendants(doc, 'mc:AlternateContent'):
choices = self.namespace.XPath('./mc:Choice')(ac)
fallbacks = self.namespace.XPath('./mc:Fallback')(ac)
@@ -284,7 +302,8 @@ class Convert(object):
cname[-1] = defname
if self.docx.exists('/'.join(cname)):
name = name
- if name and name.startswith('word/word') and not self.docx.exists(name):
+ if (name and name.startswith('word/word') and
+ not self.docx.exists(name)):
name = name.partition('/')[2]
return name
@@ -327,7 +346,8 @@ class Convert(object):
self.log.warn('Endnotes %s do not exist' % enname)
else:
enrel = self.docx.get_relationships(enname)
- footnotes(etree.fromstring(foraw) if foraw else None, forel, etree.fromstring(enraw) if enraw else None, enrel)
+ footnotes(etree.fromstring(foraw) if foraw else None, forel,
+ etree.fromstring(enraw) if enraw else None, enrel)
if fname is not None:
embed_relationships = self.docx.get_relationships(fname)[0]
@@ -336,7 +356,8 @@ class Convert(object):
except KeyError:
self.log.warn('Fonts table %s does not exist' % fname)
else:
- fonts(etree.fromstring(raw), embed_relationships, self.docx, self.dest_dir)
+ fonts(etree.fromstring(raw), embed_relationships, self.docx,
+ self.dest_dir)
if tname is not None:
try:
@@ -364,16 +385,20 @@ class Convert(object):
except KeyError:
self.log.warn('Numbering styles %s do not exist' % nname)
else:
- numbering(etree.fromstring(raw), self.styles, self.docx.get_relationships(nname)[0])
+ numbering(etree.fromstring(raw), self.styles,
+ self.docx.get_relationships(nname)[0])
self.styles.resolve_numbering(numbering)
def write(self, doc):
- toc = create_toc(doc, self.body, self.resolved_link_map, self.styles, self.object_map, self.log, self.namespace)
- raw = html.tostring(self.html, encoding='utf-8', doctype='')
+ toc = create_toc(doc, self.body, self.resolved_link_map, self.styles,
+ self.object_map, self.log, self.namespace)
+ raw = html.tostring(self.html, encoding='utf-8',
+ doctype='')
with open(os.path.join(self.dest_dir, 'index.html'), 'wb') as f:
f.write(raw)
- css = self.styles.generate_css(self.dest_dir, self.docx, self.notes_nopb, self.nosupsub)
+ css = self.styles.generate_css(self.dest_dir, self.docx,
+ self.notes_nopb, self.nosupsub)
if css:
with open(os.path.join(self.dest_dir, 'docx.css'), 'wb') as f:
f.write(css.encode('utf-8'))
@@ -394,23 +419,29 @@ class Convert(object):
title='Table of Contents',
type='toc'))
toc_file = os.path.join(self.dest_dir, 'toc.ncx')
- with open(os.path.join(self.dest_dir, 'metadata.opf'), 'wb') as of, open(toc_file, 'wb') as ncx:
+ with open(os.path.join(self.dest_dir,
+ 'metadata.opf'), 'wb') as of, open(toc_file,
+ 'wb') as ncx:
opf.render(of, ncx, 'toc.ncx', process_guide=process_guide)
if os.path.getsize(toc_file) == 0:
os.remove(toc_file)
return os.path.join(self.dest_dir, 'metadata.opf')
def read_block_anchors(self, doc):
- doc_anchors = frozenset(self.namespace.XPath('./w:body/w:bookmarkStart[@w:name]')(doc))
+ doc_anchors = frozenset(self.namespace.XPath('./w:body/w:bookmarkStart'
+ '[@w:name]')(doc))
if doc_anchors:
current_bm = set()
- rmap = {v:k for k, v in self.object_map.items()}
- for p in self.namespace.descendants(doc, 'w:p', 'w:bookmarkStart[@w:name]'):
+ rmap = {v: k for k, v in self.object_map.items()}
+ for p in self.namespace.descendants(doc, 'w:p',
+ 'w:bookmarkStart[@w:name]'):
if p.tag.endswith('}p'):
if current_bm and p in rmap:
para = rmap[p]
if 'id' not in para.attrib:
- para.set('id', generate_anchor(next(iter(current_bm)), frozenset(self.anchor_map.values())))
+ _bm = next(iter(current_bm))
+ _am = frozenset(self.anchor_map.values())
+ para.set('id', generate_anchor(_bm, _am))
for name in current_bm:
self.anchor_map[name] = para.get('id')
current_bm = set()
@@ -442,13 +473,15 @@ class Convert(object):
except AttributeError:
break
- for x in self.namespace.descendants(p, 'w:r', 'w:bookmarkStart', 'w:hyperlink', 'w:instrText'):
+ for x in self.namespace.descendants(p, 'w:r', 'w:bookmarkStart',
+ 'w:hyperlink', 'w:instrText'):
if p_parent(x) is not p:
continue
if x.tag.endswith('}r'):
span = self.convert_run(x)
if current_anchor is not None:
- (dest if len(dest) == 0 else span).set('id', current_anchor)
+ (dest if len(dest) == 0 else span).set('id',
+ current_anchor)
current_anchor = None
if current_hyperlink is not None:
try:
@@ -462,11 +495,14 @@ class Convert(object):
self.layers[p].append(x)
elif x.tag.endswith('}bookmarkStart'):
anchor = self.namespace.get(x, 'w:name')
- if anchor and anchor not in self.anchor_map and anchor != '_GoBack':
+ if (anchor and anchor not in self.anchor_map and
+ anchor != '_GoBack'):
# _GoBack is a special bookmark inserted by Word 2010 for
# the return to previous edit feature, we ignore it
old_anchor = current_anchor
- self.anchor_map[anchor] = current_anchor = generate_anchor(anchor, frozenset(self.anchor_map.values()))
+ current_anchor = generate_anchor(
+                        anchor, frozenset(self.anchor_map.values()))
+ self.anchor_map[anchor] = current_anchor
if old_anchor is not None:
# The previous anchor was not applied to any element
for a, t in tuple(self.anchor_map.items()):
@@ -474,10 +510,13 @@ class Convert(object):
self.anchor_map[a] = current_anchor
elif x.tag.endswith('}hyperlink'):
current_hyperlink = x
- elif x.tag.endswith('}instrText') and x.text and x.text.strip().startswith('TOC '):
+ elif (x.tag.endswith('}instrText') and x.text and
+ x.text.strip().startswith('TOC ')):
old_anchor = current_anchor
anchor = str(uuid.uuid4())
- self.anchor_map[anchor] = current_anchor = generate_anchor('toc', frozenset(self.anchor_map.values()))
+ current_anchor = generate_anchor(
+ 'toc', frozenset(self.anchor_map.values()))
+ self.anchor_map[anchor] = current_anchor
self.toc_anchor = current_anchor
if old_anchor is not None:
# The previous anchor was not applied to any element
@@ -489,7 +528,8 @@ class Convert(object):
dest.set('id', current_anchor)
current_anchor = None
- m = re.match(r'heading\s+(\d+)$', style.style_name or '', re.IGNORECASE)
+ m = re.match(r'heading\s+(\d+)$', style.style_name or '',
+ re.IGNORECASE)
if m is not None:
n = min(6, max(1, int(m.group(1))))
dest.tag = 'h%d' % n
@@ -533,7 +573,8 @@ class Convert(object):
if len(dest) > 0 and not dest[-1].tail:
if dest[-1].tag == 'br':
dest[-1].tail = NBSP
- elif len(dest[-1]) > 0 and dest[-1][-1].tag == 'br' and not dest[-1][-1].tail:
+ elif (len(dest[-1]) > 0 and dest[-1][-1].tag == 'br' and
+ not dest[-1][-1].tail):
dest[-1][-1].tail = NBSP
return dest
@@ -578,12 +619,12 @@ class Convert(object):
if anchor and anchor in self.anchor_map:
span.set('href', '#' + self.anchor_map[anchor])
continue
- self.log.warn('Hyperlink with unknown target (rid=%s, anchor=%s), ignoring' %
- (rid, anchor))
+ self.log.warn('Hyperlink with unknown target (rid=%s, anchor=%s), '
+ 'ignoring' % (rid, anchor))
# hrefs that point nowhere give epubcheck a hernia. The element
# should be styled explicitly by Word anyway.
# span.set('href', '#')
- rmap = {v:k for k, v in self.object_map.items()}
+ rmap = {v: k for k, v in self.object_map.items()}
for hyperlink, runs in self.fields.hyperlink_fields:
spans = [rmap[r] for r in runs if r in rmap]
if not spans:
@@ -604,7 +645,8 @@ class Convert(object):
if anchor in self.anchor_map:
span.set('href', '#' + self.anchor_map[anchor])
continue
- self.log.warn('Hyperlink field with unknown anchor: %s' % anchor)
+ self.log.warn('Hyperlink field with unknown anchor: %s' %
+ anchor)
else:
if url in self.anchor_map:
span.set('href', '#' + self.anchor_map[url])
@@ -652,7 +694,8 @@ class Convert(object):
# actually needs it, i.e. if it has more than one
# consecutive space or it has newlines or tabs.
multi_spaces = self.ms_pat.search(ctext) is not None
- preserve = multi_spaces or self.ws_pat.search(ctext) is not None
+ preserve = (multi_spaces or
+ self.ws_pat.search(ctext) is not None)
if preserve:
text.add_elem(SPAN(ctext, style="white-space:pre-wrap"))
ans.append(text.elem)
@@ -668,24 +711,30 @@ class Convert(object):
else:
clear = child.get('clear', None)
if clear in {'all', 'left', 'right'}:
- br = BR(style='clear:%s'%('both' if clear == 'all' else clear))
+ br = BR(style='clear:%s' % ('both' if clear == 'all'
+ else clear))
else:
br = BR()
text.add_elem(br)
ans.append(text.elem)
- elif self.namespace.is_tag(child, 'w:drawing') or self.namespace.is_tag(child, 'w:pict'):
- for img in self.images.to_html(child, self.current_page, self.docx, self.dest_dir):
+ elif (self.namespace.is_tag(child, 'w:drawing') or
+ self.namespace.is_tag(child, 'w:pict')):
+ for img in self.images.to_html(child, self.current_page,
+ self.docx, self.dest_dir):
text.add_elem(img)
ans.append(text.elem)
- elif self.namespace.is_tag(child, 'w:footnoteReference') or self.namespace.is_tag(child, 'w:endnoteReference'):
+ elif (self.namespace.is_tag(child, 'w:footnoteReference') or
+ self.namespace.is_tag(child, 'w:endnoteReference')):
anchor, name = self.footnotes.get_ref(child)
if anchor and name:
- l = A(name, id='back_%s' % anchor, href='#' + anchor, title=name)
- l.set('class', 'noteref')
- text.add_elem(l)
+ _l = A(name, id='back_%s' % anchor, href='#' + anchor,
+ title=name)
+ _l.set('class', 'noteref')
+ text.add_elem(_l)
ans.append(text.elem)
elif self.namespace.is_tag(child, 'w:tab'):
- spaces = int(math.ceil((self.settings.default_tab_stop / 36) * 6))
+ spaces = int(math.ceil((self.settings.default_tab_stop / 36) *
+ 6))
text.add_elem(SPAN(NBSP * spaces))
ans.append(text.elem)
ans[-1].set('class', 'tab')
@@ -699,7 +748,8 @@ class Convert(object):
style = self.styles.resolve_run(run)
if style.vert_align in {'superscript', 'subscript'}:
if ans.text or len(ans):
- ans.set('data-docx-vert', 'sup' if style.vert_align == 'superscript' else 'sub')
+ ans.set('data-docx-vert',
+ 'sup' if style.vert_align == 'superscript' else 'sub')
if style.lang is not inherit:
lang = html_lang(style.lang)
if lang is not None and lang != self.doc_lang:
@@ -738,12 +788,14 @@ class Convert(object):
idx = parent.index(paras[0])
frame = DIV(*paras)
parent.insert(idx, frame)
- self.framed_map[frame] = css = style.css(self.page_map[self.object_map[paras[0]]])
+ self.framed_map[frame] = css = style.css(
+ self.page_map[self.object_map[paras[0]]])
self.styles.register(css, 'frame')
if not self.block_runs:
return
- rmap = {v:k for k, v in self.object_map.items()}
+
+ rmap = {v: k for k, v in self.object_map.items()}
for border_style, blocks in self.block_runs:
paras = tuple(rmap[p] for p in blocks)
for p in paras:
@@ -796,17 +848,20 @@ class Convert(object):
else:
border_style = style.clone_border_styles()
if has_visible_border:
- border_style.margin_top, style.margin_top = style.margin_top, inherit
+                        border_style.margin_top = style.margin_top
+                        style.margin_top = inherit
if p is not run[-1]:
style.padding_bottom = 0
else:
if has_visible_border:
- border_style.margin_bottom, style.margin_bottom = style.margin_bottom, inherit
+                        border_style.margin_bottom = style.margin_bottom
+                        style.margin_bottom = inherit
style.clear_borders()
if p is not run[-1]:
style.apply_between_border()
if has_visible_border:
- border_style.margin_left, border_style.margin_right = max_left,max_right
+ border_style.margin_left = max_left
+ border_style.margin_right = max_right
self.block_runs.append((border_style, run))
run = []
diff --git a/ebook_converter/ebooks/docx/writer/container.py b/ebook_converter/ebooks/docx/writer/container.py
index 9af3b0a..47042b4 100644
--- a/ebook_converter/ebooks/docx/writer/container.py
+++ b/ebook_converter/ebooks/docx/writer/container.py
@@ -1,5 +1,6 @@
import mimetypes
-import textwrap, os
+import os
+import textwrap
from lxml import etree
from lxml.builder import ElementMaker
@@ -9,22 +10,48 @@ from ebook_converter.ebooks.docx.names import DOCXNamespace
from ebook_converter.ebooks.metadata import authors_to_string
from ebook_converter.ebooks.pdf.render.common import PAPER_SIZES
from ebook_converter.utils.date import utcnow
-from ebook_converter.utils.localization import canonicalize_lang, lang_as_iso639_1
+from ebook_converter.utils.localization import canonicalize_lang
+from ebook_converter.utils.localization import lang_as_iso639_1
from ebook_converter.utils.zipfile import ZipFile
+WORD_TYPES = {"/word/footnotes.xml": "application/vnd.openxmlformats-"
+ "officedocument.wordprocessingml.footnotes+xml",
+ "/word/document.xml": "application/vnd.openxmlformats-"
+ "officedocument.wordprocessingml.document.main+xml",
+ "/word/numbering.xml": "application/vnd.openxmlformats-"
+ "officedocument.wordprocessingml.numbering+xml",
+ "/word/styles.xml": "application/vnd.openxmlformats-"
+ "officedocument.wordprocessingml.styles+xml",
+ "/word/endnotes.xml": "application/vnd.openxmlformats-"
+ "officedocument.wordprocessingml.endnotes+xml",
+ "/word/settings.xml": "application/vnd.openxmlformats-"
+ "officedocument.wordprocessingml.settings+xml",
+ "/word/theme/theme1.xml": "application/vnd.openxmlformats-"
+ "officedocument.theme+xml",
+ "/word/fontTable.xml": "application/vnd.openxmlformats-"
+ "officedocument.wordprocessingml.fontTable+xml",
+ "/word/webSettings.xml": "application/vnd.openxmlformats-"
+ "officedocument.wordprocessingml.webSettings+xml",
+ "/docProps/core.xml": "application/vnd.openxmlformats-package."
+ "core-properties+xml",
+ "/docProps/app.xml": "application/vnd.openxmlformats-"
+ "officedocument.extended-properties+xml"}
+
+
def xml2str(root, pretty_print=False, with_tail=False):
if hasattr(etree, 'cleanup_namespaces'):
etree.cleanup_namespaces(root)
ans = etree.tostring(root, encoding='utf-8', xml_declaration=True,
- pretty_print=pretty_print, with_tail=with_tail)
+ pretty_print=pretty_print, with_tail=with_tail)
return ans
def page_size(opts):
width, height = PAPER_SIZES[opts.docx_page_size]
if opts.docx_custom_page_size is not None:
- width, height = map(float, opts.docx_custom_page_size.partition('x')[0::2])
+ width, height = map(float,
+ opts.docx_custom_page_size.partition('x')[0::2])
return width, height
@@ -47,7 +74,9 @@ def create_skeleton(opts, namespaces=None):
def w(x):
return '{%s}%s' % (namespaces['w'], x)
- dn = {k:v for k, v in namespaces.items() if k in {'w', 'r', 'm', 've', 'o', 'wp', 'w10', 'wne', 'a', 'pic'}}
+ dn = {k: v for k, v in namespaces.items() if k in {'w', 'r', 'm', 've',
+ 'o', 'wp', 'w10', 'wne',
+ 'a', 'pic'}}
E = ElementMaker(namespace=dn['w'], nsmap=dn)
doc = E.document()
body = E.body()
@@ -59,27 +88,32 @@ def create_skeleton(opts, namespaces=None):
val = page_margin(opts, which)
return w(which), str(int(val * 20))
body.append(E.sectPr(
- E.pgSz(**{w('w'):str(width), w('h'):str(height)}),
+ E.pgSz(**{w('w'): str(width), w('h'): str(height)}),
E.pgMar(**dict(map(margin, 'left top right bottom'.split()))),
- E.cols(**{w('space'):'720'}),
- E.docGrid(**{w('linePitch'):"360"}),
+ E.cols(**{w('space'): '720'}),
+ E.docGrid(**{w('linePitch'): "360"}),
))
- dn = {k:v for k, v in namespaces.items() if k in tuple('wra') + ('wp',)}
+ dn = {k: v for k, v in namespaces.items() if k in tuple('wra') + ('wp',)}
E = ElementMaker(namespace=dn['w'], nsmap=dn)
styles = E.styles(
E.docDefaults(
E.rPrDefault(
E.rPr(
- E.rFonts(**{w('asciiTheme'):"minorHAnsi", w('eastAsiaTheme'):"minorEastAsia", w('hAnsiTheme'):"minorHAnsi", w('cstheme'):"minorBidi"}),
- E.sz(**{w('val'):'22'}),
- E.szCs(**{w('val'):'22'}),
- E.lang(**{w('val'):'en-US', w('eastAsia'):"en-US", w('bidi'):"ar-SA"})
+ E.rFonts(**{w('asciiTheme'): "minorHAnsi",
+ w('eastAsiaTheme'): "minorEastAsia",
+ w('hAnsiTheme'): "minorHAnsi",
+ w('cstheme'): "minorBidi"}),
+ E.sz(**{w('val'): '22'}),
+ E.szCs(**{w('val'): '22'}),
+ E.lang(**{w('val'): 'en-US', w('eastAsia'): "en-US",
+ w('bidi'): "ar-SA"})
)
),
E.pPrDefault(
E.pPr(
- E.spacing(**{w('after'):"0", w('line'):"276", w('lineRule'):"auto"})
+ E.spacing(**{w('after'): "0", w('line'): "276",
+ w('lineRule'): "auto"})
)
)
)
@@ -103,8 +137,8 @@ def update_doc_props(root, mi, namespace):
if mi.comments:
setm('description', mi.comments)
if mi.languages:
- l = canonicalize_lang(mi.languages[0])
- setm('language', lang_as_iso639_1(l) or l)
+ _l = canonicalize_lang(mi.languages[0])
+ setm('language', lang_as_iso639_1(_l) or _l)
class DocumentRelationships(object):
@@ -115,8 +149,7 @@ class DocumentRelationships(object):
for typ, target in {namespace.names['STYLES']: 'styles.xml',
namespace.names['NUMBERING']: 'numbering.xml',
namespace.names['WEB_SETTINGS']: 'webSettings.xml',
- namespace.names['FONTS']: 'fontTable.xml',
- }.items():
+ namespace.names['FONTS']: 'fontTable.xml'}.items():
self.add_relationship(target, typ)
def get_relationship_id(self, target, rtype, target_mode=None):
@@ -134,7 +167,8 @@ class DocumentRelationships(object):
def serialize(self):
namespaces = self.namespace.namespaces
- E = ElementMaker(namespace=namespaces['pr'], nsmap={None:namespaces['pr']})
+ E = ElementMaker(namespace=namespaces['pr'],
+ nsmap={None: namespaces['pr']})
relationships = E.Relationships()
for (target, rtype, target_mode), rid in self.rmap.items():
r = E.Relationship(Id=rid, Type=rtype, Target=target)
@@ -151,9 +185,12 @@ class DOCX(object):
namespaces = self.namespace.namespaces
self.opts, self.log = opts, log
self.document_relationships = DocumentRelationships(self.namespace)
- self.font_table = etree.Element('{%s}fonts' % namespaces['w'], nsmap={k:namespaces[k] for k in 'wr'})
- self.numbering = etree.Element('{%s}numbering' % namespaces['w'], nsmap={k:namespaces[k] for k in 'wr'})
- E = ElementMaker(namespace=namespaces['pr'], nsmap={None:namespaces['pr']})
+ self.font_table = etree.Element('{%s}fonts' % namespaces['w'],
+ nsmap={k: namespaces[k] for k in 'wr'})
+ self.numbering = etree.Element('{%s}numbering' % namespaces['w'],
+ nsmap={k: namespaces[k] for k in 'wr'})
+ E = ElementMaker(namespace=namespaces['pr'],
+ nsmap={None: namespaces['pr']})
self.embedded_fonts = E.Relationships()
self.fonts = {}
self.images = {}
@@ -161,21 +198,10 @@ class DOCX(object):
# Boilerplate {{{
@property
def contenttypes(self):
- E = ElementMaker(namespace=self.namespace.namespaces['ct'], nsmap={None:self.namespace.namespaces['ct']})
+ E = ElementMaker(namespace=self.namespace.namespaces['ct'],
+ nsmap={None: self.namespace.namespaces['ct']})
types = E.Types()
- for partname, mt in {
- "/word/footnotes.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.footnotes+xml",
- "/word/document.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml",
- "/word/numbering.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.numbering+xml",
- "/word/styles.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.styles+xml",
- "/word/endnotes.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.endnotes+xml",
- "/word/settings.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.settings+xml",
- "/word/theme/theme1.xml": "application/vnd.openxmlformats-officedocument.theme+xml",
- "/word/fontTable.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.fontTable+xml",
- "/word/webSettings.xml": "application/vnd.openxmlformats-officedocument.wordprocessingml.webSettings+xml",
- "/docProps/core.xml": "application/vnd.openxmlformats-package.core-properties+xml",
- "/docProps/app.xml": "application/vnd.openxmlformats-officedocument.extended-properties+xml",
- }.items():
+ for partname, mt in WORD_TYPES.items():
types.append(E.Override(PartName=partname, ContentType=mt))
added = {'png', 'gif', 'jpeg', 'jpg', 'svg', 'xml'}
for ext in added:
@@ -199,7 +225,8 @@ class DOCX(object):
@property
def appproperties(self):
- E = ElementMaker(namespace=self.namespace.namespaces['ep'], nsmap={None:self.namespace.namespaces['ep']})
+ E = ElementMaker(namespace=self.namespace.namespaces['ep'],
+ nsmap={None: self.namespace.namespaces['ep']})
props = E.Properties(
E.Application(__appname__),
E.AppVersion('%02d.%04d' % numeric_version[:2]),
@@ -216,16 +243,17 @@ class DOCX(object):
@property
def containerrels(self):
return textwrap.dedent('''\
-
-
-
-
-
- '''.format(**self.namespace.names)).encode('utf-8')
+
+
+
+
+
+'''.format(**self.namespace.names)).encode('utf-8') # noqa
@property
def websettings(self):
- E = ElementMaker(namespace=self.namespace.namespaces['w'], nsmap={'w':self.namespace.namespaces['w']})
+ E = ElementMaker(namespace=self.namespace.namespaces['w'],
+ nsmap={'w': self.namespace.namespaces['w']})
ws = E.webSettings(
E.optimizeForBrowser, E.allowPNG, E.doNotSaveAsSingleFile)
return xml2str(ws)
@@ -234,11 +262,15 @@ class DOCX(object):
def convert_metadata(self, mi):
namespaces = self.namespace.namespaces
- E = ElementMaker(namespace=namespaces['cp'], nsmap={x:namespaces[x] for x in 'cp dc dcterms xsi'.split()})
+ E = ElementMaker(namespace=namespaces['cp'],
+ nsmap={x: namespaces[x]
+ for x in 'cp dc dcterms xsi'.split()})
cp = E.coreProperties(E.revision("1"), E.lastModifiedBy('calibre'))
ts = utcnow().isoformat('T').rpartition('.')[0] + 'Z'
for x in 'created modified'.split():
- x = cp.makeelement('{%s}%s' % (namespaces['dcterms'], x), **{'{%s}type' % namespaces['xsi']:'dcterms:W3CDTF'})
+ x = cp.makeelement('{%s}%s' % (namespaces['dcterms'], x),
+ **{'{%s}type' %
+ namespaces['xsi']: 'dcterms:W3CDTF'})
x.text = ts
cp.append(x)
self.mi = mi
@@ -261,8 +293,10 @@ class DOCX(object):
zf.writestr('word/styles.xml', xml2str(self.styles))
zf.writestr('word/numbering.xml', xml2str(self.numbering))
zf.writestr('word/fontTable.xml', xml2str(self.font_table))
- zf.writestr('word/_rels/document.xml.rels', self.document_relationships.serialize())
- zf.writestr('word/_rels/fontTable.xml.rels', xml2str(self.embedded_fonts))
+ zf.writestr('word/_rels/document.xml.rels',
+ self.document_relationships.serialize())
+ zf.writestr('word/_rels/fontTable.xml.rels',
+ xml2str(self.embedded_fonts))
for fname, data_getter in self.images.items():
zf.writestr(fname, data_getter())
for fname, data in self.fonts.items():
diff --git a/ebook_converter/ebooks/metadata/__init__.py b/ebook_converter/ebooks/metadata/__init__.py
index 3196056..918b764 100644
--- a/ebook_converter/ebooks/metadata/__init__.py
+++ b/ebook_converter/ebooks/metadata/__init__.py
@@ -18,7 +18,7 @@ try:
_author_pat = re.compile(tweaks['authors_split_regex'])
except Exception:
prints('Author split regexp:', tweaks['authors_split_regex'],
- 'is invalid, using default')
+ 'is invalid, using default')
_author_pat = re.compile(r'(?i),?\s+(and|with)\s+')
@@ -76,7 +76,8 @@ def author_to_author_sort(author, method=None):
if method == 'copy':
return author
- prefixes = {force_unicode(y).lower() for y in tweaks['author_name_prefixes']}
+ prefixes = {force_unicode(y).lower()
+ for y in tweaks['author_name_prefixes']}
prefixes |= {y+'.' for y in prefixes}
while True:
if not tokens:
@@ -87,7 +88,8 @@ def author_to_author_sort(author, method=None):
else:
break
- suffixes = {force_unicode(y).lower() for y in tweaks['author_name_suffixes']}
+ suffixes = {force_unicode(y).lower()
+ for y in tweaks['author_name_suffixes']}
suffixes |= {y+'.' for y in suffixes}
suffix = ''
@@ -144,7 +146,7 @@ def get_title_sort_pat(lang=None):
except:
ans = frozenset((r'A\s+', r'The\s+', r'An\s+'))
ans = '|'.join(ans)
- ans = '^(%s)'%ans
+ ans = '^(%s)' % ans
try:
ans = re.compile(ans, re.IGNORECASE)
except:
@@ -154,7 +156,7 @@ def get_title_sort_pat(lang=None):
_ignore_starts = '\'"'+''.join(chr(x) for x in
- list(range(0x2018, 0x201e))+[0x2032, 0x2033])
+ list(range(0x2018, 0x201e))+[0x2032, 0x2033])
def title_sort(title, order=None, lang=None):
diff --git a/ebook_converter/ebooks/metadata/fb2.py b/ebook_converter/ebooks/metadata/fb2.py
index 6e72e42..b32e7f8 100644
--- a/ebook_converter/ebooks/metadata/fb2.py
+++ b/ebook_converter/ebooks/metadata/fb2.py
@@ -12,8 +12,7 @@ from lxml import etree
from ebook_converter.utils.date import parse_only_date
from ebook_converter.utils.img import save_cover_data_to
from ebook_converter.utils.imghdr import identify
-from ebook_converter import guess_type, guess_all_extensions, prints, \
- force_unicode
+from ebook_converter import guess_all_extensions, prints, force_unicode
from ebook_converter.ebooks.metadata import MetaInformation, check_isbn
from ebook_converter.ebooks.chardet import xml_to_unicode
from ebook_converter.polyglot.binary import as_base64_unicode
diff --git a/ebook_converter/ebooks/metadata/opf2.py b/ebook_converter/ebooks/metadata/opf2.py
index 64b9fa7..a907ad8 100644
--- a/ebook_converter/ebooks/metadata/opf2.py
+++ b/ebook_converter/ebooks/metadata/opf2.py
@@ -10,11 +10,11 @@ import mimetypes
import os
import re
import sys
+import textwrap
+import traceback
import unittest
import urllib.parse
import uuid
-import traceback
-import textwrap
from lxml import etree
from lxml.builder import ElementMaker
@@ -32,7 +32,7 @@ from ebook_converter.ebooks.metadata import string_to_authors, \
from ebook_converter.ebooks.metadata.book.base import Metadata
from ebook_converter.utils.date import parse_date, isoformat
from ebook_converter.utils.localization import get_lang, canonicalize_lang
-from ebook_converter import prints, guess_type
+from ebook_converter import prints
from ebook_converter.utils.cleantext import clean_ascii_chars, clean_xml_chars
from ebook_converter.utils.config import tweaks
from ebook_converter.polyglot.urllib import unquote
@@ -1807,8 +1807,7 @@ def test_m2o():
class OPFTest(unittest.TestCase):
def setUp(self):
- self.stream = io.BytesIO(
-b'''\
+ self.stream = io.BytesIO(b'''\
@@ -1827,8 +1826,7 @@ b'''\
-'''
- )
+''')
self.opf = OPF(self.stream, os.getcwd())
def testReading(self, opf=None):
diff --git a/ebook_converter/ebooks/mobi/reader/mobi6.py b/ebook_converter/ebooks/mobi/reader/mobi6.py
index 6087a67..014b846 100644
--- a/ebook_converter/ebooks/mobi/reader/mobi6.py
+++ b/ebook_converter/ebooks/mobi/reader/mobi6.py
@@ -1,10 +1,15 @@
-import shutil, os, re, struct, textwrap, io
+import io
import logging
import mimetypes
+import os
+import re
+import shutil
+import struct
+import textwrap
from lxml import html, etree
-from ebook_converter import xml_entity_to_unicode, entity_to_unicode, guess_type
+from ebook_converter import xml_entity_to_unicode, entity_to_unicode
from ebook_converter.utils.cleantext import clean_ascii_chars, clean_xml_chars
from ebook_converter.ebooks import DRMError, unit_convert
from ebook_converter.ebooks.chardet import strip_encoding_declarations
@@ -15,15 +20,11 @@ from ebook_converter.ebooks.metadata import MetaInformation
from ebook_converter.ebooks.metadata.opf2 import OPFCreator, OPF
from ebook_converter.ebooks.metadata.toc import TOC
from ebook_converter.ebooks.mobi.reader.headers import BookHeader
-from ebook_converter.utils.img import save_cover_data_to, gif_data_to_png_data, AnimatedGIF
+from ebook_converter.utils.img import save_cover_data_to, gif_data_to_png_data
+from ebook_converter.utils.img import AnimatedGIF
from ebook_converter.utils.imghdr import what
-__license__ = 'GPL v3'
-__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
-__docformat__ = 'restructuredtext en'
-
-
class TopazError(ValueError):
pass
@@ -38,13 +39,14 @@ class KFXError(ValueError):
class MobiReader(object):
- PAGE_BREAK_PAT = re.compile(
- r'<\s*/{0,1}\s*mbp:pagebreak((?:\s+[^/>]*){0,1})/{0,1}\s*>\s*(?:<\s*/{0,1}\s*mbp:pagebreak\s*/{0,1}\s*>)*',
- re.IGNORECASE)
+ PAGE_BREAK_PAT = re.compile(r'<\s*/{0,1}\s*mbp:pagebreak((?:\s+[^/>]*)'
+ r'{0,1})/{0,1}\s*>\s*(?:<\s*/{0,1}'
+ r'\s*mbp:pagebreak\s*/{0,1}\s*>)*',
+ re.IGNORECASE)
IMAGE_ATTRS = ('lowrecindex', 'recindex', 'hirecindex')
def __init__(self, filename_or_stream, log, user_encoding=None, debug=None,
- try_extra_data_fix=False):
+ try_extra_data_fix=False):
self.log = log
self.debug = debug
self.embedded_mi = None
@@ -83,8 +85,8 @@ class MobiReader(object):
if raw.startswith(b'\xeaDRMION\xee'):
raise KFXError()
- self.header = raw[0:72]
- self.name = self.header[:32].replace(b'\x00', b'')
+ self.header = raw[0:72]
+ self.name = self.header[:32].replace(b'\x00', b'')
self.num_sections, = struct.unpack('>H', raw[76:78])
self.ident = self.header[0x3C:0x3C + 8].upper()
@@ -94,7 +96,9 @@ class MobiReader(object):
self.sections = []
self.section_headers = []
for i in range(self.num_sections):
- offset, a1, a2, a3, a4 = struct.unpack('>LBBBB', raw[78 + i * 8:78 + i * 8 + 8])
+ offset, a1, a2, a3, a4 = struct.unpack('>LBBBB',
+ raw[78 + i * 8:78 +
+ i * 8 + 8])
flags, val = a1, a2 << 16 | a3 << 8 | a4
self.section_headers.append((offset, flags, val))
@@ -109,8 +113,9 @@ class MobiReader(object):
for i in range(self.num_sections):
self.sections.append((section(i), self.section_headers[i]))
- self.book_header = bh = BookHeader(self.sections[0][0], self.ident,
- user_encoding, self.log, try_extra_data_fix=try_extra_data_fix)
+ bh = BookHeader(self.sections[0][0], self.ident, user_encoding,
+ self.log, try_extra_data_fix=try_extra_data_fix)
+ self.book_header = bh
self.name = self.name.decode(self.book_header.codec, 'replace')
self.kf8_type = None
k8i = getattr(self.book_header.exth, 'kf8_header', None)
@@ -118,18 +123,20 @@ class MobiReader(object):
# Ancient PRC files from Baen can have random values for
# mobi_version, so be conservative
if (self.book_header.mobi_version == 8 and hasattr(self.book_header,
- 'skelidx')):
+ 'skelidx')):
self.kf8_type = 'standalone'
elif k8i is not None: # Check for joint mobi 6 and kf 8 file
try:
raw = self.sections[k8i-1][0]
- except:
+ except Exception:
raw = None
if raw == b'BOUNDARY':
try:
self.book_header = BookHeader(self.sections[k8i][0],
- self.ident, user_encoding, self.log)
- self.book_header.kf8_first_image_index = self.book_header.first_image_index + k8i
+ self.ident, user_encoding,
+ self.log)
+ _kfii = self.book_header.first_image_index + k8i
+ self.book_header.kf8_first_image_index = _kfii
self.book_header.mobi6_records = bh.records
# Need the first_image_index from the mobi 6 header as well
@@ -143,14 +150,14 @@ class MobiReader(object):
self.kf8_type = 'joint'
self.kf8_boundary = k8i-1
- except:
+ except Exception:
self.book_header = bh
def check_for_drm(self):
if self.book_header.encryption_type != 0:
try:
name = self.book_header.exth.mi.title
- except:
+ except Exception:
name = self.name
if not name:
name = self.name
@@ -163,20 +170,20 @@ class MobiReader(object):
if self.debug is not None:
parse_cache['calibre_raw_mobi_markup'] = self.mobi_html
self.add_anchors()
- self.processed_html = self.processed_html.decode(self.book_header.codec,
- 'ignore')
+ self.processed_html = self.processed_html.decode(
+ self.book_header.codec, 'ignore')
         self.processed_html = self.processed_html.replace('</</', '</')
         self.processed_html = re.sub(r'</([a-zA-Z]+)<', r'</\1><',
- self.processed_html)
+ self.processed_html)
self.processed_html = self.processed_html.replace('\ufeff', '')
         # Remove tags of the form <xyz: ...> as they can cause issues further
         # along the pipeline
         self.processed_html = re.sub(r'</{0,1}[a-zA-Z]+:\s+[^>]*>', '',
- self.processed_html)
+ self.processed_html)
self.processed_html = strip_encoding_declarations(self.processed_html)
self.processed_html = re.sub(r'&(\S+?);', xml_entity_to_unicode,
- self.processed_html)
+ self.processed_html)
image_name_map = self.extract_images(processed_records, output_dir)
self.replace_page_breaks()
self.cleanup_html()
@@ -186,31 +193,41 @@ class MobiReader(object):
try:
root = html.fromstring(self.processed_html)
if len(root.xpath('//html')) > 5:
- root = html.fromstring(self.processed_html.replace('\x0c',
- '').replace('\x14', ''))
+ root = html.fromstring(self.processed_html
+ .replace('\x0c', '')
+ .replace('\x14', ''))
except Exception:
- self.log.warning('MOBI markup appears to contain random bytes. Stripping.')
+ self.log.warning('MOBI markup appears to contain random bytes. '
+ 'Stripping.')
self.processed_html = self.remove_random_bytes(self.processed_html)
root = html.fromstring(self.processed_html)
if root.xpath('descendant::p/descendant::p'):
from html5_parser import parse
self.log.warning('Malformed markup, parsing using html5-parser')
- self.processed_html = strip_encoding_declarations(self.processed_html)
+ self.processed_html = strip_encoding_declarations(
+ self.processed_html)
# These trip up the html5 parser causing all content to be placed
             # under the <guide> tag
-            self.processed_html = re.sub(r'<metadata>.+?</metadata>', '', self.processed_html, flags=re.I)
-            self.processed_html = re.sub(r'<guide>.+?</guide>', '', self.processed_html, flags=re.I)
+            self.processed_html = re.sub(r'<metadata>.+?</metadata>', '',
+                                         self.processed_html, flags=re.I)
+            self.processed_html = re.sub(r'<guide>.+?</guide>', '',
+                                         self.processed_html, flags=re.I)
try:
- root = parse(self.processed_html, maybe_xhtml=False, keep_doctype=False, sanitize_names=True)
+ root = parse(self.processed_html, maybe_xhtml=False,
+ keep_doctype=False, sanitize_names=True)
except Exception:
- self.log.warning('MOBI markup appears to contain random bytes. Stripping.')
- self.processed_html = self.remove_random_bytes(self.processed_html)
- root = parse(self.processed_html, maybe_xhtml=False, keep_doctype=False, sanitize_names=True)
+ self.log.warning('MOBI markup appears to contain random '
+ 'bytes. Stripping.')
+ self.processed_html = self.remove_random_bytes(
+ self.processed_html)
+ root = parse(self.processed_html, maybe_xhtml=False,
+ keep_doctype=False, sanitize_names=True)
if len(root.xpath('body/descendant::*')) < 1:
# There are probably stray