diff --git a/ebook_converter/ebooks/__init__.py b/ebook_converter/ebooks/__init__.py index 66c1756..b4dfce0 100644 --- a/ebook_converter/ebooks/__init__.py +++ b/ebook_converter/ebooks/__init__.py @@ -1,12 +1,14 @@ -__license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal ' - -''' +""" Code for the conversion of ebook formats and the reading of metadata from various formats. -''' +""" +import numbers +import os +import re +import sys + +from lxml import etree -import os, re, numbers, sys from ebook_converter import prints from ebook_converter.ebooks.chardet import xml_to_unicode @@ -30,12 +32,15 @@ class ParserError(ValueError): pass -BOOK_EXTENSIONS = ['lrf', 'rar', 'zip', 'rtf', 'lit', 'txt', 'txtz', 'text', 'htm', 'xhtm', - 'html', 'htmlz', 'xhtml', 'pdf', 'pdb', 'updb', 'pdr', 'prc', 'mobi', 'azw', 'doc', - 'epub', 'fb2', 'fbz', 'djv', 'djvu', 'lrx', 'cbr', 'cbz', 'cbc', 'oebzip', - 'rb', 'imp', 'odt', 'chm', 'tpz', 'azw1', 'pml', 'pmlz', 'mbp', 'tan', 'snb', - 'xps', 'oxps', 'azw4', 'book', 'zbf', 'pobi', 'docx', 'docm', 'md', - 'textile', 'markdown', 'ibook', 'ibooks', 'iba', 'azw3', 'ps', 'kepub', 'kfx', 'kpf'] +BOOK_EXTENSIONS = ['lrf', 'rar', 'zip', 'rtf', 'lit', 'txt', 'txtz', 'text', + 'htm', 'xhtm', 'html', 'htmlz', 'xhtml', 'pdf', 'pdb', + 'updb', 'pdr', 'prc', 'mobi', 'azw', 'doc', 'epub', 'fb2', + 'fbz', 'djv', 'djvu', 'lrx', 'cbr', 'cbz', 'cbc', 'oebzip', + 'rb', 'imp', 'odt', 'chm', 'tpz', 'azw1', 'pml', 'pmlz', + 'mbp', 'tan', 'snb', 'xps', 'oxps', 'azw4', 'book', 'zbf', + 'pobi', 'docx', 'docm', 'md', 'textile', 'markdown', + 'ibook', 'ibooks', 'iba', 'azw3', 'ps', 'kepub', 'kfx', + 'kpf'] def return_raster_image(path): @@ -49,8 +54,7 @@ def return_raster_image(path): def extract_cover_from_embedded_svg(html, base, log): from ebook_converter.ebooks.oeb.base import XPath, SVG, XLINK - from ebook_converter.utils.xml_parse import safe_xml_fromstring - root = safe_xml_fromstring(html) + root = etree.fromstring(html) svg = XPath('//svg:svg')(root) 
if len(svg) == 1 and len(svg[0]) == 1 and svg[0][0].tag == SVG('image'): @@ -65,10 +69,10 @@ def extract_calibre_cover(raw, base, log): from ebook_converter.ebooks.BeautifulSoup import BeautifulSoup soup = BeautifulSoup(raw) matches = soup.find(name=['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'span', - 'font', 'br']) + 'font', 'br']) images = soup.findAll('img', src=True) - if matches is None and len(images) == 1 and \ - images[0].get('alt', '').lower()=='cover': + if (matches is None and len(images) == 1 and + images[0].get('alt', '').lower() == 'cover'): img = images[0] img = os.path.join(base, *img['src'].split('/')) q = return_raster_image(img) @@ -97,13 +101,14 @@ def render_html_svg_workaround(path_to_html, log, width=590, height=750): data = None if SVG_NS in raw: try: - data = extract_cover_from_embedded_svg(raw, - os.path.dirname(path_to_html), log) + data = extract_cover_from_embedded_svg( + raw, os.path.dirname(path_to_html), log) except Exception: pass if data is None: try: - data = extract_calibre_cover(raw, os.path.dirname(path_to_html), log) + data = extract_calibre_cover(raw, os.path.dirname(path_to_html), + log) except Exception: pass @@ -118,7 +123,8 @@ def render_html_data(path_to_html, width, height): result = {} def report_error(text=''): - prints('Failed to render', path_to_html, 'with errors:', file=sys.stderr) + prints('Failed to render', path_to_html, 'with errors:', + file=sys.stderr) if text: prints(text, file=sys.stderr) if result and result['stdout_stderr']: @@ -127,7 +133,8 @@ def render_html_data(path_to_html, width, height): with TemporaryDirectory('-render-html') as tdir: try: - result = fork_job('ebook_converter.ebooks.render_html', 'main', args=(path_to_html, tdir, 'jpeg')) + result = fork_job('ebook_converter.ebooks.render_html', 'main', + args=(path_to_html, tdir, 'jpeg')) except WorkerError as e: report_error(e.orig_tb) else: @@ -156,17 +163,20 @@ def normalize(x): def calibre_cover(title, author_string, series_string=None, - 
output_format='jpg', title_size=46, author_size=36, logo_path=None): + output_format='jpg', title_size=46, author_size=36, + logo_path=None): title = normalize(title) author_string = normalize(author_string) series_string = normalize(series_string) from ebook_converter.ebooks.covers import calibre_cover2 from ebook_converter.utils.img import image_to_data - ans = calibre_cover2(title, author_string or '', series_string or '', logo_path=logo_path, as_qimage=True) + ans = calibre_cover2(title, author_string or '', series_string or '', + logo_path=logo_path, as_qimage=True) return image_to_data(ans, fmt=output_format) -UNIT_RE = re.compile(r'^(-*[0-9]*[.]?[0-9]*)\s*(%|em|ex|en|px|mm|cm|in|pt|pc|rem|q)$') +UNIT_RE = re.compile(r'^(-*[0-9]*[.]?[0-9]*)\s*(%|em|ex|en|px|mm|cm|in|pt|pc' + r'|rem|q)$') def unit_convert(value, base, font, dpi, body_font_size=12): @@ -175,7 +185,7 @@ def unit_convert(value, base, font, dpi, body_font_size=12): return value try: return float(value) * 72.0 / dpi - except: + except Exception: pass result = value m = UNIT_RE.match(value) @@ -227,7 +237,8 @@ def generate_masthead(title, output_path=None, width=600, height=60): recs = load_defaults('mobi_output') masthead_font_family = recs.get('masthead_font', None) from ebook_converter.ebooks.covers import generate_masthead - return generate_masthead(title, output_path=output_path, width=width, height=height, font_family=masthead_font_family) + return generate_masthead(title, output_path=output_path, width=width, + height=height, font_family=masthead_font_family) def escape_xpath_attr(value): diff --git a/ebook_converter/ebooks/conversion/plugins/comic_input.py b/ebook_converter/ebooks/conversion/plugins/comic_input.py index c95e02d..5a81305 100644 --- a/ebook_converter/ebooks/conversion/plugins/comic_input.py +++ b/ebook_converter/ebooks/conversion/plugins/comic_input.py @@ -3,6 +3,7 @@ Based on ideas from comiclrf created by FangornUK. 
""" import shutil, textwrap, codecs, os +from ebook_converter import constants as const from ebook_converter.customize.conversion import InputFormatPlugin, OptionRecommendation from ebook_converter import CurrentDir from ebook_converter.ptempfile import PersistentTemporaryDirectory @@ -245,7 +246,6 @@ class ComicInput(InputFormatPlugin): return os.path.abspath('metadata.opf') def create_wrappers(self, pages): - from ebook_converter.ebooks.oeb.base import XHTML_NS wrappers = [] WRAPPER = textwrap.dedent('''\ @@ -267,7 +267,8 @@ class ComicInput(InputFormatPlugin): ''') dir = os.path.dirname(pages[0]) for i, page in enumerate(pages): - wrapper = WRAPPER%(XHTML_NS, i+1, os.path.basename(page), i+1) + wrapper = WRAPPER%(const.XHTML_NS, i+1, os.path.basename(page), + i+1) page = os.path.join(dir, 'page_%d.xhtml'%(i+1)) with open(page, 'wb') as f: f.write(wrapper.encode('utf-8')) @@ -275,8 +276,6 @@ class ComicInput(InputFormatPlugin): return wrappers def create_viewer_wrapper(self, pages): - from ebook_converter.ebooks.oeb.base import XHTML_NS - def page(src): return ''.format(os.path.basename(src)) @@ -303,7 +302,7 @@ class ComicInput(InputFormatPlugin): %s - ''' % (XHTML_NS, pages) + ''' % (const.XHTML_NS, pages) path = os.path.join(base, 'wrapper.xhtml') with open(path, 'wb') as f: f.write(wrapper.encode('utf-8')) diff --git a/ebook_converter/ebooks/conversion/plugins/docx_output.py b/ebook_converter/ebooks/conversion/plugins/docx_output.py index f1ea4eb..cf9e51f 100644 --- a/ebook_converter/ebooks/conversion/plugins/docx_output.py +++ b/ebook_converter/ebooks/conversion/plugins/docx_output.py @@ -1,14 +1,22 @@ -from ebook_converter.customize.conversion import OutputFormatPlugin, OptionRecommendation +import io +from lxml import etree + +from ebook_converter import constants as const +from ebook_converter.customize import conversion +from ebook_converter.ebooks.docx.dump import do_dump +from ebook_converter.ebooks.docx.writer.container import DOCX +from 
ebook_converter.ebooks.docx.writer.from_html import Convert +from ebook_converter.ebooks.metadata import opf2 as opf_meta +from ebook_converter.ebooks.oeb import base -__license__ = 'GPL v3' -__copyright__ = '2013, Kovid Goyal ' PAGE_SIZES = ['a0', 'a1', 'a2', 'a3', 'a4', 'a5', 'a6', 'b0', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6', 'legal', 'letter'] +_OPT = conversion.OptionRecommendation -class DOCXOutput(OutputFormatPlugin): +class DOCXOutput(conversion.OutputFormatPlugin): name = 'DOCX Output' author = 'Kovid Goyal' @@ -16,75 +24,63 @@ class DOCXOutput(OutputFormatPlugin): commit_name = 'docx_output' ui_data = {'page_sizes': PAGE_SIZES} - options = { - OptionRecommendation(name='docx_page_size', recommended_value='letter', - level=OptionRecommendation.LOW, choices=PAGE_SIZES, - help='The size of the page. Default is letter. Choices ' - 'are %s' % PAGE_SIZES), - - OptionRecommendation(name='docx_custom_page_size', recommended_value=None, - help='Custom size of the document. Use the form widthxheight ' - 'EG. `123x321` to specify the width and height (in pts). ' - 'This overrides any specified page-size.'), - - OptionRecommendation(name='docx_no_cover', recommended_value=False, - help='Do not insert the book cover as an image at the start of the document.' - ' If you use this option, the book cover will be discarded.'), - - OptionRecommendation(name='preserve_cover_aspect_ratio', recommended_value=False, - help='Preserve the aspect ratio of the cover image instead of stretching' - ' it out to cover the entire page.'), - - OptionRecommendation(name='docx_no_toc', recommended_value=False, - help='Do not insert the table of contents as a page at the start of the document.'), - - OptionRecommendation(name='extract_to', - help='Extract the contents of the generated %s file to the ' - 'specified directory. The contents of the directory are first ' - 'deleted, so be careful.' 
% 'DOCX'), - - OptionRecommendation(name='docx_page_margin_left', recommended_value=72.0, - level=OptionRecommendation.LOW, - help='The size of the left page margin, in pts. Default is 72pt.' - ' Overrides the common left page margin setting.' - ), - - OptionRecommendation(name='docx_page_margin_top', recommended_value=72.0, - level=OptionRecommendation.LOW, - help='The size of the top page margin, in pts. Default is 72pt.' - ' Overrides the common top page margin setting, unless set to zero.' - ), - - OptionRecommendation(name='docx_page_margin_right', recommended_value=72.0, - level=OptionRecommendation.LOW, - help='The size of the right page margin, in pts. Default is 72pt.' - ' Overrides the common right page margin setting, unless set to zero.' - ), - - OptionRecommendation(name='docx_page_margin_bottom', recommended_value=72.0, - level=OptionRecommendation.LOW, - help='The size of the bottom page margin, in pts. Default is 72pt.' - ' Overrides the common bottom page margin setting, unless set to zero.' - ), - - } + options = {_OPT(name='docx_page_size', recommended_value='letter', + level=_OPT.LOW, choices=PAGE_SIZES, + help='The size of the page. Default is letter. Choices ' + 'are %s' % PAGE_SIZES), + _OPT(name='docx_custom_page_size', recommended_value=None, + help='Custom size of the document. Use the form ' + 'widthxheight EG. `123x321` to specify the width and ' + 'height (in pts). This overrides any specified ' + 'page-size.'), + _OPT(name='docx_no_cover', recommended_value=False, + help='Do not insert the book cover as an image at the ' + 'start of the document. 
If you use this option, the book ' + 'cover will be discarded.'), + _OPT(name='preserve_cover_aspect_ratio', + recommended_value=False, help='Preserve the aspect ratio ' + 'of the cover image instead of stretching it out to cover ' + 'the entire page.'), + _OPT(name='docx_no_toc', recommended_value=False, + help='Do not insert the table of contents as a page at ' + 'the start of the document.'), + _OPT(name='extract_to', help='Extract the contents of the ' + 'generated DOCX file to the specified directory. The ' + 'contents of the directory are first deleted, so be ' + 'careful.'), + _OPT(name='docx_page_margin_left', recommended_value=72.0, + level=_OPT.LOW, help='The size of the left page margin, ' + 'in pts. Default is 72pt. Overrides the common left page ' + 'margin setting.'), + _OPT(name='docx_page_margin_top', recommended_value=72.0, + level=_OPT.LOW, help='The size of the top page margin, ' + 'in pts. Default is 72pt. Overrides the common top page ' + 'margin setting, unless set to zero.'), + _OPT(name='docx_page_margin_right', recommended_value=72.0, + level=_OPT.LOW, help='The size of the right page margin, ' + 'in pts. Default is 72pt. Overrides the common right page ' + 'margin setting, unless set to zero.'), + _OPT(name='docx_page_margin_bottom', recommended_value=72.0, + level=_OPT.LOW, help='The size of the bottom page margin, ' + 'in pts. Default is 72pt. 
Overrides the common bottom ' + 'page margin setting, unless set to zero.')} def convert_metadata(self, oeb): - from lxml import etree - from ebook_converter.ebooks.oeb.base import OPF, OPF2_NS - from ebook_converter.ebooks.metadata.opf2 import OPF as ReadOPF - from io import BytesIO - package = etree.Element(OPF('package'), attrib={'version': '2.0'}, nsmap={None: OPF2_NS}) + + package = etree.Element(base.tag('opf', 'package'), + attrib={'version': '2.0'}, + nsmap={None: const.OPF2_NS}) oeb.metadata.to_opf2(package) - self.mi = ReadOPF(BytesIO(etree.tostring(package, encoding='utf-8')), populate_spine=False, try_to_guess_cover=False).to_book_metadata() + self.mi = opf_meta.OPF(io.BytesIO(etree.tostring(package, + encoding='utf-8')), + populate_spine=False, + try_to_guess_cover=False).to_book_metadata() def convert(self, oeb, output_path, input_plugin, opts, log): - from ebook_converter.ebooks.docx.writer.container import DOCX - from ebook_converter.ebooks.docx.writer.from_html import Convert docx = DOCX(opts, log) self.convert_metadata(oeb) - Convert(oeb, docx, self.mi, not opts.docx_no_cover, not opts.docx_no_toc)() + Convert(oeb, docx, self.mi, not opts.docx_no_cover, + not opts.docx_no_toc)() docx.write(output_path, self.mi) if opts.extract_to: - from ebook_converter.ebooks.docx.dump import do_dump do_dump(output_path, opts.extract_to) diff --git a/ebook_converter/ebooks/conversion/plugins/epub_input.py b/ebook_converter/ebooks/conversion/plugins/epub_input.py index 4316aa0..9056465 100644 --- a/ebook_converter/ebooks/conversion/plugins/epub_input.py +++ b/ebook_converter/ebooks/conversion/plugins/epub_input.py @@ -1,14 +1,19 @@ -import os, re, posixpath -from itertools import cycle +import hashlib +import itertools +import os +import re +import traceback +import uuid -from ebook_converter.customize.conversion import InputFormatPlugin, OptionRecommendation +from lxml import etree + +from ebook_converter.ebooks.metadata import opf2 as opf_meta +from 
ebook_converter.ebooks.oeb import base +from ebook_converter.customize.conversion import InputFormatPlugin +from ebook_converter.customize.conversion import OptionRecommendation -__license__ = 'GPL 3' -__copyright__ = '2009, Kovid Goyal ' -__docformat__ = 'restructuredtext en' - -ADOBE_OBFUSCATION = 'http://ns.adobe.com/pdf/enc#RC' +ADOBE_OBFUSCATION = 'http://ns.adobe.com/pdf/enc#RC' IDPF_OBFUSCATION = 'http://www.idpf.org/2008/embedding' @@ -16,8 +21,8 @@ def decrypt_font_data(key, data, algorithm): is_adobe = algorithm == ADOBE_OBFUSCATION crypt_len = 1024 if is_adobe else 1040 crypt = bytearray(data[:crypt_len]) - key = cycle(iter(bytearray(key))) - decrypt = bytes(bytearray(x^next(key) for x in crypt)) + key = itertools.cycle(iter(bytearray(key))) + decrypt = bytes(bytearray(x ^ next(key) for x in crypt)) return decrypt + data[crypt_len:] @@ -29,18 +34,16 @@ def decrypt_font(key, path, algorithm): class EPUBInput(InputFormatPlugin): - name = 'EPUB Input' - author = 'Kovid Goyal' + name = 'EPUB Input' + author = 'Kovid Goyal' description = 'Convert EPUB files (.epub) to HTML' - file_types = {'epub'} + file_types = {'epub'} output_encoding = None commit_name = 'epub_input' recommendations = {('page_breaks_before', '/', OptionRecommendation.MED)} def process_encryption(self, encfile, opf, log): - from lxml import etree - import uuid, hashlib idpf_key = opf.raw_unique_identifier if idpf_key: idpf_key = re.sub('[\u0020\u0009\u000d\u000a]', '', idpf_key) @@ -56,27 +59,28 @@ class EPUBInput(InputFormatPlugin): try: key = item.text.rpartition(':')[-1] key = uuid.UUID(key).bytes - except: - import traceback + except Exception: traceback.print_exc() key = None try: root = etree.parse(encfile) - for em in root.xpath('descendant::*[contains(name(), "EncryptionMethod")]'): + for em in root.xpath('descendant::*[contains(name(), ' + '"EncryptionMethod")]'): algorithm = em.get('Algorithm', '') if algorithm not in {ADOBE_OBFUSCATION, IDPF_OBFUSCATION}: return False - cr = 
em.getparent().xpath('descendant::*[contains(name(), "CipherReference")]')[0] + cr = em.getparent().xpath('descendant::*[contains(name(), ' + '"CipherReference")]')[0] uri = cr.get('URI') - path = os.path.abspath(os.path.join(os.path.dirname(encfile), '..', *uri.split('/'))) + path = os.path.abspath(os.path.join(os.path.dirname(encfile), + '..', *uri.split('/'))) tkey = (key if algorithm == ADOBE_OBFUSCATION else idpf_key) if (tkey and os.path.exists(path)): self._encrypted_font_uris.append(uri) decrypt_font(tkey, path, algorithm) return True - except: - import traceback + except Exception: traceback.print_exc() return False @@ -97,8 +101,11 @@ class EPUBInput(InputFormatPlugin): return t def rationalize_cover3(self, opf, log): - ''' If there is a reference to the cover/titlepage via manifest properties, convert to - entries in the so that the rest of the pipeline picks it up. ''' + """ + If there is a reference to the cover/titlepage via manifest + properties, convert to entries in the so that the rest of the + pipeline picks it up. + """ from ebook_converter.ebooks.metadata.opf3 import items_with_property removed = guide_titlepage_href = guide_titlepage_id = None @@ -128,7 +135,8 @@ class EPUBInput(InputFormatPlugin): titlepage_id, titlepage_href = tid, href.partition('#')[0] break if titlepage_href is None: - titlepage_href, titlepage_id = guide_titlepage_href, guide_titlepage_id + titlepage_href = guide_titlepage_href + titlepage_id = guide_titlepage_id if titlepage_href is not None: self.set_guide_type(opf, 'titlepage', titlepage_href, 'Title Page') spine = list(opf.iterspine()) @@ -148,7 +156,6 @@ class EPUBInput(InputFormatPlugin): means, at most one entry with type="cover" that points to a raster cover and at most one entry with type="titlepage" that points to an HTML titlepage. 
''' - from ebook_converter.ebooks.oeb.base import OPF removed = None from lxml import etree guide_cover, guide_elem = None, None @@ -160,12 +167,14 @@ class EPUBInput(InputFormatPlugin): raster_cover = opf.raster_cover if raster_cover: if guide_elem is None: - g = opf.root.makeelement(OPF('guide')) + g = opf.root.makeelement(base.tag('opf', 'guide')) opf.root.append(g) else: g = guide_elem.getparent() guide_cover = raster_cover - guide_elem = g.makeelement(OPF('reference'), attrib={'href':raster_cover, 'type':'cover'}) + guide_elem = g.makeelement(base.tag('opf', 'reference'), + attrib={'href': raster_cover, + 'type': 'cover'}) g.append(guide_elem) return spine = list(opf.iterspine()) @@ -186,7 +195,8 @@ class EPUBInput(InputFormatPlugin): # specially if not self.for_viewer: if len(spine) == 1: - log.warn('There is only a single spine item and it is marked as the cover. Removing cover marking.') + log.warn('There is only a single spine item and it is marked ' + 'as the cover. Removing cover marking.') for guide_elem in tuple(opf.iterguide()): if guide_elem.get('type', '').lower() == 'cover': guide_elem.getparent().remove(guide_elem) @@ -215,8 +225,9 @@ class EPUBInput(InputFormatPlugin): # Render the titlepage to create a raster cover from ebook_converter.ebooks import render_html_svg_workaround guide_elem.set('href', 'calibre_raster_cover.jpg') - t = etree.SubElement( - elem[0].getparent(), OPF('item'), href=guide_elem.get('href'), id='calibre_raster_cover') + t = etree.SubElement(elem[0].getparent(), base.tag('opf', 'item'), + href=guide_elem.get('href'), + id='calibre_raster_cover') t.set('media-type', 'image/jpeg') if os.path.exists(guide_cover): renderer = render_html_svg_workaround(guide_cover, log) @@ -229,17 +240,16 @@ class EPUBInput(InputFormatPlugin): return removed def find_opf(self): - from ebook_converter.utils.xml_parse import safe_xml_fromstring - def attr(n, attr): for k, v in n.attrib.items(): if k.endswith(attr): return v try: with 
open('META-INF/container.xml', 'rb') as f: - root = safe_xml_fromstring(f.read()) + root = etree.fromstring(f.read()) for r in root.xpath('//*[local-name()="rootfile"]'): - if attr(r, 'media-type') != "application/oebps-package+xml": + if (attr(r, 'media-type') != + "application/oebps-package+xml"): continue path = attr(r, 'full-path') if not path: @@ -248,20 +258,18 @@ class EPUBInput(InputFormatPlugin): if os.path.exists(path): return path except Exception: - import traceback traceback.print_exc() def convert(self, stream, options, file_ext, log, accelerators): from ebook_converter.utils.zipfile import ZipFile from ebook_converter import walk from ebook_converter.ebooks import DRMError - from ebook_converter.ebooks.metadata.opf2 import OPF try: zf = ZipFile(stream) zf.extractall(os.getcwd()) - except: + except Exception: log.exception('EPUB appears to be invalid ZIP file, trying a' - ' more forgiving ZIP parser') + ' more forgiving ZIP parser') from ebook_converter.utils.localunzip import extractall stream.seek(0) extractall(stream) @@ -276,11 +284,12 @@ class EPUBInput(InputFormatPlugin): path = getattr(stream, 'name', 'stream') if opf is None: - raise ValueError('%s is not a valid EPUB file (could not find opf)'%path) + raise ValueError('%s is not a valid EPUB file (could not find ' + 'opf)' % path) opf = os.path.relpath(opf, os.getcwd()) - parts = os.path.split(opf) - opf = OPF(opf, os.path.dirname(os.path.abspath(opf))) + # parts = os.path.split(opf) + opf = opf_meta.OPF(opf, os.path.dirname(os.path.abspath(opf))) self._encrypted_font_uris = [] if os.path.exists(encfile): @@ -288,18 +297,23 @@ class EPUBInput(InputFormatPlugin): raise DRMError(os.path.basename(path)) self.encrypted_fonts = self._encrypted_font_uris - if len(parts) > 1 and parts[0]: - delta = '/'.join(parts[:-1])+'/' + # XXX(gryf): this code would fail pretty ugly, thus, this part was + # never used. 
+ # if len(parts) > 1 and parts[0]: + # delta = '/'.join(parts[:-1])+'/' - def normpath(x): - return posixpath.normpath(delta + elem.get('href')) + # def normpath(x): + # return posixpath.normpath(delta + elem.get('href')) - for elem in opf.itermanifest(): - elem.set('href', normpath(elem.get('href'))) - for elem in opf.iterguide(): - elem.set('href', normpath(elem.get('href'))) + # for elem in opf.itermanifest(): + # elem.set('href', normpath(elem.get('href'))) + # for elem in opf.iterguide(): + # elem.set('href', normpath(elem.get('href'))) - f = self.rationalize_cover3 if opf.package_version >= 3.0 else self.rationalize_cover2 + if opf.package_version >= 3.0: + f = self.rationalize_cover3 + else: + f = self.rationalize_cover2 self.removed_cover = f(opf, log) if self.removed_cover: self.removed_items_to_ignore = (self.removed_cover,) @@ -352,15 +366,18 @@ class EPUBInput(InputFormatPlugin): from lxml import etree from ebook_converter.ebooks.chardet import xml_to_unicode from ebook_converter.ebooks.oeb.polish.parsing import parse - from ebook_converter.ebooks.oeb.base import EPUB_NS, XHTML, NCX_MIME, NCX, urlnormalize, urlunquote, serialize + from ebook_converter.ebooks.oeb.base import EPUB_NS, XHTML, NCX_MIME, \ + NCX, urlnormalize, urlunquote, serialize from ebook_converter.ebooks.oeb.polish.toc import first_child - from ebook_converter.utils.xml_parse import safe_xml_fromstring from tempfile import NamedTemporaryFile with open(nav_path, 'rb') as f: raw = f.read() - raw = xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0] + raw = xml_to_unicode(raw, strip_encoding_pats=True, + assume_utf8=True)[0] root = parse(raw, log=log) - ncx = safe_xml_fromstring('') + ncx = etree.fromstring('' + '') navmap = ncx[0] et = '{%s}type' % EPUB_NS bn = os.path.basename(nav_path) @@ -368,8 +385,8 @@ class EPUBInput(InputFormatPlugin): def add_from_li(li, parent): href = text = None for x in li.iterchildren(XHTML('a'), XHTML('span')): - text = etree.tostring( - x, 
method='text', encoding='unicode', with_tail=False).strip() or ' '.join( + text = etree.tostring(x, method='text', encoding='unicode', + with_tail=False).strip() or ' '.join( x.xpath('descendant-or-self::*/@title')).strip() href = x.get('href') if href: @@ -382,7 +399,7 @@ class EPUBInput(InputFormatPlugin): np[0].append(np.makeelement(NCX('text'))) np[0][0].text = text if href: - np.append(np.makeelement(NCX('content'), attrib={'src':href})) + np.append(np.makeelement(NCX('content'), attrib={'src': href})) return np def process_nav_node(node, toc_parent): @@ -401,20 +418,25 @@ class EPUBInput(InputFormatPlugin): else: return - with NamedTemporaryFile(suffix='.ncx', dir=os.path.dirname(nav_path), delete=False) as f: + with NamedTemporaryFile(suffix='.ncx', dir=os.path.dirname(nav_path), + delete=False) as f: f.write(etree.tostring(ncx, encoding='utf-8')) ncx_href = os.path.relpath(f.name, os.getcwd()).replace(os.sep, '/') - ncx_id = opf.create_manifest_item(ncx_href, NCX_MIME, append=True).get('id') + ncx_id = opf.create_manifest_item(ncx_href, NCX_MIME, + append=True).get('id') for spine in opf.root.xpath('//*[local-name()="spine"]'): spine.set('toc', ncx_id) - opts.epub3_nav_href = urlnormalize(os.path.relpath(nav_path).replace(os.sep, '/')) + url = os.path.relpath(nav_path).replace(os.sep, '/') + opts.epub3_nav_href = urlnormalize(url) opts.epub3_nav_parsed = root if getattr(self, 'removed_cover', None): changed = False base_path = os.path.dirname(nav_path) for elem in root.xpath('//*[@href]'): href, frag = elem.get('href').partition('#')[::2] - link_path = os.path.relpath(os.path.join(base_path, urlunquote(href)), base_path) + link_path = os.path.relpath(os.path.join(base_path, + urlunquote(href)), + base_path) abs_href = urlnormalize(link_path) if abs_href == self.removed_cover: changed = True diff --git a/ebook_converter/ebooks/conversion/plugins/epub_output.py b/ebook_converter/ebooks/conversion/plugins/epub_output.py index 9f522d7..df7c917 100644 --- 
a/ebook_converter/ebooks/conversion/plugins/epub_output.py +++ b/ebook_converter/ebooks/conversion/plugins/epub_output.py @@ -2,7 +2,11 @@ import os import re import shutil import urllib.parse +import uuid +from ebook_converter import constants as const +from ebook_converter.ebooks.oeb import base +from ebook_converter.ebooks.oeb import parse_utils from ebook_converter.customize.conversion import OutputFormatPlugin from ebook_converter.customize.conversion import OptionRecommendation @@ -132,39 +136,37 @@ class EPUBOutput(OutputFormatPlugin): recommendations = {('pretty_print', True, OptionRecommendation.HIGH)} def workaround_webkit_quirks(self): # {{{ - from ebook_converter.ebooks.oeb.base import XPath for x in self.oeb.spine: root = x.data - body = XPath('//h:body')(root) + body = base.XPath('//h:body')(root) if body: body = body[0] if not hasattr(body, 'xpath'): continue - for pre in XPath('//h:pre')(body): + for pre in base.XPath('//h:pre')(body): if not pre.text and len(pre) == 0: pre.tag = 'div' # }}} def upshift_markup(self): # {{{ 'Upgrade markup to comply with XHTML 1.1 where possible' - from ebook_converter.ebooks.oeb.base import XPath, XML for x in self.oeb.spine: root = x.data - if (not root.get(XML('lang'))) and (root.get('lang')): - root.set(XML('lang'), root.get('lang')) - body = XPath('//h:body')(root) + if (not root.get(base.tag('xml', 'lang'))) and (root.get('lang')): + root.set(base.tag('xml', 'lang'), root.get('lang')) + body = base.XPath('//h:body')(root) if body: body = body[0] if not hasattr(body, 'xpath'): continue - for u in XPath('//h:u')(root): + for u in base.XPath('//h:u')(root): u.tag = 'span' seen_ids, seen_names = set(), set() - for x in XPath('//*[@id or @name]')(root): + for x in base.XPath('//*[@id or @name]')(root): eid, name = x.get('id', None), x.get('name', None) if eid: if eid in seen_ids: @@ -223,28 +225,27 @@ class EPUBOutput(OutputFormatPlugin): first = next(iter(self.oeb.spine)) self.oeb.toc.add('Start', first.href) - 
from ebook_converter.ebooks.oeb.base import OPF identifiers = oeb.metadata['identifier'] - uuid = None + _uuid = None for x in identifiers: - if x.get(OPF('scheme'), None).lower() == 'uuid' or str(x).startswith('urn:uuid:'): - uuid = str(x).split(':')[-1] + if (x.get(base.tag('opf', 'scheme'), None).lower() == 'uuid' or + str(x).startswith('urn:uuid:')): + _uuid = str(x).split(':')[-1] break encrypted_fonts = getattr(input_plugin, 'encrypted_fonts', []) - if uuid is None: + if _uuid is None: self.log.warn('No UUID identifier found') - from uuid import uuid4 - uuid = str(uuid4()) - oeb.metadata.add('identifier', uuid, scheme='uuid', id=uuid) + _uuid = str(uuid.uuid4()) + oeb.metadata.add('identifier', _uuid, scheme='uuid', id=_uuid) - if encrypted_fonts and not uuid.startswith('urn:uuid:'): + if encrypted_fonts and not _uuid.startswith('urn:uuid:'): # Apparently ADE requires this value to start with urn:uuid: # for some absurd reason, or it will throw a hissy fit and refuse # to use the obfuscated fonts. 
for x in identifiers: - if str(x) == uuid: - x.content = 'urn:uuid:'+uuid + if str(x) == _uuid: + x.content = 'urn:uuid:' + _uuid with TemporaryDirectory('_epub_output') as tdir: from ebook_converter.customize.ui import plugin_for_output_format @@ -264,7 +265,7 @@ class EPUBOutput(OutputFormatPlugin): self.upgrade_to_epub3(tdir, opf) encryption = None if encrypted_fonts: - encryption = self.encrypt_fonts(encrypted_fonts, tdir, uuid) + encryption = self.encrypt_fonts(encrypted_fonts, tdir, _uuid) from ebook_converter.ebooks.epub import initialize_container with initialize_container(output_path, os.path.basename(opf), @@ -312,12 +313,12 @@ class EPUBOutput(OutputFormatPlugin): except EnvironmentError: pass - def encrypt_fonts(self, uris, tdir, uuid): # {{{ + def encrypt_fonts(self, uris, tdir, _uuid): # {{{ from ebook_converter.polyglot.binary import from_hex_bytes - key = re.sub(r'[^a-fA-F0-9]', '', uuid) + key = re.sub(r'[^a-fA-F0-9]', '', _uuid) if len(key) < 16: - raise ValueError('UUID identifier %r is invalid'%uuid) + raise ValueError('UUID identifier %r is invalid'% _uuid) key = bytearray(from_hex_bytes((key + key)[:32])) paths = [] with CurrentDir(tdir): @@ -335,7 +336,8 @@ class EPUBOutput(OutputFormatPlugin): if len(data) >= 1024: data = bytearray(data) f.seek(0) - f.write(bytes(bytearray(data[i] ^ key[i%16] for i in range(1024)))) + f.write(bytes(bytearray(data[i] ^ key[i%16] + for i in range(1024)))) else: self.log.warn('Font', path, 'is invalid, ignoring') if not isinstance(uri, str): @@ -374,11 +376,10 @@ class EPUBOutput(OutputFormatPlugin): # }}} def workaround_ade_quirks(self): # {{{ - ''' + """ Perform various markup transforms to get the output to render correctly in the quirky ADE. 
- ''' - from ebook_converter.ebooks.oeb.base import XPath, XHTML, barename, urlunquote + """ stylesheet = self.oeb.manifest.main_stylesheet @@ -388,23 +389,23 @@ class EPUBOutput(OutputFormatPlugin): for node in self.oeb.toc.iter(): href = getattr(node, 'href', None) if hasattr(href, 'partition'): - base, _, frag = href.partition('#') - frag = urlunquote(frag) + _base, _, frag = href.partition('#') + frag = base.urlunquote(frag) if frag and frag_pat.match(frag) is None: self.log.warn( 'Removing fragment identifier %r from TOC as Adobe Digital Editions cannot handle it'%frag) - node.href = base + node.href = _base for x in self.oeb.spine: root = x.data - body = XPath('//h:body')(root) + body = base.XPath('//h:body')(root) if body: body = body[0] if hasattr(body, 'xpath'): # remove tags with empty src elements bad = [] - for x in XPath('//h:img')(body): + for x in base.XPath('//h:img')(body): src = x.get('src', '').strip() if src in ('', '#') or src.startswith('http:'): bad.append(x) @@ -412,7 +413,7 @@ class EPUBOutput(OutputFormatPlugin): img.getparent().remove(img) # Add id attribute to tags that have name - for x in XPath('//h:a[@name]')(body): + for x in base.XPath('//h:a[@name]')(body): if not x.get('id', False): x.set('id', x.get('name')) # The delightful epubcheck has started complaining about tags that @@ -420,19 +421,19 @@ class EPUBOutput(OutputFormatPlugin): x.attrib.pop('name') # Replace
that are children of as ADE doesn't handle them - for br in XPath('./h:br')(body): + for br in base.XPath('./h:br')(body): if br.getparent() is None: continue try: prior = next(br.itersiblings(preceding=True)) - priortag = barename(prior.tag) + priortag = parse_utils.barename(prior.tag) priortext = prior.tail except: priortag = 'body' priortext = body.text if priortext: priortext = priortext.strip() - br.tag = XHTML('p') + br.tag = base.tag('xhtml', 'p') br.text = '\u00a0' style = br.get('style', '').split(';') style = list(filter(None, map(lambda x: x.strip(), style))) @@ -446,44 +447,44 @@ class EPUBOutput(OutputFormatPlugin): style.append('height:0pt') br.set('style', '; '.join(style)) - for tag in XPath('//h:embed')(root): + for tag in base.XPath('//h:embed')(root): tag.getparent().remove(tag) - for tag in XPath('//h:object')(root): + for tag in base.XPath('//h:object')(root): if tag.get('type', '').lower().strip() in {'image/svg+xml', 'application/svg+xml'}: continue tag.getparent().remove(tag) - for tag in XPath('//h:title|//h:style')(root): + for tag in base.XPath('//h:title|//h:style')(root): if not tag.text: tag.getparent().remove(tag) - for tag in XPath('//h:script')(root): + for tag in base.XPath('//h:script')(root): if (not tag.text and not tag.get('src', False) and tag.get('type', None) != 'text/x-mathjax-config'): tag.getparent().remove(tag) - for tag in XPath('//h:body/descendant::h:script')(root): + for tag in base.XPath('//h:body/descendant::h:script')(root): tag.getparent().remove(tag) - formchildren = XPath('./h:input|./h:button|./h:textarea|' + formchildren = base.XPath('./h:input|./h:button|./h:textarea|' './h:label|./h:fieldset|./h:legend') - for tag in XPath('//h:form')(root): + for tag in base.XPath('//h:form')(root): if formchildren(tag): tag.getparent().remove(tag) else: # Not a real form - tag.tag = XHTML('div') + tag.tag = base.tag('xhtml', 'div') - for tag in XPath('//h:center')(root): - tag.tag = XHTML('div') + for tag in 
base.XPath('//h:center')(root): + tag.tag = base.tag('xhtml', 'div') tag.set('style', 'text-align:center') # ADE can't handle & in an img url - for tag in XPath('//h:img[@src]')(root): + for tag in base.XPath('//h:img[@src]')(root): tag.set('src', tag.get('src', '').replace('&', '')) # ADE whimpers in fright when it encounters a outside a # - in_table = XPath('ancestor::h:table') - for tag in XPath('//h:td|//h:tr|//h:th')(root): + in_table = base.XPath('ancestor::h:table') + for tag in base.XPath('//h:td|//h:tr|//h:th')(root): if not in_table(tag): - tag.tag = XHTML('div') + tag.tag = base.tag('xhtml', 'div') # ADE fails to render non breaking hyphens/soft hyphens/zero width spaces special_chars = re.compile('[\u200b\u00ad]') @@ -498,7 +499,7 @@ class EPUBOutput(OutputFormatPlugin): if stylesheet is not None: # ADE doesn't render lists correctly if they have left margins from css_parser.css import CSSRule - for lb in XPath('//h:ul[@class]|//h:ol[@class]')(root): + for lb in base.XPath('//h:ul[@class]|//h:ol[@class]')(root): sel = '.'+lb.get('class') for rule in stylesheet.data.cssRules.rulesOfType(CSSRule.STYLE_RULE): if sel == rule.selectorList.selectorText: @@ -519,11 +520,10 @@ class EPUBOutput(OutputFormatPlugin): ''' Perform toc link transforms to alleviate slow loading. 
''' - from ebook_converter.ebooks.oeb.base import XPath from ebook_converter.ebooks.oeb.polish.toc import item_at_top def frag_is_at_top(root, frag): - elem = XPath('//*[@id="%s" or @name="%s"]'%(frag, frag))(root) + elem = base.XPath('//*[@id="%s" or @name="%s"]'%(frag, frag))(root) if elem: elem = elem[0] else: diff --git a/ebook_converter/ebooks/conversion/plugins/fb2_input.py b/ebook_converter/ebooks/conversion/plugins/fb2_input.py index c84ea79..62f7b86 100644 --- a/ebook_converter/ebooks/conversion/plugins/fb2_input.py +++ b/ebook_converter/ebooks/conversion/plugins/fb2_input.py @@ -1,59 +1,57 @@ """ Convert .fb2 files to .lrf """ -import os, re +import os import pkg_resources +import re -from ebook_converter.customize.conversion import InputFormatPlugin, OptionRecommendation +from lxml import etree + +from ebook_converter import constants as const +from ebook_converter.customize.conversion import InputFormatPlugin +from ebook_converter.customize.conversion import OptionRecommendation from ebook_converter import guess_type -__license__ = 'GPL v3' -__copyright__ = '2008, Anatoly Shipitsin ' - -FB2NS = 'http://www.gribuser.ru/xml/fictionbook/2.0' +FB2NS = 'http://www.gribuser.ru/xml/fictionbook/2.0' FB21NS = 'http://www.gribuser.ru/xml/fictionbook/2.1' class FB2Input(InputFormatPlugin): - name = 'FB2 Input' - author = 'Anatoly Shipitsin' + name = 'FB2 Input' + author = 'Anatoly Shipitsin' description = 'Convert FB2 and FBZ files to HTML' - file_types = {'fb2', 'fbz'} + file_types = {'fb2', 'fbz'} commit_name = 'fb2_input' - recommendations = { - ('level1_toc', '//h:h1', OptionRecommendation.MED), - ('level2_toc', '//h:h2', OptionRecommendation.MED), - ('level3_toc', '//h:h3', OptionRecommendation.MED), - } + recommendations = {('level1_toc', '//h:h1', OptionRecommendation.MED), + ('level2_toc', '//h:h2', OptionRecommendation.MED), + ('level3_toc', '//h:h3', OptionRecommendation.MED)} - options = { - OptionRecommendation(name='no_inline_fb2_toc', - 
recommended_value=False, level=OptionRecommendation.LOW, - help='Do not insert a Table of Contents at the beginning of the book.' - )} + options = {OptionRecommendation(name='no_inline_fb2_toc', + recommended_value=False, + level=OptionRecommendation.LOW, + help='Do not insert a Table of Contents ' + 'at the beginning of the book.')} def convert(self, stream, options, file_ext, log, accelerators): - from lxml import etree - from ebook_converter.utils.xml_parse import safe_xml_fromstring - from ebook_converter.ebooks.metadata.fb2 import ensure_namespace, get_fb2_data + from ebook_converter.ebooks.metadata.fb2 import ensure_namespace + from ebook_converter.ebooks.metadata.fb2 import get_fb2_data from ebook_converter.ebooks.metadata.opf2 import OPFCreator from ebook_converter.ebooks.metadata.meta import get_metadata - from ebook_converter.ebooks.oeb.base import XLINK_NS, XHTML_NS from ebook_converter.ebooks.chardet import xml_to_unicode self.log = log log.debug('Parsing XML...') raw = get_fb2_data(stream)[0] raw = raw.replace(b'\0', b'') raw = xml_to_unicode(raw, strip_encoding_pats=True, - assume_utf8=True, resolve_entities=True)[0] + assume_utf8=True, resolve_entities=True)[0] try: - doc = safe_xml_fromstring(raw) + doc = etree.fromstring(raw) except etree.XMLSyntaxError: - doc = safe_xml_fromstring(raw.replace('& ', '&')) + doc = etree.fromstring(raw.replace('& ', '&')) if doc is None: raise ValueError('The FB2 file is not valid XML') doc = ensure_namespace(doc) @@ -62,22 +60,24 @@ class FB2Input(InputFormatPlugin): except Exception: fb_ns = FB2NS - NAMESPACES = {'f':fb_ns, 'l':XLINK_NS} - stylesheets = doc.xpath('//*[local-name() = "stylesheet" and @type="text/css"]') + NAMESPACES = {'f': fb_ns, 'l': const.XLINK_NS} + stylesheets = doc.xpath('//*[local-name() = "stylesheet" and ' + '@type="text/css"]') css = '' for s in stylesheets: css += etree.tostring(s, encoding='unicode', method='text', - with_tail=False) + '\n\n' + with_tail=False) + '\n\n' if css: - import 
css_parser, logging + import css_parser + import logging parser = css_parser.CSSParser(fetcher=None, - log=logging.getLogger('calibre.css')) + log=logging.getLogger('calibre.css')) - XHTML_CSS_NAMESPACE = '@namespace "%s";\n' % XHTML_NS + XHTML_CSS_NAMESPACE = '@namespace "%s";\n' % const.XHTML_NS text = XHTML_CSS_NAMESPACE + css log.debug('Parsing stylesheet...') stylesheet = parser.parseString(text) - stylesheet.namespaces['h'] = XHTML_NS + stylesheet.namespaces['h'] = const.XHTML_NS css = stylesheet.cssText if isinstance(css, bytes): css = css.decode('utf-8', 'replace') @@ -92,16 +92,20 @@ class FB2Input(InputFormatPlugin): if options.no_inline_fb2_toc: log('Disabling generation of inline FB2 TOC') ss = re.compile(r'.*', - re.DOTALL).sub('', ss) + re.DOTALL).sub('', ss) - styledoc = safe_xml_fromstring(ss) + styledoc = etree.fromstring(ss) transform = etree.XSLT(styledoc) result = transform(doc) # Handle links of type note and cite - notes = {a.get('href')[1:]: a for a in result.xpath('//a[@link_note and @href]') if a.get('href').startswith('#')} - cites = {a.get('link_cite'): a for a in result.xpath('//a[@link_cite]') if not a.get('href', '')} + notes = {a.get('href')[1:]: a + for a in result.xpath('//a[@link_note and @href]') + if a.get('href').startswith('#')} + cites = {a.get('link_cite'): a + for a in result.xpath('//a[@link_cite]') + if not a.get('href', '')} all_ids = {x for x in result.xpath('//*/@id')} for cite, a in cites.items(): note = notes.get(cite, None) @@ -137,8 +141,10 @@ class FB2Input(InputFormatPlugin): f.write(mi.cover_data[1]) cpath = os.path.abspath('fb2_cover_calibre_mi.jpg') else: - for img in doc.xpath('//f:coverpage/f:image', namespaces=NAMESPACES): - href = img.get('{%s}href'%XLINK_NS, img.get('href', None)) + for img in doc.xpath('//f:coverpage/f:image', + namespaces=NAMESPACES): + href = img.get('{%s}href' % const.XLINK_NS, + img.get('href', None)) if href is not None: if href.startswith('#'): href = href[1:] @@ -165,15 +171,15 @@ 
class FB2Input(InputFormatPlugin): ext = ct.rpartition('/')[-1].lower() if ext in ('png', 'jpeg', 'jpg'): if fname.lower().rpartition('.')[-1] not in {'jpg', 'jpeg', - 'png'}: + 'png'}: fname += '.' + ext self.binary_map[elem.get('id')] = fname raw = elem.text.strip() try: data = base64_decode(raw) except TypeError: - self.log.exception('Binary data with id=%s is corrupted, ignoring'%( - elem.get('id'))) + self.log.exception('Binary data with id=%s is corrupted, ' + 'ignoring' % elem.get('id')) else: with open(fname, 'wb') as f: f.write(data) diff --git a/ebook_converter/ebooks/conversion/plugins/lit_input.py b/ebook_converter/ebooks/conversion/plugins/lit_input.py index 8fbe8ce..446af67 100644 --- a/ebook_converter/ebooks/conversion/plugins/lit_input.py +++ b/ebook_converter/ebooks/conversion/plugins/lit_input.py @@ -1,17 +1,17 @@ +import copy + +from lxml import etree + +from ebook_converter import constants as const from ebook_converter.customize.conversion import InputFormatPlugin -__license__ = 'GPL v3' -__copyright__ = '2009, Kovid Goyal ' -__docformat__ = 'restructuredtext en' - - class LITInput(InputFormatPlugin): - name = 'LIT Input' - author = 'Marshall T. Vandegrift' + name = 'LIT Input' + author = 'Marshall T. 
Vandegrift' description = 'Convert LIT files to HTML' - file_types = {'lit'} + file_types = {'lit'} commit_name = 'lit_input' def convert(self, stream, options, file_ext, log, @@ -22,7 +22,7 @@ class LITInput(InputFormatPlugin): return create_oebbook(log, stream, options, reader=LitReader) def postprocess_book(self, oeb, opts, log): - from ebook_converter.ebooks.oeb.base import XHTML_NS, XPath, XHTML + from ebook_converter.ebooks.oeb.base import XPath, XHTML for item in oeb.spine: root = item.data if not hasattr(root, 'xpath'): @@ -37,22 +37,23 @@ class LITInput(InputFormatPlugin): body = body[0] if len(body) == 1 and body[0].tag == XHTML('pre'): pre = body[0] - from ebook_converter.ebooks.txt.processor import convert_basic, \ - separate_paragraphs_single_line + from ebook_converter.ebooks.txt.processor import \ + convert_basic, separate_paragraphs_single_line from ebook_converter.ebooks.chardet import xml_to_unicode - from ebook_converter.utils.xml_parse import safe_xml_fromstring - import copy - self.log('LIT file with all text in singe
 tag detected')
+                    self.log('LIT file with all text in singe 
 tag '
+                             'detected')
                     html = separate_paragraphs_single_line(pre.text)
                     html = convert_basic(html).replace('',
-                            ''%XHTML_NS)
+                                                       '' %
+                                                       const.XHTML_NS)
                     html = xml_to_unicode(html, strip_encoding_pats=True,
-                            resolve_entities=True)[0]
+                                          resolve_entities=True)[0]
                     if opts.smarten_punctuation:
                         # SmartyPants skips text inside 
 tags
-                        from ebook_converter.ebooks.conversion.preprocess import smarten_punctuation
-                        html = smarten_punctuation(html, self.log)
-                    root = safe_xml_fromstring(html)
+                        from ebook_converter.ebooks.conversion import \
+                                preprocess
+                        html = preprocess.smarten_punctuation(html, self.log)
+                    root = etree.fromstring(html)
                     body = XPath('//h:body')(root)
                     pre.tag = XHTML('div')
                     pre.text = ''
diff --git a/ebook_converter/ebooks/conversion/plugins/lrf_input.py b/ebook_converter/ebooks/conversion/plugins/lrf_input.py
index 729c3b8..6f51f98 100644
--- a/ebook_converter/ebooks/conversion/plugins/lrf_input.py
+++ b/ebook_converter/ebooks/conversion/plugins/lrf_input.py
@@ -1,54 +1,52 @@
-import os, sys
+import os
+import sys
 import pkg_resources
 
+from lxml import etree
+
 from ebook_converter.customize.conversion import InputFormatPlugin
 
 
-__license__ = 'GPL v3'
-__copyright__ = '2009, Kovid Goyal '
-__docformat__ = 'restructuredtext en'
-
-
 class LRFInput(InputFormatPlugin):
 
-    name        = 'LRF Input'
-    author      = 'Kovid Goyal'
+    name = 'LRF Input'
+    author = 'Kovid Goyal'
     description = 'Convert LRF files to HTML'
-    file_types  = {'lrf'}
+    file_types = {'lrf'}
     commit_name = 'lrf_input'
 
     def convert(self, stream, options, file_ext, log,
                 accelerators):
-        from ebook_converter.ebooks.lrf.input import (MediaType, Styles, TextBlock,
-                Canvas, ImageBlock, RuledLine)
+        from ebook_converter.ebooks.lrf.input import MediaType, Styles, \
+                TextBlock, Canvas, ImageBlock, RuledLine
         self.log = log
         self.log('Generating XML')
         from ebook_converter.ebooks.lrf.lrfparser import LRFDocument
-        from ebook_converter.utils.xml_parse import safe_xml_fromstring
-        from lxml import etree
         d = LRFDocument(stream)
         d.parse()
         xml = d.to_xml(write_files=True)
         if options.verbose > 2:
             open(u'lrs.xml', 'wb').write(xml.encode('utf-8'))
-        doc = safe_xml_fromstring(xml)
+        doc = etree.fromstring(xml)
 
         char_button_map = {}
         for x in doc.xpath('//CharButton[@refobj]'):
             ro = x.get('refobj')
-            jump_button = doc.xpath('//*[@objid="%s"]'%ro)
+            jump_button = doc.xpath('//*[@objid="%s"]' % ro)
             if jump_button:
-                jump_to = jump_button[0].xpath('descendant::JumpTo[@refpage and @refobj]')
+                jump_to = jump_button[0].xpath('descendant::JumpTo[@refpage '
+                                               'and @refobj]')
                 if jump_to:
-                    char_button_map[ro] = '%s.xhtml#%s'%(jump_to[0].get('refpage'),
-                            jump_to[0].get('refobj'))
+                    char_button_map[ro] = ('%s.xhtml#%s' %
+                                           (jump_to[0].get('refpage'),
+                                            jump_to[0].get('refobj')))
         plot_map = {}
         for x in doc.xpath('//Plot[@refobj]'):
             ro = x.get('refobj')
-            image = doc.xpath('//Image[@objid="%s" and @refstream]'%ro)
+            image = doc.xpath('//Image[@objid="%s" and @refstream]' % ro)
             if image:
-                imgstr = doc.xpath('//ImageStream[@objid="%s" and @file]'%
-                    image[0].get('refstream'))
+                imgstr = doc.xpath('//ImageStream[@objid="%s" and @file]' %
+                                   image[0].get('refstream'))
                 if imgstr:
                     plot_map[ro] = imgstr[0].get('file')
 
@@ -58,21 +56,19 @@ class LRFInput(InputFormatPlugin):
                   resource_filename('ebook_converter',
                                     'data/lrf.xsl')) as fobj:
             # TODO(gryf): change this nonsense to etree.parse() instead.
-            styledoc = safe_xml_fromstring(fobj.read())
+            styledoc = etree.fromstring(fobj.read())
         media_type = MediaType()
         styles = Styles()
         text_block = TextBlock(styles, char_button_map, plot_map, log)
         canvas = Canvas(doc, styles, text_block, log)
         image_block = ImageBlock(canvas)
         ruled_line = RuledLine()
-        extensions = {
-                ('calibre', 'media-type') : media_type,
-                ('calibre', 'text-block') : text_block,
-                ('calibre', 'ruled-line') : ruled_line,
-                ('calibre', 'styles')     : styles,
-                ('calibre', 'canvas')     : canvas,
-                ('calibre', 'image-block'): image_block,
-                }
+        extensions = {('calibre', 'media-type'): media_type,
+                      ('calibre', 'text-block'): text_block,
+                      ('calibre', 'ruled-line'): ruled_line,
+                      ('calibre', 'styles'): styles,
+                      ('calibre', 'canvas'): canvas,
+                      ('calibre', 'image-block'): image_block}
         transform = etree.XSLT(styledoc, extensions=extensions)
         try:
             result = transform(doc)
diff --git a/ebook_converter/ebooks/conversion/plugins/rtf_input.py b/ebook_converter/ebooks/conversion/plugins/rtf_input.py
index 3c4192d..5d36fee 100644
--- a/ebook_converter/ebooks/conversion/plugins/rtf_input.py
+++ b/ebook_converter/ebooks/conversion/plugins/rtf_input.py
@@ -1,57 +1,58 @@
-import os, glob, re, textwrap
+import glob
+import os
 import pkg_resources
+import re
+import textwrap
 
-from ebook_converter.customize.conversion import InputFormatPlugin, OptionRecommendation
+from lxml import etree
+
+from ebook_converter.customize.conversion import InputFormatPlugin
+from ebook_converter.customize.conversion import OptionRecommendation
 from ebook_converter.polyglot.builtins import as_bytes
 
-__license__ = 'GPL v3'
-__copyright__ = '2008, Kovid Goyal '
 
-border_style_map = {
-        'single' : 'solid',
-        'double-thickness-border' : 'double',
-        'shadowed-border': 'outset',
-        'double-border': 'double',
-        'dotted-border': 'dotted',
-        'dashed': 'dashed',
-        'hairline': 'solid',
-        'inset': 'inset',
-        'dash-small': 'dashed',
-        'dot-dash': 'dotted',
-        'dot-dot-dash': 'dotted',
-        'outset': 'outset',
-        'tripple': 'double',
-        'triple': 'double',
-        'thick-thin-small': 'solid',
-        'thin-thick-small': 'solid',
-        'thin-thick-thin-small': 'solid',
-        'thick-thin-medium': 'solid',
-        'thin-thick-medium': 'solid',
-        'thin-thick-thin-medium': 'solid',
-        'thick-thin-large': 'solid',
-        'thin-thick-thin-large': 'solid',
-        'wavy': 'ridge',
-        'double-wavy': 'ridge',
-        'striped': 'ridge',
-        'emboss': 'inset',
-        'engrave': 'inset',
-        'frame': 'ridge',
-}
+border_style_map = {'single': 'solid',
+                    'double-thickness-border': 'double',
+                    'shadowed-border': 'outset',
+                    'double-border': 'double',
+                    'dotted-border': 'dotted',
+                    'dashed': 'dashed',
+                    'hairline': 'solid',
+                    'inset': 'inset',
+                    'dash-small': 'dashed',
+                    'dot-dash': 'dotted',
+                    'dot-dot-dash': 'dotted',
+                    'outset': 'outset',
+                    'tripple': 'double',
+                    'triple': 'double',
+                    'thick-thin-small': 'solid',
+                    'thin-thick-small': 'solid',
+                    'thin-thick-thin-small': 'solid',
+                    'thick-thin-medium': 'solid',
+                    'thin-thick-medium': 'solid',
+                    'thin-thick-thin-medium': 'solid',
+                    'thick-thin-large': 'solid',
+                    'thin-thick-thin-large': 'solid',
+                    'wavy': 'ridge',
+                    'double-wavy': 'ridge',
+                    'striped': 'ridge',
+                    'emboss': 'inset',
+                    'engrave': 'inset',
+                    'frame': 'ridge'}
 
 
 class RTFInput(InputFormatPlugin):
 
-    name        = 'RTF Input'
-    author      = 'Kovid Goyal'
+    name = 'RTF Input'
+    author = 'Kovid Goyal'
     description = 'Convert RTF files to HTML'
-    file_types  = {'rtf'}
+    file_types = {'rtf'}
     commit_name = 'rtf_input'
 
-    options = {
-        OptionRecommendation(name='ignore_wmf', recommended_value=False,
-            help='Ignore WMF images instead of replacing them with a '
-                 'placeholder image.'),
-    }
+    options = {OptionRecommendation(name='ignore_wmf', recommended_value=False,
+                                    help='Ignore WMF images instead of '
+                                    'replacing them with a placeholder '
+                                    'image.')}
 
     def generate_xml(self, stream):
         from ebook_converter.ebooks.rtf2xml.ParseRtf import ParseRtf
@@ -64,7 +65,7 @@ class RTFInput(InputFormatPlugin):
                 run_lev = 4
                 indent_out = 1
                 self.log('Running RTFParser in debug mode')
-            except:
+            except Exception:
                 self.log.warn('Impossible to run RTFParser in debug mode')
         parser = ParseRtf(
             in_file=stream,
@@ -108,7 +109,8 @@ class RTFInput(InputFormatPlugin):
             deb_dir=debug_dir,
 
             # Default encoding
-            default_encoding=getattr(self.opts, 'input_encoding', 'cp1252') or 'cp1252',
+            default_encoding=getattr(self.opts, 'input_encoding',
+                                     'cp1252') or 'cp1252',
 
             # Run level
             run_level=run_lev,
@@ -151,7 +153,7 @@ class RTFInput(InputFormatPlugin):
         for count, val in imap.items():
             try:
                 imap[count] = self.convert_image(val)
-            except:
+            except Exception:
                 self.log.exception('Failed to convert', val)
         return imap
 
@@ -161,7 +163,7 @@ class RTFInput(InputFormatPlugin):
         try:
             return self.rasterize_wmf(name)
         except Exception:
-            self.log.exception('Failed to convert WMF image %r'%name)
+            self.log.exception('Failed to convert WMF image %r' % name)
         return self.replace_wmf(name)
 
     def replace_wmf(self, name):
@@ -170,9 +172,11 @@ class RTFInput(InputFormatPlugin):
             return '__REMOVE_ME__'
         from ebook_converter.ebooks.covers import message_image
         if self.default_img is None:
-            self.default_img = message_image('Conversion of WMF images is not supported.'
-            ' Use Microsoft Word or OpenOffice to save this RTF file'
-            ' as HTML and convert that in calibre.')
+            self.default_img = message_image('Conversion of WMF images is not '
+                                             'supported. Use Microsoft Word '
+                                             'or OpenOffice to save this RTF '
+                                             'file as HTML and convert that '
+                                             'in calibre.')
         name = name.replace('.wmf', '.jpg')
         with open(name, 'wb') as f:
             f.write(self.default_img)
@@ -189,10 +193,10 @@ class RTFInput(InputFormatPlugin):
         return name
 
     def write_inline_css(self, ic, border_styles):
-        font_size_classes = ['span.fs%d { font-size: %spt }'%(i, x) for i, x in
-                enumerate(ic.font_sizes)]
-        color_classes = ['span.col%d { color: %s }'%(i, x) for i, x in
-                enumerate(ic.colors) if x != 'false']
+        font_size_classes = ['span.fs%d { font-size: %spt }' % (i, x)
+                             for i, x in enumerate(ic.font_sizes)]
+        color_classes = ['span.col%d { color: %s }' % (i, x)
+                         for i, x in enumerate(ic.colors) if x != 'false']
         css = textwrap.dedent('''
         span.none {
             text-decoration: none; font-weight: normal;
@@ -210,11 +214,11 @@ class RTFInput(InputFormatPlugin):
         span.strike-through { text-decoration: line-through }
 
         ''')
-        css += '\n'+'\n'.join(font_size_classes)
-        css += '\n' +'\n'.join(color_classes)
+        css += '\n' + '\n'.join(font_size_classes)
+        css += '\n' + '\n'.join(color_classes)
 
         for cls, val in border_styles.items():
-            css += '\n\n.%s {\n%s\n}'%(cls, val)
+            css += '\n\n.%s {\n%s\n}' % (cls, val)
 
         with open(u'styles.css', 'ab') as f:
             f.write(css.encode('utf-8'))
@@ -224,35 +228,34 @@ class RTFInput(InputFormatPlugin):
         style_map = {}
         for elem in doc.xpath(r'//*[local-name()="cell"]'):
             style = ['border-style: hidden', 'border-width: 1px',
-                    'border-color: black']
+                     'border-color: black']
             for x in ('bottom', 'top', 'left', 'right'):
-                bs = elem.get('border-cell-%s-style'%x, None)
+                bs = elem.get('border-cell-%s-style' % x, None)
                 if bs:
                     cbs = border_style_map.get(bs, 'solid')
-                    style.append('border-%s-style: %s'%(x, cbs))
-                bw = elem.get('border-cell-%s-line-width'%x, None)
+                    style.append('border-%s-style: %s' % (x, cbs))
+                bw = elem.get('border-cell-%s-line-width' % x, None)
                 if bw:
-                    style.append('border-%s-width: %spt'%(x, bw))
-                bc = elem.get('border-cell-%s-color'%x, None)
+                    style.append('border-%s-width: %spt' % (x, bw))
+                bc = elem.get('border-cell-%s-color' % x, None)
                 if bc:
-                    style.append('border-%s-color: %s'%(x, bc))
+                    style.append('border-%s-color: %s' % (x, bc))
             style = ';\n'.join(style)
             if style not in border_styles:
                 border_styles.append(style)
             idx = border_styles.index(style)
-            cls = 'border_style%d'%idx
+            cls = 'border_style%d' % idx
             style_map[cls] = style
             elem.set('class', cls)
         return style_map
 
     def convert(self, stream, options, file_ext, log,
                 accelerators):
-        from lxml import etree
         from ebook_converter.ebooks.metadata.meta import get_metadata
         from ebook_converter.ebooks.metadata.opf2 import OPFCreator
-        from ebook_converter.ebooks.rtf2xml.ParseRtf import RtfInvalidCodeException
+        from ebook_converter.ebooks.rtf2xml.ParseRtf import \
+            RtfInvalidCodeException
         from ebook_converter.ebooks.rtf.input import InlineClass
-        from ebook_converter.utils.xml_parse import safe_xml_fromstring
         self.opts = options
         self.log = log
         self.log('Converting RTF to XML...')
@@ -269,14 +272,15 @@ class RTFInput(InputFormatPlugin):
             imap = {}
             try:
                 imap = self.extract_images(d[0])
-            except:
+            except Exception:
                 self.log.exception('Failed to extract images...')
 
         self.log('Parsing XML...')
-        doc = safe_xml_fromstring(xml)
+        doc = etree.fromstring(xml)
         border_styles = self.convert_borders(doc)
         for pict in doc.xpath('//rtf:pict[@num]',
-                namespaces={'rtf':'http://rtf2xml.sourceforge.net/'}):
+                              namespaces={'rtf':
+                                          'http://rtf2xml.sourceforge.net/'}):
             num = int(pict.get('num'))
             name = imap.get(num, None)
             if name is not None:
@@ -286,8 +290,8 @@ class RTFInput(InputFormatPlugin):
         inline_class = InlineClass(self.log)
         with open(pkg_resources.resource_filename('ebook_converter',
                                                   'data/rtf.xsl')) as fobj:
-            styledoc = safe_xml_fromstring(fobj.read())
-        extensions = {('calibre', 'inline-class') : inline_class}
+            styledoc = etree.fromstring(fobj.read())
+        extensions = {('calibre', 'inline-class'): inline_class}
         transform = etree.XSLT(styledoc, extensions=extensions)
         result = transform(doc)
         html = u'index.xhtml'
@@ -296,7 +300,8 @@ class RTFInput(InputFormatPlugin):
             # res = res[:100].replace('xmlns:html', 'xmlns') + res[100:]
             # clean multiple \n
             res = re.sub(b'\n+', b'\n', res)
-            # Replace newlines inserted by the 'empty_paragraphs' option in rtf2xml with html blank lines
+            # Replace newlines inserted by the 'empty_paragraphs' option in
+            # rtf2xml with html blank lines
             # res = re.sub('\s*', '', res)
             # res = re.sub('(?<=\n)\n{2}',
             # u'

\u00a0

\n'.encode('utf-8'), res) @@ -316,7 +321,8 @@ class RTFInput(InputFormatPlugin): def postprocess_book(self, oeb, opts, log): for item in oeb.spine: - for img in item.data.xpath('//*[local-name()="img" and @src="__REMOVE_ME__"]'): + for img in item.data.xpath('//*[local-name()="img" and ' + '@src="__REMOVE_ME__"]'): p = img.getparent() idx = p.index(img) p.remove(img) diff --git a/ebook_converter/ebooks/conversion/plugins/snb_input.py b/ebook_converter/ebooks/conversion/plugins/snb_input.py index 51c7d7e..fb16039 100644 --- a/ebook_converter/ebooks/conversion/plugins/snb_input.py +++ b/ebook_converter/ebooks/conversion/plugins/snb_input.py @@ -1,27 +1,33 @@ import os +from lxml import etree + from ebook_converter.customize.conversion import InputFormatPlugin from ebook_converter.ptempfile import TemporaryDirectory from ebook_converter.utils.filenames import ascii_filename -__license__ = 'GPL 3' -__copyright__ = '2010, Li Fanxi ' -__docformat__ = 'restructuredtext en' - -HTML_TEMPLATE = '%s\n%s\n' +HTML_TEMPLATE = ('%s' + '\n%s\n') def html_encode(s): - return s.replace('&', '&').replace('<', '<').replace('>', '>').replace('"', '"').replace("'", ''').replace('\n', '
').replace(' ', ' ') # noqa + return (s.replace('&', '&') + .replace('<', '<') + .replace('>', '>') + .replace('"', '"') + .replace("'", ''') + .replace('\n', '
') + .replace(' ', ' ')) class SNBInput(InputFormatPlugin): - name = 'SNB Input' - author = 'Li Fanxi' + name = 'SNB Input' + author = 'Li Fanxi' description = 'Convert SNB files to OEB' - file_types = {'snb'} + file_types = {'snb'} commit_name = 'snb_input' options = set() @@ -32,13 +38,12 @@ class SNBInput(InputFormatPlugin): from ebook_converter.ebooks.oeb.base import DirContainer from ebook_converter.ebooks.snb.snbfile import SNBFile - from ebook_converter.utils.xml_parse import safe_xml_fromstring log.debug("Parsing SNB file...") snbFile = SNBFile() try: snbFile.Parse(stream) - except: + except Exception: raise ValueError("Invalid SNB file") if not snbFile.IsValid(): log.debug("Invalid SNB file") @@ -46,27 +51,28 @@ class SNBInput(InputFormatPlugin): log.debug("Handle meta data ...") from ebook_converter.ebooks.conversion.plumber import create_oebbook oeb = create_oebbook(log, None, options, - encoding=options.input_encoding, populate=False) + encoding=options.input_encoding, populate=False) meta = snbFile.GetFileStream('snbf/book.snbf') if meta is not None: - meta = safe_xml_fromstring(meta) - l = {'title' : './/head/name', - 'creator' : './/head/author', - 'language' : './/head/language', - 'generator': './/head/generator', - 'publisher': './/head/publisher', - 'cover' : './/head/cover', } + meta = etree.fromstring(meta) + item_map = {'title': './/head/name', + 'creator': './/head/author', + 'language': './/head/language', + 'generator': './/head/generator', + 'publisher': './/head/publisher', + 'cover': './/head/cover'} d = {} - for item in l: - node = meta.find(l[item]) + for key, item in item_map.items(): + node = meta.find(item) if node is not None: - d[item] = node.text if node.text is not None else '' + d[key] = node.text if node.text is not None else '' else: - d[item] = '' + d[key] = '' oeb.metadata.add('title', d['title']) - oeb.metadata.add('creator', d['creator'], attrib={'role':'aut'}) - oeb.metadata.add('language', 
d['language'].lower().replace('_', '-')) + oeb.metadata.add('creator', d['creator'], attrib={'role': 'aut'}) + oeb.metadata.add('language', + d['language'].lower().replace('_', '-')) oeb.metadata.add('generator', d['generator']) oeb.metadata.add('publisher', d['publisher']) if d['cover'] != '': @@ -84,7 +90,7 @@ class SNBInput(InputFormatPlugin): toc = snbFile.GetFileStream('snbf/toc.snbf') oeb.container = DirContainer(tdir, log) if toc is not None: - toc = safe_xml_fromstring(toc) + toc = etree.fromstring(toc) i = 1 for ch in toc.find('.//body'): chapterName = ch.text @@ -93,18 +99,22 @@ class SNBInput(InputFormatPlugin): data = snbFile.GetFileStream('snbc/' + chapterSrc) if data is None: continue - snbc = safe_xml_fromstring(data) + snbc = etree.fromstring(data) lines = [] for line in snbc.find('.//body'): if line.tag == 'text': lines.append('

%s

' % html_encode(line.text)) elif line.tag == 'img': - lines.append('

' % html_encode(line.text)) + lines.append('

' % + html_encode(line.text)) with open(os.path.join(tdir, fname), 'wb') as f: - f.write((HTML_TEMPLATE % (chapterName, '\n'.join(lines))).encode('utf-8', 'replace')) + f.write((HTML_TEMPLATE % + (chapterName, + '\n'.join(lines))).encode('utf-8', + 'replace')) oeb.toc.add(ch.text, fname) - id, href = oeb.manifest.generate(id='html', - href=ascii_filename(fname)) + id, href = oeb.manifest.generate( + id='html', href=ascii_filename(fname)) item = oeb.manifest.add(id, href, 'text/html') item.html_input_href = fname oeb.spine.add(item, True) @@ -112,7 +122,7 @@ class SNBInput(InputFormatPlugin): imageFiles = snbFile.OutputImageFiles(tdir) for f, m in imageFiles: id, href = oeb.manifest.generate(id='image', - href=ascii_filename(f)) + href=ascii_filename(f)) item = oeb.manifest.add(id, href, m) item.html_input_href = f diff --git a/ebook_converter/ebooks/docx/container.py b/ebook_converter/ebooks/docx/container.py index e9768f4..db11f3d 100644 --- a/ebook_converter/ebooks/docx/container.py +++ b/ebook_converter/ebooks/docx/container.py @@ -1,9 +1,12 @@ -import os, sys, shutil +import os +import shutil +import sys from lxml import etree from ebook_converter import walk, guess_type -from ebook_converter.ebooks.metadata import string_to_authors, authors_to_sort_string +from ebook_converter.ebooks.metadata import authors_to_sort_string +from ebook_converter.ebooks.metadata import string_to_authors from ebook_converter.ebooks.metadata.book.base import Metadata from ebook_converter.ebooks.docx import InvalidDOCX from ebook_converter.ebooks.docx.names import DOCXNamespace @@ -11,21 +14,11 @@ from ebook_converter.ptempfile import PersistentTemporaryDirectory from ebook_converter.utils.localization import canonicalize_lang from ebook_converter.utils.logging import default_log from ebook_converter.utils.zipfile import ZipFile -from ebook_converter.utils.xml_parse import safe_xml_fromstring -__license__ = 'GPL v3' -__copyright__ = '2013, Kovid Goyal ' - - -def fromstring(raw, 
parser=None): - return safe_xml_fromstring(raw) - # Read metadata {{{ - - def read_doc_props(raw, mi, XPath): - root = fromstring(raw) + root = etree.fromstring(raw) titles = XPath('//dc:title')(root) if titles: title = titles[0].text @@ -53,29 +46,31 @@ def read_doc_props(raw, mi, XPath): desc = XPath('//dc:description')(root) if desc: raw = etree.tostring(desc[0], method='text', encoding='unicode') - raw = raw.replace('_x000d_', '') # Word 2007 mangles newlines in the summary + # Word 2007 mangles newlines in the summary + raw = raw.replace('_x000d_', '') mi.comments = raw.strip() langs = [] for lang in XPath('//dc:language')(root): if lang.text and lang.text.strip(): - l = canonicalize_lang(lang.text) - if l: - langs.append(l) + canonic_lang = canonicalize_lang(lang.text) + if canonic_lang: + langs.append(canonic_lang) if langs: mi.languages = langs def read_app_props(raw, mi): - root = fromstring(raw) + root = etree.fromstring(raw) company = root.xpath('//*[local-name()="Company"]') if company and company[0].text and company[0].text.strip(): mi.publisher = company[0].text.strip() def read_default_style_language(raw, mi, XPath): - root = fromstring(raw) - for lang in XPath('/w:styles/w:docDefaults/w:rPrDefault/w:rPr/w:lang/@w:val')(root): + root = etree.fromstring(raw) + for lang in XPath('/w:styles/w:docDefaults/w:rPrDefault/w:rPr/w:lang/' + '@w:val')(root): lang = canonicalize_lang(lang) if lang: mi.languages = [lang] @@ -87,7 +82,9 @@ class DOCX(object): def __init__(self, path_or_stream, log=None, extract=True): self.docx_is_transitional = True - stream = path_or_stream if hasattr(path_or_stream, 'read') else open(path_or_stream, 'rb') + stream = path_or_stream + if not hasattr(path_or_stream, 'read'): + stream = open(path_or_stream, 'rb') self.name = getattr(stream, 'name', None) or '' self.log = log or default_log if extract: @@ -107,9 +104,9 @@ class DOCX(object): try: zf = ZipFile(stream) zf.extractall(self.tdir) - except: + except Exception: 
self.log.exception('DOCX appears to be invalid ZIP file, trying a' - ' more forgiving ZIP parser') + ' more forgiving ZIP parser') from ebook_converter.utils.localunzip import extractall stream.seek(0) extractall(stream, self.tdir) @@ -133,13 +130,17 @@ class DOCX(object): try: raw = self.read('[Content_Types].xml') except KeyError: - raise InvalidDOCX('The file %s docx file has no [Content_Types].xml' % self.name) - root = fromstring(raw) + raise InvalidDOCX('The file %s docx file has no ' + '[Content_Types].xml' % self.name) + root = etree.fromstring(raw) self.content_types = {} self.default_content_types = {} - for item in root.xpath('//*[local-name()="Types"]/*[local-name()="Default" and @Extension and @ContentType]'): - self.default_content_types[item.get('Extension').lower()] = item.get('ContentType') - for item in root.xpath('//*[local-name()="Types"]/*[local-name()="Override" and @PartName and @ContentType]'): + for item in root.xpath('//*[local-name()="Types"]/*[local-name()=' + '"Default" and @Extension and @ContentType]'): + self.default_content_types[item.get('Extension').lower()] = \ + item.get('ContentType') + for item in root.xpath('//*[local-name()="Types"]/*[local-name()=' + '"Override" and @PartName and @ContentType]'): name = item.get('PartName').lstrip('/') self.content_types[name] = item.get('ContentType') @@ -155,15 +156,19 @@ class DOCX(object): try: raw = self.read('_rels/.rels') except KeyError: - raise InvalidDOCX('The file %s docx file has no _rels/.rels' % self.name) - root = fromstring(raw) + raise InvalidDOCX('The file %s docx file has no _rels/.rels' % + self.name) + root = etree.fromstring(raw) self.relationships = {} self.relationships_rmap = {} - for item in root.xpath('//*[local-name()="Relationships"]/*[local-name()="Relationship" and @Type and @Target]'): + for item in root.xpath('//*[local-name()="Relationships"]/*[local-name' + '()="Relationship" and @Type and @Target]'): target = item.get('Target').lstrip('/') typ = 
item.get('Type') if target == 'word/document.xml': - self.docx_is_transitional = typ != 'http://purl.oclc.org/ooxml/officeDocument/relationships/officeDocument' + self.docx_is_transitional = (typ != 'http://purl.oclc.org/' + 'ooxml/officeDocument/' + 'relationships/officeDocument') self.relationships[typ] = target self.relationships_rmap[target] = typ @@ -171,15 +176,17 @@ class DOCX(object): def document_name(self): name = self.relationships.get(self.namespace.names['DOCUMENT'], None) if name is None: - names = tuple(n for n in self.names if n == 'document.xml' or n.endswith('/document.xml')) + names = tuple(n for n in self.names if n == 'document.xml' or + n.endswith('/document.xml')) if not names: - raise InvalidDOCX('The file %s docx file has no main document' % self.name) + raise InvalidDOCX('The file %s docx file has no main ' + 'document' % self.name) name = names[0] return name @property def document(self): - return fromstring(self.read(self.document_name)) + return etree.fromstring(self.read(self.document_name)) @property def document_relationships(self): @@ -195,10 +202,13 @@ class DOCX(object): except KeyError: pass else: - root = fromstring(raw) - for item in root.xpath('//*[local-name()="Relationships"]/*[local-name()="Relationship" and @Type and @Target]'): + root = etree.fromstring(raw) + for item in root.xpath('//*[local-name()="Relationships"]/*' + '[local-name()="Relationship" and @Type ' + 'and @Target]'): target = item.get('Target') - if item.get('TargetMode', None) != 'External' and not target.startswith('#'): + if (item.get('TargetMode', None) != 'External' and not + target.startswith('#')): target = '/'.join((base, target.lstrip('/'))) typ = item.get('Type') Id = item.get('Id') @@ -209,13 +219,15 @@ class DOCX(object): def get_document_properties_names(self): name = self.relationships.get(self.namespace.names['DOCPROPS'], None) if name is None: - names = tuple(n for n in self.names if n.lower() == 'docprops/core.xml') + names = tuple(n for n 
in self.names + if n.lower() == 'docprops/core.xml') if names: name = names[0] yield name name = self.relationships.get(self.namespace.names['APPPROPS'], None) if name is None: - names = tuple(n for n in self.names if n.lower() == 'docprops/app.xml') + names = tuple(n for n in self.names + if n.lower() == 'docprops/app.xml') if names: name = names[0] yield name @@ -239,7 +251,8 @@ class DOCX(object): else: read_default_style_language(raw, mi, self.namespace.XPath) - ap_name = self.relationships.get(self.namespace.names['APPPROPS'], None) + ap_name = self.relationships.get(self.namespace.names['APPPROPS'], + None) if ap_name: try: raw = self.read(ap_name) diff --git a/ebook_converter/ebooks/docx/to_html.py b/ebook_converter/ebooks/docx/to_html.py index 11ce9e2..30c2a64 100644 --- a/ebook_converter/ebooks/docx/to_html.py +++ b/ebook_converter/ebooks/docx/to_html.py @@ -1,12 +1,13 @@ import sys, os, re, math, errno, uuid, numbers from collections import OrderedDict, defaultdict +from lxml import etree from lxml import html from lxml.html.builder import ( HTML, HEAD, TITLE, BODY, LINK, META, P, SPAN, BR, DIV, A, DT, DL, DD, H1) from ebook_converter import guess_type -from ebook_converter.ebooks.docx.container import DOCX, fromstring +from ebook_converter.ebooks.docx.container import DOCX from ebook_converter.ebooks.docx.names import XML, generate_anchor from ebook_converter.ebooks.docx.styles import Styles, inherit, PageProperties from ebook_converter.ebooks.docx.numbering import Numbering @@ -311,7 +312,7 @@ class Convert(object): raise self.log.warn('Settings %s file missing' % sename) else: - self.settings(fromstring(seraw)) + self.settings(etree.fromstring(seraw)) if foname is not None: try: @@ -327,7 +328,7 @@ class Convert(object): self.log.warn('Endnotes %s do not exist' % enname) else: enrel = self.docx.get_relationships(enname) - footnotes(fromstring(foraw) if foraw else None, forel, fromstring(enraw) if enraw else None, enrel) + 
footnotes(etree.fromstring(foraw) if foraw else None, forel, etree.fromstring(enraw) if enraw else None, enrel) if fname is not None: embed_relationships = self.docx.get_relationships(fname)[0] @@ -336,7 +337,7 @@ class Convert(object): except KeyError: self.log.warn('Fonts table %s does not exist' % fname) else: - fonts(fromstring(raw), embed_relationships, self.docx, self.dest_dir) + fonts(etree.fromstring(raw), embed_relationships, self.docx, self.dest_dir) if tname is not None: try: @@ -344,7 +345,7 @@ class Convert(object): except KeyError: self.log.warn('Styles %s do not exist' % sname) else: - self.theme(fromstring(raw)) + self.theme(etree.fromstring(raw)) styles_loaded = False if sname is not None: @@ -353,7 +354,7 @@ class Convert(object): except KeyError: self.log.warn('Styles %s do not exist' % sname) else: - self.styles(fromstring(raw), fonts, self.theme) + self.styles(etree.fromstring(raw), fonts, self.theme) styles_loaded = True if not styles_loaded: self.styles(None, fonts, self.theme) @@ -364,7 +365,7 @@ class Convert(object): except KeyError: self.log.warn('Numbering styles %s do not exist' % nname) else: - numbering(fromstring(raw), self.styles, self.docx.get_relationships(nname)[0]) + numbering(etree.fromstring(raw), self.styles, self.docx.get_relationships(nname)[0]) self.styles.resolve_numbering(numbering) diff --git a/ebook_converter/ebooks/docx/writer/from_html.py b/ebook_converter/ebooks/docx/writer/from_html.py index c9894a5..b13a0bb 100644 --- a/ebook_converter/ebooks/docx/writer/from_html.py +++ b/ebook_converter/ebooks/docx/writer/from_html.py @@ -1,22 +1,19 @@ +import collections import re -from collections import Counter from ebook_converter.ebooks.docx.writer.container import create_skeleton, page_size, page_effective_area -from ebook_converter.ebooks.docx.writer.styles import StylesManager, FloatSpec -from ebook_converter.ebooks.docx.writer.links import LinksManager -from ebook_converter.ebooks.docx.writer.images import ImagesManager 
from ebook_converter.ebooks.docx.writer.fonts import FontsManager -from ebook_converter.ebooks.docx.writer.tables import Table +from ebook_converter.ebooks.docx.writer.images import ImagesManager +from ebook_converter.ebooks.docx.writer.links import LinksManager from ebook_converter.ebooks.docx.writer.lists import ListsManager +from ebook_converter.ebooks.docx.writer.styles import StylesManager, FloatSpec +from ebook_converter.ebooks.docx.writer.tables import Table +from ebook_converter.ebooks.oeb import base +from ebook_converter.ebooks.oeb import parse_utils from ebook_converter.ebooks.oeb.stylizer import Stylizer as Sz, Style as St -from ebook_converter.ebooks.oeb.base import XPath, barename from ebook_converter.utils.localization import lang_as_iso639_1 -__license__ = 'GPL v3' -__copyright__ = '2013, Kovid Goyal ' - - def lang_for_tag(tag): for attr in ('lang', '{http://www.w3.org/XML/1998/namespace}lang'): val = lang_as_iso639_1(tag.get(attr)) @@ -140,7 +137,7 @@ class Block(object): self.numbering_id = None self.parent_items = None self.html_block = html_block - self.html_tag = barename(html_block.tag) + self.html_tag = parse_utils.barename(html_block.tag) self.float_spec = float_spec if float_spec is not None: float_spec.blocks.append(self) @@ -387,7 +384,7 @@ class Blocks(object): def resolve_language(self): default_lang = self.styles_manager.document_lang for block in self.all_blocks: - count = Counter() + count = collections.Counter() for run in block.runs: count[run.lang] += 1 if count: @@ -473,13 +470,13 @@ class Convert(object): self.abshref = self.images_manager.abshref = item.abshref self.current_lang = lang_for_tag(item.data) or self.styles_manager.document_lang - for i, body in enumerate(XPath('//h:body')(item.data)): + for i, body in enumerate(base.XPath('//h:body')(item.data)): with self.blocks: self.blocks.top_bookmark = self.links_manager.bookmark_for_anchor(self.links_manager.top_anchor, self.current_item, body) self.process_tag(body, 
stylizer, is_first_tag=i == 0) def process_tag(self, html_tag, stylizer, is_first_tag=False, float_spec=None): - tagname = barename(html_tag.tag) + tagname = parse_utils.barename(html_tag.tag) tag_style = stylizer.style(html_tag) ignore_tag_contents = tagname in {'script', 'style', 'title', 'meta'} or tag_style.is_hidden display = tag_style._get('display') @@ -573,7 +570,7 @@ class Convert(object): text = html_tag.text if text: block.add_text(text, tag_style, ignore_leading_whitespace=True, is_parent_style=True, link=self.current_link, lang=self.current_lang) - elif tagname == 'li' and len(html_tag) and barename(html_tag[0].tag) in ('ul', 'ol') and len(html_tag[0]): + elif tagname == 'li' and len(html_tag) and parse_utils.barename(html_tag[0].tag) in ('ul', 'ol') and len(html_tag[0]): block.force_not_empty = True def add_inline_tag(self, tagname, html_tag, tag_style, stylizer): diff --git a/ebook_converter/ebooks/fb2/fb2ml.py b/ebook_converter/ebooks/fb2/fb2ml.py index 9db7965..0749be2 100644 --- a/ebook_converter/ebooks/fb2/fb2ml.py +++ b/ebook_converter/ebooks/fb2/fb2ml.py @@ -9,10 +9,10 @@ import uuid from lxml import etree +from ebook_converter import constants as const from ebook_converter import prepare_string_for_xml from ebook_converter.constants_old import __appname__, __version__ from ebook_converter.utils.localization import lang_as_iso639_1 -from ebook_converter.utils.xml_parse import safe_xml_fromstring from ebook_converter.utils.img import save_cover_data_to from ebook_converter.ebooks.oeb.base import urlnormalize from ebook_converter.polyglot.binary import as_base64_unicode @@ -36,9 +36,10 @@ class FB2MLizer(object): def reset_state(self): # Used to ensure text and tags are always within

and

self.in_p = False - # Mapping of image names. OEB allows for images to have the same name but be stored - # in different directories. FB2 images are all in a flat layout so we rename all images - # into a sequential numbering system to ensure there are no collisions between image names. + # Mapping of image names. OEB allows for images to have the same name + # but be stored in different directories. FB2 images are all in a flat + # layout so we rename all images into a sequential numbering system to + # ensure there are no collisions between image names. self.image_hrefs = {} # Mapping of toc items and their self.toc = {} @@ -68,13 +69,15 @@ class FB2MLizer(object): output = self.clean_text('\n'.join(output)) if self.opts.pretty_print: - output = etree.tostring(safe_xml_fromstring(output), encoding='unicode', pretty_print=True) + output = etree.tostring(etree.fromstring(output), + encoding='unicode', pretty_print=True) return '\n' + output def clean_text(self, text): # Remove pointless tags, but keep their contents. - text = re.sub(r'(?mu)<(strong|emphasis|strikethrough|sub|sup)>(\s*)', r'\2', text) + text = re.sub(r'(?mu)<(strong|emphasis|strikethrough|sub|sup)>' + r'(\s*)', r'\2', text) # Clean up paragraphs endings. text = re.sub(r'(?mu)\s+

', '

', text) @@ -96,7 +99,8 @@ class FB2MLizer(object): text = re.sub(r'(?mu)\s*

', '\n

', text) # Put line breaks between paragraphs on a separate line. - text = re.sub(r'(?mu)\s*', r'\n', text) + text = re.sub(r'(?mu)\s*', + r'\n', text) text = re.sub(r'(?mu)\s*

', '\n

', text) # Remove empty sections. @@ -115,7 +119,9 @@ class FB2MLizer(object): metadata['title'] = self.oeb_book.metadata.title[0].value metadata['appname'] = __appname__ metadata['version'] = __version__ - metadata['date'] = '%i.%i.%i' % (datetime.now().day, datetime.now().month, datetime.now().year) + metadata['date'] = '%i.%i.%i' % (datetime.now().day, + datetime.now().month, + datetime.now().year) if self.oeb_book.metadata.language: lc = lang_as_iso639_1(self.oeb_book.metadata.language[0].value) if not lc: @@ -143,31 +149,38 @@ class FB2MLizer(object): author_middle = ' '.join(author_parts[1:-1]) author_last = author_parts[-1] metadata['author'] += '' - metadata['author'] += '%s' % prepare_string_for_xml(author_first) + metadata['author'] += ('%s' % + prepare_string_for_xml(author_first)) if author_middle: - metadata['author'] += '%s' % prepare_string_for_xml(author_middle) - metadata['author'] += '%s' % prepare_string_for_xml(author_last) + metadata['author'] += ('%s' % + prepare_string_for_xml(author_middle)) + metadata['author'] += ('%s' % + prepare_string_for_xml(author_last)) metadata['author'] += '' if not metadata['author']: - metadata['author'] = '' + metadata['author'] = ('' + '') metadata['keywords'] = '' tags = list(map(str, self.oeb_book.metadata.subject)) if tags: tags = ', '.join(prepare_string_for_xml(x) for x in tags) - metadata['keywords'] = '%s'%tags + metadata['keywords'] = '%s' % tags metadata['sequence'] = '' if self.oeb_book.metadata.series: index = '1' if self.oeb_book.metadata.series_index: index = self.oeb_book.metadata.series_index[0] - metadata['sequence'] = '' % (prepare_string_for_xml('%s' % self.oeb_book.metadata.series[0]), index) + seq = prepare_string_for_xml(str(self.oeb_book.metadata.series[0])) + metadata['sequence'] = ('' % + (seq, index)) year = publisher = isbn = '' identifiers = self.oeb_book.metadata['identifier'] for x in identifiers: - if x.get(OPF('scheme'), None).lower() == 'uuid' or str(x).startswith('urn:uuid:'): + 
if (x.get(OPF('scheme'), None).lower() == 'uuid' or + str(x).startswith('urn:uuid:')): metadata['id'] = str(x).split(':')[-1] break if metadata['id'] is None: @@ -179,22 +192,27 @@ class FB2MLizer(object): except IndexError: pass else: - year = '%s' % prepare_string_for_xml(date.value.partition('-')[0]) + year = ('%s' % + prepare_string_for_xml(date.value.partition('-')[0])) try: publisher = self.oeb_book.metadata['publisher'][0] except IndexError: pass else: - publisher = '%s' % prepare_string_for_xml(publisher.value) + publisher = ('%s' % + prepare_string_for_xml(publisher.value)) for x in identifiers: if x.get(OPF('scheme'), None).lower() == 'isbn': isbn = '%s' % prepare_string_for_xml(x.value) - metadata['year'], metadata['isbn'], metadata['publisher'] = year, isbn, publisher + metadata['year'] = year + metadata['isbn'] = isbn + metadata['publisher'] = publisher for key, value in metadata.items(): - if key not in ('author', 'cover', 'sequence', 'keywords', 'year', 'publisher', 'isbn'): + if key not in ('author', 'cover', 'sequence', 'keywords', 'year', + 'publisher', 'isbn'): metadata[key] = prepare_string_for_xml(value) try: @@ -203,7 +221,8 @@ class FB2MLizer(object): metadata['comments'] = '' else: from ebook_converter.utils.html2text import html2text - metadata['comments'] = '

{}

'.format(prepare_string_for_xml(html2text(comments.value).strip())) + annot = prepare_string_for_xml(html2text(comments.value).strip()) + metadata['comments'] = f'

{annot}

' # Keep the indentation level of the description the same as the body. header = textwrap.dedent('''\ @@ -245,7 +264,9 @@ class FB2MLizer(object): cover_href = None # Get the raster cover if it's available. - if self.oeb_book.metadata.cover and str(self.oeb_book.metadata.cover[0]) in self.oeb_book.manifest.ids: + if (self.oeb_book.metadata.cover and + str(self.oeb_book.metadata.cover[0]) in + self.oeb_book.manifest.ids): id = str(self.oeb_book.metadata.cover[0]) cover_item = self.oeb_book.manifest.ids[id] if cover_item.media_type in OEB_RASTER_IMAGES: @@ -259,7 +280,8 @@ class FB2MLizer(object): page_name = 'cover' if page_name: - cover_item = self.oeb_book.manifest.hrefs[self.oeb_book.guide[page_name].href] + key = self.oeb_book.guide[page_name].href + cover_item = self.oeb_book.manifest.hrefs[key] # Get the first image in the page for img in cover_item.xpath('//img'): cover_href = cover_item.abshref(img.get('src')) @@ -267,10 +289,11 @@ class FB2MLizer(object): if cover_href: # Only write the image tag if it is in the manifest. - if cover_href in self.oeb_book.manifest.hrefs and cover_href not in self.image_hrefs: + if (cover_href in self.oeb_book.manifest.hrefs and + cover_href not in self.image_hrefs): self.image_hrefs[cover_href] = 'img_%s' % len(self.image_hrefs) - return '' % self.image_hrefs[cover_href] - + return ('' % + self.image_hrefs[cover_href]) return '' def get_text(self): @@ -285,16 +308,20 @@ class FB2MLizer(object): for item in self.oeb_book.spine: self.log.debug('Converting %s to FictionBook2 XML' % item.href) - stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts, self.opts.output_profile) + stylizer = Stylizer(item.data, item.href, self.oeb_book, self.opts, + self.opts.output_profile) - # Start a
if we must sectionize each file or if the TOC references this page + # Start a
if we must sectionize each file or if the TOC + # references this page page_section_open = False - if self.opts.sectionize == 'files' or None in self.toc.get(item.href, ()): + if (self.opts.sectionize == 'files' or + None in self.toc.get(item.href, ())): text.append('
') page_section_open = True self.section_level += 1 - text += self.dump_text(item.data.find(XHTML('body')), stylizer, item) + text += self.dump_text(item.data.find(XHTML('body')), stylizer, + item) if page_section_open: text.append('
') @@ -309,20 +336,23 @@ class FB2MLizer(object): return ''.join(text) def fb2mlize_images(self): - ''' - This function uses the self.image_hrefs dictionary mapping. It is populated by the dump_text function. - ''' + """ + This function uses the self.image_hrefs dictionary mapping. It is + populated by the dump_text function. + """ from ebook_converter.ebooks.oeb.base import OEB_RASTER_IMAGES images = [] for item in self.oeb_book.manifest: - # Don't write the image if it's not referenced in the document's text. + # Don't write the image if it's not referenced in the document's + # text. if item.href not in self.image_hrefs: continue if item.media_type in OEB_RASTER_IMAGES: try: if item.media_type not in ('image/jpeg', 'image/png'): - imdata = save_cover_data_to(item.data, compression_quality=70) + imdata = save_cover_data_to(item.data, + compression_quality=70) raw_data = as_base64_unicode(imdata) content_type = 'image/jpeg' else: @@ -330,11 +360,14 @@ class FB2MLizer(object): content_type = item.media_type # Don't put the encoded image on a single line. step = 72 - data = '\n'.join(raw_data[i:i+step] for i in range(0, len(raw_data), step)) - images.append('%s' % (self.image_hrefs[item.href], content_type, data)) + data = '\n'.join(raw_data[i:i+step] + for i in range(0, len(raw_data), step)) + images.append('%s' + '' % (self.image_hrefs[item.href], + content_type, data)) except Exception as e: self.log.error('Error: Could not include file %s because ' - '%s.' % (item.href, e)) + '%s.' % (item.href, e)) return '\n'.join(images) def create_flat_toc(self, nodes, level): @@ -391,26 +424,31 @@ class FB2MLizer(object): def dump_text(self, elem_tree, stylizer, page, tag_stack=[]): ''' - This function is intended to be used in a recursive manner. dump_text will - run though all elements in the elem_tree and call itself on each element. + This function is intended to be used in a recursive manner. 
dump_text + will run though all elements in the elem_tree and call itself on each + element. self.image_hrefs will be populated by calling this function. - @param elem_tree: etree representation of XHTML content to be transformed. + @param elem_tree: etree representation of XHTML content to be + transformed. @param stylizer: Used to track the style of elements within the tree. @param page: OEB page used to determine absolute urls. @param tag_stack: List of open FB2 tags to take into account. @return: List of string representing the XHTML converted to FB2 markup. ''' - from ebook_converter.ebooks.oeb.base import XHTML_NS, barename, namespace + from ebook_converter.ebooks.oeb.base import barename + from ebook_converter.ebooks.oeb.base import namespace elem = elem_tree - # Ensure what we are converting is not a string and that the fist tag is part of the XHTML namespace. - if not isinstance(elem_tree.tag, (str, bytes)) or namespace(elem_tree.tag) != XHTML_NS: + # Ensure what we are converting is not a string and that the fist tag + # is part of the XHTML namespace. + if (not isinstance(elem_tree.tag, (str, bytes)) or + namespace(elem_tree.tag) != const.XHTML_NS): p = elem.getparent() - if p is not None and isinstance(p.tag, (str, bytes)) and namespace(p.tag) == XHTML_NS \ - and elem.tail: + if (p is not None and isinstance(p.tag, (str, bytes)) and + namespace(p.tag) == const.XHTML_NS and elem.tail): return [elem.tail] return [] @@ -423,7 +461,8 @@ class FB2MLizer(object): # FB2 generated output. fb2_out = [] - # FB2 tags in the order they are opened. This will be used to close the tags. + # FB2 tags in the order they are opened. This will be used to close + # the tags. 
tags = [] # First tag in tree tag = barename(elem_tree.tag) @@ -432,26 +471,31 @@ class FB2MLizer(object): ems = int(round((float(style.marginTop) / style.fontSize) - 1)) if ems < 0: ems = 0 - except: + except Exception: ems = 0 # Convert TOC entries to s and add <section>s if self.opts.sectionize == 'toc': - # A section cannot be a child of any other element than another section, - # so leave the tag alone if there are parents + # A section cannot be a child of any other element than another + # section, so leave the tag alone if there are parents if not tag_stack: - # There are two reasons to start a new section here: the TOC pointed to - # this page (then we use the first non-<body> on the page as a <title>), or - # the TOC pointed to a specific element + # There are two reasons to start a new section here: the TOC + # pointed to this page (then we use the first non-<body> on + # the page as a <title>), or the TOC pointed to a specific + # element newlevel = 0 toc_entry = self.toc.get(page.href, None) if toc_entry is not None: if None in toc_entry: - if tag != 'body' and hasattr(elem_tree, 'text') and elem_tree.text: + if (tag != 'body' and hasattr(elem_tree, 'text') and + elem_tree.text): newlevel = 1 self.toc[page.href] = None - if not newlevel and elem_tree.attrib.get('id', None) is not None: - newlevel = toc_entry.get(elem_tree.attrib.get('id', None), None) + if (not newlevel and + elem_tree.attrib.get('id', None) is not None): + newlevel = toc_entry.get(elem_tree.attrib.get('id', + None), + None) # Start a new section if necessary if newlevel: @@ -463,13 +507,14 @@ class FB2MLizer(object): fb2_out.append('<title>') tags.append('title') if self.section_level == 0: - # If none of the prior processing made a section, make one now to be FB2 spec compliant + # If none of the prior processing made a section, make one now + # to be FB2 spec compliant fb2_out.append('<section>') self.section_level += 1 # Process the XHTML tag and styles. Converted to an FB2 tag. 
- # Use individual if statement not if else. There can be - # only one XHTML tag but it can have multiple styles. + # Use individual if statement not if else. There can be only one XHTML + # tag but it can have multiple styles. if tag == 'img' and elem_tree.attrib.get('src', None): # Only write the image tag if it is in the manifest. ihref = urlnormalize(page.abshref(elem_tree.attrib['src'])) @@ -479,7 +524,8 @@ class FB2MLizer(object): p_txt, p_tag = self.ensure_p() fb2_out += p_txt tags += p_tag - fb2_out.append('<image l:href="#%s"/>' % self.image_hrefs[ihref]) + fb2_out.append('<image l:href="#%s"/>' % + self.image_hrefs[ihref]) else: self.log.warn(u'Ignoring image not in manifest: %s' % ihref) if tag in ('br', 'hr') or ems >= 1: @@ -513,7 +559,8 @@ class FB2MLizer(object): p_txt, p_tag = self.ensure_p() fb2_out += p_txt tags += p_tag - fb2_out.append('<a l:href="%s">' % urlnormalize(elem_tree.attrib['href'])) + fb2_out.append('<a l:href="%s">' % + urlnormalize(elem_tree.attrib['href'])) tags.append('a') if tag == 'b' or style['font-weight'] in ('bold', 'bolder'): s_out, s_tags = self.handle_simple_tag('strong', tag_stack+tags) @@ -523,8 +570,10 @@ class FB2MLizer(object): s_out, s_tags = self.handle_simple_tag('emphasis', tag_stack+tags) fb2_out += s_out tags += s_tags - if tag in ('del', 'strike') or style['text-decoration'] == 'line-through': - s_out, s_tags = self.handle_simple_tag('strikethrough', tag_stack+tags) + if (tag in ('del', 'strike') or + style['text-decoration'] == 'line-through'): + s_out, s_tags = self.handle_simple_tag('strikethrough', + tag_stack+tags) fb2_out += s_out tags += s_tags if tag == 'sub': @@ -552,7 +601,8 @@ class FB2MLizer(object): tags.reverse() fb2_out += self.close_tags(tags) - # Process element text that comes after the close of the XHTML tag but before the next XHTML tag. + # Process element text that comes after the close of the XHTML tag but + # before the next XHTML tag. 
if hasattr(elem_tree, 'tail') and elem_tree.tail: if not self.in_p: fb2_out.append('<p>') diff --git a/ebook_converter/ebooks/htmlz/oeb2html.py b/ebook_converter/ebooks/htmlz/oeb2html.py index f0c5640..42b0742 100644 --- a/ebook_converter/ebooks/htmlz/oeb2html.py +++ b/ebook_converter/ebooks/htmlz/oeb2html.py @@ -9,8 +9,9 @@ from functools import partial from lxml import html from ebook_converter import prepare_string_for_xml -from ebook_converter.ebooks.oeb.base import ( - XHTML, XHTML_NS, SVG_NS, barename, namespace, OEB_IMAGES, XLINK, rewrite_links, urlnormalize) +from ebook_converter import constants as const +from ebook_converter.ebooks.oeb import base +from ebook_converter.ebooks.oeb import parse_utils from ebook_converter.ebooks.oeb.stylizer import Stylizer from ebook_converter.utils.logging import default_log from ebook_converter.polyglot.builtins import as_bytes @@ -61,9 +62,9 @@ class OEB2HTML(object): for item in oeb_book.spine: self.log.debug('Converting %s to HTML...' % item.href) self.rewrite_ids(item.data, item) - rewrite_links(item.data, partial(self.rewrite_link, page=item)) + base.rewrite_links(item.data, partial(self.rewrite_link, page=item)) stylizer = Stylizer(item.data, item.href, oeb_book, self.opts) - output += self.dump_text(item.data.find(XHTML('body')), stylizer, item) + output += self.dump_text(item.data.find(base.tag('xhtml', 'body')), stylizer, item) output.append('\n\n') output.append('</body></html>') return ''.join(output) @@ -80,7 +81,7 @@ class OEB2HTML(object): def map_resources(self, oeb_book): for item in oeb_book.manifest: - if item.media_type in OEB_IMAGES: + if item.media_type in base.OEB_IMAGES: if item.href not in self.images: ext = os.path.splitext(item.href)[1] fname = '%s%s' % (len(self.images), ext) @@ -88,9 +89,9 @@ class OEB2HTML(object): self.images[item.href] = fname if item in oeb_book.spine: self.get_link_id(item.href) - root = item.data.find(XHTML('body')) + root = item.data.find(base.tag('xhtml', 'body')) 
link_attrs = set(html.defs.link_attrs) - link_attrs.add(XLINK('href')) + link_attrs.add(base.tag('xlink', 'href')) for el in root.iter(): attribs = el.attrib try: @@ -108,7 +109,7 @@ class OEB2HTML(object): def rewrite_link(self, url, page=None): if not page: return url - abs_url = page.abshref(urlnormalize(url)) + abs_url = page.abshref(base.urlnormalize(url)) if abs_url in self.images: return 'images/%s' % self.images[abs_url] if abs_url in self.links: @@ -121,7 +122,7 @@ class OEB2HTML(object): tag = el.tag except UnicodeDecodeError: continue - if tag == XHTML('body'): + if tag == base.tag('xhtml', 'body'): el.attrib['id'] = self.get_link_id(page.href)[1:] continue if 'id' in el.attrib: @@ -156,9 +157,9 @@ class OEB2HTMLNoCSSizer(OEB2HTML): # We can only processes tags. If there isn't a tag return any text. if not isinstance(elem.tag, (str, bytes)) \ - or namespace(elem.tag) not in (XHTML_NS, SVG_NS): + or parse_utils.namespace(elem.tag) not in (const.XHTML_NS, const.SVG_NS): p = elem.getparent() - if p is not None and isinstance(p.tag, (str, bytes)) and namespace(p.tag) in (XHTML_NS, SVG_NS) \ + if p is not None and isinstance(p.tag, (str, bytes)) and parse_utils.namespace(p.tag) in (const.XHTML_NS, const.SVG_NS) \ and elem.tail: return [elem.tail] return [''] @@ -167,7 +168,7 @@ class OEB2HTMLNoCSSizer(OEB2HTML): text = [''] style = stylizer.style(elem) tags = [] - tag = barename(elem.tag) + tag = parse_utils.barename(elem.tag) attribs = elem.attrib if tag == 'body': @@ -245,9 +246,9 @@ class OEB2HTMLInlineCSSizer(OEB2HTML): # We can only processes tags. If there isn't a tag return any text. 
if not isinstance(elem.tag, (str, bytes)) \ - or namespace(elem.tag) not in (XHTML_NS, SVG_NS): + or parse_utils.namespace(elem.tag) not in (const.XHTML_NS, const.SVG_NS): p = elem.getparent() - if p is not None and isinstance(p.tag, (str, bytes)) and namespace(p.tag) in (XHTML_NS, SVG_NS) \ + if p is not None and isinstance(p.tag, (str, bytes)) and parse_utils.namespace(p.tag) in (const.XHTML_NS, const.SVG_NS) \ and elem.tail: return [elem.tail] return [''] @@ -256,7 +257,7 @@ class OEB2HTMLInlineCSSizer(OEB2HTML): text = [''] style = stylizer.style(elem) tags = [] - tag = barename(elem.tag) + tag = parse_utils.barename(elem.tag) attribs = elem.attrib style_a = '%s' % style @@ -327,9 +328,9 @@ class OEB2HTMLClassCSSizer(OEB2HTML): for item in oeb_book.spine: self.log.debug('Converting %s to HTML...' % item.href) self.rewrite_ids(item.data, item) - rewrite_links(item.data, partial(self.rewrite_link, page=item)) + base.rewrite_links(item.data, partial(self.rewrite_link, page=item)) stylizer = Stylizer(item.data, item.href, oeb_book, self.opts) - output += self.dump_text(item.data.find(XHTML('body')), stylizer, item) + output += self.dump_text(item.data.find(base.tag('xhtml', 'body')), stylizer, item) output.append('\n\n') if self.opts.htmlz_class_style == 'external': css = u'<link href="style.css" rel="stylesheet" type="text/css" />' @@ -348,9 +349,9 @@ class OEB2HTMLClassCSSizer(OEB2HTML): # We can only processes tags. If there isn't a tag return any text. 
if not isinstance(elem.tag, (str, bytes)) \ - or namespace(elem.tag) not in (XHTML_NS, SVG_NS): + or parse_utils.namespace(elem.tag) not in (const.XHTML_NS, const.SVG_NS): p = elem.getparent() - if p is not None and isinstance(p.tag, (str, bytes)) and namespace(p.tag) in (XHTML_NS, SVG_NS) \ + if p is not None and isinstance(p.tag, (str, bytes)) and parse_utils.namespace(p.tag) in (const.XHTML_NS, const.SVG_NS) \ and elem.tail: return [elem.tail] return [''] @@ -358,7 +359,7 @@ class OEB2HTMLClassCSSizer(OEB2HTML): # Setup our variables. text = [''] tags = [] - tag = barename(elem.tag) + tag = parse_utils.barename(elem.tag) attribs = elem.attrib if tag == 'body': diff --git a/ebook_converter/ebooks/metadata/fb2.py b/ebook_converter/ebooks/metadata/fb2.py index f38c8e0..43bc493 100644 --- a/ebook_converter/ebooks/metadata/fb2.py +++ b/ebook_converter/ebooks/metadata/fb2.py @@ -1,38 +1,32 @@ """ Read meta information from fb2 files """ -import os, random -from functools import partial -from string import ascii_letters, digits +import functools +import os +import random +import string from lxml import etree from ebook_converter.utils.date import parse_only_date from ebook_converter.utils.img import save_cover_data_to -from ebook_converter.utils.xml_parse import safe_xml_fromstring from ebook_converter.utils.imghdr import identify -from ebook_converter import guess_type, guess_all_extensions, prints, force_unicode +from ebook_converter import guess_type, guess_all_extensions, prints, \ + force_unicode from ebook_converter.ebooks.metadata import MetaInformation, check_isbn from ebook_converter.ebooks.chardet import xml_to_unicode from ebook_converter.polyglot.binary import as_base64_unicode -__license__ = 'GPL v3' -__copyright__ = ('2011, Roman Mukhin <ramses_ru at hotmail.com>, ' - '2008, Anatoly Shipitsin <norguhtar at gmail.com>') +NAMESPACES = {'fb2': 'http://www.gribuser.ru/xml/fictionbook/2.0', + 'fb21': 'http://www.gribuser.ru/xml/fictionbook/2.1', + 'xlink': 
'http://www.w3.org/1999/xlink'} - -NAMESPACES = { - 'fb2' : 'http://www.gribuser.ru/xml/fictionbook/2.0', - 'fb21' : 'http://www.gribuser.ru/xml/fictionbook/2.1', - 'xlink' : 'http://www.w3.org/1999/xlink' -} - -tostring = partial(etree.tostring, method='text', encoding='unicode') +tostring = functools.partial(etree.tostring, method='text', encoding='unicode') def XLINK(tag): - return '{%s}%s'%(NAMESPACES['xlink'], tag) + return '{%s}%s' % (NAMESPACES['xlink'], tag) class Context(object): @@ -52,7 +46,7 @@ class Context(object): return etree.XPath(*args, namespaces=self.namespaces) def get_or_create(self, parent, tag, attribs={}, at_start=True): - xpathstr='./fb:'+tag + xpathstr = './fb:'+tag for n, v in attribs.items(): xpathstr += '[@%s="%s"]' % (n, v) ans = self.XPath(xpathstr)(parent) @@ -73,7 +67,7 @@ class Context(object): def clear_meta_tags(self, doc, tag): for parent in ('title-info', 'src-title-info', 'publish-info'): - for x in self.XPath('//fb:%s/fb:%s'%(parent, tag))(doc): + for x in self.XPath('//fb:%s/fb:%s' % (parent, tag))(doc): x.getparent().remove(x) def text2fb2(self, parent, text): @@ -117,42 +111,41 @@ def get_metadata(stream): book_title = str(book_title) else: book_title = force_unicode(os.path.splitext( - os.path.basename(getattr(stream, 'name', - 'Unknown')))[0]) + os.path.basename(getattr(stream, 'name', 'Unknown')))[0]) mi = MetaInformation(book_title, authors) try: _parse_cover(root, mi, ctx) - except: + except Exception: pass try: _parse_comments(root, mi, ctx) - except: + except Exception: pass try: _parse_tags(root, mi, ctx) - except: + except Exception: pass try: _parse_series(root, mi, ctx) - except: + except Exception: pass try: _parse_isbn(root, mi, ctx) - except: + except Exception: pass try: _parse_publisher(root, mi, ctx) - except: + except Exception: pass try: _parse_pubdate(root, mi, ctx) - except: + except Exception: pass try: _parse_language(root, mi, ctx) - except: + except Exception: pass return mi @@ -160,11 +153,11 @@ 
def get_metadata(stream): def _parse_authors(root, ctx): authors = [] - # pick up authors but only from 1 secrion <title-info>; otherwise it is not consistent! - # Those are fallbacks: <src-title-info>, <document-info> + # pick up authors but only from 1 secrion <title-info>; otherwise it is + # not consistent! Those are fallbacks: <src-title-info>, <document-info> author = None for author_sec in ['title-info', 'src-title-info', 'document-info']: - for au in ctx.XPath('//fb:%s/fb:author'%author_sec)(root): + for au in ctx.XPath('//fb:%s/fb:author' % author_sec)(root): author = _parse_author(au, ctx) if author: authors.append(author) @@ -207,24 +200,26 @@ def _parse_book_title(root, ctx): xp_ti = '//fb:title-info/fb:book-title/text()' xp_pi = '//fb:publish-info/fb:book-title/text()' xp_si = '//fb:src-title-info/fb:book-title/text()' - book_title = ctx.XPath('normalize-space(%s|%s|%s)' % (xp_ti, xp_pi, xp_si))(root) + book_title = ctx.XPath('normalize-space(%s|%s|%s)' % + (xp_ti, xp_pi, xp_si))(root) return book_title def _parse_cover(root, mi, ctx): # pickup from <title-info>, if not exists it fallbacks to <src-title-info> - imgid = ctx.XPath('substring-after(string(//fb:coverpage/fb:image/@xlink:href), "#")')(root) + imgid = ctx.XPath('substring-after(string(//fb:coverpage/fb:image/' + '@xlink:href), "#")')(root) if imgid: try: _parse_cover_data(root, imgid, mi, ctx) - except: + except Exception: pass def _parse_cover_data(root, imgid, mi, ctx): from ebook_converter.ebooks.fb2 import base64_decode - elm_binary = ctx.XPath('//fb:binary[@id="%s"]'%imgid)(root) + elm_binary = ctx.XPath('//fb:binary[@id="%s"]' % imgid)(root) if elm_binary: mimetype = elm_binary[0].get('content-type', 'image/jpeg') mime_extensions = guess_all_extensions(mimetype) @@ -241,12 +236,13 @@ def _parse_cover_data(root, imgid, mi, ctx): fmt = identify(cdata)[0] mi.cover_data = (fmt, cdata) else: - prints("WARNING: Unsupported coverpage mime-type '%s' (id=#%s)" % (mimetype, imgid)) + 
prints("WARNING: Unsupported coverpage mime-type '%s' (id=#%s)" % + (mimetype, imgid)) def _parse_tags(root, mi, ctx): - # pick up genre but only from 1 secrion <title-info>; otherwise it is not consistent! - # Those are fallbacks: <src-title-info> + # pick up genre but only from 1 secrion <title-info>; otherwise it is not + # consistent! Those are fallbacks: <src-title-info> for genre_sec in ['title-info', 'src-title-info']: # -- i18n Translations-- ? tags = ctx.XPath('//fb:%s/fb:genre/text()' % genre_sec)(root) @@ -267,16 +263,20 @@ def _parse_series(root, mi, ctx): mi.series = elms_sequence[0].get('name', None) if mi.series: try: - mi.series_index = float('.'.join(elms_sequence[0].get('number', None).split()[:2])) + i = float('.'.join(elms_sequence[0].get('number', + None).split()[:2])) + mi.series_index = i except Exception: pass def _parse_isbn(root, mi, ctx): - # some people try to put several isbn in this field, but it is not allowed. try to stick to the 1-st one in this case + # some people try to put several isbn in this field, but it is not + # allowed. try to stick to the 1-st one in this case isbn = ctx.XPath('normalize-space(//fb:publish-info/fb:isbn/text())')(root) if isbn: - # some people try to put several isbn in this field, but it is not allowed. try to stick to the 1-st one in this case + # some people try to put several isbn in this field, but it is not + # allowed. 
try to stick to the 1-st one in this case if ',' in isbn: isbn = isbn[:isbn.index(',')] if check_isbn(isbn): @@ -284,9 +284,11 @@ def _parse_isbn(root, mi, ctx): def _parse_comments(root, mi, ctx): - # pick up annotation but only from 1 section <title-info>; fallback: <src-title-info> + # pick up annotation but only from 1 section <title-info>; + # fallback: <src-title-info> for annotation_sec in ['title-info', 'src-title-info']: - elms_annotation = ctx.XPath('//fb:%s/fb:annotation' % annotation_sec)(root) + elms_annotation = ctx.XPath('//fb:%s/fb:annotation' % + annotation_sec)(root) if elms_annotation: mi.comments = tostring(elms_annotation[0]) # TODO: tags i18n, xslt? @@ -294,7 +296,8 @@ def _parse_comments(root, mi, ctx): def _parse_publisher(root, mi, ctx): - publisher = ctx.XPath('string(//fb:publish-info/fb:publisher/text())')(root) + publisher = ctx.XPath('string(//fb:publish-info/fb:publisher/' + 'text())')(root) if publisher: mi.publisher = publisher @@ -315,7 +318,7 @@ def _parse_language(root, mi, ctx): def _get_fbroot(raw): raw = xml_to_unicode(raw, strip_encoding_pats=True)[0] - root = safe_xml_fromstring(raw) + root = etree.fromstring(raw) return ensure_namespace(root) @@ -348,10 +351,12 @@ def _set_authors(title_info, mi, ctx): ctx.create_tag(atag, 'first-name').text = author_parts[0] author_parts = author_parts[1:] if len(author_parts) > 1: - ctx.create_tag(atag, 'middle-name', at_start=False).text = author_parts[0] + ctx.create_tag(atag, 'middle-name', + at_start=False).text = author_parts[0] author_parts = author_parts[1:] if author_parts: - ctx.create_tag(atag, 'last-name', at_start=False).text = ' '.join(author_parts) + a = ' '.join(author_parts) + ctx.create_tag(atag, 'last-name', at_start=False).text = a def _set_tags(title_info, mi, ctx): @@ -368,12 +373,12 @@ def _set_series(title_info, mi, ctx): seq = ctx.get_or_create(title_info, 'sequence') seq.set('name', mi.series) try: - seq.set('number', '%g'%mi.series_index) - except: + 
seq.set('number', '%g' % mi.series_index) + except Exception: seq.set('number', '1') -def _rnd_name(size=8, chars=ascii_letters + digits): +def _rnd_name(size=8, chars=string.ascii_letters + string.digits): return ''.join(random.choice(chars) for x in range(size)) @@ -396,7 +401,9 @@ def _set_cover(title_info, mi, ctx): cim_filename = _rnd_pic_file_name('cover') cim_tag.attrib[XLINK('href')] = '#' + cim_filename fb2_root = cim_tag.getroottree().getroot() - cim_binary = ctx.get_or_create(fb2_root, 'binary', attribs={'id': cim_filename}, at_start=False) + cim_binary = ctx.get_or_create(fb2_root, 'binary', + attribs={'id': cim_filename}, + at_start=False) cim_binary.attrib['content-type'] = 'image/jpeg' cim_binary.text = _encode_into_jpeg(mi.cover_data[1]) @@ -425,7 +432,8 @@ def set_metadata(stream, mi, apply_null=False, update_timestamp=False): # single quotes in xml declaration. Sigh. See # https://www.mobileread.com/forums/showthread.php?p=2273184#post2273184 raw = b'<?xml version="1.0" encoding="UTF-8"?>\n' - raw += etree.tostring(root, method='xml', encoding='utf-8', xml_declaration=False) + raw += etree.tostring(root, method='xml', encoding='utf-8', + xml_declaration=False) stream.seek(0) stream.truncate() @@ -449,6 +457,7 @@ def ensure_namespace(doc): if bare_tags: import re raw = etree.tostring(doc, encoding='unicode') - raw = re.sub(r'''<(description|body)\s+xmlns=['"]['"]>''', r'<\1>', raw) - doc = safe_xml_fromstring(raw) + raw = re.sub(r'''<(description|body)\s+xmlns=['"]['"]>''', r'<\1>', + raw) + doc = etree.fromstring(raw) return doc diff --git a/ebook_converter/ebooks/metadata/opf2.py b/ebook_converter/ebooks/metadata/opf2.py index 586164b..9c85d23 100644 --- a/ebook_converter/ebooks/metadata/opf2.py +++ b/ebook_converter/ebooks/metadata/opf2.py @@ -12,29 +12,31 @@ import sys import unittest import urllib.parse import uuid +import traceback +import textwrap from lxml import etree +from lxml.builder import ElementMaker +from ebook_converter.ebooks.oeb 
import base +from ebook_converter import constants as const from ebook_converter.ebooks import escape_xpath_attr from ebook_converter.constants_old import __appname__, __version__, \ filesystem_encoding from ebook_converter.ebooks.metadata.toc import TOC -from ebook_converter.ebooks.metadata.utils import parse_opf, pretty_print_opf as _pretty_print -from ebook_converter.ebooks.metadata import string_to_authors, MetaInformation, check_isbn +from ebook_converter.ebooks.metadata.utils import parse_opf, \ + pretty_print_opf as _pretty_print +from ebook_converter.ebooks.metadata import string_to_authors, \ + MetaInformation, check_isbn from ebook_converter.ebooks.metadata.book.base import Metadata from ebook_converter.utils.date import parse_date, isoformat from ebook_converter.utils.localization import get_lang, canonicalize_lang from ebook_converter import prints, guess_type from ebook_converter.utils.cleantext import clean_ascii_chars, clean_xml_chars from ebook_converter.utils.config import tweaks -from ebook_converter.utils.xml_parse import safe_xml_fromstring from ebook_converter.polyglot.urllib import unquote -__license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' -__docformat__ = 'restructuredtext en' - pretty_print_opf = False @@ -73,7 +75,7 @@ class Resource(object): # {{{ self.fragment = '' try: self.mime_type = guess_type(href_or_path)[0] - except: + except Exception: self.mime_type = None if self.mime_type is None: self.mime_type = 'application/octet-stream' @@ -94,17 +96,21 @@ class Resource(object): # {{{ if isinstance(pc, str): pc = pc.encode('utf-8') pc = pc.decode('utf-8') - self.path = os.path.abspath(os.path.join(basedir, pc.replace('/', os.sep))) + self.path = os.path.abspath(os.path.join(basedir, + pc.replace('/', + os.sep))) self.fragment = url[-1] def href(self, basedir=None): - ''' - Return a URL pointing to this resource. If it is a file on the filesystem - the URL is relative to `basedir`. 
+ """ + Return a URL pointing to this resource. If it is a file on the + filesystem the URL is relative to `basedir`. - `basedir`: If None, the basedir of this resource is used (see :method:`set_basedir`). - If this resource has no basedir, then the current working directory is used as the basedir. - ''' + `basedir`: If None, the basedir of this resource is used (see + :method:`set_basedir`). + If this resource has no basedir, then the current working directory is + used as the basedir. + """ if basedir is None: if self._basedir: basedir = self._basedir @@ -117,7 +123,8 @@ class Resource(object): # {{{ return frag try: rpath = os.path.relpath(self.path, basedir) - except ValueError: # On windows path and basedir could be on different drives + except ValueError: + # On windows path and basedir could be on different drives rpath = self.path if isinstance(rpath, bytes): rpath = rpath.decode(filesystem_encoding) @@ -130,7 +137,7 @@ class Resource(object): # {{{ return self._basedir def __repr__(self): - return 'Resource(%s, %s)'%(repr(self.path), repr(self.href())) + return 'Resource(%s, %s)' % (repr(self.path), repr(self.href())) # }}} @@ -155,7 +162,7 @@ class ResourceCollection(object): # {{{ def __str__(self): resources = map(repr, self) - return '[%s]'%', '.join(resources) + return '[%s]' % ', '.join(resources) __unicode__ = __str__ def __repr__(self): @@ -211,7 +218,8 @@ class ManifestItem(Resource): # {{{ self.mime_type = val def __unicode__representation__(self): - return u'<item id="%s" href="%s" media-type="%s" />'%(self.id, self.href(), self.media_type) + return (u'<item id="%s" href="%s" media-type="%s" />' % + (self.id, self.href(), self.media_type)) __str__ = __unicode__representation__ @@ -223,7 +231,7 @@ class ManifestItem(Resource): # {{{ return self.href() if index == 1: return self.media_type - raise IndexError('%d out of bounds.'%index) + raise IndexError('%d out of bounds.' 
% index) # }}} @@ -234,7 +242,7 @@ class Manifest(ResourceCollection): # {{{ self.append(ManifestItem.from_opf_manifest_item(item, dir)) id = item.get('id', '') if not id: - id = 'id%d'%self.next_id + id = 'id%d' % self.next_id self[-1].id = id self.next_id += 1 @@ -250,15 +258,16 @@ class Manifest(ResourceCollection): # {{{ @staticmethod def from_paths(entries): - ''' - `entries`: List of (path, mime-type) If mime-type is None it is autodetected - ''' + """ + `entries`: List of (path, mime-type) If mime-type is None it is + autodetected + """ m = Manifest() for path, mt in entries: mi = ManifestItem(path, is_path=True) if mt: mi.mime_type = mt - mi.id = 'id%d'%m.next_id + mi.id = 'id%d' % m.next_id m.next_id += 1 m.append(mi) return m @@ -267,7 +276,7 @@ class Manifest(ResourceCollection): # {{{ mi = ManifestItem(path, is_path=True) if mime_type: mi.mime_type = mime_type - mi.id = 'id%d'%self.next_id + mi.id = 'id%d' % self.next_id self.next_id += 1 self.append(mi) return mi.id @@ -318,13 +327,13 @@ class Spine(ResourceCollection): # {{{ def from_opf_spine_element(itemrefs, manifest): s = Spine(manifest) seen = set() - path_map = {i.id:i.path for i in s.manifest} + path_map = {i.id: i.path for i in s.manifest} for itemref in itemrefs: idref = itemref.get('idref', None) if idref is not None: path = path_map.get(idref) if path and path not in seen: - r = Spine.Item(lambda x:idref, path, is_path=True) + r = Spine.Item(lambda x: idref, path, is_path=True) r.is_linear = itemref.get('linear', 'yes') == 'yes' r.idref = idref s.append(r) @@ -336,8 +345,9 @@ class Spine(ResourceCollection): # {{{ s = Spine(manifest) for path in paths: try: - s.append(Spine.Item(s.manifest.id_for_path, path, is_path=True)) - except: + s.append(Spine.Item(s.manifest.id_for_path, path, + is_path=True)) + except Exception: continue return s @@ -346,10 +356,10 @@ class Spine(ResourceCollection): # {{{ self.manifest = manifest def replace(self, start, end, ids): - ''' - Replace the items between 
start (inclusive) and end (not inclusive) with + """ + Replace the items between start (inclusive) and end (not inclusive) with the items identified by ids. ids can be a list of any length. - ''' + """ items = [] for id in ids: path = self.manifest.path_for_id(id) @@ -381,16 +391,18 @@ class Guide(ResourceCollection): # {{{ @staticmethod def from_opf_resource_item(ref, basedir): - title, href, type = ref.get('title', ''), ref.get('href'), ref.get('type') + title = ref.get('title', '') + href = ref.get('href') + type = ref.get('type') res = Guide.Reference(href, basedir, is_path=True) res.title = title res.type = type return res def __repr__(self): - ans = '<reference type="%s" href="%s" '%(self.type, self.href()) + ans = '<reference type="%s" href="%s" ' % (self.type, self.href()) if self.title: - ans += 'title="%s" '%self.title + ans += 'title="%s" ' % self.title return ans + '/>' @staticmethod @@ -400,7 +412,7 @@ class Guide(ResourceCollection): # {{{ try: ref = Guide.Reference.from_opf_resource_item(ref, base_dir) coll.append(ref) - except: + except Exception: continue return coll @@ -408,7 +420,8 @@ class Guide(ResourceCollection): # {{{ for i in tuple(self): if 'cover' in i.type.lower(): self.remove(i) - for typ in ('cover', 'other.ms-coverimage-standard', 'other.ms-coverimage'): + for typ in ('cover', 'other.ms-coverimage-standard', + 'other.ms-coverimage'): self.append(Guide.Reference(path, is_path=True)) self[-1].type = typ self[-1].title = '' @@ -419,12 +432,12 @@ class Guide(ResourceCollection): # {{{ class MetadataField(object): def __init__(self, name, is_dc=True, formatter=None, none_is=None, - renderer=lambda x: str(x)): - self.name = name - self.is_dc = is_dc + renderer=lambda x: str(x)): + self.name = name + self.is_dc = is_dc self.formatter = formatter - self.none_is = none_is - self.renderer = renderer + self.none_is = none_is + self.renderer = renderer def __real_get__(self, obj, type=None): ans = obj.get_metadata_element(self.name) @@ -436,7 
+449,7 @@ class MetadataField(object): if self.formatter is not None: try: ans = self.formatter(ans) - except: + except Exception: return None if hasattr(ans, 'strip'): ans = ans.strip() @@ -467,7 +480,8 @@ class TitleSortField(MetadataField): matches = obj.title_path(obj.metadata) if matches: for match in matches: - ans = match.get('{%s}file-as'%obj.NAMESPACES['opf'], None) + ans = match.get('{%s}file-as' % + obj.NAMESPACES['opf'], None) if not ans: ans = match.get('file-as', None) if ans: @@ -488,10 +502,11 @@ class TitleSortField(MetadataField): del match.attrib[attr] -def serialize_user_metadata(metadata_elem, all_user_metadata, tail='\n'+(' '*8)): +def serialize_user_metadata(metadata_elem, all_user_metadata, + tail='\n'+(' '*8)): from ebook_converter.utils.config import to_json - from ebook_converter.ebooks.metadata.book.json_codec import (object_to_unicode, - encode_is_multiple) + from ebook_converter.ebooks.metadata.book.json_codec import \ + object_to_unicode, encode_is_multiple for name, fm in all_user_metadata.items(): try: @@ -499,9 +514,8 @@ def serialize_user_metadata(metadata_elem, all_user_metadata, tail='\n'+(' '*8)) encode_is_multiple(fm) fm = object_to_unicode(fm) fm = json.dumps(fm, default=to_json, ensure_ascii=False) - except: + except Exception: prints('Failed to write user metadata:', name) - import traceback traceback.print_exc() continue meta = metadata_elem.makeelement('meta') @@ -514,91 +528,109 @@ def serialize_user_metadata(metadata_elem, all_user_metadata, tail='\n'+(' '*8)) def dump_dict(cats): if not cats: cats = {} - from ebook_converter.ebooks.metadata.book.json_codec import object_to_unicode + from ebook_converter.ebooks.metadata.book.json_codec import \ + object_to_unicode return json.dumps(object_to_unicode(cats), ensure_ascii=False, - skipkeys=True) + skipkeys=True) class OPF(object): # {{{ - MIMETYPE = 'application/oebps-package+xml' - NAMESPACES = { - None: "http://www.idpf.org/2007/opf", - 'dc': 
"http://purl.org/dc/elements/1.1/", - 'opf': "http://www.idpf.org/2007/opf", - } - META = '{%s}meta' % NAMESPACES['opf'] + MIMETYPE = 'application/oebps-package+xml' + NAMESPACES = {None: "http://www.idpf.org/2007/opf", + 'dc': "http://purl.org/dc/elements/1.1/", + 'opf': "http://www.idpf.org/2007/opf"} + META = '{%s}meta' % NAMESPACES['opf'] xpn = NAMESPACES.copy() xpn.pop(None) xpn['re'] = 'http://exslt.org/regular-expressions' XPath = functools.partial(etree.XPath, namespaces=xpn) - CONTENT = XPath('self::*[re:match(name(), "meta$", "i")]/@content') - TEXT = XPath('string()') + CONTENT = XPath('self::*[re:match(name(), "meta$", "i")]/@content') + TEXT = XPath('string()') - metadata_path = XPath('descendant::*[re:match(name(), "metadata", "i")]') - metadata_elem_path = XPath( - 'descendant::*[re:match(name(), concat($name, "$"), "i") or (re:match(name(), "meta$", "i") ' - 'and re:match(@name, concat("^calibre:", $name, "$"), "i"))]') - title_path = XPath('descendant::*[re:match(name(), "title", "i")]') - authors_path = XPath('descendant::*[re:match(name(), "creator", "i") and (@role="aut" or @opf:role="aut" or (not(@role) and not(@opf:role)))]') - bkp_path = XPath('descendant::*[re:match(name(), "contributor", "i") and (@role="bkp" or @opf:role="bkp")]') - tags_path = XPath('descendant::*[re:match(name(), "subject", "i")]') - isbn_path = XPath('descendant::*[re:match(name(), "identifier", "i") and ' - '(re:match(@scheme, "isbn", "i") or re:match(@opf:scheme, "isbn", "i"))]') - pubdate_path = XPath('descendant::*[re:match(name(), "date", "i")]') - raster_cover_path = XPath('descendant::*[re:match(name(), "meta", "i") and ' - 're:match(@name, "cover", "i") and @content]') - guide_cover_path = XPath('descendant::*[local-name()="guide"]/*[local-name()="reference" and re:match(@type, "cover", "i")]/@href') - identifier_path = XPath('descendant::*[re:match(name(), "identifier", "i")]') - application_id_path = XPath('descendant::*[re:match(name(), "identifier", "i") and 
' - '(re:match(@opf:scheme, "calibre|libprs500", "i") or re:match(@scheme, "calibre|libprs500", "i"))]') - uuid_id_path = XPath('descendant::*[re:match(name(), "identifier", "i") and ' - '(re:match(@opf:scheme, "uuid", "i") or re:match(@scheme, "uuid", "i"))]') - languages_path = XPath('descendant::*[local-name()="language"]') + metadata_path = XPath('descendant::*[re:match(name(), "metadata", "i")]') + metadata_elem_path = XPath('descendant::*[re:match(name(), ' + 'concat($name, "$"), "i") or (re:match(name(), ' + '"meta$", "i") and re:match(@name, ' + 'concat("^calibre:", $name, "$"), "i"))]') + title_path = XPath('descendant::*[re:match(name(), "title", "i")]') + authors_path = XPath('descendant::*[re:match(name(), "creator", "i") ' + 'and (@role="aut" or @opf:role="aut" or (not(@role) ' + 'and not(@opf:role)))]') + bkp_path = XPath('descendant::*[re:match(name(), "contributor", "i") and ' + '(@role="bkp" or @opf:role="bkp")]') + tags_path = XPath('descendant::*[re:match(name(), "subject", "i")]') + isbn_path = XPath('descendant::*[re:match(name(), "identifier", "i") and ' + '(re:match(@scheme, "isbn", "i") or re:match(@opf:' + 'scheme, "isbn", "i"))]') + pubdate_path = XPath('descendant::*[re:match(name(), "date", "i")]') + raster_cover_path = XPath('descendant::*[re:match(name(), "meta", "i") ' + 'and re:match(@name, "cover", "i") and ' + '@content]') + guide_cover_path = XPath('descendant::*[local-name()="guide"]/*[local-' + 'name()="reference" and re:match(@type, "cover", ' + '"i")]/@href') + identifier_path = XPath('descendant::*[re:match(name(), "identifier", ' + '"i")]') + application_id_path = XPath('descendant::*[re:match(name(), "identifier",' + ' "i") and (re:match(@opf:scheme, ' + '"calibre|libprs500", "i") or re:match(@' + 'scheme, "calibre|libprs500", "i"))]') + uuid_id_path = XPath('descendant::*[re:match(name(), "identifier", "i") ' + 'and (re:match(@opf:scheme, "uuid", "i") or re:match' + '(@scheme, "uuid", "i"))]') + languages_path = 
XPath('descendant::*[local-name()="language"]') - manifest_path = XPath('descendant::*[re:match(name(), "manifest", "i")]/*[re:match(name(), "item", "i")]') - manifest_ppath = XPath('descendant::*[re:match(name(), "manifest", "i")]') - spine_path = XPath('descendant::*[re:match(name(), "spine", "i")]/*[re:match(name(), "itemref", "i")]') - guide_path = XPath('descendant::*[re:match(name(), "guide", "i")]/*[re:match(name(), "reference", "i")]') + manifest_path = XPath('descendant::*[re:match(name(), "manifest", "i")]/' + '*[re:match(name(), "item", "i")]') + manifest_ppath = XPath('descendant::*[re:match(name(), "manifest", "i")]') + spine_path = XPath('descendant::*[re:match(name(), "spine", "i")]/*[re:' + 'match(name(), "itemref", "i")]') + guide_path = XPath('descendant::*[re:match(name(), "guide", "i")]/*[re:' + 'match(name(), "reference", "i")]') - publisher = MetadataField('publisher') - comments = MetadataField('description') - category = MetadataField('type') - rights = MetadataField('rights') - series = MetadataField('series', is_dc=False) + publisher = MetadataField('publisher') + comments = MetadataField('description') + category = MetadataField('type') + rights = MetadataField('rights') + series = MetadataField('series', is_dc=False) if tweaks['use_series_auto_increment_tweak_when_importing']: - series_index = MetadataField('series_index', is_dc=False, - formatter=float, none_is=None) + series_index = MetadataField('series_index', is_dc=False, + formatter=float, none_is=None) else: - series_index = MetadataField('series_index', is_dc=False, - formatter=float, none_is=1) - title_sort = TitleSortField('title_sort', is_dc=False) - rating = MetadataField('rating', is_dc=False, formatter=float) + series_index = MetadataField('series_index', is_dc=False, + formatter=float, none_is=1) + title_sort = TitleSortField('title_sort', is_dc=False) + rating = MetadataField('rating', is_dc=False, formatter=float) publication_type = MetadataField('publication_type', 
is_dc=False) - timestamp = MetadataField('timestamp', is_dc=False, - formatter=parse_date, renderer=isoformat) + timestamp = MetadataField('timestamp', is_dc=False, + formatter=parse_date, renderer=isoformat) user_categories = MetadataField('user_categories', is_dc=False, formatter=json.loads, renderer=dump_dict) author_link_map = MetadataField('author_link_map', is_dc=False, - formatter=json.loads, renderer=dump_dict) + formatter=json.loads, renderer=dump_dict) def __init__(self, stream, basedir=os.getcwd(), unquote_urls=True, - populate_spine=True, try_to_guess_cover=True, preparsed_opf=None, read_toc=True): + populate_spine=True, try_to_guess_cover=True, + preparsed_opf=None, read_toc=True): self.try_to_guess_cover = try_to_guess_cover - self.basedir = self.base_dir = basedir + self.basedir = self.base_dir = basedir self.path_to_html_toc = self.html_toc_fragment = None - self.root = parse_opf(stream) if preparsed_opf is None else preparsed_opf + self.root = preparsed_opf + if preparsed_opf is None: + self.root = parse_opf(stream) try: self.package_version = float(self.root.get('version', None)) except (AttributeError, TypeError, ValueError): self.package_version = 0 self.metadata = self.metadata_path(self.root) if not self.metadata: - self.metadata = [self.root.makeelement('{http://www.idpf.org/2007/opf}metadata')] + self.metadata = [self.root.makeelement('{http://www.idpf.org/2007' + '/opf}metadata')] self.root.insert(0, self.metadata[0]) self.metadata[0].tail = '\n' - self.metadata = self.metadata[0] + self.metadata = self.metadata[0] if unquote_urls: self.unquote_urls() self.manifest = Manifest() @@ -623,9 +655,10 @@ class OPF(object): # {{{ self._user_metadata_ = {} temp = Metadata('x', ['x']) from ebook_converter.utils.config import from_json - from ebook_converter.ebooks.metadata.book.json_codec import decode_is_multiple + from ebook_converter.ebooks.metadata.book.json_codec import \ + decode_is_multiple elems = self.root.xpath('//*[name() = "meta" and 
starts-with(@name,' - '"calibre:user_metadata:") and @content]') + '"calibre:user_metadata:") and @content]') for elem in elems: name = elem.get('name') name = ':'.join(name.split(':')[2:]) @@ -636,9 +669,8 @@ class OPF(object): # {{{ fm = json.loads(fm, object_hook=from_json) decode_is_multiple(fm) temp.set_user_metadata(name, fm) - except: + except Exception: prints('Failed to read user metadata:', name) - import traceback traceback.print_exc() continue self._user_metadata_ = temp.get_all_user_metadata(True) @@ -657,16 +689,16 @@ class OPF(object): # {{{ def write_user_metadata(self): elems = self.root.xpath('//*[name() = "meta" and starts-with(@name,' - '"calibre:user_metadata:") and @content]') + '"calibre:user_metadata:") and @content]') for elem in elems: elem.getparent().remove(elem) - serialize_user_metadata(self.metadata, - self._user_metadata_) + serialize_user_metadata(self.metadata, self._user_metadata_) def find_toc(self): self.toc = None try: - spine = self.XPath('descendant::*[re:match(name(), "spine", "i")]')(self.root) + spine = self.XPath('descendant::*[re:match(name(), "spine", ' + '"i")]')(self.root) toc = None if spine: spine = spine[0] @@ -682,9 +714,9 @@ class OPF(object): # {{{ if toc is None: return self.toc = TOC(base_path=self.base_dir) - is_ncx = getattr(self, 'manifest', None) is not None and \ - self.manifest.type_for_id(toc) is not None and \ - 'dtbncx' in self.manifest.type_for_id(toc) + is_ncx = (getattr(self, 'manifest', None) is not None and + self.manifest.type_for_id(toc) is not None and + 'dtbncx' in self.manifest.type_for_id(toc)) if is_ncx or toc.lower() in ('ncx', 'ncxtoc'): path = self.manifest.path_for_id(toc) if path: @@ -700,7 +732,7 @@ class OPF(object): # {{{ not os.path.isfile(self.path_to_html_toc): self.path_to_html_toc = None self.toc.read_html_toc(toc) - except: + except Exception: pass def get_text(self, elem): @@ -721,11 +753,12 @@ class OPF(object): # {{{ c = 1 while manifest_id in ids: c += 1 - manifest_id = 
'id%d'%c + manifest_id = 'id%d' % c if not media_type: media_type = 'application/xhtml+xml' - ans = etree.Element('{%s}item'%self.NAMESPACES['opf'], - attrib={'id':manifest_id, 'href':href, 'media-type':media_type}) + ans = etree.Element('{%s}item' % self.NAMESPACES['opf'], + attrib={'id': manifest_id, + 'href': href, 'media-type': media_type}) ans.tail = '\n\t\t' if append: manifest = self.manifest_ppath(self.root)[0] @@ -735,7 +768,7 @@ class OPF(object): # {{{ def replace_manifest_item(self, item, items): items = [self.create_manifest_item(*i) for i in items] for i, item2 in enumerate(items): - item2.set('id', item.get('id')+'.%d'%(i+1)) + item2.set('id', item.get('id')+'.%d' % (i+1)) manifest = item.getparent() index = manifest.index(item) manifest[index:index+1] = items @@ -761,37 +794,43 @@ class OPF(object): # {{{ return x.get('href', None) def create_spine_item(self, idref): - ans = etree.Element('{%s}itemref'%self.NAMESPACES['opf'], idref=idref) + ans = etree.Element('{%s}itemref' % self.NAMESPACES['opf'], + idref=idref) ans.tail = '\n\t\t' return ans def replace_spine_items_by_idref(self, idref, new_idrefs): items = list(map(self.create_spine_item, new_idrefs)) - spine = self.XPath('/opf:package/*[re:match(name(), "spine", "i")]')(self.root)[0] + spine = self.XPath('/opf:package/*[re:match(name(), "spine", ' + '"i")]')(self.root)[0] old = [i for i in self.iterspine() if i.get('idref', None) == idref] for x in old: i = spine.index(x) spine[i:i+1] = items def create_guide_element(self): - e = etree.SubElement(self.root, '{%s}guide'%self.NAMESPACES['opf']) + e = etree.SubElement(self.root, '{%s}guide' % self.NAMESPACES['opf']) e.text = '\n ' - e.tail = '\n' + e.tail = '\n' return e def remove_guide(self): self.guide = None - for g in self.root.xpath('./*[re:match(name(), "guide", "i")]', namespaces={'re':'http://exslt.org/regular-expressions'}): + for g in self.root.xpath('./*[re:match(name(), "guide", "i")]', + namespaces={'re': 'http://exslt.org/regular-' 
+ 'expressions'}): self.root.remove(g) def create_guide_item(self, type, title, href): - e = etree.Element('{%s}reference'%self.NAMESPACES['opf'], - type=type, title=title, href=href) - e.tail='\n' + e = etree.Element('{%s}reference' % self.NAMESPACES['opf'], type=type, + title=title, href=href) + e.tail = '\n' return e def add_guide_item(self, type, title, href): - g = self.root.xpath('./*[re:match(name(), "guide", "i")]', namespaces={'re':'http://exslt.org/regular-expressions'})[0] + g = self.root.xpath('./*[re:match(name(), "guide", "i")]', + namespaces={'re': 'http://exslt.org/regular-' + 'expressions'})[0] g.append(self.create_guide_item(type, title, href)) def iterguide(self): @@ -828,7 +867,10 @@ class OPF(object): # {{{ title.getparent().remove(title) titles = () if val: - title = titles[0] if titles else self.create_metadata_element('title') + if titles: + title = titles[0] + else: + title = self.create_metadata_element('title') title.text = re.sub(r'\s+', ' ', str(val)) @property @@ -847,11 +889,12 @@ class OPF(object): # {{{ # for broken implementations that always use the first # <dc:creator> element with no attention to the role for author in reversed(val): - elem = self.metadata.makeelement('{%s}creator'% - self.NAMESPACES['dc'], nsmap=self.NAMESPACES) + elem = self.metadata.makeelement('{%s}creator' % + self.NAMESPACES['dc'], + nsmap=self.NAMESPACES) elem.tail = '\n' self.metadata.insert(0, elem) - elem.set('{%s}role'%self.NAMESPACES['opf'], 'aut') + elem.set('{%s}role' % self.NAMESPACES['opf'], 'aut') self.set_text(elem, author.strip()) @property @@ -859,7 +902,7 @@ class OPF(object): # {{{ matches = self.authors_path(self.metadata) if matches: for match in matches: - ans = match.get('{%s}file-as'%self.NAMESPACES['opf'], None) + ans = match.get('{%s}file-as' % self.NAMESPACES['opf'], None) if not ans: ans = match.get('file-as', None) if ans: @@ -872,7 +915,7 @@ class OPF(object): # {{{ for key in matches[0].attrib: if key.endswith('file-as'): 
matches[0].attrib.pop(key) - matches[0].set('{%s}file-as'%self.NAMESPACES['opf'], str(val)) + matches[0].set('{%s}file-as' % self.NAMESPACES['opf'], str(val)) @property def tags(self): @@ -897,8 +940,9 @@ class OPF(object): # {{{ for match in self.pubdate_path(self.metadata): try: val = parse_date(etree.tostring(match, encoding='unicode', - method='text', with_tail=False).strip()) - except: + method='text', + with_tail=False).strip()) + except Exception: continue if ans is None or val < ans: ans = val @@ -910,8 +954,9 @@ class OPF(object): # {{{ for match in self.pubdate_path(self.metadata): try: cval = parse_date(etree.tostring(match, encoding='unicode', - method='text', with_tail=False).strip()) - except: + method='text', + with_tail=False).strip()) + except Exception: match.getparent().remove(match) else: if not val: @@ -953,7 +998,7 @@ class OPF(object): # {{{ x.getparent().remove(x) return if not matches: - attrib = {'{%s}scheme'%self.NAMESPACES['opf']: 'ISBN'} + attrib = {'{%s}scheme' % self.NAMESPACES['opf']: 'ISBN'} matches = [self.create_metadata_element('identifier', attrib=attrib)] self.set_text(matches[0], str(val)) @@ -967,17 +1012,19 @@ class OPF(object): # {{{ for attr, val in x.attrib.items(): if attr.endswith('scheme'): typ = val.lower() - val = etree.tostring(x, with_tail=False, encoding='unicode', - method='text').strip() + val = etree.tostring(x, with_tail=False, + encoding='unicode', + method='text').strip() if val and typ not in ('calibre', 'uuid'): - if typ == 'isbn' and val.lower().startswith('urn:isbn:'): + if (typ == 'isbn' and + val.lower().startswith('urn:isbn:')): val = val[len('urn:isbn:'):] identifiers[typ] = val found_scheme = True break if not found_scheme: val = etree.tostring(x, with_tail=False, encoding='unicode', - method='text').strip() + method='text').strip() if val.lower().startswith('urn:isbn:'): val = check_isbn(val.split(':')[-1]) if val is not None: @@ -997,7 +1044,8 @@ class OPF(object): # {{{ self.metadata): xid = 
x.get('id', None) is_package_identifier = uuid_id is not None and uuid_id == xid - typ = {val.lower() for attr, val in x.attrib.items() if attr.endswith('scheme')} + typ = {val.lower() for attr, val in x.attrib.items() + if attr.endswith('scheme')} if is_package_identifier: typ = tuple(typ) if typ and typ[0] in identifiers: @@ -1007,7 +1055,7 @@ class OPF(object): # {{{ x.getparent().remove(x) for typ, val in identifiers.items(): - attrib = {'{%s}scheme'%self.NAMESPACES['opf']: typ.upper()} + attrib = {'{%s}scheme' % self.NAMESPACES['opf']: typ.upper()} self.set_text(self.create_metadata_element( 'identifier', attrib=attrib), str(val)) @@ -1028,7 +1076,7 @@ class OPF(object): # {{{ if attr.endswith('unique-identifier'): uuid_id = self.root.attrib[attr] break - attrib = {'{%s}scheme'%self.NAMESPACES['opf']: 'calibre'} + attrib = {'{%s}scheme' % self.NAMESPACES['opf']: 'calibre'} if uuid_id and uuid_id in removed_ids: attrib['id'] = uuid_id self.set_text(self.create_metadata_element( @@ -1043,7 +1091,7 @@ class OPF(object): # {{{ def uuid(self, val): matches = self.uuid_id_path(self.metadata) if not matches: - attrib = {'{%s}scheme'%self.NAMESPACES['opf']: 'uuid'} + attrib = {'{%s}scheme' % self.NAMESPACES['opf']: 'uuid'} matches = [self.create_metadata_element('identifier', attrib=attrib)] self.set_text(matches[0], str(val)) @@ -1064,9 +1112,9 @@ class OPF(object): # {{{ for match in self.languages_path(self.metadata): t = self.get_text(match) if t and t.strip(): - l = canonicalize_lang(t.strip()) - if l: - ans.append(l) + cl = canonicalize_lang(t.strip()) + if cl: + ans.append(cl) return ans @languages.setter @@ -1076,8 +1124,8 @@ class OPF(object): # {{{ x.getparent().remove(x) for lang in val: - l = self.create_metadata_element('language') - self.set_text(l, str(lang)) + cl = self.create_metadata_element('language') + self.set_text(cl, str(lang)) @property def raw_languages(self): @@ -1096,7 +1144,7 @@ class OPF(object): # {{{ matches = 
self.bkp_path(self.metadata) if not matches: matches = [self.create_metadata_element('contributor')] - matches[0].set('{%s}role'%self.NAMESPACES['opf'], 'bkp') + matches[0].set('{%s}role' % self.NAMESPACES['opf'], 'bkp') self.set_text(matches[0], str(val)) def identifier_iter(self): @@ -1111,7 +1159,8 @@ class OPF(object): # {{{ uuid_elem = self.root.attrib[attr] break if uuid_elem: - matches = self.root.xpath('//*[@id=%s]'%escape_xpath_attr(uuid_elem)) + matches = self.root.xpath('//*[@id=%s]' % + escape_xpath_attr(uuid_elem)) if matches: for m in matches: raw = m.text @@ -1126,15 +1175,18 @@ class OPF(object): # {{{ @property def page_progression_direction(self): - spine = self.XPath('descendant::*[re:match(name(), "spine", "i")][1]')(self.root) + spine = self.XPath('descendant::*[re:match(name(), "spine", ' + '"i")][1]')(self.root) if spine: for k, v in spine[0].attrib.items(): - if k == 'page-progression-direction' or k.endswith('}page-progression-direction'): + if (k == 'page-progression-direction' or + k.endswith('}page-progression-direction')): return v @property def primary_writing_mode(self): - for m in self.XPath('//*[local-name()="meta" and @name="primary-writing-mode" and @content]')(self.root): + for m in self.XPath('//*[local-name()="meta" and @name="primary-' + 'writing-mode" and @content]')(self.root): return m.get('content') def guess_cover(self): @@ -1153,8 +1205,11 @@ class OPF(object): # {{{ if item.text: prefix = item.text.replace('-', '') for suffix in ['.jpg', '.jpeg', '.gif', '.png', '.bmp']: - cpath = os.access(os.path.join(self.base_dir, prefix+suffix), os.R_OK) - if os.access(os.path.join(self.base_dir, prefix+suffix), os.R_OK): + cpath = os.access(os.path.join(self.base_dir, + prefix + suffix), + os.R_OK) + if os.access(os.path.join(self.base_dir, + prefix+suffix), os.R_OK): return cpath @property @@ -1188,13 +1243,16 @@ class OPF(object): # {{{ def guide_raster_cover(self): covers = self.guide_cover_path(self.root) if covers: - mt_map = 
{i.get('href'):i for i in self.itermanifest()} + mt_map = {i.get('href'): i for i in self.itermanifest()} for href in covers: if href: i = mt_map.get(href) if i is not None: iid, mt = i.get('id'), i.get('media-type') - if iid and mt and mt.lower() in {'image/png', 'image/jpeg', 'image/jpg', 'image/gif'}: + if iid and mt and mt.lower() in {'image/png', + 'image/jpeg', + 'image/jpg', + 'image/gif'}: return i @property @@ -1214,14 +1272,15 @@ class OPF(object): # {{{ @property def cover(self): if self.guide is not None: - for t in ('cover', 'other.ms-coverimage-standard', 'other.ms-coverimage'): + for t in ('cover', 'other.ms-coverimage-standard', + 'other.ms-coverimage'): for item in self.guide: if item.type and item.type.lower() == t: return item.path try: if self.try_to_guess_cover: return self.guess_cover() - except: + except Exception: pass @cover.setter @@ -1237,13 +1296,16 @@ class OPF(object): # {{{ self.guide = Guide() self.guide.set_cover(path) etree.SubElement(g, 'opf:reference', nsmap=self.NAMESPACES, - attrib={'type':'cover', 'href':self.guide[-1].href()}) + attrib={'type': 'cover', + 'href': self.guide[-1].href()}) id = self.manifest.id_for_path(self.cover) if id is None: - for t in ('cover', 'other.ms-coverimage-standard', 'other.ms-coverimage'): + for t in ('cover', 'other.ms-coverimage-standard', + 'other.ms-coverimage'): for item in self.guide: if item.type.lower() == t: - self.create_manifest_item(item.href(), guess_type(path)[0]) + self.create_manifest_item(item.href(), + guess_type(path)[0]) def get_metadata_element(self, name): matches = self.metadata_elem_path(self.metadata, name=name) @@ -1278,9 +1340,11 @@ class OPF(object): # {{{ # We swap attributes instead of elements, as that avoids namespace # re-declarations smap = {} - for child in self.metadata.xpath('./*[@name="calibre:series" or @name="calibre:series_index"]'): + for child in self.metadata.xpath('./*[@name="calibre:series" or @name' + '="calibre:series_index"]'): 
smap[child.get('name')] = (child, self.metadata.index(child)) - if len(smap) == 2 and smap['calibre:series'][1] > smap['calibre:series_index'][1]: + if (len(smap) == 2 and + smap['calibre:series'][1] > smap['calibre:series_index'][1]): s, si = smap['calibre:series'][0], smap['calibre:series_index'][0] def swap(attr): @@ -1293,7 +1357,8 @@ class OPF(object): # {{{ _pretty_print(self.root) raw = etree.tostring(self.root, encoding=encoding, pretty_print=True) if not raw.lstrip().startswith(b'<?xml '): - raw = ('<?xml version="1.0" encoding="%s"?>\n'%encoding.upper()).encode('ascii') + raw + raw = ('<?xml version="1.0" encoding="%s"?>\n' % + encoding.upper()).encode('ascii') + raw return raw def smart_update(self, mi, replace_metadata=False, apply_null=False): @@ -1302,9 +1367,12 @@ class OPF(object): # {{{ 'isbn', 'tags', 'category', 'comments', 'book_producer', 'pubdate', 'user_categories', 'author_link_map'): val = getattr(mi, attr, None) - is_null = val is None or val in ((), [], (None, None), {}) or (attr == 'rating' and val < 0.1) + is_null = val is None or val in ((), [], (None, None), + {}) or (attr == 'rating' and + val < 0.1) if is_null: - if apply_null and attr in {'series', 'tags', 'isbn', 'comments', 'publisher', 'rating'}: + if apply_null and attr in {'series', 'tags', 'isbn', + 'comments', 'publisher', 'rating'}: setattr(self, attr, ([] if attr == 'tags' else None)) else: setattr(self, attr, val) @@ -1315,7 +1383,9 @@ class OPF(object): # {{{ self.languages = langs or [] temp = self.to_book_metadata() temp.smart_update(mi, replace_metadata=replace_metadata) - if not replace_metadata and callable(getattr(temp, 'custom_field_keys', None)): + if not replace_metadata and callable(getattr(temp, + 'custom_field_keys', + None)): # We have to replace non-null fields regardless of the value of # replace_metadata to match the behavior of the builtin fields # above. 
@@ -1346,12 +1416,12 @@ class OPF(object): # {{{ class OPFCreator(Metadata): def __init__(self, base_path, other): - ''' + """ Initialize. - @param base_path: An absolute path to the directory in which this OPF file - will eventually be. This is used by the L{create_manifest} method + @param base_path: An absolute path to the directory in which this OPF + file will eventually be. This is used by the L{create_manifest} method to convert paths to files into relative paths. - ''' + """ Metadata.__init__(self, title='', other=other) self.base_path = os.path.abspath(base_path) self.page_progression_direction = None @@ -1368,19 +1438,21 @@ class OPFCreator(Metadata): self.guide.set_cover(self.cover) def create_manifest(self, entries): - ''' + """ Create <manifest> - `entries`: List of (path, mime-type) If mime-type is None it is autodetected - ''' + `entries`: List of (path, mime-type) If mime-type is None it is + autodetected + """ entries = list(map(lambda x: x if os.path.isabs(x[0]) else - (os.path.abspath(os.path.join(self.base_path, x[0])), x[1]), - entries)) + (os.path.abspath(os.path.join(self.base_path, + x[0])), x[1]), + entries)) self.manifest = Manifest.from_paths(entries) self.manifest.set_basedir(self.base_path) def create_manifest_from_files_in(self, files_and_dirs, - exclude=lambda x:False): + exclude=lambda x: False): entries = [] def dodir(dir): @@ -1406,7 +1478,8 @@ class OPFCreator(Metadata): `entries`: List of paths ''' entries = list(map(lambda x: x if os.path.isabs(x) else - os.path.abspath(os.path.join(self.base_path, x)), entries)) + os.path.abspath(os.path.join(self.base_path, x)), + entries)) self.spine = Spine.from_paths(entries, self.manifest) def set_toc(self, toc): @@ -1431,11 +1504,13 @@ class OPFCreator(Metadata): self.manifest.set_basedir(self.base_path) if ncx_manifest_entry is not None and toc is not None: if not os.path.isabs(ncx_manifest_entry): - ncx_manifest_entry = os.path.join(self.base_path, ncx_manifest_entry) + 
ncx_manifest_entry = os.path.join(self.base_path, + ncx_manifest_entry) remove = [i for i in self.manifest if i.id == 'ncx'] for item in remove: self.manifest.remove(item) - self.manifest.append(ManifestItem(ncx_manifest_entry, self.base_path)) + self.manifest.append(ManifestItem(ncx_manifest_entry, + self.base_path)) self.manifest[-1].id = 'ncx' self.manifest[-1].mime_type = 'application/x-dtbncx+xml' if self.guide is None: @@ -1448,13 +1523,13 @@ class OPFCreator(Metadata): self.guide.set_basedir(self.base_path) # Actual rendering - from lxml.builder import ElementMaker - from ebook_converter.ebooks.oeb.base import OPF2_NS, DC11_NS, CALIBRE_NS - DNS = OPF2_NS+'___xx___' - E = ElementMaker(namespace=DNS, nsmap={None:DNS}) + DNS = const.OPF2_NS + '___xx___' + E = ElementMaker(namespace=DNS, nsmap={None: DNS}) M = ElementMaker(namespace=DNS, - nsmap={'dc':DC11_NS, 'calibre':CALIBRE_NS, 'opf':OPF2_NS}) - DC = ElementMaker(namespace=DC11_NS) + nsmap={'dc': const.DC11_NS, + 'calibre': const.CALIBRE_NS, + 'opf': const.OPF2_NS}) + DC = ElementMaker(namespace=const.DC11_NS) def DC_ELEM(tag, text, dc_attrs={}, opf_attrs={}): if text: @@ -1462,7 +1537,7 @@ class OPFCreator(Metadata): else: elem = getattr(DC, tag)(**dc_attrs) for k, v in opf_attrs.items(): - elem.set('{%s}%s'%(OPF2_NS, k), v) + elem.set('{%s}%s' % (const.OPF2_NS, k), v) return elem def CAL_ELEM(name, content): @@ -1472,18 +1547,18 @@ class OPFCreator(Metadata): a = metadata.append role = {} a(DC_ELEM('title', self.title if self.title else 'Unknown', - opf_attrs=role)) + opf_attrs=role)) for i, author in enumerate(self.authors): - fa = {'role':'aut'} + fa = {'role': 'aut'} if i == 0 and self.author_sort: fa['file-as'] = self.author_sort a(DC_ELEM('creator', author, opf_attrs=fa)) - a(DC_ELEM('contributor', '%s (%s) [%s]'%(__appname__, __version__, - 'https://calibre-ebook.com'), opf_attrs={'role':'bkp', - 'file-as':__appname__})) + a(DC_ELEM('contributor', '%s (%s) [%s]' % + (__appname__, __version__, 
'https://calibre-ebook.com'), + opf_attrs={'role': 'bkp', 'file-as': __appname__})) a(DC_ELEM('identifier', str(self.application_id), - opf_attrs={'scheme':__appname__}, - dc_attrs={'id':__appname__+'_id'})) + opf_attrs={'scheme': __appname__}, + dc_attrs={'id': __appname__+'_id'})) if getattr(self, 'pubdate', None) is not None: a(DC_ELEM('date', self.pubdate.isoformat())) langs = self.languages @@ -1496,7 +1571,7 @@ class OPFCreator(Metadata): if self.publisher: a(DC_ELEM('publisher', self.publisher)) for key, val in self.get_identifiers().items(): - a(DC_ELEM('identifier', val, opf_attrs={'scheme':key.upper()})) + a(DC_ELEM('identifier', val, opf_attrs={'scheme': key.upper()})) if self.rights: a(DC_ELEM('rights', self.rights)) if self.tags: @@ -1515,11 +1590,13 @@ class OPFCreator(Metadata): if self.publication_type is not None: a(CAL_ELEM('calibre:publication_type', self.publication_type)) if self.user_categories: - from ebook_converter.ebooks.metadata.book.json_codec import object_to_unicode + from ebook_converter.ebooks.metadata.book.json_codec import \ + object_to_unicode a(CAL_ELEM('calibre:user_categories', json.dumps(object_to_unicode(self.user_categories)))) if self.primary_writing_mode: - a(M.meta(name='primary-writing-mode', content=self.primary_writing_mode)) + a(M.meta(name='primary-writing-mode', + content=self.primary_writing_mode)) manifest = E.manifest() if self.manifest is not None: for ref in self.manifest: @@ -1533,7 +1610,8 @@ class OPFCreator(Metadata): if self.toc is not None: spine.set('toc', 'ncx') if self.page_progression_direction is not None: - spine.set('page-progression-direction', self.page_progression_direction) + spine.set('page-progression-direction', + self.page_progression_direction) if self.spine is not None: for ref in self.spine: if ref.id is not None: @@ -1562,8 +1640,8 @@ class OPFCreator(Metadata): root.set('unique-identifier', __appname__+'_id') root.set('version', '2.0') raw = etree.tostring(root, pretty_print=True, 
xml_declaration=True, - encoding=encoding) - raw = raw.replace(DNS.encode('utf-8'), OPF2_NS.encode('utf-8')) + encoding=encoding) + raw = raw.replace(DNS.encode('utf-8'), const.OPF2_NS.encode('utf-8')) opf_stream.write(raw) opf_stream.flush() if toc is not None and ncx_stream is not None: @@ -1572,9 +1650,6 @@ class OPFCreator(Metadata): def metadata_to_opf(mi, as_string=True, default_lang=None): - from lxml import etree - import textwrap - from ebook_converter.ebooks.oeb.base import OPF, DC if not mi.application_id: mi.application_id = str(uuid.uuid4()) @@ -1583,15 +1658,15 @@ def metadata_to_opf(mi, as_string=True, default_lang=None): mi.uuid = str(uuid.uuid4()) if not mi.book_producer: - mi.book_producer = __appname__ + ' (%s) '%__version__ + \ - '[https://calibre-ebook.com]' + mi.book_producer = (__appname__ + ' (%s) ' % + __version__ + '[https://calibre-ebook.com]') if not mi.languages: lang = (get_lang().replace('_', '-').partition('-')[0] if default_lang is None else default_lang) mi.languages = [lang] - root = safe_xml_fromstring(textwrap.dedent( + root = etree.fromstring(textwrap.dedent( ''' <package xmlns="http://www.idpf.org/2007/opf" unique-identifier="uuid_id" version="2.0"> <metadata xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:opf="http://www.idpf.org/2007/opf"> @@ -1600,20 +1675,20 @@ def metadata_to_opf(mi, as_string=True, default_lang=None): </metadata> <guide/> </package> - '''%dict(a=__appname__, id=mi.application_id, uuid=mi.uuid))) + ''' % dict(a=__appname__, id=mi.application_id, uuid=mi.uuid))) metadata = root[0] guide = root[1] metadata[0].tail = '\n'+(' '*8) def factory(tag, text=None, sort=None, role=None, scheme=None, name=None, - content=None): + content=None): attrib = {} if sort: - attrib[OPF('file-as')] = sort + attrib[base.tag('opf', 'file-as')] = sort if role: - attrib[OPF('role')] = role + attrib[base.tag('opf', 'role')] = role if scheme: - attrib[OPF('scheme')] = scheme + attrib[base.tag('opf', 'scheme')] = scheme if name: 
attrib['name'] = name if content: @@ -1621,7 +1696,9 @@ def metadata_to_opf(mi, as_string=True, default_lang=None): try: elem = metadata.makeelement(tag, attrib=attrib) except ValueError: - elem = metadata.makeelement(tag, attrib={k:clean_xml_chars(v) for k, v in attrib.items()}) + elem = metadata.makeelement(tag, attrib={k: clean_xml_chars(v) + for k, v in + attrib.items()}) elem.tail = '\n'+(' '*8) if text: try: @@ -1630,29 +1707,29 @@ def metadata_to_opf(mi, as_string=True, default_lang=None): elem.text = clean_ascii_chars(text.strip()) metadata.append(elem) - factory(DC('title'), mi.title) + factory(base.tag('dc', 'title'), mi.title) for au in mi.authors: - factory(DC('creator'), au, mi.author_sort, 'aut') - factory(DC('contributor'), mi.book_producer, __appname__, 'bkp') + factory(base.tag('dc', 'creator'), au, mi.author_sort, 'aut') + factory(base.tag('dc', 'contributor'), mi.book_producer, __appname__, 'bkp') if hasattr(mi.pubdate, 'isoformat'): - factory(DC('date'), isoformat(mi.pubdate)) + factory(base.tag('dc', 'date'), isoformat(mi.pubdate)) if hasattr(mi, 'category') and mi.category: - factory(DC('type'), mi.category) + factory(base.tag('dc', 'type'), mi.category) if mi.comments: - factory(DC('description'), clean_ascii_chars(mi.comments)) + factory(base.tag('dc', 'description'), clean_ascii_chars(mi.comments)) if mi.publisher: - factory(DC('publisher'), mi.publisher) + factory(base.tag('dc', 'publisher'), mi.publisher) for key, val in mi.get_identifiers().items(): - factory(DC('identifier'), val, scheme=key.upper()) + factory(base.tag('dc', 'identifier'), val, scheme=key.upper()) if mi.rights: - factory(DC('rights'), mi.rights) + factory(base.tag('dc', 'rights'), mi.rights) for lang in mi.languages: if not lang or lang.lower() == 'und': continue - factory(DC('language'), lang) + factory(base.tag('dc', 'language'), lang) if mi.tags: for tag in mi.tags: - factory(DC('subject'), tag) + factory(base.tag('dc', 'subject'), tag) meta = lambda n, c: 
factory('meta', name='calibre:'+n, content=c) if getattr(mi, 'author_link_map', None) is not None: meta('author_link_map', dump_dict(mi.author_link_map)) @@ -1673,21 +1750,22 @@ def metadata_to_opf(mi, as_string=True, default_lang=None): serialize_user_metadata(metadata, mi.get_all_user_metadata(False)) - metadata[-1].tail = '\n' +(' '*4) + metadata[-1].tail = '\n' + (' '*4) if mi.cover: if not isinstance(mi.cover, str): mi.cover = mi.cover.decode(filesystem_encoding) guide.text = '\n'+(' '*8) - r = guide.makeelement(OPF('reference'), - attrib={'type': 'cover', 'title': 'Cover', 'href': mi.cover}) - r.tail = '\n' +(' '*4) + r = guide.makeelement(base.tag('opf', 'reference'), + attrib={'type': 'cover', 'title': 'Cover', + 'href': mi.cover}) + r.tail = '\n' + (' '*4) guide.append(r) if pretty_print_opf: _pretty_print(root) return etree.tostring(root, pretty_print=True, encoding='utf-8', - xml_declaration=True) if as_string else root + xml_declaration=True) if as_string else root def test_m2o(): @@ -1699,7 +1777,7 @@ def test_m2o(): mi.language = 'en' mi.comments = 'what a fun book\n\n' mi.publisher = 'publisher' - mi.set_identifiers({'isbn':'booo', 'dummy':'dummy'}) + mi.set_identifiers({'isbn': 'booo', 'dummy': 'dummy'}) mi.tags = ['a', 'b'] mi.series = 's"c\'l&<>' mi.series_index = 3.34 @@ -1711,15 +1789,15 @@ def test_m2o(): opf = metadata_to_opf(mi) print(opf) newmi = MetaInformation(OPF(io.BytesIO(opf))) - for attr in ('author_sort', 'title_sort', 'comments', - 'publisher', 'series', 'series_index', 'rating', - 'isbn', 'tags', 'cover_data', 'application_id', - 'language', 'cover', - 'book_producer', 'timestamp', - 'pubdate', 'rights', 'publication_type'): + for attr in ('author_sort', 'title_sort', 'comments', 'publisher', + 'series', 'series_index', 'rating', 'isbn', 'tags', + 'cover_data', 'application_id', 'language', 'cover', + 'book_producer', 'timestamp', 'pubdate', 'rights', + 'publication_type'): o, n = getattr(mi, attr), getattr(newmi, attr) if o != n 
and o.strip() != n.strip(): - print('FAILED:', attr, getattr(mi, attr), '!=', getattr(newmi, attr)) + print('FAILED:', attr, getattr(mi, attr), '!=', + getattr(newmi, attr)) if mi.get_identifiers() != newmi.get_identifiers(): print('FAILED:', 'identifiers', mi.get_identifiers(), end=' ') print('!=', newmi.get_identifiers()) @@ -1766,8 +1844,8 @@ b'''\ self.assertEqual(opf.rating, 4) self.assertEqual(opf.publication_type, 'test') self.assertEqual(list(opf.itermanifest())[0].get('href'), 'a ~ b') - self.assertEqual(opf.get_identifiers(), {'isbn':'123456789', - 'dummy':'dummy'}) + self.assertEqual(opf.get_identifiers(), {'isbn': '123456789', + 'dummy': 'dummy'}) def testWriting(self): for test in [('title', 'New & Title'), ('authors', ['One', 'Two']), @@ -1802,14 +1880,12 @@ def test(): def test_user_metadata(): mi = Metadata('Test title', ['test author1', 'test author2']) - um = { - '#myseries': {'#value#': u'test series\xe4', 'datatype':'text', - 'is_multiple': None, 'name': u'My Series'}, - '#myseries_index': {'#value#': 2.45, 'datatype': 'float', - 'is_multiple': None}, - '#mytags': {'#value#':['t1','t2','t3'], 'datatype':'text', - 'is_multiple': '|', 'name': u'My Tags'} - } + um = {'#myseries': {'#value#': u'test series\xe4', 'datatype': 'text', + 'is_multiple': None, 'name': u'My Series'}, + '#myseries_index': {'#value#': 2.45, 'datatype': 'float', + 'is_multiple': None}, + '#mytags': {'#value#': ['t1', 't2', 't3'], 'datatype': 'text', + 'is_multiple': '|', 'name': u'My Tags'}} mi.set_all_user_metadata(um) raw = metadata_to_opf(mi) opfc = OPFCreator(os.getcwd(), other=mi) diff --git a/ebook_converter/ebooks/metadata/opf3.py b/ebook_converter/ebooks/metadata/opf3.py index f12a79d..dd146a6 100644 --- a/ebook_converter/ebooks/metadata/opf3.py +++ b/ebook_converter/ebooks/metadata/opf3.py @@ -5,6 +5,7 @@ from functools import wraps from lxml import etree +from ebook_converter import constants as const from ebook_converter import prints from 
ebook_converter.ebooks.metadata import authors_to_string, check_isbn, string_to_authors from ebook_converter.ebooks.metadata.book.base import Metadata @@ -15,7 +16,6 @@ from ebook_converter.ebooks.metadata.utils import ( create_manifest_item, ensure_unique, normalize_languages, parse_opf, pretty_print_opf ) -from ebook_converter.ebooks.oeb.base import DC, OPF, OPF2_NSMAP from ebook_converter.utils.config import from_json, to_json from ebook_converter.utils.date import ( fix_only_date, is_date_undefined, isoformat, parse_date as parse_date_, utcnow, @@ -46,7 +46,7 @@ def XPath(x): try: return _xpath_cache[x] except KeyError: - _xpath_cache[x] = ans = etree.XPath(x, namespaces=OPF2_NSMAP) + _xpath_cache[x] = ans = etree.XPath(x, namespaces=const.OPF2_NSMAP) return ans @@ -213,7 +213,7 @@ def set_refines(elem, existing_refines, *new_refines): remove_refines(elem, existing_refines) for ref in reversed(new_refines): prop, val, scheme = ref - r = elem.makeelement(OPF('meta')) + r = elem.makeelement(const.OPF_META) r.set('refines', '#' + eid), r.set('property', prop) r.text = val.strip() if scheme: @@ -249,7 +249,7 @@ def parse_identifier(ident, val, refines): # Try the OPF 2 style opf:scheme attribute, which will be present, for # example, in EPUB 3 files that have had their metadata set by an # application that only understands EPUB 2. 
- scheme = ident.get(OPF('scheme')) + scheme = ident.get(const.OPF_SCHEME) if scheme and not lval.startswith('urn:'): return finalize(scheme, val) @@ -294,7 +294,7 @@ def set_identifiers(root, prefixes, refines, new_identifiers, force_identifiers= continue metadata = XPath('./opf:metadata')(root)[0] for scheme, val in new_identifiers.items(): - ident = metadata.makeelement(DC('identifier')) + ident = metadata.makeelement(const.DC_IDENT) ident.text = '%s:%s' % (scheme, val) if package_identifier is None: metadata.append(ident) @@ -312,11 +312,11 @@ def identifier_writer(name): if is_package_id: package_identifier = ident val = (ident.text or '').strip() - if (val.startswith(name + ':') or ident.get(OPF('scheme')) == name) and not is_package_id: + if (val.startswith(name + ':') or ident.get(const.OPF_SCHEME) == name) and not is_package_id: remove_element(ident, refines) metadata = XPath('./opf:metadata')(root)[0] if ival: - ident = metadata.makeelement(DC('identifier')) + ident = metadata.makeelement(const.DC_IDENT) ident.text = '%s:%s' % (name, ival) if package_identifier is None: metadata.append(ident) @@ -376,7 +376,7 @@ def set_title(root, prefixes, refines, title, title_sort=None): main_title = find_main_title(root, refines, remove_blanks=True) if main_title is None: m = XPath('./opf:metadata')(root)[0] - main_title = m.makeelement(DC('title')) + main_title = m.makeelement(const.DC_TITLE) m.insert(0, main_title) main_title.text = title or None ts = [refdef('file-as', title_sort)] if title_sort else () @@ -411,7 +411,7 @@ def set_languages(root, prefixes, refines, languages): languages = ['und'] metadata = XPath('./opf:metadata')(root)[0] for lang in uniq(languages): - l = metadata.makeelement(DC('language')) + l = metadata.makeelement(const.DC_LANG) l.text = lang metadata.append(l) # }}} @@ -440,7 +440,7 @@ def read_authors(root, prefixes, refines): if file_as: aus = file_as[0][-1] else: - aus = item.get(OPF('file-as')) or None + aus = 
item.get(const.OPF_FILE_AS) or None return Author(normalize_whitespace(val), normalize_whitespace(aus)) for item in XPath('./opf:metadata/dc:creator')(root): @@ -448,7 +448,7 @@ def read_authors(root, prefixes, refines): if val: props = properties_for_id_with_scheme(item.get('id'), prefixes, refines) role = props.get('role') - opf_role = item.get(OPF('role')) + opf_role = item.get(const.OPF_ROLE) if role: if is_relators_role(props, 'aut'): roled_authors.append(author(item, props, val)) @@ -465,22 +465,22 @@ def set_authors(root, prefixes, refines, authors): ensure_prefix(root, prefixes, 'marc') for item in XPath('./opf:metadata/dc:creator')(root): props = properties_for_id_with_scheme(item.get('id'), prefixes, refines) - opf_role = item.get(OPF('role')) + opf_role = item.get(const.OPF_ROLE) if (opf_role and opf_role.lower() != 'aut') or (props.get('role') and not is_relators_role(props, 'aut')): continue remove_element(item, refines) metadata = XPath('./opf:metadata')(root)[0] for author in authors: if author.name: - a = metadata.makeelement(DC('creator')) + a = metadata.makeelement(const.DC_CREATOR) aid = ensure_id(a) a.text = author.name metadata.append(a) - m = metadata.makeelement(OPF('meta'), attrib={'refines':'#'+aid, 'property':'role', 'scheme':'marc:relators'}) + m = metadata.makeelement(const.OPF_META, attrib={'refines':'#'+aid, 'property':'role', 'scheme':'marc:relators'}) m.text = 'aut' metadata.append(m) if author.sort: - m = metadata.makeelement(OPF('meta'), attrib={'refines':'#'+aid, 'property':'file-as'}) + m = metadata.makeelement(const.OPF_META, attrib={'refines':'#'+aid, 'property':'file-as'}) m.text = author.sort metadata.append(m) @@ -492,7 +492,7 @@ def read_book_producers(root, prefixes, refines): if val: props = properties_for_id_with_scheme(item.get('id'), prefixes, refines) role = props.get('role') - opf_role = item.get(OPF('role')) + opf_role = item.get(const.OPF_ROLE) if role: if is_relators_role(props, 'bkp'): 
ans.append(normalize_whitespace(val)) @@ -504,18 +504,18 @@ def read_book_producers(root, prefixes, refines): def set_book_producers(root, prefixes, refines, producers): for item in XPath('./opf:metadata/dc:contributor')(root): props = properties_for_id_with_scheme(item.get('id'), prefixes, refines) - opf_role = item.get(OPF('role')) + opf_role = item.get(const.OPF_ROLE) if (opf_role and opf_role.lower() != 'bkp') or (props.get('role') and not is_relators_role(props, 'bkp')): continue remove_element(item, refines) metadata = XPath('./opf:metadata')(root)[0] for bkp in producers: if bkp: - a = metadata.makeelement(DC('contributor')) + a = metadata.makeelement(const.DC_CONTRIBUTOR) aid = ensure_id(a) a.text = bkp metadata.append(a) - m = metadata.makeelement(OPF('meta'), attrib={'refines':'#'+aid, 'property':'role', 'scheme':'marc:relators'}) + m = metadata.makeelement(const.OPF_META, attrib={'refines':'#'+aid, 'property':'role', 'scheme':'marc:relators'}) m.text = 'bkp' metadata.append(m) # }}} @@ -552,7 +552,7 @@ def set_pubdate(root, prefixes, refines, val): if not is_date_undefined(val): val = isoformat(val) m = XPath('./opf:metadata')(root)[0] - d = m.makeelement(DC('date')) + d = m.makeelement(const.DC_DATE) d.text = val m.append(d) @@ -584,7 +584,7 @@ def create_timestamp(root, prefixes, m, val): ensure_prefix(root, prefixes, 'calibre', CALIBRE_PREFIX) ensure_prefix(root, prefixes, 'dcterms') val = w3cdtf(val) - d = m.makeelement(OPF('meta'), attrib={'property':'calibre:timestamp', 'scheme':'dcterms:W3CDTF'}) + d = m.makeelement(const.OPF_META, attrib={'property':'calibre:timestamp', 'scheme':'dcterms:W3CDTF'}) d.text = val m.append(d) @@ -625,7 +625,7 @@ def set_last_modified(root, prefixes, refines, val=None): else: ensure_prefix(root, prefixes, 'dcterms') m = XPath('./opf:metadata')(root)[0] - meta = m.makeelement(OPF('meta'), attrib={'property':'dcterms:modified', 'scheme':'dcterms:W3CDTF'}) + meta = m.makeelement(const.OPF_META, 
attrib={'property':'dcterms:modified', 'scheme':'dcterms:W3CDTF'}) m.append(meta) meta.text = val # }}} @@ -648,7 +648,7 @@ def set_comments(root, prefixes, refines, val): if val: val = val.strip() if val: - c = m.makeelement(DC('description')) + c = m.makeelement(const.DC_DESC) c.text = val m.append(c) # }}} @@ -670,7 +670,7 @@ def set_publisher(root, prefixes, refines, val): if val: val = val.strip() if val: - c = m.makeelement(DC('publisher')) + c = m.makeelement(const.DC_PUBLISHER('publisher')) c.text = normalize_whitespace(val) m.append(c) # }}} @@ -693,7 +693,7 @@ def set_tags(root, prefixes, refines, val): if val: val = uniq(list(filter(None, val))) for x in val: - c = m.makeelement(DC('subject')) + c = m.makeelement(const.DC_SUBJ) c.text = normalize_whitespace(x) if c.text: m.append(c) @@ -725,7 +725,7 @@ def read_rating(root, prefixes, refines): def create_rating(root, prefixes, val): ensure_prefix(root, prefixes, 'calibre', CALIBRE_PREFIX) m = XPath('./opf:metadata')(root)[0] - d = m.makeelement(OPF('meta'), attrib={'property':'calibre:rating'}) + d = m.makeelement(const.OPF_META, attrib={'property':'calibre:rating'}) d.text = val m.append(d) @@ -772,7 +772,7 @@ def read_series(root, prefixes, refines): def create_series(root, refines, series, series_index): m = XPath('./opf:metadata')(root)[0] - d = m.makeelement(OPF('meta'), attrib={'property':'belongs-to-collection'}) + d = m.makeelement(const.OPF_META, attrib={'property':'belongs-to-collection'}) d.text = series m.append(d) set_refines(d, refines, refdef('collection-type', 'series'), refdef('group-position', series_index)) @@ -836,7 +836,7 @@ def dict_writer(name, serialize=dump_dict, remove2=True): if val: ensure_prefix(root, prefixes, 'calibre', CALIBRE_PREFIX) m = XPath('./opf:metadata')(root)[0] - d = m.makeelement(OPF('meta'), attrib={'property':'calibre:%s' % name}) + d = m.makeelement(const.OPF_META, attrib={'property':'calibre:%s' % name}) d.text = serialize(val) m.append(d) return writer diff 
--git a/ebook_converter/ebooks/metadata/toc.py b/ebook_converter/ebooks/metadata/toc.py index a9330a4..02f04d5 100644 --- a/ebook_converter/ebooks/metadata/toc.py +++ b/ebook_converter/ebooks/metadata/toc.py @@ -10,17 +10,13 @@ from lxml.builder import ElementMaker from ebook_converter.constants_old import __appname__, __version__ from ebook_converter.ebooks.chardet import xml_to_unicode -from ebook_converter.utils.xml_parse import safe_xml_fromstring from ebook_converter.utils.cleantext import clean_xml_chars from ebook_converter.polyglot.urllib import unquote -__license__ = 'GPL v3' -__copyright__ = '2010, Kovid Goyal <kovid at kovidgoyal.net>' - NCX_NS = "http://www.daisy.org/z3986/2005/ncx/" CALIBRE_NS = "http://calibre.kovidgoyal.net/2009/metadata" -NSMAP = {None: NCX_NS, 'calibre':CALIBRE_NS} +NSMAP = {None: NCX_NS, 'calibre': CALIBRE_NS} E = ElementMaker(namespace=NCX_NS, nsmap=NSMAP) C = ElementMaker(namespace=CALIBRE_NS, nsmap=NSMAP) @@ -30,8 +26,10 @@ def parse_html_toc(data): from ebook_converter.utils.cleantext import clean_xml_chars from lxml import etree if isinstance(data, bytes): - data = xml_to_unicode(data, strip_encoding_pats=True, resolve_entities=True)[0] - root = parse(clean_xml_chars(data), maybe_xhtml=True, keep_doctype=False, sanitize_names=True) + data = xml_to_unicode(data, strip_encoding_pats=True, + resolve_entities=True)[0] + root = parse(clean_xml_chars(data), maybe_xhtml=True, keep_doctype=False, + sanitize_names=True) for a in root.xpath('//*[@href and local-name()="a"]'): purl = urllib.parse.urlparse(unquote(a.get('href'))) href, fragment = purl[2], purl[5] @@ -48,8 +46,8 @@ def parse_html_toc(data): class TOC(list): def __init__(self, href=None, fragment=None, text=None, parent=None, - play_order=0, base_path=os.getcwd(), type='unknown', author=None, - description=None, toc_thumbnail=None): + play_order=0, base_path=os.getcwd(), type='unknown', + author=None, description=None, toc_thumbnail=None): self.href = href self.fragment = 
fragment if not self.fragment: @@ -64,7 +62,7 @@ class TOC(list): self.toc_thumbnail = toc_thumbnail def __str__(self): - lines = ['TOC: %s#%s %s'%(self.href, self.fragment, self.text)] + lines = ['TOC: %s#%s %s' % (self.href, self.fragment, self.text)] for child in self: c = str(child).splitlines() for l in c: @@ -91,12 +89,14 @@ class TOC(list): entry.parent = None def add_item(self, href, fragment, text, play_order=None, type='unknown', - author=None, description=None, toc_thumbnail=None): + author=None, description=None, toc_thumbnail=None): if play_order is None: - play_order = (self[-1].play_order if len(self) else self.play_order) + 1 + play_order = (self[-1].play_order + if len(self) else self.play_order) + 1 self.append(TOC(href=href, fragment=fragment, text=text, parent=self, base_path=self.base_path, play_order=play_order, - type=type, author=author, description=description, toc_thumbnail=toc_thumbnail)) + type=type, author=author, description=description, + toc_thumbnail=toc_thumbnail)) return self[-1] def top_level_items(self): @@ -121,7 +121,10 @@ class TOC(list): @property def abspath(self): - 'Return the file this toc entry points to as a absolute path to a file on the system.' + """ + Return the file this toc entry points to as a absolute path to a file + on the system. 
+ """ if self.href is None: return None @@ -136,8 +139,9 @@ class TOC(list): toc = toc['toc'] if toc is None: try: - toc = opfreader.soup.find('guide').find('reference', attrs={'type':'toc'})['href'] - except: + toc = (opfreader.soup.find('guide') + .find('reference', attrs={'type': 'toc'})['href']) + except Exception: for item in opfreader.manifest: if 'toc' in item.href().lower(): toc = item.href() @@ -151,13 +155,15 @@ class TOC(list): toc = os.path.join(self.base_path, toc) try: if not os.path.exists(toc): - bn = os.path.basename(toc) - bn = bn.replace('_top.htm', '_toc.htm') # Bug in BAEN OPF files + bn = os.path.basename(toc) + # Bug in BAEN OPF files + bn = bn.replace('_top.htm', '_toc.htm') toc = os.path.join(os.path.dirname(toc), bn) self.read_html_toc(toc) - except: - print('WARNING: Could not read Table of Contents. Continuing anyway.') + except Exception: + print('WARNING: Could not read Table of Contents. ' + 'Continuing anyway.') else: path = opfreader.manifest.item(toc.lower()) path = getattr(path, 'path', path) @@ -177,9 +183,9 @@ class TOC(list): self.base_path = os.path.dirname(toc) if root is None: with open(toc, 'rb') as f: - raw = xml_to_unicode(f.read(), assume_utf8=True, - strip_encoding_pats=True)[0] - root = safe_xml_fromstring(raw) + raw = xml_to_unicode(f.read(), assume_utf8=True, + strip_encoding_pats=True)[0] + root = etree.fromstring(raw) xpn = {'re': 'http://exslt.org/regular-expressions'} XPath = functools.partial(etree.XPath, namespaces=xpn) @@ -197,7 +203,7 @@ class TOC(list): def process_navpoint(np, dest): try: play_order = int(get_attr(np, 1)) - except: + except Exception: play_order = 1 href = fragment = text = None nd = dest @@ -207,7 +213,7 @@ class TOC(list): text = '' for txt in txt_path(nl): text += etree.tostring(txt, method='text', - encoding='unicode', with_tail=False) + encoding='unicode', with_tail=False) content = content_path(np) if content and text: content = content[0] @@ -242,17 +248,14 @@ class TOC(list): 
self.add_item(href, fragment, txt) def render(self, stream, uid): - root = E.ncx( - E.head( - E.meta(name='dtb:uid', content=str(uid)), - E.meta(name='dtb:depth', content=str(self.depth())), - E.meta(name='dtb:generator', content='%s (%s)'%(__appname__, - __version__)), - E.meta(name='dtb:totalPageCount', content='0'), - E.meta(name='dtb:maxPageNumber', content='0'), - ), - E.docTitle(E.text('Table of Contents')), - ) + root = E.ncx(E.head(E.meta(name='dtb:uid', content=str(uid)), + E.meta(name='dtb:depth', + content=str(self.depth())), + E.meta(name='dtb:generator', content='%s (%s)' % + (__appname__, __version__)), + E.meta(name='dtb:totalPageCount', content='0'), + E.meta(name='dtb:maxPageNumber', content='0')), + E.docTitle(E.text('Table of Contents'))) navmap = E.navMap() root.append(navmap) root.set('{http://www.w3.org/XML/1998/namespace}lang', 'en') @@ -263,12 +266,12 @@ class TOC(list): if not text: text = '' c[1] += 1 - item_id = 'num_%d'%c[1] + item_id = 'num_%d' % c[1] text = clean_xml_chars(text) elem = E.navPoint( E.navLabel(E.text(re.sub(r'\s+', ' ', text))), E.content(src=str(np.href)+(('#' + str(np.fragment)) - if np.fragment else '')), + if np.fragment else '')), id=item_id, playOrder=str(np.play_order) ) @@ -282,7 +285,8 @@ class TOC(list): try: elem.append(C.meta(desc, name='description')) except ValueError: - elem.append(C.meta(clean_xml_chars(desc), name='description')) + elem.append(C.meta(clean_xml_chars(desc), + name='description')) idx = getattr(np, 'toc_thumbnail', None) if idx: elem.append(C.meta(idx, name='toc_thumbnail')) @@ -293,5 +297,5 @@ class TOC(list): for np in self: navpoint(navmap, np) raw = etree.tostring(root, encoding='utf-8', xml_declaration=True, - pretty_print=True) + pretty_print=True) stream.write(raw) diff --git a/ebook_converter/ebooks/metadata/utils.py b/ebook_converter/ebooks/metadata/utils.py index 0fbce09..031c917 100644 --- a/ebook_converter/ebooks/metadata/utils.py +++ b/ebook_converter/ebooks/metadata/utils.py 
@@ -1,12 +1,13 @@ from collections import namedtuple +from lxml import etree + from ebook_converter.ebooks.chardet import xml_to_unicode -from ebook_converter.ebooks.oeb.base import OPF +from ebook_converter.ebooks.oeb import base from ebook_converter.ebooks.oeb.polish.utils import guess_type from ebook_converter.spell import parse_lang_code from ebook_converter.utils.cleantext import clean_xml_chars from ebook_converter.utils.localization import lang_as_iso639_1 -from ebook_converter.utils.xml_parse import safe_xml_fromstring OPFVersion = namedtuple('OPFVersion', 'major minor patch') @@ -35,23 +36,26 @@ def parse_opf(stream_or_path): raw = stream.read() if not raw: raise ValueError('Empty file: '+getattr(stream, 'name', 'stream')) - raw, encoding = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True, assume_utf8=True) + raw, encoding = xml_to_unicode(raw, strip_encoding_pats=True, + resolve_entities=True, assume_utf8=True) raw = raw[raw.find('<'):] - root = safe_xml_fromstring(clean_xml_chars(raw)) + root = etree.fromstring(clean_xml_chars(raw)) if root is None: raise ValueError('Not an OPF file') return root def normalize_languages(opf_languages, mi_languages): - ' Preserve original country codes and use 2-letter lang codes where possible ' + """ + Preserve original country codes and use 2-letter lang codes where possible + """ def parse(x): try: return parse_lang_code(x) except ValueError: return None opf_languages = filter(None, map(parse, opf_languages)) - cc_map = {c.langcode:c.countrycode for c in opf_languages} + cc_map = {c.langcode: c.countrycode for c in opf_languages} mi_languages = filter(None, map(parse, mi_languages)) def norm(x): @@ -83,9 +87,9 @@ def create_manifest_item(root, href_template, id_template, media_type=None): all_hrefs = frozenset(root.xpath('//*/@href')) href = ensure_unique(href_template, all_hrefs) item_id = ensure_unique(id_template, all_ids) - manifest = root.find(OPF('manifest')) + manifest = 
root.find(base.tag('opf', 'manifest')) if manifest is not None: - i = manifest.makeelement(OPF('item')) + i = manifest.makeelement(base.tag('opf', 'item')) i.set('href', href), i.set('id', item_id) i.set('media-type', media_type or guess_type(href_template)) manifest.append(i) @@ -93,6 +97,7 @@ def create_manifest_item(root, href_template, id_template, media_type=None): def pretty_print_opf(root): - from ebook_converter.ebooks.oeb.polish.pretty import pretty_opf, pretty_xml_tree + from ebook_converter.ebooks.oeb.polish.pretty import pretty_opf, \ + pretty_xml_tree pretty_opf(root) pretty_xml_tree(root) diff --git a/ebook_converter/ebooks/metadata/xmp.py b/ebook_converter/ebooks/metadata/xmp.py index fc6e154..e9a7faf 100644 --- a/ebook_converter/ebooks/metadata/xmp.py +++ b/ebook_converter/ebooks/metadata/xmp.py @@ -1,44 +1,43 @@ -import re, sys, copy, json -from itertools import repeat -from collections import defaultdict +import collections +import copy +import itertools +import json +import re +import sys from lxml import etree from lxml.builder import ElementMaker from ebook_converter import prints from ebook_converter.ebooks.metadata import check_isbn, check_doi -from ebook_converter.utils.xml_parse import safe_xml_fromstring from ebook_converter.ebooks.metadata.book.base import Metadata from ebook_converter.ebooks.metadata.opf2 import dump_dict from ebook_converter.utils.date import parse_date, isoformat, now -from ebook_converter.utils.localization import canonicalize_lang, lang_as_iso639_1 +from ebook_converter.utils.localization import canonicalize_lang, \ + lang_as_iso639_1 -__license__ = 'GPL v3' -__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>' +_xml_declaration = re.compile(r'<\?xml[^<>]+encoding\s*=\s*[\'"](.*?)' + r'[\'"][^<>]*>', re.IGNORECASE) -_xml_declaration = re.compile(r'<\?xml[^<>]+encoding\s*=\s*[\'"](.*?)[\'"][^<>]*>', re.IGNORECASE) - -NS_MAP = { - 'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#', - 'dc': 
'http://purl.org/dc/elements/1.1/', - 'pdf': 'http://ns.adobe.com/pdf/1.3/', - 'pdfx': 'http://ns.adobe.com/pdfx/1.3/', - 'xmp': 'http://ns.adobe.com/xap/1.0/', - 'xmpidq': 'http://ns.adobe.com/xmp/Identifier/qual/1.0/', - 'xmpMM': 'http://ns.adobe.com/xap/1.0/mm/', - 'xmpRights': 'http://ns.adobe.com/xap/1.0/rights/', - 'xmpBJ': 'http://ns.adobe.com/xap/1.0/bj/', - 'xmpTPg': 'http://ns.adobe.com/xap/1.0/t/pg/', - 'xmpDM': 'http://ns.adobe.com/xmp/1.0/DynamicMedia/', - 'prism': 'http://prismstandard.org/namespaces/basic/2.0/', - 'crossmark': 'http://crossref.org/crossmark/1.0/', - 'xml': 'http://www.w3.org/XML/1998/namespace', - 'x': 'adobe:ns:meta/', - 'calibre': 'http://calibre-ebook.com/xmp-namespace', - 'calibreSI': 'http://calibre-ebook.com/xmp-namespace-series-index', - 'calibreCC': 'http://calibre-ebook.com/xmp-namespace-custom-columns', -} +NS_MAP = {'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#', + 'dc': 'http://purl.org/dc/elements/1.1/', + 'pdf': 'http://ns.adobe.com/pdf/1.3/', + 'pdfx': 'http://ns.adobe.com/pdfx/1.3/', + 'xmp': 'http://ns.adobe.com/xap/1.0/', + 'xmpidq': 'http://ns.adobe.com/xmp/Identifier/qual/1.0/', + 'xmpMM': 'http://ns.adobe.com/xap/1.0/mm/', + 'xmpRights': 'http://ns.adobe.com/xap/1.0/rights/', + 'xmpBJ': 'http://ns.adobe.com/xap/1.0/bj/', + 'xmpTPg': 'http://ns.adobe.com/xap/1.0/t/pg/', + 'xmpDM': 'http://ns.adobe.com/xmp/1.0/DynamicMedia/', + 'prism': 'http://prismstandard.org/namespaces/basic/2.0/', + 'crossmark': 'http://crossref.org/crossmark/1.0/', + 'xml': 'http://www.w3.org/XML/1998/namespace', + 'x': 'adobe:ns:meta/', + 'calibre': 'http://calibre-ebook.com/xmp-namespace', + 'calibreSI': 'http://calibre-ebook.com/xmp-namespace-series-index', + 'calibreCC': 'http://calibre-ebook.com/xmp-namespace-custom-columns'} KNOWN_ID_SCHEMES = {'isbn', 'url', 'doi'} @@ -63,7 +62,7 @@ def parse_xmp_packet(raw_bytes): pat = r'''<?xpacket\s+[^>]*?begin\s*=\s*['"]([^'"]*)['"]''' encodings = ('8', '16-le', '16-be', '32-le', '32-be') 
header = raw_bytes[:1024] - emap = {'\ufeff'.encode('utf-'+x):'utf-'+x for x in encodings} + emap = {'\ufeff'.encode('utf-'+x): 'utf-'+x for x in encodings} emap[b''] = 'utf-8' for q in encodings: m = re.search(pat.encode('utf-'+q), header) @@ -71,15 +70,19 @@ def parse_xmp_packet(raw_bytes): enc = emap.get(m.group(1), enc) break if enc is None: - return safe_xml_fromstring(raw_bytes) - raw = _xml_declaration.sub('', raw_bytes.decode(enc)) # lxml barfs if encoding declaration present in unicode string - return safe_xml_fromstring(raw) + return etree.fromstring(raw_bytes) + # lxml barfs if encoding declaration present in unicode string + raw = _xml_declaration.sub('', raw_bytes.decode(enc)) + return etree.fromstring(raw) def serialize_xmp_packet(root, encoding='utf-8'): - root.tail = '\n' + '\n'.join(repeat(' '*100, 30)) # Adobe spec recommends inserting padding at the end of the packet - raw_bytes = etree.tostring(root, encoding=encoding, pretty_print=True, with_tail=True, method='xml') - return b'<?xpacket begin="%s" id="W5M0MpCehiHzreSzNTczkc9d"?>\n%s\n<?xpacket end="w"?>' % ('\ufeff'.encode(encoding), raw_bytes) + # Adobe spec recommends inserting padding at the end of the packet + root.tail = '\n' + '\n'.join(itertools.repeat(' '*100, 30)) + raw_bytes = etree.tostring(root, encoding=encoding, pretty_print=True, + with_tail=True, method='xml') + return ('<?xpacket begin="%s" id="W5M0MpCehiHzreSzNTczkc9d"?>\n%s\n' + '<?xpacket end="w"?>' % ('\ufeff'.encode(encoding), raw_bytes)) def read_simple_property(elem): @@ -106,14 +109,15 @@ def read_sequence(parent): yield read_simple_property(item) -def uniq(vals, kmap=lambda x:x): +def uniq(vals, kmap=lambda x: x): ''' Remove all duplicates from vals, while preserving order. 
kmap must be a callable that returns a hashable value for every item in vals ''' vals = vals or () lvals = (kmap(x) for x in vals) seen = set() seen_add = seen.add - return tuple(x for x, k in zip(vals, lvals) if k not in seen and not seen_add(k)) + return tuple(x for x, k in zip(vals, lvals) if k not in seen + and not seen_add(k)) def multiple_sequences(expr, root): @@ -170,7 +174,8 @@ def read_series(root): def read_user_metadata(mi, root): from ebook_converter.utils.config import from_json - from ebook_converter.ebooks.metadata.book.json_codec import decode_is_multiple + from ebook_converter.ebooks.metadata.book.json_codec import \ + decode_is_multiple fields = set() for item in XPath('//calibre:custom_metadata')(root): for li in XPath('./rdf:Bag/rdf:li')(item): @@ -186,7 +191,7 @@ def read_user_metadata(mi, root): decode_is_multiple(fm) mi.set_user_metadata(name, fm) fields.add(name) - except: + except Exception: prints('Failed to read user metadata:', name) import traceback traceback.print_exc() @@ -194,13 +199,17 @@ def read_user_metadata(mi, root): def read_xmp_identifers(parent): ''' For example: - <rdf:li rdf:parseType="Resource"><xmpidq:Scheme>URL</xmp:idq><rdf:value>http://foo.com</rdf:value></rdf:li> + <rdf:li rdf:parseType="Resource"><xmpidq:Scheme>URL</xmp:idq> + <rdf:value>http://foo.com</rdf:value></rdf:li> or the longer form: - <rdf:li><rdf:Description><xmpidq:Scheme>URL</xmp:idq><rdf:value>http://foo.com</rdf:value></rdf:Description></rdf:li> + <rdf:li><rdf:Description><xmpidq:Scheme>URL</xmp:idq> + <rdf:value>http://foo.com</rdf:value></rdf:Description></rdf:li> ''' for li in XPath('./rdf:Bag/rdf:li')(parent): - is_resource = li.attrib.get(expand('rdf:parseType'), None) == 'Resource' - is_resource = is_resource or (len(li) == 1 and li[0].tag == expand('rdf:Description')) + is_resource = li.attrib.get(expand('rdf:parseType'), + None) == 'Resource' + is_resource = is_resource or (len(li) == 1 and + li[0].tag == expand('rdf:Description')) if not 
is_resource: yield None, li.text or '' value = XPath('descendant::rdf:value')(li) @@ -241,12 +250,15 @@ def metadata_from_xmp_packet(raw_bytes): if title.startswith(r'\376\377'): # corrupted XMP packet generated by Nitro PDF. See # https://bugs.launchpad.net/calibre/+bug/1541981 - raise ValueError('Corrupted XMP metadata packet detected, probably generated by Nitro PDF') + raise ValueError('Corrupted XMP metadata packet detected, ' + 'probably generated by Nitro PDF') mi.title = title authors = multiple_sequences('//dc:creator', root) if authors: mi.authors = authors - tags = multiple_sequences('//dc:subject', root) or multiple_sequences('//pdf:Keywords', root) + tags = multiple_sequences('//dc:subject', + root) or multiple_sequences('//pdf:Keywords', + root) if tags: mi.tags = tags comments = first_alt('//dc:description', root) @@ -256,8 +268,10 @@ def metadata_from_xmp_packet(raw_bytes): if publishers: mi.publisher = publishers[0] try: - pubdate = parse_date(first_sequence('//dc:date', root) or first_simple('//xmp:CreateDate', root), assume_utc=False) - except: + pubdate = (parse_date(first_sequence('//dc:date', root) or + first_simple('//xmp:CreateDate', root), + assume_utc=False)) + except Exception: pass else: mi.pubdate = pubdate @@ -291,7 +305,7 @@ def metadata_from_xmp_packet(raw_bytes): if val: try: setattr(mi, x, json.loads(val)) - except: + except Exception: pass languages = multiple_sequences('//dc:language', root) @@ -319,7 +333,7 @@ def metadata_from_xmp_packet(raw_bytes): identifiers[scheme] = val # Check Dublin Core for recognizable identifier types - for scheme, check_func in {'doi':check_doi, 'isbn':check_isbn}.items(): + for scheme, check_func in {'doi': check_doi, 'isbn': check_isbn}.items(): if scheme not in identifiers: val = check_func(first_simple('//dc:identifier', root)) if val: @@ -359,17 +373,21 @@ def consolidate_metadata(info_mi, info): else: prefer_info = info_date > xmp_mi.metadata_date if prefer_info: - info_mi.title, 
info_mi.authors, info_mi.tags = info_title, info_authors, info_tags + info_mi.title = info_title + info_mi.authors = info_authors + info_mi.tags = info_tags else: # We'll use the xmp tags/authors but fallback to the info ones if the - # xmp does not have tags/authors. smart_update() should have taken care of - # the rest - info_mi.authors, info_mi.tags = (info_authors if xmp_mi.is_null('authors') else xmp_mi.authors), xmp_mi.tags or info_tags + # xmp does not have tags/authors. smart_update() should have taken care + # of the rest + info_mi.authors = (info_authors if xmp_mi.is_null('authors') + else xmp_mi.authors) + info_mi.tags = xmp_mi.tags or info_tags return info_mi def nsmap(*args): - return {x:NS_MAP[x] for x in args} + return {x: NS_MAP[x] for x in args} def create_simple_property(parent, tag, value): @@ -435,7 +453,8 @@ def create_series(calibre, series, series_index): def create_user_metadata(calibre, all_user_metadata): from ebook_converter.utils.config import to_json - from ebook_converter.ebooks.metadata.book.json_codec import object_to_unicode, encode_is_multiple + from ebook_converter.ebooks.metadata.book.json_codec import \ + object_to_unicode, encode_is_multiple s = calibre.makeelement(expand('calibre:custom_metadata')) calibre.append(s) @@ -447,7 +466,7 @@ def create_user_metadata(calibre, all_user_metadata): encode_is_multiple(fm) fm = object_to_unicode(fm) fm = json.dumps(fm, default=to_json, ensure_ascii=False) - except: + except Exception: prints('Failed to write user metadata:', name) import traceback traceback.print_exc() @@ -471,7 +490,8 @@ def metadata_to_xmp_packet(mi): dc = rdf.makeelement(expand('rdf:Description'), nsmap=nsmap('dc')) dc.set(expand('rdf:about'), '') rdf.append(dc) - for prop, tag in {'title':'dc:title', 'comments':'dc:description'}.items(): + for prop, tag in {'title': 'dc:title', + 'comments': 'dc:description'}.items(): val = mi.get(prop) or '' create_alt_property(dc, tag, val) for prop, (tag, ordered) in {'authors': 
('dc:creator', True), @@ -482,18 +502,23 @@ def metadata_to_xmp_packet(mi): val = [val] create_sequence_property(dc, tag, val, ordered) if not mi.is_null('pubdate'): - create_sequence_property(dc, 'dc:date', [isoformat(mi.pubdate, as_utc=False)]) # Adobe spec recommends local time + # Adobe spec recommends local time + create_sequence_property(dc, 'dc:date', + [isoformat(mi.pubdate, as_utc=False)]) if not mi.is_null('languages'): - langs = list(filter(None, map(lambda x:lang_as_iso639_1(x) or canonicalize_lang(x), mi.languages))) + langs = list(filter(None, map(lambda x: lang_as_iso639_1(x) or + canonicalize_lang(x), mi.languages))) if langs: create_sequence_property(dc, 'dc:language', langs, ordered=False) - xmp = rdf.makeelement(expand('rdf:Description'), nsmap=nsmap('xmp', 'xmpidq')) + xmp = rdf.makeelement(expand('rdf:Description'), + nsmap=nsmap('xmp', 'xmpidq')) xmp.set(expand('rdf:about'), '') rdf.append(xmp) extra_ids = {} for x in ('prism', 'pdfx'): - p = extra_ids[x] = rdf.makeelement(expand('rdf:Description'), nsmap=nsmap(x)) + p = extra_ids[x] = rdf.makeelement(expand('rdf:Description'), + nsmap=nsmap(x)) p.set(expand('rdf:about'), '') rdf.append(p) @@ -503,7 +528,7 @@ def metadata_to_xmp_packet(mi): for scheme, val in identifiers.items(): if scheme in {'isbn', 'doi'}: for prefix, parent in extra_ids.items(): - ie = parent.makeelement(expand('%s:%s'%(prefix, scheme))) + ie = parent.makeelement(expand('%s:%s' % (prefix, scheme))) ie.text = val parent.append(ie) @@ -511,7 +536,8 @@ def metadata_to_xmp_packet(mi): d.text = isoformat(now(), as_utc=False) xmp.append(d) - calibre = rdf.makeelement(expand('rdf:Description'), nsmap=nsmap('calibre', 'calibreSI', 'calibreCC')) + calibre = rdf.makeelement(expand('rdf:Description'), + nsmap=nsmap('calibre', 'calibreSI', 'calibreCC')) calibre.set(expand('rdf:about'), '') rdf.append(calibre) if not mi.is_null('rating'): @@ -524,7 +550,8 @@ def metadata_to_xmp_packet(mi): if not mi.is_null('series'): 
create_series(calibre, mi.series, mi.series_index) if not mi.is_null('timestamp'): - create_simple_property(calibre, 'calibre:timestamp', isoformat(mi.timestamp, as_utc=False)) + create_simple_property(calibre, 'calibre:timestamp', + isoformat(mi.timestamp, as_utc=False)) for x in ('author_link_map', 'user_categories'): val = getattr(mi, x, None) if val: @@ -550,10 +577,11 @@ def find_used_namespaces(elem): def find_preferred_prefix(namespace, elems): for elem in elems: - ans = {v:k for k, v in elem.nsmap.items()}.get(namespace, None) + ans = {v: k for k, v in elem.nsmap.items()}.get(namespace, None) if ans is not None: return ans - return find_preferred_prefix(namespace, elem.iterchildren(etree.Element)) + return find_preferred_prefix(namespace, + elem.iterchildren(etree.Element)) def find_nsmap(elems): @@ -562,7 +590,7 @@ def find_nsmap(elems): used_namespaces |= find_used_namespaces(elem) ans = {} used_namespaces -= {NS_MAP['xml'], NS_MAP['x'], None, NS_MAP['rdf']} - rmap = {v:k for k, v in NS_MAP.items()} + rmap = {v: k for k, v in NS_MAP.items()} i = 0 for ns in used_namespaces: if ns in rmap: @@ -578,7 +606,10 @@ def find_nsmap(elems): def clone_into(parent, elem): - ' Clone the element, assuming that all namespace declarations are present in parent ' + """ + Clone the element, assuming that all namespace declarations are present + in parent + """ clone = parent.makeelement(elem.tag) parent.append(clone) if elem.text and not elem.text.isspace(): @@ -591,28 +622,38 @@ def clone_into(parent, elem): def merge_xmp_packet(old, new): - ''' Merge metadata present in the old packet that is not present in the new + """ + Merge metadata present in the old packet that is not present in the new one into the new one. 
Assumes the new packet was generated by - metadata_to_xmp_packet() ''' + metadata_to_xmp_packet() + """ old, new = parse_xmp_packet(old), parse_xmp_packet(new) - # As per the adobe spec all metadata items have to be present inside top-level rdf:Description containers + # As per the adobe spec all metadata items have to be present inside + # top-level rdf:Description containers item_xpath = XPath('//rdf:RDF/rdf:Description/*') # First remove all data fields that metadata_to_xmp_packet() knowns about, # since either they will have been set or if not present, imply they have # been cleared - defined_tags = {expand(prefix + ':' + scheme) for prefix in ('prism', 'pdfx') for scheme in KNOWN_ID_SCHEMES} - defined_tags |= {expand('dc:' + x) for x in ('identifier', 'title', 'creator', 'date', 'description', 'language', 'publisher', 'subject')} - defined_tags |= {expand('xmp:' + x) for x in ('MetadataDate', 'Identifier')} + defined_tags = {expand(prefix + ':' + scheme) + for prefix in ('prism', 'pdfx') + for scheme in KNOWN_ID_SCHEMES} + defined_tags |= {expand('dc:' + x) + for x in ('identifier', 'title', 'creator', 'date', + 'description', 'language', 'publisher', + 'subject')} + defined_tags |= {expand('xmp:' + x) + for x in ('MetadataDate', 'Identifier')} # For redundancy also remove all fields explicitly set in the new packet defined_tags |= {x.tag for x in item_xpath(new)} calibrens = '{%s}' % NS_MAP['calibre'] for elem in item_xpath(old): - if elem.tag in defined_tags or (elem.tag and elem.tag.startswith(calibrens)): + if elem.tag in defined_tags or (elem.tag and + elem.tag.startswith(calibrens)): elem.getparent().remove(elem) # Group all items into groups based on their namespaces - groups = defaultdict(list) + groups = collections.defaultdict(list) for item in item_xpath(new): ns = item.nsmap[item.prefix] groups[ns].append(item) @@ -626,9 +667,14 @@ def merge_xmp_packet(old, new): root = A.xmpmeta(R.RDF) rdf = root[0] - for namespace in sorted(groups, key=lambda 
x:{NS_MAP['dc']:'a', NS_MAP['xmp']:'b', NS_MAP['calibre']:'c'}.get(x, 'z'+x)): + for namespace in sorted(groups, + key=lambda x: {NS_MAP['dc']: 'a', + NS_MAP['xmp']: 'b', + NS_MAP['calibre']: 'c'}.get(x, + 'z'+x)): items = groups[namespace] - desc = rdf.makeelement(expand('rdf:Description'), nsmap=find_nsmap(items)) + desc = rdf.makeelement(expand('rdf:Description'), + nsmap=find_nsmap(items)) desc.set(expand('rdf:about'), '') rdf.append(desc) for item in items: diff --git a/ebook_converter/ebooks/mobi/mobiml.py b/ebook_converter/ebooks/mobi/mobiml.py index 07cea1c..d833d88 100644 --- a/ebook_converter/ebooks/mobi/mobiml.py +++ b/ebook_converter/ebooks/mobi/mobiml.py @@ -5,8 +5,10 @@ import copy import re import numbers from lxml import etree -from ebook_converter.ebooks.oeb.base import namespace, barename -from ebook_converter.ebooks.oeb.base import XHTML, XHTML_NS, urlnormalize + +from ebook_converter import constants as const +from ebook_converter.ebooks.oeb import base +from ebook_converter.ebooks.oeb import parse_utils from ebook_converter.ebooks.oeb.stylizer import Stylizer from ebook_converter.ebooks.oeb.transforms.flatcss import KeyMapper from ebook_converter.ebooks.mobi.utils import convert_color_for_font_tag @@ -23,7 +25,7 @@ def MBP(name): return '{%s}%s' % (MBP_NS, name) -MOBI_NSMAP = {None: XHTML_NS, 'mbp': MBP_NS} +MOBI_NSMAP = {None: const.XHTML_NS, 'mbp': const.MBP_NS} INLINE_TAGS = {'span', 'a', 'code', 'u', 's', 'big', 'strike', 'tt', 'font', 'q', 'i', 'b', 'em', 'strong', 'sup', 'sub'} HEADER_TAGS = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'} # GR: Added 'caption' to both sets @@ -129,9 +131,9 @@ class MobiMLizer(object): 'Iterate over the spine and convert it to MOBIML' for item in self.oeb.spine: stylizer = Stylizer(item.data, item.href, self.oeb, self.opts, self.profile) - body = item.data.find(XHTML('body')) - nroot = etree.Element(XHTML('html'), nsmap=MOBI_NSMAP) - nbody = etree.SubElement(nroot, XHTML('body')) + body = 
item.data.find(base.tag('xhtml', 'body')) + nroot = etree.Element(base.tag('xhtml', 'html'), nsmap=MOBI_NSMAP) + nbody = etree.SubElement(nroot, base.tag('xhtml', 'body')) self.current_spine_item = item self.mobimlize_elem(body, stylizer, BlockState(nbody), [FormatState()]) @@ -162,7 +164,7 @@ class MobiMLizer(object): lines = text.split('\n') result = lines[:1] for line in lines[1:]: - result.append(etree.Element(XHTML('br'))) + result.append(etree.Element(base.tag('xhtml', 'br'))) if line: result.append(line) return result @@ -194,7 +196,7 @@ class MobiMLizer(object): indent = (indent / abs(indent)) * self.profile.fbase if tag in NESTABLE_TAGS and not istate.rendered: para = wrapper = etree.SubElement( - parent, XHTML(tag), attrib=istate.attrib) + parent, base.tag('xhtml', tag), attrib=istate.attrib) bstate.nested.append(para) if tag == 'li' and len(istates) > 1: istates[-2].list_num += 1 @@ -203,21 +205,21 @@ class MobiMLizer(object): para = wrapper = bstate.nested[-1] elif not self.opts.mobi_ignore_margins and left > 0 and indent >= 0: ems = self.profile.mobi_ems_per_blockquote - para = wrapper = etree.SubElement(parent, XHTML('blockquote')) + para = wrapper = etree.SubElement(parent, base.tag('xhtml', 'blockquote')) para = wrapper emleft = int(round(left / self.profile.fbase)) - ems emleft = min((emleft, 10)) while emleft > ems / 2: - para = etree.SubElement(para, XHTML('blockquote')) + para = etree.SubElement(para, base.tag('xhtml', 'blockquote')) emleft -= ems else: - para = wrapper = etree.SubElement(parent, XHTML('p')) + para = wrapper = etree.SubElement(parent, base.tag('xhtml', 'p')) bstate.inline = bstate.para = para vspace = bstate.vpadding + bstate.vmargin bstate.vpadding = bstate.vmargin = 0 if tag not in TABLE_TAGS: if tag in ('ul', 'ol') and vspace > 0: - wrapper.addprevious(etree.Element(XHTML('div'), + wrapper.addprevious(etree.Element(base.tag('xhtml', 'div'), height=self.mobimlize_measure(vspace))) else: wrapper.attrib['height'] = 
self.mobimlize_measure(vspace) @@ -225,7 +227,7 @@ class MobiMLizer(object): elif tag == 'table' and vspace > 0: vspace = int(round(vspace / self.profile.fbase)) while vspace > 0: - wrapper.addprevious(etree.Element(XHTML('br'))) + wrapper.addprevious(etree.Element(base.tag('xhtml', 'br'))) vspace -= 1 if istate.halign != 'auto' and isinstance(istate.halign, (bytes, str)): if isinstance(istate.halign, bytes): @@ -237,7 +239,7 @@ class MobiMLizer(object): bstate.inline = para pstate = bstate.istate = None try: - etree.SubElement(para, XHTML(tag), attrib=istate.attrib) + etree.SubElement(para, base.tag('xhtml', tag), attrib=istate.attrib) except: print('Invalid subelement:', para, tag, istate.attrib) raise @@ -245,7 +247,7 @@ class MobiMLizer(object): para.attrib['valign'] = 'top' if istate.ids: for id_ in istate.ids: - anchor = etree.Element(XHTML('a'), attrib={'id': id_}) + anchor = etree.Element(base.tag('xhtml', 'a'), attrib={'id': id_}) if tag == 'li': try: last = bstate.body[-1][-1] @@ -262,7 +264,7 @@ class MobiMLizer(object): # This could potentially break if inserting an anchor at # this point in the markup is illegal, but I cannot think # of such a case offhand. 
- if barename(last.tag) in LEAF_TAGS: + if parse_utils.barename(last.tag) in LEAF_TAGS: last.addprevious(anchor) else: last.append(anchor) @@ -279,28 +281,28 @@ class MobiMLizer(object): elif pstate and pstate.href == href: inline = bstate.anchor else: - inline = etree.SubElement(inline, XHTML('a'), href=href) + inline = etree.SubElement(inline, base.tag('xhtml', 'a'), href=href) bstate.anchor = inline if fsize != 3: - inline = etree.SubElement(inline, XHTML('font'), + inline = etree.SubElement(inline, base.tag('xhtml', 'font'), size=str(fsize)) if istate.family == 'monospace': - inline = etree.SubElement(inline, XHTML('tt')) + inline = etree.SubElement(inline, base.tag('xhtml', 'tt')) if istate.italic: - inline = etree.SubElement(inline, XHTML('i')) + inline = etree.SubElement(inline, base.tag('xhtml', 'i')) if istate.bold: - inline = etree.SubElement(inline, XHTML('b')) + inline = etree.SubElement(inline, base.tag('xhtml', 'b')) if istate.bgcolor is not None and istate.bgcolor != 'transparent' : - inline = etree.SubElement(inline, XHTML('span'), + inline = etree.SubElement(inline, base.tag('xhtml', 'span'), bgcolor=convert_color_for_font_tag(istate.bgcolor)) if istate.fgcolor != 'black': - inline = etree.SubElement(inline, XHTML('font'), + inline = etree.SubElement(inline, base.tag('xhtml', 'font'), color=convert_color_for_font_tag(istate.fgcolor)) if istate.strikethrough: - inline = etree.SubElement(inline, XHTML('s')) + inline = etree.SubElement(inline, base.tag('xhtml', 's')) if istate.underline: - inline = etree.SubElement(inline, XHTML('u')) + inline = etree.SubElement(inline, base.tag('xhtml', 'u')) bstate.inline = inline bstate.istate = istate inline = bstate.inline @@ -318,7 +320,7 @@ class MobiMLizer(object): def mobimlize_elem(self, elem, stylizer, bstate, istates, ignore_valign=False): if not isinstance(elem.tag, (str, bytes)) \ - or namespace(elem.tag) != XHTML_NS: + or parse_utils.namespace(elem.tag) != const.XHTML_NS: return style = 
stylizer.style(elem) # <mbp:frame-set/> does not exist lalalala @@ -333,10 +335,10 @@ class MobiMLizer(object): elem.text = None elem.set('id', id_) elem.tail = tail - elem.tag = XHTML('a') + elem.tag = base.tag('xhtml', 'a') else: return - tag = barename(elem.tag) + tag = parse_utils.barename(elem.tag) istate = copy.copy(istates[-1]) istate.rendered = False istate.list_num = 0 @@ -451,7 +453,7 @@ class MobiMLizer(object): if 'width' not in istate.attrib or 'height' not in istate.attrib: href = self.current_spine_item.abshref(elem.attrib['src']) try: - item = self.oeb.manifest.hrefs[urlnormalize(href)] + item = self.oeb.manifest.hrefs[base.urlnormalize(href)] except: self.oeb.logger.warn('Failed to find image:', href) @@ -534,9 +536,9 @@ class MobiMLizer(object): isinstance(valign, numbers.Number) and valign > 0) vtag = 'sup' if issup else 'sub' if not_baseline and not ignore_valign and tag not in NOT_VTAGS and not isblock: - nroot = etree.Element(XHTML('html'), nsmap=MOBI_NSMAP) - vbstate = BlockState(etree.SubElement(nroot, XHTML('body'))) - vbstate.para = etree.SubElement(vbstate.body, XHTML('p')) + nroot = etree.Element(base.tag('xhtml', 'html'), nsmap=MOBI_NSMAP) + vbstate = BlockState(etree.SubElement(nroot, base.tag('xhtml', 'body'))) + vbstate.para = etree.SubElement(vbstate.body, base.tag('xhtml', 'p')) self.mobimlize_elem(elem, stylizer, vbstate, istates, ignore_valign=True) if len(istates) > 0: @@ -548,8 +550,8 @@ class MobiMLizer(object): self.mobimlize_content('span', '', bstate, istates) parent = bstate.para if bstate.inline is None else bstate.inline if parent is not None: - vtag = etree.SubElement(parent, XHTML(vtag)) - vtag = etree.SubElement(vtag, XHTML('small')) + vtag = etree.SubElement(parent, base.tag('xhtml', vtag)) + vtag = etree.SubElement(vtag, base.tag('xhtml', 'small')) # Add anchors for child in vbstate.body: if child is not vbstate.para: @@ -601,7 +603,7 @@ class MobiMLizer(object): para = bstate.para if para is not None and para.text 
== '\xa0' and len(para) < 1: if style.height > 2: - para.getparent().replace(para, etree.Element(XHTML('br'))) + para.getparent().replace(para, etree.Element(base.tag('xhtml', 'br'))) else: # This is too small to be rendered effectively, drop it para.getparent().remove(para) diff --git a/ebook_converter/ebooks/mobi/reader/mobi8.py b/ebook_converter/ebooks/mobi/reader/mobi8.py index a50d6bc..02b2f0e 100644 --- a/ebook_converter/ebooks/mobi/reader/mobi8.py +++ b/ebook_converter/ebooks/mobi/reader/mobi8.py @@ -8,6 +8,7 @@ import uuid from lxml import etree +from ebook_converter import constants as const from ebook_converter.ebooks.mobi.reader.headers import NULL_INDEX from ebook_converter.ebooks.mobi.reader.index import read_index from ebook_converter.ebooks.mobi.reader.ncx import read_ncx, build_toc @@ -17,7 +18,7 @@ from ebook_converter.ebooks.metadata.opf2 import Guide, OPFCreator from ebook_converter.ebooks.metadata.toc import TOC from ebook_converter.ebooks.mobi.utils import read_font_record from ebook_converter.ebooks.oeb.parse_utils import parse_html -from ebook_converter.ebooks.oeb.base import XPath, XHTML, xml2text +from ebook_converter.ebooks.oeb.base import XPath, xml2text from ebook_converter.polyglot.builtins import as_unicode @@ -553,8 +554,8 @@ class Mobi8Reader(object): seen = set() links = [] for elem in root.iterdescendants(etree.Element): - if reached and elem.tag == XHTML('a') and elem.get('href', - False): + if reached and elem.tag == const.XHTML_A and elem.get('href', + False): href = elem.get('href') href, frag = urllib.parse.urldefrag(href) href = base_href + '/' + href diff --git a/ebook_converter/ebooks/mobi/writer2/serializer.py b/ebook_converter/ebooks/mobi/writer2/serializer.py index c1f2587..6d0ae8d 100644 --- a/ebook_converter/ebooks/mobi/writer2/serializer.py +++ b/ebook_converter/ebooks/mobi/writer2/serializer.py @@ -4,16 +4,11 @@ import re import unicodedata import urllib.parse +from ebook_converter import constants as const from 
ebook_converter.ebooks.mobi.mobiml import MBP_NS from ebook_converter.ebooks.mobi.utils import is_guide_ref_start -from ebook_converter.ebooks.oeb.base import ( - OEB_DOCS, XHTML, XHTML_NS, XML_NS, namespace, prefixname, urlnormalize -) - - -__license__ = 'GPL v3' -__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>' -__docformat__ = 'restructuredtext en' +from ebook_converter.ebooks.oeb import base +from ebook_converter.ebooks.oeb import parse_utils class Buf(io.BytesIO): @@ -25,9 +20,14 @@ class Buf(io.BytesIO): class Serializer(object): - NSRMAP = {'': None, XML_NS: 'xml', XHTML_NS: '', MBP_NS: 'mbp'} + NSRMAP = {'': None, + const.XML_NS: 'xml', + const.XHTML_NS: '', + MBP_NS: 'mbp'} # TODO(gryf): check why this is different than + # MBP_NS from const. - def __init__(self, oeb, images, is_periodical, write_page_breaks_after_item=True): + def __init__(self, oeb, images, is_periodical, + write_page_breaks_after_item=True): ''' Write all the HTML markup in oeb into a single in memory buffer containing a single html document with links replaced by offsets into @@ -157,7 +157,8 @@ class Serializer(object): buf.write(b'<guide>') for ref in self.oeb.guide.values(): path = urllib.parse.urldefrag(ref.href)[0] - if path not in hrefs or hrefs[path].media_type not in OEB_DOCS: + if (path not in hrefs or + hrefs[path].media_type not in base.OEB_DOCS): continue buf.write(b'<reference type="') @@ -178,28 +179,28 @@ class Serializer(object): buf.write(b'</guide>') - def serialize_href(self, href, base=None): - ''' + def serialize_href(self, href, _base=None): + """ Serialize the href attribute of an <a> or <reference> tag. It is serialized as filepos="000000000" and a pointer to its location is stored in self.href_offsets so that the correct value can be filled in at the end. 
- ''' + """ hrefs = self.oeb.manifest.hrefs try: - path, frag = urllib.parse.urldefrag(urlnormalize(href)) + path, frag = urllib.parse.urldefrag(base.urlnormalize(href)) except ValueError: # Unparseable URL return False - if path and base: - path = base.abshref(path) + if path and _base: + path = _base.abshref(path) if path and path not in hrefs: return False buf = self.buf item = hrefs[path] if path else None if item and item.spine_position is None: return False - path = item.href if item else base.href + path = item.href if item else _base.href href = '#'.join((path, frag)) if frag else path buf.write(b'filepos=') self.href_offsets[href].append(buf.tell()) @@ -219,7 +220,7 @@ class Serializer(object): if href is not None: # resolve the section url in id_offsets buf.write(b'<mbp:pagebreak />') - self.id_offsets[urlnormalize(href)] = buf.tell() + self.id_offsets[base.urlnormalize(href)] = buf.tell() if tocref.klass == "periodical": buf.write(b'<div> <div height="1em"></div>') @@ -267,7 +268,7 @@ class Serializer(object): if self.is_periodical and item.is_section_start: for section_toc in top_toc.nodes: - if urlnormalize(item.href) == section_toc.href: + if base.urlnormalize(item.href) == section_toc.href: # create section url of the form r'feed_\d+/index.html' section_url = re.sub(r'article_\d+/', '', section_toc.href) serialize_toc_level(section_toc, section_url) @@ -287,12 +288,12 @@ class Serializer(object): buf = self.buf if not item.linear: self.breaks.append(buf.tell() - 1) - self.id_offsets[urlnormalize(item.href)] = buf.tell() + self.id_offsets[base.urlnormalize(item.href)] = buf.tell() if item.is_section_start: buf.write(b'<a ></a> ') if item.is_article_start: buf.write(b'<a ></a> <a ></a>') - for elem in item.data.find(XHTML('body')): + for elem in item.data.find(base.tag('xhtml', 'body')): self.serialize_elem(elem, item) if self.write_page_breaks_after_item: buf.write(b'<mbp:pagebreak/>') @@ -306,15 +307,15 @@ class Serializer(object): def 
serialize_elem(self, elem, item, nsrmap=NSRMAP): buf = self.buf if not isinstance(elem.tag, (str, bytes)) \ - or namespace(elem.tag) not in nsrmap: + or parse_utils.namespace(elem.tag) not in nsrmap: return - tag = prefixname(elem.tag, nsrmap) + tag = base.prefixname(elem.tag, nsrmap) # Previous layers take care of @name id_ = elem.attrib.pop('id', None) if id_: href = '#'.join((item.href, id_)) offset = self.anchor_offset or buf.tell() - key = urlnormalize(href) + key = base.urlnormalize(href) # Only set this id_offset if it wasn't previously seen self.id_offsets[key] = self.id_offsets.get(key, offset) if self.anchor_offset is not None and \ @@ -326,15 +327,15 @@ class Serializer(object): buf.write(tag.encode('utf-8')) if elem.attrib: for attr, val in elem.attrib.items(): - if namespace(attr) not in nsrmap: + if parse_utils.namespace(attr) not in nsrmap: continue - attr = prefixname(attr, nsrmap) + attr = base.prefixname(attr, nsrmap) buf.write(b' ') if attr == 'href': if self.serialize_href(val, item): continue elif attr == 'src': - href = urlnormalize(item.abshref(val)) + href = base.urlnormalize(item.abshref(val)) if href in self.images: index = self.images[href] self.used_images.add(href) diff --git a/ebook_converter/ebooks/mobi/writer8/exth.py b/ebook_converter/ebooks/mobi/writer8/exth.py index 435aa23..4c3ed6e 100644 --- a/ebook_converter/ebooks/mobi/writer8/exth.py +++ b/ebook_converter/ebooks/mobi/writer8/exth.py @@ -2,6 +2,7 @@ import re from struct import pack from io import BytesIO +from ebook_converter.ebooks.oeb import base from ebook_converter.constants_old import iswindows, isosx from ebook_converter.ebooks.mobi.utils import (utf8_text, to_base) from ebook_converter.utils.localization import lang_as_iso639_1 @@ -95,9 +96,8 @@ def build_exth(metadata, prefer_author_sort=False, is_periodical=False, # Write UUID as ASIN uuid = None - from ebook_converter.ebooks.oeb.base import OPF for x in metadata['identifier']: - if (x.get(OPF('scheme'), 
None).lower() == 'uuid' or + if (x.get(base.tag('opf', 'scheme'), None).lower() == 'uuid' or str(x).startswith('urn:uuid:')): uuid = str(x).split(':')[-1] break diff --git a/ebook_converter/ebooks/odt/input.py b/ebook_converter/ebooks/odt/input.py index ab3f5e2..ba6b7bf 100644 --- a/ebook_converter/ebooks/odt/input.py +++ b/ebook_converter/ebooks/odt/input.py @@ -1,11 +1,12 @@ """ Convert an ODT file into a Open Ebook """ -import os, logging +import logging +import os -from lxml import etree from css_parser import CSSParser from css_parser.css import CSSRule +from lxml import etree from odf.odf2xhtml import ODF2XHTML from odf.opendocument import load as odLoad @@ -14,15 +15,9 @@ from odf.namespaces import TEXTNS as odTEXTNS from ebook_converter import CurrentDir, walk from ebook_converter.ebooks.oeb.base import _css_logger -from ebook_converter.utils.xml_parse import safe_xml_fromstring from ebook_converter.polyglot.builtins import as_bytes -__license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' -__docformat__ = 'restructuredtext en' - - class Extract(ODF2XHTML): def extract_pictures(self, zf): @@ -46,7 +41,7 @@ class Extract(ODF2XHTML): ol.set('start', val) def fix_markup(self, html, log): - root = safe_xml_fromstring(html) + root = etree.fromstring(html) self.filter_css(root, log) self.extract_css(root, log) self.epubify_markup(root, log) diff --git a/ebook_converter/ebooks/oeb/base.py b/ebook_converter/ebooks/oeb/base.py index 2862fd9..769c87b 100644 --- a/ebook_converter/ebooks/oeb/base.py +++ b/ebook_converter/ebooks/oeb/base.py @@ -1,97 +1,45 @@ """ Basic support for manipulating OEB 1.x/2.0 content and metadata. 
""" -import os, re, logging, sys, numbers -from collections import defaultdict -from itertools import count -from operator import attrgetter -import urllib.parse +import collections +import itertools +import logging +import numbers +import operator +import os +import re import string +import sys +import urllib.parse -from lxml import etree, html +from lxml import etree +from lxml import html + +from ebook_converter import constants as const from ebook_converter import force_unicode from ebook_converter.constants_old import filesystem_encoding, __version__ -from ebook_converter.utils.xml_parse import safe_xml_fromstring from ebook_converter.ebooks.chardet import xml_to_unicode from ebook_converter.ebooks.conversion.preprocess import CSSPreProcessor from ebook_converter import (isbytestring, as_unicode, get_types_map) -from ebook_converter.ebooks.oeb.parse_utils import barename, XHTML_NS, namespace, XHTML, parse_html, NotHTML +from ebook_converter.ebooks.oeb import parse_utils from ebook_converter.utils.cleantext import clean_xml_chars from ebook_converter.utils.short_uuid import uuid4 from ebook_converter.polyglot.urllib import unquote as urlunquote -__license__ = 'GPL v3' -__copyright__ = '2008, Marshall T. 
Vandegrift <llasram@gmail.com>' -__docformat__ = 'restructuredtext en' - -XML_NS = 'http://www.w3.org/XML/1998/namespace' -OEB_DOC_NS = 'http://openebook.org/namespaces/oeb-document/1.0/' -OPF1_NS = 'http://openebook.org/namespaces/oeb-package/1.0/' -OPF2_NS = 'http://www.idpf.org/2007/opf' -OPF_NSES = {OPF1_NS, OPF2_NS} -DC09_NS = 'http://purl.org/metadata/dublin_core' -DC10_NS = 'http://purl.org/dc/elements/1.0/' -DC11_NS = 'http://purl.org/dc/elements/1.1/' -DC_NSES = {DC09_NS, DC10_NS, DC11_NS} -XSI_NS = 'http://www.w3.org/2001/XMLSchema-instance' -DCTERMS_NS = 'http://purl.org/dc/terms/' -NCX_NS = 'http://www.daisy.org/z3986/2005/ncx/' -SVG_NS = 'http://www.w3.org/2000/svg' -XLINK_NS = 'http://www.w3.org/1999/xlink' -CALIBRE_NS = 'http://calibre.kovidgoyal.net/2009/metadata' -RE_NS = 'http://exslt.org/regular-expressions' -MBP_NS = 'http://www.mobipocket.com' -EPUB_NS = 'http://www.idpf.org/2007/ops' -MATHML_NS = 'http://www.w3.org/1998/Math/MathML' - -XPNSMAP = { - 'h': XHTML_NS, 'o1': OPF1_NS, 'o2': OPF2_NS, 'd09': DC09_NS, - 'd10': DC10_NS, 'd11': DC11_NS, 'xsi': XSI_NS, 'dt': DCTERMS_NS, - 'ncx': NCX_NS, 'svg': SVG_NS, 'xl': XLINK_NS, 're': RE_NS, - 'mathml': MATHML_NS, 'mbp': MBP_NS, 'calibre': CALIBRE_NS, - 'epub':EPUB_NS -} - -OPF1_NSMAP = {'dc': DC11_NS, 'oebpackage': OPF1_NS} -OPF2_NSMAP = {'opf': OPF2_NS, 'dc': DC11_NS, 'dcterms': DCTERMS_NS, - 'xsi': XSI_NS, 'calibre': CALIBRE_NS} - - -def XML(name): - return '{%s}%s' % (XML_NS, name) - - -def OPF(name): - return '{%s}%s' % (OPF2_NS, name) - - -def DC(name): - return '{%s}%s' % (DC11_NS, name) - - -def XSI(name): - return '{%s}%s' % (XSI_NS, name) - - -def DCTERMS(name): - return '{%s}%s' % (DCTERMS_NS, name) - - -def NCX(name): - return '{%s}%s' % (NCX_NS, name) - - -def SVG(name): - return '{%s}%s' % (SVG_NS, name) - - -def XLINK(name): - return '{%s}%s' % (XLINK_NS, name) - - -def CALIBRE(name): - return '{%s}%s' % (CALIBRE_NS, name) +def tag(tag_ns, name): + tag_map = {'calibre': 
const.CALIBRE_NS, + 'dc': const.DC11_NS, + 'dcterms': const.DCTERMS_NS, + 'epub': const.EPUB_NS, + 'ncx': const.NCX_NS, + 'opf': const.OPF2_NS, + 'svg': const.SVG_NS, + 'xhtml': const.XHTML_NS, + 'xlink': const.XLINK_NS, + 'xml': const.XML_NS, + 'xsi': const.XSI_NS} + return '{%s}%s' % (tag_map[tag_ns], name) _css_url_re = re.compile(r'url\s*\([\'"]{0,1}(.*?)[\'"]{0,1}\)', re.I) @@ -99,14 +47,18 @@ _css_import_re = re.compile(r'@import "(.*?)"') _archive_re = re.compile(r'[^ ]+') # Tags that should not be self closed in epub output -self_closing_bad_tags = {'a', 'abbr', 'address', 'article', 'aside', 'audio', 'b', -'bdo', 'blockquote', 'body', 'button', 'cite', 'code', 'dd', 'del', 'details', -'dfn', 'div', 'dl', 'dt', 'em', 'fieldset', 'figcaption', 'figure', 'footer', -'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hgroup', 'i', 'iframe', 'ins', 'kbd', -'label', 'legend', 'li', 'map', 'mark', 'meter', 'nav', 'ol', 'output', 'p', -'pre', 'progress', 'q', 'rp', 'rt', 'samp', 'section', 'select', 'small', -'span', 'strong', 'sub', 'summary', 'sup', 'textarea', 'time', 'ul', 'var', -'video', 'title', 'script', 'style'} +self_closing_bad_tags = {'a', 'abbr', 'address', 'article', 'aside', 'audio', + 'b', 'bdo', 'blockquote', 'body', 'button', 'cite', + 'code', 'dd', 'del', 'details', 'dfn', 'div', 'dl', + 'dt', 'em', 'fieldset', 'figcaption', 'figure', + 'footer', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', + 'header', 'hgroup', 'i', 'iframe', 'ins', 'kbd', + 'label', 'legend', 'li', 'map', 'mark', 'meter', + 'nav', 'ol', 'output', 'p', 'pre', 'progress', 'q', + 'rp', 'rt', 'samp', 'section', 'select', 'small', + 'span', 'strong', 'sub', 'summary', 'sup', + 'textarea', 'time', 'ul', 'var', 'video', 'title', + 'script', 'style'} def css_text(x): @@ -157,7 +109,7 @@ def itercsslinks(raw): yield match.group(1), match.start(1) -_link_attrs = set(html.defs.link_attrs) | {XLINK('href'), 'poster'} +_link_attrs = set(html.defs.link_attrs) | {tag('xlink', 'href'), 'poster'} def 
iterlinks(root, find_links_in_css=True): @@ -170,7 +122,7 @@ def iterlinks(root, find_links_in_css=True): for el in root.iter('*'): try: - tag = barename(el.tag).lower() + tag = parse_utils.barename(el.tag).lower() except Exception: continue attribs = el.attrib @@ -225,7 +177,7 @@ def make_links_absolute(root, base_url): def resolve_base_href(root): base_href = None basetags = root.xpath('//base[@href]|//h:base[@href]', - namespaces=XPNSMAP) + namespaces=const.XPNSMAP) for b in basetags: base_href = b.get('href') b.drop_tree() @@ -278,20 +230,20 @@ def rewrite_links(root, link_repl_func, resolve_base_href=False): el.attrib[attrib] = new parser = CSSParser(raiseExceptions=False, log=_css_logger, - fetcher=lambda x:(None, None)) + fetcher=lambda x: (None, None)) for el in root.iter(etree.Element): try: tag = el.tag except UnicodeDecodeError: continue - if tag == XHTML('style') and el.text and \ + if tag == parse_utils.XHTML('style') and el.text and \ (_css_url_re.search(el.text) is not None or '@import' in - el.text): + el.text): stylesheet = parser.parseString(el.text, validate=False) replaceUrls(stylesheet, link_repl_func) repl = css_text(stylesheet) - el.text = '\n'+ clean_xml_chars(repl) + '\n' + el.text = '\n' + clean_xml_chars(repl) + '\n' text = el.get('style') if text and _css_url_re.search(text) is not None: @@ -301,43 +253,41 @@ def rewrite_links(root, link_repl_func, resolve_base_href=False): # Parsing errors are raised by css_parser continue replaceUrls(stext, link_repl_func) - repl = css_text(stext).replace('\n', ' ').replace('\r', - ' ') + repl = css_text(stext).replace('\n', ' ').replace('\r', ' ') el.set('style', repl) types_map = get_types_map() -EPUB_MIME = types_map['.epub'] -XHTML_MIME = types_map['.xhtml'] -CSS_MIME = types_map['.css'] -NCX_MIME = types_map['.ncx'] -OPF_MIME = types_map['.opf'] -PAGE_MAP_MIME = 'application/oebps-page-map+xml' -OEB_DOC_MIME = 'text/x-oeb1-document' -OEB_CSS_MIME = 'text/x-oeb1-css' -OPENTYPE_MIME = 
types_map['.otf'] -GIF_MIME = types_map['.gif'] -JPEG_MIME = types_map['.jpeg'] -PNG_MIME = types_map['.png'] -SVG_MIME = types_map['.svg'] -BINARY_MIME = 'application/octet-stream' +EPUB_MIME = types_map['.epub'] +XHTML_MIME = types_map['.xhtml'] +CSS_MIME = types_map['.css'] +NCX_MIME = types_map['.ncx'] +OPF_MIME = types_map['.opf'] +PAGE_MAP_MIME = 'application/oebps-page-map+xml' +OEB_DOC_MIME = 'text/x-oeb1-document' +OEB_CSS_MIME = 'text/x-oeb1-css' +OPENTYPE_MIME = types_map['.otf'] +GIF_MIME = types_map['.gif'] +JPEG_MIME = types_map['.jpeg'] +PNG_MIME = types_map['.png'] +SVG_MIME = types_map['.svg'] +BINARY_MIME = 'application/octet-stream' -XHTML_CSS_NAMESPACE = '@namespace "%s";\n' % XHTML_NS +XHTML_CSS_NAMESPACE = '@namespace "%s";\n' % const.XHTML_NS -OEB_STYLES = {CSS_MIME, OEB_CSS_MIME, 'text/x-oeb-css', 'xhtml/css'} -OEB_DOCS = {XHTML_MIME, 'text/html', OEB_DOC_MIME, - 'text/x-oeb-document'} +OEB_STYLES = {CSS_MIME, OEB_CSS_MIME, 'text/x-oeb-css', 'xhtml/css'} +OEB_DOCS = {XHTML_MIME, 'text/html', OEB_DOC_MIME, 'text/x-oeb-document'} OEB_RASTER_IMAGES = {GIF_MIME, JPEG_MIME, PNG_MIME} -OEB_IMAGES = {GIF_MIME, JPEG_MIME, PNG_MIME, SVG_MIME} +OEB_IMAGES = {GIF_MIME, JPEG_MIME, PNG_MIME, SVG_MIME} MS_COVER_TYPE = 'other.ms-coverimage-standard' -ENTITY_RE = re.compile(r'&([a-zA-Z_:][a-zA-Z0-9.-_:]+);') -COLLAPSE_RE = re.compile(r'[ \t\r\n\v]+') -QNAME_RE = re.compile(r'^[{][^{}]+[}][^{}]+$') +ENTITY_RE = re.compile(r'&([a-zA-Z_:][a-zA-Z0-9.-_:]+);') +COLLAPSE_RE = re.compile(r'[ \t\r\n\v]+') +QNAME_RE = re.compile(r'^[{][^{}]+[}][^{}]+$') PREFIXNAME_RE = re.compile(r'^[^:]+[:][^:]+') -XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>') -CSSURL_RE = re.compile(r'''url[(](?P<q>["']?)(?P<url>[^)]+)(?P=q)[)]''') +XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>') +CSSURL_RE = re.compile(r'''url[(](?P<q>["']?)(?P<url>[^)]+)(?P=q)[)]''') def element(parent, *args, **kwargs): @@ -349,13 +299,13 @@ def element(parent, *args, **kwargs): def prefixname(name, nsrmap): if 
not isqname(name): return name - ns = namespace(name) + ns = parse_utils.namespace(name) if ns not in nsrmap: return name prefix = nsrmap[ns] if not prefix: - return barename(name) - return ':'.join((prefix, barename(name))) + return parse_utils.barename(name) + return ':'.join((prefix, parse_utils.barename(name))) def isprefixname(name): @@ -376,11 +326,11 @@ def isqname(name): def XPath(expr): - return etree.XPath(expr, namespaces=XPNSMAP) + return etree.XPath(expr, namespaces=const.XPNSMAP) def xpath(elem, expr): - return elem.xpath(expr, namespaces=XPNSMAP) + return elem.xpath(expr, namespaces=const.XPNSMAP) def xml2str(root, pretty_print=False, strip_comments=False, with_tail=True): @@ -390,7 +340,7 @@ def xml2str(root, pretty_print=False, strip_comments=False, with_tail=True): if x.text and '--' in x.text: x.text = x.text.replace('--', '__') ans = etree.tostring(root, encoding='utf-8', xml_declaration=True, - pretty_print=pretty_print, with_tail=with_tail) + pretty_print=pretty_print, with_tail=with_tail) if strip_comments: ans = re.compile(br'<!--.*?-->', re.DOTALL).sub(b'', ans) @@ -399,12 +349,14 @@ def xml2str(root, pretty_print=False, strip_comments=False, with_tail=True): def xml2text(elem, pretty_print=False, method='text'): - return etree.tostring(elem, method=method, encoding='unicode', with_tail=False, pretty_print=pretty_print) + return etree.tostring(elem, method=method, encoding='unicode', + with_tail=False, pretty_print=pretty_print) def escape_cdata(root): pat = re.compile(r'[<>&]') - for elem in root.iterdescendants('{%s}style' % XHTML_NS, '{%s}script' % XHTML_NS): + for elem in root.iterdescendants('{%s}style' % const.XHTML_NS, + '{%s}script' % const.XHTML_NS): if elem.text and pat.search(elem.text) is not None: elem.text = etree.CDATA(elem.text.replace(']]>', r'\]\]\>')) @@ -431,12 +383,12 @@ def serialize(data, media_type, pretty_print=False): return bytes(data) -ASCII_CHARS = frozenset(chr(x) for x in range(128)) +ASCII_CHARS = 
frozenset(chr(x) for x in range(128)) UNIBYTE_CHARS = frozenset(x.encode('ascii') for x in ASCII_CHARS) -USAFE = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ' - 'abcdefghijklmnopqrstuvwxyz' - '0123456789' '_.-/~') -URL_SAFE = frozenset(USAFE) +USAFE = ('ABCDEFGHIJKLMNOPQRSTUVWXYZ' + 'abcdefghijklmnopqrstuvwxyz' + '0123456789' '_.-/~') +URL_SAFE = frozenset(USAFE) URL_SAFE_BYTES = frozenset(USAFE.encode('ascii')) URL_UNSAFE = [ASCII_CHARS - URL_SAFE, UNIBYTE_CHARS - URL_SAFE_BYTES] del USAFE @@ -466,7 +418,8 @@ def urlnormalize(href): try: parts = urllib.parse.urlparse(href) except ValueError as e: - raise ValueError('Failed to parse the URL: %r with underlying error: %s' % (href, as_unicode(e))) + raise ValueError('Failed to parse the URL: %r with underlying error: ' + '%s' % (href, as_unicode(e))) if not parts.scheme or parts.scheme == 'file': path, frag = urllib.parse.urldefrag(href) parts = ('', '', path, '', '', frag) @@ -503,8 +456,10 @@ class DummyHandler(logging.Handler): def emit(self, record): if self.log is not None: msg = self.format(record) - f = self.log.error if record.levelno >= logging.ERROR \ - else self.log.warn + if record.levelno >= logging.ERROR: + f = self.log.error + else: + self.log.warn f(msg) @@ -630,18 +585,21 @@ class Metadata(object): metadata items. 
""" - DC_TERMS = {'contributor', 'coverage', 'creator', 'date', - 'description', 'format', 'identifier', 'language', - 'publisher', 'relation', 'rights', 'source', - 'subject', 'title', 'type'} + DC_TERMS = {'contributor', 'coverage', 'creator', 'date', 'description', + 'format', 'identifier', 'language', 'publisher', 'relation', + 'rights', 'source', 'subject', 'title', 'type'} CALIBRE_TERMS = {'series', 'series_index', 'rating', 'timestamp', - 'publication_type', 'title_sort'} - OPF_ATTRS = {'role': OPF('role'), 'file-as': OPF('file-as'), - 'scheme': OPF('scheme'), 'event': OPF('event'), - 'type': XSI('type'), 'lang': XML('lang'), 'id': 'id'} - OPF1_NSMAP = {'dc': DC11_NS, 'oebpackage': OPF1_NS} - OPF2_NSMAP = {'opf': OPF2_NS, 'dc': DC11_NS, 'dcterms': DCTERMS_NS, - 'xsi': XSI_NS, 'calibre': CALIBRE_NS} + 'publication_type', 'title_sort'} + OPF_ATTRS = {'role': tag('opf', 'role'), 'file-as': tag('opf', 'file-as'), + 'scheme': tag('opf', 'scheme'), 'event': tag('opf', 'event'), + 'type': tag('xsi', 'type'), 'lang': tag('xml', 'lang'), + 'id': 'id'} + OPF1_NSMAP = {'dc': const.DC11_NS, 'oebpackage': const.OPF1_NS} + OPF2_NSMAP = {'calibre': const.CALIBRE_NS, + 'dc': const.DC11_NS, + 'dcterms': const.DCTERMS_NS, + 'opf': const.OPF2_NS, + 'xsi': const.XSI_NS} class Item(object): """An item of OEB data model metadata. 
@@ -667,13 +625,13 @@ class Metadata(object): def term_attr(self, obj): term = obj.term - if namespace(term) != DC11_NS: - term = OPF('meta') + if parse_utils.namespace(term) != const.DC11_NS: + term = tag('opf', 'meta') allowed = self.allowed if allowed is not None and term not in allowed: raise AttributeError( 'attribute %r not valid for metadata term %r' % ( - self.attr(term), barename(obj.term))) + self.attr(term), parse_utils.barename(obj.term))) return self.attr(term) def __get__(self, obj, cls): @@ -688,23 +646,25 @@ class Metadata(object): self.attrib = attrib = dict(attrib) self.nsmap = nsmap = dict(nsmap) attrib.update(kwargs) - if namespace(term) == OPF2_NS: - term = barename(term) - ns = namespace(term) - local = barename(term).lower() - if local in Metadata.DC_TERMS and (not ns or ns in DC_NSES): + if parse_utils.namespace(term) == const.OPF2_NS: + term = parse_utils.barename(term) + ns = parse_utils.namespace(term) + local = parse_utils.barename(term).lower() + if local in Metadata.DC_TERMS and (not ns or ns in const.DC_NSES): # Anything looking like Dublin Core is coerced - term = DC(local) - elif local in Metadata.CALIBRE_TERMS and ns in (CALIBRE_NS, ''): + term = tag('dc', local) + elif local in Metadata.CALIBRE_TERMS and ns in (const.CALIBRE_NS, + ''): # Ditto for Calibre-specific metadata - term = CALIBRE(local) + term = tag('calibre', local) self.term = term self.value = value for attr, value in tuple(attrib.items()): if isprefixname(value): attrib[attr] = qname(value, nsmap) nsattr = Metadata.OPF_ATTRS.get(attr, attr) - if nsattr == OPF('scheme') and namespace(term) != DC11_NS: + if (nsattr == tag('opf', 'scheme') and + parse_utils.namespace(term) != const.DC11_NS): # The opf:meta element takes @scheme, not @opf:scheme nsattr = 'scheme' if attr != nsattr: @@ -722,21 +682,29 @@ class Metadata(object): def content(self, value): self.value = value - scheme = Attribute(lambda term: 'scheme' if - term == OPF('meta') else OPF('scheme'), - 
[DC('identifier'), OPF('meta')]) - file_as = Attribute(OPF('file-as'), [DC('creator'), DC('contributor'), - DC('title')]) - role = Attribute(OPF('role'), [DC('creator'), DC('contributor')]) - event = Attribute(OPF('event'), [DC('date')]) - id = Attribute('id') - type = Attribute(XSI('type'), [DC('date'), DC('format'), - DC('type')]) - lang = Attribute(XML('lang'), [DC('contributor'), DC('coverage'), - DC('creator'), DC('publisher'), - DC('relation'), DC('rights'), - DC('source'), DC('subject'), - OPF('meta')]) + scheme = Attribute(lambda term: 'scheme' if + term == tag('opf', 'meta') else + tag('opf', 'scheme'), + [tag('dc', 'identifier'), tag('opf', 'meta')]) + file_as = Attribute(tag('opf', 'file-as'), [tag('dc', 'creator'), + tag('dc', 'contributor'), + tag('dc', 'title')]) + role = Attribute(tag('opf', 'role'), [tag('dc', 'creator'), + tag('dc', 'contributor')]) + event = Attribute(tag('opf', 'event'), [tag('dc', 'date')]) + id = Attribute('id') + type = Attribute(tag('xsi', 'type'), [tag('dc', 'date'), + tag('dc', 'format'), + tag('dc', 'type')]) + lang = Attribute(tag('xml', 'lang'), [tag('dc', 'contributor'), + tag('dc', 'coverage'), + tag('dc', 'creator'), + tag('dc', 'publisher'), + tag('dc', 'relation'), + tag('dc', 'rights'), + tag('dc', 'source'), + tag('dc', 'subject'), + tag('opf', 'meta')]) def __getitem__(self, key): return self.attrib[key] @@ -752,7 +720,7 @@ class Metadata(object): def __repr__(self): return 'Item(term=%r, value=%r, attrib=%r)' \ - % (barename(self.term), self.value, self.attrib) + % (parse_utils.barename(self.term), self.value, self.attrib) def __str__(self): return as_unicode(self.value) @@ -760,11 +728,12 @@ class Metadata(object): def to_opf1(self, dcmeta=None, xmeta=None, nsrmap={}): attrib = {} for key, value in self.attrib.items(): - if namespace(key) == OPF2_NS: - key = barename(key) + if parse_utils.namespace(key) == const.OPF2_NS: + key = parse_utils.barename(key) attrib[key] = prefixname(value, nsrmap) - if 
namespace(self.term) == DC11_NS: - name = DC(string.capwords(barename(self.term))) + if parse_utils.namespace(self.term) == const.DC11_NS: + name = tag('dc', + string.capwords(parse_utils.barename(self.term))) elem = element(dcmeta, name, attrib=attrib) elem.text = self.value else: @@ -777,27 +746,27 @@ class Metadata(object): attrib = {} for key, value in self.attrib.items(): attrib[key] = prefixname(value, nsrmap) - if namespace(self.term) == DC11_NS: + if parse_utils.namespace(self.term) == const.DC11_NS: elem = element(parent, self.term, attrib=attrib) try: elem.text = self.value - except: + except Exception: elem.text = repr(self.value) else: - elem = element(parent, OPF('meta'), attrib=attrib) + elem = element(parent, tag('opf', 'meta'), attrib=attrib) elem.attrib['name'] = prefixname(self.term, nsrmap) elem.attrib['content'] = prefixname(self.value, nsrmap) return elem def __init__(self, oeb): self.oeb = oeb - self.items = defaultdict(list) + self.items = collections.defaultdict(list) self.primary_writing_mode = None def add(self, term, value, attrib={}, nsmap={}, **kwargs): """Add a new metadata item.""" item = self.Item(term, value, attrib, nsmap, **kwargs) - items = self.items[barename(item.term)] + items = self.items[parse_utils.barename(item.term)] items.append(item) return item @@ -807,15 +776,15 @@ class Metadata(object): __iter__ = iterkeys def clear(self, key): - l = self.items[key] - for x in list(l): - l.remove(x) + val = self.items[key] + for x in list(val): + val.remove(x) def filter(self, key, predicate): - l = self.items[key] - for x in list(l): + val = self.items[key] + for x in list(val): if predicate(x): - l.remove(x) + val.remove(x) def __getitem__(self, key): return self.items[key] @@ -838,21 +807,21 @@ class Metadata(object): def _opf1_nsmap(self): nsmap = self._nsmap for key, value in nsmap.items(): - if value in OPF_NSES or value in DC_NSES: + if value in const.OPF_NSES or value in const.DC_NSES: del nsmap[key] return nsmap @property 
def _opf2_nsmap(self): nsmap = self._nsmap - nsmap.update(OPF2_NSMAP) + nsmap.update(const.OPF2_NSMAP) return nsmap def to_opf1(self, parent=None): nsmap = self._opf1_nsmap nsrmap = {value: key for key, value in nsmap.items()} elem = element(parent, 'metadata', nsmap=nsmap) - dcmeta = element(elem, 'dc-metadata', nsmap=OPF1_NSMAP) + dcmeta = element(elem, 'dc-metadata', nsmap=const.OPF1_NSMAP) xmeta = element(elem, 'x-metadata') for term in self.items: for item in self.items[term]: @@ -865,12 +834,16 @@ class Metadata(object): def to_opf2(self, parent=None): nsmap = self._opf2_nsmap nsrmap = {value: key for key, value in nsmap.items()} - elem = element(parent, OPF('metadata'), nsmap=nsmap) + elem = element(parent, tag('opf', 'metadata'), nsmap=nsmap) for term in self.items: for item in self.items[term]: item.to_opf2(elem, nsrmap=nsrmap) if self.primary_writing_mode: - elem.append(elem.makeelement(OPF('meta'), attrib={'name':'primary-writing-mode', 'content':self.primary_writing_mode})) + elem.append(elem.makeelement(tag('opf', 'meta'), + attrib={'name': + 'primary-writing-mode', + 'content': + self.primary_writing_mode})) return elem @@ -936,8 +909,8 @@ class Manifest(object): if not data: return data = xml_to_unicode(data, strip_encoding_pats=True, - assume_utf8=True, resolve_entities=True)[0] - return safe_xml_fromstring(data) + assume_utf8=True, resolve_entities=True)[0] + return etree.fromstring(data) def _parse_xhtml(self, data): orig_data = data @@ -945,11 +918,13 @@ class Manifest(object): self.oeb.log.debug('Parsing', fname, '...') self.oeb.html_preprocessor.current_href = self.href try: - data = parse_html(data, log=self.oeb.log, - decoder=self.oeb.decode, - preprocessor=self.oeb.html_preprocessor, - filename=fname, non_html_file_tags={'ncx'}) - except NotHTML: + data = parse_utils.parse_html(data, log=self.oeb.log, + decoder=self.oeb.decode, + preprocessor=self.oeb. 
+ html_preprocessor, + filename=fname, + non_html_file_tags={'ncx'}) + except parse_utils.NotHTML: return self._parse_xml(orig_data) return data @@ -981,7 +956,8 @@ class Manifest(object): data = self.oeb.decode(data) data = self.oeb.css_preprocessor(data, add_namespace=False) parser = CSSParser(loglevel=logging.WARNING, - fetcher=self.override_css_fetch or self._fetch_css, + fetcher=self.override_css_fetch or + self._fetch_css, log=_css_logger) data = parser.parseString(data, href=self.href, validate=False) data = resolveImports(data) @@ -1013,8 +989,8 @@ class Manifest(object): convert and return as an lxml.etree element in the XHTML namespace. - XML content is parsed and returned as an lxml.etree element. - - CSS and CSS-variant content is parsed and returned as a css_parser - CSS DOM stylesheet. + - CSS and CSS-variant content is parsed and returned as a + css_parser CSS DOM stylesheet. - All other content is returned as a :class:`str` or :class:`bytes` object with no special parsing. 
""" @@ -1023,7 +999,7 @@ class Manifest(object): if self._loader is None: return None data = self._loader(getattr(self, 'html_input_href', - self.href)) + self.href)) try: mt = self.media_type.lower() except Exception: @@ -1037,8 +1013,8 @@ class Manifest(object): elif mt in OEB_STYLES: data = self._parse_css(data) elif mt == 'text/plain': - self.oeb.log.warn('%s contains data in TXT format'%self.href, - 'converting to HTML') + self.oeb.log.warn('%s contains data in TXT format' % self.href, + 'converting to HTML') data = self._parse_txt(data) self.media_type = XHTML_MIME self._data = data @@ -1055,8 +1031,10 @@ class Manifest(object): def unload_data_from_memory(self, memory=None): if isinstance(self._data, bytes): if memory is None: - from ebook_converter.ptempfile import PersistentTemporaryFile - pt = PersistentTemporaryFile(suffix='_oeb_base_mem_unloader.img') + from ebook_converter.ptempfile import \ + PersistentTemporaryFile + pt = PersistentTemporaryFile(suffix='_oeb_base_mem_' + 'unloader.img') with pt: pt.write(self._data) self.oeb._temp_files.append(pt.name) @@ -1088,7 +1066,8 @@ class Manifest(object): @property def bytes_representation(self): - return serialize(self.data, self.media_type, pretty_print=self.oeb.pretty_print) + return serialize(self.data, self.media_type, + pretty_print=self.oeb.pretty_print) def __str__(self): return self.unicode_representation @@ -1107,7 +1086,11 @@ class Manifest(object): href = self.href if isinstance(href, bytes): href = force_unicode(href) - sp = self.spine_position if isinstance(self.spine_position, numbers.Number) else sys.maxsize + + if isinstance(self.spine_position, numbers.Number): + sp = self.spine_position + else: + sp = sys.maxsize return sp, (self.media_type or '').lower(), href, self.id @@ -1238,8 +1221,8 @@ class Manifest(object): return elem def to_opf2(self, parent=None): - elem = element(parent, OPF('manifest')) - for item in sorted(self.items, key=attrgetter('sort_key')): + elem = element(parent, 
tag('opf', 'manifest')) + for item in sorted(self.items, key=operator.attrgetter('sort_key')): media_type = item.media_type if media_type in OEB_DOCS: media_type = XHTML_MIME @@ -1249,7 +1232,7 @@ class Manifest(object): 'media-type': media_type} if item.fallback: attrib['fallback'] = item.fallback - element(elem, OPF('item'), attrib=attrib) + element(elem, tag('opf', 'item'), attrib=attrib) return elem @property @@ -1341,12 +1324,12 @@ class Spine(object): return elem def to_opf2(self, parent=None): - elem = element(parent, OPF('spine')) + elem = element(parent, tag('opf', 'spine')) for item in self.items: attrib = {'idref': item.id} if not item.linear: attrib['linear'] = 'no' - element(elem, OPF('itemref'), attrib=attrib) + element(elem, tag('opf', 'itemref'), attrib=attrib) return elem @@ -1394,8 +1377,7 @@ class Guide(object): self.oeb = oeb if type.lower() in self.TYPES: type = type.lower() - elif type not in self.TYPES and \ - not type.startswith('other.'): + elif type not in self.TYPES and not type.startswith('other.'): type = 'other.' 
+ type if not title and type in self.TITLES: title = oeb.translate(self.TITLES[type]) @@ -1440,7 +1422,8 @@ class Guide(object): __iter__ = iterkeys def values(self): - return sorted(self.refs.values(), key=lambda ref: ref.ORDER.get(ref.type, 10000)) + return sorted(self.refs.values(), + key=lambda ref: ref.ORDER.get(ref.type, 10000)) def items(self): for type, ref in self.refs.items(): @@ -1473,12 +1456,12 @@ class Guide(object): def to_opf2(self, parent=None): if not len(self): return - elem = element(parent, OPF('guide')) + elem = element(parent, tag('opf', 'guide')) for ref in self.refs.values(): attrib = {'type': ref.type, 'href': urlunquote(ref.href)} if ref.title: attrib['title'] = ref.title - element(elem, OPF('reference'), attrib=attrib) + element(elem, tag('opf', 'reference'), attrib=attrib) return elem @@ -1499,7 +1482,8 @@ class TOC(object): """ def __init__(self, title=None, href=None, klass=None, id=None, - play_order=None, author=None, description=None, toc_thumbnail=None): + play_order=None, author=None, description=None, + toc_thumbnail=None): self.title = title self.href = urlnormalize(href) if href else href self.klass = klass @@ -1513,9 +1497,11 @@ class TOC(object): self.description = description self.toc_thumbnail = toc_thumbnail - def add(self, title, href, klass=None, id=None, play_order=0, author=None, description=None, toc_thumbnail=None): + def add(self, title, href, klass=None, id=None, play_order=0, author=None, + description=None, toc_thumbnail=None): """Create and return a new sub-node of this node.""" - node = TOC(title, href, klass, id, play_order, author, description, toc_thumbnail) + node = TOC(title, href, klass, id, play_order, author, description, + toc_thumbnail) self.nodes.append(node) return node @@ -1583,7 +1569,8 @@ class TOC(object): """ prev = None for node in list(self.nodes): - if prev and urllib.parse.urldefrag(prev.href)[0] == urllib.parse.urldefrag(node.href)[0]: + if (prev and urllib.parse.urldefrag(prev.href)[0] 
== + urllib.parse.urldefrag(node.href)[0]): self.nodes.remove(node) prev.nodes.append(node) else: @@ -1597,7 +1584,7 @@ class TOC(object): return 1 def get_lines(self, lvl=0): - ans = [('\t'*lvl) + 'TOC: %s --> %s'%(self.title, self.href)] + ans = [('\t'*lvl) + 'TOC: %s --> %s' % (self.title, self.href)] for child in self: ans.extend(child.get_lines(lvl+1)) return ans @@ -1614,7 +1601,7 @@ class TOC(object): def to_ncx(self, parent=None): if parent is None: - parent = etree.Element(NCX('navMap')) + parent = etree.Element(tag('ncx', 'navMap')) for node in self.nodes: id = node.id or uuid_id() po = node.play_order @@ -1623,15 +1610,15 @@ class TOC(object): attrib = {'id': id, 'playOrder': str(po)} if node.klass: attrib['class'] = node.klass - point = element(parent, NCX('navPoint'), attrib=attrib) - label = etree.SubElement(point, NCX('navLabel')) + point = element(parent, tag('ncx', 'navPoint'), attrib=attrib) + label = etree.SubElement(point, tag('ncx', 'navLabel')) title = node.title if title: title = re.sub(r'\s+', ' ', title) - element(label, NCX('text')).text = title + element(label, tag('ncx', 'text')).text = title # Do not unescape this URL as ADE requires it to be escaped to # handle semi colons and other special characters in the file names - element(point, NCX('content'), src=node.href) + element(point, tag('ncx', 'content'), src=node.href) node.to_ncx(point) return parent @@ -1659,7 +1646,7 @@ class TOC(object): if y is not None: if x.href != y.href: x.play_order = getattr(href_node(x), 'play_order', - self.next_play_order()) + self.next_play_order()) y = href_node(x) if y is not None: x.play_order = y.play_order @@ -1723,8 +1710,9 @@ class PageList(object): return self.pages.remove(page) def to_ncx(self, parent=None): - plist = element(parent, NCX('pageList'), id=uuid_id()) - values = {t: count(1) for t in ('front', 'normal', 'special')} + plist = element(parent, tag('ncx', 'pageList'), id=uuid_id()) + values = {t: itertools.count(1) + for t in ('front', 
'normal', 'special')} for page in self.pages: id = page.id or uuid_id() type = page.type @@ -1732,30 +1720,31 @@ class PageList(object): attrib = {'id': id, 'value': value, 'type': type, 'playOrder': '0'} if page.klass: attrib['class'] = page.klass - ptarget = element(plist, NCX('pageTarget'), attrib=attrib) - label = element(ptarget, NCX('navLabel')) - element(label, NCX('text')).text = page.name - element(ptarget, NCX('content'), src=page.href) + ptarget = element(plist, tag('ncx', 'pageTarget'), attrib=attrib) + label = element(ptarget, tag('ncx', 'navLabel')) + element(label, tag('ncx', 'text')).text = page.name + element(ptarget, tag('ncx', 'content'), src=page.href) return plist def to_page_map(self): - pmap = etree.Element(OPF('page-map'), nsmap={None: OPF2_NS}) + pmap = etree.Element(tag('opf', 'page-map'), + nsmap={None: const.OPF2_NS}) for page in self.pages: - element(pmap, OPF('page'), name=page.name, href=page.href) + element(pmap, tag('opf', 'page'), name=page.name, href=page.href) return pmap class OEBBook(object): """Representation of a book in the IDPF OEB data model.""" - COVER_SVG_XP = XPath('h:body//svg:svg[position() = 1]') + COVER_SVG_XP = XPath('h:body//svg:svg[position() = 1]') COVER_OBJECT_XP = XPath('h:body//h:object[@data][position() = 1]') def __init__(self, logger, - html_preprocessor, - css_preprocessor=CSSPreProcessor(), - encoding='utf-8', pretty_print=False, - input_encoding='utf-8'): + html_preprocessor, + css_preprocessor=CSSPreProcessor(), + encoding='utf-8', pretty_print=False, + input_encoding='utf-8'): """Create empty book. 
Arguments: :param:`encoding`: Default encoding for textual content read @@ -1809,7 +1798,7 @@ class OEBBook(object): for path in self._temp_files: try: os.remove(path) - except: + except Exception: pass @classmethod @@ -1831,11 +1820,12 @@ class OEBBook(object): return fix_data(data) bom_enc = None if data[:4] in (b'\0\0\xfe\xff', b'\xff\xfe\0\0'): - bom_enc = {b'\0\0\xfe\xff':'utf-32-be', - b'\xff\xfe\0\0':'utf-32-le'}[data[:4]] + bom_enc = {b'\0\0\xfe\xff': 'utf-32-be', + b'\xff\xfe\0\0': 'utf-32-le'}[data[:4]] data = data[4:] elif data[:2] in (b'\xff\xfe', b'\xfe\xff'): - bom_enc = {b'\xff\xfe':'utf-16-le', 'b\xfe\xff':'utf-16-be'}[data[:2]] + bom_enc = {b'\xff\xfe': 'utf-16-le', + 'b\xfe\xff': 'utf-16-be'}[data[:2]] data = data[2:] elif data[:3] == b'\xef\xbb\xbf': bom_enc = 'utf-8' @@ -1864,13 +1854,13 @@ class OEBBook(object): are tuples of (default) filenames and lxml.etree element structures. """ package = etree.Element('package', - attrib={'unique-identifier': self.uid.id}) + attrib={'unique-identifier': self.uid.id}) self.metadata.to_opf1(package) self.manifest.to_opf1(package) self.spine.to_opf1(package) tours = element(package, 'tours') tour = element(tours, 'tour', - attrib={'id': 'chaptertour', 'title': 'Chapter Tour'}) + attrib={'id': 'chaptertour', 'title': 'Chapter Tour'}) self.toc.to_opf1(tour) self.guide.to_opf1(package) return {OPF_MIME: ('content.opf', package)} @@ -1898,7 +1888,8 @@ class OEBBook(object): if added: next += 1 selector = XPath('ncx:content/@src') - for i, elem in enumerate(xpath(ncx, '//*[@playOrder and ./ncx:content[@src]]')): + for i, elem in enumerate(xpath(ncx, '//*[@playOrder and ' + './ncx:content[@src]]')): href = urlnormalize(selector(elem)[0]) order = playorder.get(href, i) elem.attrib['playOrder'] = str(order) @@ -1907,25 +1898,26 @@ class OEBBook(object): def _to_ncx(self): lang = str(self.metadata.language[0]) lang = lang.replace('_', '-') - ncx = etree.Element(NCX('ncx'), - attrib={'version': '2005-1', XML('lang'): 
lang}, - nsmap={None: NCX_NS}) - head = etree.SubElement(ncx, NCX('head')) - etree.SubElement(head, NCX('meta'), - name='dtb:uid', content=str(self.uid)) - etree.SubElement(head, NCX('meta'), - name='dtb:depth', content=str(self.toc.depth())) + ncx = etree.Element(tag('ncx', 'ncx'), + attrib={'version': '2005-1', + tag('xml', 'lang'): lang}, + nsmap={None: const.NCX_NS}) + head = etree.SubElement(ncx, tag('ncx', 'head')) + etree.SubElement(head, tag('ncx', 'meta'), + name='dtb:uid', content=str(self.uid)) + etree.SubElement(head, tag('ncx', 'meta'), + name='dtb:depth', content=str(self.toc.depth())) generator = ''.join(['calibre (', __version__, ')']) - etree.SubElement(head, NCX('meta'), - name='dtb:generator', content=generator) - etree.SubElement(head, NCX('meta'), - name='dtb:totalPageCount', content=str(len(self.pages))) - maxpnum = etree.SubElement(head, NCX('meta'), - name='dtb:maxPageNumber', content='0') - title = etree.SubElement(ncx, NCX('docTitle')) - text = etree.SubElement(title, NCX('text')) + etree.SubElement(head, tag('ncx', 'meta'), + name='dtb:generator', content=generator) + etree.SubElement(head, tag('ncx', 'meta'), name='dtb:totalPageCount', + content=str(len(self.pages))) + maxpnum = etree.SubElement(head, tag('ncx', 'meta'), + name='dtb:maxPageNumber', content='0') + title = etree.SubElement(ncx, tag('ncx', 'docTitle')) + text = etree.SubElement(title, tag('ncx', 'text')) text.text = str(self.metadata.title[0]) - navmap = etree.SubElement(ncx, NCX('navMap')) + navmap = etree.SubElement(ncx, tag('ncx', 'navMap')) self.toc.to_ncx(navmap) if len(self.pages) > 0: plist = self.pages.to_ncx(ncx) @@ -1941,27 +1933,29 @@ class OEBBook(object): are tuples of (default) filenames and lxml.etree element structures. 
""" results = {} - package = etree.Element(OPF('package'), - attrib={'version': '2.0', 'unique-identifier': self.uid.id}, - nsmap={None: OPF2_NS}) + package = etree.Element(tag('opf', 'package'), + attrib={'version': '2.0', + 'unique-identifier': self.uid.id}, + nsmap={None: const.OPF2_NS}) self.metadata.to_opf2(package) manifest = self.manifest.to_opf2(package) spine = self.spine.to_opf2(package) self.guide.to_opf2(package) results[OPF_MIME] = ('content.opf', package) id, href = self.manifest.generate('ncx', 'toc.ncx') - etree.SubElement(manifest, OPF('item'), id=id, href=href, + etree.SubElement(manifest, tag('opf', 'item'), id=id, href=href, attrib={'media-type': NCX_MIME}) spine.attrib['toc'] = id results[NCX_MIME] = (href, self._to_ncx()) if page_map and len(self.pages) > 0: id, href = self.manifest.generate('page-map', 'page-map.xml') - etree.SubElement(manifest, OPF('item'), id=id, href=href, + etree.SubElement(manifest, tag('opf', 'item'), id=id, href=href, attrib={'media-type': PAGE_MAP_MIME}) spine.attrib['page-map'] = id results[PAGE_MAP_MIME] = (href, self.pages.to_page_map()) if self.spine.page_progression_direction in {'ltr', 'rtl'}: - spine.attrib['page-progression-direction'] = self.spine.page_progression_direction + spine.attrib['page-progression-direction'] = \ + self.spine.page_progression_direction return results @@ -1972,7 +1966,9 @@ def rel_href(base_href, href): return href if '/' not in base_href: return href - base = list(filter(lambda x: x and x != '.', os.path.dirname(os.path.normpath(base_href)).replace(os.sep, '/').split('/'))) + base = list(filter(lambda x: x and x != '.', + os.path.dirname(os.path.normpath(base_href)) + .replace(os.sep, '/').split('/'))) while True: try: idx = base.index('..') diff --git a/ebook_converter/ebooks/oeb/parse_utils.py b/ebook_converter/ebooks/oeb/parse_utils.py index 64be9f2..ef1337b 100644 --- a/ebook_converter/ebooks/oeb/parse_utils.py +++ b/ebook_converter/ebooks/oeb/parse_utils.py @@ -1,20 +1,16 @@ 
import re -from lxml import etree, html +from lxml import etree +from lxml import html +from ebook_converter import constants as const from ebook_converter import xml_replace_entities, force_unicode -from ebook_converter.utils.xml_parse import safe_xml_fromstring from ebook_converter.constants_old import filesystem_encoding from ebook_converter.ebooks.chardet import xml_to_unicode, strip_encoding_declarations -__license__ = 'GPL v3' -__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>' -__docformat__ = 'restructuredtext en' - -RECOVER_PARSER = etree.XMLParser(recover=True, no_network=True, resolve_entities=False) -XHTML_NS = 'http://www.w3.org/1999/xhtml' -XMLNS_NS = 'http://www.w3.org/2000/xmlns/' +RECOVER_PARSER = etree.XMLParser(recover=True, no_network=True, + resolve_entities=False) class NotHTML(Exception): @@ -33,15 +29,15 @@ def namespace(name): def XHTML(name): - return '{%s}%s' % (XHTML_NS, name) + return '{%s}%s' % (const.XHTML_NS, name) def xpath(elem, expr): - return elem.xpath(expr, namespaces={'h':XHTML_NS}) + return elem.xpath(expr, namespaces={'h':const.XHTML_NS}) def XPath(expr): - return etree.XPath(expr, namespaces={'h':XHTML_NS}) + return etree.XPath(expr, namespaces={'h':const.XHTML_NS}) META_XP = XPath('/h:html/h:head/h:meta[@http-equiv="Content-Type"]') @@ -111,7 +107,7 @@ def _html4_parse(data): elem.text = elem.text.strip('-') data = etree.tostring(data, encoding='unicode') - data = safe_xml_fromstring(data) + data = etree.fromstring(data) return data @@ -204,14 +200,14 @@ def parse_html(data, log=None, decoder=None, preprocessor=None, # Try with more & more drastic measures to parse try: - data = safe_xml_fromstring(data, recover=False) + data = etree.fromstring(data) check_for_html5(pre, data) except (HTML5Doc, etree.XMLSyntaxError): log.debug('Initial parse failed, using more' ' forgiving parsers') raw = data = xml_replace_entities(raw) try: - data = safe_xml_fromstring(data, recover=False) + data = etree.fromstring(data) 
check_for_html5(pre, data) except (HTML5Doc, etree.XMLSyntaxError): log.debug('Parsing %s as HTML' % filename) @@ -240,7 +236,7 @@ def parse_html(data, log=None, decoder=None, preprocessor=None, if barename(data.tag) in non_html_file_tags: raise NotHTML(data.tag) log.warn('File %r does not appear to be (X)HTML'%filename) - nroot = safe_xml_fromstring('<html></html>') + nroot = etree.fromstring('<html></html>') has_body = False for child in list(data): if isinstance(child.tag, (str, bytes)) and barename(child.tag) == 'body': @@ -249,7 +245,7 @@ def parse_html(data, log=None, decoder=None, preprocessor=None, parent = nroot if not has_body: log.warn('File %r appears to be a HTML fragment'%filename) - nroot = safe_xml_fromstring('<html><body/></html>') + nroot = etree.fromstring('<html><body/></html>') parent = nroot[0] for child in list(data.iter()): oparent = child.getparent() @@ -261,16 +257,16 @@ def parse_html(data, log=None, decoder=None, preprocessor=None, # Force into the XHTML namespace if not namespace(data.tag): log.warn('Forcing', filename, 'into XHTML namespace') - data.attrib['xmlns'] = XHTML_NS + data.attrib['xmlns'] = const.XHTML_NS data = etree.tostring(data, encoding='unicode') try: - data = safe_xml_fromstring(data, recover=False) + data = etree.fromstring(data) except: data = data.replace(':=', '=').replace(':>', '>') data = data.replace('<http:/>', '') try: - data = safe_xml_fromstring(data, recover=False) + data = etree.fromstring(data) except etree.XMLSyntaxError: log.warn('Stripping comments from %s'% filename) @@ -281,17 +277,17 @@ def parse_html(data, log=None, decoder=None, preprocessor=None, '') data = data.replace("<?xml version='1.0' encoding='utf-8'??>", '') try: - data = safe_xml_fromstring(data) + data = etree.fromstring(data) except etree.XMLSyntaxError: log.warn('Stripping meta tags from %s'% filename) data = re.sub(r'<meta\s+[^>]+?>', '', data) - data = safe_xml_fromstring(data) - elif namespace(data.tag) != XHTML_NS: + data = 
etree.fromstring(data) + elif namespace(data.tag) != const.XHTML_NS: # OEB_DOC_NS, but possibly others ns = namespace(data.tag) attrib = dict(data.attrib) nroot = etree.Element(XHTML('html'), - nsmap={None: XHTML_NS}, attrib=attrib) + nsmap={None: const.XHTML_NS}, attrib=attrib) for elem in data.iterdescendants(): if isinstance(elem.tag, (str, bytes)) and \ namespace(elem.tag) == ns: @@ -301,7 +297,7 @@ def parse_html(data, log=None, decoder=None, preprocessor=None, data = nroot # Remove non default prefixes referring to the XHTML namespace - data = ensure_namespace_prefixes(data, {None: XHTML_NS}) + data = ensure_namespace_prefixes(data, {None: const.XHTML_NS}) data = merge_multiple_html_heads_and_bodies(data, log) # Ensure has a <head/> diff --git a/ebook_converter/ebooks/oeb/polish/container.py b/ebook_converter/ebooks/oeb/polish/container.py index e3b9530..2ee2957 100644 --- a/ebook_converter/ebooks/oeb/polish/container.py +++ b/ebook_converter/ebooks/oeb/polish/container.py @@ -14,7 +14,9 @@ from itertools import count import urllib.parse from css_parser import getUrls, replaceUrls +from lxml import etree +from ebook_converter import constants as const from ebook_converter import CurrentDir, walk from ebook_converter.constants_old import iswindows from ebook_converter.customize.ui import plugin_for_input_format, plugin_for_output_format @@ -34,7 +36,7 @@ from ebook_converter.ebooks.mobi import MobiError from ebook_converter.ebooks.mobi.reader.headers import MetadataHeader from ebook_converter.ebooks.mobi.tweak import set_cover from ebook_converter.ebooks.oeb.base import ( - DC11_NS, OEB_DOCS, OEB_STYLES, OPF, OPF2_NS, Manifest, itercsslinks, iterlinks, + OEB_DOCS, OEB_STYLES, Manifest, itercsslinks, iterlinks, rewrite_links, serialize, urlquote, urlunquote ) from ebook_converter.ebooks.oeb.parse_utils import NotHTML, parse_html @@ -47,13 +49,11 @@ from ebook_converter.ptempfile import PersistentTemporaryDirectory, PersistentTe from 
ebook_converter.utils.filenames import hardlink_file, nlinks_file from ebook_converter.utils.ipc.simple_worker import WorkerError, fork_job from ebook_converter.utils.logging import default_log -from ebook_converter.utils.xml_parse import safe_xml_fromstring from ebook_converter.utils.zipfile import ZipFile exists, join, relpath = os.path.exists, os.path.join, os.path.relpath OEB_FONTS = {guess_type('a.ttf'), guess_type('b.otf'), guess_type('a.woff'), 'application/x-font-ttf', 'application/x-font-otf', 'application/font-sfnt'} -OPF_NAMESPACES = {'opf':OPF2_NS, 'dc':DC11_NS} null = object() @@ -195,7 +195,7 @@ class ContainerBase(object): # {{{ data, self.used_encoding = xml_to_unicode( data, strip_encoding_pats=True, assume_utf8=True, resolve_entities=True) data = unicodedata.normalize('NFC', data) - return safe_xml_fromstring(data) + return etree.fromstring(data) def parse_xhtml(self, data, fname='<string>', force_html5_parse=False): if self.tweak_mode: @@ -324,7 +324,7 @@ class Container(ContainerBase): # {{{ item_id = 'id' + '%d'%c manifest = self.opf_xpath('//opf:manifest')[0] href = self.name_to_href(name, self.opf_name) - item = manifest.makeelement(OPF('item'), + item = manifest.makeelement(const.OPF_ITEM, id=item_id, href=href) item.set('media-type', self.mime_map[name]) self.insert_into_xml(manifest, item) @@ -380,7 +380,7 @@ class Container(ContainerBase): # {{{ if mt in OEB_DOCS: manifest = self.opf_xpath('//opf:manifest')[0] spine = self.opf_xpath('//opf:spine')[0] - si = manifest.makeelement(OPF('itemref'), idref=item_id) + si = manifest.makeelement(const.OPF_ITEMREF, idref=item_id) self.insert_into_xml(spine, si, index=spine_index) return name @@ -533,7 +533,7 @@ class Container(ContainerBase): # {{{ def opf_xpath(self, expr): ' Convenience method to evaluate an XPath expression on the OPF file, has the opf: and dc: namespace prefixes pre-defined. 
' - return self.opf.xpath(expr, namespaces=OPF_NAMESPACES) + return self.opf.xpath(expr, namespaces=const.OPF_NAMESPACES) def has_name(self, name): ''' Return True iff a file with the same canonical name as that specified exists. Unlike :meth:`exists` this method is always case-sensitive. ''' @@ -813,7 +813,8 @@ class Container(ContainerBase): # {{{ spine = self.opf_xpath('//opf:spine')[0] spine.text = tail for name, linear in spine_items: - i = spine.makeelement('{%s}itemref' % OPF_NAMESPACES['opf'], nsmap={'opf':OPF_NAMESPACES['opf']}) + i = spine.makeelement(const.OPF_ITEMREF, + nsmap={'opf': const.OPF2_NS}) i.tail = tail i.set('idref', imap[name]) spine.append(i) @@ -944,7 +945,7 @@ class Container(ContainerBase): # {{{ item_id = id_prefix + '%d'%c manifest = self.opf_xpath('//opf:manifest')[0] - item = manifest.makeelement(OPF('item'), + item = manifest.makeelement(const.OPF_ITEM, id=item_id, href=href) item.set('media-type', media_type) self.insert_into_xml(manifest, item) @@ -993,7 +994,7 @@ class Container(ContainerBase): # {{{ self.format_opf() data = serialize(data, self.mime_map[name], pretty_print=name in self.pretty_print) - if name == self.opf_name and root.nsmap.get(None) == OPF2_NS: + if name == self.opf_name and root.nsmap.get(None) == const.OPF2_NS: # Needed as I can't get lxml to output opf:role and # not output <opf:metadata> as well data = re.sub(br'(<[/]{0,1})opf:', r'\1', data) @@ -1172,7 +1173,7 @@ class EpubContainer(Container): container_path = join(self.root, 'META-INF', 'container.xml') if not exists(container_path): raise InvalidEpub('No META-INF/container.xml in epub') - container = safe_xml_fromstring(open(container_path, 'rb').read()) + container = etree.fromstring(open(container_path, 'rb').read()) opf_files = container.xpath(( r'child::ocf:rootfiles/ocf:rootfile' '[@media-type="%s" and @full-path]'%guess_type('a.opf') diff --git a/ebook_converter/ebooks/oeb/polish/css.py b/ebook_converter/ebooks/oeb/polish/css.py index 
ef7a5b7..43b209c 100644 --- a/ebook_converter/ebooks/oeb/polish/css.py +++ b/ebook_converter/ebooks/oeb/polish/css.py @@ -2,10 +2,11 @@ from collections import defaultdict from functools import partial from css_parser.css import CSSRule, CSSStyleDeclaration -from ebook_converter.css_selectors import parse, SelectorSyntaxError +from ebook_converter import constants as const from ebook_converter import force_unicode -from ebook_converter.ebooks.oeb.base import OEB_STYLES, OEB_DOCS, XHTML, css_text +from ebook_converter.css_selectors import parse, SelectorSyntaxError +from ebook_converter.ebooks.oeb.base import OEB_STYLES, OEB_DOCS, css_text from ebook_converter.ebooks.oeb.normalize_css import normalize_filter_css, normalizers from ebook_converter.ebooks.oeb.polish.pretty import pretty_script_or_style, pretty_xml_tree, serialize from ebook_converter.utils.icu import numeric_sort_key @@ -382,7 +383,7 @@ def add_stylesheet_links(container, name, text): if not sheets: return for sname in sheets: - link = head.makeelement(XHTML('link'), type='text/css', rel='stylesheet', href=container.name_to_href(sname, name)) + link = head.makeelement(const.XHTML_LINK, type='text/css', rel='stylesheet', href=container.name_to_href(sname, name)) head.append(link) pretty_xml_tree(head) return serialize(root, 'text/html') diff --git a/ebook_converter/ebooks/oeb/polish/opf.py b/ebook_converter/ebooks/oeb/polish/opf.py index b66adcb..77cd848 100644 --- a/ebook_converter/ebooks/oeb/polish/opf.py +++ b/ebook_converter/ebooks/oeb/polish/opf.py @@ -1,13 +1,9 @@ from lxml import etree -from ebook_converter.ebooks.oeb.polish.container import OPF_NAMESPACES +from ebook_converter import constants as const from ebook_converter.utils.localization import canonicalize_lang -__license__ = 'GPL v3' -__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>' - - def get_book_language(container): for lang in container.opf_xpath('//dc:language'): raw = lang.text @@ -18,7 +14,7 @@ def 
get_book_language(container): def set_guide_item(container, item_type, title, name, frag=None): - ref_tag = '{%s}reference' % OPF_NAMESPACES['opf'] + ref_tag = const.OPF_REFERENCE href = None if name: href = container.name_to_href(name, container.opf_name) @@ -27,23 +23,27 @@ def set_guide_item(container, item_type, title, name, frag=None): guides = container.opf_xpath('//opf:guide') if not guides and href: - g = container.opf.makeelement('{%s}guide' % OPF_NAMESPACES['opf'], nsmap={'opf':OPF_NAMESPACES['opf']}) + g = container.opf.makeelement(const.OPF_GUIDE, + nsmap={'opf': const.OPF2_NS}) container.insert_into_xml(container.opf, g) guides = [g] for guide in guides: matches = [] for child in guide.iterchildren(etree.Element): - if child.tag == ref_tag and child.get('type', '').lower() == item_type.lower(): + if (child.tag == ref_tag and + child.get('type', '').lower() == item_type.lower()): matches.append(child) if not matches and href: - r = guide.makeelement(ref_tag, type=item_type, nsmap={'opf':OPF_NAMESPACES['opf']}) + r = guide.makeelement(ref_tag, type=item_type, + nsmap={'opf': const.OPF2_NS}) container.insert_into_xml(guide, r) matches.append(r) for m in matches: if href: - m.set('title', title), m.set('href', href), m.set('type', item_type) + m.set('title', title) + m.set('href', href) + m.set('type', item_type) else: container.remove_from_xml(m) container.dirty(container.opf_name) - diff --git a/ebook_converter/ebooks/oeb/polish/parsing.py b/ebook_converter/ebooks/oeb/polish/parsing.py index d847094..99cbd5e 100644 --- a/ebook_converter/ebooks/oeb/polish/parsing.py +++ b/ebook_converter/ebooks/oeb/polish/parsing.py @@ -1,21 +1,18 @@ import re -from lxml.etree import Element as LxmlElement +from lxml import etree import html5_parser +from ebook_converter import constants as const from ebook_converter import xml_replace_entities -from ebook_converter.utils.xml_parse import safe_xml_fromstring -from ebook_converter.ebooks.chardet import xml_to_unicode, 
strip_encoding_declarations +from ebook_converter.ebooks.chardet import strip_encoding_declarations +from ebook_converter.ebooks.chardet import xml_to_unicode from ebook_converter.utils.cleantext import clean_xml_chars -__license__ = 'GPL v3' -__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>' - -XHTML_NS = 'http://www.w3.org/1999/xhtml' - - -def parse_html5(raw, decoder=None, log=None, discard_namespaces=False, line_numbers=True, linenumber_attribute=None, replace_entities=True, fix_newlines=True): +def parse_html5(raw, decoder=None, log=None, discard_namespaces=False, + line_numbers=True, linenumber_attribute=None, + replace_entities=True, fix_newlines=True): if isinstance(raw, bytes): raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw) if replace_entities: @@ -23,10 +20,14 @@ def parse_html5(raw, decoder=None, log=None, discard_namespaces=False, line_numb if fix_newlines: raw = raw.replace('\r\n', '\n').replace('\r', '\n') raw = clean_xml_chars(raw) - root = html5_parser.parse(raw, maybe_xhtml=not discard_namespaces, line_number_attr=linenumber_attribute, keep_doctype=False, sanitize_names=True) - if (discard_namespaces and root.tag != 'html') or ( - not discard_namespaces and (root.tag != '{%s}%s' % (XHTML_NS, 'html') or root.prefix)): - raise ValueError('Failed to parse correctly, root has tag: %s and prefix: %s' % (root.tag, root.prefix)) + root = html5_parser.parse(raw, maybe_xhtml=not discard_namespaces, + line_number_attr=linenumber_attribute, + keep_doctype=False, sanitize_names=True) + if ((discard_namespaces and root.tag != 'html') or + (not discard_namespaces and + (root.tag != '{%s}%s' % (const.XHTML_NS, 'html') or root.prefix))): + raise ValueError('Failed to parse correctly, root has tag: %s and ' + 'prefix: %s' % (root.tag, root.prefix)) return root @@ -48,12 +49,14 @@ def handle_private_entities(data): user_entities[match.group(1)] = val if user_entities: data = ('\n' * num_of_nl_in_pre) + data[idx:] - pat = 
re.compile(r'&(%s);'%('|'.join(user_entities.keys())))
-        data = pat.sub(lambda m:user_entities[m.group(1)], data)
+        pat = re.compile(r'&(%s);' % ('|'.join(user_entities.keys())))
+        data = pat.sub(lambda m: user_entities[m.group(1)], data)
     return data
 
 
-def parse(raw, decoder=None, log=None, line_numbers=True, linenumber_attribute=None, replace_entities=True, force_html5_parse=False):
+def parse(raw, decoder=None, log=None, line_numbers=True,
+          linenumber_attribute=None, replace_entities=True,
+          force_html5_parse=False):
     if isinstance(raw, bytes):
         raw = xml_to_unicode(raw)[0] if decoder is None else decoder(raw)
     raw = handle_private_entities(raw)
@@ -70,26 +73,32 @@ def parse(raw, decoder=None, log=None, line_numbers=True, linenumber_attribute=N
             raw = ('\n' * newlines) + raw[match.start():]
             break
 
-    raw = strip_encoding_declarations(raw, limit=10*1024, preserve_newlines=True)
+    raw = strip_encoding_declarations(raw, limit=10*1024,
+                                      preserve_newlines=True)
     if force_html5_parse:
-        return parse_html5(raw, log=log, line_numbers=line_numbers, linenumber_attribute=linenumber_attribute, replace_entities=False, fix_newlines=False)
+        return parse_html5(raw, log=log, line_numbers=line_numbers,
+                           linenumber_attribute=linenumber_attribute,
+                           replace_entities=False, fix_newlines=False)
     try:
-        ans = safe_xml_fromstring(raw, recover=False)
-        if ans.tag != '{%s}html' % XHTML_NS:
+        ans = etree.fromstring(raw)
+        if ans.tag != '{%s}html' % const.XHTML_NS:
             raise ValueError('Root tag is not <html> in the XHTML namespace')
         if linenumber_attribute:
-            for elem in ans.iter(LxmlElement):
+            for elem in ans.iter(etree.Element):
                 if elem.sourceline is not None:
                     elem.set(linenumber_attribute, str(elem.sourceline))
         return ans
     except Exception:
         if log is not None:
             log.exception('Failed to parse as XML, parsing as tag soup')
-        return parse_html5(raw, 
log=log, line_numbers=line_numbers, + linenumber_attribute=linenumber_attribute, + replace_entities=False, fix_newlines=False) if __name__ == '__main__': - from lxml import etree - root = parse_html5('\n<html><head><title>a\n

 \nb', discard_namespaces=False) + root = parse_html5('\na\n

 ' + '\nb', + discard_namespaces=False) print(etree.tostring(root, encoding='utf-8')) print() diff --git a/ebook_converter/ebooks/oeb/polish/pretty.py b/ebook_converter/ebooks/oeb/polish/pretty.py index 402a546..8620f9e 100644 --- a/ebook_converter/ebooks/oeb/polish/pretty.py +++ b/ebook_converter/ebooks/oeb/polish/pretty.py @@ -2,10 +2,10 @@ import textwrap # from lxml.etree import Element +from ebook_converter import constants as const from ebook_converter import force_unicode -from ebook_converter.ebooks.oeb.base import ( - serialize, OEB_DOCS, barename, OEB_STYLES, XPNSMAP, XHTML, SVG) -from ebook_converter.ebooks.oeb.polish.container import OPF_NAMESPACES +from ebook_converter.ebooks.oeb import parse_utils +from ebook_converter.ebooks.oeb.base import serialize, OEB_DOCS, OEB_STYLES from ebook_converter.ebooks.oeb.polish.utils import guess_type from ebook_converter.utils.icu import sort_key @@ -38,15 +38,15 @@ def pretty_opf(root): # Put all dc: tags first starting with title and author. Preserve order for # the rest. 
def dckey(x): - return {'title':0, 'creator':1}.get(barename(x.tag), 2) - for metadata in root.xpath('//opf:metadata', namespaces=OPF_NAMESPACES): - dc_tags = metadata.xpath('./*[namespace-uri()="%s"]' % OPF_NAMESPACES['dc']) + return {'title':0, 'creator':1}.get(parse_utils.barename(x.tag), 2) + for metadata in root.xpath('//opf:metadata', namespaces=const.OPF_NAMESPACES): + dc_tags = metadata.xpath('./*[namespace-uri()="%s"]' % const.DC11_NS) dc_tags.sort(key=dckey) for x in reversed(dc_tags): metadata.insert(0, x) # Group items in the manifest - spine_ids = root.xpath('//opf:spine/opf:itemref/@idref', namespaces=OPF_NAMESPACES) + spine_ids = root.xpath('//opf:spine/opf:itemref/@idref', namespaces=const.OPF_NAMESPACES) spine_ids = {x:i for i, x in enumerate(spine_ids)} def manifest_key(x): @@ -75,7 +75,7 @@ def pretty_opf(root): i = sort_key(href) return (cat, i) - for manifest in root.xpath('//opf:manifest', namespaces=OPF_NAMESPACES): + for manifest in root.xpath('//opf:manifest', namespaces=const.OPF_NAMESPACES): try: children = sorted(manifest, key=manifest_key) except AttributeError: @@ -84,19 +84,11 @@ def pretty_opf(root): manifest.insert(0, x) -SVG_TAG = SVG('svg') -BLOCK_TAGS = frozenset(map(XHTML, ( - 'address', 'article', 'aside', 'audio', 'blockquote', 'body', 'canvas', 'col', 'colgroup', 'dd', - 'div', 'dl', 'dt', 'fieldset', 'figcaption', 'figure', 'footer', 'form', - 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'header', 'hgroup', 'hr', 'li', - 'noscript', 'ol', 'output', 'p', 'pre', 'script', 'section', 'style', 'table', 'tbody', 'td', - 'tfoot', 'th', 'thead', 'tr', 'ul', 'video', 'img'))) | {SVG_TAG} - def isblock(x): if callable(x.tag) or not x.tag: return True - if x.tag in BLOCK_TAGS: + if x.tag in const.XHTML_BLOCK_TAGS | {const.SVG_SVG}: return True return False @@ -141,12 +133,12 @@ def pretty_block(parent, level=1, indent=' '): that contain only other block tags ''' if not parent.text or isspace(parent.text): parent.text = '' - nn = '\n' if 
hasattr(parent.tag, 'strip') and barename(parent.tag) in {'tr', 'td', 'th'} else '\n\n' + nn = '\n' if hasattr(parent.tag, 'strip') and parse_utils.barename(parent.tag) in {'tr', 'td', 'th'} else '\n\n' parent.text = parent.text + nn + (indent * level) for i, child in enumerate(parent): if isblock(child) and has_only_blocks(child): pretty_block(child, level=level+1, indent=indent) - elif child.tag == SVG_TAG: + elif child.tag == const.SVG_SVG: pretty_xml_tree(child, level=level, indent=indent) l = level if i == len(parent) - 1: @@ -172,13 +164,13 @@ def pretty_html_tree(container, root): child.tail = '\n\n' if hasattr(child.tag, 'endswith') and child.tag.endswith('}head'): pretty_xml_tree(child) - for body in root.findall('h:body', namespaces=XPNSMAP): + for body in root.findall('h:body', namespaces=const.XPNSMAP): pretty_block(body) # Special case the handling of a body that contains a single block tag # with all content. In this case we prettify the containing block tag # even if it has non block children. 
if (len(body) == 1 and not callable(body[0].tag) and isblock(body[0]) and not has_only_blocks( - body[0]) and barename(body[0].tag) not in ( + body[0]) and parse_utils.barename(body[0].tag) not in ( 'pre', 'p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6') and len(body[0]) > 0): pretty_block(body[0], level=2) diff --git a/ebook_converter/ebooks/oeb/polish/split.py b/ebook_converter/ebooks/oeb/polish/split.py index 568aefa..36b8a67 100644 --- a/ebook_converter/ebooks/oeb/polish/split.py +++ b/ebook_converter/ebooks/oeb/polish/split.py @@ -1,7 +1,11 @@ -import copy, os, re +import copy +import os +import re import urllib.parse -from ebook_converter.ebooks.oeb.base import barename, XPNSMAP, XPath, OPF, XHTML, OEB_DOCS +from ebook_converter import constants as const +from ebook_converter.ebooks.oeb import base +from ebook_converter.ebooks.oeb import parse_utils from ebook_converter.ebooks.oeb.polish.errors import MalformedMarkup from ebook_converter.ebooks.oeb.polish.toc import node_from_loc from ebook_converter.ebooks.oeb.polish.replace import LinkRebaser @@ -35,7 +39,7 @@ def adjust_split_point(split_point, log): parent = sp.getparent() if ( parent is None or - barename(parent.tag) in {'body', 'html'} or + parse_utils.barename(parent.tag) in {'body', 'html'} or (parent.text and parent.text.strip()) or parent.index(sp) > 0 ): @@ -49,7 +53,7 @@ def adjust_split_point(split_point, log): def get_body(root): - return root.find('h:body', namespaces=XPNSMAP) + return root.find('h:body', namespaces=const.XPNSMAP) def do_split(split_point, log, before=True): @@ -113,7 +117,7 @@ def do_split(split_point, log, before=True): nix_element(elem) # Tree 2 - ancestors = frozenset(XPath('ancestor::*')(split_point2)) + ancestors = frozenset(base.XPath('ancestor::*')(split_point2)) for elem in tuple(body2.iterdescendants()): if elem is split_point2: if not before: @@ -251,7 +255,7 @@ def split(container, name, loc_or_xpath, before=True, totals=None): break index = spine.index(spine_item) + 1 - si 
= spine.makeelement(OPF('itemref'), idref=manifest_item.get('id')) + si = spine.makeelement(base.tag('opf', 'itemref'), idref=manifest_item.get('id')) if not linear: si.set('linear', 'no') container.insert_into_xml(spine, si, index=index) @@ -268,7 +272,7 @@ def multisplit(container, name, xpath, before=True): :param before: If True the splits occur before the identified element otherwise after it. ''' root = container.parsed(name) - nodes = root.xpath(xpath, namespaces=XPNSMAP) + nodes = root.xpath(xpath, namespaces=const.XPNSMAP) if not nodes: raise AbortError('The expression %s did not match any nodes' % xpath) for split_point in nodes: @@ -329,7 +333,7 @@ def all_anchors(root): def all_stylesheets(container, name): - for link in XPath('//h:head/h:link[@href]')(container.parsed(name)): + for link in base.XPath('//h:head/h:link[@href]')(container.parsed(name)): name = container.href_to_name(link.get('href'), name) typ = link.get('type', 'text/css') if typ == 'text/css': @@ -358,14 +362,14 @@ def merge_html(container, names, master, insert_page_breaks=False): root = p(master) # Ensure master has a - head = root.find('h:head', namespaces=XPNSMAP) + head = root.find('h:head', namespaces=const.XPNSMAP) if head is None: - head = root.makeelement(XHTML('head')) + head = root.makeelement(base.tag('xhtml', 'head')) container.insert_into_xml(root, head, 0) seen_anchors = all_anchors(root) seen_stylesheets = set(all_stylesheets(container, master)) - master_body = p(master).findall('h:body', namespaces=XPNSMAP)[-1] + master_body = p(master).findall('h:body', namespaces=const.XPNSMAP)[-1] master_base = os.path.dirname(master) anchor_map = {n:{} for n in names if n != master} first_anchor_map = {} @@ -377,7 +381,7 @@ def merge_html(container, names, master, insert_page_breaks=False): for sheet in all_stylesheets(container, name): if sheet not in seen_stylesheets: seen_stylesheets.add(sheet) - link = head.makeelement(XHTML('link'), rel='stylesheet', type='text/css', 
href=container.name_to_href(sheet, master)) + link = head.makeelement(base.tag('xhtml', 'link'), rel='stylesheet', type='text/css', href=container.name_to_href(sheet, master)) container.insert_into_xml(head, link) # Rebase links if master is in a different directory @@ -386,7 +390,7 @@ def merge_html(container, names, master, insert_page_breaks=False): root = p(name) children = [] - for body in p(name).findall('h:body', namespaces=XPNSMAP): + for body in p(name).findall('h:body', namespaces=const.XPNSMAP): children.append(body.text if body.text and body.text.strip() else '\n\n') children.extend(body) @@ -396,7 +400,7 @@ def merge_html(container, names, master, insert_page_breaks=False): break if isinstance(first_child, (str, bytes)): # body contained only text, no tags - first_child = body.makeelement(XHTML('p')) + first_child = body.makeelement(base.tag('xhtml', 'p')) first_child.text, children[0] = children[0], first_child amap = anchor_map[name] @@ -424,7 +428,7 @@ def merge_html(container, names, master, insert_page_breaks=False): amap[''] = first_child.get('id') # Fix links that point to local changed anchors - for a in XPath('//h:a[starts-with(@href, "#")]')(root): + for a in base.XPath('//h:a[starts-with(@href, "#")]')(root): q = a.get('href')[1:] if q in amap: a.set('href', '#' + amap[q]) @@ -472,10 +476,10 @@ def merge_css(container, names, master): # Remove links to merged stylesheets in the html files, replacing with a # link to the master sheet for name, mt in container.mime_map.items(): - if mt in OEB_DOCS: + if mt in base.OEB_DOCS: removed = False root = p(name) - for link in XPath('//h:link[@href]')(root): + for link in base.XPath('//h:link[@href]')(root): q = container.href_to_name(link.get('href'), name) if q in merged: container.remove_from_xml(link) @@ -483,9 +487,9 @@ def merge_css(container, names, master): if removed: container.dirty(name) if removed and master not in set(all_stylesheets(container, name)): - head = root.find('h:head', 
namespaces=XPNSMAP) + head = root.find('h:head', namespaces=const.XPNSMAP) if head is not None: - link = head.makeelement(XHTML('link'), type='text/css', rel='stylesheet', href=container.name_to_href(master, name)) + link = head.makeelement(base.tag('xhtml', 'link'), type='text/css', rel='stylesheet', href=container.name_to_href(master, name)) container.insert_into_xml(head, link) diff --git a/ebook_converter/ebooks/oeb/polish/toc.py b/ebook_converter/ebooks/oeb/polish/toc.py index 15f94b4..42af230 100644 --- a/ebook_converter/ebooks/oeb/polish/toc.py +++ b/ebook_converter/ebooks/oeb/polish/toc.py @@ -1,16 +1,16 @@ -import re -from collections import Counter, OrderedDict -from functools import partial -from operator import itemgetter +import collections +import functools +import operator import pkg_resources +import re import urllib.parse from lxml import etree from lxml.builder import ElementMaker from ebook_converter import __version__ -from ebook_converter.ebooks.oeb.base import ( - XPath, uuid_id, xml2text, NCX, NCX_NS, XML, XHTML, XHTML_NS, serialize, EPUB_NS, XML_NS, OEB_DOCS) +from ebook_converter import constants as const +from ebook_converter.ebooks.oeb import base from ebook_converter.ebooks.oeb.polish.errors import MalformedMarkup from ebook_converter.ebooks.oeb.polish.utils import guess_type, extract from ebook_converter.ebooks.oeb.polish.opf import set_guide_item, get_book_language @@ -18,10 +18,6 @@ from ebook_converter.ebooks.oeb.polish.pretty import pretty_html_tree from ebook_converter.utils.localization import get_lang, canonicalize_lang, lang_as_iso639_1 -__license__ = 'GPL v3' -__copyright__ = '2013, Kovid Goyal ' -__docformat__ = 'restructuredtext en' - ns = etree.FunctionNamespace('calibre_xpath_extensions') ns.prefix = 'calibre' ns['lower-case'] = lambda c, x: x.lower() if hasattr(x, 'lower') else x @@ -81,7 +77,8 @@ class TOC(object): seen = set() remove = [] for child in self: - key = child.title if only_text else (child.title, child.dest, 
(child.frag or None)) + key = child.title if only_text else (child.title, child.dest, + (child.frag or None)) if key in seen: remove.append(child) else: @@ -104,7 +101,7 @@ class TOC(object): def get_lines(self, lvl=0): frag = ('#'+self.frag) if self.frag else '' - ans = [('\t'*lvl) + 'TOC: %s --> %s%s'%(self.title, self.dest, frag)] + ans = [('\t'*lvl) + 'TOC: %s --> %s%s' % (self.title, self.dest, frag)] for child in self: ans.extend(child.get_lines(lvl+1)) return ans @@ -113,10 +110,8 @@ class TOC(object): return '\n'.join(self.get_lines()) def to_dict(self, node_counter=None): - ans = { - 'title':self.title, 'dest':self.dest, 'frag':self.frag, - 'children':[c.to_dict(node_counter) for c in self.children] - } + ans = {'title': self.title, 'dest': self.dest, 'frag': self.frag, + 'children': [c.to_dict(node_counter) for c in self.children]} if self.dest_exists is not None: ans['dest_exists'] = self.dest_exists if self.dest_error is not None: @@ -131,7 +126,7 @@ class TOC(object): def child_xpath(tag, name): - return tag.xpath('./*[calibre:lower-case(local-name()) = "%s"]'%name) + return tag.xpath('./*[calibre:lower-case(local-name()) = "%s"]' % name) def add_from_navpoint(container, navpoint, parent, ncx_name): @@ -142,7 +137,7 @@ def add_from_navpoint(container, navpoint, parent, ncx_name): text = '' for txt in child_xpath(nl, 'text'): text += etree.tostring(txt, method='text', - encoding='unicode', with_tail=False) + encoding='unicode', with_tail=False) content = child_xpath(navpoint, 'content') if content: content = content[0] @@ -154,7 +149,8 @@ def add_from_navpoint(container, navpoint, parent, ncx_name): def process_ncx_node(container, node, toc_parent, ncx_name): - for navpoint in node.xpath('./*[calibre:lower-case(local-name()) = "navpoint"]'): + for navpoint in node.xpath('./*[calibre:lower-case(local-name()) ' + '= "navpoint"]'): child = add_from_navpoint(container, navpoint, toc_parent, ncx_name) if child is not None: process_ncx_node(container, 
navpoint, child, ncx_name) @@ -171,29 +167,38 @@ def parse_ncx(container, ncx_name): if attr.endswith('lang'): toc_root.lang = str(val) break - for uid in root.xpath('//*[calibre:lower-case(local-name()) = "meta" and @name="dtb:uid"]/@content'): + for uid in root.xpath('//*[calibre:lower-case(local-name()) = "meta" and ' + '@name="dtb:uid"]/@content'): if uid: toc_root.uid = str(uid) break for pl in root.xpath('//*[calibre:lower-case(local-name()) = "pagelist"]'): - for pt in pl.xpath('descendant::*[calibre:lower-case(local-name()) = "pagetarget"]'): + for pt in pl.xpath('descendant::*[calibre:lower-case(local-name()) = ' + '"pagetarget"]'): pagenum = pt.get('value') if pagenum: - href = pt.xpath('descendant::*[calibre:lower-case(local-name()) = "content"]/@src') + href = pt.xpath('descendant::*[calibre:lower-case(local-name()' + ') = "content"]/@src') if href: dest = container.href_to_name(href[0], base=ncx_name) frag = urllib.parse.urlparse(href[0]).fragment or None - toc_root.page_list.append({'dest': dest, 'pagenum': pagenum, 'frag': frag}) + toc_root.page_list.append({'dest': dest, + 'pagenum': pagenum, + 'frag': frag}) return toc_root def add_from_li(container, li, parent, nav_name): dest = frag = text = None - for x in li.iterchildren(XHTML('a'), XHTML('span')): - text = etree.tostring(x, method='text', encoding='unicode', with_tail=False).strip() or ' '.join(x.xpath('descendant-or-self::*/@title')).strip() + for x in li.iterchildren(base.tag('xhtml', 'a'), + base.tag('xhtml', 'span')): + text = (etree.tostring(x, method='text', encoding='unicode', + with_tail=False).strip() or + ' '.join(x.xpath('descendant-or-self::*/@title')).strip()) href = x.get('href') if href: - dest = nav_name if href.startswith('#') else container.href_to_name(href, base=nav_name) + dest = (nav_name if href.startswith('#') else + container.href_to_name(href, base=nav_name)) frag = urllib.parse.urlparse(href).fragment or None break return parent.add(text or None, dest or None, frag 
or None) @@ -207,9 +212,9 @@ def first_child(parent, tagname): def process_nav_node(container, node, toc_parent, nav_name): - for li in node.iterchildren(XHTML('li')): + for li in node.iterchildren(base.tag('xhtml', 'li')): child = add_from_li(container, li, toc_parent, nav_name) - ol = first_child(li, XHTML('ol')) + ol = first_child(li, base.tag('xhtml', 'ol')) if child is not None and ol is not None: process_nav_node(container, ol, child, nav_name) @@ -218,14 +223,16 @@ def parse_nav(container, nav_name): root = container.parsed(nav_name) toc_root = TOC() toc_root.lang = toc_root.uid = None - et = '{%s}type' % EPUB_NS - for nav in root.iterdescendants(XHTML('nav')): - if nav.get(et) == 'toc': - ol = first_child(nav, XHTML('ol')) + xhtml = functools.partial(base.tag, 'xhtml') + for nav in root.iterdescendants(base.tag('xhtml', 'nav')): + if nav.get(base.tag('epub', 'type')) == 'toc': + ol = first_child(nav, base.tag('xhtml', 'ol')) if ol is not None: process_nav_node(container, ol, toc_root, nav_name) - for h in nav.iterchildren(*map(XHTML, 'h1 h2 h3 h4 h5 h6'.split())): - text = etree.tostring(h, method='text', encoding='unicode', with_tail=False) or h.get('title') + for h in nav.iterchildren(*map(xhtml, + 'h1 h2 h3 h4 h5 h6'.split())): + text = etree.tostring(h, method='text', encoding='unicode', + with_tail=False) or h.get('title') if text: toc_root.toc_title = text break @@ -235,7 +242,7 @@ def parse_nav(container, nav_name): def verify_toc_destinations(container, toc): anchor_map = {} - anchor_xpath = XPath('//*/@id|//h:a/@name') + anchor_xpath = base.XPath('//*/@id|//h:a/@name') for item in toc.iterdescendants(): name = item.dest if not name: @@ -284,7 +291,8 @@ def get_x_toc(container, find_toc, parse_toc, verify_destinations=True): ans.lang = ans.uid = None return ans toc = find_toc(container) - ans = empty_toc() if toc is None or not container.has_name(toc) else parse_toc(container, toc) + ans = (empty_toc() if toc is None or not container.has_name(toc) 
else
+           parse_toc(container, toc))
     ans.toc_file_name = toc if toc and container.has_name(toc) else None
     if verify_destinations:
         verify_toc_destinations(container, ans)
@@ -294,11 +302,14 @@ def get_x_toc(container, find_toc, parse_toc, verify_destinations=True):
 def get_toc(container, verify_destinations=True):
     ver = container.opf_version_parsed
     if ver.major < 3:
-        return get_x_toc(container, find_existing_ncx_toc, parse_ncx, verify_destinations=verify_destinations)
+        return get_x_toc(container, find_existing_ncx_toc, parse_ncx,
+                         verify_destinations=verify_destinations)
     else:
-        ans = get_x_toc(container, find_existing_nav_toc, parse_nav, verify_destinations=verify_destinations)
+        ans = get_x_toc(container, find_existing_nav_toc, parse_nav,
+                        verify_destinations=verify_destinations)
         if len(ans) == 0:
-            ans = get_x_toc(container, find_existing_ncx_toc, parse_ncx, verify_destinations=verify_destinations)
+            ans = get_x_toc(container, find_existing_ncx_toc, parse_ncx,
+                            verify_destinations=verify_destinations)
     return ans
@@ -308,25 +319,33 @@ def get_guide_landmarks(container):
         href, frag = href.partition('#')[::2]
         name = container.href_to_name(href, container.opf_name)
         if container.has_name(name):
-            yield {'dest':name, 'frag':frag, 'title':title or '', 'type':rtype or ''}
+            yield {'dest': name,
+                   'frag': frag,
+                   'title': title or '',
+                   'type': rtype or ''}
 
 
 def get_nav_landmarks(container):
     nav = find_existing_nav_toc(container)
     if nav and container.has_name(nav):
         root = container.parsed(nav)
-        et = '{%s}type' % EPUB_NS
-        for elem in root.iterdescendants(XHTML('nav')):
+        et = base.tag('epub', 'type')
+        for elem in root.iterdescendants(base.tag('xhtml', 'nav')):
             if elem.get(et) == 'landmarks':
-                for li in elem.iterdescendants(XHTML('li')):
-                    for a in li.iterdescendants(XHTML('a')):
+                for li in elem.iterdescendants(base.tag('xhtml', 'li')):
+                    for a in li.iterdescendants(base.tag('xhtml', 'a')):
                         href, rtype = a.get('href'), a.get(et)
                         if href:
-                            title = etree.tostring(a, 
method='text', encoding='unicode', with_tail=False).strip() + title = etree.tostring(a, method='text', + encoding='unicode', + with_tail=False).strip() href, frag = href.partition('#')[::2] name = container.href_to_name(href, nav) if container.has_name(name): - yield {'dest':name, 'frag':frag, 'title':title or '', 'type':rtype or ''} + yield {'dest': name, + 'frag': frag, + 'title': title or '', + 'type': rtype or ''} break @@ -344,7 +363,7 @@ def ensure_id(elem, all_ids): elem_id = elem.get('id') if elem_id: return False, elem_id - if elem.tag == XHTML('a'): + if elem.tag == base.tag('xhtml', 'a'): anchor = elem.get('name', None) if anchor: elem.set('id', anchor) @@ -361,7 +380,7 @@ def ensure_id(elem, all_ids): def elem_to_toc_text(elem): - text = xml2text(elem).strip() + text = base.xml2text(elem).strip() if not text: text = elem.get('title', '') if not text: @@ -375,7 +394,7 @@ def elem_to_toc_text(elem): def item_at_top(elem): try: - body = XPath('//h:body')(elem.getroottree().getroot())[0] + body = base.XPath('//h:body')(elem.getroottree().getroot())[0] except (TypeError, IndexError, KeyError, AttributeError): return False tree = body.getroottree() @@ -387,7 +406,7 @@ def item_at_top(elem): try: if el.tag.endswith('}img') or (el.text and el.text.strip()): return False - except: + except Exception: return False if not path.startswith(epath): # Only check tail of non-parent elements @@ -404,24 +423,26 @@ def from_xpaths(container, xpaths): Table of Contents from the ``

``, ``

`` and ``

`` tags. ''' tocroot = TOC() - xpaths = [XPath(xp) for xp in xpaths] + xpaths = [base.XPath(xp) for xp in xpaths] # Find those levels that have no elements in all spine items - maps = OrderedDict() + maps = collections.OrderedDict() empty_levels = {i+1 for i, xp in enumerate(xpaths)} for spinepath in container.spine_items: name = container.abspath_to_name(spinepath) root = container.parsed(name) - level_item_map = maps[name] = {i+1:frozenset(xp(root)) for i, xp in enumerate(xpaths)} + level_item_map = maps[name] = {i + 1: frozenset(xp(root)) + for i, xp in enumerate(xpaths)} for lvl, elems in level_item_map.items(): if elems: empty_levels.discard(lvl) # Remove empty levels from all level_maps if empty_levels: for name, lmap in tuple(maps.items()): - lmap = {lvl:items for lvl, items in lmap.items() if lvl not in empty_levels} - lmap = sorted(lmap.items(), key=itemgetter(0)) - lmap = {i+1:items for i, (l, items) in enumerate(lmap)} + lmap = {lvl: items for lvl, items in lmap.items() + if lvl not in empty_levels} + lmap = sorted(lmap.items(), key=operator.itemgetter(0)) + lmap = {i + 1: items for i, (l, items) in enumerate(lmap)} maps[name] = lmap node_level_map = {tocroot: 0} @@ -434,13 +455,15 @@ def from_xpaths(container, xpaths): if child is None: return node lvl = node_level_map[child] - return node if lvl > limit else child if lvl == limit else process_node(child) + return (node if lvl > limit else + child if lvl == limit else process_node(child)) return process_node(tocroot) for name, level_item_map in maps.items(): root = container.parsed(name) - item_level_map = {e:i for i, elems in level_item_map.items() for e in elems} + item_level_map = {e: i for i, elems in level_item_map.items() + for e in elems} item_dirtied = False all_ids = set(root.xpath('//*/@id')) @@ -470,7 +493,7 @@ def from_links(container): Generate a Table of Contents from links in the book. 
''' toc = TOC() - link_path = XPath('//h:a[@href]') + link_path = base.XPath('//h:a[@href]') seen_titles, seen_dests = set(), set() for name, is_linear in container.spine_names: root = container.parsed(name) @@ -506,7 +529,7 @@ def find_text(node): pat = re.compile(r'\s+') for child in node: if isinstance(child, etree._Element): - text = xml2text(child).strip() + text = base.xml2text(child).strip() text = pat.sub(' ', text) if len(text) < 1: continue @@ -526,7 +549,7 @@ def from_files(container): for i, spinepath in enumerate(container.spine_items): name = container.abspath_to_name(spinepath) root = container.parsed(name) - body = XPath('//h:body')(root) + body = base.XPath('//h:body')(root) if not body: continue text = find_text(body[0]) @@ -576,42 +599,46 @@ def add_id(container, name, loc, totals=None): def create_ncx(toc, to_href, btitle, lang, uid): lang = lang.replace('_', '-') - ncx = etree.Element(NCX('ncx'), - attrib={'version': '2005-1', XML('lang'): lang}, - nsmap={None: NCX_NS}) - head = etree.SubElement(ncx, NCX('head')) - etree.SubElement(head, NCX('meta'), - name='dtb:uid', content=str(uid)) - etree.SubElement(head, NCX('meta'), - name='dtb:depth', content=str(toc.depth)) + ncx = etree.Element(base.tag('ncx', 'ncx'), + attrib={'version': '2005-1', + base.tag('xml', 'lang'): lang}, + nsmap={None: const.NCX_NS}) + head = etree.SubElement(ncx, base.tag('ncx', 'head')) + etree.SubElement(head, base.tag('ncx', 'meta'), + name='dtb:uid', content=str(uid)) + etree.SubElement(head, base.tag('ncx', 'meta'), + name='dtb:depth', content=str(toc.depth)) generator = ''.join(['calibre (', __version__, ')']) - etree.SubElement(head, NCX('meta'), - name='dtb:generator', content=generator) - etree.SubElement(head, NCX('meta'), name='dtb:totalPageCount', content='0') - etree.SubElement(head, NCX('meta'), name='dtb:maxPageNumber', content='0') - title = etree.SubElement(ncx, NCX('docTitle')) - text = etree.SubElement(title, NCX('text')) + etree.SubElement(head, 
base.tag('ncx', 'meta'), + name='dtb:generator', content=generator) + etree.SubElement(head, base.tag('ncx', 'meta'), name='dtb:totalPageCount', + content='0') + etree.SubElement(head, base.tag('ncx', 'meta'), name='dtb:maxPageNumber', + content='0') + title = etree.SubElement(ncx, base.tag('ncx', 'docTitle')) + text = etree.SubElement(title, base.tag('ncx', 'text')) text.text = btitle - navmap = etree.SubElement(ncx, NCX('navMap')) + navmap = etree.SubElement(ncx, base.tag('ncx', 'navMap')) spat = re.compile(r'\s+') - play_order = Counter() + play_order = collections.Counter() def process_node(xml_parent, toc_parent): for child in toc_parent: play_order['c'] += 1 - point = etree.SubElement(xml_parent, NCX('navPoint'), id='num_%d' % play_order['c'], - playOrder=str(play_order['c'])) - label = etree.SubElement(point, NCX('navLabel')) + point = etree.SubElement(xml_parent, base.tag('ncx', 'navPoint'), + id='num_%d' % play_order['c'], + playOrder=str(play_order['c'])) + label = etree.SubElement(point, base.tag('ncx', 'navLabel')) title = child.title if title: title = spat.sub(' ', title) - etree.SubElement(label, NCX('text')).text = title + etree.SubElement(label, base.tag('ncx', 'text')).text = title if child.dest: href = to_href(child.dest) if child.frag: href += '#'+child.frag - etree.SubElement(point, NCX('content'), src=href) + etree.SubElement(point, base.tag('ncx', 'content'), src=href) process_node(point, child) process_node(navmap, toc) @@ -622,41 +649,43 @@ def commit_ncx_toc(container, toc, lang=None, uid=None): tocname = find_existing_ncx_toc(container) if tocname is None: item = container.generate_item('toc.ncx', id_prefix='toc') - tocname = container.href_to_name(item.get('href'), base=container.opf_name) + tocname = container.href_to_name(item.get('href'), + base=container.opf_name) ncx_id = item.get('id') [s.set('toc', ncx_id) for s in container.opf_xpath('//opf:spine')] if not lang: lang = get_lang() - for l in container.opf_xpath('//dc:language'): - 
l = canonicalize_lang(xml2text(l).strip())
-        if l:
-            lang = l
-            lang = lang_as_iso639_1(l) or l
+    for _l in container.opf_xpath('//dc:language'):
+        _l = canonicalize_lang(base.xml2text(_l).strip())
+        if _l:
+            lang = _l
+            lang = lang_as_iso639_1(_l) or _l
             break
         lang = lang_as_iso639_1(lang) or lang
 
     if not uid:
-        uid = uuid_id()
+        uid = base.uuid_id()
         eid = container.opf.get('unique-identifier', None)
         if eid:
-            m = container.opf_xpath('//*[@id="%s"]'%eid)
+            m = container.opf_xpath('//*[@id="%s"]' % eid)
             if m:
-                uid = xml2text(m[0])
+                uid = base.xml2text(m[0])
     title = 'Table of Contents'
     m = container.opf_xpath('//dc:title')
     if m:
-        x = xml2text(m[0]).strip()
+        x = base.xml2text(m[0]).strip()
         title = x or title
-    to_href = partial(container.name_to_href, base=tocname)
+    to_href = functools.partial(container.name_to_href, base=tocname)
     root = create_ncx(toc, to_href, title, lang, uid)
     container.replace(tocname, root)
     container.pretty_print.add(tocname)
 
 
 def ensure_single_nav_of_type(root, ntype='toc'):
-    et = '{%s}type' % EPUB_NS
-    navs = [n for n in root.iterdescendants(XHTML('nav')) if n.get(et) == ntype]
+    et = base.tag('epub', 'type')
+    navs = [n for n in root.iterdescendants(base.tag('xhtml', 'nav'))
+            if n.get(et) == ntype]
     for x in navs[1:]:
         extract(x)
     if navs:
@@ -667,13 +696,14 @@ def ensure_single_nav_of_type(root, ntype='toc'):
         nav.attrib.update(attrib)
         nav.tail = tail
     else:
-        nav = root.makeelement(XHTML('nav'))
-        first_child(root, XHTML('body')).append(nav)
-        nav.set('{%s}type' % EPUB_NS, ntype)
+        nav = root.makeelement(base.tag('xhtml', 'nav'))
+        first_child(root, base.tag('xhtml', 'body')).append(nav)
+        nav.set(et, ntype)
     return nav
 
 
-def commit_nav_toc(container, toc, lang=None, landmarks=None, previous_nav=None):
+def commit_nav_toc(container, toc, lang=None, landmarks=None,
+                   previous_nav=None):
     from ebook_converter.ebooks.oeb.polish.pretty import pretty_xml_tree
     tocname = find_existing_nav_toc(container)
     if previous_nav is not None:
@@ -684,7 +714,8 @@ def 
commit_nav_toc(container, toc, lang=None, landmarks=None, previous_nav=None) if tocname is None: item = container.generate_item('nav.xhtml', id_prefix='nav') item.set('properties', 'nav') - tocname = container.href_to_name(item.get('href'), base=container.opf_name) + tocname = container.href_to_name(item.get('href'), + base=container.opf_name) if previous_nav is not None: root = previous_nav[1] else: @@ -698,24 +729,25 @@ def commit_nav_toc(container, toc, lang=None, landmarks=None, previous_nav=None) if lang: lang = lang_as_iso639_1(lang) or lang root.set('lang', lang) - root.set('{%s}lang' % XML_NS, lang) + root.set(base.tag('xml', 'lang'), lang) nav = ensure_single_nav_of_type(root, 'toc') if toc.toc_title: - nav.append(nav.makeelement(XHTML('h1'))) + nav.append(nav.makeelement(base.tag('xhtml', 'h1'))) nav[-1].text = toc.toc_title - rnode = nav.makeelement(XHTML('ol')) + rnode = nav.makeelement(base.tag('xhtml', 'ol')) nav.append(rnode) - to_href = partial(container.name_to_href, base=tocname) + to_href = functools.partial(container.name_to_href, base=tocname) spat = re.compile(r'\s+') def process_node(xml_parent, toc_parent): for child in toc_parent: - li = xml_parent.makeelement(XHTML('li')) + li = xml_parent.makeelement(base.tag('xhtml', 'li')) xml_parent.append(li) title = child.title or '' title = spat.sub(' ', title).strip() - a = li.makeelement(XHTML('a' if child.dest else 'span')) + a = li.makeelement(base.tag('xhtml', 'a' + if child.dest else 'span')) a.text = title li.append(a) if child.dest: @@ -724,14 +756,14 @@ def commit_nav_toc(container, toc, lang=None, landmarks=None, previous_nav=None) href += '#'+child.frag a.set('href', href) if len(child): - ol = li.makeelement(XHTML('ol')) + ol = li.makeelement(base.tag('xhtml', 'ol')) li.append(ol) process_node(ol, child) process_node(rnode, toc) pretty_xml_tree(nav) def collapse_li(parent): - for li in parent.iterdescendants(XHTML('li')): + for li in parent.iterdescendants(base.tag('xhtml', 'li')): if 
len(li) == 1: li.text = None li[0].tail = None @@ -739,9 +771,9 @@ def commit_nav_toc(container, toc, lang=None, landmarks=None, previous_nav=None) nav.tail = '\n' def create_li(ol, entry): - li = ol.makeelement(XHTML('li')) + li = ol.makeelement(base.tag('xhtml', 'li')) ol.append(li) - a = li.makeelement(XHTML('a')) + a = li.makeelement(base.tag('xhtml', 'a')) li.append(a) href = container.name_to_href(entry['dest'], tocname) if entry['frag']: @@ -752,12 +784,13 @@ def commit_nav_toc(container, toc, lang=None, landmarks=None, previous_nav=None) if landmarks is not None: nav = ensure_single_nav_of_type(root, 'landmarks') nav.set('hidden', '') - ol = nav.makeelement(XHTML('ol')) + ol = nav.makeelement(base.tag('xhtml', 'ol')) nav.append(ol) for entry in landmarks: - if entry['type'] and container.has_name(entry['dest']) and container.mime_map[entry['dest']] in OEB_DOCS: + if (entry['type'] and container.has_name(entry['dest']) and + container.mime_map[entry['dest']] in base.OEB_DOCS): a = create_li(ol, entry) - a.set('{%s}type' % EPUB_NS, entry['type']) + a.set(base.tag('epub', 'type'), entry['type']) a.text = entry['title'] or None pretty_xml_tree(nav) collapse_li(nav) @@ -765,10 +798,11 @@ def commit_nav_toc(container, toc, lang=None, landmarks=None, previous_nav=None) if toc.page_list: nav = ensure_single_nav_of_type(root, 'page-list') nav.set('hidden', '') - ol = nav.makeelement(XHTML('ol')) + ol = nav.makeelement(base.tag('xhtml', 'ol')) nav.append(ol) for entry in toc.page_list: - if container.has_name(entry['dest']) and container.mime_map[entry['dest']] in OEB_DOCS: + if (container.has_name(entry['dest']) and + container.mime_map[entry['dest']] in base.OEB_DOCS): a = create_li(ol, entry) a.text = str(entry['pagenum']) pretty_xml_tree(nav) @@ -785,11 +819,12 @@ def commit_toc(container, toc, lang=None, uid=None): def remove_names_from_toc(container, names): changed = [] names = frozenset(names) - for find_toc, parse_toc, commit_toc in ( - 
(find_existing_ncx_toc, parse_ncx, commit_ncx_toc), - (find_existing_nav_toc, parse_nav, commit_nav_toc), - ): - toc = get_x_toc(container, find_toc, parse_toc, verify_destinations=False) + for find_toc, parse_toc, commit_toc in ((find_existing_ncx_toc, + parse_ncx, commit_ncx_toc), + (find_existing_nav_toc, + parse_nav, commit_nav_toc)): + toc = get_x_toc(container, find_toc, parse_toc, + verify_destinations=False) if len(toc) > 0: remove = [] for node in toc.iterdescendants(): @@ -805,15 +840,16 @@ def remove_names_from_toc(container, names): def find_inline_toc(container): for name, linear in container.spine_names: - if container.parsed(name).xpath('//*[local-name()="body" and @id="calibre_generated_inline_toc"]'): + if container.parsed(name).xpath('//*[local-name()="body" and @id=' + '"calibre_generated_inline_toc"]'): return name def toc_to_html(toc, container, toc_name, title, lang=None): def process_node(html_parent, toc, level=1, indent=' ', style_level=2): - li = html_parent.makeelement(XHTML('li')) - li.tail = '\n'+ (indent*level) + li = html_parent.makeelement(base.tag('xhtml', 'li')) + li.tail = '\n' + (indent * level) html_parent.append(li) name, frag = toc.dest, toc.frag href = '#' @@ -821,32 +857,29 @@ def toc_to_html(toc, container, toc_name, title, lang=None): href = container.name_to_href(name, toc_name) if frag: href += '#' + frag - a = li.makeelement(XHTML('a'), href=href) + a = li.makeelement(base.tag('xhtml', 'a'), href=href) a.text = toc.title li.append(a) if len(toc) > 0: - parent = li.makeelement(XHTML('ul')) + parent = li.makeelement(base.tag('xhtml', 'ul')) parent.set('class', 'level%d' % (style_level)) li.append(parent) a.tail = '\n\n' + (indent*(level+2)) parent.text = '\n'+(indent*(level+3)) parent.tail = '\n\n' + (indent*(level+1)) for child in toc: - process_node(parent, child, level+3, style_level=style_level + 1) + process_node(parent, child, level+3, + style_level=style_level + 1) parent[-1].tail = '\n' + (indent*(level+2)) - E = 
ElementMaker(namespace=XHTML_NS, nsmap={None:XHTML_NS}) - html = E.html( - E.head( - E.title(title), - E.style(P('templates/inline_toc_styles.css', data=True), type='text/css'), - ), - E.body( - E.h2(title), - E.ul(), - id="calibre_generated_inline_toc", - ) - ) + E = ElementMaker(namespace=const.XHTML_NS, nsmap={None: const.XHTML_NS}) + # TODO(gryf): revisit lack of css. + css_f = pkg_resources.resource_filename('ebook_converter', + 'data/inline_toc_styles.css') + html = E.html(E.head(E.title(title), + E.style(css_f, type='text/css')), + E.body(E.h2(title), E.ul(), + id="calibre_generated_inline_toc")) ul = html[1][1] ul.set('class', 'level1') @@ -859,11 +892,12 @@ def toc_to_html(toc, container, toc_name, title, lang=None): def create_inline_toc(container, title=None): - ''' - Create an inline (HTML) Table of Contents from an existing NCX Table of Contents. + """ + Create an inline (HTML) Table of Contents from an existing NCX Table of + Contents. :param title: The title for this table of contents. - ''' + """ lang = get_book_language(container) default_title = 'Table of Contents' title = title or default_title @@ -874,7 +908,7 @@ def create_inline_toc(container, title=None): name = toc_name html = toc_to_html(toc, container, name, title, lang) - raw = serialize(html, 'text/html') + raw = base.serialize(html, 'text/html') if name is None: name, c = 'toc.xhtml', 0 while container.has_name(name): @@ -884,5 +918,6 @@ def create_inline_toc(container, title=None): else: with container.open(name, 'wb') as f: f.write(raw) - set_guide_item(container, 'toc', title, name, frag='calibre_generated_inline_toc') + set_guide_item(container, 'toc', title, name, + frag='calibre_generated_inline_toc') return name diff --git a/ebook_converter/ebooks/oeb/reader.py b/ebook_converter/ebooks/oeb/reader.py index 4139b1b..f5a02f7 100644 --- a/ebook_converter/ebooks/oeb/reader.py +++ b/ebook_converter/ebooks/oeb/reader.py @@ -1,21 +1,21 @@ """ Container-/OPF-based input OEBBook reader. 
""" -import sys, os, uuid, copy, re, io -from collections import defaultdict +import collections +import copy +import io +import os +import re +import sys import urllib.parse +import uuid from lxml import etree -from ebook_converter.ebooks.oeb.base import OPF1_NS, OPF2_NS, OPF2_NSMAP, DC11_NS, \ - DC_NSES, OPF, xml2text, XHTML_MIME -from ebook_converter.ebooks.oeb.base import OEB_DOCS, OEB_STYLES, OEB_IMAGES, \ - PAGE_MAP_MIME, JPEG_MIME, NCX_MIME, SVG_MIME -from ebook_converter.ebooks.oeb.base import XMLDECL_RE, COLLAPSE_RE, \ - MS_COVER_TYPE, iterlinks -from ebook_converter.ebooks.oeb.base import namespace, barename, XPath, xpath, \ - urlnormalize, BINARY_MIME, \ - OEBError, OEBBook, DirContainer +from ebook_converter import constants as const +from ebook_converter.ebooks.oeb import base +from ebook_converter.ebooks.oeb import parse_utils +from ebook_converter.ebooks.metadata import opf2 as opf_meta from ebook_converter.ebooks.oeb.writer import OEBWriter from ebook_converter.utils.xml_parse import safe_xml_fromstring from ebook_converter.utils.cleantext import clean_xml_chars @@ -26,18 +26,13 @@ from ebook_converter import guess_type, xml_replace_entities from ebook_converter.polyglot.urllib import unquote -__all__ = ['OEBReader'] -__license__ = 'GPL v3' -__copyright__ = '2008, Marshall T. Vandegrift ' - - class OEBReader(object): """Read an OEBPS 1.x or OPF/OPS 2.0 file collection.""" - COVER_SVG_XP = XPath('h:body//svg:svg[position() = 1]') - COVER_OBJECT_XP = XPath('h:body//h:object[@data][position() = 1]') + COVER_SVG_XP = base.XPath('h:body//svg:svg[position() = 1]') + COVER_OBJECT_XP = base.XPath('h:body//h:object[@data][position() = 1]') - Container = DirContainer + Container = base.DirContainer """Container type used to access book files. 
Override in sub-classes.""" DEFAULT_PROFILE = 'PRS505' @@ -75,61 +70,67 @@ class OEBReader(object): for elem in opf.iter(tag=etree.Element): nsmap.update(elem.nsmap) for elem in opf.iter(tag=etree.Element): - if namespace(elem.tag) in ('', OPF1_NS) and ':' not in barename(elem.tag): - elem.tag = OPF(barename(elem.tag)) - nsmap.update(OPF2_NSMAP) + if (parse_utils.namespace(elem.tag) in ('', const.OPF1_NS) and + ':' not in parse_utils.barename(elem.tag)): + elem.tag = base.tag('opf', parse_utils.barename(elem.tag)) + nsmap.update(const.OPF2_NSMAP) attrib = dict(opf.attrib) - nroot = etree.Element(OPF('package'), - nsmap={None: OPF2_NS}, attrib=attrib) - metadata = etree.SubElement(nroot, OPF('metadata'), nsmap=nsmap) - ignored = (OPF('dc-metadata'), OPF('x-metadata')) - for elem in xpath(opf, 'o2:metadata//*'): + nroot = etree.Element(base.tag('opf', 'package'), + nsmap={None: const.OPF2_NS}, attrib=attrib) + metadata = etree.SubElement(nroot, base.tag('opf', 'metadata'), + nsmap=nsmap) + ignored = (base.tag('opf', 'dc-metadata'), base.tag('opf', 'x-metadata')) + for elem in base.xpath(opf, 'o2:metadata//*'): if elem.tag in ignored: continue - if namespace(elem.tag) in DC_NSES: - tag = barename(elem.tag).lower() - elem.tag = '{%s}%s' % (DC11_NS, tag) + if parse_utils.namespace(elem.tag) in const.DC_NSES: + tag = parse_utils.barename(elem.tag).lower() + elem.tag = '{%s}%s' % (const.DC11_NS, tag) if elem.tag.startswith('dc:'): tag = elem.tag.partition(':')[-1].lower() - elem.tag = '{%s}%s' % (DC11_NS, tag) + elem.tag = '{%s}%s' % (const.DC11_NS, tag) metadata.append(elem) - for element in xpath(opf, 'o2:metadata//o2:meta'): + for element in base.xpath(opf, 'o2:metadata//o2:meta'): metadata.append(element) for tag in ('o2:manifest', 'o2:spine', 'o2:tours', 'o2:guide'): - for element in xpath(opf, tag): + for element in base.xpath(opf, tag): nroot.append(element) return nroot def _read_opf(self): data = self.oeb.container.read(None) data = self.oeb.decode(data) - data = 
XMLDECL_RE.sub('', data) + data = base.XMLDECL_RE.sub('', data) data = re.sub(r'http://openebook.org/namespaces/oeb-package/1.0(/*)', - OPF1_NS, data) + const.OPF1_NS, data) try: - opf = safe_xml_fromstring(data) + opf = etree.fromstring(data) except etree.XMLSyntaxError: data = xml_replace_entities(clean_xml_chars(data), encoding=None) try: - opf = safe_xml_fromstring(data) + opf = etree.fromstring(data) self.logger.warn('OPF contains invalid HTML named entities') except etree.XMLSyntaxError: data = re.sub(r'(?is).+', '', data) data = data.replace('', - '') - opf = safe_xml_fromstring(data) + '') + opf = etree.fromstring(data) self.logger.warn('OPF contains invalid tours section') - ns = namespace(opf.tag) - if ns not in ('', OPF1_NS, OPF2_NS): - raise OEBError('Invalid namespace %r for OPF document' % ns) + ns = parse_utils.namespace(opf.tag) + if ns not in ('', const.OPF1_NS, const.OPF2_NS): + raise base.OEBError('Invalid namespace %r for OPF document' % ns) opf = self._clean_opf(opf) return opf def _metadata_from_opf(self, opf): from ebook_converter.ebooks.metadata.opf2 import OPF - from ebook_converter.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata - stream = io.BytesIO(etree.tostring(opf, xml_declaration=True, encoding='utf-8')) + from ebook_converter.ebooks.oeb.transforms.metadata import \ + meta_info_to_oeb_metadata + stream = io.BytesIO(etree.tostring(opf, xml_declaration=True, + encoding='utf-8')) + # o = opf_meta.OPF(stream) o = OPF(stream) pwm = o.primary_writing_mode if pwm: @@ -139,8 +140,8 @@ class OEBReader(object): mi.language = get_lang().replace('_', '-') self.oeb.metadata.add('language', mi.language) if not mi.book_producer: - mi.book_producer = '%(a)s (%(v)s) [http://%(a)s-ebook.com]'%\ - dict(a=__appname__, v=__version__) + mi.book_producer = ('%(a)s (%(v)s) [http://%(a)s-ebook.com]' % + dict(a=__appname__, v=__version__)) meta_info_to_oeb_metadata(mi, self.oeb.metadata, self.logger) m = self.oeb.metadata m.add('identifier', 
str(uuid.uuid4()), id='uuid_id', scheme='uuid') @@ -162,16 +163,16 @@ class OEBReader(object): data. ''' bad = [] - check = OEB_DOCS.union(OEB_STYLES) + check = base.OEB_DOCS.union(base.OEB_STYLES) for item in list(self.oeb.manifest.values()): if item.media_type in check: try: item.data except KeyboardInterrupt: raise - except: - self.logger.exception('Failed to parse content in %s'% - item.href) + except Exception: + self.logger.exception('Failed to parse content in %s' % + item.href) bad.append(item) self.oeb.manifest.remove(item) return bad @@ -181,25 +182,28 @@ class OEBReader(object): manifest = self.oeb.manifest known = set(manifest.hrefs) unchecked = set(manifest.values()) - cdoc = OEB_DOCS|OEB_STYLES + cdoc = base.OEB_DOCS | base.OEB_STYLES invalid = set() while unchecked: new = set() for item in unchecked: data = None - if (item.media_type in cdoc or item.media_type[-4:] in ('/xml', '+xml')): + if (item.media_type in cdoc or + item.media_type[-4:] in ('/xml', '+xml')): try: data = item.data - except: + except Exception: self.oeb.log.exception('Failed to read from manifest ' - 'entry with id: %s, ignoring'%item.id) + 'entry with id: %s, ignoring' % + item.id) invalid.add(item) continue if data is None: continue - if (item.media_type in OEB_DOCS or item.media_type[-4:] in ('/xml', '+xml')): - hrefs = [r[2] for r in iterlinks(data)] + if (item.media_type in base.OEB_DOCS or + item.media_type[-4:] in ('/xml', '+xml')): + hrefs = [r[2] for r in base.iterlinks(data)] for href in hrefs: if isinstance(href, bytes): href = href.decode('utf-8') @@ -207,22 +211,22 @@ class OEBReader(object): if not href: continue try: - href = item.abshref(urlnormalize(href)) + href = item.abshref(base.urlnormalize(href)) scheme = urllib.parse.urlparse(href).scheme - except: - self.oeb.log.exception( - 'Skipping invalid href: %r'%href) + except Exception: + self.oeb.log.exception('Skipping invalid href: ' + '%r' % href) continue if not scheme and href not in known: new.add(href) - 
elif item.media_type in OEB_STYLES: + elif item.media_type in base.OEB_STYLES: try: urls = list(css_parser.getUrls(data)) - except: + except Exception: urls = [] for url in urls: href, _ = urllib.parse.urldefrag(url) - href = item.abshref(urlnormalize(href)) + href = item.abshref(base.urlnormalize(href)) scheme = urllib.parse.urlparse(href).scheme if not scheme and href not in known: new.add(href) @@ -232,7 +236,7 @@ class OEBReader(object): known.add(href) is_invalid = False for item in invalid: - if href == item.abshref(urlnormalize(href)): + if href == item.abshref(base.urlnormalize(href)): is_invalid = True break if is_invalid: @@ -243,11 +247,12 @@ class OEBReader(object): warned.add(href) continue if href not in warned: - self.logger.warn('Referenced file %r not in manifest' % href) + self.logger.warn('Referenced file %r not in manifest' % + href) warned.add(href) id, _ = manifest.generate(id='added') guessed = guess_type(href)[0] - media_type = guessed or BINARY_MIME + media_type = guessed or base.BINARY_MIME added = manifest.add(id, href, media_type) unchecked.add(added) @@ -256,7 +261,7 @@ class OEBReader(object): def _manifest_from_opf(self, opf): manifest = self.oeb.manifest - for elem in xpath(opf, '/o2:package/o2:manifest/o2:item'): + for elem in base.xpath(opf, '/o2:package/o2:manifest/o2:item'): id = elem.get('id') href = elem.get('href') media_type = elem.get('media-type', None) @@ -264,7 +269,7 @@ class OEBReader(object): media_type = elem.get('mediatype', None) if not media_type or media_type == 'text/xml': guessed = guess_type(href)[0] - media_type = guessed or media_type or BINARY_MIME + media_type = guessed or media_type or base.BINARY_MIME if hasattr(media_type, 'lower'): media_type = media_type.lower() fallback = elem.get('fallback') @@ -285,12 +290,12 @@ class OEBReader(object): manifest = self.oeb.manifest spine = self.oeb.spine unchecked = set(spine) - selector = XPath('h:body//h:a/@href') + selector = base.XPath('h:body//h:a/@href') 
extras = set() while unchecked: new = set() for item in unchecked: - if item.media_type not in OEB_DOCS: + if item.media_type not in base.OEB_DOCS: # TODO: handle fallback chains continue for href in selector(item.data): @@ -298,20 +303,21 @@ class OEBReader(object): if not href: continue try: - href = item.abshref(urlnormalize(href)) + href = item.abshref(base.urlnormalize(href)) except ValueError: # Malformed URL continue if href not in manifest.hrefs: continue found = manifest.hrefs[href] - if found.media_type not in OEB_DOCS or \ + if found.media_type not in base.OEB_DOCS or \ found in spine or found in extras: continue new.add(found) extras.update(new) unchecked = new version = int(self.oeb.version[0]) - removed_items_to_ignore = getattr(self.oeb, 'removed_items_to_ignore', ()) + removed_items_to_ignore = getattr(self.oeb, 'removed_items_to_ignore', + ()) for item in sorted(extras): if item.href in removed_items_to_ignore: continue @@ -323,34 +329,38 @@ class OEBReader(object): def _spine_from_opf(self, opf): spine = self.oeb.spine manifest = self.oeb.manifest - for elem in xpath(opf, '/o2:package/o2:spine/o2:itemref'): + for elem in base.xpath(opf, '/o2:package/o2:spine/o2:itemref'): idref = elem.get('idref') if idref not in manifest.ids: self.logger.warn('Spine item %r not found' % idref) continue item = manifest.ids[idref] - if item.media_type.lower() in OEB_DOCS and hasattr(item.data, 'xpath') and not getattr(item.data, 'tag', '').endswith('}ncx'): + if (item.media_type.lower() in base.OEB_DOCS and + hasattr(item.data, 'xpath') and not + getattr(item.data, 'tag', '').endswith('}ncx')): spine.add(item, elem.get('linear')) else: - if hasattr(item.data, 'tag') and item.data.tag and item.data.tag.endswith('}html'): - item.media_type = XHTML_MIME + if (hasattr(item.data, 'tag') and + item.data.tag and item.data.tag.endswith('}html')): + item.media_type = base.XHTML_MIME spine.add(item, elem.get('linear')) else: self.oeb.log.warn('The item %s is not a XML 
document.' - ' Removing it from spine.'%item.href) + ' Removing it from spine.' % item.href) if len(spine) == 0: - raise OEBError("Spine is empty") + raise base.OEBError("Spine is empty") self._spine_add_extra() - for val in xpath(opf, '/o2:package/o2:spine/@page-progression-direction'): + for val in base.xpath(opf, + '/o2:package/o2:spine/@page-progression-direction'): if val in {'ltr', 'rtl'}: spine.page_progression_direction = val def _guide_from_opf(self, opf): guide = self.oeb.guide manifest = self.oeb.manifest - for elem in xpath(opf, '/o2:package/o2:guide/o2:reference'): + for elem in base.xpath(opf, '/o2:package/o2:guide/o2:reference'): ref_href = elem.get('href') - path = urlnormalize(urllib.parse.urldefrag(ref_href)[0]) + path = base.urlnormalize(urllib.parse.urldefrag(ref_href)[0]) if path not in manifest.hrefs: corrected_href = None for href in manifest.hrefs: @@ -366,7 +376,7 @@ class OEBReader(object): guide.add(typ, elem.get('title'), ref_href) def _find_ncx(self, opf): - result = xpath(opf, '/o2:package/o2:spine/@toc') + result = base.xpath(opf, '/o2:package/o2:spine/@toc') if result: id = result[0] if id not in self.oeb.manifest.ids: @@ -375,30 +385,33 @@ class OEBReader(object): self.oeb.manifest.remove(item) return item for item in self.oeb.manifest.values(): - if item.media_type == NCX_MIME: + if item.media_type == base.NCX_MIME: self.oeb.manifest.remove(item) return item return None def _toc_from_navpoint(self, item, toc, navpoint): - children = xpath(navpoint, 'ncx:navPoint') + children = base.xpath(navpoint, 'ncx:navPoint') for child in children: - title = ''.join(xpath(child, 'ncx:navLabel/ncx:text/text()')) - title = COLLAPSE_RE.sub(' ', title.strip()) - href = xpath(child, 'ncx:content/@src') + title = ''.join(base.xpath(child, 'ncx:navLabel/ncx:text/text()')) + title = base.COLLAPSE_RE.sub(' ', title.strip()) + href = base.xpath(child, 'ncx:content/@src') if not title: self._toc_from_navpoint(item, toc, child) continue - if (not href or 
not href[0]) and not xpath(child, 'ncx:navPoint'): + if (not href or not href[0]) and not base.xpath(child, 'ncx:navPoint'): # This node is useless continue - href = item.abshref(urlnormalize(href[0])) if href and href[0] else '' + if href and href[0]: + href = item.abshref(base.urlnormalize(href[0])) + else: + href = '' path, _ = urllib.parse.urldefrag(href) if path and path not in self.oeb.manifest.hrefs: - path = urlnormalize(path) + path = base.urlnormalize(path) if href and path not in self.oeb.manifest.hrefs: self.logger.warn('TOC reference %r not found' % href) - gc = xpath(child, 'ncx:navPoint') + gc = base.xpath(child, 'ncx:navPoint') if not gc: # This node is useless continue @@ -406,36 +419,40 @@ class OEBReader(object): klass = child.get('class', 'chapter') try: - po = int(child.get('playOrder', self.oeb.toc.next_play_order())) - except: + po = int(child.get('playOrder', + self.oeb.toc.next_play_order())) + except Exception: po = self.oeb.toc.next_play_order() - authorElement = xpath(child, - 'descendant::calibre:meta[@name = "author"]') + authorElement = base.xpath(child, + 'descendant::calibre:meta[@name = "author"]') if authorElement: author = authorElement[0].text else: author = None - descriptionElement = xpath(child, - 'descendant::calibre:meta[@name = "description"]') + descriptionElement = base.xpath(child, + 'descendant::calibre:meta[@name = ' + '"description"]') if descriptionElement: description = etree.tostring(descriptionElement[0], - method='text', encoding='unicode').strip() + method='text', + encoding='unicode').strip() if not description: description = None else: description = None - index_image = xpath(child, - 'descendant::calibre:meta[@name = "toc_thumbnail"]') + index_image = base.xpath(child, + 'descendant::calibre:meta[@name = ' + '"toc_thumbnail"]') toc_thumbnail = (index_image[0].text if index_image else None) if not toc_thumbnail or not toc_thumbnail.strip(): toc_thumbnail = None node = toc.add(title, href, id=id, klass=klass, 
- play_order=po, description=description, author=author, - toc_thumbnail=toc_thumbnail) + play_order=po, description=description, + author=author, toc_thumbnail=toc_thumbnail) self._toc_from_navpoint(item, node, child) @@ -444,31 +461,31 @@ class OEBReader(object): return False self.log.debug('Reading TOC from NCX...') ncx = item.data - title = ''.join(xpath(ncx, 'ncx:docTitle/ncx:text/text()')) - title = COLLAPSE_RE.sub(' ', title.strip()) + title = ''.join(base.xpath(ncx, 'ncx:docTitle/ncx:text/text()')) + title = base.COLLAPSE_RE.sub(' ', title.strip()) title = title or str(self.oeb.metadata.title[0]) toc = self.oeb.toc toc.title = title - navmaps = xpath(ncx, 'ncx:navMap') + navmaps = base.xpath(ncx, 'ncx:navMap') for navmap in navmaps: self._toc_from_navpoint(item, toc, navmap) return True def _toc_from_tour(self, opf): - result = xpath(opf, 'o2:tours/o2:tour') + result = base.xpath(opf, 'o2:tours/o2:tour') if not result: return False self.log.debug('Reading TOC from tour...') tour = result[0] toc = self.oeb.toc toc.title = tour.get('title') - sites = xpath(tour, 'o2:site') + sites = base.xpath(tour, 'o2:site') for site in sites: title = site.get('title') href = site.get('href') if not title or not href: continue - path, _ = urllib.parse.urldefrag(urlnormalize(href)) + path, _ = urllib.parse.urldefrag(base.urlnormalize(href)) if path not in self.oeb.manifest.hrefs: self.logger.warn('TOC reference %r not found' % href) continue @@ -484,23 +501,23 @@ class OEBReader(object): item = self.oeb.manifest.hrefs[itempath] html = item.data if frag: - elems = xpath(html, './/*[@id="%s"]' % frag) + elems = base.xpath(html, './/*[@id="%s"]' % frag) if not elems: - elems = xpath(html, './/*[@name="%s"]' % frag) + elems = base.xpath(html, './/*[@name="%s"]' % frag) elem = elems[0] if elems else html - while elem != html and not xpath(elem, './/h:a[@href]'): + while elem != html and not base.xpath(elem, './/h:a[@href]'): elem = elem.getparent() html = elem - titles = 
defaultdict(list) + titles = collections.defaultdict(list) order = [] - for anchor in xpath(html, './/h:a[@href]'): + for anchor in base.xpath(html, './/h:a[@href]'): href = anchor.attrib['href'] - href = item.abshref(urlnormalize(href)) + href = item.abshref(base.urlnormalize(href)) path, frag = urllib.parse.urldefrag(href) if path not in self.oeb.manifest.hrefs: continue - title = xml2text(anchor) - title = COLLAPSE_RE.sub(' ', title.strip()) + title = base.xml2text(anchor) + title = base.COLLAPSE_RE.sub(' ', title.strip()) if href not in titles: order.append(href) titles[href].append(title) @@ -518,15 +535,15 @@ class OEBReader(object): if not item.linear: continue html = item.data - title = ''.join(xpath(html, '/h:html/h:head/h:title/text()')) - title = COLLAPSE_RE.sub(' ', title.strip()) + title = ''.join(base.xpath(html, '/h:html/h:head/h:title/text()')) + title = base.COLLAPSE_RE.sub(' ', title.strip()) if title: titles.append(title) headers.append('(unlabled)') for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'): expr = '/h:html/h:body//h:%s[position()=1]/text()' - header = ''.join(xpath(html, expr % tag)) - header = COLLAPSE_RE.sub(' ', header.strip()) + header = ''.join(base.xpath(html, expr % tag)) + header = base.COLLAPSE_RE.sub(' ', header.strip()) if header: headers[-1] = header break @@ -558,17 +575,17 @@ class OEBReader(object): ncx = item.data if ncx is None: return False - ptargets = xpath(ncx, 'ncx:pageList/ncx:pageTarget') + ptargets = base.xpath(ncx, 'ncx:pageList/ncx:pageTarget') if not ptargets: return False pages = self.oeb.pages for ptarget in ptargets: - name = ''.join(xpath(ptarget, 'ncx:navLabel/ncx:text/text()')) - name = COLLAPSE_RE.sub(' ', name.strip()) - href = xpath(ptarget, 'ncx:content/@src') + name = ''.join(base.xpath(ptarget, 'ncx:navLabel/ncx:text/text()')) + name = base.COLLAPSE_RE.sub(' ', name.strip()) + href = base.xpath(ptarget, 'ncx:content/@src') if not href: continue - href = item.abshref(urlnormalize(href[0])) + href 
= item.abshref(base.urlnormalize(href[0])) id = ptarget.get('id') type = ptarget.get('type', 'normal') klass = ptarget.get('class') @@ -576,7 +593,7 @@ class OEBReader(object): return True def _find_page_map(self, opf): - result = xpath(opf, '/o2:package/o2:spine/@page-map') + result = base.xpath(opf, '/o2:package/o2:spine/@page-map') if result: id = result[0] if id not in self.oeb.manifest.ids: @@ -585,7 +602,7 @@ class OEBReader(object): self.oeb.manifest.remove(item) return item for item in self.oeb.manifest.values(): - if item.media_type == PAGE_MAP_MIME: + if item.media_type == base.PAGE_MAP_MIME: self.oeb.manifest.remove(item) return item return None @@ -596,13 +613,13 @@ class OEBReader(object): return False pmap = item.data pages = self.oeb.pages - for page in xpath(pmap, 'o2:page'): + for page in base.xpath(pmap, 'o2:page'): name = page.get('name', '') href = page.get('href') if not href: continue - name = COLLAPSE_RE.sub(' ', name.strip()) - href = item.abshref(urlnormalize(href)) + name = base.COLLAPSE_RE.sub(' ', name.strip()) + href = item.abshref(base.urlnormalize(href)) type = 'normal' if not name: type = 'special' @@ -628,14 +645,14 @@ class OEBReader(object): if not data: data = b'' id, href = self.oeb.manifest.generate('cover', 'cover.jpg') - item = self.oeb.manifest.add(id, href, JPEG_MIME, data=data) + item = self.oeb.manifest.add(id, href, base.JPEG_MIME, data=data) return item def _locate_cover_image(self): if self.oeb.metadata.cover: id = str(self.oeb.metadata.cover[0]) item = self.oeb.manifest.ids.get(id, None) - if item is not None and item.media_type in OEB_IMAGES: + if item is not None and item.media_type in base.OEB_IMAGES: return item else: self.logger.warn('Invalid cover image @id %r' % id) @@ -644,27 +661,27 @@ class OEBReader(object): href = self.oeb.guide['cover'].href item = self.oeb.manifest.hrefs[href] media_type = item.media_type - if media_type in OEB_IMAGES: + if media_type in base.OEB_IMAGES: return item - elif media_type in 
OEB_DOCS: + elif media_type in base.OEB_DOCS: hcover = item html = hcover.data - if MS_COVER_TYPE in self.oeb.guide: - href = self.oeb.guide[MS_COVER_TYPE].href + if base.MS_COVER_TYPE in self.oeb.guide: + href = self.oeb.guide[base.MS_COVER_TYPE].href item = self.oeb.manifest.hrefs.get(href, None) - if item is not None and item.media_type in OEB_IMAGES: + if item is not None and item.media_type in base.OEB_IMAGES: return item if self.COVER_SVG_XP(html): svg = copy.deepcopy(self.COVER_SVG_XP(html)[0]) href = os.path.splitext(hcover.href)[0] + '.svg' id, href = self.oeb.manifest.generate(hcover.id, href) - item = self.oeb.manifest.add(id, href, SVG_MIME, data=svg) + item = self.oeb.manifest.add(id, href, base.SVG_MIME, data=svg) return item if self.COVER_OBJECT_XP(html): object = self.COVER_OBJECT_XP(html)[0] href = hcover.abshref(object.get('data')) item = self.oeb.manifest.hrefs.get(href, None) - if item is not None and item.media_type in OEB_IMAGES: + if item is not None and item.media_type in base.OEB_IMAGES: return item return self._cover_from_html(hcover) @@ -687,7 +704,8 @@ class OEBReader(object): items = [x for x in self.oeb.manifest if x.href == href] for x in items: if x not in self.oeb.spine: - self.oeb.log.warn('Removing duplicate manifest item with id:', x.id) + self.oeb.log.warn('Removing duplicate manifest item with ' + 'id:', x.id) self.oeb.manifest.remove_duplicate_item(x) def _all_from_opf(self, opf): @@ -706,7 +724,7 @@ class OEBReader(object): def main(argv=sys.argv): reader = OEBReader() for arg in argv[1:]: - oeb = reader(OEBBook(), arg) + oeb = reader(base.OEBBook(), arg) for name, doc in oeb.to_opf1().values(): print(etree.tostring(doc, pretty_print=True)) for name, doc in oeb.to_opf2(page_map=True).values(): diff --git a/ebook_converter/ebooks/oeb/stylizer.py b/ebook_converter/ebooks/oeb/stylizer.py index 178579e..cfef868 100644 --- a/ebook_converter/ebooks/oeb/stylizer.py +++ b/ebook_converter/ebooks/oeb/stylizer.py @@ -10,17 +10,16 @@ 
from css_parser.css import (CSSStyleRule, CSSPageRule, CSSFontFaceRule, cssproperties) from css_parser import (profile as cssprofiles, parseString, parseStyle, log as css_parser_log, CSSParser, profiles, replaceUrls) + +from ebook_converter import constants as const from ebook_converter import force_unicode, as_unicode from ebook_converter.ebooks import unit_convert -from ebook_converter.ebooks.oeb.base import XHTML, XHTML_NS, CSS_MIME, OEB_STYLES, xpath, urlnormalize +from ebook_converter.ebooks.oeb import base from ebook_converter.ebooks.oeb.normalize_css import DEFAULTS, normalizers from ebook_converter.css_selectors import Select, SelectorError, INAPPROPRIATE_PSEUDO_CLASSES from ebook_converter.tinycss.media3 import CSSMedia3Parser -__license__ = 'GPL v3' -__copyright__ = '2008, Marshall T. Vandegrift ' - css_parser_log.setLevel(logging.WARN) _html_css_stylesheet = None @@ -208,7 +207,7 @@ class Stylizer(object): stylesheets = [html_css_stylesheet()] if base_css: stylesheets.append(parseString(base_css, validate=False)) - style_tags = xpath(tree, '//*[local-name()="style" or local-name()="link"]') + style_tags = base.xpath(tree, '//*[local-name()="style" or local-name()="link"]') # Add css_parser parsing profiles from output_profile for profile in self.opts.output_profile.extra_css_modules: @@ -219,7 +218,7 @@ class Stylizer(object): parser = CSSParser(fetcher=self._fetch_css_file, log=logging.getLogger('calibre.css')) for elem in style_tags: - if (elem.tag == XHTML('style') and elem.get('type', CSS_MIME) in OEB_STYLES and media_ok(elem.get('media'))): + if (elem.tag == base.tag('xhtml', 'style') and elem.get('type', base.CSS_MIME) in base.OEB_STYLES and media_ok(elem.get('media'))): text = elem.text if elem.text else '' for x in elem: t = getattr(x, 'text', None) @@ -245,7 +244,7 @@ class Stylizer(object): self.logger.warn('Ignoring missing stylesheet in @import rule:', rule.href) continue sitem = hrefs[ihref] - if sitem.media_type not in OEB_STYLES: + if 
sitem.media_type not in base.OEB_STYLES: self.logger.warn('CSS @import of non-CSS file %r' % rule.href) continue stylesheets.append(sitem.data) @@ -254,11 +253,11 @@ class Stylizer(object): replaceUrls(stylesheet, item.abshref, ignoreImportRules=True) stylesheets.append(stylesheet) - elif (elem.tag == XHTML('link') and elem.get('href') and elem.get( + elif (elem.tag == base.tag('xhtml', 'link') and elem.get('href') and elem.get( 'rel', 'stylesheet').lower() == 'stylesheet' and elem.get( - 'type', CSS_MIME).lower() in OEB_STYLES and media_ok(elem.get('media')) + 'type', base.CSS_MIME).lower() in base.OEB_STYLES and media_ok(elem.get('media')) ): - href = urlnormalize(elem.attrib['href']) + href = base.urlnormalize(elem.attrib['href']) path = item.abshref(href) sitem = oeb.manifest.hrefs.get(path, None) if sitem is None: @@ -326,7 +325,8 @@ class Stylizer(object): special_text = ''.join(punctuation_chars) + \ (text[0] if text else '') - span = x.makeelement('{%s}span' % XHTML_NS) + span = x.makeelement('{%s}span' % + const.XHTML_NS) span.text = special_text span.set('data-fake-first-letter', '1') span.tail = text[1:] @@ -340,10 +340,10 @@ class Stylizer(object): else: for elem in matches: self.style(elem)._update_cssdict(cssdict) - for elem in xpath(tree, '//h:*[@style]'): + for elem in base.xpath(tree, '//h:*[@style]'): self.style(elem)._apply_style_attr(url_replacer=item.abshref) num_pat = re.compile(r'[0-9.]+$') - for elem in xpath(tree, '//h:img[@width or @height]'): + for elem in base.xpath(tree, '//h:img[@width or @height]'): style = self.style(elem) # Check if either height or width is not default is_styled = style._style.get('width', 'auto') != 'auto' or \ @@ -370,7 +370,7 @@ class Stylizer(object): self.logger.warn('CSS import of missing file %r' % path) return (None, None) item = hrefs[path] - if item.media_type not in OEB_STYLES: + if item.media_type not in base.OEB_STYLES: self.logger.warn('CSS import of non-CSS file %r' % path) return (None, None) data = 
item.data.cssText diff --git a/ebook_converter/ebooks/oeb/transforms/cover.py b/ebook_converter/ebooks/oeb/transforms/cover.py index bbc23be..f03e52c 100644 --- a/ebook_converter/ebooks/oeb/transforms/cover.py +++ b/ebook_converter/ebooks/oeb/transforms/cover.py @@ -1,66 +1,61 @@ import textwrap import urllib.parse +from lxml import etree + from ebook_converter import guess_type from ebook_converter.utils.imghdr import identify -from ebook_converter.utils.xml_parse import safe_xml_fromstring from ebook_converter.polyglot.urllib import unquote -__license__ = 'GPL v3' -__copyright__ = '2010, Kovid Goyal ' -__docformat__ = 'restructuredtext en' - - class CoverManager(object): SVG_TEMPLATE = textwrap.dedent('''\ - - - - - Cover - - - -
- - - -
- - - ''') + + + + + Cover + + + +
+ + + +
+ +''') NONSVG_TEMPLATE = textwrap.dedent('''\ - - - - - Cover - - - -
- cover -
- - + + + + + Cover + + + +
+ cover +
+ + ''') def __init__(self, no_default_cover=False, no_svg_cover=False, - preserve_aspect_ratio=False, fixed_size=None): + preserve_aspect_ratio=False, fixed_size=None): self.no_default_cover = no_default_cover self.no_svg_cover = no_svg_cover self.preserve_aspect_ratio = preserve_aspect_ratio @@ -72,9 +67,9 @@ class CoverManager(object): style = 'style="height: 100%%"' else: width, height = fixed_size - style = 'style="height: %s; width: %s"'%(height, width) + style = 'style="height: %s; width: %s"' % (height, width) self.non_svg_template = self.NONSVG_TEMPLATE.replace('__style__', - style) + style) def __call__(self, oeb, opts, log): self.oeb = oeb @@ -108,22 +103,23 @@ class CoverManager(object): # if self.preserve_aspect_ratio: # width, height = 600, 800 self.svg_template = self.svg_template.replace('__viewbox__', - '0 0 %d %d'%(width, height)) + '0 0 %d %d' % + (width, height)) self.svg_template = self.svg_template.replace('__width__', - str(width)) + str(width)) self.svg_template = self.svg_template.replace('__height__', - str(height)) + str(height)) if href is not None: templ = self.non_svg_template if self.no_svg_cover \ else self.svg_template - tp = templ%unquote(href) + tp = templ % unquote(href) id, href = m.generate('titlepage', 'titlepage.xhtml') item = m.add(id, href, guess_type('t.xhtml')[0], - data=safe_xml_fromstring(tp)) + data=etree.fromstring(tp)) else: - item = self.oeb.manifest.hrefs[ - urllib.parse.urldefrag(self.oeb.guide['titlepage'].href)[0]] + key = urllib.parse.urldefrag(self.oeb.guide['titlepage'].href)[0] + item = self.oeb.manifest.hrefs[key] if item is not None: self.oeb.spine.insert(0, item, True) if 'cover' not in self.oeb.guide.refs: diff --git a/ebook_converter/ebooks/oeb/transforms/flatcss.py b/ebook_converter/ebooks/oeb/transforms/flatcss.py index 3696e0a..6d29be3 100644 --- a/ebook_converter/ebooks/oeb/transforms/flatcss.py +++ b/ebook_converter/ebooks/oeb/transforms/flatcss.py @@ -1,26 +1,27 @@ """ CSS flattening transform. 
""" -import re, operator, math, numbers -from collections import defaultdict -from xml.dom import SyntaxErr +import collections +import math +import numbers +import operator +import re +from xml import dom from lxml import etree import css_parser -from css_parser.css import Property +from css_parser import css as cp_css +from ebook_converter import constants as const from ebook_converter import guess_type from ebook_converter.ebooks import unit_convert -from ebook_converter.ebooks.oeb.base import (XHTML, XHTML_NS, CSS_MIME, OEB_STYLES, - namespace, barename, XPath, css_text) +from ebook_converter.ebooks.oeb import base +from ebook_converter.ebooks.oeb import parse_utils + from ebook_converter.ebooks.oeb.stylizer import Stylizer from ebook_converter.utils.filenames import ascii_filename, ascii_text -from ebook_converter.utils.icu import numeric_sort_key -__license__ = 'GPL v3' -__copyright__ = '2008, Marshall T. Vandegrift ' - COLLAPSE = re.compile(r'[ \t\r\n\v]+') STRIPNUM = re.compile(r'[-0-9]+$') @@ -121,7 +122,7 @@ class EmbedFontsCSSRules(object): return None if not self.href: iid, href = oeb.manifest.generate('page_styles', 'page_styles.css') - rules = [css_text(x) for x in self.rules] + rules = [base.css_text(x) for x in self.rules] rules = '\n\n'.join(rules) sheet = css_parser.parseString(rules, validate=False) self.href = oeb.manifest.add(iid, href, guess_type(href)[0], @@ -186,7 +187,7 @@ class CSSFlattener(object): for item in oeb.manifest.values(): # Make all links to resources absolute, as these sheets will be # consolidated into a single stylesheet at the root of the document - if item.media_type in OEB_STYLES: + if item.media_type in base.OEB_STYLES: css_parser.replaceUrls(item.data, item.abshref, ignoreImportRules=True) @@ -273,7 +274,7 @@ class CSSFlattener(object): css = '' for item in self.items: html = item.data - body = html.find(XHTML('body')) + body = html.find(base.tag('xhtml', 'body')) if 'style' in html.attrib: b = body.attrib.get('style', 
'') body.set('style', html.get('style') + ';' + b) @@ -310,11 +311,11 @@ class CSSFlattener(object): sizes[csize] += len(COLLAPSE.sub(' ', child.tail)) def baseline_spine(self): - sizes = defaultdict(float) + sizes = collections.defaultdict(float) for item in self.items: html = item.data stylizer = self.stylizers[item] - body = html.find(XHTML('body')) + body = html.find(base.tag('xhtml', 'body')) fsize = self.context.source.fbase self.baseline_node(body, stylizer, sizes, fsize) try: @@ -351,9 +352,9 @@ class CSSFlattener(object): def flatten_node(self, node, stylizer, names, styles, pseudo_styles, psize, item_id, recurse=True): if not isinstance(node.tag, (str, bytes)) \ - or namespace(node.tag) != XHTML_NS: + or parse_utils.namespace(node.tag) != const.XHTML_NS: return - tag = barename(node.tag) + tag = parse_utils.barename(node.tag) style = stylizer.style(node) cssdict = style.cssdict() try: @@ -375,7 +376,7 @@ class CSSFlattener(object): if 'margin-left' not in cssdict and 'margin-right' not in cssdict: cssdict['margin-left'] = cssdict['margin-right'] = 'auto' else: - for table in node.iterchildren(XHTML("table")): + for table in node.iterchildren(base.tag('xhtml', "table")): ts = stylizer.style(table) if ts.get('margin-left') is None and ts.get('margin-right') is None: ts.set('margin-left', 'auto') @@ -391,11 +392,12 @@ class CSSFlattener(object): if cssdict.get('vertical-align') == 'inherit': cssdict['vertical-align'] = node.attrib['valign'] del node.attrib['valign'] - if node.tag == XHTML('font'): + if node.tag == base.tag('xhtml', 'font'): tags = ['descendant::h:%s'%x for x in ('p', 'div', 'table', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ol', 'ul', 'dl', 'blockquote')] - tag = 'div' if XPath('|'.join(tags))(node) else 'span' - node.tag = XHTML(tag) + # TODO(gryf): this will override tag from line 355. On purpose? 
+ tag = 'div' if base.XPath('|'.join(tags))(node) else 'span' + node.tag = base.tag('xhtml', tag) if 'size' in node.attrib: def force_int(raw): return int(re.search(r'([0-9+-]+)', raw).group(1)) @@ -425,14 +427,14 @@ class CSSFlattener(object): del node.attrib['face'] if 'color' in node.attrib: try: - cssdict['color'] = Property('color', node.attrib['color']).value - except (ValueError, SyntaxErr): + cssdict['color'] = cp_css.Property('color', node.attrib['color']).value + except (ValueError, dom.SyntaxErr): pass del node.attrib['color'] if 'bgcolor' in node.attrib: try: - cssdict['background-color'] = Property('background-color', node.attrib['bgcolor']).value - except (ValueError, SyntaxErr): + cssdict['background-color'] = cp_css.Property('background-color', node.attrib['bgcolor']).value + except (ValueError, dom.SyntaxErr): pass del node.attrib['bgcolor'] if tag == 'ol' and 'type' in node.attrib: @@ -573,7 +575,7 @@ class CSSFlattener(object): def flatten_head(self, item, href, global_href): html = item.data - head = html.find(XHTML('head')) + head = html.find(base.tag('xhtml', 'head')) def safe_lower(x): try: @@ -583,39 +585,39 @@ class CSSFlattener(object): return x for node in html.xpath('//*[local-name()="style" or local-name()="link"]'): - if node.tag == XHTML('link') \ + if node.tag == base.tag('xhtml', 'link') \ and safe_lower(node.get('rel', 'stylesheet')) == 'stylesheet' \ - and safe_lower(node.get('type', CSS_MIME)) in OEB_STYLES: + and safe_lower(node.get('type', base.CSS_MIME)) in base.OEB_STYLES: node.getparent().remove(node) - elif node.tag == XHTML('style') \ - and node.get('type', CSS_MIME) in OEB_STYLES: + elif node.tag == base.tag('xhtml', 'style') \ + and node.get('type', base.CSS_MIME) in base.OEB_STYLES: node.getparent().remove(node) href = item.relhref(href) - l = etree.SubElement(head, XHTML('link'), - rel='stylesheet', type=CSS_MIME, href=href) + l = etree.SubElement(head, base.tag('xhtml', 'link'), + rel='stylesheet', type=base.CSS_MIME, 
href=href) l.tail='\n' if global_href: href = item.relhref(global_href) - l = etree.SubElement(head, XHTML('link'), - rel='stylesheet', type=CSS_MIME, href=href) + l = etree.SubElement(head, base.tag('xhtml', 'link'), + rel='stylesheet', type=base.CSS_MIME, href=href) l.tail = '\n' def replace_css(self, css): manifest = self.oeb.manifest for item in manifest.values(): - if item.media_type in OEB_STYLES: + if item.media_type in base.OEB_STYLES: manifest.remove(item) id, href = manifest.generate('css', 'stylesheet.css') sheet = css_parser.parseString(css, validate=False) if self.transform_css_rules: from ebook_converter.ebooks.css_transform_rules import transform_sheet transform_sheet(self.transform_css_rules, sheet) - item = manifest.add(id, href, CSS_MIME, data=sheet) + item = manifest.add(id, href, base.CSS_MIME, data=sheet) self.oeb.manifest.main_stylesheet = item return href def collect_global_css(self): - global_css = defaultdict(list) + global_css = collections.defaultdict(list) for item in self.items: stylizer = self.stylizers[item] if float(self.context.margin_top) >= 0: @@ -627,7 +629,7 @@ class CSSFlattener(object): items = sorted(stylizer.page_rule.items()) css = ';\n'.join("%s: %s" % (key, val) for key, val in items) css = ('@page {\n%s\n}\n'%css) if items else '' - rules = [css_text(r) for r in stylizer.font_face_rules + self.embed_font_rules] + rules = [base.css_text(r) for r in stylizer.font_face_rules + self.embed_font_rules] raw = '\n\n'.join(rules) css += '\n\n' + raw global_css[css].append(item) @@ -642,7 +644,7 @@ class CSSFlattener(object): if self.transform_css_rules: from ebook_converter.ebooks.css_transform_rules import transform_sheet transform_sheet(self.transform_css_rules, sheet) - manifest.add(id_, href, CSS_MIME, data=sheet) + manifest.add(id_, href, base.CSS_MIME, data=sheet) gc_map[css] = href ans = {} @@ -652,8 +654,8 @@ class CSSFlattener(object): return ans def flatten_spine(self): - names = defaultdict(int) - styles, pseudo_styles 
= {}, defaultdict(dict) + names = collections.defaultdict(int) + styles, pseudo_styles = {}, collections.defaultdict(dict) for item in self.items: html = item.data stylizer = self.stylizers[item] @@ -661,7 +663,7 @@ class CSSFlattener(object): self.specializer(item, stylizer) fsize = self.context.dest.fbase self.flatten_node(html, stylizer, names, styles, pseudo_styles, fsize, item.id, recurse=False) - self.flatten_node(html.find(XHTML('body')), stylizer, names, styles, pseudo_styles, fsize, item.id) + self.flatten_node(html.find(base.tag('xhtml', 'body')), stylizer, names, styles, pseudo_styles, fsize, item.id) items = sorted(((key, val) for (val, key) in styles.items())) # :hover must come after link and :active must come after :hover psels = sorted(pseudo_styles, key=lambda x : diff --git a/ebook_converter/ebooks/oeb/transforms/htmltoc.py b/ebook_converter/ebooks/oeb/transforms/htmltoc.py index 8c555f6..1f65c56 100644 --- a/ebook_converter/ebooks/oeb/transforms/htmltoc.py +++ b/ebook_converter/ebooks/oeb/transforms/htmltoc.py @@ -1,46 +1,20 @@ """ HTML-TOC-adding transform. """ -from ebook_converter.ebooks.oeb.base import XML, XHTML, XHTML_NS -from ebook_converter.ebooks.oeb.base import XHTML_MIME, CSS_MIME -from ebook_converter.ebooks.oeb.base import element, XPath +from ebook_converter import constants as const +from ebook_converter.ebooks.oeb import base -__all__ = ['HTMLTOCAdder'] -__license__ = 'GPL v3' -__copyright__ = '2008, Marshall T. 
Vandegrift ' - DEFAULT_TITLE = 'Table of Contents' +STYLE_CSS = {'nested': '.calibre_toc_header {\n text-align: center;\n}\n' + '.calibre_toc_block {\n margin-left: 1.2em;\n text-indent: ' + '-1.2em;\n}\n.calibre_toc_block .calibre_toc_block {\n ' + 'margin-left: 2.4em;\n}\n.calibre_toc_block .calibre_toc_block ' + '.calibre_toc_block {\n margin-left: 3.6em;\n}\n', -STYLE_CSS = { - 'nested': """ -.calibre_toc_header { - text-align: center; -} -.calibre_toc_block { - margin-left: 1.2em; - text-indent: -1.2em; -} -.calibre_toc_block .calibre_toc_block { - margin-left: 2.4em; -} -.calibre_toc_block .calibre_toc_block .calibre_toc_block { - margin-left: 3.6em; -} -""", - - 'centered': """ -.calibre_toc_header { - text-align: center; -} -.calibre_toc_block { - text-align: center; -} -body > .calibre_toc_block { - margin-top: 1.2em; -} -""" - } + 'centered': '.calibre_toc_header {\n text-align: center;\n}\n' + '.calibre_toc_block {\n text-align: center;\n}\nbody > ' + '.calibre_toc_block {\n margin-top: 1.2em;\n}\n'} class HTMLTOCAdder(object): @@ -71,7 +45,7 @@ class HTMLTOCAdder(object): if href in oeb.manifest.hrefs: item = oeb.manifest.hrefs[href] if (hasattr(item.data, 'xpath') and - XPath('//h:a[@href]')(item.data)): + base.XPath('//h:a[@href]')(item.data)): if oeb.spine.index(item) < 0: if self.position == 'end': oeb.spine.add(item, linear=False) @@ -91,23 +65,24 @@ class HTMLTOCAdder(object): oeb.logger.error('Unknown TOC style %r' % style) style = 'nested' id, css_href = oeb.manifest.generate('tocstyle', 'tocstyle.css') - oeb.manifest.add(id, css_href, CSS_MIME, data=STYLE_CSS[style]) + oeb.manifest.add(id, css_href, base.CSS_MIME, data=STYLE_CSS[style]) language = str(oeb.metadata.language[0]) - contents = element(None, XHTML('html'), nsmap={None: XHTML_NS}, - attrib={XML('lang'): language}) - head = element(contents, XHTML('head')) - htitle = element(head, XHTML('title')) + contents = base.element(None, base.tag('xhtml', 'html'), + nsmap={None: 
const.XHTML_NS}, + attrib={base.tag('xml', 'lang'): language}) + head = base.element(contents, base.tag('xhtml', 'head')) + htitle = base.element(head, base.tag('xhtml', 'title')) htitle.text = title - element(head, XHTML('link'), rel='stylesheet', type=CSS_MIME, - href=css_href) - body = element(contents, XHTML('body'), - attrib={'class': 'calibre_toc'}) - h1 = element(body, XHTML('h2'), - attrib={'class': 'calibre_toc_header'}) + base.element(head, base.tag('xhtml', 'link'), rel='stylesheet', + type=base.CSS_MIME, href=css_href) + body = base.element(contents, base.tag('xhtml', 'body'), + attrib={'class': 'calibre_toc'}) + h1 = base.element(body, base.tag('xhtml', 'h2'), + attrib={'class': 'calibre_toc_header'}) h1.text = title self.add_toc_level(body, oeb.toc) id, href = oeb.manifest.generate('contents', 'contents.xhtml') - item = oeb.manifest.add(id, href, XHTML_MIME, data=contents) + item = oeb.manifest.add(id, href, base.XHTML_MIME, data=contents) if self.position == 'end': oeb.spine.add(item, linear=False) else: @@ -116,10 +91,10 @@ class HTMLTOCAdder(object): def add_toc_level(self, elem, toc): for node in toc: - block = element(elem, XHTML('div'), - attrib={'class': 'calibre_toc_block'}) - line = element(block, XHTML('a'), - attrib={'href': node.href, - 'class': 'calibre_toc_line'}) + block = base.element(elem, base.tag('xhtml', 'div'), + attrib={'class': 'calibre_toc_block'}) + line = base.element(block, base.tag('xhtml', 'a'), + attrib={'href': node.href, + 'class': 'calibre_toc_line'}) line.text = node.title self.add_toc_level(block, node) diff --git a/ebook_converter/ebooks/oeb/transforms/jacket.py b/ebook_converter/ebooks/oeb/transforms/jacket.py index 0d1a88b..61a0dbf 100644 --- a/ebook_converter/ebooks/oeb/transforms/jacket.py +++ b/ebook_converter/ebooks/oeb/transforms/jacket.py @@ -4,9 +4,10 @@ from string import Formatter import pkg_resources import urllib.parse +from ebook_converter import constants as const from ebook_converter import 
guess_type, strftime from ebook_converter.constants_old import iswindows -from ebook_converter.ebooks.oeb.base import XPath, XHTML_NS, XHTML, xml2text, urlnormalize +from ebook_converter.ebooks.oeb.base import XPath, xml2text, urlnormalize from ebook_converter.library.comments import comments_to_html, markdown from ebook_converter.utils.date import is_date_undefined, as_local_time from ebook_converter.ebooks.chardet import strip_encoding_declarations @@ -303,7 +304,7 @@ def render_jacket(mi, output_profile, 'tags_label': 'Tags', 'title': title, 'title_str': title_str, - 'xmlns': XHTML_NS} + 'xmlns': const.XHTML_NS} for key in mi.custom_field_keys(): m = mi.get_user_metadata(key, False) or {} @@ -370,7 +371,7 @@ def render_jacket(mi, output_profile, # We cannot use data-calibre-rescale 100 on the body tag as that will just # give the body tag a font size of 1em, which is useless. for body in root.xpath('//*[local-name()="body"]'): - fw = body.makeelement(XHTML('div')) + fw = body.makeelement(const.XHTML_DIV) fw.set('data-calibre-rescale', '100') for child in body: fw.append(child) @@ -387,9 +388,9 @@ def linearize_jacket(oeb): for x in oeb.spine[:4]: if XPath(JACKET_XPATH)(x.data): for e in XPath('//h:table|//h:tr|//h:th')(x.data): - e.tag = XHTML('div') + e.tag = const.XHTML_DIV for e in XPath('//h:td')(x.data): - e.tag = XHTML('span') + e.tag = const.XHTML_SPAN break diff --git a/ebook_converter/ebooks/oeb/transforms/manglecase.py b/ebook_converter/ebooks/oeb/transforms/manglecase.py index 67b6493..2edf957 100644 --- a/ebook_converter/ebooks/oeb/transforms/manglecase.py +++ b/ebook_converter/ebooks/oeb/transforms/manglecase.py @@ -5,9 +5,9 @@ import string from lxml import etree -from ebook_converter.ebooks.oeb.base import XHTML, XHTML_NS -from ebook_converter.ebooks.oeb.base import CSS_MIME -from ebook_converter.ebooks.oeb.base import namespace +from ebook_converter import constants as const +from ebook_converter.ebooks.oeb import base +from 
ebook_converter.ebooks.oeb import parse_utils from ebook_converter.ebooks.oeb.stylizer import Stylizer @@ -43,15 +43,16 @@ class CaseMangler(object): def mangle_spine(self): id, href = self.oeb.manifest.generate('manglecase', 'manglecase.css') - self.oeb.manifest.add(id, href, CSS_MIME, data=CASE_MANGLER_CSS) + self.oeb.manifest.add(id, href, base.CSS_MIME, data=CASE_MANGLER_CSS) for item in self.oeb.spine: html = item.data relhref = item.relhref(href) - etree.SubElement(html.find(XHTML('head')), XHTML('link'), - rel='stylesheet', href=relhref, type=CSS_MIME) + etree.SubElement(html.find(base.tag('xhtml', 'head')), + base.tag('xhtml', 'link'), rel='stylesheet', + href=relhref, type=base.CSS_MIME) stylizer = Stylizer(html, item.href, self.oeb, self.opts, self.profile) - self.mangle_elem(html.find(XHTML('body')), stylizer) + self.mangle_elem(html.find(base.tag('xhtml', 'body')), stylizer) def text_transform(self, transform, text): if transform == 'capitalize': @@ -85,7 +86,8 @@ class CaseMangler(object): else: last.tail = text else: - child = elem.makeelement(XHTML('span'), attrib=attrib) + child = elem.makeelement(base.tag('xhtml', 'span'), + attrib=attrib) child.text = text.upper() if last is None: elem.insert(0, child) @@ -99,7 +101,7 @@ class CaseMangler(object): def mangle_elem(self, elem, stylizer): if not isinstance(elem.tag, (str, bytes)) or \ - namespace(elem.tag) != XHTML_NS: + parse_utils.namespace(elem.tag) != const.XHTML_NS: return children = list(elem) style = stylizer.style(elem) diff --git a/ebook_converter/ebooks/oeb/transforms/metadata.py b/ebook_converter/ebooks/oeb/transforms/metadata.py index c08a840..be550b6 100644 --- a/ebook_converter/ebooks/oeb/transforms/metadata.py +++ b/ebook_converter/ebooks/oeb/transforms/metadata.py @@ -1,15 +1,12 @@ -import os, re +import os +import re + +from ebook_converter.ebooks.oeb import base from ebook_converter.utils.date import isoformat, now from ebook_converter import guess_type -__license__ = 'GPL v3' 
-__copyright__ = '2009, Kovid Goyal ' -__docformat__ = 'restructuredtext en' - - def meta_info_to_oeb_metadata(mi, m, log, override_input_metadata=False): - from ebook_converter.ebooks.oeb.base import OPF if not mi.is_null('title'): m.clear('title') m.add('title', mi.title) @@ -19,17 +16,17 @@ def meta_info_to_oeb_metadata(mi, m, log, override_input_metadata=False): m.clear('title_sort') m.add('title_sort', mi.title_sort) if not mi.is_null('authors'): - m.filter('creator', lambda x : x.role.lower() in ['aut', '']) + m.filter('creator', lambda x: x.role.lower() in ['aut', '']) for a in mi.authors: - attrib = {'role':'aut'} + attrib = {'role': 'aut'} if mi.author_sort: - attrib[OPF('file-as')] = mi.author_sort + attrib[base.tag('opf', 'file-as')] = mi.author_sort m.add('creator', a, attrib=attrib) if not mi.is_null('book_producer'): - m.filter('contributor', lambda x : x.role.lower() == 'bkp') + m.filter('contributor', lambda x: x.role.lower() == 'bkp') m.add('contributor', mi.book_producer, role='bkp') elif override_input_metadata: - m.filter('contributor', lambda x : x.role.lower() == 'bkp') + m.filter('contributor', lambda x: x.role.lower() == 'bkp') if not mi.is_null('comments'): m.clear('description') m.add('description', mi.comments) @@ -71,7 +68,7 @@ def meta_info_to_oeb_metadata(mi, m, log, override_input_metadata=False): m.clear('series_index') if not mi.is_null('rating'): m.clear('rating') - m.add('rating', '%.2f'%mi.rating) + m.add('rating', '%.2f' % mi.rating) elif override_input_metadata: m.clear('rating') if not mi.is_null('tags'): @@ -101,23 +98,25 @@ class MergeMetadata(object): 'Merge in user metadata, including cover' def __call__(self, oeb, mi, opts, override_input_metadata=False): + _oim = override_input_metadata self.oeb, self.log = oeb, oeb.log m = self.oeb.metadata self.log('Merging user specified metadata...') meta_info_to_oeb_metadata(mi, m, oeb.log, - override_input_metadata=override_input_metadata) + override_input_metadata=_oim) cover_id = 
self.set_cover(mi, opts.prefer_metadata_cover) m.clear('cover') if cover_id is not None: m.add('cover', cover_id) if mi.uuid is not None: - m.filter('identifier', lambda x:x.id=='uuid_id') + m.filter('identifier', lambda x: x.id == 'uuid_id') self.oeb.metadata.add('identifier', mi.uuid, id='uuid_id', scheme='uuid') self.oeb.uid = self.oeb.metadata.identifier[-1] if mi.application_id is not None: - m.filter('identifier', lambda x:x.scheme=='calibre') - self.oeb.metadata.add('identifier', mi.application_id, scheme='calibre') + m.filter('identifier', lambda x: x.scheme == 'calibre') + self.oeb.metadata.add('identifier', mi.application_id, + scheme='calibre') def set_cover(self, mi, prefer_metadata_cover): cdata, ext = b'', 'jpg' @@ -138,7 +137,8 @@ class MergeMetadata(object): if cdata: self.oeb.guide.remove('cover') self.oeb.guide.remove('titlepage') - elif self.oeb.plumber_output_format in {'mobi', 'azw3'} and old_cover is not None: + elif (self.oeb.plumber_output_format in {'mobi', 'azw3'} and + old_cover is not None): # The amazon formats dont support html cover pages, so remove them # even if no cover was specified. 
self.oeb.guide.remove('titlepage') @@ -156,7 +156,9 @@ class MergeMetadata(object): new_cover_item = None if cdata: id, href = self.oeb.manifest.generate('cover', 'cover.'+ext) - new_cover_item = self.oeb.manifest.add(id, href, guess_type('cover.'+ext)[0], data=cdata) + new_cover_item = self.oeb.manifest.add(id, href, + guess_type('cover.'+ext)[0], + data=cdata) self.oeb.guide.add('cover', 'Cover', href) if do_remove_old_cover: self.remove_old_cover(item, new_cover_item.href) @@ -186,7 +188,8 @@ class MergeMetadata(object): if href == cover_item.href: if new_cover_href is not None: replacement_href = item.relhref(new_cover_href) - attr = 'src' if img.tag.endswith('img') else XLINK('href') + attr = ('src' if img.tag.endswith('img') + else XLINK('href')) img.set(attr, replacement_href) else: p = img.getparent() @@ -202,13 +205,14 @@ class MergeMetadata(object): for item in affected_items: body = XPath('//h:body')(item.data) if body: - text = etree.tostring(body[0], method='text', encoding='unicode') + text = etree.tostring(body[0], method='text', + encoding='unicode') else: text = '' text = re.sub(r'\s+', '', text) if not text and not XPath('//h:img|//svg:svg')(item.data): - self.log('Removing %s as it is a wrapper around' - ' the cover image'%item.href) + self.log('Removing %s as it is a wrapper around the cover ' + 'image' % item.href) self.oeb.spine.remove(item) self.oeb.manifest.remove(item) self.oeb.guide.remove_by_href(item.href) diff --git a/ebook_converter/ebooks/oeb/transforms/page_margin.py b/ebook_converter/ebooks/oeb/transforms/page_margin.py index 8c4e232..bfd81c3 100644 --- a/ebook_converter/ebooks/oeb/transforms/page_margin.py +++ b/ebook_converter/ebooks/oeb/transforms/page_margin.py @@ -1,7 +1,8 @@ import numbers from collections import Counter -from ebook_converter.ebooks.oeb.base import barename, XPath +from ebook_converter.ebooks.oeb import parse_utils +from ebook_converter.ebooks.oeb.base import XPath __license__ = 'GPL v3' @@ -142,7 +143,7 @@ 
class RemoveFakeMargins(object): for p in paras(body): level = level_of(p, body) - level = '%s_%d'%(barename(p.tag), level) + level = '%s_%d' % (parse_utils.barename(p.tag), level) if level not in self.levels: self.levels[level] = [] self.levels[level].append(p) diff --git a/ebook_converter/ebooks/oeb/transforms/rasterize.py b/ebook_converter/ebooks/oeb/transforms/rasterize.py index 074a255..df90c2d 100644 --- a/ebook_converter/ebooks/oeb/transforms/rasterize.py +++ b/ebook_converter/ebooks/oeb/transforms/rasterize.py @@ -5,10 +5,8 @@ import os import re import urllib.parse -# from PyQt5.Qt import ( - # Qt, QByteArray, QBuffer, QIODevice, QColor, QImage, QPainter, QSvgRenderer) - -from ebook_converter.ebooks.oeb.base import XHTML, XLINK +from ebook_converter import constants as const +from ebook_converter.ebooks.oeb import base from ebook_converter.ebooks.oeb.base import SVG_MIME, PNG_MIME from ebook_converter.ebooks.oeb.base import xml2str, xpath from ebook_converter.ebooks.oeb.base import urlnormalize @@ -17,10 +15,7 @@ from ebook_converter.ptempfile import PersistentTemporaryFile from ebook_converter.utils.imghdr import what -__license__ = 'GPL v3' -__copyright__ = '2008, Marshall T. 
Vandegrift ' - -IMAGE_TAGS = {XHTML('img'), XHTML('object')} +IMAGE_TAGS = {base.tag('xhtml', 'img'), base.tag('xhtml', 'object')} KEEP_ATTRS = {'class', 'style', 'width', 'height', 'align'} @@ -113,7 +108,7 @@ class SVGRasterizer(object): svg = item.data hrefs = self.oeb.manifest.hrefs for elem in xpath(svg, '//svg:*[@xl:href]'): - href = urlnormalize(elem.attrib[XLINK('href')]) + href = urlnormalize(elem.attrib[base.tag('xlink', 'href')]) path = urllib.parse.urldefrag(href)[0] if not path: continue @@ -126,7 +121,7 @@ class SVGRasterizer(object): with PersistentTemporaryFile(suffix='.'+ext) as pt: pt.write(data) self.temp_files.append(pt.name) - elem.attrib[XLINK('href')] = pt.name + elem.attrib[base.tag('xlink', 'href')] = pt.name return svg def stylizer(self, item): @@ -171,7 +166,7 @@ class SVGRasterizer(object): href = os.path.splitext(item.href)[0] + '.png' id, href = manifest.generate(item.id, href) manifest.add(id, href, PNG_MIME, data=data) - img = elem.makeelement(XHTML('img'), src=item.relhref(href)) + img = elem.makeelement(base.tag('xhtml', 'img'), src=item.relhref(href)) elem.getparent().replace(elem, img) for prop in ('width', 'height'): if prop in elem.attrib: @@ -208,7 +203,7 @@ class SVGRasterizer(object): id, href = manifest.generate(svgitem.id, href) manifest.add(id, href, PNG_MIME, data=data) self.images[key] = href - elem.tag = XHTML('img') + elem.tag = base.tag('xhtml', 'img') for attr in elem.attrib: if attr not in KEEP_ATTRS: del elem.attrib[attr] diff --git a/ebook_converter/ebooks/oeb/transforms/split.py b/ebook_converter/ebooks/oeb/transforms/split.py index 6cce60b..b20f32b 100644 --- a/ebook_converter/ebooks/oeb/transforms/split.py +++ b/ebook_converter/ebooks/oeb/transforms/split.py @@ -10,10 +10,11 @@ import urllib.parse from lxml.etree import XPath as _XPath from lxml import etree +from ebook_converter import constants as const from ebook_converter import as_unicode, force_unicode from ebook_converter.ebooks.epub import rules -from 
ebook_converter.ebooks.oeb.base import (OEB_STYLES, XPNSMAP as NAMESPACES, - rewrite_links, XHTML, urlnormalize) +from ebook_converter.ebooks.oeb.base import \ + OEB_STYLES, rewrite_links, urlnormalize from ebook_converter.ebooks.oeb.polish.split import do_split from ebook_converter.polyglot.urllib import unquote from ebook_converter.css_selectors import Select, SelectorError @@ -22,7 +23,7 @@ from ebook_converter.css_selectors import Select, SelectorError __license__ = 'GPL v3' __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' __docformat__ = 'restructuredtext en' -XPath = functools.partial(_XPath, namespaces=NAMESPACES) +XPath = functools.partial(_XPath, namespaces=const.XPNSMAP) SPLIT_POINT_ATTR = 'csp' @@ -104,7 +105,7 @@ class Split(object): select = Select(item.data) if not self.page_break_selectors: return [], [] - body = item.data.xpath('//h:body', namespaces=NAMESPACES) + body = item.data.xpath('//h:body', namespaces=const.XPNSMAP) if not body: return [], [] descendants = frozenset(body[0].iterdescendants('*')) @@ -268,13 +269,13 @@ class FlowSplitter(object): if body is not None: existing_ids = frozenset(body.xpath('//*/@id')) for x in ids - existing_ids: - body.insert(0, body.makeelement(XHTML('div'), id=x, style='height:0pt')) + body.insert(0, body.makeelement(const.XHTML_div, id=x, style='height:0pt')) ids = set() trees.append(tree) self.trees = trees def get_body(self, root): - body = root.xpath('//h:body', namespaces=NAMESPACES) + body = root.xpath('//h:body', namespaces=const.XPNSMAP) if not body: return None return body[0] @@ -296,7 +297,7 @@ class FlowSplitter(object): etree.tostring(body, method='text', encoding='unicode')) if len(txt) > 1: return False - for img in root.xpath('//h:img', namespaces=NAMESPACES): + for img in root.xpath('//h:img', namespaces=const.XPNSMAP): if img.get('style', '') != 'display:none': return False if root.xpath('//*[local-name() = "svg"]'): @@ -401,7 +402,7 @@ class FlowSplitter(object): '//h:br', '//h:li', ): 
- elems = root.xpath(path, namespaces=NAMESPACES) + elems = root.xpath(path, namespaces=const.XPNSMAP) elem = pick_elem(elems) if elem is not None: try: @@ -436,7 +437,7 @@ class FlowSplitter(object): spine_pos = self.item.spine_position for current, tree in zip(*map(reversed, (self.files, self.trees))): - for a in tree.getroot().xpath('//h:a[@href]', namespaces=NAMESPACES): + for a in tree.getroot().xpath('//h:a[@href]', namespaces=const.XPNSMAP): href = a.get('href').strip() if href.startswith('#'): anchor = href[1:] diff --git a/ebook_converter/ebooks/oeb/transforms/structure.py b/ebook_converter/ebooks/oeb/transforms/structure.py index d3049f2..9b042fd 100644 --- a/ebook_converter/ebooks/oeb/transforms/structure.py +++ b/ebook_converter/ebooks/oeb/transforms/structure.py @@ -1,22 +1,19 @@ +import collections import re -import uuid import urllib.parse +import uuid from lxml import etree -from collections import OrderedDict, Counter -from ebook_converter.ebooks.oeb.base import XPNSMAP, TOC, XHTML, xml2text, barename +from ebook_converter import constants as const +from ebook_converter.ebooks.oeb import parse_utils +from ebook_converter.ebooks.oeb.base import TOC, xml2text from ebook_converter.ebooks import ConversionError -__license__ = 'GPL v3' -__copyright__ = '2009, Kovid Goyal ' -__docformat__ = 'restructuredtext en' - - def XPath(x): try: - return etree.XPath(x, namespaces=XPNSMAP) + return etree.XPath(x, namespaces=const.XPNSMAP) except etree.XPathSyntaxError: raise ConversionError( 'The syntax of the XPath expression %s is invalid.' 
% repr(x)) @@ -84,7 +81,7 @@ class DetectStructure(object): try: prev = next(elem.itersiblings(tag=etree.Element, preceding=True)) - if (barename(elem.tag) in {'h1', 'h2'} and barename( + if (parse_utils.barename(elem.tag) in {'h1', 'h2'} and parse_utils.barename( prev.tag) in {'h1', 'h2'} and (not prev.tail or not prev.tail.split())): # We have two adjacent headings, do not put a page @@ -165,7 +162,7 @@ class DetectStructure(object): chapter_mark = self.opts.chapter_mark page_break_before = 'display: block; page-break-before: always' page_break_after = 'display: block; page-break-after: always' - c = Counter() + c = collections.Counter() for item, elem in self.detected_chapters: c[item] += 1 text = xml2text(elem).strip() @@ -174,7 +171,7 @@ class DetectStructure(object): if chapter_mark == 'none': continue if chapter_mark == 'rule': - mark = elem.makeelement(XHTML('hr')) + mark = elem.makeelement(const.XHTML_HR) elif chapter_mark == 'pagebreak': if c[item] < 3 and at_start(elem): # For the first two elements in this item, check if they @@ -184,9 +181,9 @@ class DetectStructure(object): # feedbooks epubs match both a heading tag and its # containing div with the default chapter expression. 
continue - mark = elem.makeelement(XHTML('div'), style=page_break_after) + mark = elem.makeelement(const.XHTML_DIV, style=page_break_after) else: # chapter_mark == 'both': - mark = elem.makeelement(XHTML('hr'), style=page_break_before) + mark = elem.makeelement(const.XHTML_HR, style=page_break_before) try: elem.addprevious(mark) except TypeError: @@ -254,8 +251,8 @@ class DetectStructure(object): return text, href def add_leveled_toc_items(self): - added = OrderedDict() - added2 = OrderedDict() + added = collections.OrderedDict() + added2 = collections.OrderedDict() counter = 1 def find_matches(expr, doc): diff --git a/ebook_converter/ebooks/pdf/pdftohtml.py b/ebook_converter/ebooks/pdf/pdftohtml.py index d10cd82..554dc37 100644 --- a/ebook_converter/ebooks/pdf/pdftohtml.py +++ b/ebook_converter/ebooks/pdf/pdftohtml.py @@ -5,10 +5,10 @@ import shutil import subprocess import sys +from lxml import etree + from ebook_converter import CurrentDir, xml_replace_entities, prints -from ebook_converter.constants_old import ( - filesystem_encoding, isbsd, islinux, isosx, iswindows -) +from ebook_converter.constants_old import isbsd, islinux, isosx, iswindows from ebook_converter.ebooks import ConversionError, DRMError from ebook_converter.ebooks.chardet import xml_to_unicode from ebook_converter.ptempfile import PersistentTemporaryFile @@ -26,10 +26,13 @@ def popen(cmd, **kw): if isosx and hasattr(sys, 'frameworks_dir'): - base = os.path.join(os.path.dirname(sys.frameworks_dir), 'utils.app', 'Contents', 'MacOS') + base = os.path.join(os.path.dirname(sys.frameworks_dir), 'utils.app', + 'Contents', 'MacOS') PDFTOHTML = os.path.join(base, PDFTOHTML) if iswindows and hasattr(sys, 'frozen'): - base = sys.extensions_location if hasattr(sys, 'new_app_layout') else os.path.dirname(sys.executable) + base = os.path.dirname(sys.executable) + if hasattr(sys, 'new_app_layout'): + base = sys.extensions_location PDFTOHTML = os.path.join(base, 'pdftohtml.exe') if (islinux or isbsd) and 
getattr(sys, 'frozen', False): PDFTOHTML = os.path.join(sys.executables_location, 'bin', 'pdftohtml') @@ -55,7 +58,7 @@ def pdftohtml(output_dir, pdf_path, no_images, as_xml=False): exe = PDFTOHTML cmd = [exe, '-enc', 'UTF-8', '-noframes', '-p', '-nomerge', - '-nodrm', a(pdfsrc), a(index)] + '-nodrm', a(pdfsrc), a(index)] if isbsd: cmd.remove('-nodrm') @@ -67,7 +70,7 @@ def pdftohtml(output_dir, pdf_path, no_images, as_xml=False): logf = PersistentTemporaryFile('pdftohtml_log') try: p = popen(cmd, stderr=logf._fd, stdout=logf._fd, - stdin=subprocess.PIPE) + stdin=subprocess.PIPE) except OSError as err: if err.errno == errno.ENOENT: raise ConversionError('Could not find pdftohtml, check it is ' @@ -79,7 +82,8 @@ def pdftohtml(output_dir, pdf_path, no_images, as_xml=False): logf.close() out = open(logf.name, 'rb').read().decode('utf-8', 'replace').strip() if ret != 0: - raise ConversionError('pdftohtml failed with return code: %d\n%s' % (ret, out)) + raise ConversionError('pdftohtml failed with return code: ' + '%d\n%s' % (ret, out)) if out: prints("pdftohtml log:") prints(out) @@ -90,22 +94,27 @@ def pdftohtml(output_dir, pdf_path, no_images, as_xml=False): with open(index, 'r+b') as i: raw = i.read().decode('utf-8', 'replace') raw = flip_images(raw) - raw = raw.replace('\n \n = 0.20 output self closing
tags, this - # breaks the pdf heuristics regexps, so replace them + # versions of pdftohtml >= 0.20 output self closing
tags, + # this breaks the pdf heuristics regexps, so replace them raw = raw.replace('
', '
') - raw = re.sub(r' 2: - root = create_ncx(toc, (lambda x:x), 'pdftohtml', 'en', 'pdftohtml') + root = create_ncx(toc, (lambda x: x), 'pdftohtml', 'en', + 'pdftohtml') with open(os.path.join(output_dir, 'toc.ncx'), 'wb') as f: - f.write(etree.tostring(root, pretty_print=True, with_tail=False, encoding='utf-8', xml_declaration=True)) + f.write(etree.tostring(root, pretty_print=True, + with_tail=False, encoding='utf-8', + xml_declaration=True)) def flip_image(img, flip): - from ebook_converter.utils.img import flip_image, image_and_format_from_data, image_to_data + from ebook_converter.utils.img import image_to_data + from ebook_converter.utils.img import image_and_format_from_data + from ebook_converter.utils.img import flip_image with open(img, 'r+b') as f: img, fmt = image_and_format_from_data(f.read()) img = flip_image(img, horizontal='x' in flip, vertical='y' in flip) @@ -170,5 +183,5 @@ def flip_images(raw): if not os.path.exists(img): continue flip_image(img, flip) - raw = re.sub(r'\s*', '', raw, flags=re.I|re.DOTALL) + raw = re.sub(r'\s*', '', raw, flags=re.I | re.DOTALL) return raw diff --git a/ebook_converter/ebooks/txt/markdownml.py b/ebook_converter/ebooks/txt/markdownml.py index 1954179..0b862ee 100644 --- a/ebook_converter/ebooks/txt/markdownml.py +++ b/ebook_converter/ebooks/txt/markdownml.py @@ -5,8 +5,9 @@ import re from functools import partial +from ebook_converter import constants as const from ebook_converter.ebooks.htmlz.oeb2html import OEB2HTML -from ebook_converter.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace, rewrite_links +from ebook_converter.ebooks.oeb.base import XHTML, barename, namespace, rewrite_links from ebook_converter.ebooks.oeb.stylizer import Stylizer @@ -110,9 +111,9 @@ class MarkdownMLizer(OEB2HTML): # We can only processes tags. If there isn't a tag return any text. 
if not isinstance(elem.tag, (str, bytes)) \ - or namespace(elem.tag) != XHTML_NS: + or namespace(elem.tag) != const.XHTML_NS: p = elem.getparent() - if p is not None and isinstance(p.tag, (str, bytes)) and namespace(p.tag) == XHTML_NS \ + if p is not None and isinstance(p.tag, (str, bytes)) and namespace(p.tag) == const.XHTML_NS \ and elem.tail: return [elem.tail] return [''] diff --git a/ebook_converter/ebooks/txt/textileml.py b/ebook_converter/ebooks/txt/textileml.py index 54e90d3..7b6f626 100644 --- a/ebook_converter/ebooks/txt/textileml.py +++ b/ebook_converter/ebooks/txt/textileml.py @@ -5,8 +5,10 @@ import re from functools import partial +from ebook_converter import constants as const from ebook_converter.ebooks.htmlz.oeb2html import OEB2HTML -from ebook_converter.ebooks.oeb.base import XHTML, XHTML_NS, barename, namespace, rewrite_links +from ebook_converter.ebooks.oeb.base import XHTML, barename, namespace, \ + rewrite_links from ebook_converter.ebooks.oeb.stylizer import Stylizer from ebook_converter.ebooks import unit_convert from ebook_converter.ebooks.textile.unsmarten import unsmarten @@ -225,9 +227,9 @@ class TextileMLizer(OEB2HTML): # We can only processes tags. If there isn't a tag return any text. 
if not isinstance(elem.tag, (str, bytes)) \ - or namespace(elem.tag) != XHTML_NS: + or namespace(elem.tag) != const.XHTML_NS: p = elem.getparent() - if p is not None and isinstance(p.tag, (str, bytes)) and namespace(p.tag) == XHTML_NS \ + if p is not None and isinstance(p.tag, (str, bytes)) and namespace(p.tag) == const.XHTML_NS \ and elem.tail: return [elem.tail] return [''] diff --git a/ebook_converter/ebooks/txt/txtml.py b/ebook_converter/ebooks/txt/txtml.py index 206b5a6..9b17a11 100644 --- a/ebook_converter/ebooks/txt/txtml.py +++ b/ebook_converter/ebooks/txt/txtml.py @@ -5,10 +5,11 @@ import re from lxml import etree +from ebook_converter import constants as const +from ebook_converter.ebooks.oeb import base +from ebook_converter.ebooks.oeb import parse_utils +from ebook_converter.ebooks.oeb.stylizer import Stylizer -__license__ = 'GPL 3' -__copyright__ = '2009, John Schember ' -__docformat__ = 'restructuredtext en' BLOCK_TAGS = [ 'div', @@ -60,9 +61,6 @@ class TXTMLizer(object): return self.mlize_spine() def mlize_spine(self): - from ebook_converter.ebooks.oeb.base import XHTML - from ebook_converter.ebooks.oeb.stylizer import Stylizer - from ebook_converter.utils.xml_parse import safe_xml_fromstring output = [u''] output.append(self.get_toc()) for item in self.oeb_book.spine: @@ -72,9 +70,11 @@ class TXTMLizer(object): x.text = x.text.replace('--', '__') content = etree.tostring(item.data, encoding='unicode') content = self.remove_newlines(content) - content = safe_xml_fromstring(content) - stylizer = Stylizer(content, item.href, self.oeb_book, self.opts, self.opts.output_profile) - output += self.dump_text(content.find(XHTML('body')), stylizer, item) + content = etree.fromstring(content) + stylizer = Stylizer(content, item.href, self.oeb_book, self.opts, + self.opts.output_profile) + output += self.dump_text(content.find(base.tag('xhtml', 'body')), + stylizer, item) output += '\n\n\n\n\n\n' output = ''.join(output) output = '\n'.join(l.rstrip() for l in 
output.splitlines()) @@ -130,8 +130,12 @@ class TXTMLizer(object): text = re.sub('\n[ ]+\n', '\n\n', text) if self.opts.remove_paragraph_spacing: text = re.sub('\n{2,}', '\n', text) - text = re.sub(r'(?msu)^(?P[^\t\n]+?)$', lambda mo: u'%s\n\n' % mo.group('t'), text) - text = re.sub(r'(?msu)(?P[^\n])\n+(?P[^\t\n]+?)(?=\n)', lambda mo: '%s\n\n\n\n\n\n%s' % (mo.group('b'), mo.group('t')), text) + text = re.sub(r'(?msu)^(?P[^\t\n]+?)$', lambda mo: u'%s\n\n' % + mo.group('t'), text) + text = re.sub(r'(?msu)(?P[^\n])\n+(?P[^\t\n]+?)(?=\n)', + lambda mo: '%s\n\n\n\n\n\n%s' % (mo.group('b'), + mo.group('t')), + text) else: text = re.sub('\n{7,}', '\n\n\n\n\n\n', text) @@ -146,7 +150,8 @@ class TXTMLizer(object): if self.opts.max_line_length: max_length = self.opts.max_line_length - if self.opts.max_line_length < 25 and not self.opts.force_max_line_length: + if (self.opts.max_line_length < 25 and not + self.opts.force_max_line_length): max_length = 25 short_lines = [] lines = text.splitlines() @@ -186,13 +191,13 @@ class TXTMLizer(object): @stylizer: The style information attached to the element. @page: OEB page used to determine absolute urls. ''' - from ebook_converter.ebooks.oeb.base import XHTML_NS, barename, namespace if not isinstance(elem.tag, (str, bytes)) \ - or namespace(elem.tag) != XHTML_NS: + or parse_utils.namespace(elem.tag) != const.XHTML_NS: p = elem.getparent() - if p is not None and isinstance(p.tag, (str, bytes)) and namespace(p.tag) == XHTML_NS \ - and elem.tail: + if (p is not None and isinstance(p.tag, (str, bytes)) and + parse_utils.namespace(p.tag) == const.XHTML_NS and + elem.tail): return [elem.tail] return [''] @@ -205,14 +210,15 @@ class TXTMLizer(object): return [elem.tail] return [''] - tag = barename(elem.tag) + tag = parse_utils.barename(elem.tag) tag_id = elem.attrib.get('id', None) in_block = False in_heading = False # Are we in a heading? # This can either be a heading tag or a TOC item. 
- if tag in HEADING_TAGS or '%s#%s' % (page.href, tag_id) in self.toc_ids: + if tag in HEADING_TAGS or '%s#%s' % (page.href, + tag_id) in self.toc_ids: in_heading = True if not self.last_was_heading: text.append('\n\n\n\n\n\n') @@ -234,7 +240,7 @@ class TXTMLizer(object): ems = int(round((float(style.marginTop) / style.fontSize) - 1)) if ems >= 1: text.append('\n' * ems) - except: + except Exception: pass # Process tags that contain text. diff --git a/ebook_converter/startup.py b/ebook_converter/startup.py index bfa6ce8..fbfb960 100644 --- a/ebook_converter/startup.py +++ b/ebook_converter/startup.py @@ -10,7 +10,7 @@ import builtins import locale import sys -from ebook_converter import constants +from ebook_converter import constants_old # For backwards compat with some third party plugins builtins.__dict__['dynamic_property'] = lambda func: func(None) @@ -41,8 +41,8 @@ if not _run_once: # # Platform specific modules - if constants.iswindows: - winutil, winutilerror = constants.plugins['winutil'] + if constants_old.iswindows: + winutil, winutilerror = constants_old.plugins['winutil'] if not winutil: raise RuntimeError('Failed to load the winutil plugin: %s'%winutilerror) if len(sys.argv) > 1 and not isinstance(sys.argv[1], str): @@ -57,8 +57,8 @@ if not _run_once: # # Convert command line arguments to unicode - enc = constants.preferred_encoding - if constants.isosx: + enc = constants_old.preferred_encoding + if constants_old.isosx: enc = 'utf-8' for i in range(1, len(sys.argv)): if not isinstance(sys.argv[i], str): @@ -66,7 +66,7 @@ if not _run_once: # # Ensure that the max number of open files is at least 1024 - if constants.iswindows: + if constants_old.iswindows: # See https://msdn.microsoft.com/en-us/library/6e3b887c.aspx if hasattr(winutil, 'setmaxstdio'): winutil.setmaxstdio(max(1024, winutil.getmaxstdio())) @@ -77,7 +77,7 @@ if not _run_once: try: resource.setrlimit(resource.RLIMIT_NOFILE, (min(1024, hard), hard)) except Exception: - if constants.DEBUG: + 
if constants_old.DEBUG: import traceback traceback.print_exc() @@ -122,7 +122,7 @@ if not _run_once: bound_signal.connect(slot, **kw) builtins.__dict__['connect_lambda'] = connect_lambda - if constants.islinux or constants.isosx or constants.isfreebsd: + if constants_old.islinux or constants_old.isosx or constants_old.isfreebsd: # Name all threads at the OS level created using the threading module, see # http://bugs.python.org/issue15500 import threading @@ -140,7 +140,7 @@ if not _run_once: if name: if isinstance(name, str): name = name.encode('ascii', 'replace').decode('ascii') - constants.plugins['speedup'][0].set_thread_name(name[:15]) + constants_old.plugins['speedup'][0].set_thread_name(name[:15]) except Exception: pass # Don't care about failure to set name threading.Thread.start = new_start @@ -152,7 +152,7 @@ def test_lopen(): n = 'f\xe4llen' print('testing open()') - if constants.iswindows: + if constants_old.iswindows: import msvcrt, win32api def assert_not_inheritable(f): diff --git a/ebook_converter/utils/xml_parse.py b/ebook_converter/utils/xml_parse.py index c83bd5a..2b9a8ef 100644 --- a/ebook_converter/utils/xml_parse.py +++ b/ebook_converter/utils/xml_parse.py @@ -1,3 +1,7 @@ +import os +import tempfile +import unittest + from lxml import etree @@ -24,7 +28,6 @@ def safe_xml_fromstring(string_or_bytes, recover=True): def find_tests(): - import unittest, tempfile, os class TestXMLParse(unittest.TestCase): @@ -37,9 +40,11 @@ def find_tests(): os.remove(self.temp_file) def test_safe_xml_fromstring(self): - templ = ''' ]>&e;''' + templ = ' ]>&e;' external = 'file:///' + self.temp_file.replace(os.sep, '/') - self.assertEqual(etree.fromstring(templ.format(id='SYSTEM', val=external)).text, 'external') + self.assertEqual(etree.fromstring(templ.format(id='SYSTEM', + val=external)).text, + 'external') for eid, val, expected in ( ('', 'normal entity', 'normal entity'), ('', external, external), @@ -50,7 +55,8 @@ def find_tests(): ('PUBLIC', external, None), 
('PUBLIC', 'http://example.com', None), ): - got = getattr(safe_xml_fromstring(templ.format(id=eid, val=val)), 'text', None) + got = getattr(etree.fromstring(templ.format(id=eid, val=val)), + 'text', None) self.assertEqual(got, expected) return unittest.defaultTestLoader.loadTestsFromTestCase(TestXMLParse) diff --git a/ebook_converter/utils/zipfile.py b/ebook_converter/utils/zipfile.py index d4dc785..b08d006 100644 --- a/ebook_converter/utils/zipfile.py +++ b/ebook_converter/utils/zipfile.py @@ -8,7 +8,7 @@ from contextlib import closing from tempfile import SpooledTemporaryFile from ebook_converter import sanitize_file_name -from ebook_converter.constants import filesystem_encoding +from ebook_converter.constants_old import filesystem_encoding from ebook_converter.ebooks.chardet import detect from ebook_converter.polyglot.builtins import as_bytes