From f47376830f08725e7346424851d66563803ad375 Mon Sep 17 00:00:00 2001 From: gryf Date: Tue, 25 May 2021 19:06:31 +0200 Subject: [PATCH] Moved misc functions from polyglot package to single polyglot module. --- .../ebooks/conversion/plugins/chm_input.py | 11 +- .../ebooks/conversion/plugins/epub_output.py | 9 +- .../ebooks/conversion/plugins/html_output.py | 10 +- .../ebooks/conversion/plugins/oeb_output.py | 4 +- .../ebooks/conversion/plugins/pdf_input.py | 10 +- .../ebooks/conversion/plugins/rtf_input.py | 4 +- ebook_converter/ebooks/fb2/__init__.py | 14 +- ebook_converter/ebooks/fb2/fb2ml.py | 6 +- ebook_converter/ebooks/htmlz/oeb2html.py | 141 ++++++++++-------- ebook_converter/ebooks/lrf/input.py | 6 +- ebook_converter/ebooks/metadata/__init__.py | 10 +- ebook_converter/ebooks/metadata/fb2.py | 4 +- ebook_converter/ebooks/metadata/opf2.py | 4 +- ebook_converter/ebooks/metadata/toc.py | 9 +- ebook_converter/ebooks/mobi/utils.py | 4 +- ebook_converter/ebooks/odt/input.py | 4 +- ebook_converter/ebooks/oeb/reader.py | 4 +- .../ebooks/oeb/transforms/cover.py | 4 +- .../ebooks/oeb/transforms/data_url.py | 7 +- .../ebooks/oeb/transforms/split.py | 4 +- ebook_converter/ebooks/pdf/render/common.py | 50 +++---- ebook_converter/polyglot.py | 59 ++++++++ ebook_converter/polyglot/__init__.py | 0 ebook_converter/polyglot/binary.py | 26 ---- ebook_converter/polyglot/builtins.py | 10 -- ebook_converter/polyglot/urllib.py | 17 --- ebook_converter/ptempfile.py | 4 +- ebook_converter/tinycss/decoding.py | 5 +- ebook_converter/utils/fonts/utils.py | 4 +- ebook_converter/utils/serialize.py | 7 +- ebook_converter/utils/terminal.py | 8 +- ebook_converter/utils/zipfile.py | 4 +- 32 files changed, 244 insertions(+), 219 deletions(-) create mode 100644 ebook_converter/polyglot.py delete mode 100644 ebook_converter/polyglot/__init__.py delete mode 100644 ebook_converter/polyglot/binary.py delete mode 100644 ebook_converter/polyglot/builtins.py delete mode 100644 ebook_converter/polyglot/urllib.py diff --git a/ebook_converter/ebooks/conversion/plugins/chm_input.py b/ebook_converter/ebooks/conversion/plugins/chm_input.py index 2ee6e92..564d8d0 100644 --- a/ebook_converter/ebooks/conversion/plugins/chm_input.py +++ b/ebook_converter/ebooks/conversion/plugins/chm_input.py @@ -5,17 +5,12 @@ import os from lxml import html from lxml.html import builder -from ebook_converter.polyglot.urllib import unquote as _unquote from ebook_converter.ebooks.oeb.base import urlquote from ebook_converter.ebooks.chardet import xml_to_unicode from ebook_converter.customize.conversion import InputFormatPlugin from ebook_converter.ptempfile import TemporaryDirectory from ebook_converter.constants_old import filesystem_encoding -from ebook_converter.polyglot.builtins import as_bytes - -__license__ = 'GPL v3' -__copyright__ = ('2008, Kovid Goyal , ' - 'and Alex Bramley .') +from ebook_converter import polyglot class CHMInput(InputFormatPlugin): @@ -133,7 +128,7 @@ class CHMInput(InputFormatPlugin): def unquote(x): if isinstance(x, str): x = x.encode('utf-8') - return _unquote(x).decode('utf-8') + return polyglot.unquote(x).decode('utf-8') def unquote_path(x): y = unquote(x) @@ -175,7 +170,7 @@ class CHMInput(InputFormatPlugin): pretty_print=True) f.write(raw) else: - f.write(as_bytes(hhcdata)) + f.write(polyglot.as_bytes(hhcdata)) return htmlpath, toc def _read_file(self, name): diff --git a/ebook_converter/ebooks/conversion/plugins/epub_output.py b/ebook_converter/ebooks/conversion/plugins/epub_output.py index a17d848..fe87c2d 100644 --- 
a/ebook_converter/ebooks/conversion/plugins/epub_output.py +++ b/ebook_converter/ebooks/conversion/plugins/epub_output.py @@ -9,7 +9,7 @@ from ebook_converter.ebooks.oeb import parse_utils from ebook_converter.customize.conversion import OutputFormatPlugin from ebook_converter.customize.conversion import OptionRecommendation from ebook_converter.ptempfile import TemporaryDirectory -from ebook_converter.polyglot.builtins import as_bytes +from ebook_converter import polyglot from ebook_converter.utils import directory @@ -266,7 +266,8 @@ class EPUBOutput(OutputFormatPlugin): extra_entries=extra_entries) as epub: epub.add_dir(tdir) if encryption is not None: - epub.writestr('META-INF/encryption.xml', as_bytes(encryption)) + epub.writestr('META-INF/encryption.xml', + polyglot.as_bytes(encryption)) if metadata_xml is not None: epub.writestr('META-INF/metadata.xml', metadata_xml.encode('utf-8')) @@ -308,12 +309,10 @@ class EPUBOutput(OutputFormatPlugin): pass def encrypt_fonts(self, uris, tdir, _uuid): # {{{ - from ebook_converter.polyglot.binary import from_hex_bytes - key = re.sub(r'[^a-fA-F0-9]', '', _uuid) if len(key) < 16: raise ValueError('UUID identifier %r is invalid'% _uuid) - key = bytearray(from_hex_bytes((key + key)[:32])) + key = bytearray(polyglot.from_hex_bytes((key + key)[:32])) paths = [] with directory.CurrentDir(tdir): paths = [os.path.join(*x.split('/')) for x in uris] diff --git a/ebook_converter/ebooks/conversion/plugins/html_output.py b/ebook_converter/ebooks/conversion/plugins/html_output.py index 1c7d294..51121df 100644 --- a/ebook_converter/ebooks/conversion/plugins/html_output.py +++ b/ebook_converter/ebooks/conversion/plugins/html_output.py @@ -7,7 +7,7 @@ from lxml import etree from ebook_converter.customize.conversion import OutputFormatPlugin, OptionRecommendation from ebook_converter.ebooks.oeb.base import element -from ebook_converter.polyglot.urllib import unquote +from ebook_converter import polyglot from ebook_converter.ptempfile import PersistentTemporaryDirectory from ebook_converter.utils.cleantext import clean_xml_chars from ebook_converter.utils import directory @@ -56,7 +56,8 @@ class HTMLOutput(OutputFormatPlugin): parent = element(parent, ('ul')) for node in current_node.nodes: point = element(parent, 'li') - href = relpath(os.path.abspath(unquote(node.href)), + href = relpath(os.path.abspath(polyglot + .unquote(node.href)), os.path.dirname(ref_url)) if isinstance(href, bytes): href = href.decode('utf-8') @@ -84,7 +85,6 @@ class HTMLOutput(OutputFormatPlugin): from lxml import etree from ebook_converter.utils import zipfile from templite import Templite - from ebook_converter.polyglot.urllib import unquote from ebook_converter.ebooks.html.meta import EasyMeta # read template files @@ -156,7 +156,7 @@ class HTMLOutput(OutputFormatPlugin): with directory.CurrentDir(output_dir): for item in oeb_book.manifest: - path = os.path.abspath(unquote(item.href)) + path = os.path.abspath(polyglot.unquote(item.href)) dir = os.path.dirname(path) if not os.path.exists(dir): os.makedirs(dir) @@ -169,7 +169,7 @@ class HTMLOutput(OutputFormatPlugin): item.unload_data_from_memory(memory=path) for item in oeb_book.spine: - path = os.path.abspath(unquote(item.href)) + path = os.path.abspath(polyglot.unquote(item.href)) dir = os.path.dirname(path) root = item.data.getroottree() diff --git a/ebook_converter/ebooks/conversion/plugins/oeb_output.py b/ebook_converter/ebooks/conversion/plugins/oeb_output.py index 8788477..e8679c6 100644 --- 
a/ebook_converter/ebooks/conversion/plugins/oeb_output.py +++ b/ebook_converter/ebooks/conversion/plugins/oeb_output.py @@ -5,7 +5,7 @@ from lxml import etree from ebook_converter.customize.conversion import (OutputFormatPlugin, OptionRecommendation) -from ebook_converter.polyglot.urllib import unquote +from ebook_converter import polyglot from ebook_converter.ebooks.oeb.base import OPF_MIME, NCX_MIME, PAGE_MAP_MIME, OEB_STYLES from ebook_converter.ebooks.oeb.normalize_css import condense_sheet from ebook_converter.utils import directory @@ -56,7 +56,7 @@ class OEBOutput(OutputFormatPlugin): not self.opts.expand_css and item.media_type in OEB_STYLES and hasattr( item.data, 'cssText') and 'nook' not in self.opts.output_profile.short_name): condense_sheet(item.data) - path = os.path.abspath(unquote(item.href)) + path = os.path.abspath(polyglot.unquote(item.href)) dir = os.path.dirname(path) if not os.path.exists(dir): os.makedirs(dir) diff --git a/ebook_converter/ebooks/conversion/plugins/pdf_input.py b/ebook_converter/ebooks/conversion/plugins/pdf_input.py index 81211c5..e063cb5 100644 --- a/ebook_converter/ebooks/conversion/plugins/pdf_input.py +++ b/ebook_converter/ebooks/conversion/plugins/pdf_input.py @@ -1,12 +1,7 @@ import os from ebook_converter.customize.conversion import InputFormatPlugin, OptionRecommendation -from ebook_converter.polyglot.builtins import as_bytes - - -__license__ = 'GPL 3' -__copyright__ = '2009, John Schember ' -__docformat__ = 'restructuredtext en' +from ebook_converter import polyglot class PDFInput(InputFormatPlugin): @@ -72,7 +67,8 @@ class PDFInput(InputFormatPlugin): ncxid = opf.manifest.id_for_path('toc.ncx') if ncxid: with open('metadata.opf', 'r+b') as f: - raw = f.read().replace(b'%s' % ( - entities.prepare_string_for_xml(self.book_title)) - ] + output = ['' + '%s' + '' % entities.prepare_string_for_xml(self.book_title)] for item in oeb_book.spine: self.log.debug('Converting %s to HTML...' % item.href) self.rewrite_ids(item.data, item) - base.rewrite_links(item.data, partial(self.rewrite_link, page=item)) + base.rewrite_links(item.data, partial(self.rewrite_link, + page=item)) stylizer = Stylizer(item.data, item.href, oeb_book, self.opts) - output += self.dump_text(item.data.find(base.tag('xhtml', 'body')), stylizer, item) + output += self.dump_text(item.data.find(base.tag('xhtml', 'body')), + stylizer, item) output.append('\n\n') output.append('') return ''.join(output) @@ -126,13 +126,14 @@ class OEB2HTML(object): el.attrib['id'] = self.get_link_id(page.href)[1:] continue if 'id' in el.attrib: - el.attrib['id'] = self.get_link_id(page.href, el.attrib['id'])[1:] + el.attrib['id'] = self.get_link_id(page.href, + el.attrib['id'])[1:] def get_css(self, oeb_book): css = b'' for item in oeb_book.manifest: if item.media_type == 'text/css': - css += as_bytes(item.data.cssText) + b'\n\n' + css += polyglot.as_bytes(item.data.cssText) + b'\n\n' return css def prepare_string_for_html(self, raw): @@ -157,10 +158,14 @@ class OEB2HTMLNoCSSizer(OEB2HTML): # We can only processes tags. If there isn't a tag return any text. 
if not isinstance(elem.tag, (str, bytes)) \ - or parse_utils.namespace(elem.tag) not in (const.XHTML_NS, const.SVG_NS): + or parse_utils.namespace(elem.tag) not in (const.XHTML_NS, + const.SVG_NS): p = elem.getparent() - if p is not None and isinstance(p.tag, (str, bytes)) and parse_utils.namespace(p.tag) in (const.XHTML_NS, const.SVG_NS) \ - and elem.tail: + if (p is not None and + isinstance(p.tag, (str, bytes)) and + parse_utils.namespace(p.tag) in (const.XHTML_NS, + const.SVG_NS) and + elem.tail): return [elem.tail] return [''] @@ -176,8 +181,8 @@ class OEB2HTMLNoCSSizer(OEB2HTML): tags.append(tag) # Ignore anything that is set to not be displayed. - if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \ - or style['visibility'] == 'hidden': + if (style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') or + style['visibility'] == 'hidden'): return [''] # Remove attributes we won't want. @@ -186,11 +191,13 @@ class OEB2HTMLNoCSSizer(OEB2HTML): if 'style' in attribs: del attribs['style'] - # Turn the rest of the attributes into a string we can write with the tag. + # Turn the rest of the attributes into a string we can write with the + # tag. at = '' - for k, v in attribs.items(): - at += ' %s="%s"' % (k, entities - .prepare_string_for_xml(v, attribute=True)) + for key, value in attribs.items(): + at += (' %s="%s"' % + (key, entities.prepare_string_for_xml(value, + attribute=True))) # Write the tag. text.append('<%s%s' % (tag, at)) @@ -246,11 +253,15 @@ class OEB2HTMLInlineCSSizer(OEB2HTML): ''' # We can only processes tags. If there isn't a tag return any text. - if not isinstance(elem.tag, (str, bytes)) \ - or parse_utils.namespace(elem.tag) not in (const.XHTML_NS, const.SVG_NS): + if (not isinstance(elem.tag, (str, bytes)) or + parse_utils.namespace(elem.tag) not in (const.XHTML_NS, + const.SVG_NS)): p = elem.getparent() - if p is not None and isinstance(p.tag, (str, bytes)) and parse_utils.namespace(p.tag) in (const.XHTML_NS, const.SVG_NS) \ - and elem.tail: + if (p is not None and + isinstance(p.tag, (str, bytes)) and + parse_utils.namespace(p.tag) in (const.XHTML_NS, + const.SVG_NS) and + elem.tail): return [elem.tail] return [''] @@ -266,9 +277,11 @@ class OEB2HTMLInlineCSSizer(OEB2HTML): if tag == 'body': # Change the body to a div so we can merge multiple files. tag = 'div' - # Add page-break-brefore: always because renders typically treat a new file (we're merging files) - # as a page break and remove all other page break types that might be set. - style_a = 'page-break-before: always; %s' % re.sub('page-break-[^:]+:[^;]+;?', '', style_a) + # Add page-break-brefore: always because renders typically treat + # a new file (we're merging files) as a page break and remove all + # other page break types that might be set. + style_a = ('page-break-before: always; %s' % + re.sub('page-break-[^:]+:[^;]+;?', '', style_a)) # Remove unnecessary spaces. style_a = re.sub(r'\s{2,}', ' ', style_a).strip() tags.append(tag) @@ -279,7 +292,8 @@ class OEB2HTMLInlineCSSizer(OEB2HTML): if 'style' in attribs: del attribs['style'] - # Turn the rest of the attributes into a string we can write with the tag. + # Turn the rest of the attributes into a string we can write with + # the tag. at = '' for k, v in attribs.items(): at += ' %s="%s"' % (k, entities @@ -319,43 +333,51 @@ class OEB2HTMLInlineCSSizer(OEB2HTML): class OEB2HTMLClassCSSizer(OEB2HTML): - ''' - Use CSS classes. 
css_style option can specify whether to use - inline classes (style tag in the head) or reference an external - CSS file called style.css. - ''' + """ + Use CSS classes. css_style option can specify whether to use inline + classes (style tag in the head) or reference an external CSS file called + style.css. + """ def mlize_spine(self, oeb_book): output = [] for item in oeb_book.spine: self.log.debug('Converting %s to HTML...' % item.href) self.rewrite_ids(item.data, item) - base.rewrite_links(item.data, partial(self.rewrite_link, page=item)) + base.rewrite_links(item.data, partial(self.rewrite_link, + page=item)) stylizer = Stylizer(item.data, item.href, oeb_book, self.opts) - output += self.dump_text(item.data.find(base.tag('xhtml', 'body')), stylizer, item) + output += self.dump_text(item.data.find(base.tag('xhtml', 'body')), + stylizer, item) output.append('\n\n') if self.opts.htmlz_class_style == 'external': - css = u'' + css = '' else: - css = u'' - title = (u'%s' % + css = ('') + title = ('%s' % entities.prepare_string_for_xml(self.book_title)) - output = [u''] + \ - [css] + [title, u''] + output + [u''] + output = ([''] + [css] + + [title, ''] + output + ['']) return ''.join(output) def dump_text(self, elem, stylizer, page): - ''' + """ @elem: The element in the etree that we are working on. @stylizer: The style information attached to the element. - ''' + """ # We can only processes tags. If there isn't a tag return any text. - if not isinstance(elem.tag, (str, bytes)) \ - or parse_utils.namespace(elem.tag) not in (const.XHTML_NS, const.SVG_NS): + if (not isinstance(elem.tag, (str, bytes)) or + parse_utils.namespace(elem.tag) not in (const.XHTML_NS, + const.SVG_NS)): p = elem.getparent() - if p is not None and isinstance(p.tag, (str, bytes)) and parse_utils.namespace(p.tag) in (const.XHTML_NS, const.SVG_NS) \ - and elem.tail: + if (p is not None and + isinstance(p.tag, (str, bytes)) and + parse_utils.namespace(p.tag) in (const.XHTML_NS, + const.SVG_NS) and + elem.tail): return [elem.tail] return [''] @@ -373,11 +395,12 @@ class OEB2HTMLClassCSSizer(OEB2HTML): if 'style' in attribs: del attribs['style'] - # Turn the rest of the attributes into a string we can write with the tag. + # Turn the rest of the attributes into a string we can write with + # the tag. at = '' for k, v in attribs.items(): - at += ' %s="%s"' % (k, - entities.prepare_string_for_xml(v, attribute=True)) + at += ' %s="%s"' % (k, entities + .prepare_string_for_xml(v, attribute=True)) # Write the tag. 
text.append('<%s%s' % (tag, at)) diff --git a/ebook_converter/ebooks/lrf/input.py b/ebook_converter/ebooks/lrf/input.py index fd28128..731172e 100644 --- a/ebook_converter/ebooks/lrf/input.py +++ b/ebook_converter/ebooks/lrf/input.py @@ -5,7 +5,7 @@ import textwrap from lxml import etree -from ebook_converter.polyglot.builtins import as_bytes +from ebook_converter import polyglot class Canvas(etree.XSLTExtension): @@ -292,7 +292,7 @@ class Styles(etree.XSLTExtension): return '\n\t'.join(ans) with open(name, 'wb') as f: - f.write(as_bytes(self.CSS)) + f.write(polyglot.as_bytes(self.CSS)) for (w, sel) in [(self.text_styles, 'ts'), (self.block_styles, 'bs')]: for i, s in enumerate(w): @@ -300,7 +300,7 @@ class Styles(etree.XSLTExtension): continue rsel = '.%s%d'%(sel, i) s = join(s) - f.write(as_bytes(rsel + ' {\n\t' + s + '\n}\n\n')) + f.write(polyglot.as_bytes(rsel + ' {\n\t' + s + '\n}\n\n')) def execute(self, context, self_node, input_node, output_parent): if input_node.tag == 'TextStyle': diff --git a/ebook_converter/ebooks/metadata/__init__.py b/ebook_converter/ebooks/metadata/__init__.py index d44dbd0..ae459b5 100644 --- a/ebook_converter/ebooks/metadata/__init__.py +++ b/ebook_converter/ebooks/metadata/__init__.py @@ -9,7 +9,7 @@ import sys import urllib.parse from ebook_converter.utils.config_base import tweaks -from ebook_converter.polyglot.urllib import unquote +from ebook_converter import polyglot from ebook_converter.utils import encoding as uenc @@ -248,9 +248,11 @@ class Resource(object): pc = url[2] if isinstance(pc, str): pc = pc.encode('utf-8') - pc = unquote(pc).decode('utf-8') - self.path = os.path.abspath(os.path.join(basedir, pc.replace('/', os.sep))) - self.fragment = unquote(url[-1]) + pc = polyglot.unquote(pc).decode('utf-8') + self.path = os.path.abspath(os.path.join(basedir, + pc.replace('/', + os.sep))) + self.fragment = polyglot.unquote(url[-1]) def href(self, basedir=None): ''' diff --git a/ebook_converter/ebooks/metadata/fb2.py b/ebook_converter/ebooks/metadata/fb2.py index 8ee0a06..3fc28cd 100644 --- a/ebook_converter/ebooks/metadata/fb2.py +++ b/ebook_converter/ebooks/metadata/fb2.py @@ -14,7 +14,7 @@ from ebook_converter.utils.img import save_cover_data_to from ebook_converter.utils.imghdr import identify from ebook_converter.ebooks.metadata import MetaInformation, check_isbn from ebook_converter.ebooks.chardet import xml_to_unicode -from ebook_converter.polyglot.binary import as_base64_unicode +from ebook_converter import polyglot from ebook_converter.utils import encoding as uenc @@ -389,7 +389,7 @@ def _rnd_pic_file_name(prefix='calibre_cover_', size=32, ext='jpg'): def _encode_into_jpeg(data): data = save_cover_data_to(data) - return as_base64_unicode(data) + return polyglot.as_base64_unicode(data) def _set_cover(title_info, mi, ctx): diff --git a/ebook_converter/ebooks/metadata/opf2.py b/ebook_converter/ebooks/metadata/opf2.py index e8ba5b9..2ff150d 100644 --- a/ebook_converter/ebooks/metadata/opf2.py +++ b/ebook_converter/ebooks/metadata/opf2.py @@ -30,11 +30,11 @@ from ebook_converter.ebooks.metadata.utils import parse_opf, \ from ebook_converter.ebooks.metadata import string_to_authors, \ MetaInformation, check_isbn from ebook_converter.ebooks.metadata.book.base import Metadata +from ebook_converter import polyglot from ebook_converter.utils.date import parse_date, isoformat from ebook_converter.utils.localization import get_lang, canonicalize_lang from ebook_converter.utils.cleantext import clean_ascii_chars, clean_xml_chars from 
ebook_converter.utils.config_base import tweaks -from ebook_converter.polyglot.urllib import unquote pretty_print_opf = False @@ -838,7 +838,7 @@ class OPF(object): # {{{ def unquote_urls(self): def get_href(item): - raw = unquote(item.get('href', '')) + raw = polyglot.unquote(item.get('href', '')) if not isinstance(raw, str): raw = raw.decode('utf-8') return raw diff --git a/ebook_converter/ebooks/metadata/toc.py b/ebook_converter/ebooks/metadata/toc.py index 02f04d5..05762a7 100644 --- a/ebook_converter/ebooks/metadata/toc.py +++ b/ebook_converter/ebooks/metadata/toc.py @@ -11,7 +11,7 @@ from lxml.builder import ElementMaker from ebook_converter.constants_old import __appname__, __version__ from ebook_converter.ebooks.chardet import xml_to_unicode from ebook_converter.utils.cleantext import clean_xml_chars -from ebook_converter.polyglot.urllib import unquote +from ebook_converter import polyglot NCX_NS = "http://www.daisy.org/z3986/2005/ncx/" @@ -31,7 +31,7 @@ def parse_html_toc(data): root = parse(clean_xml_chars(data), maybe_xhtml=True, keep_doctype=False, sanitize_names=True) for a in root.xpath('//*[@href and local-name()="a"]'): - purl = urllib.parse.urlparse(unquote(a.get('href'))) + purl = urllib.parse.urlparse(polyglot.unquote(a.get('href'))) href, fragment = purl[2], purl[5] if not fragment: fragment = None @@ -149,7 +149,7 @@ class TOC(list): if toc is not None: if toc.lower() not in ('ncx', 'ncxtoc'): - toc = urllib.parse.urlparse(unquote(toc))[2] + toc = urllib.parse.urlparse(polyglot.unquote(toc))[2] toc = toc.replace('/', os.sep) if not os.path.isabs(toc): toc = os.path.join(self.base_path, toc) @@ -219,7 +219,8 @@ class TOC(list): content = content[0] # if get_attr(content, attr='src'): purl = urllib.parse.urlparse(content.get('src')) - href, fragment = unquote(purl[2]), unquote(purl[5]) + href = polyglot.unquote(purl[2]) + fragment = polyglot.unquote(purl[5]) nd = dest.add_item(href, fragment, text) nd.play_order = play_order diff --git a/ebook_converter/ebooks/mobi/utils.py b/ebook_converter/ebooks/mobi/utils.py index b261d00..bdb96cc 100644 --- a/ebook_converter/ebooks/mobi/utils.py +++ b/ebook_converter/ebooks/mobi/utils.py @@ -5,7 +5,7 @@ from io import BytesIO from ebook_converter.utils.img import save_cover_data_to, scale_image, image_to_data, image_from_data, resize_image, png_data_to_gif_data from ebook_converter.utils.imghdr import what from ebook_converter.ebooks import normalize -from ebook_converter.polyglot.builtins import as_bytes +from ebook_converter import polyglot from ebook_converter.tinycss.color3 import parse_color_string @@ -61,7 +61,7 @@ def decode_hex_number(raw, codec='utf-8'): def encode_string(raw): - ans = bytearray(as_bytes(raw)) + ans = bytearray(polyglot.as_bytes(raw)) ans.insert(0, len(ans)) return bytes(ans) diff --git a/ebook_converter/ebooks/odt/input.py b/ebook_converter/ebooks/odt/input.py index 0cf22c7..ba22f63 100644 --- a/ebook_converter/ebooks/odt/input.py +++ b/ebook_converter/ebooks/odt/input.py @@ -15,7 +15,7 @@ from odf.namespaces import TEXTNS as odTEXTNS from ebook_converter.utils import directory from ebook_converter.ebooks.oeb.base import _css_logger -from ebook_converter.polyglot.builtins import as_bytes +from ebook_converter import polyglot class Extract(ODF2XHTML): @@ -292,7 +292,7 @@ class Extract(ODF2XHTML): except: log.exception('Failed to filter CSS, conversion may be slow') with open('index.xhtml', 'wb') as f: - f.write(as_bytes(html)) + f.write(polyglot.as_bytes(html)) zf = ZipFile(stream, 'r') 
self.extract_pictures(zf) opf = OPFCreator(os.path.abspath(os.getcwd()), mi) diff --git a/ebook_converter/ebooks/oeb/reader.py b/ebook_converter/ebooks/oeb/reader.py index da1a9a9..5cbf259 100644 --- a/ebook_converter/ebooks/oeb/reader.py +++ b/ebook_converter/ebooks/oeb/reader.py @@ -24,7 +24,7 @@ from ebook_converter.utils.localization import get_lang from ebook_converter.ptempfile import TemporaryDirectory from ebook_converter.constants_old import __appname__, __version__ from ebook_converter.utils import entities -from ebook_converter.polyglot.urllib import unquote +from ebook_converter import polyglot class OEBReader(object): @@ -641,7 +641,7 @@ class OEBReader(object): with TemporaryDirectory('_html_cover') as tdir: writer = OEBWriter() writer(self.oeb, tdir) - path = os.path.join(tdir, unquote(hcover.href)) + path = os.path.join(tdir, polyglot.unquote(hcover.href)) data = render_html_svg_workaround(path, self.logger) if not data: data = b'' diff --git a/ebook_converter/ebooks/oeb/transforms/cover.py b/ebook_converter/ebooks/oeb/transforms/cover.py index 5ebf265..2ff31b0 100644 --- a/ebook_converter/ebooks/oeb/transforms/cover.py +++ b/ebook_converter/ebooks/oeb/transforms/cover.py @@ -5,7 +5,7 @@ import urllib.parse from lxml import etree from ebook_converter.utils.imghdr import identify -from ebook_converter.polyglot.urllib import unquote +from ebook_converter import polyglot class CoverManager(object): @@ -113,7 +113,7 @@ class CoverManager(object): if href is not None: templ = self.non_svg_template if self.no_svg_cover \ else self.svg_template - tp = templ % unquote(href) + tp = templ % polyglot.unquote(href) id, href = m.generate('titlepage', 'titlepage.xhtml') item = m.add(id, href, mimetypes.guess_type('t.xhtml')[0], data=etree.fromstring(tp)) diff --git a/ebook_converter/ebooks/oeb/transforms/data_url.py b/ebook_converter/ebooks/oeb/transforms/data_url.py index 1e0aff1..166ae45 100644 --- a/ebook_converter/ebooks/oeb/transforms/data_url.py +++ b/ebook_converter/ebooks/oeb/transforms/data_url.py @@ -3,8 +3,7 @@ import re import urllib.parse from ebook_converter.ebooks.oeb.base import XPath -from ebook_converter.polyglot.binary import from_base64_bytes -from ebook_converter.polyglot.builtins import as_bytes +from ebook_converter import polyglot class DataURL(object): @@ -27,14 +26,14 @@ class DataURL(object): if ';base64' in header: data = re.sub(r'\s+', '', data) try: - data = from_base64_bytes(data) + data = polyglot.from_base64_bytes(data) except Exception: self.log.error('Found invalid base64 encoded data ' 'URI, ignoring it') continue else: data = urllib.parse.unquote(data) - data = as_bytes(data) + data = polyglot.as_bytes(data) fmt = what(None, data) if not fmt: self.log.warn('Image encoded as data URL has unknown ' diff --git a/ebook_converter/ebooks/oeb/transforms/split.py b/ebook_converter/ebooks/oeb/transforms/split.py index 26a22da..0aeeb0b 100644 --- a/ebook_converter/ebooks/oeb/transforms/split.py +++ b/ebook_converter/ebooks/oeb/transforms/split.py @@ -17,7 +17,7 @@ from ebook_converter import constants as const from ebook_converter.ebooks.epub import rules from ebook_converter.ebooks.oeb import base from ebook_converter.ebooks.oeb.polish.split import do_split -from ebook_converter.polyglot.urllib import unquote +from ebook_converter import polyglot from ebook_converter.css_selectors import Select, SelectorError from ebook_converter.utils import encoding as uenc @@ -189,7 +189,7 @@ class Split(object): nhref = anchor_map[frag if frag else None] nhref = 
self.current_item.relhref(nhref) if frag: - nhref = '#'.join((unquote(nhref), frag)) + nhref = '#'.join((polyglot.unquote(nhref), frag)) return nhref return url diff --git a/ebook_converter/ebooks/pdf/render/common.py b/ebook_converter/ebooks/pdf/render/common.py index 5dcd914..b31a4e3 100644 --- a/ebook_converter/ebooks/pdf/render/common.py +++ b/ebook_converter/ebooks/pdf/render/common.py @@ -1,20 +1,18 @@ -import codecs, zlib, numbers -from io import BytesIO +import codecs from datetime import datetime +import io +import numbers +import zlib from ebook_converter.utils.logging import default_log -from ebook_converter.polyglot.binary import as_hex_bytes +from ebook_converter import polyglot -__license__ = 'GPL v3' -__copyright__ = '2012, Kovid Goyal ' -__docformat__ = 'restructuredtext en' - pdf_float = lambda x: f"{x:.1f}" EOL = b'\n' -# Sizes {{{ +# Sizes inch = 72.0 cm = inch / 2.54 mm = cm * 0.1 @@ -45,10 +43,9 @@ B2 = (_BW*2, _BH*2) B1 = (_BH*4, _BW*2) B0 = (_BW*4, _BH*4) -PAPER_SIZES = {k:globals()[k.upper()] for k in ('a0 a1 a2 a3 a4 a5 a6 b0 b1 b2' - ' b3 b4 b5 b6 letter legal').split()} - -# }}} +PAPER_SIZES = {k: globals()[k.upper()] for k in ('a0 a1 a2 a3 a4 a5 a6 b0 b1 ' + 'b2 b3 b4 b5 b6 letter ' + 'legal').split()} def fmtnum(o): @@ -70,12 +67,12 @@ def serialize(o, stream): elif o is None: stream.write_raw(b'null') elif isinstance(o, datetime): - val = o.strftime("D:%Y%m%d%H%M%%02d%z")%min(59, o.second) + val = o.strftime("D:%Y%m%d%H%M%%02d%z") % min(59, o.second) if datetime.tzinfo is not None: - val = "(%s'%s')"%(val[:-2], val[-2:]) + val = "(%s'%s')" % (val[:-2], val[-2:]) stream.write(val.encode('ascii')) else: - raise ValueError('Unknown object: %r'%o) + raise ValueError('Unknown object: %r' % o) class Name(str): @@ -83,7 +80,7 @@ class Name(str): def pdf_serialize(self, stream): raw = self.encode('ascii') if len(raw) > 126: - raise ValueError('Name too long: %r'%self) + raise ValueError('Name too long: %r' % self) raw = bytearray(raw) sharp = ord(b'#') buf = ( @@ -96,7 +93,8 @@ def escape_pdf_string(bytestring): indices = [] bad = [] ba = bytearray(bytestring) - bad_map = {10:ord('n'), 13:ord('r'), 12:ord('f'), 8:ord('b'), 9:ord('\t'), 92:ord('\\')} + bad_map = {10: ord('n'), 13: ord('r'), 12: ord('f'), + 8: ord('b'), 9: ord('\t'), 92: ord('\\')} for i, num in enumerate(ba): if num == 40: # ( indices.append((i, 40)) @@ -134,7 +132,7 @@ class UTF16String(str): if False: # Disabled as the parentheses based strings give easier to debug # PDF files - stream.write(b'<' + as_hex_bytes(raw) + b'>') + stream.write(b'<' + polyglot.as_hex_bytes(raw) + b'>') else: stream.write(b'('+escape_pdf_string(raw)+b')') @@ -143,9 +141,9 @@ class Dictionary(dict): def pdf_serialize(self, stream): stream.write(b'<<' + EOL) - sorted_keys = sorted(self, - key=lambda x:({'Type':'1', 'Subtype':'2'}.get( - x, x)+x)) + sorted_keys = sorted(self, key=lambda x: ({'Type': '1', + 'Subtype': '2'} + .get(x, x) + x)) for k in sorted_keys: serialize(Name(k), stream) stream.write(b' ') @@ -177,10 +175,10 @@ class Array(list): stream.write(b']') -class Stream(BytesIO): +class Stream(io.BytesIO): def __init__(self, compress=False): - BytesIO.__init__(self) + io.BytesIO.__init__(self) self.compress = compress self.filters = Array() @@ -213,7 +211,7 @@ class Stream(BytesIO): raw.encode('ascii')) def write_raw(self, raw): - BytesIO.write(self, raw) + io.BytesIO.write(self, raw) class Reference(object): @@ -222,11 +220,11 @@ class Reference(object): self.num, self.obj = num, obj def pdf_serialize(self, stream): 
- raw = '%d 0 R'%self.num + raw = '%d 0 R' % self.num stream.write(raw.encode('ascii')) def __repr__(self): - return '%d 0 R'%self.num + return '%d 0 R' % self.num def __str__(self): return repr(self) diff --git a/ebook_converter/polyglot.py b/ebook_converter/polyglot.py new file mode 100644 index 0000000..5a77227 --- /dev/null +++ b/ebook_converter/polyglot.py @@ -0,0 +1,59 @@ +""" +Misc conversion functions from the former polyglot package. +Most of them have something to do with converting between strings and bytes. +""" +import base64 +import binascii +import urllib.parse + + +def as_base64_unicode(x, enc='utf-8'): + if isinstance(x, str): + x = x.encode(enc) + return base64.standard_b64encode(x).decode('ascii') + + +def from_base64_bytes(x): + if isinstance(x, str): + x = x.encode('ascii') + return base64.standard_b64decode(x) + + +def as_hex_bytes(x, enc='utf-8'): + if isinstance(x, str): + x = x.encode(enc) + return binascii.hexlify(x) + + +def from_hex_bytes(x): + if isinstance(x, str): + x = x.encode('ascii') + return binascii.unhexlify(x) + + +def as_bytes(x, encoding='utf-8'): + if isinstance(x, str): + return x.encode(encoding) + if isinstance(x, bytes): + return x + if isinstance(x, bytearray): + return bytes(x) + if isinstance(x, memoryview): + return x.tobytes() + return str(x).encode(encoding) + + +def unquote(x, encoding='utf-8', errors='replace'): + # TODO(gryf): this works as follows: if x is binary, convert it to a + # string using encoding, unquote it, and then make it binary again. + # If x is a string, just pass it straight to unquote. + # This approach is mostly used with lxml etree strings, which are + # supposed to be binary because of lxml's internal representation. It is + # worth checking whether xml.etree could be used instead. + binary = isinstance(x, bytes) + if binary: + x = x.decode(encoding, errors) + ans = urllib.parse.unquote(x, encoding, errors) + if binary: + ans = ans.encode(encoding, errors) + return ans diff --git a/ebook_converter/polyglot/__init__.py b/ebook_converter/polyglot/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/ebook_converter/polyglot/binary.py b/ebook_converter/polyglot/binary.py deleted file mode 100644 index 312d45d..0000000 --- a/ebook_converter/polyglot/binary.py +++ /dev/null @@ -1,26 +0,0 @@ -from base64 import standard_b64decode, standard_b64encode -from binascii import hexlify, unhexlify - - -def as_base64_unicode(x, enc='utf-8'): - if isinstance(x, str): - x = x.encode(enc) - return standard_b64encode(x).decode('ascii') - - -def from_base64_bytes(x): - if isinstance(x, str): - x = x.encode('ascii') - return standard_b64decode(x) - - -def as_hex_bytes(x, enc='utf-8'): - if isinstance(x, str): - x = x.encode(enc) - return hexlify(x) - - -def from_hex_bytes(x): - if isinstance(x, str): - x = x.encode('ascii') - return unhexlify(x) diff --git a/ebook_converter/polyglot/builtins.py b/ebook_converter/polyglot/builtins.py deleted file mode 100644 index 522eb84..0000000 --- a/ebook_converter/polyglot/builtins.py +++ /dev/null @@ -1,10 +0,0 @@ -def as_bytes(x, encoding='utf-8'): - if isinstance(x, str): - return x.encode(encoding) - if isinstance(x, bytes): - return x - if isinstance(x, bytearray): - return bytes(x) - if isinstance(x, memoryview): - return x.tobytes() - return str(x).encode(encoding) diff --git a/ebook_converter/polyglot/urllib.py b/ebook_converter/polyglot/urllib.py deleted file mode 100644 index bcebb82..0000000 --- a/ebook_converter/polyglot/urllib.py +++ /dev/null @@ -1,17 +0,0 @@ -import urllib.parse - - -def 
unquote(x, encoding='utf-8', errors='replace'): - # TODO(gryf): this works like that: if x is a binary, convert it to - # string using encoding and make unquote. After that make it binary again. - # If x is string, just pass it to the unquote. - # This approach is mostly used within lxml etree strings, which suppose to - # be binary because of its inner representation. I'm wondering, if - # xml.etree could be used instead - to be checked. - binary = isinstance(x, bytes) - if binary: - x = x.decode(encoding, errors) - ans = urllib.parse.unquote(x, encoding, errors) - if binary: - ans = ans.encode(encoding, errors) - return ans diff --git a/ebook_converter/ptempfile.py b/ebook_converter/ptempfile.py index 6e3d2ed..5d68660 100644 --- a/ebook_converter/ptempfile.py +++ b/ebook_converter/ptempfile.py @@ -8,6 +8,7 @@ import tempfile from ebook_converter.constants_old import __version__, __appname__, \ filesystem_encoding +from ebook_converter import polyglot def cleanup(path): @@ -90,9 +91,8 @@ def base_dir(): td = os.environ.get('CALIBRE_WORKER_TEMP_DIR', None) if td is not None: from ebook_converter.utils.serialize import msgpack_loads - from ebook_converter.polyglot.binary import from_hex_bytes try: - td = msgpack_loads(from_hex_bytes(td)) + td = msgpack_loads(polyglot.from_hex_bytes(td)) except Exception: td = None if td and os.path.exists(td): diff --git a/ebook_converter/tinycss/decoding.py b/ebook_converter/tinycss/decoding.py index 1f683e1..9c157cc 100644 --- a/ebook_converter/tinycss/decoding.py +++ b/ebook_converter/tinycss/decoding.py @@ -11,7 +11,7 @@ import operator import re -from ebook_converter.polyglot.binary import from_hex_bytes +from ebook_converter import polyglot __all__ = ['decode'] # Everything else is implementation detail @@ -94,7 +94,8 @@ def try_encoding(css_bytes, encoding, fallback=True): def hex2re(hex_data): - return re.escape(from_hex_bytes(hex_data.replace(' ', '').encode('ascii'))) + return re.escape(polyglot.from_hex_bytes(hex_data.replace(' ', '') + .encode('ascii'))) class Slicer(object): diff --git a/ebook_converter/utils/fonts/utils.py b/ebook_converter/utils/fonts/utils.py index 4cc874b..3cb7d199 100644 --- a/ebook_converter/utils/fonts/utils.py +++ b/ebook_converter/utils/fonts/utils.py @@ -2,7 +2,7 @@ import struct from io import BytesIO from collections import defaultdict -from ebook_converter.polyglot.builtins import as_bytes +from ebook_converter import polyglot __license__ = 'GPL v3' @@ -38,7 +38,7 @@ def get_tables(raw): def get_table(raw, name): ''' Get the raw table bytes for the specified table in the font ''' - name = as_bytes(name.lower()) + name = polyglot.as_bytes(name.lower()) for table_tag, table, table_index, table_offset, table_checksum in get_tables(raw): if table_tag.lower() == name: return table, table_index, table_offset, table_checksum diff --git a/ebook_converter/utils/serialize.py b/ebook_converter/utils/serialize.py index e92d6ad..742122a 100644 --- a/ebook_converter/utils/serialize.py +++ b/ebook_converter/utils/serialize.py @@ -1,3 +1,6 @@ +from ebook_converter import polyglot + + MSGPACK_MIME = 'application/x-msgpack' CANARY = 'jPoAv3zOyHvQ5JFNYg4hJ9' @@ -56,11 +59,11 @@ def json_dumps(data, **kw): def decode_metadata(x, for_json): - from ebook_converter.polyglot.binary import from_base64_bytes from ebook_converter.ebooks.metadata.book.serialize import metadata_from_dict obj = metadata_from_dict(x) if for_json and obj.cover_data and obj.cover_data[1]: - obj.cover_data = obj.cover_data[0], from_base64_bytes(obj.cover_data[1]) + 
obj.cover_data = (obj.cover_data[0], + polyglot.from_base64_bytes(obj.cover_data[1])) return obj diff --git a/ebook_converter/utils/terminal.py b/ebook_converter/utils/terminal.py index 56aeff9..538997e 100644 --- a/ebook_converter/utils/terminal.py +++ b/ebook_converter/utils/terminal.py @@ -1,5 +1,9 @@ -import os, sys, re -import fcntl, termios, struct +import fcntl +import os +import re +import struct +import sys +import termios def fmt(code): diff --git a/ebook_converter/utils/zipfile.py b/ebook_converter/utils/zipfile.py index e50e71c..ff950ab 100644 --- a/ebook_converter/utils/zipfile.py +++ b/ebook_converter/utils/zipfile.py @@ -10,7 +10,7 @@ from tempfile import SpooledTemporaryFile from ebook_converter.utils import filenames as fms from ebook_converter.constants_old import filesystem_encoding from ebook_converter.ebooks.chardet import detect -from ebook_converter.polyglot.builtins import as_bytes +from ebook_converter import polyglot try: import zlib # We may need its compression method @@ -330,7 +330,7 @@ class ZipInfo (object): if os.sep != '/': os_sep, sep = os.sep, '/' if isinstance(filename, bytes): - os_sep, sep = as_bytes(os_sep), b'/' + os_sep, sep = polyglot.as_bytes(os_sep), b'/' if os_sep in filename: filename = filename.replace(os_sep, sep)
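The consolidated module keeps the behaviour of the old polyglot.binary, polyglot.builtins and polyglot.urllib helpers; only the import path changes, so every call site above swaps a per-function import for a single "from ebook_converter import polyglot". A minimal usage sketch follows, assuming the ebook_converter package is importable; the literal values are illustrative only.

    from ebook_converter import polyglot

    # as_bytes (formerly polyglot.builtins): coerce str/bytes/bytearray/memoryview
    assert polyglot.as_bytes('café') == 'café'.encode('utf-8')
    assert polyglot.as_bytes(bytearray(b'raw')) == b'raw'

    # base64/hex helpers (formerly polyglot.binary) round-trip as before
    token = polyglot.as_base64_unicode(b'cover-data')  # 'Y292ZXItZGF0YQ==', a str
    assert polyglot.from_base64_bytes(token) == b'cover-data'
    assert polyglot.from_hex_bytes(polyglot.as_hex_bytes('ab')) == b'ab'

    # unquote (formerly polyglot.urllib) preserves the input type, so the
    # bytes produced by lxml go in and come back out as bytes
    assert polyglot.unquote('a%20b') == 'a b'
    assert polyglot.unquote(b'a%20b') == b'a b'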