
Moved misc functions from polyglot package to single polyglot module.

2021-05-25 19:06:31 +02:00
parent f46984267e
commit f47376830f
32 changed files with 244 additions and 219 deletions
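
In practice the change swaps per-submodule imports for one flat module import; a minimal before/after sketch of the pattern repeated in the hunks below (the values in the comments are illustrative, not part of the commit):

# Before: helpers lived in submodules of the polyglot package.
from ebook_converter.polyglot.builtins import as_bytes
from ebook_converter.polyglot.urllib import unquote
from ebook_converter.polyglot.binary import as_base64_unicode, from_hex_bytes

# After: a single module import, with call sites switched to the polyglot. prefix.
from ebook_converter import polyglot

polyglot.as_bytes('index.xhtml')    # b'index.xhtml'
polyglot.unquote('a%20b.html')      # 'a b.html'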

View File

@@ -5,17 +5,12 @@ import os
from lxml import html
from lxml.html import builder
from ebook_converter.polyglot.urllib import unquote as _unquote
from ebook_converter.ebooks.oeb.base import urlquote
from ebook_converter.ebooks.chardet import xml_to_unicode
from ebook_converter.customize.conversion import InputFormatPlugin
from ebook_converter.ptempfile import TemporaryDirectory
from ebook_converter.constants_old import filesystem_encoding
from ebook_converter.polyglot.builtins import as_bytes
__license__ = 'GPL v3'
__copyright__ = ('2008, Kovid Goyal <kovid at kovidgoyal.net>, '
'and Alex Bramley <a.bramley at gmail.com>.')
from ebook_converter import polyglot
class CHMInput(InputFormatPlugin):
@@ -133,7 +128,7 @@ class CHMInput(InputFormatPlugin):
def unquote(x):
if isinstance(x, str):
x = x.encode('utf-8')
return _unquote(x).decode('utf-8')
return polyglot.unquote(x).decode('utf-8')
def unquote_path(x):
y = unquote(x)
@@ -175,7 +170,7 @@ class CHMInput(InputFormatPlugin):
pretty_print=True)
f.write(raw)
else:
f.write(as_bytes(hhcdata))
f.write(polyglot.as_bytes(hhcdata))
return htmlpath, toc
def _read_file(self, name):

View File

@@ -9,7 +9,7 @@ from ebook_converter.ebooks.oeb import parse_utils
from ebook_converter.customize.conversion import OutputFormatPlugin
from ebook_converter.customize.conversion import OptionRecommendation
from ebook_converter.ptempfile import TemporaryDirectory
from ebook_converter.polyglot.builtins import as_bytes
from ebook_converter import polyglot
from ebook_converter.utils import directory
@@ -266,7 +266,8 @@ class EPUBOutput(OutputFormatPlugin):
extra_entries=extra_entries) as epub:
epub.add_dir(tdir)
if encryption is not None:
epub.writestr('META-INF/encryption.xml', as_bytes(encryption))
epub.writestr('META-INF/encryption.xml',
polyglot.as_bytes(encryption))
if metadata_xml is not None:
epub.writestr('META-INF/metadata.xml',
metadata_xml.encode('utf-8'))
@@ -308,12 +309,10 @@ class EPUBOutput(OutputFormatPlugin):
pass
def encrypt_fonts(self, uris, tdir, _uuid): # {{{
from ebook_converter.polyglot.binary import from_hex_bytes
key = re.sub(r'[^a-fA-F0-9]', '', _uuid)
if len(key) < 16:
raise ValueError('UUID identifier %r is invalid'% _uuid)
key = bytearray(from_hex_bytes((key + key)[:32]))
key = bytearray(polyglot.from_hex_bytes((key + key)[:32]))
paths = []
with directory.CurrentDir(tdir):
paths = [os.path.join(*x.split('/')) for x in uris]
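
The hunk above is in EPUBOutput.encrypt_fonts, which derives the 16-byte font-encryption key from the book UUID; a standalone sketch of that derivation using the new polyglot.from_hex_bytes helper (the wrapper function name is ours, for illustration only):

import re
from ebook_converter import polyglot

def font_key_from_uuid(uuid_str):
    # Keep only hex digits, double the string, and take the first 32 hex
    # characters (16 raw bytes), mirroring the check and slice shown above.
    key = re.sub(r'[^a-fA-F0-9]', '', uuid_str)
    if len(key) < 16:
        raise ValueError('UUID identifier %r is invalid' % uuid_str)
    return bytearray(polyglot.from_hex_bytes((key + key)[:32]))

# font_key_from_uuid('0c157a5a-cbb0-49f7-92d2-39a1d9d8b5a4') -> 16-byte bytearray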

View File

@@ -7,7 +7,7 @@ from lxml import etree
from ebook_converter.customize.conversion import OutputFormatPlugin, OptionRecommendation
from ebook_converter.ebooks.oeb.base import element
from ebook_converter.polyglot.urllib import unquote
from ebook_converter import polyglot
from ebook_converter.ptempfile import PersistentTemporaryDirectory
from ebook_converter.utils.cleantext import clean_xml_chars
from ebook_converter.utils import directory
@@ -56,7 +56,8 @@ class HTMLOutput(OutputFormatPlugin):
parent = element(parent, ('ul'))
for node in current_node.nodes:
point = element(parent, 'li')
href = relpath(os.path.abspath(unquote(node.href)),
href = relpath(os.path.abspath(polyglot
.unquote(node.href)),
os.path.dirname(ref_url))
if isinstance(href, bytes):
href = href.decode('utf-8')
@@ -84,7 +85,6 @@ class HTMLOutput(OutputFormatPlugin):
from lxml import etree
from ebook_converter.utils import zipfile
from templite import Templite
from ebook_converter.polyglot.urllib import unquote
from ebook_converter.ebooks.html.meta import EasyMeta
# read template files
@@ -156,7 +156,7 @@ class HTMLOutput(OutputFormatPlugin):
with directory.CurrentDir(output_dir):
for item in oeb_book.manifest:
path = os.path.abspath(unquote(item.href))
path = os.path.abspath(polyglot.unquote(item.href))
dir = os.path.dirname(path)
if not os.path.exists(dir):
os.makedirs(dir)
@@ -169,7 +169,7 @@ class HTMLOutput(OutputFormatPlugin):
item.unload_data_from_memory(memory=path)
for item in oeb_book.spine:
path = os.path.abspath(unquote(item.href))
path = os.path.abspath(polyglot.unquote(item.href))
dir = os.path.dirname(path)
root = item.data.getroottree()

View File

@@ -5,7 +5,7 @@ from lxml import etree
from ebook_converter.customize.conversion import (OutputFormatPlugin,
OptionRecommendation)
from ebook_converter.polyglot.urllib import unquote
from ebook_converter import polyglot
from ebook_converter.ebooks.oeb.base import OPF_MIME, NCX_MIME, PAGE_MAP_MIME, OEB_STYLES
from ebook_converter.ebooks.oeb.normalize_css import condense_sheet
from ebook_converter.utils import directory
@@ -56,7 +56,7 @@ class OEBOutput(OutputFormatPlugin):
not self.opts.expand_css and item.media_type in OEB_STYLES and hasattr(
item.data, 'cssText') and 'nook' not in self.opts.output_profile.short_name):
condense_sheet(item.data)
path = os.path.abspath(unquote(item.href))
path = os.path.abspath(polyglot.unquote(item.href))
dir = os.path.dirname(path)
if not os.path.exists(dir):
os.makedirs(dir)

View File

@@ -1,12 +1,7 @@
import os
from ebook_converter.customize.conversion import InputFormatPlugin, OptionRecommendation
from ebook_converter.polyglot.builtins import as_bytes
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
from ebook_converter import polyglot
class PDFInput(InputFormatPlugin):
@@ -72,7 +67,8 @@ class PDFInput(InputFormatPlugin):
ncxid = opf.manifest.id_for_path('toc.ncx')
if ncxid:
with open('metadata.opf', 'r+b') as f:
raw = f.read().replace(b'<spine', b'<spine toc="%s"' % as_bytes(ncxid))
raw = f.read().replace(b'<spine', b'<spine toc="%s"' %
polyglot.as_bytes(ncxid))
f.seek(0)
f.write(raw)

View File

@@ -8,7 +8,7 @@ from lxml import etree
from ebook_converter.customize.conversion import InputFormatPlugin
from ebook_converter.customize.conversion import OptionRecommendation
from ebook_converter.polyglot.builtins import as_bytes
from ebook_converter import polyglot
border_style_map = {'single': 'solid',
@@ -296,7 +296,7 @@ class RTFInput(InputFormatPlugin):
result = transform(doc)
html = u'index.xhtml'
with open(html, 'wb') as f:
res = as_bytes(transform.tostring(result))
res = polyglot.as_bytes(transform.tostring(result))
# res = res[:100].replace('xmlns:html', 'xmlns') + res[100:]
# clean multiple \n
res = re.sub(b'\n+', b'\n', res)

View File

@@ -1,22 +1,20 @@
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import io
from ebook_converter import polyglot
def base64_decode(raw):
from io import BytesIO
from ebook_converter.polyglot.binary import from_base64_bytes
# First try the python implementation as it is faster
try:
return from_base64_bytes(raw)
return polyglot.from_base64_bytes(raw)
except Exception:
pass
# Try a more robust version (adapted from FBReader sources)
A, Z, a, z, zero, nine, plus, slash, equal = bytearray(b'AZaz09+/=')
raw = bytearray(raw)
out = BytesIO()
out = io.BytesIO()
pos = 0
while pos < len(raw):
tot = 0
@@ -32,7 +30,7 @@ def base64_decode(raw):
elif zero <= byt <= nine:
num = byt - zero + 52
else:
num = {plus:62, slash:63, equal:64}.get(byt, None)
num = {plus: 62, slash: 63, equal: 64}.get(byt, None)
if num is None:
# Ignore this byte
continue
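
For context on why base64_decode keeps this tolerant fallback: the fast path rejects input with, for example, incorrect padding, and that is when the more forgiving FBReader-style loop takes over (illustrative snippet, not part of the commit):

import base64
import binascii

good = b'SGVsbG8sIHdvcmxkIQ=='      # decodes to b'Hello, world!'
bad = good.rstrip(b'=')             # same payload with the padding stripped
base64.standard_b64decode(good)     # b'Hello, world!'
try:
    base64.standard_b64decode(bad)  # raises binascii.Error: Incorrect padding
except binascii.Error:
    pass                            # here the robust fallback would take over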

View File

@@ -13,7 +13,7 @@ from ebook_converter import constants as const
from ebook_converter.constants_old import __appname__, __version__
from ebook_converter.ebooks.oeb import base
from ebook_converter.ebooks.oeb import parse_utils
from ebook_converter.polyglot.binary import as_base64_unicode
from ebook_converter import polyglot
from ebook_converter.utils import entities
from ebook_converter.utils.img import save_cover_data_to
from ebook_converter.utils.localization import lang_as_iso639_1
@@ -355,10 +355,10 @@ class FB2MLizer(object):
if item.media_type not in ('image/jpeg', 'image/png'):
imdata = save_cover_data_to(item.data,
compression_quality=70)
raw_data = as_base64_unicode(imdata)
raw_data = polyglot.as_base64_unicode(imdata)
content_type = 'image/jpeg'
else:
raw_data = as_base64_unicode(item.data)
raw_data = polyglot.as_base64_unicode(item.data)
content_type = item.media_type
# Don't put the encoded image on a single line.
step = 72

View File

@@ -14,26 +14,24 @@ from ebook_converter.ebooks.oeb import parse_utils
from ebook_converter.ebooks.oeb.stylizer import Stylizer
from ebook_converter.utils import entities
from ebook_converter.utils.logging import default_log
from ebook_converter.polyglot.builtins import as_bytes
from ebook_converter import polyglot
__license__ = 'GPL 3'
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
SELF_CLOSING_TAGS = {'area', 'base', 'basefont', 'br', 'hr', 'input', 'img', 'link', 'meta'}
SELF_CLOSING_TAGS = {'area', 'base', 'basefont', 'br', 'hr', 'input', 'img',
'link', 'meta'}
class OEB2HTML(object):
'''
Base class. All subclasses should implement dump_text to actually transform
content. Also, callers should use oeb2html to get the transformed html.
links and images can be retrieved after calling oeb2html to get the mapping
of OEB links and images to the new names used in the html returned by oeb2html.
Images will always be referenced as if they are in an images directory.
"""
Base class. All subclasses should implement dump_text to actually
transform content. Also, callers should use oeb2html to get the
transformed html. Links and images can be retrieved after calling
oeb2html to get the mapping of OEB links and images to the new names
used in the html returned by oeb2html. Images will always be referenced
as if they are in an images directory.
Use get_css to get the CSS classes for the OEB document as a string.
'''
"""
def __init__(self, log=None):
self.log = default_log if log is None else log
@@ -55,16 +53,18 @@ class OEB2HTML(object):
return self.mlize_spine(oeb_book)
def mlize_spine(self, oeb_book):
output = [
u'<html><head><meta http-equiv="Content-Type" content="text/html;charset=utf-8" /><title>%s</title></head><body>' % (
entities.prepare_string_for_xml(self.book_title))
]
output = ['<html><head><meta http-equiv="Content-Type" '
'content="text/html;charset=utf-8" />'
'<title>%s</title></head>'
'<body>' % entities.prepare_string_for_xml(self.book_title)]
for item in oeb_book.spine:
self.log.debug('Converting %s to HTML...' % item.href)
self.rewrite_ids(item.data, item)
base.rewrite_links(item.data, partial(self.rewrite_link, page=item))
base.rewrite_links(item.data, partial(self.rewrite_link,
page=item))
stylizer = Stylizer(item.data, item.href, oeb_book, self.opts)
output += self.dump_text(item.data.find(base.tag('xhtml', 'body')), stylizer, item)
output += self.dump_text(item.data.find(base.tag('xhtml', 'body')),
stylizer, item)
output.append('\n\n')
output.append('</body></html>')
return ''.join(output)
@@ -126,13 +126,14 @@ class OEB2HTML(object):
el.attrib['id'] = self.get_link_id(page.href)[1:]
continue
if 'id' in el.attrib:
el.attrib['id'] = self.get_link_id(page.href, el.attrib['id'])[1:]
el.attrib['id'] = self.get_link_id(page.href,
el.attrib['id'])[1:]
def get_css(self, oeb_book):
css = b''
for item in oeb_book.manifest:
if item.media_type == 'text/css':
css += as_bytes(item.data.cssText) + b'\n\n'
css += polyglot.as_bytes(item.data.cssText) + b'\n\n'
return css
def prepare_string_for_html(self, raw):
@@ -157,10 +158,14 @@ class OEB2HTMLNoCSSizer(OEB2HTML):
# We can only processes tags. If there isn't a tag return any text.
if not isinstance(elem.tag, (str, bytes)) \
or parse_utils.namespace(elem.tag) not in (const.XHTML_NS, const.SVG_NS):
or parse_utils.namespace(elem.tag) not in (const.XHTML_NS,
const.SVG_NS):
p = elem.getparent()
if p is not None and isinstance(p.tag, (str, bytes)) and parse_utils.namespace(p.tag) in (const.XHTML_NS, const.SVG_NS) \
and elem.tail:
if (p is not None and
isinstance(p.tag, (str, bytes)) and
parse_utils.namespace(p.tag) in (const.XHTML_NS,
const.SVG_NS) and
elem.tail):
return [elem.tail]
return ['']
@@ -176,8 +181,8 @@ class OEB2HTMLNoCSSizer(OEB2HTML):
tags.append(tag)
# Ignore anything that is set to not be displayed.
if style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') \
or style['visibility'] == 'hidden':
if (style['display'] in ('none', 'oeb-page-head', 'oeb-page-foot') or
style['visibility'] == 'hidden'):
return ['']
# Remove attributes we won't want.
@@ -186,11 +191,13 @@ class OEB2HTMLNoCSSizer(OEB2HTML):
if 'style' in attribs:
del attribs['style']
# Turn the rest of the attributes into a string we can write with the tag.
# Turn the rest of the attributes into a string we can write with the
# tag.
at = ''
for k, v in attribs.items():
at += ' %s="%s"' % (k, entities
.prepare_string_for_xml(v, attribute=True))
for key, value in attribs.items():
at += (' %s="%s"' %
(key, entities.prepare_string_for_xml(value,
attribute=True)))
# Write the tag.
text.append('<%s%s' % (tag, at))
@@ -246,11 +253,15 @@ class OEB2HTMLInlineCSSizer(OEB2HTML):
'''
# We can only processes tags. If there isn't a tag return any text.
if not isinstance(elem.tag, (str, bytes)) \
or parse_utils.namespace(elem.tag) not in (const.XHTML_NS, const.SVG_NS):
if (not isinstance(elem.tag, (str, bytes)) or
parse_utils.namespace(elem.tag) not in (const.XHTML_NS,
const.SVG_NS)):
p = elem.getparent()
if p is not None and isinstance(p.tag, (str, bytes)) and parse_utils.namespace(p.tag) in (const.XHTML_NS, const.SVG_NS) \
and elem.tail:
if (p is not None and
isinstance(p.tag, (str, bytes)) and
parse_utils.namespace(p.tag) in (const.XHTML_NS,
const.SVG_NS) and
elem.tail):
return [elem.tail]
return ['']
@@ -266,9 +277,11 @@ class OEB2HTMLInlineCSSizer(OEB2HTML):
if tag == 'body':
# Change the body to a div so we can merge multiple files.
tag = 'div'
# Add page-break-brefore: always because renders typically treat a new file (we're merging files)
# as a page break and remove all other page break types that might be set.
style_a = 'page-break-before: always; %s' % re.sub('page-break-[^:]+:[^;]+;?', '', style_a)
# Add page-break-before: always because renderers typically treat
# a new file (we're merging files) as a page break and remove all
# other page break types that might be set.
style_a = ('page-break-before: always; %s' %
re.sub('page-break-[^:]+:[^;]+;?', '', style_a))
# Remove unnecessary spaces.
style_a = re.sub(r'\s{2,}', ' ', style_a).strip()
tags.append(tag)
@@ -279,7 +292,8 @@ class OEB2HTMLInlineCSSizer(OEB2HTML):
if 'style' in attribs:
del attribs['style']
# Turn the rest of the attributes into a string we can write with the tag.
# Turn the rest of the attributes into a string we can write with
# the tag.
at = ''
for k, v in attribs.items():
at += ' %s="%s"' % (k, entities
@@ -319,43 +333,51 @@ class OEB2HTMLInlineCSSizer(OEB2HTML):
class OEB2HTMLClassCSSizer(OEB2HTML):
'''
Use CSS classes. css_style option can specify whether to use
inline classes (style tag in the head) or reference an external
CSS file called style.css.
'''
"""
Use CSS classes. css_style option can specify whether to use inline
classes (style tag in the head) or reference an external CSS file called
style.css.
"""
def mlize_spine(self, oeb_book):
output = []
for item in oeb_book.spine:
self.log.debug('Converting %s to HTML...' % item.href)
self.rewrite_ids(item.data, item)
base.rewrite_links(item.data, partial(self.rewrite_link, page=item))
base.rewrite_links(item.data, partial(self.rewrite_link,
page=item))
stylizer = Stylizer(item.data, item.href, oeb_book, self.opts)
output += self.dump_text(item.data.find(base.tag('xhtml', 'body')), stylizer, item)
output += self.dump_text(item.data.find(base.tag('xhtml', 'body')),
stylizer, item)
output.append('\n\n')
if self.opts.htmlz_class_style == 'external':
css = u'<link href="style.css" rel="stylesheet" type="text/css" />'
css = '<link href="style.css" rel="stylesheet" type="text/css" />'
else:
css = u'<style type="text/css">' + self.get_css(oeb_book) + u'</style>'
title = (u'<title>%s</title>' %
css = ('<style type="text/css">' + self.get_css(oeb_book) +
'</style>')
title = ('<title>%s</title>' %
entities.prepare_string_for_xml(self.book_title))
output = [u'<html><head><meta http-equiv="Content-Type" content="text/html;charset=utf-8" />'] + \
[css] + [title, u'</head><body>'] + output + [u'</body></html>']
output = (['<html><head><meta http-equiv="Content-Type" '
'content="text/html;charset=utf-8" />'] + [css] +
[title, '</head><body>'] + output + ['</body></html>'])
return ''.join(output)
def dump_text(self, elem, stylizer, page):
'''
"""
@elem: The element in the etree that we are working on.
@stylizer: The style information attached to the element.
'''
"""
# We can only processes tags. If there isn't a tag return any text.
if not isinstance(elem.tag, (str, bytes)) \
or parse_utils.namespace(elem.tag) not in (const.XHTML_NS, const.SVG_NS):
if (not isinstance(elem.tag, (str, bytes)) or
parse_utils.namespace(elem.tag) not in (const.XHTML_NS,
const.SVG_NS)):
p = elem.getparent()
if p is not None and isinstance(p.tag, (str, bytes)) and parse_utils.namespace(p.tag) in (const.XHTML_NS, const.SVG_NS) \
and elem.tail:
if (p is not None and
isinstance(p.tag, (str, bytes)) and
parse_utils.namespace(p.tag) in (const.XHTML_NS,
const.SVG_NS) and
elem.tail):
return [elem.tail]
return ['']
@@ -373,11 +395,12 @@ class OEB2HTMLClassCSSizer(OEB2HTML):
if 'style' in attribs:
del attribs['style']
# Turn the rest of the attributes into a string we can write with the tag.
# Turn the rest of the attributes into a string we can write with
# the tag.
at = ''
for k, v in attribs.items():
at += ' %s="%s"' % (k,
entities.prepare_string_for_xml(v, attribute=True))
at += ' %s="%s"' % (k, entities
.prepare_string_for_xml(v, attribute=True))
# Write the tag.
text.append('<%s%s' % (tag, at))

View File

@@ -5,7 +5,7 @@ import textwrap
from lxml import etree
from ebook_converter.polyglot.builtins import as_bytes
from ebook_converter import polyglot
class Canvas(etree.XSLTExtension):
@@ -292,7 +292,7 @@ class Styles(etree.XSLTExtension):
return '\n\t'.join(ans)
with open(name, 'wb') as f:
f.write(as_bytes(self.CSS))
f.write(polyglot.as_bytes(self.CSS))
for (w, sel) in [(self.text_styles, 'ts'), (self.block_styles,
'bs')]:
for i, s in enumerate(w):
@@ -300,7 +300,7 @@ class Styles(etree.XSLTExtension):
continue
rsel = '.%s%d'%(sel, i)
s = join(s)
f.write(as_bytes(rsel + ' {\n\t' + s + '\n}\n\n'))
f.write(polyglot.as_bytes(rsel + ' {\n\t' + s + '\n}\n\n'))
def execute(self, context, self_node, input_node, output_parent):
if input_node.tag == 'TextStyle':

View File

@@ -9,7 +9,7 @@ import sys
import urllib.parse
from ebook_converter.utils.config_base import tweaks
from ebook_converter.polyglot.urllib import unquote
from ebook_converter import polyglot
from ebook_converter.utils import encoding as uenc
@@ -248,9 +248,11 @@ class Resource(object):
pc = url[2]
if isinstance(pc, str):
pc = pc.encode('utf-8')
pc = unquote(pc).decode('utf-8')
self.path = os.path.abspath(os.path.join(basedir, pc.replace('/', os.sep)))
self.fragment = unquote(url[-1])
pc = polyglot.unquote(pc).decode('utf-8')
self.path = os.path.abspath(os.path.join(basedir,
pc.replace('/',
os.sep)))
self.fragment = polyglot.unquote(url[-1])
def href(self, basedir=None):
'''

View File

@@ -14,7 +14,7 @@ from ebook_converter.utils.img import save_cover_data_to
from ebook_converter.utils.imghdr import identify
from ebook_converter.ebooks.metadata import MetaInformation, check_isbn
from ebook_converter.ebooks.chardet import xml_to_unicode
from ebook_converter.polyglot.binary import as_base64_unicode
from ebook_converter import polyglot
from ebook_converter.utils import encoding as uenc
@@ -389,7 +389,7 @@ def _rnd_pic_file_name(prefix='calibre_cover_', size=32, ext='jpg'):
def _encode_into_jpeg(data):
data = save_cover_data_to(data)
return as_base64_unicode(data)
return polyglot.as_base64_unicode(data)
def _set_cover(title_info, mi, ctx):

View File

@@ -30,11 +30,11 @@ from ebook_converter.ebooks.metadata.utils import parse_opf, \
from ebook_converter.ebooks.metadata import string_to_authors, \
MetaInformation, check_isbn
from ebook_converter.ebooks.metadata.book.base import Metadata
from ebook_converter import polyglot
from ebook_converter.utils.date import parse_date, isoformat
from ebook_converter.utils.localization import get_lang, canonicalize_lang
from ebook_converter.utils.cleantext import clean_ascii_chars, clean_xml_chars
from ebook_converter.utils.config_base import tweaks
from ebook_converter.polyglot.urllib import unquote
pretty_print_opf = False
@@ -838,7 +838,7 @@ class OPF(object): # {{{
def unquote_urls(self):
def get_href(item):
raw = unquote(item.get('href', ''))
raw = polyglot.unquote(item.get('href', ''))
if not isinstance(raw, str):
raw = raw.decode('utf-8')
return raw

View File

@@ -11,7 +11,7 @@ from lxml.builder import ElementMaker
from ebook_converter.constants_old import __appname__, __version__
from ebook_converter.ebooks.chardet import xml_to_unicode
from ebook_converter.utils.cleantext import clean_xml_chars
from ebook_converter.polyglot.urllib import unquote
from ebook_converter import polyglot
NCX_NS = "http://www.daisy.org/z3986/2005/ncx/"
@@ -31,7 +31,7 @@ def parse_html_toc(data):
root = parse(clean_xml_chars(data), maybe_xhtml=True, keep_doctype=False,
sanitize_names=True)
for a in root.xpath('//*[@href and local-name()="a"]'):
purl = urllib.parse.urlparse(unquote(a.get('href')))
purl = urllib.parse.urlparse(polyglot.unquote(a.get('href')))
href, fragment = purl[2], purl[5]
if not fragment:
fragment = None
@@ -149,7 +149,7 @@ class TOC(list):
if toc is not None:
if toc.lower() not in ('ncx', 'ncxtoc'):
toc = urllib.parse.urlparse(unquote(toc))[2]
toc = urllib.parse.urlparse(polyglot.unquote(toc))[2]
toc = toc.replace('/', os.sep)
if not os.path.isabs(toc):
toc = os.path.join(self.base_path, toc)
@@ -219,7 +219,8 @@ class TOC(list):
content = content[0]
# if get_attr(content, attr='src'):
purl = urllib.parse.urlparse(content.get('src'))
href, fragment = unquote(purl[2]), unquote(purl[5])
href = polyglot.unquote(purl[2])
fragment = polyglot.unquote(purl[5])
nd = dest.add_item(href, fragment, text)
nd.play_order = play_order

View File

@@ -5,7 +5,7 @@ from io import BytesIO
from ebook_converter.utils.img import save_cover_data_to, scale_image, image_to_data, image_from_data, resize_image, png_data_to_gif_data
from ebook_converter.utils.imghdr import what
from ebook_converter.ebooks import normalize
from ebook_converter.polyglot.builtins import as_bytes
from ebook_converter import polyglot
from ebook_converter.tinycss.color3 import parse_color_string
@@ -61,7 +61,7 @@ def decode_hex_number(raw, codec='utf-8'):
def encode_string(raw):
ans = bytearray(as_bytes(raw))
ans = bytearray(polyglot.as_bytes(raw))
ans.insert(0, len(ans))
return bytes(ans)

View File

@@ -15,7 +15,7 @@ from odf.namespaces import TEXTNS as odTEXTNS
from ebook_converter.utils import directory
from ebook_converter.ebooks.oeb.base import _css_logger
from ebook_converter.polyglot.builtins import as_bytes
from ebook_converter import polyglot
class Extract(ODF2XHTML):
@@ -292,7 +292,7 @@ class Extract(ODF2XHTML):
except:
log.exception('Failed to filter CSS, conversion may be slow')
with open('index.xhtml', 'wb') as f:
f.write(as_bytes(html))
f.write(polyglot.as_bytes(html))
zf = ZipFile(stream, 'r')
self.extract_pictures(zf)
opf = OPFCreator(os.path.abspath(os.getcwd()), mi)

View File

@@ -24,7 +24,7 @@ from ebook_converter.utils.localization import get_lang
from ebook_converter.ptempfile import TemporaryDirectory
from ebook_converter.constants_old import __appname__, __version__
from ebook_converter.utils import entities
from ebook_converter.polyglot.urllib import unquote
from ebook_converter import polyglot
class OEBReader(object):
@@ -641,7 +641,7 @@ class OEBReader(object):
with TemporaryDirectory('_html_cover') as tdir:
writer = OEBWriter()
writer(self.oeb, tdir)
path = os.path.join(tdir, unquote(hcover.href))
path = os.path.join(tdir, polyglot.unquote(hcover.href))
data = render_html_svg_workaround(path, self.logger)
if not data:
data = b''

View File

@@ -5,7 +5,7 @@ import urllib.parse
from lxml import etree
from ebook_converter.utils.imghdr import identify
from ebook_converter.polyglot.urllib import unquote
from ebook_converter import polyglot
class CoverManager(object):
@@ -113,7 +113,7 @@ class CoverManager(object):
if href is not None:
templ = self.non_svg_template if self.no_svg_cover \
else self.svg_template
tp = templ % unquote(href)
tp = templ % polyglot.unquote(href)
id, href = m.generate('titlepage', 'titlepage.xhtml')
item = m.add(id, href, mimetypes.guess_type('t.xhtml')[0],
data=etree.fromstring(tp))

View File

@@ -3,8 +3,7 @@ import re
import urllib.parse
from ebook_converter.ebooks.oeb.base import XPath
from ebook_converter.polyglot.binary import from_base64_bytes
from ebook_converter.polyglot.builtins import as_bytes
from ebook_converter import polyglot
class DataURL(object):
@@ -27,14 +26,14 @@ class DataURL(object):
if ';base64' in header:
data = re.sub(r'\s+', '', data)
try:
data = from_base64_bytes(data)
data = polyglot.from_base64_bytes(data)
except Exception:
self.log.error('Found invalid base64 encoded data '
'URI, ignoring it')
continue
else:
data = urllib.parse.unquote(data)
data = as_bytes(data)
data = polyglot.as_bytes(data)
fmt = what(None, data)
if not fmt:
self.log.warn('Image encoded as data URL has unknown '

View File

@@ -17,7 +17,7 @@ from ebook_converter import constants as const
from ebook_converter.ebooks.epub import rules
from ebook_converter.ebooks.oeb import base
from ebook_converter.ebooks.oeb.polish.split import do_split
from ebook_converter.polyglot.urllib import unquote
from ebook_converter import polyglot
from ebook_converter.css_selectors import Select, SelectorError
from ebook_converter.utils import encoding as uenc
@@ -189,7 +189,7 @@ class Split(object):
nhref = anchor_map[frag if frag else None]
nhref = self.current_item.relhref(nhref)
if frag:
nhref = '#'.join((unquote(nhref), frag))
nhref = '#'.join((polyglot.unquote(nhref), frag))
return nhref
return url

View File

@@ -1,20 +1,18 @@
import codecs, zlib, numbers
from io import BytesIO
import codecs
from datetime import datetime
import io
import numbers
import zlib
from ebook_converter.utils.logging import default_log
from ebook_converter.polyglot.binary import as_hex_bytes
from ebook_converter import polyglot
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
pdf_float = lambda x: f"{x:.1f}"
EOL = b'\n'
# Sizes {{{
# Sizes
inch = 72.0
cm = inch / 2.54
mm = cm * 0.1
@@ -45,10 +43,9 @@ B2 = (_BW*2, _BH*2)
B1 = (_BH*4, _BW*2)
B0 = (_BW*4, _BH*4)
PAPER_SIZES = {k:globals()[k.upper()] for k in ('a0 a1 a2 a3 a4 a5 a6 b0 b1 b2'
' b3 b4 b5 b6 letter legal').split()}
# }}}
PAPER_SIZES = {k: globals()[k.upper()] for k in ('a0 a1 a2 a3 a4 a5 a6 b0 b1 '
'b2 b3 b4 b5 b6 letter '
'legal').split()}
def fmtnum(o):
@@ -70,12 +67,12 @@ def serialize(o, stream):
elif o is None:
stream.write_raw(b'null')
elif isinstance(o, datetime):
val = o.strftime("D:%Y%m%d%H%M%%02d%z")%min(59, o.second)
val = o.strftime("D:%Y%m%d%H%M%%02d%z") % min(59, o.second)
if datetime.tzinfo is not None:
val = "(%s'%s')"%(val[:-2], val[-2:])
val = "(%s'%s')" % (val[:-2], val[-2:])
stream.write(val.encode('ascii'))
else:
raise ValueError('Unknown object: %r'%o)
raise ValueError('Unknown object: %r' % o)
class Name(str):
@@ -83,7 +80,7 @@ class Name(str):
def pdf_serialize(self, stream):
raw = self.encode('ascii')
if len(raw) > 126:
raise ValueError('Name too long: %r'%self)
raise ValueError('Name too long: %r' % self)
raw = bytearray(raw)
sharp = ord(b'#')
buf = (
@@ -96,7 +93,8 @@ def escape_pdf_string(bytestring):
indices = []
bad = []
ba = bytearray(bytestring)
bad_map = {10:ord('n'), 13:ord('r'), 12:ord('f'), 8:ord('b'), 9:ord('\t'), 92:ord('\\')}
bad_map = {10: ord('n'), 13: ord('r'), 12: ord('f'),
8: ord('b'), 9: ord('\t'), 92: ord('\\')}
for i, num in enumerate(ba):
if num == 40: # (
indices.append((i, 40))
@@ -134,7 +132,7 @@ class UTF16String(str):
if False:
# Disabled as the parentheses based strings give easier to debug
# PDF files
stream.write(b'<' + as_hex_bytes(raw) + b'>')
stream.write(b'<' + polyglot.as_hex_bytes(raw) + b'>')
else:
stream.write(b'('+escape_pdf_string(raw)+b')')
@@ -143,9 +141,9 @@ class Dictionary(dict):
def pdf_serialize(self, stream):
stream.write(b'<<' + EOL)
sorted_keys = sorted(self,
key=lambda x:({'Type':'1', 'Subtype':'2'}.get(
x, x)+x))
sorted_keys = sorted(self, key=lambda x: ({'Type': '1',
'Subtype': '2'}
.get(x, x) + x))
for k in sorted_keys:
serialize(Name(k), stream)
stream.write(b' ')
@@ -177,10 +175,10 @@ class Array(list):
stream.write(b']')
class Stream(BytesIO):
class Stream(io.BytesIO):
def __init__(self, compress=False):
BytesIO.__init__(self)
io.BytesIO.__init__(self)
self.compress = compress
self.filters = Array()
@@ -213,7 +211,7 @@ class Stream(BytesIO):
raw.encode('ascii'))
def write_raw(self, raw):
BytesIO.write(self, raw)
io.BytesIO.write(self, raw)
class Reference(object):
@@ -222,11 +220,11 @@ class Reference(object):
self.num, self.obj = num, obj
def pdf_serialize(self, stream):
raw = '%d 0 R'%self.num
raw = '%d 0 R' % self.num
stream.write(raw.encode('ascii'))
def __repr__(self):
return '%d 0 R'%self.num
return '%d 0 R' % self.num
def __str__(self):
return repr(self)

View File

@@ -0,0 +1,59 @@
"""
Misc converting functions from the polyglot package.
Most of them have something to do with converting between strings and binary data.
"""
import base64
import binascii
import urllib.parse
def as_base64_unicode(x, enc='utf-8'):
if isinstance(x, str):
x = x.encode(enc)
return base64.standard_b64encode(x).decode('ascii')
def from_base64_bytes(x):
if isinstance(x, str):
x = x.encode('ascii')
return base64.standard_b64decode(x)
def as_hex_bytes(x, enc='utf-8'):
if isinstance(x, str):
x = x.encode(enc)
return binascii.hexlify(x)
def from_hex_bytes(x):
if isinstance(x, str):
x = x.encode('ascii')
return binascii.unhexlify(x)
def as_bytes(x, encoding='utf-8'):
if isinstance(x, str):
return x.encode(encoding)
if isinstance(x, bytes):
return x
if isinstance(x, bytearray):
return bytes(x)
if isinstance(x, memoryview):
return x.tobytes()
return str(x).encode(encoding)
def unquote(x, encoding='utf-8', errors='replace'):
# TODO(gryf): this works as follows: if x is bytes, decode it to str
# using encoding, unquote it, and then encode it back to bytes. If x is
# already a str, just pass it straight to unquote.
# This round-trip is mostly needed for lxml etree strings, which are
# expected to be bytes because of lxml's internal representation. It is
# worth checking whether xml.etree could be used instead.
binary = isinstance(x, bytes)
if binary:
x = x.decode(encoding, errors)
ans = urllib.parse.unquote(x, encoding, errors)
if binary:
ans = ans.encode(encoding, errors)
return ans
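
For reference, the consolidated helpers round-trip like this; a quick usage sketch with illustrative values (not part of the commit):

from ebook_converter import polyglot

polyglot.as_bytes('caf\u00e9')          # b'caf\xc3\xa9' (UTF-8 by default)
polyglot.as_base64_unicode(b'abc')      # 'YWJj'
polyglot.from_base64_bytes('YWJj')      # b'abc'
polyglot.as_hex_bytes('hi')             # b'6869'
polyglot.from_hex_bytes(b'6869')        # b'hi'

# unquote preserves the input type: str in -> str out, bytes in -> bytes out.
polyglot.unquote('a%20b')               # 'a b'
polyglot.unquote(b'a%20b')              # b'a b'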

View File

@@ -1,26 +0,0 @@
from base64 import standard_b64decode, standard_b64encode
from binascii import hexlify, unhexlify
def as_base64_unicode(x, enc='utf-8'):
if isinstance(x, str):
x = x.encode(enc)
return standard_b64encode(x).decode('ascii')
def from_base64_bytes(x):
if isinstance(x, str):
x = x.encode('ascii')
return standard_b64decode(x)
def as_hex_bytes(x, enc='utf-8'):
if isinstance(x, str):
x = x.encode(enc)
return hexlify(x)
def from_hex_bytes(x):
if isinstance(x, str):
x = x.encode('ascii')
return unhexlify(x)

View File

@@ -1,10 +0,0 @@
def as_bytes(x, encoding='utf-8'):
if isinstance(x, str):
return x.encode(encoding)
if isinstance(x, bytes):
return x
if isinstance(x, bytearray):
return bytes(x)
if isinstance(x, memoryview):
return x.tobytes()
return str(x).encode(encoding)

View File

@@ -1,17 +0,0 @@
import urllib.parse
def unquote(x, encoding='utf-8', errors='replace'):
# TODO(gryf): this works like that: if x is a binary, convert it to
# string using encoding and make unquote. After that make it binary again.
# If x is string, just pass it to the unquote.
# This approach is mostly used within lxml etree strings, which suppose to
# be binary because of its inner representation. I'm wondering, if
# xml.etree could be used instead - to be checked.
binary = isinstance(x, bytes)
if binary:
x = x.decode(encoding, errors)
ans = urllib.parse.unquote(x, encoding, errors)
if binary:
ans = ans.encode(encoding, errors)
return ans

View File

@@ -8,6 +8,7 @@ import tempfile
from ebook_converter.constants_old import __version__, __appname__, \
filesystem_encoding
from ebook_converter import polyglot
def cleanup(path):
@@ -90,9 +91,8 @@ def base_dir():
td = os.environ.get('CALIBRE_WORKER_TEMP_DIR', None)
if td is not None:
from ebook_converter.utils.serialize import msgpack_loads
from ebook_converter.polyglot.binary import from_hex_bytes
try:
td = msgpack_loads(from_hex_bytes(td))
td = msgpack_loads(polyglot.from_hex_bytes(td))
except Exception:
td = None
if td and os.path.exists(td):

View File

@@ -11,7 +11,7 @@
import operator
import re
from ebook_converter.polyglot.binary import from_hex_bytes
from ebook_converter import polyglot
__all__ = ['decode'] # Everything else is implementation detail
@@ -94,7 +94,8 @@ def try_encoding(css_bytes, encoding, fallback=True):
def hex2re(hex_data):
return re.escape(from_hex_bytes(hex_data.replace(' ', '').encode('ascii')))
return re.escape(polyglot.from_hex_bytes(hex_data.replace(' ', '')
.encode('ascii')))
class Slicer(object):

View File

@@ -2,7 +2,7 @@ import struct
from io import BytesIO
from collections import defaultdict
from ebook_converter.polyglot.builtins import as_bytes
from ebook_converter import polyglot
__license__ = 'GPL v3'
@@ -38,7 +38,7 @@ def get_tables(raw):
def get_table(raw, name):
''' Get the raw table bytes for the specified table in the font '''
name = as_bytes(name.lower())
name = polyglot.as_bytes(name.lower())
for table_tag, table, table_index, table_offset, table_checksum in get_tables(raw):
if table_tag.lower() == name:
return table, table_index, table_offset, table_checksum

View File

@@ -1,3 +1,6 @@
from ebook_converter import polyglot
MSGPACK_MIME = 'application/x-msgpack'
CANARY = 'jPoAv3zOyHvQ5JFNYg4hJ9'
@@ -56,11 +59,11 @@ def json_dumps(data, **kw):
def decode_metadata(x, for_json):
from ebook_converter.polyglot.binary import from_base64_bytes
from ebook_converter.ebooks.metadata.book.serialize import metadata_from_dict
obj = metadata_from_dict(x)
if for_json and obj.cover_data and obj.cover_data[1]:
obj.cover_data = obj.cover_data[0], from_base64_bytes(obj.cover_data[1])
obj.cover_data = (obj.cover_data[0],
polyglot.from_base64_bytes(obj.cover_data[1]))
return obj

View File

@@ -1,5 +1,9 @@
import os, sys, re
import fcntl, termios, struct
import fcntl
import os
import re
import struct
import sys
import termios
def fmt(code):

View File

@@ -10,7 +10,7 @@ from tempfile import SpooledTemporaryFile
from ebook_converter.utils import filenames as fms
from ebook_converter.constants_old import filesystem_encoding
from ebook_converter.ebooks.chardet import detect
from ebook_converter.polyglot.builtins import as_bytes
from ebook_converter import polyglot
try:
import zlib # We may need its compression method
@@ -330,7 +330,7 @@ class ZipInfo (object):
if os.sep != '/':
os_sep, sep = os.sep, '/'
if isinstance(filename, bytes):
os_sep, sep = as_bytes(os_sep), b'/'
os_sep, sep = polyglot.as_bytes(os_sep), b'/'
if os_sep in filename:
filename = filename.replace(os_sep, sep)