mirror of https://github.com/gryf/ebook-converter.git synced 2026-01-30 18:25:44 +01:00
ebook-converter/ebook_converter/ebooks/oeb/polish/utils.py
gryf 1465e4267f Sorted out mime initialization.
Every mime-related function in the main __init__.py has a flag check to
see whether initialization has already been done. This is nonsense, since
it should happen implicitly, early on, while the converter is starting.

This commit straightens things out: initialization is now done in the cli
module.

Also, the guess_type function was removed, since it was just a proxy for
the mimetypes.guess_type function.
2020-06-14 15:41:18 +02:00
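
A minimal sketch of the change described above (illustrative only; the flag
and function names below are assumptions, not the actual ebook-converter
code):

    import mimetypes

    # Before: every mime-related helper guarded itself with its own flag.
    _mime_init_done = False

    def fetch_type(path):
        global _mime_init_done
        if not _mime_init_done:        # repeated check in every helper
            mimetypes.init()
            _mime_init_done = True
        return mimetypes.guess_type(path)[0]

    # After: the cli module initializes the mime database once at startup,
    # and the helpers can assume it has already happened.
    def main():
        mimetypes.init()
        ...                            # proceed with the conversion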

240 lines · 7.7 KiB · Python

import bisect
import mimetypes
import os
import re

from ebook_converter import replace_entities
def _upper(string):
return string.upper()
def guess_type(x):
return mimetypes.guess_type(x)[0] or 'application/octet-stream'
def setup_css_parser_serialization(tab_width=2):
import css_parser
prefs = css_parser.ser.prefs
prefs.indent = tab_width * ' '
prefs.indentClosingBrace = False
prefs.omitLastSemicolon = False
def actual_case_for_name(container, name):
from ebook_converter.utils.filenames import samefile
if not container.exists(name):
raise ValueError('Cannot get actual case for %s as it does not '
'exist' % name)
parts = name.split('/')
base = ''
ans = []
for i, x in enumerate(parts):
base = '/'.join(ans + [x])
path = container.name_to_abspath(base)
pdir = os.path.dirname(path)
candidates = {os.path.join(pdir, q) for q in os.listdir(pdir)}
        if path in candidates:
correctx = x
else:
for q in candidates:
if samefile(q, path):
correctx = os.path.basename(q)
break
else:
raise RuntimeError('Something bad happened')
ans.append(correctx)
return '/'.join(ans)
def corrected_case_for_name(container, name):
parts = name.split('/')
ans = []
base = ''
for i, x in enumerate(parts):
base = '/'.join(ans + [x])
if container.exists(base):
correctx = x
else:
try:
dirname = os.path.dirname(container.name_to_abspath(base))
                candidates = set(os.listdir(dirname))
except EnvironmentError:
# one of the non-terminal components of name is a file instead
# of a directory
return None
for q in candidates:
if q.lower() == x.lower():
correctx = q
break
else:
return None
ans.append(correctx)
return '/'.join(ans)
class PositionFinder(object):
def __init__(self, raw):
pat = br'\n' if isinstance(raw, bytes) else r'\n'
self.new_lines = tuple(m.start() + 1 for m in re.finditer(pat, raw))
def __call__(self, pos):
lnum = bisect.bisect(self.new_lines, pos)
try:
offset = abs(pos - self.new_lines[lnum - 1])
except IndexError:
offset = pos
return (lnum + 1, offset)
class CommentFinder(object):
def __init__(self, raw, pat=r'(?s)/\*.*?\*/'):
self.starts, self.ends = [], []
for m in re.finditer(pat, raw):
start, end = m.span()
            self.starts.append(start)
            self.ends.append(end)
def __call__(self, offset):
if not self.starts:
return False
q = bisect.bisect(self.starts, offset) - 1
return q >= 0 and self.starts[q] <= offset <= self.ends[q]
def link_stylesheets(container, names, sheets, remove=False, mtype='text/css'):
from ebook_converter.ebooks.oeb.base import XPath, XHTML
changed_names = set()
snames = set(sheets)
lp = XPath('//h:link[@href]')
hp = XPath('//h:head')
for name in names:
root = container.parsed(name)
if remove:
for link in lp(root):
if (link.get('type', mtype) or mtype) == mtype:
container.remove_from_xml(link)
changed_names.add(name)
container.dirty(name)
        existing = {container.href_to_name(link.get('href'), name)
                    for link in lp(root)
                    if (link.get('type', mtype) or mtype) == mtype}
extra = snames - existing
if extra:
changed_names.add(name)
try:
parent = hp(root)[0]
except (TypeError, IndexError):
parent = root.makeelement(XHTML('head'))
container.insert_into_xml(root, parent, index=0)
for sheet in sheets:
if sheet in extra:
                    container.insert_into_xml(
                        parent,
                        parent.makeelement(
                            XHTML('link'), rel='stylesheet', type=mtype,
                            href=container.name_to_href(sheet, name)))
container.dirty(name)
return changed_names
def lead_text(top_elem, num_words=10):
''' Return the leading text contained in top_elem (including descendants)
up to a maximum of num_words words. More efficient than using
etree.tostring(method='text') as it does not have to serialize the entire
sub-tree rooted at top_elem.'''
pat = re.compile(r'\s+', flags=re.UNICODE)
words = []
def get_text(x, attr='text'):
ans = getattr(x, attr)
if ans:
words.extend(filter(None, pat.split(ans)))
stack = [(top_elem, 'text')]
while stack and len(words) < num_words:
elem, attr = stack.pop()
get_text(elem, attr)
if attr == 'text':
if elem is not top_elem:
stack.append((elem, 'tail'))
stack.extend(reversed(list((c, 'text') for c in elem.iterchildren('*'))))
return ' '.join(words[:num_words])
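
# Illustrative usage of lead_text() (not part of the original module). It
# walks the element in document order, collecting .text and .tail chunks,
# and stops as soon as num_words words have been gathered, e.g.:
#
#   from lxml import etree
#   root = etree.fromstring('<div>Hello <b>brave new</b> world of books</div>')
#   lead_text(root, num_words=3)  # -> 'Hello brave new'
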
def parse_css(data, fname='<string>', is_declaration=False, decode=None,
              log_level=None, css_preprocessor=None):
if log_level is None:
import logging
log_level = logging.WARNING
from css_parser import CSSParser, log
from ebook_converter.ebooks.oeb.base import _css_logger
log.setLevel(log_level)
log.raiseExceptions = False
data = data or ''
if isinstance(data, bytes):
data = data.decode('utf-8') if decode is None else decode(data)
if css_preprocessor is not None:
data = css_preprocessor(data)
parser = CSSParser(loglevel=log_level,
                       # We don't care about @import rules
fetcher=lambda x: (None, None), log=_css_logger)
if is_declaration:
data = parser.parseStyle(data, validate=False)
else:
data = parser.parseString(data, href=fname, validate=False)
return data
def handle_entities(text, func):
return func(replace_entities(text))
def apply_func_to_match_groups(match, func=_upper,
handle_entities=handle_entities):
"""
Apply the specified function to individual groups in the match object (the
result of re.search() or
the whole match if no groups were defined. Returns the replaced string.
"""
found_groups = False
i = 0
parts, pos = [], match.start()
while True:
i += 1
try:
start, end = match.span(i)
except IndexError:
break
found_groups = True
if start > -1:
parts.append(match.string[pos:start])
parts.append(handle_entities(match.string[start:end], func))
pos = end
if not found_groups:
return handle_entities(match.group(), func)
parts.append(match.string[pos:match.end()])
return ''.join(parts)
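
# Illustrative usage (not part of the original module): as a replacement
# callback for re.sub(), each captured group is upper-cased by the default
# func while the text between the groups is left untouched, e.g.:
#
#   re.sub(r'(\w+) \w+ (\w+)', apply_func_to_match_groups, 'make it loud')
#   # -> 'MAKE it LOUD'
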
def apply_func_to_html_text(match, func=_upper,
handle_entities=handle_entities):
"""
Apply the specified function only to text between HTML tag definitions.
"""
parts = re.split(r'(<[^>]+>)', match.group())
parts = (x if x.startswith('<') else handle_entities(x, func)
for x in parts)
return ''.join(parts)
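
# Illustrative usage (not part of the original module): only the text between
# tags is transformed, the markup itself is preserved, e.g.:
#
#   m = re.search(r'<p>.*?</p>', '<p>hello <b>world</b></p>')
#   apply_func_to_html_text(m)  # -> '<p>HELLO <b>WORLD</b></p>'
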
def extract(elem):
''' Remove an element from the tree, keeping elem.tail '''
p = elem.getparent()
if p is not None:
idx = p.index(elem)
p.remove(elem)
if elem.tail:
if idx > 0:
p[idx-1].tail = (p[idx-1].tail or '') + elem.tail
else:
p.text = (p.text or '') + elem.tail
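
# Illustrative usage of extract() (not part of the original module): the
# removed element's tail text is merged into the preceding sibling's tail,
# or into the parent's text when there is no preceding sibling, e.g.:
#
#   from lxml import etree
#   root = etree.fromstring('<div>a<span>b</span>tail</div>')
#   extract(root[0])
#   etree.tostring(root)  # -> b'<div>atail</div>'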