mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-04-24 15:11:30 +02:00
Initial import
This commit is contained in:
@@ -0,0 +1,231 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=utf-8
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
import re, os
|
||||
from bisect import bisect
|
||||
|
||||
from calibre import guess_type as _guess_type, replace_entities
|
||||
from polyglot.builtins import filter
|
||||
|
||||
|
||||
def guess_type(x):
    ''' Guess the MIME type for the file name x, falling back to
    application/octet-stream when no type can be determined. '''
    mime_type = _guess_type(x)[0]
    return mime_type if mime_type else 'application/octet-stream'
|
||||
|
||||
|
||||
def setup_css_parser_serialization(tab_width=2):
    ''' Configure the global css_parser serializer preferences: indent rules
    with tab_width spaces, leave closing braces unindented, and always emit
    the final semicolon of a declaration block. '''
    import css_parser
    serializer_prefs = css_parser.ser.prefs
    serializer_prefs.indent = ' ' * tab_width
    serializer_prefs.indentClosingBrace = False
    serializer_prefs.omitLastSemicolon = False
|
||||
|
||||
|
||||
def actual_case_for_name(container, name):
    ''' Return name with each path component replaced by the case it actually
    has on disk.  Raises ValueError if name does not exist in the container,
    and RuntimeError if a component cannot be matched to a directory entry.

    :param container: object providing exists() and name_to_abspath()
    :param name: a '/'-separated container-internal path
    '''
    from calibre.utils.filenames import samefile
    if not container.exists(name):
        raise ValueError('Cannot get actual case for %s as it does not exist' % name)
    ans = []
    for x in name.split('/'):
        base = '/'.join(ans + [x])
        path = container.name_to_abspath(base)
        pdir = os.path.dirname(path)
        # Compare against bare directory-entry names.  The previous code
        # built a set of ABSOLUTE paths and tested the bare component x
        # against it, so this fast path could never match and every
        # component fell through to the samefile() scan below.
        candidates = set(os.listdir(pdir))
        if x in candidates:
            # Exact-case entry exists on disk, nothing to correct.
            correctx = x
        else:
            # Case differs (e.g. case-insensitive filesystem): find the
            # directory entry that refers to the same file.
            for q in candidates:
                if samefile(os.path.join(pdir, q), path):
                    correctx = q
                    break
            else:
                raise RuntimeError('Something bad happened')
        ans.append(correctx)
    return '/'.join(ans)
|
||||
|
||||
|
||||
def corrected_case_for_name(container, name):
    ''' Return name with the case of each path component corrected to match
    an entry that actually exists on disk (matching case-insensitively), or
    None when some component has no such match.

    :param container: object providing exists() and name_to_abspath()
    :param name: a '/'-separated container-internal path
    :return: the corrected '/'-separated name, or None
    '''
    ans = []
    for x in name.split('/'):
        base = '/'.join(ans + [x])
        if container.exists(base):
            # The component already matches an existing entry as-is.
            correctx = x
        else:
            try:
                # set() directly instead of the redundant set comprehension
                candidates = set(os.listdir(os.path.dirname(container.name_to_abspath(base))))
            except EnvironmentError:
                return None  # one of the non-terminal components of name is a file instead of a directory
            for q in candidates:
                if q.lower() == x.lower():
                    correctx = q
                    break
            else:
                return None
        ans.append(correctx)
    return '/'.join(ans)
|
||||
|
||||
|
||||
class PositionFinder(object):
    ''' Map a character offset in a string (or bytes) to a 1-based
    (line_number, column_offset) pair. '''

    def __init__(self, raw):
        # Keep the pattern type consistent with the input type.
        pat = br'\n' if isinstance(raw, bytes) else r'\n'
        # Offsets of the first character of every line after the first.
        self.new_lines = tuple(m.start() + 1 for m in re.finditer(pat, raw))

    def __call__(self, pos):
        ''' Return (line, offset-within-line) for character position pos. '''
        lnum = bisect(self.new_lines, pos)
        if lnum == 0:
            # pos is on the first line, so its column is pos itself.  The
            # previous code indexed self.new_lines[lnum - 1] here, which for
            # lnum == 0 silently read the LAST entry ([-1]) and produced a
            # wrong column whenever the text contained any newline; the old
            # IndexError fallback only triggered for newline-free input.
            offset = pos
        else:
            offset = pos - self.new_lines[lnum - 1]
        return (lnum + 1, offset)
|
||||
|
||||
|
||||
class CommentFinder(object):
    ''' Record the spans of all comments (C-style block comments by default)
    in a string, so that later queries can tell whether a given offset lies
    inside a comment. '''

    def __init__(self, raw, pat=r'(?s)/\*.*?\*/'):
        spans = [m.span() for m in re.finditer(pat, raw)]
        self.starts = [s for s, e in spans]
        self.ends = [e for s, e in spans]

    def __call__(self, offset):
        ''' Return True if offset falls within any recorded comment span. '''
        if not self.starts:
            return False
        i = bisect(self.starts, offset) - 1
        return i >= 0 and self.starts[i] <= offset <= self.ends[i]
|
||||
|
||||
|
||||
def link_stylesheets(container, names, sheets, remove=False, mtype='text/css'):
    ''' Ensure every HTML file in names links to every stylesheet in sheets,
    optionally removing all existing CSS <link> tags first.  Returns the set
    of names whose markup was changed. '''
    from calibre.ebooks.oeb.base import XPath, XHTML
    modified = set()
    wanted = set(sheets)
    find_links = XPath('//h:link[@href]')
    find_head = XPath('//h:head')

    def is_css_link(link):
        # A missing or empty type attribute counts as mtype.
        return (link.get('type', mtype) or mtype) == mtype

    for name in names:
        root = container.parsed(name)
        if remove:
            for link in find_links(root):
                if is_css_link(link):
                    container.remove_from_xml(link)
                    modified.add(name)
                    container.dirty(name)
        linked = {container.href_to_name(l.get('href'), name)
                  for l in find_links(root) if is_css_link(l)}
        missing = wanted - linked
        if missing:
            modified.add(name)
            try:
                head = find_head(root)[0]
            except (TypeError, IndexError):
                # No <head>: create one at the top of the document.
                head = root.makeelement(XHTML('head'))
                container.insert_into_xml(root, head, index=0)
            # Insert in the caller-specified sheet order.
            for sheet in sheets:
                if sheet in missing:
                    link = head.makeelement(
                        XHTML('link'), rel='stylesheet', type=mtype,
                        href=container.name_to_href(sheet, name))
                    container.insert_into_xml(head, link)
            container.dirty(name)

    return modified
|
||||
|
||||
|
||||
def lead_text(top_elem, num_words=10):
    ''' Return the leading text contained in top_elem (including descendants)
    up to a maximum of num_words words. More efficient than using
    etree.tostring(method='text') as it does not have to serialize the entire
    sub-tree rooted at top_elem.'''
    whitespace = re.compile(r'\s+', flags=re.UNICODE)
    collected = []

    def collect(node, attr='text'):
        # Split the node's text/tail on whitespace, dropping empty tokens.
        chunk = getattr(node, attr)
        if chunk:
            collected.extend(tok for tok in whitespace.split(chunk) if tok)

    pending = [(top_elem, 'text')]
    while pending and len(collected) < num_words:
        node, attr = pending.pop()
        collect(node, attr)
        if attr == 'text':
            # Tail text belongs to the parent's flow, so skip it for the root.
            if node is not top_elem:
                pending.append((node, 'tail'))
            # Reverse so children are popped in document order.
            pending.extend(reversed([(child, 'text') for child in node.iterchildren('*')]))
    return ' '.join(collected[:num_words])
|
||||
|
||||
|
||||
def parse_css(data, fname='<string>', is_declaration=False, decode=None, log_level=None, css_preprocessor=None):
    ''' Parse CSS source into a css_parser stylesheet object (or a style
    declaration when is_declaration is True).

    :param data: CSS source, str or bytes (empty/None treated as '')
    :param fname: href recorded on the parsed stylesheet
    :param is_declaration: parse as a bare property-declaration block
    :param decode: optional callable to decode bytes input; utf-8 otherwise
    :param log_level: level for the css_parser log (defaults to WARNING)
    :param css_preprocessor: optional callable applied to the text first
    '''
    if log_level is None:
        import logging
        log_level = logging.WARNING
    from css_parser import CSSParser, log
    from calibre.ebooks.oeb.base import _css_logger
    log.setLevel(log_level)
    log.raiseExceptions = False
    css = data or ''
    if isinstance(css, bytes):
        css = css.decode('utf-8') if decode is None else decode(css)
    if css_preprocessor is not None:
        css = css_preprocessor(css)
    # The no-op fetcher means we dont care about @import rules.
    parser = CSSParser(loglevel=log_level, fetcher=lambda url: (None, None),
                       log=_css_logger)
    if is_declaration:
        return parser.parseStyle(css, validate=False)
    return parser.parseString(css, href=fname, validate=False)
|
||||
|
||||
|
||||
def handle_entities(text, func):
    ''' Decode HTML entities in text, then apply func to the result. '''
    unescaped = replace_entities(text)
    return func(unescaped)
|
||||
|
||||
|
||||
def apply_func_to_match_groups(match, func=icu_upper, handle_entities=handle_entities):
    '''Apply the specified function to individual groups in the match object
    (the result of re.search()) or to the whole match if no groups were
    defined. Returns the replaced string.'''
    transform = lambda text: handle_entities(text, func)
    pieces = []
    cursor = match.start()
    have_groups = False
    group_num = 0
    while True:
        group_num += 1
        try:
            gstart, gend = match.span(group_num)
        except IndexError:
            # No more groups in the pattern.
            break
        have_groups = True
        if gstart > -1:
            # Keep the text before the group verbatim, transform the group.
            pieces.append(match.string[cursor:gstart])
            pieces.append(transform(match.string[gstart:gend]))
            cursor = gend
    if not have_groups:
        return transform(match.group())
    pieces.append(match.string[cursor:match.end()])
    return ''.join(pieces)
|
||||
|
||||
|
||||
def apply_func_to_html_text(match, func=icu_upper, handle_entities=handle_entities):
    ''' Apply the specified function only to text between HTML tag definitions. '''
    transform = lambda text: handle_entities(text, func)
    pieces = re.split(r'(<[^>]+>)', match.group())
    # Odd-indexed pieces are the captured tags: pass them through untouched.
    return ''.join(p if p.startswith('<') else transform(p) for p in pieces)
|
||||
|
||||
|
||||
def extract(elem):
    ''' Remove an element from the tree, keeping elem.tail '''
    parent = elem.getparent()
    if parent is None:
        return
    pos = parent.index(elem)
    parent.remove(elem)
    tail = elem.tail
    if tail:
        if pos > 0:
            # Attach the tail to the preceding sibling's tail.
            prev = parent[pos - 1]
            prev.tail = (prev.tail or '') + tail
        else:
            # No preceding sibling: the tail becomes part of parent.text.
            parent.text = (parent.text or '') + tail
|
||||
Reference in New Issue
Block a user