# Mirror of https://github.com/gryf/ebook-converter.git (synced 2026-01-06)
#!/usr/bin/env python2
|
|
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
|
from __future__ import absolute_import, division, print_function, unicode_literals
|
|
|
|
__license__ = 'GPL v3'
|
|
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
|
__docformat__ = 'restructuredtext en'
|
|
|
|
import re, codecs
|
|
from ebook_converter.polyglot.builtins import unicode_type
|
|
|
|
# Regex source strings (compiled lazily via compile_pats) that capture the
# declared character encoding of an XML/HTML document in group 1.
_encoding_pats = (
    # XML declaration, e.g. <?xml version="1.0" encoding="utf-8"?>
    r'<\?[^<>]+encoding\s*=\s*[\'"](.*?)[\'"][^<>]*>',
    # HTML 5 charset, e.g. <meta charset="utf-8">
    r'''<meta\s+charset=['"]([-_a-z0-9]+)['"][^<>]*>(?:\s*</meta>){0,1}''',
    # HTML 4 Pragma directive, e.g.
    # <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
    r'''<meta\s+?[^<>]*?content\s*=\s*['"][^'"]*?charset=([-_a-z0-9]+)[^'"]*?['"][^<>]*>(?:\s*</meta>){0,1}''',
)
|
|
|
|
|
|
def compile_pats(binary):
    '''Yield the encoding-declaration regexes compiled case-insensitively.

    :param binary: when true, compile bytes patterns so they can be applied
        to undecoded input
    '''
    for pattern in _encoding_pats:
        source = pattern.encode('ascii') if binary else pattern
        yield re.compile(source, flags=re.IGNORECASE)
|
|
|
|
|
|
class LazyEncodingPats(object):
    '''Callable that compiles the encoding-declaration regexes on first use
    and caches them — separately for the unicode and bytes variants.'''

    def __call__(self, binary=False):
        cache_attr = 'binary_pats' if binary else 'unicode_pats'
        try:
            cached = getattr(self, cache_attr)
        except AttributeError:
            # First request for this variant: compile and memoize.
            cached = tuple(compile_pats(binary))
            setattr(self, cache_attr, cached)
        for pat in cached:
            yield pat
|
|
|
|
|
|
# Shared, lazily-populated cache of the compiled encoding patterns.
lazy_encoding_pats = LazyEncodingPats()
# Matches any XML/HTML entity reference, e.g. &amp; or &#x2014;
ENTITY_PATTERN = re.compile(r'&(\S+?);')
|
|
|
|
|
|
def strip_encoding_declarations(raw, limit=50*1024, preserve_newlines=False):
    '''Remove XML/HTML encoding declarations from the first *limit*
    characters (or bytes) of *raw*.

    :param raw: markup as a unicode or bytes string
    :param preserve_newlines: when True, each removed declaration is
        replaced by as many newlines as it contained, so line numbers in
        the rest of the document do not shift
    :return: *raw* with the declarations stripped
    '''
    head, tail = raw[:limit], raw[limit:]
    is_binary = isinstance(raw, bytes)
    if preserve_newlines:
        nl = b'\n' if is_binary else '\n'
        replacement = lambda m: nl * m.group().count(nl)
    else:
        replacement = b'' if is_binary else u''
    for pat in lazy_encoding_pats(is_binary):
        head = pat.sub(replacement, head)
    return head + tail
|
|
|
|
|
|
def replace_encoding_declarations(raw, enc='utf-8', limit=50*1024):
    '''Replace the encoding named by any XML/HTML encoding declaration in
    the first *limit* characters (or bytes) of *raw* with *enc*.

    :param raw: markup as a unicode or bytes string
    :param enc: encoding name to substitute; coerced to the type of *raw*
    :return: tuple of (modified raw, True if any declaration was changed)
    '''
    prefix = raw[:limit]
    suffix = raw[limit:]
    changed = [False]  # mutable cell so the nested sub() can report changes
    is_binary = isinstance(raw, bytes)
    # Coerce enc to the same type (bytes/unicode) as raw so that slicing
    # and concatenation below are type-consistent.
    if is_binary:
        if not isinstance(enc, bytes):
            enc = enc.encode('ascii')
    else:
        if isinstance(enc, bytes):
            enc = enc.decode('ascii')

    def sub(m):
        ans = m.group()
        if m.group(1).lower() != enc.lower():
            changed[0] = True
        # Offsets of group 1 measured from the start of the whole match.
        # Positive offsets avoid the corner case of the original
        # negative-index arithmetic: if group 1 ended exactly at the match
        # end, ans[0:] would have re-appended the entire match instead of
        # the (empty) tail after the group.
        start = m.start(1) - m.start()
        end = m.end(1) - m.start()
        return ans[:start] + enc + ans[end:]

    for pat in lazy_encoding_pats(is_binary):
        prefix = pat.sub(sub, prefix)
    raw = prefix + suffix
    return raw, changed[0]
|
|
|
|
|
|
def find_declared_encoding(raw, limit=50*1024):
    '''Return the encoding named by the first XML/HTML encoding declaration
    found in the first *limit* characters (or bytes) of *raw* as a unicode
    string, or None when no declaration is present.'''
    head = raw[:limit]
    is_binary = isinstance(raw, bytes)
    for pat in lazy_encoding_pats(is_binary):
        match = pat.search(head)
        if match is None:
            continue
        encoding = match.group(1)
        if is_binary:
            encoding = encoding.decode('ascii', 'replace')
        return encoding
|
|
|
|
|
|
def substitute_entites(raw):
    '''Replace every XML/HTML entity reference in *raw* with its unicode
    equivalent.

    (The misspelt name is part of the public API and kept for
    compatibility.)
    '''
    from ebook_converter import xml_entity_to_unicode
    return ENTITY_PATTERN.sub(xml_entity_to_unicode, raw)
|
|
|
|
|
|
# Map legacy/non-standard charset names sometimes found in documents to the
# equivalent names understood by Python's codec machinery.
_CHARSET_ALIASES = {"macintosh" : "mac-roman",
                    "x-sjis" : "shift-jis"}
|
|
|
|
|
|
def detect(*args, **kwargs):
    '''Lazily import and delegate to chardet.detect.'''
    from chardet import detect as chardet_detect
    return chardet_detect(*args, **kwargs)
|
|
|
|
|
|
def force_encoding(raw, verbose, assume_utf8=False):
    '''Guess the encoding of the bytestring *raw* using chardet.

    :param raw: the undecoded document (only the first 50 KiB is sampled)
    :param verbose: when true, print a warning if detection confidence is
        below 100%
    :param assume_utf8: when true, prefer utf-8 whenever detection is not
        fully confident
    :return: a lower-cased codec name suitable for bytes.decode
    '''
    from ebook_converter.constants import preferred_encoding

    try:
        chardet = detect(raw[:1024*50])
    except Exception:
        # Was a bare except:, which also swallowed SystemExit and
        # KeyboardInterrupt. Detection failures (chardet missing, odd
        # input) fall back to the platform encoding with zero confidence.
        chardet = {'encoding':preferred_encoding, 'confidence':0}
    encoding = chardet['encoding']
    if chardet['confidence'] < 1 and assume_utf8:
        encoding = 'utf-8'
    if chardet['confidence'] < 1 and verbose:
        print('WARNING: Encoding detection confidence for %s is %d%%'%(
            chardet['encoding'], chardet['confidence']*100))
    if not encoding:
        encoding = preferred_encoding
    encoding = encoding.lower()
    encoding = _CHARSET_ALIASES.get(encoding, encoding)
    if encoding == 'ascii':
        # ascii is a strict subset of utf-8, so utf-8 is always safe here
        encoding = 'utf-8'
    return encoding
|
|
|
|
|
|
def detect_xml_encoding(raw, verbose=False, assume_utf8=False):
    '''Detect the encoding of the XML/HTML bytestring *raw*.

    A Byte Order Mark is honoured first, then an in-document encoding
    declaration; failing both, the encoding is guessed statistically.
    Known-bad encoding names are corrected along the way.

    :return: tuple of (raw with any BOM stripped, encoding name); if *raw*
        is empty or already unicode it is returned unchanged with None as
        the encoding
    '''
    if not raw or isinstance(raw, unicode_type):
        return raw, None
    # A BOM, when present, identifies the encoding unambiguously.
    for name in ('utf8', 'utf-16-le', 'utf-16-be'):
        attr = 'BOM_' + name.upper().replace('-16', '16').replace('-', '_')
        bom = getattr(codecs, attr)
        if raw.startswith(bom):
            return raw[len(bom):], name
    encoding = None
    for pat in lazy_encoding_pats(True):
        match = pat.search(raw)
        if match:
            encoding = match.group(1).decode('ascii', 'replace')
            break
    if encoding is None:
        encoding = force_encoding(raw, verbose, assume_utf8=assume_utf8)
    if encoding.lower().strip() == 'macintosh':
        encoding = 'mac-roman'
    if encoding.lower().replace('_', '-').strip() in (
            'gb2312', 'chinese', 'csiso58gb231280', 'euc-cn', 'euccn',
            'eucgb2312-cn', 'gb2312-1980', 'gb2312-80', 'iso-ir-58'):
        # Microsoft Word exports to HTML with encoding incorrectly set to
        # gb2312 instead of gbk. gbk is a superset of gb2312, anyway.
        encoding = 'gbk'
    try:
        codecs.lookup(encoding)
    except LookupError:
        # Unknown codec name; utf-8 is the least bad fallback.
        encoding = 'utf-8'

    return raw, encoding
|
|
|
|
|
|
def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False,
        resolve_entities=False, assume_utf8=False):
    '''
    Force conversion of byte string to unicode. Tries to look for XML/HTML
    encoding declaration first, if not found uses the chardet library and
    prints a warning if detection confidence is < 100%
    @return: (unicode, encoding used)
    '''
    if not raw:
        return '', None
    raw, used_encoding = detect_xml_encoding(
        raw, verbose=verbose, assume_utf8=assume_utf8)
    if not isinstance(raw, unicode_type):
        raw = raw.decode(used_encoding, 'replace')
    if strip_encoding_pats:
        # Any declarations left behind would lie about the (now unicode)
        # text, so optionally remove them.
        raw = strip_encoding_declarations(raw)
    if resolve_entities:
        raw = substitute_entites(raw)
    return raw, used_encoding
|