"""Detect, strip and rewrite encoding declarations in XML/HTML markup."""
import codecs
import re

__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal '
__docformat__ = 'restructuredtext en'

# Patterns that locate an encoding declaration. Every pattern MUST expose the
# declared encoding name as group 1, because all consumers below read
# ``m.group(1)``.
_encoding_pats = (
    # XML declaration
    r'<\?[^<>]+encoding\s*=\s*[\'"](.*?)[\'"][^<>]*>',
    # HTML 5 charset
    r'''<meta\s+charset=['"]([-_a-z0-9]+)['"][^<>]*>(?:\s*</meta>){0,1}''',
    # HTML 4 Pragma directive
    r'''<meta\s+?[^<>]*?content\s*=\s*['"][^'"]*?charset=([-_a-z0-9]+)[^'"]*?['"][^<>]*>(?:\s*</meta>){0,1}''',
)

# Byte order marks recognised by detect_xml_encoding(), checked in this order,
# paired with the encoding name reported for each.
_BOMS = (
    ('utf8', codecs.BOM_UTF8),
    ('utf-16-le', codecs.BOM_UTF16_LE),
    ('utf-16-be', codecs.BOM_UTF16_BE),
)

# Declared names that are rewritten to 'gbk' by detect_xml_encoding().
_GB2312_ALIASES = (
    'gb2312', 'chinese', 'csiso58gb231280', 'euc-cn', 'euccn',
    'eucgb2312-cn', 'gb2312-1980', 'gb2312-80', 'iso-ir-58',
)


def compile_pats(binary):
    """Yield every encoding pattern compiled with ``re.IGNORECASE``.

    :param binary: when true the patterns are compiled as ``bytes`` patterns
        (for matching byte strings), otherwise as ``str`` patterns.
    """
    for raw in _encoding_pats:
        if binary:
            raw = raw.encode('ascii')
        yield re.compile(raw, flags=re.IGNORECASE)


class LazyEncodingPats(object):
    """Callable that compiles the encoding patterns on first use.

    The compiled patterns are cached on the instance, separately for the
    ``bytes`` and ``str`` flavours, so compilation happens at most once per
    flavour.
    """

    def __call__(self, binary=False):
        """Yield the compiled patterns for the requested flavour."""
        attr = 'binary_pats' if binary else 'unicode_pats'
        pats = getattr(self, attr, None)
        if pats is None:
            pats = tuple(compile_pats(binary))
            setattr(self, attr, pats)
        yield from pats


lazy_encoding_pats = LazyEncodingPats()
ENTITY_PATTERN = re.compile(r'&(\S+?);')


def strip_encoding_declarations(raw, limit=50*1024, preserve_newlines=False):
    """Remove encoding declarations found in the first ``limit`` units.

    :param raw: markup as ``bytes`` or ``str``.
    :param limit: only this many leading bytes/characters are searched; the
        remainder is passed through untouched.
    :param preserve_newlines: when true each removed declaration is replaced
        by as many newlines as it contained, so line numbers do not shift.
    :return: the markup, same type as ``raw``, without the declarations.
    """
    prefix = raw[:limit]
    suffix = raw[limit:]
    is_binary = isinstance(raw, bytes)
    newline = b'\n' if is_binary else '\n'
    if preserve_newlines:
        def sub(m):
            return newline * m.group().count(newline)
    else:
        sub = b'' if is_binary else ''
    for pat in lazy_encoding_pats(is_binary):
        prefix = pat.sub(sub, prefix)
    return prefix + suffix


def replace_encoding_declarations(raw, enc='utf-8', limit=50*1024):
    """Rewrite declared encodings in the first ``limit`` units to ``enc``.

    :param raw: markup as ``bytes`` or ``str``.
    :param enc: encoding name to write; converted to the type of ``raw``.
    :param limit: only this many leading bytes/characters are searched.
    :return: ``(markup, changed)`` where ``changed`` is true if at least one
        declaration named a different encoding (compared case-insensitively)
        and was rewritten.
    """
    prefix = raw[:limit]
    suffix = raw[limit:]
    changed = False
    is_binary = isinstance(raw, bytes)
    # Make ``enc`` the same string type as ``raw`` so it can be spliced in.
    if is_binary:
        if not isinstance(enc, bytes):
            enc = enc.encode('ascii')
    elif isinstance(enc, bytes):
        enc = enc.decode('ascii')

    def sub(m):
        nonlocal changed
        ans = m.group()
        if m.group(1).lower() != enc.lower():
            changed = True
            # Offsets of group 1 relative to the start of the whole match.
            start = m.start(1) - m.start(0)
            end = m.end(1) - m.start(0)
            ans = ans[:start] + enc + ans[end:]
        return ans

    for pat in lazy_encoding_pats(is_binary):
        prefix = pat.sub(sub, prefix)
    return prefix + suffix, changed


def find_declared_encoding(raw, limit=50*1024):
    """Return the first encoding declared in the first ``limit`` units.

    Patterns are tried in the order XML declaration, HTML 5 charset, HTML 4
    pragma. The result is always ``str`` (bytes matches are decoded as ASCII
    with replacement), or ``None`` when no declaration is found.
    """
    prefix = raw[:limit]
    is_binary = isinstance(raw, bytes)
    for pat in lazy_encoding_pats(is_binary):
        m = pat.search(prefix)
        if m is not None:
            ans = m.group(1)
            if is_binary:
                ans = ans.decode('ascii', 'replace')
            return ans
    return None


def substitute_entites(raw):
    """Replace ``&...;`` entities in ``raw`` via ``xml_entity_to_unicode``."""
    from ebook_converter import xml_entity_to_unicode
    return ENTITY_PATTERN.sub(xml_entity_to_unicode, raw)


_CHARSET_ALIASES = {"macintosh": "mac-roman", "x-sjis": "shift-jis"}


def detect(*args, **kwargs):
    """Thin wrapper around ``chardet.detect`` that imports chardet lazily."""
    from chardet import detect
    return detect(*args, **kwargs)


def force_encoding(raw, verbose, assume_utf8=False):
    """Guess the encoding of ``raw`` using chardet.

    :param raw: byte string to analyse; only the first 50 KiB are examined.
    :param verbose: print a warning when detection confidence is below 100%.
    :param assume_utf8: use ``utf-8`` when detection confidence is below 100%.
    :return: lower-cased encoding name, with known aliases normalised and
        ``ascii`` widened to ``utf-8``.
    """
    from ebook_converter.constants_old import preferred_encoding
    try:
        result = detect(raw[:1024*50])
    except Exception:
        # Detection is best effort: fall back to the preferred encoding.
        result = {'encoding': preferred_encoding, 'confidence': 0}
    encoding = result['encoding']
    uncertain = result['confidence'] < 1
    if uncertain and assume_utf8:
        encoding = 'utf-8'
    if uncertain and verbose:
        print('WARNING: Encoding detection confidence for %s is %d%%'%(
            result['encoding'], result['confidence']*100))
    if not encoding:
        encoding = preferred_encoding
    encoding = encoding.lower()
    encoding = _CHARSET_ALIASES.get(encoding, encoding)
    if encoding == 'ascii':
        encoding = 'utf-8'
    return encoding


def detect_xml_encoding(raw, verbose=False, assume_utf8=False):
    """Work out the encoding of a byte string holding XML/HTML.

    Checks, in order: a byte order mark (which is stripped from the returned
    data), an encoding declaration anywhere in ``raw``, and finally
    ``force_encoding``. Names unknown to ``codecs.lookup`` become ``utf-8``.

    :return: ``(raw, encoding)``. Empty input and ``str`` input are returned
        unchanged with an encoding of ``None``.
    """
    if not raw or isinstance(raw, str):
        return raw, None
    for name, bom in _BOMS:
        if raw.startswith(bom):
            return raw[len(bom):], name
    encoding = None
    for pat in lazy_encoding_pats(True):
        match = pat.search(raw)
        if match:
            encoding = match.group(1).decode('ascii', 'replace')
            break
    if encoding is None:
        encoding = force_encoding(raw, verbose, assume_utf8=assume_utf8)
    if encoding.lower().strip() == 'macintosh':
        encoding = 'mac-roman'
    if encoding.lower().replace('_', '-').strip() in _GB2312_ALIASES:
        # Microsoft Word exports to HTML with encoding incorrectly set to
        # gb2312 instead of gbk. gbk is a superset of gb2312, anyway.
        encoding = 'gbk'
    try:
        codecs.lookup(encoding)
    except LookupError:
        encoding = 'utf-8'
    return raw, encoding


def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False,
                   resolve_entities=False, assume_utf8=False):
    '''
    Force conversion of byte string to unicode. Tries to look for XML/HTML
    encoding declaration first, if not found uses the chardet library and
    prints a warning if detection confidence is < 100%

    :param raw: markup as ``bytes`` (decoded with replacement of undecodable
        bytes) or ``str`` (left as is).
    :param verbose: passed on to encoding detection.
    :param strip_encoding_pats: remove encoding declarations from the result.
    :param resolve_entities: replace ``&...;`` entities in the result.
    :param assume_utf8: passed on to encoding detection.

    @return: (unicode, encoding used)
    '''
    if not raw:
        return '', None
    raw, encoding = detect_xml_encoding(raw, verbose=verbose,
                                        assume_utf8=assume_utf8)
    if not isinstance(raw, str):
        raw = raw.decode(encoding, 'replace')

    if strip_encoding_pats:
        raw = strip_encoding_declarations(raw)
    if resolve_entities:
        raw = substitute_entites(raw)
    return raw, encoding