#########################################################################
#                                                                       #
#                                                                       #
#   copyright 2002 Paul Henry Tremblay                                  #
#                                                                       #
#   This program is distributed in the hope that it will be useful,     #
#   but WITHOUT ANY WARRANTY; without even the implied warranty of      #
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU    #
#   General Public License for more details.                            #
#                                                                       #
#                                                                       #
#########################################################################
import os
import re

from ebook_converter.ebooks.rtf2xml import copy
from ebook_converter.utils.mreplace import MReplace
from ebook_converter.ptempfile import better_mktemp
from ebook_converter.polyglot.builtins import codepoint_to_chr, range, filter, map
from . import open_for_read, open_for_write


class Tokenize:
    """Tokenize RTF into one line per field.

    Each line will contain information useful for the rest of the script.
    """
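
    # A minimal usage sketch (mirroring the commented-out driver at the
    # bottom of this file); the bug handler may be any exception class:
    #
    #   tok = Tokenize('input.rtf', bug_handler=Exception)
    #   tok.tokenize()  # rewrites input.rtf with one token per line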

    def __init__(self,
                 in_file,
                 bug_handler,
                 copy=None,
                 run_level=1,
                 # out_file=None,
                 ):
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__write_to = better_mktemp()
        # self.__write_to = out_file
        self.__compile_expressions()
        # variables
        self.__uc_char = 0
        self.__uc_bin = False
        self.__uc_value = [1]

    def __reini_utf8_counters(self):
        self.__uc_char = 0
        self.__uc_bin = False

    def __remove_uc_chars(self, startchar, token):
        for i in range(startchar, len(token)):
            if self.__uc_char:
                self.__uc_char -= 1
            else:
                return token[i:]
        # the whole token consisted of chars to skip
        return ''
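
    # Note on __unicode_process() below: with the default skip count of
    # one (\uc1), a token pair like '\u8217 ' followed by '?x' should come
    # out as '&#8217;' and 'x'; the code point becomes an XML character
    # reference and the single ANSI fallback character ('?') is dropped.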

    def __unicode_process(self, token):
        # change scope in
        if token == r'\{':
            self.__uc_value.append(self.__uc_value[-1])
            # basic error handling
            self.__reini_utf8_counters()
            return token
        # change scope out
        elif token == r'\}':
            self.__uc_value.pop()
            self.__reini_utf8_counters()
            return token
        # add a uc control
        elif token[:3] == '\\uc':
            self.__uc_value[-1] = int(token[3:])
            self.__reini_utf8_counters()
            return token
        # bin data to skip
        elif self.__uc_bin:
            self.__uc_bin = False
            return ''
        # uc char to remove
        elif self.__uc_char:
            # handle \bin tag in case of uc char to skip
            if token[:4] == '\\bin':
                self.__uc_char -= 1
                self.__uc_bin = True
                return ''
            elif token[:1] == "\\":
                self.__uc_char -= 1
                return ''
            else:
                return self.__remove_uc_chars(0, token)
        # go for real \u token
        match_obj = self.__utf_exp.match(token)
        if match_obj is not None:
            self.__reini_utf8_counters()
            # get value and handle negative case
            uni_char = int(match_obj.group(1))
            uni_len = len(match_obj.group(0))
            if uni_char < 0:
                uni_char += 65536
            uni_char = codepoint_to_chr(uni_char).encode(
                'ascii', 'xmlcharrefreplace').decode('ascii')
            self.__uc_char = self.__uc_value[-1]
            # there is only a unicode char
            if len(token) <= uni_len:
                return uni_char
            # a unicode char and something else
            # must be after as it is split on \
            # necessary? maybe for \bin?
            elif not self.__uc_char:
                return uni_char + token[uni_len:]
            # if not uc0 and chars
            else:
                return uni_char + self.__remove_uc_chars(uni_len, token)
        # default
        return token
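
    # Note on __sub_reg_split() below: for a small snippet such as
    # '{\rtf1 Hi}' the substitutions and split should yield the token list
    # ['\\{', '\\rtf1 ', 'Hi', '\\}']; raw braces become \{ / \} group
    # tokens and control words keep one trailing delimiter space.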

    def __sub_reg_split(self, input_file):
        input_file = self.__replace_spchar.mreplace(input_file)
        # this is for older RTF
        input_file = self.__par_exp.sub(r'\n\\par \n', input_file)
        input_file = self.__cwdigit_exp.sub(r"\g<1>\n\g<2>", input_file)
        input_file = self.__cs_ast.sub(r"\g<1>", input_file)
        input_file = self.__ms_hex_exp.sub(r"\\mshex0\g<1> ", input_file)
        input_file = self.__utf_ud.sub(r"\\{\\uc0 \g<1>\\}", input_file)
        # remove \n in bin data
        input_file = self.__bin_exp.sub(
            lambda x: x.group().replace('\n', '') + '\n', input_file)
        # split
        tokens = re.split(self.__splitexp, input_file)
        # remove empty tokens and \n
        return list(filter(lambda x: len(x) > 0 and x != '\n', tokens))

    def __compile_expressions(self):
        SIMPLE_RPL = {
            "\\\\": "\\backslash ",
            "\\~": "\\~ ",
            "\\;": "\\; ",
            "&": "&amp;",
            "<": "&lt;",
            ">": "&gt;",
            "\\_": "\\_ ",
            "\\:": "\\: ",
            "\\-": "\\- ",
            # turn into a generic token to eliminate special
            # cases and make processing easier
            "\\{": "\\ob ",
            # turn into a generic token to eliminate special
            # cases and make processing easier
            "\\}": "\\cb ",
            # put a backslash in front of it to eliminate special cases
            # and make processing easier
            "{": "\\{",
            # put a backslash in front of it to eliminate special cases
            # and make processing easier
            "}": "\\}",
        }
        self.__replace_spchar = MReplace(SIMPLE_RPL)
        # add ;? in case of char following \u
        self.__ms_hex_exp = re.compile(r"\\\'([0-9a-fA-F]{2})")
        self.__utf_exp = re.compile(r"\\u(-?\d{3,6}) ?")
        self.__bin_exp = re.compile(r"(?:\\bin(-?\d{0,10})[\n ]+)[01\n]+")
        # manage upr/ud situations
        self.__utf_ud = re.compile(r"\\{[\n ]?\\upr[\n ]?(?:\\{.*?\\})[\n ]?"
                                   r"\\{[\n ]?\\*[\n ]?\\ud[\n ]?(\\{.*?\\})"
                                   r"[\n ]?\\}[\n ]?\\}")
        # add \n in split for whole file reading
        # why keep the backslash whereas \ is replaced before?
        # remove \n from endline char
        self.__splitexp = re.compile(r"(\\[{}]|\n|\\[^\s\\{}&]+(?:[ \t\r\f\v])?)")
        # old RTF allows a bare '\' (or '\ ') at the end of a line as a
        # paragraph mark; normalise it to an explicit \par token
        self.__par_exp = re.compile(r'(\\\n+|\\ )')
        # handle improper cs char-style with \* before without {
        self.__cs_ast = re.compile(r'\\\*([\n ]*\\cs\d+[\n \\]+)')
        # handle cw using a digit as argument and without space as delimiter
        self.__cwdigit_exp = re.compile(r"(\\[a-zA-Z]+[\-0-9]+)([^0-9 \\]+)")

    def tokenize(self):
        """Main method: read the file, use __sub_reg_split to make basic
        substitutions and split the text into tokens, then process the
        tokens and write them back, one token per line."""
        # read
        with open_for_read(self.__file) as read_obj:
            input_file = read_obj.read()

        # process simple replacements and split giving us a correct list
        # remove '' and \n in the process
        tokens = self.__sub_reg_split(input_file)
        # correct unicode
        tokens = map(self.__unicode_process, tokens)
        # remove empty items created by removing \uc
        tokens = list(filter(lambda x: len(x) > 0, tokens))

        # write
        with open_for_write(self.__write_to) as write_obj:
            write_obj.write('\n'.join(tokens))
        # Move and copy
        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "tokenize.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)

# self.__special_tokens = [ '_', '~', "'", '{', '}' ]

# import sys
# def main(args=sys.argv):
#     if len(args) < 2:
#         print('No file')
#         return
#     file = 'data_tokens.txt'
#     if len(args) == 3:
#         file = args[2]
#     to = Tokenize(args[1], Exception, out_file=file)
#     to.tokenize()

# if __name__ == '__main__':
#     sys.exit(main())

# calibre-debug -e src/calibre/ebooks/rtf2xml/tokenize.py