mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-01-02 16:54:12 +01:00
Here is the first batch of modules, which are needed for converting several formats to LRF. Some of the logic has been changed; more cleanups will follow.
219 lines
8.5 KiB
Python
from __future__ import unicode_literals, absolute_import, print_function, division

#########################################################################
#                                                                       #
#                                                                       #
#   copyright 2002 Paul Henry Tremblay                                  #
#                                                                       #
#   This program is distributed in the hope that it will be useful,     #
#   but WITHOUT ANY WARRANTY; without even the implied warranty of      #
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU    #
#   General Public License for more details.                            #
#                                                                       #
#                                                                       #
#########################################################################
import os, re

from ebook_converter.ebooks.rtf2xml import copy
from ebook_converter.utils.mreplace import MReplace
from ebook_converter.ptempfile import better_mktemp
from ebook_converter.polyglot.builtins import codepoint_to_chr, range, filter, map
from . import open_for_read, open_for_write


class Tokenize:
    """Tokenize RTF into one line per field. Each line will contain
    information useful for the rest of the script."""

    def __init__(self,
                 in_file,
                 bug_handler,
                 copy=None,
                 run_level=1,
                 # out_file=None,
                 ):
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__write_to = better_mktemp()
        # self.__write_to = out_file
        self.__compile_expressions()
        # unicode-handling state: how many fallback characters are still
        # to be skipped, whether a \bin payload is being skipped, and a
        # stack holding the current \ucN value for each brace scope
        # (default 1)
        self.__uc_char = 0
        self.__uc_bin = False
        self.__uc_value = [1]

    def __reini_utf8_counters(self):
        self.__uc_char = 0
        self.__uc_bin = False

    def __remove_uc_chars(self, startchar, token):
        for i in range(startchar, len(token)):
            if self.__uc_char:
                self.__uc_char -= 1
            else:
                return token[i:]
        # the token contained only chars to skip
        return ''
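
    # RTF represents non-ANSI characters as, e.g., "\uc1 \u21487 ?": the
    # decimal after \u is a signed 16-bit code point (negative values get
    # +65536 below), and \ucN says that each \u token is followed by N
    # fallback characters for legacy readers, which must be skipped.
    # \ucN is scoped to its enclosing braces, hence the __uc_value stack.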
    def __unicode_process(self, token):
        # change scope in
        if token == r'\{':
            self.__uc_value.append(self.__uc_value[-1])
            # basic error handling
            self.__reini_utf8_counters()
            return token
        # change scope out
        elif token == r'\}':
            self.__uc_value.pop()
            self.__reini_utf8_counters()
            return token
        # add a uc control
        elif token[:3] == '\\uc':
            self.__uc_value[-1] = int(token[3:])
            self.__reini_utf8_counters()
            return token
        # bin data to skip
        elif self.__uc_bin:
            self.__uc_bin = False
            return ''
        # uc char to remove
        elif self.__uc_char:
            # handle \bin tag in case of uc char to skip
            if token[:4] == r'\bin':
                self.__uc_char -= 1
                self.__uc_bin = True
                return ''
            elif token[:1] == '\\':
                self.__uc_char -= 1
                return ''
            else:
                return self.__remove_uc_chars(0, token)
        # go for real \u token
        match_obj = self.__utf_exp.match(token)
        if match_obj is not None:
            self.__reini_utf8_counters()
            # get value and handle negative case
            uni_char = int(match_obj.group(1))
            uni_len = len(match_obj.group(0))
            if uni_char < 0:
                uni_char += 65536
            uni_char = codepoint_to_chr(uni_char).encode(
                'ascii', 'xmlcharrefreplace').decode('ascii')
            self.__uc_char = self.__uc_value[-1]
            # the token is only a unicode char
            if len(token) <= uni_len:
                return uni_char
            # a unicode char and something else;
            # must come after, as the file is split on \
            # necessary? maybe for \bin?
            elif not self.__uc_char:
                return uni_char + token[uni_len:]
            # if not uc0 and chars
            else:
                return uni_char + self.__remove_uc_chars(uni_len, token)
        # default
        return token

    def __sub_reg_split(self, input_file):
        input_file = self.__replace_spchar.mreplace(input_file)
        # this is for older RTF
        input_file = self.__par_exp.sub(r'\n\\par \n', input_file)
        input_file = self.__cwdigit_exp.sub(r"\g<1>\n\g<2>", input_file)
        input_file = self.__cs_ast.sub(r"\g<1>", input_file)
        input_file = self.__ms_hex_exp.sub(r"\\mshex0\g<1> ", input_file)
        input_file = self.__utf_ud.sub(r"\\{\\uc0 \g<1>\\}", input_file)
        # remove \n in bin data
        input_file = self.__bin_exp.sub(
            lambda x: x.group().replace('\n', '') + '\n', input_file)
        # split
        tokens = re.split(self.__splitexp, input_file)
        # remove empty tokens and \n
        return list(filter(lambda x: len(x) > 0 and x != '\n', tokens))

    def __compile_expressions(self):
        SIMPLE_RPL = {
            "\\\\": "\\backslash ",
            "\\~": "\\~ ",
            "\\;": "\\; ",
            "&": "&amp;",
            "<": "&lt;",
            ">": "&gt;",
            "\\_": "\\_ ",
            "\\:": "\\: ",
            "\\-": "\\- ",
            # turn into a generic token to eliminate special
            # cases and make processing easier
            "\\{": "\\ob ",
            # turn into a generic token to eliminate special
            # cases and make processing easier
            "\\}": "\\cb ",
            # put a backslash in front to eliminate special cases and
            # make processing easier
            "{": "\\{",
            # put a backslash in front to eliminate special cases and
            # make processing easier
            "}": "\\}",
        }
        self.__replace_spchar = MReplace(SIMPLE_RPL)
        # add ;? in case of char following \u
        self.__ms_hex_exp = re.compile(r"\\\'([0-9a-fA-F]{2})")
        self.__utf_exp = re.compile(r"\\u(-?\d{3,6}) ?")
        self.__bin_exp = re.compile(r"(?:\\bin(-?\d{0,10})[\n ]+)[01\n]+")
        # manage upr/ud situations
        self.__utf_ud = re.compile(
            r"\\{[\n ]?\\upr[\n ]?(?:\\{.*?\\})[\n ]?"
            r"\\{[\n ]?\\*[\n ]?\\ud[\n ]?(\\{.*?\\})[\n ]?\\}[\n ]?\\}")
        # add \n in split for whole file reading
        # why keep backslash whereas \ is replaced before?
        # remove \n from endline char
        self.__splitexp = re.compile(r"(\\[{}]|\n|\\[^\s\\{}&]+(?:[ \t\r\f\v])?)")
        # this is for old RTF
        self.__par_exp = re.compile(r'(\\\n+|\\ )')
        # handle improper cs char-style with \* before without {
        self.__cs_ast = re.compile(r'\\\*([\n ]*\\cs\d+[\n \\]+)')
        # handle cw using a digit as argument and without space as delimiter
        self.__cwdigit_exp = re.compile(r"(\\[a-zA-Z]+[\-0-9]+)([^0-9 \\]+)")
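
    # Worked example (hypothetical input): after __sub_reg_split() and
    # __unicode_process(), the fragment
    #     {\rtf1\ansi\uc1 \u21487 ? Hello}
    # comes out roughly one token per line:
    #     \{   \rtf1   \ansi   \uc1   &#21487;   Hello   \}
    # \u21487 became a numeric character reference via xmlcharrefreplace,
    # and its "?" ANSI fallback was skipped because of \uc1.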
    def tokenize(self):
        """Main method for this class: reads the file, uses
        __sub_reg_split() to make basic substitutions and split the text
        into tokens, then processes the tokens one by one."""
        # read
        with open_for_read(self.__file) as read_obj:
            input_file = read_obj.read()

        # process simple replacements and split, giving us a correct list;
        # removes '' and \n in the process
        tokens = self.__sub_reg_split(input_file)
        # correct unicode
        tokens = map(self.__unicode_process, tokens)
        # remove empty items created by removing \uc
        tokens = list(filter(lambda x: len(x) > 0, tokens))

        # write
        with open_for_write(self.__write_to) as write_obj:
            write_obj.write('\n'.join(tokens))
        # move and copy
        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "tokenize.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)


# self.__special_tokens = [ '_', '~', "'", '{', '}' ]

# import sys
# def main(args=sys.argv):
#     if len(args) < 2:
#         print('No file')
#         return
#     file = 'data_tokens.txt'
#     if len(args) == 3:
#         file = args[2]
#     to = Tokenize(args[1], Exception, out_file=file)
#     to.tokenize()

# if __name__ == '__main__':
#     sys.exit(main())

# calibre-debug -e src/calibre/ebooks/rtf2xml/tokenize.py
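
# A minimal usage sketch (the module path is inferred from the imports
# above and 'sample.rtf' is a hypothetical file; note that tokenize()
# rewrites the input file in place, so run it on a copy):
#
#     from ebook_converter.ebooks.rtf2xml.tokenize import Tokenize
#     Tokenize('sample.rtf', Exception, run_level=1).tokenize()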