#########################################################################
#                                                                       #
#                                                                       #
#   copyright 2002 Paul Henry Tremblay                                  #
#                                                                       #
#   This program is distributed in the hope that it will be useful,     #
#   but WITHOUT ANY WARRANTY; without even the implied warranty of      #
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU   #
#   General Public License for more details.                            #
#                                                                       #
#                                                                       #
#########################################################################
import sys, os, io

from ebook_converter.ebooks.rtf2xml import get_char_map, copy
from ebook_converter.ebooks.rtf2xml.char_set import char_set
from ebook_converter.ptempfile import better_mktemp
from . import open_for_read, open_for_write


class Hex2Utf8:
    """
    Convert Microsoft hexadecimal numbers to utf-8
    """

    def __init__(self,
                 in_file,
                 area_to_convert,
                 char_file,
                 default_char_map,
                 bug_handler,
                 invalid_rtf_handler,
                 copy=None,
                 temp_dir=None,
                 symbol=None,
                 wingdings=None,
                 caps=None,
                 convert_caps=None,
                 dingbats=None,
                 run_level=1,
                 ):
        """
        Required:
            'in_file'--the file to convert
            'area_to_convert'--the area of file to convert; must be
                'preamble' or 'body'
            'char_file'--the file containing the character mappings
            'default_char_map'--name of default character map
            'bug_handler'--callable given the error message; its result is
                raised when 'area_to_convert' is invalid
            'invalid_rtf_handler'--stored for later use
        Optional:
            'copy'--whether to make a copy of result for debugging
            'temp_dir'--accepted but not used by this method
            'symbol'--whether to load the symbol character map
            'wingdings'--whether to load the wingdings character map
            'caps'--whether to load the caps character map
            'convert_caps'--accepted but not used by this method; all of
                the conversion flags start at 0 and are set through
                update_values
            'dingbats'--whether to load the dingbats character map
            'run_level'--stored for later use (default 1)
        Returns:
            nothing
        Raises:
            whatever bug_handler(msg) produces, if 'area_to_convert' is
            neither 'preamble' nor 'body'
        """
        # The handlers must be bound before validation: the check below
        # raises through self.__bug_handler, which previously was not yet
        # assigned at that point and so produced an AttributeError instead
        # of the intended error.
        self.__bug_handler = bug_handler
        self.__invalid_rtf_handler = invalid_rtf_handler
        if area_to_convert not in ('preamble', 'body'):
            msg = (
                'Developer error! Wrong flag.\n'
                'in module "hex_2_utf8.py\n'
                '"area_to_convert" must be "body" or "preamble"\n'
            )
            raise self.__bug_handler(msg)
        self.__file = in_file
        # NOTE: the 'copy' parameter shadows the imported 'copy' module
        # inside this method only; the name is part of the public signature.
        self.__copy = copy
        self.__char_file = char_file
        self.__area_to_convert = area_to_convert
        self.__default_char_map = default_char_map
        self.__symbol = symbol
        self.__wingdings = wingdings
        self.__dingbats = dingbats
        self.__caps = caps
        # Conversion flags are all off until update_values is called.
        self.__convert_caps = 0
        self.__convert_symbol = 0
        self.__convert_wingdings = 0
        self.__convert_zapf = 0
        self.__run_level = run_level
        self.__write_to = better_mktemp()

    def update_values(self,
                      file,
                      area_to_convert,
                      char_file,
                      convert_caps,
                      convert_symbol,
                      convert_wingdings,
                      convert_zapf,
                      copy=None,
                      temp_dir=None,
                      symbol=None,
                      wingdings=None,
                      caps=None,
                      dingbats=None,
                      ):
        """
        Required:
            'file'--the file to convert
            'area_to_convert'--the area of file to convert; must be
                'preamble' or 'body'
            'char_file'--accepted but not stored by this method
            'convert_caps'--whether to convert caps to utf-8
            'convert_symbol'--whether to convert the Symbol font
            'convert_wingdings'--whether to convert the Wingdings font
            'convert_zapf'--whether to convert the Zapf Dingbats font
        Optional:
            'copy'--whether to make a copy of result for debugging
            'temp_dir'--accepted but not used by this method
            'symbol'--whether to load the symbol character map
            'wingdings'--whether to load the wingdings character map
            'caps'--whether to load the caps character map
            'dingbats'--whether to load the dingbats character map
        Returns:
            nothing
        Raises:
            whatever bug_handler(msg) produces, if 'area_to_convert' is
            neither 'preamble' nor 'body'
        """
        self.__file = file
        self.__copy = copy
        if area_to_convert not in ('preamble', 'body'):
            msg = (
                'in module "hex_2_utf8.py\n'
                '"area_to_convert" must be "body" or "preamble"\n'
            )
            raise self.__bug_handler(msg)
        self.__area_to_convert = area_to_convert
        self.__symbol = symbol
        self.__wingdings = wingdings
        self.__dingbats = dingbats
        self.__caps = caps
        self.__convert_caps = convert_caps
        self.__convert_symbol = convert_symbol
        self.__convert_wingdings = convert_wingdings
        self.__convert_zapf = convert_zapf
        # new!
# no longer try to convert these # self.__convert_symbol = 0 # self.__convert_wingdings = 0 # self.__convert_zapf = 0 def __initiate_values(self): """ Required: Nothing Set values, including those for the dictionaries. The file that contains the maps is broken down into many different sets. For example, for the Symbol font, there is the standard part for hexidecimal numbers, and the part for Microsoft characters. Read each part in, and then combine them. """ # the default encoding system, the lower map for characters 0 through # 128, and the encoding system for Microsoft characters. # New on 2004-05-8: the self.__char_map is not in directory with other # modules self.__char_file = io.StringIO(char_set) char_map_obj = get_char_map.GetCharMap( char_file=self.__char_file, bug_handler=self.__bug_handler, ) up_128_dict = char_map_obj.get_char_map(map=self.__default_char_map) bt_128_dict = char_map_obj.get_char_map(map='bottom_128') ms_standard_dict = char_map_obj.get_char_map(map='ms_standard') self.__def_dict = {} self.__def_dict.update(up_128_dict) self.__def_dict.update(bt_128_dict) self.__def_dict.update(ms_standard_dict) self.__current_dict = self.__def_dict self.__current_dict_name = 'default' self.__in_caps = 0 self.__special_fonts_found = 0 if self.__symbol: symbol_base_dict = char_map_obj.get_char_map(map='SYMBOL') ms_symbol_dict = char_map_obj.get_char_map(map='ms_symbol') self.__symbol_dict = {} self.__symbol_dict.update(symbol_base_dict) self.__symbol_dict.update(ms_symbol_dict) if self.__wingdings: wingdings_base_dict = char_map_obj.get_char_map(map='wingdings') ms_wingdings_dict = char_map_obj.get_char_map(map='ms_wingdings') self.__wingdings_dict = {} self.__wingdings_dict.update(wingdings_base_dict) self.__wingdings_dict.update(ms_wingdings_dict) if self.__dingbats: dingbats_base_dict = char_map_obj.get_char_map(map='dingbats') ms_dingbats_dict = char_map_obj.get_char_map(map='ms_dingbats') self.__dingbats_dict = {} 
self.__dingbats_dict.update(dingbats_base_dict) self.__dingbats_dict.update(ms_dingbats_dict) # load dictionary for caps, and make a string for the replacement self.__caps_uni_dict = char_map_obj.get_char_map(map='caps_uni') # # print self.__caps_uni_dict # don't think I'll need this # keys = self.__caps_uni_dict.keys() # self.__caps_uni_replace = '|'.join(keys) self.__preamble_state_dict = { 'preamble' : self.__preamble_func, 'body' : self.__body_func, 'mi 10: self.__write_obj.write('mi%snot-in-table\n' % hex_num) if self.__run_level > 4: # msg = 'no dictionary entry for %s\n' # msg += 'the hexidecimal num is "%s"\n' % (hex_num) # msg += 'dictionary is %s\n' % self.__current_dict_name msg = 'Character "&#x%s;" does not appear to be valid (or is a control character)\n' % token raise self.__bug_handler(msg) def __found_body_func(self, line): self.__state = 'body' self.__write_obj.write(line) def __body_func(self, line): """ When parsing preamble """ self.__write_obj.write(line) def __preamble_func(self, line): action = self.__preamble_state_dict.get(self.__token_info) if action is not None: action(line) else: self.__write_obj.write(line) def __convert_preamble(self): self.__state = 'preamble' with open_for_write(self.__write_to) as self.__write_obj: with open_for_read(self.__file) as read_obj: for line in read_obj: self.__token_info = line[:16] action = self.__preamble_state_dict.get(self.__state) if action is None: sys.stderr.write('error no state found in hex_2_utf8', self.__state ) action(line) copy_obj = copy.Copy(bug_handler=self.__bug_handler) if self.__copy: copy_obj.copy_file(self.__write_to, "preamble_utf_convert.data") copy_obj.rename(self.__write_to, self.__file) os.remove(self.__write_to) def __preamble_for_body_func(self, line): """ Required: line -- line to parse Returns: nothing Logic: Used when parsing the body. 
""" if self.__token_info == 'mi 1: self.__font_list.pop() else: sys.stderr.write('module is hex_2_utf8\n') sys.stderr.write('method is end_font_func\n') sys.stderr.write('self.__font_list should be greater than one?\n') face = self.__font_list[-1] if face == 'Symbol' and self.__convert_symbol: self.__current_dict_name = 'Symbol' self.__current_dict = self.__symbol_dict elif face == 'Wingdings' and self.__convert_wingdings: self.__current_dict_name = 'Wingdings' self.__current_dict = self.__wingdings_dict elif face == 'Zapf Dingbats' and self.__convert_zapf: self.__current_dict_name = 'Zapf Dingbats' self.__current_dict = self.__dingbats_dict else: self.__current_dict_name = 'default' self.__current_dict = self.__def_dict def __start_special_font_func_old(self, line): """ Required: line -- line Returns; nothing Logic: change the dictionary to use in conversion """ # for error checking if self.__token_info == 'mi 1: self.__caps_list.pop() else: sys.stderr.write('Module is hex_2_utf8\n' 'method is __end_caps_func\n' 'caps list should be more than one?\n') # self.__in_caps not set def __text_func(self, line): """ Required: line -- line to parse Returns: nothing Logic: if in caps, convert. Otherwise, print out. """ text = line[17:-1] # print line if self.__current_dict_name in ('Symbol', 'Wingdings', 'Zapf Dingbats'): the_string = '' for letter in text: hex_num = hex(ord(letter)) hex_num = str(hex_num) hex_num = hex_num.upper() hex_num = hex_num[2:] hex_num = '\'%s' % hex_num converted = self.__current_dict.get(hex_num) if converted is None: sys.stderr.write('module is hex_2_ut8\nmethod is __text_func\n') sys.stderr.write('no hex value for "%s"\n' % hex_num) else: the_string += converted self.__write_obj.write('txа|б)') line2 = re.sub(reg_exp, my_sub_func, line) print line2 """