mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-01-03 01:04:12 +01:00
836 lines
41 KiB
Python
836 lines
41 KiB
Python
#########################################################################
|
|
# #
|
|
# #
|
|
# copyright 2002 Paul Henry Tremblay #
|
|
# #
|
|
# This program is distributed in the hope that it will be useful, #
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
|
# General Public License for more details. #
|
|
# #
|
|
# #
|
|
#########################################################################
|
|
import os, re
|
|
|
|
from ebook_converter.ebooks.rtf2xml import copy, check_brackets
|
|
from ebook_converter.ptempfile import better_mktemp
|
|
|
|
from . import open_for_read, open_for_write
|
|
|
|
|
|
class ProcessTokens:
|
|
"""
|
|
Process each token on a line and add information that will be useful for
|
|
later processing. Information will be put on one line, delimited by "<"
|
|
for main fields, and ">" for sub fields
|
|
"""
|
|
|
|
def __init__(self,
|
|
in_file,
|
|
exception_handler,
|
|
bug_handler,
|
|
copy=None,
|
|
run_level=1,
|
|
):
|
|
self.__file = in_file
|
|
self.__bug_handler = bug_handler
|
|
self.__copy = copy
|
|
self.__run_level = run_level
|
|
self.__write_to = better_mktemp()
|
|
self.initiate_token_dict()
|
|
# self.initiate_token_actions()
|
|
self.compile_expressions()
|
|
self.__bracket_count=0
|
|
self.__exception_handler = exception_handler
|
|
self.__bug_handler = bug_handler
|
|
|
|
def compile_expressions(self):
|
|
self.__num_exp = re.compile(r"([a-zA-Z]+)(.*)")
|
|
self.__utf_exp = re.compile(r'(&.*?;)')
|
|
|
|
def initiate_token_dict(self):
|
|
self.__return_code = 0
|
|
self.dict_token={
|
|
# unicode
|
|
'mshex' : ('nu', '__________', self.__ms_hex_func),
|
|
# brackets
|
|
'{' : ('nu', '{', self.ob_func),
|
|
'}' : ('nu', '}', self.cb_func),
|
|
# microsoft characters
|
|
'ldblquote' : ('mc', 'ldblquote', self.ms_sub_func),
|
|
'rdblquote' : ('mc', 'rdblquote', self.ms_sub_func),
|
|
'rquote' : ('mc', 'rquote', self.ms_sub_func),
|
|
'lquote' : ('mc', 'lquote', self.ms_sub_func),
|
|
'emdash' : ('mc', 'emdash', self.ms_sub_func),
|
|
'endash' : ('mc', 'endash', self.ms_sub_func),
|
|
'bullet' : ('mc', 'bullet', self.ms_sub_func),
|
|
'~' : ('mc', '~', self.ms_sub_func),
|
|
'tab' : ('mc', 'tab', self.ms_sub_func),
|
|
'_' : ('mc', '_', self.ms_sub_func),
|
|
';' : ('mc', ';', self.ms_sub_func),
|
|
# this must be wrong
|
|
'-' : ('mc', '-', self.ms_sub_func),
|
|
'line' : ('mi', 'hardline-break', self.direct_conv_func), # calibre
|
|
# misc => ml
|
|
'*' : ('ml', 'asterisk__', self.default_func),
|
|
':' : ('ml', 'colon_____', self.default_func),
|
|
# text
|
|
'backslash' : ('nu', '\\', self.text_func),
|
|
'ob' : ('nu', '{', self.text_func),
|
|
'cb' : ('nu', '}', self.text_func),
|
|
# paragraph formatting => pf
|
|
'page' : ('pf', 'page-break', self.default_func),
|
|
'par' : ('pf', 'par-end___', self.default_func),
|
|
'pard' : ('pf', 'par-def___', self.default_func),
|
|
'keepn' : ('pf', 'keep-w-nex', self.bool_st_func),
|
|
'widctlpar' : ('pf', 'widow-cntl', self.bool_st_func),
|
|
'adjustright' : ('pf', 'adjust-rgt', self.bool_st_func),
|
|
'lang' : ('pf', 'language__', self.__language_func),
|
|
'ri' : ('pf', 'right-inde', self.divide_by_20),
|
|
'fi' : ('pf', 'fir-ln-ind', self.divide_by_20),
|
|
'li' : ('pf', 'left-inden', self.divide_by_20),
|
|
'sb' : ('pf', 'space-befo', self.divide_by_20),
|
|
'sa' : ('pf', 'space-afte', self.divide_by_20),
|
|
'sl' : ('pf', 'line-space', self.divide_by_20),
|
|
'deftab' : ('pf', 'default-ta', self.divide_by_20),
|
|
'ql' : ('pf', 'align_____<left', self.two_part_func),
|
|
'qc' : ('pf', 'align_____<cent', self.two_part_func),
|
|
'qj' : ('pf', 'align_____<just', self.two_part_func),
|
|
'qr' : ('pf', 'align_____<right', self.two_part_func),
|
|
'nowidctlpar' : ('pf', 'widow-cntr<false', self.two_part_func),
|
|
'tx' : ('pf', 'tab-stop__', self.divide_by_20),
|
|
'tb' : ('pf', 'tab-bar-st', self.divide_by_20),
|
|
'tqr' : ('pf', 'tab-right_', self.default_func),
|
|
'tqdec' : ('pf', 'tab-dec___', self.default_func),
|
|
'tqc' : ('pf', 'tab-center', self.default_func),
|
|
'tlul' : ('pf', 'leader-und', self.default_func),
|
|
'tlhyph' : ('pf', 'leader-hyp', self.default_func),
|
|
'tldot' : ('pf', 'leader-dot', self.default_func),
|
|
# stylesheet = > ss
|
|
'stylesheet' : ('ss', 'style-shet', self.default_func),
|
|
'sbasedon' : ('ss', 'based-on__', self.default_func),
|
|
'snext' : ('ss', 'next-style', self.default_func),
|
|
'cs' : ('ss', 'char-style', self.default_func),
|
|
's' : ('ss', 'para-style', self.default_func),
|
|
# graphics => gr
|
|
'pict' : ('gr', 'picture___', self.default_func),
|
|
'objclass' : ('gr', 'obj-class_', self.default_func),
|
|
'macpict' : ('gr', 'mac-pic___', self.default_func),
|
|
# section => sc
|
|
'sect' : ('sc', 'section___', self.default_func),
|
|
'sectd' : ('sc', 'sect-defin', self.default_func),
|
|
'endhere' : ('sc', 'sect-note_', self.default_func),
|
|
# list=> ls
|
|
'pntext' : ('ls', 'list-text_', self.default_func),
|
|
# this line must be wrong because it duplicates an earlier one
|
|
'listtext' : ('ls', 'list-text_', self.default_func),
|
|
'pn' : ('ls', 'list______', self.default_func),
|
|
'pnseclvl' : ('ls', 'list-level', self.default_func),
|
|
'pncard' : ('ls', 'list-cardi', self.bool_st_func),
|
|
'pndec' : ('ls', 'list-decim', self.bool_st_func),
|
|
'pnucltr' : ('ls', 'list-up-al', self.bool_st_func),
|
|
'pnucrm' : ('ls', 'list-up-ro', self.bool_st_func),
|
|
'pnord' : ('ls', 'list-ord__', self.bool_st_func),
|
|
'pnordt' : ('ls', 'list-ordte', self.bool_st_func),
|
|
'pnlvlblt' : ('ls', 'list-bulli', self.bool_st_func),
|
|
'pnlvlbody' : ('ls', 'list-simpi', self.bool_st_func),
|
|
'pnlvlcont' : ('ls', 'list-conti', self.bool_st_func),
|
|
'pnhang' : ('ls', 'list-hang_', self.bool_st_func),
|
|
'pntxtb' : ('ls', 'list-tebef', self.bool_st_func),
|
|
'ilvl' : ('ls', 'list-level', self.default_func),
|
|
'ls' : ('ls', 'list-id___', self.default_func),
|
|
'pnstart' : ('ls', 'list-start', self.default_func),
|
|
'itap' : ('ls', 'nest-level', self.default_func),
|
|
'leveltext' : ('ls', 'level-text', self.default_func),
|
|
'levelnumbers' : ('ls', 'level-numb', self.default_func),
|
|
'list' : ('ls', 'list-in-tb', self.default_func),
|
|
'listlevel' : ('ls', 'list-tb-le', self.default_func),
|
|
'listname' : ('ls', 'list-name_', self.default_func),
|
|
'listtemplateid' : ('ls', 'ls-tem-id_', self.default_func),
|
|
'leveltemplateid' : ('ls', 'lv-tem-id_', self.default_func),
|
|
'listhybrid' : ('ls', 'list-hybri', self.default_func),
|
|
'levelstartat' : ('ls', 'level-star', self.default_func),
|
|
'levelspace' : ('ls', 'level-spac', self.divide_by_20),
|
|
'levelindent' : ('ls', 'level-inde', self.default_func),
|
|
'levelnfc' : ('ls', 'level-type', self.__list_type_func),
|
|
'levelnfcn' : ('ls', 'level-type', self.__list_type_func),
|
|
'listid' : ('ls', 'lis-tbl-id', self.default_func),
|
|
'listoverride' : ('ls', 'lis-overid', self.default_func),
|
|
# duplicate
|
|
'pnlvl' : ('ls', 'list-level', self.default_func),
|
|
# root info => ri
|
|
'rtf' : ('ri', 'rtf_______', self.default_func),
|
|
'deff' : ('ri', 'deflt-font', self.default_func),
|
|
'mac' : ('ri', 'macintosh_', self.default_func),
|
|
'pc' : ('ri', 'pc________', self.default_func),
|
|
'pca' : ('ri', 'pca_______', self.default_func),
|
|
'ansi' : ('ri', 'ansi______', self.default_func),
|
|
'ansicpg' : ('ri', 'ansi-codpg', self.default_func),
|
|
# notes => nt
|
|
'footnote' : ('nt', 'footnote__', self.default_func),
|
|
'ftnalt' : ('nt', 'type______<endnote', self.two_part_func),
|
|
# anchor => an
|
|
'tc' : ('an', 'toc_______', self.default_func),
|
|
'bkmkstt' : ('an', 'book-mk-st', self.default_func),
|
|
'bkmkstart' : ('an', 'book-mk-st', self.default_func),
|
|
'bkmkend' : ('an', 'book-mk-en', self.default_func),
|
|
'xe' : ('an', 'index-mark', self.default_func),
|
|
'rxe' : ('an', 'place_____', self.default_func),
|
|
# index => in
|
|
'bxe' : ('in', 'index-bold', self.default_func),
|
|
'ixe' : ('in', 'index-ital', self.default_func),
|
|
'txe' : ('in', 'index-see_', self.default_func),
|
|
# table of contents => tc
|
|
'tcl' : ('tc', 'toc-level_', self.default_func),
|
|
'tcn' : ('tc', 'toc-sup-nu', self.default_func),
|
|
# field => fd
|
|
'field' : ('fd', 'field_____', self.default_func),
|
|
'fldinst' : ('fd', 'field-inst', self.default_func),
|
|
'fldrslt' : ('fd', 'field-rslt', self.default_func),
|
|
'datafield' : ('fd', 'datafield_', self.default_func),
|
|
# info-tables => it
|
|
'fonttbl' : ('it', 'font-table', self.default_func),
|
|
'colortbl' : ('it', 'colr-table', self.default_func),
|
|
'listoverridetable' : ('it', 'lovr-table', self.default_func),
|
|
'listtable' : ('it', 'listtable_', self.default_func),
|
|
'revtbl' : ('it', 'revi-table', self.default_func),
|
|
# character info => ci
|
|
'b' : ('ci', 'bold______', self.bool_st_func),
|
|
'blue' : ('ci', 'blue______', self.color_func),
|
|
'caps' : ('ci', 'caps______', self.bool_st_func),
|
|
'cf' : ('ci', 'font-color', self.colorz_func),
|
|
'chftn' : ('ci', 'footnot-mk', self.bool_st_func),
|
|
'dn' : ('ci', 'font-down_', self.divide_by_2),
|
|
'embo' : ('ci', 'emboss____', self.bool_st_func),
|
|
'f' : ('ci', 'font-style', self.default_func),
|
|
'fs' : ('ci', 'font-size_', self.divide_by_2),
|
|
'green' : ('ci', 'green_____', self.color_func),
|
|
'i' : ('ci', 'italics___', self.bool_st_func),
|
|
'impr' : ('ci', 'engrave___', self.bool_st_func),
|
|
'outl' : ('ci', 'outline___', self.bool_st_func),
|
|
'plain' : ('ci', 'plain_____', self.bool_st_func),
|
|
'red' : ('ci', 'red_______', self.color_func),
|
|
'scaps' : ('ci', 'small-caps', self.bool_st_func),
|
|
'shad' : ('ci', 'shadow____', self.bool_st_func),
|
|
'strike' : ('ci', 'strike-thr', self.bool_st_func),
|
|
'striked' : ('ci', 'dbl-strike', self.bool_st_func),
|
|
'sub' : ('ci', 'subscript_', self.bool_st_func),
|
|
'super' : ('ci', 'superscrip', self.bool_st_func),
|
|
'nosupersub' : ('ci', 'no-su-supe', self.__no_sup_sub_func),
|
|
'up' : ('ci', 'font-up___', self.divide_by_2),
|
|
'v' : ('ci', 'hidden____', self.default_func),
|
|
# underline
|
|
# can't see why it isn't a char info: 'ul'=>'ci'
|
|
'ul' : ('ci', 'underlined<continous', self.two_part_func),
|
|
'uld' : ('ci', 'underlined<dotted', self.two_part_func),
|
|
'uldash' : ('ci', 'underlined<dash', self.two_part_func),
|
|
'uldashd' : ('ci', 'underlined<dash-dot', self.two_part_func),
|
|
'uldashdd' : ('ci', 'underlined<dash-dot-dot', self.two_part_func),
|
|
'uldb' : ('ci', 'underlined<double', self.two_part_func),
|
|
'ulhwave' : ('ci', 'underlined<heavy-wave', self.two_part_func),
|
|
'ulldash' : ('ci', 'underlined<long-dash', self.two_part_func),
|
|
'ulth' : ('ci', 'underlined<thich', self.two_part_func),
|
|
'ulthd' : ('ci', 'underlined<thick-dotted', self.two_part_func),
|
|
'ulthdash' : ('ci', 'underlined<thick-dash', self.two_part_func),
|
|
'ulthdashd' : ('ci', 'underlined<thick-dash-dot', self.two_part_func),
|
|
'ulthdashdd' : ('ci', 'underlined<thick-dash-dot-dot', self.two_part_func),
|
|
'ulthldash' : ('ci', 'underlined<thick-long-dash', self.two_part_func),
|
|
'ululdbwave' : ('ci', 'underlined<double-wave', self.two_part_func),
|
|
'ulw' : ('ci', 'underlined<word', self.two_part_func),
|
|
'ulwave' : ('ci', 'underlined<wave', self.two_part_func),
|
|
'ulnone' : ('ci', 'underlined<false', self.two_part_func),
|
|
# table => tb
|
|
'trowd' : ('tb', 'row-def___', self.default_func),
|
|
'cell' : ('tb', 'cell______', self.default_func),
|
|
'row' : ('tb', 'row_______', self.default_func),
|
|
'intbl' : ('tb', 'in-table__', self.default_func),
|
|
'cols' : ('tb', 'columns___', self.default_func),
|
|
'trleft' : ('tb', 'row-pos-le', self.divide_by_20),
|
|
'cellx' : ('tb', 'cell-posit', self.divide_by_20),
|
|
'trhdr' : ('tb', 'row-header', self.default_func),
|
|
# preamble => pr
|
|
# document information => di
|
|
# TODO integrate \userprops
|
|
'info' : ('di', 'doc-info__', self.default_func),
|
|
'title' : ('di', 'title_____', self.default_func),
|
|
'author' : ('di', 'author____', self.default_func),
|
|
'operator' : ('di', 'operator__', self.default_func),
|
|
'manager' : ('di', 'manager___', self.default_func),
|
|
'company' : ('di', 'company___', self.default_func),
|
|
'keywords' : ('di', 'keywords__', self.default_func),
|
|
'category' : ('di', 'category__', self.default_func),
|
|
'doccomm' : ('di', 'doc-notes_', self.default_func),
|
|
'comment' : ('di', 'doc-notes_', self.default_func),
|
|
'subject' : ('di', 'subject___', self.default_func),
|
|
'creatim' : ('di', 'create-tim', self.default_func),
|
|
'yr' : ('di', 'year______', self.default_func),
|
|
'mo' : ('di', 'month_____', self.default_func),
|
|
'dy' : ('di', 'day_______', self.default_func),
|
|
'min' : ('di', 'minute____', self.default_func),
|
|
'sec' : ('di', 'second____', self.default_func),
|
|
'revtim' : ('di', 'revis-time', self.default_func),
|
|
'edmins' : ('di', 'edit-time_', self.default_func),
|
|
'printim' : ('di', 'print-time', self.default_func),
|
|
'buptim' : ('di', 'backuptime', self.default_func),
|
|
'nofwords' : ('di', 'num-of-wor', self.default_func),
|
|
'nofchars' : ('di', 'num-of-chr', self.default_func),
|
|
'nofcharsws' : ('di', 'numofchrws', self.default_func),
|
|
'nofpages' : ('di', 'num-of-pag', self.default_func),
|
|
'version' : ('di', 'version___', self.default_func),
|
|
'vern' : ('di', 'intern-ver', self.default_func),
|
|
'hlinkbase' : ('di', 'linkbase__', self.default_func),
|
|
'id' : ('di', 'internalID', self.default_func),
|
|
# headers and footers => hf
|
|
'headerf' : ('hf', 'head-first', self.default_func),
|
|
'headerl' : ('hf', 'head-left_', self.default_func),
|
|
'headerr' : ('hf', 'head-right', self.default_func),
|
|
'footerf' : ('hf', 'foot-first', self.default_func),
|
|
'footerl' : ('hf', 'foot-left_', self.default_func),
|
|
'footerr' : ('hf', 'foot-right', self.default_func),
|
|
'header' : ('hf', 'header____', self.default_func),
|
|
'footer' : ('hf', 'footer____', self.default_func),
|
|
# page => pa
|
|
'margl' : ('pa', 'margin-lef', self.divide_by_20),
|
|
'margr' : ('pa', 'margin-rig', self.divide_by_20),
|
|
'margb' : ('pa', 'margin-bot', self.divide_by_20),
|
|
'margt' : ('pa', 'margin-top', self.divide_by_20),
|
|
'gutter' : ('pa', 'gutter____', self.divide_by_20),
|
|
'paperw' : ('pa', 'paper-widt', self.divide_by_20),
|
|
'paperh' : ('pa', 'paper-hght', self.divide_by_20),
|
|
# annotation => an
|
|
'annotation' : ('an', 'annotation', self.default_func),
|
|
# border => bd
|
|
'trbrdrh' : ('bd', 'bor-t-r-hi', self.default_func),
|
|
'trbrdrv' : ('bd', 'bor-t-r-vi', self.default_func),
|
|
'trbrdrt' : ('bd', 'bor-t-r-to', self.default_func),
|
|
'trbrdrl' : ('bd', 'bor-t-r-le', self.default_func),
|
|
'trbrdrb' : ('bd', 'bor-t-r-bo', self.default_func),
|
|
'trbrdrr' : ('bd', 'bor-t-r-ri', self.default_func),
|
|
'clbrdrb' : ('bd', 'bor-cel-bo', self.default_func),
|
|
'clbrdrt' : ('bd', 'bor-cel-to', self.default_func),
|
|
'clbrdrl' : ('bd', 'bor-cel-le', self.default_func),
|
|
'clbrdrr' : ('bd', 'bor-cel-ri', self.default_func),
|
|
'brdrb' : ('bd', 'bor-par-bo', self.default_func),
|
|
'brdrt' : ('bd', 'bor-par-to', self.default_func),
|
|
'brdrl' : ('bd', 'bor-par-le', self.default_func),
|
|
'brdrr' : ('bd', 'bor-par-ri', self.default_func),
|
|
'box' : ('bd', 'bor-par-bx', self.default_func),
|
|
'chbrdr' : ('bd', 'bor-par-bo', self.default_func),
|
|
'brdrbtw' : ('bd', 'bor-for-ev', self.default_func),
|
|
'brdrbar' : ('bd', 'bor-outsid', self.default_func),
|
|
'brdrnone' : ('bd', 'bor-none__<false', self.two_part_func),
|
|
# border type => bt
|
|
'brdrs' : ('bt', 'bdr-single', self.default_func),
|
|
'brdrth' : ('bt', 'bdr-doubtb', self.default_func),
|
|
'brdrsh' : ('bt', 'bdr-shadow', self.default_func),
|
|
'brdrdb' : ('bt', 'bdr-double', self.default_func),
|
|
'brdrdot' : ('bt', 'bdr-dotted', self.default_func),
|
|
'brdrdash' : ('bt', 'bdr-dashed', self.default_func),
|
|
'brdrhair' : ('bt', 'bdr-hair__', self.default_func),
|
|
'brdrinset' : ('bt', 'bdr-inset_', self.default_func),
|
|
'brdrdashsm' : ('bt', 'bdr-das-sm', self.default_func),
|
|
'brdrdashd' : ('bt', 'bdr-dot-sm', self.default_func),
|
|
'brdrdashdd' : ('bt', 'bdr-dot-do', self.default_func),
|
|
'brdroutset' : ('bt', 'bdr-outset', self.default_func),
|
|
'brdrtriple' : ('bt', 'bdr-trippl', self.default_func),
|
|
'brdrtnthsg' : ('bt', 'bdr-thsm__', self.default_func),
|
|
'brdrthtnsg' : ('bt', 'bdr-htsm__', self.default_func),
|
|
'brdrtnthtnsg' : ('bt', 'bdr-hthsm_', self.default_func),
|
|
'brdrtnthmg' : ('bt', 'bdr-thm___', self.default_func),
|
|
'brdrthtnmg' : ('bt', 'bdr-htm___', self.default_func),
|
|
'brdrtnthtnmg' : ('bt', 'bdr-hthm__', self.default_func),
|
|
'brdrtnthlg' : ('bt', 'bdr-thl___', self.default_func),
|
|
'brdrtnthtnlg' : ('bt', 'bdr-hthl__', self.default_func),
|
|
'brdrwavy' : ('bt', 'bdr-wavy__', self.default_func),
|
|
'brdrwavydb' : ('bt', 'bdr-d-wav_', self.default_func),
|
|
'brdrdashdotstr' : ('bt', 'bdr-strip_', self.default_func),
|
|
'brdremboss' : ('bt', 'bdr-embos_', self.default_func),
|
|
'brdrengrave' : ('bt', 'bdr-engra_', self.default_func),
|
|
'brdrframe' : ('bt', 'bdr-frame_', self.default_func),
|
|
'brdrw' : ('bt', 'bdr-li-wid', self.divide_by_20),
|
|
'brsp' : ('bt', 'bdr-sp-wid', self.divide_by_20),
|
|
'brdrcf' : ('bt', 'bdr-color_', self.default_func),
|
|
# comments
|
|
# 'comment' : ('cm', 'comment___', self.default_func),
|
|
}
|
|
self.__number_type_dict = {
|
|
0: 'Arabic',
|
|
1: 'uppercase Roman numeral',
|
|
2: 'lowercase Roman numeral',
|
|
3: 'uppercase letter',
|
|
4: 'lowercase letter',
|
|
5: 'ordinal number',
|
|
6: 'cardianl text number',
|
|
7: 'ordinal text number',
|
|
10: 'Kanji numbering without the digit character',
|
|
11: 'Kanji numbering with the digit character',
|
|
1246: 'phonetic Katakana characters in aiueo order',
|
|
1346: 'phonetic katakana characters in iroha order',
|
|
14: 'double byte character',
|
|
15: 'single byte character',
|
|
16: 'Kanji numbering 3',
|
|
17: 'Kanji numbering 4',
|
|
18: 'Circle numbering' ,
|
|
19: 'double-byte Arabic numbering',
|
|
2046: 'phonetic double-byte Katakana characters',
|
|
2146: 'phonetic double-byte katakana characters',
|
|
22: 'Arabic with leading zero',
|
|
23: 'bullet',
|
|
24: 'Korean numbering 2',
|
|
25: 'Korean numbering 1',
|
|
26: 'Chinese numbering 1',
|
|
27: 'Chinese numbering 2',
|
|
28: 'Chinese numbering 3',
|
|
29: 'Chinese numbering 4',
|
|
30: 'Chinese Zodiac numbering 1',
|
|
31: 'Chinese Zodiac numbering 2',
|
|
32: 'Chinese Zodiac numbering 3',
|
|
33: 'Taiwanese double-byte numbering 1',
|
|
34: 'Taiwanese double-byte numbering 2',
|
|
35: 'Taiwanese double-byte numbering 3',
|
|
36: 'Taiwanese double-byte numbering 4',
|
|
37: 'Chinese double-byte numbering 1',
|
|
38: 'Chinese double-byte numbering 2',
|
|
39: 'Chinese double-byte numbering 3',
|
|
40: 'Chinese double-byte numbering 4',
|
|
41: 'Korean double-byte numbering 1',
|
|
42: 'Korean double-byte numbering 2',
|
|
43: 'Korean double-byte numbering 3',
|
|
44: 'Korean double-byte numbering 4',
|
|
45: 'Hebrew non-standard decimal',
|
|
46: 'Arabic Alif Ba Tah',
|
|
47: 'Hebrew Biblical standard',
|
|
48: 'Arabic Abjad style',
|
|
255: 'No number',
|
|
}
|
|
self.__language_dict = {
|
|
1078 : 'Afrikaans',
|
|
1052 : 'Albanian',
|
|
1025 : 'Arabic',
|
|
5121 : 'Arabic Algeria',
|
|
15361 : 'Arabic Bahrain',
|
|
3073 : 'Arabic Egypt',
|
|
1 : 'Arabic General',
|
|
2049 : 'Arabic Iraq',
|
|
11265 : 'Arabic Jordan',
|
|
13313 : 'Arabic Kuwait',
|
|
12289 : 'Arabic Lebanon',
|
|
4097 : 'Arabic Libya',
|
|
6145 : 'Arabic Morocco',
|
|
8193 : 'Arabic Oman',
|
|
16385 : 'Arabic Qatar',
|
|
10241 : 'Arabic Syria',
|
|
7169 : 'Arabic Tunisia',
|
|
14337 : 'Arabic U.A.E.',
|
|
9217 : 'Arabic Yemen',
|
|
1067 : 'Armenian',
|
|
1101 : 'Assamese',
|
|
2092 : 'Azeri Cyrillic',
|
|
1068 : 'Azeri Latin',
|
|
1069 : 'Basque',
|
|
1093 : 'Bengali',
|
|
4122 : 'Bosnia Herzegovina',
|
|
1026 : 'Bulgarian',
|
|
1109 : 'Burmese',
|
|
1059 : 'Byelorussian',
|
|
1027 : 'Catalan',
|
|
2052 : 'Chinese China',
|
|
4 : 'Chinese General',
|
|
3076 : 'Chinese Hong Kong',
|
|
4100 : 'Chinese Singapore',
|
|
1028 : 'Chinese Taiwan',
|
|
1050 : 'Croatian',
|
|
1029 : 'Czech',
|
|
1030 : 'Danish',
|
|
2067 : 'Dutch Belgium',
|
|
1043 : 'Dutch Standard',
|
|
3081 : 'English Australia',
|
|
10249 : 'English Belize',
|
|
2057 : 'English British',
|
|
4105 : 'English Canada',
|
|
9225 : 'English Caribbean',
|
|
9 : 'English General',
|
|
6153 : 'English Ireland',
|
|
8201 : 'English Jamaica',
|
|
5129 : 'English New Zealand',
|
|
13321 : 'English Philippines',
|
|
7177 : 'English South Africa',
|
|
11273 : 'English Trinidad',
|
|
1033 : 'English United States',
|
|
1061 : 'Estonian',
|
|
1080 : 'Faerose',
|
|
1065 : 'Farsi',
|
|
1035 : 'Finnish',
|
|
1036 : 'French',
|
|
2060 : 'French Belgium',
|
|
11276 : 'French Cameroon',
|
|
3084 : 'French Canada',
|
|
12300 : 'French Cote d\'Ivoire',
|
|
5132 : 'French Luxembourg',
|
|
13324 : 'French Mali',
|
|
6156 : 'French Monaco',
|
|
8204 : 'French Reunion',
|
|
10252 : 'French Senegal',
|
|
4108 : 'French Swiss',
|
|
7180 : 'French West Indies',
|
|
9228 : 'French Democratic Republic of the Congo',
|
|
1122 : 'Frisian',
|
|
1084 : 'Gaelic',
|
|
2108 : 'Gaelic Ireland',
|
|
1110 : 'Galician',
|
|
1079 : 'Georgian',
|
|
1031 : 'German',
|
|
3079 : 'German Austrian',
|
|
5127 : 'German Liechtenstein',
|
|
4103 : 'German Luxembourg',
|
|
2055 : 'German Switzerland',
|
|
1032 : 'Greek',
|
|
1095 : 'Gujarati',
|
|
1037 : 'Hebrew',
|
|
1081 : 'Hindi',
|
|
1038 : 'Hungarian',
|
|
1039 : 'Icelandic',
|
|
1057 : 'Indonesian',
|
|
1040 : 'Italian',
|
|
2064 : 'Italian Switzerland',
|
|
1041 : 'Japanese',
|
|
1099 : 'Kannada',
|
|
1120 : 'Kashmiri',
|
|
2144 : 'Kashmiri India',
|
|
1087 : 'Kazakh',
|
|
1107 : 'Khmer',
|
|
1088 : 'Kirghiz',
|
|
1111 : 'Konkani',
|
|
1042 : 'Korean',
|
|
2066 : 'Korean Johab',
|
|
1108 : 'Lao',
|
|
1062 : 'Latvian',
|
|
1063 : 'Lithuanian',
|
|
2087 : 'Lithuanian Classic',
|
|
1086 : 'Malay',
|
|
2110 : 'Malay Brunei Darussalam',
|
|
1100 : 'Malayalam',
|
|
1082 : 'Maltese',
|
|
1112 : 'Manipuri',
|
|
1102 : 'Marathi',
|
|
1104 : 'Mongolian',
|
|
1121 : 'Nepali',
|
|
2145 : 'Nepali India',
|
|
1044 : 'Norwegian Bokmal',
|
|
2068 : 'Norwegian Nynorsk',
|
|
1096 : 'Oriya',
|
|
1045 : 'Polish',
|
|
1046 : 'Portuguese (Brazil)',
|
|
2070 : 'Portuguese (Portugal)',
|
|
1094 : 'Punjabi',
|
|
1047 : 'Rhaeto-Romanic',
|
|
1048 : 'Romanian',
|
|
2072 : 'Romanian Moldova',
|
|
1049 : 'Russian',
|
|
2073 : 'Russian Moldova',
|
|
1083 : 'Sami Lappish',
|
|
1103 : 'Sanskrit',
|
|
3098 : 'Serbian Cyrillic',
|
|
2074 : 'Serbian Latin',
|
|
1113 : 'Sindhi',
|
|
1051 : 'Slovak',
|
|
1060 : 'Slovenian',
|
|
1070 : 'Sorbian',
|
|
11274 : 'Spanish Argentina',
|
|
16394 : 'Spanish Bolivia',
|
|
13322 : 'Spanish Chile',
|
|
9226 : 'Spanish Colombia',
|
|
5130 : 'Spanish Costa Rica',
|
|
7178 : 'Spanish Dominican Republic',
|
|
12298 : 'Spanish Ecuador',
|
|
17418 : 'Spanish El Salvador',
|
|
4106 : 'Spanish Guatemala',
|
|
18442 : 'Spanish Honduras',
|
|
2058 : 'Spanish Mexico',
|
|
3082 : 'Spanish Modern',
|
|
19466 : 'Spanish Nicaragua',
|
|
6154 : 'Spanish Panama',
|
|
15370 : 'Spanish Paraguay',
|
|
10250 : 'Spanish Peru',
|
|
20490 : 'Spanish Puerto Rico',
|
|
1034 : 'Spanish Traditional',
|
|
14346 : 'Spanish Uruguay',
|
|
8202 : 'Spanish Venezuela',
|
|
1072 : 'Sutu',
|
|
1089 : 'Swahili',
|
|
1053 : 'Swedish',
|
|
2077 : 'Swedish Finland',
|
|
1064 : 'Tajik',
|
|
1097 : 'Tamil',
|
|
1092 : 'Tatar',
|
|
1098 : 'Telugu',
|
|
1054 : 'Thai',
|
|
1105 : 'Tibetan',
|
|
1073 : 'Tsonga',
|
|
1074 : 'Tswana',
|
|
1055 : 'Turkish',
|
|
1090 : 'Turkmen',
|
|
1058 : 'Ukranian',
|
|
1056 : 'Urdu',
|
|
2080 : 'Urdu India',
|
|
2115 : 'Uzbek Cyrillic',
|
|
1091 : 'Uzbek Latin',
|
|
1075 : 'Venda',
|
|
1066 : 'Vietnamese',
|
|
1106 : 'Welsh',
|
|
1076 : 'Xhosa',
|
|
1085 : 'Yiddish',
|
|
1077 : 'Zulu',
|
|
1024 : 'Unkown',
|
|
255 : 'Unkown',
|
|
}
|
|
"""
|
|
# unknown
|
|
# These must get passed on because they occure after \\*
|
|
'do' : ('un', 'unknown___', self.default_func),
|
|
'company' : ('un', 'company___', self.default_func),
|
|
'shpinst' : ('un', 'unknown___', self.default_func),
|
|
'panose' : ('un', 'unknown___', self.default_func),
|
|
'falt' : ('un', 'unknown___', self.default_func),
|
|
'listoverridetable' : ('un', 'unknown___', self.default_func),
|
|
'category' : ('un', 'unknown___', self.default_func),
|
|
'template' : ('un', 'unknown___', self.default_func),
|
|
'ud' : ('un', 'unknown___', self.default_func),
|
|
'formfield' : ('un', 'unknown___', self.default_func),
|
|
'ts' : ('un', 'unknown___', self.default_func),
|
|
'rsidtbl' : ('un', 'unknown___', self.default_func),
|
|
'generator' : ('un', 'unknown___', self.default_func),
|
|
'ftnsep' : ('un', 'unknown___', self.default_func),
|
|
'aftnsep' : ('un', 'unknown___', self.default_func),
|
|
'aftnsepc' : ('un', 'unknown___', self.default_func),
|
|
'aftncn' : ('un', 'unknown___', self.default_func),
|
|
'objclass' : ('un', 'unknown___', self.default_func),
|
|
'objdata' : ('un', 'unknown___', self.default_func),
|
|
'picprop' : ('un', 'unknown___', self.default_func),
|
|
'blipuid' : ('un', 'unknown___', self.default_func),
|
|
"""
|
|
|
|
def __ms_hex_func(self, pre, token, num):
|
|
num = num[1:] # chop off leading 0, which I added
|
|
num = num.upper() # the mappings store hex in caps
|
|
return 'tx<hx<__________<\'%s\n' % num # add an ' for the mappings
|
|
|
|
def ms_sub_func(self, pre, token, num):
|
|
return 'tx<mc<__________<%s\n' % token
|
|
|
|
def direct_conv_func(self, pre, token, num):
|
|
return 'mi<tg<empty_____<%s\n' % token
|
|
|
|
def default_func(self, pre, token, num):
|
|
if num is None:
|
|
num = 'true'
|
|
return 'cw<%s<%s<nu<%s\n' % (pre, token, num)
|
|
|
|
def colorz_func(self, pre, token, num):
|
|
if num is None:
|
|
num = '0'
|
|
return 'cw<%s<%s<nu<%s\n' % (pre, token, num)
|
|
|
|
def __list_type_func(self, pre, token, num):
|
|
type = 'arabic'
|
|
if num is None:
|
|
type = 'Arabic'
|
|
else:
|
|
try:
|
|
num = int(num)
|
|
except ValueError:
|
|
if self.__run_level > 3:
|
|
msg = 'Number "%s" cannot be converted to integer\n' % num
|
|
raise self.__bug_handler(msg)
|
|
type = self.__number_type_dict.get(num)
|
|
if type is None:
|
|
if self.__run_level > 3:
|
|
msg = 'No type for "%s" in self.__number_type_dict\n'
|
|
raise self.__bug_handler
|
|
type = 'Arabic'
|
|
return 'cw<%s<%s<nu<%s\n' % (pre, token, type)
|
|
|
|
def __language_func(self, pre, token, num):
|
|
lang_name = self.__language_dict.get(int(re.search('[0-9]+', num).group()))
|
|
if not lang_name:
|
|
lang_name = "not defined"
|
|
if self.__run_level > 3:
|
|
msg = 'No entry for number "%s"' % num
|
|
raise self.__bug_handler(msg)
|
|
return 'cw<%s<%s<nu<%s\n' % (pre, token, lang_name)
|
|
|
|
def two_part_func(self, pre, token, num):
|
|
list = token.split("<")
|
|
token = list[0]
|
|
num = list[1]
|
|
return 'cw<%s<%s<nu<%s\n' % (pre, token, num)
|
|
# return 'cw<nu<nu<nu<%s>num<%s\n' % (token, num)
|
|
|
|
def divide_by_2(self, pre, token, num):
|
|
num = self.divide_num(num, 2)
|
|
return 'cw<%s<%s<nu<%s\n' % (pre, token, num)
|
|
# return 'cw<nu<nu<nu<%s>%s<%s\n' % (token, num, token)
|
|
|
|
def divide_by_20(self, pre, token, num):
|
|
num = self.divide_num(num, 20)
|
|
return 'cw<%s<%s<nu<%s\n' % (pre, token, num)
|
|
# return 'cw<nu<nu<nu<%s>%s<%s\n' % (token, num, token)
|
|
|
|
def text_func(self, pre, token, num=None):
|
|
return 'tx<nu<__________<%s\n' % token
|
|
|
|
def ob_func(self, pre, token, num=None):
|
|
self.__bracket_count += 1
|
|
return 'ob<nu<open-brack<%04d\n' % self.__bracket_count
|
|
|
|
def cb_func(self, pre, token, num=None):
|
|
line = 'cb<nu<clos-brack<%04d\n' % self.__bracket_count
|
|
self.__bracket_count -= 1
|
|
return line
|
|
|
|
def color_func(self, pre, token, num):
|
|
third_field = 'nu'
|
|
if num[-1] == ';':
|
|
num = num[:-1]
|
|
third_field = 'en'
|
|
num = str('%X' % int(num))
|
|
if len(num) != 2:
|
|
num = "0" + num
|
|
return 'cw<%s<%s<%s<%s\n' % (pre, token, third_field, num)
|
|
# return 'cw<cl<%s<nu<nu<%s>%s<%s\n' % (third_field, token, num, token)
|
|
|
|
def bool_st_func(self, pre, token, num):
|
|
if num is None or num == '' or num == '1':
|
|
return 'cw<%s<%s<nu<true\n' % (pre, token)
|
|
# return 'cw<nu<nu<nu<%s>true<%s\n' % (token, token)
|
|
elif num == '0':
|
|
return 'cw<%s<%s<nu<false\n' % (pre, token)
|
|
# return 'cw<nu<nu<nu<%s>false<%s\n' % (token, token)
|
|
else:
|
|
msg = "boolean should have some value module process tokens\ntoken is %s\n'%s'\n" % (token, num)
|
|
raise self.__bug_handler(msg)
|
|
|
|
def __no_sup_sub_func(self, pre, token, num):
|
|
the_string = 'cw<ci<subscript_<nu<false\n'
|
|
the_string += 'cw<ci<superscrip<nu<false\n'
|
|
return the_string
|
|
|
|
def divide_num(self, numerator, denominator):
|
|
try:
|
|
# calibre why ignore negative number? Wrong in case of \fi
|
|
numerator = float(re.search('[0-9.\\-]+', numerator).group())
|
|
except TypeError as msg:
|
|
if self.__run_level > 3:
|
|
msg = ('No number to process?\nthis indicates that the token \\(\\li\\) \
|
|
should have a number and does not\nnumerator is \
|
|
"%s"\ndenominator is "%s"\n') % (numerator, denominator)
|
|
raise self.__bug_handler(msg)
|
|
if 5 > self.__return_code:
|
|
self.__return_code = 5
|
|
return 0
|
|
num = '%0.2f' % round(numerator/denominator, 2)
|
|
return num
|
|
string_num = str(num)
|
|
if string_num[-2:] == ".0":
|
|
string_num = string_num[:-2]
|
|
return string_num
|
|
|
|
def split_let_num(self, token):
|
|
match_obj = re.search(self.__num_exp,token)
|
|
if match_obj is not None:
|
|
first = match_obj.group(1)
|
|
second = match_obj.group(2)
|
|
if not second:
|
|
if self.__run_level > 3:
|
|
msg = "token is '%s' \n" % token
|
|
raise self.__bug_handler(msg)
|
|
return first, 0
|
|
else:
|
|
if self.__run_level > 3:
|
|
msg = "token is '%s' \n" % token
|
|
raise self.__bug_handler
|
|
return token, 0
|
|
return first, second
|
|
|
|
def convert_to_hex(self,number):
|
|
"""Convert a string to uppercase hexidecimal"""
|
|
num = int(number)
|
|
try:
|
|
hex_num = "%X" % num
|
|
return hex_num
|
|
except:
|
|
raise self.__bug_handler
|
|
|
|
def process_cw(self, token):
|
|
"""Change the value of the control word by determining what dictionary
|
|
it belongs to"""
|
|
special = ['*', ':', '}', '{', '~', '_', '-', ';']
|
|
# if token != "{" or token != "}":
|
|
token = token[1:] # strip off leading \
|
|
token = token.replace(" ", "")
|
|
# if not token: return
|
|
only_alpha = token.isalpha()
|
|
num = None
|
|
if not only_alpha and token not in special:
|
|
token, num = self.split_let_num(token)
|
|
pre, token, action = self.dict_token.get(token, (None, None, None))
|
|
if action:
|
|
return action(pre, token, num)
|
|
|
|
def __check_brackets(self, in_file):
|
|
self.__check_brack_obj = check_brackets.CheckBrackets(file=in_file)
|
|
good_br = self.__check_brack_obj.check_brackets()[0]
|
|
if not good_br:
|
|
return 1
|
|
|
|
def process_tokens(self):
|
|
"""Main method for handling other methods. """
|
|
line_count = 0
|
|
with open_for_read(self.__file) as read_obj:
|
|
with open_for_write(self.__write_to) as write_obj:
|
|
for line in read_obj:
|
|
token = line.replace("\n", "")
|
|
line_count += 1
|
|
if line_count == 1 and token != '\\{':
|
|
msg = '\nInvalid RTF: document doesn\'t start with {\n'
|
|
raise self.__exception_handler(msg)
|
|
elif line_count == 2 and token[0:4] != '\\rtf':
|
|
msg = '\nInvalid RTF: document doesn\'t start with \\rtf \n'
|
|
raise self.__exception_handler(msg)
|
|
|
|
the_index = token.find('\\ ')
|
|
if token is not None and the_index > -1:
|
|
msg = '\nInvalid RTF: token "\\ " not valid.\nError at line %d'\
|
|
% line_count
|
|
raise self.__exception_handler(msg)
|
|
elif token[:1] == "\\":
|
|
line = self.process_cw(token)
|
|
if line is not None:
|
|
write_obj.write(line)
|
|
else:
|
|
fields = re.split(self.__utf_exp, token)
|
|
for field in fields:
|
|
if not field:
|
|
continue
|
|
if field[0:1] == '&':
|
|
write_obj.write('tx<ut<__________<%s\n' % field)
|
|
else:
|
|
write_obj.write('tx<nu<__________<%s\n' % field)
|
|
|
|
if not line_count:
|
|
msg = '\nInvalid RTF: file appears to be empty.\n'
|
|
raise self.__exception_handler(msg)
|
|
|
|
copy_obj = copy.Copy(bug_handler=self.__bug_handler)
|
|
if self.__copy:
|
|
copy_obj.copy_file(self.__write_to, "processed_tokens.data")
|
|
copy_obj.rename(self.__write_to, self.__file)
|
|
os.remove(self.__write_to)
|
|
|
|
bad_brackets = self.__check_brackets(self.__file)
|
|
if bad_brackets:
|
|
msg = '\nInvalid RTF: document does not have matching brackets.\n'
|
|
raise self.__exception_handler(msg)
|
|
else:
|
|
return self.__return_code
|