1
0
mirror of https://github.com/gryf/ebook-converter.git synced 2026-03-13 13:15:53 +01:00

Initial import

This commit is contained in:
2020-03-31 17:15:23 +02:00
commit d97ea9b0bc
311 changed files with 131419 additions and 0 deletions

View File

@@ -0,0 +1,573 @@
from __future__ import absolute_import, division, print_function, unicode_literals
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# #
#########################################################################
# $Revision: 1.41 $
# $Date: 2006/03/24 23:50:07 $
import sys, os
from calibre.ebooks.rtf2xml import headings_to_sections, \
line_endings, footnote, fields_small, default_encoding, \
make_lists, preamble_div, header, colors, group_borders, \
check_encoding, add_brackets, table, combine_borders, \
fields_large, process_tokens, hex_2_utf8, tokenize, \
delete_info, sections, check_brackets, styles, \
paragraph_def, convert_to_tags, output, copy, \
list_numbers, info, pict, table_info, fonts, paragraphs, \
body_styles, preamble_rest, group_styles, \
inline
from calibre.ebooks.rtf2xml.old_rtf import OldRtf
from polyglot.builtins import unicode_type
from . import open_for_read, open_for_write
"""
Here is an example script using the ParseRTF module directly
#!/usr/bin/env python2
def Handle_Main():
# Handles options and creates a parse object
parse_obj =ParseRtf.ParseRtf(
in_file = 'in.rtf',
# All values from here on are optional
# determine the output file
out_file = 'out.xml',
# determine the run level. The default is 1.
run_level = 3,
# The name of a debug directory, if you are running at
# run level 3 or higher.
debug = 'debug_dir',
# Convert RTF caps to real caps.
# Default is 1.
convert_caps = 1,
# Indent resulting XML.
# Default is 0 (no indent).
indent = 1,
# Form lists from RTF. Default is 1.
form_lists = 1,
# Convert headings to sections. Default is 0.
headings_to_sections = 1,
# Group paragraphs with the same style name. Default is 1.
group_styles = 1,
# Group borders. Default is 1.
group_borders = 1,
# Write or do not write paragraphs. Default is 0.
empty_paragraphs = 0,
# Allow to use a custom default encoding as fallback
default_encoding = 'cp1252',
)
try:
parse_obj.parse_rtf()
except ParseRtf.InvalidRtfException, msg:
sys.stderr.write(msg)
except ParseRtf.RtfInvalidCodeException, msg:
sys.stderr.write(msg)
"""
class InvalidRtfException(Exception):
    """
    Signals invalid RTF input.

    ParseRtf hands this class to the processing steps as their
    'exception_handler' / 'invalid_rtf_handler'.
    """
class RtfInvalidCodeException(Exception):
    """
    Signals a bug in the program.

    ParseRtf hands this class to the processing steps as their
    'bug_handler'.
    """
class ParseRtf:
    """
    Main class for controlling the rest of the parsing.

    An instance copies the input into a working file and then runs the
    rtf2xml processing steps over that working file, one after the other
    (see parse_rtf).
    """

    def __init__(self,
                 in_file,
                 out_file='',
                 out_dir=None,
                 dtd='',
                 deb_dir=None,
                 convert_symbol=None,
                 convert_wingdings=None,
                 convert_zapf=None,
                 convert_caps=None,
                 run_level=1,
                 indent=None,
                 replace_illegals=1,
                 form_lists=1,
                 headings_to_sections=1,
                 group_styles=1,
                 group_borders=1,
                 empty_paragraphs=1,
                 no_dtd=0,
                 char_data='',
                 default_encoding='cp1252',
                 ):
        """
        Store the conversion options and validate the given paths.
        Requires:
            'in_file' --path of the file to parse, or an object with a
            'read' attribute (such objects are not checked for existence)
        Possible parameters, but not necessary:
            'out_file', 'out_dir' --handed to the Pict and Output steps
            'dtd' --handed to ConvertToTags as 'dtd_path'
            'deb_dir' --debug directory. If given it must exist; every
            step is then told to keep a copy of its result
            'convert_symbol', 'convert_wingdings', 'convert_zapf',
            'convert_caps' --handed to the body pass of Hex2Utf8
            'run_level' --handed to every step; a value above 2 also
            turns on the bracket check after most steps
            'indent', 'no_dtd' --handed to ConvertToTags
            'replace_illegals' --handed to FixLineEndings
            'form_lists', 'headings_to_sections', 'group_styles',
            'group_borders' --switch the corresponding step on or off
            'empty_paragraphs' --handed to Paragraphs as
            'write_empty_para'
            'char_data' --handed to Hex2Utf8 as 'char_file'
            'default_encoding' --handed to DefaultEncoding
        Returns: Nothing
        Raises:
            RtfInvalidCodeException if 'in_file' is None or cannot be
            found, or if 'out_dir' / 'deb_dir' is given but is not a
            directory.
        """
        self.__file = in_file
        self.__out_file = out_file
        self.__out_dir = out_dir
        self.__temp_dir = out_dir
        self.__dtd_path = dtd
        self.__check_file(in_file, "file_to_parse")
        self.__char_data = char_data
        self.__debug_dir = deb_dir
        self.__check_dir(self.__temp_dir)
        # 1 when a debug directory was given, otherwise None; passed to
        # the steps as their 'copy' flag.
        self.__copy = self.__check_dir(self.__debug_dir)
        self.__convert_caps = convert_caps
        self.__convert_symbol = convert_symbol
        self.__convert_wingdings = convert_wingdings
        self.__convert_zapf = convert_zapf
        self.__run_level = run_level
        self.__exit_level = 0
        self.__indent = indent
        self.__replace_illegals = replace_illegals
        self.__form_lists = form_lists
        self.__headings_to_sections = headings_to_sections
        self.__group_styles = group_styles
        self.__group_borders = group_borders
        self.__empty_paragraphs = empty_paragraphs
        self.__no_dtd = no_dtd
        self.__default_encoding = default_encoding

    def __check_file(self, the_file, type):
        """
        Check to see if files exist.

        Objects with a 'read' attribute are accepted as they are. None is
        only an error for the "file_to_parse" type.
        Raises RtfInvalidCodeException when the check fails.
        """
        if hasattr(the_file, 'read'):
            return
        if the_file is None:
            if type == "file_to_parse":
                msg = "\nYou must provide a file for the script to work"
                raise RtfInvalidCodeException(msg)
        elif os.path.exists(the_file):
            pass  # do nothing
        else:
            msg = "\nThe file '%s' cannot be found" % the_file
            raise RtfInvalidCodeException(msg)

    def __check_dir(self, the_dir):
        """
        Check to see if directory exists.

        Returns None when no directory was given and 1 when it exists.
        Raises RtfInvalidCodeException when it is not a directory.
        """
        if not the_dir:
            return
        dir_exists = os.path.isdir(the_dir)
        if not dir_exists:
            msg = "\n%s is not a directory" % the_dir
            raise RtfInvalidCodeException(msg)
        return 1

    @staticmethod
    def __display_name(the_file):
        """
        Return a text name for 'the_file' suitable for an error message.

        File-like objects are represented by their 'name' attribute when
        they have one; byte paths are decoded instead of being formatted
        as a bytes literal.
        """
        if hasattr(the_file, 'read'):
            the_file = getattr(the_file, 'name', the_file)
        if isinstance(the_file, bytes):
            return the_file.decode('utf-8', 'replace')
        return '%s' % (the_file,)

    def parse_rtf(self):
        """
        Parse the file by calling on other classes.
        Requires:
            Nothing
        Returns:
            The exit level (0 unless the line-endings step reported a
            higher value). The result itself is written by the Output
            step, which receives 'out_file' and 'out_dir'.
        Raises:
            InvalidRtfException when token processing rejects the input.
        """
        self.__temp_file = self.__make_temp_file(self.__file)
        # if self.__debug_dir is true, then create a copy object,
        # set the directory to write to, remove files, and copy
        # the new temporary file to this directory
        if self.__debug_dir:
            copy_obj = copy.Copy(
                bug_handler=RtfInvalidCodeException,
            )
            copy_obj.set_dir(self.__debug_dir)
            copy_obj.remove_files()
            copy_obj.copy_file(self.__temp_file, "original_file")
        # Function to check if bracket are well handled
        if self.__debug_dir or self.__run_level > 2:
            self.__check_brack_obj = check_brackets.CheckBrackets(
                file=self.__temp_file,
                bug_handler=RtfInvalidCodeException,
            )
        # convert Macintosh and Windows line endings to Unix line endings
        line_obj = line_endings.FixLineEndings(
            in_file=self.__temp_file,
            bug_handler=RtfInvalidCodeException,
            copy=self.__copy,
            run_level=self.__run_level,
            replace_illegals=self.__replace_illegals,
        )
        return_value = line_obj.fix_endings()
        self.__return_code(return_value)
        tokenize_obj = tokenize.Tokenize(
            bug_handler=RtfInvalidCodeException,
            in_file=self.__temp_file,
            copy=self.__copy,
            run_level=self.__run_level)
        tokenize_obj.tokenize()
        process_tokens_obj = process_tokens.ProcessTokens(
            in_file=self.__temp_file,
            bug_handler=RtfInvalidCodeException,
            copy=self.__copy,
            run_level=self.__run_level,
            exception_handler=InvalidRtfException,
        )
        try:
            return_value = process_tokens_obj.process_tokens()
        except InvalidRtfException as err:
            # Check to see if the file is correctly encoded
            encode_obj = default_encoding.DefaultEncoding(
                in_file=self.__temp_file,
                run_level=self.__run_level,
                bug_handler=RtfInvalidCodeException,
                check_raw=True,
                default_encoding=self.__default_encoding,
            )
            platform, code_page, default_font_num = encode_obj.find_default_encoding()
            check_encoding_obj = check_encoding.CheckEncoding(
                bug_handler=RtfInvalidCodeException,
            )
            enc = encode_obj.get_codepage()
            # TODO: to check if cp is a good idea or if I should use a dict to convert
            enc = 'cp' + enc
            msg = '%s\nException in token processing' % unicode_type(err)
            if check_encoding_obj.check_encoding(self.__file, enc):
                # Format a text name: encoding the path to bytes would
                # show up as b'...' in the message on Python 3 and fail
                # outright for file-like input.
                file_name = self.__display_name(self.__file)
                msg += '\nFile %s does not appear to be correctly encoded.\n' % file_name
            try:
                os.remove(self.__temp_file)
            except OSError:
                pass
            raise InvalidRtfException(msg)
        delete_info_obj = delete_info.DeleteInfo(
            in_file=self.__temp_file,
            copy=self.__copy,
            bug_handler=RtfInvalidCodeException,
            run_level=self.__run_level,)
        # found destination means {\*\destination
        # if found, the RTF should be newer RTF
        found_destination = delete_info_obj.delete_info()
        self.__bracket_match('delete_data_info')
        # put picts in a separate file
        pict_obj = pict.Pict(
            in_file=self.__temp_file,
            bug_handler=RtfInvalidCodeException,
            copy=self.__copy,
            orig_file=self.__file,
            out_file=self.__out_file,
            run_level=self.__run_level,
        )
        pict_obj.process_pict()
        self.__bracket_match('pict_data_info')
        combine_obj = combine_borders.CombineBorders(
            in_file=self.__temp_file,
            bug_handler=RtfInvalidCodeException,
            copy=self.__copy,
            run_level=self.__run_level,)
        combine_obj.combine_borders()
        self.__bracket_match('combine_borders_info')
        # footnotes and headers are separated here and joined again just
        # before the conversion to tags
        footnote_obj = footnote.Footnote(
            in_file=self.__temp_file,
            bug_handler=RtfInvalidCodeException,
            copy=self.__copy,
            run_level=self.__run_level,
        )
        footnote_obj.separate_footnotes()
        self.__bracket_match('separate_footnotes_info')
        header_obj = header.Header(
            in_file=self.__temp_file,
            bug_handler=RtfInvalidCodeException,
            copy=self.__copy,
            run_level=self.__run_level,
        )
        header_obj.separate_headers()
        self.__bracket_match('separate_headers_info')
        list_numbers_obj = list_numbers.ListNumbers(
            in_file=self.__temp_file,
            bug_handler=RtfInvalidCodeException,
            copy=self.__copy,
            run_level=self.__run_level,
        )
        list_numbers_obj.fix_list_numbers()
        self.__bracket_match('list_number_info')
        preamble_div_obj = preamble_div.PreambleDiv(
            in_file=self.__temp_file,
            bug_handler=RtfInvalidCodeException,
            copy=self.__copy,
            run_level=self.__run_level,
        )
        list_of_lists = preamble_div_obj.make_preamble_divisions()
        self.__bracket_match('make_preamble_divisions')
        encode_obj = default_encoding.DefaultEncoding(
            in_file=self.__temp_file,
            run_level=self.__run_level,
            bug_handler=RtfInvalidCodeException,
            default_encoding=self.__default_encoding,
        )
        platform, code_page, default_font_num = encode_obj.find_default_encoding()
        # first hex conversion pass: preamble only; the same object is
        # reused for the body further down
        hex2utf_obj = hex_2_utf8.Hex2Utf8(
            in_file=self.__temp_file,
            copy=self.__copy,
            area_to_convert='preamble',
            char_file=self.__char_data,
            default_char_map=code_page,
            run_level=self.__run_level,
            bug_handler=RtfInvalidCodeException,
            invalid_rtf_handler=InvalidRtfException,
        )
        hex2utf_obj.convert_hex_2_utf8()
        self.__bracket_match('hex_2_utf_preamble')
        fonts_obj = fonts.Fonts(
            in_file=self.__temp_file,
            bug_handler=RtfInvalidCodeException,
            copy=self.__copy,
            default_font_num=default_font_num,
            run_level=self.__run_level,
        )
        special_font_dict = fonts_obj.convert_fonts()
        self.__bracket_match('fonts_info')
        color_obj = colors.Colors(
            in_file=self.__temp_file,
            copy=self.__copy,
            bug_handler=RtfInvalidCodeException,
            run_level=self.__run_level,
        )
        color_obj.convert_colors()
        self.__bracket_match('colors_info')
        style_obj = styles.Styles(
            in_file=self.__temp_file,
            bug_handler=RtfInvalidCodeException,
            copy=self.__copy,
            run_level=self.__run_level,
        )
        style_obj.convert_styles()
        self.__bracket_match('styles_info')
        info_obj = info.Info(
            in_file=self.__temp_file,
            bug_handler=RtfInvalidCodeException,
            copy=self.__copy,
            run_level=self.__run_level,
        )
        info_obj.fix_info()
        default_font = special_font_dict.get('default-font')
        preamble_rest_obj = preamble_rest.Preamble(
            file=self.__temp_file, copy=self.__copy,
            bug_handler=RtfInvalidCodeException,
            platform=platform, default_font=default_font,
            code_page=code_page)
        preamble_rest_obj.fix_preamble()
        self.__bracket_match('preamble_rest_info')
        old_rtf_obj = OldRtf(
            in_file=self.__temp_file,
            bug_handler=RtfInvalidCodeException,
            run_level=self.__run_level,
        )
        # RTF can actually have destination groups and old RTF.
        # BAH!
        old_rtf = old_rtf_obj.check_if_old_rtf()
        if old_rtf:
            if self.__run_level > 5:
                msg = 'Older RTF\n' \
                    'self.__run_level is "%s"\n' % self.__run_level
                raise RtfInvalidCodeException(msg)
            if self.__run_level > 1:
                sys.stderr.write('File could be older RTF...\n')
            if found_destination:
                if self.__run_level > 1:
                    sys.stderr.write(
                        'File also has newer RTF.\n'
                        'Will do the best to convert...\n'
                    )
            add_brackets_obj = add_brackets.AddBrackets(
                in_file=self.__temp_file,
                bug_handler=RtfInvalidCodeException,
                copy=self.__copy,
                run_level=self.__run_level,
            )
            add_brackets_obj.add_brackets()
        fields_small_obj = fields_small.FieldsSmall(
            in_file=self.__temp_file,
            copy=self.__copy,
            bug_handler=RtfInvalidCodeException,
            run_level=self.__run_level,)
        fields_small_obj.fix_fields()
        self.__bracket_match('fix_small_fields_info')
        fields_large_obj = fields_large.FieldsLarge(
            in_file=self.__temp_file,
            copy=self.__copy,
            bug_handler=RtfInvalidCodeException,
            run_level=self.__run_level)
        fields_large_obj.fix_fields()
        self.__bracket_match('fix_large_fields_info')
        sections_obj = sections.Sections(
            in_file=self.__temp_file,
            bug_handler=RtfInvalidCodeException,
            copy=self.__copy,
            run_level=self.__run_level,)
        sections_obj.make_sections()
        self.__bracket_match('sections_info')
        paragraphs_obj = paragraphs.Paragraphs(
            in_file=self.__temp_file,
            bug_handler=RtfInvalidCodeException,
            copy=self.__copy,
            write_empty_para=self.__empty_paragraphs,
            run_level=self.__run_level,)
        paragraphs_obj.make_paragraphs()
        self.__bracket_match('paragraphs_info')
        # NOTE(review): unlike the .get() lookup above, this raises
        # KeyError when the font step reported no 'default-font'; kept
        # as it was because the paragraph step's needs are not visible
        # from here.
        default_font = special_font_dict['default-font']
        paragraph_def_obj = paragraph_def.ParagraphDef(
            in_file=self.__temp_file,
            bug_handler=RtfInvalidCodeException,
            copy=self.__copy,
            default_font=default_font,
            run_level=self.__run_level,)
        list_of_styles = paragraph_def_obj.make_paragraph_def()
        body_styles_obj = body_styles.BodyStyles(
            in_file=self.__temp_file,
            bug_handler=RtfInvalidCodeException,
            copy=self.__copy,
            list_of_styles=list_of_styles,
            run_level=self.__run_level,)
        body_styles_obj.insert_info()
        self.__bracket_match('body_styles_info')
        self.__bracket_match('paragraph_def_info')
        table_obj = table.Table(
            in_file=self.__temp_file,
            bug_handler=RtfInvalidCodeException,
            copy=self.__copy,
            run_level=self.__run_level,)
        table_data = table_obj.make_table()
        self.__bracket_match('table_info')
        table_info_obj = table_info.TableInfo(
            in_file=self.__temp_file,
            bug_handler=RtfInvalidCodeException,
            copy=self.__copy,
            table_data=table_data,
            run_level=self.__run_level,)
        table_info_obj.insert_info()
        self.__bracket_match('table__data_info')
        # optional steps, switched by the constructor options
        if self.__form_lists:
            make_list_obj = make_lists.MakeLists(
                in_file=self.__temp_file,
                bug_handler=RtfInvalidCodeException,
                copy=self.__copy,
                headings_to_sections=self.__headings_to_sections,
                run_level=self.__run_level,
                list_of_lists=list_of_lists,
            )
            make_list_obj.make_lists()
            self.__bracket_match('form_lists_info')
        if self.__headings_to_sections:
            headings_to_sections_obj = headings_to_sections.HeadingsToSections(
                in_file=self.__temp_file,
                bug_handler=RtfInvalidCodeException,
                copy=self.__copy,
                run_level=self.__run_level,)
            headings_to_sections_obj.make_sections()
            self.__bracket_match('headings_to_sections_info')
        if self.__group_styles:
            group_styles_obj = group_styles.GroupStyles(
                in_file=self.__temp_file,
                bug_handler=RtfInvalidCodeException,
                copy=self.__copy,
                wrap=1,
                run_level=self.__run_level,)
            group_styles_obj.group_styles()
            self.__bracket_match('group_styles_info')
        if self.__group_borders:
            group_borders_obj = group_borders.GroupBorders(
                in_file=self.__temp_file,
                bug_handler=RtfInvalidCodeException,
                copy=self.__copy,
                wrap=1,
                run_level=self.__run_level,)
            group_borders_obj.group_borders()
            self.__bracket_match('group_borders_info')
        inline_obj = inline.Inline(
            in_file=self.__temp_file,
            bug_handler=RtfInvalidCodeException,
            copy=self.__copy,
            run_level=self.__run_level,)
        inline_obj.form_tags()
        self.__bracket_match('inline_info')
        # second hex conversion pass: the body
        hex2utf_obj.update_values(file=self.__temp_file,
                                  area_to_convert='body',
                                  copy=self.__copy,
                                  char_file=self.__char_data,
                                  convert_caps=self.__convert_caps,
                                  convert_symbol=self.__convert_symbol,
                                  convert_wingdings=self.__convert_wingdings,
                                  convert_zapf=self.__convert_zapf,
                                  symbol=1,
                                  wingdings=1,
                                  dingbats=1,
                                  )
        hex2utf_obj.convert_hex_2_utf8()
        header_obj.join_headers()
        footnote_obj.join_footnotes()
        tags_obj = convert_to_tags.ConvertToTags(
            in_file=self.__temp_file,
            copy=self.__copy,
            dtd_path=self.__dtd_path,
            indent=self.__indent,
            run_level=self.__run_level,
            no_dtd=self.__no_dtd,
            encoding=encode_obj.get_codepage(),
            bug_handler=RtfInvalidCodeException,
        )
        tags_obj.convert_to_tags()
        output_obj = output.Output(
            file=self.__temp_file,
            orig_file=self.__file,
            output_dir=self.__out_dir,
            out_file=self.__out_file,
        )
        output_obj.output()
        os.remove(self.__temp_file)
        return self.__exit_level

    def __bracket_match(self, file_name):
        """
        At run level 3 or higher, check the brackets of the working file
        and report a mismatch on standard error, labelled with
        'file_name'.
        """
        if self.__run_level > 2:
            good_br, msg = self.__check_brack_obj.check_brackets()
            if good_br:
                pass
                # sys.stderr.write( msg + ' in ' + file_name + "\n")
            else:
                msg = '%s in file %s' % (msg, file_name)
                print(msg, file=sys.stderr)

    def __return_code(self, num):
        """
        Raise the stored exit level to 'num' when 'num' is higher.

        None is ignored. The value is converted with int() before it is
        stored so that the exit level always stays an integer, whatever
        type the step returned.
        """
        if num is None:
            return
        num = int(num)
        if num > self.__exit_level:
            self.__exit_level = num

    def __make_temp_file(self, file):
        """
        Make a temporary file to parse.

        Copies 'file' (a path, or an object with a 'read' attribute) line
        by line into the working file and returns the working file's
        name. A handle opened here is closed again; a handle passed in by
        the caller is left open.
        """
        write_file = "rtf_write_file"
        opened_here = not hasattr(file, 'read')
        read_obj = open_for_read(file) if opened_here else file
        try:
            with open_for_write(write_file) as write_obj:
                for line in read_obj:
                    write_obj.write(line)
        finally:
            if opened_here:
                read_obj.close()
        return write_file

View File

@@ -0,0 +1,12 @@
from __future__ import unicode_literals, absolute_import, print_function, division
import io
def open_for_read(path):
    """
    Open 'path' for reading as UTF-8 text.

    Bytes that cannot be decoded are replaced instead of raising.
    """
    handle = io.open(path, mode='r', encoding='utf-8', errors='replace')
    return handle
def open_for_write(path, append=False):
    """
    Open 'path' for writing UTF-8 text.

    With 'append' true the file is opened in append mode, otherwise it is
    truncated. Newlines are written exactly as given (newline='').
    """
    if append:
        mode = 'a'
    else:
        mode = 'w'
    return io.open(path, mode, encoding='utf-8', errors='replace', newline='')

View File

@@ -0,0 +1,232 @@
from __future__ import unicode_literals, absolute_import, print_function, division
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# #
# #
#########################################################################
import sys, os
from calibre.ebooks.rtf2xml import copy, check_brackets
from calibre.ptempfile import better_mktemp
from polyglot.builtins import iteritems
from . import open_for_read, open_for_write
class AddBrackets:
    """
    Add brackets for old RTF.
    Logic:
    When control words without their own brackets are encountered
    and in the list of allowed words, this will add brackets
    to facilitate the treatment of the file
    """

    # Control-word tokens (first 16 characters of a line) that get wrapped
    # in a generated bracket group. Kept as a frozenset because membership
    # is tested for every line of the body.
    __accept = frozenset((
        'cw<ci<bold______',
        'cw<ci<annotation',
        'cw<ci<blue______',
        'cw<ci<caps______',
        'cw<ci<char-style',
        'cw<ci<dbl-strike',
        'cw<ci<emboss____',
        'cw<ci<engrave___',
        'cw<ci<font-color',
        'cw<ci<font-down_',
        'cw<ci<font-size_',
        'cw<ci<font-style',
        'cw<ci<font-up___',
        'cw<ci<footnot-mk',
        'cw<ci<green_____',
        'cw<ci<hidden____',
        'cw<ci<italics___',
        'cw<ci<outline___',
        'cw<ci<red_______',
        'cw<ci<shadow____',
        'cw<ci<small-caps',
        'cw<ci<strike-thr',
        'cw<ci<subscript_',
        'cw<ci<superscrip',
        'cw<ci<underlined',
    ))

    def __init__(self, in_file,
                 bug_handler,
                 copy=None,
                 run_level=1,
                 ):
        """
        Required:
            'in_file'--file to parse
            'bug_handler'--exception class raised for internal errors
        Optional:
            'copy'-- whether to make a copy of result for debugging
            'run_level'-- above 0, a warning is written to standard
            error when the result has to be discarded
        Returns:
            nothing
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__write_to = better_mktemp()
        self.__run_level = run_level
        # state name -> method handling one line in that state
        self.__state_dict = {
            'before_body': self.__before_body_func,
            'in_body': self.__in_body_func,
            'after_control_word': self.__after_control_word_func,
            'in_ignore': self.__ignore_func,
        }

    def __initiate_values(self):
        """
        Init temp values
        """
        self.__state = 'before_body'
        # currently active control words: token -> value
        self.__inline = {}
        # accepted control-word lines collected since the last other line
        self.__temp_group = []
        # True while a bracket generated by this class is still open
        self.__open_bracket = False
        self.__found_brackets = False

    def __before_body_func(self, line):
        """
        If we are before the body, not interested in changing anything:
        the line is copied and the state switches once the body opens.
        """
        if self.__token_info == 'mi<mk<body-open_':
            self.__state = 'in_body'
        self.__write_obj.write(line)

    def __in_body_func(self, line):
        """
        Select what action to take in body:
            1-At the closing bracket numbered 0001, close the generated
              bracket first if one is still open
            2-If an open bracket is found the code inside is ignored
              (written without modifications)
            3-If an accepted control word is found put the line
              in a buffer then change state to after cw
            4-Else simply write the line
        """
        if line == 'cb<nu<clos-brack<0001\n' and self.__open_bracket:
            self.__write_obj.write(
                'cb<nu<clos-brack<0003\n'
            )
            self.__write_obj.write(line)
        elif self.__token_info == 'ob<nu<open-brack':
            self.__found_brackets = True
            self.__state = 'in_ignore'
            # remember which bracket number ends the ignored group
            self.__ignore_count = self.__ob_count
            self.__write_obj.write(line)
        elif self.__token_info in self.__accept:
            self.__temp_group.append(line)
            self.__state = 'after_control_word'
        else:
            self.__write_obj.write(line)

    def __after_control_word_func(self, line):
        """
        After a cw either add next allowed cw to temporary list or
        change group and write it.
        If the token leading to an exit is an open bracket go to
        ignore otherwise goto in body
        """
        if self.__token_info in self.__accept:
            self.__temp_group.append(line)
        else:
            self.__change_permanent_group()
            self.__write_group()
            self.__write_obj.write(line)
            if self.__token_info == 'ob<nu<open-brack':
                self.__state = 'in_ignore'
                self.__ignore_count = self.__ob_count
            else:
                self.__state = 'in_body'

    def __write_group(self):
        """
        Write the generated group once a run of accepted control words
        ends: close the previously generated bracket if one is open, then
        open a new one containing every active control word whose value
        is not 'false'. Nothing is opened when no such word is left.
        """
        if self.__open_bracket:
            self.__write_obj.write(
                'cb<nu<clos-brack<0003\n'
            )
            self.__open_bracket = False

        inline_string = ''.join(['%s<nu<%s\n' % (k, v)
                                 for k, v in self.__inline.items()
                                 if v != 'false'])
        if inline_string:
            self.__write_obj.write('ob<nu<open-brack<0003\n'
                                   '%s' % inline_string)
            self.__open_bracket = True
        self.__temp_group = []

    def __change_permanent_group(self):
        """
        Use temp group to change permanent group: the active control
        words become exactly those buffered in the temp group (token ->
        value). Lines whose token is not accepted are left out.
        """
        self.__inline = {line[:16]: line[20:-1]
                         for line in self.__temp_group
                         if line[:16] in self.__accept}

    def __ignore_func(self, line):
        """
        Just copy data inside of RTF brackets already here, and return to
        the body once the bracket that started the group is closed.
        """
        self.__write_obj.write(line)
        if self.__token_info == 'cb<nu<clos-brack'\
                and self.__cb_count == self.__ignore_count:
            self.__state = 'in_body'

    def __check_brackets(self, in_file):
        """
        Return True if brackets match
        """
        check_brack_obj = check_brackets.CheckBrackets(file=in_file)
        return check_brack_obj.check_brackets()[0]

    def add_brackets(self):
        """
        Run the state machine over the input file, writing to a temporary
        file. The result replaces the input only when its brackets match;
        otherwise it is discarded (with a warning at run level above 0).
        Raises the bug handler if the state machine reaches a state that
        has no handler.
        """
        self.__initiate_values()
        with open_for_read(self.__file) as read_obj:
            with open_for_write(self.__write_to) as self.__write_obj:
                for line in read_obj:
                    self.__token_info = line[:16]
                    # bracket lines end with a four character number
                    if self.__token_info == 'ob<nu<open-brack':
                        self.__ob_count = line[-5:-1]
                    if self.__token_info == 'cb<nu<clos-brack':
                        self.__cb_count = line[-5:-1]
                    action = self.__state_dict.get(self.__state)
                    if action is None:
                        # Calling a missing handler would only produce an
                        # opaque TypeError; report it as a program bug.
                        msg = ('No matching state in module add_brackets.py\n'
                               '%s\n' % self.__state)
                        raise self.__bug_handler(msg)
                    action(line)
        # Check bad brackets
        if self.__check_brackets(self.__write_to):
            copy_obj = copy.Copy(bug_handler=self.__bug_handler)
            if self.__copy:
                copy_obj.copy_file(self.__write_to, "add_brackets.data")
            copy_obj.rename(self.__write_to, self.__file)
        else:
            if self.__run_level > 0:
                sys.stderr.write(
                    'Sorry, but this files has a mix of old and new RTF.\n'
                    'Some characteristics cannot be converted.\n')
        os.remove(self.__write_to)

View File

@@ -0,0 +1,84 @@
from __future__ import unicode_literals, absolute_import, print_function, division
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# #
#########################################################################
import os
from calibre.ebooks.rtf2xml import copy
from calibre.ptempfile import better_mktemp
from . import open_for_read, open_for_write
"""
Simply write the list of strings after style table
"""
class BodyStyles:
    """
    Insert the style strings collected from the body after the style
    table.
    Logic:
    Copy the file line by line; just before the line that closes the
    style table, write the given list of strings wrapped in a
    'styles-in-body' group.
    """

    def __init__(self,
                 in_file,
                 list_of_styles,
                 bug_handler,
                 copy=None,
                 run_level=1,):
        """
        Required:
            'in_file'--file to parse
            'list_of_styles' -- the strings to insert
            'bug_handler' -- exception class raised for internal errors
        Optional:
            'copy'-- whether to make a copy of result for debugging
            'run_level'-- above 3, an empty 'list_of_styles' is treated
            as a bug
        Returns:
            nothing
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__list_of_styles = list_of_styles
        self.__run_level = run_level
        self.__write_to = better_mktemp()

    def insert_info(self):
        """
        Rewrite the file with the styles inserted and replace the input
        with the result.
        Raises the bug handler when the style table is closed, there is
        nothing to insert and the run level is above 3.
        """
        # Context managers make sure both handles are closed even when
        # the bug handler is raised in the middle of the copy.
        with open_for_read(self.__file) as read_obj:
            with open_for_write(self.__write_to) as self.__write_obj:
                for line in read_obj:
                    if line == 'mi<tg<close_____<style-table\n':
                        if len(self.__list_of_styles) > 0:
                            self.__write_obj.write('mi<tg<open______<styles-in-body\n')
                            the_string = ''.join(self.__list_of_styles)
                            self.__write_obj.write(the_string)
                            self.__write_obj.write('mi<tg<close_____<styles-in-body\n')
                        else:
                            # this shouldn't happen!
                            if self.__run_level > 3:
                                msg = 'Not enough data for each table\n'
                                raise self.__bug_handler(msg)
                    self.__write_obj.write(line)
        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "body_styles.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)

View File

@@ -0,0 +1,191 @@
from __future__ import unicode_literals, absolute_import, print_function, division
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# #
#########################################################################
import sys
class BorderParse:
    """
    Parse a border line and return a dictionary of attributes and values
    """

    # Order in which line styles win when a border lists several of them:
    # (name found in the style list, value written to the '<type>-style'
    # attribute). The names are the values of the style table built in
    # __init__.
    _STYLE_PRIORITY = (
        ('shadowed-border', 'shadowed'),
        ('engrave', 'engrave'),
        ('emboss', 'emboss'),
        ('striped', 'striped'),
        ('thin-thick-thin-small', 'thin-thick-thin-small'),
        ('thick-thin-large', 'thick-thin-large'),
        ('thin-thick-thin-medium', 'thin-thick-thin-medium'),
        ('thin-thick-medium', 'thin-thick-medium'),
        ('thick-thin-medium', 'thick-thin-medium'),
        ('thick-thin-small', 'thick-thin-small'),
        ('thin-thick-small', 'thin-thick-small'),
        ('double-wavy', 'double-wavy'),
        ('dot-dot-dash', 'dot-dot-dash'),
        ('dot-dash', 'dot-dash'),
        ('dotted-border', 'dotted'),
        ('wavy', 'wavy'),
        ('dash-small', 'dash-small'),
        ('dashed', 'dashed'),
        ('frame', 'frame'),
        ('inset', 'inset'),
        ('outset', 'outset'),
        ('tripple', 'tripple'),
        ('double-border', 'double'),
        ('double-thickness-border', 'double-thickness'),
        ('hairline', 'hairline'),
        ('single', 'single'),
    )

    def __init__(self):
        # cw<bd<bor-t-r-hi<nu<true
        # border token / attribute token -> attribute name
        self.__border_dict = {
            'bor-t-r-hi': 'border-table-row-horizontal-inside',
            'bor-t-r-vi': 'border-table-row-vertical-inside',
            'bor-t-r-to': 'border-table-row-top',
            'bor-t-r-le': 'border-table-row-left',
            'bor-t-r-bo': 'border-table-row-bottom',
            'bor-t-r-ri': 'border-table-row-right',
            'bor-cel-bo': 'border-cell-bottom',
            'bor-cel-to': 'border-cell-top',
            'bor-cel-le': 'border-cell-left',
            'bor-cel-ri': 'border-cell-right',
            'bor-par-bo': 'border-paragraph-bottom',
            'bor-par-to': 'border-paragraph-top',
            'bor-par-le': 'border-paragraph-left',
            'bor-par-ri': 'border-paragraph-right',
            'bor-par-bx': 'border-paragraph-box',
            'bor-for-ev': 'border-for-every-paragraph',
            'bor-outsid': 'border-outside',
            'bor-none__': 'border',
            # border type => bt
            'bdr-li-wid': 'line-width',
            'bdr-sp-wid': 'padding',
            'bdr-color_': 'color',
        }
        # line style token -> style name
        self.__border_style_dict = {
            'bdr-single': 'single',
            'bdr-doubtb': 'double-thickness-border',
            'bdr-shadow': 'shadowed-border',
            'bdr-double': 'double-border',
            'bdr-dotted': 'dotted-border',
            'bdr-dashed': 'dashed',
            'bdr-hair__': 'hairline',
            'bdr-inset_': 'inset',
            'bdr-das-sm': 'dash-small',
            'bdr-dot-sm': 'dot-dash',
            'bdr-dot-do': 'dot-dot-dash',
            'bdr-outset': 'outset',
            'bdr-trippl': 'tripple',
            'bdr-thsm__': 'thick-thin-small',
            'bdr-htsm__': 'thin-thick-small',
            'bdr-hthsm_': 'thin-thick-thin-small',
            'bdr-thm___': 'thick-thin-medium',
            'bdr-htm___': 'thin-thick-medium',
            'bdr-hthm__': 'thin-thick-thin-medium',
            'bdr-thl___': 'thick-thin-large',
            'bdr-hthl__': 'thin-thick-thin-large',
            'bdr-wavy__': 'wavy',
            'bdr-d-wav_': 'double-wavy',
            'bdr-strip_': 'striped',
            'bdr-embos_': 'emboss',
            'bdr-engra_': 'engrave',
            'bdr-frame_': 'frame',
        }

    def parse_border(self, line):
        """
        Requires:
            line -- line with border definition in it, for example
            'cw<bd<bor-par-to<nu<bdr-single|bdr-li-wid:0.50\n'
        Returns:
            A dictionary of attribute names and values. It is empty when
            the border token is unknown, and {<border type>: 'none'} when
            the border has no attributes at all.
        Logic:
            Characters 6-15 of the line name the border; everything after
            character 20 is a '|' separated list of 'token' or
            'token:value' attributes. Line-style tokens are reduced to a
            single '<border type>-style' entry; other known tokens become
            '<border type>-<attribute>' entries. Unknown tokens are
            reported on standard error and skipped.
        """
        border_dict = {}
        border_style_list = []
        border_type = self.__border_dict.get(line[6:16])
        if not border_type:
            sys.stderr.write(
                'module is border_parse.py\n'
                'function is parse_border\n'
                'token does not have a dictionary value\n'
                'token is "%s"' % line
            )
            return border_dict
        att_line = line[20:-1]
        atts = att_line.split('|')
        # cw<bd<bor-cel-ri<nu<
        # border has no value--should be no lines
        if len(atts) == 1 and atts[0] == '':
            border_dict[border_type] = 'none'
            return border_dict
        # border-paragraph-right
        for att in atts:
            values = att.split(':')
            if len(values) == 2:
                att, value = values
            else:
                # a bare token is a flag
                value = 'true'
            style_att = self.__border_style_dict.get(att)
            if style_att:
                border_style_list.append(style_att)
                continue
            att_name = self.__border_dict.get(att)
            if not att_name:
                sys.stderr.write(
                    'module is border_parse_def.py\n'
                    'function is parse_border\n'
                    'token does not have an att value\n'
                    'line is "%s"' % line
                )
                # Skip it: storing it would create a meaningless
                # '<border type>-None' attribute.
                continue
            border_dict['%s-%s' % (border_type, att_name)] = value
        new_border_dict = self.__determine_styles(border_type, border_style_list)
        border_dict.update(new_border_dict)
        return border_dict

    def __determine_styles(self, border_type, border_style_list):
        """
        Reduce the collected style names to one '<border type>-style'
        entry: the first match in _STYLE_PRIORITY wins, and a list with
        no match falls back to its first name. Returns an empty
        dictionary for an empty list.
        """
        new_border_dict = {}
        att = '%s-style' % border_type
        for style_name, style_value in self._STYLE_PRIORITY:
            if style_name in border_style_list:
                new_border_dict[att] = style_value
                return new_border_dict
        if border_style_list:
            new_border_dict[att] = border_style_list[0]
        return new_border_dict

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,62 @@
from __future__ import unicode_literals, absolute_import, print_function, division
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# #
#########################################################################
from . import open_for_read
class CheckBrackets:
    """Check that open and close bracket tokens in a token file match up."""

    def __init__(self, bug_handler=None, file=None):
        """
        Optional:
            'bug_handler'--stored, but not used by the methods of this class
            'file'--path of the tokenized file read by check_brackets()
        """
        self.__file = file
        self.__bug_handler = bug_handler
        # Number of brackets that were opened and not yet closed.
        self.__bracket_count = 0
        self.__ob_count = 0
        self.__cb_count = 0
        # Stack holding the number of every bracket that is currently open.
        self.__open_bracket_num = []

    def open_brack(self, line):
        """
        Remember an open bracket.
        The bracket number is the four characters before the trailing
        newline, e.g. '0001' in 'ob<nu<open-brack<0001\\n'.
        """
        num = line[-5:-1]
        self.__open_bracket_num.append(num)
        self.__bracket_count += 1

    def close_brack(self, line):
        """
        Match a close bracket against the most recently opened bracket.
        Returns:
            True if the numbers agree, False if they differ or if no
            bracket is open.
        """
        num = line[-5:-1]
        try:
            last_num = self.__open_bracket_num.pop()
        except IndexError:
            # Close bracket without any open bracket left on the stack.
            return False
        if num != last_num:
            return False
        self.__bracket_count -= 1
        return True

    def check_brackets(self):
        """
        Read the file one line at a time and pair up the bracket tokens.
        Returns:
            a tuple (ok, message); ok is False as soon as a close bracket
            does not match, or when brackets remain open at the end of
            the file.
        """
        with open_for_read(self.__file) as read_obj:
            for line_count, line in enumerate(read_obj, 1):
                self.__token_info = line[:16]
                if self.__token_info == 'ob<nu<open-brack':
                    self.open_brack(line)
                if self.__token_info == 'cb<nu<clos-brack':
                    if not self.close_brack(line):
                        return (False, "closed bracket doesn't match, line %s" % line_count)
        if self.__bracket_count != 0:
            msg = ('At end of file open and closed brackets don\'t match\n'
                   'total number of brackets is %s') % self.__bracket_count
            return (False, msg)
        return (True, "Brackets match!")

View File

@@ -0,0 +1,42 @@
#!/usr/bin/env python2
from __future__ import absolute_import, division, print_function, unicode_literals
import sys
from polyglot.builtins import unicode_type
class CheckEncoding:
    """Test whether a file can be decoded with a given encoding."""

    def __init__(self, bug_handler):
        """
        Required:
            'bug_handler'--stored, but not used by the methods of this class
        """
        self.__bug_handler = bug_handler

    def __get_position_error(self, line, encoding, line_num):
        """
        Write to stderr the position (1-based) of every byte of `line`
        that cannot be decoded on its own with `encoding`.
        """
        for char_position in range(1, len(line) + 1):
            # Slice instead of iterating: iterating a bytes object yields
            # ints on Python 3, and ints have no decode() method.
            char = line[char_position - 1:char_position]
            try:
                char.decode(encoding)
            except ValueError as msg:
                # %s converts the exception to text itself.
                sys.stderr.write('line: %s char: %s\n%s\n' % (line_num, char_position, msg))

    def check_encoding(self, path, encoding='us-ascii', verbose=True):
        """
        Requires:
            path--file to check (read in binary mode)
        Optional:
            encoding--codec name to try on each line
            verbose--report the first offending line on stderr
        Returns:
            True if a line can NOT be decoded (bad encoding), False if the
            whole file decodes cleanly. Checking stops at the first bad line.
        """
        with open(path, 'rb') as read_obj:
            for line_num, line in enumerate(read_obj, 1):
                try:
                    line.decode(encoding)
                except ValueError:
                    if verbose:
                        # Per-byte reporting only for reasonably short lines.
                        if len(line) < 1000:
                            self.__get_position_error(line, encoding, line_num)
                        else:
                            sys.stderr.write('line: %d has bad encoding\n' % line_num)
                    return True
        return False
if __name__ == '__main__':
    # CheckEncoding requires a bug_handler argument; calling it without one
    # raises TypeError before anything is checked.
    check_encoding_obj = CheckEncoding(bug_handler=Exception)
    check_encoding_obj.check_encoding(sys.argv[1])

View File

@@ -0,0 +1,258 @@
from __future__ import unicode_literals, absolute_import, print_function, division
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# #
#########################################################################
import sys, os, re
from calibre.ebooks.rtf2xml import copy
from calibre.ptempfile import better_mktemp
from . import open_for_read, open_for_write
class Colors:
    """
    Change lines with color info from color numbers to the actual color names.
    """

    def __init__(self,
            in_file,
            bug_handler,
            copy=None,
            run_level=1
            ):
        """
        Required:
            'in_file'--file to parse
            'bug_handler'--exception class raised for internal errors
        Optional:
            'copy'-- whether to make a copy of result for debugging
            'run_level'--errors are raised only when this is greater than 3
        Returns:
            nothing
        """
        self.__file = in_file
        self.__copy = copy
        self.__bug_handler = bug_handler
        self.__line = 0
        self.__write_to = better_mktemp()
        self.__run_level = run_level

    def __initiate_values(self):
        """
        Initiate all values.
        """
        self.__color_dict = {}
        self.__state = 'before_color_table'
        # Holds both the three parser states and the tokens that are
        # handled while inside the color table.
        self.__state_dict = {
            'before_color_table': self.__before_color_func,
            'in_color_table'    : self.__in_color_func,
            'after_color_table' : self.__after_color_func,
            'cw<ci<red_______'  : self.__default_color_func,
            'cw<ci<green_____'  : self.__default_color_func,
            'cw<ci<blue______'  : self.__blue_func,
            'tx<nu<__________'  : self.__do_nothing_func,
        }
        self.__color_string = '#'
        self.__color_num = 1
        # cw<bd<bor-par-to<nu<bdr-hair__|bdr-li-wid:0.50|bdr-sp-wid:1.00|bdr-color_:2
        self.__line_color_exp = re.compile(r'bdr-color_:(\d+)')

    def __before_color_func(self, line):
        """
        Requires:
            line
        Returns:
            nothing
        Logic:
            Check to see if the line marks the beginning of the color table.
            If so, change states.
            Always print out the line.
        """
        # mi<mk<clrtbl-beg
        if self.__token_info == 'mi<mk<clrtbl-beg':
            self.__state = 'in_color_table'
        self.__write_obj.write(line)

    def __default_color_func(self, line):
        """
        Requires:
            line
        Returns:
            nothing
        Logic:
            Get the hex number (the two characters before the newline) from
            the line and add it to the color string.
        """
        hex_num = line[-3:-1]
        self.__color_string += hex_num

    def __blue_func(self, line):
        """
        Requires:
            line
        Returns:
            nothing
        Logic:
            Get the hex number from the line and add it to the color string.
            Add a key -> value pair to the color dictionary, with the number
            as the key, and the hex number as the value. Write an empty tag
            with the hex number and number as attributes. Add one to the color
            number. Reset the color string to '#'
        """
        hex_num = line[-3:-1]
        self.__color_string += hex_num
        self.__color_dict[self.__color_num] = self.__color_string
        self.__write_obj.write(
            'mi<tg<empty-att_'
            '<color-in-table<num>%s<value>%s\n' % (self.__color_num, self.__color_string)
        )
        self.__color_num += 1
        self.__color_string = '#'

    def __in_color_func(self, line):
        """
        Requires:
            line
        Returns:
            nothing
        Logic:
            Check if the end of the color table has been reached. If so,
            change the state to after the color table.
            Otherwise, get a function by passing the self.__token_info to the
            state dictionary. A token without an entry is reported on stderr
            and skipped (at run levels above 3 it is raised as a bug), rather
            than calling None.
        """
        # mi<mk<clrtbl-beg
        # cw<ci<red_______<nu<00
        if self.__token_info == 'mi<mk<clrtbl-end':
            self.__state = 'after_color_table'
            return
        action = self.__state_dict.get(self.__token_info)
        if action is None:
            msg = ('in module colors.py\n'
                   'function is self.__in_color_func\n'
                   'no action for %s' % self.__token_info)
            sys.stderr.write(msg)
            if self.__run_level > 3:
                raise self.__bug_handler(msg)
            return
        action(line)

    def __after_color_func(self, line):
        """
        Check to see if the line contains color info. If it does, extract the
        number and replace it with the value from the color dictionary (see
        __figure_num for numbers that have no entry).
        Otherwise, print out the line.
        Added Oct 10, 2003
            If the number is 0, that indicates no color
        """
        # cw<ci<font-color<nu<2
        if self.__token_info == 'cw<ci<font-color':
            hex_num = self.__figure_num(int(line[20:-1]))
            if hex_num:
                self.__write_obj.write(
                    'cw<ci<font-color<nu<%s\n' % hex_num
                )
        elif line[0:5] == 'cw<bd':
            # cw<bd<bor-par-to<nu<bdr-hair__|bdr-li-wid:0.50|bdr-sp-wid:1.00|bdr-color_:2
            if line.find('bdr-color_') > -1:
                line = self.__line_color_exp.sub(self.__sub_from_line_color, line)
            self.__write_obj.write(line)
        else:
            self.__write_obj.write(line)

    def __sub_from_line_color(self, match_obj):
        """
        Replacement callback for the border color expression: return the
        'bdr-color_:' attribute with the color number replaced by the value
        from the color dictionary.
        """
        num = match_obj.group(1)
        try:
            num = int(num)
        except ValueError:
            if self.__run_level > 3:
                msg = 'can\'t make integer from string\n'
                raise self.__bug_handler(msg)
            else:
                return 'bdr-color_:no-value'
        hex_num = self.__figure_num(num)
        return 'bdr-color_:%s' % hex_num

    def __figure_num(self, num):
        """
        Requires:
            num--color number (integer)
        Returns:
            'false' for 0, the value stored in the color dictionary for a
            known number, and '0' for an unknown number (raised as a bug
            instead when the run level is above 3).
        """
        if num == 0:
            hex_num = 'false'
        else:
            hex_num = self.__color_dict.get(num)
        if hex_num is None:
            hex_num = '0'
            if self.__run_level > 3:
                msg = 'no value in self.__color_dict' \
                    'for key %s at line %d\n' % (num, self.__line)
                raise self.__bug_handler(msg)
        return hex_num

    def __do_nothing_func(self, line):
        """
        Bad RTF will have text in the color table
        """
        pass

    def convert_colors(self):
        """
        Requires:
            nothing
        Returns:
            nothing (changes the original file)
        Logic:
            Read one line in at a time. Determine what action to take based on
            the state. If the state is before the color table, look for the
            beginning of the color table.
            If the state is in the color table, create the color dictionary
            and print out the tags.
            If the state is after the color table, look for lines with color
            info, and substitute the number with the hex number.
        """
        self.__initiate_values()
        with open_for_read(self.__file) as read_obj:
            with open_for_write(self.__write_to) as self.__write_obj:
                for line in read_obj:
                    self.__line += 1
                    self.__token_info = line[:16]
                    action = self.__state_dict.get(self.__state)
                    if action is None:
                        # Only reachable if self.__state was set to a name
                        # missing from the dictionary: report it as a bug
                        # instead of calling None.
                        msg = ('no matching state in module colors.py\n'
                               + self.__state + '\n')
                        raise self.__bug_handler(msg)
                    action(line)
        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "color.data")
        # Copy the result over the input file, then drop the temp file.
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)

View File

@@ -0,0 +1,93 @@
from __future__ import unicode_literals, absolute_import, print_function, division
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# #
#########################################################################
import os
from calibre.ebooks.rtf2xml import copy
from calibre.ptempfile import better_mktemp
from . import open_for_read, open_for_write
class CombineBorders:
    """Merge each border token and its attribute tokens into a single line."""

    def __init__(self,
            in_file,
            bug_handler,
            copy=None,
            run_level=1,
            ):
        """
        Required:
            'in_file'--file to parse
            'bug_handler'--passed on to the copy helper
        Optional:
            'copy'-- whether to make a copy of result for debugging
            'run_level'--accepted but not used by this class
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__write_to = better_mktemp()
        self.__state = 'default'
        self.__bord_pos = 'default'
        self.__bord_att = []

    def found_bd(self, line):
        """Enter the border state and remember the border position."""
        # cw<bd<bor-t-r-vi
        self.__bord_pos = line[6:16]
        self.__state = 'border'

    def __default_func(self, line):
        """
        Return the line unchanged, unless it starts a border; in that case
        enter the border state and return an empty string.
        """
        # cw<bd<bor-t-r-vi
        if self.__first_five != 'cw<bd':
            return line
        self.found_bd(line)
        return ''

    def end_border(self, line, write_obj):
        """
        Write the combined border line, then handle `line`: start a new
        border if it is a border token, otherwise write it out.
        """
        attributes = "|".join(self.__bord_att)
        self.__bord_att = []
        combined = 'cw<bd<%s<nu<%s\n' % (self.__bord_pos, attributes)
        write_obj.write(combined)
        self.__state = 'default'
        self.__bord_string = ''
        starts_border = self.__first_five == 'cw<bd'
        if not starts_border:
            write_obj.write(line)
        else:
            self.found_bd(line)

    def add_to_border_desc(self, line):
        """
        Append one attribute to the current border: the bare name when the
        value is 'true', otherwise 'name:value'.
        """
        # cw<bt<bdr-hair__<nu<true
        # cw<bt<bdr-linew<nu<0.50
        # tx<__________<some text
        name = line[6:16]
        value = line[20:-1]
        suffix = '' if value == 'true' else ':' + value
        self.__bord_att.append(name + suffix)

    def __border_func(self, line, write_obj):
        """Collect attribute tokens; any other token ends the border."""
        if self.__first_five == 'cw<bt':
            self.add_to_border_desc(line)
        else:
            self.end_border(line, write_obj)

    def combine_borders(self):
        """
        Read the file line by line, write the combined result to a
        temporary file, and copy that back over the original file.
        """
        with open_for_read(self.__file) as read_obj:
            with open_for_write(self.__write_to) as write_obj:
                for line in read_obj:
                    self.__first_five = line[0:5]
                    if self.__state != 'border':
                        write_obj.write(self.__default_func(line))
                    else:
                        self.__border_func(line, write_obj)
        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "combine_borders.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)

View File

@@ -0,0 +1,284 @@
from __future__ import unicode_literals, absolute_import, print_function, division
import os, sys
from calibre.ebooks.rtf2xml import copy, check_encoding
from calibre.ptempfile import better_mktemp
from . import open_for_read, open_for_write
public_dtd = 'rtf2xml1.0.dtd'
class ConvertToTags:
    """
    Convert file to XML
    """

    def __init__(self,
            in_file,
            bug_handler,
            dtd_path,
            no_dtd,
            encoding,
            indent=None,
            copy=None,
            run_level=1,
            ):
        """
        Required:
            'in_file'--file to convert
            'bug_handler'--exception class raised for internal errors
            'dtd_path'--system DTD to reference in the DOCTYPE
            'no_dtd'--if true, write no DOCTYPE at all
            'encoding'--code page number; 'cp' is prepended to it
        Optional:
            'indent'--write new lines around block elements
            'copy'-- whether to make a copy of result for debugging
            'run_level'--errors are raised only when this is greater than 3
        Returns:
            nothing
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__dtd_path = dtd_path
        self.__no_dtd = no_dtd
        self.__encoding = 'cp' + encoding
        self.__indent = indent
        self.__run_level = run_level
        self.__write_to = better_mktemp()
        self.__convert_utf = False
        self.__bad_encoding = False

    def __initiate_values(self):
        """
        Set values, including those for the dictionary.
        """
        self.__state = 'default'
        self.__new_line = 0
        # Elements that get a new line written around their tags.
        self.__block = ('doc', 'preamble', 'rtf-definition', 'font-table',
                'font-in-table', 'color-table', 'color-in-table', 'style-sheet',
                'paragraph-styles', 'paragraph-style-in-table', 'character-styles',
                'character-style-in-table', 'list-table', 'doc-information', 'title',
                'author', 'operator', 'creation-time', 'revision-time',
                'editing-time', 'time', 'number-of-pages', 'number-of-words',
                'number-of-characters', 'page-definition', 'section-definition',
                'headers-and-footers', 'section', 'para', 'body',
                'paragraph-definition', 'cell', 'row', 'table', 'revision-table',
                'style-group', 'border-group', 'styles-in-body', 'paragraph-style-in-body',
                'list-in-table', 'level-in-table', 'override-table', 'override-list',
                )
        # Elements that get a second new line. The comma after 'row' matters:
        # without it Python joins the two literals into 'rowlist-table' and
        # neither element is matched.
        self.__two_new_line = ('section', 'body', 'table', 'row', 'list-table')
        self.__state_dict = {
            'default'           : self.__default_func,
            'mi<tg<open______'  : self.__open_func,
            'mi<tg<close_____'  : self.__close_func,
            'mi<tg<open-att__'  : self.__open_att_func,
            'mi<tg<empty-att_'  : self.__empty_att_func,
            'tx<nu<__________'  : self.__text_func,
            'tx<ut<__________'  : self.__text_func,
            'mi<tg<empty_____'  : self.__empty_func,
        }

    def __write_block_new_lines(self, element_name):
        """
        Reset the new line counter and write the new lines that belong to
        a block element (one, or two for the elements in __two_new_line).
        """
        self.__new_line = 0
        if element_name in self.__block:
            self.__write_new_line()
        if element_name in self.__two_new_line:
            self.__write_extra_new_line()

    def __write_attributes(self, tokens):
        """
        Write each 'name>value' token as ' name="value"'.
        A token without a '>' separator is skipped; when the run level is
        above 3 it is raised as a bug instead.
        """
        for token in tokens:
            groups = token.split('>')
            try:
                name, value = groups[0], groups[1]
            except IndexError:
                if self.__run_level > 3:
                    msg = 'index out of range\n'
                    raise self.__bug_handler(msg)
                continue
            value = value.replace('"', '&quot;')
            # NOTE(review): apostrophes are written as &quot; as well, which
            # is what this code has always done -- confirm whether &apos;
            # was intended before changing it.
            value = value.replace("'", '&quot;')
            self.__write_obj.write(
                ' %s="%s"' % (name, value)
            )

    def __open_func(self, line):
        """
        Print the opening tag and newlines when needed.
        """
        # mi<tg<open______<style-sheet
        info = line[17:-1]
        # For an opening tag the new lines come before the tag.
        self.__write_block_new_lines(info)
        self.__write_obj.write('<%s>' % info)

    def __empty_func(self, line):
        """
        Print out empty tag and newlines when needed.
        """
        info = line[17:-1]
        self.__write_obj.write(
            '<%s/>' % info)
        self.__write_block_new_lines(info)

    def __open_att_func(self, line):
        """
        Process lines for open tags that have attributes.
        The important info is between [17:-1]. Take this info and split it
        with the delimiter '<'. The first token in this group is the element
        name. The rest are attributes, separated from their values by '>'. So
        read each token one at a time, and split them by '>'.
        """
        # mi<tg<open-att__<footnote<num>
        info = line[17:-1]
        tokens = info.split("<")
        element_name = tokens[0]
        self.__write_obj.write('<%s' % element_name)
        self.__write_attributes(tokens[1:])
        self.__write_obj.write('>')
        self.__write_block_new_lines(element_name)

    def __empty_att_func(self, line):
        """
        Same as the __open_att_func, except a '/' is placed at the end of the tag.
        """
        # mi<tg<empty-att_<color-in-table<num>1<value>#000000
        info = line[17:-1]
        tokens = info.split("<")
        element_name = tokens[0]
        self.__write_obj.write('<%s' % element_name)
        self.__write_attributes(tokens[1:])
        self.__write_obj.write('/>')
        self.__write_block_new_lines(element_name)

    def __close_func(self, line):
        """
        Print out the closed tag and new lines, if appropriate.
        """
        # mi<tg<close_____<style-sheet\n
        info = line[17:-1]
        self.__write_obj.write(
            '</%s>' % info)
        self.__write_block_new_lines(info)

    def __text_func(self, line):
        """
        Simply print out the information between [17:-1]
        """
        # tx<nu<__________<Normal;
        self.__write_obj.write(line[17:-1])

    def __write_extra_new_line(self):
        """
        Print out an extra new line if fewer than two new lines have been
        written. Does nothing when indenting is off.
        """
        if not self.__indent:
            return
        if self.__new_line < 2:
            self.__write_obj.write('\n')

    def __default_func(self, line):
        """Do nothing."""
        pass

    def __write_new_line(self):
        """
        Print out a new line if a new line has not already been printed out.
        Does nothing when indenting is off.
        """
        if not self.__indent:
            return
        if not self.__new_line:
            self.__write_obj.write('\n')
            self.__new_line += 1

    def __write_dec(self):
        """
        Write the XML declaration at the top of the document, followed by
        the DOCTYPE when one is wanted.
        """
        # keep maximum compatibility with previous version
        check_encoding_obj = check_encoding.CheckEncoding(
                bug_handler=self.__bug_handler)
        # check_encoding() returns True when the file can NOT be decoded.
        if not check_encoding_obj.check_encoding(self.__file, verbose=False):
            self.__write_obj.write('<?xml version="1.0" encoding="US-ASCII" ?>')
        elif not check_encoding_obj.check_encoding(self.__file, self.__encoding, verbose=False):
            self.__write_obj.write('<?xml version="1.0" encoding="UTF-8" ?>')
            self.__convert_utf = True
        else:
            self.__write_obj.write('<?xml version="1.0" encoding="US-ASCII" ?>')
            sys.stderr.write('Bad RTF encoding, revert to US-ASCII chars and'
                    ' hope for the best')
            self.__bad_encoding = True
        self.__new_line = 0
        self.__write_new_line()
        if self.__no_dtd:
            pass
        elif self.__dtd_path:
            self.__write_obj.write(
                '<!DOCTYPE doc SYSTEM "%s">' % self.__dtd_path
            )
        elif self.__dtd_path == '':
            # don't print dtd if further transformations are going to take
            # place
            pass
        else:
            self.__write_obj.write(
                '<!DOCTYPE doc PUBLIC "publicID" '
                '"http://rtf2xml.sourceforge.net/dtd/%s">' % public_dtd
            )
        self.__new_line = 0
        self.__write_new_line()

    def convert_to_tags(self):
        """
        Read in the file one line at a time. Get the important info, between
        [:16]. Check if this info matches a dictionary entry. If it does, call
        the appropriate function.
        The functions that are called:
            a text function for text
            an open function for open tags
            an open with attribute function for tags with attributes
            an empty with attribute function for tags that are empty but have
            attributes.
            a closed function for closed tags.
            an empty tag function.
        Lines whose token has no dictionary entry are dropped.
        """
        self.__initiate_values()
        with open_for_write(self.__write_to) as self.__write_obj:
            self.__write_dec()
            with open_for_read(self.__file) as read_obj:
                for line in read_obj:
                    self.__token_info = line[:16]
                    action = self.__state_dict.get(self.__token_info)
                    if action is not None:
                        action(line)
        # convert all encodings to UTF8 or ASCII to avoid unsupported encodings in lxml
        if self.__convert_utf or self.__bad_encoding:
            copy_obj = copy.Copy(bug_handler=self.__bug_handler)
            copy_obj.rename(self.__write_to, self.__file)
            # Pass the result once more through open_for_read/open_for_write.
            with open_for_read(self.__file) as read_obj:
                with open_for_write(self.__write_to) as write_obj:
                    for line in read_obj:
                        write_obj.write(line)
        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "convert_to_tags.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)

View File

@@ -0,0 +1,63 @@
from __future__ import unicode_literals, absolute_import, print_function, division
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# #
#########################################################################
import os, shutil
class Copy:
    """Copy each changed file to a directory for debugging purposes"""
    # Debug directory; stored on the class, so it is shared by all instances.
    __dir = ""

    def __init__(self, bug_handler, file=None, deb_dir=None, ):
        """
        Required:
            'bug_handler'--exception class raised by set_dir()
        Optional:
            'file'--stored, but not used by the methods of this class
            'deb_dir'--accepted but ignored; use set_dir() instead
        """
        self.__file = file
        self.__bug_handler = bug_handler

    def set_dir(self, deb_dir):
        """
        Set the directory to write debugging copies to.
        Raises the bug handler if deb_dir is None or is not a directory.
        """
        if deb_dir is None:
            message = "No directory has been provided to write to in the copy.py"
            raise self.__bug_handler(message)
        if not os.path.isdir(deb_dir):
            message = "%s is not a directory" % (deb_dir,)
            raise self.__bug_handler(message)
        Copy.__dir = deb_dir

    def remove_files(self):
        """Remove all files below the debug directory"""
        self.__remove_the_files(Copy.__dir)

    def __remove_the_files(self, the_dir):
        """
        Remove the files in the_dir and, recursively, in its
        sub-directories. The directories themselves are kept, and files
        that cannot be removed are skipped.
        """
        for name in os.listdir(the_dir):
            # Join with the directory being listed, not with the top-level
            # debug directory, so that nested files are really found.
            rem_file = os.path.join(the_dir, name)
            if os.path.isdir(rem_file):
                self.__remove_the_files(rem_file)
            else:
                try:
                    os.remove(rem_file)
                except OSError:
                    pass

    def copy_file(self, file, new_file):
        """
        Copy `file` into the debug directory under the name `new_file`.
        """
        write_file = os.path.join(Copy.__dir, new_file)
        shutil.copyfile(file, write_file)

    def rename(self, source, dest):
        """
        Copy the contents of `source` over `dest`. Despite the name,
        `source` is left in place.
        """
        shutil.copyfile(source, dest)

View File

@@ -0,0 +1,188 @@
from __future__ import unicode_literals, absolute_import, print_function, division
#########################################################################
# #
# copyright 2002 Paul Henry Tremblay #
# #
#########################################################################
'''
Code pages according to the RTF 1.9.1 specification:
437 United States IBM
708 Arabic (ASMO 708)
709 Arabic (ASMO 449+, BCON V4)
710 Arabic (transparent Arabic)
711 Arabic (Nafitha Enhanced)
720 Arabic (transparent ASMO)
819 Windows 3.1 (United States and Western Europe)
850 IBM multilingual
852 Eastern European
860 Portuguese
862 Hebrew
863 French Canadian
864 Arabic
865 Norwegian
866 Soviet Union
874 Thai
932 Japanese
936 Simplified Chinese
949 Korean
950 Traditional Chinese
1250 Eastern European
1251 Cyrillic
1252 Western European
1253 Greek
1254 Turkish
1255 Hebrew
1256 Arabic
1257 Baltic
1258 Vietnamese
1361 Johab
10000 MAC Roman
10001 MAC Japan
10004 MAC Arabic
10005 MAC Hebrew
10006 MAC Greek
10007 MAC Cyrillic
10029 MAC Latin2
10081 MAC Turkish
57002 Devanagari
57003 Bengali
57004 Tamil
57005 Telugu
57006 Assamese
57007 Oriya
57008 Kannada
57009 Malayalam
57010 Gujarati
57011 Punjabi
'''
import re
from . import open_for_read
class DefaultEncoding:
    """
    Find the default encoding for the doc
    """

    # Note: not all those encoding are really supported by rtf2xml
    # See http://msdn.microsoft.com/en-us/library/windows/desktop/dd317756%28v=vs.85%29.aspx
    # and src\calibre\gui2\widgets.py for the input list in calibre
    # Maps an encoding name to a code page number (kept as a string).
    ENCODINGS = {
        # Special cases
        'cp1252': '1252',
        'utf-8': '1252',
        'ascii': '1252',
        # Normal cases
        'big5': '950',
        'cp1250': '1250',
        'cp1251': '1251',
        'cp1253': '1253',
        'cp1254': '1254',
        'cp1255': '1255',
        'cp1256': '1256',
        'shift_jis': '932',
        'gb2312': '936',
        # Not in RTF 1.9.1 codepage specification
        'hz': '52936',
        'iso8859_5': '28595',
        'iso2022_jp': '50222',
        'iso2022_kr': '50225',
        'euc_jp': '51932',
        'euc_kr': '51949',
        'gb18030': '54936',
    }

    def __init__(self, in_file, bug_handler, default_encoding, run_level=1, check_raw=False):
        """
        Required:
            'in_file'--file to scan
            'bug_handler'--stored, but not used by the methods of this class
            'default_encoding'--encoding name looked up in ENCODINGS; unknown
                names fall back to code page 1252
        Optional:
            'run_level'--accepted but not used by this class
            'check_raw'--scan raw RTF text instead of the tokenized file
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__platform = 'Windows'
        self.__default_num = 'not-defined'
        self.__code_page = self.ENCODINGS.get(default_encoding, '1252')
        self.__datafetched = False
        self.__fetchraw = check_raw

    def __fetch(self):
        """Scan the file the first time a value is asked for."""
        if not self.__datafetched:
            self._encoding()
            self.__datafetched = True

    def find_default_encoding(self):
        """
        Returns:
            a tuple (platform, 'ansicpg' + code page, default font number)
        """
        self.__fetch()
        code_page = 'ansicpg' + self.__code_page
        return self.__platform, code_page, self.__default_num

    def get_codepage(self):
        """Return the code page number as a string."""
        self.__fetch()
        return self.__code_page

    def get_platform(self):
        """Return the platform name ('Windows' unless the file says otherwise)."""
        self.__fetch()
        return self.__platform

    def __scan_tokens(self, lines):
        """
        Read platform, code page and default font number from tokenized
        lines, stopping at the end of the RTF header.
        """
        cpfound = False
        for line in lines:
            self.__token_info = line[:16]
            if self.__token_info == 'mi<mk<rtfhed-end':
                break
            if self.__token_info == 'cw<ri<macintosh_':
                self.__platform = 'Macintosh'
            elif self.__token_info == 'cw<ri<pc________':
                self.__platform = 'IBMPC'
            elif self.__token_info == 'cw<ri<pca_______':
                self.__platform = 'OS/2'
            # A code page of 0 is ignored and the default is kept.
            if self.__token_info == 'cw<ri<ansi-codpg' \
                    and int(line[20:-1]):
                self.__code_page = line[20:-1]
            if self.__token_info == 'cw<ri<deflt-font':
                # cw<ri<deflt-font<nu<0
                self.__default_num = line[20:-1]
                # NOTE(review): the flag is set when the default font token
                # is seen, not the code page token -- kept as it was;
                # confirm which one is meant.
                cpfound = True
        if self.__platform != 'Windows' and not cpfound:
            if self.__platform == 'Macintosh':
                self.__code_page = '10000'
            elif self.__platform == 'IBMPC':
                self.__code_page = '437'
            elif self.__platform == 'OS/2':
                self.__code_page = '850'

    def __scan_raw(self, lines):
        """
        Read the code page from raw RTF text. Scanning stops at the first
        line that contains an \\ansicpg control word.
        """
        fenc = re.compile(r'\\(mac|pc|ansi|pca)[\\ \{\}\t\n]+')
        fenccp = re.compile(r'\\ansicpg(\d+)[\\ \{\}\t\n]+')
        # Pre-set so the fallback below never reads an unbound name.
        enc = None
        cpfound = False
        for line in lines:
            enc_match = fenc.search(line)
            if enc_match:
                enc = enc_match.group(1)
            cp_match = fenccp.search(line)
            if cp_match:
                cp = cp_match.group(1)
                # Accept the code page only when it is non-zero, the same
                # rule the tokenized scan applies. (The test used to be
                # inverted, so a real code page was never taken over.)
                if int(cp):
                    self.__code_page = cp
                    cpfound = True
                break
        # NOTE(review): nothing in this raw scan changes self.__platform, so
        # with the initial value 'Windows' this fallback does not run --
        # confirm whether the platform should be derived from `enc`.
        if self.__platform != 'Windows' and not cpfound:
            if enc == 'mac':
                self.__code_page = '10000'
            elif enc == 'pc':
                self.__code_page = '437'
            elif enc == 'pca':
                self.__code_page = '850'

    def _encoding(self):
        """Open the file and scan it as raw RTF or as tokens."""
        with open_for_read(self.__file) as read_obj:
            if self.__fetchraw:
                self.__scan_raw(read_obj)
            else:
                self.__scan_tokens(read_obj)
if __name__ == '__main__':
    # Command line use: print the code page found in a raw RTF file.
    #   argv[1] -- path of the file to scan
    #   argv[2] -- name of the fallback encoding
    import sys
    options = dict(
        in_file=sys.argv[1],
        default_encoding=sys.argv[2],
        bug_handler=Exception,
        check_raw=True,
    )
    encode_obj = DefaultEncoding(**options)
    print(encode_obj.get_codepage())

View File

@@ -0,0 +1,212 @@
from __future__ import unicode_literals, absolute_import, print_function, division
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# #
#########################################################################
import sys, os
from calibre.ebooks.rtf2xml import copy
from calibre.ptempfile import better_mktemp
from . import open_for_read, open_for_write
class DeleteInfo:
    """Delete unnecessary destination groups"""

    def __init__(self,
            in_file,
            bug_handler,
            copy=None,
            run_level=1,
            ):
        """
        Required:
            'in_file'--file to parse
            'bug_handler'--exception class raised for internal errors
        Optional:
            'copy'-- whether to make a copy of result for debugging
            'run_level'--controls which problems are raised as bugs
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__write_to = better_mktemp()
        self.__run_level = run_level
        self.__initiate_allow()
        self.__bracket_count = 0
        # Number of the most recent open / close bracket read.
        self.__ob_count = 0
        self.__cb_count = 0
        # The open bracket line that is held back until the next token
        # shows whether its group is kept; 0 when nothing is held back.
        self.__ob = 0
        # True when the close bracket ending the current group must be
        # written because its open bracket was already written.
        self.__write_cb = False
        self.__found_delete = False

    def __initiate_allow(self):
        """
        Initiate a list of destination groups which should be printed out.
        """
        self.__allowable = ('cw<ss<char-style',
                            'cw<it<listtable_',
                            'cw<it<revi-table',
                            'cw<ls<list-lev-d',
                            # Field allowed
                            'cw<fd<field-inst',
                            'cw<an<book-mk-st',
                            'cw<an<book-mk-en',
                            'cw<an<annotation',
                            'cw<cm<comment___',
                            'cw<it<lovr-table',
                            # info table
                            'cw<di<company___',
                            )
        self.__not_allowable = (
                'cw<un<unknown___',
                'cw<un<company___',
                'cw<ls<list-level',
                'cw<fd<datafield_',
                )
        self.__state = 'default'
        self.__state_dict = {
            'default'           : self.__default_func,
            'after_asterisk'    : self.__asterisk_func,
            'delete'            : self.__delete_func,
            'list'              : self.__list_func,
        }

    def __default_func(self, line):
        """Handle lines when in no special state. Look for an asterisk to
        begin a special state. Otherwise, print out line."""
        # cw<ml<asterisk__<nu<true
        if self.__token_info == 'cw<ml<asterisk__':
            self.__state = 'after_asterisk'
            self.__delete_count = self.__ob_count
            # The asterisk itself is never written.
            return False
        if self.__token_info == 'ob<nu<open-brack':
            # write previous bracket, if exists
            if self.__ob:
                self.__write_obj.write(self.__ob)
            # hold this bracket back until the next token is known
            self.__ob = line
            return False
        # write previous bracket, since didn't find asterisk
        if self.__ob:
            self.__write_obj.write(self.__ob)
            self.__ob = 0
        return True

    def __delete_func(self, line):
        """Handle lines when in delete state. Don't print out lines
        unless the state has ended."""
        if self.__delete_count == self.__cb_count:
            self.__state = 'default'
            if self.__write_cb:
                # Reset the flag, as __list_func does; it used to be set
                # to True again here, so once raised it stayed raised for
                # every later group.
                self.__write_cb = False
                return True
        return False

    def __asterisk_func(self, line):
        """
        Determine whether to delete info in group
        Note on self.__write_cb flag.
        If you find that you are in a delete group, and the previous
        token is not an open bracket (self.__ob = 0), that means
        that the delete group is nested inside another acceptable
        destination group. In this case, you have already written
        the open bracket, so you will need to write the closed one
        as well.
        """
        # Test for {\*}, in which case don't enter
        # delete state
        self.__found_delete = True
        if self.__token_info == 'cb<nu<clos-brack':
            if self.__delete_count == self.__cb_count:
                self.__state = 'default'
                self.__ob = 0
                # changed this because haven't printed out start
                return False
            # not sure what happens here!
            # believe I have a '{\*}
            if self.__run_level > 3:
                msg = 'Flag problem\n'
                raise self.__bug_handler(msg)
            return True
        if self.__token_info in self.__allowable:
            if self.__ob:
                self.__write_obj.write(self.__ob)
                self.__ob = 0
                self.__state = 'default'
            return True
        if self.__token_info == 'cw<ls<list______':
            self.__ob = 0
            self.__found_list_func(line)
            return False
        if self.__token_info not in self.__not_allowable:
            if self.__run_level > 5:
                msg = ('After an asterisk, and found neither an allowable or non-allowable token\n'
                       'token is "%s"\n') % self.__token_info
                raise self.__bug_handler(msg)
        # Not allowable (or unrecognised): delete the group.
        if not self.__ob:
            self.__write_cb = True
        self.__ob = 0
        self.__state = 'delete'
        self.__cb_count = 0
        return False

    def __found_list_func(self, line):
        """
        print out control words in this group
        """
        self.__state = 'list'

    def __list_func(self, line):
        """
        Check to see if the group has ended.
        Return True for all control words.
        Return False otherwise.
        """
        if self.__delete_count == self.__cb_count and \
                self.__token_info == 'cb<nu<clos-brack':
            self.__state = 'default'
            if self.__write_cb:
                self.__write_cb = False
                return True
            return False
        return line[0:2] == 'cw'

    def delete_info(self):
        """Main method for handling other methods. Read one line at
        a time, and determine whether to print the line based on the state.
        Returns True if an asterisk (destination group) was found."""
        with open_for_read(self.__file) as read_obj:
            with open_for_write(self.__write_to) as self.__write_obj:
                for line in read_obj:
                    # ob<nu<open-brack<0001
                    self.__token_info = line[:16]
                    if self.__token_info == 'ob<nu<open-brack':
                        self.__ob_count = line[-5:-1]
                    if self.__token_info == 'cb<nu<clos-brack':
                        self.__cb_count = line[-5:-1]
                    # Get action to perform
                    action = self.__state_dict.get(self.__state)
                    if not action:
                        # Only reachable if self.__state was set to a name
                        # missing from the dictionary: report it as a bug
                        # instead of calling None.
                        msg = ('No action in dictionary state is "%s" \n'
                               % self.__state)
                        sys.stderr.write(msg)
                        raise self.__bug_handler(msg)
                    # Print if allowed by action
                    if action(line):
                        self.__write_obj.write(line)
        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "delete_info.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)
        return self.__found_delete

View File

@@ -0,0 +1,816 @@
from __future__ import unicode_literals, absolute_import, print_function, division
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# #
#########################################################################
import sys, re
class FieldStrings:
    """
    Parse the instruction string of an RTF field.

    The class is given the tokenized text of a field instruction and returns
    a list of three values describing the field (see ``process_string``).
    """

    def __init__(self, bug_handler, run_level=1):
        """
        Requires:
            bug_handler -- exception class raised for an unknown field when
                run_level is greater than 3
            run_level -- strictness level (default 1)
        Returns:
            nothing
        """
        self.__run_level = run_level
        self.__bug_handler = bug_handler
        self.__initiate_values()

    def __initiate_values(self):
        """
        Requires:
            nothing.
        Returns:
            nothing.
        Logic:
            Initiate values for the rest of the class.
            self.__field_instruction_dict:
                Maps every known field name to a (handler, readable name)
                pair.
            self.__number_dict / self.__text_format_dict:
                Map the argument of a "\\*" switch to a readable value.
            The remaining attributes are pre-compiled regular expressions
            for the individual switches.
        """
        self.__field_instruction_dict = {
            # number type (arabic, etc.) and number format (\# " ")
            'EDITTIME': (self.__num_type_and_format_func, 'editing-time'),
            'NUMCHARS': (self.__num_type_and_format_func, 'number-of-characters-in-doc'),
            'NUMPAGES': (self.__num_type_and_format_func, 'number-of-pages-in-doc'),
            'NUMWORDS': (self.__num_type_and_format_func, 'number-of-words-in-doc'),
            'REVNUM': (self.__num_type_and_format_func, 'revision-number'),
            'SECTIONPAGES': (self.__num_type_and_format_func, 'num-of-pages-in-section'),
            'SECTION': (self.__num_type_and_format_func, 'insert-section-number'),
            'QUOTE': (self.__num_type_and_format_func, 'quote'),
            # number formatting (\# "")
            'PAGE': (self.__default_inst_func, 'insert-page-number'),
            'page': (self.__default_inst_func, 'insert-page-number'),
            # date format (\@ "")
            'CREATEDATE': (self.__date_func, 'insert-date'),
            'PRINTDATE': (self.__date_func, 'insert-date'),
            # PRINTDATE?
            'SAVEDATE': (self.__date_func, 'last-saved'),
            'TIME': (self.__date_func, 'insert-time'),
            # numbers?
            # these fields take four switches
            'AUTHOR': (self.__simple_info_func, 'user-name'),
            'COMMENTS': (self.__simple_info_func, 'comments'),
            'FILENAME': (self.__simple_info_func, 'file-name'),
            'filename': (self.__simple_info_func, 'file-name'),
            'KEYWORDS': (self.__simple_info_func, 'keywords'),
            'LASTSAVEDBY': (self.__simple_info_func, 'last-saved-by'),
            'SUBJECT': (self.__simple_info_func, 'subject'),
            'TEMPLATE': (self.__simple_info_func, 'based-on-template'),
            'TITLE': (self.__simple_info_func, 'document-title'),
            'USERADDRESS': (self.__simple_info_func, 'user-address'),
            'USERINITIALS': (self.__simple_info_func, 'user-initials'),
            'USERNAME': (self.__simple_info_func, 'user-name'),
            'EQ': (self.__equation_func, 'equation'),
            'HYPERLINK': (self.__hyperlink_func, 'hyperlink'),
            'INCLUDEPICTURE': (self.__include_pict_func, 'include-picture'),
            'INCLUDETEXT': (self.__include_text_func, 'include-text-from-file'),
            'INDEX': (self.__index_func, 'index'),
            'NOTEREF': (self.__note_ref_func, 'reference-to-note'),
            'PAGEREF': (self.__page_ref_func, 'reference-to-page'),
            'REF': (self.__ref_func, 'reference'),
            'ref': (self.__ref_func, 'reference'),
            'SEQ': (self.__sequence_func, 'numbering-sequence'),
            'SYMBOL': (self.__symbol_func, 'symbol'),
            'TA': (self.__ta_func, 'anchor-for-table-of-authorities'),
            'TOA': (self.__toc_table_func, 'table-of-authorities'),
            'TOC': (self.__toc_table_func, 'table-of-contents'),
            # no switches
            'AUTONUMOUT': (self.__no_switch_func, 'auto-num-out?'),
            'COMPARE': (self.__no_switch_func, 'compare'),
            'DOCVARIABLE': (self.__no_switch_func, 'document-variable'),
            'GOTOBUTTON': (self.__no_switch_func, 'go-button'),
            'NEXT': (self.__no_switch_func, 'next'),
            'NEXTIF': (self.__no_switch_func, 'next-if'),
            'SKIPIF': (self.__no_switch_func, 'skip-if'),
            'IF': (self.__no_switch_func, 'if'),
            'MERGEFIELD': (self.__no_switch_func, 'merge-field'),
            'MERGEREC': (self.__no_switch_func, 'merge-record'),
            'MERGESEQ': (self.__no_switch_func, 'merge-sequence'),
            'PLACEHOLDER': (self.__no_switch_func, 'place-holder'),
            'PRIVATE': (self.__no_switch_func, 'private'),
            'RD': (self.__no_switch_func, 'referenced-document'),
            'SET': (self.__no_switch_func, 'set'),
            # default instructions (no dedicated method has been written)
            'ADVANCE': (self.__default_inst_func, 'advance'),
            'ASK': (self.__default_inst_func, 'prompt-user'),
            'AUTONUMLGL': (self.__default_inst_func, 'automatic-number'),
            'AUTONUM': (self.__default_inst_func, 'automatic-number'),
            'AUTOTEXTLIST': (self.__default_inst_func, 'auto-list-text'),
            'AUTOTEXT': (self.__default_inst_func, 'auto-text'),
            'BARCODE': (self.__default_inst_func, 'barcode'),
            'CONTACT': (self.__default_inst_func, 'contact'),
            'DATABASE': (self.__default_inst_func, 'database'),
            'DATE': (self.__default_inst_func, 'date'),
            'date': (self.__default_inst_func, 'date'),
            'DOCPROPERTY': (self.__default_inst_func, 'document-property'),
            'FILESIZE': (self.__default_inst_func, 'file-size'),
            'FILLIN': (self.__default_inst_func, 'fill-in'),
            'INFO': (self.__default_inst_func, 'document-info'),
            'LINK': (self.__default_inst_func, 'link'),
            'PA': (self.__default_inst_func, 'page'),
            'PRINT': (self.__default_inst_func, 'print'),
            'STYLEREF': (self.__default_inst_func, 'style-reference'),
            'USERPROPERTY': (self.__default_inst_func, 'user-property'),
            'FORMCHECKBOX': (self.__default_inst_func, 'form-checkbox'),
            'FORMTEXT': (self.__default_inst_func, 'form-text'),
            # buttons
            'MACROBUTTON': (self.__default_inst_func, 'macro-button'),
        }
        self.__number_dict = {
            'Arabic': 'arabic',
            'alphabetic': 'alphabetic',
            'ALPHABETIC': 'capital-alphabetic',
            'roman': 'roman',
            'ROMAN': 'capital-roman',
            'Ordinal': 'ordinal',
            'CardText': 'cardinal-text',
            'OrdText': 'ordinal-text',
            'Hex': 'hexidecimal',
            'DollarText': 'dollar-text',
            'Upper': 'upper-case',
            'Lower': 'lower-case',
            'FirstCap': 'first-cap',
            'Caps': 'caps',
        }
        self.__text_format_dict = {
            'Upper': 'upper',
            'Lower': 'lower',
            'FirstCap': 'first-cap',
            'Caps': 'caps',
        }
        self.__symbol_num_exp = re.compile(r'SYMBOL (.*?) ')
        self.__symbol_font_exp = re.compile(r'\\f "(.*?)"')
        self.__symbol_size_exp = re.compile(r'\\s (\d+)')
        # \\@ "dddd, MMMM d, yyyy"
        self.__date_exp = re.compile(r'\\@\s{1,}"(.*?)"')
        self.__num_type_exp = re.compile(
            r'\\\*\s{1,}(Arabic|alphabetic|ALPHABETIC|roman|ROMAN|Ordinal|CardText|OrdText|Hex|DollarText|Upper|Lower|FirstCap|Caps)')
        self.__format_text_exp = re.compile(r'\\\*\s{1,}(Upper|Lower|FirstCap|Caps)')
        self.__merge_format_exp = re.compile(r'\\\*\s{1,}MERGEFORMAT')
        self.__ta_short_field_exp = re.compile(r'\\s\s{1,}"(.*?)"')
        self.__ta_long_field_exp = re.compile(r'\\l\s{1,}"(.*?)"')
        self.__ta_category_exp = re.compile(r'\\c\s{1,}(\d+)')
        # indices
        self.__index_insert_blank_line_exp = re.compile(r'\\h\s{1,}""')
        # The group used to be empty ("()"), so a heading letter could never
        # be captured; capture whatever sits between the quotes.
        self.__index_insert_letter_exp = re.compile(r'\\h\s{1,}"(.*?)"')
        self.__index_columns_exp = re.compile(r'\\c\s{1,}"(.*?)"')
        self.__bookmark_exp = re.compile(r'\\b\s{1,}(.*?)\s')
        self.__d_separator = re.compile(r'\\d\s{1,}(.*?)\s')
        self.__e_separator = re.compile(r'\\e\s{1,}(.*?)\s')
        self.__l_separator = re.compile(r'\\l\s{1,}(.*?)\s')
        self.__p_separator = re.compile(r'\\p\s{1,}(.*?)\s')
        self.__index_sequence = re.compile(r'\\s\s{1,}(.*?)\s')
        self.__index_entry_typ_exp = re.compile(r'\\f\s{1,}"(.*?)"')
        self.__quote_exp = re.compile(r'"(.*?)"')
        self.__filter_switch = re.compile(r'\\c\s{1,}(.*?)\s')
        # Pattern for the HYPERLINK "\l" switch (optionally quoted target).
        # It is compiled once here instead of on every hyperlink.
        self.__link_switch = re.compile(r'\\l\s{1,}"{0,1}(.*?)"{0,1}\s')

    def process_string(self, my_string, type):
        """
        Requires:
            my_string -- the string to parse (one token per line).
            type -- the type of string (not used).
        Returns:
            A list of three values; the last one is the string for the
            field instruction attribute.
        Logic:
            This handles all "large" fields, which means everything except
            toc entries, index entries, and bookmarks.
            Join the text of all text tokens (lines starting with 'tx';
            the text starts at column 17). The first word of the result is
            the field's name. Look the name up in the field instruction
            dictionary and let the handler parse the string.
            If the name is unknown (or the instruction has no text at
            all), report it on stderr, raise the bug handler when the run
            level is above 3, and otherwise return the fall-back value.
        """
        changed_string = ''.join(
            line[17:] for line in my_string.split('\n') if line[0:2] == 'tx')
        fields = changed_string.split()
        # An instruction without any text has no field name; treat it like
        # an unknown field instead of failing on fields[0].
        field_name = fields[0] if fields else ''
        action, name = self.__field_instruction_dict.get(field_name, (None, None))
        if name:
            if re.search(self.__merge_format_exp, changed_string):
                name += '<update>dynamic'
            else:
                name += '<update>static'
        if action:
            return action(field_name, name, changed_string)
        # no name--not in the dictionary above
        msg = 'no key for "%s" "%s"\n' % (field_name, changed_string)
        sys.stderr.write(msg)
        if self.__run_level > 3:
            raise self.__bug_handler(msg)
        return self.__fall_back_func(field_name, changed_string)

    def __switch_flags(self, line, switches):
        """
        Requires:
            line -- the string to be parsed
            switches -- sequence of (switch text, tag name) pairs
        Returns:
            '<tag>true' for every switch found in line, in the order given.
        """
        return ''.join('<%s>true' % tag
                       for switch, tag in switches if switch in line)

    def __bookmarks(self, line):
        """
        Requires:
            line -- the string to be parsed
        Returns:
            '<bookmark>word' for every word after the field name that does
            not start with a backslash (i.e. is not a switch).
        """
        return ''.join('<bookmark>%s' % word
                       for word in line.split()[1:] if word[0:1] != '\\')

    def __parse_text_format(self, the_string):
        """
        Requires:
            the_string -- the string to parse
        Returns:
            the readable name of a "\\* Upper|Lower|FirstCap|Caps" switch,
            or None when there is no such switch.
        """
        match_group = re.search(self.__format_text_exp, the_string)
        if not match_group:
            return None
        name = match_group.group(1)
        changed_name = self.__text_format_dict.get(name)
        if not changed_name:
            sys.stderr.write('module is fields_string\n')
            sys.stderr.write('method is __parse_text_format\n')
            sys.stderr.write('no dictionary entry for %s\n' % name)
        return changed_name

    def __default_inst_func(self, field_name, name, line):
        """
        Requires:
            field_name -- the first word in the string
            name -- the changed name according to the dictionary
            line -- the string to be parsed
        Returns:
            A list whose last item is the changed name.
        Logic:
            Only the changed name is needed for these fields.
        """
        return [None, None, name]

    def __fall_back_func(self, field_name, line):
        """
        Requires:
            field_name -- the first word in the string
            line -- the string to be parsed (not used)
        Returns:
            A list whose last item is the field name marked
            '<update>none'.
        Logic:
            Used for fields not found in the dictionary.
        """
        return [None, None, field_name + '<update>none']

    def __equation_func(self, field_name, name, line):
        """
        Requires:
            field_name -- the first word in the string
            name -- the changed name according to the dictionary
            line -- the string to be parsed
        Returns:
            A list whose last item is the changed name.
        """
        return [None, None, name]

    def __no_switch_func(self, field_name, name, line):
        """
        Requires:
            field_name -- the first word in the string
            name -- the changed name according to the dictionary
            line -- the string to be parsed
        Returns:
            A list whose last item is the changed name.
        """
        return [None, None, name]

    def __num_type_and_format_func(self, field_name, name, line):
        """
        Requires:
            field_name -- the first word in the string
            name -- the changed name according to the dictionary
            line -- the string to be parsed
        Returns:
            list of None, None, and part of a tag
        Logic:
            parse num_format
            parse num_type
            QUOTE additionally carries a quoted argument.
        """
        the_string = name
        num_format = self.__parse_num_format(line)
        if num_format:
            the_string += '<number-format>%s' % num_format
        num_type = self.__parse_num_type(line)
        if num_type:
            the_string += '<number-type>%s' % num_type
        # Only QUOTE takes a (mandatory?) argument
        if field_name == 'QUOTE':
            match_group = re.search(r'QUOTE\s{1,}"(.*?)"', line)
            if match_group:
                the_string += '<argument>%s' % match_group.group(1)
        return [None, None, the_string]

    def __num_format_func(self, field_name, name, line):
        """
        Requires:
            field_name -- the first word in the string
            name -- the changed name according to the dictionary
            line -- the string to be parsed
        Returns:
            list of None, None, and part of a tag
        """
        the_string = name
        num_format = self.__parse_num_format(line)
        if num_format:
            the_string += '<number-format>%s' % num_format
        return [None, None, the_string]

    def __parse_num_format(self, the_string):
        """
        Requires:
            the_string -- the string to parse
        Returns:
            a string if the_string contains number formatting information
            None, otherwise
        Logic:
            Search with the same pattern the date fields use.
        """
        # NOTE(review): this consults the '\@ "..."' pattern although the
        # dictionary comments speak of '\# "..."' -- confirm which switch
        # is intended.
        match_group = re.search(self.__date_exp, the_string)
        if match_group:
            # A match object is not callable; the text lives in group 1.
            return match_group.group(1)
        return None

    def __parse_num_type(self, the_string):
        """
        Requires:
            the_string -- the string to parse
        Returns:
            a string if the_string contains number type information
            None, otherwise
        Logic:
            the_string might look like:
            USERNAME \\* Arabic \\* MERGEFORMAT
            Get the \\* Arabic part. Use a dictionary to convert the
            "Arabic" to a more-readable word for the value of the key
            "number-type".
        """
        match_group = re.search(self.__num_type_exp, the_string)
        if not match_group:
            return None
        name = match_group.group(1)
        changed_name = self.__number_dict.get(name)
        if changed_name:
            return changed_name
        sys.stderr.write('module is fields_string\n')
        sys.stderr.write('method is __parse_num_type\n')
        sys.stderr.write('no dictionary entry for %s\n' % name)
        return None

    def __date_func(self, field_name, name, line):
        """
        Requires:
            field_name -- the first word in the string
            name -- the changed name according to the dictionary
            line -- the string to be parsed
        Returns:
            list of None, None, and part of a tag
        Logic:
            Add the quoted argument of the \\@ switch as the date format.
        """
        the_string = name
        match_group = re.search(self.__date_exp, line)
        if match_group:
            the_string += '<date-format>%s' % match_group.group(1)
        return [None, None, the_string]

    def __simple_info_func(self, field_name, name, line):
        """
        Requires:
            field_name -- the first word in the string
            name -- the changed name according to the dictionary
            line -- the string to be parsed
        Returns:
            list of None, None, and part of a tag
        Logic:
            These fields can only have the following switches:
                1. Upper
                2. Lower
                3. FirstCap
                4. Caps
        """
        the_string = name
        text_format = self.__parse_text_format(line)
        if text_format:
            the_string += '<format>%s' % text_format
        return [None, None, the_string]

    def __hyperlink_func(self, field_name, name, line):
        """
        Requires:
            field_name -- the first word in the string
            name -- the changed name according to the dictionary
            line -- the string to be parsed
        Returns:
            list of None, None, and part of a tag
        Logic:
            The \\l switch gives the link; the first quoted string that
            remains once that switch is removed is the argument.
        """
        the_string = name
        match_group = re.search(self.__link_switch, line)
        if match_group:
            link = match_group.group(1).replace('"', "&quot;")
            the_string += '<link>%s' % link
            # \l "txt" "link"
            # drop the switch so its text is not taken for the argument
            line = re.sub(self.__link_switch, '', line)
        match_group = re.search(self.__quote_exp, line)
        if match_group:
            the_string += '<argument>%s' % match_group.group(1)
        the_string += self.__switch_flags(line, (
            ('\\m', 'html2-image-map'),
            ('\\n', 'new-window'),
            ('\\h', 'no-history'),
        ))
        return [None, None, the_string]

    def __include_text_func(self, field_name, name, line):
        """
        Requires:
            field_name -- the first word in the string
            name -- the changed name according to the dictionary
            line -- the string to be parsed
        Returns:
            list of None, None, and part of a tag
        Logic:
            Collect the text format, the \\c filter, the quoted argument
            and the \\! switch.
        """
        the_string = name
        text_format = self.__parse_text_format(line)
        if text_format:
            the_string += '<format>%s' % text_format
        match_group = re.search(self.__filter_switch, line)
        if match_group:
            the_string += '<filter>%s' % match_group.group(1)
            # \c "txt" "file name"
            # want "file name" so must get rid of \c "txt"
            line = re.sub(self.__filter_switch, '', line)
        match_group = re.search(self.__quote_exp, line)
        if match_group:
            arg = match_group.group(1).replace('"', "&quot;")
            the_string += '<argument>%s' % arg
        else:
            sys.stderr.write('Module is field_strings\n')
            sys.stderr.write('method is include_text_func\n')
            sys.stderr.write('no argument for include text\n')
        the_string += self.__switch_flags(line, (('\\!', 'no-field-update'),))
        return [None, None, the_string]

    def __include_pict_func(self, field_name, name, line):
        """
        Requires:
            field_name -- the first word in the string
            name -- the changed name according to the dictionary
            line -- the string to be parsed
        Returns:
            list of None, None, and part of a tag
        Logic:
            Collect the \\c filter, the quoted argument and the \\d
            switch.
        """
        the_string = name
        match_group = re.search(self.__filter_switch, line)
        if match_group:
            arg = match_group.group(1).replace('"', "&quot;")
            the_string += '<filter>%s' % arg
            # \c "txt" "file name"
            # want "file name" so must get rid of \c "txt"
            line = re.sub(self.__filter_switch, '', line)
        match_group = re.search(self.__quote_exp, line)
        if match_group:
            the_string += '<argument>%s' % match_group.group(1)
        else:
            sys.stderr.write('Module is field_strings\n')
            sys.stderr.write('method is include_pict_func\n')
            sys.stderr.write('no argument for include pict\n')
        the_string += self.__switch_flags(line, (('\\d', 'external'),))
        return [None, None, the_string]

    def __ref_func(self, field_name, name, line):
        """
        Requires:
            field_name -- the first word in the string
            name -- the changed name according to the dictionary
            line -- the string to be parsed
        Returns:
            list of None, None, and part of a tag
        Logic:
            A reference field looks like this:
                REF _Toc440880424 \\h
            Every word after the field name that is not a switch is used
            as a bookmark, which serves as an anchor in the resulting XML
            file. The switches are turned into boolean attributes.
        """
        the_string = name
        text_format = self.__parse_text_format(line)
        if text_format:
            the_string += '<format>%s' % text_format
        line = re.sub(self.__merge_format_exp, '', line)
        the_string += self.__bookmarks(line)
        the_string += self.__switch_flags(line, (
            ('\\f', 'include-note-number'),
            ('\\h', 'hyperlink'),
            ('\\n', 'insert-number'),
            ('\\r', 'insert-number-relative'),
            ('\\p', 'paragraph-relative-position'),
            ('\\t', 'suppress-non-delimeter'),
            ('\\w', 'insert-number-full'),
        ))
        return [None, None, the_string]

    def __toc_table_func(self, field_name, name, line):
        """
        Requires:
            field_name -- the name of the first word in the string
            name -- the changed name, according to the dictionary.
            line -- the string to be parsed.
        Returns:
            A list of the unchanged name, None, and the string for the
            table field.
        Logic:
            If the string contains \\c "Figure", it is a table of figures.
            Otherwise, it is a plain old table of contents.
        """
        the_string = name
        if '\\c "Figure"' in line:
            the_string = the_string.replace('table-of-contents', 'table-of-figures')
        # the first value in this list is probably not needed
        return [name, None, the_string]

    def __sequence_func(self, field_name, name, line):
        """
        Requires:
            field_name -- the name of the first word in the string.
            name -- the changed name according to the dictionary.
            line -- the string to parse.
        Returns:
            A list whose last item holds the name and the label.
        Logic:
            The type of sequence--whether figure, graph, my-name, or
            whatever--is represented by the second word in the string.
            Extract and return.
                SEQ Figure \\* ARABIC
            A SEQ without a second word gets no label attribute.
        """
        fields = line.split()
        if len(fields) < 2:
            return [None, None, name]
        return [None, None, '%s<label>%s' % (name, fields[1])]

    def __ta_func(self, field_name, name, line):
        """
        Requires:
            field_name -- the name of the first word in the string.
            name -- the changed name according to the dictionary.
            line -- the string to parse.
        Returns:
            list of None, None, and part of a tag
        Logic:
            Collect the short (\\s) and long (\\l) citation, the category
            (\\c) and the bold (\\b) and italics (\\i) switches.
        """
        the_string = name
        for pattern, tag in (
                (self.__ta_short_field_exp, 'short-field'),
                (self.__ta_long_field_exp, 'long-field'),
                (self.__ta_category_exp, 'category'),
        ):
            match_group = re.search(pattern, line)
            if match_group:
                the_string += '<%s>%s' % (tag, match_group.group(1))
        the_string += self.__switch_flags(line, (
            ('\\b', 'bold'),
            ('\\i', 'italics'),
        ))
        return [None, None, the_string]

    def __index_func(self, field_name, name, line):
        """
        Requires:
            field_name -- the name of the first word in the string.
            name -- the changed name according to the dictionary.
            line -- the string to parse.
        Returns:
            list of None, None, and part of a tag
        Logic:
            \\h "" asks for a blank line between groups, \\h "x" for a
            heading. The other switches that carry a value are looked up
            in a fixed order; separators and the sequence name may contain
            quotes, which are escaped.
        """
        the_string = name
        if re.search(self.__index_insert_blank_line_exp, line):
            the_string += '<insert-blank-line>true'
        else:
            match_group = re.search(self.__index_insert_letter_exp, line)
            if match_group:
                the_string += '<insert-letter>%s' % match_group.group(1)
        # (pattern, tag, escape double quotes in the value?)
        value_switches = (
            (self.__index_columns_exp, 'number-of-columns', False),
            (self.__bookmark_exp, 'use-bookmark', False),
            (self.__d_separator, 'sequence-separator', True),
            (self.__e_separator, 'page-separator', True),
            # The sequence value itself is escaped here; the old code
            # escaped an unrelated (possibly undefined) variable.
            (self.__index_sequence, 'use-sequence', True),
            (self.__index_entry_typ_exp, 'entry-type', False),
            (self.__p_separator, 'limit-to-letters', False),
            (self.__l_separator, 'multi-page-separator', True),
        )
        for pattern, tag, escape_quotes in value_switches:
            match_group = re.search(pattern, line)
            if not match_group:
                continue
            value = match_group.group(1)
            if escape_quotes:
                value = value.replace('"', '&quot;')
            the_string += '<%s>%s' % (tag, value)
        the_string += self.__switch_flags(line, (
            ('\\a', 'accented'),
            ('\\r', 'sub-entry-on-same-line'),
            ('\\t', 'enable-yomi-text'),
        ))
        return [None, None, the_string]

    def __page_ref_func(self, field_name, name, line):
        """
        Requires:
            field_name -- first name in the string.
            name -- the changed name according to the dictionary.
            line -- the string to parse.
        Returns:
            list of None, None, and part of a tag
        Logic:
            Collect the number format and type, the bookmark(s) and the
            \\h and \\p switches.
        """
        the_string = name
        num_format = self.__parse_num_format(line)
        if num_format:
            the_string += '<number-format>%s' % num_format
        num_type = self.__parse_num_type(line)
        if num_type:
            the_string += '<number-type>%s' % num_type
        line = re.sub(self.__merge_format_exp, '', line)
        the_string += self.__bookmarks(line)
        the_string += self.__switch_flags(line, (
            ('\\h', 'hyperlink'),
            ('\\p', 'paragraph-relative-position'),
        ))
        return [None, None, the_string]

    def __note_ref_func(self, field_name, name, line):
        """
        Requires:
            field_name -- first name in the string.
            name -- the changed name according to the dictionary.
            line -- the string to parse.
        Returns:
            list of None, None, and part of a tag
        Logic:
            Collect the bookmark(s) and the \\h, \\p and \\f switches.
        """
        the_string = name
        line = re.sub(self.__merge_format_exp, '', line)
        the_string += self.__bookmarks(line)
        the_string += self.__switch_flags(line, (
            ('\\h', 'hyperlink'),
            ('\\p', 'paragraph-relative-position'),
            ('\\f', 'include-note-number'),
        ))
        return [None, None, the_string]

    def __symbol_func(self, field_name, name, line):
        """
        Requires:
            field_name -- first name in the string.
            name -- the changed name according to the dictionary.
            line -- the string to parse.
        Returns:
            A list of 'Symbol', None, and a string of tokens containing
            font style, font size, and a hexidecimal value.
        Logic:
            The SYMBOL field is one of Microsoft's many quirky ways of
            entering text. The string handed to this method looks like
            this:
                SYMBOL 97 \\f "Symbol" \\s 12
            The first word merely tells us that we have encountered a
            SYMBOL field.
            The next value is the Microsoft decimal value. Change this to
            hexidecimal.
            The pattern '\\f "some font"' tells us the font.
            The pattern '\\s some size' tells us the font size.
            Extract all of this information into a string of tokens, and
            make this string the last item in a list. The first item in
            the list is the simple word 'Symbol', which tells the caller
            that this is not really a field, but character data.
        """
        num = ''
        changed_line = ''
        search_obj = re.search(self.__symbol_num_exp, line)
        if search_obj:
            num = '%X' % int(search_obj.group(1))
        search_obj = re.search(self.__symbol_font_exp, line)
        if search_obj:
            changed_line += 'cw<ci<font-style<nu<%s\n' % search_obj.group(1)
        search_obj = re.search(self.__symbol_size_exp, line)
        if search_obj:
            font_size = '%.2f' % int(search_obj.group(1))
            changed_line += 'cw<ci<font-size_<nu<%s\n' % font_size
        changed_line += 'tx<hx<__________<\'%s\n' % num
        return ['Symbol', None, changed_line]

View File

@@ -0,0 +1,378 @@
from __future__ import unicode_literals, absolute_import, print_function, division
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# #
#########################################################################
import sys, os
from calibre.ebooks.rtf2xml import field_strings, copy
from calibre.ptempfile import better_mktemp
from . import open_for_read, open_for_write
class FieldsLarge:
r"""
=========================
Logic
=========================
Make tags for fields.
-Fields reflect text that Microsoft Word automatically generates.
-Each file contains (or should contain) an inner group called field instructions.
-Fields can be nested.
--------------
Logic
--------------
1. As soon as a field is found, make a new text string by appending an empty
text string to the field list. Collect all the lines in this string until the
field instructions are found.
2. Collect all the tokens and text in the field instructions. When the end of
the field instructions is found, process the string of text with the
field_strings module. Append the processed string to the field instructions
list.
3. Continue collecting tokens. Check for paragraphs or sections. If either is found, add to the paragraph or section list.
4. Continue collecting tokens and text until either the beginning of a new field is found, or the end of this field is found.
5. If a new field is found, repeat steps 1-3.
6. If the end of the field is found, process the last text string of the field list.
7. If the field list is empty (after removing the last text string), there are
no more fields. Print out the final string. If the list contains other strings,
add the processed string to the last string in the field list.
============================
Examples
============================
This line of RTF:
{\field{\*\fldinst { CREATEDATE \\* MERGEFORMAT }}{\fldrslt {
\lang1024 1/11/03 10:34 PM}}}
Becomes:
<field type = "insert-time">
10:34 PM
</field>
The simple field in the above example contains no paragraph or section breaks.
This line of RTF:
{{\field{\*\fldinst SYMBOL 97 \\f "Symbol" \\s 12}{\fldrslt\f3\fs24}}}
Becomes:
<para><inline font-size="18"><inline font-style="Symbol">&#x03A7;</inline></inline></para>
The RTF in the example above should be represented as UTF-8 rather than a field.
This RTF:
{\field\fldedit{\*\fldinst { TOC \\o "1-3" }}{\fldrslt {\lang1024
Heading one\tab }{\field{\*\fldinst {\lang1024 PAGEREF _Toc440880424
\\h }{\lang1024 {\*\datafield
{\lang1024 1}}}{\lang1024 \par }\pard\plain
\s18\li240\widctlpar\tqr\tldot\tx8630\aspalpha\aspnum\faauto\adjustright\rin0\lin240\itap0
\f4\lang1033\cgrid {\lang1024 Heading 2\tab }{\field{\*\fldinst
{\lang1024 PAGEREF _Toc440880425 \\h }{\lang1024 {\*\datafield
{\lang1024 1}}}{\lang1024 \par }\pard\plain
\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0
\f4\lang1033\cgrid }}\pard\plain
\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0
\f4\lang1033\cgrid {\fs28 \\u214\'85 \par }{\fs36 {\field{\*\fldinst
SYMBOL 67 \\f "Symbol" \\s 18}{\fldrslt\f3\fs36}}}
Becomes:
<field-block type="table-of-contents">
<paragraph-definition language="1033" nest-level="0"
font-style="Times" name="toc 1" adjust-right="true"
widow-control="true">
<para><inline language="1024">Heading one&#x009;</inline><field
type="reference-to-page" ref="_Toc440880424"><inline
language="1024">1</inline></field></para>
</paragraph-definition>
<paragraph-definition language="1033" nest-level="0" left-indent="12"
font-style="Times" name="toc 2" adjust-right="true"
widow-control="true">
<para><inline language="1024">Heading 2&#x009;</inline><field
type="reference-to-page" ref="_Toc440880425"><inline
language="1024">1</inline></field></para>
</paragraph-definition>
</field-block>
"""
def __init__(self,
             in_file,
             bug_handler,
             copy=None,
             run_level=1,
             ):
    """
    Required:
        'in_file' -- file to parse
        'bug_handler' -- stored for later use; it is passed on to the
            field string parser
    Optional:
        'copy' -- whether to make a copy of result for debugging
        'run_level' -- stored for later use (default 1)
    Returns:
        nothing
    """
    # Name of the temporary file the processed output is written to.
    self.__write_to = better_mktemp()
    self.__run_level = run_level
    self.__copy = copy
    self.__bug_handler = bug_handler
    self.__file = in_file
def __initiate_values(self):
    """
    Set every value the state machine relies on to its starting point:
    the current state, the dispatch tables, the field-instruction parser
    and the per-field stacks (one entry per currently open field).
    """
    # state machine
    self.__state = 'before_body'
    self.__state_dict = {
        'before_body': self.__before_body_func,
        'in_body': self.__in_body_func,
        'field': self.__in_field_func,
        'field_instruction': self.__field_instruction_func,
    }
    # tokens that matter while in the body
    self.__in_body_dict = {
        'cw<fd<field_____': self.__found_field_func,
    }
    # tokens that matter while inside a field
    self.__field_dict = {
        'cw<fd<field-inst': self.__found_field_instruction_func,
        'cw<fd<field_____': self.__found_field_func,
        'cw<pf<par-end___': self.__par_in_field_func,
        'cw<sc<section___': self.__sec_in_field_func,
    }
    # parser for the collected field instruction
    self.__string_obj = field_strings.FieldStrings(
        bug_handler=self.__bug_handler,
        run_level=self.__run_level,
    )
    # plain strings
    self.__marker = 'mi<mk<inline-fld\n'
    self.__text_string = ''
    self.__field_instruction_string = ''  # string that collects field instruction
    # whether or not the field is really UTF-8
    # (these fields cannot be nested.)
    self.__symbol = 0
    # stacks
    self.__field_string = []  # list of field strings
    self.__field_count = []  # keep track of the brackets
    self.__field_instruction = []  # field instruction strings
    self.__par_in_field = []  # paragraphs in field?
    self.__sec_in_field = []  # sections in field?
def __before_body_func(self, line):
"""
Requried:
line --line ro parse
Returns:
nothing (changes an instant and writes a line)
Logic:
Check for the beginninf of the body. If found, changed the state.
Always write out the line.
"""
if self.__token_info == 'mi<mk<body-open_':
self.__state = 'in_body'
self.__write_obj.write(line)
def __in_body_func(self, line):
"""
Required:
line --line to parse
Returns:
nothing. (Writes a line to the output file, or performs other actions.)
Logic:
Check of the beginning of a field. Always output the line.
"""
action = self.__in_body_dict.get(self.__token_info)
if action:
action(line)
self.__write_obj.write(line)
def __found_field_func(self, line):
"""
Requires:
line --line to parse
Returns:
nothing
Logic:
Set the values for parseing the field. Four lists have to have
items appended to them.
"""
self.__state = 'field'
self.__cb_count = 0
ob_count = self.__ob_count
self.__field_string.append('')
self.__field_count.append(ob_count)
self.__sec_in_field.append(0)
self.__par_in_field.append(0)
def __in_field_func(self, line):
    """
    Requires:
        line -- the line to parse
    Returns:
        nothing
    Logic:
        If the closing-bracket count matches the count stored for the
        innermost field, the field is finished: add the line to the field
        text and close the field. Otherwise check for a paragraph break, a
        section break, a nested field, or the start of the field
        instruction, and run the matching handler. Any other line is
        simply added to the text of the innermost field.
    """
    if self.__cb_count == self.__field_count[-1]:
        self.__field_string[-1] += line
        self.__end_field_func()
        return
    handler = self.__field_dict.get(self.__token_info)
    if handler is None:
        self.__field_string[-1] += line
    else:
        handler(line)
def __par_in_field_func(self, line):
    """
    Requires:
        line -- the line to parse
    Returns:
        nothing
    Logic:
        A paragraph end was found inside a field. Add the line to the
        text of the innermost field and flag that field as containing a
        paragraph break.
    """
    current_text = self.__field_string[-1]
    self.__field_string[-1] = current_text + line
    self.__par_in_field[-1] = 1
def __sec_in_field_func(self, line):
    """
    Requires:
        line -- the line to parse
    Returns:
        nothing
    Logic:
        A section break was found inside a field. Add the line to the
        text of the innermost field and flag that field as containing a
        section break.
    """
    current_text = self.__field_string[-1]
    self.__field_string[-1] = current_text + line
    self.__sec_in_field[-1] = 1
def __found_field_instruction_func(self, line):
    """
    Requires:
        line -- the line to parse (not used here)
    Returns:
        nothing
    Logic:
        Switch to the 'field_instruction' state. Remember the current
        opening-bracket count so the end of the instruction can be
        recognized, and reset the closing-bracket count so this state is
        not left prematurely.
    """
    self.__state = 'field_instruction'
    opening_count = self.__ob_count
    self.__field_instruction_count = opening_count
    self.__cb_count = 0
def __field_instruction_func(self, line):
    """
    Requires:
        line -- the line to parse
    Returns:
        nothing
    Logic:
        Collect lines until the bracket that opened the field instruction
        is closed. Then hand the collected text to the field_strings
        object, push the resulting instruction on the instruction stack,
        note whether the field is a 'Symbol' field, and go back to the
        'field' state.
    """
    if self.__cb_count != self.__field_instruction_count:
        # Still inside the instruction: keep collecting.
        self.__field_instruction_string += line
        return
    # The opening bracket was written to the field text, so the closing
    # bracket has to be written as well.
    self.__field_string[-1] += line
    parsed = self.__string_obj.process_string(
        self.__field_instruction_string, 'field_instruction')
    self.__field_instruction.append(parsed[2])
    if parsed[0] == 'Symbol':
        self.__symbol = 1
    self.__state = 'field'
    self.__field_instruction_string = ''
def __end_field_func(self):
    """
    Requires:
        nothing
    Returns:
        nothing
    Logic:
        Pop the innermost entry from the bracket-count, instruction,
        field-text, section and paragraph stacks.
        A symbol field gets no <field> tags: it is replaced by its
        instruction followed by a closing-bracket token.
        A field containing a paragraph or section break is written as a
        field-block; any other field is written as an inline field,
        preceded by the inline marker.
        Section and paragraph markers are added around the result for
        later parsing.
        If an outer field is still open, the result is appended to its
        text; otherwise it is written to the output file.
    """
    # Unwind one nesting level from each of the parallel stacks.
    closing_count = self.__field_count.pop()
    field_type = self.__field_instruction.pop()
    body = self.__field_string.pop()
    has_section = self.__sec_in_field.pop()
    has_paragraph = self.__par_in_field.pop()
    if self.__symbol:
        template = '%scb<nu<clos-brack<%s\n'
        wrapped = template % (field_type, closing_count)
    elif has_section or has_paragraph:
        template = (
            'mi<mk<fldbkstart\n'
            'mi<tg<open-att__<field-block<type>%s\n%s'
            'mi<mk<fldbk-end_\n'
            'mi<tg<close_____<field-block\n'
            'mi<mk<fld-bk-end\n'
        )
        wrapped = template % (field_type, body)
    else:
        # Inline field: the marker lets later passes find it.
        template = (
            '%s'
            'mi<tg<open-att__<field<type>%s\n%s'
            'mi<tg<close_____<field\n'
        )
        wrapped = template % (self.__marker, field_type, body)
    if has_section:
        wrapped = 'mi<mk<sec-fd-beg\n' + wrapped + 'mi<mk<sec-fd-end\n'
    if has_paragraph:
        wrapped = 'mi<mk<par-in-fld\n' + wrapped
    if self.__field_string:
        # Nested field: hand the result to the enclosing field.
        self.__field_string[-1] += wrapped
    else:
        self.__write_field_string(wrapped)
    self.__symbol = 0
def __write_field_string(self, the_string):
    """
    Requires:
        the_string -- the fully processed outermost field
    Returns:
        nothing
    Logic:
        The outermost field is finished, so go back to the 'in_body'
        state and write the field to the output file.
    """
    self.__state = 'in_body'
    output = self.__write_obj
    output.write(the_string)
def fix_fields(self):
    """
    Requires:
        nothing
    Returns:
        nothing (changes the original file)
    Logic:
        Read one line in at a time and keep track of the most recent
        opening and closing bracket counts. Determine what action to take
        based on the state. If the state is before the body, look for the
        beginning of the body. If the state is body, send the line to the
        body method. The remaining states collect the text of a field and
        of its instruction.
        When the whole file has been processed, optionally copy the
        result for debugging, then replace the original file with it.
    """
    self.__initiate_values()
    # The context managers close both files even if a handler raises.
    with open_for_read(self.__file) as read_obj:
        with open_for_write(self.__write_to) as self.__write_obj:
            for line in read_obj:
                self.__token_info = line[:16]
                if self.__token_info == 'ob<nu<open-brack':
                    self.__ob_count = line[-5:-1]
                if self.__token_info == 'cb<nu<clos-brack':
                    self.__cb_count = line[-5:-1]
                action = self.__state_dict.get(self.__state)
                if action is None:
                    # Should not happen. Report it and pass the line
                    # through instead of calling None.
                    sys.stderr.write(
                        'no matching state in module fields_large.py\n')
                    sys.stderr.write(self.__state + '\n')
                    self.__write_obj.write(line)
                    continue
                action(line)
    copy_obj = copy.Copy(bug_handler=self.__bug_handler)
    if self.__copy:
        copy_obj.copy_file(self.__write_to, "fields_large.data")
    copy_obj.rename(self.__write_to, self.__file)
    os.remove(self.__write_to)

View File

@@ -0,0 +1,460 @@
from __future__ import unicode_literals, absolute_import, print_function, division
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# #
#########################################################################
import sys, os, re
from calibre.ebooks.rtf2xml import field_strings, copy
from calibre.ptempfile import better_mktemp
from . import open_for_read, open_for_write
class FieldsSmall:
    """
    =================
    Purpose
    =================
    Write tags for bookmarks, index and toc entry fields in a tokenized file.
    This module does not handle toc or index tables. (This module won't be any
    use to you unless you use it as part of the other modules.)
    -----------
    Method
    -----------
    Look for the beginning of a bookmark, index, or toc entry. When such a token
    is found, store the opening bracket count in a variable. Collect all the text
    until the closing bracket entry is found. Parse the collected text and
    write the resulting tag to the output file.
    """

    def __init__(self,
                 in_file,
                 bug_handler,
                 copy=None,
                 run_level=1,
                 ):
        """
        Required:
            'in_file'--file to parse
            'bug_handler'--exception class used to report bugs
        Optional:
            'copy'-- whether to make a copy of result for debugging
            'run_level'-- the run level (stored; default is 1)
        Returns:
            nothing
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__write_to = better_mktemp()
        self.__run_level = run_level

    def __initiate_values(self):
        """
        Initiate all values.
        """
        self.__string_obj = field_strings.FieldStrings(bug_handler=self.__bug_handler)
        self.__state = 'before_body'
        self.__text_string = ''
        self.__marker = 'mi<mk<inline-fld\n'
        self.__state_dict = {
            'before_body'   : self.__before_body_func,
            'body'          : self.__body_func,
            'bookmark'      : self.__bookmark_func,
            'toc_index'     : self.__toc_index_func,
        }
        # token -> (handler, tag passed to the handler)
        self.__body_dict = {
            'cw<an<book-mk-st'   : (self.__found_bookmark_func, 'start'),
            'cw<an<book-mk-en'   : (self.__found_bookmark_func, 'end'),
            'cw<an<toc_______'   : (self.__found_toc_index_func, 'toc'),
            'cw<an<index-mark'   : (self.__found_toc_index_func, 'index'),
        }
        # NOTE: this pattern is compiled but not referenced by any other
        # method of this class.
        ob = 'ob<nu<open-brack.....'
        cb = 'cb<nu<clos-brack'
        bk_st = 'cw<an<book-mk-st<nu<true'
        tx = 'tx<nu<__________<(.*?)'
        reg_st = ob + bk_st + tx + cb
        self.__book_start = re.compile(r'%s' % reg_st)

    def __before_body_func(self, line):
        """
        Requires:
            line --the line to parse
        Returns:
            nothing
        Logic:
            Look for the beginning of the body. When found, change the state
            to body. Always print out the line.
        """
        if self.__token_info == 'mi<mk<body-open_':
            self.__state = 'body'
        self.__write_obj.write(line)

    def __body_func(self, line):
        """
        Requires:
            line --the line to parse
        Returns:
            nothing
        Logic:
            This function handles all the lines in the body of the documents.
            Look for a bookmark, index or toc entry and take the appropriate
            action. Lines that start such an entry are not written; all other
            lines are written unchanged.
        """
        action, tag = \
            self.__body_dict.get(self.__token_info, (None, None))
        if action:
            action(line, tag)
        else:
            self.__write_obj.write(line)

    def __found_bookmark_func(self, line, tag):
        """
        Requires:
            line --the line to parse
            tag --'start' or 'end'
        Returns:
            nothing
        Logic:
            This function is called when a bookmark is found. The opening
            bracket count is stored in the beginning bracket count. The state
            is changed to 'bookmark.'
        """
        self.__beg_bracket_count = self.__ob_count
        # Reset so the end-of-entry test cannot succeed before the next
        # closing bracket is read.
        self.__cb_count = 0
        self.__state = 'bookmark'
        self.__type_of_bookmark = tag

    def __bookmark_func(self, line):
        """
        Requires:
            line --the line to parse
        Returns:
            nothing
        Logic:
            This function handles all lines within a bookmark. It adds the
            text of each text line to a string until the end of the bookmark
            is found. It then formats the string as a bookmark tag and
            prints out the marker, the tag and the closing line.
        """
        if self.__beg_bracket_count == self.__cb_count:
            self.__state = 'body'
            bookmark_type = 'bookmark-%s' % self.__type_of_bookmark
            my_string = self.__parse_bookmark_func(
                self.__text_string, bookmark_type)
            self.__write_obj.write(self.__marker)
            self.__write_obj.write(my_string)
            self.__text_string = ''
            self.__write_obj.write(line)
        elif line[0:2] == 'tx':
            self.__text_string += line[17:-1]

    def __parse_index_func(self, my_string):
        """
        Requires:
            my_string --string to parse
        Returns:
            A string for an index entry field.
        Logic:
            I want to eliminate paragraph endings, and I want to divide the
            entry into a main entry and (if it exists) a sub entry.
            First pull out the "see" text, the bookmark text and the bold and
            italics flags. Then split the string by newlines and read one
            token at a time. If the token is a special colon, end the main
            entry and start the sub entry. Text tokens are added to whichever
            entry is current; every other token (including paragraph endings)
            is ignored.
        """
        my_string, see_string = self.__index_see_func(my_string)
        my_string, bookmark_string = self.__index_bookmark_func(my_string)
        italics, bold = self.__index__format_func(my_string)
        found_sub = 0
        my_changed_string = 'mi<tg<empty-att_<field<type>index-entry'
        my_changed_string += '<update>static'
        if see_string:
            my_changed_string += '<additional-text>%s' % see_string
        if bookmark_string:
            my_changed_string += '<bookmark>%s' % bookmark_string
        if italics:
            my_changed_string += '<italics>true'
        if bold:
            my_changed_string += '<bold>true'
        main_entry = ''
        sub_entry = ''
        lines = my_string.split('\n')
        for line in lines:
            token_info = line[:16]
            if token_info == 'cw<ml<colon_____':
                found_sub = 1
            elif token_info[0:2] == 'tx':
                if found_sub:
                    sub_entry += line[17:]
                else:
                    main_entry += line[17:]
        my_changed_string += '<main-entry>%s' % main_entry
        if found_sub:
            my_changed_string += '<sub-entry>%s' % sub_entry
        my_changed_string += '\n'
        return my_changed_string

    def __index_see_func(self, my_string):
        """
        Requires:
            my_string -- all the tokens of the index entry
        Returns:
            changed_string -- the tokens outside the "see" group
            see_string -- the text found inside the "see" group
        Logic:
            Copy every line until the index-see token is found. From there
            on, collect only the text until the bracket enclosing the token
            is closed; then go back to copying lines.
        """
        in_see = 0
        bracket_count = 0
        see_string = ''
        changed_string = ''
        lines = my_string.split('\n')
        end_bracket_count = sys.maxsize
        for line in lines:
            token_info = line[:16]
            if token_info == 'ob<nu<open-brack':
                bracket_count += 1
            if token_info == 'cb<nu<clos-brack':
                bracket_count -= 1
            if in_see:
                if bracket_count == end_bracket_count and token_info == 'cb<nu<clos-brack':
                    in_see = 0
                else:
                    if token_info == 'tx<nu<__________':
                        see_string += line[17:]
            else:
                if token_info == 'cw<in<index-see_':
                    end_bracket_count = bracket_count - 1
                    in_see = 1
                changed_string += '%s\n' % line
        return changed_string, see_string

    def __index_bookmark_func(self, my_string):
        """
        Requires:
            my_string -- all the tokens of the index entry
        Returns:
            index_string -- the tokens minus the text of the bookmark
            bookmark_string -- the text string of the bookmark
        Logic:
            Copy every line until the place token is found. From there on,
            text lines go to the bookmark string and all other lines are
            still copied, until the bracket enclosing the token is closed.
        """
        # cw<an<place_____<nu<true
        in_bookmark = 0
        bracket_count = 0
        bookmark_string = ''
        index_string = ''
        lines = my_string.split('\n')
        end_bracket_count = sys.maxsize
        for line in lines:
            token_info = line[:16]
            if token_info == 'ob<nu<open-brack':
                bracket_count += 1
            if token_info == 'cb<nu<clos-brack':
                bracket_count -= 1
            if in_bookmark:
                if bracket_count == end_bracket_count and token_info == 'cb<nu<clos-brack':
                    in_bookmark = 0
                    index_string += '%s\n' % line
                else:
                    if token_info == 'tx<nu<__________':
                        bookmark_string += line[17:]
                    else:
                        index_string += '%s\n' % line
            else:
                if token_info == 'cw<an<place_____':
                    end_bracket_count = bracket_count - 1
                    in_bookmark = 1
                index_string += '%s\n' % line
        return index_string, bookmark_string

    def __index__format_func(self, my_string):
        """
        Requires:
            my_string -- all the tokens of the index entry
        Returns:
            italics, bold -- 1 if the matching token was found, else 0
        """
        italics = 0
        bold = 0
        lines = my_string.split('\n')
        for line in lines:
            token_info = line[:16]
            if token_info == 'cw<in<index-bold':
                bold = 1
            if token_info == 'cw<in<index-ital':
                italics = 1
        return italics, bold

    def __parse_toc_func(self, my_string):
        """
        Requires:
            my_string -- all the string in the toc
        Returns:
            modified string
        Logic:
            Pull the bookmark text out of the entry. Then read one token at
            a time: text tokens make up the main entry, and the toc-level
            and toc-suppress tokens become attributes of the tag.
        """
        toc_level = 0
        toc_suppress = 0
        my_string, book_start_string, book_end_string =\
            self.__parse_bookmark_for_toc(my_string)
        main_entry = ''
        my_changed_string = 'mi<tg<empty-att_<field<type>toc-entry'
        my_changed_string += '<update>static'
        if book_start_string:
            my_changed_string += '<bookmark-start>%s' % book_start_string
        if book_end_string:
            my_changed_string += '<bookmark-end>%s' % book_end_string
        lines = my_string.split('\n')
        for line in lines:
            token_info = line[:16]
            if token_info[0:2] == 'tx':
                main_entry += line[17:]
            if token_info == 'cw<tc<toc-level_':
                toc_level = line[20:]
            if token_info == 'cw<tc<toc-sup-nu':
                toc_suppress = 1
        if toc_level:
            my_changed_string += '<toc-level>%s' % toc_level
        if toc_suppress:
            my_changed_string += '<toc-suppress-number>true'
        my_changed_string += '<main-entry>%s' % main_entry
        my_changed_string += '\n'
        return my_changed_string

    def __parse_bookmark_for_toc(self, my_string):
        """
        Requires:
            my_string --string of toc, with new lines
        Returns:
            toc_string -- string minus the text of the bookmarks
            book_start_string -- text of the bookmark start
            book_end_string -- text of the bookmark end
        Logic:
            Copy every line until a bookmark token is found. From there on,
            text lines go to the start or end bookmark string and all other
            lines are still copied, until the bracket enclosing the token is
            closed.
        """
        in_bookmark = 0
        bracket_count = 0
        book_start_string = ''
        book_end_string = ''
        book_type = 0
        toc_string = ''
        lines = my_string.split('\n')
        end_bracket_count = sys.maxsize
        for line in lines:
            token_info = line[:16]
            if token_info == 'ob<nu<open-brack':
                bracket_count += 1
            if token_info == 'cb<nu<clos-brack':
                bracket_count -= 1
            if in_bookmark:
                if bracket_count == end_bracket_count and token_info == 'cb<nu<clos-brack':
                    in_bookmark = 0
                    toc_string += '%s\n' % line
                else:
                    if token_info == 'tx<nu<__________':
                        if book_type == 'start':
                            book_start_string += line[17:]
                        elif book_type == 'end':
                            book_end_string += line[17:]
                    else:
                        toc_string += '%s\n' % line
            else:
                if token_info == 'cw<an<book-mk-st' or token_info == 'cw<an<book-mk-en':
                    if token_info == 'cw<an<book-mk-st':
                        book_type = 'start'
                    if token_info == 'cw<an<book-mk-en':
                        book_type = 'end'
                    end_bracket_count = bracket_count - 1
                    in_bookmark = 1
                toc_string += '%s\n' % line
        return toc_string, book_start_string, book_end_string

    def __parse_bookmark_func(self, my_string, type):
        """
        Requires:
            my_string --string to parse
            type --type of string
        Returns:
            A string formatted for a field instruction.
        Logic:
            The type is the name (either bookmark-end or bookmark-start). The
            id is the complete text string.
        """
        my_changed_string = ('mi<tg<empty-att_<field<type>%s'
        '<number>%s<update>none\n' % (type, my_string))
        return my_changed_string

    def __found_toc_index_func(self, line, tag):
        """
        Requires:
            line --the line to parse
            tag --'toc' or 'index'
        Returns:
            nothing
        Logic:
            This function is called when a toc or index entry is found. The opening
            bracket count is stored in the beginning bracket count. The state
            is changed to 'toc_index.'
        """
        self.__beg_bracket_count = self.__ob_count
        # Reset so the end-of-entry test cannot succeed before the next
        # closing bracket is read.
        self.__cb_count = 0
        self.__state = 'toc_index'
        self.__tag = tag

    def __toc_index_func(self, line):
        """
        Requires:
            line --the line to parse
        Returns:
            nothing
        Logic:
            This function handles all lines within a toc or index entry. It
            adds each line to a string until the end of the entry is found. It
            then parses the string as an index or a toc entry, and
            prints out the marker, the resulting tag and the closing line.
        """
        if self.__beg_bracket_count == self.__cb_count:
            self.__state = 'body'
            if self.__tag == 'index':
                my_string = self.__parse_index_func(
                    self.__text_string)
            else:
                # 'toc' is the only other tag registered in __body_dict.
                my_string = self.__parse_toc_func(
                    self.__text_string)
            self.__write_obj.write(self.__marker)
            self.__write_obj.write(my_string)
            self.__text_string = ''
            self.__write_obj.write(line)
        else:
            self.__text_string += line

    def __process_line(self, line):
        """
        Requires:
            line --the line to parse
        Returns:
            nothing
        Logic:
            Record the token and the latest bracket counts, then hand the
            line to the handler for the current state. If no handler exists
            for the state, report it on stderr and write the line unchanged
            rather than calling a missing handler.
        """
        self.__token_info = line[:16]
        if self.__token_info == 'ob<nu<open-brack':
            self.__ob_count = line[-5:-1]
        if self.__token_info == 'cb<nu<clos-brack':
            self.__cb_count = line[-5:-1]
        action = self.__state_dict.get(self.__state)
        if action is None:
            sys.stderr.write('No matching state in module fields_small.py\n')
            sys.stderr.write(self.__state + '\n')
            self.__write_obj.write(line)
            return
        action(line)

    def fix_fields(self):
        """
        Requires:
            nothing
        Returns:
            nothing (changes the original file)
        Logic:
            Read one line in at a time. Determine what action to take based on
            the state. If the state is before the body, look for the
            beginning of the body.
            The other two states are toc_index (for toc and index entries) and
            bookmark.
            When the whole file has been processed, optionally copy the
            result for debugging, then replace the original file with it.
        """
        self.__initiate_values()
        with open_for_read(self.__file) as read_obj:
            with open_for_write(self.__write_to) as self.__write_obj:
                for line in read_obj:
                    self.__process_line(line)
        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "fields_small.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)

View File

@@ -0,0 +1,226 @@
from __future__ import unicode_literals, absolute_import, print_function, division
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# #
#########################################################################
import sys, os
from calibre.ebooks.rtf2xml import copy
from calibre.ptempfile import better_mktemp
from . import open_for_read, open_for_write
class Fonts:
    """
    Change lines with font info from font numbers to the actual font names.
    """

    def __init__(self,
                 in_file,
                 bug_handler,
                 default_font_num,
                 copy=None,
                 run_level=1,
                 ):
        """
        Required:
            'in_file'--file to parse
            'bug_handler'--exception class used to report bugs
            'default_font_num'--the default font number
        Optional:
            'copy'-- whether to make a copy of result for debugging
            'run_level'-- the run level; above 3, an unknown font number
                raises an error (default is 1)
        Returns:
            nothing
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__default_font_num = default_font_num
        self.__write_to = better_mktemp()
        self.__run_level = run_level

    def __initiate_values(self):
        """
        Initiate all values.
        """
        # 1 is stored for a special font once a line using it is found
        self.__special_font_dict = {
            'Symbol'        : 0,
            'Wingdings'     : 0,
            'Zapf Dingbats' : 0,
        }
        self.__special_font_list = [
            'Symbol', 'Wingdings', 'Zapf Dingbats'
        ]
        self.__state = 'default'
        self.__state_dict = {
            'default'           : self.__default_func,
            'font_table'        : self.__font_table_func,
            'after_font_table'  : self.__after_font_table_func,
            'font_in_table'     : self.__font_in_table_func,
        }
        # font number (as a string) -> font name
        self.__font_table = {}
        # individual font written
        self.__wrote_ind_font = 0

    def __default_func(self, line):
        """
        Requires:
            line
        Returns:
            nothing
        Logic:
            Handle all lines before the font table. Check for the beginning
            of the font table. If found, change the state. Print out all
            lines.
        """
        if self.__token_info == 'mi<mk<fonttb-beg':
            self.__state = 'font_table'
        self.__write_obj.write(line)

    def __font_table_func(self, line):
        """
        Requires:
            line
        Returns:
            nothing
        Logic:
            If the self.__token_info indicates that you have reached the end of
            the font table, then change the state to after the font table.
            If the self.__token_info indicates that there is a font in the
            table, change the state to font in table. Reset the number of the
            font to the default font (in case there is no number provided, in
            which case RTF assumes the number will be the default font.) Reset
            the text string (for the font name) to ''.
            Nothing is written to the output in this state.
        """
        if self.__token_info == 'mi<mk<fonttb-end':
            self.__state = 'after_font_table'
        elif self.__token_info == 'mi<mk<fontit-beg':
            self.__state = 'font_in_table'
            self.__font_num = self.__default_font_num
            self.__text_line = ''

    def __font_in_table_func(self, line):
        """
        Requires:
            line
        Returns:
            nothing
        Logic:
            Check for four conditions:
            The line marks the end of the individual font in the table. In
                this case, add a new key-> value pair to the font-table
                dictionary. Also create an empty tag with the name and number
                as attributes.
            The line contains font-info. In this case, store the number in
                self.__font_num.
            The line contains text. In this case, add to the text string
                self.__text_line.
            Premature end of the font table.
        """
        # cw<ci<font-style<nu<4
        # tx<nu<__________<Times;
        if self.__token_info == 'mi<mk<fontit-end':
            self.__wrote_ind_font = 1
            self.__state = 'font_table'
            # Drop the ';' that terminates the font name. Only strip it when
            # it is really there, so a name written without the terminator
            # does not lose its last letter.
            if self.__text_line.endswith(';'):
                self.__text_line = self.__text_line[:-1]
            self.__font_table[self.__font_num] = self.__text_line
            self.__write_obj.write(
                'mi<tg<empty-att_'
                '<font-in-table<name>%s<num>%s\n' % (self.__text_line, self.__font_num)
            )
        elif self.__token_info == 'cw<ci<font-style':
            self.__font_num = line[20:-1]
        elif self.__token_info == 'tx<nu<__________' or \
                self.__token_info == 'tx<ut<__________':
            self.__text_line += line[17:-1]
        elif self.__token_info == 'mi<mk<fonttb-end':
            self.__found_end_font_table_func()
            self.__state = 'after_font_table'

    def __found_end_font_table_func(self):
        """
        Required:
            nothing
        Returns:
            nothing
        Logic:
            If no individual fonts have been written, write one out.
        """
        if not self.__wrote_ind_font:
            self.__write_obj.write(
                'mi<tg<empty-att_'
                '<font-in-table<name>Times<num>0\n')

    def __after_font_table_func(self, line):
        """
        Required:
            line
        Returns:
            nothing
        Logic:
            Check the self.__token_info. If this matches a token with font
            info, then extract the number from the line, and look up the font
            name in the font dictionary. If no name exists for that number,
            raise an error when the run level is above 3; otherwise the line
            is dropped. If a name exists, print out the same line, except
            with the name rather than the number, and record the use of a
            special font.
            If the line does not contain font info, simply print it out to the
            file.
        """
        if self.__token_info == 'cw<ci<font-style':
            font_num = line[20:-1]
            font_name = self.__font_table.get(font_num)
            if font_name is None:
                if self.__run_level > 3:
                    msg = 'no value for %s in self.__font_table\n' % font_num
                    raise self.__bug_handler(msg)
            else:
                if font_name in self.__special_font_list:
                    self.__special_font_dict[font_name] = 1
                self.__write_obj.write(
                    'cw<ci<font-style<nu<%s\n' % font_name
                )
        else:
            self.__write_obj.write(line)

    def __process_line(self, line):
        """
        Required:
            line
        Returns:
            nothing
        Logic:
            Record the token and hand the line to the handler for the
            current state. If no handler exists for the state, report it on
            stderr and write the line unchanged rather than calling a
            missing handler.
        """
        self.__token_info = line[:16]
        action = self.__state_dict.get(self.__state)
        if action is None:
            sys.stderr.write('no matching state in module fonts.py\n' + self.__state + '\n')
            self.__write_obj.write(line)
            return
        action(line)

    def convert_fonts(self):
        """
        Required:
            nothing
        Returns:
            a dictionary with values for the special fonts, plus the name of
            the default font under the key 'default-font'
        Logic:
            Read one line in at a time. Determine what action to take based on
            the state. If the state is font_table, look for individual fonts
            and add the number and font name to a dictionary. Also create a
            tag for each individual font in the font table.
            If the state is after the font table, look for lines with font
            info. Substitute a font name for a font number.
        """
        self.__initiate_values()
        with open_for_read(self.__file) as read_obj:
            with open_for_write(self.__write_to) as self.__write_obj:
                for line in read_obj:
                    self.__process_line(line)
        default_font_name = self.__font_table.get(self.__default_font_num)
        if not default_font_name:
            default_font_name = 'Not Defined'
        self.__special_font_dict['default-font'] = default_font_name
        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "fonts.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)
        return self.__special_font_dict

View File

@@ -0,0 +1,264 @@
from __future__ import absolute_import, division, print_function, unicode_literals
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# #
#########################################################################
import os
from calibre.ebooks.rtf2xml import copy
from calibre.ptempfile import better_mktemp
from polyglot.builtins import unicode_type
from . import open_for_read, open_for_write
class Footnote:
"""
Two public methods are available. The first separates all of the
footnotes from the body and puts them at the bottom of the text, where
they are easier to process. The second joins those footnotes to the
proper places in the body.
"""
def __init__(self,
in_file ,
bug_handler,
copy=None,
run_level=1,
):
self.__file = in_file
self.__bug_handler = bug_handler
self.__copy = copy
self.__write_to = better_mktemp()
self.__found_a_footnote = 0
def __first_line_func(self, line):
"""
Print the tag info for footnotes. Check whether footnote is an
endnote and make the tag according to that.
"""
if self.__token_info == 'cw<nt<type______':
self.__write_to_foot_obj.write(
'mi<tg<open-att__<footnote<type>endnote<num>%s\n' % self.__footnote_count)
else:
self.__write_to_foot_obj.write(
'mi<tg<open-att__<footnote<num>%s\n' % self.__footnote_count)
self.__first_line = 0
def __in_footnote_func(self, line):
"""Handle all tokens that are part of footnote"""
if self.__first_line:
self.__first_line_func(line)
if self.__token_info == 'cw<ci<footnot-mk':
num = unicode_type(self.__footnote_count)
self.__write_to_foot_obj.write(line)
self.__write_to_foot_obj.write(
'tx<nu<__________<%s\n' % num
)
if self.__cb_count == self.__footnote_bracket_count:
self.__in_footnote = 0
self.__write_obj.write(line)
self.__write_to_foot_obj.write(
'mi<mk<foot___clo\n')
self.__write_to_foot_obj.write(
'mi<tg<close_____<footnote\n')
self.__write_to_foot_obj.write(
'mi<mk<footnt-clo\n')
else:
self.__write_to_foot_obj.write(line)
def __found_footnote(self, line):
""" Found a footnote"""
self.__found_a_footnote = 1
self.__in_footnote = 1
self.__first_line = 1
self.__footnote_count += 1
# temporarily set this to zero so I can enter loop
self.__cb_count = 0
self.__footnote_bracket_count = self.__ob_count
self.__write_obj.write(
'mi<mk<footnt-ind<%04d\n' % self.__footnote_count)
self.__write_to_foot_obj.write(
'mi<mk<footnt-ope<%04d\n' % self.__footnote_count)
def __default_sep(self, line):
"""Handle all tokens that are not footnote tokens"""
if self.__token_info == 'cw<nt<footnote__':
self.__found_footnote(line)
self.__write_obj.write(line)
if self.__token_info == 'cw<ci<footnot-mk':
num = unicode_type(self.__footnote_count + 1)
self.__write_obj.write(
'tx<nu<__________<%s\n' % num
)
def __initiate_sep_values(self):
"""
initiate counters for separate_footnotes method.
"""
self.__bracket_count=0
self.__ob_count = 0
self.__cb_count = 0
self.__footnote_bracket_count = 0
self.__in_footnote = 0
self.__first_line = 0 # have not processed the first line of footnote
self.__footnote_count = 0
def separate_footnotes(self):
"""
Separate all the footnotes in an RTF file and put them at the bottom,
where they are easier to process. Each time a footnote is found,
print all of its contents to a temporary file. Close both the main and
temporary file. Print the footnotes from the temporary file to the
bottom of the main file.
"""
self.__initiate_sep_values()
self.__footnote_holder = better_mktemp()
with open_for_read(self.__file) as read_obj:
with open_for_write(self.__write_to) as self.__write_obj:
with open_for_write(self.__footnote_holder) as self.__write_to_foot_obj:
for line in read_obj:
self.__token_info = line[:16]
# keep track of opening and closing brackets
if self.__token_info == 'ob<nu<open-brack':
self.__ob_count = line[-5:-1]
if self.__token_info == 'cb<nu<clos-brack':
self.__cb_count = line[-5:-1]
# In the middle of footnote text
if self.__in_footnote:
self.__in_footnote_func(line)
# not in the middle of footnote text
else:
self.__default_sep(line)
with open_for_read(self.__footnote_holder) as read_obj:
with open_for_write(self.__write_to, append=True) as write_obj:
write_obj.write(
'mi<mk<sect-close\n'
'mi<mk<body-close\n'
'mi<tg<close_____<section\n'
'mi<tg<close_____<body\n'
'mi<tg<close_____<doc\n'
'mi<mk<footnt-beg\n')
for line in read_obj:
write_obj.write(line)
write_obj.write(
'mi<mk<footnt-end\n')
os.remove(self.__footnote_holder)
copy_obj = copy.Copy(bug_handler=self.__bug_handler)
if self.__copy:
copy_obj.copy_file(self.__write_to, "footnote_separate.data")
copy_obj.rename(self.__write_to, self.__file)
os.remove(self.__write_to)
def update_info(self, file, copy):
"""
Unused method
"""
self.__file = file
self.__copy = copy
def __get_foot_body_func(self, line):
"""
Process lines in main body and look for beginning of footnotes.
"""
# mi<mk<footnt-end
if self.__token_info == 'mi<mk<footnt-beg':
self.__state = 'foot'
else:
self.__write_obj.write(line)
def __get_foot_foot_func(self, line):
"""
Copy footnotes from bottom of file to a separate, temporary file.
"""
if self.__token_info == 'mi<mk<footnt-end':
self.__state = 'body'
else:
self.__write_to_foot_obj.write(line)
def __get_footnotes(self):
"""
Private method to remove footnotes from main file. Read one line from
the main file at a time. If the state is 'body', call on the private
__get_foot_foot_func. Otherwise, call on the __get_foot_body_func.
These two functions do the work of separating the footnotes form the
body.
"""
with open_for_read(self.__file) as read_obj:
with open_for_write(self.__write_to) as self.__write_obj:
with open_for_write(self.__footnote_holder) as self.__write_to_foot_obj:
for line in read_obj:
self.__token_info = line[:16]
if self.__state == 'body':
self.__get_foot_body_func(line)
elif self.__state == 'foot':
self.__get_foot_foot_func(line)
def __get_foot_from_temp(self, num):
"""
Private method for joining footnotes to body. This method reads from
the temporary file until the proper footnote marker is found. It
collects all the tokens until the end of the footnote, and returns
them as a string.
"""
look_for = 'mi<mk<footnt-ope<' + num + '\n'
found_foot = 0
string_to_return = ''
for line in self.__read_from_foot_obj:
if found_foot:
if line == 'mi<mk<footnt-clo\n':
return string_to_return
string_to_return = string_to_return + line
else:
if line == look_for:
found_foot = 1
def __join_from_temp(self):
"""
Private method for rejoining footnotes to body. Read from the
newly-created, temporary file that contains the body text but no
footnotes. Each time a footnote marker is found, call the private
method __get_foot_from_temp(). This method will return a string to
print out to the third file.
If no footnote marker is found, simply print out the token (line).
"""
with open_for_read(self.__footnote_holder) as self.__read_from_foot_obj:
with open_for_read(self.__write_to) as read_obj:
with open_for_write(self.__write_to2) as self.__write_obj:
for line in read_obj:
if line[:16] == 'mi<mk<footnt-ind':
line = self.__get_foot_from_temp(line[17:-1])
self.__write_obj.write(line)
def join_footnotes(self):
    """
    Join the footnotes from the bottom of the file and put them in their
    former places.

    Does nothing when no footnote was found. Otherwise:
      1. __get_footnotes() splits the input into a body file
         (self.__write_to) and a footnote file (self.__footnote_holder).
      2. __join_from_temp() merges the two into self.__write_to2.
      3. The merged file is copied over the input file (and, when the
         'copy' flag is set, handed to the debug copier first).
      4. All three temporary files are removed.
    """
    if not self.__found_a_footnote:
        return
    self.__write_to2 = better_mktemp()
    self.__state = 'body'
    self.__get_footnotes()
    self.__join_from_temp()
    copy_obj = copy.Copy(bug_handler=self.__bug_handler)
    if self.__copy:
        copy_obj.copy_file(self.__write_to2, "footnote_joined.data")
    copy_obj.rename(self.__write_to2, self.__file)
    os.remove(self.__write_to2)
    os.remove(self.__footnote_holder)
    # __get_footnotes() (re)creates the intermediate body file; it used to
    # be left behind in the temp directory after every run.
    os.remove(self.__write_to)

View File

@@ -0,0 +1,62 @@
from __future__ import unicode_literals, absolute_import, print_function, division
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# #
#########################################################################
class GetCharMap:
    """
    Return the character map for the given value.

    The character file is a seekable, line-iterable text object. A map is
    the run of lines between a line containing '<name>' and a line
    containing '</name>'. Each line inside a map is colon-separated; the
    second field is used as the key and the fourth field as the value.
    """

    def __init__(self, bug_handler, char_file):
        """
        Required:
            'bug_handler'--exception class raised when a map is missing
            'char_file'--the file with the mappings
        Returns:
            nothing
        """
        self.__char_file = char_file
        self.__bug_handler = bug_handler

    def get_char_map(self, map):
        """
        Required:
            'map'--name of the map to load
        Returns:
            a dictionary mapping field 2 of each line to field 4
        Raises:
            the bug handler given to the constructor if no opening
            element for 'map' is found in the file
        """
        # if map == 'ansicpg10000':
        # map = 'mac_roman'
        begin_element = '<%s>' % map
        end_element = '</%s>' % map
        found_map = False
        map_dict = {}
        self.__char_file.seek(0)
        for line in self.__char_file:
            if not line.strip():
                continue
            if not found_map:
                if begin_element in line:
                    found_map = True
                continue
            if end_element in line:
                break
            fields = line.split(':')
            # ':' is the field separator, so a literal colon key is spelled
            # '\colon' in the file. str.replace returns a new string; the
            # result used to be thrown away, leaving the escape in the key.
            key = fields[1].replace('\\colon', ':')
            map_dict[key] = fields[3]
        if not found_map:
            msg = 'no map found\nmap is "%s"\n' % (map,)
            raise self.__bug_handler(msg)
        return map_dict

View File

@@ -0,0 +1,306 @@
from __future__ import unicode_literals, absolute_import, print_function, division
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# #
#########################################################################
import sys, os, re
from calibre.ebooks.rtf2xml import copy
from calibre.ptempfile import better_mktemp
from . import open_for_read, open_for_write
class GroupBorders:
    """
    Group bordered paragraphs.

    Scans the token file line by line. When a paragraph-definition line
    carries border attributes, those attributes are lifted out of the
    paragraph definition and written on a surrounding 'border-group'
    element. A following paragraph definition with exactly the same border
    string stays in the same group; anything else (a different border
    string, no border, or one of the tokens in the end list) closes it.
    """

    def __init__(self,
                 in_file,
                 bug_handler,
                 copy=None,
                 run_level=1,
                 wrap=0,
                 ):
        """
        Required:
            'in_file'--path of the token file to rewrite
            'bug_handler'--exception class raised on internal errors
        Optional:
            'copy'-- whether to make a copy of result for debugging
            'run_level'--above 2, a wrong flag raises instead of only
            being reported on stderr
            'wrap'--stored; not read by the methods of this class
        Returns:
            nothing
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__run_level = run_level
        self.__write_to = better_mktemp()
        self.__wrap = wrap

    def __initiate_values(self):
        """
        Required:
            Nothing
        Return:
            Nothing
        Logic:
            Reset the state machine. self.__end_list holds the tokens that
            force an open border group to end.
        """
        self.__state = "default"
        self.__left_indent = 0
        self.__border_num = 0
        self.__list_type = 'not-defined'
        self.__pard_def = ""
        self.__all_lists = []
        self.__list_chunk = ''
        self.__state_dict = {
            'default': self.__default_func,
            'in_pard': self.__in_pard_func,
            'after_pard': self.__after_pard_func,
        }
        self.__end_list = [
            # section end
            'mi<mk<sect-close',
            'mi<mk<sect-start',
            # table begin
            'mi<mk<tabl-start',
            # field block begin
            'mi<mk<fldbk-end_',
            'mi<mk<fldbkstart',
            # cell end
            'mi<mk<close_cell',
            # item end
            'mi<tg<item_end__',
            # footnote end
            'mi<mk<foot___clo',
            'mi<mk<footnt-ope',
            # heading end
            'mi<mk<header-beg',
            'mi<mk<header-end',
            'mi<mk<head___clo',
            # lists
            # (a missing comma here used to glue 'mi<mk<list_start' and
            # 'mi<mk<style-grp_' into one string that never matched)
            'mi<mk<list_start',
            # style-group
            'mi<mk<style-grp_',
            'mi<mk<style_grp_',
            'mi<mk<style_gend',
            'mi<mk<stylegend_',
            # don't use
            # 'mi<mk<body-close',
            # 'mi<mk<par-in-fld',
            # 'cw<tb<cell______',
            # 'cw<tb<row-def___',
            # 'cw<tb<row_______',
            # 'mi<mk<sec-fd-beg',
        ]
        # <name>Normal<
        self.__name_regex = re.compile(r'(<name>[^<]+)')
        self.__found_appt = 0
        self.__line_num = 0
        self.__border_regex = re.compile(
            r'(<border-paragraph[^<]+|<border-for-every-paragraph[^<]+)')
        self.__last_border_string = ''

    def __in_pard_func(self, line):
        """
        Required:
            line -- the line of current text.
        Return:
            Nothing
        Logic:
            Inside a bordered paragraph definition. Copy lines through
            until the closing paragraph-definition tag; that tag is held
            back (not written) and the state becomes 'after_pard'.
        """
        if self.__token_info == 'mi<tg<close_____' \
                and line[17:-1] == 'paragraph-definition':
            self.__state = 'after_pard'
        else:
            self.__write_obj.write(line)

    def __after_pard_func(self, line):
        """
        Required:
            line -- the line of current text.
        Return:
            Nothing
        Logic:
            A bordered paragraph definition has just ended but its closing
            tag has not been written yet. Buffer lines in
            self.__list_chunk until either a new paragraph definition or a
            token from self.__end_list decides whether the group goes on.
        """
        if self.__token_info == 'mi<tg<open-att__' \
                and line[17:37] == 'paragraph-definition':
            # found paragraph definition
            self.__pard_after_par_def_func(line)
        elif self.__token_info == 'mi<tg<close_____' \
                and line[17:-1] == 'paragraph-definition':
            sys.stderr.write('Wrong flag in __after_pard_func\n')
            if self.__run_level > 2:
                msg = 'wrong flag'
                raise self.__bug_handler(msg)
        elif self.__token_info in self.__end_list:
            self.__write_obj.write('mi<tg<close_____<paragraph-definition\n')
            self.__write_end_border_tag()
            self.__write_obj.write(self.__list_chunk)
            self.__list_chunk = ''
            self.__state = 'default'
            self.__write_obj.write(line)
        else:
            self.__list_chunk += line

    def __close_pard_(self, line):
        """
        Flush the buffered chunk, close the paragraph definition and the
        border group, and return to the default state. Not called from
        within this class.
        """
        self.__write_obj.write(self.__list_chunk)
        self.__write_obj.write('mi<tg<close_____<paragraph-definition\n')
        # was self.__write_end_wrap(), which this class does not define
        self.__write_end_border_tag()
        self.__list_chunk = ''
        self.__state = 'default'

    def __pard_after_par_def_func(self, line):
        """
        Required:
            line -- the paragraph-definition line that follows a bordered
            paragraph definition.
        Return:
            Nothing
        Logic:
            No border: close the group and go back to the default state.
            Same border string as before: stay in the group.
            Different border string: close the group and open a new one.
        """
        close_pard = 'mi<tg<close_____<paragraph-definition\n'
        write = self.__write_obj.write
        if not self.__is_border_func(line):
            write(close_pard)
            self.__write_end_border_tag()
            write(self.__list_chunk)
            write(line)
            self.__state = 'default'
            self.__list_chunk = ''
            return
        border_string, pard_string = self.__parse_pard_with_border(line)
        write(close_pard)
        if self.__last_border_string == border_string:
            # just keep going
            write(self.__list_chunk)
        else:
            # different border for the paragraph definition
            self.__write_end_border_tag()
            write(self.__list_chunk)
            self.__write_start_border_tag(border_string)
            self.__last_border_string = border_string
        write(pard_string)
        self.__list_chunk = ''
        self.__state = 'in_pard'

    def __default_func(self, line):
        """
        Required:
            self, line
        Returns:
            Nothing
        Logic
            Look for the start of a paragraph definition. If it carries a
            border, open a border group, write the definition without its
            border attributes and change the state to in_pard. Every other
            line is copied through unchanged.
        """
        if self.__token_info == 'mi<tg<open-att__' \
                and line[17:37] == 'paragraph-definition' \
                and self.__is_border_func(line):
            border_string, pard_string = self.__parse_pard_with_border(line)
            self.__write_start_border_tag(border_string)
            self.__write_obj.write(pard_string)
            self.__last_border_string = border_string
            self.__state = 'in_pard'
        else:
            self.__write_obj.write(line)

    def __write_start_border_tag(self, the_string):
        """
        Write the opening marker and tag of a border group. The group gets
        the border attributes in 'the_string' plus a running number
        formatted as 's0001', 's0002', ...
        """
        self.__write_obj.write('mi<mk<start-brdg\n')
        self.__border_num += 1
        the_string += '<num>s%04d' % self.__border_num
        self.__write_obj.write(
            'mi<tg<open-att__<border-group%s\n' % the_string)

    def __write_end_border_tag(self):
        """Write the closing marker and tag of a border group."""
        self.__write_obj.write('mi<mk<end-brdg__\n')
        self.__write_obj.write('mi<tg<close_____<border-group\n')

    def __is_border_func(self, line):
        """
        Return 1 if the line mentions 'border-paragraph' anywhere outside
        its <name> attribute, 0 otherwise.
        """
        # a style that merely has 'border-paragraph' in its name is not a
        # bordered paragraph, so the name attribute is removed first
        line = re.sub(self.__name_regex, '', line)
        if line.find('border-paragraph') > -1:
            return 1
        return 0

    def __parse_pard_with_border(self, line):
        """
        Split a paragraph-definition line into its border attributes and
        the rest.

        Returns:
            (border_string, pard_string). Only pieces that start with
            '<border-paragraph' go to border_string; everything else,
            including '<border-for-every-paragraph...' pieces, stays in
            pard_string.
        """
        border_string = ''
        pard_string = ''
        # the pattern has a capturing group, so re.split keeps the matches
        for token in re.split(self.__border_regex, line):
            if token[0:17] == '<border-paragraph':
                border_string += token
            else:
                pard_string += token
        return border_string, pard_string

    def __write_pard_with_border(self, line):
        """
        Open a border group for the line's border attributes and write the
        paragraph definition without them. Not called from within this
        class.
        """
        border_string, pard_string = self.__parse_pard_with_border(line)
        self.__write_start_border_tag(border_string)
        self.__write_obj.write(pard_string)

    def __get_style_name(self, line):
        """Remember the name carried by the latest style-name marker."""
        if self.__token_info == 'mi<mk<style-name':
            self.__style_name = line[17:-1]

    def group_borders(self):
        """
        Required:
            nothing
        Returns:
            original file will be changed
        Logic:
            Run every line of the input through the handler for the
            current state, writing to a temporary file, then copy that
            file over the input and remove it.
        """
        self.__initiate_values()
        # context managers close both files even if a handler raises
        with open_for_read(self.__file) as read_obj, \
                open_for_write(self.__write_to) as self.__write_obj:
            for line in read_obj:
                self.__token_info = line[:16]
                self.__get_style_name(line)
                action = self.__state_dict.get(self.__state)
                action(line)
        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "group_borders.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)

View File

@@ -0,0 +1,252 @@
from __future__ import unicode_literals, absolute_import, print_function, division
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# #
#########################################################################
import sys, os, re
from calibre.ebooks.rtf2xml import copy
from calibre.ptempfile import better_mktemp
from . import open_for_read, open_for_write
class GroupStyles:
    """
    Group paragraphs by style.

    Scans the token file line by line. Consecutive paragraph definitions
    that follow the same style-name marker are treated as one run. With
    'wrap' on, each run is enclosed in a 'style-group' element; with
    'wrap' off, the close/open tags between same-style paragraph
    definitions are dropped, so the run becomes one paragraph definition.
    """

    def __init__(self,
                 in_file,
                 bug_handler,
                 copy=None,
                 run_level=1,
                 wrap=0,
                 ):
        """
        Required:
            'in_file'--path of the token file to rewrite
            'bug_handler'--exception class raised on internal errors
        Optional:
            'copy'-- whether to make a copy of result for debugging
            'run_level'--above 2, a wrong flag raises instead of only
            being reported on stderr
            'wrap'--whether to write style-group tags
        Returns:
            nothing
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__run_level = run_level
        self.__write_to = better_mktemp()
        self.__wrap = wrap

    def __initiate_values(self):
        """
        Required:
            Nothing
        Return:
            Nothing
        Logic:
            Reset the state machine. self.__end_list holds the tokens that
            force an open run of same-style paragraphs to end.
        """
        self.__state = "default"
        self.__left_indent = 0
        self.__list_type = 'not-defined'
        self.__pard_def = ""
        self.__all_lists = []
        self.__list_chunk = ''
        # Both names are read as soon as a paragraph definition shows up;
        # give them a value so that a definition arriving before any
        # style-name marker does not die with an AttributeError.
        self.__style_name = ''
        self.__last_style_name = ''
        self.__state_dict = {
            'default': self.__default_func,
            'in_pard': self.__in_pard_func,
            'after_pard': self.__after_pard_func,
        }
        self.__end_list = [
            # section end
            'mi<mk<sect-close',
            'mi<mk<sect-start',
            # table begin
            'mi<mk<tabl-start',
            # field block begin
            'mi<mk<fldbk-end_',
            'mi<mk<fldbkstart',
            # cell end
            'mi<mk<close_cell',
            # item end
            'mi<tg<item_end__',
            # footnote end
            'mi<mk<foot___clo',
            'mi<mk<footnt-ope',
            # heading end
            'mi<mk<header-beg',
            'mi<mk<header-end',
            'mi<mk<head___clo',
            # lists
            'mi<mk<list_start',
            # don't use
            # 'mi<mk<body-close',
            # 'mi<mk<par-in-fld',
            # 'cw<tb<cell______',
            # 'cw<tb<row-def___',
            # 'cw<tb<row_______',
            # 'mi<mk<sec-fd-beg',
        ]
        self.__name_regex = re.compile(r'<name>')
        self.__found_appt = 0
        self.__line_num = 0

    def __in_pard_func(self, line):
        """
        Required:
            line -- the line of current text.
        Return:
            Nothing
        Logic:
            Inside a paragraph definition. Copy lines through until the
            closing paragraph-definition tag; that tag is held back (not
            written) and the state becomes 'after_pard'.
        """
        if self.__token_info == 'mi<tg<close_____' \
                and line[17:-1] == 'paragraph-definition':
            self.__state = 'after_pard'
        else:
            self.__write_obj.write(line)

    def __after_pard_func(self, line):
        """
        Required:
            line -- the line of current text.
        Return:
            Nothing
        Logic:
            A paragraph definition has just ended but its closing tag has
            not been written yet. Buffer lines in self.__list_chunk until
            either a new paragraph definition or a token from
            self.__end_list decides whether the run goes on.
        """
        if self.__token_info == 'mi<tg<open-att__' \
                and line[17:37] == 'paragraph-definition':
            # found paragraph definition
            self.__pard_after_par_def_func(line)
        elif self.__token_info == 'mi<tg<close_____' \
                and line[17:-1] == 'paragraph-definition':
            sys.stderr.write('Wrong flag in __after_pard_func\n')
            if self.__run_level > 2:
                msg = 'wrong flag'
                raise self.__bug_handler(msg)
        elif self.__token_info in self.__end_list:
            self.__write_obj.write('mi<tg<close_____<paragraph-definition\n')
            self.__write_end_wrap()
            self.__write_obj.write(self.__list_chunk)
            self.__list_chunk = ''
            self.__state = 'default'
            self.__write_obj.write(line)
        else:
            self.__list_chunk += line

    def __close_pard_(self, line):
        """
        Flush the buffered chunk, close the paragraph definition and the
        style group, and return to the default state. Not called from
        within this class.
        """
        self.__write_obj.write(self.__list_chunk)
        self.__write_obj.write('mi<tg<close_____<paragraph-definition\n')
        self.__write_end_wrap()
        self.__list_chunk = ''
        self.__state = 'default'

    def __write_start_wrap(self, name):
        """Write the opening of a style group, if wrapping is on."""
        if self.__wrap:
            self.__write_obj.write('mi<mk<style-grp_<%s\n' % name)
            self.__write_obj.write(
                'mi<tg<open-att__<style-group<name>%s\n' % name)
            self.__write_obj.write('mi<mk<style_grp_<%s\n' % name)

    def __write_end_wrap(self):
        """Write the closing of a style group, if wrapping is on."""
        if self.__wrap:
            self.__write_obj.write('mi<mk<style_gend\n')
            self.__write_obj.write('mi<tg<close_____<style-group\n')
            self.__write_obj.write('mi<mk<stylegend_\n')

    def __pard_after_par_def_func(self, line):
        """
        Required:
            line -- the paragraph-definition line that follows another
            paragraph definition.
        Return:
            Nothing
        Logic:
            Same style name as the previous definition: keep going. The
            close/open pair is only written when wrapping is on.
            Different style name: close the old run and start a new one.
        """
        close_pard = 'mi<tg<close_____<paragraph-definition\n'
        write = self.__write_obj.write
        if self.__last_style_name == self.__style_name:
            # just keep going
            if self.__wrap:
                write(close_pard)
            write(self.__list_chunk)
            if self.__wrap:
                write(line)
        else:
            # different name for the paragraph definition
            write(close_pard)
            self.__write_end_wrap()
            write(self.__list_chunk)
            self.__write_start_wrap(self.__style_name)
            write(line)
            self.__last_style_name = self.__style_name
        self.__list_chunk = ''
        self.__state = 'in_pard'

    def __default_func(self, line):
        """
        Required:
            self, line
        Returns:
            Nothing
        Logic
            Look for the start of a paragraph definition. If one is found,
            remember the current style name, open a style group for it and
            change the state to in_pard. The line itself is always
            written.
        """
        if self.__token_info == 'mi<tg<open-att__' \
                and line[17:37] == 'paragraph-definition':
            self.__state = 'in_pard'
            self.__last_style_name = self.__style_name
            self.__write_start_wrap(self.__last_style_name)
        self.__write_obj.write(line)

    def __get_style_name(self, line):
        """Remember the name carried by the latest style-name marker."""
        if self.__token_info == 'mi<mk<style-name':
            self.__style_name = line[17:-1]

    def group_styles(self):
        """
        Required:
            nothing
        Returns:
            original file will be changed
        Logic:
            Run every line of the input through the handler for the
            current state, writing to a temporary file, then copy that
            file over the input and remove it.
        """
        self.__initiate_values()
        # context managers close both files even if a handler raises
        with open_for_read(self.__file) as read_obj, \
                open_for_write(self.__write_to) as self.__write_obj:
            for line in read_obj:
                self.__token_info = line[:16]
                self.__get_style_name(line)
                action = self.__state_dict.get(self.__state)
                action(line)
        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "group_styles.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)

View File

@@ -0,0 +1,261 @@
from __future__ import unicode_literals, absolute_import, print_function, division
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# #
#########################################################################
import sys, os
from calibre.ebooks.rtf2xml import copy
from calibre.ptempfile import better_mktemp
from . import open_for_read, open_for_write
class Header:
    """
    Two public methods are available. The first separates all of the headers
    and footers from the body and puts them at the bottom of the text, where
    they are easier to process. The second takes that bottom block out of
    the file again (see the review note on join_headers()).
    """

    def __init__(self,
                 in_file,
                 bug_handler,
                 copy=None,
                 run_level=1,
                 ):
        """
        Required:
            'in_file'--path of the token file to rewrite
            'bug_handler'--passed on to the debug copier
        Optional:
            'copy'-- whether to make a copy of result for debugging
            'run_level'--accepted but not used by this class
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__write_to = better_mktemp()
        self.__found_a_header = False

    def __in_header_func(self, line):
        """
        Handle all tokens that are part of header
        """
        if self.__cb_count == self.__header_bracket_count:
            # the bracket that opened the header has just been closed
            self.__in_header = False
            self.__write_obj.write(line)
            self.__write_to_head_obj.write(
                'mi<mk<head___clo\n'
                'mi<tg<close_____<header-or-footer\n'
                'mi<mk<header-clo\n')
        else:
            self.__write_to_head_obj.write(line)

    def __found_header(self, line):
        """
        Found a header (or footer): leave a numbered marker in the body
        and start a numbered entry in the header file.
        """
        # but this could be header or footer
        self.__found_a_header = True
        self.__in_header = True
        self.__header_count += 1
        # temporarily set this to zero so I can enter loop
        self.__cb_count = 0
        self.__header_bracket_count = self.__ob_count
        self.__write_obj.write(
            'mi<mk<header-ind<%04d\n' % self.__header_count)
        self.__write_to_head_obj.write(
            'mi<mk<header-ope<%04d\n' % self.__header_count)
        # characters 6-15 of the token say which kind of header/footer
        header_type = self.__head_dict.get(line[6:16])
        if header_type:
            self.__write_to_head_obj.write(
                'mi<tg<open-att__<header-or-footer<type>%s\n' % (header_type)
            )
        else:
            sys.stderr.write(
                'module is header\n'
                'method is __found_header\n'
                'no dict entry\n'
                'line is %s' % line)
            self.__write_to_head_obj.write(
                'mi<tg<open-att__<header-or-footer<type>none\n'
            )

    def __default_sep(self, line):
        """
        Handle all tokens that are not header tokens
        """
        if self.__token_info[3:5] == 'hf':
            self.__found_header(line)
        self.__write_obj.write(line)

    def __initiate_sep_values(self):
        """
        initiate counters for separate_headers method.
        """
        self.__bracket_count = 0
        self.__ob_count = 0
        self.__cb_count = 0
        self.__header_bracket_count = 0
        self.__in_header = False
        self.__header_count = 0
        self.__head_dict = {
            'head-left_': 'header-left',
            'head-right': 'header-right',
            'foot-left_': 'footer-left',
            'foot-right': 'footer-right',
            'head-first': 'header-first',
            'foot-first': 'footer-first',
            'header____': 'header',
            'footer____': 'footer',
        }

    def separate_headers(self):
        """
        Separate all the headers and footers in an RTF file and put them at
        the bottom, where they are easier to process. Each time a header is
        found, print all of its contents to a temporary file. Close both
        the main and temporary file. Print the headers from the temporary
        file to the bottom of the main file, between 'mi<mk<header-beg'
        and 'mi<mk<header-end' markers.
        """
        self.__initiate_sep_values()
        self.__header_holder = better_mktemp()
        with open_for_read(self.__file) as read_obj, \
                open_for_write(self.__write_to) as self.__write_obj, \
                open_for_write(self.__header_holder) as self.__write_to_head_obj:
            for line in read_obj:
                self.__token_info = line[:16]
                # keep track of opening and closing brackets
                if self.__token_info == 'ob<nu<open-brack':
                    self.__ob_count = line[-5:-1]
                if self.__token_info == 'cb<nu<clos-brack':
                    self.__cb_count = line[-5:-1]
                # In the middle of header text
                if self.__in_header:
                    self.__in_header_func(line)
                # not in the middle of header text
                else:
                    self.__default_sep(line)
        with open_for_read(self.__header_holder) as read_obj, \
                open_for_write(self.__write_to, append=True) as write_obj:
            write_obj.write(
                'mi<mk<header-beg\n')
            for line in read_obj:
                write_obj.write(line)
            write_obj.write(
                'mi<mk<header-end\n')
        os.remove(self.__header_holder)
        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "header_separate.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)

    def update_info(self, file, copy):
        """
        Unused method
        """
        self.__file = file
        self.__copy = copy

    def __get_head_body_func(self, line):
        """
        Process lines in main body and look for beginning of headers.
        """
        if self.__token_info == 'mi<mk<header-beg':
            self.__state = 'head'
        else:
            self.__write_obj.write(line)

    def __get_head_head_func(self, line):
        """
        Copy headers and footers from bottom of file to a separate, temporary file.
        """
        if self.__token_info == 'mi<mk<header-end':
            self.__state = 'body'
        else:
            self.__write_to_head_obj.write(line)

    def __get_headers(self):
        """
        Private method to remove headers from main file. Read one line from
        the main file at a time. If the state is 'body', call on the private
        __get_head_body_func. If it is 'head', call on __get_head_head_func.
        These two functions do the work of separating the headers from the
        body.
        """
        with open_for_read(self.__file) as read_obj, \
                open_for_write(self.__write_to) as self.__write_obj, \
                open_for_write(self.__header_holder) as self.__write_to_head_obj:
            for line in read_obj:
                self.__token_info = line[:16]
                if self.__state == 'body':
                    self.__get_head_body_func(line)
                elif self.__state == 'head':
                    self.__get_head_head_func(line)

    def __get_head_from_temp(self, num):
        """
        Private method for joining headers and footers to body. This method
        reads from the temporary file until the proper header marker is
        found. It collects all the tokens until the end of the header, and
        returns them as a string. Returns None if the file ends before the
        closing marker.
        """
        look_for = 'mi<mk<header-ope<' + num + '\n'
        found_head = False
        collected = []
        for line in self.__read_from_head_obj:
            if found_head:
                if line == 'mi<mk<header-clo\n':
                    return ''.join(collected)
                collected.append(line)
            elif line == look_for:
                found_head = True

    def __join_from_temp(self):
        """
        Private method for rejoining headers to body. Read from the
        newly-created, temporary file that contains the body text but no
        headers. Each time a header marker is found, call the private
        method __get_head_from_temp(). This method will return a string to
        print out to the third file.
        If no header marker is found, simply print out the token (line).
        """
        # context managers close all three files, also when a line fails
        with open_for_read(self.__header_holder) as self.__read_from_head_obj, \
                open_for_write(self.__write_to2) as self.__write_obj, \
                open_for_read(self.__write_to) as read_obj:
            for line in read_obj:
                if line[:16] == 'mi<mk<header-ind':
                    line = self.__get_head_from_temp(line[17:-1])
                self.__write_obj.write(line)

    def join_headers(self):
        """
        Take the header block at the bottom of the file out again.

        Does nothing when separate_headers() found no header. Otherwise
        the input is split into a body file (self.__write_to) and a header
        file, the two are merged into self.__write_to2, the body file is
        copied over the input, and the temporary files are removed.
        """
        if not self.__found_a_header:
            return
        self.__write_to2 = better_mktemp()
        self.__state = 'body'
        self.__get_headers()
        self.__join_from_temp()
        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
        # NOTE(review): the merged result lives in self.__write_to2, yet it
        # is self.__write_to (body with 'mi<mk<header-ind' markers, header
        # text stripped) that is copied back. The footnote counterpart
        # copies back its merged file. Left as is because switching would
        # change the converted output - confirm which one is intended.
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "header_join.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)
        os.remove(self.__header_holder)
        # the merged file was never removed before
        os.remove(self.__write_to2)

View File

@@ -0,0 +1,227 @@
from __future__ import unicode_literals, absolute_import, print_function, division
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# #
#########################################################################
import os, re
from calibre.ebooks.rtf2xml import copy
from calibre.ptempfile import better_mktemp
from . import open_for_read, open_for_write
class HeadingsToSections:
    """
    Turn heading paragraphs into nested sections.

    Scans the token file line by line. Outside tables and lists, a
    style-name marker whose name is 'heading 1' ... 'heading 9' closes
    every open section of the same or a deeper heading level and opens a
    new, numbered section. Tokens in the end list close all open sections.
    """

    def __init__(self,
                 in_file,
                 bug_handler,
                 copy=None,
                 run_level=1,
                 ):
        """
        Required:
            'in_file'--path of the token file to rewrite
            'bug_handler'--passed on to the debug copier
        Optional:
            'copy'-- whether to make a copy of result for debugging
            'run_level'--accepted but not used by this class
        Returns:
            nothing
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__write_to = better_mktemp()

    def __initiate_values(self):
        """
        Required:
            Nothing
        Return:
            Nothing
        Logic:
            Reset the state machine. self.__end_list holds the tokens that
            force every open section to close. self.__all_sections is the
            stack of heading levels of the sections that are open;
            self.__section_num holds the running number for each depth.
        """
        self.__state = "default"
        self.__all_sections = []
        self.__chunk = ''
        self.__state_dict = {
            'default': self.__default_func,
            'in_table': self.__in_table_func,
            'in_list': self.__in_list_func,
            'after_body': self.__after_body_func,
        }
        self.__list_depth = 0
        self.__end_list = [
            'mi<mk<body-close',
            # changed 2004-04-26
            # 'mi<mk<par-in-fld',
            'mi<mk<sect-close',  # right before close of section
            'mi<mk<sect-start',  # right before section start
            # this should be sect-close!
            # 'mi<mk<header-beg',
            # 'mi<mk<header-end',
            # 'mi<mk<head___clo',
            #
            # changed 2004-04-26
            # 'mi<mk<fldbk-end_',
            # 'mi<mk<sec-fd-beg',
        ]
        self.__headings = [
            'heading 1', 'heading 2', 'heading 3', 'heading 4',
            'heading 5', 'heading 6', 'heading 7', 'heading 8',
            'heading 9'
        ]
        self.__section_num = [0]
        self.__id_regex = re.compile(r'\<list-id\>(\d+)')

    def __close_sections(self, current_level):
        """
        Close every open section whose heading level is greater than or
        equal to 'current_level' and drop it from the stack. Level 0
        closes everything.
        """
        # walk from the innermost section outwards
        self.__all_sections.reverse()
        num_levels_closed = 0
        for level in self.__all_sections:
            if current_level <= level:
                self.__write_end_section()
                num_levels_closed += 1
        self.__all_sections = self.__all_sections[num_levels_closed:]
        self.__all_sections.reverse()

    def __write_start_section(self, current_level, name):
        """
        Write the marker and opening tag of a section.

        The tag carries the dotted section number, the number within the
        current depth, the nesting depth and the heading name.
        'current_level' is accepted but not used.
        """
        section_num = '.'.join(['%s' % num for num in self.__section_num])
        level = len(self.__all_sections)
        num_in_level = self.__section_num[level]
        self.__write_obj.write(
            'mi<mk<sect-start\n'
        )
        self.__write_obj.write(
            'mi<tg<open-att__<section<num>%s<num-in-level>%s<level>%s'
            '<type>%s\n'
            % (section_num, num_in_level, level, name)
        )

    def __write_end_section(self):
        """Write the marker and closing tag of a section."""
        self.__write_obj.write('mi<mk<sect-close\n')
        self.__write_obj.write('mi<tg<close_____<section\n')

    def __default_func(self, line):
        """
        Required:
            self, line
        Returns:
            Nothing
        Logic
            A section start bumps the top-level section number and resets
            the deeper ones. Table and list starts switch to their own
            states, so headings inside them are ignored. End-list tokens
            close all open sections. A style-name marker naming a heading
            opens a section. After the body closes, everything is copied
            verbatim. The line itself is always written.
        """
        if self.__token_info == 'mi<mk<sect-start':
            self.__section_num[0] += 1
            self.__section_num = self.__section_num[0:1]
        if self.__token_info == 'mi<mk<tabl-start':
            self.__state = 'in_table'
        elif self.__token_info == 'mi<mk<list_start':
            self.__state = 'in_list'
            self.__list_depth += 1
        elif self.__token_info in self.__end_list:
            self.__close_sections(0)
        elif self.__token_info == 'mi<mk<style-name':
            name = line[17:-1]
            if name in self.__headings:
                self.__handle_heading(name)
        if self.__token_info == 'mi<mk<body-close':
            self.__state = 'after_body'
        self.__write_obj.write(line)

    def __handle_heading(self, name):
        """
        Close the sections this heading ends, push its level on the stack,
        update the running numbers and write the new section's opening.
        """
        num = self.__headings.index(name) + 1
        self.__close_sections(num)
        self.__all_sections.append(num)
        level_depth = len(self.__all_sections) + 1
        # forget the numbering of depths below the new section
        self.__section_num = self.__section_num[:level_depth]
        if len(self.__section_num) < level_depth:
            # first section at this depth
            self.__section_num.append(1)
        else:
            self.__section_num[-1] += 1
        self.__write_start_section(num, name)

    def __in_table_func(self, line):
        """Copy table lines through; leave the state at the table end."""
        if self.__token_info == 'mi<mk<table-end_':
            self.__state = 'default'
        self.__write_obj.write(line)

    def __in_list_func(self, line):
        """
        Copy list lines through, tracking nesting; leave the state when
        the outermost list closes.
        """
        if self.__token_info == 'mi<mk<list_close':
            self.__list_depth -= 1
        elif self.__token_info == 'mi<mk<list_start':
            self.__list_depth += 1
        if self.__list_depth == 0:
            self.__state = 'default'
        self.__write_obj.write(line)

    def __after_body_func(self, line):
        """Copy everything after the body close through unchanged."""
        self.__write_obj.write(line)

    def make_sections(self):
        """
        Required:
            nothing
        Returns:
            original file will be changed
        Logic:
            Run every line of the input through the handler for the
            current state, writing to a temporary file, then copy that
            file over the input and remove it.
        """
        self.__initiate_values()
        # context managers close both files even if a handler raises
        with open_for_read(self.__file) as read_obj, \
                open_for_write(self.__write_to) as self.__write_obj:
            for line in read_obj:
                self.__token_info = line[:16]
                action = self.__state_dict.get(self.__state)
                action(line)
        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "sections_to_headings.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)

View File

@@ -0,0 +1,589 @@
from __future__ import absolute_import, division, print_function, unicode_literals
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# #
#########################################################################
import sys, os, io
from calibre.ebooks.rtf2xml import get_char_map, copy
from calibre.ebooks.rtf2xml.char_set import char_set
from calibre.ptempfile import better_mktemp
from polyglot.builtins import unicode_type
from . import open_for_read, open_for_write
class Hex2Utf8:
    """
    Convert Microsoft hexadecimal character tokens to utf-8 text tokens.

    The tokenized input file is read one line at a time.  Hexadecimal
    tokens are looked up in the character map that is active for the
    current font and rewritten; the result is collected in a temporary
    file which finally replaces the input file.
    """

    # Font faces that have a dedicated character map.  Text set in one of
    # these fonts is never upper-cased by the caps conversion.
    __special_fonts = ('Symbol', 'Wingdings', 'Zapf Dingbats')

    def __init__(self,
                 in_file,
                 area_to_convert,
                 char_file,
                 default_char_map,
                 bug_handler,
                 invalid_rtf_handler,
                 copy=None,
                 temp_dir=None,
                 symbol=None,
                 wingdings=None,
                 caps=None,
                 convert_caps=None,
                 dingbats=None,
                 run_level=1,
                 ):
        """
        Required:
            'in_file'--the file to convert (it is replaced by the result)
            'area_to_convert'--the area of file to convert; must be
                'preamble' or 'body'
            'char_file'--the file containing the character mappings
            'default_char_map'--name of default character map
            'bug_handler'--exception class raised for internal errors
            'invalid_rtf_handler'--exception class for invalid RTF
        Optional:
            'copy'-- whether to make a copy of result for debugging
            'temp_dir' --accepted but not used by this method
            'symbol'--whether to load the symbol character map
            'wingdings'--whether to load the wingdings character map
            'dingbats'--whether to load the dingbats character map
            'caps'--whether to load the caps character map
            'convert_caps'--accepted but not used here; caps conversion
                stays off until update_values() is called
            'run_level'--how strictly unknown characters are reported
        Returns:
            nothing
        Raises:
            bug_handler, if 'area_to_convert' is not a known area.
        """
        # The handlers must be stored first: the validation below raises
        # through self.__bug_handler.
        self.__bug_handler = bug_handler
        self.__invalid_rtf_handler = invalid_rtf_handler
        self.__file = in_file
        self.__copy = copy
        if area_to_convert not in ('preamble', 'body'):
            msg = (
                'Developer error! Wrong flag.\n'
                'in module "hex_2_utf8.py\n'
                '"area_to_convert" must be "body" or "preamble"\n'
            )
            raise self.__bug_handler(msg)
        self.__char_file = char_file
        self.__area_to_convert = area_to_convert
        self.__default_char_map = default_char_map
        self.__symbol = symbol
        self.__wingdings = wingdings
        self.__dingbats = dingbats
        self.__caps = caps
        self.__convert_caps = 0
        self.__convert_symbol = 0
        self.__convert_wingdings = 0
        self.__convert_zapf = 0
        self.__run_level = run_level
        self.__write_to = better_mktemp()

    def update_values(self,
                      file,
                      area_to_convert,
                      char_file,
                      convert_caps,
                      convert_symbol,
                      convert_wingdings,
                      convert_zapf,
                      copy=None,
                      temp_dir=None,
                      symbol=None,
                      wingdings=None,
                      caps=None,
                      dingbats=None,
                      ):
        """
        Re-target an existing converter at another file or area.

        Required:
            'file'--the file to convert
            'area_to_convert'--the area of file to convert; must be
                'preamble' or 'body'
            'char_file'--accepted but not stored by this method
            'convert_caps'--whether to convert caps to utf-8
            'convert_symbol'--whether to use the Symbol map for Symbol text
            'convert_wingdings'--same, for Wingdings
            'convert_zapf'--same, for Zapf Dingbats
        Optional:
            'copy'-- whether to make a copy of result for debugging
            'temp_dir' --accepted but not used by this method
            'symbol'--whether to load the symbol character map
            'wingdings'--whether to load the wingdings character map
            'caps'--whether to load the caps character map
            'dingbats'--whether to load the dingbats character map
        Returns:
            nothing
        Raises:
            bug_handler, if 'area_to_convert' is not a known area.
        """
        self.__file = file
        self.__copy = copy
        if area_to_convert not in ('preamble', 'body'):
            msg = (
                'in module "hex_2_utf8.py\n'
                '"area_to_convert" must be "body" or "preamble"\n'
            )
            raise self.__bug_handler(msg)
        self.__area_to_convert = area_to_convert
        self.__symbol = symbol
        self.__wingdings = wingdings
        self.__dingbats = dingbats
        self.__caps = caps
        self.__convert_caps = convert_caps
        self.__convert_symbol = convert_symbol
        self.__convert_wingdings = convert_wingdings
        self.__convert_zapf = convert_zapf

    def __initiate_values(self):
        """
        Required:
            Nothing
        Logic:
            Set values, including those for the dictionaries.
            The character set that contains the maps is broken down into
            many different sets. For example, for the Symbol font, there is
            the standard part for hexadecimal numbers, and the part for
            Microsoft characters. Read each part in, and then combine them.
        """
        # The character maps are read from the bundled char_set string; the
        # char_file passed to the constructor is replaced here.
        self.__char_file = io.StringIO(char_set)
        char_map_obj = get_char_map.GetCharMap(
            char_file=self.__char_file,
            bug_handler=self.__bug_handler,
        )
        # The default map is the selected encoding, overlaid with the lower
        # 128 characters and the Microsoft standard characters.
        up_128_dict = char_map_obj.get_char_map(map=self.__default_char_map)
        bt_128_dict = char_map_obj.get_char_map(map='bottom_128')
        ms_standard_dict = char_map_obj.get_char_map(map='ms_standard')
        self.__def_dict = {}
        self.__def_dict.update(up_128_dict)
        self.__def_dict.update(bt_128_dict)
        self.__def_dict.update(ms_standard_dict)
        self.__current_dict = self.__def_dict
        self.__current_dict_name = 'default'
        self.__in_caps = 0
        self.__special_fonts_found = 0
        if self.__symbol:
            symbol_base_dict = char_map_obj.get_char_map(map='SYMBOL')
            ms_symbol_dict = char_map_obj.get_char_map(map='ms_symbol')
            self.__symbol_dict = {}
            self.__symbol_dict.update(symbol_base_dict)
            self.__symbol_dict.update(ms_symbol_dict)
        if self.__wingdings:
            wingdings_base_dict = char_map_obj.get_char_map(map='wingdings')
            ms_wingdings_dict = char_map_obj.get_char_map(map='ms_wingdings')
            self.__wingdings_dict = {}
            self.__wingdings_dict.update(wingdings_base_dict)
            self.__wingdings_dict.update(ms_wingdings_dict)
        if self.__dingbats:
            dingbats_base_dict = char_map_obj.get_char_map(map='dingbats')
            ms_dingbats_dict = char_map_obj.get_char_map(map='ms_dingbats')
            self.__dingbats_dict = {}
            self.__dingbats_dict.update(dingbats_base_dict)
            self.__dingbats_dict.update(ms_dingbats_dict)
        # dictionary that maps a character entity to its capital equivalent
        self.__caps_uni_dict = char_map_obj.get_char_map(map='caps_uni')
        # Used in two ways while converting the preamble: keyed by state to
        # pick the line handler, and keyed by token to pick the action.
        self.__preamble_state_dict = {
            'preamble'          : self.__preamble_func,
            'body'              : self.__body_func,
            'mi<mk<body-open_'  : self.__found_body_func,
            'tx<hx<__________'  : self.__hex_text_func,
        }
        self.__body_state_dict = {
            'preamble'  : self.__preamble_for_body_func,
            'body'      : self.__body_for_body_func,
        }
        self.__in_body_dict = {
            'mi<mk<body-open_'  : self.__found_body_func,
            'tx<ut<__________'  : self.__utf_to_caps_func,
            'tx<hx<__________'  : self.__hex_text_func,
            'tx<mc<__________'  : self.__hex_text_func,
            'tx<nu<__________'  : self.__text_func,
            'mi<mk<font______'  : self.__start_font_func,
            'mi<mk<caps______'  : self.__start_caps_func,
            'mi<mk<font-end__'  : self.__end_font_func,
            'mi<mk<caps-end__'  : self.__end_caps_func,
        }
        # Stacks; the first entry is a sentinel that is never popped.
        self.__caps_list = ['false']
        self.__font_list = ['not-defined']

    def __hex_text_func(self, line):
        """
        Required:
            'line' -- the line
        Logic:
            Get the hex_num and look it up in the current dictionary. If the
            token is in the dictionary, then check if the value starts with
            a "&". If it does, then tag the result as utf text. Otherwise,
            tag it as normal text.
            If the hex_num is not in the dictionary and its value is
            greater than 10, write an undefined-symbol tag; at a run level
            above 4 additionally raise through the bug handler.
        """
        hex_num = line[17:-1]
        converted = self.__current_dict.get(hex_num)
        if converted is not None:
            apply_caps = (
                self.__convert_caps
                and self.__caps_list[-1] == 'true'
                and self.__current_dict_name not in self.__special_fonts
            )
            if converted[0:1] == "&":
                # tag as utf-8
                if apply_caps:
                    converted = self.__utf_token_to_caps_func(converted)
                self.__write_obj.write('tx<ut<__________<%s\n' % converted)
            else:
                # tag as normal text
                if apply_caps:
                    converted = converted.upper()
                self.__write_obj.write('tx<nu<__________<%s\n' % converted)
            return
        # no entry in the dictionary
        token = hex_num.replace("'", '')
        the_num = 0
        if token:
            the_num = int(token, 16)
        if the_num > 10:
            self.__write_obj.write(
                'mi<tg<empty-att_<udef_symbol<num>%s<description>not-in-table\n' %
                hex_num)
            if self.__run_level > 4:
                msg = 'Character "&#x%s;" does not appear to be valid (or is a control character)\n' % token
                raise self.__bug_handler(msg)

    def __found_body_func(self, line):
        """
        The body has started: switch the state to 'body' and write the line.
        """
        self.__state = 'body'
        self.__write_obj.write(line)

    def __body_func(self, line):
        """
        When parsing preamble: lines of the body are written unchanged.
        """
        self.__write_obj.write(line)

    def __preamble_func(self, line):
        """
        When parsing preamble: run the action registered for the token, or
        write the line unchanged if there is none.
        """
        action = self.__preamble_state_dict.get(self.__token_info)
        if action is not None:
            action(line)
        else:
            self.__write_obj.write(line)

    def __run_conversion(self, start_state, state_dict, debug_copy_name):
        """
        Required:
            start_state -- state the conversion starts in
            state_dict -- maps a state to the function handling a line
            debug_copy_name -- name of the debugging copy of the result
        Returns:
            nothing (replaces the original file)
        Logic:
            Feed every line of the input file to the handler of the current
            state, writing to a temporary file.  Afterwards optionally keep
            a debugging copy and move the temporary file over the input.
        """
        self.__state = start_state
        with open_for_read(self.__file) as read_obj:
            with open_for_write(self.__write_to) as self.__write_obj:
                for line in read_obj:
                    self.__token_info = line[:16]
                    action = state_dict.get(self.__state)
                    if action is None:
                        msg = 'error no state found in hex_2_utf8: %s\n' % self.__state
                        sys.stderr.write(msg)
                        raise self.__bug_handler(msg)
                    action(line)
        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, debug_copy_name)
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)

    def __convert_preamble(self):
        """
        Convert the hexadecimal tokens found in the preamble.
        """
        self.__run_conversion(
            'preamble', self.__preamble_state_dict,
            "preamble_utf_convert.data")

    def __preamble_for_body_func(self, line):
        """
        Required:
            line -- line to parse
        Returns:
            nothing
        Logic:
            Used when parsing the body.
        """
        # NOTE(review): __found_body_func already writes the line, so the
        # body-open marker is written twice here.  __convert_body starts in
        # the 'body' state, so this handler does not appear to be reached;
        # behavior is left unchanged -- confirm before altering.
        if self.__token_info == 'mi<mk<body-open_':
            self.__found_body_func(line)
        self.__write_obj.write(line)

    def __body_for_body_func(self, line):
        """
        Required:
            line -- line to parse
        Returns:
            nothing
        Logic:
            Used when parsing the body. Run the action registered for the
            token, or write the line unchanged if there is none.
        """
        action = self.__in_body_dict.get(self.__token_info)
        if action is not None:
            action(line)
        else:
            self.__write_obj.write(line)

    def __set_current_dict(self, face):
        """
        Make the character map that belongs to font 'face' the current one.
        """
        if face == 'Symbol' and self.__convert_symbol:
            self.__current_dict_name = 'Symbol'
            self.__current_dict = self.__symbol_dict
        elif face == 'Wingdings' and self.__convert_wingdings:
            self.__current_dict_name = 'Wingdings'
            self.__current_dict = self.__wingdings_dict
        elif face == 'Zapf Dingbats' and self.__convert_zapf:
            self.__current_dict_name = 'Zapf Dingbats'
            self.__current_dict = self.__dingbats_dict
        else:
            self.__current_dict_name = 'default'
            self.__current_dict = self.__def_dict

    def __start_font_func(self, line):
        """
        Required:
            line -- line to parse
        Returns:
            nothing
        Logic:
            add font face to font_list and switch to its character map
        """
        face = line[17:-1]
        self.__font_list.append(face)
        self.__set_current_dict(face)

    def __end_font_func(self, line):
        """
        Required:
            line -- line to parse
        Returns:
            nothing
        Logic:
            pop font_list and switch back to the map of the enclosing font
        """
        if len(self.__font_list) > 1:
            self.__font_list.pop()
        else:
            sys.stderr.write('module is hex_2_utf8\n')
            sys.stderr.write('method is end_font_func\n')
            sys.stderr.write('self.__font_list should be greater than one?\n')
        self.__set_current_dict(self.__font_list[-1])

    def __start_special_font_func_old(self, line):
        """
        Required:
            line -- line
        Returns;
            nothing
        Logic:
            change the dictionary to use in conversion
        """
        # NOTE(review): not referenced by any dispatch table in this class
        # and it calls append() on what is a dict elsewhere; looks like
        # legacy code kept for reference -- confirm before removing.
        if self.__token_info == 'mi<mk<font-symbo':
            self.__current_dict.append(self.__symbol_dict)
            self.__special_fonts_found += 1
            self.__current_dict_name = 'Symbol'
        elif self.__token_info == 'mi<mk<font-wingd':
            self.__special_fonts_found += 1
            self.__current_dict.append(self.__wingdings_dict)
            self.__current_dict_name = 'Wingdings'
        elif self.__token_info == 'mi<mk<font-dingb':
            self.__current_dict.append(self.__dingbats_dict)
            self.__special_fonts_found += 1
            self.__current_dict_name = 'Zapf Dingbats'

    def __end_special_font_func(self, line):
        """
        Required:
            line --line to parse
        Returns:
            nothing
        Logic:
            pop the last dictionary, which should be a special font
        """
        # NOTE(review): not referenced by any dispatch table in this class;
        # companion of __start_special_font_func_old -- confirm before
        # removing.
        if len(self.__current_dict) < 2:
            sys.stderr.write('module is hex_2_utf 8\n')
            sys.stderr.write('method is __end_special_font_func\n')
            sys.stderr.write('less than two dictionaries --can\'t pop\n')
            self.__special_fonts_found -= 1
        else:
            self.__current_dict.pop()
            self.__special_fonts_found -= 1
            self.__dict_name = 'default'

    def __start_caps_func_old(self, line):
        """
        Required:
            line -- line to parse
        Returns:
            nothing
        Logic:
            A marker that marks the start of caps has been found. Set
            self.__in_caps to 1
        """
        # NOTE(review): not referenced by any dispatch table in this class.
        self.__in_caps = 1

    def __start_caps_func(self, line):
        """
        Required:
            line -- line to parse
        Returns:
            nothing
        Logic:
            A marker that marks the start of caps has been found. Set
            self.__in_caps to 1 and push the marker's value on caps_list.
        """
        self.__in_caps = 1
        value = line[17:-1]
        self.__caps_list.append(value)

    def __end_caps_func(self, line):
        """
        Required:
            line -- line to parse
        Returns:
            nothing
        Logic:
            A marker that marks the end of caps has been found.
            Pop caps_list (self.__in_caps is not changed).
        """
        if len(self.__caps_list) > 1:
            self.__caps_list.pop()
        else:
            sys.stderr.write('Module is hex_2_utf8\n'
                             'method is __end_caps_func\n'
                             'caps list should be more than one?\n')

    def __text_func(self, line):
        """
        Required:
            line -- line to parse
        Returns:
            nothing
        Logic:
            In a special font, convert every character through the current
            dictionary. Otherwise, if in caps, convert to upper case.
            Write the result as normal text.
        """
        text = line[17:-1]
        if self.__current_dict_name in self.__special_fonts:
            pieces = []
            for letter in text:
                # dictionary keys look like 'E9: an apostrophe followed by
                # the upper-case hexadecimal code of the character
                hex_num = '\'%X' % ord(letter)
                converted = self.__current_dict.get(hex_num)
                if converted is None:
                    sys.stderr.write('module is hex_2_ut8\nmethod is __text_func\n')
                    sys.stderr.write('no hex value for "%s"\n' % hex_num)
                else:
                    pieces.append(converted)
            self.__write_obj.write('tx<nu<__________<%s\n' % ''.join(pieces))
        else:
            if self.__caps_list[-1] == 'true' and self.__convert_caps:
                text = text.upper()
            self.__write_obj.write('tx<nu<__________<%s\n' % text)

    def __utf_to_caps_func(self, line):
        """
        Required:
            line -- line to parse
        returns
            nothing
        Logic
            Get the text, and use another method to convert
        """
        utf_text = line[17:-1]
        if self.__caps_list[-1] == 'true' and self.__convert_caps:
            utf_text = self.__utf_token_to_caps_func(utf_text)
        self.__write_obj.write('tx<ut<__________<%s\n' % utf_text)

    def __utf_token_to_caps_func(self, char_entity):
        """
        Required:
            char_entity -- a character entity such as &#x00E9;
        Returns:
            token converted to the capital equivalent
        Logic:
            RTF often stores capitalized text as the lower-case character
            inside a caps group. This function swaps the case by looking
            the entity up in a dictionary. Entities without an entry are
            returned unchanged.
        """
        hex_num = char_entity[3:]
        # hex_num still carries the trailing ";"; pad to four hex digits
        length = len(hex_num)
        if length == 3:
            hex_num = '00%s' % hex_num
        elif length == 4:
            hex_num = '0%s' % hex_num
        new_char_entity = '&#x%s' % hex_num
        converted = self.__caps_uni_dict.get(new_char_entity)
        if not converted:
            # bullets and other entities don't have capital equivalents
            return char_entity
        return converted

    def __convert_body(self):
        """
        Convert the hexadecimal tokens found in the body.
        """
        self.__run_conversion(
            'body', self.__body_state_dict, "body_utf_convert.data")

    def convert_hex_2_utf8(self):
        """
        Load the character maps and convert the configured area of the file.
        """
        self.__initiate_values()
        if self.__area_to_convert == 'preamble':
            self.__convert_preamble()
        else:
            self.__convert_body()
"""
how to swap case for non-capitals
my_string.swapcase()
An example of how to use a hash for the caps function
(but I shouldn't need this, since utf text is separate
from regular text?)
sub_dict = {
"&#x0430;" : "some other value"
}
def my_sub_func(matchobj):
info = matchobj.group(0)
value = sub_dict.get(info)
return value
return "f"
line = "&#x0430; more text"
reg_exp = re.compile(r'(?P<name>&#x0430;|&#x0431;)')
line2 = re.sub(reg_exp, my_sub_func, line)
print line2
"""

View File

@@ -0,0 +1,285 @@
from __future__ import unicode_literals, absolute_import, print_function, division
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# #
#########################################################################
import sys, os, re
from calibre.ebooks.rtf2xml import copy
from calibre.ptempfile import better_mktemp
from . import open_for_read, open_for_write
class Info:
    """
    Make tags for document-information
    """

    def __init__(self,
                 in_file,
                 bug_handler,
                 copy=None,
                 run_level=1,
                 ):
        """
        Required:
            'in_file'--file to parse (it is replaced by the result)
            'bug_handler'--exception class raised for internal errors
        Optional:
            'copy'-- whether to make a copy of result for debugging
            'run_level'--how strictly unknown tokens are reported
        Returns:
            nothing
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__run_level = run_level
        self.__write_to = better_mktemp()

    def __initiate_values(self):
        """
        Initiate all values.
        """
        self.__text_string = ''
        self.__state = 'before_info_table'
        self.rmspace = re.compile(r'\s+')
        # state => function that handles a line in that state
        self.__state_dict = {
            'before_info_table': self.__before_info_table_func,
            'after_info_table': self.__after_info_table_func,
            'in_info_table'    : self.__in_info_table_func,
            'collect_text'      : self.__collect_text_func,
            'collect_tokens'      : self.__collect_tokens_func,
        }
        # token inside the information table => (handler, name of the tag)
        self.__info_table_dict = {
            'cw<di<title_____'  : (self.__found_tag_with_text_func, 'title'),
            'cw<di<author____'  : (self.__found_tag_with_text_func, 'author'),
            'cw<di<operator__'  : (self.__found_tag_with_text_func, 'operator'),
            'cw<di<manager___'  : (self.__found_tag_with_text_func, 'manager'),
            'cw<di<company___'  : (self.__found_tag_with_text_func, 'company'),
            'cw<di<keywords__'  : (self.__found_tag_with_text_func, 'keywords'),
            'cw<di<category__'  : (self.__found_tag_with_text_func, 'category'),
            'cw<di<doc-notes_'  : (self.__found_tag_with_text_func, 'doc-notes'),
            'cw<di<subject___'  : (self.__found_tag_with_text_func, 'subject'),
            'cw<di<linkbase__'  : (self.__found_tag_with_text_func, 'hyperlink-base'),
            'cw<di<create-tim'  : (self.__found_tag_with_tokens_func, 'creation-time'),
            'cw<di<revis-time'  : (self.__found_tag_with_tokens_func, 'revision-time'),
            'cw<di<print-time'  : (self.__found_tag_with_tokens_func, 'printing-time'),
            'cw<di<backuptime'  : (self.__found_tag_with_tokens_func, 'backup-time'),
            'cw<di<num-of-wor'  : (self.__single_field_func, 'number-of-words'),
            'cw<di<num-of-chr'  : (self.__single_field_func, 'number-of-characters'),
            'cw<di<numofchrws'  : (self.__single_field_func, 'number-of-characters-without-space'),
            'cw<di<num-of-pag'  : (self.__single_field_func, 'number-of-pages'),
            'cw<di<version___'  : (self.__single_field_func, 'version'),
            'cw<di<edit-time_'  : (self.__single_field_func, 'editing-time'),
            'cw<di<intern-ver'  : (self.__single_field_func, 'internal-version-number'),
            'cw<di<internalID'  : (self.__single_field_func, 'internal-id-number'),
        }
        # abbreviated token name => readable attribute name
        self.__token_dict = {
            'year______'        : 'year',
            'month_____'        : 'month',
            'day_______'        : 'day',
            'minute____'        : 'minute',
            'second____'        : 'second',
            'revis-time'        : 'revision-time',
            'create-tim'        : 'creation-time',
            'edit-time_'        : 'editing-time',
            'print-time'        : 'printing-time',
            'backuptime'        : 'backup-time',
            'num-of-wor'        : 'number-of-words',
            'num-of-chr'        : 'number-of-characters',
            'numofchrws'        : 'number-of-characters-without-space',
            'num-of-pag'        : 'number-of-pages',
            'version___'        : 'version',
            'intern-ver'        : 'internal-version-number',
            'internalID'        : 'internal-id-number',
        }

    def __before_info_table_func(self, line):
        """
        Required:
            line -- the line to parse
        Returns:
            nothing
        Logic:
            Check for the beginning of the information table. When found,
            set the state to the information table. Always write the line.
        """
        if self.__token_info == 'mi<mk<doc-in-beg':
            self.__state = 'in_info_table'
        self.__write_obj.write(line)

    def __in_info_table_func(self, line):
        """
        Requires:
            line -- line to parse
        Returns:
            nothing.
        Logic:
            Check for the end of information. If not found, check if the
            token has a special value in the info table dictionary. If it
            does, execute that function.
            Otherwise, output the line to the file.
        """
        if self.__token_info == 'mi<mk<doc-in-end':
            # the end marker itself is not written
            self.__state = 'after_info_table'
            return
        action, tag = self.__info_table_dict.get(self.__token_info, (None, None))
        if action:
            action(line, tag)
        else:
            self.__write_obj.write(line)

    def __found_tag_with_text_func(self, line, tag):
        """
        Requires:
            line -- line to parse
            tag --what kind of line
        Returns:
            nothing
        Logic:
            This function marks the beginning of information fields that
            have text that must be collected.  Set the type of information
            field with the tag option. Set the state to collecting text
        """
        self.__tag = tag
        self.__state = 'collect_text'

    def __collect_text_func(self, line):
        """
        Requires:
            line -- line to parse
        Returns:
            nothing
        Logic:
            If the end of the information field is found, write the text
            string to the file.
            Otherwise, if the line contains text, add it to the text string.
        """
        if self.__token_info == 'mi<mk<docinf-end':
            self.__state = 'in_info_table'
            # Don't print empty tags
            if len(self.rmspace.sub('', self.__text_string)):
                self.__write_obj.write(
                    'mi<tg<open______<%s\n'
                    'tx<nu<__________<%s\n'
                    'mi<tg<close_____<%s\n' % (self.__tag, self.__text_string, self.__tag)
                )
            self.__text_string = ''
        elif line[0:2] == 'tx':
            self.__text_string += line[17:-1]

    def __found_tag_with_tokens_func(self, line, tag):
        """
        Requires:
            line -- line to parse
            tag -- type of field
        Returns:
            nothing
        Logic:
            Some fields have a series of tokens (cw<di<year______<nu<2003)
            that must be parsed as attributes for the element.
            Set the state to collect tokens, and set the text string to
            start an empty element with attributes.
        """
        self.__state = 'collect_tokens'
        # e.g. mi<tg<empty-att_<creation-time; attributes are appended later
        self.__text_string = 'mi<tg<empty-att_<%s' % tag

    def __collect_tokens_func(self, line):
        """
        Requires:
            line -- line to parse
        Returns:
            nothing
        Logic:
            This function collects all the token information and adds it to
            the text string until the end of the field is found.
            First check for the end of the information field. If found,
            write the text string to the file.
            If not found, get the relevant information from the line.
            This information cannot be directly added to the text string,
            because it exists in abbreviated form. (num-of-wor)
            Look it up in a dictionary to convert it to a longer, readable
            form (num-of-wor => number-of-words). If the key does not exist
            in the dictionary, raise through the bug handler when the run
            level is above 3, else ignore the token. Otherwise add the
            value to the text string.
        """
        # cw<di<year______<nu<2003
        if self.__token_info == 'mi<mk<docinf-end':
            self.__state = 'in_info_table'
            self.__write_obj.write(
                '%s\n' % self.__text_string
            )
            self.__text_string = ''
            return
        att = line[6:16]
        value = line[20:-1]
        att_changed = self.__token_dict.get(att)
        if att_changed is None:
            if self.__run_level > 3:
                msg = 'No dictionary match for %s\n' % att
                raise self.__bug_handler(msg)
        else:
            self.__text_string += '<%s>%s' % (att_changed, value)

    def __single_field_func(self, line, tag):
        """
        Requires:
            line -- line to parse
            tag -- type of field
        Returns:
            nothing
        Logic:
            Write the value of the line as an empty element named after the
            tag, with one attribute of the same name.
        """
        value = line[20:-1]
        self.__write_obj.write(
            'mi<tg<empty-att_<%s<%s>%s\n' % (tag, tag, value)
        )

    def __after_info_table_func(self, line):
        """
        Requires:
            line --line to write to file
        Returns:
            nothing
        Logic:
            After the end of the information table, simply write the line
            to the file.
        """
        self.__write_obj.write(line)

    def fix_info(self):
        """
        Requires:
            nothing
        Returns:
            nothing (changes the original file)
        Raises:
            bug_handler, if the current state has no handler.
        Logic:
            Read one line in at a time. Determine what action to take based
            on the state. If the state is before the information table,
            look for the beginning of the information table.
            If the state is in the information table, use other methods to
            parse the information.
            If the state is after the information table, simply write the
            line to the output file.
        """
        self.__initiate_values()
        with open_for_read(self.__file) as read_obj:
            with open_for_write(self.__write_to) as self.__write_obj:
                for line in read_obj:
                    self.__token_info = line[:16]
                    action = self.__state_dict.get(self.__state)
                    if action is None:
                        sys.stderr.write('No matching state in module info.py\n')
                        sys.stderr.write(self.__state + '\n')
                        raise self.__bug_handler(
                            'No matching state in module info.py: %s\n'
                            % self.__state)
                    action(line)
        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "info.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)

View File

@@ -0,0 +1,427 @@
from __future__ import unicode_literals, absolute_import, print_function, division
import sys, os
from calibre.ebooks.rtf2xml import copy
from calibre.ptempfile import better_mktemp
from . import open_for_read, open_for_write
"""
States.
1. default
1. an open bracket ends this state.
2. Text print out text. Print out any groups_in_waiting.
3. closed bracket. Close groups
2. after an open bracket
1. The lack of a control word ends this state.
2. paragraph end -- close out all tags
3. footnote beg -- close out all tags
"""
class Inline:
"""
Make inline tags within lists.
Logic:
"""
def __init__(self,
             in_file,
             bug_handler,
             copy=None,
             run_level=1,):
    """
    Remember the conversion settings and reserve a temporary output file.

    Required:
        'in_file'--file to parse
        'bug_handler'--handler used to raise internal errors
    Optional:
        'copy'-- whether to make a copy of result for debugging
        'run_level'--level consulted before raising internal errors
    Returns:
        nothing
    """
    self.__run_level = run_level
    self.__copy = copy
    self.__bug_handler = bug_handler
    self.__file = in_file
    # results are collected here before they replace the input file
    self.__write_to = better_mktemp()
def __initiate_values(self):
"""
Initiate all values.
"""
self.__state_dict = {
'default': self.__default_func,
'after_open_bracket': self.__after_open_bracket_func,
}
self.__default_dict = {
'ob<nu<open-brack': self.__found_open_bracket_func,
'tx<nu<__________' : self.__found_text_func,
'tx<hx<__________' : self.__found_text_func,
'tx<ut<__________' : self.__found_text_func,
'mi<mk<inline-fld' : self.__found_text_func,
'text' : self.__found_text_func,
'cb<nu<clos-brack' : self.__close_bracket_func,
'mi<mk<par-end___' : self.__end_para_func,
'mi<mk<footnt-ope' : self.__end_para_func,
'mi<mk<footnt-ind' : self.__end_para_func,
}
self.__after_open_bracket_dict = {
'cb<nu<clos-brack' : self.__close_bracket_func,
'tx<nu<__________' : self.__found_text_func,
'tx<hx<__________' : self.__found_text_func,
'tx<ut<__________' : self.__found_text_func,
'text' : self.__found_text_func,
'mi<mk<inline-fld' : self.__found_text_func,
'ob<nu<open-brack': self.__found_open_bracket_func,
'mi<mk<par-end___' : self.__end_para_func,
'mi<mk<footnt-ope' : self.__end_para_func,
'mi<mk<footnt-ind' : self.__end_para_func,
'cw<fd<field_____' : self.__found_field_func,
}
self.__state = 'default'
self.__brac_count = 0 # do I need this?
self.__list_inline_list = []
self.__body_inline_list = []
self.__groups_in_waiting_list = [0]
self.__groups_in_waiting_body = [0]
self.__groups_in_waiting = self.__groups_in_waiting_body
self.__place = 'non_list'
self.__inline_list = self.__body_inline_list
self.__in_para = 0 # not in paragraph
self.__char_dict = {
# character info => ci
'annotation' : 'annotation',
'blue______' : 'blue',
'bold______' : 'bold',
'caps______' : 'caps',
'char-style' : 'character-style',
'dbl-strike' : 'double-strike-through',
'emboss____' : 'emboss',
'engrave___' : 'engrave',
'font-color' : 'font-color',
'font-down_' : 'subscript',
'font-size_' : 'font-size',
'font-style' : 'font-style',
'font-up___' : 'superscript',
'footnot-mk' : 'footnote-marker',
'green_____' : 'green',
'hidden____' : 'hidden',
'italics___' : 'italics',
'outline___' : 'outline',
'red_______' : 'red',
'shadow____' : 'shadow',
'small-caps' : 'small-caps',
'strike-thr' : 'strike-through',
'subscript_' : 'subscript',
'superscrip' : 'superscript',
'underlined' : 'underlined',
}
self.__caps_list = ['false']
def __set_list_func(self, line):
"""
Requires:
line--line of text
Returns:
nothing
Logic:
"""
if self.__place == 'in_list':
if self.__token_info == 'mi<mk<lst-tx-end':
self.__place = 'not_in_list'
self.__inline_list = self.__body_inline_list
self.__groups_in_waiting = self.__groups_in_waiting_body
else:
if self.__token_info == 'mi<mk<lst-tx-beg':
self.__place = 'in_list'
self.__inline_list = self.__list_inline_list
self.__groups_in_waiting = self.__groups_in_waiting_list
def __default_func(self, line):
"""
Requires:
line-- line of text
Returns:
nothing
Logic:
Write if not hardline break
"""
action = self.__default_dict.get(self.__token_info)
if action:
action(line)
self.__write_obj.write(line)
def __found_open_bracket_func(self, line):
"""
Requires:
line -- current line of text
Returns:
nothing
Logic:
Change the state to 'after_open_bracket'
"""
self.__state = 'after_open_bracket'
self.__brac_count += 1
self.__groups_in_waiting[0] += 1
self.__inline_list.append({})
self.__inline_list[-1]['contains_inline'] = 0
def __after_open_bracket_func(self, line):
"""
Requires:
line --line of text
Returns:
nothing
Logic:
If the token is a control word for character info (cw<ci), use another
method to add to the dictionary.
Use the dictionary to get the approriate function.
Always print out the line.
"""
if line[0:5] == 'cw<ci': # calibre: bug in original function no diff between cw<ci and cw<pf
self.__handle_control_word(line)
else:
action = self.__after_open_bracket_dict.get(self.__token_info)
if action:
self.__state = 'default' # a non control word?
action(line)
self.__write_obj.write(line)
def __handle_control_word(self, line):
"""
Required:
line --line of text
Returns:
nothing
Logic:
Handle the control word for inline groups.
Add each name - value to a dictionary.
If the font style of Symbol, Wingdings, or Dingbats is found,
always mark this. I need this later to convert the text to
the right utf.
"""
# cw<ci<shadow_____<nu<true
# self.__char_dict = {
char_info = line[6:16]
char_value = line[20:-1]
name = self.__char_dict.get(char_info)
if name:
self.__inline_list[-1]['contains_inline'] = 1
self.__inline_list[-1][name] = char_value
"""
if name == 'font-style':
if char_value == 'Symbol':
self.__write_obj.write('mi<mk<font-symbo\n')
elif char_value == 'Wingdings':
self.__write_obj.write('mi<mk<font-wingd\n')
elif char_value == 'Zapf Dingbats':
self.__write_obj.write('mi<mk<font-dingb\n')
"""
def __close_bracket_func(self, line):
"""
Requires:
line --line of text
Returns:
Nothing
Logic:
If there are no inline groups, do nothing.
Get the keys of the last dictionary in the inline_groups.
If 'contains_inline' in the keys, write a close tag.
If the_dict contains font information, write a mk tag.
"""
if len(self.__inline_list) == 0:
# nothing to add
return
the_dict = self.__inline_list[-1]
the_keys = the_dict.keys()
# always close out
if self.__place == 'in_list':
if 'contains_inline' in the_keys and the_dict['contains_inline'] == 1\
and self.__groups_in_waiting[0] == 0:
self.__write_obj.write('mi<tg<close_____<inline\n')
if 'font-style' in the_keys:
self.__write_obj.write('mi<mk<font-end__\n')
if 'caps' in the_keys:
self.__write_obj.write('mi<mk<caps-end__\n')
else:
# close out only if in a paragraph
if 'contains_inline' in the_keys and the_dict['contains_inline'] == 1\
and self.__in_para and self.__groups_in_waiting[0] == 0:
self.__write_obj.write('mi<tg<close_____<inline\n')
if 'font-style' in the_keys:
self.__write_obj.write('mi<mk<font-end__\n')
if 'caps' in the_keys:
self.__write_obj.write('mi<mk<caps-end__\n')
self.__inline_list.pop()
if self.__groups_in_waiting[0] != 0:
self.__groups_in_waiting[0] -= 1
def __found_text_func(self, line):
"""
Required:
line--line of text
Return:
nothing
Logic:
Three cases:
1. in a list. Simply write inline
2. Not in a list
Text can mark the start of a paragraph.
If already in a paragraph, check to see if any groups are waiting
to be added. If so, use another method to write these groups.
"""
if self.__place == 'in_list':
self.__write_inline()
else:
if not self.__in_para:
self.__in_para = 1
self.__start_para_func(line)
elif self.__groups_in_waiting[0] != 0:
self.__write_inline()
def __write_inline(self):
"""
Required:
nothing
Returns
Nothing
Logic:
Method for writing inline when text is found.
Only write those groups that are "waiting", or that have no
tags yet.
First, slice the list self.__inline list to get just the groups
in waiting.
Iterate through this slice, which contains only dictionaries.
Get the keys in each dictionary. If 'font-style' is in the keys,
write a marker tag. (I will use this marker tag later when conerting
hext text to utf8.)
Write a tag for the inline values.
"""
if self.__groups_in_waiting[0] != 0:
last_index = -1 * self.__groups_in_waiting[0]
inline_list = self.__inline_list[last_index:]
if len(inline_list) <= 0:
if self.__run_level > 3:
msg = 'self.__inline_list is %s\n' % self.__inline_list
raise self.__bug_handler(msg)
self.__write_obj.write('error\n')
self.__groups_in_waiting[0] = 0
return
for the_dict in inline_list:
if the_dict['contains_inline']:
the_keys = the_dict.keys()
if 'font-style' in the_keys:
face = the_dict['font-style']
self.__write_obj.write('mi<mk<font______<%s\n' % face)
if 'caps' in the_keys:
value = the_dict['caps']
self.__write_obj.write('mi<mk<caps______<%s\n' % value)
self.__write_obj.write('mi<tg<open-att__<inline')
for the_key in the_keys:
if the_key != 'contains_inline':
self.__write_obj.write('<%s>%s' % (the_key, the_dict[the_key]))
self.__write_obj.write('\n')
self.__groups_in_waiting[0] = 0
def __end_para_func(self, line):
"""
Requires:
line -- line of text
Returns:
nothing
Logic:
Slice from the end the groups in waiting.
Iterate through the list. If the dictionary contaings info, write
a closing tag.
"""
if not self.__in_para:
return
if self.__groups_in_waiting[0] == 0:
inline_list = self.__inline_list
else:
last_index = -1 * self.__groups_in_waiting[0]
inline_list = self.__inline_list[0:last_index]
for the_dict in inline_list:
contains_info = the_dict.get('contains_inline')
if contains_info:
the_keys = the_dict.keys()
if 'font-style' in the_keys:
self.__write_obj.write('mi<mk<font-end__\n')
if 'caps' in the_keys:
self.__write_obj.write('mi<mk<caps-end__\n')
self.__write_obj.write('mi<tg<close_____<inline\n')
self.__in_para = 0
def __start_para_func(self, line):
    """
    Re-open, at the start of a paragraph, every inline group on the stack.

    Requires:
        line -- line of text (not used by this handler)
    Returns:
        nothing
    Logic:
        Walk self.__inline_list. For each dictionary that contains
        inline info, write a font and/or caps marker when those keys are
        present, then an opening inline tag carrying every key/value
        pair except the 'contains_inline' flag.
        Afterwards no group is left "in waiting".
    """
    for group in self.__inline_list:
        if not group.get('contains_inline'):
            continue
        if 'font-style' in group:
            self.__write_obj.write(
                'mi<mk<font______<%s\n' % group['font-style'])
        if 'caps' in group:
            self.__write_obj.write(
                'mi<mk<caps______<%s\n' % group['caps'])
        self.__write_obj.write('mi<tg<open-att__<inline')
        for name, value in group.items():
            if name == 'contains_inline':
                continue
            self.__write_obj.write('<%s>%s' % (name, value))
        self.__write_obj.write('\n')
    self.__groups_in_waiting[0] = 0
def __found_field_func(self, line):
    """
    Do nothing on purpose.

    Placeholder handler: it exists only so that this token does not make
    the parser leave the default state prematurely.
    """
    return None
def form_tags(self):
    """
    Run the inline pass over the token file.

    Requires:
        nothing
    Returns:
        nothing (the file named by self.__file is rewritten)
    Raises:
        the bug handler exception, at run levels above 3, when the
        current state has no handler.
    Logic:
        Read one line in at a time. Classify the token, let
        self.__set_list_func see the line, then hand the line to the
        handler registered for the current state.
        The result is written to a temporary file, which is optionally
        copied for debugging and then takes the place of the input.
    """
    # Special-character tokens that are treated as ordinary text.
    text_tokens = (
        'tx<mc<__________<rdblquote',
        'tx<mc<__________<ldblquote',
        'tx<mc<__________<lquote',
        'tx<mc<__________<rquote',
        'tx<mc<__________<emdash',
        'tx<mc<__________<endash',
        'tx<mc<__________<bullet',
    )
    self.__initiate_values()
    with open_for_read(self.__file) as read_obj:
        with open_for_write(self.__write_to) as self.__write_obj:
            for line in read_obj:
                token = line[0:-1]  # the line without its last character
                if token in text_tokens:
                    self.__token_info = 'text'
                else:
                    self.__token_info = line[:16]
                self.__set_list_func(line)
                action = self.__state_dict.get(self.__state)
                if action is None:
                    # Previously the code reported this and then called
                    # None, dying with an unrelated TypeError.
                    msg = 'No matching state in module inline.py\n'
                    msg += self.__state + '\n'
                    sys.stderr.write(msg)
                    if self.__run_level > 3:
                        raise self.__bug_handler(msg)
                    continue
                action(line)
    copy_obj = copy.Copy(bug_handler=self.__bug_handler)
    if self.__copy:
        copy_obj.copy_file(self.__write_to, "inline.data")
    copy_obj.rename(self.__write_to, self.__file)
    os.remove(self.__write_to)

View File

@@ -0,0 +1,56 @@
from __future__ import unicode_literals, absolute_import, print_function, division
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# #
#########################################################################
import os
from calibre.ebooks.rtf2xml import copy
from calibre.utils.cleantext import clean_ascii_chars
from calibre.ptempfile import better_mktemp
class FixLineEndings:
    """Normalize the line endings of a file to Unix style, in place."""

    def __init__(self,
            bug_handler,
            in_file=None,
            copy=None,
            run_level=1,
            replace_illegals=1,
            ):
        """
        Required:
            bug_handler -- passed on to the copy helper
        Optional:
            in_file -- path of the file to fix
            copy -- whether to keep a copy of the result for debugging
            run_level -- stored; not consulted by this class
            replace_illegals -- whether to run the bytes through
                clean_ascii_chars
        """
        self.__bug_handler = bug_handler
        self.__file = in_file
        self.__copy = copy
        self.__run_level = run_level
        self.__replace_illegals = replace_illegals
        self.__write_to = better_mktemp()

    def fix_endings(self):
        """
        Rewrite the input file with '\\n' line endings.

        The file is handled as bytes: '\\r\\n' and then lone '\\r' become
        '\\n'. When replace_illegals is set the data also goes through
        clean_ascii_chars (the original note says this removes invalid
        ASCII control characters -- the exact set is defined by that
        helper). The result is written to a temporary file, optionally
        copied as "line_endings.data", moved over the input with the
        copy helper's rename, and the temporary file is removed.
        """
        with open(self.__file, 'rb') as source:
            data = source.read()
        # Windows first, then old Mac: order matters so that '\r\n'
        # does not turn into two newlines.
        data = data.replace(b'\r\n', b'\n').replace(b'\r', b'\n')
        if self.__replace_illegals:
            data = clean_ascii_chars(data)
        with open(self.__write_to, 'wb') as target:
            target.write(data)
        copier = copy.Copy(bug_handler=self.__bug_handler)
        if self.__copy:
            copier.copy_file(self.__write_to, "line_endings.data")
        copier.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)

View File

@@ -0,0 +1,201 @@
from __future__ import unicode_literals, absolute_import, print_function, division
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# #
#########################################################################
import os
from calibre.ebooks.rtf2xml import copy
from calibre.ptempfile import better_mktemp
from . import open_for_read, open_for_write
class ListNumbers:
    """
    RTF puts list numbers outside of the paragraph. The public method
    in this class puts the list numbers inside the paragraphs.
    """

    def __init__(self,
            in_file,
            bug_handler,
            copy=None,
            run_level=1,
            ):
        """
        Required:
            'in_file' -- path of the token file, rewritten in place
            'bug_handler' -- passed on to the copy helper
        Optional:
            'copy' -- whether to make a copy of result for debugging
            'run_level' -- accepted for interface compatibility; this
                class does not consult it
        Returns:
            nothing
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__write_to = better_mktemp()

    def __initiate_values(self):
        """
        Initiate values for fix_list_numbers.
        Required:
            Nothing
        Return:
            Nothing
        """
        self.__state = "default"
        self.__list_chunk = ''
        self.__previous_line = ''
        # Bracket bookkeeping. These are assigned while parsing; giving
        # them a value here means no handler can meet a missing attribute.
        self.__ob_count = 0
        self.__cb_count = 0
        self.__list_text_ob = 0
        self.__state_dict = {
            'default': self.__default_func,
            'after_ob': self.__after_ob_func,
            'list_text': self.__list_text_func,
            'after_list_text': self.__after_list_text_func,
        }

    def __after_ob_func(self, line):
        """
        Handle the line immediately after an open bracket.
        Required:
            self, line
        Returns:
            Nothing
        Logic:
            If the line is a list-text token, start collecting the list
            text (the held-back bracket line plus this line) and remember
            which bracket opened it. Otherwise write out the held-back
            bracket line and this line and go back to the default state.
        """
        if self.__token_info == 'cw<ls<list-text_':
            self.__state = 'list_text'
            self.__list_chunk = self.__list_chunk + \
                self.__previous_line + line
            self.__list_text_ob = self.__ob_count
            self.__cb_count = 0
        else:
            self.__write_obj.write(self.__previous_line)
            self.__write_obj.write(line)
            self.__state = 'default'

    def __after_list_text_func(self, line):
        """
        Look for an open bracket or a line of text, and then print out the
        self.__list_chunk wrapped in list-text tags and markers.
        Print out the line in every case.
        """
        if line[0:2] == 'ob' or line[0:2] == 'tx':
            self.__state = 'default'
            self.__write_obj.write('mi<mk<lst-txbeg_\n')
            self.__write_obj.write('mi<mk<para-beg__\n')
            self.__write_obj.write('mi<mk<lst-tx-beg\n')
            self.__write_obj.write('mi<tg<open-att__<list-text\n')
            self.__write_obj.write(self.__list_chunk)
            self.__write_obj.write('mi<tg<close_____<list-text\n')
            self.__write_obj.write('mi<mk<lst-tx-end\n')
            self.__list_chunk = ''
        self.__write_obj.write(line)

    def __determine_list_type(self, chunk):
        """
        Determine if the list is ordered or itemized.

        Required:
            chunk -- the collected list-text tokens, one per line
        Returns:
            "unordered" if a hexadecimal token 'B7 is found, 'ordered'
            if the plain text, once '.', '(' and ')' are removed, is
            all digits, and 'unordered' otherwise.
        """
        lines = chunk.split('\n')
        text_string = ''
        for line in lines:
            if line[0:5] == 'tx<hx':
                if line[17:] == '\'B7':
                    return "unordered"
            elif line[0:5] == 'tx<nu':
                text_string += line[17:]
        text_string = text_string.replace('.', '')
        text_string = text_string.replace('(', '')
        text_string = text_string.replace(')', '')
        if text_string.isdigit():
            return 'ordered'
        # The type could not be determined. NOTE(review): the original
        # comment called this a guess at an *ordered* list, yet the code
        # has always returned 'unordered'; the behaviour is kept.
        return 'unordered'

    def __list_text_func(self, line):
        """
        Handle lines that are part of the list text. If the end of the list
        text is found (the closing bracket matches the self.__list_text_ob),
        then change the state and write the list-type marker.
        Add the line to the self.__list_chunk unless it is a paragraph
        definition token.
        Required:
            self, line
        Returns:
            Nothing
        """
        if self.__list_text_ob == self.__cb_count:
            self.__state = 'after_list_text'
            self.__list_type = self.__determine_list_type(self.__list_chunk)
            self.__write_obj.write('mi<mk<list-type_<%s\n' % self.__list_type)
        if self.__token_info != 'cw<pf<par-def___':
            self.__list_chunk = self.__list_chunk + line

    def __default_func(self, line):
        """
        Handle the lines that are not part of any special state. Look for an
        opening bracket. If an open bracket is found, add this line to a
        temporary self.__previous line, which other methods need. Otherwise,
        print out the line.
        Required:
            self, line
        Returns:
            Nothing
        """
        if self.__token_info == 'ob<nu<open-brack':
            self.__state = 'after_ob'
            self.__previous_line = line
        else:
            self.__write_obj.write(line)

    def fix_list_numbers(self):
        """
        Required:
            nothing
        Returns:
            original file will be changed
        Logic:
            Read in one line a time from the file. Keep track of opening and
            closing brackets. Determine the method ('action') by passing the
            state to the self.__state_dict.
            Simply print out the line to a temp file until an open bracket
            is found. Check the next line. If it is list-text, then start
            adding to the self.__list_chunk until the closing bracket is
            found.
            Next, look for an open bracket or text. When either is found,
            print out self.__list_chunk and the line.
        """
        self.__initiate_values()
        # Context managers close both files even if a handler raises.
        with open_for_read(self.__file) as read_obj:
            with open_for_write(self.__write_to) as self.__write_obj:
                line_to_read = 1
                # The loop deliberately also dispatches the empty string
                # returned at end of file: that final call is what flushes
                # a bracket line still held back by the 'after_ob' state.
                while line_to_read:
                    line_to_read = read_obj.readline()
                    line = line_to_read
                    self.__token_info = line[:16]
                    if self.__token_info == 'ob<nu<open-brack':
                        self.__ob_count = line[-5:-1]
                    if self.__token_info == 'cb<nu<clos-brack':
                        self.__cb_count = line[-5:-1]
                    action = self.__state_dict.get(self.__state)
                    action(line)
        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "list_numbers.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)

View File

@@ -0,0 +1,447 @@
from __future__ import absolute_import, division, print_function, unicode_literals
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# #
#########################################################################
from polyglot.builtins import unicode_type
class ListTable:
    """
    Parse the list table line. Make a string. Form a dictionary.
    Return the string and the dictionary.
    """

    def __init__(
            self,
            bug_handler,
            run_level=1,
            ):
        """
        Required:
            bug_handler -- exception class; raised with a message when
                the parser meets something unexpected at a high run level
        Optional:
            run_level -- values above 3 turn parse problems into raised
                bug_handler errors
        """
        self.__bug_handler = bug_handler
        self.__initiate_values()
        self.__run_level = run_level

    def __initiate_values(self):
        """
        Set the parser state, the result containers and the lookup
        tables that map tokens to attribute names.
        """
        self.__list_table_final = ''
        self.__state = 'default'
        self.__final_dict = {}
        self.__list_dict = {}
        # Layout of self.__all_lists, one entry per list:
        #   [[{list attributes}, [{level attributes}], [{level attributes}]],
        #    ...]
        self.__all_lists = []
        self.__level_text_string = ''
        self.__level_text_list = []
        self.__found_level_text_length = 0
        self.__level_text_position = None
        self.__prefix_string = None
        self.__level_numbers_string = ''
        self.__state_dict = {
            'default': self.__default_func,
            'level': self.__level_func,
            'list': self.__list_func,
            'unsure_ob': self.__after_bracket_func,
            'level_number': self.__level_number_func,
            'level_text': self.__level_text_func,
            'list_name': self.__list_name_func,
        }
        self.__main_list_dict = {
            'cw<ls<ls-tem-id_': 'list-template-id',
            'cw<ls<list-hybri': 'list-hybrid',
            'cw<ls<lis-tbl-id': 'list-table-id',
        }
        self.__level_dict = {
            'cw<ls<level-star': 'list-number-start',
            'cw<ls<level-spac': 'list-space',
            'cw<ls<level-inde': 'level-indent',
            'cw<ls<fir-ln-ind': 'first-line-indent',
            'cw<ls<left-inden': 'left-indent',
            'cw<ls<tab-stop__': 'tabs',
            'cw<ls<level-type': 'numbering-type',
            'cw<pf<right-inde': 'right-indent',
            'cw<pf<left-inden': 'left-indent',
            'cw<pf<fir-ln-ind': 'first-line-indent',
            'cw<ci<italics___': 'italics',
            'cw<ci<bold______': 'bold',
            'cw<ss<para-style': 'paragraph-style-name',
        }

    def __parse_lines(self, line):
        """
        Required : line --line to parse
        Returns: nothing
        Raises:
            the bug handler, at run levels above 3, if the current state
            has no handler.
        Logic:
            Split the lines into a list by a new line. Process the line
            according to the state. Then build the final string.
        """
        lines = line.split('\n')
        self.__ob_count = 0
        self.__cb_count = 0
        self.__ob_group = 0
        for line in lines:
            self.__token_info = line[:16]
            if self.__token_info == 'ob<nu<open-brack':
                self.__ob_count = line[-4:]
                self.__ob_group += 1
            if self.__token_info == 'cb<nu<clos-brack':
                self.__cb_count = line[-4:]
                self.__ob_group -= 1
            action = self.__state_dict.get(self.__state)
            if action is None:
                # Previously the state was printed and None was then
                # called, which died with an unrelated TypeError.
                if self.__run_level > 3:
                    msg = 'No matching state in class ListTable\n'
                    msg += 'state is "%s"\n' % self.__state
                    raise self.__bug_handler(msg)
                print(self.__state)
                continue
            action(line)
        self.__write_final_string()

    def __default_func(self, line):
        """
        Requires: line --line to process
        Return: nothing
        Logic:
            This state is used at the start and end of a list. Look for an
            opening bracket, which marks the change of state.
        """
        if self.__token_info == 'ob<nu<open-brack':
            self.__state = 'unsure_ob'

    def __found_list_func(self, line):
        """
        Requires: line -- line to process
        Returns: nothing
        Logic:
            I have found \\list.
            Change the state to list
            Get the open bracket count so you know when this state ends.
            Append an empty list to all lists.
            Create a temporary dictionary. This dictionary has the key of
            "list-id" and the value of an empty list. Later, this empty list
            will be filled with all the ids for which the formatting is valid.
            Append the temporary dictionary to the new list.
        """
        self.__state = 'list'
        self.__list_ob_count = self.__ob_count
        self.__all_lists.append([])
        the_dict = {'list-id': []}
        self.__all_lists[-1].append(the_dict)

    def __list_func(self, line):
        """
        Requires: line --line to process
        Returns: nothing
        Logic:
            This method is called when you are in a list, but outside of a
            level. Check for the end of the list. Otherwise, use the
            self.__main_list_dict to determine if you need to add a line's
            value to the main list.
        """
        if self.__token_info == 'cb<nu<clos-brack' and\
                self.__cb_count == self.__list_ob_count:
            self.__state = 'default'
        elif self.__token_info == 'ob<nu<open-brack':
            self.__state = 'unsure_ob'
        else:
            att = self.__main_list_dict.get(self.__token_info)
            if att:
                value = line[20:]
                # dictionary is always the first item in the last list
                # [{att:value}, [], [att:value, []]
                self.__all_lists[-1][0][att] = value

    def __found_level_func(self, line):
        """
        Requires: line -- line to process
        Returns: nothing
        Logic:
            I have found \\listlevel.
            Change the state to level
            Get the open bracket count so you know when this state ends.
            Append an empty list to the last list inside all lists.
            Create a temporary dictionary.
            Append the temporary dictionary to the new list.
            self.__all_lists now looks like:
                [[{list-id:[]}, [{}]]]
                Where:
                    self.__all_lists[-1] => a list. The first item is a
                    dictionary. The second item is a list containing a
                    dictionary:
                    [{list-id:[]}, [{}]]
                    self.__all_lists[-1][0] => a dictionary of the list
                    attributes
                    self.__all_lists[-1][-1] => a list with just a dictionary
                    self.__all_lists[-1][-1][0] => the dictionary of level
                    attributes
        """
        self.__state = 'level'
        self.__level_ob_count = self.__ob_count
        self.__all_lists[-1].append([])
        the_dict = {}
        self.__all_lists[-1][-1].append(the_dict)

    def __level_func(self, line):
        """
        Requires:
            line -- line to parse
        Returns:
            nothing
        Logic:
            Look for the end of the this group.
            Change states if an open bracket is found.
            Add attributes to the level dictionary if an appropriate token
            is found.
        """
        if self.__token_info == 'cb<nu<clos-brack' and\
                self.__cb_count == self.__level_ob_count:
            self.__state = 'list'
        elif self.__token_info == 'ob<nu<open-brack':
            self.__state = 'unsure_ob'
        else:
            att = self.__level_dict.get(self.__token_info)
            if att:
                value = line[20:]
                self.__all_lists[-1][-1][0][att] = value

    def __level_number_func(self, line):
        """
        Requires:
            line -- line to process
        Returns:
            nothing
        Logic:
            Check for the end of the group; when it is found, store the
            collected string as the 'level-numbers' attribute of the
            level and go back to the level state.
            Otherwise collect the group's content: a hexadecimal token is
            appended as an escaped \\&#x0027; sequence followed by its
            value, a plain text token is appended as it is.
        """
        if self.__token_info == 'cb<nu<clos-brack' and\
                self.__cb_count == self.__level_number_ob_count:
            self.__state = 'level'
            self.__all_lists[-1][-1][0]['level-numbers'] = self.__level_numbers_string
            self.__level_numbers_string = ''
        elif self.__token_info == 'tx<hx<__________':
            self.__level_numbers_string += '\\&#x0027;%s' % line[18:]
        elif self.__token_info == 'tx<nu<__________':
            self.__level_numbers_string += line[17:]

    def __level_text_func(self, line):
        """
        Requires:
            line --line to process
        Returns:
            nothing
        Logic:
            Check for the end of the group.
            Otherwise, if the text is hexadecimal, call on the method
            __parse_level_text_length.
            Otherwise, if the text is regular text, create an attribute.
            This attribute indicates the punctuation after a certain level.
            An example is "level1-marker = '.'"
            Otherwise, check for a level-template-id.
        """
        if self.__token_info == 'cb<nu<clos-brack' and\
                self.__cb_count == self.__level_text_ob_count:
            if self.__prefix_string:
                # .get: a level may reach this point without having
                # declared a numbering type; that must not be a KeyError.
                if self.__all_lists[-1][-1][0].get('numbering-type') == 'bullet':
                    self.__prefix_string = self.__prefix_string.replace('_', '')
                    self.__all_lists[-1][-1][0]['bullet-type'] = self.__prefix_string
            self.__state = 'level'
            self.__level_text_string = ''
            self.__found_level_text_length = 0
        elif self.__token_info == 'tx<hx<__________':
            self.__parse_level_text_length(line)
        elif self.__token_info == 'tx<nu<__________':
            text = line[17:]
            if text and text[-1] == ';':
                text = text.replace(';', '')
            if not self.__level_text_position:
                self.__prefix_string = text
            else:
                self.__all_lists[-1][-1][0][self.__level_text_position] = text
        elif self.__token_info == 'cw<ls<lv-tem-id_':
            value = line[20:]
            self.__all_lists[-1][-1][0]['level-template-id'] = value

    def __parse_level_text_length(self, line):
        """
        Requires:
            line --line with hexadecimal number
        Returns:
            nothing
        Logic:
            Method is used to parse text in the \\leveltext group.
            The first hexadecimal number of a group is stored as the
            'list-text-length' attribute. Every later one (plus one)
            names a level: a show-levelN attribute is set, following
            text becomes that level's suffix, and a pending prefix
            string is stored as that level's prefix.
        """
        num = line[18:]
        the_num = int(num, 16)
        if not self.__found_level_text_length:
            self.__all_lists[-1][-1][0]['list-text-length'] = unicode_type(the_num)
            self.__found_level_text_length = 1
        else:
            the_num += 1
            the_string = unicode_type(the_num)
            level_marker = 'level%s-suffix' % the_string
            show_marker = 'show-level%s' % the_string
            self.__level_text_position = level_marker
            self.__all_lists[-1][-1][0][show_marker] = 'true'
            if self.__prefix_string:
                prefix_marker = 'level%s-prefix' % the_string
                self.__all_lists[-1][-1][0][prefix_marker] = self.__prefix_string
                self.__prefix_string = None

    def __list_name_func(self, line):
        """
        Requires:
            line --line to process
        Returns:
            nothing
        Logic:
            Simply check for the end of the group and change states.
        """
        if self.__token_info == 'cb<nu<clos-brack' and\
                self.__cb_count == self.__list_name_ob_count:
            self.__state = 'list'

    def __after_bracket_func(self, line):
        """
        Requires:
            line --line to parse
        Returns:
            nothing.
        Raises:
            the bug handler, at run levels above 3, when the token after
            the bracket is not a known group.
        Logic:
            The last token found was "{". This method determines what group
            you are now in.
            WARNING: this could cause problems. If no group is found, the
            state will remain unsure_ob, which means no other text will be
            parsed.
        """
        if self.__token_info == 'cw<ls<level-text':
            self.__state = 'level_text'
            self.__level_text_ob_count = self.__ob_count
        elif self.__token_info == 'cw<ls<level-numb':
            self.__level_number_ob_count = self.__ob_count
            self.__state = 'level_number'
        elif self.__token_info == 'cw<ls<list-tb-le':
            self.__found_level_func(line)
        elif self.__token_info == 'cw<ls<list-in-tb':
            self.__found_list_func(line)
        elif self.__token_info == 'cw<ls<list-name_':
            self.__state = 'list_name'
            self.__list_name_ob_count = self.__ob_count
        else:
            if self.__run_level > 3:
                msg = 'No matching token after open bracket\n'
                msg += 'token is "%s"\n' % (line)
                # The message used to be built and then thrown away.
                raise self.__bug_handler(msg)

    def __add_to_final_line(self):
        """
        Method no longer used.
        """
        self.__list_table_final = 'mi<mk<listabbeg_\n'
        self.__list_table_final += 'mi<tg<open______<list-table\n' + \
            'mi<mk<listab-beg\n' + self.__list_table_final
        self.__list_table_final += \
            'mi<mk<listab-end\n' + 'mi<tg<close_____<list-table\n'
        self.__list_table_final += 'mi<mk<listabend_\n'

    def __write_final_string(self):
        """
        Requires:
            nothing
        Returns:
            nothing
        Logic:
            Write out the list-table start tag.
            Iterate through self.__all_lists. For each list, write out
            a list-in-table tag. Get the dictionary of this list
            (the first item). Print out the key => value pair.
            Then iterate through what is left in the list. Each item is a
            list that contains one dictionary. Get this dictionary and
            print out key => value pair in a level-in-table tag.
            For bullet levels the show-level, suffix and prefix attributes
            are left out.
        """
        not_allow = ['list-id']
        parts = [
            'mi<mk<listabbeg_\n',
            'mi<tg<open______<list-table\n',
            'mi<mk<listab-beg\n',
            # The begin marker is emitted a second time here. That is
            # what the original string arithmetic produced, and the token
            # stream is kept identical.
            'mi<mk<listabbeg_\n',
        ]
        for list_group in self.__all_lists:
            parts.append('mi<tg<open-att__<list-in-table')
            for att, value in list_group[0].items():
                if att in not_allow:
                    continue
                parts.append('<%s>%s' % (att, value))
            parts.append('\n')
            for level_num, level in enumerate(list_group[1:], 1):
                parts.append('mi<tg<empty-att_<level-in-table')
                parts.append('<level>%s' % level_num)
                level_atts = level[0]
                is_bullet = level_atts.get('numbering-type') == 'bullet'
                for att2, value2 in level_atts.items():
                    if att2 in not_allow:
                        continue
                    if is_bullet and (
                            att2[0:10] == 'show-level'
                            or att2[-6:] in ('suffix', 'prefix')):
                        # not written for bullet levels
                        continue
                    parts.append('<%s>%s' % (att2, value2))
                parts.append('\n')
            parts.append('mi<tg<close_____<list-in-table\n')
        parts.append('mi<mk<listab-end\n')
        parts.append('mi<tg<close_____<list-table\n')
        parts.append('mi<mk<listabend_\n')
        self.__list_table_final = ''.join(parts)

    def parse_list_table(self, line):
        """
        Requires:
            line -- string with the tokens of the list table, one token
                per line
        Returns:
            A string and the list of lists holding the list-table values
            and attributes.
        Logic:
            Call on the __parse_lines method, which splits the text string
            into lines (which will be tokens) and processes them.
        """
        self.__parse_lines(line)
        return self.__list_table_final, self.__all_lists

View File

@@ -0,0 +1,465 @@
from __future__ import unicode_literals, absolute_import, print_function, division
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# #
#########################################################################
import sys, os, re
from calibre.ebooks.rtf2xml import copy
from calibre.ptempfile import better_mktemp
from polyglot.builtins import unicode_type
from . import open_for_read, open_for_write
class MakeLists:
"""
Form lists.
Use RTF's own formatting to determine if a paragraph definition is part of a
list.
Use indents to determine items and how lists are nested.
"""
def __init__(self,
        in_file,
        bug_handler,
        headings_to_sections,
        list_of_lists,
        copy=None,
        run_level=1,
        no_headings_as_list=1,
        write_list_info=0,
        ):
    """
    Store the settings for the list-making pass.

    Required:
        'in_file' -- path of the token file to process
        'bug_handler' -- stored for error reporting
        'headings_to_sections' -- stored; read elsewhere in the class
        'list_of_lists' -- the parsed list table (may be empty for
            older RTF)
    Optional:
        'copy' -- whether to make a copy of result for debugging
        'run_level' -- verbosity/strictness level
        'no_headings_as_list' -- stored; read elsewhere in the class
        'write_list_info' -- whether to write the list-table attributes
            onto each list tag
    Returns:
        nothing
    """
    # input and output locations
    self.__file = in_file
    self.__write_to = better_mktemp()
    self.__copy = copy
    # behaviour switches
    self.__run_level = run_level
    self.__bug_handler = bug_handler
    self.__headings_to_sections = headings_to_sections
    self.__no_headings_as_list = no_headings_as_list
    self.__write_list_info = write_list_info
    # data from the list table
    self.__list_of_lists = list_of_lists
def __initiate_values(self):
    """
    Reset the parser state and build the lookup tables.

    Required:
        Nothing
    Return:
        Nothing
    Logic:
        The self.__end_list is a list of tokens that will force a list
        to end. Likewise, the self.__end_lines is a list of lines that
        forces a list to end.
    """
    # state machine
    self.__state = "default"
    self.__state_dict = {
        'default': self.__default_func,
        'in_pard': self.__in_pard_func,
        'after_pard': self.__after_pard_func,
    }
    # what is known about the current paragraph and list
    self.__left_indent = 0
    self.__level = 0
    self.__list_type = 'not-defined'
    self.__pard_def = ""
    self.__style_name = ''
    self.__all_lists = []
    self.__list_chunk = ''
    # fixed vocabularies
    self.__headings = ['heading %d' % number for number in range(1, 10)]
    self.__allow_levels = ['%d' % digit for digit in range(10)]
    self.__end_list = [
        'mi<mk<body-close',
        'mi<mk<par-in-fld',
        'cw<tb<cell______',
        'cw<tb<row-def___',
        'cw<tb<row_______',
        'mi<mk<sect-close',
        'mi<mk<sect-start',
        'mi<mk<header-beg',
        'mi<mk<header-end',
        'mi<mk<head___clo',
        'mi<mk<fldbk-end_',
        'mi<mk<close_cell',
        'mi<mk<footnt-ope',
        'mi<mk<foot___clo',
        'mi<mk<tabl-start',
    ]
    self.__end_lines = ['mi<tg<close_____<cell\n']
    # attribute extractors for paragraph-definition tags
    self.__id_regex = re.compile(r'\<list-id\>(\d+)')
    self.__lv_regex = re.compile(r'\<list-level\>(\d+)')
    # debugging aids
    self.__found_appt = 0
    self.__line_num = 0
def __in_pard_func(self, line):
    """
    Pass a paragraph definition through while inside a list.

    Required:
        line -- the line of current text.
    Return:
        Nothing
    Logic:
        You are in a list, but in the middle of a paragraph definition.
        Every line is written out unchanged; the only thing watched for
        is the end of the paragraph definition, which switches the state
        to 'after_pard'.
    """
    reached_end = self.__token_info == 'mi<mk<pard-end__'
    if reached_end:
        self.__state = 'after_pard'
    self.__write_obj.write(line)
def __after_pard_func(self, line):
    """
    Handle a line that follows a paragraph definition inside a list.

    Required:
        line -- the line of current text.
    Return:
        Nothing
    Logic:
        You are in a list, but after a paragraph definition. You have to
        determine if the next paragraph definition ends a list,
        continues the old one, or starts a new one.
        If a bigger block is found (such as a section or a cell), end
        all lists.
        If a paragraph definition is found, determine if it contains a
        list-id. If it does (and it is not a heading), use the method
        self.__list_after_par_def_func to determine the action.
        If it is a heading, end all lists.
        If it contains no list-id, use the method __close_lists to close
        out items and lists for a paragraph that is not indented.
        If no special line is found, add the line to a buffer.
    """
    starts_par_def = (
        self.__token_info == 'mi<tg<open-att__'
        and line[17:37] == 'paragraph-definition'
    )
    if not starts_par_def:
        if self.__token_info in self.__end_list:
            # a bigger block: force every open list to close
            self.__left_indent = -1000
            self.__close_lists()
            self.__write_obj.write(self.__list_chunk)
            self.__list_chunk = ''
            self.__state = 'default'
            self.__write_obj.write(line)
        else:
            self.__list_chunk += line
        return

    is_heading = self.__is_a_heading()
    id_match = self.__id_regex.search(line)
    if id_match and not is_heading:
        # paragraph definition that carries a list-id
        level_match = self.__lv_regex.search(line)
        if level_match:
            self.__level = level_match.group(1)
        self.__list_after_par_def_func(line, id_match.group(1))
        self.__write_obj.write(line)
        self.__state = 'in_pard'
        return

    if is_heading:
        self.__left_indent = -1000
        self.__close_lists()
        self.__write_obj.write(self.__list_chunk)
        self.__list_chunk = ''
        self.__state = 'default'
        self.__write_obj.write(line)
        return

    # ordinary paragraph definition without a list-id
    self.__close_lists()
    self.__write_obj.write(self.__list_chunk)
    self.__list_chunk = ''
    self.__write_obj.write(line)
    if len(self.__all_lists) == 0:
        self.__state = 'default'
    else:
        self.__state = 'in_pard'
def __list_after_par_def_func(self, line, id):
    """
    Decide what a new list paragraph means for the open lists.

    Required:
        line -- the line of current text.
        id -- the id of the current list
    Return:
        Nothing
    Logic:
        You have found the end of a paragraph definition, and have found
        another paragraph definition with a list id.
        If the list-id is different from the innermost open list, close
        out the lists with another method, write the string in the
        buffer and start a new list.
        If the list id is the same, check the indent of the current
        paragraph definition. If it is greater than that of the
        innermost list, do not end the current list or item: write the
        buffer and start a new (nested) list. Otherwise end the current
        item, write the buffer and start a new item.
        In every case the buffer is emptied afterwards.
    """
    innermost = self.__all_lists[-1]
    if id != innermost['id']:
        self.__close_lists()
        self.__write_obj.write(self.__list_chunk)
        self.__write_start_list(id)
    elif self.__left_indent > innermost['left-indent']:
        self.__write_obj.write(self.__list_chunk)
        self.__write_start_list(id)
    else:
        self.__write_end_item()
        self.__write_obj.write(self.__list_chunk)
        self.__write_start_item()
    self.__list_chunk = ''
def __close_lists(self):
    """
    Close every open list that is indented at least as far as the
    current paragraph.

    Required:
        Nothing
    Return:
        Nothing
    Logic:
        Go through the open lists from the innermost outwards and get
        the indent for each list. If the current indent is less than
        or equal to the indent of the list, close that level (end the
        item, then end the list).
        Keep track of how many levels you close. Drop that many levels
        from the inner end of self.__all_lists.
    """
    debugging = self.__line_num < 25 and self.__found_appt
    if debugging:
        sys.stderr.write('in closing out lists\n')
        sys.stderr.write('current_indent is "%s"\n' % self.__left_indent)
    current_indent = self.__left_indent
    closed = 0
    for open_list in reversed(self.__all_lists):
        list_indent = open_list.get('left-indent')
        if debugging:
            sys.stderr.write('last indent is "%s"' % list_indent)
        if current_indent <= list_indent:
            self.__write_end_item()
            self.__write_end_list()
            closed += 1
    self.__all_lists = self.__all_lists[:len(self.__all_lists) - closed]
def __write_end_list(self):
    """
    Write the end of a list.

    Required:
        Nothing
    Return:
        Nothing
    Logic:
        Write the closing list tag followed by the list_close marker.
    """
    for token in ('mi<tg<close_____<list\n', 'mi<mk<list_close\n'):
        self.__write_obj.write(token)
def __write_start_list(self, id):
    """
    Write the start of a list and register it as open.

    Required:
        id -- the id of the current list.
    Return:
        Nothing
    Logic:
        Add the id and left-indent to the self.__all_lists list.
        Write cues of when a list starts for later processing, and the
        opening list tag with its id, level and type.
        In order to determine the type of list, look the id up in
        self.__list_of_lists. This list looks like:
            [[{list-id: [1, 2]}, [{}], [{}]], [{list-id: [3, 4]}, [{}]]]
        The first item of each inner list is the dictionary of list
        attributes; the items after it each hold one dictionary of
        level attributes. The level dictionary for the current level
        (or the last one, if the level is out of range) gives the
        numbering type: 'bullet' means unordered, anything else ordered.
        When there is no list table, no matching id, or the matching
        entry has no levels at all, the default list type is written.
        Finally start the first item of the list.
    """
    self.__all_lists.append({'left-indent': self.__left_indent, 'id': id})
    self.__write_obj.write('mi<mk<list_start\n')
    # bogus levels are sometimes written for empty paragraphs
    if '%s' % self.__level in self.__allow_levels:
        lev_num = self.__level
    else:
        lev_num = '0'
    self.__write_obj.write(
        'mi<tg<open-att__<list<list-id>%s<level>%s' % (id, lev_num))

    list_dict = {}
    level_dict = {}
    list_type = self.__list_type  # used when the table cannot tell
    if self.__list_of_lists:  # older RTF won't generate a list_of_lists
        index_of_list = self.__get_index_of_list(id)
        if index_of_list is not None:  # found a matching id
            curlist = self.__list_of_lists[index_of_list]
            list_dict = curlist[0]
            levels = curlist[1:]
            # An entry without any level used to end up indexing the
            # attribute dictionary with 0 and raising KeyError.
            if levels:
                level_index = min(int(self.__level), len(levels) - 1)
                level_dict = levels[level_index][0]
                if level_dict.get('numbering-type') == 'bullet':
                    list_type = 'unordered'
                else:
                    list_type = 'ordered'
    self.__write_obj.write('<list-type>%s' % (list_type))

    # if you want to dump all the info to the list, rather than
    # keeping it in the table above, change self.__write_list_info
    # to true.
    if self.__list_of_lists and self.__write_list_info and list_dict:
        not_allow = ['list-id']
        for the_key, value in list_dict.items():
            if the_key in not_allow:
                continue
            self.__write_obj.write('<%s>%s' % (the_key, value))
        for the_key, value in level_dict.items():
            self.__write_obj.write('<%s>%s' % (the_key, value))
    self.__write_obj.write('\n')
    self.__write_obj.write('mi<mk<liststart_\n')
    self.__write_start_item()
def __get_index_of_list(self, id):
    """
    Requires:
        id -- id of current paragraph-definition
    Returns:
        the position in self.__list_of_lists of the first entry whose
        leading dictionary lists this id under 'list-id'; None when the
        id is '0' or when nothing matches.
    Logic:
        Walk the entries in order and return at the first match. When no
        entry matches and the run level is above 0, report it on stderr.
    """
    # some RTF use 0 indexed list. Don't know what to do?
    if id == '0':
        return
    for position, entry in enumerate(self.__list_of_lists):
        if id in entry[0].get('list-id'):
            return position
    if self.__run_level > 0:
        sys.stderr.write(
            'Module is make_lists.py\n'
            'Method is __get_index_of_list\n'
            'The main list does not appear to have a matching id for %s \n'
            % (id)
        )
def __write_start_item(self):
    """Write the marker, open tag and marker that begin a list item."""
    for token in (
        'mi<mk<item_start\n',
        'mi<tg<open______<item\n',
        'mi<mk<itemstart_\n',
    ):
        self.__write_obj.write(token)
def __write_end_item(self):
    """Write the three lines that end a list item."""
    for token in (
        'mi<tg<item_end__\n',
        'mi<tg<close_____<item\n',
        'mi<tg<item__end_\n',
    ):
        self.__write_obj.write(token)
def __default_func(self, line):
    """
    Required:
        self, line
    Returns:
        Nothing
    Logic:
        When the line opens a paragraph-definition that is not treated as
        a heading and carries a list id, switch to the 'in_pard' state,
        remember the list level if one is present, and start a list.
        The line itself is always written out afterwards.
    """
    opens_pard = (
        self.__token_info == 'mi<tg<open-att__'
        and line[17:37] == 'paragraph-definition'
    )
    if opens_pard and not self.__is_a_heading():
        id_match = re.search(self.__id_regex, line)
        if id_match:
            list_id = id_match.group(1)
            self.__state = 'in_pard'
            level_match = re.search(self.__lv_regex, line)
            if level_match:
                self.__level = level_match.group(1)
            self.__write_start_list(list_id)
    self.__write_obj.write(line)
def __is_a_heading(self):
    """
    Return 1 when the current style name is one of the known headings and
    either headings are converted to sections or headings must not be
    treated as lists; otherwise return 0.
    """
    if self.__style_name not in self.__headings:
        return 0
    if self.__headings_to_sections or self.__no_headings_as_list:
        return 1
    return 0
def __get_indent(self, line):
    """Record the left indent (as a float) when the line is a left-indent marker."""
    if self.__token_info != 'mi<mk<left_inden':
        return
    # value sits after the 16-character token and '<', minus the newline
    self.__left_indent = float(line[17:-1])
def __get_list_type(self, line):
    """Record the list type from a list-type marker; 'item' is stored as "unordered"."""
    if self.__token_info != 'mi<mk<list-type_':  # <ordered
        return
    found = line[17:-1]
    self.__list_type = "unordered" if found == 'item' else found
def __get_style_name(self, line):
    """Record the style name when the line is a style-name marker."""
    if self.__token_info != 'mi<mk<style-name':
        return
    self.__style_name = line[17:-1]
def make_lists(self):
    """
    Required:
        nothing
    Returns:
        nothing; the original file is replaced by the processed one
    Logic:
        Read the input one line at a time. For every line, refresh the
        token info, the running indent, list type and style name, then
        hand the line to the handler registered for the current state.
        Afterwards optionally keep a debug copy, move the temporary
        output over the input file and remove the temporary file.
    """
    self.__initiate_values()
    # Context managers guarantee both handles are closed even when a state
    # handler raises part-way through the file.
    with open_for_read(self.__file) as read_obj, \
            open_for_write(self.__write_to) as write_obj:
        self.__write_obj = write_obj
        line = 1  # truthy sentinel so the loop is entered
        while line:
            # The empty string returned at end-of-file is still dispatched
            # once, exactly as before, and then terminates the loop.
            line = read_obj.readline()
            self.__token_info = line[:16]
            self.__get_indent(line)
            self.__get_list_type(line)
            self.__get_style_name(line)
            action = self.__state_dict.get(self.__state)
            action(line)
    copy_obj = copy.Copy(bug_handler=self.__bug_handler)
    if self.__copy:
        copy_obj.copy_file(self.__write_to, "make_lists.data")
    copy_obj.rename(self.__write_to, self.__file)
    os.remove(self.__write_to)

View File

@@ -0,0 +1,146 @@
from __future__ import absolute_import, division, print_function, unicode_literals
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# #
#########################################################################
import sys
from polyglot.builtins import unicode_type
from . import open_for_read
class OldRtf:
    """
    Check to see if the RTF is an older version
    Logic:
    If allowable control word/properties happen in text without being enclosed
    in brackets the file will be considered old rtf
    """

    def __init__(self, in_file,
                 bug_handler,
                 run_level,
                 ):
        """
        Required:
            'in_file' -- path of the tokenized file to inspect
            'bug_handler' -- stored on the instance; the methods of this
                class do not raise it
            'run_level' -- verbosity; above 3 the construction that marks
                the file as old RTF is reported on stderr
        Returns:
            nothing
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__run_level = run_level
        # Character-formatting tokens (characters 6-16 of a token line).
        # A frozenset because membership is tested once per input line.
        self.__allowable = frozenset([
            'annotation',
            'blue______',
            'bold______',
            'caps______',
            'char-style',
            'dbl-strike',
            'emboss____',
            'engrave___',
            'font-color',
            'font-down_',
            'font-size_',
            'font-style',
            'font-up___',
            'footnot-mk',
            'green_____',
            'hidden____',
            'italics___',
            'outline___',
            'red_______',
            'shadow____',
            'small-caps',
            'strike-thr',
            'subscript_',
            'superscrip',
            'underlined',
        ])
        self.__action_dict = {
            'before_body': self.__before_body_func,
            'in_body': self.__check_tokens_func,
            'after_pard': self.__after_pard_func,
        }

    def __initiate_values(self):
        """Reset the per-run parsing state."""
        self.__previous_token = ''
        self.__state = 'before_body'
        self.__found_new = 0
        self.__ob_group = 0
        # Bracket depth at which the body opened; set for real when the
        # body-open marker is seen.
        self.__base_ob_count = 0

    def __check_tokens_func(self, line):
        """
        In the body: a formatting token at the body's own bracket depth
        means old RTF (returns 'old_rtf'); one nested in a group is only
        counted. A paragraph definition switches to 'after_pard'.
        """
        if self.__inline_info in self.__allowable:
            if self.__ob_group == self.__base_ob_count:
                return 'old_rtf'
            else:
                self.__found_new += 1
        elif self.__token_info == 'cw<pf<par-def___':
            self.__state = 'after_pard'

    def __before_body_func(self, line):
        """Wait for the body-open marker and remember the bracket depth there."""
        if self.__token_info == 'mi<mk<body-open_':
            self.__state = 'in_body'
            self.__base_ob_count = self.__ob_group

    def __after_pard_func(self, line):
        """Skip the control words that follow a paragraph definition."""
        if line[0:2] != 'cw':
            self.__state = 'in_body'

    def check_if_old_rtf(self):
        """
        Requires:
            nothing
        Returns:
            True if file is older RTf
            False if file is newer RTF
        """
        self.__initiate_values()
        with open_for_read(self.__file) as read_obj:
            for line_num, line in enumerate(read_obj, 1):
                self.__token_info = line[:16]
                if self.__token_info == 'mi<mk<body-close':
                    return False
                if self.__token_info == 'ob<nu<open-brack':
                    self.__ob_group += 1
                    self.__ob_count = line[-5:-1]
                if self.__token_info == 'cb<nu<clos-brack':
                    self.__ob_group -= 1
                    self.__cb_count = line[-5:-1]
                self.__inline_info = line[6:16]
                if self.__state == 'after_body':
                    return False
                action = self.__action_dict.get(self.__state)
                if action is None:
                    # Reporting is best effort (stderr may be unusable),
                    # but never go on to call a missing handler.
                    try:
                        sys.stderr.write('No action for this state!\n')
                    except (AttributeError, EnvironmentError, ValueError):
                        pass
                    continue
                result = action(line)
                if result == 'new_rtf':
                    return False
                elif result == 'old_rtf':
                    if self.__run_level > 3:
                        sys.stderr.write(
                            'Old rtf construction %s (bracket %s, line %s)\n' % (
                                self.__inline_info, self.__ob_group, line_num)
                        )
                    return True
                self.__previous_token = line[6:16]
        return False

View File

@@ -0,0 +1,121 @@
from __future__ import unicode_literals, absolute_import, print_function, division
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# #
#########################################################################
import sys, os
from polyglot.builtins import raw_input
from . import open_for_read, open_for_write
# , codecs
class Output:
    """
    Output file
    """

    def __init__(self,
                 file,
                 orig_file,
                 output_dir=None,
                 out_file=None,
                 no_ask=True
                 ):
        """
        Required:
            'file' -- xml file ready to output
            orig_file -- original rtf file
        Optional:
            output_dir -- directory in which to create the output file
            out_file -- the file to output to
            no_ask -- when true, never ask before overwriting a file in
                output_dir
        Returns:
            nothing
        """
        self.__file = file
        self.__orig_file = orig_file
        self.__output_dir = output_dir
        self.__no_ask = no_ask
        self.__out_file = out_file

    def output(self):
        """
        Required:
            nothing
        Returns:
            nothing
        Logic:
            Write into the output directory when one was given, otherwise
            to the named output file, otherwise to standard output.
        """
        if self.__output_dir:
            self.__output_to_dir_func()
        elif self.__out_file:
            self.__output_to_file_func()
        else:
            self.__output_to_standard_func()

    def __copy_to_path(self, path):
        """Copy the xml file line by line into the file at 'path'."""
        with open_for_read(self.__file) as read_obj:
            with open_for_write(path) as write_obj:
                for line in read_obj:
                    write_obj.write(line)

    def __output_to_dir_func(self):
        """
        Requires:
            nothing
        Returns:
            nothing
        Logic:
            The target is '<name of the original file>.xml' inside the
            output directory, or out_file inside that directory when one
            was given. If the target exists and asking is enabled, the user
            chooses between overwriting and printing to standard output.
        """
        base_name = os.path.splitext(os.path.basename(self.__orig_file))[0]
        output_file = os.path.join(self.__output_dir, '%s.xml' % base_name)
        # change if user wants to output to a specific file
        if self.__out_file:
            output_file = os.path.join(self.__output_dir, self.__out_file)
        user_response = 'o'
        if os.path.isfile(output_file) and not self.__no_ask:
            msg = 'Do you want to overwrite %s?\n' % output_file
            msg += ('Type "o" to overwrite.\n'
                    'Type any other key to print to standard output.\n')
            sys.stderr.write(msg)
            user_response = raw_input()
        if user_response == 'o':
            # Use the path computed above; the instance has no
            # 'output_file' attribute.
            self.__copy_to_path(output_file)
        else:
            self.__output_to_standard_func()

    def __output_to_file_func(self):
        """
        Required:
            nothing
        Returns:
            nothing
        Logic:
            read one line at a time. Output to the out_file path.
        """
        self.__copy_to_path(self.__out_file)

    def __output_to_standard_func(self):
        """
        Required:
            nothing
        Returns:
            nothing
        Logic:
            read one line at a time. Output to standard
        """
        with open_for_read(self.__file) as read_obj:
            for line in read_obj:
                sys.stdout.write(line)

View File

@@ -0,0 +1,209 @@
from __future__ import unicode_literals, absolute_import, print_function, division
from __future__ import print_function
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# #
#########################################################################
class OverrideTable:
    """
    Parse a line of text to make the override table. Return a string
    (which will convert to XML) and the dictionary containing all the
    information about the lists. This dictionary is the result of the
    dictionary that is first passed to this module. This module
    modifies the dictionary, assigning lists numbers to each list.
    """

    def __init__(
        self,
        list_of_lists,
        run_level=1,
        bug_handler=None,
    ):
        """
        Required:
            list_of_lists -- list information collected from the list
                table; modified in place
        Optional:
            run_level -- above 3, malformed overrides raise bug_handler
            bug_handler -- exception class raised for malformed input;
                RuntimeError when not supplied
        """
        self.__list_of_lists = list_of_lists
        self.__initiate_values()
        self.__run_level = run_level
        self.__bug_handler = RuntimeError if bug_handler is None else bug_handler

    def __initiate_values(self):
        """Set up the parsing state and the dispatch tables."""
        self.__override_table_final = ''
        self.__state = 'default'
        self.__override_list = []
        self.__state_dict = {
            'default': self.__default_func,
            'override': self.__override_func,
            'unsure_ob': self.__after_bracket_func,
        }
        self.__override_dict = {
            'cw<ls<lis-tbl-id': 'list-table-id',
            'cw<ls<list-id___': 'list-id',
        }

    def __override_func(self, line):
        """
        Requires:
            line -- line to parse
        Returns:
            nothing
        Logic:
            The group {\\override has been found.
            Check for the end of the group.
            Otherwise, add appropriate tokens to the override dictionary.
        """
        if self.__token_info == 'cb<nu<clos-brack' and\
                self.__cb_count == self.__override_ob_count:
            self.__state = 'default'
            self.__parse_override_dict()
        else:
            att = self.__override_dict.get(self.__token_info)
            if att:
                self.__override_list[-1][att] = line[20:]

    def __parse_override_dict(self):
        """
        Requires:
            nothing
        Returns:
            nothing
        Logic:
            The entries of list_of_lists have no id yet. Find the entry
            whose list-table-id equals the one in the override that was
            just closed and append the override's list-id to that entry's
            'list-id' list:
            [[{list-id:[HERE!],[{}]]
            It is a list, since one list in the table in the preamble of
            RTF can apply to multiple lists in the body.
            An override without a list-id records nothing; above run level
            3 a missing list-id or list-table-id raises bug_handler.
        """
        override_dict = self.__override_list[-1]
        list_id = override_dict.get('list-id')
        if list_id is None and self.__run_level > 3:
            msg = 'This override does not appear to have a list-id\n'
            raise self.__bug_handler(msg)
        current_table_id = override_dict.get('list-table-id')
        if current_table_id is None and self.__run_level > 3:
            msg = 'This override does not appear to have a list-table-id\n'
            raise self.__bug_handler(msg)
        if list_id is None:
            return
        for entry in self.__list_of_lists:
            info_dict = entry[0]
            if info_dict.get('list-table-id') == current_table_id:
                info_dict['list-id'].append(list_id)
                break

    def __parse_lines(self, line):
        """
        Requires:
            line -- line to parse
        Returns:
            nothing
        Logic:
            Break the text into tokens by splitting it on the newline.
            Call on the method according to the state.
        """
        self.__ob_count = 0
        self.__ob_group = 0
        for token_line in line.split('\n'):
            self.__token_info = token_line[:16]
            if self.__token_info == 'ob<nu<open-brack':
                self.__ob_count = token_line[-4:]
                self.__ob_group += 1
            if self.__token_info == 'cb<nu<clos-brack':
                self.__cb_count = token_line[-4:]
                self.__ob_group -= 1
            action = self.__state_dict.get(self.__state)
            if action is None:
                raise self.__bug_handler(
                    'No action for state "%s"\n' % self.__state)
            action(token_line)
        self.__write_final_string()

    def __default_func(self, line):
        """
        Requires:
            line -- line to parse
        Return:
            nothing
        Logic:
            Look for an open bracket and change states when found.
        """
        if self.__token_info == 'ob<nu<open-brack':
            self.__state = 'unsure_ob'

    def __after_bracket_func(self, line):
        """
        Requires:
            line -- line to parse
        Returns:
            nothing
        Logic:
            The last token was an open bracket. You need to determine
            the group based on the token after.
            WARNING: this could cause problems. If no group is found, the
            state will remain unsure_ob, which means no other text will be
            parsed. I should do states by a list and simply pop this
            unsure_ob state to get the previous state.
        """
        if self.__token_info == 'cw<ls<lis-overid':
            self.__state = 'override'
            self.__override_ob_count = self.__ob_count
            self.__override_list.append({})
        elif self.__run_level > 3:
            msg = 'No matching token after open bracket\n'
            msg += 'token is "%s\n"' % (line)
            raise self.__bug_handler(msg)

    def __write_final_string(self):
        """
        Requires:
            nothing
        Returns:
            nothing
        Logic:
            First write out the override-table tag.
            Iterate through the dictionaries in the main override_list.
            For each dictionary, write an empty tag "override-list". Add
            the attributes and values of the tag from the dictionary.
        """
        parts = [
            'mi<mk<over_beg_\n',
            'mi<tg<open______<override-table\n',
            'mi<mk<overbeg__\n',
            # The opening marker has always been emitted a second time
            # here; kept so the generated text stays byte-identical.
            'mi<mk<over_beg_\n',
        ]
        for the_dict in self.__override_list:
            parts.append('mi<tg<empty-att_<override-list')
            for the_key in the_dict:
                parts.append('<%s>%s' % (the_key, the_dict[the_key]))
            parts.append('\n')
        parts.append('\n')
        parts.append('mi<mk<overri-end\n')
        parts.append('mi<tg<close_____<override-table\n')
        parts.append('mi<mk<overribend_\n')
        self.__override_table_final = ''.join(parts)

    def parse_override_table(self, line):
        """
        Requires:
            line -- text of the override table, one token per line
        Returns:
            A string that will be converted to XML, and a dictionary of
            all the properties of the RTF lists.
        Logic:
            Parse the tokens, then hand back the generated string together
            with the (modified) list_of_lists.
        """
        self.__parse_lines(line)
        return self.__override_table_final, self.__list_of_lists

View File

@@ -0,0 +1,763 @@
from __future__ import absolute_import, division, print_function, unicode_literals
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# #
#########################################################################
import sys, os
from calibre.ebooks.rtf2xml import copy, border_parse
from calibre.ptempfile import better_mktemp
from polyglot.builtins import unicode_type
from . import open_for_read, open_for_write
class ParagraphDef:
"""
=================
Purpose
=================
Write paragraph definition tags.
States:
1. before_1st_para_def.
Before any para_def token is found. This means all the text in the preamble.
Look for the token 'cw<pf<par-def___'. This will change the state to collect_tokens.
2. collect_tokens.
Found a paragraph_def. Need to get all tokens.
Change with start of a paragraph ('mi<mk<para-start'). State then becomes
in_paragraphs
If another paragraph definition is found, the state does not change.
But the dictionary is reset.
3. in_paragraphs
State changes when 'mi<mk<para-end__', or end of paragraph is found.
State then becomes 'self.__state = 'after_para_end'
4. after_para_end
If 'mi<mk<para-start' (the start of a paragraph) or 'mi<mk<para-end__' (the end of a paragraph--must be empty paragraph?) are found:
state changes to 'in_paragraphs'
If 'cw<pf<par-def___' (paragraph_definition) is found:
state changes to collect_tokens
if 'mi<mk<body-close', 'mi<mk<par-in-fld',
'cw<tb<cell______','cw<tb<row-def___','cw<tb<row_______',
'mi<mk<sect-close', 'mi<mk<header-beg', 'mi<mk<header-end'
are found. (All these tokens mark the start of a bigger element. para_def must
be closed:
state changes to 'after_para_def'
5. after_para_def
'mi<mk<para-start' changes state to in_paragraphs
if another paragraph_def is found, the state changes to collect_tokens.
"""
def __init__(self,
             in_file,
             bug_handler,
             default_font,
             copy=None,
             run_level=1,):
    """
    Required:
        'in_file' -- file to parse
        'bug_handler' -- exception raised by the handlers for bad input
        'default_font' -- document default font
    Optional:
        'copy' -- whether to make a copy of result for debugging
        'run_level' -- verbosity / strictness level
    Returns:
        nothing
    """
    # temporary file that receives the rewritten output
    self.__write_to = better_mktemp()
    self.__file, self.__bug_handler = in_file, bug_handler
    self.__default_font = default_font
    self.__copy, self.__run_level = copy, run_level
def __initiate_values(self):
"""
Initiate all values.
Builds the lookup tables used by the state handlers (token names,
tab handling, per-state dispatch), the marker strings written around a
paragraph definition, and resets the parsing state.
"""
# Dictionary needed to convert shortened style names to readable names.
# __collect_tokens_func looks up characters 6-16 of a control-word line
# here and uses the value as the attribute name.
self.__token_dict={
# paragraph formatting => pf
'par-end___' : 'para',
'par-def___' : 'paragraph-definition',
'keep-w-nex' : 'keep-with-next',
'widow-cntl' : 'widow-control',
'adjust-rgt' : 'adjust-right',
'language__' : 'language',
'right-inde' : 'right-indent',
'fir-ln-ind' : 'first-line-indent',
'left-inden' : 'left-indent',
'space-befo' : 'space-before',
'space-afte' : 'space-after',
'line-space' : 'line-spacing',
'default-ta' : 'default-tab',
'align_____' : 'align',
'widow-cntr' : 'widow-control',
# stylesheet = > ss
'style-shet' : 'stylesheet',
'based-on__' : 'based-on-style',
'next-style' : 'next-style',
'char-style' : 'character-style',
# this is changed to get a nice attribute
'para-style' : 'name',
# graphics => gr
'picture___' : 'pict',
'obj-class_' : 'obj_class',
'mac-pic___' : 'mac-pict',
# section => sc
'section___' : 'section-new',
'sect-defin' : 'section-reset',
'sect-note_' : 'endnotes-in-section',
# list=> ls
'list-text_' : 'list-text',
# this line must be wrong because it duplicates an earlier one
# (same key, same value, so the duplicate has no effect)
'list-text_' : 'list-text',
'list______' : 'list',
'list-lev-d' : 'list-level-definition',
'list-cardi' : 'list-cardinal-numbering',
'list-decim' : 'list-decimal-numbering',
'list-up-al' : 'list-uppercase-alphabetic-numbering',
'list-up-ro' : 'list-uppercae-roman-numbering',
'list-ord__' : 'list-ordinal-numbering',
'list-ordte' : 'list-ordinal-text-numbering',
'list-bulli' : 'list-bullet',
'list-simpi' : 'list-simple',
'list-conti' : 'list-continue',
'list-hang_' : 'list-hang',
# 'list-tebef' : 'list-text-before',
# 'list-level' : 'level',
'list-id___' : 'list-id',
'list-start' : 'list-start',
'nest-level' : 'nest-level',
# duplicate
'list-level' : 'list-level',
# notes => nt
'footnote__' : 'footnote',
'type______' : 'type',
# anchor => an
'toc_______' : 'anchor-toc',
'book-mk-st' : 'bookmark-start',
'book-mk-en' : 'bookmark-end',
'index-mark' : 'anchor-index',
'place_____' : 'place',
# field => fd
'field_____' : 'field',
'field-inst' : 'field-instruction',
'field-rslt' : 'field-result',
'datafield_' : 'data-field',
# info-tables => it
'font-table' : 'font-table',
'colr-table' : 'color-table',
'lovr-table' : 'list-override-table',
'listtable_' : 'list-table',
'revi-table' : 'revision-table',
# character info => ci
'hidden____' : 'hidden',
'italics___' : 'italics',
'bold______' : 'bold',
'strike-thr' : 'strike-through',
'shadow____' : 'shadow',
'outline___' : 'outline',
'small-caps' : 'small-caps',
'caps______' : 'caps',
'dbl-strike' : 'double-strike-through',
'emboss____' : 'emboss',
'engrave___' : 'engrave',
'subscript_' : 'subscript',
'superscrip' : 'superscipt',
'font-style' : 'font-style',
'font-color' : 'font-color',
'font-size_' : 'font-size',
'font-up___' : 'superscript',
'font-down_' : 'subscript',
'red_______' : 'red',
'blue______' : 'blue',
'green_____' : 'green',
# table => tb
'row-def___' : 'row-definition',
'cell______' : 'cell',
'row_______' : 'row',
'in-table__' : 'in-table',
'columns___' : 'columns',
'row-pos-le' : 'row-position-left',
'cell-posit' : 'cell-position',
# preamble => pr
# underline
'underlined' : 'underlined',
# border => bd
'bor-t-r-hi' : 'border-table-row-horizontal-inside',
'bor-t-r-vi' : 'border-table-row-vertical-inside',
'bor-t-r-to' : 'border-table-row-top',
'bor-t-r-le' : 'border-table-row-left',
'bor-t-r-bo' : 'border-table-row-bottom',
'bor-t-r-ri' : 'border-table-row-right',
'bor-cel-bo' : 'border-cell-bottom',
'bor-cel-to' : 'border-cell-top',
'bor-cel-le' : 'border-cell-left',
'bor-cel-ri' : 'border-cell-right',
# 'bor-par-bo' : 'border-paragraph-bottom',
'bor-par-to' : 'border-paragraph-top',
'bor-par-le' : 'border-paragraph-left',
'bor-par-ri' : 'border-paragraph-right',
'bor-par-bo' : 'border-paragraph-box',
'bor-for-ev' : 'border-for-every-paragraph',
'bor-outsid' : 'border-outisde',
'bor-none__' : 'border',
# border type => bt
'bdr-single' : 'single',
'bdr-doubtb' : 'double-thickness-border',
'bdr-shadow' : 'shadowed-border',
'bdr-double' : 'double-border',
'bdr-dotted' : 'dotted-border',
'bdr-dashed' : 'dashed',
'bdr-hair__' : 'hairline',
'bdr-inset_' : 'inset',
'bdr-das-sm' : 'dash-small',
'bdr-dot-sm' : 'dot-dash',
'bdr-dot-do' : 'dot-dot-dash',
'bdr-outset' : 'outset',
'bdr-trippl' : 'tripple',
'bdr-thsm__' : 'thick-thin-small',
'bdr-htsm__' : 'thin-thick-small',
'bdr-hthsm_' : 'thin-thick-thin-small',
'bdr-thm__' : 'thick-thin-medium',
'bdr-htm__' : 'thin-thick-medium',
'bdr-hthm_' : 'thin-thick-thin-medium',
'bdr-thl__' : 'thick-thin-large',
'bdr-hthl_' : 'think-thick-think-large',
'bdr-wavy_' : 'wavy',
'bdr-d-wav' : 'double-wavy',
'bdr-strip' : 'striped',
'bdr-embos' : 'emboss',
'bdr-engra' : 'engrave',
'bdr-frame' : 'frame',
'bdr-li-wid' : 'line-width',
}
# Tab-related control words, dispatched from __collect_tokens_func.
self.__tabs_dict = {
'cw<pf<tab-stop__' : self.__tab_stop_func,
'cw<pf<tab-center' : self.__tab_type_func,
'cw<pf<tab-right_' : self.__tab_type_func,
'cw<pf<tab-dec___' : self.__tab_type_func,
'cw<pf<leader-dot' : self.__tab_leader_func,
'cw<pf<leader-hyp' : self.__tab_leader_func,
'cw<pf<leader-und' : self.__tab_leader_func,
'cw<pf<tab-bar-st' : self.__tab_bar_func,
}
# Names written into the 'tabs' attribute for tab types and leaders.
self.__tab_type_dict = {
'cw<pf<tab-center' : 'center',
'cw<pf<tab-right_' : 'right',
'cw<pf<tab-dec___' : 'decimal',
'cw<pf<leader-dot' : 'leader-dot',
'cw<pf<leader-hyp' : 'leader-hyphen',
'cw<pf<leader-und' : 'leader-underline',
}
self.__border_obj = border_parse.BorderParse()
self.__style_num_strings = []
self.__body_style_strings = []
self.__state = 'before_1st_para_def'
self.__att_val_dict = {}
self.__start_marker = 'mi<mk<pard-start\n' # outside para tags
self.__start2_marker = 'mi<mk<pardstart_\n' # inside para tags
self.__end2_marker = 'mi<mk<pardend___\n' # inside para tags
self.__end_marker = 'mi<mk<pard-end__\n' # outside para tags
# lines buffered while in the after_para_end state
self.__text_string = ''
# state name -> handler; presumably consulted once per line by the
# main loop, which is outside this excerpt -- TODO confirm
self.__state_dict = {
'before_1st_para_def' : self.__before_1st_para_def_func,
'collect_tokens' : self.__collect_tokens_func,
'after_para_def' : self.__after_para_def_func,
'in_paragraphs' : self.__in_paragraphs_func,
'after_para_end' : self.__after_para_end_func,
}
# Per-state token tables: token info (first 16 characters of a line)
# -> handler called with the line.
self.__collect_tokens_dict = {
'mi<mk<para-start' : self.__end_para_def_func,
'cw<pf<par-def___' : self.__para_def_in_para_def_func,
'cw<tb<cell______' : self.__empty_table_element_func,
'cw<tb<row_______' : self.__empty_table_element_func,
}
self.__after_para_def_dict = {
'mi<mk<para-start' : self.__start_para_after_def_func,
'cw<pf<par-def___' : self.__found_para_def_func,
'cw<tb<cell______' : self.__empty_table_element_func,
'cw<tb<row_______' : self.__empty_table_element_func,
}
self.__in_paragraphs_dict = {
'mi<mk<para-end__' : self.__found_para_end_func,
}
self.__after_para_end_dict = {
'mi<mk<para-start' : self.__continue_block_func,
'mi<mk<para-end__' : self.__continue_block_func,
'cw<pf<par-def___' : self.__new_para_def_func,
'mi<mk<body-close' : self.__stop_block_func,
'mi<mk<par-in-fld' : self.__stop_block_func,
'cw<tb<cell______' : self.__stop_block_func,
'cw<tb<row-def___' : self.__stop_block_func,
'cw<tb<row_______' : self.__stop_block_func,
'mi<mk<sect-close' : self.__stop_block_func,
'mi<mk<sect-start' : self.__stop_block_func,
'mi<mk<header-beg' : self.__stop_block_func,
'mi<mk<header-end' : self.__stop_block_func,
'mi<mk<head___clo' : self.__stop_block_func,
'mi<mk<fldbk-end_' : self.__stop_block_func,
'mi<mk<lst-txbeg_' : self.__stop_block_func,
}
def __before_1st_para_def_func(self, line):
    """
    Required:
        line -- line to parse
    Returns:
        nothing
    Logic:
        Copy lines through until the first paragraph definition appears;
        that line is not written and starts token collection instead.
    """
    # cw<pf<par-def___<nu<true
    if self.__token_info != 'cw<pf<par-def___':
        self.__write_obj.write(line)
        return
    self.__found_para_def_func()
def __found_para_def_func(self, line=None):
    """
    Requires:
        line -- ignored. Accepted so that this method can also be called
            the way the other token handlers are (it is registered in
            __after_para_def_dict, whose handlers receive the line).
    Returns:
        nothing
    Logic:
        A paragraph definition was found: start collecting its tokens
        with a freshly reset attribute dictionary.
    """
    self.__state = 'collect_tokens'
    # not exactly right--have to reset the dictionary--give it default
    # values
    self.__reset_dict()
def __collect_tokens_func(self, line):
    """
    Required:
        line --line to parse
    Returns:
        nothing
    Logic:
        In order of precedence:
        1. a token listed in the collect_tokens table runs its handler;
        2. a line that is not a control word is written out and the
           state becomes after_para_def;
        3. a border control word is passed to the border parser;
        4. a tab control word runs its tab handler;
        5. any other control word known to the token dictionary is stored
           as an attribute of the paragraph definition.
    """
    handler = self.__collect_tokens_dict.get(self.__token_info)
    if handler:
        handler(line)
        return
    if line[0:2] != 'cw':
        self.__write_obj.write(line)
        self.__state = 'after_para_def'
        return
    if line[0:5] == 'cw<bd':
        self.__parse_border(line)
        return
    tab_handler = self.__tabs_dict.get(self.__token_info)
    if tab_handler:
        tab_handler(line)
        return
    attribute = self.__token_dict.get(line[6:16])
    if attribute:
        self.__att_val_dict[attribute] = line[20:-1]
def __tab_stop_func(self, line):
    """
    Append '<current tab type>:<position>;' to the 'tabs' attribute and
    reset the current tab type to 'left'.
    """
    self.__att_val_dict['tabs'] += '%s:%s;' % (self.__tab_type, line[20:-1])
    self.__tab_type = 'left'
def __tab_type_func(self, line):
    """
    Remember the tab type named by the current token. An unknown token is
    ignored, except above run level 3 where bug_handler is raised.
    """
    tab_type = self.__tab_type_dict.get(self.__token_info)
    if tab_type is None:
        if self.__run_level > 3:
            raise self.__bug_handler('no entry for %s\n' % self.__token_info)
        return
    self.__tab_type = tab_type
def __tab_leader_func(self, line):
    """
    Append '<leader name>^' to the 'tabs' attribute for the current token.
    An unknown token is ignored, except above run level 3 where
    bug_handler is raised.
    """
    leader = self.__tab_type_dict.get(self.__token_info)
    if leader is None:
        if self.__run_level > 3:
            raise self.__bug_handler('no entry for %s\n' % self.__token_info)
        return
    self.__att_val_dict['tabs'] += '%s^' % leader
def __tab_bar_func(self, line):
    """
    Append 'bar:<position>;' to the 'tabs' attribute and reset the current
    tab type to 'left'.
    """
    position = line[20:-1]
    self.__att_val_dict['tabs'] += 'bar:%s;' % (position)
    self.__tab_type = 'left'
def __parse_border(self, line):
    """
    Requires:
        line --line to parse
    Returns:
        nothing (updates dictionary)
    Logic:
        Ask the border parser for the attribute/value pairs of this border
        line and merge them into the paragraph-definition attributes.
    """
    self.__att_val_dict.update(self.__border_obj.parse_border(line))
def __para_def_in_para_def_func(self, line):
    """
    Requires:
        line --line to parse (unused)
    Returns:
        nothing
    Logic:
        A \\pard turned up while tokens were still being collected: stay
        in the collect_tokens state and start over with a reset dictionary.
    """
    # Change this
    self.__state = 'collect_tokens'
    self.__reset_dict()
def __end_para_def_func(self, line):
    """
    Requires:
        line -- the paragraph-start line
    Returns:
        Nothing
    Logic:
        Tokens were being collected and a paragraph starts: write the
        opening of the definition, then the line itself, and move to the
        'in_paragraphs' state.
    """
    self.__write_para_def_beg()
    self.__write_obj.write(line)
    self.__state = 'in_paragraphs'
def __start_para_after_def_func(self, line):
    """
    Requires:
        line -- the paragraph-start line
    Returns:
        Nothing
    Logic:
        The state is after_para_def and a paragraph starts: write the
        opening of the definition, then the line itself, and move to the
        'in_paragraphs' state. (Same steps as __end_para_def_func.)
    """
    self.__write_para_def_beg()
    self.__write_obj.write(line)
    self.__state = 'in_paragraphs'
def __after_para_def_func(self, line):
    """
    Requires:
        line -- line to parse
    Returns:
        nothing
    Logic:
        A new paragraph definition restarts token collection (the line is
        not written). Any other token listed in the after_para_def table
        runs its handler; everything else is copied through.
    """
    if self.__token_info == 'cw<pf<par-def___':
        self.__found_para_def_func()
        return
    handler = self.__after_para_def_dict.get(self.__token_info)
    if handler:
        handler(line)
    else:
        self.__write_obj.write(line)
def __in_paragraphs_func(self, line):
    """
    Requires:
        line --current line
    Returns:
        nothing
    Logic:
        Run the handler registered for the token in the in_paragraphs
        table (the end of a paragraph); copy every other line through.
    """
    handler = self.__in_paragraphs_dict.get(self.__token_info)
    if not handler:
        self.__write_obj.write(line)
        return
    handler(line)
def __found_para_end_func(self, line):
    """
    Requires:
        line -- line to print out
    Returns:
        Nothing
    Logic:
        The end of a paragraph was found while in paragraphs: switch to
        the 'after_para_end' state and write the line.
    """
    self.__state = 'after_para_end'
    self.__write_obj.write(line)
def __after_para_end_func(self, line):
    """
    Requires:
        line -- line to buffer
    Returns:
        nothing
    Logic:
        After the end of a paragraph nothing is written directly: every
        line is appended to the text string. When the token is listed in
        the after_para_end table, its handler then decides whether the
        paragraph definition is closed (new definition or larger block)
        or continues (another paragraph), and flushes the text string.
    """
    self.__text_string += line
    handler = self.__after_para_end_dict.get(self.__token_info)
    if handler:
        handler(line)
def __continue_block_func(self, line):
    """
    Requires:
        line -- unused; it is already part of the buffered text
    Returns:
        Nothing
    Logic:
        Another paragraph follows under the same definition, so nothing
        needs closing: flush the buffered text, clear the buffer and go
        back to the 'in_paragraphs' state.
    """
    buffered, self.__text_string = self.__text_string, ''
    self.__state = 'in_paragraphs'
    self.__write_obj.write(buffered)
# found a new paragraph definition after an end of a paragraph
def __new_para_def_func(self, line):
    """
    Requires:
        line -- unused
    Returns:
        Nothing
    Logic:
        A new paragraph definition follows the end of a paragraph: close
        the old definition (which also flushes the buffered text), then
        start collecting tokens for the new one.
    """
    self.__write_para_def_end_func()
    self.__found_para_def_func()
# after a paragraph and found reason to stop this block
def __stop_block_func(self, line):
    """
    Requires:
        line -- the token line (not used directly here)
    Returns:
        nothing
    Logic:
        The state is after a paragraph and a token for a block larger
        than a paragraph definition has been found. Write the closing
        tag of the open definition (that method also writes and resets
        the buffered text), then set the state to 'after_para_def'.
    """
    self.__write_para_def_end_func()
    self.__state = 'after_para_def'
def __write_para_def_end_func(self):
    """
    Requires:
        nothing
    Returns:
        nothing
    Logic:
        Write the end of the paragraph-definition tag, bracketed by the
        two end markers used for later parsing, followed by the buffered
        text, which is then cleared. If the current attribute dictionary
        contains 'font-style' and/or 'caps', also write the matching
        end-of-font / end-of-caps marker, in that order.
    """
    out = self.__write_obj
    out.write(self.__end2_marker)
    out.write('mi<tg<close_____<paragraph-definition\n')
    out.write(self.__end_marker)
    out.write(self.__text_string)
    self.__text_string = ''
    closing_markers = (
        ('font-style', 'mi<mk<font-end__\n'),
        ('caps', 'mi<mk<caps-end__\n'),
    )
    for att, marker in closing_markers:
        if att in self.__att_val_dict:
            out.write(marker)
def __get_num_of_style(self):
    """
    Requires:
        nothing
    Returns:
        nothing
    Logic:
        Give the current attribute dictionary a style number. A
        signature string is built from every attribute, in sorted key
        order, except 'style-num', 'nest-level' and 'in-table' (these do
        not say whether a style is unique). The signature's 1-based
        position in self.__style_num_strings is the style number; an
        unseen signature is appended first. The number is stored as
        's' plus four zero-padded digits under 'style-num', and for a
        new signature the body style line is recorded as well.
    """
    skipped = ('style-num', 'nest-level', 'in-table')
    signature = ''.join(
        '%s:%s' % (att, self.__att_val_dict[att])
        for att in sorted(self.__att_val_dict)
        if att not in skipped
    )
    known = self.__style_num_strings
    is_new = signature not in known
    if is_new:
        known.append(signature)
        position = len(known)
    else:
        # list positions start at zero, style numbers at one
        position = known.index(signature) + 1
    self.__att_val_dict['style-num'] = 's' + unicode_type('%04d' % position)
    if is_new:
        self.__write_body_styles()
def __write_body_styles(self):
    """
    Requires:
        nothing
    Returns:
        nothing
    Logic:
        Build one 'paragraph-style-in-body' empty tag describing the
        current attribute dictionary and append it to
        self.__body_style_strings. The name and style number come first,
        then the 'tabs' value if it is not empty, then every remaining
        attribute in sorted key order. 'in-table' and the individual
        tabs-* entries are left out.
    """
    atts = self.__att_val_dict
    pieces = [
        'mi<tg<empty-att_<paragraph-style-in-body',
        '<name>%s' % atts['name'],
        '<style-number>%s' % atts['style-num'],
    ]
    tab_stops = atts['tabs']
    if tab_stops != '':
        pieces.append('<%s>%s' % ('tabs', tab_stops))
    skipped = frozenset((
        'name', 'style-num', 'in-table',
        'tabs-left', 'tabs-right', 'tabs-decimal', 'tabs-center',
        'tabs-bar', 'tabs',
    ))
    pieces.extend(
        '<%s>%s' % (att, atts[att]) for att in sorted(atts)
        if att not in skipped
    )
    pieces.append('\n')
    self.__body_style_strings.append(''.join(pieces))
def __write_para_def_beg(self):
    """
    Requires:
        nothing
    Returns:
        nothing
    Logic:
        Write the start of a paragraph definition. The style number is
        assigned first. Then, in order: a marker saying whether the
        paragraph is in a table, a left-indent marker (only if there is
        an indent), a list-id or no-list marker, a style-name marker,
        the outer start marker, and the open tag itself. The tag carries
        the name, the style number, the 'tabs' value if not empty, and
        every other attribute in sorted key order ('in-table' and the
        individual tabs-* entries are left out). After the tag come the
        inner start marker and, when present, the font and caps markers.
    """
    self.__get_num_of_style()
    atts = self.__att_val_dict
    emit = self.__write_obj.write

    # markers that describe the context of the paragraph
    if atts.get('in-table'):
        emit('mi<mk<in-table__\n')
    else:
        emit('mi<mk<not-in-tbl\n')
    indent = atts.get('left-indent')
    if indent:
        emit('mi<mk<left_inden<%s\n' % indent)
    list_id = atts.get('list-id')
    if list_id:
        emit('mi<mk<list-id___<%s\n' % list_id)
    else:
        emit('mi<mk<no-list___\n')
    emit('mi<mk<style-name<%s\n' % atts['name'])

    # the open tag, all on one output line
    emit(self.__start_marker)
    emit('mi<tg<open-att__<paragraph-definition')
    emit('<name>%s' % atts['name'])
    emit('<style-number>%s' % atts['style-num'])
    if atts['tabs'] != '':
        emit('<%s>%s' % ('tabs', atts['tabs']))
    skipped = frozenset((
        'name', 'style-num', 'in-table',
        'tabs-left', 'tabs-right', 'tabs-decimal', 'tabs-center',
        'tabs-bar', 'tabs',
    ))
    for att in sorted(atts):
        if att not in skipped:
            emit('<%s>%s' % (att, atts[att]))
    emit('\n')

    # markers that follow the tag
    emit(self.__start2_marker)
    if 'font-style' in atts:
        emit('mi<mk<font______<%s\n' % atts['font-style'])
    if 'caps' in atts:
        emit('mi<mk<caps______<%s\n' % atts['caps'])
def __empty_table_element_func(self, line):
    """
    Requires:
        line -- the token line to pass through
    Returns:
        nothing
    Logic:
        Write an in-table marker followed by the line itself, then set
        the state to 'after_para_def'.
    """
    out = self.__write_obj
    out.write('mi<mk<in-table__\n')
    out.write(line)
    self.__state = 'after_para_def'
def __reset_dict(self):
    """
    Requires:
        nothing
    Returns:
        nothing
    Logic:
        Bring the attribute dictionary back to its defaults; this has to
        happen every time a new paragraph definition is found. The same
        dictionary object is emptied and refilled: the name is 'Normal',
        the font is the default font, and every tab entry is an empty
        string. The current tab type goes back to 'left'.
    """
    atts = self.__att_val_dict
    atts.clear()
    atts['name'] = 'Normal'
    atts['font-style'] = self.__default_font
    self.__tab_type = 'left'
    for tab_att in ('tabs-left', 'tabs-right', 'tabs-center',
                    'tabs-decimal', 'tabs-bar', 'tabs'):
        atts[tab_att] = ''
def make_paragraph_def(self):
    """
    Requires:
        nothing
    Returns:
        self.__body_style_strings -- the list of body style lines
        collected while processing (the original file is replaced by
        the processed one as a side effect)
    Logic:
        Read the input one line at a time and hand each line to the
        handler registered for the current state. As before, the empty
        string that readline() returns at end of file is dispatched once
        as well, before the loop stops. Both files are opened with
        'with' so they are closed even when a handler raises. When
        finished, optionally keep a debug copy, move the temporary
        output over the input file and delete the temporary file.
    """
    self.__initiate_values()
    with open_for_read(self.__file) as read_obj:
        with open_for_write(self.__write_to) as self.__write_obj:
            while True:
                line = read_obj.readline()
                self.__token_info = line[:16]
                action = self.__state_dict.get(self.__state)
                if action is None:
                    # Report the unknown state; the call below then
                    # fails, exactly as it did before.
                    sys.stderr.write(
                        'no matching state in module paragraph_def.py\n')
                    sys.stderr.write(self.__state + '\n')
                action(line)
                if not line:
                    # end of file (already dispatched above)
                    break
    copy_obj = copy.Copy(bug_handler=self.__bug_handler)
    if self.__copy:
        copy_obj.copy_file(self.__write_to, "paragraphs_def.data")
    copy_obj.rename(self.__write_to, self.__file)
    os.remove(self.__write_to)
    return self.__body_style_strings

View File

@@ -0,0 +1,263 @@
from __future__ import unicode_literals, absolute_import, print_function, division
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# #
#########################################################################
import sys, os
from calibre.ebooks.rtf2xml import copy
from calibre.ptempfile import better_mktemp
from . import open_for_read, open_for_write
class Paragraphs:
    """
    =================
    Purpose
    =================
    Add paragraph tags to a tokenized file. (This class is of little use
    on its own; it is one step among the other rtf2xml modules.)
    -------------
    Method
    -------------
    RTF does not say where a paragraph begins, only where it ends.
    The parser therefore starts in the body of the document assuming it
    is not in a paragraph and looks for clues that one begins: text, an
    inline field, list text or a picture. An end of paragraph marker
    (\\par) found outside a paragraph means a blank paragraph.
    Once a paragraph has begun, the state is 'paragraph' and the parser
    looks for clues that it ends: the end of paragraph marker (\\par),
    the end of a footnote or heading, the end of a field-block, a cell,
    or the start or end of a section or the end of the body. A
    paragraph definition found inside a paragraph is only flagged.
    """

    def __init__(self,
            in_file,
            bug_handler,
            copy=None,
            write_empty_para=1,
            run_level=1,
            ):
        """
        Required:
            'in_file' -- file to parse
            'bug_handler' -- passed on to the copy helper
        Optional:
            'copy' -- whether to make a copy of the result for debugging
            'write_empty_para' -- whether to write tags for blank
            paragraphs
            'run_level' -- stored for later use
        Returns:
            nothing
        """
        self.__write_to = better_mktemp()
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__write_empty_para = write_empty_para
        self.__run_level = run_level

    def __initiate_values(self):
        """
        Set the starting state, the four markers and the lookup tables
        that map states and tokens to handlers.
        """
        self.__state = 'before_body'
        # the outer markers surround the para tags, the inner ones sit
        # just inside them
        self.__start_marker = 'mi<mk<para-start\n'
        self.__start2_marker = 'mi<mk<par-start_\n'
        self.__end2_marker = 'mi<mk<par-end___\n'
        self.__end_marker = 'mi<mk<para-end__\n'
        self.__state_dict = {
            'before_body': self.__before_body_func,
            'not_paragraph': self.__not_paragraph_func,
            'paragraph': self.__paragraph_func,
        }
        # tokens that close an open paragraph
        closing_tokens = (
            'cw<pf<par-end___',  # end of paragraph
            'mi<mk<headi_-end',  # end of header or footer
            'mi<mk<fldbk-end_',  # end of field-block
            'mi<mk<body-close',  # end of body
            'mi<mk<sect-close',  # end of section
            'mi<mk<sect-start',  # start of section
            'mi<mk<foot___clo',  # end of footnote
            'cw<tb<cell______',  # end of cell
            'mi<mk<par-in-fld',  # start of block field
        )
        self.__paragraph_dict = dict.fromkeys(
            closing_tokens, self.__close_para_func)
        # a paragraph definition inside a paragraph is only flagged
        self.__paragraph_dict['cw<pf<par-def___'] = \
            self.__bogus_para__def_func
        # tokens that open a paragraph
        opening_tokens = (
            'tx<nu<__________',
            'tx<hx<__________',
            'tx<ut<__________',
            'tx<mc<__________',
            'mi<mk<inline-fld',
            'mi<mk<para-beg__',
            'mi<mk<pict-start',
        )
        self.__not_paragraph_dict = dict.fromkeys(
            opening_tokens, self.__start_para_func)
        self.__not_paragraph_dict['cw<pf<par-end___'] = \
            self.__empty_para_func
        self.__not_paragraph_dict['cw<pf<page-break'] = \
            self.__empty_pgbk_func  # page break

    def __before_body_func(self, line):
        """
        Required:
            line -- line to parse
        Returns:
            nothing
        Logic:
            Every line before the body is copied through unchanged. The
            body-open marker is copied too, and switches the state to
            'not_paragraph'.
        """
        if self.__token_info == 'mi<mk<body-open_':
            self.__state = 'not_paragraph'
        self.__write_obj.write(line)

    def __not_paragraph_func(self, line):
        """
        Required:
            line -- line to parse
        Returns:
            nothing
        Logic:
            Handles lines outside a paragraph. If the token has a
            handler in the not-paragraph table, that handler runs first
            (it writes tags and may switch the state). The line itself
            is always written afterwards.
        """
        handler = self.__not_paragraph_dict.get(self.__token_info)
        if handler:
            handler(line)
        self.__write_obj.write(line)

    def __paragraph_func(self, line):
        """
        Required:
            line -- line to parse
        Returns:
            nothing
        Logic:
            Handles lines inside a paragraph. If the token has a handler
            in the paragraph table, the handler decides what is written;
            otherwise the line is copied through.
        """
        handler = self.__paragraph_dict.get(self.__token_info)
        if not handler:
            self.__write_obj.write(line)
            return
        handler(line)

    def __start_para_func(self, line):
        """
        Requires:
            line -- line to parse (not written here)
        Returns:
            nothing
        Logic:
            Write the outer start marker, the open tag and the inner
            start marker, then change the state to 'paragraph'.
        """
        self.__write_obj.write(
            self.__start_marker +
            'mi<tg<open______<para\n' +
            self.__start2_marker
        )
        self.__state = 'paragraph'

    def __empty_para_func(self, line):
        """
        Requires:
            line -- line to parse (not written here)
        Returns:
            nothing
        Logic:
            Write the tags for an empty paragraph, between the two outer
            markers. Nothing is written if self.__write_empty_para is
            false.
        """
        if not self.__write_empty_para:
            return
        self.__write_obj.write(
            self.__start_marker +
            'mi<tg<empty_____<para\n' +
            self.__end_marker
        )

    def __empty_pgbk_func(self, line):
        """
        Requires:
            line -- line to parse (not written here)
        Returns:
            nothing
        Logic:
            Write the empty tag for a page break.
        """
        self.__write_obj.write('mi<tg<empty_____<page-break\n')

    def __close_para_func(self, line):
        """
        Requires:
            line -- line to parse
        Returns:
            nothing
        Logic:
            Write the inner end marker, the close tag, the outer end
            marker and then the line that caused the close. Change the
            state to 'not_paragraph'.
        """
        self.__write_obj.write(
            self.__end2_marker +
            'mi<tg<close_____<para\n' +
            self.__end_marker +
            line
        )
        self.__state = 'not_paragraph'

    def __bogus_para__def_func(self, line):
        """
        Requires:
            line -- line to parse (dropped)
        Returns:
            nothing
        Logic:
            A \\pard inside a paragraph is ignored: a marker is written
            in place of the line.
        """
        self.__write_obj.write('mi<mk<bogus-pard\n')

    def make_paragraphs(self):
        """
        Requires:
            nothing
        Returns:
            nothing (changes the original file)
        Logic:
            Read one line at a time and pass it to the handler for the
            current state. The state starts as 'before_body', becomes
            'not_paragraph' once the body is found, and after that
            alternates with 'paragraph'. When done, optionally keep a
            debug copy, move the temporary output over the input file
            and delete the temporary file.
        """
        self.__initiate_values()
        with open_for_read(self.__file) as read_obj, \
                open_for_write(self.__write_to) as self.__write_obj:
            for line in read_obj:
                self.__token_info = line[:16]
                handler = self.__state_dict.get(self.__state)
                if handler is None:
                    # reporting is best effort, as in the original
                    try:
                        sys.stderr.write(
                            'no matching state in module paragraphs.py\n')
                        sys.stderr.write(self.__state + '\n')
                    except:
                        pass
                handler(line)
        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "paragraphs.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)

View File

@@ -0,0 +1,182 @@
from __future__ import absolute_import, division, print_function, unicode_literals
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# #
#########################################################################
import sys, os
from calibre.ebooks.rtf2xml import copy
from calibre.ptempfile import better_mktemp
from polyglot.builtins import unicode_type
from . import open_for_read, open_for_write
class Pict:
    """
    Process graphic information.

    Picture groups are taken out of the tokenized file and written as
    plain RTF to a 'picts.rtf' file in a '<name>_rtf_pict_dir' directory.
    In the tokenized file each picture is replaced by a numbered empty
    'pict' tag between two markers.
    """

    def __init__(self,
            in_file,
            bug_handler,
            out_file,
            copy=None,
            orig_file=None,
            run_level=1,
            ):
        """
        Required:
            'in_file' -- tokenized file to process (replaced in place)
            'bug_handler' -- exception class raised on fatal errors and
            passed on to the copy helper
            'out_file' -- final output file (path or object with a
            'name'); its directory holds the picture directory. May be
            empty, in which case the directory of orig_file is used.
        Optional:
            'copy' -- whether to keep debug copies of the results
            'orig_file' -- the original RTF file (path or object with a
            'name'); its base name names the picture directory
            'run_level' -- above 1, progress is reported on stderr
        Returns:
            nothing
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__run_level = run_level
        self.__write_to = better_mktemp()
        self.__bracket_count = 0
        self.__ob_count = 0
        self.__cb_count = 0
        self.__pict_count = 0
        self.__in_pict = False
        self.__already_found_pict = False
        self.__orig_file = orig_file
        self.__initiate_pict_dict()
        self.__out_file = out_file

    def __initiate_pict_dict(self):
        """Map the tokens copied into the picture file to the methods
        that turn them back into RTF text."""
        self.__pict_dict = {
            'ob<nu<open-brack' : self.__open_br_func,
            'cb<nu<clos-brack' : self.__close_br_func,
            'tx<nu<__________' : self.__text_func,
        }

    def __open_br_func(self, line):
        """Return the RTF text for an open bracket token."""
        return "{\n"

    def __close_br_func(self, line):
        """Return the RTF text for a close bracket token."""
        return "}\n"

    def __text_func(self, line):
        """Return the text carried by a text token."""
        # tx<nu<__________<true text
        return line[17:]

    def __make_dir(self):
        """
        Make a directory to put the image data in.

        The directory is '<base name of orig_file>_rtf_pict_dir/' next
        to the output file or, if there is no output file, next to the
        original file. If it already exists, the files in it are removed
        (files that cannot be removed are skipped).

        Raises:
            the bug handler, with a message naming the directory and
            the underlying error, if the directory cannot be created
        """
        orig_name = getattr(self.__orig_file, 'name', self.__orig_file)
        base_name = os.path.splitext(os.path.basename(orig_name))[0]
        if self.__out_file:
            dir_name = os.path.dirname(getattr(self.__out_file, 'name',
                self.__out_file))
        else:
            # orig_file may be a file-like object here as well
            dir_name = os.path.dirname(orig_name)
        self.__dir_name = base_name + "_rtf_pict_dir/"
        self.__dir_name = os.path.join(dir_name, self.__dir_name)
        if not os.path.isdir(self.__dir_name):
            try:
                os.mkdir(self.__dir_name)
            except OSError as err:
                # Hand the message to the handler; it used to be built
                # and then thrown away.
                msg = "Couldn't make directory '%s':\n%s" % (
                    self.__dir_name, err)
                raise self.__bug_handler(msg)
        else:
            if self.__run_level > 1:
                sys.stderr.write('Removing files from old pict directory...\n')
            for the_file in os.listdir(self.__dir_name):
                the_file = os.path.join(self.__dir_name, the_file)
                try:
                    os.remove(the_file)
                except OSError:
                    # best effort: leave what cannot be removed
                    pass
            if self.__run_level > 1:
                sys.stderr.write('Files removed.\n')

    def __create_pict_file(self):
        """Create a file for all the pict data to be written to.
        """
        self.__pict_file = os.path.join(self.__dir_name, 'picts.rtf')
        self.__write_pic_obj = open_for_write(self.__pict_file, append=True)

    def __in_pict_func(self, line):
        """
        Handle one line while inside a picture group.

        Returns True when the line must go to the main output: the
        close bracket count has reached the count recorded when the
        picture started, so the picture is over and its group is closed
        in the picture file. Otherwise brackets and text are copied to
        the picture file, other tokens are dropped, and False is
        returned.
        """
        if self.__cb_count == self.__pict_br_count:
            self.__in_pict = False
            self.__write_pic_obj.write("}\n")
            return True
        action = self.__pict_dict.get(self.__token_info)
        if action:
            self.__write_pic_obj.write(action(line))
        return False

    def __default(self, line, write_obj):
        """
        Determine if the token marks the beginning of pict data.

        If it does, write the numbered pict tag and its markers to
        write_obj, create the picture file (with its RTF header) if that
        has not been done yet, open a pict group in it, set the
        self.__in_pict flag and return False so the line is not copied.
        If the line does not start pict data, return True.
        """
        if self.__token_info != 'cw<gr<picture___':
            return True
        self.__pict_count += 1
        write_obj.write('mi<mk<pict-start\n')
        write_obj.write('mi<tg<empty-att_<pict<num>%03d\n' % self.__pict_count)
        write_obj.write('mi<mk<pict-end__\n')
        if not self.__already_found_pict:
            self.__create_pict_file()
            self.__already_found_pict = True
            self.__print_rtf_header()
        self.__in_pict = True
        # the picture ends at the close bracket with this count
        self.__pict_br_count = self.__ob_count
        self.__cb_count = 0
        self.__write_pic_obj.write("{\\pict\n")
        return False

    def __print_rtf_header(self):
        """Print to pict file the necessary RTF data for the file to be
        recognized as an RTF file.
        """
        self.__write_pic_obj.write("{\\rtf1 \n{\\fonttbl\\f0\\null;} \n")
        self.__write_pic_obj.write("{\\colortbl\\red255\\green255\\blue255;} \n\\pard \n")

    def process_pict(self):
        """
        Move all picture data out of the tokenized file.

        Reads the input line by line, tracking the bracket counts
        carried by the bracket tokens. Lines inside picture groups go to
        the picture file, everything else to a temporary file that then
        replaces the input. If no picture was found, the picture
        directory is removed again (if that is possible).
        """
        self.__make_dir()
        try:
            with open_for_read(self.__file) as read_obj:
                with open_for_write(self.__write_to) as write_obj:
                    for line in read_obj:
                        self.__token_info = line[:16]
                        if self.__token_info == 'ob<nu<open-brack':
                            self.__ob_count = line[-5:-1]
                        if self.__token_info == 'cb<nu<clos-brack':
                            self.__cb_count = line[-5:-1]
                        if self.__in_pict:
                            to_print = self.__in_pict_func(line)
                        else:
                            to_print = self.__default(line, write_obj)
                        if to_print:
                            write_obj.write(line)
                    if self.__already_found_pict:
                        # close the group opened by the RTF header
                        self.__write_pic_obj.write("}\n")
        finally:
            # close the picture file even if processing failed
            if self.__already_found_pict:
                self.__write_pic_obj.close()
        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "pict.data")
            try:
                copy_obj.copy_file(self.__pict_file, "pict.rtf")
            except Exception:
                # Best effort, as before: there is no picture file when
                # the document has no pictures.
                pass
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)
        if self.__pict_count == 0:
            try:
                os.rmdir(self.__dir_name)
            except OSError:
                pass

View File

@@ -0,0 +1,591 @@
from __future__ import unicode_literals, absolute_import, print_function, division
from __future__ import print_function
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# #
#########################################################################
import sys, os
from calibre.ebooks.rtf2xml import copy, override_table, list_table
from calibre.ptempfile import better_mktemp
from . import open_for_read, open_for_write
class PreambleDiv:
"""
Break the preamble into divisions.
"""
def __init__(self, in_file,
        bug_handler,
        copy=None,
        no_namespace=None,
        run_level=1,
        ):
    """
    Required:
        'in_file' -- the file to process
        'bug_handler' -- stored and handed to the helper objects
    Optional:
        'copy' -- whether to make a copy of result for debugging
        'no_namespace' -- stored for later use
        'run_level' -- stored and handed to the helper objects
    Returns:
        nothing
    """
    # results are written to a temporary file
    self.__write_to = better_mktemp()
    self.__file = in_file
    self.__bug_handler = bug_handler
    self.__copy = copy
    self.__no_namespace = no_namespace
    self.__run_level = run_level
def __initiate_values(self):
    """
    Set the starting values: the state, the bracket counters, the
    strings each preamble table is collected in, the default page
    measurements, and the tables that map states and tokens to their
    handlers.
    """
    # state and counters
    self.__state = 'preamble'
    self.__cb_count = ''
    self.__ob_count = ''
    self.__close_group_count = ''
    self.__ob_group = 0  # depth of group
    self.__found_font_table = 0
    self.__individual_font = 0
    self.__old_font = 0

    # collected pieces of the preamble
    self.__rtf_final = ''
    self.__font_table_final = 0
    self.__color_table_final = ''
    self.__style_sheet_final = ''
    self.__list_table_final = ''
    self.__override_table_final = ''
    self.__revision_table_final = ''
    self.__doc_info_table_final = ''
    self.__all_lists = {}
    self.__section = {}
    self.__page = {
        'margin-top': 72,
        'margin-bottom': 72,
        'margin-left': 90,
        'margin-right': 90,
        'gutter': 0,
    }

    # token abbreviations and the names they are stored under
    self.__margin_dict = {
        'margin-lef': 'margin-left',
        'margin-rig': 'margin-right',
        'margin-top': 'margin-top',
        'margin-bot': 'margin-bottom',
        'gutter____': 'gutter',
        'paper-widt': 'paper-width',
        'paper-hght': 'paper-height',
    }
    self.__translate_sec = {
        'columns___': 'column',
    }

    # handlers by state name ...
    self.__state_dict = {
        'default': self.__default_func,
        'rtf_header': self.__rtf_head_func,
        'preamble': self.__preamble_func,
        'font_table': self.__font_table_func,
        'color_table': self.__color_table_func,
        'style_sheet': self.__style_sheet_func,
        'list_table': self.__list_table_func,
        'override_table': self.__override_table_func,
        'revision_table': self.__revision_table_func,
        'doc_info': self.__doc_info_func,
        'body': self.__body_func,
        'ignore': self.__ignore_func,
    }
    # ... and, in the same table, by token
    self.__state_dict.update({
        'cw<ri<rtf_______': self.__found_rtf_head_func,
        'cw<pf<par-def___': self.__para_def_func,
        'tx<nu<__________': self.__text_func,
        'cw<tb<row-def___': self.__row_def_func,
        'cw<sc<section___': self.__new_section_func,
        'cw<sc<sect-defin': self.__new_section_func,
        'cw<it<font-table': self.__found_font_table_func,
        'cw<it<colr-table': self.__found_color_table_func,
        'cw<ss<style-shet': self.__found_style_sheet_func,
        'cw<it<listtable_': self.__found_list_table_func,
        'cw<it<lovr-table': self.__found_override_table_func,
        'cw<it<revi-table': self.__found_revision_table_func,
        'cw<di<doc-info__': self.__found_doc_info_func,
    })
    # every page token ('cw<pa<' + abbreviation) has the same handler
    self.__state_dict.update(dict.fromkeys(
        ['cw<pa<' + abbrev for abbrev in self.__margin_dict],
        self.__margin_func,
    ))

    self.__list_table_obj = list_table.ListTable(
        run_level=self.__run_level,
        bug_handler=self.__bug_handler,
    )
def __ignore_func(self, line):
    """
    Skip lines while a group is being ignored. Nothing is stored or
    written. Once the close bracket count equals the number recorded for
    the ignored group, the state saved in self.__previous_state is
    restored.
    """
    group_closed = self.__cb_count == self.__ignore_num
    if group_closed:
        self.__state = self.__previous_state
def __found_rtf_head_func(self, line):
    """
    The token that starts the RTF header was found: switch the state to
    'rtf_header'. The line itself is not stored.
    """
    self.__state = 'rtf_header'
def __rtf_head_func(self, line):
    """
    Collect the lines of the RTF header.

    Lines are added to self.__rtf_final until the header ends. It ends
    normally when the open bracket count is '0002'; the collected text
    is wrapped in the header markers and the state goes back to
    'preamble'. If text or a paragraph definition turns up first, there
    is no preamble: the header is wrapped, a default font table is made,
    the preamble is written, the line is written, and the state becomes
    'body'.
    """
    starts_body = self.__token_info in (
        'tx<nu<__________', 'cw<pf<par-def___')
    if self.__ob_count != '0002' and not starts_body:
        # still inside the header
        self.__rtf_final = self.__rtf_final + line
        return
    wrapped = 'mi<mk<rtfhed-beg\n' + self.__rtf_final + 'mi<mk<rtfhed-end\n'
    if self.__ob_count == '0002':
        self.__rtf_final = wrapped
        self.__state = 'preamble'
        return
    self.__state = 'body'
    self.__rtf_final = wrapped
    self.__make_default_font_table()
    self.__write_preamble()
    self.__write_obj.write(line)
def __make_default_font_table(self):
    """
    If no font table is found, one has to be written out. Store a
    default table, with the single font 'Times' as font 0, in
    self.__font_table_final.
    """
    self.__font_table_final = ''.join((
        'mi<tg<open______<font-table\n',
        'mi<mk<fonttb-beg\n',
        'mi<mk<fontit-beg\n',
        'cw<ci<font-style<nu<0\n',
        'tx<nu<__________<Times;\n',
        'mi<mk<fontit-end\n',
        'mi<mk<fonttb-end\n',
        'mi<tg<close_____<font-table\n',
    ))
def __make_default_color_table(self):
    """
    If no color table is found, store a default one, with a single
    entry whose red, green and blue values are all 00, in
    self.__color_table_final.
    """
    self.__color_table_final = ''.join((
        'mi<tg<open______<color-table\n',
        'mi<mk<clrtbl-beg\n',
        'cw<ci<red_______<nu<00\n',
        'cw<ci<green_____<nu<00\n',
        'cw<ci<blue______<en<00\n',
        'mi<mk<clrtbl-end\n',
        'mi<tg<close_____<color-table\n',
    ))
def __make_default_style_table(self):
    """
    If no style table is found, store a default one in
    self.__style_sheet_final. It has two entries: the paragraph style
    'Normal' and the character style 'Default Paragraph Font'.
    """
    default_lines = (
        'mi<tg<open______<style-table',
        'mi<mk<styles-beg',
        'mi<mk<stylei-beg',
        'cw<ci<font-style<nu<0',
        'tx<nu<__________<Normal;',
        'mi<mk<stylei-end',
        'mi<mk<stylei-beg',
        'cw<ss<char-style<nu<0',
        'tx<nu<__________<Default Paragraph Font;',
        'mi<mk<stylei-end',
        'mi<mk<styles-end',
        'mi<tg<close_____<style-table',
    )
    # every line, the last one included, ends with a newline
    self.__style_sheet_final = ''.join(
        entry + '\n' for entry in default_lines)
def __found_font_table_func(self, line):
    """
    Handle the token that starts a font table.

    The first font table is collected: the state becomes 'font_table',
    the collecting string is emptied, the current open bracket count is
    recorded as the count that closes the table, and the close bracket
    count is reset.

    Any further font table is ignored as a whole. The ignore state needs
    to know which close bracket ends the group and which state to return
    to, so both are recorded here. They used to be left unset, so
    __ignore_func worked with values left over from an earlier ignored
    group, or failed if there had been none.
    """
    if self.__found_font_table:
        # duplicate table: skip the group, then resume where we were
        self.__previous_state = self.__state
        self.__ignore_num = self.__ob_count
        # reset as in the other branch, so a count left over from an
        # earlier group cannot end the ignore at once
        self.__cb_count = 0
        self.__state = 'ignore'
    else:
        self.__state = 'font_table'
        self.__font_table_final = ''
        self.__close_group_count = self.__ob_count
        self.__cb_count = 0
        self.__found_font_table = 1
def __font_table_func(self, line):
"""
Handle one line while the state is 'font_table', collecting the font
table in self.__font_table_final.
NOTE(review): indentation is not visible in this listing, so the
nesting described in the comments below is the presumed one; confirm
it against the original file.
The branches are tried in this order: end of the table, open bracket,
close bracket, a line inside an individual font, and the font-style
token that starts a font in older RTF, for example
cw<ci<font-style<nu<0
"""
# End of the table: the close bracket count equals the count recorded
# when the table was found. Wrap the collected text in the font-table
# tag and its markers and go back to the preamble.
if self.__cb_count == self.__close_group_count:
self.__state = 'preamble'
self.__font_table_final = 'mi<tg<open______<font-table\n' + \
'mi<mk<fonttb-beg\n' + self.__font_table_final
self.__font_table_final += \
'mi<mk<fonttb-end\n' + 'mi<tg<close_____<font-table\n'
# An open bracket exactly one level inside the table starts an
# individual font; any other open bracket starts a group to ignore.
elif self.__token_info == 'ob<nu<open-brack':
if int(self.__ob_count) == int(self.__close_group_count) + 1:
self.__font_table_final += \
'mi<mk<fontit-beg\n'
self.__individual_font = 1
else:
# ignore
self.__previous_state = 'font_table'
self.__state = 'ignore'
self.__ignore_num = self.__ob_count
# A close bracket one level inside the table ends an individual font.
elif self.__token_info == 'cb<nu<clos-brack':
if int(self.__cb_count) == int(self.__close_group_count) + 1:
self.__individual_font = 0
self.__font_table_final += \
'mi<mk<fontit-end\n'
# Inside an individual font. For old-style fonts (self.__old_font set
# by the last branch) a text line containing ';' ends the font.
# Presumably the final 'else' pairs with the old-font test, so that
# all other lines are collected -- TODO confirm.
elif self.__individual_font:
if self.__old_font and self.__token_info == 'tx<nu<__________':
if ';' in line:
self.__font_table_final += line
self.__font_table_final += 'mi<mk<fontit-end\n'
self.__individual_font = 0
else:
self.__font_table_final += line
# Older RTF does not put each font in its own group; a font-style
# token outside any individual font starts one.
elif self.__token_info == 'cw<ci<font-style':
self.__old_font = 1
self.__individual_font = 1
self.__font_table_final += 'mi<mk<fontit-beg\n'
self.__font_table_final += line
def __old_font_func(self, line):
"""
Required:
line --line to parse
Returns:
nothing
Logic:
This method has no code; its body is only this docstring, so calling
it does nothing.
It describes the older form of RTF font table, in which the fonts are
not each enclosed in their own brackets:
\\f3\\fswiss\\fcharset77 Helvetica-Oblique;\\f4\\fnil\\fcharset77 Geneva;}
"""
def __found_color_table_func(self, line):
    """
    Handle the token that starts the color table.

    Like the other "found" methods for tables, this sets the state,
    empties the string the table is collected in, records the current
    open bracket count as the count that will close the group, and
    resets the close bracket count to zero.
    """
    self.__close_group_count = self.__ob_count
    self.__cb_count = 0
    self.__color_table_final = ''
    self.__state = 'color_table'
def __color_table_func(self, line):
    """
    Collect the lines of the color table.

    Each line is added to self.__color_table_final until the close
    bracket count equals the count recorded when the table was found
    (compared as numbers). At that point the collected text is wrapped
    in the color-table tag and its markers and the state goes back to
    'preamble'; the closing line itself is not stored.
    """
    if int(self.__cb_count) != int(self.__close_group_count):
        self.__color_table_final += line
        return
    self.__state = 'preamble'
    self.__color_table_final = ''.join((
        'mi<tg<open______<color-table\n',
        'mi<mk<clrtbl-beg\n',
        self.__color_table_final,
        'mi<mk<clrtbl-end\n',
        'mi<tg<close_____<color-table\n',
    ))
def __found_style_sheet_func(self, line):
    """
    Handle the token that starts the style sheet: set the state to
    'style_sheet', empty the string the sheet is collected in, record
    the current open bracket count as the count that closes the group,
    and reset the close bracket count to zero.
    """
    self.__close_group_count = self.__ob_count
    self.__cb_count = 0
    self.__style_sheet_final = ''
    self.__state = 'style_sheet'
def __style_sheet_func(self, line):
    """
    Collect the lines of the style sheet.

    When the close bracket count equals the count recorded when the
    sheet was found, the collected text is wrapped in the style-table
    tag and its markers and the state goes back to 'preamble'. Before
    that, an open or close bracket exactly one level inside the sheet is
    replaced by the marker for the start or end of an individual style;
    brackets at any other level are dropped. All other lines are
    collected as they are.
    """
    if self.__cb_count == self.__close_group_count:
        self.__state = 'preamble'
        self.__style_sheet_final = ''.join((
            'mi<tg<open______<style-table\n',
            'mi<mk<styles-beg\n',
            self.__style_sheet_final,
            'mi<mk<styles-end\n',
            'mi<tg<close_____<style-table\n',
        ))
        return
    token = self.__token_info
    if token == 'ob<nu<open-brack':
        if int(self.__ob_count) == int(self.__close_group_count) + 1:
            self.__style_sheet_final += 'mi<mk<stylei-beg\n'
        return
    if token == 'cb<nu<clos-brack':
        if int(self.__cb_count) == int(self.__close_group_count) + 1:
            self.__style_sheet_final += 'mi<mk<stylei-end\n'
        return
    self.__style_sheet_final += line
def __found_list_table_func(self, line):
    """
    Handle the token that starts the list table: set the state to
    'list_table', empty the string the table is collected in, record the
    current open bracket count as the count that closes the group, and
    reset the close bracket count to zero.
    """
    self.__close_group_count = self.__ob_count
    self.__cb_count = 0
    self.__list_table_final = ''
    self.__state = 'list_table'
def __list_table_func(self, line):
    """
    Collect the lines of the list table.

    Lines with a token are added to self.__list_table_final; lines with
    an empty token are skipped. When the close bracket count equals the
    count recorded when the table was found, the collected text is
    handed to the list table parser, whose two results replace the
    collected text and self.__all_lists, and the state goes back to
    'preamble'.
    """
    if self.__cb_count == self.__close_group_count:
        self.__state = 'preamble'
        parsed = self.__list_table_obj.parse_list_table(
            self.__list_table_final)
        self.__list_table_final, self.__all_lists = parsed
    elif self.__token_info != '':
        self.__list_table_final += line
def __found_override_table_func(self, line):
    """
    Handle the token that starts the list override table.

    A new override table parser is created first; it is given the run
    level and the lists known so far (self.__all_lists). Then, as for
    the other tables, the state is set, the collecting string is
    emptied, the current open bracket count is recorded as the count
    that closes the group, and the close bracket count is reset to zero.
    """
    parser = override_table.OverrideTable(
        run_level=self.__run_level,
        list_of_lists=self.__all_lists,
    )
    self.__override_table_obj = parser
    self.__state = 'override_table'
    self.__override_table_final = ''
    self.__close_group_count = self.__ob_count
    self.__cb_count = 0
# cw<it<lovr-table
def __override_table_func(self, line):
    """
    Collect the lines of the list override table.

    Lines with a token are added to self.__override_table_final; lines
    with an empty token are skipped. When the close bracket count equals
    the count recorded when the table was found, the collected text is
    handed to the override table parser, whose two results replace the
    collected text and self.__all_lists, and the state goes back to
    'preamble'.
    """
    if self.__cb_count == self.__close_group_count:
        self.__state = 'preamble'
        parsed = self.__override_table_obj.parse_override_table(
            self.__override_table_final)
        self.__override_table_final, self.__all_lists = parsed
    elif self.__token_info != '':
        self.__override_table_final += line
def __found_revision_table_func(self, line):
    """
    Handle the token that starts the revision table: set the state to
    'revision_table', empty the string the table is collected in, record
    the current open bracket count as the count that closes the group,
    and reset the close bracket count to zero.
    """
    self.__close_group_count = self.__ob_count
    self.__cb_count = 0
    self.__revision_table_final = ''
    self.__state = 'revision_table'
def __revision_table_func(self, line):
    """
    Collect the lines of the revision table.

    Each line is added to self.__revision_table_final until the close
    bracket count equals the count recorded when the table was found
    (compared as numbers). At that point the collected text is wrapped
    in the revision-table tag and its markers and the state goes back
    to 'preamble'; the closing line itself is not stored.
    """
    if int(self.__cb_count) != int(self.__close_group_count):
        self.__revision_table_final += line
        return
    self.__state = 'preamble'
    self.__revision_table_final = ''.join((
        'mi<tg<open______<revision-table\n',
        'mi<mk<revtbl-beg\n',
        self.__revision_table_final,
        'mi<mk<revtbl-end\n',
        'mi<tg<close_____<revision-table\n',
    ))
def __found_doc_info_func(self, line):
    """
    Handle the token that starts the document information group: set
    the state to 'doc_info', empty the string the group is collected in,
    record the current open bracket count as the count that closes the
    group, and reset the close bracket count to zero.
    """
    self.__close_group_count = self.__ob_count
    self.__cb_count = 0
    self.__doc_info_table_final = ''
    self.__state = 'doc_info'
def __doc_info_func(self, line):
    """
    Collect the lines of the document information group.

    When the close bracket count equals the count recorded when the
    group was found, the collected text is wrapped in the
    doc-information tag and its markers and the state goes back to
    'preamble'. Before that, an open or close bracket exactly one level
    inside the group is replaced by the marker for the start or end of
    an individual item; brackets at any other level are dropped. All
    other lines are collected as they are.
    """
    if self.__cb_count == self.__close_group_count:
        self.__state = 'preamble'
        self.__doc_info_table_final = ''.join((
            'mi<tg<open______<doc-information\n',
            'mi<mk<doc-in-beg\n',
            self.__doc_info_table_final,
            'mi<mk<doc-in-end\n',
            'mi<tg<close_____<doc-information\n',
        ))
        return
    token = self.__token_info
    if token == 'ob<nu<open-brack':
        if int(self.__ob_count) == int(self.__close_group_count) + 1:
            self.__doc_info_table_final += 'mi<mk<docinf-beg\n'
        return
    if token == 'cb<nu<clos-brack':
        if int(self.__cb_count) == int(self.__close_group_count) + 1:
            self.__doc_info_table_final += 'mi<mk<docinf-end\n'
        return
    self.__doc_info_table_final += line
def __margin_func(self, line):
    """
    Handle a line that describes the page, for example
    cw<pa<margin-lef<nu<1728

    The ten characters after 'cw<pa<' are looked up in
    self.__margin_dict to get the name of the page attribute, and the
    value at the end of the line (without the newline) is stored under
    that name in self.__page.

    A token that is not in the dictionary is reported on stderr, as
    __section_func does for its own unknown tokens. It used to be
    reported with print(), which writes to standard output, and without
    saying what was wrong.
    """
    abbrev = line[6:16]
    page_att = self.__margin_dict.get(abbrev)
    if page_att is None:
        sys.stderr.write('module is preamble_div\n')
        sys.stderr.write('method is __margin_func\n')
        sys.stderr.write('unknown page token "%s"\n' % abbrev)
    else:
        self.__page[page_att] = line[20:-1]
# cw<pa<margin-lef<nu<1728
def __print_page_info(self):
    """
    Write the page-definition empty tag on a single line, with one
    attribute for every entry of self.__page in the order the
    dictionary yields them.
    """
    emit = self.__write_obj.write
    emit('mi<tg<empty-att_<page-definition')
    for att, value in self.__page.items():
        emit('<%s>%s' % (att, value))
    emit('\n')
# mi<tg<open-att__<footn
def __print_sec_info(self):
"""
Check if there is any section info. If so, print it out.
If not, print out an empty tag to satisfy the dtd.
"""
if len(self.__section.keys()) == 0:
self.__write_obj.write(
'mi<tg<open______<section-definition\n'
)
else:
self.__write_obj.write(
'mi<tg<open-att__<section-definition')
keys = self.__section.keys()
for key in keys:
self.__write_obj.write(
'<%s>%s' % (key, self.__section[key])
)
self.__write_obj.write('\n')
def __section_func(self, line):
"""
Add info pertaining to section to the self.__section dictionary, to be
printed out later.
"""
info = self.__translate_sec.get(line[6:16])
if info is None:
sys.stderr.write('woops!\n')
else:
self.__section[info] = 'true'
def __body_func(self, line):
self.__write_obj.write(line)
def __default_func(self, line):
# either in preamble or in body
pass
def __para_def_func(self, line):
# if self.__ob_group == 1
# this tells dept of group
if self.__cb_count == '0002':
self.__state = 'body'
self.__write_preamble()
self.__write_obj.write(line)
def __text_func(self, line):
"""
If the cb_count is less than 1, you have hit the body
For older RTF
Newer RTF should never have to use this function
"""
if self.__cb_count == '':
cb_count = '0002'
else:
cb_count = self.__cb_count
# ignore previous lines
# should be
# if self.__ob_group == 1
# this tells dept of group
if cb_count == '0002':
self.__state = 'body'
self.__write_preamble()
self.__write_obj.write(line)
def __row_def_func(self, line):
# if self.__ob_group == 1
# this tells dept of group
if self.__cb_count == '0002':
self.__state = 'body'
self.__write_preamble()
self.__write_obj.write(line)
def __new_section_func(self, line):
"""
This is new. The start of a section marks the end of the preamble
"""
if self.__cb_count == '0002':
self.__state = 'body'
self.__write_preamble()
else:
sys.stderr.write('module is preamble_div\n')
sys.stderr.write('method is __new_section_func\n')
sys.stderr.write('bracket count should be 2?\n')
self.__write_obj.write(line)
def __write_preamble(self):
"""
Write all the strings, which represent all the data in the preamble.
Write a body and section beginning.
"""
if self.__no_namespace:
self.__write_obj.write(
'mi<tg<open______<doc\n'
)
else:
self.__write_obj.write(
'mi<tg<open-att__<doc<xmlns>http://rtf2xml.sourceforge.net/\n')
self.__write_obj.write('mi<tg<open______<preamble\n')
self.__write_obj.write(self.__rtf_final)
if not self.__color_table_final:
self.__make_default_color_table()
if not self.__font_table_final:
self.__make_default_font_table()
self.__write_obj.write(self.__font_table_final)
self.__write_obj.write(self.__color_table_final)
if not self.__style_sheet_final:
self.__make_default_style_table()
self.__write_obj.write(self.__style_sheet_final)
self.__write_obj.write(self.__list_table_final)
self.__write_obj.write(self.__override_table_final)
self.__write_obj.write(self.__revision_table_final)
self.__write_obj.write(self.__doc_info_table_final)
self.__print_page_info()
self.__write_obj.write('ob<nu<open-brack<0001\n')
self.__write_obj.write('ob<nu<open-brack<0002\n')
self.__write_obj.write('cb<nu<clos-brack<0002\n')
self.__write_obj.write('mi<tg<close_____<preamble\n')
self.__write_obj.write('mi<tg<open______<body\n')
# self.__write_obj.write('mi<tg<open-att__<section<num>1\n')
# self.__print_sec_info()
# self.__write_obj.write('mi<tg<open______<headers-and-footers\n')
# self.__write_obj.write('mi<mk<head_foot_<\n')
# self.__write_obj.write('mi<tg<close_____<headers-and-footers\n')
self.__write_obj.write('mi<mk<body-open_\n')
def __preamble_func(self, line):
"""
Check if the token info belongs to the dictionary. If so, take the
appropriate action.
"""
action = self.__state_dict.get(self.__token_info)
if action:
action(line)
def make_preamble_divisions(self):
    """
    Divide the preamble of the file into its component tables.

    Requires:
        nothing
    Returns:
        self.__all_lists
    Logic:
        Read self.__file one line at a time. For every line, record its
        16-character token, keep the open/close bracket counters up to
        date, and hand the line to the handler registered for the current
        state. Output goes to a temporary file which afterwards replaces
        the original file (and is copied for debugging when self.__copy
        is set).
    """
    self.__initiate_values()
    # The context managers close both files even when a handler raises;
    # previously an exception leaked the two open file objects.
    with open_for_read(self.__file) as read_obj:
        with open_for_write(self.__write_to) as self.__write_obj:
            while True:
                line = read_obj.readline()
                self.__token_info = line[:16]
                if self.__token_info == 'ob<nu<open-brack':
                    self.__ob_count = line[-5:-1]
                    self.__ob_group += 1
                if self.__token_info == 'cb<nu<clos-brack':
                    self.__cb_count = line[-5:-1]
                    self.__ob_group -= 1
                action = self.__state_dict.get(self.__state)
                if action is None:
                    # Report on stderr rather than stdout.
                    # NOTE(review): an unregistered state still fails on
                    # the call below (TypeError), exactly as before; the
                    # diagnostic only names the state first.
                    sys.stderr.write(
                        'no matching state in module preamble_div.py\n'
                        '%s\n' % self.__state)
                action(line)
                # The empty string returned at end of file is dispatched
                # once as well (handlers have a branch for an empty
                # token), so the loop ends only after that call.
                if not line:
                    break
    copy_obj = copy.Copy(bug_handler=self.__bug_handler)
    if self.__copy:
        copy_obj.copy_file(self.__write_to, "preamble_div.data")
    copy_obj.rename(self.__write_to, self.__file)
    os.remove(self.__write_to)
    return self.__all_lists

View File

@@ -0,0 +1,157 @@
from __future__ import unicode_literals, absolute_import, print_function, division
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# #
#########################################################################
import sys,os
from calibre.ebooks.rtf2xml import copy
from . import open_for_read, open_for_write
class Preamble:
    """
    Fix the remaining parts of the preamble. This module does very little. It
    makes sure that no text gets put in the revision or list table. In the
    future, when I understand how to interpret the revision table and list
    table, I will make these methods more functional.
    """

    def __init__(self, file,
                 bug_handler,
                 platform,
                 default_font,
                 code_page,
                 copy=None,
                 temp_dir=None,
                 ):
        """
        Required:
            file--file to parse
            bug_handler -- passed on to the copy helper
            platform --Windows or Macintosh
            default_font -- the default font
            code_page --the code page (ansi1252, for example)
        Optional:
            'copy'-- whether to make a copy of result for debugging
            'temp_dir' --where to output temporary results (default is
            directory from which the script is run.)
        Returns:
            nothing
        """
        self.__file = file
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__default_font = default_font
        self.__code_page = code_page
        self.__platform = platform
        if temp_dir:
            self.__write_to = os.path.join(temp_dir, "info_table_info.data")
        else:
            self.__write_to = "info_table_info.data"

    def __initiate_values(self):
        """
        Initiate all values: the starting state, the state -> handler
        table, and the token -> handler table used in the default state.
        """
        self.__state = 'default'
        self.__text_string = ''
        self.__state_dict = {
            'default'           : self.__default_func,
            'revision'          : self.__revision_table_func,
            'list_table'        : self.__list_table_func,
            'body'              : self.__body_func,
        }
        self.__default_dict = {
            'mi<mk<rtfhed-beg'  : self.__found_rtf_head_func,
            'mi<mk<listabbeg_'  : self.__found_list_table_func,
            'mi<mk<revtbl-beg'  : self.__found_revision_table_func,
            'mi<mk<body-open_'  : self.__found_body_func,
        }

    def __default_func(self, line):
        """
        Requires:
            line -- the line to parse
        Returns:
            nothing
        Logic:
            Call the handler registered for the current token, if any;
            otherwise copy the line to the output file.
        """
        action = self.__default_dict.get(self.__token_info)
        if action:
            action(line)
        else:
            self.__write_obj.write(line)

    def __found_rtf_head_func(self, line):
        """
        Requires:
            line -- the line to parse
        Returns:
            nothing.
        Logic:
            Write to the output file the default font info, the code page
            info, and the platform info. The marker line itself is not
            written.
        """
        self.__write_obj.write(
            'mi<tg<empty-att_<rtf-definition'
            '<default-font>%s<code-page>%s'
            '<platform>%s\n' % (self.__default_font, self.__code_page,
                                self.__platform)
        )

    def __found_list_table_func(self, line):
        """Enter the list-table state; the marker line is not written."""
        self.__state = 'list_table'

    def __list_table_func(self, line):
        """
        Requires:
            line -- the line to parse
        Returns:
            nothing
        Logic:
            The end marker returns to the default state (and is not
            written). Lines starting with 'tx' are dropped; every other
            line is copied to the output file.
        """
        if self.__token_info == 'mi<mk<listabend_':
            self.__state = 'default'
        elif line[0:2] == 'tx':
            pass
        else:
            self.__write_obj.write(line)

    def __found_revision_table_func(self, line):
        """Enter the revision state; the marker line is not written."""
        self.__state = 'revision'

    def __revision_table_func(self, line):
        """
        Requires:
            line -- the line to parse
        Returns:
            nothing
        Logic:
            The end marker returns to the default state (and is not
            written). Lines starting with 'tx' are dropped; every other
            line is copied to the output file.
        """
        if self.__token_info == 'mi<mk<revtbl-end':
            self.__state = 'default'
        elif line[0:2] == 'tx':
            pass
        else:
            self.__write_obj.write(line)

    def __found_body_func(self, line):
        """Enter the body state and write the body-open marker line."""
        self.__state = 'body'
        self.__write_obj.write(line)

    def __body_func(self, line):
        """Copy a body line to the output file unchanged."""
        self.__write_obj.write(line)

    def fix_preamble(self):
        """
        Requires:
            nothing
        Returns:
            nothing (changes the original file)
        Logic:
            Read one line in at a time. Determine what action to take based on
            the state. The state can be default, the revision table, the
            list table, or the body.
        """
        self.__initiate_values()
        with open_for_read(self.__file) as read_obj:
            with open_for_write(self.__write_to) as self.__write_obj:
                for line in read_obj:
                    self.__token_info = line[:16]
                    action = self.__state_dict.get(self.__state)
                    if action is None:
                        sys.stderr.write(
                            'no matching state in module preamble_rest.py\n' + self.__state + '\n')
                        # There is no handler to call: skip the line
                        # instead of failing with "NoneType is not
                        # callable" right after the diagnostic.
                        continue
                    action(line)
        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
        if self.__copy:
            # Use this module's own name for the debug copy; the name
            # "preamble_div.data" overwrote the debug copy made by the
            # preamble_div stage.
            copy_obj.copy_file(self.__write_to, "preamble_rest.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)

View File

@@ -0,0 +1,837 @@
from __future__ import absolute_import, division, print_function, unicode_literals
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# #
#########################################################################
import os, re
from calibre.ebooks.rtf2xml import copy, check_brackets
from calibre.ptempfile import better_mktemp
from polyglot.builtins import unicode_type
from . import open_for_read, open_for_write
class ProcessTokens:
"""
Process each token on a line and add information that will be useful for
later processing. Information will be put on one line, delimited by "<"
for main fields, and ">" for sub fields
"""
def __init__(self,
             in_file,
             exception_handler,
             bug_handler,
             copy=None,
             run_level=1,
             ):
    """
    Required:
        in_file -- the file to process
        exception_handler -- stored for later use
        bug_handler -- stored for later use
    Optional:
        copy -- stored for later use (default None)
        run_level -- stored for later use (default 1)
    Logic:
        Store the arguments, obtain a temporary output path, start the
        bracket count at 0, and build the token dictionaries and the
        compiled regular expressions.
    """
    self.__file = in_file
    self.__copy = copy
    self.__run_level = run_level
    self.__exception_handler = exception_handler
    self.__bug_handler = bug_handler
    self.__bracket_count = 0
    self.__write_to = better_mktemp()
    self.initiate_token_dict()
    self.compile_expressions()
def compile_expressions(self):
    """
    Compile the regular expressions kept on the instance.

    self.__num_exp matches a run of ASCII letters followed by the rest of
    the string; self.__utf_exp matches the shortest span that starts with
    '&' and ends with ';'.
    """
    letters_then_rest = re.compile(r"([a-zA-Z]+)(.*)")
    ampersand_entity = re.compile(r'(&.*?;)')
    self.__num_exp = letters_then_rest
    self.__utf_exp = ampersand_entity
def initiate_token_dict(self):
    """
    Build the lookup tables used while processing tokens.

    Requires:
        nothing
    Returns:
        nothing
    Logic:
        Reset self.__return_code to 0 and create three dictionaries:

        self.dict_token -- maps an RTF control word (without the
        backslash) to a tuple of (two-letter category, token name,
        handler method). Token names are ten characters long, or have the
        form 'name<value' for entries handled by two_part_func.

        self.__number_type_dict -- maps the numeric list-numbering code
        to a descriptive name.

        self.__language_dict -- maps a numeric language id to the name of
        the language.
    """
    self.__return_code = 0
    self.dict_token={
        # unicode
        'mshex' : ('nu', '__________', self.__ms_hex_func),
        # brackets
        '{' : ('nu', '{', self.ob_func),
        '}' : ('nu', '}', self.cb_func),
        # microsoft characters
        'ldblquote' : ('mc', 'ldblquote', self.ms_sub_func),
        'rdblquote' : ('mc', 'rdblquote', self.ms_sub_func),
        'rquote' : ('mc', 'rquote', self.ms_sub_func),
        'lquote' : ('mc', 'lquote', self.ms_sub_func),
        'emdash' : ('mc', 'emdash', self.ms_sub_func),
        'endash' : ('mc', 'endash', self.ms_sub_func),
        'bullet' : ('mc', 'bullet', self.ms_sub_func),
        '~' : ('mc', '~', self.ms_sub_func),
        'tab' : ('mc', 'tab', self.ms_sub_func),
        '_' : ('mc', '_', self.ms_sub_func),
        ';' : ('mc', ';', self.ms_sub_func),
        # this must be wrong
        '-' : ('mc', '-', self.ms_sub_func),
        'line' : ('mi', 'hardline-break', self.direct_conv_func), # calibre
        # misc => ml
        '*' : ('ml', 'asterisk__', self.default_func),
        ':' : ('ml', 'colon_____', self.default_func),
        # text
        'backslash' : ('nu', '\\', self.text_func),
        'ob' : ('nu', '{', self.text_func),
        'cb' : ('nu', '}', self.text_func),
        # paragraph formatting => pf
        'page' : ('pf', 'page-break', self.default_func),
        'par' : ('pf', 'par-end___', self.default_func),
        'pard' : ('pf', 'par-def___', self.default_func),
        'keepn' : ('pf', 'keep-w-nex', self.bool_st_func),
        'widctlpar' : ('pf', 'widow-cntl', self.bool_st_func),
        'adjustright' : ('pf', 'adjust-rgt', self.bool_st_func),
        'lang' : ('pf', 'language__', self.__language_func),
        'ri' : ('pf', 'right-inde', self.divide_by_20),
        'fi' : ('pf', 'fir-ln-ind', self.divide_by_20),
        'li' : ('pf', 'left-inden', self.divide_by_20),
        'sb' : ('pf', 'space-befo', self.divide_by_20),
        'sa' : ('pf', 'space-afte', self.divide_by_20),
        'sl' : ('pf', 'line-space', self.divide_by_20),
        'deftab' : ('pf', 'default-ta', self.divide_by_20),
        'ql' : ('pf', 'align_____<left', self.two_part_func),
        'qc' : ('pf', 'align_____<cent', self.two_part_func),
        'qj' : ('pf', 'align_____<just', self.two_part_func),
        'qr' : ('pf', 'align_____<right', self.two_part_func),
        'nowidctlpar' : ('pf', 'widow-cntr<false', self.two_part_func),
        'tx' : ('pf', 'tab-stop__', self.divide_by_20),
        'tb' : ('pf', 'tab-bar-st', self.divide_by_20),
        'tqr' : ('pf', 'tab-right_', self.default_func),
        'tqdec' : ('pf', 'tab-dec___', self.default_func),
        'tqc' : ('pf', 'tab-center', self.default_func),
        'tlul' : ('pf', 'leader-und', self.default_func),
        'tlhyph' : ('pf', 'leader-hyp', self.default_func),
        'tldot' : ('pf', 'leader-dot', self.default_func),
        # stylesheet = > ss
        'stylesheet' : ('ss', 'style-shet', self.default_func),
        'sbasedon' : ('ss', 'based-on__', self.default_func),
        'snext' : ('ss', 'next-style', self.default_func),
        'cs' : ('ss', 'char-style', self.default_func),
        's' : ('ss', 'para-style', self.default_func),
        # graphics => gr
        'pict' : ('gr', 'picture___', self.default_func),
        'objclass' : ('gr', 'obj-class_', self.default_func),
        'macpict' : ('gr', 'mac-pic___', self.default_func),
        # section => sc
        'sect' : ('sc', 'section___', self.default_func),
        'sectd' : ('sc', 'sect-defin', self.default_func),
        'endhere' : ('sc', 'sect-note_', self.default_func),
        # list=> ls
        'pntext' : ('ls', 'list-text_', self.default_func),
        # this line must be wrong because it duplicates an earlier one
        'listtext' : ('ls', 'list-text_', self.default_func),
        'pn' : ('ls', 'list______', self.default_func),
        'pnseclvl' : ('ls', 'list-level', self.default_func),
        'pncard' : ('ls', 'list-cardi', self.bool_st_func),
        'pndec' : ('ls', 'list-decim', self.bool_st_func),
        'pnucltr' : ('ls', 'list-up-al', self.bool_st_func),
        'pnucrm' : ('ls', 'list-up-ro', self.bool_st_func),
        'pnord' : ('ls', 'list-ord__', self.bool_st_func),
        'pnordt' : ('ls', 'list-ordte', self.bool_st_func),
        'pnlvlblt' : ('ls', 'list-bulli', self.bool_st_func),
        'pnlvlbody' : ('ls', 'list-simpi', self.bool_st_func),
        'pnlvlcont' : ('ls', 'list-conti', self.bool_st_func),
        'pnhang' : ('ls', 'list-hang_', self.bool_st_func),
        'pntxtb' : ('ls', 'list-tebef', self.bool_st_func),
        'ilvl' : ('ls', 'list-level', self.default_func),
        'ls' : ('ls', 'list-id___', self.default_func),
        'pnstart' : ('ls', 'list-start', self.default_func),
        'itap' : ('ls', 'nest-level', self.default_func),
        'leveltext' : ('ls', 'level-text', self.default_func),
        'levelnumbers' : ('ls', 'level-numb', self.default_func),
        'list' : ('ls', 'list-in-tb', self.default_func),
        'listlevel' : ('ls', 'list-tb-le', self.default_func),
        'listname' : ('ls', 'list-name_', self.default_func),
        'listtemplateid' : ('ls', 'ls-tem-id_', self.default_func),
        'leveltemplateid' : ('ls', 'lv-tem-id_', self.default_func),
        'listhybrid' : ('ls', 'list-hybri', self.default_func),
        'levelstartat' : ('ls', 'level-star', self.default_func),
        'levelspace' : ('ls', 'level-spac', self.divide_by_20),
        'levelindent' : ('ls', 'level-inde', self.default_func),
        'levelnfc' : ('ls', 'level-type', self.__list_type_func),
        'levelnfcn' : ('ls', 'level-type', self.__list_type_func),
        'listid' : ('ls', 'lis-tbl-id', self.default_func),
        'listoverride' : ('ls', 'lis-overid', self.default_func),
        # duplicate
        'pnlvl' : ('ls', 'list-level', self.default_func),
        # root info => ri
        'rtf' : ('ri', 'rtf_______', self.default_func),
        'deff' : ('ri', 'deflt-font', self.default_func),
        'mac' : ('ri', 'macintosh_', self.default_func),
        'pc' : ('ri', 'pc________', self.default_func),
        'pca' : ('ri', 'pca_______', self.default_func),
        'ansi' : ('ri', 'ansi______', self.default_func),
        'ansicpg' : ('ri', 'ansi-codpg', self.default_func),
        # notes => nt
        'footnote' : ('nt', 'footnote__', self.default_func),
        'ftnalt' : ('nt', 'type______<endnote', self.two_part_func),
        # anchor => an
        'tc' : ('an', 'toc_______', self.default_func),
        'bkmkstt' : ('an', 'book-mk-st', self.default_func),
        'bkmkstart' : ('an', 'book-mk-st', self.default_func),
        'bkmkend' : ('an', 'book-mk-en', self.default_func),
        'xe' : ('an', 'index-mark', self.default_func),
        'rxe' : ('an', 'place_____', self.default_func),
        # index => in
        'bxe' : ('in', 'index-bold', self.default_func),
        'ixe' : ('in', 'index-ital', self.default_func),
        'txe' : ('in', 'index-see_', self.default_func),
        # table of contents => tc
        'tcl' : ('tc', 'toc-level_', self.default_func),
        'tcn' : ('tc', 'toc-sup-nu', self.default_func),
        # field => fd
        'field' : ('fd', 'field_____', self.default_func),
        'fldinst' : ('fd', 'field-inst', self.default_func),
        'fldrslt' : ('fd', 'field-rslt', self.default_func),
        'datafield' : ('fd', 'datafield_', self.default_func),
        # info-tables => it
        'fonttbl' : ('it', 'font-table', self.default_func),
        'colortbl' : ('it', 'colr-table', self.default_func),
        'listoverridetable' : ('it', 'lovr-table', self.default_func),
        'listtable' : ('it', 'listtable_', self.default_func),
        'revtbl' : ('it', 'revi-table', self.default_func),
        # character info => ci
        'b' : ('ci', 'bold______', self.bool_st_func),
        'blue' : ('ci', 'blue______', self.color_func),
        'caps' : ('ci', 'caps______', self.bool_st_func),
        'cf' : ('ci', 'font-color', self.colorz_func),
        'chftn' : ('ci', 'footnot-mk', self.bool_st_func),
        'dn' : ('ci', 'font-down_', self.divide_by_2),
        'embo' : ('ci', 'emboss____', self.bool_st_func),
        'f' : ('ci', 'font-style', self.default_func),
        'fs' : ('ci', 'font-size_', self.divide_by_2),
        'green' : ('ci', 'green_____', self.color_func),
        'i' : ('ci', 'italics___', self.bool_st_func),
        'impr' : ('ci', 'engrave___', self.bool_st_func),
        'outl' : ('ci', 'outline___', self.bool_st_func),
        'plain' : ('ci', 'plain_____', self.bool_st_func),
        'red' : ('ci', 'red_______', self.color_func),
        'scaps' : ('ci', 'small-caps', self.bool_st_func),
        'shad' : ('ci', 'shadow____', self.bool_st_func),
        'strike' : ('ci', 'strike-thr', self.bool_st_func),
        'striked' : ('ci', 'dbl-strike', self.bool_st_func),
        'sub' : ('ci', 'subscript_', self.bool_st_func),
        'super' : ('ci', 'superscrip', self.bool_st_func),
        'nosupersub' : ('ci', 'no-su-supe', self.__no_sup_sub_func),
        'up' : ('ci', 'font-up___', self.divide_by_2),
        'v' : ('ci', 'hidden____', self.default_func),
        # underline
        # can't see why it isn't a char info: 'ul'=>'ci'
        'ul' : ('ci', 'underlined<continous', self.two_part_func),
        'uld' : ('ci', 'underlined<dotted', self.two_part_func),
        'uldash' : ('ci', 'underlined<dash', self.two_part_func),
        'uldashd' : ('ci', 'underlined<dash-dot', self.two_part_func),
        'uldashdd' : ('ci', 'underlined<dash-dot-dot', self.two_part_func),
        'uldb' : ('ci', 'underlined<double', self.two_part_func),
        'ulhwave' : ('ci', 'underlined<heavy-wave', self.two_part_func),
        'ulldash' : ('ci', 'underlined<long-dash', self.two_part_func),
        'ulth' : ('ci', 'underlined<thich', self.two_part_func),
        'ulthd' : ('ci', 'underlined<thick-dotted', self.two_part_func),
        'ulthdash' : ('ci', 'underlined<thick-dash', self.two_part_func),
        'ulthdashd' : ('ci', 'underlined<thick-dash-dot', self.two_part_func),
        'ulthdashdd' : ('ci', 'underlined<thick-dash-dot-dot', self.two_part_func),
        'ulthldash' : ('ci', 'underlined<thick-long-dash', self.two_part_func),
        'ululdbwave' : ('ci', 'underlined<double-wave', self.two_part_func),
        'ulw' : ('ci', 'underlined<word', self.two_part_func),
        'ulwave' : ('ci', 'underlined<wave', self.two_part_func),
        'ulnone' : ('ci', 'underlined<false', self.two_part_func),
        # table => tb
        'trowd' : ('tb', 'row-def___', self.default_func),
        'cell' : ('tb', 'cell______', self.default_func),
        'row' : ('tb', 'row_______', self.default_func),
        'intbl' : ('tb', 'in-table__', self.default_func),
        'cols' : ('tb', 'columns___', self.default_func),
        'trleft' : ('tb', 'row-pos-le', self.divide_by_20),
        'cellx' : ('tb', 'cell-posit', self.divide_by_20),
        'trhdr' : ('tb', 'row-header', self.default_func),
        # preamble => pr
        # document information => di
        # TODO integrate \userprops
        'info' : ('di', 'doc-info__', self.default_func),
        'title' : ('di', 'title_____', self.default_func),
        'author' : ('di', 'author____', self.default_func),
        'operator' : ('di', 'operator__', self.default_func),
        'manager' : ('di', 'manager___', self.default_func),
        'company' : ('di', 'company___', self.default_func),
        'keywords' : ('di', 'keywords__', self.default_func),
        'category' : ('di', 'category__', self.default_func),
        'doccomm' : ('di', 'doc-notes_', self.default_func),
        'comment' : ('di', 'doc-notes_', self.default_func),
        'subject' : ('di', 'subject___', self.default_func),
        'creatim' : ('di', 'create-tim', self.default_func),
        'yr' : ('di', 'year______', self.default_func),
        'mo' : ('di', 'month_____', self.default_func),
        'dy' : ('di', 'day_______', self.default_func),
        'min' : ('di', 'minute____', self.default_func),
        'sec' : ('di', 'second____', self.default_func),
        'revtim' : ('di', 'revis-time', self.default_func),
        'edmins' : ('di', 'edit-time_', self.default_func),
        'printim' : ('di', 'print-time', self.default_func),
        'buptim' : ('di', 'backuptime', self.default_func),
        'nofwords' : ('di', 'num-of-wor', self.default_func),
        'nofchars' : ('di', 'num-of-chr', self.default_func),
        'nofcharsws' : ('di', 'numofchrws', self.default_func),
        'nofpages' : ('di', 'num-of-pag', self.default_func),
        'version' : ('di', 'version___', self.default_func),
        'vern' : ('di', 'intern-ver', self.default_func),
        'hlinkbase' : ('di', 'linkbase__', self.default_func),
        'id' : ('di', 'internalID', self.default_func),
        # headers and footers => hf
        'headerf' : ('hf', 'head-first', self.default_func),
        'headerl' : ('hf', 'head-left_', self.default_func),
        'headerr' : ('hf', 'head-right', self.default_func),
        'footerf' : ('hf', 'foot-first', self.default_func),
        'footerl' : ('hf', 'foot-left_', self.default_func),
        'footerr' : ('hf', 'foot-right', self.default_func),
        'header' : ('hf', 'header____', self.default_func),
        'footer' : ('hf', 'footer____', self.default_func),
        # page => pa
        'margl' : ('pa', 'margin-lef', self.divide_by_20),
        'margr' : ('pa', 'margin-rig', self.divide_by_20),
        'margb' : ('pa', 'margin-bot', self.divide_by_20),
        'margt' : ('pa', 'margin-top', self.divide_by_20),
        'gutter' : ('pa', 'gutter____', self.divide_by_20),
        'paperw' : ('pa', 'paper-widt', self.divide_by_20),
        'paperh' : ('pa', 'paper-hght', self.divide_by_20),
        # annotation => an
        'annotation' : ('an', 'annotation', self.default_func),
        # border => bd
        'trbrdrh' : ('bd', 'bor-t-r-hi', self.default_func),
        'trbrdrv' : ('bd', 'bor-t-r-vi', self.default_func),
        'trbrdrt' : ('bd', 'bor-t-r-to', self.default_func),
        'trbrdrl' : ('bd', 'bor-t-r-le', self.default_func),
        'trbrdrb' : ('bd', 'bor-t-r-bo', self.default_func),
        'trbrdrr' : ('bd', 'bor-t-r-ri', self.default_func),
        'clbrdrb' : ('bd', 'bor-cel-bo', self.default_func),
        'clbrdrt' : ('bd', 'bor-cel-to', self.default_func),
        'clbrdrl' : ('bd', 'bor-cel-le', self.default_func),
        'clbrdrr' : ('bd', 'bor-cel-ri', self.default_func),
        'brdrb' : ('bd', 'bor-par-bo', self.default_func),
        'brdrt' : ('bd', 'bor-par-to', self.default_func),
        'brdrl' : ('bd', 'bor-par-le', self.default_func),
        'brdrr' : ('bd', 'bor-par-ri', self.default_func),
        'box' : ('bd', 'bor-par-bx', self.default_func),
        'chbrdr' : ('bd', 'bor-par-bo', self.default_func),
        'brdrbtw' : ('bd', 'bor-for-ev', self.default_func),
        'brdrbar' : ('bd', 'bor-outsid', self.default_func),
        'brdrnone' : ('bd', 'bor-none__<false', self.two_part_func),
        # border type => bt
        'brdrs' : ('bt', 'bdr-single', self.default_func),
        'brdrth' : ('bt', 'bdr-doubtb', self.default_func),
        'brdrsh' : ('bt', 'bdr-shadow', self.default_func),
        'brdrdb' : ('bt', 'bdr-double', self.default_func),
        'brdrdot' : ('bt', 'bdr-dotted', self.default_func),
        'brdrdash' : ('bt', 'bdr-dashed', self.default_func),
        'brdrhair' : ('bt', 'bdr-hair__', self.default_func),
        'brdrinset' : ('bt', 'bdr-inset_', self.default_func),
        'brdrdashsm' : ('bt', 'bdr-das-sm', self.default_func),
        'brdrdashd' : ('bt', 'bdr-dot-sm', self.default_func),
        'brdrdashdd' : ('bt', 'bdr-dot-do', self.default_func),
        'brdroutset' : ('bt', 'bdr-outset', self.default_func),
        'brdrtriple' : ('bt', 'bdr-trippl', self.default_func),
        'brdrtnthsg' : ('bt', 'bdr-thsm__', self.default_func),
        'brdrthtnsg' : ('bt', 'bdr-htsm__', self.default_func),
        'brdrtnthtnsg' : ('bt', 'bdr-hthsm_', self.default_func),
        'brdrtnthmg' : ('bt', 'bdr-thm___', self.default_func),
        'brdrthtnmg' : ('bt', 'bdr-htm___', self.default_func),
        'brdrtnthtnmg' : ('bt', 'bdr-hthm__', self.default_func),
        'brdrtnthlg' : ('bt', 'bdr-thl___', self.default_func),
        'brdrtnthtnlg' : ('bt', 'bdr-hthl__', self.default_func),
        'brdrwavy' : ('bt', 'bdr-wavy__', self.default_func),
        'brdrwavydb' : ('bt', 'bdr-d-wav_', self.default_func),
        'brdrdashdotstr' : ('bt', 'bdr-strip_', self.default_func),
        'brdremboss' : ('bt', 'bdr-embos_', self.default_func),
        'brdrengrave' : ('bt', 'bdr-engra_', self.default_func),
        'brdrframe' : ('bt', 'bdr-frame_', self.default_func),
        'brdrw' : ('bt', 'bdr-li-wid', self.divide_by_20),
        'brsp' : ('bt', 'bdr-sp-wid', self.divide_by_20),
        'brdrcf' : ('bt', 'bdr-color_', self.default_func),
        # comments
        # 'comment' : ('cm', 'comment___', self.default_func),
    }
    # Keys are the numeric list-numbering codes. The four Katakana
    # entries were keyed 1246, 1346, 2046 and 2146, leaving 12, 13, 20
    # and 21 missing from an otherwise continuous 10-48 sequence, so
    # those codes could never be resolved; they are keyed correctly here.
    self.__number_type_dict = {
        0: 'Arabic',
        1: 'uppercase Roman numeral',
        2: 'lowercase Roman numeral',
        3: 'uppercase letter',
        4: 'lowercase letter',
        5: 'ordinal number',
        6: 'cardianl text number',
        7: 'ordinal text number',
        10: 'Kanji numbering without the digit character',
        11: 'Kanji numbering with the digit character',
        12: 'phonetic Katakana characters in aiueo order',
        13: 'phonetic katakana characters in iroha order',
        14: 'double byte character',
        15: 'single byte character',
        16: 'Kanji numbering 3',
        17: 'Kanji numbering 4',
        18: 'Circle numbering' ,
        19: 'double-byte Arabic numbering',
        20: 'phonetic double-byte Katakana characters',
        21: 'phonetic double-byte katakana characters',
        22: 'Arabic with leading zero',
        23: 'bullet',
        24: 'Korean numbering 2',
        25: 'Korean numbering 1',
        26: 'Chinese numbering 1',
        27: 'Chinese numbering 2',
        28: 'Chinese numbering 3',
        29: 'Chinese numbering 4',
        30: 'Chinese Zodiac numbering 1',
        31: 'Chinese Zodiac numbering 2',
        32: 'Chinese Zodiac numbering 3',
        33: 'Taiwanese double-byte numbering 1',
        34: 'Taiwanese double-byte numbering 2',
        35: 'Taiwanese double-byte numbering 3',
        36: 'Taiwanese double-byte numbering 4',
        37: 'Chinese double-byte numbering 1',
        38: 'Chinese double-byte numbering 2',
        39: 'Chinese double-byte numbering 3',
        40: 'Chinese double-byte numbering 4',
        41: 'Korean double-byte numbering 1',
        42: 'Korean double-byte numbering 2',
        43: 'Korean double-byte numbering 3',
        44: 'Korean double-byte numbering 4',
        45: 'Hebrew non-standard decimal',
        46: 'Arabic Alif Ba Tah',
        47: 'Hebrew Biblical standard',
        48: 'Arabic Abjad style',
        255: 'No number',
    }
    self.__language_dict = {
        1078 : 'Afrikaans',
        1052 : 'Albanian',
        1025 : 'Arabic',
        5121 : 'Arabic Algeria',
        15361 : 'Arabic Bahrain',
        3073 : 'Arabic Egypt',
        1 : 'Arabic General',
        2049 : 'Arabic Iraq',
        11265 : 'Arabic Jordan',
        13313 : 'Arabic Kuwait',
        12289 : 'Arabic Lebanon',
        4097 : 'Arabic Libya',
        6145 : 'Arabic Morocco',
        8193 : 'Arabic Oman',
        16385 : 'Arabic Qatar',
        10241 : 'Arabic Syria',
        7169 : 'Arabic Tunisia',
        14337 : 'Arabic U.A.E.',
        9217 : 'Arabic Yemen',
        1067 : 'Armenian',
        1101 : 'Assamese',
        2092 : 'Azeri Cyrillic',
        1068 : 'Azeri Latin',
        1069 : 'Basque',
        1093 : 'Bengali',
        4122 : 'Bosnia Herzegovina',
        1026 : 'Bulgarian',
        1109 : 'Burmese',
        1059 : 'Byelorussian',
        1027 : 'Catalan',
        2052 : 'Chinese China',
        4 : 'Chinese General',
        3076 : 'Chinese Hong Kong',
        4100 : 'Chinese Singapore',
        1028 : 'Chinese Taiwan',
        1050 : 'Croatian',
        1029 : 'Czech',
        1030 : 'Danish',
        2067 : 'Dutch Belgium',
        1043 : 'Dutch Standard',
        3081 : 'English Australia',
        10249 : 'English Belize',
        2057 : 'English British',
        4105 : 'English Canada',
        9225 : 'English Caribbean',
        9 : 'English General',
        6153 : 'English Ireland',
        8201 : 'English Jamaica',
        5129 : 'English New Zealand',
        13321 : 'English Philippines',
        7177 : 'English South Africa',
        11273 : 'English Trinidad',
        1033 : 'English United States',
        1061 : 'Estonian',
        1080 : 'Faerose',
        1065 : 'Farsi',
        1035 : 'Finnish',
        1036 : 'French',
        2060 : 'French Belgium',
        11276 : 'French Cameroon',
        3084 : 'French Canada',
        12300 : 'French Cote d\'Ivoire',
        5132 : 'French Luxembourg',
        13324 : 'French Mali',
        6156 : 'French Monaco',
        8204 : 'French Reunion',
        10252 : 'French Senegal',
        4108 : 'French Swiss',
        7180 : 'French West Indies',
        9228 : 'French Democratic Republic of the Congo',
        1122 : 'Frisian',
        1084 : 'Gaelic',
        2108 : 'Gaelic Ireland',
        1110 : 'Galician',
        1079 : 'Georgian',
        1031 : 'German',
        3079 : 'German Austrian',
        5127 : 'German Liechtenstein',
        4103 : 'German Luxembourg',
        2055 : 'German Switzerland',
        1032 : 'Greek',
        1095 : 'Gujarati',
        1037 : 'Hebrew',
        1081 : 'Hindi',
        1038 : 'Hungarian',
        1039 : 'Icelandic',
        1057 : 'Indonesian',
        1040 : 'Italian',
        2064 : 'Italian Switzerland',
        1041 : 'Japanese',
        1099 : 'Kannada',
        1120 : 'Kashmiri',
        2144 : 'Kashmiri India',
        1087 : 'Kazakh',
        1107 : 'Khmer',
        1088 : 'Kirghiz',
        1111 : 'Konkani',
        1042 : 'Korean',
        2066 : 'Korean Johab',
        1108 : 'Lao',
        1062 : 'Latvian',
        1063 : 'Lithuanian',
        2087 : 'Lithuanian Classic',
        1086 : 'Malay',
        2110 : 'Malay Brunei Darussalam',
        1100 : 'Malayalam',
        1082 : 'Maltese',
        1112 : 'Manipuri',
        1102 : 'Marathi',
        1104 : 'Mongolian',
        1121 : 'Nepali',
        2145 : 'Nepali India',
        1044 : 'Norwegian Bokmal',
        2068 : 'Norwegian Nynorsk',
        1096 : 'Oriya',
        1045 : 'Polish',
        1046 : 'Portuguese (Brazil)',
        2070 : 'Portuguese (Portugal)',
        1094 : 'Punjabi',
        1047 : 'Rhaeto-Romanic',
        1048 : 'Romanian',
        2072 : 'Romanian Moldova',
        1049 : 'Russian',
        2073 : 'Russian Moldova',
        1083 : 'Sami Lappish',
        1103 : 'Sanskrit',
        3098 : 'Serbian Cyrillic',
        2074 : 'Serbian Latin',
        1113 : 'Sindhi',
        1051 : 'Slovak',
        1060 : 'Slovenian',
        1070 : 'Sorbian',
        11274 : 'Spanish Argentina',
        16394 : 'Spanish Bolivia',
        13322 : 'Spanish Chile',
        9226 : 'Spanish Colombia',
        5130 : 'Spanish Costa Rica',
        7178 : 'Spanish Dominican Republic',
        12298 : 'Spanish Ecuador',
        17418 : 'Spanish El Salvador',
        4106 : 'Spanish Guatemala',
        18442 : 'Spanish Honduras',
        2058 : 'Spanish Mexico',
        3082 : 'Spanish Modern',
        19466 : 'Spanish Nicaragua',
        6154 : 'Spanish Panama',
        15370 : 'Spanish Paraguay',
        10250 : 'Spanish Peru',
        20490 : 'Spanish Puerto Rico',
        1034 : 'Spanish Traditional',
        14346 : 'Spanish Uruguay',
        8202 : 'Spanish Venezuela',
        1072 : 'Sutu',
        1089 : 'Swahili',
        1053 : 'Swedish',
        2077 : 'Swedish Finland',
        1064 : 'Tajik',
        1097 : 'Tamil',
        1092 : 'Tatar',
        1098 : 'Telugu',
        1054 : 'Thai',
        1105 : 'Tibetan',
        1073 : 'Tsonga',
        1074 : 'Tswana',
        1055 : 'Turkish',
        1090 : 'Turkmen',
        1058 : 'Ukranian',
        1056 : 'Urdu',
        2080 : 'Urdu India',
        2115 : 'Uzbek Cyrillic',
        1091 : 'Uzbek Latin',
        1075 : 'Venda',
        1066 : 'Vietnamese',
        1106 : 'Welsh',
        1076 : 'Xhosa',
        1085 : 'Yiddish',
        1077 : 'Zulu',
        1024 : 'Unkown',
        255 : 'Unkown',
    }
    # NOTE: a disabled block of 'unknown' entries (category 'un') used to
    # follow here as a bare string. According to the original author's
    # note they were for control words that occur after \*. It covered:
    # do, company, shpinst, panose, falt, listoverridetable, category,
    # template, ud, formfield, ts, rsidtbl, generator, ftnsep, aftnsep,
    # aftnsepc, aftncn, objclass, objdata, picprop, blipuid.
def __ms_hex_func(self, pre, token, num):
num = num[1:] # chop off leading 0, which I added
num = num.upper() # the mappings store hex in caps
return 'tx<hx<__________<\'%s\n' % num # add an ' for the mappings
def ms_sub_func(self, pre, token, num):
    """Return a ``tx<mc`` text line whose value is *token*.

    *pre* and *num* exist only so the method matches the common handler
    signature ``(pre, token, num)``; they are not used.
    """
    line = 'tx<mc<__________<%s\n' % token
    return line
def direct_conv_func(self, pre, token, num):
    """Return an empty-tag marker line (``mi<tg<empty``) named by *token*.

    *pre* and *num* are part of the common handler signature and unused.
    """
    line = 'mi<tg<empty_____<%s\n' % token
    return line
def default_func(self, pre, token, num):
    """Return the generic control-word line ``cw<pre<token<nu<value``.

    A control word without a numeric argument (``num is None``) is written
    with the value ``true``.
    """
    value = 'true' if num is None else num
    return 'cw<%s<%s<nu<%s\n' % (pre, token, value)
def colorz_func(self, pre, token, num):
    """Return a control-word line, using ``0`` when no number was given.

    Identical to the generic handler except for the fallback value: a
    missing argument (``num is None``) is written as ``0`` rather than
    ``true``.
    """
    value = '0' if num is None else num
    return 'cw<%s<%s<nu<%s\n' % (pre, token, value)
def __list_type_func(self, pre, token, num):
    """Return a control-word line carrying the readable list-number type.

    *num* is the numeric list type taken from the RTF control word. It is
    converted to an integer and looked up in ``self.__number_type_dict``.
    The value falls back to ``Arabic`` when *num* is missing, cannot be
    converted, or has no entry in the dictionary.

    Raises:
        self.__bug_handler: only when the run level is above 3 and *num*
            is not an integer or has no entry in the dictionary.
    """
    list_type = 'Arabic'
    if num is not None:
        try:
            num = int(num)
        except ValueError:
            if self.__run_level > 3:
                msg = 'Number "%s" cannot be converted to integer\n' % num
                raise self.__bug_handler(msg)
        # When conversion failed (and we are not in strict mode) ``num`` is
        # still a string, so the lookup below simply misses.
        list_type = self.__number_type_dict.get(num)
        if list_type is None:
            if self.__run_level > 3:
                # The message is now formatted and actually passed on.
                msg = 'No type for "%s" in self.__number_type_dict\n' % num
                raise self.__bug_handler(msg)
            list_type = 'Arabic'
    return 'cw<%s<%s<nu<%s\n' % (pre, token, list_type)
def __language_func(self, pre, token, num):
    """Return a control-word line carrying the language name for *num*.

    The first run of digits in *num* is used as the key into
    ``self.__language_dict``. When *num* is not a string, contains no
    digits, or has no entry, the value written is ``not defined``.

    Raises:
        self.__bug_handler: only when the run level is above 3 and no
            language name could be determined.
    """
    try:
        match = re.search('[0-9]+', num)
    except TypeError:
        # ``num`` is None (control word without argument) or not a string.
        match = None
    lang_name = None
    if match is not None:
        lang_name = self.__language_dict.get(int(match.group()))
    if not lang_name:
        lang_name = "not defined"
        if self.__run_level > 3:
            msg = 'No entry for number "%s"' % num
            raise self.__bug_handler(msg)
    return 'cw<%s<%s<nu<%s\n' % (pre, token, lang_name)
def two_part_func(self, pre, token, num):
    """Return a control-word line for a token of the form ``name<value``.

    *token* is split on ``<``; the first piece becomes the token name and
    the second piece the value. The incoming *num* is ignored. A token
    without ``<`` raises ``IndexError``.
    """
    pieces = token.split("<")
    name = pieces[0]
    value = pieces[1]
    return 'cw<%s<%s<nu<%s\n' % (pre, name, value)
def divide_by_2(self, pre, token, num):
    """Return a control-word line whose value is *num* divided by 2.

    The division and formatting are delegated to ``self.divide_num``.
    """
    halved = self.divide_num(num, 2)
    return 'cw<%s<%s<nu<%s\n' % (pre, token, halved)
def divide_by_20(self, pre, token, num):
    """Return a control-word line whose value is *num* divided by 20.

    The division and formatting are delegated to ``self.divide_num``.
    """
    scaled = self.divide_num(num, 20)
    return 'cw<%s<%s<nu<%s\n' % (pre, token, scaled)
def text_func(self, pre, token, num=None):
    """Return a plain text line (``tx<nu``) whose content is *token*.

    *pre* and *num* are unused.
    """
    line = 'tx<nu<__________<%s\n' % token
    return line
def ob_func(self, pre, token, num=None):
    """Record an open bracket and return its ``ob<nu<open-brack`` line.

    The bracket counter on the instance is incremented first; the new
    count, zero-padded to four digits, is the value written. The
    arguments are unused.
    """
    depth = self.__bracket_count + 1
    self.__bracket_count = depth
    return 'ob<nu<open-brack<%04d\n' % depth
def cb_func(self, pre, token, num=None):
    """Record a close bracket and return its ``cb<nu<clos-brack`` line.

    The line carries the bracket count as it was *before* this bracket
    closed (zero-padded to four digits); the counter on the instance is
    then decremented. The arguments are unused.
    """
    depth = self.__bracket_count
    self.__bracket_count = depth - 1
    return 'cb<nu<clos-brack<%04d\n' % depth
def color_func(self, pre, token, num):
    """Return a control-word line with a colour component written in hex.

    *num* is a decimal string, optionally ending in ``;``. When the
    semicolon is present it is removed and the third field of the line is
    ``en`` instead of ``nu``. The number is written in upper-case hex; a
    ``0`` is put in front of it whenever the hex string is not exactly two
    characters long.
    """
    if num[-1] == ';':
        third_field = 'en'
        digits = num[:-1]
    else:
        third_field = 'nu'
        digits = num
    hex_value = unicode_type('%X' % int(digits))
    if len(hex_value) != 2:
        hex_value = "0" + hex_value
    return 'cw<%s<%s<%s<%s\n' % (pre, token, third_field, hex_value)
def bool_st_func(self, pre, token, num):
    """Return a control-word line with a ``true``/``false`` value.

    A missing argument, an empty string and ``'1'`` all mean true; ``'0'``
    means false.

    Raises:
        self.__bug_handler: for any other value of *num*.
    """
    if num == '0':
        return 'cw<%s<%s<nu<false\n' % (pre, token)
    if num is None or num == '' or num == '1':
        return 'cw<%s<%s<nu<true\n' % (pre, token)
    msg = "boolean should have some value module process tokens\ntoken is %s\n'%s'\n" % (token, num)
    raise self.__bug_handler(msg)
def __no_sup_sub_func(self, pre, token, num):
    """Return two control-word lines that switch off sub- and superscript.

    The arguments are unused; the result is always the same pair of
    lines, subscript first.
    """
    lines = (
        'cw<ci<subscript_<nu<false\n',
        'cw<ci<superscrip<nu<false\n',
    )
    return ''.join(lines)
def divide_num(self, numerator, denominator):
    """Divide the number found in *numerator* and format it to 2 decimals.

    The first run of digits, dots and minus signs in the string
    *numerator* is converted to a float, divided by *denominator*, rounded
    to two places and returned as a string such as ``'36.00'``.

    When no usable number is present (*numerator* is not a string, has no
    numeric run, or the run is not a valid float such as ``-``), the
    instance return code is raised to at least 5 and the integer ``0`` is
    returned.

    Raises:
        self.__bug_handler: instead of returning ``0``, when the run level
            is above 3.
    """
    try:
        # calibre why ignore negative number? Wrong in case of \fi
        numerator = float(re.search('[0-9.\\-]+', numerator).group())
    except (TypeError, AttributeError, ValueError):
        # TypeError: numerator is None / not a string.
        # AttributeError: the search found nothing, so .group() is on None.
        # ValueError: the matched run (e.g. "-" or "1.2.3") is not a float.
        if self.__run_level > 3:
            msg = ('No number to process?\nthis indicates that the token \\(\\li\\) '
                   'should have a number and does not\nnumerator is '
                   '"%s"\ndenominator is "%s"\n') % (numerator, denominator)
            raise self.__bug_handler(msg)
        if 5 > self.__return_code:
            self.__return_code = 5
        return 0
    return '%0.2f' % round(numerator / denominator, 2)
def split_let_num(self, token):
    """Split a control word into its name and its argument.

    *token* is matched against ``self.__num_exp``; group 1 is the name and
    group 2 the argument.

    Returns:
        ``(name, argument)`` on a full match; ``(name, 0)`` when the
        argument group is empty; ``(token, 0)`` when the pattern does not
        match at all.

    Raises:
        self.__bug_handler: instead of returning a ``0`` argument, when
            the run level is above 3.
    """
    match_obj = re.search(self.__num_exp, token)
    if match_obj is None:
        if self.__run_level > 3:
            msg = "token is '%s' \n" % token
            # The message is now passed on instead of being discarded.
            raise self.__bug_handler(msg)
        return token, 0
    first = match_obj.group(1)
    second = match_obj.group(2)
    if not second:
        if self.__run_level > 3:
            msg = "token is '%s' \n" % token
            raise self.__bug_handler(msg)
        return first, 0
    return first, second
def convert_to_hex(self, number):
    """Convert a decimal number (string or int) to upper-case hexadecimal.

    Returns:
        The hex digits as a string, without any prefix or padding.

    Raises:
        ValueError / TypeError: from ``int()`` when *number* cannot be
            converted (unchanged from before).
        self.__bug_handler: when the formatting step itself fails.
    """
    num = int(number)
    try:
        return "%X" % num
    except Exception:
        # Narrowed from a bare ``except:`` so KeyboardInterrupt/SystemExit
        # are no longer swallowed, and the handler now gets a message.
        msg = 'cannot convert "%s" to hexadecimal\n' % number
        raise self.__bug_handler(msg)
def process_cw(self, token):
    """Translate one raw control word into its output line.

    The leading backslash and any spaces are removed. Unless the remainder
    is purely alphabetic or one of the single-character control symbols,
    it is split into name and argument with ``self.split_let_num``. The
    name is looked up in ``self.dict_token`` and the registered handler is
    called with ``(pre, token, num)``.

    Returns:
        The handler's result, or ``None`` when the control word has no
        entry (or no handler) in ``self.dict_token``.
    """
    control_symbols = ['*', ':', '}', '{', '~', '_', '-', ';']
    word = token[1:].replace(" ", "")  # drop the leading backslash, then spaces
    num = None
    if not (word.isalpha() or word in control_symbols):
        word, num = self.split_let_num(word)
    pre, name, action = self.dict_token.get(word, (None, None, None))
    if not action:
        return None
    return action(pre, name, num)
def __check_brackets(self, in_file):
    """Check *in_file* for matching brackets.

    A ``check_brackets.CheckBrackets`` object is created for the file and
    kept on the instance; the first element of its ``check_brackets()``
    result is taken as the "brackets are good" flag.

    Returns:
        ``1`` when the flag is false, otherwise ``None``.
    """
    checker = check_brackets.CheckBrackets(file=in_file)
    self.__check_brack_obj = checker
    brackets_ok = checker.check_brackets()[0]
    if brackets_ok:
        return None
    return 1
def process_tokens(self):
    """Convert every token of the input file and validate the RTF.

    Each line of ``self.__file`` holds one token. Control words (lines
    starting with a backslash) are translated with ``self.process_cw``;
    everything else is split with ``self.__utf_exp`` and written as text
    lines, using ``tx<ut`` for pieces that start with ``&`` and ``tx<nu``
    for the rest. The result is written to a temporary file which then
    replaces the input file, after which the bracket check is run.

    Returns:
        ``self.__return_code``.

    Raises:
        self.__exception_handler: when the first token is not ``\\{``, the
            second does not start with ``\\rtf``, a token contains a
            backslash followed by a space, the file is empty, or the
            brackets do not match.
    """
    line_count = 0
    with open_for_read(self.__file) as read_obj, \
            open_for_write(self.__write_to) as write_obj:
        for line_count, raw_line in enumerate(read_obj, 1):
            token = raw_line.replace("\n", "")
            # The first two tokens must be the opening brace and \rtf.
            if line_count == 1 and token != '\\{':
                msg = '\nInvalid RTF: document doesn\'t start with {\n'
                raise self.__exception_handler(msg)
            if line_count == 2 and token[0:4] != '\\rtf':
                msg = '\nInvalid RTF: document doesn\'t start with \\rtf \n'
                raise self.__exception_handler(msg)
            if token.find('\\ ') > -1:
                msg = '\nInvalid RTF: token "\\ " not valid.\nError at line %d'\
                    % line_count
                raise self.__exception_handler(msg)
            if token[:1] == "\\":
                converted = self.process_cw(token)
                if converted is not None:
                    write_obj.write(converted)
                continue
            for field in re.split(self.__utf_exp, token):
                if not field:
                    continue
                if field[0:1] == '&':
                    write_obj.write('tx<ut<__________<%s\n' % field)
                else:
                    write_obj.write('tx<nu<__________<%s\n' % field)
    if not line_count:
        msg = '\nInvalid RTF: file appears to be empty.\n'
        raise self.__exception_handler(msg)
    # Replace the input with the converted output, keeping a debug copy
    # first when requested.
    copy_obj = copy.Copy(bug_handler=self.__bug_handler)
    if self.__copy:
        copy_obj.copy_file(self.__write_to, "processed_tokens.data")
    copy_obj.rename(self.__write_to, self.__file)
    os.remove(self.__write_to)
    if self.__check_brackets(self.__file):
        msg = '\nInvalid RTF: document does not have matching brackets.\n'
        raise self.__exception_handler(msg)
    return self.__return_code

View File

@@ -0,0 +1,538 @@
from __future__ import absolute_import, division, print_function, unicode_literals
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# #
#########################################################################
import sys, os
from calibre.ebooks.rtf2xml import copy
from calibre.ptempfile import better_mktemp
from polyglot.builtins import unicode_type
from . import open_for_read, open_for_write
class Sections:
    """
    =================
    Purpose
    =================
    Write section tags for a tokenized file. (This module is only useful
    as part of the other rtf2xml modules.)
    ---------------
    logic
    ---------------
    The tags for the first section breaks have already been written.
    RTF stores section breaks with the \\sect tag. Each time this tag is
    encountered, add one to the counter.
    When the \\sectd tag is encountered, collect all the appropriate tokens
    that describe the section. When a \\pard is reached, stop collecting
    tokens and write the section tags.
    The exception to this method occurs when sections occur in field
    blocks, such as the index. Normally, two section breaks occur within
    the index and other field-blocks. (If fewer or more section breaks
    occur, this code may not work.)
    CHANGE (2004-04-26) Sections that occur in field-blocks are no longer
    written. Instead, all section information in a field-block is ignored
    and the lines of the field-block are copied through unchanged. The
    helper methods for the old behaviour are still present but are not
    reachable from the dispatch tables.
    """

    def __init__(self,
            in_file,
            bug_handler,
            copy=None,
            run_level=1):
        """
        Required:
            'in_file'--file to parse
            'bug_handler'--exception class raised for internal errors
        Optional:
            'copy'-- whether to make a copy of result for debugging
            'run_level'-- errors are only raised when this is above 3
        Returns:
            nothing
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__run_level = run_level
        self.__write_to = better_mktemp()

    def __initiate_values(self):
        """
        Initiate all values: markers, counters, the current state and the
        dispatch dictionaries keyed by state or by the first 16 characters
        of a line.
        """
        self.__mark_start = 'mi<mk<sect-start\n'
        self.__mark_end = 'mi<mk<sect-end__\n'
        self.__in_field = 0
        self.__section_values = {}
        self.__list_of_sec_values = []
        self.__field_num = []
        self.__section_num = 0
        self.__state = 'before_body'
        self.__found_first_sec = 0
        self.__text_string = ''
        self.__field_instruction_string = ''
        # Initialised here so it exists before any field block is seen.
        self.__sec_in_field_string = ''
        self.__state_dict = {
            'before_body': self.__before_body_func,
            'body': self.__body_func,
            'before_first_sec': self.__before_first_sec_func,
            'section': self.__section_func,
            'section_def': self.__section_def_func,
            'sec_in_field': self.__sec_in_field_func,
        }
        # cw<sc<sect-defin<nu<true
        self.__body_dict = {
            'cw<sc<section___': self.__found_section_func,
            'mi<mk<sec-fd-beg': self.__found_sec_in_field_func,
            'cw<sc<sect-defin': self.__found_section_def_bef_sec_func,
        }
        self.__section_def_dict = {
            'cw<pf<par-def___': (self.__end_sec_def_func, None),
            'mi<mk<body-open_': (self.__end_sec_def_func, None),
            'cw<tb<columns___': (self.__attribute_func, 'columns'),
            'cw<pa<margin-lef': (self.__attribute_func, 'margin-left'),
            'cw<pa<margin-rig': (self.__attribute_func, 'margin-right'),
            'mi<mk<header-ind': (self.__end_sec_def_func, None),
            # premature endings
            'tx<nu<__________': (self.__end_sec_premature_func, None),
            'cw<ci<font-style': (self.__end_sec_premature_func, None),
            'cw<ci<font-size_': (self.__end_sec_premature_func, None),
        }
        # Changed 2004-04-26: the section and section-definition tokens are
        # no longer dispatched while inside a field block.
        self.__sec_in_field_dict = {
            'mi<mk<sec-fd-end': self.__end_sec_in_field_func,
        }

    def __found_section_def_func(self, line):
        """
        Required:
            line -- the line to parse
        Returns:
            nothing
        Logic:
            A section definition has been found. Change the state to
            section_def (so subsequent lines are processed as part of the
            section definition), and clear the section_values dictionary.
        """
        self.__state = 'section_def'
        self.__section_values.clear()

    def __attribute_func(self, line, name):
        """
        Required:
            line -- the line to be parsed
            name -- the readable attribute name (as opposed to the
            abbreviated one)
        Returns:
            nothing
        Logic:
            Store the value (everything after the 20th character, without
            the trailing newline) under the readable name.
            ex: cw<tb<columns___<nu<2
        """
        self.__section_values[name] = line[20:-1]

    def __found_section_func(self, line):
        """
        Requires:
            line -- the line to parse
        Returns:
            nothing
        Logic:
            The beginning of a section has been found, so change the state
            accordingly, write the line and add one to the section counter.
        """
        self.__state = 'section'
        self.__write_obj.write(line)
        self.__section_num += 1

    def __found_section_def_bef_sec_func(self, line):
        """
        Requires:
            line -- the line to parse
        Returns:
            nothing
        Logic:
            A section definition was found in the body without a preceding
            section token. Add one to the section counter, switch to the
            section-definition state and write the line.
        """
        self.__section_num += 1
        self.__found_section_def_func(line)
        self.__write_obj.write(line)

    def __section_func(self, line):
        """
        Requires:
            line --the line to parse
        Returns:
            nothing
        Logic:
            After a section token, wait for the section definition. The
            line is always written.
        """
        if self.__token_info == 'cw<sc<sect-defin':
            self.__found_section_def_func(line)
        self.__write_obj.write(line)

    def __section_def_func(self, line):
        """
        Required:
            line --line to parse
        Returns:
            nothing
        Logic:
            Inside a section definition. If the token has a registered
            action (end of the definition, or an attribute to record), run
            it; then either append the line to the field string (inside a
            field block) or write it. Lines without an action are written.
        """
        action, name = self.__section_def_dict.get(self.__token_info, (None, None))
        if action:
            action(line, name)
            if self.__in_field:
                self.__sec_in_field_string += line
                return
        self.__write_obj.write(line)

    def __leave_section_def(self):
        """Set the state that follows a section definition."""
        self.__state = 'sec_in_field' if self.__in_field else 'body'

    def __end_sec_def_func(self, line, name):
        """
        Requires:
            line --the line to parse
            name --readable name (unused)
        Returns:
            nothing
        Logic:
            The end of the section definition has been found. Reset the
            state and call the write_section method.
        """
        self.__leave_section_def()
        self.__write_section(line)

    def __end_sec_premature_func(self, line, name):
        """
        Requires:
            line --the line to parse
            name --readable name (unused)
        Returns:
            nothing
        Logic:
            Text or control words indicating text have been found before
            \\pard. This should indicate older RTF. Reset the state and
            write the section definition. Insert a paragraph definition,
            followed by {} to mark the end of the paragraph definition.
        """
        self.__leave_section_def()
        self.__write_section(line)
        self.__write_obj.write('cw<pf<par-def___<nu<true\n')
        self.__write_obj.write('ob<nu<open-brack<0000\n')
        self.__write_obj.write('cb<nu<clos-brack<0000\n')

    def __write_section(self, line):
        """
        Requires:
            line -- the current line (not used)
        Returns:
            nothing
        Logic:
            Form a string of attributes and values. If not in a field
            block, write this string to the output file. Otherwise, call
            the handle_sec_def method to handle this string.
        """
        parts = [self.__mark_start]
        if self.__found_first_sec:
            parts.append('mi<tg<close_____<section\n')
        else:
            self.__found_first_sec = 1
        parts.append('mi<tg<open-att__<section<num>%s' % self.__section_num)
        parts.append('<num-in-level>%s' % self.__section_num)
        parts.append('<type>rtf-native')
        parts.append('<level>0')
        for key, value in self.__section_values.items():
            parts.append('<%s>%s' % (key, value))
        parts.append('\n')
        parts.append(self.__mark_end)
        my_string = ''.join(parts)
        if self.__state == 'body':
            self.__write_obj.write(my_string)
        elif self.__state == 'sec_in_field':
            self.__handle_sec_def(my_string)
        elif self.__run_level > 3:
            msg = 'missed a flag\n'
            raise self.__bug_handler(msg)

    def __handle_sec_def(self, my_string):
        """
        Requires:
            my_string -- the string of attributes and values (unused)
        Returns:
            nothing
        Logic:
            Remember the current attributes and values. A copy is stored,
            because the section_values dictionary itself is cleared when
            the next section definition starts.
        """
        self.__list_of_sec_values.append(dict(self.__section_values))

    def __body_func(self, line):
        """
        Requires:
            line --the line to parse
        Returns:
            nothing
        Logic:
            Look for the beginning of a section. Otherwise, print the line
            to the output file.
        """
        action = self.__body_dict.get(self.__token_info)
        if action:
            action(line)
        else:
            self.__write_obj.write(line)

    def __before_body_func(self, line):
        """
        Requires:
            line --line to parse
        Returns:
            nothing
        Logic:
            Look for the beginning of the body. Always print out the line.
        """
        if self.__token_info == 'mi<mk<body-open_':
            self.__state = 'before_first_sec'
        self.__write_obj.write(line)

    def __open_first_section(self):
        """Count the first section and write its opening tag."""
        self.__section_num += 1
        self.__write_obj.write(
            'mi<tg<open-att__<section<num>%s'
            '<num-in-level>%s'
            '<type>rtf-native'
            '<level>0\n'
            % (self.__section_num, self.__section_num)
        )

    def __before_first_sec_func(self, line):
        """
        Requires:
            line -- line to parse
        Returns:
            nothing
        Logic:
            Look for the beginning of the first section. This can be
            \\sectd, but in older RTF it could be any paragraph definition
            or text. The line is always written.
        """
        if self.__token_info == 'cw<sc<sect-defin':
            self.__state = 'section_def'
            self.__section_num += 1
            self.__section_values.clear()
        elif self.__token_info == 'cw<pf<par-def___':
            self.__state = 'body'
            self.__open_first_section()
            self.__found_first_sec = 1
        elif self.__token_info == 'tx<nu<__________':
            self.__state = 'body'
            self.__open_first_section()
            # NOTE(review): this line lacks the '<nu' field that the same
            # token has in __end_sec_premature_func -- confirm which form
            # downstream modules expect before changing it.
            self.__write_obj.write(
                'cw<pf<par-def___<true\n'
            )
            self.__found_first_sec = 1
        self.__write_obj.write(line)

    def __found_sec_in_field_func(self, line):
        """
        Requires:
            line --line to parse
        Returns:
            nothing
        Logic:
            The beginning of a field that has a section inside of it has
            been found. Change the state, and start the field string.
        """
        self.__state = 'sec_in_field'
        self.__sec_in_field_string = line
        self.__in_field = 1

    def __sec_in_field_func(self, line):
        """
        Requires:
            line --the line to parse
        Returns:
            nothing
        Logic:
            Check for the end of the field. Every other line is written
            unchanged (changed 2004-04-26: sections and section
            definitions inside the field are ignored).
        """
        action = self.__sec_in_field_dict.get(self.__token_info)
        if action:
            action(line)
        else:
            self.__write_obj.write(line)

    def __end_sec_in_field_func(self, line):
        """
        Requires:
            line --line to parse
        Returns:
            nothing
        Logic:
            The field block has ended. Go back to the body state and write
            the line. (Changed 2004-04-26: the field string is no longer
            wrapped in section tags.)
        """
        self.__state = 'body'
        self.__in_field = 0
        self.__write_obj.write(line)

    def __print_field_sec_attributes(self):
        """
        Requires:
            nothing
        Returns:
            nothing
        Logic:
            Get the number and dictionary of values from the lists. The
            number and dictionary are the first item of each list. Write
            the close tag and the start tag, then the attributes and values
            in the dictionary. Get rid of the first item in each list.
            Not reachable from the dispatch tables since the 2004-04-26
            change.
        """
        num = self.__field_num[0]
        self.__field_num = self.__field_num[1:]
        self.__write_obj.write(
            'mi<tg<close_____<section\n'
            'mi<tg<open-att__<section<num>%s' % num
        )
        if self.__list_of_sec_values:
            first_values = self.__list_of_sec_values[0]
            for key in first_values:
                self.__write_obj.write(
                    '<%s>%s\n' % (key, first_values[key]))
            self.__list_of_sec_values = self.__list_of_sec_values[1:]
        self.__write_obj.write('<level>0')
        self.__write_obj.write('<type>rtf-native')
        self.__write_obj.write('<num-in-level>%s' % self.__section_num)
        self.__write_obj.write('\n')

    def __found_section_in_field_func(self, line):
        """
        Requires:
            line --line to parse
        Returns:
            nothing
        Logic:
            A section in a field block has been found. Add one to the
            section counter, and append this number to a list.
        """
        self.__section_num += 1
        self.__field_num.append(self.__section_num)
        self.__sec_in_field_string += line

    def __found_section_def_in_field_func(self, line):
        """
        Requires:
            line --line to parse
        Returns:
            nothing
        Logic:
            A section definition in a field block has been found. Change
            the state and clear the values dictionary.
        """
        self.__state = 'section_def'
        self.__section_values.clear()

    def make_sections(self):
        """
        Requires:
            nothing
        Returns:
            nothing (changes the original file)
        Logic:
            Read one line at a time and hand it to the method registered
            for the current state. The result is written to a temporary
            file which then replaces the original file. Both files are
            closed even when a handler raises.
        """
        self.__initiate_values()
        with open_for_read(self.__file) as read_obj:
            with open_for_write(self.__write_to) as write_obj:
                self.__write_obj = write_obj
                for line in read_obj:
                    self.__token_info = line[:16]
                    action = self.__state_dict.get(self.__state)
                    if action is None:
                        # Report the unknown state and copy the line
                        # through instead of calling None.
                        sys.stderr.write('no matching state in module sections.py\n')
                        sys.stderr.write(self.__state + '\n')
                        write_obj.write(line)
                        continue
                    action(line)
        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "sections.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)

View File

@@ -0,0 +1,723 @@
from __future__ import unicode_literals, absolute_import, print_function, division
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# #
#########################################################################
import sys, os
from calibre.ebooks.rtf2xml import copy, border_parse
from calibre.ptempfile import better_mktemp
from . import open_for_read, open_for_write
class Styles:
"""
Change lines with style numbers to actual style names.
"""
def __init__(self, in_file, bug_handler, copy=None, run_level=1):
    """
    Required:
        'in_file'--file to parse
        'bug_handler'--stored for raising internal errors
    Optional:
        'copy'-- whether to make a copy of result for debugging
        'run_level'--stored; other methods only raise when it is above 3
    Returns:
        nothing
    Logic:
        Store the arguments and obtain the path of a temporary output
        file from better_mktemp().
    """
    self.__file = in_file
    self.__bug_handler = bug_handler
    self.__copy = copy
    self.__write_to = better_mktemp()
    self.__run_level = run_level
def __initiate_values(self):
    """
    Initiate all values: the border parser, the (empty) styles dictionary,
    the current state, and the lookup tables used while reading the
    styles table.
    """
    self.__border_obj = border_parse.BorderParse()
    self.__styles_dict = {'par': {}, 'char': {}}
    self.__styles_num = '0'
    self.__type_of_style = 'par'
    self.__text_string = ''
    self.__state = 'before_styles_table'
    # Handlers keyed both by state name and by token (first 16 characters
    # of a line).
    self.__state_dict = {
        'before_styles_table': self.__before_styles_func,
        'in_styles_table': self.__in_styles_func,
        'in_individual_style': self.__in_individual_style_func,
        'after_styles_table': self.__after_styles_func,
        'mi<mk<styles-beg': self.__found_styles_table_func,
        'mi<mk<styles-end': self.__found_end_styles_table_func,
        'mi<mk<stylei-beg': self.__found_beg_ind_style_func,
        'mi<mk<stylei-end': self.__found_end_ind_style_func,
        'cw<ss<para-style': self.__para_style_func,
        'cw<ss<char-style': self.__char_style_func,
    }
    # A separate dictionary for parsing the body text
    self.__body_dict = {
        'cw<ss<para-style': (self.__para_style_in_body_func, 'par'),
        'cw<ss<char-style': (self.__para_style_in_body_func, 'char'),
    }
    # Dictionary needed to convert shortened style names to readable names
    self.__token_dict = {
        # paragraph formatting => pf
        'par-end___': 'para',
        'par-def___': 'paragraph-definition',
        'keep-w-nex': 'keep-with-next',
        'widow-cntl': 'widow-control',
        'adjust-rgt': 'adjust-right',
        'language__': 'language',
        'right-inde': 'right-indent',
        'fir-ln-ind': 'first-line-indent',
        'left-inden': 'left-indent',
        'space-befo': 'space-before',
        'space-afte': 'space-after',
        'line-space': 'line-spacing',
        'default-ta': 'default-tab',
        'align_____': 'align',
        'widow-cntr': 'widow-control',
        # page formatting mixed in! (Just in older RTF?)
        'margin-lef': 'left-indent',
        'margin-rig': 'right-indent',
        'margin-bot': 'space-after',
        'margin-top': 'space-before',
        # stylesheet => ss
        'style-shet': 'stylesheet',
        'based-on__': 'based-on-style',
        'next-style': 'next-style',
        'char-style': 'character-style',
        'para-style': 'paragraph-style',
        # graphics => gr
        'picture___': 'pict',
        'obj-class_': 'obj_class',
        'mac-pic___': 'mac-pict',
        # section => sc
        'section___': 'section-new',
        'sect-defin': 'section-reset',
        'sect-note_': 'endnotes-in-section',
        # list => ls
        'list-text_': 'list-text',
        'list______': 'list',
        'list-lev-d': 'list-level-definition',
        'list-cardi': 'list-cardinal-numbering',
        'list-decim': 'list-decimal-numbering',
        'list-up-al': 'list-uppercase-alphabetic-numbering',
        'list-up-ro': 'list-uppercae-roman-numbering',
        'list-ord__': 'list-ordinal-numbering',
        'list-ordte': 'list-ordinal-text-numbering',
        'list-bulli': 'list-bullet',
        'list-simpi': 'list-simple',
        'list-conti': 'list-continue',
        'list-hang_': 'list-hang',
        # 'list-tebef' is deliberately absent; it is in the ignore list.
        'list-id___': 'list-id',
        'list-start': 'list-start',
        'nest-level': 'nest-level',
        # an earlier 'list-level' : 'level' mapping was replaced by this one
        'list-level': 'list-level',
        # notes => nt
        'footnote__': 'footnote',
        'type______': 'type',
        # anchor => an
        'toc_______': 'anchor-toc',
        'book-mk-st': 'bookmark-start',
        'book-mk-en': 'bookmark-end',
        'index-mark': 'anchor-index',
        'place_____': 'place',
        # field => fd
        'field_____': 'field',
        'field-inst': 'field-instruction',
        'field-rslt': 'field-result',
        'datafield_': 'data-field',
        # info-tables => it
        'font-table': 'font-table',
        'colr-table': 'color-table',
        'lovr-table': 'list-override-table',
        'listtable_': 'list-table',
        'revi-table': 'revision-table',
        # character info => ci
        'hidden____': 'hidden',
        'italics___': 'italics',
        'bold______': 'bold',
        'strike-thr': 'strike-through',
        'shadow____': 'shadow',
        'outline___': 'outline',
        'small-caps': 'small-caps',
        'dbl-strike': 'double-strike-through',
        'emboss____': 'emboss',
        'engrave___': 'engrave',
        'subscript_': 'subscript',
        'superscrip': 'superscript',
        'plain_____': 'plain',
        'font-style': 'font-style',
        'font-color': 'font-color',
        'font-size_': 'font-size',
        'font-up___': 'superscript',
        'font-down_': 'subscript',
        'red_______': 'red',
        'blue______': 'blue',
        'green_____': 'green',
        'caps______': 'caps',
        # table => tb
        'row-def___': 'row-definition',
        'cell______': 'cell',
        'row_______': 'row',
        'in-table__': 'in-table',
        'columns___': 'columns',
        'row-pos-le': 'row-position-left',
        'cell-posit': 'cell-position',
        # preamble => pr
        # underline
        'underlined': 'underlined',
        # border => bd
        'bor-t-r-hi': 'border-table-row-horizontal-inside',
        'bor-t-r-vi': 'border-table-row-vertical-inside',
        'bor-t-r-to': 'border-table-row-top',
        'bor-t-r-le': 'border-table-row-left',
        'bor-t-r-bo': 'border-table-row-bottom',
        'bor-t-r-ri': 'border-table-row-right',
        'bor-cel-bo': 'border-cell-bottom',
        'bor-cel-to': 'border-cell-top',
        'bor-cel-le': 'border-cell-left',
        'bor-cel-ri': 'border-cell-right',
        # 'bor-par-bo' once mapped to 'border-paragraph-bottom'; the
        # abbreviation is now used for the box entry below.
        'bor-par-to': 'border-paragraph-top',
        'bor-par-le': 'border-paragraph-left',
        'bor-par-ri': 'border-paragraph-right',
        'bor-par-bo': 'border-paragraph-box',
        'bor-for-ev': 'border-for-every-paragraph',
        'bor-outsid': 'border-outisde',
        'bor-none__': 'border',
        # border type => bt
        'bdr-single': 'single',
        'bdr-doubtb': 'double-thickness-border',
        'bdr-shadow': 'shadowed-border',
        'bdr-double': 'double-border',
        'bdr-dotted': 'dotted-border',
        'bdr-dashed': 'dashed',
        'bdr-hair__': 'hairline',
        'bdr-inset_': 'inset',
        'bdr-das-sm': 'dash-small',
        'bdr-dot-sm': 'dot-dash',
        'bdr-dot-do': 'dot-dot-dash',
        'bdr-outset': 'outset',
        'bdr-trippl': 'tripple',
        'bdr-thsm__': 'thick-thin-small',
        'bdr-htsm__': 'thin-thick-small',
        'bdr-hthsm_': 'thin-thick-thin-small',
        # NOTE(review): the keys from here to 'bdr-frame' are 9 characters
        # long, while lookups in this class use a 10-character slice --
        # confirm against the token names emitted upstream.
        'bdr-thm__': 'thick-thin-medium',
        'bdr-htm__': 'thin-thick-medium',
        'bdr-hthm_': 'thin-thick-thin-medium',
        'bdr-thl__': 'thick-thin-large',
        'bdr-hthl_': 'think-thick-think-large',
        'bdr-wavy_': 'wavy',
        'bdr-d-wav': 'double-wavy',
        'bdr-strip': 'striped',
        'bdr-embos': 'emboss',
        'bdr-engra': 'engrave',
        'bdr-frame': 'frame',
        'bdr-li-wid': 'line-width',
        # tabs
        'tab-center': 'center',
        'tab-right_': 'right',
        'tab-dec___': 'decimal',
        'leader-dot': 'leader-dot',
        'leader-hyp': 'leader-hyphen',
        'leader-und': 'leader-underline',
    }
    # Handlers for tab-related tokens inside an individual style.
    self.__tabs_dict = {
        'cw<pf<tab-stop__': self.__tab_stop_func,
        'cw<pf<tab-center': self.__tab_type_func,
        'cw<pf<tab-right_': self.__tab_type_func,
        'cw<pf<tab-dec___': self.__tab_type_func,
        'cw<pf<leader-dot': self.__tab_leader_func,
        'cw<pf<leader-hyp': self.__tab_leader_func,
        'cw<pf<leader-und': self.__tab_leader_func,
        'cw<pf<tab-bar-st': self.__tab_bar_func,
    }
    # Readable names for tab types and tab leaders.
    self.__tab_type_dict = {
        'cw<pf<tab-center': 'center',
        'cw<pf<tab-right_': 'right',
        'cw<pf<tab-dec___': 'decimal',
        'cw<pf<leader-dot': 'leader-dot',
        'cw<pf<leader-hyp': 'leader-hyphen',
        'cw<pf<leader-und': 'leader-underline',
    }
    # Tokens that may be missing from the token dictionary without error.
    self.__ignore_list = [
        'list-tebef',
    ]
    self.__tabs_list = self.__tabs_dict.keys()
    self.__tab_type = 'left'
    self.__leader_found = 0
def __in_individual_style_func(self, line):
    """
    Required:
        line
    Returns:
        nothing
    Logic:
        Handle one line inside an individual style, trying in order:
        1. a handler registered in the state dictionary for this token;
        2. a border line (starts with 'cw<bd'), parsed by the border
           object, each resulting pair being entered in the styles
           dictionary;
        3. a tab token, passed to its handler in the tabs dictionary;
        4. any other control word: the abbreviated name (characters
           6-16) is translated through the token dictionary and entered
           with its value. An unknown name that is not in the ignore list
           raises the bug handler when the run level is above 3;
        5. text: appended to the text string, which becomes the name of
           the style.
    """
    action = self.__state_dict.get(self.__token_info)
    if action:
        action(line)
        return
    # have to parse border lines with external module
    if line[0:5] == 'cw<bd':
        border_dict = self.__border_obj.parse_border(line)
        for key in border_dict.keys():
            self.__enter_dict_entry(key, border_dict[key])
        return
    if self.__token_info in self.__tabs_list:
        tab_action = self.__tabs_dict.get(self.__token_info)
        if tab_action is not None:
            tab_action(line)
        return
    prefix = line[0:2]
    if prefix == 'cw':
        # cw<pf<widow-cntl<nu<true
        info = line[6:16]
        att = self.__token_dict.get(info)
        if att is not None:
            self.__enter_dict_entry(att, line[20:-1])
        elif info not in self.__ignore_list and self.__run_level > 3:
            msg = 'no value for key %s\n' % info
            raise self.__bug_handler(msg)
    elif prefix == 'tx':
        self.__text_string += line[17:-1]
def __tab_stop_func(self, line):
    """
    Requires:
        line -- line to parse
    Returns:
        nothing
    Logic:
        Append "<tab type>:<position>;" to the 'tabs' entry of the current
        paragraph style, where the position is the value part of the line.
        If the entry does not exist yet, create it (empty) first. The tab
        type is then reset to 'left' and the leader flag to 0.
    """
    entry = '%s:' % self.__tab_type + '%s;' % line[20:-1]
    try:
        self.__styles_dict['par'][self.__styles_num]['tabs'] += entry
    except KeyError:
        self.__enter_dict_entry('tabs', '')
        self.__styles_dict['par'][self.__styles_num]['tabs'] += entry
    self.__tab_type = 'left'
    self.__leader_found = 0
def __tab_type_func(self, line):
    """
    Requires:
        line -- line to parse (unused; the current token is read from
        the instance)
    Returns:
        nothing
    Logic:
        Look up the readable tab type for the current token and remember
        it for the next tab stop. If there is no entry, raise the bug
        handler when the run level is above 3; otherwise do nothing.
    """
    tab_type = self.__tab_type_dict.get(self.__token_info)
    if tab_type is None:
        if self.__run_level > 3:
            msg = 'no entry for %s\n' % self.__token_info
            raise self.__bug_handler(msg)
        return
    self.__tab_type = tab_type
def __tab_leader_func(self, line):
    """
    Requires:
        line -- not used; present so all handlers share one signature
    Returns:
        nothing
    Logic:
        Flag that a leader was seen, then look the current token up in
        the tab-type dictionary. When found, add a '^' to it and append
        it to the 'tabs' string of the current paragraph style, creating
        that string first if necessary. An unknown token is raised
        through the bug handler when the run level is above 3.
    """
    self.__leader_found = 1
    leader = self.__tab_type_dict.get(self.__token_info)
    if leader is None:
        if self.__run_level > 3:
            raise self.__bug_handler('no entry for %s\n' % self.__token_info)
        return
    leader = leader + '^'
    try:
        self.__styles_dict['par'][self.__styles_num]['tabs'] += ':%s;' % leader
    except KeyError:
        self.__enter_dict_entry('tabs', '')
        # a freshly created string gets no leading colon
        self.__styles_dict['par'][self.__styles_num]['tabs'] += '%s;' % leader
def __tab_bar_func(self, line):
    """
    Requires:
        line -- the bar-tab token; the position is taken from line[20:-1]
    Returns:
        nothing
    Logic:
        Append "bar:<position>;" to the 'tabs' string of the current
        paragraph style, creating that string first when it is missing.
        The pending tab type is then set back to 'left'.
    """
    bar_text = '%s:' % 'bar' + '%s;' % line[20:-1]
    try:
        self.__styles_dict['par'][self.__styles_num]['tabs'] += bar_text
    except KeyError:
        self.__enter_dict_entry('tabs', '')
        self.__styles_dict['par'][self.__styles_num]['tabs'] += bar_text
    self.__tab_type = 'left'
def __enter_dict_entry(self, att, value):
    """
    Required:
        att -- the attribute
        value -- the value
    Returns:
        nothing
    Logic:
        Store the attribute on the dictionary of the current style
        (current type, current number). When that dictionary does not
        exist yet, let __add_dict_entry build it.
    """
    try:
        current_style = self.__styles_dict[self.__type_of_style][self.__styles_num]
    except KeyError:
        self.__add_dict_entry(att, value)
    else:
        current_style[att] = value
def __add_dict_entry(self, att, value):
    """
    Required:
        att -- the attribute
        value -- the value
    Returns:
        nothing
    Logic:
        Create the dictionary for the current style number, holding just
        this attribute/value pair, inside the 'par' or 'char' branch of
        the styles dictionary (whichever is the current type of style).
        If the current type is neither 'par' nor 'char', raise through
        the bug handler when the run level is above 3; at lower run
        levels the entry is dropped. (Previously that case fell through
        to an unbound local variable.)
    """
    style_type = self.__type_of_style
    if style_type not in ('par', 'char'):
        if self.__run_level > 3:
            msg = style_type + 'error\n'
            raise self.__bug_handler(msg)
        # best effort at low run levels: there is no branch to store into
        return
    self.__styles_dict[style_type][self.__styles_num] = {att: value}
def __para_style_func(self, line):
    """
    Required:
        line -- a token such as "cw<ss<para-style<nu<15"
    Returns:
        nothing
    Logic:
        Make 'par' the current type of style and remember the style
        number found after the token prefix (line[20:-1]).
    """
    style_number = line[20:-1]
    self.__type_of_style = 'par'
    self.__styles_num = style_number
    # Disabled in the original: pre-creating empty entries
    #   self.__enter_dict_entry('tabs-left', '')
    #   self.__enter_dict_entry('tabs-right', '')
    #   self.__enter_dict_entry('tabs-center', '')
    #   self.__enter_dict_entry('tabs-decimal', '')
    #   self.__enter_dict_entry('tabs-bar', '')
def __char_style_func(self, line):
    """
    Required:
        line -- a token such as "cw<ss<char-style<nu<15"
    Returns:
        nothing
    Logic:
        Make 'char' the current type of style and remember the style
        number found after the token prefix (line[20:-1]).
    """
    style_number = line[20:-1]
    self.__type_of_style = 'char'
    self.__styles_num = style_number
def __found_beg_ind_style_func(self, line):
    """
    Required:
        line -- not used; present so all handlers share one signature
    Returns:
        nothing
    Logic:
        The beginning of an individual style was found: switch the state
        to 'in_individual_style'.
    """
    new_state = 'in_individual_style'
    self.__state = new_state
def __found_end_ind_style_func(self, line):
    """
    Required:
        line -- not used; present so all handlers share one signature
    Returns:
        nothing
    Logic:
        The collected text string is the name of the style. Drop its
        last character (the semicolon), trim surrounding white space
        (added 2005-04-29), and store the result under the key 'name'
        for the current style. Then empty the text string.
    """
    style_name = self.__text_string[:-1].strip()
    self.__enter_dict_entry('name', style_name)
    self.__text_string = ''
def __found_end_styles_table_func(self, line):
    """
    Required:
        line -- not used; present so all handlers share one signature
    Returns:
        nothing
    Logic:
        The styles table is finished. Switch the state to
        'after_styles_table', replace style numbers by style names
        (__fix_based_on), and write the table out (__print_style_table).
    """
    self.__state = 'after_styles_table'
    for finish_step in (self.__fix_based_on, self.__print_style_table):
        finish_step()
def __fix_based_on(self):
    """
    Requires:
        nothing
    Returns:
        nothing
    Logic:
        The styles dictionary may contain a pair such as
        'next-style' => '15'. Replace the 15 with the name of style 15,
        found by looking 15 up among the styles of the same type.
        Paragraph styles are handled first, then character styles; for
        each style 'next-style' is checked before 'based-on-style'.
        If the referenced style exists but has no name, the number is
        left as it is. If the referenced style does not exist, the
        reference is deleted; unless the value is 0 / '0', this is first
        raised through the bug handler when the run level is above 4.
    """
    for style_type in ('par', 'char'):
        styles_of_type = self.__styles_dict[style_type]
        for key in styles_of_type:
            this_style = styles_of_type[key]
            for reference in ('next-style', 'based-on-style'):
                value = this_style.get(reference)
                if value is None:
                    continue
                referenced_style = styles_of_type.get(value)
                if referenced_style:
                    referenced_name = referenced_style.get('name')
                    if referenced_name:
                        this_style[reference] = referenced_name
                    continue
                dangling = not (value == 0 or value == '0')
                if dangling and self.__run_level > 4:
                    msg = '%s %s is based on %s\n' % (style_type, key, value)
                    # was "msg =", which threw the first line away
                    msg += 'There is no style with %s\n' % value
                    raise self.__bug_handler(msg)
                del this_style[reference]
def __print_style_table(self):
    """
    Required:
        nothing
    Returns:
        nothing
    Logic:
        Write the style table to the output. The paragraph styles come
        first, then the character styles, each group wrapped in an open
        and a close tag. Inside a group every style number gets one
        empty tag line that carries the number followed by all of its
        attribute/value pairs.
    """
    out = self.__write_obj
    for style_type, prefix in (('par', 'paragraph'), ('char', 'character')):
        out.write('mi<tg<open______<%s-styles\n' % prefix)
        for num, attributes in self.__styles_dict[style_type].items():
            pieces = ['mi<tg<empty-att_<%s-style-in-table<num>%s' % (prefix, num)]
            for att, this_value in attributes.items():
                pieces.append('<%s>%s' % (att, this_value))
            pieces.append('\n')
            out.write(''.join(pieces))
        out.write('mi<tg<close_____<%s-styles\n' % prefix)
def __found_styles_table_func(self, line):
    """
    Required:
        line -- not used; present so all handlers share one signature
    Returns:
        nothing
    Logic:
        The marker for the styles table was found: switch the state to
        'in_styles_table'.
    """
    new_state = 'in_styles_table'
    self.__state = new_state
def __before_styles_func(self, line):
    """
    Required:
        line -- the line being processed
    Returns:
        nothing
    Logic:
        Look the current token up in the state dictionary. If a handler
        is registered for it, call the handler with the line; otherwise
        copy the line to the output unchanged.
    """
    handler = self.__state_dict.get(self.__token_info)
    if handler:
        handler(line)
    else:
        self.__write_obj.write(line)
def __in_styles_func(self, line):
    """
    Required:
        line -- the line being processed
    Returns:
        nothing
    Logic:
        Look the current token up in the state dictionary. If an entry
        exists, call it with the line; if there is none, copy the line
        to the output unchanged.
    """
    handler = self.__state_dict.get(self.__token_info)
    if handler is not None:
        handler(line)
        return
    self.__write_obj.write(line)
def __para_style_in_body_func(self, line, type):
    """
    Required:
        line -- a token such as "cw<ss<para-style<nu<15"
        type -- 'par' for a paragraph style, anything else is treated
            as a character style
    Returns:
        nothing
    Logic:
        Take the style number from the line (line[20:-1]) and write the
        token again with the style name in place of the number. When the
        number has no named entry in the styles dictionary (possible
        with invalid RTF that uses a style it never defined), write a
        'not-defined' token instead.
    """
    prefix = 'para' if type == 'par' else 'char'
    num = line[20:-1]
    # every missing level simply yields no name
    style_name = self.__styles_dict.get(type, {}).get(num, {}).get('name')
    if style_name:
        token = 'cw<ss<%s-style<nu<%s\n' % (prefix, style_name)
    else:
        token = 'cw<ss<%s_style<nu<not-defined\n' % prefix
    self.__write_obj.write(token)
def __after_styles_func(self, line):
    """
    Required:
        line -- the line being processed
    Returns:
        nothing
    Logic:
        The body dictionary maps a token to a (handler, style type)
        pair. If the current token has a handler, call it with the line
        and the style type; otherwise copy the line to the output
        unchanged.
    """
    handler, style_type = self.__body_dict.get(self.__token_info, (None, None))
    if not handler:
        self.__write_obj.write(line)
        return
    handler(line, style_type)
def convert_styles(self):
    """
    Requires:
        nothing
    Returns:
        nothing (changes the original file)
    Logic:
        Read one line in at a time and hand it to the handler registered
        for the current state.
        Before the style table, look for the beginning of the table.
        In the style table, build the style dictionary and print out the
        tags.
        After the style table, look for lines with style info and
        substitute the number with the name of the style.
        A state without a handler is reported on stderr and the line is
        copied through unchanged (it used to end in a TypeError).
        Both files are closed even if a handler raises. Finally the
        result replaces the original file; when copying is enabled a
        copy named "styles.data" is made first.
    """
    self.__initiate_values()
    with open_for_read(self.__file) as read_obj:
        with open_for_write(self.__write_to) as write_obj:
            self.__write_obj = write_obj
            while True:
                line = read_obj.readline()
                self.__token_info = line[:16]
                action = self.__state_dict.get(self.__state)
                if action is None:
                    sys.stderr.write('no matching state in module styles.py\n')
                    sys.stderr.write(self.__state + '\n')
                    write_obj.write(line)
                else:
                    action(line)
                # as before, the empty string read at end of file is
                # still passed to the handler once before stopping
                if not line:
                    break
    copy_obj = copy.Copy(bug_handler=self.__bug_handler)
    if self.__copy:
        copy_obj.copy_file(self.__write_to, "styles.data")
    copy_obj.rename(self.__write_to, self.__file)
    os.remove(self.__write_to)

View File

@@ -0,0 +1,568 @@
from __future__ import absolute_import, division, print_function, unicode_literals
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# #
#########################################################################
import sys, os
from calibre.ebooks.rtf2xml import copy, border_parse
from calibre.ptempfile import better_mktemp
from polyglot.builtins import unicode_type
from . import open_for_read, open_for_write
"""
States.
1. 'not_in_table'
1. 'cw<tb<row-def___' start a row definition
2. 'mi<mk<in-table__' start table
2. 'in_table'
1. 'mi<mk<pard-start', start of a row, cell
2. 'mi<mk<not-in-tbl', end the table.
3. 'cw<tb<row-def___' start a row definition
3. in_row_definition
1. 'mi<mk<not-in-tbl' : end the row definition. If in table, end the table.
2. 'mi<mk<pard-start' : end the row definition
if already in the table, start a row and cell.
3. 'cw<tb<row_______' : end the row definition, end the row
4. 'cw...' use another method to handle the control word
control word might be added to dictionary.
5. 'mi<mk<in-table__' If already in table, do nothing. Otherwise
start the table.
4. 'in_row'
1. 'mi<mk<pard-start', start cell
2. 'mi<mk<not-in-tbl' end table,
3. 'cw<tb<row_______' close row,
5. 'in_cell'
1. 'mi<mk<not-in-tbl', end table
2. 'cw<tb<cell______', end cell
"""
class Table:
    """
    Make tables.

    Logic:
        Read one line at a time. self.__state is a stack of state names
        whose last item is the current state; it starts out as
        ['not_in_table']. Look for either a table marker
        ('cw<tb<in-table__' / 'mi<mk<in-table__') or a row definition
        ('cw<tb<row-def___'). Row definitions are collected into
        dictionaries whose contents are written as attributes on the row
        and cell tags that follow.
    """

    # Markers that close an open table wherever they are met.
    __table_end_tokens = (
        'mi<mk<not-in-tbl',
        'mi<mk<sect-start',
        'mi<mk<sect-close',
        'mi<mk<body-close',
    )

    def __init__(self,
            in_file,
            bug_handler,
            copy=None,
            run_level=1,):
        """
        Required:
            'in_file' -- file to parse
            'bug_handler' -- exception class used to report problems
        Optional:
            'copy' -- whether to make a copy of result for debugging
            'run_level' -- stored; not read by this class
        Returns:
            nothing
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__run_level = run_level
        self.__write_to = better_mktemp()

    def __initiate_values(self):
        """
        Initiate all values: the dispatch dictionaries, the state stack
        and the per-table / per-row bookkeeping.
        """
        self.__state_dict = {
            'in_table': self.__in_table_func,
            'in_row_def': self.__in_row_def_func,
            'not_in_table': self.__not_in_table_func,
            'in_cell': self.__in_cell_func,
            'in_row': self.__in_row_func,
        }
        self.__not_in_table_dict = {
            'cw<tb<row-def___': self.__found_row_def_func,
            'cw<tb<in-table__': self.__start_table_func,
            'mi<mk<in-table__': self.__start_table_func,
        }
        # can't use this dictionary. When in row_definition, many tokens
        # require multiple definitions
        self.__in_row_definition_dict = {
            'mi<mk<not-in-tbl': self.__end_row_table_func,
            'mi<mk<pard-start': self.__end_row_def_func,
        }
        self.__in_row_dict = {
            'mi<mk<not-in-tbl': self.__close_table,
            'mi<mk<pard-start': self.__start_cell_func,
            'cw<tb<row_______': self.__end_row_func,
            'cw<tb<cell______': self.__empty_cell,
        }
        # set the default state
        self.__state = ['not_in_table']
        # set empty data for all tables
        self.__table_data = []
        # just in case there is no table data
        self.__row_dict = {}
        self.__cell_list = []
        self.__cell_widths = []
        # __start_table_func resets these for every table. They are also
        # set here so that a row which ends before any table marker was
        # seen does not fail on a missing attribute.
        self.__rows_in_table = 0
        self.__cells_in_table = 0
        self.__cells_in_row = 0
        self.__max_number_cells_in_row = 0
        self.__list_of_cells_in_row = []

    def __write_att_tag(self, tag, attributes):
        """
        Required:
            tag -- the tag text to start the line with
            attributes -- dictionary of attributes for the tag
        Returns:
            nothing
        Logic:
            Write the tag, one '<key>value' pair per attribute, and a
            newline.
        """
        pairs = ''.join(
            '<%s>%s' % (key, attributes[key]) for key in attributes)
        self.__write_obj.write('%s%s\n' % (tag, pairs))

    def __in_table_func(self, line):
        """
        Requires:
            line -- line to parse
        Logic:
            Look for the end of the table. If found, close out the table.
            Look for 'mi<mk<pard-start', which marks the beginning of a
            row. Start a row and start a cell.
            A row definition switches to the row-definition state, and a
            cell token met directly in the table starts a row holding an
            empty cell.
            The line itself is always written out afterwards.
        """
        token = self.__token_info
        if token in self.__table_end_tokens:
            self.__close_table(line)
        elif token == 'mi<mk<pard-start':
            self.__start_row_func(line)
            self.__start_cell_func(line)
        elif token == 'cw<tb<row-def___':
            self.__found_row_def_func(line)
        elif token == 'cw<tb<cell______':
            self.__start_row_func(line)
            self.__empty_cell(line)
        self.__write_obj.write(line)

    def __not_in_table_func(self, line):
        """
        Requires:
            line -- the line of text read in from document
        Returns:
            nothing
        Logic:
            The state is not in a table, so look for the tokens that
            mark the start of a table or of a row definition. If one is
            found, call its handler. The line is written out in either
            case.
        """
        action = self.__not_in_table_dict.get(self.__token_info)
        if action:
            action(line)
        self.__write_obj.write(line)

    def __close_table(self, line):
        """
        Requires:
            line -- not used; kept for consistency with other handlers
        Returns:
            nothing
        Logic:
            Write the end marker for the table.
            Set the state to ['not_in_table'].
            Record the figures for this table in the last item of the
            table data: number of columns (largest number of cells seen
            in one row), number of rows, and the most frequent cell
            count and cell width (stored under the 'average-...' keys).
        """
        self.__write_obj.write('mi<mk<table-end_\n')
        self.__state = ['not_in_table']
        table = self.__table_data[-1]
        table['number-of-columns'] = self.__max_number_cells_in_row
        table['number-of-rows'] = self.__rows_in_table
        table['average-cells-per-row'] = self.__mode(
            self.__list_of_cells_in_row)
        table['average-cell-width'] = self.__mode(self.__cell_widths)

    def __found_row_def_func(self, line):
        """
        Requires:
            line -- not used; kept for consistency with other handlers
        Returns:
            nothing
        Logic:
            A row definition has been found. Push 'in_row_def' on the
            state stack and start with fresh row and cell data; the data
            is collected to use later in writing attributes for the
            table.
        """
        self.__state.append('in_row_def')
        self.__last_cell_position = 0
        self.__row_dict = {}
        # one empty dictionary for the first cell of the definition
        self.__cell_list = [{}]
        self.__cell_widths = []

    def __start_table_func(self, line):
        """
        Requires:
            line -- not used; kept for consistency with other handlers
        Returns:
            nothing
        Logic:
            Reset the counters for the new table and add an empty
            dictionary for it to the table data.
            Write out the table marker.
            Add 'in_table' to the state list.
        """
        self.__rows_in_table = 0
        self.__cells_in_table = 0
        self.__cells_in_row = 0
        self.__max_number_cells_in_row = 0
        self.__table_data.append({})
        self.__list_of_cells_in_row = []
        self.__write_obj.write('mi<mk<tabl-start\n')
        self.__state.append('in_table')

    def __end_row_table_func(self, line):
        """
        Requires:
            line -- just for consistency
        Returns:
            nothing
        Logic:
            Close the table. (Only referenced from the unused
            row-definition dictionary. It used to pass self twice, which
            raised a TypeError.)
        """
        self.__close_table(line)

    def __end_row_def_func(self, line):
        """
        Requires:
            line -- just for consistency
        Returns:
            nothing
        Logic:
            Leave the row-definition state.
            Get rid of the last {} in the cell list.
            Figure out the number of cells from the comma separated
            self.__row_dict['widths'] ('122, 122') and store it under
            'number-of-cells'.
        """
        if self.__state and self.__state[-1] == 'in_row_def':
            self.__state.pop()
        # an empty {} is added at the *end* of each cell position.
        # Get rid of the extra one
        self.__cell_list.pop()
        widths = self.__row_dict.get('widths')
        if widths:
            self.__row_dict['number-of-cells'] = len(widths.split(','))

    def __in_row_def_func(self, line):
        """
        Requires:
            line -- line to parse
        Returns:
            nothing
        Logic:
            In the text that defines a row. If a control word is found,
            handle the control word with another method.
            Check for tokens that end this state: the end of a row, the
            end of the table, the start of a paragraph definition (which
            starts a row and cell when already in a table), or a table
            marker (which starts a table when not yet in one).
            The line is written out in every case.
        """
        token = self.__token_info
        if token == 'cw<tb<row_______':
            # write tags
            self.__end_row_func(line)
            # change the state
            self.__end_row_def_func(line)
        elif line[0:2] == 'cw':
            self.__handle_row_token(line)
        elif token == 'mi<mk<not-in-tbl' and 'in_table' in self.__state:
            self.__end_row_def_func(line)
            self.__close_table(line)
        elif token == 'mi<mk<pard-start':
            self.__end_row_def_func(line)
            # if already in the table, start a row, then cell.
            # (was "(self.__state) > 0": list against int, a TypeError
            # on Python 3)
            if len(self.__state) > 0 and self.__state[-1] == 'in_table':
                self.__start_row_func(line)
                self.__start_cell_func(line)
        elif token == 'mi<mk<in-table__':
            self.__end_row_def_func(line)
            # if not in table, start a new table
            if len(self.__state) > 0 and self.__state[-1] != 'in_table':
                self.__start_table_func(line)
        self.__write_obj.write(line)

    def __handle_row_token(self, line):
        """
        Requires:
            line -- line to parse
        Returns:
            nothing
        Logic:
            The tokens in the row definition contain the following
            information:
                1. row borders.
                2. cell borders for all cells in the row.
                3. cell positions for all cells in the row.
            Border tokens are parsed by the border_parse module. If any
            resulting key starts with 'border-cell', all keys go into
            the last dictionary of the cell list; otherwise they go into
            the row dictionary.
            ([{border:something, width:something},
            {border:something, width:something}])
            cw<bd<bor-t-r-to<nu<bdr-hair__|bdr-li-wid:0.50
        """
        if line[3:5] == 'bd':
            border_obj = border_parse.BorderParse()
            the_dict = border_obj.parse_border(line)
            # border-cell-top-hairline
            in_cell = any(key[0:11] == 'border-cell' for key in the_dict)
            target = self.__cell_list[-1] if in_cell else self.__row_dict
            for key in the_dict:
                target[key] = the_dict[key]
        # cw<tb<cell-posit<nu<216.00
        elif self.__token_info == 'cw<tb<cell-posit':
            self.__found_cell_position(line)
        # cw<tb<row-pos-le<nu<-5.40
        elif self.__token_info == 'cw<tb<row-pos-le':
            self.__row_dict['left-row-position'] = line[20:-1]
        elif self.__token_info == 'cw<tb<row-header':
            self.__row_dict['header'] = 'true'

    def __start_cell_func(self, line):
        """
        Required:
            line -- not used; kept for consistency with other handlers
        Returns:
            nothing
        Logic:
            Append 'in_cell' to the states.
            If the cell list contains dictionaries, take the first one
            off the list and write its key/value pairs as attributes of
            the cell tag. Otherwise, write a plain cell tag.
            Count the cell for the table and for the row.
        """
        self.__state.append('in_cell')
        if len(self.__cell_list) > 0:
            self.__write_att_tag(
                'mi<tg<open-att__<cell', self.__cell_list.pop(0))
        else:
            self.__write_obj.write('mi<tg<open______<cell\n')
        self.__cells_in_table += 1
        self.__cells_in_row += 1

    def __start_row_func(self, line):
        """
        Required:
            line -- not used; kept for consistency with other handlers
        Returns:
            nothing
        Logic:
            Append 'in_row' to the states.
            Write the row tag with the row dictionary as attributes.
            Reset the cell count of the row and count the row.
        """
        self.__state.append('in_row')
        self.__write_att_tag('mi<tg<open-att__<row', self.__row_dict)
        self.__cells_in_row = 0
        self.__rows_in_table += 1

    def __found_cell_position(self, line):
        """
        Requires:
            line -- current line, e.g. cw<tb<cell-posit<nu<216.00
        Returns:
            nothing
        Logic:
            Calculate the cell width as the distance from the last cell
            position. While the last cell position is still 0, the left
            row position (often negative) is subtracted as well.
            Next, set the new last_cell_position to the current cell
            position.
            The width, formatted with two decimals, is added to the
            row's comma separated 'widths', stored on the current cell
            dictionary, and remembered for the table statistics. A new
            empty dictionary is then added for the next cell.
        """
        new_cell_position = round(float(line[20:-1]), 2)
        left_position = 0
        if self.__last_cell_position == 0:
            left_position = float(
                self.__row_dict.get('left-row-position', 0))
        width = new_cell_position - self.__last_cell_position - left_position
        # with unicode_literals in effect the formatted result is
        # already a text string
        width = '%.2f' % width
        self.__last_cell_position = new_cell_position
        if self.__row_dict.get('widths'):
            self.__row_dict['widths'] += ', %s' % width
        else:
            self.__row_dict['widths'] = width
        self.__cell_list[-1]['width'] = width
        self.__cell_list.append({})
        self.__cell_widths.append(width)

    def __in_cell_func(self, line):
        """
        Required:
            line -- line to parse
        Returns:
            nothing
        Logic:
            In the middle of a cell.
            Look for the close of the table. If found, close the cell,
            the row and the table, then write the line.
            Look for the close of the cell. If found, close out the cell
            (the cell token itself is not written).
            Otherwise, write out the line.
        """
        # cw<tb<cell______<nu<true
        # mi<mk<sect-start
        token = self.__token_info
        if token == 'cw<tb<cell______':
            self.__end_cell_func(line)
            return
        if token in self.__table_end_tokens:
            self.__end_cell_func(line)
            self.__end_row_func(line)
            self.__close_table(line)
        self.__write_obj.write(line)

    def __end_cell_func(self, line):
        """
        Requires:
            line -- not used; kept for consistency with other handlers
        Returns:
            nothing
        Logic:
            End the cell. Pop 'in_cell' off self.__state if it is the
            current state, then write out the closing marks.
        """
        if len(self.__state) > 1 and self.__state[-1] == 'in_cell':
            self.__state.pop()
        self.__write_obj.write('mi<mk<close_cell\n')
        self.__write_obj.write('mi<tg<close_____<cell\n')
        self.__write_obj.write('mi<mk<closecell_\n')

    def __in_row_func(self, line):
        """
        Required:
            line -- line to parse
        Returns:
            nothing
        Logic:
            In a row but not in a cell. A token that ends the table
            closes the row and the table. Otherwise call the handler
            registered for the token, if there is one. The line is
            written out in either case.
        """
        if self.__token_info in self.__table_end_tokens:
            self.__end_row_func(line)
            self.__close_table(line)
        else:
            action = self.__in_row_dict.get(self.__token_info)
            if action:
                action(line)
        self.__write_obj.write(line)

    def __end_row_func(self, line):
        """
        Required:
            line -- not used; kept for consistency with other handlers
        Returns:
            nothing
        Logic:
            If the current state is 'in_row', pop it and write the
            closing row tag. Otherwise write an empty row tag and count
            it as a row.
            Update the largest number of cells seen in a row and record
            this row's cell count.
        """
        if len(self.__state) > 1 and self.__state[-1] == 'in_row':
            self.__state.pop()
            self.__write_obj.write('mi<tg<close_____<row\n')
        else:
            self.__write_obj.write('mi<tg<empty_____<row\n')
            self.__rows_in_table += 1
        if self.__cells_in_row > self.__max_number_cells_in_row:
            self.__max_number_cells_in_row = self.__cells_in_row
        self.__list_of_cells_in_row.append(self.__cells_in_row)

    def __empty_cell(self, line):
        """
        Required:
            line -- not used; kept for consistency with other handlers
        Returns:
            nothing
        Logic:
            Write an empty tag with attributes if there are attributes
            (taken from the last dictionary in the cell list, which
            stays on the list). Otherwise, write an empty tag with cell
            as element.
            Count the cell for the table and for the row.
        """
        if len(self.__cell_list) > 0:
            self.__write_att_tag(
                'mi<tg<empty-att_<cell', self.__cell_list[-1])
        else:
            self.__write_obj.write('mi<tg<empty_____<cell\n')
        self.__cells_in_table += 1
        self.__cells_in_row += 1

    def __mode(self, the_list):
        """
        Required:
            the_list -- a list of hashable items (numbers or strings)
        Returns:
            the item that occurs the most; of several equally frequent
            items the one that appears first in the list. 'not-defined'
            for an empty list.
        Logic:
            Count every item once, then pick the first item with the
            greatest count.
        """
        counts = {}
        for item in the_list:
            counts[item] = counts.get(item, 0) + 1
        highest = 0
        mode = 'not-defined'
        for item in the_list:
            if counts[item] > highest:
                mode = item
                highest = counts[item]
        return mode

    def make_table(self):
        """
        Requires:
            nothing
        Returns:
            The list of table dictionaries, one per table, holding the
            values for the beginning of each table.
        Logic:
            Read one line in at a time. Determine what action to take
            based on the state.
            A state without a handler is reported on stderr and the line
            is copied through unchanged (it used to end in a TypeError).
            Both files are closed even if a handler raises. Finally the
            result replaces the original file; when copying is enabled a
            copy named "table.data" is made first.
        """
        self.__initiate_values()
        with open_for_read(self.__file) as read_obj:
            with open_for_write(self.__write_to) as write_obj:
                self.__write_obj = write_obj
                while True:
                    line = read_obj.readline()
                    self.__token_info = line[:16]
                    action = self.__state_dict.get(self.__state[-1])
                    if action is None:
                        sys.stderr.write(
                            'No matching state in module table.py\n')
                        sys.stderr.write(self.__state[-1] + '\n')
                        write_obj.write(line)
                    else:
                        action(line)
                    # as before, the empty string read at end of file is
                    # still passed to the handler once before stopping
                    if not line:
                        break
        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "table.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)
        return self.__table_data

View File

@@ -0,0 +1,88 @@
from __future__ import unicode_literals, absolute_import, print_function, division
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# #
#########################################################################
import os
from calibre.ebooks.rtf2xml import copy
from calibre.ptempfile import better_mktemp
from . import open_for_read, open_for_write
# note to self. This is the first module in which I use tempfile. A good idea?
"""
"""
class TableInfo:
    """
    Insert table data for tables.
    Logic:
        Every table start marker in the file is preceded by a table tag
        that carries the data collected for that table; every table end
        marker is preceded by the closing table tag.
    """

    def __init__(self,
            in_file,
            bug_handler,
            table_data,
            copy=None,
            run_level=1,):
        """
        Required:
            'in_file' -- file to parse
            'bug_handler' -- exception class used to report problems
            'table_data' -- a list with one dictionary for each table.
        Optional:
            'copy' -- whether to make a copy of result for debugging
            'run_level' -- above 3, missing table data is raised through
                the bug handler
        Returns:
            nothing
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__table_data = table_data
        self.__run_level = run_level
        self.__write_to = better_mktemp()
        # self.__write_to = 'table_info.data'

    def insert_info(self):
        """
        Requires:
            nothing
        Returns:
            nothing (changes the original file)
        Logic:
            Copy the file line by line. Before each
            'mi<mk<tabl-start' line write an opening table tag whose
            attributes are the next unused dictionary of the table data;
            if the data has run out, write a plain opening tag (and
            raise through the bug handler when the run level is above
            3). Before each 'mi<mk<table-end_' line write the closing
            table tag.
            Both files are closed even if the bug handler raises.
            Finally the result replaces the original file; when copying
            is enabled a copy named "table_info.data" is made first.
        """
        with open_for_read(self.__file) as read_obj:
            with open_for_write(self.__write_to) as write_obj:
                self.__write_obj = write_obj
                # stops at the empty string readline() returns at EOF
                for line in iter(read_obj.readline, ''):
                    if line == 'mi<mk<tabl-start\n':
                        if len(self.__table_data) > 0:
                            table_dict = self.__table_data[0]
                            write_obj.write('mi<tg<open-att__<table')
                            for key in table_dict:
                                write_obj.write(
                                    '<%s>%s' % (key, table_dict[key]))
                            write_obj.write('\n')
                            # rebind rather than mutate: the list was
                            # handed in by the caller
                            self.__table_data = self.__table_data[1:]
                        else:
                            # this shouldn't happen!
                            if self.__run_level > 3:
                                msg = 'Not enough data for each table\n'
                                raise self.__bug_handler(msg)
                            write_obj.write('mi<tg<open______<table\n')
                    elif line == 'mi<mk<table-end_\n':
                        write_obj.write('mi<tg<close_____<table\n')
                    write_obj.write(line)
        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "table_info.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)

View File

@@ -0,0 +1,218 @@
from __future__ import unicode_literals, absolute_import, print_function, division
#########################################################################
# #
# #
# copyright 2002 Paul Henry Tremblay #
# #
# This program is distributed in the hope that it will be useful, #
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
# General Public License for more details. #
# #
# #
#########################################################################
import os, re
from calibre.ebooks.rtf2xml import copy
from calibre.utils.mreplace import MReplace
from calibre.ptempfile import better_mktemp
from polyglot.builtins import codepoint_to_chr, range, filter, map
from . import open_for_read, open_for_write
class Tokenize:
    """Tokenize RTF into one line per field. Each line will contain information useful for the rest of the script"""

    def __init__(self,
            in_file,
            bug_handler,
            copy=None,
            run_level=1,
            # out_file = None,
        ):
        """
        Required:
            'in_file' -- file to tokenize (replaced by the result)
            'bug_handler' -- exception class handed to the copy helper
        Optional:
            'copy' -- whether to make a copy of result for debugging
            'run_level' -- accepted for a uniform interface; not stored
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__write_to = better_mktemp()
        # self.__write_to = out_file
        self.__compile_expressions()
        # variables
        # number of fallback characters still to skip after a \u token
        self.__uc_char = 0
        # True when the next token is \bin data that must be skipped
        self.__uc_bin = False
        # stack of \uc values, one level per open group
        self.__uc_value = [1]

    def __reini_utf8_counters(self):
        """Forget any pending skip of fallback characters or bin data."""
        self.__uc_char = 0
        self.__uc_bin = False

    def __remove_uc_chars(self, startchar, token):
        """
        Skip pending fallback characters in token.

        Starting at index startchar, one character is dropped for every
        pending count in self.__uc_char. Returns what is left of the
        token, or '' when the token is used up first (the remaining
        count stays pending for the next token).
        """
        for i in range(startchar, len(token)):
            if self.__uc_char:
                self.__uc_char -= 1
            else:
                return token[i:]
        # if only char to skip
        return ''

    def __unicode_process(self, token):
        """
        Handle the unicode related controls for one token.

        Tracks the \\uc value per group, turns a \\u token into an XML
        character reference (or the plain ASCII character), and removes
        the fallback characters that follow it. Returns the token to
        keep, which may be ''.
        """
        # change scope in
        if token == r'\{':
            self.__uc_value.append(self.__uc_value[-1])
            # basic error handling
            self.__reini_utf8_counters()
            return token
        # change scope out
        elif token == r'\}':
            # keep the outermost value: an unbalanced closing brace used
            # to empty the stack and make the next lookup fail
            if len(self.__uc_value) > 1:
                self.__uc_value.pop()
            self.__reini_utf8_counters()
            return token
        # add a uc control
        elif token[:3] == '\\uc':
            self.__uc_value[-1] = int(token[3:])
            self.__reini_utf8_counters()
            return token
        # bin data to skip
        elif self.__uc_bin:
            self.__uc_bin = False
            return ''
        # uc char to remove
        elif self.__uc_char:
            # handle \bin tag in case of uc char to skip
            # (the literal was '\bin', i.e. backspace + "in", which can
            # never equal a four character slice)
            if token[:4] == '\\bin':
                self.__uc_char -= 1
                self.__uc_bin = True
                return ''
            elif token[:1] == "\\":
                self.__uc_char -= 1
                return ''
            else:
                return self.__remove_uc_chars(0, token)
        # go for real \u token
        match_obj = self.__utf_exp.match(token)
        if match_obj is not None:
            self.__reini_utf8_counters()
            # get value and handle negative case
            uni_char = int(match_obj.group(1))
            uni_len = len(match_obj.group(0))
            if uni_char < 0:
                uni_char += 65536
            uni_char = codepoint_to_chr(uni_char).encode('ascii', 'xmlcharrefreplace').decode('ascii')
            self.__uc_char = self.__uc_value[-1]
            # there is only an unicode char
            if len(token) <= uni_len:
                return uni_char
            # an unicode char and something else
            # must be after as it is splited on \
            # necessary? maybe for \bin?
            elif not self.__uc_char:
                return uni_char + token[uni_len:]
            # if not uc0 and chars
            else:
                return uni_char + self.__remove_uc_chars(uni_len, token)
        # default
        return token

    def __sub_reg_split(self, input_file):
        """
        Apply the basic substitutions to the whole file content and
        split it into tokens. Empty tokens and bare newlines are
        dropped. Returns the list of tokens.
        """
        input_file = self.__replace_spchar.mreplace(input_file)
        # this is for older RTF
        input_file = self.__par_exp.sub(r'\n\\par \n', input_file)
        input_file = self.__cwdigit_exp.sub(r"\g<1>\n\g<2>", input_file)
        input_file = self.__cs_ast.sub(r"\g<1>", input_file)
        input_file = self.__ms_hex_exp.sub(r"\\mshex0\g<1> ", input_file)
        input_file = self.__utf_ud.sub(r"\\{\\uc0 \g<1>\\}", input_file)
        # remove \n in bin data
        input_file = self.__bin_exp.sub(lambda x:
                            x.group().replace('\n', '') + '\n', input_file)
        # split
        tokens = re.split(self.__splitexp, input_file)
        # remove empty tokens and \n
        return [tok for tok in tokens if len(tok) > 0 and tok != '\n']

    def __compile_expressions(self):
        """Build the replacement table and compile all regular expressions."""
        SIMPLE_RPL = {
            "\\\\": "\\backslash ",
            "\\~": "\\~ ",
            "\\;": "\\; ",
            "&": "&amp;",
            "<": "&lt;",
            ">": "&gt;",
            "\\_": "\\_ ",
            "\\:": "\\: ",
            "\\-": "\\- ",
            # turn into a generic token to eliminate special
            # cases and make processing easier
            "\\{": "\\ob ",
            # turn into a generic token to eliminate special
            # cases and make processing easier
            "\\}": "\\cb ",
            # put a backslash in front of to eliminate special cases and
            # make processing easier
            "{": "\\{",
            # put a backslash in front of to eliminate special cases and
            # make processing easier
            "}": "\\}",
        }
        self.__replace_spchar = MReplace(SIMPLE_RPL)
        # add ;? in case of char following \u
        self.__ms_hex_exp = re.compile(r"\\\'([0-9a-fA-F]{2})")
        self.__utf_exp = re.compile(r"\\u(-?\d{3,6}) ?")
        self.__bin_exp = re.compile(r"(?:\\bin(-?\d{0,10})[\n ]+)[01\n]+")
        # manage upr/ud situations
        self.__utf_ud = re.compile(r"\\{[\n ]?\\upr[\n ]?(?:\\{.*?\\})[\n ]?" +
                   r"\\{[\n ]?\\*[\n ]?\\ud[\n ]?(\\{.*?\\})[\n ]?\\}[\n ]?\\}")
        # add \n in split for whole file reading
        # why keep backslash whereas \is replaced before?
        # remove \n from endline char
        self.__splitexp = re.compile(r"(\\[{}]|\n|\\[^\s\\{}&]+(?:[ \t\r\f\v])?)")
        # this is for old RTF
        self.__par_exp = re.compile(r'(\\\n+|\\ )')
        # handle improper cs char-style with \* before without {
        self.__cs_ast = re.compile(r'\\\*([\n ]*\\cs\d+[\n \\]+)')
        # handle cw using a digit as argument and without space as delimiter
        self.__cwdigit_exp = re.compile(r"(\\[a-zA-Z]+[\-0-9]+)([^0-9 \\]+)")

    def tokenize(self):
        """
        Main method. Reads the file, makes the basic substitutions and
        the split with __sub_reg_split, runs every token through
        __unicode_process, and writes the remaining tokens one per line.
        The result replaces the original file; when copying is enabled a
        copy named "tokenize.data" is made first.
        """
        # read
        with open_for_read(self.__file) as read_obj:
            input_file = read_obj.read()

        # process simple replacements and split giving us a correct list
        # remove '' and \n in the process
        tokens = self.__sub_reg_split(input_file)
        # correct unicode (tokens must be handled strictly in order) and
        # remove empty items created by removing \uc
        processed = (self.__unicode_process(tok) for tok in tokens)
        tokens = [tok for tok in processed if len(tok) > 0]

        # write
        with open_for_write(self.__write_to) as write_obj:
            write_obj.write('\n'.join(tokens))
        # Move and copy
        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "tokenize.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)
# self.__special_tokens = [ '_', '~', "'", '{', '}' ]
# import sys
# def main(args=sys.argv):
# if len(args) < 2:
# print 'No file'
# return
# file = 'data_tokens.txt'
# if len(args) == 3:
# file = args[2]
# to = Tokenize(args[1], Exception, out_file = file)
# to.tokenize()
# if __name__ == '__main__':
# sys.exit(main())
# calibre-debug -e src/calibre/ebooks/rtf2xml/tokenize.py