mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-03-13 13:15:53 +01:00
Initial import
This commit is contained in:
573
ebook_converter/ebooks/rtf2xml/ParseRtf.py
Normal file
573
ebook_converter/ebooks/rtf2xml/ParseRtf.py
Normal file
@@ -0,0 +1,573 @@
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
# $Revision: 1.41 $
|
||||
# $Date: 2006/03/24 23:50:07 $
|
||||
import sys, os
|
||||
|
||||
from calibre.ebooks.rtf2xml import headings_to_sections, \
|
||||
line_endings, footnote, fields_small, default_encoding, \
|
||||
make_lists, preamble_div, header, colors, group_borders, \
|
||||
check_encoding, add_brackets, table, combine_borders, \
|
||||
fields_large, process_tokens, hex_2_utf8, tokenize, \
|
||||
delete_info, sections, check_brackets, styles, \
|
||||
paragraph_def, convert_to_tags, output, copy, \
|
||||
list_numbers, info, pict, table_info, fonts, paragraphs, \
|
||||
body_styles, preamble_rest, group_styles, \
|
||||
inline
|
||||
from calibre.ebooks.rtf2xml.old_rtf import OldRtf
|
||||
from polyglot.builtins import unicode_type
|
||||
|
||||
from . import open_for_read, open_for_write
|
||||
|
||||
"""
|
||||
Here is an example script using the ParseRTF module directly
|
||||
#!/usr/bin/env python
|
||||
|
||||
def Handle_Main():
|
||||
# Handles options and creates a parse object
|
||||
parse_obj =ParseRtf.ParseRtf(
|
||||
in_file = 'in.rtf',
|
||||
# All values from here on are optional
|
||||
# determine the output file
|
||||
out_file = 'out.xml',
|
||||
# determine the run level. The default is 1.
|
||||
run_level = 3,
|
||||
# The name of a debug directory, if you are running at
|
||||
# run level 3 or higer.
|
||||
debug = 'debug_dir',
|
||||
# Convert RTF caps to real caps.
|
||||
# Default is 1.
|
||||
convert_caps = 1,
|
||||
# Indent resulting XML.
|
||||
# Default is 0 (no indent).
|
||||
indent = 1,
|
||||
# Form lists from RTF. Default is 1.
|
||||
form_lists = 1,
|
||||
# Convert headings to sections. Default is 0.
|
||||
headings_to_sections = 1,
|
||||
# Group paragraphs with the same style name. Default is 1.
|
||||
group_styles = 1,
|
||||
# Group borders. Default is 1.
|
||||
group_borders = 1,
|
||||
# Write or do not write paragraphs. Default is 0.
|
||||
empty_paragraphs = 0,
|
||||
# Allow to use a custom default encoding as fallback
|
||||
default_encoding = 'cp1252',
|
||||
)
|
||||
try:
|
||||
parse_obj.parse_rtf()
|
||||
except ParseRtf.InvalidRtfException as msg:
|
||||
sys.stderr.write(msg)
|
||||
except ParseRtf.RtfInvalidCodeException as msg:
|
||||
sys.stderr.write(msg)
|
||||
"""
|
||||
|
||||
|
||||
class InvalidRtfException(Exception):
    """Raised when the input file is not valid RTF and cannot be parsed."""
|
||||
|
||||
|
||||
class RtfInvalidCodeException(Exception):
    """Raised on bugs in the converter itself (invalid internal state/code)."""
|
||||
|
||||
|
||||
class ParseRtf:
    """
    Main class for controlling the rest of the parsing.

    Drives a multi-pass pipeline: the RTF input is first copied to a
    temporary file (see ``__make_temp_file``), then each rtf2xml
    sub-module rewrites that file in place (line endings, tokenizing,
    token processing, preamble and body passes, ...) until the final
    XML is emitted via ``output.Output``. The order of the passes is
    significant: later passes consume the token format produced by
    earlier ones.
    """

    def __init__(self,
                 in_file,
                 out_file='',
                 out_dir=None,
                 dtd='',
                 deb_dir=None,
                 convert_symbol=None,
                 convert_wingdings=None,
                 convert_zapf=None,
                 convert_caps=None,
                 run_level=1,
                 indent=None,
                 replace_illegals=1,
                 form_lists=1,
                 headings_to_sections=1,
                 group_styles=1,
                 group_borders=1,
                 empty_paragraphs=1,
                 no_dtd=0,
                 char_data='',
                 default_encoding='cp1252',
                 ):
        """
        Requires:
            'in_file' --file (path or file-like object) to parse
            'char_data' --file containing character maps
            'dtd' --path to dtd
        Possible parameters, but not necessary:
            'out_file' --a file to output the parsed file. (Default is
            standard output.)
            'out_dir' --directory for temporary output (If not provided, the
            script tries to output to the directory where the script is
            executed.)
            'deb_dir' --debug directory. If a debug_dir is provided, the
            script will copy each run through as a file to examine in the
            debug_dir
            'run_level' --at run level > 2 the brackets are checked after
            each pass (see __bracket_match). Only for debugging.
        Returns: Nothing
        """

        self.__file = in_file
        self.__out_file = out_file
        self.__out_dir = out_dir
        # the temp dir is deliberately the same as the output dir
        self.__temp_dir = out_dir
        self.__dtd_path = dtd
        # raises RtfInvalidCodeException if in_file is missing
        self.__check_file(in_file,"file_to_parse")
        self.__char_data = char_data
        self.__debug_dir = deb_dir
        self.__check_dir(self.__temp_dir)
        # __check_dir returns 1 when the dir exists (None otherwise), so
        # __copy doubles as the "keep per-pass debug copies" flag passed
        # to every pipeline stage below.
        self.__copy = self.__check_dir(self.__debug_dir)
        self.__convert_caps = convert_caps
        self.__convert_symbol = convert_symbol
        self.__convert_wingdings = convert_wingdings
        self.__convert_zapf = convert_zapf
        self.__run_level = run_level
        # highest return code seen so far; returned by parse_rtf()
        self.__exit_level = 0
        self.__indent = indent
        self.__replace_illegals = replace_illegals
        self.__form_lists = form_lists
        self.__headings_to_sections = headings_to_sections
        self.__group_styles = group_styles
        self.__group_borders = group_borders
        self.__empty_paragraphs = empty_paragraphs
        self.__no_dtd = no_dtd
        self.__default_encoding = default_encoding

    def __check_file(self, the_file, type):
        """Check to see if files exist.

        'the_file' may be a path or a readable file-like object (the
        latter is accepted as-is). Raises RtfInvalidCodeException when
        a required path is missing or does not exist.
        NOTE(review): the parameter name 'type' shadows the builtin.
        """
        if hasattr(the_file, 'read'):
            return
        if the_file is None:
            if type == "file_to_parse":
                msg = "\nYou must provide a file for the script to work"
                raise RtfInvalidCodeException(msg)
        elif os.path.exists(the_file):
            pass  # do nothing
        else:
            msg = "\nThe file '%s' cannot be found" % the_file
            raise RtfInvalidCodeException(msg)

    def __check_dir(self, the_dir):
        """Check to see if directory exists.

        Returns 1 when the directory exists, None when 'the_dir' is
        falsy; raises RtfInvalidCodeException when the path exists but
        is not a directory.
        """
        if not the_dir:
            return
        dir_exists = os.path.isdir(the_dir)
        if not dir_exists:
            msg = "\n%s is not a directory" % the_dir
            raise RtfInvalidCodeException(msg)
        return 1

    def parse_rtf(self):
        """
        Parse the file by calling on other classes.
        Requires:
            Nothing
        Returns:
            A parsed file in XML, either to standard output or to a file,
            depending on the value of 'output' when the instance was created.
            The actual return value is self.__exit_level (the highest
            return code reported by the passes).
        """
        self.__temp_file = self.__make_temp_file(self.__file)
        # if the self.__debug_dir is true, then create a copy object,
        # set the directory to write to, remove files, and copy
        # the new temporary file to this directory
        if self.__debug_dir:
            copy_obj = copy.Copy(
                bug_handler=RtfInvalidCodeException,
            )
            copy_obj.set_dir(self.__debug_dir)
            copy_obj.remove_files()
            copy_obj.copy_file(self.__temp_file, "original_file")
        # Object used by __bracket_match to check that brackets are
        # balanced after each pass (debug aid only)
        if self.__debug_dir or self.__run_level > 2:
            self.__check_brack_obj = check_brackets.CheckBrackets(
                file=self.__temp_file,
                bug_handler=RtfInvalidCodeException,
            )
        # convert Macintosh and Windows line endings to Unix line endings
        # why do this if you don't wb after?
        line_obj = line_endings.FixLineEndings(
            in_file=self.__temp_file,
            bug_handler=RtfInvalidCodeException,
            copy=self.__copy,
            run_level=self.__run_level,
            replace_illegals=self.__replace_illegals,
        )
        return_value = line_obj.fix_endings()  # calibre return what?
        self.__return_code(return_value)
        # split the raw RTF into one token per line; every later pass
        # operates on this line-oriented token stream
        tokenize_obj = tokenize.Tokenize(
            bug_handler=RtfInvalidCodeException,
            in_file=self.__temp_file,
            copy=self.__copy,
            run_level=self.__run_level)
        tokenize_obj.tokenize()
        process_tokens_obj = process_tokens.ProcessTokens(
            in_file=self.__temp_file,
            bug_handler=RtfInvalidCodeException,
            copy=self.__copy,
            run_level=self.__run_level,
            exception_handler=InvalidRtfException,
        )
        try:
            return_value = process_tokens_obj.process_tokens()
        except InvalidRtfException as msg:
            # Check to see if the file is correctly encoded
            encode_obj = default_encoding.DefaultEncoding(
                in_file=self.__temp_file,
                run_level=self.__run_level,
                bug_handler=RtfInvalidCodeException,
                check_raw=True,
                default_encoding=self.__default_encoding,
            )
            platform, code_page, default_font_num = encode_obj.find_default_encoding()
            check_encoding_obj = check_encoding.CheckEncoding(
                bug_handler=RtfInvalidCodeException,
            )
            enc = encode_obj.get_codepage()
            # TODO: to check if cp is a good idea or if I should use a dict to convert
            enc = 'cp' + enc
            msg = '%s\nException in token processing' % unicode_type(msg)
            if check_encoding_obj.check_encoding(self.__file, enc):
                file_name = self.__file if isinstance(self.__file, bytes) \
                    else self.__file.encode('utf-8')
                msg += '\nFile %s does not appear to be correctly encoded.\n' % file_name
            # best-effort cleanup of the temp file before re-raising
            try:
                os.remove(self.__temp_file)
            except OSError:
                pass
            raise InvalidRtfException(msg)
        # strip destination groups the converter cannot use
        delete_info_obj = delete_info.DeleteInfo(
            in_file=self.__temp_file,
            copy=self.__copy,
            bug_handler=RtfInvalidCodeException,
            run_level=self.__run_level,)
        # found destination means {\*\destination
        # if found, the RTF should be newer RTF
        found_destination = delete_info_obj.delete_info()
        self.__bracket_match('delete_data_info')
        # put picts in a separate file
        pict_obj = pict.Pict(
            in_file=self.__temp_file,
            bug_handler=RtfInvalidCodeException,
            copy=self.__copy,
            orig_file=self.__file,
            out_file=self.__out_file,
            run_level=self.__run_level,
        )
        pict_obj.process_pict()
        self.__bracket_match('pict_data_info')
        combine_obj = combine_borders.CombineBorders(
            in_file=self.__temp_file,
            bug_handler=RtfInvalidCodeException,
            copy=self.__copy,
            run_level=self.__run_level,)
        combine_obj.combine_borders()
        self.__bracket_match('combine_borders_info')
        # footnotes and headers are moved out of the body here and
        # joined back in near the end of the pipeline (join_footnotes /
        # join_headers below)
        footnote_obj = footnote.Footnote(
            in_file=self.__temp_file,
            bug_handler=RtfInvalidCodeException,
            copy=self.__copy,
            run_level=self.__run_level,
        )
        footnote_obj.separate_footnotes()
        self.__bracket_match('separate_footnotes_info')
        header_obj = header.Header(
            in_file=self.__temp_file,
            bug_handler=RtfInvalidCodeException,
            copy=self.__copy,
            run_level=self.__run_level,
        )
        header_obj.separate_headers()
        self.__bracket_match('separate_headers_info')
        list_numbers_obj = list_numbers.ListNumbers(
            in_file=self.__temp_file,
            bug_handler=RtfInvalidCodeException,
            copy=self.__copy,
            run_level=self.__run_level,
        )
        list_numbers_obj.fix_list_numbers()
        self.__bracket_match('list_number_info')
        preamble_div_obj = preamble_div.PreambleDiv(
            in_file=self.__temp_file,
            bug_handler=RtfInvalidCodeException,
            copy=self.__copy,
            run_level=self.__run_level,
        )
        # list_of_lists feeds make_lists.MakeLists further down
        list_of_lists = preamble_div_obj.make_preamble_divisions()
        self.__bracket_match('make_preamble_divisions')
        encode_obj = default_encoding.DefaultEncoding(
            in_file=self.__temp_file,
            run_level=self.__run_level,
            bug_handler=RtfInvalidCodeException,
            default_encoding=self.__default_encoding,
        )
        platform, code_page, default_font_num = encode_obj.find_default_encoding()
        # hex2utf_obj is reused later for the body pass (update_values)
        hex2utf_obj = hex_2_utf8.Hex2Utf8(
            in_file=self.__temp_file,
            copy=self.__copy,
            area_to_convert='preamble',
            char_file=self.__char_data,
            default_char_map=code_page,
            run_level=self.__run_level,
            bug_handler=RtfInvalidCodeException,
            invalid_rtf_handler=InvalidRtfException,
        )
        hex2utf_obj.convert_hex_2_utf8()
        self.__bracket_match('hex_2_utf_preamble')
        fonts_obj = fonts.Fonts(
            in_file=self.__temp_file,
            bug_handler=RtfInvalidCodeException,
            copy=self.__copy,
            default_font_num=default_font_num,
            run_level=self.__run_level,
        )
        special_font_dict = fonts_obj.convert_fonts()
        self.__bracket_match('fonts_info')
        color_obj = colors.Colors(
            in_file=self.__temp_file,
            copy=self.__copy,
            bug_handler=RtfInvalidCodeException,
            run_level=self.__run_level,
        )
        color_obj.convert_colors()
        self.__bracket_match('colors_info')
        style_obj = styles.Styles(
            in_file=self.__temp_file,
            bug_handler=RtfInvalidCodeException,
            copy=self.__copy,
            run_level=self.__run_level,
        )
        style_obj.convert_styles()
        self.__bracket_match('styles_info')
        info_obj = info.Info(
            in_file=self.__temp_file,
            bug_handler=RtfInvalidCodeException,
            copy=self.__copy,
            run_level=self.__run_level,
        )
        info_obj.fix_info()
        default_font = special_font_dict.get('default-font')
        preamble_rest_obj = preamble_rest.Preamble(
            file=self.__temp_file, copy=self.__copy,
            bug_handler=RtfInvalidCodeException,
            platform=platform, default_font=default_font,
            code_page=code_page)
        preamble_rest_obj.fix_preamble()
        self.__bracket_match('preamble_rest_info')
        old_rtf_obj = OldRtf(
            in_file=self.__temp_file,
            bug_handler=RtfInvalidCodeException,
            run_level=self.__run_level,
        )
        # RTF can actually have destination groups and old RTF.
        # BAH!
        old_rtf = old_rtf_obj.check_if_old_rtf()
        if old_rtf:
            if self.__run_level > 5:
                # at very high run levels, treat old RTF as fatal
                msg = 'Older RTF\n' \
                    'self.__run_level is "%s"\n' % self.__run_level
                raise RtfInvalidCodeException(msg)
            if self.__run_level > 1:
                sys.stderr.write('File could be older RTF...\n')
            if found_destination:
                if self.__run_level > 1:
                    sys.stderr.write(
                        'File also has newer RTF.\n'
                        'Will do the best to convert...\n'
                    )
            # old RTF lacks group brackets around inline control words;
            # AddBrackets inserts them so later passes can work
            add_brackets_obj = add_brackets.AddBrackets(
                in_file=self.__temp_file,
                bug_handler=RtfInvalidCodeException,
                copy=self.__copy,
                run_level=self.__run_level,
            )
            add_brackets_obj.add_brackets()
        fields_small_obj = fields_small.FieldsSmall(
            in_file=self.__temp_file,
            copy=self.__copy,
            bug_handler=RtfInvalidCodeException,
            run_level=self.__run_level,)
        fields_small_obj.fix_fields()
        self.__bracket_match('fix_small_fields_info')
        fields_large_obj = fields_large.FieldsLarge(
            in_file=self.__temp_file,
            copy=self.__copy,
            bug_handler=RtfInvalidCodeException,
            run_level=self.__run_level)
        fields_large_obj.fix_fields()
        self.__bracket_match('fix_large_fields_info')
        sections_obj = sections.Sections(
            in_file=self.__temp_file,
            bug_handler=RtfInvalidCodeException,
            copy=self.__copy,
            run_level=self.__run_level,)
        sections_obj.make_sections()
        self.__bracket_match('sections_info')
        paragraphs_obj = paragraphs.Paragraphs(
            in_file=self.__temp_file,
            bug_handler=RtfInvalidCodeException,
            copy=self.__copy,
            write_empty_para=self.__empty_paragraphs,
            run_level=self.__run_level,)
        paragraphs_obj.make_paragraphs()
        self.__bracket_match('paragraphs_info')
        # unlike the .get() above, this raises KeyError if missing
        default_font = special_font_dict['default-font']
        paragraph_def_obj = paragraph_def.ParagraphDef(
            in_file=self.__temp_file,
            bug_handler=RtfInvalidCodeException,
            copy=self.__copy,
            default_font=default_font,
            run_level=self.__run_level,)
        list_of_styles = paragraph_def_obj.make_paragraph_def()
        body_styles_obj = body_styles.BodyStyles(
            in_file=self.__temp_file,
            bug_handler=RtfInvalidCodeException,
            copy=self.__copy,
            list_of_styles=list_of_styles,
            run_level=self.__run_level,)
        body_styles_obj.insert_info()
        self.__bracket_match('body_styles_info')
        self.__bracket_match('paragraph_def_info')
        table_obj = table.Table(
            in_file=self.__temp_file,
            bug_handler=RtfInvalidCodeException,
            copy=self.__copy,
            run_level=self.__run_level,)
        table_data = table_obj.make_table()
        self.__bracket_match('table_info')
        table_info_obj = table_info.TableInfo(
            in_file=self.__temp_file,
            bug_handler=RtfInvalidCodeException,
            copy=self.__copy,
            table_data=table_data,
            run_level=self.__run_level,)
        table_info_obj.insert_info()
        self.__bracket_match('table__data_info')
        # optional passes, controlled by constructor flags
        if self.__form_lists:
            make_list_obj = make_lists.MakeLists(
                in_file=self.__temp_file,
                bug_handler=RtfInvalidCodeException,
                copy=self.__copy,
                headings_to_sections=self.__headings_to_sections,
                run_level=self.__run_level,
                list_of_lists=list_of_lists,
            )
            make_list_obj.make_lists()
            self.__bracket_match('form_lists_info')
        if self.__headings_to_sections:
            headings_to_sections_obj = headings_to_sections.HeadingsToSections(
                in_file=self.__temp_file,
                bug_handler=RtfInvalidCodeException,
                copy=self.__copy,
                run_level=self.__run_level,)
            headings_to_sections_obj.make_sections()
            self.__bracket_match('headings_to_sections_info')
        if self.__group_styles:
            group_styles_obj = group_styles.GroupStyles(
                in_file=self.__temp_file,
                bug_handler=RtfInvalidCodeException,
                copy=self.__copy,
                wrap=1,
                run_level=self.__run_level,)
            group_styles_obj.group_styles()
            self.__bracket_match('group_styles_info')
        if self.__group_borders:
            group_borders_obj = group_borders.GroupBorders(
                in_file=self.__temp_file,
                bug_handler=RtfInvalidCodeException,
                copy=self.__copy,
                wrap=1,
                run_level=self.__run_level,)
            group_borders_obj.group_borders()
            self.__bracket_match('group_borders_info')
        inline_obj = inline.Inline(
            in_file=self.__temp_file,
            bug_handler=RtfInvalidCodeException,
            copy=self.__copy,
            run_level=self.__run_level,)
        inline_obj.form_tags()
        self.__bracket_match('inline_info')
        # second hex-to-utf8 pass, this time over the body
        hex2utf_obj.update_values(file=self.__temp_file,
                                  area_to_convert='body',
                                  copy=self.__copy,
                                  char_file=self.__char_data,
                                  convert_caps=self.__convert_caps,
                                  convert_symbol=self.__convert_symbol,
                                  convert_wingdings=self.__convert_wingdings,
                                  convert_zapf=self.__convert_zapf,
                                  symbol=1,
                                  wingdings=1,
                                  dingbats=1,
                                  )
        hex2utf_obj.convert_hex_2_utf8()
        # re-attach the headers/footnotes separated out earlier
        header_obj.join_headers()
        footnote_obj.join_footnotes()
        tags_obj = convert_to_tags.ConvertToTags(
            in_file=self.__temp_file,
            copy=self.__copy,
            dtd_path=self.__dtd_path,
            indent=self.__indent,
            run_level=self.__run_level,
            no_dtd=self.__no_dtd,
            encoding=encode_obj.get_codepage(),
            bug_handler=RtfInvalidCodeException,
        )
        tags_obj.convert_to_tags()
        output_obj = output.Output(
            file=self.__temp_file,
            orig_file=self.__file,
            output_dir=self.__out_dir,
            out_file=self.__out_file,
        )
        output_obj.output()
        os.remove(self.__temp_file)
        return self.__exit_level

    def __bracket_match(self, file_name):
        """Debug helper: after a pass, verify that open/close brackets
        in the temp file still balance; on mismatch report the pass name
        ('file_name') to stderr. Only active at run_level > 2, which is
        also when self.__check_brack_obj is guaranteed to exist."""
        if self.__run_level > 2:
            good_br, msg = self.__check_brack_obj.check_brackets()
            if good_br:
                pass
                # sys.stderr.write( msg + ' in ' + file_name + "\n")
            else:
                msg = '%s in file %s' % (msg, file_name)
                print(msg, file=sys.stderr)

    def __return_code(self, num):
        """Record the highest return code reported by any pass in
        self.__exit_level; None is ignored."""
        if num is None:
            return
        if int(num) > self.__exit_level:
            self.__exit_level = num

    def __make_temp_file(self, file):
        """Make a temporary file to parse.

        'file' may be a path or a readable file-like object; its
        contents are copied to the working file whose name is returned.
        NOTE(review): the name "rtf_write_file" is fixed and created in
        the current working directory, so concurrent conversions in the
        same directory would collide -- confirm whether callers
        serialize access.
        """
        write_file = "rtf_write_file"
        read_obj = file if hasattr(file, 'read') else open_for_read(file)
        with open_for_write(write_file) as write_obj:
            for line in read_obj:
                write_obj.write(line)
        return write_file
|
||||
12
ebook_converter/ebooks/rtf2xml/__init__.py
Normal file
12
ebook_converter/ebooks/rtf2xml/__init__.py
Normal file
@@ -0,0 +1,12 @@
|
||||
from __future__ import unicode_literals, absolute_import, print_function, division
|
||||
|
||||
import io
|
||||
|
||||
|
||||
def open_for_read(path):
    """Open *path* for UTF-8 text reading, replacing undecodable bytes."""
    return io.open(path, mode='r', encoding='utf-8', errors='replace')
|
||||
|
||||
|
||||
def open_for_write(path, append=False):
    """Open *path* for UTF-8 text writing.

    Appends when *append* is true, truncates otherwise; newlines are
    passed through untranslated (newline='').
    """
    if append:
        mode = 'a'
    else:
        mode = 'w'
    return io.open(path, mode, encoding='utf-8', errors='replace', newline='')
|
||||
232
ebook_converter/ebooks/rtf2xml/add_brackets.py
Normal file
232
ebook_converter/ebooks/rtf2xml/add_brackets.py
Normal file
@@ -0,0 +1,232 @@
|
||||
from __future__ import unicode_literals, absolute_import, print_function, division
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import sys, os
|
||||
|
||||
from calibre.ebooks.rtf2xml import copy, check_brackets
|
||||
from calibre.ptempfile import better_mktemp
|
||||
from polyglot.builtins import iteritems
|
||||
from . import open_for_read, open_for_write
|
||||
|
||||
|
||||
class AddBrackets:
    """
    Add brackets for old RTF.
    Logic:
    When control words without their own brackets are encountered
    and in the list of allowed words, this will add brackets
    to facilitate the treatment of the file.

    Implemented as a line-oriented state machine over the token file:
    each line's first 16 characters (the token info) select a handler
    from self.__state_dict, and accepted inline control words are
    buffered and re-emitted inside synthetic brackets ('...<0003').
    """

    def __init__(self, in_file,
                 bug_handler,
                 copy=None,
                 run_level=1,
                 ):
        """
        Required:
            'in_file'--file to parse
            'bug_handler'--exception class raised on internal errors
        Optional:
            'copy'-- whether to make a copy of result for debugging
            'run_level' -- verbosity/debug level
        Returns:
            nothing
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        # temporary output file; renamed over self.__file on success
        self.__write_to = better_mktemp()
        self.__run_level = run_level
        # state name -> handler method for the per-line dispatch loop
        self.__state_dict = {
            'before_body': self.__before_body_func,
            'in_body': self.__in_body_func,
            'after_control_word': self.__after_control_word_func,
            'in_ignore': self.__ignore_func,
        }
        # 16-character token prefixes of inline control words that may
        # be wrapped in synthetic brackets
        self.__accept = [
            'cw<ci<bold______',
            'cw<ci<annotation',
            'cw<ci<blue______',
            # 'cw<ci<bold______' ,
            'cw<ci<caps______',
            'cw<ci<char-style',
            'cw<ci<dbl-strike',
            'cw<ci<emboss____',
            'cw<ci<engrave___',
            'cw<ci<font-color',
            'cw<ci<font-down_',
            'cw<ci<font-size_',
            'cw<ci<font-style',
            'cw<ci<font-up___',
            'cw<ci<footnot-mk',
            'cw<ci<green_____',
            'cw<ci<hidden____',
            'cw<ci<italics___',
            'cw<ci<outline___',
            'cw<ci<red_______',
            'cw<ci<shadow____',
            'cw<ci<small-caps',
            'cw<ci<strike-thr',
            'cw<ci<subscript_',
            'cw<ci<superscrip',
            'cw<ci<underlined',
            # 'cw<ul<underlined' ,
        ]

    def __initiate_values(self):
        """
        Init temp values for one run of add_brackets().
        """
        self.__state = 'before_body'
        # current inline attributes (token prefix -> value)
        self.__inline = {}
        # buffered accepted control-word lines not yet written out
        self.__temp_group = []
        # True while a synthetic '<0003' bracket is open in the output
        self.__open_bracket = False
        self.__found_brackets = False

    def __before_body_func(self, line):
        """
        If we are before the body, not interested in changing anything:
        copy lines through until the body-open marker is seen.
        """
        if self.__token_info == 'mi<mk<body-open_':
            self.__state = 'in_body'
        self.__write_obj.write(line)

    def __in_body_func(self, line):
        """
        Select what action to take in body:
        1-At the end of the file close the bracket if a bracket was opened
        This happens if there is a change
        2-If an open bracket is found the code inside is ignored
        (written without modifications)
        3-If an accepted control word is found put the line
        in a buffer then change state to after cw
        4-Else simply write the line
        """
        if line == 'cb<nu<clos-brack<0001\n' and self.__open_bracket:
            # close our synthetic bracket before the document-level one
            # NOTE(review): self.__open_bracket is not reset to False
            # here -- confirm this branch can only occur once, at the
            # end of the body.
            self.__write_obj.write(
                'cb<nu<clos-brack<0003\n'
            )
            self.__write_obj.write(line)
        elif self.__token_info == 'ob<nu<open-brack':
            self.__found_brackets = True
            self.__state = 'in_ignore'
            self.__ignore_count = self.__ob_count
            self.__write_obj.write(line)
        elif self.__token_info in self.__accept:
            self.__temp_group.append(line)
            self.__state = 'after_control_word'
        else:
            self.__write_obj.write(line)

    def __after_control_word_func(self, line):
        """
        After a cw either add next allowed cw to temporary list or
        change group and write it.
        If the token leading to an exit is an open bracket go to
        ignore otherwise go to in_body.
        """
        if self.__token_info in self.__accept:
            self.__temp_group.append(line)
        else:
            self.__change_permanent_group()
            self.__write_group()
            self.__write_obj.write(line)
            if self.__token_info == 'ob<nu<open-brack':
                self.__state = 'in_ignore'
                self.__ignore_count = self.__ob_count
            else:
                self.__state = 'in_body'

    def __write_group(self):
        """
        Write a temporary group after accepted control words end.
        But this is mostly useless in my opinion as there is no list of
        rejected cw.
        This may be a way to implement future old rtf processing for cw.
        Utility: open a group to just put brackets but why be so complicated?
        Scheme: open brackets, write cw then go to body and back with cw after.
        """
        # close any previously opened synthetic bracket first
        if self.__open_bracket:
            self.__write_obj.write(
                'cb<nu<clos-brack<0003\n'
            )
            self.__open_bracket = False

        # re-emit the active inline attributes ('false' values dropped)
        # inside a fresh synthetic bracket
        inline_string = ''.join(['%s<nu<%s\n' % (k, v)
                                 for k, v in iteritems(self.__inline)
                                 if v != 'false'])
        if inline_string:
            self.__write_obj.write('ob<nu<open-brack<0003\n'
                                   '%s' % inline_string)
            self.__open_bracket = True
        self.__temp_group = []

    def __change_permanent_group(self):
        """
        Use temp group to change permanent group.
        If the control word is not accepted remove it.
        What is the interest as it is built to accept only accepted cw
        in __after_control_word_func?
        """
        # key: 16-char token prefix, value: text after the '<nu<' field
        self.__inline = {line[:16]: line[20:-1]
                         for line in self.__temp_group
                         # Is this really necessary?
                         if line[:16] in self.__accept}

    def __ignore_func(self, line):
        """
        Just copy data inside of RTF brackets already here.
        Leaves ignore mode when the close bracket matching the recorded
        open-bracket count is reached.
        """
        self.__write_obj.write(line)
        if self.__token_info == 'cb<nu<clos-brack'\
                and self.__cb_count == self.__ignore_count:
            self.__state = 'in_body'

    def __check_brackets(self, in_file):
        """
        Return True if brackets match.
        """
        check_brack_obj = check_brackets.CheckBrackets(file=in_file)
        return check_brack_obj.check_brackets()[0]

    def add_brackets(self):
        """
        Run the state machine over the whole file, writing the result
        to a temp file; if the rewritten file's brackets balance,
        replace the original, otherwise keep the original and warn.
        """
        self.__initiate_values()
        with open_for_read(self.__file) as read_obj:
            with open_for_write(self.__write_to) as self.__write_obj:
                for line in read_obj:
                    self.__token_info = line[:16]
                    # the 4-digit counters identify matching bracket pairs
                    if self.__token_info == 'ob<nu<open-brack':
                        self.__ob_count = line[-5:-1]
                    if self.__token_info == 'cb<nu<clos-brack':
                        self.__cb_count = line[-5:-1]
                    action = self.__state_dict.get(self.__state)
                    if action is None:
                        sys.stderr.write(
                            'No matching state in module add_brackets.py\n'
                            '%s\n' % self.__state)
                    action(line)
        # Check bad brackets
        if self.__check_brackets(self.__write_to):
            copy_obj = copy.Copy(bug_handler=self.__bug_handler)
            if self.__copy:
                copy_obj.copy_file(self.__write_to, "add_brackets.data")
            copy_obj.rename(self.__write_to, self.__file)
        else:
            # mismatched result: discard the rewrite, keep the original
            if self.__run_level > 0:
                sys.stderr.write(
                    'Sorry, but this files has a mix of old and new RTF.\n'
                    'Some characteristics cannot be converted.\n')
        os.remove(self.__write_to)
|
||||
84
ebook_converter/ebooks/rtf2xml/body_styles.py
Normal file
84
ebook_converter/ebooks/rtf2xml/body_styles.py
Normal file
@@ -0,0 +1,84 @@
|
||||
from __future__ import unicode_literals, absolute_import, print_function, division
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import os
|
||||
from calibre.ebooks.rtf2xml import copy
|
||||
from calibre.ptempfile import better_mktemp
|
||||
from . import open_for_read, open_for_write
|
||||
|
||||
"""
|
||||
Simply write the list of strings after style table
|
||||
"""
|
||||
|
||||
|
||||
class BodyStyles:
    """
    Insert the collected paragraph-style strings into the document body.

    Scans the token file for the close of the style table and writes the
    style strings gathered earlier (``list_of_styles``) immediately after
    it, wrapped in ``styles-in-body`` open/close tags, then replaces the
    original file with the rewritten one.
    """

    def __init__(self,
                 in_file,
                 list_of_styles,
                 bug_handler,
                 copy=None,
                 run_level=1,):
        """
        Required:
            'in_file'--file to parse
            'list_of_styles' -- style strings to insert after the style
            table.
        Optional:
            'copy'-- whether to make a copy of result for debugging
            'run_level' -- verbosity/debug level
        Returns:
            nothing
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__list_of_styles = list_of_styles
        self.__run_level = run_level
        # temporary output file; renamed over self.__file at the end
        self.__write_to = better_mktemp()
        # self.__write_to = 'table_info.data'

    def insert_info(self):
        """
        Rewrite the file, inserting the body styles right after the
        close of the style table, then replace the original file.

        Raises self.__bug_handler when run_level > 3 and no styles were
        collected.
        """
        # Context managers close both handles even when bug_handler is
        # raised mid-loop (the original readline loop leaked them).
        with open_for_read(self.__file) as read_obj:
            with open_for_write(self.__write_to) as write_obj:
                for line in read_obj:
                    if line == 'mi<tg<close_____<style-table\n':
                        if self.__list_of_styles:
                            write_obj.write(
                                'mi<tg<open______<styles-in-body\n')
                            write_obj.write(''.join(self.__list_of_styles))
                            write_obj.write(
                                'mi<tg<close_____<styles-in-body\n')
                        elif self.__run_level > 3:
                            # this shouldn't happen!
                            msg = 'Not enough data for each table\n'
                            raise self.__bug_handler(msg)
                    # why was this line even here?
                    # self.__write_obj.write('mi<tg<open______<table\n')
                    write_obj.write(line)
        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "body_styles.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)
|
||||
191
ebook_converter/ebooks/rtf2xml/border_parse.py
Normal file
191
ebook_converter/ebooks/rtf2xml/border_parse.py
Normal file
@@ -0,0 +1,191 @@
|
||||
from __future__ import unicode_literals, absolute_import, print_function, division
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import sys
|
||||
|
||||
|
||||
class BorderParse:
    """
    Parse a border line and return a dictionary of attributes and values
    """

    # Style names checked in priority order by __determine_styles; the
    # first one found in the collected style list wins.  Each entry maps
    # the internal style name to the value written into the result dict.
    # NOTE: the original elif chain tested for 'engraved', but the style
    # table maps 'bdr-engra_' to 'engrave', so that branch could never
    # match; it also repeated the 'thick-thin-small' test.  Both fixed.
    _style_priority = (
        ('shadowed-border', 'shadowed'),
        ('engrave', 'engrave'),
        ('emboss', 'emboss'),
        ('striped', 'striped'),
        ('thin-thick-thin-small', 'thin-thick-thin-small'),
        ('thick-thin-large', 'thick-thin-large'),
        ('thin-thick-thin-medium', 'thin-thick-thin-medium'),
        ('thin-thick-medium', 'thin-thick-medium'),
        ('thick-thin-medium', 'thick-thin-medium'),
        ('thick-thin-small', 'thick-thin-small'),
        ('double-wavy', 'double-wavy'),
        ('dot-dot-dash', 'dot-dot-dash'),
        ('dot-dash', 'dot-dash'),
        ('dotted-border', 'dotted'),
        ('wavy', 'wavy'),
        ('dash-small', 'dash-small'),
        ('dashed', 'dashed'),
        ('frame', 'frame'),
        ('inset', 'inset'),
        ('outset', 'outset'),
        ('tripple-border', 'tripple'),
        ('double-border', 'double'),
        ('double-thickness-border', 'double-thickness'),
        ('hairline', 'hairline'),
        ('single', 'single'),
    )

    def __init__(self):
        # cw<bd<bor-t-r-hi<nu<true
        # maps the 10-char border token to a readable border name, and
        # non-style border attributes to their readable names
        self.__border_dict = {
            'bor-t-r-hi' : 'border-table-row-horizontal-inside',
            'bor-t-r-vi' : 'border-table-row-vertical-inside',
            'bor-t-r-to' : 'border-table-row-top',
            'bor-t-r-le' : 'border-table-row-left',
            'bor-t-r-bo' : 'border-table-row-bottom',
            'bor-t-r-ri' : 'border-table-row-right',
            'bor-cel-bo' : 'border-cell-bottom',
            'bor-cel-to' : 'border-cell-top',
            'bor-cel-le' : 'border-cell-left',
            'bor-cel-ri' : 'border-cell-right',
            'bor-par-bo' : 'border-paragraph-bottom',
            'bor-par-to' : 'border-paragraph-top',
            'bor-par-le' : 'border-paragraph-left',
            'bor-par-ri' : 'border-paragraph-right',
            'bor-par-bx' : 'border-paragraph-box',
            'bor-for-ev' : 'border-for-every-paragraph',
            'bor-outsid' : 'border-outside',
            'bor-none__' : 'border',
            # border type => bt
            'bdr-li-wid' : 'line-width',
            'bdr-sp-wid' : 'padding',
            'bdr-color_' : 'color',
        }
        # maps border-style tokens to readable style names
        self.__border_style_dict = {
            'bdr-single' : 'single',
            'bdr-doubtb' : 'double-thickness-border',
            'bdr-shadow' : 'shadowed-border',
            'bdr-double' : 'double-border',
            'bdr-dotted' : 'dotted-border',
            'bdr-dashed' : 'dashed',
            'bdr-hair__' : 'hairline',
            'bdr-inset_' : 'inset',
            'bdr-das-sm' : 'dash-small',
            'bdr-dot-sm' : 'dot-dash',
            'bdr-dot-do' : 'dot-dot-dash',
            'bdr-outset' : 'outset',
            'bdr-trippl' : 'tripple',
            'bdr-thsm__' : 'thick-thin-small',
            'bdr-htsm__' : 'thin-thick-small',
            'bdr-hthsm_' : 'thin-thick-thin-small',
            'bdr-thm___' : 'thick-thin-medium',
            'bdr-htm___' : 'thin-thick-medium',
            'bdr-hthm__' : 'thin-thick-thin-medium',
            'bdr-thl___' : 'thick-thin-large',
            'bdr-hthl__' : 'thin-thick-thin-large',
            'bdr-wavy__' : 'wavy',
            'bdr-d-wav_' : 'double-wavy',
            'bdr-strip_' : 'striped',
            'bdr-embos_' : 'emboss',
            'bdr-engra_' : 'engrave',
            'bdr-frame_' : 'frame',
        }

    def parse_border(self, line):
        """
        Requires:
            line -- line with border definition in it
        Returns:
            A dict mapping readable border attribute names to values
            (empty when the border token is unrecognised).
        Logic:
            The border position is in line[6:16]; the '|'-separated
            attribute list follows at line[20:-1].  Style attributes are
            collected and resolved to a single '<type>-style' entry by
            __determine_styles; all other attributes are copied through
            with the border type prefixed.
        """
        border_dict = {}
        border_style_dict = {}
        border_style_list = []
        border_type = self.__border_dict.get(line[6:16])
        if not border_type:
            sys.stderr.write(
                'module is border_parse.py\n'
                'function is parse_border\n'
                'token does not have a dictionary value\n'
                'token is "%s"' % line
            )
            return border_dict
        att_line = line[20:-1]
        atts = att_line.split('|')
        # cw<bd<bor-cel-ri<nu<
        # border has no value--should be no lines
        if len(atts) == 1 and atts[0] == '':
            border_dict[border_type] = 'none'
            return border_dict
        # border-paragraph-right
        for att in atts:
            values = att.split(':')
            if len(values) == 2:
                att = values[0]
                value = values[1]
            else:
                # bare attribute with no ':' -- treated as a boolean flag
                value = 'true'
            style_att = self.__border_style_dict.get(att)
            if style_att:
                att = '%s-%s' % (border_type, att)
                border_style_dict[att] = value
                border_style_list.append(style_att)
            else:
                att = self.__border_dict.get(att)
                if not att:
                    # unknown attribute: warn, but keep the original
                    # behaviour of emitting a '<type>-None' key
                    sys.stderr.write(
                        'module is border_parse_def.py\n'
                        'function is parse_border\n'
                        'token does not have an att value\n'
                        'line is "%s"' % line
                    )
                att = '%s-%s' % (border_type, att)
                border_dict[att] = value
        new_border_dict = self.__determine_styles(border_type, border_style_list)
        border_dict.update(new_border_dict)
        return border_dict

    def __determine_styles(self, border_type, border_style_list):
        """
        Resolve the collected style names to a single '<border_type>-style'
        entry, choosing by the fixed priority in _style_priority.  Unknown
        styles fall back to the first one collected.
        """
        new_border_dict = {}
        att = '%s-style' % border_type
        for style_name, value in self._style_priority:
            if style_name in border_style_list:
                new_border_dict[att] = value
                return new_border_dict
        if border_style_list:
            new_border_dict[att] = border_style_list[0]
        return new_border_dict
|
||||
16709
ebook_converter/ebooks/rtf2xml/char_set.py
Normal file
16709
ebook_converter/ebooks/rtf2xml/char_set.py
Normal file
File diff suppressed because it is too large
Load Diff
62
ebook_converter/ebooks/rtf2xml/check_brackets.py
Normal file
62
ebook_converter/ebooks/rtf2xml/check_brackets.py
Normal file
@@ -0,0 +1,62 @@
|
||||
from __future__ import unicode_literals, absolute_import, print_function, division
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
|
||||
|
||||
from . import open_for_read
|
||||
|
||||
|
||||
class CheckBrackets:
    """Check that open/close bracket tokens in the token file match up."""

    def __init__(self, bug_handler=None, file=None):
        self.__file = file
        self.__bug_handler = bug_handler
        self.__bracket_count = 0
        self.__ob_count = 0
        self.__cb_count = 0
        # stack of the 4-digit ids of currently-open brackets
        self.__open_bracket_num = []

    def open_brack(self, line):
        """Push the open-bracket token's 4-digit id onto the stack."""
        num = line[-5:-1]
        self.__open_bracket_num.append(num)
        self.__bracket_count += 1

    def close_brack(self, line):
        """
        Pop the most recent open-bracket id and compare it with this
        close-bracket token's id.  Return False when the stack is empty
        (more closes than opens) or the ids differ; True otherwise.
        """
        num = line[-5:-1]
        try:
            last_num = self.__open_bracket_num.pop()
        except IndexError:
            # was a bare 'except:' -- only an empty stack can raise here
            return False
        if num != last_num:
            return False
        self.__bracket_count -= 1
        return True

    def check_brackets(self):
        """
        Scan the whole token file and verify bracket pairing.
        Returns a (bool, message) tuple.
        """
        line_count = 0
        with open_for_read(self.__file) as read_obj:
            for line in read_obj:
                line_count += 1
                self.__token_info = line[:16]
                if self.__token_info == 'ob<nu<open-brack':
                    self.open_brack(line)
                if self.__token_info == 'cb<nu<clos-brack':
                    if not self.close_brack(line):
                        return (False, "closed bracket doesn't match, line %s" % line_count)
        if self.__bracket_count != 0:
            msg = ('At end of file open and closed brackets don\'t match\n'
                'total number of brackets is %s') % self.__bracket_count
            return (False, msg)
        return (True, "Brackets match!")
|
||||
42
ebook_converter/ebooks/rtf2xml/check_encoding.py
Normal file
42
ebook_converter/ebooks/rtf2xml/check_encoding.py
Normal file
@@ -0,0 +1,42 @@
|
||||
#!/usr/bin/env python2
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
import sys
|
||||
|
||||
from polyglot.builtins import unicode_type
|
||||
|
||||
|
||||
class CheckEncoding:
    """Verify that a file's bytes decode cleanly with a given encoding."""

    def __init__(self, bug_handler):
        self.__bug_handler = bug_handler

    def __get_position_error(self, line, encoding, line_num):
        """
        Report to stderr the 1-based position of every byte in *line*
        (a bytes object) that fails to decode with *encoding*.
        """
        # Iterate with 1-byte slices: indexing a bytes object yields an
        # int, which has no .decode (the original 'for char in line'
        # crashed with AttributeError before reaching the except clause).
        for char_position in range(1, len(line) + 1):
            try:
                line[char_position - 1:char_position].decode(encoding)
            except ValueError as msg:
                sys.stderr.write('line: %s char: %s\n%s\n' % (
                    line_num, char_position, msg))

    def check_encoding(self, path, encoding='us-ascii', verbose=True):
        """
        Return True when *path* contains bytes that are invalid in
        *encoding*; False when the whole file decodes cleanly.  With
        verbose=True the offending positions are written to stderr.
        """
        line_num = 0
        with open(path, 'rb') as read_obj:
            for line in read_obj:
                line_num += 1
                try:
                    # UnicodeDecodeError is a ValueError subclass
                    line.decode(encoding)
                except ValueError:
                    if verbose:
                        if len(line) < 1000:
                            self.__get_position_error(line, encoding, line_num)
                        else:
                            sys.stderr.write('line: %d has bad encoding\n' % line_num)
                    return True
        return False
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # CheckEncoding.__init__ requires a bug_handler argument; the original
    # called CheckEncoding() with no arguments, which raises TypeError.
    # Pass None for standalone command-line use.
    check_encoding_obj = CheckEncoding(None)
    check_encoding_obj.check_encoding(sys.argv[1])
|
||||
258
ebook_converter/ebooks/rtf2xml/colors.py
Normal file
258
ebook_converter/ebooks/rtf2xml/colors.py
Normal file
@@ -0,0 +1,258 @@
|
||||
from __future__ import unicode_literals, absolute_import, print_function, division
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import sys, os, re
|
||||
|
||||
from calibre.ebooks.rtf2xml import copy
|
||||
from calibre.ptempfile import better_mktemp
|
||||
from . import open_for_read, open_for_write
|
||||
|
||||
|
||||
class Colors:
    """
    Change lines with color info from color numbers to the actual color names.

    Three-phase pass over the token file: scan for the color table, build a
    number -> '#rrggbb' dictionary from it, then rewrite later font-color
    and border-color tokens using that dictionary.
    """

    def __init__(self,
            in_file,
            bug_handler,
            copy=None,
            run_level=1
            ):
        """
        Required:
            'file'--file to parse
        Optional:
            'copy'-- whether to make a copy of result for debugging
            'temp_dir' --where to output temporary results (default is
            directory from which the script is run.)
        Returns:
            nothing
        """
        self.__file = in_file
        self.__copy = copy
        self.__bug_handler = bug_handler
        # current input line number; used only in error messages
        self.__line = 0
        self.__write_to = better_mktemp()
        self.__run_level = run_level

    def __initiate_values(self):
        """
        Initiate all values.
        """
        # maps color-table index (int) -> '#rrggbb' string
        self.__color_dict = {}
        self.__state = 'before_color_table'
        # handlers keyed by state name (outer dispatch) and by token info
        # (inner dispatch while inside the color table)
        self.__state_dict = {
            'before_color_table': self.__before_color_func,
            'in_color_table'    : self.__in_color_func,
            'after_color_table' : self.__after_color_func,
            'cw<ci<red_______'  : self.__default_color_func,
            'cw<ci<green_____'  : self.__default_color_func,
            'cw<ci<blue______'  : self.__blue_func,
            'tx<nu<__________'  : self.__do_nothing_func,
        }
        # accumulates '#rrggbb'; red and green tokens append two hex
        # digits each, the blue token completes and flushes the string
        self.__color_string = '#'
        self.__color_num = 1
        self.__line_color_exp = re.compile(r'bdr-color_:(\d+)')
        # cw<bd<bor-par-to<nu<bdr-hair__|bdr-li-wid:0.50|bdr-sp-wid:1.00|bdr-color_:2

    def __before_color_func(self, line):
        """
        Requires:
            line
        Returns:
            nothing
        Logic:
            Check to see if the line marks the beginning of the color table.
            If so, change states.
            Always print out the line.
        """
        # mi<mk<clrtbl-beg
        if self.__token_info == 'mi<mk<clrtbl-beg':
            self.__state = 'in_color_table'
        self.__write_obj.write(line)

    def __default_color_func(self, line):
        """
        Requires:
            line
        Returns:
            nothing
        Logic:
            get the hex number from the line and add it to the color string.
        """
        # the two hex digits sit at line[-3:-1], just before the newline
        hex_num = line[-3:-1]
        self.__color_string += hex_num

    def __blue_func(self, line):
        """
        Requires:
            line
        Returns:
            nothing
        Logic:
            Get the hex number from the line and add it to the color string.
            Add a key -> value pair to the color dictionary, with the number
            as the key, and the hex number as the value. Write an empty tag
            with the hex number and number as attributes. Add one to the color
            number. Reset the color string to '#'
        """
        hex_num = line[-3:-1]
        self.__color_string += hex_num
        self.__color_dict[self.__color_num] = self.__color_string
        self.__write_obj.write(
            'mi<tg<empty-att_'
            '<color-in-table<num>%s<value>%s\n' % (self.__color_num, self.__color_string)
        )
        self.__color_num += 1
        self.__color_string = '#'

    def __in_color_func(self, line):
        """
        Requires:
            line
        Returns:
            nothing
        Logic:
            Check if the end of the color table has been reached. If so,
            change the state to after the color table.
            Othewise, get a function by passing the self.__token_info to the
            state dictionary.
        """
        # mi<mk<clrtbl-beg
        # cw<ci<red_______<nu<00
        if self.__token_info == 'mi<mk<clrtbl-end':
            self.__state = 'after_color_table'
        else:
            action = self.__state_dict.get(self.__token_info)
            if action is None:
                sys.stderr.write('in module colors.py\n'
                    'function is self.__in_color_func\n'
                    'no action for %s' % self.__token_info
                )
            # NOTE(review): if action is None the warning above is not a
            # guard -- the call below raises TypeError on unknown tokens.
            action(line)

    def __after_color_func(self, line):
        """
        Check the to see if it contains color info. If it does, extract the
        number and look up the hex value in the color dictionary. If the color
        dictionary has no key for the number, print out an error message.
        Otherwise, print out the line.
        Added Oct 10, 2003
        If the number is 0, that indicates no color
        """
        # cw<ci<font-color<nu<2
        if self.__token_info == 'cw<ci<font-color':
            hex_num = int(line[20:-1])
            hex_num = self.__figure_num(hex_num)
            if hex_num:
                self.__write_obj.write(
                    'cw<ci<font-color<nu<%s\n' % hex_num
                )
        elif line[0:5] == 'cw<bd':
            # a border token: substitute any bdr-color_ index in place
            the_index = line.find('bdr-color_')
            if the_index > -1:
                line = re.sub(self.__line_color_exp, self.__sub_from_line_color, line)
            self.__write_obj.write(line)
            # NOTE: the triple-quoted block below is dead code kept from an
            # earlier revision; it is a no-op string statement.
            """
            if num == 0:
                hex_num = 'false'
            else:
                hex_num = self.__color_dict.get(num)
                if hex_num == None:
                    if self.__run_level > 0:
                        sys.stderr.write(
                            'module is colors.py\n'
                            'function is self.__after_color_func\n'
                            'no value in self.__color_dict for key %s\n' % num
                        )
                    if self.__run_level > 3:
                        sys.stderr.write(
                            'run level is %s\n'
                            'Script will now quit\n'
                            % self.__run_level)
                else:
                    self.__write_obj.write(
                        'cw<ci<font-color<nu<%s\n' % hex_num
                    )
            """
        else:
            self.__write_obj.write(line)
        # cw<bd<bor-par-to<nu<bdr-hair__|bdr-li-wid:0.50|bdr-sp-wid:1.00|bdr-color_:2

    def __sub_from_line_color(self, match_obj):
        # re.sub callback: replace the captured color index with its hex value
        num = match_obj.group(1)
        try:
            num = int(num)
        except ValueError:
            if self.__run_level > 3:
                msg = 'can\'t make integer from string\n'
                raise self.__bug_handler(msg)
            else:
                return 'bdr-color_:no-value'
        hex_num = self.__figure_num(num)
        return 'bdr-color_:%s' % hex_num

    def __figure_num(self, num):
        """
        Map a color-table index to its hex string; 0 means "no color"
        ('false'), an unknown index falls back to '0' (or raises when the
        run level demands strictness).
        """
        if num == 0:
            hex_num = 'false'
        else:
            hex_num = self.__color_dict.get(num)
        if hex_num is None:
            hex_num = '0'
            if self.__run_level > 3:
                msg = 'no value in self.__color_dict' \
                    'for key %s at line %d\n' % (num, self.__line)
                raise self.__bug_handler(msg)
        return hex_num

    def __do_nothing_func(self, line):
        """
        Bad RTF will have text in the color table
        """
        pass

    def convert_colors(self):
        """
        Requires:
            nothing
        Returns:
            nothing (changes the original file)
        Logic:
            Read one line in at a time. Determine what action to take based on
            the state. If the state is before the color table, look for the
            beginning of the color table.
            If the state is in the color table, create the color dictionary
            and print out the tags.
            If the state if afer the color table, look for lines with color
            info, and substitute the number with the hex number.
        """
        self.__initiate_values()
        with open_for_read(self.__file) as read_obj:
            with open_for_write(self.__write_to) as self.__write_obj:
                for line in read_obj:
                    self.__line+=1
                    self.__token_info = line[:16]
                    action = self.__state_dict.get(self.__state)
                    if action is None:
                        # NOTE(review): message says fonts.py but this is
                        # colors.py; also, action(line) below still raises
                        # TypeError when the state lookup failed.
                        try:
                            sys.stderr.write('no matching state in module fonts.py\n')
                            sys.stderr.write(self.__state + '\n')
                        except:
                            pass
                    action(line)
        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "color.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)
|
||||
93
ebook_converter/ebooks/rtf2xml/combine_borders.py
Normal file
93
ebook_converter/ebooks/rtf2xml/combine_borders.py
Normal file
@@ -0,0 +1,93 @@
|
||||
from __future__ import unicode_literals, absolute_import, print_function, division
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import os
|
||||
|
||||
from calibre.ebooks.rtf2xml import copy
|
||||
from calibre.ptempfile import better_mktemp
|
||||
from . import open_for_read, open_for_write
|
||||
|
||||
|
||||
class CombineBorders:
    """Combine consecutive border tokens into one combined border token."""

    def __init__(self,
            in_file ,
            bug_handler,
            copy=None,
            run_level=1,
            ):
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__write_to = better_mktemp()
        self.__state = 'default'
        self.__bord_pos = 'default'
        self.__bord_att = []

    def found_bd(self, line):
        """Enter border-collecting state and remember which border it is."""
        # cw<bd<bor-t-r-vi
        self.__state = 'border'
        self.__bord_pos = line[6:16]

    def __default_func(self, line):
        """Outside a border: pass the line through unless it opens one."""
        # cw<bd<bor-t-r-vi
        if self.__first_five != 'cw<bd':
            return line
        self.found_bd(line)
        return ''

    def end_border(self, line, write_obj):
        """Flush the collected attributes as a single border token, then
        handle the line that ended the run (it may itself start a border)."""
        joined = "|".join(self.__bord_att)
        self.__bord_att = []
        write_obj.write('cw<bd<%s<nu<%s\n' % (self.__bord_pos, joined))
        self.__state = 'default'
        self.__bord_string = ''
        if self.__first_five == 'cw<bd':
            self.found_bd(line)
        else:
            write_obj.write(line)

    def add_to_border_desc(self, line):
        """Append one border attribute ('name' or 'name:value') to the
        pending list; a value of 'true' is treated as a bare flag."""
        # cw<bt<bdr-hair__<nu<true
        # cw<bt<bdr-linew<nu<0.50
        desc = line[6:16]
        num = line[20:-1]
        suffix = '' if num == 'true' else ':' + num
        self.__bord_att.append(desc + suffix)

    def __border_func(self, line, write_obj):
        """Inside a border run: collect 'cw<bt' attribute lines, anything
        else terminates the run."""
        if self.__first_five == 'cw<bt':
            self.add_to_border_desc(line)
        else:
            self.end_border(line, write_obj)

    def combine_borders(self):
        """Rewrite the token file, merging border tokens, then move the
        result back over the original file."""
        with open_for_read(self.__file) as read_obj:
            with open_for_write(self.__write_to) as write_obj:
                for line in read_obj:
                    self.__first_five = line[0:5]
                    if self.__state == 'border':
                        self.__border_func(line, write_obj)
                    else:
                        write_obj.write(self.__default_func(line))
        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "combine_borders.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)
|
||||
284
ebook_converter/ebooks/rtf2xml/convert_to_tags.py
Normal file
284
ebook_converter/ebooks/rtf2xml/convert_to_tags.py
Normal file
@@ -0,0 +1,284 @@
|
||||
from __future__ import unicode_literals, absolute_import, print_function, division
|
||||
import os, sys
|
||||
|
||||
from calibre.ebooks.rtf2xml import copy, check_encoding
|
||||
from calibre.ptempfile import better_mktemp
|
||||
from . import open_for_read, open_for_write
|
||||
|
||||
public_dtd = 'rtf2xml1.0.dtd'
|
||||
|
||||
|
||||
class ConvertToTags:
|
||||
"""
|
||||
Convert file to XML
|
||||
"""
|
||||
|
||||
    def __init__(self,
            in_file,
            bug_handler,
            dtd_path,
            no_dtd,
            encoding,
            indent=None,
            copy=None,
            run_level=1,
            ):
        """
        Required:
            'file'
        Optional:
            'copy'-- whether to make a copy of result for debugging
            'temp_dir' --where to output temporary results (default is
            directory from which the script is run.)
        Returns:
            nothing
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__dtd_path = dtd_path
        self.__no_dtd = no_dtd
        # 'encoding' is a Windows codepage number as a string (e.g. '1252');
        # prefixing 'cp' yields the Python codec name ('cp1252')
        self.__encoding = 'cp' + encoding
        # if encoding == 'mac_roman':
        #     self.__encoding = 'mac_roman'
        self.__indent = indent
        self.__run_level = run_level
        self.__write_to = better_mktemp()
        # set by __write_dec when the output must be re-encoded as UTF-8
        self.__convert_utf = False
        # set by __write_dec when neither ascii nor the codepage fits
        self.__bad_encoding = False
|
||||
|
||||
def __initiate_values(self):
|
||||
"""
|
||||
Set values, including those for the dictionary.
|
||||
"""
|
||||
self.__state = 'default'
|
||||
self.__new_line = 0
|
||||
self.__block = ('doc', 'preamble', 'rtf-definition', 'font-table',
|
||||
'font-in-table', 'color-table', 'color-in-table', 'style-sheet',
|
||||
'paragraph-styles', 'paragraph-style-in-table', 'character-styles',
|
||||
'character-style-in-table', 'list-table', 'doc-information', 'title',
|
||||
'author', 'operator', 'creation-time', 'revision-time',
|
||||
'editing-time', 'time', 'number-of-pages', 'number-of-words',
|
||||
'number-of-characters', 'page-definition', 'section-definition',
|
||||
'headers-and-footers', 'section', 'para', 'body',
|
||||
'paragraph-definition', 'cell', 'row', 'table', 'revision-table',
|
||||
'style-group', 'border-group','styles-in-body', 'paragraph-style-in-body',
|
||||
'list-in-table', 'level-in-table', 'override-table','override-list',
|
||||
)
|
||||
self.__two_new_line = ('section', 'body', 'table', 'row' 'list-table')
|
||||
self.__state_dict = {
|
||||
'default' : self.__default_func,
|
||||
'mi<tg<open______' : self.__open_func,
|
||||
'mi<tg<close_____' : self.__close_func,
|
||||
'mi<tg<open-att__' : self.__open_att_func,
|
||||
'mi<tg<empty-att_' : self.__empty_att_func,
|
||||
'tx<nu<__________' : self.__text_func,
|
||||
'tx<ut<__________' : self.__text_func,
|
||||
'mi<tg<empty_____' : self.__empty_func,
|
||||
}
|
||||
|
||||
    def __open_func(self, line):
        """
        Print the opening tag and newlines when needed.
        """
        # mi<tg<open______<style-sheet
        # element name follows the fixed 17-char token prefix
        info = line[17:-1]
        self.__new_line = 0
        if info in self.__block:
            self.__write_new_line()
        if info in self.__two_new_line:
            self.__write_extra_new_line()
        self.__write_obj.write('<%s>' % info)
|
||||
|
||||
    def __empty_func(self, line):
        """
        Print out empty tag and newlines when needed.
        """
        # mi<tg<empty_____<element-name
        info = line[17:-1]
        self.__write_obj.write(
            '<%s/>' % info)
        self.__new_line = 0
        if info in self.__block:
            self.__write_new_line()
        if info in self.__two_new_line:
            self.__write_extra_new_line()
|
||||
|
||||
def __open_att_func(self, line):
|
||||
"""
|
||||
Process lines for open tags that have attributes.
|
||||
The important info is between [17:-1]. Take this info and split it
|
||||
with the delimeter '<'. The first token in this group is the element
|
||||
name. The rest are attributes, separated fromt their values by '>'. So
|
||||
read each token one at a time, and split them by '>'.
|
||||
"""
|
||||
# mi<tg<open-att__<footnote<num>
|
||||
info = line[17:-1]
|
||||
tokens = info.split("<")
|
||||
element_name = tokens[0]
|
||||
tokens = tokens[1:]
|
||||
self.__write_obj.write('<%s' % element_name)
|
||||
for token in tokens:
|
||||
groups = token.split('>')
|
||||
try:
|
||||
val = groups[0]
|
||||
att = groups[1]
|
||||
att = att.replace('"', '"')
|
||||
att = att.replace("'", '"')
|
||||
self.__write_obj.write(
|
||||
' %s="%s"' % (val, att)
|
||||
)
|
||||
except:
|
||||
if self.__run_level > 3:
|
||||
msg = 'index out of range\n'
|
||||
raise self.__bug_handler(msg)
|
||||
self.__write_obj.write('>')
|
||||
self.__new_line = 0
|
||||
if element_name in self.__block:
|
||||
self.__write_new_line()
|
||||
if element_name in self.__two_new_line:
|
||||
self.__write_extra_new_line()
|
||||
|
||||
def __empty_att_func(self, line):
|
||||
"""
|
||||
Same as the __open_att_func, except a '/' is placed at the end of the tag.
|
||||
"""
|
||||
# mi<tg<open-att__<footnote<num>
|
||||
info = line[17:-1]
|
||||
tokens = info.split("<")
|
||||
element_name = tokens[0]
|
||||
tokens = tokens[1:]
|
||||
self.__write_obj.write('<%s' % element_name)
|
||||
for token in tokens:
|
||||
groups = token.split('>')
|
||||
val = groups[0]
|
||||
att = groups[1]
|
||||
att = att.replace('"', '"')
|
||||
att = att.replace("'", '"')
|
||||
self.__write_obj.write(
|
||||
' %s="%s"' % (val, att))
|
||||
self.__write_obj.write('/>')
|
||||
self.__new_line = 0
|
||||
if element_name in self.__block:
|
||||
self.__write_new_line()
|
||||
if element_name in self.__two_new_line:
|
||||
self.__write_extra_new_line()
|
||||
|
||||
    def __close_func(self, line):
        """
        Print out the closed tag and new lines, if appropriate.
        """
        # mi<tg<close_____<style-sheet\n
        info = line[17:-1]
        self.__write_obj.write(
            '</%s>' % info)
        self.__new_line = 0
        if info in self.__block:
            self.__write_new_line()
        if info in self.__two_new_line:
            self.__write_extra_new_line()
|
||||
|
||||
    def __text_func(self, line):
        """
        Simply print out the information between [17:-1]
        """
        # tx<nu<__________<Normal;
        # change this!
        # NOTE(review): the text is written without XML-escaping here;
        # presumably earlier passes already escaped &, <, > -- confirm.
        self.__write_obj.write(line[17:-1])
|
||||
|
||||
    def __write_extra_new_line(self):
        """
        Print out extra new lines if the new lines have not exceeded two. If
        the new lines are greater than two, do nothing.
        """
        if not self.__indent:
            # pretty-printing is off: never emit layout newlines
            return
        if self.__new_line < 2:
            self.__write_obj.write('\n')
|
||||
|
||||
    def __default_func(self, line):
        """Fallback handler: tokens with no matching handler are dropped."""
        pass
|
||||
|
||||
    def __write_new_line(self):
        """
        Print out a new line if a new line has not already been printed out.
        """
        if not self.__indent:
            # pretty-printing is off: never emit layout newlines
            return
        if not self.__new_line:
            self.__write_obj.write('\n')
            self.__new_line += 1
|
||||
|
||||
def __write_dec(self):
    """
    Write the XML declaration (and optionally a DOCTYPE) at the top of
    the document, choosing the declared encoding from what
    check_encoding reports about the input file.
    """
    # keep maximum compatibility with previous version
    check_encoding_obj = check_encoding.CheckEncoding(
            bug_handler=self.__bug_handler)

    # First call presumably tests whether the file is plain ASCII; the
    # second whether it is valid in self.__encoding -- TODO confirm
    # against check_encoding.CheckEncoding.
    if not check_encoding_obj.check_encoding(self.__file, verbose=False):
        self.__write_obj.write('<?xml version="1.0" encoding="US-ASCII" ?>')
    elif not check_encoding_obj.check_encoding(self.__file, self.__encoding, verbose=False):
        # Declared UTF-8; convert_to_tags() later re-copies the file to
        # normalise the encoding when this flag is set.
        self.__write_obj.write('<?xml version="1.0" encoding="UTF-8" ?>')
        self.__convert_utf = True
    else:
        # Neither check passed: fall back to ASCII and flag the input.
        self.__write_obj.write('<?xml version="1.0" encoding="US-ASCII" ?>')
        sys.stderr.write('Bad RTF encoding, revert to US-ASCII chars and'
                ' hope for the best')
        self.__bad_encoding = True
    self.__new_line = 0
    self.__write_new_line()
    if self.__no_dtd:
        pass
    elif self.__dtd_path:
        self.__write_obj.write(
            '<!DOCTYPE doc SYSTEM "%s">' % self.__dtd_path
        )
    elif self.__dtd_path == '':
        # don't print dtd if further transformations are going to take
        # place
        pass
    else:
        # NOTE(review): `public_dtd` is not defined anywhere visible in
        # this file -- this branch would raise NameError if ever taken;
        # confirm whether it is reachable with the options used.
        self.__write_obj.write(
            '<!DOCTYPE doc PUBLIC "publicID" '
            '"http://rtf2xml.sourceforge.net/dtd/%s">' % public_dtd
        )
    self.__new_line = 0
    self.__write_new_line()
|
||||
|
||||
def convert_to_tags(self):
    """
    Read in the file one line at a time. Get the important info, between
    [:16]. Check if this info matches a dictionary entry. If it does, call
    the appropriate function.
    The functions that are called:
        a text function for text
        an open function for open tags
        an open with attribute function for tags with attributes
        an empty with attribute function for tags that are empty but have
        attribtes.
        a closed function for closed tags.
        an empty tag function.
    """
    self.__initiate_values()
    with open_for_write(self.__write_to) as self.__write_obj:
        # XML declaration (and DOCTYPE) go out before any token is read.
        self.__write_dec()
        with open_for_read(self.__file) as read_obj:
            for line in read_obj:
                # The 16-character token prefix selects the handler.
                self.__token_info = line[:16]
                action = self.__state_dict.get(self.__token_info)
                if action is not None:
                    action(line)
    # convert all encodings to UTF8 or ASCII to avoid unsupported encodings in lxml
    if self.__convert_utf or self.__bad_encoding:
        # Swap the freshly written output into place, then re-copy it
        # through open_for_read/open_for_write to normalise the encoding.
        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
        copy_obj.rename(self.__write_to, self.__file)
        with open_for_read(self.__file) as read_obj:
            with open_for_write(self.__write_to) as write_obj:
                for line in read_obj:
                    write_obj.write(line)
    copy_obj = copy.Copy(bug_handler=self.__bug_handler)
    if self.__copy:
        # Keep a debugging snapshot of this stage's output.
        copy_obj.copy_file(self.__write_to, "convert_to_tags.data")
    copy_obj.rename(self.__write_to, self.__file)
    os.remove(self.__write_to)
|
||||
63
ebook_converter/ebooks/rtf2xml/copy.py
Normal file
63
ebook_converter/ebooks/rtf2xml/copy.py
Normal file
@@ -0,0 +1,63 @@
|
||||
from __future__ import unicode_literals, absolute_import, print_function, division
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import os, shutil
|
||||
|
||||
|
||||
class Copy:
    """Copy each changed file to a directory for debugging purposes"""

    # Class-level debug directory, shared by all instances and set via
    # set_dir().
    __dir = ""

    def __init__(self, bug_handler, file=None, deb_dir=None, ):
        """
        bug_handler -- exception class raised on misconfiguration
        file        -- optional file name (stored but unused by the
                       methods below)
        deb_dir     -- accepted for interface compatibility only; the
                       debug directory is actually configured through
                       set_dir()
        """
        self.__file = file
        self.__bug_handler = bug_handler

    def set_dir(self, deb_dir):
        """Set the temporary directory to write files to"""
        if deb_dir is None:
            message = "No directory has been provided to write to in the copy.py"
            raise self.__bug_handler(message)
        check = os.path.isdir(deb_dir)
        if not check:
            message = "%(deb_dir)s is not a directory" % vars()
            raise self.__bug_handler(message)
        Copy.__dir = deb_dir

    def remove_files(self):
        """Remove files from directory"""
        self.__remove_the_files(Copy.__dir)

    def __remove_the_files(self, the_dir):
        """Recursively remove all files below the_dir, ignoring OS errors."""
        list_of_files = os.listdir(the_dir)
        for file in list_of_files:
            # BUG FIX: join with the directory currently being walked, not
            # the root debug dir.  The original used Copy.__dir here, so
            # files inside nested sub-directories were joined to the wrong
            # path and their removal failed silently (swallowed by the
            # except below).
            rem_file = os.path.join(the_dir, file)
            if os.path.isdir(rem_file):
                self.__remove_the_files(rem_file)
            else:
                try:
                    os.remove(rem_file)
                except OSError:
                    # Best-effort cleanup of a debug directory.
                    pass

    def copy_file(self, file, new_file):
        """
        Copy `file` into the debug directory under the name `new_file`.
        """
        write_file = os.path.join(Copy.__dir, new_file)
        shutil.copyfile(file, write_file)

    def rename(self, source, dest):
        # Despite the name, this copies source over dest; source is kept.
        shutil.copyfile(source, dest)
|
||||
188
ebook_converter/ebooks/rtf2xml/default_encoding.py
Normal file
188
ebook_converter/ebooks/rtf2xml/default_encoding.py
Normal file
@@ -0,0 +1,188 @@
|
||||
from __future__ import unicode_literals, absolute_import, print_function, division
|
||||
#########################################################################
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
#########################################################################
|
||||
|
||||
'''
|
||||
Codepages as to RTF 1.9.1:
|
||||
437 United States IBM
|
||||
708 Arabic (ASMO 708)
|
||||
709 Arabic (ASMO 449+, BCON V4)
|
||||
710 Arabic (transparent Arabic)
|
||||
711 Arabic (Nafitha Enhanced)
|
||||
720 Arabic (transparent ASMO)
|
||||
819 Windows 3.1 (United States and Western Europe)
|
||||
850 IBM multilingual
|
||||
852 Eastern European
|
||||
860 Portuguese
|
||||
862 Hebrew
|
||||
863 French Canadian
|
||||
864 Arabic
|
||||
865 Norwegian
|
||||
866 Soviet Union
|
||||
874 Thai
|
||||
932 Japanese
|
||||
936 Simplified Chinese
|
||||
949 Korean
|
||||
950 Traditional Chinese
|
||||
1250 Eastern European
|
||||
1251 Cyrillic
|
||||
1252 Western European
|
||||
1253 Greek
|
||||
1254 Turkish
|
||||
1255 Hebrew
|
||||
1256 Arabic
|
||||
1257 Baltic
|
||||
1258 Vietnamese
|
||||
1361 Johab
|
||||
10000 MAC Roman
|
||||
10001 MAC Japan
|
||||
10004 MAC Arabic
|
||||
10005 MAC Hebrew
|
||||
10006 MAC Greek
|
||||
10007 MAC Cyrillic
|
||||
10029 MAC Latin2
|
||||
10081 MAC Turkish
|
||||
57002 Devanagari
|
||||
57003 Bengali
|
||||
57004 Tamil
|
||||
57005 Telugu
|
||||
57006 Assamese
|
||||
57007 Oriya
|
||||
57008 Kannada
|
||||
57009 Malayalam
|
||||
57010 Gujarati
|
||||
57011 Punjabi
|
||||
'''
|
||||
import re
|
||||
from . import open_for_read
|
||||
|
||||
|
||||
class DefaultEncoding:
    """
    Find the default encoding for the doc.

    Platform and code page are read lazily from the tokenised input file
    (or, with check_raw=True, from raw RTF control words) the first time
    an accessor is called.
    """

    # Note: not all those encoding are really supported by rtf2xml
    # See http://msdn.microsoft.com/en-us/library/windows/desktop/dd317756%28v=vs.85%29.aspx
    # and src\calibre\gui2\widgets.py for the input list in calibre
    ENCODINGS = {
        # Special cases
        'cp1252': '1252',
        'utf-8': '1252',
        'ascii': '1252',
        # Normal cases
        'big5': '950',
        'cp1250': '1250',
        'cp1251': '1251',
        'cp1253': '1253',
        'cp1254': '1254',
        'cp1255': '1255',
        'cp1256': '1256',
        'shift_jis': '932',
        'gb2312': '936',
        # Not in RTF 1.9.1 codepage specification
        'hz': '52936',
        'iso8859_5': '28595',
        'iso2022_jp': '50222',
        'iso2022_kr': '50225',
        'euc_jp': '51932',
        'euc_kr': '51949',
        'gb18030': '54936',
    }

    def __init__(self, in_file, bug_handler, default_encoding, run_level=1, check_raw=False):
        """
        in_file          -- file to inspect
        bug_handler      -- exception class for internal errors (unused here
                            but kept for interface parity with the other stages)
        default_encoding -- Python codec name used as the fallback codepage
        run_level        -- verbosity level (currently unused)
        check_raw        -- when True, scan raw RTF control words instead of
                            tokenised output
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__platform = 'Windows'
        self.__default_num = 'not-defined'
        # Unknown codec names fall back to Windows-1252.
        self.__code_page = self.ENCODINGS.get(default_encoding, '1252')
        self.__datafetched = False
        self.__fetchraw = check_raw

    def find_default_encoding(self):
        """Return (platform, 'ansicpgNNN', default font number)."""
        if not self.__datafetched:
            self._encoding()
            self.__datafetched = True
        code_page = 'ansicpg' + self.__code_page
        # if self.__code_page == '10000':
        #     self.__code_page = 'mac_roman'
        return self.__platform, code_page, self.__default_num

    def get_codepage(self):
        """Return the detected codepage number as a string."""
        if not self.__datafetched:
            self._encoding()
            self.__datafetched = True
        # if self.__code_page == '10000':
        #     self.__code_page = 'mac_roman'
        return self.__code_page

    def get_platform(self):
        """Return 'Windows', 'Macintosh', 'IBMPC' or 'OS/2'."""
        if not self.__datafetched:
            self._encoding()
            self.__datafetched = True
        return self.__platform

    def _encoding(self):
        """Scan the file once, filling platform/codepage/default font."""
        with open_for_read(self.__file) as read_obj:
            cpfound = False
            if not self.__fetchraw:
                # Tokenised input: inspect the 16-character token prefixes
                # until the end of the RTF header.
                for line in read_obj:
                    self.__token_info = line[:16]
                    if self.__token_info == 'mi<mk<rtfhed-end':
                        break
                    if self.__token_info == 'cw<ri<macintosh_':
                        self.__platform = 'Macintosh'
                    elif self.__token_info == 'cw<ri<pc________':
                        self.__platform = 'IBMPC'
                    elif self.__token_info == 'cw<ri<pca_______':
                        self.__platform = 'OS/2'
                    if self.__token_info == 'cw<ri<ansi-codpg' \
                            and int(line[20:-1]):
                        self.__code_page = line[20:-1]
                    if self.__token_info == 'cw<ri<deflt-font':
                        self.__default_num = line[20:-1]
                        # NOTE(review): cpfound is set when the default-font
                        # token is seen, not when a codepage is -- looks
                        # suspicious but preserved as-is.
                        cpfound = True
                    # cw<ri<deflt-font<nu<0
                if self.__platform != 'Windows' and \
                        not cpfound:
                    # No explicit codepage: use the platform default.
                    if self.__platform == 'Macintosh':
                        self.__code_page = '10000'
                    elif self.__platform == 'IBMPC':
                        self.__code_page = '437'
                    elif self.__platform == 'OS/2':
                        self.__code_page = '850'
            else:
                # Raw RTF: look for \mac / \pc / \pca / \ansi and \ansicpgN.
                fenc = re.compile(r'\\(mac|pc|ansi|pca)[\\ \{\}\t\n]+')
                fenccp = re.compile(r'\\ansicpg(\d+)[\\ \{\}\t\n]+')

                for line in read_obj:
                    if fenc.search(line):
                        enc = fenc.search(line).group(1)
                    if fenccp.search(line):
                        cp = fenccp.search(line).group(1)
                        # BUG FIX: accept any non-zero codepage.  The
                        # original tested ``if not int(cp)``, i.e. only a
                        # codepage of 0 was ever stored -- the inverse of
                        # the intent and of the matching
                        # ``and int(line[20:-1])`` test in the tokenised
                        # branch above.
                        if int(cp):
                            self.__code_page = cp
                            cpfound = True
                            break
                if self.__platform != 'Windows' and \
                        not cpfound:
                    if enc == 'mac':
                        self.__code_page = '10000'
                    elif enc == 'pc':
                        self.__code_page = '437'
                    elif enc == 'pca':
                        self.__code_page = '850'
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Ad-hoc CLI: print the codepage detected in the RTF file given as
    # argv[1], with argv[2] as the fallback encoding name.
    import sys
    encoding_detector = DefaultEncoding(
        in_file=sys.argv[1],
        default_encoding=sys.argv[2],
        bug_handler=Exception,
        check_raw=True,
    )
    print(encoding_detector.get_codepage())
|
||||
212
ebook_converter/ebooks/rtf2xml/delete_info.py
Normal file
212
ebook_converter/ebooks/rtf2xml/delete_info.py
Normal file
@@ -0,0 +1,212 @@
|
||||
from __future__ import unicode_literals, absolute_import, print_function, division
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import sys, os
|
||||
|
||||
from calibre.ebooks.rtf2xml import copy
|
||||
from calibre.ptempfile import better_mktemp
|
||||
from . import open_for_read, open_for_write
|
||||
|
||||
|
||||
class DeleteInfo:
    """Delete unecessary destination groups"""

    def __init__(self,
            in_file ,
            bug_handler,
            copy=None,
            run_level=1,
            ):
        """
        in_file     -- tokenised file to filter
        bug_handler -- exception class raised on internal errors
        copy        -- when true, keep a debug copy of this stage's output
        run_level   -- strictness/verbosity level
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__write_to = better_mktemp()
        self.__run_level = run_level
        self.__initiate_allow()
        self.__bracket_count= 0
        self.__ob_count = 0
        self.__cb_count = 0
        # Last open-bracket line not yet written out (0 = none pending).
        self.__ob = 0
        # True when a deleted group's closing bracket must still be
        # written because its opening bracket was already emitted.
        self.__write_cb = False
        self.__found_delete = False

    def __initiate_allow(self):
        """
        Initiate a list of destination groups which should be printed out.
        """
        self.__allowable = ('cw<ss<char-style',
                'cw<it<listtable_',
                'cw<it<revi-table',
                'cw<ls<list-lev-d',
                # Field allowed
                'cw<fd<field-inst',
                'cw<an<book-mk-st',
                'cw<an<book-mk-en',
                'cw<an<annotation',
                'cw<cm<comment___',
                'cw<it<lovr-table',
                # info table
                'cw<di<company___',
                # 'cw<ls<list______',
                )
        self.__not_allowable = (
                'cw<un<unknown___',
                'cw<un<company___',
                'cw<ls<list-level',
                'cw<fd<datafield_',
                )
        self.__state = 'default'
        # Dispatch table: state name -> line handler.
        self.__state_dict = {
            'default'        : self.__default_func,
            'after_asterisk' : self.__asterisk_func,
            'delete'         : self.__delete_func,
            'list'           : self.__list_func,
        }

    def __default_func(self,line):
        """Handle lines when in no special state. Look for an asterisk to
        begin a special state. Otherwise, print out line."""
        # cw<ml<asterisk__<nu<true
        if self.__token_info == 'cw<ml<asterisk__':
            self.__state = 'after_asterisk'
            self.__delete_count = self.__ob_count
        elif self.__token_info == 'ob<nu<open-brack':
            # write previous bracket, if exists
            if self.__ob:
                self.__write_obj.write(self.__ob)
            self.__ob = line
            return False
        else:
            # write previous bracket, since didn't find asterisk
            if self.__ob:
                self.__write_obj.write(self.__ob)
                self.__ob = 0
            return True

    def __delete_func(self,line):
        """Handle lines when in delete state. Don't print out lines
        unless the state has ended."""
        if self.__delete_count == self.__cb_count:
            self.__state = 'default'
            if self.__write_cb:
                # BUG FIX: clear the flag once the pending closing bracket
                # is written.  The original re-assigned True here, leaving
                # the flag stuck so a stray closing bracket could be
                # emitted for a later delete group; the sibling
                # __list_func already resets it to False.
                self.__write_cb = False
                return True
        return False

    def __asterisk_func(self,line):
        """
        Determine whether to delete info in group
        Note on self.__cb flag.
        If you find that you are in a delete group, and the previous
        token in not an open bracket (self.__ob = 0), that means
        that the delete group is nested inside another acceptable
        destination group. In this case, you have already written
        the open bracket, so you will need to write the closed one
        as well.
        """
        # Test for {\*}, in which case don't enter
        # delete state
        self.__found_delete = True
        if self.__token_info == 'cb<nu<clos-brack':
            if self.__delete_count == self.__cb_count:
                self.__state = 'default'
                self.__ob = 0
                # changed this because haven't printed out start
                return False
            else:
                # not sure what happens here!
                # believe I have a '{\*}
                if self.__run_level > 3:
                    msg = 'Flag problem\n'
                    raise self.__bug_handler(msg)
                return True
        elif self.__token_info in self.__allowable :
            if self.__ob:
                self.__write_obj.write(self.__ob)
                self.__ob = 0
                self.__state = 'default'
            else:
                pass
            return True
        elif self.__token_info == 'cw<ls<list______':
            self.__ob = 0
            self.__found_list_func(line)
        elif self.__token_info in self.__not_allowable:
            if not self.__ob:
                self.__write_cb = True
            self.__ob = 0
            self.__state = 'delete'
            self.__cb_count = 0
            return False
        else:
            if self.__run_level > 5:
                msg = ('After an asterisk, and found neither an allowable '
                       'or non-allowable token\ntoken is "%s"\n') % self.__token_info
                raise self.__bug_handler(msg)
            if not self.__ob:
                self.__write_cb = True
            self.__ob = 0
            self.__state = 'delete'
            self.__cb_count = 0
            return False

    def __found_list_func(self, line):
        """
        print out control words in this group
        """
        self.__state = 'list'

    def __list_func(self, line):
        """
        Check to see if the group has ended.
        Return True for all control words.
        Return False otherwise.
        """
        if self.__delete_count == self.__cb_count and \
                self.__token_info == 'cb<nu<clos-brack':
            self.__state = 'default'
            if self.__write_cb:
                self.__write_cb = False
                return True
            return False
        elif line[0:2] == 'cw':
            return True
        else:
            return False

    def delete_info(self):
        """Main method for handling other methods. Read one line at
        a time, and determine whether to print the line based on the state."""
        with open_for_read(self.__file) as read_obj:
            with open_for_write(self.__write_to) as self.__write_obj:
                for line in read_obj:
                    # ob<nu<open-brack<0001
                    self.__token_info = line[:16]
                    if self.__token_info == 'ob<nu<open-brack':
                        self.__ob_count = line[-5:-1]
                    if self.__token_info == 'cb<nu<clos-brack':
                        self.__cb_count = line[-5:-1]
                    # Get action to perform
                    action = self.__state_dict.get(self.__state)
                    if not action:
                        # NOTE(review): if no action is found this logs and
                        # then still calls action(line) -- preserved as-is.
                        sys.stderr.write('No action in dictionary state is "%s" \n'
                                % self.__state)
                    # Print if allowed by action
                    if action(line):
                        self.__write_obj.write(line)
        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "delete_info.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)
        return self.__found_delete
|
||||
816
ebook_converter/ebooks/rtf2xml/field_strings.py
Normal file
816
ebook_converter/ebooks/rtf2xml/field_strings.py
Normal file
@@ -0,0 +1,816 @@
|
||||
from __future__ import unicode_literals, absolute_import, print_function, division
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import sys, re
|
||||
|
||||
|
||||
class FieldStrings:
|
||||
"""
|
||||
This module is given a string. It processes the field instruction string and
|
||||
returns a list of three values.
|
||||
"""
|
||||
|
||||
def __init__(self, bug_handler, run_level=1):
    """
    Store the error handler and verbosity level, then build the lookup
    tables and compiled regular expressions used while parsing field
    instruction strings.
    """
    self.__bug_handler = bug_handler
    self.__run_level = run_level
    self.__initiate_values()
|
||||
|
||||
def __initiate_values(self):
    """
    Requires:
        nothing.
    Returns:
        nothing.
    Logic:
        initiate values for rest of class.
        self.__field_instruction_dict:
            The dictionary for all field names.  Each value is a
            (handler method, readable name) pair; the handler parses
            the field's switches.
    """
    self.__field_instruction_dict = {
        # number type (arabic, etc.) and number format (\# " ")
        'EDITTIME' : (self.__num_type_and_format_func, 'editing-time'),
        'NUMCHARS' : (self.__num_type_and_format_func, 'number-of-characters-in-doc'),
        'NUMPAGES' : (self.__num_type_and_format_func, 'number-of-pages-in-doc'),
        'NUMWORDS' : (self.__num_type_and_format_func, 'number-of-words-in-doc'),
        'REVNUM' : (self.__num_type_and_format_func, 'revision-number'),
        'SECTIONPAGES' : (self.__num_type_and_format_func, 'num-of-pages-in-section'),
        'SECTION' : (self.__num_type_and_format_func, 'insert-section-number'),
        'QUOTE' : (self.__num_type_and_format_func, 'quote'),
        # number formatting (\# "")
        'PAGE' : (self.__default_inst_func, 'insert-page-number'),
        'page' : (self.__default_inst_func, 'insert-page-number'),
        # date format (\@ "")
        'CREATEDATE' : (self.__date_func, 'insert-date'),
        'PRINTDATE' : (self.__date_func, 'insert-date'),
        # PRINTDATE?
        'SAVEDATE' : (self.__date_func, 'last-saved'),
        'TIME' : (self.__date_func, 'insert-time'),
        # numbers?
        # these fields take four switches
        'AUTHOR' : (self.__simple_info_func, 'user-name'),
        'COMMENTS' : (self.__simple_info_func, 'comments'),
        'FILENAME' : (self.__simple_info_func, 'file-name'),
        'filename' : (self.__simple_info_func, 'file-name'),
        'KEYWORDS' : (self.__simple_info_func, 'keywords'),
        'LASTSAVEDBY' : (self.__simple_info_func, 'last-saved-by'),
        'SUBJECT' : (self.__simple_info_func, 'subject'),
        'TEMPLATE' : (self.__simple_info_func, 'based-on-template'),
        'TITLE' : (self.__simple_info_func, 'document-title'),
        'USERADDRESS' : (self.__simple_info_func, 'user-address'),
        'USERINITIALS' : (self.__simple_info_func, 'user-initials'),
        'USERNAME' : (self.__simple_info_func, 'user-name'),
        'EQ' : (self.__equation_func, 'equation'),
        'HYPERLINK' : (self.__hyperlink_func, 'hyperlink'),
        'INCLUDEPICTURE': (self.__include_pict_func, 'include-picture'),
        'INCLUDETEXT' : (self.__include_text_func, 'include-text-from-file'),
        'INDEX' : (self.__index_func, 'index'),
        'NOTEREF' : (self.__note_ref_func, 'reference-to-note'),
        'PAGEREF' : (self.__page_ref_func, 'reference-to-page'),
        'REF' : (self.__ref_func, 'reference'),
        'ref' : (self.__ref_func, 'reference'),
        'SEQ' : (self.__sequence_func, 'numbering-sequence'),
        'SYMBOL' : (self.__symbol_func, 'symbol'),
        'TA' : (self.__ta_func, 'anchor-for-table-of-authorities'),
        'TOA' : (self.__toc_table_func, 'table-of-authorities'),
        'TOC' : (self.__toc_table_func, 'table-of-contents'),
        # no switches
        'AUTONUMOUT' : (self.__no_switch_func, 'auto-num-out?'),
        'COMPARE' : (self.__no_switch_func, 'compare'),
        'DOCVARIABLE' : (self.__no_switch_func, 'document-variable'),
        'GOTOBUTTON' : (self.__no_switch_func, 'go-button'),
        'NEXT' : (self.__no_switch_func, 'next'),
        'NEXTIF' : (self.__no_switch_func, 'next-if'),
        'SKIPIF' : (self.__no_switch_func, 'skip-if'),
        'IF' : (self.__no_switch_func, 'if'),
        'MERGEFIELD' : (self.__no_switch_func, 'merge-field'),
        'MERGEREC' : (self.__no_switch_func, 'merge-record'),
        'MERGESEQ' : (self.__no_switch_func, 'merge-sequence'),
        'PLACEHOLDER' : (self.__no_switch_func, 'place-holder'),
        'PRIVATE' : (self.__no_switch_func, 'private'),
        'RD' : (self.__no_switch_func, 'referenced-document'),
        'SET' : (self.__no_switch_func, 'set'),
        # default instructions (haven't written a method for them
        'ADVANCE' : (self.__default_inst_func, 'advance'),
        'ASK' : (self.__default_inst_func, 'prompt-user'),
        'AUTONUMLGL' : (self.__default_inst_func, 'automatic-number'),
        'AUTONUM' : (self.__default_inst_func, 'automatic-number'),
        'AUTOTEXTLIST' : (self.__default_inst_func, 'auto-list-text'),
        'AUTOTEXT' : (self.__default_inst_func, 'auto-text'),
        'BARCODE' : (self.__default_inst_func, 'barcode'),
        'CONTACT' : (self.__default_inst_func, 'contact'),
        'DATABASE' : (self.__default_inst_func, 'database'),
        'DATE' : (self.__default_inst_func, 'date'),
        'date' : (self.__default_inst_func, 'date'),
        'DOCPROPERTY' : (self.__default_inst_func, 'document-property'),
        'FILESIZE' : (self.__default_inst_func, 'file-size'),
        'FILLIN' : (self.__default_inst_func, 'fill-in'),
        'INFO' : (self.__default_inst_func, 'document-info'),
        'LINK' : (self.__default_inst_func, 'link'),
        'PA' : (self.__default_inst_func, 'page'),
        'PRINT' : (self.__default_inst_func, 'print'),
        'STYLEREF' : (self.__default_inst_func, 'style-reference'),
        'USERPROPERTY' : (self.__default_inst_func, 'user-property'),
        'FORMCHECKBOX' : (self.__default_inst_func, 'form-checkbox'),
        'FORMTEXT' : (self.__default_inst_func, 'form-text'),
        # buttons
        'MACROBUTTON' : (self.__default_inst_func, 'macro-button'),
    }
    # Maps the raw \* switch argument to a readable number-type value.
    self.__number_dict = {
        'Arabic' : 'arabic',
        'alphabetic' : 'alphabetic',
        'ALPHABETIC' : 'capital-alphabetic',
        'roman' : 'roman',
        'ROMAN' : 'capital-roman',
        'Ordinal' : 'ordinal',
        'CardText' : 'cardinal-text',
        'OrdText' : 'ordinal-text',
        'Hex' : 'hexidecimal',
        'DollarText' : 'dollar-text',
        'Upper' : 'upper-case',
        'Lower' : 'lower-case',
        'FirstCap' : 'first-cap',
        'Caps' : 'caps',
    }
    # Maps text-format switch arguments to readable values.
    self.__text_format_dict = {
        'Upper' : 'upper',
        'Lower' : 'lower',
        'FirstCap' : 'first-cap',
        'Caps' : 'caps',
    }
    # Pre-compiled patterns for the various field switches.
    self.__symbol_num_exp = re.compile(r'SYMBOL (.*?) ')
    self.__symbol_font_exp = re.compile(r'\\f "(.*?)"')
    self.__symbol_size_exp = re.compile(r'\\s (\d+)')
    # self.__toc_figure_exp = re.compile(r'\\c "Figure"')
    # \\@ "dddd, MMMM d, yyyy"
    self.__date_exp = re.compile(r'\\@\s{1,}"(.*?)"')
    self.__num_type_exp = re.compile(
        r'\\\*\s{1,}(Arabic|alphabetic|ALPHABETIC|roman|ROMAN|Ordinal|CardText|OrdText|Hex|DollarText|Upper|Lower|FirstCap|Caps)')
    self.__format_text_exp = re.compile(r'\\\*\s{1,}(Upper|Lower|FirstCap|Caps)')
    self.__merge_format_exp = re.compile(r'\\\*\s{1,}MERGEFORMAT')
    self.__ta_short_field_exp = re.compile(r'\\s\s{1,}"(.*?)"')
    self.__ta_long_field_exp = re.compile(r'\\l\s{1,}"(.*?)"')
    self.__ta_category_exp = re.compile(r'\\c\s{1,}(\d+)')
    # indices
    self.__index_insert_blank_line_exp = re.compile(r'\\h\s{1,}""')
    self.__index_insert_letter_exp = re.compile(r'\\h\s{1,}"()"')
    self.__index_columns_exp = re.compile(r'\\c\s{1,}"(.*?)"')
    self.__bookmark_exp = re.compile(r'\\b\s{1,}(.*?)\s')
    self.__d_separator = re.compile(r'\\d\s{1,}(.*?)\s')
    self.__e_separator = re.compile(r'\\e\s{1,}(.*?)\s')
    self.__l_separator = re.compile(r'\\l\s{1,}(.*?)\s')
    self.__p_separator = re.compile(r'\\p\s{1,}(.*?)\s')
    self.__index_sequence = re.compile(r'\\s\s{1,}(.*?)\s')
    self.__index_entry_typ_exp = re.compile(r'\\f\s{1,}"(.*?)"')
    self.__quote_exp = re.compile(r'"(.*?)"')
    self.__filter_switch = re.compile(r'\\c\s{1,}(.*?)\s')
    self.__link_switch = re.compile(r'\\l\s{1,}(.*?)\s')
|
||||
|
||||
def process_string(self, my_string, type):
    """
    Requires:
        my_string --the string to parse.
        type -- the type of string.  (NOTE(review): this parameter
            shadows the builtin and is not used in the body -- confirm
            whether callers rely on it.)
    Returns:
        Returns a string for a field instrution attribute.
    Logic:
        This handles all "large" fields, which means everything except
        toc entries, index entries, and bookmarks
        Split the string by spaces, and get the first item in the
        resulting list. This item is the field's type. Check for the
        action in the field instructions dictionary for further parsing.
        If no action is found, print out an error message.
    """
    # Concatenate only the text-token payloads into one string.
    changed_string = ''
    lines = my_string.split('\n')
    for line in lines:
        if line[0:2] == 'tx':
            changed_string += line[17:]
    fields = changed_string.split()
    # First word is the field's name (raises IndexError if the field
    # carried no text -- presumably callers guarantee at least one word).
    field_name = fields[0]
    action, name = self.__field_instruction_dict.get(field_name, (None, None))
    # A MERGEFORMAT switch marks the field as dynamically updated.
    match_obj = re.search(self.__merge_format_exp, changed_string)
    if match_obj and name:
        name += '<update>dynamic'
    elif name:
        name += '<update>static'
    else:
        pass
        # no name--not in list above
    if action:
        the_list = action(field_name, name, changed_string)
    else:
        # change -1 to 0--for now, I want users to report bugs
        msg = 'no key for "%s" "%s"\n' % (field_name, changed_string)
        sys.stderr.write(msg)
        if self.__run_level > 3:
            msg = 'no key for "%s" "%s"\n' % (field_name, changed_string)
            raise self.__bug_handler(msg)
        the_list = self.__fall_back_func(field_name, line)
        return the_list
    return the_list
|
||||
|
||||
def __default_inst_func(self, field_name, name, line):
    """
    Requires:
        field_name -- the first word in the string
        name -- the changed name according to the dictionary
        line -- the string to be parsed
    Returns:
        [None, None, name] -- only the mapped field name is needed;
        no switches are parsed for these fields.
    """
    return [None, None, name]
|
||||
|
||||
def __fall_back_func(self, field_name, line):
    """
    Requires:
        field_name -- the first word in the string
        line -- the string to be parsed
    Returns:
        The raw field name tagged with an update mode of 'none'.
    Logic:
        Used for fields not found in the instruction dictionary.
    """
    return [None, None, field_name + '<update>none']
|
||||
|
||||
def __equation_func(self, field_name, name, line):
    """
    Required:
        field_name -- the first word in the string
        name -- the changed name according to the dictionary
        line -- the string to be parsed
    Returns:
        The name of the field; EQ switches are not parsed.
    """
    return [None, None, name]
|
||||
|
||||
def __no_switch_func(self, field_name, name, line):
    """
    Required:
        field_name -- the first word in the string
        name -- the changed name according to the dictionary
        line -- the string to be parsed
    Returns:
        The name of the field; these fields take no switches.
    """
    return [None, None, name]
|
||||
|
||||
def __num_type_and_format_func(self, field_name, name, line):
    """
    Required:
        field_name -- the first word in the string
        name -- the changed name according to the dictionary
        line -- the string to be parsed
    Returns:
        list of None, None, and part of a tag
    Logic:
        Append optional number-format and number-type switch values to
        the tag; QUOTE additionally carries a quoted argument.
    """
    result = name
    fmt = self.__parse_num_format(line)
    if fmt:
        result += '<number-format>%s' % fmt
    num_type = self.__parse_num_type(line)
    if num_type:
        result += '<number-type>%s' % num_type
    # Only QUOTE takes a (mandatory?) argument
    if field_name == 'QUOTE':
        quoted = re.search(r'QUOTE\s{1,}"(.*?)"', line)
        if quoted:
            result += '<argument>%s' % quoted.group(1)
    return [None, None, result]
|
||||
|
||||
def __num_format_func(self, field_name, name, line):
    """
    Required:
        field_name -- the first word in the string
        name -- the changed name according to the dictionary
        line -- the string to be parsed
    Returns:
        list of None, None, and part of a tag
    Logic:
        Append the number-format switch value, if present, to the tag.
    """
    result = name
    fmt = self.__parse_num_format(line)
    if fmt:
        result += '<number-format>%s' % fmt
    return [None, None, result]
|
||||
|
||||
def __parse_num_format(self, the_string):
    """Extract number-formatting information from a field string.

    Required:
        the_string -- the string to parse
    Returns:
        The captured format substring if the_string contains number
        formatting information; None otherwise.
    Logic:
        Search with the pre-compiled self.__date_exp pattern and return
        its first capture group.
    """
    match_group = re.search(self.__date_exp, the_string)
    if match_group:
        # Bug fix: the original called the match object itself
        # (match_group(1)), which raises TypeError; use .group(1).
        return match_group.group(1)
    return None
|
||||
|
||||
def __parse_num_type(self, the_string):
    """Extract number-type information from a field string.

    Required:
        the_string -- the string to parse
    Returns:
        A string if the_string contains number type information;
        None otherwise.
    Logic:
        the_string might look like:
            USERNAME \\* Arabic \\* MERGEFORMAT
        Grab the '\\* Upper' part and translate it through the
        number dictionary into a more readable value, e.g.
        (<field number-type = "Arabic">.
    """
    match_group = re.search(self.__num_type_exp, the_string)
    if not match_group:
        return None
    raw_name = match_group.group(1)
    readable = self.__number_dict.get(raw_name)
    if readable:
        return readable
    # unknown number type: warn and fall through to an implicit None
    sys.stderr.write('module is fields_string\n')
    sys.stderr.write('method is __parse_num_type\n')
    sys.stderr.write('no dictionary entry for %s\n' % raw_name)
    return None
|
||||
|
||||
def __date_func(self, field_name, name, line):
    """Parse a date field.

    Required:
        field_name -- the first word in the string
        name -- the changed name according to the dictionary
        line -- the string to be parsed
    Returns:
        list of None, None, and part of a tag
    """
    found = re.search(self.__date_exp, line)
    if found:
        return [None, None, name + '<date-format>%s' % found.group(1)]
    return [None, None, name]
|
||||
|
||||
def __simple_info_func(self, field_name, name, line):
    """Parse a simple informational field.

    Required:
        field_name -- the first word in the string
        name -- the changed name according to the dictionary
        line -- the string to be parsed
    Returns:
        list of [None, None, tag string]
    Logic:
        These fields can only have the switches Upper, Lower, FirstCap
        and Caps; translate the switch through self.__text_format_dict
        and append a <format> attribute.
    """
    the_string = name
    match_group = re.search(self.__format_text_exp, line)
    if match_group:
        format_name = match_group.group(1)
        changed_name = self.__text_format_dict.get(format_name)
        if changed_name:
            the_string += '<format>%s' % changed_name
        else:
            sys.stderr.write('module is fields_string\n')
            # Bug fix: the diagnostic previously named __parse_num_type.
            sys.stderr.write('method is __simple_info_func\n')
            sys.stderr.write('no dictionary entry for %s\n' % format_name)
    return [None, None, the_string]
|
||||
|
||||
def __hyperlink_func(self, field_name, name, line):
    """Parse a HYPERLINK field.

    Required:
        field_name -- the first word in the string
        name -- the changed name according to the dictionary
        line -- the string to be parsed
    Returns:
        list of [None, None, tag string]
    Logic:
        Extract the \\l bookmark (if any), the quoted target argument,
        and the boolean switches \\m, \\n and \\h.
    """
    self.__link_switch = re.compile(r'\\l\s{1,}"{0,1}(.*?)"{0,1}\s')
    the_string = name
    match_group = re.search(self.__link_switch, line)
    if match_group:
        link = match_group.group(1)
        # Bug fix: escape embedded double quotes as an XML entity; the
        # replacement literal had been corrupted to an invalid `"""`.
        link = link.replace('"', '&quot;')
        the_string += '<link>%s' % link
    # \l "txt" "link"
    # want "file name" so must get rid of \l "txt"
    line = re.sub(self.__link_switch, '', line)
    match_group = re.search(self.__quote_exp, line)
    if match_group:
        the_string += '<argument>%s' % match_group.group(1)
    if line.find('\\m') > -1:
        the_string += '<html2-image-map>true'
    if line.find('\\n') > -1:
        the_string += '<new-window>true'
    if line.find('\\h') > -1:
        the_string += '<no-history>true'
    return [None, None, the_string]
|
||||
|
||||
def __include_text_func(self, field_name, name, line):
    """Parse an INCLUDETEXT field.

    Required:
        field_name -- the first word in the string
        name -- the changed name according to the dictionary
        line -- the string to be parsed
    Returns:
        list of [None, None, tag string]
    Logic:
        Extract a text-format switch, the \\c filter, the quoted
        file-name argument, and the \\! (no-field-update) switch.
    """
    the_string = name
    match_group = re.search(self.__format_text_exp, line)
    if match_group:
        format_name = match_group.group(1)
        changed_name = self.__text_format_dict.get(format_name)
        if changed_name:
            the_string += '<format>%s' % changed_name
        else:
            sys.stderr.write('module is fields_string\n')
            # Bug fix: the diagnostic previously named __parse_num_type.
            sys.stderr.write('method is __include_text_func\n')
            sys.stderr.write('no dictionary entry for %s\n' % format_name)
    match_group = re.search(self.__filter_switch, line)
    if match_group:
        the_string += '<filter>%s' % match_group.group(1)
    # \c "txt" "file name"
    # want "file name" so must get rid of \c "txt"
    line = re.sub(self.__filter_switch, '', line)
    match_group = re.search(self.__quote_exp, line)
    if match_group:
        arg = match_group.group(1)
        # Bug fix: escape embedded double quotes as an XML entity; the
        # replacement literal had been corrupted to an invalid `"""`.
        arg = arg.replace('"', '&quot;')
        the_string += '<argument>%s' % arg
    else:
        sys.stderr.write('Module is field_strings\n')
        sys.stderr.write('method is include_text_func\n')
        sys.stderr.write('no argument for include text\n')
    if line.find('\\!') > -1:
        the_string += '<no-field-update>true'
    return [None, None, the_string]
|
||||
|
||||
def __include_pict_func(self, field_name, name, line):
    """Parse an INCLUDEPICTURE field.

    Required:
        field_name -- the first word in the string
        name -- the changed name according to the dictionary
        line -- the string to be parsed
    Returns:
        list of [None, None, tag string]
    Logic:
        Extract the \\c filter, the quoted file-name argument, and the
        \\d (linked, not embedded) switch.
    """
    the_string = name
    match_group = re.search(self.__filter_switch, line)
    if match_group:
        arg = match_group.group(1)
        # Bug fix: escape embedded double quotes as an XML entity; the
        # replacement literal had been corrupted to an invalid `"""`.
        arg = arg.replace('"', '&quot;')
        the_string += '<filter>%s' % arg
    # \c "txt" "file name"
    # want "file name" so must get rid of \c "txt"
    line = re.sub(self.__filter_switch, '', line)
    match_group = re.search(self.__quote_exp, line)
    if match_group:
        the_string += '<argument>%s' % match_group.group(1)
    else:
        sys.stderr.write('Module is field_strings\n')
        sys.stderr.write('method is include_pict_func\n')
        sys.stderr.write('no argument for include pict\n')
    if line.find('\\d') > -1:
        the_string += '<external>true'
    return [None, None, the_string]
|
||||
|
||||
def __ref_func(self, field_name, name, line):
    """Parse a REF/PAGEREF-style reference field.

    Requires:
        field_name -- the first word in the string
        name -- the changed name according to the dictionary
        line -- the string to be parsed
    Returns:
        list of [None, None, tag string]
    Logic:
        A page reference field looks like:
            PAGEREF _Toc440880424 \\h
        Extract the bookmark token (used as an anchor in the resulting
        XML file), an optional text-format switch, and the boolean
        switches.
    """
    the_string = name
    match_group = re.search(self.__format_text_exp, line)
    if match_group:
        format_name = match_group.group(1)
        changed_name = self.__text_format_dict.get(format_name)
        if changed_name:
            the_string += '<format>%s' % changed_name
        else:
            sys.stderr.write('module is fields_string\n')
            # Bug fix: the diagnostic previously named __parse_num_type.
            sys.stderr.write('method is __ref_func\n')
            sys.stderr.write('no dictionary entry for %s\n' % format_name)
    line = re.sub(self.__merge_format_exp, '', line)
    # every non-switch word after the field name is a bookmark
    for word in line.split()[1:]:
        if word[0:1] != '\\':
            the_string += '<bookmark>%s' % word
    # boolean switches, checked in the original order
    for switch, tag in (
            ('\\f', '<include-note-number>true'),
            ('\\h', '<hyperlink>true'),
            ('\\n', '<insert-number>true'),
            ('\\r', '<insert-number-relative>true'),
            ('\\p', '<paragraph-relative-position>true'),
            ('\\t', '<suppress-non-delimeter>true'),
            ('\\w', '<insert-number-full>true'),
    ):
        if line.find(switch) > -1:
            the_string += tag
    return [None, None, the_string]
|
||||
|
||||
def __toc_table_func(self, field_name, name, line):
    """Parse a TOC field.

    Requires:
        field_name -- the name of the first word in the string
        name -- the changed name, according to the dictionary
        line -- the string to be parsed
    Returns:
        A string for a TOC table field.
    Logic:
        If the string contains \\c "Figure", it is a table of figures;
        otherwise it is a plain old table of contents.
    """
    the_string = name
    if '\\c "Figure"' in line:
        the_string = the_string.replace('table-of-contents',
                'table-of-figures')
    # don't really need the first value in this list, I don't believe
    return [name, None, the_string]
|
||||
|
||||
def __sequence_func(self, field_name, name, line):
    """Parse a SEQ field.

    Requires:
        field_name -- the name of the first word in the string
        name -- the changed name according to the dictionary
        line -- the string to parse
    Returns:
        A list [None, None, string-with-<label>-attribute]
    Logic:
        The type of sequence (figure, graph, my-name, ...) is the
        second word in the string, e.g.:
            SEQ Figure \\* ARABIC
    """
    fields = line.split()
    # Guard against a malformed SEQ field with no label, which would
    # previously raise IndexError.
    label = fields[1] if len(fields) > 1 else ''
    my_string = '%s<label>%s' % (name, label)
    return [None, None, my_string]
|
||||
|
||||
def __ta_func(self, field_name, name, line):
    """Parse a TA (table-of-authorities entry) field.

    Requires:
        field_name -- the name of the first word in the string
        name -- the changed name according to the dictionary
        line -- the string to parse
    Returns:
        A string with short-field, long-field, category and
        bold/italic information appended as attributes.
    """
    pieces = [name]
    # each (pattern, template) pair contributes one optional attribute
    for pattern, template in (
            (self.__ta_short_field_exp, '<short-field>%s'),
            (self.__ta_long_field_exp, '<long-field>%s'),
            (self.__ta_category_exp, '<category>%s'),
    ):
        found = re.search(pattern, line)
        if found:
            pieces.append(template % found.group(1))
    if line.find('\\b') > -1:
        pieces.append('<bold>true')
    if line.find('\\i') > -1:
        pieces.append('<italics>true')
    return [None, None, ''.join(pieces)]
|
||||
|
||||
def __index_func(self, field_name, name, line):
    """Parse an INDEX field.

    Requires:
        field_name -- the name of the first word in the string
        name -- the changed name according to the dictionary
        line -- the string to parse
    Returns:
        list of [None, None, tag string]
    Logic:
        Parse the many INDEX switches: heading style, column count,
        bookmark restriction, the various separators, sequence and
        entry-type filters, and the boolean switches.
    """
    the_string = name
    # \h "" => blank line between groups; \h "letter" => letter heading
    match_group = re.search(self.__index_insert_blank_line_exp, line)
    if match_group:
        the_string += '<insert-blank-line>true'
    else:
        match_group = re.search(self.__index_insert_letter_exp, line)
        if match_group:
            the_string += '<insert-letter>%s' % match_group.group(1)
    match_group = re.search(self.__index_columns_exp, line)
    if match_group:
        the_string += '<number-of-columns>%s' % match_group.group(1)
    match_group = re.search(self.__bookmark_exp, line)
    if match_group:
        the_string += '<use-bookmark>%s' % match_group.group(1)
    match_group = re.search(self.__d_separator, line)
    if match_group:
        separator = match_group.group(1)
        # Bug fix: escape quotes as an XML entity; the replacement
        # literal had been corrupted to a no-op '"' -> '"'.
        separator = separator.replace('"', '&quot;')
        the_string += '<sequence-separator>%s' % separator
    match_group = re.search(self.__e_separator, line)
    if match_group:
        separator = match_group.group(1)
        separator = separator.replace('"', '&quot;')
        the_string += '<page-separator>%s' % separator
    match_group = re.search(self.__index_sequence, line)
    if match_group:
        sequence = match_group.group(1)
        # Bug fix: the original called replace() on the stale (and
        # possibly unbound) `separator` variable instead of `sequence`.
        sequence = sequence.replace('"', '&quot;')
        the_string += '<use-sequence>%s' % sequence
    match_group = re.search(self.__index_entry_typ_exp, line)
    if match_group:
        the_string += '<entry-type>%s' % match_group.group(1)
    match_group = re.search(self.__p_separator, line)
    if match_group:
        the_string += '<limit-to-letters>%s' % match_group.group(1)
    match_group = re.search(self.__l_separator, line)
    if match_group:
        separator = match_group.group(1)
        separator = separator.replace('"', '&quot;')
        the_string += '<multi-page-separator>%s' % separator
    # boolean switches, checked in the original order
    for switch, tag in (
            ('\\a', '<accented>true'),
            ('\\r', '<sub-entry-on-same-line>true'),
            ('\\t', '<enable-yomi-text>true'),
    ):
        if line.find(switch) > -1:
            the_string += tag
    return [None, None, the_string]
|
||||
|
||||
def __page_ref_func(self, field_name, name, line):
    """Parse a page-reference field.

    Requires:
        field_name -- first name in the string
        name -- the changed name according to the dictionary
        line -- the string to parse
    Returns:
        A list [None, None, tag string].
    """
    result = name
    fmt = self.__parse_num_format(line)
    if fmt:
        result += '<number-format>%s' % fmt
    kind = self.__parse_num_type(line)
    if kind:
        result += '<number-type>%s' % kind
    cleaned = re.sub(self.__merge_format_exp, '', line)
    # every non-switch word after the field name is a bookmark
    for token in cleaned.split()[1:]:
        if token[0:1] != '\\':
            result += '<bookmark>%s' % token
    if cleaned.find('\\h') > -1:
        result += '<hyperlink>true'
    if cleaned.find('\\p') > -1:
        result += '<paragraph-relative-position>true'
    return [None, None, result]
|
||||
|
||||
def __note_ref_func(self, field_name, name, line):
    """Parse a NOTEREF field.

    Requires:
        field_name -- first name in the string
        name -- the changed name according to the dictionary
        line -- the string to parse
    Returns:
        A list [None, None, tag string].
    """
    result = name
    cleaned = re.sub(self.__merge_format_exp, '', line)
    # every non-switch word after the field name is a bookmark
    for token in cleaned.split()[1:]:
        if token[0:1] != '\\':
            result += '<bookmark>%s' % token
    # boolean switches, checked in the original order
    for switch, tag in (
            ('\\h', '<hyperlink>true'),
            ('\\p', '<paragraph-relative-position>true'),
            ('\\f', '<include-note-number>true'),
    ):
        if cleaned.find(switch) > -1:
            result += tag
    return [None, None, result]
|
||||
|
||||
def __symbol_func(self, field_name, name, line):
    """
    Requires:
        field_name -- first word in the string.
        name -- the changed name according to the dictionary.
        line -- the string to parse.
    Returns:
        A list whose first item is the word 'Symbol' -- telling the
        caller this is really UTF-8 data, not a field -- and whose
        last item is a string containing font size, font style, and a
        hexadecimal character value.
    Logic:
        A SYMBOL field is one of Microsoft's quirky ways of entering
        text; it looks like:
            SYMBOL 97 \\f "Symbol" \\s 12
        The second token is a decimal character code (converted here
        to hexadecimal); '\\f "font"' gives the font; '\\s size' gives
        the font size.
    """
    num = ''
    font = ''
    font_size = ''
    changed_line = ''
    search_obj = re.search(self.__symbol_num_exp, line)
    if search_obj:
        num = search_obj.group(1)
        num = int(num)
        # hexadecimal, to match the tokenizer's \'hh representation
        num = '%X' % num
    search_obj = re.search(self.__symbol_font_exp, line)
    if search_obj:
        font = search_obj.group(1)
        changed_line += 'cw<ci<font-style<nu<%s\n' % font
    search_obj = re.search(self.__symbol_size_exp, line)
    if search_obj:
        font_size = search_obj.group(1)
        font_size = int(font_size)
        # e.g. 12 -> '12.00'
        font_size = '%.2f' % font_size
        changed_line += 'cw<ci<font-size_<nu<%s\n' % font_size
    # emit the character itself as a hex text token
    changed_line += 'tx<hx<__________<\'%s\n' % num
    return ['Symbol', None, changed_line]
|
||||
378
ebook_converter/ebooks/rtf2xml/fields_large.py
Normal file
378
ebook_converter/ebooks/rtf2xml/fields_large.py
Normal file
@@ -0,0 +1,378 @@
|
||||
from __future__ import unicode_literals, absolute_import, print_function, division
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import sys, os
|
||||
from calibre.ebooks.rtf2xml import field_strings, copy
|
||||
from calibre.ptempfile import better_mktemp
|
||||
from . import open_for_read, open_for_write
|
||||
|
||||
|
||||
class FieldsLarge:
|
||||
r"""
|
||||
=========================
|
||||
Logic
|
||||
=========================
|
||||
Make tags for fields.
|
||||
-Fields reflect text that Microsoft Word automatically generates.
|
||||
-Each file contains (or should contain) an inner group called field instructions.
|
||||
-Fields can be nested.
|
||||
--------------
|
||||
Logic
|
||||
--------------
|
||||
1. As soon as a field is found, make a new text string by appending an empty
|
||||
text string to the field list. Collect all the lines in this string until the
|
||||
field instructions are found.
|
||||
2. Collect all the tokens and text in the field instructions. When the end of
|
||||
the field instructions is found, process the string of text with the
|
||||
field_strings module. Append the processed string to the field instructins
|
||||
list.
|
||||
3. Continue collecting tokens. Check for paragraphs or sections. If either is found, add to the paragraph or section list.
|
||||
4. Continue collecting tokens and text either the beginning of a new field is found, or the end of this field is found.
|
||||
5. If a new field is found, repeat steps 1-3.
|
||||
6. If the end of the field is found, process the last text string of the field list.
|
||||
7. If the field list is empty (after removing the last text string), there are
|
||||
no more fields. Print out the final string. If the list contains other strings,
|
||||
add the processed string to the last string in the field list.
|
||||
============================
|
||||
Examples
|
||||
============================
|
||||
This line of RTF:
|
||||
{\field{\*\fldinst { CREATEDATE \\* MERGEFORMAT }}{\fldrslt {
|
||||
\lang1024 1/11/03 10:34 PM}}}
|
||||
Becomes:
|
||||
<field type = "insert-time">
|
||||
10:34 PM
|
||||
</field>
|
||||
The simple field in the above example conatins no paragraph or sections breaks.
|
||||
This line of RTF:
|
||||
{{\field{\*\fldinst SYMBOL 97 \\f "Symbol" \\s 12}{\fldrslt\f3\fs24}}}
|
||||
Becomes:
|
||||
<para><inline font-size="18"><inline font-style="Symbol">Χ</inline></inline></para>
|
||||
The RTF in the example above should be represented as UTF-8 rather than a field.
|
||||
This RTF:
|
||||
{\field\fldedit{\*\fldinst { TOC \\o "1-3" }}{\fldrslt {\lang1024
|
||||
Heading one\tab }{\field{\*\fldinst {\lang1024 PAGEREF _Toc440880424
|
||||
\\h }{\lang1024 {\*\datafield
|
||||
{\lang1024 1}}}{\lang1024 \par }\pard\plain
|
||||
\s18\li240\widctlpar\tqr\tldot\tx8630\aspalpha\aspnum\faauto\adjustright\rin0\lin240\itap0
|
||||
\f4\lang1033\cgrid {\lang1024 Heading 2\tab }{\field{\*\fldinst
|
||||
{\lang1024 PAGEREF _Toc440880425 \\h }{\lang1024 {\*\datafield
|
||||
{\lang1024 1}}}{\lang1024 \par }\pard\plain
|
||||
\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0
|
||||
\f4\lang1033\cgrid }}\pard\plain
|
||||
\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0
|
||||
\f4\lang1033\cgrid {\fs28 \\u214\'85 \par }{\fs36 {\field{\*\fldinst
|
||||
SYMBOL 67 \\f "Symbol" \\s 18}{\fldrslt\f3\fs36}}}
|
||||
Becomes:
|
||||
<field-block type="table-of-contents">
|
||||
<paragraph-definition language="1033" nest-level="0"
|
||||
font-style="Times" name="toc 1" adjust-right="true"
|
||||
widow-control="true">
|
||||
<para><inline language="1024">Heading one	</inline><field
|
||||
type="reference-to-page" ref="_Toc440880424"><inline
|
||||
language="1024">1</inline></field></para>
|
||||
</paragraph-definition>
|
||||
<paragraph-definition language="1033" nest-level="0" left-indent="12"
|
||||
font-style="Times" name="toc 2" adjust-right="true"
|
||||
widow-control="true">
|
||||
<para><inline language="1024">Heading 2	</inline><field
|
||||
type="reference-to-page" ref="_Toc440880425"><inline
|
||||
language="1024">1</inline></field></para>
|
||||
</paragraph-definition>
|
||||
</field-block>
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
in_file,
|
||||
bug_handler,
|
||||
copy=None,
|
||||
run_level=1,
|
||||
):
|
||||
"""
|
||||
Required:
|
||||
'file'--file to parse
|
||||
Optional:
|
||||
'copy'-- whether to make a copy of result for debugging
|
||||
'temp_dir' --where to output temporary results (default is
|
||||
directory from which the script is run.)
|
||||
Returns:
|
||||
nothing
|
||||
"""
|
||||
self.__file = in_file
|
||||
self.__bug_handler = bug_handler
|
||||
self.__copy = copy
|
||||
self.__run_level = run_level
|
||||
self.__write_to = better_mktemp()
|
||||
|
||||
def __initiate_values(self):
|
||||
"""
|
||||
Initiate all values.
|
||||
"""
|
||||
self.__text_string = ''
|
||||
self.__field_instruction_string = ''
|
||||
self.__marker = 'mi<mk<inline-fld\n'
|
||||
self.__state = 'before_body'
|
||||
self.__string_obj = field_strings.FieldStrings(run_level=self.__run_level,
|
||||
bug_handler=self.__bug_handler,)
|
||||
self.__state_dict = {
|
||||
'before_body' : self.__before_body_func,
|
||||
'in_body' : self.__in_body_func,
|
||||
'field' : self.__in_field_func,
|
||||
'field_instruction' : self.__field_instruction_func,
|
||||
}
|
||||
self.__in_body_dict = {
|
||||
'cw<fd<field_____' : self.__found_field_func,
|
||||
}
|
||||
self.__field_dict = {
|
||||
'cw<fd<field-inst' : self.__found_field_instruction_func,
|
||||
'cw<fd<field_____' : self.__found_field_func,
|
||||
'cw<pf<par-end___' : self.__par_in_field_func,
|
||||
'cw<sc<section___' : self.__sec_in_field_func,
|
||||
}
|
||||
self.__field_count = [] # keep track of the brackets
|
||||
self.__field_instruction = [] # field instruction strings
|
||||
self.__symbol = 0 # wheter or not the field is really UTF-8
|
||||
# (these fields cannot be nested.)
|
||||
self.__field_instruction_string = '' # string that collects field instruction
|
||||
self.__par_in_field = [] # paragraphs in field?
|
||||
self.__sec_in_field = [] # sections in field?
|
||||
self.__field_string = [] # list of field strings
|
||||
|
||||
def __before_body_func(self, line):
|
||||
"""
|
||||
Requried:
|
||||
line --line ro parse
|
||||
Returns:
|
||||
nothing (changes an instant and writes a line)
|
||||
Logic:
|
||||
Check for the beginninf of the body. If found, changed the state.
|
||||
Always write out the line.
|
||||
"""
|
||||
if self.__token_info == 'mi<mk<body-open_':
|
||||
self.__state = 'in_body'
|
||||
self.__write_obj.write(line)
|
||||
|
||||
def __in_body_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
line --line to parse
|
||||
Returns:
|
||||
nothing. (Writes a line to the output file, or performs other actions.)
|
||||
Logic:
|
||||
Check of the beginning of a field. Always output the line.
|
||||
"""
|
||||
action = self.__in_body_dict.get(self.__token_info)
|
||||
if action:
|
||||
action(line)
|
||||
self.__write_obj.write(line)
|
||||
|
||||
def __found_field_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line --line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Set the values for parseing the field. Four lists have to have
|
||||
items appended to them.
|
||||
"""
|
||||
self.__state = 'field'
|
||||
self.__cb_count = 0
|
||||
ob_count = self.__ob_count
|
||||
self.__field_string.append('')
|
||||
self.__field_count.append(ob_count)
|
||||
self.__sec_in_field.append(0)
|
||||
self.__par_in_field.append(0)
|
||||
|
||||
def __in_field_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line --line to parse
|
||||
Returns:
|
||||
nothing.
|
||||
Logic:
|
||||
Check for the end of the field; a paragaph break; a section break;
|
||||
the beginning of another field; or the beginning of the field
|
||||
instruction.
|
||||
"""
|
||||
if self.__cb_count == self.__field_count[-1]:
|
||||
self.__field_string[-1] += line
|
||||
self.__end_field_func()
|
||||
else:
|
||||
action = self.__field_dict.get(self.__token_info)
|
||||
if action:
|
||||
action(line)
|
||||
else:
|
||||
self.__field_string[-1] += line
|
||||
|
||||
def __par_in_field_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line --line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Write the line to the output file and set the last item in the
|
||||
paragraph in field list to true.
|
||||
"""
|
||||
self.__field_string[-1] += line
|
||||
self.__par_in_field[-1] = 1
|
||||
|
||||
def __sec_in_field_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line --line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Write the line to the output file and set the last item in the
|
||||
section in field list to true.
|
||||
"""
|
||||
self.__field_string[-1] += line
|
||||
self.__sec_in_field[-1] = 1
|
||||
|
||||
def __found_field_instruction_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line -- line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Change the state to field instruction. Set the open bracket count of
|
||||
the beginning of this field so you know when it ends. Set the closed
|
||||
bracket count to 0 so you don't prematureley exit this state.
|
||||
"""
|
||||
self.__state = 'field_instruction'
|
||||
self.__field_instruction_count = self.__ob_count
|
||||
self.__cb_count = 0
|
||||
|
||||
def __field_instruction_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line --line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Collect all the lines until the end of the field is reached.
|
||||
Process these lines with the module rtr.field_strings.
|
||||
Check if the field instruction is 'Symbol' (really UTF-8).
|
||||
"""
|
||||
if self.__cb_count == self.__field_instruction_count:
|
||||
# The closing bracket should be written, since the opening bracket
|
||||
# was written
|
||||
self.__field_string[-1] += line
|
||||
my_list = self.__string_obj.process_string(
|
||||
self.__field_instruction_string, 'field_instruction')
|
||||
instruction = my_list[2]
|
||||
self.__field_instruction.append(instruction)
|
||||
if my_list[0] == 'Symbol':
|
||||
self.__symbol = 1
|
||||
self.__state = 'field'
|
||||
self.__field_instruction_string = ''
|
||||
else:
|
||||
self.__field_instruction_string += line
|
||||
|
||||
def __end_field_func(self):
|
||||
"""
|
||||
Requires:
|
||||
nothing
|
||||
Returns:
|
||||
Nothing
|
||||
Logic:
|
||||
Pop the last values in the instructions list, the fields list, the
|
||||
paragaph list, and the section list.
|
||||
If the field is a symbol, do not write the tags <field></field>,
|
||||
since this field is really just UTF-8.
|
||||
If the field contains paragraph or section breaks, it is a
|
||||
field-block rather than just a field.
|
||||
Write the paragraph or section markers for later parsing of the
|
||||
file.
|
||||
If the filed list contains more strings, add the latest
|
||||
(processed) string to the last string in the list. Otherwise,
|
||||
write the string to the output file.
|
||||
"""
|
||||
last_bracket = self.__field_count.pop()
|
||||
instruction = self.__field_instruction.pop()
|
||||
inner_field_string = self.__field_string.pop()
|
||||
sec_in_field = self.__sec_in_field.pop()
|
||||
par_in_field = self.__par_in_field.pop()
|
||||
# add a closing bracket, since the closing bracket is not included in
|
||||
# the field string
|
||||
if self.__symbol:
|
||||
inner_field_string = '%scb<nu<clos-brack<%s\n' % \
|
||||
(instruction, last_bracket)
|
||||
elif sec_in_field or par_in_field:
|
||||
inner_field_string = \
|
||||
'mi<mk<fldbkstart\n'\
|
||||
'mi<tg<open-att__<field-block<type>%s\n%s'\
|
||||
'mi<mk<fldbk-end_\n' \
|
||||
'mi<tg<close_____<field-block\n'\
|
||||
'mi<mk<fld-bk-end\n' \
|
||||
% (instruction, inner_field_string)
|
||||
# write a marker to show an inline field for later parsing
|
||||
else:
|
||||
inner_field_string = \
|
||||
'%s' \
|
||||
'mi<tg<open-att__<field<type>%s\n%s'\
|
||||
'mi<tg<close_____<field\n'\
|
||||
% (self.__marker, instruction, inner_field_string)
|
||||
if sec_in_field:
|
||||
inner_field_string = 'mi<mk<sec-fd-beg\n' + inner_field_string + \
|
||||
'mi<mk<sec-fd-end\n'
|
||||
if par_in_field:
|
||||
inner_field_string = 'mi<mk<par-in-fld\n' + inner_field_string
|
||||
if len(self.__field_string) == 0:
|
||||
self.__write_field_string(inner_field_string)
|
||||
else:
|
||||
self.__field_string[-1] += inner_field_string
|
||||
self.__symbol = 0
|
||||
|
||||
def __write_field_string(self, the_string):
|
||||
self.__state = 'in_body'
|
||||
self.__write_obj.write(the_string)
|
||||
|
||||
def fix_fields(self):
    """
    Requires:
        nothing
    Returns:
        nothing (changes the original file)
    Logic:
        Read one line in at a time. Determine what action to take based on
        the state. If the state is before the body, look for the
        beginning of the body. If the state is body, send the line to the
        body method. When done, optionally copy the result for debugging,
        then replace the original file with the processed output.
    """
    self.__initiate_values()
    # 'with' guarantees both files are closed even if a handler raises;
    # this also matches the style of the sibling rtf2xml modules.
    with open_for_read(self.__file) as read_obj:
        with open_for_write(self.__write_to) as self.__write_obj:
            for line in read_obj:
                self.__token_info = line[:16]
                # track bracket numbers so field boundaries can be matched
                if self.__token_info == 'ob<nu<open-brack':
                    self.__ob_count = line[-5:-1]
                if self.__token_info == 'cb<nu<clos-brack':
                    self.__cb_count = line[-5:-1]
                action = self.__state_dict.get(self.__state)
                if action is None:
                    # fixed: message used to read "no no matching state in
                    # module styles.py" -- wrong module name, doubled word
                    sys.stderr.write(
                        'no matching state in module fields_large.py\n')
                    sys.stderr.write(self.__state + '\n')
                action(line)
    copy_obj = copy.Copy(bug_handler=self.__bug_handler)
    if self.__copy:
        copy_obj.copy_file(self.__write_to, "fields_large.data")
    copy_obj.rename(self.__write_to, self.__file)
    os.remove(self.__write_to)
|
||||
460
ebook_converter/ebooks/rtf2xml/fields_small.py
Normal file
460
ebook_converter/ebooks/rtf2xml/fields_small.py
Normal file
@@ -0,0 +1,460 @@
|
||||
from __future__ import unicode_literals, absolute_import, print_function, division
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import sys, os, re
|
||||
|
||||
from calibre.ebooks.rtf2xml import field_strings, copy
|
||||
from calibre.ptempfile import better_mktemp
|
||||
from . import open_for_read, open_for_write
|
||||
|
||||
|
||||
class FieldsSmall:
    """
    =================
    Purpose
    =================
    Write tags for bookmarks, index and toc entry fields in a tokenized file.
    This module does not handle toc or index tables. (This module won't be any
    use to you unless you use it as part of the other modules.)
    -----------
    Method
    -----------
    Look for the beginning of a bookmark, index, or toc entry. When such a
    token is found, store the opening bracket count in a variable. Collect all
    the text until the closing bracket entry is found. Parse the collected
    string and write the processed string to the output file.
    """

    def __init__(self,
            in_file,
            bug_handler,
            copy=None,
            run_level=1,
            ):
        """
        Required:
            'in_file'--file to parse
            'bug_handler'--exception class raised on internal errors
        Optional:
            'copy'-- whether to make a copy of result for debugging
            'run_level'--verbosity/strictness level
        Returns:
            nothing
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        # results are written to a temp file, then renamed over the input
        # file at the end of fix_fields()
        self.__write_to = better_mktemp()
        self.__run_level = run_level

    def __initiate_values(self):
        """
        Initiate all values used by the line-by-line state machine.
        """
        self.__string_obj = field_strings.FieldStrings(bug_handler=self.__bug_handler)
        self.__state = 'before_body'
        self.__text_string = ''
        self.__marker = 'mi<mk<inline-fld\n'
        # dispatch table: current state -> handler for the next line
        self.__state_dict = {
            'before_body'   : self.__before_body_func,
            'body'          : self.__body_func,
            'bookmark'      : self.__bookmark_func,
            'toc_index'     : self.__toc_index_func,
        }
        # tokens that open a field, mapped to (handler, tag)
        self.__body_dict = {
            'cw<an<book-mk-st'  : (self.__found_bookmark_func, 'start'),
            'cw<an<book-mk-en'  : (self.__found_bookmark_func, 'end'),
            'cw<an<toc_______'  : (self.__found_toc_index_func, 'toc'),
            'cw<an<index-mark'  : (self.__found_toc_index_func, 'index'),
        }
        ob = 'ob<nu<open-brack.....'
        cb = 'cb<nu<clos-brack'
        bk_st = 'cw<an<book-mk-st<nu<true'
        tx = 'tx<nu<__________<(.*?)'
        reg_st = ob + bk_st + tx + cb
        # NOTE(review): compiled but apparently never used inside this
        # class; kept for backwards compatibility.
        self.__book_start = re.compile(r'%s' % reg_st)

    def __before_body_func(self, line):
        """
        Requires:
            line --the line to parse
        Returns:
            nothing
        Logic:
            Look for the beginning of the body. When found, change the state
            to body. Always print out the line.
        """
        if self.__token_info == 'mi<mk<body-open_':
            self.__state = 'body'
        self.__write_obj.write(line)

    def __body_func(self, line):
        """
        Requires:
            line --the line to parse
        Returns:
            nothing
        Logic:
            This function handles all the lines in the body of the documents.
            Look for a bookmark, index or toc entry and take the appropriate
            action; pass every other line straight through.
        """
        action, tag = \
            self.__body_dict.get(self.__token_info, (None, None))
        if action:
            action(line, tag)
        else:
            self.__write_obj.write(line)

    def __found_bookmark_func(self, line, tag):
        """
        Requires:
            line --the line to parse
            tag --'start' or 'end'
        Returns:
            nothing
        Logic:
            This function is called when a bookmark is found. The opening
            bracket count is stored in the beginning bracket count. The state
            is changed to 'bookmark.'
        """
        self.__beg_bracket_count = self.__ob_count
        self.__cb_count = 0
        self.__state = 'bookmark'
        self.__type_of_bookmark = tag

    def __bookmark_func(self, line):
        """
        Requires:
            line --the line to parse
        Returns:
            nothing
        Logic:
            This function handles all lines within a bookmark. It adds each
            text line to a string until the end of the bookmark is found. It
            then formats the string as a field tag and prints out the result.
        """
        if self.__beg_bracket_count == self.__cb_count:
            self.__state = 'body'
            # 'bookmark-start' or 'bookmark-end'
            # (renamed from 'type', which shadowed the builtin)
            bookmark_type = 'bookmark-%s' % self.__type_of_bookmark
            my_string = self.__parse_bookmark_func(
                self.__text_string, bookmark_type)
            self.__write_obj.write(self.__marker)
            self.__write_obj.write(my_string)
            self.__text_string = ''
            self.__write_obj.write(line)
        elif line[0:2] == 'tx':
            self.__text_string += line[17:-1]

    def __parse_index_func(self, my_string):
        """
        Requires:
            my_string --string to parse
        Returns:
            A string for an index instruction field.
        Logic:
            Eliminate paragraph endings and divide the entry into a main
            entry and (if it exists) a sub entry.
            First strip out any "see" cross-reference and bookmark, and
            detect bold/italic formatting flags.
            Then split the string by newlines and read one token at a time.
            A special colon token ends the main entry and starts the sub
            entry. Paragraph endings are ignored, since paragraphs are not
            wanted within index entries.
        """
        my_string, see_string = self.__index_see_func(my_string)
        my_string, bookmark_string = self.__index_bookmark_func(my_string)
        italics, bold = self.__index__format_func(my_string)
        found_sub = 0
        my_changed_string = 'mi<tg<empty-att_<field<type>index-entry'
        my_changed_string += '<update>static'
        if see_string:
            my_changed_string += '<additional-text>%s' % see_string
        if bookmark_string:
            my_changed_string += '<bookmark>%s' % bookmark_string
        if italics:
            my_changed_string += '<italics>true'
        if bold:
            my_changed_string += '<bold>true'
        main_entry = ''
        sub_entry = ''
        lines = my_string.split('\n')
        for line in lines:
            token_info = line[:16]
            if token_info == 'cw<ml<colon_____':
                found_sub = 1
            elif token_info[0:2] == 'tx':
                if found_sub:
                    sub_entry += line[17:]
                else:
                    main_entry += line[17:]
        my_changed_string += '<main-entry>%s' % main_entry
        if found_sub:
            my_changed_string += '<sub-entry>%s' % sub_entry
        my_changed_string += '\n'
        return my_changed_string

    def __index_see_func(self, my_string):
        """
        Split out the "see ..." cross-reference text from an index entry.
        Returns (string minus the see-group, see-text).
        """
        in_see = 0
        bracket_count = 0
        see_string = ''
        changed_string = ''
        lines = my_string.split('\n')
        # sentinel: no see-group found yet
        end_bracket_count = sys.maxsize
        for line in lines:
            token_info = line[:16]
            if token_info == 'ob<nu<open-brack':
                bracket_count += 1
            if token_info == 'cb<nu<clos-brack':
                bracket_count -= 1
            if in_see:
                if bracket_count == end_bracket_count and token_info == 'cb<nu<clos-brack':
                    in_see = 0
                else:
                    if token_info == 'tx<nu<__________':
                        see_string += line[17:]
            else:
                if token_info == 'cw<in<index-see_':
                    end_bracket_count = bracket_count - 1
                    in_see = 1
                changed_string += '%s\n' % line
        return changed_string, see_string

    def __index_bookmark_func(self, my_string):
        """
        Requires:
            my_string -- string of the whole index entry
        Returns:
            index_string -- string minus the bookmark_string
            bookmark_string -- the text string of the bookmark
        """
        # cw<an<place_____<nu<true
        in_bookmark = 0
        bracket_count = 0
        bookmark_string = ''
        index_string = ''
        lines = my_string.split('\n')
        # sentinel: no bookmark group found yet
        end_bracket_count = sys.maxsize
        for line in lines:
            token_info = line[:16]
            if token_info == 'ob<nu<open-brack':
                bracket_count += 1
            if token_info == 'cb<nu<clos-brack':
                bracket_count -= 1
            if in_bookmark:
                if bracket_count == end_bracket_count and token_info == 'cb<nu<clos-brack':
                    in_bookmark = 0
                    index_string += '%s\n' % line
                else:
                    if token_info == 'tx<nu<__________':
                        bookmark_string += line[17:]
                    else:
                        index_string += '%s\n' % line
            else:
                if token_info == 'cw<an<place_____':
                    end_bracket_count = bracket_count - 1
                    in_bookmark = 1
                index_string += '%s\n' % line
        return index_string, bookmark_string

    def __index__format_func(self, my_string):
        """
        Scan an index entry for bold/italic flags.
        Returns (italics, bold) as 0/1 ints.
        """
        italics = 0
        bold = 0
        lines = my_string.split('\n')
        for line in lines:
            token_info = line[:16]
            if token_info == 'cw<in<index-bold':
                bold = 1
            if token_info == 'cw<in<index-ital':
                italics = 1
        return italics, bold

    def __parse_toc_func(self, my_string):
        """
        Requires:
            my_string -- all the string in the toc entry
        Returns:
            A string for a toc-entry field tag.
        Logic:
            Separate out any bookmark start/end text, read the toc level
            and suppress-number flags, and collect the remaining text as
            the main entry.
        """
        toc_level = 0
        toc_suppress = 0
        my_string, book_start_string, book_end_string =\
            self.__parse_bookmark_for_toc(my_string)
        main_entry = ''
        my_changed_string = 'mi<tg<empty-att_<field<type>toc-entry'
        my_changed_string += '<update>static'
        if book_start_string:
            my_changed_string += '<bookmark-start>%s' % book_start_string
        if book_end_string:
            my_changed_string += '<bookmark-end>%s' % book_end_string
        lines = my_string.split('\n')
        for line in lines:
            token_info = line[:16]
            if token_info[0:2] == 'tx':
                main_entry += line[17:]
            if token_info == 'cw<tc<toc-level_':
                toc_level = line[20:]
            if token_info == 'cw<tc<toc-sup-nu':
                toc_suppress = 1
        if toc_level:
            my_changed_string += '<toc-level>%s' % toc_level
        if toc_suppress:
            my_changed_string += '<toc-suppress-number>true'
        my_changed_string += '<main-entry>%s' % main_entry
        my_changed_string += '\n'
        return my_changed_string

    def __parse_bookmark_for_toc(self, my_string):
        """
        Requires:
            my_string --string of toc, with new lines
        Returns:
            toc_string -- string minus bookmarks
            book_start_string -- bookmark-start text
            book_end_string -- bookmark-end text
        """
        in_bookmark = 0
        bracket_count = 0
        book_start_string = ''
        book_end_string = ''
        book_type = 0
        toc_string = ''
        lines = my_string.split('\n')
        # sentinel: no bookmark group found yet
        end_bracket_count = sys.maxsize
        for line in lines:
            token_info = line[:16]
            if token_info == 'ob<nu<open-brack':
                bracket_count += 1
            if token_info == 'cb<nu<clos-brack':
                bracket_count -= 1
            if in_bookmark:
                if bracket_count == end_bracket_count and token_info == 'cb<nu<clos-brack':
                    in_bookmark = 0
                    toc_string += '%s\n' % line
                else:
                    if token_info == 'tx<nu<__________':
                        if book_type == 'start':
                            book_start_string += line[17:]
                        elif book_type == 'end':
                            book_end_string += line[17:]
                    else:
                        toc_string += '%s\n' % line
            else:
                if token_info == 'cw<an<book-mk-st' or token_info =='cw<an<book-mk-en':
                    if token_info == 'cw<an<book-mk-st':
                        book_type = 'start'
                    if token_info == 'cw<an<book-mk-en':
                        book_type = 'end'
                    end_bracket_count = bracket_count - 1
                    in_bookmark = 1
                toc_string += '%s\n' % line
        return toc_string, book_start_string, book_end_string

    def __parse_bookmark_func(self, my_string, field_type):
        """
        Requires:
            my_string --string to parse
            field_type --type of string ('bookmark-start' or 'bookmark-end')
        Returns:
            A string formatted for a field instruction.
        Logic:
            The type is the name; the id is the complete text string.
        """
        my_changed_string = ('mi<tg<empty-att_<field<type>%s'
            '<number>%s<update>none\n' % (field_type, my_string))
        return my_changed_string

    def __found_toc_index_func(self, line, tag):
        """
        Requires:
            line --the line to parse
            tag --'toc' or 'index'
        Returns:
            nothing
        Logic:
            This function is called when a toc or index entry is found. The
            opening bracket count is stored in the beginning bracket count.
            The state is changed to 'toc_index.'
        """
        self.__beg_bracket_count = self.__ob_count
        self.__cb_count = 0
        self.__state = 'toc_index'
        self.__tag = tag

    def __toc_index_func(self, line):
        """
        Requires:
            line --the line to parse
        Returns:
            nothing
        Logic:
            This function handles all lines within a toc or index entry. It
            adds each line to a string until the end of the entry is found,
            processes the collected string, and prints out the result.
        """
        if self.__beg_bracket_count == self.__cb_count:
            self.__state = 'body'
            # self.__tag is always 'toc' or 'index' (set from __body_dict)
            entry_type = self.__tag
            if entry_type == 'index':
                my_string = self.__parse_index_func(
                    self.__text_string)
            elif entry_type == 'toc':
                my_string = self.__parse_toc_func(
                    self.__text_string)
            self.__write_obj.write(self.__marker)
            self.__write_obj.write(my_string)
            self.__text_string = ''
            self.__write_obj.write(line)
        else:
            self.__text_string += line

    def fix_fields(self):
        """
        Requires:
            nothing
        Returns:
            nothing (changes the original file)
        Logic:
            Read one line in at a time. Determine what action to take based
            on the state. If the state is before the body, look for the
            beginning of the body. The other two states are toc_index (for
            toc and index entries) and bookmark.
        """
        self.__initiate_values()
        with open_for_read(self.__file) as read_obj:
            with open_for_write(self.__write_to) as self.__write_obj:
                for line in read_obj:
                    self.__token_info = line[:16]
                    if self.__token_info == 'ob<nu<open-brack':
                        self.__ob_count = line[-5:-1]
                    if self.__token_info == 'cb<nu<clos-brack':
                        self.__cb_count = line[-5:-1]
                    action = self.__state_dict.get(self.__state)
                    if action is None:
                        sys.stderr.write('No matching state in module fields_small.py\n')
                        sys.stderr.write(self.__state + '\n')
                    action(line)
        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "fields_small.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)
|
||||
226
ebook_converter/ebooks/rtf2xml/fonts.py
Normal file
226
ebook_converter/ebooks/rtf2xml/fonts.py
Normal file
@@ -0,0 +1,226 @@
|
||||
from __future__ import unicode_literals, absolute_import, print_function, division
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import sys, os
|
||||
|
||||
from calibre.ebooks.rtf2xml import copy
|
||||
from calibre.ptempfile import better_mktemp
|
||||
from . import open_for_read, open_for_write
|
||||
|
||||
|
||||
class Fonts:
    """
    Change lines with font info from font numbers to the actual font names.
    """

    def __init__(self,
            in_file,
            bug_handler,
            default_font_num,
            copy=None,
            run_level=1,
            ):
        """
        Required:
            'in_file'--file to parse
            'default_font_num'--the default font number
        Optional:
            'copy'-- whether to make a copy of result for debugging
            'run_level'--verbosity/strictness level
        Returns:
            nothing
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__default_font_num = default_font_num
        # results are written to a temp file, then renamed over the input
        # file at the end of convert_fonts()
        self.__write_to = better_mktemp()
        self.__run_level = run_level

    def __initiate_values(self):
        """
        Initiate all values used by the line-by-line state machine.
        """
        # records whether any of these special (symbol) fonts was seen;
        # returned to the caller by convert_fonts()
        self.__special_font_dict = {
            'Symbol'        : 0,
            'Wingdings'     : 0,
            'Zapf Dingbats' : 0,
        }
        self.__special_font_list = [
            'Symbol', 'Wingdings', 'Zapf Dingbats'
        ]
        self.__state = 'default'
        # dispatch table: current state -> handler for the next line
        self.__state_dict = {
            'default'           : self.__default_func,
            'font_table'        : self.__font_table_func,
            'after_font_table'  : self.__after_font_table_func,
            'font_in_table'     : self.__font_in_table_func,
        }
        # maps font number (string) -> font name
        self.__font_table = {}
        # individual font written
        self.__wrote_ind_font = 0

    def __default_func(self, line):
        """
        Requires:
            line
        Returns:
            nothing
        Handle all lines before the font table. Check for the beginning of
        the font table. If found, change the state. Print out all lines.
        """
        if self.__token_info == 'mi<mk<fonttb-beg':
            self.__state = 'font_table'
        self.__write_obj.write(line)

    def __font_table_func(self, line):
        """
        Requires:
            line
        Returns:
            nothing
        Logic:
            If the self.__token_info indicates that you have reached the end
            of the font table, then change the state to after the font table.
            If the self.__token_info indicates that there is a font in the
            table, change the state to font in table. Reset the number of the
            font to the default font (in case there is no number provided, in
            which case RTF assumes the number will be the default font.)
            Reset the text string (for the font name) to ''.
        """
        if self.__token_info == 'mi<mk<fonttb-end':
            self.__state = 'after_font_table'
        elif self.__token_info == 'mi<mk<fontit-beg':
            self.__state = 'font_in_table'
            self.__font_num = self.__default_font_num
            self.__text_line = ''
        # NOTE: lines inside the font table are deliberately not echoed to
        # the output file.

    def __font_in_table_func(self, line):
        """
        Requires:
            line
        Returns:
            nothing
        Logic:
            Check for four conditions:
                The line contains font-info. In this case, store the number
                in self.__font_num.
                The line contains text. In this case, add to the text string
                self.__text_line.
                The line marks the end of the individual font in the table.
                In this case, add a new key-> value pair to the font-table
                dictionary. Also create an empty tag with the name and
                number as attributes.
                Premature end of font table.
        """
        # example tokens:
        # cw<ci<font-style<nu<4
        # tx<nu<__________<Times;
        if self.__token_info == 'mi<mk<fontit-end':
            self.__wrote_ind_font = 1
            self.__state = 'font_table'
            self.__text_line = self.__text_line[:-1]  # get rid of last ';'
            self.__font_table[self.__font_num] = self.__text_line
            self.__write_obj.write(
                'mi<tg<empty-att_'
                '<font-in-table<name>%s<num>%s\n' % (self.__text_line, self.__font_num)
            )
        elif self.__token_info == 'cw<ci<font-style':
            self.__font_num = line[20:-1]
        elif self.__token_info == 'tx<nu<__________' or \
                self.__token_info == 'tx<ut<__________':
            self.__text_line += line[17:-1]
        elif self.__token_info == 'mi<mk<fonttb-end':
            # premature end of the font table
            self.__found_end_font_table_func()
            self.__state = 'after_font_table'

    def __found_end_font_table_func(self):
        """
        Required:
            nothing
        Returns:
            nothing
        Logic:
            If no individual fonts have been written, write one out so the
            output always contains at least one font-in-table tag.
        """
        if not self.__wrote_ind_font:
            self.__write_obj.write(
                'mi<tg<empty-att_'
                '<font-in-table<name>Times<num>0\n')

    def __after_font_table_func(self, line):
        """
        Required:
            line
        Returns:
            nothing
        Logic:
            Check the self.__token_info. If this matches a token with font
            info, then extract the number from the line, and look up the font
            name in the font dictionary. If no name exists for that number,
            report an error. Otherwise print out the same line, except with
            the name rather than the number.
            If the line does not contain font info, simply print it out to
            the file.
        """
        if self.__token_info == 'cw<ci<font-style':
            font_num = line[20:-1]
            font_name = self.__font_table.get(font_num)
            if font_name is None:
                # only strict run levels treat a missing font as fatal
                if self.__run_level > 3:
                    msg = 'no value for %s in self.__font_table\n' % font_num
                    raise self.__bug_handler(msg)
            else:
                # remember that a special (symbol) font was actually used
                if font_name in self.__special_font_list:
                    self.__special_font_dict[font_name] = 1
                self.__write_obj.write(
                    'cw<ci<font-style<nu<%s\n' % font_name
                )
        else:
            self.__write_obj.write(line)

    def convert_fonts(self):
        """
        Required:
            nothing
        Returns:
            a dictionary with values for special fonts
        Logic:
            Read one line in at a time. Determine what action to take based
            on the state. If the state is font_table, look for individual
            fonts and add the number and font name to a dictionary. Also
            create a tag for each individual font in the font table.
            If the state is after the font table, look for lines with font
            info. Substitute a font name for a font number.
        """
        self.__initiate_values()
        with open_for_read(self.__file) as read_obj:
            with open_for_write(self.__write_to) as self.__write_obj:
                for line in read_obj:
                    self.__token_info = line[:16]
                    action = self.__state_dict.get(self.__state)
                    if action is None:
                        sys.stderr.write('no matching state in module fonts.py\n' + self.__state + '\n')
                    action(line)
        # record the resolved default font (or a placeholder) for callers
        default_font_name = self.__font_table.get(self.__default_font_num)
        if not default_font_name:
            default_font_name = 'Not Defined'
        self.__special_font_dict['default-font'] = default_font_name
        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "fonts.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)
        return self.__special_font_dict
|
||||
264
ebook_converter/ebooks/rtf2xml/footnote.py
Normal file
264
ebook_converter/ebooks/rtf2xml/footnote.py
Normal file
@@ -0,0 +1,264 @@
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import os
|
||||
|
||||
from calibre.ebooks.rtf2xml import copy
|
||||
from calibre.ptempfile import better_mktemp
|
||||
from polyglot.builtins import unicode_type
|
||||
|
||||
from . import open_for_read, open_for_write
|
||||
|
||||
|
||||
class Footnote:
|
||||
"""
|
||||
Two public methods are available. The first separates all of the
|
||||
footnotes from the body and puts them at the bottom of the text, where
|
||||
they are easier to process. The second joins those footnotes to the
|
||||
proper places in the body.
|
||||
"""
|
||||
|
||||
def __init__(self,
        in_file ,
        bug_handler,
        copy=None,
        run_level=1,
        ):
    """
    Required:
        'in_file'--file to parse
        'bug_handler'--exception class raised on internal errors
    Optional:
        'copy'-- whether to make a copy of result for debugging
        'run_level'--verbosity/strictness level (currently unused here)
    Returns:
        nothing
    """
    self.__file = in_file
    self.__bug_handler = bug_handler
    self.__copy = copy
    # results are written to a temp file, then renamed over the input file
    self.__write_to = better_mktemp()
    # set to 1 as soon as any footnote token is seen
    self.__found_a_footnote = 0
|
||||
|
||||
def __first_line_func(self, line):
    """
    Print the tag info for footnotes. Check whether the footnote is an
    endnote and make the tag according to that.
    """
    # 'cw<nt<type______' marks an endnote rather than a plain footnote
    if self.__token_info == 'cw<nt<type______':
        self.__write_to_foot_obj.write(
            'mi<tg<open-att__<footnote<type>endnote<num>%s\n' % self.__footnote_count)
    else:
        self.__write_to_foot_obj.write(
            'mi<tg<open-att__<footnote<num>%s\n' % self.__footnote_count)
    # the opening tag is only written once per footnote
    self.__first_line = 0
|
||||
|
||||
def __in_footnote_func(self, line):
    """Handle all tokens that are part of a footnote."""
    # write the opening footnote tag before anything else
    if self.__first_line:
        self.__first_line_func(line)
    # a footnote-mark token: echo it, followed by the footnote number as text
    if self.__token_info == 'cw<ci<footnot-mk':
        num = unicode_type(self.__footnote_count)
        self.__write_to_foot_obj.write(line)
        self.__write_to_foot_obj.write(
            'tx<nu<__________<%s\n' % num
        )
    # the bracket that opened the footnote has closed: the footnote is done
    if self.__cb_count == self.__footnote_bracket_count:
        self.__in_footnote = 0
        # the closing bracket itself belongs to the main body
        self.__write_obj.write(line)
        self.__write_to_foot_obj.write(
            'mi<mk<foot___clo\n')
        self.__write_to_foot_obj.write(
            'mi<tg<close_____<footnote\n')
        self.__write_to_foot_obj.write(
            'mi<mk<footnt-clo\n')
    else:
        # ordinary footnote content goes to the temporary footnote file
        self.__write_to_foot_obj.write(line)
|
||||
|
||||
def __found_footnote(self, line):
    """Record that a footnote has started and write its markers."""
    self.__found_a_footnote = 1
    self.__in_footnote = 1
    self.__first_line = 1
    self.__footnote_count += 1
    # temporarily set this to zero so the closing-bracket comparison in
    # __in_footnote_func cannot match before the footnote's own bracket
    self.__cb_count = 0
    # the bracket that opened this footnote; when the matching close
    # bracket appears, the footnote is finished
    self.__footnote_bracket_count = self.__ob_count
    # matching index/open markers (same zero-padded number) let the
    # footnotes be rejoined to the body later
    self.__write_obj.write(
        'mi<mk<footnt-ind<%04d\n' % self.__footnote_count)
    self.__write_to_foot_obj.write(
        'mi<mk<footnt-ope<%04d\n' % self.__footnote_count)
|
||||
|
||||
def __default_sep(self, line):
    """Handle all tokens that are not footnote tokens."""
    if self.__token_info == 'cw<nt<footnote__':
        self.__found_footnote(line)
    self.__write_obj.write(line)
    # a footnote mark in the body text: write the number of the footnote
    # that is about to follow (hence count + 1)
    if self.__token_info == 'cw<ci<footnot-mk':
        num = unicode_type(self.__footnote_count + 1)
        self.__write_obj.write(
            'tx<nu<__________<%s\n' % num
        )
|
||||
|
||||
def __initiate_sep_values(self):
    """Reset all counters and flags used by separate_footnotes()."""
    # bracket bookkeeping
    self.__bracket_count = self.__footnote_bracket_count = 0
    self.__ob_count = self.__cb_count = 0
    # footnote-parsing state: not inside a footnote, first line of the
    # next footnote not yet processed, no footnotes seen so far
    self.__in_footnote = self.__first_line = 0
    self.__footnote_count = 0
|
||||
|
||||
def separate_footnotes(self):
    """
    Separate all the footnotes in an RTF file and put them at the bottom,
    where they are easier to process. Each time a footnote is found,
    print all of its contents to a temporary file. Close both the main and
    temporary file. Print the footnotes from the temporary file to the
    bottom of the main file.
    """
    self.__initiate_sep_values()
    # temporary file that collects footnote bodies during the first pass
    self.__footnote_holder = better_mktemp()
    with open_for_read(self.__file) as read_obj:
        with open_for_write(self.__write_to) as self.__write_obj:
            with open_for_write(self.__footnote_holder) as self.__write_to_foot_obj:
                for line in read_obj:
                    self.__token_info = line[:16]
                    # keep track of opening and closing brackets
                    if self.__token_info == 'ob<nu<open-brack':
                        self.__ob_count = line[-5:-1]
                    if self.__token_info == 'cb<nu<clos-brack':
                        self.__cb_count = line[-5:-1]
                    # In the middle of footnote text
                    if self.__in_footnote:
                        self.__in_footnote_func(line)
                    # not in the middle of footnote text
                    else:
                        self.__default_sep(line)
    # second pass: append the collected footnotes, wrapped in close tags
    # for the document structure, to the bottom of the output file
    with open_for_read(self.__footnote_holder) as read_obj:
        with open_for_write(self.__write_to, append=True) as write_obj:
            write_obj.write(
                'mi<mk<sect-close\n'
                'mi<mk<body-close\n'
                'mi<tg<close_____<section\n'
                'mi<tg<close_____<body\n'
                'mi<tg<close_____<doc\n'
                'mi<mk<footnt-beg\n')
            for line in read_obj:
                write_obj.write(line)
            write_obj.write(
                'mi<mk<footnt-end\n')
    os.remove(self.__footnote_holder)
    copy_obj = copy.Copy(bug_handler=self.__bug_handler)
    if self.__copy:
        copy_obj.copy_file(self.__write_to, "footnote_separate.data")
    copy_obj.rename(self.__write_to, self.__file)
    os.remove(self.__write_to)
|
||||
|
||||
def update_info(self, file, copy):
    """Point this instance at a new input file and copy flag.

    (Currently unused.)
    """
    self.__copy = copy
    self.__file = file
|
||||
|
||||
def __get_foot_body_func(self, line):
    """
    Process lines in the main body and look for the beginning of the
    footnotes section; when found, switch to the 'foot' state.
    """
    # mi<mk<footnt-end marks the end of that section
    if self.__token_info == 'mi<mk<footnt-beg':
        self.__state = 'foot'
    else:
        self.__write_obj.write(line)
|
||||
|
||||
def __get_foot_foot_func(self, line):
    """Divert footnote lines to the temporary footnote file until the
    end-of-footnotes marker flips the state back to 'body'."""
    if self.__token_info != 'mi<mk<footnt-end':
        self.__write_to_foot_obj.write(line)
    else:
        self.__state = 'body'
|
||||
|
||||
def __get_footnotes(self):
    """Split the footnotes out of the main file.

    Streams the input line by line.  While the state is 'body' each line
    goes through __get_foot_body_func; while it is 'foot' it is diverted
    by __get_foot_foot_func.  The two handlers flip the state between
    themselves when they meet the footnote markers.
    """
    with open_for_read(self.__file) as read_obj, \
            open_for_write(self.__write_to) as self.__write_obj, \
            open_for_write(self.__footnote_holder) as self.__write_to_foot_obj:
        for line in read_obj:
            self.__token_info = line[:16]
            if self.__state == 'body':
                self.__get_foot_body_func(line)
            elif self.__state == 'foot':
                self.__get_foot_foot_func(line)
|
||||
|
||||
def __get_foot_from_temp(self, num):
    """Scan the footnote temp file for footnote *num* and return its text.

    Reads forward from the file's current position until the opening
    marker for this footnote is seen, then collects every line up to
    (but excluding) the closing marker and returns them as one string.

    NOTE(review): if the opening marker is never found the implicit
    return value is None, which the caller passes to write() --
    presumably the markers are always correctly paired; confirm.
    """
    opening = 'mi<mk<footnt-ope<%s\n' % num
    collecting = False
    collected = []
    for line in self.__read_from_foot_obj:
        if not collecting:
            collecting = line == opening
        elif line == 'mi<mk<footnt-clo\n':
            return ''.join(collected)
        else:
            collected.append(line)
|
||||
|
||||
def __join_from_temp(self):
    """Merge the body and footnote streams into self.__write_to2.

    Copies the footnote-free body through unchanged; whenever a footnote
    index marker is met, that marker line is replaced by the text of the
    matching footnote pulled from the temporary footnote file.
    """
    with open_for_read(self.__footnote_holder) as self.__read_from_foot_obj, \
            open_for_read(self.__write_to) as read_obj, \
            open_for_write(self.__write_to2) as self.__write_obj:
        for line in read_obj:
            if line[:16] == 'mi<mk<footnt-ind':
                line = self.__get_foot_from_temp(line[17:-1])
            self.__write_obj.write(line)
|
||||
|
||||
def join_footnotes(self):
    """Rejoin the footnotes at the bottom of the file to their original
    positions in the body.

    Does nothing unless separate_footnotes() actually found a footnote.
    Otherwise: split the file into a body-only temp file and a
    footnote-only temp file, merge the two into self.__write_to2 with the
    footnotes back in place, then promote that merged file to
    self.__file.
    """
    if not self.__found_a_footnote:
        return
    self.__write_to2 = better_mktemp()
    # the state machine in __get_footnotes() starts out in the body
    self.__state = 'body'
    self.__get_footnotes()      # body -> __write_to, notes -> __footnote_holder
    self.__join_from_temp()     # merge both streams into __write_to2
    # the files are opened with context managers, so no explicit close is
    # needed here:
    # self.__write_obj.close()
    # self.__read_from_foot_obj.close()
    copy_obj = copy.Copy(bug_handler=self.__bug_handler)
    if self.__copy:
        copy_obj.copy_file(self.__write_to2, "footnote_joined.data")
    copy_obj.rename(self.__write_to2, self.__file)
    os.remove(self.__write_to2)
    os.remove(self.__footnote_holder)
|
||||
62
ebook_converter/ebooks/rtf2xml/get_char_map.py
Normal file
62
ebook_converter/ebooks/rtf2xml/get_char_map.py
Normal file
@@ -0,0 +1,62 @@
|
||||
from __future__ import unicode_literals, absolute_import, print_function, division
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
|
||||
|
||||
class GetCharMap:
    """Extract a single character map from the shared mapping file.

    The mapping file contains one section per encoding, delimited by
    ``<name>`` ... ``</name>`` lines; each non-blank line inside a section
    holds colon-separated fields, with ``\\colon`` escaping a literal ':'.
    """

    def __init__(self, bug_handler, char_file):
        """
        Required:
            'bug_handler' -- exception type raised when a map is missing
            'char_file' -- an open, seekable file with the mappings
        Returns:
            nothing
        """
        self.__char_file = char_file
        self.__bug_handler = bug_handler

    def get_char_map(self, map):
        """Return a dict mapping source chars to replacements for *map*.

        Raises self.__bug_handler when no section named *map* exists.
        """
        # if map == 'ansicpg10000':
        #     map = 'mac_roman'
        found_map = False
        map_dict = {}
        # section delimiters are loop-invariant; build them once
        begin_element = '<%s>' % map
        end_element = '</%s>' % map
        self.__char_file.seek(0)
        for line in self.__char_file:
            if not line.strip():
                continue
            if not found_map:
                if begin_element in line:
                    found_map = True
            else:
                if end_element in line:
                    break
                fields = line.split(':')
                # BUG FIX: str.replace returns a *new* string; the original
                # discarded the result, so '\colon' escapes never became ':'
                fields[1] = fields[1].replace('\\colon', ':')
                map_dict[fields[1]] = fields[3]
        if not found_map:
            msg = 'no map found\nmap is "%s"\n' % (map,)
            raise self.__bug_handler(msg)
        return map_dict
|
||||
306
ebook_converter/ebooks/rtf2xml/group_borders.py
Normal file
306
ebook_converter/ebooks/rtf2xml/group_borders.py
Normal file
@@ -0,0 +1,306 @@
|
||||
from __future__ import unicode_literals, absolute_import, print_function, division
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import sys, os, re
|
||||
from calibre.ebooks.rtf2xml import copy
|
||||
from calibre.ptempfile import better_mktemp
|
||||
from . import open_for_read, open_for_write
|
||||
|
||||
|
||||
class GroupBorders:
    """Wrap runs of bordered paragraphs in a single border-group element.

    Consecutive paragraph definitions that carry the same border
    attributes are bracketed by ``border-group`` open/close tags so later
    stages can render one continuous border around the whole run.
    """

    def __init__(self,
            in_file,
            bug_handler,
            copy=None,
            run_level=1,
            wrap=0,
            ):
        """
        Required:
            'in_file' -- the file to process
            'bug_handler' -- exception type raised on internal errors
        Optional:
            'copy' -- whether to make a copy of the result for debugging
            'run_level' -- verbosity/strictness level
            'wrap' -- unused here; kept for interface parity with GroupStyles
        Returns:
            nothing
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__run_level = run_level
        self.__write_to = better_mktemp()
        self.__wrap = wrap

    def __initiate_values(self):
        """
        Build the state-machine tables.

        self.__end_list holds every token that forces an open border group
        to close (section/table/cell/footnote/header boundaries and
        style-group markers).
        """
        self.__state = "default"
        self.__left_indent = 0
        self.__border_num = 0
        self.__list_type = 'not-defined'
        self.__pard_def = ""
        self.__all_lists = []
        self.__list_chunk = ''
        self.__state_dict = {
            'default': self.__default_func,
            'in_pard': self.__in_pard_func,
            'after_pard': self.__after_pard_func,
        }
        self.__end_list = [
            # section boundaries
            'mi<mk<sect-close',
            'mi<mk<sect-start',
            # table begin
            'mi<mk<tabl-start',
            # field block boundaries
            'mi<mk<fldbk-end_',
            'mi<mk<fldbkstart',
            # cell end
            'mi<mk<close_cell',
            # item end (listed once; the original repeated it three times)
            'mi<tg<item_end__',
            # footnote boundaries
            'mi<mk<foot___clo',
            'mi<mk<footnt-ope',
            # heading boundaries
            'mi<mk<header-beg',
            'mi<mk<header-end',
            'mi<mk<head___clo',
            # lists
            # BUG FIX: a missing trailing comma here made Python join this
            # literal with the next one into the single bogus token
            # 'mi<mk<list_startmi<mk<style-grp_', so neither marker ever
            # ended a border group
            'mi<mk<list_start',
            # style-group markers
            'mi<mk<style-grp_',
            'mi<mk<style_grp_',
            'mi<mk<style_gend',
            'mi<mk<stylegend_',
            # deliberately not used:
            # 'mi<mk<body-close',
            # 'mi<mk<par-in-fld',
            # 'cw<tb<cell______',
            # 'cw<tb<row-def___',
            # 'cw<tb<row_______',
            # 'mi<mk<sec-fd-beg',
        ]
        # strips '<name>...' so a style name containing 'border-paragraph'
        # cannot trigger a false positive in __is_border_func
        self.__name_regex = re.compile(r'(<name>[^<]+)')
        self.__found_appt = 0
        self.__line_num = 0
        # splits a paragraph-definition line into border / non-border parts
        # (a dead duplicate assignment of a throwaway r'border-paragraph'
        # pattern to this same attribute has been removed)
        self.__border_regex = re.compile(
            r'(<border-paragraph[^<]+|<border-for-every-paragraph[^<]+)')
        self.__last_border_string = ''

    def __in_pard_func(self, line):
        """
        Inside a bordered paragraph definition: pass lines through until
        the closing paragraph-definition tag, then wait in 'after_pard' to
        see whether the next definition continues the same border group.
        """
        if self.__token_info == 'mi<tg<close_____' \
                and line[17:-1] == 'paragraph-definition':
            self.__state = 'after_pard'
        else:
            self.__write_obj.write(line)

    def __after_pard_func(self, line):
        """
        Between paragraph definitions: buffer lines until either another
        definition opens (it may extend the group), a group-ending token
        arrives (close the group and flush), or a stray close tag shows a
        parsing error.
        """
        if self.__token_info == 'mi<tg<open-att__' \
                and line[17:37] == 'paragraph-definition':
            # found paragraph definition
            self.__pard_after_par_def_func(line)
        elif self.__token_info == 'mi<tg<close_____' \
                and line[17:-1] == 'paragraph-definition':
            sys.stderr.write('Wrong flag in __after_pard_func\n')
            if self.__run_level > 2:
                msg = 'wrong flag'
                raise self.__bug_handler(msg)
        elif self.__token_info in self.__end_list:
            self.__write_obj.write('mi<tg<close_____<paragraph-definition\n')
            self.__write_end_border_tag()
            self.__write_obj.write(self.__list_chunk)
            self.__list_chunk = ''
            self.__state = 'default'
            self.__write_obj.write(line)
        else:
            self.__list_chunk += line

    def __close_pard_(self, line):
        # Flush a buffered group.  Currently unreferenced.
        # BUG FIX: this previously called self.__write_end_wrap(), a method
        # that exists only in GroupStyles; invoking it would have raised
        # AttributeError.
        self.__write_obj.write(self.__list_chunk)
        self.__write_obj.write('mi<tg<close_____<paragraph-definition\n')
        self.__write_end_border_tag()
        self.__list_chunk = ''
        self.__state = 'default'

    def __pard_after_par_def_func(self, line):
        """
        A new paragraph definition opened right after one closed.  If it
        has the same border attributes the group continues; a different
        border starts a new group; no border closes the group entirely.
        """
        is_border = self.__is_border_func(line)
        if not is_border:
            self.__write_obj.write('mi<tg<close_____<paragraph-definition\n')
            self.__write_end_border_tag()
            self.__write_obj.write(self.__list_chunk)
            self.__write_obj.write(line)
            self.__state = 'default'
            self.__list_chunk = ''
        else:
            border_string, pard_string = self.__parse_pard_with_border(line)
            if self.__last_border_string == border_string:
                # same border: keep the current group open
                self.__write_obj.write('mi<tg<close_____<paragraph-definition\n')
                self.__write_obj.write(self.__list_chunk)
                self.__list_chunk = ''
                self.__state = 'in_pard'
                self.__write_obj.write(pard_string)
            else:
                # different border: close the old group, open a new one
                self.__write_obj.write('mi<tg<close_____<paragraph-definition\n')
                self.__write_end_border_tag()
                self.__write_obj.write(self.__list_chunk)
                self.__write_start_border_tag(border_string)
                self.__write_obj.write(pard_string)
                self.__state = 'in_pard'
                self.__last_border_string = border_string
                self.__list_chunk = ''

    def __default_func(self, line):
        """
        Outside any group: when a paragraph definition with border
        attributes opens, start a border group and switch to 'in_pard'.
        Everything else passes straight through.
        """
        if self.__token_info == 'mi<tg<open-att__' \
                and line[17:37] == 'paragraph-definition':
            contains_border = self.__is_border_func(line)
            if contains_border:
                border_string, pard_string = self.__parse_pard_with_border(line)
                self.__write_start_border_tag(border_string)
                self.__write_obj.write(pard_string)
                self.__last_border_string = border_string
                self.__state = 'in_pard'
            else:
                self.__write_obj.write(line)
        else:
            self.__write_obj.write(line)

    def __write_start_border_tag(self, the_string):
        # Emit the group-opening marker plus an open-att tag carrying the
        # border attributes and a unique sNNNN group number.
        self.__write_obj.write('mi<mk<start-brdg\n')
        self.__border_num += 1
        num = '%04d' % self.__border_num
        num_string = 's%s' % num
        the_string += '<num>%s' % num_string
        self.__write_obj.write('mi<tg<open-att__<border-group%s\n' % the_string)

    def __write_end_border_tag(self):
        self.__write_obj.write('mi<mk<end-brdg__\n')
        self.__write_obj.write('mi<tg<close_____<border-group\n')

    def __is_border_func(self, line):
        """Return 1 when the line carries border-paragraph attributes,
        0 otherwise.  Style names are stripped first so a style called,
        e.g., 'My border-paragraph style' does not false-positive."""
        line = re.sub(self.__name_regex, '', line)
        index = line.find('border-paragraph')
        if index > -1:
            return 1
        return 0

    def __parse_pard_with_border(self, line):
        """Split a paragraph-definition line into (border attributes,
        everything else)."""
        border_string = ''
        pard_string = ''
        tokens = re.split(self.__border_regex, line)
        for token in tokens:
            if token[0:17] == '<border-paragraph':
                border_string += token
            else:
                pard_string += token
        return border_string, pard_string

    def __write_pard_with_border(self, line):
        # Unused variant of __parse_pard_with_border that writes the parts
        # out directly instead of returning them.
        border_string = ''
        pard_string = ''
        tokens = re.split(self.__border_regex, line)
        for token in tokens:
            if token[0:17] == '<border-paragraph':
                border_string += token
            else:
                pard_string += token
        self.__write_start_border_tag(border_string)
        self.__write_obj.write(pard_string)

    def __get_style_name(self, line):
        # Track the most recently announced style name.
        # NOTE(review): nothing in this class reads __style_name; the
        # tracking appears to be copied from GroupStyles.
        if self.__token_info == 'mi<mk<style-name':
            self.__style_name = line[17:-1]

    def group_borders(self):
        """
        Required:
            nothing
        Returns:
            nothing; the original file is rewritten in place
        Logic:
            Feed every line through the state machine, then promote the
            temporary output over the input file.  (The original manual
            readline() loop also fed the final empty string through the
            state machine -- a no-op write, now dropped.)
        """
        self.__initiate_values()
        with open_for_read(self.__file) as read_obj, \
                open_for_write(self.__write_to) as self.__write_obj:
            for line in read_obj:
                self.__token_info = line[:16]
                self.__get_style_name(line)
                action = self.__state_dict.get(self.__state)
                action(line)
        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "group_borders.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)
|
||||
252
ebook_converter/ebooks/rtf2xml/group_styles.py
Normal file
252
ebook_converter/ebooks/rtf2xml/group_styles.py
Normal file
@@ -0,0 +1,252 @@
|
||||
from __future__ import unicode_literals, absolute_import, print_function, division
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import sys, os, re
|
||||
from calibre.ebooks.rtf2xml import copy
|
||||
from calibre.ptempfile import better_mktemp
|
||||
from . import open_for_read, open_for_write
|
||||
|
||||
|
||||
class GroupStyles:
    """Wrap runs of identically-styled paragraphs in a style-group element.

    Consecutive paragraph definitions that share a style name are
    bracketed by style-group open/close markers (only when wrapping is
    requested) so later stages can treat them as one unit.
    """

    def __init__(self,
            in_file,
            bug_handler,
            copy=None,
            run_level=1,
            wrap=0,
            ):
        """
        Required:
            'in_file' -- the file to process
            'bug_handler' -- exception type raised on internal errors
        Optional:
            'copy' -- whether to make a copy of the result for debugging
            'run_level' -- verbosity/strictness level
            'wrap' -- when true, emit the style-group wrapper tags
        Returns:
            nothing
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__run_level = run_level
        self.__write_to = better_mktemp()
        self.__wrap = wrap

    def __initiate_values(self):
        """
        Build the state-machine tables.

        self.__end_list holds every token that forces an open paragraph
        group to close (section/table/cell/footnote/header boundaries).
        """
        self.__state = "default"
        self.__left_indent = 0
        self.__list_type = 'not-defined'
        self.__pard_def = ""
        self.__all_lists = []
        self.__list_chunk = ''
        # ROBUSTNESS FIX: both names were previously left unset, so a
        # malformed file whose first paragraph definition precedes any
        # 'mi<mk<style-name' token crashed with AttributeError in
        # __default_func
        self.__style_name = ''
        self.__last_style_name = ''
        self.__state_dict = {
            'default': self.__default_func,
            'in_pard': self.__in_pard_func,
            'after_pard': self.__after_pard_func,
        }
        self.__end_list = [
            # section boundaries
            'mi<mk<sect-close',
            'mi<mk<sect-start',
            # table begin
            'mi<mk<tabl-start',
            # field block boundaries
            'mi<mk<fldbk-end_',
            'mi<mk<fldbkstart',
            # cell end
            'mi<mk<close_cell',
            # item end (listed once; the original repeated it three times)
            'mi<tg<item_end__',
            # footnote boundaries
            'mi<mk<foot___clo',
            'mi<mk<footnt-ope',
            # heading boundaries
            'mi<mk<header-beg',
            'mi<mk<header-end',
            'mi<mk<head___clo',
            # lists
            'mi<mk<list_start',
            # deliberately not used:
            # 'mi<mk<body-close',
            # 'mi<mk<par-in-fld',
            # 'cw<tb<cell______',
            # 'cw<tb<row-def___',
            # 'cw<tb<row_______',
            # 'mi<mk<sec-fd-beg',
        ]
        self.__name_regex = re.compile(r'<name>')
        self.__found_appt = 0
        self.__line_num = 0

    def __in_pard_func(self, line):
        """
        Inside a paragraph definition: pass lines through until the
        closing paragraph-definition tag, then wait in 'after_pard' to
        see whether the next definition continues the same style group.
        """
        if self.__token_info == 'mi<tg<close_____' \
                and line[17:-1] == 'paragraph-definition':
            self.__state = 'after_pard'
        else:
            self.__write_obj.write(line)

    def __after_pard_func(self, line):
        """
        Between paragraph definitions: buffer lines until either another
        definition opens (it may extend the group), a group-ending token
        arrives (close the group and flush), or a stray close tag shows a
        parsing error.
        """
        if self.__token_info == 'mi<tg<open-att__' \
                and line[17:37] == 'paragraph-definition':
            # found paragraph definition
            self.__pard_after_par_def_func(line)
        elif self.__token_info == 'mi<tg<close_____' \
                and line[17:-1] == 'paragraph-definition':
            sys.stderr.write('Wrong flag in __after_pard_func\n')
            if self.__run_level > 2:
                msg = 'wrong flag'
                raise self.__bug_handler(msg)
        elif self.__token_info in self.__end_list:
            self.__write_obj.write('mi<tg<close_____<paragraph-definition\n')
            self.__write_end_wrap()
            self.__write_obj.write(self.__list_chunk)
            self.__list_chunk = ''
            self.__state = 'default'
            self.__write_obj.write(line)
        else:
            self.__list_chunk += line

    def __close_pard_(self, line):
        # Flush a buffered group.  Currently unreferenced.
        self.__write_obj.write(self.__list_chunk)
        self.__write_obj.write('mi<tg<close_____<paragraph-definition\n')
        self.__write_end_wrap()
        self.__list_chunk = ''
        self.__state = 'default'

    def __write_start_wrap(self, name):
        # Wrapper tags are only wanted when self.__wrap is set.
        if self.__wrap:
            self.__write_obj.write('mi<mk<style-grp_<%s\n' % name)
            self.__write_obj.write('mi<tg<open-att__<style-group<name>%s\n' % name)
            self.__write_obj.write('mi<mk<style_grp_<%s\n' % name)

    def __write_end_wrap(self):
        if self.__wrap:
            self.__write_obj.write('mi<mk<style_gend\n')
            self.__write_obj.write('mi<tg<close_____<style-group\n')
            self.__write_obj.write('mi<mk<stylegend_\n')

    def __pard_after_par_def_func(self, line):
        """
        A new paragraph definition opened right after one closed.  If the
        style name is unchanged the group continues; otherwise the old
        group is closed and a new one started for the new name.
        """
        if self.__last_style_name == self.__style_name:
            # same style: keep the current group open
            if self.__wrap:
                self.__write_obj.write('mi<tg<close_____<paragraph-definition\n')
            self.__write_obj.write(self.__list_chunk)
            self.__list_chunk = ''
            self.__state = 'in_pard'
            if self.__wrap:
                self.__write_obj.write(line)
        else:
            # different name for the paragraph definition
            self.__write_obj.write('mi<tg<close_____<paragraph-definition\n')
            self.__write_end_wrap()
            self.__write_obj.write(self.__list_chunk)
            self.__write_start_wrap(self.__style_name)
            self.__write_obj.write(line)
            self.__state = 'in_pard'
            self.__last_style_name = self.__style_name
            self.__list_chunk = ''

    def __default_func(self, line):
        """
        Outside any group: when a paragraph definition opens, remember
        its style name, start a wrapper, and switch to 'in_pard'.
        Everything else passes straight through.
        """
        if self.__token_info == 'mi<tg<open-att__' \
                and line[17:37] == 'paragraph-definition':
            self.__state = 'in_pard'
            self.__last_style_name = self.__style_name
            self.__write_start_wrap(self.__last_style_name)
            self.__write_obj.write(line)
        else:
            self.__write_obj.write(line)

    def __get_style_name(self, line):
        # Track the most recently announced style name.
        if self.__token_info == 'mi<mk<style-name':
            self.__style_name = line[17:-1]

    def group_styles(self):
        """
        Required:
            nothing
        Returns:
            nothing; the original file is rewritten in place
        Logic:
            Feed every line through the state machine, then promote the
            temporary output over the input file.  (The original manual
            readline() loop also fed the final empty string through the
            state machine -- a no-op write, now dropped.)
        """
        self.__initiate_values()
        with open_for_read(self.__file) as read_obj, \
                open_for_write(self.__write_to) as self.__write_obj:
            for line in read_obj:
                self.__token_info = line[:16]
                self.__get_style_name(line)
                action = self.__state_dict.get(self.__state)
                action(line)
        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "group_styles.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)
|
||||
261
ebook_converter/ebooks/rtf2xml/header.py
Normal file
261
ebook_converter/ebooks/rtf2xml/header.py
Normal file
@@ -0,0 +1,261 @@
|
||||
from __future__ import unicode_literals, absolute_import, print_function, division
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import sys, os
|
||||
|
||||
from calibre.ebooks.rtf2xml import copy
|
||||
from calibre.ptempfile import better_mktemp
|
||||
from . import open_for_read, open_for_write
|
||||
|
||||
|
||||
class Header:
|
||||
"""
|
||||
Two public methods are available. The first separates all of the headers
|
||||
and footers from the body and puts them at the bottom of the text, where
|
||||
they are easier to process. The second joins those headers and footers to
|
||||
the proper places in the body.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
in_file ,
|
||||
bug_handler,
|
||||
copy=None,
|
||||
run_level=1,
|
||||
):
|
||||
self.__file = in_file
|
||||
self.__bug_handler = bug_handler
|
||||
self.__copy = copy
|
||||
self.__write_to = better_mktemp()
|
||||
self.__found_a_header = False
|
||||
|
||||
def __in_header_func(self, line):
|
||||
"""
|
||||
Handle all tokens that are part of header
|
||||
"""
|
||||
if self.__cb_count == self.__header_bracket_count:
|
||||
self.__in_header = False
|
||||
self.__write_obj.write(line)
|
||||
self.__write_to_head_obj.write(
|
||||
'mi<mk<head___clo\n'
|
||||
'mi<tg<close_____<header-or-footer\n'
|
||||
'mi<mk<header-clo\n')
|
||||
else:
|
||||
self.__write_to_head_obj.write(line)
|
||||
|
||||
def __found_header(self, line):
|
||||
"""
|
||||
Found a header
|
||||
"""
|
||||
# but this could be header or footer
|
||||
self.__found_a_header = True
|
||||
self.__in_header = True
|
||||
self.__header_count += 1
|
||||
# temporarily set this to zero so I can enter loop
|
||||
self.__cb_count = 0
|
||||
self.__header_bracket_count = self.__ob_count
|
||||
self.__write_obj.write(
|
||||
'mi<mk<header-ind<%04d\n' % self.__header_count)
|
||||
self.__write_to_head_obj.write(
|
||||
'mi<mk<header-ope<%04d\n' % self.__header_count)
|
||||
info = line[6:16]
|
||||
type = self.__head_dict.get(info)
|
||||
if type:
|
||||
self.__write_to_head_obj.write(
|
||||
'mi<tg<open-att__<header-or-footer<type>%s\n' % (type)
|
||||
)
|
||||
else:
|
||||
sys.stderr.write(
|
||||
'module is header\n'
|
||||
'method is __found_header\n'
|
||||
'no dict entry\n'
|
||||
'line is %s' % line)
|
||||
self.__write_to_head_obj.write(
|
||||
'mi<tg<open-att__<header-or-footer<type>none\n'
|
||||
)
|
||||
|
||||
def __default_sep(self, line):
|
||||
"""
|
||||
Handle all tokens that are not header tokens
|
||||
"""
|
||||
if self.__token_info[3:5] == 'hf':
|
||||
self.__found_header(line)
|
||||
self.__write_obj.write(line)
|
||||
|
||||
def __initiate_sep_values(self):
|
||||
"""
|
||||
initiate counters for separate_footnotes method.
|
||||
"""
|
||||
self.__bracket_count=0
|
||||
self.__ob_count = 0
|
||||
self.__cb_count = 0
|
||||
self.__header_bracket_count = 0
|
||||
self.__in_header = False
|
||||
self.__header_count = 0
|
||||
self.__head_dict = {
|
||||
'head-left_' : ('header-left'),
|
||||
'head-right' : ('header-right'),
|
||||
'foot-left_' : ('footer-left'),
|
||||
'foot-right' : ('footer-right'),
|
||||
'head-first' : ('header-first'),
|
||||
'foot-first' : ('footer-first'),
|
||||
'header____' : ('header'),
|
||||
'footer____' : ('footer'),
|
||||
}
|
||||
|
||||
def separate_headers(self):
|
||||
"""
|
||||
Separate all the footnotes in an RTF file and put them at the bottom,
|
||||
where they are easier to process. Each time a footnote is found,
|
||||
print all of its contents to a temporary file. Close both the main and
|
||||
temporary file. Print the footnotes from the temporary file to the
|
||||
bottom of the main file.
|
||||
"""
|
||||
self.__initiate_sep_values()
|
||||
self.__header_holder = better_mktemp()
|
||||
with open_for_read(self.__file) as read_obj:
|
||||
with open_for_write(self.__write_to) as self.__write_obj:
|
||||
with open_for_write(self.__header_holder) as self.__write_to_head_obj:
|
||||
for line in read_obj:
|
||||
self.__token_info = line[:16]
|
||||
# keep track of opening and closing brackets
|
||||
if self.__token_info == 'ob<nu<open-brack':
|
||||
self.__ob_count = line[-5:-1]
|
||||
if self.__token_info == 'cb<nu<clos-brack':
|
||||
self.__cb_count = line[-5:-1]
|
||||
# In the middle of footnote text
|
||||
if self.__in_header:
|
||||
self.__in_header_func(line)
|
||||
# not in the middle of footnote text
|
||||
else:
|
||||
self.__default_sep(line)
|
||||
|
||||
with open_for_read(self.__header_holder) as read_obj:
|
||||
with open_for_write(self.__write_to, append=True) as write_obj:
|
||||
write_obj.write(
|
||||
'mi<mk<header-beg\n')
|
||||
for line in read_obj:
|
||||
write_obj.write(line)
|
||||
write_obj.write(
|
||||
'mi<mk<header-end\n')
|
||||
os.remove(self.__header_holder)
|
||||
|
||||
copy_obj = copy.Copy(bug_handler=self.__bug_handler)
|
||||
if self.__copy:
|
||||
copy_obj.copy_file(self.__write_to, "header_separate.data")
|
||||
copy_obj.rename(self.__write_to, self.__file)
|
||||
os.remove(self.__write_to)
|
||||
|
||||
def update_info(self, file, copy):
|
||||
"""
|
||||
Unused method
|
||||
"""
|
||||
self.__file = file
|
||||
self.__copy = copy
|
||||
|
||||
def __get_head_body_func(self, line):
|
||||
"""
|
||||
Process lines in main body and look for beginning of headers.
|
||||
"""
|
||||
# mi<mk<footnt-end
|
||||
if self.__token_info == 'mi<mk<header-beg':
|
||||
self.__state = 'head'
|
||||
else:
|
||||
self.__write_obj.write(line)
|
||||
|
||||
def __get_head_head_func(self, line):
|
||||
"""
|
||||
Copy headers and footers from bottom of file to a separate, temporary file.
|
||||
"""
|
||||
if self.__token_info == 'mi<mk<header-end':
|
||||
self.__state = 'body'
|
||||
else:
|
||||
self.__write_to_head_obj.write(line)
|
||||
|
||||
def __get_headers(self):
|
||||
"""
|
||||
Private method to remove footnotes from main file. Read one line from
|
||||
the main file at a time. If the state is 'body', call on the private
|
||||
__get_foot_foot_func. Otherwise, call on the __get_foot_body_func.
|
||||
These two functions do the work of separating the footnotes form the
|
||||
body.
|
||||
"""
|
||||
with open_for_read(self.__file) as read_obj:
|
||||
with open_for_write(self.__write_to) as self.__write_obj:
|
||||
with open_for_write(self.__header_holder) as self.__write_to_head_obj:
|
||||
for line in read_obj:
|
||||
self.__token_info = line[:16]
|
||||
if self.__state == 'body':
|
||||
self.__get_head_body_func(line)
|
||||
elif self.__state == 'head':
|
||||
self.__get_head_head_func(line)
|
||||
|
||||
def __get_head_from_temp(self, num):
|
||||
"""
|
||||
Private method for joining headers and footers to body. This method
|
||||
reads from the temporary file until the proper footnote marker is
|
||||
found. It collects all the tokens until the end of the footnote, and
|
||||
returns them as a string.
|
||||
"""
|
||||
look_for = 'mi<mk<header-ope<' + num + '\n'
|
||||
found_head = False
|
||||
string_to_return = ''
|
||||
for line in self.__read_from_head_obj:
|
||||
if found_head:
|
||||
if line == 'mi<mk<header-clo\n':
|
||||
return string_to_return
|
||||
string_to_return += line
|
||||
else:
|
||||
if line == look_for:
|
||||
found_head = True
|
||||
|
||||
def __join_from_temp(self):
    """
    Rejoin headers/footers to the body.

    Read the body-only file (self.__write_to). Whenever a header-index
    marker is found, replace that line with the matching header text
    pulled from the temporary header file via __get_head_from_temp();
    all other lines pass through unchanged. Output goes to
    self.__write_to2.

    NOTE: self.__read_from_head_obj and self.__write_obj are opened
    here without 'with' deliberately -- join_headers() closes them.
    """
    self.__read_from_head_obj = open_for_read(self.__header_holder)
    self.__write_obj = open_for_write(self.__write_to2)
    with open_for_read(self.__write_to) as read_obj:
        for line in read_obj:
            if line[:16] == 'mi<mk<header-ind':
                # chars 17:-1 are the header id carried by the marker
                line = self.__get_head_from_temp(line[17:-1])
            self.__write_obj.write(line)
|
||||
|
||||
def join_headers(self):
    """
    Join the headers/footers from the bottom of the file and put them
    back in their former places.

    Logic:
        __get_headers() splits self.__file into a header-free body
        (self.__write_to) and a header-only temp file
        (self.__header_holder). __join_from_temp() then merges the two
        into self.__write_to2, which becomes the new self.__file.
        All temporary files are removed afterwards.
    """
    if not self.__found_a_header:
        return
    self.__write_to2 = better_mktemp()
    self.__state = 'body'
    self.__get_headers()
    self.__join_from_temp()
    # these two were opened without 'with' in __join_from_temp()
    self.__write_obj.close()
    self.__read_from_head_obj.close()
    copy_obj = copy.Copy(bug_handler=self.__bug_handler)
    if self.__copy:
        # BUG FIX: the joined result lives in self.__write_to2; the
        # original copied and renamed self.__write_to (the
        # header-stripped intermediate), which discarded the re-joined
        # headers and leaked self.__write_to2.
        copy_obj.copy_file(self.__write_to2, "header_join.data")
    copy_obj.rename(self.__write_to2, self.__file)
    os.remove(self.__write_to)
    os.remove(self.__write_to2)
    os.remove(self.__header_holder)
|
||||
227
ebook_converter/ebooks/rtf2xml/headings_to_sections.py
Normal file
227
ebook_converter/ebooks/rtf2xml/headings_to_sections.py
Normal file
@@ -0,0 +1,227 @@
|
||||
from __future__ import unicode_literals, absolute_import, print_function, division
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import os, re
|
||||
from calibre.ebooks.rtf2xml import copy
|
||||
from calibre.ptempfile import better_mktemp
|
||||
from . import open_for_read, open_for_write
|
||||
|
||||
|
||||
class HeadingsToSections:
    """
    Convert heading styles in the token stream into nested <section>
    elements (make_sections is the public entry point).
    """

    def __init__(self,
            in_file,
            bug_handler,
            copy=None,
            run_level=1,
            ):
        """
        Required:
            'in_file' -- the token file to rewrite in place
            'bug_handler' -- exception class raised on internal errors
        Optional:
            'copy' -- whether to make a copy of result for debugging
            'run_level' -- verbosity/strictness level (currently unused
                in this class)
        Returns:
            nothing
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        # scratch file; make_sections renames it over self.__file
        self.__write_to = better_mktemp()
|
||||
|
||||
def __initiate_values(self):
    """
    Set up per-run state.

    Required:
        Nothing
    Return:
        Nothing
    Logic:
        self.__end_list holds the token types that force every open
        section to be closed (the name is inherited from the list
        module this code was adapted from).
    """
    self.__state = "default"
    # stack of currently open section levels (ints 1-9)
    self.__all_sections = []
    self.__chunk = ''
    # dispatch table: current state -> per-line handler
    self.__state_dict = {
        'default': self.__default_func,
        'in_table': self.__in_table_func,
        'in_list': self.__in_list_func,
        'after_body': self.__after_body_func,
    }
    self.__list_depth = 0
    self.__end_list = [
        'mi<mk<body-close',
        # changed 2004-04-26
        # 'mi<mk<par-in-fld',
        'mi<mk<sect-close',  # right before close of section
        'mi<mk<sect-start',  # right before section start
        # this should be sect-close!
        # 'mi<mk<header-beg',
        # 'mi<mk<header-end',
        # 'mi<mk<head___clo',
        #
        # changed 2004-04-26
        # 'mi<mk<fldbk-end_',
        # 'mi<mk<sec-fd-beg',
    ]
    self.__headings = [
        'heading 1', 'heading 2', 'heading 3', 'heading 4',
        'heading 5', 'heading 6', 'heading 7', 'heading 8',
        'heading 9'
    ]
    # __section_num[i] is the running count at depth i; rendered as a
    # dotted string like '2.1.3' by __write_start_section
    self.__section_num = [0]
    self.__id_regex = re.compile(r'\<list-id\>(\d+)')
|
||||
|
||||
def __close_lists(self):
    """
    Close any open lists whose indent is >= the current indent.

    NOTE(review): this method appears to be vestigial, copied from the
    list-building module: it reads self.__left_indent and
    self.__all_lists and calls self.__write_end_item() /
    self.__write_end_list(), none of which are defined anywhere in this
    class, so invoking it would raise AttributeError. No caller is
    visible in this file.
    """
    current_indent = self.__left_indent
    self.__all_lists.reverse()
    num_levels_closed = 0
    for the_dict in self.__all_lists:
        list_indent = the_dict.get('left-indent')
        if current_indent <= list_indent:
            self.__write_end_item()
            self.__write_end_list()
            num_levels_closed += 1
    # drop the closed (innermost) levels, restore original order
    self.__all_lists = self.__all_lists[num_levels_closed:]
    self.__all_lists.reverse()
|
||||
|
||||
def __close_sections(self, current_level):
|
||||
self.__all_sections.reverse()
|
||||
num_levels_closed = 0
|
||||
for level in self.__all_sections:
|
||||
if current_level <= level:
|
||||
self.__write_end_section()
|
||||
num_levels_closed += 1
|
||||
self.__all_sections = self.__all_sections[num_levels_closed:]
|
||||
self.__all_sections.reverse()
|
||||
|
||||
def __write_start_section(self, current_level, name):
|
||||
section_num = ''
|
||||
for the_num in self.__section_num:
|
||||
section_num += '%s.' % the_num
|
||||
section_num = section_num[:-1]
|
||||
num_in_level = len(self.__all_sections)
|
||||
num_in_level = self.__section_num[num_in_level]
|
||||
level = len(self.__all_sections)
|
||||
self.__write_obj.write(
|
||||
'mi<mk<sect-start\n'
|
||||
)
|
||||
self.__write_obj.write(
|
||||
'mi<tg<open-att__<section<num>%s<num-in-level>%s<level>%s'
|
||||
'<type>%s\n'
|
||||
% (section_num, num_in_level, level, name)
|
||||
)
|
||||
|
||||
def __write_end_section(self):
|
||||
self.__write_obj.write('mi<mk<sect-close\n')
|
||||
self.__write_obj.write('mi<tg<close_____<section\n')
|
||||
|
||||
def __default_func(self, line):
    """
    Handle one line while in the 'default' state.

    Required:
        self, line
    Returns:
        Nothing
    Logic:
        Track RTF section starts (resetting the dotted numbering to its
        top level), switch state on table/list starts, close all open
        <section>s on end-of-scope tokens, and open a new <section>
        whenever a paragraph style named 'heading N' is seen. Every
        line is passed through to the output unchanged.
    """
    if self.__token_info == 'mi<mk<sect-start':
        # new RTF section: bump the top counter, drop deeper counters
        self.__section_num[0] += 1
        self.__section_num = self.__section_num[0:1]
    if self.__token_info == 'mi<mk<tabl-start':
        self.__state = 'in_table'
    elif self.__token_info == 'mi<mk<list_start':
        self.__state = 'in_list'
        self.__list_depth += 1
    elif self.__token_info in self.__end_list:
        # 0 is <= every heading level, so this closes all sections
        self.__close_sections(0)
    elif self.__token_info == 'mi<mk<style-name':
        # payload after the 16-byte token and '<' is the style name
        name = line[17:-1]
        if name in self.__headings:
            self.__handle_heading(name)
    if self.__token_info == 'mi<mk<body-close':
        self.__state = 'after_body'
    self.__write_obj.write(line)
|
||||
|
||||
def __handle_heading(self, name):
    """
    Open a <section> for heading style *name* ('heading 1'..'heading 9').

    First close any sections at the same or deeper level, then push the
    new level and update the dotted section numbering so a counter
    exists for this depth before writing the start tag.
    """
    num = self.__headings.index(name) + 1  # heading level, 1-based
    self.__close_sections(num)
    self.__all_sections.append(num)
    # numbering needs one counter per open section plus the top level
    level_depth = len(self.__all_sections) + 1
    self.__section_num = self.__section_num[:level_depth]
    if len(self.__section_num) < level_depth:
        # first heading at this depth within the current parent
        self.__section_num.append(1)
    else:
        self.__section_num[-1] += 1
    self.__write_start_section(num, name)
|
||||
|
||||
def __in_table_func(self, line):
|
||||
if self.__token_info == 'mi<mk<table-end_':
|
||||
self.__state = 'default'
|
||||
self.__write_obj.write(line)
|
||||
|
||||
def __in_list_func(self, line):
|
||||
if self.__token_info == 'mi<mk<list_close':
|
||||
self.__list_depth -= 1
|
||||
elif self.__token_info == 'mi<mk<list_start':
|
||||
self.__list_depth += 1
|
||||
if self.__list_depth == 0:
|
||||
self.__state = 'default'
|
||||
self.__write_obj.write(line)
|
||||
|
||||
def __after_body_func(self, line):
|
||||
self.__write_obj.write(line)
|
||||
|
||||
def make_sections(self):
    """
    Public entry point: rewrite self.__file in place with
    heading-derived <section> tags inserted.

    Required:
        nothing
    Returns:
        nothing; the original file is changed
    Logic:
        Stream the token file line by line, dispatching each line to
        the handler for the current state, then move the result over
        the original file (keeping a debug copy when requested).
    """
    self.__initiate_values()
    # Iterate the file directly instead of the old readline()/while-
    # truthy loop, which dispatched one spurious empty "line" at EOF;
    # 'with' also guarantees the handles close if a handler raises.
    with open_for_read(self.__file) as read_obj:
        with open_for_write(self.__write_to) as self.__write_obj:
            for line in read_obj:
                self.__token_info = line[:16]
                action = self.__state_dict.get(self.__state)
                action(line)
    copy_obj = copy.Copy(bug_handler=self.__bug_handler)
    if self.__copy:
        copy_obj.copy_file(self.__write_to, "sections_to_headings.data")
    copy_obj.rename(self.__write_to, self.__file)
    os.remove(self.__write_to)
|
||||
589
ebook_converter/ebooks/rtf2xml/hex_2_utf8.py
Normal file
589
ebook_converter/ebooks/rtf2xml/hex_2_utf8.py
Normal file
@@ -0,0 +1,589 @@
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import sys, os, io
|
||||
|
||||
from calibre.ebooks.rtf2xml import get_char_map, copy
|
||||
from calibre.ebooks.rtf2xml.char_set import char_set
|
||||
from calibre.ptempfile import better_mktemp
|
||||
from polyglot.builtins import unicode_type
|
||||
|
||||
from . import open_for_read, open_for_write
|
||||
|
||||
|
||||
class Hex2Utf8:
    """
    Convert Microsoft hexidecimal numbers to utf-8
    """

    def __init__(self,
            in_file,
            area_to_convert,
            char_file,
            default_char_map,
            bug_handler,
            invalid_rtf_handler,
            copy=None,
            temp_dir=None,
            symbol=None,
            wingdings=None,
            caps=None,
            convert_caps=None,
            dingbats=None,
            ):
        """
        Required:
            'in_file'
            'area_to_convert'--the area of file to convert
            'char_file'--the file containing the character mappings
            'default_char_map'--name of default character map
            'bug_handler'--exception class raised on internal errors
            'invalid_rtf_handler'--exception class for invalid RTF
        Optional:
            'copy'-- whether to make a copy of result for debugging
            'temp_dir' --where to output temporary results (default is
            directory from which the script is run.)
            'symbol'--whether to load the symbol character map
            'wingdings'--whether to load the wingdings character map
            'caps'--whether to load the caps character map
            'convert_caps'--whether to convert caps to utf-8 (note:
            ignored here; conversion flags start disabled and are only
            honoured by update_values)
        Returns:
            nothing
        """
        self.__file = in_file
        self.__copy = copy
        # BUG FIX: bind the handlers before they can be needed -- the
        # original assigned self.__bug_handler only after the
        # area_to_convert check below, so the 'raise' there died with
        # an AttributeError instead of the intended error.
        self.__bug_handler = bug_handler
        self.__invalid_rtf_handler = invalid_rtf_handler
        if area_to_convert not in ('preamble', 'body'):
            msg = (
                'Developer error! Wrong flag.\n'
                'in module "hex_2_utf8.py\n'
                '"area_to_convert" must be "body" or "preamble"\n'
            )
            raise self.__bug_handler(msg)
        self.__char_file = char_file
        self.__area_to_convert = area_to_convert
        self.__default_char_map = default_char_map
        self.__symbol = symbol
        self.__wingdings = wingdings
        self.__dingbats = dingbats
        self.__caps = caps
        # conversion flags start off; update_values() turns them on
        self.__convert_caps = 0
        self.__convert_symbol = 0
        self.__convert_wingdings = 0
        self.__convert_zapf = 0
        self.__run_level = run_level
        self.__write_to = better_mktemp()
|
||||
|
||||
def update_values(self,
        file,
        area_to_convert,
        char_file,
        convert_caps,
        convert_symbol,
        convert_wingdings,
        convert_zapf,
        copy=None,
        temp_dir=None,
        symbol=None,
        wingdings=None,
        caps=None,
        dingbats=None,
        ):
    """
    Re-aim an existing instance at another file/area.

    Required:
        'file' -- the file to convert
        'area_to_convert' -- 'preamble' or 'body'
        'char_file' -- the file containing the character mappings
            (NOTE(review): accepted but never stored here -- the map
            built by __initiate_values comes from char_set instead;
            confirm this parameter is intentionally unused)
        'convert_caps'/'convert_symbol'/'convert_wingdings'/
        'convert_zapf' -- conversion flags (honoured here, unlike in
            __init__ where they start disabled)
    Optional:
        'copy' -- whether to make a copy of result for debugging
        'temp_dir' -- where to output temporary results
        'symbol'/'wingdings'/'caps'/'dingbats' -- whether to load
            those character maps
    Returns:
        nothing
    """
    self.__file=file
    self.__copy = copy
    if area_to_convert not in ('preamble', 'body'):
        msg = (
            'in module "hex_2_utf8.py\n'
            '"area_to_convert" must be "body" or "preamble"\n'
        )
        raise self.__bug_handler(msg)
    self.__area_to_convert = area_to_convert
    self.__symbol = symbol
    self.__wingdings = wingdings
    self.__dingbats = dingbats
    self.__caps = caps
    self.__convert_caps = convert_caps
    self.__convert_symbol = convert_symbol
    self.__convert_wingdings = convert_wingdings
    self.__convert_zapf = convert_zapf
    # new!
    # no longer try to convert these
    # self.__convert_symbol = 0
    # self.__convert_wingdings = 0
    # self.__convert_zapf = 0
|
||||
|
||||
def __initiate_values(self):
    """
    Build the character-lookup dictionaries and dispatch tables.

    Required:
        Nothing
    Set values, including those for the dictionaries.
    The file that contains the maps is broken down into many different
    sets. For example, for the Symbol font, there is the standard part
    for hexidecimal numbers, and the part for Microsoft characters.
    Read each part in, and then combine them.
    """
    # the default encoding system, the lower map for characters 0 through
    # 128, and the encoding system for Microsoft characters.
    # New on 2004-05-8: the self.__char_map is not in directory with other
    # modules
    self.__char_file = io.StringIO(char_set)
    char_map_obj = get_char_map.GetCharMap(
        char_file=self.__char_file,
        bug_handler=self.__bug_handler,
    )
    up_128_dict = char_map_obj.get_char_map(map=self.__default_char_map)
    bt_128_dict = char_map_obj.get_char_map(map='bottom_128')
    ms_standard_dict = char_map_obj.get_char_map(map='ms_standard')
    # merged default map: upper 128 + lower 128 + MS specials
    self.__def_dict = {}
    self.__def_dict.update(up_128_dict)
    self.__def_dict.update(bt_128_dict)
    self.__def_dict.update(ms_standard_dict)
    self.__current_dict = self.__def_dict
    self.__current_dict_name = 'default'
    self.__in_caps = 0
    self.__special_fonts_found = 0
    if self.__symbol:
        symbol_base_dict = char_map_obj.get_char_map(map='SYMBOL')
        ms_symbol_dict = char_map_obj.get_char_map(map='ms_symbol')
        self.__symbol_dict = {}
        self.__symbol_dict.update(symbol_base_dict)
        self.__symbol_dict.update(ms_symbol_dict)
    if self.__wingdings:
        wingdings_base_dict = char_map_obj.get_char_map(map='wingdings')
        ms_wingdings_dict = char_map_obj.get_char_map(map='ms_wingdings')
        self.__wingdings_dict = {}
        self.__wingdings_dict.update(wingdings_base_dict)
        self.__wingdings_dict.update(ms_wingdings_dict)
    if self.__dingbats:
        dingbats_base_dict = char_map_obj.get_char_map(map='dingbats')
        ms_dingbats_dict = char_map_obj.get_char_map(map='ms_dingbats')
        self.__dingbats_dict = {}
        self.__dingbats_dict.update(dingbats_base_dict)
        self.__dingbats_dict.update(ms_dingbats_dict)
    # load dictionary for caps, and make a string for the replacement
    self.__caps_uni_dict = char_map_obj.get_char_map(map='caps_uni')
    # # print self.__caps_uni_dict
    # don't think I'll need this
    # keys = self.__caps_uni_dict.keys()
    # self.__caps_uni_replace = '|'.join(keys)
    # dispatch tables: state/token -> handler
    self.__preamble_state_dict = {
        'preamble': self.__preamble_func,
        'body': self.__body_func,
        'mi<mk<body-open_': self.__found_body_func,
        'tx<hx<__________': self.__hex_text_func,
    }
    self.__body_state_dict = {
        'preamble': self.__preamble_for_body_func,
        'body': self.__body_for_body_func,
    }
    self.__in_body_dict = {
        'mi<mk<body-open_': self.__found_body_func,
        'tx<ut<__________': self.__utf_to_caps_func,
        'tx<hx<__________': self.__hex_text_func,
        'tx<mc<__________': self.__hex_text_func,
        'tx<nu<__________': self.__text_func,
        'mi<mk<font______': self.__start_font_func,
        'mi<mk<caps______': self.__start_caps_func,
        'mi<mk<font-end__': self.__end_font_func,
        'mi<mk<caps-end__': self.__end_caps_func,
    }
    # stack of caps flags ('true'/'false'); index -1 is the active one
    self.__caps_list = ['false']
    # stack of currently open font faces
    self.__font_list = ['not-defined']
|
||||
|
||||
def __hex_text_func(self, line):
    """
    Required:
        'line' -- the line
    Logic:
        get the hex_num and look it up in the active dictionary. If the
        token is in the dictionary, then check if the value starts with
        a "&". If it does, then tag the result as utf text. Otherwise,
        tag it as normal text. In both cases the text is upper-cased
        when caps conversion applies and the active map is not one of
        the special fonts.
        If the hex_num is not in the dictionary, an udef_symbol element
        is written (and at run levels above 4 this is a fatal error).
    """
    hex_num = line[17:-1]
    converted = self.__current_dict.get(hex_num)
    if converted is not None:
        # tag as utf-8
        if converted[0:1] == "&":
            font = self.__current_dict_name
            if self.__convert_caps\
            and self.__caps_list[-1] == 'true'\
            and font not in ('Symbol', 'Wingdings', 'Zapf Dingbats'):
                converted = self.__utf_token_to_caps_func(converted)
            self.__write_obj.write(
                'tx<ut<__________<%s\n' % converted
            )
        # tag as normal text
        else:
            font = self.__current_dict_name
            if self.__convert_caps\
            and self.__caps_list[-1] == 'true'\
            and font not in ('Symbol', 'Wingdings', 'Zapf Dingbats'):
                converted = converted.upper()
            self.__write_obj.write(
                'tx<nu<__________<%s\n' % converted
            )
    # error
    else:
        # keys look like "'ff"; strip the quote to parse the number
        token = hex_num.replace("'", '')
        the_num = 0
        if token:
            the_num = int(token, 16)
        if the_num > 10:
            self.__write_obj.write('mi<tg<empty-att_<udef_symbol<num>%s<description>not-in-table\n' %
                    hex_num)
            if self.__run_level > 4:
                # msg = 'no dictionary entry for %s\n'
                # msg += 'the hexidecimal num is "%s"\n' % (hex_num)
                # msg += 'dictionary is %s\n' % self.__current_dict_name
                msg = 'Character "&#x%s;" does not appear to be valid (or is a control character)\n' % token
                raise self.__bug_handler(msg)
|
||||
|
||||
def __found_body_func(self, line):
|
||||
self.__state = 'body'
|
||||
self.__write_obj.write(line)
|
||||
|
||||
def __body_func(self, line):
|
||||
"""
|
||||
When parsing preamble
|
||||
"""
|
||||
self.__write_obj.write(line)
|
||||
|
||||
def __preamble_func(self, line):
|
||||
action = self.__preamble_state_dict.get(self.__token_info)
|
||||
if action is not None:
|
||||
action(line)
|
||||
else:
|
||||
self.__write_obj.write(line)
|
||||
|
||||
def __convert_preamble(self):
    """
    Convert the preamble part of the token file in place: stream lines
    through the preamble dispatch table, then move the result over
    self.__file (keeping a debug copy when requested).
    """
    self.__state = 'preamble'
    with open_for_write(self.__write_to) as self.__write_obj:
        with open_for_read(self.__file) as read_obj:
            for line in read_obj:
                self.__token_info = line[:16]
                action = self.__preamble_state_dict.get(self.__state)
                if action is None:
                    # BUG FIX: file.write() takes exactly one string;
                    # the original passed self.__state as a second
                    # argument, raising TypeError on this error path.
                    sys.stderr.write(
                        'error no state found in hex_2_utf8 %s\n'
                        % self.__state)
                action(line)
    copy_obj = copy.Copy(bug_handler=self.__bug_handler)
    if self.__copy:
        copy_obj.copy_file(self.__write_to, "preamble_utf_convert.data")
    copy_obj.rename(self.__write_to, self.__file)
    os.remove(self.__write_to)
|
||||
|
||||
def __preamble_for_body_func(self, line):
    """
    Required:
        line -- line to parse
    Returns:
        nothing
    Logic:
        Used while converting the body but still inside the preamble:
        simply wait for the body-open marker, echoing everything.

    NOTE(review): when the marker arrives, __found_body_func writes the
    line and then the write below repeats it, so the body-open marker
    appears to be emitted twice on this path -- confirm downstream
    stages tolerate the duplicate before changing it.
    """
    if self.__token_info == 'mi<mk<body-open_':
        self.__found_body_func(line)
    self.__write_obj.write(line)
|
||||
|
||||
def __body_for_body_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
line -- line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Used when parsing the body.
|
||||
"""
|
||||
action = self.__in_body_dict.get(self.__token_info)
|
||||
if action is not None:
|
||||
action(line)
|
||||
else:
|
||||
self.__write_obj.write(line)
|
||||
|
||||
def __start_font_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
line -- line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
add font face to font_list
|
||||
"""
|
||||
face = line[17:-1]
|
||||
self.__font_list.append(face)
|
||||
if face == 'Symbol' and self.__convert_symbol:
|
||||
self.__current_dict_name = 'Symbol'
|
||||
self.__current_dict = self.__symbol_dict
|
||||
elif face == 'Wingdings' and self.__convert_wingdings:
|
||||
self.__current_dict_name = 'Wingdings'
|
||||
self.__current_dict = self.__wingdings_dict
|
||||
elif face == 'Zapf Dingbats' and self.__convert_zapf:
|
||||
self.__current_dict_name = 'Zapf Dingbats'
|
||||
self.__current_dict = self.__dingbats_dict
|
||||
else:
|
||||
self.__current_dict_name = 'default'
|
||||
self.__current_dict = self.__def_dict
|
||||
|
||||
def __end_font_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
line -- line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
pop font_list
|
||||
"""
|
||||
if len(self.__font_list) > 1:
|
||||
self.__font_list.pop()
|
||||
else:
|
||||
sys.stderr.write('module is hex_2_utf8\n')
|
||||
sys.stderr.write('method is end_font_func\n')
|
||||
sys.stderr.write('self.__font_list should be greater than one?\n')
|
||||
face = self.__font_list[-1]
|
||||
if face == 'Symbol' and self.__convert_symbol:
|
||||
self.__current_dict_name = 'Symbol'
|
||||
self.__current_dict = self.__symbol_dict
|
||||
elif face == 'Wingdings' and self.__convert_wingdings:
|
||||
self.__current_dict_name = 'Wingdings'
|
||||
self.__current_dict = self.__wingdings_dict
|
||||
elif face == 'Zapf Dingbats' and self.__convert_zapf:
|
||||
self.__current_dict_name = 'Zapf Dingbats'
|
||||
self.__current_dict = self.__dingbats_dict
|
||||
else:
|
||||
self.__current_dict_name = 'default'
|
||||
self.__current_dict = self.__def_dict
|
||||
|
||||
def __start_special_font_func_old(self, line):
    """
    Required:
        line -- line
    Returns;
        nothing
    Logic:
        change the dictionary to use in conversion

    NOTE(review): vestigial -- the '_old' suffix and the dispatch
    tables in __initiate_values show this is no longer wired up. It
    also calls .append on self.__current_dict, which is a dict in the
    current design (it was evidently a stack of dicts once), so
    invoking it would raise AttributeError.
    """
    # for error checking
    if self.__token_info == 'mi<mk<font-symbo':
        self.__current_dict.append(self.__symbol_dict)
        self.__special_fonts_found += 1
        self.__current_dict_name = 'Symbol'
    elif self.__token_info == 'mi<mk<font-wingd':
        self.__special_fonts_found += 1
        self.__current_dict.append(self.__wingdings_dict)
        self.__current_dict_name = 'Wingdings'
    elif self.__token_info == 'mi<mk<font-dingb':
        self.__current_dict.append(self.__dingbats_dict)
        self.__special_fonts_found += 1
        self.__current_dict_name = 'Zapf Dingbats'
|
||||
|
||||
def __end_special_font_func(self, line):
    """
    Required:
        line --line to parse
    Returns:
        nothing
    Logic:
        pop the last dictionary, which should be a special font

    NOTE(review): vestigial counterpart of
    __start_special_font_func_old -- it is absent from the dispatch
    tables, calls self.__current_dict.pop() with no key (a TypeError
    on a dict), and assigns self.__dict_name where the rest of the
    class uses self.__current_dict_name.
    """
    if len(self.__current_dict) < 2:
        sys.stderr.write('module is hex_2_utf 8\n')
        sys.stderr.write('method is __end_special_font_func\n')
        sys.stderr.write('less than two dictionaries --can\'t pop\n')
        self.__special_fonts_found -= 1
    else:
        self.__current_dict.pop()
        self.__special_fonts_found -= 1
        self.__dict_name = 'default'
|
||||
|
||||
def __start_caps_func_old(self, line):
    """
    Required:
        line -- line to parse
    Returns:
        nothing
    Logic:
        A marker that marks the start of caps has been found. Set
        self.__in_caps to 1

    NOTE(review): legacy version; the dispatch tables route caps
    markers to __start_caps_func, which also maintains the caps stack.
    """
    self.__in_caps = 1
|
||||
|
||||
def __start_caps_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
line -- line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
A marker that marks the start of caps has been found. Set
|
||||
self.__in_caps to 1
|
||||
"""
|
||||
self.__in_caps = 1
|
||||
value = line[17:-1]
|
||||
self.__caps_list.append(value)
|
||||
|
||||
def __end_caps_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
line -- line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
A marker that marks the end of caps has been found.
|
||||
set self.__in_caps to 0
|
||||
"""
|
||||
if len(self.__caps_list) > 1:
|
||||
self.__caps_list.pop()
|
||||
else:
|
||||
sys.stderr.write('Module is hex_2_utf8\n'
|
||||
'method is __end_caps_func\n'
|
||||
'caps list should be more than one?\n') # self.__in_caps not set
|
||||
|
||||
def __text_func(self, line):
    """
    Required:
        line -- line to parse
    Returns:
        nothing
    Logic:
        When a special font (Symbol/Wingdings/Zapf Dingbats) is active,
        translate each character through the active map by its hex
        code; otherwise upper-case the text if caps conversion applies,
        then write the (possibly converted) text back out.
    """
    text = line[17:-1]
    # print line
    if self.__current_dict_name in ('Symbol', 'Wingdings', 'Zapf Dingbats'):
        the_string = ''
        for letter in text:
            # build a quote-prefixed upper-case hex key (e.g. 'B7) to
            # match the key format used by the character maps --
            # presumably; TODO confirm against get_char_map output
            hex_num = hex(ord(letter))
            hex_num = unicode_type(hex_num)
            hex_num = hex_num.upper()
            hex_num = hex_num[2:]
            hex_num = '\'%s' % hex_num
            converted = self.__current_dict.get(hex_num)
            if converted is None:
                sys.stderr.write('module is hex_2_ut8\nmethod is __text_func\n')
                sys.stderr.write('no hex value for "%s"\n' % hex_num)
            else:
                the_string += converted
        self.__write_obj.write('tx<nu<__________<%s\n' % the_string)
        # print the_string
    else:
        if self.__caps_list[-1] == 'true' \
            and self.__convert_caps\
            and self.__current_dict_name not in ('Symbol', 'Wingdings', 'Zapf Dingbats'):
            text = text.upper()
        self.__write_obj.write('tx<nu<__________<%s\n' % text)
|
||||
|
||||
def __utf_to_caps_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
line -- line to parse
|
||||
returns
|
||||
nothing
|
||||
Logic
|
||||
Get the text, and use another method to convert
|
||||
"""
|
||||
utf_text = line[17:-1]
|
||||
if self.__caps_list[-1] == 'true' and self.__convert_caps:
|
||||
# utf_text = utf_text.upper()
|
||||
utf_text = self.__utf_token_to_caps_func(utf_text)
|
||||
self.__write_obj.write('tx<ut<__________<%s\n' % utf_text)
|
||||
|
||||
def __utf_token_to_caps_func(self, char_entity):
|
||||
"""
|
||||
Required:
|
||||
utf_text -- such as &xxx;
|
||||
Returns:
|
||||
token converted to the capital equivalent
|
||||
Logic:
|
||||
RTF often stores text in the improper values. For example, a
|
||||
capital umlaut o (?), is stores as ?. This function swaps the
|
||||
case by looking up the value in a dictionary.
|
||||
"""
|
||||
hex_num = char_entity[3:]
|
||||
length = len(hex_num)
|
||||
if length == 3:
|
||||
hex_num = '00%s' % hex_num
|
||||
elif length == 4:
|
||||
hex_num = '0%s' % hex_num
|
||||
new_char_entity = '&#x%s' % hex_num
|
||||
converted = self.__caps_uni_dict.get(new_char_entity)
|
||||
if not converted:
|
||||
# bullets and other entities dont' have capital equivelents
|
||||
return char_entity
|
||||
else:
|
||||
return converted
|
||||
|
||||
def __convert_body(self):
    """
    Convert the body part of the token file in place: stream lines
    through the body dispatch table, then move the result over
    self.__file (keeping a debug copy when requested).
    """
    self.__state = 'body'
    with open_for_read(self.__file) as read_obj:
        with open_for_write(self.__write_to) as self.__write_obj:
            for line in read_obj:
                self.__token_info = line[:16]
                action = self.__body_state_dict.get(self.__state)
                if action is None:
                    # BUG FIX: file.write() takes exactly one string;
                    # the original passed self.__state as a second
                    # argument, raising TypeError on this error path.
                    sys.stderr.write(
                        'error no state found in hex_2_utf8 %s\n'
                        % self.__state)
                action(line)
    copy_obj = copy.Copy(bug_handler=self.__bug_handler)
    if self.__copy:
        copy_obj.copy_file(self.__write_to, "body_utf_convert.data")
    copy_obj.rename(self.__write_to, self.__file)
    os.remove(self.__write_to)
|
||||
|
||||
def convert_hex_2_utf8(self):
    """
    Public entry point: build the lookup tables, then convert whichever
    area ('preamble' or 'body') this instance was configured for.
    """
    self.__initiate_values()
    converter = (self.__convert_preamble
                 if self.__area_to_convert == 'preamble'
                 else self.__convert_body)
    converter()
|
||||
|
||||
|
||||
"""
|
||||
how to swap case for non-capitals
|
||||
my_string.swapcase()
|
||||
An example of how to use a hash for the caps function
|
||||
(but I shouldn't need this, since utf text is separate
|
||||
from regular text?)
|
||||
sub_dict = {
|
||||
"а" : "some other value"
|
||||
}
|
||||
def my_sub_func(matchobj):
|
||||
info = matchobj.group(0)
|
||||
value = sub_dict.get(info)
|
||||
return value
|
||||
return "f"
|
||||
line = "а more text"
|
||||
reg_exp = re.compile(r'(?P<name>а|б)')
|
||||
line2 = re.sub(reg_exp, my_sub_func, line)
|
||||
print line2
|
||||
"""
|
||||
285
ebook_converter/ebooks/rtf2xml/info.py
Normal file
285
ebook_converter/ebooks/rtf2xml/info.py
Normal file
@@ -0,0 +1,285 @@
|
||||
from __future__ import unicode_literals, absolute_import, print_function, division
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import sys, os, re
|
||||
|
||||
from calibre.ebooks.rtf2xml import copy
|
||||
from calibre.ptempfile import better_mktemp
|
||||
from . import open_for_read, open_for_write
|
||||
|
||||
|
||||
class Info:
    """
    Make tags for document-information
    """

    def __init__(self,
            in_file,
            bug_handler,
            copy=None,
            run_level=1,
            ):
        """
        Required:
            'in_file'--file to parse
            'bug_handler'--exception class raised on internal errors
        Optional:
            'copy'-- whether to make a copy of result for debugging
            'run_level'--verbosity/strictness level
        Returns:
            nothing
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__run_level = run_level
        # scratch file; the conversion pass renames it over self.__file
        self.__write_to = better_mktemp()
|
||||
|
||||
def __initiate_values(self):
|
||||
"""
|
||||
Initiate all values.
|
||||
"""
|
||||
self.__text_string = ''
|
||||
self.__state = 'before_info_table'
|
||||
self.rmspace = re.compile(r'\s+')
|
||||
self.__state_dict = {
|
||||
'before_info_table': self.__before_info_table_func,
|
||||
'after_info_table': self.__after_info_table_func,
|
||||
'in_info_table' : self.__in_info_table_func,
|
||||
'collect_text' : self.__collect_text_func,
|
||||
'collect_tokens' : self.__collect_tokens_func,
|
||||
}
|
||||
self.__info_table_dict = {
|
||||
'cw<di<title_____' : (self.__found_tag_with_text_func, 'title'),
|
||||
'cw<di<author____' : (self.__found_tag_with_text_func, 'author'),
|
||||
'cw<di<operator__' : (self.__found_tag_with_text_func, 'operator'),
|
||||
'cw<di<manager___' : (self.__found_tag_with_text_func, 'manager'),
|
||||
'cw<di<company___' : (self.__found_tag_with_text_func, 'company'),
|
||||
'cw<di<keywords__' : (self.__found_tag_with_text_func, 'keywords'),
|
||||
'cw<di<category__' : (self.__found_tag_with_text_func, 'category'),
|
||||
'cw<di<doc-notes_' : (self.__found_tag_with_text_func, 'doc-notes'),
|
||||
'cw<di<subject___' : (self.__found_tag_with_text_func, 'subject'),
|
||||
'cw<di<linkbase__' : (self.__found_tag_with_text_func, 'hyperlink-base'),
|
||||
|
||||
'cw<di<create-tim' : (self.__found_tag_with_tokens_func, 'creation-time'),
|
||||
'cw<di<revis-time' : (self.__found_tag_with_tokens_func, 'revision-time'),
|
||||
'cw<di<print-time' : (self.__found_tag_with_tokens_func, 'printing-time'),
|
||||
'cw<di<backuptime' : (self.__found_tag_with_tokens_func, 'backup-time'),
|
||||
|
||||
'cw<di<num-of-wor' : (self.__single_field_func, 'number-of-words'),
|
||||
'cw<di<num-of-chr' : (self.__single_field_func, 'number-of-characters'),
|
||||
'cw<di<numofchrws' : (self.__single_field_func, 'number-of-characters-without-space'),
|
||||
'cw<di<num-of-pag' : (self.__single_field_func, 'number-of-pages'),
|
||||
'cw<di<version___' : (self.__single_field_func, 'version'),
|
||||
'cw<di<edit-time_' : (self.__single_field_func, 'editing-time'),
|
||||
'cw<di<intern-ver' : (self.__single_field_func, 'internal-version-number'),
|
||||
'cw<di<internalID' : (self.__single_field_func, 'internal-id-number'),
|
||||
}
|
||||
self.__token_dict = {
|
||||
'year______' : 'year',
|
||||
'month_____' : 'month',
|
||||
'day_______' : 'day',
|
||||
'minute____' : 'minute',
|
||||
'second____' : 'second',
|
||||
'revis-time' : 'revision-time',
|
||||
'create-tim' : 'creation-time',
|
||||
'edit-time_' : 'editing-time',
|
||||
'print-time' : 'printing-time',
|
||||
'backuptime' : 'backup-time',
|
||||
'num-of-wor' : 'number-of-words',
|
||||
'num-of-chr' : 'number-of-characters',
|
||||
'numofchrws' : 'number-of-characters-without-space',
|
||||
'num-of-pag' : 'number-of-pages',
|
||||
'version___' : 'version',
|
||||
'intern-ver' : 'internal-version-number',
|
||||
'internalID' : 'internal-id-number',
|
||||
}
|
||||
|
||||
def __before_info_table_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
line -- the line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Check for the beginning of the informatin table. When found, set
|
||||
the state to the information table. Always write the line.
|
||||
"""
|
||||
if self.__token_info == 'mi<mk<doc-in-beg':
|
||||
self.__state = 'in_info_table'
|
||||
self.__write_obj.write(line)
|
||||
|
||||
def __in_info_table_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line -- line to parse
|
||||
Returns:
|
||||
nothing.
|
||||
Logic:
|
||||
Check for the end of information. If not found, check if the
|
||||
token has a special value in the info table dictionay. If it
|
||||
does, execute that function.
|
||||
Otherwise, output the line to the file.
|
||||
"""
|
||||
if self.__token_info == 'mi<mk<doc-in-end':
|
||||
self.__state = 'after_info_table'
|
||||
else:
|
||||
action, tag = self.__info_table_dict.get(self.__token_info, (None, None))
|
||||
if action:
|
||||
action(line, tag)
|
||||
else:
|
||||
self.__write_obj.write(line)
|
||||
|
||||
def __found_tag_with_text_func(self, line, tag):
|
||||
"""
|
||||
Requires:
|
||||
line -- line to parse
|
||||
tag --what kind of line
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
This function marks the beginning of informatin fields that have
|
||||
text that must be collected. Set the type of information field
|
||||
with the tag option. Set the state to collecting text
|
||||
"""
|
||||
self.__tag = tag
|
||||
self.__state = 'collect_text'
|
||||
|
||||
def __collect_text_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line -- line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
If the end of the information field is found, write the text
|
||||
string to the file.
|
||||
Otherwise, if the line contains text, add it to the text string.
|
||||
"""
|
||||
if self.__token_info == 'mi<mk<docinf-end':
|
||||
self.__state = 'in_info_table'
|
||||
# Don't print empty tags
|
||||
if len(self.rmspace.sub('',self.__text_string)):
|
||||
self.__write_obj.write(
|
||||
'mi<tg<open______<%s\n'
|
||||
'tx<nu<__________<%s\n'
|
||||
'mi<tg<close_____<%s\n' % (self.__tag, self.__text_string, self.__tag)
|
||||
)
|
||||
self.__text_string = ''
|
||||
elif line[0:2] == 'tx':
|
||||
self.__text_string += line[17:-1]
|
||||
|
||||
def __found_tag_with_tokens_func(self, line, tag):
|
||||
"""
|
||||
Requires:
|
||||
line -- line to parse
|
||||
tag -- type of field
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Some fields have a series of tokens (cw<di<year______<nu<2003)
|
||||
that must be parsed as attributes for the element.
|
||||
Set the state to collect tokesn, and set the text string to
|
||||
start an empty element with attributes.
|
||||
"""
|
||||
self.__state = 'collect_tokens'
|
||||
self.__text_string = 'mi<tg<empty-att_<%s' % tag
|
||||
# mi<tg<empty-att_<page-definition<margin>33\n
|
||||
|
||||
def __collect_tokens_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line -- line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
This function collects all the token information and adds it to
|
||||
the text string until the end of the field is found.
|
||||
First check of the end of the information field. If found, write
|
||||
the text string to the file.
|
||||
If not found, get the relevant information from the text string.
|
||||
This information cannot be directly added to the text string,
|
||||
because it exists in abbreviated form. (num-of-wor)
|
||||
I want to check this information in a dictionary to convert it
|
||||
to a longer, readable form. If the key does not exist in the
|
||||
dictionary, print out an error message. Otherise add the value
|
||||
to the text string.
|
||||
(num-of-wor => number-of-words)
|
||||
"""
|
||||
# cw<di<year______<nu<2003
|
||||
if self.__token_info == 'mi<mk<docinf-end':
|
||||
self.__state = 'in_info_table'
|
||||
self.__write_obj.write(
|
||||
'%s\n' % self.__text_string
|
||||
)
|
||||
self.__text_string = ''
|
||||
else:
|
||||
att = line[6:16]
|
||||
value = line[20:-1]
|
||||
att_changed = self.__token_dict.get(att)
|
||||
if att_changed is None:
|
||||
if self.__run_level > 3:
|
||||
msg = 'No dictionary match for %s\n' % att
|
||||
raise self.__bug_handler(msg)
|
||||
else:
|
||||
self.__text_string += '<%s>%s' % (att_changed, value)
|
||||
|
||||
def __single_field_func(self, line, tag):
|
||||
value = line[20:-1]
|
||||
self.__write_obj.write(
|
||||
'mi<tg<empty-att_<%s<%s>%s\n' % (tag, tag, value)
|
||||
)
|
||||
|
||||
def __after_info_table_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line --line to write to file
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
After the end of the information table, simple write the line to
|
||||
the file.
|
||||
"""
|
||||
self.__write_obj.write(line)
|
||||
|
||||
def fix_info(self):
|
||||
"""
|
||||
Requires:
|
||||
nothing
|
||||
Returns:
|
||||
nothing (changes the original file)
|
||||
Logic:
|
||||
Read one line in at a time. Determine what action to take based on
|
||||
the state. If the state is before the information table, look for the
|
||||
beginning of the style table.
|
||||
If the state is in the information table, use other methods to
|
||||
parse the information
|
||||
style table, look for lines with style info, and substitute the
|
||||
number with the name of the style. If the state if afer the
|
||||
information table, simply write the line to the output file.
|
||||
"""
|
||||
self.__initiate_values()
|
||||
with open_for_read(self.__file) as read_obj:
|
||||
with open_for_write(self.__write_to) as self.__write_obj:
|
||||
for line in read_obj:
|
||||
self.__token_info = line[:16]
|
||||
action = self.__state_dict.get(self.__state)
|
||||
if action is None:
|
||||
sys.stderr.write('No matching state in module styles.py\n')
|
||||
sys.stderr.write(self.__state + '\n')
|
||||
action(line)
|
||||
copy_obj = copy.Copy(bug_handler=self.__bug_handler)
|
||||
if self.__copy:
|
||||
copy_obj.copy_file(self.__write_to, "info.data")
|
||||
copy_obj.rename(self.__write_to, self.__file)
|
||||
os.remove(self.__write_to)
|
||||
427
ebook_converter/ebooks/rtf2xml/inline.py
Normal file
427
ebook_converter/ebooks/rtf2xml/inline.py
Normal file
@@ -0,0 +1,427 @@
|
||||
from __future__ import unicode_literals, absolute_import, print_function, division
|
||||
import sys, os
|
||||
|
||||
from calibre.ebooks.rtf2xml import copy
|
||||
from calibre.ptempfile import better_mktemp
|
||||
from . import open_for_read, open_for_write
|
||||
|
||||
"""
|
||||
States.
|
||||
1. default
|
||||
1. an open bracket ends this state.
|
||||
2. Text print out text. Print out any groups_in_waiting.
|
||||
3. closed bracket. Close groups
|
||||
2. after an open bracket
|
||||
1. The lack of a control word ends this state.
|
||||
2. paragraph end -- close out all tags
|
||||
3. footnote beg -- close out all tags
|
||||
"""
|
||||
|
||||
|
||||
class Inline:
    """
    Make inline tags within lists.
    Logic:
    """

    def __init__(self,
            in_file,
            bug_handler,
            copy=None,
            run_level=1,):
        """
        Required:
            'file'--file to parse
        Optional:
            'copy'-- whether to make a copy of result for debugging
            'temp_dir' --where to output temporary results (default is
            directory from which the script is run.)
        Returns:
            nothing
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__run_level = run_level
        # temporary output file; renamed over the input when done
        self.__write_to = better_mktemp()

    def __initiate_values(self):
        """
        Initiate all values.
        """
        # state name => handler for one line of input
        self.__state_dict = {
            'default': self.__default_func,
            'after_open_bracket': self.__after_open_bracket_func,
        }
        # token handlers while in the 'default' state
        self.__default_dict = {
            'ob<nu<open-brack': self.__found_open_bracket_func,
            'tx<nu<__________' : self.__found_text_func,
            'tx<hx<__________' : self.__found_text_func,
            'tx<ut<__________' : self.__found_text_func,
            'mi<mk<inline-fld' : self.__found_text_func,
            'text' : self.__found_text_func,
            'cb<nu<clos-brack' : self.__close_bracket_func,
            'mi<mk<par-end___' : self.__end_para_func,
            'mi<mk<footnt-ope' : self.__end_para_func,
            'mi<mk<footnt-ind' : self.__end_para_func,
        }
        # token handlers while in the 'after_open_bracket' state
        self.__after_open_bracket_dict = {
            'cb<nu<clos-brack' : self.__close_bracket_func,
            'tx<nu<__________' : self.__found_text_func,
            'tx<hx<__________' : self.__found_text_func,
            'tx<ut<__________' : self.__found_text_func,
            'text' : self.__found_text_func,
            'mi<mk<inline-fld' : self.__found_text_func,
            'ob<nu<open-brack': self.__found_open_bracket_func,
            'mi<mk<par-end___' : self.__end_para_func,
            'mi<mk<footnt-ope' : self.__end_para_func,
            'mi<mk<footnt-ind' : self.__end_para_func,
            'cw<fd<field_____' : self.__found_field_func,
        }
        self.__state = 'default'
        self.__brac_count = 0  # do I need this?
        # separate inline-group stacks for list text vs. body text;
        # __inline_list / __groups_in_waiting alias whichever is active
        self.__list_inline_list = []
        self.__body_inline_list = []
        # single-element lists so the count is shared through the alias
        self.__groups_in_waiting_list = [0]
        self.__groups_in_waiting_body = [0]
        self.__groups_in_waiting = self.__groups_in_waiting_body
        self.__place = 'non_list'
        self.__inline_list = self.__body_inline_list
        self.__in_para = 0  # not in paragraph
        self.__char_dict = {
            # character info => ci
            'annotation' : 'annotation',
            'blue______' : 'blue',
            'bold______' : 'bold',
            'caps______' : 'caps',
            'char-style' : 'character-style',
            'dbl-strike' : 'double-strike-through',
            'emboss____' : 'emboss',
            'engrave___' : 'engrave',
            'font-color' : 'font-color',
            'font-down_' : 'subscript',
            'font-size_' : 'font-size',
            'font-style' : 'font-style',
            'font-up___' : 'superscript',
            'footnot-mk' : 'footnote-marker',
            'green_____' : 'green',
            'hidden____' : 'hidden',
            'italics___' : 'italics',
            'outline___' : 'outline',
            'red_______' : 'red',
            'shadow____' : 'shadow',
            'small-caps' : 'small-caps',
            'strike-thr' : 'strike-through',
            'subscript_' : 'subscript',
            'superscrip' : 'superscript',
            'underlined' : 'underlined',
        }
        self.__caps_list = ['false']

    def __set_list_func(self, line):
        """
        Requires:
            line--line of text
        Returns:
            nothing
        Logic:
            Track whether we are inside list text; swap the active inline
            stack and groups-in-waiting counter to match.
        """
        if self.__place == 'in_list':
            if self.__token_info == 'mi<mk<lst-tx-end':
                self.__place = 'not_in_list'
                self.__inline_list = self.__body_inline_list
                self.__groups_in_waiting = self.__groups_in_waiting_body
        else:
            if self.__token_info == 'mi<mk<lst-tx-beg':
                self.__place = 'in_list'
                self.__inline_list = self.__list_inline_list
                self.__groups_in_waiting = self.__groups_in_waiting_list

    def __default_func(self, line):
        """
        Requires:
            line-- line of text
        Returns:
            nothing
        Logic:
            Dispatch on the token, then always write the line.
        """
        action = self.__default_dict.get(self.__token_info)
        if action:
            action(line)
        self.__write_obj.write(line)

    def __found_open_bracket_func(self, line):
        """
        Requires:
            line -- current line of text
        Returns:
            nothing
        Logic:
            Change the state to 'after_open_bracket'
        """
        self.__state = 'after_open_bracket'
        self.__brac_count += 1
        self.__groups_in_waiting[0] += 1
        # push an empty group dictionary for this bracket level
        self.__inline_list.append({})
        self.__inline_list[-1]['contains_inline'] = 0

    def __after_open_bracket_func(self, line):
        """
        Requires:
            line --line of text
        Returns:
            nothing
        Logic:
            If the token is a control word for character info (cw<ci), use another
            method to add to the dictionary.
            Use the dictionary to get the appropriate function.
            Always print out the line.
        """
        if line[0:5] == 'cw<ci':  # calibre: bug in original function no diff between cw<ci and cw<pf
            self.__handle_control_word(line)
        else:
            action = self.__after_open_bracket_dict.get(self.__token_info)
            if action:
                self.__state = 'default'  # a non control word?
                action(line)
        self.__write_obj.write(line)

    def __handle_control_word(self, line):
        """
        Required:
            line --line of text
        Returns:
            nothing
        Logic:
            Handle the control word for inline groups.
            Add each name - value to a dictionary.
            If the font style of Symbol, Wingdings, or Dingbats is found,
            always mark this. I need this later to convert the text to
            the right utf.
        """
        # cw<ci<shadow_____<nu<true
        # self.__char_dict = {
        char_info = line[6:16]
        char_value = line[20:-1]
        name = self.__char_dict.get(char_info)
        if name:
            self.__inline_list[-1]['contains_inline'] = 1
            self.__inline_list[-1][name] = char_value
        """
        if name == 'font-style':
            if char_value == 'Symbol':
                self.__write_obj.write('mi<mk<font-symbo\n')
            elif char_value == 'Wingdings':
                self.__write_obj.write('mi<mk<font-wingd\n')
            elif char_value == 'Zapf Dingbats':
                self.__write_obj.write('mi<mk<font-dingb\n')
        """

    def __close_bracket_func(self, line):
        """
        Requires:
            line --line of text
        Returns:
            Nothing
        Logic:
            If there are no inline groups, do nothing.
            Get the keys of the last dictionary in the inline_groups.
            If 'contains_inline' in the keys, write a close tag.
            If the_dict contains font information, write a mk tag.
        """
        if len(self.__inline_list) == 0:
            # nothing to add
            return
        the_dict = self.__inline_list[-1]
        the_keys = the_dict.keys()
        # always close out
        if self.__place == 'in_list':
            if 'contains_inline' in the_keys and the_dict['contains_inline'] == 1\
                and self.__groups_in_waiting[0] == 0:
                self.__write_obj.write('mi<tg<close_____<inline\n')
                if 'font-style' in the_keys:
                    self.__write_obj.write('mi<mk<font-end__\n')
                if 'caps' in the_keys:
                    self.__write_obj.write('mi<mk<caps-end__\n')
        else:
            # close out only if in a paragraph
            if 'contains_inline' in the_keys and the_dict['contains_inline'] == 1\
                and self.__in_para and self.__groups_in_waiting[0] == 0:
                self.__write_obj.write('mi<tg<close_____<inline\n')
                if 'font-style' in the_keys:
                    self.__write_obj.write('mi<mk<font-end__\n')
                if 'caps' in the_keys:
                    self.__write_obj.write('mi<mk<caps-end__\n')
        self.__inline_list.pop()
        if self.__groups_in_waiting[0] != 0:
            self.__groups_in_waiting[0] -= 1

    def __found_text_func(self, line):
        """
        Required:
            line--line of text
        Return:
            nothing
        Logic:
            Three cases:
            1. in a list. Simply write inline
            2. Not in a list
                Text can mark the start of a paragraph.
                If already in a paragraph, check to see if any groups are waiting
                to be added. If so, use another method to write these groups.
        """
        if self.__place == 'in_list':
            self.__write_inline()
        else:
            if not self.__in_para:
                self.__in_para = 1
                self.__start_para_func(line)
            elif self.__groups_in_waiting[0] != 0:
                self.__write_inline()

    def __write_inline(self):
        """
        Required:
            nothing
        Returns
            Nothing
        Logic:
            Method for writing inline when text is found.
            Only write those groups that are "waiting", or that have no
            tags yet.
            First, slice the list self.__inline list to get just the groups
            in waiting.
            Iterate through this slice, which contains only dictionaries.
            Get the keys in each dictionary. If 'font-style' is in the keys,
            write a marker tag. (I will use this marker tag later when converting
            hex text to utf8.)
            Write a tag for the inline values.
        """
        if self.__groups_in_waiting[0] != 0:
            last_index = -1 * self.__groups_in_waiting[0]
            inline_list = self.__inline_list[last_index:]
            if len(inline_list) <= 0:
                if self.__run_level > 3:
                    msg = 'self.__inline_list is %s\n' % self.__inline_list
                    raise self.__bug_handler(msg)
                self.__write_obj.write('error\n')
                self.__groups_in_waiting[0] = 0
                return
            for the_dict in inline_list:
                if the_dict['contains_inline']:
                    the_keys = the_dict.keys()
                    if 'font-style' in the_keys:
                        face = the_dict['font-style']
                        self.__write_obj.write('mi<mk<font______<%s\n' % face)
                    if 'caps' in the_keys:
                        value = the_dict['caps']
                        self.__write_obj.write('mi<mk<caps______<%s\n' % value)
                    self.__write_obj.write('mi<tg<open-att__<inline')
                    for the_key in the_keys:
                        if the_key != 'contains_inline':
                            self.__write_obj.write('<%s>%s' % (the_key, the_dict[the_key]))
                    self.__write_obj.write('\n')
        self.__groups_in_waiting[0] = 0

    def __end_para_func(self, line):
        """
        Requires:
            line -- line of text
        Returns:
            nothing
        Logic:
            Slice from the end the groups in waiting.
            Iterate through the list. If the dictionary contains info, write
            a closing tag.
        """
        if not self.__in_para:
            return
        if self.__groups_in_waiting[0] == 0:
            inline_list = self.__inline_list
        else:
            last_index = -1 * self.__groups_in_waiting[0]
            inline_list = self.__inline_list[0:last_index]
        for the_dict in inline_list:
            contains_info = the_dict.get('contains_inline')
            if contains_info:
                the_keys = the_dict.keys()
                if 'font-style' in the_keys:
                    self.__write_obj.write('mi<mk<font-end__\n')
                if 'caps' in the_keys:
                    self.__write_obj.write('mi<mk<caps-end__\n')
                self.__write_obj.write('mi<tg<close_____<inline\n')
        self.__in_para = 0

    def __start_para_func(self, line):
        """
        Requires:
            line -- line of text
        Returns:
            nothing
        Logic:
            Iterate through the self.__inline_list to get each dict.
            If the dict contains inline info, get the keys.
            Iterate through the keys and print out the key and value.
        """
        for the_dict in self.__inline_list:
            contains_info = the_dict.get('contains_inline')
            if contains_info:
                the_keys = the_dict.keys()
                if 'font-style' in the_keys:
                    face = the_dict['font-style']
                    self.__write_obj.write('mi<mk<font______<%s\n' % face)
                if 'caps' in the_keys:
                    value = the_dict['caps']
                    self.__write_obj.write('mi<mk<caps______<%s\n' % value)
                self.__write_obj.write('mi<tg<open-att__<inline')
                for the_key in the_keys:
                    if the_key != 'contains_inline':
                        self.__write_obj.write('<%s>%s' % (the_key, the_dict[the_key]))
                self.__write_obj.write('\n')
        self.__groups_in_waiting[0] = 0

    def __found_field_func(self, line):
        """
        Just a default function to make sure I don't prematurely exit
        default state
        """
        pass

    def form_tags(self):
        """
        Requires:
            area--area to parse (list or non-list)
        Returns:
            nothing
        Logic:
            Read one line in at a time. Determine what action to take based on
            the state.
        """
        self.__initiate_values()
        with open_for_read(self.__file) as read_obj:
            with open_for_write(self.__write_to) as self.__write_obj:
                for line in read_obj:
                    token = line[0:-1]
                    self.__token_info = ''
                    # treat these special characters the same as plain text
                    if token == 'tx<mc<__________<rdblquote'\
                        or token == 'tx<mc<__________<ldblquote'\
                        or token == 'tx<mc<__________<lquote'\
                        or token == 'tx<mc<__________<rquote'\
                        or token == 'tx<mc<__________<emdash'\
                        or token == 'tx<mc<__________<endash'\
                        or token == 'tx<mc<__________<bullet':
                        self.__token_info = 'text'
                    else:
                        self.__token_info = line[:16]
                    self.__set_list_func(line)
                    action = self.__state_dict.get(self.__state)
                    if action is None:
                        sys.stderr.write('No matching state in module inline.py\n')
                        sys.stderr.write(self.__state + '\n')
                    action(line)
        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "inline.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)
|
||||
56
ebook_converter/ebooks/rtf2xml/line_endings.py
Normal file
56
ebook_converter/ebooks/rtf2xml/line_endings.py
Normal file
@@ -0,0 +1,56 @@
|
||||
from __future__ import unicode_literals, absolute_import, print_function, division
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import os
|
||||
|
||||
from calibre.ebooks.rtf2xml import copy
|
||||
from calibre.utils.cleantext import clean_ascii_chars
|
||||
from calibre.ptempfile import better_mktemp
|
||||
|
||||
|
||||
class FixLineEndings:
    """Fix line endings"""

    def __init__(self,
            bug_handler,
            in_file=None,
            copy=None,
            run_level=1,
            replace_illegals=1,
            ):
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__run_level = run_level
        # temporary output file; renamed over the input when done
        self.__write_to = better_mktemp()
        self.__replace_illegals = replace_illegals

    def fix_endings(self):
        """Rewrite the input file with Unix line endings and, optionally,
        with illegal ASCII control characters stripped."""
        # slurp the whole file as raw bytes
        with open(self.__file, 'rb') as source:
            data = source.read()
        # calibre: normalise Windows then old-Mac endings to Unix
        data = data.replace(b'\r\n', b'\n').replace(b'\r', b'\n')
        # remove ASCII invalid chars : 0 to 8 and 11-14 to 24-26-27
        if self.__replace_illegals:
            data = clean_ascii_chars(data)
        # write the cleaned bytes out to the temp file
        with open(self.__write_to, 'wb') as sink:
            sink.write(data)
        # keep a debugging copy if requested, then replace the original
        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "line_endings.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)
|
||||
201
ebook_converter/ebooks/rtf2xml/list_numbers.py
Normal file
201
ebook_converter/ebooks/rtf2xml/list_numbers.py
Normal file
@@ -0,0 +1,201 @@
|
||||
from __future__ import unicode_literals, absolute_import, print_function, division
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import os
|
||||
from calibre.ebooks.rtf2xml import copy
|
||||
from calibre.ptempfile import better_mktemp
|
||||
from . import open_for_read, open_for_write
|
||||
|
||||
|
||||
class ListNumbers:
|
||||
"""
|
||||
RTF puts list numbers outside of the paragraph. The public method
|
||||
in this class put the list numbers inside the paragraphs.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
in_file,
|
||||
bug_handler,
|
||||
copy=None,
|
||||
run_level=1,
|
||||
):
|
||||
"""
|
||||
Required:
|
||||
'file'
|
||||
Optional:
|
||||
'copy'-- whether to make a copy of result for debugging
|
||||
'temp_dir' --where to output temporary results (default is
|
||||
directory from which the script is run.)
|
||||
Returns:
|
||||
nothing
|
||||
"""
|
||||
self.__file = in_file
|
||||
self.__bug_handler = bug_handler
|
||||
self.__copy = copy
|
||||
self.__write_to = better_mktemp()
|
||||
|
||||
def __initiate_values(self):
|
||||
"""
|
||||
initiate values for fix_list_numbers.
|
||||
Required:
|
||||
Nothing
|
||||
Return:
|
||||
Nothing
|
||||
"""
|
||||
self.__state = "default"
|
||||
self.__list_chunk = ''
|
||||
self.__previous_line = ''
|
||||
self.__list_text_ob_count = ''
|
||||
self.__state_dict={
|
||||
'default' : self.__default_func,
|
||||
'after_ob' : self.__after_ob_func,
|
||||
'list_text' : self.__list_text_func,
|
||||
'after_list_text' : self.__after_list_text_func
|
||||
}
|
||||
|
||||
def __after_ob_func(self, line):
|
||||
"""
|
||||
Handle the line immediately after an open bracket.
|
||||
Required:
|
||||
self, line
|
||||
Returns:
|
||||
Nothing
|
||||
"""
|
||||
if self.__token_info == 'cw<ls<list-text_':
|
||||
self.__state = 'list_text'
|
||||
self.__list_chunk = self.__list_chunk + \
|
||||
self.__previous_line + line
|
||||
self.__list_text_ob = self.__ob_count
|
||||
self.__cb_count = 0
|
||||
else:
|
||||
self.__write_obj.write(self.__previous_line)
|
||||
self.__write_obj.write(line)
|
||||
self.__state = 'default'
|
||||
|
||||
def __after_list_text_func(self, line):
|
||||
"""
|
||||
Look for an open bracket or a line of text, and then print out the
|
||||
self.__list_chunk. Print out the line.
|
||||
"""
|
||||
if line[0:2] == 'ob' or line[0:2] == 'tx':
|
||||
self.__state = 'default'
|
||||
self.__write_obj.write('mi<mk<lst-txbeg_\n')
|
||||
self.__write_obj.write('mi<mk<para-beg__\n')
|
||||
self.__write_obj.write('mi<mk<lst-tx-beg\n')
|
||||
self.__write_obj.write(
|
||||
# 'mi<tg<open-att__<list-text<type>%s\n' % self.__list_type)
|
||||
'mi<tg<open-att__<list-text\n')
|
||||
self.__write_obj.write(self.__list_chunk)
|
||||
self.__write_obj.write('mi<tg<close_____<list-text\n')
|
||||
self.__write_obj.write('mi<mk<lst-tx-end\n')
|
||||
self.__list_chunk = ''
|
||||
self.__write_obj.write(line)
|
||||
|
||||
def __determine_list_type(self, chunk):
|
||||
"""
|
||||
Determine if the list is ordered or itemized
|
||||
"""
|
||||
lines = chunk.split('\n')
|
||||
text_string = ''
|
||||
for line in lines:
|
||||
if line[0:5] == 'tx<hx':
|
||||
if line[17:] == '\'B7':
|
||||
return "unordered"
|
||||
elif line[0:5] == 'tx<nu':
|
||||
text_string += line[17:]
|
||||
text_string = text_string.replace('.', '')
|
||||
text_string = text_string.replace('(', '')
|
||||
text_string = text_string.replace(')', '')
|
||||
if text_string.isdigit():
|
||||
return 'ordered'
|
||||
"""
|
||||
sys.stderr.write('module is list_numbers\n')
|
||||
sys.stderr.write('method is __determine type\n')
|
||||
sys.stderr.write('Couldn\'t get type of list\n')
|
||||
"""
|
||||
# must be some type of ordered list -- just a guess!
|
||||
return 'unordered'
|
||||
|
||||
def __list_text_func(self, line):
|
||||
"""
|
||||
Handle lines that are part of the list text. If the end of the list
|
||||
text is found (the closing bracket matches the self.__list_text_ob),
|
||||
then change the state. Always add the line to the self.__list_chunk
|
||||
Required:
|
||||
self, line
|
||||
Returns:
|
||||
Nothing
|
||||
"""
|
||||
if self.__list_text_ob == self.__cb_count:
|
||||
self.__state = 'after_list_text'
|
||||
self.__right_after_list_text = 1
|
||||
self.__list_type = self.__determine_list_type(self.__list_chunk)
|
||||
self.__write_obj.write('mi<mk<list-type_<%s\n' % self.__list_type)
|
||||
if self.__token_info != 'cw<pf<par-def___':
|
||||
self.__list_chunk = self.__list_chunk + line
|
||||
|
||||
def __default_func(self, line):
|
||||
"""
|
||||
Handle the lines that are not part of any special state. Look for an
|
||||
opening bracket. If an open bracket is found, add this line to a
|
||||
temporary self.__previous line, which other methods need. Otherwise,
|
||||
print out the line.
|
||||
Required:
|
||||
self, line
|
||||
Returns:
|
||||
Nothing
|
||||
"""
|
||||
if self.__token_info == 'ob<nu<open-brack':
|
||||
self.__state = 'after_ob'
|
||||
self.__previous_line = line
|
||||
else:
|
||||
self.__write_obj.write(line)
|
||||
|
||||
def fix_list_numbers(self):
    """
    Rewrite the working file, marking list types.

    Logic:
        Read one line at a time, tracking opening and closing bracket
        counts, and dispatch each line to the handler for the current
        state.  When the whole file has been processed, optionally keep
        a debugging copy, then replace the original file with the
        rewritten one.
    """
    self.__initiate_values()
    read_obj = open_for_read(self.__file)
    self.__write_obj = open_for_write(self.__write_to)
    while True:
        line = read_obj.readline()
        self.__token_info = line[:16]
        if self.__token_info == 'ob<nu<open-brack':
            self.__ob_count = line[-5:-1]
        if self.__token_info == 'cb<nu<clos-brack':
            self.__cb_count = line[-5:-1]
        # NOTE: the final empty sentinel line is still dispatched once,
        # exactly as the original readline loop did.
        self.__state_dict.get(self.__state)(line)
        if not line:
            break
    read_obj.close()
    self.__write_obj.close()
    copy_obj = copy.Copy(bug_handler=self.__bug_handler)
    if self.__copy:
        copy_obj.copy_file(self.__write_to, "list_numbers.data")
    copy_obj.rename(self.__write_to, self.__file)
    os.remove(self.__write_to)
|
||||
447
ebook_converter/ebooks/rtf2xml/list_table.py
Normal file
447
ebook_converter/ebooks/rtf2xml/list_table.py
Normal file
@@ -0,0 +1,447 @@
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
|
||||
from polyglot.builtins import unicode_type
|
||||
|
||||
|
||||
class ListTable:
    """
    Parse the tokenised RTF list table.

    Feed the whole list-table group to parse_list_table(); it returns a
    string of output tokens together with a nested list describing every
    list and every level found.
    """

    def __init__(self, bug_handler, run_level=1):
        """
        Required:
            bug_handler -- exception class raised on fatal parse errors
        Optional:
            run_level -- strictness/verbosity level
        """
        self.__bug_handler = bug_handler
        self.__run_level = run_level
        self.__initiate_values()

    def __initiate_values(self):
        """Set up state, dispatch table, and token -> attribute maps."""
        self.__list_table_final = ''
        self.__state = 'default'
        self.__final_dict = {}
        self.__list_dict = {}
        self.__all_lists = []
        self.__level_text_string = ''
        self.__level_text_list = []
        self.__found_level_text_length = 0
        self.__level_text_position = None
        self.__prefix_string = None
        self.__level_numbers_string = ''
        self.__state_dict = {
            'default': self.__default_func,
            'level': self.__level_func,
            'list': self.__list_func,
            'unsure_ob': self.__after_bracket_func,
            'level_number': self.__level_number_func,
            'level_text': self.__level_text_func,
            'list_name': self.__list_name_func,
        }
        # tokens recognised while directly inside a \list group
        self.__main_list_dict = {
            'cw<ls<ls-tem-id_': 'list-template-id',
            'cw<ls<list-hybri': 'list-hybrid',
            'cw<ls<lis-tbl-id': 'list-table-id',
        }
        # tokens recognised while inside a \listlevel group
        self.__level_dict = {
            'cw<ls<level-star': 'list-number-start',
            'cw<ls<level-spac': 'list-space',
            'cw<ls<level-inde': 'level-indent',
            'cw<ls<fir-ln-ind': 'first-line-indent',
            'cw<ls<left-inden': 'left-indent',
            'cw<ls<tab-stop__': 'tabs',
            'cw<ls<level-type': 'numbering-type',
            'cw<pf<right-inde': 'right-indent',
            'cw<pf<left-inden': 'left-indent',
            'cw<pf<fir-ln-ind': 'first-line-indent',
            'cw<ci<italics___': 'italics',
            'cw<ci<bold______': 'bold',
            'cw<ss<para-style': 'paragraph-style-name',
        }
        # Shape of self.__all_lists:
        #   [ [ {list attributes}, [ {level attributes} ], ... ], ... ]
        # each outer entry is one list; its first item is the list's
        # attribute dict, each following item is a one-dict list of
        # level attributes.

    def __parse_lines(self, line):
        """
        Required: line -- the whole list-table text to parse
        Returns: nothing
        Logic:
            Split on newlines, track bracket counts, and dispatch each
            token line to the handler for the current state.  Finish by
            assembling the final output string.
        """
        self.__ob_count = 0
        self.__ob_group = 0
        for one_line in line.split('\n'):
            self.__token_info = one_line[:16]
            if self.__token_info == 'ob<nu<open-brack':
                self.__ob_count = one_line[-4:]
                self.__ob_group += 1
            if self.__token_info == 'cb<nu<clos-brack':
                self.__cb_count = one_line[-4:]
                self.__ob_group -= 1
            action = self.__state_dict.get(self.__state)
            if action is None:
                # unknown state -- surface it for debugging
                print(self.__state)
            action(one_line)
        self.__write_final_string()

    def __default_func(self, line):
        """Outside any group: an opening bracket starts a new group."""
        if self.__token_info == 'ob<nu<open-brack':
            self.__state = 'unsure_ob'

    def __found_list_func(self, line):
        """
        Found \\list: enter the 'list' state, remember the bracket count
        that will close it, and start a fresh entry in self.__all_lists
        whose first item is a dict holding an (initially empty) list of
        the ids for which this formatting is valid.
        """
        self.__state = 'list'
        self.__list_ob_count = self.__ob_count
        self.__all_lists.append([{'list-id': []}])

    def __list_func(self, line):
        """
        In a list but outside a level: close the list on its matching
        bracket, open a sub-group on '{', otherwise record recognised
        list attributes in the list's dictionary.
        """
        if (self.__token_info == 'cb<nu<clos-brack' and
                self.__cb_count == self.__list_ob_count):
            self.__state = 'default'
        elif self.__token_info == 'ob<nu<open-brack':
            self.__state = 'unsure_ob'
        else:
            att = self.__main_list_dict.get(self.__token_info)
            if att:
                # the dictionary is always the first item of the last list
                self.__all_lists[-1][0][att] = line[20:]

    def __found_level_func(self, line):
        """
        Found \\listlevel: enter the 'level' state, remember the bracket
        count that will close it, and append a new level entry (a list
        containing one attribute dict) to the current list, so that
        self.__all_lists[-1][-1][0] is the dict of level attributes.
        """
        self.__state = 'level'
        self.__level_ob_count = self.__ob_count
        self.__all_lists[-1].append([{}])
        # FIX: removed a stray no-op expression statement
        # (`self.__level_dict`) that had no effect.

    def __level_func(self, line):
        """
        Inside \\listlevel: close on the matching bracket, open a
        sub-group on '{', otherwise record recognised level attributes.
        """
        if (self.__token_info == 'cb<nu<clos-brack' and
                self.__cb_count == self.__level_ob_count):
            self.__state = 'list'
        elif self.__token_info == 'ob<nu<open-brack':
            self.__state = 'unsure_ob'
        else:
            att = self.__level_dict.get(self.__token_info)
            if att:
                self.__all_lists[-1][-1][0][att] = line[20:]

    def __level_number_func(self, line):
        """
        Accumulate the raw \\levelnumbers payload; on the group's closing
        bracket, store it as the 'level-numbers' level attribute.
        """
        if (self.__token_info == 'cb<nu<clos-brack' and
                self.__cb_count == self.__level_number_ob_count):
            self.__state = 'level'
            self.__all_lists[-1][-1][0]['level-numbers'] = \
                self.__level_numbers_string
            self.__level_numbers_string = ''
        elif self.__token_info == 'tx<hx<__________':
            # FIX: the original string literal was malformed
            # ('\\'%s' -- unbalanced quotes).  Rebuild the RTF hex
            # escape \'xx from the payload after the apostrophe.
            self.__level_numbers_string += "\\'%s" % line[18:]
        elif self.__token_info == 'tx<nu<__________':
            self.__level_numbers_string += line[17:]

    def __level_text_func(self, line):
        """
        Inside \\leveltext: on the closing bracket, store a pending
        bullet prefix (if this level is a bullet) and reset; hex tokens
        go to __parse_level_text_length; plain text becomes the marker
        for a level (e.g. level1-suffix => '.'); otherwise look for a
        level-template-id.
        """
        if (self.__token_info == 'cb<nu<clos-brack' and
                self.__cb_count == self.__level_text_ob_count):
            if self.__prefix_string:
                if self.__all_lists[-1][-1][0]['numbering-type'] == 'bullet':
                    self.__prefix_string = self.__prefix_string.replace('_', '')
                    self.__all_lists[-1][-1][0]['bullet-type'] = \
                        self.__prefix_string
            self.__state = 'level'
            self.__level_text_string = ''
            self.__found_level_text_length = 0
        elif self.__token_info == 'tx<hx<__________':
            self.__parse_level_text_length(line)
        elif self.__token_info == 'tx<nu<__________':
            text = line[17:]
            if text and text[-1] == ';':
                text = text.replace(';', '')
            if not self.__level_text_position:
                self.__prefix_string = text
            else:
                self.__all_lists[-1][-1][0][self.__level_text_position] = text
        elif self.__token_info == 'cw<ls<lv-tem-id_':
            self.__all_lists[-1][-1][0]['level-template-id'] = line[20:]

    def __parse_level_text_length(self, line):
        """
        Parse one hexadecimal token inside the \\leveltext group.  The
        first value is the text length; each later value marks where a
        level number appears, producing levelN-suffix / show-levelN /
        levelN-prefix attributes.
        """
        the_num = int(line[18:], 16)
        if not self.__found_level_text_length:
            self.__all_lists[-1][-1][0]['list-text-length'] = str(the_num)
            self.__found_level_text_length = 1
        else:
            the_num += 1
            the_string = str(the_num)
            level_marker = 'level%s-suffix' % the_string
            show_marker = 'show-level%s' % the_string
            self.__level_text_position = level_marker
            self.__all_lists[-1][-1][0][show_marker] = 'true'
            if self.__prefix_string:
                prefix_marker = 'level%s-prefix' % the_string
                self.__all_lists[-1][-1][0][prefix_marker] = \
                    self.__prefix_string
                self.__prefix_string = None

    def __list_name_func(self, line):
        """Skip the \\listname group; just watch for its closing bracket."""
        if (self.__token_info == 'cb<nu<clos-brack' and
                self.__cb_count == self.__list_name_ob_count):
            self.__state = 'list'

    def __after_bracket_func(self, line):
        """
        The previous token was '{'; decide which group has just opened.
        WARNING: if no group is recognised the state remains 'unsure_ob'
        and no further text is parsed.
        """
        if self.__token_info == 'cw<ls<level-text':
            self.__state = 'level_text'
            self.__level_text_ob_count = self.__ob_count
        elif self.__token_info == 'cw<ls<level-numb':
            self.__level_number_ob_count = self.__ob_count
            self.__state = 'level_number'
        elif self.__token_info == 'cw<ls<list-tb-le':
            self.__found_level_func(line)
        elif self.__token_info == 'cw<ls<list-in-tb':
            self.__found_list_func(line)
        elif self.__token_info == 'cw<ls<list-name_':
            self.__state = 'list_name'
            self.__list_name_ob_count = self.__ob_count
        else:
            if self.__run_level > 3:
                msg = 'No matching token after open bracket\n'
                msg += 'token is "%s\n"' % (line)
                # FIX: msg was built but never passed to the raised
                # bug-handler exception.
                raise self.__bug_handler(msg)

    def __add_to_final_line(self):
        """Method no longer used."""
        self.__list_table_final = 'mi<mk<listabbeg_\n'
        self.__list_table_final += ('mi<tg<open______<list-table\n' +
                                    'mi<mk<listab-beg\n' +
                                    self.__list_table_final)
        self.__list_table_final += ('mi<mk<listab-end\n' +
                                    'mi<tg<close_____<list-table\n')
        self.__list_table_final += 'mi<mk<listabend_\n'

    def __write_final_string(self):
        """
        Build self.__list_table_final: a list-table start tag, then one
        list-in-table tag per list (its key => value pairs), then one
        level-in-table tag per level.  Bullet levels have their prefix
        and suffix folded into bullet text instead of being emitted.
        """
        not_allow = ['list-id', ]
        list_count = 0  # kept for the disabled list-id emission below
        self.__list_table_final = 'mi<mk<listabbeg_\n'
        self.__list_table_final += ('mi<tg<open______<list-table\n' +
                                    'mi<mk<listab-beg\n' +
                                    self.__list_table_final)
        # FIX: renamed loop variables that shadowed the builtins
        # `list` and `id`.
        for one_list in self.__all_lists:
            list_count += 1
            self.__list_table_final += 'mi<tg<open-att__<list-in-table'
            # self.__list_table_final += '<list-id>%s' % (str(list_count))
            the_dict = one_list[0]
            for the_key in the_dict.keys():
                if the_key in not_allow:
                    continue
                self.__list_table_final += '<%s>%s' % (the_key,
                                                       the_dict[the_key])
            self.__list_table_final += '\n'
            level_num = 0
            for level in one_list[1:]:
                level_num += 1
                self.__list_table_final += 'mi<tg<empty-att_<level-in-table'
                self.__list_table_final += '<level>%s' % (str(level_num))
                the_dict2 = level[0]
                is_bullet = 0
                bullet_text = ''
                for the_key2 in the_dict2.keys():
                    if the_key2 in not_allow:
                        continue
                    if the_dict2.get('numbering-type') == 'bullet':
                        is_bullet = 1
                    value2 = the_dict2[the_key2]
                    if the_key2[0:10] == 'show-level' and is_bullet:
                        pass
                    elif the_key2[-6:] == 'suffix' and is_bullet:
                        bullet_text += value2
                    elif the_key2[-6:] == 'prefix' and is_bullet:
                        bullet_text += value2
                    else:
                        self.__list_table_final += '<%s>%s' % (the_key2,
                                                               value2)
                self.__list_table_final += '\n'
            self.__list_table_final += 'mi<tg<close_____<list-in-table\n'
        self.__list_table_final += ('mi<mk<listab-end\n' +
                                    'mi<tg<close_____<list-table\n')
        self.__list_table_final += 'mi<mk<listabend_\n'

    def parse_list_table(self, line):
        """
        Required:
            line -- the complete list-table token text
        Returns:
            (final string, list of all lists/levels found)
        """
        self.__parse_lines(line)
        return self.__list_table_final, self.__all_lists
|
||||
465
ebook_converter/ebooks/rtf2xml/make_lists.py
Normal file
465
ebook_converter/ebooks/rtf2xml/make_lists.py
Normal file
@@ -0,0 +1,465 @@
|
||||
from __future__ import unicode_literals, absolute_import, print_function, division
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import sys, os, re
|
||||
|
||||
from calibre.ebooks.rtf2xml import copy
|
||||
from calibre.ptempfile import better_mktemp
|
||||
from polyglot.builtins import unicode_type
|
||||
|
||||
from . import open_for_read, open_for_write
|
||||
|
||||
|
||||
class MakeLists:
|
||||
"""
|
||||
Form lists.
|
||||
Use RTF's own formatting to determine if a paragraph definition is part of a
|
||||
list.
|
||||
Use indents to determine items and how lists are nested.
|
||||
"""
|
||||
|
||||
def __init__(self,
             in_file,
             bug_handler,
             headings_to_sections,
             list_of_lists,
             copy=None,
             run_level=1,
             no_headings_as_list=1,
             write_list_info=0,
             ):
    """
    Required:
        in_file -- file to process
        bug_handler -- exception class raised on fatal errors
        headings_to_sections -- whether headings become sections
        list_of_lists -- parsed list-table information
    Optional:
        copy -- keep a debugging copy of the result
        run_level -- verbosity level
        no_headings_as_list -- keep headings out of lists
        write_list_info -- dump list-table info into the output
    """
    self.__file = in_file
    self.__bug_handler = bug_handler
    self.__copy = copy
    self.__run_level = run_level
    self.__headings_to_sections = headings_to_sections
    self.__no_headings_as_list = no_headings_as_list
    self.__list_of_lists = list_of_lists
    self.__write_list_info = write_list_info
    self.__write_to = better_mktemp()
|
||||
|
||||
def __initiate_values(self):
    """
    Reset all per-run state.

    self.__end_list holds token infos that force every open list to
    close; self.__end_lines holds complete lines that do the same.
    """
    self.__state = "default"
    self.__left_indent = 0
    self.__list_type = 'not-defined'
    self.__pard_def = ""
    self.__all_lists = []
    self.__level = 0
    self.__list_chunk = ''
    self.__state_dict = {
        'default': self.__default_func,
        'in_pard': self.__in_pard_func,
        'after_pard': self.__after_pard_func,
    }
    # style names 'heading 1' .. 'heading 9'
    self.__headings = ['heading %d' % num for num in range(1, 10)]
    # the digits '0' .. '9' as strings
    self.__allow_levels = [str(num) for num in range(10)]
    self.__style_name = ''
    self.__end_list = [
        'mi<mk<body-close',
        'mi<mk<par-in-fld',
        'cw<tb<cell______',
        'cw<tb<row-def___',
        'cw<tb<row_______',
        'mi<mk<sect-close',
        'mi<mk<sect-start',
        'mi<mk<header-beg',
        'mi<mk<header-end',
        'mi<mk<head___clo',
        'mi<mk<fldbk-end_',
        'mi<mk<close_cell',
        'mi<mk<footnt-ope',
        'mi<mk<foot___clo',
        'mi<mk<tabl-start',
        # 'mi<mk<sec-fd-beg',
    ]
    self.__end_lines = [
        'mi<tg<close_____<cell\n',
    ]
    self.__id_regex = re.compile(r'\<list-id\>(\d+)')
    self.__lv_regex = re.compile(r'\<list-level\>(\d+)')
    self.__found_appt = 0
    self.__line_num = 0
|
||||
|
||||
def __in_pard_func(self, line):
    """
    In a list, inside a paragraph definition: copy every line through
    unchanged, switching to 'after_pard' once the end-of-pard marker is
    seen.
    """
    if self.__token_info == 'mi<mk<pard-end__':
        self.__state = 'after_pard'
    self.__write_obj.write(line)
|
||||
|
||||
def __after_pard_func(self, line):
    """
    In a list, after a paragraph definition ended.  Decide whether the
    next paragraph definition ends the list, continues it, or starts a
    new one; a bigger block (section, cell, ...) ends all lists.  Lines
    that trigger no decision are buffered in self.__list_chunk.
    """
    is_pard_start = (self.__token_info == 'mi<tg<open-att__' and
                     line[17:37] == 'paragraph-definition')
    if is_pard_start:
        heading = self.__is_a_heading()
        id_match = re.search(self.__id_regex, line)
        if id_match and not heading:
            # paragraph definition with a list-id: continue/extend lists
            lv_match = re.search(self.__lv_regex, line)
            if lv_match:
                self.__level = lv_match.group(1)
            self.__list_after_par_def_func(line, id_match.group(1))
            self.__write_obj.write(line)
            self.__state = 'in_pard'
        elif heading:
            # a heading closes everything
            self.__left_indent = -1000
            self.__close_lists()
            self.__write_obj.write(self.__list_chunk)
            self.__list_chunk = ''
            self.__state = 'default'
            self.__write_obj.write(line)
        else:
            # normal paragraph with no list-id
            self.__close_lists()
            self.__write_obj.write(self.__list_chunk)
            self.__list_chunk = ''
            self.__write_obj.write(line)
            if len(self.__all_lists) == 0:
                self.__state = 'default'
            else:
                self.__state = 'in_pard'
    elif self.__token_info in self.__end_list:
        # a section/cell/etc. boundary ends all open lists
        self.__left_indent = -1000
        self.__close_lists()
        self.__write_obj.write(self.__list_chunk)
        self.__list_chunk = ''
        self.__state = 'default'
        self.__write_obj.write(line)
    else:
        self.__list_chunk += line
|
||||
|
||||
def __list_after_par_def_func(self, line, id):
    """
    A new paragraph definition with a list-id follows a finished one.

    A different list-id closes the open lists and starts a fresh one.
    The same id with a deeper indent nests a new list; otherwise the
    current item is closed and a new item begun.
    """
    if id != self.__all_lists[-1]['id']:
        # different list: close out and start over
        self.__close_lists()
        self.__write_obj.write(self.__list_chunk)
        self.__write_start_list(id)
        self.__list_chunk = ''
    else:
        previous_indent = self.__all_lists[-1]['left-indent']
        if self.__left_indent > previous_indent:
            # deeper indent: nest a new list without closing the old one
            self.__write_obj.write(self.__list_chunk)
            self.__write_start_list(id)
        else:
            # same level: next item of the same list
            self.__write_end_item()
            self.__write_obj.write(self.__list_chunk)
            self.__write_start_item()
        self.__list_chunk = ''
|
||||
|
||||
def __close_lists(self):
    """
    Close every open list whose indent is >= the current indent.

    Walk the open lists innermost-first, emitting end-item/end-list for
    each one whose indent is not smaller than the current indent, then
    drop the closed entries.
    """
    debug = self.__line_num < 25 and self.__found_appt
    if debug:
        sys.stderr.write('in closing out lists\n')
        sys.stderr.write('current_indent is "%s"\n' % self.__left_indent)
    current_indent = self.__left_indent
    self.__all_lists.reverse()
    closed = 0
    for open_list in self.__all_lists:
        list_indent = open_list.get('left-indent')
        if debug:
            sys.stderr.write('last indent is "%s"' % list_indent)
        if current_indent <= list_indent:
            self.__write_end_item()
            self.__write_end_list()
            closed += 1
    self.__all_lists = self.__all_lists[closed:]
    self.__all_lists.reverse()
|
||||
|
||||
def __write_end_list(self):
    """Emit the tokens that close one list."""
    self.__write_obj.write('mi<tg<close_____<list\n'
                           'mi<mk<list_close\n')
|
||||
|
||||
def __write_start_list(self, id):
    """
    Open a list: record its id and left indent in self.__all_lists and
    emit the start tokens.  The list type is looked up in the parsed
    list-table (self.__list_of_lists) when available; a bullet
    numbering-type maps to 'unordered', anything else to 'ordered'.
    Older RTF has no list table, so fall back on self.__list_type.
    """
    self.__all_lists.append({'left-indent': self.__left_indent, 'id': id})
    self.__write_obj.write('mi<mk<list_start\n')
    # bogus levels are sometimes written for empty paragraphs
    if unicode_type(self.__level) in self.__allow_levels:
        lev_num = self.__level
    else:
        lev_num = '0'
    self.__write_obj.write(
        'mi<tg<open-att__<list<list-id>%s<level>%s' % (id, lev_num))
    list_dict = {}
    if self.__list_of_lists:  # older RTF won't generate a list_of_lists
        index_of_list = self.__get_index_of_list(id)
        if index_of_list is not None:  # found a matching id
            curlist = self.__list_of_lists[index_of_list]
            list_dict = curlist[0]
            level = int(self.__level) + 1
            if level >= len(curlist):
                level = len(curlist) - 1
            level_dict = curlist[level][0]
            if level_dict.get('numbering-type') == 'bullet':
                list_type = 'unordered'
            else:
                list_type = 'ordered'
            self.__write_obj.write('<list-type>%s' % (list_type))
        else:  # no matching id
            self.__write_obj.write('<list-type>%s' % (self.__list_type))
    else:  # older RTF
        self.__write_obj.write('<list-type>%s' % (self.__list_type))
    # set self.__write_list_info to true to dump all the list-table
    # info inline instead of keeping it only in the table.
    if self.__list_of_lists and self.__write_list_info and list_dict:
        not_allow = ['list-id', ]
        for the_key in list_dict.keys():
            if the_key in not_allow:
                continue
            self.__write_obj.write('<%s>%s' % (the_key, list_dict[the_key]))
        for the_key in level_dict.keys():
            self.__write_obj.write('<%s>%s' % (the_key, level_dict[the_key]))
    self.__write_obj.write('\n')
    self.__write_obj.write('mi<mk<liststart_\n')
    self.__write_start_item()
|
||||
|
||||
def __get_index_of_list(self, id):
    """
    Return the index in self.__list_of_lists whose 'list-id' entry
    contains *id*; warn and return None when there is no match.  Some
    RTF uses a 0-indexed list, so id '0' is never matched.
    """
    if id == '0':
        return
    for the_index, one_list in enumerate(self.__list_of_lists):
        if id in one_list[0].get('list-id'):
            return the_index
    if self.__run_level > 0:
        sys.stderr.write('Module is make_lists.py\n'
            'Method is __get_index_of_list\n'
            'The main list does not appear to have a matching id for %s \n'
            % (id)
        )
|
||||
|
||||
def __write_start_item(self):
    """Emit the tokens that open one list item."""
    self.__write_obj.write('mi<mk<item_start\n'
                           'mi<tg<open______<item\n'
                           'mi<mk<itemstart_\n')
|
||||
|
||||
def __write_end_item(self):
    """Emit the tokens that close one list item."""
    self.__write_obj.write('mi<tg<item_end__\n'
                           'mi<tg<close_____<item\n'
                           'mi<tg<item__end_\n')
|
||||
|
||||
def __default_func(self, line):
    """
    Required:
        self, line
    Returns:
        Nothing
    Logic:
        Watch for the opening of a paragraph-definition. When one that
        is not a heading carries a list-id, record the (optional) level,
        switch to the 'in_pard' state and start a list. The line itself
        is always written through.
    """
    if (self.__token_info == 'mi<tg<open-att__'
            and line[17:37] == 'paragraph-definition'
            and not self.__is_a_heading()):
        id_match = re.search(self.__id_regex, line)
        if id_match:
            num = id_match.group(1)
            self.__state = 'in_pard'
            level_match = re.search(self.__lv_regex, line)
            if level_match:
                self.__level = level_match.group(1)
            self.__write_start_list(num)
    self.__write_obj.write(line)
||||
def __is_a_heading(self):
    """
    Return 1 when the current style is a heading that must not be
    turned into a list item, otherwise 0.

    A heading blocks list creation either because headings are being
    converted to sections, or because the user asked that headings
    never be treated as lists.
    """
    if self.__style_name not in self.__headings:
        return 0
    if self.__headings_to_sections or self.__no_headings_as_list:
        return 1
    return 0
||||
def __get_indent(self, line):
    """Track the current left indent whenever its marker token is seen."""
    if self.__token_info != 'mi<mk<left_inden':
        return
    self.__left_indent = float(line[17:-1])
||||
def __get_list_type(self, line):
    """Track the current list type; a bare 'item' counts as unordered."""
    if self.__token_info != 'mi<mk<list-type_':  # <ordered
        return
    list_type = line[17:-1]
    if list_type == 'item':
        list_type = "unordered"
    self.__list_type = list_type
||||
def __get_style_name(self, line):
    """Remember the most recently announced paragraph style name."""
    if self.__token_info != 'mi<mk<style-name':
        return
    self.__style_name = line[17:-1]
||||
def make_lists(self):
    """
    Rewrite the token file, wrapping list paragraphs in list/item tags.

    Required:
        nothing
    Returns:
        nothing -- the original file will be changed in place
    Logic:
        Read the token file one line at a time, updating the tracked
        indent, list type and style name, then dispatch the line to the
        handler for the current state. The result is written to a
        temporary file which finally replaces the original.
    """
    self.__initiate_values()
    read_obj = open_for_read(self.__file)
    self.__write_obj = open_for_write(self.__write_to)
    line_to_read = 1
    # NOTE(review): when readline() returns '' the loop body still runs
    # one final time with an empty line before the condition fails --
    # the state handlers are expected to tolerate that empty line.
    while line_to_read:
        line_to_read = read_obj.readline()
        line = line_to_read
        # the first 16 bytes of a token line identify the token
        self.__token_info = line[:16]
        self.__get_indent(line)
        self.__get_list_type(line)
        self.__get_style_name(line)
        # every state in __state_dict has a handler, so action is
        # assumed non-None here
        action = self.__state_dict.get(self.__state)
        action(line)
    read_obj.close()
    self.__write_obj.close()
    copy_obj = copy.Copy(bug_handler=self.__bug_handler)
    if self.__copy:
        copy_obj.copy_file(self.__write_to, "make_lists.data")
    copy_obj.rename(self.__write_to, self.__file)
    os.remove(self.__write_to)
||||
146
ebook_converter/ebooks/rtf2xml/old_rtf.py
Normal file
146
ebook_converter/ebooks/rtf2xml/old_rtf.py
Normal file
@@ -0,0 +1,146 @@
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import sys
|
||||
|
||||
from polyglot.builtins import unicode_type
|
||||
|
||||
from . import open_for_read
|
||||
|
||||
|
||||
class OldRtf:
    """
    Check to see if the RTF is an older version.

    Logic:
        If allowable control words/properties happen in text without being
        enclosed in brackets the file will be considered old rtf.
    """

    def __init__(self, in_file,
            bug_handler,
            run_level,
            ):
        """
        Required:
            'in_file' -- the tokenized file to examine
            'bug_handler' -- exception class raised on internal errors
            'run_level' -- verbosity level; higher values emit more
                diagnostics to stderr
        Returns:
            nothing
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__run_level = run_level
        # Inline control words that, when found in the body at the same
        # bracket depth as the body itself, mark the file as old RTF.
        self.__allowable = [
            'annotation',
            'blue______',
            'bold______',
            'caps______',
            'char-style',
            'dbl-strike',
            'emboss____',
            'engrave___',
            'font-color',
            'font-down_',
            'font-size_',
            'font-style',
            'font-up___',
            'footnot-mk',
            'green_____',
            'hidden____',
            'italics___',
            'outline___',
            'red_______',
            'shadow____',
            'small-caps',
            'strike-thr',
            'subscript_',
            'superscrip',
            'underlined',
        ]
        # dispatch: current state -> per-line handler
        self.__action_dict = {
            'before_body': self.__before_body_func,
            'in_body': self.__check_tokens_func,
            'after_pard': self.__after_pard_func,
        }

    def __initiate_values(self):
        # reset the per-run parsing state
        self.__previous_token = ''
        self.__state = 'before_body'
        self.__found_new = 0
        self.__ob_group = 0  # current open-bracket nesting depth

    def __check_tokens_func(self, line):
        """
        In the body: an allowable inline control word at the body's own
        bracket depth is an old-RTF construction; deeper occurrences
        merely count as evidence of new RTF. A \\pard switches to the
        'after_pard' state so its trailing control words are skipped.
        """
        if self.__inline_info in self.__allowable:
            if self.__ob_group == self.__base_ob_count:
                return 'old_rtf'
            else:
                self.__found_new += 1
        elif self.__token_info == 'cw<pf<par-def___':
            self.__state = 'after_pard'

    def __before_body_func(self, line):
        # Wait for the body to open and remember its bracket depth for
        # the depth comparison in __check_tokens_func.
        if self.__token_info == 'mi<mk<body-open_':
            self.__state = 'in_body'
            self.__base_ob_count = self.__ob_group

    def __after_pard_func(self, line):
        # Skip the control words that directly follow a \pard; the first
        # non-control-word line resumes normal body checking.
        if line[0:2] != 'cw':
            self.__state = 'in_body'

    def check_if_old_rtf(self):
        """
        Requires:
            nothing
        Returns:
            True if file is older RTF
            False if file is newer RTF
        """
        self.__initiate_values()
        line_num = 0
        with open_for_read(self.__file) as read_obj:
            for line in read_obj:
                line_num += 1
                self.__token_info = line[:16]
                if self.__token_info == 'mi<mk<body-close':
                    return False
                if self.__token_info == 'ob<nu<open-brack':
                    self.__ob_group += 1
                    self.__ob_count = line[-5:-1]
                if self.__token_info == 'cb<nu<clos-brack':
                    self.__ob_group -= 1
                    self.__cb_count = line[-5:-1]
                self.__inline_info = line[6:16]
                # NOTE(review): no handler ever sets 'after_body', so
                # this early exit looks unreachable -- confirm before
                # removing.
                if self.__state == 'after_body':
                    return False
                action = self.__action_dict.get(self.__state)
                if action is None:
                    # Fix: the original fell through and called None.
                    # Report (narrowly -- never a bare except) and skip
                    # the line instead of crashing.
                    try:
                        sys.stderr.write('No action for this state!\n')
                    except OSError:
                        pass
                    continue
                result = action(line)
                if result == 'new_rtf':
                    return False
                elif result == 'old_rtf':
                    if self.__run_level > 3:
                        sys.stderr.write(
                            'Old rtf construction %s (bracket %s, line %s)\n' % (
                                self.__inline_info, unicode_type(self.__ob_group), line_num)
                        )
                    return True
                self.__previous_token = line[6:16]
        return False
|
||||
121
ebook_converter/ebooks/rtf2xml/output.py
Normal file
121
ebook_converter/ebooks/rtf2xml/output.py
Normal file
@@ -0,0 +1,121 @@
|
||||
from __future__ import unicode_literals, absolute_import, print_function, division
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import sys, os
|
||||
from polyglot.builtins import raw_input
|
||||
from . import open_for_read, open_for_write
|
||||
# , codecs
|
||||
|
||||
|
||||
class Output:
    """
    Write the converted XML to a directory, a named file, or stdout.
    """

    def __init__(self,
            file,
            orig_file,
            output_dir=None,
            out_file=None,
            no_ask=True
            ):
        """
        Required:
            'file' -- xml file ready to output
            'orig_file' -- original rtf file; its base name is used to
                derive the output name when writing into a directory
        Optional:
            'output_dir' -- directory to write the result into
            'out_file' -- the file to output to
            'no_ask' -- when False, ask before overwriting an existing
                file in the output directory
        Returns:
            nothing
        """
        self.__file = file
        self.__orig_file = orig_file
        self.__output_dir = output_dir
        self.__no_ask = no_ask
        self.__out_file = out_file

    def output(self):
        """
        Required:
            nothing
        Returns:
            nothing
        Logic:
            Precedence: an output directory wins, then an explicit
            output file, otherwise the result goes to standard output.
        """
        if self.__output_dir:
            self.__output_to_dir_func()
        elif self.__out_file:
            self.__output_to_file_func()
        else:
            self.__output_to_standard_func()

    def __output_to_dir_func(self):
        """
        Requires:
            nothing
        Returns:
            nothing
        Logic:
            Build the destination path inside the output directory
            (<orig-base>.xml, or the explicit out_file). Optionally ask
            before overwriting; on refusal fall back to stdout.
        """
        base_name = os.path.basename(self.__orig_file)
        base_name, ext = os.path.splitext(base_name)
        output_file = os.path.join(self.__output_dir, '%s.xml' % base_name)
        # change if user wants to output to a specific file
        if self.__out_file:
            output_file = os.path.join(self.__output_dir, self.__out_file)
        user_response = 'o'
        if os.path.isfile(output_file) and not self.__no_ask:
            msg = 'Do you want to overwrite %s?\n' % output_file
            msg += ('Type "o" to overwrite.\n'
                    'Type any other key to print to standard output.\n')
            sys.stderr.write(msg)
            user_response = raw_input()
        if user_response == 'o':
            with open_for_read(self.__file) as read_obj:
                # Fix: was open_for_write(self.output_file) -- an
                # attribute that does not exist (AttributeError at
                # runtime); the local variable is the intended target.
                with open_for_write(output_file) as write_obj:
                    for line in read_obj:
                        write_obj.write(line)
        else:
            self.__output_to_standard_func()

    def __output_to_file_func(self):
        """
        Required:
            nothing
        Returns:
            nothing
        Logic:
            Copy the converted XML line by line to the requested file.
        """
        with open_for_read(self.__file) as read_obj:
            with open_for_write(self.__out_file) as write_obj:
                for line in read_obj:
                    write_obj.write(line)

    def __output_to_standard_func(self):
        """
        Required:
            nothing
        Returns:
            nothing
        Logic:
            Copy the converted XML line by line to standard output.
        """
        with open_for_read(self.__file) as read_obj:
            for line in read_obj:
                sys.stdout.write(line)
|
||||
209
ebook_converter/ebooks/rtf2xml/override_table.py
Normal file
209
ebook_converter/ebooks/rtf2xml/override_table.py
Normal file
@@ -0,0 +1,209 @@
|
||||
from __future__ import unicode_literals, absolute_import, print_function, division
|
||||
from __future__ import print_function
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
|
||||
|
||||
class OverrideTable:
    """
    Parse a line of text to make the override table. Return a string
    (which will convert to XML) and the dictionary containing all the
    information about the lists. This dictionary is the result of the
    dictionary that is first passed to this module. This module
    modifies the dictionary, assigning list numbers to each list.
    """

    def __init__(
            self,
            list_of_lists,
            run_level=1,
            ):
        """
        Required:
            'list_of_lists' -- list-table data built by list_table.py;
                each entry starts with an info dictionary
        Optional:
            'run_level' -- verbosity/strictness level
        Returns:
            nothing
        """
        self.__list_of_lists = list_of_lists
        # Fix: set run_level before any other initialisation so helper
        # state never observes it unset.
        self.__run_level = run_level
        self.__initiate_values()

    def __initiate_values(self):
        # per-run parsing state
        self.__override_table_final = ''
        self.__state = 'default'
        self.__override_list = []
        # state -> handler
        self.__state_dict = {
            'default'   : self.__default_func,
            'override'  : self.__override_func,
            'unsure_ob' : self.__after_bracket_func,
        }
        # tokens collected inside an {\listoverride ...} group
        self.__override_dict = {
            'cw<ls<lis-tbl-id' : 'list-table-id',
            'cw<ls<list-id___' : 'list-id',
        }

    def __override_func(self, line):
        """
        Requires:
            line -- line to parse
        Returns:
            nothing
        Logic:
            The group {\\override has been found.
            Check for the end of the group.
            Otherwise, add appropriate tokens to the override dictionary.
        """
        if self.__token_info == 'cb<nu<clos-brack' and\
                self.__cb_count == self.__override_ob_count:
            self.__state = 'default'
            self.__parse_override_dict()
        else:
            att = self.__override_dict.get(self.__token_info)
            if att:
                value = line[20:]
                self.__override_list[-1][att] = value

    def __parse_override_dict(self):
        """
        Requires:
            nothing
        Returns:
            nothing
        Logic:
            Match this override's list-table-id against the entries of
            self.__list_of_lists (built by list_table.py) and, when a
            match is found, append the override's list-id to that
            entry's 'list-id' list:
                [[{list-id:[HERE!],[{}]]
            One list in the preamble table can apply to multiple lists
            in the body, hence the list of ids.
        """
        override_dict = self.__override_list[-1]
        list_id = override_dict.get('list-id')
        # Fix: was `self.__level`, an attribute that is never assigned
        # anywhere in this class (AttributeError); run_level is meant.
        if list_id is None and self.__run_level > 3:
            msg = 'This override does not appear to have a list-id\n'
            # NOTE(review): self.__bug_handler is never assigned in this
            # class; raising here would hit AttributeError -- confirm
            # whether a handler should be passed to __init__.
            raise self.__bug_handler(msg)
        current_table_id = override_dict.get('list-table-id')
        if current_table_id is None and self.__run_level > 3:
            msg = 'This override does not appear to have a list-table-id\n'
            raise self.__bug_handler(msg)
        counter = 0
        for list_entry in self.__list_of_lists:
            info_dict = list_entry[0]
            old_table_id = info_dict.get('list-table-id')
            if old_table_id == current_table_id:
                self.__list_of_lists[counter][0]['list-id'].append(list_id)
                break
            counter += 1

    def __parse_lines(self, line):
        """
        Requires:
            line -- line to parse
        Returns:
            nothing
        Logic:
            Break the line into tokens by splitting it on the newline.
            Track bracket counts/depth and call on the method according
            to the state.
        """
        lines = line.split('\n')
        self.__ob_count = 0
        self.__ob_group = 0
        for line in lines:
            self.__token_info = line[:16]
            if self.__token_info == 'ob<nu<open-brack':
                self.__ob_count = line[-4:]
                self.__ob_group += 1
            if self.__token_info == 'cb<nu<clos-brack':
                self.__cb_count = line[-4:]
                self.__ob_group -= 1
            action = self.__state_dict.get(self.__state)
            if action is None:
                print(self.__state)
            action(line)
        self.__write_final_string()
        # self.__add_to_final_line()

    def __default_func(self, line):
        """
        Requires:
            line -- line to parse
        Return:
            nothing
        Logic:
            Look for an open bracket and change states when found.
        """
        if self.__token_info == 'ob<nu<open-brack':
            self.__state = 'unsure_ob'

    def __after_bracket_func(self, line):
        """
        Requires:
            line -- line to parse
        Returns:
            nothing
        Logic:
            The last token was an open bracket. You need to determine
            the group based on the token after.
            WARNING: this could cause problems. If no group is found, the
            state will remain unsure_ob, which means no other text will be
            parsed. I should do states by a list and simply pop this
            unsure_ob state to get the previous state.
        """
        if self.__token_info == 'cw<ls<lis-overid':
            self.__state = 'override'
            self.__override_ob_count = self.__ob_count
            the_dict = {}
            self.__override_list.append(the_dict)
        elif self.__run_level > 3:
            msg = 'No matching token after open bracket\n'
            msg += 'token is "%s\n"' % (line)
            # NOTE(review): self.__bug_handler is never assigned; see
            # __parse_override_dict.
            raise self.__bug_handler(msg)

    def __write_final_string(self):
        """
        Requires:
            nothing
        Returns:
            nothing
        Logic:
            First write out the override-table tags and markers, then an
            empty "override-list" tag for each collected dictionary,
            carrying its attributes and values.
        """
        self.__override_table_final = 'mi<mk<over_beg_\n'
        # NOTE(review): appending the current value of
        # __override_table_final to itself duplicates the
        # 'mi<mk<over_beg_' marker in the output; downstream apparently
        # tolerates this -- confirm before simplifying.
        self.__override_table_final += 'mi<tg<open______<override-table\n' + \
            'mi<mk<overbeg__\n' + self.__override_table_final
        for the_dict in self.__override_list:
            self.__override_table_final += 'mi<tg<empty-att_<override-list'
            the_keys = the_dict.keys()
            for the_key in the_keys:
                self.__override_table_final += \
                    '<%s>%s' % (the_key, the_dict[the_key])
            self.__override_table_final += '\n'
        self.__override_table_final += '\n'
        self.__override_table_final += \
            'mi<mk<overri-end\n' + 'mi<tg<close_____<override-table\n'
        self.__override_table_final += 'mi<mk<overribend_\n'

    def parse_override_table(self, line):
        """
        Requires:
            line -- line with the list-override definitions in it
        Returns:
            A string that will be converted to XML, and the (updated)
            list describing all the properties of the RTF lists.
        Logic:
            Tokenize the input and feed each token through the state
            machine, then build the final string.
        """
        self.__parse_lines(line)
        return self.__override_table_final, self.__list_of_lists
|
||||
763
ebook_converter/ebooks/rtf2xml/paragraph_def.py
Normal file
763
ebook_converter/ebooks/rtf2xml/paragraph_def.py
Normal file
@@ -0,0 +1,763 @@
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import sys, os
|
||||
|
||||
from calibre.ebooks.rtf2xml import copy, border_parse
|
||||
from calibre.ptempfile import better_mktemp
|
||||
from polyglot.builtins import unicode_type
|
||||
|
||||
from . import open_for_read, open_for_write
|
||||
|
||||
|
||||
class ParagraphDef:
|
||||
"""
|
||||
=================
|
||||
Purpose
|
||||
=================
|
||||
Write paragraph definition tags.
|
||||
States:
|
||||
1. before_1st_para_def.
|
||||
Before any para_def token is found. This means all the text in the preamble.
|
||||
Look for the token 'cw<pf<par-def___'. This will changet the state to collect_tokens.
|
||||
2. collect_tokens.
|
||||
Found a paragraph_def. Need to get all tokens.
|
||||
Change with start of a paragrph ('mi<mk<para-start'). State then becomes
|
||||
in_paragraphs
|
||||
If another paragraph definition is found, the state does not change.
|
||||
But the dictionary is reset.
|
||||
3. in_paragraphs
|
||||
State changes when 'mi<mk<para-end__', or end of paragraph is found.
|
||||
State then becomes 'self.__state = 'after_para_end'
|
||||
4. after_para_end
|
||||
If 'mi<mk<para-start' (the start of a paragraph) or 'mi<mk<para-end__' (the end of a paragraph--must be empty paragraph?) are found:
|
||||
state changes to 'in_paragraphs'
|
||||
If 'cw<pf<par-def___' (paragraph_definition) is found:
|
||||
state changes to collect_tokens
|
||||
if 'mi<mk<body-close', 'mi<mk<par-in-fld',
|
||||
'cw<tb<cell______','cw<tb<row-def___','cw<tb<row_______',
|
||||
'mi<mk<sect-close', 'mi<mk<header-beg', 'mi<mk<header-end'
|
||||
are found. (All these tokens mark the start of a bigger element. para_def must
|
||||
be closed:
|
||||
state changes to 'after_para_def'
|
||||
5. after_para_def
|
||||
'mi<mk<para-start' changes state to in_paragraphs
|
||||
if another paragraph_def is found, the state changes to collect_tokens.
|
||||
"""
|
||||
|
||||
def __init__(self,
        in_file,
        bug_handler,
        default_font,
        copy=None,
        run_level=1,):
    """
    Required:
        'in_file' -- file to parse
        'bug_handler' -- exception class raised on internal errors
        'default_font' -- document default font
    Optional:
        'copy' -- whether to make a copy of result for debugging
        'run_level' -- verbosity/strictness level
    Returns:
        nothing
    """
    self.__run_level = run_level
    self.__bug_handler = bug_handler
    self.__default_font = default_font
    self.__copy = copy
    self.__file = in_file
    # temporary output file; replaces the original when done
    self.__write_to = better_mktemp()
|
||||
def __initiate_values(self):
    """
    Initiate all values: translation tables for control-word tokens,
    tab handling dispatch, the border parser, and the state machine's
    dispatch dictionaries.
    """
    # Dictionary needed to convert shortened style names to readable names
    self.__token_dict={
        # paragraph formatting => pf
        'par-end___' : 'para',
        'par-def___' : 'paragraph-definition',
        'keep-w-nex' : 'keep-with-next',
        'widow-cntl' : 'widow-control',
        'adjust-rgt' : 'adjust-right',
        'language__' : 'language',
        'right-inde' : 'right-indent',
        'fir-ln-ind' : 'first-line-indent',
        'left-inden' : 'left-indent',
        'space-befo' : 'space-before',
        'space-afte' : 'space-after',
        'line-space' : 'line-spacing',
        'default-ta' : 'default-tab',
        'align_____' : 'align',
        'widow-cntr' : 'widow-control',
        # stylesheet = > ss
        'style-shet' : 'stylesheet',
        'based-on__' : 'based-on-style',
        'next-style' : 'next-style',
        'char-style' : 'character-style',
        # this is changed to get a nice attribute
        'para-style' : 'name',
        # graphics => gr
        'picture___' : 'pict',
        'obj-class_' : 'obj_class',
        'mac-pic___' : 'mac-pict',
        # section => sc
        'section___' : 'section-new',
        'sect-defin' : 'section-reset',
        'sect-note_' : 'endnotes-in-section',
        # list=> ls
        'list-text_' : 'list-text',
        # this line must be wrong because it duplicates an earlier one
        # (harmless: identical key and value, the later entry wins)
        'list-text_' : 'list-text',
        'list______' : 'list',
        'list-lev-d' : 'list-level-definition',
        'list-cardi' : 'list-cardinal-numbering',
        'list-decim' : 'list-decimal-numbering',
        'list-up-al' : 'list-uppercase-alphabetic-numbering',
        'list-up-ro' : 'list-uppercae-roman-numbering',
        'list-ord__' : 'list-ordinal-numbering',
        'list-ordte' : 'list-ordinal-text-numbering',
        'list-bulli' : 'list-bullet',
        'list-simpi' : 'list-simple',
        'list-conti' : 'list-continue',
        'list-hang_' : 'list-hang',
        # 'list-tebef' : 'list-text-before',
        # 'list-level' : 'level',
        'list-id___' : 'list-id',
        'list-start' : 'list-start',
        'nest-level' : 'nest-level',
        # duplicate
        'list-level' : 'list-level',
        # notes => nt
        'footnote__' : 'footnote',
        'type______' : 'type',
        # anchor => an
        'toc_______' : 'anchor-toc',
        'book-mk-st' : 'bookmark-start',
        'book-mk-en' : 'bookmark-end',
        'index-mark' : 'anchor-index',
        'place_____' : 'place',
        # field => fd
        'field_____' : 'field',
        'field-inst' : 'field-instruction',
        'field-rslt' : 'field-result',
        'datafield_' : 'data-field',
        # info-tables => it
        'font-table' : 'font-table',
        'colr-table' : 'color-table',
        'lovr-table' : 'list-override-table',
        'listtable_' : 'list-table',
        'revi-table' : 'revision-table',
        # character info => ci
        'hidden____' : 'hidden',
        'italics___' : 'italics',
        'bold______' : 'bold',
        'strike-thr' : 'strike-through',
        'shadow____' : 'shadow',
        'outline___' : 'outline',
        'small-caps' : 'small-caps',
        'caps______' : 'caps',
        'dbl-strike' : 'double-strike-through',
        'emboss____' : 'emboss',
        'engrave___' : 'engrave',
        'subscript_' : 'subscript',
        'superscrip' : 'superscipt',
        'font-style' : 'font-style',
        'font-color' : 'font-color',
        'font-size_' : 'font-size',
        'font-up___' : 'superscript',
        'font-down_' : 'subscript',
        'red_______' : 'red',
        'blue______' : 'blue',
        'green_____' : 'green',
        # table => tb
        'row-def___' : 'row-definition',
        'cell______' : 'cell',
        'row_______' : 'row',
        'in-table__' : 'in-table',
        'columns___' : 'columns',
        'row-pos-le' : 'row-position-left',
        'cell-posit' : 'cell-position',
        # preamble => pr
        # underline
        'underlined' : 'underlined',
        # border => bd
        'bor-t-r-hi' : 'border-table-row-horizontal-inside',
        'bor-t-r-vi' : 'border-table-row-vertical-inside',
        'bor-t-r-to' : 'border-table-row-top',
        'bor-t-r-le' : 'border-table-row-left',
        'bor-t-r-bo' : 'border-table-row-bottom',
        'bor-t-r-ri' : 'border-table-row-right',
        'bor-cel-bo' : 'border-cell-bottom',
        'bor-cel-to' : 'border-cell-top',
        'bor-cel-le' : 'border-cell-left',
        'bor-cel-ri' : 'border-cell-right',
        # 'bor-par-bo' : 'border-paragraph-bottom',
        'bor-par-to' : 'border-paragraph-top',
        'bor-par-le' : 'border-paragraph-left',
        'bor-par-ri' : 'border-paragraph-right',
        'bor-par-bo' : 'border-paragraph-box',
        'bor-for-ev' : 'border-for-every-paragraph',
        'bor-outsid' : 'border-outisde',
        'bor-none__' : 'border',
        # border type => bt
        'bdr-single' : 'single',
        'bdr-doubtb' : 'double-thickness-border',
        'bdr-shadow' : 'shadowed-border',
        'bdr-double' : 'double-border',
        'bdr-dotted' : 'dotted-border',
        'bdr-dashed' : 'dashed',
        'bdr-hair__' : 'hairline',
        'bdr-inset_' : 'inset',
        'bdr-das-sm' : 'dash-small',
        'bdr-dot-sm' : 'dot-dash',
        'bdr-dot-do' : 'dot-dot-dash',
        'bdr-outset' : 'outset',
        'bdr-trippl' : 'tripple',
        'bdr-thsm__' : 'thick-thin-small',
        'bdr-htsm__' : 'thin-thick-small',
        'bdr-hthsm_' : 'thin-thick-thin-small',
        'bdr-thm__' : 'thick-thin-medium',
        'bdr-htm__' : 'thin-thick-medium',
        'bdr-hthm_' : 'thin-thick-thin-medium',
        'bdr-thl__' : 'thick-thin-large',
        'bdr-hthl_' : 'think-thick-think-large',
        'bdr-wavy_' : 'wavy',
        'bdr-d-wav' : 'double-wavy',
        'bdr-strip' : 'striped',
        'bdr-embos' : 'emboss',
        'bdr-engra' : 'engrave',
        'bdr-frame' : 'frame',
        'bdr-li-wid' : 'line-width',
    }
    # tab-related control words -> handler methods
    self.__tabs_dict = {
        'cw<pf<tab-stop__' : self.__tab_stop_func,
        'cw<pf<tab-center' : self.__tab_type_func,
        'cw<pf<tab-right_' : self.__tab_type_func,
        'cw<pf<tab-dec___' : self.__tab_type_func,
        'cw<pf<leader-dot' : self.__tab_leader_func,
        'cw<pf<leader-hyp' : self.__tab_leader_func,
        'cw<pf<leader-und' : self.__tab_leader_func,
        'cw<pf<tab-bar-st' : self.__tab_bar_func,
    }
    # tab-type / leader tokens -> readable names
    self.__tab_type_dict = {
        'cw<pf<tab-center' : 'center',
        'cw<pf<tab-right_' : 'right',
        'cw<pf<tab-dec___' : 'decimal',
        'cw<pf<leader-dot' : 'leader-dot',
        'cw<pf<leader-hyp' : 'leader-hyphen',
        'cw<pf<leader-und' : 'leader-underline',
    }
    # delegate border control-word parsing
    self.__border_obj = border_parse.BorderParse()
    self.__style_num_strings = []
    self.__body_style_strings = []
    self.__state = 'before_1st_para_def'
    # attributes/values collected for the current paragraph definition
    self.__att_val_dict = {}
    self.__start_marker = 'mi<mk<pard-start\n'  # outside para tags
    self.__start2_marker = 'mi<mk<pardstart_\n'  # inside para tags
    self.__end2_marker = 'mi<mk<pardend___\n'  # inside para tags
    self.__end_marker = 'mi<mk<pard-end__\n'  # outside para tags
    self.__text_string = ''
    # state machine: state -> per-line handler
    self.__state_dict = {
        'before_1st_para_def' : self.__before_1st_para_def_func,
        'collect_tokens' : self.__collect_tokens_func,
        'after_para_def' : self.__after_para_def_func,
        'in_paragraphs' : self.__in_paragraphs_func,
        'after_para_end' : self.__after_para_end_func,
    }
    # tokens that act while collecting a paragraph definition
    self.__collect_tokens_dict = {
        'mi<mk<para-start' : self.__end_para_def_func,
        'cw<pf<par-def___' : self.__para_def_in_para_def_func,
        'cw<tb<cell______' : self.__empty_table_element_func,
        'cw<tb<row_______' : self.__empty_table_element_func,
    }
    # tokens that act after a definition but before any paragraph
    self.__after_para_def_dict = {
        'mi<mk<para-start' : self.__start_para_after_def_func,
        'cw<pf<par-def___' : self.__found_para_def_func,
        'cw<tb<cell______' : self.__empty_table_element_func,
        'cw<tb<row_______' : self.__empty_table_element_func,
    }
    # tokens that act while inside paragraphs
    self.__in_paragraphs_dict = {
        'mi<mk<para-end__' : self.__found_para_end_func,
    }
    # tokens that act after a paragraph has ended; most close the
    # open paragraph-definition because a larger element starts
    self.__after_para_end_dict = {
        'mi<mk<para-start' : self.__continue_block_func,
        'mi<mk<para-end__' : self.__continue_block_func,
        'cw<pf<par-def___' : self.__new_para_def_func,
        'mi<mk<body-close' : self.__stop_block_func,
        'mi<mk<par-in-fld' : self.__stop_block_func,
        'cw<tb<cell______' : self.__stop_block_func,
        'cw<tb<row-def___' : self.__stop_block_func,
        'cw<tb<row_______' : self.__stop_block_func,
        'mi<mk<sect-close' : self.__stop_block_func,
        'mi<mk<sect-start' : self.__stop_block_func,
        'mi<mk<header-beg' : self.__stop_block_func,
        'mi<mk<header-end' : self.__stop_block_func,
        'mi<mk<head___clo' : self.__stop_block_func,
        'mi<mk<fldbk-end_' : self.__stop_block_func,
        'mi<mk<lst-txbeg_' : self.__stop_block_func,
    }
|
||||
def __before_1st_para_def_func(self, line):
    """
    Required:
        line -- line to parse
    Returns:
        nothing
    Logic:
        Pass every preamble line straight through until the first
        paragraph-definition token appears, then start collecting.
    """
    # cw<pf<par-def___<nu<true
    if self.__token_info != 'cw<pf<par-def___':
        self.__write_obj.write(line)
    else:
        self.__found_para_def_func()
|
||||
def __found_para_def_func(self):
    """Enter token-collection mode for a fresh paragraph definition."""
    self.__state = 'collect_tokens'
    # the attribute dictionary must start over with default values for
    # each new definition
    self.__reset_dict()
|
||||
def __collect_tokens_func(self, line):
    """
    Required:
        line --line to parse
    Returns:
        nothing
    Logic:
        Check the collect_tokens_dict for either the beginning of a
        paragraph or a new paragraph definition and take the action
        found in the dict.
        Otherwise, if the token is not a control word ('cw' prefix),
        write it out and change the state to after_para_def.
        Otherwise, border control words ('cw<bd') are handed to the
        border parser, tab control words to the tabs dict, and any
        remaining control word that appears in __token_dict is stored
        as an attribute/value pair.
    """
    action = self.__collect_tokens_dict.get(self.__token_info)
    if action:
        action(line)
    elif line[0:2] != 'cw':
        # not a control word: the paragraph definition is over
        self.__write_obj.write(line)
        self.__state = 'after_para_def'
    elif line[0:5] == 'cw<bd':
        # border control word
        self.__parse_border(line)
    else:
        action = self.__tabs_dict.get(self.__token_info)
        if action:
            action(line)
        else:
            # line[6:16] is the control-word name field of the token;
            # line[20:-1] is its value with the trailing newline dropped
            token = self.__token_dict.get(line[6:16])
            if token:
                self.__att_val_dict[token] = line[20:-1]
|
||||
|
||||
def __tab_stop_func(self, line):
    """
    Record one tab stop as '<type>:<position>;' in the 'tabs'
    attribute, then reset the pending tab type to the default
    ('left').
    """
    position = line[20:-1]
    self.__att_val_dict['tabs'] += '%s:%s;' % (self.__tab_type, position)
    self.__tab_type = 'left'
|
||||
|
||||
def __tab_type_func(self, line):
    """
    Remember the type of the next tab stop (left, right, center, ...).

    Required:
        line -- line to parse
    Returns:
        nothing
    Logic:
        The token encodes the tab type; look it up in __tab_type_dict
        and store it in self.__tab_type so the following tab-stop
        token can use it.  An unknown token is reported as a bug at
        high run levels.
    """
    # Renamed the local from ``type`` (shadowed the builtin) to
    # ``tab_type``.
    tab_type = self.__tab_type_dict.get(self.__token_info)
    if tab_type is not None:
        self.__tab_type = tab_type
    else:
        if self.__run_level > 3:
            msg = 'no entry for %s\n' % self.__token_info
            raise self.__bug_handler(msg)
|
||||
|
||||
def __tab_leader_func(self, line):
    """
    Append a tab-leader marker ('<leader>^') to the 'tabs' attribute.
    """
    # NOTE(review): the leader is looked up in __tab_type_dict -- the
    # same dict used by __tab_type_func for tab *types*.  Presumably
    # that dict also maps leader tokens; verify against
    # __initiate_values.
    leader = self.__tab_type_dict.get(self.__token_info)
    if leader is not None:
        self.__att_val_dict['tabs'] += '%s^' % leader
    else:
        if self.__run_level > 3:
            msg = 'no entry for %s\n' % self.__token_info
            raise self.__bug_handler(msg)
|
||||
|
||||
def __tab_bar_func(self, line):
    """
    Record a bar tab stop as 'bar:<position>;' in the 'tabs'
    attribute and reset the pending tab type to the default ('left').
    """
    position = line[20:-1]
    self.__att_val_dict['tabs'] += 'bar:%s;' % position
    self.__tab_type = 'left'
|
||||
|
||||
def __parse_border(self, line):
    """
    Requires:
        line --line to parse
    Returns:
        nothing (updates self.__att_val_dict)
    Logic:
        Delegate to the border parser object, which returns a
        dictionary of attribute/value pairs for a border line, and
        merge those pairs into the collected paragraph attributes.
    """
    border_dict = self.__border_obj.parse_border(line)
    self.__att_val_dict.update(border_dict)
|
||||
|
||||
def __para_def_in_para_def_func(self, line):
    """
    Requires:
        line --line to parse
    Returns:
        nothing
    Logic:
        A \\pard was found while already collecting tokens.  Reset the
        dictionary to defaults and keep collecting; nothing is written
        out.
    """
    # Change this
    self.__state = 'collect_tokens'
    self.__reset_dict()
|
||||
|
||||
def __end_para_def_func(self, line):
    """
    Requires:
        line -- the line announcing the start of a paragraph
    Returns:
        nothing
    Logic:
        The previous state was collect_tokens and the start of a
        paragraph has arrived: emit the opening
        paragraph-definition tags, switch to 'in_paragraphs', and
        echo the line.
    """
    self.__write_para_def_beg()
    self.__state = 'in_paragraphs'
    self.__write_obj.write(line)
|
||||
|
||||
def __start_para_after_def_func(self, line):
    """
    Requires:
        line -- the line announcing the start of a paragraph
    Returns:
        nothing
    Logic:
        The state was after_para_def and a paragraph has started:
        emit the opening paragraph-definition tags, switch to
        'in_paragraphs', and echo the line.  (Intentionally identical
        in effect to __end_para_def_func.)
    """
    self.__write_para_def_beg()
    self.__state = 'in_paragraphs'
    self.__write_obj.write(line)
|
||||
|
||||
def __after_para_def_func(self, line):
    """
    Requires:
        line -- line to parse
    Returns:
        nothing
    Logic:
        Dispatch on the token via __after_para_def_dict; a new \\pard
        is special-cased before the dict action runs; anything else is
        copied through.
    """
    action = self.__after_para_def_dict.get(self.__token_info)
    # NOTE(review): __after_para_def_dict also maps 'cw<pf<par-def___'
    # to __found_para_def_func, which takes no ``line`` argument --
    # calling it via ``action(line)`` would raise TypeError.  The
    # explicit check below shadows that entry, so the latent mismatch
    # is never hit; keep the check ahead of the dict dispatch.
    if self.__token_info == 'cw<pf<par-def___':
        self.__found_para_def_func()
    elif action:
        action(line)
    else:
        self.__write_obj.write(line)
|
||||
|
||||
def __in_paragraphs_func(self, line):
    """
    Requires:
        line --current line
    Returns:
        nothing
    Logic:
        Inside a paragraph: dispatch on the token (looking for the
        end of the paragraph); anything unrecognised is copied
        straight through to the output.
    """
    handler = self.__in_paragraphs_dict.get(self.__token_info)
    if handler is None:
        self.__write_obj.write(line)
    else:
        handler(line)
|
||||
|
||||
def __found_para_end_func(self, line):
    """
    Requires:
        line -- line to print out
    Returns:
        Nothing
    Logic:
        End of a paragraph while in 'in_paragraphs': echo the line and
        switch the state machine to 'after_para_end'.
    """
    self.__write_obj.write(line)
    self.__state = 'after_para_end'
|
||||
|
||||
def __after_para_end_func(self, line):
    """
    Requires:
        line -- line to output
    Returns:
        nothing
    Logic:
        After a paragraph has ended, lines are buffered in
        self.__text_string while waiting to see whether the old
        paragraph definition must be closed.  A new paragraph
        definition closes the old one and restarts token collection; a
        larger block element (cell, row, field-block, section, ...)
        closes the definition and flushes the buffer; a new paragraph
        start flushes the buffer without closing the definition.  The
        dict actions handle each case.
    """
    # every line is buffered first; the dispatched action decides when
    # (and after what) the buffer is flushed
    self.__text_string += line
    action = self.__after_para_end_dict.get(self.__token_info)
    if action:
        action(line)
|
||||
|
||||
def __continue_block_func(self, line):
    """
    Requires:
        line --line to print out
    Returns:
        Nothing
    Logic:
        A paragraph starts right after the previous one ended, so the
        old paragraph definition stays open.  Flush the buffered text
        (which already contains ``line``), clear the buffer, and go
        back to the 'in_paragraphs' state.
    """
    self.__write_obj.write(self.__text_string)
    self.__text_string = ''
    self.__state = 'in_paragraphs'
|
||||
# found a new paragraph definition after an end of a paragraph
|
||||
|
||||
def __new_para_def_func(self, line):
    """
    Requires:
        line -- line to output
    Returns:
        Nothing
    Logic:
        A new paragraph definition was found after the end of a
        paragraph.  Close the old definition (which also flushes and
        clears the buffered text -- ``line`` itself was already
        buffered by __after_para_end_func), then restart token
        collection for the new definition.
    """
    self.__write_para_def_end_func()
    self.__found_para_def_func()
|
||||
# after a paragraph and found reason to stop this block
|
||||
|
||||
def __stop_block_func(self, line):
    """
    Requires:
        line --(unused; dispatched with the standard signature)
    Returns:
        nothing
    Logic:
        After a paragraph, a block larger than a paragraph definition
        (cell, row, section, header, ...) was found.  Write the end
        tag of the old definition (__write_para_def_end_func also
        flushes and resets the text buffer) and fall back to the
        'after_para_def' state.
    """
    self.__write_para_def_end_func()
    self.__state = 'after_para_def'
|
||||
|
||||
def __write_para_def_end_func(self):
    """
    Requires:
        nothing
    Returns:
        nothing
    Logic:
        Print the closing paragraph-definition tag together with the
        inner/outer end markers used by later parsing passes, flush
        the buffered text, and emit end markers for the font-style
        and caps attributes if the current definition set them.
    """
    self.__write_obj.write(self.__end2_marker)
    self.__write_obj.write('mi<tg<close_____<paragraph-definition\n')
    self.__write_obj.write(self.__end_marker)
    # flush whatever was buffered after the paragraph ended
    self.__write_obj.write(self.__text_string)
    self.__text_string = ''
    keys = self.__att_val_dict.keys()
    if 'font-style' in keys:
        self.__write_obj.write('mi<mk<font-end__\n')
    if 'caps' in keys:
        self.__write_obj.write('mi<mk<caps-end__\n')
|
||||
|
||||
def __get_num_of_style(self):
    """
    Requires:
        nothing
    Returns:
        nothing (sets self.__att_val_dict['style-num'])
    Logic:
        Build a canonical string from the sorted attribute/value
        pairs of the current definition and use it as an identity
        key.  Seen before: reuse its 1-based position in
        __style_num_strings.  New: append it and register the style
        via __write_body_styles.
    """
    my_string = ''
    new_style = 0
    # when determining uniqueness for a style, ignore these values,
    # since they don't tell us if the style is unique
    ignore_values = ['style-num', 'nest-level', 'in-table']
    for k in sorted(self.__att_val_dict):
        if k not in ignore_values:
            my_string += '%s:%s' % (k, self.__att_val_dict[k])
    if my_string in self.__style_num_strings:
        num = self.__style_num_strings.index(my_string)
        num += 1  # since indexing starts at zero, rather than 1
    else:
        self.__style_num_strings.append(my_string)
        num = len(self.__style_num_strings)
        new_style = 1
    # style numbers are zero-padded, e.g. 's0001'
    num = '%04d' % num
    self.__att_val_dict['style-num'] = 's' + unicode_type(num)
    if new_style:
        self.__write_body_styles()
|
||||
|
||||
def __write_body_styles(self):
    """
    Append a 'paragraph-style-in-body' empty-attribute tag for the
    current definition to self.__body_style_strings.

    The tag carries the style name, the style number, the combined
    'tabs' attribute (if any), and every other collected attribute
    except those in ``exclude``.
    """
    style_string = ''
    style_string += 'mi<tg<empty-att_<paragraph-style-in-body'
    style_string += '<name>%s' % self.__att_val_dict['name']
    style_string += '<style-number>%s' % self.__att_val_dict['style-num']
    # the individual tabs-* attributes are subsumed by 'tabs'; they are
    # excluded from the generic attribute loop below
    tabs_list = ['tabs-left', 'tabs-right', 'tabs-decimal', 'tabs-center',
        'tabs-bar', 'tabs']
    if self.__att_val_dict['tabs'] != '':
        the_value = self.__att_val_dict['tabs']
        # the_value = the_value[:-1]
        style_string += ('<%s>%s' % ('tabs', the_value))
    exclude = frozenset(['name', 'style-num', 'in-table'] + tabs_list)
    for k in sorted(self.__att_val_dict):
        if k not in exclude:
            style_string += ('<%s>%s' % (k, self.__att_val_dict[k]))
    style_string += '\n'
    self.__body_style_strings.append(style_string)
|
||||
|
||||
def __write_para_def_beg(self):
    """
    Requires:
        nothing
    Returns:
        nothing
    Logic:
        Print the opening paragraph-definition tag plus the markers
        used by later parsing passes: table membership, left indent,
        list membership, style name, the open tag with all collected
        attributes, and finally begin markers for font-style and caps
        if the definition set them.
    """
    # assign (or reuse) a style number for the collected attributes
    self.__get_num_of_style()
    table = self.__att_val_dict.get('in-table')
    if table:
        # del self.__att_val_dict['in-table']
        self.__write_obj.write('mi<mk<in-table__\n')
    else:
        self.__write_obj.write('mi<mk<not-in-tbl\n')
    left_indent = self.__att_val_dict.get('left-indent')
    if left_indent:
        self.__write_obj.write('mi<mk<left_inden<%s\n' % left_indent)
    is_list = self.__att_val_dict.get('list-id')
    if is_list:
        self.__write_obj.write('mi<mk<list-id___<%s\n' % is_list)
    else:
        self.__write_obj.write('mi<mk<no-list___\n')
    self.__write_obj.write('mi<mk<style-name<%s\n' % self.__att_val_dict['name'])
    self.__write_obj.write(self.__start_marker)
    self.__write_obj.write('mi<tg<open-att__<paragraph-definition')
    self.__write_obj.write('<name>%s' % self.__att_val_dict['name'])
    self.__write_obj.write('<style-number>%s' % self.__att_val_dict['style-num'])
    tabs_list = ['tabs-left', 'tabs-right', 'tabs-decimal', 'tabs-center',
        'tabs-bar', 'tabs']
    # dead code kept as-is: per-tab-type output superseded by the
    # combined 'tabs' attribute below
    """
    for tab_item in tabs_list:
        if self.__att_val_dict[tab_item] != '':
            the_value = self.__att_val_dict[tab_item]
            the_value = the_value[:-1]
            self.__write_obj.write('<%s>%s' % (tab_item, the_value))
    """
    if self.__att_val_dict['tabs'] != '':
        the_value = self.__att_val_dict['tabs']
        # the_value = the_value[:-1]
        self.__write_obj.write('<%s>%s' % ('tabs', the_value))
    keys = sorted(self.__att_val_dict)
    exclude = frozenset(['name', 'style-num', 'in-table'] + tabs_list)
    for key in keys:
        if key not in exclude:
            self.__write_obj.write('<%s>%s' % (key, self.__att_val_dict[key]))
    self.__write_obj.write('\n')
    self.__write_obj.write(self.__start2_marker)
    if 'font-style' in keys:
        face = self.__att_val_dict['font-style']
        self.__write_obj.write('mi<mk<font______<%s\n' % face)
    if 'caps' in keys:
        value = self.__att_val_dict['caps']
        self.__write_obj.write('mi<mk<caps______<%s\n' % value)
|
||||
|
||||
def __empty_table_element_func(self, line):
    """
    A cell or row token arrived where no paragraph is open: mark the
    output as being inside a table, echo the token, and fall back to
    the 'after_para_def' state.
    """
    out = self.__write_obj
    out.write('mi<mk<in-table__\n')
    out.write(line)
    self.__state = 'after_para_def'
|
||||
|
||||
def __reset_dict(self):
    """
    Requires:
        nothing
    Returns:
        nothing
    Logic:
        Restore the attribute/value dictionary (and the pending tab
        type) to defaults.  Called each time a new paragraph
        definition is found so values from the previous definition
        cannot leak into the new one.
    """
    self.__att_val_dict.clear()
    self.__att_val_dict.update({
        'name': 'Normal',
        'font-style': self.__default_font,
        'tabs-left': '',
        'tabs-right': '',
        'tabs-center': '',
        'tabs-decimal': '',
        'tabs-bar': '',
        'tabs': '',
    })
    self.__tab_type = 'left'
|
||||
|
||||
def make_paragraph_def(self):
    """
    Requires:
        nothing
    Returns:
        self.__body_style_strings -- the list of body-style strings
        collected while processing (also rewrites the input file in
        place)
    Logic:
        Read one line at a time and dispatch on the current state.
        When done, optionally copy the intermediate result for
        debugging, then replace the input file with the output.
    """
    self.__initiate_values()
    read_obj = open_for_read(self.__file)
    self.__write_obj = open_for_write(self.__write_to)
    line_to_read = 1
    while line_to_read:
        line_to_read = read_obj.readline()
        line = line_to_read
        self.__token_info = line[:16]
        action = self.__state_dict.get(self.__state)
        if action is None:
            # Bug fix: message used to read "no no matching state in
            # module sections.py" -- doubled word and wrong module name.
            sys.stderr.write('no matching state in module paragraph_def.py\n')
            sys.stderr.write(self.__state + '\n')
            # Bug fix: previously fell through to ``action(line)`` with
            # ``action is None``, raising TypeError; skip the line.
            continue
        action(line)
    read_obj.close()
    self.__write_obj.close()
    copy_obj = copy.Copy(bug_handler=self.__bug_handler)
    if self.__copy:
        copy_obj.copy_file(self.__write_to, "paragraphs_def.data")
    copy_obj.rename(self.__write_to, self.__file)
    os.remove(self.__write_to)
    return self.__body_style_strings
|
||||
263
ebook_converter/ebooks/rtf2xml/paragraphs.py
Normal file
263
ebook_converter/ebooks/rtf2xml/paragraphs.py
Normal file
@@ -0,0 +1,263 @@
|
||||
from __future__ import unicode_literals, absolute_import, print_function, division
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import sys, os
|
||||
|
||||
from calibre.ebooks.rtf2xml import copy
|
||||
from calibre.ptempfile import better_mktemp
|
||||
from . import open_for_read, open_for_write
|
||||
|
||||
|
||||
class Paragraphs:
    """
    =================
    Purpose
    =================
    Write paragraph tags for a tokenized file.  (This module won't be
    any use to you unless you use it as part of the other modules.)
    -------------
    Method
    -------------
    RTF does not tell you when a paragraph begins; it only tells you
    when the paragraph ends.
    In order to make paragraphs out of this limited info, the parser
    starts in the body of the document and assumes it is not in a
    paragraph.  It looks for clues to begin a paragraph: text, an
    inline field, list-text, or a picture start.  A bare
    end-of-paragraph marker (\\par) indicates a blank paragraph.
    Once a paragraph is found, the state changes to 'paragraph'.  In
    this state, clues are looked to for the end of a paragraph: \\par,
    the end of a footnote or heading, the end of a field-block, the
    end of a cell, and the beginning or end of a section.
    """

    def __init__(self,
            in_file,
            bug_handler,
            copy=None,
            write_empty_para=1,
            run_level=1,
            ):
        """
        Required:
            'in_file'--file to parse
            'bug_handler'--exception class raised on internal errors
        Optional:
            'copy'-- whether to make a copy of result for debugging
            'write_empty_para'--whether empty paragraphs get tags
            'run_level'--verbosity/strictness level
        Returns:
            nothing
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__write_empty_para = write_empty_para
        self.__run_level = run_level
        # intermediate output goes to a fresh temp file
        self.__write_to = better_mktemp()

    def __initiate_values(self):
        """
        Initiate all values: the state machine, the paragraph start/end
        markers used by later parsing passes, and the dispatch dicts.
        """
        self.__state = 'before_body'
        self.__start_marker = 'mi<mk<para-start\n'  # outside para tags
        self.__start2_marker = 'mi<mk<par-start_\n'  # inside para tags
        self.__end2_marker = 'mi<mk<par-end___\n'  # inside para tags
        self.__end_marker = 'mi<mk<para-end__\n'  # outside para tags
        self.__state_dict = {
            'before_body'    : self.__before_body_func,
            'not_paragraph'  : self.__not_paragraph_func,
            'paragraph'      : self.__paragraph_func,
        }
        self.__paragraph_dict = {
            'cw<pf<par-end___' : self.__close_para_func,  # end of paragraph
            'mi<mk<headi_-end' : self.__close_para_func,  # end of header or footer
            'mi<mk<fldbk-end_' : self.__close_para_func,  # end of field-block
            'mi<mk<body-close' : self.__close_para_func,  # end of body
            'mi<mk<sect-close' : self.__close_para_func,  # end of section
            'mi<mk<sect-start' : self.__close_para_func,  # start of section
            'mi<mk<foot___clo' : self.__close_para_func,  # end of footnote
            'cw<tb<cell______' : self.__close_para_func,  # end of cell
            'mi<mk<par-in-fld' : self.__close_para_func,  # start of block field
            'cw<pf<par-def___' : self.__bogus_para__def_func,  # paragraph definition
        }
        self.__not_paragraph_dict = {
            'tx<nu<__________' : self.__start_para_func,
            'tx<hx<__________' : self.__start_para_func,
            'tx<ut<__________' : self.__start_para_func,
            'tx<mc<__________' : self.__start_para_func,
            'mi<mk<inline-fld' : self.__start_para_func,
            'mi<mk<para-beg__' : self.__start_para_func,
            'cw<pf<par-end___' : self.__empty_para_func,
            'mi<mk<pict-start' : self.__start_para_func,
            'cw<pf<page-break' : self.__empty_pgbk_func,  # page break
        }

    def __before_body_func(self, line):
        """
        Required:
            line -- line to parse
        Returns:
            nothing
        Logic:
            Copy every line through; once the body-open marker is
            seen, switch the state to 'not_paragraph'.
        """
        if self.__token_info == 'mi<mk<body-open_':
            self.__state = 'not_paragraph'
        self.__write_obj.write(line)

    def __not_paragraph_func(self, line):
        """
        Required:
            line --line to parse
        Returns:
            nothing
        Logic:
            Outside a paragraph: look for clues that start one (the
            dispatched action writes the opening tags and switches
            state), then copy the line through in all cases.
        """
        action = self.__not_paragraph_dict.get(self.__token_info)
        if action:
            action(line)
        self.__write_obj.write(line)

    def __paragraph_func(self, line):
        """
        Required:
            line --line to parse
        Returns:
            nothing
        Logic:
            Inside a paragraph: look for clues that end it.  The
            dispatched action writes the closing tags, echoes the
            line, and switches state; unrecognised lines are copied
            through.
        """
        action = self.__paragraph_dict.get(self.__token_info)
        if action:
            action(line)
        else:
            self.__write_obj.write(line)

    def __start_para_func(self, line):
        """
        Requires:
            line --line to parse (echoed by the caller, not here)
        Returns:
            nothing
        Logic:
            Write the paragraph start markers and the opening tag and
            change the state to 'paragraph'.
        """
        self.__write_obj.write(self.__start_marker)  # marker for later parsing
        self.__write_obj.write(
            'mi<tg<open______<para\n'
            )
        self.__write_obj.write(self.__start2_marker)
        self.__state = 'paragraph'

    def __empty_para_func(self, line):
        """
        Requires:
            line --line to parse
        Returns:
            nothing
        Logic:
            A \\par with no open paragraph means an empty paragraph:
            write the empty-paragraph tags unless
            self.__write_empty_para is falsy.
        """
        if self.__write_empty_para:
            self.__write_obj.write(self.__start_marker)  # marker for later parsing
            self.__write_obj.write(
                'mi<tg<empty_____<para\n'
                )
            self.__write_obj.write(self.__end_marker)  # marker for later parsing

    def __empty_pgbk_func(self, line):
        """
        Requires:
            line --line to parse
        Returns:
            nothing
        Logic:
            Write the empty tag for a page break.
        """
        self.__write_obj.write(
            'mi<tg<empty_____<page-break\n'
            )

    def __close_para_func(self, line):
        """
        Requires:
            line --line to parse
        Returns:
            nothing
        Logic:
            Write the end tags for a paragraph, echo the triggering
            line, and change the state to 'not_paragraph'.
        """
        self.__write_obj.write(self.__end2_marker)  # marker for later parser
        self.__write_obj.write(
            'mi<tg<close_____<para\n'
            )
        self.__write_obj.write(self.__end_marker)  # marker for later parser
        self.__write_obj.write(line)
        self.__state = 'not_paragraph'

    def __bogus_para__def_func(self, line):
        """
        Requires:
            line --line to parse
        Returns:
            nothing
        Logic:
            A \\pard occurring inside a paragraph is ignored; a marker
            is written so later passes can see it happened.
        """
        self.__write_obj.write('mi<mk<bogus-pard\n')

    def make_paragraphs(self):
        """
        Requires:
            nothing
        Returns:
            nothing (changes the original file)
        Logic:
            Read one line at a time and dispatch on the state.  Before
            the body, look for the beginning of the body; once found,
            the state alternates between 'not_paragraph' and
            'paragraph'.  Finally replace the input file with the
            processed output.
        """
        self.__initiate_values()
        with open_for_read(self.__file) as read_obj:
            with open_for_write(self.__write_to) as self.__write_obj:
                for line in read_obj:
                    self.__token_info = line[:16]
                    action = self.__state_dict.get(self.__state)
                    if action is None:
                        # Bug fix: a bare ``except: pass`` used to
                        # swallow errors from the report and then
                        # ``action(line)`` was still called, crashing
                        # with TypeError.  Report (best effort) and
                        # skip the line instead.
                        try:
                            sys.stderr.write('no matching state in module paragraphs.py\n')
                            sys.stderr.write(self.__state + '\n')
                        except Exception:
                            pass
                        continue
                    action(line)
        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "paragraphs.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)
|
||||
182
ebook_converter/ebooks/rtf2xml/pict.py
Normal file
182
ebook_converter/ebooks/rtf2xml/pict.py
Normal file
@@ -0,0 +1,182 @@
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import sys, os
|
||||
|
||||
from calibre.ebooks.rtf2xml import copy
|
||||
from calibre.ptempfile import better_mktemp
|
||||
from polyglot.builtins import unicode_type
|
||||
|
||||
from . import open_for_read, open_for_write
|
||||
|
||||
|
||||
class Pict:
    """Process graphic (\\pict) information.

    Pict data is stripped from the token stream into a side file
    (``picts.rtf`` inside a per-document directory); the main stream
    gets an empty ``pict`` tag with a running picture number.
    """

    def __init__(self,
            in_file,
            bug_handler,
            out_file,
            copy=None,
            orig_file=None,
            run_level=1,
            ):
        """
        Required:
            'in_file'--file to parse
            'bug_handler'--exception class raised on internal errors
            'out_file'--final output file (used to place the pict dir)
        Optional:
            'copy'-- whether to make a copy of result for debugging
            'orig_file'--the original RTF file (names the pict dir)
            'run_level'--verbosity/strictness level
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__run_level = run_level
        self.__write_to = better_mktemp()
        self.__bracket_count = 0
        # open/close bracket counters are replaced with the token's
        # textual count field while processing
        self.__ob_count = 0
        self.__cb_count = 0
        self.__pict_count = 0
        self.__in_pict = False
        self.__already_found_pict = False
        self.__orig_file = orig_file
        self.__initiate_pict_dict()
        self.__out_file = out_file

    def __initiate_pict_dict(self):
        # token -> function returning the text to append to the pict file
        self.__pict_dict = {
            'ob<nu<open-brack' : self.__open_br_func,
            'cb<nu<clos-brack' : self.__close_br_func,
            'tx<nu<__________' : self.__text_func,
        }

    def __open_br_func(self, line):
        """Return the literal text for an open-bracket token."""
        return "{\n"

    def __close_br_func(self, line):
        """Return the literal text for a close-bracket token."""
        return "}\n"

    def __text_func(self, line):
        """Return the payload of a text token.

        Token shape: ``tx<nu<__________<true text`` -- the payload
        starts at column 17.
        """
        return line[17:]

    def __make_dir(self):
        """Make a directory to put the image data in."""
        base_name = os.path.basename(getattr(self.__orig_file, 'name',
            self.__orig_file))
        base_name = os.path.splitext(base_name)[0]
        if self.__out_file:
            dir_name = os.path.dirname(getattr(self.__out_file, 'name',
                self.__out_file))
        else:
            dir_name = os.path.dirname(self.__orig_file)
        self.__dir_name = base_name + "_rtf_pict_dir/"
        self.__dir_name = os.path.join(dir_name, self.__dir_name)
        if not os.path.isdir(self.__dir_name):
            try:
                os.mkdir(self.__dir_name)
            except OSError as msg:
                msg = "%sCouldn't make directory '%s':\n" % (unicode_type(msg), self.__dir_name)
                # Bug fix: ``raise self.__bug_handler`` discarded the
                # freshly built ``msg``; pass it to the handler the way
                # every other call site in this package does.
                raise self.__bug_handler(msg)
        else:
            # directory already exists: clear out stale pict files
            if self.__run_level > 1:
                sys.stderr.write('Removing files from old pict directory...\n')
            all_files = os.listdir(self.__dir_name)
            for the_file in all_files:
                the_file = os.path.join(self.__dir_name, the_file)
                try:
                    os.remove(the_file)
                except OSError:
                    pass
            if self.__run_level > 1:
                sys.stderr.write('Files removed.\n')

    def __create_pict_file(self):
        """Create the side file all pict data is written to."""
        self.__pict_file = os.path.join(self.__dir_name, 'picts.rtf')
        self.__write_pic_obj = open_for_write(self.__pict_file, append=True)

    def __in_pict_func(self, line):
        """Handle one line while inside pict data.

        Returns True once the pict group closes (so the caller resumes
        writing to the main stream), False while still inside it.
        """
        if self.__cb_count == self.__pict_br_count:
            # bracket that opened the pict group has closed
            self.__in_pict = False
            self.__write_pic_obj.write("}\n")
            return True
        else:
            action = self.__pict_dict.get(self.__token_info)
            if action:
                self.__write_pic_obj.write(action(line))
            return False

    def __default(self, line, write_obj):
        """Determine if a token marks the beginning of pict data.

        If it does, create the pict side file (once), write the
        placeholder tags to the main stream, and set self.__in_pict;
        return False so the raw line is not echoed.  Otherwise return
        True so the caller copies the line through.
        """
        if self.__token_info == 'cw<gr<picture___':
            self.__pict_count += 1
            write_obj.write('mi<mk<pict-start\n')
            write_obj.write('mi<tg<empty-att_<pict<num>%03d\n' % self.__pict_count)
            write_obj.write('mi<mk<pict-end__\n')
            if not self.__already_found_pict:
                self.__create_pict_file()
                self.__already_found_pict = True
                self.__print_rtf_header()
            self.__in_pict = 1
            # remember the bracket depth at which the pict opened
            self.__pict_br_count = self.__ob_count
            self.__cb_count = 0
            self.__write_pic_obj.write("{\\pict\n")
            return False
        return True

    def __print_rtf_header(self):
        """Write the minimal RTF preamble so the pict side file is
        recognized as an RTF file."""
        self.__write_pic_obj.write("{\\rtf1 \n{\\fonttbl\\f0\\null;} \n")
        self.__write_pic_obj.write("{\\colortbl\\red255\\green255\\blue255;} \n\\pard \n")

    def process_pict(self):
        """Strip pict data from the token file into the side file and
        rewrite the token file in place."""
        self.__make_dir()
        with open_for_read(self.__file) as read_obj:
            with open_for_write(self.__write_to) as write_obj:
                for line in read_obj:
                    self.__token_info = line[:16]
                    # bracket tokens carry a 4-digit count before the
                    # trailing newline
                    if self.__token_info == 'ob<nu<open-brack':
                        self.__ob_count = line[-5:-1]
                    if self.__token_info == 'cb<nu<clos-brack':
                        self.__cb_count = line[-5:-1]
                    if not self.__in_pict:
                        to_print = self.__default(line, write_obj)
                        if to_print:
                            write_obj.write(line)
                    else:
                        to_print = self.__in_pict_func(line)
                        if to_print:
                            write_obj.write(line)
        if self.__already_found_pict:
            self.__write_pic_obj.write("}\n")
            self.__write_pic_obj.close()
        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "pict.data")
            try:
                copy_obj.copy_file(self.__pict_file, "pict.rtf")
            # Bug fix: was a bare ``except:``; keep the best-effort
            # semantics (no pict file may exist) but stop swallowing
            # KeyboardInterrupt/SystemExit.
            except Exception:
                pass
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)
        if self.__pict_count == 0:
            try:
                os.rmdir(self.__dir_name)
            except OSError:
                pass
|
||||
591
ebook_converter/ebooks/rtf2xml/preamble_div.py
Normal file
591
ebook_converter/ebooks/rtf2xml/preamble_div.py
Normal file
@@ -0,0 +1,591 @@
|
||||
from __future__ import unicode_literals, absolute_import, print_function, division
|
||||
from __future__ import print_function
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import sys, os
|
||||
from calibre.ebooks.rtf2xml import copy, override_table, list_table
|
||||
from calibre.ptempfile import better_mktemp
|
||||
from . import open_for_read, open_for_write
|
||||
|
||||
|
||||
class PreambleDiv:
    """
    Break the preamble into divisions.

    Reads a tokenised RTF file line by line and accumulates each part of
    the preamble (font table, color table, style sheet, list table,
    override table, revision table, doc info) into its own string.  When
    the document body is detected, all accumulated parts are flushed out
    wrapped in open/close tags, followed by the body lines unchanged.
    """

    def __init__(self, in_file,
            bug_handler,
            copy=None,
            no_namespace=None,
            run_level=1,
            ):
        """
        Required:
            'in_file' -- tokenised file to parse
        Optional:
            'copy' -- whether to make a copy of result for debugging
            'no_namespace' -- suppress the xmlns attribute on the root tag
            'run_level' -- verbosity/strictness level
        Returns:
            nothing
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__no_namespace = no_namespace
        self.__write_to = better_mktemp()
        self.__run_level = run_level

    def __initiate_values(self):
        """
        Set values, including those for the state and dispatch dictionaries.
        """
        self.__all_lists = {}
        # page geometry defaults (points); overwritten by __margin_func
        self.__page = {
            'margin-top'    : 72,
            'margin-bottom' : 72,
            'margin-left'   : 90,
            'margin-right'  : 90,
            'gutter'        : 0,
        }
        self.__cb_count = ''
        self.__ob_count = ''
        self.__state = 'preamble'
        self.__rtf_final = ''
        self.__close_group_count = ''
        self.__found_font_table = 0
        self.__list_table_final = ''
        self.__override_table_final = ''
        self.__revision_table_final = ''
        self.__doc_info_table_final = ''
        # maps both parser states and token infos to handlers
        self.__state_dict = {
            'default'           : self.__default_func,
            'rtf_header'        : self.__rtf_head_func,
            'preamble'          : self.__preamble_func,
            'font_table'        : self.__font_table_func,
            'color_table'       : self.__color_table_func,
            'style_sheet'       : self.__style_sheet_func,
            'list_table'        : self.__list_table_func,
            'override_table'    : self.__override_table_func,
            'revision_table'    : self.__revision_table_func,
            'doc_info'          : self.__doc_info_func,
            'body'              : self.__body_func,
            'ignore'            : self.__ignore_func,
            'cw<ri<rtf_______'  : self.__found_rtf_head_func,
            'cw<pf<par-def___'  : self.__para_def_func,
            'tx<nu<__________'  : self.__text_func,
            'cw<tb<row-def___'  : self.__row_def_func,
            'cw<sc<section___'  : self.__new_section_func,
            'cw<sc<sect-defin'  : self.__new_section_func,
            'cw<it<font-table'  : self.__found_font_table_func,
            'cw<it<colr-table'  : self.__found_color_table_func,
            'cw<ss<style-shet'  : self.__found_style_sheet_func,
            'cw<it<listtable_'  : self.__found_list_table_func,
            'cw<it<lovr-table'  : self.__found_override_table_func,
            'cw<it<revi-table'  : self.__found_revision_table_func,
            'cw<di<doc-info__'  : self.__found_doc_info_func,
            'cw<pa<margin-lef'  : self.__margin_func,
            'cw<pa<margin-rig'  : self.__margin_func,
            'cw<pa<margin-top'  : self.__margin_func,
            'cw<pa<margin-bot'  : self.__margin_func,
            'cw<pa<gutter____'  : self.__margin_func,
            'cw<pa<paper-widt'  : self.__margin_func,
            'cw<pa<paper-hght'  : self.__margin_func,
            # 'cw<tb<columns___' : self.__section_func,
        }
        # token field -> page-dictionary key
        self.__margin_dict = {
            'margin-lef' : 'margin-left',
            'margin-rig' : 'margin-right',
            'margin-top' : 'margin-top',
            'margin-bot' : 'margin-bottom',
            'gutter____' : 'gutter',
            'paper-widt' : 'paper-width',
            'paper-hght' : 'paper-height',
        }
        self.__translate_sec = {
            'columns___' : 'column',
        }
        self.__section = {}
        # self.__write_obj.write(self.__color_table_final)
        self.__color_table_final = ''
        self.__style_sheet_final = ''
        self.__individual_font = 0
        self.__old_font = 0
        self.__ob_group = 0  # depth of group
        self.__font_table_final = 0
        self.__list_table_obj = list_table.ListTable(
            run_level=self.__run_level,
            bug_handler=self.__bug_handler,
        )

    def __ignore_func(self, line):
        """
        Ignore all lines, until the bracket is found that marks the end of
        the group.
        """
        if self.__ignore_num == self.__cb_count:
            self.__state = self.__previous_state

    def __found_rtf_head_func(self, line):
        # \rtf control word starts the RTF header
        self.__state = 'rtf_header'

    def __rtf_head_func(self, line):
        """
        Accumulate the RTF header.  Closing back to group depth 2 or
        hitting text/paragraph tokens ends the header; in the latter
        case there is no real preamble, so write defaults and the
        preamble immediately.
        """
        if self.__ob_count == '0002':
            self.__rtf_final = (
                'mi<mk<rtfhed-beg\n' +
                self.__rtf_final +
                'mi<mk<rtfhed-end\n'
            )
            self.__state = 'preamble'
        elif self.__token_info == 'tx<nu<__________' or \
                self.__token_info == 'cw<pf<par-def___':
            # text before any preamble tables: jump straight to the body
            self.__state = 'body'
            self.__rtf_final = (
                'mi<mk<rtfhed-beg\n' +
                self.__rtf_final +
                'mi<mk<rtfhed-end\n'
            )
            self.__make_default_font_table()
            self.__write_preamble()
            self.__write_obj.write(line)
        else:
            self.__rtf_final = self.__rtf_final + line

    def __make_default_font_table(self):
        """
        If no font table is found, need to write one out.
        """
        self.__font_table_final = 'mi<tg<open______<font-table\n'
        self.__font_table_final += 'mi<mk<fonttb-beg\n'
        self.__font_table_final += 'mi<mk<fontit-beg\n'
        self.__font_table_final += 'cw<ci<font-style<nu<0\n'
        self.__font_table_final += 'tx<nu<__________<Times;\n'
        self.__font_table_final += 'mi<mk<fontit-end\n'
        self.__font_table_final += 'mi<mk<fonttb-end\n'
        self.__font_table_final += 'mi<tg<close_____<font-table\n'

    def __make_default_color_table(self):
        """
        If no color table is found, write a string for a default one.
        """
        self.__color_table_final = 'mi<tg<open______<color-table\n'
        self.__color_table_final += 'mi<mk<clrtbl-beg\n'
        self.__color_table_final += 'cw<ci<red_______<nu<00\n'
        self.__color_table_final += 'cw<ci<green_____<nu<00\n'
        # NOTE(review): '<en<' differs from the '<nu<' used for red and
        # green above -- presumably a typo, but downstream parsers may
        # key on the exact bytes, so it is left unchanged.
        self.__color_table_final += 'cw<ci<blue______<en<00\n'
        self.__color_table_final += 'mi<mk<clrtbl-end\n'
        self.__color_table_final += 'mi<tg<close_____<color-table\n'

    def __make_default_style_table(self):
        """
        If no style table is found, make a string for a default one.
        """
        self.__style_sheet_final = """mi<tg<open______<style-table
mi<mk<styles-beg
mi<mk<stylei-beg
cw<ci<font-style<nu<0
tx<nu<__________<Normal;
mi<mk<stylei-end
mi<mk<stylei-beg
cw<ss<char-style<nu<0
tx<nu<__________<Default Paragraph Font;
mi<mk<stylei-end
mi<mk<styles-end
mi<tg<close_____<style-table
"""

    def __found_font_table_func(self, line):
        if self.__found_font_table:
            # duplicate font table: skip the whole group.  Set the
            # bookkeeping __ignore_func relies on (the original left
            # __previous_state/__ignore_num unset here, which would
            # raise AttributeError when the group closed).
            self.__previous_state = 'preamble'
            self.__ignore_num = self.__ob_count
            self.__state = 'ignore'
        else:
            self.__state = 'font_table'
        self.__font_table_final = ''
        self.__close_group_count = self.__ob_count
        self.__cb_count = 0
        self.__found_font_table = 1

    def __font_table_func(self, line):
        """
        Keep adding to the self.__individual_font string until end of group
        found. If a bracket is found, check that it is only one bracket deep.
        If it is, then set the marker for an individual font. If it is not,
        then ignore all data in this group.
        cw<ci<font-style<nu<0
        """
        if self.__cb_count == self.__close_group_count:
            self.__state = 'preamble'
            self.__font_table_final = 'mi<tg<open______<font-table\n' + \
                'mi<mk<fonttb-beg\n' + self.__font_table_final
            self.__font_table_final += \
                'mi<mk<fonttb-end\n' + 'mi<tg<close_____<font-table\n'
        elif self.__token_info == 'ob<nu<open-brack':
            if int(self.__ob_count) == int(self.__close_group_count) + 1:
                self.__font_table_final += \
                    'mi<mk<fontit-beg\n'
                self.__individual_font = 1
            else:
                # deeper nesting: ignore this sub-group entirely
                self.__previous_state = 'font_table'
                self.__state = 'ignore'
                self.__ignore_num = self.__ob_count
        elif self.__token_info == 'cb<nu<clos-brack':
            if int(self.__cb_count) == int(self.__close_group_count) + 1:
                self.__individual_font = 0
                self.__font_table_final += \
                    'mi<mk<fontit-end\n'
        elif self.__individual_font:
            if self.__old_font and self.__token_info == 'tx<nu<__________':
                # old-style tables end each font with ';' instead of a
                # closing bracket
                if ';' in line:
                    self.__font_table_final += line
                    self.__font_table_final += 'mi<mk<fontit-end\n'
                    self.__individual_font = 0
            else:
                self.__font_table_final += line
        elif self.__token_info == 'cw<ci<font-style':
            self.__old_font = 1
            self.__individual_font = 1
            self.__font_table_final += 'mi<mk<fontit-beg\n'
            self.__font_table_final += line

    def __old_font_func(self, line):
        """
        Required:
            line --line to parse
        Returns:
            nothing
        Logic:
            used for older forms of RTF:
            \f3\fswiss\fcharset77 Helvetica-Oblique;\f4\fnil\fcharset77 Geneva;}
            Note how each font is not divided by a bracket
        """

    def __found_color_table_func(self, line):
        """
        all functions that start with __found operate the same. They set the
        state, initiate a string, determine the self.__close_group_count, and
        set self.__cb_count to zero.
        """
        self.__state = 'color_table'
        self.__color_table_final = ''
        self.__close_group_count = self.__ob_count
        self.__cb_count = 0

    def __color_table_func(self, line):
        if int(self.__cb_count) == int(self.__close_group_count):
            self.__state = 'preamble'
            self.__color_table_final = 'mi<tg<open______<color-table\n' + \
                'mi<mk<clrtbl-beg\n' + self.__color_table_final
            self.__color_table_final += \
                'mi<mk<clrtbl-end\n' + 'mi<tg<close_____<color-table\n'
        else:
            self.__color_table_final += line

    def __found_style_sheet_func(self, line):
        self.__state = 'style_sheet'
        self.__style_sheet_final = ''
        self.__close_group_count = self.__ob_count
        self.__cb_count = 0

    def __style_sheet_func(self, line):
        """
        Same logic as the font_table_func.
        """
        if self.__cb_count == self.__close_group_count:
            self.__state = 'preamble'
            self.__style_sheet_final = 'mi<tg<open______<style-table\n' + \
                'mi<mk<styles-beg\n' + self.__style_sheet_final
            self.__style_sheet_final += \
                'mi<mk<styles-end\n' + 'mi<tg<close_____<style-table\n'
        elif self.__token_info == 'ob<nu<open-brack':
            if int(self.__ob_count) == int(self.__close_group_count) + 1:
                self.__style_sheet_final += \
                    'mi<mk<stylei-beg\n'
        elif self.__token_info == 'cb<nu<clos-brack':
            if int(self.__cb_count) == int(self.__close_group_count) + 1:
                self.__style_sheet_final += \
                    'mi<mk<stylei-end\n'
        else:
            self.__style_sheet_final += line

    def __found_list_table_func(self, line):
        self.__state = 'list_table'
        self.__list_table_final = ''
        self.__close_group_count = self.__ob_count
        self.__cb_count = 0

    def __list_table_func(self, line):
        if self.__cb_count == self.__close_group_count:
            self.__state = 'preamble'
            # delegate the list table's parsing to the ListTable object
            self.__list_table_final, self.__all_lists =\
                self.__list_table_obj.parse_list_table(
                    self.__list_table_final)
            # sys.stderr.write(repr(all_lists))
        elif self.__token_info == '':
            pass
        else:
            self.__list_table_final += line

    def __found_override_table_func(self, line):
        # cw<it<lovr-table
        self.__override_table_obj = override_table.OverrideTable(
            run_level=self.__run_level,
            list_of_lists=self.__all_lists,
        )
        self.__state = 'override_table'
        self.__override_table_final = ''
        self.__close_group_count = self.__ob_count
        self.__cb_count = 0

    def __override_table_func(self, line):
        if self.__cb_count == self.__close_group_count:
            self.__state = 'preamble'
            self.__override_table_final, self.__all_lists =\
                self.__override_table_obj.parse_override_table(self.__override_table_final)
        elif self.__token_info == '':
            pass
        else:
            self.__override_table_final += line

    def __found_revision_table_func(self, line):
        self.__state = 'revision_table'
        self.__revision_table_final = ''
        self.__close_group_count = self.__ob_count
        self.__cb_count = 0

    def __revision_table_func(self, line):
        if int(self.__cb_count) == int(self.__close_group_count):
            self.__state = 'preamble'
            self.__revision_table_final = 'mi<tg<open______<revision-table\n' + \
                'mi<mk<revtbl-beg\n' + self.__revision_table_final
            self.__revision_table_final += \
                'mi<mk<revtbl-end\n' + 'mi<tg<close_____<revision-table\n'
        else:
            self.__revision_table_final += line

    def __found_doc_info_func(self, line):
        self.__state = 'doc_info'
        self.__doc_info_table_final = ''
        self.__close_group_count = self.__ob_count
        self.__cb_count = 0

    def __doc_info_func(self, line):
        if self.__cb_count == self.__close_group_count:
            self.__state = 'preamble'
            self.__doc_info_table_final = 'mi<tg<open______<doc-information\n' + \
                'mi<mk<doc-in-beg\n' + self.__doc_info_table_final
            self.__doc_info_table_final += \
                'mi<mk<doc-in-end\n' + 'mi<tg<close_____<doc-information\n'
        elif self.__token_info == 'ob<nu<open-brack':
            if int(self.__ob_count) == int(self.__close_group_count) + 1:
                self.__doc_info_table_final += \
                    'mi<mk<docinf-beg\n'
        elif self.__token_info == 'cb<nu<clos-brack':
            if int(self.__cb_count) == int(self.__close_group_count) + 1:
                self.__doc_info_table_final += \
                    'mi<mk<docinf-end\n'
        else:
            self.__doc_info_table_final += line

    def __margin_func(self, line):
        """
        Handles lines that describe page info.  Add the appropriate info
        in the token to the self.__page dictionary.
        """
        # cw<pa<margin-lef<nu<1728
        info = line[6:16]
        changed = self.__margin_dict.get(info)
        if changed is None:
            # consistent with __section_func's diagnostic
            sys.stderr.write('woops!\n')
        else:
            self.__page[changed] = line[20:-1]

    def __print_page_info(self):
        # mi<tg<empty-att_<page-definition
        self.__write_obj.write('mi<tg<empty-att_<page-definition')
        for key in self.__page.keys():
            self.__write_obj.write(
                '<%s>%s' % (key, self.__page[key])
            )
        self.__write_obj.write('\n')

    def __print_sec_info(self):
        """
        Check if there is any section info. If so, print it out.
        If not, print out an empty tag to satisfy the dtd.
        """
        if len(self.__section.keys()) == 0:
            self.__write_obj.write(
                'mi<tg<open______<section-definition\n'
            )
        else:
            self.__write_obj.write(
                'mi<tg<open-att__<section-definition')
            keys = self.__section.keys()
            for key in keys:
                self.__write_obj.write(
                    '<%s>%s' % (key, self.__section[key])
                )
            self.__write_obj.write('\n')

    def __section_func(self, line):
        """
        Add info pertaining to section to the self.__section dictionary, to
        be printed out later.
        """
        info = self.__translate_sec.get(line[6:16])
        if info is None:
            sys.stderr.write('woops!\n')
        else:
            self.__section[info] = 'true'

    def __body_func(self, line):
        # once in the body, lines pass through unchanged
        self.__write_obj.write(line)

    def __default_func(self, line):
        # either in preamble or in body
        pass

    def __para_def_func(self, line):
        # group depth 2 marks the start of the body
        if self.__cb_count == '0002':
            self.__state = 'body'
            self.__write_preamble()
        self.__write_obj.write(line)

    def __text_func(self, line):
        """
        If the cb_count is less than 1, you have hit the body.
        For older RTF.
        Newer RTF should never have to use this function.
        """
        if self.__cb_count == '':
            cb_count = '0002'
        else:
            cb_count = self.__cb_count
        # group depth 2 marks the start of the body
        if cb_count == '0002':
            self.__state = 'body'
            self.__write_preamble()
        self.__write_obj.write(line)

    def __row_def_func(self, line):
        # group depth 2 marks the start of the body
        if self.__cb_count == '0002':
            self.__state = 'body'
            self.__write_preamble()
        self.__write_obj.write(line)

    def __new_section_func(self, line):
        """
        This is new. The start of a section marks the end of the preamble.
        """
        if self.__cb_count == '0002':
            self.__state = 'body'
            self.__write_preamble()
        else:
            sys.stderr.write('module is preamble_div\n')
            sys.stderr.write('method is __new_section_func\n')
            sys.stderr.write('bracket count should be 2?\n')
        self.__write_obj.write(line)

    def __write_preamble(self):
        """
        Write all the strings, which represent all the data in the preamble.
        Write a body and section beginning.
        """
        if self.__no_namespace:
            self.__write_obj.write(
                'mi<tg<open______<doc\n'
            )
        else:
            self.__write_obj.write(
                'mi<tg<open-att__<doc<xmlns>http://rtf2xml.sourceforge.net/\n')
        self.__write_obj.write('mi<tg<open______<preamble\n')
        self.__write_obj.write(self.__rtf_final)
        # fall back to defaults for any table that never appeared
        if not self.__color_table_final:
            self.__make_default_color_table()
        if not self.__font_table_final:
            self.__make_default_font_table()
        self.__write_obj.write(self.__font_table_final)
        self.__write_obj.write(self.__color_table_final)
        if not self.__style_sheet_final:
            self.__make_default_style_table()
        self.__write_obj.write(self.__style_sheet_final)
        self.__write_obj.write(self.__list_table_final)
        self.__write_obj.write(self.__override_table_final)
        self.__write_obj.write(self.__revision_table_final)
        self.__write_obj.write(self.__doc_info_table_final)
        self.__print_page_info()
        self.__write_obj.write('ob<nu<open-brack<0001\n')
        self.__write_obj.write('ob<nu<open-brack<0002\n')
        self.__write_obj.write('cb<nu<clos-brack<0002\n')
        self.__write_obj.write('mi<tg<close_____<preamble\n')
        self.__write_obj.write('mi<tg<open______<body\n')
        # self.__write_obj.write('mi<tg<open-att__<section<num>1\n')
        # self.__print_sec_info()
        # self.__write_obj.write('mi<tg<open______<headers-and-footers\n')
        # self.__write_obj.write('mi<mk<head_foot_<\n')
        # self.__write_obj.write('mi<tg<close_____<headers-and-footers\n')
        self.__write_obj.write('mi<mk<body-open_\n')

    def __preamble_func(self, line):
        """
        Check if the token info belongs to the dictionary. If so, take the
        appropriate action.
        """
        action = self.__state_dict.get(self.__token_info)
        if action:
            action(line)

    def make_preamble_divisions(self):
        """
        Drive the parse: read each line, keep the bracket counters
        up to date, and dispatch on the current state.  Replace the
        input file with the result and return the collected lists.
        """
        self.__initiate_values()
        with open_for_read(self.__file) as read_obj:
            with open_for_write(self.__write_to) as self.__write_obj:
                for line in read_obj:
                    self.__token_info = line[:16]
                    if self.__token_info == 'ob<nu<open-brack':
                        self.__ob_count = line[-5:-1]
                        self.__ob_group += 1
                    if self.__token_info == 'cb<nu<clos-brack':
                        self.__cb_count = line[-5:-1]
                        self.__ob_group -= 1
                    action = self.__state_dict.get(self.__state)
                    if action is None:
                        print(self.__state)
                    action(line)
        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "preamble_div.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)
        return self.__all_lists
157
ebook_converter/ebooks/rtf2xml/preamble_rest.py
Normal file
157
ebook_converter/ebooks/rtf2xml/preamble_rest.py
Normal file
@@ -0,0 +1,157 @@
|
||||
from __future__ import unicode_literals, absolute_import, print_function, division
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import sys,os
|
||||
|
||||
from calibre.ebooks.rtf2xml import copy
|
||||
from . import open_for_read, open_for_write
|
||||
|
||||
|
||||
class Preamble:
    """
    Fix the remaining parts of the preamble. This module does very little.
    It makes sure that no text gets put in the revision or list table. In
    the future, when I understand how to interpret the revision table and
    list table, I will make these methods more functional.
    """

    def __init__(self, file,
            bug_handler,
            platform,
            default_font,
            code_page,
            copy=None,
            temp_dir=None,
            ):
        """
        Required:
            file--file to parse
            platform --Windows or Macintosh
            default_font -- the default font
            code_page --the code page (ansi1252, for example)
        Optional:
            'copy'-- whether to make a copy of result for debugging
            'temp_dir' --where to output temporary results (default is
            directory from which the script is run.)
        Returns:
            nothing
        """
        self.__file = file
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__default_font = default_font
        self.__code_page = code_page
        self.__platform = platform
        # temporary output lives alongside other temp data when a
        # directory was supplied, else in the current directory
        base_name = "info_table_info.data"
        self.__write_to = os.path.join(temp_dir, base_name) if temp_dir else base_name

    def __initiate_values(self):
        """
        Initiate all values.
        """
        self.__state = 'default'
        self.__text_string = ''
        # parser state -> handler
        self.__state_dict = {
            'default'       : self.__default_func,
            'revision'      : self.__revision_table_func,
            'list_table'    : self.__list_table_func,
            'body'          : self.__body_func,
        }
        # token markers that switch state while in 'default'
        self.__default_dict = {
            'mi<mk<rtfhed-beg'  : self.__found_rtf_head_func,
            'mi<mk<listabbeg_'  : self.__found_list_table_func,
            'mi<mk<revtbl-beg'  : self.__found_revision_table_func,
            'mi<mk<body-open_'  : self.__found_body_func,
        }

    def __default_func(self, line):
        # dispatch on the token; unrecognised tokens pass straight through
        handler = self.__default_dict.get(self.__token_info)
        if handler is None:
            self.__write_obj.write(line)
        else:
            handler(line)

    def __found_rtf_head_func(self, line):
        """
        Requires:
            line -- the line to parse
        Returns:
            nothing.
        Logic:
            Write to the output file the default font info, the code page
            info, and the platform info.
        """
        self.__write_obj.write(
            'mi<tg<empty-att_<rtf-definition'
            '<default-font>%s<code-page>%s'
            '<platform>%s\n' % (self.__default_font, self.__code_page,
                self.__platform)
        )

    def __found_list_table_func(self, line):
        self.__state = 'list_table'

    def __list_table_func(self, line):
        # leave the state at the end marker; drop raw text tokens;
        # pass everything else through
        if self.__token_info == 'mi<mk<listabend_':
            self.__state = 'default'
            return
        if line[0:2] == 'tx':
            return
        self.__write_obj.write(line)

    def __found_revision_table_func(self, line):
        self.__state = 'revision'

    def __revision_table_func(self, line):
        # mirror image of __list_table_func for the revision table
        if self.__token_info == 'mi<mk<revtbl-end':
            self.__state = 'default'
            return
        if line[0:2] == 'tx':
            return
        self.__write_obj.write(line)

    def __found_body_func(self, line):
        self.__state = 'body'
        self.__write_obj.write(line)

    def __body_func(self, line):
        # body lines are echoed unchanged
        self.__write_obj.write(line)

    def fix_preamble(self):
        """
        Requires:
            nothing
        Returns:
            nothing (changes the original file)
        Logic:
            Read one line in at a time. Determine what action to take based
            on the state. The state can either be default, the revision
            table, or the list table.
        """
        self.__initiate_values()
        with open_for_read(self.__file) as read_obj, \
                open_for_write(self.__write_to) as self.__write_obj:
            for line in read_obj:
                self.__token_info = line[:16]
                handler = self.__state_dict.get(self.__state)
                if handler is None:
                    sys.stderr.write(
                        'no matching state in module preamble_rest.py\n' + self.__state + '\n')
                handler(line)
        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "preamble_div.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)
837
ebook_converter/ebooks/rtf2xml/process_tokens.py
Normal file
837
ebook_converter/ebooks/rtf2xml/process_tokens.py
Normal file
@@ -0,0 +1,837 @@
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import os, re
|
||||
|
||||
from calibre.ebooks.rtf2xml import copy, check_brackets
|
||||
from calibre.ptempfile import better_mktemp
|
||||
from polyglot.builtins import unicode_type
|
||||
|
||||
from . import open_for_read, open_for_write
|
||||
|
||||
|
||||
class ProcessTokens:
|
||||
"""
|
||||
Process each token on a line and add information that will be useful for
|
||||
later processing. Information will be put on one line, delimited by "<"
|
||||
for main fields, and ">" for sub fields
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
in_file,
|
||||
exception_handler,
|
||||
bug_handler,
|
||||
copy=None,
|
||||
run_level=1,
|
||||
):
|
||||
self.__file = in_file
|
||||
self.__bug_handler = bug_handler
|
||||
self.__copy = copy
|
||||
self.__run_level = run_level
|
||||
self.__write_to = better_mktemp()
|
||||
self.initiate_token_dict()
|
||||
# self.initiate_token_actions()
|
||||
self.compile_expressions()
|
||||
self.__bracket_count=0
|
||||
self.__exception_handler = exception_handler
|
||||
self.__bug_handler = bug_handler
|
||||
|
||||
def compile_expressions(self):
|
||||
self.__num_exp = re.compile(r"([a-zA-Z]+)(.*)")
|
||||
self.__utf_exp = re.compile(r'(&.*?;)')
|
||||
|
||||
def initiate_token_dict(self):
|
||||
self.__return_code = 0
|
||||
self.dict_token={
|
||||
# unicode
|
||||
'mshex' : ('nu', '__________', self.__ms_hex_func),
|
||||
# brackets
|
||||
'{' : ('nu', '{', self.ob_func),
|
||||
'}' : ('nu', '}', self.cb_func),
|
||||
# microsoft characters
|
||||
'ldblquote' : ('mc', 'ldblquote', self.ms_sub_func),
|
||||
'rdblquote' : ('mc', 'rdblquote', self.ms_sub_func),
|
||||
'rquote' : ('mc', 'rquote', self.ms_sub_func),
|
||||
'lquote' : ('mc', 'lquote', self.ms_sub_func),
|
||||
'emdash' : ('mc', 'emdash', self.ms_sub_func),
|
||||
'endash' : ('mc', 'endash', self.ms_sub_func),
|
||||
'bullet' : ('mc', 'bullet', self.ms_sub_func),
|
||||
'~' : ('mc', '~', self.ms_sub_func),
|
||||
'tab' : ('mc', 'tab', self.ms_sub_func),
|
||||
'_' : ('mc', '_', self.ms_sub_func),
|
||||
';' : ('mc', ';', self.ms_sub_func),
|
||||
# this must be wrong
|
||||
'-' : ('mc', '-', self.ms_sub_func),
|
||||
'line' : ('mi', 'hardline-break', self.direct_conv_func), # calibre
|
||||
# misc => ml
|
||||
'*' : ('ml', 'asterisk__', self.default_func),
|
||||
':' : ('ml', 'colon_____', self.default_func),
|
||||
# text
|
||||
'backslash' : ('nu', '\\', self.text_func),
|
||||
'ob' : ('nu', '{', self.text_func),
|
||||
'cb' : ('nu', '}', self.text_func),
|
||||
# paragraph formatting => pf
|
||||
'page' : ('pf', 'page-break', self.default_func),
|
||||
'par' : ('pf', 'par-end___', self.default_func),
|
||||
'pard' : ('pf', 'par-def___', self.default_func),
|
||||
'keepn' : ('pf', 'keep-w-nex', self.bool_st_func),
|
||||
'widctlpar' : ('pf', 'widow-cntl', self.bool_st_func),
|
||||
'adjustright' : ('pf', 'adjust-rgt', self.bool_st_func),
|
||||
'lang' : ('pf', 'language__', self.__language_func),
|
||||
'ri' : ('pf', 'right-inde', self.divide_by_20),
|
||||
'fi' : ('pf', 'fir-ln-ind', self.divide_by_20),
|
||||
'li' : ('pf', 'left-inden', self.divide_by_20),
|
||||
'sb' : ('pf', 'space-befo', self.divide_by_20),
|
||||
'sa' : ('pf', 'space-afte', self.divide_by_20),
|
||||
'sl' : ('pf', 'line-space', self.divide_by_20),
|
||||
'deftab' : ('pf', 'default-ta', self.divide_by_20),
|
||||
'ql' : ('pf', 'align_____<left', self.two_part_func),
|
||||
'qc' : ('pf', 'align_____<cent', self.two_part_func),
|
||||
'qj' : ('pf', 'align_____<just', self.two_part_func),
|
||||
'qr' : ('pf', 'align_____<right', self.two_part_func),
|
||||
'nowidctlpar' : ('pf', 'widow-cntr<false', self.two_part_func),
|
||||
'tx' : ('pf', 'tab-stop__', self.divide_by_20),
|
||||
'tb' : ('pf', 'tab-bar-st', self.divide_by_20),
|
||||
'tqr' : ('pf', 'tab-right_', self.default_func),
|
||||
'tqdec' : ('pf', 'tab-dec___', self.default_func),
|
||||
'tqc' : ('pf', 'tab-center', self.default_func),
|
||||
'tlul' : ('pf', 'leader-und', self.default_func),
|
||||
'tlhyph' : ('pf', 'leader-hyp', self.default_func),
|
||||
'tldot' : ('pf', 'leader-dot', self.default_func),
|
||||
# stylesheet = > ss
|
||||
'stylesheet' : ('ss', 'style-shet', self.default_func),
|
||||
'sbasedon' : ('ss', 'based-on__', self.default_func),
|
||||
'snext' : ('ss', 'next-style', self.default_func),
|
||||
'cs' : ('ss', 'char-style', self.default_func),
|
||||
's' : ('ss', 'para-style', self.default_func),
|
||||
# graphics => gr
|
||||
'pict' : ('gr', 'picture___', self.default_func),
|
||||
'objclass' : ('gr', 'obj-class_', self.default_func),
|
||||
'macpict' : ('gr', 'mac-pic___', self.default_func),
|
||||
# section => sc
|
||||
'sect' : ('sc', 'section___', self.default_func),
|
||||
'sectd' : ('sc', 'sect-defin', self.default_func),
|
||||
'endhere' : ('sc', 'sect-note_', self.default_func),
|
||||
# list=> ls
|
||||
'pntext' : ('ls', 'list-text_', self.default_func),
|
||||
# this line must be wrong because it duplicates an earlier one
|
||||
'listtext' : ('ls', 'list-text_', self.default_func),
|
||||
'pn' : ('ls', 'list______', self.default_func),
|
||||
'pnseclvl' : ('ls', 'list-level', self.default_func),
|
||||
'pncard' : ('ls', 'list-cardi', self.bool_st_func),
|
||||
'pndec' : ('ls', 'list-decim', self.bool_st_func),
|
||||
'pnucltr' : ('ls', 'list-up-al', self.bool_st_func),
|
||||
'pnucrm' : ('ls', 'list-up-ro', self.bool_st_func),
|
||||
'pnord' : ('ls', 'list-ord__', self.bool_st_func),
|
||||
'pnordt' : ('ls', 'list-ordte', self.bool_st_func),
|
||||
'pnlvlblt' : ('ls', 'list-bulli', self.bool_st_func),
|
||||
'pnlvlbody' : ('ls', 'list-simpi', self.bool_st_func),
|
||||
'pnlvlcont' : ('ls', 'list-conti', self.bool_st_func),
|
||||
'pnhang' : ('ls', 'list-hang_', self.bool_st_func),
|
||||
'pntxtb' : ('ls', 'list-tebef', self.bool_st_func),
|
||||
'ilvl' : ('ls', 'list-level', self.default_func),
|
||||
'ls' : ('ls', 'list-id___', self.default_func),
|
||||
'pnstart' : ('ls', 'list-start', self.default_func),
|
||||
'itap' : ('ls', 'nest-level', self.default_func),
|
||||
'leveltext' : ('ls', 'level-text', self.default_func),
|
||||
'levelnumbers' : ('ls', 'level-numb', self.default_func),
|
||||
'list' : ('ls', 'list-in-tb', self.default_func),
|
||||
'listlevel' : ('ls', 'list-tb-le', self.default_func),
|
||||
'listname' : ('ls', 'list-name_', self.default_func),
|
||||
'listtemplateid' : ('ls', 'ls-tem-id_', self.default_func),
|
||||
'leveltemplateid' : ('ls', 'lv-tem-id_', self.default_func),
|
||||
'listhybrid' : ('ls', 'list-hybri', self.default_func),
|
||||
'levelstartat' : ('ls', 'level-star', self.default_func),
|
||||
'levelspace' : ('ls', 'level-spac', self.divide_by_20),
|
||||
'levelindent' : ('ls', 'level-inde', self.default_func),
|
||||
'levelnfc' : ('ls', 'level-type', self.__list_type_func),
|
||||
'levelnfcn' : ('ls', 'level-type', self.__list_type_func),
|
||||
'listid' : ('ls', 'lis-tbl-id', self.default_func),
|
||||
'listoverride' : ('ls', 'lis-overid', self.default_func),
|
||||
# duplicate
|
||||
'pnlvl' : ('ls', 'list-level', self.default_func),
|
||||
# root info => ri
|
||||
'rtf' : ('ri', 'rtf_______', self.default_func),
|
||||
'deff' : ('ri', 'deflt-font', self.default_func),
|
||||
'mac' : ('ri', 'macintosh_', self.default_func),
|
||||
'pc' : ('ri', 'pc________', self.default_func),
|
||||
'pca' : ('ri', 'pca_______', self.default_func),
|
||||
'ansi' : ('ri', 'ansi______', self.default_func),
|
||||
'ansicpg' : ('ri', 'ansi-codpg', self.default_func),
|
||||
# notes => nt
|
||||
'footnote' : ('nt', 'footnote__', self.default_func),
|
||||
'ftnalt' : ('nt', 'type______<endnote', self.two_part_func),
|
||||
# anchor => an
|
||||
'tc' : ('an', 'toc_______', self.default_func),
|
||||
'bkmkstt' : ('an', 'book-mk-st', self.default_func),
|
||||
'bkmkstart' : ('an', 'book-mk-st', self.default_func),
|
||||
'bkmkend' : ('an', 'book-mk-en', self.default_func),
|
||||
'xe' : ('an', 'index-mark', self.default_func),
|
||||
'rxe' : ('an', 'place_____', self.default_func),
|
||||
# index => in
|
||||
'bxe' : ('in', 'index-bold', self.default_func),
|
||||
'ixe' : ('in', 'index-ital', self.default_func),
|
||||
'txe' : ('in', 'index-see_', self.default_func),
|
||||
# table of contents => tc
|
||||
'tcl' : ('tc', 'toc-level_', self.default_func),
|
||||
'tcn' : ('tc', 'toc-sup-nu', self.default_func),
|
||||
# field => fd
|
||||
'field' : ('fd', 'field_____', self.default_func),
|
||||
'fldinst' : ('fd', 'field-inst', self.default_func),
|
||||
'fldrslt' : ('fd', 'field-rslt', self.default_func),
|
||||
'datafield' : ('fd', 'datafield_', self.default_func),
|
||||
# info-tables => it
|
||||
'fonttbl' : ('it', 'font-table', self.default_func),
|
||||
'colortbl' : ('it', 'colr-table', self.default_func),
|
||||
'listoverridetable' : ('it', 'lovr-table', self.default_func),
|
||||
'listtable' : ('it', 'listtable_', self.default_func),
|
||||
'revtbl' : ('it', 'revi-table', self.default_func),
|
||||
# character info => ci
|
||||
'b' : ('ci', 'bold______', self.bool_st_func),
|
||||
'blue' : ('ci', 'blue______', self.color_func),
|
||||
'caps' : ('ci', 'caps______', self.bool_st_func),
|
||||
'cf' : ('ci', 'font-color', self.colorz_func),
|
||||
'chftn' : ('ci', 'footnot-mk', self.bool_st_func),
|
||||
'dn' : ('ci', 'font-down_', self.divide_by_2),
|
||||
'embo' : ('ci', 'emboss____', self.bool_st_func),
|
||||
'f' : ('ci', 'font-style', self.default_func),
|
||||
'fs' : ('ci', 'font-size_', self.divide_by_2),
|
||||
'green' : ('ci', 'green_____', self.color_func),
|
||||
'i' : ('ci', 'italics___', self.bool_st_func),
|
||||
'impr' : ('ci', 'engrave___', self.bool_st_func),
|
||||
'outl' : ('ci', 'outline___', self.bool_st_func),
|
||||
'plain' : ('ci', 'plain_____', self.bool_st_func),
|
||||
'red' : ('ci', 'red_______', self.color_func),
|
||||
'scaps' : ('ci', 'small-caps', self.bool_st_func),
|
||||
'shad' : ('ci', 'shadow____', self.bool_st_func),
|
||||
'strike' : ('ci', 'strike-thr', self.bool_st_func),
|
||||
'striked' : ('ci', 'dbl-strike', self.bool_st_func),
|
||||
'sub' : ('ci', 'subscript_', self.bool_st_func),
|
||||
'super' : ('ci', 'superscrip', self.bool_st_func),
|
||||
'nosupersub' : ('ci', 'no-su-supe', self.__no_sup_sub_func),
|
||||
'up' : ('ci', 'font-up___', self.divide_by_2),
|
||||
'v' : ('ci', 'hidden____', self.default_func),
|
||||
# underline
|
||||
# can't see why it isn't a char info: 'ul'=>'ci'
|
||||
'ul' : ('ci', 'underlined<continous', self.two_part_func),
|
||||
'uld' : ('ci', 'underlined<dotted', self.two_part_func),
|
||||
'uldash' : ('ci', 'underlined<dash', self.two_part_func),
|
||||
'uldashd' : ('ci', 'underlined<dash-dot', self.two_part_func),
|
||||
'uldashdd' : ('ci', 'underlined<dash-dot-dot', self.two_part_func),
|
||||
'uldb' : ('ci', 'underlined<double', self.two_part_func),
|
||||
'ulhwave' : ('ci', 'underlined<heavy-wave', self.two_part_func),
|
||||
'ulldash' : ('ci', 'underlined<long-dash', self.two_part_func),
|
||||
'ulth' : ('ci', 'underlined<thich', self.two_part_func),
|
||||
'ulthd' : ('ci', 'underlined<thick-dotted', self.two_part_func),
|
||||
'ulthdash' : ('ci', 'underlined<thick-dash', self.two_part_func),
|
||||
'ulthdashd' : ('ci', 'underlined<thick-dash-dot', self.two_part_func),
|
||||
'ulthdashdd' : ('ci', 'underlined<thick-dash-dot-dot', self.two_part_func),
|
||||
'ulthldash' : ('ci', 'underlined<thick-long-dash', self.two_part_func),
|
||||
'ululdbwave' : ('ci', 'underlined<double-wave', self.two_part_func),
|
||||
'ulw' : ('ci', 'underlined<word', self.two_part_func),
|
||||
'ulwave' : ('ci', 'underlined<wave', self.two_part_func),
|
||||
'ulnone' : ('ci', 'underlined<false', self.two_part_func),
|
||||
# table => tb
|
||||
'trowd' : ('tb', 'row-def___', self.default_func),
|
||||
'cell' : ('tb', 'cell______', self.default_func),
|
||||
'row' : ('tb', 'row_______', self.default_func),
|
||||
'intbl' : ('tb', 'in-table__', self.default_func),
|
||||
'cols' : ('tb', 'columns___', self.default_func),
|
||||
'trleft' : ('tb', 'row-pos-le', self.divide_by_20),
|
||||
'cellx' : ('tb', 'cell-posit', self.divide_by_20),
|
||||
'trhdr' : ('tb', 'row-header', self.default_func),
|
||||
# preamble => pr
|
||||
# document information => di
|
||||
# TODO integrate \userprops
|
||||
'info' : ('di', 'doc-info__', self.default_func),
|
||||
'title' : ('di', 'title_____', self.default_func),
|
||||
'author' : ('di', 'author____', self.default_func),
|
||||
'operator' : ('di', 'operator__', self.default_func),
|
||||
'manager' : ('di', 'manager___', self.default_func),
|
||||
'company' : ('di', 'company___', self.default_func),
|
||||
'keywords' : ('di', 'keywords__', self.default_func),
|
||||
'category' : ('di', 'category__', self.default_func),
|
||||
'doccomm' : ('di', 'doc-notes_', self.default_func),
|
||||
'comment' : ('di', 'doc-notes_', self.default_func),
|
||||
'subject' : ('di', 'subject___', self.default_func),
|
||||
'creatim' : ('di', 'create-tim', self.default_func),
|
||||
'yr' : ('di', 'year______', self.default_func),
|
||||
'mo' : ('di', 'month_____', self.default_func),
|
||||
'dy' : ('di', 'day_______', self.default_func),
|
||||
'min' : ('di', 'minute____', self.default_func),
|
||||
'sec' : ('di', 'second____', self.default_func),
|
||||
'revtim' : ('di', 'revis-time', self.default_func),
|
||||
'edmins' : ('di', 'edit-time_', self.default_func),
|
||||
'printim' : ('di', 'print-time', self.default_func),
|
||||
'buptim' : ('di', 'backuptime', self.default_func),
|
||||
'nofwords' : ('di', 'num-of-wor', self.default_func),
|
||||
'nofchars' : ('di', 'num-of-chr', self.default_func),
|
||||
'nofcharsws' : ('di', 'numofchrws', self.default_func),
|
||||
'nofpages' : ('di', 'num-of-pag', self.default_func),
|
||||
'version' : ('di', 'version___', self.default_func),
|
||||
'vern' : ('di', 'intern-ver', self.default_func),
|
||||
'hlinkbase' : ('di', 'linkbase__', self.default_func),
|
||||
'id' : ('di', 'internalID', self.default_func),
|
||||
# headers and footers => hf
|
||||
'headerf' : ('hf', 'head-first', self.default_func),
|
||||
'headerl' : ('hf', 'head-left_', self.default_func),
|
||||
'headerr' : ('hf', 'head-right', self.default_func),
|
||||
'footerf' : ('hf', 'foot-first', self.default_func),
|
||||
'footerl' : ('hf', 'foot-left_', self.default_func),
|
||||
'footerr' : ('hf', 'foot-right', self.default_func),
|
||||
'header' : ('hf', 'header____', self.default_func),
|
||||
'footer' : ('hf', 'footer____', self.default_func),
|
||||
# page => pa
|
||||
'margl' : ('pa', 'margin-lef', self.divide_by_20),
|
||||
'margr' : ('pa', 'margin-rig', self.divide_by_20),
|
||||
'margb' : ('pa', 'margin-bot', self.divide_by_20),
|
||||
'margt' : ('pa', 'margin-top', self.divide_by_20),
|
||||
'gutter' : ('pa', 'gutter____', self.divide_by_20),
|
||||
'paperw' : ('pa', 'paper-widt', self.divide_by_20),
|
||||
'paperh' : ('pa', 'paper-hght', self.divide_by_20),
|
||||
# annotation => an
|
||||
'annotation' : ('an', 'annotation', self.default_func),
|
||||
# border => bd
|
||||
'trbrdrh' : ('bd', 'bor-t-r-hi', self.default_func),
|
||||
'trbrdrv' : ('bd', 'bor-t-r-vi', self.default_func),
|
||||
'trbrdrt' : ('bd', 'bor-t-r-to', self.default_func),
|
||||
'trbrdrl' : ('bd', 'bor-t-r-le', self.default_func),
|
||||
'trbrdrb' : ('bd', 'bor-t-r-bo', self.default_func),
|
||||
'trbrdrr' : ('bd', 'bor-t-r-ri', self.default_func),
|
||||
'clbrdrb' : ('bd', 'bor-cel-bo', self.default_func),
|
||||
'clbrdrt' : ('bd', 'bor-cel-to', self.default_func),
|
||||
'clbrdrl' : ('bd', 'bor-cel-le', self.default_func),
|
||||
'clbrdrr' : ('bd', 'bor-cel-ri', self.default_func),
|
||||
'brdrb' : ('bd', 'bor-par-bo', self.default_func),
|
||||
'brdrt' : ('bd', 'bor-par-to', self.default_func),
|
||||
'brdrl' : ('bd', 'bor-par-le', self.default_func),
|
||||
'brdrr' : ('bd', 'bor-par-ri', self.default_func),
|
||||
'box' : ('bd', 'bor-par-bx', self.default_func),
|
||||
'chbrdr' : ('bd', 'bor-par-bo', self.default_func),
|
||||
'brdrbtw' : ('bd', 'bor-for-ev', self.default_func),
|
||||
'brdrbar' : ('bd', 'bor-outsid', self.default_func),
|
||||
'brdrnone' : ('bd', 'bor-none__<false', self.two_part_func),
|
||||
# border type => bt
|
||||
'brdrs' : ('bt', 'bdr-single', self.default_func),
|
||||
'brdrth' : ('bt', 'bdr-doubtb', self.default_func),
|
||||
'brdrsh' : ('bt', 'bdr-shadow', self.default_func),
|
||||
'brdrdb' : ('bt', 'bdr-double', self.default_func),
|
||||
'brdrdot' : ('bt', 'bdr-dotted', self.default_func),
|
||||
'brdrdash' : ('bt', 'bdr-dashed', self.default_func),
|
||||
'brdrhair' : ('bt', 'bdr-hair__', self.default_func),
|
||||
'brdrinset' : ('bt', 'bdr-inset_', self.default_func),
|
||||
'brdrdashsm' : ('bt', 'bdr-das-sm', self.default_func),
|
||||
'brdrdashd' : ('bt', 'bdr-dot-sm', self.default_func),
|
||||
'brdrdashdd' : ('bt', 'bdr-dot-do', self.default_func),
|
||||
'brdroutset' : ('bt', 'bdr-outset', self.default_func),
|
||||
'brdrtriple' : ('bt', 'bdr-trippl', self.default_func),
|
||||
'brdrtnthsg' : ('bt', 'bdr-thsm__', self.default_func),
|
||||
'brdrthtnsg' : ('bt', 'bdr-htsm__', self.default_func),
|
||||
'brdrtnthtnsg' : ('bt', 'bdr-hthsm_', self.default_func),
|
||||
'brdrtnthmg' : ('bt', 'bdr-thm___', self.default_func),
|
||||
'brdrthtnmg' : ('bt', 'bdr-htm___', self.default_func),
|
||||
'brdrtnthtnmg' : ('bt', 'bdr-hthm__', self.default_func),
|
||||
'brdrtnthlg' : ('bt', 'bdr-thl___', self.default_func),
|
||||
'brdrtnthtnlg' : ('bt', 'bdr-hthl__', self.default_func),
|
||||
'brdrwavy' : ('bt', 'bdr-wavy__', self.default_func),
|
||||
'brdrwavydb' : ('bt', 'bdr-d-wav_', self.default_func),
|
||||
'brdrdashdotstr' : ('bt', 'bdr-strip_', self.default_func),
|
||||
'brdremboss' : ('bt', 'bdr-embos_', self.default_func),
|
||||
'brdrengrave' : ('bt', 'bdr-engra_', self.default_func),
|
||||
'brdrframe' : ('bt', 'bdr-frame_', self.default_func),
|
||||
'brdrw' : ('bt', 'bdr-li-wid', self.divide_by_20),
|
||||
'brsp' : ('bt', 'bdr-sp-wid', self.divide_by_20),
|
||||
'brdrcf' : ('bt', 'bdr-color_', self.default_func),
|
||||
# comments
|
||||
# 'comment' : ('cm', 'comment___', self.default_func),
|
||||
}
|
||||
self.__number_type_dict = {
|
||||
0: 'Arabic',
|
||||
1: 'uppercase Roman numeral',
|
||||
2: 'lowercase Roman numeral',
|
||||
3: 'uppercase letter',
|
||||
4: 'lowercase letter',
|
||||
5: 'ordinal number',
|
||||
6: 'cardianl text number',
|
||||
7: 'ordinal text number',
|
||||
10: 'Kanji numbering without the digit character',
|
||||
11: 'Kanji numbering with the digit character',
|
||||
1246: 'phonetic Katakana characters in aiueo order',
|
||||
1346: 'phonetic katakana characters in iroha order',
|
||||
14: 'double byte character',
|
||||
15: 'single byte character',
|
||||
16: 'Kanji numbering 3',
|
||||
17: 'Kanji numbering 4',
|
||||
18: 'Circle numbering' ,
|
||||
19: 'double-byte Arabic numbering',
|
||||
2046: 'phonetic double-byte Katakana characters',
|
||||
2146: 'phonetic double-byte katakana characters',
|
||||
22: 'Arabic with leading zero',
|
||||
23: 'bullet',
|
||||
24: 'Korean numbering 2',
|
||||
25: 'Korean numbering 1',
|
||||
26: 'Chinese numbering 1',
|
||||
27: 'Chinese numbering 2',
|
||||
28: 'Chinese numbering 3',
|
||||
29: 'Chinese numbering 4',
|
||||
30: 'Chinese Zodiac numbering 1',
|
||||
31: 'Chinese Zodiac numbering 2',
|
||||
32: 'Chinese Zodiac numbering 3',
|
||||
33: 'Taiwanese double-byte numbering 1',
|
||||
34: 'Taiwanese double-byte numbering 2',
|
||||
35: 'Taiwanese double-byte numbering 3',
|
||||
36: 'Taiwanese double-byte numbering 4',
|
||||
37: 'Chinese double-byte numbering 1',
|
||||
38: 'Chinese double-byte numbering 2',
|
||||
39: 'Chinese double-byte numbering 3',
|
||||
40: 'Chinese double-byte numbering 4',
|
||||
41: 'Korean double-byte numbering 1',
|
||||
42: 'Korean double-byte numbering 2',
|
||||
43: 'Korean double-byte numbering 3',
|
||||
44: 'Korean double-byte numbering 4',
|
||||
45: 'Hebrew non-standard decimal',
|
||||
46: 'Arabic Alif Ba Tah',
|
||||
47: 'Hebrew Biblical standard',
|
||||
48: 'Arabic Abjad style',
|
||||
255: 'No number',
|
||||
}
|
||||
self.__language_dict = {
|
||||
1078 : 'Afrikaans',
|
||||
1052 : 'Albanian',
|
||||
1025 : 'Arabic',
|
||||
5121 : 'Arabic Algeria',
|
||||
15361 : 'Arabic Bahrain',
|
||||
3073 : 'Arabic Egypt',
|
||||
1 : 'Arabic General',
|
||||
2049 : 'Arabic Iraq',
|
||||
11265 : 'Arabic Jordan',
|
||||
13313 : 'Arabic Kuwait',
|
||||
12289 : 'Arabic Lebanon',
|
||||
4097 : 'Arabic Libya',
|
||||
6145 : 'Arabic Morocco',
|
||||
8193 : 'Arabic Oman',
|
||||
16385 : 'Arabic Qatar',
|
||||
10241 : 'Arabic Syria',
|
||||
7169 : 'Arabic Tunisia',
|
||||
14337 : 'Arabic U.A.E.',
|
||||
9217 : 'Arabic Yemen',
|
||||
1067 : 'Armenian',
|
||||
1101 : 'Assamese',
|
||||
2092 : 'Azeri Cyrillic',
|
||||
1068 : 'Azeri Latin',
|
||||
1069 : 'Basque',
|
||||
1093 : 'Bengali',
|
||||
4122 : 'Bosnia Herzegovina',
|
||||
1026 : 'Bulgarian',
|
||||
1109 : 'Burmese',
|
||||
1059 : 'Byelorussian',
|
||||
1027 : 'Catalan',
|
||||
2052 : 'Chinese China',
|
||||
4 : 'Chinese General',
|
||||
3076 : 'Chinese Hong Kong',
|
||||
4100 : 'Chinese Singapore',
|
||||
1028 : 'Chinese Taiwan',
|
||||
1050 : 'Croatian',
|
||||
1029 : 'Czech',
|
||||
1030 : 'Danish',
|
||||
2067 : 'Dutch Belgium',
|
||||
1043 : 'Dutch Standard',
|
||||
3081 : 'English Australia',
|
||||
10249 : 'English Belize',
|
||||
2057 : 'English British',
|
||||
4105 : 'English Canada',
|
||||
9225 : 'English Caribbean',
|
||||
9 : 'English General',
|
||||
6153 : 'English Ireland',
|
||||
8201 : 'English Jamaica',
|
||||
5129 : 'English New Zealand',
|
||||
13321 : 'English Philippines',
|
||||
7177 : 'English South Africa',
|
||||
11273 : 'English Trinidad',
|
||||
1033 : 'English United States',
|
||||
1061 : 'Estonian',
|
||||
1080 : 'Faerose',
|
||||
1065 : 'Farsi',
|
||||
1035 : 'Finnish',
|
||||
1036 : 'French',
|
||||
2060 : 'French Belgium',
|
||||
11276 : 'French Cameroon',
|
||||
3084 : 'French Canada',
|
||||
12300 : 'French Cote d\'Ivoire',
|
||||
5132 : 'French Luxembourg',
|
||||
13324 : 'French Mali',
|
||||
6156 : 'French Monaco',
|
||||
8204 : 'French Reunion',
|
||||
10252 : 'French Senegal',
|
||||
4108 : 'French Swiss',
|
||||
7180 : 'French West Indies',
|
||||
9228 : 'French Democratic Republic of the Congo',
|
||||
1122 : 'Frisian',
|
||||
1084 : 'Gaelic',
|
||||
2108 : 'Gaelic Ireland',
|
||||
1110 : 'Galician',
|
||||
1079 : 'Georgian',
|
||||
1031 : 'German',
|
||||
3079 : 'German Austrian',
|
||||
5127 : 'German Liechtenstein',
|
||||
4103 : 'German Luxembourg',
|
||||
2055 : 'German Switzerland',
|
||||
1032 : 'Greek',
|
||||
1095 : 'Gujarati',
|
||||
1037 : 'Hebrew',
|
||||
1081 : 'Hindi',
|
||||
1038 : 'Hungarian',
|
||||
1039 : 'Icelandic',
|
||||
1057 : 'Indonesian',
|
||||
1040 : 'Italian',
|
||||
2064 : 'Italian Switzerland',
|
||||
1041 : 'Japanese',
|
||||
1099 : 'Kannada',
|
||||
1120 : 'Kashmiri',
|
||||
2144 : 'Kashmiri India',
|
||||
1087 : 'Kazakh',
|
||||
1107 : 'Khmer',
|
||||
1088 : 'Kirghiz',
|
||||
1111 : 'Konkani',
|
||||
1042 : 'Korean',
|
||||
2066 : 'Korean Johab',
|
||||
1108 : 'Lao',
|
||||
1062 : 'Latvian',
|
||||
1063 : 'Lithuanian',
|
||||
2087 : 'Lithuanian Classic',
|
||||
1086 : 'Malay',
|
||||
2110 : 'Malay Brunei Darussalam',
|
||||
1100 : 'Malayalam',
|
||||
1082 : 'Maltese',
|
||||
1112 : 'Manipuri',
|
||||
1102 : 'Marathi',
|
||||
1104 : 'Mongolian',
|
||||
1121 : 'Nepali',
|
||||
2145 : 'Nepali India',
|
||||
1044 : 'Norwegian Bokmal',
|
||||
2068 : 'Norwegian Nynorsk',
|
||||
1096 : 'Oriya',
|
||||
1045 : 'Polish',
|
||||
1046 : 'Portuguese (Brazil)',
|
||||
2070 : 'Portuguese (Portugal)',
|
||||
1094 : 'Punjabi',
|
||||
1047 : 'Rhaeto-Romanic',
|
||||
1048 : 'Romanian',
|
||||
2072 : 'Romanian Moldova',
|
||||
1049 : 'Russian',
|
||||
2073 : 'Russian Moldova',
|
||||
1083 : 'Sami Lappish',
|
||||
1103 : 'Sanskrit',
|
||||
3098 : 'Serbian Cyrillic',
|
||||
2074 : 'Serbian Latin',
|
||||
1113 : 'Sindhi',
|
||||
1051 : 'Slovak',
|
||||
1060 : 'Slovenian',
|
||||
1070 : 'Sorbian',
|
||||
11274 : 'Spanish Argentina',
|
||||
16394 : 'Spanish Bolivia',
|
||||
13322 : 'Spanish Chile',
|
||||
9226 : 'Spanish Colombia',
|
||||
5130 : 'Spanish Costa Rica',
|
||||
7178 : 'Spanish Dominican Republic',
|
||||
12298 : 'Spanish Ecuador',
|
||||
17418 : 'Spanish El Salvador',
|
||||
4106 : 'Spanish Guatemala',
|
||||
18442 : 'Spanish Honduras',
|
||||
2058 : 'Spanish Mexico',
|
||||
3082 : 'Spanish Modern',
|
||||
19466 : 'Spanish Nicaragua',
|
||||
6154 : 'Spanish Panama',
|
||||
15370 : 'Spanish Paraguay',
|
||||
10250 : 'Spanish Peru',
|
||||
20490 : 'Spanish Puerto Rico',
|
||||
1034 : 'Spanish Traditional',
|
||||
14346 : 'Spanish Uruguay',
|
||||
8202 : 'Spanish Venezuela',
|
||||
1072 : 'Sutu',
|
||||
1089 : 'Swahili',
|
||||
1053 : 'Swedish',
|
||||
2077 : 'Swedish Finland',
|
||||
1064 : 'Tajik',
|
||||
1097 : 'Tamil',
|
||||
1092 : 'Tatar',
|
||||
1098 : 'Telugu',
|
||||
1054 : 'Thai',
|
||||
1105 : 'Tibetan',
|
||||
1073 : 'Tsonga',
|
||||
1074 : 'Tswana',
|
||||
1055 : 'Turkish',
|
||||
1090 : 'Turkmen',
|
||||
1058 : 'Ukranian',
|
||||
1056 : 'Urdu',
|
||||
2080 : 'Urdu India',
|
||||
2115 : 'Uzbek Cyrillic',
|
||||
1091 : 'Uzbek Latin',
|
||||
1075 : 'Venda',
|
||||
1066 : 'Vietnamese',
|
||||
1106 : 'Welsh',
|
||||
1076 : 'Xhosa',
|
||||
1085 : 'Yiddish',
|
||||
1077 : 'Zulu',
|
||||
1024 : 'Unkown',
|
||||
255 : 'Unkown',
|
||||
}
|
||||
"""
|
||||
# unknown
|
||||
# These must get passed on because they occure after \\*
|
||||
'do' : ('un', 'unknown___', self.default_func),
|
||||
'company' : ('un', 'company___', self.default_func),
|
||||
'shpinst' : ('un', 'unknown___', self.default_func),
|
||||
'panose' : ('un', 'unknown___', self.default_func),
|
||||
'falt' : ('un', 'unknown___', self.default_func),
|
||||
'listoverridetable' : ('un', 'unknown___', self.default_func),
|
||||
'category' : ('un', 'unknown___', self.default_func),
|
||||
'template' : ('un', 'unknown___', self.default_func),
|
||||
'ud' : ('un', 'unknown___', self.default_func),
|
||||
'formfield' : ('un', 'unknown___', self.default_func),
|
||||
'ts' : ('un', 'unknown___', self.default_func),
|
||||
'rsidtbl' : ('un', 'unknown___', self.default_func),
|
||||
'generator' : ('un', 'unknown___', self.default_func),
|
||||
'ftnsep' : ('un', 'unknown___', self.default_func),
|
||||
'aftnsep' : ('un', 'unknown___', self.default_func),
|
||||
'aftnsepc' : ('un', 'unknown___', self.default_func),
|
||||
'aftncn' : ('un', 'unknown___', self.default_func),
|
||||
'objclass' : ('un', 'unknown___', self.default_func),
|
||||
'objdata' : ('un', 'unknown___', self.default_func),
|
||||
'picprop' : ('un', 'unknown___', self.default_func),
|
||||
'blipuid' : ('un', 'unknown___', self.default_func),
|
||||
"""
|
||||
|
||||
def __ms_hex_func(self, pre, token, num):
|
||||
num = num[1:] # chop off leading 0, which I added
|
||||
num = num.upper() # the mappings store hex in caps
|
||||
return 'tx<hx<__________<\'%s\n' % num # add an ' for the mappings
|
||||
|
||||
def ms_sub_func(self, pre, token, num):
|
||||
return 'tx<mc<__________<%s\n' % token
|
||||
|
||||
def direct_conv_func(self, pre, token, num):
|
||||
return 'mi<tg<empty_____<%s\n' % token
|
||||
|
||||
def default_func(self, pre, token, num):
|
||||
if num is None:
|
||||
num = 'true'
|
||||
return 'cw<%s<%s<nu<%s\n' % (pre, token, num)
|
||||
|
||||
def colorz_func(self, pre, token, num):
|
||||
if num is None:
|
||||
num = '0'
|
||||
return 'cw<%s<%s<nu<%s\n' % (pre, token, num)
|
||||
|
||||
def __list_type_func(self, pre, token, num):
|
||||
type = 'arabic'
|
||||
if num is None:
|
||||
type = 'Arabic'
|
||||
else:
|
||||
try:
|
||||
num = int(num)
|
||||
except ValueError:
|
||||
if self.__run_level > 3:
|
||||
msg = 'Number "%s" cannot be converted to integer\n' % num
|
||||
raise self.__bug_handler(msg)
|
||||
type = self.__number_type_dict.get(num)
|
||||
if type is None:
|
||||
if self.__run_level > 3:
|
||||
msg = 'No type for "%s" in self.__number_type_dict\n'
|
||||
raise self.__bug_handler
|
||||
type = 'Arabic'
|
||||
return 'cw<%s<%s<nu<%s\n' % (pre, token, type)
|
||||
|
||||
def __language_func(self, pre, token, num):
|
||||
lang_name = self.__language_dict.get(int(re.search('[0-9]+', num).group()))
|
||||
if not lang_name:
|
||||
lang_name = "not defined"
|
||||
if self.__run_level > 3:
|
||||
msg = 'No entry for number "%s"' % num
|
||||
raise self.__bug_handler(msg)
|
||||
return 'cw<%s<%s<nu<%s\n' % (pre, token, lang_name)
|
||||
|
||||
def two_part_func(self, pre, token, num):
|
||||
list = token.split("<")
|
||||
token = list[0]
|
||||
num = list[1]
|
||||
return 'cw<%s<%s<nu<%s\n' % (pre, token, num)
|
||||
# return 'cw<nu<nu<nu<%s>num<%s\n' % (token, num)
|
||||
|
||||
def divide_by_2(self, pre, token, num):
|
||||
num = self.divide_num(num, 2)
|
||||
return 'cw<%s<%s<nu<%s\n' % (pre, token, num)
|
||||
# return 'cw<nu<nu<nu<%s>%s<%s\n' % (token, num, token)
|
||||
|
||||
def divide_by_20(self, pre, token, num):
|
||||
num = self.divide_num(num, 20)
|
||||
return 'cw<%s<%s<nu<%s\n' % (pre, token, num)
|
||||
# return 'cw<nu<nu<nu<%s>%s<%s\n' % (token, num, token)
|
||||
|
||||
def text_func(self, pre, token, num=None):
|
||||
return 'tx<nu<__________<%s\n' % token
|
||||
|
||||
def ob_func(self, pre, token, num=None):
|
||||
self.__bracket_count += 1
|
||||
return 'ob<nu<open-brack<%04d\n' % self.__bracket_count
|
||||
|
||||
def cb_func(self, pre, token, num=None):
|
||||
line = 'cb<nu<clos-brack<%04d\n' % self.__bracket_count
|
||||
self.__bracket_count -= 1
|
||||
return line
|
||||
|
||||
def color_func(self, pre, token, num):
|
||||
third_field = 'nu'
|
||||
if num[-1] == ';':
|
||||
num = num[:-1]
|
||||
third_field = 'en'
|
||||
num = unicode_type('%X' % int(num))
|
||||
if len(num) != 2:
|
||||
num = "0" + num
|
||||
return 'cw<%s<%s<%s<%s\n' % (pre, token, third_field, num)
|
||||
# return 'cw<cl<%s<nu<nu<%s>%s<%s\n' % (third_field, token, num, token)
|
||||
|
||||
def bool_st_func(self, pre, token, num):
|
||||
if num is None or num == '' or num == '1':
|
||||
return 'cw<%s<%s<nu<true\n' % (pre, token)
|
||||
# return 'cw<nu<nu<nu<%s>true<%s\n' % (token, token)
|
||||
elif num == '0':
|
||||
return 'cw<%s<%s<nu<false\n' % (pre, token)
|
||||
# return 'cw<nu<nu<nu<%s>false<%s\n' % (token, token)
|
||||
else:
|
||||
msg = "boolean should have some value module process tokens\ntoken is %s\n'%s'\n" % (token, num)
|
||||
raise self.__bug_handler(msg)
|
||||
|
||||
def __no_sup_sub_func(self, pre, token, num):
|
||||
the_string = 'cw<ci<subscript_<nu<false\n'
|
||||
the_string += 'cw<ci<superscrip<nu<false\n'
|
||||
return the_string
|
||||
|
||||
def divide_num(self, numerator, denominator):
|
||||
try:
|
||||
# calibre why ignore negative number? Wrong in case of \fi
|
||||
numerator = float(re.search('[0-9.\\-]+', numerator).group())
|
||||
except TypeError as msg:
|
||||
if self.__run_level > 3:
|
||||
msg = ('No number to process?\nthis indicates that the token \\(\\li\\) \
|
||||
should have a number and does not\nnumerator is \
|
||||
"%s"\ndenominator is "%s"\n') % (numerator, denominator)
|
||||
raise self.__bug_handler(msg)
|
||||
if 5 > self.__return_code:
|
||||
self.__return_code = 5
|
||||
return 0
|
||||
num = '%0.2f' % round(numerator/denominator, 2)
|
||||
return num
|
||||
string_num = unicode_type(num)
|
||||
if string_num[-2:] == ".0":
|
||||
string_num = string_num[:-2]
|
||||
return string_num
|
||||
|
||||
def split_let_num(self, token):
|
||||
match_obj = re.search(self.__num_exp,token)
|
||||
if match_obj is not None:
|
||||
first = match_obj.group(1)
|
||||
second = match_obj.group(2)
|
||||
if not second:
|
||||
if self.__run_level > 3:
|
||||
msg = "token is '%s' \n" % token
|
||||
raise self.__bug_handler(msg)
|
||||
return first, 0
|
||||
else:
|
||||
if self.__run_level > 3:
|
||||
msg = "token is '%s' \n" % token
|
||||
raise self.__bug_handler
|
||||
return token, 0
|
||||
return first, second
|
||||
|
||||
def convert_to_hex(self,number):
|
||||
"""Convert a string to uppercase hexidecimal"""
|
||||
num = int(number)
|
||||
try:
|
||||
hex_num = "%X" % num
|
||||
return hex_num
|
||||
except:
|
||||
raise self.__bug_handler
|
||||
|
||||
def process_cw(self, token):
|
||||
"""Change the value of the control word by determining what dictionary
|
||||
it belongs to"""
|
||||
special = ['*', ':', '}', '{', '~', '_', '-', ';']
|
||||
# if token != "{" or token != "}":
|
||||
token = token[1:] # strip off leading \
|
||||
token = token.replace(" ", "")
|
||||
# if not token: return
|
||||
only_alpha = token.isalpha()
|
||||
num = None
|
||||
if not only_alpha and token not in special:
|
||||
token, num = self.split_let_num(token)
|
||||
pre, token, action = self.dict_token.get(token, (None, None, None))
|
||||
if action:
|
||||
return action(pre, token, num)
|
||||
|
||||
def __check_brackets(self, in_file):
|
||||
self.__check_brack_obj = check_brackets.CheckBrackets(file=in_file)
|
||||
good_br = self.__check_brack_obj.check_brackets()[0]
|
||||
if not good_br:
|
||||
return 1
|
||||
|
||||
    def process_tokens(self):
        """Main method for handling other methods.

        Reads the tokenized file one line (== one token) at a time and
        writes the transformed token stream to self.__write_to.  Control
        words are dispatched through process_cw(); everything else is
        emitted as text tokens.  The output then replaces the input file
        and the bracket balance of the result is verified.
        Raises self.__exception_handler for structurally invalid RTF.
        """
        line_count = 0
        with open_for_read(self.__file) as read_obj:
            with open_for_write(self.__write_to) as write_obj:
                for line in read_obj:
                    token = line.replace("\n", "")
                    line_count += 1
                    # the first two tokens of valid RTF must be "{" and "\rtfN"
                    if line_count == 1 and token != '\\{':
                        msg = '\nInvalid RTF: document doesn\'t start with {\n'
                        raise self.__exception_handler(msg)
                    elif line_count == 2 and token[0:4] != '\\rtf':
                        msg = '\nInvalid RTF: document doesn\'t start with \\rtf \n'
                        raise self.__exception_handler(msg)

                    the_index = token.find('\\ ')
                    # NOTE(review): token can never be None here (it comes from
                    # str.replace), so that half of the test is vestigial
                    if token is not None and the_index > -1:
                        msg = '\nInvalid RTF: token "\\ " not valid.\nError at line %d'\
                            % line_count
                        raise self.__exception_handler(msg)
                    elif token[:1] == "\\":
                        line = self.process_cw(token)
                        if line is not None:
                            write_obj.write(line)
                    else:
                        # split plain text on embedded unicode escapes so each
                        # run is emitted as its own token
                        fields = re.split(self.__utf_exp, token)
                        for field in fields:
                            if not field:
                                continue
                            if field[0:1] == '&':
                                write_obj.write('tx<ut<__________<%s\n' % field)
                            else:
                                write_obj.write('tx<nu<__________<%s\n' % field)

        if not line_count:
            msg = '\nInvalid RTF: file appears to be empty.\n'
            raise self.__exception_handler(msg)

        # replace the input file with the processed stream
        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "processed_tokens.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)

        # sanity-check the produced token stream
        bad_brackets = self.__check_brackets(self.__file)
        if bad_brackets:
            msg = '\nInvalid RTF: document does not have matching brackets.\n'
            raise self.__exception_handler(msg)
        else:
            return self.__return_code
|
||||
538
ebook_converter/ebooks/rtf2xml/sections.py
Normal file
538
ebook_converter/ebooks/rtf2xml/sections.py
Normal file
@@ -0,0 +1,538 @@
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import sys, os
|
||||
|
||||
from calibre.ebooks.rtf2xml import copy
|
||||
from calibre.ptempfile import better_mktemp
|
||||
from polyglot.builtins import unicode_type
|
||||
|
||||
from . import open_for_read, open_for_write
|
||||
|
||||
|
||||
class Sections:
|
||||
"""
|
||||
=================
|
||||
Purpose
|
||||
=================
|
||||
    Write section tags for a tokenized file. (This module won't be of any use
    to you unless you use it as part of the other modules.)
|
||||
---------------
|
||||
logic
|
||||
---------------
|
||||
The tags for the first section breaks have already been written.
|
||||
RTF stores section breaks with the \\sect tag. Each time this tag is
|
||||
encountered, add one to the counter.
|
||||
When I encounter the \\sectd tag, I want to collect all the appropriate tokens
|
||||
    that describe the section. When I reach a \\pard, I know I can stop collecting
|
||||
tokens and write the section tags.
|
||||
The exception to this method occurs when sections occur in field blocks, such
|
||||
as the index. Normally, two section break occur within the index and other
|
||||
field-blocks. (If less or more section breaks occur, this code may not work.)
|
||||
I want the sections to occur outside of the index. That is, the index
|
||||
should be nested inside one section tag. After the index is complete, a new
|
||||
section should begin.
|
||||
In order to write the sections outside of the field blocks, I have to store
|
||||
    all of the field block as a string. When I encounter the \\sect tag, add one to
|
||||
the section counter, but store this number in a list. Likewise, store the
|
||||
information describing the section in another list.
|
||||
When I reach the end of the field block, choose the first item from the
|
||||
numbered list as the section number. Choose the first item in the description
|
||||
list as the values and attributes of the section. Enclose the field string
|
||||
between the section tags.
|
||||
Start a new section outside the field-block strings. Use the second number in
|
||||
the list; use the second item in the description list.
|
||||
    CHANGE (2004-04-26) No longer write sections that occur in field-blocks.
    Instead, ignore all section information in a field-block.
|
||||
"""
|
||||
|
||||
    def __init__(self,
            in_file,
            bug_handler,
            copy=None,
            run_level=1):
        """
        Required:
            'file'--file to parse
        Optional:
            'copy'-- whether to make a copy of result for debugging
            'temp_dir' --where to output temporary results (default is
            directory from which the script is run.)
        Returns:
            nothing
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        # `copy` is a debugging flag here; the parameter shadows the imported
        # `copy` module only inside this method
        self.__copy = copy
        self.__run_level = run_level
        # all output is staged in a temp file, then renamed over the input
        self.__write_to = better_mktemp()
|
||||
|
||||
    def __initiate_values(self):
        """
        Initiate all values.
        """
        # markers bracketing the section markup produced by __write_section
        self.__mark_start = 'mi<mk<sect-start\n'
        self.__mark_end = 'mi<mk<sect-end__\n'
        self.__in_field = 0
        self.__section_values = {}
        self.__list_of_sec_values = []
        self.__field_num = []
        self.__section_num = 0
        self.__state = 'before_body'
        self.__found_first_sec = 0
        self.__text_string = ''
        self.__field_instruction_string = ''
        # state machine: current state -> line handler
        self.__state_dict = {
            'before_body' : self.__before_body_func,
            'body' : self.__body_func,
            'before_first_sec' : self.__before_first_sec_func,
            'section' : self.__section_func,
            'section_def' : self.__section_def_func,
            'sec_in_field' : self.__sec_in_field_func,
        }
        # cw<sc<sect-defin<nu<true
        # tokens that, in the 'body' state, start section processing
        self.__body_dict = {
            'cw<sc<section___' : self.__found_section_func,
            'mi<mk<sec-fd-beg' : self.__found_sec_in_field_func,
            'cw<sc<sect-defin' : self.__found_section_def_bef_sec_func,
        }
        # handlers active while collecting a section definition; each value
        # is (handler, readable-attribute-name-or-None)
        self.__section_def_dict = {
            'cw<pf<par-def___' : (self.__end_sec_def_func, None),
            'mi<mk<body-open_' : (self.__end_sec_def_func, None),
            'cw<tb<columns___' : (self.__attribute_func, 'columns'),
            'cw<pa<margin-lef' : (self.__attribute_func, 'margin-left'),
            'cw<pa<margin-rig' : (self.__attribute_func, 'margin-right'),
            'mi<mk<header-ind' : (self.__end_sec_def_func, None),
            # premature endings
            # __end_sec_premature_func
            'tx<nu<__________' : (self.__end_sec_premature_func, None),
            'cw<ci<font-style' : (self.__end_sec_premature_func, None),
            'cw<ci<font-size_' : (self.__end_sec_premature_func, None),
        }
        self.__sec_in_field_dict = {
            'mi<mk<sec-fd-end' : self.__end_sec_in_field_func,
            # changed this 2004-04-26
            # two lines
            # 'cw<sc<section___' : self.__found_section_in_field_func,
            # 'cw<sc<sect-defin' : self.__found_section_def_in_field_func,
        }
|
||||
|
||||
def __found_section_def_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
line -- the line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
I have found a section definition. Change the state to
|
||||
setion_def (so subsequent lines will be processesed as part of
|
||||
the section definition), and clear the section_values dictionary.
|
||||
"""
|
||||
self.__state = 'section_def'
|
||||
self.__section_values.clear()
|
||||
|
||||
def __attribute_func(self, line, name):
|
||||
"""
|
||||
Required:
|
||||
line -- the line to be parsed
|
||||
name -- the changed, readable name (as opposed to the
|
||||
abbreviated one)
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
I need to add the right data to the section values dictionary so I
|
||||
can retrive it later. The attribute (or key) is the name; the
|
||||
value is the last part of the text string.
|
||||
ex: cw<tb<columns___<nu<2
|
||||
"""
|
||||
attribute = name
|
||||
value = line[20:-1]
|
||||
self.__section_values[attribute] = value
|
||||
|
||||
def __found_section_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line -- the line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
I have found the beginning of a section, so change the state
|
||||
accordingly. Also add one to the section counter.
|
||||
"""
|
||||
self.__state = 'section'
|
||||
self.__write_obj.write(line)
|
||||
self.__section_num += 1
|
||||
|
||||
    def __found_section_def_bef_sec_func(self, line):
        """
        Requires:
            line -- the line to parse
        Returns:
            nothing
        Logic:
            A section definition arrived before any \\sect token, so it
            implicitly starts a new section: add one to the section counter,
            switch into the definition state, and echo the line.
        """
        self.__section_num += 1
        self.__found_section_def_func(line)
        self.__write_obj.write(line)
|
||||
|
||||
def __section_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line --the line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
"""
|
||||
if self.__token_info == 'cw<sc<sect-defin':
|
||||
self.__found_section_def_func(line)
|
||||
self.__write_obj.write(line)
|
||||
|
||||
    def __section_def_func(self, line):
        """
        Required:
            line --line to parse
        Returns:
            nothing
        Logic:
            I have found a section definition. Check if the line is the end of
            the definition (a paragraph definition), or if it contains info
            that should be added to the values dictionary. If neither of these
            cases are true, output the line to a file.
        """
        action, name = self.__section_def_dict.get(self.__token_info, (None, None))
        if action:
            action(line, name)
            # while inside a field block, echo into the buffered string
            # instead of the output file
            if self.__in_field:
                self.__sec_in_field_string += line
            else:
                self.__write_obj.write(line)
        else:
            self.__write_obj.write(line)
|
||||
|
||||
def __end_sec_def_func(self, line, name):
|
||||
"""
|
||||
Requires:
|
||||
line --the line to parse
|
||||
name --changed, readable name
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
The end of the section definition has been found. Reset the state.
|
||||
Call on the write_section method.
|
||||
"""
|
||||
if not self.__in_field:
|
||||
self.__state = 'body'
|
||||
else:
|
||||
self.__state = 'sec_in_field'
|
||||
self.__write_section(line)
|
||||
|
||||
def __end_sec_premature_func(self, line, name):
|
||||
"""
|
||||
Requires:
|
||||
line --the line to parse
|
||||
name --changed, readable name
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Text or control words indicating text have been found
|
||||
before \\pard. This shoud indicate older RTF. Reset the state
|
||||
Write the section defintion. Insert a paragraph definition.
|
||||
Insert {} to mark the end of a paragraph defintion
|
||||
"""
|
||||
if not self.__in_field:
|
||||
self.__state = 'body'
|
||||
else:
|
||||
self.__state = 'sec_in_field'
|
||||
self.__write_section(line)
|
||||
self.__write_obj.write('cw<pf<par-def___<nu<true\n')
|
||||
self.__write_obj.write('ob<nu<open-brack<0000\n')
|
||||
self.__write_obj.write('cb<nu<clos-brack<0000\n')
|
||||
|
||||
def __write_section(self, line):
|
||||
"""
|
||||
Requires:
|
||||
nothing
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Form a string of attributes and values. If you are not in a field
|
||||
block, write this string to the output file. Otherwise, call on
|
||||
the handle_sec_def method to handle this string.
|
||||
"""
|
||||
my_string = self.__mark_start
|
||||
if self.__found_first_sec:
|
||||
my_string += 'mi<tg<close_____<section\n'
|
||||
else:
|
||||
self.__found_first_sec = 1
|
||||
my_string += 'mi<tg<open-att__<section<num>%s' % unicode_type(self.__section_num)
|
||||
my_string += '<num-in-level>%s' % unicode_type(self.__section_num)
|
||||
my_string += '<type>rtf-native'
|
||||
my_string += '<level>0'
|
||||
keys = self.__section_values.keys()
|
||||
if len(keys) > 0:
|
||||
for key in keys:
|
||||
my_string += '<%s>%s' % (key, self.__section_values[key])
|
||||
my_string += '\n'
|
||||
my_string += self.__mark_end
|
||||
# # my_string += line
|
||||
if self.__state == 'body':
|
||||
self.__write_obj.write(my_string)
|
||||
elif self.__state == 'sec_in_field':
|
||||
self.__handle_sec_def(my_string)
|
||||
elif self.__run_level > 3:
|
||||
msg = 'missed a flag\n'
|
||||
raise self.__bug_handler(msg)
|
||||
|
||||
def __handle_sec_def(self, my_string):
|
||||
"""
|
||||
Requires:
|
||||
my_string -- the string of attributes and values. (Do I need this?)
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
I need to append the dictionary of attributes and values to list
|
||||
so I can use it later when I reach the end of the field-block.
|
||||
"""
|
||||
values_dict = self.__section_values
|
||||
self.__list_of_sec_values.append(values_dict)
|
||||
|
||||
def __body_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line --the line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Look for the beginning of a section. Otherwise, print the line to
|
||||
the output file.
|
||||
"""
|
||||
action = self.__body_dict.get(self.__token_info)
|
||||
if action:
|
||||
action(line)
|
||||
else:
|
||||
self.__write_obj.write(line)
|
||||
|
||||
    def __before_body_func(self, line):
        """
        Requires:
            line --line to parse
        Returns:
            nothing
        Logic:
            Look for the beginning of the body. Always print out the line.
        """
        if self.__token_info == 'mi<mk<body-open_':
            self.__state = 'before_first_sec'
        self.__write_obj.write(line)
|
||||
|
||||
    def __before_first_sec_func(self, line):
        """
        Requires:
            line -- line to parse
        Returns:
            nothing
        Logic:
            Look for the beginning of the first section. This can be \\sectd,
            but in older RTF it could mean any paragraph or row definition.
        """
        if self.__token_info == 'cw<sc<sect-defin':
            self.__state = 'section_def'
            self.__section_num += 1
            self.__section_values.clear()
        elif self.__token_info == 'cw<pf<par-def___':
            # older RTF: a paragraph definition implicitly opens section 1
            self.__state = 'body'
            self.__section_num += 1
            self.__write_obj.write(
                'mi<tg<open-att__<section<num>%s'
                '<num-in-level>%s'
                '<type>rtf-native'
                '<level>0\n'
                % (unicode_type(self.__section_num), unicode_type(self.__section_num))
            )
            self.__found_first_sec = 1
        elif self.__token_info == 'tx<nu<__________':
            # plain text before any definition also opens the first section
            self.__state = 'body'
            self.__section_num += 1
            self.__write_obj.write(
                'mi<tg<open-att__<section<num>%s'
                '<num-in-level>%s'
                '<type>rtf-native'
                '<level>0\n'
                % (unicode_type(self.__section_num), unicode_type(self.__section_num))
            )
            self.__write_obj.write(
                # NOTE(review): other writers emit 'cw<pf<par-def___<nu<true';
                # the '<nu' payload marker is missing here -- confirm intended
                'cw<pf<par-def___<true\n'
            )
            self.__found_first_sec = 1
        self.__write_obj.write(line)
|
||||
|
||||
def __found_sec_in_field_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line --line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
I have found the beginning of a field that has a section (or
|
||||
really, two) inside of it. Change the state, and start adding to
|
||||
one long string.
|
||||
"""
|
||||
self.__state = 'sec_in_field'
|
||||
self.__sec_in_field_string = line
|
||||
self.__in_field = 1
|
||||
|
||||
def __sec_in_field_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line --the line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Check for the end of the field, or the beginning of a section
|
||||
definition.
|
||||
CHANGED! Just print out each line. Ignore any sections or
|
||||
section definition info.
|
||||
"""
|
||||
action = self.__sec_in_field_dict.get(self.__token_info)
|
||||
if action:
|
||||
action(line)
|
||||
else:
|
||||
# change this 2004-04-26
|
||||
# self.__sec_in_field_string += line
|
||||
self.__write_obj.write(line)
|
||||
|
||||
    def __end_sec_in_field_func(self, line):
        """
        Requires:
            line --line to parse
        Returns:
            nothing
        Logic:
            End of a field block containing sections: reset the state and the
            in-field flag and write the line through.  The old behaviour
            (buffer + re-emit with section tags around the field) was
            disabled on 2004-04-26.
        """
        # change this 2004-04-26
        # Don't do anyting
        # NOTE: the triple-quoted block below is a string expression used to
        # disable the old buffering logic; it has no runtime effect
        """
        self.__sec_in_field_string += line
        self.__print_field_sec_attributes()
        self.__write_obj.write(self.__sec_in_field_string)
        self.__print_field_sec_attributes()
        """
        self.__state = 'body'
        self.__in_field = 0
        # this is changed too
        self.__write_obj.write(line)
|
||||
|
||||
    def __print_field_sec_attributes(self):
        """
        Requires:
            nothing
        Returns:
            nothing
        Logic:
            Get the number and dictionary of values from the lists. The number
            and dictionary will be the first item of each list. Write the
            close tag. Write the start tag. Write the attribute and values in
            the dictionary. Get rid of the first item in each list.
        keys = self.__section_values.keys()
        if len(keys) > 0:
            my_string += 'mi<tg<open-att__<section-definition'
            for key in keys:
                my_string += '<%s>%s' % (key, self.__section_values[key])
            my_string += '\n'
        else:
            my_string += 'mi<tg<open______<section-definition\n'
        """
        # NOTE(review): appears dead since the 2004-04-26 change -- only
        # reachable from the disabled logic in __end_sec_in_field_func
        num = self.__field_num[0]
        self.__field_num = self.__field_num[1:]
        self.__write_obj.write(
            'mi<tg<close_____<section\n'
            'mi<tg<open-att__<section<num>%s' % unicode_type(num)
        )
        if self.__list_of_sec_values:
            keys = self.__list_of_sec_values[0].keys()
            for key in keys:
                self.__write_obj.write(
                    '<%s>%s\n' % (key, self.__list_of_sec_values[0][key]))
            self.__list_of_sec_values = self.__list_of_sec_values[1:]
        self.__write_obj.write('<level>0')
        self.__write_obj.write('<type>rtf-native')
        self.__write_obj.write('<num-in-level>%s' % unicode_type(self.__section_num))
        self.__write_obj.write('\n')
        # Look here
|
||||
|
||||
    def __found_section_in_field_func(self, line):
        """
        Requires:
            line --line to parse
        Returns:
            nothing
        Logic:
            I have found a section in a field block. Add one to section
            counter, and append this number to a list.
            (Unreachable since 2004-04-26: its dispatch entry in
            __sec_in_field_dict is commented out.)
        """
        self.__section_num += 1
        self.__field_num.append(self.__section_num)
        self.__sec_in_field_string += line
|
||||
|
||||
def __found_section_def_in_field_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line --line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
I have found a section definition in a filed block. Change the
|
||||
state and clear the values dictionary.
|
||||
"""
|
||||
self.__state = 'section_def'
|
||||
self.__section_values.clear()
|
||||
|
||||
def make_sections(self):
|
||||
"""
|
||||
Requires:
|
||||
nothing
|
||||
Returns:
|
||||
nothing (changes the original file)
|
||||
Logic:
|
||||
Read one line in at a time. Determine what action to take based on
|
||||
the state. If the state is before the body, look for the
|
||||
beginning of the body.
|
||||
If the state is body, send the line to the body method.
|
||||
"""
|
||||
self.__initiate_values()
|
||||
read_obj = open_for_read(self.__file)
|
||||
self.__write_obj = open_for_write(self.__write_to)
|
||||
line_to_read = 1
|
||||
while line_to_read:
|
||||
line_to_read = read_obj.readline()
|
||||
line = line_to_read
|
||||
self.__token_info = line[:16]
|
||||
action = self.__state_dict.get(self.__state)
|
||||
if action is None:
|
||||
sys.stderr.write('no matching state in module sections.py\n')
|
||||
sys.stderr.write(self.__state + '\n')
|
||||
action(line)
|
||||
read_obj.close()
|
||||
self.__write_obj.close()
|
||||
copy_obj = copy.Copy(bug_handler=self.__bug_handler)
|
||||
if self.__copy:
|
||||
copy_obj.copy_file(self.__write_to, "sections.data")
|
||||
copy_obj.rename(self.__write_to, self.__file)
|
||||
os.remove(self.__write_to)
|
||||
723
ebook_converter/ebooks/rtf2xml/styles.py
Normal file
723
ebook_converter/ebooks/rtf2xml/styles.py
Normal file
@@ -0,0 +1,723 @@
|
||||
from __future__ import unicode_literals, absolute_import, print_function, division
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import sys, os
|
||||
from calibre.ebooks.rtf2xml import copy, border_parse
|
||||
from calibre.ptempfile import better_mktemp
|
||||
from . import open_for_read, open_for_write
|
||||
|
||||
|
||||
class Styles:
|
||||
"""
|
||||
Change lines with style numbers to actual style names.
|
||||
"""
|
||||
|
||||
    def __init__(self,
            in_file,
            bug_handler,
            copy=None,
            run_level=1,
            ):
        """
        Required:
            'file'--file to parse
        Optional:
            'copy'-- whether to make a copy of result for debugging
            'temp_dir' --where to output temporary results (default is
            directory from which the script is run.)
        Returns:
            nothing
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        # `copy` is a debugging flag; the parameter shadows the imported
        # `copy` module only inside this method
        self.__copy = copy
        # output is staged in a temp file, then renamed over the input
        self.__write_to = better_mktemp()
        self.__run_level = run_level
|
||||
|
||||
    def __initiate_values(self):
        """
        Initiate all values.
        """
        self.__border_obj = border_parse.BorderParse()
        # styles are collected per type: paragraph ('par') and character ('char')
        self.__styles_dict = {'par':{}, 'char':{}}
        self.__styles_num = '0'
        self.__type_of_style = 'par'
        self.__text_string = ''
        self.__state = 'before_styles_table'
        # state machine: state or marker token -> handler
        self.__state_dict = {
            'before_styles_table': self.__before_styles_func,
            'in_styles_table' : self.__in_styles_func,
            'in_individual_style' : self.__in_individual_style_func,
            'after_styles_table' : self.__after_styles_func,
            'mi<mk<styles-beg' : self.__found_styles_table_func,
            'mi<mk<styles-end' : self.__found_end_styles_table_func,
            'mi<mk<stylei-beg' : self.__found_beg_ind_style_func,
            'mi<mk<stylei-end' : self.__found_end_ind_style_func,
            'cw<ss<para-style' : self.__para_style_func,
            'cw<ss<char-style' : self.__char_style_func,
        }
        # A separate dictionary for parsing the body text
        self.__body_dict = {
            'cw<ss<para-style' : (self.__para_style_in_body_func, 'par'),
            'cw<ss<char-style' : (self.__para_style_in_body_func, 'char'),
        }
        # Dictionary needed to convert shortened style names to readable names
        self.__token_dict={
            # paragraph formatting => pf
            'par-end___' : 'para',
            'par-def___' : 'paragraph-definition',
            'keep-w-nex' : 'keep-with-next',
            'widow-cntl' : 'widow-control',
            'adjust-rgt' : 'adjust-right',
            'language__' : 'language',
            'right-inde' : 'right-indent',
            'fir-ln-ind' : 'first-line-indent',
            'left-inden' : 'left-indent',
            'space-befo' : 'space-before',
            'space-afte' : 'space-after',
            'line-space' : 'line-spacing',
            'default-ta' : 'default-tab',
            'align_____' : 'align',
            'widow-cntr' : 'widow-control',
            # page fomratting mixed in! (Just in older RTF?)
            'margin-lef' : 'left-indent',
            'margin-rig' : 'right-indent',
            'margin-bot' : 'space-after',
            'margin-top' : 'space-before',
            # stylesheet = > ss
            'style-shet' : 'stylesheet',
            'based-on__' : 'based-on-style',
            'next-style' : 'next-style',
            'char-style' : 'character-style',
            'para-style' : 'paragraph-style',
            # graphics => gr
            'picture___' : 'pict',
            'obj-class_' : 'obj_class',
            'mac-pic___' : 'mac-pict',
            # section => sc
            'section___' : 'section-new',
            'sect-defin' : 'section-reset',
            'sect-note_' : 'endnotes-in-section',
            # list=> ls
            'list-text_' : 'list-text',
            'list______' : 'list',
            'list-lev-d' : 'list-level-definition',
            'list-cardi' : 'list-cardinal-numbering',
            'list-decim' : 'list-decimal-numbering',
            'list-up-al' : 'list-uppercase-alphabetic-numbering',
            'list-up-ro' : 'list-uppercae-roman-numbering',
            'list-ord__' : 'list-ordinal-numbering',
            'list-ordte' : 'list-ordinal-text-numbering',
            'list-bulli' : 'list-bullet',
            'list-simpi' : 'list-simple',
            'list-conti' : 'list-continue',
            'list-hang_' : 'list-hang',
            # 'list-tebef' : 'list-text-before',
            # 'list-level' : 'level',
            'list-id___' : 'list-id',
            'list-start' : 'list-start',
            'nest-level' : 'nest-level',
            # duplicate
            'list-level' : 'list-level',
            # notes => nt
            'footnote__' : 'footnote',
            'type______' : 'type',
            # anchor => an
            'toc_______' : 'anchor-toc',
            'book-mk-st' : 'bookmark-start',
            'book-mk-en' : 'bookmark-end',
            'index-mark' : 'anchor-index',
            'place_____' : 'place',
            # field => fd
            'field_____' : 'field',
            'field-inst' : 'field-instruction',
            'field-rslt' : 'field-result',
            'datafield_' : 'data-field',
            # info-tables => it
            'font-table' : 'font-table',
            'colr-table' : 'color-table',
            'lovr-table' : 'list-override-table',
            'listtable_' : 'list-table',
            'revi-table' : 'revision-table',
            # character info => ci
            'hidden____' : 'hidden',
            'italics___' : 'italics',
            'bold______' : 'bold',
            'strike-thr' : 'strike-through',
            'shadow____' : 'shadow',
            'outline___' : 'outline',
            'small-caps' : 'small-caps',
            'dbl-strike' : 'double-strike-through',
            'emboss____' : 'emboss',
            'engrave___' : 'engrave',
            'subscript_' : 'subscript',
            'superscrip' : 'superscript',
            'plain_____' : 'plain',
            'font-style' : 'font-style',
            'font-color' : 'font-color',
            'font-size_' : 'font-size',
            'font-up___' : 'superscript',
            'font-down_' : 'subscript',
            'red_______' : 'red',
            'blue______' : 'blue',
            'green_____' : 'green',
            'caps______' : 'caps',
            # table => tb
            'row-def___' : 'row-definition',
            'cell______' : 'cell',
            'row_______' : 'row',
            'in-table__' : 'in-table',
            'columns___' : 'columns',
            'row-pos-le' : 'row-position-left',
            'cell-posit' : 'cell-position',
            # preamble => pr
            # underline
            'underlined' : 'underlined',
            # border => bd
            'bor-t-r-hi' : 'border-table-row-horizontal-inside',
            'bor-t-r-vi' : 'border-table-row-vertical-inside',
            'bor-t-r-to' : 'border-table-row-top',
            'bor-t-r-le' : 'border-table-row-left',
            'bor-t-r-bo' : 'border-table-row-bottom',
            'bor-t-r-ri' : 'border-table-row-right',
            'bor-cel-bo' : 'border-cell-bottom',
            'bor-cel-to' : 'border-cell-top',
            'bor-cel-le' : 'border-cell-left',
            'bor-cel-ri' : 'border-cell-right',
            # 'bor-par-bo' : 'border-paragraph-bottom',
            'bor-par-to' : 'border-paragraph-top',
            'bor-par-le' : 'border-paragraph-left',
            'bor-par-ri' : 'border-paragraph-right',
            'bor-par-bo' : 'border-paragraph-box',
            'bor-for-ev' : 'border-for-every-paragraph',
            'bor-outsid' : 'border-outisde',
            'bor-none__' : 'border',
            # border type => bt
            'bdr-single' : 'single',
            'bdr-doubtb' : 'double-thickness-border',
            'bdr-shadow' : 'shadowed-border',
            'bdr-double' : 'double-border',
            'bdr-dotted' : 'dotted-border',
            'bdr-dashed' : 'dashed',
            'bdr-hair__' : 'hairline',
            'bdr-inset_' : 'inset',
            'bdr-das-sm' : 'dash-small',
            'bdr-dot-sm' : 'dot-dash',
            'bdr-dot-do' : 'dot-dot-dash',
            'bdr-outset' : 'outset',
            'bdr-trippl' : 'tripple',
            'bdr-thsm__' : 'thick-thin-small',
            'bdr-htsm__' : 'thin-thick-small',
            'bdr-hthsm_' : 'thin-thick-thin-small',
            'bdr-thm__' : 'thick-thin-medium',
            'bdr-htm__' : 'thin-thick-medium',
            'bdr-hthm_' : 'thin-thick-thin-medium',
            'bdr-thl__' : 'thick-thin-large',
            'bdr-hthl_' : 'think-thick-think-large',
            'bdr-wavy_' : 'wavy',
            'bdr-d-wav' : 'double-wavy',
            'bdr-strip' : 'striped',
            'bdr-embos' : 'emboss',
            'bdr-engra' : 'engrave',
            'bdr-frame' : 'frame',
            'bdr-li-wid' : 'line-width',
            # tabs
            'tab-center' : 'center',
            'tab-right_' : 'right',
            'tab-dec___' : 'decimal',
            'leader-dot' : 'leader-dot',
            'leader-hyp' : 'leader-hyphen',
            'leader-und' : 'leader-underline',
        }
        # tab-related tokens -> their specialised handlers
        self.__tabs_dict = {
            'cw<pf<tab-stop__' : self.__tab_stop_func,
            'cw<pf<tab-center' : self.__tab_type_func,
            'cw<pf<tab-right_' : self.__tab_type_func,
            'cw<pf<tab-dec___' : self.__tab_type_func,
            'cw<pf<leader-dot' : self.__tab_leader_func,
            'cw<pf<leader-hyp' : self.__tab_leader_func,
            'cw<pf<leader-und' : self.__tab_leader_func,
            'cw<pf<tab-bar-st' : self.__tab_bar_func,
        }
        self.__tab_type_dict = {
            'cw<pf<tab-center' : 'center',
            'cw<pf<tab-right_' : 'right',
            'cw<pf<tab-dec___' : 'decimal',
            'cw<pf<leader-dot' : 'leader-dot',
            'cw<pf<leader-hyp' : 'leader-hyphen',
            'cw<pf<leader-und' : 'leader-underline',
        }
        # tokens deliberately left untranslated
        self.__ignore_list = [
            'list-tebef',
        ]
        # NOTE: a dict view on Python 3 -- used only for membership tests
        self.__tabs_list = self.__tabs_dict.keys()
        self.__tab_type = 'left'
        self.__leader_found = 0
|
||||
|
||||
    def __in_individual_style_func(self, line):
        """
        Required:
            line
        Returns:
            nothing
        Logic:
            Check if the token marks the end of the individual style. (Action
            is the value of the state dictionary, and the only key that will
            match in this function is the end of the individual style.)
            If the end of the individual style is not found, check if the line
            is a control word. If it is, extract the relevant info and look
            up this info in the tokens dictionary. I want to change
            abbreviated names for longer, more readable ones.
            Write an error message if no key is found for the info.
            If the line is text, add the text to a text string. The text
            string will be the name of the style.
        """
        action = self.__state_dict.get(self.__token_info)
        if action:
            action(line)
        # have to parse border lines with external module
        elif line[0:5] == 'cw<bd':
            border_dict = self.__border_obj.parse_border(line)
            keys = border_dict.keys()
            for key in keys:
                self.__enter_dict_entry(key, border_dict[key])
        elif self.__token_info in self.__tabs_list:
            action = self.__tabs_dict.get(self.__token_info)
            if action is not None:
                action(line)
        elif line[0:2] == 'cw':
            # cw<pf<widow-cntl<nu<true
            # chars 6-16 hold the abbreviated attribute name
            info = line[6:16]
            att = self.__token_dict.get(info)
            if att is None :
                if info not in self.__ignore_list:
                    if self.__run_level > 3:
                        msg = 'no value for key %s\n' % info
                        raise self.__bug_handler(msg)
            else:
                value = line[20:-1]
                self.__enter_dict_entry(att, value)
        elif line[0:2] == 'tx':
            # accumulate text -- it becomes the style's name
            self.__text_string += line[17:-1]
|
||||
|
||||
def __tab_stop_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line -- line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Try to add the number to dictionary entry tabs-left, or tabs-right, etc.
|
||||
If the dictionary entry doesn't exist, create one.
|
||||
"""
|
||||
try:
|
||||
if self.__leader_found:
|
||||
self.__styles_dict['par'][self.__styles_num]['tabs']\
|
||||
+= '%s:' % self.__tab_type
|
||||
self.__styles_dict['par'][self.__styles_num]['tabs']\
|
||||
+= '%s;' % line[20:-1]
|
||||
else:
|
||||
self.__styles_dict['par'][self.__styles_num]['tabs']\
|
||||
+= '%s:' % self.__tab_type
|
||||
self.__styles_dict['par'][self.__styles_num]['tabs']\
|
||||
+= '%s;' % line[20:-1]
|
||||
except KeyError:
|
||||
self.__enter_dict_entry('tabs', '')
|
||||
self.__styles_dict['par'][self.__styles_num]['tabs']\
|
||||
+= '%s:' % self.__tab_type
|
||||
self.__styles_dict['par'][self.__styles_num]['tabs'] += '%s;' % line[20:-1]
|
||||
self.__tab_type = 'left'
|
||||
self.__leader_found = 0
|
||||
|
||||
def __tab_type_func(self, line):
|
||||
"""
|
||||
"""
|
||||
type = self.__tab_type_dict.get(self.__token_info)
|
||||
if type is not None:
|
||||
self.__tab_type = type
|
||||
else:
|
||||
if self.__run_level > 3:
|
||||
msg = 'no entry for %s\n' % self.__token_info
|
||||
raise self.__bug_handler(msg)
|
||||
|
||||
    def __tab_leader_func(self, line):
        """
        Requires:
            line -- line to parse
        Returns:
            nothing
        Logic:
            Try to add the string of the tab leader to dictionary entry
            tabs-left, or tabs-right, etc. If the dictionary entry doesn't
            exist, create one.
        """
        self.__leader_found = 1
        leader = self.__tab_type_dict.get(self.__token_info)
        if leader is not None:
            # '^' marks the value as a leader rather than a plain tab type
            leader += '^'
            try:
                # NOTE(review): this branch prepends ':' while the except
                # branch below does not -- looks inconsistent with
                # __tab_stop_func; confirm against the 'tabs' consumer.
                self.__styles_dict['par'][self.__styles_num]['tabs'] += ':%s;' % leader
            except KeyError:
                self.__enter_dict_entry('tabs', '')
                self.__styles_dict['par'][self.__styles_num]['tabs'] += '%s;' % leader
        else:
            if self.__run_level > 3:
                msg = 'no entry for %s\n' % self.__token_info
                raise self.__bug_handler(msg)

    def __tab_bar_func(self, line):
        """
        Requires:
            line -- line to parse
        Returns:
            nothing
        Logic:
            Append a "bar:<position>;" fragment to the 'tabs' entry of the
            current paragraph style. If the dictionary entry doesn't exist,
            create one.
        """
        # self.__add_dict_entry('tabs-bar', line[20:-1])
        try:
            self.__styles_dict['par'][self.__styles_num]['tabs']\
                += '%s:' % 'bar'
            self.__styles_dict['par'][self.__styles_num]['tabs']\
                += '%s;' % line[20:-1]
        except KeyError:
            self.__enter_dict_entry('tabs', '')
            self.__styles_dict['par'][self.__styles_num]['tabs']\
                += '%s:' % 'bar'
            self.__styles_dict['par'][self.__styles_num]['tabs']\
                += '%s;' % line[20:-1]
        # bar tabs reset the pending tab type
        self.__tab_type = 'left'
|
||||
|
||||
    def __enter_dict_entry(self, att, value):
        """
        Required:
            att -- the attribute
            value -- the value
        Returns:
            nothing
        Logic:
            Try to add the attribute value directly to the styles dictionary.
            If a KeyError is raised, the "branches" of the dictionary do not
            exist yet and must be built before the key/value pair can be
            added (see __add_dict_entry).
        """
        try:
            self.__styles_dict[self.__type_of_style][self.__styles_num][att] = value
        except KeyError:
            self.__add_dict_entry(att, value)

    def __add_dict_entry(self, att, value):
        """
        Required:
            att -- the attribute
            value -- the value
        Returns:
            nothing
        Logic:
            I have to build the branches of the dictionary before I can add
            the leaves. (I am comparing a dictionary to a tree.) To achieve
            this, I first make a temporary dictionary by extracting either the
            inside dictionary of the keyword par or char. This temporary
            dictionary is called type_dict.
            Next, create a second, smaller dictionary with just the attribute
            and value, add it to the type dictionary under the current style
            number, and store the type dictionary back in the styles dict.
        """
        if self.__type_of_style == 'par':
            type_dict =self.__styles_dict['par']
        elif self.__type_of_style == 'char':
            type_dict = self.__styles_dict['char']
        else:
            # NOTE(review): if __type_of_style is neither 'par' nor 'char'
            # and run_level <= 3, type_dict is never bound and the code
            # below raises NameError -- confirm this state is unreachable.
            if self.__run_level > 3:
                msg = self.__type_of_style + 'error\n'
                raise self.__bug_handler(msg)
        smallest_dict = {}
        smallest_dict[att] = value
        type_dict[self.__styles_num] = smallest_dict
        self.__styles_dict[self.__type_of_style] = type_dict
|
||||
|
||||
def __para_style_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
line
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Set the type of style to paragraph.
|
||||
Extract the number for a line such as "cw<ss<para-style<nu<15".
|
||||
"""
|
||||
self.__type_of_style = 'par'
|
||||
self.__styles_num = line[20:-1]
|
||||
"""
|
||||
self.__enter_dict_entry('tabs-left', '')
|
||||
self.__enter_dict_entry('tabs-right', '')
|
||||
self.__enter_dict_entry('tabs-center', '')
|
||||
self.__enter_dict_entry('tabs-decimal', '')
|
||||
self.__enter_dict_entry('tabs-bar', '')
|
||||
"""
|
||||
|
||||
    def __char_style_func(self, line):
        """
        Required:
            line
        Returns:
            nothing
        Logic:
            Set the type of style to character.
            Extract the number from a line such as "cw<ss<char-style<nu<15".
        """
        self.__type_of_style = 'char'
        self.__styles_num = line[20:-1]

    def __found_beg_ind_style_func(self, line):
        """
        Required:
            line
        Returns:
            nothing
        Logic:
            The start of an individual style definition has been found;
            switch to the in_individual_style state so that following
            tokens are collected for this style.
        """
        self.__state = 'in_individual_style'

    def __found_end_ind_style_func(self, line):
        """
        End of an individual style: strip the trailing semicolon and any
        surrounding whitespace from the accumulated text, then record the
        result as the style's 'name' and reset the accumulator.
        """
        name = self.__text_string[:-1]  # get rid of semicolon
        # add 2005-04-29
        # get rid of space before or after
        name = name.strip()
        self.__enter_dict_entry('name', name)
        self.__text_string = ''
|
||||
|
||||
    def __found_end_styles_table_func(self, line):
        """
        Required:
            line
        Returns:
            nothing
        Logic:
            Set the state to after the styles table.
            Fix the styles (resolve next-style / based-on-style numbers to
            style names -- see __fix_based_on).
            Print out the style table.
        """
        self.__state = 'after_styles_table'
        self.__fix_based_on()
        self.__print_style_table()
|
||||
|
||||
def __fix_based_on(self):
|
||||
"""
|
||||
Requires:
|
||||
nothing
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
The styles dictionary may contain a pair of key values such as
|
||||
'next-style' => '15'. I want to change the 15 to the name of the
|
||||
style. I accomplish this by simply looking up the value of 15 in
|
||||
the styles table.
|
||||
Use two loops. First, check all the paragraph styles. Then check
|
||||
all the characer styles.
|
||||
The inner loop: first check 'next-style', then check 'based-on-style'.
|
||||
Make sure values exist for the keys to avoid the nasty keyerror message.
|
||||
"""
|
||||
types = ['par', 'char']
|
||||
for type in types:
|
||||
keys = self.__styles_dict[type].keys()
|
||||
for key in keys:
|
||||
styles = ['next-style', 'based-on-style']
|
||||
for style in styles:
|
||||
value = self.__styles_dict[type][key].get(style)
|
||||
if value is not None:
|
||||
temp_dict = self.__styles_dict[type].get(value)
|
||||
if temp_dict:
|
||||
changed_value = self.__styles_dict[type][value].get('name')
|
||||
if changed_value:
|
||||
self.__styles_dict[type][key][style] = \
|
||||
changed_value
|
||||
else:
|
||||
if value == 0 or value == '0':
|
||||
pass
|
||||
else:
|
||||
if self.__run_level > 4:
|
||||
msg = '%s %s is based on %s\n' % (type, key, value)
|
||||
msg = 'There is no style with %s\n' % value
|
||||
raise self.__bug_handler(msg)
|
||||
del self.__styles_dict[type][key][style]
|
||||
|
||||
    def __print_style_table(self):
        """
        Required:
            nothing
        Returns:
            nothing
        Logic:
            This function prints out the style table.
            I use three nested for loops. The outer loop prints out the
            paragraph styles, then the character styles.
            The next loop iterates through the style numbers.
            The innermost loop iterates over the pairs of attributes and
            values, and prints them out.
        """
        types = ['par', 'char']
        for type in types:
            if type == 'par':
                prefix = 'paragraph'
            else:
                prefix = 'character'
            self.__write_obj.write(
                'mi<tg<open______<%s-styles\n' % prefix
            )
            style_numbers = self.__styles_dict[type].keys()
            for num in style_numbers:
                # the empty-att tag and its attributes share one output line
                self.__write_obj.write(
                    'mi<tg<empty-att_<%s-style-in-table<num>%s' % (prefix, num)
                )
                attributes = self.__styles_dict[type][num].keys()
                for att in attributes:
                    this_value = self.__styles_dict[type][num][att]
                    self.__write_obj.write(
                        '<%s>%s' % (att, this_value)
                    )
                self.__write_obj.write('\n')
            self.__write_obj.write(
                'mi<tg<close_____<%s-styles\n' % prefix
            )
|
||||
|
||||
    def __found_styles_table_func(self, line):
        """
        Required:
            line
        Returns:
            nothing
        Logic:
            Change the state to in_styles_table when the marker has been
            found.
        """
        self.__state = 'in_styles_table'

    def __before_styles_func(self, line):
        """
        Required:
            line
        Returns:
            nothing.
        Logic:
            Check the line info in the state dictionary. When the beginning
            of the styles table is found, change the state to in the styles
            table; otherwise pass the line through unchanged.
        """
        action = self.__state_dict.get(self.__token_info)
        if not action:
            self.__write_obj.write(line)
        else:
            action(line)

    def __in_styles_func(self, line):
        """
        Required:
            line
        Returns:
            nothing
        Logic:
            Check the line for the beginning of an individual style. If it
            is not found, simply print out the line.
        """
        action = self.__state_dict.get(self.__token_info)
        if action is None:
            self.__write_obj.write(line)
        else:
            action(line)
|
||||
|
||||
    def __para_style_in_body_func(self, line, type):
        """
        Required:
            line -- the line
            type -- whether a character or paragraph style ('par' or 'char')
        Returns:
            nothing
        Logic:
            Determine the prefix by whether the type is "par" or "char".
            Extract the number from a line such as "cw<ss<para-style<nu<15".
            Look up that number in the styles dictionary and substitute the
            style's name for its number.
        """
        if type == 'par':
            prefix = 'para'
        else:
            prefix = 'char'
        num = line[20:-1]
        # may be invalid RTF--a style down below not defined above!
        try:
            value = self.__styles_dict[type][num]['name']
        except KeyError:
            value = None
        if value:
            self.__write_obj.write(
                'cw<ss<%s-style<nu<%s\n' % (prefix, value)
            )
        else:
            # NOTE(review): '%s_style' (underscore) differs from '-style'
            # (hyphen) above -- confirm downstream consumers expect this.
            self.__write_obj.write(
                'cw<ss<%s_style<nu<not-defined\n' % prefix
            )
|
||||
|
||||
def __after_styles_func(self, line):
|
||||
"""
|
||||
Required:
|
||||
line
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
Determine if a line with either character of paragraph style info
|
||||
has been found. If so, then use the appropriate method to parse
|
||||
the line. Otherwise, write the line to a file.
|
||||
"""
|
||||
action, type = self.__body_dict.get(self.__token_info, (None, None))
|
||||
if action:
|
||||
action(line, type)
|
||||
else:
|
||||
self.__write_obj.write(line)
|
||||
|
||||
    def convert_styles(self):
        """
        Requires:
            nothing
        Returns:
            nothing (changes the original file)
        Logic:
            Read one line in at a time. Determine what action to take based on
            the state. If the state is before the style table, look for the
            beginning of the style table.
            If the state is in the style table, create the style dictionary
            and print out the tags.
            If the state is after the style table, look for lines with style
            info, and substitute the number with the name of the style.
        """
        self.__initiate_values()
        read_obj = open_for_read(self.__file)
        self.__write_obj = open_for_write(self.__write_to)
        line_to_read = 1
        while line_to_read:
            line_to_read = read_obj.readline()
            line = line_to_read
            # the first 16 characters identify the token
            self.__token_info = line[:16]
            action = self.__state_dict.get(self.__state)
            if action is None:
                # NOTE(review): action(line) is still called below, so an
                # unknown state raises TypeError after this warning
                sys.stderr.write('no matching state in module styles.py\n')
                sys.stderr.write(self.__state + '\n')
            action(line)
        read_obj.close()
        self.__write_obj.close()
        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "styles.data")
        # replace the input file with the converted output
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)
|
||||
568
ebook_converter/ebooks/rtf2xml/table.py
Normal file
568
ebook_converter/ebooks/rtf2xml/table.py
Normal file
@@ -0,0 +1,568 @@
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import sys, os
|
||||
|
||||
from calibre.ebooks.rtf2xml import copy, border_parse
|
||||
from calibre.ptempfile import better_mktemp
|
||||
from polyglot.builtins import unicode_type
|
||||
|
||||
from . import open_for_read, open_for_write
|
||||
|
||||
"""
|
||||
States.
|
||||
1. 'not_in_table'
|
||||
1. 'cw<tb<row-def___' start a row definition
|
||||
2. 'mi<mk<in-table__' start table
|
||||
2. 'in_table'
|
||||
1. 'mi<mk<pard-start', start of a row, cell
|
||||
2. 'mi<mk<not-in-tbl', end the table.
|
||||
3. 'cw<tb<row-def___' start a row definition
|
||||
3. in_row_definition
|
||||
1. 'mi<mk<not-in-tbl' : end the row definition. If in table, end the table.
|
||||
2. 'mi<mk<pard-start' : end the row definition
|
||||
if already in the table, start a row and cell.
|
||||
3. 'cw<tb<row_______' : end the row definition, end the row
|
||||
4. 'cw...' use another method to handle the control word
|
||||
control word might be added to dictionary.
|
||||
5. 'mi<mk<in-table__' If already in table, do nothing. Otherwise
|
||||
start the table.
|
||||
4. 'in_row'
|
||||
1. 'mi<mk<pard-start', start cell
|
||||
2. 'mi<mk<not-in-tbl' end table,
|
||||
3. 'cw<tb<row_______' close row,
|
||||
5. 'in_cell'
|
||||
1. 'mi<mk<not-in-tbl', end table
|
||||
2. 'cw<tb<cell______', end cell
|
||||
"""
|
||||
|
||||
|
||||
class Table:
|
||||
"""
|
||||
Make tables.
|
||||
Logic:
|
||||
Read one line at a time. The default state (self.__state) is
|
||||
'not_in_table'. Look for either a 'cw<tb<in-table__', or a row definition.
|
||||
"""
|
||||
|
||||
    def __init__(self,
        in_file,
        bug_handler,
        copy=None,
        run_level=1,):
        """
        Required:
            'in_file' -- file to parse
            'bug_handler' -- exception class raised on internal errors
        Optional:
            'copy' -- whether to make a copy of result for debugging
            'run_level' -- verbosity/strictness level
        Returns:
            nothing
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__run_level = run_level
        # all output goes to a temp file; make_table() renames it over
        # the input file when done
        self.__write_to = better_mktemp()
|
||||
|
||||
    def __initiate_values(self):
        """
        Initiate all values.
        """
        # maps the current (top-of-stack) state to its line handler
        self.__state_dict = {
            'in_table': self.__in_table_func,
            'in_row_def': self.__in_row_def_func,
            'not_in_table': self.__not_in_table_func,
            'in_cell': self.__in_cell_func,
            'in_row': self.__in_row_func,
        }
        self.__not_in_table_dict = {
            'cw<tb<row-def___': self.__found_row_def_func,
            'cw<tb<in-table__': self.__start_table_func,
            'mi<mk<in-table__' : self.__start_table_func,
        }
        # can't use this dictionary. When in row_definition, many tokens
        # require multiple definitions
        self.__in_row_definition_dict = {
            'mi<mk<not-in-tbl' : self.__end_row_table_func,
            'mi<mk<pard-start' : self.__end_row_def_func,
        }
        self.__in_row_dict = {
            'mi<mk<not-in-tbl' : self.__close_table,
            'mi<mk<pard-start' : self.__start_cell_func,
            'cw<tb<row_______' : self.__end_row_func,
            'cw<tb<cell______' : self.__empty_cell,
        }
        # set the default state
        self.__state = ['not_in_table']
        # set empty data for all tables
        self.__table_data = []
        # just in case there is no table data
        self.__row_dict = {}
        self.__cell_list = []
        self.__cell_widths = []
|
||||
|
||||
    def __in_table_func(self, line):
        """
        Requires:
            line -- line to parse
        Logic:
            Look for the end of the table. If found, close out the table.
            Look for 'mi<mk<pard-start', which marks the beginning of a row.
            Start a row and start a cell.
        """
        # 'cell' : ('tb', 'cell______', self.default_func),
        if self.__token_info == 'mi<mk<not-in-tbl' or\
            self.__token_info == 'mi<mk<sect-start' or\
            self.__token_info == 'mi<mk<sect-close' or\
            self.__token_info == 'mi<mk<body-close':
            self.__close_table(line)
        elif self.__token_info == 'mi<mk<pard-start':
            self.__start_row_func(line)
            self.__start_cell_func(line)
        elif self.__token_info == 'cw<tb<row-def___':
            self.__found_row_def_func(line)
        elif self.__token_info == 'cw<tb<cell______':
            # a cell token without a preceding pard-start: emit an empty cell
            self.__start_row_func(line)
            self.__empty_cell(line)
        # every line is echoed to the output, whatever branch ran above
        self.__write_obj.write(line)

    def __not_in_table_func(self, line):
        """
        Requires:
            line -- the line of text read in from document
        Returns:
            nothing
        Logic:
            The state is not in a table, so look for the two tokens that
            mark the start of a table: 'cw<tb<row-def', or 'cw<tb<in-table__'.
            If these tokens are found, use another method to start a table
            and change states. Otherwise, just output the line.
        """
        action = self.__not_in_table_dict.get(self.__token_info)
        if action:
            action(line)
        self.__write_obj.write(line)
|
||||
|
||||
    def __close_table(self, line):
        """
        Requires:
            line -- line to parse
        Returns:
            nothing
        Logic:
            Write the end marker for the table.
            Set the state to ['not_in_table'].
            Record the statistics for this table (column count, row count,
            modal cells-per-row and cell width) in the last __table_data
            entry.
        """
        self.__write_obj.write('mi<mk<table-end_\n')
        self.__state = ['not_in_table']
        # NOTE(review): assumes __start_table_func ran first so that
        # __table_data is non-empty and the counters below exist.
        self.__table_data[-1]['number-of-columns'] = self.__max_number_cells_in_row
        self.__table_data[-1]['number-of-rows'] = self.__rows_in_table
        average_cells_in_row = self.__mode(self.__list_of_cells_in_row)
        self.__table_data[-1]['average-cells-per-row'] = average_cells_in_row
        average_cell_width = self.__mode(self.__cell_widths)
        self.__table_data[-1]['average-cell-width'] = average_cell_width

    def __found_row_def_func(self, line):
        """
        Requires:
            line -- not needed except for consistency with other methods.
        Returns:
            nothing
        Logic:
            A row definition has been found. Collect all the data from this
            to use later in writing attributes for the table.
        """
        self.__state.append('in_row_def')
        self.__last_cell_position = 0
        self.__row_dict = {}
        self.__cell_list = []
        self.__cell_list.append({})
        self.__cell_widths = []
|
||||
|
||||
    def __start_table_func(self, line):
        """
        Requires:
            line -- line to parse
        Returns:
            nothing
        Logic:
            Add 'in_table' to the state list.
            Write out the table marker.
            Reset the per-table counters and append a fresh dictionary to
            __table_data for this table's statistics.
        """
        self.__rows_in_table = 0
        self.__cells_in_table = 0
        self.__cells_in_row = 0
        self.__max_number_cells_in_row = 0
        self.__table_data.append({})
        self.__list_of_cells_in_row = []
        self.__write_obj.write('mi<mk<tabl-start\n')
        self.__state.append('in_table')
|
||||
|
||||
def __end_row_table_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line --just for consistencey
|
||||
Returns:
|
||||
?
|
||||
Logic:
|
||||
?
|
||||
"""
|
||||
self.__close_table(self, line)
|
||||
|
||||
    def __end_row_def_func(self, line):
        """
        Requires:
            line -- just for consistency
        Returns:
            nothing
        Logic:
            change the state.
            get rid of the last {} in the cell list
            figure out the number of cells based on self.__row_dict['widths']
            (e.g. '122, 122')
        """
        if len(self.__state) > 0:
            if self.__state[-1] == 'in_row_def':
                self.__state.pop()
        # a fresh {} is appended after every cell position; drop the extra one
        self.__cell_list.pop()
        widths = self.__row_dict.get('widths')
        if widths:
            width_list = widths.split(',')
            num_cells = len(width_list)
            self.__row_dict['number-of-cells'] = num_cells
|
||||
|
||||
def __in_row_def_func(self, line):
|
||||
"""
|
||||
Requires:
|
||||
line --line to parse
|
||||
Returns:
|
||||
nothing
|
||||
Logic:
|
||||
In the text that defines a row. If a control word is found, handle the
|
||||
control word with another method.
|
||||
Check for states that will end this state.
|
||||
While in the row definition, certain tokens can end a row or end a table.
|
||||
If a paragrah definition (pard-start) is found, and the you are already in
|
||||
a table, start of a row.
|
||||
"""
|
||||
if self.__token_info == 'cw<tb<row_______':
|
||||
# write tags
|
||||
self.__end_row_func(line)
|
||||
# change the state
|
||||
self.__end_row_def_func(line)
|
||||
self.__write_obj.write(line)
|
||||
elif line[0:2] == 'cw':
|
||||
self.__handle_row_token(line)
|
||||
self.__write_obj.write(line)
|
||||
elif self.__token_info == 'mi<mk<not-in-tbl' and 'in_table' in self.__state:
|
||||
self.__end_row_def_func(line)
|
||||
self.__close_table(line)
|
||||
self.__write_obj.write(line)
|
||||
elif self.__token_info == 'mi<mk<pard-start':
|
||||
self.__end_row_def_func(line)
|
||||
# if already in the table, start a row, then cell.
|
||||
if (self.__state) > 0 and self.__state[-1] == 'in_table':
|
||||
self.__start_row_func(line)
|
||||
self.__start_cell_func(line)
|
||||
self.__write_obj.write(line)
|
||||
elif self.__token_info == 'mi<mk<in-table__':
|
||||
self.__end_row_def_func(line)
|
||||
# if not in table, start a new table
|
||||
if len(self.__state) > 0 and self.__state[-1] != 'in_table':
|
||||
self.__start_table_func(line)
|
||||
self.__write_obj.write(line)
|
||||
else:
|
||||
self.__write_obj.write(line)
|
||||
|
||||
    def __handle_row_token(self, line):
        """
        Requires:
            line -- line to parse
        Returns:
            nothing
        Logic:
            the tokens in the row definition contain the following information:
            1. row borders.
            2. cell borders for all cells in the row.
            3. cell positions for all cells in the row.
            Put all information about row borders into a row dictionary.
            Put all information about cell borders into the dictionary in
            the last item in the cell list. ([{border:something, width:something},
            {border:something, width:something}])
            cw<bd<bor-t-r-to<nu<bdr-hair__|bdr-li-wid:0.50
        """
        if line[3:5] == 'bd':
            border_obj = border_parse.BorderParse()
            the_dict = border_obj.parse_border(line)
            keys = the_dict.keys()
            # border-cell-top-hairline
            # one 'border-cell' key means the whole token describes a cell
            in_cell = 0
            for key in keys:
                if key[0:11] == 'border-cell':
                    in_cell = 1
            for key in keys:
                if in_cell:
                    self.__cell_list[-1][key] = the_dict[key]
                else:
                    self.__row_dict[key] = the_dict[key]
        # cw<tb<cell-posit<nu<216.00
        elif self.__token_info == 'cw<tb<cell-posit':
            self.__found_cell_position(line)
        # cw<tb<row-pos-le<nu<-5.40
        elif self.__token_info == 'cw<tb<row-pos-le':
            position = line[20:-1]
            self.__row_dict['left-row-position'] = position
        elif self.__token_info == 'cw<tb<row-header':
            self.__row_dict['header'] = 'true'
|
||||
|
||||
    def __start_cell_func(self, line):
        """
        Required:
            line -- the line of text
        Returns:
            nothing
        Logic:
            Append 'in_cell' to the states.
            If self.__cell_list contains dictionaries, take the *first* one
            (cells are consumed in definition order), write its key/value
            pairs as attributes, then remove it from the list.
            Otherwise, print out a plain cell tag.
        """
        self.__state.append('in_cell')
        # self.__cell_list = []
        if len(self.__cell_list) > 0:
            self.__write_obj.write('mi<tg<open-att__<cell')
            # cell_dict = self.__cell_list[-1]
            cell_dict = self.__cell_list[0]
            keys = cell_dict.keys()
            for key in keys:
                self.__write_obj.write('<%s>%s' % (key, cell_dict[key]))
            self.__write_obj.write('\n')
            # self.__cell_list.pop()
            self.__cell_list.pop(0)
            # self.__cell_list = self.__cell_list[1:]
        else:
            self.__write_obj.write('mi<tg<open______<cell\n')
        self.__cells_in_table += 1
        self.__cells_in_row += 1
|
||||
|
||||
    def __start_row_func(self, line):
        """
        Required:
            line -- the line of text
        Returns:
            nothing
        Logic:
            Append 'in_row' to the states.
            Write the collected row-definition data as attributes of the
            row open tag, then reset the per-row cell counter.
        """
        self.__state.append('in_row')
        self.__write_obj.write('mi<tg<open-att__<row')
        keys = self.__row_dict.keys()
        for key in keys:
            self.__write_obj.write('<%s>%s' % (key, self.__row_dict[key]))
        self.__write_obj.write('\n')
        self.__cells_in_row = 0
        self.__rows_in_table += 1
|
||||
|
||||
    def __found_cell_position(self, line):
        """
        needs:
            line -- current line
        returns:
            nothing
        logic:
            Calculate the cell width.
            If the cell is the first cell, add the left row position to it.
            (This value is often negative.)
            Next, set the new last_cell_position to the current cell position.
        """
        # cw<tb<cell-posit<nu<216.00
        new_cell_position = round(float(line[20:-1]), 2)
        left_position = 0
        if self.__last_cell_position == 0:
            left_position = self.__row_dict.get('left-row-position', 0)
            left_position = float(left_position)
        width = new_cell_position - self.__last_cell_position - left_position
        # width = round(width, 2)
        # widths are kept as strings with two decimal places
        width = unicode_type('%.2f' % width)
        self.__last_cell_position = new_cell_position
        widths_exists = self.__row_dict.get('widths')
        if widths_exists:
            self.__row_dict['widths'] += ', %s' % unicode_type(width)
        else:
            self.__row_dict['widths'] = unicode_type(width)
        self.__cell_list[-1]['width'] = width
        # fresh dict for the next cell's borders/width (popped at row-def end)
        self.__cell_list.append({})
        self.__cell_widths.append(width)
|
||||
|
||||
    def __in_cell_func(self, line):
        """
        Required:
            line
        Returns:
            nothing
        Logic:
            In the middle of a cell.
            Look for the close of the table. If found, close the cell, the
            row, and the table.
            Look for the close of the cell. If found, use the close cell
            function to close out the cell.
            Otherwise, print out the line.
        """
        # cw<tb<cell______<nu<true
        # mi<mk<sect-start
        if self.__token_info == 'mi<mk<not-in-tbl' or\
            self.__token_info == 'mi<mk<sect-start' or\
            self.__token_info == 'mi<mk<sect-close' or\
            self.__token_info == 'mi<mk<body-close':
            self.__end_cell_func(line)
            self.__end_row_func(line)
            self.__close_table(line)
            self.__write_obj.write(line)
        elif self.__token_info == 'cw<tb<cell______':
            self.__end_cell_func(line)
        else:
            self.__write_obj.write(line)

    def __end_cell_func(self, line):
        """
        Requires:
            line
        Returns:
            nothing
        Logic:
            End the cell. Print out the closing marks. Pop the self.__state.
        """
        if len(self.__state) > 1:
            if self.__state[-1] == 'in_cell':
                self.__state.pop()
        self.__write_obj.write('mi<mk<close_cell\n')
        self.__write_obj.write('mi<tg<close_____<cell\n')
        self.__write_obj.write('mi<mk<closecell_\n')
|
||||
|
||||
def __in_row_func(self, line):
|
||||
if self.__token_info == 'mi<mk<not-in-tbl' or\
|
||||
self.__token_info == 'mi<mk<sect-start' or\
|
||||
self.__token_info == 'mi<mk<sect-close' or\
|
||||
self.__token_info == 'mi<mk<body-close':
|
||||
self.__end_row_func(line)
|
||||
self.__close_table(line)
|
||||
self.__write_obj.write(line)
|
||||
else:
|
||||
action = self.__in_row_dict.get(self.__token_info)
|
||||
if action:
|
||||
action(line)
|
||||
self.__write_obj.write(line)
|
||||
"""
|
||||
elif self.__token_info == 'mi<mk<pard-start':
|
||||
self.__start_cell_func(line)
|
||||
self.__write_obj.write(line)
|
||||
elif self.__token_info == 'cw<tb<row_______':
|
||||
self.__end_row_func(line)
|
||||
self.__write_obj.write(line)
|
||||
else:
|
||||
self.__write_obj.write(line)
|
||||
"""
|
||||
|
||||
    def __end_row_func(self, line):
        """
        Close the current row: pop the 'in_row' state and write a close tag,
        or write an empty row tag if no row was open. Update the per-table
        row count and cells-per-row statistics.
        """
        if len(self.__state) > 1 and self.__state[-1] == 'in_row':
            self.__state.pop()
            self.__write_obj.write('mi<tg<close_____<row\n')
        else:
            self.__write_obj.write('mi<tg<empty_____<row\n')
        self.__rows_in_table += 1
        if self.__cells_in_row > self.__max_number_cells_in_row:
            self.__max_number_cells_in_row = self.__cells_in_row
        self.__list_of_cells_in_row.append(self.__cells_in_row)

    def __empty_cell(self, line):
        """
        Required:
            line -- line of text
        Returns:
            nothing
        Logic:
            Write an empty tag with attributes if there are attributes.
            Otherwise, write an empty tag with cell as element.
        """
        if len(self.__cell_list) > 0:
            self.__write_obj.write('mi<tg<empty-att_<cell')
            cell_dict = self.__cell_list[-1]
            keys = cell_dict.keys()
            for key in keys:
                self.__write_obj.write('<%s>%s' % (key, cell_dict[key]))
            self.__write_obj.write('\n')
        else:
            self.__write_obj.write('mi<tg<empty_____<cell\n')
        self.__cells_in_table += 1
        self.__cells_in_row += 1
|
||||
|
||||
def __mode(self, the_list):
|
||||
"""
|
||||
Required:
|
||||
the_list -- a list of something
|
||||
Returns:
|
||||
the number that occurs the most
|
||||
Logic:
|
||||
get the count of each item in list. The count that is the greatest
|
||||
is the mode.
|
||||
"""
|
||||
max = 0
|
||||
mode = 'not-defined'
|
||||
for item in the_list:
|
||||
num_of_values = the_list.count(item)
|
||||
if num_of_values > max:
|
||||
mode = item
|
||||
max = num_of_values
|
||||
return mode
|
||||
|
||||
def make_table(self):
    """
    Requires:
        nothing
    Returns:
        self.__table_data -- one dictionary of attribute values per
        table found, for use at the beginning of each table.
    Logic:
        Read one line at a time and dispatch on the current state
        (the top of self.__state) through self.__state_dict.
    """
    self.__initiate_values()
    read_obj = open_for_read(self.__file)
    self.__write_obj = open_for_write(self.__write_to)
    line_to_read = 1
    while line_to_read:
        line_to_read = read_obj.readline()
        line = line_to_read
        self.__token_info = line[:16]
        action = self.__state_dict.get(self.__state[-1])
        if action is None:
            # Previously this fell through and crashed with a
            # TypeError ('NoneType' is not callable); report the
            # unknown state and skip the line instead.
            sys.stderr.write('No matching state in module table.py\n')
            sys.stderr.write(self.__state[-1] + '\n')
            continue
        action(line)
    read_obj.close()
    self.__write_obj.close()
    copy_obj = copy.Copy(bug_handler=self.__bug_handler)
    if self.__copy:
        copy_obj.copy_file(self.__write_to, "table.data")
    copy_obj.rename(self.__write_to, self.__file)
    os.remove(self.__write_to)
    return self.__table_data
|
||||
88
ebook_converter/ebooks/rtf2xml/table_info.py
Normal file
88
ebook_converter/ebooks/rtf2xml/table_info.py
Normal file
@@ -0,0 +1,88 @@
|
||||
from __future__ import unicode_literals, absolute_import, print_function, division
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import os
|
||||
from calibre.ebooks.rtf2xml import copy
|
||||
from calibre.ptempfile import better_mktemp
|
||||
from . import open_for_read, open_for_write
|
||||
|
||||
# note to self. This is the first module in which I use tempfile. A good idea?
|
||||
"""
|
||||
"""
|
||||
|
||||
|
||||
class TableInfo:
    """
    Insert table data for tables.
    Logic:
        Scan the token stream for table start/end markers and wrap each
        table in open/close <table> tags, attaching the attribute
        dictionary gathered earlier for that table to the opening tag.
    """

    def __init__(self,
                 in_file,
                 bug_handler,
                 table_data,
                 copy=None,
                 run_level=1,):
        """
        Required:
            'file'--file to parse
            'table_data' -- a dictionary for each table.
        Optional:
            'copy'-- whether to make a copy of result for debugging
            'temp_dir' --where to output temporary results (default is
            directory from which the script is run.)
        Returns:
            nothing
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__table_data = table_data
        self.__run_level = run_level
        self.__write_to = better_mktemp()
        # self.__write_to = 'table_info.data'

    def __open_table(self):
        """Write the opening <table> tag, with attributes when available."""
        if len(self.__table_data) > 0:
            # consume the attribute dictionary collected for this table
            table_dict = self.__table_data[0]
            pieces = ['mi<tg<open-att__<table']
            pieces.extend('<%s>%s' % (key, table_dict[key])
                          for key in table_dict.keys())
            pieces.append('\n')
            self.__write_obj.write(''.join(pieces))
            self.__table_data = self.__table_data[1:]
        else:
            # this shouldn't happen!
            if self.__run_level > 3:
                msg = 'Not enough data for each table\n'
                raise self.__bug_handler(msg)
            self.__write_obj.write('mi<tg<open______<table\n')

    def insert_info(self):
        """
        Copy the token file, inserting <table> open/close tags at the
        table start/end markers.
        """
        read_obj = open_for_read(self.__file)
        self.__write_obj = open_for_write(self.__write_to)
        for line in read_obj:
            if line == 'mi<mk<tabl-start\n':
                self.__open_table()
            elif line == 'mi<mk<table-end_\n':
                self.__write_obj.write('mi<tg<close_____<table\n')
            # the marker line itself is always passed through
            self.__write_obj.write(line)
        read_obj.close()
        self.__write_obj.close()
        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "table_info.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)
|
||||
218
ebook_converter/ebooks/rtf2xml/tokenize.py
Normal file
218
ebook_converter/ebooks/rtf2xml/tokenize.py
Normal file
@@ -0,0 +1,218 @@
|
||||
from __future__ import unicode_literals, absolute_import, print_function, division
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import os, re
|
||||
|
||||
from calibre.ebooks.rtf2xml import copy
|
||||
from calibre.utils.mreplace import MReplace
|
||||
from calibre.ptempfile import better_mktemp
|
||||
from polyglot.builtins import codepoint_to_chr, range, filter, map
|
||||
from . import open_for_read, open_for_write
|
||||
|
||||
|
||||
class Tokenize:
    """Tokenize RTF into one line per field. Each line will contain information useful for the rest of the script"""

    def __init__(self,
                 in_file,
                 bug_handler,
                 copy=None,
                 run_level=1,
                 # out_file = None,
                 ):
        """
        Required:
            'in_file' -- the RTF file to tokenize
            'bug_handler' -- exception class raised on internal errors
        Optional:
            'copy' -- whether to make a copy of result for debugging
            'run_level' -- verbosity/strictness level
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__write_to = better_mktemp()
        # self.__write_to = out_file
        self.__compile_expressions()
        # variables
        self.__uc_char = 0      # chars still to skip after a \u token
        self.__uc_bin = False   # next token is \bin data to drop
        self.__uc_value = [1]   # stack of \uc values, one per RTF group

    def __reini_utf8_counters(self):
        """Reset the per-token Unicode skip state."""
        self.__uc_char = 0
        self.__uc_bin = False

    def __remove_uc_chars(self, startchar, token):
        """Drop up to self.__uc_char chars from token, starting at startchar."""
        for i in range(startchar, len(token)):
            if self.__uc_char:
                self.__uc_char -= 1
            else:
                return token[i:]
        # if only char to skip
        return ''

    def __unicode_process(self, token):
        """
        Handle RTF Unicode: track \\uc scoping across groups, convert
        \\u tokens to XML character references, and skip the fallback
        characters that follow a \\u token.
        """
        # change scope in
        if token == r'\{':
            self.__uc_value.append(self.__uc_value[-1])
            # basic error handling
            self.__reini_utf8_counters()
            return token
        # change scope out
        elif token == r'\}':
            self.__uc_value.pop()
            self.__reini_utf8_counters()
            return token
        # add a uc control
        elif token[:3] == '\\uc':
            self.__uc_value[-1] = int(token[3:])
            self.__reini_utf8_counters()
            return token
        # bin data to skip
        elif self.__uc_bin:
            self.__uc_bin = False
            return ''
        # uc char to remove
        elif self.__uc_char:
            # handle \bin tag in case of uc char to skip
            # NOTE: this was "token[:4] == '\bin'" -- '\b' is a
            # backspace character, so the 4-char slice could never
            # equal the 3-char string and \bin was never detected here.
            if token[:4] == '\\bin':
                self.__uc_char -= 1
                self.__uc_bin = True
                return ''
            elif token[:1] == "\\":
                self.__uc_char -= 1
                return ''
            else:
                return self.__remove_uc_chars(0, token)
        # go for real \u token
        match_obj = self.__utf_exp.match(token)
        if match_obj is not None:
            self.__reini_utf8_counters()
            # get value and handle negative case
            uni_char = int(match_obj.group(1))
            uni_len = len(match_obj.group(0))
            if uni_char < 0:
                uni_char += 65536
            uni_char = codepoint_to_chr(uni_char).encode('ascii', 'xmlcharrefreplace').decode('ascii')
            self.__uc_char = self.__uc_value[-1]
            # there is only an unicode char
            if len(token) <= uni_len:
                return uni_char
            # an unicode char and something else
            # must be after as it is splited on \
            # necessary? maybe for \bin?
            elif not self.__uc_char:
                return uni_char + token[uni_len:]
            # if not uc0 and chars
            else:
                return uni_char + self.__remove_uc_chars(uni_len, token)
        # default
        return token

    def __sub_reg_split(self, input_file):
        """Apply the simple replacements, normalise old RTF quirks and
        split the whole file into a list of non-empty tokens."""
        input_file = self.__replace_spchar.mreplace(input_file)
        # this is for older RTF
        input_file = self.__par_exp.sub(r'\n\\par \n', input_file)
        input_file = self.__cwdigit_exp.sub(r"\g<1>\n\g<2>", input_file)
        input_file = self.__cs_ast.sub(r"\g<1>", input_file)
        input_file = self.__ms_hex_exp.sub(r"\\mshex0\g<1> ", input_file)
        input_file = self.__utf_ud.sub(r"\\{\\uc0 \g<1>\\}", input_file)
        # remove \n in bin data
        input_file = self.__bin_exp.sub(lambda x:
                                        x.group().replace('\n', '') + '\n', input_file)
        # split
        tokens = re.split(self.__splitexp, input_file)
        # remove empty tokens and \n
        return list(filter(lambda x: len(x) > 0 and x != '\n', tokens))

    def __compile_expressions(self):
        """Build the replacement table and all regexes used by tokenize()."""
        SIMPLE_RPL = {
            "\\\\": "\\backslash ",
            "\\~": "\\~ ",
            "\\;": "\\; ",
            # escape XML special characters so the intermediate output
            # stays well-formed (these were garbled to identity
            # mappings, which made them no-ops)
            "&": "&amp;",
            "<": "&lt;",
            ">": "&gt;",
            "\\_": "\\_ ",
            "\\:": "\\: ",
            "\\-": "\\- ",
            # turn into a generic token to eliminate special
            # cases and make processing easier
            "\\{": "\\ob ",
            # turn into a generic token to eliminate special
            # cases and make processing easier
            "\\}": "\\cb ",
            # put a backslash in front of to eliminate special cases and
            # make processing easier
            "{": "\\{",
            # put a backslash in front of to eliminate special cases and
            # make processing easier
            "}": "\\}",
        }
        self.__replace_spchar = MReplace(SIMPLE_RPL)
        # add ;? in case of char following \u
        self.__ms_hex_exp = re.compile(r"\\\'([0-9a-fA-F]{2})")
        self.__utf_exp = re.compile(r"\\u(-?\d{3,6}) ?")
        self.__bin_exp = re.compile(r"(?:\\bin(-?\d{0,10})[\n ]+)[01\n]+")
        # manage upr/ud situations
        self.__utf_ud = re.compile(r"\\{[\n ]?\\upr[\n ]?(?:\\{.*?\\})[\n ]?" +
                                   r"\\{[\n ]?\\*[\n ]?\\ud[\n ]?(\\{.*?\\})[\n ]?\\}[\n ]?\\}")
        # add \n in split for whole file reading
        # why keep backslash whereas \is replaced before?
        # remove \n from endline char
        self.__splitexp = re.compile(r"(\\[{}]|\n|\\[^\s\\{}&]+(?:[ \t\r\f\v])?)")
        # this is for old RTF
        self.__par_exp = re.compile(r'(\\\n+|\\ )')
        # handle improper cs char-style with \* before without {
        self.__cs_ast = re.compile(r'\\\*([\n ]*\\cs\d+[\n \\]+)')
        # handle cw using a digit as argument and without space as delimiter
        self.__cwdigit_exp = re.compile(r"(\\[a-zA-Z]+[\-0-9]+)([^0-9 \\]+)")

    def tokenize(self):
        """Main class for handling other methods. Reads the file,
        uses method self.__sub_reg_split to make basic substitutions
        and split into tokens, and processes Unicode tokens itself."""
        # read
        with open_for_read(self.__file) as read_obj:
            input_file = read_obj.read()

        # process simple replacements and split giving us a correct list
        # remove '' and \n in the process
        tokens = self.__sub_reg_split(input_file)
        # correct unicode
        tokens = map(self.__unicode_process, tokens)
        # remove empty items created by removing \uc
        tokens = list(filter(lambda x: len(x) > 0, tokens))

        # write
        with open_for_write(self.__write_to) as write_obj:
            write_obj.write('\n'.join(tokens))
        # Move and copy
        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "tokenize.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)
|
||||
|
||||
# self.__special_tokens = [ '_', '~', "'", '{', '}' ]
|
||||
|
||||
# import sys
|
||||
# def main(args=sys.argv):
|
||||
# if len(args) < 2:
|
||||
# print 'No file'
|
||||
# return
|
||||
# file = 'data_tokens.txt'
|
||||
# if len(args) == 3:
|
||||
# file = args[2]
|
||||
# to = Tokenize(args[1], Exception, out_file = file)
|
||||
# to.tokenize()
|
||||
|
||||
|
||||
# if __name__ == '__main__':
|
||||
# sys.exit(main())
|
||||
|
||||
# calibre-debug -e src/calibre/ebooks/rtf2xml/tokenize.py
|
||||
Reference in New Issue
Block a user