mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-01-02 00:22:25 +01:00
537 lines
20 KiB
Python
537 lines
20 KiB
Python
#########################################################################
|
|
# #
|
|
# #
|
|
# copyright 2002 Paul Henry Tremblay #
|
|
# #
|
|
# This program is distributed in the hope that it will be useful, #
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
|
# General Public License for more details. #
|
|
# #
|
|
# #
|
|
#########################################################################
|
|
import sys, os
|
|
|
|
from ebook_converter.ebooks.rtf2xml import copy
|
|
from ebook_converter.ptempfile import better_mktemp
|
|
|
|
from . import open_for_read, open_for_write
|
|
|
|
|
|
class Sections:
    """
    =================
    Purpose
    =================
    Write section tags for a tokenized file. (This module won't be of any use
    to you unless you use it as part of the other modules.)
    ---------------
    logic
    ---------------
    The tags for the first section breaks have already been written.
    RTF stores section breaks with the \\sect tag. Each time this tag is
    encountered, add one to the counter.
    When I encounter the \\sectd tag, I want to collect all the appropriate
    tokens that describe the section. When I reach a \\pard, I know I can stop
    collecting tokens and write the section tags.
    The exception to this method occurs when sections occur in field blocks,
    such as the index. Normally, two section breaks occur within the index and
    other field-blocks. (If fewer or more section breaks occur, this code may
    not work.)
    I want the sections to occur outside of the index. That is, the index
    should be nested inside one section tag. After the index is complete, a
    new section should begin.
    In order to write the sections outside of the field blocks, I have to
    store all of the field block as a string. When I encounter the \\sect tag,
    add one to the section counter, but store this number in a list. Likewise,
    store the information describing the section in another list.
    When I reach the end of the field block, choose the first item from the
    numbered list as the section number. Choose the first item in the
    description list as the values and attributes of the section. Enclose the
    field string between the section tags.
    Start a new section outside the field-block strings. Use the second number
    in the list; use the second item in the description list.
    CHANGE (2004-04-26) No longer write sections that occur in field-blocks.
    Instead, ignore all section information in a field-block.
    """

    def __init__(self,
                 in_file,
                 bug_handler,
                 copy=None,
                 run_level=1):
        """
        Required:
            'in_file'--path of the tokenized file to parse
            'bug_handler'--exception class raised on internal inconsistencies
                (it is also handed to the copy helper)
        Optional:
            'copy'-- whether to make a copy of result for debugging
            'run_level'--strictness; above 3 an unexpected state raises
                bug_handler instead of being tolerated
        Returns:
            nothing
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__run_level = run_level
        # All output goes to a temporary file first; make_sections() moves it
        # over the input file once parsing has finished.
        self.__write_to = better_mktemp()

    def __initiate_values(self):
        """
        Reset every piece of parser state and build the dispatch tables that
        map a state name (or a 16-character token) to its handler.
        """
        self.__mark_start = 'mi<mk<sect-start\n'
        self.__mark_end = 'mi<mk<sect-end__\n'
        self.__in_field = 0
        self.__section_values = {}
        self.__list_of_sec_values = []
        self.__field_num = []
        self.__section_num = 0
        self.__state = 'before_body'
        self.__found_first_sec = 0
        self.__text_string = ''
        self.__field_instruction_string = ''
        # Initialised here so the field-block branch of __section_def_func
        # can never hit an undefined attribute.
        self.__sec_in_field_string = ''
        self.__state_dict = {
            'before_body': self.__before_body_func,
            'body': self.__body_func,
            'before_first_sec': self.__before_first_sec_func,
            'section': self.__section_func,
            'section_def': self.__section_def_func,
            'sec_in_field': self.__sec_in_field_func,
        }
        # Token layout example: cw<sc<sect-defin<nu<true
        self.__body_dict = {
            'cw<sc<section___': self.__found_section_func,
            'mi<mk<sec-fd-beg': self.__found_sec_in_field_func,
            'cw<sc<sect-defin': self.__found_section_def_bef_sec_func,
        }
        self.__section_def_dict = {
            'cw<pf<par-def___': (self.__end_sec_def_func, None),
            'mi<mk<body-open_': (self.__end_sec_def_func, None),
            'cw<tb<columns___': (self.__attribute_func, 'columns'),
            'cw<pa<margin-lef': (self.__attribute_func, 'margin-left'),
            'cw<pa<margin-rig': (self.__attribute_func, 'margin-right'),
            'mi<mk<header-ind': (self.__end_sec_def_func, None),
            # Premature endings: text, or control words implying text, seen
            # before the paragraph definition.
            'tx<nu<__________': (self.__end_sec_premature_func, None),
            'cw<ci<font-style': (self.__end_sec_premature_func, None),
            'cw<ci<font-size_': (self.__end_sec_premature_func, None),
        }
        # Since 2004-04-26 sections and section definitions inside a field
        # block are ignored, so only the end-of-field marker is dispatched.
        self.__sec_in_field_dict = {
            'mi<mk<sec-fd-end': self.__end_sec_in_field_func,
        }

    def __open_section_tag(self):
        """
        Return the opening section tag for the current section number,
        without attributes and without the trailing newline.
        """
        number = str(self.__section_num)
        return (
            'mi<tg<open-att__<section<num>%s'
            '<num-in-level>%s'
            '<type>rtf-native'
            '<level>0'
            % (number, number)
        )

    def __found_section_def_func(self, line):
        """
        Required:
            line -- the line to parse
        Returns:
            nothing
        Logic:
            I have found a section definition. Change the state to
            section_def (so subsequent lines will be processed as part of
            the section definition), and clear the section_values dictionary.
        """
        self.__state = 'section_def'
        self.__section_values.clear()

    def __attribute_func(self, line, name):
        """
        Required:
            line -- the line to be parsed
            name -- the changed, readable name (as opposed to the
            abbreviated one)
        Returns:
            nothing
        Logic:
            I need to add the right data to the section values dictionary so
            I can retrieve it later. The attribute (or key) is the name; the
            value is the last part of the text string.
            ex: cw<tb<columns___<nu<2
        """
        # 16-character token + '<nu<' == 20; the final character is the
        # newline.
        self.__section_values[name] = line[20:-1]

    def __found_section_func(self, line):
        """
        Requires:
            line -- the line to parse
        Returns:
            nothing
        Logic:
            I have found the beginning of a section, so change the state
            accordingly. Also add one to the section counter.
        """
        self.__state = 'section'
        self.__write_obj.write(line)
        self.__section_num += 1

    def __found_section_def_bef_sec_func(self, line):
        """
        Requires:
            line -- the line to parse
        Returns:
            nothing
        Logic:
            I have found a section definition that was not preceded by a
            section break. Add one to the section counter and start
            collecting the definition.
        """
        self.__section_num += 1
        self.__found_section_def_func(line)
        self.__write_obj.write(line)

    def __section_func(self, line):
        """
        Requires:
            line --the line to parse
        Returns:
            nothing
        Logic:
            After a section break, wait for the section definition. Every
            line is passed through to the output.
        """
        if self.__token_info == 'cw<sc<sect-defin':
            self.__found_section_def_func(line)
        self.__write_obj.write(line)

    def __section_def_func(self, line):
        """
        Required:
            line --line to parse
        Returns:
            nothing
        Logic:
            I have found a section definition. Check if the line is the end
            of the definition (a paragraph definition), or if it contains
            info that should be added to the values dictionary. The handler
            runs first so that any section tags precede the line itself.
        """
        action, name = self.__section_def_dict.get(
            self.__token_info, (None, None))
        if action:
            action(line, name)
            if self.__in_field:
                self.__sec_in_field_string += line
                return
        self.__write_obj.write(line)

    def __end_sec_def_func(self, line, name):
        """
        Requires:
            line --the line to parse
            name --changed, readable name
        Returns:
            nothing
        Logic:
            The end of the section definition has been found. Reset the
            state. Call on the write_section method.
        """
        self.__state = 'sec_in_field' if self.__in_field else 'body'
        self.__write_section(line)

    def __end_sec_premature_func(self, line, name):
        """
        Requires:
            line --the line to parse
            name --changed, readable name
        Returns:
            nothing
        Logic:
            Text or control words indicating text have been found
            before \\pard. This should indicate older RTF. Reset the state.
            Write the section definition. Insert a paragraph definition.
            Insert {} to mark the end of a paragraph definition.
        """
        self.__state = 'sec_in_field' if self.__in_field else 'body'
        self.__write_section(line)
        self.__write_obj.write('cw<pf<par-def___<nu<true\n')
        self.__write_obj.write('ob<nu<open-brack<0000\n')
        self.__write_obj.write('cb<nu<clos-brack<0000\n')

    def __write_section(self, line):
        """
        Requires:
            line -- the current line (unused; kept for the handler signature)
        Returns:
            nothing
        Logic:
            Form a string of attributes and values. If you are not in a
            field block, write this string to the output file. Otherwise,
            call on the handle_sec_def method to handle this string.
        """
        pieces = [self.__mark_start]
        if self.__found_first_sec:
            # A section is already open; close it before opening the next.
            pieces.append('mi<tg<close_____<section\n')
        else:
            self.__found_first_sec = 1
        pieces.append(self.__open_section_tag())
        for key, value in self.__section_values.items():
            pieces.append('<%s>%s' % (key, value))
        pieces.append('\n')
        pieces.append(self.__mark_end)
        my_string = ''.join(pieces)
        if self.__state == 'body':
            self.__write_obj.write(my_string)
        elif self.__state == 'sec_in_field':
            self.__handle_sec_def(my_string)
        elif self.__run_level > 3:
            msg = 'missed a flag\n'
            raise self.__bug_handler(msg)

    def __handle_sec_def(self, my_string):
        """
        Requires:
            my_string -- the string of attributes and values (unused)
        Returns:
            nothing
        Logic:
            I need to append the dictionary of attributes and values to a
            list so I can use it later when I reach the end of the
            field-block.
        """
        # Store a snapshot: the live dictionary is cleared as soon as the
        # next section definition starts.
        self.__list_of_sec_values.append(dict(self.__section_values))

    def __body_func(self, line):
        """
        Requires:
            line --the line to parse
        Returns:
            nothing
        Logic:
            Look for the beginning of a section. Otherwise, print the line
            to the output file.
        """
        action = self.__body_dict.get(self.__token_info)
        if action:
            action(line)
        else:
            self.__write_obj.write(line)

    def __before_body_func(self, line):
        """
        Requires:
            line --line to parse
        Returns:
            nothing
        Logic:
            Look for the beginning of the body. Always print out the line.
        """
        if self.__token_info == 'mi<mk<body-open_':
            self.__state = 'before_first_sec'
        self.__write_obj.write(line)

    def __before_first_sec_func(self, line):
        """
        Requires:
            line -- line to parse
        Returns:
            nothing
        Logic:
            Look for the beginning of the first section. This can be
            \\sectd, but in older RTF it could be a paragraph definition or
            plain text. The line itself is always written last.
        """
        token = self.__token_info
        if token == 'cw<sc<sect-defin':
            self.__state = 'section_def'
            self.__section_num += 1
            self.__section_values.clear()
        elif token in ('cw<pf<par-def___', 'tx<nu<__________'):
            self.__state = 'body'
            self.__section_num += 1
            self.__write_obj.write(self.__open_section_tag() + '\n')
            if token == 'tx<nu<__________':
                # Text with no paragraph definition: supply one, using the
                # same '<nu<' token layout as everywhere else in the file.
                self.__write_obj.write('cw<pf<par-def___<nu<true\n')
            self.__found_first_sec = 1
        self.__write_obj.write(line)

    def __found_sec_in_field_func(self, line):
        """
        Requires:
            line --line to parse
        Returns:
            nothing
        Logic:
            I have found the beginning of a field that has a section (or
            really, two) inside of it. Change the state, and remember the
            opening line. Note that the line is not written to the output.
        """
        self.__state = 'sec_in_field'
        self.__sec_in_field_string = line
        self.__in_field = 1

    def __sec_in_field_func(self, line):
        """
        Requires:
            line --the line to parse
        Returns:
            nothing
        Logic:
            Check for the end of the field.
            CHANGED (2004-04-26)! Just print out each line. Ignore any
            sections or section definition info.
        """
        action = self.__sec_in_field_dict.get(self.__token_info)
        if action:
            action(line)
        else:
            self.__write_obj.write(line)

    def __end_sec_in_field_func(self, line):
        """
        Requires:
            line --line to parse
        Returns:
            nothing
        Logic:
            The field block is finished. Since 2004-04-26 no section tags
            are written around the field; just go back to the body state
            and pass the line through.
        """
        self.__state = 'body'
        self.__in_field = 0
        self.__write_obj.write(line)

    def __print_field_sec_attributes(self):
        """
        Requires:
            nothing
        Returns:
            nothing
        Logic:
            Get the number and dictionary of values from the lists. The
            number and dictionary will be the first item of each list.
            Write the close tag. Write the start tag. Write the attributes
            and values in the dictionary. Get rid of the first item in each
            list.
            Not called from anywhere in this class since the 2004-04-26
            change.
        """
        num = self.__field_num[0]
        self.__field_num = self.__field_num[1:]
        self.__write_obj.write(
            'mi<tg<close_____<section\n'
            'mi<tg<open-att__<section<num>%s' % str(num)
        )
        if self.__list_of_sec_values:
            # NOTE(review): each attribute is followed by a newline, which
            # splits the tag across lines -- confirm before reviving this.
            for key, value in self.__list_of_sec_values[0].items():
                self.__write_obj.write('<%s>%s\n' % (key, value))
            self.__list_of_sec_values = self.__list_of_sec_values[1:]
        self.__write_obj.write('<level>0')
        self.__write_obj.write('<type>rtf-native')
        self.__write_obj.write('<num-in-level>%s' % str(self.__section_num))
        self.__write_obj.write('\n')

    def __found_section_in_field_func(self, line):
        """
        Requires:
            line --line to parse
        Returns:
            nothing
        Logic:
            I have found a section in a field block. Add one to section
            counter, and append this number to a list.
            Not dispatched since the 2004-04-26 change.
        """
        self.__section_num += 1
        self.__field_num.append(self.__section_num)
        self.__sec_in_field_string += line

    def __found_section_def_in_field_func(self, line):
        """
        Requires:
            line --line to parse
        Returns:
            nothing
        Logic:
            I have found a section definition in a field block. Change the
            state and clear the values dictionary.
            Not dispatched since the 2004-04-26 change.
        """
        self.__state = 'section_def'
        self.__section_values.clear()

    def __process_line(self, line):
        """
        Requires:
            line --one tokenized line, including its newline
        Returns:
            nothing
        Logic:
            The first 16 characters identify the token. Hand the line to
            the handler for the current state. An unknown state is reported
            on stderr; the line is then passed through unchanged (or, at
            run levels above 3, the bug handler is raised) rather than
            failing on a call to None.
        """
        self.__token_info = line[:16]
        action = self.__state_dict.get(self.__state)
        if action is None:
            sys.stderr.write('no matching state in module sections.py\n')
            sys.stderr.write(self.__state + '\n')
            if self.__run_level > 3:
                raise self.__bug_handler(
                    'no matching state in module sections.py\n')
            self.__write_obj.write(line)
            return
        action(line)

    def make_sections(self):
        """
        Requires:
            nothing
        Returns:
            nothing (changes the original file)
        Logic:
            Read one line in at a time. Determine what action to take based
            on the state. If the state is before the body, look for the
            beginning of the body.
            If the state is body, send the line to the body method.
            Both files are closed even when a handler raises. Afterwards the
            temporary result is handed to the copy helper's rename() with
            the input path as the target, and the temporary file is removed.
        """
        self.__initiate_values()
        read_obj = open_for_read(self.__file)
        try:
            self.__write_obj = open_for_write(self.__write_to)
            try:
                while True:
                    line = read_obj.readline()
                    if not line:
                        break
                    self.__process_line(line)
            finally:
                self.__write_obj.close()
        finally:
            read_obj.close()
        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "sections.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)
|