mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-01-02 16:54:12 +01:00
Here is the first batch of modules, which are needed for converting several formats to LRF. Some of the logic has been changed; more cleanups will follow.
466 lines
18 KiB
Python
466 lines
18 KiB
Python
from __future__ import unicode_literals, absolute_import, print_function, division
|
|
#########################################################################
|
|
# #
|
|
# #
|
|
# copyright 2002 Paul Henry Tremblay #
|
|
# #
|
|
# This program is distributed in the hope that it will be useful, #
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
|
# General Public License for more details. #
|
|
# #
|
|
# #
|
|
#########################################################################
|
|
import sys, os, re
|
|
|
|
from ebook_converter.ebooks.rtf2xml import copy
|
|
from ebook_converter.ptempfile import better_mktemp
|
|
from ebook_converter.polyglot.builtins import unicode_type
|
|
|
|
from . import open_for_read, open_for_write
|
|
|
|
|
|
class MakeLists:
    """
    Form lists.

    Use RTF's own formatting to determine if a paragraph definition is part
    of a list.
    Use indents to determine items and how lists are nested.

    The input is processed line by line by a three-state machine:
        'default'    -- not inside any list
        'in_pard'    -- inside a list, in the middle of a paragraph definition
        'after_pard' -- inside a list, after a paragraph definition has ended;
                        lines are buffered until it is known whether the list
                        continues, nests, or ends
    """

    def __init__(self,
                 in_file,
                 bug_handler,
                 headings_to_sections,
                 list_of_lists,
                 copy=None,
                 run_level=1,
                 no_headings_as_list=1,
                 write_list_info=0,
                 ):
        """
        Required:
            in_file -- the file to read; make_lists() replaces its content
                with the result
            bug_handler -- handed on to copy.Copy by make_lists()
            headings_to_sections -- if true, paragraphs with a heading style
                are never treated as list items
            list_of_lists -- table describing the lists of the document (may
                be empty). The first element of every entry is a dictionary
                which holds the ids of the list under the key 'list-id'; the
                following elements are lists whose first element is a
                dictionary describing one level.
        Optional:
            copy -- whether to make a copy of the result for debugging
            run_level -- if greater than 0, a warning is written to stderr
                when a list id cannot be found in list_of_lists
            no_headings_as_list -- if true, paragraphs with a heading style
                are never treated as list items
            write_list_info -- if true, write all the attributes found in
                list_of_lists into the opening tag of each list
        Returns:
            nothing
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__run_level = run_level
        self.__no_headings_as_list = no_headings_as_list
        self.__headings_to_sections = headings_to_sections
        # NOTE: inside this method only, 'copy' is the flag, not the module.
        self.__copy = copy
        self.__write_to = better_mktemp()
        self.__list_of_lists = list_of_lists
        self.__write_list_info = write_list_info

    def __initiate_values(self):
        """
        Required:
            Nothing
        Return:
            Nothing
        Logic:
            Reset the state of the parser.
            The self.__end_list is a list of tokens that will force a list to
            end.
            Likewise, the self.__end_lines is a list of lines that forces a
            list to end (no handler in this class consults it at present).
        """
        self.__state = "default"
        self.__left_indent = 0
        self.__list_type = 'not-defined'
        self.__pard_def = ""
        # One dictionary ('id', 'left-indent') per currently open list,
        # outermost list first.
        self.__all_lists = []
        self.__level = 0
        # Lines buffered while in the 'after_pard' state.
        self.__list_chunk = ''
        self.__state_dict = {
            'default': self.__default_func,
            'in_pard': self.__in_pard_func,
            'after_pard': self.__after_pard_func,
        }
        self.__headings = [
            'heading 1', 'heading 2', 'heading 3', 'heading 4',
            'heading 5', 'heading 6', 'heading 7', 'heading 8',
            'heading 9'
        ]
        self.__allow_levels = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
        self.__style_name = ''
        self.__end_list = [
            'mi<mk<body-close',
            'mi<mk<par-in-fld',
            'cw<tb<cell______',
            'cw<tb<row-def___',
            'cw<tb<row_______',
            'mi<mk<sect-close',
            'mi<mk<sect-start',
            'mi<mk<header-beg',
            'mi<mk<header-end',
            'mi<mk<head___clo',
            'mi<mk<fldbk-end_',
            'mi<mk<close_cell',
            'mi<mk<footnt-ope',
            'mi<mk<foot___clo',
            'mi<mk<tabl-start',
            # 'mi<mk<sec-fd-beg',
        ]
        self.__end_lines = [
            'mi<tg<close_____<cell\n',
        ]
        self.__id_regex = re.compile(r'\<list-id\>(\d+)')
        self.__lv_regex = re.compile(r'\<list-level\>(\d+)')

    def __is_par_def(self, line):
        """
        Required:
            line -- the line of current text.
        Return:
            True if the line opens a paragraph definition.
        """
        return (self.__token_info == 'mi<tg<open-att__'
                and line[17:37] == 'paragraph-definition')

    def __update_level(self, line):
        """
        Required:
            line -- a line opening a paragraph definition.
        Return:
            Nothing
        Logic:
            Remember the list level of the line, if it has one. The level is
            kept as the string found in the line. If the line has no level,
            the previous value stays in effect.
        """
        level_match = self.__lv_regex.search(line)
        if level_match:
            self.__level = level_match.group(1)

    def __end_all_lists(self):
        """
        Required:
            Nothing
        Return:
            Nothing
        Logic:
            Close every open list, write the buffered lines and go back to
            the default state.
        """
        # An indent smaller than any real one makes __close_lists close
        # every level.
        self.__left_indent = -1000
        self.__close_lists()
        self.__write_obj.write(self.__list_chunk)
        self.__list_chunk = ''
        self.__state = 'default'

    def __in_pard_func(self, line):
        """
        Required:
            line -- the line of current text.
        Return:
            Nothing
        Logic:
            You are in a list, but in the middle of a paragraph definition.
            Don't do anything until you find the end of the paragraph
            definition.
        """
        if self.__token_info == 'mi<mk<pard-end__':
            self.__state = 'after_pard'
        self.__write_obj.write(line)

    def __after_pard_func(self, line):
        """
        Required:
            line -- the line of current text.
        Return:
            Nothing
        Logic:
            You are in a list, but after a paragraph definition. You have to
            determine if the next paragraph definition ends a list, continues
            the old one, or starts a new one.
            Look for a paragraph definition. If one is found and its style is
            a heading (see self.__is_a_heading), end all lists.
            Otherwise, if the paragraph definition contains a list-id, use
            the method self.__list_after_par_def_func to determine the action.
            If the paragraph definition does not contain a list-id, use the
            method self.__close_lists to close out items and lists for a
            paragraph that is not indented.
            If a bigger block is found (such as a section or a cell), end all
            lists.
            If no special line is found, add each line to a buffer.
        """
        if self.__is_par_def(line):
            id_match = self.__id_regex.search(line)
            if self.__is_a_heading():
                self.__end_all_lists()
                self.__write_obj.write(line)
            elif id_match:  # found list-id
                self.__update_level(line)
                self.__list_after_par_def_func(line, id_match.group(1))
                self.__write_obj.write(line)
                self.__state = 'in_pard'
            else:  # normal paragraph with no list id
                self.__close_lists()
                self.__write_obj.write(self.__list_chunk)
                self.__list_chunk = ''
                self.__write_obj.write(line)
                # The paragraph stays inside a list if it is indented more
                # than at least one open list.
                self.__state = 'in_pard' if self.__all_lists else 'default'
        elif self.__token_info in self.__end_list:  # block that ends lists
            self.__end_all_lists()
            self.__write_obj.write(line)
        else:
            self.__list_chunk += line

    def __list_after_par_def_func(self, line, list_id):
        """
        Required:
            line -- the line of current text.
            list_id -- the id of the current list
        Return:
            Nothing
        Logic:
            You have found the end of a paragraph definition, and have found
            another paragraph definition with a list id.
            If the list-id is different from the last paragraph definition,
            close out the lists with another method, write the string in the
            buffer and start a new list.
            If the list id is the same as the last one, check the indent on
            the current paragraph definition. If it is greater than the
            previous one, do not end the current list or item. Start a new
            list. Otherwise end the current item and start the next one.
        """
        innermost = self.__all_lists[-1]
        if list_id != innermost['id']:
            self.__close_lists()
            self.__write_obj.write(self.__list_chunk)
            self.__write_start_list(list_id)
        elif self.__left_indent > innermost['left-indent']:
            self.__write_obj.write(self.__list_chunk)
            self.__write_start_list(list_id)
        else:
            self.__write_end_item()
            self.__write_obj.write(self.__list_chunk)
            self.__write_start_item()
        self.__list_chunk = ''

    def __close_lists(self):
        """
        Required:
            Nothing
        Return:
            Nothing
        Logic:
            Iterate through the open lists, innermost first, and get the
            indent for each list. If the current indent is less than or equal
            to the indent of the list, close that level.
            Keep track of how many levels you close. Remove that many levels
            from the end of self.__all_lists.
        """
        current_indent = self.__left_indent
        num_levels_closed = 0
        for open_list in reversed(self.__all_lists):
            if current_indent <= open_list['left-indent']:
                self.__write_end_item()
                self.__write_end_list()
                num_levels_closed += 1
        # Deleting from a computed start index is safe when nothing was
        # closed (a negative slice such as [:-0] would not be).
        del self.__all_lists[len(self.__all_lists) - num_levels_closed:]

    def __write_end_list(self):
        """
        Required:
            Nothing
        Return:
            Nothing
        Logic:
            Write the end of a list.
        """
        self.__write_obj.write('mi<tg<close_____<list\n')
        self.__write_obj.write('mi<mk<list_close\n')

    def __write_start_list(self, list_id):
        """
        Required:
            list_id -- the id of the current list.
        Return:
            Nothing
        Logic:
            Write the start of a list and add the id and left-indent to the
            self.__all_lists list.
            Write cues of when a list starts for later processing.
            In order to determine the type of list, look the id up in
            self.__list_of_lists (see self.__get_index_of_list). The first
            element of the matching entry is the dictionary of the list; the
            element at position level + 1 holds, as its first element, the
            dictionary of the level. The 'numbering-type' of the level
            decides between 'unordered' ('bullet') and 'ordered' (anything
            else).
            If there is no table or no matching id, use the list type most
            recently read from the input.
        """
        write = self.__write_obj.write
        self.__all_lists.append(
            {'left-indent': self.__left_indent, 'id': list_id})
        write('mi<mk<list_start\n')
        # bogus levels are sometimes written for empty paragraphs
        if unicode_type(self.__level) not in self.__allow_levels:
            lev_num = '0'
        else:
            lev_num = self.__level
        write('mi<tg<open-att__<list<list-id>%s<level>%s' % (list_id, lev_num))
        list_dict = {}
        level_dict = {}
        list_type = self.__list_type
        if self.__list_of_lists:  # older RTF won't generate a list_of_lists
            index_of_list = self.__get_index_of_list(list_id)
            if index_of_list is not None:  # found a matching id
                curlist = self.__list_of_lists[index_of_list]
                list_dict = curlist[0]
                # Use the last described level if the requested one is
                # beyond the end of the table.
                level = min(int(self.__level) + 1, len(curlist) - 1)
                level_dict = curlist[level][0]
                if level_dict.get('numbering-type') == 'bullet':
                    list_type = 'unordered'
                else:
                    list_type = 'ordered'
        write('<list-type>%s' % list_type)
        # if you want to dump all the info to the list, rather than
        # keeping it in the table above, change self.__write_list_info
        # to true.
        if self.__list_of_lists and self.__write_list_info and list_dict:
            for the_key in list_dict:
                if the_key == 'list-id':
                    continue
                write('<%s>%s' % (the_key, list_dict[the_key]))
            for the_key in level_dict:
                write('<%s>%s' % (the_key, level_dict[the_key]))
        write('\n')
        write('mi<mk<liststart_\n')
        self.__write_start_item()

    def __get_index_of_list(self, list_id):
        """
        Requires:
            list_id -- id of current paragraph-definition
        Returns:
            the index of the entry of list_of_lists (the table passed to this
            class) that contains the id, or None if there is none.
        Logic:
            Iterate through the table and get the first item of each entry,
            the dictionary. Once the id is found under 'list-id', return the
            index of the entry. Entries without a 'list-id' are skipped.
            If no match is found, print out an error message (only if the
            run level is greater than 0).
        """
        # some RTF use 0 indexed list. Don't know what to do?
        if list_id == '0':
            return None
        for the_index, entry in enumerate(self.__list_of_lists):
            # 'or ()' guards against an entry that has no 'list-id' at all.
            if list_id in (entry[0].get('list-id') or ()):
                return the_index
        if self.__run_level > 0:
            sys.stderr.write(
                'Module is make_lists.py\n'
                'Method is __get_index_of_list\n'
                'The main list does not appear to have a matching id for %s \n'
                % (list_id)
            )
        return None

    def __write_start_item(self):
        """
        Required:
            Nothing
        Return:
            Nothing
        Logic:
            Write the start of an item.
        """
        self.__write_obj.write('mi<mk<item_start\n')
        self.__write_obj.write('mi<tg<open______<item\n')
        self.__write_obj.write('mi<mk<itemstart_\n')

    def __write_end_item(self):
        """
        Required:
            Nothing
        Return:
            Nothing
        Logic:
            Write the end of an item.
        """
        self.__write_obj.write('mi<tg<item_end__\n')
        self.__write_obj.write('mi<tg<close_____<item\n')
        self.__write_obj.write('mi<tg<item__end_\n')

    def __default_func(self, line):
        """
        Required:
            line -- the line of current text.
        Returns:
            Nothing
        Logic:
            Look for the start of a paragraph definition. If one is found
            and it is not a heading, check if it contains a list-id. If it
            does, start a list. Change the state to in_pard.
            The line itself is always written.
        """
        if self.__is_par_def(line) and not self.__is_a_heading():
            id_match = self.__id_regex.search(line)
            if id_match:
                self.__state = 'in_pard'
                self.__update_level(line)
                self.__write_start_list(id_match.group(1))
        self.__write_obj.write(line)

    def __is_a_heading(self):
        """
        Required:
            Nothing
        Returns:
            1 if the current style is one of 'heading 1' to 'heading 9' and
            either headings_to_sections or no_headings_as_list is set;
            0 otherwise.
        """
        if self.__style_name not in self.__headings:
            return 0
        if self.__headings_to_sections or self.__no_headings_as_list:
            return 1
        return 0

    def __get_indent(self, line):
        """
        Required:
            line -- the line of current text.
        Returns:
            Nothing
        Logic:
            If the line is a left-indent marker, remember its value (the
            text after the 17th character, without the line end) as a float.
        """
        if self.__token_info == 'mi<mk<left_inden':
            self.__left_indent = float(line[17:-1])

    def __get_list_type(self, line):
        """
        Required:
            line -- the line of current text.
        Returns:
            Nothing
        Logic:
            If the line is a list-type marker, remember the type. The type
            'item' is stored as 'unordered'.
        """
        if self.__token_info == 'mi<mk<list-type_':  # <ordered
            self.__list_type = line[17:-1]
            if self.__list_type == 'item':
                self.__list_type = "unordered"

    def __get_style_name(self, line):
        """
        Required:
            line -- the line of current text.
        Returns:
            Nothing
        Logic:
            If the line is a style-name marker, remember the name.
        """
        if self.__token_info == 'mi<mk<style-name':
            self.__style_name = line[17:-1]

    def __process_line(self, line):
        """
        Required:
            line -- the line of current text.
        Returns:
            Nothing
        Logic:
            The first 16 characters of a line are its token. Update the
            indent, list type and style name from the line, then pass the
            line to the function of the current state.
        """
        self.__token_info = line[:16]
        self.__get_indent(line)
        self.__get_list_type(line)
        self.__get_style_name(line)
        self.__state_dict[self.__state](line)

    def __flush_at_eof(self):
        """
        Required:
            Nothing
        Returns:
            Nothing
        Logic:
            If the input ends while lines are still buffered (no token that
            ends a list was seen after the last list paragraph), close the
            open lists and write the buffer, so that no input is lost and the
            list tags are balanced.
        """
        if self.__state == 'after_pard':
            self.__end_all_lists()

    def make_lists(self):
        """
        Required:
            nothing
        Returns:
            nothing; the original file will be changed
        Logic:
            Read the file line by line and write the result to a temporary
            file. If a copy was requested, copy the result to
            "make_lists.data". Replace the content of the original file with
            the result and remove the temporary file.
        """
        self.__initiate_values()
        read_obj = open_for_read(self.__file)
        try:
            self.__write_obj = open_for_write(self.__write_to)
            try:
                while True:
                    line = read_obj.readline()
                    if not line:  # end of file
                        break
                    self.__process_line(line)
                self.__flush_at_eof()
            finally:
                # Release both handles even if a handler raised.
                self.__write_obj.close()
        finally:
            read_obj.close()
        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "make_lists.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)
|