mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-01-02 16:54:12 +01:00
Here is the first batch of modules, which are needed for converting several formats to LRF. Some of the logic has been changed; more cleanups will follow.
569 lines
21 KiB
Python
569 lines
21 KiB
Python
from __future__ import absolute_import, division, print_function, unicode_literals
|
|
#########################################################################
|
|
# #
|
|
# #
|
|
# copyright 2002 Paul Henry Tremblay #
|
|
# #
|
|
# This program is distributed in the hope that it will be useful, #
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
|
# General Public License for more details. #
|
|
# #
|
|
# #
|
|
#########################################################################
|
|
import sys, os
|
|
|
|
from ebook_converter.ebooks.rtf2xml import copy, border_parse
|
|
from ebook_converter.ptempfile import better_mktemp
|
|
from ebook_converter.polyglot.builtins import unicode_type
|
|
|
|
from . import open_for_read, open_for_write
|
|
|
|
"""
|
|
States.
|
|
1. 'not_in_table'
|
|
1. 'cw<tb<row-def___' start a row definition
|
|
2. 'mi<mk<in-table__' start table
|
|
2. 'in_table'
|
|
1. 'mi<mk<pard-start', start of a row, cell
|
|
2. 'mi<mk<not-in-tbl', end the table.
|
|
3. 'cw<tb<row-def___' start a row definition
|
|
3. in_row_definition
|
|
1. 'mi<mk<not-in-tbl' : end the row definition. If in table, end the table.
|
|
2. 'mi<mk<pard-start' : end the row definition
|
|
if already in the table, start a row and cell.
|
|
3. 'cw<tb<row_______' : end the row definition, end the row
|
|
4. 'cw...' use another method to handle the control word
|
|
control word might be added to dictionary.
|
|
5. 'mi<mk<in-table__' If already in table, do nothing. Otherwise
|
|
start the table.
|
|
4. 'in_row'
|
|
1. 'mi<mk<pard-start', start cell
|
|
2. 'mi<mk<not-in-tbl' end table,
|
|
3. 'cw<tb<row_______' close row,
|
|
5. 'in_cell'
|
|
1. 'mi<mk<not-in-tbl', end table
|
|
2. 'cw<tb<cell______', end cell
|
|
"""
|
|
|
|
|
|
class Table:
    """
    Make tables.

    Logic:
        Read the tokenized file one line at a time and drive a small state
        machine. The current state is the last item of self.__state, which
        starts as ['not_in_table']. From there, look for either a table
        marker ('cw<tb<in-table__' / 'mi<mk<in-table__') or a row
        definition ('cw<tb<row-def___'). Every input line is written to the
        output; table, row and cell tags are inserted around them.
    """

    # Marker tokens that force whatever table is currently open to be closed.
    __table_end_tokens = (
        'mi<mk<not-in-tbl',
        'mi<mk<sect-start',
        'mi<mk<sect-close',
        'mi<mk<body-close',
    )

    def __init__(self,
                 in_file,
                 bug_handler,
                 copy=None,
                 run_level=1,):
        """
        Required:
            'in_file' -- path of the file to parse (it is overwritten with
                the result by make_table)
            'bug_handler' -- passed on to copy.Copy in make_table
        Optional:
            'copy' -- whether to make a copy of the result for debugging
            'run_level' -- stored on the instance; not used by this class
        Returns:
            nothing
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        # NOTE: the 'copy' parameter shadows the imported 'copy' module, but
        # only inside this method.
        self.__copy = copy
        self.__run_level = run_level
        self.__write_to = better_mktemp()

    def __initiate_values(self):
        """
        Initiate all values: the dispatch dictionaries, the state stack and
        the (empty) per-table / per-row bookkeeping.
        """
        self.__state_dict = {
            'in_table': self.__in_table_func,
            'in_row_def': self.__in_row_def_func,
            'not_in_table': self.__not_in_table_func,
            'in_cell': self.__in_cell_func,
            'in_row': self.__in_row_func,
        }
        self.__not_in_table_dict = {
            'cw<tb<row-def___': self.__found_row_def_func,
            'cw<tb<in-table__': self.__start_table_func,
            'mi<mk<in-table__': self.__start_table_func,
        }
        # can't use this dictionary. When in row_definition, many tokens
        # require multiple definitions
        self.__in_row_definition_dict = {
            'mi<mk<not-in-tbl': self.__end_row_table_func,
            'mi<mk<pard-start': self.__end_row_def_func,
        }
        self.__in_row_dict = {
            'mi<mk<not-in-tbl': self.__close_table,
            'mi<mk<pard-start': self.__start_cell_func,
            'cw<tb<row_______': self.__end_row_func,
            'cw<tb<cell______': self.__empty_cell,
        }
        # set the default state
        self.__state = ['not_in_table']
        # set empty data for all tables
        self.__table_data = []
        # just in case there is no table data
        self.__row_dict = {}
        self.__cell_list = []
        self.__cell_widths = []

    def __write_tag(self, tag, attributes):
        """
        Requires:
            tag -- the tag prefix, e.g. 'mi<tg<open-att__<row'
            attributes -- dictionary of attribute name => value
        Returns:
            nothing
        Logic:
            Write the tag followed by one '<key>value' pair per attribute
            and a closing newline.
        """
        pairs = ''.join('<%s>%s' % (key, value)
                        for key, value in attributes.items())
        self.__write_obj.write('%s%s\n' % (tag, pairs))

    def __in_table_func(self, line):
        """
        Requires:
            line -- line to parse
        Logic:
            Look for the end of the table. If found, close out the table.
            Look for 'mi<mk<pard-start', which marks the beginning of a row.
            Start a row and start a cell.
            A row definition switches to the row-definition state; a bare
            cell token starts a row holding an empty cell.
            The line itself is always written out afterwards.
        """
        token = self.__token_info
        if token in self.__table_end_tokens:
            self.__close_table(line)
        elif token == 'mi<mk<pard-start':
            self.__start_row_func(line)
            self.__start_cell_func(line)
        elif token == 'cw<tb<row-def___':
            self.__found_row_def_func(line)
        elif token == 'cw<tb<cell______':
            self.__start_row_func(line)
            self.__empty_cell(line)
        self.__write_obj.write(line)

    def __not_in_table_func(self, line):
        """
        Requires:
            line -- the line of text read in from document
        Returns:
            nothing
        Logic:
            The state is not in a table, so look for the tokens that mark
            the start of a table: 'cw<tb<row-def___', 'cw<tb<in-table__' or
            'mi<mk<in-table__'. If one is found, use another method to start
            a table or row definition and change states. In every case the
            line is written out.
        """
        action = self.__not_in_table_dict.get(self.__token_info)
        if action:
            action(line)
        self.__write_obj.write(line)

    def __close_table(self, line):
        """
        Requires:
            line -- line to parse (unused; kept for a uniform signature)
        Returns:
            nothing
        Logic:
            Write the end marker for the table.
            Set the state to ['not_in_table'].
            Record the summary statistics of the table that just ended in
            the last entry of self.__table_data.
        """
        self.__write_obj.write('mi<mk<table-end_\n')
        self.__state = ['not_in_table']
        summary = self.__table_data[-1]
        summary['number-of-columns'] = self.__max_number_cells_in_row
        summary['number-of-rows'] = self.__rows_in_table
        # Despite the key names, these are modes (most frequent values).
        summary['average-cells-per-row'] = self.__mode(
            self.__list_of_cells_in_row)
        summary['average-cell-width'] = self.__mode(self.__cell_widths)

    def __found_row_def_func(self, line):
        """
        Requires:
            line -- not needed except for consistency with other methods.
        Returns:
            nothing
        Logic:
            A row definition has been found. Push 'in_row_def' and reset the
            row and cell data that will be collected from the definition and
            used later when writing attributes for the row and its cells.
        """
        self.__state.append('in_row_def')
        self.__last_cell_position = 0
        self.__row_dict = {}
        # One empty dictionary is kept at the end of the list to receive the
        # attributes of the cell currently being defined.
        self.__cell_list = [{}]
        self.__cell_widths = []

    def __start_table_func(self, line):
        """
        Requires:
            line -- line to parse (unused; kept for a uniform signature)
        Returns:
            nothing
        Logic:
            Reset the per-table counters and add a new entry to
            self.__table_data.
            Write out the table marker.
            Add 'in_table' to the state list.
        """
        self.__rows_in_table = 0
        self.__cells_in_table = 0
        self.__cells_in_row = 0
        self.__max_number_cells_in_row = 0
        self.__table_data.append({})
        self.__list_of_cells_in_row = []
        self.__write_obj.write('mi<mk<tabl-start\n')
        self.__state.append('in_table')

    def __end_row_table_func(self, line):
        """
        Requires:
            line -- just for consistency
        Returns:
            nothing
        Logic:
            Close the table. Only referenced from
            self.__in_row_definition_dict, which is currently not consulted.
        """
        # The original passed self explicitly as well, which raised TypeError.
        self.__close_table(line)

    def __end_row_def_func(self, line):
        """
        Requires:
            line -- just for consistency
        Returns:
            nothing
        Logic:
            Change the state (pop 'in_row_def' if it is on top).
            Get rid of the last {} in the cell list.
            Figure out the number of cells based on
            self.__row_dict['widths'] ('122, 122') and store it as
            'number-of-cells'.
        """
        if self.__state and self.__state[-1] == 'in_row_def':
            self.__state.pop()
        # an extra {} is appended at the *end* of each cell position;
        # get rid of the trailing one
        self.__cell_list.pop()
        widths = self.__row_dict.get('widths')
        if widths:
            self.__row_dict['number-of-cells'] = len(widths.split(','))

    def __in_row_def_func(self, line):
        """
        Requires:
            line -- line to parse
        Returns:
            nothing
        Logic:
            In the text that defines a row. If a control word is found,
            handle the control word with another method.
            Check for tokens that will end this state: while in the row
            definition, certain tokens can end a row or end a table.
            If a paragraph definition (pard-start) is found, and you are
            already in a table, start a row and a cell.
            The line is written out in every case.
        """
        token = self.__token_info
        if token == 'cw<tb<row_______':
            # write tags
            self.__end_row_func(line)
            # change the state
            self.__end_row_def_func(line)
        elif line[0:2] == 'cw':
            self.__handle_row_token(line)
        elif token == 'mi<mk<not-in-tbl' and 'in_table' in self.__state:
            self.__end_row_def_func(line)
            self.__close_table(line)
        elif token == 'mi<mk<pard-start':
            self.__end_row_def_func(line)
            # if already in the table, start a row, then cell.
            # (The original compared the list itself with 0, which raises
            # TypeError on Python 3.)
            if self.__state and self.__state[-1] == 'in_table':
                self.__start_row_func(line)
                self.__start_cell_func(line)
        elif token == 'mi<mk<in-table__':
            self.__end_row_def_func(line)
            # if not in table, start a new table
            if self.__state and self.__state[-1] != 'in_table':
                self.__start_table_func(line)
        self.__write_obj.write(line)

    def __handle_row_token(self, line):
        """
        Requires:
            line -- line to parse
        Returns:
            nothing
        Logic:
            The tokens in the row definition contain the following
            information:
                1. row borders.
                2. cell borders for all cells in the row.
                3. cell positions for all cells in the row.
            Put all information about row borders into the row dictionary.
            Put all information about cell borders into the dictionary in
            the last item in the cell list. ([{border:something,
            width:something}, {border:something, width:something}])
            cw<bd<bor-t-r-to<nu<bdr-hair__|bdr-li-wid:0.50
        """
        if line[3:5] == 'bd':
            border_obj = border_parse.BorderParse()
            the_dict = border_obj.parse_border(line)
            # border-cell-top-hairline: a single cell-border key sends the
            # whole parsed dictionary to the current cell, otherwise it
            # belongs to the row.
            is_cell_border = any(key[0:11] == 'border-cell'
                                 for key in the_dict)
            if is_cell_border:
                target = self.__cell_list[-1]
            else:
                target = self.__row_dict
            for key in the_dict:
                target[key] = the_dict[key]
        # cw<tb<cell-posit<nu<216.00
        elif self.__token_info == 'cw<tb<cell-posit':
            self.__found_cell_position(line)
        # cw<tb<row-pos-le<nu<-5.40
        elif self.__token_info == 'cw<tb<row-pos-le':
            # the value follows the 20-character token prefix; drop the
            # trailing newline
            self.__row_dict['left-row-position'] = line[20:-1]
        elif self.__token_info == 'cw<tb<row-header':
            self.__row_dict['header'] = 'true'

    def __start_cell_func(self, line):
        """
        Required:
            line -- the line of text (unused; kept for a uniform signature)
        Returns:
            nothing
        Logic:
            Append 'in_cell' to the states.
            If self.__cell_list contains dictionaries, take the first one,
            write its key => value pairs as attributes of the cell tag and
            remove it from the list.
            Otherwise, write out a plain cell tag.
            Count the cell.
        """
        self.__state.append('in_cell')
        if self.__cell_list:
            # cells are consumed in definition order, hence from the front
            self.__write_tag('mi<tg<open-att__<cell', self.__cell_list[0])
            self.__cell_list.pop(0)
        else:
            self.__write_obj.write('mi<tg<open______<cell\n')
        self.__cells_in_table += 1
        self.__cells_in_row += 1

    def __start_row_func(self, line):
        """
        Required:
            line -- the line of text (unused; kept for a uniform signature)
        Returns:
            nothing
        Logic:
            Append 'in_row' to the states.
            Write the row tag with the key => value pairs of
            self.__row_dict as attributes.
            Reset the cells-in-row counter and count the row.
        """
        self.__state.append('in_row')
        self.__write_tag('mi<tg<open-att__<row', self.__row_dict)
        self.__cells_in_row = 0
        self.__rows_in_table += 1

    def __found_cell_position(self, line):
        """
        Needs:
            line -- current line
        Returns:
            nothing
        Logic:
            Calculate the cell width as the distance from the previous cell
            position. For the first cell (previous position is 0), the left
            row position is subtracted as well. (This value is often
            negative.)
            Next, set the new last_cell_position to the current cell
            position, record the width for the row and the current cell, and
            append a fresh dictionary for the next cell.
        """
        # cw<tb<cell-posit<nu<216.00
        new_cell_position = round(float(line[20:-1]), 2)
        left_position = 0
        if self.__last_cell_position == 0:
            left_position = float(
                self.__row_dict.get('left-row-position', 0))
        width = new_cell_position - self.__last_cell_position - left_position
        width = unicode_type('%.2f' % width)
        self.__last_cell_position = new_cell_position
        if self.__row_dict.get('widths'):
            self.__row_dict['widths'] += ', %s' % unicode_type(width)
        else:
            self.__row_dict['widths'] = unicode_type(width)
        self.__cell_list[-1]['width'] = width
        self.__cell_list.append({})
        self.__cell_widths.append(width)

    def __in_cell_func(self, line):
        """
        Required:
            line -- line to parse
        Returns:
            nothing
        Logic:
            In the middle of a cell.
            Look for the close of the table. If found, close the cell, the
            row and the table, then write the line.
            Look for the close of the cell. If found, use the close cell
            function to close out the cell (the token itself is not
            written).
            Otherwise, write out the line.
        """
        # cw<tb<cell______<nu<true
        # mi<mk<sect-start
        if self.__token_info in self.__table_end_tokens:
            self.__end_cell_func(line)
            self.__end_row_func(line)
            self.__close_table(line)
            self.__write_obj.write(line)
        elif self.__token_info == 'cw<tb<cell______':
            self.__end_cell_func(line)
        else:
            self.__write_obj.write(line)

    def __end_cell_func(self, line):
        """
        Requires:
            line -- unused; kept for a uniform signature
        Returns:
            nothing
        Logic:
            End the cell. Pop 'in_cell' from self.__state if it is on top,
            then write out the closing marks.
        """
        if len(self.__state) > 1 and self.__state[-1] == 'in_cell':
            self.__state.pop()
        self.__write_obj.write('mi<mk<close_cell\n')
        self.__write_obj.write('mi<tg<close_____<cell\n')
        self.__write_obj.write('mi<mk<closecell_\n')

    def __in_row_func(self, line):
        """
        Requires:
            line -- line to parse
        Returns:
            nothing
        Logic:
            In a row but not in a cell. A token that ends the table closes
            the row and the table. Otherwise look the token up in
            self.__in_row_dict (start a cell, end the row, or write an empty
            cell). The line is written out in every case.
        """
        if self.__token_info in self.__table_end_tokens:
            self.__end_row_func(line)
            self.__close_table(line)
        else:
            action = self.__in_row_dict.get(self.__token_info)
            if action:
                action(line)
        self.__write_obj.write(line)

    def __end_row_func(self, line):
        """
        Requires:
            line -- unused; kept for a uniform signature
        Returns:
            nothing
        Logic:
            If a row is open, pop 'in_row' and write the closing row tag.
            Otherwise write an empty row tag and count that row.
            Update the maximum number of cells per row and remember how many
            cells this row had.
        """
        if len(self.__state) > 1 and self.__state[-1] == 'in_row':
            self.__state.pop()
            self.__write_obj.write('mi<tg<close_____<row\n')
        else:
            self.__write_obj.write('mi<tg<empty_____<row\n')
            self.__rows_in_table += 1
        if self.__cells_in_row > self.__max_number_cells_in_row:
            self.__max_number_cells_in_row = self.__cells_in_row
        self.__list_of_cells_in_row.append(self.__cells_in_row)

    def __empty_cell(self, line):
        """
        Required:
            line -- line of text (unused; kept for a uniform signature)
        Returns:
            nothing
        Logic:
            Write an empty tag with attributes if there are attributes.
            Otherwise, write an empty tag with cell as element.
            Count the cell.
        """
        if self.__cell_list:
            # NOTE(review): unlike __start_cell_func this reads the *last*
            # dictionary and does not remove it -- confirm this is intended.
            self.__write_tag('mi<tg<empty-att_<cell', self.__cell_list[-1])
        else:
            self.__write_obj.write('mi<tg<empty_____<cell\n')
        self.__cells_in_table += 1
        self.__cells_in_row += 1

    def __mode(self, the_list):
        """
        Required:
            the_list -- a list of hashable items
        Returns:
            the item that occurs the most; on a tie, the one that appears
            first in the list; 'not-defined' for an empty list
        Logic:
            Count each item in a single pass, then pick the item with the
            greatest count.
        """
        counts = {}
        for item in the_list:
            counts[item] = counts.get(item, 0) + 1
        if not counts:
            return 'not-defined'
        # dicts keep insertion order, and max() returns the first maximal
        # key, so ties resolve to the earliest item in the list.
        return max(counts, key=counts.get)

    def make_table(self):
        """
        Requires:
            nothing
        Returns:
            A list with one dictionary of summary values per table found.
        Logic:
            Read one line in at a time. Determine what action to take based
            on the state. The result is written to a temporary file which
            then replaces the input file.
        """
        self.__initiate_values()
        read_obj = open_for_read(self.__file)
        try:
            self.__write_obj = open_for_write(self.__write_to)
            try:
                while True:
                    line = read_obj.readline()
                    if not line:
                        break
                    self.__token_info = line[:16]
                    action = self.__state_dict.get(self.__state[-1])
                    if action is None:
                        # Unknown state: report it and pass the line through
                        # unchanged instead of crashing on a None call.
                        sys.stderr.write(
                            'No matching state in module table.py\n')
                        sys.stderr.write(self.__state[-1] + '\n')
                        self.__write_obj.write(line)
                        continue
                    action(line)
            finally:
                self.__write_obj.close()
        finally:
            read_obj.close()
        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "table.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)
        return self.__table_data
|