mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-01-06 19:44:12 +01:00
Initial import
This commit is contained in:
460
ebook_converter/ebooks/rtf2xml/fields_small.py
Normal file
460
ebook_converter/ebooks/rtf2xml/fields_small.py
Normal file
@@ -0,0 +1,460 @@
|
||||
from __future__ import unicode_literals, absolute_import, print_function, division
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import sys, os, re
|
||||
|
||||
from calibre.ebooks.rtf2xml import field_strings, copy
|
||||
from calibre.ptempfile import better_mktemp
|
||||
from . import open_for_read, open_for_write
|
||||
|
||||
|
||||
class FieldsSmall:
    """
    =================
    Purpose
    =================
    Write tags for bookmarks, index and toc entry fields in a tokenized file.
    This module does not handle toc or index tables. (This module won't be any
    use to you unless you use it as part of the other modules.)
    -----------
    Method
    -----------
    Look for the beginning of a bookmark, index, or toc entry. When such a
    token is found, store the opening bracket count in a variable. Collect
    all the text until the matching closing bracket is found. Turn the
    collected text into a single field tag and write that tag to the output
    file.

    Every input line is treated as one token: the first 16 characters
    identify the token, and for text tokens the payload starts at index 17.
    """

    def __init__(self,
                 in_file,
                 bug_handler,
                 copy=None,
                 run_level=1,
                 ):
        """
        Required:
            'in_file'--file to parse
            'bug_handler'--handed on to the helper objects created by this
            class (field_strings.FieldStrings and copy.Copy)
        Optional:
            'copy'-- whether to make a copy of result for debugging
            'run_level'-- stored on the instance; not read by this class
        Returns:
            nothing
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__write_to = better_mktemp()
        self.__run_level = run_level

    def __initiate_values(self):
        """
        Initiate all values.

        Called at the start of fix_fields, so each run begins in the
        'before_body' state with an empty text buffer.
        """
        self.__string_obj = field_strings.FieldStrings(bug_handler=self.__bug_handler)
        self.__state = 'before_body'
        self.__text_string = ''
        # Marker line written immediately before every generated field tag.
        self.__marker = 'mi<mk<inline-fld\n'
        # state name -> handler called with each input line
        self.__state_dict = {
            'before_body': self.__before_body_func,
            'body': self.__body_func,
            'bookmark': self.__bookmark_func,
            'toc_index': self.__toc_index_func,
        }
        # body token -> (handler, tag) for tokens that open a field
        self.__body_dict = {
            'cw<an<book-mk-st': (self.__found_bookmark_func, 'start'),
            'cw<an<book-mk-en': (self.__found_bookmark_func, 'end'),
            'cw<an<toc_______': (self.__found_toc_index_func, 'toc'),
            'cw<an<index-mark': (self.__found_toc_index_func, 'index'),
        }
        # NOTE(review): this pattern is compiled but not referenced anywhere
        # else in this class -- confirm before removing.
        ob = 'ob<nu<open-brack.....'
        cb = 'cb<nu<clos-brack'
        bk_st = 'cw<an<book-mk-st<nu<true'
        tx = 'tx<nu<__________<(.*?)'
        reg_st = ob + bk_st + tx + cb
        self.__book_start = re.compile(reg_st)

    def __before_body_func(self, line):
        """
        Requires:
            line --the line to parse
        Returns:
            nothing
        Logic:
            Look for the beginning of the body. When found, change the state
            to body. Always print out the line.
        """
        if self.__token_info == 'mi<mk<body-open_':
            self.__state = 'body'
        self.__write_obj.write(line)

    def __body_func(self, line):
        """
        Requires:
            line --the line to parse
        Returns:
            nothing
        Logic:
            This function handles all the lines in the body of the document.
            Look for a bookmark, index or toc entry and take the appropriate
            action. Any other line is written out unchanged.
        """
        action, tag = self.__body_dict.get(self.__token_info, (None, None))
        if action:
            action(line, tag)
        else:
            self.__write_obj.write(line)

    def __found_bookmark_func(self, line, tag):
        """
        Requires:
            line --the line to parse
            tag --'start' or 'end', the kind of bookmark found
        Returns:
            nothing
        Logic:
            This function is called when a bookmark is found. The opening
            bracket count is stored in the beginning bracket count. The
            state is changed to 'bookmark'. The line itself is not written.
        """
        # NOTE(review): __ob_count is only set once an open-bracket token has
        # been read; this presumably always precedes a bookmark -- confirm.
        self.__beg_bracket_count = self.__ob_count
        # Reset so the end-of-field test cannot match a stale closing count.
        self.__cb_count = 0
        self.__state = 'bookmark'
        self.__type_of_bookmark = tag

    def __bookmark_func(self, line):
        """
        Requires:
            line --the line to parse
        Returns:
            nothing
        Logic:
            This function handles all lines within a bookmark. It adds the
            text of each text token to a string until the closing bracket
            that matches the opening bracket is found. It then builds the
            field tag from that string and prints out the marker, the tag
            and the closing line. Non-text lines inside the bookmark are
            dropped.
        """
        if self.__beg_bracket_count == self.__cb_count:
            self.__state = 'body'
            field_type = 'bookmark-%s' % self.__type_of_bookmark
            field_line = self.__parse_bookmark_func(
                self.__text_string, field_type)
            self.__write_obj.write(self.__marker)
            self.__write_obj.write(field_line)
            self.__text_string = ''
            self.__write_obj.write(line)
        elif line[0:2] == 'tx':
            # Keep the payload only: skip the 17-character token prefix and
            # the final character of the line.
            self.__text_string += line[17:-1]

    def __parse_index_func(self, my_string):
        """
        Requires:
            my_string --string to parse (the lines collected for the entry)
        Returns:
            A string for an index instruction field.
        Logic:
            I want to eliminate paragraph endings, and I want to divide the
            entry into a main entry and (if it exists) a sub entry.
            First pull out the "see" text and the bookmark text, and note
            whether the entry is marked bold or italic.
            Split the string by newlines. Read one token at a time. If the
            token is a special colon, end the main entry and start the sub
            entry. Only text tokens contribute to either entry, so
            paragraph endings and other tokens are ignored.
        """
        my_string, see_string = self.__index_see_func(my_string)
        my_string, bookmark_string = self.__index_bookmark_func(my_string)
        italics, bold = self.__index__format_func(my_string)
        found_sub = 0
        my_changed_string = 'mi<tg<empty-att_<field<type>index-entry'
        my_changed_string += '<update>static'
        if see_string:
            my_changed_string += '<additional-text>%s' % see_string
        if bookmark_string:
            my_changed_string += '<bookmark>%s' % bookmark_string
        if italics:
            my_changed_string += '<italics>true'
        if bold:
            my_changed_string += '<bold>true'
        main_parts = []
        sub_parts = []
        for line in my_string.split('\n'):
            token_info = line[:16]
            if token_info == 'cw<ml<colon_____':
                found_sub = 1
            elif token_info[0:2] == 'tx':
                if found_sub:
                    sub_parts.append(line[17:])
                else:
                    main_parts.append(line[17:])
        my_changed_string += '<main-entry>%s' % ''.join(main_parts)
        if found_sub:
            my_changed_string += '<sub-entry>%s' % ''.join(sub_parts)
        my_changed_string += '\n'
        return my_changed_string

    def __index_see_func(self, my_string):
        """
        Requires:
            my_string --all the lines of the index entry
        Returns:
            changed_string --the lines outside the "see" group, each
            followed by a newline
            see_string --the text found inside the "see" group
        Logic:
            Track the bracket depth. An index-see token starts the group;
            the group ends at the closing bracket that brings the depth to
            one less than it was at the index-see token. Inside the group
            only the text is kept (in see_string); the closing bracket line
            itself is dropped.
        """
        in_see = 0
        bracket_count = 0
        see_parts = []
        kept_lines = []
        end_bracket_count = sys.maxsize
        for line in my_string.split('\n'):
            token_info = line[:16]
            if token_info == 'ob<nu<open-brack':
                bracket_count += 1
            if token_info == 'cb<nu<clos-brack':
                bracket_count -= 1
            if in_see:
                if (bracket_count == end_bracket_count
                        and token_info == 'cb<nu<clos-brack'):
                    in_see = 0
                elif token_info == 'tx<nu<__________':
                    see_parts.append(line[17:])
            else:
                if token_info == 'cw<in<index-see_':
                    end_bracket_count = bracket_count - 1
                    in_see = 1
                kept_lines.append('%s\n' % line)
        return ''.join(kept_lines), ''.join(see_parts)

    def __index_bookmark_func(self, my_string):
        """
        Requires:
            my_string -- string in all the index
        Returns:
            index_string -- string minus the bookmark_string
            bookmark_string -- the text string of the book mark
        Logic:
            Same bracket tracking as for the "see" group, but started by a
            place token. Text inside the group goes to bookmark_string;
            every other line, including the closing bracket, is kept in
            index_string.
        """
        # cw<an<place_____<nu<true
        in_bookmark = 0
        bracket_count = 0
        bookmark_parts = []
        kept_lines = []
        end_bracket_count = sys.maxsize
        for line in my_string.split('\n'):
            token_info = line[:16]
            if token_info == 'ob<nu<open-brack':
                bracket_count += 1
            if token_info == 'cb<nu<clos-brack':
                bracket_count -= 1
            if in_bookmark:
                if (bracket_count == end_bracket_count
                        and token_info == 'cb<nu<clos-brack'):
                    in_bookmark = 0
                    kept_lines.append('%s\n' % line)
                elif token_info == 'tx<nu<__________':
                    bookmark_parts.append(line[17:])
                else:
                    kept_lines.append('%s\n' % line)
            else:
                if token_info == 'cw<an<place_____':
                    end_bracket_count = bracket_count - 1
                    in_bookmark = 1
                kept_lines.append('%s\n' % line)
        return ''.join(kept_lines), ''.join(bookmark_parts)

    def __index__format_func(self, my_string):
        """
        Requires:
            my_string -- the lines of the index entry
        Returns:
            italics, bold -- each 1 if the matching token occurs, else 0
        """
        italics = 0
        bold = 0
        for line in my_string.split('\n'):
            token_info = line[:16]
            if token_info == 'cw<in<index-bold':
                bold = 1
            if token_info == 'cw<in<index-ital':
                italics = 1
        return italics, bold

    def __parse_toc_func(self, my_string):
        """
        Requires:
            my_string -- all the string in the toc
        Returns:
            modified string (one toc-entry field tag ending in a newline)
        Logic:
            Pull out any bookmark text first. Then read one token at a
            time: text tokens make up the main entry, a toc-level token
            supplies the level, and a toc-sup-nu token sets the
            suppress-number attribute.
        """
        toc_level = ''
        toc_suppress = 0
        my_string, book_start_string, book_end_string = \
            self.__parse_bookmark_for_toc(my_string)
        my_changed_string = 'mi<tg<empty-att_<field<type>toc-entry'
        my_changed_string += '<update>static'
        if book_start_string:
            my_changed_string += '<bookmark-start>%s' % book_start_string
        if book_end_string:
            my_changed_string += '<bookmark-end>%s' % book_end_string
        main_parts = []
        for line in my_string.split('\n'):
            token_info = line[:16]
            if token_info[0:2] == 'tx':
                main_parts.append(line[17:])
            if token_info == 'cw<tc<toc-level_':
                # The level value follows the 20-character prefix.
                toc_level = line[20:]
            if token_info == 'cw<tc<toc-sup-nu':
                toc_suppress = 1
        if toc_level:
            my_changed_string += '<toc-level>%s' % toc_level
        if toc_suppress:
            my_changed_string += '<toc-suppress-number>true'
        my_changed_string += '<main-entry>%s' % ''.join(main_parts)
        my_changed_string += '\n'
        return my_changed_string

    def __parse_bookmark_for_toc(self, my_string):
        """
        Requires:
            my_string --string of toc, with new lines
        Returns:
            toc_string -- string minus the bookmark text
            book_start_string -- text of bookmark-start groups
            book_end_string -- text of bookmark-end groups
        Logic:
            Track the bracket depth. A bookmark start or end token opens a
            group that ends at the closing bracket bringing the depth to one
            less than at that token. Text inside the group goes to the
            start or end string; every other line is kept in toc_string.
        """
        in_bookmark = 0
        bracket_count = 0
        start_parts = []
        end_parts = []
        book_type = 0
        kept_lines = []
        end_bracket_count = sys.maxsize
        for line in my_string.split('\n'):
            token_info = line[:16]
            if token_info == 'ob<nu<open-brack':
                bracket_count += 1
            if token_info == 'cb<nu<clos-brack':
                bracket_count -= 1
            if in_bookmark:
                if (bracket_count == end_bracket_count
                        and token_info == 'cb<nu<clos-brack'):
                    in_bookmark = 0
                    kept_lines.append('%s\n' % line)
                elif token_info == 'tx<nu<__________':
                    if book_type == 'start':
                        start_parts.append(line[17:])
                    elif book_type == 'end':
                        end_parts.append(line[17:])
                else:
                    kept_lines.append('%s\n' % line)
            else:
                if token_info == 'cw<an<book-mk-st':
                    book_type = 'start'
                elif token_info == 'cw<an<book-mk-en':
                    book_type = 'end'
                if token_info in ('cw<an<book-mk-st', 'cw<an<book-mk-en'):
                    end_bracket_count = bracket_count - 1
                    in_bookmark = 1
                kept_lines.append('%s\n' % line)
        return ''.join(kept_lines), ''.join(start_parts), ''.join(end_parts)

    def __parse_bookmark_func(self, my_string, field_type):
        """
        Requires:
            my_string --string to parse
            field_type --type of string
        Returns:
            A string formatted for a field instruction.
        Logic:
            The type is the name (either bookmark-end or bookmark-start).
            The id is the complete text string.
        """
        return ('mi<tg<empty-att_<field<type>%s'
                '<number>%s<update>none\n' % (field_type, my_string))

    def __found_toc_index_func(self, line, tag):
        """
        Requires:
            line --the line to parse
            tag --'toc' or 'index', the kind of entry found
        Returns:
            nothing
        Logic:
            This function is called when a toc or index entry is found. The
            opening bracket count is stored in the beginning bracket count.
            The state is changed to 'toc_index'. The line itself is not
            written.
        """
        # NOTE(review): __ob_count is only set once an open-bracket token has
        # been read; this presumably always precedes an entry -- confirm.
        self.__beg_bracket_count = self.__ob_count
        # Reset so the end-of-field test cannot match a stale closing count.
        self.__cb_count = 0
        self.__state = 'toc_index'
        self.__tag = tag

    def __toc_index_func(self, line):
        """
        Requires:
            line --the line to parse
        Returns:
            nothing
        Raises:
            ValueError if the stored tag is neither 'index' nor 'toc'.
        Logic:
            This function handles all lines within a toc or index entry. It
            adds each line to a string until the closing bracket that
            matches the opening bracket is found. It then builds the field
            tag from that string and prints out the marker, the tag and the
            closing line.
        """
        if self.__beg_bracket_count != self.__cb_count:
            self.__text_string += line
            return
        if self.__tag == 'index':
            field_line = self.__parse_index_func(self.__text_string)
        elif self.__tag == 'toc':
            field_line = self.__parse_toc_func(self.__text_string)
        else:
            # Previously this fell through to an unbound local variable.
            raise ValueError(
                'fields_small.py: unknown field tag %r' % (self.__tag,))
        self.__state = 'body'
        self.__write_obj.write(self.__marker)
        self.__write_obj.write(field_line)
        self.__text_string = ''
        self.__write_obj.write(line)

    def __process_line(self, line):
        """
        Requires:
            line --one line (token) read from the input file
        Returns:
            nothing
        Logic:
            Record the token type and the latest bracket counts, then hand
            the line to the handler for the current state. If there is no
            handler for the state, report it on stderr and write the line
            through unchanged rather than calling a missing handler.
        """
        self.__token_info = line[:16]
        # The count is the four characters before the line's last character.
        if self.__token_info == 'ob<nu<open-brack':
            self.__ob_count = line[-5:-1]
        if self.__token_info == 'cb<nu<clos-brack':
            self.__cb_count = line[-5:-1]
        action = self.__state_dict.get(self.__state)
        if action is None:
            sys.stderr.write('No matching state in module fields_small.py\n')
            sys.stderr.write(self.__state + '\n')
            self.__write_obj.write(line)
            return
        action(line)

    def fix_fields(self):
        """
        Requires:
            nothing
        Returns:
            nothing (changes the original file)
        Logic:
            Read one line in at a time. Determine what action to take based
            on the state. If the state is before the body, look for the
            beginning of the body.
            The other states are body, toc_index (for toc and index
            entries) and bookmark.
            The result is written to a temporary file, which then replaces
            the original file and is removed.
        """
        self.__initiate_values()
        with open_for_read(self.__file) as read_obj:
            with open_for_write(self.__write_to) as self.__write_obj:
                for line in read_obj:
                    self.__process_line(line)
        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "fields_small.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)
|
||||
Reference in New Issue
Block a user