mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-01-02 08:32:26 +01:00
378 lines
15 KiB
Python
378 lines
15 KiB
Python
#########################################################################
|
|
# #
|
|
# #
|
|
# copyright 2002 Paul Henry Tremblay #
|
|
# #
|
|
# This program is distributed in the hope that it will be useful, #
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
|
# General Public License for more details. #
|
|
# #
|
|
# #
|
|
#########################################################################
|
|
import sys, os
|
|
from ebook_converter.ebooks.rtf2xml import field_strings, copy
|
|
from ebook_converter.ptempfile import better_mktemp
|
|
from . import open_for_read, open_for_write
|
|
|
|
|
|
class FieldsLarge:
|
|
r"""
|
|
=========================
|
|
Logic
|
|
=========================
|
|
Make tags for fields.
|
|
-Fields reflect text that Microsoft Word automatically generates.
|
|
-Each file contains (or should contain) an inner group called field instructions.
|
|
-Fields can be nested.
|
|
--------------
|
|
Logic
|
|
--------------
|
|
1. As soon as a field is found, make a new text string by appending an empty
|
|
text string to the field list. Collect all the lines in this string until the
|
|
field instructions are found.
|
|
2. Collect all the tokens and text in the field instructions. When the end of
|
|
the field instructions is found, process the string of text with the
|
|
field_strings module. Append the processed string to the field instructins
|
|
list.
|
|
3. Continue collecting tokens. Check for paragraphs or sections. If either is found, add to the paragraph or section list.
|
|
4. Continue collecting tokens and text either the beginning of a new field is found, or the end of this field is found.
|
|
5. If a new field is found, repeat steps 1-3.
|
|
6. If the end of the field is found, process the last text string of the field list.
|
|
7. If the field list is empty (after removing the last text string), there are
|
|
no more fields. Print out the final string. If the list contains other strings,
|
|
add the processed string to the last string in the field list.
|
|
============================
|
|
Examples
|
|
============================
|
|
This line of RTF:
|
|
{\field{\*\fldinst { CREATEDATE \\* MERGEFORMAT }}{\fldrslt {
|
|
\lang1024 1/11/03 10:34 PM}}}
|
|
Becomes:
|
|
<field type = "insert-time">
|
|
10:34 PM
|
|
</field>
|
|
The simple field in the above example conatins no paragraph or sections breaks.
|
|
This line of RTF:
|
|
{{\field{\*\fldinst SYMBOL 97 \\f "Symbol" \\s 12}{\fldrslt\f3\fs24}}}
|
|
Becomes:
|
|
<para><inline font-size="18"><inline font-style="Symbol">Χ</inline></inline></para>
|
|
The RTF in the example above should be represented as UTF-8 rather than a field.
|
|
This RTF:
|
|
{\field\fldedit{\*\fldinst { TOC \\o "1-3" }}{\fldrslt {\lang1024
|
|
Heading one\tab }{\field{\*\fldinst {\lang1024 PAGEREF _Toc440880424
|
|
\\h }{\lang1024 {\*\datafield
|
|
{\lang1024 1}}}{\lang1024 \par }\pard\plain
|
|
\s18\li240\widctlpar\tqr\tldot\tx8630\aspalpha\aspnum\faauto\adjustright\rin0\lin240\itap0
|
|
\f4\lang1033\cgrid {\lang1024 Heading 2\tab }{\field{\*\fldinst
|
|
{\lang1024 PAGEREF _Toc440880425 \\h }{\lang1024 {\*\datafield
|
|
{\lang1024 1}}}{\lang1024 \par }\pard\plain
|
|
\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0
|
|
\f4\lang1033\cgrid }}\pard\plain
|
|
\widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0
|
|
\f4\lang1033\cgrid {\fs28 \\u214\'85 \par }{\fs36 {\field{\*\fldinst
|
|
SYMBOL 67 \\f "Symbol" \\s 18}{\fldrslt\f3\fs36}}}
|
|
Becomes:
|
|
<field-block type="table-of-contents">
|
|
<paragraph-definition language="1033" nest-level="0"
|
|
font-style="Times" name="toc 1" adjust-right="true"
|
|
widow-control="true">
|
|
<para><inline language="1024">Heading one	</inline><field
|
|
type="reference-to-page" ref="_Toc440880424"><inline
|
|
language="1024">1</inline></field></para>
|
|
</paragraph-definition>
|
|
<paragraph-definition language="1033" nest-level="0" left-indent="12"
|
|
font-style="Times" name="toc 2" adjust-right="true"
|
|
widow-control="true">
|
|
<para><inline language="1024">Heading 2	</inline><field
|
|
type="reference-to-page" ref="_Toc440880425"><inline
|
|
language="1024">1</inline></field></para>
|
|
</paragraph-definition>
|
|
</field-block>
|
|
"""
|
|
|
|
def __init__(self,
|
|
in_file,
|
|
bug_handler,
|
|
copy=None,
|
|
run_level=1,
|
|
):
|
|
"""
|
|
Required:
|
|
'file'--file to parse
|
|
Optional:
|
|
'copy'-- whether to make a copy of result for debugging
|
|
'temp_dir' --where to output temporary results (default is
|
|
directory from which the script is run.)
|
|
Returns:
|
|
nothing
|
|
"""
|
|
self.__file = in_file
|
|
self.__bug_handler = bug_handler
|
|
self.__copy = copy
|
|
self.__run_level = run_level
|
|
self.__write_to = better_mktemp()
|
|
|
|
def __initiate_values(self):
|
|
"""
|
|
Initiate all values.
|
|
"""
|
|
self.__text_string = ''
|
|
self.__field_instruction_string = ''
|
|
self.__marker = 'mi<mk<inline-fld\n'
|
|
self.__state = 'before_body'
|
|
self.__string_obj = field_strings.FieldStrings(run_level=self.__run_level,
|
|
bug_handler=self.__bug_handler,)
|
|
self.__state_dict = {
|
|
'before_body' : self.__before_body_func,
|
|
'in_body' : self.__in_body_func,
|
|
'field' : self.__in_field_func,
|
|
'field_instruction' : self.__field_instruction_func,
|
|
}
|
|
self.__in_body_dict = {
|
|
'cw<fd<field_____' : self.__found_field_func,
|
|
}
|
|
self.__field_dict = {
|
|
'cw<fd<field-inst' : self.__found_field_instruction_func,
|
|
'cw<fd<field_____' : self.__found_field_func,
|
|
'cw<pf<par-end___' : self.__par_in_field_func,
|
|
'cw<sc<section___' : self.__sec_in_field_func,
|
|
}
|
|
self.__field_count = [] # keep track of the brackets
|
|
self.__field_instruction = [] # field instruction strings
|
|
self.__symbol = 0 # wheter or not the field is really UTF-8
|
|
# (these fields cannot be nested.)
|
|
self.__field_instruction_string = '' # string that collects field instruction
|
|
self.__par_in_field = [] # paragraphs in field?
|
|
self.__sec_in_field = [] # sections in field?
|
|
self.__field_string = [] # list of field strings
|
|
|
|
def __before_body_func(self, line):
|
|
"""
|
|
Requried:
|
|
line --line ro parse
|
|
Returns:
|
|
nothing (changes an instant and writes a line)
|
|
Logic:
|
|
Check for the beginninf of the body. If found, changed the state.
|
|
Always write out the line.
|
|
"""
|
|
if self.__token_info == 'mi<mk<body-open_':
|
|
self.__state = 'in_body'
|
|
self.__write_obj.write(line)
|
|
|
|
def __in_body_func(self, line):
|
|
"""
|
|
Required:
|
|
line --line to parse
|
|
Returns:
|
|
nothing. (Writes a line to the output file, or performs other actions.)
|
|
Logic:
|
|
Check of the beginning of a field. Always output the line.
|
|
"""
|
|
action = self.__in_body_dict.get(self.__token_info)
|
|
if action:
|
|
action(line)
|
|
self.__write_obj.write(line)
|
|
|
|
def __found_field_func(self, line):
|
|
"""
|
|
Requires:
|
|
line --line to parse
|
|
Returns:
|
|
nothing
|
|
Logic:
|
|
Set the values for parseing the field. Four lists have to have
|
|
items appended to them.
|
|
"""
|
|
self.__state = 'field'
|
|
self.__cb_count = 0
|
|
ob_count = self.__ob_count
|
|
self.__field_string.append('')
|
|
self.__field_count.append(ob_count)
|
|
self.__sec_in_field.append(0)
|
|
self.__par_in_field.append(0)
|
|
|
|
def __in_field_func(self, line):
|
|
"""
|
|
Requires:
|
|
line --line to parse
|
|
Returns:
|
|
nothing.
|
|
Logic:
|
|
Check for the end of the field; a paragaph break; a section break;
|
|
the beginning of another field; or the beginning of the field
|
|
instruction.
|
|
"""
|
|
if self.__cb_count == self.__field_count[-1]:
|
|
self.__field_string[-1] += line
|
|
self.__end_field_func()
|
|
else:
|
|
action = self.__field_dict.get(self.__token_info)
|
|
if action:
|
|
action(line)
|
|
else:
|
|
self.__field_string[-1] += line
|
|
|
|
def __par_in_field_func(self, line):
|
|
"""
|
|
Requires:
|
|
line --line to parse
|
|
Returns:
|
|
nothing
|
|
Logic:
|
|
Write the line to the output file and set the last item in the
|
|
paragraph in field list to true.
|
|
"""
|
|
self.__field_string[-1] += line
|
|
self.__par_in_field[-1] = 1
|
|
|
|
def __sec_in_field_func(self, line):
|
|
"""
|
|
Requires:
|
|
line --line to parse
|
|
Returns:
|
|
nothing
|
|
Logic:
|
|
Write the line to the output file and set the last item in the
|
|
section in field list to true.
|
|
"""
|
|
self.__field_string[-1] += line
|
|
self.__sec_in_field[-1] = 1
|
|
|
|
def __found_field_instruction_func(self, line):
|
|
"""
|
|
Requires:
|
|
line -- line to parse
|
|
Returns:
|
|
nothing
|
|
Change the state to field instruction. Set the open bracket count of
|
|
the beginning of this field so you know when it ends. Set the closed
|
|
bracket count to 0 so you don't prematureley exit this state.
|
|
"""
|
|
self.__state = 'field_instruction'
|
|
self.__field_instruction_count = self.__ob_count
|
|
self.__cb_count = 0
|
|
|
|
def __field_instruction_func(self, line):
|
|
"""
|
|
Requires:
|
|
line --line to parse
|
|
Returns:
|
|
nothing
|
|
Logic:
|
|
Collect all the lines until the end of the field is reached.
|
|
Process these lines with the module rtr.field_strings.
|
|
Check if the field instruction is 'Symbol' (really UTF-8).
|
|
"""
|
|
if self.__cb_count == self.__field_instruction_count:
|
|
# The closing bracket should be written, since the opening bracket
|
|
# was written
|
|
self.__field_string[-1] += line
|
|
my_list = self.__string_obj.process_string(
|
|
self.__field_instruction_string, 'field_instruction')
|
|
instruction = my_list[2]
|
|
self.__field_instruction.append(instruction)
|
|
if my_list[0] == 'Symbol':
|
|
self.__symbol = 1
|
|
self.__state = 'field'
|
|
self.__field_instruction_string = ''
|
|
else:
|
|
self.__field_instruction_string += line
|
|
|
|
def __end_field_func(self):
|
|
"""
|
|
Requires:
|
|
nothing
|
|
Returns:
|
|
Nothing
|
|
Logic:
|
|
Pop the last values in the instructions list, the fields list, the
|
|
paragaph list, and the section list.
|
|
If the field is a symbol, do not write the tags <field></field>,
|
|
since this field is really just UTF-8.
|
|
If the field contains paragraph or section breaks, it is a
|
|
field-block rather than just a field.
|
|
Write the paragraph or section markers for later parsing of the
|
|
file.
|
|
If the filed list contains more strings, add the latest
|
|
(processed) string to the last string in the list. Otherwise,
|
|
write the string to the output file.
|
|
"""
|
|
last_bracket = self.__field_count.pop()
|
|
instruction = self.__field_instruction.pop()
|
|
inner_field_string = self.__field_string.pop()
|
|
sec_in_field = self.__sec_in_field.pop()
|
|
par_in_field = self.__par_in_field.pop()
|
|
# add a closing bracket, since the closing bracket is not included in
|
|
# the field string
|
|
if self.__symbol:
|
|
inner_field_string = '%scb<nu<clos-brack<%s\n' % \
|
|
(instruction, last_bracket)
|
|
elif sec_in_field or par_in_field:
|
|
inner_field_string = \
|
|
'mi<mk<fldbkstart\n'\
|
|
'mi<tg<open-att__<field-block<type>%s\n%s'\
|
|
'mi<mk<fldbk-end_\n' \
|
|
'mi<tg<close_____<field-block\n'\
|
|
'mi<mk<fld-bk-end\n' \
|
|
% (instruction, inner_field_string)
|
|
# write a marker to show an inline field for later parsing
|
|
else:
|
|
inner_field_string = \
|
|
'%s' \
|
|
'mi<tg<open-att__<field<type>%s\n%s'\
|
|
'mi<tg<close_____<field\n'\
|
|
% (self.__marker, instruction, inner_field_string)
|
|
if sec_in_field:
|
|
inner_field_string = 'mi<mk<sec-fd-beg\n' + inner_field_string + \
|
|
'mi<mk<sec-fd-end\n'
|
|
if par_in_field:
|
|
inner_field_string = 'mi<mk<par-in-fld\n' + inner_field_string
|
|
if len(self.__field_string) == 0:
|
|
self.__write_field_string(inner_field_string)
|
|
else:
|
|
self.__field_string[-1] += inner_field_string
|
|
self.__symbol = 0
|
|
|
|
def __write_field_string(self, the_string):
|
|
self.__state = 'in_body'
|
|
self.__write_obj.write(the_string)
|
|
|
|
def fix_fields(self):
|
|
"""
|
|
Requires:
|
|
nothing
|
|
Returns:
|
|
nothing (changes the original file)
|
|
Logic:
|
|
Read one line in at a time. Determine what action to take based on
|
|
the state. If the state is before the body, look for the
|
|
beginning of the body.
|
|
If the state is body, send the line to the body method.
|
|
"""
|
|
self.__initiate_values()
|
|
read_obj = open_for_read(self.__file)
|
|
self.__write_obj = open_for_write(self.__write_to)
|
|
line_to_read = 1
|
|
while line_to_read:
|
|
line_to_read = read_obj.readline()
|
|
line = line_to_read
|
|
self.__token_info = line[:16]
|
|
if self.__token_info == 'ob<nu<open-brack':
|
|
self.__ob_count = line[-5:-1]
|
|
if self.__token_info == 'cb<nu<clos-brack':
|
|
self.__cb_count = line[-5:-1]
|
|
action = self.__state_dict.get(self.__state)
|
|
if action is None:
|
|
sys.stderr.write('no no matching state in module styles.py\n')
|
|
sys.stderr.write(self.__state + '\n')
|
|
action(line)
|
|
read_obj.close()
|
|
self.__write_obj.close()
|
|
copy_obj = copy.Copy(bug_handler=self.__bug_handler)
|
|
if self.__copy:
|
|
copy_obj.copy_file(self.__write_to, "fields_large.data")
|
|
copy_obj.rename(self.__write_to, self.__file)
|
|
os.remove(self.__write_to)
|