Initial import

2026-03-30 17:03:31 +02:00 · 2020-03-31 17:15:23 +02:00
commit d97ea9b0bc
311 changed files with 131419 additions and 0 deletions
--- a/ebook_converter/ebooks/rtf2xml/sections.py
+++ b/ebook_converter/ebooks/rtf2xml/sections.py
@@ -0,0 +1,538 @@
+from __future__ import absolute_import, division, print_function, unicode_literals
+#########################################################################
+#                                                                       #
+#                                                                       #
+#   copyright 2002 Paul Henry Tremblay                                  #
+#                                                                       #
+#   This program is distributed in the hope that it will be useful,     #
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of      #
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU    #
+#   General Public License for more details.                            #
+#                                                                       #
+#                                                                       #
+#########################################################################
+import sys, os
+
+from calibre.ebooks.rtf2xml import copy
+from calibre.ptempfile import better_mktemp
+from polyglot.builtins import unicode_type
+
+from . import open_for_read, open_for_write
+
+
+class Sections:
+    """
+    =================
+    Purpose
+    =================
+    Write section tags for a tokenized file. (This module won't be of any use
+    to you unless you use it as part of the other modules.)
+    ---------------
+    logic
+    ---------------
+    The tags for the first section breaks have already been written.
+    RTF stores section breaks with the \\sect tag. Each time this tag is
+    encountered, add one to the counter.
+    When I encounter the \\sectd tag, I want to collect all the appropriate tokens
+    that describe the section. When I reach a \\pard, I know I can stop collecting
+    tokens and write the section tags.
+    The exception to this method occurs when sections occur in field blocks, such
+    as the index. Normally, two section breaks occur within the index and other
+    field-blocks. (If less or more section breaks occur, this code may not work.)
+    I want the sections to occur outside of the index. That is, the index
+    should be nested inside one section tag. After the index is complete, a new
+    section should begin.
+    In order to write the sections outside of the field blocks, I have to store
+    all of the field block as a string. When I encounter the \\sect tag, add one to
+    the section counter, but store this number in a list. Likewise, store the
+    information describing the section in another list.
+    When I reach the end of the field block, choose the first item from the
+    numbered list as the section number. Choose the first item in the description
+    list as the values and attributes of the section. Enclose the field string
+    between the section tags.
+    Start a new section outside the field-block strings. Use the second number in
+    the list; use the second item in the description list.
+    CHANGE (2004-04-26) No longer write sections that occur in field-blocks.
+    Instead, ignore all section information in a field-block.
+    """
+
+    def __init__(self,
+            in_file,
+            bug_handler,
+            copy=None,
+            run_level=1):
+        """
+        Required:
+            'file'--file to parse
+        Optional:
+            'copy'-- whether to make a copy of result for debugging
+            'temp_dir' --where to output temporary results (default is
+            directory from which the script is run.)
+        Returns:
+            nothing
+            """
+        self.__file = in_file
+        self.__bug_handler = bug_handler
+        self.__copy = copy
+        self.__run_level = run_level
+        self.__write_to = better_mktemp()
+
+    def __initiate_values(self):
+        """
+        Initiate all values.
+        """
+        self.__mark_start = 'mi<mk<sect-start\n'
+        self.__mark_end =   'mi<mk<sect-end__\n'
+        self.__in_field = 0
+        self.__section_values = {}
+        self.__list_of_sec_values = []
+        self.__field_num = []
+        self.__section_num = 0
+        self.__state = 'before_body'
+        self.__found_first_sec = 0
+        self.__text_string = ''
+        self.__field_instruction_string = ''
+        self.__state_dict = {
+        'before_body'       : self.__before_body_func,
+        'body'              : self.__body_func,
+        'before_first_sec'  : self.__before_first_sec_func,
+        'section'           : self.__section_func,
+        'section_def'       : self.__section_def_func,
+        'sec_in_field'      : self.__sec_in_field_func,
+        }
+        # cw<sc<sect-defin<nu<true
+        self.__body_dict = {
+        'cw<sc<section___'      : self.__found_section_func,
+        'mi<mk<sec-fd-beg'      : self.__found_sec_in_field_func,
+        'cw<sc<sect-defin'      : self.__found_section_def_bef_sec_func,
+        }
+        self.__section_def_dict = {
+        'cw<pf<par-def___'      : (self.__end_sec_def_func, None),
+        'mi<mk<body-open_'      : (self.__end_sec_def_func, None),
+        'cw<tb<columns___'      : (self.__attribute_func, 'columns'),
+        'cw<pa<margin-lef'      : (self.__attribute_func, 'margin-left'),
+        'cw<pa<margin-rig'      : (self.__attribute_func, 'margin-right'),
+        'mi<mk<header-ind'      : (self.__end_sec_def_func, None),
+        # premature endings
+        # __end_sec_premature_func
+        'tx<nu<__________'      : (self.__end_sec_premature_func, None),
+        'cw<ci<font-style'      : (self.__end_sec_premature_func, None),
+        'cw<ci<font-size_'      : (self.__end_sec_premature_func, None),
+        }
+        self.__sec_in_field_dict = {
+        'mi<mk<sec-fd-end'      : self.__end_sec_in_field_func,
+        # changed this 2004-04-26
+        # two lines
+        # 'cw<sc<section___'      : self.__found_section_in_field_func,
+        # 'cw<sc<sect-defin'      : self.__found_section_def_in_field_func,
+        }
+
+    def __found_section_def_func(self, line):
+        """
+        Required:
+            line -- the line to parse
+        Returns:
+            nothing
+        Logic:
+            I have found a section definition. Change the state to
+            setion_def (so subsequent lines will be processesed as part of
+            the section definition), and clear the section_values dictionary.
+        """
+        self.__state = 'section_def'
+        self.__section_values.clear()
+
+    def __attribute_func(self, line, name):
+        """
+        Required:
+            line -- the line to be parsed
+            name -- the changed, readable name (as opposed to the
+            abbreviated one)
+        Returns:
+            nothing
+        Logic:
+            I need to add the right data to the section values dictionary so I
+            can retrive it later. The attribute (or key) is the name; the
+            value is the last part of the text string.
+            ex: cw<tb<columns___<nu<2
+        """
+        attribute = name
+        value = line[20:-1]
+        self.__section_values[attribute] = value
+
+    def __found_section_func(self, line):
+        """
+        Requires:
+            line -- the line to parse
+        Returns:
+            nothing
+        Logic:
+            I have found the beginning of a section, so change the state
+            accordingly. Also add one to the section counter.
+        """
+        self.__state = 'section'
+        self.__write_obj.write(line)
+        self.__section_num += 1
+
+    def __found_section_def_bef_sec_func(self, line):
+        """
+        Requires:
+            line -- the line to parse
+        Returns:
+            nothing
+        Logic:
+            I have found the beginning of a section, so change the state
+            accordingly. Also add one to the section counter.
+        """
+        self.__section_num += 1
+        self.__found_section_def_func(line)
+        self.__write_obj.write(line)
+
+    def __section_func(self, line):
+        """
+        Requires:
+            line --the line to parse
+        Returns:
+            nothing
+        Logic:
+        """
+        if self.__token_info == 'cw<sc<sect-defin':
+            self.__found_section_def_func(line)
+        self.__write_obj.write(line)
+
+    def __section_def_func(self, line):
+        """
+        Required:
+            line --line to parse
+        Returns:
+            nothing
+        Logic:
+            I have found a section definition. Check if the line is the end of
+            the defnition (a paragraph defintion), or if it contains info that
+            should be added to the values dictionary. If neither of these
+            cases are true, output the line to a file.
+        """
+        action, name = self.__section_def_dict.get(self.__token_info, (None, None))
+        if action:
+            action(line, name)
+            if self.__in_field:
+                self.__sec_in_field_string += line
+            else:
+                self.__write_obj.write(line)
+        else:
+            self.__write_obj.write(line)
+
+    def __end_sec_def_func(self, line, name):
+        """
+        Requires:
+            line --the line to parse
+            name --changed, readable name
+        Returns:
+            nothing
+        Logic:
+            The end of the section definition has been found. Reset the state.
+            Call on the write_section method.
+        """
+        if not self.__in_field:
+            self.__state = 'body'
+        else:
+            self.__state = 'sec_in_field'
+        self.__write_section(line)
+
+    def __end_sec_premature_func(self, line, name):
+        """
+        Requires:
+            line --the line to parse
+            name --changed, readable name
+        Returns:
+            nothing
+        Logic:
+            Text or control words indicating text have been found
+            before \\pard. This shoud indicate older RTF. Reset the state
+            Write the section defintion. Insert a paragraph definition.
+            Insert {} to mark the end of a paragraph defintion
+        """
+        if not self.__in_field:
+            self.__state = 'body'
+        else:
+            self.__state = 'sec_in_field'
+        self.__write_section(line)
+        self.__write_obj.write('cw<pf<par-def___<nu<true\n')
+        self.__write_obj.write('ob<nu<open-brack<0000\n')
+        self.__write_obj.write('cb<nu<clos-brack<0000\n')
+
+    def __write_section(self, line):
+        """
+        Requires:
+            nothing
+        Returns:
+            nothing
+        Logic:
+            Form a string of attributes and values. If you are not in a field
+            block, write this string to the output file. Otherwise, call on
+            the handle_sec_def method to handle this string.
+        """
+        my_string = self.__mark_start
+        if self.__found_first_sec:
+            my_string += 'mi<tg<close_____<section\n'
+        else:
+            self.__found_first_sec = 1
+        my_string += 'mi<tg<open-att__<section<num>%s' % unicode_type(self.__section_num)
+        my_string += '<num-in-level>%s' % unicode_type(self.__section_num)
+        my_string += '<type>rtf-native'
+        my_string += '<level>0'
+        keys = self.__section_values.keys()
+        if len(keys) > 0:
+            for key in keys:
+                my_string += '<%s>%s' % (key, self.__section_values[key])
+        my_string += '\n'
+        my_string += self.__mark_end
+        # # my_string += line
+        if self.__state == 'body':
+            self.__write_obj.write(my_string)
+        elif self.__state == 'sec_in_field':
+            self.__handle_sec_def(my_string)
+        elif self.__run_level > 3:
+            msg = 'missed a flag\n'
+            raise self.__bug_handler(msg)
+
+    def __handle_sec_def(self, my_string):
+        """
+        Requires:
+            my_string -- the string of attributes and values. (Do I need this?)
+        Returns:
+            nothing
+        Logic:
+            I need to append the dictionary of attributes and values to list
+            so I can use it later when I reach the end of the field-block.
+        """
+        values_dict = self.__section_values
+        self.__list_of_sec_values.append(values_dict)
+
+    def __body_func(self, line):
+        """
+        Requires:
+            line --the line to parse
+        Returns:
+            nothing
+        Logic:
+            Look for the beginning of a section. Otherwise, print the line to
+            the output file.
+        """
+        action = self.__body_dict.get(self.__token_info)
+        if action:
+            action(line)
+        else:
+            self.__write_obj.write(line)
+
+    def __before_body_func(self, line):
+        """
+        Requires:
+            line --line to parse
+        Returns:
+            nothing
+        Logic:
+            Look for the beginning of the body. Always print out the line.
+        """
+        if self.__token_info == 'mi<mk<body-open_':
+            self.__state = 'before_first_sec'
+        self.__write_obj.write(line)
+
+    def __before_first_sec_func(self, line):
+        """
+        Requires:
+            line -- line to parse
+        Returns:
+            nothing
+        Logic:
+            Look for the beginning of the first section. This can be \\sectd,
+            but in older RTF it could mean the any paragraph or row definition
+        """
+        if self.__token_info == 'cw<sc<sect-defin':
+            self.__state = 'section_def'
+            self.__section_num += 1
+            self.__section_values.clear()
+        elif self.__token_info == 'cw<pf<par-def___':
+            self.__state = 'body'
+            self.__section_num += 1
+            self.__write_obj.write(
+                    'mi<tg<open-att__<section<num>%s'
+                    '<num-in-level>%s'
+                    '<type>rtf-native'
+                    '<level>0\n'
+                    % (unicode_type(self.__section_num), unicode_type(self.__section_num))
+                    )
+            self.__found_first_sec = 1
+        elif self.__token_info == 'tx<nu<__________':
+            self.__state = 'body'
+            self.__section_num += 1
+            self.__write_obj.write(
+                    'mi<tg<open-att__<section<num>%s'
+                    '<num-in-level>%s'
+                    '<type>rtf-native'
+                    '<level>0\n'
+                    % (unicode_type(self.__section_num), unicode_type(self.__section_num))
+                    )
+            self.__write_obj.write(
+                'cw<pf<par-def___<true\n'
+                    )
+            self.__found_first_sec = 1
+        self.__write_obj.write(line)
+
+    def __found_sec_in_field_func(self, line):
+        """
+        Requires:
+            line --line to parse
+        Returns:
+            nothing
+        Logic:
+            I have found the beginning of a field that has a section (or
+            really, two) inside of it. Change the state, and start adding to
+            one long string.
+        """
+        self.__state = 'sec_in_field'
+        self.__sec_in_field_string = line
+        self.__in_field = 1
+
+    def __sec_in_field_func(self, line):
+        """
+        Requires:
+            line --the line to parse
+        Returns:
+            nothing
+        Logic:
+            Check for the end of the field, or the beginning of a section
+            definition.
+            CHANGED! Just print out each line. Ignore any sections or
+            section definition info.
+        """
+        action = self.__sec_in_field_dict.get(self.__token_info)
+        if action:
+            action(line)
+        else:
+            # change this 2004-04-26
+            # self.__sec_in_field_string += line
+            self.__write_obj.write(line)
+
+    def __end_sec_in_field_func(self, line):
+        """
+        Requires:
+            line --line to parse
+        Returns:
+            nothing
+        Logic:
+            Add the last line to the field string. Call on the method
+            print_field_sec_attributes to write the close and beginning of a
+            section tag. Print out the field string. Call on the same method
+            to again write the close and beginning of a section tag.
+            Change the state.
+        """
+        # change this 2004-04-26
+        # Don't do anyting
+        """
+        self.__sec_in_field_string += line
+        self.__print_field_sec_attributes()
+        self.__write_obj.write(self.__sec_in_field_string)
+        self.__print_field_sec_attributes()
+        """
+        self.__state = 'body'
+        self.__in_field = 0
+        # this is changed too
+        self.__write_obj.write(line)
+
+    def __print_field_sec_attributes(self):
+        """
+        Requires:
+            nothing
+        Returns:
+            nothing
+        Logic:
+            Get the number and dictionary of values from the lists. The number
+            and dictionary will be the first item of each list. Write the
+            close tag. Write the start tag. Write the attribute and values in
+            the dictionary. Get rid of the first item in each list.
+        keys = self.__section_values.keys()
+        if len(keys) > 0:
+            my_string += 'mi<tg<open-att__<section-definition'
+            for key in keys:
+                my_string += '<%s>%s' % (key, self.__section_values[key])
+            my_string += '\n'
+        else:
+            my_string += 'mi<tg<open______<section-definition\n'
+        """
+        num = self.__field_num[0]
+        self.__field_num = self.__field_num[1:]
+        self.__write_obj.write(
+        'mi<tg<close_____<section\n'
+        'mi<tg<open-att__<section<num>%s' % unicode_type(num)
+        )
+        if self.__list_of_sec_values:
+            keys =  self.__list_of_sec_values[0].keys()
+            for key in keys:
+                self.__write_obj.write(
+                '<%s>%s\n' % (key, self.__list_of_sec_values[0][key]))
+            self.__list_of_sec_values = self.__list_of_sec_values[1:]
+        self.__write_obj.write('<level>0')
+        self.__write_obj.write('<type>rtf-native')
+        self.__write_obj.write('<num-in-level>%s' % unicode_type(self.__section_num))
+        self.__write_obj.write('\n')
+        # Look here
+
+    def __found_section_in_field_func(self, line):
+        """
+        Requires:
+            line --line to parse
+        Returns:
+            nothing
+        Logic:
+            I have found a section in a field block. Add one to section
+            counter, and append this number to a list.
+        """
+        self.__section_num += 1
+        self.__field_num.append(self.__section_num)
+        self.__sec_in_field_string += line
+
+    def __found_section_def_in_field_func(self, line):
+        """
+        Requires:
+            line --line to parse
+        Returns:
+            nothing
+        Logic:
+            I have found a section definition in a filed block. Change the
+            state and clear the values dictionary.
+        """
+        self.__state = 'section_def'
+        self.__section_values.clear()
+
+    def make_sections(self):
+        """
+        Requires:
+            nothing
+        Returns:
+            nothing (changes the original file)
+        Logic:
+            Read one line in at a time. Determine what action to take based on
+            the state. If the state is before the body, look for the
+            beginning of the body.
+            If the state is body, send the line to the body method.
+        """
+        self.__initiate_values()
+        read_obj = open_for_read(self.__file)
+        self.__write_obj = open_for_write(self.__write_to)
+        line_to_read = 1
+        while line_to_read:
+            line_to_read = read_obj.readline()
+            line = line_to_read
+            self.__token_info = line[:16]
+            action = self.__state_dict.get(self.__state)
+            if action is None:
+                sys.stderr.write('no matching state in module sections.py\n')
+                sys.stderr.write(self.__state + '\n')
+            action(line)
+        read_obj.close()
+        self.__write_obj.close()
+        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
+        if self.__copy:
+            copy_obj.copy_file(self.__write_to, "sections.data")
+        copy_obj.rename(self.__write_to, self.__file)
+        os.remove(self.__write_to)