Initial import

2026-04-01 18:03:35 +02:00 · 2020-03-31 17:15:23 +02:00
commit d97ea9b0bc
311 changed files with 131419 additions and 0 deletions
--- a/ebook_converter/ebooks/rtf2xml/make_lists.py
+++ b/ebook_converter/ebooks/rtf2xml/make_lists.py
@@ -0,0 +1,465 @@
+from __future__ import unicode_literals, absolute_import, print_function, division
+#########################################################################
+#                                                                       #
+#                                                                       #
+#   copyright 2002 Paul Henry Tremblay                                  #
+#                                                                       #
+#   This program is distributed in the hope that it will be useful,     #
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of      #
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU    #
+#   General Public License for more details.                            #
+#                                                                       #
+#                                                                       #
+#########################################################################
+import sys, os, re
+
+from calibre.ebooks.rtf2xml import copy
+from calibre.ptempfile import better_mktemp
+from polyglot.builtins import unicode_type
+
+from . import open_for_read, open_for_write
+
+
+class MakeLists:
+    """
+    Form lists.
+    Use RTF's own formatting to determine if a paragraph definition is part of a
+    list.
+    Use indents to determine items and how lists are nested.
+    """
+
+    def __init__(self,
+            in_file,
+            bug_handler,
+            headings_to_sections,
+            list_of_lists,
+            copy=None,
+            run_level=1,
+            no_headings_as_list=1,
+            write_list_info=0,
+            ):
+        """
+        Required:
+            'file'
+        Optional:
+            'copy'-- whether to make a copy of result for debugging
+            'temp_dir' --where to output temporary results (default is
+            directory from which the script is run.)
+        Returns:
+            nothing
+            """
+        self.__file = in_file
+        self.__bug_handler = bug_handler
+        self.__run_level = run_level
+        self.__no_headings_as_list = no_headings_as_list
+        self.__headings_to_sections = headings_to_sections
+        self.__copy = copy
+        self.__write_to = better_mktemp()
+        self.__list_of_lists = list_of_lists
+        self.__write_list_info = write_list_info
+
+    def __initiate_values(self):
+        """
+        Required:
+            Nothing
+        Return:
+            Nothing
+        Logic:
+            The self.__end_list is a list of tokens that will force a list to end.
+            Likewise, the self.__end_lines is a list of lines that forces a list to end.
+        """
+        self.__state = "default"
+        self.__left_indent = 0
+        self.__list_type = 'not-defined'
+        self.__pard_def = ""
+        self.__all_lists = []
+        self.__level = 0
+        self.__list_chunk = ''
+        self.__state_dict={
+        'default'           :   self.__default_func,
+        'in_pard'           :   self.__in_pard_func,
+        'after_pard'        :   self.__after_pard_func,
+        }
+        self.__headings = [
+        'heading 1', 'heading 2', 'heading 3', 'heading 4',
+        'heading 5', 'heading 6', 'heading 7', 'heading 8',
+        'heading 9'
+        ]
+        self.__allow_levels = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
+        self.__style_name = ''
+        self.__end_list = [
+        'mi<mk<body-close',
+        'mi<mk<par-in-fld',
+        'cw<tb<cell______',
+        'cw<tb<row-def___',
+        'cw<tb<row_______',
+        'mi<mk<sect-close',
+        'mi<mk<sect-start',
+        'mi<mk<header-beg',
+        'mi<mk<header-end',
+        'mi<mk<head___clo',
+        'mi<mk<fldbk-end_',
+        'mi<mk<close_cell',
+        'mi<mk<footnt-ope',
+        'mi<mk<foot___clo',
+        'mi<mk<tabl-start',
+        # 'mi<mk<sec-fd-beg',
+        ]
+        self.__end_lines = [
+            'mi<tg<close_____<cell\n',
+        ]
+        self.__id_regex = re.compile(r'\<list-id\>(\d+)')
+        self.__lv_regex = re.compile(r'\<list-level\>(\d+)')
+        self.__found_appt = 0
+        self.__line_num = 0
+
+    def __in_pard_func(self, line):
+        """
+        Required:
+            line -- the line of current text.
+        Return:
+            Nothing
+        Logic:
+            You are in a list, but in the middle of a paragraph definition.
+            Don't do anything until you find the end of the paragraph definition.
+        """
+        if self.__token_info == 'mi<mk<pard-end__':
+            self.__state = 'after_pard'
+        self.__write_obj.write(line)
+
+    def __after_pard_func(self, line):
+        """
+        Required:
+            line -- the line of current text.
+        Return:
+            Nothing
+        Logic:
+            You are in a list, but after a paragraph definition. You have to
+            determine if the last pargraph definition ends a list, continues
+            the old one, or starts a new one.
+            Otherwise, look for a paragraph definition. If one is found, determine if
+            the paragraph definition contains a list-id. If it does, use the method
+            self.__list_after_par_def to determine the action.
+            If the paragraph definition does not contain a list-id, use the method
+            close_lists to close out items and lists for a paragraph that is not
+            If a bigger block is found (such as a section or a cell), end all lists.
+            indented.
+            If no special line is found, add each line to a buffer.
+        """
+        if self.__token_info == 'mi<tg<open-att__' and line[17:37] == 'paragraph-definition':
+            is_heading = self.__is_a_heading()
+            # found paragraph definition and not heading 1
+            search_obj = re.search(self.__id_regex, line)
+            if search_obj and not is_heading:  # found list-id
+                search_obj_lv = re.search(self.__lv_regex, line)
+                if search_obj_lv:
+                    self.__level = search_obj_lv.group(1)
+                num = search_obj.group(1)
+                self.__list_after_par_def_func(line, num)
+                self.__write_obj.write(line)
+                self.__state = 'in_pard'
+            # heading 1
+            elif is_heading:
+                self.__left_indent = -1000
+                self.__close_lists()
+                self.__write_obj.write(self.__list_chunk)
+                self.__list_chunk = ''
+                self.__state = 'default'
+                self.__write_obj.write(line)
+            # Normal with no list id
+            else:
+                self.__close_lists()
+                self.__write_obj.write(self.__list_chunk)
+                self.__list_chunk = ''
+                self.__write_obj.write(line)
+                if len(self.__all_lists) == 0:
+                    self.__state= 'default'
+                else:
+                    self.__state = 'in_pard'
+        # section to end lists
+        elif self.__token_info in self.__end_list :
+            self.__left_indent = -1000
+            self.__close_lists()
+            self.__write_obj.write(self.__list_chunk)
+            self.__list_chunk = ''
+            self.__state = 'default'
+            self.__write_obj.write(line)
+        else:
+            self.__list_chunk += line
+
+    def __list_after_par_def_func(self, line, id):
+        """
+        Required:
+            line -- the line of current text.
+            id -- the id of the current list
+        Return:
+            Nothing
+        Logic:
+            You have found the end of a paragraph definition, and have found
+            another paragraph definition with a list id.
+            If the list-id is different from the last paragraph definition,
+            write the string in the buffer. Close out the lists with another
+            method and start a new list.
+            If the list id is the same as the last one, check the indent on the
+            current paragraph definition. If it is greater than the previous one,
+            do not end the current list or item. Start a new list.
+        """
+        last_list_id = self.__all_lists[-1]['id']
+        if id != last_list_id:
+            self.__close_lists()
+            self.__write_obj.write(self.__list_chunk)
+            self.__write_start_list(id)
+            self.__list_chunk = ''
+        else:
+            last_list_indent = self.__all_lists[-1]['left-indent']
+            if self.__left_indent > last_list_indent:
+                self.__write_obj.write(self.__list_chunk)
+                self.__write_start_list(id)
+            else:
+                self.__write_end_item()
+                self.__write_obj.write(self.__list_chunk)
+                self.__write_start_item()
+            self.__list_chunk = ''
+
+    def __close_lists(self):
+        """
+        Required:
+            Nothing
+        Return:
+            Nothing
+        Logic:
+            Reverse the list of dictionaries. Iterate through the list and
+            get the indent for each list. If the current indent is less than
+            or equal to the indent in the dictionary, close that level.
+            Keep track of how many levels you close. Reduce the list by that
+            many levels.
+            Reverse the list again.
+        """
+        if self.__line_num < 25 and self.__found_appt:
+            sys.stderr.write('in closing out lists\n')
+            sys.stderr.write('current_indent is "%s"\n' % self.__left_indent)
+        current_indent = self.__left_indent
+        self.__all_lists.reverse()
+        num_levels_closed = 0
+        for the_dict in self.__all_lists:
+            list_indent = the_dict.get('left-indent')
+            if self.__line_num < 25 and self.__found_appt:
+                sys.stderr.write('last indent is "%s"' % list_indent)
+            if current_indent <= list_indent:
+                self.__write_end_item()
+                self.__write_end_list()
+                num_levels_closed += 1
+        self.__all_lists = self.__all_lists[num_levels_closed:]
+        self.__all_lists.reverse()
+
+    def __write_end_list(self):
+        """
+        Required:
+            Nothing
+        Return:
+            Nothing
+        Logic:
+            Write the end of a list.
+        """
+        self.__write_obj.write('mi<tg<close_____<list\n')
+        self.__write_obj.write('mi<mk<list_close\n')
+
+    def __write_start_list(self, id):
+        """
+        Required:
+            id -- the id of the current list.
+        Return:
+            Nothing
+        Logic:
+            Write the start of a list and add the id and left-indent to the
+            self.__all_lists list.
+            Write cues of when a list starts for later processing.
+            In order to determine the type of list, you have to iterate through
+            the self.__list_of lists. This list looks like:
+                [[{list-id: [1, 2], [{}], [{}]] [{list-id: [3, 4], [{}]]]
+            I need to get the inside lists of the main lists. Then I need to get
+            the first item of what I just got. This is a dictionary. Get the list-id.
+            This is  a list. Check to see if the current id is in this list. If
+            so, then get the list-type from the dictionary.
+        """
+        the_dict = {}
+        the_dict['left-indent'] = self.__left_indent
+        the_dict['id'] = id
+        self.__all_lists.append(the_dict)
+        self.__write_obj.write(
+            'mi<mk<list_start\n'
+                )
+        # bogus levels are sometimes written for empty paragraphs
+        if unicode_type(self.__level) not in self.__allow_levels:
+            lev_num = '0'
+        else:
+            lev_num = self.__level
+        self.__write_obj.write(
+            'mi<tg<open-att__<list<list-id>%s<level>%s'
+            % (id, lev_num)
+                )
+        list_dict = {}
+        if self.__list_of_lists:  # older RTF won't generate a list_of_lists
+            index_of_list = self.__get_index_of_list(id)
+            if index_of_list is not None:  # found a matching id
+                curlist = self.__list_of_lists[index_of_list]
+                list_dict = curlist[0]
+                level = int(self.__level) + 1
+                if level >= len(curlist):
+                    level = len(curlist) - 1
+                level_dict = curlist[level][0]
+                list_type = level_dict.get('numbering-type')
+                if list_type == 'bullet':
+                    list_type = 'unordered'
+                else:
+                    list_type = 'ordered'
+                self.__write_obj.write(
+                    '<list-type>%s' % (list_type))
+            else:  # no matching id
+                self.__write_obj.write(
+                    '<list-type>%s' % (self.__list_type))
+        else:  # older RTF
+            self.__write_obj.write(
+                '<list-type>%s' % (self.__list_type))
+        # if you want to dump all the info to the list, rather than
+        # keeping it in the table above, change self.__write_list_info
+        # to true.
+        if self.__list_of_lists and self.__write_list_info and list_dict:
+            not_allow = ['list-id',]
+            the_keys_list = list_dict.keys()
+            for the_key in the_keys_list:
+                if the_key in not_allow:
+                    continue
+                self.__write_obj.write('<%s>%s' % (the_key, list_dict[the_key]))
+            the_keys_level = level_dict.keys()
+            for the_key in the_keys_level:
+                self.__write_obj.write('<%s>%s' % (the_key, level_dict[the_key]))
+        self.__write_obj.write('\n')
+        self.__write_obj.write(
+            'mi<mk<liststart_\n'
+                )
+        self.__write_start_item()
+
+    def __get_index_of_list(self, id):
+        """
+        Requires:
+            id -- id of current paragraph-definition
+        Returns:
+            an index of where the id occurs in list_of_lists, the
+            dictionary passed to this module.
+        Logic:
+            Iterate through the big lists, the one passed to this module and
+            get the first item, the dictionary. Use a counter to keep
+            track of how many times you iterate with the counter.
+            Once you find a match, return the counter.
+            If no match is found, print out an error message.
+        """
+        # some RTF use 0 indexed list. Don't know what to do?
+        if id == '0':
+            return
+        the_index = 0
+        for list in self.__list_of_lists:
+            the_dict = list[0]
+            id_in_list = the_dict.get('list-id')
+            if id in id_in_list:
+                return the_index
+            the_index += 1
+        if self.__run_level > 0:
+            sys.stderr.write('Module is make_lists.py\n'
+                'Method is __get_index_of_list\n'
+                'The main list does not appear to have a matching id for %s \n'
+                % (id)
+                )
+            # sys.stderr.write(repr(self.__list_of_lists))
+#        if self.__run_level > 3:
+#            msg = 'level is "%s"\n' % self.__run_level
+#            self.__bug_handler
+
+    def __write_start_item(self):
+        self.__write_obj.write('mi<mk<item_start\n')
+        self.__write_obj.write('mi<tg<open______<item\n')
+        self.__write_obj.write('mi<mk<itemstart_\n')
+
+    def __write_end_item(self):
+        self.__write_obj.write('mi<tg<item_end__\n')
+        self.__write_obj.write('mi<tg<close_____<item\n')
+        self.__write_obj.write('mi<tg<item__end_\n')
+
+    def __default_func(self, line):
+        """
+        Required:
+            self, line
+        Returns:
+            Nothing
+        Logic
+            Look for the start of a paragraph defintion. If one is found, check if
+            it contains a list-id. If it does, start a list. Change the state to
+            in_pard.
+            """
+        if self.__token_info == 'mi<tg<open-att__' and line[17:37] == 'paragraph-definition':
+            is_a_heading = self.__is_a_heading()
+            if not is_a_heading:
+                search_obj = re.search(self.__id_regex, line)
+                if search_obj:
+                    num = search_obj.group(1)
+                    self.__state = 'in_pard'
+                    search_obj_lv = re.search(self.__lv_regex, line)
+                    if search_obj_lv:
+                        self.__level = search_obj_lv.group(1)
+                    self.__write_start_list(num)
+        self.__write_obj.write(line)
+
+    def __is_a_heading(self):
+        if self.__style_name in self.__headings:
+            if self.__headings_to_sections:
+                return 1
+            else:
+                if self.__no_headings_as_list:
+                    return 1
+                else:
+                    return 0
+        else:
+            return 0
+
+    def __get_indent(self, line):
+        if self.__token_info == 'mi<mk<left_inden':
+            self.__left_indent = float(line[17:-1])
+
+    def __get_list_type(self, line):
+        if self.__token_info == 'mi<mk<list-type_':  # <ordered
+            self.__list_type = line[17:-1]
+            if self.__list_type == 'item':
+                self.__list_type = "unordered"
+
+    def __get_style_name(self, line):
+        if self.__token_info == 'mi<mk<style-name':
+            self.__style_name = line[17:-1]
+
+    def make_lists(self):
+        """
+        Required:
+            nothing
+        Returns:
+            original file will be changed
+        Logic:
+        """
+        self.__initiate_values()
+        read_obj = open_for_read(self.__file)
+        self.__write_obj = open_for_write(self.__write_to)
+        line_to_read = 1
+        while line_to_read:
+            line_to_read = read_obj.readline()
+            line = line_to_read
+            self.__token_info = line[:16]
+            self.__get_indent(line)
+            self.__get_list_type(line)
+            self.__get_style_name(line)
+            action = self.__state_dict.get(self.__state)
+            action(line)
+        read_obj.close()
+        self.__write_obj.close()
+        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
+        if self.__copy:
+            copy_obj.copy_file(self.__write_to, "make_lists.data")
+        copy_obj.rename(self.__write_to, self.__file)
+        os.remove(self.__write_to)