Initial import

2026-03-01 06:05:55 +01:00 · 2020-03-31 17:15:23 +02:00
commit d97ea9b0bc
311 changed files with 131419 additions and 0 deletions
--- a/ebook_converter/ebooks/rtf2xml/fields_small.py
+++ b/ebook_converter/ebooks/rtf2xml/fields_small.py
@@ -0,0 +1,460 @@
+from __future__ import unicode_literals, absolute_import, print_function, division
+#########################################################################
+#                                                                       #
+#                                                                       #
+#   copyright 2002 Paul Henry Tremblay                                  #
+#                                                                       #
+#   This program is distributed in the hope that it will be useful,     #
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of      #
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU    #
+#   General Public License for more details.                            #
+#                                                                       #
+#                                                                       #
+#########################################################################
+import sys, os, re
+
+from calibre.ebooks.rtf2xml import field_strings, copy
+from calibre.ptempfile import better_mktemp
+from . import open_for_read, open_for_write
+
+
+class FieldsSmall:
+    """
+=================
+Purpose
+=================
+Write tags for bookmarks, index and toc entry fields in a tokenized file.
+This module does not handle toc or index tables.  (This module won't be any
+use to you unless you use it as part of the other modules.)
+-----------
+Method
+-----------
+Look for the beginning of a bookmark, index, or toc entry. When such a token
+is found, store the opening bracket count in a variable. Collect all the text
+until the closing bracket entry is found. Send the string to the module
+field_strings to process it. Write the processed string to the output
+file.
+    """
+
+    def __init__(self, in_file, bug_handler, copy=None, run_level=1):
+        """
+        Store the settings for one run of fix_fields.
+        Requires:
+            in_file --the file that fix_fields reads and then replaces
+            bug_handler --passed on to the helper objects this class creates
+        Optional:
+            copy --when true, fix_fields also saves its output under the
+            name "fields_small.data" for debugging
+            run_level --kept on the instance; nothing else in this class
+            reads it
+        Returns:
+            nothing
+        """
+        self.__run_level = run_level
+        self.__copy = copy
+        self.__bug_handler = bug_handler
+        self.__file = in_file
+        # Output is written to this location first; fix_fields later hands
+        # it to copy.Copy.rename to replace in_file.
+        self.__write_to = better_mktemp()
+
+    def __initiate_values(self):
+        """
+        Set every per-run value back to its starting point.
+        Logic:
+            Create the FieldStrings helper, start in the 'before_body'
+            state with an empty text buffer, and build the two lookup
+            tables: state name -> handler, and body token -> (handler, tag).
+            A pattern matching a complete bookmark-start group is compiled
+            as well.
+        """
+        self.__string_obj = field_strings.FieldStrings(bug_handler=self.__bug_handler)
+        self.__state = 'before_body'
+        self.__text_string = ''
+        # Written in front of every field line this module produces.
+        self.__marker = 'mi<mk<inline-fld\n'
+        self.__state_dict = {
+            'before_body': self.__before_body_func,
+            'body': self.__body_func,
+            'bookmark': self.__bookmark_func,
+            'toc_index': self.__toc_index_func,
+        }
+        found_bookmark = self.__found_bookmark_func
+        found_toc_index = self.__found_toc_index_func
+        self.__body_dict = {
+            'cw<an<book-mk-st': (found_bookmark, 'start'),
+            'cw<an<book-mk-en': (found_bookmark, 'end'),
+            'cw<an<toc_______': (found_toc_index, 'toc'),
+            'cw<an<index-mark': (found_toc_index, 'index'),
+        }
+        # open bracket, bookmark-start control word, text, close bracket
+        pattern = ''.join((
+            'ob<nu<open-brack.....',
+            'cw<an<book-mk-st<nu<true',
+            'tx<nu<__________<(.*?)',
+            'cb<nu<clos-brack',
+        ))
+        self.__book_start = re.compile(pattern)
+
+    def __before_body_func(self, line):
+        """
+        Requires:
+            line --the line to parse
+        Returns:
+            nothing
+        Logic:
+            Copy every line to the output unchanged. When the current
+            token is the body-open marker, switch to the 'body' state so
+            that the following lines are handled by the body function.
+        """
+        reached_body = self.__token_info == 'mi<mk<body-open_'
+        if reached_body:
+            self.__state = 'body'
+        self.__write_obj.write(line)
+
+    def __body_func(self, line):
+        """
+        Requires:
+            line --the line to parse
+        Returns:
+            nothing
+        Logic:
+            Handle one line of the document body. If the current token
+            starts a bookmark, index entry or toc entry, call the handler
+            registered for it together with its tag. Any other line is
+            copied to the output unchanged.
+        """
+        handler, tag = self.__body_dict.get(self.__token_info, (None, None))
+        if not handler:
+            self.__write_obj.write(line)
+            return
+        handler(line, tag)
+
+    def __found_bookmark_func(self, line, tag):
+        """
+        Requires:
+            line --the line to parse (not written out by this function)
+            tag --'start' or 'end', the kind of bookmark that was found
+        Returns:
+            nothing
+        Logic:
+            Called when a bookmark control word is found. Remember the
+            most recent opening bracket count as the beginning bracket
+            count, reset the closing bracket count, remember the kind of
+            bookmark, and change the state to 'bookmark'.
+        """
+        opening_count = self.__ob_count
+        self.__beg_bracket_count = opening_count
+        self.__cb_count = 0
+        self.__state, self.__type_of_bookmark = 'bookmark', tag
+
+    def __bookmark_func(self, line):
+        """
+        Requires:
+            line --the line to parse
+        Returns:
+            nothing
+        Logic:
+            Handle one line inside a bookmark. Text lines are appended to
+            the text buffer; every other line is dropped. When the closing
+            bracket count equals the count stored when the bookmark was
+            found, the bookmark is complete: return to the 'body' state,
+            write the marker line, the formatted bookmark field and the
+            closing line, and empty the buffer.
+        """
+        if self.__beg_bracket_count == self.__cb_count:
+            self.__state = 'body'
+            field_type = 'bookmark-%s' % self.__type_of_bookmark
+            field_line = self.__parse_bookmark_func(
+                self.__text_string, field_type)
+            self.__write_obj.write(self.__marker)
+            self.__write_obj.write(field_line)
+            self.__text_string = ''
+            self.__write_obj.write(line)
+        elif line[0:2] == 'tx':
+            # Skip the first 17 characters (the token prefix) and the last
+            # character of the line, keeping only the text in between.
+            self.__text_string += line[17:-1]
+
+    def __parse_index_func(self, my_string):
+        """
+        Requires:
+            my_string --the collected lines of one index entry, separated
+            by newlines
+        Returns:
+            A single field line describing the index entry.
+        Logic:
+            First pull out the "see" text and the bookmark text, and note
+            whether the entry is marked italic or bold. Then read the
+            remaining lines one token at a time. Text found before the
+            special colon token belongs to the main entry; text found
+            after it belongs to the sub entry. All other tokens (paragraph
+            endings included) are ignored, so no paragraphs end up inside
+            the entry.
+        """
+        my_string, see_text = self.__index_see_func(my_string)
+        my_string, bookmark_text = self.__index_bookmark_func(my_string)
+        italics, bold = self.__index__format_func(my_string)
+        pieces = ['mi<tg<empty-att_<field<type>index-entry', '<update>static']
+        if see_text:
+            pieces.append('<additional-text>%s' % see_text)
+        if bookmark_text:
+            pieces.append('<bookmark>%s' % bookmark_text)
+        if italics:
+            pieces.append('<italics>true')
+        if bold:
+            pieces.append('<bold>true')
+        main_parts = []
+        sub_parts = []
+        seen_colon = False
+        for token_line in my_string.split('\n'):
+            token = token_line[:16]
+            if token == 'cw<ml<colon_____':
+                seen_colon = True
+            elif token[0:2] == 'tx':
+                target = sub_parts if seen_colon else main_parts
+                target.append(token_line[17:])
+        pieces.append('<main-entry>%s' % ''.join(main_parts))
+        if seen_colon:
+            pieces.append('<sub-entry>%s' % ''.join(sub_parts))
+        pieces.append('\n')
+        return ''.join(pieces)
+
+    def __index_see_func(self, my_string):
+        """
+        Requires:
+            my_string --the lines of an index entry, separated by newlines
+        Returns:
+            changed_string --the lines that are not part of the "see" group
+            see_string --the text found inside the "see" group
+        Logic:
+            Keep a running bracket depth. When the index-see token is
+            found, remember the depth at which its group closes. Until
+            that closing bracket arrives, collect text lines into the see
+            string and drop everything else, the closing bracket included.
+            Lines outside the group are kept, each followed by a newline.
+        """
+        depth = 0
+        close_depth = sys.maxsize
+        inside_see = False
+        kept = []
+        see_parts = []
+        for token_line in my_string.split('\n'):
+            token = token_line[:16]
+            is_close = token == 'cb<nu<clos-brack'
+            if token == 'ob<nu<open-brack':
+                depth += 1
+            elif is_close:
+                depth -= 1
+            if not inside_see:
+                if token == 'cw<in<index-see_':
+                    close_depth = depth - 1
+                    inside_see = True
+                kept.append('%s\n' % token_line)
+            elif is_close and depth == close_depth:
+                inside_see = False
+            elif token == 'tx<nu<__________':
+                see_parts.append(token_line[17:])
+        return ''.join(kept), ''.join(see_parts)
+
+    def __index_bookmark_func(self, my_string):
+        """
+        Requires:
+            my_string --the lines of an index entry, separated by newlines
+        Returns:
+            index_string --the entry without the bookmark text lines
+            bookmark_string --the text found inside the bookmark group
+        Logic:
+            Keep a running bracket depth. When the place token is found,
+            remember the depth at which its group closes. Inside the
+            group, text lines go to the bookmark string; every other line,
+            the closing bracket included, stays in the index string.
+        """
+        depth = 0
+        close_depth = sys.maxsize
+        inside_bookmark = False
+        kept = []
+        bookmark_parts = []
+        for token_line in my_string.split('\n'):
+            token = token_line[:16]
+            is_close = token == 'cb<nu<clos-brack'
+            if token == 'ob<nu<open-brack':
+                depth += 1
+            elif is_close:
+                depth -= 1
+            if not inside_bookmark:
+                if token == 'cw<an<place_____':
+                    close_depth = depth - 1
+                    inside_bookmark = True
+                kept.append('%s\n' % token_line)
+            elif is_close and depth == close_depth:
+                inside_bookmark = False
+                kept.append('%s\n' % token_line)
+            elif token == 'tx<nu<__________':
+                bookmark_parts.append(token_line[17:])
+            else:
+                kept.append('%s\n' % token_line)
+        return ''.join(kept), ''.join(bookmark_parts)
+
+    def __index__format_func(self, my_string):
+        """
+        Requires:
+            my_string --the lines of an index entry, separated by newlines
+        Returns:
+            italics --1 if any line carries the index-ital token, else 0
+            bold --1 if any line carries the index-bold token, else 0
+        """
+        tokens = [token_line[:16] for token_line in my_string.split('\n')]
+        italics = 1 if 'cw<in<index-ital' in tokens else 0
+        bold = 1 if 'cw<in<index-bold' in tokens else 0
+        return italics, bold
+
+    def __parse_toc_func(self, my_string):
+        """
+        Requires:
+            my_string --the collected lines of one toc entry, separated by
+            newlines
+        Returns:
+            A single field line describing the toc entry.
+        Logic:
+            First pull out the text of any bookmark start or end inside
+            the entry. Then read the remaining lines: text lines make up
+            the main entry, the toc-level token supplies the level (the
+            last one found wins), and the toc-sup-nu token turns on
+            number suppression.
+        """
+        my_string, start_name, end_name = self.__parse_bookmark_for_toc(my_string)
+        level = 0
+        suppress_number = False
+        text_parts = []
+        for token_line in my_string.split('\n'):
+            token = token_line[:16]
+            if token[0:2] == 'tx':
+                text_parts.append(token_line[17:])
+            elif token == 'cw<tc<toc-level_':
+                # The value follows the 16-character token and '<nu<'.
+                level = token_line[20:]
+            elif token == 'cw<tc<toc-sup-nu':
+                suppress_number = True
+        pieces = ['mi<tg<empty-att_<field<type>toc-entry', '<update>static']
+        if start_name:
+            pieces.append('<bookmark-start>%s' % start_name)
+        if end_name:
+            pieces.append('<bookmark-end>%s' % end_name)
+        if level:
+            pieces.append('<toc-level>%s' % level)
+        if suppress_number:
+            pieces.append('<toc-suppress-number>true')
+        pieces.append('<main-entry>%s' % ''.join(text_parts))
+        pieces.append('\n')
+        return ''.join(pieces)
+
+    def __parse_bookmark_for_toc(self, my_string):
+        """
+        Requires:
+            my_string --the lines of a toc entry, separated by newlines
+        Returns:
+            toc_string --the entry without the bookmark text lines
+            book_start_string --text found inside bookmark-start groups
+            book_end_string --text found inside bookmark-end groups
+        Logic:
+            Keep a running bracket depth. When a bookmark start or end
+            token is found, remember which kind it is and the depth at
+            which its group closes. Inside the group, text lines go to the
+            matching bookmark string; every other line, the closing
+            bracket included, stays in the toc string.
+        """
+        start_parts = []
+        end_parts = []
+        # Which list receives the text of the group each marker opens.
+        sinks = {
+            'cw<an<book-mk-st': start_parts,
+            'cw<an<book-mk-en': end_parts,
+        }
+        sink = None
+        depth = 0
+        close_depth = sys.maxsize
+        inside_bookmark = False
+        kept = []
+        for token_line in my_string.split('\n'):
+            token = token_line[:16]
+            is_close = token == 'cb<nu<clos-brack'
+            if token == 'ob<nu<open-brack':
+                depth += 1
+            elif is_close:
+                depth -= 1
+            if not inside_bookmark:
+                if token in sinks:
+                    sink = sinks[token]
+                    close_depth = depth - 1
+                    inside_bookmark = True
+                kept.append('%s\n' % token_line)
+            elif is_close and depth == close_depth:
+                inside_bookmark = False
+                kept.append('%s\n' % token_line)
+            elif token == 'tx<nu<__________':
+                sink.append(token_line[17:])
+            else:
+                kept.append('%s\n' % token_line)
+        return ''.join(kept), ''.join(start_parts), ''.join(end_parts)
+
+    def __parse_bookmark_func(self, my_string, type):
+        """
+        Requires:
+            my_string --the text collected for the bookmark
+            type --the field type to write, e.g. 'bookmark-start' or
+            'bookmark-end'
+        Returns:
+            A field line with the type and, as its number, the complete
+            bookmark text.
+        """
+        template = 'mi<tg<empty-att_<field<type>%s<number>%s<update>none\n'
+        return template % (type, my_string)
+
+    def __found_toc_index_func(self, line, tag):
+        """
+        Requires:
+            line --the line to parse (not written out by this function)
+            tag --'toc' or 'index', the kind of entry that was found
+        Returns:
+            nothing
+        Logic:
+            Called when a toc or index entry control word is found.
+            Remember the most recent opening bracket count as the
+            beginning bracket count, reset the closing bracket count,
+            remember the kind of entry, and change the state to
+            'toc_index'.
+        """
+        opening_count = self.__ob_count
+        self.__beg_bracket_count = opening_count
+        self.__cb_count = 0
+        self.__state, self.__tag = 'toc_index', tag
+
+    def __toc_index_func(self, line):
+        """
+        Requires:
+            line --the line to parse
+        Returns:
+            nothing
+        Raises:
+            ValueError if the stored tag is neither 'index' nor 'toc'
+        Logic:
+            Handle one line inside a toc or index entry. Each line is
+            appended, unchanged, to the text buffer until the closing
+            bracket count equals the count stored when the entry was
+            found. At that point return to the 'body' state, turn the
+            buffer into a field line with the index or toc parser, write
+            the marker line, the field line and the closing line, and
+            empty the buffer.
+        """
+        if self.__beg_bracket_count != self.__cb_count:
+            self.__text_string += line
+            return
+        self.__state = 'body'
+        entry_kind = self.__tag
+        if entry_kind == 'index':
+            field_line = self.__parse_index_func(self.__text_string)
+        elif entry_kind == 'toc':
+            field_line = self.__parse_toc_func(self.__text_string)
+        else:
+            # Fail with a clear message instead of an unbound local below.
+            raise ValueError(
+                'unexpected field tag %r in fields_small' % (entry_kind,))
+        self.__write_obj.write(self.__marker)
+        self.__write_obj.write(field_line)
+        self.__text_string = ''
+        self.__write_obj.write(line)
+
+    def fix_fields(self):
+        """
+        Requires:
+            nothing
+        Returns:
+            nothing (changes the original file)
+        Logic:
+            Read one line in at a time. Keep track of the most recent
+            opening and closing bracket counts, then hand the line to the
+            function for the current state: before_body (look for the
+            beginning of the body), body, toc_index (for toc and index
+            entries) or bookmark.
+            If the state has no function, report it on stderr and copy
+            the line to the output unchanged.
+            When the whole file has been read, optionally keep a debugging
+            copy of the output, replace the original file with it, and
+            remove the temporary file.
+        """
+        self.__initiate_values()
+        with open_for_read(self.__file) as read_obj:
+            with open_for_write(self.__write_to) as self.__write_obj:
+                for line in read_obj:
+                    self.__token_info = line[:16]
+                    # The bracket count is the four characters in front
+                    # of the last character of the line.
+                    if self.__token_info == 'ob<nu<open-brack':
+                        self.__ob_count = line[-5:-1]
+                    if self.__token_info == 'cb<nu<clos-brack':
+                        self.__cb_count = line[-5:-1]
+                    action = self.__state_dict.get(self.__state)
+                    if action is None:
+                        sys.stderr.write('No matching state in module fields_small.py\n')
+                        sys.stderr.write(self.__state + '\n')
+                        # There is no handler to call; pass the line
+                        # through rather than calling None or losing it.
+                        self.__write_obj.write(line)
+                        continue
+                    action(line)
+        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
+        if self.__copy:
+            copy_obj.copy_file(self.__write_to, "fields_small.data")
+        copy_obj.rename(self.__write_to, self.__file)
+        os.remove(self.__write_to)