Initial import

2026-04-16 11:03:33 +02:00 · 2020-03-31 17:15:23 +02:00
commit d97ea9b0bc
311 changed files with 131419 additions and 0 deletions
--- a/ebook_converter/ebooks/rtf2xml/info.py
+++ b/ebook_converter/ebooks/rtf2xml/info.py
@@ -0,0 +1,285 @@
+from __future__ import unicode_literals, absolute_import, print_function, division
+#########################################################################
+#                                                                       #
+#                                                                       #
+#   copyright 2002 Paul Henry Tremblay                                  #
+#                                                                       #
+#   This program is distributed in the hope that it will be useful,     #
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of      #
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU    #
+#   General Public License for more details.                            #
+#                                                                       #
+#                                                                       #
+#########################################################################
+import sys, os, re
+
+from calibre.ebooks.rtf2xml import copy
+from calibre.ptempfile import better_mktemp
+from . import open_for_read, open_for_write
+
+
+class Info:
+    """
+    Make tags for document-information: the lines between the
+    'mi<mk<doc-in-beg' and 'mi<mk<doc-in-end' markers become 'mi<tg<...'
+    tag lines; lines before and after that section are copied unchanged.
+    """
+
+    def __init__(self,
+            in_file,
+            bug_handler,
+            copy=None,
+            run_level=1,
+            ):
+        """
+        Required:
+            in_file -- file to parse (fix_info() renames its result onto it)
+            bug_handler -- exception class to raise on internal errors
+        Optional:
+            copy -- whether to make a copy of result for debugging
+            run_level -- above 3, unknown time-field tokens raise bug_handler
+        Returns:
+            nothing
+        """
+        self.__file = in_file
+        self.__bug_handler = bug_handler
+        self.__copy = copy
+        self.__run_level = run_level
+        self.__write_to = better_mktemp()
+
+    def __initiate_values(self):
+        """
+        Initiate all values: state, text buffer and lookup tables.
+        """
+        self.__text_string = ''
+        self.__state = 'before_info_table'
+        self.rmspace = re.compile(r'\s+')
+        self.__state_dict = {
+            'before_info_table': self.__before_info_table_func,
+            'after_info_table': self.__after_info_table_func,
+            'in_info_table'    : self.__in_info_table_func,
+            'collect_text'      : self.__collect_text_func,
+            'collect_tokens'      : self.__collect_tokens_func,
+        }
+        self.__info_table_dict = {
+            'cw<di<title_____'  : (self.__found_tag_with_text_func, 'title'),
+            'cw<di<author____'  : (self.__found_tag_with_text_func, 'author'),
+            'cw<di<operator__'  : (self.__found_tag_with_text_func, 'operator'),
+            'cw<di<manager___'  : (self.__found_tag_with_text_func, 'manager'),
+            'cw<di<company___'  : (self.__found_tag_with_text_func, 'company'),
+            'cw<di<keywords__'  : (self.__found_tag_with_text_func, 'keywords'),
+            'cw<di<category__'  : (self.__found_tag_with_text_func, 'category'),
+            'cw<di<doc-notes_'  : (self.__found_tag_with_text_func, 'doc-notes'),
+            'cw<di<subject___'  : (self.__found_tag_with_text_func, 'subject'),
+            'cw<di<linkbase__'  : (self.__found_tag_with_text_func, 'hyperlink-base'),
+            # fields made up of several tokens, collected as attributes
+            'cw<di<create-tim'  : (self.__found_tag_with_tokens_func, 'creation-time'),
+            'cw<di<revis-time'  : (self.__found_tag_with_tokens_func, 'revision-time'),
+            'cw<di<print-time'  : (self.__found_tag_with_tokens_func, 'printing-time'),
+            'cw<di<backuptime'  : (self.__found_tag_with_tokens_func, 'backup-time'),
+            # fields whose single value sits on the same line
+            'cw<di<num-of-wor'  : (self.__single_field_func, 'number-of-words'),
+            'cw<di<num-of-chr'  : (self.__single_field_func, 'number-of-characters'),
+            'cw<di<numofchrws'  : (self.__single_field_func, 'number-of-characters-without-space'),
+            'cw<di<num-of-pag'  : (self.__single_field_func, 'number-of-pages'),
+            'cw<di<version___'  : (self.__single_field_func, 'version'),
+            'cw<di<edit-time_'  : (self.__single_field_func, 'editing-time'),
+            'cw<di<intern-ver'  : (self.__single_field_func, 'internal-version-number'),
+            'cw<di<internalID'  : (self.__single_field_func, 'internal-id-number'),
+        }
+        # NOTE(review): no 'hour______' key; such a token would be skipped (or
+        # raise above run level 3) in __collect_tokens_func -- TODO confirm.
+        self.__token_dict = {
+            'year______'        : 'year',
+            'month_____'        : 'month',
+            'day_______'        : 'day',
+            'minute____'        : 'minute',
+            'second____'        : 'second',
+            'revis-time'        : 'revision-time',
+            'create-tim'        : 'creation-time',
+            'edit-time_'        : 'editing-time',
+            'print-time'        : 'printing-time',
+            'backuptime'        : 'backup-time',
+            'num-of-wor'        : 'number-of-words',
+            'num-of-chr'        : 'number-of-characters',
+            'numofchrws'        : 'number-of-characters-without-space',
+            'num-of-pag'        : 'number-of-pages',
+            'version___'        : 'version',
+            'intern-ver'        : 'internal-version-number',
+            'internalID'        : 'internal-id-number',
+        }
+
+    def __before_info_table_func(self, line):
+        """
+        Required:
+            line -- the line to parse
+        Returns:
+            nothing
+        Logic:
+            Check for the beginning of the information table. When found, set
+            the state to the information table. Always write the line.
+        """
+        if self.__token_info == 'mi<mk<doc-in-beg':
+            self.__state = 'in_info_table'
+        self.__write_obj.write(line)
+
+    def __in_info_table_func(self, line):
+        """
+        Requires:
+            line -- line to parse
+        Returns:
+            nothing.
+        Logic:
+            Check for the end of information; that marker line only changes
+            the state and is not written. Otherwise, check if the token has
+            a special value in the info table dictionary. If it does,
+            execute that function. Otherwise, output the line to the file.
+        """
+        if self.__token_info == 'mi<mk<doc-in-end':
+            self.__state = 'after_info_table'
+            return
+        action, tag = self.__info_table_dict.get(self.__token_info, (None, None))
+        if action:
+            action(line, tag)
+        else:
+            self.__write_obj.write(line)
+
+    def __found_tag_with_text_func(self, line, tag):
+        """
+        Requires:
+            line -- line to parse (not used; the line itself is dropped)
+            tag -- what kind of line
+        Returns:
+            nothing
+        Logic:
+            This function marks the beginning of information fields that
+            have text that must be collected. Remember the type of field
+            given by the tag option and set the state to collecting text.
+        """
+        self.__tag = tag
+        self.__state = 'collect_text'
+
+    def __collect_text_func(self, line):
+        """
+        Requires:
+            line -- line to parse
+        Returns:
+            nothing
+        Logic:
+            If the end of the information field is found, write the text
+            string to the file, unless it holds nothing but whitespace.
+            Otherwise, if the line contains text, add it to the text string.
+        """
+        if self.__token_info == 'mi<mk<docinf-end':
+            self.__state = 'in_info_table'
+            # Don't print empty tags
+            if self.rmspace.sub('', self.__text_string):
+                self.__write_obj.write(
+                    'mi<tg<open______<%s\n'
+                    'tx<nu<__________<%s\n'
+                    'mi<tg<close_____<%s\n' % (self.__tag, self.__text_string, self.__tag)
+                )
+            self.__text_string = ''
+        elif line.startswith('tx'):
+            self.__text_string += line[17:-1]
+
+    def __found_tag_with_tokens_func(self, line, tag):
+        """
+        Requires:
+            line -- line to parse (not used; the line itself is dropped)
+            tag -- type of field
+        Returns:
+            nothing
+        Logic:
+            Some fields have a series of tokens (cw<di<year______<nu<2003)
+            that must be parsed as attributes for the element.
+            Set the state to collect tokens, and set the text string to
+            start an empty element with attributes.
+        """
+        self.__state = 'collect_tokens'
+        self.__text_string = 'mi<tg<empty-att_<%s' % tag
+        # mi<tg<empty-att_<page-definition<margin>33\n
+
+    def __collect_tokens_func(self, line):
+        """
+        Requires:
+            line -- line to parse
+        Returns:
+            nothing
+        Raises:
+            the bug handler, if the token has no dictionary match and the
+            run level is above 3
+        Logic:
+            First check for the end of the information field. If found,
+            write the text string to the file.
+            If not found, the line holds one token in abbreviated form
+            (num-of-wor). Look it up in a dictionary to convert it to a
+            longer, readable form (num-of-wor => number-of-words) and add
+            it, with its value, to the text string. A token without a
+            match is skipped unless the run level is above 3.
+        """
+        # cw<di<year______<nu<2003
+        if self.__token_info == 'mi<mk<docinf-end':
+            self.__state = 'in_info_table'
+            self.__write_obj.write('%s\n' % self.__text_string)
+            self.__text_string = ''
+            return
+        att = line[6:16]
+        value = line[20:-1]
+        att_changed = self.__token_dict.get(att)
+        if att_changed is not None:
+            self.__text_string += '<%s>%s' % (att_changed, value)
+        elif self.__run_level > 3:
+            msg = 'No dictionary match for %s\n' % att
+            raise self.__bug_handler(msg)
+
+    def __single_field_func(self, line, tag):
+        """
+        Write a field whose value is on the line itself
+        (cw<di<num-of-pag<nu<12) as an empty element with one attribute.
+        """
+        value = line[20:-1]
+        self.__write_obj.write('mi<tg<empty-att_<%s<%s>%s\n' % (tag, tag, value))
+
+    def __after_info_table_func(self, line):
+        """
+        Requires:
+            line -- line to write to file
+        Returns:
+            nothing
+        Logic:
+            After the end of the information table, simply write the line
+            to the file.
+        """
+        self.__write_obj.write(line)
+
+    def fix_info(self):
+        """
+        Requires:
+            nothing
+        Returns:
+            nothing (changes the original file)
+        Logic:
+            Read one line in at a time and pass it to the function for the
+            current state (before, in, or after the information table). An
+            unknown state raises the bug handler.
+        """
+        self.__initiate_values()
+        with open_for_read(self.__file) as read_obj:
+            with open_for_write(self.__write_to) as self.__write_obj:
+                for line in read_obj:
+                    self.__token_info = line[:16]
+                    action = self.__state_dict.get(self.__state)
+                    if action is None:
+                        # Fail explicitly: calling a missing handler would
+                        # only surface as an opaque TypeError.
+                        msg = ('No matching state in module info.py\n%s\n'
+                               % self.__state)
+                        sys.stderr.write(msg)
+                        raise self.__bug_handler(msg)
+                    action(line)
+        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
+        if self.__copy:
+            copy_obj.copy_file(self.__write_to, "info.data")
+        copy_obj.rename(self.__write_to, self.__file)
+        os.remove(self.__write_to)