Initial import

2026-03-24 19:33:33 +01:00 · 2020-03-31 17:15:23 +02:00
commit d97ea9b0bc
311 changed files with 131419 additions and 0 deletions
--- a/ebook_converter/ebooks/rtf2xml/ParseRtf.py
+++ b/ebook_converter/ebooks/rtf2xml/ParseRtf.py
@@ -0,0 +1,573 @@
+from __future__ import absolute_import, division, print_function, unicode_literals
+#########################################################################
+#                                                                       #
+#                                                                       #
+#   copyright 2002 Paul Henry Tremblay                                  #
+#                                                                       #
+#   This program is distributed in the hope that it will be useful,     #
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of      #
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU    #
+#   General Public License for more details.                            #
+#                                                                       #
+#                                                                       #
+#########################################################################
+# $Revision: 1.41 $
+# $Date: 2006/03/24 23:50:07 $
+import sys, os
+
+from calibre.ebooks.rtf2xml import headings_to_sections, \
+    line_endings, footnote, fields_small, default_encoding, \
+    make_lists, preamble_div, header, colors, group_borders, \
+    check_encoding, add_brackets, table, combine_borders, \
+    fields_large, process_tokens, hex_2_utf8, tokenize, \
+    delete_info, sections, check_brackets, styles, \
+    paragraph_def, convert_to_tags, output, copy, \
+    list_numbers, info, pict, table_info, fonts, paragraphs, \
+    body_styles, preamble_rest, group_styles, \
+    inline
+from calibre.ebooks.rtf2xml.old_rtf import OldRtf
+from polyglot.builtins import unicode_type
+
+from . import open_for_read, open_for_write
+
+"""
+Here is an example script using the ParseRtf module directly:
+#!/usr/bin/env python2
+
+def Handle_Main():
+    # Handles options and creates a parse object
+    parse_obj = ParseRtf.ParseRtf(
+            in_file = 'in.rtf',
+            # All values from here on are optional
+            # determine the output file
+            out_file = 'out.xml',
+            # determine the run level. The default is 1.
+            run_level = 3,
+            # The name of a debug directory, if you are running at
+            # run level 3 or higher.
+            deb_dir = 'debug_dir',
+            # Convert RTF caps to real caps.
+            # Default is None.
+            convert_caps = 1,
+            # Indent resulting XML.
+            # Default is None (no indent).
+            indent = 1,
+            # Form lists from RTF. Default is 1.
+            form_lists = 1,
+            # Convert headings to sections. Default is 1.
+            headings_to_sections = 1,
+            # Group paragraphs with the same style name. Default is 1.
+            group_styles = 1,
+            # Group borders. Default is 1.
+            group_borders = 1,
+            # Write or do not write empty paragraphs. Default is 1.
+            empty_paragraphs = 0,
+            # Allow to use a custom default encoding as fallback
+            default_encoding = 'cp1252',
+    )
+    try:
+        parse_obj.parse_rtf()
+    except ParseRtf.InvalidRtfException as msg:
+        sys.stderr.write(str(msg))
+    except ParseRtf.RtfInvalidCodeException as msg:
+        sys.stderr.write(str(msg))
+"""
+
+
+class InvalidRtfException(Exception):
+    """Raised when the input cannot be processed as valid RTF."""
+
+
+class RtfInvalidCodeException(Exception):
+    """Raised for bugs in the program; handed to the helpers as ``bug_handler``."""
+
+
+class ParseRtf:
+    """
+    Main class for controlling the rest of the parsing.
+    """
+
+    def __init__(self,
+                in_file,
+                out_file='',
+                out_dir=None,
+                dtd='',
+                deb_dir=None,
+                convert_symbol=None,
+                convert_wingdings=None,
+                convert_zapf=None,
+                convert_caps=None,
+                run_level=1,
+                indent=None,
+                replace_illegals=1,
+                form_lists=1,
+                headings_to_sections=1,
+                group_styles=1,
+                group_borders=1,
+                empty_paragraphs=1,
+                no_dtd=0,
+                char_data='',
+                default_encoding='cp1252',
+                ):
+        """
+        Store the conversion options and validate the given paths.
+
+        Requires:
+        'in_file' --file to parse: a path, or an object with a 'read'
+        attribute (such objects are accepted without a path check).
+        Possible parameters, but not necessary:
+            'out_file' --output file, handed to the picture and output steps.
+            'out_dir' --output directory; also stored as the temporary
+            directory. Must be an existing directory when given.
+            'dtd' --path to dtd
+            'deb_dir' --debug directory. If a deb_dir is provided, the script
+            will copy each run through as a file to examine in the deb_dir.
+            Must be an existing directory when given.
+            'convert_symbol', 'convert_wingdings', 'convert_zapf',
+            'convert_caps' --forwarded to the hex-to-UTF-8 conversion of the
+            body.
+            'run_level' --above 2, the brackets are checked after each run
+            through the file; it is also forwarded to every processing step.
+            'indent', 'no_dtd' --forwarded to the conversion to tags.
+            'replace_illegals' --forwarded to the line-ending fixer.
+            'form_lists', 'headings_to_sections', 'group_styles',
+            'group_borders' --a true value enables the matching step.
+            'empty_paragraphs' --forwarded to the paragraph step as
+            'write_empty_para'.
+            'char_data' --file containing character maps
+            'default_encoding' --fallback encoding forwarded to the default
+            encoding detection.
+        Returns: Nothing
+        Raises:
+            RtfInvalidCodeException --if 'in_file' is None or does not exist,
+            or if 'out_dir' or 'deb_dir' is given but is not a directory.
+        """
+
+        self.__file = in_file
+        self.__out_file = out_file
+        self.__out_dir = out_dir
+        # the temporary directory is simply the output directory
+        self.__temp_dir = out_dir
+        self.__dtd_path = dtd
+        self.__check_file(in_file,"file_to_parse")
+        self.__char_data = char_data
+        self.__debug_dir = deb_dir
+        # called for validation only; the return value is not needed here
+        self.__check_dir(self.__temp_dir)
+        # 1 when a valid debug directory was given, None otherwise
+        self.__copy = self.__check_dir(self.__debug_dir)
+        self.__convert_caps = convert_caps
+        self.__convert_symbol = convert_symbol
+        self.__convert_wingdings = convert_wingdings
+        self.__convert_zapf = convert_zapf
+        self.__run_level = run_level
+        # highest return code seen so far; returned by parse_rtf
+        self.__exit_level = 0
+        self.__indent = indent
+        self.__replace_illegals = replace_illegals
+        self.__form_lists = form_lists
+        self.__headings_to_sections = headings_to_sections
+        self.__group_styles = group_styles
+        self.__group_borders = group_borders
+        self.__empty_paragraphs = empty_paragraphs
+        self.__no_dtd = no_dtd
+        self.__default_encoding = default_encoding
+
+    def __check_file(self, the_file, type):
+        """
+        Check to see if files exist.
+        Requires:
+            'the_file' --a path, or an object with a 'read' attribute (such
+            objects are accepted as they are).
+            'type' --label for the kind of file being checked; it selects
+            the error message used when no file was given.
+        Returns:
+            Nothing
+        Raises:
+            RtfInvalidCodeException --if 'the_file' is None or the path does
+            not exist.
+        """
+        if hasattr(the_file, 'read'):
+            return
+        if the_file is None:
+            if type == "file_to_parse":
+                msg = "\nYou must provide a file for the script to work"
+            else:
+                # Any other label used to leave 'msg' unbound, so the raise
+                # below failed with UnboundLocalError instead.
+                msg = "\nNo file was provided for '%s'" % type
+            raise RtfInvalidCodeException(msg)
+        if not os.path.exists(the_file):
+            msg = "\nThe file '%s' cannot be found" % the_file
+            raise RtfInvalidCodeException(msg)
+
+    def __check_dir(self, the_dir):
+        """
+        Check that a directory, when one is given, really exists.
+        Returns:
+            None for an empty value, 1 for an existing directory.
+        Raises:
+            RtfInvalidCodeException --if the value is not a directory.
+        """
+        if the_dir:
+            if os.path.isdir(the_dir):
+                return 1
+            raise RtfInvalidCodeException("\n%s is not a directory" % the_dir)
+        return None
+
+    def parse_rtf(self):
+        """
+        Parse the file by calling on other classes.
+
+        The input is first copied to a temporary file, and every step below
+        is given that same temporary file to work on. The last step hands the
+        result to the output object, and the temporary file is removed.
+        Requires:
+            Nothing
+        Returns:
+            The exit level: the highest return code recorded during the run
+            (0 if none was recorded).
+        Raises:
+            InvalidRtfException --if the token processing fails. The
+            temporary file is removed first (best effort).
+            RtfInvalidCodeException --if the file looks like older RTF and
+            the run level is above 5.
+        """
+        self.__temp_file = self.__make_temp_file(self.__file)
+        # if the self.__debug_dir is true, then create a copy object,
+        # set the directory to write to, remove files, and copy
+        # the new temporary file to this directory
+        if self.__debug_dir:
+            copy_obj = copy.Copy(
+                bug_handler=RtfInvalidCodeException,
+                    )
+            copy_obj.set_dir(self.__debug_dir)
+            copy_obj.remove_files()
+            copy_obj.copy_file(self.__temp_file, "original_file")
+        # Function to check if bracket are well handled
+        if self.__debug_dir or self.__run_level > 2:
+            self.__check_brack_obj = check_brackets.CheckBrackets(
+                file=self.__temp_file,
+                bug_handler=RtfInvalidCodeException,
+            )
+        # convert Macintosh and Windows line endings to Unix line endings
+        # why do this if you don't wb after?
+        line_obj = line_endings.FixLineEndings(
+                in_file=self.__temp_file,
+                bug_handler=RtfInvalidCodeException,
+                copy=self.__copy,
+                run_level=self.__run_level,
+                replace_illegals=self.__replace_illegals,
+                )
+        return_value = line_obj.fix_endings()  # calibre return what?
+        self.__return_code(return_value)
+        tokenize_obj = tokenize.Tokenize(
+                bug_handler=RtfInvalidCodeException,
+                in_file=self.__temp_file,
+                copy=self.__copy,
+                run_level=self.__run_level)
+        tokenize_obj.tokenize()
+        process_tokens_obj = process_tokens.ProcessTokens(
+            in_file=self.__temp_file,
+            bug_handler=RtfInvalidCodeException,
+            copy=self.__copy,
+            run_level=self.__run_level,
+            exception_handler=InvalidRtfException,
+            )
+        try:
+            return_value = process_tokens_obj.process_tokens()
+        except InvalidRtfException as msg:
+            # Check to see if the file is correctly encoded
+            encode_obj = default_encoding.DefaultEncoding(
+            in_file=self.__temp_file,
+            run_level=self.__run_level,
+            bug_handler=RtfInvalidCodeException,
+            check_raw=True,
+            default_encoding=self.__default_encoding,
+            )
+            platform, code_page, default_font_num = encode_obj.find_default_encoding()
+            check_encoding_obj = check_encoding.CheckEncoding(
+                    bug_handler=RtfInvalidCodeException,
+                        )
+            enc = encode_obj.get_codepage()
+            # TODO: to check if cp is a good idea or if I should use a dict to convert
+            enc = 'cp' + enc
+            msg = '%s\nException in token processing' % unicode_type(msg)
+            if check_encoding_obj.check_encoding(self.__file, enc):
+                # Put the name into the message as text. Encoding it to
+                # bytes first made the message read "File b'...'" on
+                # Python 3.
+                file_name = self.__file
+                if isinstance(file_name, bytes):
+                    file_name = file_name.decode('utf-8', 'replace')
+                msg +='\nFile %s does not appear to be correctly encoded.\n' % file_name
+            # best effort: the temporary file may already be gone
+            try:
+                os.remove(self.__temp_file)
+            except OSError:
+                pass
+            raise InvalidRtfException(msg)
+        delete_info_obj = delete_info.DeleteInfo(
+            in_file=self.__temp_file,
+            copy=self.__copy,
+            bug_handler=RtfInvalidCodeException,
+            run_level=self.__run_level,)
+        # found destination means {\*\destination
+        # if found, the RTF should be newer RTF
+        found_destination = delete_info_obj.delete_info()
+        self.__bracket_match('delete_data_info')
+        # put picts in a separate file
+        pict_obj = pict.Pict(
+            in_file=self.__temp_file,
+            bug_handler=RtfInvalidCodeException,
+            copy=self.__copy,
+            orig_file=self.__file,
+            out_file=self.__out_file,
+            run_level=self.__run_level,
+           )
+        pict_obj.process_pict()
+        self.__bracket_match('pict_data_info')
+        combine_obj = combine_borders.CombineBorders(
+            in_file=self.__temp_file,
+            bug_handler=RtfInvalidCodeException,
+            copy=self.__copy,
+            run_level=self.__run_level,)
+        combine_obj.combine_borders()
+        self.__bracket_match('combine_borders_info')
+        footnote_obj = footnote.Footnote(
+            in_file=self.__temp_file,
+            bug_handler=RtfInvalidCodeException,
+            copy=self.__copy,
+            run_level=self.__run_level,
+            )
+        footnote_obj.separate_footnotes()
+        self.__bracket_match('separate_footnotes_info')
+        header_obj = header.Header(
+            in_file=self.__temp_file,
+            bug_handler=RtfInvalidCodeException,
+            copy=self.__copy,
+            run_level=self.__run_level,
+            )
+        header_obj.separate_headers()
+        self.__bracket_match('separate_headers_info')
+        list_numbers_obj = list_numbers.ListNumbers(
+            in_file=self.__temp_file,
+            bug_handler=RtfInvalidCodeException,
+            copy=self.__copy,
+            run_level=self.__run_level,
+            )
+        list_numbers_obj.fix_list_numbers()
+        self.__bracket_match('list_number_info')
+        preamble_div_obj = preamble_div.PreambleDiv(
+            in_file=self.__temp_file,
+            bug_handler=RtfInvalidCodeException,
+            copy=self.__copy,
+            run_level=self.__run_level,
+             )
+        list_of_lists = preamble_div_obj.make_preamble_divisions()
+        self.__bracket_match('make_preamble_divisions')
+        encode_obj = default_encoding.DefaultEncoding(
+            in_file=self.__temp_file,
+            run_level=self.__run_level,
+            bug_handler=RtfInvalidCodeException,
+            default_encoding=self.__default_encoding,
+            )
+        platform, code_page, default_font_num = encode_obj.find_default_encoding()
+        hex2utf_obj = hex_2_utf8.Hex2Utf8(
+                in_file=self.__temp_file,
+                copy=self.__copy,
+                area_to_convert='preamble',
+                char_file=self.__char_data,
+                default_char_map=code_page,
+                run_level=self.__run_level,
+                bug_handler=RtfInvalidCodeException,
+                invalid_rtf_handler=InvalidRtfException,
+                )
+        hex2utf_obj.convert_hex_2_utf8()
+        self.__bracket_match('hex_2_utf_preamble')
+        fonts_obj = fonts.Fonts(
+            in_file=self.__temp_file,
+            bug_handler=RtfInvalidCodeException,
+            copy=self.__copy,
+            default_font_num=default_font_num,
+            run_level=self.__run_level,
+            )
+        special_font_dict = fonts_obj.convert_fonts()
+        self.__bracket_match('fonts_info')
+        color_obj = colors.Colors(
+            in_file=self.__temp_file,
+            copy=self.__copy,
+            bug_handler=RtfInvalidCodeException,
+            run_level=self.__run_level,
+            )
+        color_obj.convert_colors()
+        self.__bracket_match('colors_info')
+        style_obj = styles.Styles(
+            in_file=self.__temp_file,
+            bug_handler=RtfInvalidCodeException,
+            copy=self.__copy,
+            run_level=self.__run_level,
+            )
+        style_obj.convert_styles()
+        self.__bracket_match('styles_info')
+        info_obj = info.Info(
+            in_file=self.__temp_file,
+            bug_handler=RtfInvalidCodeException,
+            copy=self.__copy,
+            run_level=self.__run_level,
+            )
+        info_obj.fix_info()
+        default_font = special_font_dict.get('default-font')
+        preamble_rest_obj = preamble_rest.Preamble(
+            file=self.__temp_file, copy=self.__copy,
+            bug_handler=RtfInvalidCodeException,
+            platform=platform, default_font=default_font,
+            code_page=code_page)
+        preamble_rest_obj.fix_preamble()
+        self.__bracket_match('preamble_rest_info')
+        old_rtf_obj = OldRtf(
+                in_file=self.__temp_file,
+                bug_handler=RtfInvalidCodeException,
+                run_level=self.__run_level,
+                )
+        # RTF can actually have destination groups and old RTF.
+        # BAH!
+        old_rtf = old_rtf_obj.check_if_old_rtf()
+        if old_rtf:
+            if self.__run_level > 5:
+                msg = 'Older RTF\n' \
+                'self.__run_level is "%s"\n' % self.__run_level
+                raise RtfInvalidCodeException(msg)
+            if self.__run_level > 1:
+                sys.stderr.write('File could be older RTF...\n')
+            if found_destination:
+                if self.__run_level > 1:
+                    sys.stderr.write(
+                        'File also has newer RTF.\n'
+                        'Will do the best to convert...\n'
+                    )
+            add_brackets_obj = add_brackets.AddBrackets(
+                    in_file=self.__temp_file,
+                    bug_handler=RtfInvalidCodeException,
+                    copy=self.__copy,
+                    run_level=self.__run_level,
+                    )
+            add_brackets_obj.add_brackets()
+        fields_small_obj = fields_small.FieldsSmall(
+            in_file=self.__temp_file,
+            copy=self.__copy,
+            bug_handler=RtfInvalidCodeException,
+            run_level=self.__run_level,)
+        fields_small_obj.fix_fields()
+        self.__bracket_match('fix_small_fields_info')
+        fields_large_obj = fields_large.FieldsLarge(
+            in_file=self.__temp_file,
+            copy=self.__copy,
+            bug_handler=RtfInvalidCodeException,
+            run_level=self.__run_level)
+        fields_large_obj.fix_fields()
+        self.__bracket_match('fix_large_fields_info')
+        sections_obj = sections.Sections(
+            in_file=self.__temp_file,
+            bug_handler=RtfInvalidCodeException,
+            copy=self.__copy,
+            run_level=self.__run_level,)
+        sections_obj.make_sections()
+        self.__bracket_match('sections_info')
+        paragraphs_obj = paragraphs.Paragraphs(
+            in_file=self.__temp_file,
+            bug_handler=RtfInvalidCodeException,
+            copy=self.__copy,
+            write_empty_para=self.__empty_paragraphs,
+            run_level=self.__run_level,)
+        paragraphs_obj.make_paragraphs()
+        self.__bracket_match('paragraphs_info')
+        default_font = special_font_dict['default-font']
+        paragraph_def_obj = paragraph_def.ParagraphDef(
+            in_file=self.__temp_file,
+            bug_handler=RtfInvalidCodeException,
+            copy=self.__copy,
+            default_font=default_font,
+            run_level=self.__run_level,)
+        list_of_styles = paragraph_def_obj.make_paragraph_def()
+        body_styles_obj = body_styles.BodyStyles(
+            in_file=self.__temp_file,
+            bug_handler=RtfInvalidCodeException,
+            copy=self.__copy,
+            list_of_styles=list_of_styles,
+            run_level=self.__run_level,)
+        body_styles_obj.insert_info()
+        self.__bracket_match('body_styles_info')
+        self.__bracket_match('paragraph_def_info')
+        table_obj = table.Table(
+                in_file=self.__temp_file,
+                bug_handler=RtfInvalidCodeException,
+                copy=self.__copy,
+                run_level=self.__run_level,)
+        table_data = table_obj.make_table()
+        self.__bracket_match('table_info')
+        table_info_obj = table_info.TableInfo(
+            in_file=self.__temp_file,
+            bug_handler=RtfInvalidCodeException,
+            copy=self.__copy,
+            table_data=table_data,
+            run_level=self.__run_level,)
+        table_info_obj.insert_info()
+        self.__bracket_match('table__data_info')
+        if self.__form_lists:
+            make_list_obj =  make_lists.MakeLists(
+                in_file=self.__temp_file,
+                bug_handler=RtfInvalidCodeException,
+                copy=self.__copy,
+                headings_to_sections=self.__headings_to_sections,
+                run_level=self.__run_level,
+                list_of_lists=list_of_lists,
+                )
+            make_list_obj.make_lists()
+            self.__bracket_match('form_lists_info')
+        if self.__headings_to_sections:
+            headings_to_sections_obj =  headings_to_sections.HeadingsToSections(
+                in_file=self.__temp_file,
+                bug_handler=RtfInvalidCodeException,
+                copy=self.__copy,
+                run_level=self.__run_level,)
+            headings_to_sections_obj.make_sections()
+            self.__bracket_match('headings_to_sections_info')
+        if self.__group_styles:
+            group_styles_obj = group_styles.GroupStyles(
+                in_file=self.__temp_file,
+                bug_handler=RtfInvalidCodeException,
+                copy=self.__copy,
+                wrap=1,
+                run_level=self.__run_level,)
+            group_styles_obj.group_styles()
+            self.__bracket_match('group_styles_info')
+        if self.__group_borders:
+            group_borders_obj = group_borders.GroupBorders(
+                in_file=self.__temp_file,
+                bug_handler=RtfInvalidCodeException,
+                copy=self.__copy,
+                wrap=1,
+                run_level=self.__run_level,)
+            group_borders_obj.group_borders()
+            self.__bracket_match('group_borders_info')
+        inline_obj = inline.Inline(
+                in_file=self.__temp_file,
+                bug_handler=RtfInvalidCodeException,
+                copy=self.__copy,
+                run_level=self.__run_level,)
+        inline_obj.form_tags()
+        self.__bracket_match('inline_info')
+        # the converter built for the preamble is reused for the body
+        hex2utf_obj.update_values(file=self.__temp_file,
+                            area_to_convert='body',
+                            copy=self.__copy,
+                            char_file=self.__char_data,
+                            convert_caps=self.__convert_caps,
+                            convert_symbol=self.__convert_symbol,
+                            convert_wingdings=self.__convert_wingdings,
+                            convert_zapf=self.__convert_zapf,
+                            symbol=1,
+                            wingdings=1,
+                            dingbats=1,
+                )
+        hex2utf_obj.convert_hex_2_utf8()
+        header_obj.join_headers()
+        footnote_obj.join_footnotes()
+        tags_obj = convert_to_tags.ConvertToTags(
+                in_file=self.__temp_file,
+                copy=self.__copy,
+                dtd_path=self.__dtd_path,
+                indent=self.__indent,
+                run_level=self.__run_level,
+                no_dtd=self.__no_dtd,
+                encoding=encode_obj.get_codepage(),
+                bug_handler=RtfInvalidCodeException,
+                )
+        tags_obj.convert_to_tags()
+        output_obj = output.Output(
+                file=self.__temp_file,
+                orig_file=self.__file,
+                output_dir=self.__out_dir,
+                out_file=self.__out_file,
+            )
+        output_obj.output()
+        os.remove(self.__temp_file)
+        return self.__exit_level
+
+    def __bracket_match(self, file_name):
+        """
+        Above run level 2, check the brackets and report a mismatch on
+        standard error, labelled with 'file_name'. Does nothing otherwise.
+        """
+        if not self.__run_level > 2:
+            return
+        brackets_ok, detail = self.__check_brack_obj.check_brackets()
+        if not brackets_ok:
+            print('%s in file %s' % (detail, file_name), file=sys.stderr)
+
+    def __return_code(self, num):
+        """
+        Record a return code from a processing step, keeping the highest
+        one seen so far as the exit level. A code of None is ignored.
+        """
+        if num is None:
+            return
+        # Store the converted value: keeping the raw 'num' let a string
+        # become the exit level, which breaks the next comparison.
+        level = int(num)
+        if level > self.__exit_level:
+            self.__exit_level = level
+
+    def __make_temp_file(self,file):
+        """
+        Make a temporary file to parse.
+        Requires:
+            'file' --a path, or an object with a 'read' attribute that
+            yields lines when iterated.
+        Returns:
+            The name of the temporary file.
+        """
+        # NOTE(review): fixed name, relative to the current directory
+        write_file="rtf_write_file"
+        opened_here = not hasattr(file, 'read')
+        read_obj = open_for_read(file) if opened_here else file
+        try:
+            with open_for_write(write_file) as write_obj:
+                for line in read_obj:
+                    write_obj.write(line)
+        finally:
+            # Close only what was opened here; an object handed in by the
+            # caller stays open for the caller to manage.
+            if opened_here:
+                read_obj.close()
+        return write_file
--- a/ebook_converter/ebooks/rtf2xml/init.py
+++ b/ebook_converter/ebooks/rtf2xml/init.py
@@ -0,0 +1,12 @@
+from __future__ import unicode_literals, absolute_import, print_function, division
+
+import io
+
+
+def open_for_read(path):
+    """Open 'path' as UTF-8 text for reading; undecodable bytes are replaced."""
+    return io.open(path, mode='r', encoding='utf-8', errors='replace')
+
+
+def open_for_write(path, append=False):
+    """
+    Open 'path' as UTF-8 text for writing, or for appending when 'append'
+    is true. Line endings are written exactly as given (newline='').
+    """
+    if append:
+        mode = 'a'
+    else:
+        mode = 'w'
+    return io.open(path, mode, encoding='utf-8', errors='replace', newline='')
--- a/ebook_converter/ebooks/rtf2xml/add_brackets.py
+++ b/ebook_converter/ebooks/rtf2xml/add_brackets.py
@@ -0,0 +1,232 @@
+from __future__ import unicode_literals, absolute_import, print_function, division
+#########################################################################
+#                                                                       #
+#                                                                       #
+#   copyright 2002 Paul Henry Tremblay                                  #
+#                                                                       #
+#   This program is distributed in the hope that it will be useful,     #
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of      #
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU    #
+#   General Public License for more details.                            #
+#                                                                       #
+#                                                                       #
+#                                                                       #
+#########################################################################
+import sys, os
+
+from calibre.ebooks.rtf2xml import copy, check_brackets
+from calibre.ptempfile import better_mktemp
+from polyglot.builtins import iteritems
+from . import open_for_read, open_for_write
+
+
+class AddBrackets:
+    """
+    Add brackets for old RTF.
+    Logic:
+    When control words without their own brackets are encountered
+    and in the list of allowed words, this will add brackets
+    to facilitate the treatment of the file
+    """
+
+    def __init__(self, in_file,
+            bug_handler,
+            copy=None,
+            run_level=1,
+            ):
+        """
+        Required:
+            'in_file'--file to parse
+            'bug_handler'--handed to the copy object used when the result
+            is written back
+        Optional:
+            'copy'-- whether to make a copy of result for debugging
+            'run_level'-- above 0, a warning is written to standard error
+            when the brackets of the result do not match
+        Returns:
+            nothing
+            """
+        self.__file = in_file
+        self.__bug_handler = bug_handler
+        self.__copy = copy
+        # temporary file that receives the rewritten tokens
+        self.__write_to = better_mktemp()
+        self.__run_level = run_level
+        # state name -> method that handles a line while in that state
+        self.__state_dict = {
+            'before_body'           : self.__before_body_func,
+            'in_body'               : self.__in_body_func,
+            'after_control_word'    : self.__after_control_word_func,
+            'in_ignore'             : self.__ignore_func,
+        }
+        # control words (first 16 characters of a token line) that may be
+        # wrapped in added brackets
+        self.__accept = [
+            'cw<ci<bold______' ,
+            'cw<ci<annotation' ,
+            'cw<ci<blue______' ,
+            # 'cw<ci<bold______' ,
+            'cw<ci<caps______' ,
+            'cw<ci<char-style' ,
+            'cw<ci<dbl-strike' ,
+            'cw<ci<emboss____' ,
+            'cw<ci<engrave___' ,
+            'cw<ci<font-color' ,
+            'cw<ci<font-down_' ,
+            'cw<ci<font-size_' ,
+            'cw<ci<font-style' ,
+            'cw<ci<font-up___' ,
+            'cw<ci<footnot-mk' ,
+            'cw<ci<green_____' ,
+            'cw<ci<hidden____' ,
+            'cw<ci<italics___' ,
+            'cw<ci<outline___' ,
+            'cw<ci<red_______' ,
+            'cw<ci<shadow____' ,
+            'cw<ci<small-caps' ,
+            'cw<ci<strike-thr' ,
+            'cw<ci<subscript_' ,
+            'cw<ci<superscrip' ,
+            'cw<ci<underlined' ,
+            # 'cw<ul<underlined' ,
+        ]
+
+    def __initiate_values(self):
+        """
+        Reset the parsing state before a file is processed.
+        """
+        # nothing opened by this module yet, no bracket seen in the input
+        self.__open_bracket = self.__found_brackets = False
+        self.__temp_group = []
+        self.__inline = {}
+        self.__state = 'before_body'
+
+    def __before_body_func(self, line):
+        """
+        Before the body nothing is changed: every line is copied as is.
+        The body-open marker switches the state to 'in_body'.
+        """
+        body_starts = self.__token_info == 'mi<mk<body-open_'
+        if body_starts:
+            self.__state = 'in_body'
+        self.__write_obj.write(line)
+
+    def __in_body_func(self, line):
+        """
+        Decide what to do with a line of the body:
+            1-On the closing bracket numbered 0001, close the bracket this
+            module opened (if any) before writing the line
+            2-If an open bracket is found, the code inside is ignored
+            (written without modifications)
+            3-If an accepted control word is found, put the line
+            in a buffer then change state to after control word
+            4-Else simply write the line
+        """
+        out = self.__write_obj
+        if line == 'cb<nu<clos-brack<0001\n' and self.__open_bracket:
+            out.write('cb<nu<clos-brack<0003\n')
+            out.write(line)
+            return
+        token = self.__token_info
+        if token == 'ob<nu<open-brack':
+            self.__found_brackets = True
+            self.__state = 'in_ignore'
+            # remember which bracket to wait for before leaving 'in_ignore'
+            self.__ignore_count = self.__ob_count
+            out.write(line)
+            return
+        if token in self.__accept:
+            self.__temp_group.append(line)
+            self.__state = 'after_control_word'
+            return
+        out.write(line)
+
+    def __after_control_word_func(self, line):
+        """
+        After a control word, either add the next accepted control word to
+        the temporary list, or update the group and write it out.
+        If the token that ends the run is an open bracket, go to the
+        ignore state; otherwise go back to the body state.
+        """
+        if self.__token_info in self.__accept:
+            self.__temp_group.append(line)
+            return
+        self.__change_permanent_group()
+        self.__write_group()
+        self.__write_obj.write(line)
+        if self.__token_info != 'ob<nu<open-brack':
+            self.__state = 'in_body'
+            return
+        self.__state = 'in_ignore'
+        self.__ignore_count = self.__ob_count
+
+    def __write_group(self):
+        """
+        Write the temporary group once a run of accepted control words ends.
+        Any bracket opened earlier by this module is closed first. If at
+        least one stored control word has a value other than 'false', a new
+        bracket is opened and those control words are written inside it.
+        The temporary group is emptied afterwards.
+        """
+        if self.__open_bracket:
+            self.__write_obj.write('cb<nu<clos-brack<0003\n')
+            self.__open_bracket = False
+
+        pieces = []
+        for key, value in iteritems(self.__inline):
+            if value != 'false':
+                pieces.append('%s<nu<%s\n' % (key, value))
+        inline_string = ''.join(pieces)
+        if inline_string:
+            self.__write_obj.write('ob<nu<open-brack<0003\n' + inline_string)
+            self.__open_bracket = True
+        self.__temp_group = []
+
+    def __change_permanent_group(self):
+        """
+        Rebuild the permanent group from the temporary group.
+        Each buffered line is split into its control word (first 16
+        characters) and its value (from character 20 up to the final
+        character). Control words that are not accepted are left out; a
+        control word that occurs twice keeps its last value.
+        """
+        rebuilt = {}
+        for entry in self.__temp_group:
+            control_word = entry[:16]
+            if control_word in self.__accept:
+                rebuilt[control_word] = entry[20:-1]
+        self.__inline = rebuilt
+
+    def __ignore_func(self, line):
+        """
+        Copy the data inside brackets that are already in the RTF. The
+        closing bracket whose number matches the ignored open bracket
+        switches the state back to 'in_body'.
+        """
+        self.__write_obj.write(line)
+        if self.__token_info != 'cb<nu<clos-brack':
+            return
+        if self.__cb_count == self.__ignore_count:
+            self.__state = 'in_body'
+
+    def __check_brackets(self, in_file):
+        """
+        Run the bracket checker on 'in_file' and return the first item of
+        its result (true if the brackets match).
+        """
+        checker = check_brackets.CheckBrackets(file=in_file)
+        result = checker.check_brackets()
+        return result[0]
+
+    def add_brackets(self):
+        """
+        Rewrite the file, adding brackets around accepted control words.
+
+        Each line is handed to the method registered for the current state,
+        and the result is written to a temporary file. If the brackets of
+        the result match, it replaces the original file (and is copied for
+        debugging when 'copy' is set). Otherwise the original file is kept
+        and, above run level 0, a warning is written to standard error.
+        The temporary file is removed in every case.
+        Raises:
+            the 'bug_handler' exception --if no method is registered for
+            the current state.
+        """
+        self.__initiate_values()
+        try:
+            with open_for_read(self.__file) as read_obj:
+                with open_for_write(self.__write_to) as self.__write_obj:
+                    for line in read_obj:
+                        self.__token_info = line[:16]
+                        # bracket lines end with a 4-character number
+                        if self.__token_info == 'ob<nu<open-brack':
+                            self.__ob_count = line[-5:-1]
+                        if self.__token_info == 'cb<nu<clos-brack':
+                            self.__cb_count = line[-5:-1]
+                        action = self.__state_dict.get(self.__state)
+                        if action is None:
+                            msg = ('No matching state in module add_brackets.py\n'
+                                '%s\n' % self.__state)
+                            sys.stderr.write(msg)
+                            # Report through the bug handler; calling the
+                            # missing action failed with a bare TypeError.
+                            raise self.__bug_handler(msg)
+                        action(line)
+            # Check bad brackets
+            if self.__check_brackets(self.__write_to):
+                copy_obj = copy.Copy(bug_handler=self.__bug_handler)
+                if self.__copy:
+                    copy_obj.copy_file(self.__write_to, "add_brackets.data")
+                copy_obj.rename(self.__write_to, self.__file)
+            else:
+                if self.__run_level > 0:
+                    sys.stderr.write(
+                        'Sorry, but this files has a mix of old and new RTF.\n'
+                        'Some characteristics cannot be converted.\n')
+        finally:
+            # Remove the temporary file on failure as well, so it is not
+            # left behind.
+            if os.path.exists(self.__write_to):
+                os.remove(self.__write_to)
--- a/ebook_converter/ebooks/rtf2xml/body_styles.py
+++ b/ebook_converter/ebooks/rtf2xml/body_styles.py
@@ -0,0 +1,84 @@
+from __future__ import unicode_literals, absolute_import, print_function, division
+#########################################################################
+#                                                                       #
+#                                                                       #
+#   copyright 2002 Paul Henry Tremblay                                  #
+#                                                                       #
+#   This program is distributed in the hope that it will be useful,     #
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of      #
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU    #
+#   General Public License for more details.                            #
+#                                                                       #
+#                                                                       #
+#########################################################################
+import os
+from calibre.ebooks.rtf2xml import copy
+from calibre.ptempfile import better_mktemp
+from . import open_for_read, open_for_write
+
+"""
+Simply write the list of strings after style table
+"""
+
+
+class BodyStyles:
+    """
+    Write the styles collected from the document body into the style table.
+    Logic:
+        Copy the token file line by line. When the line that closes the
+        style table is reached, first write the list of style strings,
+        wrapped in an open and a close 'styles-in-body' tag, and then the
+        closing line itself.
+    """
+
+    def __init__(self,
+            in_file,
+            list_of_styles,
+            bug_handler,
+            copy=None,
+            run_level=1,):
+        """
+        Required:
+            'in_file'--file to parse
+            'list_of_styles' -- the strings to write before the end of the
+            style table
+            'bug_handler' -- called with a message to build the exception
+            that is raised when something is wrong
+        Optional:
+            'copy'-- whether to make a copy of result for debugging
+            'run_level' -- above 3, an empty list of styles raises the bug
+            handler
+        Returns:
+            nothing
+            """
+        self.__file = in_file
+        self.__bug_handler = bug_handler
+        self.__copy = copy
+        self.__list_of_styles = list_of_styles
+        self.__run_level = run_level
+        self.__write_to = better_mktemp()
+        # self.__write_to = 'table_info.data'
+
+    def insert_info(self):
+        """
+        Requires:
+            nothing
+        Returns:
+            nothing
+        Logic:
+            Copy every line of the file to a temporary file, inserting the
+            styles just before the line that closes the style table. The
+            temporary file is then handed to copy.Copy.rename with the
+            original file as destination, and removed.
+            Both files are opened with 'with', so they are closed even if
+            the bug handler is raised in the middle of the copy.
+        """
+        with open_for_read(self.__file) as read_obj:
+            with open_for_write(self.__write_to) as self.__write_obj:
+                for line in read_obj:
+                    if line == 'mi<tg<close_____<style-table\n':
+                        if self.__list_of_styles:
+                            self.__write_obj.write(
+                                'mi<tg<open______<styles-in-body\n')
+                            self.__write_obj.write(
+                                ''.join(self.__list_of_styles))
+                            self.__write_obj.write(
+                                'mi<tg<close_____<styles-in-body\n')
+                        elif self.__run_level > 3:
+                            # this shouldn't happen!
+                            msg = 'Not enough data for each table\n'
+                            raise self.__bug_handler(msg)
+                    self.__write_obj.write(line)
+        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
+        if self.__copy:
+            copy_obj.copy_file(self.__write_to, "body_styles.data")
+        copy_obj.rename(self.__write_to, self.__file)
+        os.remove(self.__write_to)
--- a/ebook_converter/ebooks/rtf2xml/border_parse.py
+++ b/ebook_converter/ebooks/rtf2xml/border_parse.py
@@ -0,0 +1,191 @@
+from __future__ import unicode_literals, absolute_import, print_function, division
+#########################################################################
+#                                                                       #
+#                                                                       #
+#   copyright 2002 Paul Henry Tremblay                                  #
+#                                                                       #
+#   This program is distributed in the hope that it will be useful,     #
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of      #
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU    #
+#   General Public License for more details.                            #
+#                                                                       #
+#                                                                       #
+#########################################################################
+import sys
+
+
+class BorderParse:
+    """
+    Parse a border line and return a dictionary of attributes and values
+    """
+
+    # Order in which the styles of one border are tried. Each entry is
+    # (name as found in the border style dictionary, value written to the
+    # '<border>-style' attribute); the first name present wins.
+    # 'engrave' and 'tripple' are spelled the way the style dictionary
+    # spells them, otherwise these two entries could never match.
+    __style_priority = (
+        ('shadowed-border', 'shadowed'),
+        ('engrave', 'engrave'),
+        ('emboss', 'emboss'),
+        ('striped', 'striped'),
+        ('thin-thick-thin-small', 'thin-thick-thin-small'),
+        ('thick-thin-large', 'thick-thin-large'),
+        ('thin-thick-thin-medium', 'thin-thick-thin-medium'),
+        ('thin-thick-medium', 'thin-thick-medium'),
+        ('thick-thin-medium', 'thick-thin-medium'),
+        ('thick-thin-small', 'thick-thin-small'),
+        ('thin-thick-small', 'thin-thick-small'),
+        ('double-wavy', 'double-wavy'),
+        ('dot-dot-dash', 'dot-dot-dash'),
+        ('dot-dash', 'dot-dash'),
+        ('dotted-border', 'dotted'),
+        ('wavy', 'wavy'),
+        ('dash-small', 'dash-small'),
+        ('dashed', 'dashed'),
+        ('frame', 'frame'),
+        ('inset', 'inset'),
+        ('outset', 'outset'),
+        ('tripple', 'tripple'),
+        ('double-border', 'double'),
+        ('double-thickness-border', 'double-thickness'),
+        ('hairline', 'hairline'),
+        ('single', 'single'),
+    )
+
+    def __init__(self):
+        """
+        Set up the two lookup tables: one from border and attribute tokens
+        to attribute names, one from border type tokens to style names.
+        """
+        # cw<bd<bor-t-r-hi<nu<true
+        self.__border_dict = {
+            'bor-t-r-hi': 'border-table-row-horizontal-inside',
+            'bor-t-r-vi': 'border-table-row-vertical-inside',
+            'bor-t-r-to': 'border-table-row-top',
+            'bor-t-r-le': 'border-table-row-left',
+            'bor-t-r-bo': 'border-table-row-bottom',
+            'bor-t-r-ri': 'border-table-row-right',
+            'bor-cel-bo': 'border-cell-bottom',
+            'bor-cel-to': 'border-cell-top',
+            'bor-cel-le': 'border-cell-left',
+            'bor-cel-ri': 'border-cell-right',
+            'bor-par-bo': 'border-paragraph-bottom',
+            'bor-par-to': 'border-paragraph-top',
+            'bor-par-le': 'border-paragraph-left',
+            'bor-par-ri': 'border-paragraph-right',
+            'bor-par-bx': 'border-paragraph-box',
+            'bor-for-ev': 'border-for-every-paragraph',
+            'bor-outsid': 'border-outside',
+            'bor-none__': 'border',
+            # border type => bt
+            'bdr-li-wid': 'line-width',
+            'bdr-sp-wid': 'padding',
+            'bdr-color_': 'color',
+        }
+        self.__border_style_dict = {
+            'bdr-single': 'single',
+            'bdr-doubtb': 'double-thickness-border',
+            'bdr-shadow': 'shadowed-border',
+            'bdr-double': 'double-border',
+            'bdr-dotted': 'dotted-border',
+            'bdr-dashed': 'dashed',
+            'bdr-hair__': 'hairline',
+            'bdr-inset_': 'inset',
+            'bdr-das-sm': 'dash-small',
+            'bdr-dot-sm': 'dot-dash',
+            'bdr-dot-do': 'dot-dot-dash',
+            'bdr-outset': 'outset',
+            'bdr-trippl': 'tripple',
+            'bdr-thsm__': 'thick-thin-small',
+            'bdr-htsm__': 'thin-thick-small',
+            'bdr-hthsm_': 'thin-thick-thin-small',
+            'bdr-thm___': 'thick-thin-medium',
+            'bdr-htm___': 'thin-thick-medium',
+            'bdr-hthm__': 'thin-thick-thin-medium',
+            'bdr-thl___': 'thick-thin-large',
+            'bdr-hthl__': 'thin-thick-thin-large',
+            'bdr-wavy__': 'wavy',
+            'bdr-d-wav_': 'double-wavy',
+            'bdr-strip_': 'striped',
+            'bdr-embos_': 'emboss',
+            'bdr-engra_': 'engrave',
+            'bdr-frame_': 'frame',
+        }
+
+    def parse_border(self, line):
+        """
+        Requires:
+            line -- line with border definition in it, for example
+            cw<bd<bor-par-to<nu<bdr-hair__|bdr-li-wid:0.50|bdr-color_:2
+        Returns:
+            a dictionary of attributes and values. It is empty when the
+            border token is unknown, and maps the border name to 'none'
+            when the border has no attributes.
+        Logic:
+            Characters 6 to 16 name the border; everything from character
+            20 up to the newline is a list of attributes separated by '|'.
+            An attribute is either 'name:value' or a bare name, which gets
+            the value 'true'. Names found in the style dictionary are
+            collected and reduced to a single '<border>-style' entry; the
+            other known names become '<border>-<attribute>' entries.
+            Unknown names are reported on stderr and left out.
+        """
+        border_dict = {}
+        border_type = self.__border_dict.get(line[6:16])
+        if not border_type:
+            sys.stderr.write(
+            'module is border_parse.py\n'
+            'function is parse_border\n'
+            'token does not have a dictionary value\n'
+            'token is "%s"' % line
+            )
+            return border_dict
+        atts = line[20:-1].split('|')
+        # cw<bd<bor-cel-ri<nu<
+        # border has no value--should be no lines
+        if atts == ['']:
+            border_dict[border_type] = 'none'
+            return border_dict
+        border_style_list = []
+        for att in atts:
+            values = att.split(':')
+            if len(values) == 2:
+                att, value = values
+            else:
+                value = 'true'
+            style_att = self.__border_style_dict.get(att)
+            if style_att:
+                border_style_list.append(style_att)
+                continue
+            att_name = self.__border_dict.get(att)
+            if not att_name:
+                sys.stderr.write(
+                'module is border_parse_def.py\n'
+                'function is parse_border\n'
+                'token does not have an att value\n'
+                'line is "%s"' % line
+                )
+                # do not store the value under a '<border>-None' key
+                continue
+            border_dict['%s-%s' % (border_type, att_name)] = value
+        border_dict.update(
+            self.__determine_styles(border_type, border_style_list))
+        return border_dict
+
+    def __determine_styles(self, border_type, border_style_list):
+        """
+        Requires:
+            border_type -- the name of the border
+            border_style_list -- the style names found for this border
+        Returns:
+            a dictionary with at most one entry, '<border_type>-style'
+        Logic:
+            Take the first style of the priority table that is in the
+            list. If none is, fall back on the first style of the list.
+            An empty list gives an empty dictionary.
+        """
+        new_border_dict = {}
+        att = '%s-style' % border_type
+        for style_name, style_value in self.__style_priority:
+            if style_name in border_style_list:
+                new_border_dict[att] = style_value
+                return new_border_dict
+        if border_style_list:
+            new_border_dict[att] = border_style_list[0]
+        return new_border_dict
--- a/ebook_converter/ebooks/rtf2xml/char_set.py
+++ b/ebook_converter/ebooks/rtf2xml/char_set.py
--- a/ebook_converter/ebooks/rtf2xml/check_brackets.py
+++ b/ebook_converter/ebooks/rtf2xml/check_brackets.py
@@ -0,0 +1,62 @@
+from __future__ import unicode_literals, absolute_import, print_function, division
+#########################################################################
+#                                                                       #
+#                                                                       #
+#   copyright 2002 Paul Henry Tremblay                                  #
+#                                                                       #
+#   This program is distributed in the hope that it will be useful,     #
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of      #
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU    #
+#   General Public License for more details.                            #
+#                                                                       #
+#                                                                       #
+#########################################################################
+
+
+from . import open_for_read
+
+
+class CheckBrackets:
+    """Check that brackets match up"""
+
+    def __init__(self, bug_handler=None, file=None):
+        """
+        Optional:
+            'bug_handler' -- stored, not used by this class
+            'file' -- the token file read by check_brackets
+        """
+        self.__file=file
+        self.__bug_handler = bug_handler
+        self.__bracket_count=0
+        self.__ob_count = 0
+        self.__cb_count = 0
+        self.__open_bracket_num = []
+
+    def open_brack(self, line):
+        """
+        Remember the number of an open bracket. The number is the four
+        characters in front of the trailing newline.
+        """
+        num = line[-5:-1]
+        self.__open_bracket_num.append(num)
+        self.__bracket_count += 1
+
+    def close_brack(self, line):
+        """
+        Return True if the number of this close bracket is the number of
+        the most recent open bracket that is still open. Return False if
+        the numbers differ or if no bracket is open.
+        """
+        num = line[-5:-1]
+        try:
+            last_num = self.__open_bracket_num.pop()
+        except IndexError:
+            # a close bracket without any open bracket before it
+            return False
+        if num != last_num:
+            return False
+        self.__bracket_count -= 1
+        return True
+
+    def check_brackets(self):
+        """
+        Requires:
+            nothing
+        Returns:
+            a tuple (True or False, message). False is returned at the
+            first close bracket that does not match, or at the end of the
+            file if brackets are still open.
+        Logic:
+            Read the file line by line and keep a stack of the numbers of
+            the open brackets. The stack and the count are emptied first,
+            so that checking twice with the same object gives the same
+            answer.
+        """
+        self.__bracket_count = 0
+        self.__open_bracket_num = []
+        line_count = 0
+        with open_for_read(self.__file) as read_obj:
+            for line in read_obj:
+                line_count += 1
+                token_info = line[:16]
+                if token_info == 'ob<nu<open-brack':
+                    self.open_brack(line)
+                if token_info == 'cb<nu<clos-brack':
+                    if not self.close_brack(line):
+                        return (False, "closed bracket doesn't match, line %s" % line_count)
+
+        if self.__bracket_count != 0:
+            msg = ('At end of file open and closed brackets don\'t match\n'
+                        'total number of brackets is %s') % self.__bracket_count
+            return (False, msg)
+        return (True, "Brackets match!")
--- a/ebook_converter/ebooks/rtf2xml/check_encoding.py
+++ b/ebook_converter/ebooks/rtf2xml/check_encoding.py
@@ -0,0 +1,42 @@
+#!/usr/bin/env python2
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import sys
+
+from polyglot.builtins import unicode_type
+
+
+class CheckEncoding:
+    """
+    Find out whether a file can be decoded with a given encoding.
+    """
+
+    def __init__(self, bug_handler):
+        """
+        Required:
+            'bug_handler' -- stored, not used by this class
+        """
+        self.__bug_handler = bug_handler
+
+    def __get_position_error(self, line, encoding, line_num):
+        """
+        Requires:
+            line -- the undecoded line (bytes)
+            encoding -- the encoding the line was tested with
+            line_num -- the number of the line, used in the message
+        Returns:
+            nothing
+        Logic:
+            Decode the line one byte at a time and report each byte that
+            fails on stderr. Positions start at 1.
+            The byte is taken with a slice, because indexing bytes gives
+            an integer on Python 3, and an integer cannot be decoded.
+        """
+        for index in range(len(line)):
+            char = line[index:index + 1]
+            try:
+                char.decode(encoding)
+            except ValueError as msg:
+                sys.stderr.write(
+                    'line: %s char: %s\n%s\n' % (line_num, index + 1, msg))
+
+    def check_encoding(self, path, encoding='us-ascii', verbose=True):
+        """
+        Requires:
+            path -- the file to check
+        Optional:
+            encoding -- the encoding to test
+            verbose -- whether to describe the bad line on stderr
+        Returns:
+            True if a line cannot be decoded, False if every line can.
+            Only the first bad line is looked at.
+        """
+        line_num = 0
+        with open(path, 'rb') as read_obj:
+            for line in read_obj:
+                line_num += 1
+                try:
+                    line.decode(encoding)
+                except ValueError:
+                    if verbose:
+                        if len(line) < 1000:
+                            self.__get_position_error(line, encoding, line_num)
+                        else:
+                            sys.stderr.write('line: %d has bad encoding\n' % line_num)
+                    return True
+        return False
+
+
+if __name__ == '__main__':
+    # Check the file named on the command line with the default encoding.
+    # CheckEncoding has a required bug_handler argument; it is only
+    # stored, so None is enough here.
+    check_encoding_obj = CheckEncoding(bug_handler=None)
+    check_encoding_obj.check_encoding(sys.argv[1])
--- a/ebook_converter/ebooks/rtf2xml/colors.py
+++ b/ebook_converter/ebooks/rtf2xml/colors.py
@@ -0,0 +1,258 @@
+from __future__ import unicode_literals, absolute_import, print_function, division
+#########################################################################
+#                                                                       #
+#                                                                       #
+#   copyright 2002 Paul Henry Tremblay                                  #
+#                                                                       #
+#   This program is distributed in the hope that it will be useful,     #
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of      #
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU    #
+#   General Public License for more details.                            #
+#                                                                       #
+#                                                                       #
+#########################################################################
+import sys, os, re
+
+from calibre.ebooks.rtf2xml import copy
+from calibre.ptempfile import better_mktemp
+from . import open_for_read, open_for_write
+
+
+class Colors:
+    """
+    Change lines with color info from color numbers to the actual color names.
+    """
+
+    def __init__(self,
+            in_file,
+            bug_handler,
+            copy=None,
+            run_level=1
+            ):
+        """
+        Required:
+            'in_file'--file to parse
+            'bug_handler'--called with a message to build the exception
+            that is raised when something is wrong
+        Optional:
+            'copy'-- whether to make a copy of result for debugging
+            'run_level'-- above 3, a color that cannot be looked up raises
+            the bug handler
+        Returns:
+            nothing
+            """
+        self.__file = in_file
+        self.__copy = copy
+        self.__bug_handler = bug_handler
+        self.__line = 0
+        self.__write_to = better_mktemp()
+        self.__run_level = run_level
+
+    def __initiate_values(self):
+        """
+        Initiate all values.
+        The state dictionary holds both the functions for the three states
+        and the functions for the tokens found inside the color table.
+        """
+        self.__color_dict = {}
+        self.__state = 'before_color_table'
+        self.__state_dict = {
+        'before_color_table': self.__before_color_func,
+        'in_color_table'    : self.__in_color_func,
+        'after_color_table'  : self.__after_color_func,
+        'cw<ci<red_______'  : self.__default_color_func,
+        'cw<ci<green_____'  : self.__default_color_func,
+        'cw<ci<blue______'  : self.__blue_func,
+        'tx<nu<__________'  : self.__do_nothing_func,
+        }
+        self.__color_string = '#'
+        self.__color_num = 1
+        self.__line_color_exp = re.compile(r'bdr-color_:(\d+)')
+        # cw<bd<bor-par-to<nu<bdr-hair__|bdr-li-wid:0.50|bdr-sp-wid:1.00|bdr-color_:2
+
+    def __before_color_func(self, line):
+        """
+        Requires:
+            line
+        Returns:
+            nothing
+        Logic:
+            Check to see if the line marks the beginning of the color table.
+            If so, change states.
+            Always print out the line.
+        """
+        # mi<mk<clrtbl-beg
+        if self.__token_info == 'mi<mk<clrtbl-beg':
+            self.__state = 'in_color_table'
+        self.__write_obj.write(line)
+
+    def __default_color_func(self, line):
+        """
+        Requires:
+            line
+        Returns:
+            nothing
+        Logic:
+            get the hex number from the line and add it to the color string.
+            """
+        hex_num = line[-3:-1]
+        self.__color_string += hex_num
+
+    def __blue_func(self, line):
+        """
+        Requires:
+            line
+        Returns:
+            nothing
+        Logic:
+            Get the hex number from the line and add it to the color string.
+            Add a key -> value pair to the color dictionary, with the number
+            as the key, and the hex number as the value. Write an empty tag
+            with the hex number and number as attributes. Add one to the color
+            number. Reset the color string to '#'
+            """
+        hex_num = line[-3:-1]
+        self.__color_string +=  hex_num
+        self.__color_dict[self.__color_num] = self.__color_string
+        self.__write_obj.write(
+        'mi<tg<empty-att_'
+        '<color-in-table<num>%s<value>%s\n' % (self.__color_num, self.__color_string)
+        )
+        self.__color_num += 1
+        self.__color_string = '#'
+
+    def __in_color_func(self, line):
+        """
+        Requires:
+            line
+        Returns:
+            nothing
+        Logic:
+            Check if the end of the color table has been reached. If so,
+            change the state to after the color table.
+            Otherwise, get a function by passing the self.__token_info to the
+            state dictionary.
+            A token without a function is reported on stderr and ignored;
+            there is nothing to call for it.
+            """
+        # mi<mk<clrtbl-beg
+        # cw<ci<red_______<nu<00
+        if self.__token_info == 'mi<mk<clrtbl-end':
+            self.__state = 'after_color_table'
+            return
+        action = self.__state_dict.get(self.__token_info)
+        if action is None:
+            sys.stderr.write('in module colors.py\n'
+            'function is self.__in_color_func\n'
+            'no action for %s' % self.__token_info
+            )
+            return
+        action(line)
+
+    def __after_color_func(self, line):
+        """
+        Check the to see if it contains color info. If it does, extract the
+        number and look up the hex value in the color dictionary.
+        A font color line is written with the value found by
+        self.__figure_num. In a border line, every 'bdr-color_:number' is
+        replaced. Any other line is printed out as it is.
+        Added Oct 10, 2003
+        If the number is 0, that indicates no color
+        """
+        # cw<ci<font-color<nu<2
+        if self.__token_info == 'cw<ci<font-color':
+            hex_num = int(line[20:-1])
+            hex_num = self.__figure_num(hex_num)
+            if hex_num:
+                self.__write_obj.write(
+                'cw<ci<font-color<nu<%s\n' % hex_num
+                )
+        elif line[0:5] == 'cw<bd':
+            # cw<bd<bor-par-to<nu<bdr-hair__|bdr-li-wid:0.50|bdr-sp-wid:1.00|bdr-color_:2
+            the_index = line.find('bdr-color_')
+            if the_index > -1:
+                line = re.sub(self.__line_color_exp, self.__sub_from_line_color, line)
+            self.__write_obj.write(line)
+        else:
+            self.__write_obj.write(line)
+
+    def __sub_from_line_color(self, match_obj):
+        """
+        Requires:
+            match_obj -- a match of self.__line_color_exp
+        Returns:
+            the text to put in place of the match: 'bdr-color_:' followed
+            by the value found by self.__figure_num
+        """
+        num = match_obj.group(1)
+        try:
+            num = int(num)
+        except ValueError:
+            if self.__run_level > 3:
+                msg = 'can\'t make integer from string\n'
+                raise self.__bug_handler(msg)
+            else:
+                return 'bdr-color_:no-value'
+        hex_num = self.__figure_num(num)
+        return 'bdr-color_:%s' % hex_num
+
+    def __figure_num(self, num):
+        """
+        Requires:
+            num -- the number of a color (an integer)
+        Returns:
+            'false' if the number is 0, otherwise the string stored in the
+            color dictionary for the number. If there is none, '0' is
+            returned, or the bug handler is raised if the run level is
+            above 3.
+        """
+        if num == 0:
+            hex_num = 'false'
+        else:
+            hex_num = self.__color_dict.get(num)
+        if hex_num is None:
+            hex_num = '0'
+            if self.__run_level > 3:
+                msg = 'no value in self.__color_dict ' \
+                'for key %s at line %d\n' % (num, self.__line)
+                raise self.__bug_handler(msg)
+        return hex_num
+
+    def __do_nothing_func(self, line):
+        """
+        Bad RTF will have text in the color table
+        """
+        pass
+
+    def convert_colors(self):
+        """
+        Requires:
+            nothing
+        Returns:
+            nothing (changes the original file)
+        Logic:
+            Read one line in at a time. Determine what action to take based on
+            the state. If the state is before the color table, look for the
+            beginning of the color table.
+            If the state is in the color table, create the color dictionary
+            and print out the tags.
+            If the state is after the color table, look for lines with color
+            info, and substitute the number with the hex number.
+            A state without a function is reported on stderr and the line
+            is skipped; there is nothing to call for it.
+        """
+        self.__initiate_values()
+        with open_for_read(self.__file) as read_obj:
+            with open_for_write(self.__write_to) as self.__write_obj:
+                for line in read_obj:
+                    self.__line+=1
+                    self.__token_info = line[:16]
+                    action = self.__state_dict.get(self.__state)
+                    if action is None:
+                        try:
+                            sys.stderr.write('no matching state in module colors.py\n')
+                            sys.stderr.write(self.__state + '\n')
+                        except Exception:
+                            # the message is only a courtesy
+                            pass
+                        continue
+                    action(line)
+        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
+        if self.__copy:
+            copy_obj.copy_file(self.__write_to, "color.data")
+        copy_obj.rename(self.__write_to, self.__file)
+        os.remove(self.__write_to)
--- a/ebook_converter/ebooks/rtf2xml/combine_borders.py
+++ b/ebook_converter/ebooks/rtf2xml/combine_borders.py
@@ -0,0 +1,93 @@
+from __future__ import unicode_literals, absolute_import, print_function, division
+#########################################################################
+#                                                                       #
+#                                                                       #
+#   copyright 2002 Paul Henry Tremblay                                  #
+#                                                                       #
+#   This program is distributed in the hope that it will be useful,     #
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of      #
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU    #
+#   General Public License for more details.                            #
+#                                                                       #
+#                                                                       #
+#########################################################################
+import os
+
+from calibre.ebooks.rtf2xml import copy
+from calibre.ptempfile import better_mktemp
+from . import open_for_read, open_for_write
+
+
+class CombineBorders:
+    """Combine borders in RTF tokens to make later processing easier"""
+
+    def __init__(self,
+            in_file ,
+            bug_handler,
+            copy=None,
+            run_level=1,
+            ):
+        """
+        Required:
+            'in_file'--file to parse
+            'bug_handler'--passed on to copy.Copy
+        Optional:
+            'copy'-- whether to make a copy of result for debugging
+            'run_level'-- accepted, not used by this class
+        """
+        self.__file = in_file
+        self.__bug_handler = bug_handler
+        self.__copy = copy
+        self.__write_to = better_mktemp()
+        self.__state = 'default'
+        self.__bord_pos = 'default'
+        self.__bord_att = []
+
+    def found_bd(self, line):
+        """
+        Start collecting a border: remember its name (characters 6 to 16
+        of the line) and change the state to 'border'.
+        """
+        # cw<bd<bor-t-r-vi
+        self.__state = 'border'
+        self.__bord_pos = line[6:16]
+
+    def __default_func(self, line):
+        """
+        Return the text to write for a line read outside of a border: an
+        empty string for a line that starts a border, the line otherwise.
+        """
+        # cw<bd<bor-t-r-vi
+        if self.__first_five == 'cw<bd':
+            self.found_bd(line)
+            return ''
+        return line
+
+    def __write_border(self, write_obj):
+        """
+        Write the collected border as one line, its attributes joined
+        with '|', then forget the attributes and go back to the default
+        state.
+        """
+        border_string = "|".join(self.__bord_att)
+        self.__bord_att = []
+        write_obj.write('cw<bd<%s<nu<%s\n' % (self.__bord_pos,
+                                                border_string))
+        self.__state = 'default'
+
+    def end_border(self, line, write_obj):
+        """
+        Requires:
+            line -- the first line that is not part of the border
+            write_obj -- the object to write to
+        Logic:
+            Write the collected border. If the line starts another
+            border, start collecting it; otherwise write the line.
+        """
+        self.__write_border(write_obj)
+        if self.__first_five == 'cw<bd':
+            self. found_bd(line)
+        else:
+            write_obj.write(line)
+
+    def add_to_border_desc(self, line):
+        """
+        Add the attribute of a border type line to the collected border.
+        A value of 'true' is left out; any other value is added after the
+        name, separated by ':'.
+        """
+        # cw<bt<bdr-hair__<nu<true
+        # cw<bt<bdr-linew<nu<0.50
+        # tx<__________<some text
+        border_desc = line[6:16]
+        num = line[20:-1]
+        if num == 'true':
+            num = ''
+        else:
+            num = ':' + num
+        self.__bord_att.append(border_desc + num)
+
+    def __border_func(self, line, write_obj):
+        """
+        Handle a line read while a border is collected: a border type
+        line adds to the border, any other line ends it.
+        """
+        if self.__first_five != 'cw<bt':
+            self.end_border(line, write_obj)
+        else:
+            self.add_to_border_desc(line)
+
+    def __combine_lines(self, lines, write_obj):
+        """
+        Requires:
+            lines -- the lines to read
+            write_obj -- the object to write to
+        Logic:
+            Pass each line to the function for the current state. If a
+            border is still being collected after the last line, write
+            it, so that it is not lost.
+        """
+        for line in lines:
+            self.__first_five = line[0:5]
+            if self.__state == 'border':
+                self.__border_func(line, write_obj)
+            else:
+                write_obj.write(self.__default_func(line))
+        if self.__state == 'border':
+            self.__write_border(write_obj)
+
+    def combine_borders(self):
+        """
+        Requires:
+            nothing
+        Returns:
+            nothing
+        Logic:
+            Write the file, with each border and the border type lines
+            that follow it combined into one line, to a temporary file.
+            The temporary file is then handed to copy.Copy.rename with
+            the original file as destination, and removed.
+        """
+        with open_for_read(self.__file) as read_obj:
+            with open_for_write(self.__write_to) as write_obj:
+                self.__combine_lines(read_obj, write_obj)
+        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
+        if self.__copy:
+            copy_obj.copy_file(self.__write_to, "combine_borders.data")
+        copy_obj.rename(self.__write_to, self.__file)
+        os.remove(self.__write_to)
--- a/ebook_converter/ebooks/rtf2xml/convert_to_tags.py
+++ b/ebook_converter/ebooks/rtf2xml/convert_to_tags.py
@@ -0,0 +1,284 @@
+from __future__ import unicode_literals, absolute_import, print_function, division
+import os, sys
+
+from calibre.ebooks.rtf2xml import copy, check_encoding
+from calibre.ptempfile import better_mktemp
+from . import open_for_read, open_for_write
+
+public_dtd = 'rtf2xml1.0.dtd'
+
+
+class ConvertToTags:
+    """
+    Convert the token file to XML.
+
+    Lines whose first 16 characters name a tag or text token are written out
+    as XML markup; every other line is ignored.
+    """
+
+    def __init__(self,
+            in_file,
+            bug_handler,
+            dtd_path,
+            no_dtd,
+            encoding,
+            indent=None,
+            copy=None,
+            run_level=1,
+            ):
+        """
+        Required:
+            'in_file' -- path of the token file; it is overwritten with the
+            XML result.
+            'bug_handler' -- exception class raised for internal errors.
+            'dtd_path' -- path used in a SYSTEM DOCTYPE line; '' writes no
+            DOCTYPE line, None selects the public rtf2xml DTD.
+            'no_dtd' -- if true, no DOCTYPE line is written at all.
+            'encoding' -- string appended to 'cp' to form the codec name
+            used when the encoding of the file is checked.
+        Optional:
+            'indent' -- if true, write line breaks around block elements.
+            'copy' -- whether to make a copy of result for debugging
+            'run_level' -- above 3, a malformed attribute token raises
+            bug_handler instead of being skipped.
+        Returns:
+            nothing
+        """
+        self.__file = in_file
+        self.__bug_handler = bug_handler
+        self.__copy = copy
+        self.__dtd_path = dtd_path
+        self.__no_dtd = no_dtd
+        self.__encoding = 'cp' + encoding
+        # if encoding == 'mac_roman':
+        # self.__encoding = 'mac_roman'
+        self.__indent = indent
+        self.__run_level = run_level
+        self.__write_to = better_mktemp()
+        self.__convert_utf = False
+        self.__bad_encoding = False
+
+    def __initiate_values(self):
+        """
+        Set values, including those for the dictionary.
+
+        self.__block lists the elements that get a line break,
+        self.__two_new_line those that get an additional one, and
+        self.__state_dict maps the 16-character token type to its handler.
+        """
+        self.__state = 'default'
+        self.__new_line = 0
+        self.__block = ('doc', 'preamble', 'rtf-definition', 'font-table',
+                'font-in-table', 'color-table', 'color-in-table', 'style-sheet',
+                'paragraph-styles', 'paragraph-style-in-table', 'character-styles',
+                'character-style-in-table', 'list-table', 'doc-information', 'title',
+                'author', 'operator', 'creation-time', 'revision-time',
+                'editing-time', 'time', 'number-of-pages', 'number-of-words',
+                'number-of-characters', 'page-definition', 'section-definition',
+                'headers-and-footers', 'section', 'para', 'body',
+                'paragraph-definition', 'cell', 'row', 'table', 'revision-table',
+                'style-group', 'border-group','styles-in-body', 'paragraph-style-in-body',
+                'list-in-table', 'level-in-table', 'override-table','override-list',
+                )
+        # 'row' and 'list-table' are two entries; without the comma between
+        # them the literals merge into the single string 'rowlist-table'.
+        self.__two_new_line = ('section', 'body', 'table', 'row', 'list-table')
+        self.__state_dict = {
+        'default'           :   self.__default_func,
+        'mi<tg<open______'  :   self.__open_func,
+        'mi<tg<close_____'  :   self.__close_func,
+        'mi<tg<open-att__'  :   self.__open_att_func,
+        'mi<tg<empty-att_'  :   self.__empty_att_func,
+        'tx<nu<__________'  :   self.__text_func,
+        'tx<ut<__________'  :   self.__text_func,
+        'mi<tg<empty_____'  :   self.__empty_func,
+        }
+
+    def __write_line_breaks(self, name):
+        """
+        Reset the line-break counter and write the breaks that belong to the
+        element 'name': one if it is a block element, one more if it is in
+        the two-line group.
+        """
+        self.__new_line = 0
+        if name in self.__block:
+            self.__write_new_line()
+        if name in self.__two_new_line:
+            self.__write_extra_new_line()
+
+    def __write_attributes(self, tokens):
+        """
+        Write each 'name>value' token as an XML attribute.
+
+        A token without a '>' separator is skipped, or raises bug_handler
+        when the run level is above 3.
+        """
+        for token in tokens:
+            groups = token.split('>')
+            try:
+                name = groups[0]
+                value = groups[1]
+            except IndexError:
+                if self.__run_level > 3:
+                    msg = 'index out of range\n'
+                    raise self.__bug_handler(msg)
+                continue
+            value = value.replace('"', '&quot;')
+            # NOTE(review): an apostrophe is also turned into &quot;, which
+            # changes the character; kept because consumers may rely on it.
+            value = value.replace("'", '&quot;')
+            self.__write_obj.write(' %s="%s"' % (name, value))
+
+    def __open_func(self, line):
+        """
+        Print the opening tag and newlines when needed.
+        The line breaks are written before the tag.
+        """
+        # mi<tg<open______<style-sheet
+        info = line[17:-1]
+        self.__write_line_breaks(info)
+        self.__write_obj.write('<%s>' % info)
+
+    def __empty_func(self, line):
+        """
+        Print out empty tag and newlines when needed.
+        """
+        info = line[17:-1]
+        self.__write_obj.write('<%s/>' % info)
+        self.__write_line_breaks(info)
+
+    def __open_att_func(self, line):
+        """
+        Process lines for open tags that have attributes.
+        The important info is between [17:-1]. Take this info and split it
+        with the delimiter '<'. The first token in this group is the element
+        name. The rest are attributes, separated from their values by '>'. So
+        read each token one at a time, and split them by '>'.
+        """
+        # mi<tg<open-att__<footnote<num>
+        info = line[17:-1]
+        tokens = info.split("<")
+        element_name = tokens[0]
+        self.__write_obj.write('<%s' % element_name)
+        self.__write_attributes(tokens[1:])
+        self.__write_obj.write('>')
+        self.__write_line_breaks(element_name)
+
+    def __empty_att_func(self, line):
+        """
+        Same as the __open_att_func, except a '/' is placed at the end of the tag.
+        Malformed attribute tokens are handled the same way as well.
+        """
+        # the line starts with the token type mi<tg<empty-att_
+        info = line[17:-1]
+        tokens = info.split("<")
+        element_name = tokens[0]
+        self.__write_obj.write('<%s' % element_name)
+        self.__write_attributes(tokens[1:])
+        self.__write_obj.write('/>')
+        self.__write_line_breaks(element_name)
+
+    def __close_func(self, line):
+        """
+        Print out the closed tag and new lines, if appropriate.
+        """
+        # mi<tg<close_____<style-sheet\n
+        info = line[17:-1]
+        self.__write_obj.write('</%s>' % info)
+        self.__write_line_breaks(info)
+
+    def __text_func(self, line):
+        """
+        Simply print out the information between [17:-1]
+        """
+        # tx<nu<__________<Normal;
+        # change this!
+        self.__write_obj.write(line[17:-1])
+
+    def __write_extra_new_line(self):
+        """
+        Write one more line break when indenting is on and fewer than two
+        line breaks have been counted. The counter is not advanced here.
+        """
+        if not self.__indent:
+            return
+        if self.__new_line < 2:
+            self.__write_obj.write('\n')
+
+    def __default_func(self, line):
+        """
+        Handler registered under 'default'; writes nothing.
+        """
+        pass
+
+    def __write_new_line(self):
+        """
+        Print out a new line if a new line has not already been printed out.
+        Does nothing when indenting is off.
+        """
+        if not self.__indent:
+            return
+        if not self.__new_line:
+            self.__write_obj.write('\n')
+            self.__new_line += 1
+
+    def __write_dec(self):
+        """
+        Write the XML declaration at the top of the document, followed by
+        the DOCTYPE line selected by the no_dtd and dtd_path settings.
+        """
+        # keep maximum compatibility with previous version
+        check_encoding_obj = check_encoding.CheckEncoding(
+                    bug_handler=self.__bug_handler)
+
+        # NOTE(review): check_encoding() appears to return a false value when
+        # the file decodes cleanly -- confirm against check_encoding.py
+        if not check_encoding_obj.check_encoding(self.__file, verbose=False):
+            self.__write_obj.write('<?xml version="1.0" encoding="US-ASCII" ?>')
+        elif not check_encoding_obj.check_encoding(self.__file, self.__encoding, verbose=False):
+            self.__write_obj.write('<?xml version="1.0" encoding="UTF-8" ?>')
+            self.__convert_utf = True
+        else:
+            self.__write_obj.write('<?xml version="1.0" encoding="US-ASCII" ?>')
+            sys.stderr.write('Bad RTF encoding, revert to US-ASCII chars and'
+                    ' hope for the best')
+            self.__bad_encoding = True
+        self.__new_line = 0
+        self.__write_new_line()
+        if self.__no_dtd:
+            pass
+        elif self.__dtd_path:
+            self.__write_obj.write(
+            '<!DOCTYPE doc SYSTEM "%s">' % self.__dtd_path
+            )
+        elif self.__dtd_path == '':
+            # don't print dtd if further transformations are going to take
+            # place
+            pass
+        else:
+            self.__write_obj.write(
+                    '<!DOCTYPE doc PUBLIC "publicID" '
+                    '"http://rtf2xml.sourceforge.net/dtd/%s">' % public_dtd
+            )
+        self.__new_line = 0
+        self.__write_new_line()
+
+    def convert_to_tags(self):
+        """
+        Read in the file one line at a time. Get the important info, between
+        [:16]. Check if this info matches a dictionary entry. If it does, call
+        the appropriate function.
+        The functions that are called:
+            a text function for text
+            an open function for open tags
+            an open with attribute function for tags with attributes
+            an empty with attribute function for tags that are empty but have
+            attributes.
+            a closed function for closed tags.
+            an empty tag function.
+        The result replaces the input file.
+        """
+        self.__initiate_values()
+        with open_for_write(self.__write_to) as self.__write_obj:
+            self.__write_dec()
+            with open_for_read(self.__file) as read_obj:
+                for line in read_obj:
+                    self.__token_info = line[:16]
+                    action = self.__state_dict.get(self.__token_info)
+                    if action is not None:
+                        action(line)
+        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
+        # convert all encodings to UTF8 or ASCII to avoid unsupported encodings in lxml
+        if self.__convert_utf or self.__bad_encoding:
+            # NOTE(review): this round trip rewrites the lines unchanged;
+            # any conversion would have to come from open_for_read /
+            # open_for_write -- confirm it is still needed.
+            copy_obj.rename(self.__write_to, self.__file)
+            with open_for_read(self.__file) as read_obj:
+                with open_for_write(self.__write_to) as write_obj:
+                    for line in read_obj:
+                        write_obj.write(line)
+        if self.__copy:
+            copy_obj.copy_file(self.__write_to, "convert_to_tags.data")
+        # rename() copies the temporary file over the original
+        copy_obj.rename(self.__write_to, self.__file)
+        os.remove(self.__write_to)
--- a/ebook_converter/ebooks/rtf2xml/copy.py
+++ b/ebook_converter/ebooks/rtf2xml/copy.py
@@ -0,0 +1,63 @@
+from __future__ import unicode_literals, absolute_import, print_function, division
+#########################################################################
+#                                                                       #
+#                                                                       #
+#   copyright 2002 Paul Henry Tremblay                                  #
+#                                                                       #
+#   This program is distributed in the hope that it will be useful,     #
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of      #
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU    #
+#   General Public License for more details.                            #
+#                                                                       #
+#                                                                       #
+#########################################################################
+import os, shutil
+
+
+class Copy:
+    """Copy each changed file to a directory for debugging purposes"""
+    # Debug directory shared by all instances; set through set_dir().
+    __dir = ""
+
+    def __init__(self, bug_handler, file=None, deb_dir=None, ):
+        """
+        'bug_handler' is the exception class raised by set_dir().
+        'file' is stored on the instance. 'deb_dir' is accepted but not
+        used here; the directory is set with set_dir().
+        """
+        self.__file = file
+        self.__bug_handler = bug_handler
+
+    def set_dir(self, deb_dir):
+        """
+        Set the temporary directory to write files to.
+        Raises bug_handler if deb_dir is None or is not a directory.
+        """
+        if deb_dir is None:
+            message = "No directory has been provided to write to in the copy.py"
+            raise self.__bug_handler(message)
+        if not os.path.isdir(deb_dir):
+            message = "%s is not a directory" % (deb_dir,)
+            raise self.__bug_handler(message)
+        Copy.__dir = deb_dir
+
+    def remove_files(self):
+        """Remove files from directory"""
+        self.__remove_the_files(Copy.__dir)
+
+    def __remove_the_files(self, the_dir):
+        """
+        Remove the files in the_dir and, recursively, in its
+        sub-directories. The directories themselves are kept, and a file
+        that cannot be removed is skipped.
+        """
+        for name in os.listdir(the_dir):
+            # join to the directory being listed, not to the top directory,
+            # so that entries of sub-directories resolve to real paths
+            rem_file = os.path.join(the_dir, name)
+            if os.path.isdir(rem_file):
+                self.__remove_the_files(rem_file)
+            else:
+                try:
+                    os.remove(rem_file)
+                except OSError:
+                    pass
+
+    def copy_file(self, file, new_file):
+        """
+        Copy the file into the debug directory under the name new_file.
+        """
+        write_file = os.path.join(Copy.__dir, new_file)
+        shutil.copyfile(file, write_file)
+
+    def rename(self, source, dest):
+        """
+        Copy source over dest. The source file is left in place; callers
+        remove it themselves.
+        """
+        shutil.copyfile(source, dest)
--- a/ebook_converter/ebooks/rtf2xml/default_encoding.py
+++ b/ebook_converter/ebooks/rtf2xml/default_encoding.py
@@ -0,0 +1,188 @@
+from __future__ import unicode_literals, absolute_import, print_function, division
+#########################################################################
+#                                                                       #
+#   copyright 2002 Paul Henry Tremblay                                  #
+#                                                                       #
+#########################################################################
+
+'''
+Codepages as to RTF 1.9.1:
+    437	United States IBM
+    708	Arabic (ASMO 708)
+    709	Arabic (ASMO 449+, BCON V4)
+    710	Arabic (transparent Arabic)
+    711	Arabic (Nafitha Enhanced)
+    720	Arabic (transparent ASMO)
+    819	Windows 3.1 (United States and Western Europe)
+    850	IBM multilingual
+    852	Eastern European
+    860	Portuguese
+    862	Hebrew
+    863	French Canadian
+    864	Arabic
+    865	Norwegian
+    866	Soviet Union
+    874	Thai
+    932	Japanese
+    936	Simplified Chinese
+    949	Korean
+    950	Traditional Chinese
+    1250	Eastern European
+    1251	Cyrillic
+    1252	Western European
+    1253	Greek
+    1254	Turkish
+    1255	Hebrew
+    1256	Arabic
+    1257	Baltic
+    1258	Vietnamese
+    1361	Johab
+    10000	MAC Roman
+    10001	MAC Japan
+    10004	MAC Arabic
+    10005	MAC Hebrew
+    10006	MAC Greek
+    10007	MAC Cyrillic
+    10029	MAC Latin2
+    10081	MAC Turkish
+    57002	Devanagari
+    57003	Bengali
+    57004	Tamil
+    57005	Telugu
+    57006	Assamese
+    57007	Oriya
+    57008	Kannada
+    57009	Malayalam
+    57010	Gujarati
+    57011	Punjabi
+'''
+import re
+from . import open_for_read
+
+
+class DefaultEncoding:
+    """
+    Find the default encoding for the doc
+
+    The file is scanned once, on the first call of any of the public
+    methods; later calls return the stored values.
+    """
+
+    # Maps the encoding name passed by the caller to a code page number.
+    # Note: not all those encoding are really supported by rtf2xml
+    # See http://msdn.microsoft.com/en-us/library/windows/desktop/dd317756%28v=vs.85%29.aspx
+    # and src\calibre\gui2\widgets.py for the input list in calibre
+    ENCODINGS = {
+                # Special cases
+                'cp1252':'1252',
+                'utf-8':'1252',
+                'ascii':'1252',
+                # Normal cases
+                'big5':'950',
+                'cp1250':'1250',
+                'cp1251':'1251',
+                'cp1253':'1253',
+                'cp1254':'1254',
+                'cp1255':'1255',
+                'cp1256':'1256',
+                'shift_jis':'932',
+                'gb2312':'936',
+                # Not in RTF 1.9.1 codepage specification
+                'hz':'52936',
+                'iso8859_5':'28595',
+                'iso2022_jp':'50222',
+                'iso2022_kr':'50225',
+                'euc_jp':'51932',
+                'euc_kr':'51949',
+                'gb18030':'54936',
+                }
+
+    def __init__(self, in_file, bug_handler, default_encoding, run_level=1, check_raw=False):
+        """
+        'in_file' -- path of the file to scan.
+        'bug_handler' -- stored exception class.
+        'default_encoding' -- encoding name looked up in ENCODINGS; unknown
+        names fall back to code page '1252'.
+        'run_level' -- accepted but not used by this class.
+        'check_raw' -- if true, scan raw RTF control words instead of the
+        16-character token lines.
+        """
+        self.__file = in_file
+        self.__bug_handler = bug_handler
+        self.__platform = 'Windows'
+        self.__default_num = 'not-defined'
+        self.__code_page = self.ENCODINGS.get(default_encoding, '1252')
+        self.__datafetched = False
+        self.__fetchraw = check_raw
+
+    def __fetch_once(self):
+        """Scan the file on the first call; do nothing afterwards."""
+        if not self.__datafetched:
+            self._encoding()
+            self.__datafetched = True
+
+    def find_default_encoding(self):
+        """
+        Return the tuple (platform, 'ansicpg' + code page, default font
+        number).
+        """
+        self.__fetch_once()
+        # built on every call; building it only on the first call left the
+        # name unbound on later ones
+        code_page = 'ansicpg' + self.__code_page
+        # if self.__code_page == '10000':
+        # self.__code_page = 'mac_roman'
+        return self.__platform, code_page, self.__default_num
+
+    def get_codepage(self):
+        """Return the code page number as a string."""
+        self.__fetch_once()
+        # if self.__code_page == '10000':
+        # self.__code_page = 'mac_roman'
+        return self.__code_page
+
+    def get_platform(self):
+        """Return the platform name ('Windows' unless the scan changed it)."""
+        self.__fetch_once()
+        return self.__platform
+
+    def _encoding(self):
+        """
+        Scan the file and update the platform, code page and default font.
+
+        In token mode the header tokens are read up to 'mi<mk<rtfhed-end'.
+        In raw mode the lines are searched for the platform control word and
+        for \\ansicpgN, stopping at the first \\ansicpgN found. A code page
+        of 0 never replaces the value derived from the default encoding.
+        """
+        with open_for_read(self.__file) as read_obj:
+            cpfound = False
+            if not self.__fetchraw:
+                for line in read_obj:
+                    self.__token_info = line[:16]
+                    if self.__token_info == 'mi<mk<rtfhed-end':
+                        break
+                    if self.__token_info == 'cw<ri<macintosh_':
+                        self.__platform = 'Macintosh'
+                    elif self.__token_info == 'cw<ri<pc________':
+                        self.__platform = 'IBMPC'
+                    elif self.__token_info == 'cw<ri<pca_______':
+                        self.__platform = 'OS/2'
+                    if self.__token_info == 'cw<ri<ansi-codpg' \
+                        and int(line[20:-1]):
+                        self.__code_page = line[20:-1]
+                    if self.__token_info == 'cw<ri<deflt-font':
+                        self.__default_num = line[20:-1]
+                        # NOTE(review): the flag is set by the default-font
+                        # token, not by the code page token -- confirm that
+                        # this is intended
+                        cpfound = True
+                        # cw<ri<deflt-font<nu<0
+                if self.__platform != 'Windows' and \
+                        not cpfound:
+                    if self.__platform == 'Macintosh':
+                        self.__code_page = '10000'
+                    elif self.__platform == 'IBMPC':
+                        self.__code_page = '437'
+                    elif self.__platform == 'OS/2':
+                        self.__code_page = '850'
+            else:
+                fenc = re.compile(r'\\(mac|pc|ansi|pca)[\\ \{\}\t\n]+')
+                fenccp = re.compile(r'\\ansicpg(\d+)[\\ \{\}\t\n]+')
+
+                # stays None when no platform control word is found
+                enc = None
+                for line in read_obj:
+                    enc_match = fenc.search(line)
+                    if enc_match:
+                        enc = enc_match.group(1)
+                    cp_match = fenccp.search(line)
+                    if cp_match:
+                        cp = cp_match.group(1)
+                        # use the declared code page unless it is 0, as the
+                        # token branch above does
+                        if int(cp):
+                            self.__code_page = cp
+                        cpfound = True
+                        break
+                # NOTE(review): the platform is never changed on this path,
+                # so this fallback cannot currently run -- confirm whether
+                # 'enc' should also set the platform
+                if self.__platform != 'Windows' and \
+                        not cpfound:
+                    if enc == 'mac':
+                        self.__code_page = '10000'
+                    elif enc == 'pc':
+                        self.__code_page = '437'
+                    elif enc == 'pca':
+                        self.__code_page = '850'
+
+if __name__ == '__main__':
+    # Command-line check: print the code page found in a raw RTF file.
+    # Arguments: the file to scan, then the fallback encoding name.
+    import sys
+    rtf_path, fallback = sys.argv[1], sys.argv[2]
+    detector = DefaultEncoding(
+            in_file=rtf_path,
+            bug_handler=Exception,
+            default_encoding=fallback,
+            check_raw=True,
+            )
+    print(detector.get_codepage())
--- a/ebook_converter/ebooks/rtf2xml/delete_info.py
+++ b/ebook_converter/ebooks/rtf2xml/delete_info.py
@@ -0,0 +1,212 @@
+from __future__ import unicode_literals, absolute_import, print_function, division
+#########################################################################
+#                                                                       #
+#                                                                       #
+#   copyright 2002 Paul Henry Tremblay                                  #
+#                                                                       #
+#   This program is distributed in the hope that it will be useful,     #
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of      #
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU    #
+#   General Public License for more details.                            #
+#                                                                       #
+#                                                                       #
+#########################################################################
+import sys, os
+
+from calibre.ebooks.rtf2xml import copy
+from calibre.ptempfile import better_mktemp
+from . import open_for_read, open_for_write
+
+
+class DeleteInfo:
+    """
+    Delete unnecessary destination groups.
+
+    A group whose opening bracket is followed by the asterisk control word
+    is kept only when the next token is in the allowable list (or is the
+    list control word, for which only control words are kept).
+    """
+
+    def __init__(self,
+            in_file ,
+            bug_handler,
+            copy=None,
+            run_level=1,
+            ):
+        """
+        'in_file' -- path of the token file; it is overwritten.
+        'bug_handler' -- exception class raised for internal errors.
+        'copy' -- whether to make a copy of result for debugging
+        'run_level' -- higher levels turn unexpected tokens into errors.
+        """
+        self.__file = in_file
+        self.__bug_handler = bug_handler
+        self.__copy = copy
+        self.__write_to = better_mktemp()
+        self.__run_level = run_level
+        self.__initiate_allow()
+        self.__bracket_count= 0
+        # number taken from the most recent open / close bracket line
+        self.__ob_count = 0
+        self.__cb_count = 0
+        # bracket number of the group that is being examined or deleted
+        self.__delete_count = 0
+        # pending open-bracket line that has not been written yet, or 0
+        self.__ob = 0
+        # True when the closing bracket of a deleted group must be written
+        self.__write_cb = False
+        self.__found_delete = False
+
+    def __initiate_allow(self):
+        """
+        Initiate a list of destination groups which should be printed out.
+        Also sets the start state and the table of state handlers.
+        """
+        self.__allowable = ('cw<ss<char-style',
+                            'cw<it<listtable_',
+                            'cw<it<revi-table',
+                            'cw<ls<list-lev-d',
+                            # Field allowed
+                            'cw<fd<field-inst',
+                            'cw<an<book-mk-st',
+                            'cw<an<book-mk-en',
+                            'cw<an<annotation',
+                            'cw<cm<comment___',
+                            'cw<it<lovr-table',
+                            # info table
+                            'cw<di<company___',
+                            # 'cw<ls<list______',
+                        )
+        self.__not_allowable = (
+                'cw<un<unknown___',
+                'cw<un<company___',
+                'cw<ls<list-level',
+                'cw<fd<datafield_',
+                )
+        self.__state = 'default'
+        self.__state_dict = {
+            'default'           : self.__default_func,
+            'after_asterisk'    : self.__asterisk_func,
+            'delete'            : self.__delete_func,
+            'list'              : self.__list_func,
+        }
+
+    def __default_func(self,line):
+        """Handle lines when in no special state. Look for an asterisk to
+        begin a special state. Otherwise, print out line.
+
+        An open bracket is held back instead of being written, so that it
+        can be dropped if an asterisk follows. Returns True if the caller
+        should write the line."""
+        # cw<ml<asterisk__<nu<true
+        if self.__token_info == 'cw<ml<asterisk__':
+            self.__state = 'after_asterisk'
+            self.__delete_count = self.__ob_count
+            return False
+        elif self.__token_info == 'ob<nu<open-brack':
+            # write previous bracket, if exists
+            if self.__ob:
+                self.__write_obj.write(self.__ob)
+            self.__ob = line
+            return False
+        else:
+            # write previous bracket, since didn't find asterisk
+            if self.__ob:
+                self.__write_obj.write(self.__ob)
+                self.__ob = 0
+            return True
+
+    def __delete_func(self,line):
+        """Handle lines when in delete state. Don't print out lines
+        unless the state has ended.
+
+        The state ends when the bracket number of the deleted group equals
+        the number of the last closing bracket. That closing line is written
+        only if the flag for it was set, and the flag is cleared again."""
+        if self.__delete_count == self.__cb_count:
+            self.__state = 'default'
+            if self.__write_cb:
+                # reset the flag so that it does not carry over to the next
+                # deleted group (same as in __list_func)
+                self.__write_cb = False
+                return True
+            return False
+        return False
+
+    def __asterisk_func(self,line):
+        """
+        Determine whether to delete info in group
+        Note on self.__cb flag.
+        If you find that you are in a delete group, and the previous
+        token in not an open bracket (self.__ob = 0), that means
+        that the delete group is nested inside another acceptable
+        destination group. In this case, you have already written
+        the open bracket, so you will need to write the closed one
+        as well.
+        Returns True if the caller should write the line.
+        """
+        # Test for {\*}, in which case don't enter
+        # delete state
+        self.__found_delete = True
+        if self.__token_info == 'cb<nu<clos-brack':
+            if self.__delete_count == self.__cb_count:
+                self.__state = 'default'
+                self.__ob = 0
+                # changed this because haven't printed out start
+                return False
+            else:
+                # not sure what happens here!
+                # believe I have a '{\*}
+                if self.__run_level > 3:
+                    msg = 'Flag problem\n'
+                    raise self.__bug_handler(msg)
+                return True
+        elif self.__token_info in self.__allowable :
+            # NOTE(review): the state goes back to 'default' only when an
+            # open bracket was pending -- confirm that this is intended
+            if self.__ob:
+                self.__write_obj.write(self.__ob)
+                self.__ob = 0
+                self.__state = 'default'
+            return True
+        elif self.__token_info == 'cw<ls<list______':
+            self.__ob = 0
+            self.__found_list_func(line)
+            return False
+        elif self.__token_info in self.__not_allowable:
+            if not self.__ob:
+                self.__write_cb = True
+            self.__ob = 0
+            self.__state = 'delete'
+            self.__cb_count = 0
+            return False
+        else:
+            if self.__run_level > 5:
+                msg = ('After an asterisk, and found neither an allowable or non-allowable token\n\
+                            token is "%s"\n') % self.__token_info
+                raise self.__bug_handler(msg)
+            if not self.__ob:
+                self.__write_cb = True
+            self.__ob = 0
+            self.__state = 'delete'
+            self.__cb_count = 0
+            return False
+
+    def __found_list_func(self, line):
+        """
+        print out control words in this group
+        """
+        self.__state = 'list'
+
+    def __list_func(self, line):
+        """
+        Check to see if the group has ended.
+        Return True for all control words.
+        Return False otherwise.
+        """
+        if self.__delete_count == self.__cb_count and \
+                self.__token_info == 'cb<nu<clos-brack':
+            self.__state = 'default'
+            if self.__write_cb:
+                self.__write_cb = False
+                return True
+            return False
+        elif line[0:2] == 'cw':
+            return True
+        else:
+            return False
+
+    def delete_info(self):
+        """Main method for handling other methods. Read one line at
+        a time, and determine whether to print the line based on the state.
+
+        Returns True if an asterisk group was found in the file."""
+        with open_for_read(self.__file) as read_obj:
+            with open_for_write(self.__write_to) as self.__write_obj:
+                for line in read_obj:
+                    # ob<nu<open-brack<0001
+                    self.__token_info = line[:16]
+                    if self.__token_info == 'ob<nu<open-brack':
+                        self.__ob_count = line[-5:-1]
+                    if self.__token_info == 'cb<nu<clos-brack':
+                        self.__cb_count = line[-5:-1]
+                    # Get action to perform
+                    action = self.__state_dict.get(self.__state)
+                    if not action:
+                        sys.stderr.write('No action in dictionary state is "%s" \n'
+                                % self.__state)
+                        # nothing can be called for this state; skip the
+                        # line instead of calling None
+                        continue
+                    # Print if allowed by action
+                    if action(line):
+                        self.__write_obj.write(line)
+        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
+        if self.__copy:
+            copy_obj.copy_file(self.__write_to, "delete_info.data")
+        # rename() copies the temporary file over the original
+        copy_obj.rename(self.__write_to, self.__file)
+        os.remove(self.__write_to)
+        return self.__found_delete
--- a/ebook_converter/ebooks/rtf2xml/field_strings.py
+++ b/ebook_converter/ebooks/rtf2xml/field_strings.py
@@ -0,0 +1,816 @@
+from __future__ import unicode_literals, absolute_import, print_function, division
+#########################################################################
+#                                                                       #
+#                                                                       #
+#   copyright 2002 Paul Henry Tremblay                                  #
+#                                                                       #
+#   This program is distributed in the hope that it will be useful,     #
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of      #
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU    #
+#   General Public License for more details.                            #
+#                                                                       #
+#                                                                       #
+#########################################################################
+import sys, re
+
+
+class FieldStrings:
+    """
+    This module is given a string. It processes the field instruction string and
+    returns a list of three values.
+    """
+
+    def __init__(self, bug_handler, run_level=1):
+        """
+        Requires:
+            bug_handler -- stored on the instance; process_string raises it
+                for unknown fields when the run level is above 3.
+            run_level -- integer strictness level, 1 by default.
+        Returns:
+            nothing
+        Logic:
+            Store both arguments, then build the lookup tables and the
+            compiled expressions used by the rest of the class.
+        """
+        self.__bug_handler, self.__run_level = bug_handler, run_level
+        self.__initiate_values()
+
+    def __initiate_values(self):
+        """
+        Requires:
+            nothing.
+        Returns:
+            nothing.
+        Logic:
+            Initiate values for the rest of the class.
+            self.__field_instruction_dict:
+                Maps a field name (the first word of a field instruction) to
+                a tuple of (method that parses the instruction, readable
+                name used in the output).
+            self.__number_dict:
+                Maps a number-type switch value to its readable name.
+            self.__text_format_dict:
+                Maps a text-format switch value to its readable name.
+            The remaining attributes are compiled regular expressions for
+            the individual switches.
+        """
+        self.__field_instruction_dict = {
+            # number type (arabic, etc.) and number format (\# " ")
+            'EDITTIME': (self.__num_type_and_format_func, 'editing-time'),
+            'NUMCHARS': (self.__num_type_and_format_func, 'number-of-characters-in-doc'),
+            'NUMPAGES': (self.__num_type_and_format_func, 'number-of-pages-in-doc'),
+            'NUMWORDS': (self.__num_type_and_format_func, 'number-of-words-in-doc'),
+            'REVNUM': (self.__num_type_and_format_func, 'revision-number'),
+            'SECTIONPAGES': (self.__num_type_and_format_func, 'num-of-pages-in-section'),
+            'SECTION': (self.__num_type_and_format_func, 'insert-section-number'),
+            'QUOTE': (self.__num_type_and_format_func, 'quote'),
+            # number formatting (\# "")
+            'PAGE': (self.__default_inst_func, 'insert-page-number'),
+            'page': (self.__default_inst_func, 'insert-page-number'),
+            # date format (\@ "")
+            'CREATEDATE': (self.__date_func, 'insert-date'),
+            'PRINTDATE': (self.__date_func, 'insert-date'),
+            # PRINTDATE?
+            'SAVEDATE': (self.__date_func, 'last-saved'),
+            'TIME': (self.__date_func, 'insert-time'),
+            # numbers?
+            # these fields take four switches
+            'AUTHOR': (self.__simple_info_func, 'user-name'),
+            'COMMENTS': (self.__simple_info_func, 'comments'),
+            'FILENAME': (self.__simple_info_func, 'file-name'),
+            'filename': (self.__simple_info_func, 'file-name'),
+            'KEYWORDS': (self.__simple_info_func, 'keywords'),
+            'LASTSAVEDBY': (self.__simple_info_func, 'last-saved-by'),
+            'SUBJECT': (self.__simple_info_func, 'subject'),
+            'TEMPLATE': (self.__simple_info_func, 'based-on-template'),
+            'TITLE': (self.__simple_info_func, 'document-title'),
+            'USERADDRESS': (self.__simple_info_func, 'user-address'),
+            'USERINITIALS': (self.__simple_info_func, 'user-initials'),
+            'USERNAME': (self.__simple_info_func, 'user-name'),
+            # fields with a dedicated parser
+            'EQ': (self.__equation_func, 'equation'),
+            'HYPERLINK': (self.__hyperlink_func, 'hyperlink'),
+            'INCLUDEPICTURE': (self.__include_pict_func, 'include-picture'),
+            'INCLUDETEXT': (self.__include_text_func, 'include-text-from-file'),
+            'INDEX': (self.__index_func, 'index'),
+            'NOTEREF': (self.__note_ref_func, 'reference-to-note'),
+            'PAGEREF': (self.__page_ref_func, 'reference-to-page'),
+            'REF': (self.__ref_func, 'reference'),
+            'ref': (self.__ref_func, 'reference'),
+            'SEQ': (self.__sequence_func, 'numbering-sequence'),
+            'SYMBOL': (self.__symbol_func, 'symbol'),
+            'TA': (self.__ta_func, 'anchor-for-table-of-authorities'),
+            'TOA': (self.__toc_table_func, 'table-of-authorities'),
+            'TOC': (self.__toc_table_func, 'table-of-contents'),
+            # no switches
+            'AUTONUMOUT': (self.__no_switch_func, 'auto-num-out?'),
+            'COMPARE': (self.__no_switch_func, 'compare'),
+            'DOCVARIABLE': (self.__no_switch_func, 'document-variable'),
+            'GOTOBUTTON': (self.__no_switch_func, 'go-button'),
+            'NEXT': (self.__no_switch_func, 'next'),
+            'NEXTIF': (self.__no_switch_func, 'next-if'),
+            'SKIPIF': (self.__no_switch_func, 'skip-if'),
+            'IF': (self.__no_switch_func, 'if'),
+            'MERGEFIELD': (self.__no_switch_func, 'merge-field'),
+            'MERGEREC': (self.__no_switch_func, 'merge-record'),
+            'MERGESEQ': (self.__no_switch_func, 'merge-sequence'),
+            'PLACEHOLDER': (self.__no_switch_func, 'place-holder'),
+            'PRIVATE': (self.__no_switch_func, 'private'),
+            'RD': (self.__no_switch_func, 'referenced-document'),
+            'SET': (self.__no_switch_func, 'set'),
+            # default instructions (no dedicated method has been written)
+            'ADVANCE': (self.__default_inst_func, 'advance'),
+            'ASK': (self.__default_inst_func, 'prompt-user'),
+            'AUTONUMLGL': (self.__default_inst_func, 'automatic-number'),
+            'AUTONUM': (self.__default_inst_func, 'automatic-number'),
+            'AUTOTEXTLIST': (self.__default_inst_func, 'auto-list-text'),
+            'AUTOTEXT': (self.__default_inst_func, 'auto-text'),
+            'BARCODE': (self.__default_inst_func, 'barcode'),
+            'CONTACT': (self.__default_inst_func, 'contact'),
+            'DATABASE': (self.__default_inst_func, 'database'),
+            'DATE': (self.__default_inst_func, 'date'),
+            'date': (self.__default_inst_func, 'date'),
+            'DOCPROPERTY': (self.__default_inst_func, 'document-property'),
+            'FILESIZE': (self.__default_inst_func, 'file-size'),
+            'FILLIN': (self.__default_inst_func, 'fill-in'),
+            'INFO': (self.__default_inst_func, 'document-info'),
+            'LINK': (self.__default_inst_func, 'link'),
+            'PA': (self.__default_inst_func, 'page'),
+            'PRINT': (self.__default_inst_func, 'print'),
+            'STYLEREF': (self.__default_inst_func, 'style-reference'),
+            'USERPROPERTY': (self.__default_inst_func, 'user-property'),
+            'FORMCHECKBOX': (self.__default_inst_func, 'form-checkbox'),
+            'FORMTEXT': (self.__default_inst_func, 'form-text'),
+            # buttons
+            'MACROBUTTON': (self.__default_inst_func, 'macro-button'),
+        }
+        self.__number_dict = {
+            'Arabic': 'arabic',
+            'alphabetic': 'alphabetic',
+            'ALPHABETIC': 'capital-alphabetic',
+            'roman': 'roman',
+            'ROMAN': 'capital-roman',
+            'Ordinal': 'ordinal',
+            'CardText': 'cardinal-text',
+            'OrdText': 'ordinal-text',
+            'Hex': 'hexidecimal',
+            'DollarText': 'dollar-text',
+            'Upper': 'upper-case',
+            'Lower': 'lower-case',
+            'FirstCap': 'first-cap',
+            'Caps': 'caps',
+        }
+        self.__text_format_dict = {
+            'Upper': 'upper',
+            'Lower': 'lower',
+            'FirstCap': 'first-cap',
+            'Caps': 'caps',
+        }
+        self.__symbol_num_exp = re.compile(r'SYMBOL (.*?) ')
+        self.__symbol_font_exp = re.compile(r'\\f "(.*?)"')
+        self.__symbol_size_exp = re.compile(r'\\s (\d+)')
+        # self.__toc_figure_exp = re.compile(r'\\c "Figure"')
+        # \\@ "dddd, MMMM d, yyyy"
+        self.__date_exp = re.compile(r'\\@\s{1,}"(.*?)"')
+        self.__num_type_exp = re.compile(
+            r'\\\*\s{1,}(Arabic|alphabetic|ALPHABETIC|roman|ROMAN|Ordinal|CardText|OrdText|Hex|DollarText|Upper|Lower|FirstCap|Caps)')
+        self.__format_text_exp = re.compile(r'\\\*\s{1,}(Upper|Lower|FirstCap|Caps)')
+        self.__merge_format_exp = re.compile(r'\\\*\s{1,}MERGEFORMAT')
+        self.__ta_short_field_exp = re.compile(r'\\s\s{1,}"(.*?)"')
+        self.__ta_long_field_exp = re.compile(r'\\l\s{1,}"(.*?)"')
+        self.__ta_category_exp = re.compile(r'\\c\s{1,}(\d+)')
+        # indices
+        self.__index_insert_blank_line_exp = re.compile(r'\\h\s{1,}""')
+        # Capture the quoted heading text. The previous pattern had an empty
+        # group, so it could only match what the blank-line pattern above
+        # already matches and the insert-letter branch never fired.
+        self.__index_insert_letter_exp = re.compile(r'\\h\s{1,}"(.*?)"')
+        self.__index_columns_exp = re.compile(r'\\c\s{1,}"(.*?)"')
+        self.__bookmark_exp = re.compile(r'\\b\s{1,}(.*?)\s')
+        self.__d_separator = re.compile(r'\\d\s{1,}(.*?)\s')
+        self.__e_separator = re.compile(r'\\e\s{1,}(.*?)\s')
+        self.__l_separator = re.compile(r'\\l\s{1,}(.*?)\s')
+        self.__p_separator = re.compile(r'\\p\s{1,}(.*?)\s')
+        self.__index_sequence = re.compile(r'\\s\s{1,}(.*?)\s')
+        self.__index_entry_typ_exp = re.compile(r'\\f\s{1,}"(.*?)"')
+        self.__quote_exp = re.compile(r'"(.*?)"')
+        self.__filter_switch = re.compile(r'\\c\s{1,}(.*?)\s')
+        self.__link_switch = re.compile(r'\\l\s{1,}(.*?)\s')
+
+    def process_string(self, my_string, type):
+        """
+        Requires:
+            my_string -- the string to parse; one token per line.
+            type -- the type of string (not used by this method).
+        Returns:
+            A list of three items; the last one is the string for a field
+            instruction attribute.
+        Logic:
+            This handles all "large" fields, which means everything except
+            toc entries, index entries, and bookmarks.
+            Only lines starting with 'tx' contribute; the first 17 characters
+            of each such line are dropped and the remainders are joined.
+            The first whitespace-separated word of the result is the field's
+            type. Look it up in the field instruction dictionary and hand the
+            string to the method found there. The readable name is marked
+            '<update>dynamic' when a MERGEFORMAT switch is present and
+            '<update>static' otherwise.
+            If no method is found (unknown field, or an instruction without
+            any text), write a message to stderr, raise the bug handler when
+            the run level is above 3, and otherwise return the fall-back
+            list.
+        """
+        changed_string = ''.join(
+            line[17:] for line in my_string.split('\n') if line[0:2] == 'tx')
+        fields = changed_string.split()
+        # An instruction without any text has no field name; treat it as an
+        # unknown field instead of failing on fields[0].
+        field_name = fields[0] if fields else ''
+        action, name = self.__field_instruction_dict.get(field_name, (None, None))
+        if not action:
+            # for now, I want users to report bugs
+            msg = 'no key for "%s" "%s"\n' % (field_name, changed_string)
+            sys.stderr.write(msg)
+            if self.__run_level > 3:
+                raise self.__bug_handler(msg)
+            return self.__fall_back_func(field_name, changed_string)
+        if re.search(self.__merge_format_exp, changed_string):
+            name += '<update>dynamic'
+        else:
+            name += '<update>static'
+        return action(field_name, name, changed_string)
+
+    def __default_inst_func(self, field_name, name, line):
+        """
+        Requires:
+            field_name -- the first word in the string (not used)
+            name -- the changed name according to the dictionary
+            line -- the string to be parsed (not used)
+        Returns:
+            A list of None, None, and the changed name.
+        Logic:
+            No switches are parsed for these fields; only the changed name
+            is passed on.
+        """
+        result = [None, None]
+        result.append(name)
+        return result
+
+    def __fall_back_func(self, field_name,  line):
+        """
+        Requires:
+            field_name -- the first word in the string
+            line -- the string to be parsed (not used)
+        Returns:
+            A list of None, None, and the field name followed by
+            '<update>none'.
+        Logic:
+            Used for fields not found in the field instruction dictionary.
+        """
+        marked_name = field_name + '<update>none'
+        return [None, None, marked_name]
+
+    def __equation_func(self, field_name, name, line):
+        """
+        Requires:
+            field_name -- the first word in the string (not used)
+            name -- the changed name according to the dictionary
+            line -- the string to be parsed (not used)
+        Returns:
+            A list of None, None, and the changed name.
+        Logic:
+            The equation itself is not parsed; only the name is returned.
+        """
+        return [None] * 2 + [name]
+
+    def __no_switch_func(self, field_name, name, line):
+        """
+        Requires:
+            field_name -- the first word in the string (not used)
+            name -- the changed name according to the dictionary
+            line -- the string to be parsed (not used)
+        Returns:
+            A list of None, None, and the changed name.
+        Logic:
+            These fields are registered as taking no switches, so nothing
+            is parsed.
+        """
+        field_info = [None, None, name]
+        return field_info
+
+    def __num_type_and_format_func(self, field_name, name, line):
+        """
+        Requires:
+            field_name -- the first word in the string
+            name -- the changed name according to the dictionary
+            line -- the string to be parsed
+        Returns:
+            A list of None, None, and part of a tag.
+        Logic:
+            Start with the changed name. Append the number format and then
+            the number type when the helper methods find them. For a QUOTE
+            field, also append the first double-quoted text following the
+            word QUOTE as the argument.
+        """
+        tag = name
+        number_format = self.__parse_num_format(line)
+        if number_format:
+            tag += '<number-format>%s' % number_format
+        number_type = self.__parse_num_type(line)
+        if number_type:
+            tag += '<number-type>%s' % number_type
+        # Only QUOTE takes a (mandatory?) argument
+        if field_name != 'QUOTE':
+            return [None, None, tag]
+        quoted = re.search(r'QUOTE\s{1,}"(.*?)"', line)
+        if quoted:
+            tag += '<argument>%s' % quoted.group(1)
+        return [None, None, tag]
+
+    def __num_format_func(self, field_name, name, line):
+        """
+        Requires:
+            field_name -- the first word in the string (not used)
+            name -- the changed name according to the dictionary
+            line -- the string to be parsed
+        Returns:
+            A list of None, None, and part of a tag.
+        Logic:
+            Append the number format to the changed name when the helper
+            method finds one.
+        """
+        number_format = self.__parse_num_format(line)
+        if not number_format:
+            return [None, None, name]
+        return [None, None, name + '<number-format>%s' % number_format]
+
+    def __parse_num_format(self, the_string):
+        """
+        Requires:
+            the_string -- the string to parse
+        Returns:
+            The quoted text captured by self.__date_exp if the_string
+            contains it; None otherwise.
+        Logic:
+            Search for the switch and return its first group.
+        """
+        # NOTE(review): this searches with the date pattern (the \\@ switch);
+        # the field table's comments describe number formats as \\# " ".
+        # Confirm which switch is intended here.
+        match_group = re.search(self.__date_exp, the_string)
+        if match_group:
+            # a match object is not callable; the text is read with .group()
+            return match_group.group(1)
+        return None
+
+    def __parse_num_type(self, the_string):
+        """
+        Requires:
+            the_string -- the string to parse
+        Returns:
+            a readable number-type name if the_string contains number type
+            information; None otherwise.
+        Logic:
+            the_string might look like:
+            USERNAME \\* Arabic \\* MERGEFORMAT
+            Get the word after the first matching \\* switch and convert it
+            with self.__number_dict to the more readable value used for
+            "number-type". A word without a dictionary entry is reported on
+            stderr.
+        """
+        found = re.search(self.__num_type_exp, the_string)
+        if not found:
+            return None
+        raw_type = found.group(1)
+        readable = self.__number_dict.get(raw_type)
+        if readable:
+            return readable
+        for message in (
+                'module is fields_string\n',
+                'method is __parse_num_type\n',
+                'no dictionary entry for %s\n' % raw_type):
+            sys.stderr.write(message)
+        return None
+
+    def __date_func(self, field_name, name, line):
+        """
+        Requires:
+            field_name -- the first word in the string (not used)
+            name -- the changed name according to the dictionary
+            line -- the string to be parsed
+        Returns:
+            A list of None, None, and part of a tag.
+        Logic:
+            When the line carries a \\@ "..." switch, append the quoted text
+            to the changed name as the date format.
+        """
+        found = re.search(self.__date_exp, line)
+        if not found:
+            return [None, None, name]
+        return [None, None, name + '<date-format>%s' % found.group(1)]
+
+    def __simple_info_func(self, field_name, name, line):
+        """
+        Requires:
+            field_name -- the first word in the string (not used)
+            name -- the changed name according to the dictionary
+            line -- the string to be parsed
+        Returns:
+            A list of None, None, and part of a tag.
+        Logic:
+            These fields can only have the following switches:
+                1. Upper
+                2. Lower
+                3. FirstCap
+                4. Caps
+            When one of them is present, append its readable name as the
+            format. A switch without a dictionary entry is reported on
+            stderr.
+        """
+        the_string = name
+        match_group = re.search(self.__format_text_exp, line)
+        if match_group:
+            switch = match_group.group(1)
+            changed_name = self.__text_format_dict.get(switch)
+            if changed_name:
+                the_string += '<format>%s' % changed_name
+            else:
+                # name this module and method, not the one it was copied from
+                sys.stderr.write('module is field_strings\n')
+                sys.stderr.write('method is __simple_info_func\n')
+                sys.stderr.write('no dictionary entry for %s\n' % switch)
+        return [None, None, the_string]
+
+    def __hyperlink_func(self, field_name, name, line):
+        """
+        Requires:
+            field_name -- the first word in the string (not used)
+            name -- the changed name according to the dictionary
+            line -- the string to be parsed
+        Returns:
+            A list of None, None, and part of a tag.
+        Logic:
+            Replace self.__link_switch with an expression that also accepts
+            a quoted value. Append the value of the \\l switch as the link,
+            remove the switch from the line, and append the first quoted
+            text that remains as the argument. Finally, record the \\m, \\n
+            and \\h switches.
+        """
+        self.__link_switch = re.compile(r'\\l\s{1,}"{0,1}(.*?)"{0,1}\s')
+        tag = name
+        found = re.search(self.__link_switch, line)
+        if found:
+            target = found.group(1).replace('"', "&quot;")
+            tag += '<link>%s' % target
+        # \l "anchor" "file name"
+        # the quoted file name is wanted, so the \l switch must go first
+        remainder = re.sub(self.__link_switch, '', line)
+        found = re.search(self.__quote_exp, remainder)
+        if found:
+            tag += '<argument>%s' % found.group(1)
+        for switch, marker in (
+                ('\\m', '<html2-image-map>true'),
+                ('\\n', '<new-window>true'),
+                ('\\h', '<no-history>true')):
+            if switch in remainder:
+                tag += marker
+        return [None, None, tag]
+
+    def __include_text_func(self, field_name, name, line):
+        """
+        Requires:
+            field_name -- the first word in the string (not used)
+            name -- the changed name according to the dictionary
+            line -- the string to be parsed
+        Returns:
+            A list of None, None, and part of a tag.
+        Logic:
+            Append, in this order: the text format switch (Upper, Lower,
+            FirstCap or Caps), the value of the \\c filter switch, the first
+            quoted text left after the filter switch has been removed (the
+            argument), and a marker for the \\! switch. A missing argument
+            or an unknown format switch is reported on stderr.
+        """
+        the_string = name
+        match_group = re.search(self.__format_text_exp, line)
+        if match_group:
+            switch = match_group.group(1)
+            changed_name = self.__text_format_dict.get(switch)
+            if changed_name:
+                the_string += '<format>%s' % changed_name
+            else:
+                # name this module and method, not the one it was copied from
+                sys.stderr.write('module is field_strings\n')
+                sys.stderr.write('method is __include_text_func\n')
+                sys.stderr.write('no dictionary entry for %s\n' % switch)
+        match_group = re.search(self.__filter_switch, line)
+        if match_group:
+            arg = match_group.group(1)
+            the_string += '<filter>%s' % arg
+        # \c "txt" "file name"
+        # want "file name" so must get rid of \c "txt"
+        line = re.sub(self.__filter_switch, '', line)
+        match_group = re.search(self.__quote_exp, line)
+        if match_group:
+            arg = match_group.group(1)
+            arg = arg.replace('"', "&quot;")
+            the_string += '<argument>%s' % arg
+        else:
+            sys.stderr.write('Module is field_strings\n')
+            sys.stderr.write('method is include_text_func\n')
+            sys.stderr.write('no argument for include text\n')
+        if '\\!' in line:
+            the_string += '<no-field-update>true'
+        return [None, None, the_string]
+
+    def __include_pict_func(self, field_name, name, line):
+        """
+        Requires:
+            field_name -- the first word in the string (not used)
+            name -- the changed name according to the dictionary
+            line -- the string to be parsed
+        Returns:
+            A list of None, None, and part of a tag.
+        Logic:
+            Append the value of the \\c filter switch, then the first quoted
+            text left once that switch has been removed (the argument), then
+            a marker for the \\d switch. A missing argument is reported on
+            stderr.
+        """
+        tag = name
+        found = re.search(self.__filter_switch, line)
+        if found:
+            tag += '<filter>%s' % found.group(1).replace('"', "&quot;")
+        # \c "txt" "file name"
+        # the file name is wanted, so the \c switch is dropped first
+        remainder = re.sub(self.__filter_switch, '', line)
+        found = re.search(self.__quote_exp, remainder)
+        if found:
+            tag += '<argument>%s' % found.group(1)
+        else:
+            for message in (
+                    'Module is field_strings\n',
+                    'method is include_pict_func\n',
+                    'no argument for include pict\n'):
+                sys.stderr.write(message)
+        if '\\d' in remainder:
+            tag += '<external>true'
+        return [None, None, tag]
+
+    def __ref_func(self, field_name, name, line):
+        """
+        Requires:
+            field_name -- the first word in the string (not used)
+            name -- the changed name according to the dictionary
+            line -- the string to be parsed
+        Returns:
+            A list of None, None, and part of a tag.
+        Logic:
+            A reference field looks like this:
+                REF _Ref440880424 \\h
+            Append the text format switch, if any. Remove the MERGEFORMAT
+            switch, then treat every word after the field name that does
+            not start with a backslash as a bookmark, which is used as an
+            anchor in the resulting XML file. Finally, record the one-letter
+            switches found in the line.
+        """
+        the_string = name
+        match_group = re.search(self.__format_text_exp, line)
+        if match_group:
+            switch = match_group.group(1)
+            changed_name = self.__text_format_dict.get(switch)
+            if changed_name:
+                the_string += '<format>%s' % changed_name
+            else:
+                # name this module and method, not the one it was copied from
+                sys.stderr.write('module is field_strings\n')
+                sys.stderr.write('method is __ref_func\n')
+                sys.stderr.write('no dictionary entry for %s\n' % switch)
+        line = re.sub(self.__merge_format_exp, '', line)
+        # skip the field name itself
+        for word in line.split()[1:]:
+            if word[0:1] != '\\':
+                the_string += '<bookmark>%s' % word
+        # the order of this table is the order of the output
+        switches = (
+            ('\\f', '<include-note-number>true'),
+            ('\\h', '<hyperlink>true'),
+            ('\\n', '<insert-number>true'),
+            ('\\r', '<insert-number-relative>true'),
+            ('\\p', '<paragraph-relative-position>true'),
+            ('\\t', '<suppress-non-delimeter>true'),
+            ('\\w', '<insert-number-full>true'),
+        )
+        for switch, marker in switches:
+            if switch in line:
+                the_string += marker
+        return [None, None, the_string]
+
+    def __toc_table_func(self, field_name, name, line):
+        """
+        Requires:
+            field_name -- the first word in the string (not used)
+            name -- the changed name, according to the dictionary
+            line -- the string to be parsed
+        Returns:
+            A list of the changed name, None, and the string for the table
+            field.
+        Logic:
+            If the line contains the switch \\c "Figure", the table is a
+            table of figures, so 'table-of-contents' is replaced in the
+            name. Otherwise the name is used as it is.
+        """
+        if '\\c "Figure"' in line:
+            table_name = name.replace('table-of-contents', 'table-of-figures')
+        else:
+            table_name = name
+        # the first value in this list is probably not needed
+        return [name, None, table_name]
+
+    def __sequence_func(self, field_name, name, line):
+        """
+        Requires:
+            field_name -- the first word in the string (not used)
+            name -- the changed name according to the dictionary
+            line -- the string to parse
+        Returns:
+            A list of None, None, and the name with a label appended.
+        Logic:
+            The type of sequence--whether figure, graph, my-name, or
+            whatever--is the second word in the string, as in
+                SEQ Figure \\* ARABIC
+            Extract it and use it as the label. A line with fewer than two
+            words raises an IndexError.
+        """
+        words = line.split()
+        return [None, None, '%s<label>%s' % (name, words[1])]
+
+    def __ta_func(self, field_name, name, line):
+        """
+        Requires:
+            field_name -- the first word in the string (not used)
+            name -- the changed name according to the dictionary
+            line -- the string to parse
+        Returns:
+            A list of None, None, and part of a tag.
+        Logic:
+            Append the values of the \\s (short field), \\l (long field) and
+            \\c (category) switches, in that order, when they are present.
+            Then record the \\b and \\i switches.
+        """
+        tag = name
+        valued_switches = (
+            (self.__ta_short_field_exp, '<short-field>%s'),
+            (self.__ta_long_field_exp, '<long-field>%s'),
+            (self.__ta_category_exp, '<category>%s'),
+        )
+        for expression, template in valued_switches:
+            found = re.search(expression, line)
+            if found:
+                tag += template % found.group(1)
+        for switch, marker in (('\\b', '<bold>true'), ('\\i', '<italics>true')):
+            if switch in line:
+                tag += marker
+        return [None, None, tag]
+
+    def __index_func(self, field_name, name, line):
+        """
+        Requires:
+            field_name -- the first word in the string (not used)
+            name -- the changed name according to the dictionary
+            line -- the string to parse
+        Returns:
+            A list of None, None, and part of a tag.
+        Logic:
+            Look for each index switch in turn and append what is found to
+            the changed name: \\h (blank line or heading letter), \\c
+            (columns), \\b (bookmark), \\d (sequence separator), \\e (page
+            separator), \\s (sequence), \\f (entry type), \\p (limit to
+            letters), \\l (multi-page separator), and the flags \\a, \\r and
+            \\t. Double quotes in separator and sequence values are written
+            as &quot;.
+        """
+        the_string = name
+        match_group = re.search(self.__index_insert_blank_line_exp, line)
+        if match_group:
+            the_string += '<insert-blank-line>true'
+        else:
+            match_group = re.search(self.__index_insert_letter_exp, line)
+            if match_group:
+                insert_letter = match_group.group(1)
+                the_string += '<insert-letter>%s' % insert_letter
+        match_group = re.search(self.__index_columns_exp, line)
+        if match_group:
+            columns = match_group.group(1)
+            the_string += '<number-of-columns>%s' % columns
+        match_group = re.search(self.__bookmark_exp, line)
+        if match_group:
+            bookmark = match_group.group(1)
+            the_string += '<use-bookmark>%s' % bookmark
+        match_group = re.search(self.__d_separator, line)
+        if match_group:
+            separator = match_group.group(1)
+            separator = separator.replace('"', '&quot;')
+            the_string += '<sequence-separator>%s' % separator
+        match_group = re.search(self.__e_separator, line)
+        if match_group:
+            separator = match_group.group(1)
+            separator = separator.replace('"', '&quot;')
+            the_string += '<page-separator>%s' % separator
+        match_group = re.search(self.__index_sequence, line)
+        if match_group:
+            sequence = match_group.group(1)
+            # Escape the sequence itself. This line used to operate on
+            # 'separator', which is unbound unless a \d or \e switch was
+            # seen, and the result was never used.
+            sequence = sequence.replace('"', '&quot;')
+            the_string += '<use-sequence>%s' % sequence
+        match_group = re.search(self.__index_entry_typ_exp, line)
+        if match_group:
+            entry_type = match_group.group(1)
+            the_string += '<entry-type>%s' % entry_type
+        match_group = re.search(self.__p_separator, line)
+        if match_group:
+            limit = match_group.group(1)
+            the_string += '<limit-to-letters>%s' % limit
+        match_group = re.search(self.__l_separator, line)
+        if match_group:
+            separator = match_group.group(1)
+            separator = separator.replace('"', '&quot;')
+            the_string += '<multi-page-separator>%s' % separator
+        if '\\a' in line:
+            the_string += '<accented>true'
+        if '\\r' in line:
+            the_string += '<sub-entry-on-same-line>true'
+        if '\\t' in line:
+            the_string += '<enable-yomi-text>true'
+        return [None, None, the_string]
+
+    def __page_ref_func(self, field_name, name, line):
+        """
+        Requires:
+            field_name -- the first word in the string (not used)
+            name -- the changed name according to the dictionary
+            line -- the string to parse
+        Returns:
+            A list of None, None, and part of a tag.
+        Logic:
+            A page reference field looks like this:
+                PAGEREF _Toc440880424 \\h
+            Append the number format and the number type when the helper
+            methods find them. Remove the MERGEFORMAT switch, then treat
+            every word after the field name that does not start with a
+            backslash as a bookmark. Finally, record the \\h and \\p
+            switches.
+        """
+        tag = name
+        number_format = self.__parse_num_format(line)
+        if number_format:
+            tag += '<number-format>%s' % number_format
+        number_type = self.__parse_num_type(line)
+        if number_type:
+            tag += '<number-type>%s' % number_type
+        stripped = re.sub(self.__merge_format_exp, '', line)
+        # skip the field name itself
+        for token in stripped.split()[1:]:
+            if not token.startswith('\\'):
+                tag += '<bookmark>%s' % token
+        for switch, marker in (
+                ('\\h', '<hyperlink>true'),
+                ('\\p', '<paragraph-relative-position>true')):
+            if switch in stripped:
+                tag += marker
+        return [None, None, tag]
+
+    def __note_ref_func(self, field_name, name, line):
+        """
+        Requires:
+            field_name -- the first word in the string (not used)
+            name -- the changed name according to the dictionary
+            line -- the string to parse
+        Returns:
+            A list of None, None, and part of a tag.
+        Logic:
+            Remove the MERGEFORMAT switch, then treat every word after the
+            field name that does not start with a backslash as a bookmark.
+            Finally, record the \\h, \\p and \\f switches, in that order.
+        """
+        tag = name
+        stripped = re.sub(self.__merge_format_exp, '', line)
+        # skip the field name itself
+        for token in stripped.split()[1:]:
+            if not token.startswith('\\'):
+                tag += '<bookmark>%s' % token
+        for switch, marker in (
+                ('\\h', '<hyperlink>true'),
+                ('\\p', '<paragraph-relative-position>true'),
+                ('\\f', '<include-note-number>true')):
+            if switch in stripped:
+                tag += marker
+        return [None, None, tag]
+
+    def __symbol_func(self, field_name, name, line):
+        """
+        Requires:
+            field_name -- the first word in the string (not used)
+            name -- the changed name according to the dictionary (not used)
+            line -- the string to parse
+        Returns:
+            A list of 'Symbol', None, and a string of token lines holding
+            the font style, the font size and a hexidecimal value.
+        Logic:
+            The SYMBOL field is one of Microsoft's many quirky ways of
+            entering text. The string handed to this method looks like
+            this:
+                SYMBOL 97 \\f "Symbol" \\s 12
+            The first word merely says that a SYMBOL field was found.
+            The next value is the decimal value of the character; it is
+            written out in hexidecimal.
+            The pattern '\\f "some font"' gives the font.
+            The pattern '\\s some size' gives the font size, written with
+            two decimals.
+            The first item of the returned list is the word 'Symbol', which
+            marks the result as text data rather than a real field.
+        """
+        token_lines = []
+        hex_value = ''
+        found = re.search(self.__symbol_num_exp, line)
+        if found:
+            hex_value = '%X' % int(found.group(1))
+        found = re.search(self.__symbol_font_exp, line)
+        if found:
+            token_lines.append('cw<ci<font-style<nu<%s\n' % found.group(1))
+        found = re.search(self.__symbol_size_exp, line)
+        if found:
+            size = '%.2f' % int(found.group(1))
+            token_lines.append('cw<ci<font-size_<nu<%s\n' % size)
+        token_lines.append('tx<hx<__________<\'%s\n' % hex_value)
+        return ['Symbol', None, ''.join(token_lines)]
--- a/ebook_converter/ebooks/rtf2xml/fields_large.py
+++ b/ebook_converter/ebooks/rtf2xml/fields_large.py
@@ -0,0 +1,378 @@
+from __future__ import unicode_literals, absolute_import, print_function, division
+#########################################################################
+#                                                                       #
+#                                                                       #
+#   copyright 2002 Paul Henry Tremblay                                  #
+#                                                                       #
+#   This program is distributed in the hope that it will be useful,     #
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of      #
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU    #
+#   General Public License for more details.                            #
+#                                                                       #
+#                                                                       #
+#########################################################################
+import sys, os
+from calibre.ebooks.rtf2xml import field_strings, copy
+from calibre.ptempfile import better_mktemp
+from . import open_for_read, open_for_write
+
+
+class FieldsLarge:
+    r"""
+=========================
+Logic
+=========================
+Make tags for fields.
+-Fields reflect text that Microsoft Word automatically generates.
+-Each file contains (or should contain) an inner group called field instructions.
+-Fields can be nested.
+--------------
+Logic
+--------------
+1. As soon as a field is found, make a new text string by appending an empty
+text string to the field list. Collect all the lines in this string until the
+field instructions are found.
+2. Collect all the tokens and text in the field instructions. When the end of
+the field instructions is found, process the string of text with the
+field_strings module. Append the processed string to the field instructions
+list.
+3. Continue collecting tokens. Check for paragraphs or sections. If either is found, add to the paragraph or section list.
+4. Continue collecting tokens and text until either the beginning of a new field or the end of this field is found.
+5. If a new field is found, repeat steps 1-3.
+6. If the end of the field is found, process the last text string of the field list.
+7. If the field list is empty (after removing the last text string), there are
+no more fields. Print out the final string. If the list contains other strings,
+add the processed string to the last string in the field list.
+============================
+Examples
+============================
+    This line of RTF:
+        {\field{\*\fldinst { CREATEDATE  \\* MERGEFORMAT }}{\fldrslt {
+        \lang1024 1/11/03 10:34 PM}}}
+    Becomes:
+        <field type = "insert-time">
+            10:34 PM
+        </field>
+    The simple field in the above example contains no paragraph or section breaks.
+    This line of RTF:
+        {{\field{\*\fldinst SYMBOL 97 \\f "Symbol" \\s 12}{\fldrslt\f3\fs24}}}
+    Becomes:
+        <para><inline font-size="18"><inline font-style="Symbol">&#x03A7;</inline></inline></para>
+        The RTF in the example above should be represented as UTF-8 rather than a field.
+    This RTF:
+        {\field\fldedit{\*\fldinst { TOC \\o "1-3" }}{\fldrslt {\lang1024
+        Heading one\tab }{\field{\*\fldinst {\lang1024  PAGEREF _Toc440880424
+        \\h }{\lang1024 {\*\datafield
+        {\lang1024 1}}}{\lang1024 \par }\pard\plain
+        \s18\li240\widctlpar\tqr\tldot\tx8630\aspalpha\aspnum\faauto\adjustright\rin0\lin240\itap0
+        \f4\lang1033\cgrid {\lang1024 Heading 2\tab }{\field{\*\fldinst
+        {\lang1024  PAGEREF _Toc440880425 \\h }{\lang1024 {\*\datafield
+        {\lang1024 1}}}{\lang1024 \par }\pard\plain
+        \widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0
+        \f4\lang1033\cgrid }}\pard\plain
+        \widctlpar\aspalpha\aspnum\faauto\adjustright\rin0\lin0\itap0
+        \f4\lang1033\cgrid {\fs28 \\u214\'85 \par }{\fs36 {\field{\*\fldinst
+        SYMBOL 67 \\f "Symbol" \\s 18}{\fldrslt\f3\fs36}}}
+    Becomes:
+        <field-block type="table-of-contents">
+        <paragraph-definition language="1033" nest-level="0"
+        font-style="Times" name="toc 1" adjust-right="true"
+        widow-control="true">
+        <para><inline language="1024">Heading one&#x009;</inline><field
+        type="reference-to-page" ref="_Toc440880424"><inline
+        language="1024">1</inline></field></para>
+        </paragraph-definition>
+        <paragraph-definition language="1033" nest-level="0" left-indent="12"
+        font-style="Times" name="toc 2" adjust-right="true"
+        widow-control="true">
+        <para><inline language="1024">Heading 2&#x009;</inline><field
+        type="reference-to-page" ref="_Toc440880425"><inline
+        language="1024">1</inline></field></para>
+        </paragraph-definition>
+        </field-block>
+    """
+
+    def __init__(self, in_file, bug_handler, copy=None, run_level=1):
+        """
+        Record the settings for one pass over a tokenized file.
+        Required:
+            in_file -- file to parse
+            bug_handler -- passed on to the FieldStrings and Copy helpers
+        Optional:
+            copy -- whether to make a copy of result for debugging
+            run_level -- passed on to FieldStrings (default 1)
+        Returns:
+            nothing
+        Logic:
+            Nothing is parsed here; fix_fields() does the work. The
+            result is first written to the path that better_mktemp()
+            returns.
+        """
+        self.__write_to = better_mktemp()
+        self.__run_level = run_level
+        self.__copy = copy
+        self.__bug_handler = bug_handler
+        self.__file = in_file
+
+    def __initiate_values(self):
+        """
+        Reset every piece of parser state before a file is processed.
+        """
+        self.__state = 'before_body'
+        self.__marker = 'mi<mk<inline-fld\n'
+        self.__text_string = ''
+        self.__field_instruction_string = ''  # collects field instruction
+        self.__symbol = 0  # whether the field is really UTF-8 (cannot be nested)
+        # the lists below hold one entry per open (possibly nested) field
+        self.__field_count = []  # keep track of the brackets
+        self.__field_instruction = []  # field instruction strings
+        self.__field_string = []  # list of field strings
+        self.__par_in_field = []  # paragraphs in field?
+        self.__sec_in_field = []  # sections in field?
+        self.__string_obj = field_strings.FieldStrings(
+            run_level=self.__run_level, bug_handler=self.__bug_handler)
+        # state name -> handler called for every line read in that state
+        self.__state_dict = {
+            'before_body': self.__before_body_func,
+            'in_body': self.__in_body_func,
+            'field': self.__in_field_func,
+            'field_instruction': self.__field_instruction_func,
+        }
+        # token -> handler, while in the body but outside any field
+        self.__in_body_dict = {'cw<fd<field_____': self.__found_field_func}
+        # token -> handler, while inside a field
+        self.__field_dict = {
+            'cw<fd<field-inst': self.__found_field_instruction_func,
+            'cw<fd<field_____': self.__found_field_func,
+            'cw<pf<par-end___': self.__par_in_field_func,
+            'cw<sc<section___': self.__sec_in_field_func,
+        }
+
+    def __before_body_func(self, line):
+        """
+        Required:
+            line -- line to parse
+        Returns:
+            nothing (may change the state; always writes the line)
+        Logic:
+            Copy the line to the output. If it is the body-open marker,
+            switch to the 'in_body' state.
+        """
+        self.__write_obj.write(line)
+        if self.__token_info == 'mi<mk<body-open_':
+            self.__state = 'in_body'
+
+    def __in_body_func(self, line):
+        """
+        Required:
+            line -- line to parse
+        Returns:
+            nothing (may run a handler; always writes the line)
+        Logic:
+            Run the handler for a field-start token, then write the line.
+        """
+        handler = self.__in_body_dict.get(self.__token_info)
+        if handler is not None:
+            handler(line)
+        self.__write_obj.write(line)
+
+    def __found_field_func(self, line):
+        """
+        Requires:
+            line -- line to parse (not used)
+        Returns:
+            nothing
+        Logic:
+            Open a new (possibly nested) field: push an empty text string,
+            the current open bracket count and cleared paragraph and
+            section flags, then enter the 'field' state.
+        """
+        self.__field_count.append(self.__ob_count)
+        self.__field_string.append('')
+        self.__par_in_field.append(0)
+        self.__sec_in_field.append(0)
+        self.__cb_count = 0
+        self.__state = 'field'
+
+    def __in_field_func(self, line):
+        """
+        Requires:
+            line -- line to parse
+        Returns:
+            nothing
+        Logic:
+            When the closed bracket count equals the count stored for the
+            innermost field, add the line and finish that field. Otherwise
+            run the handler for the token, or just collect the line.
+        """
+        if self.__cb_count == self.__field_count[-1]:
+            self.__field_string[-1] += line
+            self.__end_field_func()
+            return
+        handler = self.__field_dict.get(self.__token_info)
+        if handler is None:
+            self.__field_string[-1] += line
+        else:
+            handler(line)
+
+    def __par_in_field_func(self, line):
+        """
+        Requires:
+            line -- line to parse
+        Returns:
+            nothing
+        Logic:
+            Flag the innermost field as containing a paragraph break and
+            add the line to that field's text string.
+        """
+        self.__par_in_field[-1] = 1
+        self.__field_string[-1] += line
+
+    def __sec_in_field_func(self, line):
+        """
+        Requires:
+            line -- line to parse
+        Returns:
+            nothing
+        Logic:
+            Flag the innermost field as containing a section break and
+            add the line to that field's text string.
+        """
+        self.__sec_in_field[-1] = 1
+        self.__field_string[-1] += line
+
+    def __found_field_instruction_func(self, line):
+        """
+        Requires:
+            line -- line to parse (not used)
+        Returns:
+            nothing
+        Logic:
+            Remember the open bracket count at which the instruction starts,
+            zero the closed bracket count, and enter 'field_instruction'.
+        """
+        self.__cb_count = 0
+        self.__field_instruction_count = self.__ob_count
+        self.__state = 'field_instruction'
+
+    def __field_instruction_func(self, line):
+        """
+        Requires:
+            line -- line to parse
+        Returns:
+            nothing
+        Logic:
+            Collect all the lines until the closing bracket of the field
+            instruction is reached. Then process the collected string with
+            field_strings and store the third item of the result as the
+            instruction. A first item of 'Symbol' marks really UTF-8 data.
+        """
+        if self.__cb_count != self.__field_instruction_count:
+            self.__field_instruction_string += line
+            return
+        # The closing bracket should be written, since the opening bracket
+        # was written
+        self.__field_string[-1] += line
+        parsed = self.__string_obj.process_string(
+            self.__field_instruction_string, 'field_instruction')
+        self.__field_instruction.append(parsed[2])
+        if parsed[0] == 'Symbol':
+            self.__symbol = 1
+        self.__field_instruction_string = ''
+        self.__state = 'field'
+
+    def __end_field_func(self):
+        """
+        Requires:
+            nothing
+        Returns:
+            nothing
+        Logic:
+            Pop the last values in the bracket list, the instructions
+            list, the fields list, the section list, and the paragraph
+            list.
+            If the field is a symbol, do not write the tags
+            <field></field>, since this field is really just UTF-8; keep
+            the processed instruction followed by a closing bracket.
+            If the field contains paragraph or section breaks, it is a
+            field-block rather than just a field.
+            Add the paragraph or section markers for later parsing of the
+            file.
+            If the field list contains more strings, add the latest
+            (processed) string to the last string in the list. Otherwise,
+            write the string to the output file.
+        """
+        last_bracket = self.__field_count.pop()
+        instruction = self.__field_instruction.pop()
+        body = self.__field_string.pop()
+        has_section = self.__sec_in_field.pop()
+        has_paragraph = self.__par_in_field.pop()
+        if self.__symbol:
+            # a symbol is plain text: only the processed instruction and a
+            # closing bracket remain
+            result = '%scb<nu<clos-brack<%s\n' % (instruction, last_bracket)
+        elif has_section or has_paragraph:
+            result = (
+                'mi<mk<fldbkstart\n'
+                'mi<tg<open-att__<field-block<type>%s\n%s'
+                'mi<mk<fldbk-end_\n'
+                'mi<tg<close_____<field-block\n'
+                'mi<mk<fld-bk-end\n'
+            ) % (instruction, body)
+        else:
+            # write a marker to show an inline field for later parsing
+            result = (
+                '%s'
+                'mi<tg<open-att__<field<type>%s\n%s'
+                'mi<tg<close_____<field\n'
+            ) % (self.__marker, instruction, body)
+        if has_section:
+            result = 'mi<mk<sec-fd-beg\n' + result + 'mi<mk<sec-fd-end\n'
+        if has_paragraph:
+            result = 'mi<mk<par-in-fld\n' + result
+        if self.__field_string:
+            self.__field_string[-1] += result
+        else:
+            self.__write_field_string(result)
+        self.__symbol = 0
+
+    def __write_field_string(self, the_string):
+        self.__write_obj.write(the_string)  # emit the finished outermost field
+        self.__state = 'in_body'  # no field is open any more
+
+    def fix_fields(self):
+        """
+        Requires:
+            nothing
+        Returns:
+            nothing (changes the original file)
+        Logic:
+            Read one line at a time, record the number carried by every
+            open or closed bracket token, and hand the line to the handler
+            for the current state. A state with no handler is reported on
+            stderr and the line is copied through unchanged.
+        """
+        self.__initiate_values()
+        # Context managers close both files even when a handler raises.
+        with open_for_read(self.__file) as read_obj:
+            with open_for_write(self.__write_to) as self.__write_obj:
+                for line in read_obj:
+                    self.__token_info = line[:16]
+                    if self.__token_info == 'ob<nu<open-brack':
+                        self.__ob_count = line[-5:-1]
+                    if self.__token_info == 'cb<nu<clos-brack':
+                        self.__cb_count = line[-5:-1]
+                    action = self.__state_dict.get(self.__state)
+                    if action is None:
+                        sys.stderr.write(
+                            'No matching state in module fields_large.py\n')
+                        sys.stderr.write(self.__state + '\n')
+                        # pass the line through instead of calling None
+                        self.__write_obj.write(line)
+                    else:
+                        action(line)
+        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
+        if self.__copy:
+            copy_obj.copy_file(self.__write_to, "fields_large.data")
+        copy_obj.rename(self.__write_to, self.__file)
+        os.remove(self.__write_to)
--- a/ebook_converter/ebooks/rtf2xml/fields_small.py
+++ b/ebook_converter/ebooks/rtf2xml/fields_small.py
@@ -0,0 +1,460 @@
+from __future__ import unicode_literals, absolute_import, print_function, division
+#########################################################################
+#                                                                       #
+#                                                                       #
+#   copyright 2002 Paul Henry Tremblay                                  #
+#                                                                       #
+#   This program is distributed in the hope that it will be useful,     #
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of      #
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU    #
+#   General Public License for more details.                            #
+#                                                                       #
+#                                                                       #
+#########################################################################
+import sys, os, re
+
+from calibre.ebooks.rtf2xml import field_strings, copy
+from calibre.ptempfile import better_mktemp
+from . import open_for_read, open_for_write
+
+
+class FieldsSmall:
+    """
+=================
+Purpose
+=================
+Write tags for bookmarks, index and toc entry fields in a tokenized file.
+This module does not handle toc or index tables.  (This module won't be any
+use to you unless you use it as part of the other modules.)
+-----------
+Method
+-----------
+Look for the beginning of a bookmark, index, or toc entry. When such a token
+is found, store the opening bracket count in a variable. Collect all the text
+until the closing bracket entry is found. Send the string to the module
+field_strings to process it. Write the processed string to the output
+file.
+    """
+
+    def __init__(self, in_file, bug_handler, copy=None, run_level=1):
+        """
+        Record the settings for one pass over a tokenized file.
+        Required:
+            in_file -- file to parse
+            bug_handler -- passed on to the FieldStrings and Copy helpers
+        Optional:
+            copy -- whether to make a copy of result for debugging
+            run_level -- stored on the instance (default 1)
+        Returns:
+            nothing
+        Logic:
+            Nothing is parsed here; fix_fields() does the work. The
+            result is first written to the path that better_mktemp()
+            returns.
+        """
+        self.__run_level = run_level
+        self.__write_to = better_mktemp()
+        self.__copy = copy
+        self.__bug_handler = bug_handler
+        self.__file = in_file
+
+    def __initiate_values(self):
+        """
+        Reset the parser state and build the dispatch tables.
+        """
+        self.__state = 'before_body'
+        self.__marker = 'mi<mk<inline-fld\n'
+        self.__text_string = ''
+        self.__string_obj = field_strings.FieldStrings(
+            bug_handler=self.__bug_handler)
+        # state name -> handler called for each line read in that state
+        self.__state_dict = {
+            'before_body': self.__before_body_func,
+            'body': self.__body_func,
+            'bookmark': self.__bookmark_func,
+            'toc_index': self.__toc_index_func,
+        }
+        # body token -> (handler, tag passed to that handler)
+        self.__body_dict = {
+            'cw<an<book-mk-st': (self.__found_bookmark_func, 'start'),
+            'cw<an<book-mk-en': (self.__found_bookmark_func, 'end'),
+            'cw<an<toc_______': (self.__found_toc_index_func, 'toc'),
+            'cw<an<index-mark': (self.__found_toc_index_func, 'index'),
+        }
+        self.__book_start = re.compile(
+            'ob<nu<open-brack.....' 'cw<an<book-mk-st<nu<true'
+            'tx<nu<__________<(.*?)' 'cb<nu<clos-brack')
+
+    def __before_body_func(self, line):
+        """
+        Requires:
+            line -- the line to parse
+        Returns:
+            nothing
+        Logic:
+            Always print out the line. If it is the body-open marker,
+            change the state to 'body'.
+        """
+        self.__write_obj.write(line)
+        if self.__token_info == 'mi<mk<body-open_':
+            self.__state = 'body'
+
+    def __body_func(self, line):
+        """
+        Requires:
+            line -- the line to parse
+        Returns:
+            nothing
+        Logic:
+            A bookmark, index or toc token is handed, with its tag, to
+            its handler and is not written. Any other line is written out.
+        """
+        entry = self.__body_dict.get(self.__token_info)
+        if entry is None:
+            self.__write_obj.write(line)
+            return
+        handler, tag = entry
+        handler(line, tag)
+
+    def __found_bookmark_func(self, line, tag):
+        """
+        Requires:
+            line -- the line to parse (not used)
+            tag -- 'start' or 'end', the kind of bookmark found
+        Returns:
+            nothing
+        Logic:
+            Store the current open bracket count, zero the closed bracket
+            count, and change the state to 'bookmark'.
+        """
+        self.__type_of_bookmark = tag
+        self.__state = 'bookmark'
+        self.__cb_count = 0
+        self.__beg_bracket_count = self.__ob_count
+
+    def __bookmark_func(self, line):
+        """
+        Requires:
+            line -- the line to parse
+        Returns:
+            nothing
+        Logic:
+            This function handles all lines within a bookmark. Until the
+            closed bracket count equals the count stored when the bookmark
+            began, the text of each text token is added to a string; all
+            other lines are dropped.
+            When the counts match, the string becomes the <number>
+            attribute of an empty field tag. The inline-field marker, the
+            tag and the current line are written out, and the state
+            returns to 'body'.
+        """
+        if self.__beg_bracket_count != self.__cb_count:
+            # still inside the group: keep only the text of text tokens
+            if line[0:2] == 'tx':
+                self.__text_string += line[17:-1]
+            return
+        self.__state = 'body'
+        field_type = 'bookmark-%s' % self.__type_of_bookmark
+        my_string = self.__parse_bookmark_func(self.__text_string, field_type)
+        self.__write_obj.write(self.__marker)
+        self.__write_obj.write(my_string)
+        self.__text_string = ''
+        self.__write_obj.write(line)
+
+    def __parse_index_func(self, my_string):
+        """
+        Requires:
+            my_string -- string to parse
+        Returns:
+            A string for an index entry field.
+        Logic:
+            I want to eliminate paragraph endings, and I want to divide
+            the entry into a main entry and (if it exists) a sub entry.
+            First split off the "see" text and the bookmark text, and
+            note whether the entry is marked italic or bold.
+            Split what is left by newlines and read one token at a time.
+            Text found before the special colon token belongs to the main
+            entry; text found after it belongs to the sub entry.
+            Every other token (a paragraph ending, for example) is
+            ignored, since I don't want paragraphs within index entries.
+            The result is one empty-element field tag carrying all of
+            this as attributes.
+        """
+        my_string, see_string = self.__index_see_func(my_string)
+        my_string, bookmark_string = self.__index_bookmark_func(my_string)
+        italics, bold = self.__index__format_func(my_string)
+        # text goes to the main entry until a colon token is seen, and to
+        # the sub entry from then on
+        main_parts = []
+        sub_parts = []
+        target = main_parts
+        for line in my_string.split('\n'):
+            token_info = line[:16]
+            if token_info == 'cw<ml<colon_____':
+                target = sub_parts
+            elif token_info[0:2] == 'tx':
+                target.append(line[17:])
+        pieces = ['mi<tg<empty-att_<field<type>index-entry',
+                  '<update>static']
+        if see_string:
+            pieces.append('<additional-text>%s' % see_string)
+        if bookmark_string:
+            pieces.append('<bookmark>%s' % bookmark_string)
+        if italics:
+            pieces.append('<italics>true')
+        if bold:
+            pieces.append('<bold>true')
+        pieces.append('<main-entry>%s' % ''.join(main_parts))
+        if target is sub_parts:
+            pieces.append('<sub-entry>%s' % ''.join(sub_parts))
+        pieces.append('\n')
+        return ''.join(pieces)
+
+    def __index_see_func(self, my_string):
+        """
+        Return (string without the "see" text, the "see" text).
+        """
+        depth = 0
+        close_depth = sys.maxsize
+        inside = False
+        kept = []
+        see_text = []
+        for line in my_string.split('\n'):
+            token_info = line[:16]
+            if token_info == 'ob<nu<open-brack':
+                depth += 1
+            elif token_info == 'cb<nu<clos-brack':
+                depth -= 1
+            if not inside:
+                if token_info == 'cw<in<index-see_':
+                    close_depth = depth - 1
+                    inside = True
+                kept.append('%s\n' % line)
+            elif depth == close_depth and token_info == 'cb<nu<clos-brack':
+                inside = False
+            elif token_info == 'tx<nu<__________':
+                see_text.append(line[17:])
+        return ''.join(kept), ''.join(see_text)
+
+    def __index_bookmark_func(self, my_string):
+        """
+        Requires:
+            my_string -- string of all the index entry
+        Returns:
+            index_string -- string minus the text of the bookmark
+            bookmark_string -- the text string of the bookmark
+        Logic:
+            The bookmark starts at a place token and ends at the closing
+            bracket of the group that holds that token.
+        """
+        # cw<an<place_____<nu<true
+        depth = 0
+        close_depth = sys.maxsize
+        inside = False
+        kept = []
+        mark_text = []
+        for line in my_string.split('\n'):
+            token_info = line[:16]
+            if token_info == 'ob<nu<open-brack':
+                depth += 1
+            elif token_info == 'cb<nu<clos-brack':
+                depth -= 1
+            if not inside:
+                if token_info == 'cw<an<place_____':
+                    close_depth = depth - 1
+                    inside = True
+                kept.append('%s\n' % line)
+            elif depth == close_depth and token_info == 'cb<nu<clos-brack':
+                inside = False
+                kept.append('%s\n' % line)
+            elif token_info == 'tx<nu<__________':
+                mark_text.append(line[17:])
+            else:
+                kept.append('%s\n' % line)
+        return ''.join(kept), ''.join(mark_text)
+
+    def __index__format_func(self, my_string):
+        """
+        Look for the bold and italic index tokens in the entry.
+        Returns:
+            (italics, bold) -- each is 1 when its token is present,
+            otherwise 0
+        """
+        tokens = {line[:16] for line in my_string.split('\n')}
+        italics = 1 if 'cw<in<index-ital' in tokens else 0
+        bold = 1 if 'cw<in<index-bold' in tokens else 0
+        return italics, bold
+
+    def __parse_toc_func(self, my_string):
+        """
+        Requires:
+            my_string -- all the string in the toc
+        Returns:
+            modified string (one empty-element field tag)
+        Logic:
+            Split off the bookmarks, then gather the text, the toc level
+            and the suppress-number flag into attributes of the tag.
+        """
+        my_string, book_start_string, book_end_string = \
+            self.__parse_bookmark_for_toc(my_string)
+        pieces = ['mi<tg<empty-att_<field<type>toc-entry', '<update>static']
+        if book_start_string:
+            pieces.append('<bookmark-start>%s' % book_start_string)
+        if book_end_string:
+            pieces.append('<bookmark-end>%s' % book_end_string)
+        toc_level = 0
+        toc_suppress = 0
+        text = []
+        for line in my_string.split('\n'):
+            token_info = line[:16]
+            if token_info[0:2] == 'tx':
+                text.append(line[17:])
+            elif token_info == 'cw<tc<toc-level_':
+                toc_level = line[20:]
+            elif token_info == 'cw<tc<toc-sup-nu':
+                toc_suppress = 1
+        if toc_level:
+            pieces.append('<toc-level>%s' % toc_level)
+        if toc_suppress:
+            pieces.append('<toc-suppress-number>true')
+        pieces.append('<main-entry>%s' % ''.join(text))
+        pieces.append('\n')
+        return ''.join(pieces)
+
+    def __parse_bookmark_for_toc(self, my_string):
+        """
+        Requires:
+            my_string -- string of toc, with new lines
+        Returns:
+            toc_string -- string minus the text of the bookmarks
+            book_start_string -- text found in bookmark-start groups
+            book_end_string -- text found in bookmark-end groups
+        Logic:
+            Count brackets relative to the start of the string. A
+            bookmark token opens a bookmark, which lasts until the
+            closing bracket of the group that holds the token. Inside
+            a bookmark, text tokens are moved to the start or end
+            string according to the kind of token that opened it; all
+            other lines, and all lines outside a bookmark, are kept in
+            the toc string.
+        """
+        kinds = {'cw<an<book-mk-st': 'start', 'cw<an<book-mk-en': 'end'}
+        depth = 0
+        close_depth = sys.maxsize
+        inside = False
+        book_type = 0
+        kept = []
+        collected = {'start': [], 'end': []}
+        for line in my_string.split('\n'):
+            token_info = line[:16]
+            if token_info == 'ob<nu<open-brack':
+                depth += 1
+            elif token_info == 'cb<nu<clos-brack':
+                depth -= 1
+            if not inside:
+                if token_info in kinds:
+                    book_type = kinds[token_info]
+                    close_depth = depth - 1
+                    inside = True
+                kept.append('%s\n' % line)
+            elif depth == close_depth and token_info == 'cb<nu<clos-brack':
+                inside = False
+                kept.append('%s\n' % line)
+            elif token_info == 'tx<nu<__________':
+                collected[book_type].append(line[17:])
+            else:
+                kept.append('%s\n' % line)
+        return (''.join(kept), ''.join(collected['start']),
+                ''.join(collected['end']))
+
+    def __parse_bookmark_func(self, my_string, type):
+        """
+        Requires:
+            my_string -- string to parse
+            type -- type of string
+        Returns:
+            A string formatted for a field instruction.
+        Logic:
+            The type is the name (either bookmark-end or bookmark-start).
+            The id is the complete text string. Both become attributes
+            of one empty-element field tag.
+        """
+        return ('mi<tg<empty-att_<field<type>%s'
+                '<number>%s<update>none\n') % (type, my_string)
+
+    def __found_toc_index_func(self, line, tag):
+        """
+        Requires:
+            line -- the line to parse (not used)
+            tag -- 'toc' or 'index', the kind of entry found
+        Returns:
+            nothing
+        Logic:
+            Store the current open bracket count, zero the closed bracket
+            count, and change the state to 'toc_index'.
+        """
+        self.__tag = tag
+        self.__state = 'toc_index'
+        self.__cb_count = 0
+        self.__beg_bracket_count = self.__ob_count
+
+    def __toc_index_func(self, line):
+        """
+        Requires:
+            line -- the line to parse
+        Returns:
+            nothing
+        Logic:
+            This function handles all lines within a toc or index entry.
+            It adds each line to a string until the closed bracket count
+            equals the count stored when the entry began.
+            It then turns the string into a field tag with the index or
+            the toc parser, according to the tag that opened the entry,
+            writes the inline-field marker, the tag and the current line,
+            and returns to the 'body' state.
+        """
+        if self.__beg_bracket_count != self.__cb_count:
+            self.__text_string += line
+            return
+        self.__state = 'body'
+        if self.__tag == 'index':
+            my_string = self.__parse_index_func(self.__text_string)
+        elif self.__tag == 'toc':
+            my_string = self.__parse_toc_func(self.__text_string)
+        self.__write_obj.write(self.__marker)
+        self.__write_obj.write(my_string)
+        self.__text_string = ''
+        self.__write_obj.write(line)
+
+    def fix_fields(self):
+        """
+        Requires:
+            nothing
+        Returns:
+            nothing (changes the original file)
+        Logic:
+            Read one line at a time and hand it to the handler for the
+            current state. A state with no handler is reported on stderr
+            and the line is copied through unchanged.
+        """
+        self.__initiate_values()
+        with open_for_read(self.__file) as read_obj:
+            with open_for_write(self.__write_to) as self.__write_obj:
+                for line in read_obj:
+                    self.__token_info = line[:16]
+                    if self.__token_info == 'ob<nu<open-brack':
+                        self.__ob_count = line[-5:-1]
+                    if self.__token_info == 'cb<nu<clos-brack':
+                        self.__cb_count = line[-5:-1]
+                    action = self.__state_dict.get(self.__state)
+                    if action is None:
+                        sys.stderr.write('No matching state in module fields_small.py\n')
+                        sys.stderr.write(self.__state + '\n')
+                        self.__write_obj.write(line)
+                    else:
+                        action(line)
+        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
+        if self.__copy:
+            copy_obj.copy_file(self.__write_to, "fields_small.data")
+        copy_obj.rename(self.__write_to, self.__file)
+        os.remove(self.__write_to)
--- a/ebook_converter/ebooks/rtf2xml/fonts.py
+++ b/ebook_converter/ebooks/rtf2xml/fonts.py
@@ -0,0 +1,226 @@
+from __future__ import unicode_literals, absolute_import, print_function, division
+#########################################################################
+#                                                                       #
+#                                                                       #
+#   copyright 2002 Paul Henry Tremblay                                  #
+#                                                                       #
+#   This program is distributed in the hope that it will be useful,     #
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of      #
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU    #
+#   General Public License for more details.                            #
+#                                                                       #
+#                                                                       #
+#########################################################################
+import sys, os
+
+from calibre.ebooks.rtf2xml import copy
+from calibre.ptempfile import better_mktemp
+from . import open_for_read, open_for_write
+
+
+class Fonts:
+    """
+    Change lines with font info from font numbers to the actual font names.
+    """
+
+    def __init__(self,
+            in_file,
+            bug_handler,
+            default_font_num,
+            copy=None,
+            run_level=1,
+            ):
+        """
+        Required:
+            'file'--file to parse
+            'default_font_num'--the default font number
+        Optional:
+            'copy'-- whether to make a copy of result for debugging
+            'temp_dir' --where to output temporary results (default is
+            directory from which the script is run.)
+        Returns:
+            nothing
+            """
+        self.__file = in_file
+        self.__bug_handler = bug_handler
+        self.__copy = copy
+        self.__default_font_num = default_font_num
+        self.__write_to = better_mktemp()
+        self.__run_level = run_level
+
+    def __initiate_values(self):
+        """
+        Initiate all values.
+        """
+        self.__special_font_dict = {
+        'Symbol'        :   0,
+        'Wingdings'     :   0,
+        'Zapf Dingbats'      :   0,
+        }
+        self.__special_font_list = [
+        'Symbol', 'Wingdings', 'Zapf Dingbats'
+        ]
+        self.__state = 'default'
+        self.__state_dict = {
+        'default'           : self.__default_func,
+        'font_table'        : self.__font_table_func,
+        'after_font_table'  : self.__after_font_table_func,
+        'font_in_table'     : self.__font_in_table_func,
+        }
+        self.__font_table = {}
+        # individual font written
+        self.__wrote_ind_font = 0
+
+    def __default_func(self, line):
+        """
+        Requires:
+            line
+        Returns:
+            nothing
+        Handle all lines before the font table. Check for the beginning of the
+        font table. If found, change the state. Print out all lines.
+        """
+        if self.__token_info == 'mi<mk<fonttb-beg':
+            self.__state = 'font_table'
+        self.__write_obj.write(line)
+
+    def __font_table_func(self, line):
+        """
+        Requires:
+            line
+        Returns:
+            nothing
+        Logic:
+            If the self.__token_info indicates that you have reached the end of
+            the font table, then change the state to after the font table.
+            If the self.__token_info indicates that there is a font in the
+            table, change the state to font in table. Reset the number of the
+            font to the default font (in case there is no number provided, in
+            which case RTF assumes the number will be the default font.) Reset
+            the test string (for the font name) to ''
+            """
+        if self.__token_info == 'mi<mk<fonttb-end':
+            self.__state = 'after_font_table'
+        elif self.__token_info == 'mi<mk<fontit-beg':
+            self.__state = 'font_in_table'
+            self.__font_num = self.__default_font_num
+            self.__text_line = ''
+        # self.__write_obj.write(line)
+
+    def __font_in_table_func(self, line):
+        """
+        Requires:
+            line
+        Returns:
+            nothing
+        Logic:
+            Check for four conditions:
+                The line contains font-info. In this case, store the number in
+                self.__font_num.
+                The line contains text. In this case, add to the text string
+                self.__text_string.
+                The line marks the end of the individual font in the table. In
+                this case, add a new key-> value pair to the font-table
+                dictionary. Also create an empty tag with the name and number
+                as attributes.
+                Preamture end of font table
+            """
+        # cw<ci<font-style<nu<4
+        # tx<nu<__________<Times;
+        if self.__token_info == 'mi<mk<fontit-end':
+            self.__wrote_ind_font = 1
+            self.__state = 'font_table'
+            self.__text_line = self.__text_line[:-1]  # get rid of last ';'
+            self.__font_table[self.__font_num] = self.__text_line
+            self.__write_obj.write(
+            'mi<tg<empty-att_'
+            '<font-in-table<name>%s<num>%s\n' % (self.__text_line, self.__font_num)
+            )
+        elif self.__token_info == 'cw<ci<font-style':
+            self.__font_num = line[20:-1]
+        elif self.__token_info == 'tx<nu<__________' or \
+        self.__token_info == 'tx<ut<__________':
+            self.__text_line += line[17:-1]
+        elif self.__token_info == 'mi<mk<fonttb-end':
+            self.__found_end_font_table_func()
+            self.__state = 'after_font_table'
+
+    def __found_end_font_table_func(self):
+        """
+        Required:
+            nothing
+        Returns:
+            nothing
+        Logic:
+            If not individual fonts have been written, write one out
+        """
+        if not self.__wrote_ind_font:
+            self.__write_obj.write(
+            'mi<tg<empty-att_'
+            '<font-in-table<name>Times<num>0\n')
+
+    def __after_font_table_func(self, line):
+        """
+        Required:
+            line
+        Returns:
+            nothing
+        Logic:
+            Check the self.__token_info. If this matches a token with font
+            info, then extract the number from the line, and look up the font
+            name in the font dictionary. If no name exists for that number,
+            print out an error. Otherwise print out the same line, except with
+            the name rather than the number.
+            If the line does not contain font info, simply print it out to the
+            file.
+            """
+        if self.__token_info == 'cw<ci<font-style':
+            font_num = line[20:-1]
+            font_name = self.__font_table.get(font_num)
+            if font_name is None:
+                if self.__run_level > 3:
+                    msg = 'no value for %s in self.__font_table\n' % font_num
+                    raise self.__bug_handler(msg)
+            else:
+                # self.__special_font_dict
+                if font_name in self.__special_font_list:
+                    self.__special_font_dict[font_name] = 1
+                self.__write_obj.write(
+                'cw<ci<font-style<nu<%s\n' % font_name
+                )
+        else:
+            self.__write_obj.write(line)
+
+    def convert_fonts(self):
+        """
+        Required:
+            nothing
+        Returns:
+            a dictionary indicating with values for special fonts
+        Logic:
+            Read one line in at a time. Determine what action to take based on
+            the state. If the state is font_table, looke for individual fonts
+            and add the number and font name to a dictionary. Also create a
+            tag for each individual font in the font table.
+            If the state is after the font table, look for lines with font
+            info. Substitute a font name for a font number.
+            """
+        self.__initiate_values()
+        with open_for_read(self.__file) as read_obj:
+            with open_for_write(self.__write_to) as self.__write_obj:
+                for line in read_obj:
+                    self.__token_info = line[:16]
+                    action = self.__state_dict.get(self.__state)
+                    if action is None:
+                        sys.stderr.write('no matching state in module fonts.py\n' + self.__state + '\n')
+                    action(line)
+        default_font_name = self.__font_table.get(self.__default_font_num)
+        if not default_font_name:
+            default_font_name = 'Not Defined'
+        self.__special_font_dict['default-font'] = default_font_name
+        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
+        if self.__copy:
+            copy_obj.copy_file(self.__write_to, "fonts.data")
+        copy_obj.rename(self.__write_to, self.__file)
+        os.remove(self.__write_to)
+        return self.__special_font_dict
--- a/ebook_converter/ebooks/rtf2xml/footnote.py
+++ b/ebook_converter/ebooks/rtf2xml/footnote.py
@@ -0,0 +1,264 @@
+from __future__ import absolute_import, division, print_function, unicode_literals
+#########################################################################
+#                                                                       #
+#                                                                       #
+#   copyright 2002 Paul Henry Tremblay                                  #
+#                                                                       #
+#   This program is distributed in the hope that it will be useful,     #
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of      #
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU    #
+#   General Public License for more details.                            #
+#                                                                       #
+#                                                                       #
+#########################################################################
+import os
+
+from calibre.ebooks.rtf2xml import copy
+from calibre.ptempfile import better_mktemp
+from polyglot.builtins import unicode_type
+
+from . import open_for_read, open_for_write
+
+
+class Footnote:
+    """
+    Two public methods are available. The first separates all of the
+    footnotes from the body and puts them at the bottom of the text, where
+    they are easier to process. The second joins those footnotes to the
+    proper places in the body.
+    """
+
+    def __init__(self,
+            in_file ,
+            bug_handler,
+            copy=None,
+            run_level=1,
+            ):
+        self.__file = in_file
+        self.__bug_handler = bug_handler
+        self.__copy = copy
+        self.__write_to = better_mktemp()
+        self.__found_a_footnote = 0
+
+    def __first_line_func(self, line):
+        """
+        Print the tag info for footnotes.  Check whether footnote is an
+        endnote and make the tag according to that.
+        """
+        if self.__token_info == 'cw<nt<type______':
+            self.__write_to_foot_obj.write(
+            'mi<tg<open-att__<footnote<type>endnote<num>%s\n' % self.__footnote_count)
+        else:
+            self.__write_to_foot_obj.write(
+            'mi<tg<open-att__<footnote<num>%s\n' % self.__footnote_count)
+        self.__first_line = 0
+
+    def __in_footnote_func(self, line):
+        """Handle all tokens that are part of footnote"""
+        if self.__first_line:
+            self.__first_line_func(line)
+        if self.__token_info == 'cw<ci<footnot-mk':
+            num = unicode_type(self.__footnote_count)
+            self.__write_to_foot_obj.write(line)
+            self.__write_to_foot_obj.write(
+                'tx<nu<__________<%s\n' % num
+            )
+        if self.__cb_count == self.__footnote_bracket_count:
+            self.__in_footnote = 0
+            self.__write_obj.write(line)
+            self.__write_to_foot_obj.write(
+            'mi<mk<foot___clo\n')
+            self.__write_to_foot_obj.write(
+            'mi<tg<close_____<footnote\n')
+            self.__write_to_foot_obj.write(
+            'mi<mk<footnt-clo\n')
+        else:
+            self.__write_to_foot_obj.write(line)
+
+    def __found_footnote(self, line):
+        """ Found a footnote"""
+        self.__found_a_footnote = 1
+        self.__in_footnote = 1
+        self.__first_line = 1
+        self.__footnote_count += 1
+        # temporarily set this to zero so I can enter loop
+        self.__cb_count = 0
+        self.__footnote_bracket_count = self.__ob_count
+        self.__write_obj.write(
+        'mi<mk<footnt-ind<%04d\n' % self.__footnote_count)
+        self.__write_to_foot_obj.write(
+        'mi<mk<footnt-ope<%04d\n' % self.__footnote_count)
+
+    def __default_sep(self, line):
+        """Handle all tokens that are not footnote tokens"""
+        if self.__token_info == 'cw<nt<footnote__':
+            self.__found_footnote(line)
+        self.__write_obj.write(line)
+        if self.__token_info == 'cw<ci<footnot-mk':
+            num = unicode_type(self.__footnote_count + 1)
+            self.__write_obj.write(
+                'tx<nu<__________<%s\n' % num
+            )
+
+    def __initiate_sep_values(self):
+        """
+        initiate counters for separate_footnotes method.
+        """
+        self.__bracket_count=0
+        self.__ob_count = 0
+        self.__cb_count = 0
+        self.__footnote_bracket_count = 0
+        self.__in_footnote = 0
+        self.__first_line = 0  # have not processed the first line of footnote
+        self.__footnote_count = 0
+
+    def separate_footnotes(self):
+        """
+        Separate all the footnotes in an RTF file and put them at the bottom,
+        where they are easier to process.  Each time a footnote is found,
+        print all of its contents to a temporary file. Close both the main and
+        temporary file. Print the footnotes from the temporary file to the
+        bottom of the main file.
+        """
+        self.__initiate_sep_values()
+        self.__footnote_holder = better_mktemp()
+        with open_for_read(self.__file) as read_obj:
+            with open_for_write(self.__write_to) as self.__write_obj:
+                with open_for_write(self.__footnote_holder) as self.__write_to_foot_obj:
+                    for line in read_obj:
+                        self.__token_info = line[:16]
+                        # keep track of opening and closing brackets
+                        if self.__token_info == 'ob<nu<open-brack':
+                            self.__ob_count = line[-5:-1]
+                        if self.__token_info == 'cb<nu<clos-brack':
+                            self.__cb_count = line[-5:-1]
+                        # In the middle of footnote text
+                        if self.__in_footnote:
+                            self.__in_footnote_func(line)
+                        # not in the middle of footnote text
+                        else:
+                            self.__default_sep(line)
+        with open_for_read(self.__footnote_holder) as read_obj:
+            with open_for_write(self.__write_to, append=True) as write_obj:
+                write_obj.write(
+                    'mi<mk<sect-close\n'
+                    'mi<mk<body-close\n'
+                    'mi<tg<close_____<section\n'
+                    'mi<tg<close_____<body\n'
+                    'mi<tg<close_____<doc\n'
+                    'mi<mk<footnt-beg\n')
+                for line in read_obj:
+                    write_obj.write(line)
+                write_obj.write(
+                'mi<mk<footnt-end\n')
+        os.remove(self.__footnote_holder)
+        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
+        if self.__copy:
+            copy_obj.copy_file(self.__write_to, "footnote_separate.data")
+        copy_obj.rename(self.__write_to, self.__file)
+        os.remove(self.__write_to)
+
+    def update_info(self, file, copy):
+        """
+        Unused method
+        """
+        self.__file = file
+        self.__copy = copy
+
+    def __get_foot_body_func(self, line):
+        """
+        Process lines in main body and look for beginning of footnotes.
+        """
+        # mi<mk<footnt-end
+        if self.__token_info == 'mi<mk<footnt-beg':
+            self.__state = 'foot'
+        else:
+            self.__write_obj.write(line)
+
+    def __get_foot_foot_func(self, line):
+        """
+        Copy footnotes from bottom of file to a separate, temporary file.
+        """
+        if self.__token_info == 'mi<mk<footnt-end':
+            self.__state = 'body'
+        else:
+            self.__write_to_foot_obj.write(line)
+
+    def __get_footnotes(self):
+        """
+        Private method to remove footnotes from main file.  Read one line from
+        the main file at a time. If the state is 'body', call on the private
+        __get_foot_foot_func. Otherwise, call on the __get_foot_body_func.
+        These two functions do the work of separating the footnotes form the
+        body.
+        """
+        with open_for_read(self.__file) as read_obj:
+            with open_for_write(self.__write_to) as self.__write_obj:
+                with open_for_write(self.__footnote_holder) as self.__write_to_foot_obj:
+                    for line in read_obj:
+                        self.__token_info = line[:16]
+                        if self.__state == 'body':
+                            self.__get_foot_body_func(line)
+                        elif self.__state == 'foot':
+                            self.__get_foot_foot_func(line)
+
+    def __get_foot_from_temp(self, num):
+        """
+        Private method for joining footnotes to body. This method reads from
+        the temporary file until the proper footnote marker is found. It
+        collects all the tokens until the end of the footnote, and returns
+        them as a string.
+        """
+        look_for = 'mi<mk<footnt-ope<' + num + '\n'
+        found_foot = 0
+        string_to_return = ''
+        for line in self.__read_from_foot_obj:
+            if found_foot:
+                if line == 'mi<mk<footnt-clo\n':
+                    return string_to_return
+                string_to_return = string_to_return + line
+            else:
+                if line == look_for:
+                    found_foot = 1
+
+    def __join_from_temp(self):
+        """
+        Private method for rejoining footnotes to body.  Read from the
+        newly-created, temporary file that contains the body text but no
+        footnotes. Each time a footnote marker is found, call the private
+        method __get_foot_from_temp(). This method will return a string to
+        print out to the third file.
+        If no footnote marker is found, simply print out the token (line).
+        """
+        with open_for_read(self.__footnote_holder) as self.__read_from_foot_obj:
+            with open_for_read(self.__write_to) as read_obj:
+                with open_for_write(self.__write_to2) as self.__write_obj:
+                    for line in read_obj:
+                        if line[:16] == 'mi<mk<footnt-ind':
+                            line = self.__get_foot_from_temp(line[17:-1])
+                        self.__write_obj.write(line)
+
+    def join_footnotes(self):
+        """
+        Join the footnotes from the bottom of the file and put them in their
+        former places.  First, remove the footnotes from the bottom of the
+        input file, outputting them to a temporary file. This creates two new
+        files, one without footnotes, and one of just footnotes. Open both
+        these files to read. When a marker is found in the main file, find the
+        corresponding marker in the footnote file. Output the mix of body and
+        footnotes to a third file.
+        """
+        if not self.__found_a_footnote:
+            return
+        self.__write_to2 = better_mktemp()
+        self.__state = 'body'
+        self.__get_footnotes()
+        self.__join_from_temp()
+        # self.__write_obj.close()
+        # self.__read_from_foot_obj.close()
+        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
+        if self.__copy:
+            copy_obj.copy_file(self.__write_to2, "footnote_joined.data")
+        copy_obj.rename(self.__write_to2, self.__file)
+        os.remove(self.__write_to2)
+        os.remove(self.__footnote_holder)
--- a/ebook_converter/ebooks/rtf2xml/get_char_map.py
+++ b/ebook_converter/ebooks/rtf2xml/get_char_map.py
@@ -0,0 +1,62 @@
+from __future__ import unicode_literals, absolute_import, print_function, division
+#########################################################################
+#                                                                       #
+#                                                                       #
+#   copyright 2002 Paul Henry Tremblay                                  #
+#                                                                       #
+#   This program is distributed in the hope that it will be useful,     #
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of      #
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU    #
+#   General Public License for more details.                            #
+#                                                                       #
+#                                                                       #
+#########################################################################
+
+
+class GetCharMap:
+    """
+
+    Return the character map for the given value
+
+    """
+
+    def __init__(self, bug_handler, char_file):
+        """
+
+        Required:
+
+            'char_file'--the file with the mappings
+
+        Returns:
+
+            nothing
+
+            """
+        self.__char_file = char_file
+        self.__bug_handler = bug_handler
+
+    def get_char_map(self, map):
+        # if map == 'ansicpg10000':
+        #   map = 'mac_roman'
+        found_map = False
+        map_dict = {}
+        self.__char_file.seek(0)
+        for line in self.__char_file:
+            if not line.strip():
+                continue
+            begin_element = '<%s>' % map
+            end_element = '</%s>' % map
+            if not found_map:
+                if begin_element in line:
+                    found_map = True
+            else:
+                if end_element in line:
+                    break
+                fields = line.split(':')
+                fields[1].replace('\\colon', ':')
+                map_dict[fields[1]] = fields[3]
+
+        if not found_map:
+            msg = 'no map found\nmap is "%s"\n'%(map,)
+            raise self.__bug_handler(msg)
+        return map_dict
--- a/ebook_converter/ebooks/rtf2xml/group_borders.py
+++ b/ebook_converter/ebooks/rtf2xml/group_borders.py
@@ -0,0 +1,306 @@
+from __future__ import unicode_literals, absolute_import, print_function, division
+#########################################################################
+#                                                                       #
+#                                                                       #
+#   copyright 2002 Paul Henry Tremblay                                  #
+#                                                                       #
+#   This program is distributed in the hope that it will be useful,     #
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of      #
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU    #
+#   General Public License for more details.                            #
+#                                                                       #
+#                                                                       #
+#########################################################################
+import sys, os, re
+from calibre.ebooks.rtf2xml import copy
+from calibre.ptempfile import better_mktemp
+from . import open_for_read, open_for_write
+
+
+class GroupBorders:
+    """
+    Form lists.
+    Use RTF's own formatting to determine if a paragraph definition is part of a
+    list.
+    Use indents to determine items and how lists are nested.
+    """
+
+    def __init__(self,
+            in_file,
+            bug_handler,
+            copy=None,
+            run_level=1,
+            wrap=0,
+            ):
+        """
+        Required:
+            'file'
+        Optional:
+            'copy'-- whether to make a copy of result for debugging
+            'temp_dir' --where to output temporary results (default is
+            directory from which the script is run.)
+        Returns:
+            nothing
+            """
+        self.__file = in_file
+        self.__bug_handler = bug_handler
+        self.__copy = copy
+        self.__run_level = run_level
+        self.__write_to = better_mktemp()
+        self.__wrap = wrap
+
+    def __initiate_values(self):
+        """
+        Required:
+            Nothing
+        Return:
+            Nothing
+        Logic:
+            The self.__end_list is a list of tokens that will force a list to end.
+            Likewise, the self.__end_lines is a list of lines that forces a list to end.
+        """
+        self.__state = "default"
+        self.__left_indent = 0
+        self.__border_num = 0
+        self.__list_type = 'not-defined'
+        self.__pard_def = ""
+        self.__all_lists = []
+        self.__list_chunk = ''
+        self.__state_dict={
+        'default'           :   self.__default_func,
+        'in_pard'           :   self.__in_pard_func,
+        'after_pard'        :   self.__after_pard_func,
+        }
+        # section end
+        self.__end_list = [
+        # section end
+        'mi<mk<sect-close',
+        'mi<mk<sect-start',
+        # table begin
+        'mi<mk<tabl-start',
+        # field block begin
+        'mi<mk<fldbk-end_',
+        'mi<mk<fldbkstart',
+        # cell end
+        'mi<mk<close_cell',
+        # item end
+        'mi<tg<item_end__',
+        # footnote end
+        'mi<mk<foot___clo',
+        'mi<mk<footnt-ope',
+        # heading end
+        'mi<mk<header-beg',
+        'mi<mk<header-end',
+        'mi<mk<head___clo',
+        # lists
+        'mi<tg<item_end__',
+        'mi<tg<item_end__',
+        'mi<mk<list_start'
+        # body close
+        #
+        # style-group
+        'mi<mk<style-grp_',
+        'mi<mk<style_grp_',
+        'mi<mk<style_gend',
+        'mi<mk<stylegend_',
+        # don't use
+        # 'mi<mk<body-close',
+        # 'mi<mk<par-in-fld',
+        # 'cw<tb<cell______',
+        # 'cw<tb<row-def___',
+        # 'cw<tb<row_______',
+        # 'mi<mk<sec-fd-beg',
+        ]
+        # <name>Normal<
+        self.__name_regex = re.compile(r'(<name>[^<]+)')
+        self.__border_regex = re.compile(r'border-paragraph')
+        self.__found_appt = 0
+        self.__line_num = 0
+        self.__border_regex  = re.compile(r'(<border-paragraph[^<]+|<border-for-every-paragraph[^<]+)')
+        self.__last_border_string = ''
+
+    def __in_pard_func(self, line):
+        """
+        Required:
+            line -- the line of current text.
+        Return:
+            Nothing
+        Logic:
+            You are in a list, but in the middle of a paragraph definition.
+            Don't do anything until you find the end of the paragraph definition.
+        """
+        if self.__token_info == 'mi<tg<close_____' \
+            and line[17:-1] == 'paragraph-definition':
+            self.__state = 'after_pard'
+        else:
+            self.__write_obj.write(line)
+
+    def __after_pard_func(self, line):
+        """
+        Required:
+            line -- the line of current text.
+        Return:
+            Nothing
+        Logic:
+        """
+        if self.__token_info == 'mi<tg<open-att__' \
+            and line[17:37] == 'paragraph-definition':
+            # found paragraph definition
+            self.__pard_after_par_def_func(line)
+        elif self.__token_info == 'mi<tg<close_____' \
+            and line[17:-1] == 'paragraph-definition':
+            sys.stderr.write('Wrong flag in __after_pard_func\n')
+            if self.__run_level > 2:
+                msg =  'wrong flag'
+                raise self.__bug_handler(msg)
+        elif self.__token_info in self.__end_list:
+            self.__write_obj.write('mi<tg<close_____<paragraph-definition\n')
+            self.__write_end_border_tag()
+            self.__write_obj.write(self.__list_chunk)
+            self.__list_chunk = ''
+            self.__state = 'default'
+            self.__write_obj.write(line)
+        else:
+            self.__list_chunk += line
+
+    def __close_pard_(self, line):
+        self.__write_obj.write(self.__list_chunk)
+        self.__write_obj.write('mi<tg<close_____<paragraph-definition\n')
+        self.__write_end_wrap()
+        self.__list_chunk = ''
+        self.__state = 'default'
+
+    def __pard_after_par_def_func(self, line):
+        """
+        Required:
+            line -- the line of current text.
+            id -- the id of the current list
+        Return:
+            Nothing
+        Logic:
+        """
+        is_border = self.__is_border_func(line)
+        if not is_border:
+            self.__write_obj.write('mi<tg<close_____<paragraph-definition\n')
+            self.__write_end_border_tag()
+            self.__write_obj.write(self.__list_chunk)
+            self.__write_obj.write(line)
+            self.__state = 'default'
+            self.__list_chunk = ''
+        else:
+            border_string, pard_string = self.__parse_pard_with_border(line)
+            if self.__last_border_string == border_string:
+                # just keep going
+                self.__write_obj.write('mi<tg<close_____<paragraph-definition\n')
+                self.__write_obj.write(self.__list_chunk)
+                self.__list_chunk = ''
+                self.__state = 'in_pard'
+                self.__write_obj.write(pard_string)
+            else:
+                # different name for the paragraph definition
+                self.__write_obj.write('mi<tg<close_____<paragraph-definition\n')
+                self.__write_end_border_tag()
+                self.__write_obj.write(self.__list_chunk)
+                self.__write_start_border_tag(border_string)
+                self.__write_obj.write(pard_string)
+                self.__state = 'in_pard'
+                self.__last_border_string = border_string
+                self.__list_chunk = ''
+
+    def __default_func(self, line):
+        """
+        Required:
+            self, line
+        Returns:
+            Nothing
+        Logic
+            Look for the start of a paragraph defintion. If one is found, check if
+            it contains a list-id. If it does, start a list. Change the state to
+            in_pard.
+            """
+        if self.__token_info == 'mi<tg<open-att__' \
+            and line[17:37] == 'paragraph-definition':
+            contains_border = self.__is_border_func(line)
+            if contains_border:
+                border_string, pard_string = self.__parse_pard_with_border(line)
+                self.__write_start_border_tag(border_string)
+                self.__write_obj.write(pard_string)
+                self.__last_border_string = border_string
+                self.__state = 'in_pard'
+            else:
+                self.__write_obj.write(line)
+        else:
+            self.__write_obj.write(line)
+
+    def __write_start_border_tag(self, the_string):
+        self.__write_obj.write('mi<mk<start-brdg\n')
+        self.__border_num += 1
+        num = '%04d' % self.__border_num
+        num_string = 's%s' % num
+        the_string += '<num>%s' % num_string
+        self.__write_obj.write('mi<tg<open-att__<border-group%s\n' % the_string)
+
+    def __write_end_border_tag(self):
+        self.__write_obj.write('mi<mk<end-brdg__\n')
+        self.__write_obj.write('mi<tg<close_____<border-group\n')
+
+    def __is_border_func(self, line):
+        line = re.sub(self.__name_regex, '', line)
+        index = line.find('border-paragraph')
+        if index > -1:
+            return 1
+        return 0
+
+    def __parse_pard_with_border(self, line):
+        border_string = ''
+        pard_string = ''
+        tokens = re.split(self.__border_regex, line)
+        for token in tokens:
+            if token[0:17] == '<border-paragraph':
+                border_string += token
+            else:
+                pard_string += token
+        return border_string, pard_string
+
+    def __write_pard_with_border(self, line):
+        border_string = ''
+        pard_string = ''
+        tokens = re.split(self.__border_regex, line)
+        for token in tokens:
+            if token[0:17] == '<border-paragraph':
+                border_string += token
+            else:
+                pard_string += token
+        self.__write_start_border_tag(border_string)
+        self.__write_obj.write(pard_string)
+
+    def __get_style_name(self, line):
+        if self.__token_info == 'mi<mk<style-name':
+            self.__style_name = line[17:-1]
+
+    def group_borders(self):
+        """
+        Required:
+            nothing
+        Returns:
+            original file will be changed
+        Logic:
+        """
+        self.__initiate_values()
+        read_obj = open_for_read(self.__file)
+        self.__write_obj = open_for_write(self.__write_to)
+        line_to_read = 1
+        while line_to_read:
+            line_to_read = read_obj.readline()
+            line = line_to_read
+            self.__token_info = line[:16]
+            self.__get_style_name(line)
+            action = self.__state_dict.get(self.__state)
+            action(line)
+        read_obj.close()
+        self.__write_obj.close()
+        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
+        if self.__copy:
+            copy_obj.copy_file(self.__write_to, "group_borders.data")
+        copy_obj.rename(self.__write_to, self.__file)
+        os.remove(self.__write_to)
--- a/ebook_converter/ebooks/rtf2xml/group_styles.py
+++ b/ebook_converter/ebooks/rtf2xml/group_styles.py
@@ -0,0 +1,252 @@
+from __future__ import unicode_literals, absolute_import, print_function, division
+#########################################################################
+#                                                                       #
+#                                                                       #
+#   copyright 2002 Paul Henry Tremblay                                  #
+#                                                                       #
+#   This program is distributed in the hope that it will be useful,     #
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of      #
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU    #
+#   General Public License for more details.                            #
+#                                                                       #
+#                                                                       #
+#########################################################################
+import sys, os,  re
+from calibre.ebooks.rtf2xml import copy
+from calibre.ptempfile import better_mktemp
+from . import open_for_read, open_for_write
+
+
+class GroupStyles:
+    """
+    Group runs of consecutive paragraph definitions that share a style name.
+
+    Without 'wrap', a paragraph definition that follows another one with the
+    same style name is merged into it (the closing tag of the first and the
+    opening tag of the second are dropped). With 'wrap', the definitions are
+    kept separate and every run of equally named definitions is enclosed in
+    a style-group element.
+    """
+
+    def __init__(self,
+            in_file,
+            bug_handler,
+            copy=None,
+            run_level=1,
+            wrap=0,
+            ):
+        """
+        Required:
+            'in_file' -- path of the token file; it is replaced by the
+            processed output
+            'bug_handler' -- called with a message, and the result raised,
+            when an unexpected closing tag is met and run_level is above 2
+        Optional:
+            'copy'-- whether to make a copy of result for debugging
+            'run_level' -- strictness; see 'bug_handler'
+            'wrap' -- whether to write style-group elements around each run
+        Returns:
+            nothing
+            """
+        self.__file = in_file
+        self.__bug_handler = bug_handler
+        self.__copy = copy
+        self.__run_level = run_level
+        self.__write_to =  better_mktemp()
+        self.__wrap = wrap
+
+    def __initiate_values(self):
+        """
+        Required:
+            Nothing
+        Return:
+            Nothing
+        Logic:
+            Reset the state machine. self.__end_list holds the tokens that
+            force the current run of paragraph definitions to end.
+        """
+        self.__state = "default"
+        self.__left_indent = 0
+        self.__list_type = 'not-defined'
+        self.__pard_def = ""
+        self.__all_lists = []
+        self.__list_chunk = ''
+        # Start with empty names so that a paragraph definition met before
+        # any style-name marker does not fail on a missing attribute.
+        self.__token_info = ''
+        self.__style_name = ''
+        self.__last_style_name = ''
+        self.__state_dict={
+        'default'           :   self.__default_func,
+        'in_pard'           :   self.__in_pard_func,
+        'after_pard'        :   self.__after_pard_func,
+        }
+        self.__end_list = [
+        # section end
+        'mi<mk<sect-close',
+        'mi<mk<sect-start',
+        # table begin
+        'mi<mk<tabl-start',
+        # field block begin
+        'mi<mk<fldbk-end_',
+        'mi<mk<fldbkstart',
+        # cell end
+        'mi<mk<close_cell',
+        # item end
+        'mi<tg<item_end__',
+        # footnote end
+        'mi<mk<foot___clo',
+        'mi<mk<footnt-ope',
+        # heading end
+        'mi<mk<header-beg',
+        'mi<mk<header-end',
+        'mi<mk<head___clo',
+        # lists
+        'mi<mk<list_start'
+        # body close
+        # don't use
+        # 'mi<mk<body-close',
+        # 'mi<mk<par-in-fld',
+        # 'cw<tb<cell______',
+        # 'cw<tb<row-def___',
+        # 'cw<tb<row_______',
+        # 'mi<mk<sec-fd-beg',
+        ]
+        self.__name_regex = re.compile(r'<name>')
+        self.__found_appt = 0
+        self.__line_num = 0
+
+    def __in_pard_func(self, line):
+        """
+        Required:
+            line -- the line of current text.
+        Return:
+            Nothing
+        Logic:
+            You are inside a paragraph definition. Copy every line to the
+            output until the closing tag of the paragraph definition is
+            found. The closing tag itself is held back (not written) and the
+            state changes to after_pard, which decides later whether the tag
+            is needed.
+        """
+        if self.__token_info == 'mi<tg<close_____' \
+            and line[17:-1] == 'paragraph-definition':
+            self.__state = 'after_pard'
+        else:
+            self.__write_obj.write(line)
+
+    def __after_pard_func(self, line):
+        """
+        Required:
+            line -- the line of current text.
+        Return:
+            Nothing
+        Logic:
+            A paragraph definition has just ended and its closing tag is
+            being held back. Buffer lines in self.__list_chunk until either a
+            new paragraph definition starts (handled by
+            __pard_after_par_def_func) or a token from self.__end_list is
+            found. In the latter case write the held-back closing tag, the
+            end of the wrapper, the buffered lines and the current line, and
+            return to the default state.
+        """
+        if self.__token_info == 'mi<tg<open-att__' \
+            and line[17:37] == 'paragraph-definition':
+            # found paragraph definition
+            self.__pard_after_par_def_func(line)
+        elif self.__token_info == 'mi<tg<close_____' \
+            and line[17:-1] == 'paragraph-definition':
+            # a second closing tag without an opening one is unexpected
+            sys.stderr.write('Wrong flag in __after_pard_func\n')
+            if self.__run_level > 2:
+                msg =  'wrong flag'
+                raise self.__bug_handler(msg)
+        elif self.__token_info in self.__end_list:
+            self.__write_obj.write('mi<tg<close_____<paragraph-definition\n')
+            self.__write_end_wrap()
+            self.__write_obj.write(self.__list_chunk)
+            self.__list_chunk = ''
+            self.__state = 'default'
+            self.__write_obj.write(line)
+        else:
+            self.__list_chunk += line
+
+    def __close_pard_(self, line):
+        """
+        Write the buffered lines, the closing paragraph-definition tag and
+        the end of the wrapper, then return to the default state.
+        NOTE(review): not called from anywhere in this class.
+        """
+        self.__write_obj.write(self.__list_chunk)
+        self.__write_obj.write('mi<tg<close_____<paragraph-definition\n')
+        self.__write_end_wrap()
+        self.__list_chunk = ''
+        self.__state = 'default'
+
+    def __write_start_wrap(self, name):
+        """
+        If wrapping is enabled, write the three lines that open a
+        style-group element for the style called name.
+        """
+        if self.__wrap:
+            self.__write_obj.write('mi<mk<style-grp_<%s\n' % name)
+            self.__write_obj.write('mi<tg<open-att__<style-group<name>%s\n' % name)
+            self.__write_obj.write('mi<mk<style_grp_<%s\n' % name)
+
+    def __write_end_wrap(self):
+        """
+        If wrapping is enabled, write the three lines that close a
+        style-group element.
+        """
+        if self.__wrap:
+            self.__write_obj.write('mi<mk<style_gend\n')
+            self.__write_obj.write('mi<tg<close_____<style-group\n')
+            self.__write_obj.write('mi<mk<stylegend_\n')
+
+    def __pard_after_par_def_func(self, line):
+        """
+        Required:
+            line -- the line of current text (the opening tag of a new
+            paragraph definition).
+        Return:
+            Nothing
+        Logic:
+            Compare the style name of the new paragraph definition with the
+            one of the previous definition.
+            Same name: flush the buffered lines and continue the run. When
+            wrapping, the previous definition is closed and the new opening
+            tag is written; otherwise both tags are dropped so the two
+            definitions become one.
+            Different name: close the previous definition and its wrapper,
+            flush the buffered lines and start a new run.
+        """
+        if self.__last_style_name == self.__style_name:
+            # just keep going
+            if self.__wrap:
+                self.__write_obj.write('mi<tg<close_____<paragraph-definition\n')
+            self.__write_obj.write(self.__list_chunk)
+            self.__list_chunk = ''
+            self.__state = 'in_pard'
+            if self.__wrap:
+                self.__write_obj.write(line)
+        else:
+            # different name for the paragraph definition
+            self.__write_obj.write('mi<tg<close_____<paragraph-definition\n')
+            self.__write_end_wrap()
+            self.__write_obj.write(self.__list_chunk)
+            self.__write_start_wrap(self.__style_name)
+            self.__write_obj.write(line)
+            self.__state = 'in_pard'
+            self.__last_style_name = self.__style_name
+            self.__list_chunk = ''
+
+    def __default_func(self, line):
+        """
+        Required:
+            self, line
+        Returns:
+            Nothing
+        Logic
+            Look for the start of a paragraph definition. If one is found,
+            remember the current style name as the name of the run, write the
+            start of the wrapper (if wrapping) and change the state to
+            in_pard. The line itself is always written.
+            """
+        if self.__token_info == 'mi<tg<open-att__' \
+            and line[17:37] == 'paragraph-definition':
+            self.__state = 'in_pard'
+            self.__last_style_name = self.__style_name
+            self.__write_start_wrap(self.__last_style_name)
+            self.__write_obj.write(line)
+        else:
+            self.__write_obj.write(line)
+
+    def __get_style_name(self, line):
+        """
+        If the current token is a style-name marker, remember the text after
+        the 17-character prefix (minus the final character of the line) as
+        the current style name.
+        """
+        if self.__token_info == 'mi<mk<style-name':
+            self.__style_name = line[17:-1]
+
+    def group_styles(self):
+        """
+        Required:
+            nothing
+        Returns:
+            nothing; the original file is replaced by the processed output
+        Logic:
+            Feed every line of the input file to the handler of the current
+            state. Afterwards optionally keep a debugging copy, move the
+            temporary output over the original file and delete the temporary
+            file.
+        """
+        self.__initiate_values()
+        # Context managers release both handles even if a handler raises.
+        with open_for_read(self.__file) as read_obj:
+            with open_for_write(self.__write_to) as self.__write_obj:
+                for line in read_obj:
+                    self.__token_info = line[:16]
+                    self.__get_style_name(line)
+                    action = self.__state_dict.get(self.__state)
+                    action(line)
+                if self.__state == 'after_pard':
+                    # The input ended while a closing tag was held back and
+                    # lines were buffered; write them out the same way an
+                    # end token would, instead of silently dropping them.
+                    self.__write_obj.write(
+                        'mi<tg<close_____<paragraph-definition\n')
+                    self.__write_end_wrap()
+                    self.__write_obj.write(self.__list_chunk)
+                    self.__list_chunk = ''
+                    self.__state = 'default'
+        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
+        if self.__copy:
+            copy_obj.copy_file(self.__write_to, "group_styles.data")
+        copy_obj.rename(self.__write_to, self.__file)
+        os.remove(self.__write_to)
--- a/ebook_converter/ebooks/rtf2xml/header.py
+++ b/ebook_converter/ebooks/rtf2xml/header.py
@@ -0,0 +1,261 @@
+from __future__ import unicode_literals, absolute_import, print_function, division
+#########################################################################
+#                                                                       #
+#                                                                       #
+#   copyright 2002 Paul Henry Tremblay                                  #
+#                                                                       #
+#   This program is distributed in the hope that it will be useful,     #
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of      #
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU    #
+#   General Public License for more details.                            #
+#                                                                       #
+#                                                                       #
+#########################################################################
+import sys, os
+
+from calibre.ebooks.rtf2xml import copy
+from calibre.ptempfile import better_mktemp
+from . import open_for_read, open_for_write
+
+
+class Header:
+    """
+    Two public methods are available. The first separates all of the headers
+    and footers from the body and puts them at the bottom of the text, where
+    they are easier to process. The second joins those headers and footers to
+    the proper places in the body.
+    """
+
+    def __init__(self,
+            in_file ,
+            bug_handler,
+            copy=None,
+            run_level=1,
+            ):
+        """
+        Required:
+            'in_file' -- path of the token file; it is replaced by the
+            processed output
+            'bug_handler' -- passed on to copy.Copy
+        Optional:
+            'copy' -- whether to make a copy of result for debugging
+            'run_level' -- accepted but not used by this class
+        """
+        self.__file = in_file
+        self.__bug_handler = bug_handler
+        self.__copy = copy
+        self.__write_to = better_mktemp()
+        self.__found_a_header = False
+
+    def __in_header_func(self, line):
+        """
+        Handle all tokens that are part of header
+        """
+        if self.__cb_count == self.__header_bracket_count:
+            # The closing bracket matching the one that opened the header:
+            # the bracket stays in the body, the header file gets its end
+            # markers.
+            self.__in_header = False
+            self.__write_obj.write(line)
+            self.__write_to_head_obj.write(
+            'mi<mk<head___clo\n'
+            'mi<tg<close_____<header-or-footer\n'
+            'mi<mk<header-clo\n')
+        else:
+            self.__write_to_head_obj.write(line)
+
+    def __found_header(self, line):
+        """
+        Found a header
+        """
+        # but this could be header or footer
+        self.__found_a_header = True
+        self.__in_header = True
+        self.__header_count += 1
+        # temporarily set this to zero so I can enter loop
+        self.__cb_count = 0
+        self.__header_bracket_count = self.__ob_count
+        # The body keeps a numbered placeholder; the header file gets the
+        # matching numbered opening marker.
+        self.__write_obj.write(
+        'mi<mk<header-ind<%04d\n' % self.__header_count)
+        self.__write_to_head_obj.write(
+        'mi<mk<header-ope<%04d\n' % self.__header_count)
+        info = line[6:16]
+        header_type = self.__head_dict.get(info)
+        if header_type:
+            self.__write_to_head_obj.write(
+                    'mi<tg<open-att__<header-or-footer<type>%s\n' % (header_type)
+                    )
+        else:
+            sys.stderr.write(
+            'module is header\n'
+            'method is __found_header\n'
+            'no dict entry\n'
+            'line is %s' % line)
+            self.__write_to_head_obj.write(
+                    'mi<tg<open-att__<header-or-footer<type>none\n'
+                    )
+
+    def __default_sep(self, line):
+        """
+        Handle all tokens that are not header tokens
+        """
+        if self.__token_info[3:5] == 'hf':
+            self.__found_header(line)
+        self.__write_obj.write(line)
+
+    def __initiate_sep_values(self):
+        """
+        initiate counters for separate_headers method.
+        """
+        self.__bracket_count=0
+        self.__ob_count = 0
+        self.__cb_count = 0
+        self.__header_bracket_count = 0
+        self.__in_header = False
+        self.__header_count = 0
+        # maps characters 6-16 of a header token to the type attribute
+        self.__head_dict = {
+            'head-left_'        :   'header-left',
+            'head-right'        :   'header-right',
+            'foot-left_'        :   'footer-left',
+            'foot-right'        :   'footer-right',
+            'head-first'        :   'header-first',
+            'foot-first'        :   'footer-first',
+            'header____'        :   'header',
+            'footer____'        :   'footer',
+        }
+
+    def separate_headers(self):
+        """
+        Separate all the headers and footers in an RTF file and put them at
+        the bottom, where they are easier to process.  Each time a header is
+        found, print all of its contents to a temporary file. Close both the
+        main and temporary file. Print the headers from the temporary file to
+        the bottom of the main file, between a 'header-beg' and a
+        'header-end' marker.
+        """
+        self.__initiate_sep_values()
+        self.__header_holder = better_mktemp()
+        with open_for_read(self.__file) as read_obj:
+            with open_for_write(self.__write_to) as self.__write_obj:
+                with open_for_write(self.__header_holder) as self.__write_to_head_obj:
+                    for line in read_obj:
+                        self.__token_info = line[:16]
+                        # keep track of opening and closing brackets
+                        if self.__token_info == 'ob<nu<open-brack':
+                            self.__ob_count = line[-5:-1]
+                        if self.__token_info == 'cb<nu<clos-brack':
+                            self.__cb_count = line[-5:-1]
+                        # In the middle of header text
+                        if self.__in_header:
+                            self.__in_header_func(line)
+                        # not in the middle of header text
+                        else:
+                            self.__default_sep(line)
+
+        with open_for_read(self.__header_holder) as read_obj:
+            with open_for_write(self.__write_to, append=True) as write_obj:
+                write_obj.write(
+                'mi<mk<header-beg\n')
+                for line in read_obj:
+                    write_obj.write(line)
+                write_obj.write(
+                'mi<mk<header-end\n')
+        os.remove(self.__header_holder)
+
+        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
+        if self.__copy:
+            copy_obj.copy_file(self.__write_to, "header_separate.data")
+        copy_obj.rename(self.__write_to, self.__file)
+        os.remove(self.__write_to)
+
+    def update_info(self, file, copy):
+        """
+        Unused method
+        """
+        self.__file = file
+        self.__copy = copy
+
+    def __get_head_body_func(self, line):
+        """
+        Process lines in main body and look for beginning of headers.
+        """
+        # mi<mk<footnt-end
+        if self.__token_info == 'mi<mk<header-beg':
+            self.__state = 'head'
+        else:
+            self.__write_obj.write(line)
+
+    def __get_head_head_func(self, line):
+        """
+        Copy headers and footers from bottom of file to a separate, temporary file.
+        """
+        if self.__token_info == 'mi<mk<header-end':
+            self.__state = 'body'
+        else:
+            self.__write_to_head_obj.write(line)
+
+    def __get_headers(self):
+        """
+        Private method to remove headers from main file.  Read one line from
+        the main file at a time. If the state is 'body', call on the private
+        __get_head_body_func. Otherwise, call on the __get_head_head_func.
+        These two functions do the work of separating the headers from the
+        body.
+        """
+        with open_for_read(self.__file) as read_obj:
+            with open_for_write(self.__write_to) as self.__write_obj:
+                with open_for_write(self.__header_holder) as self.__write_to_head_obj:
+                    for line in read_obj:
+                        self.__token_info = line[:16]
+                        if self.__state == 'body':
+                            self.__get_head_body_func(line)
+                        elif self.__state == 'head':
+                            self.__get_head_head_func(line)
+
+    def __get_head_from_temp(self, num):
+        """
+        Private method for joining headers and footers to body. This method
+        reads from the temporary file until the proper header marker is
+        found. It collects all the tokens until the end of the header, and
+        returns them as a string. If the file ends before the end marker (or
+        before the header is found at all), whatever was collected so far is
+        returned, so the caller always gets a string.
+        """
+        look_for = 'mi<mk<header-ope<' + num + '\n'
+        found_head = False
+        string_to_return = ''
+        for line in self.__read_from_head_obj:
+            if found_head:
+                if line == 'mi<mk<header-clo\n':
+                    return string_to_return
+                string_to_return += line
+            else:
+                if line == look_for:
+                    found_head = True
+        return string_to_return
+
+    def __join_from_temp(self):
+        """
+        Private method for rejoining headers to body.  Read from the
+        newly-created, temporary file that contains the body text but no
+        headers. Each time a header marker is found, call the private
+        method __get_head_from_temp(). This method will return a string to
+        print out to the third file.
+        If no header marker is found, simply print out the token (line).
+        All three files are closed when this method returns or raises.
+        """
+        with open_for_read(self.__header_holder) as self.__read_from_head_obj:
+            with open_for_write(self.__write_to2) as self.__write_obj:
+                with open_for_read(self.__write_to) as read_obj:
+                    for line in read_obj:
+                        if line[:16] == 'mi<mk<header-ind':
+                            line = self.__get_head_from_temp(line[17:-1])
+                        self.__write_obj.write(line)
+
+    def join_headers(self):
+        """
+        Join the headers from the bottom of the file and put them in their
+        former places.  First, remove the headers from the bottom of the
+        input file, outputting them to a temporary file. This creates two new
+        files, one without headers, and one of just headers. Open both
+        these files to read. When a marker is found in the main file, find the
+        corresponding marker in the header file. Output the mix of body and
+        headers to a third file.
+        Nothing is done if separate_headers found no header.
+        """
+        if not self.__found_a_header:
+            return
+        self.__write_to2 = better_mktemp()
+        self.__state = 'body'
+        self.__get_headers()
+        self.__join_from_temp()
+        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
+        # NOTE(review): the file that replaces the input is self.__write_to
+        # (the body WITHOUT the headers), not self.__write_to2 (the joined
+        # result). This is kept as it was; confirm whether dropping the
+        # headers from the output is intended before changing it.
+        if self.__copy:
+            copy_obj.copy_file(self.__write_to, "header_join.data")
+        copy_obj.rename(self.__write_to, self.__file)
+        os.remove(self.__write_to)
+        # the joined file was never removed before; do not leave it behind
+        os.remove(self.__write_to2)
+        os.remove(self.__header_holder)
--- a/ebook_converter/ebooks/rtf2xml/headings_to_sections.py
+++ b/ebook_converter/ebooks/rtf2xml/headings_to_sections.py
@@ -0,0 +1,227 @@
+from __future__ import unicode_literals, absolute_import, print_function, division
+#########################################################################
+#                                                                       #
+#                                                                       #
+#   copyright 2002 Paul Henry Tremblay                                  #
+#                                                                       #
+#   This program is distributed in the hope that it will be useful,     #
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of      #
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU    #
+#   General Public License for more details.                            #
+#                                                                       #
+#                                                                       #
+#########################################################################
+import os, re
+from calibre.ebooks.rtf2xml import copy
+from calibre.ptempfile import better_mktemp
+from . import open_for_read, open_for_write
+
+
+class HeadingsToSections:
+    """
+    Turn paragraphs styled 'heading 1' ... 'heading 9' into nested sections.
+
+    A section start is written in front of every heading style-name marker
+    found outside tables and lists; open sections of the same or a deeper
+    heading level are closed first. All open sections are closed at the end
+    of the body and at section boundaries of the document.
+    """
+
+    def __init__(self,
+            in_file,
+            bug_handler,
+            copy=None,
+            run_level=1,
+            ):
+        """
+        Required:
+            'in_file' -- path of the token file; it is replaced by the
+            processed output
+            'bug_handler' -- passed on to copy.Copy
+        Optional:
+            'copy'-- whether to make a copy of result for debugging
+            'run_level' -- accepted but not used by this class
+        Returns:
+            nothing
+            """
+        self.__file = in_file
+        self.__bug_handler = bug_handler
+        self.__copy = copy
+        self.__write_to = better_mktemp()
+
+    def __initiate_values(self):
+        """
+        Required:
+            Nothing
+        Return:
+            Nothing
+        Logic:
+            Reset the state machine. self.__end_list holds the tokens that
+            force all open sections to end. self.__all_sections is the stack
+            of heading levels of the open sections. self.__section_num holds
+            one counter per nesting depth; the first one counts the
+            'sect-start' markers of the document.
+        """
+        self.__state = "default"
+        self.__all_sections = []
+        self.__chunk = ''
+        self.__state_dict={
+        'default'           :   self.__default_func,
+        'in_table'          :   self.__in_table_func,
+        'in_list'           :   self.__in_list_func,
+        'after_body'        :   self.__after_body_func,
+        }
+        self.__list_depth = 0
+        self.__end_list = [
+        'mi<mk<body-close',
+        # changed 2004-04-26
+        # 'mi<mk<par-in-fld',
+        'mi<mk<sect-close',  # right before close of section
+        'mi<mk<sect-start',  # right before section start
+                            # this should be sect-close!
+        # 'mi<mk<header-beg',
+        # 'mi<mk<header-end',
+        # 'mi<mk<head___clo',
+        #
+        # changed 2004-04-26
+        # 'mi<mk<fldbk-end_',
+        # 'mi<mk<sec-fd-beg',
+        ]
+        self.__headings = [
+        'heading 1', 'heading 2', 'heading 3', 'heading 4',
+        'heading 5', 'heading 6', 'heading 7', 'heading 8',
+        'heading 9'
+        ]
+        self.__section_num = [0]
+        self.__id_regex = re.compile(r'\<list-id\>(\d+)')
+
+    def __close_lists(self):
+        """
+        Required:
+            Nothing
+        Return:
+            Nothing
+        Logic:
+            Reverse the list of dictionaries. Iterate through the list and
+            get the indent for each list. If the current indent is less than
+            or equal to the indent in the dictionary, close that level.
+            Keep track of how many levels you close. Reduce the list by that
+            many levels.
+            Reverse the list again.
+        NOTE(review): nothing in this class calls this method, and it uses
+        attributes and methods (__left_indent, __all_lists,
+        __write_end_item, __write_end_list) that this class never defines,
+        so calling it would fail. It looks like a leftover; confirm before
+        removing.
+        """
+        current_indent = self.__left_indent
+        self.__all_lists.reverse()
+        num_levels_closed = 0
+        for the_dict in self.__all_lists:
+            list_indent = the_dict.get('left-indent')
+            if current_indent <= list_indent:
+                self.__write_end_item()
+                self.__write_end_list()
+                num_levels_closed += 1
+        self.__all_lists = self.__all_lists[num_levels_closed:]
+        self.__all_lists.reverse()
+
+    def __close_sections(self, current_level):
+        """
+        Required:
+            current_level -- heading level of the section about to start
+            (0 closes every open section)
+        Return:
+            Nothing
+        Logic:
+            Walk the open sections from the innermost outwards and write an
+            end-of-section for each one whose level is greater than or equal
+            to current_level. Remove the closed sections from the stack.
+        """
+        self.__all_sections.reverse()
+        num_levels_closed = 0
+        for level in self.__all_sections:
+            if current_level <= level:
+                self.__write_end_section()
+                num_levels_closed += 1
+        self.__all_sections = self.__all_sections[num_levels_closed:]
+        self.__all_sections.reverse()
+
+    def __write_start_section(self, current_level, name):
+        """
+        Required:
+            current_level -- heading level (not used for the output)
+            name -- the heading style name, written as the section type
+        Return:
+            Nothing
+        Logic:
+            Write the section-start marker and the opening section tag. The
+            'num' attribute is the dotted form of all section counters, the
+            'level' attribute is the number of open sections, and
+            'num-in-level' is the counter at that depth.
+        """
+        section_num = '.'.join('%s' % the_num for the_num in self.__section_num)
+        level = len(self.__all_sections)
+        num_in_level = self.__section_num[level]
+        self.__write_obj.write(
+            'mi<mk<sect-start\n'
+                )
+        self.__write_obj.write(
+                'mi<tg<open-att__<section<num>%s<num-in-level>%s<level>%s'
+                '<type>%s\n'
+                % (section_num, num_in_level, level, name)
+                )
+
+    def __write_end_section(self):
+        """
+        Write the section-close marker and the closing section tag.
+        """
+        self.__write_obj.write('mi<mk<sect-close\n')
+        self.__write_obj.write('mi<tg<close_____<section\n')
+
+    def __default_func(self, line):
+        """
+        Required:
+            self, line
+        Returns:
+            Nothing
+        Logic
+            Outside tables and lists. A 'sect-start' marker increases the
+            first section counter and drops the deeper counters. A table or
+            list start changes the state so that headings inside them are
+            ignored. A token from self.__end_list closes all open sections.
+            A style-name marker naming one of the heading styles starts a
+            new section. After 'body-close' the rest of the file is only
+            copied. The line itself is always written last.
+            """
+        if self.__token_info == 'mi<mk<sect-start':
+            self.__section_num[0] += 1
+            self.__section_num = self.__section_num[0:1]
+        if self.__token_info == 'mi<mk<tabl-start':
+            self.__state = 'in_table'
+        elif self.__token_info == 'mi<mk<list_start':
+            self.__state = 'in_list'
+            self.__list_depth += 1
+        elif self.__token_info in self.__end_list:
+            self.__close_sections(0)
+        elif self.__token_info == 'mi<mk<style-name':
+            name = line[17:-1]
+            if name in self.__headings:
+                self.__handle_heading(name)
+        if self.__token_info == 'mi<mk<body-close':
+            self.__state = 'after_body'
+        self.__write_obj.write(line)
+
+    def __handle_heading(self, name):
+        """
+        Required:
+            name -- one of the names in self.__headings
+        Return:
+            Nothing
+        Logic:
+            Close the open sections of the same or a deeper heading level,
+            push the new level, bring the list of section counters to the
+            new depth (appending a counter starting at 1, or increasing the
+            existing one) and write the start of the new section.
+        """
+        num = self.__headings.index(name) + 1
+        self.__close_sections(num)
+        self.__all_sections.append(num)
+        level_depth = len(self.__all_sections) + 1
+        self.__section_num = self.__section_num[:level_depth]
+        if len(self.__section_num) < level_depth:
+            self.__section_num.append(1)
+        else:
+            self.__section_num[-1] += 1
+        self.__write_start_section(num, name)
+
+    def __in_table_func(self, line):
+        """
+        Copy lines unchanged until the end of the table is found.
+        """
+        if self.__token_info == 'mi<mk<table-end_':
+            self.__state = 'default'
+        self.__write_obj.write(line)
+
+    def __in_list_func(self, line):
+        """
+        Copy lines unchanged, counting nested lists, until the outermost
+        list is closed.
+        """
+        if self.__token_info == 'mi<mk<list_close':
+            self.__list_depth -= 1
+        elif self.__token_info == 'mi<mk<list_start':
+            self.__list_depth += 1
+        if self.__list_depth == 0:
+            self.__state = 'default'
+        self.__write_obj.write(line)
+
+    def __after_body_func(self, line):
+        """
+        Copy lines unchanged; nothing after the body is examined.
+        """
+        self.__write_obj.write(line)
+
+    def make_sections(self):
+        """
+        Required:
+            nothing
+        Returns:
+            nothing; the original file is replaced by the processed output
+        Logic:
+            Feed every line of the input file to the handler of the current
+            state. Afterwards optionally keep a debugging copy, move the
+            temporary output over the original file and delete the temporary
+            file.
+        """
+        self.__initiate_values()
+        # Context managers release both handles even if a handler raises.
+        with open_for_read(self.__file) as read_obj:
+            with open_for_write(self.__write_to) as self.__write_obj:
+                for line in read_obj:
+                    self.__token_info = line[:16]
+                    action = self.__state_dict.get(self.__state)
+                    action(line)
+        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
+        if self.__copy:
+            copy_obj.copy_file(self.__write_to, "sections_to_headings.data")
+        copy_obj.rename(self.__write_to, self.__file)
+        os.remove(self.__write_to)
--- a/ebook_converter/ebooks/rtf2xml/hex_2_utf8.py
+++ b/ebook_converter/ebooks/rtf2xml/hex_2_utf8.py
@@ -0,0 +1,589 @@
+from __future__ import absolute_import, division, print_function, unicode_literals
+#########################################################################
+#                                                                       #
+#                                                                       #
+#   copyright 2002 Paul Henry Tremblay                                  #
+#                                                                       #
+#   This program is distributed in the hope that it will be useful,     #
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of      #
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU    #
+#   General Public License for more details.                            #
+#                                                                       #
+#                                                                       #
+#########################################################################
+import sys, os, io
+
+from calibre.ebooks.rtf2xml import get_char_map, copy
+from calibre.ebooks.rtf2xml.char_set import char_set
+from calibre.ptempfile import better_mktemp
+from polyglot.builtins import unicode_type
+
+from . import open_for_read, open_for_write
+
+
+class Hex2Utf8:
+    """
+    Convert Microsoft hexidecimal numbers to utf-8
+    """
+
+    def __init__(self,
+            in_file,
+            area_to_convert,
+            char_file,
+            default_char_map,
+            bug_handler,
+            invalid_rtf_handler,
+            copy=None,
+            temp_dir=None,
+            symbol=None,
+            wingdings=None,
+            caps=None,
+            convert_caps=None,
+            dingbats=None,
+            run_level=1,
+            ):
+        """
+        Required:
+            'file'
+            'area_to_convert'--the area of file to convert
+            'char_file'--the file containing the character mappings
+            'default_char_map'--name of default character map
+        Optional:
+            'copy'-- whether to make a copy of result for debugging
+            'temp_dir' --where to output temporary results (default is
+            directory from which the script is run.)
+            'symbol'--whether to load the symbol character map
+            'winddings'--whether to load the wingdings character map
+            'caps'--whether to load the caps characer map
+            'convert_to_caps'--wether to convert caps to utf-8
+        Returns:
+            nothing
+        """
+        self.__file = in_file
+        self.__copy = copy
+        if area_to_convert not in ('preamble', 'body'):
+            msg = (
+            'Developer error! Wrong flag.\n'
+            'in module "hex_2_utf8.py\n'
+            '"area_to_convert" must be "body" or "preamble"\n'
+            )
+            raise self.__bug_handler(msg)
+        self.__char_file = char_file
+        self.__area_to_convert = area_to_convert
+        self.__default_char_map = default_char_map
+        self.__symbol = symbol
+        self.__wingdings = wingdings
+        self.__dingbats = dingbats
+        self.__caps = caps
+        self.__convert_caps = 0
+        self.__convert_symbol = 0
+        self.__convert_wingdings = 0
+        self.__convert_zapf = 0
+        self.__run_level = run_level
+        self.__write_to = better_mktemp()
+        self.__bug_handler = bug_handler
+        self.__invalid_rtf_handler = invalid_rtf_handler
+
+    def update_values(self,
+                        file,
+                        area_to_convert,
+                        char_file,
+                        convert_caps,
+                        convert_symbol,
+                        convert_wingdings,
+                        convert_zapf,
+                        copy=None,
+                        temp_dir=None,
+                        symbol=None,
+                        wingdings=None,
+                        caps=None,
+                        dingbats=None,
+                    ):
+        """
+        Required:
+            'file'
+            'area_to_convert'--the area of file to convert
+            'char_file'--the file containing the character mappings
+        Optional:
+            'copy'-- whether to make a copy of result for debugging
+            'temp_dir' --where to output temporary results (default is
+            directory from which the script is run.)
+            'symbol'--whether to load the symbol character map
+            'winddings'--whether to load the wingdings character map
+            'caps'--whether to load the caps characer map
+            'convert_to_caps'--wether to convert caps to utf-8
+        Returns:
+            nothing
+            """
+        self.__file=file
+        self.__copy = copy
+        if area_to_convert not in ('preamble', 'body'):
+            msg = (
+            'in module "hex_2_utf8.py\n'
+            '"area_to_convert" must be "body" or "preamble"\n'
+            )
+            raise self.__bug_handler(msg)
+        self.__area_to_convert = area_to_convert
+        self.__symbol = symbol
+        self.__wingdings = wingdings
+        self.__dingbats = dingbats
+        self.__caps = caps
+        self.__convert_caps = convert_caps
+        self.__convert_symbol = convert_symbol
+        self.__convert_wingdings = convert_wingdings
+        self.__convert_zapf = convert_zapf
+        # new!
+        # no longer try to convert these
+        # self.__convert_symbol = 0
+        # self.__convert_wingdings = 0
+        # self.__convert_zapf = 0
+
+    def __initiate_values(self):
+        """
+        Required:
+            Nothing
+        Set values, including those for the dictionaries.
+        The file that contains the maps is broken down into many different
+        sets. For example, for the Symbol font, there is the standard part for
+        hexidecimal numbers, and the part for Microsoft characters. Read
+        each part in, and then combine them.
+        """
+        # the default encoding system, the lower map for characters 0 through
+        # 128, and the encoding system for Microsoft characters.
+        # New on 2004-05-8: the self.__char_map is not in directory with other
+        # modules
+        self.__char_file = io.StringIO(char_set)
+        char_map_obj =  get_char_map.GetCharMap(
+                char_file=self.__char_file,
+                bug_handler=self.__bug_handler,
+                )
+        up_128_dict = char_map_obj.get_char_map(map=self.__default_char_map)
+        bt_128_dict = char_map_obj.get_char_map(map='bottom_128')
+        ms_standard_dict = char_map_obj.get_char_map(map='ms_standard')
+        self.__def_dict = {}
+        self.__def_dict.update(up_128_dict)
+        self.__def_dict.update(bt_128_dict)
+        self.__def_dict.update(ms_standard_dict)
+        self.__current_dict = self.__def_dict
+        self.__current_dict_name = 'default'
+        self.__in_caps = 0
+        self.__special_fonts_found = 0
+        if self.__symbol:
+            symbol_base_dict = char_map_obj.get_char_map(map='SYMBOL')
+            ms_symbol_dict = char_map_obj.get_char_map(map='ms_symbol')
+            self.__symbol_dict = {}
+            self.__symbol_dict.update(symbol_base_dict)
+            self.__symbol_dict.update(ms_symbol_dict)
+        if self.__wingdings:
+            wingdings_base_dict = char_map_obj.get_char_map(map='wingdings')
+            ms_wingdings_dict = char_map_obj.get_char_map(map='ms_wingdings')
+            self.__wingdings_dict = {}
+            self.__wingdings_dict.update(wingdings_base_dict)
+            self.__wingdings_dict.update(ms_wingdings_dict)
+        if self.__dingbats:
+            dingbats_base_dict = char_map_obj.get_char_map(map='dingbats')
+            ms_dingbats_dict = char_map_obj.get_char_map(map='ms_dingbats')
+            self.__dingbats_dict = {}
+            self.__dingbats_dict.update(dingbats_base_dict)
+            self.__dingbats_dict.update(ms_dingbats_dict)
+        # load dictionary for caps, and make a string for the replacement
+        self.__caps_uni_dict = char_map_obj.get_char_map(map='caps_uni')
+        # # print self.__caps_uni_dict
+        # don't think I'll need this
+        # keys = self.__caps_uni_dict.keys()
+        # self.__caps_uni_replace = '|'.join(keys)
+        self.__preamble_state_dict = {
+            'preamble'      :       self.__preamble_func,
+            'body'          :       self.__body_func,
+            'mi<mk<body-open_'  :   self.__found_body_func,
+            'tx<hx<__________'  :   self.__hex_text_func,
+            }
+        self.__body_state_dict = {
+            'preamble'      :       self.__preamble_for_body_func,
+            'body'          :       self.__body_for_body_func,
+            }
+        self.__in_body_dict = {
+            'mi<mk<body-open_'  :   self.__found_body_func,
+            'tx<ut<__________'  :   self.__utf_to_caps_func,
+            'tx<hx<__________'  :   self.__hex_text_func,
+            'tx<mc<__________'  :   self.__hex_text_func,
+            'tx<nu<__________'  :   self.__text_func,
+            'mi<mk<font______'  :   self.__start_font_func,
+            'mi<mk<caps______'  :   self.__start_caps_func,
+            'mi<mk<font-end__'  :   self.__end_font_func,
+            'mi<mk<caps-end__'  :   self.__end_caps_func,
+        }
+        self.__caps_list = ['false']
+        self.__font_list = ['not-defined']
+
+    def __hex_text_func(self, line):
+        """
+        Required:
+            'line' -- the line
+        Logic:
+            get the hex_num and look it up in the default dictionary. If the
+            token is in the dictionary, then check if the value starts with a
+            "&". If it does, then tag the result as utf text. Otherwise, tag it
+            as normal text.
+            If the hex_num is not in the dictionary, then a mistake has been
+            made.
+            """
+        hex_num = line[17:-1]
+        converted = self.__current_dict.get(hex_num)
+        if converted is not None:
+            # tag as utf-8
+            if converted[0:1] == "&":
+                font = self.__current_dict_name
+                if self.__convert_caps\
+                and self.__caps_list[-1] == 'true'\
+                and font not in ('Symbol', 'Wingdings', 'Zapf Dingbats'):
+                    converted = self.__utf_token_to_caps_func(converted)
+                self.__write_obj.write(
+                'tx<ut<__________<%s\n' % converted
+                )
+            # tag as normal text
+            else:
+                font = self.__current_dict_name
+                if self.__convert_caps\
+                and self.__caps_list[-1] == 'true'\
+                and font not in ('Symbol', 'Wingdings', 'Zapf Dingbats'):
+                    converted = converted.upper()
+                self.__write_obj.write(
+                'tx<nu<__________<%s\n' % converted
+                )
+        # error
+        else:
+            token = hex_num.replace("'", '')
+            the_num = 0
+            if token:
+                the_num = int(token, 16)
+            if the_num > 10:
+                self.__write_obj.write('mi<tg<empty-att_<udef_symbol<num>%s<description>not-in-table\n' %
+                    hex_num)
+                if self.__run_level > 4:
+                    # msg = 'no dictionary entry for %s\n'
+                    # msg += 'the hexidecimal num is "%s"\n' % (hex_num)
+                    # msg += 'dictionary is %s\n' % self.__current_dict_name
+                    msg = 'Character "&#x%s;" does not appear to be valid (or is a control character)\n' % token
+                    raise self.__bug_handler(msg)
+
+    def __found_body_func(self, line):
+        self.__state = 'body'
+        self.__write_obj.write(line)
+
+    def __body_func(self, line):
+        """
+        When parsing preamble
+        """
+        self.__write_obj.write(line)
+
+    def __preamble_func(self, line):
+        action = self.__preamble_state_dict.get(self.__token_info)
+        if action is not None:
+            action(line)
+        else:
+            self.__write_obj.write(line)
+
+    def __convert_preamble(self):
+        self.__state = 'preamble'
+        with open_for_write(self.__write_to) as self.__write_obj:
+            with open_for_read(self.__file) as read_obj:
+                for line in read_obj:
+                    self.__token_info = line[:16]
+                    action = self.__preamble_state_dict.get(self.__state)
+                    if action is None:
+                        sys.stderr.write('error no state found in hex_2_utf8',
+                        self.__state
+                        )
+                    action(line)
+        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
+        if self.__copy:
+            copy_obj.copy_file(self.__write_to, "preamble_utf_convert.data")
+        copy_obj.rename(self.__write_to, self.__file)
+        os.remove(self.__write_to)
+
+    def __preamble_for_body_func(self, line):
+        """
+        Required:
+            line -- line to parse
+        Returns:
+            nothing
+        Logic:
+            Used when parsing the body.
+        """
+        if self.__token_info == 'mi<mk<body-open_':
+            self.__found_body_func(line)
+        self.__write_obj.write(line)
+
+    def __body_for_body_func(self, line):
+        """
+        Required:
+            line -- line to parse
+        Returns:
+            nothing
+        Logic:
+            Used when parsing the body.
+        """
+        action = self.__in_body_dict.get(self.__token_info)
+        if action is not None:
+            action(line)
+        else:
+            self.__write_obj.write(line)
+
+    def __start_font_func(self, line):
+        """
+        Required:
+            line -- line to parse
+        Returns:
+            nothing
+        Logic:
+            add font face to font_list
+        """
+        face = line[17:-1]
+        self.__font_list.append(face)
+        if face == 'Symbol' and self.__convert_symbol:
+            self.__current_dict_name = 'Symbol'
+            self.__current_dict = self.__symbol_dict
+        elif face == 'Wingdings' and self.__convert_wingdings:
+            self.__current_dict_name = 'Wingdings'
+            self.__current_dict = self.__wingdings_dict
+        elif face == 'Zapf Dingbats' and self.__convert_zapf:
+            self.__current_dict_name = 'Zapf Dingbats'
+            self.__current_dict = self.__dingbats_dict
+        else:
+            self.__current_dict_name = 'default'
+            self.__current_dict = self.__def_dict
+
+    def __end_font_func(self, line):
+        """
+        Required:
+            line -- line to parse
+        Returns:
+            nothing
+        Logic:
+            pop font_list
+        """
+        if len(self.__font_list) > 1:
+            self.__font_list.pop()
+        else:
+            sys.stderr.write('module is hex_2_utf8\n')
+            sys.stderr.write('method is end_font_func\n')
+            sys.stderr.write('self.__font_list should be greater than one?\n')
+        face = self.__font_list[-1]
+        if face == 'Symbol' and self.__convert_symbol:
+            self.__current_dict_name = 'Symbol'
+            self.__current_dict = self.__symbol_dict
+        elif face == 'Wingdings' and self.__convert_wingdings:
+            self.__current_dict_name = 'Wingdings'
+            self.__current_dict = self.__wingdings_dict
+        elif face == 'Zapf Dingbats' and self.__convert_zapf:
+            self.__current_dict_name = 'Zapf Dingbats'
+            self.__current_dict = self.__dingbats_dict
+        else:
+            self.__current_dict_name = 'default'
+            self.__current_dict = self.__def_dict
+
+    def __start_special_font_func_old(self, line):
+        """
+        Required:
+            line -- line (not read by this method)
+        Returns:
+            nothing
+        Logic:
+            Legacy handler: no dispatch table in this class points to it.
+            Depending on the current token it appends the symbol,
+            wingdings or dingbats table to self.__current_dict, counts the
+            special font and records its name.
+            NOTE(review): append() treats self.__current_dict as a list,
+            but __initiate_values() binds it to a plain dictionary, so
+            this method looks unusable as it stands -- confirm before
+            reviving it.
+        """
+        # for error checking
+        if self.__token_info == 'mi<mk<font-symbo':
+            self.__current_dict.append(self.__symbol_dict)
+            self.__special_fonts_found += 1
+            self.__current_dict_name = 'Symbol'
+        elif self.__token_info == 'mi<mk<font-wingd':
+            self.__special_fonts_found += 1
+            self.__current_dict.append(self.__wingdings_dict)
+            self.__current_dict_name = 'Wingdings'
+        elif self.__token_info == 'mi<mk<font-dingb':
+            self.__current_dict.append(self.__dingbats_dict)
+            self.__special_fonts_found += 1
+            self.__current_dict_name = 'Zapf Dingbats'
+
+    def __end_special_font_func(self, line):
+        """
+        Required:
+            line --line to parse (not read by this method)
+        Returns:
+            nothing
+        Logic:
+            Legacy handler: no dispatch table in this class points to it.
+            Pop the last entry of self.__current_dict, which should be a
+            special font; if fewer than two entries are left, complain on
+            stderr instead. The count of special fonts is lowered either
+            way.
+            NOTE(review): pop() without an argument treats
+            self.__current_dict as a list, but __initiate_values() binds
+            it to a plain dictionary -- confirm before reviving this.
+        """
+        if len(self.__current_dict) < 2:
+            sys.stderr.write('module is hex_2_utf 8\n')
+            sys.stderr.write('method is __end_special_font_func\n')
+            sys.stderr.write('less than two dictionaries --can\'t pop\n')
+            self.__special_fonts_found -= 1
+        else:
+            self.__current_dict.pop()
+            self.__special_fonts_found -= 1
+            # NOTE(review): the other methods of this class read
+            # self.__current_dict_name; self.__dict_name is only set here
+            # -- possibly a typo, confirm.
+            self.__dict_name = 'default'
+
+    def __start_caps_func_old(self, line):
+        """
+        Required:
+            line -- line to parse (not read by this method)
+        Returns:
+            nothing
+        Logic:
+            Legacy handler: no dispatch table in this class points to it
+            (the tables use __start_caps_func). A marker for the start of
+            caps has been found, so set self.__in_caps to 1.
+        """
+        self.__in_caps = 1
+
+    def __start_caps_func(self, line):
+        """
+        Required:
+            line -- line to parse
+        Returns:
+            nothing
+        Logic:
+            A marker that marks the start of caps has been found. Set
+            self.__in_caps to 1
+        """
+        self.__in_caps = 1
+        value = line[17:-1]
+        self.__caps_list.append(value)
+
+    def __end_caps_func(self, line):
+        """
+        Required:
+            line -- line to parse
+        Returns:
+            nothing
+        Logic:
+            A marker that marks the end of caps has been found.
+            set self.__in_caps to 0
+        """
+        if len(self.__caps_list) > 1:
+            self.__caps_list.pop()
+        else:
+            sys.stderr.write('Module is hex_2_utf8\n'
+            'method is __end_caps_func\n'
+            'caps list should be more than one?\n')  # self.__in_caps not set
+
+    def __text_func(self, line):
+        """
+        Required:
+            line -- line to parse
+        Returns:
+            nothing
+        Logic:
+            if in caps, convert. Otherwise, print out.
+        """
+        text = line[17:-1]
+        # print line
+        if self.__current_dict_name in ('Symbol', 'Wingdings', 'Zapf Dingbats'):
+            the_string = ''
+            for letter in text:
+                hex_num = hex(ord(letter))
+                hex_num = unicode_type(hex_num)
+                hex_num = hex_num.upper()
+                hex_num = hex_num[2:]
+                hex_num = '\'%s' % hex_num
+                converted = self.__current_dict.get(hex_num)
+                if converted is None:
+                    sys.stderr.write('module is hex_2_ut8\nmethod is __text_func\n')
+                    sys.stderr.write('no hex value for "%s"\n' % hex_num)
+                else:
+                    the_string += converted
+            self.__write_obj.write('tx<nu<__________<%s\n' % the_string)
+            # print the_string
+        else:
+            if self.__caps_list[-1] == 'true' \
+                and self.__convert_caps\
+                and self.__current_dict_name not in ('Symbol', 'Wingdings', 'Zapf Dingbats'):
+                text = text.upper()
+            self.__write_obj.write('tx<nu<__________<%s\n' % text)
+
+    def __utf_to_caps_func(self, line):
+        """
+        Required:
+            line -- line to parse
+        returns
+            nothing
+        Logic
+            Get the text, and use another method to convert
+        """
+        utf_text = line[17:-1]
+        if self.__caps_list[-1] == 'true' and self.__convert_caps:
+            # utf_text = utf_text.upper()
+            utf_text = self.__utf_token_to_caps_func(utf_text)
+        self.__write_obj.write('tx<ut<__________<%s\n' % utf_text)
+
+    def __utf_token_to_caps_func(self, char_entity):
+        """
+        Required:
+            utf_text -- such as &xxx;
+        Returns:
+            token converted to the capital equivalent
+        Logic:
+            RTF often stores text in the improper values. For example, a
+            capital umlaut o (?), is stores as ?. This function swaps the
+            case by looking up the value in a dictionary.
+        """
+        hex_num = char_entity[3:]
+        length = len(hex_num)
+        if length == 3:
+            hex_num = '00%s' % hex_num
+        elif length == 4:
+            hex_num = '0%s' % hex_num
+        new_char_entity = '&#x%s' % hex_num
+        converted = self.__caps_uni_dict.get(new_char_entity)
+        if not converted:
+            # bullets and other entities dont' have capital equivelents
+            return char_entity
+        else:
+            return converted
+
+    def __convert_body(self):
+        self.__state = 'body'
+        with open_for_read(self.__file) as read_obj:
+            with open_for_write(self.__write_to) as self.__write_obj:
+                for line in read_obj:
+                    self.__token_info = line[:16]
+                    action = self.__body_state_dict.get(self.__state)
+                    if action is None:
+                        sys.stderr.write('error no state found in hex_2_utf8',
+                        self.__state
+                        )
+                    action(line)
+        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
+        if self.__copy:
+            copy_obj.copy_file(self.__write_to, "body_utf_convert.data")
+        copy_obj.rename(self.__write_to, self.__file)
+        os.remove(self.__write_to)
+
+    def convert_hex_2_utf8(self):
+        self.__initiate_values()
+        if self.__area_to_convert == 'preamble':
+            self.__convert_preamble()
+        else:
+            self.__convert_body()
+
+
+"""
+how to swap case for non-capitals
+my_string.swapcase()
+An example of how to use a hash for the caps function
+(but I shouldn't need this, since utf text is separate
+ from regular text?)
+sub_dict = {
+    "&#x0430;"   : "some other value"
+    }
+def my_sub_func(matchobj):
+    info =  matchobj.group(0)
+    value = sub_dict.get(info)
+    return value
+    return "f"
+line = "&#x0430; more text"
+reg_exp = re.compile(r'(?P<name>&#x0430;|&#x0431;)')
+line2 = re.sub(reg_exp, my_sub_func, line)
+print line2
+"""
--- a/ebook_converter/ebooks/rtf2xml/info.py
+++ b/ebook_converter/ebooks/rtf2xml/info.py
@@ -0,0 +1,285 @@
+from __future__ import unicode_literals, absolute_import, print_function, division
+#########################################################################
+#                                                                       #
+#                                                                       #
+#   copyright 2002 Paul Henry Tremblay                                  #
+#                                                                       #
+#   This program is distributed in the hope that it will be useful,     #
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of      #
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU    #
+#   General Public License for more details.                            #
+#                                                                       #
+#                                                                       #
+#########################################################################
+import sys, os, re
+
+from calibre.ebooks.rtf2xml import copy
+from calibre.ptempfile import better_mktemp
+from . import open_for_read, open_for_write
+
+
+class Info:
+    """
+    Make tags for document-information
+    """
+
+    def __init__(self,
+            in_file,
+            bug_handler,
+            copy=None,
+            run_level=1,
+            ):
+        """
+        Required:
+            'file'--file to parse
+        Optional:
+            'copy'-- whether to make a copy of result for debugging
+            'temp_dir' --where to output temporary results (default is
+            directory from which the script is run.)
+        Returns:
+            nothing
+            """
+        self.__file = in_file
+        self.__bug_handler = bug_handler
+        self.__copy = copy
+        self.__run_level = run_level
+        self.__write_to = better_mktemp()
+
+    def __initiate_values(self):
+        """
+        Initiate all values.
+        """
+        self.__text_string = ''
+        self.__state = 'before_info_table'
+        self.rmspace = re.compile(r'\s+')
+        self.__state_dict = {
+        'before_info_table': self.__before_info_table_func,
+        'after_info_table': self.__after_info_table_func,
+        'in_info_table'    : self.__in_info_table_func,
+        'collect_text'      : self.__collect_text_func,
+        'collect_tokens'      : self.__collect_tokens_func,
+        }
+        self.__info_table_dict = {
+        'cw<di<title_____'  : (self.__found_tag_with_text_func, 'title'),
+        'cw<di<author____'  : (self.__found_tag_with_text_func, 'author'),
+        'cw<di<operator__'  : (self.__found_tag_with_text_func, 'operator'),
+        'cw<di<manager___'  : (self.__found_tag_with_text_func, 'manager'),
+        'cw<di<company___'  : (self.__found_tag_with_text_func, 'company'),
+        'cw<di<keywords__'  : (self.__found_tag_with_text_func, 'keywords'),
+        'cw<di<category__'  : (self.__found_tag_with_text_func, 'category'),
+        'cw<di<doc-notes_'  : (self.__found_tag_with_text_func, 'doc-notes'),
+        'cw<di<subject___'  : (self.__found_tag_with_text_func, 'subject'),
+        'cw<di<linkbase__'  : (self.__found_tag_with_text_func, 'hyperlink-base'),
+
+        'cw<di<create-tim'  : (self.__found_tag_with_tokens_func, 'creation-time'),
+        'cw<di<revis-time'  : (self.__found_tag_with_tokens_func, 'revision-time'),
+        'cw<di<print-time'  : (self.__found_tag_with_tokens_func, 'printing-time'),
+        'cw<di<backuptime'  : (self.__found_tag_with_tokens_func, 'backup-time'),
+
+        'cw<di<num-of-wor'  : (self.__single_field_func, 'number-of-words'),
+        'cw<di<num-of-chr'  : (self.__single_field_func, 'number-of-characters'),
+        'cw<di<numofchrws'  : (self.__single_field_func, 'number-of-characters-without-space'),
+        'cw<di<num-of-pag'  : (self.__single_field_func, 'number-of-pages'),
+        'cw<di<version___'  : (self.__single_field_func, 'version'),
+        'cw<di<edit-time_'  : (self.__single_field_func, 'editing-time'),
+        'cw<di<intern-ver'  : (self.__single_field_func, 'internal-version-number'),
+        'cw<di<internalID'  : (self.__single_field_func, 'internal-id-number'),
+        }
+        self.__token_dict = {
+        'year______'        : 'year',
+        'month_____'        : 'month',
+        'day_______'        : 'day',
+        'minute____'        : 'minute',
+        'second____'        : 'second',
+        'revis-time'        : 'revision-time',
+        'create-tim'        : 'creation-time',
+        'edit-time_'        : 'editing-time',
+        'print-time'        : 'printing-time',
+        'backuptime'        : 'backup-time',
+        'num-of-wor'        : 'number-of-words',
+        'num-of-chr'        : 'number-of-characters',
+        'numofchrws'        : 'number-of-characters-without-space',
+        'num-of-pag'        : 'number-of-pages',
+        'version___'        : 'version',
+        'intern-ver'        : 'internal-version-number',
+        'internalID'        : 'internal-id-number',
+        }
+
+    def __before_info_table_func(self, line):
+        """
+        Required:
+            line -- the line to parse
+        Returns:
+            nothing
+        Logic:
+            Check for the beginning of the informatin table. When found, set
+            the state to the information table. Always write the line.
+        """
+        if self.__token_info == 'mi<mk<doc-in-beg':
+            self.__state = 'in_info_table'
+        self.__write_obj.write(line)
+
+    def __in_info_table_func(self, line):
+        """
+        Requires:
+            line -- line to parse
+        Returns:
+            nothing.
+        Logic:
+            Check for the end of information. If not found, check if the
+            token has a special value in the info table dictionay. If it
+            does, execute that function.
+            Otherwise, output the line to the file.
+        """
+        if self.__token_info == 'mi<mk<doc-in-end':
+            self.__state = 'after_info_table'
+        else:
+            action, tag = self.__info_table_dict.get(self.__token_info, (None, None))
+            if action:
+                action(line, tag)
+            else:
+                self.__write_obj.write(line)
+
+    def __found_tag_with_text_func(self, line, tag):
+        """
+        Requires:
+            line -- line to parse
+            tag --what kind of line
+        Returns:
+            nothing
+        Logic:
+            This function marks the beginning of informatin fields that have
+            text that must be collected.  Set the type of information field
+            with the tag option. Set the state to collecting text
+        """
+        self.__tag = tag
+        self.__state = 'collect_text'
+
+    def __collect_text_func(self, line):
+        """
+        Requires:
+            line -- line to parse
+        Returns:
+            nothing
+        Logic:
+            If the end of the information field is found, write the text
+            string to the file.
+            Otherwise, if the line contains text, add it to the text string.
+        """
+        if self.__token_info == 'mi<mk<docinf-end':
+            self.__state = 'in_info_table'
+            # Don't print empty tags
+            if len(self.rmspace.sub('',self.__text_string)):
+                self.__write_obj.write(
+                    'mi<tg<open______<%s\n'
+                    'tx<nu<__________<%s\n'
+                    'mi<tg<close_____<%s\n' % (self.__tag, self.__text_string, self.__tag)
+                )
+            self.__text_string = ''
+        elif line[0:2] == 'tx':
+            self.__text_string += line[17:-1]
+
+    def __found_tag_with_tokens_func(self, line, tag):
+        """
+        Requires:
+            line -- line to parse
+            tag -- type of field
+        Returns:
+            nothing
+        Logic:
+            Some fields have a series of tokens (cw<di<year______<nu<2003)
+            that must be parsed as attributes for the element.
+            Set the state to collect tokesn, and set the text string to
+            start an empty element with attributes.
+        """
+        self.__state = 'collect_tokens'
+        self.__text_string = 'mi<tg<empty-att_<%s' % tag
+        # mi<tg<empty-att_<page-definition<margin>33\n
+
+    def __collect_tokens_func(self, line):
+        """
+        Requires:
+            line -- line to parse
+        Returns:
+            nothing
+        Logic:
+            This function collects all the token information and adds it to
+            the text string until the end of the field is found.
+            First check of the end of the information field. If found, write
+            the text string to the file.
+            If not found, get the relevant information from the text string.
+            This information cannot be directly added to the text string,
+            because it exists in abbreviated form.  (num-of-wor)
+            I want to check this information in a dictionary to convert it
+            to a longer, readable form. If the key does not exist in the
+            dictionary, print out an error message. Otherise add the value
+            to the text string.
+            (num-of-wor => number-of-words)
+        """
+        # cw<di<year______<nu<2003
+        if self.__token_info == 'mi<mk<docinf-end':
+            self.__state = 'in_info_table'
+            self.__write_obj.write(
+            '%s\n' % self.__text_string
+            )
+            self.__text_string = ''
+        else:
+            att = line[6:16]
+            value = line[20:-1]
+            att_changed = self.__token_dict.get(att)
+            if att_changed is None:
+                if self.__run_level > 3:
+                    msg = 'No dictionary match for %s\n' % att
+                    raise self.__bug_handler(msg)
+            else:
+                self.__text_string += '<%s>%s' % (att_changed, value)
+
+    def __single_field_func(self, line, tag):
+        value = line[20:-1]
+        self.__write_obj.write(
+        'mi<tg<empty-att_<%s<%s>%s\n' % (tag, tag, value)
+        )
+
+    def __after_info_table_func(self, line):
+        """
+        Requires:
+            line --line to write to file
+        Returns:
+            nothing
+        Logic:
+            After the end of the information table, simple write the line to
+            the file.
+        """
+        self.__write_obj.write(line)
+
+    def fix_info(self):
+        """
+        Requires:
+            nothing
+        Returns:
+            nothing (changes the original file)
+        Logic:
+            Read one line in at a time. Determine what action to take based on
+            the state. If the state is before the information table, look for the
+            beginning of the style table.
+            If the state is in the information table, use other methods to
+            parse the information
+            style table, look for lines with style info, and substitute the
+            number with the name of the style.  If the state if afer the
+            information table, simply write the line to the output file.
+        """
+        self.__initiate_values()
+        with open_for_read(self.__file) as read_obj:
+            with open_for_write(self.__write_to) as self.__write_obj:
+                for line in read_obj:
+                    self.__token_info = line[:16]
+                    action = self.__state_dict.get(self.__state)
+                    if action is None:
+                        sys.stderr.write('No matching state in module styles.py\n')
+                        sys.stderr.write(self.__state + '\n')
+                    action(line)
+        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
+        if self.__copy:
+            copy_obj.copy_file(self.__write_to, "info.data")
+        copy_obj.rename(self.__write_to, self.__file)
+        os.remove(self.__write_to)
--- a/ebook_converter/ebooks/rtf2xml/inline.py
+++ b/ebook_converter/ebooks/rtf2xml/inline.py
@@ -0,0 +1,427 @@
+from __future__ import unicode_literals, absolute_import, print_function, division
+import sys, os
+
+from calibre.ebooks.rtf2xml import copy
+from calibre.ptempfile import better_mktemp
+from . import open_for_read, open_for_write
+
+"""
+States.
+1. default
+    1. an open bracket ends this state.
+    2. Text print out text. Print out any groups_in_waiting.
+    3. closed bracket. Close groups
+2. after an open bracket
+    1. The lack of a control word ends this state.
+    2. paragraph end -- close out all tags
+    3. footnote beg -- close out all tags
+"""
+
+
+class Inline:
+    """
+    Make inline tags within lists.
+    Logic:
+    """
+
+    def __init__(self,
+            in_file,
+            bug_handler,
+            copy=None,
+            run_level=1,):
+        """
+        Required:
+            'in_file'--file to parse
+            'bug_handler'--callable whose result is raised on internal errors
+        Optional:
+            'copy'-- whether to make a copy of result for debugging
+            'run_level'--above 3, an empty inline slice raises an error
+        Returns:
+            nothing
+            """
+        self.__file = in_file
+        self.__bug_handler = bug_handler
+        self.__copy = copy
+        self.__run_level = run_level
+        self.__write_to = better_mktemp()
+
+    def __initiate_values(self):
+        """
+        Reset the parser state and build the dispatch and lookup tables.
+        """
+        self.__state_dict = {
+            'default':              self.__default_func,
+            'after_open_bracket':   self.__after_open_bracket_func,
+        }
+        self.__default_dict = {
+            'ob<nu<open-brack':         self.__found_open_bracket_func,
+            'tx<nu<__________'  :       self.__found_text_func,
+            'tx<hx<__________'  :       self.__found_text_func,
+            'tx<ut<__________'  :       self.__found_text_func,
+            'mi<mk<inline-fld'  :       self.__found_text_func,
+            'text'              :       self.__found_text_func,
+            'cb<nu<clos-brack'  :       self.__close_bracket_func,
+            'mi<mk<par-end___'  :       self.__end_para_func,
+            'mi<mk<footnt-ope'  :       self.__end_para_func,
+            'mi<mk<footnt-ind'  :       self.__end_para_func,
+        }
+        self.__after_open_bracket_dict = {
+            'cb<nu<clos-brack'  :       self.__close_bracket_func,
+            'tx<nu<__________'  :       self.__found_text_func,
+            'tx<hx<__________'  :       self.__found_text_func,
+            'tx<ut<__________'  :       self.__found_text_func,
+            'text'              :       self.__found_text_func,
+            'mi<mk<inline-fld'  :       self.__found_text_func,
+            'ob<nu<open-brack':         self.__found_open_bracket_func,
+            'mi<mk<par-end___'  :       self.__end_para_func,
+            'mi<mk<footnt-ope'  :       self.__end_para_func,
+            'mi<mk<footnt-ind'  :       self.__end_para_func,
+            'cw<fd<field_____'  :       self.__found_field_func,
+        }
+        self.__state = 'default'
+        self.__brac_count = 0  # only ever incremented in this class, never read
+        self.__list_inline_list = []
+        self.__body_inline_list = []
+        self.__groups_in_waiting_list = [0]  # one-element list so it can be aliased
+        self.__groups_in_waiting_body = [0]  # one-element list so it can be aliased
+        self.__groups_in_waiting = self.__groups_in_waiting_body
+        self.__place = 'non_list'  # 'in_list' between the list-text markers
+        self.__inline_list = self.__body_inline_list
+        self.__in_para = 0  # not in paragraph
+        self.__char_dict = {
+            # control word name (line[6:16]) => inline attribute name
+            'annotation'    :   'annotation',
+            'blue______'    :   'blue',
+            'bold______'    :   'bold',
+            'caps______'    :   'caps',
+            'char-style'    :   'character-style',
+            'dbl-strike'    :   'double-strike-through',
+            'emboss____'    :   'emboss',
+            'engrave___'    :   'engrave',
+            'font-color'    :   'font-color',
+            'font-down_'    :   'subscript',
+            'font-size_'    :   'font-size',
+            'font-style'    :   'font-style',
+            'font-up___'    :   'superscript',
+            'footnot-mk'    :   'footnote-marker',
+            'green_____'    :   'green',
+            'hidden____'    :   'hidden',
+            'italics___'    :   'italics',
+            'outline___'    :   'outline',
+            'red_______'    :   'red',
+            'shadow____'    :   'shadow',
+            'small-caps'    :   'small-caps',
+            'strike-thr'    :   'strike-through',
+            'subscript_'    :   'subscript',
+            'superscrip'    :   'superscript',
+            'underlined'    :   'underlined',
+        }
+        self.__caps_list = ['false']  # not read anywhere in this class
+
+    def __set_list_func(self, line):
+        """
+        Requires:
+            line--line of text (unused; self.__token_info decides)
+        Returns:
+            nothing
+        Logic:
+            Swap list/body bookkeeping at list-text markers ('non_list' outside).
+        """
+        in_list = self.__place == 'in_list'
+        if in_list and self.__token_info == 'mi<mk<lst-tx-end':
+            self.__place = 'non_list'
+            self.__inline_list = self.__body_inline_list
+            self.__groups_in_waiting = self.__groups_in_waiting_body
+        elif not in_list and self.__token_info == 'mi<mk<lst-tx-beg':
+            self.__place = 'in_list'
+            self.__inline_list = self.__list_inline_list
+            self.__groups_in_waiting = self.__groups_in_waiting_list
+
+    def __default_func(self, line):
+        """
+        Requires:
+            line-- line of text
+        Returns:
+            nothing
+        Logic:
+            Run the handler registered for the token, if any; always write
+        """
+        action = self.__default_dict.get(self.__token_info)
+        if action:
+            action(line)
+        self.__write_obj.write(line)
+
+    def __found_open_bracket_func(self, line):
+        """
+        Requires:
+            line -- current line of text
+        Returns:
+            nothing
+        Logic:
+            Enter 'after_open_bracket' and queue a new, still empty, group
+        """
+        self.__state = 'after_open_bracket'
+        self.__brac_count += 1
+        self.__groups_in_waiting[0] += 1  # one more group not yet written out
+        self.__inline_list.append({})
+        self.__inline_list[-1]['contains_inline'] = 0  # set to 1 by a cw<ci word
+
+    def __after_open_bracket_func(self, line):
+        """
+        Requires:
+            line --line of text
+        Returns:
+            nothing
+        Logic:
+            If the token is a control word for character info (cw<ci), use another
+            method to add to the dictionary.
+            Otherwise use the dictionary to get the appropriate function; finding
+            one returns the state to 'default'. Always print out the line.
+        """
+        if line[0:5] == 'cw<ci':  # calibre: bug in original function no diff between cw<ci and cw<pf
+            self.__handle_control_word(line)
+        else:
+            action = self.__after_open_bracket_dict.get(self.__token_info)
+            if action:
+                self.__state = 'default'  # a non control word?
+                action(line)
+        self.__write_obj.write(line)
+
+    def __handle_control_word(self, line):
+        """
+        Required:
+            line --line of text
+        Returns:
+            nothing
+        Logic:
+            Handle the control word for inline groups.
+            Add each name - value to the dictionary of the current group
+            and flag that group as containing inline info. Control words
+            not listed in self.__char_dict are ignored. (Marking Symbol,
+            Wingdings and Dingbats fonts is disabled in the string below.)
+        """
+        # example line: cw<ci<shadow_____<nu<true
+        # line[6:16] is the control word name, line[20:-1] its value
+        char_info = line[6:16]
+        char_value = line[20:-1]
+        name = self.__char_dict.get(char_info)
+        if name:
+            self.__inline_list[-1]['contains_inline'] = 1
+            self.__inline_list[-1][name] = char_value
+            """
+            if name == 'font-style':
+                if char_value == 'Symbol':
+                    self.__write_obj.write('mi<mk<font-symbo\n')
+                elif char_value == 'Wingdings':
+                    self.__write_obj.write('mi<mk<font-wingd\n')
+                elif char_value == 'Zapf Dingbats':
+                    self.__write_obj.write('mi<mk<font-dingb\n')
+            """
+
+    def __close_bracket_func(self, line):
+        """
+        Requires:
+            line --line of text (not inspected here)
+        Returns:
+            Nothing
+        Logic:
+            If there are no inline groups, do nothing.
+            Look at the last dictionary in the current inline list.
+            Write a close tag for it only when all of these hold:
+              * it is marked 'contains_inline',
+              * no groups are still waiting to be opened,
+              * the parser is in a list, or in a paragraph when outside
+                of a list.
+            After the close tag, write an end marker for 'font-style'
+            and for 'caps' when the dictionary has those keys.
+            Finally drop the dictionary and, if any groups are waiting,
+            count one fewer of them.
+        """
+        if not self.__inline_list:
+            return
+        # the innermost group is the one this bracket closes
+        group = self.__inline_list[-1]
+        # inside a list always eligible; elsewhere only within a paragraph
+        eligible = self.__place == 'in_list' or self.__in_para
+        if eligible and group.get('contains_inline') == 1 \
+                and self.__groups_in_waiting[0] == 0:
+            emit = self.__write_obj.write
+            emit('mi<tg<close_____<inline\n')
+            if 'font-style' in group:
+                emit('mi<mk<font-end__\n')
+            if 'caps' in group:
+                emit('mi<mk<caps-end__\n')
+        # the group is finished whether or not a tag was written
+        self.__inline_list.pop()
+        pending = self.__groups_in_waiting
+        if pending[0] != 0:
+            pending[0] -= 1
+
+    def __found_text_func(self, line):
+        """
+        Required:
+            line--line of text
+        Return:
+            nothing
+        Logic:
+            Two cases:
+            1. in a list. Simply write inline
+            2. Not in a list
+                Text can mark the start of a paragraph.
+                If already in a paragraph, check to see if any groups are waiting
+                to be added. If so, use another method to write these groups.
+        """
+        if self.__place == 'in_list':
+            self.__write_inline()
+        else:
+            if not self.__in_para:
+                self.__in_para = 1
+                self.__start_para_func(line)
+            elif self.__groups_in_waiting[0] != 0:
+                self.__write_inline()
+
+    def __write_inline(self):
+        """
+        Required:
+            nothing
+        Returns:
+            Nothing
+        Logic:
+            Method for writing inline when text is found.
+            Only write those groups that are "waiting", or that have no
+            tags yet.
+            First, slice the list self.__inline_list to get just the groups
+            in waiting.
+            Iterate through this slice, which contains only dictionaries.
+            Get the keys in each dictionary. If 'font-style' is in the keys,
+            write a marker tag. (I will use this marker tag later when converting
+            hex text to utf8.)
+            Write a tag for the inline values.
+        """
+        if self.__groups_in_waiting[0] != 0:
+            last_index = -1 * self.__groups_in_waiting[0]
+            inline_list = self.__inline_list[last_index:]
+            if len(inline_list) <= 0:
+                if self.__run_level > 3:
+                    msg = 'self.__inline_list is %s\n' % self.__inline_list
+                    raise self.__bug_handler(msg)
+                self.__write_obj.write('error\n')
+                self.__groups_in_waiting[0] = 0
+                return
+            for the_dict in inline_list:
+                if the_dict['contains_inline']:
+                    the_keys = the_dict.keys()
+                    if 'font-style' in the_keys:
+                        face = the_dict['font-style']
+                        self.__write_obj.write('mi<mk<font______<%s\n' % face)
+                    if 'caps' in the_keys:
+                        value = the_dict['caps']
+                        self.__write_obj.write('mi<mk<caps______<%s\n' % value)
+                    self.__write_obj.write('mi<tg<open-att__<inline')
+                    for the_key in the_keys:
+                        if the_key != 'contains_inline':
+                            self.__write_obj.write('<%s>%s' % (the_key, the_dict[the_key]))
+                    self.__write_obj.write('\n')
+        self.__groups_in_waiting[0] = 0
+
+    def __end_para_func(self, line):
+        """
+        Requires:
+            line -- line of text
+        Returns:
+            nothing
+        Logic:
+            Skip the groups in waiting at the end of the list. For every other
+            dictionary that contains info, write its end markers and a closing
+            tag. Nothing is done when not in a paragraph.
+        """
+        if not self.__in_para:
+            return
+        if self.__groups_in_waiting[0] == 0:
+            inline_list = self.__inline_list
+        else:
+            last_index = -1 * self.__groups_in_waiting[0]
+            inline_list = self.__inline_list[0:last_index]
+        for the_dict in inline_list:
+            contains_info = the_dict.get('contains_inline')
+            if contains_info:
+                the_keys = the_dict.keys()
+                if 'font-style' in the_keys:
+                    self.__write_obj.write('mi<mk<font-end__\n')
+                if 'caps' in the_keys:
+                    self.__write_obj.write('mi<mk<caps-end__\n')
+                self.__write_obj.write('mi<tg<close_____<inline\n')
+        self.__in_para = 0
+
+    def __start_para_func(self, line):
+        """
+        Requires:
+            line -- line of text
+        Returns:
+            nothing
+        Logic:
+            Iterate through the self.__inline_list to get each dict.
+            If the dict contains inline info, get the keys.
+            Iterate through the keys and print out the key and value.
+        """
+        for the_dict in self.__inline_list:
+            contains_info = the_dict.get('contains_inline')
+            if contains_info :
+                the_keys = the_dict.keys()
+                if 'font-style' in the_keys:
+                    face = the_dict['font-style']
+                    self.__write_obj.write('mi<mk<font______<%s\n' % face)
+                if 'caps' in the_keys:
+                    value = the_dict['caps']
+                    self.__write_obj.write('mi<mk<caps______<%s\n' % value)
+                self.__write_obj.write('mi<tg<open-att__<inline')
+                for the_key in the_keys:
+                    if the_key != 'contains_inline':
+                        self.__write_obj.write('<%s>%s' % (the_key, the_dict[the_key]))
+                self.__write_obj.write('\n')
+        self.__groups_in_waiting[0] = 0
+
+    def __found_field_func(self, line):
+        """
+        No-op handler for 'cw<fd<field_____'. Being registered is the point:
+        a found handler makes __after_open_bracket_func reset the state.
+        """
+        pass
+
+    def form_tags(self):
+        """
+        Requires:
+            nothing
+        Returns:
+            nothing
+        Logic:
+            Read one line in at a time. Determine what action to take based on
+            the state. An unknown state is reported and the line copied as is.
+        """
+        self.__initiate_values()
+        text_tokens = (
+            'tx<mc<__________<rdblquote', 'tx<mc<__________<ldblquote',
+            'tx<mc<__________<lquote', 'tx<mc<__________<rquote',
+            'tx<mc<__________<emdash', 'tx<mc<__________<endash',
+            'tx<mc<__________<bullet',
+        )
+        with open_for_read(self.__file) as read_obj:
+            with open_for_write(self.__write_to) as self.__write_obj:
+                for line in read_obj:
+                    token = line[0:-1]
+                    if token in text_tokens:
+                        self.__token_info = 'text'
+                    else:
+                        self.__token_info = line[:16]
+                    self.__set_list_func(line)
+                    action = self.__state_dict.get(self.__state)
+                    if action is None:
+                        sys.stderr.write('No matching state in module inline.py\n')
+                        sys.stderr.write(self.__state + '\n')
+                        action = self.__write_obj.write
+                    action(line)
+        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
+        if self.__copy:
+            copy_obj.copy_file(self.__write_to, "inline.data")
+        copy_obj.rename(self.__write_to, self.__file)
+        os.remove(self.__write_to)
--- a/ebook_converter/ebooks/rtf2xml/line_endings.py
+++ b/ebook_converter/ebooks/rtf2xml/line_endings.py
@@ -0,0 +1,56 @@
+from __future__ import unicode_literals, absolute_import, print_function, division
+#########################################################################
+#                                                                       #
+#                                                                       #
+#   copyright 2002 Paul Henry Tremblay                                  #
+#                                                                       #
+#   This program is distributed in the hope that it will be useful,     #
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of      #
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU    #
+#   General Public License for more details.                            #
+#                                                                       #
+#                                                                       #
+#########################################################################
+import os
+
+from calibre.ebooks.rtf2xml import copy
+from calibre.utils.cleantext import clean_ascii_chars
+from calibre.ptempfile import better_mktemp
+
+
+class FixLineEndings:
+    """Convert Windows and Mac line endings of a file to Unix ones."""
+
+    def __init__(self,
+            bug_handler,
+            in_file=None,
+            copy=None,
+            run_level=1,
+            replace_illegals=1,
+            ):
+        self.__file = in_file
+        self.__bug_handler = bug_handler
+        self.__copy = copy  # truthy: keep a debugging copy of the result
+        self.__run_level = run_level  # stored but not read in this class
+        self.__write_to = better_mktemp()  # path the result is written to
+        self.__replace_illegals = replace_illegals
+
+    def fix_endings(self):
+        # read the whole file as bytes
+        with open(self.__file, 'rb') as read_obj:
+            input_file = read_obj.read()
+        # calibre go from win and mac to unix
+        input_file = input_file.replace(b'\r\n', b'\n')
+        input_file = input_file.replace(b'\r', b'\n')
+        # optionally pass the data through clean_ascii_chars (see that helper)
+        if self.__replace_illegals:
+            input_file = clean_ascii_chars(input_file)
+        # write
+        with open(self.__write_to, 'wb') as write_obj:
+            write_obj.write(input_file)
+        # copy
+        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
+        if self.__copy:
+            copy_obj.copy_file(self.__write_to, "line_endings.data")
+        copy_obj.rename(self.__write_to, self.__file)
+        os.remove(self.__write_to)
--- a/ebook_converter/ebooks/rtf2xml/list_numbers.py
+++ b/ebook_converter/ebooks/rtf2xml/list_numbers.py
@@ -0,0 +1,201 @@
+from __future__ import unicode_literals, absolute_import, print_function, division
+#########################################################################
+#                                                                       #
+#                                                                       #
+#   copyright 2002 Paul Henry Tremblay                                  #
+#                                                                       #
+#   This program is distributed in the hope that it will be useful,     #
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of      #
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU    #
+#   General Public License for more details.                            #
+#                                                                       #
+#                                                                       #
+#########################################################################
+import os
+from calibre.ebooks.rtf2xml import copy
+from calibre.ptempfile import better_mktemp
+from . import open_for_read, open_for_write
+
+
+class ListNumbers:
+    """
+        RTF puts list numbers outside of the paragraph. The public method
+        in this class put the list numbers inside the paragraphs.
+    """
+
+    def __init__(self,
+            in_file,
+            bug_handler,
+            copy=None,
+            run_level=1,
+            ):
+        """
+        Required:
+            'in_file'--file to process
+            'bug_handler'--passed on to copy.Copy
+        Optional:
+            'copy'-- whether to make a copy of result for debugging
+            'run_level'--accepted but not stored by this class
+        Returns:
+            nothing
+            """
+        self.__file = in_file
+        self.__bug_handler = bug_handler
+        self.__copy = copy
+        self.__write_to = better_mktemp()
+
+    def __initiate_values(self):
+        """
+        initiate values for fix_list_numbers.
+        Required:
+            Nothing
+        Return:
+            Nothing
+        """
+        self.__state = "default"
+        self.__list_chunk = ''  # list-text lines collected so far
+        self.__previous_line = ''  # open bracket line held back by __default_func
+        self.__list_text_ob_count = ''  # not read anywhere in this class
+        self.__state_dict={
+        'default'           :   self.__default_func,
+        'after_ob'          :   self.__after_ob_func,
+        'list_text'         :   self.__list_text_func,
+        'after_list_text'   :   self.__after_list_text_func
+        }
+
+    def __after_ob_func(self, line):
+        """
+        Handle the line immediately after an open bracket.
+        Required:
+            self, line
+        Returns:
+            Nothing
+            """
+        if self.__token_info == 'cw<ls<list-text_':
+            self.__state = 'list_text'
+            self.__list_chunk = self.__list_chunk + \
+            self.__previous_line + line
+            self.__list_text_ob = self.__ob_count  # id read from the open bracket line
+            self.__cb_count = 0  # forget any earlier close bracket id
+        else:
+            self.__write_obj.write(self.__previous_line)  # held-back bracket line
+            self.__write_obj.write(line)
+            self.__state = 'default'
+
+    def __after_list_text_func(self, line):
+        """
+        Look for an open bracket or a line of text, and then print out the
+        self.__list_chunk wrapped in list-text tags. Always print out the line.
+        """
+        if line[0:2] == 'ob' or line[0:2] == 'tx':
+            self.__state = 'default'
+            self.__write_obj.write('mi<mk<lst-txbeg_\n')
+            self.__write_obj.write('mi<mk<para-beg__\n')
+            self.__write_obj.write('mi<mk<lst-tx-beg\n')
+            self.__write_obj.write(
+                # 'mi<tg<open-att__<list-text<type>%s\n' % self.__list_type)
+                'mi<tg<open-att__<list-text\n')
+            self.__write_obj.write(self.__list_chunk)
+            self.__write_obj.write('mi<tg<close_____<list-text\n')
+            self.__write_obj.write('mi<mk<lst-tx-end\n')
+            self.__list_chunk = ''
+        self.__write_obj.write(line)
+
+    def __determine_list_type(self, chunk):
+        """
+        Classify the list text in chunk as 'ordered' or 'unordered'.
+        Required:
+            chunk --the collected list-text lines, newline separated
+        Returns:
+            'unordered' as soon as a hex token carrying 'B7 is seen;
+            'ordered' when the plain text, once '.', '(' and ')' are
+            removed, consists of digits only; 'unordered' otherwise.
+        """
+        pieces = []
+        for entry in chunk.split('\n'):
+            kind, payload = entry[0:5], entry[17:]
+            if kind == 'tx<hx' and payload == "'B7":
+                return 'unordered'
+            if kind == 'tx<nu':
+                pieces.append(payload)
+        label = ''.join(pieces)
+        for punctuation in '.()':
+            label = label.replace(punctuation, '')
+        # anything that is not purely numeric is treated as unordered
+        if label.isdigit():
+            return 'ordered'
+        return 'unordered'
+
+    def __list_text_func(self, line):
+        """
+        Handle lines that are part of the list text. If the end of the list
+        text is found (the closing bracket matches the self.__list_text_ob),
+        change the state. Add the line to self.__list_chunk unless par-def.
+        Required:
+            self, line
+        Returns:
+            Nothing
+            """
+        if self.__list_text_ob == self.__cb_count:
+            self.__state = 'after_list_text'
+            self.__right_after_list_text = 1  # not read anywhere in this class
+            self.__list_type = self.__determine_list_type(self.__list_chunk)
+            self.__write_obj.write('mi<mk<list-type_<%s\n' % self.__list_type)
+        if self.__token_info != 'cw<pf<par-def___':
+            self.__list_chunk = self.__list_chunk + line
+
+    def __default_func(self, line):
+        """
+        Handle the lines that are not part of any special state. Look for an
+        opening bracket. If an open bracket is found, hold this line back in
+        self.__previous_line, which other methods need. Otherwise,
+        print out the line.
+        Required:
+            self, line
+        Returns:
+            Nothing
+            """
+        if self.__token_info == 'ob<nu<open-brack':
+            self.__state = 'after_ob'
+            self.__previous_line = line
+        else:
+            self.__write_obj.write(line)
+
+    def fix_list_numbers(self):
+        """
+        Required:
+            nothing
+        Returns:
+            original file will be changed
+        Logic:
+            Read in one line a time from the file. Keep track of opening and
+            closing brackets. Determine the method ('action') by passing the
+            state to the self.__state_dict.
+            Simply print out the line to a temp file until an open bracket
+            is found. Check the next line. If it is list-text, then start
+            adding to the self.__list_chunk until the closing bracket is
+            found.
+            Next, look for an open bracket or text. When either is found,
+            print out self.__list_chunk and the line.
+            Both files are closed even if a handler raises. The empty string
+            read at end of file is still passed to the handler once.
+        """
+        self.__initiate_values()
+        with open_for_read(self.__file) as read_obj:
+            with open_for_write(self.__write_to) as self.__write_obj:
+                line_to_read = 1
+                while line_to_read:
+                    line_to_read = read_obj.readline()
+                    line = line_to_read
+                    self.__token_info = line[:16]
+                    if self.__token_info == 'ob<nu<open-brack':
+                        self.__ob_count = line[-5:-1]
+                    if self.__token_info == 'cb<nu<clos-brack':
+                        self.__cb_count = line[-5:-1]
+                    action = self.__state_dict.get(self.__state)
+                    action(line)
+        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
+        if self.__copy:
+            copy_obj.copy_file(self.__write_to, "list_numbers.data")
+        copy_obj.rename(self.__write_to, self.__file)
+        os.remove(self.__write_to)
--- a/ebook_converter/ebooks/rtf2xml/list_table.py
+++ b/ebook_converter/ebooks/rtf2xml/list_table.py
@@ -0,0 +1,447 @@
+from __future__ import absolute_import, division, print_function, unicode_literals
+#########################################################################
+#                                                                       #
+#                                                                       #
+#   copyright 2002 Paul Henry Tremblay                                  #
+#                                                                       #
+#   This program is distributed in the hope that it will be useful,     #
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of      #
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU    #
+#   General Public License for more details.                            #
+#                                                                       #
+#                                                                       #
+#########################################################################
+
+from polyglot.builtins import unicode_type
+
+
+class ListTable:
+    """
+    Parse the list table line. Make a string. Form a dictionary.
+    Return the string and the dictionary.
+    """
+
+    def __init__(
+                self,
+                bug_handler,
+                run_level=1,
+                ):
+        self.__bug_handler = bug_handler
+        self.__initiate_values()
+        self.__run_level = run_level
+
+    def __initiate_values(self):
+        self.__list_table_final = ''
+        self.__state = 'default'
+        self.__final_dict = {}
+        self.__list_dict = {}
+        self.__all_lists = []
+        self.__level_text_string = ''
+        self.__level_text_list = []
+        self.__found_level_text_length = 0
+        self.__level_text_position = None
+        self.__prefix_string = None
+        self.__level_numbers_string = ''
+        self.__state_dict = {
+            'default'       : self.__default_func,
+            'level'         : self.__level_func,
+            'list'          : self.__list_func,
+            'unsure_ob'     : self.__after_bracket_func,
+            'level_number'  : self.__level_number_func,
+            'level_text'    : self.__level_text_func,
+            'list_name'     : self.__list_name_func,
+        }
+        self.__main_list_dict = {
+            'cw<ls<ls-tem-id_'  :       'list-template-id',
+            'cw<ls<list-hybri'  :       'list-hybrid',
+            'cw<ls<lis-tbl-id'  :       'list-table-id',
+        }
+        self.__level_dict = {
+            'cw<ls<level-star'  :       'list-number-start',
+            'cw<ls<level-spac'  :       'list-space',
+            'cw<ls<level-inde'  :       'level-indent',
+            'cw<ls<fir-ln-ind'  :       'first-line-indent',
+            'cw<ls<left-inden'  :       'left-indent',
+            'cw<ls<tab-stop__'  :       'tabs',
+            'cw<ls<level-type'  :       'numbering-type',
+            'cw<pf<right-inde'  :       'right-indent',
+            'cw<pf<left-inden'  :       'left-indent',
+            'cw<pf<fir-ln-ind'  :       'first-line-indent',
+            'cw<ci<italics___'  :       'italics',
+            'cw<ci<bold______'  :       'bold',
+            'cw<ss<para-style'  :       'paragraph-style-name',
+        }
+        """
+        all_lists =
+        [{anything here?}
+            [{list-templateid = ""}
+                [{level-indent}],[{level-indent}]
+            ]
+        ],
+        """
+
+    def __parse_lines(self, line):
+        """
+        Required : line --line to parse
+        Returns:  nothing
+        Logic:
+            Split the lines into a list by a new line. Process the line
+            according to the state.
+        """
+        lines = line.split('\n')
+        self.__ob_count = 0
+        self.__ob_group = 0
+        for line in lines:
+            self.__token_info = line[:16]
+            if self.__token_info == 'ob<nu<open-brack':
+                self.__ob_count = line[-4:]
+                self.__ob_group += 1
+            if self.__token_info == 'cb<nu<clos-brack':
+                self.__cb_count = line[-4:]
+                self.__ob_group -= 1
+            action = self.__state_dict.get(self.__state)
+            if action is None:
+                print(self.__state)
+            action(line)
+        self.__write_final_string()
+        # self.__add_to_final_line()
+
+    def __default_func(self, line):
+        """
+        Requires: line --line to process
+        Return: nothing
+        Logic:
+            This state is used at the start and end of a list. Look for an
+            opening bracket, which marks the change of state.
+        """
+        if self.__token_info == 'ob<nu<open-brack':
+            self.__state = 'unsure_ob'
+
+    def __found_list_func(self, line):
+        """
+        Requires: line -- line to process
+        Returns: nothing
+        Logic:
+            I have found \\list.
+            Change the state to list
+            Get the open bracket count so you know when this state ends.
+            Append an empty list to all lists.
+            Create a temporary dictionary. This dictionary has the key of
+            "list-id" and the value of an empty list. Later, this empty list
+            will be filled with all the ids for which the formatting is valid.
+            Append the temporary dictionary to the new list.
+        """
+        self.__state = 'list'
+        self.__list_ob_count = self.__ob_count
+        self.__all_lists.append([])
+        the_dict = {'list-id': []}
+        self.__all_lists[-1].append(the_dict)
+
+    def __list_func(self, line):
+        """
+        Requires: line --line to process
+        Returns: nothing
+        Logic:
+            This method is called when you are in a list, but outside of a level.
+            Check for the end of the list. Otherwise, use the self.__mainlist_dict
+            to determine if you need to add a lines values to the main list.
+        """
+        if self.__token_info == 'cb<nu<clos-brack' and\
+            self.__cb_count == self.__list_ob_count:
+            self.__state = 'default'
+        elif self.__token_info == 'ob<nu<open-brack':
+            self.__state = 'unsure_ob'
+        else:
+            att = self.__main_list_dict.get(self.__token_info)
+            if att:
+                value = line[20:]
+                # dictionary is always the first item in the last list
+                # [{att:value}, [], [att:value, []]
+                self.__all_lists[-1][0][att] = value
+
+    def __found_level_func(self, line):
+        """
+        Requires: line -- line to process
+        Returns: nothing
+        Logic:
+            I have found \\listlevel.
+            Change the state to level
+            Get the open bracket count so you know when this state ends.
+            Append an empty list to the last list inside all lists.
+            Create a temporary dictionary.
+            Append the temporary dictionary to the new list.
+            self.__all_lists now looks like:
+                [[{list-id:[]}, [{}]]]
+                Where:
+                    self.__all_lists[-1] => a list. The first item is a dictionary.
+                    The second item is a list containing a dictionary:
+                    [{list-id:[]}, [{}]]
+                    self.__all_lists[-1][0] => a dictionary of the list attributes
+                    self.__all_lists[-1][-1] => a list with just a dictionary
+                    self.__all_lists[-1][-1][0] => the dictionary of level attributes
+        """
+        self.__state = 'level'
+        self.__level_ob_count = self.__ob_count
+        self.__all_lists[-1].append([])
+        the_dict = {}
+        self.__all_lists[-1][-1].append(the_dict)
+        self.__level_dict
+
+    def __level_func(self, line):
+        """
+        Requires:
+            line -- line to parse
+        Returns:
+            nothing
+        Logic:
+            Look for the end of the this group.
+            Change states if an open bracket is found.
+            Add attributes to all_dicts if an appropriate token is found.
+        """
+        if self.__token_info == 'cb<nu<clos-brack' and\
+            self.__cb_count == self.__level_ob_count:
+            self.__state = 'list'
+        elif self.__token_info == 'ob<nu<open-brack':
+            self.__state = 'unsure_ob'
+        else:
+            att = self.__level_dict.get(self.__token_info)
+            if att:
+                value = line[20:]
+                self.__all_lists[-1][-1][0][att] = value
+
+    def __level_number_func(self, line):
+        """
+        Requires:
+            line -- line to process
+        Returns:
+            nothing
+        Logic:
+            Check for the end of the group.
+            Otherwise, if the token is hexidecimal, create an attribute.
+            Do so by finding the base-10 value of the number. Then divide
+            this by 2 and round it. Remove the ".0". Sandwwhich the result to
+            give you something like level1-show-level.
+            The show-level attribute means the numbering for this level.
+        """
+        if self.__token_info == 'cb<nu<clos-brack' and\
+            self.__cb_count == self.__level_number_ob_count:
+            self.__state = 'level'
+            self.__all_lists[-1][-1][0]['level-numbers'] = self.__level_numbers_string
+            self.__level_numbers_string = ''
+        elif self.__token_info == 'tx<hx<__________':
+            self.__level_numbers_string += '\\&#x0027;%s' % line[18:]
+        elif self.__token_info == 'tx<nu<__________':
+            self.__level_numbers_string += line[17:]
+            """
+            num = line[18:]
+            num = int(num, 16)
+            level = unicode_type(round((num - 1)/2, 0))
+            level = level[:-2]
+            level = 'level%s-show-level' % level
+            self.__all_lists[-1][-1][0][level] = 'true'
+            """
+
+    def __level_text_func(self, line):
+        """
+        Requires:
+            line --line to process
+        Returns:
+            nothing
+        Logic:
+            Check for the end of the group.
+            Otherwise, if the text is hexidecimal, call on the method
+            __parse_level_text_length.
+            Otheriwse, if the text is regular text, create an attribute.
+            This attribute indicates the puncuation after a certain level.
+            An example is "level1-marker = '.'"
+            Otherwise, check for a level-template-id.
+        """
+        if self.__token_info == 'cb<nu<clos-brack' and\
+            self.__cb_count == self.__level_text_ob_count:
+            if self.__prefix_string:
+                if self.__all_lists[-1][-1][0]['numbering-type'] == 'bullet':
+                    self.__prefix_string = self.__prefix_string.replace('_', '')
+                    self.__all_lists[-1][-1][0]['bullet-type'] = self.__prefix_string
+            self.__state = 'level'
+            # self.__figure_level_text_func()
+            self.__level_text_string = ''
+            self.__found_level_text_length = 0
+        elif self.__token_info == 'tx<hx<__________':
+            self.__parse_level_text_length(line)
+        elif self.__token_info == 'tx<nu<__________':
+            text = line[17:]
+            if text and text[-1] == ';':
+                text = text.replace(';', '')
+            if not self.__level_text_position:
+                self.__prefix_string = text
+            else:
+                self.__all_lists[-1][-1][0][self.__level_text_position] = text
+        elif self.__token_info == 'cw<ls<lv-tem-id_':
+            value = line[20:]
+            self.__all_lists[-1][-1][0]['level-template-id'] = value
+
+    def __parse_level_text_length(self, line):
+        """
+        Requires:
+            line --line with hexidecimal number
+        Returns:
+            nothing
+        Logic:
+            Method is used for to parse text in the \\leveltext group.
+        """
+        num = line[18:]
+        the_num = int(num, 16)
+        if not self.__found_level_text_length:
+            self.__all_lists[-1][-1][0]['list-text-length'] = unicode_type(the_num)
+            self.__found_level_text_length = 1
+        else:
+            the_num += 1
+            the_string = unicode_type(the_num)
+            level_marker = 'level%s-suffix' % the_string
+            show_marker = 'show-level%s' % the_string
+            self.__level_text_position = level_marker
+            self.__all_lists[-1][-1][0][show_marker] = 'true'
+            if self.__prefix_string:
+                prefix_marker = 'level%s-prefix' % the_string
+                self.__all_lists[-1][-1][0][prefix_marker] = self.__prefix_string
+                self.__prefix_string = None
+
+    def __list_name_func(self, line):
+        """
+        Requires:
+            line --line to process
+        Returns:
+            nothing
+        Logic:
+            Simply check for the end of the group and change states.
+        """
+        if self.__token_info == 'cb<nu<clos-brack' and\
+            self.__cb_count == self.__list_name_ob_count:
+            self.__state = 'list'
+
+    def __after_bracket_func(self, line):
+        """
+        Requires:
+            line --line to parse
+        Returns:
+            nothing.
+        Logic:
+            The last token found was "{". This method determines what group
+            you are now in.
+            WARNING: this could cause problems. If no group is found, the state will remain
+            unsure_ob, which means no other text will be parsed.
+        """
+        if self.__token_info == 'cw<ls<level-text':
+            self.__state = 'level_text'
+            self.__level_text_ob_count = self.__ob_count
+        elif self.__token_info == 'cw<ls<level-numb':
+            self.__level_number_ob_count = self.__ob_count
+            self.__state = 'level_number'
+        elif self.__token_info == 'cw<ls<list-tb-le':
+            self.__found_level_func(line)
+        elif self.__token_info == 'cw<ls<list-in-tb':
+            self.__found_list_func(line)
+        elif self.__token_info == 'cw<ls<list-name_':
+            self.__state = 'list_name'
+            self.__list_name_ob_count = self.__ob_count
+        else:
+            if self.__run_level > 3:
+                msg = 'No matching token after open bracket\n'
+                msg += 'token is "%s\n"' % (line)
+                raise self.__bug_handler
+
+    def __add_to_final_line(self):
+        """
+        Method no longer used.
+        """
+        self.__list_table_final = 'mi<mk<listabbeg_\n'
+        self.__list_table_final += 'mi<tg<open______<list-table\n' + \
+        'mi<mk<listab-beg\n' + self.__list_table_final
+        self.__list_table_final += \
+        'mi<mk<listab-end\n' + 'mi<tg<close_____<list-table\n'
+        self.__list_table_final += 'mi<mk<listabend_\n'
+
+    def __write_final_string(self):
+        """
+        Requires:
+            nothing
+        Returns:
+            nothing
+        Logic:
+            Write out the list-table start tag.
+            Iterate through self.__all_lists. For each list, write out
+            a list-in-table tag. Get the dictionary of this list
+            (the first item). Print out the key => value pair.
+            Remove the first item (the dictionary) form this list. Now iterate
+            through what is left in the list. Each list will conatin one item,
+            a dictionary. Get this dictionary and print out key => value pair.
+        """
+        not_allow = ['list-id',]
+        id = 0
+        self.__list_table_final = 'mi<mk<listabbeg_\n'
+        self.__list_table_final += 'mi<tg<open______<list-table\n' + \
+        'mi<mk<listab-beg\n' + self.__list_table_final
+        for list in self.__all_lists:
+            id += 1
+            self.__list_table_final += 'mi<tg<open-att__<list-in-table'
+            # self.__list_table_final += '<list-id>%s' % (unicode_type(id))
+            the_dict = list[0]
+            the_keys = the_dict.keys()
+            for the_key in the_keys:
+                if the_key in not_allow:
+                    continue
+                att = the_key
+                value = the_dict[att]
+                self.__list_table_final += '<%s>%s' % (att, value)
+            self.__list_table_final += '\n'
+            levels = list[1:]
+            level_num = 0
+            for level in levels:
+                level_num += 1
+                self.__list_table_final += 'mi<tg<empty-att_<level-in-table'
+                self.__list_table_final += '<level>%s' % (unicode_type(level_num))
+                the_dict2 = level[0]
+                the_keys2 = the_dict2.keys()
+                is_bullet = 0
+                bullet_text = ''
+                for the_key2 in the_keys2:
+                    if the_key2 in not_allow:
+                        continue
+                    test_bullet = the_dict2.get('numbering-type')
+                    if test_bullet == 'bullet':
+                        is_bullet = 1
+                    att2 = the_key2
+                    value2 = the_dict2[att2]
+                    # sys.stderr.write('%s\n' % att2[0:10])
+                    if att2[0:10] == 'show-level' and is_bullet:
+                        # sys.stderr.write('No print %s\n' % att2)
+                        pass
+                    elif att2[-6:] == 'suffix' and is_bullet:
+                        # sys.stderr.write('%s\n' % att2)
+                        bullet_text += value2
+                    elif att2[-6:] == 'prefix' and is_bullet:
+                        # sys.stderr.write('%s\n' % att2)
+                        bullet_text += value2
+                    else:
+                        self.__list_table_final += '<%s>%s' % (att2, value2)
+                if is_bullet:
+                    pass
+                    # self.__list_table_final += '<bullet-type>%s' % (bullet_text)
+                self.__list_table_final += '\n'
+            self.__list_table_final += 'mi<tg<close_____<list-in-table\n'
+        self.__list_table_final += \
+        'mi<mk<listab-end\n' + 'mi<tg<close_____<list-table\n'
+        self.__list_table_final += 'mi<mk<listabend_\n'
+
+    def parse_list_table(self, line):
+        """
+        Requires:
+            line -- line with border definition in it
+        Returns:
+            A string and the dictionary of list-table values and attributes.
+        Logic:
+            Call on the __parse_lines metod, which splits the text string into
+            lines (which will be tokens) and processes them.
+        """
+        self.__parse_lines(line)
+        return self.__list_table_final, self.__all_lists
--- a/ebook_converter/ebooks/rtf2xml/make_lists.py
+++ b/ebook_converter/ebooks/rtf2xml/make_lists.py
@@ -0,0 +1,465 @@
+from __future__ import unicode_literals, absolute_import, print_function, division
+#########################################################################
+#                                                                       #
+#                                                                       #
+#   copyright 2002 Paul Henry Tremblay                                  #
+#                                                                       #
+#   This program is distributed in the hope that it will be useful,     #
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of      #
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU    #
+#   General Public License for more details.                            #
+#                                                                       #
+#                                                                       #
+#########################################################################
+import sys, os, re
+
+from calibre.ebooks.rtf2xml import copy
+from calibre.ptempfile import better_mktemp
+from polyglot.builtins import unicode_type
+
+from . import open_for_read, open_for_write
+
+
+class MakeLists:
+    """
+    Form lists.
+    Use RTF's own formatting to determine if a paragraph definition is part of a
+    list.
+    Use indents to determine items and how lists are nested.
+    """
+
+    def __init__(self,
+            in_file,
+            bug_handler,
+            headings_to_sections,
+            list_of_lists,
+            copy=None,
+            run_level=1,
+            no_headings_as_list=1,
+            write_list_info=0,
+            ):
+        """
+        Required:
+            'file'
+        Optional:
+            'copy'-- whether to make a copy of result for debugging
+            'temp_dir' --where to output temporary results (default is
+            directory from which the script is run.)
+        Returns:
+            nothing
+            """
+        self.__file = in_file
+        self.__bug_handler = bug_handler
+        self.__run_level = run_level
+        self.__no_headings_as_list = no_headings_as_list
+        self.__headings_to_sections = headings_to_sections
+        self.__copy = copy
+        self.__write_to = better_mktemp()
+        self.__list_of_lists = list_of_lists
+        self.__write_list_info = write_list_info
+
+    def __initiate_values(self):
+        """
+        Required:
+            Nothing
+        Return:
+            Nothing
+        Logic:
+            The self.__end_list is a list of tokens that will force a list to end.
+            Likewise, the self.__end_lines is a list of lines that forces a list to end.
+        """
+        self.__state = "default"
+        self.__left_indent = 0
+        self.__list_type = 'not-defined'
+        self.__pard_def = ""
+        self.__all_lists = []
+        self.__level = 0
+        self.__list_chunk = ''
+        self.__state_dict={
+        'default'           :   self.__default_func,
+        'in_pard'           :   self.__in_pard_func,
+        'after_pard'        :   self.__after_pard_func,
+        }
+        self.__headings = [
+        'heading 1', 'heading 2', 'heading 3', 'heading 4',
+        'heading 5', 'heading 6', 'heading 7', 'heading 8',
+        'heading 9'
+        ]
+        self.__allow_levels = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
+        self.__style_name = ''
+        self.__end_list = [
+        'mi<mk<body-close',
+        'mi<mk<par-in-fld',
+        'cw<tb<cell______',
+        'cw<tb<row-def___',
+        'cw<tb<row_______',
+        'mi<mk<sect-close',
+        'mi<mk<sect-start',
+        'mi<mk<header-beg',
+        'mi<mk<header-end',
+        'mi<mk<head___clo',
+        'mi<mk<fldbk-end_',
+        'mi<mk<close_cell',
+        'mi<mk<footnt-ope',
+        'mi<mk<foot___clo',
+        'mi<mk<tabl-start',
+        # 'mi<mk<sec-fd-beg',
+        ]
+        self.__end_lines = [
+            'mi<tg<close_____<cell\n',
+        ]
+        self.__id_regex = re.compile(r'\<list-id\>(\d+)')
+        self.__lv_regex = re.compile(r'\<list-level\>(\d+)')
+        self.__found_appt = 0
+        self.__line_num = 0
+
+    def __in_pard_func(self, line):
+        """
+        Required:
+            line -- the line of current text.
+        Return:
+            Nothing
+        Logic:
+            You are in a list, but in the middle of a paragraph definition.
+            Don't do anything until you find the end of the paragraph definition.
+        """
+        if self.__token_info == 'mi<mk<pard-end__':
+            self.__state = 'after_pard'
+        self.__write_obj.write(line)
+
+    def __after_pard_func(self, line):
+        """
+        Required:
+            line -- the line of current text.
+        Return:
+            Nothing
+        Logic:
+            You are in a list, but after a paragraph definition. You have to
+            determine if the last pargraph definition ends a list, continues
+            the old one, or starts a new one.
+            Otherwise, look for a paragraph definition. If one is found, determine if
+            the paragraph definition contains a list-id. If it does, use the method
+            self.__list_after_par_def to determine the action.
+            If the paragraph definition does not contain a list-id, use the method
+            close_lists to close out items and lists for a paragraph that is not
+            If a bigger block is found (such as a section or a cell), end all lists.
+            indented.
+            If no special line is found, add each line to a buffer.
+        """
+        if self.__token_info == 'mi<tg<open-att__' and line[17:37] == 'paragraph-definition':
+            is_heading = self.__is_a_heading()
+            # found paragraph definition and not heading 1
+            search_obj = re.search(self.__id_regex, line)
+            if search_obj and not is_heading:  # found list-id
+                search_obj_lv = re.search(self.__lv_regex, line)
+                if search_obj_lv:
+                    self.__level = search_obj_lv.group(1)
+                num = search_obj.group(1)
+                self.__list_after_par_def_func(line, num)
+                self.__write_obj.write(line)
+                self.__state = 'in_pard'
+            # heading 1
+            elif is_heading:
+                self.__left_indent = -1000
+                self.__close_lists()
+                self.__write_obj.write(self.__list_chunk)
+                self.__list_chunk = ''
+                self.__state = 'default'
+                self.__write_obj.write(line)
+            # Normal with no list id
+            else:
+                self.__close_lists()
+                self.__write_obj.write(self.__list_chunk)
+                self.__list_chunk = ''
+                self.__write_obj.write(line)
+                if len(self.__all_lists) == 0:
+                    self.__state= 'default'
+                else:
+                    self.__state = 'in_pard'
+        # section to end lists
+        elif self.__token_info in self.__end_list :
+            self.__left_indent = -1000
+            self.__close_lists()
+            self.__write_obj.write(self.__list_chunk)
+            self.__list_chunk = ''
+            self.__state = 'default'
+            self.__write_obj.write(line)
+        else:
+            self.__list_chunk += line
+
+    def __list_after_par_def_func(self, line, id):
+        """
+        Required:
+            line -- the line of current text.
+            id -- the id of the current list
+        Return:
+            Nothing
+        Logic:
+            You have found the end of a paragraph definition, and have found
+            another paragraph definition with a list id.
+            If the list-id is different from the last paragraph definition,
+            write the string in the buffer. Close out the lists with another
+            method and start a new list.
+            If the list id is the same as the last one, check the indent on the
+            current paragraph definition. If it is greater than the previous one,
+            do not end the current list or item. Start a new list.
+        """
+        last_list_id = self.__all_lists[-1]['id']
+        if id != last_list_id:
+            self.__close_lists()
+            self.__write_obj.write(self.__list_chunk)
+            self.__write_start_list(id)
+            self.__list_chunk = ''
+        else:
+            last_list_indent = self.__all_lists[-1]['left-indent']
+            if self.__left_indent > last_list_indent:
+                self.__write_obj.write(self.__list_chunk)
+                self.__write_start_list(id)
+            else:
+                self.__write_end_item()
+                self.__write_obj.write(self.__list_chunk)
+                self.__write_start_item()
+            self.__list_chunk = ''
+
+    def __close_lists(self):
+        """
+        Required:
+            Nothing
+        Return:
+            Nothing
+        Logic:
+            Reverse the list of dictionaries. Iterate through the list and
+            get the indent for each list. If the current indent is less than
+            or equal to the indent in the dictionary, close that level.
+            Keep track of how many levels you close. Reduce the list by that
+            many levels.
+            Reverse the list again.
+        """
+        if self.__line_num < 25 and self.__found_appt:
+            sys.stderr.write('in closing out lists\n')
+            sys.stderr.write('current_indent is "%s"\n' % self.__left_indent)
+        current_indent = self.__left_indent
+        self.__all_lists.reverse()
+        num_levels_closed = 0
+        for the_dict in self.__all_lists:
+            list_indent = the_dict.get('left-indent')
+            if self.__line_num < 25 and self.__found_appt:
+                sys.stderr.write('last indent is "%s"' % list_indent)
+            if current_indent <= list_indent:
+                self.__write_end_item()
+                self.__write_end_list()
+                num_levels_closed += 1
+        self.__all_lists = self.__all_lists[num_levels_closed:]
+        self.__all_lists.reverse()
+
+    def __write_end_list(self):
+        """
+        Required:
+            Nothing
+        Return:
+            Nothing
+        Logic:
+            Write the end of a list.
+        """
+        self.__write_obj.write('mi<tg<close_____<list\n')
+        self.__write_obj.write('mi<mk<list_close\n')
+
+    def __write_start_list(self, id):
+        """
+        Required:
+            id -- the id of the current list.
+        Return:
+            Nothing
+        Logic:
+            Write the start of a list and add the id and left-indent to the
+            self.__all_lists list.
+            Write cues of when a list starts for later processing.
+            In order to determine the type of list, you have to iterate through
+            the self.__list_of lists. This list looks like:
+                [[{list-id: [1, 2], [{}], [{}]] [{list-id: [3, 4], [{}]]]
+            I need to get the inside lists of the main lists. Then I need to get
+            the first item of what I just got. This is a dictionary. Get the list-id.
+            This is  a list. Check to see if the current id is in this list. If
+            so, then get the list-type from the dictionary.
+        """
+        the_dict = {}
+        the_dict['left-indent'] = self.__left_indent
+        the_dict['id'] = id
+        self.__all_lists.append(the_dict)
+        self.__write_obj.write(
+            'mi<mk<list_start\n'
+                )
+        # bogus levels are sometimes written for empty paragraphs
+        if unicode_type(self.__level) not in self.__allow_levels:
+            lev_num = '0'
+        else:
+            lev_num = self.__level
+        self.__write_obj.write(
+            'mi<tg<open-att__<list<list-id>%s<level>%s'
+            % (id, lev_num)
+                )
+        list_dict = {}
+        if self.__list_of_lists:  # older RTF won't generate a list_of_lists
+            index_of_list = self.__get_index_of_list(id)
+            if index_of_list is not None:  # found a matching id
+                curlist = self.__list_of_lists[index_of_list]
+                list_dict = curlist[0]
+                level = int(self.__level) + 1
+                if level >= len(curlist):
+                    level = len(curlist) - 1
+                level_dict = curlist[level][0]
+                list_type = level_dict.get('numbering-type')
+                if list_type == 'bullet':
+                    list_type = 'unordered'
+                else:
+                    list_type = 'ordered'
+                self.__write_obj.write(
+                    '<list-type>%s' % (list_type))
+            else:  # no matching id
+                self.__write_obj.write(
+                    '<list-type>%s' % (self.__list_type))
+        else:  # older RTF
+            self.__write_obj.write(
+                '<list-type>%s' % (self.__list_type))
+        # if you want to dump all the info to the list, rather than
+        # keeping it in the table above, change self.__write_list_info
+        # to true.
+        if self.__list_of_lists and self.__write_list_info and list_dict:
+            not_allow = ['list-id',]
+            the_keys_list = list_dict.keys()
+            for the_key in the_keys_list:
+                if the_key in not_allow:
+                    continue
+                self.__write_obj.write('<%s>%s' % (the_key, list_dict[the_key]))
+            the_keys_level = level_dict.keys()
+            for the_key in the_keys_level:
+                self.__write_obj.write('<%s>%s' % (the_key, level_dict[the_key]))
+        self.__write_obj.write('\n')
+        self.__write_obj.write(
+            'mi<mk<liststart_\n'
+                )
+        self.__write_start_item()
+
+    def __get_index_of_list(self, id):
+        """
+        Requires:
+            id -- list id of the current paragraph-definition
+        Returns:
+            The index in self.__list_of_lists of the first entry whose
+            'list-id' collection contains id. None when id is '0' or when
+            no entry matches.
+        Logic:
+            Every entry of self.__list_of_lists starts with a dictionary.
+            The value stored under its 'list-id' key is searched for id.
+            If nothing matches and the run level is above 0, a diagnostic
+            is written to stderr.
+        """
+        # some RTF use 0 indexed list. Don't know what to do?
+        if id == '0':
+            return None
+        for the_index, a_list in enumerate(self.__list_of_lists):
+            ids_in_list = a_list[0].get('list-id')
+            # An entry without a 'list-id' cannot match; testing membership
+            # against None would raise TypeError.
+            if ids_in_list is not None and id in ids_in_list:
+                return the_index
+        if self.__run_level > 0:
+            sys.stderr.write('Module is make_lists.py\n'
+                'Method is __get_index_of_list\n'
+                'The main list does not appear to have a matching id for %s \n'
+                % (id)
+                )
+        return None
+#        if self.__run_level > 3:
+#            msg = 'level is "%s"\n' % self.__run_level
+#            self.__bug_handler
+
+    def __write_start_item(self):
+        """Write the marker, the open tag and the marker that begin an item."""
+        for token in (
+                'mi<mk<item_start\n',
+                'mi<tg<open______<item\n',
+                'mi<mk<itemstart_\n',
+                ):
+            self.__write_obj.write(token)
+
+    def __write_end_item(self):
+        """Write the three lines that end an item, including its close tag."""
+        for token in (
+                'mi<tg<item_end__\n',
+                'mi<tg<close_____<item\n',
+                'mi<tg<item__end_\n',
+                ):
+            self.__write_obj.write(token)
+
+    def __default_func(self, line):
+        """
+        Required:
+            self, line
+        Returns:
+            Nothing
+        Logic:
+            Watch for the opening tag of a paragraph definition. Unless the
+            current style counts as a heading, look for a list id on that
+            line. When one is present, switch the state to in_pard, pick up
+            the level if the line carries one, and start a list. The line
+            itself is always written out afterwards.
+        """
+        opens_pard = (
+            self.__token_info == 'mi<tg<open-att__'
+            and line[17:37] == 'paragraph-definition'
+        )
+        if opens_pard and not self.__is_a_heading():
+            id_match = re.search(self.__id_regex, line)
+            if id_match:
+                self.__state = 'in_pard'
+                level_match = re.search(self.__lv_regex, line)
+                if level_match:
+                    self.__level = level_match.group(1)
+                self.__write_start_list(id_match.group(1))
+        self.__write_obj.write(line)
+
+    def __is_a_heading(self):
+        """
+        Return 1 when the current style name is one of the known headings
+        and either headings_to_sections or no_headings_as_list is set;
+        return 0 otherwise.
+        """
+        if self.__style_name not in self.__headings:
+            return 0
+        if self.__headings_to_sections or self.__no_headings_as_list:
+            return 1
+        return 0
+
+    def __get_indent(self, line):
+        """Remember the left indent carried by a 'mi<mk<left_inden' line."""
+        if self.__token_info != 'mi<mk<left_inden':
+            return
+        # the value sits between the 17-character prefix and the final character
+        indent_text = line[17:-1]
+        self.__left_indent = float(indent_text)
+
+    def __get_list_type(self, line):
+        """
+        Remember the list type carried by a 'mi<mk<list-type_' line; the
+        value 'item' is stored as "unordered".
+        """
+        if self.__token_info != 'mi<mk<list-type_':  # <ordered
+            return
+        found_type = line[17:-1]
+        self.__list_type = "unordered" if found_type == 'item' else found_type
+
+    def __get_style_name(self, line):
+        """Remember the style name carried by a 'mi<mk<style-name' line."""
+        if self.__token_info != 'mi<mk<style-name':
+            return
+        # the name sits between the 17-character prefix and the final character
+        self.__style_name = line[17:-1]
+
+    def make_lists(self):
+        """
+        Required:
+            nothing
+        Returns:
+            nothing; original file will be changed
+        Logic:
+            Read self.__file one line at a time. For each line, store its
+            first 16 characters as the current token and let the indent,
+            list-type and style-name collectors inspect it. Then pass the
+            line to the handler registered for the current state. Output
+            is written to self.__write_to. Afterwards that file is
+            optionally copied for debugging, handed to copy_obj.rename
+            together with self.__file, and removed.
+        """
+        self.__initiate_values()
+        # Context managers close both files even when a state handler
+        # raises half-way through the document.
+        with open_for_read(self.__file) as read_obj, \
+                open_for_write(self.__write_to) as self.__write_obj:
+            line = 1
+            while line:
+                # readline() returns '' at end of file; that empty string is
+                # still dispatched once before the loop ends.
+                line = read_obj.readline()
+                self.__token_info = line[:16]
+                self.__get_indent(line)
+                self.__get_list_type(line)
+                self.__get_style_name(line)
+                action = self.__state_dict.get(self.__state)
+                action(line)
+        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
+        if self.__copy:
+            copy_obj.copy_file(self.__write_to, "make_lists.data")
+        copy_obj.rename(self.__write_to, self.__file)
+        os.remove(self.__write_to)
--- a/ebook_converter/ebooks/rtf2xml/old_rtf.py
+++ b/ebook_converter/ebooks/rtf2xml/old_rtf.py
@@ -0,0 +1,146 @@
+from __future__ import absolute_import, division, print_function, unicode_literals
+#########################################################################
+#                                                                       #
+#                                                                       #
+#   copyright 2002 Paul Henry Tremblay                                  #
+#                                                                       #
+#   This program is distributed in the hope that it will be useful,     #
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of      #
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU    #
+#   General Public License for more details.                            #
+#                                                                       #
+#                                                                       #
+#########################################################################
+import sys
+
+from polyglot.builtins import unicode_type
+
+from . import open_for_read
+
+
+class OldRtf:
+    """
+    Check to see if the RTF is an older version
+    Logic:
+    If allowable control word/properties happen in text without being enclosed
+    in brackets the file will be considered old rtf
+    """
+
+    def __init__(self, in_file,
+                bug_handler,
+                run_level,
+                ):
+        """
+        Required:
+            'in_file' -- file to inspect
+            'bug_handler' -- stored on the instance; not used by the
+            methods of this class
+            'run_level' -- above 3, the construction that made the file
+            count as old RTF is reported on stderr
+        Returns:
+            nothing
+        """
+        self.__file = in_file
+        self.__bug_handler = bug_handler
+        self.__run_level = run_level
+        # Characters 6-16 of the tokens that, when found at the bracket
+        # depth of the body itself, mark the file as old RTF.
+        self.__allowable = [
+            'annotation',
+            'blue______',
+            'bold______',
+            'caps______',
+            'char-style',
+            'dbl-strike',
+            'emboss____',
+            'engrave___',
+            'font-color',
+            'font-down_',
+            'font-size_',
+            'font-style',
+            'font-up___',
+            'footnot-mk',
+            'green_____',
+            'hidden____',
+            'italics___',
+            'outline___',
+            'red_______',
+            'shadow____',
+            'small-caps',
+            'strike-thr',
+            'subscript_',
+            'superscrip',
+            'underlined',
+        ]
+        self.__action_dict = {
+            'before_body'   : self.__before_body_func,
+            'in_body'       : self.__check_tokens_func,
+            'after_pard'    : self.__after_pard_func,
+        }
+
+    def __initiate_values(self):
+        """Reset the per-run state before a file is scanned."""
+        self.__previous_token = ''
+        self.__state = 'before_body'
+        self.__found_new = 0
+        self.__ob_group = 0
+
+    def __check_tokens_func(self, line):
+        """
+        Handler for the 'in_body' state.
+        Returns 'old_rtf' when an allowable token occurs at the bracket
+        depth recorded when the body opened; otherwise returns None. An
+        allowable token at any other depth is only counted. A paragraph
+        definition switches the state to 'after_pard'.
+        """
+        if self.__inline_info in self.__allowable:
+            if self.__ob_group == self.__base_ob_count:
+                return 'old_rtf'
+            else:
+                self.__found_new += 1
+        elif self.__token_info == 'cw<pf<par-def___':
+            self.__state = 'after_pard'
+
+    def __before_body_func(self, line):
+        """
+        Handler for the 'before_body' state: when the body opens, record
+        the current bracket depth and switch to 'in_body'.
+        """
+        if self.__token_info == 'mi<mk<body-open_':
+            self.__state = 'in_body'
+            self.__base_ob_count = self.__ob_group
+
+    def __after_pard_func(self, line):
+        """
+        Handler for the 'after_pard' state: lines starting with 'cw' are
+        skipped; the first other line switches back to 'in_body'.
+        """
+        if line[0:2] != 'cw':
+            self.__state = 'in_body'
+
+    def check_if_old_rtf(self):
+        """
+        Requires:
+            nothing
+        Returns:
+            True if file is older RTf
+            False if file is newer RTF
+        """
+        self.__initiate_values()
+        with open_for_read(self.__file) as read_obj:
+            for line_num, line in enumerate(read_obj, 1):
+                self.__token_info = line[:16]
+                if self.__token_info == 'mi<mk<body-close':
+                    return False
+                if self.__token_info == 'ob<nu<open-brack':
+                    self.__ob_group += 1
+                    self.__ob_count = line[-5:-1]
+                if self.__token_info == 'cb<nu<clos-brack':
+                    self.__ob_group -= 1
+                    self.__cb_count = line[-5:-1]
+                self.__inline_info = line[6:16]
+                if self.__state == 'after_body':
+                    return False
+                action = self.__action_dict.get(self.__state)
+                if action is None:
+                    # Unknown state: report it (best effort, stderr may be
+                    # unusable) and skip the line. Calling action(line)
+                    # here would raise TypeError.
+                    try:
+                        sys.stderr.write('No action for this state!\n')
+                    except Exception:
+                        pass
+                    continue
+                result = action(line)
+                if result == 'new_rtf':
+                    return False
+                elif result == 'old_rtf':
+                    if self.__run_level > 3:
+                        sys.stderr.write(
+                            'Old rtf construction %s (bracket %s, line %s)\n' % (
+                                self.__inline_info, unicode_type(self.__ob_group), line_num)
+                        )
+                    return True
+                self.__previous_token = line[6:16]
+        return False
--- a/ebook_converter/ebooks/rtf2xml/output.py
+++ b/ebook_converter/ebooks/rtf2xml/output.py
@@ -0,0 +1,121 @@
+from __future__ import unicode_literals, absolute_import, print_function, division
+#########################################################################
+#                                                                       #
+#                                                                       #
+#   copyright 2002 Paul Henry Tremblay                                  #
+#                                                                       #
+#   This program is distributed in the hope that it will be useful,     #
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of      #
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU    #
+#   General Public License for more details.                            #
+#                                                                       #
+#                                                                       #
+#########################################################################
+import sys, os
+from polyglot.builtins import raw_input
+from . import open_for_read, open_for_write
+# , codecs
+
+
+class Output:
+    """
+    Output file
+    """
+
+    def __init__(self,
+            file,
+            orig_file,
+            output_dir=None,
+            out_file=None,
+            no_ask=True
+            ):
+        """
+        Required:
+            'file' -- xml file ready to output
+            orig_file -- original rtf file
+        Optional:
+            output_dir -- directory in which to create the output file
+            out_file -- the file to output to
+            no_ask -- when true, an existing file in output_dir is
+            overwritten without asking
+        Returns:
+            nothing
+        """
+        self.__file = file
+        self.__orig_file = orig_file
+        self.__output_dir = output_dir
+        self.__no_ask = no_ask
+        self.__out_file = out_file
+
+    def output(self):
+        """
+        Required:
+            nothing
+        Returns:
+            nothing
+        Logic:
+            Output into the output directory if one was given, otherwise
+            to the output file if one was given, otherwise to the screen.
+        """
+        if self.__output_dir:
+            self.__output_to_dir_func()
+        elif self.__out_file:
+            self.__output_to_file_func()
+            # self.__output_xml(self.__file, self.__out_file)
+        else:
+            self.__output_to_standard_func()
+
+    def __output_to_dir_func(self):
+        """
+        Requires:
+            nothing
+        Returns:
+            nothing
+        Logic:
+            Create a file within the output directory, named after the
+            original file with an .xml extension unless an output file
+            name was given. If that file already exists and asking is
+            enabled, prompt before overwriting; any answer other than "o"
+            sends the result to standard output instead.
+            Read one line at a time. Output line to the newly-created file.
+        """
+        base_name = os.path.splitext(os.path.basename(self.__orig_file))[0]
+        output_file = os.path.join(self.__output_dir, '%s.xml' % base_name)
+        # change if user wants to output to a specific file
+        if self.__out_file:
+            output_file = os.path.join(self.__output_dir, self.__out_file)
+        user_response = 'o'
+        if os.path.isfile(output_file) and not self.__no_ask:
+            msg = 'Do you want to overwrite %s?\n' % output_file
+            msg += ('Type "o" to overwrite.\n'
+                    'Type any other key to print to standard output.\n')
+            sys.stderr.write(msg)
+            user_response = raw_input()
+        if user_response == 'o':
+            with open_for_read(self.__file) as read_obj:
+                # output_file is a local name; the instance has no
+                # output_file attribute.
+                with open_for_write(output_file) as write_obj:
+                    for line in read_obj:
+                        write_obj.write(line)
+        else:
+            self.__output_to_standard_func()
+
+    def __output_to_file_func(self):
+        """
+        Required:
+            nothing
+        Returns:
+            nothing
+        Logic:
+            read one line at a time. Output to the requested output file
+        """
+        with open_for_read(self.__file) as read_obj:
+            with open_for_write(self.__out_file) as write_obj:
+                for line in read_obj:
+                    write_obj.write(line)
+
+    def __output_to_standard_func(self):
+        """
+        Required:
+            nothing
+        Returns:
+            nothing
+        Logic:
+            read one line at a time. Output to standard
+        """
+        with open_for_read(self.__file) as read_obj:
+            for line in read_obj:
+                sys.stdout.write(line)
--- a/ebook_converter/ebooks/rtf2xml/override_table.py
+++ b/ebook_converter/ebooks/rtf2xml/override_table.py
@@ -0,0 +1,209 @@
+from __future__ import unicode_literals, absolute_import, print_function, division
+from __future__ import print_function
+#########################################################################
+#                                                                       #
+#                                                                       #
+#   copyright 2002 Paul Henry Tremblay                                  #
+#                                                                       #
+#   This program is distributed in the hope that it will be useful,     #
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of      #
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU    #
+#   General Public License for more details.                            #
+#                                                                       #
+#                                                                       #
+#########################################################################
+
+
+class OverrideTable:
+    """
+    Parse a line of text to make the override table. Return a string
+    (which will convert to XML) and the dictionary containing all the
+    information about the lists. This dictionary is the result of the
+    dictionary that is first passed to this module. This module
+    modifies the dictionary, assigning lists numbers to each list.
+    """
+
+    def __init__(
+                self,
+                list_of_lists,
+                run_level=1,
+                bug_handler=None,
+                ):
+        """
+        Required:
+            list_of_lists -- list information; the first item of each
+            entry is a dictionary read for 'list-table-id' and whose
+            'list-id' list is appended to
+        Optional:
+            run_level -- above 3, malformed input raises instead of
+            being tolerated
+            bug_handler -- exception class raised for malformed input;
+            Exception is used when none is given
+        Returns:
+            nothing
+        """
+        self.__list_of_lists = list_of_lists
+        self.__initiate_values()
+        self.__run_level = run_level
+        # The raise statements below need an exception class even when
+        # the caller did not supply one.
+        self.__bug_handler = Exception if bug_handler is None else bug_handler
+
+    def __initiate_values(self):
+        """Set up the result string, the state machine and the token map."""
+        self.__override_table_final = ''
+        self.__state = 'default'
+        self.__override_list = []
+        self.__state_dict = {
+            'default'       : self.__default_func,
+            'override'      : self.__override_func,
+            'unsure_ob'     : self.__after_bracket_func,
+        }
+        self.__override_dict = {
+            'cw<ls<lis-tbl-id'  :       'list-table-id',
+            'cw<ls<list-id___'  :       'list-id',
+        }
+
+    def __override_func(self, line):
+        """
+        Requires:
+            line -- line to parse
+        Returns:
+            nothing
+        Logic:
+            The group {\\override has been found.
+            Check for the end of the group.
+            Otherwise, add appropriate tokens to the override dictionary.
+        """
+        if self.__token_info == 'cb<nu<clos-brack' and\
+            self.__cb_count == self.__override_ob_count:
+            self.__state = 'default'
+            self.__parse_override_dict()
+        else:
+            att = self.__override_dict.get(self.__token_info)
+            if att:
+                # the value follows the 16-character token and '<nu<'
+                value = line[20:]
+                self.__override_list[-1][att] = value
+
+    def __parse_override_dict(self):
+        """
+        Requires:
+            nothing
+        Returns:
+            nothing
+        Logic:
+            The list of all information about RTF lists has been passed to
+            this module. As of this point, this python list has no id number,
+            which is needed later to identify which lists in the body should
+            be assigned which formatting commands from the list-table.
+            In order to get an id, I have to check to see when the list-table-id
+            from the override_dict (generated in this module) matches the list-table-id
+            in list_of_lists (generated in the list_table.py module). When a match is found,
+            append the lists numbers to the self.__list_of_lists dictionary
+            that contains the empty lists:
+                [[{list-id:[HERE!],[{}]]
+            This is a list, since one list in the table in the preamble of RTF can
+            apply to multiple lists in the body.
+        """
+        override_dict = self.__override_list[-1]
+        list_id = override_dict.get('list-id')
+        # The run level decides whether a missing id is fatal.
+        if list_id is None and self.__run_level > 3:
+            msg = 'This override does not appear to have a list-id\n'
+            raise self.__bug_handler(msg)
+        current_table_id = override_dict.get('list-table-id')
+        if current_table_id is None and self.__run_level > 3:
+            msg = 'This override does not appear to have a list-table-id\n'
+            raise self.__bug_handler(msg)
+        for a_list in self.__list_of_lists:
+            info_dict = a_list[0]
+            if info_dict.get('list-table-id') == current_table_id:
+                # only the first matching table receives the id
+                info_dict['list-id'].append(list_id)
+                break
+
+    def __parse_lines(self, line):
+        """
+        Requires:
+            line -- line to parse
+        Returns:
+            nothing
+        Logic:
+            Break the line into tokens by splitting it on the newline.
+            Call on the method according to the state.
+        """
+        lines = line.split('\n')
+        self.__ob_count = 0
+        self.__ob_group = 0
+        for line in lines:
+            self.__token_info = line[:16]
+            if self.__token_info == 'ob<nu<open-brack':
+                self.__ob_count = line[-4:]
+                self.__ob_group += 1
+            if self.__token_info == 'cb<nu<clos-brack':
+                self.__cb_count = line[-4:]
+                self.__ob_group -= 1
+            action = self.__state_dict.get(self.__state)
+            if action is None:
+                # Unknown state: report it and skip the token. Calling
+                # None would raise TypeError.
+                print(self.__state)
+                continue
+            action(line)
+        self.__write_final_string()
+        # self.__add_to_final_line()
+
+    def __default_func(self, line):
+        """
+        Requires:
+            line -- line to parse
+        Return:
+            nothing
+        Logic:
+            Look for an open bracket and change states when found.
+        """
+        if self.__token_info == 'ob<nu<open-brack':
+            self.__state = 'unsure_ob'
+
+    def __after_bracket_func(self, line):
+        """
+        Requires:
+            line -- line to parse
+        Returns:
+            nothing
+        Logic:
+            The last token was an open bracket. You need to determine
+            the group based on the token after.
+            WARNING: this could cause problems. If no group is found, the
+            state will remain unsure_ob, which means no other text will be
+            parsed. I should do states by a list and simply pop this
+            unsure_ob state to get the previous state.
+        """
+        if self.__token_info == 'cw<ls<lis-overid':
+            self.__state = 'override'
+            self.__override_ob_count = self.__ob_count
+            the_dict = {}
+            self.__override_list.append(the_dict)
+        elif self.__run_level > 3:
+            msg = 'No matching token after open bracket\n'
+            msg += 'token is "%s"\n' % (line)
+            raise self.__bug_handler(msg)
+
+    def __write_final_string(self):
+        """
+        Requires:
+            nothing
+        Returns:
+            nothing
+        Logic:
+            First write out the override-table tag.
+            Iterate through the dictionaries in the main override_list.
+            For each dictionary, write an empty tag "override-list". Add
+            the attributes and values of the tag from the dictionary.
+        """
+        start_marker = 'mi<mk<over_beg_\n'
+        # NOTE(review): the start marker appears both before and after
+        # the open tag, exactly as the original concatenation produced
+        # it. That looks unintended, but the output is kept unchanged.
+        pieces = [
+            start_marker,
+            'mi<tg<open______<override-table\n',
+            'mi<mk<overbeg__\n',
+            start_marker,
+        ]
+        for the_dict in self.__override_list:
+            pieces.append('mi<tg<empty-att_<override-list')
+            for the_key in the_dict:
+                pieces.append('<%s>%s' % (the_key, the_dict[the_key]))
+            pieces.append('\n')
+        pieces.append('\n')
+        pieces.append('mi<mk<overri-end\n')
+        pieces.append('mi<tg<close_____<override-table\n')
+        pieces.append('mi<mk<overribend_\n')
+        self.__override_table_final = ''.join(pieces)
+
+    def parse_override_table(self, line):
+        """
+        Requires:
+            line -- text of the override table, one token per line
+        Returns:
+            A string that will be converted to XML, and the list of
+            lists passed to the constructor, with the list ids found in
+            the overrides appended.
+        Logic:
+            Parse the tokens, then return the accumulated results.
+        """
+        self.__parse_lines(line)
+        return self.__override_table_final, self.__list_of_lists
--- a/ebook_converter/ebooks/rtf2xml/paragraph_def.py
+++ b/ebook_converter/ebooks/rtf2xml/paragraph_def.py
@@ -0,0 +1,763 @@
+from __future__ import absolute_import, division, print_function, unicode_literals
+#########################################################################
+#                                                                       #
+#                                                                       #
+#   copyright 2002 Paul Henry Tremblay                                  #
+#                                                                       #
+#   This program is distributed in the hope that it will be useful,     #
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of      #
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU    #
+#   General Public License for more details.                            #
+#                                                                       #
+#                                                                       #
+#########################################################################
+import sys, os
+
+from calibre.ebooks.rtf2xml import copy, border_parse
+from calibre.ptempfile import better_mktemp
+from polyglot.builtins import unicode_type
+
+from . import open_for_read, open_for_write
+
+
+class ParagraphDef:
+    """
+=================
+Purpose
+=================
+Write paragraph definition tags.
+States:
+1. before_1st_para_def.
+Before any para_def token is found. This means all the text in the preamble.
+Look for the token 'cw<pf<par-def___'. This will change the state to collect_tokens.
+2. collect_tokens.
+Found a paragraph_def. Need to get all tokens.
+Changes with the start of a paragraph ('mi<mk<para-start'). State then becomes
+in_paragraphs
+If another paragraph definition is found, the state does not change.
+But the dictionary is reset.
+3. in_paragraphs
+State changes when 'mi<mk<para-end__', or end of paragraph is found.
+State then becomes 'self.__state = 'after_para_end'
+4. after_para_end
+If 'mi<mk<para-start' (the start of a paragraph) or 'mi<mk<para-end__' (the end of a paragraph--must be empty paragraph?) are found:
+    state changes to 'in_paragraphs'
+If 'cw<pf<par-def___' (paragraph_definition) is found:
+    state changes to collect_tokens
+if 'mi<mk<body-close', 'mi<mk<par-in-fld',
+'cw<tb<cell______','cw<tb<row-def___','cw<tb<row_______',
+'mi<mk<sect-close',   'mi<mk<header-beg',  'mi<mk<header-end'
+are found. (All these tokens mark the start of a bigger element. para_def must
+be closed:
+    state changes to  'after_para_def'
+5. after_para_def
+'mi<mk<para-start'  changes state to in_paragraphs
+if another paragraph_def is found, the state changes to collect_tokens.
+    """
+
+    def __init__(self,
+        in_file,
+        bug_handler,
+        default_font,
+        copy=None,
+        run_level=1,):
+        """
+        Required:
+            'in_file' -- file to parse
+            'bug_handler' -- stored on the instance
+            'default_font' -- document default font
+        Optional:
+            'copy' -- whether to make a copy of result for debugging
+            'run_level' -- stored on the instance
+        Returns:
+            nothing
+        """
+        # what to parse
+        self.__file = in_file
+        self.__default_font = default_font
+        # how to run
+        self.__run_level = run_level
+        self.__copy = copy
+        self.__bug_handler = bug_handler
+        # path obtained from better_mktemp()
+        self.__write_to = better_mktemp()
+
+    def __initiate_values(self):
+        """
+        Initiate all values.
+        """
+        # Dictionary needed to convert shortened style names to readable names
+        self.__token_dict={
+        # paragraph formatting => pf
+        'par-end___'    : 'para',
+        'par-def___'    : 'paragraph-definition',
+        'keep-w-nex'    : 'keep-with-next',
+        'widow-cntl'    : 'widow-control',
+        'adjust-rgt'    : 'adjust-right',
+        'language__'    : 'language',
+        'right-inde'    : 'right-indent',
+        'fir-ln-ind'    : 'first-line-indent',
+        'left-inden'    : 'left-indent',
+        'space-befo'    : 'space-before',
+        'space-afte'    : 'space-after',
+        'line-space'    : 'line-spacing',
+        'default-ta'    : 'default-tab',
+        'align_____'    : 'align',
+        'widow-cntr'    : 'widow-control',
+        # stylesheet = > ss
+        'style-shet'    : 'stylesheet',
+        'based-on__'    : 'based-on-style',
+        'next-style'    : 'next-style',
+        'char-style'    : 'character-style',
+        # this is changed to get a nice attribute
+        'para-style'    : 'name',
+        # graphics => gr
+        'picture___'    : 'pict',
+        'obj-class_'    : 'obj_class',
+        'mac-pic___'    : 'mac-pict',
+        # section => sc
+        'section___'    : 'section-new',
+        'sect-defin'    : 'section-reset',
+        'sect-note_'    : 'endnotes-in-section',
+        # list=> ls
+        'list-text_'    : 'list-text',
+        # this line must be wrong because it duplicates an earlier one
+        'list-text_'    : 'list-text',
+        'list______'    : 'list',
+        'list-lev-d'    : 'list-level-definition',
+        'list-cardi'    : 'list-cardinal-numbering',
+        'list-decim'    : 'list-decimal-numbering',
+        'list-up-al'    : 'list-uppercase-alphabetic-numbering',
+        'list-up-ro'    : 'list-uppercae-roman-numbering',
+        'list-ord__'    : 'list-ordinal-numbering',
+        'list-ordte'    : 'list-ordinal-text-numbering',
+        'list-bulli'    : 'list-bullet',
+        'list-simpi'    : 'list-simple',
+        'list-conti'    : 'list-continue',
+        'list-hang_'    : 'list-hang',
+        # 'list-tebef'    :	'list-text-before',
+        # 'list-level'    : 'level',
+        'list-id___'    : 'list-id',
+        'list-start'    : 'list-start',
+        'nest-level'    : 'nest-level',
+        # duplicate
+        'list-level'    : 'list-level',
+        # notes => nt
+        'footnote__'    : 'footnote',
+        'type______'    : 'type',
+        # anchor => an
+        'toc_______'    : 'anchor-toc',
+        'book-mk-st'    : 'bookmark-start',
+        'book-mk-en'    : 'bookmark-end',
+        'index-mark'    : 'anchor-index',
+        'place_____'    : 'place',
+        # field => fd
+        'field_____'    : 'field',
+        'field-inst'    : 'field-instruction',
+        'field-rslt'    : 'field-result',
+        'datafield_'    : 'data-field',
+        # info-tables => it
+        'font-table'    : 'font-table',
+        'colr-table'    : 'color-table',
+        'lovr-table'    : 'list-override-table',
+        'listtable_'    : 'list-table',
+        'revi-table'    : 'revision-table',
+        # character info => ci
+        'hidden____'    : 'hidden',
+        'italics___'    : 'italics',
+        'bold______'    : 'bold',
+        'strike-thr'   : 'strike-through',
+        'shadow____'   : 'shadow',
+        'outline___'   : 'outline',
+        'small-caps'   : 'small-caps',
+        'caps______'   :       'caps',
+        'dbl-strike'   : 'double-strike-through',
+        'emboss____'    : 'emboss',
+        'engrave___'    : 'engrave',
+        'subscript_'    : 'subscript',
+        'superscrip'    : 'superscipt',
+        'font-style'    : 'font-style',
+        'font-color'    : 'font-color',
+        'font-size_'    : 'font-size',
+        'font-up___'    : 'superscript',
+        'font-down_'    : 'subscript',
+        'red_______'    : 'red',
+        'blue______'    : 'blue',
+        'green_____'    : 'green',
+        # table => tb
+        'row-def___'    : 'row-definition',
+        'cell______'    : 'cell',
+        'row_______'    : 'row',
+        'in-table__'    : 'in-table',
+        'columns___'    : 'columns',
+        'row-pos-le'    : 'row-position-left',
+        'cell-posit'    : 'cell-position',
+        # preamble => pr
+        # underline
+        'underlined'    : 'underlined',
+        # border => bd
+        'bor-t-r-hi'    : 'border-table-row-horizontal-inside',
+        'bor-t-r-vi'    : 'border-table-row-vertical-inside',
+        'bor-t-r-to'    : 'border-table-row-top',
+        'bor-t-r-le'    : 'border-table-row-left',
+        'bor-t-r-bo'    : 'border-table-row-bottom',
+        'bor-t-r-ri'    : 'border-table-row-right',
+        'bor-cel-bo'    : 'border-cell-bottom',
+        'bor-cel-to'    : 'border-cell-top',
+        'bor-cel-le'    : 'border-cell-left',
+        'bor-cel-ri'    : 'border-cell-right',
+        # 'bor-par-bo'    : 'border-paragraph-bottom',
+        'bor-par-to'    : 'border-paragraph-top',
+        'bor-par-le'    : 'border-paragraph-left',
+        'bor-par-ri'    : 'border-paragraph-right',
+        'bor-par-bo'    : 'border-paragraph-box',
+        'bor-for-ev'    : 'border-for-every-paragraph',
+        'bor-outsid'    : 'border-outisde',
+        'bor-none__'    : 'border',
+        # border type => bt
+        'bdr-single'    : 'single',
+        'bdr-doubtb'    : 'double-thickness-border',
+        'bdr-shadow'    : 'shadowed-border',
+        'bdr-double'    : 'double-border',
+        'bdr-dotted'    : 'dotted-border',
+        'bdr-dashed'    : 'dashed',
+        'bdr-hair__'    : 'hairline',
+        'bdr-inset_'    : 'inset',
+        'bdr-das-sm'    : 'dash-small',
+        'bdr-dot-sm'    : 'dot-dash',
+        'bdr-dot-do'    : 'dot-dot-dash',
+        'bdr-outset'    : 'outset',
+        'bdr-trippl'    : 'tripple',
+        'bdr-thsm__'    : 'thick-thin-small',
+        'bdr-htsm__'    : 'thin-thick-small',
+        'bdr-hthsm_'    : 'thin-thick-thin-small',
+        'bdr-thm__'     : 'thick-thin-medium',
+        'bdr-htm__'     : 'thin-thick-medium',
+        'bdr-hthm_'     : 'thin-thick-thin-medium',
+        'bdr-thl__'     : 'thick-thin-large',
+        'bdr-hthl_'     : 'think-thick-think-large',
+        'bdr-wavy_'     : 'wavy',
+        'bdr-d-wav'     : 'double-wavy',
+        'bdr-strip'     : 'striped',
+        'bdr-embos'     : 'emboss',
+        'bdr-engra'     : 'engrave',
+        'bdr-frame'     : 'frame',
+        'bdr-li-wid'    : 'line-width',
+        }
+        self.__tabs_dict = {
+        'cw<pf<tab-stop__'  :   self.__tab_stop_func,
+        'cw<pf<tab-center'  :   self.__tab_type_func,
+        'cw<pf<tab-right_'  :   self.__tab_type_func,
+        'cw<pf<tab-dec___'  :   self.__tab_type_func,
+        'cw<pf<leader-dot'  :   self.__tab_leader_func,
+        'cw<pf<leader-hyp'  :   self.__tab_leader_func,
+        'cw<pf<leader-und'  :   self.__tab_leader_func,
+        'cw<pf<tab-bar-st'  :   self.__tab_bar_func,
+        }
+        self.__tab_type_dict = {
+        'cw<pf<tab-center'  :   'center',
+        'cw<pf<tab-right_'  :   'right',
+        'cw<pf<tab-dec___'  :   'decimal',
+        'cw<pf<leader-dot'  :   'leader-dot',
+        'cw<pf<leader-hyp'  :   'leader-hyphen',
+        'cw<pf<leader-und'  :   'leader-underline',
+        }
+        self.__border_obj = border_parse.BorderParse()
+        self.__style_num_strings = []
+        self.__body_style_strings = []
+        self.__state = 'before_1st_para_def'
+        self.__att_val_dict = {}
+        self.__start_marker =  'mi<mk<pard-start\n'  # outside para tags
+        self.__start2_marker = 'mi<mk<pardstart_\n'  # inside para tags
+        self.__end2_marker =   'mi<mk<pardend___\n'  # inside para tags
+        self.__end_marker =    'mi<mk<pard-end__\n'  # outside para tags
+        self.__text_string = ''
+        self.__state_dict = {
+        'before_1st_para_def'   : self.__before_1st_para_def_func,
+        'collect_tokens'        : self.__collect_tokens_func,
+        'after_para_def'        : self.__after_para_def_func,
+        'in_paragraphs'         : self.__in_paragraphs_func,
+        'after_para_end'        : self.__after_para_end_func,
+        }
+        self.__collect_tokens_dict = {
+        'mi<mk<para-start'  :  self.__end_para_def_func,
+        'cw<pf<par-def___'  :  self.__para_def_in_para_def_func,
+        'cw<tb<cell______'  : self.__empty_table_element_func,
+        'cw<tb<row_______'  : self.__empty_table_element_func,
+        }
+        self.__after_para_def_dict = {
+        'mi<mk<para-start'  :   self.__start_para_after_def_func,
+        'cw<pf<par-def___'  :   self.__found_para_def_func,
+        'cw<tb<cell______'  :   self.__empty_table_element_func,
+        'cw<tb<row_______'  :   self.__empty_table_element_func,
+        }
+        self.__in_paragraphs_dict = {
+        'mi<mk<para-end__'      : self.__found_para_end_func,
+        }
+        self.__after_para_end_dict = {
+        'mi<mk<para-start'      : self.__continue_block_func,
+        'mi<mk<para-end__'      : self.__continue_block_func,
+        'cw<pf<par-def___'      : self.__new_para_def_func,
+        'mi<mk<body-close'      : self.__stop_block_func,
+        'mi<mk<par-in-fld'      : self.__stop_block_func,
+        'cw<tb<cell______'      : self.__stop_block_func,
+        'cw<tb<row-def___'      : self.__stop_block_func,
+        'cw<tb<row_______'      : self.__stop_block_func,
+        'mi<mk<sect-close'      : self.__stop_block_func,
+        'mi<mk<sect-start'      : self.__stop_block_func,
+        'mi<mk<header-beg'      : self.__stop_block_func,
+        'mi<mk<header-end'      : self.__stop_block_func,
+        'mi<mk<head___clo'      : self.__stop_block_func,
+        'mi<mk<fldbk-end_'      : self.__stop_block_func,
+        'mi<mk<lst-txbeg_'      : self.__stop_block_func,
+        }
+
+    def __before_1st_para_def_func(self, line):
+        """
+        Required:
+            line -- line to parse
+        Returns:
+            nothing
+        Logic:
+            Until the first paragraph definition token is seen, copy every
+            line to the output unchanged. The paragraph definition token
+            itself is not written; it is handed to __found_para_def_func,
+            which starts the collection of tokens.
+        """
+        # the token looks like: cw<pf<par-def___<nu<true
+        if self.__token_info != 'cw<pf<par-def___':
+            self.__write_obj.write(line)
+            return
+        self.__found_para_def_func()
+
+    def __found_para_def_func(self):
+        """
+        Requires:
+            nothing
+        Returns:
+            nothing
+        Logic:
+            A paragraph definition has been found. Give the attribute
+            dictionary its default values and start collecting the tokens
+            that follow.
+        """
+        # not exactly right--have to reset the dictionary--give it default
+        # values
+        self.__reset_dict()
+        self.__state = 'collect_tokens'
+
+    def __collect_tokens_func(self, line):
+        """
+        Required:
+            line --line to parse
+        Returns:
+            nothing
+        Logic:
+            The tokens after a paragraph definition are examined in this
+            order, and the first rule that matches wins:
+            1. A token listed in collect_tokens_dict is passed to its
+               handler.
+            2. A line that is not a control word is written out and the
+               state changes to after_para_def.
+            3. A border control word (cw<bd) is given to __parse_border.
+            4. A token listed in tabs_dict is passed to its handler.
+            5. Any other control word that has an entry in token_dict is
+               stored in the attributes and values dictionary; a control
+               word without an entry is dropped.
+        """
+        handler = self.__collect_tokens_dict.get(self.__token_info)
+        if handler:
+            handler(line)
+            return
+        if line[0:2] != 'cw':
+            self.__write_obj.write(line)
+            self.__state = 'after_para_def'
+            return
+        if line[0:5] == 'cw<bd':
+            self.__parse_border(line)
+            return
+        tab_handler = self.__tabs_dict.get(self.__token_info)
+        if tab_handler:
+            tab_handler(line)
+            return
+        att_name = self.__token_dict.get(line[6:16])
+        if att_name:
+            self.__att_val_dict[att_name] = line[20:-1]
+
+    def __tab_stop_func(self, line):
+        """
+        Requires:
+            line --tab stop token
+        Returns:
+            nothing
+        Logic:
+            Append 'type:value;' to the 'tabs' attribute, where type is the
+            pending tab type and value is line[20:-1]. The pending tab type
+            then goes back to 'left'.
+        """
+        entry = '%s:%s;' % (self.__tab_type, line[20:-1])
+        self.__att_val_dict['tabs'] += entry
+        self.__tab_type = 'left'
+
+    def __tab_type_func(self, line):
+        """
+        Requires:
+            line --tab type token (not used; the current token info is)
+        Returns:
+            nothing
+        Logic:
+            Look up the current token in tab_type_dict and remember the
+            result as the pending tab type. A token without an entry is
+            ignored, unless the run level is above 3, in which case the bug
+            handler exception is raised.
+        """
+        tab_kind = self.__tab_type_dict.get(self.__token_info)
+        if tab_kind is not None:
+            self.__tab_type = tab_kind
+        elif self.__run_level > 3:
+            msg = 'no entry for %s\n' % self.__token_info
+            raise self.__bug_handler(msg)
+
+    def __tab_leader_func(self, line):
+        """
+        Requires:
+            line --tab leader token (not used; the current token info is)
+        Returns:
+            nothing
+        Logic:
+            Look up the current token in tab_type_dict and append
+            'leader^' to the 'tabs' attribute. A token without an entry is
+            ignored, unless the run level is above 3, in which case the bug
+            handler exception is raised.
+        """
+        leader_name = self.__tab_type_dict.get(self.__token_info)
+        if leader_name is not None:
+            self.__att_val_dict['tabs'] += '%s^' % leader_name
+        elif self.__run_level > 3:
+            msg = 'no entry for %s\n' % self.__token_info
+            raise self.__bug_handler(msg)
+
+    def __tab_bar_func(self, line):
+        """
+        Requires:
+            line --bar tab token
+        Returns:
+            nothing
+        Logic:
+            Append 'bar:value;' to the 'tabs' attribute, where value is
+            line[20:-1], and set the pending tab type back to 'left'.
+        """
+        bar_entry = 'bar:%s;' % (line[20:-1])
+        self.__att_val_dict['tabs'] = self.__att_val_dict['tabs'] + bar_entry
+        self.__tab_type = 'left'
+
+    def __parse_border(self, line):
+        """
+        Requires:
+            line --line to parse
+        Returns:
+            nothing (updates dictionary)
+        Logic:
+            Hand the border line to the border parser object and merge the
+            attribute value pairs it returns into the attributes and values
+            dictionary.
+        """
+        self.__att_val_dict.update(self.__border_obj.parse_border(line))
+
+    def __para_def_in_para_def_func(self, line):
+        """
+        Requires:
+            line --line to parse (not used)
+        Returns:
+            nothing
+        Logic:
+            A \\pard was found while tokens were being collected. Throw away
+            what has been collected so far by resetting the dictionary, and
+            keep collecting.
+        """
+        # Change this
+        self.__reset_dict()
+        self.__state = 'collect_tokens'
+
+    def __end_para_def_func(self, line):
+        """
+        Requires:
+            line --the line marking the start of a paragraph
+        Returns:
+            Nothing
+        Logic:
+            Tokens were being collected and the start of a paragraph has
+            been found. Write the opening paragraph definition tag, then the
+            line itself, and switch the state to 'in_paragraphs'.
+        """
+        output = self.__write_obj
+        self.__write_para_def_beg()
+        output.write(line)
+        self.__state = 'in_paragraphs'
+
+    def __start_para_after_def_func(self, line):
+        """
+        Requires:
+            line --the line marking the start of a paragraph
+        Returns:
+            Nothing
+        Logic:
+            The state is after_para_def and the start of a paragraph has
+            been found. The steps are the same as when the paragraph starts
+            right after the tokens were collected: write the opening
+            paragraph definition tag, write the line, and change the state
+            to 'in_paragraphs'. So simply reuse that method.
+        """
+        self.__end_para_def_func(line)
+
+    def __after_para_def_func(self, line):
+        """
+        Requires:
+            line -- line to parse
+        Returns:
+            nothing
+        Logic:
+            A new paragraph definition restarts the collection of tokens.
+            Any other token listed in after_para_def_dict is passed to its
+            handler. Everything else is written out unchanged.
+        """
+        if self.__token_info == 'cw<pf<par-def___':
+            # called directly because this method takes no line argument
+            self.__found_para_def_func()
+            return
+        handler = self.__after_para_def_dict.get(self.__token_info)
+        if handler:
+            handler(line)
+        else:
+            self.__write_obj.write(line)
+
+    def __in_paragraphs_func(self, line):
+        """
+        Requires:
+            line --current line
+        Returns:
+            nothing
+        Logic:
+            If the token has a handler in in_paragraphs_dict (the end of a
+            paragraph), call it. Otherwise write the line out unchanged.
+        """
+        handler = self.__in_paragraphs_dict.get(self.__token_info)
+        if not handler:
+            self.__write_obj.write(line)
+            return
+        handler(line)
+
+    def __found_para_end_func(self, line):
+        """
+        Requires:
+            line -- line to print out
+        Returns:
+            Nothing
+        Logic:
+            The end of a paragraph was found while in paragraphs. The state
+            becomes after_para_end and the line is written out.
+        """
+        self.__state = 'after_para_end'
+        output = self.__write_obj
+        output.write(line)
+
+    def __after_para_end_func(self, line):
+        """
+        Requires:
+            line -- line to output
+        Returns:
+            nothing
+        Logic:
+            After the end of a paragraph nothing is written right away.
+            Every line, including the current one, is added to the text
+            string. If the token has a handler in after_para_end_dict, the
+            handler decides what happens to the text string: whether the
+            block of paragraphs continues, whether a new paragraph
+            definition starts, or whether a larger block element (cell, row,
+            field-block, section and so on) closes the definition.
+        """
+        self.__text_string += line
+        handler = self.__after_para_end_dict.get(self.__token_info)
+        if not handler:
+            return
+        handler(line)
+
+    def __continue_block_func(self, line):
+        """
+        Requires:
+            line --line to print out (it is already part of the text string)
+        Returns:
+            Nothing
+        Logic:
+            The paragraph definition still applies, so no definition tag is
+            needed. Change the state to in_paragraphs, write out the text
+            string and empty it.
+        """
+        pending, self.__text_string = self.__text_string, ''
+        self.__state = 'in_paragraphs'
+        self.__write_obj.write(pending)
+    # found a new paragraph definition after an end of a paragraph
+
+    def __new_para_def_func(self, line):
+        """
+        Requires:
+            line -- line to output (it is already part of the text string)
+        Returns:
+            Nothing
+        Logic:
+            A new paragraph definition follows the end of a paragraph. Close
+            the old definition (this also writes out and empties the text
+            string), then start collecting tokens for the new definition
+            with a freshly reset dictionary.
+        """
+        self.__write_para_def_end_func()
+        # same steps as __found_para_def_func
+        self.__state = 'collect_tokens'
+        self.__reset_dict()
+    # after a paragraph and found reason to stop this block
+
+    def __stop_block_func(self, line):
+        """
+        Requires:
+            line --(not used)
+        Returns:
+            nothing
+        Logic:
+            After a paragraph, a block larger than a paragraph definition
+            was found. The state goes back to after_para_def, and the end
+            tag of the old definition is written; writing the end tag also
+            takes care of the text string.
+        """
+        self.__state = 'after_para_def'
+        self.__write_para_def_end_func()
+
+    def __write_para_def_end_func(self):
+        """
+        Requires:
+            nothing
+        Returns:
+            nothing
+        Logic:
+            Write the inner end marker, the closing paragraph definition
+            tag, the outer end marker and then the text string, which is
+            emptied afterwards. (The markers are used for later parsing.)
+            If the definition has a font style or a caps attribute, the
+            matching end marker is written as well.
+        """
+        output = self.__write_obj
+        for chunk in (
+                self.__end2_marker,
+                'mi<tg<close_____<paragraph-definition\n',
+                self.__end_marker,
+                self.__text_string,
+                ):
+            output.write(chunk)
+        self.__text_string = ''
+        if 'font-style' in self.__att_val_dict:
+            output.write('mi<mk<font-end__\n')
+        if 'caps' in self.__att_val_dict:
+            output.write('mi<mk<caps-end__\n')
+
+    def __get_num_of_style(self):
+        """
+        Requires:
+            nothing
+        Returns:
+            nothing
+        Logic:
+            Build a string from the sorted attributes and their values and
+            use its position (counting from 1) in the list of strings seen
+            so far as the style number. The number is stored as 's' plus
+            four digits under 'style-num'. A string not seen before is
+            added to the list and its body style is recorded.
+        """
+        # when determining uniqueness for a style, ignore these values, since
+        # they don't tell us if the style is unique
+        skipped = ('style-num', 'nest-level', 'in-table')
+        signature = ''.join(
+            '%s:%s' % (key, self.__att_val_dict[key])
+            for key in sorted(self.__att_val_dict)
+            if key not in skipped)
+        try:
+            position = self.__style_num_strings.index(signature)
+        except ValueError:
+            self.__style_num_strings.append(signature)
+            number = len(self.__style_num_strings)
+            is_new = True
+        else:
+            # indexing starts at zero, style numbers start at 1
+            number = position + 1
+            is_new = False
+        self.__att_val_dict['style-num'] = 's' + unicode_type('%04d' % number)
+        if is_new:
+            self.__write_body_styles()
+
+    def __write_body_styles(self):
+        """
+        Requires:
+            nothing
+        Returns:
+            nothing
+        Logic:
+            Make one empty tag line for the current paragraph style and add
+            it to the list of body style strings. The tag has the name and
+            the style number first, then the tabs (if there are any), then
+            the remaining attributes in sorted order. The in-table
+            attribute and the single tab attributes are left out.
+        """
+        atts = self.__att_val_dict
+        pieces = [
+            'mi<tg<empty-att_<paragraph-style-in-body',
+            '<name>%s' % atts['name'],
+            '<style-number>%s' % atts['style-num'],
+        ]
+        if atts['tabs'] != '':
+            pieces.append('<%s>%s' % ('tabs', atts['tabs']))
+        skipped = frozenset([
+            'name', 'style-num', 'in-table',
+            'tabs-left', 'tabs-right', 'tabs-decimal', 'tabs-center',
+            'tabs-bar', 'tabs'])
+        pieces.extend(
+            '<%s>%s' % (key, atts[key])
+            for key in sorted(atts) if key not in skipped)
+        pieces.append('\n')
+        self.__body_style_strings.append(''.join(pieces))
+
+    def __write_para_def_beg(self):
+        """
+        Requires:
+            nothing
+        Returns:
+            nothing
+        Logic:
+            Work out the style number first. Then write the markers used
+            for later parsing (in a table or not, left indent, list id or
+            no list, style name, start marker), followed by the opening
+            paragraph definition tag. The tag has the name and the style
+            number first, then the tabs (if there are any), then the
+            remaining attributes in sorted order. After the tag come the
+            inner start marker and, if present, the font and caps markers.
+        """
+        self.__get_num_of_style()
+        atts = self.__att_val_dict
+        write = self.__write_obj.write
+        if atts.get('in-table'):
+            write('mi<mk<in-table__\n')
+        else:
+            write('mi<mk<not-in-tbl\n')
+        left_indent = atts.get('left-indent')
+        if left_indent:
+            write('mi<mk<left_inden<%s\n' % left_indent)
+        list_id = atts.get('list-id')
+        if list_id:
+            write('mi<mk<list-id___<%s\n' % list_id)
+        else:
+            write('mi<mk<no-list___\n')
+        write('mi<mk<style-name<%s\n' % atts['name'])
+        write(self.__start_marker)
+        # the tag itself, built piece by piece on one output line
+        write('mi<tg<open-att__<paragraph-definition')
+        write('<name>%s' % atts['name'])
+        write('<style-number>%s' % atts['style-num'])
+        if atts['tabs'] != '':
+            write('<%s>%s' % ('tabs', atts['tabs']))
+        skipped = frozenset([
+            'name', 'style-num', 'in-table',
+            'tabs-left', 'tabs-right', 'tabs-decimal', 'tabs-center',
+            'tabs-bar', 'tabs'])
+        for att_name in sorted(atts):
+            if att_name in skipped:
+                continue
+            write('<%s>%s' % (att_name, atts[att_name]))
+        write('\n')
+        write(self.__start2_marker)
+        if 'font-style' in atts:
+            write('mi<mk<font______<%s\n' % atts['font-style'])
+        if 'caps' in atts:
+            write('mi<mk<caps______<%s\n' % atts['caps'])
+
+    def __empty_table_element_func(self, line):
+        """
+        Requires:
+            line --cell or row token
+        Returns:
+            nothing
+        Logic:
+            A cell or row was found before any paragraph started. Write the
+            in-table marker and the line, and change the state to
+            after_para_def.
+        """
+        for chunk in ('mi<mk<in-table__\n', line):
+            self.__write_obj.write(chunk)
+        self.__state = 'after_para_def'
+
+    def __reset_dict(self):
+        """
+        Requires:
+            nothing
+        Returns:
+            nothing
+        Logic:
+            Each new paragraph definition starts from the same defaults:
+            the name 'Normal', the default font, empty tab attributes, and
+            'left' as the pending tab type. Everything else is removed from
+            the dictionary.
+        """
+        atts = self.__att_val_dict
+        atts.clear()
+        atts['name'] = 'Normal'
+        atts['font-style'] = self.__default_font
+        self.__tab_type = 'left'
+        for tab_key in ('tabs-left', 'tabs-right', 'tabs-center',
+                        'tabs-decimal', 'tabs-bar', 'tabs'):
+            atts[tab_key] = ''
+
+    def make_paragraph_def(self):
+        """
+        Requires:
+            nothing
+        Returns:
+            the list of body style strings collected while parsing (the
+            original file is replaced by the processed output)
+        Logic:
+            Read one line in at a time. Determine what action to take based on
+            the state. When the whole file has been processed, optionally
+            keep a debugging copy of the result, put the result in place of
+            the original file and remove the temporary file.
+        """
+        self.__initiate_values()
+        # the with statements close both files even when a handler raises
+        with open_for_read(self.__file) as read_obj:
+            with open_for_write(self.__write_to) as self.__write_obj:
+                for line in read_obj:
+                    self.__token_info = line[:16]
+                    action = self.__state_dict.get(self.__state)
+                    if action is None:
+                        sys.stderr.write(
+                            'no matching state in module paragraph_def.py\n')
+                        sys.stderr.write(self.__state + '\n')
+                        # there is nothing to call; keep the line so that no
+                        # content is lost
+                        self.__write_obj.write(line)
+                        continue
+                    action(line)
+        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
+        if self.__copy:
+            copy_obj.copy_file(self.__write_to, "paragraphs_def.data")
+        copy_obj.rename(self.__write_to, self.__file)
+        os.remove(self.__write_to)
+        return self.__body_style_strings
--- a/ebook_converter/ebooks/rtf2xml/paragraphs.py
+++ b/ebook_converter/ebooks/rtf2xml/paragraphs.py
@@ -0,0 +1,263 @@
+from __future__ import unicode_literals, absolute_import, print_function, division
+#########################################################################
+#                                                                       #
+#                                                                       #
+#   copyright 2002 Paul Henry Tremblay                                  #
+#                                                                       #
+#   This program is distributed in the hope that it will be useful,     #
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of      #
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU    #
+#   General Public License for more details.                            #
+#                                                                       #
+#                                                                       #
+#########################################################################
+import sys, os
+
+from calibre.ebooks.rtf2xml import copy
+from calibre.ptempfile import better_mktemp
+from . import open_for_read, open_for_write
+
+
+class Paragraphs:
+    """
+    =================
+    Purpose
+    =================
+    Write paragraph tags for a tokenized file. (This module won't be of any
+    use to you unless you use it as part of the other modules.)
+    -------------
+    Method
+    -------------
+    RTF does not tell you when a paragraph begins. It only tells you when the
+    paragraph ends.
+    In order to make paragraphs out of this limited info, the parser starts in the
+    body of the documents and assumes it is not in a paragraph. It looks for clues
+    to begin a paragraph. Text starts a paragraph; so does an inline field or
+    list-text. If an end of paragraph marker (\\par) is found, then this indicates
+    a blank paragraph.
+    Once a paragraph is found, the state changes to 'paragraph.' In this state,
+    clues are looked to for the end of a paragraph. The end of a paragraph marker
+    (\\par) marks the end of a paragraph. So does the end of a footnote or heading;
+    the end of a field-block; the end of a cell; and the beginning or end of a
+    section. A paragraph definition (\\pard) inside a paragraph does not end
+    it; it is replaced by a 'bogus-pard' marker.
+    """
+
+    def __init__(self,
+            in_file,
+            bug_handler,
+            copy=None,
+            write_empty_para=1,
+            run_level=1,
+            ):
+        """
+        Required:
+            'in_file'--file to parse
+            'bug_handler'--passed on to the copy object
+        Optional:
+            'copy'-- whether to make a copy of result for debugging
+            'write_empty_para'--whether to write a tag for an empty
+            paragraph (default is 1)
+            'run_level'--stored, but not used by this class
+        Returns:
+            nothing
+            """
+        self.__file = in_file
+        self.__bug_handler = bug_handler
+        self.__copy = copy
+        self.__write_empty_para = write_empty_para
+        self.__run_level = run_level
+        # the output is written to a temporary file first
+        self.__write_to = better_mktemp()
+
+    def __initiate_values(self):
+        """
+        Initiate all values: the starting state, the markers, and the
+        dictionaries that map a state or a token to the method handling it.
+        """
+        self.__state = 'before_body'
+        self.__start_marker =  'mi<mk<para-start\n'  # outside para tags
+        self.__start2_marker = 'mi<mk<par-start_\n'  # inside para tags
+        self.__end2_marker =   'mi<mk<par-end___\n'  # inside para tags
+        self.__end_marker =    'mi<mk<para-end__\n'  # outside para tags
+        self.__state_dict = {
+        'before_body'       : self.__before_body_func,
+        'not_paragraph'     : self.__not_paragraph_func,
+        'paragraph'         : self.__paragraph_func,
+        }
+        # tokens that matter while inside a paragraph
+        self.__paragraph_dict = {
+        'cw<pf<par-end___'      : self.__close_para_func,   # end of paragraph
+        'mi<mk<headi_-end'      : self.__close_para_func,   # end of header or footer
+        # 'cw<pf<par-def___'      : self.__close_para_func,   # paragraph definition
+        # 'mi<mk<fld-bk-end'      : self.__close_para_func,   # end of field-block
+        'mi<mk<fldbk-end_'      : self.__close_para_func,   # end of field-block
+        'mi<mk<body-close'      : self.__close_para_func,   # end of body
+        'mi<mk<sect-close'      : self.__close_para_func,   # end of section
+        'mi<mk<sect-start'      : self.__close_para_func,   # start of section
+        'mi<mk<foot___clo'      : self.__close_para_func,   # end of footnote
+        'cw<tb<cell______'      : self.__close_para_func,   # end of cell
+        'mi<mk<par-in-fld'      : self.__close_para_func,   # start of block field
+        'cw<pf<par-def___'      : self.__bogus_para__def_func,   # paragraph definition
+        }
+        # tokens that matter while outside a paragraph
+        self.__not_paragraph_dict = {
+        'tx<nu<__________'      : self.__start_para_func,
+        'tx<hx<__________'      : self.__start_para_func,
+        'tx<ut<__________'      : self.__start_para_func,
+        'tx<mc<__________'      : self.__start_para_func,
+        'mi<mk<inline-fld'      : self.__start_para_func,
+        'mi<mk<para-beg__'      : self.__start_para_func,
+        'cw<pf<par-end___'      : self.__empty_para_func,
+        'mi<mk<pict-start'      : self.__start_para_func,
+        'cw<pf<page-break'      : self.__empty_pgbk_func,    # page break
+        }
+
+    def __before_body_func(self, line):
+        """
+        Required:
+            line -- line to parse
+        Returns:
+            nothing
+        Logic:
+            This function handles all the lines before the start of the body.
+            Once the body starts, the state is switched to 'not_paragraph'.
+            Every line, including the one opening the body, is written out.
+        """
+        if self.__token_info == 'mi<mk<body-open_':
+            self.__state = 'not_paragraph'
+        self.__write_obj.write(line)
+
+    def __not_paragraph_func(self, line):
+        """
+        Required:
+            line --line to parse
+        Returns:
+            nothing
+        Logic:
+            This function handles all lines that are outside of the paragraph.
+            It looks for clues that start a paragraph, and when found,
+            switches states and writes the start tags. The line itself is
+            always written out, after any tags.
+        """
+        action = self.__not_paragraph_dict.get(self.__token_info)
+        if action:
+            action(line)
+        self.__write_obj.write(line)
+
+    def __paragraph_func(self, line):
+        """
+        Required:
+            line --line to parse
+        Returns:
+            nothing
+        Logic:
+            This function handles all the lines that are in the paragraph. It
+            looks for clues to the end of the paragraph. When a clue is found,
+            it calls on another method to write the end of the tag and change
+            the state. When a handler is called, the handler decides if the
+            line is written out.
+        """
+        action = self.__paragraph_dict.get(self.__token_info)
+        if action:
+            action(line)
+        else:
+            self.__write_obj.write(line)
+
+    def __start_para_func(self, line):
+        """
+        Requires:
+            line --line to parse
+        Returns:
+            nothing
+        Logic:
+            This function writes the beginning tags for a paragraph and
+            changes the state to paragraph.
+        """
+        self.__write_obj.write(self.__start_marker)  # marker for later parsing
+        self.__write_obj.write(
+        'mi<tg<open______<para\n'
+        )
+        self.__write_obj.write(self.__start2_marker)
+        self.__state = 'paragraph'
+
+    def __empty_para_func(self, line):
+        """
+        Requires:
+            line --line to parse
+        Returns:
+            nothing
+        Logic:
+            This function writes the empty tags for a paragraph.
+            It does not do anything if self.__write_empty_para is 0.
+        """
+        if self.__write_empty_para:
+            self.__write_obj.write(self.__start_marker)  # marker for later parsing
+            self.__write_obj.write(
+            'mi<tg<empty_____<para\n'
+            )
+            self.__write_obj.write(self.__end_marker)   # marker for later parsing
+
+    def __empty_pgbk_func(self, line):
+        """
+        Requires:
+            line --line to parse
+        Returns:
+            nothing
+        Logic:
+            This function writes the empty tags for a page break.
+        """
+        self.__write_obj.write(
+        'mi<tg<empty_____<page-break\n'
+        )
+
+    def __close_para_func(self, line):
+        """
+        Requires:
+            line --line to parse
+        Returns:
+            nothing
+        Logic:
+            This function writes the end tags for a paragraph, then the
+            line that ended the paragraph, and changes the state to
+            not_paragraph.
+        """
+        self.__write_obj.write(self.__end2_marker)  # marker for later parser
+        self.__write_obj.write(
+        'mi<tg<close_____<para\n'
+        )
+        self.__write_obj.write(self.__end_marker)  # marker for later parser
+        self.__write_obj.write(line)
+        self.__state = 'not_paragraph'
+
+    def __bogus_para__def_func(self, line):
+        """
+        Requires:
+            line --line to parse
+        Returns:
+            nothing
+        Logic:
+            if a \\pard occurs in a paragraph, I want to ignore it. (I believe)
+            A marker is written in place of the line.
+        """
+        self.__write_obj.write('mi<mk<bogus-pard\n')
+
+    def make_paragraphs(self):
+        """
+        Requires:
+            nothing
+        Returns:
+            nothing (changes the original file)
+        Logic:
+            Read one line in at a time. Determine what action to take based on
+            the state. If the state is before the body, look for the
+            beginning of the body.
+            When the body is found, change the state to 'not_paragraph'. The
+            only other state is 'paragraph'.
+            A state without a handler is reported on stderr and the line is
+            written out unchanged.
+        """
+        self.__initiate_values()
+        with open_for_read(self.__file) as read_obj:
+            with open_for_write(self.__write_to) as self.__write_obj:
+                for line in read_obj:
+                    self.__token_info = line[:16]
+                    action = self.__state_dict.get(self.__state)
+                    if action is None:
+                        # the report is best effort only; a failing stderr
+                        # must not stop the conversion
+                        try:
+                            sys.stderr.write('no matching state in module paragraphs.py\n')
+                            sys.stderr.write(self.__state + '\n')
+                        except Exception:
+                            pass
+                        # there is nothing to call; keep the line so that no
+                        # content is lost
+                        self.__write_obj.write(line)
+                        continue
+                    action(line)
+        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
+        if self.__copy:
+            copy_obj.copy_file(self.__write_to, "paragraphs.data")
+        copy_obj.rename(self.__write_to, self.__file)
+        os.remove(self.__write_to)
--- a/ebook_converter/ebooks/rtf2xml/pict.py
+++ b/ebook_converter/ebooks/rtf2xml/pict.py
@@ -0,0 +1,182 @@
+from __future__ import absolute_import, division, print_function, unicode_literals
+#########################################################################
+#                                                                       #
+#                                                                       #
+#   copyright 2002 Paul Henry Tremblay                                  #
+#                                                                       #
+#   This program is distributed in the hope that it will be useful,     #
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of      #
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU    #
+#   General Public License for more details.                            #
+#                                                                       #
+#                                                                       #
+#########################################################################
+import sys, os
+
+from calibre.ebooks.rtf2xml import copy
+from calibre.ptempfile import better_mktemp
+from polyglot.builtins import unicode_type
+
+from . import open_for_read, open_for_write
+
+
+class Pict:
+    """Process graphic information.
+
+    The token file is copied line by line. Each picture control word is
+    replaced by a numbered pict marker, and the bracket and text tokens that
+    belong to the picture group are written, as raw RTF, to 'picts.rtf' in a
+    '<name>_rtf_pict_dir' directory.
+    """
+    def __init__(self,
+            in_file,
+            bug_handler,
+            out_file,
+            copy=None,
+            orig_file=None,
+            run_level=1,
+        ):
+        """
+        Required:
+            in_file --the token file to process; replaced by the result
+            bug_handler --exception class raised on internal errors
+            out_file --final output (path, or object with a 'name'); when
+            set, the picture directory is created next to it
+        Optional:
+            copy --whether to make copies of the results for debugging
+            orig_file --the original file (path, or object with a 'name');
+            its base name is used to name the picture directory
+            run_level --values above 1 report progress on stderr
+        Returns:
+            nothing
+        """
+        self.__file = in_file
+        self.__bug_handler = bug_handler
+        self.__copy = copy
+        self.__run_level = run_level
+        self.__write_to = better_mktemp()
+        self.__bracket_count = 0
+        # bracket numbers start as 0 and become the 4-character strings read
+        # from the bracket tokens
+        self.__ob_count = 0
+        self.__cb_count = 0
+        # number of the open bracket that started the current picture group
+        self.__pict_br_count = 0
+        self.__pict_count = 0
+        self.__in_pict = False
+        self.__already_found_pict = False
+        self.__orig_file = orig_file
+        self.__initiate_pict_dict()
+        self.__out_file = out_file
+
+    def __initiate_pict_dict(self):
+        """Map the token types copied into the pict file to their handlers."""
+        self.__pict_dict = {
+        'ob<nu<open-brack'    :   self.__open_br_func,
+        'cb<nu<clos-brack'    :   self.__close_br_func,
+        'tx<nu<__________'    :   self.__text_func,
+        }
+
+    def __open_br_func(self, line):
+        """Return the RTF text for an open bracket token."""
+        return "{\n"
+
+    def __close_br_func(self, line):
+        """Return the RTF text for a close bracket token."""
+        return "}\n"
+
+    def __text_func(self, line):
+        """Return the text carried by a text token (newline included)."""
+        # tx<nu<__________<true text
+        return line[17:]
+
+    def __make_dir(self):
+        """ Make a directory to put the image data in.
+
+        The directory is '<base name of orig_file>_rtf_pict_dir', placed
+        next to out_file when that is set and next to orig_file otherwise.
+        If it already exists, the files in it are removed (best effort).
+        Raises the bug handler exception if the directory cannot be made.
+        """
+        # both files may be paths or file-like objects with a 'name'
+        orig_name = getattr(self.__orig_file, 'name', self.__orig_file)
+        base_name = os.path.splitext(os.path.basename(orig_name))[0]
+        if self.__out_file:
+            dir_name = os.path.dirname(getattr(self.__out_file, 'name',
+                self.__out_file))
+        else:
+            dir_name = os.path.dirname(orig_name)
+        self.__dir_name = base_name + "_rtf_pict_dir/"
+        self.__dir_name = os.path.join(dir_name, self.__dir_name)
+        if not os.path.isdir(self.__dir_name):
+            try:
+                os.mkdir(self.__dir_name)
+            except OSError as err:
+                # pass the message on; a bare raise of the handler lost it
+                msg = "Couldn't make directory '%s':\n%s\n" % (
+                    self.__dir_name, unicode_type(err))
+                raise self.__bug_handler(msg)
+        else:
+            if self.__run_level > 1:
+                sys.stderr.write('Removing files from old pict directory...\n')
+            all_files = os.listdir(self.__dir_name)
+            for the_file in all_files:
+                the_file = os.path.join(self.__dir_name, the_file)
+                try:
+                    os.remove(the_file)
+                except OSError:
+                    # best effort: leave what cannot be removed
+                    pass
+            if self.__run_level > 1:
+                sys.stderr.write('Files removed.\n')
+
+    def __create_pict_file(self):
+        """Create a file for all the pict data to be written to.
+        """
+        self.__pict_file = os.path.join(self.__dir_name, 'picts.rtf')
+        self.__write_pic_obj = open_for_write(self.__pict_file, append=True)
+
+    def __in_pict_func(self, line):
+        """Handle one line while inside a picture group.
+
+        Returns True when the line should be written to the main output:
+        this happens on the line where the last close bracket number equals
+        the number of the bracket that opened the picture group. Otherwise
+        bracket and text tokens are copied to the pict file and False is
+        returned.
+        """
+        if self.__cb_count == self.__pict_br_count:
+            self.__in_pict = False
+            self.__write_pic_obj.write("}\n")
+            return True
+        else:
+            action = self.__pict_dict.get(self.__token_info)
+            if action:
+                self.__write_pic_obj.write(action(line))
+            return False
+
+    def __default(self, line, write_obj):
+        """Determine if each token marks the beginning of pict data.
+        If it does, create a new file to write data to (if that file
+        has not already been created.) Set the self.__in_pict flag to true.
+        Returns False for a picture token (the markers written here replace
+        the line) and True for any other line, which the caller writes out.
+        """
+        if self.__token_info == 'cw<gr<picture___':
+            self.__pict_count += 1
+            write_obj.write('mi<mk<pict-start\n')
+            write_obj.write('mi<tg<empty-att_<pict<num>%03d\n' % self.__pict_count)
+            write_obj.write('mi<mk<pict-end__\n')
+            if not self.__already_found_pict:
+                self.__create_pict_file()
+                self.__already_found_pict=True
+                self.__print_rtf_header()
+            self.__in_pict = True
+            # remember which bracket opened this group and forget any
+            # earlier close bracket number
+            self.__pict_br_count = self.__ob_count
+            self.__cb_count = 0
+            self.__write_pic_obj.write("{\\pict\n")
+            return False
+        return True
+
+    def __print_rtf_header(self):
+        """Print to pict file the necessary RTF data for the file to be
+        recognized as an RTF file.
+        """
+        self.__write_pic_obj.write("{\\rtf1 \n{\\fonttbl\\f0\\null;} \n")
+        self.__write_pic_obj.write("{\\colortbl\\red255\\green255\\blue255;} \n\\pard \n")
+
+    def process_pict(self):
+        """Move picture data out of the token file.
+
+        Makes the picture directory, copies the token file through a
+        temporary file while diverting picture groups to the pict file, then
+        renames the temporary file over the input. The picture directory is
+        removed again (if possible) when no picture was found.
+        """
+        self.__make_dir()
+        try:
+            with open_for_read(self.__file) as read_obj:
+                with open_for_write(self.__write_to) as write_obj:
+                    for line in read_obj:
+                        self.__token_info = line[:16]
+                        # the bracket number is the last 4 characters
+                        if self.__token_info == 'ob<nu<open-brack':
+                            self.__ob_count = line[-5:-1]
+                        if self.__token_info == 'cb<nu<clos-brack':
+                            self.__cb_count = line[-5:-1]
+                        if not self.__in_pict:
+                            to_print = self.__default(line, write_obj)
+                        else:
+                            to_print = self.__in_pict_func(line)
+                        if to_print:
+                            write_obj.write(line)
+                    if self.__already_found_pict:
+                        # closes the group opened by the RTF header
+                        self.__write_pic_obj.write("}\n")
+        finally:
+            # the pict file stays open across lines; close it even when
+            # processing fails part way through
+            if self.__already_found_pict:
+                self.__write_pic_obj.close()
+        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
+        if self.__copy:
+            copy_obj.copy_file(self.__write_to, "pict.data")
+            # there is a pict file only if a picture was found
+            if self.__already_found_pict:
+                try:
+                    copy_obj.copy_file(self.__pict_file, "pict.rtf")
+                except Exception:
+                    # the debugging copy is best effort
+                    pass
+        copy_obj.rename(self.__write_to, self.__file)
+        os.remove(self.__write_to)
+        if self.__pict_count == 0:
+            try:
+                os.rmdir(self.__dir_name)
+            except OSError:
+                pass
--- a/ebook_converter/ebooks/rtf2xml/preamble_div.py
+++ b/ebook_converter/ebooks/rtf2xml/preamble_div.py
@@ -0,0 +1,591 @@
+from __future__ import unicode_literals, absolute_import, print_function, division
+from __future__ import print_function
+#########################################################################
+#                                                                       #
+#                                                                       #
+#   copyright 2002 Paul Henry Tremblay                                  #
+#                                                                       #
+#   This program is distributed in the hope that it will be useful,     #
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of      #
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU    #
+#   General Public License for more details.                            #
+#                                                                       #
+#                                                                       #
+#########################################################################
+import sys, os
+from calibre.ebooks.rtf2xml import copy, override_table, list_table
+from calibre.ptempfile import better_mktemp
+from . import open_for_read, open_for_write
+
+
+class PreambleDiv:
+    """
+    Break the preamble into divisions.
+
+    The token file is read line by line. While in the preamble, the font
+    table, color table, style sheet, list table, override table, revision
+    table and document info are collected into strings. When the body is
+    reached, these strings are written out, wrapped in marker lines, and the
+    remaining lines are copied unchanged.
+    """
+
+    def __init__(self, in_file,
+            bug_handler,
+            copy=None,
+            no_namespace=None,
+            run_level=1,
+            ):
+        """
+        Required:
+            in_file --the token file to parse; replaced by the result
+            bug_handler --exception class raised on internal errors
+        Optional:
+            copy --whether to make a copy of result for debugging
+            no_namespace --if true, the doc tag is written without xmlns
+            run_level --passed on to the list and override table parsers
+        Returns:
+            nothing
+        """
+        self.__file = in_file
+        self.__bug_handler = bug_handler
+        self.__copy = copy
+        self.__no_namespace = no_namespace
+        self.__write_to = better_mktemp()
+        self.__run_level = run_level
+
+    def __initiate_values(self):
+        """
+        Set values, including those for the dictionary.
+        """
+        self.__all_lists = {}
+        self.__page = {
+        'margin-top'    : 72,
+        'margin-bottom' : 72,
+        'margin-left'   : 90,
+        'margin-right'  : 90,
+        'gutter'        : 0,
+        }
+        # bracket numbers are kept as the 4-character strings of the tokens
+        self.__cb_count = ''
+        self.__ob_count = ''
+        self.__state = 'preamble'
+        # used by the 'ignore' state: the bracket number that ends it and
+        # the state to go back to
+        self.__previous_state = 'preamble'
+        self.__ignore_num = ''
+        self.__rtf_final = ''
+        self.__close_group_count = ''
+        self.__found_font_table = 0
+        self.__list_table_final = ''
+        self.__override_table_final = ''
+        self.__revision_table_final = ''
+        self.__doc_info_table_final = ''
+        # handlers for states (plain names) and, in the preamble state,
+        # for token types (16-character token info)
+        self.__state_dict = {
+        'default'           :   self.__default_func,
+        'rtf_header'        :   self.__rtf_head_func,
+        'preamble'          :   self.__preamble_func,
+        'font_table'        :   self.__font_table_func,
+        'color_table'       :   self.__color_table_func,
+        'style_sheet'       :   self.__style_sheet_func,
+        'list_table'        :   self.__list_table_func,
+        'override_table'    :   self.__override_table_func,
+        'revision_table'    :   self.__revision_table_func,
+        'doc_info'          :   self.__doc_info_func,
+        'body'              :   self.__body_func,
+        'ignore'            :   self.__ignore_func,
+        'cw<ri<rtf_______'  :   self.__found_rtf_head_func,
+        'cw<pf<par-def___'  :   self.__para_def_func,
+        'tx<nu<__________'  :   self.__text_func,
+        'cw<tb<row-def___'  :   self.__row_def_func,
+        'cw<sc<section___'  :   self.__new_section_func,
+        'cw<sc<sect-defin'  :   self.__new_section_func,
+        'cw<it<font-table'  :   self.__found_font_table_func,
+        'cw<it<colr-table'  :   self.__found_color_table_func,
+        'cw<ss<style-shet'  :   self.__found_style_sheet_func,
+        'cw<it<listtable_'  :   self.__found_list_table_func,
+        'cw<it<lovr-table'  :   self.__found_override_table_func,
+        'cw<it<revi-table'  :   self.__found_revision_table_func,
+        'cw<di<doc-info__'  :   self.__found_doc_info_func,
+        'cw<pa<margin-lef'  :   self.__margin_func,
+        'cw<pa<margin-rig'  :   self.__margin_func,
+        'cw<pa<margin-top'  :   self.__margin_func,
+        'cw<pa<margin-bot'  :   self.__margin_func,
+        'cw<pa<gutter____'  :   self.__margin_func,
+        'cw<pa<paper-widt'  :   self.__margin_func,
+        'cw<pa<paper-hght'  :   self.__margin_func,
+        # 'cw<tb<columns___'  :   self.__section_func,
+        }
+        self.__margin_dict = {
+        'margin-lef'        :   'margin-left',
+        'margin-rig'        :   'margin-right',
+        'margin-top'        :   'margin-top',
+        'margin-bot'        :   'margin-bottom',
+        'gutter____'        :   'gutter',
+        'paper-widt'        :   'paper-width',
+        'paper-hght'        :   'paper-height',
+        }
+        self.__translate_sec = {
+        'columns___'        :   'column',
+        }
+        self.__section = {}
+        self.__color_table_final = ''
+        self.__style_sheet_final = ''
+        self.__individual_font = 0
+        self.__old_font = 0
+        self.__ob_group = 0  # depth of group
+        # an empty string (like the other tables) means "not found yet"
+        self.__font_table_final = ''
+        self.__list_table_obj = list_table.ListTable(
+                run_level=self.__run_level,
+                bug_handler=self.__bug_handler,
+                )
+
+    def __ignore_func(self, line):
+        """
+        Ignore all  lines, until the bracket is found that marks the end of
+        the group. Then go back to the state that was active before.
+        """
+        if self.__ignore_num == self.__cb_count:
+            self.__state = self.__previous_state
+
+    def __found_rtf_head_func(self, line):
+        """Switch to the 'rtf_header' state."""
+        self.__state = 'rtf_header'
+
+    def __rtf_head_func(self, line):
+        """
+        Collect the lines of the RTF header. The header ends when the open
+        bracket numbered '0002' is seen (back to the preamble), or when text
+        or a paragraph definition is seen; in that case the body has begun,
+        so a default font table is made and the preamble is written.
+        """
+        if self.__ob_count == '0002':
+            self.__rtf_final = (
+            'mi<mk<rtfhed-beg\n' +
+            self.__rtf_final +
+            'mi<mk<rtfhed-end\n'
+            )
+            self.__state = 'preamble'
+        elif self.__token_info == 'tx<nu<__________' or \
+            self.__token_info == 'cw<pf<par-def___':
+            self.__state = 'body'
+            self.__rtf_final = (
+            'mi<mk<rtfhed-beg\n' +
+            self.__rtf_final +
+            'mi<mk<rtfhed-end\n'
+            )
+            self.__make_default_font_table()
+            self.__write_preamble()
+            self.__write_obj.write(line)
+        else:
+            self.__rtf_final = self.__rtf_final + line
+
+    def __make_default_font_table(self):
+        """
+        If no font table is found, make a string for a default one.
+        """
+        self.__font_table_final = 'mi<tg<open______<font-table\n'
+        self.__font_table_final += 'mi<mk<fonttb-beg\n'
+        self.__font_table_final += 'mi<mk<fontit-beg\n'
+        self.__font_table_final += 'cw<ci<font-style<nu<0\n'
+        self.__font_table_final += 'tx<nu<__________<Times;\n'
+        self.__font_table_final += 'mi<mk<fontit-end\n'
+        self.__font_table_final +=  'mi<mk<fonttb-end\n'
+        self.__font_table_final += 'mi<tg<close_____<font-table\n'
+
+    def __make_default_color_table(self):
+        """
+        If no color table is found, write a string for a default one
+        """
+        self.__color_table_final = 'mi<tg<open______<color-table\n'
+        self.__color_table_final += 'mi<mk<clrtbl-beg\n'
+        self.__color_table_final += 'cw<ci<red_______<nu<00\n'
+        self.__color_table_final += 'cw<ci<green_____<nu<00\n'
+        self.__color_table_final += 'cw<ci<blue______<en<00\n'
+        self.__color_table_final += 'mi<mk<clrtbl-end\n'
+        self.__color_table_final += 'mi<tg<close_____<color-table\n'
+
+    def __make_default_style_table(self):
+        """
+        If no style sheet is found, make a string for a default one
+        """
+        self.__style_sheet_final = """mi<tg<open______<style-table
+mi<mk<styles-beg
+mi<mk<stylei-beg
+cw<ci<font-style<nu<0
+tx<nu<__________<Normal;
+mi<mk<stylei-end
+mi<mk<stylei-beg
+cw<ss<char-style<nu<0
+tx<nu<__________<Default Paragraph Font;
+mi<mk<stylei-end
+mi<mk<styles-end
+mi<tg<close_____<style-table
+"""
+
+    def __found_font_table_func(self, line):
+        """
+        Start collecting the font table. Only the first font table is kept;
+        a later one is skipped by ignoring everything up to the close
+        bracket of its group.
+        """
+        if self.__found_font_table:
+            # The ignore state needs to know where it ends and where to go
+            # back to. This handler is only reached from the preamble state.
+            self.__previous_state = 'preamble'
+            self.__ignore_num = self.__ob_count
+            self.__state = 'ignore'
+        else:
+            self.__state = 'font_table'
+            self.__font_table_final = ''
+        self.__close_group_count = self.__ob_count
+        self.__cb_count = 0
+        self.__found_font_table = 1
+
+    def __font_table_func(self, line):
+        """
+        Keep adding to the self.__font_table_final string until end of group
+        found. If a bracket is found, check that it is only one bracket deep.
+        If it is, then set the marker for an individual font. If it is not,
+        then ignore all data in this group.
+        A font-style token outside of any bracket (cw<ci<font-style<nu<0)
+        starts an old-style font entry, which ends at the text containing
+        a ';'.
+        """
+        if self.__cb_count == self.__close_group_count:
+            self.__state = 'preamble'
+            self.__font_table_final = 'mi<tg<open______<font-table\n' + \
+            'mi<mk<fonttb-beg\n' + self.__font_table_final
+            self.__font_table_final += \
+            'mi<mk<fonttb-end\n' + 'mi<tg<close_____<font-table\n'
+        elif self.__token_info == 'ob<nu<open-brack':
+            if int(self.__ob_count) == int(self.__close_group_count) + 1:
+                self.__font_table_final +=  \
+                'mi<mk<fontit-beg\n'
+                self.__individual_font = 1
+            else:
+                # ignore
+                self.__previous_state = 'font_table'
+                self.__state = 'ignore'
+                self.__ignore_num = self.__ob_count
+        elif self.__token_info == 'cb<nu<clos-brack':
+            if int(self.__cb_count) == int(self.__close_group_count) + 1:
+                self.__individual_font = 0
+                self.__font_table_final +=  \
+                'mi<mk<fontit-end\n'
+        elif self.__individual_font:
+            if self.__old_font and self.__token_info == 'tx<nu<__________':
+                if ';' in line:
+                    self.__font_table_final +=  line
+                    self.__font_table_final +=   'mi<mk<fontit-end\n'
+                    self.__individual_font = 0
+            else:
+                self.__font_table_final +=  line
+        elif self.__token_info == 'cw<ci<font-style':
+            self.__old_font = 1
+            self.__individual_font = 1
+            self.__font_table_final +=   'mi<mk<fontit-beg\n'
+            self.__font_table_final +=  line
+
+    def __old_font_func(self, line):
+        """
+        Required:
+            line --line to parse
+        Returns:
+            nothing
+        Logic:
+            Placeholder; does nothing. Meant for older forms of RTF:
+            \\f3\\fswiss\\fcharset77 Helvetica-Oblique;\\f4\\fnil\\fcharset77 Geneva;}
+            Note how each font is not divided by a bracket
+        """
+
+    def __found_color_table_func(self, line):
+        """
+        all functions that start with __found operate the same. They set the
+        state, initiate a string, determine the self.__close_group_count, and
+        set self.__cb_count to zero.
+        """
+        self.__state = 'color_table'
+        self.__color_table_final = ''
+        self.__close_group_count = self.__ob_count
+        self.__cb_count = 0
+
+    def __color_table_func(self, line):
+        """
+        Add lines to the color table string until the group is closed, then
+        wrap the string in its markers and go back to the preamble.
+        """
+        if int(self.__cb_count) == int(self.__close_group_count):
+            self.__state = 'preamble'
+            self.__color_table_final = 'mi<tg<open______<color-table\n' + \
+            'mi<mk<clrtbl-beg\n' + self.__color_table_final
+            self.__color_table_final += \
+            'mi<mk<clrtbl-end\n' + 'mi<tg<close_____<color-table\n'
+        else:
+            self.__color_table_final += line
+
+    def __found_style_sheet_func(self, line):
+        """Start collecting the style sheet (see __found_color_table_func)."""
+        self.__state = 'style_sheet'
+        self.__style_sheet_final = ''
+        self.__close_group_count = self.__ob_count
+        self.__cb_count = 0
+
+    def __style_sheet_func(self, line):
+        """
+        Same logic as the  font_table_func, except that groups deeper than
+        one bracket are not ignored: only their bracket tokens are dropped.
+        """
+        if self.__cb_count == self.__close_group_count:
+            self.__state = 'preamble'
+            self.__style_sheet_final = 'mi<tg<open______<style-table\n' + \
+            'mi<mk<styles-beg\n' + self.__style_sheet_final
+            self.__style_sheet_final += \
+            'mi<mk<styles-end\n' + 'mi<tg<close_____<style-table\n'
+        elif self.__token_info == 'ob<nu<open-brack':
+            if int(self.__ob_count) == int(self.__close_group_count) + 1:
+                self.__style_sheet_final +=  \
+                'mi<mk<stylei-beg\n'
+        elif self.__token_info == 'cb<nu<clos-brack':
+            if int(self.__cb_count) == int(self.__close_group_count) + 1:
+                self.__style_sheet_final +=  \
+                'mi<mk<stylei-end\n'
+        else:
+            self.__style_sheet_final +=  line
+
+    def __found_list_table_func(self, line):
+        """Start collecting the list table (see __found_color_table_func)."""
+        self.__state = 'list_table'
+        self.__list_table_final = ''
+        self.__close_group_count = self.__ob_count
+        self.__cb_count = 0
+
+    def __list_table_func(self, line):
+        """
+        Add lines to the list table string until the group is closed, then
+        hand the string to the list table parser, which returns the final
+        string and the lists it found.
+        """
+        if self.__cb_count == self.__close_group_count:
+            self.__state = 'preamble'
+            self.__list_table_final, self.__all_lists =\
+                self.__list_table_obj.parse_list_table(
+                self.__list_table_final)
+        elif self.__token_info == '':
+            # an empty line carries nothing to keep
+            pass
+        else:
+            self.__list_table_final += line
+
+    def __found_override_table_func(self, line):
+        """
+        Start collecting the override table. The parser for it is created
+        here, because it needs the lists found so far.
+        """
+        self.__override_table_obj = override_table.OverrideTable(
+            run_level=self.__run_level,
+            list_of_lists=self.__all_lists,
+            )
+        self.__state = 'override_table'
+        self.__override_table_final = ''
+        self.__close_group_count = self.__ob_count
+        self.__cb_count = 0
+        # cw<it<lovr-table
+
+    def __override_table_func(self, line):
+        """
+        Add lines to the override table string until the group is closed,
+        then hand the string to the override table parser.
+        """
+        if self.__cb_count == self.__close_group_count:
+            self.__state = 'preamble'
+            self.__override_table_final, self.__all_lists =\
+                self.__override_table_obj.parse_override_table(self.__override_table_final)
+        elif self.__token_info == '':
+            pass
+        else:
+            self.__override_table_final += line
+
+    def __found_revision_table_func(self, line):
+        """Start collecting the revision table (see __found_color_table_func)."""
+        self.__state = 'revision_table'
+        self.__revision_table_final = ''
+        self.__close_group_count = self.__ob_count
+        self.__cb_count = 0
+
+    def __revision_table_func(self, line):
+        """
+        Add lines to the revision table string until the group is closed,
+        then wrap the string in its markers and go back to the preamble.
+        """
+        if int(self.__cb_count) == int(self.__close_group_count):
+            self.__state = 'preamble'
+            self.__revision_table_final = 'mi<tg<open______<revision-table\n' + \
+            'mi<mk<revtbl-beg\n' + self.__revision_table_final
+            self.__revision_table_final += \
+            'mi<mk<revtbl-end\n' + 'mi<tg<close_____<revision-table\n'
+        else:
+            self.__revision_table_final += line
+
+    def __found_doc_info_func(self, line):
+        """Start collecting the document info (see __found_color_table_func)."""
+        self.__state = 'doc_info'
+        self.__doc_info_table_final = ''
+        self.__close_group_count = self.__ob_count
+        self.__cb_count = 0
+
+    def __doc_info_func(self, line):
+        """
+        Same logic as the style_sheet_func: groups one bracket deep are
+        marked as individual items, all other lines are added as they are.
+        """
+        if self.__cb_count == self.__close_group_count:
+            self.__state = 'preamble'
+            self.__doc_info_table_final = 'mi<tg<open______<doc-information\n' + \
+            'mi<mk<doc-in-beg\n' + self.__doc_info_table_final
+            self.__doc_info_table_final += \
+            'mi<mk<doc-in-end\n' + 'mi<tg<close_____<doc-information\n'
+        elif self.__token_info == 'ob<nu<open-brack':
+            if int(self.__ob_count) == int(self.__close_group_count) + 1:
+                self.__doc_info_table_final +=  \
+                'mi<mk<docinf-beg\n'
+        elif self.__token_info == 'cb<nu<clos-brack':
+            if int(self.__cb_count) == int(self.__close_group_count) + 1:
+                self.__doc_info_table_final +=  \
+                'mi<mk<docinf-end\n'
+        else:
+            self.__doc_info_table_final +=  line
+
+    def __margin_func(self, line):
+        """
+        Handles lines that describe page info. Add the appropriate info in
+        the token to the self.__page dictionary. Example line:
+            cw<pa<margin-lef<nu<1728
+        """
+        info = line[6:16]
+        changed = self.__margin_dict.get(info)
+        if changed is None:
+            # report on stderr, as __section_func does
+            sys.stderr.write('woops!\n')
+        else:
+            # the value starts after the 20-character token prefix
+            self.__page[changed] = line[20:-1]
+
+    def __print_page_info(self):
+        """Write the page values as one empty tag with attributes."""
+        self.__write_obj.write('mi<tg<empty-att_<page-definition')
+        for key, value in self.__page.items():
+            self.__write_obj.write(
+            '<%s>%s' % (key, value)
+            )
+        self.__write_obj.write('\n')
+
+    def __print_sec_info(self):
+        """
+        Check if there is any section info. If so, print it out.
+        If not, print out an empty tag to satisfy the dtd.
+        """
+        if not self.__section:
+            self.__write_obj.write(
+            'mi<tg<open______<section-definition\n'
+                    )
+        else:
+            self.__write_obj.write(
+            'mi<tg<open-att__<section-definition')
+            for key, value in self.__section.items():
+                self.__write_obj.write(
+                '<%s>%s' %  (key, value)
+                )
+            self.__write_obj.write('\n')
+
+    def __section_func(self, line):
+        """
+        Add info pertaining to section to the self.__section dictionary, to be
+        printed out later.
+        """
+        info = self.__translate_sec.get(line[6:16])
+        if info is None:
+            sys.stderr.write('woops!\n')
+        else:
+            self.__section[info] = 'true'
+
+    def __body_func(self, line):
+        """Copy a line of the body to the output unchanged."""
+        self.__write_obj.write(line)
+
+    def __default_func(self, line):
+        """Do nothing."""
+        # either in preamble or in body
+        pass
+
+    def __para_def_func(self, line):
+        """
+        A paragraph definition marks the start of the body when the last
+        close bracket was number '0002'; the preamble is then written first.
+        The line itself is always written.
+        """
+        if self.__cb_count == '0002':
+            self.__state = 'body'
+            self.__write_preamble()
+        self.__write_obj.write(line)
+
+    def __text_func(self, line):
+        """
+        Text marks the start of the body when the last close bracket was
+        number '0002', or when no bracket has been closed at all.
+        For older RTF
+        Newer RTF should never have to use this function
+        """
+        if self.__cb_count == '':
+            cb_count = '0002'
+        else:
+            cb_count = self.__cb_count
+        if cb_count == '0002':
+            self.__state = 'body'
+            self.__write_preamble()
+        self.__write_obj.write(line)
+
+    def __row_def_func(self, line):
+        """
+        A row definition marks the start of the body when the last close
+        bracket was number '0002'. The line itself is always written.
+        """
+        if self.__cb_count == '0002':
+            self.__state = 'body'
+            self.__write_preamble()
+        self.__write_obj.write(line)
+
+    def __new_section_func(self, line):
+        """
+        This is new. The start of a section marks the end of the preamble.
+        If the last close bracket was not number '0002', a warning is
+        written to stderr and the state is left unchanged.
+        """
+        if self.__cb_count == '0002':
+            self.__state = 'body'
+            self.__write_preamble()
+        else:
+            sys.stderr.write('module is preamble_div\n')
+            sys.stderr.write('method is __new_section_func\n')
+            sys.stderr.write('bracket count should be 2?\n')
+        self.__write_obj.write(line)
+
+    def __write_preamble(self):
+        """
+        Write all the strings, which represent all the data in the preamble.
+        Defaults are made for a missing font table, color table or style
+        sheet. Write a body beginning.
+        """
+        if self.__no_namespace:
+            self.__write_obj.write(
+                'mi<tg<open______<doc\n'
+                    )
+        else:
+            self.__write_obj.write(
+                    'mi<tg<open-att__<doc<xmlns>http://rtf2xml.sourceforge.net/\n')
+        self.__write_obj.write('mi<tg<open______<preamble\n')
+        self.__write_obj.write(self.__rtf_final)
+        if not self.__color_table_final:
+            self.__make_default_color_table()
+        if not self.__font_table_final:
+            self.__make_default_font_table()
+        self.__write_obj.write(self.__font_table_final)
+        self.__write_obj.write(self.__color_table_final)
+        if not self.__style_sheet_final:
+            self.__make_default_style_table()
+        self.__write_obj.write(self.__style_sheet_final)
+        self.__write_obj.write(self.__list_table_final)
+        self.__write_obj.write(self.__override_table_final)
+        self.__write_obj.write(self.__revision_table_final)
+        self.__write_obj.write(self.__doc_info_table_final)
+        self.__print_page_info()
+        self.__write_obj.write('ob<nu<open-brack<0001\n')
+        self.__write_obj.write('ob<nu<open-brack<0002\n')
+        self.__write_obj.write('cb<nu<clos-brack<0002\n')
+        self.__write_obj.write('mi<tg<close_____<preamble\n')
+        self.__write_obj.write('mi<tg<open______<body\n')
+        self.__write_obj.write('mi<mk<body-open_\n')
+
+    def __preamble_func(self, line):
+        """
+        Check if the token info belongs to the dictionary. If so, take the
+        appropriate action. Other lines of the preamble are dropped.
+        """
+        action = self.__state_dict.get(self.__token_info)
+        if action:
+            action(line)
+
+    def make_preamble_divisions(self):
+        """
+        Requires:
+            nothing
+        Returns:
+            the lists found in the list table (and override table)
+        Raises:
+            the bug handler exception, if the current state has no handler
+        Logic:
+            Read one line in at a time. Keep track of the numbers of the
+            last open and close bracket, then hand the line to the handler
+            for the current state. The result replaces the original file.
+        """
+        self.__initiate_values()
+        with open_for_read(self.__file) as read_obj:
+            with open_for_write(self.__write_to) as self.__write_obj:
+                for line in read_obj:
+                    self.__token_info = line[:16]
+                    # the bracket number is the last 4 characters
+                    if self.__token_info == 'ob<nu<open-brack':
+                        self.__ob_count = line[-5:-1]
+                        self.__ob_group += 1
+                    if self.__token_info == 'cb<nu<clos-brack':
+                        self.__cb_count = line[-5:-1]
+                        self.__ob_group -= 1
+                    action = self.__state_dict.get(self.__state)
+                    if action is None:
+                        # Calling None would only surface as an opaque
+                        # TypeError, so report the unknown state explicitly.
+                        msg = ('no matching state in module preamble_div.py\n'
+                               '%s\n' % self.__state)
+                        raise self.__bug_handler(msg)
+                    action(line)
+        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
+        if self.__copy:
+            copy_obj.copy_file(self.__write_to, "preamble_div.data")
+        copy_obj.rename(self.__write_to, self.__file)
+        os.remove(self.__write_to)
+        return self.__all_lists
--- a/ebook_converter/ebooks/rtf2xml/preamble_rest.py
+++ b/ebook_converter/ebooks/rtf2xml/preamble_rest.py
@@ -0,0 +1,157 @@
+from __future__ import unicode_literals, absolute_import, print_function, division
+#########################################################################
+#                                                                       #
+#                                                                       #
+#   copyright 2002 Paul Henry Tremblay                                  #
+#                                                                       #
+#   This program is distributed in the hope that it will be useful,     #
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of      #
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU    #
+#   General Public License for more details.                            #
+#                                                                       #
+#                                                                       #
+#########################################################################
+import sys,os
+
+from calibre.ebooks.rtf2xml import copy
+from . import open_for_read, open_for_write
+
+
+class Preamble:
+    """
+    Fix the remaining parts of the preamble. This module does very little:
+    it makes sure that no text gets put in the revision table or the list
+    table. In the future, when I understand how to interpret the revision
+    table and list table, I will make these methods more functional.
+    """
+
+    def __init__(self, file,
+                bug_handler,
+                platform,
+                default_font,
+                code_page,
+                copy=None,
+                temp_dir=None,
+                ):
+        """
+        Required:
+            file--file to parse
+            bug_handler -- handed to copy.Copy by fix_preamble
+            platform --Windows or Macintosh
+            default_font -- the default font
+            code_page --the code page (ansi1252, for example)
+        Optional:
+            'copy'-- whether to make a copy of result for debugging
+            'temp_dir' --where to output temporary results (default is
+            directory from which the script is run.)
+        Returns:
+            nothing
+        """
+        self.__file = file
+        self.__bug_handler = bug_handler
+        self.__copy = copy
+        self.__default_font = default_font
+        self.__code_page = code_page
+        self.__platform = platform
+        scratch_name = "info_table_info.data"
+        self.__write_to = (
+            os.path.join(temp_dir, scratch_name) if temp_dir else scratch_name)
+
+    def __initiate_values(self):
+        """
+        Reset the state and build the state and marker dispatch tables.
+        """
+        self.__state = 'default'
+        self.__text_string = ''
+        self.__state_dict = {
+            'default': self.__default_func,
+            'revision': self.__revision_table_func,
+            'list_table': self.__list_table_func,
+            'body': self.__body_func,
+        }
+        self.__default_dict = {
+            'mi<mk<rtfhed-beg': self.__found_rtf_head_func,
+            'mi<mk<listabbeg_': self.__found_list_table_func,
+            'mi<mk<revtbl-beg': self.__found_revision_table_func,
+            'mi<mk<body-open_': self.__found_body_func,
+        }
+
+    def __default_func(self, line):
+        """Run the handler for a known marker; copy any other line."""
+        action = self.__default_dict.get(self.__token_info)
+        if action is None:
+            action = self.__write_obj.write
+        action(line)
+
+    def __found_rtf_head_func(self, line):
+        """
+        Requires:
+            line -- the line to parse
+        Returns:
+            nothing.
+        Logic:
+            Write to the output file the default font info, the code page
+            info, and the platform info. The marker line itself is not
+            copied.
+        """
+        self.__write_obj.write(
+            'mi<tg<empty-att_<rtf-definition'
+            '<default-font>%s<code-page>%s<platform>%s\n'
+            % (self.__default_font, self.__code_page, self.__platform)
+        )
+
+    def __found_list_table_func(self, line):
+        self.__state = 'list_table'  # the marker line is dropped
+
+    def __list_table_func(self, line):
+        """Copy list-table lines, dropping text ('tx') lines."""
+        if self.__token_info == 'mi<mk<listabend_':
+            # The end marker is not copied either.
+            self.__state = 'default'
+        elif line[0:2] != 'tx':
+            self.__write_obj.write(line)
+
+    def __found_revision_table_func(self, line):
+        self.__state = 'revision'  # the marker line is dropped
+
+    def __revision_table_func(self, line):
+        """Copy revision-table lines, dropping text ('tx') lines."""
+        if self.__token_info == 'mi<mk<revtbl-end':
+            # The end marker is not copied either.
+            self.__state = 'default'
+        elif line[0:2] != 'tx':
+            self.__write_obj.write(line)
+
+    def __found_body_func(self, line):
+        self.__state = 'body'  # the state is never changed again after this
+        self.__write_obj.write(line)
+
+    def __body_func(self, line):
+        self.__write_obj.write(line)
+
+    def fix_preamble(self):
+        """
+        Returns:
+            nothing (changes the original file)
+        Raises:
+            TypeError if the current state has no handler.
+        Logic:
+            Read one line in at a time and pass it to the handler for the
+            current state: default, revision table, list table or body.
+        """
+        self.__initiate_values()
+        with open_for_read(self.__file) as read_obj:
+            with open_for_write(self.__write_to) as self.__write_obj:
+                for line in read_obj:
+                    self.__token_info = line[:16]
+                    action = self.__state_dict.get(self.__state)
+                    if action is None:
+                        raise TypeError(
+                            'no matching state in module preamble_rest.py\n'
+                            + self.__state + '\n')
+                    action(line)
+        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
+        if self.__copy:
+            copy_obj.copy_file(self.__write_to, "preamble_div.data")
+        copy_obj.rename(self.__write_to, self.__file)
+        os.remove(self.__write_to)
--- a/ebook_converter/ebooks/rtf2xml/process_tokens.py
+++ b/ebook_converter/ebooks/rtf2xml/process_tokens.py
@@ -0,0 +1,837 @@
+from __future__ import absolute_import, division, print_function, unicode_literals
+#########################################################################
+#                                                                       #
+#                                                                       #
+#   copyright 2002 Paul Henry Tremblay                                  #
+#                                                                       #
+#   This program is distributed in the hope that it will be useful,     #
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of      #
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU    #
+#   General Public License for more details.                            #
+#                                                                       #
+#                                                                       #
+#########################################################################
+import os, re
+
+from calibre.ebooks.rtf2xml import copy, check_brackets
+from calibre.ptempfile import better_mktemp
+from polyglot.builtins import unicode_type
+
+from . import open_for_read, open_for_write
+
+
+class ProcessTokens:
+    """
+    Process each token on a line and add information that will be useful for
+    later processing. Information will be put on one line, delimited by "<"
+    for main fields, and ">" for sub fields
+    """
+
+    def __init__(self,
+            in_file,
+            exception_handler,
+            bug_handler,
+            copy=None,
+            run_level=1,
+            ):
+        """Store the settings, pick a temporary output file and build the
+        token table and the regular expressions."""
+        self.__file = in_file
+        self.__exception_handler = exception_handler
+        self.__bug_handler = bug_handler
+        self.__copy = copy
+        self.__run_level = run_level
+        self.__bracket_count = 0
+        self.__write_to = better_mktemp()
+        self.initiate_token_dict()
+        self.compile_expressions()
+
+    def compile_expressions(self):
+        self.__num_exp = re.compile(r"([a-zA-Z]+)(.*)")  # letters, then rest
+        self.__utf_exp = re.compile(r'(&.*?;)')  # shortest '&' ... ';' run
+
+    def initiate_token_dict(self):
+        self.__return_code = 0
+        self.dict_token={
+        # unicode
+        'mshex'              :  ('nu', '__________', self.__ms_hex_func),
+        # brackets
+        '{'                  : ('nu', '{', self.ob_func),
+        '}'                  : ('nu', '}', self.cb_func),
+        # microsoft characters
+        'ldblquote'          : ('mc', 'ldblquote', self.ms_sub_func),
+        'rdblquote'          : ('mc', 'rdblquote', self.ms_sub_func),
+        'rquote'             : ('mc', 'rquote', self.ms_sub_func),
+        'lquote'             : ('mc', 'lquote', self.ms_sub_func),
+        'emdash'             : ('mc', 'emdash', self.ms_sub_func),
+        'endash'             : ('mc', 'endash', self.ms_sub_func),
+        'bullet'             : ('mc', 'bullet', self.ms_sub_func),
+        '~'                  : ('mc', '~', self.ms_sub_func),
+        'tab'                : ('mc', 'tab', self.ms_sub_func),
+        '_'                  : ('mc', '_', self.ms_sub_func),
+        ';'                  : ('mc', ';', self.ms_sub_func),
+        # this must be wrong
+        '-'                  : ('mc', '-', self.ms_sub_func),
+        'line'               :  ('mi', 'hardline-break', self.direct_conv_func),  # calibre
+        # misc => ml
+        '*'                  : ('ml', 'asterisk__', self.default_func),
+        ':'                  : ('ml', 'colon_____', self.default_func),
+        # text
+        'backslash'          : ('nu', '\\', self.text_func),
+        'ob'                 : ('nu', '{', self.text_func),
+        'cb'                 : ('nu', '}', self.text_func),
+        # paragraph formatting => pf
+        'page'               :  ('pf', 'page-break', self.default_func),
+        'par'                : ('pf', 'par-end___', self.default_func),
+        'pard'               : ('pf', 'par-def___', self.default_func),
+        'keepn'              : ('pf', 'keep-w-nex', self.bool_st_func),
+        'widctlpar'          : ('pf', 'widow-cntl', self.bool_st_func),
+        'adjustright'        : ('pf', 'adjust-rgt', self.bool_st_func),
+        'lang'               : ('pf', 'language__', self.__language_func),
+        'ri'                 : ('pf', 'right-inde', self.divide_by_20),
+        'fi'                 : ('pf', 'fir-ln-ind', self.divide_by_20),
+        'li'                 : ('pf', 'left-inden', self.divide_by_20),
+        'sb'                 : ('pf', 'space-befo', self.divide_by_20),
+        'sa'                 : ('pf', 'space-afte', self.divide_by_20),
+        'sl'                 : ('pf', 'line-space', self.divide_by_20),
+        'deftab'             : ('pf', 'default-ta', self.divide_by_20),
+        'ql'                 : ('pf', 'align_____<left', self.two_part_func),
+        'qc'                 : ('pf', 'align_____<cent', self.two_part_func),
+        'qj'                 : ('pf', 'align_____<just', self.two_part_func),
+        'qr'                 : ('pf', 'align_____<right', self.two_part_func),
+        'nowidctlpar'        : ('pf', 'widow-cntr<false', self.two_part_func),
+        'tx'                 :  ('pf', 'tab-stop__', self.divide_by_20),
+        'tb'                 :  ('pf', 'tab-bar-st', self.divide_by_20),
+        'tqr'                :  ('pf', 'tab-right_', self.default_func),
+        'tqdec'              :  ('pf', 'tab-dec___', self.default_func),
+        'tqc'                :  ('pf', 'tab-center', self.default_func),
+        'tlul'               :  ('pf', 'leader-und', self.default_func),
+        'tlhyph'             :  ('pf', 'leader-hyp', self.default_func),
+        'tldot'              :  ('pf', 'leader-dot', self.default_func),
+        # stylesheet = > ss
+        'stylesheet'         : ('ss', 'style-shet', self.default_func),
+        'sbasedon'           : ('ss', 'based-on__', self.default_func),
+        'snext'              : ('ss', 'next-style', self.default_func),
+        'cs'                 : ('ss', 'char-style', self.default_func),
+        's'                  : ('ss', 'para-style', self.default_func),
+        # graphics => gr
+        'pict'               : ('gr', 'picture___', self.default_func),
+        'objclass'           : ('gr', 'obj-class_', self.default_func),
+        'macpict'            : ('gr', 'mac-pic___', self.default_func),
+        # section => sc
+        'sect'               : ('sc', 'section___', self.default_func),
+        'sectd'              : ('sc', 'sect-defin', self.default_func),
+        'endhere'            : ('sc', 'sect-note_', self.default_func),
+        # list=> ls
+        'pntext'             : ('ls', 'list-text_', self.default_func),
+        # this line must be wrong because it duplicates an earlier one
+        'listtext'           : ('ls', 'list-text_', self.default_func),
+        'pn'                 : ('ls', 'list______', self.default_func),
+        'pnseclvl'           : ('ls', 'list-level', self.default_func),
+        'pncard'             : ('ls', 'list-cardi', self.bool_st_func),
+        'pndec'              : ('ls', 'list-decim', self.bool_st_func),
+        'pnucltr'            : ('ls', 'list-up-al', self.bool_st_func),
+        'pnucrm'             : ('ls', 'list-up-ro', self.bool_st_func),
+        'pnord'              : ('ls', 'list-ord__', self.bool_st_func),
+        'pnordt'             : ('ls', 'list-ordte', self.bool_st_func),
+        'pnlvlblt'           : ('ls', 'list-bulli', self.bool_st_func),
+        'pnlvlbody'          : ('ls', 'list-simpi', self.bool_st_func),
+        'pnlvlcont'          : ('ls', 'list-conti', self.bool_st_func),
+        'pnhang'             : ('ls', 'list-hang_', self.bool_st_func),
+        'pntxtb'             : ('ls', 'list-tebef', self.bool_st_func),
+        'ilvl'               : ('ls', 'list-level', self.default_func),
+        'ls'                 : ('ls', 'list-id___', self.default_func),
+        'pnstart'            : ('ls', 'list-start', self.default_func),
+        'itap'               : ('ls', 'nest-level', self.default_func),
+        'leveltext'          :  ('ls', 'level-text', self.default_func),
+        'levelnumbers'       :  ('ls', 'level-numb', self.default_func),
+        'list'               :  ('ls', 'list-in-tb', self.default_func),
+        'listlevel'          :  ('ls', 'list-tb-le', self.default_func),
+        'listname'           :  ('ls', 'list-name_', self.default_func),
+        'listtemplateid'     :  ('ls', 'ls-tem-id_', self.default_func),
+        'leveltemplateid'    :  ('ls', 'lv-tem-id_', self.default_func),
+        'listhybrid'         :  ('ls', 'list-hybri', self.default_func),
+        'levelstartat'       :  ('ls', 'level-star', self.default_func),
+        'levelspace'         :  ('ls', 'level-spac', self.divide_by_20),
+        'levelindent'        :  ('ls', 'level-inde', self.default_func),
+        'levelnfc'           :  ('ls', 'level-type', self.__list_type_func),
+        'levelnfcn'          :  ('ls', 'level-type', self.__list_type_func),
+        'listid'             :  ('ls', 'lis-tbl-id',  self.default_func),
+        'listoverride'       :  ('ls', 'lis-overid', self.default_func),
+        # duplicate
+        'pnlvl'              : ('ls', 'list-level', self.default_func),
+        # root info => ri
+        'rtf'                : ('ri', 'rtf_______', self.default_func),
+        'deff'               : ('ri', 'deflt-font', self.default_func),
+        'mac'                : ('ri', 'macintosh_', self.default_func),
+        'pc'                 : ('ri', 'pc________', self.default_func),
+        'pca'                : ('ri', 'pca_______', self.default_func),
+        'ansi'               : ('ri', 'ansi______', self.default_func),
+        'ansicpg'            : ('ri', 'ansi-codpg', self.default_func),
+        # notes => nt
+        'footnote'           : ('nt', 'footnote__', self.default_func),
+        'ftnalt'             : ('nt', 'type______<endnote', self.two_part_func),
+        # anchor => an
+        'tc'                 : ('an', 'toc_______', self.default_func),
+        'bkmkstt'            : ('an', 'book-mk-st', self.default_func),
+        'bkmkstart'          : ('an', 'book-mk-st', self.default_func),
+        'bkmkend'            : ('an', 'book-mk-en', self.default_func),
+        'xe'                 : ('an', 'index-mark', self.default_func),
+        'rxe'                : ('an', 'place_____', self.default_func),
+        # index => in
+        'bxe'                : ('in', 'index-bold', self.default_func),
+        'ixe'                : ('in', 'index-ital', self.default_func),
+        'txe'                : ('in', 'index-see_', self.default_func),
+        # table of contents => tc
+        'tcl'               :   ('tc', 'toc-level_', self.default_func),
+        'tcn'               :   ('tc', 'toc-sup-nu', self.default_func),
+        # field => fd
+        'field'              : ('fd', 'field_____', self.default_func),
+        'fldinst'            : ('fd', 'field-inst', self.default_func),
+        'fldrslt'            : ('fd', 'field-rslt', self.default_func),
+        'datafield'          : ('fd', 'datafield_', self.default_func),
+        # info-tables => it
+        'fonttbl'            : ('it', 'font-table', self.default_func),
+        'colortbl'           : ('it', 'colr-table', self.default_func),
+        'listoverridetable'  : ('it', 'lovr-table', self.default_func),
+        'listtable'          : ('it', 'listtable_', self.default_func),
+        'revtbl'             : ('it', 'revi-table', self.default_func),
+        # character info => ci
+        'b'                  : ('ci', 'bold______', self.bool_st_func),
+        'blue'               : ('ci', 'blue______', self.color_func),
+        'caps'               : ('ci', 'caps______', self.bool_st_func),
+        'cf'                 : ('ci', 'font-color', self.colorz_func),
+        'chftn'              : ('ci', 'footnot-mk', self.bool_st_func),
+        'dn'                 : ('ci', 'font-down_', self.divide_by_2),
+        'embo'               : ('ci', 'emboss____', self.bool_st_func),
+        'f'                  : ('ci', 'font-style', self.default_func),
+        'fs'                 : ('ci', 'font-size_', self.divide_by_2),
+        'green'              : ('ci', 'green_____', self.color_func),
+        'i'                  : ('ci', 'italics___', self.bool_st_func),
+        'impr'               : ('ci', 'engrave___', self.bool_st_func),
+        'outl'               : ('ci', 'outline___', self.bool_st_func),
+        'plain'              : ('ci', 'plain_____', self.bool_st_func),
+        'red'                : ('ci', 'red_______', self.color_func),
+        'scaps'              : ('ci', 'small-caps', self.bool_st_func),
+        'shad'               : ('ci', 'shadow____', self.bool_st_func),
+        'strike'             : ('ci', 'strike-thr', self.bool_st_func),
+        'striked'            : ('ci', 'dbl-strike', self.bool_st_func),
+        'sub'                : ('ci', 'subscript_', self.bool_st_func),
+        'super'              : ('ci', 'superscrip', self.bool_st_func),
+        'nosupersub'         : ('ci', 'no-su-supe', self.__no_sup_sub_func),
+        'up'                 : ('ci', 'font-up___', self.divide_by_2),
+        'v'                  : ('ci', 'hidden____', self.default_func),
+        # underline
+        # can't see why it isn't a char info: 'ul'=>'ci'
+        'ul'                 : ('ci', 'underlined<continous', self.two_part_func),
+        'uld'                : ('ci', 'underlined<dotted', self.two_part_func),
+        'uldash'             : ('ci', 'underlined<dash', self.two_part_func),
+        'uldashd'            : ('ci', 'underlined<dash-dot', self.two_part_func),
+        'uldashdd'           : ('ci', 'underlined<dash-dot-dot', self.two_part_func),
+        'uldb'               : ('ci', 'underlined<double', self.two_part_func),
+        'ulhwave'            : ('ci', 'underlined<heavy-wave', self.two_part_func),
+        'ulldash'            : ('ci', 'underlined<long-dash', self.two_part_func),
+        'ulth'               : ('ci', 'underlined<thich', self.two_part_func),
+        'ulthd'              : ('ci', 'underlined<thick-dotted', self.two_part_func),
+        'ulthdash'           : ('ci', 'underlined<thick-dash', self.two_part_func),
+        'ulthdashd'          : ('ci', 'underlined<thick-dash-dot', self.two_part_func),
+        'ulthdashdd'         : ('ci', 'underlined<thick-dash-dot-dot', self.two_part_func),
+        'ulthldash'          : ('ci', 'underlined<thick-long-dash', self.two_part_func),
+        'ululdbwave'         : ('ci', 'underlined<double-wave', self.two_part_func),
+        'ulw'                : ('ci', 'underlined<word', self.two_part_func),
+        'ulwave'             : ('ci', 'underlined<wave', self.two_part_func),
+        'ulnone'             : ('ci', 'underlined<false', self.two_part_func),
+        # table => tb
+        'trowd'              : ('tb', 'row-def___', self.default_func),
+        'cell'               : ('tb', 'cell______', self.default_func),
+        'row'                : ('tb', 'row_______', self.default_func),
+        'intbl'              : ('tb', 'in-table__', self.default_func),
+        'cols'               : ('tb', 'columns___', self.default_func),
+        'trleft'             : ('tb', 'row-pos-le', self.divide_by_20),
+        'cellx'              : ('tb', 'cell-posit', self.divide_by_20),
+        'trhdr'              :  ('tb', 'row-header', self.default_func),
+        # preamble => pr
+        # document information => di
+        # TODO integrate \userprops
+        'info'               : ('di', 'doc-info__', self.default_func),
+        'title'              : ('di', 'title_____', self.default_func),
+        'author'             : ('di', 'author____', self.default_func),
+        'operator'           : ('di', 'operator__', self.default_func),
+        'manager'            : ('di', 'manager___', self.default_func),
+        'company'            : ('di', 'company___', self.default_func),
+        'keywords'           :  ('di', 'keywords__', self.default_func),
+        'category'           :  ('di', 'category__', self.default_func),
+        'doccomm'            :  ('di', 'doc-notes_', self.default_func),
+        'comment'            :  ('di', 'doc-notes_', self.default_func),
+        'subject'            :  ('di', 'subject___', self.default_func),
+        'creatim'            : ('di', 'create-tim', self.default_func),
+        'yr'                 : ('di', 'year______', self.default_func),
+        'mo'                 : ('di', 'month_____', self.default_func),
+        'dy'                 : ('di', 'day_______', self.default_func),
+        'min'                : ('di', 'minute____', self.default_func),
+        'sec'                : ('di', 'second____', self.default_func),
+        'revtim'             : ('di', 'revis-time', self.default_func),
+        'edmins'             : ('di', 'edit-time_', self.default_func),
+        'printim'            : ('di', 'print-time', self.default_func),
+        'buptim'             : ('di', 'backuptime', self.default_func),
+        'nofwords'           : ('di', 'num-of-wor', self.default_func),
+        'nofchars'           : ('di', 'num-of-chr', self.default_func),
+        'nofcharsws'         : ('di', 'numofchrws', self.default_func),
+        'nofpages'           : ('di', 'num-of-pag', self.default_func),
+        'version'            : ('di', 'version___', self.default_func),
+        'vern'               : ('di', 'intern-ver', self.default_func),
+        'hlinkbase'          : ('di', 'linkbase__', self.default_func),
+        'id'                 : ('di', 'internalID', self.default_func),
+        # headers and footers => hf
+        'headerf'            : ('hf', 'head-first', self.default_func),
+        'headerl'            : ('hf', 'head-left_', self.default_func),
+        'headerr'            : ('hf', 'head-right', self.default_func),
+        'footerf'            : ('hf', 'foot-first', self.default_func),
+        'footerl'            : ('hf', 'foot-left_', self.default_func),
+        'footerr'            : ('hf', 'foot-right', self.default_func),
+        'header'             : ('hf', 'header____', self.default_func),
+        'footer'             : ('hf', 'footer____', self.default_func),
+        # page => pa
+        'margl'              : ('pa', 'margin-lef', self.divide_by_20),
+        'margr'              : ('pa', 'margin-rig', self.divide_by_20),
+        'margb'              : ('pa', 'margin-bot', self.divide_by_20),
+        'margt'              : ('pa', 'margin-top', self.divide_by_20),
+        'gutter'             : ('pa', 'gutter____', self.divide_by_20),
+        'paperw'             : ('pa', 'paper-widt', self.divide_by_20),
+        'paperh'             : ('pa', 'paper-hght', self.divide_by_20),
+        # annotation => an
+        'annotation'         :  ('an', 'annotation', self.default_func),
+        # border => bd
+        'trbrdrh'            : ('bd', 'bor-t-r-hi', self.default_func),
+        'trbrdrv'            : ('bd', 'bor-t-r-vi', self.default_func),
+        'trbrdrt'            : ('bd', 'bor-t-r-to', self.default_func),
+        'trbrdrl'            : ('bd', 'bor-t-r-le', self.default_func),
+        'trbrdrb'            : ('bd', 'bor-t-r-bo', self.default_func),
+        'trbrdrr'            : ('bd', 'bor-t-r-ri', self.default_func),
+        'clbrdrb'            : ('bd', 'bor-cel-bo', self.default_func),
+        'clbrdrt'            : ('bd', 'bor-cel-to', self.default_func),
+        'clbrdrl'            : ('bd', 'bor-cel-le', self.default_func),
+        'clbrdrr'            : ('bd', 'bor-cel-ri', self.default_func),
+        'brdrb'              : ('bd', 'bor-par-bo', self.default_func),
+        'brdrt'              : ('bd', 'bor-par-to', self.default_func),
+        'brdrl'              : ('bd', 'bor-par-le', self.default_func),
+        'brdrr'              : ('bd', 'bor-par-ri', self.default_func),
+        'box'                : ('bd', 'bor-par-bx', self.default_func),
+        'chbrdr'            : ('bd', 'bor-par-bo', self.default_func),
+        'brdrbtw'            : ('bd', 'bor-for-ev', self.default_func),
+        'brdrbar'            : ('bd', 'bor-outsid', self.default_func),
+        'brdrnone'           : ('bd', 'bor-none__<false', self.two_part_func),
+        # border type => bt
+        'brdrs'              : ('bt', 'bdr-single', self.default_func),
+        'brdrth'             : ('bt', 'bdr-doubtb', self.default_func),
+        'brdrsh'             : ('bt', 'bdr-shadow', self.default_func),
+        'brdrdb'             : ('bt', 'bdr-double', self.default_func),
+        'brdrdot'            : ('bt', 'bdr-dotted', self.default_func),
+        'brdrdash'           : ('bt', 'bdr-dashed', self.default_func),
+        'brdrhair'           : ('bt', 'bdr-hair__', self.default_func),
+        'brdrinset'          : ('bt', 'bdr-inset_', self.default_func),
+        'brdrdashsm'         : ('bt', 'bdr-das-sm', self.default_func),
+        'brdrdashd'          : ('bt', 'bdr-dot-sm', self.default_func),
+        'brdrdashdd'         : ('bt', 'bdr-dot-do', self.default_func),
+        'brdroutset'         : ('bt', 'bdr-outset', self.default_func),
+        'brdrtriple'         : ('bt', 'bdr-trippl', self.default_func),
+        'brdrtnthsg'         : ('bt', 'bdr-thsm__', self.default_func),
+        'brdrthtnsg'         : ('bt', 'bdr-htsm__', self.default_func),
+        'brdrtnthtnsg'       : ('bt', 'bdr-hthsm_', self.default_func),
+        'brdrtnthmg'         : ('bt', 'bdr-thm___', self.default_func),
+        'brdrthtnmg'         : ('bt', 'bdr-htm___', self.default_func),
+        'brdrtnthtnmg'       : ('bt', 'bdr-hthm__', self.default_func),
+        'brdrtnthlg'         : ('bt', 'bdr-thl___', self.default_func),
+        'brdrtnthtnlg'       : ('bt', 'bdr-hthl__', self.default_func),
+        'brdrwavy'           : ('bt', 'bdr-wavy__', self.default_func),
+        'brdrwavydb'         : ('bt', 'bdr-d-wav_', self.default_func),
+        'brdrdashdotstr'     : ('bt', 'bdr-strip_', self.default_func),
+        'brdremboss'         : ('bt', 'bdr-embos_', self.default_func),
+        'brdrengrave'        : ('bt', 'bdr-engra_', self.default_func),
+        'brdrframe'          : ('bt', 'bdr-frame_', self.default_func),
+        'brdrw'              : ('bt', 'bdr-li-wid', self.divide_by_20),
+        'brsp'              : ('bt', 'bdr-sp-wid', self.divide_by_20),
+        'brdrcf'              : ('bt', 'bdr-color_', self.default_func),
+        # comments
+        # 'comment'              :	('cm', 'comment___', self.default_func),
+        }
+        self.__number_type_dict = {
+            0:      'Arabic',
+            1:      'uppercase Roman numeral',
+            2:      'lowercase Roman numeral',
+            3:      'uppercase letter',
+            4:      'lowercase letter',
+            5:      'ordinal number',
+            6:      'cardianl text number',
+            7:      'ordinal text number',
+            10:     'Kanji numbering without the digit character',
+            11:     'Kanji numbering with the digit character',
+            1246:   'phonetic Katakana characters in aiueo order',
+            1346:   'phonetic katakana characters in iroha order',
+            14:     'double byte character',
+            15:     'single byte character',
+            16:     'Kanji numbering 3',
+            17:     'Kanji numbering 4',
+            18:     'Circle numbering' ,
+            19:     'double-byte Arabic numbering',
+            2046:   'phonetic double-byte Katakana characters',
+            2146:   'phonetic double-byte katakana characters',
+            22:     'Arabic with leading zero',
+            23:     'bullet',
+            24:     'Korean numbering 2',
+            25:     'Korean numbering 1',
+            26:     'Chinese numbering 1',
+            27:     'Chinese numbering 2',
+            28:     'Chinese numbering 3',
+            29:     'Chinese numbering 4',
+            30:     'Chinese Zodiac numbering 1',
+            31:     'Chinese Zodiac numbering 2',
+            32:     'Chinese Zodiac numbering 3',
+            33:     'Taiwanese double-byte numbering 1',
+            34:     'Taiwanese double-byte numbering 2',
+            35:     'Taiwanese double-byte numbering 3',
+            36:     'Taiwanese double-byte numbering 4',
+            37:     'Chinese double-byte numbering 1',
+            38:     'Chinese double-byte numbering 2',
+            39:     'Chinese double-byte numbering 3',
+            40:     'Chinese double-byte numbering 4',
+            41:     'Korean double-byte numbering 1',
+            42:     'Korean double-byte numbering 2',
+            43:     'Korean double-byte numbering 3',
+            44:     'Korean double-byte numbering 4',
+            45:     'Hebrew non-standard decimal',
+            46:     'Arabic Alif Ba Tah',
+            47:     'Hebrew Biblical standard',
+            48:     'Arabic Abjad style',
+            255:    'No number',
+        }
+        self.__language_dict = {
+            1078 	:  'Afrikaans',
+            1052 	:  'Albanian',
+            1025 	:  'Arabic',
+            5121 	:  'Arabic Algeria',
+            15361 	:  'Arabic Bahrain',
+            3073 	:  'Arabic Egypt',
+            1 	    :   'Arabic General',
+            2049 	:  'Arabic Iraq',
+            11265 	:  'Arabic Jordan',
+            13313 	:  'Arabic Kuwait',
+            12289 	:  'Arabic Lebanon',
+            4097 	:  'Arabic Libya',
+            6145 	:  'Arabic Morocco',
+            8193 	:  'Arabic Oman',
+            16385 	:  'Arabic Qatar',
+            10241 	:  'Arabic Syria',
+            7169 	:  'Arabic Tunisia',
+            14337 	:  'Arabic U.A.E.',
+            9217 	:  'Arabic Yemen',
+            1067 	:  'Armenian',
+            1101 	:  'Assamese',
+            2092 	:  'Azeri Cyrillic',
+            1068 	:  'Azeri Latin',
+            1069 	:  'Basque',
+            1093 	:  'Bengali',
+            4122 	:  'Bosnia Herzegovina',
+            1026 	:  'Bulgarian',
+            1109 	:  'Burmese',
+            1059 	:  'Byelorussian',
+            1027 	:  'Catalan',
+            2052 	:  'Chinese China',
+            4 	    :  'Chinese General',
+            3076 	:  'Chinese Hong Kong',
+            4100 	:  'Chinese Singapore',
+            1028 	:  'Chinese Taiwan',
+            1050 	:  'Croatian',
+            1029 	:  'Czech',
+            1030 	:  'Danish',
+            2067 	:  'Dutch Belgium',
+            1043 	:  'Dutch Standard',
+            3081 	:  'English Australia',
+            10249 	:  'English Belize',
+            2057 	:  'English British',
+            4105 	:  'English Canada',
+            9225 	:  'English Caribbean',
+            9 	    :  'English General',
+            6153 	:  'English Ireland',
+            8201 	:  'English Jamaica',
+            5129 	:  'English New Zealand',
+            13321 	:  'English Philippines',
+            7177 	:  'English South Africa',
+            11273 	:  'English Trinidad',
+            1033 	:  'English United States',
+            1061 	:  'Estonian',
+            1080 	:  'Faerose',
+            1065 	:  'Farsi',
+            1035 	:  'Finnish',
+            1036 	:  'French',
+            2060 	:  'French Belgium',
+            11276 	:  'French Cameroon',
+            3084 	:  'French Canada',
+            12300 	:  'French Cote d\'Ivoire',
+            5132 	:  'French Luxembourg',
+            13324 	:  'French Mali',
+            6156 	:  'French Monaco',
+            8204 	:  'French Reunion',
+            10252 	:  'French Senegal',
+            4108 	:  'French Swiss',
+            7180 	:  'French West Indies',
+            9228 	:  'French Democratic Republic of the Congo',
+            1122 	:  'Frisian',
+            1084 	:  'Gaelic',
+            2108 	:  'Gaelic Ireland',
+            1110 	:  'Galician',
+            1079 	:  'Georgian',
+            1031 	:  'German',
+            3079 	:  'German Austrian',
+            5127 	:  'German Liechtenstein',
+            4103 	:  'German Luxembourg',
+            2055 	:  'German Switzerland',
+            1032 	:  'Greek',
+            1095 	:  'Gujarati',
+            1037 	:  'Hebrew',
+            1081 	:  'Hindi',
+            1038 	:  'Hungarian',
+            1039 	:  'Icelandic',
+            1057 	:  'Indonesian',
+            1040 	:  'Italian',
+            2064 	:  'Italian Switzerland',
+            1041 	:  'Japanese',
+            1099 	:  'Kannada',
+            1120 	:  'Kashmiri',
+            2144 	:  'Kashmiri India',
+            1087 	:  'Kazakh',
+            1107 	:  'Khmer',
+            1088 	:  'Kirghiz',
+            1111 	:  'Konkani',
+            1042 	:  'Korean',
+            2066 	:  'Korean Johab',
+            1108 	:  'Lao',
+            1062 	:  'Latvian',
+            1063 	:  'Lithuanian',
+            2087 	:  'Lithuanian Classic',
+            1086 	:  'Malay',
+            2110 	:  'Malay Brunei Darussalam',
+            1100 	:  'Malayalam',
+            1082 	:  'Maltese',
+            1112 	:  'Manipuri',
+            1102 	:  'Marathi',
+            1104 	:  'Mongolian',
+            1121 	:  'Nepali',
+            2145 	:  'Nepali India',
+            1044 	:  'Norwegian Bokmal',
+            2068 	:  'Norwegian Nynorsk',
+            1096 	:  'Oriya',
+            1045 	:  'Polish',
+            1046 	:  'Portuguese (Brazil)',
+            2070 	:  'Portuguese (Portugal)',
+            1094 	:  'Punjabi',
+            1047 	:  'Rhaeto-Romanic',
+            1048 	:  'Romanian',
+            2072 	:  'Romanian Moldova',
+            1049 	:  'Russian',
+            2073 	:  'Russian Moldova',
+            1083 	:  'Sami Lappish',
+            1103 	:  'Sanskrit',
+            3098 	:  'Serbian Cyrillic',
+            2074 	:  'Serbian Latin',
+            1113 	:  'Sindhi',
+            1051 	:  'Slovak',
+            1060 	:  'Slovenian',
+            1070 	:  'Sorbian',
+            11274 	:  'Spanish Argentina',
+            16394 	:  'Spanish Bolivia',
+            13322 	:  'Spanish Chile',
+            9226 	:  'Spanish Colombia',
+            5130 	:  'Spanish Costa Rica',
+            7178 	:  'Spanish Dominican Republic',
+            12298 	:  'Spanish Ecuador',
+            17418 	:  'Spanish El Salvador',
+            4106 	:  'Spanish Guatemala',
+            18442 	:  'Spanish Honduras',
+            2058 	:  'Spanish Mexico',
+            3082 	:  'Spanish Modern',
+            19466 	:  'Spanish Nicaragua',
+            6154 	:  'Spanish Panama',
+            15370 	:  'Spanish Paraguay',
+            10250 	:  'Spanish Peru',
+            20490 	:  'Spanish Puerto Rico',
+            1034 	:  'Spanish Traditional',
+            14346 	:  'Spanish Uruguay',
+            8202 	:  'Spanish Venezuela',
+            1072 	:  'Sutu',
+            1089 	:  'Swahili',
+            1053 	:  'Swedish',
+            2077 	:  'Swedish Finland',
+            1064 	:  'Tajik',
+            1097 	:  'Tamil',
+            1092 	:  'Tatar',
+            1098 	:  'Telugu',
+            1054 	:  'Thai',
+            1105 	:  'Tibetan',
+            1073 	:  'Tsonga',
+            1074 	:  'Tswana',
+            1055 	:  'Turkish',
+            1090 	:  'Turkmen',
+            1058 	:  'Ukranian',
+            1056 	:  'Urdu',
+            2080 	:  'Urdu India',
+            2115 	:  'Uzbek Cyrillic',
+            1091 	:  'Uzbek Latin',
+            1075 	:  'Venda',
+            1066 	:  'Vietnamese',
+            1106 	:  'Welsh',
+            1076 	:  'Xhosa',
+            1085 	:  'Yiddish',
+            1077 	:  'Zulu',
+            1024 	:  'Unkown',
+            255 	:  'Unkown',
+        }
+    """
+        # unknown
+        # These must get passed on because they occur after \\*
+        'do'                :   ('un', 'unknown___', self.default_func),
+        'company'           :	('un', 'company___', self.default_func),
+        'shpinst'           :   ('un', 'unknown___', self.default_func),
+        'panose'            :   ('un', 'unknown___', self.default_func),
+        'falt'              :   ('un', 'unknown___', self.default_func),
+        'listoverridetable' :   ('un', 'unknown___', self.default_func),
+        'category'          :   ('un', 'unknown___', self.default_func),
+        'template'          :   ('un', 'unknown___', self.default_func),
+        'ud'                :   ('un', 'unknown___', self.default_func),
+        'formfield'         :   ('un', 'unknown___', self.default_func),
+        'ts'                :   ('un', 'unknown___', self.default_func),
+        'rsidtbl'           :   ('un', 'unknown___', self.default_func),
+        'generator'         :   ('un', 'unknown___', self.default_func),
+        'ftnsep'            :   ('un', 'unknown___', self.default_func),
+        'aftnsep'           :   ('un', 'unknown___', self.default_func),
+        'aftnsepc'           :   ('un', 'unknown___', self.default_func),
+        'aftncn'            :   ('un', 'unknown___', self.default_func),
+        'objclass'           :   ('un', 'unknown___', self.default_func),
+        'objdata'           :   ('un', 'unknown___', self.default_func),
+        'picprop'           :   ('un', 'unknown___', self.default_func),
+        'blipuid'           :   ('un', 'unknown___', self.default_func),
+    """
+
+    def __ms_hex_func(self, pre, token, num):
+        """Build the text line for a hexadecimal character escape.
+
+        ``pre`` and ``token`` are not used. The first character of ``num``
+        (a leading 0 that was added beforehand) is dropped, the remainder
+        is upper-cased because the mappings store hex in capitals, and an
+        apostrophe is put in front of it for the mappings.
+        """
+        hex_digits = num[1:].upper()
+        return 'tx<hx<__________<\'%s\n' % hex_digits
+
+    def ms_sub_func(self, pre, token, num):
+        """Return a ``tx<mc`` text line carrying ``token``.
+
+        ``pre`` and ``num`` are accepted but not used.
+        """
+        line = 'tx<mc<__________<%s\n' % token
+        return line
+
+    def direct_conv_func(self, pre, token, num):
+        """Return an empty-tag line (``mi<tg<empty``) named after ``token``.
+
+        ``pre`` and ``num`` are accepted but not used.
+        """
+        line = 'mi<tg<empty_____<%s\n' % token
+        return line
+
+    def default_func(self, pre, token, num):
+        """Return the standard control-word line for ``token``.
+
+        The line has the form ``cw<pre<token<nu<value``; a missing
+        argument (``num`` is None) is written as ``true``.
+        """
+        value = 'true' if num is None else num
+        return 'cw<%s<%s<nu<%s\n' % (pre, token, value)
+
+    def colorz_func(self, pre, token, num):
+        """Return the control-word line for ``token``, defaulting to 0.
+
+        Same layout as the default control-word line, except that a
+        missing argument (``num`` is None) is written as ``0``.
+        """
+        value = '0' if num is None else num
+        return 'cw<%s<%s<nu<%s\n' % (pre, token, value)
+
+    def __list_type_func(self, pre, token, num):
+        """Translate a numeric list-numbering code into its readable name.
+
+        ``num`` is the argument of the control word (text) or None. It is
+        converted to an integer and looked up in
+        ``self.__number_type_dict``. A missing, non-numeric or unknown
+        code falls back to 'Arabic'.
+
+        Returns the control-word line carrying the readable name.
+
+        Raises ``self.__bug_handler`` when ``self.__run_level`` is above 3
+        and the code is non-numeric or has no entry in the dictionary.
+        """
+        list_type = None
+        if num is not None:
+            try:
+                num = int(num)
+            except ValueError:
+                if self.__run_level > 3:
+                    msg = 'Number "%s" cannot be converted to integer\n' % num
+                    raise self.__bug_handler(msg)
+            list_type = self.__number_type_dict.get(num)
+            if list_type is None and self.__run_level > 3:
+                # Fill in the offending code and hand the message to the
+                # handler so the report says what was not found.
+                msg = 'No type for "%s" in self.__number_type_dict\n' % num
+                raise self.__bug_handler(msg)
+        if list_type is None:
+            list_type = 'Arabic'
+        return 'cw<%s<%s<nu<%s\n' % (pre, token, list_type)
+
+    def __language_func(self, pre, token, num):
+        """Translate a numeric language code into the language name.
+
+        The first run of digits in ``num`` is looked up in
+        ``self.__language_dict``. When ``num`` holds no digits, is not
+        text, or the code is unknown, the name 'not defined' is used.
+
+        Returns the control-word line carrying the language name.
+
+        Raises ``self.__bug_handler`` when no name was found and
+        ``self.__run_level`` is above 3.
+        """
+        try:
+            code = int(re.search('[0-9]+', num).group())
+        except (TypeError, AttributeError):
+            # TypeError: num is None or not text. AttributeError: num holds
+            # no digits, so there is no match object to read.
+            code = None
+        lang_name = self.__language_dict.get(code)
+        if not lang_name:
+            lang_name = "not defined"
+            if self.__run_level > 3:
+                msg = 'No entry for number "%s"' % num
+                raise self.__bug_handler(msg)
+        return 'cw<%s<%s<nu<%s\n' % (pre, token, lang_name)
+
+    def two_part_func(self, pre, token, num):
+        """Return the line for a token that carries its own value.
+
+        ``token`` has the form ``name<value``; the incoming ``num`` is
+        ignored and replaced by the value found in the token.
+        """
+        parts = token.split("<")
+        name, value = parts[0], parts[1]
+        return 'cw<%s<%s<nu<%s\n' % (pre, name, value)
+
+    def divide_by_2(self, pre, token, num):
+        """Return the control-word line with the argument divided by 2.
+
+        The division is delegated to ``divide_num``.
+        """
+        halved = self.divide_num(num, 2)
+        return 'cw<%s<%s<nu<%s\n' % (pre, token, halved)
+
+    def divide_by_20(self, pre, token, num):
+        """Return the control-word line with the argument divided by 20.
+
+        The division is delegated to ``divide_num``.
+        """
+        scaled = self.divide_num(num, 20)
+        return 'cw<%s<%s<nu<%s\n' % (pre, token, scaled)
+
+    def text_func(self, pre, token, num=None):
+        """Return a plain text line (``tx<nu``) carrying ``token``.
+
+        ``pre`` and ``num`` are accepted but not used.
+        """
+        line = 'tx<nu<__________<%s\n' % token
+        return line
+
+    def ob_func(self, pre, token, num=None):
+        """Return the line for an opening bracket.
+
+        ``self.__bracket_count`` is incremented first, and the new value
+        is written zero-padded to four digits. The arguments are not used.
+        """
+        self.__bracket_count = self.__bracket_count + 1
+        return 'ob<nu<open-brack<%04d\n' % self.__bracket_count
+
+    def cb_func(self, pre, token, num=None):
+        """Return the line for a closing bracket.
+
+        The current ``self.__bracket_count`` is written zero-padded to
+        four digits, and the counter is decremented afterwards. The
+        arguments are not used.
+        """
+        closing = 'cb<nu<clos-brack<%04d\n' % self.__bracket_count
+        self.__bracket_count = self.__bracket_count - 1
+        return closing
+
+    def color_func(self, pre, token, num):
+        """Return the line for a colour component, written as hex.
+
+        ``num`` is the decimal value as text. A trailing ';' is removed
+        and switches the third field of the line from ``nu`` to ``en``.
+        The value is converted to uppercase hexadecimal and gets a "0"
+        in front whenever the result is not exactly two characters long.
+        """
+        ends_entry = num[-1] == ';'
+        digits = num[:-1] if ends_entry else num
+        third_field = 'en' if ends_entry else 'nu'
+        hex_value = unicode_type('%X' % int(digits))
+        if len(hex_value) != 2:
+            hex_value = "0" + hex_value
+        return 'cw<%s<%s<%s<%s\n' % (pre, token, third_field, hex_value)
+
+    def bool_st_func(self, pre, token, num):
+        """Turn an on/off control word into a true/false line.
+
+        A missing, empty or '1' argument means true; '0' means false.
+
+        Raises ``self.__bug_handler`` for any other argument.
+        """
+        if num == '0':
+            return 'cw<%s<%s<nu<false\n' % (pre, token)
+        if num is None or num in ('', '1'):
+            return 'cw<%s<%s<nu<true\n' % (pre, token)
+        msg = "boolean should have some value module process tokens\ntoken is %s\n'%s'\n" % (token, num)
+        raise self.__bug_handler(msg)
+
+    def __no_sup_sub_func(self, pre, token, num):
+        """Return two lines that switch subscript and superscript off.
+
+        None of the arguments is used.
+        """
+        return ''.join((
+            'cw<ci<subscript_<nu<false\n',
+            'cw<ci<superscrip<nu<false\n',
+        ))
+
+    def divide_num(self, numerator, denominator):
+        """Divide the number found in ``numerator`` by ``denominator``.
+
+        ``numerator`` is the text argument of a control word; the first
+        run of digits, dots and minus signs in it is read as a float.
+
+        Returns the quotient as a string with two decimals. When no usable
+        number is found, ``self.__return_code`` is raised to at least 5
+        and the integer 0 is returned.
+
+        Raises ``self.__bug_handler`` when no usable number is found and
+        ``self.__run_level`` is above 3.
+        """
+        try:
+            # calibre why ignore negative number? Wrong in case of \fi
+            numerator = float(re.search('[0-9.\\-]+', numerator).group())
+        except (TypeError, AttributeError, ValueError):
+            # TypeError: numerator is None or not text. AttributeError: the
+            # pattern did not match. ValueError: the matched run (such as
+            # '-' or '1.2.3') is not a valid float.
+            if self.__run_level > 3:
+                msg = ('No number to process?\nthis indicates that the token \\(\\li\\) \
+                should have a number and does not\nnumerator is \
+                "%s"\ndenominator is "%s"\n') % (numerator, denominator)
+                raise self.__bug_handler(msg)
+            if 5 > self.__return_code:
+                self.__return_code = 5
+            return 0
+        return '%0.2f' % round(numerator/denominator, 2)
+
+    def split_let_num(self, token):
+        """Split a control word into its name and its argument.
+
+        ``token`` is searched with ``self.__num_exp``; group 1 is taken as
+        the name and group 2 as the argument.
+
+        Returns ``(name, argument)``. When group 2 is empty the argument is
+        the integer 0; when the expression does not match at all, the
+        whole token is returned as the name with the integer 0.
+
+        Raises ``self.__bug_handler`` in either of those two cases when
+        ``self.__run_level`` is above 3.
+        """
+        match_obj = re.search(self.__num_exp, token)
+        if match_obj is None:
+            if self.__run_level > 3:
+                msg = "token is '%s' \n" % token
+                # Pass the message along; it used to be built and dropped.
+                raise self.__bug_handler(msg)
+            return token, 0
+        first = match_obj.group(1)
+        second = match_obj.group(2)
+        if not second:
+            if self.__run_level > 3:
+                msg = "token is '%s' \n" % token
+                raise self.__bug_handler(msg)
+            return first, 0
+        return first, second
+
+    def convert_to_hex(self, number):
+        """Convert a number (or numeric text) to uppercase hexadecimal.
+
+        Returns the hexadecimal digits as a string, without any prefix.
+
+        Errors from ``int(number)`` propagate unchanged. Raises
+        ``self.__bug_handler`` if formatting the integer fails.
+        """
+        num = int(number)
+        try:
+            return "%X" % num
+        except (TypeError, ValueError):
+            # Catch only formatting errors, so that interrupts and
+            # unrelated failures are not turned into a bug report.
+            msg = 'Cannot convert "%s" to hexadecimal\n' % number
+            raise self.__bug_handler(msg)
+
+    def process_cw(self, token):
+        """Convert one control word into its token line.
+
+        The first character (the leading backslash) and any spaces are
+        removed. Unless what is left is purely alphabetic or one of the
+        special single-character symbols, it is split into a name and an
+        argument with ``split_let_num``. The name is looked up in
+        ``self.dict_token`` and the action found there is called with the
+        prefix, the mapped name and the argument.
+
+        Returns the action's result, or None when the name has no entry
+        (or its entry has no action).
+        """
+        special = ['*', ':', '}', '{', '~', '_', '-', ';']
+        word = token[1:].replace(" ", "")
+        num = None
+        if not (word.isalpha() or word in special):
+            word, num = self.split_let_num(word)
+        pre, mapped, action = self.dict_token.get(word, (None, None, None))
+        if not action:
+            return None
+        return action(pre, mapped, num)
+
+    def __check_brackets(self, in_file):
+        """Run the bracket checker on ``in_file``.
+
+        The checker object is kept in ``self.__check_brack_obj``.
+
+        Returns 1 when the first item of the checker's result is false,
+        otherwise None.
+        """
+        checker = check_brackets.CheckBrackets(file=in_file)
+        self.__check_brack_obj = checker
+        brackets_ok = checker.check_brackets()[0]
+        return None if brackets_ok else 1
+
+    def process_tokens(self):
+        """Convert the tokenized file into processed-token lines.
+
+        Each line of ``self.__file`` holds one token. Tokens starting with
+        a backslash are converted with ``process_cw``; any other token is
+        split on ``self.__utf_exp`` and written as text lines, pieces that
+        start with '&' being tagged ``tx<ut`` and the rest ``tx<nu``. The
+        output goes to ``self.__write_to``, which is then renamed over
+        ``self.__file`` (after an optional debugging copy), and the
+        brackets of the result are checked.
+
+        Returns ``self.__return_code``.
+
+        Raises ``self.__exception_handler`` when the first token is not an
+        opening bracket, the second does not start with the rtf control
+        word, a token contains a backslash followed by a space, the file
+        is empty, or the brackets do not match.
+        """
+        line_count = 0
+        with open_for_read(self.__file) as read_obj, \
+                open_for_write(self.__write_to) as write_obj:
+            for line_count, line in enumerate(read_obj, 1):
+                token = line.replace("\n", "")
+                if line_count == 1 and token != '\\{':
+                    msg = '\nInvalid RTF: document doesn\'t start with {\n'
+                    raise self.__exception_handler(msg)
+                if line_count == 2 and token[0:4] != '\\rtf':
+                    msg = '\nInvalid RTF: document doesn\'t start with \\rtf \n'
+                    raise self.__exception_handler(msg)
+                if token.find('\\ ') > -1:
+                    msg = '\nInvalid RTF: token "\\ " not valid.\nError at line %d'\
+                        % line_count
+                    raise self.__exception_handler(msg)
+                if token[:1] == "\\":
+                    processed = self.process_cw(token)
+                    if processed is not None:
+                        write_obj.write(processed)
+                    continue
+                for field in re.split(self.__utf_exp, token):
+                    if not field:
+                        continue
+                    if field[0:1] == '&':
+                        template = 'tx<ut<__________<%s\n'
+                    else:
+                        template = 'tx<nu<__________<%s\n'
+                    write_obj.write(template % field)
+
+        if not line_count:
+            msg = '\nInvalid RTF: file appears to be empty.\n'
+            raise self.__exception_handler(msg)
+
+        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
+        if self.__copy:
+            copy_obj.copy_file(self.__write_to, "processed_tokens.data")
+        copy_obj.rename(self.__write_to, self.__file)
+        os.remove(self.__write_to)
+
+        if self.__check_brackets(self.__file):
+            msg = '\nInvalid RTF: document does not have matching brackets.\n'
+            raise self.__exception_handler(msg)
+        return self.__return_code
--- a/ebook_converter/ebooks/rtf2xml/sections.py
+++ b/ebook_converter/ebooks/rtf2xml/sections.py
@@ -0,0 +1,538 @@
+from __future__ import absolute_import, division, print_function, unicode_literals
+#########################################################################
+#                                                                       #
+#                                                                       #
+#   copyright 2002 Paul Henry Tremblay                                  #
+#                                                                       #
+#   This program is distributed in the hope that it will be useful,     #
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of      #
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU    #
+#   General Public License for more details.                            #
+#                                                                       #
+#                                                                       #
+#########################################################################
+import sys, os
+
+from calibre.ebooks.rtf2xml import copy
+from calibre.ptempfile import better_mktemp
+from polyglot.builtins import unicode_type
+
+from . import open_for_read, open_for_write
+
+
+class Sections:
+    """
+    =================
+    Purpose
+    =================
+    Write section tags for a tokenized file. (This module won't be of any use
+    to you unless you use it as part of the other modules.)
+    ---------------
+    logic
+    ---------------
+    The tags for the first section breaks have already been written.
+    RTF stores section breaks with the \\sect tag. Each time this tag is
+    encountered, add one to the counter.
+    When I encounter the \\sectd tag, I want to collect all the appropriate tokens
+    that describe the section. When I reach a \\pard, I know I can stop collecting
+    tokens and write the section tags.
+    The exception to this method occurs when sections occur in field blocks, such
+    as the index. Normally, two section break occur within the index and other
+    field-blocks. (If less or more section breaks occur, this code may not work.)
+    I want the sections to occur outside of the index. That is, the index
+    should be nested inside one section tag. After the index is complete, a new
+    section should begin.
+    In order to write the sections outside of the field blocks, I have to store
+    all of the field block as a string. When I encounter the \\sect tag, add one to
+    the section counter, but store this number in a list. Likewise, store the
+    information describing the section in another list.
+    When I reach the end of the field block, choose the first item from the
+    numbered list as the section number. Choose the first item in the description
+    list as the values and attributes of the section. Enclose the field string
+    between the section tags.
+    Start a new section outside the field-block strings. Use the second number in
+    the list; use the second item in the description list.
+    CHANGE (2004-04-26) No longer write sections that occur in field-blocks.
+    Instead, ignore all section information in a field-block.
+    """
+
+    def __init__(self,
+            in_file,
+            bug_handler,
+            copy=None,
+            run_level=1):
+        """
+        Required:
+            in_file -- the file to parse
+            bug_handler -- stored and raised by the other methods when
+            they detect an internal error
+        Optional:
+            copy -- whether to make a copy of the result for debugging
+            run_level -- stored; other methods raise bug_handler only
+            when it is above 3
+        Returns:
+            nothing
+        Logic:
+            Store the arguments and ask better_mktemp for the name of the
+            file the output is written to.
+        """
+        self.__write_to = better_mktemp()
+        self.__run_level = run_level
+        self.__copy = copy
+        self.__bug_handler = bug_handler
+        self.__file = in_file
+
+    def __initiate_values(self):
+        """
+        Requires:
+            nothing
+        Returns:
+            nothing
+        Logic:
+            Reset the markers, counters, flags and buffers, and build the
+            dictionaries that map a state name or a token to the method
+            that handles it.
+        """
+        # Markers written around the generated section tags.
+        self.__mark_start = 'mi<mk<sect-start\n'
+        self.__mark_end = 'mi<mk<sect-end__\n'
+        # Counters, flags and buffers.
+        self.__state = 'before_body'
+        self.__section_num = 0
+        self.__found_first_sec = 0
+        self.__in_field = 0
+        self.__field_num = []
+        self.__section_values = {}
+        self.__list_of_sec_values = []
+        self.__text_string = ''
+        self.__field_instruction_string = ''
+        # State name -> method that handles a line in that state.
+        self.__state_dict = {
+            'before_body': self.__before_body_func,
+            'body': self.__body_func,
+            'before_first_sec': self.__before_first_sec_func,
+            'section': self.__section_func,
+            'section_def': self.__section_def_func,
+            'sec_in_field': self.__sec_in_field_func,
+        }
+        # Tokens that get special treatment in the body state.
+        # cw<sc<sect-defin<nu<true
+        self.__body_dict = {
+            'cw<sc<section___': self.__found_section_func,
+            'mi<mk<sec-fd-beg': self.__found_sec_in_field_func,
+            'cw<sc<sect-defin': self.__found_section_def_bef_sec_func,
+        }
+        # Tokens inside a section definition -> (handler, readable name).
+        end_def = self.__end_sec_def_func
+        store_attribute = self.__attribute_func
+        premature_end = self.__end_sec_premature_func
+        self.__section_def_dict = {
+            'cw<pf<par-def___': (end_def, None),
+            'mi<mk<body-open_': (end_def, None),
+            'cw<tb<columns___': (store_attribute, 'columns'),
+            'cw<pa<margin-lef': (store_attribute, 'margin-left'),
+            'cw<pa<margin-rig': (store_attribute, 'margin-right'),
+            'mi<mk<header-ind': (end_def, None),
+            # premature endings: text, or control words that indicate text
+            'tx<nu<__________': (premature_end, None),
+            'cw<ci<font-style': (premature_end, None),
+            'cw<ci<font-size_': (premature_end, None),
+        }
+        # Since 2004-04-26 only the end of the field block is handled here;
+        # section and section-definition tokens are no longer dispatched.
+        self.__sec_in_field_dict = {
+            'mi<mk<sec-fd-end': self.__end_sec_in_field_func,
+        }
+
+    def __found_section_def_func(self, line):
+        """
+        Required:
+            line -- the line to parse (not used)
+        Returns:
+            nothing
+        Logic:
+            A section definition has been found. Empty the section_values
+            dictionary in place and change the state to section_def, so
+            that the following lines are processed as part of the
+            section definition.
+        """
+        self.__section_values.clear()
+        self.__state = 'section_def'
+
+    def __attribute_func(self, line, name):
+        """
+        Required:
+            line -- the line to be parsed
+            name -- the readable name of the attribute (as opposed to
+            the abbreviated one in the token)
+        Returns:
+            nothing
+        Logic:
+            Store the value under the readable name in the section values
+            dictionary so it can be retrieved later. The value is the
+            text from position 20 of the line up to, but not including,
+            its last character.
+            ex: cw<tb<columns___<nu<2
+        """
+        self.__section_values[name] = line[20:-1]
+
+    def __found_section_func(self, line):
+        """
+        Requires:
+            line -- the line to parse
+        Returns:
+            nothing
+        Logic:
+            The beginning of a section has been found. Switch to the
+            section state, write the line out, and count the section.
+        """
+        self.__state = 'section'
+        self.__write_obj.write(line)
+        self.__section_num = self.__section_num + 1
+
+    def __found_section_def_bef_sec_func(self, line):
+        """
+        Requires:
+            line -- the line to parse
+        Returns:
+            nothing
+        Logic:
+            A section definition has been found in the body without a
+            preceding section token. Count a new section, start the
+            section definition, and write the line out.
+        """
+        self.__section_num = self.__section_num + 1
+        self.__found_section_def_func(line)
+        self.__write_obj.write(line)
+
+    def __section_func(self, line):
+        """
+        Requires:
+            line -- the line to parse
+        Returns:
+            nothing
+        Logic:
+            Write every line out. When the line is a section definition
+            token, start the section definition first.
+        """
+        is_definition = self.__token_info == 'cw<sc<sect-defin'
+        if is_definition:
+            self.__found_section_def_func(line)
+        self.__write_obj.write(line)
+
+    def __section_def_func(self, line):
+        """
+        Required:
+            line -- line to parse
+        Returns:
+            nothing
+        Logic:
+            Inside a section definition. Look the token up in the
+            section definition dictionary and, if it has an entry, call
+            its action (end of the definition, or an attribute to
+            store). The line is then written to the output file, except
+            when an action was found while in a field block, in which
+            case it is added to the field string instead.
+        """
+        handler, readable = self.__section_def_dict.get(self.__token_info, (None, None))
+        if handler:
+            handler(line, readable)
+        if handler and self.__in_field:
+            self.__sec_in_field_string += line
+            return
+        self.__write_obj.write(line)
+
+    def __end_sec_def_func(self, line, name):
+        """
+        Requires:
+            line -- the line to parse
+            name -- readable name (not used)
+        Returns:
+            nothing
+        Logic:
+            The end of the section definition has been found. Set the
+            state to sec_in_field when in a field block and to body
+            otherwise, then call the write_section method.
+        """
+        self.__state = 'sec_in_field' if self.__in_field else 'body'
+        self.__write_section(line)
+
+    def __end_sec_premature_func(self, line, name):
+        """
+        Requires:
+            line -- the line to parse
+            name -- readable name (not used)
+        Returns:
+            nothing
+        Logic:
+            Text, or control words indicating text, have been found
+            before \\pard. This should indicate older RTF. Reset the
+            state and write the section definition. Then insert a
+            paragraph definition, followed by an empty pair of brackets
+            to mark the end of the paragraph definition.
+        """
+        self.__state = 'sec_in_field' if self.__in_field else 'body'
+        self.__write_section(line)
+        for filler in (
+                'cw<pf<par-def___<nu<true\n',
+                'ob<nu<open-brack<0000\n',
+                'cb<nu<clos-brack<0000\n'):
+            self.__write_obj.write(filler)
+
+    def __write_section(self, line):
+        """
+        Requires:
+            line -- the line being parsed (not used)
+        Returns:
+            nothing
+        Logic:
+            Build the section tags: the start marker, a close tag for
+            the previous section (for every section but the first), an
+            open tag with the section number and the collected
+            attributes and values, and the end marker. In the body
+            state, write this string to the output file. In the
+            sec_in_field state, call on the handle_sec_def method
+            instead. In any other state, raise the bug handler if the
+            run level is above 3.
+        """
+        pieces = [self.__mark_start]
+        if self.__found_first_sec:
+            pieces.append('mi<tg<close_____<section\n')
+        else:
+            self.__found_first_sec = 1
+        pieces.append('mi<tg<open-att__<section<num>%s' % unicode_type(self.__section_num))
+        pieces.append('<num-in-level>%s' % unicode_type(self.__section_num))
+        pieces.append('<type>rtf-native')
+        pieces.append('<level>0')
+        for key, value in self.__section_values.items():
+            pieces.append('<%s>%s' % (key, value))
+        pieces.append('\n')
+        pieces.append(self.__mark_end)
+        my_string = ''.join(pieces)
+        if self.__state == 'body':
+            self.__write_obj.write(my_string)
+        elif self.__state == 'sec_in_field':
+            self.__handle_sec_def(my_string)
+        elif self.__run_level > 3:
+            msg = 'missed a flag\n'
+            raise self.__bug_handler(msg)
+
+    def __handle_sec_def(self, my_string):
+        """
+        Requires:
+            my_string -- the string of attributes and values (not used)
+        Returns:
+            nothing
+        Logic:
+            Append a copy of the dictionary of attributes and values to
+            the list of section values, so it can be used later when the
+            end of the field-block is reached.
+        """
+        # Store a snapshot: the section values dictionary is cleared in
+        # place when the next section definition starts, which would
+        # otherwise empty every entry already stored in the list.
+        self.__list_of_sec_values.append(dict(self.__section_values))
+
+    def __body_func(self, line):
+        """
+        Requires:
+            line -- the line to parse
+        Returns:
+            nothing
+        Logic:
+            Look the token up in the body dictionary. If it has a
+            handler, let the handler deal with the line; otherwise
+            print the line to the output file.
+        """
+        handler = self.__body_dict.get(self.__token_info)
+        if not handler:
+            self.__write_obj.write(line)
+            return
+        handler(line)
+
+    def __before_body_func(self, line):
+        """
+        Requires:
+            line -- line to parse
+        Returns:
+            nothing
+        Logic:
+            Always print out the line. When the line marks the opening
+            of the body, change the state to before_first_sec.
+        """
+        body_opens = self.__token_info == 'mi<mk<body-open_'
+        if body_opens:
+            self.__state = 'before_first_sec'
+        self.__write_obj.write(line)
+
+    def __before_first_sec_func(self, line):
+        """
+        Requires:
+            line -- line to parse
+        Returns:
+            nothing
+        Logic:
+            Look for the beginning of the first section. This can be
+            \\sectd, in which case the section definition starts. In
+            older RTF it can also be a paragraph definition or plain
+            text; the open tag for section 1 is then written right away
+            and the state changes to body. For plain text a paragraph
+            definition line is supplied as well. The line itself is
+            always written out last.
+        """
+        token_info = self.__token_info
+        if token_info == 'cw<sc<sect-defin':
+            self.__state = 'section_def'
+            self.__section_num += 1
+            self.__section_values.clear()
+        elif token_info in ('cw<pf<par-def___', 'tx<nu<__________'):
+            self.__state = 'body'
+            self.__section_num += 1
+            self.__write_obj.write(
+                    'mi<tg<open-att__<section<num>%s'
+                    '<num-in-level>%s'
+                    '<type>rtf-native'
+                    '<level>0\n'
+                    % (self.__section_num, self.__section_num)
+                    )
+            if token_info == 'tx<nu<__________':
+                # Text came before any paragraph definition, so supply
+                # one. It is written in the five-field form
+                # (cw<pf<par-def___<nu<true) that the class uses for this
+                # line elsewhere, not the four-field form without "nu".
+                self.__write_obj.write('cw<pf<par-def___<nu<true\n')
+            self.__found_first_sec = 1
+        self.__write_obj.write(line)
+
+    def __found_sec_in_field_func(self, line):
+        """
+        Requires:
+            line -- line to parse
+        Returns:
+            nothing
+        Logic:
+            The beginning of a field that contains a section has been
+            found. Mark that a field is open, start the field string with
+            this line, and change the state to sec_in_field.
+        """
+        self.__in_field = 1
+        self.__sec_in_field_string = line
+        self.__state = 'sec_in_field'
+
+    def __sec_in_field_func(self, line):
+        """
+        Requires:
+            line -- the line to parse
+        Returns:
+            nothing
+        Logic:
+            If the token has an entry in the sec_in_field dictionary, hand
+            the line to that method. Any other line is simply written
+            out (changed 2004-04-26; lines are no longer collected in the
+            field string here).
+        """
+        handler = self.__sec_in_field_dict.get(self.__token_info)
+        if not handler:
+            self.__write_obj.write(line)
+            return
+        handler(line)
+
+    def __end_sec_in_field_func(self, line):
+        """
+        Requires:
+            line -- line to parse
+        Returns:
+            nothing
+        Logic:
+            The end of the field has been found. Go back to the body
+            state, mark the field as closed, and write the line out
+            unchanged. (Changed 2004-04-26: the field string and the
+            section tags around it are no longer written here.)
+        """
+        self.__in_field = 0
+        self.__state = 'body'
+        self.__write_obj.write(line)
+
+    def __print_field_sec_attributes(self):
+        """
+        Requires:
+            nothing
+        Returns:
+            nothing
+        Logic:
+            Take the first section number off the list of field numbers.
+            Write the close tag for the section and the start of a new
+            section tag with that number. If there is a dictionary of
+            section values waiting, write its attributes and values and
+            take it off its list. Finish the tag with the level, the type
+            and the number in level.
+        """
+        num, remaining = self.__field_num[0], self.__field_num[1:]
+        self.__field_num = remaining
+        opening = (
+            'mi<tg<close_____<section\n'
+            'mi<tg<open-att__<section<num>%s'
+        ) % unicode_type(num)
+        self.__write_obj.write(opening)
+        if self.__list_of_sec_values:
+            first_values = self.__list_of_sec_values[0]
+            for key in first_values.keys():
+                self.__write_obj.write(
+                    '<%s>%s\n' % (key, first_values[key]))
+            self.__list_of_sec_values = self.__list_of_sec_values[1:]
+        closing_parts = (
+            '<level>0',
+            '<type>rtf-native',
+            '<num-in-level>%s' % unicode_type(self.__section_num),
+            '\n',
+        )
+        for part in closing_parts:
+            self.__write_obj.write(part)
+
+    def __found_section_in_field_func(self, line):
+        """
+        Requires:
+            line -- line to parse
+        Returns:
+            nothing
+        Logic:
+            A section has been found inside a field block. Increase the
+            section counter, remember the new number in the list of field
+            numbers, and add the line to the field string.
+        """
+        current = self.__section_num + 1
+        self.__section_num = current
+        self.__field_num.append(current)
+        self.__sec_in_field_string = self.__sec_in_field_string + line
+
+    def __found_section_def_in_field_func(self, line):
+        """
+        Requires:
+            line -- line to parse (not used)
+        Returns:
+            nothing
+        Logic:
+            A section definition has been found inside a field block.
+            Empty the dictionary of section values and change the state
+            to section_def.
+        """
+        self.__section_values.clear()
+        self.__state = 'section_def'
+
+    def make_sections(self):
+        """
+        Requires:
+            nothing
+        Returns:
+            nothing (changes the original file)
+        Logic:
+            Read one line in at a time. Determine what action to take based on
+            the state. If the state is before the body, look for the
+            beginning of the body.
+            If the state is body, send the line to the body method.
+            If no method is registered for the current state, a message
+            is written to stderr and the line is skipped.
+            Both files are closed even if a method raises an error. When
+            reading is done the temporary file replaces the original.
+        """
+        self.__initiate_values()
+        read_obj = open_for_read(self.__file)
+        try:
+            self.__write_obj = open_for_write(self.__write_to)
+            try:
+                while True:
+                    line = read_obj.readline()
+                    self.__token_info = line[:16]
+                    action = self.__state_dict.get(self.__state)
+                    if action is None:
+                        # report the problem; calling None would only hide
+                        # this message behind a TypeError
+                        sys.stderr.write('no matching state in module sections.py\n')
+                        sys.stderr.write(self.__state + '\n')
+                    else:
+                        action(line)
+                    # the empty string read at the end of the file is still
+                    # passed to the state method once, as before
+                    if not line:
+                        break
+            finally:
+                self.__write_obj.close()
+        finally:
+            read_obj.close()
+        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
+        if self.__copy:
+            copy_obj.copy_file(self.__write_to, "sections.data")
+        copy_obj.rename(self.__write_to, self.__file)
+        os.remove(self.__write_to)
--- a/ebook_converter/ebooks/rtf2xml/styles.py
+++ b/ebook_converter/ebooks/rtf2xml/styles.py
@@ -0,0 +1,723 @@
+from __future__ import unicode_literals, absolute_import, print_function, division
+#########################################################################
+#                                                                       #
+#                                                                       #
+#   copyright 2002 Paul Henry Tremblay                                  #
+#                                                                       #
+#   This program is distributed in the hope that it will be useful,     #
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of      #
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU    #
+#   General Public License for more details.                            #
+#                                                                       #
+#                                                                       #
+#########################################################################
+import sys, os
+from calibre.ebooks.rtf2xml import copy, border_parse
+from calibre.ptempfile import better_mktemp
+from . import open_for_read, open_for_write
+
+
+class Styles:
+    """
+    Change lines with style numbers to actual style names.
+    """
+
+    def __init__(self, in_file, bug_handler, copy=None, run_level=1):
+        """
+        Required:
+            'in_file' -- file to parse
+            'bug_handler' -- called with a message to build the error that
+            is raised when a problem is found
+        Optional:
+            'copy' -- whether to make a copy of result for debugging
+            'run_level' -- problems are only raised as errors when the run
+            level is high enough
+        Returns:
+            nothing
+        """
+        self.__run_level = run_level
+        self.__copy = copy
+        self.__bug_handler = bug_handler
+        self.__file = in_file
+        # temporary file that receives the converted lines
+        self.__write_to = better_mktemp()
+
+    def __initiate_values(self):
+        """
+        Initiate all values.
+        Creates the border parser, the empty styles dictionary, the
+        dictionary of methods used to dispatch on state and on token, the
+        dictionary used to rewrite style lines in the body, the dictionary
+        of readable names for abbreviated tokens, and the dictionaries and
+        flags used for tabs. Called at the start of convert_styles.
+        """
+        self.__border_obj = border_parse.BorderParse()
+        # style number => {attribute: value}, kept apart for paragraph and
+        # character styles
+        self.__styles_dict =  {'par':{}, 'char':{}}
+        self.__styles_num = '0'
+        self.__type_of_style = 'par'
+        # collects the text of an individual style, which is its name
+        self.__text_string = ''
+        self.__state = 'before_styles_table'
+        # The first four keys are states (looked up with self.__state); the
+        # remaining keys are tokens (looked up with self.__token_info).
+        self.__state_dict = {
+        'before_styles_table': self.__before_styles_func,
+        'in_styles_table'    : self.__in_styles_func,
+        'in_individual_style' : self.__in_individual_style_func,
+        'after_styles_table'  : self.__after_styles_func,
+        'mi<mk<styles-beg'  : self.__found_styles_table_func,
+        'mi<mk<styles-end'  : self.__found_end_styles_table_func,
+        'mi<mk<stylei-beg'  : self.__found_beg_ind_style_func,
+        'mi<mk<stylei-end'  : self.__found_end_ind_style_func,
+        'cw<ss<para-style'  : self.__para_style_func,
+        'cw<ss<char-style'  : self.__char_style_func,
+        }
+        # A separate dictionary for parsing the body text
+        # (token => (method, type of style))
+        self.__body_dict = {
+        'cw<ss<para-style'  : (self.__para_style_in_body_func, 'par'),
+        'cw<ss<char-style'  : (self.__para_style_in_body_func, 'char'),
+        }
+        # Dictionary needed to convert shortened style names to readable names
+        # Keys are matched against line[6:16] in __in_individual_style_func.
+        self.__token_dict={
+        # paragraph formatting => pf
+        'par-end___'    : 'para',
+        'par-def___'    : 'paragraph-definition',
+        'keep-w-nex'    : 'keep-with-next',
+        'widow-cntl'    : 'widow-control',
+        'adjust-rgt'    : 'adjust-right',
+        'language__'    : 'language',
+        'right-inde'    : 'right-indent',
+        'fir-ln-ind'    : 'first-line-indent',
+        'left-inden'    : 'left-indent',
+        'space-befo'    : 'space-before',
+        'space-afte'    : 'space-after',
+        'line-space'    : 'line-spacing',
+        'default-ta'    : 'default-tab',
+        'align_____'    : 'align',
+        'widow-cntr'    : 'widow-control',
+        # page formatting mixed in! (Just in older RTF?)
+        'margin-lef'    :       'left-indent',
+        'margin-rig'    :       'right-indent',
+        'margin-bot'    :       'space-after',
+        'margin-top'    :       'space-before',
+        # stylesheet = > ss
+        'style-shet'    : 'stylesheet',
+        'based-on__'    : 'based-on-style',
+        'next-style'    : 'next-style',
+        'char-style'    : 'character-style',
+        'para-style'    : 'paragraph-style',
+        # graphics => gr
+        'picture___'    : 'pict',
+        'obj-class_'    : 'obj_class',
+        'mac-pic___'    : 'mac-pict',
+        # section => sc
+        'section___'    : 'section-new',
+        'sect-defin'    : 'section-reset',
+        'sect-note_'    : 'endnotes-in-section',
+        # list=> ls
+        'list-text_'    : 'list-text',
+        'list______'    : 'list',
+        'list-lev-d'    : 'list-level-definition',
+        'list-cardi'    : 'list-cardinal-numbering',
+        'list-decim'    : 'list-decimal-numbering',
+        'list-up-al'    : 'list-uppercase-alphabetic-numbering',
+        'list-up-ro'    : 'list-uppercae-roman-numbering',
+        'list-ord__'    : 'list-ordinal-numbering',
+        'list-ordte'    : 'list-ordinal-text-numbering',
+        'list-bulli'    : 'list-bullet',
+        'list-simpi'    : 'list-simple',
+        'list-conti'    : 'list-continue',
+        'list-hang_'    : 'list-hang',
+        # 'list-tebef'    :	'list-text-before',
+        # 'list-level'    : 'level',
+        'list-id___'    : 'list-id',
+        'list-start'    : 'list-start',
+        'nest-level'    : 'nest-level',
+        # duplicate
+        'list-level'    : 'list-level',
+        # notes => nt
+        'footnote__'    : 'footnote',
+        'type______'    : 'type',
+        # anchor => an
+        'toc_______'    : 'anchor-toc',
+        'book-mk-st'    : 'bookmark-start',
+        'book-mk-en'    : 'bookmark-end',
+        'index-mark'    : 'anchor-index',
+        'place_____'    : 'place',
+        # field => fd
+        'field_____'    : 'field',
+        'field-inst'    : 'field-instruction',
+        'field-rslt'    : 'field-result',
+        'datafield_'    : 'data-field',
+        # info-tables => it
+        'font-table'    : 'font-table',
+        'colr-table'    : 'color-table',
+        'lovr-table'    : 'list-override-table',
+        'listtable_'    : 'list-table',
+        'revi-table'    : 'revision-table',
+        # character info => ci
+        'hidden____'    : 'hidden',
+        'italics___'    : 'italics',
+        'bold______'    : 'bold',
+        'strike-thr'   : 'strike-through',
+        'shadow____'   : 'shadow',
+        'outline___'   : 'outline',
+        'small-caps'   : 'small-caps',
+        'dbl-strike'   : 'double-strike-through',
+        'emboss____'    : 'emboss',
+        'engrave___'    : 'engrave',
+        'subscript_'    : 'subscript',
+        'superscrip'    : 'superscript',
+        'plain_____'    : 'plain',
+        'font-style'    : 'font-style',
+        'font-color'    : 'font-color',
+        'font-size_'    : 'font-size',
+        'font-up___'    : 'superscript',
+        'font-down_'    : 'subscript',
+        'red_______'    : 'red',
+        'blue______'    : 'blue',
+        'green_____'    : 'green',
+        'caps______'    :       'caps',
+        # table => tb
+        'row-def___'    : 'row-definition',
+        'cell______'    : 'cell',
+        'row_______'    : 'row',
+        'in-table__'    : 'in-table',
+        'columns___'    : 'columns',
+        'row-pos-le'    : 'row-position-left',
+        'cell-posit'    : 'cell-position',
+        # preamble => pr
+        # underline
+        'underlined'    : 'underlined',
+        # border => bd
+        'bor-t-r-hi'    : 'border-table-row-horizontal-inside',
+        'bor-t-r-vi'    : 'border-table-row-vertical-inside',
+        'bor-t-r-to'    : 'border-table-row-top',
+        'bor-t-r-le'    : 'border-table-row-left',
+        'bor-t-r-bo'    : 'border-table-row-bottom',
+        'bor-t-r-ri'    : 'border-table-row-right',
+        'bor-cel-bo'    : 'border-cell-bottom',
+        'bor-cel-to'    : 'border-cell-top',
+        'bor-cel-le'    : 'border-cell-left',
+        'bor-cel-ri'    : 'border-cell-right',
+        # 'bor-par-bo'    : 'border-paragraph-bottom',
+        'bor-par-to'    : 'border-paragraph-top',
+        'bor-par-le'    : 'border-paragraph-left',
+        'bor-par-ri'    : 'border-paragraph-right',
+        'bor-par-bo'    : 'border-paragraph-box',
+        'bor-for-ev'    : 'border-for-every-paragraph',
+        'bor-outsid'    : 'border-outisde',
+        'bor-none__'    : 'border',
+        # border type => bt
+        'bdr-single'    : 'single',
+        'bdr-doubtb'    : 'double-thickness-border',
+        'bdr-shadow'    : 'shadowed-border',
+        'bdr-double'    : 'double-border',
+        'bdr-dotted'    : 'dotted-border',
+        'bdr-dashed'    : 'dashed',
+        'bdr-hair__'    : 'hairline',
+        'bdr-inset_'    : 'inset',
+        'bdr-das-sm'    : 'dash-small',
+        'bdr-dot-sm'    : 'dot-dash',
+        'bdr-dot-do'    : 'dot-dot-dash',
+        'bdr-outset'    : 'outset',
+        'bdr-trippl'    : 'tripple',
+        'bdr-thsm__'    : 'thick-thin-small',
+        'bdr-htsm__'    : 'thin-thick-small',
+        'bdr-hthsm_'    : 'thin-thick-thin-small',
+        # NOTE(review): the next eleven keys are 9 characters long, while
+        # the lookup uses the 10-character slice line[6:16] -- confirm
+        # whether these entries can ever match.
+        'bdr-thm__'     : 'thick-thin-medium',
+        'bdr-htm__'     : 'thin-thick-medium',
+        'bdr-hthm_'     : 'thin-thick-thin-medium',
+        'bdr-thl__'     : 'thick-thin-large',
+        'bdr-hthl_'     : 'think-thick-think-large',
+        'bdr-wavy_'     : 'wavy',
+        'bdr-d-wav'     : 'double-wavy',
+        'bdr-strip'     : 'striped',
+        'bdr-embos'     : 'emboss',
+        'bdr-engra'     : 'engrave',
+        'bdr-frame'     : 'frame',
+        'bdr-li-wid'    : 'line-width',
+        # tabs
+        'tab-center'  :   'center',
+        'tab-right_'  :   'right',
+        'tab-dec___'  :   'decimal',
+        'leader-dot'  :   'leader-dot',
+        'leader-hyp'  :   'leader-hyphen',
+        'leader-und'  :   'leader-underline',
+        }
+        # token => method that handles a tab line inside an individual style
+        self.__tabs_dict = {
+        'cw<pf<tab-stop__'  :   self.__tab_stop_func,
+        'cw<pf<tab-center'  :   self.__tab_type_func,
+        'cw<pf<tab-right_'  :   self.__tab_type_func,
+        'cw<pf<tab-dec___'  :   self.__tab_type_func,
+        'cw<pf<leader-dot'  :   self.__tab_leader_func,
+        'cw<pf<leader-hyp'  :   self.__tab_leader_func,
+        'cw<pf<leader-und'  :   self.__tab_leader_func,
+        'cw<pf<tab-bar-st'  :   self.__tab_bar_func,
+        }
+        # token => name of the tab type or tab leader
+        self.__tab_type_dict = {
+        'cw<pf<tab-center'  :   'center',
+        'cw<pf<tab-right_'  :   'right',
+        'cw<pf<tab-dec___'  :   'decimal',
+        'cw<pf<leader-dot'  :   'leader-dot',
+        'cw<pf<leader-hyp'  :   'leader-hyphen',
+        'cw<pf<leader-und'  :   'leader-underline',
+        }
+        # tokens with no readable name that should not be reported as errors
+        self.__ignore_list = [
+        'list-tebef',
+            ]
+        self.__tabs_list = self.__tabs_dict.keys()
+        # type of the next tab stop; set back to 'left' after each tab stop
+        self.__tab_type = 'left'
+        self.__leader_found = 0
+
+    def __in_individual_style_func(self, line):
+        """
+        Required:
+            line
+        Returns:
+            nothing
+        Logic:
+            If the token has an entry in the state dictionary, hand the
+            line to that method.
+            Otherwise, border lines are parsed by the border object and
+            each resulting pair is entered in the styles dictionary.
+            Tab lines are handed to the method found in the tabs
+            dictionary.
+            For any other control word, look up the abbreviated name in
+            the tokens dictionary and enter the readable name and the
+            value. If there is no readable name, and the name is not in
+            the ignore list, raise an error when the run level is above 3.
+            If the line is text, add the text to the text string. The
+            text string will be the name of the style.
+        """
+        action = self.__state_dict.get(self.__token_info)
+        if action:
+            action(line)
+            return
+        # have to parse border lines with external module
+        if line[:5] == 'cw<bd':
+            border_dict = self.__border_obj.parse_border(line)
+            for key in border_dict.keys():
+                self.__enter_dict_entry(key, border_dict[key])
+            return
+        if self.__token_info in self.__tabs_list:
+            tab_action = self.__tabs_dict.get(self.__token_info)
+            if tab_action is not None:
+                tab_action(line)
+            return
+        kind = line[:2]
+        if kind == 'cw':
+            # cw<pf<widow-cntl<nu<true
+            info = line[6:16]
+            att = self.__token_dict.get(info)
+            if att is not None:
+                self.__enter_dict_entry(att, line[20:-1])
+            elif info not in self.__ignore_list and self.__run_level > 3:
+                raise self.__bug_handler('no value for key %s\n' % info)
+        elif kind == 'tx':
+            self.__text_string += line[17:-1]
+
+    def __tab_stop_func(self, line):
+        """
+        Requires:
+            line -- line to parse
+        Returns:
+            nothing
+        Logic:
+            Add the type of the tab and its position, in the form
+            'type:position;', to the tabs entry of the current paragraph
+            style. If the entry doesn't exist, create one first.
+            Afterwards the tab type goes back to 'left' and the leader
+            flag is cleared.
+        """
+        tab_text = '%s:%s;' % (self.__tab_type, line[20:-1])
+        try:
+            self.__styles_dict['par'][self.__styles_num]['tabs'] += tab_text
+        except KeyError:
+            self.__enter_dict_entry('tabs', '')
+            self.__styles_dict['par'][self.__styles_num]['tabs'] += tab_text
+        self.__leader_found = 0
+        self.__tab_type = 'left'
+
+    def __tab_type_func(self, line):
+        """
+        Requires:
+            line -- line to parse (not used)
+        Returns:
+            nothing
+        Logic:
+            Look up the token in the tab type dictionary and remember the
+            result as the type of the next tab stop. If there is no entry,
+            raise an error when the run level is above 3.
+        """
+        tab_type = self.__tab_type_dict.get(self.__token_info)
+        if tab_type is None:
+            if self.__run_level > 3:
+                raise self.__bug_handler(
+                    'no entry for %s\n' % self.__token_info)
+            return
+        self.__tab_type = tab_type
+
+    def __tab_leader_func(self, line):
+        """
+        Requires:
+            line -- line to parse (not used)
+        Returns:
+            nothing
+        Logic:
+            Set the flag that a leader has been found. Look up the name
+            of the leader, add a '^' to it, and add it to the tabs entry
+            of the current paragraph style. If the entry doesn't exist,
+            create one first. If there is no name for the leader, raise
+            an error when the run level is above 3.
+        """
+        self.__leader_found = 1
+        leader = self.__tab_type_dict.get(self.__token_info)
+        if leader is None:
+            if self.__run_level > 3:
+                raise self.__bug_handler(
+                    'no entry for %s\n' % self.__token_info)
+            return
+        leader = leader + '^'
+        try:
+            self.__styles_dict['par'][self.__styles_num]['tabs'] += ':%s;' % leader
+        except KeyError:
+            self.__enter_dict_entry('tabs', '')
+            # NOTE(review): no leading ':' here, unlike the line above --
+            # confirm whether the difference is wanted
+            self.__styles_dict['par'][self.__styles_num]['tabs'] += '%s;' % leader
+
+    def __tab_bar_func(self, line):
+        """
+        Requires:
+            line -- line to parse
+        Returns:
+            nothing
+        Logic:
+            Add the position of the tab bar, in the form 'bar:position;',
+            to the tabs entry of the current paragraph style. If the
+            entry doesn't exist, create one first. Afterwards the tab
+            type goes back to 'left'.
+        """
+        bar_text = 'bar:%s;' % line[20:-1]
+        try:
+            self.__styles_dict['par'][self.__styles_num]['tabs'] += bar_text
+        except KeyError:
+            self.__enter_dict_entry('tabs', '')
+            self.__styles_dict['par'][self.__styles_num]['tabs'] += bar_text
+        self.__tab_type = 'left'
+
+    def __enter_dict_entry(self, att, value):
+        """
+        Required:
+            att -- the attribute
+            value -- the value
+        Returns:
+            nothing
+        Logic:
+            Find the dictionary of the current style (by type and by
+            number) and set the attribute in it. If that dictionary does
+            not exist yet, let __add_dict_entry build it.
+        """
+        try:
+            style = self.__styles_dict[self.__type_of_style][self.__styles_num]
+        except KeyError:
+            self.__add_dict_entry(att, value)
+        else:
+            style[att] = value
+
+    def __add_dict_entry(self, att, value):
+        """
+        Required:
+            att --the attribute
+            value --the value
+        Returns:
+            nothing
+        Logic:
+            I have to build the branches of the dictionary before I can add
+            the leaves. (I am comparing a dictionary to a tree.) To achieve
+            this, I take the inside dictionary of the keyword par or char
+            and store in it, under the current style number, a new small
+            dictionary with just the attribute and value.
+            If the type of style is neither par nor char, raise an error
+            when the run level is above 3; otherwise add nothing.
+        """
+        if self.__type_of_style not in ('par', 'char'):
+            if self.__run_level > 3:
+                msg = self.__type_of_style + 'error\n'
+                raise self.__bug_handler(msg)
+            # there is no branch to add the entry to; before, this case
+            # ended in an UnboundLocalError
+            return
+        type_dict = self.__styles_dict[self.__type_of_style]
+        type_dict[self.__styles_num] = {att: value}
+
+    def __para_style_func(self, line):
+        """
+        Required:
+            line
+        Returns:
+            nothing
+        Logic:
+            Extract the number from a line such as
+            "cw<ss<para-style<nu<15" and make it the current style number.
+            Set the type of style to paragraph.
+        """
+        self.__styles_num = line[20:-1]
+        self.__type_of_style = 'par'
+
+    def __char_style_func(self, line):
+        """
+        Required:
+            line
+        Returns:
+            nothing
+        Logic:
+            Extract the number from a line such as
+            "cw<ss<char-style<nu<15" and make it the current style number.
+            Set the type of style to character.
+        """
+        self.__styles_num = line[20:-1]
+        self.__type_of_style = 'char'
+
+    def __found_beg_ind_style_func(self, line):
+        """
+        Required:
+            line -- not used
+        Returns:
+            nothing
+        Logic:
+            The beginning of an individual style has been found. Change
+            the state to in the individual style.
+        """
+        self.__state = 'in_individual_style'
+
+    def __found_end_ind_style_func(self, line):
+        """
+        Required:
+            line -- not used
+        Returns:
+            nothing
+        Logic:
+            Get rid of the last character (the semicolon) in the text
+            string, and of any space before or after (added 2005-04-29).
+            Add the result as the value with 'name' as the key in the
+            styles dictionary. Empty the text string.
+        """
+        self.__enter_dict_entry('name', self.__text_string[:-1].strip())
+        self.__text_string = ''
+
+    def __found_end_styles_table_func(self, line):
+        """
+        Required:
+            line -- not used
+        Returns:
+            nothing
+        Logic:
+            The end of the styles table has been found. Change the state
+            to after the styles table, change the style numbers in
+            'next-style' and 'based-on-style' to names, and write out the
+            styles table.
+        """
+        self.__state = 'after_styles_table'
+        # the numbers have to be changed to names before the table is written
+        self.__fix_based_on()
+        self.__print_style_table()
+
+    def __fix_based_on(self):
+        """
+        Requires:
+            nothing
+        Returns:
+            nothing
+        Logic:
+            The styles dictionary may contain a pair of key values such as
+            'next-style' => '15'. I want to change the 15 to the name of the
+            style. I accomplish this by simply looking up the value of 15 in
+            the styles table.
+            Use two loops. First, check all the paragraph styles. Then check
+            all the character styles.
+            The inner loop: first check 'next-style', then check 'based-on-style'.
+            If the style referred to has no name, the number is kept.
+            If there is no style with that number, the key is removed. In
+            that case, unless the number is 0, an error is raised when the
+            run level is above 4.
+        """
+        for style_type in ('par', 'char'):
+            table = self.__styles_dict[style_type]
+            for key in table.keys():
+                for style in ('next-style', 'based-on-style'):
+                    value = table[key].get(style)
+                    if value is None:
+                        continue
+                    referenced = table.get(value)
+                    if referenced:
+                        changed_value = referenced.get('name')
+                        if changed_value:
+                            table[key][style] = changed_value
+                        continue
+                    if value not in (0, '0') and self.__run_level > 4:
+                        msg = '%s %s is based on %s\n' % (style_type, key, value)
+                        # add to the message; before, this line replaced it
+                        msg += 'There is no style with %s\n' % value
+                        raise self.__bug_handler(msg)
+                    del table[key][style]
+
+    def __print_style_table(self):
+        """
+        Required:
+            nothing
+        Returns:
+            nothing
+        Logic:
+            Write out the styles table: first the paragraph styles, then
+            the character styles. Each group gets an open tag and a close
+            tag. Inside a group, each style number gets one line, made of
+            an empty tag with the number followed by every attribute and
+            value of that style.
+        """
+        write = self.__write_obj.write
+        for style_type, prefix in (('par', 'paragraph'), ('char', 'character')):
+            write('mi<tg<open______<%s-styles\n' % prefix)
+            table = self.__styles_dict[style_type]
+            for num in table.keys():
+                write(
+                    'mi<tg<empty-att_<%s-style-in-table<num>%s' % (prefix, num)
+                )
+                for att in table[num].keys():
+                    write('<%s>%s' % (att, table[num][att]))
+                write('\n')
+            write('mi<tg<close_____<%s-styles\n' % prefix)
+
+    def __found_styles_table_func(self, line):
+        """
+        Required:
+            line -- not used
+        Returns:
+            nothing
+        Logic:
+            The marker for the beginning of the styles table has been
+            found. Change the state to in the styles table.
+        """
+        self.__state = 'in_styles_table'
+
+    def __before_styles_func(self, line):
+        """
+        Required:
+            line
+        Returns:
+            nothing.
+        Logic:
+            Look up the token in the state dictionary. If there is a
+            method for it (such as the one for the beginning of the styles
+            table), hand the line to that method. Otherwise write the
+            line out.
+        """
+        action = self.__state_dict.get(self.__token_info)
+        if action:
+            action(line)
+            return
+        self.__write_obj.write(line)
+
+    def __in_styles_func(self, line):
+        """
+        Required:
+            line
+        Returns:
+            nothing
+        Logic:
+            Look up the token in the state dictionary. If there is a
+            method for it (such as the one for the beginning of an
+            individual style), hand the line to that method. Otherwise
+            write the line out.
+        """
+        action = self.__state_dict.get(self.__token_info)
+        if action is not None:
+            action(line)
+            return
+        self.__write_obj.write(line)
+
+    def __para_style_in_body_func(self, line, type):
+        """
+        Required:
+            line -- the line
+            type -- whether a character or paragraph ('char' or 'par')
+        Returns:
+            nothing
+        Logic:
+            Extract the number from a line such as "cw<ss<para-style<nu<15".
+            Look up the name of the style with that number and write the
+            line out with the name in place of the number. If there is no
+            such style, or it has no name, write a not-defined line.
+        """
+        prefix = 'para' if type == 'par' else 'char'
+        num = line[20:-1]
+        # may be invalid RTF--a style down below not defined above!
+        try:
+            value = self.__styles_dict[type][num]['name']
+        except KeyError:
+            value = None
+        if not value:
+            self.__write_obj.write(
+                'cw<ss<%s_style<nu<not-defined\n' % prefix
+            )
+            return
+        self.__write_obj.write(
+            'cw<ss<%s-style<nu<%s\n' % (prefix, value)
+        )
+
+    def __after_styles_func(self, line):
+        """
+        Required:
+            line
+        Returns:
+            nothing
+        Logic:
+            Look up the token in the body dictionary. If the line has
+            either character or paragraph style info, hand it, together
+            with the type of style, to the method found. Otherwise write
+            the line out.
+        """
+        action, type = self.__body_dict.get(self.__token_info, (None, None))
+        if not action:
+            self.__write_obj.write(line)
+            return
+        action(line, type)
+
+    def convert_styles(self):
+        """
+        Requires:
+            nothing
+        Returns:
+            nothing (changes the original file)
+        Logic:
+            Read one line in at a time. Determine what action to take based on
+            the state. If the state is before the style table, look for the
+            beginning of the style table.
+            If the state is in the style table, create the style dictionary
+            and print out the tags.
+            If the state is after the style table, look for lines with style
+            info, and substitute the number with the name of the style.
+            If no method is registered for the current state, a message
+            is written to stderr and the line is skipped.
+            Both files are closed even if a method raises an error. When
+            reading is done the temporary file replaces the original.
+        """
+        self.__initiate_values()
+        read_obj = open_for_read(self.__file)
+        try:
+            self.__write_obj = open_for_write(self.__write_to)
+            try:
+                while True:
+                    line = read_obj.readline()
+                    self.__token_info = line[:16]
+                    action = self.__state_dict.get(self.__state)
+                    if action is None:
+                        # report the problem; calling None would only hide
+                        # this message behind a TypeError
+                        sys.stderr.write('no matching state in module styles.py\n')
+                        sys.stderr.write(self.__state + '\n')
+                    else:
+                        action(line)
+                    # the empty string read at the end of the file is still
+                    # passed to the state method once, as before
+                    if not line:
+                        break
+            finally:
+                self.__write_obj.close()
+        finally:
+            read_obj.close()
+        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
+        if self.__copy:
+            copy_obj.copy_file(self.__write_to, "styles.data")
+        copy_obj.rename(self.__write_to, self.__file)
+        os.remove(self.__write_to)
--- a/ebook_converter/ebooks/rtf2xml/table.py
+++ b/ebook_converter/ebooks/rtf2xml/table.py
@@ -0,0 +1,568 @@
+from __future__ import absolute_import, division, print_function, unicode_literals
+#########################################################################
+#                                                                       #
+#                                                                       #
+#   copyright 2002 Paul Henry Tremblay                                  #
+#                                                                       #
+#   This program is distributed in the hope that it will be useful,     #
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of      #
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU    #
+#   General Public License for more details.                            #
+#                                                                       #
+#                                                                       #
+#########################################################################
+import sys, os
+
+from calibre.ebooks.rtf2xml import copy, border_parse
+from calibre.ptempfile import better_mktemp
+from polyglot.builtins import unicode_type
+
+from . import open_for_read, open_for_write
+
+"""
+States.
+1. 'not_in_table'
+    1. 'cw<tb<row-def___' start a row definition
+    2. 'mi<mk<in-table__' start table
+2. 'in_table'
+    1. 'mi<mk<pard-start', start of a row, cell
+    2. 'mi<mk<not-in-tbl', end the table.
+    3. 'cw<tb<row-def___' start a row definition
+3. in_row_definition
+    1.  'mi<mk<not-in-tbl'  :   end the row definition. If in table, end the table.
+    2.  'mi<mk<pard-start'  :   end the row definition
+        if already in the table, start a row and cell.
+    3.  'cw<tb<row_______'  : end the row definition, end the row
+    4.  'cw...' use another method to handle the control word
+        control word might be added to dictionary.
+    5.  'mi<mk<in-table__' If already in table, do nothing. Otherwise
+        start the table.
+4. 'in_row'
+    1. 'mi<mk<pard-start', start  cell
+    2. 'mi<mk<not-in-tbl'  end table,
+    3. 'cw<tb<row_______'  close row,
+5. 'in_cell'
+    1. 'mi<mk<not-in-tbl', end table
+    2. 'cw<tb<cell______', end cell
+"""
+
+
+class Table:
+    """
+    Make tables.
+    Logic:
+    Read one line at a time. The default state (self.__state) is
+    'not_in_table'. Look for either a 'cw<tb<in-table__', or a row definition.
+    """
+
+    def __init__(self, in_file, bug_handler, copy=None, run_level=1):
+        """
+        Store the settings that make_table() works with.
+        Required:
+            'in_file' -- file to parse
+            'bug_handler' -- stored; make_table hands it to copy.Copy
+        Optional:
+            'copy' -- whether to make a copy of result for debugging
+            'run_level' -- stored on the instance
+        Returns:
+            nothing
+        """
+        # the output is written to this scratch file first; make_table
+        # passes it to copy.Copy.rename together with the input file
+        self.__write_to = better_mktemp()
+        # settings supplied by the caller
+        self.__file = in_file
+        self.__bug_handler = bug_handler
+        self.__copy = copy
+        self.__run_level = run_level
+
+    def __initiate_values(self):
+        """
+        Reset the state list, the collected data and the lookup tables.
+        """
+        # set the default state; the last entry picks the line handler
+        self.__state = ['not_in_table']
+        # set empty data for all tables
+        self.__table_data = []
+        # just in case there is no table data
+        self.__row_dict, self.__cell_list, self.__cell_widths = {}, [], []
+        self.__state_dict = {
+            'not_in_table': self.__not_in_table_func,
+            'in_table': self.__in_table_func,
+            'in_row_def': self.__in_row_def_func,
+            'in_row': self.__in_row_func,
+            'in_cell': self.__in_cell_func,
+        }
+        # token => handler, used by __not_in_table_func
+        self.__not_in_table_dict = {
+            'cw<tb<row-def___': self.__found_row_def_func,
+            'cw<tb<in-table__': self.__start_table_func,
+            'mi<mk<in-table__': self.__start_table_func,
+        }
+        # can't use this dictionary. When in row_definition, many tokens
+        # require multiple definitions
+        self.__in_row_definition_dict = {
+            'mi<mk<not-in-tbl': self.__end_row_table_func,
+            'mi<mk<pard-start': self.__end_row_def_func,
+        }
+        # token => handler, used by __in_row_func
+        self.__in_row_dict = {
+            'mi<mk<not-in-tbl': self.__close_table,
+            'mi<mk<pard-start': self.__start_cell_func,
+            'cw<tb<row_______': self.__end_row_func,
+            'cw<tb<cell______': self.__empty_cell,
+        }
+
+    def __in_table_func(self, line):
+        """
+        Requires:
+            line -- line to parse
+        Logic:
+            Inside a table but not yet inside a row. A not-in-table,
+            section or body-close marker closes the table; a paragraph
+            start opens a row and a cell; a row definition is collected;
+            a bare cell token opens a row holding one empty cell.
+            The line itself is always written afterwards.
+        """
+        token = self.__token_info
+        if token in ('mi<mk<not-in-tbl', 'mi<mk<sect-start',
+                     'mi<mk<sect-close', 'mi<mk<body-close'):
+            self.__close_table(line)
+        elif token == 'mi<mk<pard-start':
+            self.__start_row_func(line)
+            self.__start_cell_func(line)
+        elif token == 'cw<tb<row-def___':
+            self.__found_row_def_func(line)
+        elif token == 'cw<tb<cell______':
+            self.__start_row_func(line)
+            self.__empty_cell(line)
+        self.__write_obj.write(line)
+
+    def __not_in_table_func(self, line):
+        """
+        Requires:
+            line -- the line of text read in from document
+        Returns:
+            nothing
+        Logic:
+            Outside of any table. If the current token is one of the keys
+            of self.__not_in_table_dict (a row definition or an in-table
+            token), run its handler first. The line is written out in
+            every case.
+        """
+        # dispatch on the token, if a handler is registered for it
+        if self.__token_info in self.__not_in_table_dict:
+            self.__not_in_table_dict[self.__token_info](line)
+        self.__write_obj.write(line)
+
+    def __close_table(self, line):
+        """
+        Requires:
+            line -- line to parse (not used)
+        Returns:
+            nothing
+        Logic:
+            Write the table end marker, reset the state and store the column
+            and row counts plus the most frequent cells-per-row and width.
+        """
+        self.__write_obj.write('mi<mk<table-end_\n')
+        self.__state = ['not_in_table']
+        columns = self.__max_number_cells_in_row
+        summary = self.__table_data[-1]
+        summary['number-of-columns'] = columns
+        summary['number-of-rows'] = self.__rows_in_table
+        summary['average-cells-per-row'] = self.__mode(
+            self.__list_of_cells_in_row)
+        summary['average-cell-width'] = self.__mode(self.__cell_widths)
+
+    def __found_row_def_func(self, line):
+        """
+        Requires:
+            line -- not used; kept for consistency with other methods.
+        Returns:
+            nothing
+        Logic:
+            A row definition has been found. Enter the 'in_row_def' state
+            and start with fresh row data: an empty row dictionary, a cell
+            list holding one empty dictionary for the first cell, no cell
+            widths, and a last cell position of 0.
+        """
+        self.__state.append('in_row_def')
+        self.__row_dict = {}
+        self.__cell_list = [{}]
+        self.__cell_widths, self.__last_cell_position = [], 0
+
+    def __start_table_func(self, line):
+        """
+        Requires:
+            line -- line to parse (not used)
+        Returns:
+            nothing
+        Logic:
+            Zero the row and cell counters, add an empty data dictionary for
+            the new table, write the table start marker and add 'in_table'
+            to the state list.
+        """
+        # counters for the table that starts here
+        self.__rows_in_table = self.__cells_in_table = 0
+        self.__cells_in_row = self.__max_number_cells_in_row = 0
+        self.__table_data.append({})
+        self.__list_of_cells_in_row = []
+        # TableInfo.insert_info looks for this marker line
+        self.__write_obj.write('mi<mk<tabl-start\n')
+        self.__state.append('in_table')
+
+    def __end_row_table_func(self, line):
+        """
+        Requires:
+            line -- just for consistency
+        Returns:
+            nothing
+        Logic:
+            Close the table by passing the line on to __close_table.
+        """
+        self.__close_table(line)
+
+    def __end_row_def_func(self, line):
+        """
+        Requires:
+            line -- just for consistency
+        Returns:
+            nothing
+        Logic:
+            Leave the 'in_row_def' state if that is the current state.
+            Get rid of the last {} in the cell list.
+            Count the comma separated entries of self.__row_dict['widths']
+            (for example '122.00, 122.00') and store the count in the row
+            dictionary as 'number-of-cells'.
+        """
+        if self.__state and self.__state[-1] == 'in_row_def':
+            self.__state.pop()
+        # __found_cell_position appends a {} after every cell position, so
+        # the last entry of the list is an unused extra one
+        self.__cell_list.pop()
+        widths = self.__row_dict.get('widths')
+        if widths:
+            # one more entry than there are separating commas
+            self.__row_dict['number-of-cells'] = widths.count(',') + 1
+
+    def __in_row_def_func(self, line):
+        """
+        Requires:
+            line --line to parse
+        Returns:
+            nothing
+        Logic:
+            In the text that defines a row. The end-of-row token ends both
+            the row and the row definition; any other control word is
+            handled by __handle_row_token. The marker tokens checked here
+            end the row definition, and may also end or start the table.
+            If a paragraph definition (pard-start) is found while already
+            in a table, a row and a cell are started.
+        """
+        if self.__token_info == 'cw<tb<row_______':
+            # write tags
+            self.__end_row_func(line)
+            # change the state
+            self.__end_row_def_func(line)
+            self.__write_obj.write(line)
+        elif line[0:2] == 'cw':
+            self.__handle_row_token(line)
+            self.__write_obj.write(line)
+        elif self.__token_info == 'mi<mk<not-in-tbl' and 'in_table' in self.__state:
+            self.__end_row_def_func(line)
+            self.__close_table(line)
+            self.__write_obj.write(line)
+        elif self.__token_info == 'mi<mk<pard-start':
+            self.__end_row_def_func(line)
+            # if already in the table, start a row, then cell.
+            if len(self.__state) > 0 and self.__state[-1] == 'in_table':
+                self.__start_row_func(line)
+                self.__start_cell_func(line)
+            self.__write_obj.write(line)
+        elif self.__token_info == 'mi<mk<in-table__':
+            self.__end_row_def_func(line)
+            # if not in table, start a new table
+            if len(self.__state) > 0 and self.__state[-1] != 'in_table':
+                self.__start_table_func(line)
+            self.__write_obj.write(line)
+        else:
+            self.__write_obj.write(line)
+
+    def __handle_row_token(self, line):
+        """
+        Requires:
+            line -- line to parse
+        Returns:
+            nothing
+        Logic:
+            the tokens in the row definition contain the following information:
+               1. row borders.
+               2. cell borders for all cells in the row.
+               3. cell positions for all cells in the row.
+            A border token looks like
+                cw<bd<bor-t-r-to<nu<bdr-hair__|bdr-li-wid:0.50
+            and is parsed by border_parse. If any resulting key starts with
+            'border-cell', all keys go into the dictionary in the last item
+            of the cell list ([{border:something, width:something},
+            {border:something, width:something}]); otherwise all keys go
+            into the row dictionary.
+            A cell position is handled by __found_cell_position.
+            The left row position and the header flag are stored in the row
+            dictionary.
+        """
+        if line[3:5] == 'bd':
+            the_dict = border_parse.BorderParse().parse_border(line)
+            # a key such as border-cell-top-hairline marks a cell border; one
+            # such key sends every key of this token to the current cell
+            if any(key[0:11] == 'border-cell' for key in the_dict.keys()):
+                target = self.__cell_list[-1]
+            else:
+                target = self.__row_dict
+            for key in the_dict.keys():
+                target[key] = the_dict[key]
+        elif self.__token_info == 'cw<tb<cell-posit':
+            # cw<tb<cell-posit<nu<216.00
+            self.__found_cell_position(line)
+        elif self.__token_info == 'cw<tb<row-pos-le':
+            # cw<tb<row-pos-le<nu<-5.40
+            self.__row_dict['left-row-position'] = line[20:-1]
+        elif self.__token_info == 'cw<tb<row-header':
+            self.__row_dict['header'] = 'true'
+
+    def __start_cell_func(self, line):
+        """
+        Required:
+            line -- the line of text
+        Returns:
+            nothing
+        Logic:
+            Append 'in_cell' to the states.
+            If self.__cell_list contains dictionaries, take the first
+            dictionary and write an open tag for the cell with one
+            <key>value pair for each of its entries. Then remove that
+            dictionary from the list.
+            Otherwise, write an open tag without attributes.
+            Finally count the cell, both for the table and for the current
+            row.
+        """
+        self.__state.append('in_cell')
+        if self.__cell_list:
+            self.__write_obj.write('mi<tg<open-att__<cell')
+            # the dictionaries are in cell order, so the next cell is in front
+            cell_dict = self.__cell_list[0]
+            for key, value in cell_dict.items():
+                self.__write_obj.write('<%s>%s' % (key, value))
+            self.__write_obj.write('\n')
+            del self.__cell_list[0]
+        else:
+            self.__write_obj.write('mi<tg<open______<cell\n')
+        # keep the cell counts for the table and the current row up to date
+        self.__cells_in_table += 1
+        self.__cells_in_row += 1
+
+    def __start_row_func(self, line):
+        """
+        Required:
+            line -- the line of text
+        Returns:
+            nothing
+        Logic:
+            Append 'in_row' to the states and write an open tag for the row
+            with one <key>value pair for each entry of self.__row_dict.
+        """
+        self.__state.append('in_row')
+        write = self.__write_obj.write
+        write('mi<tg<open-att__<row')
+        for key, value in self.__row_dict.items():
+            write('<%s>%s' % (key, value))
+        write('\n')
+        self.__cells_in_row = 0
+        self.__rows_in_table += 1
+
+    def __found_cell_position(self, line):
+        """
+        needs:
+            line: current line
+        returns:
+            nothing
+        logic:
+            Calculate the cell width: the new cell position minus the last
+            cell position. For the first cell (last position is 0) the left
+            row position, which is often negative, is subtracted as well.
+            Append the width to the 'widths' entry of the row dictionary, store
+            it in the last cell dictionary, start a new cell dictionary, and
+            make the new position the last cell position.
+        """
+        # cw<tb<cell-posit<nu<216.00
+        position = round(float(line[20:-1]), 2)
+        if self.__last_cell_position == 0:
+            offset = float(self.__row_dict.get('left-row-position', 0))
+        else:
+            offset = 0
+        width = position - self.__last_cell_position - offset
+        width = unicode_type('%.2f' % width)
+        self.__last_cell_position = position
+        if self.__row_dict.get('widths'):
+            self.__row_dict['widths'] += ', %s' % unicode_type(width)
+        else:
+            self.__row_dict['widths'] = unicode_type(width)
+        self.__cell_list[-1]['width'] = width
+        self.__cell_list.append({})
+        self.__cell_widths.append(width)
+
+    def __in_cell_func(self, line):
+        """
+        Required:
+            line
+        Returns:
+            nothing
+        Logic:
+            In the middle of a cell.
+            Look for the close of the cell. If found, end the cell; the
+            token line itself is not written out.
+            Look for a not-in-table, section or body-close marker. If
+            found, the table is over: end the cell, end the row, close the
+            table and write the line.
+            Otherwise, print out the line.
+        """
+        token = self.__token_info
+        if token == 'cw<tb<cell______':
+            # cw<tb<cell______<nu<true
+            self.__end_cell_func(line)
+            return
+        if token in ('mi<mk<not-in-tbl', 'mi<mk<sect-start',
+                     'mi<mk<sect-close', 'mi<mk<body-close'):
+            # for example mi<mk<sect-start
+            self.__end_cell_func(line)
+            self.__end_row_func(line)
+            self.__close_table(line)
+        self.__write_obj.write(line)
+
+    def __end_cell_func(self, line):
+        """
+        Requires:
+            line
+        Returns:
+            nothing
+        Logic:
+            Pop 'in_cell' if it is the current state; write the closing marks.
+        """
+        if len(self.__state) > 1 and self.__state[-1] == 'in_cell':
+            self.__state.pop()
+        for closing in ('mi<mk<close_cell\n',
+                        'mi<tg<close_____<cell\n',
+                        'mi<mk<closecell_\n'):
+            self.__write_obj.write(closing)
+
+    def __in_row_func(self, line):
+        """
+        Requires:
+            line -- line to parse
+        Returns:
+            nothing
+        Logic:
+            In a row but not in a cell. A not-in-table, section or
+            body-close marker closes the row and the table. Any other token
+            is looked up in self.__in_row_dict (paragraph start, end of row,
+            empty cell) and its handler, if any, is run.
+            The line is written out in every case.
+        """
+        if self.__token_info in ('mi<mk<not-in-tbl', 'mi<mk<sect-start',
+                                 'mi<mk<sect-close', 'mi<mk<body-close'):
+            self.__end_row_func(line)
+            self.__close_table(line)
+        else:
+            handler = self.__in_row_dict.get(self.__token_info)
+            if handler:
+                handler(line)
+        # both branches end by passing the line through
+        self.__write_obj.write(line)
+
+    def __end_row_func(self, line):
+        """End the row (or write an empty one) and record its cell count."""
+        row_is_open = len(self.__state) > 1 and self.__state[-1] == 'in_row'
+        if not row_is_open:
+            self.__write_obj.write('mi<tg<empty_____<row\n')
+            self.__rows_in_table += 1
+        else:
+            self.__state.pop()
+            self.__write_obj.write('mi<tg<close_____<row\n')
+        if self.__max_number_cells_in_row < self.__cells_in_row:
+            self.__max_number_cells_in_row = self.__cells_in_row
+        self.__list_of_cells_in_row.append(self.__cells_in_row)
+
+    def __empty_cell(self, line):
+        """
+        Required:
+            line -- line of text
+        Returns:
+            nothing
+        Logic:
+            Write an empty tag with attributes if the cell list is not empty;
+            the attributes come from its last dictionary, which stays in the
+            list. Otherwise, write an empty tag with cell as element.
+        """
+        write = self.__write_obj.write
+        if self.__cell_list:
+            write('mi<tg<empty-att_<cell')
+            for key, value in self.__cell_list[-1].items():
+                write('<%s>%s' % (key, value))
+            write('\n')
+        else:
+            write('mi<tg<empty_____<cell\n')
+        self.__cells_in_table += 1
+        self.__cells_in_row += 1
+
+    def __mode(self, the_list):
+        """
+        Required:
+            the_list -- a list of something
+        Returns:
+            the item that occurs the most, or 'not-defined' for an empty list
+        Logic:
+            Get the count of each item in the list. The item with the greatest
+            count is the mode. If several items share the greatest count, the
+            one that comes first in the list is returned.
+            Counting is done with the_list.count for every item.
+        """
+        if not the_list:
+            # nothing to count
+            return 'not-defined'
+        # max() returns the first of several items with the same count, just
+        # like a loop that only replaces the mode on a strictly greater count
+        return max(the_list, key=the_list.count)
+
+    def make_table(self):
+        """
+        Requires:
+            nothing
+        Returns:
+            A list with one dictionary of values for each table.
+        Logic:
+            Read one line in at a time and pass it to the handler for the
+            current state. The files are closed even if a handler raises.
+        """
+        self.__initiate_values()
+        read_obj = open_for_read(self.__file)
+        self.__write_obj = open_for_write(self.__write_to)
+        try:
+            line = 1
+            while line:
+                line = read_obj.readline()
+                self.__token_info = line[:16]
+                action = self.__state_dict.get(self.__state[-1])
+                if action is None:
+                    sys.stderr.write('No matching state in module table.py\n')
+                    sys.stderr.write(self.__state[-1] + '\n')
+                action(line)
+        finally:
+            read_obj.close()
+            self.__write_obj.close()
+        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
+        if self.__copy:
+            copy_obj.copy_file(self.__write_to, "table.data")
+        copy_obj.rename(self.__write_to, self.__file)
+        os.remove(self.__write_to)
+        return self.__table_data
--- a/ebook_converter/ebooks/rtf2xml/table_info.py
+++ b/ebook_converter/ebooks/rtf2xml/table_info.py
@@ -0,0 +1,88 @@
+from __future__ import unicode_literals, absolute_import, print_function, division
+#########################################################################
+#                                                                       #
+#                                                                       #
+#   copyright 2002 Paul Henry Tremblay                                  #
+#                                                                       #
+#   This program is distributed in the hope that it will be useful,     #
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of      #
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU    #
+#   General Public License for more details.                            #
+#                                                                       #
+#                                                                       #
+#########################################################################
+import os
+from calibre.ebooks.rtf2xml import copy
+from calibre.ptempfile import better_mktemp
+from . import open_for_read, open_for_write
+
+# note to self. This is the first module in which I use tempfile. A good idea?
+"""
+"""
+
+
+class TableInfo:
+    """
+    Insert table data for tables.
+    Logic:
+        Put a table tag, with the data collected for that table, before
+        each table start marker and a close tag before each end marker.
+    """
+
+    def __init__(self,
+            in_file,
+            bug_handler,
+            table_data,
+            copy=None,
+            run_level=1,):
+        """
+        Required:
+            'in_file' -- file to parse
+            'bug_handler' -- called with a message to build the error raised
+            'table_data' -- a list with one dictionary for each table.
+        Optional:
+            'copy' -- whether to make a copy of result for debugging
+            'run_level' -- above 3, missing table data raises an error
+        """
+        self.__file = in_file
+        self.__bug_handler = bug_handler
+        self.__copy = copy
+        self.__table_data = table_data
+        self.__run_level = run_level
+        self.__write_to = better_mktemp()
+
+    def insert_info(self):
+        """
+        Rewrite the file with the table tags added; always close the files.
+        """
+        read_obj = open_for_read(self.__file)
+        self.__write_obj = open_for_write(self.__write_to)
+        try:
+            line = 1
+            while line:
+                line = read_obj.readline()
+                if line == 'mi<mk<tabl-start\n':
+                    if len(self.__table_data) > 0:
+                        table_dict = self.__table_data[0]
+                        self.__write_obj.write('mi<tg<open-att__<table')
+                        for key in table_dict.keys():
+                            self.__write_obj.write('<%s>%s' % (key, table_dict[key]))
+                        self.__write_obj.write('\n')
+                        self.__table_data = self.__table_data[1:]
+                    else:
+                        # this shouldn't happen!
+                        if self.__run_level > 3:
+                            msg = 'Not enough data for each table\n'
+                            raise self.__bug_handler(msg)
+                        self.__write_obj.write('mi<tg<open______<table\n')
+                elif line == 'mi<mk<table-end_\n':
+                    self.__write_obj.write('mi<tg<close_____<table\n')
+                self.__write_obj.write(line)
+        finally:
+            read_obj.close()
+            self.__write_obj.close()
+        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
+        if self.__copy:
+            copy_obj.copy_file(self.__write_to, "table_info.data")
+        copy_obj.rename(self.__write_to, self.__file)
+        os.remove(self.__write_to)
--- a/ebook_converter/ebooks/rtf2xml/tokenize.py
+++ b/ebook_converter/ebooks/rtf2xml/tokenize.py
@@ -0,0 +1,218 @@
+from __future__ import unicode_literals, absolute_import, print_function, division
+#########################################################################
+#                                                                       #
+#                                                                       #
+#   copyright 2002 Paul Henry Tremblay                                  #
+#                                                                       #
+#   This program is distributed in the hope that it will be useful,     #
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of      #
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU    #
+#   General Public License for more details.                            #
+#                                                                       #
+#                                                                       #
+#########################################################################
+import os, re
+
+from calibre.ebooks.rtf2xml import copy
+from calibre.utils.mreplace import MReplace
+from calibre.ptempfile import better_mktemp
+from polyglot.builtins import codepoint_to_chr, range, filter, map
+from . import open_for_read, open_for_write
+
+
+class Tokenize:
+    """Tokenize RTF into one line per field. Each line will contain information useful for the rest of the script"""
+
+    def __init__(self,
+            in_file,
+            bug_handler,
+            copy=None,
+            run_level=1,
+        ):
+        """Remember the file to tokenize in place and reset the uc state.
+
+        run_level is accepted but never read by this class.
+        """
+        self.__file = in_file
+        self.__bug_handler = bug_handler
+        self.__copy = copy
+        self.__write_to = better_mktemp()
+        self.__compile_expressions()
+        # chars still to skip after a \u token, and whether bin data follows
+        self.__uc_char = 0
+        self.__uc_bin = False
+        # stack of \uc values, one entry per open group
+        self.__uc_value = [1]
+
+    def __reini_utf8_counters(self):
+        """Forget any pending chars or bin data to skip."""
+        self.__uc_char = 0
+        self.__uc_bin = False
+
+    def __remove_uc_chars(self, startchar, token):
+        """Return token from startchar on, minus the chars still to skip."""
+        for i in range(startchar, len(token)):
+            if self.__uc_char:
+                self.__uc_char -= 1
+            else:
+                return token[i:]
+        # if only char to skip
+        return ''
+
+    def __unicode_process(self, token):
+        """Convert a unicode token and drop the fallback chars after it.
+
+        Tracks the uc value per group; returns '' for fully skipped tokens.
+        """
+        # change scope in
+        if token == r'\{':
+            self.__uc_value.append(self.__uc_value[-1])
+            # basic error handling
+            self.__reini_utf8_counters()
+            return token
+        # change scope out
+        elif token == r'\}':
+            # keep the base value so an unbalanced brace cannot empty the stack
+            if len(self.__uc_value) > 1:
+                self.__uc_value.pop()
+            self.__reini_utf8_counters()
+            return token
+        # add a uc control
+        elif token[:3] == '\\uc':
+            self.__uc_value[-1] = int(token[3:])
+            self.__reini_utf8_counters()
+            return token
+        # bin data to skip
+        elif self.__uc_bin:
+            self.__uc_bin = False
+            return ''
+        # uc char to remove
+        elif self.__uc_char:
+            # handle \bin tag in case of uc char to skip ('\b' is a backspace)
+            if token[:4] == '\\bin':
+                self.__uc_char -=1
+                self.__uc_bin = True
+                return ''
+            elif token[:1] == "\\" :
+                self.__uc_char -=1
+                return ''
+            else:
+                return self.__remove_uc_chars(0, token)
+        # go for real \u token
+        match_obj = self.__utf_exp.match(token)
+        if match_obj is not None:
+            self.__reini_utf8_counters()
+            # get value and handle negative case
+            uni_char = int(match_obj.group(1))
+            uni_len = len(match_obj.group(0))
+            if uni_char < 0:
+                uni_char += 65536
+            uni_char = codepoint_to_chr(uni_char).encode('ascii', 'xmlcharrefreplace').decode('ascii')
+            self.__uc_char = self.__uc_value[-1]
+            # there is only an unicode char
+            if len(token)<= uni_len:
+                return uni_char
+            # an unicode char and something else; must be after as the
+            # text is split on \ (necessary? maybe for \bin?)
+            elif not self.__uc_char:
+                return uni_char + token[uni_len:]
+            # if not uc0 and chars
+            else:
+                return uni_char + self.__remove_uc_chars(uni_len, token)
+        # default
+        return token
+
+    def __sub_reg_split(self,input_file):
+        """Normalize the raw RTF text and split it into a list of tokens."""
+        input_file = self.__replace_spchar.mreplace(input_file)
+        # this is for older RTF
+        input_file = self.__par_exp.sub(r'\n\\par \n', input_file)
+        input_file = self.__cwdigit_exp.sub(r"\g<1>\n\g<2>", input_file)
+        input_file = self.__cs_ast.sub(r"\g<1>", input_file)
+        input_file = self.__ms_hex_exp.sub(r"\\mshex0\g<1> ", input_file)
+        input_file = self.__utf_ud.sub(r"\\{\\uc0 \g<1>\\}", input_file)
+        # remove \n in bin data
+        input_file = self.__bin_exp.sub(lambda x:
+                                        x.group().replace('\n', '') + '\n', input_file)
+        # split
+        tokens = re.split(self.__splitexp, input_file)
+        # remove empty tokens and \n
+        return [tok for tok in tokens if tok and tok != '\n']
+
+    def __compile_expressions(self):
+        """Build the replacement table and compile the regular expressions."""
+        SIMPLE_RPL = {
+            "\\\\": "\\backslash ",
+            "\\~": "\\~ ",
+            "\\;": "\\; ",
+            "&": "&amp;",
+            "<": "&lt;",
+            ">": "&gt;",
+            "\\_": "\\_ ",
+            "\\:": "\\: ",
+            "\\-": "\\- ",
+            # turn escaped braces into generic tokens to eliminate special
+            # cases and make processing easier
+            "\\{": "\\ob ",
+            "\\}": "\\cb ",
+            # put a backslash in front of bare braces to eliminate special
+            # cases and make processing easier
+            "{": "\\{",
+            "}": "\\}",
+            }
+        self.__replace_spchar = MReplace(SIMPLE_RPL)
+        # add ;? in case of char following \u
+        self.__ms_hex_exp = re.compile(r"\\\'([0-9a-fA-F]{2})")
+        self.__utf_exp = re.compile(r"\\u(-?\d{3,6}) ?")
+        self.__bin_exp = re.compile(r"(?:\\bin(-?\d{0,10})[\n ]+)[01\n]+")
+        # manage upr/ud situations
+        self.__utf_ud = re.compile(r"\\{[\n ]?\\upr[\n ]?(?:\\{.*?\\})[\n ]?" +
+                       r"\\{[\n ]?\\*[\n ]?\\ud[\n ]?(\\{.*?\\})[\n ]?\\}[\n ]?\\}")
+        # add \n in split for whole file reading
+        # why keep backslash whereas \is replaced before?
+        # remove \n from endline char
+        self.__splitexp = re.compile(r"(\\[{}]|\n|\\[^\s\\{}&]+(?:[ \t\r\f\v])?)")
+        # this is for old RTF
+        self.__par_exp = re.compile(r'(\\\n+|\\ )')
+        # handle improper cs char-style with \* before without {
+        self.__cs_ast = re.compile(r'\\\*([\n ]*\\cs\d+[\n \\]+)')
+        # handle cw using a digit as argument and without space as delimiter
+        self.__cwdigit_exp = re.compile(r"(\\[a-zA-Z]+[\-0-9]+)([^0-9 \\]+)")
+
+    def tokenize(self):
+        """Read the file, apply the basic substitutions, fix unicode tokens
+        and write the result back, one token per line."""
+        with open_for_read(self.__file) as read_obj:
+            input_file = read_obj.read()
+        # process simple replacements and split giving us a correct list
+        tokens = self.__sub_reg_split(input_file)
+        # correct unicode, then drop the empty items left by skipped tokens
+        tokens = [self.__unicode_process(tok) for tok in tokens]
+        tokens = [tok for tok in tokens if tok]
+        with open_for_write(self.__write_to) as write_obj:
+            write_obj.write('\n'.join(tokens))
+        # Move and copy
+        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
+        if self.__copy:
+            copy_obj.copy_file(self.__write_to, "tokenize.data")
+        copy_obj.rename(self.__write_to, self.__file)
+        os.remove(self.__write_to)
+
+        # self.__special_tokens = [ '_', '~', "'", '{', '}' ]
+
+# import sys
+# def main(args=sys.argv):
+    # if len(args) < 2:
+        # print 'No file'
+        # return
+    # file = 'data_tokens.txt'
+    # if len(args) == 3:
+        # file = args[2]
+    # to = Tokenize(args[1], Exception, out_file = file)
+    # to.tokenize()
+
+
+# if __name__ == '__main__':
+    # sys.exit(main())
+
+# calibre-debug -e src/calibre/ebooks/rtf2xml/tokenize.py