ebook-converter/ebook_converter/ebooks/rtf2xml/inline.py

from __future__ import unicode_literals, absolute_import, print_function, division
import sys, os

from ebook_converter.ebooks.rtf2xml import copy
from ebook_converter.ptempfile import better_mktemp
from . import open_for_read, open_for_write

"""
States.
1. default
    1. an open bracket ends this state.
    2. Text print out text. Print out any groups_in_waiting.
    3. closed bracket. Close groups
2. after an open bracket
    1. The lack of a control word ends this state.
    2. paragraph end -- close out all tags
    3. footnote beg -- close out all tags
"""


class Inline:
    """
    Make inline tags within lists.
    Logic:
    """

    def __init__(self,
            in_file,
            bug_handler,
            copy=None,
            run_level=1,):
        """
        Required:
            'file'--file to parse
        Optional:
            'copy'-- whether to make a copy of result for debugging
            'temp_dir' --where to output temporary results (default is
            directory from which the script is run.)
        Returns:
            nothing
            """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__run_level = run_level
        self.__write_to = better_mktemp()

    def __initiate_values(self):
        """
        Initiate all values.
        """
        self.__state_dict = {
            'default':              self.__default_func,
            'after_open_bracket':   self.__after_open_bracket_func,
        }
        self.__default_dict = {
            'ob<nu<open-brack':         self.__found_open_bracket_func,
            'tx<nu<__________'  :       self.__found_text_func,
            'tx<hx<__________'  :       self.__found_text_func,
            'tx<ut<__________'  :       self.__found_text_func,
            'mi<mk<inline-fld'  :       self.__found_text_func,
            'text'              :       self.__found_text_func,
            'cb<nu<clos-brack'  :       self.__close_bracket_func,
            'mi<mk<par-end___'  :       self.__end_para_func,
            'mi<mk<footnt-ope'  :       self.__end_para_func,
            'mi<mk<footnt-ind'  :       self.__end_para_func,
        }
        self.__after_open_bracket_dict = {
            'cb<nu<clos-brack'  :       self.__close_bracket_func,
            'tx<nu<__________'  :       self.__found_text_func,
            'tx<hx<__________'  :       self.__found_text_func,
            'tx<ut<__________'  :       self.__found_text_func,
            'text'              :       self.__found_text_func,
            'mi<mk<inline-fld'  :       self.__found_text_func,
            'ob<nu<open-brack':         self.__found_open_bracket_func,
            'mi<mk<par-end___'  :       self.__end_para_func,
            'mi<mk<footnt-ope'  :       self.__end_para_func,
            'mi<mk<footnt-ind'  :       self.__end_para_func,
            'cw<fd<field_____'  :       self.__found_field_func,
        }
        self.__state = 'default'
        self.__brac_count = 0  # do I need this?
        self.__list_inline_list = []
        self.__body_inline_list = []
        self.__groups_in_waiting_list = [0]
        self.__groups_in_waiting_body = [0]
        self.__groups_in_waiting = self.__groups_in_waiting_body
        self.__place = 'non_list'
        self.__inline_list = self.__body_inline_list
        self.__in_para = 0  # not in paragraph
        self.__char_dict = {
            # character info => ci
            'annotation'    :   'annotation',
            'blue______'    :   'blue',
            'bold______'    :   'bold',
            'caps______'    :   'caps',
            'char-style'    :   'character-style',
            'dbl-strike'    :   'double-strike-through',
            'emboss____'    :   'emboss',
            'engrave___'    :   'engrave',
            'font-color'    :   'font-color',
            'font-down_'    :   'subscript',
            'font-size_'    :   'font-size',
            'font-style'    :   'font-style',
            'font-up___'    :   'superscript',
            'footnot-mk'    :   'footnote-marker',
            'green_____'    :   'green',
            'hidden____'    :   'hidden',
            'italics___'    :   'italics',
            'outline___'    :   'outline',
            'red_______'    :   'red',
            'shadow____'    :   'shadow',
            'small-caps'    :   'small-caps',
            'strike-thr'    :   'strike-through',
            'subscript_'    :   'subscript',
            'superscrip'    :   'superscript',
            'underlined'    :   'underlined',
        }
        self.__caps_list = ['false']

    def __set_list_func(self, line):
        """
        Requires:
            line--line of text
        Returns:
            nothing
        Logic:
        """
        if self.__place == 'in_list':
            if self.__token_info == 'mi<mk<lst-tx-end':
                self.__place = 'not_in_list'
                self.__inline_list = self.__body_inline_list
                self.__groups_in_waiting = self.__groups_in_waiting_body
        else:
            if self.__token_info == 'mi<mk<lst-tx-beg':
                self.__place = 'in_list'
                self.__inline_list = self.__list_inline_list
                self.__groups_in_waiting = self.__groups_in_waiting_list

    def __default_func(self, line):
        """
        Requires:
            line-- line of text
        Returns:
            nothing
        Logic:
            Write if not hardline break
        """
        action = self.__default_dict.get(self.__token_info)
        if action:
            action(line)
        self.__write_obj.write(line)

    def __found_open_bracket_func(self, line):
        """
        Requires:
            line -- current line of text
        Returns:
            nothing
        Logic:
            Change the state to 'after_open_bracket'
        """
        self.__state = 'after_open_bracket'
        self.__brac_count += 1
        self.__groups_in_waiting[0] += 1
        self.__inline_list.append({})
        self.__inline_list[-1]['contains_inline'] = 0

    def __after_open_bracket_func(self, line):
        """
        Requires:
            line --line of text
        Returns:
            nothing
        Logic:
            If the token is a control word for character info (cw<ci), use another
            method to add to the dictionary.
            Use the dictionary to get the approriate function.
            Always print out the line.
        """
        if line[0:5] == 'cw<ci':  # calibre: bug in original function no diff between cw<ci and cw<pf
            self.__handle_control_word(line)
        else:
            action = self.__after_open_bracket_dict.get(self.__token_info)
            if action:
                self.__state = 'default'  # a non control word?
                action(line)
        self.__write_obj.write(line)

    def __handle_control_word(self, line):
        """
        Required:
            line --line of text
        Returns:
            nothing
        Logic:
            Handle the control word for inline groups.
            Add each name - value to a dictionary.
            If the font style of Symbol, Wingdings, or Dingbats is found,
            always mark this. I need this later to convert the text to
            the right utf.
        """
        # cw<ci<shadow_____<nu<true
        # self.__char_dict = {
        char_info = line[6:16]
        char_value = line[20:-1]
        name = self.__char_dict.get(char_info)
        if name:
            self.__inline_list[-1]['contains_inline'] = 1
            self.__inline_list[-1][name] = char_value
            """
            if name == 'font-style':
                if char_value == 'Symbol':
                    self.__write_obj.write('mi<mk<font-symbo\n')
                elif char_value == 'Wingdings':
                    self.__write_obj.write('mi<mk<font-wingd\n')
                elif char_value == 'Zapf Dingbats':
                    self.__write_obj.write('mi<mk<font-dingb\n')
            """

    def __close_bracket_func(self, line):
        """
        Requires:
            line --line of text
        Returns:
            Nothing
        Logic:
            If there are no inline groups, do nothing.
            Get the keys of the last dictionary in the inline_groups.
            If 'contains_inline' in the keys, write a close tag.
            If the_dict contains font information, write a mk tag.
        """
        if len(self.__inline_list) == 0:
            # nothing to add
            return
        the_dict = self.__inline_list[-1]
        the_keys = the_dict.keys()
        # always close out
        if self.__place == 'in_list':
            if 'contains_inline' in the_keys and the_dict['contains_inline'] == 1\
                and self.__groups_in_waiting[0] == 0:
                self.__write_obj.write('mi<tg<close_____<inline\n')
                if 'font-style' in the_keys:
                    self.__write_obj.write('mi<mk<font-end__\n')
                if 'caps' in the_keys:
                    self.__write_obj.write('mi<mk<caps-end__\n')
        else:
            # close out only if in a paragraph
            if 'contains_inline' in the_keys and the_dict['contains_inline'] == 1\
                and self.__in_para and self.__groups_in_waiting[0] == 0:
                self.__write_obj.write('mi<tg<close_____<inline\n')
                if 'font-style' in the_keys:
                    self.__write_obj.write('mi<mk<font-end__\n')
                if 'caps' in the_keys:
                    self.__write_obj.write('mi<mk<caps-end__\n')
        self.__inline_list.pop()
        if self.__groups_in_waiting[0] != 0:
            self.__groups_in_waiting[0] -= 1

    def __found_text_func(self, line):
        """
        Required:
            line--line of text
        Return:
            nothing
        Logic:
            Three cases:
            1. in a list. Simply write inline
            2. Not in a list
                Text can mark the start of a paragraph.
                If already in a paragraph, check to see if any groups are waiting
                to be added. If so, use another method to write these groups.
        """
        if self.__place == 'in_list':
            self.__write_inline()
        else:
            if not self.__in_para:
                self.__in_para = 1
                self.__start_para_func(line)
            elif self.__groups_in_waiting[0] != 0:
                self.__write_inline()

    def __write_inline(self):
        """
        Required:
            nothing
        Returns
            Nothing
        Logic:
            Method for writing inline when text is found.
            Only write those groups that are "waiting", or that have no
            tags yet.
            First, slice the list self.__inline list to get just the groups
            in waiting.
            Iterate through this slice, which contains only dictionaries.
            Get the keys in each dictionary. If 'font-style' is in the keys,
            write a marker tag. (I will use this marker tag later when conerting
            hext text to utf8.)
            Write a tag for the inline values.
        """
        if self.__groups_in_waiting[0] != 0:
            last_index = -1 * self.__groups_in_waiting[0]
            inline_list = self.__inline_list[last_index:]
            if len(inline_list) <= 0:
                if self.__run_level > 3:
                    msg = 'self.__inline_list is %s\n' % self.__inline_list
                    raise self.__bug_handler(msg)
                self.__write_obj.write('error\n')
                self.__groups_in_waiting[0] = 0
                return
            for the_dict in inline_list:
                if the_dict['contains_inline']:
                    the_keys = the_dict.keys()
                    if 'font-style' in the_keys:
                        face = the_dict['font-style']
                        self.__write_obj.write('mi<mk<font______<%s\n' % face)
                    if 'caps' in the_keys:
                        value = the_dict['caps']
                        self.__write_obj.write('mi<mk<caps______<%s\n' % value)
                    self.__write_obj.write('mi<tg<open-att__<inline')
                    for the_key in the_keys:
                        if the_key != 'contains_inline':
                            self.__write_obj.write('<%s>%s' % (the_key, the_dict[the_key]))
                    self.__write_obj.write('\n')
        self.__groups_in_waiting[0] = 0

    def __end_para_func(self, line):
        """
        Requires:
            line -- line of text
        Returns:
            nothing
        Logic:
            Slice from the end the groups in waiting.
            Iterate through the list. If the dictionary contaings info, write
            a closing tag.
        """
        if not self.__in_para:
            return
        if self.__groups_in_waiting[0] == 0:
            inline_list = self.__inline_list
        else:
            last_index = -1 * self.__groups_in_waiting[0]
            inline_list = self.__inline_list[0:last_index]
        for the_dict in inline_list:
            contains_info = the_dict.get('contains_inline')
            if contains_info:
                the_keys = the_dict.keys()
                if 'font-style' in the_keys:
                    self.__write_obj.write('mi<mk<font-end__\n')
                if 'caps' in the_keys:
                    self.__write_obj.write('mi<mk<caps-end__\n')
                self.__write_obj.write('mi<tg<close_____<inline\n')
        self.__in_para = 0

    def __start_para_func(self, line):
        """
        Requires:
            line -- line of text
        Returns:
            nothing
        Logic:
            Iterate through the self.__inline_list to get each dict.
            If the dict containst inline info, get the keys.
            Iterate through the keys and print out the key and value.
        """
        for the_dict in self.__inline_list:
            contains_info = the_dict.get('contains_inline')
            if contains_info :
                the_keys = the_dict.keys()
                if 'font-style' in the_keys:
                    face = the_dict['font-style']
                    self.__write_obj.write('mi<mk<font______<%s\n' % face)
                if 'caps' in the_keys:
                    value = the_dict['caps']
                    self.__write_obj.write('mi<mk<caps______<%s\n' % value)
                self.__write_obj.write('mi<tg<open-att__<inline')
                for the_key in the_keys:
                    if the_key != 'contains_inline':
                        self.__write_obj.write('<%s>%s' % (the_key, the_dict[the_key]))
                self.__write_obj.write('\n')
        self.__groups_in_waiting[0] = 0

    def __found_field_func(self, line):
        """
        Just a default function to make sure I don't prematurely exit
        default state
        """
        pass

    def form_tags(self):
        """
        Requires:
            area--area to parse (list or non-list)
        Returns:
            nothing
        Logic:
            Read one line in at a time. Determine what action to take based on
            the state.
        """
        self.__initiate_values()
        with open_for_read(self.__file) as read_obj:
            with open_for_write(self.__write_to) as self.__write_obj:
                for line in read_obj:
                    token = line[0:-1]
                    self.__token_info = ''
                    if token == 'tx<mc<__________<rdblquote'\
                        or token == 'tx<mc<__________<ldblquote'\
                        or token == 'tx<mc<__________<lquote'\
                        or token == 'tx<mc<__________<rquote'\
                        or token == 'tx<mc<__________<emdash'\
                        or token == 'tx<mc<__________<endash'\
                        or token == 'tx<mc<__________<bullet':
                        self.__token_info = 'text'
                    else:
                        self.__token_info = line[:16]
                    self.__set_list_func(line)
                    action = self.__state_dict.get(self.__state)
                    if action is None:
                        sys.stderr.write('No matching state in module inline.py\n')
                        sys.stderr.write(self.__state + '\n')
                    action(line)
        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "inline.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)