mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-04-03 19:23:32 +02:00
Initial import
This commit is contained in:
427
ebook_converter/ebooks/rtf2xml/inline.py
Normal file
427
ebook_converter/ebooks/rtf2xml/inline.py
Normal file
@@ -0,0 +1,427 @@
|
||||
from __future__ import unicode_literals, absolute_import, print_function, division
|
||||
import sys, os
|
||||
|
||||
from calibre.ebooks.rtf2xml import copy
|
||||
from calibre.ptempfile import better_mktemp
|
||||
from . import open_for_read, open_for_write
|
||||
|
||||
"""
|
||||
States.
|
||||
1. default
|
||||
1. an open bracket ends this state.
|
||||
2. Text -- print out the text, along with any groups_in_waiting.
|
||||
3. closed bracket. Close groups
|
||||
2. after an open bracket
|
||||
1. The lack of a control word ends this state.
|
||||
2. paragraph end -- close out all tags
|
||||
3. footnote beg -- close out all tags
|
||||
"""
|
||||
|
||||
|
||||
class Inline:
    """
    Wrap character-formatted groups of a tokenized RTF stream in inline tags.

    The input file holds one token per line.  Each line is copied to the
    output unchanged; in addition, 'mi<tg<open-att__<inline' and
    'mi<tg<close_____<inline' tag lines (plus font and caps marker lines)
    are inserted around text that belongs to a bracket group carrying
    character-info ('cw<ci') control words.

    Logic:
        A two-state machine.  'default' reacts to brackets, text and
        paragraph/footnote markers.  'after_open_bracket' collects the
        character-info control words that directly follow an open bracket
        and returns to 'default' on the first token found in
        self.__after_open_bracket_dict.  Separate bookkeeping is kept for
        list text and for body text.
    """

    # Values of self.__place; only _IN_LIST is ever compared against.
    _IN_LIST = 'in_list'
    _NOT_IN_LIST = 'not_in_list'

    # Punctuation tokens that are dispatched like ordinary text even though
    # their 16-character prefix ('tx<mc<__________') is not in the tables.
    _TEXT_TOKENS = frozenset((
        'tx<mc<__________<rdblquote',
        'tx<mc<__________<ldblquote',
        'tx<mc<__________<lquote',
        'tx<mc<__________<rquote',
        'tx<mc<__________<emdash',
        'tx<mc<__________<endash',
        'tx<mc<__________<bullet',
    ))

    def __init__(self,
                 in_file,
                 bug_handler,
                 copy=None,
                 run_level=1,):
        """
        Required:
            'in_file'--path of the token file to parse; form_tags() replaces
            its contents with the result
            'bug_handler'--exception class raised for internal errors when
            run_level is above 3
        Optional:
            'copy'--whether to make a copy of result for debugging
            'run_level'--strictness level; above 3 internal inconsistencies
            raise bug_handler instead of being written to the output
        Returns:
            nothing
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__run_level = run_level
        self.__write_to = better_mktemp()

    def __initiate_values(self):
        """
        Initiate all values: dispatch tables, state and bookkeeping lists.
        """
        self.__state_dict = {
            'default': self.__default_func,
            'after_open_bracket': self.__after_open_bracket_func,
        }
        # Tokens that trigger an action in both states.
        shared_actions = {
            'ob<nu<open-brack': self.__found_open_bracket_func,
            'tx<nu<__________': self.__found_text_func,
            'tx<hx<__________': self.__found_text_func,
            'tx<ut<__________': self.__found_text_func,
            'mi<mk<inline-fld': self.__found_text_func,
            'text': self.__found_text_func,
            'cb<nu<clos-brack': self.__close_bracket_func,
            'mi<mk<par-end___': self.__end_para_func,
            'mi<mk<footnt-ope': self.__end_para_func,
            'mi<mk<footnt-ind': self.__end_para_func,
        }
        self.__default_dict = dict(shared_actions)
        self.__after_open_bracket_dict = dict(shared_actions)
        # A field control word also ends the 'after_open_bracket' state.
        self.__after_open_bracket_dict['cw<fd<field_____'] = \
            self.__found_field_func
        self.__state = 'default'
        self.__brac_count = 0  # counted but not read anywhere in this class
        # One dictionary per currently open bracket group, kept separately
        # for list text and for body text.
        self.__list_inline_list = []
        self.__body_inline_list = []
        # One-element lists so that the counter can be shared by reference:
        # element 0 is the number of groups opened but not yet written out.
        self.__groups_in_waiting_list = [0]
        self.__groups_in_waiting_body = [0]
        self.__groups_in_waiting = self.__groups_in_waiting_body
        self.__place = self._NOT_IN_LIST
        self.__inline_list = self.__body_inline_list
        self.__in_para = False  # not in paragraph
        # character info (the 10 characters after 'cw<ci<') => attribute name
        self.__char_dict = {
            'annotation': 'annotation',
            'blue______': 'blue',
            'bold______': 'bold',
            'caps______': 'caps',
            'char-style': 'character-style',
            'dbl-strike': 'double-strike-through',
            'emboss____': 'emboss',
            'engrave___': 'engrave',
            'font-color': 'font-color',
            'font-down_': 'subscript',
            'font-size_': 'font-size',
            'font-style': 'font-style',
            'font-up___': 'superscript',
            'footnot-mk': 'footnote-marker',
            'green_____': 'green',
            'hidden____': 'hidden',
            'italics___': 'italics',
            'outline___': 'outline',
            'red_______': 'red',
            'shadow____': 'shadow',
            'small-caps': 'small-caps',
            'strike-thr': 'strike-through',
            'subscript_': 'subscript',
            'superscrip': 'superscript',
            'underlined': 'underlined',
        }
        self.__caps_list = ['false']

    def __set_list_func(self, line):
        """
        Requires:
            line--line of text (unused; self.__token_info is inspected)
        Returns:
            nothing
        Logic:
            Switch the active group list and waiting counter between the
            list and body variants when a list-text begin or end marker is
            seen.
        """
        if self.__place == self._IN_LIST:
            if self.__token_info == 'mi<mk<lst-tx-end':
                self.__place = self._NOT_IN_LIST
                self.__inline_list = self.__body_inline_list
                self.__groups_in_waiting = self.__groups_in_waiting_body
        elif self.__token_info == 'mi<mk<lst-tx-beg':
            self.__place = self._IN_LIST
            self.__inline_list = self.__list_inline_list
            self.__groups_in_waiting = self.__groups_in_waiting_list

    def __default_func(self, line):
        """
        Requires:
            line-- line of text
        Returns:
            nothing
        Logic:
            Run the action registered for the current token, if any, and
            then always write the line.
        """
        action = self.__default_dict.get(self.__token_info)
        if action:
            action(line)
        self.__write_obj.write(line)

    def __found_open_bracket_func(self, line):
        """
        Requires:
            line -- current line of text
        Returns:
            nothing
        Logic:
            Change the state to 'after_open_bracket', count the new group as
            waiting and push an empty attribute dictionary for it.
        """
        self.__state = 'after_open_bracket'
        self.__brac_count += 1
        self.__groups_in_waiting[0] += 1
        self.__inline_list.append({'contains_inline': 0})

    def __after_open_bracket_func(self, line):
        """
        Requires:
            line --line of text
        Returns:
            nothing
        Logic:
            If the token is a control word for character info (cw<ci), use
            another method to add to the dictionary and stay in this state.
            Otherwise use the dictionary to get the appropriate function;
            a token found there ends this state.
            Always print out the line.
        """
        # calibre: bug in original function no diff between cw<ci and cw<pf
        if line[0:5] == 'cw<ci':
            self.__handle_control_word(line)
        else:
            action = self.__after_open_bracket_dict.get(self.__token_info)
            if action:
                self.__state = 'default'  # a non control word?
                action(line)
        self.__write_obj.write(line)

    def __handle_control_word(self, line):
        """
        Required:
            line --line of text, e.g. 'cw<ci<shadow____<nu<true'
        Returns:
            nothing
        Logic:
            Handle the control word for inline groups.
            Add the name - value pair to the dictionary of the innermost
            open group and flag that group as containing inline info.
            Font markers are not written here; they are written when the
            group's open tag is written.
        """
        char_info = line[6:16]
        # Strip only the line terminator, so that a final line without a
        # trailing newline does not lose the last character of its value.
        char_value = line[20:].rstrip('\n')
        name = self.__char_dict.get(char_info)
        # The active list can be empty if a list boundary marker swapped it
        # while still in the 'after_open_bracket' state; nothing to tag then.
        if name and self.__inline_list:
            current_group = self.__inline_list[-1]
            current_group['contains_inline'] = 1
            current_group[name] = char_value

    def __close_bracket_func(self, line):
        """
        Requires:
            line --line of text
        Returns:
            Nothing
        Logic:
            If there are no inline groups, do nothing.
            Look at the last dictionary in the inline groups.  If it holds
            inline info and its open tag has been written (no groups are
            waiting), write a close tag followed by the font and caps end
            markers.  Outside a list this is done only inside a paragraph.
            Finally drop the group and decrease the waiting counter.
        """
        if not self.__inline_list:
            # nothing to add
            return
        the_dict = self.__inline_list[-1]
        # in a list always close out; otherwise only if in a paragraph
        may_close = self.__place == self._IN_LIST or self.__in_para
        if the_dict.get('contains_inline') == 1 and may_close \
                and self.__groups_in_waiting[0] == 0:
            self.__write_obj.write('mi<tg<close_____<inline\n')
            if 'font-style' in the_dict:
                self.__write_obj.write('mi<mk<font-end__\n')
            if 'caps' in the_dict:
                self.__write_obj.write('mi<mk<caps-end__\n')
        self.__inline_list.pop()
        if self.__groups_in_waiting[0] != 0:
            self.__groups_in_waiting[0] -= 1

    def __found_text_func(self, line):
        """
        Required:
            line--line of text
        Return:
            nothing
        Logic:
            Two cases:
            1. In a list. Simply write inline.
            2. Not in a list.
               Text can mark the start of a paragraph; then tags are
               written for every open group.
               If already in a paragraph, check to see if any groups are
               waiting to be added. If so, use another method to write
               these groups.
        """
        if self.__place == self._IN_LIST:
            self.__write_inline()
        elif not self.__in_para:
            self.__in_para = True
            self.__start_para_func(line)
        elif self.__groups_in_waiting[0] != 0:
            self.__write_inline()

    def __write_open_tags(self, group_dicts):
        """
        Required:
            group_dicts--iterable of group attribute dictionaries
        Returns:
            nothing
        Logic:
            For every dictionary holding inline info, write the font and
            caps marker lines (when present) and then one open tag line
            listing all attributes in insertion order.
        """
        write = self.__write_obj.write
        for the_dict in group_dicts:
            if not the_dict.get('contains_inline'):
                continue
            if 'font-style' in the_dict:
                write('mi<mk<font______<%s\n' % the_dict['font-style'])
            if 'caps' in the_dict:
                write('mi<mk<caps______<%s\n' % the_dict['caps'])
            write('mi<tg<open-att__<inline')
            for the_key in the_dict:
                if the_key != 'contains_inline':
                    write('<%s>%s' % (the_key, the_dict[the_key]))
            write('\n')

    def __write_inline(self):
        """
        Required:
            nothing
        Returns
            Nothing
        Logic:
            Method for writing inline when text is found.
            Only write those groups that are "waiting", or that have no
            tags yet: slice self.__inline_list from the end by the number
            of waiting groups and write an open tag for each.
            If groups are counted as waiting but the list is empty, raise
            the bug handler when run_level is above 3; otherwise write an
            'error' line.  The waiting counter is reset in every case.
        """
        waiting = self.__groups_in_waiting[0]
        if waiting == 0:
            return
        pending = self.__inline_list[-waiting:]
        if not pending:
            if self.__run_level > 3:
                msg = 'self.__inline_list is %s\n' % self.__inline_list
                raise self.__bug_handler(msg)
            self.__write_obj.write('error\n')
            self.__groups_in_waiting[0] = 0
            return
        self.__write_open_tags(pending)
        self.__groups_in_waiting[0] = 0

    def __end_para_func(self, line):
        """
        Requires:
            line -- line of text
        Returns:
            nothing
        Logic:
            Do nothing when not in a paragraph.
            Leave out the groups in waiting at the end of the list; their
            open tags were never written.  Iterate through the rest. If the
            dictionary contains info, write the end markers and a closing
            tag.  The groups stay on the list.
        """
        if not self.__in_para:
            return
        waiting = self.__groups_in_waiting[0]
        if waiting == 0:
            written_groups = self.__inline_list
        else:
            written_groups = self.__inline_list[:-waiting]
        for the_dict in written_groups:
            if the_dict.get('contains_inline'):
                if 'font-style' in the_dict:
                    self.__write_obj.write('mi<mk<font-end__\n')
                if 'caps' in the_dict:
                    self.__write_obj.write('mi<mk<caps-end__\n')
                self.__write_obj.write('mi<tg<close_____<inline\n')
        self.__in_para = False

    def __start_para_func(self, line):
        """
        Requires:
            line -- line of text
        Returns:
            nothing
        Logic:
            Write an open tag for every group in self.__inline_list that
            contains inline info, then reset the waiting counter.
        """
        self.__write_open_tags(self.__inline_list)
        self.__groups_in_waiting[0] = 0

    def __found_field_func(self, line):
        """
        Just a default function to make sure I don't prematurely exit
        default state
        """
        pass

    def __process_line(self, line):
        """
        Requires:
            line -- one line of the token file
        Returns:
            nothing
        Logic:
            Work out the token info (the first 16 characters, or 'text' for
            the special punctuation tokens), update the list/body context
            and hand the line to the function for the current state.
        """
        # Strip only the terminator; a last line without one stays intact.
        token = line.rstrip('\n')
        if token in self._TEXT_TOKENS:
            self.__token_info = 'text'
        else:
            self.__token_info = line[:16]
        self.__set_list_func(line)
        action = self.__state_dict.get(self.__state)
        if action is None:
            sys.stderr.write('No matching state in module inline.py\n')
            sys.stderr.write(self.__state + '\n')
            # Every state function writes the line, so pass it through
            # rather than calling None and losing the rest of the file.
            self.__write_obj.write(line)
            return
        action(line)

    def form_tags(self):
        """
        Requires:
            nothing
        Returns:
            nothing
        Logic:
            Read one line in at a time. Determine what action to take based
            on the state.  The result is written to a temporary file, which
            then replaces the input file; a debugging copy named
            "inline.data" is made first when requested.
        """
        self.__initiate_values()
        with open_for_read(self.__file) as read_obj:
            with open_for_write(self.__write_to) as self.__write_obj:
                for line in read_obj:
                    self.__process_line(line)
        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "inline.data")
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)
|
||||
Reference in New Issue
Block a user