mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-04-06 21:13:34 +02:00
Initial import
This commit is contained in:
263
ebook_converter/ebooks/rtf2xml/paragraphs.py
Normal file
263
ebook_converter/ebooks/rtf2xml/paragraphs.py
Normal file
@@ -0,0 +1,263 @@
|
||||
from __future__ import unicode_literals, absolute_import, print_function, division
|
||||
#########################################################################
|
||||
# #
|
||||
# #
|
||||
# copyright 2002 Paul Henry Tremblay #
|
||||
# #
|
||||
# This program is distributed in the hope that it will be useful, #
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
||||
# General Public License for more details. #
|
||||
# #
|
||||
# #
|
||||
#########################################################################
|
||||
import sys, os
|
||||
|
||||
from calibre.ebooks.rtf2xml import copy
|
||||
from calibre.ptempfile import better_mktemp
|
||||
from . import open_for_read, open_for_write
|
||||
|
||||
|
||||
class Paragraphs:
    """
    =================
    Purpose
    =================
    Write paragraph tags for a tokenized file. (This module won't be of any
    use to you unless you use it as part of the other modules.)

    -------------
    Method
    -------------
    RTF does not tell you when a paragraph begins. It only tells you when the
    paragraph ends.

    In order to make paragraphs out of this limited info, the parser starts in
    the body of the document and assumes it is not in a paragraph. It looks
    for clues to begin a paragraph. Text starts a paragraph; so does an inline
    field or list-text. If an end of paragraph marker (\\par) is found while
    no paragraph is open, this indicates a blank paragraph.

    Once a paragraph is found, the state changes to 'paragraph'. In this
    state, clues are looked for that end the paragraph. The end of paragraph
    marker (\\par) ends a paragraph. So do the end of a header/footer or
    footnote, the end of a field-block, the start of a block field, the end of
    a table cell, the start or end of a section, and the end of the body.

    A paragraph definition (\\pard) found inside an open paragraph does not
    close it; the token is replaced by a 'bogus-pard' marker line instead.
    """

    def __init__(self,
                 in_file,
                 bug_handler,
                 copy=None,
                 write_empty_para=1,
                 run_level=1,
                 ):
        """
        Required:
            'in_file' -- path of the tokenized file to parse; it is
                overwritten with the result by make_paragraphs()
            'bug_handler' -- object handed on to copy.Copy() when the
                result is copied back
        Optional:
            'copy' -- if true, also keep a copy of the result under the
                name "paragraphs.data" for debugging
            'write_empty_para' -- if true (the default), emit an empty
                paragraph tag for an end-of-paragraph token found outside
                a paragraph
            'run_level' -- stored on the instance; not read anywhere in
                this class
        Returns:
            nothing
        """
        self.__file = in_file
        self.__bug_handler = bug_handler
        self.__copy = copy
        self.__write_empty_para = write_empty_para
        self.__run_level = run_level
        # Output is written to a temporary file first and moved over the
        # input file at the end of make_paragraphs().
        self.__write_to = better_mktemp()

    def __initiate_values(self):
        """
        Initiate all values: the starting state, the marker lines that are
        written around paragraphs, and the dispatch tables that map a state
        or a token to the method handling it.
        """
        self.__state = 'before_body'
        self.__start_marker = 'mi<mk<para-start\n'  # outside para tags
        self.__start2_marker = 'mi<mk<par-start_\n'  # inside para tags
        self.__end2_marker = 'mi<mk<par-end___\n'  # inside para tags
        self.__end_marker = 'mi<mk<para-end__\n'  # outside para tags
        # state name -> handler for a line read while in that state
        self.__state_dict = {
            'before_body': self.__before_body_func,
            'not_paragraph': self.__not_paragraph_func,
            'paragraph': self.__paragraph_func,
        }
        # Tokens that matter while a paragraph is open.
        self.__paragraph_dict = {
            'cw<pf<par-end___': self.__close_para_func,  # end of paragraph
            'mi<mk<headi_-end': self.__close_para_func,  # end of header or footer
            'mi<mk<fldbk-end_': self.__close_para_func,  # end of field-block
            'mi<mk<body-close': self.__close_para_func,  # end of body
            'mi<mk<sect-close': self.__close_para_func,  # end of section
            'mi<mk<sect-start': self.__close_para_func,  # start of section
            'mi<mk<foot___clo': self.__close_para_func,  # end of footnote
            'cw<tb<cell______': self.__close_para_func,  # end of cell
            'mi<mk<par-in-fld': self.__close_para_func,  # start of block field
            'cw<pf<par-def___': self.__bogus_para__def_func,  # paragraph definition
        }
        # Tokens that matter while no paragraph is open.
        self.__not_paragraph_dict = {
            'tx<nu<__________': self.__start_para_func,
            'tx<hx<__________': self.__start_para_func,
            'tx<ut<__________': self.__start_para_func,
            'tx<mc<__________': self.__start_para_func,
            'mi<mk<inline-fld': self.__start_para_func,
            'mi<mk<para-beg__': self.__start_para_func,
            'cw<pf<par-end___': self.__empty_para_func,
            'mi<mk<pict-start': self.__start_para_func,
            'cw<pf<page-break': self.__empty_pgbk_func,  # page break
        }

    def __before_body_func(self, line):
        """
        Required:
            line -- line to parse
        Returns:
            nothing
        Logic:
            This function handles all the lines before the start of the
            body. Every line is copied to the output unchanged. Once the
            body-open marker is seen, the state is switched to
            'not_paragraph'.
        """
        if self.__token_info == 'mi<mk<body-open_':
            self.__state = 'not_paragraph'
        self.__write_obj.write(line)

    def __not_paragraph_func(self, line):
        """
        Required:
            line -- line to parse
        Returns:
            nothing
        Logic:
            This function handles all lines that are outside of a
            paragraph. It looks for clues that start a paragraph (or that
            stand for an empty paragraph or page break) and lets the
            matching method write its tags first. The line itself is
            always written afterwards.
        """
        action = self.__not_paragraph_dict.get(self.__token_info)
        if action:
            action(line)
        self.__write_obj.write(line)

    def __paragraph_func(self, line):
        """
        Required:
            line -- line to parse
        Returns:
            nothing
        Logic:
            This function handles all the lines that are in a paragraph.
            It looks for clues to the end of the paragraph. When a clue is
            found, the matching method takes over completely, including
            the decision whether the line itself is written. Any other
            line is copied to the output unchanged.
        """
        action = self.__paragraph_dict.get(self.__token_info)
        if action:
            action(line)
        else:
            self.__write_obj.write(line)

    def __start_para_func(self, line):
        """
        Requires:
            line -- line to parse (not written here; the caller writes it)
        Returns:
            nothing
        Logic:
            This function writes the beginning tags for a paragraph and
            changes the state to 'paragraph'.
        """
        self.__write_obj.write(self.__start_marker)  # marker for later parsing
        self.__write_obj.write('mi<tg<open______<para\n')
        self.__write_obj.write(self.__start2_marker)
        self.__state = 'paragraph'

    def __empty_para_func(self, line):
        """
        Requires:
            line -- line to parse (not written here; the caller writes it)
        Returns:
            nothing
        Logic:
            This function writes the empty tags for a paragraph.
            It does not do anything if self.__write_empty_para is false.
        """
        if self.__write_empty_para:
            self.__write_obj.write(self.__start_marker)  # marker for later parsing
            self.__write_obj.write('mi<tg<empty_____<para\n')
            self.__write_obj.write(self.__end_marker)  # marker for later parsing

    def __empty_pgbk_func(self, line):
        """
        Requires:
            line -- line to parse (not written here; the caller writes it)
        Returns:
            nothing
        Logic:
            This function writes the empty tag for a page break.
        """
        self.__write_obj.write('mi<tg<empty_____<page-break\n')

    def __close_para_func(self, line):
        """
        Requires:
            line -- line to parse
        Returns:
            nothing
        Logic:
            This function writes the end tags for a paragraph, then the
            line that triggered the close, and changes the state to
            'not_paragraph'.
        """
        self.__write_obj.write(self.__end2_marker)  # marker for later parser
        self.__write_obj.write('mi<tg<close_____<para\n')
        self.__write_obj.write(self.__end_marker)  # marker for later parser
        self.__write_obj.write(line)
        self.__state = 'not_paragraph'

    def __bogus_para__def_func(self, line):
        """
        Requires:
            line -- line to parse
        Returns:
            nothing
        Logic:
            If a \\pard occurs in a paragraph, I want to ignore it. (I
            believe.) The original line is not written; a 'bogus-pard'
            marker line is written in its place.
        """
        self.__write_obj.write('mi<mk<bogus-pard\n')

    def __handle_line(self, line):
        """
        Requires:
            line -- line to parse
        Returns:
            nothing
        Logic:
            Record the token (the first 16 characters of the line) and
            hand the line to the method registered for the current state.
            If the state has no handler, the problem is reported on
            stderr and the line is copied to the output unchanged, so
            that no input is lost and no 'NoneType is not callable' error
            is raised.
        """
        self.__token_info = line[:16]
        action = self.__state_dict.get(self.__state)
        if action is None:
            try:
                sys.stderr.write('no matching state in module paragraphs.py\n')
                sys.stderr.write(self.__state + '\n')
            except Exception:
                # Reporting is best effort only; an unusable stderr must
                # not abort the conversion.
                pass
            self.__write_obj.write(line)
            return
        action(line)

    def make_paragraphs(self):
        """
        Requires:
            nothing
        Returns:
            nothing (changes the original file)
        Logic:
            Read one line in at a time. Determine what action to take
            based on the state. If the state is before the body, look for
            the beginning of the body.
            When the body is found, change the state to 'not_paragraph'.
            The only other state is 'paragraph'.
            The result is written to a temporary file, optionally copied
            to "paragraphs.data" for debugging, moved over the input file
            with copy.Copy().rename(), and the temporary file is removed.
        """
        self.__initiate_values()
        with open_for_read(self.__file) as read_obj:
            with open_for_write(self.__write_to) as self.__write_obj:
                for line in read_obj:
                    self.__handle_line(line)
        copy_obj = copy.Copy(bug_handler=self.__bug_handler)
        if self.__copy:
            copy_obj.copy_file(self.__write_to, "paragraphs.data")
        # NOTE(review): the os.remove() below implies that rename() leaves
        # the temporary file in place (i.e. it copies) -- confirm against
        # the Copy class.
        copy_obj.rename(self.__write_to, self.__file)
        os.remove(self.__write_to)
|
||||
Reference in New Issue
Block a user