mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-01-06 03:04:11 +01:00
Here is the first batch of modules, which are needed for converting several formats to LRF. Some of the logic has been changed; more cleanups will follow.
574 lines
23 KiB
Python
574 lines
23 KiB
Python
from __future__ import absolute_import, division, print_function, unicode_literals
|
|
#########################################################################
|
|
# #
|
|
# #
|
|
# copyright 2002 Paul Henry Tremblay #
|
|
# #
|
|
# This program is distributed in the hope that it will be useful, #
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of #
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU #
|
|
# General Public License for more details. #
|
|
# #
|
|
# #
|
|
#########################################################################
|
|
# $Revision: 1.41 $
|
|
# $Date: 2006/03/24 23:50:07 $
|
|
import sys, os
|
|
|
|
from ebook_converter.ebooks.rtf2xml import headings_to_sections, \
|
|
line_endings, footnote, fields_small, default_encoding, \
|
|
make_lists, preamble_div, header, colors, group_borders, \
|
|
check_encoding, add_brackets, table, combine_borders, \
|
|
fields_large, process_tokens, hex_2_utf8, tokenize, \
|
|
delete_info, sections, check_brackets, styles, \
|
|
paragraph_def, convert_to_tags, output, copy, \
|
|
list_numbers, info, pict, table_info, fonts, paragraphs, \
|
|
body_styles, preamble_rest, group_styles, \
|
|
inline
|
|
from ebook_converter.ebooks.rtf2xml.old_rtf import OldRtf
|
|
from ebook_converter.polyglot.builtins import unicode_type
|
|
|
|
from . import open_for_read, open_for_write
|
|
|
|
"""
|
|
Here is an example script using the ParseRTF module directly
|
|
#!/usr/bin/env python2
|
|
|
|
def Handle_Main():
|
|
# Handles options and creates a parse object
|
|
parse_obj =ParseRtf.ParseRtf(
|
|
in_file = 'in.rtf',
|
|
# All values from here on are optional
|
|
# determine the output file
|
|
out_file = 'out.xml',
|
|
# determine the run level. The default is 1.
|
|
run_level = 3,
|
|
# The name of a debug directory, if you are running at
|
|
# run level 3 or higher.
|
|
deb_dir = 'debug_dir',
|
|
# Convert RTF caps to real caps.
|
|
# Default is 1.
|
|
convert_caps = 1,
|
|
# Indent resulting XML.
|
|
# Default is 0 (no indent).
|
|
indent = 1,
|
|
# Form lists from RTF. Default is 1.
|
|
form_lists = 1,
|
|
# Convert headings to sections. Default is 1.
|
|
headings_to_sections = 1,
|
|
# Group paragraphs with the same style name. Default is 1.
|
|
group_styles = 1,
|
|
# Group borders. Default is 1.
|
|
group_borders = 1,
|
|
# Write or do not write empty paragraphs. Default is 1.
|
|
empty_paragraphs = 0,
|
|
# Allow to use a custom default encoding as fallback
|
|
default_encoding = 'cp1252',
|
|
)
|
|
try:
|
|
parse_obj.parse_rtf()
|
|
except ParseRtf.InvalidRtfException as msg:
|
|
sys.stderr.write(msg)
|
|
except ParseRtf.RtfInvalidCodeException as msg:
|
|
sys.stderr.write(msg)
|
|
"""
|
|
|
|
|
|
class InvalidRtfException(Exception):
    """Signal that the input could not be handled as valid RTF."""
|
|
|
|
|
|
class RtfInvalidCodeException(Exception):
    """Signal a bug or misuse in the conversion program itself."""
|
|
|
|
|
|
class ParseRtf:
    """
    Main class for controlling the rest of the parsing.

    An instance validates its inputs on construction; parse_rtf() then
    copies the input to a working file and runs every conversion stage
    over that working file in a fixed order.
    """

    def __init__(self,
                 in_file,
                 out_file='',
                 out_dir=None,
                 dtd='',
                 deb_dir=None,
                 convert_symbol=None,
                 convert_wingdings=None,
                 convert_zapf=None,
                 convert_caps=None,
                 run_level=1,
                 indent=None,
                 replace_illegals=1,
                 form_lists=1,
                 headings_to_sections=1,
                 group_styles=1,
                 group_borders=1,
                 empty_paragraphs=1,
                 no_dtd=0,
                 char_data='',
                 default_encoding='cp1252',
                 ):
        """
        Store the conversion options and validate the paths.

        Requires:
            'in_file' --file to parse: either a path or an object with a
            'read' attribute.
        Possible parameters, but not necessary:
            'out_file' --handed to the picture and output stages.
            'out_dir' --handed to the output stage; when given it must
            be an existing directory.
            'dtd' --path to the dtd, handed to the tag conversion stage.
            'deb_dir' --debug directory. When given it must be an
            existing directory; the working file is copied there before
            parsing and every stage is created with copy enabled.
            'convert_symbol', 'convert_wingdings', 'convert_zapf',
            'convert_caps' --handed to the hex-to-utf8 stage when it
            converts the body.
            'run_level' --verbosity/strictness. Above 1 notes are written
            to standard error, above 2 brackets are checked after most
            stages, above 5 an older RTF file is treated as an error.
            'indent', 'no_dtd' --handed to the tag conversion stage.
            'replace_illegals' --handed to the line ending stage.
            'form_lists', 'headings_to_sections', 'group_styles',
            'group_borders' --when true, the stage of that name is run.
            'empty_paragraphs' --handed to the paragraph stage as
            'write_empty_para'.
            'char_data' --file containing character maps.
            'default_encoding' --fallback encoding handed to the
            default encoding detection.
        Returns: Nothing
        Raises: RtfInvalidCodeException if 'in_file' is None or a path
        that does not exist, or if 'out_dir' / 'deb_dir' is given but is
        not a directory.
        """
        self.__file = in_file
        self.__out_file = out_file
        self.__out_dir = out_dir
        self.__temp_dir = out_dir
        self.__dtd_path = dtd
        self.__check_file(in_file, "file_to_parse")
        self.__char_data = char_data
        self.__debug_dir = deb_dir
        self.__check_dir(self.__temp_dir)
        # __check_dir returns 1 for an existing directory and None when no
        # directory was given, so this doubles as the "copy" switch.
        self.__copy = self.__check_dir(self.__debug_dir)
        self.__convert_caps = convert_caps
        self.__convert_symbol = convert_symbol
        self.__convert_wingdings = convert_wingdings
        self.__convert_zapf = convert_zapf
        self.__run_level = run_level
        self.__exit_level = 0
        self.__indent = indent
        self.__replace_illegals = replace_illegals
        self.__form_lists = form_lists
        self.__headings_to_sections = headings_to_sections
        self.__group_styles = group_styles
        self.__group_borders = group_borders
        self.__empty_paragraphs = empty_paragraphs
        self.__no_dtd = no_dtd
        self.__default_encoding = default_encoding

    def __check_file(self, the_file, type):
        """
        Check to see if files exist.

        File-like objects (anything with a 'read' attribute) are accepted
        as they are. A missing value is only an error for the file to
        parse; any other path must exist.
        Raises: RtfInvalidCodeException
        """
        if hasattr(the_file, 'read'):
            return
        if the_file is None:
            if type == "file_to_parse":
                msg = "\nYou must provide a file for the script to work"
                raise RtfInvalidCodeException(msg)
            return
        if not os.path.exists(the_file):
            msg = "\nThe file '%s' cannot be found" % the_file
            raise RtfInvalidCodeException(msg)

    def __check_dir(self, the_dir):
        """
        Check to see if directory exists.

        Returns: None when no directory was given, 1 when it exists.
        Raises: RtfInvalidCodeException when it is not a directory.
        """
        if not the_dir:
            return
        if not os.path.isdir(the_dir):
            msg = "\n%s is not a directory" % the_dir
            raise RtfInvalidCodeException(msg)
        return 1

    def parse_rtf(self):
        """
        Parse the file by calling on other classes.

        Requires:
            Nothing
        Returns:
            The exit level (0 unless the line ending stage reported a
            higher code). The converted document is written by the
            output stage using the 'out_file' and 'out_dir' given when
            the instance was created.
        Raises:
            InvalidRtfException if token processing fails.
            RtfInvalidCodeException if the file looks like older RTF and
            the run level is above 5.
        """
        self.__temp_file = self.__make_temp_file(self.__file)
        # Keyword arguments shared, unchanged, by most of the stages below.
        stage_args = dict(
            in_file=self.__temp_file,
            bug_handler=RtfInvalidCodeException,
            copy=self.__copy,
            run_level=self.__run_level,
        )
        # if the self.__deb_dir is true, then create a copy object,
        # set the directory to write to, remove files, and copy
        # the new temporary file to this directory
        if self.__debug_dir:
            copy_obj = copy.Copy(
                bug_handler=RtfInvalidCodeException,
            )
            copy_obj.set_dir(self.__debug_dir)
            copy_obj.remove_files()
            copy_obj.copy_file(self.__temp_file, "original_file")
        # Function to check if bracket are well handled
        if self.__debug_dir or self.__run_level > 2:
            self.__check_brack_obj = check_brackets.CheckBrackets(
                file=self.__temp_file,
                bug_handler=RtfInvalidCodeException,
            )
        # convert Macintosh and Windows line endings to Unix line endings
        line_obj = line_endings.FixLineEndings(
            replace_illegals=self.__replace_illegals,
            **stage_args
        )
        return_value = line_obj.fix_endings()
        self.__return_code(return_value)
        tokenize_obj = tokenize.Tokenize(**stage_args)
        tokenize_obj.tokenize()
        process_tokens_obj = process_tokens.ProcessTokens(
            exception_handler=InvalidRtfException,
            **stage_args
        )
        try:
            process_tokens_obj.process_tokens()
        except InvalidRtfException as err:
            # Check to see if the file is correctly encoded
            encode_obj = default_encoding.DefaultEncoding(
                in_file=self.__temp_file,
                run_level=self.__run_level,
                bug_handler=RtfInvalidCodeException,
                check_raw=True,
                default_encoding=self.__default_encoding,
            )
            platform, code_page, default_font_num = \
                encode_obj.find_default_encoding()
            check_encoding_obj = check_encoding.CheckEncoding(
                bug_handler=RtfInvalidCodeException,
            )
            enc = encode_obj.get_codepage()
            # TODO: to check if cp is a good idea or if I should use a dict
            # to convert
            enc = 'cp' + enc
            msg = '%s\nException in token processing' % unicode_type(err)
            if check_encoding_obj.check_encoding(self.__file, enc):
                # Build a printable name: decode bytes paths instead of
                # formatting a bytes object into the message, and do not
                # assume the input is a string (it may be file-like).
                if isinstance(self.__file, bytes):
                    file_name = self.__file.decode('utf-8', 'replace')
                else:
                    file_name = getattr(self.__file, 'name', self.__file)
                msg += '\nFile %s does not appear to be correctly ' \
                    'encoded.\n' % file_name
            try:
                os.remove(self.__temp_file)
            except OSError:
                pass
            raise InvalidRtfException(msg)
        delete_info_obj = delete_info.DeleteInfo(**stage_args)
        # found destination means {\*\destination
        # if found, the RTF should be newer RTF
        found_destination = delete_info_obj.delete_info()
        self.__bracket_match('delete_data_info')
        # put picts in a separate file
        pict_obj = pict.Pict(
            orig_file=self.__file,
            out_file=self.__out_file,
            **stage_args
        )
        pict_obj.process_pict()
        self.__bracket_match('pict_data_info')
        combine_obj = combine_borders.CombineBorders(**stage_args)
        combine_obj.combine_borders()
        self.__bracket_match('combine_borders_info')
        footnote_obj = footnote.Footnote(**stage_args)
        footnote_obj.separate_footnotes()
        self.__bracket_match('separate_footnotes_info')
        header_obj = header.Header(**stage_args)
        header_obj.separate_headers()
        self.__bracket_match('separate_headers_info')
        list_numbers_obj = list_numbers.ListNumbers(**stage_args)
        list_numbers_obj.fix_list_numbers()
        self.__bracket_match('list_number_info')
        preamble_div_obj = preamble_div.PreambleDiv(**stage_args)
        list_of_lists = preamble_div_obj.make_preamble_divisions()
        self.__bracket_match('make_preamble_divisions')
        encode_obj = default_encoding.DefaultEncoding(
            in_file=self.__temp_file,
            run_level=self.__run_level,
            bug_handler=RtfInvalidCodeException,
            default_encoding=self.__default_encoding,
        )
        platform, code_page, default_font_num = \
            encode_obj.find_default_encoding()
        hex2utf_obj = hex_2_utf8.Hex2Utf8(
            area_to_convert='preamble',
            char_file=self.__char_data,
            default_char_map=code_page,
            invalid_rtf_handler=InvalidRtfException,
            **stage_args
        )
        hex2utf_obj.convert_hex_2_utf8()
        self.__bracket_match('hex_2_utf_preamble')
        fonts_obj = fonts.Fonts(
            default_font_num=default_font_num,
            **stage_args
        )
        special_font_dict = fonts_obj.convert_fonts()
        self.__bracket_match('fonts_info')
        color_obj = colors.Colors(**stage_args)
        color_obj.convert_colors()
        self.__bracket_match('colors_info')
        style_obj = styles.Styles(**stage_args)
        style_obj.convert_styles()
        self.__bracket_match('styles_info')
        info_obj = info.Info(**stage_args)
        info_obj.fix_info()
        default_font = special_font_dict.get('default-font')
        preamble_rest_obj = preamble_rest.Preamble(
            file=self.__temp_file,
            copy=self.__copy,
            bug_handler=RtfInvalidCodeException,
            platform=platform,
            default_font=default_font,
            code_page=code_page,
        )
        preamble_rest_obj.fix_preamble()
        self.__bracket_match('preamble_rest_info')
        old_rtf_obj = OldRtf(
            in_file=self.__temp_file,
            bug_handler=RtfInvalidCodeException,
            run_level=self.__run_level,
        )
        # RTF can actually have destination groups and old RTF.
        # BAH!
        old_rtf = old_rtf_obj.check_if_old_rtf()
        if old_rtf:
            if self.__run_level > 5:
                msg = 'Older RTF\nself.__run_level is "%s"\n' \
                    % self.__run_level
                raise RtfInvalidCodeException(msg)
            if self.__run_level > 1:
                sys.stderr.write('File could be older RTF...\n')
            if found_destination:
                if self.__run_level > 1:
                    sys.stderr.write(
                        'File also has newer RTF.\n'
                        'Will do the best to convert...\n'
                    )
            add_brackets_obj = add_brackets.AddBrackets(**stage_args)
            add_brackets_obj.add_brackets()
        fields_small_obj = fields_small.FieldsSmall(**stage_args)
        fields_small_obj.fix_fields()
        self.__bracket_match('fix_small_fields_info')
        fields_large_obj = fields_large.FieldsLarge(**stage_args)
        fields_large_obj.fix_fields()
        self.__bracket_match('fix_large_fields_info')
        sections_obj = sections.Sections(**stage_args)
        sections_obj.make_sections()
        self.__bracket_match('sections_info')
        paragraphs_obj = paragraphs.Paragraphs(
            write_empty_para=self.__empty_paragraphs,
            **stage_args
        )
        paragraphs_obj.make_paragraphs()
        self.__bracket_match('paragraphs_info')
        # NOTE(review): unlike the .get() lookup above, this raises
        # KeyError when 'default-font' is missing -- confirm which of the
        # two behaviours is intended.
        default_font = special_font_dict['default-font']
        paragraph_def_obj = paragraph_def.ParagraphDef(
            default_font=default_font,
            **stage_args
        )
        list_of_styles = paragraph_def_obj.make_paragraph_def()
        body_styles_obj = body_styles.BodyStyles(
            list_of_styles=list_of_styles,
            **stage_args
        )
        body_styles_obj.insert_info()
        self.__bracket_match('body_styles_info')
        self.__bracket_match('paragraph_def_info')
        table_obj = table.Table(**stage_args)
        table_data = table_obj.make_table()
        self.__bracket_match('table_info')
        table_info_obj = table_info.TableInfo(
            table_data=table_data,
            **stage_args
        )
        table_info_obj.insert_info()
        self.__bracket_match('table__data_info')
        if self.__form_lists:
            make_list_obj = make_lists.MakeLists(
                headings_to_sections=self.__headings_to_sections,
                list_of_lists=list_of_lists,
                **stage_args
            )
            make_list_obj.make_lists()
            self.__bracket_match('form_lists_info')
        if self.__headings_to_sections:
            headings_to_sections_obj = \
                headings_to_sections.HeadingsToSections(**stage_args)
            headings_to_sections_obj.make_sections()
            self.__bracket_match('headings_to_sections_info')
        if self.__group_styles:
            group_styles_obj = group_styles.GroupStyles(
                wrap=1,
                **stage_args
            )
            group_styles_obj.group_styles()
            self.__bracket_match('group_styles_info')
        if self.__group_borders:
            group_borders_obj = group_borders.GroupBorders(
                wrap=1,
                **stage_args
            )
            group_borders_obj.group_borders()
            self.__bracket_match('group_borders_info')
        inline_obj = inline.Inline(**stage_args)
        inline_obj.form_tags()
        self.__bracket_match('inline_info')
        # Re-use the preamble converter for the body of the document.
        hex2utf_obj.update_values(
            file=self.__temp_file,
            area_to_convert='body',
            copy=self.__copy,
            char_file=self.__char_data,
            convert_caps=self.__convert_caps,
            convert_symbol=self.__convert_symbol,
            convert_wingdings=self.__convert_wingdings,
            convert_zapf=self.__convert_zapf,
            symbol=1,
            wingdings=1,
            dingbats=1,
        )
        hex2utf_obj.convert_hex_2_utf8()
        header_obj.join_headers()
        footnote_obj.join_footnotes()
        tags_obj = convert_to_tags.ConvertToTags(
            dtd_path=self.__dtd_path,
            indent=self.__indent,
            no_dtd=self.__no_dtd,
            encoding=encode_obj.get_codepage(),
            **stage_args
        )
        tags_obj.convert_to_tags()
        output_obj = output.Output(
            file=self.__temp_file,
            orig_file=self.__file,
            output_dir=self.__out_dir,
            out_file=self.__out_file,
        )
        output_obj.output()
        os.remove(self.__temp_file)
        return self.__exit_level

    def __bracket_match(self, file_name):
        """
        Report unbalanced brackets after a stage.

        Only active when the run level is above 2; 'file_name' is the
        label of the stage that has just run and is used in the message
        written to standard error.
        """
        if self.__run_level <= 2:
            return
        good_br, msg = self.__check_brack_obj.check_brackets()
        if not good_br:
            msg = '%s in file %s' % (msg, file_name)
            print(msg, file=sys.stderr)

    def __return_code(self, num):
        """
        Raise the stored exit level to 'num' if 'num' is higher.

        None is ignored. The value is stored as an int so that later
        comparisons never mix strings and numbers.
        """
        if num is None:
            return
        num = int(num)
        if num > self.__exit_level:
            self.__exit_level = num

    def __make_temp_file(self, file):
        """
        Make a temporary file to parse.

        Copies 'file' (a path, or an object with a 'read' attribute) line
        by line into the working file and returns the working file's
        name. A reader opened here is closed again; a reader supplied by
        the caller is left open.
        """
        write_file = "rtf_write_file"
        opened_here = not hasattr(file, 'read')
        read_obj = open_for_read(file) if opened_here else file
        try:
            with open_for_write(write_file) as write_obj:
                for line in read_obj:
                    write_obj.write(line)
        finally:
            if opened_here:
                read_obj.close()
        return write_file
|