1
0
mirror of https://github.com/gryf/ebook-converter.git synced 2026-01-23 21:55:44 +01:00
Files
ebook-converter/ebook_converter/ebooks/conversion/plugins/rtf_input.py
gryf ce89f5c9d1 Use the real constants module.
This is a progressing refactor of the calibre code to make it more
readable, and to transform it into something more coherent.

In this patch, imports in some modules are changed so that each module's
namespace is no longer polluted with symbols from other modules, which
were often themselves imported from yet other modules. Yuck.
2020-05-29 17:04:53 +02:00

334 lines
12 KiB
Python

import glob
import os
import pkg_resources
import re
import textwrap
from lxml import etree
from ebook_converter.customize.conversion import InputFormatPlugin
from ebook_converter.customize.conversion import OptionRecommendation
from ebook_converter.polyglot.builtins import as_bytes
# Maps the border style names found in the rtf2xml output to the CSS
# ``border-style`` keyword used for them. The table is written grouped by
# CSS keyword and flattened into a plain ``{rtf_name: css_name}`` dict.
# 'tripple' (sic) is listed next to 'triple' on purpose.
border_style_map = {
    rtf_name: css_name
    for css_name, rtf_names in (
        ('solid', ('single',
                   'hairline',
                   'thick-thin-small',
                   'thin-thick-small',
                   'thin-thick-thin-small',
                   'thick-thin-medium',
                   'thin-thick-medium',
                   'thin-thick-thin-medium',
                   'thick-thin-large',
                   'thin-thick-thin-large')),
        ('double', ('double-thickness-border',
                    'double-border',
                    'tripple',
                    'triple')),
        ('outset', ('shadowed-border',
                    'outset')),
        ('dotted', ('dotted-border',
                    'dot-dash',
                    'dot-dot-dash')),
        ('dashed', ('dashed',
                    'dash-small')),
        ('inset', ('inset',
                   'emboss',
                   'engrave')),
        ('ridge', ('wavy',
                   'double-wavy',
                   'striped',
                   'frame')),
    )
    for rtf_name in rtf_names
}
class RTFInput(InputFormatPlugin):
    """Input plugin that converts an RTF file to XHTML.

    ``convert`` drives the whole process: the RTF is turned into XML by
    rtf2xml, embedded pictures are extracted, the XML is transformed to
    XHTML with the ``data/rtf.xsl`` stylesheet, and an OPF file describing
    the result is written. All output files are created in the current
    working directory.
    """

    name = 'RTF Input'
    author = 'Kovid Goyal'
    description = 'Convert RTF files to HTML'
    file_types = {'rtf'}
    commit_name = 'rtf_input'

    options = {OptionRecommendation(name='ignore_wmf', recommended_value=False,
                                    help='Ignore WMF images instead of '
                                    'replacing them with a placeholder '
                                    'image.')}

    def generate_xml(self, stream):
        """Run the rtf2xml parser and return the generated XML as bytes.

        :param stream: value handed to ``ParseRtf`` as ``in_file``;
            ``convert`` passes the path of the input file here.
        :return: raw content of ``dataxml.xml``, the file the parser is
            told to write in the current working directory.
        """
        from ebook_converter.ebooks.rtf2xml.ParseRtf import ParseRtf
        ofile = 'dataxml.xml'
        run_lev, debug_dir, indent_out = 1, None, 0
        if getattr(self.opts, 'debug_pipeline', None) is not None:
            # Debug mode needs a directory for the parser's debug output; if
            # it cannot be created, fall back to a normal run.
            try:
                os.mkdir('rtfdebug')
            except OSError:
                self.log.warn('Impossible to run RTFParser in debug mode')
            else:
                debug_dir = 'rtfdebug'
                run_lev = 4
                indent_out = 1
                self.log('Running RTFParser in debug mode')

        parser = ParseRtf(
            in_file=stream,
            out_file=ofile,
            # Convert symbol fonts to unicode equivalents. Default
            # is 1
            convert_symbol=1,
            # Convert Zapf fonts to unicode equivalents. Default
            # is 1.
            convert_zapf=1,
            # Convert Wingding fonts to unicode equivalents.
            # Default is 1.
            convert_wingdings=1,
            # Convert RTF caps to real caps.
            # Default is 1.
            convert_caps=1,
            # Indent resulting XML.
            # Default is 0 (no indent).
            indent=indent_out,
            # Form lists from RTF. Default is 1.
            form_lists=1,
            # Convert headings to sections. Default is 0.
            headings_to_sections=1,
            # Group paragraphs with the same style name. Default is 1.
            group_styles=1,
            # Group borders. Default is 1.
            group_borders=1,
            # Write or do not write paragraphs. Default is 0.
            empty_paragraphs=1,
            # Debug
            deb_dir=debug_dir,
            # Default encoding
            default_encoding=getattr(self.opts, 'input_encoding',
                                     'cp1252') or 'cp1252',
            # Run level
            run_level=run_lev,
        )
        parser.parse_rtf()
        with open(ofile, 'rb') as f:
            return f.read()

    def extract_images(self, picts):
        """Write every picture found in the file *picts* to its own file.

        The hex payload of each ``pict`` group is decoded and saved in the
        current working directory as ``NNNN.<fmt>``. ``<fmt>`` is whatever
        ``imghdr.what`` detects, or ``wmf`` when nothing is detected.

        :param picts: path of the file holding the ``pict`` groups.
        :return: dict mapping the 1-based picture number to the name of the
            file to use for it, after passing through ``convert_images``.
        """
        from ebook_converter.utils.imghdr import what
        from binascii import unhexlify
        self.log('Extracting images...')

        with open(picts, 'rb') as f:
            raw = f.read()
        groups = [g for g in re.findall(br'\{\\pict([^}]+)\}', raw) if g]
        # Everything that is not a hex digit (control words' punctuation,
        # whitespace, ...) is dropped before decoding.
        hex_pat = re.compile(br'[^a-fA-F0-9]')
        encs = [hex_pat.sub(b'', group) for group in groups]

        count = 0
        imap = {}
        for enc in encs:
            # unhexlify() requires an even number of digits.
            if len(enc) % 2 == 1:
                enc = enc[:-1]
            data = unhexlify(enc)
            fmt = what(None, data)
            if fmt is None:
                fmt = 'wmf'
            count += 1
            name = '%04d.%s' % (count, fmt)
            with open(name, 'wb') as f:
                f.write(data)
            imap[count] = name
        return self.convert_images(imap)

    def convert_images(self, imap):
        """Replace each file name in *imap* by its converted counterpart.

        Entries whose conversion raises are logged and left unchanged. The
        cached placeholder image (``self.default_img``) is reset first.

        :return: the same *imap* dict, updated in place.
        """
        self.default_img = None
        # Only existing keys are reassigned, so the dict does not change
        # size while it is being iterated.
        for count, val in imap.items():
            try:
                imap[count] = self.convert_image(val)
            except Exception:
                self.log.exception('Failed to convert', val)
        return imap

    def convert_image(self, name):
        """Return the file name to use in place of the image *name*.

        Names not ending in ``.wmf`` are returned unchanged. WMF files are
        rasterized; if that fails the error is logged and the result of
        ``replace_wmf`` is returned instead.
        """
        if not name.endswith('.wmf'):
            return name
        try:
            return self.rasterize_wmf(name)
        except Exception:
            self.log.exception('Failed to convert WMF image %r' % name)
        return self.replace_wmf(name)

    def replace_wmf(self, name):
        """Substitute a WMF file that could not be rasterized.

        With the ``ignore_wmf`` option set, the file is deleted and the
        marker ``'__REMOVE_ME__'`` is returned (``postprocess_book`` removes
        ``img`` elements carrying that marker). Otherwise a placeholder
        image is written next to it with a ``.jpg`` extension and that name
        is returned.
        """
        if self.opts.ignore_wmf:
            os.remove(name)
            return '__REMOVE_ME__'
        from ebook_converter.ebooks.covers import message_image
        # The placeholder is generated once and reused for later images.
        if self.default_img is None:
            self.default_img = message_image('Conversion of WMF images is not '
                                             'supported. Use Microsoft Word '
                                             'or OpenOffice to save this RTF '
                                             'file as HTML and convert that '
                                             'in calibre.')
        name = name.replace('.wmf', '.jpg')
        with open(name, 'wb') as f:
            f.write(self.default_img)
        return name

    def rasterize_wmf(self, name):
        """Unwrap the WMF file *name* and save the result as ``.png``.

        :return: the name of the written file.
        """
        from ebook_converter.utils.wmf.parse import wmf_unwrap
        with open(name, 'rb') as f:
            data = f.read()
        data = wmf_unwrap(data)
        name = name.replace('.wmf', '.png')
        with open(name, 'wb') as f:
            f.write(data)
        return name

    def write_inline_css(self, ic, border_styles):
        """Append the inline and border style rules to ``styles.css``.

        :param ic: object providing ``font_sizes`` and ``colors`` sequences;
            ``convert`` passes the ``InlineClass`` instance used by the XSLT
            transform.
        :param border_styles: dict mapping a CSS class name to its
            declarations, as returned by ``convert_borders``.
        """
        font_size_classes = ['span.fs%d { font-size: %spt }' % (i, x)
                             for i, x in enumerate(ic.font_sizes)]
        # Colors recorded as the string 'false' get no rule.
        color_classes = ['span.col%d { color: %s }' % (i, x)
                         for i, x in enumerate(ic.colors) if x != 'false']
        css = textwrap.dedent('''
        span.none {
            text-decoration: none; font-weight: normal;
            font-style: normal; font-variant: normal
        }
        span.italics { font-style: italic }
        span.bold { font-weight: bold }
        span.small-caps { font-variant: small-caps }
        span.underlined { text-decoration: underline }
        span.strike-through { text-decoration: line-through }
        ''')
        css += '\n' + '\n'.join(font_size_classes)
        css += '\n' + '\n'.join(color_classes)

        for cls, val in border_styles.items():
            css += '\n\n.%s {\n%s\n}' % (cls, val)

        with open('styles.css', 'ab') as f:
            f.write(css.encode('utf-8'))

    def convert_borders(self, doc):
        """Assign a CSS class to every ``cell`` element based on its borders.

        The ``border-cell-<side>-style``, ``-line-width`` and ``-color``
        attributes of each cell are translated to CSS declarations. Cells
        with identical declarations share one class, numbered in order of
        first appearance (``border_style0``, ``border_style1``, ...).

        :param doc: parsed XML tree; modified in place (``class`` attribute
            set on each cell).
        :return: dict mapping class name to its CSS declarations.
        """
        # CSS declarations -> class name, in order of first appearance.
        style_classes = {}
        for elem in doc.xpath(r'//*[local-name()="cell"]'):
            style = ['border-style: hidden', 'border-width: 1px',
                     'border-color: black']
            for x in ('bottom', 'top', 'left', 'right'):
                bs = elem.get('border-cell-%s-style' % x, None)
                if bs:
                    cbs = border_style_map.get(bs, 'solid')
                    style.append('border-%s-style: %s' % (x, cbs))
                bw = elem.get('border-cell-%s-line-width' % x, None)
                if bw:
                    style.append('border-%s-width: %spt' % (x, bw))
                bc = elem.get('border-cell-%s-color' % x, None)
                if bc:
                    style.append('border-%s-color: %s' % (x, bc))
            style = ';\n'.join(style)
            # The default is evaluated before insertion, so a new style gets
            # the next free index.
            cls = style_classes.setdefault(
                style, 'border_style%d' % len(style_classes))
            elem.set('class', cls)
        return {cls: style for style, cls in style_classes.items()}

    def convert(self, stream, options, file_ext, log,
                accelerators):
        """Convert the RTF file *stream* and return the path of the OPF.

        Writes ``index.xhtml``, ``styles.css``, any extracted images and
        ``metadata.opf`` to the current working directory.

        :param stream: open input file; its ``name`` is given to the RTF
            parser and the stream itself is read for metadata.
        :param options: conversion options, stored as ``self.opts``.
        :param file_ext: not used by this method.
        :param log: logger, stored as ``self.log``.
        :param accelerators: not used by this method.
        :return: absolute path of ``metadata.opf``.
        :raises ValueError: if the RTF parser raises
            ``RtfInvalidCodeException``.
        """
        from ebook_converter.ebooks.metadata.meta import get_metadata
        from ebook_converter.ebooks.metadata.opf2 import OPFCreator
        from ebook_converter.ebooks.rtf2xml.ParseRtf import \
            RtfInvalidCodeException
        from ebook_converter.ebooks.rtf.input import InlineClass
        self.opts = options
        self.log = log
        self.log('Converting RTF to XML...')
        try:
            xml = self.generate_xml(stream.name)
        except RtfInvalidCodeException as e:
            self.log.exception('Unable to parse RTF')
            raise ValueError('This RTF file has a feature calibre does not '
                             'support. Convert it to HTML first and then try '
                             'it.\n%s' % e) from e

        # Must exist even when no picture directory was produced, because it
        # is consulted below for every pict element in the XML.
        imap = {}
        d = glob.glob(os.path.join('*_rtf_pict_dir', 'picts.rtf'))
        if d:
            try:
                imap = self.extract_images(d[0])
            except Exception:
                self.log.exception('Failed to extract images...')

        self.log('Parsing XML...')
        doc = etree.fromstring(xml)
        border_styles = self.convert_borders(doc)
        # Point each pict element at the extracted file instead of its
        # number, when such a file exists.
        for pict in doc.xpath('//rtf:pict[@num]',
                              namespaces={'rtf':
                                          'http://rtf2xml.sourceforge.net/'}):
            num = int(pict.get('num'))
            name = imap.get(num, None)
            if name is not None:
                pict.set('num', name)

        self.log('Converting XML to HTML...')
        inline_class = InlineClass(self.log)
        with open(pkg_resources.resource_filename('ebook_converter',
                                                  'data/rtf.xsl')) as fobj:
            styledoc = etree.fromstring(fobj.read())
        extensions = {('calibre', 'inline-class'): inline_class}
        transform = etree.XSLT(styledoc, extensions=extensions)
        result = transform(doc)
        html = 'index.xhtml'
        res = as_bytes(transform.tostring(result))
        # Collapse runs of newlines into a single one.
        res = re.sub(b'\n+', b'\n', res)
        with open(html, 'wb') as f:
            f.write(res)
        self.write_inline_css(inline_class, border_styles)

        stream.seek(0)
        mi = get_metadata(stream, 'rtf')
        if not mi.title:
            mi.title = 'Unknown'
        if not mi.authors:
            mi.authors = ['Unknown']
        opf = OPFCreator(os.getcwd(), mi)
        opf.create_manifest([('index.xhtml', None)])
        opf.create_spine(['index.xhtml'])
        with open('metadata.opf', 'wb') as f:
            opf.render(f)
        return os.path.abspath('metadata.opf')

    def postprocess_book(self, oeb, opts, log):
        """Remove ``img`` elements whose ``src`` is ``'__REMOVE_ME__'``.

        The text following a removed image (its tail) is kept by attaching
        it to the parent's text or to the preceding sibling's tail.
        """
        for item in oeb.spine:
            for img in item.data.xpath('//*[local-name()="img" and '
                                       '@src="__REMOVE_ME__"]'):
                p = img.getparent()
                idx = p.index(img)
                p.remove(img)
                # The tail text went away with the element; re-attach it.
                if img.tail:
                    if idx == 0:
                        p.text = (p.text or '') + img.tail
                    else:
                        p[idx-1].tail = (p[idx-1].tail or '') + img.tail