mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-01-24 05:55:46 +01:00
This is an ongoing refactor of the calibre code to make it more readable and to transform it into something more coherent. This patch changes the imports in some modules so that they no longer pollute each module's namespace with symbols from other modules, which were often themselves imported from yet other modules. Yuck.
334 lines
12 KiB
Python
334 lines
12 KiB
Python
import glob
import os
import re
import textwrap

from lxml import etree
import pkg_resources

from ebook_converter.customize.conversion import InputFormatPlugin
from ebook_converter.customize.conversion import OptionRecommendation
from ebook_converter.polyglot.builtins import as_bytes


# Maps the border style names found in the ``border-cell-*-style`` attributes
# of the intermediate XML to CSS ``border-style`` keywords. Names missing from
# this table are rendered as 'solid' by RTFInput.convert_borders().
border_style_map = {
    'single': 'solid',
    'double-thickness-border': 'double',
    'shadowed-border': 'outset',
    'double-border': 'double',
    'dotted-border': 'dotted',
    'dashed': 'dashed',
    'hairline': 'solid',
    'inset': 'inset',
    'dash-small': 'dashed',
    'dot-dash': 'dotted',
    'dot-dot-dash': 'dotted',
    'outset': 'outset',
    'tripple': 'double',
    'triple': 'double',
    'thick-thin-small': 'solid',
    'thin-thick-small': 'solid',
    'thin-thick-thin-small': 'solid',
    'thick-thin-medium': 'solid',
    'thin-thick-medium': 'solid',
    'thin-thick-thin-medium': 'solid',
    'thick-thin-large': 'solid',
    'thin-thick-thin-large': 'solid',
    'wavy': 'ridge',
    'double-wavy': 'ridge',
    'striped': 'ridge',
    'emboss': 'inset',
    'engrave': 'inset',
    'frame': 'ridge',
}


class RTFInput(InputFormatPlugin):
    """Input plugin that converts an RTF file to XHTML plus an OPF file.

    All intermediate and final files (``dataxml.xml``, the extracted images,
    ``index.xhtml``, ``styles.css`` and ``metadata.opf``) are addressed with
    relative paths, so they are created in the current working directory.
    """

    name = 'RTF Input'
    author = 'Kovid Goyal'
    description = 'Convert RTF files to HTML'
    file_types = {'rtf'}
    commit_name = 'rtf_input'

    options = {OptionRecommendation(name='ignore_wmf', recommended_value=False,
                                    help='Ignore WMF images instead of '
                                    'replacing them with a placeholder '
                                    'image.')}

    def generate_xml(self, stream):
        """Run rtf2xml on the input and return the produced XML as bytes.

        :param stream: value passed to ``ParseRtf`` as ``in_file``;
                       ``convert()`` passes the path of the input file.
        :return: raw content of the ``dataxml.xml`` file written by the
                 parser.
        """
        from ebook_converter.ebooks.rtf2xml.ParseRtf import ParseRtf
        ofile = 'dataxml.xml'
        run_lev, debug_dir, indent_out = 1, None, 0
        if getattr(self.opts, 'debug_pipeline', None) is not None:
            # Debug mode is best effort: if the debug directory cannot be
            # created (e.g. it already exists) fall back to a normal run.
            try:
                os.mkdir('rtfdebug')
                debug_dir = 'rtfdebug'
                run_lev = 4
                indent_out = 1
                self.log('Running RTFParser in debug mode')
            except Exception:
                self.log.warn('Impossible to run RTFParser in debug mode')
        parser = ParseRtf(
            in_file=stream,
            out_file=ofile,
            # Convert symbol fonts to unicode equivalents. Default
            # is 1
            convert_symbol=1,

            # Convert Zapf fonts to unicode equivalents. Default
            # is 1.
            convert_zapf=1,

            # Convert Wingding fonts to unicode equivalents.
            # Default is 1.
            convert_wingdings=1,

            # Convert RTF caps to real caps.
            # Default is 1.
            convert_caps=1,

            # Indent resulting XML.
            # Default is 0 (no indent).
            indent=indent_out,

            # Form lists from RTF. Default is 1.
            form_lists=1,

            # Convert headings to sections. Default is 0.
            headings_to_sections=1,

            # Group paragraphs with the same style name. Default is 1.
            group_styles=1,

            # Group borders. Default is 1.
            group_borders=1,

            # Write or do not write paragraphs. Default is 0.
            empty_paragraphs=1,

            # Debug
            deb_dir=debug_dir,

            # Default encoding
            default_encoding=getattr(self.opts, 'input_encoding',
                                     'cp1252') or 'cp1252',

            # Run level
            run_level=run_lev,
        )
        parser.parse_rtf()
        with open(ofile, 'rb') as f:
            return f.read()

    def extract_images(self, picts):
        """Write every picture group found in *picts* to its own file.

        The hexadecimal payload of each ``pict`` group is decoded and saved
        as ``NNNN.<fmt>``, where ``NNNN`` is a 1-based counter and ``<fmt>``
        is the type reported by ``imghdr.what`` ('wmf' when the type is not
        recognised).

        :param picts: path of the RTF file holding the picture groups.
        :return: dict mapping the picture counter to the image file name, as
                 returned by ``convert_images()``.
        """
        from ebook_converter.utils.imghdr import what
        from binascii import unhexlify
        self.log('Extracting images...')

        with open(picts, 'rb') as f:
            raw = f.read()
        groups = [grp for grp in re.findall(br'\{\\pict([^}]+)\}', raw)
                  if grp]
        hex_pat = re.compile(br'[^a-fA-F0-9]')
        encs = [hex_pat.sub(b'', grp) for grp in groups]

        imap = {}
        for count, enc in enumerate(encs, 1):
            # unhexlify() needs an even number of digits; drop a dangling
            # nibble instead of failing on the whole picture.
            if len(enc) % 2 == 1:
                enc = enc[:-1]
            data = unhexlify(enc)
            fmt = what(None, data)
            if fmt is None:
                fmt = 'wmf'
            name = '%04d.%s' % (count, fmt)
            with open(name, 'wb') as f:
                f.write(data)
            imap[count] = name
        return self.convert_images(imap)

    def convert_images(self, imap):
        """Run ``convert_image()`` on every value of *imap*, in place.

        Entries whose conversion raises keep their original file name; the
        failure is logged.

        :param imap: dict mapping picture counter to image file name.
        :return: the same dict, updated.
        """
        # Cache for the placeholder image built lazily by replace_wmf().
        self.default_img = None
        # Only values of existing keys are replaced, so the dict size does
        # not change while iterating.
        for count, val in imap.items():
            try:
                imap[count] = self.convert_image(val)
            except Exception:
                self.log.exception('Failed to convert', val)
        return imap

    def convert_image(self, name):
        """Return the file name to use in place of image *name*.

        Non-WMF images are returned untouched. WMF images are rasterized;
        when that fails the result of ``replace_wmf()`` is returned instead.
        """
        if not name.endswith('.wmf'):
            return name
        try:
            return self.rasterize_wmf(name)
        except Exception:
            self.log.exception('Failed to convert WMF image %r' % name)
        return self.replace_wmf(name)

    def replace_wmf(self, name):
        """Handle a WMF image that could not be rasterized.

        With the ``ignore_wmf`` option set, the file is deleted and the
        marker ``'__REMOVE_ME__'`` is returned (see ``postprocess_book()``).
        Otherwise a placeholder image carrying an explanatory message is
        written next to it with a ``.jpg`` extension.

        :return: the marker string or the placeholder file name.
        """
        if self.opts.ignore_wmf:
            os.remove(name)
            return '__REMOVE_ME__'
        from ebook_converter.ebooks.covers import message_image
        if self.default_img is None:
            self.default_img = message_image('Conversion of WMF images is not '
                                             'supported. Use Microsoft Word '
                                             'or OpenOffice to save this RTF '
                                             'file as HTML and convert that '
                                             'in calibre.')
        name = name.replace('.wmf', '.jpg')
        with open(name, 'wb') as f:
            f.write(self.default_img)
        return name

    def rasterize_wmf(self, name):
        """Unwrap the WMF file *name* and save the result as ``.png``.

        :return: name of the written file.
        """
        from ebook_converter.utils.wmf.parse import wmf_unwrap
        with open(name, 'rb') as f:
            data = f.read()
        data = wmf_unwrap(data)
        name = name.replace('.wmf', '.png')
        with open(name, 'wb') as f:
            f.write(data)
        return name

    def write_inline_css(self, ic, border_styles):
        """Append the generated CSS rules to ``styles.css``.

        :param ic: object exposing ``font_sizes`` and ``colors`` sequences;
                   ``convert()`` passes the ``InlineClass`` instance used as
                   XSLT extension.
        :param border_styles: dict mapping CSS class name to declarations,
                              as returned by ``convert_borders()``.
        """
        font_size_classes = ['span.fs%d { font-size: %spt }' % (i, x)
                             for i, x in enumerate(ic.font_sizes)]
        color_classes = ['span.col%d { color: %s }' % (i, x)
                         for i, x in enumerate(ic.colors) if x != 'false']
        base_css = textwrap.dedent('''
        span.none {
            text-decoration: none; font-weight: normal;
            font-style: normal; font-variant: normal
        }

        span.italics { font-style: italic }

        span.bold { font-weight: bold }

        span.small-caps { font-variant: small-caps }

        span.underlined { text-decoration: underline }

        span.strike-through { text-decoration: line-through }

        ''')
        parts = [base_css,
                 '\n', '\n'.join(font_size_classes),
                 '\n', '\n'.join(color_classes)]
        parts.extend('\n\n.%s {\n%s\n}' % (cls, val)
                     for cls, val in border_styles.items())

        with open('styles.css', 'ab') as f:
            f.write(''.join(parts).encode('utf-8'))

    def convert_borders(self, doc):
        """Assign a CSS class describing its borders to every table cell.

        Each ``cell`` element of *doc* gets a ``class`` attribute of the
        form ``border_style<N>``; cells with identical border declarations
        share the same class.

        :param doc: parsed XML tree produced by ``generate_xml()``.
        :return: dict mapping class name to its CSS declarations.
        """
        style_map = {}
        # Declarations -> class name; numbering follows first appearance.
        class_by_style = {}
        for elem in doc.xpath(r'//*[local-name()="cell"]'):
            style = ['border-style: hidden', 'border-width: 1px',
                     'border-color: black']
            for x in ('bottom', 'top', 'left', 'right'):
                bs = elem.get('border-cell-%s-style' % x, None)
                if bs:
                    cbs = border_style_map.get(bs, 'solid')
                    style.append('border-%s-style: %s' % (x, cbs))
                bw = elem.get('border-cell-%s-line-width' % x, None)
                if bw:
                    style.append('border-%s-width: %spt' % (x, bw))
                bc = elem.get('border-cell-%s-color' % x, None)
                if bc:
                    style.append('border-%s-color: %s' % (x, bc))
            style = ';\n'.join(style)
            cls = class_by_style.get(style)
            if cls is None:
                cls = 'border_style%d' % len(class_by_style)
                class_by_style[style] = cls
                style_map[cls] = style
            elem.set('class', cls)
        return style_map

    def convert(self, stream, options, file_ext, log,
                accelerators):
        """Convert the RTF file behind *stream* to XHTML and an OPF file.

        :param stream: open input file; its ``name`` is handed to the RTF
                       parser and the stream itself is used to read the
                       metadata.
        :param options: conversion options, stored as ``self.opts``.
        :param file_ext: unused here.
        :param log: logger, stored as ``self.log``.
        :param accelerators: unused here.
        :return: absolute path of the generated ``metadata.opf``.
        :raises ValueError: if the RTF parser reports an invalid code.
        """
        from ebook_converter.ebooks.metadata.meta import get_metadata
        from ebook_converter.ebooks.metadata.opf2 import OPFCreator
        from ebook_converter.ebooks.rtf2xml.ParseRtf import \
            RtfInvalidCodeException
        from ebook_converter.ebooks.rtf.input import InlineClass
        self.opts = options
        self.log = log
        self.log('Converting RTF to XML...')
        try:
            xml = self.generate_xml(stream.name)
        except RtfInvalidCodeException as e:
            self.log.exception('Unable to parse RTF')
            raise ValueError('This RTF file has a feature calibre does not '
                             'support. Convert it to HTML first and then try '
                             'it.\n%s' % e) from e

        # Must be bound even when no picture directory was produced: the
        # loop over <rtf:pict> elements below looks names up unconditionally.
        imap = {}
        d = glob.glob(os.path.join('*_rtf_pict_dir', 'picts.rtf'))
        if d:
            try:
                imap = self.extract_images(d[0])
            except Exception:
                self.log.exception('Failed to extract images...')

        self.log('Parsing XML...')
        doc = etree.fromstring(xml)
        border_styles = self.convert_borders(doc)
        for pict in doc.xpath('//rtf:pict[@num]',
                              namespaces={'rtf':
                                          'http://rtf2xml.sourceforge.net/'}):
            num = int(pict.get('num'))
            name = imap.get(num, None)
            if name is not None:
                # Replace the picture number with the image file name.
                pict.set('num', name)

        self.log('Converting XML to HTML...')
        inline_class = InlineClass(self.log)
        # Read the stylesheet as bytes so that lxml decodes it itself; this
        # avoids depending on the locale encoding and works even if the file
        # carries an XML encoding declaration.
        with open(pkg_resources.resource_filename('ebook_converter',
                                                  'data/rtf.xsl'),
                  'rb') as fobj:
            styledoc = etree.fromstring(fobj.read())
        extensions = {('calibre', 'inline-class'): inline_class}
        transform = etree.XSLT(styledoc, extensions=extensions)
        result = transform(doc)

        res = as_bytes(transform.tostring(result))
        # clean multiple \n
        res = re.sub(b'\n+', b'\n', res)
        with open('index.xhtml', 'wb') as f:
            f.write(res)

        self.write_inline_css(inline_class, border_styles)
        stream.seek(0)
        mi = get_metadata(stream, 'rtf')
        if not mi.title:
            mi.title = 'Unknown'
        if not mi.authors:
            mi.authors = ['Unknown']
        opf = OPFCreator(os.getcwd(), mi)
        opf.create_manifest([('index.xhtml', None)])
        opf.create_spine(['index.xhtml'])
        # Close the file deterministically instead of leaking the handle.
        with open('metadata.opf', 'wb') as opf_file:
            opf.render(opf_file)
        return os.path.abspath('metadata.opf')

    def postprocess_book(self, oeb, opts, log):
        """Drop the images marked with ``'__REMOVE_ME__'`` from the book.

        The text following a removed image (its tail) is appended to the
        preceding sibling's tail, or to the parent's text when the image
        was the first child.
        """
        for item in oeb.spine:
            for img in item.data.xpath('//*[local-name()="img" and '
                                       '@src="__REMOVE_ME__"]'):
                p = img.getparent()
                idx = p.index(img)
                p.remove(img)
                if img.tail:
                    if idx == 0:
                        p.text = (p.text or '') + img.tail
                    else:
                        p[idx-1].tail = (p[idx-1].tail or '') + img.tail