1
0
mirror of https://github.com/gryf/ebook-converter.git synced 2026-03-26 12:33:32 +01:00

Initial import

This commit is contained in:
2020-03-31 17:15:23 +02:00
commit d97ea9b0bc
311 changed files with 131419 additions and 0 deletions

View File

@@ -0,0 +1,41 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2019, Kovid Goyal <kovid at kovidgoyal.net>
from __future__ import absolute_import, division, print_function, unicode_literals
import bs4
from bs4 import ( # noqa
CData, Comment, Declaration, NavigableString, ProcessingInstruction,
SoupStrainer, Tag, __version__
)
from polyglot.builtins import unicode_type
def parse_html(markup):
    '''
    Parse HTML markup with html5_parser and return the resulting soup.

    Text input has its encoding declarations stripped and its entities
    substituted; any other input is first converted with xml_to_unicode
    (asked to do the same two things). The text is then passed through
    clean_xml_chars before parsing.
    '''
    from calibre.ebooks.chardet import strip_encoding_declarations, xml_to_unicode, substitute_entites
    from calibre.utils.cleantext import clean_xml_chars
    if not isinstance(markup, unicode_type):
        text = xml_to_unicode(markup, strip_encoding_pats=True, resolve_entities=True)[0]
    else:
        text = substitute_entites(strip_encoding_declarations(markup))
    text = clean_xml_chars(text)
    from html5_parser.soup import parse
    return parse(text, return_root=False)
def prettify(soup):
    '''Return soup.prettify() as text, decoding a bytes result as UTF-8.'''
    pretty = soup.prettify()
    return pretty.decode('utf-8') if isinstance(pretty, bytes) else pretty
def BeautifulSoup(markup='', *a, **kw):
    '''
    Parse markup via parse_html.

    Extra positional and keyword arguments are accepted and ignored.
    '''
    soup = parse_html(markup)
    return soup
def BeautifulStoneSoup(markup='', *a, **kw):
    '''
    Parse markup with bs4 using its 'xml' parser.

    Extra positional and keyword arguments are accepted and ignored.
    '''
    xml_soup = bs4.BeautifulSoup(markup, 'xml')
    return xml_soup

View File

@@ -0,0 +1,248 @@
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
Code for the conversion of ebook formats and the reading of metadata
from various formats.
'''
import os, re, numbers, sys
from calibre import prints
from calibre.ebooks.chardet import xml_to_unicode
from polyglot.builtins import unicode_type
class ConversionError(Exception):
    '''Exception carrying a message plus an ``only_msg`` flag for callers.'''

    def __init__(self, msg, only_msg=False):
        super(ConversionError, self).__init__(msg)
        # Stored as-is; this class does not interpret the flag itself.
        self.only_msg = only_msg
class UnknownFormatError(Exception):
    '''Plain Exception subclass; adds no behaviour of its own.'''
class DRMError(ValueError):
    '''ValueError subclass; adds no behaviour of its own.'''
class ParserError(ValueError):
    '''ValueError subclass; adds no behaviour of its own.'''
# File extensions (lower case, without the leading dot) listed as book formats.
# NOTE(review): how consumers use this list is not visible in this file.
BOOK_EXTENSIONS = ['lrf', 'rar', 'zip', 'rtf', 'lit', 'txt', 'txtz', 'text', 'htm', 'xhtm',
'html', 'htmlz', 'xhtml', 'pdf', 'pdb', 'updb', 'pdr', 'prc', 'mobi', 'azw', 'doc',
'epub', 'fb2', 'fbz', 'djv', 'djvu', 'lrx', 'cbr', 'cbz', 'cbc', 'oebzip',
'rb', 'imp', 'odt', 'chm', 'tpz', 'azw1', 'pml', 'pmlz', 'mbp', 'tan', 'snb',
'xps', 'oxps', 'azw4', 'book', 'zbf', 'pobi', 'docx', 'docm', 'md',
'textile', 'markdown', 'ibook', 'ibooks', 'iba', 'azw3', 'ps', 'kepub', 'kfx', 'kpf']
def return_raster_image(path):
    '''
    Return the raw bytes of the file at path if it looks like a raster image.

    Returns None when the file is not readable, or when ``what`` reports
    None or 'svg' for its contents.
    '''
    from calibre.utils.imghdr import what
    if not os.access(path, os.R_OK):
        return None
    with open(path, 'rb') as stream:
        data = stream.read()
    kind = what(None, data)
    if kind is None or kind == 'svg':
        return None
    return data
def extract_cover_from_embedded_svg(html, base, log):
    '''
    Return the raster image referenced by a lone <svg:image> wrapper.

    The document must contain exactly one svg element whose only child is an
    svg image with an xlink:href; the href is resolved against base and
    handed to return_raster_image. Returns None otherwise. ``log`` is unused.
    '''
    from calibre.ebooks.oeb.base import XPath, SVG, XLINK
    from calibre.utils.xml_parse import safe_xml_fromstring
    root = safe_xml_fromstring(html)
    svg_nodes = XPath('//svg:svg')(root)
    if len(svg_nodes) != 1:
        return None
    only_svg = svg_nodes[0]
    if len(only_svg) != 1 or only_svg[0].tag != SVG('image'):
        return None
    href = only_svg[0].get(XLINK('href'), None)
    if not href:
        return None
    return return_raster_image(os.path.join(base, *href.split('/')))
def extract_calibre_cover(raw, base, log):
    '''
    Return the bytes of the single cover image referenced by an HTML page.

    Nothing is returned if the page contains any heading, p, span, font or
    br tag. Otherwise two shapes are tried in order: a document with exactly
    one <img src> whose alt is "cover", then a body with no text and exactly
    one <img src>. Image paths are resolved against base. ``log`` is unused.
    '''
    from calibre.ebooks.BeautifulSoup import BeautifulSoup
    soup = BeautifulSoup(raw)
    text_tag = soup.find(name=['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'span',
        'font', 'br'])
    images = soup.findAll('img', src=True)
    if text_tag is not None:
        return None

    def image_path(tag):
        return os.path.join(base, *tag['src'].split('/'))

    if len(images) == 1 and images[0].get('alt', '').lower() == 'cover':
        data = return_raster_image(image_path(images[0]))
        if data is not None:
            return data

    # Look for a simple cover, i.e. a body with no text and only one <img> tag
    body = soup.find('body')
    if body is None:
        return None
    text = u''.join(map(unicode_type, body.findAll(text=True)))
    if text.strip():
        # Body has text, abort
        return None
    body_images = body.findAll('img', src=True)
    if len(body_images) != 1:
        return None
    return return_raster_image(image_path(body_images[0]))
def render_html_svg_workaround(path_to_html, log, width=590, height=750):
    '''
    Return cover image data for an HTML file.

    Cheap extraction is tried first (the embedded-SVG extractor only when the
    SVG namespace occurs in the text, then extract_calibre_cover); any
    exception from an extractor is ignored. If neither yields data the page
    is rendered with render_html_data.
    '''
    from calibre.ebooks.oeb.base import SVG_NS
    with open(path_to_html, 'rb') as f:
        raw = f.read()
    raw = xml_to_unicode(raw, strip_encoding_pats=True)[0]
    base_dir = os.path.dirname(path_to_html)

    extractors = []
    if SVG_NS in raw:
        extractors.append(extract_cover_from_embedded_svg)
    extractors.append(extract_calibre_cover)

    data = None
    for extract in extractors:
        try:
            data = extract(raw, base_dir, log)
        except Exception:
            # Best effort: fall through to the next strategy
            continue
        if data is not None:
            break

    if data is None:
        data = render_html_data(path_to_html, width, height)
    return data
def render_html_data(path_to_html, width, height):
    '''
    Render an HTML file to JPEG in a worker job and return the image bytes.

    Returns None (after printing diagnostics to stderr) when the worker
    raises WorkerError or reports a falsy result. ``width`` and ``height``
    are accepted but not used by this function.
    '''
    from calibre.ptempfile import TemporaryDirectory
    from calibre.utils.ipc.simple_worker import fork_job, WorkerError

    def report_error(job_result, text=''):
        prints('Failed to render', path_to_html, 'with errors:', file=sys.stderr)
        if text:
            prints(text, file=sys.stderr)
        if job_result and job_result['stdout_stderr']:
            with open(job_result['stdout_stderr'], 'rb') as f:
                prints(f.read(), file=sys.stderr)

    with TemporaryDirectory('-render-html') as tdir:
        try:
            result = fork_job('calibre.ebooks.render_html', 'main', args=(path_to_html, tdir, 'jpeg'))
        except WorkerError as e:
            # No job result is available when the worker itself failed
            report_error({}, e.orig_tb)
            return None
        if not result['result']:
            report_error(result)
            return None
        with open(os.path.join(tdir, 'rendered.jpeg'), 'rb') as f:
            return f.read()
def check_ebook_format(stream, current_guess):
    '''
    Refine a format guess by sniffing the stream.

    For prc/mobi/azw/azw1/azw3 guesses the first three bytes are read; if
    they are b'TPZ' the answer is 'tpz'. The stream is rewound afterwards.
    Any other guess is returned unchanged without touching the stream.
    '''
    if current_guess.lower() not in ('prc', 'mobi', 'azw', 'azw1', 'azw3'):
        return current_guess
    stream.seek(0)
    magic = stream.read(3)
    stream.seek(0)
    return 'tpz' if magic == b'TPZ' else current_guess
def normalize(x):
    '''Return x NFC-normalized if it is text; any other value unchanged.'''
    if not isinstance(x, unicode_type):
        return x
    import unicodedata
    return unicodedata.normalize('NFC', x)
def calibre_cover(title, author_string, series_string=None,
        output_format='jpg', title_size=46, author_size=36, logo_path=None):
    '''
    Generate a cover via calibre_cover2 and return it encoded as output_format.

    The three strings are NFC-normalized first; a falsy author or series is
    passed on as ''. ``title_size`` and ``author_size`` are accepted but not
    used by this function.
    '''
    title, author_string, series_string = [
        normalize(s) for s in (title, author_string, series_string)]
    from calibre.ebooks.covers import calibre_cover2
    from calibre.utils.img import image_to_data
    image = calibre_cover2(title, author_string or '', series_string or '', logo_path=logo_path, as_qimage=True)
    return image_to_data(image, fmt=output_format)
# A CSS-like length: an optional number followed by one of the known units.
# The numeric part is deliberately loose, so it can match strings such as
# '-' or '.' that float() rejects; callers must cope with that.
UNIT_RE = re.compile(r'^(-*[0-9]*[.]?[0-9]*)\s*(%|em|ex|en|px|mm|cm|in|pt|pc|rem|q)$')


def unit_convert(value, base, font, dpi, body_font_size=12):
    '''
    Return value in pts.

    :param value: A number (returned unchanged), a bare numeric string
        (treated as pixels) or a string matching UNIT_RE.
    :param base: Reference length used for ``%`` values.
    :param font: Font size used for ``em``/``ex``/``en`` values.
    :param dpi: Dots per inch used for pixel values.
    :param body_font_size: Font size used for ``rem`` values.
    :return: The converted number, or ``value`` unchanged when it cannot be
        interpreted as a length.
    '''
    if isinstance(value, numbers.Number):
        return value
    try:
        # A unit-less numeric string is interpreted as pixels
        return float(value) * 72.0 / dpi
    except Exception:
        pass
    m = UNIT_RE.match(value)
    if m is None or not m.group(1):
        return value
    try:
        number = float(m.group(1))
    except ValueError:
        # The regex accepts things like '-', '.' or '--5' that are not numbers
        return value
    unit = m.group(2)
    result = value
    if unit == '%':
        result = (number / 100.0) * base
    elif unit == 'px':
        result = number * 72.0 / dpi
    elif unit == 'in':
        result = number * 72.0
    elif unit == 'pt':
        result = number
    elif unit == 'em':
        result = number * font
    elif unit in ('ex', 'en'):
        # This is a hack for ex since we have no way to know
        # the x-height of the font
        result = number * font * 0.5
    elif unit == 'pc':
        result = number * 12.0
    elif unit == 'mm':
        result = number * 2.8346456693
    elif unit == 'cm':
        result = number * 28.346456693
    elif unit == 'rem':
        result = number * body_font_size
    elif unit == 'q':
        result = number * 0.708661417325
    return result
def parse_css_length(value):
    '''
    Split a CSS length into ``(number, unit)``.

    :param value: A string such as ``'12pt'``.
    :return: ``(float, unit)`` with the unit lower-cased, or ``(None, None)``
        when value is not a string, does not match UNIT_RE, or its numeric
        part is not a valid number (the regex accepts e.g. ``'-'`` and ``'.'``).
    '''
    try:
        m = UNIT_RE.match(value)
    except TypeError:
        return None, None
    if m is None or not m.group(1):
        return None, None
    try:
        number = float(m.group(1))
    except ValueError:
        return None, None
    return number, m.group(2).lower()
def generate_masthead(title, output_path=None, width=600, height=60):
    '''
    Generate a masthead image by delegating to calibre.ebooks.covers.

    The font family is read from the 'masthead_font' entry of the saved
    'mobi_output' defaults (None when absent).
    '''
    from calibre.ebooks.conversion.config import load_defaults
    font_family = load_defaults('mobi_output').get('masthead_font', None)
    from calibre.ebooks.covers import generate_masthead as render_masthead
    return render_masthead(title, output_path=output_path, width=width, height=height, font_family=font_family)
def escape_xpath_attr(value):
    '''
    Quote value for use as an XPath string literal.

    Double quotes are used unless value contains one, in which case single
    quotes are used. When value contains both kinds of quote, a concat()
    expression is built from pieces that are each quoted with the character
    they do not contain.
    '''
    if '"' not in value:
        return '"%s"' % value
    if "'" not in value:
        return "'%s'" % value
    # Runs of double quotes are isolated so they can be single-quoted
    quoted = []
    for piece in re.split('("+)', value):
        if not piece:
            continue
        q = "'" if '"' in piece else '"'
        quoted.append(q + piece + q)
    return 'concat(%s)' % ', '.join(quoted)

View File

@@ -0,0 +1,189 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import re, codecs
from polyglot.builtins import unicode_type
# Regular expression sources that match an encoding declaration in markup.
# In every pattern, group 1 captures the declared encoding name. They are
# compiled (case-insensitively) by compile_pats(), for text or for bytes.
_encoding_pats = (
    # XML declaration
    r'<\?[^<>]+encoding\s*=\s*[\'"](.*?)[\'"][^<>]*>',
    # HTML 5 charset
    r'''<meta\s+charset=['"]([-_a-z0-9]+)['"][^<>]*>(?:\s*</meta>){0,1}''',
    # HTML 4 Pragma directive
    r'''<meta\s+?[^<>]*?content\s*=\s*['"][^'"]*?charset=([-_a-z0-9]+)[^'"]*?['"][^<>]*>(?:\s*</meta>){0,1}''',
)
def compile_pats(binary):
    '''
    Yield the encoding-declaration patterns compiled case-insensitively.

    When binary is true the pattern sources are ASCII-encoded first so the
    compiled patterns match bytes.
    '''
    for source in _encoding_pats:
        pattern = source.encode('ascii') if binary else source
        yield re.compile(pattern, flags=re.IGNORECASE)
class LazyEncodingPats(object):
    '''
    Callable that yields the compiled encoding patterns.

    The patterns are compiled on first use and cached on the instance,
    separately for the bytes and the text variants.
    '''

    def __call__(self, binary=False):
        cache_name = 'binary_pats' if binary else 'unicode_pats'
        compiled = getattr(self, cache_name, None)
        if compiled is None:
            compiled = tuple(compile_pats(binary))
            setattr(self, cache_name, compiled)
        for pattern in compiled:
            yield pattern
# Shared instance: call it with binary=True/False to iterate the patterns.
lazy_encoding_pats = LazyEncodingPats()
# Matches "&...;" with the shortest run of non-whitespace in between; group 1
# is the text between the ampersand and the semicolon.
ENTITY_PATTERN = re.compile(r'&(\S+?);')
def strip_encoding_declarations(raw, limit=50*1024, preserve_newlines=False):
    '''
    Remove encoding declarations from the first ``limit`` units of raw.

    Works on both text and bytes. With preserve_newlines, each removed
    declaration is replaced by as many newlines as it contained; otherwise
    it is replaced by nothing.
    '''
    head, tail = raw[:limit], raw[limit:]
    is_binary = isinstance(raw, bytes)
    if is_binary:
        newline, empty = b'\n', b''
    else:
        newline, empty = '\n', u''
    if preserve_newlines:
        def replacement(m):
            return newline * m.group().count(newline)
    else:
        replacement = empty
    for pat in lazy_encoding_pats(is_binary):
        head = pat.sub(replacement, head)
    return head + tail
def replace_encoding_declarations(raw, enc='utf-8', limit=50*1024):
    '''
    Rewrite encoding declarations in the first ``limit`` units of raw to enc.

    Works on both text and bytes; enc is converted (as ASCII) to match the
    type of raw. Returns ``(new_raw, changed)`` where changed is True if any
    declaration named a different encoding (compared case-insensitively).
    '''
    head, tail = raw[:limit], raw[limit:]
    is_binary = isinstance(raw, bytes)
    if is_binary and not isinstance(enc, bytes):
        enc = enc.encode('ascii')
    elif not is_binary and isinstance(enc, bytes):
        enc = enc.decode('ascii')
    # A one-element list lets the nested function record a change
    changed = [False]

    def rewrite(m):
        declaration = m.group()
        if m.group(1).lower() == enc.lower():
            return declaration
        changed[0] = True
        # Offsets of group 1 relative to the start and to the end of the match
        start = m.start(1) - m.start(0)
        end = m.end(1) - m.end(0)
        return declaration[:start] + enc + declaration[end:]

    for pat in lazy_encoding_pats(is_binary):
        head = pat.sub(rewrite, head)
    return head + tail, changed[0]
def find_declared_encoding(raw, limit=50*1024):
    '''
    Return the first encoding declared in the first ``limit`` units of raw.

    The patterns are tried in order; for bytes input the name is decoded as
    ASCII (with replacement). Returns None when nothing is declared.
    '''
    head = raw[:limit]
    is_binary = isinstance(raw, bytes)
    for pat in lazy_encoding_pats(is_binary):
        found = pat.search(head)
        if found is None:
            continue
        declared = found.group(1)
        if is_binary:
            declared = declared.decode('ascii', 'replace')
        return declared
    return None
def substitute_entites(raw):
    '''Replace every ENTITY_PATTERN match in raw using xml_entity_to_unicode.'''
    from calibre import xml_entity_to_unicode
    replaced = ENTITY_PATTERN.sub(xml_entity_to_unicode, raw)
    return replaced
# Maps lower-cased detected charset names to the names used instead
# (applied in force_encoding()).
_CHARSET_ALIASES = {"macintosh" : "mac-roman",
"x-sjis" : "shift-jis"}
def detect(*args, **kwargs):
    '''Forward to chardet.detect, importing chardet only when first called.'''
    from chardet import detect as chardet_detect
    return chardet_detect(*args, **kwargs)
def force_encoding(raw, verbose, assume_utf8=False):
    '''
    Guess the encoding of raw with chardet, falling back to sane defaults.

    :param raw: Byte string; only the first 50KB are examined.
    :param verbose: Print a warning when detection confidence is below 100%.
    :param assume_utf8: Use utf-8 whenever detection confidence is below 100%.
    :return: Lower-cased encoding name, with known aliases resolved and
        'ascii' widened to 'utf-8'.
    '''
    from calibre.constants import preferred_encoding
    try:
        guess = detect(raw[:1024*50])
    except Exception:
        # Detection is best effort, but do not swallow KeyboardInterrupt or
        # SystemExit the way a bare except would.
        guess = {'encoding':preferred_encoding, 'confidence':0}
    encoding = guess['encoding']
    if guess['confidence'] < 1 and assume_utf8:
        encoding = 'utf-8'
    if guess['confidence'] < 1 and verbose:
        print('WARNING: Encoding detection confidence for %s is %d%%'%(
            guess['encoding'], guess['confidence']*100))
    if not encoding:
        encoding = preferred_encoding
    encoding = encoding.lower()
    encoding = _CHARSET_ALIASES.get(encoding, encoding)
    if encoding == 'ascii':
        # utf-8 is a superset of ascii
        encoding = 'utf-8'
    return encoding
def detect_xml_encoding(raw, verbose=False, assume_utf8=False):
    '''
    Work out the encoding of a byte string containing XML/HTML.

    Returns ``(raw, encoding)``. Empty or text input is returned with an
    encoding of None. A UTF-8/UTF-16 byte order mark decides the encoding
    and is stripped from the returned data. Otherwise a declared encoding is
    used, then force_encoding(); names unknown to codecs become 'utf-8'.
    '''
    if not raw or isinstance(raw, unicode_type):
        return raw, None
    known_boms = (
        (codecs.BOM_UTF8, 'utf8'),
        (codecs.BOM_UTF16_LE, 'utf-16-le'),
        (codecs.BOM_UTF16_BE, 'utf-16-be'),
    )
    for bom, name in known_boms:
        if raw.startswith(bom):
            return raw[len(bom):], name

    encoding = None
    for pat in lazy_encoding_pats(True):
        found = pat.search(raw)
        if found:
            encoding = found.group(1).decode('ascii', 'replace')
            break
    if encoding is None:
        encoding = force_encoding(raw, verbose, assume_utf8=assume_utf8)

    if encoding.lower().strip() == 'macintosh':
        encoding = 'mac-roman'
    gb2312_names = (
        'gb2312', 'chinese', 'csiso58gb231280', 'euc-cn', 'euccn',
        'eucgb2312-cn', 'gb2312-1980', 'gb2312-80', 'iso-ir-58')
    if encoding.lower().replace('_', '-').strip() in gb2312_names:
        # Microsoft Word exports to HTML with encoding incorrectly set to
        # gb2312 instead of gbk. gbk is a superset of gb2312, anyway.
        encoding = 'gbk'

    try:
        codecs.lookup(encoding)
    except LookupError:
        encoding = 'utf-8'
    return raw, encoding
def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False,
                   resolve_entities=False, assume_utf8=False):
    '''
    Force conversion of byte string to unicode. Tries to look for XML/HTML
    encoding declaration first, if not found uses the chardet library and
    prints a warning if detection confidence is < 100%

    Encoding declarations are removed when strip_encoding_pats is set and
    entities are substituted when resolve_entities is set.

    @return: (unicode, encoding used)
    '''
    if not raw:
        return '', None
    data, encoding = detect_xml_encoding(raw, verbose=verbose,
                                         assume_utf8=assume_utf8)
    text = data if isinstance(data, unicode_type) else data.decode(encoding, 'replace')
    if strip_encoding_pats:
        text = strip_encoding_declarations(text)
    if resolve_entities:
        text = substitute_entites(text)
    return text, encoding

View File

@@ -0,0 +1,6 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'

View File

@@ -0,0 +1,238 @@
/*
:mod:`cPalmdoc` -- Palmdoc compression/decompression
=====================================================
.. module:: cPalmdoc
:platform: All
:synopsis: Compression and decompression of Palmdoc data, implemented in C for speed
.. moduleauthor:: Kovid Goyal <kovid@kovidgoyal.net> Copyright 2009
*/
#define PY_SSIZE_T_CLEAN
#include <Python.h>
#include <stdio.h>
/* Minimum size, in bytes, of the output buffer allocated by decompress */
#define BUFFER 6000
#define MIN(x, y) ( ((x) < (y)) ? (x) : (y) )
#define MAX(x, y) ( ((x) > (y)) ? (x) : (y) )
/* Working type for one input byte. It is wider than 8 bits: decompress also
 * assembles a two-byte back-reference code in a Byte variable. */
typedef unsigned short int Byte;
/* A run of Byte values together with its length */
typedef struct {
    Byte *data;
    Py_ssize_t len;
} buffer;
/* Local boolean type and constants, overriding any existing definitions */
#ifdef bool
#undef bool
#endif
#define bool int
#ifdef false
#undef false
#endif
#define false 0
#ifdef true
#undef true
#endif
#define true 1
/* Map a value in 0..255 to the equivalent signed char value */
#define CHAR(x) (( (x) > 127 ) ? (x)-256 : (x))
/* Argument-parsing and result-building format codes for byte strings, which
 * differ between Python 2 and Python 3 */
#if PY_MAJOR_VERSION >= 3
#define BUFFER_FMT "y#"
#define BYTES_FMT "y#"
#else
#define BUFFER_FMT "t#"
#define BYTES_FMT "s#"
#endif
/*
 * decompress(bytestring) -> decompressed bytestring
 *
 * Decode PalmDoc-compressed data. The input is untrusted, so every read is
 * bounds-checked: a literal run or back-reference that is cut off by the end
 * of the input ends decoding, and a back-reference whose distance is zero or
 * points before the start of the output is skipped.
 */
static PyObject *
cpalmdoc_decompress(PyObject *self, PyObject *args) {
    const char *_input = NULL; Py_ssize_t input_len = 0;
    Byte *input; char *output; Byte c; PyObject *ans;
    Py_ssize_t i = 0, o = 0, j = 0, di, n;

    if (!PyArg_ParseTuple(args, BUFFER_FMT, &_input, &input_len))
        return NULL;
    input = (Byte *) PyMem_Malloc(sizeof(Byte)*input_len);
    if (input == NULL) return PyErr_NoMemory();
    // Map chars to bytes
    for (j = 0; j < input_len; j++)
        input[j] = (_input[j] < 0) ? _input[j]+256 : _input[j];
    // No code expands by more than 5x (a 2 byte back-reference yields at
    // most 10 bytes), so 8x the input size is always enough.
    output = (char *)PyMem_Malloc(sizeof(char)*(MAX(BUFFER, 8*input_len)));
    if (output == NULL) {
        PyMem_Free(input);
        return PyErr_NoMemory();
    }

    while (i < input_len) {
        c = input[i++];
        if (c >= 1 && c <= 8) { // copy 'c' bytes
            while (c-- && i < input_len) output[o++] = (char)input[i++];
        }
        else if (c <= 0x7F) // 0, 09-7F = self
            output[o++] = (char)c;
        else if (c >= 0xC0) { // space + ASCII char
            output[o++] = ' ';
            output[o++] = c ^ 0x80;
        }
        else { // 80-BF repeat sequences
            if (i >= input_len) break; // second byte of the code is missing
            c = (c << 8) + input[i++];
            di = (c & 0x3FFF) >> 3;
            n = (c & 7) + 3;
            // Never read outside of what has been written so far
            if (di < 1 || di > o) continue;
            for ( ; n--; ++o )
                output[o] = output[o - di];
        }
    }
    ans = Py_BuildValue(BYTES_FMT, output, o);
    PyMem_Free(output);
    PyMem_Free(input);
    return ans;
}
/* Return true if the first len elements of a and b are equal */
static bool
cpalmdoc_memcmp( Byte *a, Byte *b, Py_ssize_t len) {
    Py_ssize_t remaining = len;
    while (remaining-- > 0) {
        if (*a++ != *b++) return false;
    }
    return true;
}
/*
 * Search backwards for an earlier copy of the chunk_length elements starting
 * at data[pos]. Candidates start at pos - chunk_length, so a match never
 * overlaps the chunk itself. Returns the index of the closest match, or pos
 * when there is none.
 */
static Py_ssize_t
cpalmdoc_rfind(Byte *data, Py_ssize_t pos, Py_ssize_t chunk_length) {
    Py_ssize_t start = pos - chunk_length;
    while (start >= 0) {
        if (cpalmdoc_memcmp(data + start, data + pos, chunk_length)) return start;
        start--;
    }
    return pos;
}
/*
 * Compress the bytes in b into output and return the number of bytes written.
 *
 * Returns 0 if the small scratch buffer cannot be allocated (also the result
 * for empty input). No bounds checks are made on output: the caller must
 * supply a buffer large enough for the encoded data.
 */
static Py_ssize_t
cpalmdoc_do_compress(buffer *b, char *output) {
    Py_ssize_t i = 0, j, chunk_len, dist;
    unsigned int compound;
    Byte c, n;
    bool found;
    char *head;
    buffer temp;
    head = output;
    // Scratch space for one run of at most 8 binary bytes
    temp.data = (Byte *)PyMem_Malloc(sizeof(Byte)*8); temp.len = 0;
    if (temp.data == NULL) return 0;
    while (i < b->len) {
        c = b->data[i];
        // Back-references are only tried when more than 10 bytes lie both
        // before and after the current position. Longest chunk (10) first,
        // down to 3.
        if ( i > 10 && (b->len - i) > 10) {
            found = false;
            for (chunk_len = 10; chunk_len > 2; chunk_len--) {
                j = cpalmdoc_rfind(b->data, i, chunk_len);
                dist = i - j;
                if (j < i && dist <= 2047) {
                    found = true;
                    // Two byte code: distance in the upper bits, length-3
                    // in the low 3 bits, with 0x80 added to the first byte
                    compound = (unsigned int)((dist << 3) + chunk_len-3);
                    *(output++) = CHAR(0x80 + (compound >> 8 ));
                    *(output++) = CHAR(compound & 0xFF);
                    i += chunk_len;
                    break;
                }
            }
            if (found) continue;
        }
        //write single character
        i++;
        // A space followed by a byte in 0x40-0x7F is folded into one byte
        if (c == 32 && i < b->len) {
            n = b->data[i];
            if ( n >= 0x40 && n <= 0x7F) {
                *(output++) = CHAR(n^0x80); i++; continue;
            }
        }
        // 0 and 0x09-0x7F are written as themselves
        if (c == 0 || (c > 8 && c < 0x80))
            *(output++) = CHAR(c);
        else { // Write binary data
            // Collect up to 8 consecutive bytes that cannot be written as
            // themselves, then emit the run length followed by the bytes
            j = i;
            temp.data[0] = c; temp.len = 1;
            while (j < b->len && temp.len < 8) {
                c = b->data[j];
                if (c == 0 || (c > 8 && c < 0x80)) break;
                temp.data[temp.len++] = c; j++;
            }
            i += temp.len - 1;
            *(output++) = (char)temp.len;
            for (j=0; j < temp.len; j++) *(output++) = (char)temp.data[j];
        }
    }
    PyMem_Free(temp.data);
    return output - head;
}
/*
 * compress(bytestring) -> compressed bytestring
 *
 * PalmDoc-compress a byte string. Empty input yields an empty result.
 */
static PyObject *
cpalmdoc_compress(PyObject *self, PyObject *args) {
    const char *_input = NULL; Py_ssize_t input_len = 0;
    char *output; PyObject *ans;
    Py_ssize_t j = 0;
    buffer b;

    if (!PyArg_ParseTuple(args, BUFFER_FMT, &_input, &input_len))
        return NULL;
    // Nothing to compress. This also keeps a zero return from
    // cpalmdoc_do_compress() unambiguous: below it can only mean failure.
    if (input_len == 0)
        return Py_BuildValue(BYTES_FMT, "", (Py_ssize_t)0);
    b.data = (Byte *)PyMem_Malloc(sizeof(Byte)*input_len);
    if (b.data == NULL) return PyErr_NoMemory();
    // Map chars to bytes
    for (j = 0; j < input_len; j++)
        b.data[j] = (_input[j] < 0) ? _input[j]+256 : _input[j];
    b.len = input_len;
    // Compression can result in a larger block. The worst case is a binary
    // byte written as a run of length one (length prefix + the byte), i.e.
    // two output bytes for one input byte, so 2x the input is always enough.
    output = (char *)PyMem_Malloc(sizeof(char) * 2 * (size_t)b.len);
    if (output == NULL) {
        PyMem_Free(b.data);
        return PyErr_NoMemory();
    }
    j = cpalmdoc_do_compress(&b, output);
    if ( j == 0) {
        PyMem_Free(output);
        PyMem_Free(b.data);
        return PyErr_NoMemory();
    }
    ans = Py_BuildValue(BYTES_FMT, output, j);
    PyMem_Free(output);
    PyMem_Free(b.data);
    return ans;
}
/* Module docstring */
static char cPalmdoc_doc[] = "Compress and decompress palmdoc strings.";
/* Functions exported by the module; the table ends with a NULL sentinel */
static PyMethodDef cPalmdoc_methods[] = {
    {"decompress", cpalmdoc_decompress, METH_VARARGS,
    "decompress(bytestring) -> decompressed bytestring\n\n"
    "Decompress a palmdoc compressed byte string. "
    },
    {"compress", cpalmdoc_compress, METH_VARARGS,
    "compress(bytestring) -> compressed bytestring\n\n"
    "Palmdoc compress a byte string. "
    },
    {NULL, NULL, 0, NULL}
};
/*
 * Module initialisation for both Python 3 (PyInit_cPalmdoc, returning the
 * module) and Python 2 (initcPalmdoc, returning nothing). INITERROR and
 * INITMODULE hide the differences so that the function body is shared.
 * NOTE(review): CALIBRE_MODINIT_FUNC is not defined in this file; it is
 * presumably supplied by the build - confirm.
 */
#if PY_MAJOR_VERSION >= 3
#define INITERROR return NULL
#define INITMODULE PyModule_Create(&cPalmdoc_module)
static struct PyModuleDef cPalmdoc_module = {
    /* m_base */ PyModuleDef_HEAD_INIT,
    /* m_name */ "cPalmdoc",
    /* m_doc */ cPalmdoc_doc,
    /* m_size */ -1,
    /* m_methods */ cPalmdoc_methods,
    /* m_slots */ 0,
    /* m_traverse */ 0,
    /* m_clear */ 0,
    /* m_free */ 0,
};
CALIBRE_MODINIT_FUNC PyInit_cPalmdoc(void) {
#else
#define INITERROR return
#define INITMODULE Py_InitModule3("cPalmdoc", cPalmdoc_methods, cPalmdoc_doc)
CALIBRE_MODINIT_FUNC initcPalmdoc(void) {
#endif
    PyObject *m;
    m = INITMODULE;
    if (m == NULL) {
        INITERROR;
    }
#if PY_MAJOR_VERSION >= 3
    return m;
#endif
}

View File

@@ -0,0 +1,96 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
import io
from struct import pack
from calibre.constants import plugins
from polyglot.builtins import range
# The compiled extension is required: fail at import time, reporting the
# second item of the plugin entry, if the first item is falsy.
cPalmdoc = plugins['cPalmdoc'][0]
if not cPalmdoc:
    load_error = plugins['cPalmdoc'][1]
    raise RuntimeError('Failed to load required cPalmdoc module: %s' % load_error)
def decompress_doc(data):
    '''Return data decompressed by the cPalmdoc extension.'''
    decompressed = cPalmdoc.decompress(data)
    return decompressed
def compress_doc(data):
    '''Return data compressed by the cPalmdoc extension; b'' for empty data.'''
    if not data:
        return b''
    return cPalmdoc.compress(data)
def py_compress_doc(data):
    '''
    Pure Python PalmDoc compression, mirroring the C implementation.

    :param data: The bytes to compress.
    :return: The compressed bytes.
    '''
    out = io.BytesIO()
    i = 0
    ldata = len(data)
    while i < ldata:
        # Back-references are only tried when more than 10 bytes lie both
        # before and after the current position
        if i > 10 and (ldata - i) > 10:
            chunk = b''
            match = -1
            # Longest chunk first
            for j in range(10, 2, -1):
                chunk = data[i:i+j]
                try:
                    match = data.rindex(chunk, 0, i)
                except ValueError:
                    continue
                if (i - match) <= 2047:
                    break
                match = -1
            if match >= 0:
                n = len(chunk)
                m = i - match
                code = 0x8000 + ((m << 3) & 0x3ff8) + (n - 3)
                out.write(pack('>H', code))
                i += n
                continue
        ch = data[i:i+1]
        och = ord(ch)
        i += 1
        # A space followed by a byte in 0x40-0x7F is folded into one byte.
        # The bound is i < ldata (a following byte exists), exactly as in the
        # C code; (i + 1) < ldata skipped the pair ending the data and made
        # the two implementations disagree.
        if ch == b' ' and i < ldata:
            onch = ord(data[i:i+1])
            if onch >= 0x40 and onch < 0x80:
                out.write(pack('>B', onch ^ 0x80))
                i += 1
                continue
        if och == 0 or (och > 8 and och < 0x80):
            out.write(ch)
        else:
            # Collect up to 8 bytes that cannot be written as themselves and
            # emit them as a length-prefixed run
            j = i
            binseq = [ch]
            while j < ldata and len(binseq) < 8:
                ch = data[j:j+1]
                och = ord(ch)
                if och == 0 or (och > 8 and och < 0x80):
                    break
                binseq.append(ch)
                j += 1
            out.write(pack('>B', len(binseq)))
            out.write(b''.join(binseq))
            i += len(binseq) - 1
    return out.getvalue()
def find_tests():
    '''Return a unittest suite comparing the C and Python implementations.'''
    import unittest

    samples = (
        b'abc\x03\x04\x05\x06ms',  # Test binary writing
        b'a b c \xfed ',  # Test encoding of spaces
        b'0123456789axyz2bxyz2cdfgfo9iuyerh',
        b'0123456789asd0123456789asd|yyzzxxffhhjjkk',
        (b'ciewacnaq eiu743 r787q 0w% ; sa fd\xef\ffdxosac wocjp acoiecowei '
         b'owaic jociowapjcivcjpoivjporeivjpoavca; p9aw8743y6r74%$^$^%8 '),
    )

    class Test(unittest.TestCase):

        def test_palmdoc_compression(self):
            for sample in samples:
                compressed = compress_doc(sample)
                self.assertEqual(py_compress_doc(sample), compressed)
                self.assertEqual(decompress_doc(compressed), sample)

    return unittest.defaultTestLoader.loadTestsFromTestCase(Test)

View File

@@ -0,0 +1,30 @@
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from polyglot.builtins import native_string_type
class ConversionUserFeedBack(Exception):
    '''
    Exception used to show a simple message to the user.

    The exception message is a JSON object with the keys msg, level, det_msg
    and title; the same values are also available as attributes.
    '''

    def __init__(self, title, msg, level='info', det_msg=''):
        ''' Show a simple message to the user

        :param title: The title (very short description)
        :param msg: The message to show the user
        :param level: Must be one of 'info', 'warn' or 'error'
        :param det_msg: Optional detailed message to show the user
        '''
        import json
        payload = {'msg':msg, 'level':level, 'det_msg':det_msg, 'title':title}
        Exception.__init__(self, json.dumps(payload))
        self.title = title
        self.msg = msg
        self.det_msg = det_msg
        self.level = level
# Ensure exception uses fully qualified name as this is used to detect it in
# the GUI. The name is converted with native_string_type before being
# assigned to __name__.
ConversionUserFeedBack.__name__ = native_string_type('calibre.ebooks.conversion.ConversionUserFeedBack')

View File

@@ -0,0 +1,428 @@
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL 3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
'''
Command line interface to conversion sub-system
'''
import sys, os, numbers
from optparse import OptionGroup, Option
from collections import OrderedDict
from calibre.utils.config import OptionParser
from calibre.utils.logging import Log
from calibre.customize.conversion import OptionRecommendation
from calibre import patheq
from calibre.ebooks.conversion import ConversionUserFeedBack
from calibre.utils.localization import localize_user_manual_link
from polyglot.builtins import iteritems
# Usage text handed to the option parser: the translated description followed
# by a localized link to the conversion section of the user manual.
USAGE = '%prog ' + _('''\
input_file output_file [options]
Convert an e-book from one format to another.
input_file is the input and output_file is the output. Both must be \
specified as the first two arguments to the command.
The output e-book format is guessed from the file extension of \
output_file. output_file can also be of the special format .EXT where \
EXT is the output file extension. In this case, the name of the output \
file is derived from the name of the input file. Note that the filenames must \
not start with a hyphen. Finally, if output_file has no extension, then \
it is treated as a directory and an "open e-book" (OEB) consisting of HTML \
files is written to that directory. These files are the files that would \
normally have been passed to the output plugin.
After specifying the input \
and output file you can customize the conversion by specifying various \
options. The available options depend on the input and output file types. \
To get help on them specify the input and output file and then use the -h \
option.
For full documentation of the conversion system see
''') + localize_user_manual_link('https://manual.calibre-ebook.com/conversion.html')
# Names of the options listed in the "HEURISTIC PROCESSING" group, after
# enable_heuristics itself (see add_pipeline_options()).
HEURISTIC_OPTIONS = ['markup_chapter_headings',
'italicize_common_cases', 'fix_indents',
'html_unwrap_factor', 'unwrap_lines',
'delete_blank_paragraphs', 'format_scene_breaks',
'dehyphenate', 'renumber_headings',
'replace_scene_breaks']
# Options that get a --disable-<switch> command line switch when their
# recommended value is True (see option_recommendation_to_cli_option()).
DEFAULT_TRUE_OPTIONS = HEURISTIC_OPTIONS + ['remove_fake_margins']
def print_help(parser, log):
    '''Print the help text of parser; ``log`` is accepted but not used.'''
    parser.print_help()
    return None
def check_command_line_options(parser, args, log):
    '''
    Validate the two positional arguments and return ``(input, output)``.

    Exits with status 1 if either argument is missing or starts with a
    hyphen, or if the input is unreadable (unless it is a .recipe, or help
    was requested). An unreadable .recipe input is returned exactly as
    given. An output of the form ".ext" is expanded to the input's base name
    plus that extension. Other than that, both paths are made absolute.
    '''
    if len(args) < 3 or args[1].startswith('-') or args[2].startswith('-'):
        print_help(parser, log)
        log.error('\n\nYou must specify the input AND output files')
        raise SystemExit(1)

    input_path = os.path.abspath(args[1])
    if not os.access(input_path, os.R_OK):
        if input_path.endswith('.recipe'):
            # NOTE(review): presumably the name of a builtin recipe; keep it
            # exactly as typed - confirm against the consumer.
            input_path = args[1]
        elif not ('-h' in args or '--help' in args):
            log.error('Cannot read from', input_path)
            raise SystemExit(1)

    output_path = args[2]
    only_extension = (
        output_path.startswith('.') and output_path[:2] not in {'..', '.'} and
        '/' not in output_path and '\\' not in output_path)
    if only_extension:
        stem = os.path.splitext(os.path.basename(input_path))[0]
        output_path = stem + output_path
    output_path = os.path.abspath(output_path)

    return input_path, output_path
def option_recommendation_to_cli_option(add_option, rec):
    '''
    Build an optparse Option from an option recommendation and register it
    by calling add_option.

    Boolean defaults become store_true/store_false flags, numeric defaults
    get a type, and 'verbose' becomes a counter. Options in
    DEFAULT_TRUE_OPTIONS whose default is True are exposed only as
    --disable-<switch>.
    '''
    opt = rec.option
    default = rec.recommended_value

    switches = []
    if opt.short_switch:
        switches.append('-'+opt.short_switch)
    switches.append('--'+opt.long_switch)

    attrs = {'dest': opt.name, 'help': opt.help,
             'choices': opt.choices, 'default': default}
    if isinstance(default, bool):
        # Passing the flag flips the default
        attrs['action'] = 'store_false' if default else 'store_true'
    else:
        # NOTE(review): integers are also numbers.Real, so the second test
        # overwrites 'int' with 'float'. Kept as is - confirm if intended.
        if isinstance(default, numbers.Integral):
            attrs['type'] = 'int'
        if isinstance(default, numbers.Real):
            attrs['type'] = 'float'

    if opt.long_switch == 'verbose':
        attrs['action'] = 'count'
        attrs.pop('type', '')
    if opt.name == 'read_metadata_from_opf':
        switches.append('--from-opf')
    if opt.name == 'transform_css_rules':
        attrs['help'] = _(
            'Path to a file containing rules to transform the CSS styles'
            ' in this book. The easiest way to create such a file is to'
            ' use the wizard for creating rules in the calibre GUI. Access'
            ' it in the "Look & feel->Transform styles" section of the conversion'
            ' dialog. Once you create the rules, you can use the "Export" button'
            ' to save them to a file.'
        )
    if opt.name in DEFAULT_TRUE_OPTIONS and default is True:
        switches = ['--disable-'+opt.long_switch]
    add_option(Option(*switches, **attrs))
def group_titles():
    '''Return the translated titles of the input and output option groups.'''
    input_title = _('INPUT OPTIONS')
    output_title = _('OUTPUT OPTIONS')
    return input_title, output_title
def recipe_test(option, opt_str, value, parser):
    '''
    optparse callback for --test: consume up to two following integers.

    Arguments are taken from parser.rargs until an option-like argument, a
    non-integer or the second integer is seen. Missing values default to 2,
    and the resulting pair is stored on parser.values under option.dest.
    '''
    assert value is None

    def floatable(s):
        try:
            float(s)
        except ValueError:
            return False
        return True

    numbers_seen = []
    for arg in parser.rargs:
        # stop on --foo like options
        if arg[:2] == "--":
            break
        # stop on -a, but not on -3 or -3.0
        if arg[:1] == "-" and len(arg) > 1 and not floatable(arg):
            break
        try:
            numbers_seen.append(int(arg))
        except (TypeError, ValueError, AttributeError):
            break
        if len(numbers_seen) == 2:
            break

    # Remove what was consumed from the remaining arguments
    del parser.rargs[:len(numbers_seen)]
    numbers_seen.extend([2] * (2 - len(numbers_seen)))
    setattr(parser.values, option.dest, tuple(numbers_seen))
def add_input_output_options(parser, plumber):
    '''
    Add the option groups for the plumber's input and output options.

    A group is only created when there are options for it. For recipe input,
    the 'test' option is registered as a callback option handled by
    recipe_test.
    '''
    input_options = plumber.input_options
    output_options = plumber.output_options

    def populate(add_option, recs):
        for rec in recs:
            if plumber.input_fmt == 'recipe' and rec.option.long_switch == 'test':
                add_option(Option('--test', dest='test', action='callback', callback=recipe_test))
                continue
            option_recommendation_to_cli_option(add_option, rec)

    if input_options:
        input_group = OptionGroup(parser, group_titles()[0], _('Options to control the processing'
                          ' of the input %s file')%plumber.input_fmt)
        populate(input_group.add_option, input_options)
        parser.add_option_group(input_group)

    if output_options:
        output_group = OptionGroup(parser, group_titles()[1], _('Options to control the processing'
                          ' of the output %s')%plumber.output_fmt)
        populate(output_group.add_option, output_options)
        parser.add_option_group(output_group)
def add_pipeline_options(parser, plumber):
    '''
    Add the conversion pipeline options to parser, grouped by topic.

    Options under the empty title are added to the parser itself; every
    other title gets its own option group. An option is only added if its
    recommendation level is below HIGH.
    '''
    look_and_feel = [
        'base_font_size', 'disable_font_rescaling',
        'font_size_mapping', 'embed_font_family',
        'subset_embedded_fonts', 'embed_all_fonts',
        'line_height', 'minimum_line_height',
        'linearize_tables',
        'extra_css', 'filter_css', 'transform_css_rules', 'expand_css',
        'smarten_punctuation', 'unsmarten_punctuation',
        'margin_top', 'margin_left', 'margin_right',
        'margin_bottom', 'change_justification',
        'insert_blank_line', 'insert_blank_line_size',
        'remove_paragraph_spacing',
        'remove_paragraph_spacing_indent_size',
        'asciiize', 'keep_ligatures',
    ]
    search_and_replace = [
        'sr1_search', 'sr1_replace',
        'sr2_search', 'sr2_replace',
        'sr3_search', 'sr3_replace',
        'search_replace',
    ]
    structure_detection = [
        'chapter', 'chapter_mark',
        'prefer_metadata_cover', 'remove_first_image',
        'insert_metadata', 'page_breaks_before',
        'remove_fake_margins', 'start_reading_at',
    ]
    table_of_contents = [
        'level1_toc', 'level2_toc', 'level3_toc',
        'toc_threshold', 'max_toc_links', 'no_chapters_in_toc',
        'use_auto_toc', 'toc_filter', 'duplicate_links_in_toc',
    ]

    groups = OrderedDict((
        ('', ('', ['input_profile', 'output_profile'])),

        (_('LOOK AND FEEL'), (
            _('Options to control the look and feel of the output'),
            look_and_feel)),

        (_('HEURISTIC PROCESSING'), (
            _('Modify the document text and structure using common'
              ' patterns. Disabled by default. Use %(en)s to enable. '
              ' Individual actions can be disabled with the %(dis)s options.')
            % dict(en='--enable-heuristics', dis='--disable-*'),
            ['enable_heuristics'] + HEURISTIC_OPTIONS)),

        (_('SEARCH AND REPLACE'), (
            _('Modify the document text and structure using user defined patterns.'),
            search_and_replace)),

        (_('STRUCTURE DETECTION'), (
            _('Control auto-detection of document structure.'),
            structure_detection)),

        (_('TABLE OF CONTENTS'), (
            _('Control the automatic generation of a Table of Contents. By '
              'default, if the source file has a Table of Contents, it will '
              'be used in preference to the automatically generated one.'),
            table_of_contents)),

        (_('METADATA'), (
            _('Options to set metadata in the output'),
            plumber.metadata_option_names + ['read_metadata_from_opf'])),

        (_('DEBUG'), (
            _('Options to help with debugging the conversion'),
            ['verbose', 'debug_pipeline'])),
    ))

    for title, (desc, names) in iteritems(groups):
        if title:
            option_group = OptionGroup(parser, title, desc)
            parser.add_option_group(option_group)
            add_option = option_group.add_option
        else:
            # Untitled options go straight onto the parser
            add_option = parser.add_option

        for name in names:
            rec = plumber.get_option_by_name(name)
            if rec.level < rec.HIGH:
                option_recommendation_to_cli_option(add_option, rec)
def option_parser():
    '''Return a new OptionParser with USAGE and the --list-recipes flag.'''
    parser = OptionParser(usage=USAGE)
    list_recipes_help = _('List builtin recipe names. You can create an e-book from '
                          'a builtin recipe like this: ebook-convert "Recipe Name.recipe" '
                          'output.epub')
    parser.add_option('--list-recipes', default=False, action='store_true',
                      help=list_recipes_help)
    return parser
class ProgressBar(object):
    '''Progress reporter that logs "<percent>% <message>" lines.'''

    def __init__(self, log):
        self.log = log

    def __call__(self, frac, msg=''):
        # Updates without a message are not reported
        if not msg:
            return
        self.log('%d%% %s' % (int(frac*100), msg))
def create_option_parser(args, log):
    '''
    Build the option parser and the Plumber for a conversion.

    Handles --version and --list-recipes (both exit with status 0) and
    prints help and exits when fewer than two file arguments are given.

    :param args: The full command line, including the program name.
    :param log: Log object used for output.
    :return: ``(parser, plumber)``
    :raises ValueError: If the input and output are the same file.
    '''
    if '--version' in args:
        from calibre.constants import __appname__, __version__, __author__
        log(os.path.basename(args[0]), '('+__appname__, __version__+')')
        log('Created by:', __author__)
        raise SystemExit(0)
    if '--list-recipes' in args:
        from calibre.web.feeds.recipes.collection import get_builtin_recipe_titles
        log('Available recipes:')
        titles = sorted(get_builtin_recipe_titles())
        for title in titles:
            try:
                log('\t'+title)
            except Exception:
                # Fall back to the repr if the title cannot be logged as is;
                # unlike a bare except this lets Ctrl-C through.
                log('\t'+repr(title))
        log('%d recipes available'%len(titles))
        raise SystemExit(0)

    parser = option_parser()
    if len(args) < 3:
        print_help(parser, log)
        if any(x in args for x in ('-h', '--help')):
            raise SystemExit(0)
        else:
            raise SystemExit(1)

    input, output = check_command_line_options(parser, args, log)

    from calibre.ebooks.conversion.plumber import Plumber

    reporter = ProgressBar(log)
    if patheq(input, output):
        raise ValueError('Input file is the same as the output file')

    plumber = Plumber(input, output, log, reporter)
    add_input_output_options(parser, plumber)
    add_pipeline_options(parser, plumber)

    return parser, plumber
def abspath(x):
    """Return ``x`` as an absolute path, leaving http(s) URLs untouched.

    A leading ``~`` is expanded before the path is made absolute.
    """
    if x.startswith(('http:', 'https:')):
        return x
    expanded = os.path.expanduser(x)
    return os.path.abspath(expanded)
def escape_sr_pattern(exp):
    """Return ``exp`` with every newline replaced by the character U+E123.

    ``read_sr_patterns`` applies the reverse substitution to the
    expressions it reads.
    """
    placeholder = '\ue123'
    return placeholder.join(exp.split('\n'))
def read_sr_patterns(path, log=None):
    """Load search/replace pairs from ``path`` and return them as JSON.

    The file is read as UTF-8 and holds alternating lines: a regular
    expression followed by its replacement text. Blank lines are skipped
    where an expression is expected; a blank line right after an expression
    is taken as an empty replacement. U+E123 inside an expression stands for
    a newline. An expression on the last line, with no replacement line
    after it, is dropped.

    The result is a JSON encoded list of ``[expression, replacement]``
    pairs. An invalid expression is reported via ``log.error`` followed by
    ``SystemExit(1)`` when ``log`` is given, and raises ``ValueError``
    otherwise.
    """
    import json, re

    with open(path, 'rb') as f:
        lines = f.read().decode('utf-8').splitlines()

    pats = []
    pat = None
    for line in lines:
        if pat is not None:
            # This line is the replacement for the pending expression.
            pats.append((pat, line))
            pat = None
            continue
        if not line.strip():
            continue
        line = line.replace('\ue123', '\n')
        try:
            re.compile(line)
        except Exception:
            # re.compile can raise more than re.error (e.g. OverflowError
            # for huge repeat counts), so catch Exception, but not
            # KeyboardInterrupt/SystemExit as a bare except would.
            msg = 'Invalid regular expression: %r from file: %r'%(
                line, path)
            if log is not None:
                log.error(msg)
                raise SystemExit(1)
            else:
                raise ValueError(msg)
        pat = line
    return json.dumps(pats)
def main(args=sys.argv):
    """Command line entry point for a conversion.

    Parses ``args``, turns the parsed options into high-priority
    recommendations for the plumber and runs it. Returns 0 on success and 1
    when the command line or the CSS transform rules are invalid. Raises
    ``SystemExit(1)`` when the conversion reports user feedback.
    """
    log = Log()
    parser, plumber = create_option_parser(args, log)
    opts, leftover_args = parser.parse_args(args)
    if len(leftover_args) > 3:
        log.error('Extra arguments not understood:', u', '.join(leftover_args[3:]))
        return 1
    # These options hold paths; make them absolute (URLs are left alone).
    for x in ('read_metadata_from_opf', 'cover'):
        if getattr(opts, x, None) is not None:
            setattr(opts, x, abspath(getattr(opts, x)))
    if opts.search_replace:
        # The option value is a file name; replace it by the parsed patterns.
        opts.search_replace = read_sr_patterns(opts.search_replace, log)
    if opts.transform_css_rules:
        from calibre.ebooks.css_transform_rules import import_rules, validate_rule
        with open(opts.transform_css_rules, 'rb') as tcr:
            opts.transform_css_rules = rules = list(import_rules(tcr.read()))
        for rule in rules:
            title, msg = validate_rule(rule)
            if title and msg:
                log.error('Failed to parse CSS transform rules')
                log.error(title)
                log.error(msg)
                return 1

    recommendations = [(n.dest, getattr(opts, n.dest),
                        OptionRecommendation.HIGH)
                       for n in parser.options_iter()
                       if n.dest]
    plumber.merge_ui_recommendations(recommendations)

    try:
        plumber.run()
    except ConversionUserFeedBack as e:
        ll = {'info': log.info, 'warn': log.warn,
              'error': log.error}.get(e.level, log.info)
        ll(e.title)
        if e.det_msg:
            # Read the same attribute the guard tests; the misspelled
            # ``detmsg`` raised AttributeError whenever details were present.
            log.debug(e.det_msg)
        ll(e.msg)
        raise SystemExit(1)

    log(_('Output saved to'), ' ', plumber.output)

    return 0
def manual_index_strings():
    """Return the introductory text for the conversion options, passed
    through ``_``.

    The returned text still contains one ``%s`` placeholder; filling it in
    is left to the caller.
    """
    # The literal is kept exactly as is: it is the lookup key given to ``_``.
    return _('''\
The options and default values for the options change depending on both the
input and output formats, so you should always check with::
%s
Below are the options that are common to all conversion, followed by the
options specific to every input and output format.''')
# Script entry point: the return value of main() becomes the exit status.
if __name__ == '__main__':
    raise SystemExit(main())

View File

@@ -0,0 +1,10 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

View File

@@ -0,0 +1,29 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
from calibre.customize.conversion import InputFormatPlugin
from polyglot.builtins import getcwd
class AZW4Input(InputFormatPlugin):
    """Input plugin that converts AZW4 files to HTML."""

    name = 'AZW4 Input'
    author = 'John Schember'
    description = 'Convert AZW4 to HTML'
    file_types = {'azw4'}
    commit_name = 'azw4_input'

    def convert(self, stream, options, file_ext, log, accelerators):
        """Extract the book in ``stream`` into the current directory.

        Returns whatever the AZW4 reader's ``extract_content`` returns
        (named ``opf`` in the original code).
        """
        from calibre.ebooks.pdb.header import PdbHeaderReader
        from calibre.ebooks.azw4.reader import Reader

        pdb_header = PdbHeaderReader(stream)
        azw4_reader = Reader(pdb_header, stream, log, options)
        return azw4_reader.extract_content(getcwd())

View File

@@ -0,0 +1,202 @@
from __future__ import absolute_import, division, print_function, unicode_literals
''' CHM File decoding support '''
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>,' \
' and Alex Bramley <a.bramley at gmail.com>.'
import os
from calibre.customize.conversion import InputFormatPlugin
from calibre.ptempfile import TemporaryDirectory
from calibre.constants import filesystem_encoding
from polyglot.builtins import unicode_type, as_bytes
class CHMInput(InputFormatPlugin):
    """Input plugin that converts CHM files to OEB."""

    # Plugin metadata.
    name = 'CHM Input'
    author = 'Kovid Goyal and Alex Bramley'
    description = 'Convert CHM files to OEB'
    # File extensions handled by this plugin.
    file_types = {'chm'}
    commit_name = 'chm_input'
def _chmtohtml(self, output_dir, chm_path, no_images, log, debug_dump=False):
    """Extract the CHM file at ``chm_path`` into ``output_dir``.

    The reader is stored on ``self._chm_reader`` once extraction succeeded
    and its ``hhc_path`` is returned. ``no_images`` is accepted but unused.
    """
    from calibre.ebooks.chm.reader import CHMReader

    log.debug('Opening CHM file')
    reader = CHMReader(chm_path, log,
                       input_encoding=self.opts.input_encoding)
    log.debug('Extracting CHM to %s' % output_dir)
    reader.extract_content(output_dir, debug_dump=debug_dump)
    self._chm_reader = reader
    return reader.hhc_path
def convert(self, stream, options, file_ext, log, accelerators):
    """Convert the CHM file behind ``stream`` into an OEB book.

    The CHM is extracted into a temporary directory, an HTML root file is
    generated from its table of contents and that file is run through the
    HTML input plugin. Returns the resulting OEB book.

    ``options`` is modified in place: the HTML input plugin's recommended
    values are applied and ``input_encoding`` is set to ``'utf-8'``.
    """
    from calibre.ebooks.chm.metadata import get_metadata_from_reader
    from calibre.customize.ui import plugin_for_input_format
    self.opts = options

    log.debug('Processing CHM...')
    with TemporaryDirectory('_chm2oeb') as tdir:
        if not isinstance(tdir, unicode_type):
            tdir = tdir.decode(filesystem_encoding)
        # Apply the HTML input plugin's recommended option values.
        html_input = plugin_for_input_format('html')
        for opt in html_input.options:
            setattr(options, opt.option.name, opt.recommended_value)
        no_images = False  # options.no_images
        chm_name = stream.name
        # chm_data = stream.read()

        # closing stream so CHM can be opened by external library
        stream.close()
        log.debug('tdir=%s' % tdir)
        log.debug('stream.name=%s' % stream.name)
        debug_dump = False
        # Remember the debug directory; it is cleared below and restored
        # after the HTML conversion.
        odi = options.debug_pipeline
        if odi:
            debug_dump = os.path.join(odi, 'input')
        mainname = self._chmtohtml(tdir, chm_name, no_images, log,
                                   debug_dump=debug_dump)
        mainpath = os.path.join(tdir, mainname)

        try:
            metadata = get_metadata_from_reader(self._chm_reader)
        except Exception:
            # Fall back to metadata built from the file name alone.
            log.exception('Failed to read metadata, using filename')
            from calibre.ebooks.metadata.book.base import Metadata
            metadata = Metadata(os.path.basename(chm_name))
        # First available of: reader's encoding, user's encoding, cp1252.
        encoding = self._chm_reader.get_encoding() or options.input_encoding or 'cp1252'
        self._chm_reader.CloseCHM()
        # print((tdir, mainpath))
        # from calibre import ipython
        # ipython()

        options.debug_pipeline = None
        options.input_encoding = 'utf-8'
        uenc = encoding
        # Files listed in re_encoded_files are decoded as UTF-8 instead of
        # the detected encoding.
        if os.path.abspath(mainpath) in self._chm_reader.re_encoded_files:
            uenc = 'utf-8'
        htmlpath, toc = self._create_html_root(mainpath, log, uenc)
        oeb = self._create_oebbook_html(htmlpath, tdir, options, log, metadata)
        options.debug_pipeline = odi
        if toc.count() > 1:
            # The first spine item is the generated contents page: turn it
            # into the book's TOC and drop it from the book itself.
            oeb.toc = self.parse_html_toc(oeb.spine[0])
            oeb.manifest.remove(oeb.spine[0])
            oeb.auto_generated_toc = False
    return oeb
def parse_html_toc(self, item):
    """Build a TOC from the nested ``<div>``/``<a>`` markup in ``item.data``.

    Starting at the first ``<div>`` of the document, every child ``<div>``
    becomes a TOC entry taken from its first ``<a>`` (text and ``href``);
    nested ``<div>`` elements become child entries.
    """
    from calibre.ebooks.oeb.base import TOC, XPath

    child_divs = XPath('./h:div')
    first_link = XPath('./h:a[1]')

    def fill(toc_node, div):
        for sub_div in child_divs(div):
            link = first_link(sub_div)[0]
            entry = toc_node.add(link.text, link.attrib['href'])
            fill(entry, sub_div)

    toc = TOC()
    top_div = XPath('//h:div[1]')(item.data)[0]
    fill(toc, top_div)
    return toc
def _create_oebbook_html(self, htmlpath, basedir, opts, log, mi):
    """Create an OEB book from ``htmlpath`` with the HTML input plugin.

    Sets ``opts.breadth_first`` to True before the book is created.
    """
    from calibre.customize.builtins import HTMLInput

    opts.breadth_first = True
    html_plugin = HTMLInput(None)
    return html_plugin.create_oebbook(htmlpath, basedir, opts, log, mi)
def _create_html_root(self, hhcpath, log, encoding):
    """Write an HTML root file next to the ``.hhc`` file at ``hhcpath``.

    The ``.hhc`` data is decoded with ``encoding`` and parsed into a TOC.
    If the TOC has more than one entry, a page of nested ``<div><a>``
    links mirroring it is written; otherwise the decoded ``.hhc`` content
    is written as is. Returns ``(htmlpath, toc)``.
    """
    from lxml import html
    from polyglot.urllib import unquote as _unquote
    from calibre.ebooks.oeb.base import urlquote
    from calibre.ebooks.chardet import xml_to_unicode
    hhcdata = self._read_file(hhcpath)
    hhcdata = hhcdata.decode(encoding)
    hhcdata = xml_to_unicode(hhcdata, verbose=True,
                        strip_encoding_pats=True, resolve_entities=True)[0]
    hhcroot = html.fromstring(hhcdata)
    toc = self._process_nodes(hhcroot)
    # print("=============================")
    # print("Printing hhcroot")
    # print(etree.tostring(hhcroot, pretty_print=True))
    # print("=============================")
    log.debug('Found %d section nodes' % toc.count())
    htmlpath = os.path.splitext(hhcpath)[0] + ".html"
    base = os.path.dirname(os.path.abspath(htmlpath))

    def unquote(x):
        # Percent-decode x via UTF-8 bytes and return text.
        if isinstance(x, unicode_type):
            x = x.encode('utf-8')
        return _unquote(x).decode('utf-8')

    def unquote_path(x):
        # Use the decoded form only when the literal path does not exist
        # under base but the decoded one does. ``base`` is the enclosing
        # function's variable, which is rebound further down.
        y = unquote(x)
        if (not os.path.exists(os.path.join(base, x)) and os.path.exists(os.path.join(base, y))):
            x = y
        return x

    def donode(item, parent, base, subpath):
        # Append a <div><a/></div> to parent for each titled child of item,
        # recursing into the child. DIV and A are bound by the import in
        # the ``with`` block below, before donode is first called.
        for child in item:
            title = child.title
            if not title:
                continue
            raw = unquote_path(child.href or '')
            rsrcname = os.path.basename(raw)
            rsrcpath = os.path.join(subpath, rsrcname)
            # Prefer subpath/<basename>; fall back to the raw href when only
            # that exists on disk.
            if (not os.path.exists(os.path.join(base, rsrcpath)) and os.path.exists(os.path.join(base, raw))):
                rsrcpath = raw

            # Paths containing '%' are not quoted (again).
            if '%' not in rsrcpath:
                rsrcpath = urlquote(rsrcpath)
            if not raw:
                rsrcpath = ''
            c = DIV(A(title, href=rsrcpath))
            donode(child, c, base, subpath)
            parent.append(c)

    with open(htmlpath, 'wb') as f:
        if toc.count() > 1:
            from lxml.html.builder import HTML, BODY, DIV, A
            path0 = toc[0].href
            path0 = unquote_path(path0)
            # Directory of the first TOC entry, tried as the location of
            # every linked resource in donode().
            subpath = os.path.dirname(path0)
            base = os.path.dirname(f.name)
            root = DIV()
            donode(toc, root, base, subpath)
            raw = html.tostring(HTML(BODY(root)), encoding='utf-8',
                               pretty_print=True)
            f.write(raw)
        else:
            f.write(as_bytes(hhcdata))
    return htmlpath, toc
def _read_file(self, name):
    """Return the complete binary content of the file ``name``."""
    with lopen(name, 'rb') as stream:
        return stream.read()
def add_node(self, node, toc, ancestor_map):
    """Add a TOC entry for ``node`` if its type matches ``text/sitemap``.

    The entry is added below the TOC entry recorded in ``ancestor_map`` for
    the ``object`` of the enclosing list item, or below ``toc`` if there is
    none. Title and href come from the node's ``name`` and ``local``
    params. The new entry is recorded in ``ancestor_map`` under ``node``.
    """
    from calibre.ebooks.chm.reader import match_string

    if not match_string(node.attrib.get('type', ''), 'text/sitemap'):
        return
    enclosing = node.xpath('ancestor::ul[1]/ancestor::li[1]/object[1]')
    parent = enclosing[0] if enclosing else None
    target = ancestor_map.get(parent, toc)
    title = href = ''
    for param in node.xpath('./param'):
        param_name = param.attrib['name']
        if match_string(param_name, 'name'):
            title = param.attrib['value']
        elif match_string(param_name, 'local'):
            href = param.attrib['value']
    ancestor_map[node] = target.add(title or _('Unknown'), href)
def _process_nodes(self, root):
    """Return a TOC built from every ``object`` element below ``root``."""
    from calibre.ebooks.oeb.base import TOC

    toc, entries_by_node = TOC(), {}
    for obj in root.xpath('//object'):
        self.add_node(obj, toc, entries_by_node)
    return toc

View File

@@ -0,0 +1,310 @@
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
'''
Based on ideas from comiclrf created by FangornUK.
'''
import shutil, textwrap, codecs, os
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
from calibre import CurrentDir
from calibre.ptempfile import PersistentTemporaryDirectory
from polyglot.builtins import getcwd, map
class ComicInput(InputFormatPlugin):
    """Input plugin for comic archives (.cbz, .cbr) and collections (.cbc)."""

    name = 'Comic Input'
    author = 'Kovid Goyal'
    description = 'Optimize comic files (.cbz, .cbr, .cbc) for viewing on portable devices'
    file_types = {'cbz', 'cbr', 'cbc'}
    is_image_collection = True
    commit_name = 'comic_input'
    core_usage = -1

    # Options specific to comic conversion.
    options = {
        OptionRecommendation(name='colors', recommended_value=0,
            help=_('Reduce the number of colors used in the image. This works only'
                   ' if you choose the PNG output format. It is useful to reduce file sizes.'
                   ' Set to zero to turn off. Maximum value is 256. It is off by default.')),
        OptionRecommendation(name='dont_normalize', recommended_value=False,
            help=_('Disable normalize (improve contrast) color range '
            'for pictures. Default: False')),
        OptionRecommendation(name='keep_aspect_ratio', recommended_value=False,
            help=_('Maintain picture aspect ratio. Default is to fill the screen.')),
        OptionRecommendation(name='dont_sharpen', recommended_value=False,
            help=_('Disable sharpening.')),
        OptionRecommendation(name='disable_trim', recommended_value=False,
            help=_('Disable trimming of comic pages. For some comics, '
                     'trimming might remove content as well as borders.')),
        OptionRecommendation(name='landscape', recommended_value=False,
            help=_("Don't split landscape images into two portrait images")),
        OptionRecommendation(name='wide', recommended_value=False,
            help=_("Keep aspect ratio and scale image using screen height as "
            "image width for viewing in landscape mode.")),
        OptionRecommendation(name='right2left', recommended_value=False,
              help=_('Used for right-to-left publications like manga. '
              'Causes landscape pages to be split into portrait pages '
              'from right to left.')),
        OptionRecommendation(name='despeckle', recommended_value=False,
              help=_('Enable Despeckle. Reduces speckle noise. '
              'May greatly increase processing time.')),
        OptionRecommendation(name='no_sort', recommended_value=False,
              help=_("Don't sort the files found in the comic "
              "alphabetically by name. Instead use the order they were "
              "added to the comic.")),
        OptionRecommendation(name='output_format', choices=['png', 'jpg'],
            recommended_value='png', help=_('The format that images in the created e-book '
                'are converted to. You can experiment to see which format gives '
                'you optimal size and look on your device.')),
        OptionRecommendation(name='no_process', recommended_value=False,
              help=_("Apply no processing to the image")),
        OptionRecommendation(name='dont_grayscale', recommended_value=False,
            help=_('Do not convert the image to grayscale (black and white)')),
        OptionRecommendation(name='comic_image_size', recommended_value=None,
            help=_('Specify the image size as widthxheight pixels. Normally,'
                ' an image size is automatically calculated from the output '
                'profile, this option overrides it.')),
        OptionRecommendation(name='dont_add_comic_pages_to_toc', recommended_value=False,
            help=_('When converting a CBC do not add links to each page to'
                ' the TOC. Note this only applies if the TOC has more than one'
                ' section')),
    }

    # High-priority overrides for general conversion options. The misspelled
    # 'page_breaks_brefore' entry was dropped: it duplicated the correctly
    # spelled 'page_breaks_before' entry below.
    recommendations = {
        ('margin_left', 0, OptionRecommendation.HIGH),
        ('margin_top', 0, OptionRecommendation.HIGH),
        ('margin_right', 0, OptionRecommendation.HIGH),
        ('margin_bottom', 0, OptionRecommendation.HIGH),
        ('insert_blank_line', False, OptionRecommendation.HIGH),
        ('remove_paragraph_spacing', False, OptionRecommendation.HIGH),
        ('change_justification', 'left', OptionRecommendation.HIGH),
        ('dont_split_on_pagebreaks', True, OptionRecommendation.HIGH),
        ('chapter', None, OptionRecommendation.HIGH),
        ('use_auto_toc', False, OptionRecommendation.HIGH),
        ('page_breaks_before', None, OptionRecommendation.HIGH),
        ('disable_font_rescaling', True, OptionRecommendation.HIGH),
        ('linearize_tables', False, OptionRecommendation.HIGH),
    }
def get_comics_from_collection(self, stream):
    """Unpack a comic collection and list the comics it declares.

    The archive is extracted into a persistent temporary directory. Its
    ``comics.txt`` holds one ``filename:title`` entry per line; a missing
    title defaults to the file name without its extension. Only readable
    files are kept.

    Returns a list of ``[title, path]`` pairs. Raises ``ValueError`` when
    ``comics.txt`` is missing or no listed comic is readable.
    """
    from calibre.libunzip import extract as zipextract

    tdir = PersistentTemporaryDirectory('_comic_collection')
    zipextract(stream, tdir)
    comics = []
    with CurrentDir(tdir):
        if not os.path.exists('comics.txt'):
            raise ValueError((
                '%s is not a valid comic collection'
                ' no comics.txt was found in the file')
                    %stream.name)
        with open('comics.txt', 'rb') as listing:
            raw = listing.read()

        # A leading byte order mark selects the codec and is then dropped.
        for bom, codec in ((codecs.BOM_UTF16_BE, 'utf-16-be'),
                           (codecs.BOM_UTF16_LE, 'utf-16-le'),
                           (codecs.BOM_UTF8, 'utf-8')):
            if raw.startswith(bom):
                text = raw.decode(codec)[1:]
                break
        else:
            text = raw.decode('utf-8')

        for entry in text.splitlines():
            entry = entry.strip()
            if not entry:
                continue
            fname, sep, title = entry.partition(':')
            fname = os.path.join(tdir, *fname.replace('#', '_').split('/'))
            if not title:
                title = os.path.basename(fname).rpartition('.')[0]
            if os.access(fname, os.R_OK):
                comics.append([title, fname])
    if not comics:
        raise ValueError('%s has no comics'%stream.name)
    return comics
def get_pages(self, comic, tdir2):
    """Extract ``comic`` and return the paths of its pages in ``tdir2``.

    With the ``no_process`` option the pages are copied into ``tdir2``
    under an index-prefixed name; otherwise they are run through
    ``process_pages`` and pages that fail are logged as warnings.

    Raises ``ValueError`` when the comic has no pages, or when processing
    leaves none.
    """
    from calibre.ebooks.comic.input import (extract_comic, process_pages,
            find_pages)
    tdir = extract_comic(comic)
    new_pages = find_pages(tdir, sort_on_mtime=self.opts.no_sort,
            verbose=self.opts.verbose)
    if not new_pages:
        raise ValueError('Could not find any pages in the comic: %s'
                %comic)
    if self.opts.no_process:
        copied = []
        for i, page in enumerate(new_pages):
            dest = os.path.join(tdir2, '{} - {}' .format(i, os.path.basename(page)))
            shutil.copyfile(page, dest)
            copied.append(dest)
        new_pages = copied
    else:
        new_pages, failures = process_pages(new_pages, self.opts,
                self.report_progress, tdir2)
        if failures:
            self.log.warning('Could not process the following pages '
            '(run with --verbose to see why):')
            for failed in failures:
                self.log.warning('\t', failed)
        if not new_pages:
            raise ValueError('Could not find any valid pages in comic: %s'
                    % comic)
        # The thumbnail path that used to be probed here was never used or
        # returned, so that dead computation has been removed.
    return new_pages
def get_images(self):
    """Return the page image paths collected by the last ``convert`` call."""
    return self._images
def convert(self, stream, opts, file_ext, log, accelerators):
    """Convert a comic or comic collection into HTML wrappers plus an OPF.

    Pages are extracted (and processed) into the current directory, or
    into ``comic_N`` sub-directories when a ``cbc`` collection holds more
    than one comic. ``metadata.opf`` and ``toc.ncx`` are written to the
    current directory.

    Returns the absolute path of ``metadata.opf``. Raises ``ValueError``
    when no comic yields any pages.
    """
    from calibre.ebooks.metadata import MetaInformation
    from calibre.ebooks.metadata.opf2 import OPFCreator
    from calibre.ebooks.metadata.toc import TOC

    self.opts, self.log= opts, log
    if file_ext == 'cbc':
        comics_ = self.get_comics_from_collection(stream)
    else:
        comics_ = [['Comic', os.path.abspath(stream.name)]]
    stream.close()
    # Each entry: (title, page image paths, wrapper file paths).
    comics = []
    for i, x in enumerate(comics_):
        title, fname = x
        cdir = 'comic_%d'%(i+1) if len(comics_) > 1 else '.'
        cdir = os.path.abspath(cdir)
        if not os.path.exists(cdir):
            os.makedirs(cdir)
        pages = self.get_pages(fname, cdir)
        if not pages:
            continue
        if self.for_viewer:
            # The viewer gets one wrapper file holding all pages.
            comics.append((title, pages, [self.create_viewer_wrapper(pages)]))
        else:
            wrappers = self.create_wrappers(pages)
            comics.append((title, pages, wrappers))

    if not comics:
        raise ValueError('No comic pages found in %s'%stream.name)

    mi = MetaInformation(os.path.basename(stream.name).rpartition('.')[0],
        [_('Unknown')])
    opf = OPFCreator(getcwd(), mi)
    entries = []

    def href(x):
        # Bare file name for a single comic, else "<directory>/<file>".
        if len(comics) == 1:
            return os.path.basename(x)
        return '/'.join(x.split(os.sep)[-2:])

    cover_href = None
    for comic in comics:
        pages, wrappers = comic[1:]
        page_entries = [(x, None) for x in map(href, pages)]
        entries += [(w, None) for w in map(href, wrappers)] + page_entries
        # The first page image of the first comic serves as the cover.
        if cover_href is None and page_entries:
            cover_href = page_entries[0][0]
    opf.create_manifest(entries)
    spine = []
    for comic in comics:
        spine.extend(map(href, comic[2]))
    self._images = []
    for comic in comics:
        self._images.extend(comic[1])
    opf.create_spine(spine)
    if self.for_viewer and cover_href:
        opf.guide.set_cover(cover_href)
    toc = TOC()
    if len(comics) == 1:
        wrappers = comics[0][2]
        for i, x in enumerate(wrappers):
            toc.add_item(href(x), None, _('Page')+' %d'%(i+1),
                    play_order=i)
    else:
        # One section per comic, optionally with an entry per page; ``po``
        # is the running play order across all entries.
        po = 0
        for comic in comics:
            po += 1
            wrappers = comic[2]
            stoc = toc.add_item(href(wrappers[0]),
                    None, comic[0], play_order=po)
            if not opts.dont_add_comic_pages_to_toc:
                for i, x in enumerate(wrappers):
                    stoc.add_item(href(x), None,
                            _('Page')+' %d'%(i+1), play_order=po)
                    po += 1
    opf.set_toc(toc)
    with open('metadata.opf', 'wb') as m, open('toc.ncx', 'wb') as n:
        opf.render(m, n, 'toc.ncx')
    return os.path.abspath('metadata.opf')
def create_wrappers(self, pages):
    """Write one XHTML wrapper file per page image.

    The wrappers are named ``page_N.xhtml`` (N starting at 1) and are
    written to the directory of the first page. Returns their paths in
    page order.
    """
    from calibre.ebooks.oeb.base import XHTML_NS
    WRAPPER = textwrap.dedent('''\
<html xmlns="%s">
<head>
<meta charset="utf-8"/>
<title>Page #%d</title>
<style type="text/css">
@page { margin:0pt; padding: 0pt}
body { margin: 0pt; padding: 0pt}
div { text-align: center }
</style>
</head>
<body>
<div>
<img src="%s" alt="comic page #%d" />
</div>
</body>
</html>
''')
    out_dir = os.path.dirname(pages[0])
    wrappers = []
    for number, image in enumerate(pages, 1):
        markup = WRAPPER%(XHTML_NS, number, os.path.basename(image), number)
        wrapper_path = os.path.join(out_dir, 'page_%d.xhtml'%number)
        with open(wrapper_path, 'wb') as f:
            f.write(markup.encode('utf-8'))
        wrappers.append(wrapper_path)
    return wrappers
def create_viewer_wrapper(self, pages):
    """Write a single XHTML file that shows every page image in order.

    The file is written as ``wrapper.xhtml`` into the directory of the
    first page, so the bare image file names used as ``src`` resolve
    against it. Returns the path of the written file.
    """
    from calibre.ebooks.oeb.base import XHTML_NS

    def page(src):
        return '<img src="{}"></img>'.format(os.path.basename(src))

    # Take the directory from the first page path before building the
    # markup. The old code rebound ``pages`` to the joined markup string
    # first, so ``pages[0]`` was the character '<' and the wrapper always
    # landed in the current directory.
    base = os.path.dirname(pages[0])
    images = '\n'.join(map(page, pages))
    wrapper = '''
<html xmlns="%s">
<head>
<meta charset="utf-8"/>
<style type="text/css">
html, body, img { height: 100vh; display: block; margin: 0; padding: 0; border-width: 0; }
img {
width: 100%%; height: 100%%;
object-fit: contain;
margin-left: auto; margin-right: auto;
max-width: 100vw; max-height: 100vh;
top: 50vh; transform: translateY(-50%%);
position: relative;
page-break-after: always;
}
</style>
</head>
<body>
%s
</body>
</html>
''' % (XHTML_NS, images)
    path = os.path.join(base, 'wrapper.xhtml')
    with open(path, 'wb') as f:
        f.write(wrapper.encode('utf-8'))
    return path

View File

@@ -0,0 +1,67 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL 3'
__copyright__ = '2011, Anthon van der Neut <anthon@mnt.org>'
__docformat__ = 'restructuredtext en'
import os
from io import BytesIO
from calibre.customize.conversion import InputFormatPlugin
from polyglot.builtins import getcwd
class DJVUInput(InputFormatPlugin):
    """Input plugin that converts the text layer of DJVU files to HTML."""

    name = 'DJVU Input'
    author = 'Anthon van der Neut'
    description = 'Convert OCR-ed DJVU files (.djvu) to HTML'
    file_types = {'djvu', 'djv'}
    commit_name = 'djvu_input'

    def convert(self, stream, options, file_ext, log, accelerators):
        """Convert the text of the DJVU file in ``stream`` to an OEB book.

        The text is turned into basic HTML, written to a temporary
        ``index*.html`` file in the current directory and run through the
        HTML input plugin. Metadata read from ``stream`` is then applied.

        Raises ``ValueError`` when the file contains no text.
        """
        from calibre.ebooks.txt.processor import convert_basic
        from calibre.ebooks.djvu.djvu import DJVUFile

        text_buffer = BytesIO()
        DJVUFile(stream).get_text(text_buffer)
        raw_text = text_buffer.getvalue()
        if not raw_text:
            raise ValueError('The DJVU file contains no text, only images, probably page scans.'
                    ' calibre only supports conversion of DJVU files with actual text in them.')

        # Newlines become spaces and each 0x1F byte becomes a blank line.
        single_line = raw_text.replace(b"\n", b' ')
        html = convert_basic(single_line.replace(b'\037', b'\n\n'))

        # Run the HTMLized text through the html processing plugin.
        from calibre.customize.ui import plugin_for_input_format
        html_input = plugin_for_input_format('html')
        for opt in html_input.options:
            setattr(options, opt.option.name, opt.recommended_value)
        options.input_encoding = 'utf-8'

        # Pick the first index*.html name that does not exist yet.
        cwd = getcwd()
        htmlfile = os.path.join(cwd, 'index.html')
        suffix = 0
        while os.path.exists(htmlfile):
            suffix += 1
            htmlfile = os.path.join(cwd, 'index%d.html'%suffix)
        with open(htmlfile, 'wb') as out:
            out.write(html.encode('utf-8'))

        # Generate oeb from html conversion, with the debug pipeline
        # switched off for the nested conversion.
        saved_debug_pipeline = options.debug_pipeline
        options.debug_pipeline = None
        with open(htmlfile, 'rb') as src:
            oeb = html_input.convert(src, options, 'html', log, {})
        options.debug_pipeline = saved_debug_pipeline
        os.remove(htmlfile)

        # Set metadata from file.
        from calibre.customize.ui import get_file_type_metadata
        from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
        mi = get_file_type_metadata(stream, file_ext)
        meta_info_to_oeb_metadata(mi, oeb.metadata, log)

        return oeb

View File

@@ -0,0 +1,34 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
class DOCXInput(InputFormatPlugin):
    """Input plugin that converts DOCX files (.docx and .docm) to HTML."""

    name = 'DOCX Input'
    author = 'Kovid Goyal'
    description = _('Convert DOCX files (.docx and .docm) to HTML')
    file_types = {'docx', 'docm'}
    commit_name = 'docx_input'

    options = {
        OptionRecommendation(name='docx_no_cover', recommended_value=False,
            help=_('Normally, if a large image is present at the start of the document that looks like a cover, '
                   'it will be removed from the document and used as the cover for created e-book. This option '
                   'turns off that behavior.')),
        OptionRecommendation(name='docx_no_pagebreaks_between_notes', recommended_value=False,
            help=_('Do not insert a page break after every endnote.')),
        OptionRecommendation(name='docx_inline_subsup', recommended_value=False,
            help=_('Render superscripts and subscripts so that they do not affect the line height.')),
    }

    recommendations = {('page_breaks_before', '/', OptionRecommendation.MED)}

    def convert(self, stream, options, file_ext, log, accelerators):
        """Run the DOCX to HTML converter on ``stream`` and return its result."""
        from calibre.ebooks.docx.to_html import Convert

        converter = Convert(
            stream,
            detect_cover=not options.docx_no_cover,
            log=log,
            notes_nopb=options.docx_no_pagebreaks_between_notes,
            nosupsub=options.docx_inline_subsup)
        return converter()

View File

@@ -0,0 +1,93 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
from calibre.customize.conversion import OutputFormatPlugin, OptionRecommendation
# Page size names accepted by the ``docx_page_size`` option:
# a0..a6, b0..b6, then legal and letter.
PAGE_SIZES = (
    ['a%d' % n for n in range(7)] +
    ['b%d' % n for n in range(7)] +
    ['legal', 'letter']
)
class DOCXOutput(OutputFormatPlugin):
    """Output plugin that writes a converted book as a DOCX file."""

    name = 'DOCX Output'
    author = 'Kovid Goyal'
    file_type = 'docx'
    commit_name = 'docx_output'
    ui_data = {'page_sizes': PAGE_SIZES}

    options = {
        OptionRecommendation(name='docx_page_size', recommended_value='letter',
            level=OptionRecommendation.LOW, choices=PAGE_SIZES,
            help=_('The size of the page. Default is letter. Choices '
            'are %s') % PAGE_SIZES),

        OptionRecommendation(name='docx_custom_page_size', recommended_value=None,
            help=_('Custom size of the document. Use the form widthxheight '
            'EG. `123x321` to specify the width and height (in pts). '
            'This overrides any specified page-size.')),

        OptionRecommendation(name='docx_no_cover', recommended_value=False,
            help=_('Do not insert the book cover as an image at the start of the document.'
                   ' If you use this option, the book cover will be discarded.')),

        OptionRecommendation(name='preserve_cover_aspect_ratio', recommended_value=False,
            help=_('Preserve the aspect ratio of the cover image instead of stretching'
                   ' it out to cover the entire page.')),

        OptionRecommendation(name='docx_no_toc', recommended_value=False,
            help=_('Do not insert the table of contents as a page at the start of the document.')),

        OptionRecommendation(name='extract_to',
            help=_('Extract the contents of the generated %s file to the '
                'specified directory. The contents of the directory are first '
                'deleted, so be careful.') % 'DOCX'),

        OptionRecommendation(name='docx_page_margin_left', recommended_value=72.0,
            level=OptionRecommendation.LOW,
            help=_('The size of the left page margin, in pts. Default is 72pt.'
                   ' Overrides the common left page margin setting.')
        ),

        OptionRecommendation(name='docx_page_margin_top', recommended_value=72.0,
            level=OptionRecommendation.LOW,
            help=_('The size of the top page margin, in pts. Default is 72pt.'
                   ' Overrides the common top page margin setting, unless set to zero.')
        ),

        OptionRecommendation(name='docx_page_margin_right', recommended_value=72.0,
            level=OptionRecommendation.LOW,
            help=_('The size of the right page margin, in pts. Default is 72pt.'
                   ' Overrides the common right page margin setting, unless set to zero.')
        ),

        OptionRecommendation(name='docx_page_margin_bottom', recommended_value=72.0,
            level=OptionRecommendation.LOW,
            help=_('The size of the bottom page margin, in pts. Default is 72pt.'
                   ' Overrides the common bottom page margin setting, unless set to zero.')
        ),
    }

    def convert_metadata(self, oeb):
        """Store the book's metadata on ``self.mi``.

        The OEB metadata is written into an OPF 2.0 ``package`` element,
        which is serialized and read back as book metadata.
        """
        from lxml import etree
        from calibre.ebooks.oeb.base import OPF, OPF2_NS
        from calibre.ebooks.metadata.opf2 import OPF as ReadOPF
        from io import BytesIO

        package = etree.Element(OPF('package'), attrib={'version': '2.0'},
                                nsmap={None: OPF2_NS})
        oeb.metadata.to_opf2(package)
        serialized = BytesIO(etree.tostring(package, encoding='utf-8'))
        parsed = ReadOPF(serialized, populate_spine=False,
                         try_to_guess_cover=False)
        self.mi = parsed.to_book_metadata()

    def convert(self, oeb, output_path, input_plugin, opts, log):
        """Write ``oeb`` as a DOCX file to ``output_path``.

        If the ``extract_to`` option is set, the generated file is also
        dumped into that directory.
        """
        from calibre.ebooks.docx.writer.container import DOCX
        from calibre.ebooks.docx.writer.from_html import Convert

        docx = DOCX(opts, log)
        self.convert_metadata(oeb)
        with_cover = not opts.docx_no_cover
        with_toc = not opts.docx_no_toc
        Convert(oeb, docx, self.mi, with_cover, with_toc)()
        docx.write(output_path, self.mi)
        if opts.extract_to:
            from calibre.ebooks.docx.dump import do_dump
            do_dump(output_path, opts.extract_to)
View File

@@ -0,0 +1,438 @@
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL 3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import os, re, posixpath
from itertools import cycle
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
from polyglot.builtins import getcwd
# Algorithm identifiers of the two supported font obfuscation schemes.
ADOBE_OBFUSCATION = 'http://ns.adobe.com/pdf/enc#RC'
IDPF_OBFUSCATION = 'http://www.idpf.org/2008/embedding'


def decrypt_font_data(key, data, algorithm):
    """Return ``data`` with its obfuscated prefix XOR-ed with ``key``.

    Only the first 1024 bytes (Adobe algorithm) or 1040 bytes (any other
    algorithm) are changed; ``key`` is repeated as often as needed. The
    rest of ``data`` is returned unchanged.
    """
    prefix_len = 1024 if algorithm == ADOBE_OBFUSCATION else 1040
    keystream = cycle(bytearray(key))
    obfuscated = bytearray(data[:prefix_len])
    plain = bytearray(byte ^ next(keystream) for byte in obfuscated)
    return bytes(plain) + data[prefix_len:]
def decrypt_font(key, path, algorithm):
    """De-obfuscate the font file at ``path`` in place."""
    with lopen(path, 'r+b') as font:
        plain = decrypt_font_data(key, font.read(), algorithm)
        # Rewind, empty the file, then write the decoded content.
        font.seek(0)
        font.truncate()
        font.write(plain)
class EPUBInput(InputFormatPlugin):
    """Input plugin that converts EPUB files (.epub) to HTML."""

    # Plugin metadata.
    name = 'EPUB Input'
    author = 'Kovid Goyal'
    description = 'Convert EPUB files (.epub) to HTML'
    # File extensions handled by this plugin.
    file_types = {'epub'}
    output_encoding = None
    commit_name = 'epub_input'

    # Medium-priority recommendation setting ``page_breaks_before`` to '/'.
    recommendations = {('page_breaks_before', '/', OptionRecommendation.MED)}
def process_encryption(self, encfile, opf, log):
    """De-obfuscate the fonts listed in the encryption file ``encfile``.

    Two keys are derived from ``opf``: the SHA-1 digest of the unique
    identifier with whitespace removed (IDPF algorithm) and the bytes of a
    UUID identifier (Adobe algorithm). Every referenced file that exists
    and has a usable key is decrypted in place and its URI is appended to
    ``self._encrypted_font_uris``.

    Returns True when all entries use a supported obfuscation algorithm,
    False when another algorithm is found or processing fails. Errors are
    printed as tracebacks, not raised.
    """
    from lxml import etree
    import uuid, hashlib
    idpf_key = opf.raw_unique_identifier
    if idpf_key:
        # Strip space, tab, CR and LF before hashing.
        idpf_key = re.sub('[\u0020\u0009\u000d\u000a]', '', idpf_key)
        idpf_key = hashlib.sha1(idpf_key.encode('utf-8')).digest()
    key = None
    for item in opf.identifier_iter():
        scheme = None
        for xkey in item.attrib.keys():
            if xkey.endswith('scheme'):
                scheme = item.get(xkey)
        if (scheme and scheme.lower() == 'uuid') or \
                (item.text and item.text.startswith('urn:uuid:')):
            try:
                key = item.text.rpartition(':')[-1]
                key = uuid.UUID(key).bytes
            except Exception:
                # A malformed identifier only disables the Adobe key. Catch
                # Exception, not everything, so KeyboardInterrupt and
                # SystemExit still propagate.
                import traceback
                traceback.print_exc()
                key = None

    try:
        root = etree.parse(encfile)
        for em in root.xpath('descendant::*[contains(name(), "EncryptionMethod")]'):
            algorithm = em.get('Algorithm', '')
            if algorithm not in {ADOBE_OBFUSCATION, IDPF_OBFUSCATION}:
                return False
            cr = em.getparent().xpath('descendant::*[contains(name(), "CipherReference")]')[0]
            uri = cr.get('URI')
            # URIs are resolved against the parent of encfile's directory.
            path = os.path.abspath(os.path.join(os.path.dirname(encfile), '..', *uri.split('/')))
            tkey = (key if algorithm == ADOBE_OBFUSCATION else idpf_key)
            if (tkey and os.path.exists(path)):
                self._encrypted_font_uris.append(uri)
                decrypt_font(tkey, path, algorithm)
        return True
    except Exception:
        import traceback
        traceback.print_exc()
    return False
def set_guide_type(self, opf, gtype, href=None, title=''):
    """Replace the guide entries of type ``gtype`` in ``opf``.

    Existing entries of that type are removed. If ``href`` is given, a new
    entry is appended to the first ``guide`` element, creating one if
    necessary. The new entry is returned only when a guide element had to
    be created; otherwise the result is None.
    """
    stale = [ref for ref in opf.iterguide()
             if ref.get('type', '').lower() == gtype]
    for ref in stale:
        ref.getparent().remove(ref)

    if href is None:
        return

    entry = opf.create_guide_item(gtype, title, href)
    guides = opf.root.xpath('./*[local-name()="guide"]')
    if guides:
        guides[0].append(entry)
        return
    new_guide = opf.create_guide_element()
    opf.root.append(new_guide)
    new_guide.append(entry)
    return entry
def rationalize_cover3(self, opf, log):
    ''' If there is a reference to the cover/titlepage via manifest properties, convert to
    entries in the <guide> so that the rest of the pipeline picks it up.

    Returns the href of the titlepage when it was removed from the spine,
    otherwise None. '''
    from calibre.ebooks.metadata.opf3 import items_with_property
    removed = guide_titlepage_href = guide_titlepage_id = None

    # Look for titlepages incorrectly marked in the <guide> as covers
    guide_cover, guide_elem = None, None
    for guide_elem in opf.iterguide():
        if guide_elem.get('type', '').lower() == 'cover':
            guide_cover = guide_elem.get('href', '').partition('#')[0]
            break
    if guide_cover:
        spine = list(opf.iterspine())
        if spine:
            # A guide "cover" that is the first spine item is treated as
            # an HTML titlepage.
            idref = spine[0].get('idref', '')
            for x in opf.itermanifest():
                if x.get('id') == idref and x.get('href') == guide_cover:
                    guide_titlepage_href = guide_cover
                    guide_titlepage_id = idref
                    break

    # Point the guide's cover entry at the raster cover, if there is one.
    raster_cover_href = opf.epub3_raster_cover or opf.raster_cover
    if raster_cover_href:
        self.set_guide_type(opf, 'cover', raster_cover_href, 'Cover Image')
    titlepage_id = titlepage_href = None
    for item in items_with_property(opf.root, 'calibre:title-page'):
        tid, href = item.get('id'), item.get('href')
        if href and tid:
            titlepage_id, titlepage_href = tid, href.partition('#')[0]
            break
    # No explicitly marked titlepage: fall back to the one found via the
    # guide above.
    if titlepage_href is None:
        titlepage_href, titlepage_id = guide_titlepage_href, guide_titlepage_id
    if titlepage_href is not None:
        self.set_guide_type(opf, 'titlepage', titlepage_href, 'Title Page')
        spine = list(opf.iterspine())
        # The titlepage is only touched when the spine has other items.
        if len(spine) > 1:
            for item in spine:
                if item.get('idref') == titlepage_id:
                    log('Found HTML cover', titlepage_href)
                    if self.for_viewer:
                        item.attrib.pop('linear', None)
                    else:
                        item.getparent().remove(item)
                        removed = titlepage_href
                    # NOTE(review): indentation was lost in this copy; the
                    # return is assumed to sit inside the match branch --
                    # confirm against the upstream file.
                    return removed
def rationalize_cover2(self, opf, log):
    ''' Ensure that the cover information in the guide is correct. That
    means, at most one entry with type="cover" that points to a raster
    cover and at most one entry with type="titlepage" that points to an
    HTML titlepage.

    The OPF object is modified in place. Returns the href of the HTML cover
    when it was removed from the spine, otherwise None. '''
    from calibre.ebooks.oeb.base import OPF
    removed = None
    from lxml import etree
    # Find the first guide reference of type "cover". After the loop
    # guide_elem is that reference, or the last reference visited when no
    # cover was found, or None when the guide has no references at all.
    guide_cover, guide_elem = None, None
    for guide_elem in opf.iterguide():
        if guide_elem.get('type', '').lower() == 'cover':
            guide_cover = guide_elem.get('href', '').partition('#')[0]
            break
    if not guide_cover:
        # No usable cover reference: if the OPF names a raster cover, add a
        # guide reference for it (creating <guide> if needed), then stop.
        raster_cover = opf.raster_cover
        if raster_cover:
            if guide_elem is None:
                g = opf.root.makeelement(OPF('guide'))
                opf.root.append(g)
            else:
                g = guide_elem.getparent()
            guide_cover = raster_cover
            guide_elem = g.makeelement(OPF('reference'), attrib={'href':raster_cover, 'type':'cover'})
            g.append(guide_elem)
        return
    spine = list(opf.iterspine())
    if not spine:
        return
    # Check if the cover specified in the guide is also
    # the first element in spine
    idref = spine[0].get('idref', '')
    manifest = list(opf.itermanifest())
    if not manifest:
        return
    elem = [x for x in manifest if x.get('id', '') == idref]
    if not elem or elem[0].get('href', None) != guide_cover:
        return
    log('Found HTML cover', guide_cover)
    # Remove from spine as covers must be treated
    # specially
    if not self.for_viewer:
        if len(spine) == 1:
            log.warn('There is only a single spine item and it is marked as the cover. Removing cover marking.')
            # Iterate over a snapshot because references are removed while looping
            for guide_elem in tuple(opf.iterguide()):
                if guide_elem.get('type', '').lower() == 'cover':
                    guide_elem.getparent().remove(guide_elem)
            return
        else:
            spine[0].getparent().remove(spine[0])
            removed = guide_cover
    else:
        # Ensure the cover is displayed as the first item in the book, some
        # epub files have it set with linear='no' which causes the cover to
        # display in the end
        spine[0].attrib.pop('linear', None)
        opf.spine[0].is_linear = True
    # Ensure that the guide has a cover entry pointing to a raster cover
    # and a titlepage entry pointing to the html titlepage. The titlepage
    # entry will be used by the epub output plugin, the raster cover entry
    # by other output plugins.
    # Search for a raster cover identified in the OPF
    raster_cover = opf.raster_cover
    # Set the cover guide entry
    if raster_cover is not None:
        guide_elem.set('href', raster_cover)
    else:
        # Render the titlepage to create a raster cover
        from calibre.ebooks import render_html_svg_workaround
        guide_elem.set('href', 'calibre_raster_cover.jpg')
        # Register the (possibly not yet written) image in the manifest,
        # next to the manifest item of the HTML cover.
        t = etree.SubElement(
            elem[0].getparent(), OPF('item'), href=guide_elem.get('href'), id='calibre_raster_cover')
        t.set('media-type', 'image/jpeg')
        if os.path.exists(guide_cover):
            renderer = render_html_svg_workaround(guide_cover, log)
            # The file is only written when rendering produced data
            if renderer is not None:
                with lopen('calibre_raster_cover.jpg', 'wb') as f:
                    f.write(renderer)
    # Set the titlepage guide entry
    self.set_guide_type(opf, 'titlepage', guide_cover, 'Title Page')
    return removed
def find_opf(self):
    '''Locate the package document declared in META-INF/container.xml.

    Returns the path (joined onto the current working directory) of the
    first declared OPF that exists on disk, or None when there is none.
    Errors while reading or parsing the container are printed, not raised.'''
    from calibre.utils.xml_parse import safe_xml_fromstring

    def attr_by_suffix(node, suffix):
        # Attribute names may carry a namespace prefix, so match on the suffix
        return next((val for key, val in node.attrib.items() if key.endswith(suffix)), None)

    try:
        with lopen('META-INF/container.xml', 'rb') as container:
            root = safe_xml_fromstring(container.read())
            for rootfile in root.xpath('//*[local-name()="rootfile"]'):
                if attr_by_suffix(rootfile, 'media-type') != "application/oebps-package+xml":
                    continue
                declared = attr_by_suffix(rootfile, 'full-path')
                if declared:
                    candidate = os.path.join(getcwd(), *declared.split('/'))
                    if os.path.exists(candidate):
                        return candidate
    except Exception:
        import traceback
        traceback.print_exc()
    return None
def convert(self, stream, options, file_ext, log, accelerators):
    '''Unpack the EPUB in ``stream`` into the current directory and write a
    normalized ``content.opf`` for the rest of the pipeline.

    :param stream: Seekable file-like object containing the EPUB (a ZIP file).
    :param options: Conversion options, passed on to convert_epub3_nav().
    :param log: Logger used for diagnostics.
    :return: Absolute path of the ``content.opf`` that was written.
    :raises ValueError: when no OPF can be found, the book uses DTBook
        markup, or no valid spine entry remains.
    :raises DRMError: when process_encryption() reports that the
        encryption cannot be handled.
    '''
    from calibre.utils.zipfile import ZipFile
    from calibre import walk
    from calibre.ebooks import DRMError
    from calibre.ebooks.metadata.opf2 import OPF
    try:
        zf = ZipFile(stream)
        zf.extractall(getcwd())
    except Exception:
        # Only real errors trigger the fallback; KeyboardInterrupt and
        # SystemExit must propagate instead of starting a second extraction.
        log.exception('EPUB appears to be invalid ZIP file, trying a'
                ' more forgiving ZIP parser')
        from calibre.utils.localunzip import extractall
        stream.seek(0)
        extractall(stream)
    encfile = os.path.abspath(os.path.join('META-INF', 'encryption.xml'))
    opf = self.find_opf()
    if opf is None:
        # container.xml was unusable: fall back to the first .opf found on
        # disk, ignoring macOS resource forks and hidden files.
        for candidate in walk('.'):
            if candidate.lower().endswith('.opf') and '__MACOSX' not in candidate and \
                    not os.path.basename(candidate).startswith('.'):
                opf = os.path.abspath(candidate)
                break
    path = getattr(stream, 'name', 'stream')

    if opf is None:
        raise ValueError('%s is not a valid EPUB file (could not find opf)'%path)

    opf = os.path.relpath(opf, getcwd())
    parts = os.path.split(opf)
    opf = OPF(opf, os.path.dirname(os.path.abspath(opf)))

    self._encrypted_font_uris = []
    if os.path.exists(encfile):
        if not self.process_encryption(encfile, opf, log):
            raise DRMError(os.path.basename(path))
    self.encrypted_fonts = self._encrypted_font_uris

    if len(parts) > 1 and parts[0]:
        # The OPF lives in a sub-directory but content.opf is written to the
        # current directory, so prefix every href with that sub-directory.
        delta = '/'.join(parts[:-1])+'/'

        def normpath(href):
            return posixpath.normpath(delta + href)

        for elem in opf.itermanifest():
            elem.set('href', normpath(elem.get('href')))
        for elem in opf.iterguide():
            elem.set('href', normpath(elem.get('href')))

    rationalize = self.rationalize_cover3 if opf.package_version >= 3.0 else self.rationalize_cover2
    self.removed_cover = rationalize(opf, log)
    if self.removed_cover:
        self.removed_items_to_ignore = (self.removed_cover,)
    epub3_nav = opf.epub3_nav
    if epub3_nav is not None:
        self.convert_epub3_nav(epub3_nav, opf, log, options)

    for x in opf.itermanifest():
        if x.get('media-type', '') == 'application/x-dtbook+xml':
            raise ValueError(
                'EPUB files with DTBook markup are not supported')

    # Collect ids of manifest items that must never appear in the spine
    not_for_spine = set()
    for y in opf.itermanifest():
        id_ = y.get('id', None)
        if id_:
            mt = y.get('media-type', None)
            if mt in {
                    'application/vnd.adobe-page-template+xml',
                    'application/vnd.adobe.page-template+xml',
                    'application/adobe-page-template+xml',
                    'application/adobe.page-template+xml',
                    'application/text'
            }:
                not_for_spine.add(id_)
            ext = y.get('href', '').rpartition('.')[-1].lower()
            if mt == 'text/plain' and ext in {'otf', 'ttf'}:
                # some epub authoring software sets font mime types to
                # text/plain
                not_for_spine.add(id_)
                y.set('media-type', 'application/font')

    # Drop spine entries that are empty, excluded above, or duplicated
    seen = set()
    for x in list(opf.iterspine()):
        ref = x.get('idref', None)
        if not ref or ref in not_for_spine or ref in seen:
            x.getparent().remove(x)
            continue
        seen.add(ref)

    if len(list(opf.iterspine())) == 0:
        raise ValueError('No valid entries in the spine of this EPUB')

    with lopen('content.opf', 'wb') as nopf:
        nopf.write(opf.render())

    return os.path.abspath('content.opf')
def convert_epub3_nav(self, nav_path, opf, log, opts):
    '''Build an NCX table of contents from the EPUB 3 navigation document.

    Reads the XHTML file at ``nav_path``, converts the <ol> of its
    <nav epub:type="toc"> element into NCX navPoints, writes the NCX to a
    new file next to the nav document, registers it in ``opf`` and points
    every <spine> at it. The parsed nav document and its href are stored on
    ``opts``. Returns None; returns early, changing nothing, when no toc
    <nav> with an <ol> is found.'''
    from lxml import etree
    from calibre.ebooks.chardet import xml_to_unicode
    from calibre.ebooks.oeb.polish.parsing import parse
    from calibre.ebooks.oeb.base import EPUB_NS, XHTML, NCX_MIME, NCX, urlnormalize, urlunquote, serialize
    from calibre.ebooks.oeb.polish.toc import first_child
    from calibre.utils.xml_parse import safe_xml_fromstring
    from tempfile import NamedTemporaryFile
    with lopen(nav_path, 'rb') as f:
        raw = f.read()
    raw = xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0]
    root = parse(raw, log=log)
    # Empty NCX skeleton; navPoints are appended to its <navMap>
    ncx = safe_xml_fromstring('<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1" xml:lang="eng"><navMap/></ncx>')
    navmap = ncx[0]
    et = '{%s}type' % EPUB_NS
    bn = os.path.basename(nav_path)

    def add_from_li(li, parent):
        # Create one navPoint under `parent` from the first <a>/<span> child
        # of `li`. The label is the element's text, or failing that the
        # joined title attributes of it and its descendants.
        href = text = None
        for x in li.iterchildren(XHTML('a'), XHTML('span')):
            text = etree.tostring(
                x, method='text', encoding='unicode', with_tail=False).strip() or ' '.join(
                        x.xpath('descendant-or-self::*/@title')).strip()
            href = x.get('href')
            if href:
                # Fragment-only links point into the nav document itself
                if href.startswith('#'):
                    href = bn + href
            break
        np = parent.makeelement(NCX('navPoint'))
        parent.append(np)
        np.append(np.makeelement(NCX('navLabel')))
        np[0].append(np.makeelement(NCX('text')))
        np[0][0].text = text
        if href:
            np.append(np.makeelement(NCX('content'), attrib={'src':href}))
        return np

    def process_nav_node(node, toc_parent):
        # Recursively mirror nested <ol>/<li> structure as nested navPoints
        for li in node.iterchildren(XHTML('li')):
            child = add_from_li(li, toc_parent)
            ol = first_child(li, XHTML('ol'))
            if child is not None and ol is not None:
                process_nav_node(ol, child)

    for nav in root.iterdescendants(XHTML('nav')):
        if nav.get(et) == 'toc':
            ol = first_child(nav, XHTML('ol'))
            if ol is not None:
                process_nav_node(ol, navmap)
                break
    else:
        # No toc <nav> containing an <ol>: nothing to convert
        return

    # delete=False: the NCX must outlive this with-block, it becomes part
    # of the book
    with NamedTemporaryFile(suffix='.ncx', dir=os.path.dirname(nav_path), delete=False) as f:
        f.write(etree.tostring(ncx, encoding='utf-8'))
    ncx_href = os.path.relpath(f.name, getcwd()).replace(os.sep, '/')
    ncx_id = opf.create_manifest_item(ncx_href, NCX_MIME, append=True).get('id')
    for spine in opf.root.xpath('//*[local-name()="spine"]'):
        spine.set('toc', ncx_id)
    # Remembered on opts; EPUBOutput.upgrade_to_epub3 reads these two
    # attributes back.
    opts.epub3_nav_href = urlnormalize(os.path.relpath(nav_path).replace(os.sep, '/'))
    opts.epub3_nav_parsed = root
    if getattr(self, 'removed_cover', None):
        # Mark links in the nav document that point at the removed
        # titlepage, and rewrite the file if any were found.
        changed = False
        base_path = os.path.dirname(nav_path)
        for elem in root.xpath('//*[@href]'):
            href, frag = elem.get('href').partition('#')[::2]
            link_path = os.path.relpath(os.path.join(base_path, urlunquote(href)), base_path)
            abs_href = urlnormalize(link_path)
            if abs_href == self.removed_cover:
                changed = True
                elem.set('data-calibre-removed-titlepage', '1')
        if changed:
            with lopen(nav_path, 'wb') as f:
                f.write(serialize(root, 'application/xhtml+xml'))
def postprocess_book(self, oeb, opts, log):
    '''Remember which TOC entry points at the titlepage removed by convert().

    If a cover/titlepage was removed from the spine (``self.removed_cover``)
    and it is no longer a spine item of ``oeb``, the first TOC descendant
    whose href (ignoring any fragment) equals it is stored as
    ``oeb.toc.item_that_refers_to_cover``. Returns None.'''
    rc = getattr(self, 'removed_cover', None)
    if not rc:
        return
    cover_toc_item = None
    for item in oeb.toc.iterdescendants():
        if item.href and item.href.partition('#')[0] == rc:
            cover_toc_item = item
            break
    if cover_toc_item is None:
        return
    spine_hrefs = {x.href for x in oeb.spine}
    # Compare hrefs with hrefs: testing the TOC node itself against a set of
    # strings can never match, which made this check a no-op.
    if rc not in spine_hrefs:
        oeb.toc.item_that_refers_to_cover = cover_toc_item

View File

@@ -0,0 +1,548 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import os, shutil, re
from calibre.customize.conversion import (OutputFormatPlugin,
OptionRecommendation)
from calibre.ptempfile import TemporaryDirectory
from calibre import CurrentDir
from polyglot.builtins import unicode_type, filter, map, zip, range, as_bytes
# Names of HTML elements treated as block level. EPUBOutput.workaround_ade_quirks
# consults this when choosing the height of a replaced <br>.
block_level_tags = (
    'address', 'body', 'blockquote', 'center', 'dir', 'div', 'dl',
    'fieldset', 'form',
    'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
    'hr', 'isindex', 'menu', 'noframes', 'noscript',
    'ol', 'p', 'pre', 'table', 'ul',
)
class EPUBOutput(OutputFormatPlugin):
    # Output plugin for the 'epub' file type. The attributes below are
    # declarative: plugin metadata followed by the option definitions.

    name = 'EPUB Output'
    author = 'Kovid Goyal'
    file_type = 'epub'
    commit_name = 'epub_output'
    # EPUB versions on offer; reused below as the choices of 'epub_version'
    ui_data = {'versions': ('2', '3')}

    # Options specific to this plugin. Every help text goes through _()
    # so it can be translated.
    options = {
        OptionRecommendation(name='extract_to',
            help=_('Extract the contents of the generated %s file to the '
                'specified directory. The contents of the directory are first '
                'deleted, so be careful.') % 'EPUB'),

        OptionRecommendation(name='dont_split_on_page_breaks',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Turn off splitting at page breaks. Normally, input '
                    'files are automatically split at every page break into '
                    'two files. This gives an output e-book that can be '
                    'parsed faster and with less resources. However, '
                    'splitting is slow and if your source file contains a '
                    'very large number of page breaks, you should turn off '
                    'splitting on page breaks.'
                )
        ),

        # Value is in KB; convert() multiplies it by 1024 before splitting
        OptionRecommendation(name='flow_size', recommended_value=260,
            help=_('Split all HTML files larger than this size (in KB). '
                'This is necessary as most EPUB readers cannot handle large '
                'file sizes. The default of %defaultKB is the size required '
                'for Adobe Digital Editions. Set to 0 to disable size based splitting.')
        ),

        OptionRecommendation(name='no_default_epub_cover', recommended_value=False,
            help=_('Normally, if the input file has no cover and you don\'t'
            ' specify one, a default cover is generated with the title, '
            'authors, etc. This option disables the generation of this cover.')
        ),

        OptionRecommendation(name='no_svg_cover', recommended_value=False,
            help=_('Do not use SVG for the book cover. Use this option if '
                'your EPUB is going to be used on a device that does not '
                'support SVG, like the iPhone or the JetBook Lite. '
                'Without this option, such devices will display the cover '
                'as a blank page.')
        ),

        OptionRecommendation(name='preserve_cover_aspect_ratio',
            recommended_value=False, help=_(
                'When using an SVG cover, this option will cause the cover to scale '
                'to cover the available screen area, but still preserve its aspect ratio '
                '(ratio of width to height). That means there may be white borders '
                'at the sides or top and bottom of the image, but the image will '
                'never be distorted. Without this option the image may be slightly '
                'distorted, but there will be no borders.'
            )
        ),

        OptionRecommendation(name='epub_flatten', recommended_value=False,
            help=_('This option is needed only if you intend to use the EPUB'
                ' with FBReaderJ. It will flatten the file system inside the'
                ' EPUB, putting all files into the top level.')
        ),

        OptionRecommendation(name='epub_inline_toc', recommended_value=False,
            help=_('Insert an inline Table of Contents that will appear as part of the main book content.')
        ),

        OptionRecommendation(name='epub_toc_at_end', recommended_value=False,
            help=_('Put the inserted inline Table of Contents at the end of the book instead of the start.')
        ),

        OptionRecommendation(name='toc_title', recommended_value=None,
            help=_('Title for any generated in-line table of contents.')
        ),

        OptionRecommendation(name='epub_version', recommended_value='2', choices=ui_data['versions'],
            help=_('The version of the EPUB file to generate. EPUB 2 is the'
                ' most widely compatible, only use EPUB 3 if you know you'
                ' actually need it.')
        ),
    }

    # (option name, value, level) recommendation; condense_ncx() only strips
    # whitespace when pretty_print is off.
    recommendations = {('pretty_print', True, OptionRecommendation.HIGH)}
def workaround_webkit_quirks(self):  # {{{
    '''Rename every completely empty <pre> (no text, no children) found in
    the body of each spine document to <div>.'''
    from calibre.ebooks.oeb.base import XPath
    for spine_item in self.oeb.spine:
        bodies = XPath('//h:body')(spine_item.data)
        if not bodies:
            continue
        body = bodies[0]
        if not hasattr(body, 'xpath'):
            continue
        for pre in XPath('//h:pre')(body):
            has_content = bool(pre.text) or len(pre) != 0
            if not has_content:
                pre.tag = 'div'
# }}}
def upshift_markup(self):  # {{{
    '''Upgrade markup to comply with XHTML 1.1 where possible.

    For each spine document: copy a plain ``lang`` attribute on the root to
    ``xml:lang`` when the latter is missing, turn <u> into <span>, and drop
    repeated ``id``/``name`` attribute values so that each is used once.'''
    from calibre.ebooks.oeb.base import XPath, XML
    for spine_item in self.oeb.spine:
        root = spine_item.data
        plain_lang = root.get('lang')
        if plain_lang and not root.get(XML('lang')):
            root.set(XML('lang'), plain_lang)

        bodies = XPath('//h:body')(root)
        if not bodies or not hasattr(bodies[0], 'xpath'):
            continue

        for underline in XPath('//h:u')(root):
            underline.tag = 'span'

        # First occurrence of a value wins; later duplicates lose the attribute
        used_values = {'id': set(), 'name': set()}
        for node in XPath('//*[@id or @name]')(root):
            for attr_name in ('id', 'name'):
                value = node.get(attr_name, None)
                if not value:
                    continue
                if value in used_values[attr_name]:
                    del node.attrib[attr_name]
                else:
                    used_values[attr_name].add(value)
# }}}
def convert(self, oeb, output_path, input_plugin, opts, log):
    '''Write the book ``oeb`` as an EPUB file at ``output_path``.

    Applies the optional inline TOC, file name flattening/uniquifying, the
    reader workarounds, image rescaling, splitting and cover handling,
    makes sure the book has a TOC and a UUID identifier, then serializes it
    through the 'oeb' output plugin into a temporary directory and packs
    that directory into the EPUB container.

    :param oeb: The book to serialize; modified in place.
    :param output_path: Path of the EPUB file to create.
    :param input_plugin: The input plugin; its ``encrypted_fonts``
        attribute, if present, lists the fonts to obfuscate.
    :param opts: Conversion options.
    :param log: Logger.
    '''
    self.log, self.opts, self.oeb = log, opts, oeb

    if self.opts.epub_inline_toc:
        from calibre.ebooks.mobi.writer8.toc import TOCAdder
        # TOCAdder is driven by the mobi_* option names
        opts.mobi_toc_at_start = not opts.epub_toc_at_end
        opts.mobi_passthrough = False
        opts.no_inline_toc = False
        TOCAdder(oeb, opts, replace_previous_inline_toc=True, ignore_existing_toc=True)

    if self.opts.epub_flatten:
        from calibre.ebooks.oeb.transforms.filenames import FlatFilenames
        FlatFilenames()(oeb, opts)
    else:
        from calibre.ebooks.oeb.transforms.filenames import UniqueFilenames
        UniqueFilenames()(oeb, opts)

    self.workaround_ade_quirks()
    self.workaround_webkit_quirks()
    self.upshift_markup()
    from calibre.ebooks.oeb.transforms.rescale import RescaleImages
    RescaleImages(check_colorspaces=True)(oeb, opts)

    from calibre.ebooks.oeb.transforms.split import Split
    split = Split(not self.opts.dont_split_on_page_breaks,
            max_flow_size=self.opts.flow_size*1024
            )
    split(self.oeb, self.opts)

    from calibre.ebooks.oeb.transforms.cover import CoverManager
    cm = CoverManager(
            no_default_cover=self.opts.no_default_epub_cover,
            no_svg_cover=self.opts.no_svg_cover,
            preserve_aspect_ratio=self.opts.preserve_cover_aspect_ratio)
    cm(self.oeb, self.opts, self.log)

    self.workaround_sony_quirks()

    if self.oeb.toc.count() == 0:
        self.log.warn('This EPUB file has no Table of Contents. '
                'Creating a default TOC')
        first = next(iter(self.oeb.spine))
        self.oeb.toc.add(_('Start'), first.href)

    from calibre.ebooks.oeb.base import OPF
    identifiers = oeb.metadata['identifier']
    uuid = None
    for x in identifiers:
        # An identifier may have no scheme attribute at all; treat that as
        # an empty scheme instead of calling .lower() on None.
        scheme = x.get(OPF('scheme'), None) or ''
        if scheme.lower() == 'uuid' or unicode_type(x).startswith('urn:uuid:'):
            uuid = unicode_type(x).split(':')[-1]
            break
    encrypted_fonts = getattr(input_plugin, 'encrypted_fonts', [])

    if uuid is None:
        self.log.warn('No UUID identifier found')
        from uuid import uuid4
        uuid = unicode_type(uuid4())
        oeb.metadata.add('identifier', uuid, scheme='uuid', id=uuid)

    if encrypted_fonts and not uuid.startswith('urn:uuid:'):
        # Apparently ADE requires this value to start with urn:uuid:
        # for some absurd reason, or it will throw a hissy fit and refuse
        # to use the obfuscated fonts.
        for x in identifiers:
            if unicode_type(x) == uuid:
                x.content = 'urn:uuid:'+uuid

    with TemporaryDirectory('_epub_output') as tdir:
        from calibre.customize.ui import plugin_for_output_format
        metadata_xml = None
        extra_entries = []
        if self.is_periodical:
            if self.opts.output_profile.epub_periodical_format == 'sony':
                from calibre.ebooks.epub.periodical import sony_metadata
                metadata_xml, atom_xml = sony_metadata(oeb)
                extra_entries = [('atom.xml', 'application/atom+xml', atom_xml)]
        oeb_output = plugin_for_output_format('oeb')
        oeb_output.convert(oeb, tdir, input_plugin, opts, log)
        # The oeb plugin is expected to have written exactly one .opf and
        # at least one .ncx into tdir; the first of each is used.
        opf = [x for x in os.listdir(tdir) if x.endswith('.opf')][0]
        self.condense_ncx([os.path.join(tdir, x) for x in os.listdir(tdir)
                if x.endswith('.ncx')][0])
        if self.opts.epub_version == '3':
            self.upgrade_to_epub3(tdir, opf)
        encryption = None
        if encrypted_fonts:
            encryption = self.encrypt_fonts(encrypted_fonts, tdir, uuid)

        from calibre.ebooks.epub import initialize_container
        with initialize_container(output_path, os.path.basename(opf),
                extra_entries=extra_entries) as epub:
            epub.add_dir(tdir)
            if encryption is not None:
                epub.writestr('META-INF/encryption.xml', as_bytes(encryption))
            if metadata_xml is not None:
                epub.writestr('META-INF/metadata.xml',
                        metadata_xml.encode('utf-8'))
        if opts.extract_to is not None:
            from calibre.utils.zipfile import ZipFile
            # Whatever currently exists at extract_to is deleted first
            if os.path.exists(opts.extract_to):
                if os.path.isdir(opts.extract_to):
                    shutil.rmtree(opts.extract_to)
                else:
                    os.remove(opts.extract_to)
            os.mkdir(opts.extract_to)
            with ZipFile(output_path) as zf:
                zf.extractall(path=opts.extract_to)
            self.log.info('EPUB extracted to', opts.extract_to)
def upgrade_to_epub3(self, tdir, opf):
    '''Upgrade the unpacked book in ``tdir`` from EPUB 2 to EPUB 3, in place.

    A temporary META-INF/container.xml pointing at ``opf`` is written so the
    directory can be opened as an EpubContainer; it is removed again
    afterwards.

    :param tdir: Directory holding the serialized book.
    :param opf: Name of the OPF file; only its base name is used.
    '''
    self.log.info('Upgrading to EPUB 3...')
    from calibre.ebooks.epub import simple_container_xml
    from calibre.ebooks.oeb.polish.cover import fix_conversion_titlepage_links_in_nav
    from calibre.ebooks.oeb.polish.container import EpubContainer
    from calibre.ebooks.oeb.polish.upgrade import epub_2_to_3

    meta_inf = os.path.join(tdir, 'META-INF')
    try:
        os.mkdir(meta_inf)
    except EnvironmentError:
        # Best effort: the directory may already exist
        pass
    container_xml = os.path.join(meta_inf, 'container.xml')
    with open(container_xml, 'wb') as f:
        f.write(simple_container_xml(os.path.basename(opf)).encode('utf-8'))

    container = EpubContainer(tdir, self.log)
    existing_nav = getattr(self.opts, 'epub3_nav_parsed', None)
    nav_href = getattr(self.opts, 'epub3_nav_href', None)
    # existing_nav is a parsed element tree node: compare against None
    # rather than truth-testing it, since the truth value of an lxml
    # element reflects whether it has children.
    if existing_nav is not None and nav_href:
        previous_nav = (nav_href, existing_nav)
    else:
        previous_nav = None
    epub_2_to_3(container, self.log.info, previous_nav=previous_nav)
    fix_conversion_titlepage_links_in_nav(container)
    container.commit()

    os.remove(container_xml)
    try:
        os.rmdir(meta_inf)
    except EnvironmentError:
        # Best effort: leave the directory if it is not empty
        pass
def encrypt_fonts(self, uris, tdir, uuid):  # {{{
    '''Obfuscate the given fonts in place and describe them in encryption.xml.

    The first 1024 bytes of each font are XORed with a 16 byte key derived
    from the hexadecimal digits of ``uuid``.

    :param uris: '/'-separated font paths, relative to ``tdir``.
    :param tdir: Directory containing the serialized book.
    :param uuid: The book's UUID identifier.
    :return: The encryption.xml document as a string, or None when none of
        the fonts exists.
    :raises ValueError: when ``uuid`` has fewer than 16 hexadecimal digits.
    '''
    from polyglot.binary import from_hex_bytes
    from xml.sax.saxutils import escape

    key = re.sub(r'[^a-fA-F0-9]', '', uuid)
    if len(key) < 16:
        raise ValueError('UUID identifier %r is invalid'%uuid)
    key = bytearray(from_hex_bytes((key + key)[:32]))

    # Layout whitespace inside these templates is not significant to XML
    entry_template = (
        '\n'
        '<enc:EncryptedData>\n'
        '<enc:EncryptionMethod Algorithm="http://ns.adobe.com/pdf/enc#RC"/>\n'
        '<enc:CipherData>\n'
        '<enc:CipherReference URI="%s"/>\n'
        '</enc:CipherData>\n'
        '</enc:EncryptedData>\n'
    )
    header = (
        '<encryption\n'
        'xmlns="urn:oasis:names:tc:opendocument:xmlns:container"\n'
        'xmlns:enc="http://www.w3.org/2001/04/xmlenc#"\n'
        'xmlns:deenc="http://ns.adobe.com/digitaleditions/enc">\n'
    )

    fonts = []
    with CurrentDir(tdir):
        for uri in uris:
            path = os.path.join(*uri.split('/'))
            if not os.path.exists(path):
                continue
            self.log.debug('Encrypting font:', uri)
            with lopen(path, 'r+b') as f:
                data = f.read(1024)
                if len(data) >= 1024:
                    data = bytearray(data)
                    f.seek(0)
                    f.write(bytes(bytearray(data[i] ^ key[i%16] for i in range(1024))))
                else:
                    # NOTE(review): the font is left untouched here but is
                    # still listed as encrypted below - confirm intended.
                    self.log.warn('Font', path, 'is invalid, ignoring')
            if not isinstance(uri, unicode_type):
                uri = uri.decode('utf-8')
            # The URI originates from the input book: escape it as an XML
            # attribute value (&, <, > and the double quote). A backslash
            # before the quote is not an XML escape.
            fonts.append(entry_template % escape(uri, {'"': '&quot;'}))

    if not fonts:
        return None
    return header + '\n'.join(fonts) + '\n</encryption>'
# }}}
def condense_ncx(self, ncx_path):  # {{{
    '''Rewrite the NCX at ``ncx_path`` with the text and tail of every
    element stripped of surrounding whitespace. Does nothing when the
    pretty_print option is on.'''
    from lxml import etree
    if self.opts.pretty_print:
        return
    ncx_root = etree.parse(ncx_path).getroot()
    for node in ncx_root.iter(tag=etree.Element):
        for slot in ('text', 'tail'):
            content = getattr(node, slot)
            if content:
                setattr(node, slot, content.strip())
    # Serialize before opening the file so a failure cannot truncate it
    condensed = etree.tostring(ncx_root, encoding='utf-8')
    with open(ncx_path, 'wb') as out:
        out.write(condensed)
# }}}
def workaround_ade_quirks(self):  # {{{
    '''
    Perform various markup transforms to get the output to render correctly
    in the quirky ADE.

    Works in place on self.oeb: TOC hrefs, the parsed tree of every spine
    item and, when present, the main stylesheet of the manifest.
    '''
    from calibre.ebooks.oeb.base import XPath, XHTML, barename, urlunquote

    stylesheet = self.oeb.manifest.main_stylesheet

    # ADE cries big wet tears when it encounters an invalid fragment
    # identifier in the NCX toc.
    frag_pat = re.compile(r'[-A-Za-z0-9_:.]+$')
    for node in self.oeb.toc.iter():
        href = getattr(node, 'href', None)
        if hasattr(href, 'partition'):
            # NOTE(review): `_` is rebound as a local here, shadowing the
            # translation function for the rest of this method (it is not
            # called again below).
            base, _, frag = href.partition('#')
            frag = urlunquote(frag)
            if frag and frag_pat.match(frag) is None:
                self.log.warn(
                    'Removing fragment identifier %r from TOC as Adobe Digital Editions cannot handle it'%frag)
                node.href = base

    for x in self.oeb.spine:
        root = x.data
        body = XPath('//h:body')(root)
        if body:
            body = body[0]

        # body is an element only if one was found; otherwise the
        # body-specific fixes are skipped but the root-level ones still run
        if hasattr(body, 'xpath'):
            # remove <img> tags with empty src elements
            bad = []
            for x in XPath('//h:img')(body):
                src = x.get('src', '').strip()
                if src in ('', '#') or src.startswith('http:'):
                    bad.append(x)
            for img in bad:
                img.getparent().remove(img)

            # Add id attribute to <a> tags that have name
            for x in XPath('//h:a[@name]')(body):
                if not x.get('id', False):
                    x.set('id', x.get('name'))
                # The delightful epubcheck has started complaining about <a> tags that
                # have name attributes.
                x.attrib.pop('name')

            # Replace <br> that are children of <body> as ADE doesn't handle them
            for br in XPath('./h:br')(body):
                if br.getparent() is None:
                    continue
                try:
                    prior = next(br.itersiblings(preceding=True))
                    priortag = barename(prior.tag)
                    priortext = prior.tail
                # No preceding sibling: the text before the <br> is the
                # body's own text.
                # NOTE(review): bare except also catches KeyboardInterrupt
                # and SystemExit.
                except:
                    priortag = 'body'
                    priortext = body.text
                if priortext:
                    priortext = priortext.strip()
                br.tag = XHTML('p')
                br.text = '\u00a0'
                style = br.get('style', '').split(';')
                style = list(filter(None, map(lambda x: x.strip(), style)))
                style.append('margin:0pt; border:0pt')
                # If the prior tag is a block (including a <br> we replaced)
                # then this <br> replacement should have a 1-line height.
                # Otherwise it should have no height.
                if not priortext and priortag in block_level_tags:
                    style.append('height:1em')
                else:
                    style.append('height:0pt')
                br.set('style', '; '.join(style))

        for tag in XPath('//h:embed')(root):
            tag.getparent().remove(tag)
        # <object> is removed unless it embeds SVG
        for tag in XPath('//h:object')(root):
            if tag.get('type', '').lower().strip() in {'image/svg+xml', 'application/svg+xml'}:
                continue
            tag.getparent().remove(tag)

        # Drop empty <title>/<style> elements
        for tag in XPath('//h:title|//h:style')(root):
            if not tag.text:
                tag.getparent().remove(tag)
        # Drop <script> elements that have neither content nor src, except
        # MathJax configuration blocks
        for tag in XPath('//h:script')(root):
            if (not tag.text and not tag.get('src', False) and tag.get('type', None) != 'text/x-mathjax-config'):
                tag.getparent().remove(tag)
        # Any <script> inside <body> is removed unconditionally
        for tag in XPath('//h:body/descendant::h:script')(root):
            tag.getparent().remove(tag)

        # A <form> with real form controls as direct children is removed;
        # any other <form> becomes a <div>
        formchildren = XPath('./h:input|./h:button|./h:textarea|'
                './h:label|./h:fieldset|./h:legend')
        for tag in XPath('//h:form')(root):
            if formchildren(tag):
                tag.getparent().remove(tag)
            else:
                # Not a real form
                tag.tag = XHTML('div')

        for tag in XPath('//h:center')(root):
            tag.tag = XHTML('div')
            tag.set('style', 'text-align:center')
        # ADE can't handle &amp; in an img url
        for tag in XPath('//h:img[@src]')(root):
            tag.set('src', tag.get('src', '').replace('&', ''))

        # ADE whimpers in fright when it encounters a <td> outside a
        # <table>
        in_table = XPath('ancestor::h:table')
        for tag in XPath('//h:td|//h:tr|//h:th')(root):
            if not in_table(tag):
                tag.tag = XHTML('div')

        # ADE fails to render non breaking hyphens/soft hyphens/zero width spaces
        # (zero width space and soft hyphen are deleted, U+2011 becomes '-')
        special_chars = re.compile('[\u200b\u00ad]')
        for elem in root.iterdescendants('*'):
            if elem.text:
                elem.text = special_chars.sub('', elem.text)
                elem.text = elem.text.replace('\u2011', '-')
            if elem.tail:
                elem.tail = special_chars.sub('', elem.tail)
                elem.tail = elem.tail.replace('\u2011', '-')

        if stylesheet is not None:
            # ADE doesn't render lists correctly if they have left margins
            from css_parser.css import CSSRule
            for lb in XPath('//h:ul[@class]|//h:ol[@class]')(root):
                # Only rules whose whole selector text is exactly
                # ".<class attribute>" are touched
                sel = '.'+lb.get('class')
                for rule in stylesheet.data.cssRules.rulesOfType(CSSRule.STYLE_RULE):
                    if sel == rule.selectorList.selectorText:
                        rule.style.removeProperty('margin-left')
                        # padding-left breaks rendering in webkit and gecko
                        rule.style.removeProperty('padding-left')
            # Change whitespace:pre to pre-wrap to accommodate readers that
            # cannot scroll horizontally
            for rule in stylesheet.data.cssRules.rulesOfType(CSSRule.STYLE_RULE):
                style = rule.style
                ws = style.getPropertyValue('white-space')
                if ws == 'pre':
                    style.setProperty('white-space', 'pre-wrap')
# }}}
def workaround_sony_quirks(self):  # {{{
    '''
    Perform toc link transforms to alleviate slow loading: a TOC href loses
    its fragment when the element it names sits at the top of its file.
    '''
    from calibre.ebooks.oeb.base import urldefrag, XPath
    from calibre.ebooks.oeb.polish.toc import item_at_top

    def anchor_is_at_top(root, frag):
        hits = XPath('//*[@id="%s" or @name="%s"]'%(frag, frag))(root)
        if not hits:
            return False
        return item_at_top(hits[0])

    def drop_redundant_anchor(entry):
        if not entry.href:
            return
        href, frag = urldefrag(entry.href)
        if not frag:
            return
        # Only the first spine item with a matching href is considered
        for spine_item in self.oeb.spine:
            if spine_item.href != href:
                continue
            if anchor_is_at_top(spine_item.data, frag):
                self.log.debug('Removing anchor from TOC href:',
                        href+'#'+frag)
                entry.href = href
            break

    def simplify_toc_entry(toc):
        drop_redundant_anchor(toc)
        for child in toc:
            simplify_toc_entry(child)

    if self.oeb.toc:
        simplify_toc_entry(self.oeb.toc)
# }}}

View File

@@ -0,0 +1,179 @@
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2008, Anatoly Shipitsin <norguhtar at gmail.com>'
"""
Convert .fb2 and .fbz files to HTML
"""
import os, re
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
from calibre import guess_type
from polyglot.builtins import iteritems, getcwd
# Namespace URIs for FictionBook 2.0 and 2.1 documents. FB2NS is the
# fallback used by FB2Input.convert when the namespace of the parsed
# document cannot be determined.
FB2NS = 'http://www.gribuser.ru/xml/fictionbook/2.0'
FB21NS = 'http://www.gribuser.ru/xml/fictionbook/2.1'
class FB2Input(InputFormatPlugin):
    # Input plugin for 'fb2' and 'fbz' files. The attributes below are
    # declarative: plugin metadata, default recommendations and options.

    name = 'FB2 Input'
    author = 'Anatoly Shipitsin'
    description = 'Convert FB2 and FBZ files to HTML'
    file_types = {'fb2', 'fbz'}
    commit_name = 'fb2_input'

    # (option name, value, level): XPath expressions selecting the headings
    # used for the three TOC levels
    recommendations = {
        ('level1_toc', '//h:h1', OptionRecommendation.MED),
        ('level2_toc', '//h:h2', OptionRecommendation.MED),
        ('level3_toc', '//h:h3', OptionRecommendation.MED),
    }

    # When no_inline_fb2_toc is set, convert() strips the TOC-building
    # section out of the XSL template before transforming
    options = {
    OptionRecommendation(name='no_inline_fb2_toc',
        recommended_value=False, level=OptionRecommendation.LOW,
        help=_('Do not insert a Table of Contents at the beginning of the book.'
        )
    )}
def convert(self, stream, options, file_ext, log,
            accelerators):
    '''Convert the FB2 document in ``stream`` to XHTML plus an OPF.

    Writes ``index.xhtml``, ``inline-styles.css``, the embedded binaries and
    ``metadata.opf`` into the current directory.

    :param stream: Seekable file-like object with the FB2/FBZ data.
    :param options: Conversion options; ``no_inline_fb2_toc`` is read.
    :param log: Logger; also stored as ``self.log``.
    :return: Path of the ``metadata.opf`` that was written.
    :raises ValueError: when the document does not parse to an XML tree.
    '''
    from lxml import etree
    from calibre.utils.xml_parse import safe_xml_fromstring
    from calibre.ebooks.metadata.fb2 import ensure_namespace, get_fb2_data
    from calibre.ebooks.metadata.opf2 import OPFCreator
    from calibre.ebooks.metadata.meta import get_metadata
    from calibre.ebooks.oeb.base import XLINK_NS, XHTML_NS
    from calibre.ebooks.chardet import xml_to_unicode
    self.log = log
    log.debug('Parsing XML...')
    raw = get_fb2_data(stream)[0]
    raw = raw.replace(b'\0', b'')
    raw = xml_to_unicode(raw, strip_encoding_pats=True,
        assume_utf8=True, resolve_entities=True)[0]
    try:
        doc = safe_xml_fromstring(raw)
    except etree.XMLSyntaxError:
        # Retry with bare ampersands escaped. The space following the
        # ampersand is kept so that "a & b" does not collapse to "a &b".
        doc = safe_xml_fromstring(raw.replace('& ', '&amp; '))
    if doc is None:
        raise ValueError('The FB2 file is not valid XML')
    doc = ensure_namespace(doc)
    try:
        fb_ns = doc.nsmap[doc.prefix]
    except Exception:
        fb_ns = FB2NS

    NAMESPACES = {'f':fb_ns, 'l':XLINK_NS}
    stylesheets = doc.xpath('//*[local-name() = "stylesheet" and @type="text/css"]')
    css = ''.join(
        etree.tostring(s, encoding='unicode', method='text', with_tail=False) + '\n\n'
        for s in stylesheets)
    if css:
        import css_parser, logging
        parser = css_parser.CSSParser(fetcher=None,
                log=logging.getLogger('calibre.css'))

        XHTML_CSS_NAMESPACE = '@namespace "%s";\n' % XHTML_NS
        text = XHTML_CSS_NAMESPACE + css
        log.debug('Parsing stylesheet...')
        stylesheet = parser.parseString(text)
        stylesheet.namespaces['h'] = XHTML_NS
        css = stylesheet.cssText
        if isinstance(css, bytes):
            css = css.decode('utf-8', 'replace')
        # Rewrite selectors: style elements become spans, name attributes
        # become class attributes
        css = css.replace('h|style', 'h|span')
        css = re.sub(r'name\s*=\s*', 'class=', css)
    self.extract_embedded_content(doc)
    log.debug('Converting XML to HTML...')
    with open(P('templates/fb2.xsl'), 'rb') as f:
        ss = f.read().decode('utf-8')
    ss = ss.replace("__FB_NS__", fb_ns)
    if options.no_inline_fb2_toc:
        log('Disabling generation of inline FB2 TOC')
        ss = re.compile(r'<!-- BUILD TOC -->.*<!-- END BUILD TOC -->',
                re.DOTALL).sub('', ss)

    styledoc = safe_xml_fromstring(ss)

    transform = etree.XSLT(styledoc)
    result = transform(doc)

    # Handle links of type note and cite
    notes = {a.get('href')[1:]: a for a in result.xpath('//a[@link_note and @href]') if a.get('href').startswith('#')}
    cites = {a.get('link_cite'): a for a in result.xpath('//a[@link_cite]') if not a.get('href', '')}
    all_ids = {x for x in result.xpath('//*/@id')}
    for cite, a in iteritems(cites):
        note = notes.get(cite, None)
        # Compare against None: the truth value of an lxml element depends
        # on whether it has children, not on whether it exists.
        if note is not None:
            c = 1
            while 'cite%d' % c in all_ids:
                c += 1
            if not note.get('id', None):
                note.set('id', 'cite%d' % c)
                all_ids.add(note.get('id'))
            a.set('href', '#%s' % note.get('id'))
    for x in result.xpath('//*[@link_note or @link_cite]'):
        x.attrib.pop('link_note', None)
        x.attrib.pop('link_cite', None)

    # Point images at the file names the embedded binaries were saved under
    for img in result.xpath('//img[@src]'):
        src = img.get('src')
        img.set('src', self.binary_map.get(src, src))
    index = transform.tostring(result)
    with open('index.xhtml', 'wb') as f:
        f.write(index.encode('utf-8'))
    with open('inline-styles.css', 'wb') as f:
        f.write(css.encode('utf-8'))
    stream.seek(0)
    mi = get_metadata(stream, 'fb2')
    if not mi.title:
        mi.title = _('Unknown')
    if not mi.authors:
        mi.authors = [_('Unknown')]
    cpath = None
    if mi.cover_data and mi.cover_data[1]:
        with open('fb2_cover_calibre_mi.jpg', 'wb') as f:
            f.write(mi.cover_data[1])
        cpath = os.path.abspath('fb2_cover_calibre_mi.jpg')
    else:
        for img in doc.xpath('//f:coverpage/f:image', namespaces=NAMESPACES):
            href = img.get('{%s}href'%XLINK_NS, img.get('href', None))
            if href is not None:
                if href.startswith('#'):
                    href = href[1:]
                # The binary may have been saved under a different name
                # (extension appended); use the same mapping as for <img>.
                cpath = os.path.abspath(self.binary_map.get(href, href))
                break

    opf = OPFCreator(getcwd(), mi)
    entries = [(f2, guess_type(f2)[0]) for f2 in os.listdir(u'.')]
    opf.create_manifest(entries)
    opf.create_spine(['index.xhtml'])
    if cpath:
        opf.guide.set_cover(cpath)
    with open('metadata.opf', 'wb') as f:
        opf.render(f)
    return os.path.join(getcwd(), 'metadata.opf')
def extract_embedded_content(self, doc):
    '''Save the base64 encoded <binary> children of ``doc`` as files in the
    current directory.

    ``self.binary_map`` is reset and filled with a mapping from each binary's
    id attribute to the file name used for it. PNG/JPEG binaries whose id
    has no image extension get one appended. Binaries that fail to decode
    are logged and skipped.'''
    from calibre.ebooks.fb2 import base64_decode
    self.binary_map = {}
    image_exts = {'jpg', 'jpeg', 'png'}
    for elem in doc.xpath('./*'):
        if not (elem.text and 'binary' in elem.tag and 'id' in elem.attrib):
            continue
        binary_id = elem.attrib['id']
        # The id comes from the document and is used as a file name: keep
        # only its last path component so a crafted id such as
        # "../../x" cannot write outside the working directory.
        fname = os.path.basename(binary_id.replace('\\', '/'))
        if fname in ('', '.', '..'):
            self.log.warn('Ignoring binary data with unusable id=%s' % binary_id)
            continue
        ext = elem.get('content-type', '').rpartition('/')[-1].lower()
        if ext in ('png', 'jpeg', 'jpg') and \
                fname.lower().rpartition('.')[-1] not in image_exts:
            fname += '.' + ext
        self.binary_map[binary_id] = fname
        raw = elem.text.strip()
        try:
            data = base64_decode(raw)
        except TypeError:
            self.log.exception('Binary data with id=%s is corrupted, ignoring'%(
                binary_id))
        else:
            with open(fname, 'wb') as f:
                f.write(data)

View File

@@ -0,0 +1,203 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import os
from calibre.customize.conversion import OutputFormatPlugin, OptionRecommendation
# Output plugin that writes a book as a FictionBook 2 (.fb2) file.
class FB2Output(OutputFormatPlugin):
name = 'FB2 Output'
author = 'John Schember'
file_type = 'fb2'
commit_name = 'fb2_output'
# Genre identifiers offered as choices for the ``fb2_genre`` option below.
# The trailing comment on each entry is its human readable description and
# the section comments group the identifiers by category.
FB2_GENRES = [
# Science Fiction & Fantasy
'sf_history', # Alternative history
'sf_action', # Action
'sf_epic', # Epic
'sf_heroic', # Heroic
'sf_detective', # Detective
'sf_cyberpunk', # Cyberpunk
'sf_space', # Space
'sf_social', # Social-philosophical
'sf_horror', # Horror & mystic
'sf_humor', # Humor
'sf_fantasy', # Fantasy
'sf', # Science Fiction
# Detectives & Thrillers
'det_classic', # Classical detectives
'det_police', # Police Stories
'det_action', # Action
'det_irony', # Ironical detectives
'det_history', # Historical detectives
'det_espionage', # Espionage detectives
'det_crime', # Crime detectives
'det_political', # Political detectives
'det_maniac', # Maniacs
'det_hard', # Hard-boiled
'thriller', # Thrillers
'detective', # Detectives
# Prose
'prose_classic', # Classics prose
'prose_history', # Historical prose
'prose_contemporary', # Contemporary prose
'prose_counter', # Counterculture
'prose_rus_classic', # Russian classics prose
'prose_su_classics', # Soviet classics prose
# Romance
'love_contemporary', # Contemporary Romance
'love_history', # Historical Romance
'love_detective', # Detective Romance
'love_short', # Short Romance
'love_erotica', # Erotica
# Adventure
'adv_western', # Western
'adv_history', # History
'adv_indian', # Indians
'adv_maritime', # Maritime Fiction
'adv_geo', # Travel & geography
'adv_animal', # Nature & animals
'adventure', # Other
# Children's
'child_tale', # Fairy Tales
'child_verse', # Verses
'child_prose', # Prose
'child_sf', # Science Fiction
'child_det', # Detectives & Thrillers
'child_adv', # Adventures
'child_education', # Educational
'children', # Other
# Poetry & Dramaturgy
'poetry', # Poetry
'dramaturgy', # Dramaturgy
# Antique literature
'antique_ant', # Antique
'antique_european', # European
'antique_russian', # Old Russian
'antique_east', # Old east
'antique_myths', # Myths. Legends. Epos
'antique', # Other
# Scientific-educational
'sci_history', # History
'sci_psychology', # Psychology
'sci_culture', # Cultural science
'sci_religion', # Religious studies
'sci_philosophy', # Philosophy
'sci_politics', # Politics
'sci_business', # Business literature
'sci_juris', # Jurisprudence
'sci_linguistic', # Linguistics
'sci_medicine', # Medicine
'sci_phys', # Physics
'sci_math', # Mathematics
'sci_chem', # Chemistry
'sci_biology', # Biology
'sci_tech', # Technical
'science', # Other
# Computers & Internet
'comp_www', # Internet
'comp_programming', # Programming
'comp_hard', # Hardware
'comp_soft', # Software
'comp_db', # Databases
'comp_osnet', # OS & Networking
'computers', # Other
# Reference
'ref_encyc', # Encyclopedias
'ref_dict', # Dictionaries
'ref_ref', # Reference
'ref_guide', # Guidebooks
'reference', # Other
# Nonfiction
'nonf_biography', # Biography & Memoirs
'nonf_publicism', # Publicism
'nonf_criticism', # Criticism
'design', # Art & design
'nonfiction', # Other
# Religion & Inspiration
'religion_rel', # Religion
'religion_esoterics', # Esoterics
'religion_self', # Self-improvement
'religion', # Other
# Humor
'humor_anecdote', # Anecdote (funny stories)
'humor_prose', # Prose
'humor_verse', # Verses
'humor', # Other
# Home & Family
'home_cooking', # Cooking
'home_pets', # Pets
'home_crafts', # Hobbies & Crafts
'home_entertain', # Entertaining
'home_health', # Health
'home_garden', # Garden
'home_diy', # Do it yourself
'home_sport', # Sports
'home_sex', # Erotica & sex
'home', # Other
]
# Data shared with the option definitions below: translated descriptions of
# the ``sectionize`` choices plus the genre list.
ui_data = {
'sectionize': {
'toc': _('Section per entry in the ToC'),
'files': _('Section per file'),
'nothing': _('A single section')
},
'genres': FB2_GENRES,
}
# Conversion options contributed by this plugin.  The choices and help text
# of ``sectionize`` are derived from ``ui_data`` so both stay in sync.
options = {
OptionRecommendation(name='sectionize',
recommended_value='files', level=OptionRecommendation.LOW,
choices=list(ui_data['sectionize']),
help=_('Specify how sections are created:\n'
' * nothing: {nothing}\n'
' * files: {files}\n'
' * toc: {toc}\n'
'If ToC based generation fails, adjust the "Structure detection" and/or "Table of Contents" settings '
'(turn on "Force use of auto-generated Table of Contents").').format(**ui_data['sectionize'])
),
OptionRecommendation(name='fb2_genre',
recommended_value='antique', level=OptionRecommendation.LOW,
choices=FB2_GENRES,
help=(_('Genre for the book. Choices: %s\n\n See: ') % ', '.join(FB2_GENRES)
) + 'http://www.fictionbook.org/index.php/Eng:FictionBook_2.1_genres ' + _('for a complete list with descriptions.')),
}
def convert(self, oeb_book, output_path, input_plugin, opts, log):
    """Serialise *oeb_book* as FB2 and write it to *output_path*.

    :param oeb_book: the book to convert; SVG content is rasterized when the
        rasterizer is available and the jacket is linearized before the FB2
        markup is generated.
    :param output_path: either a file system path or an object with a
        ``write`` method.  A path is opened (its parent directory is created
        when missing) and closed here; a stream is rewound, truncated and
        left open for the caller.
    :param input_plugin: unused by this method.
    :param opts: conversion options, handed to the transforms.
    :param log: logger used for warnings.
    """
    from calibre.ebooks.oeb.transforms.jacket import linearize_jacket
    from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer, Unavailable
    from calibre.ebooks.fb2.fb2ml import FB2MLizer

    try:
        rasterizer = SVGRasterizer()
        rasterizer(oeb_book, opts)
    except Unavailable:
        log.warn('SVG rasterizer unavailable, SVG will not be converted')

    linearize_jacket(oeb_book)

    fb2mlizer = FB2MLizer(log)
    fb2_content = fb2mlizer.extract_content(oeb_book, opts)

    # Only streams opened here are closed here.
    close = not hasattr(output_path, 'write')
    if close:
        out_dir = os.path.dirname(output_path)
        if out_dir != '' and not os.path.exists(out_dir):
            os.makedirs(out_dir)
        out_stream = lopen(output_path, 'wb')
    else:
        out_stream = output_path

    try:
        out_stream.seek(0)
        out_stream.truncate()
        out_stream.write(fb2_content.encode('utf-8', 'replace'))
    finally:
        # Release the file handle even when writing fails.
        if close:
            out_stream.close()

View File

@@ -0,0 +1,316 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import re, tempfile, os
from functools import partial
from calibre.constants import islinux, isbsd
from calibre.customize.conversion import (InputFormatPlugin,
OptionRecommendation)
from calibre.utils.localization import get_lang
from calibre.utils.filenames import ascii_filename
from calibre.utils.imghdr import what
from polyglot.builtins import unicode_type, zip, getcwd, as_unicode
def sanitize_file_name(x):
    """Return a cleaned-up version of the file name *x*.

    The name is passed through ``ascii_filename``, the URL-significant
    characters ``? & = ; #`` are replaced by ``_``, whitespace runs collapse
    to a single space, and surrounding whitespace and trailing dots are
    removed from both the stem and the extension.
    """
    cleaned = re.sub(r'[?&=;#]', '_', ascii_filename(x))
    cleaned = re.sub(r'\s+', ' ', cleaned).strip().rstrip('.')
    stem, dot, ext = cleaned.rpartition('.')
    if not dot:
        # Without a dot, rpartition() returns the whole name in the last
        # slot; re-joining it would give the name a spurious leading dot
        # ('README' -> '.README').
        return cleaned
    return (stem.strip() + '.' + ext.strip()).rstrip('.')
# Input plugin that turns an HTML file (plus the local files it links to) or
# an OPF file into an OEB book.
class HTMLInput(InputFormatPlugin):
name = 'HTML Input'
author = 'Kovid Goyal'
description = 'Convert HTML and OPF files to an OEB'
# File extensions handled by this plugin.
file_types = {'opf', 'html', 'htm', 'xhtml', 'xhtm', 'shtm', 'shtml'}
commit_name = 'html_input'
# Conversion options contributed by this plugin.
options = {
OptionRecommendation(name='breadth_first',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Traverse links in HTML files breadth first. Normally, '
'they are traversed depth first.'
)
),
OptionRecommendation(name='max_levels',
recommended_value=5, level=OptionRecommendation.LOW,
help=_('Maximum levels of recursion when following links in '
'HTML files. Must be non-negative. 0 implies that no '
'links in the root HTML file are followed. Default is '
'%default.'
)
),
# Rejected by convert() with a ValueError unless the input is an OPF file.
OptionRecommendation(name='dont_package',
recommended_value=False, level=OptionRecommendation.LOW,
help=_('Normally this input plugin re-arranges all the input '
'files into a standard folder hierarchy. Only use this option '
'if you know what you are doing as it can result in various '
'nasty side effects in the rest of the conversion pipeline.'
)
),
}
def convert(self, stream, opts, file_ext, log,
            accelerators):
    """Convert the HTML or OPF file behind *stream* into an OEB book.

    OPF input is handed directly to the plumber's ``create_oebbook``.  For
    HTML input the metadata read from the markup is merged on top of the
    metadata guessed from the file name (when the stream has a name) and the
    book is assembled by ``self.create_oebbook``.

    :raises ValueError: for HTML input when ``opts.dont_package`` is set.
    """
    # Reset the case-sensitivity answer cached by is_case_sensitive().
    self._is_case_sensitive = None
    self.opts = opts

    basedir, fname = getcwd(), None
    if hasattr(stream, 'name'):
        basedir = os.path.dirname(stream.name)
        fname = os.path.basename(stream.name)

    if file_ext == 'opf':
        from calibre.ebooks.conversion.plumber import create_oebbook
        return create_oebbook(log, stream.name, opts,
                              encoding=opts.input_encoding)

    if opts.dont_package:
        raise ValueError('The --dont-package option is not supported for an HTML input file')

    from calibre.ebooks.metadata.html import get_metadata
    mi = get_metadata(stream)
    if fname:
        from calibre.ebooks.metadata.meta import metadata_from_filename
        # Metadata found in the markup wins over what the name suggests.
        name_mi = metadata_from_filename(fname)
        name_mi.smart_update(mi)
        mi = name_mi
    return self.create_oebbook(stream.name, basedir, opts, log, mi)
def is_case_sensitive(self, path):
    """Report whether the file system holding *path* is case sensitive.

    The first answer obtained by probing an existing path is cached on the
    instance and returned for every later call.  For an empty or missing
    *path* nothing is cached and the platform default ``islinux or isbsd``
    is returned.
    """
    cached = getattr(self, '_is_case_sensitive', None)
    if cached is not None:
        return cached
    if not path or not os.path.exists(path):
        return islinux or isbsd
    # If both the lower- and upper-cased spellings exist, the file system is
    # treated as case insensitive.
    both_spellings_exist = os.path.exists(path.lower()) and os.path.exists(path.upper())
    self._is_case_sensitive = not both_spellings_exist
    return self._is_case_sensitive
def create_oebbook(self, htmlpath, basedir, opts, log, mi):
    """Assemble an OEB book from *htmlpath* and the local files it links to.

    Steps, in order: create an unpopulated book, fill in metadata from *mi*
    (with fallbacks for language, creator and title), add every non-binary
    file returned by ``get_filelist`` to manifest and spine, rewrite links in
    the HTML and CSS items through ``self.resource_adder`` and finally build
    a table of contents with one entry per linear spine item.

    :param htmlpath: path of the root HTML file.
    :param basedir: directory passed to ``get_filelist``.
    :param opts: conversion options.
    :param log: logger; also stored as ``self.log``.
    :param mi: metadata object copied into the book's metadata.
    :return: the populated OEB book (also stored as ``self.oeb``).
    """
    import uuid
    from calibre.ebooks.conversion.plumber import create_oebbook
    from calibre.ebooks.oeb.base import (DirContainer,
        rewrite_links, urlnormalize, urldefrag, BINARY_MIME, OEB_STYLES,
        xpath, urlquote)
    from calibre import guess_type
    from calibre.ebooks.oeb.transforms.metadata import \
        meta_info_to_oeb_metadata
    from calibre.ebooks.html.input import get_filelist
    from calibre.ebooks.metadata import string_to_authors
    from calibre.utils.localization import canonicalize_lang
    import css_parser, logging
    css_parser.log.setLevel(logging.WARN)
    self.OEB_STYLES = OEB_STYLES
    oeb = create_oebbook(log, None, opts, self,
                         encoding=opts.input_encoding, populate=False)
    self.oeb = oeb

    # --- Metadata, with fallbacks for the mandatory fields ---------------
    metadata = oeb.metadata
    meta_info_to_oeb_metadata(mi, metadata, log)
    if not metadata.language:
        lang = canonicalize_lang(getattr(opts, 'language', None))
        if not lang:
            oeb.logger.warn('Language not specified')
            lang = get_lang().replace('_', '-')
        metadata.add('language', lang)
    if not metadata.creator:
        authors = getattr(opts, 'authors', None)
        if authors:
            authors = string_to_authors(authors)
        if not authors:
            oeb.logger.warn('Creator not specified')
            authors = [self.oeb.translate(__('Unknown'))]
        for aut in authors:
            metadata.add('creator', aut)
    if not metadata.title:
        oeb.logger.warn('Title not specified')
        metadata.add('title', self.oeb.translate(__('Unknown')))
    bookid = unicode_type(uuid.uuid4())
    metadata.add('identifier', bookid, id='uuid_id', scheme='uuid')
    for ident in metadata.identifier:
        if 'id' in ident.attrib:
            # NOTE(review): this stores the *first* identifier rather than
            # ``ident`` (the one that was found to carry an id attribute);
            # kept as is - confirm whether that is intended.
            self.oeb.uid = metadata.identifier[0]
            break

    # --- Manifest and spine entries for the HTML files --------------------
    filelist = get_filelist(htmlpath, basedir, opts, log)
    filelist = [f for f in filelist if not f.is_binary]
    htmlfile_map = {}
    for f in filelist:
        path = f.path
        oeb.container = DirContainer(os.path.dirname(path), log,
                                     ignore_opf=True)
        bname = os.path.basename(path)
        item_id, href = oeb.manifest.generate(id='html', href=sanitize_file_name(bname))
        htmlfile_map[path] = href
        item = oeb.manifest.add(item_id, href, 'text/html')
        if path == htmlpath and '%' in path:
            bname = urlquote(bname)
        item.html_input_href = bname
        oeb.spine.add(item, True)

    self.added_resources = {}
    self.log = log
    self.log('Normalizing filename cases')
    for path, href in htmlfile_map.items():
        if not self.is_case_sensitive(path):
            path = path.lower()
        self.added_resources[path] = href
    # Stash the lazily imported helpers for resource_adder().
    self.urlnormalize, self.DirContainer = urlnormalize, DirContainer
    self.urldefrag = urldefrag
    self.guess_type, self.BINARY_MIME = guess_type, BINARY_MIME

    # --- Link rewriting ----------------------------------------------------
    self.log('Rewriting HTML links')
    for f in filelist:
        path = f.path
        dpath = os.path.dirname(path)
        oeb.container = DirContainer(dpath, log, ignore_opf=True)
        href = htmlfile_map[path]
        try:
            item = oeb.manifest.hrefs[href]
        except KeyError:
            item = oeb.manifest.hrefs[urlnormalize(href)]
        rewrite_links(item.data, partial(self.resource_adder, base=dpath))

    for item in oeb.manifest.values():
        if item.media_type in self.OEB_STYLES:
            # URLs inside a style sheet are relative to the directory the
            # sheet was loaded from.
            dpath = None
            for path, href in self.added_resources.items():
                if href == item.href:
                    dpath = os.path.dirname(path)
                    break
            css_parser.replaceUrls(item.data,
                                   partial(self.resource_adder, base=dpath))

    # --- Table of contents ---------------------------------------------------
    toc = self.oeb.toc
    self.oeb.auto_generated_toc = True
    # One (item, title, header) record per linear spine item.  Keeping the
    # labels paired with their item avoids the misalignment that arises when
    # separately collected label lists are zipped against the spine: a file
    # without a <title> would shift every later label onto the wrong file.
    labelled = []
    for item in self.oeb.spine:
        if not item.linear:
            continue
        html = item.data
        title = ''.join(xpath(html, '/h:html/h:head/h:title/text()'))
        title = re.sub(r'\s+', ' ', title.strip())
        header = '(unlabled)'
        for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'):
            expr = '/h:html/h:body//h:%s[position()=1]/text()'
            candidate = ''.join(xpath(html, expr % tag))
            candidate = re.sub(r'\s+', ' ', candidate.strip())
            if candidate:
                header = candidate
                break
        labelled.append((item, title, header))
    titles = [record[1] for record in labelled if record[1]]
    # Duplicate titles cannot tell the files apart, so headers are used.
    use_headers = len(titles) > len(set(titles))
    for item, title, header in labelled:
        # An item without a title falls back to its own header.
        label = header if (use_headers or not title) else title
        toc.add(label, item.href)

    oeb.container = DirContainer(getcwd(), oeb.log, ignore_opf=True)
    return oeb
def link_to_local_path(self, link_, base=None):
    """Resolve *link_* to a local file path and a fragment.

    :param link_: the link as text or UTF-8 encoded bytes.
    :param base: directory the link is relative to; the current working
        directory is used when it is empty or ``None``.
    :return: ``(path, fragment)`` with ``/`` replaced by ``os.sep`` in the
        path, or ``(None, None)`` when the link cannot be decoded or parsed,
        does not refer to a local resource, or has an empty path.
    """
    from calibre.ebooks.html.input import Link
    if not isinstance(link_, unicode_type):
        try:
            # 'strict' is the valid spelling of the error handler; the
            # previous value 'error' is not a registered handler name.
            link_ = link_.decode('utf-8', 'strict')
        except Exception:
            self.log.warn('Failed to decode link %r. Ignoring'%link_)
            return None, None
    try:
        parsed = Link(link_, base if base else getcwd())
    except Exception:
        self.log.exception('Failed to process link: %r'%link_)
        return None, None
    if parsed.path is None:
        # Not a local resource
        return None, None
    link = parsed.path.replace('/', os.sep).strip()
    frag = parsed.fragment
    if not link:
        return None, None
    return link, frag
def resource_adder(self, link_, base=None):
    """Link-rewriting callback: add the target of *link_* to the book.

    Links that do not resolve to a readable local file (and links to
    directories) are returned unchanged.  Otherwise the file is added to the
    manifest on first sight and the link is replaced by its manifest href,
    with the original fragment re-attached.  Links to plain text files yield
    ``None``.

    :param link_: the link as found in the document.
    :param base: directory that relative links are resolved against.
    """
    from polyglot.urllib import quote
    link, frag = self.link_to_local_path(link_, base=base)
    if link is None:
        return link_
    try:
        if base and not os.path.isabs(link):
            link = os.path.join(base, link)
        link = os.path.abspath(link)
    except:
        return link_
    if not os.access(link, os.R_OK):
        return link_
    if os.path.isdir(link):
        self.log.warn(link_, 'is a link to a directory. Ignoring.')
        return link_
    if not self.is_case_sensitive(tempfile.gettempdir()):
        link = link.lower()

    if link not in self.added_resources:
        source_dir, bhref = os.path.split(link)
        item_id, href = self.oeb.manifest.generate(id='added', href=sanitize_file_name(bhref))
        guessed = self.guess_type(href)[0]
        media_type = guessed or self.BINARY_MIME
        if media_type == 'text/plain':
            self.log.warn('Ignoring link to text file %r'%link_)
            return None
        if media_type == self.BINARY_MIME:
            # Check for the common case, images
            try:
                img = what(link)
            except EnvironmentError:
                img = None
            if img:
                media_type = self.guess_type('dummy.'+img)[0] or self.BINARY_MIME

        self.oeb.log.debug('Added', link)
        self.oeb.container = self.DirContainer(source_dir,
                                               self.oeb.log, ignore_opf=True)
        # Load into memory
        item = self.oeb.manifest.add(item_id, href, media_type)
        # bhref refers to an already existing file. The read() method of
        # DirContainer will call unquote on it before trying to read the
        # file, therefore we quote it here.
        if isinstance(bhref, unicode_type):
            bhref = bhref.encode('utf-8')
        item.html_input_href = as_unicode(quote(bhref))
        if guessed in self.OEB_STYLES:
            item.override_css_fetch = partial(
                self.css_import_handler, source_dir)
        # Attribute access for its side effect only: loads the item's data.
        item.data
        self.added_resources[link] = href

    nlink = self.added_resources[link]
    return '#'.join((nlink, frag)) if frag else nlink
def css_import_handler(self, base, href):
    """Fetch hook for CSS imports: load the style sheet *href* from disk.

    :param base: directory that *href* is resolved against.
    :param href: the imported style sheet's link.
    :return: ``(None, text)`` with the preprocessed CSS text, or
        ``(None, None)`` when the file is not a readable local file or
        reading/preprocessing fails.
    """
    link, frag = self.link_to_local_path(href, base=base)
    unusable = link is None or not os.access(link, os.R_OK) or os.path.isdir(link)
    if unusable:
        return (None, None)
    try:
        with open(link, 'rb') as css_file:
            raw = css_file.read().decode('utf-8', 'replace')
        raw = self.oeb.css_preprocessor(raw, add_namespace=False)
    except:
        self.log.exception('Failed to read CSS file: %r'%link)
        return (None, None)
    return (None, raw)

View File

@@ -0,0 +1,226 @@
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL 3'
__copyright__ = '2010, Fabian Grassl <fg@jusmeum.de>'
__docformat__ = 'restructuredtext en'
import os, re, shutil
from os.path import dirname, abspath, relpath as _relpath, exists, basename
from calibre.customize.conversion import OutputFormatPlugin, OptionRecommendation
from calibre import CurrentDir
from calibre.ptempfile import PersistentTemporaryDirectory
from polyglot.builtins import unicode_type
def relpath(*args):
    """Like ``os.path.relpath`` but always using ``/`` as the separator."""
    native = _relpath(*args)
    return '/'.join(native.split(os.sep))
# Output plugin that renders a book as a set of templated HTML pages packed
# into a ZIP archive.
class HTMLOutput(OutputFormatPlugin):
name = 'HTML Output'
author = 'Fabian Grassl'
file_type = 'zip'
commit_name = 'html_output'
# Conversion options contributed by this plugin.  The three template options
# name files that replace the bundled defaults when given.
options = {
OptionRecommendation(name='template_css',
help=_('CSS file used for the output instead of the default file')),
OptionRecommendation(name='template_html_index',
help=_('Template used for generation of the HTML index file instead of the default file')),
OptionRecommendation(name='template_html',
help=_('Template used for the generation of the HTML contents of the book instead of the default file')),
OptionRecommendation(name='extract_to',
help=_('Extract the contents of the generated ZIP file to the '
'specified directory. WARNING: The contents of the directory '
'will be deleted.')
),
}
# Recommend pretty printed output with high priority.
recommendations = {('pretty_print', True, OptionRecommendation.HIGH)}
def generate_toc(self, oeb_book, ref_url, output_dir):
    '''
    Generate table of contents

    Builds nested ``ul``/``li``/``a`` elements mirroring ``oeb_book.toc`` and
    returns them wrapped in a ``div`` element.  Entry hrefs are resolved
    with *output_dir* as the working directory and made relative to the
    directory of *ref_url*.
    '''
    from lxml import etree
    from polyglot.urllib import unquote
    from calibre.ebooks.oeb.base import element
    from calibre.utils.cleantext import clean_xml_chars

    ref_dir = dirname(ref_url)

    def as_text(value):
        # Byte strings are decoded as UTF-8, anything else passes through.
        return value.decode('utf-8') if isinstance(value, bytes) else value

    def build_node(current_node, parent=None):
        if parent is None:
            parent = etree.Element('ul')
        elif len(current_node.nodes):
            # Only open a nested list when there are children to put in it.
            parent = element(parent, ('ul'))
        for child in current_node.nodes:
            point = element(parent, 'li')
            href = as_text(relpath(abspath(unquote(child.href)), ref_dir))
            link = element(point, 'a', href=clean_xml_chars(href))
            title = as_text(child.title)
            if title:
                title = re.sub(r'\s+', ' ', title)
            link.text = clean_xml_chars(title)
            build_node(child, point)
        return parent

    # abspath() above depends on the working directory, so the tree has to be
    # built while output_dir is current.
    with CurrentDir(output_dir):
        wrap = etree.Element('div')
        wrap.append(build_node(oeb_book.toc))
        return wrap
def generate_html_toc(self, oeb_book, ref_url, output_dir):
    """Return the table of contents from ``generate_toc`` serialised as text."""
    from lxml import etree
    toc_root = self.generate_toc(oeb_book, ref_url, output_dir)
    serialisation = dict(pretty_print=True, encoding='unicode',
                         xml_declaration=False)
    return etree.tostring(toc_root, **serialisation)
# Render oeb_book as templated HTML pages and pack them into the ZIP file
# output_path.
#
# Layout produced inside a temporary directory: ``<name>.html`` (the index
# page) next to ``<name>_files/`` holding the style sheet, one rendered page
# per spine item and every other manifest item.  ``opts.extract_to``, when
# set, additionally receives the extracted archive (the directory is wiped
# first).  input_plugin is not used.
def convert(self, oeb_book, output_path, input_plugin, opts, log):
from lxml import etree
from calibre.utils import zipfile
from templite import Templite
from polyglot.urllib import unquote
from calibre.ebooks.html.meta import EasyMeta
# read template files
# Each template comes from the file named in opts when given, otherwise from
# the bundled resource returned by P(); all three are read as bytes.
if opts.template_html_index is not None:
with open(opts.template_html_index, 'rb') as f:
template_html_index_data = f.read()
else:
template_html_index_data = P('templates/html_export_default_index.tmpl', data=True)
if opts.template_html is not None:
with open(opts.template_html, 'rb') as f:
template_html_data = f.read()
else:
template_html_data = P('templates/html_export_default.tmpl', data=True)
if opts.template_css is not None:
with open(opts.template_css, 'rb') as f:
template_css_data = f.read()
else:
template_css_data = P('templates/html_export_default.css', data=True)
template_html_index_data = template_html_index_data.decode('utf-8')
template_html_data = template_html_data.decode('utf-8')
template_css_data = template_css_data.decode('utf-8')
self.log = log
self.opts = opts
meta = EasyMeta(oeb_book.metadata)
tempdir = os.path.realpath(PersistentTemporaryDirectory())
# NOTE(review): both patterns below are unanchored, so '.zip' / '.html' is
# removed wherever it occurs in the string, not only at the end.
output_file = os.path.join(tempdir,
basename(re.sub(r'\.zip', '', output_path)+'.html'))
output_dir = re.sub(r'\.html', '', output_file)+'_files'
if not exists(output_dir):
os.makedirs(output_dir)
css_path = output_dir+os.sep+'calibreHtmlOutBasicCss.css'
with open(css_path, 'wb') as f:
f.write(template_css_data.encode('utf-8'))
# Index page: rendered from the index template with links relative to the
# directory that holds the index file.
with open(output_file, 'wb') as f:
html_toc = self.generate_html_toc(oeb_book, output_file, output_dir)
templite = Templite(template_html_index_data)
nextLink = oeb_book.spine[0].href
nextLink = relpath(output_dir+os.sep+nextLink, dirname(output_file))
cssLink = relpath(abspath(css_path), dirname(output_file))
tocUrl = relpath(output_file, dirname(output_file))
t = templite.render(has_toc=bool(oeb_book.toc.count()),
toc=html_toc, meta=meta, nextLink=nextLink,
tocUrl=tocUrl, cssLink=cssLink,
firstContentPageLink=nextLink)
if isinstance(t, unicode_type):
t = t.encode('utf-8')
f.write(t)
# The hrefs below are resolved with abspath(), i.e. relative to the working
# directory, which is output_dir for the duration of this block.
with CurrentDir(output_dir):
# First pass over the manifest: spine items only get an empty placeholder
# file (their rendered content is written in the second loop), every other
# item is written out as bytes and unloaded from memory.
for item in oeb_book.manifest:
path = abspath(unquote(item.href))
dir = dirname(path)
if not exists(dir):
os.makedirs(dir)
if item.spine_position is not None:
with open(path, 'wb') as f:
pass
else:
with open(path, 'wb') as f:
f.write(item.bytes_representation)
item.unload_data_from_memory(memory=path)
# Second pass: render every spine item through the page template.
for item in oeb_book.spine:
path = abspath(unquote(item.href))
dir = dirname(path)
root = item.data.getroottree()
# get & clean HTML <HEAD>-data
# The head is serialised, its own <head> tags and any <style> blocks are
# stripped and a self-closed <title/> is expanded to an open/close pair.
head = root.xpath('//h:head', namespaces={'h': 'http://www.w3.org/1999/xhtml'})[0]
head_content = etree.tostring(head, pretty_print=True, encoding='unicode')
head_content = re.sub(r'\<\/?head.*\>', '', head_content)
head_content = re.sub(re.compile(r'\<style.*\/style\>', re.M|re.S), '', head_content)
head_content = re.sub(r'<(title)([^>]*)/>', r'<\1\2></\1>', head_content)
# get & clean HTML <BODY>-data
# Same treatment for the body; self-closed div/a/span tags are expanded.
body = root.xpath('//h:body', namespaces={'h': 'http://www.w3.org/1999/xhtml'})[0]
ebook_content = etree.tostring(body, pretty_print=True, encoding='unicode')
ebook_content = re.sub(r'\<\/?body.*\>', '', ebook_content)
ebook_content = re.sub(r'<(div|a|span)([^>]*)/>', r'<\1\2></\1>', ebook_content)
# generate link to next page
if item.spine_position+1 < len(oeb_book.spine):
nextLink = oeb_book.spine[item.spine_position+1].href
nextLink = relpath(abspath(nextLink), dir)
else:
nextLink = None
# generate link to previous page
if item.spine_position > 0:
prevLink = oeb_book.spine[item.spine_position-1].href
prevLink = relpath(abspath(prevLink), dir)
else:
prevLink = None
cssLink = relpath(abspath(css_path), dir)
tocUrl = relpath(output_file, dir)
# NOTE(review): unlike nextLink/prevLink this href is not made relative to
# the page's directory - confirm that is intended for pages in sub-folders.
firstContentPageLink = oeb_book.spine[0].href
# render template
templite = Templite(template_html_data)
# The ToC is passed as a callable, so it is only generated if the template
# actually calls it.
toc = lambda: self.generate_html_toc(oeb_book, path, output_dir)
t = templite.render(ebookContent=ebook_content,
prevLink=prevLink, nextLink=nextLink,
has_toc=bool(oeb_book.toc.count()), toc=toc,
tocUrl=tocUrl, head_content=head_content,
meta=meta, cssLink=cssLink,
firstContentPageLink=firstContentPageLink)
# write html to file
with open(path, 'wb') as f:
f.write(t.encode('utf-8'))
item.unload_data_from_memory(memory=path)
# Pack the files directory and the index page into the output archive.
# NOTE(review): if anything above or below raises, the archive is not closed
# and tempdir is not removed here - consider try/finally.
zfile = zipfile.ZipFile(output_path, "w")
zfile.add_dir(output_dir, basename(output_dir))
zfile.write(output_file, basename(output_file), zipfile.ZIP_DEFLATED)
if opts.extract_to:
# Destructive: an existing extract_to directory is deleted first.
if os.path.exists(opts.extract_to):
shutil.rmtree(opts.extract_to)
os.makedirs(opts.extract_to)
zfile.extractall(opts.extract_to)
self.log('Zip file extracted to', opts.extract_to)
zfile.close()
# cleanup temp dir
shutil.rmtree(tempdir)

View File

@@ -0,0 +1,133 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL 3'
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import os
from calibre import guess_type
from calibre.customize.conversion import InputFormatPlugin
from polyglot.builtins import getcwd
class HTMLZInput(InputFormatPlugin):
    """Input plugin for HTMLZ archives (a ZIP holding one top level HTML file)."""

    # NOTE(review): 'HTLZ' looks like a typo for 'HTMLZ'.  The value is left
    # untouched because it is a runtime string that other code may rely on.
    name = 'HTLZ Input'
    author = 'John Schember'
    description = 'Convert HTML files to HTML'
    file_types = {'htmlz'}
    commit_name = 'htmlz_input'

    def convert(self, stream, options, file_ext, log,
                accelerators):
        """Convert the HTMLZ archive in *stream* into an OEB book.

        The archive is extracted into the current directory, its top level
        HTML file is re-encoded as UTF-8 and run through the HTML input
        plugin, and metadata plus cover are then taken from the archive.

        :param stream: file object of the archive.
        :param options: conversion options.  The HTML input plugin's options
            are reset to their recommended values on this object and
            ``input_encoding`` is set to ``'utf-8'``.
        :param file_ext: extension used to look up the metadata reader.
        :param log: logger.
        :param accelerators: unused by this method.
        :raises Exception: when no top level HTML file exists or it is empty.
        :return: the OEB book.
        """
        from calibre.ebooks.chardet import xml_to_unicode
        from calibre.ebooks.metadata.opf2 import OPF
        from calibre.utils.zipfile import ZipFile

        self.log = log
        html = u''

        # Extract content from zip archive.
        # NOTE(review): the archive object is never closed explicitly; whether
        # closing it would also close *stream* (which is read again further
        # down) depends on the project's ZipFile - confirm before changing.
        zf = ZipFile(stream)
        zf.extractall()

        # Find the HTML file in the archive. It needs to be
        # top level.
        index = u''
        multiple_html = False
        # Get a list of all top level files in the archive.
        top_levels = [x for x in os.listdir(u'.') if os.path.isfile(x)]
        # Try to find an index file.
        for x in top_levels:
            if x.lower() in (u'index.html', u'index.xhtml', u'index.htm'):
                index = x
                break
        # Look for multiple HTML files in the archive. We look at the
        # top level files only as only they matter in HTMLZ.
        for x in top_levels:
            if os.path.splitext(x)[1].lower() in (u'.html', u'.xhtml', u'.htm'):
                # Set index to the first HTML file found if it's not
                # called index.
                if not index:
                    index = x
                elif x != index:
                    # The index file itself must not count as a second
                    # HTML file, otherwise every archive with an index.html
                    # triggers the warning below.
                    multiple_html = True
        # Warn the user if there are multiple HTML files in the archive.
        # HTMLZ supports a single HTML file. Converting an archive with
        # several HTML files probably won't turn out as the user expects;
        # ZIP input should be used for those instead of HTMLZ.
        if multiple_html:
            log.warn(_('Multiple HTML files found in the archive. Only %s will be used.') % index)

        if index:
            with open(index, 'rb') as tf:
                html = tf.read()
        else:
            raise Exception(_('No top level HTML file found.'))

        if not html:
            raise Exception(_('Top level HTML file %s is empty') % index)

        # Encoding
        if options.input_encoding:
            ienc = options.input_encoding
        else:
            ienc = xml_to_unicode(html[:4096])[-1]
        html = html.decode(ienc, 'replace')

        # Run the HTML through the html processing plugin.
        from calibre.customize.ui import plugin_for_input_format
        html_input = plugin_for_input_format('html')
        for opt in html_input.options:
            setattr(options, opt.option.name, opt.recommended_value)
        options.input_encoding = 'utf-8'
        base = getcwd()
        # Pick a file name that does not clash with an extracted file.
        htmlfile = os.path.join(base, u'index.html')
        c = 0
        while os.path.exists(htmlfile):
            c += 1
            htmlfile = u'index%d.html'%c
        with open(htmlfile, 'wb') as f:
            f.write(html.encode('utf-8'))
        odi = options.debug_pipeline
        options.debug_pipeline = None
        try:
            # Generate oeb from html conversion.
            with open(htmlfile, 'rb') as f:
                oeb = html_input.convert(f, options, 'html', log,
                                         {})
        finally:
            # Restore the caller's setting and drop the scratch file even
            # when the nested conversion fails.
            options.debug_pipeline = odi
            os.remove(htmlfile)

        # Set metadata from file.
        from calibre.customize.ui import get_file_type_metadata
        from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
        mi = get_file_type_metadata(stream, file_ext)
        meta_info_to_oeb_metadata(mi, oeb.metadata, log)

        # Get the cover path from the OPF.
        cover_path = None
        opf = None
        for x in top_levels:
            if os.path.splitext(x)[1].lower() == u'.opf':
                opf = x
                break
        if opf:
            opf = OPF(opf, basedir=getcwd())
            cover_path = opf.raster_cover or opf.cover
        # Set the cover.
        if cover_path:
            with open(os.path.join(getcwd(), cover_path), 'rb') as cf:
                cdata = cf.read()
            cover_name = os.path.basename(cover_path)
            cover_id, href = oeb.manifest.generate('cover', cover_name)
            oeb.manifest.add(cover_id, href, guess_type(cover_name)[0], data=cdata)
            oeb.guide.add('cover', 'Cover', href)

        return oeb

View File

@@ -0,0 +1,136 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL 3'
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import io
import os
from calibre.customize.conversion import OutputFormatPlugin, \
OptionRecommendation
from calibre.ptempfile import TemporaryDirectory
from polyglot.builtins import unicode_type
class HTMLZOutput(OutputFormatPlugin):
    """Output plugin that writes a book as an HTMLZ archive.

    The archive holds one HTML file, optionally ``style.css``, an ``images``
    folder, ``cover.jpg`` and ``metadata.opf``.
    """

    name = 'HTMLZ Output'
    author = 'John Schember'
    file_type = 'htmlz'
    commit_name = 'htmlz_output'
    # Translated descriptions of the option choices; the option definitions
    # below derive both their choices and their help text from this mapping.
    ui_data = {
        'css_choices': {
            'class': _('Use CSS classes'),
            'inline': _('Use the style attribute'),
            'tag': _('Use HTML tags wherever possible')
        },
        'sheet_choices': {
            'external': _('Use an external CSS file'),
            'inline': _('Use a <style> tag in the HTML file')
        }
    }

    options = {
        OptionRecommendation(name='htmlz_css_type', recommended_value='class',
            level=OptionRecommendation.LOW,
            choices=list(ui_data['css_choices']),
            help=_('Specify the handling of CSS. Default is class.\n'
                   'class: {class}\n'
                   'inline: {inline}\n'
                   'tag: {tag}'
            ).format(**ui_data['css_choices'])),
        OptionRecommendation(name='htmlz_class_style', recommended_value='external',
            level=OptionRecommendation.LOW,
            choices=list(ui_data['sheet_choices']),
            help=_('How to handle the CSS when using css-type = \'class\'.\n'
                   'Default is external.\n'
                   'external: {external}\n'
                   'inline: {inline}'
            ).format(**ui_data['sheet_choices'])),
        OptionRecommendation(name='htmlz_title_filename',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('If set this option causes the file name of the HTML file'
                   ' inside the HTMLZ archive to be based on the book title.')
        ),
    }

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        """Write *oeb_book* to *output_path* as an HTMLZ archive.

        :param oeb_book: the book to convert.
        :param output_path: destination of the archive.
        :param input_plugin: unused by this method.
        :param opts: conversion options; the ``htmlz_*`` options select the
            CSS handling and the name of the HTML file.
        :param log: logger handed to the HTML generator.
        """
        from lxml import etree
        from calibre.ebooks.oeb.base import OEB_IMAGES, SVG_MIME
        from calibre.ebooks.metadata.opf2 import OPF, metadata_to_opf
        from calibre.utils.zipfile import ZipFile
        from calibre.utils.filenames import ascii_filename

        # HTML
        if opts.htmlz_css_type == 'inline':
            from calibre.ebooks.htmlz.oeb2html import OEB2HTMLInlineCSSizer
            OEB2HTMLizer = OEB2HTMLInlineCSSizer
        elif opts.htmlz_css_type == 'tag':
            from calibre.ebooks.htmlz.oeb2html import OEB2HTMLNoCSSizer
            OEB2HTMLizer = OEB2HTMLNoCSSizer
        else:
            from calibre.ebooks.htmlz.oeb2html import OEB2HTMLClassCSSizer as OEB2HTMLizer

        def as_bytes(data):
            # All files below are opened in binary mode, so text has to be
            # encoded before it is written.
            if isinstance(data, unicode_type):
                return data.encode('utf-8')
            return data

        with TemporaryDirectory(u'_htmlz_output') as tdir:
            htmlizer = OEB2HTMLizer(log)
            html = htmlizer.oeb2html(oeb_book, opts)

            fname = u'index'
            if opts.htmlz_title_filename:
                from calibre.utils.filenames import shorten_components_to
                fname = shorten_components_to(100, (ascii_filename(unicode_type(oeb_book.metadata.title[0])),))[0]
            with open(os.path.join(tdir, fname+u'.html'), 'wb') as tf:
                tf.write(as_bytes(html))

            # CSS
            if opts.htmlz_css_type == 'class' and opts.htmlz_class_style == 'external':
                with open(os.path.join(tdir, u'style.css'), 'wb') as tf:
                    tf.write(as_bytes(htmlizer.get_css(oeb_book)))

            # Images
            images = htmlizer.images
            if images:
                if not os.path.exists(os.path.join(tdir, u'images')):
                    os.makedirs(os.path.join(tdir, u'images'))
                for item in oeb_book.manifest:
                    if item.media_type in OEB_IMAGES and item.href in images:
                        if item.media_type == SVG_MIME:
                            # Serialised as text here, encoded on write.
                            data = etree.tostring(item.data, encoding='unicode')
                        else:
                            data = item.data
                        image_path = os.path.join(tdir, u'images', images[item.href])
                        with open(image_path, 'wb') as img:
                            img.write(as_bytes(data))

            # Cover
            cover_path = None
            try:
                cover_data = None
                if oeb_book.metadata.cover:
                    term = oeb_book.metadata.cover[0].term
                    cover_data = oeb_book.guide[term].item.data
                if cover_data:
                    from calibre.utils.img import save_cover_data_to
                    cover_path = os.path.join(tdir, u'cover.jpg')
                    with lopen(cover_path, 'w') as cf:
                        cf.write('')
                    save_cover_data_to(cover_data, cover_path)
            except Exception:
                # Best effort: a broken cover must not abort the conversion.
                import traceback
                traceback.print_exc()

            # Metadata
            with open(os.path.join(tdir, u'metadata.opf'), 'wb') as mdataf:
                opf = OPF(io.BytesIO(etree.tostring(oeb_book.metadata.to_opf1(), encoding='UTF-8')))
                mi = opf.to_book_metadata()
                if cover_path:
                    mi.cover = u'cover.jpg'
                mdataf.write(metadata_to_opf(mi))

            htmlz = ZipFile(output_path, 'w')
            try:
                htmlz.add_dir(tdir)
            finally:
                # Close explicitly so the archive is finalised and its file
                # handle released without relying on garbage collection.
                htmlz.close()

View File

@@ -0,0 +1,64 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from calibre.customize.conversion import InputFormatPlugin
class LITInput(InputFormatPlugin):
    """Input plugin for LIT files."""

    name = 'LIT Input'
    author = 'Marshall T. Vandegrift'
    description = 'Convert LIT files to HTML'
    file_types = {'lit'}
    commit_name = 'lit_input'

    def convert(self, stream, options, file_ext, log,
                accelerators):
        """Create an OEB book from *stream* using ``LitReader``."""
        from calibre.ebooks.lit.reader import LitReader
        from calibre.ebooks.conversion.plumber import create_oebbook
        self.log = log
        return create_oebbook(log, stream, options, reader=LitReader)

    def postprocess_book(self, oeb, opts, log):
        """Clean up the spine documents of a converted LIT book.

        ``metadata`` and ``guide`` elements are removed from every document.
        A body consisting of a single ``pre`` element is re-flowed: its text
        is converted to basic HTML and the ``pre`` is turned into a ``div``
        holding the result.
        """
        from calibre.ebooks.oeb.base import XHTML_NS, XPath, XHTML
        find_body = XPath('//h:body')
        for item in oeb.spine:
            root = item.data
            if not hasattr(root, 'xpath'):
                continue
            for bad in ('metadata', 'guide'):
                for stray in XPath('//h:'+bad)(root):
                    stray.getparent().remove(stray)
            bodies = find_body(root)
            if not bodies:
                continue
            body = bodies[0]
            if not (len(body) == 1 and body[0].tag == XHTML('pre')):
                continue
            pre = body[0]
            from calibre.ebooks.txt.processor import convert_basic, \
                separate_paragraphs_single_line
            from calibre.ebooks.chardet import xml_to_unicode
            from calibre.utils.xml_parse import safe_xml_fromstring
            import copy
            self.log('LIT file with all text in singe <pre> tag detected')
            html = separate_paragraphs_single_line(pre.text)
            html = convert_basic(html).replace('<html>',
                                               '<html xmlns="%s">'%XHTML_NS)
            html = xml_to_unicode(html, strip_encoding_pats=True,
                                  resolve_entities=True)[0]
            if opts.smarten_punctuation:
                # SmartyPants skips text inside <pre> tags
                from calibre.ebooks.conversion.preprocess import smarten_punctuation
                html = smarten_punctuation(html, self.log)
            new_root = safe_xml_fromstring(html)
            pre.tag = XHTML('div')
            pre.text = ''
            # NOTE(review): this iterates the list of matched body elements,
            # so each body element itself (not its children) is copied into
            # the div - confirm that this nesting is intended.
            for elem in find_body(new_root):
                pre.append(copy.deepcopy(elem))

View File

@@ -0,0 +1,38 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from calibre.customize.conversion import OutputFormatPlugin
class LITOutput(OutputFormatPlugin):
    """Conversion output plugin that writes a book as a ``.lit`` file."""

    name        = 'LIT Output'
    author      = 'Marshall T. Vandegrift'
    file_type   = 'lit'
    commit_name = 'lit_output'

    def convert(self, oeb, output_path, input_plugin, opts, log):
        """Run the LIT-specific transforms on ``oeb`` and write ``output_path``.

        The transforms are applied in this order: split on page breaks,
        add an HTML table of contents, mangle case, rasterize SVG, and
        finally hand the book to ``LitWriter``.
        """
        self.log, self.opts, self.oeb = log, opts, oeb
        from calibre.ebooks.oeb.transforms.manglecase import CaseMangler
        from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer
        from calibre.ebooks.oeb.transforms.htmltoc import HTMLTOCAdder
        from calibre.ebooks.lit.writer import LitWriter
        from calibre.ebooks.oeb.transforms.split import Split

        page_splitter = Split(
            split_on_page_breaks=True,
            max_flow_size=0,
            remove_css_pagebreaks=False,
        )
        page_splitter(self.oeb, self.opts)

        # Each transform is a callable object taking (book, options).
        HTMLTOCAdder()(oeb, opts)
        CaseMangler()(oeb, opts)
        SVGRasterizer()(oeb, opts)

        LitWriter(self.opts)(oeb, output_path)

View File

@@ -0,0 +1,82 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import os, sys
from calibre.customize.conversion import InputFormatPlugin
class LRFInput(InputFormatPlugin):
    """Conversion input plugin for ``.lrf`` files."""

    name        = 'LRF Input'
    author      = 'Kovid Goyal'
    description = 'Convert LRF files to HTML'
    file_types  = {'lrf'}
    commit_name = 'lrf_input'

    @staticmethod
    def _char_button_map(doc):
        """Map CharButton ``refobj`` ids to ``<refpage>.xhtml#<refobj>`` links."""
        char_button_map = {}
        for x in doc.xpath('//CharButton[@refobj]'):
            ro = x.get('refobj')
            jump_button = doc.xpath('//*[@objid="%s"]'%ro)
            if jump_button:
                jump_to = jump_button[0].xpath('descendant::JumpTo[@refpage and @refobj]')
                if jump_to:
                    char_button_map[ro] = '%s.xhtml#%s'%(jump_to[0].get('refpage'),
                                                        jump_to[0].get('refobj'))
        return char_button_map

    @staticmethod
    def _plot_map(doc):
        """Map Plot ``refobj`` ids to the ``file`` of the referenced image stream."""
        plot_map = {}
        for x in doc.xpath('//Plot[@refobj]'):
            ro = x.get('refobj')
            image = doc.xpath('//Image[@objid="%s" and @refstream]'%ro)
            if image:
                imgstr = doc.xpath('//ImageStream[@objid="%s" and @file]'%
                                   image[0].get('refstream'))
                if imgstr:
                    plot_map[ro] = imgstr[0].get('file')
        return plot_map

    def convert(self, stream, options, file_ext, log,
                accelerators):
        """Convert the LRF ``stream`` and write ``content.opf`` in the cwd.

        The document is first serialized to XML by ``LRFDocument`` and then
        transformed with the ``templates/lrf.xsl`` stylesheet, using the
        helper objects from ``calibre.ebooks.lrf.input`` as XSLT extensions.

        :param options: conversion options; ``verbose > 2`` additionally
            dumps the intermediate XML to ``lrs.xml``.
        :return: absolute path of the generated ``content.opf``.
        """
        from calibre.ebooks.lrf.input import (MediaType, Styles, TextBlock,
                                              Canvas, ImageBlock, RuledLine)
        self.log = log
        self.log('Generating XML')
        from calibre.ebooks.lrf.lrfparser import LRFDocument
        from calibre.utils.xml_parse import safe_xml_fromstring
        from lxml import etree
        d = LRFDocument(stream)
        d.parse()
        xml = d.to_xml(write_files=True)
        if options.verbose > 2:
            # Use a context manager so the debug dump is flushed and closed
            # deterministically instead of relying on garbage collection.
            with open(u'lrs.xml', 'wb') as dump:
                dump.write(xml.encode('utf-8'))
        doc = safe_xml_fromstring(xml)

        char_button_map = self._char_button_map(doc)
        plot_map = self._plot_map(doc)

        self.log('Converting XML to HTML...')
        styledoc = safe_xml_fromstring(P('templates/lrf.xsl', data=True))
        media_type = MediaType()
        styles = Styles()
        text_block = TextBlock(styles, char_button_map, plot_map, log)
        canvas = Canvas(doc, styles, text_block, log)
        image_block = ImageBlock(canvas)
        ruled_line = RuledLine()
        extensions = {
            ('calibre', 'media-type') : media_type,
            ('calibre', 'text-block') : text_block,
            ('calibre', 'ruled-line') : ruled_line,
            ('calibre', 'styles')     : styles,
            ('calibre', 'canvas')     : canvas,
            ('calibre', 'image-block'): image_block,
        }
        transform = etree.XSLT(styledoc, extensions=extensions)
        try:
            result = transform(doc)
        except RuntimeError:
            # Retry once with a higher recursion limit.
            sys.setrecursionlimit(5000)
            result = transform(doc)

        with open('content.opf', 'wb') as f:
            f.write(result)
        styles.write()
        return os.path.abspath('content.opf')

View File

@@ -0,0 +1,196 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import sys, os
from calibre.customize.conversion import OutputFormatPlugin
from calibre.customize.conversion import OptionRecommendation
from polyglot.builtins import unicode_type
class LRFOptions(object):
    """Option object built from the generic conversion options for LRF output.

    ``LRFOutput.convert`` creates one instance and passes it to
    ``process_file`` and ``LRFOutput.convert_images``.  Besides the values
    copied from ``opts`` and the book metadata, a number of attributes are
    set to fixed values.
    """

    def __init__(self, output, opts, oeb):
        """Populate the option object.

        :param output: output path, stored as ``self.output``.
        :param opts: conversion options. Negative ``margin_*`` values on
            this object are reset to 0 in place.
        :param oeb: book whose ``metadata`` supplies author, title sort,
            description and subject.
        """
        def f2s(f):
            # First entry of a metadata field as text, or '' when it
            # cannot be read (e.g. the field is empty).
            try:
                return unicode_type(f[0])
            except Exception:
                return ''

        m = oeb.metadata
        for x in ('left', 'top', 'right', 'bottom'):
            attr = 'margin_'+x
            val = getattr(opts, attr)
            if val < 0:
                setattr(opts, attr, 0)
        self.title = None
        self.author = self.publisher = _('Unknown')
        self.title_sort = self.author_sort = ''
        for x in m.creator:
            if x.role == 'aut':
                self.author = unicode_type(x)
                fa = unicode_type(getattr(x, 'file_as', ''))
                if fa:
                    self.author_sort = fa
        for x in m.title:
            if unicode_type(x.file_as):
                self.title_sort = unicode_type(x.file_as)
        self.freetext = f2s(m.description)
        self.category = f2s(m.subject)
        self.cover = None
        self.use_metadata_cover = True
        self.output = output
        self.ignore_tables = opts.linearize_tables
        if opts.disable_font_rescaling:
            self.base_font_size = 0
        else:
            self.base_font_size = opts.base_font_size
        self.blank_after_para = opts.insert_blank_line
        self.use_spine = True
        self.font_delta = 0
        self.ignore_colors = False
        from calibre.ebooks.lrf import PRS500_PROFILE
        self.profile = PRS500_PROFILE
        self.link_levels = sys.maxsize
        self.link_exclude = '@'
        self.no_links_in_toc = True
        self.disable_chapter_detection = True
        # NOTE(review): the next values look like deliberate placeholders
        # for settings that are not exposed by this plugin - confirm
        # against the LRF HTML converter before changing them.
        self.chapter_regex = 'dsadcdswcdec'
        self.chapter_attr = '$,,$'
        self.override_css = self._override_css = ''
        self.page_break = 'h[12]'
        self.force_page_break = '$'
        self.force_page_break_attr = '$'
        self.add_chapters_to_toc = False
        self.baen = self.pdftohtml = self.book_designer = False
        self.verbose = opts.verbose
        self.encoding = 'utf-8'
        self.lrs = False
        self.minimize_memory_usage = False
        self.autorotation = opts.enable_autorotation
        # Values given in points are scaled by dpi/72.
        self.header_separation = (self.profile.dpi/72.) * opts.header_separation
        self.headerformat = opts.header_format

        for x in ('top', 'bottom', 'left', 'right'):
            setattr(self, x+'_margin',
                    (self.profile.dpi/72.) * float(getattr(opts, 'margin_'+x)))

        # These options are copied over unchanged under the same name.
        for x in ('wordspace', 'header', 'header_format',
                  'minimum_indent', 'serif_family',
                  'render_tables_as_images', 'sans_family', 'mono_family',
                  'text_size_multiplier_for_rendered_tables'):
            setattr(self, x, getattr(opts, x))
class LRFOutput(OutputFormatPlugin):
    """Conversion output plugin that writes a book as a ``.lrf`` file."""

    name        = 'LRF Output'
    author      = 'Kovid Goyal'
    file_type   = 'lrf'
    commit_name = 'lrf_output'

    options = {
        OptionRecommendation(name='enable_autorotation', recommended_value=False,
            help=_('Enable auto-rotation of images that are wider than the screen width.')
        ),
        OptionRecommendation(name='wordspace',
            recommended_value=2.5, level=OptionRecommendation.LOW,
            help=_('Set the space between words in pts. Default is %default')
        ),
        OptionRecommendation(name='header', recommended_value=False,
            help=_('Add a header to all the pages with title and author.')
        ),
        OptionRecommendation(name='header_format', recommended_value="%t by %a",
            help=_('Set the format of the header. %a is replaced by the author '
            'and %t by the title. Default is %default')
        ),
        OptionRecommendation(name='header_separation', recommended_value=0,
            help=_('Add extra spacing below the header. Default is %default pt.')
        ),
        OptionRecommendation(name='minimum_indent', recommended_value=0,
            help=_('Minimum paragraph indent (the indent of the first line '
            'of a paragraph) in pts. Default: %default')
        ),
        OptionRecommendation(name='render_tables_as_images',
            recommended_value=False,
            help=_('This option has no effect')
        ),
        OptionRecommendation(name='text_size_multiplier_for_rendered_tables',
            recommended_value=1.0,
            help=_('Multiply the size of text in rendered tables by this '
            'factor. Default is %default')
        ),
        OptionRecommendation(name='serif_family', recommended_value=None,
            help=_('The serif family of fonts to embed')
        ),
        OptionRecommendation(name='sans_family', recommended_value=None,
            help=_('The sans-serif family of fonts to embed')
        ),
        OptionRecommendation(name='mono_family', recommended_value=None,
            help=_('The monospace family of fonts to embed')
        ),
    }

    recommendations = {
        ('change_justification', 'original', OptionRecommendation.HIGH)}

    def convert_images(self, pages, opts, wide):
        """Write an LRF book that has one full-page image per entry of ``pages``.

        :param pages: iterable of values accepted by ``ImageStream``.
        :param opts: option object providing ``title``, ``author`` and
            ``output`` (the destination path).
        :param wide: when true a 784x1012 page is used instead of 584x754.
        """
        from calibre.ebooks.lrf.pylrs.pylrs import Book, BookSetting, ImageStream, ImageBlock
        from uuid import uuid4
        from calibre.constants import __appname__, __version__

        width, height = (784, 1012) if wide else (584, 754)

        ps = {}
        ps['topmargin']      = 0
        ps['evensidemargin'] = 0
        ps['oddsidemargin']  = 0
        ps['textwidth']      = width
        ps['textheight']     = height
        book = Book(title=opts.title, author=opts.author,
                bookid=uuid4().hex,
                publisher='%s %s'%(__appname__, __version__),
                category=_('Comic'), pagestyledefault=ps,
                booksetting=BookSetting(screenwidth=width, screenheight=height))
        for page in pages:
            imageStream = ImageStream(page)
            _page = book.create_page()
            _page.append(ImageBlock(refstream=imageStream,
                        blockwidth=width, blockheight=height, xsize=width,
                        ysize=height, x1=width, y1=height))
            book.append(_page)

        # Close the output file deterministically, also when rendering fails.
        with open(opts.output, 'wb') as lrf_file:
            book.renderLrf(lrf_file)

    def flatten_toc(self):
        """Replace the book's TOC by a single-level TOC of all its descendants."""
        from calibre.ebooks.oeb.base import TOC
        nroot = TOC()
        for x in self.oeb.toc.iterdescendants():
            nroot.add(x.title, x.href)
        self.oeb.toc = nroot

    def convert(self, oeb, output_path, input_plugin, opts, log):
        """Write ``oeb`` to ``output_path`` as LRF.

        Image collections are rendered page by page with ``convert_images``.
        Any other book is first written as OEB into a temporary directory
        and the resulting OPF is passed to the LRF HTML converter.
        """
        self.log, self.opts, self.oeb = log, opts, oeb

        lrf_opts = LRFOptions(output_path, opts, oeb)

        if input_plugin.is_image_collection:
            self.convert_images(input_plugin.get_images(), lrf_opts,
                    getattr(opts, 'wide', False))
            return

        self.flatten_toc()

        from calibre.ptempfile import TemporaryDirectory
        with TemporaryDirectory('_lrf_output') as tdir:
            from calibre.customize.ui import plugin_for_output_format
            oeb_output = plugin_for_output_format('oeb')
            oeb_output.convert(oeb, tdir, input_plugin, opts, log)
            # The first .opf found in the temporary directory is used.
            opf = [x for x in os.listdir(tdir) if x.endswith('.opf')][0]
            from calibre.ebooks.lrf.html.convert_from import process_file
            process_file(os.path.join(tdir, opf), lrf_opts, self.log)

View File

@@ -0,0 +1,66 @@
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL 3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import os
from calibre.customize.conversion import InputFormatPlugin
from polyglot.builtins import unicode_type
class MOBIInput(InputFormatPlugin):
    """Conversion input plugin for MOBI-family files."""

    name        = 'MOBI Input'
    author      = 'Kovid Goyal'
    description = 'Convert MOBI files (.mobi, .prc, .azw) to HTML'
    file_types  = {'mobi', 'prc', 'azw', 'azw3', 'pobi'}
    commit_name = 'mobi_input'

    def convert(self, stream, options, file_ext, log,
                accelerators):
        """Extract the book in ``stream`` into the current directory.

        Sets ``self.is_kf8`` and ``self.mobi_is_joint`` to describe what was
        found.  For KF8 content ``self.encrypted_fonts`` is set as well.
        For non-KF8 content ``accelerators['pagebreaks']`` is filled in.

        :return: path of the OPF file describing the extracted content.
        """
        self.is_kf8 = False
        self.mobi_is_joint = False

        from calibre.ebooks.mobi.reader.mobi6 import MobiReader
        from lxml import html
        parse_cache = {}
        try:
            mr = MobiReader(stream, log, options.input_encoding,
                            options.debug_pipeline)
            if mr.kf8_type is None:
                mr.extract_content('.', parse_cache)
        except Exception:
            # Second attempt with try_extra_data_fix=True.  Only ordinary
            # errors trigger it: KeyboardInterrupt and SystemExit must
            # propagate instead of silently starting another parse.
            mr = MobiReader(stream, log, options.input_encoding,
                            options.debug_pipeline, try_extra_data_fix=True)
            if mr.kf8_type is None:
                mr.extract_content('.', parse_cache)

        if mr.kf8_type is not None:
            log('Found KF8 MOBI of type %r'%mr.kf8_type)
            if mr.kf8_type == 'joint':
                self.mobi_is_joint = True
            from calibre.ebooks.mobi.reader.mobi8 import Mobi8Reader
            mr = Mobi8Reader(mr, log)
            opf = os.path.abspath(mr())
            self.encrypted_fonts = mr.encrypted_fonts
            self.is_kf8 = True
            return opf

        # The raw markup, when present in the cache, is written out
        # separately and must not be treated as a parsed document below.
        raw = parse_cache.pop('calibre_raw_mobi_markup', False)
        if raw:
            if isinstance(raw, unicode_type):
                raw = raw.encode('utf-8')
            with lopen('debug-raw.html', 'wb') as f:
                f.write(raw)
        from calibre.ebooks.oeb.base import close_self_closing_tags
        # Remaining cache entries map output file names to parsed trees.
        for f, root in parse_cache.items():
            raw = html.tostring(root, encoding='utf-8', method='xml',
                                include_meta_content_type=False)
            raw = close_self_closing_tags(raw)
            with lopen(f, 'wb') as q:
                q.write(raw)
        accelerators['pagebreaks'] = '//h:div[@class="mbp_pagebreak"]'
        return mr.created_opf_path

View File

@@ -0,0 +1,337 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
from calibre.customize.conversion import (OutputFormatPlugin,
OptionRecommendation)
from polyglot.builtins import unicode_type
def remove_html_cover(oeb, log):
    """Drop the HTML cover page referenced by the guide, if there is one.

    Nothing happens unless the book has cover metadata and a ``cover``
    guide entry.  Otherwise the guide entry is deleted; if the referenced
    manifest item is part of the spine it is removed from the spine and,
    when its media type is one of ``OEB_DOCS``, from the manifest too.
    """
    from calibre.ebooks.oeb.base import OEB_DOCS

    has_html_cover = oeb.metadata.cover and 'cover' in oeb.guide
    if not has_html_cover:
        return

    cover_href = oeb.guide['cover'].href
    del oeb.guide['cover']
    cover_item = oeb.manifest.hrefs[cover_href]
    if cover_item.spine_position is None:
        # Not part of the reading order, nothing to remove.
        return

    log.warn('Found an HTML cover: ', cover_item.href, 'removing it.',
             'If you find some content missing from the output MOBI, it '
             'is because you misidentified the HTML cover in the input '
             'document')
    oeb.spine.remove(cover_item)
    if cover_item.media_type in OEB_DOCS:
        oeb.manifest.remove(cover_item)
def extract_mobi(output_path, opts):
    """Unpack the generated file into ``opts.extract_to`` when that is set.

    Does nothing when ``opts.extract_to`` is ``None``.
    """
    target_dir = opts.extract_to
    if target_dir is None:
        return
    from calibre.ebooks.mobi.debug.main import inspect_mobi
    inspect_mobi(output_path, ddir=target_dir)
class MOBIOutput(OutputFormatPlugin):
    """Conversion output plugin that writes MOBI 6, KF8 or joint MOBI files."""

    name        = 'MOBI Output'
    author      = 'Kovid Goyal'
    file_type   = 'mobi'
    commit_name = 'mobi_output'
    ui_data = {'file_types': ['old', 'both', 'new']}

    options = {
        OptionRecommendation(name='prefer_author_sort',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('When present, use author sort field as author.')
        ),
        OptionRecommendation(name='no_inline_toc',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Don\'t add Table of Contents to the book. Useful if '
                'the book has its own table of contents.')),
        OptionRecommendation(name='toc_title', recommended_value=None,
            help=_('Title for any generated in-line table of contents.')
        ),
        OptionRecommendation(name='dont_compress',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Disable compression of the file contents.')
        ),
        OptionRecommendation(name='personal_doc', recommended_value='[PDOC]',
            help=_('Tag for MOBI files to be marked as personal documents.'
                ' This option has no effect on the conversion. It is used'
                ' only when sending MOBI files to a device. If the file'
                ' being sent has the specified tag, it will be marked as'
                ' a personal document when sent to the Kindle.')
        ),
        OptionRecommendation(name='mobi_ignore_margins',
            recommended_value=False,
            help=_('Ignore margins in the input document. If False, then '
                'the MOBI output plugin will try to convert margins specified'
                ' in the input document, otherwise it will ignore them.')
        ),
        OptionRecommendation(name='mobi_toc_at_start',
            recommended_value=False,
            help=_('When adding the Table of Contents to the book, add it at the start of the '
                'book instead of the end. Not recommended.')
        ),
        OptionRecommendation(name='extract_to',
            help=_('Extract the contents of the generated %s file to the '
                'specified directory. The contents of the directory are first '
                'deleted, so be careful.') % 'MOBI'
        ),
        OptionRecommendation(name='share_not_sync', recommended_value=False,
            help=_('Enable sharing of book content via Facebook etc. '
                ' on the Kindle. WARNING: Using this feature means that '
                ' the book will not auto sync its last read position '
                ' on multiple devices. Complain to Amazon.')
        ),
        OptionRecommendation(name='mobi_keep_original_images',
            recommended_value=False,
            help=_('By default calibre converts all images to JPEG format '
                'in the output MOBI file. This is for maximum compatibility '
                'as some older MOBI viewers have problems with other image '
                'formats. This option tells calibre not to do this. '
                'Useful if your document contains lots of GIF/PNG images that '
                'become very large when converted to JPEG.')),
        OptionRecommendation(name='mobi_file_type', choices=ui_data['file_types'], recommended_value='old',
            help=_('By default calibre generates MOBI files that contain the '
                'old MOBI 6 format. This format is compatible with all '
                'devices. However, by changing this setting, you can tell '
                'calibre to generate MOBI files that contain both MOBI 6 and '
                'the new KF8 format, or only the new KF8 format. KF8 has '
                'more features than MOBI 6, but only works with newer Kindles. '
                'Allowed values: {}').format('old, both, new')),
    }

    def check_for_periodical(self):
        """Prepare periodicals and record the result in ``opts.mobi_periodical``."""
        if self.is_periodical:
            self.periodicalize_toc()
            self.check_for_masthead()
            self.opts.mobi_periodical = True
        else:
            self.opts.mobi_periodical = False

    def check_for_masthead(self):
        """Make sure the guide has a ``masthead`` entry, generating one if needed."""
        found = 'masthead' in self.oeb.guide
        if not found:
            from calibre.ebooks import generate_masthead
            self.oeb.log.debug('No masthead found in manifest, generating default mastheadImage...')
            raw = generate_masthead(unicode_type(self.oeb.metadata['title'][0]))
            # Named item_id rather than id so the builtin is not shadowed.
            item_id, href = self.oeb.manifest.generate('masthead', 'masthead')
            self.oeb.manifest.add(item_id, href, 'image/gif', data=raw)
            self.oeb.guide.add('masthead', 'Masthead Image', href)
        else:
            self.oeb.log.debug('Using mastheadImage supplied in manifest...')

    def periodicalize_toc(self):
        """Rebuild the TOC as periodical -> sections -> articles.

        Nothing is done for an empty TOC, for books with fewer than three
        spine items, or when the first TOC node already has the class
        ``periodical``.
        """
        from calibre.ebooks.oeb.base import TOC
        toc = self.oeb.toc
        if not toc or len(self.oeb.spine) < 3:
            return
        if toc[0].klass != 'periodical':
            one, two = self.oeb.spine[0], self.oeb.spine[1]
            self.log('Converting TOC for MOBI periodical indexing...')

            articles = {}
            if toc.depth() < 3:
                # single section periodical
                self.oeb.manifest.remove(one)
                self.oeb.manifest.remove(two)
                sections = [TOC(klass='section', title=_('All articles'),
                                href=self.oeb.spine[0].href)]
                for x in toc:
                    sections[0].nodes.append(x)
            else:
                # multi-section periodical
                self.oeb.manifest.remove(one)
                sections = list(toc)
                for x in sections:
                    x.klass = 'section'
                    articles_ = list(x)
                    if articles_:
                        self.oeb.manifest.remove(self.oeb.manifest.hrefs[x.href])
                        x.href = articles_[0].href

            # Detach the articles from their sections, remembering them per
            # section (keyed by object identity).
            for sec in sections:
                articles[id(sec)] = []
                for a in list(sec):
                    a.klass = 'article'
                    articles[id(sec)].append(a)
                    sec.nodes.remove(a)

            root = TOC(klass='periodical', href=self.oeb.spine[0].href,
                       title=unicode_type(self.oeb.metadata.title[0]))

            # Re-attach the articles; sections without articles are dropped.
            for s in sections:
                if articles[id(s)]:
                    for a in articles[id(s)]:
                        s.nodes.append(a)
                    root.nodes.append(s)

            for x in list(toc.nodes):
                toc.nodes.remove(x)

            toc.nodes.append(root)

            # Fix up the periodical href to point to first section href
            toc.nodes[0].href = toc.nodes[0].nodes[0].href

    def convert(self, oeb, output_path, input_plugin, opts, log):
        """Write ``oeb`` to ``output_path`` in the configured MOBI flavour.

        ``opts.mobi_file_type`` selects ``'old'`` (MOBI 6), ``'new'`` (KF8
        only) or ``'both'`` (joint file).  Periodicals are always written
        as ``'old'``.
        """
        from calibre.ebooks.mobi.writer2.resources import Resources
        self.log, self.opts, self.oeb = log, opts, oeb

        mobi_type = opts.mobi_file_type
        if self.is_periodical:
            mobi_type = 'old'  # Amazon does not support KF8 periodicals
        create_kf8 = mobi_type in ('new', 'both')

        remove_html_cover(self.oeb, self.log)
        resources = Resources(oeb, opts, self.is_periodical,
                              add_fonts=create_kf8)
        self.check_for_periodical()

        if create_kf8:
            from calibre.ebooks.mobi.writer8.cleanup import remove_duplicate_anchors
            remove_duplicate_anchors(self.oeb)
            # Split on pagebreaks so that the resulting KF8 is faster to load
            from calibre.ebooks.oeb.transforms.split import Split
            Split()(self.oeb, self.opts)

        kf8 = self.create_kf8(resources, for_joint=mobi_type=='both'
                              ) if create_kf8 else None
        if mobi_type == 'new':
            kf8.write(output_path)
            extract_mobi(output_path, opts)
            return

        self.log('Creating MOBI 6 output')
        self.write_mobi(input_plugin, output_path, kf8, resources)

    def create_kf8(self, resources, for_joint=False):
        """Return the KF8 book built from the current book and options."""
        from calibre.ebooks.mobi.writer8.main import create_kf8_book
        return create_kf8_book(self.oeb, self.opts, resources,
                               for_joint=for_joint)

    def write_mobi(self, input_plugin, output_path, kf8, resources):
        """Apply the MOBI 6 transforms and write the file with ``MobiWriter``.

        :param kf8: KF8 book to embed for joint files, or ``None``.
        """
        from calibre.ebooks.mobi.mobiml import MobiMLizer
        from calibre.ebooks.oeb.transforms.manglecase import CaseMangler
        from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer, Unavailable
        from calibre.ebooks.oeb.transforms.htmltoc import HTMLTOCAdder
        from calibre.customize.ui import plugin_for_input_format

        opts, oeb = self.opts, self.oeb
        if not opts.no_inline_toc:
            tocadder = HTMLTOCAdder(title=opts.toc_title, position='start' if
                                    opts.mobi_toc_at_start else 'end')
            tocadder(oeb, opts)
        mangler = CaseMangler()
        mangler(oeb, opts)
        try:
            rasterizer = SVGRasterizer()
            rasterizer(oeb, opts)
        except Unavailable:
            self.log.warn('SVG rasterizer unavailable, SVG will not be converted')
        else:
            # Add rasterized SVG images
            resources.add_extra_images()
        if hasattr(self.oeb, 'inserted_metadata_jacket'):
            self.workaround_fire_bugs(self.oeb.inserted_metadata_jacket)
        mobimlizer = MobiMLizer(ignore_tables=opts.linearize_tables)
        mobimlizer(oeb, opts)
        # Page breaks after items are written for every input except CBZ.
        write_page_breaks_after_item = input_plugin is not plugin_for_input_format('cbz')
        from calibre.ebooks.mobi.writer2.main import MobiWriter
        writer = MobiWriter(opts, resources, kf8,
                            write_page_breaks_after_item=write_page_breaks_after_item)
        writer(oeb, output_path)
        extract_mobi(output_path, opts)

    def specialize_css_for_output(self, log, opts, item, stylizer):
        """Run ``CSSCleanup`` on ``item`` using ``stylizer``."""
        from calibre.ebooks.mobi.writer8.cleanup import CSSCleanup
        CSSCleanup(log, opts)(item, stylizer)

    def workaround_fire_bugs(self, jacket):
        """Replace the jacket's table markup by ``div``/``span`` elements."""
        # The idiotic Fire crashes when trying to render the table used to
        # layout the jacket
        from calibre.ebooks.oeb.base import XHTML
        for table in jacket.data.xpath('//*[local-name()="table"]'):
            table.tag = XHTML('div')
            for tr in table.xpath('descendant::*[local-name()="tr"]'):
                cols = tr.xpath('descendant::*[local-name()="td"]')
                tr.tag = XHTML('div')
                # Inside this loop cols is never empty, so every cell
                # becomes a span.
                for td in cols:
                    td.tag = XHTML('span')
class AZW3Output(OutputFormatPlugin):
    """Conversion output plugin that writes a KF8-only ``.azw3`` file."""

    name        = 'AZW3 Output'
    author      = 'Kovid Goyal'
    file_type   = 'azw3'
    commit_name = 'azw3_output'

    options = {
        OptionRecommendation(
            name='prefer_author_sort',
            recommended_value=False,
            level=OptionRecommendation.LOW,
            help=_('When present, use author sort field as author.'),
        ),
        OptionRecommendation(
            name='no_inline_toc',
            recommended_value=False,
            level=OptionRecommendation.LOW,
            help=_('Don\'t add Table of Contents to the book. Useful if '
                   'the book has its own table of contents.'),
        ),
        OptionRecommendation(
            name='toc_title',
            recommended_value=None,
            help=_('Title for any generated in-line table of contents.'),
        ),
        OptionRecommendation(
            name='dont_compress',
            recommended_value=False,
            level=OptionRecommendation.LOW,
            help=_('Disable compression of the file contents.'),
        ),
        OptionRecommendation(
            name='mobi_toc_at_start',
            recommended_value=False,
            help=_('When adding the Table of Contents to the book, add it at the start of the '
                   'book instead of the end. Not recommended.'),
        ),
        OptionRecommendation(
            name='extract_to',
            help=_('Extract the contents of the generated %s file to the '
                   'specified directory. The contents of the directory are first '
                   'deleted, so be careful.') % 'AZW3',
        ),
        OptionRecommendation(
            name='share_not_sync',
            recommended_value=False,
            help=_('Enable sharing of book content via Facebook etc. '
                   ' on the Kindle. WARNING: Using this feature means that '
                   ' the book will not auto sync its last read position '
                   ' on multiple devices. Complain to Amazon.'),
        ),
    }

    def convert(self, oeb, output_path, input_plugin, opts, log):
        """Build a KF8 book from ``oeb`` and write it to ``output_path``.

        Cover removal and page-break splitting are skipped when
        ``opts.mobi_passthrough`` is set to a true value.
        """
        from calibre.ebooks.mobi.writer2.resources import Resources
        from calibre.ebooks.mobi.writer8.main import create_kf8_book
        from calibre.ebooks.mobi.writer8.cleanup import remove_duplicate_anchors

        self.oeb, self.opts, self.log = oeb, opts, log
        opts.mobi_periodical = self.is_periodical
        skip_preprocessing = getattr(opts, 'mobi_passthrough', False)
        remove_duplicate_anchors(oeb)

        book_resources = Resources(
            self.oeb, self.opts, self.is_periodical,
            add_fonts=True, process_images=False)

        if not skip_preprocessing:
            remove_html_cover(self.oeb, self.log)
            # Split on pagebreaks so that the resulting KF8 is faster to load
            from calibre.ebooks.oeb.transforms.split import Split
            splitter = Split()
            splitter(self.oeb, self.opts)

        kf8_book = create_kf8_book(self.oeb, self.opts, book_resources,
                                   for_joint=False)
        kf8_book.write(output_path)
        extract_mobi(output_path, opts)

    def specialize_css_for_output(self, log, opts, item, stylizer):
        """Run ``CSSCleanup`` on ``item`` using ``stylizer``."""
        from calibre.ebooks.mobi.writer8.cleanup import CSSCleanup
        cleanup = CSSCleanup(log, opts)
        cleanup(item, stylizer)

View File

@@ -0,0 +1,25 @@
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
'''
Convert an ODT file into an Open Ebook
'''
from calibre.customize.conversion import InputFormatPlugin
class ODTInput(InputFormatPlugin):
    """Conversion input plugin for ``.odt`` files."""

    name        = 'ODT Input'
    author      = 'Kovid Goyal'
    description = 'Convert ODT (OpenOffice) files to HTML'
    file_types  = {'odt'}
    commit_name = 'odt_input'

    def convert(self, stream, options, file_ext, log,
                accelerators):
        """Extract ``stream`` into the current directory and return the result.

        All the work is delegated to ``calibre.ebooks.odt.input.Extract``.
        """
        from calibre.ebooks.odt.input import Extract
        extractor = Extract()
        return extractor(stream, '.', log)

View File

@@ -0,0 +1,122 @@
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL 3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import os, re
from calibre.customize.conversion import (OutputFormatPlugin,
OptionRecommendation)
from calibre import CurrentDir
class OEBOutput(OutputFormatPlugin):
    """Conversion output plugin that writes a book as a directory of OEB files."""

    name        = 'OEB Output'
    author      = 'Kovid Goyal'
    file_type   = 'oeb'
    commit_name = 'oeb_output'

    recommendations = {('pretty_print', True, OptionRecommendation.HIGH)}

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        """Write ``oeb_book`` into the directory ``output_path``.

        The directory is created when missing.  The OPF, NCX and page map
        produced by ``to_opf2`` are serialized first, then every manifest
        item is written to the path given by its (unquoted) href.
        """
        from polyglot.urllib import unquote
        from lxml import etree

        self.log, self.opts = log, opts
        if not os.path.exists(output_path):
            os.makedirs(output_path)
        from calibre.ebooks.oeb.base import OPF_MIME, NCX_MIME, PAGE_MAP_MIME, OEB_STYLES
        from calibre.ebooks.oeb.normalize_css import condense_sheet
        with CurrentDir(output_path):
            results = oeb_book.to_opf2(page_map=True)
            for key in (OPF_MIME, NCX_MIME, PAGE_MAP_MIME):
                href, root = results.pop(key, [None, None])
                if root is not None:
                    if key == OPF_MIME:
                        # The device workarounds are best effort: a failure
                        # is logged and the conversion carries on.  Only
                        # ordinary exceptions are swallowed, so that
                        # KeyboardInterrupt/SystemExit still propagate.
                        try:
                            self.workaround_nook_cover_bug(root)
                        except Exception:
                            self.log.exception('Something went wrong while trying to'
                                    ' workaround Nook cover bug, ignoring')
                        try:
                            self.workaround_pocketbook_cover_bug(root)
                        except Exception:
                            self.log.exception('Something went wrong while trying to'
                                    ' workaround Pocketbook cover bug, ignoring')
                        self.migrate_lang_code(root)
                    raw = etree.tostring(root, pretty_print=True,
                                         encoding='utf-8', xml_declaration=True)
                    if key == OPF_MIME:
                        # Needed as I can't get lxml to output opf:role and
                        # not output <opf:metadata> as well
                        raw = re.sub(br'(<[/]{0,1})opf:', br'\1', raw)
                    with lopen(href, 'wb') as f:
                        f.write(raw)

            for item in oeb_book.manifest:
                if (
                        not self.opts.expand_css and item.media_type in OEB_STYLES and hasattr(
                            item.data, 'cssText') and 'nook' not in self.opts.output_profile.short_name):
                    condense_sheet(item.data)
                path = os.path.abspath(unquote(item.href))
                # Named parent_dir rather than dir so the builtin is not shadowed.
                parent_dir = os.path.dirname(path)
                if not os.path.exists(parent_dir):
                    os.makedirs(parent_dir)
                with lopen(path, 'wb') as f:
                    f.write(item.bytes_representation)
                item.unload_data_from_memory(memory=path)

    def workaround_nook_cover_bug(self, root):  # {{{
        """Give the cover image the manifest id ``cover``.

        Applies only when there is exactly one ``<meta name="cover">`` whose
        content is not ``cover`` and it points at exactly one manifest item
        with an ``image/`` media type.  An item already using the id
        ``cover`` is renamed to a fresh uuid id, together with every
        ``idref`` referring to it.
        """
        cov = root.xpath('//*[local-name() = "meta" and @name="cover" and'
                ' @content != "cover"]')

        def manifest_items_with_id(id_):
            return root.xpath('//*[local-name() = "manifest"]/*[local-name() = "item" '
                ' and @id="%s"]'%id_)

        if len(cov) == 1:
            cov = cov[0]
            covid = cov.get('content', '')

            if covid:
                manifest_item = manifest_items_with_id(covid)
                if len(manifest_item) == 1 and \
                        manifest_item[0].get('media-type',
                                '').startswith('image/'):
                    self.log.warn('The cover image has an id != "cover". Renaming'
                            ' to work around bug in Nook Color')

                    from calibre.ebooks.oeb.base import uuid_id
                    newid = uuid_id()

                    # Free the id "cover" first, then hand it to the image.
                    for item in manifest_items_with_id('cover'):
                        item.set('id', newid)

                    for x in root.xpath('//*[@idref="cover"]'):
                        x.set('idref', newid)

                    manifest_item = manifest_item[0]
                    manifest_item.set('id', 'cover')
                    cov.set('content', 'cover')
    # }}}

    def workaround_pocketbook_cover_bug(self, root):  # {{{
        """Move the manifest item with id ``cover`` to the front of the manifest."""
        m = root.xpath('//*[local-name() = "manifest"]/*[local-name() = "item" '
                ' and @id="cover"]')
        if len(m) == 1:
            m = m[0]
            p = m.getparent()
            p.remove(m)
            p.insert(0, m)
    # }}}

    def migrate_lang_code(self, root):  # {{{
        """Rewrite ``language`` elements using ``lang_as_iso639_1`` where it yields a value."""
        from calibre.utils.localization import lang_as_iso639_1
        for lang in root.xpath('//*[local-name() = "language"]'):
            clc = lang_as_iso639_1(lang.text)
            if clc:
                lang.text = clc
    # }}}

View File

@@ -0,0 +1,37 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
from calibre.customize.conversion import InputFormatPlugin
from polyglot.builtins import getcwd
class PDBInput(InputFormatPlugin):
    """Conversion input plugin for PDB container files."""

    name        = 'PDB Input'
    author      = 'John Schember'
    description = 'Convert PDB to HTML'
    file_types  = {'pdb', 'updb'}
    commit_name = 'pdb_input'

    def convert(self, stream, options, file_ext, log,
                accelerators):
        """Extract the book inside the PDB container into the cwd.

        The reader class is selected from the identity found in the header.

        :raises PDBError: when no reader exists for that identity.
        :return: the value returned by the reader's ``extract_content``.
        """
        from calibre.ebooks.pdb.header import PdbHeaderReader
        from calibre.ebooks.pdb import PDBError, IDENTITY_TO_NAME, get_reader

        header = PdbHeaderReader(stream)
        reader_class = get_reader(header.ident)
        if reader_class is None:
            book_type = IDENTITY_TO_NAME.get(header.ident, _('Unknown'))
            raise PDBError('No reader available for format within container.\n Identity is %s. Book type is %s' %
                           (header.ident, book_type))

        format_name = IDENTITY_TO_NAME[header.ident]
        log.debug('Detected ebook format as: %s with identity: %s' % (format_name, header.ident))

        pdb_reader = reader_class(header, stream, log, options)
        return pdb_reader.extract_content(getcwd())

View File

@@ -0,0 +1,64 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import os
from calibre.customize.conversion import OutputFormatPlugin, \
OptionRecommendation
from calibre.ebooks.pdb import PDBError, get_writer, ALL_FORMAT_WRITERS
class PDBOutput(OutputFormatPlugin):
    """Conversion output plugin that writes a book into a PDB container."""

    name        = 'PDB Output'
    author      = 'John Schember'
    file_type   = 'pdb'
    commit_name = 'pdb_output'
    ui_data = {'formats': tuple(ALL_FORMAT_WRITERS)}

    options = {
        OptionRecommendation(name='format', recommended_value='doc',
            level=OptionRecommendation.LOW,
            short_switch='f', choices=list(ALL_FORMAT_WRITERS),
            help=(_('Format to use inside the pdb container. Choices are:') + ' %s' % sorted(ALL_FORMAT_WRITERS))),
        OptionRecommendation(name='pdb_output_encoding', recommended_value='cp1252',
            level=OptionRecommendation.LOW,
            help=_('Specify the character encoding of the output document. '
                'The default is cp1252. Note: This option is not honored by all '
                'formats.')),
        OptionRecommendation(name='inline_toc',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Add Table of Contents to beginning of the book.')),
    }

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        """Write ``oeb_book`` with the writer selected by ``opts.format``.

        :param output_path: a file path, or an object with a ``write``
            method that is used as the output stream directly.  A stream
            passed in by the caller is left open; a file opened here is
            always closed, also when writing fails.
        :raises PDBError: when no writer exists for ``opts.format``.
        """
        close = False
        if not hasattr(output_path, 'write'):
            close = True
            parent_dir = os.path.dirname(output_path)
            if parent_dir and not os.path.exists(parent_dir):
                os.makedirs(parent_dir)
            out_stream = lopen(output_path, 'wb')
        else:
            out_stream = output_path

        try:
            Writer = get_writer(opts.format)

            if Writer is None:
                # Report the requested format, not the ``format`` builtin.
                raise PDBError('No writer available for format %s.' % opts.format)

            setattr(opts, 'max_line_length', 0)
            setattr(opts, 'force_max_line_length', False)

            writer = Writer(opts, log)

            # Start from an empty stream.
            out_stream.seek(0)
            out_stream.truncate()

            writer.write_content(oeb_book, out_stream, oeb_book.metadata)
        finally:
            if close:
                out_stream.close()

View File

@@ -0,0 +1,82 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import os
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
from polyglot.builtins import as_bytes, getcwd
class PDFInput(InputFormatPlugin):
    """Conversion input plugin for ``.pdf`` files."""

    name        = 'PDF Input'
    author      = 'Kovid Goyal and John Schember'
    description = 'Convert PDF files to HTML'
    file_types  = {'pdf'}
    commit_name = 'pdf_input'

    options = {
        OptionRecommendation(
            name='no_images',
            recommended_value=False,
            help=_('Do not extract images from the document'),
        ),
        OptionRecommendation(
            name='unwrap_factor',
            recommended_value=0.45,
            help=_('Scale used to determine the length at which a line should '
                   'be unwrapped. Valid values are a decimal between 0 and 1. The '
                   'default is 0.45, just below the median line length.'),
        ),
        OptionRecommendation(
            name='new_pdf_engine',
            recommended_value=False,
            help=_('Use the new PDF conversion engine. Currently not operational.'),
        )
    }

    def convert_new(self, stream, accelerators):
        """Convert via the XML output of pdftohtml and ``PDFDocument``.

        Reads ``self.opts`` and ``self.log``, which ``convert`` sets before
        calling this method.

        :return: path of ``metadata.opf`` inside the current directory.
        """
        from calibre.ebooks.pdf.pdftohtml import pdftohtml
        from calibre.utils.cleantext import clean_ascii_chars
        from calibre.ebooks.pdf.reflow import PDFDocument

        pdftohtml(getcwd(), stream.name, self.opts.no_images, as_xml=True)
        with lopen('index.xml', 'rb') as xml_file:
            cleaned_xml = clean_ascii_chars(xml_file.read())
        PDFDocument(cleaned_xml, self.opts, self.log)
        return os.path.join(getcwd(), 'metadata.opf')

    def convert(self, stream, options, file_ext, log,
                accelerators):
        """Convert the PDF to HTML in the cwd and write ``metadata.opf``.

        Every file pdftohtml leaves in the current directory is listed in
        the manifest; only ``index.html`` goes into the spine.  When a
        ``toc.ncx`` exists and has a manifest id, that id is added to the
        ``<spine`` tag of the rendered OPF as its ``toc`` attribute.

        :return: path of ``metadata.opf`` inside the current directory.
        """
        from calibre.ebooks.metadata.opf2 import OPFCreator
        from calibre.ebooks.pdf.pdftohtml import pdftohtml

        log.debug('Converting file to html...')
        # The main html file will be named index.html
        self.opts, self.log = options, log
        if options.new_pdf_engine:
            return self.convert_new(stream, accelerators)
        pdftohtml(getcwd(), stream.name, options.no_images)

        from calibre.ebooks.metadata.meta import get_metadata
        log.debug('Retrieving document metadata...')
        metadata = get_metadata(stream, 'pdf')
        opf = OPFCreator(getcwd(), metadata)

        # index.html comes first, followed by everything else in the cwd.
        other_files = os.listdir(getcwd())
        other_files.remove('index.html')
        manifest = [('index.html', None)]
        manifest.extend((name, None) for name in other_files)

        log.debug('Generating manifest...')
        opf.create_manifest(manifest)
        opf.create_spine(['index.html'])

        log.debug('Rendering manifest...')
        with lopen('metadata.opf', 'wb') as opffile:
            opf.render(opffile)

        if os.path.exists('toc.ncx'):
            ncxid = opf.manifest.id_for_path('toc.ncx')
            if ncxid:
                with lopen('metadata.opf', 'r+b') as f:
                    patched = f.read().replace(b'<spine', b'<spine toc="%s"' % as_bytes(ncxid))
                    f.seek(0)
                    f.write(patched)

        return os.path.join(getcwd(), 'metadata.opf')

View File

@@ -0,0 +1,256 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL 3'
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
'''
Convert OEB ebook format to PDF.
'''
import glob, os
from calibre.customize.conversion import (OutputFormatPlugin,
OptionRecommendation)
from calibre.ptempfile import TemporaryDirectory
from polyglot.builtins import iteritems, unicode_type
# Units of measure offered as choices for the ``unit`` option.
UNITS = (
    'millimeter',
    'centimeter',
    'point',
    'inch',
    'pica',
    'didot',
    'cicero',
    'devicepixel',
)

# Named paper sizes offered as choices for the ``paper_size`` option.
PAPER_SIZES = (
    'a0', 'a1', 'a2', 'a3', 'a4', 'a5', 'a6',
    'b0', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6',
    'legal', 'letter',
)
class PDFOutput(OutputFormatPlugin):
    """Output plugin that renders an OEB book to a PDF file.

    Inputs flagged as image collections are passed to
    ``calibre.ebooks.pdf.image_writer``; all other books are first written
    out as OEB into a temporary directory and then rendered by
    ``calibre.ebooks.pdf.html_writer``.
    """

    name = 'PDF Output'
    author = 'Kovid Goyal'
    file_type = 'pdf'
    commit_name = 'pdf_output'
    ui_data = {'paper_sizes': PAPER_SIZES, 'units': UNITS, 'font_types': ('serif', 'sans', 'mono')}

    options = {
        OptionRecommendation(name='use_profile_size', recommended_value=False,
            help=_('Instead of using the paper size specified in the PDF Output options,'
                   ' use a paper size corresponding to the current output profile.'
                   ' Useful if you want to generate a PDF for viewing on a specific device.')),
        OptionRecommendation(name='unit', recommended_value='inch',
            level=OptionRecommendation.LOW, short_switch='u', choices=UNITS,
            help=_('The unit of measure for page sizes. Default is inch. Choices '
                   'are {} '
                   'Note: This does not override the unit for margins!').format(', '.join(UNITS))),
        OptionRecommendation(name='paper_size', recommended_value='letter',
            level=OptionRecommendation.LOW, choices=PAPER_SIZES,
            help=_('The size of the paper. This size will be overridden when a '
                   'non default output profile is used. Default is letter. Choices '
                   'are {}').format(', '.join(PAPER_SIZES))),
        OptionRecommendation(name='custom_size', recommended_value=None,
            help=_('Custom size of the document. Use the form widthxheight '
                   'e.g. `123x321` to specify the width and height. '
                   'This overrides any specified paper-size.')),
        OptionRecommendation(name='preserve_cover_aspect_ratio',
            recommended_value=False,
            help=_('Preserve the aspect ratio of the cover, instead'
                   ' of stretching it to fill the full first page of the'
                   ' generated pdf.')),
        OptionRecommendation(name='pdf_serif_family',
            recommended_value='Times', help=_(
                'The font family used to render serif fonts. Will work only if the font is available system-wide.')),
        OptionRecommendation(name='pdf_sans_family',
            recommended_value='Helvetica', help=_(
                'The font family used to render sans-serif fonts. Will work only if the font is available system-wide.')),
        OptionRecommendation(name='pdf_mono_family',
            recommended_value='Courier', help=_(
                'The font family used to render monospace fonts. Will work only if the font is available system-wide.')),
        # The help text used to be a copy of the monospace option's text,
        # which misdescribed this option: it picks which of the font types
        # listed in ui_data['font_types'] is the standard one.
        OptionRecommendation(name='pdf_standard_font', choices=ui_data['font_types'],
            recommended_value='serif', help=_(
                'The type of font (serif, sans or mono) used as the standard font')),
        OptionRecommendation(name='pdf_default_font_size',
            recommended_value=20, help=_(
                'The default font size')),
        OptionRecommendation(name='pdf_mono_font_size',
            recommended_value=16, help=_(
                'The default font size for monospaced text')),
        OptionRecommendation(name='pdf_hyphenate', recommended_value=False,
            help=_('Break long words at the end of lines. This can give the text at the right margin a more even appearance.')),
        OptionRecommendation(name='pdf_mark_links', recommended_value=False,
            help=_('Surround all links with a red box, useful for debugging.')),
        OptionRecommendation(name='pdf_page_numbers', recommended_value=False,
            help=_('Add page numbers to the bottom of every page in the generated PDF file. If you '
                   'specify a footer template, it will take precedence '
                   'over this option.')),
        OptionRecommendation(name='pdf_footer_template', recommended_value=None,
            help=_('An HTML template used to generate %s on every page.'
                   ' The strings _PAGENUM_, _TITLE_, _AUTHOR_ and _SECTION_ will be replaced by their current values.')%_('footers')),
        OptionRecommendation(name='pdf_header_template', recommended_value=None,
            help=_('An HTML template used to generate %s on every page.'
                   ' The strings _PAGENUM_, _TITLE_, _AUTHOR_ and _SECTION_ will be replaced by their current values.')%_('headers')),
        OptionRecommendation(name='pdf_add_toc', recommended_value=False,
            help=_('Add a Table of Contents at the end of the PDF that lists page numbers. '
                   'Useful if you want to print out the PDF. If this PDF is intended for electronic use, use the PDF Outline instead.')),
        OptionRecommendation(name='toc_title', recommended_value=None,
            help=_('Title for generated table of contents.')
        ),
        OptionRecommendation(name='pdf_page_margin_left', recommended_value=72.0,
            level=OptionRecommendation.LOW,
            help=_('The size of the left page margin, in pts. Default is 72pt.'
                   ' Overrides the common left page margin setting.')
        ),
        OptionRecommendation(name='pdf_page_margin_top', recommended_value=72.0,
            level=OptionRecommendation.LOW,
            help=_('The size of the top page margin, in pts. Default is 72pt.'
                   ' Overrides the common top page margin setting, unless set to zero.')
        ),
        OptionRecommendation(name='pdf_page_margin_right', recommended_value=72.0,
            level=OptionRecommendation.LOW,
            help=_('The size of the right page margin, in pts. Default is 72pt.'
                   ' Overrides the common right page margin setting, unless set to zero.')
        ),
        OptionRecommendation(name='pdf_page_margin_bottom', recommended_value=72.0,
            level=OptionRecommendation.LOW,
            help=_('The size of the bottom page margin, in pts. Default is 72pt.'
                   ' Overrides the common bottom page margin setting, unless set to zero.')
        ),
        OptionRecommendation(name='pdf_use_document_margins', recommended_value=False,
            help=_('Use the page margins specified in the input document via @page CSS rules.'
                   ' This will cause the margins specified in the conversion settings to be ignored.'
                   ' If the document does not specify page margins, the conversion settings will be used as a fallback.')
        ),
        OptionRecommendation(name='pdf_page_number_map', recommended_value=None,
            help=_('Adjust page numbers, as needed. Syntax is a JavaScript expression for the page number.'
                   ' For example, "if (n < 3) 0; else n - 3;", where n is current page number.')
        ),
        OptionRecommendation(name='uncompressed_pdf',
            recommended_value=False, help=_(
                'Generate an uncompressed PDF, useful for debugging.')
        ),
        OptionRecommendation(name='pdf_odd_even_offset', recommended_value=0.0,
            level=OptionRecommendation.LOW,
            help=_(
                'Shift the text horizontally by the specified offset (in pts).'
                ' On odd numbered pages, it is shifted to the right and on even'
                ' numbered pages to the left. Use negative numbers for the opposite'
                ' effect. Note that this setting is ignored on pages where the margins'
                ' are smaller than the specified offset. Shifting is done by setting'
                ' the PDF CropBox, not all software respects the CropBox.'
            )
        ),
    }

    def specialize_options(self, log, opts, input_fmt):
        """Register the fake URL scheme with Qt and adjust margin options."""
        # Ensure Qt is setup to be used with WebEngine
        # specialize_options is called early enough in the pipeline
        # that hopefully no Qt application has been constructed as yet
        from PyQt5.QtWebEngineCore import QWebEngineUrlScheme
        from PyQt5.QtWebEngineWidgets import QWebEnginePage  # noqa
        from calibre.gui2 import must_use_qt
        from calibre.constants import FAKE_PROTOCOL
        scheme = QWebEngineUrlScheme(FAKE_PROTOCOL.encode('ascii'))
        scheme.setSyntax(QWebEngineUrlScheme.Syntax.Host)
        scheme.setFlags(QWebEngineUrlScheme.SecureScheme)
        QWebEngineUrlScheme.registerScheme(scheme)
        must_use_qt()
        self.input_fmt = input_fmt

        if opts.pdf_use_document_margins:
            # Prevent the conversion pipeline from overwriting document margins
            opts.margin_left = opts.margin_right = opts.margin_top = opts.margin_bottom = -1

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        """Render ``oeb_book`` to ``output_path`` as a PDF.

        The book metadata is round-tripped through an OPF 2.0 ``package``
        element to obtain a book-metadata object for the writers.
        """
        # Two different objects are both called OPF upstream; alias them so
        # the second import no longer silently shadows the first.
        from calibre.ebooks.oeb.base import OPF as opf_tag, OPF2_NS
        from calibre.ebooks.metadata.opf2 import OPF as OPFMetadata
        from lxml import etree
        from io import BytesIO

        self.stored_page_margins = getattr(opts, '_stored_page_margins', {})
        self.oeb = oeb_book
        self.input_plugin, self.opts, self.log = input_plugin, opts, log
        self.output_path = output_path

        package = etree.Element(opf_tag('package'),
            attrib={'version': '2.0', 'unique-identifier': 'dummy'},
            nsmap={None: OPF2_NS})
        self.oeb.metadata.to_opf2(package)
        self.metadata = OPFMetadata(BytesIO(etree.tostring(package))).to_book_metadata()
        self.cover_data = None

        if input_plugin.is_image_collection:
            log.debug('Converting input as an image collection...')
            self.convert_images(input_plugin.get_images())
        else:
            log.debug('Converting input as a text based book...')
            self.convert_text(oeb_book)

    def convert_images(self, images):
        """Write ``images`` to the output path using the image writer."""
        from calibre.ebooks.pdf.image_writer import convert
        convert(images, self.output_path, self.opts, self.metadata, self.report_progress)

    def get_cover_data(self):
        """Store the cover item's data in ``self.cover_data`` if the book has one."""
        oeb = self.oeb
        if (oeb.metadata.cover and unicode_type(oeb.metadata.cover[0]) in oeb.manifest.ids):
            cover_id = unicode_type(oeb.metadata.cover[0])
            item = oeb.manifest.ids[cover_id]
            self.cover_data = item.data

    def process_fonts(self):
        ''' Make sure all fonts are embeddable '''
        from calibre.ebooks.oeb.base import urlnormalize
        from calibre.utils.fonts.utils import remove_embed_restriction

        processed = set()
        for item in list(self.oeb.manifest):
            if not hasattr(item.data, 'cssRules'):
                continue
            for rule in item.data.cssRules:
                if rule.type != rule.FONT_FACE_RULE:
                    continue
                try:
                    s = rule.style
                    src = s.getProperty('src').propertyValue[0].uri
                except Exception:
                    # @font-face rule without a usable src URI: skip it.
                    # (Was a bare except, which also swallowed
                    # KeyboardInterrupt/SystemExit.)
                    continue
                path = item.abshref(src)
                ff = self.oeb.manifest.hrefs.get(urlnormalize(path), None)
                if ff is None:
                    continue

                raw = nraw = ff.data
                if path not in processed:
                    processed.add(path)
                    try:
                        nraw = remove_embed_restriction(raw)
                    except Exception:
                        # Best effort: leave fonts that cannot be patched.
                        continue
                    if nraw != raw:
                        ff.data = nraw
                        self.oeb.container.write(path, nraw)

    def convert_text(self, oeb_book):
        """Render a text based book through the HTML writer."""
        import json
        from calibre.ebooks.pdf.html_writer import convert
        self.get_cover_data()
        self.process_fonts()

        if self.opts.pdf_use_document_margins and self.stored_page_margins:
            # Attach the stored per-document margins to each document root
            # as a JSON attribute.
            for href, margins in iteritems(self.stored_page_margins):
                item = oeb_book.manifest.hrefs.get(href)
                if item is not None:
                    root = item.data
                    if hasattr(root, 'xpath') and margins:
                        root.set('data-calibre-pdf-output-page-margins', json.dumps(margins))

        with TemporaryDirectory('_pdf_out') as oeb_dir:
            from calibre.customize.ui import plugin_for_output_format
            oeb_dir = os.path.realpath(oeb_dir)
            oeb_output = plugin_for_output_format('oeb')
            oeb_output.convert(oeb_book, oeb_dir, self.input_plugin, self.opts, self.log)
            opfpath = glob.glob(os.path.join(oeb_dir, '*.opf'))[0]
            convert(
                opfpath, self.opts, metadata=self.metadata, output_path=self.output_path,
                log=self.log, cover_data=self.cover_data, report_progress=self.report_progress
            )

View File

@@ -0,0 +1,165 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import glob
import os
import shutil
from calibre.customize.conversion import InputFormatPlugin
from calibre.ptempfile import TemporaryDirectory
from polyglot.builtins import getcwd
class PMLInput(InputFormatPlugin):
    """Input plugin converting PML, or a PMLZ archive of PML files, to OEB."""

    name = 'PML Input'
    author = 'John Schember'
    description = 'Convert PML to OEB'
    # pmlz is a zip file containing pml files and png images.
    file_types = {'pml', 'pmlz'}
    commit_name = 'pml_input'

    def process_pml(self, pml_path, html_path, close_all=False):
        """Convert one PML document to HTML and return its table of contents.

        :param pml_path: path of the PML file, or a readable binary stream.
        :param html_path: path of the HTML file to write, or a writable
            binary stream.
        :param close_all: accepted for interface compatibility; not used.
        :return: whatever ``PML_HTMLizer.get_toc()`` yields after parsing.

        Streams opened here are always closed, including when the
        conversion raises; streams supplied by the caller are left open.
        """
        from calibre.ebooks.pml.pmlconverter import PML_HTMLizer

        pclose = False
        hclose = False
        pml_stream = html_stream = None
        try:
            if not hasattr(pml_path, 'read'):
                pml_stream = lopen(pml_path, 'rb')
                pclose = True
            else:
                pml_stream = pml_path
                pml_stream.seek(0)

            if not hasattr(html_path, 'write'):
                html_stream = lopen(html_path, 'wb')
                hclose = True
            else:
                html_stream = html_path

            # Encoding precedence: explicit user option, then whatever the
            # stream advertises, then cp1252.
            ienc = getattr(pml_stream, 'encoding', None)
            if ienc is None:
                ienc = 'cp1252'
            if self.options.input_encoding:
                ienc = self.options.input_encoding

            self.log.debug('Converting PML to HTML...')
            hizer = PML_HTMLizer()
            html = hizer.parse_pml(pml_stream.read().decode(ienc), html_path)
            html = '<html><head><title></title></head><body>%s</body></html>'%html
            html_stream.write(html.encode('utf-8', 'replace'))
        finally:
            if pclose:
                pml_stream.close()
            if hclose:
                html_stream.close()

        return hizer.get_toc()

    def get_images(self, stream, tdir, top_level=False):
        """Copy the book's PNG images into ``<cwd>/images``.

        Candidate locations are tried in order and the first that yields
        any ``*.png`` wins: ``tdir`` itself (only when ``top_level``), then
        ``<tdir>/<stream basename>_img``, then ``<tdir>/images``.

        :return: list of copied images as ``images/<name>`` hrefs.
        """
        images = []
        imgs = []

        if top_level:
            imgs = glob.glob(os.path.join(tdir, '*.png'))
        # Images not in top level try bookname_img directory because
        # that's where Dropbook likes to see them.
        if not imgs:
            if hasattr(stream, 'name'):
                imgs = glob.glob(os.path.join(tdir, os.path.splitext(os.path.basename(stream.name))[0] + '_img', '*.png'))
        # No images in Dropbook location try generic images directory
        if not imgs:
            imgs = glob.glob(os.path.join(os.path.join(tdir, 'images'), '*.png'))
        if imgs:
            images_dir = os.path.join(getcwd(), 'images')
            # makedirs raises if the directory already exists, so only
            # create it when it is missing.
            if not os.path.isdir(images_dir):
                os.makedirs(images_dir)
        for img in imgs:
            pimg_name = os.path.basename(img)
            pimg_path = os.path.join(getcwd(), 'images', pimg_name)

            images.append('images/' + pimg_name)

            shutil.copy(img, pimg_path)

        return images

    def convert(self, stream, options, file_ext, log,
                accelerators):
        """Convert a PML/PMLZ ``stream`` and return the generated OPF path.

        HTML pages, images, ``metadata.opf`` and ``toc.ncx`` are all
        written into the current working directory.
        """
        from calibre.ebooks.metadata.toc import TOC
        from calibre.ebooks.metadata.opf2 import OPFCreator
        from calibre.utils.zipfile import ZipFile

        self.options = options
        self.log = log
        pages, images = [], []
        toc = TOC()

        if file_ext == 'pmlz':
            log.debug('De-compressing content to temporary directory...')
            with TemporaryDirectory('_unpmlz') as tdir:
                zf = ZipFile(stream)
                zf.extractall(tdir)

                pmls = glob.glob(os.path.join(tdir, '*.pml'))
                for pml in pmls:
                    html_name = os.path.splitext(os.path.basename(pml))[0]+'.html'
                    html_path = os.path.join(getcwd(), html_name)

                    pages.append(html_name)
                    log.debug('Processing PML item %s...' % pml)
                    ttoc = self.process_pml(pml, html_path)
                    toc += ttoc
                images = self.get_images(stream, tdir, True)
        else:
            toc = self.process_pml(stream, 'index.html')
            pages.append('index.html')

            if hasattr(stream, 'name'):
                images = self.get_images(stream, os.path.abspath(os.path.dirname(stream.name)))

        # We want pages to be ordered alphabetically.
        pages.sort()

        manifest_items = []
        for item in pages+images:
            manifest_items.append((item, None))

        from calibre.ebooks.metadata.meta import get_metadata
        log.debug('Reading metadata from input file...')
        mi = get_metadata(stream, 'pml')
        if 'images/cover.png' in images:
            mi.cover = 'images/cover.png'
        opf = OPFCreator(getcwd(), mi)
        log.debug('Generating manifest...')
        opf.create_manifest(manifest_items)
        opf.create_spine(pages)
        opf.set_toc(toc)
        with lopen('metadata.opf', 'wb') as opffile:
            with lopen('toc.ncx', 'wb') as tocfile:
                opf.render(opffile, tocfile, 'toc.ncx')

        return os.path.join(getcwd(), 'metadata.opf')

    def postprocess_book(self, oeb, opts, log):
        """Simplify heading markup produced by the PML converter.

        For every h1-h6 in the spine: an empty leading ``<span>`` is
        removed (its id moved to the heading), and a sole centred ``<div>``
        child is folded into the heading itself.
        """
        from calibre.ebooks.oeb.base import XHTML, barename
        for item in oeb.spine:
            if hasattr(item.data, 'xpath'):
                for heading in item.data.iterdescendants(*map(XHTML, 'h1 h2 h3 h4 h5 h6'.split())):
                    if not len(heading):
                        continue
                    span = heading[0]
                    if not heading.text and not span.text and not len(span) and barename(span.tag) == 'span':
                        if not heading.get('id') and span.get('id'):
                            heading.set('id', span.get('id'))
                            heading.text = span.tail
                            heading.remove(span)
                    if len(heading) == 1 and heading[0].get('style') == 'text-align: center; margin: auto;':
                        div = heading[0]
                        if barename(div.tag) == 'div' and not len(div) and not div.get('id') and not heading.get('style'):
                            heading.text = (heading.text or '') + (div.text or '') + (div.tail or '')
                            heading.remove(div)
                            heading.set('style', 'text-align: center')

View File

@@ -0,0 +1,77 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import os, io
from calibre.customize.conversion import (OutputFormatPlugin,
OptionRecommendation)
from calibre.ptempfile import TemporaryDirectory
from polyglot.builtins import unicode_type
class PMLOutput(OutputFormatPlugin):
    """Output plugin writing a book as a PMLZ archive (PML text plus PNGs)."""

    name = 'PML Output'
    author = 'John Schember'
    file_type = 'pmlz'
    commit_name = 'pml_output'

    options = {
        OptionRecommendation(name='pml_output_encoding', recommended_value='cp1252',
            level=OptionRecommendation.LOW,
            help=_('Specify the character encoding of the output document. '
                   'The default is cp1252.')),
        OptionRecommendation(name='inline_toc',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Add Table of Contents to beginning of the book.')),
        OptionRecommendation(name='full_image_depth',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Do not reduce the size or bit depth of images. Images '
                   'have their size and depth reduced by default to accommodate '
                   'applications that can not convert images on their '
                   'own such as Dropbook.')),
    }

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        """Write ``oeb_book`` to ``output_path`` as a PMLZ archive.

        ``index.pml`` and an ``index_img`` directory of PNG images are
        staged in a temporary directory which is then zipped.
        """
        from calibre.ebooks.pml.pmlml import PMLMLizer
        from calibre.utils.zipfile import ZipFile

        with TemporaryDirectory('_pmlz_output') as tdir:
            pmlmlizer = PMLMLizer(log)
            pml = unicode_type(pmlmlizer.extract_content(oeb_book, opts))
            with lopen(os.path.join(tdir, 'index.pml'), 'wb') as out:
                out.write(pml.encode(opts.pml_output_encoding, 'replace'))

            img_path = os.path.join(tdir, 'index_img')
            if not os.path.exists(img_path):
                os.makedirs(img_path)
            self.write_images(oeb_book.manifest, pmlmlizer.image_hrefs, img_path, opts)

            log.debug('Compressing output...')
            pmlz = ZipFile(output_path, 'w')
            try:
                pmlz.add_dir(tdir)
            finally:
                # Close explicitly instead of relying on garbage collection
                # to finalise the archive and release the file handle.
                pmlz.close()

    def write_images(self, manifest, image_hrefs, out_dir, opts):
        """Save every referenced raster image of ``manifest`` as PNG.

        :param image_hrefs: mapping of manifest href to output file name.
        :param out_dir: directory the PNG files are written to.
        :param opts: when ``opts.full_image_depth`` is false, images are
            converted to palette mode and shrunk to fit within 300x300.
        """
        from PIL import Image
        from calibre.ebooks.oeb.base import OEB_RASTER_IMAGES

        for item in manifest:
            if item.media_type in OEB_RASTER_IMAGES and item.href in image_hrefs:
                if opts.full_image_depth:
                    im = Image.open(io.BytesIO(item.data))
                else:
                    im = Image.open(io.BytesIO(item.data)).convert('P')
                    im.thumbnail((300,300), Image.ANTIALIAS)

                data = io.BytesIO()
                im.save(data, 'PNG')
                data = data.getvalue()

                path = os.path.join(out_dir, image_hrefs[item.href])

                with lopen(path, 'wb') as out:
                    out.write(data)

View File

@@ -0,0 +1,28 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
from calibre.customize.conversion import InputFormatPlugin
from polyglot.builtins import getcwd
class RBInput(InputFormatPlugin):
    """Input plugin that unpacks an RB file into the working directory."""

    name = 'RB Input'
    author = 'John Schember'
    description = 'Convert RB files to HTML'
    file_types = {'rb'}
    commit_name = 'rb_input'

    def convert(self, stream, options, file_ext, log,
                accelerators):
        """Extract ``stream`` into the current directory and return the reader's result."""
        from calibre.ebooks.rb.reader import Reader

        rb_reader = Reader(stream, log, options.input_encoding)
        return rb_reader.extract_content(getcwd())

View File

@@ -0,0 +1,45 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import os
from calibre.customize.conversion import OutputFormatPlugin, OptionRecommendation
class RBOutput(OutputFormatPlugin):
    """Output plugin writing a book in RB format via ``RBWriter``."""

    name = 'RB Output'
    author = 'John Schember'
    file_type = 'rb'
    commit_name = 'rb_output'

    options = {
        OptionRecommendation(name='inline_toc',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Add Table of Contents to beginning of the book.'))}

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        """Write ``oeb_book`` to ``output_path``.

        ``output_path`` is either a filesystem path (opened and closed
        here, creating missing parent directories) or an already opened
        seekable binary stream, which is left open for the caller.
        """
        from calibre.ebooks.rb.writer import RBWriter

        close = not hasattr(output_path, 'write')
        if close:
            out_dir = os.path.dirname(output_path)
            if out_dir and not os.path.exists(out_dir):
                os.makedirs(out_dir)
            out_stream = lopen(output_path, 'wb')
        else:
            out_stream = output_path

        try:
            writer = RBWriter(opts, log)

            # Start from an empty stream even when a reused one was passed.
            out_stream.seek(0)
            out_stream.truncate()

            writer.write_content(oeb_book, out_stream, oeb_book.metadata)
        finally:
            # Release the handle we opened even if the writer failed.
            if close:
                out_stream.close()

View File

@@ -0,0 +1,169 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import os
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
from calibre.constants import numeric_version
from calibre import walk
from polyglot.builtins import unicode_type
class RecipeDisabled(Exception):
    """Raised with the recipe's ``recipe_disabled`` value when it is set."""
class RecipeInput(InputFormatPlugin):
    """Input plugin that runs a news recipe and feeds the download to the pipeline.

    Accepts either a ``.recipe`` source file / builtin recipe title, or a
    ``.downloaded_recipe`` archive containing an earlier download.
    """

    name = 'Recipe Input'
    author = 'Kovid Goyal'
    description = _('Download periodical content from the internet')
    file_types = {'recipe', 'downloaded_recipe'}
    commit_name = 'recipe_input'

    recommendations = {
        ('chapter', None, OptionRecommendation.HIGH),
        ('dont_split_on_page_breaks', True, OptionRecommendation.HIGH),
        ('use_auto_toc', False, OptionRecommendation.HIGH),
        ('input_encoding', None, OptionRecommendation.HIGH),
        ('input_profile', 'default', OptionRecommendation.HIGH),
        ('page_breaks_before', None, OptionRecommendation.HIGH),
        ('insert_metadata', False, OptionRecommendation.HIGH),
    }

    options = {
        OptionRecommendation(name='test', recommended_value=False,
            help=_(
                'Useful for recipe development. Forces'
                ' max_articles_per_feed to 2 and downloads at most 2 feeds.'
                ' You can change the number of feeds and articles by supplying optional arguments.'
                ' For example: --test 3 1 will download at most 3 feeds and only 1 article per feed.')),
        OptionRecommendation(name='username', recommended_value=None,
            help=_('Username for sites that require a login to access '
                   'content.')),
        OptionRecommendation(name='password', recommended_value=None,
            help=_('Password for sites that require a login to access '
                   'content.')),
        OptionRecommendation(name='dont_download_recipe',
            recommended_value=False,
            help=_('Do not download latest version of builtin recipes from the calibre server')),
        OptionRecommendation(name='lrf', recommended_value=False,
            help='Optimize fetching for subsequent conversion to LRF.'),
    }

    def convert(self, recipe_or_file, opts, file_ext, log,
                accelerators):
        """Obtain the recipe, run it if needed, and return the OPF path.

        Recipe source resolution, in order:

        * ``downloaded_recipe`` archives are extracted into the current
          directory and the bundled ``download.recipe`` is compiled; no
          download is performed.
        * the ``CALIBRE_RECIPE_URN`` environment variable, if set;
        * ``recipe_or_file`` if it is a readable file;
        * otherwise a builtin recipe looked up by title, preferring the
          downloaded copy and falling back to the bundled one.

        Recipe ``conversion_options`` are copied onto ``opts``.

        :return: absolute path of the first ``.opf`` found in the current
            directory (searched recursively as a fallback), or ``None``
            if there is none.
        :raises ValueError: for an invalid URN or an unknown recipe.
        :raises RecipeDisabled: if the recipe sets ``recipe_disabled``.
        """
        from calibre.web.feeds.recipes import compile_recipe
        opts.output_profile.flow_size = 0
        if file_ext == 'downloaded_recipe':
            from calibre.utils.zipfile import ZipFile
            zf = ZipFile(recipe_or_file, 'r')
            try:
                zf.extractall()
            finally:
                zf.close()
            with lopen('download.recipe', 'rb') as f:
                self.recipe_source = f.read()
            recipe = compile_recipe(self.recipe_source)
            recipe.needs_subscription = False
            self.recipe_object = recipe(opts, log, self.report_progress)
        else:
            if os.environ.get('CALIBRE_RECIPE_URN'):
                from calibre.web.feeds.recipes.collection import get_custom_recipe, get_builtin_recipe_by_id
                urn = os.environ['CALIBRE_RECIPE_URN']
                log('Downloading recipe urn: ' + urn)
                rtype, recipe_id = urn.partition(':')[::2]
                if not recipe_id:
                    raise ValueError('Invalid recipe urn: ' + urn)
                if rtype == 'custom':
                    self.recipe_source = get_custom_recipe(recipe_id)
                else:
                    self.recipe_source = get_builtin_recipe_by_id(urn, log=log, download_recipe=True)
                if not self.recipe_source:
                    raise ValueError('Could not find recipe with urn: ' + urn)
                if not isinstance(self.recipe_source, bytes):
                    self.recipe_source = self.recipe_source.encode('utf-8')
                recipe = compile_recipe(self.recipe_source)
            elif os.access(recipe_or_file, os.R_OK):
                with lopen(recipe_or_file, 'rb') as f:
                    self.recipe_source = f.read()
                recipe = compile_recipe(self.recipe_source)
                log('Using custom recipe')
            else:
                from calibre.web.feeds.recipes.collection import (
                    get_builtin_recipe_by_title, get_builtin_recipe_titles)
                title = getattr(opts, 'original_recipe_input_arg', recipe_or_file)
                title = os.path.basename(title).rpartition('.')[0]
                titles = frozenset(get_builtin_recipe_titles())
                if title not in titles:
                    # Retry with the un-basenamed argument (titles may
                    # themselves contain path separators).
                    title = getattr(opts, 'original_recipe_input_arg', recipe_or_file)
                    title = title.rpartition('.')[0]

                raw = get_builtin_recipe_by_title(title, log=log,
                        download_recipe=not opts.dont_download_recipe)
                builtin = False
                try:
                    recipe = compile_recipe(raw)
                    self.recipe_source = raw
                    if recipe.requires_version > numeric_version:
                        # Convert each component to text before joining:
                        # str.join raises TypeError for non-string items
                        # (the version is compared against
                        # numeric_version), and that error was reported
                        # below as a compile failure.
                        log.warn(
                            'Downloaded recipe needs calibre version at least: %s' %
                            ('.'.join(map(unicode_type, recipe.requires_version))))
                        builtin = True
                except Exception:
                    log.exception('Failed to compile downloaded recipe. Falling '
                            'back to builtin one')
                    builtin = True
                if builtin:
                    log('Using bundled builtin recipe')
                    raw = get_builtin_recipe_by_title(title, log=log,
                            download_recipe=False)
                    if raw is None:
                        raise ValueError('Failed to find builtin recipe: '+title)
                    recipe = compile_recipe(raw)
                    self.recipe_source = raw
                else:
                    log('Using downloaded builtin recipe')

            if recipe is None:
                raise ValueError('%r is not a valid recipe file or builtin recipe' %
                        recipe_or_file)

            disabled = getattr(recipe, 'recipe_disabled', None)
            if disabled is not None:
                raise RecipeDisabled(disabled)
            ro = recipe(opts, log, self.report_progress)
            ro.download()
            self.recipe_object = ro

        for key, val in self.recipe_object.conversion_options.items():
            setattr(opts, key, val)

        for f in os.listdir('.'):
            if f.endswith('.opf'):
                return os.path.abspath(f)

        for f in walk('.'):
            if f.endswith('.opf'):
                return os.path.abspath(f)

    def postprocess_book(self, oeb, opts, log):
        """Give the recipe object its post-processing hooks, if one exists."""
        if self.recipe_object is not None:
            self.recipe_object.internal_postprocess_book(oeb, opts, log)
            self.recipe_object.postprocess_book(oeb, opts, log)

    def specialize(self, oeb, opts, log, output_fmt):
        """Strip ``calibre_navbar`` divs when ``opts.no_inline_navbars`` is set."""
        if opts.no_inline_navbars:
            from calibre.ebooks.oeb.base import XPath
            for item in oeb.spine:
                for div in XPath('//h:div[contains(@class, "calibre_navbar")]')(item.data):
                    div.getparent().remove(div)

    def save_download(self, zf):
        """Store the recipe source in ``zf`` as ``download.recipe`` (UTF-8 bytes)."""
        raw = self.recipe_source
        if isinstance(raw, unicode_type):
            raw = raw.encode('utf-8')
        zf.writestr('download.recipe', raw)

View File

@@ -0,0 +1,323 @@
from __future__ import with_statement, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
import os, glob, re, textwrap
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
from polyglot.builtins import iteritems, filter, getcwd, as_bytes
# Maps the border style names found in the ``border-cell-*-style``
# attributes of the intermediate XML to CSS ``border-style`` keywords.
# Both the 'tripple' and 'triple' spellings are accepted.
border_style_map = {
    'single': 'solid',
    'double-thickness-border': 'double',
    'shadowed-border': 'outset',
    'double-border': 'double',
    'dotted-border': 'dotted',
    'dashed': 'dashed',
    'hairline': 'solid',
    'inset': 'inset',
    'dash-small': 'dashed',
    'dot-dash': 'dotted',
    'dot-dot-dash': 'dotted',
    'outset': 'outset',
    'tripple': 'double',
    'triple': 'double',
    'thick-thin-small': 'solid',
    'thin-thick-small': 'solid',
    'thin-thick-thin-small': 'solid',
    'thick-thin-medium': 'solid',
    'thin-thick-medium': 'solid',
    'thin-thick-thin-medium': 'solid',
    'thick-thin-large': 'solid',
    'thin-thick-thin-large': 'solid',
    'wavy': 'ridge',
    'double-wavy': 'ridge',
    'striped': 'ridge',
    'emboss': 'inset',
    'engrave': 'inset',
    'frame': 'ridge',
}
class RTFInput(InputFormatPlugin):
name = 'RTF Input'
author = 'Kovid Goyal'
description = 'Convert RTF files to HTML'
file_types = {'rtf'}
commit_name = 'rtf_input'
options = {
OptionRecommendation(name='ignore_wmf', recommended_value=False,
help=_('Ignore WMF images instead of replacing them with a placeholder image.')),
}
def generate_xml(self, stream):
    """Run the RTF-to-XML parser on ``stream`` and return the XML bytes.

    The parser writes ``dataxml.xml`` into the current directory, which is
    then read back.  When ``self.opts.debug_pipeline`` is set, an
    ``rtfdebug`` directory is created and the parser runs at a higher run
    level with indented output; if that setup fails the parser simply runs
    with the normal settings.
    """
    from calibre.ebooks.rtf2xml.ParseRtf import ParseRtf
    ofile = u'dataxml.xml'
    run_lev, debug_dir, indent_out = 1, None, 0
    if getattr(self.opts, 'debug_pipeline', None) is not None:
        try:
            os.mkdir(u'rtfdebug')
            debug_dir = u'rtfdebug'
            run_lev = 4
            indent_out = 1
            self.log('Running RTFParser in debug mode')
        except Exception:
            # Debug mode is best effort, but do not swallow
            # KeyboardInterrupt/SystemExit as the bare except did.
            self.log.warn('Impossible to run RTFParser in debug mode')
    parser = ParseRtf(
        in_file=stream,
        out_file=ofile,
        # Convert symbol fonts to unicode equivalents. Default
        # is 1
        convert_symbol=1,

        # Convert Zapf fonts to unicode equivalents. Default
        # is 1.
        convert_zapf=1,

        # Convert Wingding fonts to unicode equivalents.
        # Default is 1.
        convert_wingdings=1,

        # Convert RTF caps to real caps.
        # Default is 1.
        convert_caps=1,

        # Indent resulting XML.
        # Default is 0 (no indent).
        indent=indent_out,

        # Form lists from RTF. Default is 1.
        form_lists=1,

        # Convert headings to sections. Default is 0.
        headings_to_sections=1,

        # Group paragraphs with the same style name. Default is 1.
        group_styles=1,

        # Group borders. Default is 1.
        group_borders=1,

        # Write or do not write paragraphs. Default is 0.
        empty_paragraphs=1,

        # Debug
        deb_dir=debug_dir,

        # Default encoding
        default_encoding=getattr(self.opts, 'input_encoding', 'cp1252') or 'cp1252',

        # Run level
        run_level=run_lev,
    )
    parser.parse_rtf()
    with open(ofile, 'rb') as f:
        return f.read()
def extract_images(self, picts):
    """Write every ``\\pict`` group found in the file ``picts`` to disk.

    Each group is reduced to its hex digits and decoded.  The image type is
    sniffed from the decoded bytes, with unknown data assumed to be WMF.
    Files are named ``0001.<fmt>``, ``0002.<fmt>``, ... in the current
    directory.  Returns the result of ``convert_images`` on the
    ``{number: file name}`` map.
    """
    from calibre.utils.imghdr import what
    from binascii import unhexlify
    self.log('Extracting images...')

    with open(picts, 'rb') as src:
        rtf_bytes = src.read()
    groups = filter(len, re.findall(br'\{\\pict([^}]+)\}', rtf_bytes))
    non_hex = re.compile(br'[^a-fA-F0-9]')
    hex_blobs = [non_hex.sub(b'', group) for group in groups]

    image_map = {}
    for number, hex_blob in enumerate(hex_blobs, 1):
        # unhexlify needs an even number of digits; drop a trailing odd one.
        if len(hex_blob) % 2 == 1:
            hex_blob = hex_blob[:-1]
        payload = unhexlify(hex_blob)
        kind = what(None, payload)
        if kind is None:
            kind = 'wmf'
        fname = u'%04d.%s' % (number, kind)
        with open(fname, 'wb') as dest:
            dest.write(payload)
        image_map[number] = fname
    return self.convert_images(image_map)
def convert_images(self, imap):
    """Run ``convert_image`` on every file name in ``imap``, in place.

    :param imap: mapping of image number to file name; values are replaced
        by whatever ``convert_image`` returns.  Entries whose conversion
        fails are logged and left unchanged.
    :return: the same ``imap`` object.
    """
    # Reset the cached placeholder image consulted by replace_wmf.
    self.default_img = None
    for count, val in iteritems(imap):
        try:
            # Only existing keys are reassigned, so the dict does not
            # change size while it is being iterated.
            imap[count] = self.convert_image(val)
        except Exception:
            # Was a bare except, which also trapped KeyboardInterrupt and
            # SystemExit; keep going on ordinary errors only.
            self.log.exception('Failed to convert', val)
    return imap
def convert_image(self, name):
    """Return a usable image file name for ``name``.

    Names not ending in ``.wmf`` are returned untouched.  WMF files are
    rasterized; if that raises, the failure is logged and the result of
    ``replace_wmf`` is returned instead.
    """
    if name.endswith('.wmf'):
        try:
            return self.rasterize_wmf(name)
        except Exception:
            self.log.exception('Failed to convert WMF image %r'%name)
        return self.replace_wmf(name)
    return name
def replace_wmf(self, name):
if self.opts.ignore_wmf:
os.remove(name)
return '__REMOVE_ME__'
from calibre.ebooks.covers import message_image
if self.default_img is None:
self.default_img = message_image('Conversion of WMF images is not supported.'
' Use Microsoft Word or OpenOffice to save this RTF file'
' as HTML and convert that in calibre.')
name = name.replace('.wmf', '.jpg')
with lopen(name, 'wb') as f:
f.write(self.default_img)
return name
def rasterize_wmf(self, name):
    """Unwrap the WMF file ``name`` and save the result as a ``.png`` file.

    Returns the new file name (``.wmf`` replaced by ``.png``).
    """
    from calibre.utils.wmf.parse import wmf_unwrap

    with open(name, 'rb') as wmf_file:
        unwrapped = wmf_unwrap(wmf_file.read())
    png_name = name.replace('.wmf', '.png')
    with open(png_name, 'wb') as png_file:
        png_file.write(unwrapped)
    return png_name
def write_inline_css(self, ic, border_styles):
    """Append the generated inline and border rules to ``styles.css``.

    :param ic: object exposing ``font_sizes`` and ``colors`` sequences; one
        ``span.fsN`` rule is emitted per font size and one ``span.colN``
        rule per colour that is not the string ``'false'``.
    :param border_styles: mapping of CSS class name to declaration text,
        as returned by ``convert_borders()``.
    """
    base = textwrap.dedent('''
    span.none {
    text-decoration: none; font-weight: normal;
    font-style: normal; font-variant: normal
    }
    span.italics { font-style: italic }
    span.bold { font-weight: bold }
    span.small-caps { font-variant: small-caps }
    span.underlined { text-decoration: underline }
    span.strike-through { text-decoration: line-through }
    ''')
    size_rules = '\n'.join(
        'span.fs%d { font-size: %spt }'%(pos, size)
        for pos, size in enumerate(ic.font_sizes))
    colour_rules = '\n'.join(
        'span.col%d { color: %s }'%(pos, colour)
        for pos, colour in enumerate(ic.colors) if colour != 'false')

    pieces = [base, '\n', size_rules, '\n', colour_rules]
    pieces.extend('\n\n.%s {\n%s\n}'%(cls, val)
                  for cls, val in iteritems(border_styles))
    css = ''.join(pieces)

    # Opened in append mode: existing content of styles.css is preserved
    with open(u'styles.css', 'ab') as out:
        out.write(css.encode('utf-8'))
def convert_borders(self, doc):
    """Give every ``cell`` element in *doc* a CSS class for its borders.

    The ``border-cell-<side>-style``, ``-line-width`` and ``-color``
    attributes of each cell are translated into CSS declarations. Cells
    with identical declarations share one class, named ``border_styleN``
    with N counting distinct styles in order of first appearance.

    :return: dict mapping each class name to its declaration text.
    """
    sides = ('bottom', 'top', 'left', 'right')
    class_for_style = {}
    style_map = {}
    for cell in doc.xpath(r'//*[local-name()="cell"]'):
        rules = ['border-style: hidden', 'border-width: 1px',
                 'border-color: black']
        for side in sides:
            kind = cell.get('border-cell-%s-style'%side, None)
            if kind:
                # Unrecognised RTF border styles fall back to 'solid'
                rules.append('border-%s-style: %s'%(
                    side, border_style_map.get(kind, 'solid')))
            width = cell.get('border-cell-%s-line-width'%side, None)
            if width:
                rules.append('border-%s-width: %spt'%(side, width))
            colour = cell.get('border-cell-%s-color'%side, None)
            if colour:
                rules.append('border-%s-color: %s'%(side, colour))
        style = ';\n'.join(rules)
        cls = class_for_style.get(style)
        if cls is None:
            # len() is taken before insertion, so numbering starts at 0
            cls = class_for_style[style] = 'border_style%d'%len(class_for_style)
        style_map[cls] = style
        cell.set('class', cls)
    return style_map
def convert(self, stream, options, file_ext, log,
            accelerators):
    """Convert the RTF file behind *stream* into XHTML plus an OPF file.

    The RTF is turned into XML by ``self.generate_xml()``, embedded
    pictures are extracted, the XML is transformed with the
    ``templates/rtf.xsl`` stylesheet into ``index.xhtml``, the CSS is
    appended to ``styles.css`` and ``metadata.opf`` is written, all in the
    current directory.

    :param stream: open RTF file; ``stream.name`` is passed to the XML
        generator and the stream is re-read for metadata.
    :param options: conversion options, stored on ``self.opts``.
    :param file_ext: unused here.
    :param log: logger, stored on ``self.log``.
    :param accelerators: unused here.
    :return: absolute path of the generated ``metadata.opf``.
    :raises ValueError: if the RTF parser reports invalid code.
    """
    from lxml import etree
    from calibre.ebooks.metadata.meta import get_metadata
    from calibre.ebooks.metadata.opf2 import OPFCreator
    from calibre.ebooks.rtf2xml.ParseRtf import RtfInvalidCodeException
    from calibre.ebooks.rtf.input import InlineClass
    from calibre.utils.xml_parse import safe_xml_fromstring
    self.opts = options
    self.log = log
    self.log('Converting RTF to XML...')
    try:
        xml = self.generate_xml(stream.name)
    except RtfInvalidCodeException as e:
        self.log.exception('Unable to parse RTF')
        raise ValueError(_('This RTF file has a feature calibre does not '
        'support. Convert it to HTML first and then try it.\n%s')%e)

    # Always bind imap: the pict loop below consults it even when no
    # picture directory was produced or extraction failed.
    imap = {}
    d = glob.glob(os.path.join('*_rtf_pict_dir', 'picts.rtf'))
    if d:
        try:
            imap = self.extract_images(d[0])
        except Exception:
            # Best effort: continue without images
            self.log.exception('Failed to extract images...')

    self.log('Parsing XML...')
    doc = safe_xml_fromstring(xml)
    border_styles = self.convert_borders(doc)
    # Replace picture numbers with the names of the extracted files
    for pict in doc.xpath('//rtf:pict[@num]',
            namespaces={'rtf':'http://rtf2xml.sourceforge.net/'}):
        num = int(pict.get('num'))
        name = imap.get(num, None)
        if name is not None:
            pict.set('num', name)

    self.log('Converting XML to HTML...')
    inline_class = InlineClass(self.log)
    styledoc = safe_xml_fromstring(P('templates/rtf.xsl', data=True), recover=False)
    extensions = {('calibre', 'inline-class') : inline_class}
    transform = etree.XSLT(styledoc, extensions=extensions)
    result = transform(doc)
    html = u'index.xhtml'
    with open(html, 'wb') as f:
        res = as_bytes(transform.tostring(result))
        # Collapse runs of newlines into a single one
        res = re.sub(b'\n+', b'\n', res)
        f.write(res)
    self.write_inline_css(inline_class, border_styles)

    stream.seek(0)
    mi = get_metadata(stream, 'rtf')
    if not mi.title:
        mi.title = _('Unknown')
    if not mi.authors:
        mi.authors = [_('Unknown')]
    opf = OPFCreator(getcwd(), mi)
    opf.create_manifest([(u'index.xhtml', None)])
    opf.create_spine([u'index.xhtml'])
    # Close the OPF file deterministically instead of leaking the handle
    with open(u'metadata.opf', 'wb') as opf_file:
        opf.render(opf_file)
    return os.path.abspath(u'metadata.opf')
def postprocess_book(self, oeb, opts, log):
    """Remove every ``img`` whose ``src`` is the ``__REMOVE_ME__`` marker.

    Text following a removed image (its tail) is kept: it is appended to
    the tail of the preceding sibling, or to the parent's text when the
    image was the first child.
    """
    marker_query = '//*[local-name()="img" and @src="__REMOVE_ME__"]'
    for item in oeb.spine:
        for img in item.data.xpath(marker_query):
            parent = img.getparent()
            pos = parent.index(img)
            parent.remove(img)
            trailing = img.tail
            if not trailing:
                continue
            if pos == 0:
                parent.text = (parent.text or '') + trailing
            else:
                previous = parent[pos-1]
                previous.tail = (previous.tail or '') + trailing

View File

@@ -0,0 +1,40 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import os
from calibre.customize.conversion import OutputFormatPlugin
class RTFOutput(OutputFormatPlugin):
    """Output plugin that writes a book as an RTF document."""

    name = 'RTF Output'
    author = 'John Schember'
    file_type = 'rtf'
    commit_name = 'rtf_output'

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        """Render *oeb_book* with ``RTFMLizer`` and write it out.

        :param oeb_book: the book to convert.
        :param output_path: either a file-like object (anything with a
            ``write`` attribute) or a path; for a path, missing parent
            directories are created and the file is opened and closed here.
        :param input_plugin: unused here.
        :param opts: options passed to ``RTFMLizer.extract_content``.
        :param log: logger handed to ``RTFMLizer``.

        The content is encoded as ASCII with the ``'replace'`` error
        handler.
        """
        from calibre.ebooks.rtf.rtfml import RTFMLizer

        rtfmlitzer = RTFMLizer(log)
        content = rtfmlitzer.extract_content(oeb_book, opts)

        close = False
        if not hasattr(output_path, 'write'):
            close = True
            if not os.path.exists(os.path.dirname(output_path)) and os.path.dirname(output_path) != '':
                os.makedirs(os.path.dirname(output_path))
            out_stream = lopen(output_path, 'wb')
        else:
            out_stream = output_path

        try:
            out_stream.seek(0)
            out_stream.truncate()
            out_stream.write(content.encode('ascii', 'replace'))
        finally:
            # Only close streams opened here, and do so even when the
            # write fails.
            if close:
                out_stream.close()

View File

@@ -0,0 +1,122 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL 3'
__copyright__ = '2010, Li Fanxi <lifanxi@freemindworld.com>'
__docformat__ = 'restructuredtext en'
import os
from calibre.customize.conversion import InputFormatPlugin
from calibre.ptempfile import TemporaryDirectory
from calibre.utils.filenames import ascii_filename
from polyglot.builtins import unicode_type
HTML_TEMPLATE = '<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/><title>%s</title></head><body>\n%s\n</body></html>'
def html_encode(s):
    """Return *s* escaped for inclusion in the generated HTML.

    ``& < > " '`` become character entities, newlines become ``<br/>`` and
    every space becomes ``&nbsp;``.
    """
    # Order matters: '&' must be escaped first so that the entities added
    # by the later substitutions are not escaped a second time.
    substitutions = (
        ('&', '&amp;'),
        ('<', '&lt;'),
        ('>', '&gt;'),
        ('"', '&quot;'),
        ("'", '&apos;'),
        ('\n', '<br/>'),
        (' ', '&nbsp;'),
    )
    for plain, escaped in substitutions:
        s = s.replace(plain, escaped)
    return s
class SNBInput(InputFormatPlugin):
    """Input plugin that turns an SNB file into an OEB book."""

    name = 'SNB Input'
    author = 'Li Fanxi'
    description = 'Convert SNB files to OEB'
    file_types = {'snb'}
    commit_name = 'snb_input'

    options = set()

    def convert(self, stream, options, file_ext, log,
                accelerators):
        """Parse the SNB file in *stream* and build an OEB book from it.

        Metadata is read from ``snbf/book.snbf`` and the chapter list from
        ``snbf/toc.snbf``. Each chapter is written as ``ch_N.htm`` into a
        temporary directory (created with ``keep=True``) that becomes the
        book's container; image files are added to the manifest.

        :param stream: open SNB file.
        :param options: conversion options; ``input_encoding`` is read.
        :param file_ext: unused here.
        :param log: logger.
        :param accelerators: unused here.
        :return: the populated OEB book.
        :raises ValueError: if the file cannot be parsed or is not valid.
        """
        import uuid

        from calibre.ebooks.oeb.base import DirContainer
        from calibre.ebooks.snb.snbfile import SNBFile
        from calibre.utils.xml_parse import safe_xml_fromstring

        log.debug("Parsing SNB file...")
        snbFile = SNBFile()
        try:
            snbFile.Parse(stream)
        except Exception:
            # Any parser failure means the input is unusable; interrupts
            # and interpreter exit are left to propagate.
            raise ValueError("Invalid SNB file")
        if not snbFile.IsValid():
            log.debug("Invalid SNB file")
            raise ValueError("Invalid SNB file")
        log.debug("Handle meta data ...")
        from calibre.ebooks.conversion.plumber import create_oebbook
        oeb = create_oebbook(log, None, options,
                encoding=options.input_encoding, populate=False)
        meta = snbFile.GetFileStream('snbf/book.snbf')
        if meta is not None:
            meta = safe_xml_fromstring(meta)
            xpaths = {'title'    : './/head/name',
                      'creator'  : './/head/author',
                      'language' : './/head/language',
                      'generator': './/head/generator',
                      'publisher': './/head/publisher',
                      'cover'    : './/head/cover', }
            d = {}
            for field in xpaths:
                node = meta.find(xpaths[field])
                # Missing nodes and empty nodes both yield ''
                if node is not None:
                    d[field] = node.text if node.text is not None else ''
                else:
                    d[field] = ''

            oeb.metadata.add('title', d['title'])
            oeb.metadata.add('creator', d['creator'], attrib={'role':'aut'})
            oeb.metadata.add('language', d['language'].lower().replace('_', '-'))
            oeb.metadata.add('generator', d['generator'])
            oeb.metadata.add('publisher', d['publisher'])
            if d['cover'] != '':
                oeb.guide.add('cover', 'Cover', d['cover'])

        bookid = unicode_type(uuid.uuid4())
        oeb.metadata.add('identifier', bookid, id='uuid_id', scheme='uuid')
        for ident in oeb.metadata.identifier:
            if 'id' in ident.attrib:
                oeb.uid = oeb.metadata.identifier[0]
                break

        with TemporaryDirectory('_snb2oeb', keep=True) as tdir:
            log.debug('Process TOC ...')
            toc = snbFile.GetFileStream('snbf/toc.snbf')
            oeb.container = DirContainer(tdir, log)
            if toc is not None:
                toc = safe_xml_fromstring(toc)
                i = 1
                for ch in toc.find('.//body'):
                    chapterName = ch.text
                    chapterSrc = ch.get('src')
                    fname = 'ch_%d.htm' % i
                    data = snbFile.GetFileStream('snbc/' + chapterSrc)
                    if data is None:
                        # Chapter listed in the TOC but absent from the file
                        continue
                    snbc = safe_xml_fromstring(data)
                    lines = []
                    for line in snbc.find('.//body'):
                        # An empty element has text None; treat it as ''
                        # rather than crashing in html_encode().
                        if line.tag == 'text':
                            lines.append('<p>%s</p>' % html_encode(line.text or ''))
                        elif line.tag == 'img':
                            lines.append('<p><img src="%s" /></p>' % html_encode(line.text or ''))
                    with open(os.path.join(tdir, fname), 'wb') as f:
                        f.write((HTML_TEMPLATE % (chapterName, '\n'.join(lines))).encode('utf-8', 'replace'))
                    oeb.toc.add(ch.text, fname)
                    item_id, href = oeb.manifest.generate(id='html',
                        href=ascii_filename(fname))
                    item = oeb.manifest.add(item_id, href, 'text/html')
                    item.html_input_href = fname
                    oeb.spine.add(item, True)
                    i = i + 1
                imageFiles = snbFile.OutputImageFiles(tdir)
                for f, m in imageFiles:
                    item_id, href = oeb.manifest.generate(id='image',
                        href=ascii_filename(f))
                    item = oeb.manifest.add(item_id, href, m)
                    item.html_input_href = f

        return oeb

View File

@@ -0,0 +1,269 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL 3'
__copyright__ = '2010, Li Fanxi <lifanxi@freemindworld.com>'
__docformat__ = 'restructuredtext en'
import os
from calibre.customize.conversion import OutputFormatPlugin, OptionRecommendation
from calibre.ptempfile import TemporaryDirectory
from calibre.constants import __appname__, __version__
from polyglot.builtins import unicode_type
class SNBOutput(OutputFormatPlugin):
    """Output plugin that packages a book as an SNB file."""

    name = 'SNB Output'
    author = 'Li Fanxi'
    file_type = 'snb'
    commit_name = 'snb_output'

    options = {
        OptionRecommendation(name='snb_output_encoding', recommended_value='utf-8',
            level=OptionRecommendation.LOW,
            help=_('Specify the character encoding of the output document. '
            'The default is utf-8.')),
        OptionRecommendation(name='snb_max_line_length',
            recommended_value=0, level=OptionRecommendation.LOW,
            help=_('The maximum number of characters per line. This splits on '
            'the first space before the specified value. If no space is found '
            'the line will be broken at the space after and will exceed the '
            'specified value. Also, there is a minimum of 25 characters. '
            'Use 0 to disable line splitting.')),
        OptionRecommendation(name='snb_insert_empty_line',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Specify whether or not to insert an empty line between '
            'two paragraphs.')),
        OptionRecommendation(name='snb_dont_indent_first_line',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Specify whether or not to insert two space characters '
            'to indent the first line of each paragraph.')),
        OptionRecommendation(name='snb_hide_chapter_name',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Specify whether or not to hide the chapter title for each '
            'chapter. Useful for image-only output (eg. comics).')),
        OptionRecommendation(name='snb_full_screen',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Resize all the images for full screen view. ')),
    }

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        """Write *oeb_book* to *output_path* as an SNB file.

        Inside a temporary directory this builds ``snbf/book.snbf``
        (metadata), ``snbf/toc.snbf`` (chapter list), one ``.snbc`` file per
        chapter under ``snbc/`` and the converted images under
        ``snbc/images``, then packs the directory with ``SNBFile``.

        :param oeb_book: the book to convert.
        :param output_path: destination passed to ``SNBFile.Output``.
        :param input_plugin: unused here.
        :param opts: conversion options; stored on ``self.opts``.
        :param log: logger.
        """
        from lxml import etree
        from calibre.ebooks.oeb.base import OEB_DOCS, OEB_IMAGES
        from calibre.ebooks.snb.snbfile import SNBFile
        from calibre.ebooks.snb.snbml import SNBMLizer, ProcessFileName

        self.opts = opts
        from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer, Unavailable
        try:
            rasterizer = SVGRasterizer()
            rasterizer(oeb_book, opts)
        except Unavailable:
            log.warn('SVG rasterizer unavailable, SVG will not be converted')

        # Create temp dir
        with TemporaryDirectory('_snb_output') as tdir:
            # Create stub directories
            snbfDir = os.path.join(tdir, 'snbf')
            snbcDir = os.path.join(tdir, 'snbc')
            snbiDir = os.path.join(tdir, 'snbc/images')
            os.mkdir(snbfDir)
            os.mkdir(snbcDir)
            os.mkdir(snbiDir)

            # Process Meta data
            meta = oeb_book.metadata
            if meta.title:
                title = unicode_type(meta.title[0])
            else:
                title = ''
            authors = [unicode_type(x) for x in meta.creator if x.role == 'aut']
            if meta.publisher:
                publishers = unicode_type(meta.publisher[0])
            else:
                publishers = ''
            if meta.language:
                lang = unicode_type(meta.language[0]).upper()
            else:
                lang = ''
            if meta.description:
                abstract = unicode_type(meta.description[0])
            else:
                abstract = ''

            # Process Cover: the guide's cover is used only when the guide
            # has no 'titlepage' entry
            g, m, s = oeb_book.guide, oeb_book.manifest, oeb_book.spine
            href = None
            if 'titlepage' not in g:
                if 'cover' in g:
                    href = g['cover'].href

            # Output book info file
            bookInfoTree = etree.Element("book-snbf", version="1.0")
            headTree = etree.SubElement(bookInfoTree, "head")
            etree.SubElement(headTree, "name").text = title
            etree.SubElement(headTree, "author").text = ' '.join(authors)
            etree.SubElement(headTree, "language").text = lang
            etree.SubElement(headTree, "rights")
            etree.SubElement(headTree, "publisher").text = publishers
            etree.SubElement(headTree, "generator").text = __appname__ + ' ' + __version__
            etree.SubElement(headTree, "created")
            etree.SubElement(headTree, "abstract").text = abstract
            if href is not None:
                etree.SubElement(headTree, "cover").text = ProcessFileName(href)
            else:
                etree.SubElement(headTree, "cover")
            with open(os.path.join(snbfDir, 'book.snbf'), 'wb') as f:
                f.write(etree.tostring(bookInfoTree, pretty_print=True, encoding='utf-8'))

            # Output TOC
            tocInfoTree = etree.Element("toc-snbf")
            tocHead = etree.SubElement(tocInfoTree, "head")
            tocBody = etree.SubElement(tocInfoTree, "body")
            # Maps a spine href to the list of (fragment id, title) pairs
            # that start a chapter inside that file
            outputFiles = {}
            if oeb_book.toc.count() == 0:
                log.warn('This SNB file has no Table of Contents. '
                    'Creating a default TOC')
                first = next(iter(oeb_book.spine))
                oeb_book.toc.add(_('Start page'), first.href)
            else:
                first = next(iter(oeb_book.spine))
                if oeb_book.toc[0].href != first.href:
                    # The pages before the first item in toc will be stored as
                    # "Cover Pages".
                    # oeb_book.toc does not support "insert", so we generate
                    # the tocInfoTree directly instead of modifying the toc
                    ch = etree.SubElement(tocBody, "chapter")
                    ch.set("src", ProcessFileName(first.href) + ".snbc")
                    ch.text = _('Cover pages')
                    outputFiles[first.href] = []
                    outputFiles[first.href].append(("", _("Cover pages")))

            for tocitem in oeb_book.toc:
                if tocitem.href.find('#') != -1:
                    item = tocitem.href.split('#')
                    if len(item) != 2:
                        log.error('Error in TOC item: %s' % tocitem)
                    else:
                        if item[0] in outputFiles:
                            outputFiles[item[0]].append((item[1], tocitem.title))
                        else:
                            # First reference to this file points into its
                            # middle: everything before the fragment becomes
                            # a "(Preface)" chapter
                            outputFiles[item[0]] = []
                            outputFiles[item[0]].append(("", tocitem.title + _(" (Preface)")))
                            ch = etree.SubElement(tocBody, "chapter")
                            ch.set("src", ProcessFileName(item[0]) + ".snbc")
                            ch.text = tocitem.title + _(" (Preface)")
                            outputFiles[item[0]].append((item[1], tocitem.title))
                else:
                    if tocitem.href in outputFiles:
                        outputFiles[tocitem.href].append(("", tocitem.title))
                    else:
                        outputFiles[tocitem.href] = []
                        outputFiles[tocitem.href].append(("", tocitem.title))
                        ch = etree.SubElement(tocBody, "chapter")
                        ch.set("src", ProcessFileName(tocitem.href) + ".snbc")
                        ch.text = tocitem.title

            etree.SubElement(tocHead, "chapters").text = '%d' % len(tocBody)

            with open(os.path.join(snbfDir, 'toc.snbf'), 'wb') as f:
                f.write(etree.tostring(tocInfoTree, pretty_print=True, encoding='utf-8'))

            # Output Files
            oldTree = None      # tree of the chapter written most recently
            mergeLast = False   # True while files are merged into oldTree
            lastName = None     # file name oldTree was written to
            for item in s:
                if m.hrefs[item.href].media_type in OEB_DOCS:
                    if item.href not in outputFiles:
                        log.debug('File %s is unused in TOC. Continue in last chapter' % item.href)
                        mergeLast = True
                    else:
                        if oldTree is not None and mergeLast:
                            log.debug('Output the modified chapter again: %s' % lastName)
                            with open(os.path.join(snbcDir, lastName), 'wb') as f:
                                f.write(etree.tostring(oldTree, pretty_print=True, encoding='utf-8'))
                        # A file referenced by the TOC always starts a new
                        # chapter, so merging stops here even when there was
                        # no earlier tree to flush.
                        mergeLast = False

                    log.debug('Converting %s to snbc...' % item.href)
                    snbwriter = SNBMLizer(log)
                    snbcTrees = None
                    if not mergeLast:
                        snbcTrees = snbwriter.extract_content(oeb_book, item, outputFiles[item.href], opts)
                        for subName in snbcTrees:
                            postfix = ''
                            if subName != '':
                                postfix = '_' + subName
                            lastName = ProcessFileName(item.href + postfix + ".snbc")
                            oldTree = snbcTrees[subName]
                            with open(os.path.join(snbcDir, lastName), 'wb') as f:
                                f.write(etree.tostring(oldTree, pretty_print=True, encoding='utf-8'))
                    else:
                        log.debug('Merge %s with last TOC item...' % item.href)
                        snbwriter.merge_content(oldTree, oeb_book, item, [('', _("Start"))], opts)

            # Output the last one if needed
            if oldTree is not None and mergeLast:
                log.debug('Output the last modified chapter again: %s' % lastName)
                with open(os.path.join(snbcDir, lastName), 'wb') as f:
                    f.write(etree.tostring(oldTree, pretty_print=True, encoding='utf-8'))
                mergeLast = False

            for item in m:
                if m.hrefs[item.href].media_type in OEB_IMAGES:
                    log.debug('Converting image: %s ...' % item.href)
                    content = m.hrefs[item.href].data
                    # Convert & Resize image
                    self.HandleImage(content, os.path.join(snbiDir, ProcessFileName(item.href)))

            # Package as SNB File
            snbFile = SNBFile()
            snbFile.FromDir(tdir)
            snbFile.Output(output_path)

    def HandleImage(self, imageData, imagePath):
        """Save *imageData* to *imagePath*, shrinking it to fit the screen.

        The target size comes from ``self.opts.output_profile``
        (``screen_size`` when ``snb_full_screen`` is set, otherwise
        ``comic_screen_size``); 540x700 is used when ``self.opts`` is
        falsy. Images that already fit are written without resizing. The
        output format is taken from the extension of *imagePath*.
        """
        from calibre.utils.img import image_from_data, resize_image, image_to_data
        img = image_from_data(imageData)
        x, y = img.width(), img.height()
        if self.opts:
            if self.opts.snb_full_screen:
                SCREEN_X, SCREEN_Y = self.opts.output_profile.screen_size
            else:
                SCREEN_X, SCREEN_Y = self.opts.output_profile.comic_screen_size
        else:
            SCREEN_X = 540
            SCREEN_Y = 700
        # Handle big image only
        if x > SCREEN_X or y > SCREEN_Y:
            xScale = float(x) / SCREEN_X
            yScale = float(y) / SCREEN_Y
            # Use the larger ratio so both dimensions end up on screen
            scale = max(xScale, yScale)
            # TODO : intelligent image rotation
            # img = img.rotate(90)
            # x,y = y,x
            # scale is a float, so floor division yields floats; hand
            # resize_image whole pixel counts.
            img = resize_image(img, int(x // scale), int(y // scale))
        with lopen(imagePath, 'wb') as f:
            f.write(image_to_data(img, fmt=imagePath.rpartition('.')[-1]))
# Manual test harness: reads an already processed OEB directory from a
# hard-coded path and writes it to /tmp/test.snb.
if __name__ == '__main__':
    from calibre.ebooks.oeb.reader import OEBReader
    from calibre.ebooks.oeb.base import OEBBook
    from calibre.ebooks.conversion.preprocess import HTMLPreProcessor
    from calibre.customize.profiles import HanlinV3Output

    # Bare attribute holder standing in for the real options object
    class OptionValues(object):
        pass

    opts = OptionValues()
    opts.output_profile = HanlinV3Output(None)

    html_preprocessor = HTMLPreProcessor(None, None, opts)
    from calibre.utils.logging import default_log
    oeb = OEBBook(default_log, html_preprocessor)
    reader = OEBReader
    reader()(oeb, '/tmp/bbb/processed/')
    # NOTE(review): opts is passed as None here, not the OptionValues
    # instance built above -- confirm this is intended.
    SNBOutput(None).convert(oeb, '/tmp/test.snb', None, None, default_log)

View File

@@ -0,0 +1,39 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
from io import BytesIO
from calibre.customize.conversion import InputFormatPlugin
class TCRInput(InputFormatPlugin):
    """Input plugin that decompresses a TCR file and converts it as TXT."""

    name = 'TCR Input'
    author = 'John Schember'
    description = 'Convert TCR files to HTML'
    file_types = {'tcr'}
    commit_name = 'tcr_input'

    def convert(self, stream, options, file_ext, log, accelerators):
        """Decompress *stream* and delegate to the TXT input plugin.

        :param stream: open TCR file.
        :param options: conversion options; any TXT-plugin option missing
            from it is filled in with that option's recommended value.
        :param file_ext: unused here; the TXT plugin is called with 'txt'.
        :param log: logger.
        :param accelerators: forwarded to the TXT plugin.
        :return: whatever the TXT plugin's ``convert`` returns.
        """
        from calibre.ebooks.compression.tcr import decompress

        log.info('Decompressing text...')
        raw_txt = decompress(stream)

        log.info('Converting text to OEB...')
        stream = BytesIO(raw_txt)

        from calibre.customize.ui import plugin_for_input_format

        txt_plugin = plugin_for_input_format('txt')
        for opt in txt_plugin.options:
            # Test the options object being filled in, not this plugin's
            # class-level ``options`` collection, so values already present
            # are not overwritten.
            if not hasattr(options, opt.option.name):
                setattr(options, opt.option.name, opt.recommended_value)

        stream.seek(0)
        return txt_plugin.convert(stream, options,
                'txt', log, accelerators)

View File

@@ -0,0 +1,56 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import os
from calibre.customize.conversion import OutputFormatPlugin, \
OptionRecommendation
class TCROutput(OutputFormatPlugin):
    """Output plugin that writes a book as TCR-compressed text."""

    name = 'TCR Output'
    author = 'John Schember'
    file_type = 'tcr'
    commit_name = 'tcr_output'

    options = {
        OptionRecommendation(name='tcr_output_encoding', recommended_value='utf-8',
            level=OptionRecommendation.LOW,
            help=_('Specify the character encoding of the output document. '
            'The default is utf-8.'))}

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        """Render *oeb_book* as text, compress it and write it out.

        :param oeb_book: the book to convert.
        :param output_path: either a file-like object (anything with a
            ``write`` attribute) or a path; for a path, missing parent
            directories are created and the file is opened and closed here.
        :param input_plugin: unused here.
        :param opts: conversion options. ``flush_paras``,
            ``max_line_length``, ``force_max_line_length`` and
            ``indent_paras`` are overwritten on it before the text is
            extracted; ``tcr_output_encoding`` selects the encoding.
        :param log: logger.
        """
        from calibre.ebooks.txt.txtml import TXTMLizer
        from calibre.ebooks.compression.tcr import compress

        close = False
        if not hasattr(output_path, 'write'):
            close = True
            if not os.path.exists(os.path.dirname(output_path)) and os.path.dirname(output_path):
                os.makedirs(os.path.dirname(output_path))
            out_stream = lopen(output_path, 'wb')
        else:
            out_stream = output_path

        try:
            setattr(opts, 'flush_paras', False)
            setattr(opts, 'max_line_length', 0)
            setattr(opts, 'force_max_line_length', False)
            setattr(opts, 'indent_paras', False)

            writer = TXTMLizer(log)
            txt = writer.extract_content(oeb_book, opts).encode(opts.tcr_output_encoding, 'replace')

            log.info('Compressing text...')
            txt = compress(txt)

            out_stream.seek(0)
            out_stream.truncate()
            out_stream.write(txt)
        finally:
            # Only close streams opened here, and do so even when the
            # conversion or compression fails.
            if close:
                out_stream.close()

View File

@@ -0,0 +1,308 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import os
from calibre import _ent_pat, walk, xml_entity_to_unicode
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
from polyglot.builtins import getcwd
# Markdown extension name -> translated, human readable description.
# Used below for the TXT input plugin's ``ui_data`` and for the help text
# of its ``markdown_extensions`` option.
MD_EXTENSIONS = {
    'abbr': _('Abbreviations'),
    'admonition': _('Support admonitions'),
    'attr_list': _('Add attribute to HTML tags'),
    'codehilite': _('Add code highlighting via Pygments'),
    'def_list': _('Definition lists'),
    'extra': _('Enables various common extensions'),
    'fenced_code': _('Alternative code block syntax'),
    'footnotes': _('Footnotes'),
    'legacy_attrs': _('Use legacy element attributes'),
    'legacy_em': _('Use legacy underscore handling for connected words'),
    'meta': _('Metadata in the document'),
    'nl2br': _('Treat newlines as hard breaks'),
    'sane_lists': _('Do not allow mixing list types'),
    'smarty': _('Use markdown\'s internal smartypants parser'),
    'tables': _('Support tables'),
    'toc': _('Generate a table of contents'),
    'wikilinks': _('Wiki style links'),
}
class TXTInput(InputFormatPlugin):
name = 'TXT Input'
author = 'John Schember'
description = 'Convert TXT files to HTML'
file_types = {'txt', 'txtz', 'text', 'md', 'textile', 'markdown'}
commit_name = 'txt_input'
ui_data = {
'md_extensions': MD_EXTENSIONS,
'paragraph_types': {
'auto': _('Try to auto detect paragraph type'),
'block': _('Treat a blank line as a paragraph break'),
'single': _('Assume every line is a paragraph'),
'print': _('Assume every line starting with 2+ spaces or a tab starts a paragraph'),
'unformatted': _('Most lines have hard line breaks, few/no blank lines or indents'),
'off': _('Don\'t modify the paragraph structure'),
},
'formatting_types': {
'auto': _('Automatically decide which formatting processor to use'),
'plain': _('No formatting'),
'heuristic': _('Use heuristics to determine chapter headings, italics, etc.'),
'textile': _('Use the TexTile markup language'),
'markdown': _('Use the Markdown markup language')
},
}
options = {
OptionRecommendation(name='formatting_type', recommended_value='auto',
choices=list(ui_data['formatting_types']),
help=_('Formatting used within the document.\n'
'* auto: {auto}\n'
'* plain: {plain}\n'
'* heuristic: {heuristic}\n'
'* textile: {textile}\n'
'* markdown: {markdown}\n'
'To learn more about markdown see {url}').format(
url='https://daringfireball.net/projects/markdown/', **ui_data['formatting_types'])
),
OptionRecommendation(name='paragraph_type', recommended_value='auto',
choices=list(ui_data['paragraph_types']),
help=_('Paragraph structure to assume. The value of "off" is useful for formatted documents such as Markdown or Textile. '
'Choices are:\n'
'* auto: {auto}\n'
'* block: {block}\n'
'* single: {single}\n'
'* print: {print}\n'
'* unformatted: {unformatted}\n'
'* off: {off}').format(**ui_data['paragraph_types'])
),
OptionRecommendation(name='preserve_spaces', recommended_value=False,
help=_('Normally extra spaces are condensed into a single space. '
'With this option all spaces will be displayed.')),
OptionRecommendation(name='txt_in_remove_indents', recommended_value=False,
help=_('Normally extra space at the beginning of lines is retained. '
'With this option they will be removed.')),
OptionRecommendation(name="markdown_extensions", recommended_value='footnotes, tables, toc',
help=_('Enable extensions to markdown syntax. Extensions are formatting that is not part '
'of the standard markdown format. The extensions enabled by default: %default.\n'
'To learn more about markdown extensions, see {}\n'
'This should be a comma separated list of extensions to enable:\n'
).format('https://python-markdown.github.io/extensions/') + '\n'.join('* %s: %s' % (k, MD_EXTENSIONS[k]) for k in sorted(MD_EXTENSIONS))),
}
def shift_file(self, fname, data):
name, ext = os.path.splitext(fname)
candidate = os.path.join(self.output_dir, fname)
c = 0
while os.path.exists(candidate):
c += 1
candidate = os.path.join(self.output_dir, '{}-{}{}'.format(name, c, ext))
ans = candidate
with open(ans, 'wb') as f:
f.write(data)
return f.name
def fix_resources(self, html, base_dir):
from html5_parser import parse
root = parse(html)
changed = False
for img in root.xpath('//img[@src]'):
src = img.get('src')
prefix = src.split(':', 1)[0].lower()
if prefix not in ('file', 'http', 'https', 'ftp') and not os.path.isabs(src):
src = os.path.join(base_dir, src)
if os.access(src, os.R_OK):
with open(src, 'rb') as f:
data = f.read()
f = self.shift_file(os.path.basename(src), data)
changed = True
img.set('src', os.path.basename(f))
if changed:
from lxml import etree
html = etree.tostring(root, encoding='unicode')
return html
def convert(self, stream, options, file_ext, log,
accelerators):
from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
from calibre.ebooks.chardet import detect
from calibre.utils.zipfile import ZipFile
from calibre.ebooks.txt.processor import (convert_basic,
convert_markdown_with_metadata, separate_paragraphs_single_line,
separate_paragraphs_print_formatted, preserve_spaces,
detect_paragraph_type, detect_formatting_type,
normalize_line_endings, convert_textile, remove_indents,
block_to_single_line, separate_hard_scene_breaks)
self.log = log
txt = b''
log.debug('Reading text from file...')
length = 0
base_dir = self.output_dir = getcwd()
# Extract content from zip archive.
if file_ext == 'txtz':
zf = ZipFile(stream)
zf.extractall('.')
for x in walk('.'):
if os.path.splitext(x)[1].lower() in ('.txt', '.text'):
with open(x, 'rb') as tf:
txt += tf.read() + b'\n\n'
else:
if getattr(stream, 'name', None):
base_dir = os.path.dirname(stream.name)
txt = stream.read()
if file_ext in {'md', 'textile', 'markdown'}:
options.formatting_type = {'md': 'markdown'}.get(file_ext, file_ext)
log.info('File extension indicates particular formatting. '
'Forcing formatting type to: %s'%options.formatting_type)
options.paragraph_type = 'off'
# Get the encoding of the document.
if options.input_encoding:
ienc = options.input_encoding
log.debug('Using user specified input encoding of %s' % ienc)
else:
det_encoding = detect(txt[:4096])
det_encoding, confidence = det_encoding['encoding'], det_encoding['confidence']
if det_encoding and det_encoding.lower().replace('_', '-').strip() in (
'gb2312', 'chinese', 'csiso58gb231280', 'euc-cn', 'euccn',
'eucgb2312-cn', 'gb2312-1980', 'gb2312-80', 'iso-ir-58'):
# Microsoft Word exports to HTML with encoding incorrectly set to
# gb2312 instead of gbk. gbk is a superset of gb2312, anyway.
det_encoding = 'gbk'
ienc = det_encoding
log.debug('Detected input encoding as %s with a confidence of %s%%' % (ienc, confidence * 100))
if not ienc:
ienc = 'utf-8'
log.debug('No input encoding specified and could not auto detect using %s' % ienc)
# Remove BOM from start of txt as its presence can confuse markdown
import codecs
for bom in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE, codecs.BOM_UTF8, codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE):
if txt.startswith(bom):
txt = txt[len(bom):]
break
txt = txt.decode(ienc, 'replace')
# Replace entities
txt = _ent_pat.sub(xml_entity_to_unicode, txt)
# Normalize line endings
txt = normalize_line_endings(txt)
# Determine the paragraph type of the document.
if options.paragraph_type == 'auto':
options.paragraph_type = detect_paragraph_type(txt)
if options.paragraph_type == 'unknown':
log.debug('Could not reliably determine paragraph type using block')
options.paragraph_type = 'block'
else:
log.debug('Auto detected paragraph type as %s' % options.paragraph_type)
# Detect formatting
if options.formatting_type == 'auto':
options.formatting_type = detect_formatting_type(txt)
log.debug('Auto detected formatting as %s' % options.formatting_type)
if options.formatting_type == 'heuristic':
setattr(options, 'enable_heuristics', True)
setattr(options, 'unwrap_lines', False)
setattr(options, 'smarten_punctuation', True)
# Reformat paragraphs to block formatting based on the detected type.
# We don't check for block because the processor assumes block.
# single and print at transformed to block for processing.
if options.paragraph_type == 'single':
txt = separate_paragraphs_single_line(txt)
elif options.paragraph_type == 'print':
txt = separate_hard_scene_breaks(txt)
txt = separate_paragraphs_print_formatted(txt)
txt = block_to_single_line(txt)
elif options.paragraph_type == 'unformatted':
from calibre.ebooks.conversion.utils import HeuristicProcessor
# unwrap lines based on punctuation
docanalysis = DocAnalysis('txt', txt)
length = docanalysis.line_length(.5)
preprocessor = HeuristicProcessor(options, log=getattr(self, 'log', None))
txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
txt = separate_paragraphs_single_line(txt)
elif options.paragraph_type == 'block':
txt = separate_hard_scene_breaks(txt)
txt = block_to_single_line(txt)
if getattr(options, 'enable_heuristics', False) and getattr(options, 'dehyphenate', False):
docanalysis = DocAnalysis('txt', txt)
if not length:
length = docanalysis.line_length(.5)
dehyphenator = Dehyphenator(options.verbose, log=self.log)
txt = dehyphenator(txt,'txt', length)
# User requested transformation on the text.
if options.txt_in_remove_indents:
txt = remove_indents(txt)
# Preserve spaces will replace multiple spaces to a space
# followed by the &nbsp; entity.
if options.preserve_spaces:
txt = preserve_spaces(txt)
# Process the text using the appropriate text processor.
self.shifted_files = []
try:
html = ''
input_mi = None
if options.formatting_type == 'markdown':
log.debug('Running text through markdown conversion...')
try:
input_mi, html = convert_markdown_with_metadata(txt, extensions=[x.strip() for x in options.markdown_extensions.split(',') if x.strip()])
except RuntimeError:
raise ValueError('This txt file has malformed markup, it cannot be'
' converted by calibre. See https://daringfireball.net/projects/markdown/syntax')
html = self.fix_resources(html, base_dir)
elif options.formatting_type == 'textile':
log.debug('Running text through textile conversion...')
html = convert_textile(txt)
html = self.fix_resources(html, base_dir)
else:
log.debug('Running text through basic conversion...')
flow_size = getattr(options, 'flow_size', 0)
html = convert_basic(txt, epub_split_size_kb=flow_size)
# Run the HTMLized text through the html processing plugin.
from calibre.customize.ui import plugin_for_input_format
html_input = plugin_for_input_format('html')
for opt in html_input.options:
setattr(options, opt.option.name, opt.recommended_value)
options.input_encoding = 'utf-8'
htmlfile = self.shift_file('index.html', html.encode('utf-8'))
odi = options.debug_pipeline
options.debug_pipeline = None
# Generate oeb from html conversion.
oeb = html_input.convert(open(htmlfile, 'rb'), options, 'html', log, {})
options.debug_pipeline = odi
finally:
for x in self.shifted_files:
os.remove(x)
# Set metadata from file.
if input_mi is None:
from calibre.customize.ui import get_file_type_metadata
input_mi = get_file_type_metadata(stream, file_ext)
from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
meta_info_to_oeb_metadata(input_mi, oeb.metadata, log)
self.html_postprocess_title = input_mi.title
return oeb
def postprocess_book(self, oeb, opts, log):
    '''
    Replace placeholder titles in the converted book.

    Every spine item whose parsed data supports ``xpath`` is searched for
    elements whose local name is ``title``; any whose text equals the
    translated string 'Unknown' gets the title remembered in
    ``self.html_postprocess_title``. ``opts`` and ``log`` are unused.
    '''
    title_query = '//*[local-name()="title"]'
    for spine_item in oeb.spine:
        root = spine_item.data
        if not hasattr(root, 'xpath'):
            # Data without xpath support is left untouched.
            continue
        for node in root.xpath(title_query):
            if node.text == _('Unknown'):
                node.text = self.html_postprocess_title

View File

@@ -0,0 +1,165 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL 3'
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
__docformat__ = 'restructuredtext en'
import os
import shutil
from calibre.customize.conversion import OutputFormatPlugin, \
OptionRecommendation
from calibre.ptempfile import TemporaryDirectory, TemporaryFile
# Accepted values for the 'newline' conversion option; also exposed to the UI
# through TXTOutput.ui_data['newline_types'].
NEWLINE_TYPES = ['system', 'unix', 'old_mac', 'windows']
class TXTOutput(OutputFormatPlugin):
    '''
    Output plugin that serialises an OEB book to a single text file.

    Depending on the ``txt_output_formatting`` option the text is produced by
    the plain text, Markdown or Textile writer. The writer instance is kept
    in ``self.writer`` after :meth:`convert` has run.
    '''

    name = 'TXT Output'
    author = 'John Schember'
    file_type = 'txt'
    commit_name = 'txt_output'
    ui_data = {
            'newline_types': NEWLINE_TYPES,
            'formatting_types': {
                'plain': _('Plain text'),
                'markdown': _('Markdown formatted text'),
                'textile': _('TexTile formatted text')
            },
    }

    options = {
        OptionRecommendation(name='newline', recommended_value='system',
            level=OptionRecommendation.LOW,
            short_switch='n', choices=NEWLINE_TYPES,
            help=_('Type of newline to use. Options are %s. Default is \'system\'. '
                'Use \'old_mac\' for compatibility with Mac OS 9 and earlier. '
                'For macOS use \'unix\'. \'system\' will default to the newline '
                'type used by this OS.') % sorted(NEWLINE_TYPES)),
        OptionRecommendation(name='txt_output_encoding', recommended_value='utf-8',
            level=OptionRecommendation.LOW,
            help=_('Specify the character encoding of the output document. '
            'The default is utf-8.')),
        OptionRecommendation(name='inline_toc',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Add Table of Contents to beginning of the book.')),
        OptionRecommendation(name='max_line_length',
            recommended_value=0, level=OptionRecommendation.LOW,
            help=_('The maximum number of characters per line. This splits on '
            'the first space before the specified value. If no space is found '
            'the line will be broken at the space after and will exceed the '
            'specified value. Also, there is a minimum of 25 characters. '
            'Use 0 to disable line splitting.')),
        OptionRecommendation(name='force_max_line_length',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Force splitting on the max-line-length value when no space '
            'is present. Also allows max-line-length to be below the minimum')),
        OptionRecommendation(name='txt_output_formatting',
             recommended_value='plain',
             choices=list(ui_data['formatting_types']),
             help=_('Formatting used within the document.\n'
                    '* plain: {plain}\n'
                    '* markdown: {markdown}\n'
                    '* textile: {textile}').format(**ui_data['formatting_types'])),
        OptionRecommendation(name='keep_links',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Do not remove links within the document. This is only '
            'useful when paired with a txt-output-formatting option that '
            'is not none because links are always removed with plain text output.')),
        OptionRecommendation(name='keep_image_references',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Do not remove image references within the document. This is only '
            'useful when paired with a txt-output-formatting option that '
            'is not none because links are always removed with plain text output.')),
        OptionRecommendation(name='keep_color',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Do not remove font color from output. This is only useful when '
                   'txt-output-formatting is set to textile. Textile is the only '
                   'formatting that supports setting font color. If this option is '
                   'not specified font color will not be set and default to the '
                   'color displayed by the reader (generally this is black).')),
     }

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        '''
        Write ``oeb_book`` as text to ``output_path``.

        :param oeb_book: the book to serialise; handed to the writer's
            ``extract_content``.
        :param output_path: either a filesystem path or an object with a
            ``write`` method. A path is opened (missing parent directories
            are created) and closed here; a stream is rewound, truncated and
            left open for the caller.
        :param input_plugin: unused by this method.
        :param opts: conversion options; ``txt_output_formatting``,
            ``newline`` and ``txt_output_encoding`` are read here.
        :param log: logger used for debug output and passed to the writer.
        '''
        from calibre.ebooks.txt.txtml import TXTMLizer
        from calibre.utils.cleantext import clean_ascii_chars
        from calibre.ebooks.txt.newlines import specified_newlines, TxtNewlines

        formatting = opts.txt_output_formatting.lower()
        if formatting == 'markdown':
            from calibre.ebooks.txt.markdownml import MarkdownMLizer
            self.writer = MarkdownMLizer(log)
        elif formatting == 'textile':
            from calibre.ebooks.txt.textileml import TextileMLizer
            self.writer = TextileMLizer(log)
        else:
            self.writer = TXTMLizer(log)

        txt = self.writer.extract_content(oeb_book, opts)
        txt = clean_ascii_chars(txt)

        log.debug('\tReplacing newlines with selected type...')
        txt = specified_newlines(TxtNewlines(opts.newline).newline, txt)

        # Only a stream opened here is closed here; caller-supplied streams
        # stay open.
        close = not hasattr(output_path, 'write')
        if close:
            out_dir = os.path.dirname(output_path)
            if out_dir != '' and not os.path.exists(out_dir):
                os.makedirs(out_dir)
            out_stream = open(output_path, 'wb')
        else:
            out_stream = output_path

        try:
            out_stream.seek(0)
            out_stream.truncate()
            # Characters the target encoding cannot represent are replaced
            # rather than aborting the conversion.
            out_stream.write(txt.encode(opts.txt_output_encoding, 'replace'))
        finally:
            # Release the handle even when seek/truncate/write raise.
            if close:
                out_stream.close()
class TXTZOutput(TXTOutput):
    '''
    Output plugin producing a TXTZ archive: the text output of
    :class:`TXTOutput` zipped together with the book's images and an OPF
    metadata file.
    '''

    name = 'TXTZ Output'
    author = 'John Schember'
    file_type = 'txtz'

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        '''
        Build the archive contents in a temporary directory and zip them
        into ``output_path``.

        The text is written as ``index.text`` for Textile formatting and
        ``index.txt`` otherwise. Parameters have the same meaning as in
        ``TXTOutput.convert``.
        '''
        from calibre.ebooks.oeb.base import OEB_IMAGES
        from calibre.utils.zipfile import ZipFile
        from lxml import etree

        with TemporaryDirectory('_txtz_output') as tdir:
            # TXT
            txt_name = 'index.txt'
            if opts.txt_output_formatting.lower() == 'textile':
                txt_name = 'index.text'
            with TemporaryFile(txt_name) as tf:
                TXTOutput.convert(self, oeb_book, tf, input_plugin, opts, log)
                shutil.copy(tf, os.path.join(tdir, txt_name))

            # Images
            for item in oeb_book.manifest:
                if item.media_type not in OEB_IMAGES:
                    continue
                if hasattr(self.writer, 'images'):
                    # The writer tracks the images it referenced; anything
                    # it did not reference is left out of the archive.
                    if item.href not in self.writer.images:
                        continue
                    path = os.path.join(tdir, 'images')
                    href = self.writer.images[item.href]
                else:
                    path = os.path.join(tdir, os.path.dirname(item.href))
                    href = os.path.basename(item.href)
                if not os.path.exists(path):
                    os.makedirs(path)
                with open(os.path.join(path, href), 'wb') as imgf:
                    imgf.write(item.data)

            # Metadata
            with open(os.path.join(tdir, 'metadata.opf'), 'wb') as mdataf:
                mdataf.write(etree.tostring(oeb_book.metadata.to_opf1()))

            txtz = ZipFile(output_path, 'w')
            try:
                txtz.add_dir(tdir)
            finally:
                # Close explicitly so the archive is finalised here instead
                # of whenever the object happens to be garbage collected.
                txtz.close()

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,646 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import functools, re, json
from math import ceil
from calibre import entity_to_unicode, as_unicode
from polyglot.builtins import unicode_type, range
# Matches an XML declaration (<?xml ... ?>) at the very start of a document.
XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>')
# Namespace URIs added to <html> when svg:/xlink: prefixes are used
# without being declared.
SVG_NS = 'http://www.w3.org/2000/svg'
XLINK_NS = 'http://www.w3.org/1999/xlink'

# Entity converter used as a regex substitution callback; the characters
# listed in result_exceptions are mapped to the given entity text.
convert_entities = functools.partial(entity_to_unicode,
        result_exceptions={
            '<' : '&lt;',
            '>' : '&gt;',
            "'" : '&apos;',
            '"' : '&quot;',
            '&' : '&amp;',
        })
# Matches a complete <span>...</span> element (non-greedy, across newlines).
_span_pat = re.compile('<span.*?</span>', re.DOTALL|re.IGNORECASE)

# Typographic ligature characters and the plain letters they are replaced
# with; the commented-out entries are currently not replaced.
LIGATURES = {
        # '\u00c6': 'AE',
        # '\u00e6': 'ae',
        # '\u0152': 'OE',
        # '\u0153': 'oe',
        # '\u0132': 'IJ',
        # '\u0133': 'ij',
        # '\u1D6B': 'ue',
        '\uFB00': 'ff',
        '\uFB01': 'fi',
        '\uFB02': 'fl',
        '\uFB03': 'ffi',
        '\uFB04': 'ffl',
        '\uFB05': 'ft',
        '\uFB06': 'st',
        }

# Alternation of every ligature character in LIGATURES.
_ligpat = re.compile('|'.join(LIGATURES))
def sanitize_head(match):
    '''
    Regex substitution callback that rebuilds a <head> element.

    Group 1 of ``match`` is taken as the head contents; every
    <span>...</span> element in it is removed and the remainder is wrapped
    in a plain <head> tag pair.
    '''
    head_contents = _span_pat.sub('', match.group(1))
    return '<head>\n%s\n</head>' % head_contents
def chap_head(match):
    '''
    Regex substitution callback that formats a chapter heading.

    Uses the named groups ``chap`` and ``title``. The chapter text becomes
    an <h1>; a non-empty title follows as an <h3>, otherwise a <br/> is
    appended instead.
    '''
    chap, title = match.group('chap', 'title')
    heading = '<h1>' + chap + '</h1>'
    if title:
        return heading + '\n<h3>' + title + '</h3>\n'
    return heading + '<br/>\n'
def wrap_lines(match):
    '''
    Regex substitution callback used when unwrapping lines.

    Returns a single space, preceded by the text of the named group
    ``ital`` when that group matched something.
    '''
    ital = match.group('ital')
    return ital + ' ' if ital else ' '
def smarten_punctuation(html, log=None):
    '''
    Run ``html`` through smartyPants and return the result with entities
    substituted.

    HTML comment delimiters are swapped for unique placeholder strings
    before processing and restored afterwards. ``log`` is passed to the
    HeuristicProcessor whose ``fix_nbsp_indents`` is applied first.
    '''
    from uuid import uuid4
    from calibre.ebooks.chardet import substitute_entites
    from calibre.ebooks.conversion.utils import HeuristicProcessor
    from calibre.utils.smartypants import smartyPants

    comment_open = 'calibre-smartypants-' + unicode_type(uuid4())
    comment_close = 'calibre-smartypants-' + unicode_type(uuid4())

    protected = html.replace('<!--', comment_open).replace('-->', comment_close)
    protected = HeuristicProcessor(log=log).fix_nbsp_indents(protected)
    smartened = smartyPants(protected)
    restored = smartened.replace(comment_open, '<!--').replace(comment_close, '-->')
    return substitute_entites(restored)
class DocAnalysis(object):
    '''
    Provides various text analysis functions to determine how the document
    is structured.

    format is the type of document the analysis will be done against; one
    of 'html', 'pdf', 'spanned_html' or 'txt'.
    raw is the raw text to determine the line length to use for wrapping.
    Blank lines are excluded from analysis.
    '''

    def __init__(self, format='html', raw=''):
        '''
        Split ``raw`` into the "lines" appropriate for ``format`` and store
        them in ``self.lines``.

        Raises ValueError for an unsupported ``format``.
        '''
        raw = raw.replace('&nbsp;', ' ')
        if format == 'html':
            linere = re.compile(r'(?<=<p)(?![^>]*>\s*</p>).*?(?=</p>)', re.DOTALL)
        elif format == 'pdf':
            linere = re.compile(r'(?<=<br>)(?!\s*<br>).*?(?=<br>)', re.DOTALL)
        elif format == 'spanned_html':
            linere = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL)
        elif format == 'txt':
            linere = re.compile('.*?\n')
        else:
            # Fail with a clear message instead of an UnboundLocalError.
            raise ValueError('Unsupported document format: %r' % (format,))
        self.lines = linere.findall(raw)

    def line_length(self, percent):
        '''
        Analyses the document to find the median line length.

        percent is a decimal number, 0 - 1, which is used to determine
        how far in the list of line lengths to use. Values outside that
        range are clamped. The list of line lengths is ordered smallest to
        largest, does not include duplicates, and lengths greater than
        twice the average are discarded. 0.5 is the median value.

        Returns 0 when the document has no non-empty lines.
        '''
        lengths = sorted(set(len(line) for line in self.lines if len(line) > 0))
        if not lengths:
            return 0

        avg = sum(lengths) / len(lengths)
        max_line = ceil(avg * 2)
        # Drop outliers. The smallest length is never above the average, so
        # at least one entry always survives.
        lengths = [length for length in lengths if length <= max_line]

        percent = min(max(percent, 0), 1)
        # Clamp at 0: for small percentages the computed index would be -1,
        # which selects the LARGEST length instead of the smallest.
        index = max(int(len(lengths) * percent) - 1, 0)
        return lengths[index]

    def line_histogram(self, percent):
        '''
        Creates a broad histogram of the document to determine whether it
        incorporates hard line breaks. Lines are sorted into 20 'buckets'
        based on length (one bucket per 100 characters).

        percent is the fraction of all lines that should be in a single
        bucket to return True. The majority of the lines will exist in 1-2
        buckets in typical docs with hard line breaks.
        '''
        minLineLength = 20    # Ignore lines under 20 chars (typical of spaces)
        maxLineLength = 1900  # Discard larger than this to stay in range
        buckets = 20          # Each line is divided into a bucket based on length

        # Build the line length histogram
        hRaw = [0] * buckets
        for line in self.lines:
            length = len(line)
            if minLineLength < length < maxLineLength:
                hRaw[length // 100] += 1

        # Normalize against ALL lines, including those skipped above, and
        # find the biggest bucket. With no lines at all the share is 0.
        totalLines = len(self.lines)
        maxValue = 0
        if totalLines > 0:
            maxValue = max(float(count) / totalLines for count in hRaw)

        # False: line lengths are too variable to be hard-wrapped text.
        return not (maxValue < percent)
class Dehyphenator(object):
    '''
    Analyzes words to determine whether hyphens should be retained/removed.
    Uses the document itself as a dictionary. This method handles all
    languages along with uncommon, made-up, and scientific words. The
    primary disadvantage is that words appearing only once in the document
    retain hyphens.
    '''

    def __init__(self, verbose=0, log=None):
        '''
        :param verbose: verbosity level; decisions are logged when it is
            greater than 2.
        :param log: callable used for that logging.
        '''
        self.log = log
        self.verbose = verbose
        # Add common suffixes to the regex below to increase the likelihood of a match -
        # don't add suffixes which are also complete words, such as 'able' or 'sex'
        # only remove if it's not already the point of hyphenation
        self.suffix_string = (
            "((ed)?ly|'?e?s||a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|"
            "(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|m?ents?|ism|ated|"
            "(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex|ian)$")
        self.suffixes = re.compile(r"^%s" % self.suffix_string, re.IGNORECASE)
        self.removesuffixes = re.compile(r"%s" % self.suffix_string, re.IGNORECASE)
        # remove prefixes if the prefix was not already the point of hyphenation
        self.prefix_string = '^(dis|re|un|in|ex)'
        self.prefixes = re.compile(r'%s$' % self.prefix_string, re.IGNORECASE)
        self.removeprefix = re.compile(r'%s' % self.prefix_string, re.IGNORECASE)

    def dehyphenate(self, match):
        '''
        Substitution callback: decide how one candidate word split should
        be rendered.

        ``match`` must provide the groups ``firstpart`` and ``secondpart``
        and may provide ``wraptags``. Returns the joined word when it (or
        its stem, with common suffix/prefix removed) occurs elsewhere in
        the document, otherwise the hyphenated form. In the ``*_cleanup``
        formats a word found in neither form is returned with an em dash
        followed by the original wrapping markup.
        '''
        firsthalf = match.group('firstpart')
        secondhalf = match.group('secondpart')
        try:
            wraptags = match.group('wraptags')
        except IndexError:
            # Not every pattern defines a 'wraptags' group.
            wraptags = ''
        hyphenated = unicode_type(firsthalf) + "-" + unicode_type(secondhalf)
        dehyphenated = unicode_type(firsthalf) + unicode_type(secondhalf)
        if self.suffixes.match(secondhalf) is None:
            lookupword = self.removesuffixes.sub('', dehyphenated)
        else:
            lookupword = dehyphenated
        if len(firsthalf) > 4 and self.prefixes.match(firsthalf) is None:
            lookupword = self.removeprefix.sub('', lookupword)
        if self.verbose > 2:
            self.log("lookup word is: "+lookupword+", orig is: " + hyphenated)
        try:
            searchresult = self.html.find(lookupword.lower())
        except Exception:
            # Best effort: if the lookup itself fails keep the hyphen.
            return hyphenated

        found = self.html.find(lookupword) != -1 or searchresult != -1

        if self.format == 'html_cleanup' or self.format == 'txt_cleanup':
            if found:
                if self.verbose > 2:
                    self.log("    Cleanup:returned dehyphenated word: " + dehyphenated)
                return dehyphenated
            elif self.html.find(hyphenated) != -1:
                if self.verbose > 2:
                    self.log("        Cleanup:returned hyphenated word: " + hyphenated)
                return hyphenated
            else:
                if self.verbose > 2:
                    self.log("            Cleanup:returning original text "+firsthalf+" + linefeed "+secondhalf)
                return firsthalf+'\u2014'+wraptags+secondhalf

        if self.format == 'individual_words' and len(firsthalf) + len(secondhalf) <= 6:
            if self.verbose > 2:
                self.log("too short, returned hyphenated word: " + hyphenated)
            return hyphenated
        if len(firsthalf) <= 2 and len(secondhalf) <= 2:
            if self.verbose > 2:
                self.log("too short, returned hyphenated word: " + hyphenated)
            return hyphenated
        if found:
            if self.verbose > 2:
                self.log("     returned dehyphenated word: " + dehyphenated)
            return dehyphenated
        if self.verbose > 2:
            self.log("          returned hyphenated word: " + hyphenated)
        return hyphenated

    def __call__(self, html, format, length=1):
        '''
        Dehyphenate ``html`` and return the result.

        :param html: the text to process; it also serves as the dictionary
            that candidate words are looked up in.
        :param format: one of 'html', 'pdf', 'txt', 'individual_words',
            'html_cleanup' or 'txt_cleanup'; selects the pattern used to
            find split words.
        :param length: minimum number of characters that must precede a
            candidate on its line (used by 'html', 'pdf' and 'txt').
        :raises ValueError: for an unsupported ``format``.
        '''
        self.html = html
        self.format = format
        if format == 'html':
            intextmatch = re.compile((
                r'(?<=.{%i})(?P<firstpart>[^\W\-]+)(-|)\s*(?=<)(?P<wraptags>(</span>)?'
                r'\s*(</[iubp]>\s*){1,2}(?P<up2threeblanks><(p|div)[^>]*>\s*(<p[^>]*>\s*</p>\s*)'
                r'?</(p|div)>\s+){0,3}\s*(<[iubp][^>]*>\s*){1,2}(<span[^>]*>)?)\s*(?P<secondpart>[\w\d]+)') % length)
        elif format == 'pdf':
            intextmatch = re.compile((
                r'(?<=.{%i})(?P<firstpart>[^\W\-]+)(-|)\s*(?P<wraptags><p>|'
                r'</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)')% length)
        elif format == 'txt':
            intextmatch = re.compile(
                '(?<=.{%i})(?P<firstpart>[^\\W\\-]+)(-|)(\u0020|\u0009)*(?P<wraptags>(\n(\u0020|\u0009)*)+)(?P<secondpart>[\\w\\d]+)'% length)
        elif format == 'individual_words':
            intextmatch = re.compile(
                r'(?!<)(?P<firstpart>[^\W\-]+)(-|)\s*(?P<secondpart>\w+)(?![^<]*?>)', re.UNICODE)
        elif format == 'html_cleanup':
            intextmatch = re.compile(
                r'(?P<firstpart>[^\W\-]+)(-|)\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>'
                r'\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)')
        elif format == 'txt_cleanup':
            intextmatch = re.compile(
                r'(?P<firstpart>[^\W\-]+)(-|)(?P<wraptags>\s+)(?P<secondpart>[\w\d]+)')
        else:
            # Fail with a clear message instead of an UnboundLocalError.
            raise ValueError('Unsupported dehyphenation format: %r' % (format,))

        html = intextmatch.sub(self.dehyphenate, html)
        return html
class CSSPreProcessor(object):
    '''
    Cleans up CSS before conversion: strips Microsoft-specific declarations
    and optionally inserts the XHTML CSS namespace rule.
    '''

    # Remove some of the broken CSS Microsoft products
    # create
    MS_PAT = re.compile(r'''
        (?P<start>^|;|\{)\s*    # The end of the previous rule or block start
        (%s).+?                 # The invalid selectors
        (?P<end>$|;|\})         # The end of the declaration
        '''%'mso-|panose-|text-underline|tab-interval',
        re.MULTILINE|re.IGNORECASE|re.VERBOSE)

    def ms_sub(self, match):
        '''
        Substitution callback for MS_PAT: drop the matched declaration,
        keeping the delimiter before it and, unless it is a semicolon, the
        delimiter after it.
        '''
        # Both named groups are part of every MS_PAT match, so no exception
        # handling is needed to read them.
        start = match.group('start')
        end = match.group('end')
        if end == ';':
            end = ''
        return start + end

    def __call__(self, data, add_namespace=False):
        '''
        Return ``data`` with Microsoft-specific declarations removed.

        When ``add_namespace`` is true, comments are stripped as well and
        the XHTML CSS namespace rule is inserted before the first line that
        is neither blank nor an @import/@charset rule.
        '''
        data = self.MS_PAT.sub(self.ms_sub, data)
        if not add_namespace:
            return data

        # Imported only when actually needed.
        from calibre.ebooks.oeb.base import XHTML_CSS_NAMESPACE

        # Remove comments as the following namespace logic will break if there
        # are commented lines before the first @import or @charset rule. Since
        # the conversion will remove all stylesheets anyway, we don't lose
        # anything
        data = re.sub(unicode_type(r'/\*.*?\*/'), '', data, flags=re.DOTALL)

        ans, namespaced = [], False
        for line in data.splitlines():
            ll = line.lstrip()
            if not (namespaced or ll.startswith('@import') or not ll or
                    ll.startswith('@charset')):
                ans.append(XHTML_CSS_NAMESPACE.strip())
                namespaced = True
            ans.append(line)

        return '\n'.join(ans)
def accent_regex(accent_maps, letter_before=False):
    '''
    Build a (pattern, substitution callback) pair that merges a free
    standing accent character with the adjacent letter.

    :param accent_maps: dict mapping an accent character to a string of the
        form ``'<plain letters>:<accented letters>'``; both halves must have
        the same length. The dict is not modified.
    :param letter_before: when True the letter precedes the accent in the
        text, otherwise the accent comes first.
    :returns: ``(pat, sub)`` suitable for ``pat.sub(sub, text)``. Accent and
        letter may be separated by whitespace and at most one <br> tag.
        Pairs with no mapping for that accent are left unchanged.
    :raises ValueError: if the two halves of a mapping differ in length.
    '''
    accent_cat = set()
    letters = set()
    # Parsed mappings live in a private dict so the caller's argument is
    # left untouched and may be reused.
    letter_maps = {}

    for accent, spec in accent_maps.items():
        k, v = spec.split(':', 1)
        if len(k) != len(v):
            raise ValueError('Invalid mapping for: {} -> {}'.format(k, v))
        lmap = letter_maps[accent] = dict(zip(k, v))
        accent_cat.add(accent)
        letters |= set(lmap)

    # Escape every character: inside [...] an unescaped ^, ], - or \ would
    # change the meaning of the character class.
    accent_class = ''.join(re.escape(c) for c in accent_cat)
    letter_class = ''.join(re.escape(c) for c in letters)

    if letter_before:
        args = letter_class, accent_class
        accent_group, letter_group = 2, 1
    else:
        args = accent_class, letter_class
        accent_group, letter_group = 1, 2

    pat = re.compile(r'([{}])\s*(?:<br[^>]*>){{0,1}}\s*([{}])'.format(*args), re.UNICODE)

    def sub(m):
        lmap = letter_maps[m.group(accent_group)]
        return lmap.get(m.group(letter_group)) or m.group()

    return pat, sub
def html_preprocess_rules():
    '''
    Return the list of (compiled pattern, replacement) rules applied to all
    HTML input. The list is built on first use and cached as an attribute
    of this function.
    '''
    cached = getattr(html_preprocess_rules, 'ans', None)
    if cached is not None:
        return cached

    # Remove huge block of contiguous spaces as they slow down
    # the following regexes pretty badly
    strip_space_runs = (re.compile(r'\s{10000,}'), '')
    # Some idiotic HTML generators (Frontpage I'm looking at you)
    # Put all sorts of crap into <head>. This messes up lxml
    clean_head = (
        re.compile(r'<head[^>]*>\n*(.*?)\n*</head>', re.IGNORECASE|re.DOTALL),
        sanitize_head)
    # Convert all entities, since lxml doesn't handle them well
    resolve_entities = (re.compile(r'&(\S+?);'), convert_entities)
    # Remove the <![if/endif tags inserted by everybody's darling, MS Word
    strip_word_conditionals = (
        re.compile(r'</{0,1}!\[(end){0,1}if\]{0,1}>', re.IGNORECASE), '')

    cached = [strip_space_runs, clean_head, resolve_entities,
              strip_word_conditionals]
    html_preprocess_rules.ans = cached
    return cached
def pdftohtml_rules():
    '''
    Return the list of (compiled pattern, replacement) rules applied to
    HTML produced by pdftohtml. The list is built on first use and cached
    as an attribute of this function.
    '''
    cached = getattr(pdftohtml_rules, 'ans', None)
    if cached is not None:
        return cached

    # Free-standing accent followed by the letter it belongs to.
    accent_then_letter = accent_regex({
        '¨': 'aAeEiIoOuU:äÄëËïÏöÖüÜ',
        '`': 'aAeEiIoOuU:àÀèÈìÌòÒùÙ',
        '´': 'aAcCeEiIlLoOnNrRsSuUzZ:áÁćĆéÉíÍĺĹóÓńŃŕŔśŚúÚźŹ',
        'ˆ': 'aAeEiIoOuU:âÂêÊîÎôÔûÛ',
        '¸': 'cC:çÇ',
        '˛': 'aAeE:ąĄęĘ',
        '˙': 'zZ:żŻ',
        'ˇ': 'cCdDeElLnNrRsStTzZ:čČďĎěĚľĽňŇřŘšŠťŤžŽ',
        '°': 'uU:ůŮ',
    })
    # Letter followed by a free-standing grave accent.
    letter_then_accent = accent_regex(
        {'`': 'aAeEiIoOuU:àÀèÈìÌòÒùÙ'}, letter_before=True)

    cached = [
        accent_then_letter,
        letter_then_accent,

        # If pdf printed from a browser then the header/footer has a reliable pattern
        (re.compile(r'((?<=</a>)\s*file:/{2,4}[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''),

        # Center separator lines
        (re.compile(r'<br>\s*(?P<break>([*#•✦=] *){3,})\s*<br>'), lambda match: '<p>\n<p style="text-align:center">' + match.group('break') + '</p>'),

        # Remove <hr> tags
        (re.compile(r'<hr.*?>', re.IGNORECASE), ''),

        # Remove gray background
        (re.compile(r'<BODY[^<>]+>'), '<BODY>'),

        # Convert line breaks to paragraphs
        (re.compile(r'<br[^>]*>\s*'), '</p>\n<p>'),
        (re.compile(r'<body[^>]*>\s*'), '<body>\n<p>'),
        (re.compile(r'\s*</body>'), '</p>\n</body>'),

        # Clean up spaces
        (re.compile(r'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), ' '),

        # Add space before and after italics
        (re.compile(r'(?<!“)<i>'), ' <i>'),
        (re.compile(r'</i>(?=\w)'), '</i> '),
    ]
    pdftohtml_rules.ans = cached
    return cached
def book_designer_rules():
    '''
    Return the list of (compiled pattern, replacement) rules applied to
    HTML produced by Book Designer. The list is built on first use and
    cached as an attribute of this function.
    '''
    ans = getattr(book_designer_rules, 'ans', None)
    if ans is None:
        ans = book_designer_rules.ans = [
            # HR
            (re.compile('<hr>', re.IGNORECASE),
             lambda match : '<span style="page-break-after:always"> </span>'),
            # Create header tags
            # Group 3 captures the heading text so the replacement can
            # re-emit it; group 2 is the optional align value.
            (re.compile(r'<h2[^><]*?id=BookTitle[^><]*?(align=)*(?(1)(\w+))*[^><]*?>([^><]*?)</h2>', re.IGNORECASE),
             lambda match : '<h1 id="BookTitle" align="%s">%s</h1>'%(match.group(2) if match.group(2) else 'center', match.group(3))),
            (re.compile(r'<h2[^><]*?id=BookAuthor[^><]*?(align=)*(?(1)(\w+))*[^><]*?>([^><]*?)</h2>', re.IGNORECASE),
             lambda match : '<h2 id="BookAuthor" align="%s">%s</h2>'%(match.group(2) if match.group(2) else 'center', match.group(3))),
            (re.compile('<span[^><]*?id=title[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
             lambda match : '<h2 class="title">%s</h2>'%(match.group(1),)),
            (re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
             lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)),
        ]
    # Hand the rules to the caller; returning None here would break every
    # Book Designer conversion.
    return ans
class HTMLPreProcessor(object):
    '''
    Applies regex based clean-up rules, user search & replace rules and the
    optional heuristic / punctuation / transliteration passes to raw HTML
    before it is parsed.
    '''

    def __init__(self, log=None, extra_opts=None, regex_wizard_callback=None):
        '''
        :param log: logger; its ``error`` method reports bad user rules.
        :param extra_opts: conversion options object; all options are read
            with ``getattr`` so it may be None.
        :param regex_wizard_callback: optional callable invoked with
            ``(current_href, html)`` after the generic rules have run.
        '''
        self.log = log
        self.extra_opts = extra_opts
        self.regex_wizard_callback = regex_wizard_callback
        self.current_href = None

    def is_baen(self, src):
        '''Return True if ``src`` has a Publisher meta tag mentioning Baen.'''
        return re.compile(r'<meta\s+name="Publisher"\s+content=".*?Baen.*?"',
                          re.IGNORECASE).search(src) is not None

    def is_book_designer(self, raw):
        '''Return True if ``raw`` contains an <H2> with id=BookTitle.'''
        return re.search('<H2[^><]*id=BookTitle', raw) is not None

    def is_pdftohtml(self, src):
        '''Return True if the pdftohtml marker comment is in the first 1000 characters.'''
        return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]

    def __call__(self, html, remove_special_chars=None,
            get_preprocess_html=False):
        '''
        Preprocess ``html`` and return the result.

        :param html: the markup to process (text, not bytes).
        :param remove_special_chars: optional compiled pattern; everything
            it matches is deleted first.
        :param get_preprocess_html: when True, return right after the
            generic rules (and the regex wizard callback) have run, before
            any source specific or user rules are applied.
        '''
        if remove_special_chars is not None:
            html = remove_special_chars.sub('', html)
        html = html.replace('\0', '')
        is_pdftohtml = self.is_pdftohtml(html)
        # The rule factories return cached lists. Work on a copy so the
        # user rules inserted below never leak into later invocations.
        # ``or ()`` tolerates a factory that yields None.
        if self.is_baen(html):
            rules = []
        elif self.is_book_designer(html):
            rules = list(book_designer_rules() or ())
        elif is_pdftohtml:
            rules = list(pdftohtml_rules() or ())
        else:
            rules = []

        start_rules = []

        if not getattr(self.extra_opts, 'keep_ligatures', False):
            html = _ligpat.sub(lambda m:LIGATURES[m.group()], html)

        user_sr_rules = {}

        # Function for processing search and replace
        def do_search_replace(search_pattern, replace_txt):
            from calibre.ebooks.conversion.search_replace import compile_regular_expression
            try:
                search_re = compile_regular_expression(search_pattern)
                if not replace_txt:
                    replace_txt = ''
                rules.insert(0, (search_re, replace_txt))
                user_sr_rules[(search_re, replace_txt)] = search_pattern
            except Exception as e:
                # Report the expression that actually failed to compile.
                self.log.error('Failed to parse %r regexp because %s' %
                        (search_pattern, as_unicode(e)))

        # search / replace using the sr?_search / sr?_replace options
        for i in range(1, 4):
            search, replace = 'sr%d_search'%i, 'sr%d_replace'%i
            search_pattern = getattr(self.extra_opts, search, '')
            replace_txt = getattr(self.extra_opts, replace, '')
            if search_pattern:
                do_search_replace(search_pattern, replace_txt)

        # multi-search / replace using the search_replace option
        search_replace = getattr(self.extra_opts, 'search_replace', None)
        if search_replace:
            search_replace = json.loads(search_replace)
            # Each rule is inserted at the front, so iterate in reverse to
            # keep the user's order.
            for search_pattern, replace_txt in reversed(search_replace):
                do_search_replace(search_pattern, replace_txt)

        end_rules = []
        # delete soft hyphens - moved here so it's executed after header/footer removal
        if is_pdftohtml:
            # unwrap/delete soft hyphens (\xad is U+00AD SOFT HYPHEN)
            end_rules.append((re.compile(
                r'[\xad](</p>\s*<p>\s*)+\s*(?=[\[a-z\d])'), lambda match: ''))
            # unwrap/delete soft hyphens with formatting
            end_rules.append((re.compile(
                r'[\xad]\s*(</(i|u|b)>)+(</p>\s*<p>\s*)+\s*(<(i|u|b)>)+\s*(?=[\[a-z\d])'), lambda match: ''))

        length = -1
        if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
            docanalysis = DocAnalysis('pdf', html)
            length = docanalysis.line_length(getattr(self.extra_opts, 'unwrap_factor'))
            if length:
                # unwrap em/en dashes
                end_rules.append((re.compile(
                    r'(?<=.{%i}[–—])\s*<p>\s*(?=[\[a-z\d])' % length), lambda match: ''))
                end_rules.append(
                    # Un wrap using punctuation
                    (re.compile((
                        r'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\\IAß]'
                        r'|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?'
                        r'\s*[\w\d$(])') % length, re.UNICODE), wrap_lines),
                )

        for rule in html_preprocess_rules() + start_rules:
            html = rule[0].sub(rule[1], html)

        if self.regex_wizard_callback is not None:
            self.regex_wizard_callback(self.current_href, html)

        if get_preprocess_html:
            return html

        # Debugging aid; the calls below are commented out.
        def dump(raw, where):
            import os
            dp = getattr(self.extra_opts, 'debug_pipeline', None)
            if dp and os.path.exists(dp):
                odir = os.path.join(dp, 'input')
                if os.path.exists(odir):
                    odir = os.path.join(odir, where)
                    if not os.path.exists(odir):
                        os.makedirs(odir)
                    name, i = None, 0
                    while not name or os.path.exists(os.path.join(odir, name)):
                        i += 1
                        name = '%04d.html'%i
                    with open(os.path.join(odir, name), 'wb') as f:
                        f.write(raw.encode('utf-8'))

        # dump(html, 'pre-preprocess')

        for rule in rules + end_rules:
            try:
                html = rule[0].sub(rule[1], html)
            except Exception as e:
                # A broken user rule is reported and skipped; failures in
                # built-in rules are real errors.
                if rule in user_sr_rules:
                    self.log.error(
                        'User supplied search & replace rule: %s -> %s '
                        'failed with error: %s, ignoring.'%(
                            user_sr_rules[rule], rule[1], e))
                else:
                    raise

        if is_pdftohtml and length > -1:
            # Dehyphenate
            dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
            html = dehyphenator(html,'html', length)

        if is_pdftohtml:
            from calibre.ebooks.conversion.utils import HeuristicProcessor
            pdf_markup = HeuristicProcessor(self.extra_opts, None)
            totalwords = 0
            if pdf_markup.get_word_count(html) > 7000:
                html = pdf_markup.markup_chapters(html, totalwords, True)

        # dump(html, 'post-preprocess')

        # Handle broken XHTML w/ SVG (ugh)
        if 'svg:' in html and SVG_NS not in html:
            html = html.replace(
                '<html', '<html xmlns:svg="%s"' % SVG_NS, 1)
        if 'xlink:' in html and XLINK_NS not in html:
            html = html.replace(
                '<html', '<html xmlns:xlink="%s"' % XLINK_NS, 1)

        html = XMLDECL_RE.sub('', html)

        if getattr(self.extra_opts, 'asciiize', False):
            from calibre.utils.localization import get_udc
            from calibre.utils.mreplace import MReplace
            unihandecoder = get_udc()
            mr = MReplace(data={'«':'&lt;'*3, '»':'&gt;'*3})
            html = mr.mreplace(html)
            html = unihandecoder.decode(html)

        if getattr(self.extra_opts, 'enable_heuristics', False):
            from calibre.ebooks.conversion.utils import HeuristicProcessor
            preprocessor = HeuristicProcessor(self.extra_opts, self.log)
            html = preprocessor(html)

        if is_pdftohtml:
            html = html.replace('<!-- created by calibre\'s pdftohtml -->', '')

        if getattr(self.extra_opts, 'smarten_punctuation', False):
            html = smarten_punctuation(html, self.log)

        try:
            unsupported_unicode_chars = self.extra_opts.output_profile.unsupported_unicode_chars
        except AttributeError:
            unsupported_unicode_chars = ''
        if unsupported_unicode_chars:
            from calibre.utils.localization import get_udc
            unihandecoder = get_udc()
            for char in unsupported_unicode_chars:
                asciichar = unihandecoder.decode(char)
                html = html.replace(char, asciichar)

        return html

View File

@@ -0,0 +1,881 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import re
from math import ceil
from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
from calibre.utils.logging import default_log
from calibre.utils.wordcount import get_wordcount_obj
from polyglot.builtins import unicode_type
class HeuristicProcessor(object):
def __init__(self, extra_opts=None, log=None):
self.log = default_log if log is None else log
self.html_preprocess_sections = 0
self.found_indents = 0
self.extra_opts = extra_opts
self.deleted_nbsps = False
self.totalwords = 0
self.min_chapters = 1
self.chapters_no_title = 0
self.chapters_with_title = 0
self.blanks_deleted = False
self.blanks_between_paragraphs = False
self.linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
self.blankreg = re.compile(r'\s*(?P<openline><p(?!\sclass=\"(softbreak|whitespace)\")[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
self.anyblank = re.compile(r'\s*(?P<openline><p[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
self.multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>(\s*<div[^>]*>\s*</div>\s*)*){2,}(?!\s*<h\d)', re.IGNORECASE)
self.any_multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>(\s*<div[^>]*>\s*</div>\s*)*){2,}', re.IGNORECASE)
self.line_open = (
r"<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*"
r"(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*")
self.line_close = "(</(?P=inner3)>)?\\s*(</(?P=inner2)>)?\\s*(</(?P=inner1)>)?\\s*</(?P=outer)>"
self.single_blank = re.compile(r'(\s*<(p|div)[^>]*>\s*</(p|div)>)', re.IGNORECASE)
self.scene_break_open = '<p class="scenebreak" style="text-align:center; text-indent:0%; margin-top:1em; margin-bottom:1em; page-break-before:avoid">'
self.common_in_text_endings = '[\"\'—’”,\\.!\\?\\\\)„\\w]'
self.common_in_text_beginnings = '[\\w\'\"“‘‛]'
def is_pdftohtml(self, src):
    '''Return True if the pdftohtml marker comment occurs within the first 1000 characters of ``src``.'''
    head = src[:1000]
    return '<!-- created by calibre\'s pdftohtml -->' in head
def is_abbyy(self, src):
    '''Return True if an ABBYY FineReader generator meta tag starts within the first 1000 characters of ``src``.'''
    head = src[:1000]
    return '<meta name="generator" content="ABBYY FineReader' in head
def chapter_head(self, match):
    '''
    Regex substitution callback that marks up a detected chapter heading.

    Uses the named groups ``chap`` and ``title``. Without a title only an
    <h2> is produced. With a title, the <h2> also carries a ``title``
    attribute built from the plain text of both parts and the title itself
    follows as an <h3>. Each call increments
    ``self.html_preprocess_sections``.
    '''
    from calibre.utils.html2text import html2text
    chap = match.group('chap')
    title = match.group('title')

    if title:
        delete_whitespace = re.compile('^\\s*(?P<c>.*?)\\s*$')
        # NOTE(review): this matches only the two character sequence '"
        # and not each quote character on its own - confirm intent.
        delete_quotes = re.compile('\'\"')

        def plain(fragment):
            trimmed = delete_whitespace.sub('\\g<c>', html2text(fragment))
            return delete_quotes.sub('', trimmed)

        txt_chap = plain(chap)
        txt_title = plain(title)
        self.html_preprocess_sections += 1
        self.log.debug("marked " + unicode_type(self.html_preprocess_sections) +
                " chapters & titles. - " + unicode_type(chap) + ", " + unicode_type(title))
        return '<h2 title="'+txt_chap+', '+txt_title+'">'+chap+'</h2>\n<h3 class="sigilNotInTOC">'+title+'</h3>\n'

    self.html_preprocess_sections += 1
    self.log.debug("marked " + unicode_type(self.html_preprocess_sections) +
            " chapters. - " + unicode_type(chap))
    return '<h2>'+chap+'</h2>\n'
def chapter_break(self, match):
chap = match.group('section')
styles = match.group('styles')
self.html_preprocess_sections = self.html_preprocess_sections + 1
self.log.debug("marked " + unicode_type(self.html_preprocess_sections) +
" section markers based on punctuation. - " + unicode_type(chap))
return '<'+styles+' style="page-break-before:always">'+chap
def analyze_title_matches(self, match):
# chap = match.group('chap')
title = match.group('title')
if not title:
self.chapters_no_title = self.chapters_no_title + 1
else:
self.chapters_with_title = self.chapters_with_title + 1
def insert_indent(self, match):
pstyle = match.group('formatting')
tag = match.group('tagtype')
span = match.group('span')
self.found_indents = self.found_indents + 1
if pstyle:
if pstyle.lower().find('style') != -1:
pstyle = re.sub(r'"$', '; text-indent:3%"', pstyle)
else:
pstyle = pstyle+' style="text-indent:3%"'
if not span:
return '<'+tag+' '+pstyle+'>'
else:
return '<'+tag+' '+pstyle+'>'+span
else:
if not span:
return '<'+tag+' style="text-indent:3%">'
else:
return '<'+tag+' style="text-indent:3%">'+span
def no_markup(self, raw, percent):
    '''
    Detects total marked up line endings in the file. raw is the text to
    inspect. Percent is the minimum percent of line endings which should
    be marked up to return true.

    ``percent`` is clamped to the range 0 - 1. The result is True when the
    number of line feed matches multiplied by ``percent`` exceeds the
    number of closing </p> or </div> tags.
    '''
    marked_up_endings = len(re.compile('</(p|div)>', re.DOTALL).findall(raw))
    line_feeds = len(re.compile('(\n|\r|\r\n)', re.DOTALL).findall(raw))

    if percent > 1:
        percent = 1
    elif percent < 0:
        percent = 0

    return line_feeds * percent > marked_up_endings
def dump(self, raw, where):
    '''
    Write raw (UTF-8 encoded) to the next free NNNN.html file under
    <debug_pipeline>/preprocess/<where>.

    Does nothing when self.extra_opts has no debug_pipeline value or that
    path does not exist.
    '''
    import os
    debug_root = getattr(self.extra_opts, 'debug_pipeline', None)
    if not debug_root or not os.path.exists(debug_root):
        return
    target = os.path.join(debug_root, 'preprocess')
    if not os.path.exists(target):
        os.makedirs(target)
    if not os.path.exists(target):
        return
    target = os.path.join(target, where)
    if not os.path.exists(target):
        os.makedirs(target)
    # pick the first sequence number that is not taken yet
    counter = 1
    name = '%04d.html' % counter
    while os.path.exists(os.path.join(target, name)):
        counter += 1
        name = '%04d.html' % counter
    with open(os.path.join(target, name), 'wb') as f:
        f.write(raw.encode('utf-8'))
def get_word_count(self, html):
    '''
    Return the word count of html, ignoring the <head> section and all
    tags. The counting itself is delegated to get_wordcount_obj.
    '''
    without_head = re.sub(r'(?s)<head[^>]*>.*?</head>', '', html)
    text_only = re.sub(r'<[^>]*>', '', without_head)
    return get_wordcount_obj(text_only).words
def markup_italicis(self, html):
    '''
    Wrap common abbreviations and plain-text emphasis markers in <i> tags.

    First every entry of ITALICIZE_WORDS that is delimited by whitespace
    or tag boundaries is italicized. Then the text of the document (head
    and tags stripped) is searched for marker pairs such as *word*,
    _word_ or /word/; every occurrence of a matched span in html is
    replaced by the enclosed words wrapped in <i>...</i>.

    Returns the modified html.
    '''
    ITALICIZE_WORDS = [
        'Etc.', 'etc.', 'viz.', 'ie.', 'i.e.', 'Ie.', 'I.e.', 'eg.',
        'e.g.', 'Eg.', 'E.g.', 'et al.', 'et cetera', 'n.b.', 'N.b.',
        'nota bene', 'Nota bene', 'Ste.', 'Mme.', 'Mdme.',
        'Mlle.', 'Mons.', 'PS.', 'PPS.',
    ]

    ITALICIZE_STYLE_PATS = [
        r'(?msu)(?<=[\s>"\'])_\*/(?P<words>[^\*_]+)/\*_',
        r'(?msu)(?<=[\s>"\'])~~(?P<words>[^~]+)~~',
        r'(?msu)(?<=[\s>"\'])_/(?P<words>[^/_]+)/_',
        r'(?msu)(?<=[\s>"\'])_\*(?P<words>[^\*_]+)\*_',
        r'(?msu)(?<=[\s>"\'])\*/(?P<words>[^/\*]+)/\*',
        r'(?msu)(?<=[\s>"\'])/:(?P<words>[^:/]+):/',
        r'(?msu)(?<=[\s>"\'])\|:(?P<words>[^:\|]+):\|',
        r'(?msu)(?<=[\s>"\'])\*(?P<words>[^\*]+)\*',
        r'(?msu)(?<=[\s>"\'])~(?P<words>[^~]+)~',
        r'(?msu)(?<=[\s>"\'])/(?P<words>[^/\*><]+)/',
        r'(?msu)(?<=[\s>"\'])_(?P<words>[^_]+)_',
    ]

    for word in ITALICIZE_WORDS:
        html = re.sub(r'(?<=\s|>)' + re.escape(word) + r'(?=\s|<)', '<i>%s</i>' % word, html)

    search_text = re.sub(r'(?s)<head[^>]*>.*?</head>', '', html)
    search_text = re.sub(r'<[^>]*>', '', search_text)
    for pat in ITALICIZE_STYLE_PATS:
        for match in re.finditer(pat, search_text):
            # Plain string replacement: the matched text is document
            # content, so it must be treated neither as a pattern nor as a
            # replacement template (backslashes in it would otherwise be
            # read as escapes or abort the substitution).
            html = html.replace(match.group(0), '<i>%s</i>' % match.group('words'))

    return html
def markup_chapters(self, html, wordcount, blanks_between_paragraphs):
'''
Searches for common chapter headings throughout the document and
replaces them via self.chapter_head. Attempts multiple patterns in
order of likelihood of a match with minimum false positives, and stops
trying further patterns once self.html_preprocess_sections reaches
self.min_chapters.

html is the markup to scan. wordcount is the document word count; when
it exceeds 7000 it is used to recompute self.min_chapters.
blanks_between_paragraphs, when true, lets up to two empty <p> elements
sit between a chapter line and its title line.

Returns the (possibly modified) html.
'''
# Typical chapters are between 2000 and 7000 words, use the larger number to decide the
# minimum of chapters to search for. A max limit is calculated to prevent things like OCR
# or pdf page numbers from being treated as TOC markers
max_chapters = 150
typical_chapters = 7000.
# NOTE(review): ceil is not defined in the visible part of the file; presumably
# imported from math at module level - confirm.
if wordcount > 7000:
if wordcount > 200000:
typical_chapters = 15000.
self.min_chapters = int(ceil(wordcount / typical_chapters))
self.log.debug("minimum chapters required are: "+unicode_type(self.min_chapters))
# Headings already present in the document count towards the section total
heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
self.html_preprocess_sections = len(heading.findall(html))
self.log.debug("found " + unicode_type(self.html_preprocess_sections) + " pre-existing headings")
# Build the Regular Expressions in pieces
init_lookahead = "(?=<(p|div))"
chapter_line_open = self.line_open
title_line_open = (r"<(?P<outer2>p|div)[^>]*>\s*(<(?P<inner4>font|span|[ibu])[^>]*>)?"
r"\s*(<(?P<inner5>font|span|[ibu])[^>]*>)?\s*(<(?P<inner6>font|span|[ibu])[^>]*>)?\s*")
chapter_header_open = r"(?P<chap>"
title_header_open = r"(?P<title>"
chapter_header_close = ")\\s*"
title_header_close = ")"
chapter_line_close = self.line_close
title_line_close = "(</(?P=inner6)>)?\\s*(</(?P=inner5)>)?\\s*(</(?P=inner4)>)?\\s*</(?P=outer2)>"
# pdftohtml output gets a simpler title line: a bare <p> without inline wrappers
is_pdftohtml = self.is_pdftohtml(html)
if is_pdftohtml:
title_line_open = "<(?P<outer2>p)[^>]*>\\s*"
title_line_close = "\\s*</(?P=outer2)>"
if blanks_between_paragraphs:
blank_lines = "(\\s*<p[^>]*>\\s*</p>){0,2}\\s*"
else:
blank_lines = ""
opt_title_open = "("
opt_title_close = ")?"
n_lookahead_open = "(?!\\s*"
n_lookahead_close = ")\\s*"
default_title = r"(<[ibu][^>]*>)?\s{0,3}(?!Chapter)([\w\:\'\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)"
simple_title = r"(<[ibu][^>]*>)?\s{0,3}(?!(Chapter|\s+<)).{0,65}?(</[ibu][^>]*>)?(?=<)"
analysis_result = []
# Each entry is [chapter regex, n_lookahead_req, strict_title, ignorecase,
# title_req, log message, type name] - the order unpacked in recurse_patterns
chapter_types = [
[(
r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Epilogue|CHAPTER|Kapitel|Volume\b|Prologue|Book\b|Part\b|Dedication|Preface)"
r"\s*([\d\w-]+\:?\'?\s*){0,5}"), True, True, True, False, "Searching for common section headings", 'common'],
# Highest frequency headings which include titles
[r"[^'\"]?(CHAPTER|Kapitel)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, True, True, False, "Searching for most common chapter headings", 'chapter'],
[r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•=]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)(</span>)?\s*</b>",
True, True, True, False, "Searching for emphasized lines", 'emphasized'], # Emphasized lines
[r"[^'\"]?(\d+(\.|:))\s*([\w\-\'\"#,]+\s*){0,7}\s*", True, True, True, False,
"Searching for numeric chapter headings", 'numeric'], # Numeric Chapters
[r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, True, True, False, "Searching for letter spaced headings", 'letter_spaced'], # Spaced Lettering
[r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, True, True, False,
"Searching for numeric chapters with titles", 'numeric_title'], # Numeric Titles
[r"[^'\"]?(\d+)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, True, True, False,
"Searching for simple numeric headings", 'plain_number'], # Numeric Chapters, no dot or colon
[r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, True, False, False,
"Searching for chapters with Uppercase Characters", 'uppercase'] # Uppercase Chapters
]
# Helper run twice below: with analyze=True it only counts hits per pattern and
# records the patterns worth applying in analysis_result; with analyze=False it
# applies the patterns in chapter_types to html and returns the result.
def recurse_patterns(html, analyze):
# Start with most typical chapter headings, get more aggressive until one works
for [chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name] in chapter_types:
n_lookahead = ''
hits = 0
self.chapters_no_title = 0
self.chapters_with_title = 0
if n_lookahead_req:
lp_n_lookahead_open = n_lookahead_open
lp_n_lookahead_close = n_lookahead_close
else:
lp_n_lookahead_open = ''
lp_n_lookahead_close = ''
if strict_title:
lp_title = default_title
else:
lp_title = simple_title
if ignorecase:
arg_ignorecase = r'(?i)'
else:
arg_ignorecase = ''
if title_req:
lp_opt_title_open = ''
lp_opt_title_close = ''
else:
lp_opt_title_open = opt_title_open
lp_opt_title_close = opt_title_close
# Enough sections have been marked already, stop trying further patterns
if self.html_preprocess_sections >= self.min_chapters:
break
full_chapter_line = chapter_line_open+chapter_header_open+chapter_type+chapter_header_close+chapter_line_close
# The negative lookahead reuses the chapter line pattern; the substitution
# rewrites "ou", "in" and "cha" so its group names differ from the originals
if n_lookahead_req:
n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
if not analyze:
self.log.debug("Marked " + unicode_type(self.html_preprocess_sections) + " headings, " + log_message)
chapter_marker = arg_ignorecase+init_lookahead+full_chapter_line+blank_lines+lp_n_lookahead_open+n_lookahead+lp_n_lookahead_close+ \
lp_opt_title_open+title_line_open+title_header_open+lp_title+title_header_close+title_line_close+lp_opt_title_close
chapdetect = re.compile(r'%s' % chapter_marker)
if analyze:
hits = len(chapdetect.findall(html))
if hits:
# The result of this sub is discarded; it runs only so that
# analyze_title_matches can count titled and untitled hits
chapdetect.sub(self.analyze_title_matches, html)
if float(self.chapters_with_title) / float(hits) > .5:
title_req = True
strict_title = False
self.log.debug(
unicode_type(type_name)+" had "+unicode_type(hits)+
" hits - "+unicode_type(self.chapters_no_title)+" chapters with no title, "+
unicode_type(self.chapters_with_title)+" chapters with titles, "+
unicode_type(float(self.chapters_with_title) / float(hits))+" percent. ")
# NOTE(review): the chained comparison below evaluates as
# "self.min_chapters < 3 and 3 > hits" - confirm that is the intent.
if type_name == 'common':
analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name])
elif self.min_chapters <= hits < max_chapters or self.min_chapters < 3 > hits:
analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name])
break
else:
html = chapdetect.sub(self.chapter_head, html)
return html
# First pass collects the usable patterns, second pass applies only those
recurse_patterns(html, True)
chapter_types = analysis_result
html = recurse_patterns(html, False)
words_per_chptr = wordcount
if words_per_chptr > 0 and self.html_preprocess_sections > 0:
words_per_chptr = wordcount // self.html_preprocess_sections
self.log.debug("Total wordcount is: "+ unicode_type(wordcount)+", Average words per section is: "+
unicode_type(words_per_chptr)+", Marked up "+unicode_type(self.html_preprocess_sections)+" chapters")
return html
def punctuation_unwrap(self, length, content, format):
    '''
    Join hard-wrapped lines, based on line length and the character that
    ends the line. Handles html markup as well as plain text
    (format == 'txt').

    A line is joined with the next one when at least `length` characters
    precede a line-ending character from the lookbehind class below, an
    en/em dash, or a soft hyphen. Characters which can act as a full stop
    should *not* be added to that class: a false positive unwrap is harder
    to spot in a manual review than a false negative. Hyphenated content is
    intentionally left alone, the dehyphenate routine deals with it in a
    separate step.

    Returns the unwrapped content.
    '''
    def style_unwrap(match):
        # keep whichever inline style tags surrounded the removed line break
        closing = match.group('style_close') or ''
        opening = match.group('style_open') or ''
        return closing + ' ' + opening

    min_len = unicode_type(length)
    # define the pieces of the regex
    # (?<!\&\w{4});) is a semicolon not part of an entity
    lookahead = "(?<=.{"+min_len+r"}([a-zა-ჰäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\\IAß]|(?<!\&\w{4});))"
    em_en_lookahead = "(?<=.{"+min_len+"}[\u2013\u2014])"
    soft_hyphen = "\xad"

    is_txt = format == 'txt'
    if is_txt:
        tail = "((\u0020|\u0009)*\n){1,4}"
    else:
        line_ending = "\\s*(?P<style_close></(span|[iub])>)?\\s*(</(p|div)>)?"
        blanklines = "\\s*(?P<up2threeblanks><(p|span|div)[^>]*>\\s*(<(p|span|div)[^>]*>\\s*</(span|p|div)>\\s*)</(span|p|div)>\\s*){0,3}\\s*"
        line_opening = "<(p|div)[^>]*>\\s*(?P<style_open><(span|[iub])[^>]*>)?\\s*"
        tail = line_ending + blanklines + line_opening

    unwrap = re.compile(lookahead + tail, re.UNICODE)
    em_en_unwrap = re.compile(em_en_lookahead + tail, re.UNICODE)
    shy_unwrap = re.compile(soft_hyphen + tail, re.UNICODE)

    if is_txt:
        replacements = (' ', '', '')
    else:
        replacements = (style_unwrap, style_unwrap, style_unwrap)
    for pattern, repl in zip((unwrap, em_en_unwrap, shy_unwrap), replacements):
        content = pattern.sub(repl, content)
    return content
def txt_process(self, match):
    '''
    Substitution callback: run the text captured in the 'text' group
    through the plain-text processor (separate_paragraphs_single_line,
    then convert_basic with epub_split_size_kb=0) and return the result.
    '''
    from calibre.ebooks.txt.processor import convert_basic, separate_paragraphs_single_line
    paragraphs = separate_paragraphs_single_line(match.group('text'))
    return convert_basic(paragraphs, epub_split_size_kb=0)
def markup_pre(self, html):
    '''
    Add paragraph markup to a document that has (almost) none.

    When a <pre> tag is present, everything up to and including each
    <pre>...</pre> is handed to self.txt_process and entities are
    converted afterwards. Otherwise every newline not preceded by '>' is
    turned into a paragraph boundary.
    '''
    if re.search(r'<pre>', html, re.IGNORECASE) is None:
        # Add markup naively
        # TODO - find out if there are cases where there are more than one <pre> tag or
        # other types of unmarked html and handle them in some better fashion
        return re.sub('(?<!>)(\n)', '</p>\n<p>', html)

    self.log.debug("Running Text Processing")
    outerhtml = re.compile(r'.*?(?<=<pre>)(?P<text>.*?)</pre>', re.IGNORECASE|re.DOTALL)
    html = outerhtml.sub(self.txt_process, html)
    from calibre.ebooks.conversion.preprocess import convert_entities
    return re.sub(r'&(\S+?);', convert_entities, html)
def arrange_htm_line_endings(self, html):
    '''
    Normalise whitespace around <p> and <div> tags: each closing tag ends
    its line (whitespace before it is dropped) and each opening tag starts
    a new line (whitespace on both sides is dropped).
    '''
    closing_tag = re.compile(r"\s*</(?P<tag>p|div)>")
    opening_tag = re.compile(r"\s*<(?P<tag>p|div)(?P<style>[^>]*)>\s*")
    html = closing_tag.sub('</\\g<tag>>\n', html)
    return opening_tag.sub('\n<\\g<tag>\\g<style>>', html)
def fix_nbsp_indents(self, html):
    '''
    Replace runs of two or more non-breaking spaces at the start of a
    <p>/<div> (optionally behind leading <span> tags) with an inline
    text-indent style, using self.insert_indent as the callback.
    '''
    nbsp_indent = re.compile(
        unicode_type(r'<(?P<tagtype>p|div)(?P<formatting>[^>]*)>\s*(?P<span>(<span[^>]*>\s*)+)?\s*(\u00a0){2,}'),
        re.IGNORECASE)
    result = nbsp_indent.sub(self.insert_indent, html)
    if self.found_indents > 1:
        self.log.debug("replaced "+unicode_type(self.found_indents)+ " nbsp indents with inline styles")
    return result
def cleanup_markup(self, html):
    '''
    Strip markup noise that gets in the way of later heuristics and
    return the cleaned html. Sets self.deleted_nbsps to True as a record
    that non-breaking spaces were removed.
    '''
    # remove remaining non-breaking spaces
    html = re.sub(unicode_type(r'\u00a0'), ' ', html)
    # Microsoft specific tags can cause issues later: drop empty <o:p>
    # pairs to simplify other processing ...
    html = re.sub(unicode_type(r'\s*<o:p>\s*</o:p>'), ' ', html)
    # ... and delete microsoft 'smart' tags
    html = re.sub('(?i)</?st1:\\w+>', '', html)
    # Re-open self closing paragraph tags
    html = re.sub('<p[^>/]*/>', '<p> </p>', html)

    # Get rid of empty span, bold, font, em, & italics tags
    fmt_tags = 'font|[ibu]|em|strong'
    open_fmt_pat = r'<(?:{})(?:\s[^>]*)?>'.format(fmt_tags)
    close_fmt_pat = '</(?:{})>'.format(fmt_tags)
    empty_spans = r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>){0,2}\s*</span>\s*"
    empty_fmt = r"\s*{open}\s*({open}\s*{close}\s*){{0,2}}\s*{close}".format(open=open_fmt_pat, close=close_fmt_pat)
    # two rounds, so wrappers left empty by the first round go as well
    for _ in range(2):
        html = re.sub(empty_spans, " ", html)
        html = re.sub(empty_fmt, " ", html)

    # delete surrounding divs from empty paragraphs
    html = re.sub('<div[^>]*>\\s*<p[^>]*>\\s*</p>\\s*</div>', '<p> </p>', html)
    # Empty heading tags
    html = re.sub(r'(?i)<h\d+>\s*</h\d+>', '', html)
    self.deleted_nbsps = True
    return html
def analyze_line_endings(self, html):
    '''
    Determine which kind of html line ending the document mostly uses;
    call this before the docanalysis functions.

    Returns 'spanned_html' when there is more than one <span> and fewer
    than 0.75 opening <p> tags per opening <span>, otherwise 'html'.
    '''
    para_count = len(re.findall('<p[^>]*>', html, re.IGNORECASE))
    span_count = len(re.findall('<span[^>]*>', html, re.IGNORECASE))
    if span_count > 1 and float(para_count) / float(span_count) < 0.75:
        return 'spanned_html'
    return 'html'
def analyze_blanks(self, html):
    '''
    Decide whether the document interleaves blank paragraphs with text.

    Counts the matches of self.blankreg and self.linereg in html and
    returns True when more than 40% of the lines are blank. Documents with
    fewer than two lines always yield False (previously an implicit None).
    '''
    blanklines = self.blankreg.findall(html)
    lines = self.linereg.findall(html)
    if len(lines) <= 1:
        return False
    blank_ratio = float(len(blanklines)) / float(len(lines))
    # the logged figure is a fraction (0..1), the message text is kept as is
    self.log.debug("There are %s blank lines. %s percent blank" % (len(blanklines), blank_ratio))
    return blank_ratio > 0.40
def cleanup_required(self):
    '''
    Return True when at least one option that depends on cleaned-up
    markup is set to a true value on self.extra_opts, else False.
    '''
    dependent_options = ('unwrap_lines', 'markup_chapter_headings',
                         'format_scene_breaks', 'delete_blank_paragraphs')
    return any(getattr(self.extra_opts, name, False) for name in dependent_options)
def merge_blanks(self, html, blanks_count=None):
    '''
    Collapse every run matched by self.any_multi_blank into one empty
    paragraph whose top margin stands in for the merged lines.

    The merged paragraph gets the class 'whitespaceNN' when the run
    contains a whitespace-classed paragraph and 'softbreakNN' otherwise,
    NN being ten times the margin in em. blanks_count is accepted for
    the existing call signature but is not used.

    Returns the modified html.
    '''
    base_em = .5  # Baseline is 1.5em per blank line, 1st line is .5 em css and 1em for the nbsp
    em_per_line = 1.5  # Add another 1.5 em for each additional blank

    def merge_matches(match):
        to_merge = match.group(0)
        lines = float(len(self.single_blank.findall(to_merge))) - 1.
        em = base_em + (em_per_line * lines)
        # str.find() returns -1 (truthy) when nothing is found, so a bare
        # truth test of its result can never select the softbreak class;
        # use a containment test instead.
        css_class = 'whitespace' if 'whitespace' in to_merge else 'softbreak'
        merged = '\n<p class="%s%d" style="text-align:center; margin-top:%sem"> </p>' % (
            css_class, int(em * 10), em)
        return self.any_multi_blank.sub(merged, to_merge)

    return self.any_multi_blank.sub(merge_matches, html)
def detect_whitespace(self, html):
    '''
    Deal with blank paragraphs that are layout whitespace rather than
    scene breaks.

    Blank paragraphs around headings are folded into margin-top /
    margin-bottom styles on the heading; blanks around scene break
    paragraphs are dropped; blanks around short paragraphs that end in a
    word character are tagged with the 'whitespace' class. When more
    sections than self.min_chapters were marked, blanks before the first
    heading are tagged as whitespace as well.

    Returns the modified html.
    '''
    flags = re.IGNORECASE|re.DOTALL
    blanks_before = r'(?P<initparas>(<(p|div)[^>]*>\s*</(p|div)>\s*){1,}\s*)?'
    blanks_after = r'(?P<endparas>\s*(<(p|div)[^>]*>\s*</(p|div)>\s*){1,})?'
    blanks_around_headings = re.compile(
        blanks_before + r'(?P<content><h(?P<hnum>\d+)[^>]*>.*?</h(?P=hnum)>)' + blanks_after, flags)
    blanks_around_scene_breaks = re.compile(
        blanks_before + r'(?P<content><p class="scenebreak"[^>]*>.*?</p>)' + blanks_after, flags)
    blanks_n_nopunct = re.compile(
        r'(?P<initparas>(<p[^>]*>\s*</p>\s*){1,}\s*)?<p[^>]*>\s*(<(span|[ibu]|em|strong|font)[^>]*>\s*)*'
        r'.{1,100}?[^\W](</(span|[ibu]|em|strong|font)>\s*)*</p>(?P<endparas>\s*(<p[^>]*>\s*</p>\s*){1,})?', flags)

    def merge_header_whitespace(match):
        before = match.group('initparas')
        after = match.group('endparas')
        content = match.group('content')
        nothing_to_merge = before is None and after is None
        if nothing_to_merge or content.find('scenebreak') != -1:
            return content
        top_margin = ''
        if before is not None:
            top_margin = 'margin-top:'+unicode_type(len(self.single_blank.findall(before)))+'em;'
        bottom_margin = ''
        if after is not None:
            bottom_margin = 'margin-bottom:'+unicode_type(len(self.single_blank.findall(after)))+'em;'
        return re.sub('(?i)<h(?P<hnum>\\d+)[^>]*>', '\n\n<h'+'\\g<hnum>'+' style="'+top_margin+bottom_margin+'">', content)

    def markup_whitespaces(match):
        return self.blankreg.sub(
            '\n<p class="whitespace" style="text-align:center; margin-top:0em; margin-bottom:0em"> </p>',
            match.group(0))

    html = blanks_around_headings.sub(merge_header_whitespace, html)
    html = blanks_around_scene_breaks.sub(merge_header_whitespace, html)
    html = blanks_n_nopunct.sub(markup_whitespaces, html)
    if self.html_preprocess_sections > self.min_chapters:
        html = re.sub('(?si)^.*?(?=<h\\d)', markup_whitespaces, html)
    return html
def detect_soft_breaks(self, html):
    '''
    Turn blank separators into 'softbreak' paragraphs.

    An empty <div> between two lines that both pass self.check_paragraph
    is replaced by a softbreak paragraph. After that, either runs of
    blank paragraphs (self.multi_blank, when blanks were not deleted and
    the document interleaves blank lines) or all single blanks
    (self.blankreg) are replaced by softbreak paragraphs.

    Returns the modified html.
    '''
    def as_second_line(fragment):
        # rename the groups so the pattern can appear twice in one regex
        return re.sub('(ou|in|cha)', 'linetwo_', fragment)

    half_em_break = '\n<p class="softbreak" style="margin-top:.5em; page-break-before:avoid; text-align:center"> </p>'
    first_line = '(?P<initline>'+self.line_open+'\\s*(?P<init_content>.*?)'+self.line_close+')'
    second_line = '(?P<line_two>'+as_second_line(self.line_open)+ \
        '\\s*(?P<line_two_content>.*?)'+as_second_line(self.line_close)+')'
    div_break_candidate = re.compile(
        first_line+'\\s*<div[^>]*>\\s*</div>\\s*'+second_line, re.IGNORECASE|re.UNICODE)

    def convert_div_softbreaks(match):
        first_ok = self.check_paragraph(match.group('init_content'))
        second_ok = self.check_paragraph(match.group('line_two_content'))
        if not (first_ok and second_ok):
            return match.group(0)
        return match.group('initline') + half_em_break + '\n' + match.group('line_two')

    html = div_break_candidate.sub(convert_div_softbreaks, html)

    if not self.blanks_deleted and self.blanks_between_paragraphs:
        return self.multi_blank.sub(
            '\n<p class="softbreak" style="margin-top:1em; page-break-before:avoid; text-align:center"> </p>', html)
    return self.blankreg.sub(half_em_break, html)
def detect_scene_breaks(self, html):
    '''
    Find lines that consist only of repeated non-word, non-space
    characters (for example * * *) and re-wrap them with
    self.scene_break_open ... </p>. Lines that start like ordinary text,
    or that contain an ordinary text ending before a tag, are left alone.

    Returns the modified html.
    '''
    pattern = ''.join((
        self.line_open,
        '(?!(', self.common_in_text_beginnings, '|.*?', self.common_in_text_endings,
        '<))(?P<break>((?P<break_char>((?!\\s)\\W))\\s*(?P=break_char)?)+)\\s*',
        self.line_close,
    ))
    scene_breaks = re.compile(pattern, re.IGNORECASE|re.UNICODE)
    return scene_breaks.sub(self.scene_break_open+'\\g<break>'+'</p>', html)
def markup_user_break(self, replacement_break):
    '''
    Takes the string a user supplies and wraps it in markup that will be
    centered with appropriate margins. <hr> and <img> tags are allowed.
    If the user specifies a width in the <hr> tag, the width is removed
    from the tag and applied as left/right margins on a wrapping div
    instead, because many ebook devices don't support margin:auto. The
    number is read as a percentage whatever its unit. All other html is
    converted to text.

    An <hr> whose width cannot be parsed falls back to the default rule
    and logs a warning. Returns the markup for the scene break.
    '''
    default_rule = '<hr style="height: 3px; background:#505050" /></div>'
    hr_open = '<div id="scenebreak" style="margin-left: 45%; margin-right: 45%; margin-top:1.5em; margin-bottom:1.5em; page-break-before:avoid">'

    if re.search('(<|>)', replacement_break) is None:
        # plain text, only the whitespace needs protecting
        replacement_break = re.sub('\\s', '&nbsp;', replacement_break)
        return self.scene_break_open+replacement_break+'</p>'

    if re.match('^<hr', replacement_break):
        if replacement_break.find('width') == -1:
            return hr_open+default_rule
        try:
            # when the pattern does not match, the input comes back
            # unchanged and int() rejects it
            width = int(re.sub('.*?width(:|=)(?P<wnum>\\d+).*', '\\g<wnum>', replacement_break))
        except ValueError:
            self.log.warn('Invalid replacement scene break'
                    ' expression, using default')
            return hr_open+default_rule
        replacement_break = re.sub('(?i)(width=\\d+\\%?|width:\\s*\\d+(\\%|px|pt|em)?;?)', '', replacement_break)
        # a width above 100 must not turn into negative margins
        divpercent = max(0, (100 - width) // 2)
        hr_open = re.sub('45', '%d' % divpercent, hr_open)
        return hr_open+replacement_break+'</div>'

    if re.match('^<img', replacement_break):
        return self.scene_break_open+replacement_break+'</p>'

    from calibre.utils.html2text import html2text
    replacement_break = html2text(replacement_break)
    replacement_break = re.sub('\\s', '&nbsp;', replacement_break)
    return self.scene_break_open+replacement_break+'</p>'
def check_paragraph(self, content):
    '''
    Return True when content, with span tags and the whitespace around
    them removed, ends in a quote mark, full stop, exclamation mark,
    question mark or colon; False otherwise.
    '''
    without_spans = re.sub('\\s*</?span[^>]*>\\s*', '', content)
    return re.match('.*[\"\'.!?:]$', without_spans) is not None
def abbyy_processor(self, html):
'''
Rewrite paragraphs of the form <p style="...">...</p> (and pass through
<img> tags) using simplified inline styles.

All <a> tags are removed first. For each matched paragraph the style
declarations are inspected: text-align other than left is kept,
text-indent is replaced by a 3% indent when it lies between 9 and 14
(exclusive) and otherwise re-emitted in pt, and padding values drive
blockquote opening/closing and the insertion of empty paragraphs
before/after. State is carried between lines in self.in_blockquote and
self.previous_was_paragraph.

Returns the rewritten html.
'''
abbyy_line = re.compile('((?P<linestart><p\\sstyle="(?P<styles>[^\"]*?);?">)(?P<content>.*?)(?P<lineend></p>)|(?P<image><img[^>]*>))', re.IGNORECASE)
empty_paragraph = '\n<p> </p>\n'
self.in_blockquote = False
self.previous_was_paragraph = False
html = re.sub('</?a[^>]*>', '', html)
# Substitution callback, invoked once per matched paragraph or image
def convert_styles(match):
# print "raw styles are: "+match.group('styles')
content = match.group('content')
# print "raw content is: "+match.group('content')
image = match.group('image')
is_paragraph = False
text_align = ''
text_indent = ''
paragraph_before = ''
paragraph_after = ''
blockquote_open = '\n<blockquote>\n'
blockquote_close = '</blockquote>\n'
indented_text = 'text-indent:3%;'
blockquote_open_loop = ''
blockquote_close_loop = ''
debugabby = False
# An image closes any open blockquote and is emitted on a line of its own
if image:
debugabby = True
if self.in_blockquote:
self.in_blockquote = False
blockquote_close_loop = blockquote_close
self.previous_was_paragraph = False
return blockquote_close_loop+'\n'+image+'\n'
else:
styles = match.group('styles').split(';')
is_paragraph = self.check_paragraph(content)
# print "styles for this line are: "+unicode_type(styles)
split_styles = []
for style in styles:
# print "style is: "+unicode_type(style)
newstyle = style.split(':')
# print "newstyle is: "+unicode_type(newstyle)
split_styles.append(newstyle)
styles = split_styles
# NOTE(review): the unpacking below needs every declaration to split into exactly
# two parts on ':'; a declaration with no colon or more than one raises ValueError.
for style, setting in styles:
if style == 'text-align' and setting != 'left':
text_align = style+':'+setting+';'
# NOTE(review): int() here only accepts whole numbers once 'pt' is stripped
if style == 'text-indent':
setting = int(re.sub('\\s*pt\\s*', '', setting))
if 9 < setting < 14:
text_indent = indented_text
else:
text_indent = style+':'+unicode_type(setting)+'pt;'
# padding is split on single spaces and indexed 0..3; the debug messages
# further down label those positions top, right, bottom, left
if style == 'padding':
setting = re.sub('pt', '', setting).split(' ')
if int(setting[1]) < 16 and int(setting[3]) < 16:
if self.in_blockquote:
debugabby = True
if is_paragraph:
self.in_blockquote = False
blockquote_close_loop = blockquote_close
if int(setting[3]) > 8 and text_indent == '':
text_indent = indented_text
if int(setting[0]) > 5:
paragraph_before = empty_paragraph
if int(setting[2]) > 5:
paragraph_after = empty_paragraph
elif not self.in_blockquote and self.previous_was_paragraph:
debugabby = True
self.in_blockquote = True
blockquote_open_loop = blockquote_open
if debugabby:
self.log.debug('\n\n******\n')
self.log.debug('padding top is: '+unicode_type(setting[0]))
self.log.debug('padding right is:' +unicode_type(setting[1]))
self.log.debug('padding bottom is: ' + unicode_type(setting[2]))
self.log.debug('padding left is: ' +unicode_type(setting[3]))
# print "text-align is: "+unicode_type(text_align)
# print "\n***\nline is:\n "+unicode_type(match.group(0))+'\n'
if debugabby:
# print "this line is a paragraph = "+unicode_type(is_paragraph)+", previous line was "+unicode_type(self.previous_was_paragraph)
self.log.debug("styles for this line were:", styles)
self.log.debug('newline is:')
self.log.debug(blockquote_open_loop+blockquote_close_loop+
paragraph_before+'<p style="'+text_indent+text_align+
'">'+content+'</p>'+paragraph_after+'\n\n\n\n\n')
# print "is_paragraph is "+unicode_type(is_paragraph)+", previous_was_paragraph is "+unicode_type(self.previous_was_paragraph)
self.previous_was_paragraph = is_paragraph
# print "previous_was_paragraph is now set to "+unicode_type(self.previous_was_paragraph)+"\n\n\n"
return blockquote_open_loop+blockquote_close_loop+paragraph_before+'<p style="'+text_indent+text_align+'">'+content+'</p>'+paragraph_after
html = abbyy_line.sub(convert_styles, html)
return html
def __call__(self, html):
'''
Run the heuristic processing pipeline over html and return the result.

Which steps run is controlled by attributes of self.extra_opts
(fix_indents, markup_chapter_headings, italicize_common_cases,
delete_blank_paragraphs, unwrap_lines, dehyphenate, renumber_headings,
format_scene_breaks, replace_scene_breaks). Documents of fewer than 50
words are returned unchanged.
'''
self.log.debug("********* Heuristic processing HTML *********")
# Count the words in the document to estimate how many chapters to look for and whether
# other types of processing are attempted
# NOTE(review): the bare except also traps KeyboardInterrupt/SystemExit, and after a
# failure self.totalwords keeps whatever value it had before (its initialisation is
# outside this chunk) - confirm it is always set.
try:
self.totalwords = self.get_word_count(html)
except:
self.log.warn("Can't get wordcount")
if self.totalwords < 50:
self.log.warn("flow is too short, not running heuristics")
return html
is_abbyy = self.is_abbyy(html)
if is_abbyy:
html = self.abbyy_processor(html)
# Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
html = self.arrange_htm_line_endings(html)
# self.dump(html, 'after_arrange_line_endings')
if self.cleanup_required():
# ##### Check Markup ######
#
# some lit files don't have any <p> tags or equivalent (generally just plain text between
# <pre> tags), check and mark up line endings if required before proceeding
# fix indents must run after this step
if self.no_markup(html, 0.1):
self.log.debug("not enough paragraph markers, adding now")
# markup using text processing
html = self.markup_pre(html)
# Replace series of non-breaking spaces with text-indent
if getattr(self.extra_opts, 'fix_indents', False):
html = self.fix_nbsp_indents(html)
if self.cleanup_required():
# fix indents must run before this step, as it removes non-breaking spaces
html = self.cleanup_markup(html)
# pdftohtml output gets simpler line patterns: <p> with at most one i/b/u wrapper
is_pdftohtml = self.is_pdftohtml(html)
if is_pdftohtml:
self.line_open = "<(?P<outer>p)[^>]*>(\\s*<[ibu][^>]*>)?\\s*"
self.line_close = "\\s*(</[ibu][^>]*>\\s*)?</(?P=outer)>"
# ADE doesn't render <br />, change to empty paragraphs
# html = re.sub('<br[^>]*>', u'<p>\u00a0</p>', html)
# Determine whether the document uses interleaved blank lines
self.blanks_between_paragraphs = self.analyze_blanks(html)
# detect chapters/sections to match xpath or splitting logic
if getattr(self.extra_opts, 'markup_chapter_headings', False):
html = self.markup_chapters(html, self.totalwords, self.blanks_between_paragraphs)
# self.dump(html, 'after_chapter_markup')
if getattr(self.extra_opts, 'italicize_common_cases', False):
html = self.markup_italicis(html)
# If more than 40% of the lines are empty paragraphs and the user has enabled delete
# blank paragraphs then delete blank lines to clean up spacing
if self.blanks_between_paragraphs and getattr(self.extra_opts, 'delete_blank_paragraphs', False):
self.log.debug("deleting blank lines")
self.blanks_deleted = True
html = self.multi_blank.sub('\n<p class="softbreak" style="margin-top:.5em; page-break-before:avoid; text-align:center"> </p>', html)
html = self.blankreg.sub('', html)
# Determine line ending type
# Some OCR sourced files have line breaks in the html using a combination of span & p tags
# span are used for hard line breaks, p for new paragraphs. Determine which is used so
# that lines can be un-wrapped across page boundaries
format = self.analyze_line_endings(html)
# Check Line histogram to determine if the document uses hard line breaks, If 50% or
# more of the lines break in the same region of the document then unwrapping is required
docanalysis = DocAnalysis(format, html)
hardbreaks = docanalysis.line_histogram(.50)
self.log.debug("Hard line breaks check returned "+unicode_type(hardbreaks))
# Calculate Length
unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4)
length = docanalysis.line_length(unwrap_factor)
self.log.debug("Median line length is " + unicode_type(length) + ", calculated with " + format + " format")
# ##### Unwrap lines ######
if getattr(self.extra_opts, 'unwrap_lines', False):
# only go through unwrapping code if the histogram shows unwrapping is required or if the user decreased the default unwrap_factor
if hardbreaks or unwrap_factor < 0.4:
self.log.debug("Unwrapping required, unwrapping Lines")
# Dehyphenate with line length limiters
dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
html = dehyphenator(html,'html', length)
html = self.punctuation_unwrap(length, html, 'html')
if getattr(self.extra_opts, 'dehyphenate', False):
# dehyphenate in cleanup mode to fix anything previous conversions/editing missed
self.log.debug("Fixing hyphenated content")
dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
html = dehyphenator(html,'html_cleanup', length)
html = dehyphenator(html, 'individual_words', length)
# If still no sections after unwrapping mark split points on lines with no punctuation
if self.html_preprocess_sections < self.min_chapters and getattr(self.extra_opts, 'markup_chapter_headings', False):
self.log.debug("Looking for more split points based on punctuation,"
" currently have " + unicode_type(self.html_preprocess_sections))
chapdetect3 = re.compile(
r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(?!([\W]+\s*)+)'
r'(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*'
r'.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*'
r'(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
html = chapdetect3.sub(self.chapter_break, html)
if getattr(self.extra_opts, 'renumber_headings', False):
# search for places where a first or second level heading is immediately followed by another
# top level heading. demote the second heading to h3 to prevent splitting between chapter
# headings and titles, images, etc
doubleheading = re.compile(
r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
html = doubleheading.sub('\\g<firsthead>'+'\n<h3'+'\\g<secondhead>'+'</h3>', html)
# If scene break formatting is enabled, find all blank paragraphs that definitely aren't scenebreaks,
# style it with the 'whitespace' class. All remaining blank lines are styled as softbreaks.
# Multiple sequential blank paragraphs are merged with appropriate margins
# If non-blank scene breaks exist they are center aligned and styled with appropriate margins.
if getattr(self.extra_opts, 'format_scene_breaks', False):
self.log.debug('Formatting scene breaks')
html = re.sub('(?i)<div[^>]*>\\s*<br(\\s?/)?>\\s*</div>', '<p></p>', html)
html = self.detect_scene_breaks(html)
html = self.detect_whitespace(html)
html = self.detect_soft_breaks(html)
blanks_count = len(self.any_multi_blank.findall(html))
if blanks_count >= 1:
html = self.merge_blanks(html, blanks_count)
detected_scene_break = re.compile(r'<p class="scenebreak"[^>]*>.*?</p>')
scene_break_count = len(detected_scene_break.findall(html))
# If the user has enabled scene break replacement, then either softbreaks
# or 'hard' scene breaks are replaced, depending on which is in use
# Otherwise separator lines are centered, use a bit larger margin in this case
# NOTE(review): replacement_break is passed to sub()/re.sub() as a replacement
# template, so backslashes coming from the user's string would be interpreted as
# escapes or group references - confirm whether it should be made literal.
replacement_break = getattr(self.extra_opts, 'replace_scene_breaks', None)
if replacement_break:
replacement_break = self.markup_user_break(replacement_break)
if scene_break_count >= 1:
html = detected_scene_break.sub(replacement_break, html)
html = re.sub('<p\\s+class="softbreak"[^>]*>\\s*</p>', replacement_break, html)
else:
html = re.sub('<p\\s+class="softbreak"[^>]*>\\s*</p>', replacement_break, html)
if self.deleted_nbsps:
# put back non-breaking spaces in empty paragraphs so they render correctly
html = self.anyblank.sub('\n'+r'\g<openline>'+'\u00a0'+r'\g<closeline>', html)
return html

View File

@@ -0,0 +1,11 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
class InvalidDOCX(ValueError):
    '''Error type for input that cannot be handled as a DOCX file.'''

View File

@@ -0,0 +1,478 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import numbers
from collections import OrderedDict
from polyglot.builtins import iteritems
class Inherit(object):
    '''
    Type of the ``inherit`` sentinel, used as the value of any style
    property that has not been set explicitly.

    An instance is equal only to itself, never compares as less than
    anything and compares as greater than every other object.
    '''

    def __eq__(self, other):
        return self is other

    def __hash__(self):
        return id(self)

    def __lt__(self, other):
        return False

    def __gt__(self, other):
        return self is not other

    def __ge__(self, other):
        # Either identical (equal) or greater than anything else
        return True

    def __le__(self, other):
        # Never less than, so only equality can satisfy <=
        return self is other


inherit = Inherit()
def binary_property(parent, name, XPath, get):
    '''
    Read the on/off child element ``w:<name>`` of parent.

    Returns ``inherit`` when there is no such child. Otherwise returns True
    if the ``w:val`` of the first match (taken to be 'on' when the attribute
    is missing) is one of 'on', '1' or 'true', and False for any other value.
    '''
    matches = XPath('./w:%s' % name)(parent)
    if not matches:
        return inherit
    return get(matches[0], 'w:val', 'on') in {'on', '1', 'true'}
def simple_color(col, auto='black'):
    '''
    Convert a six character color value into ``#`` notation.

    Empty values, the keyword 'auto' and anything that is not exactly six
    characters long yield the ``auto`` fallback instead.
    '''
    usable = bool(col) and col != 'auto' and len(col) == 6
    return '#' + col if usable else auto
def simple_float(val, mult=1.0):
    '''
    Return ``float(val) * mult``, or None when val cannot be converted.
    '''
    try:
        scaled = float(val) * mult
    except (ValueError, TypeError, AttributeError, KeyError):
        return None
    return scaled
def twips(val, mult=0.05):
    '''
    Parse val as either a pure number representing twentieths of a point or
    a number followed by the suffix pt, representing pts.

    Returns the value in points, or None if val cannot be parsed.
    '''
    try:
        return float(val) * mult
    except (ValueError, TypeError, AttributeError, KeyError):
        has_pt_suffix = bool(val) and val.endswith('pt')
        # The mult check restricts the suffix retry to the default
        # multiplier; the retry itself runs with mult=1.0 and so cannot
        # recurse any further.
        if not (has_pt_suffix and mult == 0.05):
            return None
        return twips(val[:-2], mult=1.0)
# Maps the border style names read from the w:val attribute of border
# elements to the CSS border-style keyword emitted for them. Names missing
# from this table are treated as 'solid' by the code that looks them up.
LINE_STYLES = { # {{{
'basicBlackDashes': 'dashed',
'basicBlackDots': 'dotted',
'basicBlackSquares': 'dashed',
'basicThinLines': 'solid',
'dashDotStroked': 'groove',
'dashed': 'dashed',
'dashSmallGap': 'dashed',
'dotDash': 'dashed',
'dotDotDash': 'dashed',
'dotted': 'dotted',
'double': 'double',
'inset': 'inset',
'nil': 'none',
'none': 'none',
'outset': 'outset',
'single': 'solid',
'thick': 'solid',
'thickThinLargeGap': 'double',
'thickThinMediumGap': 'double',
'thickThinSmallGap' : 'double',
'thinThickLargeGap': 'double',
'thinThickMediumGap': 'double',
'thinThickSmallGap': 'double',
'thinThickThinLargeGap': 'double',
'thinThickThinMediumGap': 'double',
'thinThickThinSmallGap': 'double',
'threeDEmboss': 'ridge',
'threeDEngrave': 'groove',
'triple': 'double',
} # }}}
# Read from XML {{{
# Attribute name templates for a single border edge; the %s is filled in
# with an edge name from border_edges (e.g. 'border_left_width').
border_props = ('padding_%s', 'border_%s_width', 'border_%s_style', 'border_%s_color')
# Edge names read for paragraph borders. 'between' is kept last: the code
# that only wants the four box edges slices it off with border_edges[:-1].
border_edges = ('left', 'top', 'right', 'bottom', 'between')
def read_single_border(parent, edge, XPath, get):
    '''
    Read the ``w:<edge>`` children of parent.

    Returns a dict keyed by the (unformatted) templates in border_props
    holding the padding, width, style and color that were found. Anything
    that is absent or unparseable is left as None. When several matching
    elements exist, later ones override earlier ones.
    '''
    padding = width = style = color = None
    for elem in XPath('./w:%s' % edge)(parent):
        raw_color = get(elem, 'w:color')
        if raw_color is not None:
            color = simple_color(raw_color)
        raw_style = get(elem, 'w:val')
        if raw_style is not None:
            style = LINE_STYLES.get(raw_style, 'solid')
        raw_space = get(elem, 'w:space')
        if raw_space is not None:
            try:
                padding = float(raw_space)
            except (ValueError, TypeError):
                pass
        raw_size = get(elem, 'w:sz')
        if raw_size is not None:
            # We do not care about art borders (they are only used for
            # page borders). The size is clamped to 2..96 before being
            # divided by 8.
            try:
                width = min(96, max(2, float(raw_size))) / 8
            except (ValueError, TypeError):
                pass
    return dict(zip(border_props, (padding, width, style, color)))
def read_border(parent, dest, XPath, get, border_edges=border_edges, name='pBdr'):
    '''
    Read the ``w:<name>`` border container(s) of parent and store the
    result on dest.

    For every edge in border_edges and every template in border_props an
    attribute is set on dest. Attributes for which nothing was found are
    set to ``inherit``.
    '''
    resolved = {}
    for edge in border_edges:
        for template in border_props:
            resolved[template % edge] = inherit
    for border in XPath('./w:' + name)(parent):
        for edge in border_edges:
            found = read_single_border(border, edge, XPath, get)
            resolved.update(
                (template % edge, value)
                for template, value in iteritems(found) if value is not None)
    for attr, value in iteritems(resolved):
        setattr(dest, attr, value)
def border_to_css(edge, style, css):
    '''
    Copy the border style, color and width of one edge from the style
    object into the css mapping.

    Values that are ``inherit`` or None are skipped. Numeric widths are
    written out in pt.
    '''
    b_style, b_color, b_width = (
        getattr(style, 'border_%s_%s' % (edge, part))
        for part in ('style', 'color', 'width'))
    if isinstance(b_width, numbers.Number):
        # WebKit needs at least 1pt to render borders and 3pt to render
        # double borders
        minimum = 3 if b_style == 'double' else 1
        b_width = '%.3gpt' % max(b_width, minimum)
    for part, value in (('style', b_style), ('color', b_color), ('width', b_width)):
        if value is not inherit and value is not None:
            css['border-%s-%s' % (edge, part)] = value
def read_indent(parent, dest, XPath, get):
    '''
    Read the ``w:ind`` children of parent and set margin_left, margin_right
    and text_indent on dest (``inherit`` when nothing usable was found).

    The ``*Chars`` attributes take precedence over their plain
    counterparts and produce em values (scaled by 0.01); the plain
    attributes produce pt values (scaled by 0.05). A hanging indent takes
    precedence over a first line indent and is stored as a negative
    text_indent.
    '''
    margin_left = margin_right = text_indent = inherit

    def side_margin(plain, chars):
        # Returns the formatted length, or None if there is nothing usable
        if chars is not None:
            amount, unit = simple_float(chars, 0.01), 'em'
        elif plain is not None:
            amount, unit = simple_float(plain, 0.05), 'pt'
        else:
            return None
        return None if amount is None else '%.3g%s' % (amount, unit)

    for indent in XPath('./w:ind')(parent):
        left = side_margin(get(indent, 'w:left'), get(indent, 'w:leftChars'))
        if left is not None:
            margin_left = left
        right = side_margin(get(indent, 'w:right'), get(indent, 'w:rightChars'))
        if right is not None:
            margin_right = right

        hanging, hanging_chars = get(indent, 'w:hanging'), get(indent, 'w:hangingChars')
        first_line, first_line_chars = get(indent, 'w:firstLine'), get(indent, 'w:firstLineChars')
        # In order of precedence: only the first attribute that is present
        # is considered, even if its value turns out to be unparseable.
        candidates = (
            (None if hanging_chars is None else '-' + hanging_chars, 0.01, 'em'),
            (None if hanging is None else '-' + hanging, 0.05, 'pt'),
            (first_line_chars, 0.01, 'em'),
            (first_line, 0.05, 'pt'),
        )
        for raw, factor, unit in candidates:
            if raw is None:
                continue
            amount = simple_float(raw, factor)
            if amount is not None:
                text_indent = '%.3g%s' % (amount, unit)
            break

    setattr(dest, 'margin_left', margin_left)
    setattr(dest, 'margin_right', margin_right)
    setattr(dest, 'text_indent', text_indent)
def read_justification(parent, dest, XPath, get):
    '''
    Read the ``w:jc`` children of parent and set text_align on dest.

    'both', 'distribute' and every value containing 'thai' or 'kashida'
    become 'justify'. 'left', 'center', 'right', 'start' and 'end' are
    used unchanged. Any other value is ignored. If nothing usable is found
    text_align is set to ``inherit``.
    '''
    ans = inherit
    for jc in XPath('./w:jc[@w:val]')(parent):
        val = get(jc, 'w:val')
        if not val:
            continue
        if val in {'both', 'distribute'} or 'thai' in val or 'kashida' in val:
            ans = 'justify'
        elif val in {'left', 'center', 'right', 'start', 'end'}:
            # 'start' and 'end' are passed through as they are. A further
            # branch that mapped them to 'left'/'right' used to follow
            # this one, but it could never be reached and was removed.
            ans = val
    setattr(dest, 'text_align', ans)
def read_spacing(parent, dest, XPath, get):
    '''
    Read the ``w:spacing`` children of parent and set margin_top,
    margin_bottom and line_height on dest (``inherit`` when nothing usable
    was found).

    Before/after spacing is ignored when the matching autospacing flag is
    on. The ``*Lines`` attributes take precedence and yield ex values
    (scaled by 0.02), the plain attributes yield pt values (scaled by
    0.05). The line height is in pt for the 'exact' and 'atLeast' rules and
    a unitless multiple (value/240) otherwise.
    '''
    switched_on = {'on', '1', 'true'}
    fixed_rules = {'exact', 'atLeast'}

    def margin(plain, lines, autospacing):
        # Returns the formatted length, or None if there is nothing usable
        if autospacing in switched_on:
            return None
        if lines is not None:
            amount, unit = simple_float(lines, 0.02), 'ex'
        elif plain is not None:
            amount, unit = simple_float(plain, 0.05), 'pt'
        else:
            return None
        return None if amount is None else '%.3g%s' % (amount, unit)

    margin_top = margin_bottom = line_height = inherit
    for spacing in XPath('./w:spacing')(parent):
        bottom = margin(
            get(spacing, 'w:after'), get(spacing, 'w:afterLines'), get(spacing, 'w:afterAutospacing'))
        if bottom is not None:
            margin_bottom = bottom
        top = margin(
            get(spacing, 'w:before'), get(spacing, 'w:beforeLines'), get(spacing, 'w:beforeAutospacing'))
        if top is not None:
            margin_top = top
        line, rule = get(spacing, 'w:line'), get(spacing, 'w:lineRule', 'auto')
        if line is not None:
            if rule in fixed_rules:
                amount, unit = simple_float(line, 0.05), 'pt'
            else:
                amount, unit = simple_float(line, 1/240.0), ''
            if amount is not None:
                line_height = '%.3g%s' % (amount, unit)

    setattr(dest, 'margin_top', margin_top)
    setattr(dest, 'margin_bottom', margin_bottom)
    setattr(dest, 'line_height', line_height)
def read_shd(parent, dest, XPath, get):
    '''
    Read the fill color of the ``w:shd`` children of parent and set
    background_color on dest (``inherit`` when there is none). A fill that
    is not a usable color results in 'transparent'.
    '''
    background = inherit
    for shd in XPath('./w:shd[@w:fill]')(parent):
        fill = get(shd, 'w:fill')
        if fill:
            background = simple_color(fill, auto='transparent')
    dest.background_color = background
def read_numbering(parent, dest, XPath, get):
    '''
    Read the ``w:numPr`` children of parent and set numbering_id (the raw
    ``w:numId`` value) and numbering_level (``w:ilvl`` as an int) on dest.
    Whatever is missing or unparseable is set to ``inherit``.
    '''
    level = numbering_id = inherit
    for num_pr in XPath('./w:numPr')(parent):
        for ilvl in XPath('./w:ilvl[@w:val]')(num_pr):
            try:
                level = int(get(ilvl, 'w:val'))
            except (ValueError, TypeError):
                pass
        for num in XPath('./w:numId[@w:val]')(num_pr):
            numbering_id = get(num, 'w:val')
    dest.numbering_id = numbering_id
    dest.numbering_level = level
class Frame(object):
    '''
    The frame properties read from a ``w:framePr`` element.

    Lengths (h, w, x, y, h_space, v_space) are stored as the attribute
    value divided by 20. Two frames are equal when all the attributes
    listed in all_attributes are equal.
    '''

    all_attributes = ('drop_cap', 'h', 'w', 'h_anchor', 'h_rule', 'v_anchor', 'wrap',
                      'h_space', 'v_space', 'lines', 'x_align', 'y_align', 'x', 'y')

    def __init__(self, fp, XPath, get):
        def length(attr, fallback):
            # Integer attribute divided by 20; fallback if missing or invalid
            try:
                return int(get(fp, attr)) / 20
            except (ValueError, TypeError):
                return fallback

        self.drop_cap = get(fp, 'w:dropCap', 'none')
        self.h = length('w:h', 0)
        self.w = length('w:w', None)
        self.x = length('w:x', 0)
        self.y = length('w:y', 0)

        self.h_anchor = get(fp, 'w:hAnchor', 'page')
        self.h_rule = get(fp, 'w:hRule', 'auto')
        self.v_anchor = get(fp, 'w:vAnchor', 'page')
        self.wrap = get(fp, 'w:wrap', 'around')
        self.x_align = get(fp, 'w:xAlign')
        self.y_align = get(fp, 'w:yAlign')

        self.h_space = length('w:hSpace', 0)
        self.v_space = length('w:vSpace', 0)
        try:
            self.lines = int(get(fp, 'w:lines'))
        except (ValueError, TypeError):
            self.lines = 1

    def css(self, page):
        '''
        Return a dict of CSS properties for this frame. page must provide
        a ``width`` attribute; it is used to decide the float side when no
        explicit horizontal alignment is set.
        '''
        if self.drop_cap in {'drop', 'margin'}:
            return {
                'overflow': 'hidden',
                'float': 'left',
                'margin': '0',
                'padding-right': '0.2em',
            }

        rules = {'overflow': 'hidden'}
        if self.h_rule != 'auto':
            height_prop = 'min-height' if self.h_rule == 'atLeast' else 'height'
            rules[height_prop] = '%.3gpt' % self.h
        if self.w is not None:
            rules['width'] = '%.3gpt' % self.w
        vertical = '%.3gpt' % self.v_space
        rules['padding-top'] = vertical
        rules['padding-bottom'] = vertical
        if self.wrap not in {None, 'none'}:
            horizontal = '%.3gpt' % self.h_space
            rules['padding-left'] = horizontal
            rules['padding-right'] = horizontal
            if self.x_align is None:
                # Float to whichever half of the page the frame starts in
                side = 'left' if self.x/page.width < 0.5 else 'right'
            else:
                side = 'right' if self.x_align == 'right' else 'left'
            rules['float'] = side
        return rules

    def __eq__(self, other):
        return not any(
            getattr(other, attr, inherit) != getattr(self, attr)
            for attr in self.all_attributes)

    def __ne__(self, other):
        return not self.__eq__(other)
def read_frame(parent, dest, XPath, get):
    '''
    Set dest.frame to a Frame built from the last ``w:framePr`` child of
    parent, or to ``inherit`` when there is none.
    '''
    frame = inherit
    for frame_pr in XPath('./w:framePr')(parent):
        frame = Frame(frame_pr, XPath, get)
    dest.frame = frame
# }}}
class ParagraphStyle(object):
    '''
    The paragraph level formatting read from a paragraph properties
    element. Every name in all_properties is available as an attribute;
    properties that were not specified have the value ``inherit``.
    '''

    all_properties = (
        'adjustRightInd', 'autoSpaceDE', 'autoSpaceDN', 'bidi',
        'contextualSpacing', 'keepLines', 'keepNext', 'mirrorIndents',
        'pageBreakBefore', 'snapToGrid', 'suppressLineNumbers',
        'suppressOverlap', 'topLinePunct', 'widowControl', 'wordWrap',

        # Border margins padding
        'border_left_width', 'border_left_style', 'border_left_color', 'padding_left',
        'border_top_width', 'border_top_style', 'border_top_color', 'padding_top',
        'border_right_width', 'border_right_style', 'border_right_color', 'padding_right',
        'border_bottom_width', 'border_bottom_style', 'border_bottom_color', 'padding_bottom',
        'border_between_width', 'border_between_style', 'border_between_color', 'padding_between',
        'margin_left', 'margin_top', 'margin_right', 'margin_bottom',

        # Misc.
        'text_indent', 'text_align', 'line_height', 'background_color',
        'numbering_id', 'numbering_level', 'font_family', 'font_size', 'color', 'frame',
        'cs_font_size', 'cs_font_family',
    )

    def __init__(self, namespace, pPr=None):
        self.namespace = namespace
        self.linked_style = None
        if pPr is None:
            # Nothing to read: everything is inherited
            for prop in self.all_properties:
                setattr(self, prop, inherit)
        else:
            XPath, get = namespace.XPath, namespace.get
            on_off_properties = (
                'adjustRightInd', 'autoSpaceDE', 'autoSpaceDN', 'bidi',
                'contextualSpacing', 'keepLines', 'keepNext', 'mirrorIndents',
                'pageBreakBefore', 'snapToGrid', 'suppressLineNumbers',
                'suppressOverlap', 'topLinePunct', 'widowControl', 'wordWrap',
            )
            for prop in on_off_properties:
                setattr(self, prop, binary_property(pPr, prop, XPath, get))

            for kind in ('border', 'indent', 'justification', 'spacing', 'shd', 'numbering', 'frame'):
                read_funcs[kind](pPr, self, XPath, get)

            for elem in XPath('./w:pStyle[@w:val]')(pPr):
                self.linked_style = get(elem, 'w:val')

            # These are not read from pPr, so they always start out inherited
            for prop in ('font_family', 'font_size', 'color', 'cs_font_size', 'cs_font_family'):
                setattr(self, prop, inherit)

        self._css = None
        self._border_key = None

    def update(self, other):
        '''Take over every property of other that is not ``inherit``.'''
        for prop in self.all_properties:
            incoming = getattr(other, prop)
            if incoming is not inherit:
                setattr(self, prop, incoming)
        if other.linked_style is not None:
            self.linked_style = other.linked_style

    def resolve_based_on(self, parent):
        '''Fill every property that is ``inherit`` with the value from parent.'''
        for prop in self.all_properties:
            if getattr(self, prop) is inherit:
                setattr(self, prop, getattr(parent, prop))

    @property
    def css(self):
        '''The CSS for this style as an ordered mapping. Computed once and cached.'''
        if self._css is not None:
            return self._css
        rules = self._css = OrderedDict()
        page_breaks = (
            ('keepLines', 'page-break-inside', 'avoid'),
            ('pageBreakBefore', 'page-break-before', 'always'),
            ('keepNext', 'page-break-after', 'avoid'),
        )
        for flag, css_name, css_value in page_breaks:
            if getattr(self, flag) is True:
                rules[css_name] = css_value
        for edge in ('left', 'top', 'right', 'bottom'):
            border_to_css(edge, self, rules)
            padding = getattr(self, 'padding_%s' % edge)
            if padding is not inherit:
                rules['padding-%s' % edge] = '%.3gpt' % padding
            margin = getattr(self, 'margin_%s' % edge)
            if margin is not inherit:
                rules['margin-%s' % edge] = margin

        if self.line_height not in {inherit, '1'}:
            rules['line-height'] = self.line_height

        for attr in ('text_indent', 'background_color', 'font_family', 'font_size', 'color'):
            value = getattr(self, attr)
            if value is inherit:
                continue
            if attr == 'font_size':
                value = '%.3gpt' % value
            rules[attr.replace('_', '-')] = value

        align = self.text_align
        if align is not inherit:
            if self.bidi is True:
                # left and right swap meaning in bidi paragraphs
                align = {'left':'right', 'right':'left'}.get(align, align)
            rules['text-align'] = align
        return rules

    @property
    def border_key(self):
        '''A tuple of all border related values. Computed once and cached.'''
        if self._border_key is None:
            self._border_key = tuple(
                getattr(self, template % edge)
                for edge in border_edges for template in border_props)
        return self._border_key

    def has_identical_borders(self, other_style):
        return self.border_key == getattr(other_style, 'border_key', None)

    def clear_borders(self):
        '''Reset width, color and style of the four box edges to ``inherit``.'''
        for edge in border_edges[:-1]:
            for part in ('width', 'color', 'style'):
                setattr(self, 'border_%s_%s' % (edge, part), inherit)

    def clone_border_styles(self):
        '''Return a new, otherwise empty, style with the borders of the four box edges.'''
        clone = ParagraphStyle(self.namespace)
        for edge in border_edges[:-1]:
            for part in ('width', 'color', 'style'):
                name = 'border_%s_%s' % (edge, part)
                setattr(clone, name, getattr(self, name))
        return clone

    def apply_between_border(self):
        '''Use the "between" border as the bottom border.'''
        for part in ('width', 'color', 'style'):
            setattr(self, 'border_bottom_%s' % part, getattr(self, 'border_between_%s' % part))

    def has_visible_border(self):
        '''True if any of the four box edges has a non-zero width and a style other than none.'''
        for edge in border_edges[:-1]:
            width = getattr(self, 'border_%s_width' % edge)
            style = getattr(self, 'border_%s_style' % edge)
            if width is inherit or not width:
                continue
            if style is not inherit and style != 'none':
                return True
        return False
# Registry of every module level name that starts with 'read_', keyed by the
# name with that prefix removed (e.g. 'border' -> read_border). It has to be
# built after all the read_* functions have been defined.
read_funcs = {k[5:]:v for k, v in iteritems(globals()) if k.startswith('read_')}

View File

@@ -0,0 +1,302 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
from collections import OrderedDict
from calibre.ebooks.docx.block_styles import ( # noqa
inherit, simple_color, LINE_STYLES, simple_float, binary_property, read_shd)
# Read from XML {{{
def read_text_border(parent, dest, XPath, get):
    '''
    Read the ``w:bdr`` children of parent and set border_color,
    border_style, border_width and padding on dest.

    All four are ``inherit`` unless a border element is present. If the
    first border element has any attributes, color, style and width start
    out with defaults (auto color, 'none', 1) before the individual
    attributes are applied.
    '''
    border_color = border_style = border_width = padding = inherit
    elems = XPath('./w:bdr')(parent)
    if elems and elems[0].attrib:
        border_color = simple_color('auto')
        border_style = 'none'
        border_width = 1
    for elem in elems:
        raw_color = get(elem, 'w:color')
        if raw_color is not None:
            border_color = simple_color(raw_color)
        raw_style = get(elem, 'w:val')
        if raw_style is not None:
            border_style = LINE_STYLES.get(raw_style, 'solid')
        raw_space = get(elem, 'w:space')
        if raw_space is not None:
            try:
                padding = float(raw_space)
            except (ValueError, TypeError):
                pass
        raw_size = get(elem, 'w:sz')
        if raw_size is not None:
            # We do not care about art borders (they are only used for
            # page borders)
            try:
                # A border of less than 1pt is not rendered by WebKit
                border_width = min(96, max(8, float(raw_size))) / 8
            except (ValueError, TypeError):
                pass

    for attr, value in (
        ('border_color', border_color), ('border_style', border_style),
        ('border_width', border_width), ('padding', padding),
    ):
        setattr(dest, attr, value)
def read_color(parent, dest, XPath, get):
    '''
    Set dest.color from the ``w:color`` children of parent (``inherit``
    when there is no non-empty value).
    '''
    color = inherit
    for elem in XPath('./w:color[@w:val]')(parent):
        raw = get(elem, 'w:val')
        if raw:
            color = simple_color(raw)
    dest.color = color
def convert_highlight_color(val):
    '''
    Translate the dark*/lightGray highlight color names into hex colors.
    Every other value is returned unchanged.
    '''
    hex_colors = {
        'darkBlue': '#000080', 'darkCyan': '#008080', 'darkGray': '#808080',
        'darkGreen': '#008000', 'darkMagenta': '#800080', 'darkRed': '#800000', 'darkYellow': '#808000',
        'lightGray': '#c0c0c0'}
    try:
        return hex_colors[val]
    except KeyError:
        return val
def read_highlight(parent, dest, XPath, get):
    '''
    Set dest.highlight from the ``w:highlight`` children of parent:
    'transparent' for the value 'none', the converted color otherwise and
    ``inherit`` when there is no non-empty value.
    '''
    highlight = inherit
    for elem in XPath('./w:highlight[@w:val]')(parent):
        raw = get(elem, 'w:val')
        if not raw:
            continue
        highlight = 'transparent' if raw == 'none' else convert_highlight_color(raw)
    setattr(dest, 'highlight', highlight)
def read_lang(parent, dest, XPath, get):
    '''
    Set dest.lang from the ``w:lang`` children of parent.

    A value that parses as a hexadecimal number is looked up in the lcid
    table and ignored if the table has no entry for it. Every other value
    is used as it is. dest.lang is ``inherit`` if nothing usable is found.
    '''
    lang = inherit
    for elem in XPath('./w:lang[@w:val]')(parent):
        raw = get(elem, 'w:val')
        if not raw:
            continue
        try:
            # NOTE(review): short tags made up only of hex digits (e.g. 'de')
            # also parse as numbers here - confirm that is intended.
            code = int(raw, 16)
        except (ValueError, TypeError):
            lang = raw
            continue
        from calibre.ebooks.docx.lcid import lcid
        mapped = lcid.get(code, None)
        if mapped:
            lang = mapped
    setattr(dest, 'lang', lang)
def read_letter_spacing(parent, dest, XPath, get):
    '''
    Set dest.letter_spacing to the ``w:spacing`` value of parent scaled by
    0.05, or to ``inherit`` when there is no parseable value.
    '''
    spacing = inherit
    for elem in XPath('./w:spacing[@w:val]')(parent):
        parsed = simple_float(get(elem, 'w:val'), 0.05)
        if parsed is not None:
            spacing = parsed
    dest.letter_spacing = spacing
def read_underline(parent, dest, XPath, get):
    '''
    Set dest.text_decoration from the ``w:u`` children of parent: 'none'
    stays 'none', every other non-empty value becomes 'underline' and
    ``inherit`` is used when there is no non-empty value.
    '''
    decoration = inherit
    for elem in XPath('./w:u[@w:val]')(parent):
        raw = get(elem, 'w:val')
        if not raw:
            continue
        decoration = raw if raw == 'none' else 'underline'
    dest.text_decoration = decoration
def read_vert_align(parent, dest, XPath, get):
    '''
    Set dest.vert_align to the ``w:vertAlign`` value of parent if it is one
    of 'baseline', 'subscript' or 'superscript', else to ``inherit``.
    '''
    known = {'baseline', 'subscript', 'superscript'}
    align = inherit
    for elem in XPath('./w:vertAlign[@w:val]')(parent):
        raw = get(elem, 'w:val')
        if not raw:
            continue
        if raw in known:
            align = raw
    dest.vert_align = align
def read_position(parent, dest, XPath, get):
    '''
    Set dest.position to half the ``w:position`` value of parent, or to
    ``inherit`` when there is no parseable value.
    '''
    position = inherit
    for elem in XPath('./w:position[@w:val]')(parent):
        raw = get(elem, 'w:val')
        try:
            position = float(raw)/2.0
        except Exception:
            # Unparseable values are ignored
            continue
    dest.position = position
def read_font(parent, dest, XPath, get):
    '''
    Set font_family and font_size on dest.

    The family comes from the ``w:rFonts`` children of parent: a theme
    reference (``w:asciiTheme``) is preferred and stored as ``|name|``,
    otherwise ``w:ascii`` is used. The size is the first parseable
    ``w:sz`` value multiplied by 0.5. Both default to ``inherit``.
    '''
    family = inherit
    for elem in XPath('./w:rFonts')(parent):
        theme = get(elem, 'w:asciiTheme')
        name = '|%s|' % theme if theme else get(elem, 'w:ascii')
        if name:
            family = name
    setattr(dest, 'font_family', family)

    size = inherit
    for elem in XPath('./w:sz[@w:val]')(parent):
        parsed = simple_float(get(elem, 'w:val'), 0.5)
        if parsed is not None:
            size = parsed
            break
    setattr(dest, 'font_size', size)
def read_font_cs(parent, dest, XPath, get):
    '''
    Set cs_font_family and cs_font_size on dest.

    The family comes from the ``w:rFonts`` children of parent: a theme
    reference (``w:csTheme``) is preferred and stored as ``|name|``,
    otherwise ``w:cs`` is used. The size is the first parseable ``w:szCS``
    value multiplied by 0.5. Both default to ``inherit``.
    '''
    ff = inherit
    for col in XPath('./w:rFonts')(parent):
        val = get(col, 'w:csTheme')
        if val:
            val = '|%s|' % val
        else:
            val = get(col, 'w:cs')
        if val:
            ff = val
    setattr(dest, 'cs_font_family', ff)
    for col in XPath('./w:szCS[@w:val]')(parent):
        val = simple_float(get(col, 'w:val'), 0.5)
        if val is not None:
            # This used to be stored as font_size, which clobbered the
            # regular size and left cs_font_size unset on dest.
            setattr(dest, 'cs_font_size', val)
            return
    setattr(dest, 'cs_font_size', inherit)
# }}}
class RunStyle(object):
    '''
    The character level formatting read from a run properties element.
    Every name in all_properties is available as an attribute; properties
    that were not specified have the value ``inherit``.
    '''

    all_properties = {
        'b', 'bCs', 'caps', 'cs', 'dstrike', 'emboss', 'i', 'iCs', 'imprint',
        'rtl', 'shadow', 'smallCaps', 'strike', 'vanish', 'webHidden',

        'border_color', 'border_style', 'border_width', 'padding', 'color', 'highlight', 'background_color',
        'letter_spacing', 'font_size', 'text_decoration', 'vert_align', 'lang', 'font_family', 'position',
        'cs_font_size', 'cs_font_family'
    }

    toggle_properties = {
        'b', 'bCs', 'caps', 'emboss', 'i', 'iCs', 'imprint', 'shadow', 'smallCaps', 'strike', 'vanish',
    }

    def __init__(self, namespace, rPr=None):
        self.namespace = namespace
        self.linked_style = None
        if rPr is None:
            # Nothing to read: everything is inherited
            for p in self.all_properties:
                setattr(self, p, inherit)
        else:
            X, g = namespace.XPath, namespace.get
            for p in (
                'b', 'bCs', 'caps', 'cs', 'dstrike', 'emboss', 'i', 'iCs', 'imprint', 'rtl', 'shadow',
                'smallCaps', 'strike', 'vanish', 'webHidden',
            ):
                setattr(self, p, binary_property(rPr, p, X, g))

            read_font(rPr, self, X, g)
            read_font_cs(rPr, self, X, g)
            read_text_border(rPr, self, X, g)
            read_color(rPr, self, X, g)
            read_highlight(rPr, self, X, g)
            read_shd(rPr, self, X, g)
            read_letter_spacing(rPr, self, X, g)
            read_underline(rPr, self, X, g)
            read_vert_align(rPr, self, X, g)
            read_position(rPr, self, X, g)
            read_lang(rPr, self, X, g)

            for s in X('./w:rStyle[@w:val]')(rPr):
                self.linked_style = g(s, 'w:val')

        self._css = None

    def update(self, other):
        '''Take over every property of other that is not ``inherit``.'''
        for prop in self.all_properties:
            nval = getattr(other, prop)
            if nval is not inherit:
                setattr(self, prop, nval)
        if other.linked_style is not None:
            self.linked_style = other.linked_style

    def resolve_based_on(self, parent):
        '''Fill every property that is ``inherit`` with the value from parent.'''
        for p in self.all_properties:
            val = getattr(self, p)
            if val is inherit:
                setattr(self, p, getattr(parent, p))

    def get_border_css(self, ans):
        '''
        Add the border color, style and width that are not ``inherit`` to
        the mapping ans. The mapping is also returned, so that the result
        can be used directly (same_border() relies on this).
        '''
        for x in ('color', 'style', 'width'):
            val = getattr(self, 'border_'+x)
            if x == 'width' and val is not inherit:
                val = '%.3gpt' % val
            if val is not inherit:
                ans['border-%s' % x] = val
        return ans

    def clear_border_css(self):
        '''Reset border color, style and width to ``inherit``.'''
        for x in ('color', 'style', 'width'):
            setattr(self, 'border_'+x, inherit)

    @property
    def css(self):
        '''The CSS for this style as an ordered mapping. Computed once and cached.'''
        if self._css is None:
            c = self._css = OrderedDict()
            # A list (rather than a set) keeps the order of the emitted
            # values the same from one run to the next
            decorations = []
            if self.text_decoration is not inherit:
                decorations.append(self.text_decoration)
            for toggle in (self.strike, self.dstrike):
                # inherit is truthy, so it has to be excluded explicitly
                if toggle and toggle is not inherit and 'line-through' not in decorations:
                    decorations.append('line-through')
            if decorations:
                c['text-decoration'] = ' '.join(decorations)
            if self.caps is True:
                c['text-transform'] = 'uppercase'
            if self.i is True:
                c['font-style'] = 'italic'
            if self.shadow and self.shadow is not inherit:
                c['text-shadow'] = '2px 2px'
            if self.smallCaps is True:
                c['font-variant'] = 'small-caps'
            if self.vanish is True or self.webHidden is True:
                c['display'] = 'none'

            self.get_border_css(c)
            if self.padding is not inherit:
                c['padding'] = '%.3gpt' % self.padding

            for x in ('color', 'background_color'):
                val = getattr(self, x)
                if val is not inherit:
                    c[x.replace('_', '-')] = val

            for x in ('letter_spacing', 'font_size'):
                val = getattr(self, x)
                if val is not inherit:
                    c[x.replace('_', '-')] = '%.3gpt' % val

            if self.position is not inherit:
                c['vertical-align'] = '%.3gpt' % self.position

            if self.highlight is not inherit and self.highlight != 'transparent':
                c['background-color'] = self.highlight

            # Only an explicit True means bold. A plain truth test would
            # also accept the inherit sentinel, which is truthy.
            if self.b is True:
                c['font-weight'] = 'bold'

            if self.font_family is not inherit:
                c['font-family'] = self.font_family

        return self._css

    def same_border(self, other):
        '''True if this style and other produce the same border CSS.'''
        return self.get_border_css({}) == other.get_border_css({})

View File

@@ -0,0 +1,235 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import os
from polyglot.builtins import itervalues, range
# Non-breaking space; cleanup_markup() uses it as the text of page break
# paragraphs that would otherwise have no text.
NBSP = '\xa0'
def mergeable(previous, current):
    '''
    True if current can be merged into previous: neither has tail text,
    current has no id, both have the same class, style, lang and dir, and
    current is the sibling that immediately follows previous.
    '''
    if previous.tail or current.tail:
        return False
    if previous.get('class', None) != current.get('class', None):
        return False
    if current.get('id', False):
        return False
    if any(previous.get(attr) != current.get(attr) for attr in ('style', 'lang', 'dir')):
        return False
    return next(previous.itersiblings(), None) is current
def append_text(parent, text):
    '''
    Add text at the very end of the content of parent: after the last
    child if there is one, otherwise to the text of parent itself.
    '''
    if len(parent) > 0:
        last = parent[-1]
        last.tail = (last.tail or '') + text
    else:
        parent.text = (parent.text or '') + text
def merge(parent, span):
    '''
    Move the text, the children and the tail of span to the end of parent
    and remove span from its own parent.
    '''
    leading, trailing = span.text, span.tail
    if leading:
        append_text(parent, leading)
    for child in span:
        parent.append(child)
    if trailing:
        append_text(parent, trailing)
    span.getparent().remove(span)
def merge_run(run):
    '''Merge all elements of run into the first one.'''
    target, rest = run[0], run[1:]
    for span in rest:
        merge(target, span)
def liftable(css):
    '''
    A <span> is liftable if all its styling would work just as well if it
    is specified on the parent element. That is taken to be the case when
    every property name starts with text, font, letter, color or
    background (the part before the first hyphen is what is compared).
    '''
    allowed_prefixes = {'text', 'font', 'letter', 'color', 'background'}
    used_prefixes = {name.partition('-')[0] for name in css}
    return used_prefixes <= allowed_prefixes
def add_text(elem, attr, text):
    '''Append text to the string attribute attr of elem (None counts as empty).'''
    existing = getattr(elem, attr) or ''
    setattr(elem, attr, existing + text)
def lift(span):
    '''Replace an element by its content (text, children and tail).'''
    parent = span.getparent()
    idx = parent.index(span)
    last_child = span[-1] if len(span) else None

    def add_before_position(text):
        # Attach text to whatever ends just before position idx: the text
        # of the parent, or the tail of the preceding sibling
        if idx == 0:
            add_text(parent, 'text', text)
        else:
            add_text(parent[idx - 1], 'tail', text)

    if span.text:
        add_before_position(span.text)

    # Inserting in reverse at a fixed index keeps the children in order
    for child in reversed(span):
        parent.insert(idx, child)
    parent.remove(span)

    if span.tail:
        if last_child is None:
            add_before_position(span.tail)
        else:
            add_text(last_child, 'tail', span.tail)
def before_count(root, tag, limit=10):
    '''
    Count the descendants of the first <body> that come before tag in
    document order.

    Returns limit as soon as the count would exceed it, and also when
    there is no <body> or when tag is not found at all, so the result is
    always a number.
    '''
    body = root.xpath('//body[1]')
    if not body:
        return limit
    ans = 0
    for elem in body[0].iterdescendants():
        if elem is tag:
            return ans
        ans += 1
        if ans > limit:
            return limit
    # tag is not a descendant of <body>. This used to fall off the end and
    # return None, which callers cannot compare against a number.
    return limit
def wrap_contents(tag_name, elem):
    '''
    Move the text and all children of elem into a new <tag_name> element
    that becomes the only child of elem.
    '''
    wrapper = elem.makeelement(tag_name)
    wrapper.text, elem.text = elem.text, ''
    # Iterate over a snapshot: children are removed from elem inside the
    # loop, and mutating a container while iterating over it directly is
    # only safe if the iterator happens to tolerate it.
    for child in list(elem):
        elem.remove(child)
        wrapper.append(child)
    elem.append(wrapper)
def cleanup_markup(log, root, styles, dest_dir, detect_cover, XPath):
    '''
    Simplify the generated HTML tree in place.

    In order: contents of spans carrying a data-docx-vert attribute are
    wrapped in the tag it names, trailing <hr>s are moved out of their
    paragraph, adjacent identically styled spans are merged, dir attributes
    are normalized, spans that are the sole content of a block are folded
    into the block, bold/italic only spans become <b>/<i>, spans without
    any attributes are removed and <br> based page breaks are converted.

    If detect_cover is true and the first image looks like a cover, the
    image is removed from the tree and the path of its file is returned.
    In every other case the return value is None.
    '''
    # Apply vertical-align
    for span in root.xpath('//span[@data-docx-vert]'):
        wrap_contents(span.attrib.pop('data-docx-vert'), span)

    # Move <hr>s outside paragraphs, if possible.
    pancestor = XPath('|'.join('ancestor::%s[1]' % x for x in ('p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6')))
    for hr in root.xpath('//span/hr'):
        p = pancestor(hr)
        if p:
            p = p[0]
            descendants = tuple(p.iterdescendants())
            # Only an <hr> that is the very last thing in the block is moved
            if descendants[-1] is hr:
                parent = p.getparent()
                idx = parent.index(p)
                parent.insert(idx+1, hr)
                hr.tail = '\n\t'

    # Merge consecutive spans that have the same styling
    current_run = []
    for span in root.xpath('//span'):
        if not current_run:
            current_run.append(span)
        else:
            last = current_run[-1]
            if mergeable(last, span):
                current_run.append(span)
            else:
                if len(current_run) > 1:
                    merge_run(current_run)
                current_run = [span]
    # The loop above only merges a run when the span that follows it is
    # seen, so the final run has to be flushed here.
    if len(current_run) > 1:
        merge_run(current_run)

    # Process dir attributes
    class_map = dict(itervalues(styles.classes))
    parents = ('p', 'div') + tuple('h%d' % i for i in range(1, 7))
    for parent in root.xpath('//*[(%s)]' % ' or '.join('name()="%s"' % t for t in parents)):
        # Ensure that children of rtl parents that are not rtl have an
        # explicit dir set. Also, remove dir from children if it is the same as
        # that of the parent.
        if len(parent):
            parent_dir = parent.get('dir')
            for child in parent.iterchildren('span'):
                child_dir = child.get('dir')
                if parent_dir == 'rtl' and child_dir != 'rtl':
                    child_dir = 'ltr'
                    child.set('dir', child_dir)
                if child_dir and child_dir == parent_dir:
                    child.attrib.pop('dir')

    # Remove unnecessary span tags that are the only child of a parent block
    # element
    for parent in root.xpath('//*[(%s) and count(span)=1]' % ' or '.join('name()="%s"' % t for t in parents)):
        if len(parent) == 1 and not parent.text and not parent[0].tail and not parent[0].get('id', None):
            # We have a block whose contents are entirely enclosed in a <span>
            span = parent[0]
            span_class = span.get('class', None)
            span_css = class_map.get(span_class, {})
            span_dir = span.get('dir')
            if liftable(span_css) and (not span_dir or span_dir == parent.get('dir')):
                pclass = parent.get('class', None)
                if span_class:
                    pclass = (pclass + ' ' + span_class) if pclass else span_class
                    parent.set('class', pclass)
                parent.text = span.text
                parent.remove(span)
                if span.get('lang'):
                    parent.set('lang', span.get('lang'))
                if span.get('dir'):
                    parent.set('dir', span.get('dir'))
                for child in span:
                    parent.append(child)

    # Make spans whose only styling is bold or italic into <b> and <i> tags
    for span in root.xpath('//span[@class and not(@style)]'):
        css = class_map.get(span.get('class', None), {})
        if len(css) == 1:
            if css == {'font-style':'italic'}:
                span.tag = 'i'
                del span.attrib['class']
            elif css == {'font-weight':'bold'}:
                span.tag = 'b'
                del span.attrib['class']

    # Get rid of <span>s that have no styling
    for span in root.xpath('//span[not(@class or @id or @style or @lang or @dir)]'):
        lift(span)

    # Convert <p><br style="page-break-after:always"> </p> style page breaks
    # into something the viewer will render as a page break
    for p in root.xpath('//p[br[@style="page-break-after:always"]]'):
        if len(p) == 1 and (not p[0].tail or not p[0].tail.strip()):
            p.remove(p[0])
            prefix = p.get('style', '')
            if prefix:
                prefix += '; '
            p.set('style', prefix + 'page-break-after:always')
            p.text = NBSP if not p.text else p.text

    if detect_cover:
        # Check if the first image in the document is possibly a cover
        img = root.xpath('//img[@src][1]')
        if img:
            img = img[0]
            path = os.path.join(dest_dir, img.get('src'))
            if os.path.exists(path) and before_count(root, img, limit=10) < 5:
                from calibre.utils.imghdr import identify
                try:
                    with lopen(path, 'rb') as imf:
                        _fmt, width, height = identify(imf)
                except Exception:
                    # An unreadable image simply is not a cover. Only
                    # Exception is caught, so KeyboardInterrupt and
                    # SystemExit still propagate.
                    width, height = 0, 0
                try:
                    is_cover = 0.8 <= height/width <= 1.8 and height*width >= 160000
                except ZeroDivisionError:
                    is_cover = False
                if is_cover:
                    log.debug('Detected an image that looks like a cover')
                    img.getparent().remove(img)
                    return path

View File

@@ -0,0 +1,268 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import os, sys, shutil
from lxml import etree
from calibre import walk, guess_type
from calibre.ebooks.metadata import string_to_authors, authors_to_sort_string
from calibre.ebooks.metadata.book.base import Metadata
from calibre.ebooks.docx import InvalidDOCX
from calibre.ebooks.docx.names import DOCXNamespace
from calibre.ptempfile import PersistentTemporaryDirectory
from calibre.utils.localization import canonicalize_lang
from calibre.utils.logging import default_log
from calibre.utils.zipfile import ZipFile
from calibre.utils.xml_parse import safe_xml_fromstring
def fromstring(raw, parser=None):
    '''
    Parse raw with safe_xml_fromstring() and return the result. The parser
    argument is accepted but not used.
    '''
    root = safe_xml_fromstring(raw)
    return root
# Read metadata {{{
def read_doc_props(raw, mi, XPath):
    '''
    Parse the document properties XML in raw and copy what is found to the
    metadata object mi: title, tags (from subjects and keywords), authors
    (plus author_sort), comments (from the description) and languages.
    Fields for which nothing is found are left untouched.
    '''
    root = fromstring(raw)

    def has_text(elem):
        return bool(elem.text and elem.text.strip())

    titles = XPath('//dc:title')(root)
    if titles and has_text(titles[0]):
        mi.title = titles[0].text.strip()

    tags = []
    for subject in XPath('//dc:subject')(root):
        if has_text(subject):
            tags.append(subject.text.strip().replace(',', '_'))
    for keywords in XPath('//cp:keywords')(root):
        if has_text(keywords):
            # Keywords are separated by whitespace and/or commas
            for chunk in keywords.text.split():
                tags.extend(part.strip() for part in chunk.split(',') if part.strip())
    if tags:
        mi.tags = tags

    found_authors = []
    for author in XPath('//dc:creator')(root):
        if has_text(author):
            found_authors.extend(string_to_authors(author.text))
    if found_authors:
        mi.authors = found_authors
        mi.author_sort = authors_to_sort_string(found_authors)

    desc = XPath('//dc:description')(root)
    if desc:
        comments = etree.tostring(desc[0], method='text', encoding='unicode')
        # Word 2007 mangles newlines in the summary
        mi.comments = comments.replace('_x000d_', '').strip()

    langs = []
    for lang in XPath('//dc:language')(root):
        if has_text(lang):
            canonical = canonicalize_lang(lang.text)
            if canonical:
                langs.append(canonical)
    if langs:
        mi.languages = langs
def read_app_props(raw, mi):
    '''
    Parse the application properties XML in raw and use the text of the
    first Company element, if it is not blank, as mi.publisher.
    '''
    root = fromstring(raw)
    companies = root.xpath('//*[local-name()="Company"]')
    if not companies:
        return
    text = companies[0].text
    if text and text.strip():
        mi.publisher = text.strip()
def read_default_style_language(raw, mi, XPath):
    '''
    Parse the styles XML in raw and set mi.languages to the first default
    run language that canonicalize_lang() recognizes. mi is left untouched
    if there is none.
    '''
    root = fromstring(raw)
    values = XPath('/w:styles/w:docDefaults/w:rPrDefault/w:rPr/w:lang/@w:val')(root)
    for value in values:
        canonical = canonicalize_lang(value)
        if not canonical:
            continue
        mi.languages = [canonical]
        break
# }}}
class DOCX(object):
def __init__(self, path_or_stream, log=None, extract=True):
    '''
    Open a DOCX container.

    path_or_stream is either an object with a read() method or a path
    that is opened for binary reading. With extract=True the contents
    are unpacked into a temporary directory, otherwise they are read
    from the ZIP file on demand. log defaults to default_log.
    '''
    self.docx_is_transitional = True
    opened_here = not hasattr(path_or_stream, 'read')
    stream = open(path_or_stream, 'rb') if opened_here else path_or_stream
    self.name = getattr(stream, 'name', None) or '<stream>'
    self.log = log or default_log

    if extract:
        try:
            self.extract(stream)
        finally:
            # Once everything has been unpacked the stream is no longer
            # needed. Only close it if it was opened here; a stream
            # passed in by the caller is left alone.
            if opened_here:
                stream.close()
    else:
        # The ZIP file keeps reading from the stream, so it stays open
        self.init_zipfile(stream)

    self.read_content_types()
    self.read_package_relationships()
    self.namespace = DOCXNamespace(self.docx_is_transitional)
def init_zipfile(self, stream):
    '''Open stream as a ZIP file (self.zipf) and record the names of its members (self.names).'''
    archive = ZipFile(stream)
    self.zipf = archive
    self.names = frozenset(archive.namelist())
def extract(self, stream):
    '''
    Unpack the ZIP file in stream into a new temporary directory
    (self.tdir) and build self.names, a mapping from the name of each
    file relative to that directory (with / as separator) to its path.
    '''
    self.tdir = PersistentTemporaryDirectory('docx_container')
    try:
        zf = ZipFile(stream)
        zf.extractall(self.tdir)
    except Exception:
        # Only Exception is caught, so that KeyboardInterrupt and
        # SystemExit are not swallowed and followed by a second attempt.
        self.log.exception('DOCX appears to be invalid ZIP file, trying a'
                ' more forgiving ZIP parser')
        from calibre.utils.localunzip import extractall
        stream.seek(0)
        extractall(stream, self.tdir)

    self.names = {}
    for f in walk(self.tdir):
        name = os.path.relpath(f, self.tdir).replace(os.sep, '/')
        self.names[name] = f
def exists(self, name):
return name in self.names
def read(self, name):
if hasattr(self, 'zipf'):
return self.zipf.open(name).read()
path = self.names[name]
with open(path, 'rb') as f:
return f.read()
def read_content_types(self):
try:
raw = self.read('[Content_Types].xml')
except KeyError:
raise InvalidDOCX('The file %s docx file has no [Content_Types].xml' % self.name)
root = fromstring(raw)
self.content_types = {}
self.default_content_types = {}
for item in root.xpath('//*[local-name()="Types"]/*[local-name()="Default" and @Extension and @ContentType]'):
self.default_content_types[item.get('Extension').lower()] = item.get('ContentType')
for item in root.xpath('//*[local-name()="Types"]/*[local-name()="Override" and @PartName and @ContentType]'):
name = item.get('PartName').lstrip('/')
self.content_types[name] = item.get('ContentType')
def content_type(self, name):
if name in self.content_types:
return self.content_types[name]
ext = name.rpartition('.')[-1].lower()
if ext in self.default_content_types:
return self.default_content_types[ext]
return guess_type(name)[0]
def read_package_relationships(self):
try:
raw = self.read('_rels/.rels')
except KeyError:
raise InvalidDOCX('The file %s docx file has no _rels/.rels' % self.name)
root = fromstring(raw)
self.relationships = {}
self.relationships_rmap = {}
for item in root.xpath('//*[local-name()="Relationships"]/*[local-name()="Relationship" and @Type and @Target]'):
target = item.get('Target').lstrip('/')
typ = item.get('Type')
if target == 'word/document.xml':
self.docx_is_transitional = typ != 'http://purl.oclc.org/ooxml/officeDocument/relationships/officeDocument'
self.relationships[typ] = target
self.relationships_rmap[target] = typ
@property
def document_name(self):
name = self.relationships.get(self.namespace.names['DOCUMENT'], None)
if name is None:
names = tuple(n for n in self.names if n == 'document.xml' or n.endswith('/document.xml'))
if not names:
raise InvalidDOCX('The file %s docx file has no main document' % self.name)
name = names[0]
return name
@property
def document(self):
return fromstring(self.read(self.document_name))
@property
def document_relationships(self):
return self.get_relationships(self.document_name)
def get_relationships(self, name):
base = '/'.join(name.split('/')[:-1])
by_id, by_type = {}, {}
parts = name.split('/')
name = '/'.join(parts[:-1] + ['_rels', parts[-1] + '.rels'])
try:
raw = self.read(name)
except KeyError:
pass
else:
root = fromstring(raw)
for item in root.xpath('//*[local-name()="Relationships"]/*[local-name()="Relationship" and @Type and @Target]'):
target = item.get('Target')
if item.get('TargetMode', None) != 'External' and not target.startswith('#'):
target = '/'.join((base, target.lstrip('/')))
typ = item.get('Type')
Id = item.get('Id')
by_id[Id] = by_type[typ] = target
return by_id, by_type
def get_document_properties_names(self):
name = self.relationships.get(self.namespace.names['DOCPROPS'], None)
if name is None:
names = tuple(n for n in self.names if n.lower() == 'docprops/core.xml')
if names:
name = names[0]
yield name
name = self.relationships.get(self.namespace.names['APPPROPS'], None)
if name is None:
names = tuple(n for n in self.names if n.lower() == 'docprops/app.xml')
if names:
name = names[0]
yield name
@property
def metadata(self):
mi = Metadata(_('Unknown'))
dp_name, ap_name = self.get_document_properties_names()
if dp_name:
try:
raw = self.read(dp_name)
except KeyError:
pass
else:
read_doc_props(raw, mi, self.namespace.XPath)
if mi.is_null('language'):
try:
raw = self.read('word/styles.xml')
except KeyError:
pass
else:
read_default_style_language(raw, mi, self.namespace.XPath)
ap_name = self.relationships.get(self.namespace.names['APPPROPS'], None)
if ap_name:
try:
raw = self.read(ap_name)
except KeyError:
pass
else:
read_app_props(raw, mi)
return mi
def close(self):
if hasattr(self, 'zipf'):
self.zipf.close()
else:
try:
shutil.rmtree(self.tdir)
except EnvironmentError:
pass
# Command line use: print the metadata of the DOCX file named by the last
# argument, reading it straight from the archive without extracting it.
if __name__ == '__main__':
    d = DOCX(sys.argv[-1], extract=False)
    print(d.metadata)

View File

@@ -0,0 +1,276 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import re
from calibre.ebooks.docx.index import process_index, polish_index_markup
from polyglot.builtins import iteritems, native_string_type
class Field(object):
    """One field of the document, assembled piece by piece.

    ``start`` and ``end`` hold the elements delimiting the field,
    ``contents`` the elements collected for it, ``name`` the first word of
    the instruction text and ``instructions`` (available after
    :meth:`finalize`) the remainder of that text.
    """

    def __init__(self, start):
        self.start, self.end = start, None
        self.contents, self.buf = [], []
        self.instructions = self.name = None

    def add_instr(self, elem):
        """Add the text of an instruction element."""
        self.add_raw(elem.text)

    def add_raw(self, raw):
        """Add a fragment of instruction text; empty fragments are ignored."""
        if raw:
            if self.name is None:
                # Only leading whitespace is removed. Partial index entries
                # can end with a significant space that has to survive until
                # the next fragment is appended. No case of a significant
                # leading space is known.
                field_name, _sep, rest = raw.lstrip().partition(' ')
                self.name, raw = field_name, rest
            self.buf.append(raw)

    def finalize(self):
        """Join the collected fragments into ``instructions`` and drop the buffer."""
        self.instructions = ''.join(self.buf)
        del self.buf
# Token kinds produced by the scanner
WORD = 0
FLAG = 1


def _flag_token(_scanner, text):
    # A flag of the form \x
    return text, FLAG


def _quoted_word_token(_scanner, text):
    # Quoted word, the surrounding quotes are dropped
    return text[1:-1], WORD


def _bare_word_token(_scanner, text):
    # A non-quoted word, must not start with a backslash or a space or a quote
    return text, WORD


scanner = re.Scanner(
    [
        (r'\\\S{1}', _flag_token),
        (r'"[^"]*"', _quoted_word_token),
        (r'[^\s\\"]\S*', _bare_word_token),
        (r'\s+', None),  # Whitespace between tokens is discarded
    ],
    flags=re.DOTALL,
)

# Unique marker for flags that are missing from a field map
null = object()
def parser(name, field_map, default_field_name=None):
    """Build a function that parses the instruction text of one field type.

    :param name: Field type name, used only to name the returned function.
    :param field_map: Whitespace separated ``letter:option`` pairs mapping a
        flag letter to the key used in the result.
    :param default_field_name: Key under which a word that does not follow a
        flag is stored.
    :return: ``parse(raw, log=None)`` returning a dict of the options found.
        A flag without a value maps to ``None``; unknown flags and their
        values are dropped.
    """
    options_by_letter = {}
    for spec in field_map.split():
        letter, option = spec.split(':')
        options_by_letter[letter] = option

    def parse(raw, log=None):
        found = {}
        pending = None
        # Hide escaped backslashes and quotes from the scanner, they are
        # restored in every token below
        protected = raw.replace('\\\\', '\x01').replace('\\"', '\x02')
        tokens = scanner.scan(protected)[0]
        for text, kind in tokens:
            text = text.replace('\x01', '\\').replace('\x02', '"')
            if kind is FLAG:
                pending = options_by_letter.get(text[1], null)
                if pending is not None:
                    found[pending] = None
            elif kind is WORD:
                key = default_field_name if pending is None else pending
                found[key] = text
                pending = None
        # Everything collected for unknown flags is thrown away
        found.pop(null, None)
        return found

    parse.__name__ = native_string_type('parse_' + name)

    return parse
# Instruction parsers, one per field type. Each entry of the map string is
# "flag-letter:option-name". The optional last argument is the key that
# receives a word not preceded by a flag; without it that key is None.
parse_hyperlink = parser('hyperlink',
    'l:anchor m:image-map n:target o:title t:target', 'url')

parse_xe = parser('xe',
    'b:bold i:italic f:entry-type r:page-range-bookmark t:page-number-text y:yomi', 'text')

parse_index = parser('index',
    'b:bookmark c:columns-per-page d:sequence-separator e:first-page-number-separator'
    ' f:entry-type g:page-range-separator h:heading k:crossref-separator'
    ' l:page-number-separator p:letter-range s:sequence-name r:run-together y:yomi z:langcode')

parse_ref = parser('ref',
    'd:separator f:footnote h:hyperlink n:number p:position r:relative-number t:suppress w:number-full-context')

parse_noteref = parser('noteref',
    'f:footnote h:hyperlink p:position')
class Fields(object):
    """Collects the fields of a document and interprets the supported ones.

    Calling the instance scans a document and fills one list per supported
    field type (``hyperlink_fields``, ``xe_fields``, ``index_fields``,
    ``ref_fields``, ``noteref_fields``).
    """

    def __init__(self, namespace):
        self.namespace = namespace
        self.fields = []
        self.index_bookmark_counter = 0
        self.index_bookmark_prefix = 'index-'

    def __call__(self, doc, log):
        """Scan ``doc`` for fields and dispatch each to its handler.

        :param doc: Root element of the document.
        :param log: Logger used for warnings about fields that are ignored.
        """
        ns = self.namespace
        known_ids = frozenset(ns.XPath('//*/@w:id')(doc))
        attempt = 0
        while self.index_bookmark_prefix in known_ids:
            attempt += 1
            self.index_bookmark_prefix = self.index_bookmark_prefix.replace('-', '%d-' % attempt)

        open_fields = []
        find_nodes = ns.XPath(
            '//*[name()="w:p" or name()="w:r" or'
            ' name()="w:instrText" or'
            ' (name()="w:fldChar" and (@w:fldCharType="begin" or @w:fldCharType="end") or'
            ' name()="w:fldSimple")]')
        for elem in find_nodes(doc):
            tag = elem.tag
            if tag.endswith('}fldChar'):
                if ns.get(elem, 'w:fldCharType') == 'begin':
                    started = Field(elem)
                    open_fields.append(started)
                    self.fields.append(started)
                elif open_fields:
                    # An end marker without a matching begin is ignored
                    open_fields.pop().end = elem
            elif tag.endswith('}instrText'):
                if open_fields:
                    open_fields[-1].add_instr(elem)
            elif tag.endswith('}fldSimple'):
                simple = Field(elem)
                instr = ns.get(elem, 'w:instr')
                if instr:
                    simple.add_raw(instr)
                    self.fields.append(simple)
                    for run in ns.XPath('descendant::w:r')(elem):
                        simple.contents.append(run)
            elif open_fields:
                open_fields[-1].contents.append(elem)

        field_types = ('hyperlink', 'xe', 'index', 'ref', 'noteref')
        # Field names are looked up in both their upper and lower case forms
        parsers, field_parsers = {}, {}
        for ftype in field_types:
            handler = getattr(self, 'parse_' + ftype)
            instruction_parser = globals()['parse_%s' % ftype]
            for key in (ftype.upper(), ftype):
                parsers[key] = handler
                field_parsers[key] = instruction_parser
        for ftype in field_types:
            setattr(self, '%s_fields' % ftype, [])

        # The TOC and PAGEREF fields are handled separately
        unknown_fields = {'TOC', 'toc', 'PAGEREF', 'pageref'}

        for field in self.fields:
            field.finalize()
            if not field.instructions:
                continue
            handler = parsers.get(field.name, None)
            if handler is not None:
                handler(field, field_parsers[field.name], log)
            elif field.name not in unknown_fields:
                log.warn('Encountered unknown field: %s, ignoring it.' % field.name)
                # Warn only once per field name
                unknown_fields.add(field.name)

    def get_runs(self, field):
        """Group the runs of ``field`` by the paragraph they were found in.

        Returns a list of non-empty lists of run elements. Only runs that
        share a paragraph are grouped, since a link wraps the spans of a
        single paragraph.
        """
        groups, current = [], []
        for node in field.contents:
            tag = node.tag
            if tag.endswith('}p'):
                if current:
                    groups.append(current)
                current = []
            elif tag.endswith('}r'):
                current.append(node)
        if current:
            groups.append(current)
        return groups

    def parse_hyperlink(self, field, parse_func, log):
        """Record one hyperlink entry per run group of a hyperlink field."""
        hl = parse_func(field.instructions, log)
        if not hl:
            return
        if hl.get('target', '') is None:
            # A target flag without a value
            hl['target'] = '_blank'
        for runs in self.get_runs(field):
            self.hyperlink_fields.append((hl, runs))

    def parse_ref(self, field, parse_func, log):
        """Turn a reference field carrying the hyperlink flag into links.

        Any other reference field is reported through ``log`` and ignored.
        """
        ref = parse_func(field.instructions, log)
        dest = ref.get(None, None)
        if dest is None or 'hyperlink' not in ref:
            log.warn('Unsupported reference field (%s), ignoring: %r' % (field.name, ref))
            return
        for runs in self.get_runs(field):
            self.hyperlink_fields.append(({'anchor':dest}, runs))

    parse_noteref = parse_ref

    def parse_xe(self, field, parse_func, log):
        """Record an index entry field and bracket it with a bookmark."""
        if None in (field.start, field.end):
            return
        xe = parse_func(field.instructions, log)
        if not xe:
            return

        def w(x):
            return self.namespace.expand('w:' + x)

        # A synthetic bookmark is inserted around this index item so that it
        # can be linked to later
        self.index_bookmark_counter += 1
        bmark = xe['anchor'] = '%s%d' % (self.index_bookmark_prefix, self.index_bookmark_counter)

        start_parent = field.start.getparent()
        opening = start_parent.makeelement(w('bookmarkStart'))
        opening.set(w('id'), bmark)
        opening.set(w('name'), bmark)
        start_parent.insert(start_parent.index(field.start), opening)

        end_parent = field.end.getparent()
        closing = end_parent.makeelement(w('bookmarkEnd'))
        closing.set(w('id'), bmark)
        end_parent.insert(end_parent.index(field.end) + 1, closing)

        xe['start_elem'] = field.start
        self.xe_fields.append(xe)

    def parse_index(self, field, parse_func, log):
        """Process an index field that has content, recording its links and blocks."""
        if not field.contents:
            return
        idx = parse_func(field.instructions, log)
        ns = self.namespace
        hyperlinks, blocks = process_index(field, idx, self.xe_fields, log, ns.XPath, ns.expand)
        if not blocks:
            return
        for anchor, run in hyperlinks:
            self.hyperlink_fields.append(({'anchor':anchor}, [run]))
        self.index_fields.append((idx, blocks))

    def polish_markup(self, object_map):
        """Polish the markup generated for every recorded index.

        :param object_map: Mapping whose values are the recorded index
            blocks; each block is replaced by its key before polishing.
        """
        if not self.index_fields:
            return
        reverse_map = {v:k for k, v in iteritems(object_map)}
        for idx, blocks in self.index_fields:
            polish_index_markup(idx, [reverse_map[b] for b in blocks])
def test_parse_fields(return_tests=False):
    """Run the self tests of the field instruction parsers.

    :param return_tests: When true the test suite is returned instead of
        being run.
    """
    import unittest

    class TestParseFields(unittest.TestCase):

        def check_all(self, parse, cases):
            # Every case is (instruction text, expected parse result)
            for raw, expected in cases:
                self.assertEqual(parse(raw, None), expected)

        def test_hyperlink(self):
            self.check_all(parse_hyperlink, (
                (r'\l anchor1', {'anchor':'anchor1'}),
                (r'www.calibre-ebook.com', {'url':'www.calibre-ebook.com'}),
                (r'www.calibre-ebook.com \t target \o tt', {'url':'www.calibre-ebook.com', 'target':'target', 'title': 'tt'}),
                (r'"c:\\Some Folder"', {'url': 'c:\\Some Folder'}),
                (r'xxxx \y yyyy', {'url': 'xxxx'}),
            ))

        def test_xe(self):
            self.check_all(parse_xe, (
                (r'"some name"', {'text':'some name'}),
                (r'name \b \i', {'text':'name', 'bold':None, 'italic':None}),
                (r'xxx \y a', {'text':'xxx', 'yomi':'a'}),
            ))

        def test_index(self):
            self.check_all(parse_index, (
                (r'', {}),
                (r'\b \c 1', {'bookmark':None, 'columns-per-page': '1'}),
            ))

    suite = unittest.TestLoader().loadTestsFromTestCase(TestParseFields)
    if return_tests:
        return suite
    unittest.TextTestRunner(verbosity=4).run(suite)
# Running this module directly executes the parser self tests
if __name__ == '__main__':
    test_parse_fields()

View File

@@ -0,0 +1,197 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import os, re
from collections import namedtuple
from calibre.ebooks.docx.block_styles import binary_property, inherit
from calibre.utils.filenames import ascii_filename
from calibre.utils.fonts.scanner import font_scanner, NoFonts
from calibre.utils.fonts.utils import panose_to_css_generic_family, is_truetype_font
from calibre.utils.icu import ord_string
from polyglot.builtins import codepoint_to_chr, iteritems, range
# Description of one embedded font variant: `name` is the relationship target
# of the font data, `key` the value of the w:fontKey attribute and `subsetted`
# whether w:subsetted is switched on.
Embed = namedtuple('Embed', 'name key subsetted')
def has_system_fonts(name):
    """Return True if the font scanner knows at least one font of family ``name``."""
    try:
        matches = font_scanner.fonts_for_family(name)
    except NoFonts:
        return False
    return bool(matches)
def get_variant(bold=False, italic=False):
    """Return the variant name for a bold/italic combination.

    One of ``'Regular'``, ``'Italic'``, ``'Bold'`` or ``'BoldItalic'``.
    """
    variants = {
        (False, False): 'Regular',
        (False, True): 'Italic',
        (True, False): 'Bold',
        (True, True): 'BoldItalic',
    }
    return variants[(bold, italic)]
def find_fonts_matching(fonts, style='normal', stretch='normal'):
    """Yield, in order, the fonts whose style and stretch both match.

    :param fonts: Iterable of mappings with ``'font-style'`` and
        ``'font-stretch'`` entries.
    """
    for candidate in fonts:
        if candidate['font-style'] != style:
            continue
        if candidate['font-stretch'] != stretch:
            continue
        yield candidate
def weight_key(font):
    """Sort key: distance of the font weight from the regular weight (400).

    Numeric weights are used directly, ``'normal'`` and ``'bold'`` count as
    400 and 700, and any other value sorts last.
    """
    weight = font['font-weight']
    try:
        numeric = int(weight)
    except Exception:
        named_weights = {'normal': 400, 'bold': 700}
        numeric = named_weights.get(weight, 1000000)
    return abs(numeric - 400)
def get_best_font(fonts, style, stretch):
    """Return the matching font whose weight is closest to regular.

    Among equally close candidates the first one wins. ``None`` is returned
    when nothing matches or when the candidates cannot be examined.
    """
    try:
        candidates = find_fonts_matching(fonts, style, stretch)
        return min(candidates, key=weight_key)
    except Exception:
        return None
class Family(object):
    """Description of one font family declared in the document's font table.

    Attributes set by the constructor: ``name``, ``family_name`` (``name``,
    or the first alternative name with fonts available when ``name`` has
    none), ``alt_names``, ``embedded`` (variant name to ``Embed``),
    ``generic_family``, ``is_ttf``, ``panose1``, ``panose_name`` and
    ``css_generic_family``.
    """

    def __init__(self, elem, embed_relationships, XPath, get):
        """
        :param elem: The font element.
        :param embed_relationships: Mapping of relationship id to the target
            of an embedded font.
        :param XPath: Factory turning an XPath expression into a callable.
        :param get: Attribute getter, ``get(element, name[, default])``.
        """
        declared = get(elem, 'w:name')
        self.name = self.family_name = declared
        self.alt_names = tuple(get(alt, 'w:val') for alt in XPath('./w:altName')(elem))
        if self.alt_names and not has_system_fonts(self.name):
            for candidate in self.alt_names:
                if has_system_fonts(candidate):
                    self.family_name = candidate
                    break

        self.embedded = {}
        for variant in ('Regular', 'Bold', 'Italic', 'BoldItalic'):
            for node in XPath('./w:embed%s[@r:id]' % variant)(elem):
                rid = get(node, 'r:id')
                key = get(node, 'w:fontKey')
                subsetted = get(node, 'w:subsetted') in {'1', 'true', 'on'}
                if rid in embed_relationships:
                    self.embedded[variant] = Embed(embed_relationships[rid], key, subsetted)

        self.generic_family = 'auto'
        for node in XPath('./w:family[@w:val]')(elem):
            self.generic_family = get(node, 'w:val', 'auto')

        not_truetype = binary_property(elem, 'notTrueType', XPath, get)
        self.is_ttf = not_truetype is inherit or not not_truetype

        self.panose1 = self.panose_name = None
        for node in XPath('./w:panose1[@w:val]')(elem):
            try:
                digits = get(node, 'w:val')
                # The value is a string of two digit hexadecimal numbers
                numbers = tuple(int(digits[i:i+2], 16) for i in range(0, len(digits), 2))
            except (TypeError, ValueError, IndexError):
                continue
            self.panose1 = numbers
            self.panose_name = panose_to_css_generic_family(numbers)

        css_names = {'roman':'serif', 'swiss':'sans-serif', 'modern':'monospace',
                     'decorative':'fantasy', 'script':'cursive'}
        from_declaration = css_names.get(self.generic_family, None)
        self.css_generic_family = from_declaration or self.panose_name or 'serif'
SYMBOL_MAPS = { # {{{
'Wingdings': (' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '🖉', '', '', '👓', '🕭', '🕮', '🕯', '🕿', '', '🖂', '🖃', '📪', '📫', '📬', '📭', '🗀', '🗁', '🗎', '🗏', '🗐', '🗄', '', '🖮', '🖰', '🖲', '🖳', '🖴', '🖫', '🖬', '', '', '🖎', '', '🖏', '👍', '👎', '', '', '', '🖗', '🖐', '', '😐', '', '💣', '🕱', '🏳', '🏱', '', '', '🌢', '', '🕆', '', '🕈', '', '', '', '', '🕉', '', '', '', '', '', '', '', '', '', '', '', '', '', '🙰', '🙵', '', '🔾', '', '🞏', '🞐', '', '', '🞟', '', '', '', '🞙', '', '', '', '🏵', '🏶', '🙶', '🙷', ' ', '🄋', '', '', '', '', '', '', '', '', '', '', '🄌', '', '', '', '', '', '', '', '', '', '', '🙢', '🙠', '🙡', '🙣', '🙦', '🙤', '🙥', '🙧', '', '', '', '', '🞆', '🞈', '🞊', '🞋', '🔿', '', '🞎', '🟀', '🟁', '', '🟋', '🟏', '🟓', '🟑', '', '', '', '', '', '', '', '🕐', '🕑', '🕒', '🕓', '🕔', '🕕', '🕖', '🕗', '🕘', '🕙', '🕚', '🕛', '', '', '', '', '', '', '', '', '🙪', '🙫', '🙕', '🙔', '🙗', '🙖', '🙐', '🙑', '🙒', '🙓', '', '', '', '', '', '', '', '', '', '', '🡨', '🡪', '🡩', '🡫', '🡬', '🡭', '🡯', '🡮', '🡸', '🡺', '🡹', '🡻', '🡼', '🡽', '🡿', '🡾', '', '', '', '', '', '', '', '', '', '', '🢬', '🢭', '🗶', '', '🗷', '🗹', ' '), # noqa
'Wingdings 2': (' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '🖊', '🖋', '🖌', '🖍', '', '', '🕾', '🕽', '🗅', '🗆', '🗇', '🗈', '🗉', '🗊', '🗋', '🗌', '🗍', '📋', '🗑', '🗔', '🖵', '🖶', '🖷', '🖸', '🖭', '🖯', '🖱', '🖒', '🖓', '🖘', '🖙', '🖚', '🖛', '👈', '👉', '🖜', '🖝', '🖞', '🖟', '🖠', '🖡', '👆', '👇', '🖢', '🖣', '🖑', '🗴', '🗸', '🗵', '', '', '', '', '⮿', '🛇', '', '🙱', '🙴', '🙲', '🙳', '', '🙹', '🙺', '🙻', '🙦', '🙤', '🙥', '🙧', '🙚', '🙘', '🙙', '🙛', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', ' ', '', '🌕', '', '', '⸿', '', '🕇', '🕜', '🕝', '🕞', '🕟', '🕠', '🕡', '🕢', '🕣', '🕤', '🕥', '🕦', '🕧', '🙨', '🙩', '', '🞄', '', '', '', '🞅', '🞇', '🞉', '', '⦿', '🞌', '🞍', '', '', '', '🞑', '🞒', '🞓', '🞔', '', '🞕', '🞖', '🞗', '🞘', '', '', '', '🞚', '', '🞛', '🞜', '🞝', '🞞', '', '', '', '🞠', '', '', '', '', '', '', '', '', '', '', '', '', '🞡', '🞢', '🞣', '🞤', '🞥', '🞦', '🞧', '🞨', '🞩', '🞪', '🞫', '🞬', '🞭', '🞮', '🞯', '🞰', '🞱', '🞲', '🞳', '🞴', '🞵', '🞶', '🞷', '🞸', '🞹', '🞺', '🞻', '🞼', '🞽', '🞾', '🞿', '🟀', '🟂', '🟄', '🟆', '🟉', '🟊', '', '🟌', '🟎', '🟐', '🟒', '', '🟃', '🟇', '', '🟍', '🟔', '', '', '', '', ' ', ' ', ' ', ' ', ' ', ' ',), # noqa
'Wingdings 3': (' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '⭿', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '🢠', '🢡', '🢢', '🢣', '🢤', '🢥', '🢦', '🢧', '🢨', '🢩', '🢪', '🢫', '🡐', '🡒', '🡑', '🡓', '🡔', '🡕', '🡗', '🡖', '🡘', '🡙', '', '', '', '', '', '', '', '', '', '', '', '', '🞀', '🞂', '🞁', ' ', '🞃', '', '', '', '', '', '', '', '', '🠐', '🠒', '🠑', '🠓', '🠔', '🠖', '🠕', '🠗', '🠘', '🠚', '🠙', '🠛', '🠜', '🠞', '🠝', '🠟', '🠀', '🠂', '🠁', '🠃', '🠄', '🠆', '🠅', '🠇', '🠈', '🠊', '🠉', '🠋', '🠠', '🠢', '🠤', '🠦', '🠨', '🠪', '🠬', '🢜', '🢝', '🢞', '🢟', '🠮', '🠰', '🠲', '🠴', '🠶', '🠸', '🠺', '🠹', '🠻', '🢘', '🢚', '🢙', '🢛', '🠼', '🠾', '🠽', '🠿', '🡀', '🡂', '🡁', '🡃', '🡄', '🡆', '🡅', '🡇', '', '', '', '', '', '', '', '', '🡠', '🡢', '🡡', '🡣', '🡤', '🡥', '🡧', '🡦', '🡰', '🡲', '🡱', '🡳', '🡴', '🡵', '🡷', '🡶', '🢀', '🢂', '🢁', '🢃', '🢄', '🢅', '🢇', '🢆', '🢐', '🢒', '🢑', '🢓', '🢔', '🢕', '🢗', '🢖', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',), # noqa
'Webdings': (' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '🕷', '🕸', '🕲', '🕶', '🏆', '🎖', '🖇', '🗨', '🗩', '🗰', '🗱', '🌶', '🎗', '🙾', '🙼', '🗕', '🗖', '🗗', '', '', '', '', '', '', '', '', '', '', '', '🗚', '🗳', '🛠', '🏗', '🏘', '🏙', '🏚', '🏜', '🏭', '🏛', '🏠', '🏖', '🏝', '🛣', '🔍', '🏔', '👁', '👂', '🏞', '🏕', '🛤', '🏟', '🛳', '🕬', '🕫', '🕨', '🔈', '🎔', '🎕', '🗬', '🙽', '🗭', '🗪', '🗫', '', '', '🚲', '', '🛡', '📦', '🛱', '', '🚑', '🛈', '🛩', '🛰', '🟈', '🕴', '', '🛥', '🚔', '🗘', '🗙', '', '🛲', '🚇', '🚍', '', '', '', '🚭', '🗮', '', '🗯', '🗲', ' ', '🚹', '🚺', '🛉', '🛊', '🚼', '👽', '🏋', '', '🏂', '🏌', '🏊', '🏄', '🏍', '🏎', '🚘', '🗠', '🛢', '📠', '🏷', '📣', '👪', '🗡', '🗢', '🗣', '', '🖄', '🖅', '🖃', '🖆', '🖹', '🖺', '🖻', '🕵', '🕰', '🖽', '🖾', '📋', '🗒', '🗓', '🕮', '📚', '🗞', '🗟', '🗃', '🗂', '🖼', '🎭', '🎜', '🎘', '🎙', '🎧', '💿', '🎞', '📷', '🎟', '🎬', '📽', '📹', '📾', '📻', '🎚', '🎛', '📺', '💻', '🖥', '🖦', '🖧', '🍹', '🎮', '🎮', '🕻', '🕼', '🖁', '🖀', '🖨', '🖩', '🖿', '🖪', '🗜', '🔒', '🔓', '🗝', '📥', '📤', '🕳', '🌣', '🌤', '🌥', '🌦', '', '🌨', '🌧', '🌩', '🌪', '🌬', '🌫', '🌜', '🌡', '🛋', '🛏', '🍽', '🍸', '🛎', '🛍', '', '', '🛆', '🖈', '🎓', '🗤', '🗥', '🗦', '🗧', '🛪', '🐿', '🐦', '🐟', '🐕', '🐈', '🙬', '🙮', '🙭', '🙯', '🗺', '🌍', '🌏', '🌎', '🕊',), # noqa
'Symbol': (' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '!', '', '#', '', '%', '&', '', '(', ')', '*', '+', ',', '', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '', 'Α', 'Β', 'Χ', 'Δ', 'Ε', 'Φ', 'Γ', 'Η', 'Ι', 'ϑ', 'Λ', 'Μ', 'Ν', 'Ξ', 'Ο', 'Π', 'Θ', 'Ρ', 'Σ', 'Τ', 'Υ', 'ς', 'Ω', 'Ξ', 'Ψ', 'Ζ', '[', '', ']', '', '_', '', 'α', 'β', 'χ', 'δ', 'ε', 'φ', 'γ', 'η', 'ι', 'ϕ', 'λ', 'μ', 'ν', 'ξ', 'ο', 'π', 'θ', 'ρ', 'σ', 'τ', 'υ', 'ϖ', 'ω', 'ξ', 'ψ', 'ζ', '{', '|', '}', '~', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '', 'ϒ', '', '', '', '', 'ƒ', '', '', '', '', '', '', '', '', '', '°', '±', '', '', '×', '', '', '', '÷', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '®', '©', '', '', '', '', '¬', '', '', '', '', '', '', '', '', '', '®', '©', '', '', '', '', '', '', '', '', '', '', '', '', ' ', '', '', '', '', '', '', '', '', '', '', '', '', '', '', ' ',), # noqa
} # }}}
# Lower-cased names of all fonts that have a symbol map, for case-insensitive lookup
SYMBOL_FONT_NAMES = frozenset(n.lower() for n in SYMBOL_MAPS)
def is_symbol_font(family):
    """Return True if ``family`` names a font with a symbol map, ignoring case.

    Values without a ``lower`` method (such as ``None``) give False.
    """
    try:
        lowered = family.lower()
    except AttributeError:
        return False
    return lowered in SYMBOL_FONT_NAMES
def do_map(m, points):
    """Yield the replacement text for every code point in ``points``.

    :param m: Sequence of replacement strings; entry ``i`` replaces the code
        point ``0xf000 + i``.
    :param points: Iterable of integer code points.

    Code points outside the range covered by ``m`` are yielded as the
    corresponding character.
    """
    base = 0xf000
    limit = base + len(m)
    for p in points:
        # The range is inclusive at the bottom so that entry 0 of the table
        # is reachable like every other entry
        if base <= p < limit:
            yield m[p - base]
        else:
            yield codepoint_to_chr(p)
def map_symbol_text(text, font):
    """Translate ``text`` written in the symbol font ``font`` using its map.

    :param text: The text; bytes are decoded as UTF-8 first.
    :param font: A key of ``SYMBOL_MAPS``.
    """
    table = SYMBOL_MAPS[font]
    if isinstance(text, bytes):
        text = text.decode('utf-8')
    code_points = ord_string(text)
    return ''.join(do_map(table, code_points))
class Fonts(object):
    """The fonts declared by a document and the ones actually used.

    ``fonts`` maps a declared font name to its ``Family``; ``used`` collects
    the ``(name, variant)`` pairs requested through :meth:`family_for`.
    """

    def __init__(self, namespace):
        self.namespace = namespace
        self.fonts = {}
        self.used = set()

    def __call__(self, root, embed_relationships, docx, dest_dir):
        """Read every named font declaration below ``root`` into ``self.fonts``."""
        for elem in self.namespace.XPath('//w:font[@w:name]')(root):
            self.fonts[self.namespace.get(elem, 'w:name')] = Family(elem, embed_relationships, self.namespace.XPath, self.namespace.get)

    def family_for(self, name, bold=False, italic=False):
        """Return the CSS font-family value for the font ``name``.

        Undeclared fonts give ``'serif'``. For declared fonts the request is
        recorded in ``self.used``; symbol fonts are returned by bare name,
        everything else as a quoted name followed by a generic family.
        """
        f = self.fonts.get(name, None)
        if f is None:
            return 'serif'
        variant = get_variant(bold, italic)
        self.used.add((name, variant))
        name = f.name if variant in f.embedded else f.family_name
        if is_symbol_font(name):
            return name
        return '"%s", %s' % (name.replace('"', ''), f.css_generic_family)

    def embed_fonts(self, dest_dir, docx):
        """Write the embedded fonts that were used and return their CSS.

        :param dest_dir: Directory in which a ``fonts`` sub-directory is
            created for the font files.
        :param docx: Container used to read the font data.
        :return: The ``@font-face`` rules, one per font written.
        """
        defs = []
        dest_dir = os.path.join(dest_dir, 'fonts')
        for name, variant in self.used:
            f = self.fonts[name]
            if variant in f.embedded:
                if not os.path.exists(dest_dir):
                    os.mkdir(dest_dir)
                fname = self.write(name, dest_dir, docx, variant)
                if fname is not None:
                    d = {'font-family':'"%s"' % name.replace('"', ''), 'src': 'url("fonts/%s")' % fname}
                    if 'Bold' in variant:
                        d['font-weight'] = 'bold'
                    if 'Italic' in variant:
                        d['font-style'] = 'italic'
                    d = ['%s: %s' % (k, v) for k, v in iteritems(d)]
                    d = ';\n\t'.join(d)
                    defs.append('@font-face {\n\t%s\n}\n' % d)
        return '\n'.join(defs)

    def write(self, name, dest_dir, docx, variant):
        """Write one embedded font to ``dest_dir``.

        When the font carries a key, its first 32 bytes are XORed with the
        reversed key bytes before being examined and written.

        :return: The file name used, or ``None`` if the data is not
            recognised as a TrueType/OpenType font.
        """
        f = self.fonts[name]
        ef = f.embedded[variant]
        raw = docx.read(ef.name)
        prefix = raw[:32]
        if ef.key:
            key = re.sub(r'[^A-Fa-f0-9]', '', ef.key)
            key = bytearray(reversed(tuple(int(key[i:i+2], 16) for i in range(0, len(key), 2))))
            # A key without any hexadecimal digit cannot be applied; the data
            # is then examined as is instead of failing on a division by zero
            if key:
                prefix = bytearray(prefix)
                prefix = bytes(bytearray(prefix[i]^key[i % len(key)] for i in range(len(prefix))))
        if not is_truetype_font(prefix):
            return None
        ext = 'otf' if prefix.startswith(b'OTTO') else 'ttf'
        fname = ascii_filename('%s - %s.%s' % (name, variant, ext))
        with open(os.path.join(dest_dir, fname), 'wb') as dest:
            dest.write(prefix)
            dest.write(raw[32:])

        return fname

View File

@@ -0,0 +1,65 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
from collections import OrderedDict
from polyglot.builtins import iteritems, unicode_type
class Note(object):
    """A single footnote or endnote.

    ``type`` is the value of the ``w:type`` attribute of the note element
    (``'normal'`` when absent), ``parent`` the note element itself and
    ``rels`` the relationships supplied for it.
    """

    def __init__(self, namespace, parent, rels):
        self.namespace = namespace
        self.parent = parent
        self.rels = rels
        self.type = namespace.get(parent, 'w:type', 'normal')

    def __iter__(self):
        """Iterate over the paragraph and table descendants of the note."""
        blocks = self.namespace.descendants(self.parent, 'w:p', 'w:tbl')
        for block in blocks:
            yield block
class Footnotes(object):
    """Registry of the footnotes and endnotes of a document.

    Calling the instance loads the notes; :meth:`get_ref` numbers a note the
    moment it is referenced. Iterating yields ``(anchor, counter, note)``
    for the referenced notes in the order they were referenced.
    """

    def __init__(self, namespace):
        self.namespace = namespace
        self.footnotes, self.endnotes = {}, {}
        self.counter = 0
        self.notes = OrderedDict()

    def __call__(self, footnotes, footnotes_rels, endnotes, endnotes_rels):
        """Load the notes found under the two (possibly ``None``) roots."""
        XPath, get = self.namespace.XPath, self.namespace.get
        sources = (
            ('./w:footnote[@w:id]', footnotes, footnotes_rels, self.footnotes),
            ('./w:endnote[@w:id]', endnotes, endnotes_rels, self.endnotes),
        )
        for expr, root, rels, store in sources:
            if root is None:
                continue
            for node in XPath(expr)(root):
                note_id = get(node, 'w:id')
                if note_id:
                    store[note_id] = Note(self.namespace, node, rels)

    def get_ref(self, ref):
        """Register the note referenced by ``ref``.

        :return: ``(anchor, counter)`` for a note of type ``'normal'``,
            ``(None, None)`` when the note is unknown or of another type.
        """
        note_id = self.namespace.get(ref, 'w:id')
        if ref.tag.endswith('}footnoteReference'):
            pool = self.footnotes
        else:
            pool = self.endnotes
        note = pool.get(note_id, None)
        if note is None or note.type != 'normal':
            return None, None
        self.counter += 1
        anchor = 'note_%d' % self.counter
        self.notes[anchor] = (unicode_type(self.counter), note)
        return anchor, unicode_type(self.counter)

    def __iter__(self):
        for anchor, entry in iteritems(self.notes):
            counter, note = entry
            yield anchor, counter, note

    @property
    def has_notes(self):
        """True once at least one note has been referenced."""
        return bool(self.notes)

View File

@@ -0,0 +1,343 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import os
from lxml.html.builder import IMG, HR
from calibre.constants import iswindows
from calibre.ebooks.docx.names import barename
from calibre.utils.filenames import ascii_filename
from calibre.utils.img import resize_to_fit, image_to_data
from calibre.utils.imghdr import what
from polyglot.builtins import iteritems, itervalues
class LinkedImageNotFound(ValueError):
    """Raised when the data of a referenced image cannot be located.

    The name that was looked for is available as ``fname``.
    """

    def __init__(self, fname):
        super(LinkedImageNotFound, self).__init__(fname)
        self.fname = fname
def image_filename(x):
    """Return ``x`` as an ASCII file name with spaces and ``#`` replaced by ``_``."""
    safe = ascii_filename(x)
    for unwanted in (' ', '#'):
        safe = safe.replace(unwanted, '_')
    return safe
def emu_to_pt(x):
    """Convert a numeric length in EMU to points (12700 EMU per point)."""
    emu_per_point = 12700
    return x / emu_per_point
def pt_to_emu(x):
    """Convert a length in points to a whole number of EMU, truncating toward zero."""
    emu_per_point = 12700
    return int(x * emu_per_point)
def get_image_properties(parent, XPath, get):
    """Read size, visibility and descriptive text of a drawing container.

    :param parent: Element holding the ``wp:extent`` and ``wp:docPr`` children.
    :param XPath: Factory turning an XPath expression into a callable.
    :param get: Unused.
    :return: ``(style, alt, title)`` where ``style`` is a dict of CSS
        properties (``width``/``height`` in points, ``display: none`` for
        hidden images) and ``alt``/``title`` are strings or ``None``.
    """
    size = {'width': None, 'height': None}
    for extent in XPath('./wp:extent')(parent):
        for css_name, attr in (('width', 'cx'), ('height', 'cy')):
            try:
                size[css_name] = emu_to_pt(int(extent.get(attr)))
            except (TypeError, ValueError):
                # A missing or malformed value keeps whatever was known before
                pass

    ans = {}
    for css_name in ('width', 'height'):
        if size[css_name] is not None:
            ans[css_name] = '%.3gpt' % size[css_name]

    alt = title = None
    for props in XPath('./wp:docPr')(parent):
        alt = props.get('descr') or alt
        title = props.get('title') or title
        if props.get('hidden', None) in {'true', 'on', '1'}:
            ans['display'] = 'none'

    return ans, alt, title
def get_image_margins(elem):
    """Return the text distances of ``elem`` as CSS padding declarations.

    The ``distL``, ``distT``, ``distR`` and ``distB`` attributes are read as
    integer EMU values and converted to points. Attributes that are missing
    or not integers are skipped.

    :return: Dict mapping ``padding-left`` etc. to a length in points.
    """
    ans = {}
    for w, css in iteritems({'L':'left', 'T':'top', 'R':'right', 'B':'bottom'}):
        val = elem.get('dist%s' % w, None)
        if val is not None:
            try:
                # Attribute values are strings. Without the conversion the
                # division always fails and no margin is ever produced.
                val = emu_to_pt(int(val))
            except (TypeError, ValueError):
                continue
            ans['padding-%s' % css] = '%.3gpt' % val
    return ans
def get_hpos(anchor, page_width, XPath, get, width_frac):
for ph in XPath('./wp:positionH')(anchor):
rp = ph.get('relativeFrom', None)
if rp == 'leftMargin':
return 0 + width_frac
if rp == 'rightMargin':
return 1 + width_frac
al = None
almap = {'left':0, 'center':0.5, 'right':1}
for align in XPath('./wp:align')(ph):
al = almap.get(align.text)
if al is not None:
if rp == 'page':
return al
return al + width_frac
for po in XPath('./wp:posOffset')(ph):
try:
pos = emu_to_pt(int(po.text))
except (TypeError, ValueError):
continue
return pos/page_width + width_frac
for sp in XPath('./wp:simplePos')(anchor):
try:
x = emu_to_pt(sp.get('x', None))
except (TypeError, ValueError):
continue
return x/page_width + width_frac
return 0
class Images(object):
def __init__(self, namespace, log):
    # Other methods of this class also read self.docx and self.dest_dir,
    # which are not assigned here.
    self.namespace = namespace
    # Relationship id -> image target name, replaced when the instance is called
    self.rid_map = {}
    # (target name, max width, max height) -> generated file name
    self.used = {}
    self.resized = {}
    self.names = set()
    # 'images/<name>' for every image file written
    self.all_images = set()
    # (img element, link description, rid_map) for images inside a hyperlink
    self.links = []
    self.log = log
def __call__(self, relationships_by_id):
    # Install the relationship id -> target name map used to resolve images
    self.rid_map = relationships_by_id
def read_image_data(self, fname, base=None):
    """Load the data of an image and choose a file name for it.

    :param fname: ``file://`` URL of a linked image or the name of a part in
        the container.
    :param base: Preferred file name; derived from ``fname`` when empty.
    :return: ``(raw, base)``, the image bytes and a file name whose
        extension reflects the detected image type.
    :raises LinkedImageNotFound: if the image data cannot be found.
    """
    if fname.startswith('file://'):
        src = fname[len('file://'):]
        if iswindows and src and src[0] == '/':
            src = src[1:]
        if not src or not os.path.exists(src):
            raise LinkedImageNotFound(src)
        with open(src, 'rb') as rawsrc:
            raw = rawsrc.read()
    else:
        try:
            raw = self.docx.read(fname)
        except KeyError:
            raise LinkedImageNotFound(fname)
    base = base or image_filename(fname.rpartition('/')[-1]) or 'image'
    stem, dot, suffix = base.rpartition('.')
    if not dot:
        # A name without an extension is all stem. It must neither be
        # discarded nor be mistaken for the extension.
        stem, suffix = base, ''
    ext = what(None, raw) or suffix or 'jpeg'
    if ext == 'emf':
        # For an example, see: https://bugs.launchpad.net/bugs/1224849
        self.log('Found an EMF image: %s, trying to extract embedded raster image' % fname)
        from calibre.utils.wmf.emf import emf_unwrap
        try:
            raw = emf_unwrap(raw)
        except Exception:
            self.log.exception('Failed to extract embedded raster image from EMF')
        else:
            ext = 'png'
    base = (stem or 'image') + '.' + ext
    return raw, base
def unique_name(self, base):
    """Return ``base``, or a numbered variant of it, that no image uses yet.

    Variants are formed by inserting ``-1``, ``-2``, ... before the final
    extension.
    """
    taken = frozenset(itervalues(self.used))
    if base not in taken:
        return base
    stem, ext = base.rpartition('.')[0::2]
    counter = 1
    while True:
        candidate = '%s-%d.%s' % (stem, counter, ext)
        if candidate not in taken:
            return candidate
        counter += 1
def resize_image(self, raw, base, max_width, max_height):
    """Shrink an image to fit the given box, if necessary.

    :return: ``(raw, base, resized)``. When the image was resized ``raw``
        holds the new data and ``base`` gains a ``-WxH`` suffix before its
        extension; otherwise both are returned unchanged.
    """
    resized, img = resize_to_fit(raw, max_width, max_height)
    if not resized:
        return raw, base, resized
    stem, ext = os.path.splitext(base)
    new_base = stem + '-%dx%d%s' % (max_width, max_height, ext)
    # The output format is the extension without its leading dot
    new_raw = image_to_data(img, fmt=ext[1:])
    return new_raw, new_base, resized
def generate_filename(self, rid, base=None, rid_map=None, max_width=None, max_height=None):
rid_map = self.rid_map if rid_map is None else rid_map
fname = rid_map[rid]
key = (fname, max_width, max_height)
ans = self.used.get(key)
if ans is not None:
return ans
raw, base = self.read_image_data(fname, base=base)
resized = False
if max_width is not None and max_height is not None:
raw, base, resized = self.resize_image(raw, base, max_width, max_height)
name = self.unique_name(base)
self.used[key] = name
if max_width is not None and max_height is not None and not resized:
okey = (fname, None, None)
if okey in self.used:
return self.used[okey]
self.used[okey] = name
with open(os.path.join(self.dest_dir, name), 'wb') as f:
f.write(raw)
self.all_images.add('images/' + name)
return name
def pic_to_img(self, pic, alt, parent, title):
    """Create the ``<img>`` element for a picture.

    :param pic: The picture element.
    :param alt: Fallback alternative text.
    :param parent: Container searched for a hyperlink around the picture.
    :param title: Title for the image; the tooltip of a surrounding
        hyperlink takes precedence when there is one.
    :return: The new element, or ``None`` if no image could be resolved.
        Images inside a hyperlink are also recorded in ``self.links``.
    """
    XPath, get = self.namespace.XPath, self.namespace.get
    name = None
    link = None
    for hl in XPath('descendant::a:hlinkClick[@r:id]')(parent):
        link = {'id':get(hl, 'r:id')}
        tgt = hl.get('tgtFrame', None)
        if tgt:
            link['target'] = tgt
        # Kept in its own variable so that a hyperlink without a tooltip
        # does not erase the title that was passed in
        tooltip = hl.get('tooltip', None)
        if tooltip:
            link['title'] = tooltip
            title = tooltip

    for pr in XPath('descendant::pic:cNvPr')(pic):
        name = pr.get('name', None)
        if name:
            name = image_filename(name)
        alt = pr.get('descr') or alt
        for a in XPath('descendant::a:blip[@r:embed or @r:link]')(pic):
            # Embedded image data is preferred over a linked image
            rid = get(a, 'r:embed')
            if not rid:
                rid = get(a, 'r:link')
            if rid and rid in self.rid_map:
                try:
                    src = self.generate_filename(rid, name)
                except LinkedImageNotFound as err:
                    self.log.warn('Linked image: %s not found, ignoring' % err.fname)
                    continue
                img = IMG(src='images/%s' % src)
                img.set('alt', alt or 'Image')
                if title:
                    img.set('title', title)
                if link is not None:
                    self.links.append((img, link, self.rid_map))
                return img
def drawing_to_html(self, drawing, page):
    """Yield an ``<img>`` element for every picture in ``drawing``.

    Inline pictures come first, followed by the floating (anchored) ones.
    Floating pictures additionally get their float properties computed for
    ``page``.
    """
    XPath, get = self.namespace.XPath, self.namespace.get
    # First process the inline pictures, then the floats
    for expr, floating in (('./wp:inline', False), ('./wp:anchor', True)):
        for container in XPath(expr)(drawing):
            style, alt, title = get_image_properties(container, XPath, get)
            if floating:
                self.get_float_properties(container, style, page)
            for pic in XPath('descendant::pic:pic')(container):
                img = self.pic_to_img(pic, alt, container, title)
                if img is None:
                    continue
                if style:
                    img.set('style', '; '.join('%s: %s' % (k, v) for k, v in iteritems(style)))
                yield img
def pict_to_html(self, pict, page):
    '''
    Yield the HTML elements for a pict element: an ``<hr>`` when its single
    child is flagged as a horizontal rule, followed by one ``<img>`` for
    every ``v:imagedata`` whose relationship id is known.
    '''
    XPath, get = self.namespace.XPath, self.namespace.get

    # First see if we have an <hr>
    if len(pict) == 1 and get(pict[0], 'o:hr') in {'t', 'true'}:
        css = {}
        hr = HR()
        try:
            pct = float(get(pict[0], 'o:hrpct'))
        except (ValueError, TypeError, AttributeError):
            pct = None
        if pct is not None and pct > 0:
            css['width'] = '%.3g%%' % pct
        align = get(pict[0], 'o:hralign', 'center')
        if align == 'left':
            css['margin-left'] = '0'
            css['margin-right'] = 'auto'
        elif align == 'right':
            css['margin-left'] = 'auto'
            css['margin-right'] = '0'
        if css:
            hr.set('style', '; '.join(('%s:%s' % (k, v) for k, v in iteritems(css))))
        yield hr

    for imagedata in XPath('descendant::v:imagedata[@r:id]')(pict):
        rid = get(imagedata, 'r:id')
        if rid not in self.rid_map:
            continue
        try:
            src = self.generate_filename(rid)
        except LinkedImageNotFound as err:
            self.log.warn('Linked image: %s not found, ignoring' % err.fname)
            continue
        img = IMG(src='images/%s' % src, style="display:block")
        img.set('alt', get(imagedata, 'o:title') or 'Image')
        yield img
# Add the CSS properties needed to position an anchored image to ``style``
# (a dict that is modified in place): display, float or auto margins, and the
# margins read from the anchor/wrap elements. Nothing is returned.
def get_float_properties(self, anchor, style, page):
XPath, get = self.namespace.XPath, self.namespace.get
if 'display' not in style:
style['display'] = 'block'
padding = get_image_margins(anchor)
# The last two characters are dropped before conversion to a number.
# NOTE(review): this presumes the width always carries a two character unit
# such as 'pt' -- confirm against whatever fills in style['width']
width = float(style.get('width', '100pt')[:-2])
page_width = page.width - page.margin_left - page.margin_right
if page_width <= 0:
# Ignore margins
page_width = page.width
# The last argument is half the image width expressed as a fraction of the
# usable page width
hpos = get_hpos(anchor, page_width, XPath, get, width/(2*page_width))
wrap_elem = None
dofloat = False
# Look for the wrap element, scanning the children from last to first and
# stopping at the first hit
for child in reversed(anchor):
bt = barename(child.tag)
if bt in {'wrapNone', 'wrapSquare', 'wrapThrough', 'wrapTight', 'wrapTopAndBottom'}:
wrap_elem = child
# wrapNone and wrapTopAndBottom are the two wrap types that do not float
dofloat = bt not in {'wrapNone', 'wrapTopAndBottom'}
break
if wrap_elem is not None:
# Margins on the wrap element take precedence over those on the anchor
padding.update(get_image_margins(wrap_elem))
wt = wrap_elem.get('wrapText', None)
# An explicit wrapText of 'right' or 'left' replaces the computed position
hpos = 0 if wt == 'right' else 1 if wt == 'left' else hpos
if dofloat:
style['float'] = 'left' if hpos < 0.65 else 'right'
else:
# Not floated: hpos < 0.34 sets no margins, hpos > 0.65 sets only
# margin-left to auto, anything in between sets both margins to auto
ml, mr = (None, None) if hpos < 0.34 else ('auto', None) if hpos > 0.65 else ('auto', 'auto')
if ml is not None:
style['margin-left'] = ml
if mr is not None:
style['margin-right'] = mr
style.update(padding)
def to_html(self, elem, page, docx, dest_dir):
    '''
    Yield the HTML elements for ``elem``, creating the ``images``
    sub-directory of ``dest_dir`` if needed and remembering it (together
    with ``docx``) on the instance.

    Elements whose tag ends with ``}drawing`` are handled by
    :meth:`drawing_to_html`, everything else by :meth:`pict_to_html`.
    '''
    images_dir = os.path.join(dest_dir, 'images')
    if not os.path.exists(images_dir):
        os.mkdir(images_dir)
    self.dest_dir = images_dir
    self.docx = docx
    if elem.tag.endswith('}drawing'):
        converter = self.drawing_to_html
    else:
        converter = self.pict_to_html
    for tag in converter(elem, page):
        yield tag

View File

@@ -0,0 +1,273 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
from operator import itemgetter
from lxml import etree
from calibre.utils.icu import partition_by_first_letter, sort_key
from polyglot.builtins import iteritems, filter
def get_applicable_xe_fields(index, xe_fields, XPath, expand):
    '''
    Return the XE (index entry) fields that belong to the specified index.

    Fields are filtered, in order, by:

      * ``entry-type``: must be equal to that of the index (both may be
        missing)
      * ``letter-range``: a ``start-end`` pair, the first character of the
        entry text must lie between start and end (inclusive). Entries
        without text never match a range.
      * ``bookmark``: the entry must be contained in a bookmark of that name

    :param index: Mapping of index properties
    :param xe_fields: List of mappings describing the XE fields
    :param XPath: Factory for compiled XPath expressions
    :param expand: Function to expand prefixed names such as ``w:name``
    '''
    iet = index.get('entry-type', None)
    xe_fields = [xe for xe in xe_fields if xe.get('entry-type', None) == iet]

    lr = index.get('letter-range', None)
    if lr is not None:
        sl, el = lr.partition('-')[0::2]
        sl, el = sl.strip(), el.strip()
        if sl and el:
            def inrange(text):
                # Guard against empty text, which has no first character
                return bool(text) and sl <= text[0] <= el
            xe_fields = [xe for xe in xe_fields if inrange(xe.get('text', ''))]

    bmark = index.get('bookmark', None)
    # With nothing left there is no element to search for bookmarks from
    if bmark is None or not xe_fields:
        return xe_fields
    attr = expand('w:name')
    bookmarks = {b for b in XPath('//w:bookmarkStart')(xe_fields[0]['start_elem']) if b.get(attr, None) == bmark}
    ancestors = XPath('ancestor::w:bookmarkStart')

    def contained(xe):
        # Check if the xe field is contained inside a bookmark with the
        # specified name
        return bool(set(ancestors(xe['start_elem'])) & bookmarks)

    return [xe for xe in xe_fields if contained(xe)]
def make_block(expand, style, parent, pos):
    '''
    Insert a new ``w:p`` into ``parent`` at ``pos``. The paragraph gets a
    ``w:pPr/w:pStyle`` pointing at ``style`` (unless style is None) followed
    by a ``w:r`` holding a single space preserving ``w:t``.

    :return: ``(paragraph, text_element)``
    '''
    para = parent.makeelement(expand('w:p'))
    parent.insert(pos, para)

    if style is not None:
        props = para.makeelement(expand('w:pPr'))
        para.append(props)
        style_ref = props.makeelement(expand('w:pStyle'))
        props.append(style_ref)
        style_ref.set(expand('w:val'), style)

    run = para.makeelement(expand('w:r'))
    para.append(run)
    text_elem = run.makeelement(expand('w:t'))
    text_elem.set(expand('xml:space'), 'preserve')
    run.append(text_elem)
    return para, text_elem
def add_xe(xe, t, expand):
    '''
    Fill the text element ``t`` with the text of the index entry ``xe``.

    If the entry has page number text, a second run containing it in square
    brackets is appended to the paragraph. A ``w:br`` is always inserted
    directly after ``t``.

    :return: ``(anchor of the entry, run containing t)``
    '''
    run = t.getparent()
    br_pos = run.index(t) + 1
    t.text = xe.get('text') or ' '

    page_text = xe.get('page-number-text', None)
    if page_text:
        para = run.getparent()
        page_run = para.makeelement(expand('w:r'))
        para.append(page_run)
        page_t = page_run.makeelement(expand('w:t'))
        page_t.set(expand('xml:space'), 'preserve')
        page_t.text = ' [%s]' % page_text
        page_run.append(page_t)

    # put separate entries on separate lines
    run.insert(br_pos, run.makeelement(expand('w:br')))
    return xe['anchor'], run
def process_index(field, index, xe_fields, log, XPath, expand):
    '''
    We remove all the word generated index markup and replace it with our own
    that is more suitable for an ebook.

    :return: ``(hyperlinks, blocks)`` where hyperlinks is the list of values
        returned by :func:`add_xe` and blocks is the list of paragraphs
        created for the entries. Both are empty if there are no applicable
        entries or no paragraph of the original index was found to mark the
        insertion position.
    '''
    styles = []
    heading_text = index.get('heading', None)
    heading_style = 'IndexHeading'
    start_pos = None
    for elem in field.contents:
        if elem.tag.endswith('}p'):
            # The w: prefix is required, an unprefixed name test only matches
            # elements that are in no namespace
            s = XPath('descendant::w:pStyle/@w:val')(elem)
            if s:
                styles.append(s[0])
            p = elem.getparent()
            if start_pos is None:
                start_pos = (p, p.index(elem))
            p.remove(elem)

    xe_fields = get_applicable_xe_fields(index, xe_fields, XPath, expand)
    # Without a start position there is nowhere to insert the new blocks
    if not xe_fields or start_pos is None:
        return [], []
    if heading_text is not None:
        groups = partition_by_first_letter(xe_fields, key=itemgetter('text'))
        items = []
        for key, fields in iteritems(groups):
            items.append(key)
            items.extend(fields)
        if styles:
            heading_style = styles[0]
    else:
        items = sorted(xe_fields, key=lambda x: sort_key(x['text']))

    hyperlinks = []
    blocks = []
    # Every block is inserted at the same position, so going through the
    # items in reverse leaves them in the correct order in the document
    for item in reversed(items):
        # Group keys are the only items that are not dicts
        is_heading = not isinstance(item, dict)
        style = heading_style if is_heading else None
        p, t = make_block(expand, style, *start_pos)
        if is_heading:
            text = heading_text
            if text.lower().startswith('a'):
                # Replace the leading A of the heading with the group key
                text = item + text[1:]
            t.text = text
        else:
            hyperlinks.append(add_xe(item, t, expand))
            blocks.append(p)

    return hyperlinks, blocks
def split_up_block(block, a, text, parts, ldict):
    '''
    Split a multi-level index entry into one ``<span>`` per level.

    Every component of ``parts`` except the last becomes a text only span,
    the last component becomes the text of the link ``a`` which is moved into
    a final span. Spans are appended to the parent of ``a``, indented by
    1.5em per level, and the level of every span is recorded in ``ldict``.

    :param block: The block containing ``a`` (not used)
    :param a: The link element of the entry
    :param text: The full text of the entry (not used)
    :param parts: The components of the entry, outermost first
    :param ldict: Mapping of span to level, updated in place
    '''
    prefixes = parts[:-1]
    a.text = parts[-1]
    parent = a.getparent()
    style = 'display:block; margin-left: %.3gem'
    for level, label in enumerate(prefixes):
        span = parent.makeelement('span', style=style % (1.5 * level))
        ldict[span] = level
        parent.append(span)
        span.text = label
    # The link sits one level below the last prefix. This used to be recorded
    # as the character length of the last prefix string, because the loop
    # variable shadowed the list of prefixes.
    link_level = len(prefixes)
    span = parent.makeelement('span', style=style % (link_level * 1.5))
    parent.append(span)
    span.append(a)
    ldict[span] = link_level
"""
The merge algorithm is a little tricky.
We start with a list of elementary blocks. Each is an HtmlElement, a p node
with a list of child nodes. The last child may be a link, and the earlier ones are
just text.
The list is in reverse order from what we want in the index.
There is a dictionary ldict which records the level of each child node.
Now we want to do a reduce-like operation, combining all blocks with the same
top level index entry into a single block representing the structure of all
references, subentries, etc. under that top entry.
Here's the algorithm.
Given a block p and the next block n, and the top level entries p1 and n1 in each
block, which we assume have the same text:
Start with (p, p1) and (n, n1).
Given (p, p1, ..., pk) and (n, n1, ..., nk) which we want to merge:
If there are no more levels in n, and we have a link in nk,
then add the link from nk to the links for pk.
This might be the first link for pk, or we might get a list of references.
Otherwise nk+1 is the next level in n. Look for a matching entry in p. It must have
the same text, it must follow pk, it must come before we find any other p entries at
the same level as pk, and it must have the same level as nk+1.
If we find such a matching entry, go back to the start with (p ... pk+1) and (n ... nk+1).
If there is no matching entry, then because of the original reversed order we want
to insert nk+1 and all following entries from n into p immediately following pk.
"""
def find_match(prev_block, pind, nextent, ldict):
    '''
    Search the children of ``prev_block`` that follow position ``pind`` for
    an entry exactly one level below ``prev_block[pind]`` that has the same
    text content as ``nextent``.

    The search is abandoned at the first child whose level is not greater
    than that of ``prev_block[pind]`` (children missing from ``ldict`` count
    as level -1).

    :return: The position of the match or -1
    '''
    base_level = ldict.get(prev_block[pind], -1)
    if base_level < 0:
        return -1
    for pos in range(pind + 1, len(prev_block)):
        candidate = prev_block[pos]
        level = ldict.get(candidate, -1)
        if level <= base_level:
            break
        if level <= base_level + 1 and candidate.text_content() == nextent.text_content():
            return pos
    return -1
def add_link(pent, nent, ldict):
    '''
    Move the first link found in ``nent`` into ``pent``.

    If ``pent`` already has links, the new one is placed directly after the
    last of them, separated by a comma. Otherwise the text of ``pent`` is
    cleared and the link appended. Nothing happens if ``nent`` has no link.
    ``ldict`` is not used.
    '''
    incoming = nent.xpath('descendant::a[1]')
    # If there is no link, leave it as text
    if not incoming:
        return
    link = incoming[0]
    existing = pent.xpath('descendant::a')
    if not existing:
        # substitute link for plain text in pent
        pent.text = ""
        pent.append(link)
        return
    # Put on same line with a comma
    last = existing[-1]
    last.tail = ', '
    holder = last.getparent()
    holder.insert(holder.index(last) + 1, link)
# Merge next_block into prev_block, following the algorithm described in the
# module level comment above. pind and nind are the positions of the entries
# in prev_block and next_block that have already been matched against each
# other, next_path is the list of path components of next_block.
def merge_blocks(prev_block, next_block, pind, nind, next_path, ldict):
# First elements match. Any more in next?
if len(next_path) == (nind + 1):
# No deeper levels in next_block: only its link is merged. Note that this
# branch returns without removing next_block from its parent.
nextent = next_block[nind]
add_link(prev_block[pind], nextent, ldict)
return
nind = nind + 1
nextent = next_block[nind]
prevent = find_match(prev_block, pind, nextent, ldict)
# find_match() only returns positions after pind (so at least 1) or -1
if prevent > 0:
merge_blocks(prev_block, next_block, prevent, nind, next_path, ldict)
return
# Want to insert elements into previous block
# The loop ends because every insert shrinks next_block; nind itself is never
# advanced
while nind < len(next_block):
# insert takes it out of old
pind = pind + 1
prev_block.insert(pind, next_block[nind])
next_block.getparent().remove(next_block)
def polish_index_markup(index, blocks):
    '''
    Turn the flat list of index entry blocks into nested index markup.

    Every block gets the ``index-entry`` class and has its link wrapped in
    one ``<span>`` per level (entries of the form ``a:b:c`` are split on the
    colons). Consecutive blocks that share the same top level entry are then
    merged into a single block.

    :param index: The index being processed (not used)
    :param blocks: The blocks, in reverse order. May be empty.
    '''
    if not blocks:
        return
    # Blocks are in reverse order at this point
    path_map = {}
    ldict = {}
    for block in blocks:
        cls = block.get('class', '') or ''
        block.set('class', (cls + ' index-entry').lstrip())
        a = block.xpath('descendant::a[1]')
        text = ''
        if a:
            text = etree.tostring(a[0], method='text', with_tail=False, encoding='unicode').strip()
        if not a:
            # No link, so there is nothing to wrap. The empty path ensures
            # the block is never merged with its neighbours.
            path_map[block] = []
        elif ':' in text:
            path_map[block] = parts = list(filter(None, (x.strip() for x in text.split(':'))))
            if len(parts) > 1:
                split_up_block(block, a[0], text, parts, ldict)
        else:
            # try using a span all the time
            path_map[block] = [text]
            parent = a[0].getparent()
            span = parent.makeelement('span', style='display:block; margin-left: 0em')
            parent.append(span)
            span.append(a[0])
            ldict[span] = 0

        for br in block.xpath('descendant::br'):
            br.tail = None

    # We want a single block for each main entry
    prev_block = blocks[0]
    for block in blocks[1:]:
        pp, pn = path_map[prev_block], path_map[block]
        # Empty paths (no link, or text made up of only colons) never match
        if pp and pn and pp[0] == pn[0]:
            merge_blocks(prev_block, block, 0, 0, pn, ldict)
        else:
            prev_block = block

View File

@@ -0,0 +1,144 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import re
from lxml.etree import XPath as X
from calibre.utils.filenames import ascii_text
from polyglot.builtins import iteritems
# Names {{{
# Relationship type URIs, keyed by a short name, as used by the
# schemas.openxmlformats.org (transitional) form of the format
TRANSITIONAL_NAMES = {
'DOCUMENT' : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument',
'DOCPROPS' : 'http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties',
'APPPROPS' : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/extended-properties',
'STYLES' : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles',
'NUMBERING' : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/numbering',
'FONTS' : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/fontTable',
'EMBEDDED_FONT' : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/font',
'IMAGES' : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/image',
'LINKS' : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink',
'FOOTNOTES' : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/footnotes',
'ENDNOTES' : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/endnotes',
'THEMES' : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/theme',
'SETTINGS' : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/settings',
'WEB_SETTINGS' : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/webSettings',
}
# The same keys with the officeDocument/2006 prefix swapped for its
# purl.oclc.org/ooxml equivalent. Values that do not contain that prefix
# (DOCPROPS, which lives under package/2006) are left as they are.
STRICT_NAMES = {
k:v.replace('http://schemas.openxmlformats.org/officeDocument/2006', 'http://purl.oclc.org/ooxml/officeDocument')
for k, v in iteritems(TRANSITIONAL_NAMES)
}
# XML namespace URIs keyed by the prefix used for them in this code (for
# example in XPath expressions). Note that 've' and 'mc' are two prefixes for
# the same namespace.
TRANSITIONAL_NAMESPACES = {
'mo': 'http://schemas.microsoft.com/office/mac/office/2008/main',
'o': 'urn:schemas-microsoft-com:office:office',
've': 'http://schemas.openxmlformats.org/markup-compatibility/2006',
'mc': 'http://schemas.openxmlformats.org/markup-compatibility/2006',
# Text Content
'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
'w10': 'urn:schemas-microsoft-com:office:word',
'wne': 'http://schemas.microsoft.com/office/word/2006/wordml',
'xml': 'http://www.w3.org/XML/1998/namespace',
# Drawing
'a': 'http://schemas.openxmlformats.org/drawingml/2006/main',
'm': 'http://schemas.openxmlformats.org/officeDocument/2006/math',
'mv': 'urn:schemas-microsoft-com:mac:vml',
'pic': 'http://schemas.openxmlformats.org/drawingml/2006/picture',
'v': 'urn:schemas-microsoft-com:vml',
'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing',
# Properties (core and extended)
'cp': 'http://schemas.openxmlformats.org/package/2006/metadata/core-properties',
'dc': 'http://purl.org/dc/elements/1.1/',
'ep': 'http://schemas.openxmlformats.org/officeDocument/2006/extended-properties',
'xsi': 'http://www.w3.org/2001/XMLSchema-instance',
# Content Types
'ct': 'http://schemas.openxmlformats.org/package/2006/content-types',
# Package Relationships
'r': 'http://schemas.openxmlformats.org/officeDocument/2006/relationships',
'pr': 'http://schemas.openxmlformats.org/package/2006/relationships',
# Dublin Core document properties
'dcmitype': 'http://purl.org/dc/dcmitype/',
'dcterms': 'http://purl.org/dc/terms/'
}
# The same prefixes with the officeDocument, wordprocessingml and drawingml
# prefixes swapped for their purl.oclc.org/ooxml equivalents. All other
# namespaces (package/2006, the urn: ones, Dublin Core, ...) are unchanged.
STRICT_NAMESPACES = {
k:v.replace(
'http://schemas.openxmlformats.org/officeDocument/2006', 'http://purl.oclc.org/ooxml/officeDocument').replace(
'http://schemas.openxmlformats.org/wordprocessingml/2006', 'http://purl.oclc.org/ooxml/wordprocessingml').replace(
'http://schemas.openxmlformats.org/drawingml/2006', 'http://purl.oclc.org/ooxml/drawingml')
for k, v in iteritems(TRANSITIONAL_NAMESPACES)
}
# }}}
def barename(x):
    ''' Return the part of ``x`` after the last ``}`` (all of ``x`` if there is none) '''
    _, _, local = x.rpartition('}')
    return local
def XML(x):
    ''' Return the name ``x`` qualified with the ``xml`` namespace, in ``{uri}name`` form '''
    xml_ns = TRANSITIONAL_NAMESPACES['xml']
    return '{%s}%s' % (xml_ns, x)
def generate_anchor(name, existing):
    '''
    Return an anchor derived from ``name`` that is not present in
    ``existing``.

    The anchor is ``id_`` followed by the ASCII form of name with everything
    except letters, digits and underscores removed (and leading underscores
    stripped). If it is already taken, ``_1``, ``_2``, ... are appended until
    an unused value is found. ``existing`` is not modified.
    '''
    base = 'id_' + re.sub(r'[^0-9a-zA-Z_]', '', ascii_text(name)).lstrip('_')
    candidate = base
    suffix = 1
    while candidate in existing:
        candidate = '%s_%d' % (base, suffix)
        suffix += 1
    return candidate
class DOCXNamespace(object):
    '''
    The set of namespaces and relationship names in use for a document,
    either the transitional or the strict variant, together with helpers for
    working with prefixed names such as ``w:p``.
    '''

    def __init__(self, transitional=True):
        self.xpath_cache = {}
        if transitional:
            namespaces, names = TRANSITIONAL_NAMESPACES, TRANSITIONAL_NAMES
        else:
            namespaces, names = STRICT_NAMESPACES, STRICT_NAMES
        # Copies, so that changes made via an instance stay local to it
        self.namespaces = namespaces.copy()
        self.names = names.copy()

    def XPath(self, expr):
        ''' Return the compiled form of ``expr``, compiling it on first use '''
        compiled = self.xpath_cache.get(expr, None)
        if compiled is None:
            compiled = self.xpath_cache[expr] = X(expr, namespaces=self.namespaces)
        return compiled

    def is_tag(self, x, q):
        ''' Check if ``x`` (an element or a tag name) is the tag with prefixed name ``q`` '''
        tag = getattr(x, 'tag', x)
        prefix, _, local = q.partition(':')
        return '{%s}%s' % (self.namespaces.get(prefix, None), local) == tag

    def expand(self, name, sep=':'):
        '''
        Convert ``prefix<sep>local`` into ``{namespace}local``. Names without
        both a prefix and a local part are returned without the separator.
        Raises KeyError for unknown prefixes.
        '''
        prefix, _, local = name.partition(sep)
        if prefix and local:
            return '{%s}%s' % (self.namespaces[prefix], local)
        return local or prefix

    def get(self, x, attr, default=None):
        ''' Return the value of the attribute with prefixed name ``attr`` on ``x`` '''
        return x.attrib.get(self.expand(attr), default)

    def ancestor(self, elem, name):
        ''' Return the nearest ancestor of ``elem`` matching ``name`` or None '''
        found = self.XPath('ancestor::%s[1]' % name)(elem)
        return found[0] if found else None

    def children(self, elem, *args):
        ''' Return the children of ``elem`` that match any of ``args`` '''
        expr = '|'.join('child::%s' % a for a in args)
        return self.XPath(expr)(elem)

    def descendants(self, elem, *args):
        ''' Return the descendants of ``elem`` that match any of ``args`` '''
        expr = '|'.join('descendant::%s' % a for a in args)
        return self.XPath(expr)(elem)

    def makeelement(self, root, tag, append=True, **attrs):
        '''
        Create an element using ``root.makeelement()`` and, unless ``append``
        is False, append it to root. ``tag`` is a prefixed name, attribute
        names use an underscore as the separator, for example ``w_val``.
        '''
        expanded_attrs = {self.expand(k, sep='_'): v for k, v in iteritems(attrs)}
        node = root.makeelement(self.expand(tag), **expanded_attrs)
        if append:
            root.append(node)
        return node

View File

@@ -0,0 +1,388 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import re, string
from collections import Counter, defaultdict
from functools import partial
from lxml.html.builder import OL, UL, SPAN
from calibre.ebooks.docx.block_styles import ParagraphStyle
from calibre.ebooks.docx.char_styles import RunStyle, inherit
from calibre.ebooks.metadata import roman
from polyglot.builtins import iteritems, unicode_type
# Maps the value of w:numFmt to the value used for the CSS list-style-type
# property. Formats not present here fall back to 'decimal' (see
# Level.read_from_xml)
STYLE_MAP = {
'aiueo': 'hiragana',
'aiueoFullWidth': 'hiragana',
'hebrew1': 'hebrew',
'iroha': 'katakana-iroha',
'irohaFullWidth': 'katakana-iroha',
'lowerLetter': 'lower-alpha',
'lowerRoman': 'lower-roman',
'none': 'none',
'upperLetter': 'upper-alpha',
'upperRoman': 'upper-roman',
'chineseCounting': 'cjk-ideographic',
'decimalZero': 'decimal-leading-zero',
}
def alphabet(val, lower=True):
    '''
    Return the ASCII letter for the (one based) number ``val``, wrapping
    around after z. Lower case unless ``lower`` is False.
    '''
    letters = string.ascii_lowercase if lower else string.ascii_uppercase
    position = abs(val - 1) % len(letters)
    return letters[position]
# Functions to convert a counter value into text, keyed by the value of the
# CSS list-style-type property (the values of STYLE_MAP). Formats without an
# entry here are rendered as plain decimal numbers by the caller.
alphabet_map = {
    'lower-alpha': alphabet, 'upper-alpha': partial(alphabet, lower=False),
    'lower-roman': lambda x: roman(x).lower(), 'upper-roman': roman,
    # Zero padded to two digits: 01 ... 09, 10, 11, ... A fixed '0' prefix
    # would wrongly turn 10 into 010
    'decimal-leading-zero': lambda x: '%02d' % x
}
# A single level of a numbering definition: its start value, restart rule,
# number format, templates for the number/bullet text and the paragraph and
# character styles attached to it.
class Level(object):
# lvl is the XML element to read the level from. If it is None, the level
# is left with the defaults set below.
def __init__(self, namespace, lvl=None):
self.namespace = namespace
self.restart = None
self.start = 0
# Value for the CSS list-style-type property
self.fmt = 'decimal'
# Value of w:pStyle, if any
self.para_link = None
self.paragraph_style = self.character_style = None
self.is_numbered = False
self.num_template = None
self.bullet_template = None
self.pic_id = None
if lvl is not None:
self.read_from_xml(lvl)
# Return a new Level with the same attribute values. The values themselves
# (including the style objects) are shared, not copied.
def copy(self):
ans = Level(self.namespace)
for x in ('restart', 'pic_id', 'start', 'fmt', 'para_link', 'paragraph_style', 'character_style', 'is_numbered', 'num_template', 'bullet_template'):
setattr(ans, x, getattr(self, x))
return ans
# Replace the %N placeholders in template with the formatted values from
# counter (a mapping of level number to current value). The result has
# trailing whitespace removed and a non-breaking space appended.
def format_template(self, counter, ilvl, template):
def sub(m):
# Placeholders are one based, level numbers zero based
x = int(m.group(1)) - 1
# Levels deeper than the current one, or without a counter, give no text
if x > ilvl or x not in counter:
return ''
# One is subtracted for all levels other than the current one.
# NOTE(review): presumably because the counters of those levels have already
# been advanced past their current item -- confirm against update_counter
val = counter[x] - (0 if x == ilvl else 1)
# The format of this level is used for all placeholders
formatter = alphabet_map.get(self.fmt, lambda x: '%d' % x)
return formatter(val)
return re.sub(r'%(\d+)', sub, template).rstrip() + '\xa0'
# Update this level from the child elements of lvl. Values that are missing
# or not valid integers leave the current value untouched. The override
# parameter is not used in this method.
def read_from_xml(self, lvl, override=False):
XPath, get = self.namespace.XPath, self.namespace.get
for lr in XPath('./w:lvlRestart[@w:val]')(lvl):
try:
self.restart = int(get(lr, 'w:val'))
except (TypeError, ValueError):
pass
for lr in XPath('./w:start[@w:val]')(lvl):
try:
self.start = int(get(lr, 'w:val'))
except (TypeError, ValueError):
pass
for rPr in XPath('./w:rPr')(lvl):
ps = RunStyle(self.namespace, rPr)
if self.character_style is None:
self.character_style = ps
else:
self.character_style.update(ps)
# The level text has to be read before w:numFmt is processed, as it is needed
# there
lt = None
for lr in XPath('./w:lvlText[@w:val]')(lvl):
lt = get(lr, 'w:val')
for lr in XPath('./w:numFmt[@w:val]')(lvl):
val = get(lr, 'w:val')
if val == 'bullet':
self.is_numbered = False
cs = self.character_style
# Bullets whose text is one of two known characters, or that use the
# wingdings/symbol fonts, are mapped to square, circle or disc. All other
# bullet text is kept as a template.
if lt in {'\uf0a7', 'o'} or (
cs is not None and cs.font_family is not inherit and cs.font_family.lower() in {'wingdings', 'symbol'}):
self.fmt = {'\uf0a7':'square', 'o':'circle'}.get(lt, 'disc')
else:
self.bullet_template = lt
for lpid in XPath('./w:lvlPicBulletId[@w:val]')(lvl):
self.pic_id = get(lpid, 'w:val')
else:
self.is_numbered = True
self.fmt = STYLE_MAP.get(val, 'decimal')
# Level text that is just a single placeholder followed by a period (for
# example %1.) is not stored as a template
if lt and re.match(r'%\d+\.$', lt) is None:
self.num_template = lt
for lr in XPath('./w:pStyle[@w:val]')(lvl):
self.para_link = get(lr, 'w:val')
for pPr in XPath('./w:pPr')(lvl):
ps = ParagraphStyle(self.namespace, pPr)
if self.paragraph_style is None:
self.paragraph_style = ps
else:
self.paragraph_style.update(ps)
# Return the CSS properties for a list at this level, as a dict. pic_map maps
# picture bullet ids to relationship ids.
def css(self, images, pic_map, rid_map):
ans = {'list-style-type': self.fmt}
if self.pic_id:
rid = pic_map.get(self.pic_id, None)
if rid:
try:
fname = images.generate_filename(rid, rid_map=rid_map, max_width=20, max_height=20)
except Exception:
# Any failure to generate the image just means no list-style-image is set
fname = None
else:
ans['list-style-image'] = 'url("images/%s")' % fname
return ans
# Return the CSS of the character style of this level, without font-family.
# An empty dict is returned when there is no character style.
# NOTE(review): font-family is popped from the object returned by
# character_style.css itself; if that is a shared/cached dict this modifies
# it for all users -- confirm
def char_css(self):
try:
css = self.character_style.css
except AttributeError:
css = {}
css.pop('font-family', None)
return css
class NumberingDefinition(object):
    '''
    A set of numbering levels, keyed by level number, together with the id
    of the abstract numbering definition they come from.
    '''

    def __init__(self, namespace, parent=None, an_id=None):
        self.namespace = namespace
        XPath, get = self.namespace.XPath, self.namespace.get
        self.levels = {}
        self.abstract_numbering_definition_id = an_id
        if parent is None:
            return
        for lvl in XPath('./w:lvl')(parent):
            try:
                level_number = int(get(lvl, 'w:ilvl', 0))
            except (TypeError, ValueError):
                # Unparseable level numbers are treated as level 0
                level_number = 0
            self.levels[level_number] = Level(namespace, lvl)

    def copy(self):
        ''' Return a new definition with the same id and a copy of every level '''
        clone = NumberingDefinition(self.namespace, an_id=self.abstract_numbering_definition_id)
        for level_number, level in self.levels.items():
            clone.levels[level_number] = level.copy()
        return clone
class Numbering(object):
def __init__(self, namespace):
self.namespace = namespace
self.definitions = {}
self.instances = {}
self.counters = defaultdict(Counter)
self.starts = {}
self.pic_map = {}
def __call__(self, root, styles, rid_map):
' Read all numbering style definitions '
XPath, get = self.namespace.XPath, self.namespace.get
self.rid_map = rid_map
for npb in XPath('./w:numPicBullet[@w:numPicBulletId]')(root):
npbid = get(npb, 'w:numPicBulletId')
for idata in XPath('descendant::v:imagedata[@r:id]')(npb):
rid = get(idata, 'r:id')
self.pic_map[npbid] = rid
lazy_load = {}
for an in XPath('./w:abstractNum[@w:abstractNumId]')(root):
an_id = get(an, 'w:abstractNumId')
nsl = XPath('./w:numStyleLink[@w:val]')(an)
if nsl:
lazy_load[an_id] = get(nsl[0], 'w:val')
else:
nd = NumberingDefinition(self.namespace, an, an_id=an_id)
self.definitions[an_id] = nd
def create_instance(n, definition):
nd = definition.copy()
start_overrides = {}
for lo in XPath('./w:lvlOverride')(n):
try:
ilvl = int(get(lo, 'w:ilvl'))
except (ValueError, TypeError):
ilvl = None
for so in XPath('./w:startOverride[@w:val]')(lo):
try:
start_override = int(get(so, 'w:val'))
except (TypeError, ValueError):
pass
else:
start_overrides[ilvl] = start_override
for lvl in XPath('./w:lvl')(lo)[:1]:
nilvl = get(lvl, 'w:ilvl')
ilvl = nilvl if ilvl is None else ilvl
alvl = nd.levels.get(ilvl, None)
if alvl is None:
alvl = Level(self.namespace)
alvl.read_from_xml(lvl, override=True)
for ilvl, so in iteritems(start_overrides):
try:
nd.levels[ilvl].start = start_override
except KeyError:
pass
return nd
next_pass = {}
for n in XPath('./w:num[@w:numId]')(root):
an_id = None
num_id = get(n, 'w:numId')
for an in XPath('./w:abstractNumId[@w:val]')(n):
an_id = get(an, 'w:val')
d = self.definitions.get(an_id, None)
if d is None:
next_pass[num_id] = (an_id, n)
continue
self.instances[num_id] = create_instance(n, d)
numbering_links = styles.numbering_style_links
for an_id, style_link in iteritems(lazy_load):
num_id = numbering_links[style_link]
self.definitions[an_id] = self.instances[num_id].copy()
for num_id, (an_id, n) in iteritems(next_pass):
d = self.definitions.get(an_id, None)
if d is not None:
self.instances[num_id] = create_instance(n, d)
for num_id, d in iteritems(self.instances):
self.starts[num_id] = {lvl:d.levels[lvl].start for lvl in d.levels}
def get_pstyle(self, num_id, style_id):
    '''
    Return the number of the level, in the numbering instance ``num_id``,
    that is linked to the paragraph style ``style_id``. None is returned if
    there is no such instance or level.
    '''
    definition = self.instances.get(num_id, None)
    if definition is None:
        return None
    for level_number, level in definition.levels.items():
        if level.para_link == style_id:
            return level_number
    return None
def get_para_style(self, num_id, lvl):
    '''
    Return the paragraph style of level ``lvl`` in the numbering instance
    ``num_id``, or None if the instance, the level or the style is missing.
    '''
    definition = self.instances.get(num_id, None)
    if definition is None:
        return None
    level = definition.levels.get(lvl, None)
    return getattr(level, 'paragraph_style', None)
def update_counter(self, counter, levelnum, levels):
    '''
    Advance the counter for ``levelnum`` and reset, to their start value, the
    counters of the levels that restart after it. Those are the levels whose
    restart value is ``levelnum + 1`` and, when no restart value is set, the
    level directly below ``levelnum``.
    '''
    counter[levelnum] += 1
    trigger = levelnum + 1
    for level_number, level in levels.items():
        restart = level.restart
        implicit = restart is None and level_number == trigger
        if implicit or restart == trigger:
            counter[level_number] = level.start
# Convert the numbered paragraphs in items -- an iterable of
# (paragraph element, num id, level number) -- into list markup inside body.
# This happens in three steps: the paragraphs are turned into <li> tags
# carrying temporary list-* attributes, runs of adjacent <li> tags are wrapped
# in <ol>/<ul> tags and finally lists that use custom number/bullet text are
# converted into CSS tables.
def apply_markup(self, items, body, styles, object_map, images):
seen_instances = set()
for p, num_id, ilvl in items:
d = self.instances.get(num_id, None)
if d is not None:
lvl = d.levels.get(ilvl, None)
if lvl is not None:
an_id = d.abstract_numbering_definition_id
# Counters are per abstract definition, so they are shared between all
# instances of it
counter = self.counters[an_id]
# The first time an instance is seen, the counter for the level is set to the
# start value of that instance
if ilvl not in counter or num_id not in seen_instances:
counter[ilvl] = self.starts[num_id][ilvl]
seen_instances.add(num_id)
p.tag = 'li'
p.set('value', '%s' % counter[ilvl])
# Temporary attributes, removed again in commit() below
p.set('list-lvl', unicode_type(ilvl))
p.set('list-id', num_id)
if lvl.num_template is not None:
val = lvl.format_template(counter, ilvl, lvl.num_template)
p.set('list-template', val)
elif lvl.bullet_template is not None:
val = lvl.format_template(counter, ilvl, lvl.bullet_template)
p.set('list-template', val)
self.update_counter(counter, ilvl, d.levels)
# Longest template text seen so far, per list id + level
templates = {}
# Wrap the <li> tags in current_run in a list tag, inserted at the position
# of the first of them. current_run is emptied afterwards.
def commit(current_run):
if not current_run:
return
start = current_run[0]
parent = start.getparent()
idx = parent.index(start)
d = self.instances[start.get('list-id')]
ilvl = int(start.get('list-lvl'))
lvl = d.levels[ilvl]
lvlid = start.get('list-id') + start.get('list-lvl')
has_template = 'list-template' in start.attrib
# Lists with a template are always <ol>, they are converted to tables at the
# end of apply_markup()
wrap = (OL if lvl.is_numbered or has_template else UL)('\n\t')
if has_template:
wrap.set('lvlid', lvlid)
else:
wrap.set('class', styles.register(lvl.css(images, self.pic_map, self.rid_map), 'list'))
ccss = lvl.char_css()
if ccss:
ccss = styles.register(ccss, 'bullet')
parent.insert(idx, wrap)
last_val = None
for child in current_run:
wrap.append(child)
child.tail = '\n\t'
if has_template:
# Move the entire content of the item into a span and put a second span, with
# the template text, in front of it
span = SPAN()
span.text = child.text
child.text = None
for gc in child:
span.append(gc)
child.append(span)
span = SPAN(child.get('list-template'))
if ccss:
span.set('class', ccss)
last = templates.get(lvlid, '')
if span.text and len(span.text) > len(last):
templates[lvlid] = span.text
child.insert(0, span)
for attr in ('list-lvl', 'list-id', 'list-template'):
child.attrib.pop(attr, None)
val = int(child.get('value'))
# The value attribute is only kept where it is needed: it is dropped for
# items in <ul>, for items that continue the sequence and for a first item
# with the value 1
if last_val == val - 1 or wrap.tag == 'ul' or (last_val is None and val == 1):
child.attrib.pop('value')
last_val = val
current_run[-1].tail = '\n'
del current_run[:]
parents = set()
for child in body.iterdescendants('li'):
parents.add(child.getparent())
for parent in parents:
current_run = []
for child in parent:
if child.tag == 'li':
if current_run:
last = current_run[-1]
# A change of list id or level starts a new list
if (last.get('list-id') , last.get('list-lvl')) != (child.get('list-id'), child.get('list-lvl')):
commit(current_run)
current_run.append(child)
else:
# Any other tag ends the current run
commit(current_run)
commit(current_run)
# Convert the list items that use custom text for bullets into tables
# so that they display correctly
for wrap in body.xpath('//ol[@lvlid]'):
wrap.attrib.pop('lvlid')
wrap.tag = 'div'
wrap.set('style', 'display:table')
for i, li in enumerate(wrap.iterchildren('li')):
li.tag = 'div'
li.attrib.pop('value', None)
li.set('style', 'display:table-row')
obj = object_map[li]
bs = styles.para_cache[obj]
if i == 0:
# The left margin of the first item becomes the left padding of the table
wrap.set('style', 'display:table; padding-left:%s' %
bs.css.get('margin-left', '0'))
# The left margin is removed from the style of every item
bs.css.pop('margin-left', None)
for child in li:
child.set('style', 'display:table-cell')

View File

@@ -0,0 +1,21 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
class Settings(object):
    '''
    Document settings. Currently only ``default_tab_stop``, which is the
    value of ``w:defaultTabStop`` divided by 20 (720 / 20 when not specified).
    '''

    def __init__(self, namespace):
        self.default_tab_stop = 720 / 20
        self.namespace = namespace

    def __call__(self, root):
        ''' Read the settings from ``root``, ignoring unparseable values '''
        for dts in self.namespace.XPath('//w:defaultTabStop[@w:val]')(root):
            try:
                raw = int(self.namespace.get(dts, 'w:val'))
            except (ValueError, TypeError, AttributeError):
                continue
            self.default_tab_stop = raw / 20

View File

@@ -0,0 +1,504 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import textwrap
from collections import OrderedDict, Counter
from calibre.ebooks.docx.block_styles import ParagraphStyle, inherit, twips
from calibre.ebooks.docx.char_styles import RunStyle
from calibre.ebooks.docx.tables import TableStyle
from polyglot.builtins import iteritems, itervalues
class PageProperties(object):
    '''
    Class representing page level properties (page size/margins) read from
    sectPr elements.
    '''

    def __init__(self, namespace, elems=()):
        self.width, self.height = 595.28, 841.89  # pts, A4
        self.margin_left = self.margin_right = 72  # pts

        def read(node, attr_map):
            # Values that twips() cannot convert leave the current value as is
            for attr, qname in attr_map:
                val = twips(namespace.get(node, qname))
                if val is not None:
                    setattr(self, attr, val)

        for sectPr in elems:
            for pgSz in namespace.XPath('./w:pgSz')(sectPr):
                read(pgSz, (('width', 'w:w'), ('height', 'w:h')))
            for pgMar in namespace.XPath('./w:pgMar')(sectPr):
                read(pgMar, (('margin_left', 'w:left'), ('margin_right', 'w:right')))
class Style(object):
    '''
    Class representing a <w:style> element. Can contain block, character, etc. styles.
    '''

    def __init__(self, namespace, elem):
        self.namespace = namespace
        self.name_path = namespace.XPath('./w:name[@w:val]')
        self.based_on_path = namespace.XPath('./w:basedOn[@w:val]')
        self.resolved = False
        self.style_id = namespace.get(elem, 'w:styleId')
        self.style_type = stype = namespace.get(elem, 'w:type')

        # The last w:name and the first w:basedOn are the ones used
        names = self.name_path(elem)
        self.name = namespace.get(names[-1], 'w:val') if names else None
        parents = self.based_on_path(elem)
        self.based_on = namespace.get(parents[0], 'w:val') if parents else None
        if stype == 'numbering':
            self.based_on = None
        self.is_default = namespace.get(elem, 'w:default') in {'1', 'on', 'true'}
        self.paragraph_style = self.character_style = self.table_style = None

        if stype in {'paragraph', 'character', 'table'}:
            if stype == 'table':
                for tblPr in namespace.XPath('./w:tblPr')(elem):
                    self.table_style = self._merge(self.table_style, TableStyle(namespace, tblPr))
            if stype in {'paragraph', 'table'}:
                for pPr in namespace.XPath('./w:pPr')(elem):
                    self.paragraph_style = self._merge(self.paragraph_style, ParagraphStyle(namespace, pPr))
            for rPr in namespace.XPath('./w:rPr')(elem):
                self.character_style = self._merge(self.character_style, RunStyle(namespace, rPr))

        # This attribute only exists on numbering and paragraph styles
        if stype in {'numbering', 'paragraph'}:
            self.numbering_style_link = None
            for x in namespace.XPath('./w:pPr/w:numPr/w:numId[@w:val]')(elem):
                self.numbering_style_link = namespace.get(x, 'w:val')

    @staticmethod
    def _merge(current, new):
        ''' Return new if there is no current style, otherwise update current with new '''
        if current is None:
            return new
        current.update(new)
        return current

    def resolve_based_on(self, parent):
        '''
        Resolve the table, paragraph and character styles of this style
        against those of ``parent``. Styles that the parent has but this
        style lacks are created empty first.
        '''
        inherited = parent.table_style
        if inherited is not None:
            if self.table_style is None:
                self.table_style = TableStyle(self.namespace)
            self.table_style.resolve_based_on(inherited)

        inherited = parent.paragraph_style
        if inherited is not None:
            if self.paragraph_style is None:
                self.paragraph_style = ParagraphStyle(self.namespace)
            self.paragraph_style.resolve_based_on(inherited)

        inherited = parent.character_style
        if inherited is not None:
            if self.character_style is None:
                self.character_style = RunStyle(self.namespace)
            self.character_style.resolve_based_on(inherited)
class Styles(object):
'''
Collection of all styles defined in the document. Used to get the final styles applicable to elements in the document markup.
'''
def __init__(self, namespace, tables):
    ''' Create an empty collection of styles, filled in by calling it '''
    self.namespace = namespace
    self.tables = tables
    # style id -> style, in document order
    self.id_map = OrderedDict()
    # style type -> the default style of that type
    self.default_styles = {}
    # style id -> num id
    self.numbering_style_links = {}
    self.default_paragraph_style = self.default_character_style = None
    # Caches of resolved styles
    self.para_cache = {}
    self.para_char_cache = {}
    self.run_cache = {}
    self.classes = {}
    self.counter = Counter()
def __iter__(self):
    ''' Iterate over all the styles, in the order of the id map '''
    for style in itervalues(self.id_map):
        yield style
def __getitem__(self, key):
    ''' Return the style with the id ``key``, raising KeyError if missing '''
    style = self.id_map[key]
    return style
def __len__(self):
    ''' The number of styles that have an id '''
    count = len(self.id_map)
    return count
def get(self, key, default=None):
    ''' Return the style with the id ``key`` or ``default`` if there is none '''
    style = self.id_map.get(key, default)
    return style
def __call__(self, root, fonts, theme):
    '''
    Read all the styles and the document defaults from ``root`` (which may
    be None) and resolve the based-on chains between the styles.

    :param root: The root element of the styles part, or None
    :param fonts: Stored as ``self.fonts``
    :param theme: Stored as ``self.theme``
    '''
    self.fonts, self.theme = fonts, theme
    self.default_paragraph_style = self.default_character_style = None
    if root is not None:
        for s in self.namespace.XPath('//w:style')(root):
            s = Style(self.namespace, s)
            if s.style_id:
                self.id_map[s.style_id] = s
            if s.is_default:
                self.default_styles[s.style_type] = s
            # Only numbering and paragraph styles have this attribute
            if getattr(s, 'numbering_style_link', None) is not None:
                self.numbering_style_links[s.style_id] = s.numbering_style_link

        for dd in self.namespace.XPath('./w:docDefaults')(root):
            for pd in self.namespace.XPath('./w:pPrDefault')(dd):
                for pPr in self.namespace.XPath('./w:pPr')(pd):
                    ps = ParagraphStyle(self.namespace, pPr)
                    if self.default_paragraph_style is None:
                        self.default_paragraph_style = ps
                    else:
                        self.default_paragraph_style.update(ps)
            for pd in self.namespace.XPath('./w:rPrDefault')(dd):
                for pPr in self.namespace.XPath('./w:rPr')(pd):
                    ps = RunStyle(self.namespace, pPr)
                    if self.default_character_style is None:
                        self.default_character_style = ps
                    else:
                        self.default_character_style.update(ps)

    # Styles whose resolution has been started but not yet finished. Used to
    # break based-on chains that loop back on themselves (including a style
    # that is based on itself) instead of recursing without end.
    in_progress = set()

    def resolve(s, p):
        in_progress.add(s)
        if p is not None and p not in in_progress:
            # Parents have to be resolved before their children
            if not p.resolved:
                resolve(p, self.get(p.based_on))
            s.resolve_based_on(p)
        in_progress.discard(s)
        s.resolved = True

    for s in self:
        if not s.resolved:
            resolve(s, self.get(s.based_on))
def para_val(self, parent_styles, direct_formatting, attr):
val = getattr(direct_formatting, attr)
if val is inherit:
for ps in reversed(parent_styles):
pval = getattr(ps, attr)
if pval is not inherit:
val = pval
break
return val
def run_val(self, parent_styles, direct_formatting, attr):
val = getattr(direct_formatting, attr)
if val is not inherit:
return val
if attr in direct_formatting.toggle_properties:
# The spec (section 17.7.3) does not make sense, so we follow the behavior
# of Word, which seems to only consider the document default if the
# property has not been defined in any styles.
vals = [int(getattr(rs, attr)) for rs in parent_styles if rs is not self.default_character_style and getattr(rs, attr) is not inherit]
if vals:
return sum(vals) % 2 == 1
if self.default_character_style is not None:
return getattr(self.default_character_style, attr) is True
return False
for rs in reversed(parent_styles):
rval = getattr(rs, attr)
if rval is not inherit:
return rval
return val
def resolve_paragraph(self, p):
ans = self.para_cache.get(p, None)
if ans is None:
linked_style = None
ans = self.para_cache[p] = ParagraphStyle(self.namespace)
ans.style_name = None
direct_formatting = None
is_section_break = False
for pPr in self.namespace.XPath('./w:pPr')(p):
ps = ParagraphStyle(self.namespace, pPr)
if direct_formatting is None:
direct_formatting = ps
else:
direct_formatting.update(ps)
if self.namespace.XPath('./w:sectPr')(pPr):
is_section_break = True
if direct_formatting is None:
direct_formatting = ParagraphStyle(self.namespace)
parent_styles = []
if self.default_paragraph_style is not None:
parent_styles.append(self.default_paragraph_style)
ts = self.tables.para_style(p)
if ts is not None:
parent_styles.append(ts)
default_para = self.default_styles.get('paragraph', None)
if direct_formatting.linked_style is not None:
ls = linked_style = self.get(direct_formatting.linked_style)
if ls is not None:
ans.style_name = ls.name
ps = ls.paragraph_style
if ps is not None:
parent_styles.append(ps)
if ls.character_style is not None:
self.para_char_cache[p] = ls.character_style
elif default_para is not None:
if default_para.paragraph_style is not None:
parent_styles.append(default_para.paragraph_style)
if default_para.character_style is not None:
self.para_char_cache[p] = default_para.character_style
def has_numbering(block_style):
num_id, lvl = getattr(block_style, 'numbering_id', inherit), getattr(block_style, 'numbering_level', inherit)
return num_id is not None and num_id is not inherit and lvl is not None and lvl is not inherit
is_numbering = has_numbering(direct_formatting)
is_section_break = is_section_break and not self.namespace.XPath('./w:r')(p)
if is_numbering and not is_section_break:
num_id, lvl = direct_formatting.numbering_id, direct_formatting.numbering_level
p.set('calibre_num_id', '%s:%s' % (lvl, num_id))
ps = self.numbering.get_para_style(num_id, lvl)
if ps is not None:
parent_styles.append(ps)
if (
not is_numbering and not is_section_break and linked_style is not None and has_numbering(linked_style.paragraph_style)
):
num_id, lvl = linked_style.paragraph_style.numbering_id, linked_style.paragraph_style.numbering_level
p.set('calibre_num_id', '%s:%s' % (lvl, num_id))
is_numbering = True
ps = self.numbering.get_para_style(num_id, lvl)
if ps is not None:
parent_styles.append(ps)
for attr in ans.all_properties:
if not (is_numbering and attr == 'text_indent'): # skip text-indent for lists
setattr(ans, attr, self.para_val(parent_styles, direct_formatting, attr))
ans.linked_style = direct_formatting.linked_style
return ans
def resolve_run(self, r):
ans = self.run_cache.get(r, None)
if ans is None:
p = self.namespace.XPath('ancestor::w:p[1]')(r)
p = p[0] if p else None
ans = self.run_cache[r] = RunStyle(self.namespace)
direct_formatting = None
for rPr in self.namespace.XPath('./w:rPr')(r):
rs = RunStyle(self.namespace, rPr)
if direct_formatting is None:
direct_formatting = rs
else:
direct_formatting.update(rs)
if direct_formatting is None:
direct_formatting = RunStyle(self.namespace)
parent_styles = []
default_char = self.default_styles.get('character', None)
if self.default_character_style is not None:
parent_styles.append(self.default_character_style)
pstyle = self.para_char_cache.get(p, None)
if pstyle is not None:
parent_styles.append(pstyle)
# As best as I can understand the spec, table overrides should be
# applied before paragraph overrides, but word does it
# this way, see the December 2007 table header in the demo
# document.
ts = self.tables.run_style(p)
if ts is not None:
parent_styles.append(ts)
if direct_formatting.linked_style is not None:
ls = getattr(self.get(direct_formatting.linked_style), 'character_style', None)
if ls is not None:
parent_styles.append(ls)
elif default_char is not None and default_char.character_style is not None:
parent_styles.append(default_char.character_style)
for attr in ans.all_properties:
setattr(ans, attr, self.run_val(parent_styles, direct_formatting, attr))
if ans.font_family is not inherit:
ff = self.theme.resolve_font_family(ans.font_family)
ans.font_family = self.fonts.family_for(ff, ans.b, ans.i)
return ans
def resolve(self, obj):
if obj.tag.endswith('}p'):
return self.resolve_paragraph(obj)
if obj.tag.endswith('}r'):
return self.resolve_run(obj)
def cascade(self, layers):
self.body_font_family = 'serif'
self.body_font_size = '10pt'
self.body_color = 'black'
def promote_property(char_styles, block_style, prop):
vals = {getattr(s, prop) for s in char_styles}
if len(vals) == 1:
# All the character styles have the same value
for s in char_styles:
setattr(s, prop, inherit)
setattr(block_style, prop, next(iter(vals)))
for p, runs in iteritems(layers):
has_links = '1' in {r.get('is-link', None) for r in runs}
char_styles = [self.resolve_run(r) for r in runs]
block_style = self.resolve_paragraph(p)
for prop in ('font_family', 'font_size', 'cs_font_family', 'cs_font_size', 'color'):
if has_links and prop == 'color':
# We cannot promote color as browser rendering engines will
# override the link color setting it to blue, unless the
# color is specified on the link element itself
continue
promote_property(char_styles, block_style, prop)
for s in char_styles:
if s.text_decoration == 'none':
# The default text decoration is 'none'
s.text_decoration = inherit
def promote_most_common(block_styles, prop, default):
c = Counter()
for s in block_styles:
val = getattr(s, prop)
if val is not inherit:
c[val] += 1
val = None
if c:
val = c.most_common(1)[0][0]
for s in block_styles:
oval = getattr(s, prop)
if oval is inherit:
if default != val:
setattr(s, prop, default)
elif oval == val:
setattr(s, prop, inherit)
return val
block_styles = tuple(self.resolve_paragraph(p) for p in layers)
ff = promote_most_common(block_styles, 'font_family', self.body_font_family)
if ff is not None:
self.body_font_family = ff
fs = promote_most_common(block_styles, 'font_size', int(self.body_font_size[:2]))
if fs is not None:
self.body_font_size = '%.3gpt' % fs
color = promote_most_common(block_styles, 'color', self.body_color)
if color is not None:
self.body_color = color
def resolve_numbering(self, numbering):
# When a numPr element appears inside a paragraph style, the lvl info
# must be discarded and pStyle used instead.
self.numbering = numbering
for style in self:
ps = style.paragraph_style
if ps is not None and ps.numbering_id is not inherit:
lvl = numbering.get_pstyle(ps.numbering_id, style.style_id)
if lvl is None:
ps.numbering_id = ps.numbering_level = inherit
else:
ps.numbering_level = lvl
def apply_contextual_spacing(self, paras):
last_para = None
for p in paras:
if last_para is not None:
ls = self.resolve_paragraph(last_para)
ps = self.resolve_paragraph(p)
if ls.linked_style is not None and ls.linked_style == ps.linked_style:
if ls.contextualSpacing is True:
ls.margin_bottom = 0
if ps.contextualSpacing is True:
ps.margin_top = 0
last_para = p
def apply_section_page_breaks(self, paras):
for p in paras:
ps = self.resolve_paragraph(p)
ps.pageBreakBefore = True
def register(self, css, prefix):
h = hash(frozenset(iteritems(css)))
ans, _ = self.classes.get(h, (None, None))
if ans is None:
self.counter[prefix] += 1
ans = '%s_%d' % (prefix, self.counter[prefix])
self.classes[h] = (ans, css)
return ans
def generate_classes(self):
for bs in itervalues(self.para_cache):
css = bs.css
if css:
self.register(css, 'block')
for bs in itervalues(self.run_cache):
css = bs.css
if css:
self.register(css, 'text')
def class_name(self, css):
h = hash(frozenset(iteritems(css)))
return self.classes.get(h, (None, None))[0]
def generate_css(self, dest_dir, docx, notes_nopb, nosupsub):
ef = self.fonts.embed_fonts(dest_dir, docx)
s = '''\
body { font-family: %s; font-size: %s; color: %s }
/* In word all paragraphs have zero margins unless explicitly specified in a style */
p, h1, h2, h3, h4, h5, h6, div { margin: 0; padding: 0 }
/* In word headings only have bold font if explicitly specified,
similarly the font size is the body font size, unless explicitly set. */
h1, h2, h3, h4, h5, h6 { font-weight: normal; font-size: 1rem }
/* Setting padding-left to zero breaks rendering of lists, so we only set the other values to zero and leave padding-left for the user-agent */
ul, ol { margin: 0; padding-top: 0; padding-bottom: 0; padding-right: 0 }
/* The word hyperlink styling will set text-decoration to underline if needed */
a { text-decoration: none }
sup.noteref a { text-decoration: none }
h1.notes-header { page-break-before: always }
dl.footnote dt { font-size: large }
dl.footnote dt a { text-decoration: none }
'''
if not notes_nopb:
s += '''\
dl.footnote { page-break-after: always }
dl.footnote:last-of-type { page-break-after: avoid }
'''
s = s + '''\
span.tab { white-space: pre }
p.index-entry { text-indent: 0pt; }
p.index-entry a:visited { color: blue }
p.index-entry a:hover { color: red }
'''
if nosupsub:
s = s + '''\
sup { vertical-align: top }
sub { vertical-align: bottom }
'''
prefix = textwrap.dedent(s) % (self.body_font_family, self.body_font_size, self.body_color)
if ef:
prefix = ef + '\n' + prefix
ans = []
for (cls, css) in sorted(itervalues(self.classes), key=lambda x:x[0]):
b = ('\t%s: %s;' % (k, v) for k, v in iteritems(css))
b = '\n'.join(b)
ans.append('.%s {\n%s\n}\n' % (cls, b.rstrip(';')))
return prefix + '\n' + '\n'.join(ans)

View File

@@ -0,0 +1,700 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
from lxml.html.builder import TABLE, TR, TD
from calibre.ebooks.docx.block_styles import inherit, read_shd as rs, read_border, binary_property, border_props, ParagraphStyle, border_to_css
from calibre.ebooks.docx.char_styles import RunStyle
from polyglot.builtins import filter, iteritems, itervalues, range, unicode_type
# Read from XML {{{
# Expose the imported shading reader under the name read_shd: the style
# classes in this module look their readers up by name with
# globals()['read_%s' % x] and 'shd' is one of the names they request.
read_shd = rs
# The four sides of a table or cell, used when reading padding and borders
edges = ('left', 'top', 'right', 'bottom')
def _read_width(elem, get):
    '''
    Convert a width-like element (attributes w:w and w:type) to a CSS length.

    Returns '0' for type nil, 'auto' for type auto, a length in pt for dxa
    (twentieths of a point), a percentage for pct (fiftieths of a percent)
    and inherit for any other type.
    '''
    try:
        raw = int(get(elem, 'w:w'))
    except (TypeError, ValueError):
        # Missing or non-numeric value
        raw = 0
    kind = get(elem, 'w:type', 'auto')
    if kind == 'nil':
        return '0'
    if kind == 'auto':
        return 'auto'
    if kind == 'dxa':
        return '%.3gpt' % (raw/20)
    if kind == 'pct':
        return '%.3g%%' % (raw/50)
    return inherit
def read_width(parent, dest, XPath, get):
    'Set dest.width from the w:tblW child of parent (the last one wins), inherit if there is none'
    width = inherit
    for node in XPath('./w:tblW')(parent):
        width = _read_width(node, get)
    dest.width = width
def read_cell_width(parent, dest, XPath, get):
    'Set dest.width from the w:tcW child of parent (the last one wins), inherit if there is none'
    width = inherit
    for node in XPath('./w:tcW')(parent):
        width = _read_width(node, get)
    dest.width = width
def read_padding(parent, dest, XPath, get):
    '''
    Set dest.cell_padding_<edge> for each of the four edges. The margins are
    read from w:tblCellMar when parent is a tblPr element and from w:tcMar
    otherwise, edges that are not specified are set to inherit.
    '''
    container = 'tblCellMar' if parent.tag.endswith('}tblPr') else 'tcMar'
    padding = dict.fromkeys(edges, inherit)
    for margins in XPath('./w:%s' % container)(parent):
        for side in edges:
            for node in XPath('./w:%s' % side)(margins):
                padding[side] = _read_width(node, get)
    for side in edges:
        setattr(dest, 'cell_padding_%s' % side, padding[side])
def read_justification(parent, dest, XPath, get):
    '''
    Translate w:jc into automatic margins: set dest.margin_left and
    dest.margin_right to 'auto' on the side(s) that must absorb the free
    space, sides that are not affected are set to inherit.
    '''
    margin_left = margin_right = inherit
    for jc in XPath('./w:jc[@w:val]')(parent):
        alignment = get(jc, 'w:val')
        if not alignment:
            continue
        # Pushing the table to one side means the opposite margin is auto
        if alignment in ('left', 'center'):
            margin_right = 'auto'
        if alignment in ('right', 'center'):
            margin_left = 'auto'
    dest.margin_left = margin_left
    dest.margin_right = margin_right
def read_spacing(parent, dest, XPath, get):
    'Set dest.spacing from the w:tblCellSpacing child of parent, inherit if there is none'
    spacing = inherit
    for node in XPath('./w:tblCellSpacing')(parent):
        spacing = _read_width(node, get)
    dest.spacing = spacing
def read_float(parent, dest, XPath, get):
    '''
    Set dest.float to a dict of the attributes of the w:tblpPr child of
    parent, keyed by attribute name without the namespace, or to inherit if
    there is no such child.
    '''
    props = inherit
    for node in XPath('./w:tblpPr')(parent):
        props = {}
        for qname, value in iteritems(node.attrib):
            # '{namespace}name' -> 'name'
            props[qname.rpartition('}')[-1]] = value
    dest.float = props
def read_indent(parent, dest, XPath, get):
    'Set dest.indent from the w:tblInd child of parent, inherit if there is none'
    indent = inherit
    for node in XPath('./w:tblInd')(parent):
        indent = _read_width(node, get)
    dest.indent = indent
# Border positions handed to read_border() by read_borders(): the four outer
# edges plus the insideH/insideV borders between cells.
border_edges = ('left', 'top', 'right', 'bottom', 'insideH', 'insideV')
def read_borders(parent, dest, XPath, get):
    'Read the borders from w:tblBorders when parent is a tblPr element, from w:tcBorders otherwise'
    if parent.tag.endswith('}tblPr'):
        container = 'tblBorders'
    else:
        container = 'tcBorders'
    read_border(parent, dest, XPath, get, border_edges, container)
def read_height(parent, dest, XPath, get):
    '''
    Set dest.height to a (rule, value) tuple read from the w:trHeight child
    of parent, where rule is one of auto, atLeast or exact and value is the
    raw w:val attribute. Set to inherit if there is no usable w:trHeight.
    '''
    height = inherit
    for node in XPath('./w:trHeight')(parent):
        rule = get(node, 'w:hRule', 'auto')
        if rule not in {'auto', 'atLeast', 'exact'}:
            continue
        height = (rule, get(node, 'w:val'))
    dest.height = height
def read_vertical_align(parent, dest, XPath, get):
    '''
    Set dest.vertical_align to the CSS equivalent of w:vAlign: top and bottom
    map to themselves, anything else (including center) maps to middle. Set
    to inherit if there is no w:vAlign.
    '''
    align = inherit
    for node in XPath('./w:vAlign')(parent):
        requested = get(node, 'w:val')
        align = requested if requested in ('top', 'bottom') else 'middle'
    dest.vertical_align = align
def read_col_span(parent, dest, XPath, get):
    'Set dest.col_span to the integer value of w:gridSpan, inherit if missing or not a number'
    span = inherit
    for node in XPath('./w:gridSpan')(parent):
        try:
            span = int(get(node, 'w:val'))
        except (TypeError, ValueError):
            # Keep whatever was read so far
            pass
    dest.col_span = span
def read_merge(parent, dest, XPath, get):
    '''
    Set dest.hMerge and dest.vMerge from the elements of the same name. An
    element without a w:val attribute means 'continue', a missing element
    leaves the property as inherit.
    '''
    for direction in ('hMerge', 'vMerge'):
        state = inherit
        for node in XPath('./w:%s' % direction)(parent):
            state = get(node, 'w:val', 'continue')
        setattr(dest, direction, state)
def read_band_size(parent, dest, XPath, get):
    '''
    Set dest.col_band_size and dest.row_band_size from the
    w:tblStyleColBandSize and w:tblStyleRowBandSize children of parent.

    The band size defaults to 1. Values that are not positive integers are
    ignored, since the band size is later used as a divisor when working out
    which band a row or column belongs to.
    '''
    for x in ('Col', 'Row'):
        ans = 1
        for y in XPath('./w:tblStyle%sBandSize' % x)(parent):
            try:
                val = int(get(y, 'w:val'))
            except (TypeError, ValueError):
                continue
            if val > 0:
                ans = val
        setattr(dest, '%s_band_size' % x.lower(), ans)
def read_look(parent, dest, XPath, get):
    '''
    Set dest.look to the integer value of the hexadecimal w:val attribute of
    the w:tblLook child of parent, 0 if it is missing or cannot be parsed.
    '''
    look = 0
    for node in XPath('./w:tblLook')(parent):
        try:
            look = int(get(node, 'w:val'), 16)
        except (ValueError, TypeError):
            # Keep whatever was read so far
            pass
    dest.look = look
# }}}
def clone(style):
    '''
    Return a copy of style: a new object of the same class, created from
    style.namespace and then updated from style. Returns None if style is
    None or if its class cannot be instantiated from just a namespace.
    '''
    if style is not None:
        try:
            duplicate = type(style)(style.namespace)
        except TypeError:
            pass
        else:
            duplicate.update(style)
            return duplicate
    return None
class Style(object):

    '''
    Base class for the table, row and cell styles. Subclasses must define
    all_properties, the names of the attributes that make up the style.
    '''

    is_bidi = False

    def update(self, other):
        'Copy every property that is not inherit in other onto this style'
        for prop in self.all_properties:
            nval = getattr(other, prop)
            if nval is not inherit:
                setattr(self, prop, nval)

    def apply_bidi(self):
        'Mark this style as belonging to a right-to-left table'
        self.is_bidi = True

    def convert_spacing(self):
        'Return the CSS border-collapse/border-spacing properties for self.spacing'
        ans = {}
        if self.spacing is not inherit:
            if self.spacing in {'auto', '0'}:
                ans['border-collapse'] = 'collapse'
            else:
                ans['border-collapse'] = 'separate'
                ans['border-spacing'] = self.spacing
        return ans

    def convert_border(self):
        '''
        Return the CSS border and padding properties for the four edges. For
        right-to-left tables the left and right values are exchanged.
        '''
        c = {}
        for x in edges:
            border_to_css(x, self, c)
            val = getattr(self, 'padding_%s' % x)
            if val is not inherit:
                c['padding-%s' % x] = '%.3gpt' % val
        if self.is_bidi:
            for a in ('padding-%s', 'border-%s-style', 'border-%s-color', 'border-%s-width'):
                lkey, rkey = a % 'left', a % 'right'
                # Remove both values before writing them back exchanged, so
                # that a value present on only one side moves to the other
                # side instead of ending up on both.
                l, r = c.pop(lkey, None), c.pop(rkey, None)
                if l is not None:
                    c[rkey] = l
                if r is not None:
                    c[lkey] = r
        return c
class RowStyle(Style):

    'Style of a table row, read from a w:trPr element'

    all_properties = ('height', 'cantSplit', 'hidden', 'spacing',)

    def __init__(self, namespace, trPr=None):
        '''
        :param trPr: The row properties element to read, if None every
            property is set to inherit
        '''
        self.namespace = namespace
        self._css = None
        if trPr is not None:
            XPath, get = namespace.XPath, namespace.get
            for flag in ('hidden', 'cantSplit'):
                setattr(self, flag, binary_property(trPr, flag, XPath, get))
            read_spacing(trPr, self, XPath, get)
            read_height(trPr, self, XPath, get)
        else:
            for name in self.all_properties:
                setattr(self, name, inherit)

    @property
    def css(self):
        'The CSS properties for this row as a dict, computed on first access'
        if self._css is not None:
            return self._css
        c = self._css = {}
        if self.hidden is True:
            c['display'] = 'none'
        if self.cantSplit is True:
            c['page-break-inside'] = 'avoid'
        if self.height is not inherit:
            rule, val = self.height
            if rule != 'auto':
                key = 'min-height' if rule == 'atLeast' else 'height'
                try:
                    # The raw value is in twentieths of a point
                    c[key] = '%.3gpt' % (int(val)/20)
                except (ValueError, TypeError):
                    pass
        c.update(self.convert_spacing())
        return c
class CellStyle(Style):

    'Style of a table cell, read from a w:tcPr element'

    all_properties = ('background_color', 'cell_padding_left', 'cell_padding_right', 'cell_padding_top',
    'cell_padding_bottom', 'width', 'vertical_align', 'col_span', 'vMerge', 'hMerge', 'row_span',
    ) + tuple(k % edge for edge in border_edges for k in border_props)

    def __init__(self, namespace, tcPr=None):
        '''
        :param tcPr: The cell properties element to read, if None every
            property is set to inherit
        '''
        self.namespace = namespace
        self._css = None
        if tcPr is not None:
            XPath, get = namespace.XPath, namespace.get
            readers = (
                read_borders, read_shd, read_padding, read_cell_width,
                read_vertical_align, read_col_span, read_merge,
            )
            for reader in readers:
                reader(tcPr, self, XPath, get)
            # Not read from the XML, filled in when merged cells are handled
            self.row_span = inherit
        else:
            for name in self.all_properties:
                setattr(self, name, inherit)

    @property
    def css(self):
        'The CSS properties for this cell as a dict, computed on first access'
        if self._css is not None:
            return self._css
        self._css = c = {}
        if self.background_color is not inherit:
            c['background-color'] = self.background_color
        if self.width not in (inherit, 'auto'):
            c['width'] = self.width
        if self.vertical_align is inherit:
            c['vertical-align'] = 'top'
        else:
            c['vertical-align'] = self.vertical_align
        for side in edges:
            key = 'padding-%s' % side
            padding = getattr(self, 'cell_padding_%s' % side)
            if padding not in (inherit, 'auto'):
                c[key] = padding
            elif padding is inherit and side in {'left', 'right'}:
                # Default horizontal padding when none is specified
                c[key] = '%.3gpt' % (115/20)
        # In Word, tables are apparently rendered with some default top and
        # bottom padding irrespective of the cellMargin values. Simulate
        # that here.
        for side in ('top', 'bottom'):
            key = 'padding-%s' % side
            if c.get(key, '0pt') == '0pt':
                c[key] = '0.5ex'
        c.update(self.convert_border())
        return c
class TableStyle(Style):

    '''
    Style of a table, read from a w:tblPr element. When the tblPr belongs to
    a w:style element, the conditional formatting defined in the sibling
    w:tblStylePr elements is stored in self.overrides, keyed by its w:type.
    '''

    all_properties = (
        'width', 'float', 'cell_padding_left', 'cell_padding_right', 'cell_padding_top',
        'cell_padding_bottom', 'margin_left', 'margin_right', 'background_color',
        'spacing', 'indent', 'overrides', 'col_band_size', 'row_band_size', 'look', 'bidi',
    ) + tuple(k % edge for edge in border_edges for k in border_props)

    def __init__(self, namespace, tblPr=None):
        '''
        :param tblPr: The table properties element to read, if None every
            property is set to inherit
        '''
        self.namespace = namespace
        if tblPr is None:
            for p in self.all_properties:
                setattr(self, p, inherit)
        else:
            self.overrides = inherit
            self.bidi = binary_property(tblPr, 'bidiVisual', namespace.XPath, namespace.get)
            for x in ('width', 'float', 'padding', 'shd', 'justification', 'spacing', 'indent', 'borders', 'band_size', 'look'):
                f = globals()['read_%s' % x]
                f(tblPr, self, self.namespace.XPath, self.namespace.get)
            parent = tblPr.getparent()
            if self.namespace.is_tag(parent, 'w:style'):
                self.overrides = {}
                for tblStylePr in self.namespace.XPath('./w:tblStylePr[@w:type]')(parent):
                    otype = self.namespace.get(tblStylePr, 'w:type')
                    orides = self.overrides[otype] = {}
                    for tblPr in self.namespace.XPath('./w:tblPr')(tblStylePr):
                        orides['table'] = TableStyle(self.namespace, tblPr)
                    for trPr in self.namespace.XPath('./w:trPr')(tblStylePr):
                        orides['row'] = RowStyle(self.namespace, trPr)
                    for tcPr in self.namespace.XPath('./w:tcPr')(tblStylePr):
                        orides['cell'] = CellStyle(self.namespace, tcPr)
                    for pPr in self.namespace.XPath('./w:pPr')(tblStylePr):
                        orides['para'] = ParagraphStyle(self.namespace, pPr)
                    for rPr in self.namespace.XPath('./w:rPr')(tblStylePr):
                        orides['run'] = RunStyle(self.namespace, rPr)
        self._css = None

    def resolve_based_on(self, parent):
        'Take every property that is inherit in this style from parent'
        for p in self.all_properties:
            val = getattr(self, p)
            if val is inherit:
                setattr(self, p, getattr(parent, p))

    @property
    def css(self):
        '''
        The CSS properties for this table as a dict, computed on first access.
        For floating tables without w:tblpXSpec, self.page must have been set
        before the first access, as the page dimensions are needed.
        '''
        if self._css is None:
            c = self._css = {}
            if self.width not in (inherit, 'auto'):
                c['width'] = self.width
            for x in ('background_color', 'margin_left', 'margin_right'):
                val = getattr(self, x)
                if val is not inherit:
                    c[x.replace('_', '-')] = val
            if self.indent not in (inherit, 'auto') and self.margin_left != 'auto':
                c['margin-left'] = self.indent
            if self.float is not inherit:
                for x in ('left', 'top', 'right', 'bottom'):
                    val = self.float.get('%sFromText' % x, 0)
                    try:
                        val = '%.3gpt' % (int(val) / 20)
                    except (ValueError, TypeError):
                        val = '0'
                    c['margin-%s' % x] = val
                if 'tblpXSpec' in self.float:
                    c['float'] = 'right' if self.float['tblpXSpec'] in {'right', 'outside'} else 'left'
                else:
                    page = self.page
                    page_width = page.width - page.margin_left - page.margin_right
                    try:
                        x = int(self.float['tblpX']) / 20
                    except (KeyError, ValueError, TypeError):
                        x = 0
                    # On a degenerate page, where the margins use up the whole
                    # width, there is nothing to divide by: float left.
                    ratio = (x / page_width) if page_width else 0
                    c['float'] = 'left' if ratio < 0.65 else 'right'
            c.update(self.convert_spacing())
            if 'border-collapse' not in c:
                c['border-collapse'] = 'collapse'
            c.update(self.convert_border())
        return self._css
class Table(object):

    '''
    A single w:tbl element. Resolves the styles of the table, its rows, cells
    and paragraphs, handles merged cells and generates the HTML markup.
    Tables directly nested inside a cell are available in self.sub_tables.
    '''

    def __init__(self, namespace, tbl, styles, para_map, is_sub_table=False):
        '''
        :param tbl: The w:tbl element
        :param styles: The document styles, looked up by id with get()
        :param para_map: Mapping that is filled in with paragraph -> Table for
            every paragraph directly inside a cell of this table
        '''
        self.namespace = namespace
        self.tbl = tbl
        self.styles = styles
        self.is_sub_table = is_sub_table

        # Read Table Style
        style = {'table':TableStyle(self.namespace)}
        for tblPr in self.namespace.XPath('./w:tblPr')(tbl):
            for ts in self.namespace.XPath('./w:tblStyle[@w:val]')(tblPr):
                style_id = self.namespace.get(ts, 'w:val')
                s = styles.get(style_id)
                if s is not None:
                    if s.table_style is not None:
                        style['table'].update(s.table_style)
                    if s.paragraph_style is not None:
                        if 'paragraph' in style:
                            style['paragraph'].update(s.paragraph_style)
                        else:
                            style['paragraph'] = s.paragraph_style
                    if s.character_style is not None:
                        if 'run' in style:
                            style['run'].update(s.character_style)
                        else:
                            style['run'] = s.character_style
            style['table'].update(TableStyle(self.namespace, tblPr))
        self.table_style, self.paragraph_style = style['table'], style.get('paragraph', None)
        self.run_style = style.get('run', None)
        self.overrides = self.table_style.overrides
        if self.overrides is inherit:
            self.overrides = {}
        if 'wholeTable' in self.overrides and 'table' in self.overrides['wholeTable']:
            self.table_style.update(self.overrides['wholeTable']['table'])

        self.style_map = {}
        self.paragraphs = []
        self.cell_map = []

        rows = self.namespace.XPath('./w:tr')(tbl)
        for r, tr in enumerate(rows):
            overrides = self.get_overrides(r, None, len(rows), None)
            self.resolve_row_style(tr, overrides)
            cells = self.namespace.XPath('./w:tc')(tr)
            self.cell_map.append([])
            for c, tc in enumerate(cells):
                overrides = self.get_overrides(r, c, len(rows), len(cells))
                self.resolve_cell_style(tc, overrides, r, c, len(rows), len(cells))
                self.cell_map[-1].append(tc)
                for p in self.namespace.XPath('./w:p')(tc):
                    para_map[p] = self
                    self.paragraphs.append(p)
                    self.resolve_para_style(p, overrides)

        self.handle_merged_cells()
        self.sub_tables = {x:Table(namespace, x, styles, para_map, is_sub_table=True) for x in self.namespace.XPath('./w:tr/w:tc/w:tbl')(tbl)}

    @property
    def bidi(self):
        return self.table_style.bidi is True

    def override_allowed(self, name):
        'Check if the named override is allowed by the tblLook element'
        if name.endswith('Cell') or name == 'wholeTable':
            return True
        look = self.table_style.look
        if not isinstance(look, int):
            # No tblLook was read at all (the table has no tblPr), treat it
            # the same as a tblLook that could not be parsed.
            look = 0
        if (look & 0x0020 and name == 'firstRow') or (look & 0x0040 and name == 'lastRow') or \
           (look & 0x0080 and name == 'firstCol') or (look & 0x0100 and name == 'lastCol'):
            return True
        if name.startswith('band'):
            if name.endswith('Horz'):
                return not bool(look & 0x0200)
            if name.endswith('Vert'):
                return not bool(look & 0x0400)
        return False

    def get_overrides(self, r, c, num_of_rows, num_of_cols_in_row):
        '''
        Tuple of the names of the overrides that apply to the cell at row r
        and column c, in increasing order of priority. Pass None for c to get
        the overrides for the row as a whole.
        '''
        overrides = ['wholeTable']

        def band_number(index, size):
            try:
                return index // size
            except (TypeError, ZeroDivisionError):
                # The band size was never read or is zero, use the default
                # band size of one.
                return index

        if c is not None:
            odd_column_band = (band_number(c, self.table_style.col_band_size) % 2) == 1
            overrides.append('band%dVert' % (1 if odd_column_band else 2))
        odd_row_band = (band_number(r, self.table_style.row_band_size) % 2) == 1
        overrides.append('band%dHorz' % (1 if odd_row_band else 2))

        # According to the OOXML spec columns should have higher override
        # priority than rows, but Word seems to do it the other way around.
        if c is not None:
            if c == 0:
                overrides.append('firstCol')
            if c >= num_of_cols_in_row - 1:
                overrides.append('lastCol')
        if r == 0:
            overrides.append('firstRow')
        if r >= num_of_rows - 1:
            overrides.append('lastRow')
        if c is not None:
            if r == 0:
                if c == 0:
                    overrides.append('nwCell')
                if c == num_of_cols_in_row - 1:
                    overrides.append('neCell')
            if r == num_of_rows - 1:
                if c == 0:
                    overrides.append('swCell')
                if c == num_of_cols_in_row - 1:
                    overrides.append('seCell')
        return tuple(filter(self.override_allowed, overrides))

    def resolve_row_style(self, tr, overrides):
        'Compute the style of the row tr and store it in self.style_map'
        rs = RowStyle(self.namespace)
        for o in overrides:
            if o in self.overrides:
                ovr = self.overrides[o]
                ors = ovr.get('row', None)
                if ors is not None:
                    rs.update(ors)

        for trPr in self.namespace.XPath('./w:trPr')(tr):
            rs.update(RowStyle(self.namespace, trPr))
        if self.bidi:
            rs.apply_bidi()
        self.style_map[tr] = rs

    def resolve_cell_style(self, tc, overrides, row, col, rows, cols_in_row):
        '''
        Compute the style of the cell tc, located at (row, col) in a table of
        rows rows with cols_in_row cells in this row, and store it in
        self.style_map.
        '''
        cs = CellStyle(self.namespace)
        for o in overrides:
            if o in self.overrides:
                ovr = self.overrides[o]
                ors = ovr.get('cell', None)
                if ors is not None:
                    cs.update(ors)

        for tcPr in self.namespace.XPath('./w:tcPr')(tc):
            cs.update(CellStyle(self.namespace, tcPr))

        for x in edges:
            p = 'cell_padding_%s' % x
            val = getattr(cs, p)
            if val is inherit:
                setattr(cs, p, getattr(self.table_style, p))

            is_inside_edge = (
                (x == 'left' and col > 0) or
                (x == 'top' and row > 0) or
                (x == 'right' and col < cols_in_row - 1) or
                (x == 'bottom' and row < rows -1)
            )
            inside_edge = ('insideH' if x in {'top', 'bottom'} else 'insideV') if is_inside_edge else None
            for prop in border_props:
                if not prop.startswith('border'):
                    continue
                eprop = prop % x
                iprop = (prop % inside_edge) if inside_edge else None
                val = getattr(cs, eprop)
                if val is inherit and iprop is not None:
                    # Use the insideX borders if the main cell borders are not
                    # specified
                    val = getattr(cs, iprop)
                    if val is inherit:
                        val = getattr(self.table_style, iprop)
                if not is_inside_edge and val == 'none':
                    # Cell borders must override table borders even when the
                    # table border is not null and the cell border is null.
                    val = 'hidden'
                setattr(cs, eprop, val)

        if self.bidi:
            cs.apply_bidi()
        self.style_map[tc] = cs

    def resolve_para_style(self, p, overrides):
        '''
        Compute the table derived [paragraph style, run style] pair for the
        paragraph p and store it in self.style_map. Either entry can be None.
        '''
        text_styles = [clone(self.paragraph_style), clone(self.run_style)]

        for o in overrides:
            if o in self.overrides:
                ovr = self.overrides[o]
                for i, name in enumerate(('para', 'run')):
                    ops = ovr.get(name, None)
                    if ops is not None:
                        if text_styles[i] is None:
                            # Work on a copy: the override style is shared by
                            # all paragraphs of the table and a later override
                            # would otherwise be merged into it in place.
                            copy = clone(ops)
                            text_styles[i] = ops if copy is None else copy
                        else:
                            text_styles[i].update(ops)
        self.style_map[p] = text_styles

    def handle_merged_cells(self):
        '''
        Convert vMerge/hMerge runs into row_span/col_span on the first cell of
        each run and remove the other cells of the run from the tree.
        '''
        if not self.cell_map:
            return

        def detach(tc):
            # A cell that is part of both a vertical and a horizontal merge
            # has already been removed when it is seen for the second time.
            parent = tc.getparent()
            if parent is not None:
                parent.remove(tc)

        # Handle vMerge
        max_col_num = max(len(r) for r in self.cell_map)
        for c in range(max_col_num):
            cells = [row[c] if c < len(row) else None for row in self.cell_map]
            runs = [[]]
            for cell in cells:
                try:
                    s = self.style_map[cell]
                except KeyError:  # cell is None
                    s = CellStyle(self.namespace)
                if s.vMerge == 'restart':
                    runs.append([cell])
                elif s.vMerge == 'continue':
                    runs[-1].append(cell)
                else:
                    runs.append([])
            for run in runs:
                if len(run) > 1:
                    self.style_map[run[0]].row_span = len(run)
                    for tc in run[1:]:
                        detach(tc)

        # Handle hMerge
        for cells in self.cell_map:
            runs = [[]]
            for cell in cells:
                try:
                    s = self.style_map[cell]
                except KeyError:  # cell is None
                    s = CellStyle(self.namespace)
                if s.col_span is not inherit:
                    runs.append([])
                    continue
                if s.hMerge == 'restart':
                    runs.append([cell])
                elif s.hMerge == 'continue':
                    runs[-1].append(cell)
                else:
                    runs.append([])

            for run in runs:
                if len(run) > 1:
                    self.style_map[run[0]].col_span = len(run)
                    for tc in run[1:]:
                        detach(tc)

    def __iter__(self):
        'Iterate over all paragraphs in this table, including those in nested tables'
        for p in self.paragraphs:
            yield p
        for t in itervalues(self.sub_tables):
            for p in t:
                yield p

    def apply_markup(self, rmap, page, parent=None):
        '''
        Build the HTML table and move the already converted paragraphs into it.

        :param rmap: Mapping of docx element to the HTML element created for it
        :param page: Page properties, needed to compute the CSS of floating tables
        :param parent: The HTML element to append the table to. If None, the
            table is inserted before the HTML element of its first paragraph.
        '''
        table = TABLE('\n\t\t')
        if self.bidi:
            table.set('dir', 'rtl')
        self.table_style.page = page
        style_map = {}
        if parent is None:
            try:
                first_para = rmap[next(iter(self))]
            except StopIteration:
                return
            parent = first_para.getparent()
            idx = parent.index(first_para)
            parent.insert(idx, table)
        else:
            parent.append(table)
        for row in self.namespace.XPath('./w:tr')(self.tbl):
            tr = TR('\n\t\t\t')
            style_map[tr] = self.style_map[row]
            tr.tail = '\n\t\t'
            table.append(tr)
            for tc in self.namespace.XPath('./w:tc')(row):
                td = TD()
                style_map[td] = s = self.style_map[tc]
                if s.col_span is not inherit:
                    td.set('colspan', unicode_type(s.col_span))
                if s.row_span is not inherit:
                    td.set('rowspan', unicode_type(s.row_span))
                td.tail = '\n\t\t\t'
                tr.append(td)
                for x in self.namespace.XPath('./w:p|./w:tbl')(tc):
                    if x.tag.endswith('}p'):
                        td.append(rmap[x])
                    else:
                        self.sub_tables[x].apply_markup(rmap, page, parent=td)
            if len(tr):
                tr[-1].tail = '\n\t\t'
        if len(table):
            table[-1].tail = '\n\t'

        table_style = self.table_style.css
        if table_style:
            table.set('class', self.styles.register(table_style, 'table'))
        for elem, style in iteritems(style_map):
            css = style.css
            if css:
                elem.set('class', self.styles.register(css, elem.tag))
class Tables(object):

    'Collection of all the top level tables in the document'

    def __init__(self, namespace):
        self.tables = []
        self.para_map = {}
        self.sub_tables = set()
        self.namespace = namespace

    @staticmethod
    def _nested_tables(table):
        'Return the w:tbl elements of all tables nested inside table, at any depth'
        found = []
        pending = [table]
        while pending:
            current = pending.pop()
            for tbl in current.sub_tables:
                found.append(tbl)
                pending.append(current.sub_tables[tbl])
        return found

    def register(self, tbl, styles):
        '''
        Create a Table for the w:tbl element tbl, unless tbl is nested inside
        a table that has already been registered.
        '''
        if tbl in self.sub_tables:
            return
        table = Table(self.namespace, tbl, styles, self.para_map)
        self.tables.append(table)
        # Record the nested tables at every depth, not just the direct
        # children, otherwise a table nested two levels deep would be
        # registered a second time as a top level table.
        self.sub_tables.update(self._nested_tables(table))

    def apply_markup(self, object_map, page_map):
        '''
        Generate the HTML markup for all registered tables.

        :param object_map: Mapping of HTML element to the docx element it was created from
        :param page_map: Mapping of docx element to its page properties
        '''
        rmap = {v:k for k, v in iteritems(object_map)}
        for table in self.tables:
            table.apply_markup(rmap, page_map[table.tbl])

    def para_style(self, p):
        'The table derived paragraph style for the paragraph p, None if there is none'
        table = self.para_map.get(p, None)
        if table is not None:
            return table.style_map.get(p, (None, None))[0]

    def run_style(self, p):
        'The table derived run style for the paragraph p, None if there is none'
        table = self.para_map.get(p, None)
        if table is not None:
            return table.style_map.get(p, (None, None))[1]

View File

@@ -0,0 +1,29 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
class Theme(object):

    'The major and minor latin fonts defined by the theme of the document'

    def __init__(self, namespace):
        self.major_latin_font = 'Cambria'
        self.minor_latin_font = 'Calibri'
        self.namespace = namespace

    def __call__(self, root):
        'Read the latin typefaces from the a:fontScheme elements in root'
        targets = (
            ('./a:majorFont', 'major_latin_font'),
            ('./a:minorFont', 'minor_latin_font'),
        )
        for scheme in self.namespace.XPath('//a:fontScheme')(root):
            for expr, attr in targets:
                for font in self.namespace.XPath(expr)(scheme):
                    for latin in self.namespace.XPath('./a:latin[@typeface]')(font):
                        setattr(self, attr, latin.get('typeface'))

    def resolve_font_family(self, ff):
        '''
        Return the actual font for ff. Names starting with | are references to
        a theme font and resolve to the major or minor latin font, any other
        name is returned unchanged.
        '''
        if not ff.startswith('|'):
            return ff
        # Drop the first and last characters to get at the reference name
        if ff[1:-1].startswith('major'):
            return self.major_latin_font
        return self.minor_latin_font

View File

@@ -0,0 +1,839 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
import sys, os, re, math, errno, uuid, numbers
from collections import OrderedDict, defaultdict
from lxml import html
from lxml.html.builder import (
HTML, HEAD, TITLE, BODY, LINK, META, P, SPAN, BR, DIV, A, DT, DL, DD, H1)
from calibre import guess_type
from calibre.ebooks.docx.container import DOCX, fromstring
from calibre.ebooks.docx.names import XML, generate_anchor
from calibre.ebooks.docx.styles import Styles, inherit, PageProperties
from calibre.ebooks.docx.numbering import Numbering
from calibre.ebooks.docx.fonts import Fonts, is_symbol_font, map_symbol_text
from calibre.ebooks.docx.images import Images
from calibre.ebooks.docx.tables import Tables
from calibre.ebooks.docx.footnotes import Footnotes
from calibre.ebooks.docx.cleanup import cleanup_markup
from calibre.ebooks.docx.theme import Theme
from calibre.ebooks.docx.toc import create_toc
from calibre.ebooks.docx.fields import Fields
from calibre.ebooks.docx.settings import Settings
from calibre.ebooks.metadata.opf2 import OPFCreator
from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1
from polyglot.builtins import iteritems, itervalues, filter, getcwd, map, unicode_type
# The no-break space character, U+00A0
NBSP = '\xa0'
class Text:

    '''
    Accumulates text in self.buf for the attribute self.attr of the element
    self.elem, while keeping track of all the elements seen so far.
    '''

    def __init__(self, elem, attr, buf):
        self.elem = elem
        self.attr = attr
        self.buf = buf
        self.elems = [elem]

    def add_elem(self, elem):
        '''
        Write the buffered text to the current element, then make elem the
        current element, with further text going to its tail.
        '''
        self.elems.append(elem)
        pending = ''.join(self.buf)
        setattr(self.elem, self.attr, pending)
        self.elem = elem
        self.attr = 'tail'
        self.buf = []

    def __iter__(self):
        return iter(self.elems)
def html_lang(docx_lang):
    """Convert a DOCX language value into a code for the HTML lang attribute.

    :param docx_lang: Language value as found in the document.
    :return: The result of ``lang_as_iso639_1`` for the canonicalized
        language, or ``None`` when the language cannot be canonicalized, is
        the undetermined code ``'und'``, or has no ISO 639-1 form.
    """
    canonical = canonicalize_lang(docx_lang)
    if not canonical or canonical == 'und':
        return None
    return lang_as_iso639_1(canonical) or None
class Convert(object):
def __init__(self, path_or_stream, dest_dir=None, log=None, detect_cover=True, notes_text=None, notes_nopb=False, nosupsub=False):
    """Open the DOCX container and build the empty output HTML skeleton.

    :param path_or_stream: Passed straight to ``DOCX``.
    :param dest_dir: Output directory; defaults to the current directory.
    :param log: Logger handed to ``DOCX``; the converter then uses
        ``self.docx.log``.
    :param detect_cover: Forwarded to ``cleanup_markup`` during conversion.
    :param notes_text: Heading used for the notes section; defaults to the
        translated string ``Notes``.
    :param notes_nopb: Forwarded to the stylesheet generator.
    :param nosupsub: Forwarded to the stylesheet generator.
    """
    docx = self.docx = DOCX(path_or_stream, log=log)
    ns = self.namespace = docx.namespace
    # ms_pat: two or more whitespace characters; ws_pat: newline, CR or tab
    self.ms_pat = re.compile(r'\s{2,}')
    self.ws_pat = re.compile(r'[\n\r\t]')
    self.log = docx.log
    self.detect_cover = detect_cover
    self.notes_text = notes_text or _('Notes')
    self.notes_nopb = notes_nopb
    self.nosupsub = nosupsub
    self.dest_dir = dest_dir or getcwd()
    self.mi = docx.metadata

    self.body = BODY()
    self.theme = Theme(ns)
    self.settings = Settings(ns)
    self.tables = Tables(ns)
    self.fields = Fields(ns)
    # Styles needs the table registry, so it is created after Tables
    self.styles = Styles(ns, self.tables)
    self.images = Images(ns, self.log)
    # Maps every generated HTML element to the DOCX element it came from
    self.object_map = OrderedDict()

    head = HEAD(
        META(charset='utf-8'),
        TITLE(self.mi.title or _('Unknown')),
        LINK(rel='stylesheet', type='text/css', href='docx.css'),
    )
    root = self.html = HTML(head, self.body)

    # Whitespace only: indent the <head> children with tabs in the output
    root.text = '\n\t'
    head.text = '\n\t\t'
    head.tail = '\n'
    for node in head:
        node.tail = '\n\t\t'
    head[-1].tail = '\n\t'
    self.body.text = self.body.tail = '\n'

    lang = html_lang(self.mi.language)
    if lang:
        root.set('lang', lang)
    self.doc_lang = lang if lang else None
def __call__(self):
    """Run the whole DOCX to HTML conversion.

    Converts the main document paragraphs, then any footnotes/endnotes,
    resolves links, styles, tables, numbering and frames, assigns CSS
    classes and finally writes the output files.

    :return: Whatever ``self.write(doc)`` returns.
    """
    doc = self.docx.document
    relationships_by_id, relationships_by_type = self.docx.document_relationships
    self.resolve_alternate_content(doc)
    self.fields(doc, self.log)
    self.read_styles(relationships_by_type)
    self.images(relationships_by_id)
    # Per-conversion state, reset on every call
    self.layers = OrderedDict()
    self.framed = [[]]
    self.frame_map = {}
    self.framed_map = {}
    self.anchor_map = {}
    self.link_map = defaultdict(list)
    self.link_source_map = {}
    self.toc_anchor = None
    self.block_runs = []
    paras = []

    self.log.debug('Converting Word markup to HTML')

    self.read_page_properties(doc)
    self.current_rels = relationships_by_id
    # page_map also contains tables; only paragraphs are converted here
    for wp, page_properties in iteritems(self.page_map):
        self.current_page = page_properties
        if wp.tag.endswith('}p'):
            p = self.convert_p(wp)
            self.body.append(p)
            paras.append(wp)

    self.read_block_anchors(doc)
    self.styles.apply_contextual_spacing(paras)
    self.mark_block_runs(paras)
    # Apply page breaks at the start of every section, except the first
    # section (since that will be the start of the file)
    self.styles.apply_section_page_breaks(self.section_starts[1:])

    notes_header = None
    # Notes use their own relationship maps; remember the document's map so
    # it can be restored once the notes have been converted
    orig_rid_map = self.images.rid_map
    if self.footnotes.has_notes:
        self.body.append(H1(self.notes_text))
        notes_header = self.body[-1]
        notes_header.set('class', 'notes-header')
        for anchor, text, note in self.footnotes:
            # Each note becomes a <dl> holding a <dt> back-link and a <dd>
            # with the note's paragraphs
            dl = DL(id=anchor)
            dl.set('class', 'footnote')
            self.body.append(dl)
            dl.append(DT('[', A('' + text, href='#back_%s' % anchor, title=text)))
            dl[-1][0].tail = ']'
            dl.append(DD())
            paras = []
            self.images.rid_map = self.current_rels = note.rels[0]
            for wp in note:
                if wp.tag.endswith('}tbl'):
                    self.tables.register(wp, self.styles)
                    self.page_map[wp] = self.current_page
                else:
                    p = self.convert_p(wp)
                    dl[-1].append(p)
                    paras.append(wp)
            self.styles.apply_contextual_spacing(paras)
            self.mark_block_runs(paras)

    for p, wp in iteritems(self.object_map):
        if len(p) > 0 and not p.text and len(p[0]) > 0 and not p[0].text and p[0][0].get('class', None) == 'tab':
            # Paragraph uses tabs for indentation, convert to text-indent
            parent = p[0]
            tabs = []
            # Collect the leading tab spans, stopping at the first tab that
            # is followed by text or at the first non-tab child
            for child in parent:
                if child.get('class', None) == 'tab':
                    tabs.append(child)
                    if child.tail:
                        break
                else:
                    break
            indent = len(tabs) * self.settings.default_tab_stop
            style = self.styles.resolve(wp)
            # Only rewrite when the existing text-indent is unset or in pt
            if style.text_indent is inherit or (hasattr(style.text_indent, 'endswith') and style.text_indent.endswith('pt')):
                if style.text_indent is not inherit:
                    indent = float(style.text_indent[:-2]) + indent
                style.text_indent = '%.3gpt' % indent
                parent.text = tabs[-1].tail or ''
                list(map(parent.remove, tabs))

    self.images.rid_map = orig_rid_map

    self.resolve_links()

    self.styles.cascade(self.layers)

    self.tables.apply_markup(self.object_map, self.page_map)

    numbered = []
    for html_obj, obj in iteritems(self.object_map):
        # calibre_num_id has the form "<level>:<numbering id>"
        raw = obj.get('calibre_num_id', None)
        if raw is not None:
            lvl, num_id = raw.partition(':')[0::2]
            try:
                lvl = int(lvl)
            except (TypeError, ValueError):
                lvl = 0
            numbered.append((html_obj, num_id, lvl))
    self.numbering.apply_markup(numbered, self.body, self.styles, self.object_map, self.images)
    self.apply_frames()

    # Whitespace only: put every top level body child on its own line
    if len(self.body) > 0:
        self.body.text = '\n\t'
        for child in self.body:
            child.tail = '\n\t'
        self.body[-1].tail = '\n'

    self.log.debug('Converting styles to CSS')
    self.styles.generate_classes()
    for html_obj, obj in iteritems(self.object_map):
        style = self.styles.resolve(obj)
        if style is not None:
            css = style.css
            if css:
                cls = self.styles.class_name(css)
                if cls:
                    html_obj.set('class', cls)
    for html_obj, css in iteritems(self.framed_map):
        cls = self.styles.class_name(css)
        if cls:
            html_obj.set('class', cls)

    if notes_header is not None:
        # Make the notes heading look like the first h1/h2/h3 found among
        # the body's children: same tag, and that heading's class prepended
        for h in self.namespace.children(self.body, 'h1', 'h2', 'h3'):
            notes_header.tag = h.tag
            cls = h.get('class', None)
            if cls and cls != 'notes-header':
                notes_header.set('class', '%s notes-header' % cls)
            break

    self.fields.polish_markup(self.object_map)

    self.log.debug('Cleaning up redundant markup generated by Word')
    self.cover_image = cleanup_markup(self.log, self.html, self.styles, self.dest_dir, self.detect_cover, self.namespace.XPath)

    return self.write(doc)
def read_page_properties(self, doc):
    """Assign page properties to every paragraph and table in *doc*.

    Populates ``self.page_map`` (block element -> ``PageProperties``) and
    ``self.section_starts`` (first block of each section). A paragraph that
    contains a ``w:sectPr`` closes the section made up of itself and all
    blocks collected since the previous section end. Blocks left over at
    the end use the ``w:sectPr`` found directly under ``w:body``. Tables are
    also registered with ``self.tables`` as they are encountered.
    """
    ns = self.namespace
    pending = []
    self.page_map = OrderedDict()
    self.section_starts = []

    for block in ns.descendants(doc, 'w:p', 'w:tbl'):
        if block.tag.endswith('}tbl'):
            self.tables.register(block, self.styles)
            pending.append(block)
            continue
        section = tuple(ns.descendants(block, 'w:sectPr'))
        pending.append(block)
        if not section:
            continue
        # This paragraph ends a section: everything pending belongs to it
        props = PageProperties(ns, section)
        for member in pending:
            self.page_map[member] = props
        self.section_starts.append(pending[0])
        pending = []

    if not pending:
        return
    # Trailing blocks belong to the final, body level section
    self.section_starts.append(pending[0])
    body_section = ns.XPath('./w:body/w:sectPr')(doc)
    props = PageProperties(ns, body_section)
    for member in pending:
        self.page_map[member] = props
def resolve_alternate_content(self, doc):
    """Keep only the fallback form of ``mc:AlternateContent`` elements.

    Word stores proprietary extensions as ``mc:Choice`` children next to a
    spec compliant ``mc:Fallback``. Whenever a fallback exists, all the
    choices are removed so later processing only sees the fallback.
    See https://wiki.openoffice.org/wiki/OOXML/Markup_Compatibility_and_Extensibility
    """
    ns = self.namespace
    for alt in ns.descendants(doc, 'mc:AlternateContent'):
        proprietary = ns.XPath('./mc:Choice')(alt)
        has_fallback = ns.XPath('./mc:Fallback')(alt)
        if not has_fallback:
            # No fallback to rely on, leave the choices in place
            continue
        for choice in proprietary:
            alt.remove(choice)
def read_styles(self, relationships_by_type):
    """Load the auxiliary parts of the document (styles, fonts, notes, ...).

    :param relationships_by_type: Mapping of relationship type to part name
        for the main document.

    Creates ``self.numbering``, ``self.footnotes`` and ``self.fonts`` and
    feeds them, together with ``self.settings``, ``self.theme`` and
    ``self.styles``, from the corresponding parts. A missing part is logged
    as a warning and skipped; ``self.styles`` is always initialised, with
    ``None`` when no styles part could be read.
    """

    def get_name(rtype, defname):
        """Return the part name for *rtype*, or ``None`` if there is none.

        When the document declares no relationship of this type, the
        conventional file *defname* next to the main document is used if it
        exists in the container.
        """
        name = relationships_by_type.get(rtype, None)
        if name is None:
            cname = self.docx.document_name.split('/')
            cname[-1] = defname
            candidate = '/'.join(cname)
            if self.docx.exists(candidate):
                # Previously this assigned ``name`` to itself, so the
                # default part was never actually picked up.
                name = candidate
        if name and name.startswith('word/word') and not self.docx.exists(name):
            # Tolerate a doubled leading directory in the relationship
            name = name.partition('/')[2]
        return name

    names = self.namespace.names
    nname = get_name(names['NUMBERING'], 'numbering.xml')
    sname = get_name(names['STYLES'], 'styles.xml')
    sename = get_name(names['SETTINGS'], 'settings.xml')
    fname = get_name(names['FONTS'], 'fontTable.xml')
    tname = get_name(names['THEMES'], 'theme1.xml')
    foname = get_name(names['FOOTNOTES'], 'footnotes.xml')
    enname = get_name(names['ENDNOTES'], 'endnotes.xml')
    numbering = self.numbering = Numbering(self.namespace)
    footnotes = self.footnotes = Footnotes(self.namespace)
    fonts = self.fonts = Fonts(self.namespace)

    foraw = enraw = None
    forel, enrel = ({}, {}), ({}, {})
    if sename is not None:
        try:
            seraw = self.docx.read(sename)
        except KeyError:
            self.log.warn('Settings %s do not exist' % sename)
        except EnvironmentError as e:
            if e.errno != errno.ENOENT:
                raise
            self.log.warn('Settings %s file missing' % sename)
        else:
            self.settings(fromstring(seraw))

    if foname is not None:
        try:
            foraw = self.docx.read(foname)
        except KeyError:
            self.log.warn('Footnotes %s do not exist' % foname)
        else:
            forel = self.docx.get_relationships(foname)
    if enname is not None:
        try:
            enraw = self.docx.read(enname)
        except KeyError:
            self.log.warn('Endnotes %s do not exist' % enname)
        else:
            enrel = self.docx.get_relationships(enname)
    footnotes(fromstring(foraw) if foraw else None, forel, fromstring(enraw) if enraw else None, enrel)

    if fname is not None:
        embed_relationships = self.docx.get_relationships(fname)[0]
        try:
            raw = self.docx.read(fname)
        except KeyError:
            self.log.warn('Fonts table %s does not exist' % fname)
        else:
            fonts(fromstring(raw), embed_relationships, self.docx, self.dest_dir)

    if tname is not None:
        try:
            raw = self.docx.read(tname)
        except KeyError:
            # The message used to be a copy of the styles warning and
            # reported the styles part name instead of the theme's.
            self.log.warn('Theme %s does not exist' % tname)
        else:
            self.theme(fromstring(raw))

    # The theme must be loaded before the styles, which consume it
    styles_loaded = False
    if sname is not None:
        try:
            raw = self.docx.read(sname)
        except KeyError:
            self.log.warn('Styles %s do not exist' % sname)
        else:
            self.styles(fromstring(raw), fonts, self.theme)
            styles_loaded = True
    if not styles_loaded:
        self.styles(None, fonts, self.theme)

    if nname is not None:
        try:
            raw = self.docx.read(nname)
        except KeyError:
            self.log.warn('Numbering styles %s do not exist' % nname)
        else:
            numbering(fromstring(raw), self.styles, self.docx.get_relationships(nname)[0])

    self.styles.resolve_numbering(numbering)
def write(self, doc):
    """Write index.html, docx.css, metadata.opf and toc.ncx to dest_dir.

    :param doc: The main document tree, used to build the Table of Contents.
    :return: Path of the generated ``metadata.opf``.

    ``docx.css`` is only written when the style sheet is non-empty and
    ``toc.ncx`` is deleted again if it ends up empty.
    """
    out_dir = self.dest_dir
    toc = create_toc(doc, self.body, self.resolved_link_map, self.styles, self.object_map, self.log, self.namespace)

    markup = html.tostring(self.html, encoding='utf-8', doctype='<!DOCTYPE html>')
    with lopen(os.path.join(out_dir, 'index.html'), 'wb') as html_file:
        html_file.write(markup)

    stylesheet = self.styles.generate_css(out_dir, self.docx, self.notes_nopb, self.nosupsub)
    if stylesheet:
        with lopen(os.path.join(out_dir, 'docx.css'), 'wb') as css_file:
            css_file.write(stylesheet.encode('utf-8'))

    opf = OPFCreator(out_dir, self.mi)
    opf.toc = toc
    opf.create_manifest_from_files_in([out_dir])
    # Re-label HTML manifest entries with the media type guessed for .xhtml
    for entry in opf.manifest:
        if entry.media_type == 'text/html':
            entry.media_type = guess_type('a.xhtml')[0]
    opf.create_spine(['index.html'])
    if self.cover_image is not None:
        opf.guide.set_cover(self.cover_image)

    def process_guide(E, guide):
        # Point the guide at the inline Table of Contents, when one exists
        if self.toc_anchor is None:
            return
        guide.append(E.reference(
            href='index.html#' + self.toc_anchor, title=_('Table of Contents'), type='toc'))

    ncx_path = os.path.join(out_dir, 'toc.ncx')
    opf_path = os.path.join(out_dir, 'metadata.opf')
    with lopen(opf_path, 'wb') as of, open(ncx_path, 'wb') as ncx:
        opf.render(of, ncx, 'toc.ncx', process_guide=process_guide)
    if os.path.getsize(ncx_path) == 0:
        os.remove(ncx_path)
    return os.path.join(out_dir, 'metadata.opf')
def read_block_anchors(self, doc):
    """Attach bookmarks that sit directly under ``w:body`` to paragraphs.

    Such bookmarks are not inside any paragraph, so each one is mapped to
    the next converted paragraph that follows it in document order. The
    paragraph receives a generated ``id`` if it has none, and every pending
    bookmark name is pointed at that id in ``self.anchor_map``.
    """
    ns = self.namespace
    body_bookmarks = frozenset(ns.XPath('./w:body/w:bookmarkStart[@w:name]')(doc))
    if not body_bookmarks:
        return

    pending_names = set()
    # DOCX paragraph -> generated HTML element
    html_for = {v: k for k, v in iteritems(self.object_map)}
    for node in ns.descendants(doc, 'w:p', 'w:bookmarkStart[@w:name]'):
        if not node.tag.endswith('}p'):
            # A bookmark: only body level ones are of interest here
            if node in body_bookmarks:
                name = ns.get(node, 'w:name')
                if name:
                    pending_names.add(name)
            continue
        if not pending_names or node not in html_for:
            continue
        para = html_for[node]
        if 'id' not in para.attrib:
            para.set('id', generate_anchor(next(iter(pending_names)), frozenset(itervalues(self.anchor_map))))
        for name in pending_names:
            self.anchor_map[name] = para.get('id')
        pending_names = set()
def convert_p(self, p):
    """Convert one ``w:p`` element into an HTML paragraph element.

    :param p: The DOCX paragraph element.
    :return: The new HTML element (a ``<p>``, or ``<h1>``-``<h6>`` when the
        paragraph style is named ``heading N``).

    Besides building the element, this records the paragraph in
    ``object_map``, ``layers`` and ``frame_map``, collects hyperlink spans
    in ``link_map``/``link_source_map`` and registers bookmark and TOC
    anchors in ``anchor_map``.
    """
    dest = P()
    self.object_map[dest] = p
    style = self.styles.resolve_paragraph(p)
    self.layers[p] = []
    self.frame_map[p] = style.frame
    self.add_frame(dest, style.frame)

    # Anchor waiting to be attached to the next run (or to dest itself)
    current_anchor = None
    current_hyperlink = None
    hl_xpath = self.namespace.XPath('ancestor::w:hyperlink[1]')

    def p_parent(x):
        # Ensure that nested <w:p> tags are handled. These can occur if a
        # textbox is present inside a paragraph.
        # Returns the nearest ancestor whose tag ends with '}p', or None
        # when the top of the tree is reached without finding one.
        while True:
            x = x.getparent()
            try:
                if x.tag.endswith('}p'):
                    return x
            except AttributeError:
                break

    for x in self.namespace.descendants(p, 'w:r', 'w:bookmarkStart', 'w:hyperlink', 'w:instrText'):
        # Skip anything that belongs to a nested paragraph
        if p_parent(x) is not p:
            continue
        if x.tag.endswith('}r'):
            span = self.convert_run(x)
            if current_anchor is not None:
                # The first run's anchor goes on the paragraph itself
                (dest if len(dest) == 0 else span).set('id', current_anchor)
                current_anchor = None
            if current_hyperlink is not None:
                try:
                    hl = hl_xpath(x)[0]
                    self.link_map[hl].append(span)
                    self.link_source_map[hl] = self.current_rels
                    x.set('is-link', '1')
                except IndexError:
                    # This run has no hyperlink ancestor, so the hyperlink
                    # seen earlier has ended
                    current_hyperlink = None
            dest.append(span)
            self.layers[p].append(x)
        elif x.tag.endswith('}bookmarkStart'):
            anchor = self.namespace.get(x, 'w:name')
            if anchor and anchor not in self.anchor_map and anchor != '_GoBack':
                # _GoBack is a special bookmark inserted by Word 2010 for
                # the return to previous edit feature, we ignore it
                old_anchor = current_anchor
                self.anchor_map[anchor] = current_anchor = generate_anchor(anchor, frozenset(itervalues(self.anchor_map)))
                if old_anchor is not None:
                    # The previous anchor was not applied to any element
                    for a, t in tuple(iteritems(self.anchor_map)):
                        if t == old_anchor:
                            self.anchor_map[a] = current_anchor
        elif x.tag.endswith('}hyperlink'):
            current_hyperlink = x
        elif x.tag.endswith('}instrText') and x.text and x.text.strip().startswith('TOC '):
            # A TOC field: register an anchor for it under a random key and
            # remember it as the document's Table of Contents location
            old_anchor = current_anchor
            anchor = unicode_type(uuid.uuid4())
            self.anchor_map[anchor] = current_anchor = generate_anchor('toc', frozenset(itervalues(self.anchor_map)))
            self.toc_anchor = current_anchor
            if old_anchor is not None:
                # The previous anchor was not applied to any element
                for a, t in tuple(iteritems(self.anchor_map)):
                    if t == old_anchor:
                        self.anchor_map[a] = current_anchor
    if current_anchor is not None:
        # This paragraph had no <w:r> descendants
        dest.set('id', current_anchor)
        current_anchor = None

    m = re.match(r'heading\s+(\d+)$', style.style_name or '', re.IGNORECASE)
    if m is not None:
        # Clamp the heading level to the 1-6 range supported by HTML
        n = min(6, max(1, int(m.group(1))))
        dest.tag = 'h%d' % n
        dest.set('data-heading-level', unicode_type(n))

    if style.bidi is True:
        dest.set('dir', 'rtl')

    # Group adjacent runs that share the same border so that each group can
    # be wrapped in a single bordered <span> below.
    # NOTE(review): a run whose border differs from the current group closes
    # that group but is not itself placed in a new group, and a group that
    # is still open when the loop ends is never added to common_borders.
    # Confirm whether this is intended.
    border_runs = []
    common_borders = []
    for span in dest:
        run = self.object_map[span]
        # NOTE: from here on ``style`` is a run style, not the paragraph's
        style = self.styles.resolve_run(run)
        if not border_runs or border_runs[-1][1].same_border(style):
            border_runs.append((span, style))
        elif border_runs:
            if len(border_runs) > 1:
                common_borders.append(border_runs)
            border_runs = []

    for border_run in common_borders:
        spans = []
        bs = {}
        # Move the border CSS from the individual runs into ``bs``
        for span, style in border_run:
            style.get_border_css(bs)
            style.clear_border_css()
            spans.append(span)
        if bs:
            cls = self.styles.register(bs, 'text_border')
            wrapper = self.wrap_elems(spans, SPAN())
            wrapper.set('class', cls)

    # When dest has no children the loops above never ran, so ``style`` is
    # still the paragraph style here.
    if not dest.text and len(dest) == 0 and not style.has_visible_border():
        # Empty paragraph add a non-breaking space so that it is rendered
        # by WebKit
        dest.text = NBSP

    # If the last element in a block is a <br> the <br> is not rendered in
    # HTML, unless it is followed by a trailing space. Word, on the other
    # hand inserts a blank line for trailing <br>s.
    if len(dest) > 0 and not dest[-1].tail:
        if dest[-1].tag == 'br':
            dest[-1].tail = NBSP
        elif len(dest[-1]) > 0 and dest[-1][-1].tag == 'br' and not dest[-1][-1].tail:
            dest[-1][-1].tail = NBSP

    return dest
def wrap_elems(self, elems, wrapper):
    """Move *elems* into *wrapper*, placing it where the first one was.

    :param elems: Non-empty sequence of elements to wrap.
    :param wrapper: Element that becomes the new parent of *elems*.
    :return: *wrapper*, which also takes over the tail text of the last
        wrapped element.
    """
    first, last = elems[0], elems[-1]
    container = first.getparent()
    container.insert(container.index(first), wrapper)
    wrapper.tail, last.tail = last.tail, None
    for node in elems:
        try:
            container.remove(node)
        except ValueError:
            # Probably a hyperlink that spans multiple paragraphs. In
            # theory it should be split into one hyperlink per paragraph;
            # instead the element is simply detached from wherever it is.
            node.getparent().remove(node)
        wrapper.append(node)
    return wrapper
def resolve_links(self):
    """Turn the collected hyperlinks into ``<a>`` elements.

    Handles three sources in turn: ``w:hyperlink`` elements gathered in
    ``self.link_map``, hyperlink fields from ``self.fields`` and linked
    images from ``self.images``. Also fills ``self.resolved_link_map``
    (``w:hyperlink`` element -> resulting ``<a>`` element).
    """
    self.resolved_link_map = {}
    for hyperlink, spans in iteritems(self.link_map):
        relationships_by_id = self.link_source_map[hyperlink]
        span = spans[0]
        # Several runs in one hyperlink are wrapped so there is one <a>
        if len(spans) > 1:
            span = self.wrap_elems(spans, SPAN())
        span.tag = 'a'
        self.resolved_link_map[hyperlink] = span
        tgt = self.namespace.get(hyperlink, 'w:tgtFrame')
        if tgt:
            span.set('target', tgt)
        tt = self.namespace.get(hyperlink, 'w:tooltip')
        if tt:
            span.set('title', tt)
        # A relationship id takes precedence over an anchor
        rid = self.namespace.get(hyperlink, 'r:id')
        if rid and rid in relationships_by_id:
            span.set('href', relationships_by_id[rid])
            continue
        anchor = self.namespace.get(hyperlink, 'w:anchor')
        if anchor and anchor in self.anchor_map:
            span.set('href', '#' + self.anchor_map[anchor])
            continue
        self.log.warn('Hyperlink with unknown target (rid=%s, anchor=%s), ignoring' %
                      (rid, anchor))
        # hrefs that point nowhere give epubcheck a hernia. The element
        # should be styled explicitly by Word anyway.
        # span.set('href', '#')
    # DOCX element -> generated HTML element
    rmap = {v:k for k, v in iteritems(self.object_map)}
    for hyperlink, runs in self.fields.hyperlink_fields:
        # Only runs that were actually converted can become part of a link
        spans = [rmap[r] for r in runs if r in rmap]
        if not spans:
            continue
        span = spans[0]
        if len(spans) > 1:
            span = self.wrap_elems(spans, SPAN())
        span.tag = 'a'
        tgt = hyperlink.get('target', None)
        if tgt:
            span.set('target', tgt)
        tt = hyperlink.get('title', None)
        if tt:
            span.set('title', tt)
        url = hyperlink.get('url', None)
        if url is None:
            anchor = hyperlink.get('anchor', None)
            if anchor in self.anchor_map:
                span.set('href', '#' + self.anchor_map[anchor])
                continue
            self.log.warn('Hyperlink field with unknown anchor: %s' % anchor)
        else:
            # A url that matches a known bookmark name is an internal link
            if url in self.anchor_map:
                span.set('href', '#' + self.anchor_map[url])
                continue
            span.set('href', url)

    for img, link, relationships_by_id in self.images.links:
        # Wrap the image in an <a> that takes the image's place and tail
        parent = img.getparent()
        idx = parent.index(img)
        a = A(img)
        a.tail, img.tail = img.tail, None
        parent.insert(idx, a)
        tgt = link.get('target', None)
        if tgt:
            a.set('target', tgt)
        tt = link.get('title', None)
        if tt:
            a.set('title', tt)
        rid = link['id']
        if rid in relationships_by_id:
            dest = relationships_by_id[rid]
            if dest.startswith('#'):
                # Internal target: no href is set for an unknown bookmark
                if dest[1:] in self.anchor_map:
                    a.set('href', '#' + self.anchor_map[dest[1:]])
            else:
                a.set('href', dest)
def convert_run(self, run):
    """Convert one ``w:r`` run into an HTML ``<span>``.

    :param run: The DOCX run element.
    :return: The new ``<span>``, also registered in ``self.object_map``.

    Text, line/page breaks, images, note references, tabs and special
    hyphens found in the run become text or child elements of the span.
    Language, direction and vertical alignment of the run style are
    recorded as attributes, and text in symbol fonts is remapped.
    """
    ans = SPAN()
    self.object_map[ans] = run
    # Text is buffered and flushed into the text/tail of the latest element
    text = Text(ans, 'text', [])
    is_tag = self.namespace.is_tag

    for child in run:
        if is_tag(child, 'w:t'):
            if not child.text:
                continue
            space = child.get(XML('space'), None)
            ctext = child.text
            if space != 'preserve':
                # Remove leading and trailing whitespace. Word ignores
                # leading and trailing whitespace without preserve
                ctext = ctext.strip(' \n\r\t')
            # Only use a <span> with white-space:pre-wrap if this element
            # actually needs it, i.e. if it has more than one
            # consecutive space or it has newlines or tabs.
            multi_spaces = self.ms_pat.search(ctext) is not None
            preserve = multi_spaces or self.ws_pat.search(ctext) is not None
            if preserve:
                text.add_elem(SPAN(ctext, style="white-space:pre-wrap"))
                ans.append(text.elem)
            else:
                text.buf.append(ctext)
        elif is_tag(child, 'w:cr'):
            text.add_elem(BR())
            ans.append(text.elem)
        elif is_tag(child, 'w:br'):
            typ = self.namespace.get(child, 'w:type')
            if typ in {'column', 'page'}:
                br = BR(style='page-break-after:always')
            else:
                # The attribute is namespaced (w:clear) like w:type above;
                # the plain lookup is kept as a fallback for compatibility
                # with the previous behaviour.
                clear = self.namespace.get(child, 'w:clear') or child.get('clear', None)
                if clear in {'all', 'left', 'right'}:
                    br = BR(style='clear:%s'%('both' if clear == 'all' else clear))
                else:
                    br = BR()
            text.add_elem(br)
            ans.append(text.elem)
        elif is_tag(child, 'w:drawing') or is_tag(child, 'w:pict'):
            for img in self.images.to_html(child, self.current_page, self.docx, self.dest_dir):
                text.add_elem(img)
                ans.append(text.elem)
        elif is_tag(child, 'w:footnoteReference') or is_tag(child, 'w:endnoteReference'):
            anchor, name = self.footnotes.get_ref(child)
            if anchor and name:
                # The id is the target of the back-link placed on the note
                note_link = A(name, id='back_%s' % anchor, href='#' + anchor, title=name)
                note_link.set('class', 'noteref')
                text.add_elem(note_link)
                ans.append(text.elem)
        elif is_tag(child, 'w:tab'):
            # Approximate the tab with non-breaking spaces, six for every
            # 36 units of the default tab stop, rounded up
            spaces = int(math.ceil((self.settings.default_tab_stop / 36) * 6))
            text.add_elem(SPAN(NBSP * spaces))
            ans.append(text.elem)
            ans[-1].set('class', 'tab')
        elif is_tag(child, 'w:noBreakHyphen'):
            text.buf.append('\u2011')
        elif is_tag(child, 'w:softHyphen'):
            text.buf.append('\u00ad')
    # Flush whatever text is still buffered
    if text.buf:
        setattr(text.elem, text.attr, ''.join(text.buf))

    style = self.styles.resolve_run(run)
    if style.vert_align in {'superscript', 'subscript'}:
        if ans.text or len(ans):
            ans.set('data-docx-vert', 'sup' if style.vert_align == 'superscript' else 'sub')
    if style.lang is not inherit:
        lang = html_lang(style.lang)
        # Only mark runs whose language differs from the document's
        if lang is not None and lang != self.doc_lang:
            ans.set('lang', lang)
    if style.rtl is True:
        ans.set('dir', 'rtl')
    if is_symbol_font(style.font_family):
        for elem in text:
            if elem.text:
                elem.text = map_symbol_text(elem.text, style.font_family)
            if elem.tail:
                elem.tail = map_symbol_text(elem.tail, style.font_family)
        # The remapped text no longer needs the symbol font
        style.font_family = 'sans-serif'
    return ans
def add_frame(self, html_obj, style):
    """Record *html_obj* as part of a run of framed paragraphs.

    :param html_obj: The HTML element generated for a paragraph.
    :param style: The paragraph's frame style, or ``inherit`` when the
        paragraph is not framed.

    ``self.framed`` is a list of runs, each a list of
    ``(html_obj, style)`` pairs that will end up in one frame. An unframed
    paragraph ends the current run. A framed paragraph joins the current
    run only if its style equals that of the run's last member; otherwise
    it starts a new run. Previously both cases appended to the current
    run, so adjacent frames with different styles were merged and all
    rendered with the first style.
    """
    last_run = self.framed[-1]
    if style is inherit:
        # Close the current run, unless it is still empty
        if last_run:
            self.framed.append([])
        return

    entry = (html_obj, style)
    if last_run and not (last_run[-1][1] == style):
        self.framed.append([entry])
    else:
        last_run.append(entry)
def apply_frames(self):
    """Wrap framed paragraphs and bordered block runs in ``<div>`` elements.

    First, every non-empty run in ``self.framed`` is moved into a new
    ``<div>`` inserted where the run's first paragraph was. Then the same
    is done for every entry of ``self.block_runs``. The CSS for each new
    ``<div>`` is stored in ``self.framed_map`` and registered with
    ``self.styles`` under the ``frame`` prefix.
    """
    for framed_run in self.framed:
        if not framed_run:
            continue
        frame_style = framed_run[0][1]
        members = tuple(entry[0] for entry in framed_run)
        first = members[0]
        container = first.getparent()
        # The position must be read before the members are moved
        position = container.index(first)
        frame = DIV(*members)
        container.insert(position, frame)
        css = frame_style.css(self.page_map[self.object_map[first]])
        self.framed_map[frame] = css
        self.styles.register(css, 'frame')

    if not self.block_runs:
        return

    # DOCX element -> generated HTML element
    html_for = {v: k for k, v in iteritems(self.object_map)}
    for border_style, blocks in self.block_runs:
        members = tuple(html_for[b] for b in blocks)
        has_li = any(m.tag == 'li' for m in members)
        container = members[0].getparent()
        if container.tag in ('ul', 'ol'):
            # The run starts inside a list: frame the whole list
            list_elem = container
            container = list_elem.getparent()
            position = container.index(list_elem)
            frame = DIV(list_elem)
        else:
            if has_li:
                def top_level_tag(x):
                    # Climb until the direct child of container (or the
                    # root of a detached subtree) is reached
                    while True:
                        q = x.getparent()
                        if q is container or q is None:
                            break
                        x = q
                    return x
                members = tuple(top_level_tag(m) for m in members)
            position = container.index(members[0])
            frame = DIV(*members)
        container.insert(position, frame)
        css = border_style.css
        self.framed_map[frame] = css
        self.styles.register(css, 'frame')
def mark_block_runs(self, paras):
    """Find runs of adjacent paragraphs that share identical borders.

    :param paras: DOCX paragraph elements in document order.

    Consecutive paragraphs belong to the same run when they have equal
    entries in ``self.frame_map`` and their styles report identical
    borders. Runs of two or more paragraphs have their individual borders
    cleared; if the run's first paragraph has a visible border, the run is
    recorded in ``self.block_runs`` together with a border style cloned
    from that first paragraph.
    """

    def process_run(run):
        # Move the shared border from the individual paragraphs of ``run``
        # onto a single border style for the whole run.
        max_left = max_right = 0
        # Decided once, from the first paragraph of the run
        has_visible_border = None
        for p in run:
            style = self.styles.resolve_paragraph(p)
            if has_visible_border is None:
                has_visible_border = style.has_visible_border()
            if isinstance(style.margin_left, numbers.Number):
                max_left = max(style.margin_left, max_left)
            if isinstance(style.margin_right, numbers.Number):
                max_right = max(style.margin_right, max_right)
            if has_visible_border:
                style.margin_left = style.margin_right = inherit
            if p is not run[0]:
                style.padding_top = 0
            else:
                # First paragraph: its borders become the run's borders and
                # its top margin moves to the run
                border_style = style.clone_border_styles()
                if has_visible_border:
                    border_style.margin_top, style.margin_top = style.margin_top, inherit
            if p is not run[-1]:
                style.padding_bottom = 0
            else:
                # Last paragraph: its bottom margin moves to the run
                if has_visible_border:
                    border_style.margin_bottom, style.margin_bottom = style.margin_bottom, inherit
            style.clear_borders()
            if p is not run[-1]:
                style.apply_between_border()
        if has_visible_border:
            # The run uses the largest numeric side margins of its members
            border_style.margin_left, border_style.margin_right = max_left,max_right
            self.block_runs.append((border_style, run))

    run = []
    for p in paras:
        if run and self.frame_map.get(p) == self.frame_map.get(run[-1]):
            style = self.styles.resolve_paragraph(p)
            last_style = self.styles.resolve_paragraph(run[-1])
            if style.has_identical_borders(last_style):
                run.append(p)
                continue
        # p does not continue the current run: finish it and start anew
        if len(run) > 1:
            process_run(run)
        run = [p]
    if len(run) > 1:
        process_run(run)
if __name__ == '__main__':
    # Manual test harness: convert the file named by the last command line
    # argument into a freshly created ./docx_input directory, with debug
    # logging enabled.
    import shutil
    from calibre.utils.logging import default_log
    default_log.filter_level = default_log.DEBUG
    out_dir = os.path.join(getcwd(), 'docx_input')
    if os.path.exists(out_dir):
        # Start from an empty output directory
        shutil.rmtree(out_dir)
    os.mkdir(out_dir)
    converter = Convert(sys.argv[-1], dest_dir=out_dir, log=default_log)
    converter()

View File

@@ -0,0 +1,143 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
from collections import namedtuple
from itertools import count
from lxml.etree import tostring
from calibre.ebooks.metadata.toc import TOC
from calibre.ebooks.oeb.polish.toc import elem_to_toc_text
from polyglot.builtins import iteritems, range
def from_headings(body, log, namespace, num_levels=3):
    """Create a TOC from the headings in the document.

    :param body: HTML body whose elements carry ``data-heading-level``.
    :param log: Callable used to report that a TOC was generated.
    :param namespace: Unused here.
    :param num_levels: Deepest heading level that is included.
    :return: The TOC root when ``tocroot.flat()`` yields more than one
        item, otherwise ``None``.

    Headings without an ``id`` are given one of the form ``toc_id_N``.
    """
    tocroot = TOC()
    all_heading_nodes = body.xpath('//*[@data-heading-level]')
    tracked_levels = range(1, num_levels + 1)

    # level_prev[n] is the most recent TOC entry at depth n; depth 0 is the
    # root, which is therefore always available as a parent.
    level_prev = dict.fromkeys(tracked_levels)
    level_prev[0] = tocroot

    # Heading element -> its level, for headings within the tracked levels
    item_level_map = {}
    for node in all_heading_nodes:
        for level in tracked_levels:
            if int(node.get('data-heading-level')) == level:
                item_level_map[node] = level
                break

    id_numbers = count(1)

    def ensure_id(elem):
        existing = elem.get('id', None)
        if existing:
            return existing
        fresh = 'toc_id_%d' % next(id_numbers)
        elem.set('id', fresh)
        return fresh

    for item in all_heading_nodes:
        own_level = item_level_map.get(item, None)
        if own_level is None:
            continue
        # Attach to the nearest shallower level that has an entry, which
        # pulls the heading up when intermediate levels were skipped
        parent_level = own_level - 1
        while level_prev[parent_level] is None:
            parent_level -= 1
        parent = level_prev[parent_level]
        lvl = parent_level + 1
        toc = parent.add_item('index.html', ensure_id(item), elem_to_toc_text(item))
        level_prev[lvl] = toc
        # Entries deeper than this one can no longer act as parents
        for deeper in range(lvl + 1, num_levels + 1):
            level_prev[deeper] = None

    if len(tuple(tocroot.flat())) > 1:
        log('Generating Table of Contents from headings')
        return tocroot
    return None
def structure_toc(entries):
    """Build a nested TOC from flat entries using their indentation.

    :param entries: Sequence of objects with ``text``, ``anchor`` and
        ``indent`` attributes, in document order.
    :return: A new ``TOC``. Each distinct indent value is one nesting
        level; with more than six distinct values the result is flat.
    """
    indent_vals = sorted({entry.indent for entry in entries})
    newtoc = TOC()

    if len(indent_vals) > 6:
        for entry in entries:
            newtoc.add_item('index.html', entry.anchor, entry.text)
        return newtoc

    depth_of = {indent: depth for depth, indent in enumerate(indent_vals)}
    # last_found[d] is the latest entry added at depth d, if still usable
    last_found = [None] * len(indent_vals)

    for entry in entries:
        depth = depth_of[entry.indent]
        # Nearest shallower entry, falling back to the TOC root
        shallower = reversed(last_found[:depth])
        parent = next((node for node in shallower if node is not None), newtoc)
        last_found[depth] = parent.add_item('index.html', entry.anchor,
                                            entry.text)
        # Deeper entries belong to the previous branch, forget them
        for deeper in range(depth + 1, len(last_found)):
            last_found[deeper] = None

    return newtoc
def link_to_txt(a, styles, object_map):
    """Return the visible text of the link element *a*.

    :param a: The HTML link element.
    :param styles: Style resolver providing ``resolve(run)``.
    :param object_map: Mapping of HTML element -> DOCX element.
    :return: The stripped text content of *a*, without its tail.

    When *a* has more than one child, children whose resolved style has
    ``display: none`` are removed from *a* before the text is extracted.
    Note that this modifies the element that was passed in.
    """
    if len(a) > 1:
        # Iterate over a snapshot: removing children from ``a`` while
        # iterating ``a`` itself makes the iteration skip or stop early,
        # leaving later hidden children in place.
        for child in tuple(a):
            run = object_map.get(child, None)
            if run is not None:
                rs = styles.resolve(run)
                if rs.css.get('display', None) == 'none':
                    a.remove(child)

    return tostring(a, method='text', with_tail=False, encoding='unicode').strip()
def from_toc(docx, link_map, styles, object_map, log, namespace):
    """Build a TOC from a Word ``TOC`` field found in the document.

    :param docx: The main document tree.
    :param link_map: Mapping of ``w:hyperlink`` element -> HTML link.
    :param styles: Style resolver.
    :param object_map: Mapping of HTML element -> DOCX element.
    :param log: Callable used to report that a TOC was found.
    :param namespace: Provides ``XPath``, ``get`` and ``ancestor``.
    :return: The structured TOC, or ``None`` when no entries were found.

    Hyperlinks are collected from the point where a ``TOC`` instruction is
    seen inside a field until that field ends. The left margin of each
    entry's paragraph is used as its indentation.
    """
    XPath, get, ancestor = namespace.XPath, namespace.get, namespace.ancestor
    TI = namedtuple('TI', 'text anchor indent')
    field_depth = 0  # number of currently open fields
    toc_depth = None  # field depth at which the TOC instruction was seen
    found = []
    candidates = XPath('//*[(@w:fldCharType and name()="w:fldChar") or name()="w:hyperlink" or name()="w:instrText"]')(docx)
    for tag in candidates:
        kind = tag.tag.rpartition('}')[-1]
        if kind == 'fldChar':
            char_type = get(tag, 'w:fldCharType')
            if char_type == 'begin':
                field_depth += 1
            elif char_type == 'end':
                field_depth -= 1
                if toc_depth is not None and field_depth < toc_depth:
                    # The TOC field itself has just been closed
                    break
            continue
        if kind == 'instrText':
            if field_depth > 0 and tag.text and tag.text.strip().startswith('TOC '):
                toc_depth = field_depth
            continue
        if kind != 'hyperlink':
            continue
        if toc_depth is None or field_depth < toc_depth or tag not in link_map:
            continue
        a = link_map[tag]
        href = a.get('href', None)
        txt = link_to_txt(a, styles, object_map)
        p = ancestor(tag, 'w:p')
        if not (txt and href and p is not None):
            continue
        ps = styles.resolve_paragraph(p)
        try:
            # The last two characters are dropped as the unit suffix
            ml = int(ps.margin_left[:-2])
        except (TypeError, ValueError, AttributeError):
            ml = 0
        if ps.text_align in {'center', 'right'}:
            # The margin is not used as indentation for these alignments
            ml = 0
        found.append(TI(txt, href[1:], ml))
    if found:
        log('Found Word Table of Contents, using it to generate the Table of Contents')
        return structure_toc(found)
    return None
def create_toc(docx, body, link_map, styles, object_map, log, namespace):
    """Create the Table of Contents for the converted document.

    A TOC built from a Word TOC field is preferred; when that yields
    nothing, one is generated from the headings instead. In either case the
    ``data-heading-level`` attributes are stripped from *body* afterwards.

    :return: Whatever the chosen generator returned.
    """
    ans = from_toc(docx, link_map, styles, object_map, log, namespace)
    if not ans:
        ans = from_headings(body, log, namespace)
    # The heading level markers are only needed while building the TOC
    for heading in body.xpath('//*[@data-heading-level]'):
        del heading.attrib['data-heading-level']
    return ans

View File

@@ -0,0 +1,7 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

View File

@@ -0,0 +1,258 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
'''
Input plugin for HTML or OPF ebooks.
'''
import os, re, sys, errno as gerrno
from calibre.ebooks.oeb.base import urlunquote
from calibre.ebooks.chardet import detect_xml_encoding
from calibre.constants import iswindows
from calibre import unicode_path, as_unicode, replace_entities
from polyglot.builtins import is_py3, unicode_type
from polyglot.urllib import urlparse, urlunparse
class Link(object):

    '''
    A single link found in an HTML file.

    Links compare equal (and hash) by the local path they resolve to; a
    link without a local path hashes by its URL instead.
    '''

    @classmethod
    def url_to_local_path(cls, url, base):
        '''
        Convert a parsed URL into a local filesystem path.

        :param url: The parsed URL.
        :param base: Directory that relative paths are resolved against.
        :return: The path unchanged when it is absolute, otherwise the
                 absolute path obtained by joining it onto `base`.
        '''
        path = url.path
        # On windows a leading slash is dropped and the rest of the path is
        # then treated as absolute
        strip_root = iswindows and path.startswith('/')
        if strip_root:
            path = path[1:]
        path = urlunquote(urlunparse(('', '', path, url.params, url.query, '')))
        if strip_root or os.path.isabs(path):
            return path
        return os.path.abspath(os.path.join(base, path))

    def __init__(self, url, base):
        '''
        :param url:  The url this link points to. Must be an unquoted unicode string.
        :param base: The base directory that relative URLs are with respect to.
                     Must be a unicode string.
        '''
        assert isinstance(url, unicode_type) and isinstance(base, unicode_type)
        parsed = urlparse(url)
        self.url = url
        self.parsed_url = parsed
        self.is_local = parsed.scheme in ('', 'file')
        # A local link without a path points into the referring file itself
        self.is_internal = self.is_local and not bool(parsed.path)
        self.fragment = urlunquote(parsed.fragment)
        if self.is_local and not self.is_internal:
            self.path = self.url_to_local_path(parsed, base)
        else:
            self.path = None

    def __hash__(self):
        key = self.url if self.path is None else self.path
        return hash(key)

    def __eq__(self, other):
        # ``other`` may be another object with a ``path`` or a bare path
        return self.path == getattr(other, 'path', other)

    def __str__(self):
        return 'Link: %s --> %s'%(self.url, self.path)

    if not is_py3:
        __unicode__ = __str__
class IgnoreFile(Exception):
    '''
    Raised for a linked file that should be skipped.

    `errno` is stored on the exception and `doesnt_exist` is True when it
    equals ENOENT.
    '''

    def __init__(self, msg, errno):
        super(IgnoreFile, self).__init__(msg)
        self.errno = errno
        self.doesnt_exist = errno == gerrno.ENOENT
class HTMLFile(object):

    '''
    Basic information about one HTML file: its title, the links it contains
    and the encoding used to decode it. A file at level > 0 whose first
    4096 bytes contain no ``<html`` tag is treated as not being HTML, in
    which case :member:`is_binary` is True and the file is not decoded.

    The encoding of the file is available as :member:`encoding`.
    '''

    HTML_PAT  = re.compile(r'<\s*html', re.IGNORECASE)
    TITLE_PAT = re.compile('<title>([^<>]+)</title>', re.IGNORECASE)
    LINK_PAT  = re.compile(
    r'<\s*a\s+.*?href\s*=\s*(?:(?:"(?P<url1>[^"]+)")|(?:\'(?P<url2>[^\']+)\')|(?P<url3>[^\s>]+))',
    re.DOTALL|re.IGNORECASE)

    def __init__(self, path_to_html_file, level, encoding, verbose, referrer=None):
        '''
        :param level: The level of this file. Should be 0 for the root file.
        :param encoding: Use `encoding` to decode HTML.
        :param referrer: The :class:`HTMLFile` that first refers to this file.
        '''
        self.path = unicode_path(path_to_html_file, abs=True)
        self.base, filename = os.path.split(self.path)
        # Fallback title: the file name without its extension
        self.title = os.path.splitext(filename)[0]
        self.level = level
        self.referrer = referrer
        self.links = []

        try:
            with open(self.path, 'rb') as f:
                src = header = f.read(4096)
                # NOTE: this replaces the `encoding` argument
                encoding = detect_xml_encoding(src)[1]
                if encoding:
                    try:
                        header = header.decode(encoding)
                    except ValueError:
                        pass
                looks_like_html = bool(self.HTML_PAT.search(header)) if level > 0 else True
                self.is_binary = level > 0 and not looks_like_html
                if not self.is_binary:
                    src += f.read()
        except IOError as err:
            msg = 'Could not read from file: %s with error: %s'%(self.path, as_unicode(err))
            # Only a failure to read the root file is fatal
            if level == 0:
                raise IOError(msg)
            raise IgnoreFile(msg, err.errno)

        if not src:
            if level == 0:
                raise ValueError('The file %s is empty'%self.path)
            self.is_binary = True

        if self.is_binary:
            return

        if not encoding:
            encoding = detect_xml_encoding(src[:4096], verbose=verbose)[1]
        self.encoding = encoding

        text = src.decode(encoding, 'replace')
        found = self.TITLE_PAT.search(text)
        if found is not None:
            self.title = found.group(1)
        self.find_links(text)

    def __eq__(self, other):
        # ``other`` may be another object with a ``path`` or a bare path
        return self.path == getattr(other, 'path', other)

    def __hash__(self):
        return hash(self.path)

    def __str__(self):
        kind = 'b' if self.is_binary else 'a'
        return 'HTMLFile:%d:%s:%s'%(self.level, kind, self.path)

    def __repr__(self):
        return unicode_type(self)

    def find_links(self, src):
        '''
        Append every distinct link found in `src` to :member:`links`.
        '''
        for match in self.LINK_PAT.finditer(src):
            # The href may be double quoted, single quoted or unquoted
            url = match.group('url1') or match.group('url2') or match.group('url3')
            url = replace_entities(url)
            try:
                link = self.resolve(url)
            except ValueError:
                # Unparseable URL, ignore
                continue
            if link in self.links:
                continue
            self.links.append(link)

    def resolve(self, url):
        '''
        Return a :class:`Link` for `url`, relative to this file's directory.
        '''
        return Link(url, self.base)
def depth_first(root, flat, visited=None):
    '''
    Yield `root` and then, depth first, every file reachable from it.

    :param root: Object with a ``links`` list; each link has a ``path``.
    :param flat: List of the known files; links are looked up in it and
                 links without an entry there are skipped.
    :param visited: Set of files already produced; shared across the
                    recursive calls. A fresh set is used when `None`.
    '''
    yield root
    seen = set() if visited is None else visited
    seen.add(root)
    for link in root.links:
        if link.path is None or link in seen:
            continue
        try:
            position = flat.index(link)
        except ValueError:  # Can happen if max_levels is used
            continue
        target = flat[position]
        if target in seen:
            continue
        yield target
        seen.add(target)
        # The recursive generator yields `target` again first; the membership
        # test below filters that and any other repeats.
        for descendant in depth_first(target, flat, seen):
            if descendant not in seen:
                yield descendant
                seen.add(descendant)
def traverse(path_to_html_file, max_levels=sys.maxsize, verbose=0, encoding=None):
    '''
    Recursively traverse all links in the HTML file.

    :param max_levels: Maximum levels of recursion. Must be non-negative. 0
                       implies that no links in the root HTML file are followed.
    :param encoding:   Specify character encoding of HTML files. If `None` it is
                       auto-detected.
    :return:           A pair of lists (breadth_first, depth_first). Each list contains
                       :class:`HTMLFile` objects.
    '''
    assert max_levels >= 0
    depth = 0
    flat = [HTMLFile(path_to_html_file, depth, encoding, verbose)]
    frontier = list(flat)
    while depth < max_levels and frontier:
        depth += 1
        discovered = []
        for parent in frontier:
            unusable = []
            for link in parent.links:
                if link.path is None or link.path in flat:
                    continue
                try:
                    child = HTMLFile(link.path, depth, encoding, verbose, referrer=parent)
                    if child.is_binary:
                        raise IgnoreFile('%s is a binary file'%child.path, -1)
                except IgnoreFile as err:
                    unusable.append(link)
                    if not err.doesnt_exist or verbose > 1:
                        print(repr(err))
                else:
                    discovered.append(child)
                    flat.append(child)
            # Drop links that could not be followed, once iteration over
            # parent.links has finished.
            for link in unusable:
                parent.links.remove(link)
        frontier = discovered

    # depth_first() recurses once per nesting level, so lift the limit
    # temporarily and restore it afterwards.
    saved_limit = sys.getrecursionlimit()
    sys.setrecursionlimit(500000)
    try:
        return flat, list(depth_first(flat[0], flat))
    finally:
        sys.setrecursionlimit(saved_limit)
def get_filelist(htmlfile, dir, opts, log):
    '''
    Build list of files referenced by html file or try to detect and use an
    OPF file instead.

    Returns the breadth first ordering when ``opts.breadth_first`` is set,
    otherwise the depth first one. `dir` is not used by this function.
    '''
    log.info('Building file list...')
    orderings = traverse(
        htmlfile,
        max_levels=int(opts.max_levels),
        verbose=opts.verbose,
        encoding=opts.input_encoding,
    )
    filelist = orderings[0 if opts.breadth_first else 1]
    if opts.verbose:
        log.debug('\tFound files...')
        for entry in filelist:
            log.debug('\t\t', entry)
    return filelist

View File

@@ -0,0 +1,122 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import textwrap, os, glob
from calibre.customize import FileTypePlugin
from calibre.constants import numeric_version
from polyglot.builtins import unicode_type
class HTML2ZIP(FileTypePlugin):
    # File type plugin that turns an imported HTML file plus the local files
    # it links to into a single ZIP container.
    #
    # The customization string has the form ``<encoding>[|bf]``: an optional
    # input encoding, optionally followed by ``|bf`` to request breadth first
    # ordering of linked files.

    name = 'HTML to ZIP'
    author = 'Kovid Goyal'
    description = textwrap.dedent(_('''\
Follow all local links in an HTML file and create a ZIP \
file containing all linked files. This plugin is run \
every time you add an HTML file to the library.\
'''))
    version = numeric_version
    file_types = {'html', 'htm', 'xhtml', 'xhtm', 'shtm', 'shtml'}
    supported_platforms = ['windows', 'osx', 'linux']
    on_import = True

    def run(self, htmlfile):
        '''
        Convert `htmlfile` and package the dumped input stage as a ZIP.

        :param htmlfile: Path to the HTML file being added.
        :return: Path of the temporary ZIP file that was created.
        '''
        import codecs
        from calibre import prints
        from calibre.ptempfile import TemporaryDirectory
        from calibre.gui2.convert.gui_conversion import gui_convert
        from calibre.customize.conversion import OptionRecommendation
        from calibre.ebooks.epub import initialize_container

        with TemporaryDirectory('_plugin_html2zip') as tdir:
            recs =[('debug_pipeline', tdir, OptionRecommendation.HIGH)]
            recs.append(['keep_ligatures', True, OptionRecommendation.HIGH])
            if self.site_customization and self.site_customization.strip():
                sc = self.site_customization.strip()
                # NOTE: `_` is rebound here as a throwaway local; the
                # translation function `_` is not called in this method.
                enc, _, bf = sc.partition('|')
                if enc:
                    # Only pass on an encoding that Python actually knows.
                    try:
                        codecs.lookup(enc)
                    except Exception:
                        prints('Ignoring invalid input encoding for HTML:', enc)
                    else:
                        recs.append(['input_encoding', enc, OptionRecommendation.HIGH])
                if bf == 'bf':
                    recs.append(['breadth_first', True,
                        OptionRecommendation.HIGH])
            # Stop after the input stage has been dumped under tdir/input.
            gui_convert(htmlfile, tdir, recs, abort_after_input_dump=True)
            of = self.temporary_file('_plugin_html2zip.zip')
            tdir = os.path.join(tdir, 'input')
            # NOTE(review): raises IndexError if no .opf was produced.
            opf = glob.glob(os.path.join(tdir, '*.opf'))[0]
            ncx = glob.glob(os.path.join(tdir, '*.ncx'))
            if ncx:
                # The NCX is removed before the directory is archived.
                os.remove(ncx[0])
            epub = initialize_container(of.name, os.path.basename(opf))
            epub.add_dir(tdir)
            epub.close()

        return of.name

    def customization_help(self, gui=False):
        # Help text shown next to the customization field; `gui` is unused.
        return _('Character encoding for the input HTML files. Common choices '
        'include: cp1252, cp1251, latin1 and utf-8.')

    def do_user_config(self, parent=None):
        '''
        This method shows a configuration dialog for this plugin. It returns
        True if the user clicks OK, False otherwise. The changes are
        automatically applied.
        '''
        from PyQt5.Qt import (QDialog, QDialogButtonBox, QVBoxLayout,
                QLabel, Qt, QLineEdit, QCheckBox)
        config_dialog = QDialog(parent)
        button_box = QDialogButtonBox(QDialogButtonBox.Ok | QDialogButtonBox.Cancel)
        v = QVBoxLayout(config_dialog)

        def size_dialog():
            config_dialog.resize(config_dialog.sizeHint())

        button_box.accepted.connect(config_dialog.accept)
        button_box.rejected.connect(config_dialog.reject)
        config_dialog.setWindowTitle(_('Customize') + ' ' + self.name)
        from calibre.customize.ui import (plugin_customization,
                customize_plugin)
        help_text = self.customization_help(gui=True)
        help_text = QLabel(help_text, config_dialog)
        help_text.setWordWrap(True)
        help_text.setTextInteractionFlags(Qt.LinksAccessibleByMouse | Qt.LinksAccessibleByKeyboard)
        help_text.setOpenExternalLinks(True)
        v.addWidget(help_text)
        bf = QCheckBox(_('Add linked files in breadth first order'))
        bf.setToolTip(_('Normally, when following links in HTML files'
            ' calibre does it depth first, i.e. if file A links to B and '
            ' C, but B links to D, the files are added in the order A, B, D, C. '
            ' With this option, they will instead be added as A, B, C, D'))
        # Split the stored "<encoding>[|bf]" string into its two parts.
        sc = plugin_customization(self)
        if not sc:
            sc = ''
        sc = sc.strip()
        enc = sc.partition('|')[0]
        bfs = sc.partition('|')[-1]
        bf.setChecked(bfs == 'bf')
        # From here on `sc` is the line edit widget, not the stored string.
        sc = QLineEdit(enc, config_dialog)
        v.addWidget(sc)
        v.addWidget(bf)
        v.addWidget(button_box)
        size_dialog()
        config_dialog.exec_()

        if config_dialog.result() == QDialog.Accepted:
            # Re-assemble the customization string from the widgets.
            sc = unicode_type(sc.text()).strip()
            if bf.isChecked():
                sc += '|bf'
            customize_plugin(self, sc)

        return config_dialog.result()

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,115 @@
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
"""
This package contains logic to read and write LRF files.
The LRF file format is documented at U{http://www.sven.de/librie/Librie/LrfFormat}.
"""
from calibre.ebooks.lrf.pylrs.pylrs import Book as _Book
from calibre.ebooks.lrf.pylrs.pylrs import TextBlock, Header, \
TextStyle, BlockStyle
from calibre.ebooks.lrf.fonts import FONT_FILE_MAP
from calibre.ebooks import ConversionError
__docformat__ = "epytext"
class LRFParseError(Exception):
    '''Error raised for problems encountered while parsing LRF data.'''
class PRS500_PROFILE(object):
    '''Screen geometry and default typography used for the PRS500 profile.'''

    name = 'prs500'

    # Screen geometry
    screen_width = 600
    screen_height = 775
    dpi = 166
    # Number of pixels to subtract from screen_height when calculating height of text area
    fudge = 0

    # Body text defaults
    font_size = 10  #: Default (in pt)
    parindent = 10  #: Default (in pt)
    line_space = 1.2  #: Default (in pt)

    # Page header
    header_font_size = 6  #: In pt
    header_height = 30  #: In px

    # Font face used for each generic family when no custom font is supplied
    default_fonts = {
        'sans': "Swis721 BT Roman",
        'mono': "Courier10 BT Roman",
        'serif': "Dutch801 Rm BT Roman",
    }
def find_custom_fonts(options, logger):
    '''
    Look up the user requested serif, sans and mono font families.

    Each ``options.<kind>_family`` value is a comma separated string; only
    the part after the last comma is used as the family name.

    :return: Dict with the keys 'serif', 'sans' and 'mono'. A value is
             whatever the font scanner returned for that family, or `None`
             when no family was requested.
    '''
    from calibre.utils.fonts.scanner import font_scanner

    kinds = (
        ('serif', 'Unable to find serif family %s'),
        ('sans', 'Unable to find sans family %s'),
        ('mono', 'Unable to find mono family %s'),
    )
    fonts = {'serif': None, 'sans': None, 'mono': None}
    for kind, not_found_msg in kinds:
        requested = getattr(options, kind + '_family')
        if not requested:
            continue
        family_name = requested.split(',')[-1].strip()
        fonts[kind] = font_scanner.legacy_fonts_for_family(family_name)
        if not fonts[kind]:
            logger.warn(not_found_msg % family_name)
    return fonts
def Book(options, logger, font_delta=0, header=None,
         profile=PRS500_PROFILE, **settings):
    '''
    Create a pylrs book with page, text and block style defaults derived
    from `options` and `profile`, and embed any custom fonts.

    :param font_delta: Added to the profile font size (one unit = 2pt).
    :param header: Optional content for a page header; when given the top
                   margin is replaced by the header area.
    :param settings: Passed through to the underlying book constructor.
    :return: The pair ``(book, fonts)``.
    :raises ConversionError: if a custom family has no 'normal' variant.
    '''
    from uuid import uuid4

    text_width = profile.screen_width - (options.left_margin + options.right_margin)
    text_height = (profile.screen_height
                   - (options.top_margin + options.bottom_margin)
                   - profile.fudge)
    ps = {
        'topmargin': options.top_margin,
        'evensidemargin': options.left_margin,
        'oddsidemargin': options.left_margin,
        'textwidth': text_width,
        'textheight': text_height,
    }

    if header:
        header_block = TextBlock(
            textStyle=TextStyle(align='foot',
                                fontsize=int(profile.header_font_size*10)),
            blockStyle=BlockStyle(blockwidth=ps['textwidth']))
        hdr = Header()
        header_block.append(header)
        hdr.PutObj(header_block)
        ps['headheight'] = profile.header_height
        ps['headsep'] = options.header_separation
        ps['header'] = hdr
        # The header takes the place of the top margin.
        ps['topmargin'] = 0
        ps['textheight'] = (profile.screen_height
                            - (options.bottom_margin + ps['topmargin'])
                            - ps['headheight'] - ps['headsep'] - profile.fudge)

    # Sizes are handed to pylrs in tenths of a point.
    fontsize = int(10*profile.font_size+font_delta*20)
    fonts = find_custom_fonts(options, logger)
    tsd = {
        'fontsize': fontsize,
        'parindent': int(10*profile.parindent),
        'linespace': int(10*profile.line_space),
        'baselineskip': fontsize + 20,
        'wordspace': 10*options.wordspace,
    }
    serif_fonts = fonts['serif']
    if serif_fonts and 'normal' in serif_fonts:
        tsd['fontfacename'] = serif_fonts['normal'][1]

    book = _Book(textstyledefault=tsd,
                 pagestyledefault=ps,
                 blockstyledefault={'blockwidth': ps['textwidth']},
                 bookid=uuid4().hex,
                 **settings)

    # Embed every variant of every custom family and remember its file.
    for variants in fonts.values():
        if not variants:
            continue
        for font in variants.values():
            book.embed_font(*font)
            FONT_FILE_MAP[font[1]] = font[0]

    # Fall back to the profile's built-in face for families without a
    # custom font; a custom family must provide a 'normal' variant.
    for family in ['serif', 'sans', 'mono']:
        if not fonts[family]:
            fonts[family] = {'normal': (None, profile.default_fonts[family])}
        elif 'normal' not in fonts[family]:
            raise ConversionError('Could not find the normal version of the ' + family + ' font')
    return book, fonts

View File

@@ -0,0 +1,33 @@
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
from PIL import ImageFont
'''
Default fonts used in the PRS500
'''
# Maps a PRS500 font face name to the base name (without ".ttf") of the
# Liberation font file used in its place by get_font().
LIBERATION_FONT_MAP = {
    'Swis721 BT Roman' : 'LiberationSans-Regular',
    'Dutch801 Rm BT Roman' : 'LiberationSerif-Regular',
    'Courier10 BT Roman' : 'LiberationMono-Regular',
}
# Font face name -> path of the font file. Starts empty; presumably filled
# in by the code that embeds custom fonts -- verify against callers.
FONT_FILE_MAP = {}
def get_font(name, size, encoding='unic'):
    '''
    Get an ImageFont object by name.

    @param name: A key of LIBERATION_FONT_MAP or of FONT_FILE_MAP.
    @param size: Font height in pixels. To convert from pts:
                 sz in pixels = (dpi/72) * size in pts
    @param encoding: Font encoding to use. E.g. 'unic', 'symbol', 'ADOB', 'ADBE', 'aprm'
    @return: The loaded font, or None if `name` is in neither map.
    '''
    if name in LIBERATION_FONT_MAP:
        font_path = P('fonts/liberation/%s.ttf' % LIBERATION_FONT_MAP[name])
    elif name in FONT_FILE_MAP:
        font_path = FONT_FILE_MAP[name]
    else:
        return None
    return ImageFont.truetype(font_path, size, encoding=encoding)

View File

@@ -0,0 +1,10 @@
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
"""
This package contains code to convert HTML ebooks to LRF ebooks.
"""
__docformat__ = "epytext"
__author__ = "Kovid Goyal <kovid@kovidgoyal.net>"

View File

@@ -0,0 +1,115 @@
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
import re
# Lower-case HTML/CSS colour keyword -> '#RRGGBB' value.
# Keys must not carry surrounding whitespace: lookups use the lower-cased
# colour string as given.
NAME_MAP = {
    'aliceblue': '#F0F8FF',
    'antiquewhite': '#FAEBD7',
    'aqua': '#00FFFF',
    'aquamarine': '#7FFFD4',
    'azure': '#F0FFFF',
    'beige': '#F5F5DC',
    'bisque': '#FFE4C4',
    'black': '#000000',
    'blanchedalmond': '#FFEBCD',
    'blue': '#0000FF',
    'brown': '#A52A2A',
    'burlywood': '#DEB887',
    'cadetblue': '#5F9EA0',
    'chartreuse': '#7FFF00',
    'chocolate': '#D2691E',
    'coral': '#FF7F50',
    'crimson': '#DC143C',
    'cyan': '#00FFFF',
    'darkblue': '#00008B',
    'darkgoldenrod': '#B8860B',
    'darkgreen': '#006400',
    'darkkhaki': '#BDB76B',
    'darkmagenta': '#8B008B',
    'darkolivegreen': '#556B2F',
    'darkorange': '#FF8C00',
    'darkorchid': '#9932CC',
    'darkred': '#8B0000',
    'darksalmon': '#E9967A',
    'darkslateblue': '#483D8B',
    'darkslategrey': '#2F4F4F',
    'darkviolet': '#9400D3',
    'deeppink': '#FF1493',
    'dodgerblue': '#1E90FF',
    'firebrick': '#B22222',
    'floralwhite': '#FFFAF0',
    'forestgreen': '#228B22',
    'fuchsia': '#FF00FF',
    'gainsboro': '#DCDCDC',
    'ghostwhite': '#F8F8FF',
    'gold': '#FFD700',
    'goldenrod': '#DAA520',
    'indianred': '#CD5C5C',
    'indigo': '#4B0082',
    'khaki': '#F0E68C',
    'lavenderblush': '#FFF0F5',
    'lawngreen': '#7CFC00',
    'lightblue': '#ADD8E6',
    'lightcoral': '#F08080',
    'lightgoldenrodyellow': '#FAFAD2',
    'lightgray': '#D3D3D3',
    'lightgrey': '#D3D3D3',
    'lightskyblue': '#87CEFA',
    'lightslategrey': '#778899',
    'lightsteelblue': '#B0C4DE',
    'lime': '#00FF00',
    'linen': '#FAF0E6',
    'magenta': '#FF00FF',
    'maroon': '#800000',
    'mediumaquamarine': '#66CDAA',
    'mediumblue': '#0000CD',
    'mediumorchid': '#BA55D3',
    'mediumpurple': '#9370DB',
    'mediumseagreen': '#3CB371',
    'mediumslateblue': '#7B68EE',
    'midnightblue': '#191970',
    'moccasin': '#FFE4B5',
    'navajowhite': '#FFDEAD',
    'navy': '#000080',
    'oldlace': '#FDF5E6',
    'olive': '#808000',
    'orange': '#FFA500',
    'orangered': '#FF4500',
    'orchid': '#DA70D6',
    'paleturquoise': '#AFEEEE',
    'papayawhip': '#FFEFD5',
    'peachpuff': '#FFDAB9',
    'powderblue': '#B0E0E6',
    'rosybrown': '#BC8F8F',
    'royalblue': '#4169E1',
    'saddlebrown': '#8B4513',
    'sandybrown': '#F4A460',
    'seashell': '#FFF5EE',
    'sienna': '#A0522D',
    'silver': '#C0C0C0',
    'skyblue': '#87CEEB',
    'slategrey': '#708090',
    'snow': '#FFFAFA',
    'springgreen': '#00FF7F',
    'violet': '#EE82EE',
    'yellowgreen': '#9ACD32'
}
# All patterns are applied to the lower-cased colour string.
# '#rrggbb'
hex_pat = re.compile(r'#([0-9a-f]{2})([0-9a-f]{2})([0-9a-f]{2})')
# '#rgb' shorthand; the lookahead rejects 4 and 5 digit garbage.
short_hex_pat = re.compile(r'#([0-9a-f])([0-9a-f])([0-9a-f])(?![0-9a-f])')
rgb_pat = re.compile(r'rgb\(\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*\)', re.IGNORECASE)


def lrs_color(html_color):
    '''
    Convert an HTML colour to the ``0x00rrggbb`` string form.

    Recognised forms are ``#rrggbb``, ``#rgb``, ``rgb(r, g, b)`` with decimal
    components (values above 255 are clamped) and the keywords in NAME_MAP.
    Anything else yields ``'0x00000000'``.

    :param html_color: The colour string; matching is case insensitive.
    :return: A string of the form ``0x00`` followed by six hex digits.
    '''
    hcol = html_color.lower()
    match = hex_pat.search(hcol)
    if match:
        return '0x00' + ''.join(match.groups())
    match = short_hex_pat.search(hcol)
    if match:
        # '#abc' is shorthand for '#aabbcc'.
        return '0x00' + ''.join(digit*2 for digit in match.groups())
    match = rgb_pat.search(hcol)
    if match:
        # Each component must occupy exactly two hex digits.
        return '0x00' + ''.join(
            '%02x' % min(int(component), 255) for component in match.groups())
    if hcol in NAME_MAP:
        return NAME_MAP[hcol].replace('#', '0x00')
    return '0x00000000'

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,386 @@
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
import math, sys, re, numbers
from calibre.ebooks.lrf.fonts import get_font
from calibre.ebooks.lrf.pylrs.pylrs import TextBlock, Text, CR, Span, \
CharButton, Plot, Paragraph, \
LrsTextTag
from polyglot.builtins import string_or_bytes, range, native_string_type
def ceil(num):
    '''Return the smallest integer >= `num`, as an `int`.'''
    rounded_up = math.ceil(num)
    return int(rounded_up)
def print_xml(elem):
    '''Debugging aid: write the XML form of `elem` to stdout, then a newline.'''
    from calibre.ebooks.lrf.pylrs.pylrs import ElementWriter
    encoding = native_string_type('utf8')
    xml_element = elem.toElement(encoding)
    writer = ElementWriter(xml_element, sourceEncoding=native_string_type('utf8'))
    writer.write(sys.stdout)
    print()
def cattrs(base, extra):
    '''Return a copy of `base` updated with `extra`; `base` is not modified.'''
    combined = base.copy()
    combined.update(extra)
    return combined
def tokens(tb):
    '''
    Generate ``(token, attrs)`` pairs for the contents of the text block `tb`.

    A token is one of:

    1. A string: a block of text that has the same style; ``attrs`` is
       the dict of attributes that apply to it.
    2. The integer 1 for a ``CR`` directly inside `tb`, or the integer 2
       for a ``CR`` inside a paragraph; ``attrs`` is `None`.
    3. A ``Plot`` object; ``attrs`` is `None`.

    Only ``CR`` and ``Paragraph`` children of `tb` are considered.
    '''
    def process_element(x, attrs):
        # Yield the tokens for a single paragraph element `x`, where `attrs`
        # are the attributes inherited from its container.
        if isinstance(x, CR):
            yield 2, None
        elif isinstance(x, Text):
            yield x.text, cattrs(attrs, {})
        elif isinstance(x, string_or_bytes):
            yield x, cattrs(attrs, {})
        elif isinstance(x, (CharButton, LrsTextTag)):
            # Only the first child of these containers is examined.
            if x.contents:
                if hasattr(x.contents[0], 'text'):
                    yield x.contents[0].text, cattrs(attrs, {})
                elif hasattr(x.contents[0], 'attrs'):
                    for z in process_element(x.contents[0], x.contents[0].attrs):
                        yield z
        elif isinstance(x, Plot):
            yield x, None
        elif isinstance(x, Span):
            # A span's own attributes override the inherited ones for all
            # of its children.
            attrs = cattrs(attrs, x.attrs)
            for y in x.contents:
                for z in process_element(y, attrs):
                    yield z

    for i in tb.contents:
        if isinstance(i, CR):
            yield 1, None
        elif isinstance(i, Paragraph):
            for j in i.contents:
                attrs = {}
                if hasattr(j, 'attrs'):
                    attrs = j.attrs
                for k in process_element(j, attrs):
                    yield k
class Cell(object):
    '''
    A single table cell: the text blocks produced by rendering the cell's
    tag, together with helpers to measure them.
    '''

    def __init__(self, conv, tag, css):
        '''
        Render `tag` through `conv` on a scratch page and collect the
        resulting text blocks.

        :param conv: The converter object; its ``book``, ``current_page``,
                     ``parse_tag`` and ``end_current_block`` are used.
        :param tag:  The cell's tag (must support ``has_attr`` and item access).
        :param css:  Mapping of CSS properties that apply to the cell.
        '''
        self.conv = conv
        self.tag = tag
        self.css = css
        self.text_blocks = []
        # Requested width as a percentage; -1 means "not specified".
        self.pwidth = -1.
        if tag.has_attr('width') and '%' in tag['width']:
            try:
                self.pwidth = float(tag['width'].replace('%', ''))
            except ValueError:
                pass
        # A CSS width takes precedence over the width attribute.
        if 'width' in css and '%' in css['width']:
            try:
                self.pwidth = float(css['width'].replace('%', ''))
            except ValueError:
                pass
        if self.pwidth > 100:
            self.pwidth = -1
        # Parsed independently so that one malformed attribute does not
        # discard the other.
        self.colspan = self._parse_span(tag, 'colspan')
        self.rowspan = self._parse_span(tag, 'rowspan')

        # Render the cell onto a temporary page, then restore the real one.
        pp = conv.current_page
        conv.book.allow_new_page = False
        conv.current_page = conv.book.create_page()
        conv.parse_tag(tag, css)
        conv.end_current_block()
        for item in conv.current_page.contents:
            if isinstance(item, TextBlock):
                self.text_blocks.append(item)
        conv.current_page = pp
        conv.book.allow_new_page = True
        if not self.text_blocks:
            # Guarantee at least one (blank) block per cell.
            tb = conv.book.create_text_block()
            tb.Paragraph(' ')
            self.text_blocks.append(tb)
        for tb in self.text_blocks:
            tb.parent = None
            tb.objId = 0
            # Needed as we have to eventually change this BlockStyle's width and
            # height attributes. This blockstyle may be shared with other
            # elements, so doing that causes havoc.
            tb.blockStyle = conv.book.create_block_style()
            ts = conv.book.create_text_style(**tb.textStyle.attrs)
            ts.attrs['parindent'] = 0
            tb.textStyle = ts
            if ts.attrs['align'] == 'foot':
                if isinstance(tb.contents[-1], Paragraph):
                    tb.contents[-1].append(' ')

    @staticmethod
    def _parse_span(tag, attr):
        '''Return the integer value of span attribute `attr`; 1 if absent or invalid.'''
        if not tag.has_attr(attr):
            return 1
        try:
            return int(tag[attr])
        except Exception:
            # Best effort: malformed markup must not abort the conversion.
            return 1

    def pts_to_pixels(self, pts):
        '''Convert `pts`, given in tenths of a point, to pixels at the profile's dpi.'''
        pts = int(pts)
        return ceil((float(self.conv.profile.dpi)/72)*(pts/10))

    def minimum_width(self):
        '''Return the largest minimum width of this cell's text blocks.'''
        return max([self.minimum_tb_width(tb) for tb in self.text_blocks])

    def minimum_tb_width(self, tb):
        '''
        Return the minimum width in pixels needed by text block `tb`.

        This is the width of the widest measured word (the first word of
        each text token) plus the paragraph indent and 2 pixels. If a plot
        is encountered its width is returned immediately.
        '''
        ts = tb.textStyle.attrs
        default_font = get_font(ts['fontfacename'], self.pts_to_pixels(ts['fontsize']))
        parindent = self.pts_to_pixels(ts['parindent'])
        mwidth = 0
        for token, attrs in tokens(tb):
            font = default_font
            if isinstance(token, numbers.Integral):  # Handle para and line breaks
                continue
            if isinstance(token, Plot):
                return self.pts_to_pixels(token.xsize)
            ff = attrs.get('fontfacename', ts['fontfacename'])
            fs = attrs.get('fontsize', ts['fontsize'])
            if (ff, fs) != (ts['fontfacename'], ts['fontsize']):
                font = get_font(ff, self.pts_to_pixels(fs))
            if not token.strip():
                continue
            word = token.split()
            word = word[0] if word else ""
            width = font.getsize(word)[0]
            if width > mwidth:
                mwidth = width
        return parindent + mwidth + 2

    def text_block_size(self, tb, maxwidth=sys.maxsize, debug=False):
        '''
        Estimate the size of text block `tb` when wrapped at `maxwidth`.

        :param maxwidth: Width in pixels at which lines are broken.
        :param debug: Unused.
        :return: The pair ``(width, height)`` in pixels.
        '''
        ts = tb.textStyle.attrs
        default_font = get_font(ts['fontfacename'], self.pts_to_pixels(ts['fontsize']))
        parindent = self.pts_to_pixels(ts['parindent'])
        top, bottom, left, right = 0, 0, parindent, parindent

        def add_word(width, height, left, right, top, bottom, ls, ws):
            # Place one word (or plot) of the given width, wrapping to a new
            # line of height `ls` when it does not fit; returns the updated
            # cursor and extents.
            if left + width > maxwidth:
                left = width + ws
                top += ls
                bottom = top+ls if top+ls > bottom else bottom
            else:
                left += (width + ws)
                right = left if left > right else right
                bottom = top+ls if top+ls > bottom else bottom
            return left, right, top, bottom

        for token, attrs in tokens(tb):
            if attrs is None:
                attrs = {}
            font = default_font
            ls = self.pts_to_pixels(attrs.get('baselineskip', ts['baselineskip']))+\
                self.pts_to_pixels(attrs.get('linespace', ts['linespace']))
            ws = self.pts_to_pixels(attrs.get('wordspace', ts['wordspace']))
            if isinstance(token, numbers.Integral):  # Handle para and line breaks
                if top != bottom:  # Previous element not a line break
                    top = bottom
                else:
                    top += ls
                    bottom += ls
                # Token 1 is a break between paragraphs, so the next line is
                # indented; token 2 is a line break inside a paragraph.
                left = parindent if token == 1 else 0
                continue
            if isinstance(token, Plot):
                width, height = self.pts_to_pixels(token.xsize), self.pts_to_pixels(token.ysize)
                left, right, top, bottom = add_word(width, height, left, right, top, bottom, height, ws)
                continue
            ff = attrs.get('fontfacename', ts['fontfacename'])
            fs = attrs.get('fontsize', ts['fontsize'])
            if (ff, fs) != (ts['fontfacename'], ts['fontsize']):
                font = get_font(ff, self.pts_to_pixels(fs))
            for word in token.split():
                width, height = font.getsize(word)
                left, right, top, bottom = add_word(width, height, left, right, top, bottom, ls, ws)
        return right+3+max(parindent, 10), bottom

    def text_block_preferred_width(self, tb, debug=False):
        '''Return the width of `tb` when it is laid out without wrapping.'''
        return self.text_block_size(tb, sys.maxsize, debug=debug)[0]

    def preferred_width(self, debug=False):
        '''Return the largest unwrapped width of this cell's text blocks.'''
        return ceil(max([self.text_block_preferred_width(i, debug=debug) for i in self.text_blocks]))

    def height(self, width):
        '''Return the total height of the cell's text blocks wrapped at `width`.'''
        return sum([self.text_block_size(i, width)[1] for i in self.text_blocks])
class Row(object):
    '''A table row: its cells plus the link targets found inside it.'''

    def __init__(self, conv, row, css, colpad):
        '''
        Build a :class:`Cell` for every td/th in `row` and record the
        values of name/id attributes found in the row as link targets.
        '''
        self.colpad = colpad
        self.targets = []
        self.cells = []
        for cell_tag in row.findAll(re.compile('td|th', re.IGNORECASE)):
            cell_css = conv.tag_css(cell_tag, css)[0]
            self.cells.append(Cell(conv, cell_tag, cell_css))
        for a in row.findAll(id=True) + row.findAll(name=True):
            # The name attribute wins over id when both are present.
            if a.has_attr('name'):
                name = a['name']
            elif a.has_attr('id'):
                name = a['id']
            else:
                name = None
            if name is not None:
                self.targets.append(name.replace('#', ''))

    def number_of_cells(self):
        '''Number of cells in this row. Respects colspan'''
        total = 0
        for cell in self.cells:
            total += cell.colspan
        return total

    def height(self, widths):
        '''Return the height of the tallest cell for the column `widths`; 0 if empty.'''
        column = 0
        heights = []
        for cell in self.cells:
            span_end = column + cell.colspan
            heights.append(cell.height(sum(widths[column:span_end])))
            column = span_end
        return max(heights) if heights else 0

    def cell_from_index(self, col):
        '''
        Return the cell covering column `col`, taking colspan into account.

        Returns `None` for a row without cells; for a `col` past the last
        column the last cell is returned.
        '''
        covered = -1
        found = None
        for found in self.cells:
            for _unused in range(0, found.colspan):
                if covered == col:
                    break
                covered += 1
            if covered == col:
                break
        return found

    def minimum_width(self, col):
        '''Return the minimum width of the cell at column `col`, or 0 if there is none.'''
        cell = self.cell_from_index(col)
        return cell.minimum_width() if cell else 0

    def preferred_width(self, col):
        '''Return the preferred width of the cell at `col`; 0 for no cell or a spanning cell.'''
        cell = self.cell_from_index(col)
        if not cell or cell.colspan > 1:
            return 0
        return cell.preferred_width()

    def width_percent(self, col):
        '''Return the percentage width of the cell at `col`; -1 for no cell or a spanning cell.'''
        cell = self.cell_from_index(col)
        if not cell or cell.colspan > 1:
            return -1
        return cell.pwidth

    def cell_iterator(self):
        '''Generate the cells of this row in order.'''
        for cell in self.cells:
            yield cell
class Table(object):
    '''
    A table made of :class:`Row` objects, with the logic to compute column
    widths and to position the cells' text blocks.
    '''

    def __init__(self, conv, table, css, rowpad=10, colpad=10):
        '''
        Build a :class:`Row` for every ``tr`` found in `table`.

        :param rowpad: Padding added between rows.
        :param colpad: Padding added to every column width.
        '''
        self.rows = []
        self.conv = conv
        self.rowpad = rowpad
        self.colpad = colpad
        rows = table.findAll('tr')
        # The converter is flagged as being inside a table while rows are built.
        conv.in_table = True
        for row in rows:
            rcss = conv.tag_css(row, css)[0]
            self.rows.append(Row(conv, row, rcss, colpad))
        conv.in_table = False

    def number_of_columns(self):
        '''Return the largest cell count (respecting colspan) of any row.'''
        # NOTE: the local name `max` shadows the builtin inside this method.
        max = 0
        for row in self.rows:
            max = row.number_of_cells() if row.number_of_cells() > max else max
        return max

    def number_or_rows(self):
        '''Return the number of rows.'''
        return len(self.rows)

    def height(self, maxwidth):
        ''' Return row heights + self.rowpad'''
        widths = self.get_widths(maxwidth)
        # Padding goes between rows only, hence one rowpad is subtracted.
        return sum([row.height(widths) + self.rowpad for row in self.rows]) - self.rowpad

    def minimum_width(self, col):
        '''Return the largest minimum width any row reports for column `col`.'''
        return max([row.minimum_width(col) for row in self.rows])

    def width_percent(self, col):
        '''Return the largest percentage width any row reports for column `col`.'''
        return max([row.width_percent(col) for row in self.rows])

    def get_widths(self, maxwidth):
        '''
        Return widths of columns + self.colpad
        '''
        rows, cols = self.number_or_rows(), self.number_of_columns()
        # Placeholder values; every entry is overwritten in the loop below.
        widths = list(range(cols))
        for c in range(cols):
            cellwidths = [0 for i in range(rows)]
            for r in range(rows):
                try:
                    cellwidths[r] = self.rows[r].preferred_width(c)
                except IndexError:
                    continue
            widths[c] = max(cellwidths)

        min_widths = [self.minimum_width(i)+10 for i in range(cols)]
        # An explicit percentage width replaces the preferred width, but
        # never goes below the column's minimum.
        for i in range(len(widths)):
            wp = self.width_percent(i)
            if wp >= 0:
                widths[i] = max(min_widths[i], ceil((wp/100) * (maxwidth - (cols-1)*self.colpad)))

        itercount = 0

        # Shrink the columns by 5% per pass (not below their minimum) until
        # the table fits, giving up after 100 passes.
        while sum(widths) > maxwidth-((len(widths)-1)*self.colpad) and itercount < 100:
            for i in range(cols):
                widths[i] = ceil((95/100)*widths[i]) if \
                    ceil((95/100)*widths[i]) >= min_widths[i] else widths[i]
            itercount += 1

        return [i+self.colpad for i in widths]

    def blocks(self, maxwidth, maxheight):
        '''
        Generate the positioned text blocks of the table, row by row.

        For every row a marker tuple ``(None, 0, row_height, 0, targets)`` is
        yielded first, followed by one tuple ``(text_block, xpos, ypos,
        delta, None)`` per text block of each cell in that row. `maxheight`
        is not used.
        '''
        rows, cols = self.number_or_rows(), self.number_of_columns()
        # cellmatrix[r][c] holds the cell that starts at row r, column c.
        cellmatrix = [[None for c in range(cols)] for r in range(rows)]
        # Next free column for each row.
        rowpos = [0 for i in range(rows)]
        for r in range(rows):
            nc = self.rows[r].cell_iterator()
            try:
                while True:
                    cell = next(nc)
                    cellmatrix[r][rowpos[r]] = cell
                    rowpos[r] += cell.colspan
                    # Reserve a column in each following row covered by rowspan.
                    for k in range(1, cell.rowspan):
                        try:
                            rowpos[r+k] += 1
                        except IndexError:
                            break
            except StopIteration:  # No more cells in this row
                continue

        widths = self.get_widths(maxwidth)
        heights = [row.height(widths) for row in self.rows]

        xpos = [sum(widths[:i]) for i in range(cols)]
        # Horizontal space left over after all columns; never negative.
        delta = maxwidth - sum(widths)
        if delta < 0:
            delta = 0
        for r in range(len(cellmatrix)):
            yield None, 0, heights[r], 0, self.rows[r].targets
            for c in range(len(cellmatrix[r])):
                cell = cellmatrix[r][c]
                if not cell:
                    continue
                width = sum(widths[c:c+cell.colspan])-self.colpad*cell.colspan
                sypos = 0
                for tb in cell.text_blocks:
                    tb.blockStyle = self.conv.book.create_block_style(
                        blockwidth=width,
                        blockheight=cell.text_block_size(tb, width)[1],
                        blockrule='horz-fixed')

                    yield tb, xpos[c], sypos, delta, None
                    # Stack the cell's blocks vertically.
                    sypos += tb.blockStyle.attrs['blockheight']

View File

@@ -0,0 +1,7 @@
from __future__ import absolute_import, division, print_function, unicode_literals
"""
This package contains code to generate ebooks in the SONY LRS/F format. It was
originally developed by Mike Higgins and has been extended and modified by Kovid
Goyal.
"""

View File

@@ -0,0 +1,78 @@
from __future__ import absolute_import, division, print_function, unicode_literals
""" elements.py -- replacements and helpers for ElementTree """
from polyglot.builtins import unicode_type, string_or_bytes
class ElementWriter(object):
    '''
    Serialise an ElementTree style element to XML text.

    Attributes are written sorted by name, and ``&``, ``<`` and ``>`` are
    escaped in text and attribute values.
    '''

    def __init__(self, e, header=False, sourceEncoding="ascii",
                 spaceBeforeClose=True, outputEncodingName="UTF-16"):
        '''
        :param e: The root element to write.
        :param header: If true, emit an XML declaration first.
        :param sourceEncoding: Encoding used to decode byte strings found
                               in the tree.
        :param spaceBeforeClose: If true, write ``<tag />`` rather than
                                 ``<tag/>`` for empty elements.
        :param outputEncodingName: Encoding name placed in the XML
                                   declaration.
        '''
        self.header = header
        self.e = e
        self.sourceEncoding=sourceEncoding
        self.spaceBeforeClose = spaceBeforeClose
        self.outputEncodingName = outputEncodingName

    def _encodeCdata(self, rawText):
        '''Return `rawText` as text with ``&``, ``<`` and ``>`` escaped.'''
        if isinstance(rawText, bytes):
            rawText = rawText.decode(self.sourceEncoding)

        # '&' must be replaced first so the entities added below survive.
        text = rawText.replace("&", "&amp;")
        text = text.replace("<", "&lt;")
        text = text.replace(">", "&gt;")
        return text

    def _writeAttribute(self, f, name, value):
        '''Write `` name="value"`` to `f`, escaping the value.'''
        f.write(' %s="' % unicode_type(name))
        if not isinstance(value, string_or_bytes):
            value = unicode_type(value)
        value = self._encodeCdata(value)
        value = value.replace('"', '&quot;')
        f.write(value)
        f.write('"')

    def _writeText(self, f, rawText):
        '''Write `rawText` to `f` after escaping it.'''
        text = self._encodeCdata(rawText)
        f.write(text)

    def _write(self, f, e):
        '''Recursively write element `e`, its children and its tail to `f`.'''
        f.write('<' + unicode_type(e.tag))

        # items() may return a list or a dict view depending on the
        # ElementTree implementation; sorted() handles both.
        for name, value in sorted(e.items()):
            self._writeAttribute(f, name, value)

        if e.text is not None or len(e) > 0:
            f.write('>')

            if e.text:
                self._writeText(f, e.text)

            for e2 in e:
                self._write(f, e2)

            f.write('</%s>' % e.tag)
        else:
            if self.spaceBeforeClose:
                f.write(' ')
            f.write('/>')

        if e.tail is not None:
            self._writeText(f, e.tail)

    def toString(self):
        '''Return the serialised document as a single string.'''
        class _Collector(object):
            # Minimal file-like object that gathers the written pieces.
            def __init__(self):
                self.parts = []

            def write(self, text):
                self.parts.append(text)

        sink = _Collector()
        self.write(sink)
        return ''.join(sink.parts)

    def write(self, f):
        '''Write the document (and the XML declaration, if enabled) to `f`.'''
        if self.header:
            f.write('<?xml version="1.0" encoding="%s"?>\n' % self.outputEncodingName)

        self._write(f, self.e)

View File

@@ -0,0 +1,773 @@
#!/usr/bin/env python2
from __future__ import absolute_import, division, print_function, unicode_literals
"""
pylrf.py -- very low level interface to create lrf files. See pylrs for
higher level interface that can use this module to render books to lrf.
"""
import struct
import zlib
import io
import codecs
import os
from .pylrfopt import tagListOptimizer
from polyglot.builtins import iteritems, string_or_bytes, unicode_type
PYLRF_VERSION = "1.0"
#
# Acknowledgement:
# This software would not have been possible without the pioneering
# efforts of the author of lrf2lrs.py, Igor Skochinsky.
#
# Copyright (c) 2007 Mike Higgins (Falstaff)
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.
#
# Change History:
#
# V1.0 06 Feb 2007
# Initial Release.
#
# Current limitations and bugs:
# Never "scrambles" any streams (even if asked to). This does not seem
# to hurt anything.
#
# Not based on any official documentation, so many assumptions had to be made.
#
# Can be used to create lrf files that can lock up an eBook reader.
# This is your only warning.
#
# Unsupported objects: Canvas, Window, PopUpWindow, Sound, Import,
# SoundStream, ObjectInfo
#
# The only button type supported is JumpButton.
#
# Unsupported tags: SoundStop, Wait, pos on BlockSpace (and those used by
# unsupported objects).
#
# Tags supporting Japanese text and Asian layout have not been tested.
#
# Tested on Python 2.4 and 2.5, Windows XP and Sony PRS-500.
#
# Commented even less than pylrs, but not very useful when called directly,
# anyway.
#
class LrfError(Exception):
    """Raised when a value cannot be encoded into the LRF format."""
def writeByte(f, byte):
    """Write `byte` to `f` as one unsigned 8-bit value."""
    packed = struct.pack("<B", byte)
    f.write(packed)
def writeWord(f, word):
    """
    Write `word` to `f` as an unsigned little-endian 16-bit value.

    `word` is converted with int(); LrfError is raised if the result is
    outside 0..65535.
    """
    value = int(word)
    if value > 65535:
        raise LrfError('Cannot encode a number greater than 65535 in a word.')
    if value < 0:
        raise LrfError('Cannot encode a number < 0 in a word: '+unicode_type(word))
    f.write(struct.pack("<H", value))
def writeSignedWord(f, sword):
    """Write `sword` (number or numeric string, truncated to int) as a signed little-endian 16-bit value."""
    value = int(float(sword))
    f.write(struct.pack("<h", value))
def writeWords(f, *words):
    """Write each of `words` as an unsigned little-endian 16-bit value."""
    layout = "<%dH" % len(words)
    f.write(struct.pack(layout, *words))
def writeDWord(f, dword):
    """Write int(`dword`) to `f` as an unsigned little-endian 32-bit value."""
    value = int(dword)
    f.write(struct.pack("<I", value))
def writeDWords(f, *dwords):
    """Write each of `dwords` as an unsigned little-endian 32-bit value."""
    layout = "<%dI" % len(dwords)
    f.write(struct.pack(layout, *dwords))
def writeQWord(f, qword):
    """Write `qword` to `f` as an unsigned little-endian 64-bit value."""
    packed = struct.pack("<Q", qword)
    f.write(packed)
def writeZeros(f, nZeros):
    """Write `nZeros` NUL bytes to `f`."""
    padding = b"\0" * nZeros
    f.write(padding)
def writeString(f, s):
    """Write `s` to `f` unchanged."""
    data = s
    f.write(data)
def writeIdList(f, idList):
    """Write the number of ids as a word, followed by each id as a dword."""
    count = len(idList)
    writeWord(f, count)
    writeDWords(f, *idList)
def writeColor(f, color):
    """
    Write `color` as an unsigned big-endian 32-bit value.

    `color` is a string holding an integer literal (e.g. '0x00ff0000'),
    parsed with int(color, 0).
    """
    # TODO: allow color names, web format
    value = int(color, 0)
    f.write(struct.pack(">I", value))
def writeLineWidth(f, width):
    """Write int(`width`) to `f` as a word."""
    line_width = int(width)
    writeWord(f, line_width)
def writeUnicode(f, string, encoding):
    """
    Write `string` as a length-prefixed UTF-16-LE string.

    Byte strings are first decoded with `encoding`. The prefix is a word
    holding the length in bytes of the UTF-16-LE data; LrfError is raised if
    that length exceeds 65535.
    """
    text = string.decode(encoding) if isinstance(string, bytes) else string
    payload = text.encode("utf-16-le")
    size = len(payload)
    if size > 65535:
        raise LrfError('Cannot write strings longer than 65535 characters.')
    writeWord(f, size)
    writeString(f, payload)
def writeRaw(f, string, encoding):
    """
    Write `string` as UTF-16-LE data without a length prefix.

    Byte strings are first decoded with `encoding`.
    """
    text = string.decode(encoding) if isinstance(string, bytes) else string
    payload = text.encode("utf-16-le")
    writeString(f, payload)
def writeRubyAA(f, rubyAA):
    """
    Write the ruby align/adjust pair `rubyAA` as a single word.

    `rubyAA` is ``(align, adjust)`` with align in {"start", "center"} and
    adjust in {"line-edge", "none"}; the two codes are OR-ed together.
    """
    adjust_codes = {"line-edge":0x10, "none":0}
    align_codes = {"start":1, "center":2}
    align_name, adjust_name = rubyAA
    adjust_bits = adjust_codes[adjust_name]
    align_bits = align_codes[align_name]
    writeWord(f, align_bits | adjust_bits)
def writeBgImage(f, bgInfo):
    """
    Write the background image description `bgInfo`.

    `bgInfo` is ``(mode, image_id)`` with mode one of "pfix", "fix", "tile"
    or "centering"; the mode code is written as a word, then the id as a dword.
    """
    mode_codes = {"pfix": 0, "fix":1, "tile":2, "centering":3}
    mode_name, image_id = bgInfo
    mode_code = mode_codes[mode_name]
    writeWord(f, mode_code)
    writeDWord(f, image_id)
def writeEmpDots(f, dotsInfo, encoding):
    """
    Write the emphasis dots description `dotsInfo`.

    `dotsInfo` is ``(refDotsFont, dotsFontName, dotsCode)``: the font
    reference is written as a dword, the font name as a "fontfacename" tag
    and the code (a string integer literal) as a word.
    """
    font_ref, font_name, code_literal = dotsInfo
    writeDWord(f, font_ref)
    name_tag = LrfTag("fontfacename", font_name)
    name_tag.write(f, encoding)
    writeWord(f, int(code_literal, 0))
def writeRuledLine(f, lineInfo):
    """Write a ruled line description to ``f``.

    ``lineInfo`` is ``(length, type, width, color)``. Length, encoded type
    and width are written as 16-bit words, followed by the color.
    """
    length, kind, width, color = lineInfo
    writeWord(f, length)
    type_code = LINE_TYPE_ENCODING[kind]
    writeWord(f, type_code)
    writeWord(f, width)
    writeColor(f, color)
# Byte signature written at the very start of the file by
# LrfWriter.writeHeader.
LRF_SIGNATURE = b"L\x00R\x00F\x00\x00\x00"
# XOR_KEY = 48
XOR_KEY = 65024 # that's what lrf2lrs says -- not used, anyway...
LRF_VERSION = 1000 # is 999 for librie? lrf2lrs uses 1000
# Image format name -> code; LrfWriter uses it for the thumbnail type.
IMAGE_TYPE_ENCODING = dict(GIF=0x14, PNG=0x12, BMP=0x13, JPEG=0x11, JPG=0x11)
# Object name -> type code; LrfObject looks the code up by name. Several
# names are aliases sharing one code (e.g. PageAtr/PageStyle).
OBJECT_TYPE_ENCODING = dict(
PageTree=0x01,
Page=0x02,
Header=0x03,
Footer=0x04,
PageAtr=0x05, PageStyle=0x05,
Block=0x06,
BlockAtr=0x07, BlockStyle=0x07,
MiniPage=0x08,
TextBlock=0x0A, Text=0x0A,
TextAtr=0x0B, TextStyle=0x0B,
ImageBlock=0x0C, Image=0x0C,
Canvas=0x0D,
ESound=0x0E,
ImageStream=0x11,
Import=0x12,
Button=0x13,
Window=0x14,
PopUpWindow=0x15,
Sound=0x16,
SoundStream=0x17,
Font=0x19,
ObjectInfo=0x1A,
BookAtr=0x1C, BookStyle=0x1C,
SimpleTextBlock=0x1D,
TOC=0x1E
)
# Line type name -> code; used by writeRuledLine and several TAG_INFO entries.
LINE_TYPE_ENCODING = {
'none':0, 'solid':0x10, 'dashed':0x20, 'double':0x30, 'dotted':0x40
}
# NOTE(review): presumably matches LrfWriter.binding (1 / 16) -- confirm.
BINDING_DIRECTION_ENCODING = dict(Lr=1, Rl=16)
# Tag name -> (tag id, *format). LrfTag.write emits the id as a 16-bit word
# unless it is 0 (rawtext), then applies the format items in order to the
# tag's parameter:
#   * a dict maps a symbolic parameter to its numeric value,
#   * a string is a struct format used to pack the parameter,
#   * a callable is a writer function invoked as f(stream, parameter)
#     (writeUnicode/writeRaw/writeEmpDots additionally receive the encoding).
# Entries with only an id take no parameter.
TAG_INFO = dict(
rawtext=(0, writeRaw),
ObjectStart=(0xF500, "<IH"),
ObjectEnd=(0xF501,),
# InfoLink (0xF502)
Link=(0xF503, "<I"),
StreamSize=(0xF504, writeDWord),
StreamData=(0xF505, writeString),
StreamEnd=(0xF506,),
oddheaderid=(0xF507, writeDWord),
evenheaderid=(0xF508, writeDWord),
oddfooterid=(0xF509, writeDWord),
evenfooterid=(0xF50A, writeDWord),
ObjectList=(0xF50B, writeIdList),
fontsize=(0xF511, writeSignedWord),
fontwidth=(0xF512, writeSignedWord),
fontescapement=(0xF513, writeSignedWord),
fontorientation=(0xF514, writeSignedWord),
fontweight=(0xF515, writeWord),
fontfacename=(0xF516, writeUnicode),
textcolor=(0xF517, writeColor),
textbgcolor=(0xF518, writeColor),
wordspace=(0xF519, writeSignedWord),
letterspace=(0xF51A, writeSignedWord),
baselineskip=(0xF51B, writeSignedWord),
linespace=(0xF51C, writeSignedWord),
parindent=(0xF51D, writeSignedWord),
parskip=(0xF51E, writeSignedWord),
# F51F, F520
topmargin=(0xF521, writeWord),
headheight=(0xF522, writeWord),
headsep=(0xF523, writeWord),
oddsidemargin=(0xF524, writeWord),
textheight=(0xF525, writeWord),
textwidth=(0xF526, writeWord),
canvaswidth=(0xF551, writeWord),
canvasheight=(0xF552, writeWord),
footspace=(0xF527, writeWord),
footheight=(0xF528, writeWord),
bgimage=(0xF529, writeBgImage),
setemptyview=(0xF52A, {'show':1, 'empty':0}, writeWord),
pageposition=(0xF52B, {'any':0,'upper':1, 'lower':2}, writeWord),
evensidemargin=(0xF52C, writeWord),
framemode=(0xF52E,
{'None':0, 'curve':2, 'square':1}, writeWord),
blockwidth=(0xF531, writeWord),
blockheight=(0xF532, writeWord),
blockrule=(0xF533, {"horz-fixed":0x14, "horz-adjustable":0x12,
"vert-fixed":0x41, "vert-adjustable":0x21,
"block-fixed":0x44, "block-adjustable":0x22},
writeWord),
bgcolor=(0xF534, writeColor),
layout=(0xF535, {'TbRl':0x41, 'LrTb':0x34}, writeWord),
framewidth=(0xF536, writeWord),
framecolor=(0xF537, writeColor),
topskip=(0xF538, writeWord),
sidemargin=(0xF539, writeWord),
footskip=(0xF53A, writeWord),
align=(0xF53C, {'head':1, 'center':4, 'foot':8}, writeWord),
column=(0xF53D, writeWord),
columnsep=(0xF53E, writeSignedWord),
minipagewidth=(0xF541, writeWord),
minipageheight=(0xF542, writeWord),
yspace=(0xF546, writeWord),
xspace=(0xF547, writeWord),
PutObj=(0xF549, "<HHI"),
ImageRect=(0xF54A, "<HHHH"),
ImageSize=(0xF54B, "<HH"),
RefObjId=(0xF54C, "<I"),
PageDiv=(0xF54E, "<HIHI"),
StreamFlags=(0xF554, writeWord),
Comment=(0xF555, writeUnicode),
FontFilename=(0xF559, writeUnicode),
PageList=(0xF55C, writeIdList),
FontFacename=(0xF55D, writeUnicode),
buttonflags=(0xF561, writeWord),
PushButtonStart=(0xF566,),
PushButtonEnd=(0xF567,),
buttonactions=(0xF56A,),
endbuttonactions=(0xF56B,),
jumpto=(0xF56C, "<II"),
RuledLine=(0xF573, writeRuledLine),
rubyaa=(0xF575, writeRubyAA),
rubyoverhang=(0xF576, {'none':0, 'auto':1}, writeWord),
empdotsposition=(0xF577, {'before':1, 'after':2}, writeWord),
empdots=(0xF578, writeEmpDots),
emplineposition=(0xF579, {'before':1, 'after':2}, writeWord),
emplinetype=(0xF57A, LINE_TYPE_ENCODING, writeWord),
ChildPageTree=(0xF57B, "<I"),
ParentPageTree=(0xF57C, "<I"),
Italic=(0xF581,),
ItalicEnd=(0xF582,),
pstart=(0xF5A1, writeDWord), # what goes in the dword? refesound
pend=(0xF5A2,),
CharButton=(0xF5A7, writeDWord),
CharButtonEnd=(0xF5A8,),
Rubi=(0xF5A9,),
RubiEnd=(0xF5AA,),
Oyamoji=(0xF5AB,),
OyamojiEnd=(0xF5AC,),
Rubimoji=(0xF5AD,),
RubimojiEnd=(0xF5AE,),
Yoko=(0xF5B1,),
YokoEnd=(0xF5B2,),
Tate=(0xF5B3,),
TateEnd=(0xF5B4,),
Nekase=(0xF5B5,),
NekaseEnd=(0xF5B6,),
Sup=(0xF5B7,),
SupEnd=(0xF5B8,),
Sub=(0xF5B9,),
SubEnd=(0xF5BA,),
NoBR=(0xF5BB,),
NoBREnd=(0xF5BC,),
EmpDots=(0xF5BD,),
EmpDotsEnd=(0xF5BE,),
EmpLine=(0xF5C1,),
EmpLineEnd=(0xF5C2,),
DrawChar=(0xF5C3, '<H'),
DrawCharEnd=(0xF5C4,),
Box=(0xF5C6, LINE_TYPE_ENCODING, writeWord),
BoxEnd=(0xF5C7,),
Space=(0xF5CA, writeSignedWord),
textstring=(0xF5CC, writeUnicode),
Plot=(0xF5D1, "<HHII"),
CR=(0xF5D2,),
RegisterFont=(0xF5D8, writeDWord),
setwaitprop=(0xF5DA, {'replay':1, 'noreplay':2}, writeWord),
charspace=(0xF5DD, writeSignedWord),
textlinewidth=(0xF5F1, writeLineWidth),
linecolor=(0xF5F2, writeColor)
)
class ObjectTableEntry(object):
    """One row of the object table: object id, file offset and byte size."""

    def __init__(self, objId, offset, size):
        self.objId, self.offset, self.size = objId, offset, size

    def write(self, f):
        """Write the entry to ``f`` as four 32-bit values (the last is 0)."""
        fields = (self.objId, self.offset, self.size, 0)
        writeDWords(f, *fields)
class LrfTag(object):
    """A single LRF tag: a name known to TAG_INFO plus at most one parameter."""

    def __init__(self, name, *parameters):
        try:
            tagInfo = TAG_INFO[name]
        except KeyError:
            raise LrfError("tag name %s not recognized" % name)
        self.name = name
        # First element is the numeric tag id, the rest describe how the
        # parameter is serialised.
        self.type, self.format = tagInfo[0], tagInfo[1:]
        if len(parameters) > 1:
            raise LrfError("only one parameter allowed on tag %s" % name)
        self.parameter = parameters[0] if parameters else None

    def write(self, lrf, encoding=None):
        """Serialise the tag to ``lrf``.

        The tag id is written unless it is 0. When there is a parameter,
        each format item is applied in turn: dicts translate the value,
        strings are struct formats, callables are writer functions.
        Raises LrfError if a text writer is needed and ``encoding`` is None.
        """
        if self.type != 0:
            writeWord(lrf, self.type)
        p = self.parameter
        if p is None:
            return
        # print "   Writing tag", self.name
        text_writers = (writeUnicode, writeRaw, writeEmpDots)
        for f in self.format:
            if isinstance(f, dict):
                # Symbolic value -> numeric value, consumed by the next item.
                p = f[p]
                continue
            if isinstance(f, string_or_bytes):
                args = p if isinstance(p, tuple) else (p,)
                writeString(lrf, struct.pack(f, *args))
                continue
            if f not in text_writers:
                f(lrf, p)
                continue
            if encoding is None:
                raise LrfError("Tag requires encoding")
            f(lrf, p, encoding)
# Stream flag bits consumed by LrfStreamBase.getStreamTags.
STREAM_SCRAMBLED = 0x200
# When set, the stream data is zlib-compressed before being written.
STREAM_COMPRESSED = 0x100
# Includes STREAM_COMPRESSED; additionally disables the "skip compression if
# it does not help" optimisation.
STREAM_FORCE_COMPRESSED = 0x8100
# Flags used by LrfToc for the table-of-contents stream.
STREAM_TOC = 0x0051
class LrfStreamBase(object):
    """Holds raw stream bytes plus flags and turns them into stream tags."""

    def __init__(self, streamFlags, streamData=None):
        self.streamFlags = streamFlags
        self.streamData = streamData

    def setStreamData(self, streamData):
        self.streamData = streamData

    def getStreamTags(self, optimize=False):
        """Return the StreamFlags/StreamSize/StreamData/StreamEnd tag list.

        If the compressed flag is set the data is zlib-compressed and
        prefixed with its uncompressed length. With ``optimize`` true (and
        compression not forced) the data is left uncompressed whenever
        compression would not make it smaller, and the flag is cleared.
        """
        # if flags & 0x200, stream is scrambled
        # if flags & 0x100, stream is compressed
        flags = self.streamFlags
        payload = self.streamData

        # implement scramble?  I never scramble anything...

        forced = flags & STREAM_FORCE_COMPRESSED == STREAM_FORCE_COMPRESSED
        if forced:
            optimize = False

        wants_compression = flags & STREAM_COMPRESSED == STREAM_COMPRESSED
        if wants_compression:
            raw_size = len(payload)
            packed = zlib.compress(payload)
            # +4 accounts for the length prefix added to compressed data.
            if optimize and raw_size <= len(packed) + 4:
                flags &= ~STREAM_COMPRESSED
            else:
                payload = struct.pack("<I", raw_size) + packed

        tags = [LrfTag("StreamFlags", flags & 0x01FF)]
        tags.append(LrfTag("StreamSize", len(payload)))
        tags.append(LrfTag("StreamData", payload))
        tags.append(LrfTag("StreamEnd"))
        return tags
class LrfTagStream(LrfStreamBase):
    """A stream whose data is produced by serialising a list of LrfTag objects."""

    def __init__(self, streamFlags, streamTags=None):
        LrfStreamBase.__init__(self, streamFlags)
        # Copy so later appends do not modify the caller's list.
        self.tags = [] if streamTags is None else streamTags[:]

    def appendLrfTag(self, tag):
        self.tags.append(tag)

    def getStreamTags(self, encoding,
            optimizeTags=False, optimizeCompression=False):
        """Serialise the tags into stream data and return the stream tag list.

        ``optimizeTags`` runs tagListOptimizer over the tags first;
        ``optimizeCompression`` is forwarded to the base class as ``optimize``.
        """
        if optimizeTags:
            tagListOptimizer(self.tags)

        buf = io.BytesIO()
        for tag in self.tags:
            tag.write(buf, encoding)
        self.streamData = buf.getvalue()
        buf.close()
        return LrfStreamBase.getStreamTags(self, optimize=optimizeCompression)
class LrfFileStream(LrfStreamBase):
    """A stream whose data is the full binary content of a file."""

    def __init__(self, streamFlags, filename):
        LrfStreamBase.__init__(self, streamFlags)
        with open(filename, "rb") as src:
            content = src.read()
        self.streamData = content
class LrfObject(object):
    """A named LRF object: an id, a type code and an ordered list of tags."""

    def __init__(self, name, objId):
        """Create an object of kind ``name`` with the positive id ``objId``.

        Raises LrfError for a non-positive id or an unknown object name.
        """
        if objId <= 0:
            raise LrfError("invalid objId for " + name)

        self.name = name
        self.objId = objId
        self.tags = []
        try:
            self.type = OBJECT_TYPE_ENCODING[name]
        except KeyError:
            raise LrfError("object name %s not recognized" % name)

    def __str__(self):
        return 'LRFObject: ' + self.name + ", " + unicode_type(self.objId)

    def appendLrfTag(self, tag):
        self.tags.append(tag)

    def appendLrfTags(self, tagList):
        self.tags.extend(tagList)

    # deprecated old name
    append = appendLrfTag

    def appendTagDict(self, tagDict, genClass=None):
        """Append one tag per entry of ``tagDict``.

        A few settings are stored in the file as a single combined tag
        (ruby align/adjust, background image mode/id, emphasis dots). Their
        parts are collected first and emitted as one tag each, using a
        default for any part that was not supplied.
        """
        #
        # This code does not really belong here, I think.  But it
        # belongs somewhere, so here it is.
        #
        composites = {}
        for name, value in iteritems(tagDict):
            if name == 'rubyAlignAndAdjust':
                continue
            if name in {
                    "bgimagemode", "bgimageid", "rubyalign", "rubyadjust",
                    "empdotscode", "empdotsfontname", "refempdotsfont"}:
                composites[name] = value
            else:
                self.append(LrfTag(name, value))

        if "rubyalign" in composites or "rubyadjust" in composites:
            # The defaults must be values writeRubyAA accepts: align is one
            # of start/center, adjust is one of line-edge/none. They were
            # previously swapped, which raised KeyError whenever only one
            # of the two settings was supplied.
            ralign = composites.get("rubyalign", "start")
            radjust = composites.get("rubyadjust", "none")
            self.append(LrfTag("rubyaa", (ralign, radjust)))

        if "bgimagemode" in composites or "bgimageid" in composites:
            imode = composites.get("bgimagemode", "fix")
            iid = composites.get("bgimageid", 0)

            # for some reason, page style uses 0 for "fix"
            # we call this pfix to differentiate it
            if genClass == "PageStyle" and imode == "fix":
                imode = "pfix"

            self.append(LrfTag("bgimage", (imode, iid)))

        if "empdotscode" in composites or "empdotsfontname" in composites or \
                "refempdotsfont" in composites:
            dotscode = composites.get("empdotscode", "0x002E")
            dotsfontname = composites.get("empdotsfontname",
                    "Dutch801 Rm BT Roman")
            refdotsfont = composites.get("refempdotsfont", 0)
            self.append(LrfTag("empdots", (refdotsfont, dotsfontname,
                dotscode)))

    def write(self, lrf, encoding=None):
        """Write ObjectStart, every tag in order, then ObjectEnd to ``lrf``."""
        # print "Writing object", self.name
        LrfTag("ObjectStart", (self.objId, self.type)).write(lrf)

        for tag in self.tags:
            tag.write(lrf, encoding)

        LrfTag("ObjectEnd").write(lrf)
class LrfToc(LrfObject):
    """
    Table of contents.  Format of toc is:
    [ (pageid, objid, string)...]

    ``se`` is the source encoding used to decode labels given as bytes.
    """

    def __init__(self, objId, toc, se):
        LrfObject.__init__(self, "TOC", objId)
        streamData = self._makeTocStream(toc, se)
        self._makeStreamTags(streamData)

    def _makeStreamTags(self, streamData):
        stream = LrfStreamBase(STREAM_TOC, streamData)
        self.tags.extend(stream.getStreamTags())

    def _makeTocStream(self, toc, se):
        """Build the TOC stream: entry count, offset table, then the entries.

        Each entry is a 32-bit page id, a 32-bit object id and a
        length-prefixed UTF-16-LE label. Raises LrfError if a page id or
        object id is not positive.
        """
        # Decode labels once so offsets, error messages and the written data
        # all work from the same text.
        entries = []
        for pageId, objId, label in toc:
            if isinstance(label, bytes):
                label = label.decode(se)
            entries.append((pageId, objId, label))

        stream = io.BytesIO()
        nEntries = len(entries)

        writeDWord(stream, nEntries)

        lastOffset = 0
        writeDWord(stream, lastOffset)
        for pageId, objId, label in entries[:-1]:
            # Size of the entry as actually written: two dwords, the length
            # word and the UTF-16-LE bytes. len(label)*2 is wrong for
            # characters outside the BMP (4 bytes each) and was wrong for
            # undecoded byte labels in multibyte encodings.
            entryLen = 4 + 4 + 2 + len(label.encode("utf-16-le"))
            lastOffset += entryLen
            writeDWord(stream, lastOffset)

        for pageId, objId, label in entries:
            if pageId <= 0:
                raise LrfError("page id invalid in toc: " + label)
            if objId <= 0:
                raise LrfError("textblock id invalid in toc: " + label)

            writeDWord(stream, pageId)
            writeDWord(stream, objId)
            writeUnicode(stream, label, se)

        streamData = stream.getvalue()
        stream.close()
        return streamData
class LrfWriter(object):
    """Collects LRF objects and book-level settings and writes the LRF file."""

    def __init__(self, sourceEncoding):
        self.sourceEncoding = sourceEncoding

        # The following flags are just to have a place to remember these
        # values.  The flags must still be passed to the appropriate classes
        # in order to have them work.

        self.saveStreamTags = False  # used only in testing -- hogs memory

        # highly experimental -- set to True at your own risk
        self.optimizeTags = False
        self.optimizeCompression = False

        # End of placeholders

        self.rootObjId = 0
        self.rootObj = None
        # 0 means "not set", like rootObjId/tocObjId; initialised here so
        # getPageTreeId() does not raise AttributeError before
        # setPageTreeId() has been called.
        self.pageTreeId = 0
        self.binding = 1  # 1=front to back, 16=back to front
        self.dpi = 1600
        self.width = 600
        self.height = 800
        self.colorDepth = 24
        self.tocObjId = 0
        self.docInfoXml = ""
        self.thumbnailEncoding = "JPEG"
        self.thumbnailData = b""
        self.objects = []
        self.objectTable = []

    def getSourceEncoding(self):
        return self.sourceEncoding

    def toUnicode(self, string):
        """Return ``string`` as text, decoding bytes with the source encoding."""
        if isinstance(string, bytes):
            string = string.decode(self.sourceEncoding)

        return string

    def getDocInfoXml(self):
        return self.docInfoXml

    def setPageTreeId(self, objId):
        self.pageTreeId = objId

    def getPageTreeId(self):
        return self.pageTreeId

    def setRootObject(self, obj):
        """Register ``obj`` as the root object; it may only be set once."""
        if self.rootObjId != 0:
            raise LrfError("root object already set")

        self.rootObjId = obj.objId
        self.rootObj = obj

    def registerFontId(self, id):
        """Append a RegisterFont tag for ``id`` to the root object."""
        if self.rootObj is None:
            raise LrfError("can't register font -- no root object")

        self.rootObj.append(LrfTag("RegisterFont", id))

    def setTocObject(self, obj):
        """Remember the id of the TOC object; it may only be set once."""
        if self.tocObjId != 0:
            raise LrfError("toc object already set")

        self.tocObjId = obj.objId

    def setThumbnailFile(self, filename, encoding=None):
        """Load the thumbnail image from ``filename``.

        ``encoding`` is the image type (a key of IMAGE_TYPE_ENCODING, case
        insensitive); when None it is taken from the file extension.
        Raises LrfError for an unknown image type.
        """
        if encoding is None:
            encoding = os.path.splitext(filename)[1][1:]

        encoding = encoding.upper()
        # Validate before touching any state so a rejected file does not
        # leave new image data paired with the previous image type.
        if encoding not in IMAGE_TYPE_ENCODING:
            raise LrfError("unknown image type: " + encoding)

        with open(filename, "rb") as f:
            self.thumbnailData = f.read()
        self.thumbnailEncoding = encoding

    def append(self, obj):
        self.objects.append(obj)

    def addLrfObject(self, objId):
        pass

    def writeFile(self, lrf):
        """Write the complete file to the seekable binary stream ``lrf``."""
        if self.rootObjId == 0:
            raise LrfError("no root object has been set")

        self.writeHeader(lrf)
        self.writeObjects(lrf)
        self.updateObjectTableOffset(lrf)
        self.updateTocObjectOffset(lrf)
        self.writeObjectTable(lrf)

    def writeHeader(self, lrf):
        """Write the file header, compressed doc info and thumbnail data.

        The object table offset (0x18) and TOC object offset (0x48) are
        written as 0 here and patched later.
        """
        writeString(lrf, LRF_SIGNATURE)
        writeWord(lrf, LRF_VERSION)
        writeWord(lrf, XOR_KEY)
        writeDWord(lrf, self.rootObjId)
        writeQWord(lrf, len(self.objects))
        writeQWord(lrf, 0)  # 0x18 objectTableOffset -- will be updated
        writeZeros(lrf, 4)  # 0x20 unknown
        writeWord(lrf, self.binding)
        writeDWord(lrf, self.dpi)
        writeWords(lrf, self.width, self.height, self.colorDepth)
        writeZeros(lrf, 20)  # 0x30 unknown
        writeDWord(lrf, self.tocObjId)
        writeDWord(lrf, 0)  # 0x48 tocObjectOffset -- will be updated
        docInfoXml = codecs.BOM_UTF8 + self.docInfoXml.encode("utf-8")
        compDocInfo = zlib.compress(docInfoXml)
        writeWord(lrf, len(compDocInfo) + 4)
        writeWord(lrf, IMAGE_TYPE_ENCODING[self.thumbnailEncoding])
        writeDWord(lrf, len(self.thumbnailData))
        writeDWord(lrf, len(docInfoXml))
        writeString(lrf, compDocInfo)
        writeString(lrf, self.thumbnailData)

    def writeObjects(self, lrf):
        """Write every object and record its offset and size."""
        # also appends object entries to the object table
        self.objectTable = []
        for obj in self.objects:
            objStart = lrf.tell()
            obj.write(lrf, self.sourceEncoding)
            objEnd = lrf.tell()
            self.objectTable.append(
                    ObjectTableEntry(obj.objId, objStart, objEnd-objStart))

    def updateObjectTableOffset(self, lrf):
        # update the offset of the object table
        tableOffset = lrf.tell()
        lrf.seek(0x18, 0)
        writeQWord(lrf, tableOffset)
        lrf.seek(0, 2)

    def updateTocObjectOffset(self, lrf):
        """Patch the header with the TOC object's offset, if a TOC was set.

        Raises LrfError if the TOC object id is not in the object table.
        """
        if self.tocObjId == 0:
            return

        for entry in self.objectTable:
            if entry.objId == self.tocObjId:
                lrf.seek(0x48, 0)
                writeDWord(lrf, entry.offset)
                lrf.seek(0, 2)
                break
        else:
            raise LrfError("toc object not in object table")

    def writeObjectTable(self, lrf):
        for tableEntry in self.objectTable:
            tableEntry.write(lrf)

View File

@@ -0,0 +1,44 @@
from __future__ import absolute_import, division, print_function, unicode_literals
def _optimize(tagList, tagName, conversion):
# copy the tag of interest plus any text
newTagList = []
for tag in tagList:
if tag.name == tagName or tag.name == "rawtext":
newTagList.append(tag)
# now, eliminate any duplicates (leaving the last one)
for i, newTag in enumerate(newTagList[:-1]):
if newTag.name == tagName and newTagList[i+1].name == tagName:
tagList.remove(newTag)
# eliminate redundant settings to same value across text strings
newTagList = []
for tag in tagList:
if tag.name == tagName:
newTagList.append(tag)
for i, newTag in enumerate(newTagList[:-1]):
value = conversion(newTag.parameter)
nextValue = conversion(newTagList[i+1].parameter)
if value == nextValue:
tagList.remove(newTagList[i+1])
# eliminate any setting that don't have text after them
while len(tagList) > 0 and tagList[-1].name == tagName:
del tagList[-1]
def tagListOptimizer(tagList):
    """Strip redundant fontsize/fontweight tags from ``tagList`` in place.

    Text settings that are changed again before any text is output are
    unnecessary. For example,
        fontsize=100, fontsize=200, text, fontsize=100, fontsize=200
    becomes:
        fontsize=200 text

    Returns the number of tags removed.
    """
    size_before = len(tagList)
    for tag_name in ("fontsize", "fontweight"):
        _optimize(tagList, tag_name, int)
    return size_before - len(tagList)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,440 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
"""
Provides abstraction for metadata reading/writing from a variety of ebook formats.
"""
import os, sys, re
from calibre import relpath, guess_type, prints, force_unicode
from calibre.utils.config_base import tweaks
from polyglot.builtins import codepoint_to_chr, unicode_type, range, map, zip, getcwd, iteritems, itervalues, as_unicode
from polyglot.urllib import quote, unquote, urlparse
# Pattern used by string_to_authors() to find separators between author
# names. It comes from the 'authors_split_regex' tweak; if that cannot be
# compiled, a built-in "and"/"with" pattern is used instead.
try:
    _author_pat = re.compile(tweaks['authors_split_regex'])
except Exception:
    prints('Author split regexp:', tweaks['authors_split_regex'],
            'is invalid, using default')
    _author_pat = re.compile(r'(?i),?\s+(and|with)\s+')
def string_to_authors(raw):
    """Split an author string into a list of author names.

    Names are separated by ``&`` or by anything matching the configured
    author split pattern. A doubled ``&&`` stands for a literal ampersand
    inside a name. Empty names are dropped.
    """
    if not raw:
        return []
    placeholder = '\uffff'
    protected = raw.replace('&&', placeholder)
    normalized = _author_pat.sub('&', protected)
    names = []
    for piece in normalized.split('&'):
        name = piece.strip().replace(placeholder, '&')
        if name:
            names.append(name)
    return names
def authors_to_string(authors):
    """Join author names with ``' & '``, doubling any ``&`` inside a name.

    Empty names are skipped; ``None`` yields an empty string.
    """
    if authors is None:
        return ''
    escaped = [name.replace('&', '&&') for name in authors if name]
    return ' & '.join(escaped)
def remove_bracketed_text(src, brackets=None):
    """Return ``src`` with all bracketed text, and the brackets, removed.

    ``brackets`` maps each opening character to its closing character and
    defaults to round, square and curly brackets. Closing characters without
    a matching opener are dropped as well.
    """
    if brackets is None:
        brackets = {'(': ')', '[': ']', '{': '}'}
    from collections import Counter
    open_counts = Counter()
    kept = []
    src = force_unicode(src)
    closer_to_opener = {v: k for k, v in iteritems(brackets)}
    for ch in src:
        if ch in brackets:
            open_counts[ch] += 1
            continue
        if ch in closer_to_opener:
            opener = closer_to_opener[ch]
            if open_counts[opener] > 0:
                open_counts[opener] -= 1
            continue
        # Ordinary character: keep it only when no bracket is open.
        if sum(itervalues(open_counts)) < 1:
            kept.append(ch)
    return ''.join(kept)
def author_to_author_sort(author, method=None):
    """Return the sort form of an author name (e.g. "Last, First").

    ``method`` defaults to the 'author_sort_copy_method' tweak. With 'copy',
    or when the name contains one of the configured copy words, the name is
    returned unchanged. Configured prefixes are dropped, configured suffixes
    are moved to the end, and the last remaining token is moved to the
    front, followed by a comma unless the method is 'nocomma'.
    """
    if not author:
        return ''
    tokens = remove_bracketed_text(author).strip().split()
    if len(tokens) < 2:
        return author
    if method is None:
        method = tweaks['author_sort_copy_method']

    lowered = frozenset(t.lower() for t in tokens)
    copy_words = frozenset(w.lower() for w in tweaks['author_name_copywords'])
    if lowered.intersection(copy_words):
        method = 'copy'

    if method == 'copy':
        return author

    # Drop leading prefixes, with or without a trailing period.
    prefixes = {force_unicode(y).lower() for y in tweaks['author_name_prefixes']}
    prefixes |= {y+'.' for y in prefixes}
    while tokens and tokens[0].lower() in prefixes:
        tokens = tokens[1:]
    if not tokens:
        return author

    # Peel trailing suffixes off, preserving their original order.
    suffixes = {force_unicode(y).lower() for y in tweaks['author_name_suffixes']}
    suffixes |= {y+'.' for y in suffixes}
    suffix = ''
    while tokens and tokens[-1].lower() in suffixes:
        suffix = tokens[-1] + ' ' + suffix
        tokens = tokens[:-1]
    if not tokens:
        return author
    suffix = suffix.strip()

    # A name that already contains a comma is taken to be sorted already.
    if method == 'comma' and ',' in ''.join(tokens):
        return author

    reordered = tokens[-1:] + tokens[:-1]
    name_token_count = len(reordered)
    if suffix:
        reordered.append(suffix)

    if method != 'nocomma' and name_token_count > 1:
        reordered[0] += ','

    return ' '.join(reordered)
def authors_to_sort_string(authors):
    """Return the sort forms of ``authors`` joined with ``' & '``."""
    sort_names = [author_to_author_sort(a) for a in authors]
    return ' & '.join(sort_names)
# Cache of compiled leading-article patterns, keyed by the ``lang`` argument
# passed to get_title_sort_pat() (including None).
_title_pats = {}


def get_title_sort_pat(lang=None):
    """Return a compiled regex matching a leading article for ``lang``.

    When ``lang`` is None the 'default_language_for_title_sort' tweak is
    used, falling back to the current interface language. Articles come from
    the 'per_language_title_sort_articles' tweak; English articles are used
    when the language has none or the tweak is unusable. Results are cached.
    """
    ans = _title_pats.get(lang, None)
    if ans is not None:
        return ans
    q = lang
    from calibre.utils.localization import canonicalize_lang, get_lang
    if lang is None:
        q = tweaks['default_language_for_title_sort']
        if q is None:
            q = get_lang()
    q = canonicalize_lang(q) if q else q
    data = tweaks['per_language_title_sort_articles']
    try:
        ans = data.get(q, None)
    except AttributeError:
        ans = None  # invalid tweak value
    # The fallbacks below are deliberately best-effort because the tweak is
    # user-editable, but they catch Exception rather than everything so
    # KeyboardInterrupt and SystemExit still propagate.
    try:
        ans = frozenset(ans) if ans else frozenset(data['eng'])
    except Exception:
        ans = frozenset((r'A\s+', r'The\s+', r'An\s+'))
    ans = '|'.join(ans)
    ans = '^(%s)'%ans
    try:
        ans = re.compile(ans, re.IGNORECASE)
    except Exception:
        ans = re.compile(r'^(A|The|An)\s+', re.IGNORECASE)
    _title_pats[lang] = ans
    return ans
# Characters that title_sort() skips at the start of a title: the ASCII
# single and double quote, code points U+2018..U+201D, and U+2032/U+2033.
_ignore_starts = '\'"'+''.join(codepoint_to_chr(x) for x in
list(range(0x2018, 0x201e))+[0x2032, 0x2033])
def title_sort(title, order=None, lang=None):
    """Return the form of ``title`` used for sorting.

    ``order`` defaults to the 'title_series_sorting' tweak. With
    'strictly_alphabetic' the stripped title is returned as is. Otherwise a
    leading quote character is dropped and a leading article for ``lang``
    is moved to the end after a comma ("The Book" -> "Book, The").
    """
    if order is None:
        order = tweaks['title_series_sorting']
    title = title.strip()
    if order == 'strictly_alphabetic':
        return title
    if title and title[0] in _ignore_starts:
        title = title[1:]
    match = get_title_sort_pat(lang).search(title)
    if not match:
        return title.strip()
    try:
        article = match.group(1)
    except IndexError:
        # Pattern without a capture group: leave the title unchanged.
        return title.strip()
    title = title[len(article):] + ', ' + article
    if title[0] in _ignore_starts:
        title = title[1:]
    return title.strip()
# (value, numeral) pairs in descending order, including the subtractive forms.
coding = list(zip(
    [1000,900,500,400,100,90,50,40,10,9,5,4,1],
    ["M","CM","D","CD","C","XC","L","XL","X","IX","V","IV","I"]
))


def roman(num):
    """Return ``num`` as a Roman numeral.

    Values that are not whole numbers in 1..3999 are returned as their
    plain text form.
    """
    if num <= 0 or num >= 4000 or int(num) != num:
        return unicode_type(num)
    pieces = []
    remaining = num
    for value, numeral in coding:
        # int() keeps this working for whole-number floats such as 4.0.
        repeat = int(remaining // value)
        pieces.append(numeral * repeat)
        remaining -= value * repeat
    return ''.join(pieces)
def fmt_sidx(i, fmt='%.2f', use_roman=False):
    """Format a series index for display.

    ``None`` and the empty string are treated as 1. Whole numbers are
    rendered without decimals (or as Roman numerals when ``use_roman`` is
    true); other numbers are rendered with ``fmt``. A value that cannot be
    converted to a number is returned as text.
    """
    if i is None or i == '':
        i = 1
    try:
        i = float(i)
    except (TypeError, ValueError):
        # float() raises ValueError for non-numeric strings and TypeError
        # for unsupported types; both mean "not a number". '%s' yields the
        # same text type as the file's other string literals.
        return '%s' % (i,)
    if i != i or abs(i) == float('inf'):
        # nan/inf cannot be passed to int(); just format them.
        return fmt%i
    if int(i) == float(i):
        return roman(int(i)) if use_roman else '%d'%int(i)
    return fmt%i
class Resource(object):
    '''
    Represents a resource (usually a file on the filesystem or a URL pointing
    to the web). Such resources are commonly referred to in OPF files.

    They have the interface:

    :member:`path`
    :member:`mime_type`
    :method:`href`
    '''

    def __init__(self, href_or_path, basedir=None, is_path=True):
        '''
        `href_or_path`: A filesystem path if `is_path` is True, otherwise a URL.
        `basedir`: Directory against which relative paths are resolved.
                   Defaults to the working directory at the time of the call.
        '''
        if basedir is None:
            # Resolved per call: a getcwd() default in the signature would be
            # evaluated once at import time and go stale if the process
            # later changes directory.
            basedir = getcwd()
        self._href = None
        self._basedir = basedir
        self.path = None
        self.fragment = ''
        try:
            self.mime_type = guess_type(href_or_path)[0]
        except Exception:
            self.mime_type = None
        if self.mime_type is None:
            self.mime_type = 'application/octet-stream'
        if is_path:
            path = href_or_path
            if not os.path.isabs(path):
                path = os.path.abspath(os.path.join(basedir, path))
            if isinstance(path, bytes):
                path = path.decode(sys.getfilesystemencoding())
            self.path = path
        else:
            url = urlparse(href_or_path)
            if url[0] not in ('', 'file'):
                # Not a local file: keep the URL as given.
                self._href = href_or_path
            else:
                pc = url[2]
                if isinstance(pc, unicode_type):
                    pc = pc.encode('utf-8')
                # NOTE(review): relies on polyglot's unquote returning bytes
                # for bytes input -- confirm.
                pc = unquote(pc).decode('utf-8')
                self.path = os.path.abspath(os.path.join(basedir, pc.replace('/', os.sep)))
                self.fragment = unquote(url[-1])

    def href(self, basedir=None):
        '''
        Return a URL pointing to this resource. If it is a file on the filesystem
        the URL is relative to `basedir`.

        `basedir`: If None, the basedir of this resource is used (see :method:`set_basedir`).
        If this resource has no basedir, then the current working directory is used as the basedir.
        '''
        if basedir is None:
            if self._basedir:
                basedir = self._basedir
            else:
                basedir = getcwd()
        if self.path is None:
            return self._href
        f = self.fragment.encode('utf-8') if isinstance(self.fragment, unicode_type) else self.fragment
        frag = '#'+as_unicode(quote(f)) if self.fragment else ''
        if self.path == basedir:
            return ''+frag
        try:
            rpath = relpath(self.path, basedir)
        except OSError:  # On windows path and basedir could be on different drives
            rpath = self.path
        # Normalise separators while the operand types still match: calling
        # bytes.replace() with text arguments raises TypeError on Python 3.
        if isinstance(rpath, unicode_type):
            rpath = rpath.replace(os.sep, '/').encode('utf-8')
        else:
            rpath = rpath.replace(os.sep.encode('ascii'), b'/')
        return as_unicode(quote(rpath))+frag

    def set_basedir(self, path):
        self._basedir = path

    def basedir(self):
        return self._basedir

    def __repr__(self):
        return 'Resource(%s, %s)'%(repr(self.path), repr(self.href()))
class ResourceCollection(object):
    '''An ordered, list-like container of :class:`Resource` objects.'''

    def __init__(self):
        self._resources = []

    def __iter__(self):
        for r in self._resources:
            yield r

    def __len__(self):
        return len(self._resources)

    def __getitem__(self, index):
        return self._resources[index]

    def __bool__(self):
        return len(self._resources) > 0

    def __str__(self):
        resources = map(repr, self)
        return '[%s]'%', '.join(resources)

    def __repr__(self):
        return unicode_type(self)

    def append(self, resource):
        if not isinstance(resource, Resource):
            raise ValueError('Can only append objects of type Resource')
        self._resources.append(resource)

    def remove(self, resource):
        self._resources.remove(resource)

    def replace(self, start, end, items):
        'Same as list[start:end] = items'
        self._resources[start:end] = items

    @staticmethod
    def from_directory_contents(top, topdown=True):
        '''
        Return a collection with one Resource per file found under `top`,
        each with `top` as its basedir.
        '''
        collection = ResourceCollection()
        # os.walk yields (dirpath, dirnames, filenames). The previous code
        # joined dirpath with the dirnames *list* and called the nonexistent
        # Resource.from_path, so it could never succeed.
        for dirpath, dirnames, filenames in os.walk(top, topdown=topdown):
            for fname in filenames:
                path = os.path.abspath(os.path.join(dirpath, fname))
                res = Resource(path, basedir=top)
                collection.append(res)
        return collection

    def set_basedir(self, path):
        for res in self:
            res.set_basedir(path)
def MetaInformation(title, authors=(_('Unknown'),)):
    ''' Build a Metadata object; kept for compatibility with older callers.

    @param title: title or ``_('Unknown')`` or a MetaInformation object
                  (anything with ``title`` and ``authors`` attributes), whose
                  values are then copied.
    @param authors: List of strings or []
    '''
    from calibre.ebooks.metadata.book.base import Metadata
    is_metadata_like = hasattr(title, 'title') and hasattr(title, 'authors')
    source = title if is_metadata_like else None
    if is_metadata_like:
        title, authors = source.title, source.authors
    return Metadata(title, authors, other=source)
def check_isbn10(isbn):
    """Return ``isbn`` if it is a valid ISBN-10, otherwise None.

    Expects ten characters with no separators. The check character is the
    weighted sum of the first nine digits modulo 11, with 'X' standing
    for 10.
    """
    try:
        weighted = sum(pos * int(ch) for pos, ch in enumerate(isbn[:9], 1))
        check = weighted % 11
        last = isbn[9]
        if check == 10 and last == 'X':
            return isbn
        if check == int(last):
            return isbn
    except Exception:
        # Too short, non-digit characters, wrong type: not a valid ISBN.
        pass
    return None
def check_isbn13(isbn):
    """Return ``isbn`` if it is a valid ISBN-13, otherwise None.

    Expects thirteen digits with no separators. The first twelve digits are
    weighted alternately by 1 and 3; the check digit brings the total to a
    multiple of 10.
    """
    try:
        total = 0
        for pos, ch in enumerate(isbn[:12]):
            weight = 3 if pos % 2 else 1
            total += weight * int(ch)
        check = 10 - (total % 10)
        if check == 10:
            check = 0
        if '%d' % check == isbn[12]:
            return isbn
    except Exception:
        # Too short, non-digit characters, wrong type: not a valid ISBN.
        pass
    return None
def check_isbn(isbn):
    """Return the normalised ISBN if ``isbn`` is valid, otherwise None.

    The input is upper-cased and stripped of everything except digits and
    'X'. Numbers consisting of one repeated digit are rejected; what remains
    is validated as ISBN-10 or ISBN-13 depending on its length.
    """
    if not isbn:
        return None
    isbn = re.sub(r'[^0-9X]', '', isbn.upper())
    if re.match(r'(\d)\1{9,12}$', isbn) is not None:
        return None
    size = len(isbn)
    if size == 10:
        return check_isbn10(isbn)
    if size == 13:
        return check_isbn13(isbn)
    return None
def check_issn(issn):
    """Return the normalised ISSN if ``issn`` is valid, otherwise None.

    The input is upper-cased and stripped of everything except digits and
    'X'. The first seven digits are weighted 8 down to 2; the check
    character is ``(11 - sum % 11) % 11``, with 'X' standing for 10.
    """
    if not issn:
        return None
    issn = re.sub(r'[^0-9X]', '', issn.upper())
    try:
        digits = tuple(map(int, issn[:7]))
        products = [(8 - i) * d for i, d in enumerate(digits)]
        # The final "% 11" maps a remainder of 0 to check digit 0. Without
        # it the computed value was 11, which can never match, so every
        # valid ISSN ending in 0 was rejected.
        check = (11 - sum(products) % 11) % 11
        if (check == 10 and issn[7] == 'X') or check == int(issn[7]):
            return issn
    except Exception:
        # Too short or non-digit characters: not a valid ISSN.
        pass
    return None
def format_isbn(isbn):
    """Return ``isbn`` with hyphens inserted at fixed positions.

    Input that does not validate as an ISBN is returned unchanged.
    """
    valid = check_isbn(isbn)
    if not valid:
        return isbn
    if len(valid) == 10:
        groups = (valid[:2], valid[2:6], valid[6:9], valid[9])
    else:
        groups = (valid[:3], valid[3:5], valid[5:9], valid[9:12], valid[12])
    return '-'.join(groups)
def check_doi(doi):
    'Check if something that looks like a DOI is present anywhere in the string'
    if not doi:
        return None
    # Registrant codes are not limited to four digits (for example
    # 10.48550/arXiv...), so accept 4 to 9 digits after the "10." prefix.
    doi_check = re.search(r'10\.\d{4,9}/\S+', doi)
    if doi_check is not None:
        return doi_check.group()
    return None
def rating_to_stars(value, allow_half_stars=False, star='★', half='½'):
    """Render a 0-10 rating as a string of stars (two points per star).

    ``value`` is clamped to 0..10 and falsy values count as 0. With
    ``allow_half_stars`` an odd rating gets a trailing ``half`` character.
    """
    # The default star glyph was empty, so the result was always empty
    # unless the caller supplied one.
    r = max(0, min(int(value or 0), 10))
    ans = star * (r // 2)
    if allow_half_stars and r % 2:
        ans += half
    return ans

View File

@@ -0,0 +1,203 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import os
from contextlib import closing
from calibre.customize import FileTypePlugin
from calibre.utils.localization import canonicalize_lang
from polyglot.builtins import filter, unicode_type
def is_comic(list_of_names):
    """Return True if every named file with an extension is a jpg/jpeg/png.

    Names without a dot and files called thumbs.db (case insensitive) are
    ignored, so an empty list counts as a comic.
    """
    image_extensions = {'jpg', 'jpeg', 'png'}
    found = set()
    for name in list_of_names:
        if '.' not in name:
            continue
        if name.lower().rpartition('/')[-1] == 'thumbs.db':
            continue
        found.add(name.rpartition('.')[-1].lower())
    return found.issubset(image_extensions)
def archive_type(stream):
    """Return 'zip', 'rar' or None based on the first four bytes of ``stream``.

    The stream position is restored afterwards on a best-effort basis.
    """
    from calibre.utils.zipfile import stringFileHeader
    try:
        pos = stream.tell()
    except Exception:
        # Stream without tell(): assume it is at the start. Exception rather
        # than a bare except, matching the seek() handler below and letting
        # KeyboardInterrupt/SystemExit propagate.
        pos = 0
    id_ = stream.read(4)
    ans = None
    if id_ == stringFileHeader:
        ans = 'zip'
    elif id_.startswith(b'Rar'):
        ans = 'rar'
    try:
        stream.seek(pos)
    except Exception:
        pass
    return ans
class KPFExtract(FileTypePlugin):
    # Import-time plugin: replaces a KPF file with the first DOCX found in it.

    name = 'KPF Extract'
    author = 'Kovid Goyal'
    description = _('Extract the source DOCX file from Amazon Kindle Create KPF files.'
            ' Note this will not contain any edits made in the Kindle Create program itself.')
    file_types = {'kpf'}
    supported_platforms = ['windows', 'osx', 'linux']
    on_import = True

    def run(self, archive):
        """Return the path of the extracted DOCX, or ``archive`` if it has none."""
        from calibre.utils.zipfile import ZipFile
        with ZipFile(archive, 'r') as zf:
            docx_name = None
            for member in zf.namelist():
                if member.lower().endswith('.docx'):
                    docx_name = member
                    break
            if docx_name is None:
                return archive
            of = self.temporary_file('_kpf_extract.docx')
            with closing(of):
                of.write(zf.read(docx_name))
        return of.name
class ArchiveExtract(FileTypePlugin):
    # Import-time plugin: unwraps a ZIP/RAR archive that holds a single
    # e-book, or re-labels an archive of images as CBZ/CBR.

    name = 'Archive Extract'
    author = 'Kovid Goyal'
    description = _('Extract common e-book formats from archive files '
        '(ZIP/RAR). Also try to autodetect if they are actually '
        'CBZ/CBR files.')
    file_types = {'zip', 'rar'}
    supported_platforms = ['windows', 'osx', 'linux']
    on_import = True

    def run(self, archive):
        """Return the path of the file to import in place of ``archive``.

        * Archive containing only images: a copy with a .cbz/.cbr extension.
        * Archive containing exactly one file of a known e-book type: that
          file, extracted to a temporary file.
        * Anything else: ``archive`` itself.
        """
        from calibre.utils.zipfile import ZipFile
        is_rar = archive.lower().endswith('.rar')
        zf = None
        if is_rar:
            from calibre.utils.unrar import extract_member, names
            fnames = list(names(archive))
        else:
            zf = ZipFile(archive, 'r')

        try:
            if zf is not None:
                fnames = zf.namelist()

            def fname_ok(fname):
                # Ignore housekeeping entries that say nothing about content.
                bn = os.path.basename(fname).lower()
                if bn == 'thumbs.db':
                    return False
                if '.' not in bn:
                    return False
                if bn.rpartition('.')[-1] in {'diz', 'nfo'}:
                    return False
                if '__MACOSX' in fname.split('/'):
                    return False
                return True

            fnames = list(filter(fname_ok, fnames))
            if is_comic(fnames):
                ext = '.cbr' if is_rar else '.cbz'
                of = self.temporary_file('_archive_extract'+ext)
                with open(archive, 'rb') as f:
                    of.write(f.read())
                of.close()
                return of.name
            if len(fnames) > 1 or not fnames:
                return archive
            fname = fnames[0]
            ext = os.path.splitext(fname)[1][1:]
            if ext.lower() not in {
                    'lit', 'epub', 'mobi', 'prc', 'rtf', 'pdf', 'mp3', 'pdb',
                    'azw', 'azw1', 'azw3', 'fb2', 'docx', 'doc', 'odt'}:
                return archive

            of = self.temporary_file('_archive_extract.'+ext)
            with closing(of):
                if is_rar:
                    data = extract_member(archive, match=None, name=fname)[1]
                    of.write(data)
                else:
                    of.write(zf.read(fname))
            return of.name
        finally:
            # The zip handle used to be left open on every return path.
            if zf is not None:
                zf.close()
def get_comic_book_info(d, mi, series_index='volume'):
    """Copy fields from a ComicBookInfo dictionary ``d`` onto ``mi``.

    ``series_index`` names the key ('volume' or 'issue') preferred for the
    series index; the other one is used as a fallback. Only fields present
    in ``d`` are set on ``mi``.
    """
    # See http://code.google.com/p/comicbookinfo/wiki/Example
    series = d.get('series', '')
    if series.strip():
        mi.series = series
        si = d.get(series_index, None)
        if si is None:
            si = d.get('issue' if series_index == 'volume' else 'volume', None)
        if si is not None:
            try:
                mi.series_index = float(si)
            except Exception:
                mi.series_index = 1
    # Read the same key that is tested. The value used to be fetched from
    # 'lang', so a record with only 'language' never had its language set.
    language = d.get('language', None)
    if language:
        lang = canonicalize_lang(language)
        if lang:
            mi.languages = [lang]
    if d.get('rating', -1) > -1:
        mi.rating = d['rating']
    for x in ('title', 'publisher'):
        y = d.get(x, '').strip()
        if y:
            setattr(mi, x, y)
    tags = d.get('tags', [])
    if tags:
        mi.tags = tags
    authors = []
    for credit in d.get('credits', []):
        if credit.get('role', '') in ('Writer', 'Artist', 'Cartoonist',
                'Creator'):
            x = credit.get('person', '')
            if x:
                # "Last, First" -> "First Last"
                x = ' '.join((reversed(x.split(', '))))
                authors.append(x)
    if authors:
        mi.authors = authors
    comments = d.get('comments', '')
    if comments and comments.strip():
        mi.comments = comments.strip()
    pubm, puby = d.get('publicationMonth', None), d.get('publicationYear', None)
    if puby is not None:
        from calibre.utils.date import parse_only_date
        from datetime import date
        try:
            # Only month and year are known; the 15th (and June when the
            # month is missing) is used as a placeholder.
            dt = date(puby, 6 if pubm is None else pubm, 15)
            dt = parse_only_date(unicode_type(dt))
            mi.pubdate = dt
        except Exception:
            pass
def parse_comic_comment(comment, series_index='volume'):
    """Parse a JSON archive comment and return the metadata found in it.

    Only the first top-level key starting with 'ComicBookInfo' is used.
    """
    # See http://code.google.com/p/comicbookinfo/wiki/Example
    from calibre.ebooks.metadata import MetaInformation
    import json
    mi = MetaInformation(None, None)
    parsed = json.loads(comment)
    if not isinstance(parsed, dict):
        return mi
    info_key = next(
        (key for key in parsed if key.startswith('ComicBookInfo')), None)
    if info_key is not None:
        get_comic_book_info(parsed[info_key], mi, series_index=series_index)
    return mi
def get_comic_metadata(stream, stream_type, series_index='volume'):
    """Read metadata from the archive comment of a CBZ or CBR stream.

    Any other ``stream_type``, or an archive without a comment, is treated
    as an empty JSON object.
    """
    comment = None
    if stream_type == 'cbz':
        from calibre.utils.zipfile import ZipFile
        comment = ZipFile(stream).comment
    elif stream_type == 'cbr':
        from calibre.utils.unrar import comment as get_comment
        comment = get_comment(stream)

    if not comment:
        comment = b'{}'
    return parse_comic_comment(comment, series_index=series_index)

View File

@@ -0,0 +1,132 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
'''
All fields must have a NULL value represented as None for simple types,
an empty list/dictionary for complex types and (None, None) for cover_data
'''
SOCIAL_METADATA_FIELDS = frozenset({
    'tags',          # Ordered list
    'rating',        # A floating point number between 0 and 10
    'comments',      # A simple HTML enabled string
    'series',        # A simple string
    'series_index',  # A floating point number
    # Of the form { scheme1:value1, scheme2:value2}
    # For example: {'isbn':'123456789', 'doi':'xxxx', ... }
    'identifiers',
})

# The names that are stored as identifiers when they are read or written.
TOP_LEVEL_IDENTIFIERS = frozenset({
    'isbn',
})
PUBLICATION_METADATA_FIELDS = frozenset({
    'title',            # title must never be None. Should be _('Unknown')
    # Pseudo field that can be set, but if not set is auto generated
    # from title and languages
    'title_sort',
    'authors',          # Ordered list. Must never be None, can be [_('Unknown')]
    'author_sort_map',  # Map of sort strings for each author
    # Pseudo field that can be set, but if not set is auto generated
    # from authors and languages
    'author_sort',
    'book_producer',
    'timestamp',        # Dates and times must be timezone aware
    'pubdate',
    'last_modified',
    'rights',
    # So far only known publication type is periodical:calibre
    # If None, means book
    'publication_type',
    'uuid',             # A UUID usually of type 4
    'languages',        # ordered list of languages in this publication
    'publisher',        # Simple string, no special semantics
    # Absolute path to image file encoded in filesystem_encoding
    'cover',
    # Of the form (format, data) where format is, for e.g. 'jpeg', 'png', 'gif'...
    'cover_data',
    # Either thumbnail data, or an object with the attribute
    # image_path which is the path to an image file, encoded
    # in filesystem_encoding
    'thumbnail',
})
BOOK_STRUCTURE_FIELDS = frozenset({
    # These are used by code, Null values are None.
    'toc',
    'spine',
    'guide',
    'manifest',
})

USER_METADATA_FIELDS = frozenset({
    # A dict of dicts similar to field_metadata. Each field description dict
    # also contains a value field with the key #value#.
    'user_metadata',
})

DEVICE_METADATA_FIELDS = frozenset({
    'device_collections',  # Ordered list of strings
    'lpath',               # Unicode, / separated
    'size',                # In bytes
    'mime',                # Mimetype of the book file being represented
})

CALIBRE_METADATA_FIELDS = frozenset({
    'application_id',  # An application id, currently set to the db_id.
    'db_id',           # the calibre primary key of the item.
    'formats',         # list of formats (extensions) for this book
    # a dict of user category names, where the value is a list of item names
    # from the book that are in that category
    'user_categories',
    # a dict of author to an associated hyperlink
    'author_link_map',
})
# Every known field, including the custom-column container.
ALL_METADATA_FIELDS = (
    SOCIAL_METADATA_FIELDS |
    PUBLICATION_METADATA_FIELDS |
    BOOK_STRUCTURE_FIELDS |
    USER_METADATA_FIELDS |
    DEVICE_METADATA_FIELDS |
    CALIBRE_METADATA_FIELDS
)

# All fields except custom fields
STANDARD_METADATA_FIELDS = (
    SOCIAL_METADATA_FIELDS |
    PUBLICATION_METADATA_FIELDS |
    BOOK_STRUCTURE_FIELDS |
    DEVICE_METADATA_FIELDS |
    CALIBRE_METADATA_FIELDS
)
# Metadata fields that smart update must do special processing to copy.
SC_FIELDS_NOT_COPIED = frozenset({
    'title', 'title_sort',
    'authors', 'author_sort', 'author_sort_map',
    'cover_data', 'tags', 'languages', 'identifiers',
})

# Metadata fields that smart update should copy only if the source is not None
SC_FIELDS_COPY_NOT_NULL = frozenset({
    'device_collections', 'lpath', 'size', 'comments', 'thumbnail',
})
# Metadata fields that smart update should copy without special handling
SC_COPYABLE_FIELDS = (
    SOCIAL_METADATA_FIELDS |
    PUBLICATION_METADATA_FIELDS |
    BOOK_STRUCTURE_FIELDS |
    DEVICE_METADATA_FIELDS |
    CALIBRE_METADATA_FIELDS
) - (SC_FIELDS_NOT_COPIED | SC_FIELDS_COPY_NOT_NULL)

SERIALIZABLE_FIELDS = (
    SOCIAL_METADATA_FIELDS |
    USER_METADATA_FIELDS |
    PUBLICATION_METADATA_FIELDS |
    CALIBRE_METADATA_FIELDS |
    DEVICE_METADATA_FIELDS
) - frozenset({
    # these are rebuilt when needed
    'device_collections', 'formats', 'cover_data',
})

View File

@@ -0,0 +1,841 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
import copy, traceback
from calibre import prints
from calibre.constants import DEBUG, ispy3
from calibre.ebooks.metadata.book import (SC_COPYABLE_FIELDS,
SC_FIELDS_COPY_NOT_NULL, STANDARD_METADATA_FIELDS,
TOP_LEVEL_IDENTIFIERS, ALL_METADATA_FIELDS)
from calibre.library.field_metadata import FieldMetadata
from calibre.utils.icu import sort_key
from polyglot.builtins import iteritems, unicode_type, filter, map
# Special sets used to optimize the performance of getting and setting
# attributes on Metadata objects
SIMPLE_GET = frozenset(
    STANDARD_METADATA_FIELDS.difference(TOP_LEVEL_IDENTIFIERS))
SIMPLE_SET = frozenset(SIMPLE_GET.difference(('identifiers',)))
def human_readable(size, precision=2):
    """ Convert a size in bytes into megabytes """
    # Build a template such as '%.2fMB', then apply it to the size in MB.
    template = '%.' + unicode_type(precision) + 'f' + 'MB'
    megabytes = size / (1024 * 1024)
    return template % (megabytes,)
# Per-field "null" values. Metadata.__init__ deep-copies this mapping (minus
# 'language') as its initial data, and is_null()/set_null()/__setattr__ look
# fields up here. Fields not listed are treated as having a null of None.
NULL_VALUES = {
    'user_metadata': {},
    'cover_data' : (None, None),
    'tags' : [],
    'identifiers' : {},
    'languages' : [],
    'device_collections': [],
    'author_sort_map': {},
    'authors' : [_('Unknown')],
    'author_sort' : _('Unknown'),
    'title' : _('Unknown'),
    'user_categories' : {},
    'author_link_map' : {},
    # Returned by Metadata.__getattribute__ for 'language' when the
    # languages list is empty.
    'language' : 'und'
}
# Module-wide FieldMetadata instance consulted by the Metadata methods below.
field_metadata = FieldMetadata()


def reset_field_metadata():
    ''' Replace the module-level ``field_metadata`` with a fresh instance. '''
    global field_metadata
    field_metadata = FieldMetadata()
def ck(typ):
    ''' Clean an identifier type: lower-case, strip, drop ':' and ','. '''
    return icu_lower(typ).strip().replace(':', '').replace(',', '')


def cv(val):
    ''' Clean an identifier value: strip and turn ',' into '|'. '''
    return val.strip().replace(',', '|')
class Metadata(object):
'''
A class representing all the metadata for a book. The various standard metadata
fields are available as attributes of this object. You can also stick
arbitrary attributes onto this object.
Metadata from custom columns should be accessed via the get() method,
passing in the lookup name for the column, for example: "#mytags".
Use the :meth:`is_null` method to test if a field is null.
This object also has functions to format fields into strings.
The list of standard metadata fields grows with time and is in
:data:`STANDARD_METADATA_FIELDS`.
Please keep the method based API of this class to a minimum. Every method
becomes a reserved field name.
'''
__calibre_serializable__ = True
def __init__(self, title, authors=(_('Unknown'),), other=None, template_cache=None,
             formatter=None):
    '''
    @param title: title or ``_('Unknown')``
    @param authors: List of strings or []
    @param other: None or a metadata object; when given, its values are
                  merged in via smart_update and title/authors are ignored
    @param template_cache: stored as-is on ``self.template_cache``
    @param formatter: template formatter; a SafeFormat is created if None
    '''
    initial = copy.deepcopy(NULL_VALUES)
    # 'language' is derived from 'languages', so it is not stored.
    initial.pop('language')
    object.__setattr__(self, '_data', initial)
    if other is None:
        if title:
            self.title = title
        if authors:
            # List of strings or []
            self.author = list(authors)  # Needed for backward compatibility
            self.authors = list(authors)
    else:
        self.smart_update(other)
    from calibre.ebooks.metadata.book.formatter import SafeFormat
    self.formatter = formatter if formatter is not None else SafeFormat()
    self.template_cache = template_cache
def is_null(self, field):
    '''
    Return True if the value of field is null in this object.
    'null' means it is unknown or evaluates to False. So a title of
    _('Unknown') is null or a language of 'und' is null.

    Be careful with numeric fields since this will return True for zero as
    well as None.

    Also returns True if the field does not exist.
    '''
    try:
        null_val = NULL_VALUES.get(field, None)
        val = getattr(self, field, None)
        return not val or val == null_val
    except Exception:
        # Any failure while reading or comparing counts as null. Only
        # Exception is caught so KeyboardInterrupt/SystemExit propagate.
        return True
def set_null(self, field):
    ''' Reset field to a shallow copy of its entry in NULL_VALUES. '''
    setattr(self, field, copy.copy(NULL_VALUES.get(field)))
def __getattribute__(self, field):
    '''
    Resolve attribute access. Sources are tried in this order: standard
    fields, top-level identifiers, the 'language' pseudo field, ordinary
    object attributes, custom (user metadata) fields and finally
    ``#name_index`` lookups. Raises AttributeError if nothing matches.
    '''
    _data = object.__getattribute__(self, '_data')
    if field in SIMPLE_GET:
        return _data.get(field, None)
    if field in TOP_LEVEL_IDENTIFIERS:
        return _data.get('identifiers').get(field, None)
    if field == 'language':
        # 'language' is the first entry of 'languages', or the null language
        try:
            return _data.get('languages', [])[0]
        except:
            return NULL_VALUES['language']
    try:
        return object.__getattribute__(self, field)
    except AttributeError:
        pass
    if field in _data['user_metadata']:
        d = _data['user_metadata'][field]
        val = d['#value#']
        if d['datatype'] != 'composite':
            return val
        if val is None:
            # Store a marker before evaluating, so a template that refers
            # back to this field reads the marker instead of re-entering.
            d['#value#'] = 'RECURSIVE_COMPOSITE FIELD (Metadata) ' + field
            # The evaluated template result replaces the marker and is
            # returned directly on later reads.
            val = d['#value#'] = self.formatter.safe_format(
                d['display']['composite_template'],
                self,
                _('TEMPLATE ERROR'),
                self, column_name=field,
                template_cache=self.template_cache).strip()
        return val
    if field.startswith('#') and field.endswith('_index'):
        # '#name_index' maps to the extra value of custom field '#name'
        try:
            return self.get_extra(field[:-6])
        except:
            pass
    raise AttributeError(
        'Metadata object has no attribute named: '+ repr(field))
def __setattr__(self, field, val, extra=None):
    '''
    Store val under field. Standard fields go into the data dict, top-level
    identifiers and 'identifiers' go into the identifiers dict, 'language'
    rewrites 'languages', custom fields get their '#value#'/'#extra#'
    updated, and anything else becomes an ordinary instance attribute.
    '''
    _data = object.__getattribute__(self, '_data')
    if field in SIMPLE_SET:
        if val is None:
            # None is replaced by a copy of the field's null value
            val = copy.copy(NULL_VALUES.get(field, None))
        _data[field] = val
    elif field in TOP_LEVEL_IDENTIFIERS:
        field, val = self._clean_identifier(field, val)
        identifiers = _data['identifiers']
        # An empty value removes the identifier
        identifiers.pop(field, None)
        if val:
            identifiers[field] = val
    elif field == 'identifiers':
        if not val:
            val = copy.copy(NULL_VALUES.get('identifiers', None))
        self.set_identifiers(val)
    elif field == 'language':
        # An empty value or 'und' clears the languages list
        langs = []
        if val and val.lower() != 'und':
            langs = [val]
        _data['languages'] = langs
    elif field in _data['user_metadata']:
        _data['user_metadata'][field]['#value#'] = val
        _data['user_metadata'][field]['#extra#'] = extra
    else:
        # You are allowed to stick arbitrary attributes onto this object as
        # long as they don't conflict with global or user metadata names
        # Don't abuse this privilege
        self.__dict__[field] = val
def __iter__(self):
    ''' Iterate over the keys of the internal data dict. '''
    store = object.__getattribute__(self, '_data')
    return iter(store)
def has_key(self, key):
    ''' Return True if key is present in the internal data dict. '''
    store = object.__getattribute__(self, '_data')
    return key in store
def deepcopy(self, class_generator=lambda : Metadata(None)):
    ''' Do not use this method unless you know what you are doing, if you
    want to create a simple clone of this object, use :meth:`deepcopy_metadata`
    instead. Class_generator must be a function that returns an instance
    of Metadata or a subclass of it. Returns None if it returns anything
    else.'''
    clone = class_generator()
    if isinstance(clone, Metadata):
        # Replace the clone's whole attribute dict with a deep copy of ours
        object.__setattr__(clone, '__dict__', copy.deepcopy(self.__dict__))
        return clone
    return None
def deepcopy_metadata(self):
    ''' Return a new Metadata object holding a deep copy of this object's
    data dict. '''
    clone = Metadata(None)
    own_data = object.__getattribute__(self, '_data')
    object.__setattr__(clone, '_data', copy.deepcopy(own_data))
    return clone
def get(self, field, default=None):
    ''' Return the value of field, or default if looking it up raises
    AttributeError. '''
    try:
        value = self.__getattribute__(field)
    except AttributeError:
        value = default
    return value
def get_extra(self, field, default=None):
    '''
    Return the '#extra#' value stored for the custom field ``field``.

    Returns ``default`` if the field exists but has no usable '#extra#'
    entry. Raises AttributeError if ``field`` is not a custom field.
    '''
    _data = object.__getattribute__(self, '_data')
    if field in _data['user_metadata']:
        try:
            return _data['user_metadata'][field]['#extra#']
        except Exception:
            # Only Exception is caught, so KeyboardInterrupt/SystemExit
            # are not swallowed.
            return default
    raise AttributeError(
        'Metadata object has no attribute named: '+ repr(field))
def set(self, field, val, extra=None):
    ''' Set field to val by calling __setattr__, which stores ``extra``
    as the '#extra#' value when field is a custom field. '''
    self.__setattr__(field, val, extra)
def get_identifiers(self):
    '''
    Return a copy of the identifiers dictionary.
    The dict is small, and the penalty for using a reference where a copy is
    needed is large. Also, we don't want any manipulations of the returned
    dict to show up in the book.
    '''
    stored = object.__getattribute__(self, '_data')['identifiers']
    return copy.deepcopy(stored if stored else {})
def _clean_identifier(self, typ, val):
    ''' Return (typ, val) passed through ck()/cv(); falsy inputs are
    returned unchanged. '''
    cleaned_typ = ck(typ) if typ else typ
    cleaned_val = cv(val) if val else val
    return cleaned_typ, cleaned_val
def set_identifiers(self, identifiers):
    '''
    Set all identifiers. Note that if you previously set ISBN, calling
    this method will delete it.
    '''
    cleaned = {}
    for typ, val in iteritems(identifiers):
        # Entries with an empty type or value are dropped
        if typ and val:
            cleaned[ck(typ)] = cv(val)
    object.__getattribute__(self, '_data')['identifiers'] = cleaned
def set_identifier(self, typ, val):
    'If val is empty, deletes identifier of type typ'
    typ, val = self._clean_identifier(typ, val)
    if typ:
        store = object.__getattribute__(self, '_data')['identifiers']
        # Always remove first; re-add only when there is a value
        store.pop(typ, None)
        if val:
            store[typ] = val
def has_identifier(self, typ):
    ''' Return True if an identifier of type typ is stored. '''
    store = object.__getattribute__(self, '_data')
    return typ in store['identifiers']
# field-oriented interface. Intended to be the same as in LibraryDatabase
def standard_field_keys(self):
    '''
    return all possible standard keys, even if this book doesn't have them.
    The returned object is the module-level STANDARD_METADATA_FIELDS itself,
    not a copy.
    '''
    return STANDARD_METADATA_FIELDS
def custom_field_keys(self):
    '''
    return an iterator over the keys of the custom fields in this book.
    Each call returns a fresh iterator; it is not a list.
    '''
    return iter(object.__getattribute__(self, '_data')['user_metadata'])
def all_field_keys(self):
    '''
    All field keys known by this instance, even if their value is None
    '''
    custom_keys = object.__getattribute__(self, '_data')['user_metadata']
    return frozenset(ALL_METADATA_FIELDS.union(custom_keys))
def metadata_for_field(self, key):
    '''
    return metadata describing a standard or custom field.
    '''
    if key in self.custom_field_keys():
        return self.get_user_metadata(key, make_copy=False)
    return self.get_standard_metadata(key, make_copy=False)
def all_non_none_fields(self):
    '''
    Return a dictionary containing all non-None metadata fields, including
    the custom ones.
    '''
    _data = object.__getattribute__(self, '_data')
    user_metadata = _data['user_metadata']
    result = {}
    for attr in STANDARD_METADATA_FIELDS:
        value = _data.get(attr, None)
        if value is None:
            continue
        result[attr] = value
    # separate these because it uses the self.get(), not _data.get()
    for attr in TOP_LEVEL_IDENTIFIERS:
        value = self.get(attr, None)
        if value is None:
            continue
        result[attr] = value
    for attr in user_metadata:
        value = self.get(attr, None)
        if value is None:
            continue
        result[attr] = value
        meta = user_metadata[attr]
        # Custom series also expose their index as '<name>_index'
        if meta['datatype'] == 'series':
            result[attr+'_index'] = meta['#extra#']
    return result
# End of field-oriented interface
# Extended interfaces. These permit one to get copies of metadata dictionaries, and to
# get and set custom field metadata
def get_standard_metadata(self, field, make_copy):
    '''
    return field metadata from the field if it is there. Otherwise return
    None. field is the key name, not the label. Return a copy if requested,
    just in case the user wants to change values in the dict.
    '''
    if field not in field_metadata or field_metadata[field]['kind'] != 'field':
        return None
    meta = field_metadata[field]
    return copy.deepcopy(meta) if make_copy else meta
def get_all_standard_metadata(self, make_copy):
    '''
    return a dict containing all the standard field metadata associated with
    the book. Without make_copy the shared field_metadata object itself is
    returned.
    '''
    if make_copy:
        return {
            key: copy.deepcopy(field_metadata[key])
            for key in field_metadata
            if field_metadata[key]['kind'] == 'field'
        }
    return field_metadata
def get_all_user_metadata(self, make_copy):
    '''
    return a dict containing all the custom field metadata associated with
    the book.
    '''
    user_metadata = object.__getattribute__(self, '_data')['user_metadata']
    if make_copy:
        return {key: copy.deepcopy(user_metadata[key]) for key in user_metadata}
    return user_metadata
def get_user_metadata(self, field, make_copy):
    '''
    return field metadata from the object if it is there. Otherwise return
    None. field is the key name, not the label. Return a copy if requested,
    just in case the user wants to change values in the dict.
    '''
    user_metadata = object.__getattribute__(self, '_data')['user_metadata']
    if field not in user_metadata:
        return None
    entry = user_metadata[field]
    return copy.deepcopy(entry) if make_copy else entry
def set_all_user_metadata(self, metadata):
    '''
    store custom field metadata into the object. Field is the key name
    not the label
    '''
    if metadata is None:
        # Nothing is changed; the call stack is printed for debugging
        traceback.print_stack()
        return
    rebuilt = {}
    for key, meta in iteritems(metadata):
        entry = meta.copy()
        if '#value#' not in entry:
            # Multi-valued text fields default to [], everything else None
            if entry['datatype'] == 'text' and entry['is_multiple']:
                entry['#value#'] = []
            else:
                entry['#value#'] = None
        rebuilt[key] = entry
    object.__getattribute__(self, '_data')['user_metadata'] = rebuilt
def set_user_metadata(self, field, metadata):
    '''
    store custom field metadata for one column into the object. Field is
    the key name not the label
    '''
    if field is None:
        return
    if not field.startswith('#'):
        raise AttributeError(
            'Custom field name %s must begin with \'#\''%repr(field))
    if metadata is None:
        # Nothing is changed; the call stack is printed for debugging
        traceback.print_stack()
        return
    # Copying the elements should not be necessary. The objects referenced
    # in the dict should not change. Of course, they can be replaced.
    entry = dict(metadata)
    if '#value#' not in entry:
        multi_text = entry['datatype'] == 'text' and entry['is_multiple']
        entry['#value#'] = [] if multi_text else None
    object.__getattribute__(self, '_data')['user_metadata'][field] = entry
def template_to_attribute(self, other, ops):
    '''
    Takes a list [(src,dest), (src,dest)], evaluates the template in the
    context of other, then copies the result to self[dest]. This is on a
    best-efforts basis. Some assignments can make no sense.

    Results for 'tags' are split on ',' and results for 'authors' on '&';
    empty pieces are dropped. A failing op is skipped (its traceback is
    printed when DEBUG is set) and the remaining ops are still processed.
    '''
    if not ops:
        return
    from calibre.ebooks.metadata.book.formatter import SafeFormat
    formatter = SafeFormat()
    for op in ops:
        try:
            src = op[0]
            dest = op[1]
            val = formatter.safe_format(src, other, 'PLUGBOARD TEMPLATE ERROR', other)
            if dest == 'tags':
                self.set(dest, [f.strip() for f in val.split(',') if f.strip()])
            elif dest == 'authors':
                self.set(dest, [f.strip() for f in val.split('&') if f.strip()])
            else:
                self.set(dest, val)
        except Exception:
            # Best effort by design, but KeyboardInterrupt/SystemExit must
            # not be swallowed, hence Exception rather than a bare except.
            if DEBUG:
                traceback.print_exc()
# Old Metadata API {{{
def print_all_attributes(self):
    ''' Print every standard field, then the metadata dict of every custom
    field, followed by a separator line. '''
    for name in STANDARD_METADATA_FIELDS:
        prints('%s:'%name, getattr(self, name, 'None'))
    for name in self.custom_field_keys():
        meta = self.get_user_metadata(name, make_copy=False)
        if meta is None:
            continue
        prints(name, meta)
    prints('--------------')
def smart_update(self, other, replace_metadata=False):
    '''
    Merge the information in `other` into self. In case of conflicts, the information
    in `other` takes precedence, unless the information in `other` is NULL.

    :param other: Object to read from. Title, authors and tags are read
        directly; most other attributes are read with getattr() and a
        default, so they may be absent.
    :param replace_metadata: If True, copyable fields, tags, cover data,
        custom metadata and identifiers are overwritten with the values of
        `other` instead of being merged.
    '''
    def copy_not_none(dest, src, attr):
        # Copy attr only if src has a value other than None or its null value
        v = getattr(src, attr, None)
        if v not in (None, NULL_VALUES.get(attr, None)):
            setattr(dest, attr, copy.deepcopy(v))

    unknown = _('Unknown')
    if other.title and other.title != unknown:
        self.title = other.title
        if hasattr(other, 'title_sort'):
            self.title_sort = other.title_sort

    # Take other's authors if they are known, or if self has no authors or
    # only the unknown placeholder (with an unknown author_sort).
    if other.authors and (
            other.authors[0] != unknown or (
                not self.authors or (
                    len(self.authors) == 1 and self.authors[0] == unknown and
                    getattr(self, 'author_sort', None) == unknown
                )
            )
    ):
        self.authors = list(other.authors)
        if hasattr(other, 'author_sort_map'):
            self.author_sort_map = dict(other.author_sort_map)
        if hasattr(other, 'author_sort'):
            self.author_sort = other.author_sort

    if replace_metadata:
        # SPECIAL_FIELDS = frozenset(['lpath', 'size', 'comments', 'thumbnail'])
        for attr in SC_COPYABLE_FIELDS:
            # Missing attributes become None, except series_index (1.0)
            setattr(self, attr, getattr(other, attr, 1.0 if
                    attr == 'series_index' else None))
        # NOTE: the tags list object is shared with other, not copied
        self.tags = other.tags
        self.cover_data = getattr(other, 'cover_data',
                                  NULL_VALUES['cover_data'])
        self.set_all_user_metadata(other.get_all_user_metadata(make_copy=True))
        for x in SC_FIELDS_COPY_NOT_NULL:
            copy_not_none(self, other, x)
        if callable(getattr(other, 'get_identifiers', None)):
            self.set_identifiers(other.get_identifiers())
        # language is handled below
    else:
        for attr in SC_COPYABLE_FIELDS:
            copy_not_none(self, other, attr)
        for x in SC_FIELDS_COPY_NOT_NULL:
            copy_not_none(self, other, x)

        if other.tags:
            # Case-insensitive but case preserving merging
            lotags = [t.lower() for t in other.tags]
            lstags = [t.lower() for t in self.tags]
            ot, st = map(frozenset, (lotags, lstags))
            # Tags present in both take other's spelling
            for t in st.intersection(ot):
                sidx = lstags.index(t)
                oidx = lotags.index(t)
                self.tags[sidx] = other.tags[oidx]
            # Tags only present in other are appended
            self.tags += [t for t in other.tags if t.lower() in ot-st]

        if getattr(other, 'cover_data', False):
            # Keep whichever cover has more data (last element of the tuple)
            other_cover = other.cover_data[-1]
            self_cover = self.cover_data[-1] if self.cover_data else b''
            if not self_cover:
                self_cover = b''
            if not other_cover:
                other_cover = b''
            if len(other_cover) > len(self_cover):
                self.cover_data = other.cover_data

        if callable(getattr(other, 'custom_field_keys', None)):
            for x in other.custom_field_keys():
                meta = other.get_user_metadata(x, make_copy=True)
                if meta is not None:
                    # Read self's current value before it is replaced
                    self_tags = self.get(x, [])
                    self.set_user_metadata(x, meta)  # get... did the deepcopy
                    other_tags = other.get(x, [])
                    if meta['datatype'] == 'text' and meta['is_multiple']:
                        # Case-insensitive but case preserving merging
                        lotags = [t.lower() for t in other_tags]
                        try:
                            lstags = [t.lower() for t in self_tags]
                        except TypeError:
                            # Happens if x is not a text, is_multiple field
                            # on self
                            lstags = []
                            self_tags = []
                        ot, st = map(frozenset, (lotags, lstags))
                        for t in st.intersection(ot):
                            sidx = lstags.index(t)
                            oidx = lotags.index(t)
                            self_tags[sidx] = other_tags[oidx]
                        self_tags += [t for t in other_tags if t.lower() in ot-st]
                        setattr(self, x, self_tags)

        # Keep the longer of the two comments (compared after stripping)
        my_comments = getattr(self, 'comments', '')
        other_comments = getattr(other, 'comments', '')
        if not my_comments:
            my_comments = ''
        if not other_comments:
            other_comments = ''
        if len(other_comments.strip()) > len(my_comments.strip()):
            self.comments = other_comments

        # Copy all the non-none identifiers
        if callable(getattr(other, 'get_identifiers', None)):
            d = self.get_identifiers()
            s = other.get_identifiers()
            d.update([v for v in iteritems(s) if v[1] is not None])
            self.set_identifiers(d)
        else:
            # other structure not Metadata. Copy the top-level identifiers
            for attr in TOP_LEVEL_IDENTIFIERS:
                copy_not_none(self, other, attr)

    other_lang = getattr(other, 'languages', [])
    if other_lang and other_lang != ['und']:
        self.languages = list(other_lang)
    # A series index without a series is cleared
    if not getattr(self, 'series', None):
        self.series_index = None
def format_series_index(self, val=None):
    ''' Format val (or self.series_index when val is None) with fmt_sidx.
    Values that cannot be converted to float are formatted as 1. '''
    from calibre.ebooks.metadata import fmt_sidx
    raw = val if val is not None else self.series_index
    try:
        number = float(raw)
    except Exception:
        number = 1
    return fmt_sidx(number)
def authors_from_string(self, raw):
    ''' Set self.authors to the result of string_to_authors(raw). '''
    from calibre.ebooks.metadata import string_to_authors
    self.authors = string_to_authors(raw)
def format_authors(self):
    ''' Return authors_to_string(self.authors). '''
    from calibre.ebooks.metadata import authors_to_string
    return authors_to_string(self.authors)
def format_tags(self):
    ''' Return the tags, ordered by sort_key, as one ', ' separated
    string. '''
    ordered = sorted(self.tags, key=sort_key)
    return ', '.join([unicode_type(tag) for tag in ordered])
def format_rating(self, v=None, divide_by=1):
    ''' Return v (or self.rating when v is None) divided by divide_by, as
    text. Returns 'None' when there is no rating. '''
    rating = self.rating if v is None else v
    if rating is None:
        return 'None'
    return unicode_type(rating/divide_by)
def format_field(self, key, series_with_index=True):
    '''
    Returns the tuple (display_name, formatted_value)
    '''
    display_name, formatted, unused_original, unused_meta = \
        self.format_field_extended(key, series_with_index)
    return (display_name, formatted)
def format_field_extended(self, key, series_with_index=True):
    '''
    returns the tuple (display_name, formatted_value, original_value,
    field_metadata)

    Lookup order: custom series index (``#name_index``), custom fields,
    top-level identifiers, then standard fields (aliases are translated).
    Unknown keys give ``(None, None, None, None)``. A field whose value is
    None or '' gives ``(name, value, None, None)``.

    :param key: Field key to format.
    :param series_with_index: If True, series values get their formatted
        index appended as `` [index]``.
    '''
    from calibre.ebooks.metadata import authors_to_string
    from calibre.utils.date import format_date

    # Handle custom series index
    if key.startswith('#') and key.endswith('_index'):
        tkey = key[:-6]  # strip the _index
        cmeta = self.get_user_metadata(tkey, make_copy=False)
        if cmeta and cmeta['datatype'] == 'series':
            if self.get(tkey):
                res = self.get_extra(tkey)
                return (unicode_type(cmeta['name']+'_index'),
                        self.format_series_index(res), res, cmeta)
            else:
                return (unicode_type(cmeta['name']+'_index'), '', '', cmeta)

    if key in self.custom_field_keys():
        res = self.get(key, None)  # get evaluates all necessary composites
        cmeta = self.get_user_metadata(key, make_copy=False)
        name = unicode_type(cmeta['name'])
        if res is None or res == '':  # can't check "not res" because of numeric fields
            return (name, res, None, None)
        orig_res = res
        datatype = cmeta['datatype']
        if datatype == 'text' and cmeta['is_multiple']:
            res = cmeta['is_multiple']['list_to_ui'].join(res)
        elif datatype == 'series' and series_with_index:
            extra = self.get_extra(key)
            if extra is not None:
                res = res + ' [%s]'%self.format_series_index(val=extra)
        elif datatype == 'datetime':
            res = format_date(res, cmeta['display'].get('date_format','dd MMM yyyy'))
        elif datatype == 'bool':
            res = _('Yes') if res else _('No')
        elif datatype == 'rating':
            res = '%.2g'%(res/2)
        elif datatype in ['int', 'float']:
            try:
                fmt = cmeta['display'].get('number_format', None)
                res = fmt.format(res)
            except Exception:
                # No or unusable number_format: keep the raw number
                pass
        return (name, unicode_type(res), orig_res, cmeta)

    # convert top-level ids into their value
    if key in TOP_LEVEL_IDENTIFIERS:
        fmeta = field_metadata['identifiers']
        name = key
        res = self.get(key, None)
        return (name, res, res, fmeta)

    # Translate aliases into the standard field name
    fmkey = field_metadata.search_term_to_field_key(key)
    if fmkey in field_metadata and field_metadata[fmkey]['kind'] == 'field':
        res = self.get(key, None)
        fmeta = field_metadata[fmkey]
        name = unicode_type(fmeta['name'])
        if res is None or res == '':
            return (name, res, None, None)
        orig_res = res
        datatype = fmeta['datatype']
        if key == 'authors':
            res = authors_to_string(res)
        elif key == 'series_index':
            res = self.format_series_index(res)
        elif datatype == 'text' and fmeta['is_multiple']:
            if isinstance(res, dict):
                res = [k + ':' + v for k,v in res.items()]
            res = fmeta['is_multiple']['list_to_ui'].join(sorted(filter(None, res), key=sort_key))
        elif datatype == 'series' and series_with_index:
            res = res + ' [%s]'%self.format_series_index()
        elif datatype == 'datetime':
            res = format_date(res, fmeta['display'].get('date_format','dd MMM yyyy'))
        elif datatype == 'rating':
            res = '%.2g'%(res/2)
        elif key == 'size':
            res = human_readable(res)
        return (name, unicode_type(res), orig_res, fmeta)

    return (None, None, None, None)
def __unicode__representation__(self):
    '''
    A string representation of this object, suitable for printing to
    console
    '''
    from calibre.utils.date import isoformat
    from calibre.ebooks.metadata import authors_to_string
    lines = []

    def emit(label, value):
        lines.append('%-20s: %s'%(unicode_type(label), unicode_type(value)))

    emit('Title', self.title)
    if self.title_sort:
        emit('Title sort', self.title_sort)
    if self.authors:
        author_text = authors_to_string(self.authors)
        sort_suffix = ''
        if self.author_sort and self.author_sort != _('Unknown'):
            sort_suffix = ' [' + self.author_sort + ']'
        emit('Author(s)', author_text + sort_suffix)
    if self.publisher:
        emit('Publisher', self.publisher)
    if getattr(self, 'book_producer', False):
        emit('Book Producer', self.book_producer)
    if self.tags:
        emit('Tags', ', '.join([unicode_type(tag) for tag in self.tags]))
    if self.series:
        emit('Series', self.series + ' #%s'%self.format_series_index())
    if not self.is_null('languages'):
        emit('Languages', ', '.join(self.languages))
    if self.rating is not None:
        # Ratings are stored doubled; a zero rating prints as empty
        rating_text = ('%.2g'%(float(self.rating)/2)) if self.rating else ''
        emit('Rating', rating_text)
    if self.timestamp is not None:
        emit('Timestamp', isoformat(self.timestamp))
    if self.pubdate is not None:
        emit('Published', isoformat(self.pubdate))
    if self.rights is not None:
        emit('Rights', unicode_type(self.rights))
    if self.identifiers:
        pairs = ['%s:%s'%(typ, val) for typ, val in iteritems(self.identifiers)]
        emit('Identifiers', ', '.join(pairs))
    if self.comments:
        emit('Comments', self.comments)

    for key in self.custom_field_keys():
        if self.get(key, None):
            (name, val) = self.format_field(key)
            emit(name, unicode_type(val))
    return '\n'.join(lines)
def to_html(self):
    '''
    A HTML representation of this object.
    '''
    from calibre.ebooks.metadata import authors_to_string
    from calibre.utils.date import isoformat
    rows = []
    rows.append((_('Title'), unicode_type(self.title)))
    rows.append((_('Author(s)'), (authors_to_string(self.authors) if self.authors else _('Unknown'))))
    rows.append((_('Publisher'), unicode_type(self.publisher)))
    rows.append((_('Producer'), unicode_type(self.book_producer)))
    rows.append((_('Comments'), unicode_type(self.comments)))
    rows.append(('ISBN', unicode_type(self.isbn)))
    rows.append((_('Tags'), ', '.join([unicode_type(tag) for tag in self.tags])))
    if self.series:
        rows.append((_('Series'), unicode_type(self.series) + ' #%s'%self.format_series_index()))
    rows.append((_('Languages'), ', '.join(self.languages)))
    if self.timestamp is not None:
        rows.append((_('Timestamp'), unicode_type(isoformat(self.timestamp, as_utc=False, sep=' '))))
    if self.pubdate is not None:
        rows.append((_('Published'), unicode_type(isoformat(self.pubdate, as_utc=False, sep=' '))))
    if self.rights is not None:
        rows.append((_('Rights'), unicode_type(self.rights)))
    for key in self.custom_field_keys():
        if self.get(key, None):
            (name, val) = self.format_field(key)
            rows.append((name, val))
    # Values are inserted as-is (not HTML-escaped)
    html_rows = ['<tr><td><b>%s</b></td><td>%s</td></tr>'%row for row in rows]
    return '<table>%s</table>'%'\n'.join(html_rows)
# Bind the text representation to the right special method for the running
# Python: when ispy3 is set it is used directly as __str__, otherwise it is
# __unicode__ and __str__ returns its UTF-8 encoded bytes.
if ispy3:
    __str__ = __unicode__representation__
else:
    __unicode__ = __unicode__representation__

    def __str__(self):
        return self.__unicode__().encode('utf-8')
def __nonzero__(self):
    ''' Truth value: True if any of title, author, comments or tags is
    set. '''
    # 'author' is only assigned by __init__ when no ``other`` object is
    # given and authors is non-empty, so it may not exist; read it with a
    # default instead of raising AttributeError from bool().
    return bool(self.title or getattr(self, 'author', None) or
                self.comments or self.tags)

__bool__ = __nonzero__
# }}}
def field_from_string(field, raw, field_metadata):
    ''' Parse the string raw to return an object that is suitable for calling
    set() on a Metadata object. The conversion is chosen by
    field_metadata['datatype']; unhandled datatypes and single-valued text
    return raw unchanged. '''
    datatype = field_metadata['datatype']
    if datatype in {'int', 'float'}:
        return int(raw) if datatype == 'int' else float(raw)
    if datatype == 'rating':
        # Ratings are stored doubled
        return float(raw) * 2
    if datatype == 'datetime':
        from calibre.utils.date import parse_only_date
        return parse_only_date(raw)
    if datatype == 'bool':
        lowered = raw.lower()
        if lowered in {'true', 'yes', 'y'}:
            return True
        if raw.lower() in {'false', 'no', 'n'}:
            return False
        raise ValueError('Unknown value for %s: %s'%(field, raw))
    if datatype == 'text':
        multiple = field_metadata['is_multiple']
        if multiple:
            items = [piece.strip() for piece in raw.split(multiple['ui_to_list'])]
            if field == 'identifiers':
                # "type:value" pairs, split on the first ':'
                return {item.partition(':')[0]: item.partition(':')[-1]
                        for item in items}
            if field == 'languages':
                from calibre.utils.localization import canonicalize_lang
                canonical = [canonicalize_lang(item) for item in items]
                return [lang for lang in canonical if lang]
            return items
    return raw

View File

@@ -0,0 +1,46 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
from calibre.ebooks.metadata.book import TOP_LEVEL_IDENTIFIERS, ALL_METADATA_FIELDS
from calibre.utils.formatter import TemplateFormatter
class SafeFormat(TemplateFormatter):
    '''
    Template formatter that resolves template keys against the book object
    in ``self.book``.
    '''

    def __init__(self):
        TemplateFormatter.__init__(self)

    def get_value(self, orig_key, args, kwargs):
        '''
        Return the formatted value of the field named by orig_key, or ''
        when the key is empty or the field has no value.

        The key is lower-cased. Keys that are not known field names are
        translated with search_term_to_field_key(); if that fails, an
        attribute of the book with the original name is used instead.

        Raises ValueError if the key cannot be resolved to a field.
        '''
        if not orig_key:
            return ''
        key = orig_key = orig_key.lower()
        if (key != 'title_sort' and key not in TOP_LEVEL_IDENTIFIERS and
                key not in ALL_METADATA_FIELDS):
            from calibre.ebooks.metadata.book.base import field_metadata
            key = field_metadata.search_term_to_field_key(key)
            if key is None or (self.book and
                               key not in self.book.all_field_keys()):
                if hasattr(self.book, orig_key):
                    key = orig_key
                else:
                    raise ValueError(_('Value: unknown field ') + orig_key)
        try:
            b = self.book.get_user_metadata(key, False)
        except Exception:
            # The book may not provide custom metadata; treat as "none".
            # Exception (not a bare except) lets KeyboardInterrupt through.
            b = None
        if b and b['datatype'] in {'int', 'float'} and self.book.get(key, None) is None:
            # Unset custom numbers format as empty
            v = ''
        else:
            v = self.book.format_field(key, series_with_index=False)[1]
        if v is None or v == '':
            return ''
        return v

View File

@@ -0,0 +1,218 @@
from __future__ import absolute_import, division, print_function, unicode_literals
'''
Created on 4 Jun 2010
@author: charles
'''
import json, traceback
from datetime import datetime, time
from calibre.ebooks.metadata.book import SERIALIZABLE_FIELDS
from calibre.constants import filesystem_encoding, preferred_encoding
from calibre.library.field_metadata import FieldMetadata
from calibre import isbytestring
from polyglot.builtins import iteritems, itervalues, as_bytes
from polyglot.binary import as_base64_unicode, from_base64_bytes
# Translate datetimes to and from strings. The string form is the datetime in
# UTC. The returned date is also UTC
def string_to_datetime(src):
    ''' Parse src with parse_iso8601. Returns None for the literal string
    "None" or when parsing fails. '''
    from calibre.utils.iso8601 import parse_iso8601
    if src == "None":
        return None
    try:
        return parse_iso8601(src)
    except Exception:
        return None
def datetime_to_string(dateval):
    ''' Return the isoformat() text for dateval, or the string "None" when
    dateval is None or not later than UNDEFINED_DATE. Non-datetime values
    are combined with midnight, and naive values get the local timezone. '''
    from calibre.utils.date import isoformat, UNDEFINED_DATE, local_tz
    if dateval is None:
        return "None"
    if not isinstance(dateval, datetime):
        dateval = datetime.combine(dateval, time())
    # The False default stands in for "no tzinfo attribute at all"
    if getattr(dateval, 'tzinfo', False) is None:
        dateval = dateval.replace(tzinfo=local_tz)
    return "None" if dateval <= UNDEFINED_DATE else isoformat(dateval)
def encode_thumbnail(thumbnail):
    '''
    Encode the image part of a thumbnail, then return the 3 part tuple

    ``thumbnail`` is either ``None``, a ``(width, height, data)`` tuple/list,
    or the raw image, in which case the dimensions come from ``identify``.
    Returns ``None`` when the input is ``None``, when identification fails,
    or when it reports a negative dimension.
    '''
    from calibre.utils.imghdr import identify
    if thumbnail is None:
        return None
    if isinstance(thumbnail, (tuple, list)):
        parts = thumbnail
    else:
        try:
            width, height = identify(as_bytes(thumbnail))[1:]
            if width < 0 or height < 0:
                return None
            parts = (width, height, thumbnail)
        except Exception:
            return None
    return (parts[0], parts[1], as_base64_unicode(parts[2]))
def decode_thumbnail(tup):
    '''
    Decode an encoded thumbnail into its 3 component parts

    The first two items are passed through unchanged; the third is
    base64-decoded. ``None`` is returned unchanged.
    '''
    if tup is not None:
        return (tup[0], tup[1], from_base64_bytes(tup[2]))
    return None
def object_to_unicode(obj, enc=preferred_encoding):
    """Recursively decode every byte string inside ``obj``.

    Byte strings are decoded with ``enc`` (undecodable bytes are replaced),
    lists and tuples become lists, and dict keys and values are both
    converted. Anything else is returned unchanged.

    ``enc`` is now passed down to nested containers. Previously the recursive
    calls omitted it, so byte strings nested more than one level deep were
    decoded with the default encoding regardless of the caller's ``enc``.
    """
    if isbytestring(obj):
        return obj.decode(enc, 'replace')
    if isinstance(obj, (list, tuple)):
        return [object_to_unicode(x, enc=enc) for x in obj]
    if isinstance(obj, dict):
        ans = {}
        for k, v in obj.items():
            k = object_to_unicode(k, enc=enc)
            v = object_to_unicode(v, enc=enc)
            ans[k] = v
        return ans
    return obj
def encode_is_multiple(fm):
    """Rewrite ``fm['is_multiple']`` in place to the single-character form.

    A truthy value is preserved under ``'is_multiple2'`` and replaced by
    ``','`` for composite fields or ``'|'`` otherwise. A missing or falsy
    value becomes ``None``, with ``'is_multiple2'`` set to ``{}``.
    """
    current = fm.get('is_multiple', None)
    if not current:
        fm['is_multiple'] = None
        fm['is_multiple2'] = {}
        return
    # migrate is_multiple back to a character
    fm['is_multiple2'] = current
    fm['is_multiple'] = ',' if fm.get('datatype', None) == 'composite' else '|'
def decode_is_multiple(fm):
    """Restore ``fm['is_multiple']`` in place to its dict form.

    A truthy ``'is_multiple2'`` entry is moved back to ``'is_multiple'``.
    Otherwise a truthy legacy ``'is_multiple'`` value is replaced by a
    separator dict chosen from the datatype and the ``is_names`` display
    flag, and ``None`` becomes ``{}``.
    """
    saved = fm.get('is_multiple2', None)
    if saved:
        fm['is_multiple'] = saved
        del fm['is_multiple2']
        return
    # Must migrate the is_multiple from char to dict
    legacy = fm.get('is_multiple', {})
    if legacy is None:
        legacy = {}
    elif legacy:
        if fm.get('datatype', None) == 'composite':
            cache_sep, ui_sep = ',', ','
        elif fm.get('display', {}).get('is_names', False):
            cache_sep, ui_sep = '|', '&'
        else:
            cache_sep, ui_sep = '|', ','
        legacy = {'cache_to_list': cache_sep, 'ui_to_list': ui_sep,
                  'list_to_ui': ', '}
    fm['is_multiple'] = legacy
class JsonCodec(object):
    """Encode book metadata to JSON and decode it back into book objects."""

    def __init__(self, field_metadata=None):
        """Use ``field_metadata`` for datatype lookups.

        A fresh ``FieldMetadata()`` is created when the argument is omitted
        or falsy.
        """
        self.field_metadata = field_metadata or FieldMetadata()

    def encode_to_file(self, file_, booklist):
        """Write ``booklist`` to ``file_`` as UTF-8 encoded, indented JSON."""
        data = json.dumps(self.encode_booklist_metadata(booklist), indent=2)
        if not isinstance(data, bytes):
            data = data.encode('utf-8')
        file_.write(data)

    def encode_booklist_metadata(self, booklist):
        """Return a list with one encoded metadata dict per book."""
        return [self.encode_book_metadata(book) for book in booklist]

    def encode_book_metadata(self, book):
        """Return a dict holding every ``SERIALIZABLE_FIELDS`` key of ``book``."""
        result = {}
        for key in SERIALIZABLE_FIELDS:
            result[key] = self.encode_metadata_attr(book, key)
        return result

    def encode_metadata_attr(self, book, key):
        """Return the JSON-ready value of field ``key`` of ``book``."""
        if key == 'user_metadata':
            # Works on a copy, so the book's own metadata is not modified.
            meta = book.get_all_user_metadata(make_copy=True)
            for fm in meta.values():
                if fm['datatype'] == 'datetime':
                    fm['#value#'] = datetime_to_string(fm['#value#'])
                encode_is_multiple(fm)
            return meta
        if key in self.field_metadata:
            datatype = self.field_metadata[key]['datatype']
        else:
            datatype = None
        value = book.get(key)
        if key == 'thumbnail':
            return encode_thumbnail(value)
        elif isbytestring(value):  # str includes bytes
            enc = filesystem_encoding if key == 'lpath' else preferred_encoding
            return object_to_unicode(value, enc=enc)
        elif datatype == 'datetime':
            return datetime_to_string(value)
        else:
            return object_to_unicode(value)

    def decode_from_file(self, file_, booklist, book_class, prefix):
        """Load JSON from ``file_`` and append the decoded books to ``booklist``.

        Errors are printed with a traceback rather than raised; books decoded
        before the failure stay in ``booklist``.
        """
        try:
            # No ``encoding`` keyword: json.load() rejects it with TypeError
            # on Python 3.9+, and the handler below turned that into
            # "no books loaded".
            js = json.load(file_)
            for item in js:
                entry = self.raw_to_book(item, book_class, prefix)
                if entry is not None:
                    booklist.append(entry)
        except Exception:
            print('exception during JSON decode_from_file')
            traceback.print_exc()

    def raw_to_book(self, json_book, book_class, prefix):
        """Build a ``book_class(prefix, lpath)`` instance from a decoded dict.

        Returns ``None`` (after printing a traceback) if anything fails.
        """
        try:
            book = book_class(prefix, json_book.get('lpath', None))
            for key, val in json_book.items():
                meta = self.decode_metadata(key, val)
                if key == 'user_metadata':
                    book.set_all_user_metadata(meta)
                else:
                    # 'classifiers' is stored on the book as 'identifiers'.
                    if key == 'classifiers':
                        key = 'identifiers'
                    setattr(book, key, meta)
            return book
        except Exception:
            print('exception during JSON decoding')
            traceback.print_exc()

    def decode_metadata(self, key, value):
        """Convert the JSON value of field ``key`` back to its Python form.

        Datetime fields are parsed, thumbnails are base64-decoded, and user
        metadata entries are converted in place. Other values are returned
        unchanged.
        """
        if key == 'classifiers':
            key = 'identifiers'
        if key == 'user_metadata':
            for fm in value.values():
                if fm['datatype'] == 'datetime':
                    fm['#value#'] = string_to_datetime(fm['#value#'])
                decode_is_multiple(fm)
            return value
        elif key in self.field_metadata:
            if self.field_metadata[key]['datatype'] == 'datetime':
                return string_to_datetime(value)
        if key == 'thumbnail':
            return decode_thumbnail(value)
        return value

View File

@@ -0,0 +1,412 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
'''
Try to read metadata from an HTML file.
'''
import re
import unittest
from collections import defaultdict
from html5_parser import parse
from lxml.etree import Comment
from calibre.ebooks.metadata import string_to_authors, authors_to_string
from calibre.ebooks.metadata.book.base import Metadata
from calibre.ebooks.chardet import xml_to_unicode
from calibre import replace_entities, isbytestring
from calibre.utils.date import parse_date, is_date_undefined
from polyglot.builtins import iteritems
def get_metadata(stream):
    """Read all of ``stream`` and return the metadata found in that HTML."""
    return get_metadata_(stream.read())
# Metadata field -> NAME used in HTML comment declarations of the form
# <!-- NAME="value" -->.
COMMENT_NAMES = {
'title': 'TITLE',
'authors': 'AUTHOR',
'publisher': 'PUBLISHER',
'isbn': 'ISBN',
'languages': 'LANGUAGE',
'pubdate': 'PUBDATE',
'timestamp': 'TIMESTAMP',
'series': 'SERIES',
'series_index': 'SERIESNUMBER',
'rating': 'RATING',
'comments': 'COMMENTS',
'tags': 'TAGS',
}
# Metadata field -> accepted values of the name attribute of a <meta> tag.
# Names are lower-cased before lookup, and ':' separators are also tried
# as '.'.
META_NAMES = {
'title' : ('dc.title', 'dcterms.title', 'title'),
'authors': ('author', 'dc.creator.aut', 'dcterms.creator.aut', 'dc.creator'),
'publisher': ('publisher', 'dc.publisher', 'dcterms.publisher'),
'isbn': ('isbn',),
'languages': ('dc.language', 'dcterms.language'),
'pubdate': ('pubdate', 'date of publication', 'dc.date.published', 'dc.date.publication', 'dc.date.issued', 'dcterms.issued'),
'timestamp': ('timestamp', 'date of creation', 'dc.date.created', 'dc.date.creation', 'dcterms.created'),
'series': ('series',),
'series_index': ('seriesnumber', 'series_index', 'series.index'),
'rating': ('rating',),
'comments': ('comments', 'dc.description'),
'tags': ('tags',),
}
# Reverse lookups: comment NAME -> field, and meta name -> field.
rmap_comment = {v:k for k, v in iteritems(COMMENT_NAMES)}
rmap_meta = {v:k for k, l in iteritems(META_NAMES) for v in l}
# Extract an HTML attribute value, supports both single and double quotes and
# single quotes inside double quotes and vice versa.
# The conditional groups make the closing quote match the opening one; the
# value itself must be non-empty.
attr_pat = r'''(?:(?P<sq>')|(?P<dq>"))(?P<content>(?(sq)[^']+|[^"]+))(?(sq)'|")'''
def handle_comment(data, comment_tags):
    """Collect ``NAME="value"`` declarations from the text of one HTML comment.

    For every declaration whose NAME is a key of ``rmap_comment``, the value
    (with entities replaced) is appended to ``comment_tags[field]``. The
    compiled pattern is cached on the function object after the first call.
    """
    pat = getattr(handle_comment, 'pat', None)
    if pat is None:
        pat = re.compile(r'''(?P<name>\S+)\s*=\s*%s''' % attr_pat)
        handle_comment.pat = pat
    for match in pat.finditer(data):
        field = rmap_comment.get(match.group('name'))
        if field:
            comment_tags[field].append(replace_entities(match.group('content')))
def parse_metadata(src):
    """Parse HTML text and collect its raw metadata declarations.

    Returns ``(comment_tags, meta_tags, meta_tag_ids, title)``:

    * ``comment_tags`` -- field -> values declared in HTML comments
    * ``meta_tags`` -- field -> ``content`` values of recognised meta tags
    * ``meta_tag_ids`` -- lower-cased identifier scheme -> ``content`` values
    * ``title`` -- text of the first ``title`` element with text, else ``''``
    """
    root = parse(src)
    comment_tags = defaultdict(list)
    meta_tags = defaultdict(list)
    meta_tag_ids = defaultdict(list)
    identifier_pat = re.compile(r'(?:dc|dcterms)[.:]identifier(?:\.|$)', flags=re.IGNORECASE)
    id_pat2 = re.compile(r'(?:dc|dcterms)[.:]identifier$', flags=re.IGNORECASE)

    for comment in root.iterdescendants(tag=Comment):
        if comment.text:
            handle_comment(comment.text, comment_tags)

    title = ''
    for candidate in root.iterdescendants(tag='title'):
        if candidate.text:
            title = candidate.text
            break

    for meta in root.iterdescendants(tag='meta'):
        name = meta.get('name')
        content = meta.get('content')
        if not (name and content):
            continue
        if identifier_pat.match(name) is None:
            # Ordinary field: exact lower-cased name first, then ':' -> '.'.
            lowered = name.lower()
            if lowered in rmap_meta:
                field = rmap_meta[lowered]
            else:
                field = rmap_meta.get(lowered.replace(':', '.'))
            if field:
                meta_tags[field].append(content)
            continue
        # Identifier: the scheme comes from the scheme attribute of a bare
        # "dc.identifier", or from the third name component when there is
        # no scheme attribute.
        scheme = None
        if id_pat2.match(name) is not None:
            scheme = meta.get('scheme')
        else:
            elements = re.split(r'[.:]', name)
            if len(elements) == 3 and not meta.get('scheme'):
                scheme = elements[2].strip()
        if scheme:
            meta_tag_ids[scheme.lower()].append(content)
    return comment_tags, meta_tags, meta_tag_ids, title
def get_metadata_(src, encoding=None):
    """Build a ``Metadata`` object from HTML source.

    :param src: HTML as text or bytes. Bytes are decoded with ``encoding``
        when given, otherwise through ``xml_to_unicode``.
    :param encoding: optional encoding for byte input.
    :return: the populated ``Metadata`` object.

    Only the first 150000 characters are examined. For each field, values
    declared in HTML comments take precedence over ``<meta>`` tags.
    """
    # Meta data definitions as in
    # https://www.mobileread.com/forums/showpost.php?p=712544&postcount=9
    if isbytestring(src):
        if not encoding:
            src = xml_to_unicode(src)[0]
        else:
            src = src.decode(encoding, 'replace')
    src = src[:150000]  # Searching shouldn't take too long
    comment_tags, meta_tags, meta_tag_ids, title_tag = parse_metadata(src)

    def get_all(field):
        # All non-blank values for the field, or None when there are none.
        ans = comment_tags.get(field, meta_tags.get(field, None))
        if ans:
            ans = [x.strip() for x in ans if x.strip()]
        if not ans:
            ans = None
        return ans

    def get(field):
        # First non-blank value for the field, or None.
        ans = get_all(field)
        if ans:
            ans = ans[0]
        return ans

    # Title
    title = get('title') or title_tag.strip() or _('Unknown')

    # Author
    authors = authors_to_string(get_all('authors')) or _('Unknown')

    # Create MetaInformation with Title and Author
    mi = Metadata(title, string_to_authors(authors))

    # Single-value text fields
    for field in ('publisher', 'isbn'):
        val = get(field)
        if val:
            setattr(mi, field, val)

    # Multi-value text fields
    for field in ('languages',):
        val = get_all(field)
        if val:
            setattr(mi, field, val)

    # HTML fields
    for field in ('comments',):
        val = get(field)
        if val:
            setattr(mi, field, val.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;').replace('"', '&quot;').replace("'", '&apos;'))

    # Date fields
    for field in ('pubdate', 'timestamp'):
        try:
            val = parse_date(get(field))
        except Exception:
            # An unparseable date leaves the field unset.
            continue
        if not is_date_undefined(val):
            setattr(mi, field, val)

    # SERIES
    series = get('series')
    if series:
        # A trailing "[n]" in the series name carries the series index.
        pat = re.compile(r'\[([.0-9]+)\]$')
        match = pat.search(series)
        series_index = None
        if match is not None:
            try:
                series_index = float(match.group(1))
            except ValueError:
                pass
            series = series.replace(match.group(), '').strip()
        mi.series = series
        if series_index is None:
            try:
                series_index = float(get('series_index'))
            except (TypeError, ValueError):
                # Missing or non-numeric: leave the index unset rather than
                # storing the raw string on mi.series_index.
                series_index = None
        if series_index is not None:
            mi.series_index = series_index

    # RATING
    rating = get('rating')
    if rating:
        try:
            mi.rating = float(rating)
            # Ratings outside the 0-10 range are reset to 0.
            if mi.rating < 0:
                mi.rating = 0
            if mi.rating > 10:
                mi.rating = 0
        except Exception:
            pass

    # TAGS
    tags = get_all('tags')
    if tags:
        tags = [x.strip() for s in tags for x in s.split(',') if x.strip()]
        if tags:
            mi.tags = tags

    # IDENTIFIERS
    for (k, v) in iteritems(meta_tag_ids):
        v = [x.strip() for x in v if x.strip()]
        if v:
            mi.set_identifier(k, v[0])

    return mi
class MetadataHtmlTest(unittest.TestCase):
    """Tests for reading metadata from HTML via ``get_metadata``.

    The fixture document is cumulative: each level listed in ``get_stream``
    includes the markup of every earlier level.
    """

    def compare_metadata(self, meta_a, meta_b):
        """Assert that both metadata objects agree on every compared field."""
        for attr in (
            'title', 'authors', 'publisher', 'isbn', 'languages', 'pubdate', 'timestamp', 'series',
            'series_index', 'rating', 'comments', 'tags', 'identifiers'
        ):
            self.assertEqual(getattr(meta_a, attr), getattr(meta_b, attr))

    def get_stream(self, test):
        """Return a ``BytesIO`` holding the HTML fixture for level ``test``.

        Levels, each a superset of the previous one: ``'title'``,
        ``'meta_single'``, ``'meta_multi'``, ``'comment_single'``,
        ``'comment_multi'``. Any other value yields only the empty
        head/body skeleton.
        """
        from io import BytesIO
        raw = b'''\
<html>
<head>
'''
        # <title> element only.
        if test in {'title', 'meta_single', 'meta_multi', 'comment_single', 'comment_multi'}:
            raw += b'''\
}
<title>A Title Tag &amp;amp; Title &#x24B8;</title>
'''
        # One <meta> tag per field, plus several identifier tags that the
        # content values themselves label as invalid.
        if test in {'meta_single', 'meta_multi', 'comment_single', 'comment_multi'}:
            raw += b'''\
<meta name="dc:title" content="A Meta Tag &amp;amp; Title &#9400;" />
<meta name="dcterms.creator.aut" content="George Washington" />
<meta name="dc.publisher" content="Publisher A" />
<meta name="isbn" content="1234567890" />
<meta name="dc.language" content="English" />
<meta name="dc.date.published" content="2019-01-01" />
<meta name="dcterms.created" content="2018-01-01" />
<meta name="series" content="Meta Series" />
<meta name="seriesnumber" content="1" />
<meta name="rating" content="" />
<meta name="dc.description" content="" />
<meta name="tags" content="tag a, tag b" />
<meta name="dc.identifier.url" content="" />
<meta name="dc.identifier" scheme="" content="invalid" />
<meta name="dc.identifier." content="still invalid" />
<meta name="dc.identifier.conflicting" scheme="schemes" content="are also invalid" />
<meta name="dc.identifier.custom.subid" content="invalid too" />
'''
        # A second <meta> tag for each field.
        if test in {'meta_multi', 'comment_single', 'comment_multi'}:
            raw += b'''\
<meta name="title" content="A Different Meta Tag &amp;amp; Title &#9400;" />
<meta name="author" content="John Adams with Thomas Jefferson" />
<meta name="publisher" content="Publisher B" />
<meta name="isbn" content="2345678901" />
<meta name="dcterms.language" content="Spanish" />
<meta name="date of publication" content="2017-01-01" />
<meta name="timestamp" content="2016-01-01" />
<meta name="series" content="Another Meta Series" />
<meta name="series.index" content="2" />
<meta name="rating" content="8" />
<meta name="comments" content="meta &quot;comments&quot; &#x2665; HTML &amp;amp;" />
<meta name="tags" content="tag c" />
<meta name="dc.identifier.url" content="http://google.com/search?q=calibre" />
'''
        # One comment declaration per field; RATING is deliberately out of
        # range.
        if test in {'comment_single', 'comment_multi'}:
            raw += b'''\
<!-- TITLE="A Comment Tag &amp;amp; Title &#9400;" -->
<!-- AUTHOR="James Madison and James Monroe" -->
<!-- PUBLISHER="Publisher C" -->
<!-- ISBN="3456789012" -->
<!-- LANGUAGE="French" -->
<!-- PUBDATE="2015-01-01" -->
<!-- TIMESTAMP="2014-01-01" -->
<!-- SERIES="Comment Series" -->
<!-- SERIESNUMBER="3" -->
<!-- RATING="20" -->
<!-- COMMENTS="comment &quot;comments&quot; &#x2665; HTML -- too &amp;amp;" -->
<!-- TAGS="tag d" -->
'''
        # A second comment declaration for each field.
        if test in {'comment_multi'}:
            raw += b'''\
<!-- TITLE="Another Comment Tag &amp;amp; Title &#9400;" -->
<!-- AUTHOR="John Quincy Adams" -->
<!-- PUBLISHER="Publisher D" -->
<!-- ISBN="4567890123" -->
<!-- LANGUAGE="Japanese" -->
<!-- PUBDATE="2013-01-01" -->
<!-- TIMESTAMP="2012-01-01" -->
<!-- SERIES="Comment Series 2" -->
<!-- SERIESNUMBER="4" -->
<!-- RATING="1" -->
<!-- COMMENTS="comment &quot;comments&quot; &#x2665; HTML -- too &amp;amp; for sure" -->
<!-- TAGS="tag e, tag f" -->
'''
        raw += b'''\
</head>
<body>
</body>
</html>
'''
        return BytesIO(raw)

    def test_input_title(self):
        # With only a <title> element, the title comes from it and the
        # author falls back to 'Unknown'.
        stream_meta = get_metadata(self.get_stream('title'))
        canon_meta = Metadata('A Title Tag &amp; Title Ⓒ', [_('Unknown')])
        self.compare_metadata(stream_meta, canon_meta)

    def test_input_meta_single(self):
        # <meta> values override the <title> element; the empty rating and
        # description tags are expected to leave those fields unset.
        stream_meta = get_metadata(self.get_stream('meta_single'))
        canon_meta = Metadata('A Meta Tag &amp; Title Ⓒ', ['George Washington'])
        canon_meta.publisher = 'Publisher A'
        canon_meta.languages = ['English']
        canon_meta.pubdate = parse_date('2019-01-01')
        canon_meta.timestamp = parse_date('2018-01-01')
        canon_meta.series = 'Meta Series'
        canon_meta.series_index = float(1)
        # canon_meta.rating = float(0)
        # canon_meta.comments = ''
        canon_meta.tags = ['tag a', 'tag b']
        canon_meta.set_identifiers({'isbn': '1234567890'})
        self.compare_metadata(stream_meta, canon_meta)

    def test_input_meta_multi(self):
        # Single-value fields keep the first non-empty value; authors,
        # languages and tags accumulate across tags.
        stream_meta = get_metadata(self.get_stream('meta_multi'))
        canon_meta = Metadata('A Meta Tag &amp; Title Ⓒ', ['George Washington', 'John Adams', 'Thomas Jefferson'])
        canon_meta.publisher = 'Publisher A'
        canon_meta.languages = ['English', 'Spanish']
        canon_meta.pubdate = parse_date('2019-01-01')
        canon_meta.timestamp = parse_date('2018-01-01')
        canon_meta.series = 'Meta Series'
        canon_meta.series_index = float(1)
        canon_meta.rating = float(8)
        canon_meta.comments = 'meta &quot;comments&quot; ♥ HTML &amp;amp;'
        canon_meta.tags = ['tag a', 'tag b', 'tag c']
        canon_meta.set_identifiers({'isbn': '1234567890', 'url': 'http://google.com/search?q=calibre'})
        self.compare_metadata(stream_meta, canon_meta)

    def test_input_comment_single(self):
        # Comment declarations override <meta> tags; RATING="20" is out of
        # range and is expected to be stored as 0. The url identifier still
        # comes from the <meta> tags.
        stream_meta = get_metadata(self.get_stream('comment_single'))
        canon_meta = Metadata('A Comment Tag &amp; Title Ⓒ', ['James Madison', 'James Monroe'])
        canon_meta.publisher = 'Publisher C'
        canon_meta.languages = ['French']
        canon_meta.pubdate = parse_date('2015-01-01')
        canon_meta.timestamp = parse_date('2014-01-01')
        canon_meta.series = 'Comment Series'
        canon_meta.series_index = float(3)
        canon_meta.rating = float(0)
        canon_meta.comments = 'comment &quot;comments&quot; ♥ HTML -- too &amp;amp;'
        canon_meta.tags = ['tag d']
        canon_meta.set_identifiers({'isbn': '3456789012', 'url': 'http://google.com/search?q=calibre'})
        self.compare_metadata(stream_meta, canon_meta)

    def test_input_comment_multi(self):
        # With repeated comment declarations, single-value fields keep the
        # first value; authors, languages and tags accumulate.
        stream_meta = get_metadata(self.get_stream('comment_multi'))
        canon_meta = Metadata('A Comment Tag &amp; Title Ⓒ', ['James Madison', 'James Monroe', 'John Quincy Adams'])
        canon_meta.publisher = 'Publisher C'
        canon_meta.languages = ['French', 'Japanese']
        canon_meta.pubdate = parse_date('2015-01-01')
        canon_meta.timestamp = parse_date('2014-01-01')
        canon_meta.series = 'Comment Series'
        canon_meta.series_index = float(3)
        canon_meta.rating = float(0)
        canon_meta.comments = 'comment &quot;comments&quot; ♥ HTML -- too &amp;amp;'
        canon_meta.tags = ['tag d', 'tag e', 'tag f']
        canon_meta.set_identifiers({'isbn': '3456789012', 'url': 'http://google.com/search?q=calibre'})
        self.compare_metadata(stream_meta, canon_meta)
def find_tests():
    """Return a test suite containing every ``MetadataHtmlTest`` test."""
    loader = unittest.TestLoader()
    return loader.loadTestsFromTestCase(MetadataHtmlTest)

View File

@@ -0,0 +1,243 @@
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
import os, re, collections
from calibre.utils.config import prefs
from calibre.constants import filesystem_encoding
from calibre.ebooks.metadata.opf2 import OPF
from calibre import isbytestring
from calibre.customize.ui import get_file_type_metadata, set_file_type_metadata
from calibre.ebooks.metadata import MetaInformation, string_to_authors
from polyglot.builtins import getcwd, unicode_type
# Priority of each file type as a metadata source. Files are processed in
# ascending priority, so metadata from later entries updates metadata from
# earlier ones. Extensions not listed here get priority 0.
METADATA_PRIORITIES = collections.defaultdict(lambda: 0)
_PRIORITY_ORDER = (
    'html', 'htm', 'xhtml', 'xhtm',
    'rtf', 'fb2', 'pdf', 'prc', 'odt',
    'epub', 'lit', 'lrx', 'lrf', 'mobi',
    'azw', 'azw3', 'azw1', 'rb', 'imp', 'snb',
)
for i, ext in enumerate(_PRIORITY_ORDER):
    METADATA_PRIORITIES[ext] = i + 1
def path_to_ext(path):
    """Return the lower-cased extension of ``path`` without the leading dot."""
    _, suffix = os.path.splitext(path)
    return suffix[1:].lower()
def metadata_from_formats(formats, force_read_metadata=False, pattern=None):
    """Return metadata merged from the files in ``formats``.

    If reading the files fails, fall back to metadata derived from the name
    of the first entry of ``formats``, with authors defaulting to 'Unknown'.

    :param formats: paths of the format files of one book.
    :param force_read_metadata: passed through to the per-file reader.
    :param pattern: optional compiled regex used for filename parsing.
    """
    try:
        return _metadata_from_formats(formats, force_read_metadata, pattern)
    except Exception:
        # Best-effort fallback. Narrowed from a bare except so that
        # KeyboardInterrupt/SystemExit still propagate.
        mi = metadata_from_filename(list(iter(formats))[0], pat=pattern)
        if not mi.authors:
            mi.authors = [_('Unknown')]
        return mi
def _metadata_from_formats(formats, force_read_metadata=False, pattern=None):
    """Merge metadata from every file in ``formats``.

    ``formats`` is sorted in place by ``METADATA_PRIORITIES``. An ``.opf``
    file whose metadata has a title is returned straight away. Otherwise the
    files are read in priority order and merged; merging stops early once
    the result has an ``application_id``. Missing title or authors default
    to 'Unknown'.
    """
    mi = MetaInformation(None, None)
    formats.sort(key=lambda x: METADATA_PRIORITIES[path_to_ext(x)])
    extensions = [path_to_ext(x) for x in formats]

    if 'opf' in extensions:
        from_opf = opf_metadata(formats[extensions.index('opf')])
        if from_opf is not None and from_opf.title:
            return from_opf

    for path, ext in zip(formats, extensions):
        with lopen(path, 'rb') as stream:
            try:
                mi.smart_update(get_metadata(
                    stream, stream_type=ext, use_libprs_metadata=True,
                    force_read_metadata=force_read_metadata,
                    pattern=pattern))
            except Exception:
                # A file that cannot be read is skipped.
                continue
            if getattr(mi, 'application_id', None) is not None:
                return mi

    if not mi.title:
        mi.title = _('Unknown')
    if not mi.authors:
        mi.authors = [_('Unknown')]
    return mi
def get_metadata(stream, stream_type='lrf', use_libprs_metadata=False,
                 force_read_metadata=False, pattern=None):
    """Read metadata from ``stream`` and restore its position afterwards.

    The position is taken from ``stream.tell()`` when available (otherwise
    0) and restored with ``stream.seek()`` whether or not reading succeeds.
    """
    start = stream.tell() if hasattr(stream, 'tell') else 0
    try:
        return _get_metadata(stream, stream_type, use_libprs_metadata,
                             force_read_metadata, pattern)
    finally:
        if hasattr(stream, 'seek'):
            stream.seek(start)
def _get_metadata(stream, stream_type, use_libprs_metadata,
                  force_read_metadata=False, pattern=None):
    """Combine filename, in-file and sidecar OPF metadata for ``stream``.

    The result starts from metadata parsed out of the stream's file name,
    is updated with metadata read from the file itself (when
    ``force_read_metadata`` or the ``read_file_metadata`` preference is
    set), and finally with a readable ``.opf`` file next to the stream.

    :param stream: open file-like object; its ``name`` attribute, if any,
        is used for filename parsing and to locate the sidecar OPF.
    :param stream_type: file extension; aliases are normalised first.
    :param use_libprs_metadata: when true and the sidecar OPF has an
        ``application_id``, the OPF metadata is returned as is.
    :param pattern: optional compiled regex for filename parsing.
    """
    if stream_type:
        stream_type = stream_type.lower()
    # The tuple used to list 'html' twice, so 'htm' was never normalised.
    if stream_type in ('html', 'htm', 'xhtml', 'xhtm', 'xml'):
        stream_type = 'html'
    if stream_type in ('mobi', 'prc', 'azw'):
        stream_type = 'mobi'
    if stream_type in ('odt', 'ods', 'odp', 'odg', 'odf'):
        stream_type = 'odt'

    opf = None
    if hasattr(stream, 'name'):
        c = os.path.splitext(stream.name)[0]+'.opf'
        if os.access(c, os.R_OK):
            opf = opf_metadata(os.path.abspath(c))

    if use_libprs_metadata and getattr(opf, 'application_id', None) is not None:
        return opf

    name = os.path.basename(getattr(stream, 'name', ''))
    # The fallback pattern matches the default filename format produced by calibre
    base = metadata_from_filename(name, pat=pattern, fallback_pat=re.compile(
        r'^(?P<title>.+) - (?P<author>[^-]+)$'))
    if not base.authors:
        base.authors = [_('Unknown')]
    if not base.title:
        base.title = _('Unknown')
    mi = MetaInformation(None, None)
    if force_read_metadata or prefs['read_file_metadata']:
        mi = get_file_type_metadata(stream, stream_type)
    base.smart_update(mi)
    if opf is not None:
        base.smart_update(opf)

    return base
def set_metadata(stream, mi, stream_type='lrf', report_error=None):
    """Write ``mi`` into ``stream`` via ``set_file_type_metadata``.

    A truthy ``stream_type`` is lower-cased first.
    """
    normalized = stream_type.lower() if stream_type else stream_type
    set_file_type_metadata(stream, mi, normalized, report_error=report_error)
def metadata_from_filename(name, pat=None, fallback_pat=None):
    """Derive metadata from a file name using named regex groups.

    :param name: file name, text or bytes. Everything from the last ``.``
        onwards is dropped and underscores become spaces before matching.
    :param pat: compiled regex; defaults to the ``filename_pattern``
        preference.
    :param fallback_pat: compiled regex tried when ``pat`` does not match.
    :return: a ``MetaInformation`` object. The recognised groups are
        ``title``, ``author``, ``series``, ``series_index``, ``isbn``,
        ``publisher``, ``published`` and ``comments``; groups missing from
        the pattern are skipped. With no title, the stripped name is used.
    """
    if isbytestring(name):
        name = name.decode(filesystem_encoding, 'replace')
    name = name.rpartition('.')[0]
    mi = MetaInformation(None, None)
    if pat is None:
        pat = re.compile(prefs.get('filename_pattern'))
    name = name.replace('_', ' ')
    match = pat.search(name)
    if match is None and fallback_pat is not None:
        match = fallback_pat.search(name)
    if match is not None:
        # match.group() raises IndexError for a group name the pattern does
        # not define, hence one try block per optional group.
        try:
            mi.title = match.group('title')
        except IndexError:
            pass
        try:
            au = match.group('author')
            aus = string_to_authors(au)
            if aus:
                mi.authors = aus
                if prefs['swap_author_names'] and mi.authors:
                    def swap(a):
                        # Move the part after the first comma (or, with no
                        # comma, after the first whitespace) to the front.
                        if ',' in a:
                            parts = a.split(',', 1)
                        else:
                            parts = a.split(None, 1)
                        if len(parts) > 1:
                            t = parts[-1]
                            parts = parts[:-1]
                            parts.insert(0, t)
                        return ' '.join(parts)
                    mi.authors = [swap(x) for x in mi.authors]
        except (IndexError, ValueError):
            pass
        try:
            mi.series = match.group('series')
        except IndexError:
            pass
        try:
            si = match.group('series_index')
            mi.series_index = float(si)
        except (IndexError, ValueError, TypeError):
            pass
        try:
            si = match.group('isbn')
            mi.isbn = si
        except (IndexError, ValueError):
            pass
        try:
            publisher = match.group('publisher')
            mi.publisher = publisher
        except (IndexError, ValueError):
            pass
        try:
            pubdate = match.group('published')
            if pubdate:
                from calibre.utils.date import parse_only_date
                mi.pubdate = parse_only_date(pubdate)
        except Exception:
            # Best effort: a missing group or unparseable date leaves
            # pubdate unset. Narrowed from a bare except so
            # KeyboardInterrupt/SystemExit are not swallowed.
            pass
        try:
            comments = match.group('comments')
            mi.comments = comments
        except (IndexError, ValueError):
            pass
    if mi.is_null('title'):
        mi.title = name
    return mi
def opf_metadata(opfpath):
    """Read book metadata, and cover data if readable, from an OPF file.

    :param opfpath: path of the OPF file, or an open file-like object (its
        ``name`` attribute, or the current directory, is then used as the
        base for locating the cover).
    :return: the metadata object, or ``None`` when the OPF has no
        ``application_id`` or reading fails (a traceback is printed).

    A file opened here is closed before returning; a caller-supplied file
    object is left open.
    """
    if hasattr(opfpath, 'read'):
        f = opfpath
        opfpath = getattr(f, 'name', getcwd())
        opened_here = False
    else:
        f = open(opfpath, 'rb')
        opened_here = True
    try:
        opf = OPF(f, os.path.dirname(opfpath))
        if opf.application_id is not None:
            mi = opf.to_book_metadata()
            if hasattr(opf, 'cover') and opf.cover:
                cpath = os.path.join(os.path.dirname(opfpath), opf.cover)
                if os.access(cpath, os.R_OK):
                    fmt = cpath.rpartition('.')[-1]
                    # Separate name so the OPF handle ``f`` is not rebound.
                    with open(cpath, 'rb') as cover_file:
                        data = cover_file.read()
                    mi.cover_data = (fmt, data)
            return mi
    except Exception:
        import traceback
        traceback.print_exc()
    finally:
        if opened_here:
            f.close()
def forked_read_metadata(path, tdir):
    """Read metadata from the file at ``path`` and write the results to ``tdir``.

    Files written: ``size.txt`` (file size as ASCII digits), ``cover.jpg``
    (only when cover data is present) and ``metadata.opf``.
    """
    from calibre.ebooks.metadata.opf2 import metadata_to_opf
    with lopen(path, 'rb') as src:
        fmt = path_to_ext(path)
        # Measure the file by seeking to its end.
        src.seek(0, 2)
        size = src.tell()
        with lopen(os.path.join(tdir, 'size.txt'), 'wb') as size_file:
            size_file.write(unicode_type(size).encode('ascii'))
        src.seek(0)
        mi = get_metadata(src, fmt)
    if mi.cover_data and mi.cover_data[1]:
        with lopen(os.path.join(tdir, 'cover.jpg'), 'wb') as cover_file:
            cover_file.write(mi.cover_data[1])
        # The cover now lives in its own file; refer to it by name.
        mi.cover_data = (None, None)
        mi.cover = 'cover.jpg'
    opf = metadata_to_opf(mi, default_lang='und')
    with lopen(os.path.join(tdir, 'metadata.opf'), 'wb') as opf_file:
        opf_file.write(opf)

View File

@@ -0,0 +1,302 @@
#!/usr/bin/python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
#
# Copyright (C) 2006 Søren Roug, European Environment Agency
#
# This is free software. You may redistribute it under the terms
# of the Apache license and the GNU General Public License Version
# 2 or at your option any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public
# License along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#
# Contributor(s):
#
from __future__ import absolute_import, division, print_function, unicode_literals
import io
import json
import os
import re
from lxml.etree import fromstring, tostring
from calibre.ebooks.metadata import (
MetaInformation, authors_to_string, check_isbn, string_to_authors
)
from calibre.utils.date import isoformat, parse_date
from calibre.utils.imghdr import identify
from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1
from calibre.utils.zipfile import ZipFile, safe_replace
from odf.draw import Frame as odFrame, Image as odImage
from odf.namespaces import DCNS, METANS, OFFICENS
from odf.opendocument import load as odLoad
from polyglot.builtins import as_unicode
# Short field name -> (namespace, local tag name) of the corresponding
# element in an ODF meta.xml document.
fields = {
'title': (DCNS, 'title'),
'description': (DCNS, 'description'),
'subject': (DCNS, 'subject'),
'creator': (DCNS, 'creator'),
'date': (DCNS, 'date'),
'language': (DCNS, 'language'),
'generator': (METANS, 'generator'),
'initial-creator': (METANS, 'initial-creator'),
'keyword': (METANS, 'keyword'),
'keywords': (METANS, 'keywords'),
'editing-duration': (METANS, 'editing-duration'),
'editing-cycles': (METANS, 'editing-cycles'),
'printed-by': (METANS, 'printed-by'),
'print-date': (METANS, 'print-date'),
'creation-date': (METANS, 'creation-date'),
'user-defined': (METANS, 'user-defined'),
# 'template': (METANS, 'template'),
}
def get_metadata(stream, extract_cover=True):
    """Read metadata from the ``meta.xml`` member of an ODF zip ``stream``.

    Standard fields (title, creator, description, language, keywords) are
    read first. When the document has a truthy ``opf.metadata`` user-defined
    entry, the ``opf.*`` user-defined entries are applied on top. Unless
    ``opf.nocover`` is set, a cover is looked up with ``read_cover``;
    failures there are ignored.
    """
    ws_run = re.compile(r'\s+')

    def normalize(s):
        return ws_run.sub(' ', s).strip()

    with ZipFile(stream) as zf:
        root = fromstring(zf.read('meta.xml'))

        def find(field):
            # Normalised text of the first matching element, or None.
            ns, tag = fields[field]
            hits = root.xpath('//ns0:{}'.format(tag), namespaces={'ns0': ns})
            if not hits:
                return None
            text = tostring(hits[0], method='text', encoding='unicode', with_tail=False)
            return normalize(text).strip()

        mi = MetaInformation(None, [])
        title = find('title')
        if title:
            mi.title = title
        creator = find('initial-creator') or find('creator')
        if creator:
            mi.authors = string_to_authors(creator)
        desc = find('description')
        if desc:
            mi.comments = desc
        lang = find('language')
        canonical = canonicalize_lang(lang) if lang else None
        if canonical:
            mi.languages = [canonical]
        kw = find('keyword') or find('keywords')
        if kw:
            mi.tags = [x.strip() for x in kw.split(',') if x.strip()]

        # Collect user-defined entries, keyed by lower-cased name.
        data = {}
        user_ns = fields['user-defined'][0]
        for tag in root.xpath('//ns0:user-defined', namespaces={'ns0': user_ns}):
            name = (tag.get('{%s}name' % METANS) or '').lower()
            vtype = tag.get('{%s}value-type' % METANS) or 'string'
            val = tag.text
            if not (name and val):
                continue
            data[name] = (val == 'true') if vtype == 'boolean' else val

        opfmeta = bool(data.get('opf.metadata'))  # we need this later for the cover
        opfnocover = False
        if opfmeta:
            # custom metadata contains OPF information
            title_sort = data.get('opf.titlesort', '')
            if title_sort:
                mi.title_sort = title_sort
            opf_authors = data.get('opf.authors', '')
            if opf_authors:
                mi.authors = string_to_authors(opf_authors)
            author_sort = data.get('opf.authorsort', '')
            if author_sort:
                mi.author_sort = author_sort
            raw_isbn = data.get('opf.isbn', '')
            if raw_isbn:
                isbn = check_isbn(raw_isbn)
                if isbn is not None:
                    mi.isbn = isbn
            publisher = data.get('opf.publisher', '')
            if publisher:
                mi.publisher = publisher
            pubdate = data.get('opf.pubdate', '')
            if pubdate:
                mi.pubdate = parse_date(pubdate, assume_utc=True)
            identifiers = data.get('opf.identifiers')
            if identifiers:
                try:
                    mi.identifiers = json.loads(identifiers)
                except Exception:
                    pass
            rating = data.get('opf.rating')
            if rating:
                try:
                    mi.rating = max(0, min(float(rating), 10))
                except Exception:
                    pass
            series = data.get('opf.series', '')
            if series:
                mi.series = series
                series_index = data.get('opf.seriesindex', '')
                if series_index:
                    try:
                        mi.series_index = float(series_index)
                    except Exception:
                        mi.series_index = 1.0
            opf_lang = data.get('opf.language', '')
            if opf_lang:
                cl = canonicalize_lang(opf_lang)
                if cl:
                    mi.languages = [cl]
            opfnocover = data.get('opf.nocover', False)
        if not opfnocover:
            try:
                read_cover(stream, zf, mi, opfmeta, extract_cover)
            except Exception:
                pass  # Do not let an error reading the cover prevent reading other data

    return mi
def set_metadata(stream, mi):
    """Replace the ``meta.xml`` member of the ODF zip ``stream``.

    The current ``meta.xml`` is read, updated from ``mi`` by
    ``_set_metadata`` and written back with ``safe_replace``.
    """
    with ZipFile(stream) as zf:
        # zf.read() leaves no member handle open, unlike zf.open().read().
        raw = _set_metadata(zf.read('meta.xml'), mi)
        # print(raw.decode('utf-8'))
    # Rewind to the start. The old call passed os.SEEK_SET as the *offset*,
    # which only worked because that constant equals 0.
    stream.seek(0)
    safe_replace(stream, "meta.xml", io.BytesIO(raw))
def _set_metadata(raw, mi):
    """Return ``raw`` (a meta.xml document) updated with the fields of ``mi``.

    Every non-null field of ``mi`` replaces the matching standard element
    and/or an ``opf.*`` user-defined entry. The first time a user-defined
    entry is written, an ``opf.metadata`` boolean marker entry is written
    as well. The result is pretty-printed UTF-8 bytes.
    """
    root = fromstring(raw)
    namespaces = {'office': OFFICENS, 'meta': METANS, 'dc': DCNS}
    prefix_for_ns = {uri: prefix for prefix, uri in namespaces.items()}
    state = {'sentinel_added': False}

    def xpath(expr, parent=root):
        return parent.xpath(expr, namespaces=namespaces)

    meta = xpath('//office:meta')[0]

    def remove(*tag_names):
        # Drop every descendant of <office:meta> with one of the given names.
        for tag_name in tag_names:
            qualified = '{}:{}'.format(prefix_for_ns[fields[tag_name][0]], tag_name)
            for node in xpath('descendant::' + qualified, meta):
                node.getparent().remove(node)

    def add(tag, val=None):
        node = meta.makeelement('{%s}%s' % fields[tag])
        node.text = val
        meta.append(node)
        return node

    def remove_user_metadata(*names):
        for node in xpath('//meta:user-defined'):
            if (node.get('{%s}name' % METANS) or '').lower() in names:
                node.getparent().remove(node)

    def add_um(name, val, vtype='string'):
        node = add('user-defined', val)
        node.set('{%s}value-type' % METANS, vtype)
        node.set('{%s}name' % METANS, name)

    def add_user_metadata(name, val):
        if not state['sentinel_added']:
            # Written once per call of _set_metadata, before the first entry.
            state['sentinel_added'] = True
            remove_user_metadata('opf.metadata')
            add_um('opf.metadata', 'true', 'boolean')
        val_type = 'string'
        if hasattr(val, 'strftime'):
            # Date-like values are stored as a plain date.
            val = isoformat(val, as_utc=True).split('T')[0]
            val_type = 'date'
        add_um(name, val, val_type)

    if not mi.is_null('title'):
        remove('title')
        add('title', mi.title)
        if not mi.is_null('title_sort'):
            remove_user_metadata('opf.titlesort')
            add_user_metadata('opf.titlesort', mi.title_sort)
    if not mi.is_null('authors'):
        remove('initial-creator', 'creator')
        val = authors_to_string(mi.authors)
        add('initial-creator', val)
        add('creator', val)
        remove_user_metadata('opf.authors')
        add_user_metadata('opf.authors', val)
        if not mi.is_null('author_sort'):
            remove_user_metadata('opf.authorsort')
            add_user_metadata('opf.authorsort', mi.author_sort)
    if not mi.is_null('comments'):
        remove('description')
        add('description', mi.comments)
    if not mi.is_null('tags'):
        remove('keyword')
        add('keyword', ', '.join(mi.tags))
    if not mi.is_null('languages'):
        lang = lang_as_iso639_1(mi.languages[0])
        if lang:
            remove('language')
            add('language', lang)
    if not mi.is_null('pubdate'):
        remove_user_metadata('opf.pubdate')
        add_user_metadata('opf.pubdate', mi.pubdate)
    if not mi.is_null('publisher'):
        remove_user_metadata('opf.publisher')
        add_user_metadata('opf.publisher', mi.publisher)
    if not mi.is_null('series'):
        remove_user_metadata('opf.series', 'opf.seriesindex')
        add_user_metadata('opf.series', mi.series)
        add_user_metadata('opf.seriesindex', '{}'.format(mi.series_index))
    if not mi.is_null('identifiers'):
        remove_user_metadata('opf.identifiers')
        add_user_metadata('opf.identifiers', as_unicode(json.dumps(mi.identifiers)))
    if not mi.is_null('rating'):
        remove_user_metadata('opf.rating')
        add_user_metadata('opf.rating', '%.2g' % mi.rating)

    return tostring(root, encoding='utf-8', pretty_print=True)
def read_cover(stream, zin, mi, opfmeta, extract_cover):
    """
    Find the cover image of an ODF document and record it on ``mi``.

    A draw:image inside a draw:frame named 'opf.cover' is used when
    ``opfmeta`` is true. Otherwise (or as a fallback) the first readable
    image is used, provided it has a suitable size (borrowed from docx).

    @param stream: Stream handed to odLoad to load the document.
    @param zin: Object whose read(name) returns the raw bytes of a member.
    @param mi: Metadata object; cover, odf_cover_frame and (optionally)
               cover_data are set on it when a cover is found.
    @param opfmeta: Whether to look for a frame named 'opf.cover'.
    @param extract_cover: Whether to also store the image data on ``mi``.
    """
    document = odLoad(stream)
    cover_href = cover_data = cover_frame = None
    readable_images = 0
    for frame in document.topnode.getElementsByType(odFrame):
        images = frame.getElementsByType(odImage)
        if not images:
            continue
        href = images[0].getAttribute('href')
        try:
            raw = zin.read(href)
        except KeyError:
            continue
        try:
            fmt, width, height = identify(raw)
        except Exception:
            continue
        readable_images += 1
        if opfmeta:
            frame_name = frame.getAttribute('name')
            if frame_name.lower() == 'opf.cover':
                # Keep the name as written, it could have upper case
                cover_href, cover_data, cover_frame = href, (fmt, raw), frame_name
                break
        is_first_candidate = cover_href is None and readable_images == 1
        if is_first_candidate and 0.8 <= height/width <= 1.8 and height*width >= 12000:
            # Pick the first image as the cover if it is of a suitable size
            cover_href, cover_data = href, (fmt, raw)
            if not opfmeta:
                break

    if cover_href is None:
        return
    mi.cover = cover_href
    mi.odf_cover_frame = cover_frame
    if not extract_cover:
        return
    if not cover_data:
        raw = zin.read(cover_href)
        try:
            fmt = identify(raw)[0]
        except Exception:
            pass
        else:
            cover_data = (fmt, raw)
    mi.cover_data = cover_data

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,251 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2008, Kovid Goyal <kovid at kovidgoyal.net>
"""
Edit metadata in RTF files.
"""
from __future__ import absolute_import, division, print_function, unicode_literals
import codecs
import re
from calibre import force_unicode
from calibre.ebooks.metadata import MetaInformation
from polyglot.builtins import codepoint_to_chr, string_or_bytes, unicode_type, int_to_byte, filter
# Byte patterns used to pull individual fields out of an RTF \info block.
# Each one matches from '{\info' up to the first '{\<keyword>' group and
# captures that group's content up to the first '}' that is not preceded by
# a backslash. re.DOTALL lets the content span multiple lines.
# \title -> title
title_pat = re.compile(br'\{\\info.*?\{\\title(.*?)(?<!\\)\}', re.DOTALL)
# \author -> authors
author_pat = re.compile(br'\{\\info.*?\{\\author(.*?)(?<!\\)\}', re.DOTALL)
# \subject -> comments
comment_pat = re.compile(br'\{\\info.*?\{\\subject(.*?)(?<!\\)\}', re.DOTALL)
# \category -> tags
tags_pat = re.compile(br'\{\\info.*?\{\\category(.*?)(?<!\\)\}', re.DOTALL)
# \manager -> publisher
publisher_pat = re.compile(br'\{\\info.*?\{\\manager(.*?)(?<!\\)\}', re.DOTALL)
def get_document_info(stream):
    """
    Extract the \\info block from an RTF file.

    Return the info block as a bytestring and the position in the file at
    which it starts. If no info block is found, return (None, 0). If the
    file ends before the info block is closed, the (unterminated) data read
    so far is returned instead of looping forever.

    @param stream: File like object pointing to the RTF file.
    """
    block_size = 4096
    stream.seek(0)
    found, block = False, b""
    while not found:
        # Carry over the tail of the previous block so that a marker that
        # straddles two reads is still found.
        prefix = block[-6:]
        block = prefix + stream.read(block_size)
        actual_block_size = len(block) - len(prefix)
        if len(block) == len(prefix):
            # Nothing more to read
            break
        idx = block.find(br'{\info')
        if idx >= 0:
            found = True
            pos = stream.tell() - actual_block_size + idx - len(prefix)
            stream.seek(pos)
        else:
            # Stop searching once a \sect control word has been seen
            if block.find(br'\sect') > -1:
                break
    if not found:
        return None, 0

    data, count = [], 0
    pos = stream.tell()
    while True:
        ch = stream.read(1)
        if not ch:
            # EOF inside an unbalanced group: without this check read(1)
            # keeps returning b'' and the loop never terminates.
            break
        if ch == b'\\':
            # Keep the escaped character with its backslash so that escaped
            # braces do not affect the nesting count.
            data.append(ch + stream.read(1))
            continue
        if ch == b'{':
            count += 1
        elif ch == b'}':
            count -= 1
        data.append(ch)
        if count == 0:
            break
    return b''.join(data), pos
def detect_codepage(stream):
    """
    Look for an \\ansicpgN control word in the next 512 bytes of stream.

    Return the matching Python codec name ('cp' + N, with N == 0 treated as
    1252) if Python knows that codec, otherwise None.
    """
    found = re.search(br'\\ansicpg(\d+)', stream.read(512))
    if found is None:
        return None
    digits = found.group(1)
    if digits == b'0':
        digits = b'1252'
    try:
        codec = 'cp' + digits.decode('ascii')
        codecs.lookup(codec)
    except Exception:
        return None
    return codec
def encode(unistr):
    """
    Convert text to an ASCII-only string for use in RTF.

    Characters with a code point below 128 are kept as they are, every other
    character is written as \\uN? where N is its decimal code point. Input
    that is not already unicode is first passed through force_unicode.
    """
    text = unistr if isinstance(unistr, unicode_type) else force_unicode(unistr)
    pieces = []
    for ch in text:
        codepoint = ord(ch)
        pieces.append(ch if codepoint < 128 else '\\u{}?'.format(codepoint))
    return ''.join(pieces)
def decode(raw, codec):
    """
    Convert RTF character escapes in raw to the characters they stand for.

    @param raw: Text from an RTF file. Bytes are decoded as ASCII, with
                undecodable bytes replaced.
    @param codec: Codec name used for \\'hh escapes, or None to leave those
                  escapes untouched.
    @return: Unicode text. Escapes that cannot be converted become '?'.
    """
    # https://en.wikipedia.org/wiki/Rich_Text_Format#Character_encoding
    def codepage(match):
        # \'hh is a single byte in the document code page
        try:
            return int_to_byte(int(match.group(1), 16)).decode(codec)
        except ValueError:
            return '?'

    def uni(match):
        try:
            num = int(match.group(1))
            if num < 0:
                # \uN is a signed 16-bit number, so code points above 32767
                # may be written as negative values
                num += 0x10000
            return codepoint_to_chr(num)
        except Exception:
            return '?'

    if isinstance(raw, bytes):
        raw = raw.decode('ascii', 'replace')
    if codec is not None:
        raw = re.sub(r"\\'([a-fA-F0-9]{2})", codepage, raw)
    # The character following the number is the fallback for readers that do
    # not understand \u, it is dropped together with the escape
    raw = re.sub(r'\\u(-[0-9]{1,5}|[0-9]{3,5}).', uni, raw)
    return raw
def get_metadata(stream):
    """
    Read the metadata stored in the \\info block of an RTF file.

    Return a MetaInformation object. The title falls back to 'Unknown' when
    the stream is not RTF, has no info block or has no title. Authors,
    comments, tags and publisher are only set when present in the file.
    """
    stream.seek(0)
    if stream.read(5) != br'{\rtf':
        return MetaInformation(_('Unknown'))
    block = get_document_info(stream)[0]
    if not block:
        return MetaInformation(_('Unknown'))
    stream.seek(0)
    cpg = detect_codepage(stream)
    stream.seek(0)

    def extract(pattern):
        # Decoded, stripped content of the field, None if it is absent
        found = pattern.search(block)
        if found is None:
            return None
        return decode(found.group(1).strip(), cpg)

    title = extract(title_pat)
    if title is None:
        title = _('Unknown')
    author = extract(author_pat)
    mi = MetaInformation(title)
    if author:
        mi.authors = [name.strip() for name in author.split(',')]
    comment = extract(comment_pat)
    if comment is not None:
        mi.comments = comment
    tags = extract(tags_pat)
    if tags is not None:
        mi.tags = [tag for tag in (x.strip() for x in tags.split(',')) if tag]
    publisher = extract(publisher_pat)
    if publisher is not None:
        mi.publisher = publisher
    return mi
def create_metadata(stream, options):
    """
    Insert a new \\info block into an RTF stream.

    The block is built from the title, authors, comment(s), publisher and
    tags attributes of options and is written right after the first six
    bytes of the stream. Nothing is written when all of them are empty.
    """
    groups = [r'{\info']

    def emit(template, value):
        groups.append(template % (encode(value),))

    title = options.title
    if title:
        emit(r'{\title %s}', title)
    authors = options.authors
    if authors:
        if not isinstance(authors, string_or_bytes):
            authors = ', '.join(authors)
        emit(r'{\author %s}', authors)
    # Some option objects use "comment", others "comments"
    comments = options.comment if hasattr(options, 'comment') else options.comments
    if comments:
        emit(r'{\subject %s}', comments)
    publisher = options.publisher
    if publisher:
        emit(r'{\manager %s}', publisher)
    tags = options.tags
    if tags:
        emit(r'{\category %s}', u', '.join(tags))

    if len(groups) == 1:
        # Only the opening of the block, there is no metadata to store
        return
    groups.append('}')
    stream.seek(0)
    src = stream.read()
    ans = src[:6] + ''.join(groups).encode('ascii') + src[6:]
    stream.seek(0)
    stream.write(ans)
def set_metadata(stream, options):
    '''
    Modify/add RTF metadata in stream

    If the file has no \\info block a new one is created, otherwise the
    existing fields are replaced (or appended to the block when missing).

    @param stream: Seekable, writable file like object with the RTF file.
    @param options: Object with metadata attributes title, authors,
                    comments, tags and publisher. Attributes that are None
                    are left untouched in an existing info block.
    '''
    def add_metadata_item(src, name, val):
        # Insert a new {\name val} group just before the brace that closes
        # the info block
        index = src.rindex('}')
        return src[:index] + r'{\ '[:-1] + name + ' ' + val + '}}'

    src, pos = get_document_info(stream)
    if src is None:
        create_metadata(stream, options)
    else:
        # latin-1 maps every byte to exactly one character, so info blocks
        # that contain raw 8-bit characters survive the round trip unchanged
        # instead of raising UnicodeDecodeError. New values are pure ASCII.
        src = src.decode('latin-1')
        olen = len(src)

        base_pat = r'\{\\name(.*?)(?<!\\)\}'

        def replace_or_create(src, name, val):
            val = encode(val)
            pat = re.compile(base_pat.replace('name', name), re.DOTALL)
            replacement = '{\\' + name + ' ' + val + '}'
            # Use a function so that the replacement is inserted verbatim.
            # As a template string its backslashes would be interpreted,
            # turning \title into TAB + "itle" and making \u, \s, \c and \m
            # "bad escape" errors.
            src, num = pat.subn(lambda match: replacement, src)
            if num == 0:
                src = add_metadata_item(src, name, val)
            return src

        if options.title is not None:
            src = replace_or_create(src, 'title', options.title)
        if options.comments is not None:
            src = replace_or_create(src, 'subject', options.comments)
        if options.authors is not None:
            authors = options.authors
            # Same handling as in create_metadata: a plain string must not
            # be joined character by character
            if not isinstance(authors, string_or_bytes):
                authors = ', '.join(authors)
            src = replace_or_create(src, 'author', authors)
        if options.tags is not None:
            src = replace_or_create(src, 'category', ', '.join(options.tags))
        if options.publisher is not None:
            src = replace_or_create(src, 'manager', options.publisher)
        # Save everything after the old info block, then rewrite the file
        # from the start of the block onwards
        stream.seek(pos + olen)
        after = stream.read()
        stream.seek(pos)
        stream.truncate()
        stream.write(src.encode('latin-1'))
        stream.write(after)
def find_tests():
    """
    Build the unittest suite for this module: a round trip test that writes
    metadata into a minimal RTF file and reads it back.
    """
    import unittest
    from io import BytesIO
    from calibre.ebooks.metadata.book.base import Metadata

    class Test(unittest.TestCase):

        def test_rtf_metadata(self):
            rtf = BytesIO(br'{\rtf1\ansi\ansicpg1252}')
            written = Metadata('Test ø̄title', ['Author One', 'Author БTwo'])
            written.tags = 'tag1 見tag2'.split()
            written.comments = '<p>some ⊹comments</p>'
            written.publisher = 'publiSher'
            set_metadata(rtf, written)
            rtf.seek(0)
            read_back = get_metadata(rtf)
            for attr in 'title authors publisher comments tags'.split():
                self.assertEqual(getattr(written, attr), getattr(read_back, attr))

    loader = unittest.defaultTestLoader
    return loader.loadTestsFromTestCase(Test)

View File

@@ -0,0 +1,296 @@
#!/usr/bin/env python2
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2010, Kovid Goyal <kovid at kovidgoyal.net>'
import os, glob, re, functools
from collections import Counter
from lxml import etree
from lxml.builder import ElementMaker
from calibre.constants import __appname__, __version__
from calibre.ebooks.chardet import xml_to_unicode
from calibre.utils.xml_parse import safe_xml_fromstring
from calibre.utils.cleantext import clean_xml_chars
from polyglot.builtins import unicode_type, getcwd
from polyglot.urllib import unquote, urlparse
# XML namespace of NCX documents; it is the default namespace in NSMAP
NCX_NS = "http://www.daisy.org/z3986/2005/ncx/"
# Namespace for calibre's own elements, bound to the 'calibre' prefix
CALIBRE_NS = "http://calibre.kovidgoyal.net/2009/metadata"
NSMAP = {None: NCX_NS, 'calibre':CALIBRE_NS}
# Element factories: E builds elements in the NCX namespace, C builds
# elements in the calibre namespace
E = ElementMaker(namespace=NCX_NS, nsmap=NSMAP)
C = ElementMaker(namespace=CALIBRE_NS, nsmap=NSMAP)
def parse_html_toc(data):
    """
    Yield (href, fragment, text) for every <a href> element in an HTML
    table of contents.

    href is the unquoted, stripped path component of the link target,
    fragment is its stripped fragment (None when there is none) and text is
    the text content of the link element.
    """
    from html5_parser import parse
    from calibre.utils.cleantext import clean_xml_chars
    from lxml import etree
    if isinstance(data, bytes):
        data = xml_to_unicode(data, strip_encoding_pats=True, resolve_entities=True)[0]
    root = parse(clean_xml_chars(data), maybe_xhtml=True, keep_doctype=False, sanitize_names=True)
    for anchor in root.xpath('//*[@href and local-name()="a"]'):
        target = urlparse(unquote(anchor.get('href')))
        raw_fragment = target[5]
        fragment = raw_fragment.strip() if raw_fragment else None
        path = target[2].strip()
        label = etree.tostring(anchor, method='text', encoding='unicode')
        yield path, fragment, label
class TOC(list):
    """
    A node of a table of contents tree.

    Every node is a list of its child nodes and carries the link target
    (href, fragment), the display text and some optional extra data. The
    root node normally has no text and only serves as a container.
    """

    def __init__(self, href=None, fragment=None, text=None, parent=None,
            play_order=0, base_path=None, type='unknown', author=None,
            description=None, toc_thumbnail=None):
        """
        @param base_path: Directory relative hrefs are resolved against.
                          Defaults to the current working directory at the
                          time the node is created (not at import time).
        """
        self.href = href
        # Empty fragments are normalised to None
        self.fragment = fragment or None
        self.text = text
        self.parent = parent
        self.base_path = getcwd() if base_path is None else base_path
        self.play_order = play_order
        self.type = type
        self.author = author
        self.description = description
        self.toc_thumbnail = toc_thumbnail

    def __str__(self):
        lines = ['TOC: %s#%s %s'%(self.href, self.fragment, self.text)]
        for child in self:
            # Indent the rendering of every child by one tab
            lines.extend('\t'+l for l in unicode_type(child).splitlines())
        return '\n'.join(lines)

    def count(self, type):
        'Number of nodes in this tree (including self) with the given type'
        # NOTE: this replaces list.count() for TOC objects
        return sum(1 for i in self.flat() if i.type == type)

    def purge(self, types, max=0):
        """
        Remove nodes whose type is in types from the tree, keeping the first
        max of them. Return the list of nodes selected for removal.
        """
        remove = [entry for entry in self.flat() if entry.type in types]
        remove = remove[max:]
        for entry in remove:
            if entry.parent is None:
                continue
            entry.parent.remove(entry)
        return remove

    def remove(self, entry):
        'Remove the child entry from this node and detach it from the tree'
        # TOC nodes are lists, so list.remove() would compare them by list
        # equality and every pair of leaf nodes (empty lists) is "equal".
        # Look for the object itself first so the right child is removed.
        for idx, child in enumerate(self):
            if child is entry:
                del self[idx]
                break
        else:
            # Not a direct child by identity: keep the list.remove()
            # semantics (including the ValueError when nothing matches)
            list.remove(self, entry)
        entry.parent = None

    def add_item(self, href, fragment, text, play_order=None, type='unknown',
            author=None, description=None, toc_thumbnail=None):
        """
        Append a new child node and return it. When play_order is None it is
        one more than that of the last child, or of self if there are no
        children yet.
        """
        if play_order is None:
            play_order = (self[-1].play_order if len(self) else self.play_order) + 1
        self.append(TOC(href=href, fragment=fragment, text=text, parent=self,
                        base_path=self.base_path, play_order=play_order,
                        type=type, author=author, description=description, toc_thumbnail=toc_thumbnail))
        return self[-1]

    def top_level_items(self):
        'Iterate over the direct children that have a text'
        for item in self:
            if item.text is not None:
                yield item

    def depth(self):
        'Number of levels in the tree rooted at self (1 for a leaf)'
        child_depths = [obj.depth() for obj in self]
        return 1 + max(child_depths) if child_depths else 1

    def flat(self):
        'Depth first iteration over the tree rooted at self'
        yield self
        for obj in self:
            for i in obj.flat():
                yield i

    @property
    def abspath(self):
        'Return the file this toc entry points to as a absolute path to a file on the system.'
        if self.href is None:
            return None
        path = self.href.replace('/', os.sep)
        if not os.path.isabs(path):
            path = os.path.join(self.base_path, path)
        return path

    def read_from_opf(self, opfreader):
        """
        Populate this TOC from the table of contents referenced by an OPF.

        The toc attribute of the <spine> is tried first, then the guide
        reference of type 'toc', then any manifest item with 'toc' in its
        href. Failures to read the TOC are reported with a warning.
        """
        toc = opfreader.soup.find('spine', toc=True)
        if toc is not None:
            toc = toc['toc']
        if toc is None:
            try:
                toc = opfreader.soup.find('guide').find('reference', attrs={'type':'toc'})['href']
            except Exception:
                # No usable guide reference, fall back to the manifest
                for item in opfreader.manifest:
                    if 'toc' in item.href().lower():
                        toc = item.href()
                        break

        if toc is not None:
            if toc.lower() not in ('ncx', 'ncxtoc'):
                # Not one of the usual NCX item ids: treat it as the path of
                # an HTML table of contents
                toc = urlparse(unquote(toc))[2]
                toc = toc.replace('/', os.sep)
                if not os.path.isabs(toc):
                    toc = os.path.join(self.base_path, toc)
                try:
                    if not os.path.exists(toc):
                        bn = os.path.basename(toc)
                        bn = bn.replace('_top.htm', '_toc.htm')  # Bug in BAEN OPF files
                        toc = os.path.join(os.path.dirname(toc), bn)

                    self.read_html_toc(toc)
                except Exception:
                    print('WARNING: Could not read Table of Contents. Continuing anyway.')
            else:
                path = opfreader.manifest.item(toc.lower())
                path = getattr(path, 'path', path)
                if path and os.access(path, os.R_OK):
                    try:
                        self.read_ncx_toc(path)
                    except Exception as err:
                        print('WARNING: Invalid NCX file:', err)
                    return
                # The manifest item is not readable, use the first NCX file
                # found in the base directory instead
                cwd = os.path.abspath(self.base_path)
                m = glob.glob(os.path.join(cwd, '*.ncx'))
                if m:
                    toc = m[0]
                    self.read_ncx_toc(toc)

    def read_ncx_toc(self, toc, root=None):
        """
        Populate this TOC from an NCX file.

        @param toc: Path to the NCX file, its directory becomes base_path.
        @param root: Already parsed root element. When None the file at toc
                     is read and parsed.
        Raises ValueError if the document has no navMap element.
        """
        self.base_path = os.path.dirname(toc)
        if root is None:
            with open(toc, 'rb') as f:
                raw = xml_to_unicode(f.read(), assume_utf8=True,
                        strip_encoding_pats=True)[0]
            root = safe_xml_fromstring(raw)
        xpn = {'re': 'http://exslt.org/regular-expressions'}
        XPath = functools.partial(etree.XPath, namespaces=xpn)

        def get_attr(node, default=None, attr='playorder'):
            # First non-empty attribute whose lowercased name ends with attr
            for name, val in node.attrib.items():
                if name and val and name.lower().endswith(attr):
                    return val
            return default

        # Element names are matched case insensitively and by suffix
        nl_path = XPath('./*[re:match(local-name(), "navlabel$", "i")]')
        txt_path = XPath('./*[re:match(local-name(), "text$", "i")]')
        content_path = XPath('./*[re:match(local-name(), "content$", "i")]')
        np_path = XPath('./*[re:match(local-name(), "navpoint$", "i")]')

        def process_navpoint(np, dest):
            try:
                play_order = int(get_attr(np, 1))
            except Exception:
                play_order = 1
            href = fragment = text = None
            # Children are attached to dest when this navpoint is unusable
            nd = dest
            nl = nl_path(np)
            if nl:
                nl = nl[0]
                text = ''
                for txt in txt_path(nl):
                    text += etree.tostring(txt, method='text',
                            encoding='unicode', with_tail=False)
                content = content_path(np)
                if content and text:
                    content = content[0]
                    # if get_attr(content, attr='src'):
                    purl = urlparse(content.get('src'))
                    href, fragment = unquote(purl[2]), unquote(purl[5])
                    nd = dest.add_item(href, fragment, text)
                    nd.play_order = play_order

            for c in np_path(np):
                process_navpoint(c, nd)

        nm = XPath('//*[re:match(local-name(), "navmap$", "i")]')(root)
        if not nm:
            raise ValueError('NCX files must have a <navmap> element.')
        nm = nm[0]

        for child in np_path(nm):
            process_navpoint(child, self)

    def read_html_toc(self, toc):
        """
        Add an entry for every link in the HTML file at path toc, skipping
        links whose (href, fragment) is already present in the tree.
        """
        self.base_path = os.path.dirname(toc)
        # Plain open(), as in read_ncx_toc
        with open(toc, 'rb') as f:
            parsed_toc = parse_html_toc(f.read())
            known = {(i.href, i.fragment) for i in self.flat()}
            for href, fragment, txt in parsed_toc:
                if (href, fragment) not in known:
                    item = self.add_item(href, fragment, txt)
                    # Record the values as stored on the new node, they are
                    # what later links are compared against
                    known.add((item.href, item.fragment))

    def render(self, stream, uid):
        """
        Serialize this TOC as an NCX document and write it to stream.

        @param stream: Object with a write() method accepting bytes.
        @param uid: Value for the dtb:uid meta element.
        """
        root = E.ncx(
                E.head(
                    E.meta(name='dtb:uid', content=unicode_type(uid)),
                    E.meta(name='dtb:depth', content=unicode_type(self.depth())),
                    E.meta(name='dtb:generator', content='%s (%s)'%(__appname__,
                        __version__)),
                    E.meta(name='dtb:totalPageCount', content='0'),
                    E.meta(name='dtb:maxPageNumber', content='0'),
                ),
                E.docTitle(E.text('Table of Contents')),
        )
        navmap = E.navMap()
        root.append(navmap)
        root.set('{http://www.w3.org/XML/1998/namespace}lang', 'en')
        # Running number for the navPoint ids, shared by the nested function
        c = Counter()

        def navpoint(parent, np):
            text = np.text
            if not text:
                text = ''
            c[1] += 1
            item_id = 'num_%d'%c[1]
            text = clean_xml_chars(text)
            elem = E.navPoint(
                    E.navLabel(E.text(re.sub(r'\s+', ' ', text))),
                    E.content(src=unicode_type(np.href)+(('#' + unicode_type(np.fragment))
                        if np.fragment else '')),
                    id=item_id,
                    playOrder=unicode_type(np.play_order)
            )
            au = getattr(np, 'author', None)
            if au:
                au = re.sub(r'\s+', ' ', au)
                elem.append(C.meta(au, name='author'))
            desc = getattr(np, 'description', None)
            if desc:
                desc = re.sub(r'\s+', ' ', desc)
                try:
                    elem.append(C.meta(desc, name='description'))
                except ValueError:
                    # Retry with characters that are invalid in XML removed
                    elem.append(C.meta(clean_xml_chars(desc), name='description'))
            idx = getattr(np, 'toc_thumbnail', None)
            if idx:
                elem.append(C.meta(idx, name='toc_thumbnail'))
            parent.append(elem)
            for np2 in np:
                navpoint(elem, np2)

        for np in self:
            navpoint(navmap, np)
        raw = etree.tostring(root, encoding='utf-8', xml_declaration=True,
                pretty_print=True)
        stream.write(raw)

View File

@@ -0,0 +1,104 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2016, Kovid Goyal <kovid at kovidgoyal.net>
from __future__ import absolute_import, division, print_function, unicode_literals
from collections import namedtuple
from calibre.ebooks.chardet import xml_to_unicode
from calibre.ebooks.oeb.base import OPF
from calibre.ebooks.oeb.polish.utils import guess_type
from calibre.spell import parse_lang_code
from calibre.utils.cleantext import clean_xml_chars
from calibre.utils.localization import lang_as_iso639_1
from calibre.utils.xml_parse import safe_xml_fromstring
from polyglot.builtins import filter, map
OPFVersion = namedtuple('OPFVersion', 'major minor patch')


def parse_opf_version(raw):
    """
    Parse a dotted version string into an OPFVersion.

    Version 2.0.0 is returned when the major part is not a number. When only
    some later part is not a number, all parts after the major one are set
    to zero. Missing parts are zero, parts beyond the third are dropped.
    """
    pieces = (raw or '').split('.')
    try:
        major = int(pieces[0])
    except Exception:
        return OPFVersion(2, 0, 0)
    try:
        numbers = [int(piece) for piece in pieces]
    except Exception:
        numbers = [major, 0, 0]
    # Pad to three components, then cut off anything extra
    numbers = (numbers + [0, 0, 0])[:3]
    return OPFVersion(*numbers)
def parse_opf(stream_or_path):
    """
    Parse an OPF document and return its root element.

    @param stream_or_path: Object with a read() method returning the raw
                           document, or the path of a file to read. A file
                           opened here from a path is closed again; a stream
                           passed in by the caller is left open.
    Raises ValueError if the document is empty or parsing yields no root.
    """
    if hasattr(stream_or_path, 'read'):
        raw = stream_or_path.read()
        name = getattr(stream_or_path, 'name', 'stream')
    else:
        # We opened the file, so we are responsible for closing it
        with open(stream_or_path, 'rb') as stream:
            raw = stream.read()
            name = getattr(stream, 'name', 'stream')
    if not raw:
        raise ValueError('Empty file: %s' % (name,))
    raw, encoding = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True, assume_utf8=True)
    # Drop anything in front of the first tag
    raw = raw[raw.find('<'):]
    root = safe_xml_fromstring(clean_xml_chars(raw))
    if root is None:
        raise ValueError('Not an OPF file')
    return root
def normalize_languages(opf_languages, mi_languages):
    ' Return mi_languages as 2-letter codes where possible, keeping the country codes used in opf_languages '

    def parsed(codes):
        # Parse every code, dropping the ones that are invalid or empty
        result = []
        for code in codes:
            try:
                lang = parse_lang_code(code)
            except ValueError:
                continue
            if lang:
                result.append(lang)
        return result

    # Country code used for each language in the OPF
    cc_map = {}
    for lang in parsed(opf_languages):
        cc_map[lang.langcode] = lang.countrycode

    ans = []
    for lang in parsed(mi_languages):
        lc = lang.langcode
        cc = lang.countrycode or cc_map.get(lc, None)
        lc = lang_as_iso639_1(lc) or lc
        ans.append(lc + '-' + cc if cc else lc)
    return ans
def ensure_unique(template, existing):
    """
    Return a name based on template that is not in existing.

    template itself is returned if it is free. Otherwise -1, -2, ... is
    inserted before the extension (or appended when template has no usable
    extension) until an unused name is found.
    """
    stem, _, ext = template.rpartition('.')
    if stem and ext:
        suffix = '.' + ext
    else:
        stem, suffix = template, ''

    def candidates():
        yield template
        number = 0
        while True:
            number += 1
            yield '%s-%d%s' % (stem, number, suffix)

    for name in candidates():
        if name not in existing:
            return name
def create_manifest_item(root, href_template, id_template, media_type=None):
    """
    Append a new <item> to the manifest of the OPF tree root.

    The href and id are derived from the templates so that they do not clash
    with any href or id already used in the document. When media_type is not
    given it is guessed from href_template.

    Return the new element, or None if the document has no manifest.
    """
    used_ids = frozenset(root.xpath('//*/@id'))
    used_hrefs = frozenset(root.xpath('//*/@href'))
    href = ensure_unique(href_template, used_hrefs)
    item_id = ensure_unique(id_template, used_ids)
    manifest = root.find(OPF('manifest'))
    if manifest is None:
        return None
    item = manifest.makeelement(OPF('item'))
    item.set('href', href)
    item.set('id', item_id)
    item.set('media-type', media_type or guess_type(href_template))
    manifest.append(item)
    return item
def pretty_print_opf(root):
    """Run pretty_opf and then pretty_xml_tree on the OPF tree root."""
    from calibre.ebooks.oeb.polish import pretty
    passes = (pretty.pretty_opf, pretty.pretty_xml_tree)
    for prettify in passes:
        prettify(root)

View File

@@ -0,0 +1,15 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
class MobiError(Exception):
    """Error raised by the MOBI code, for example for invalid HUFF/CDIC headers."""
# Limits for thumbnail images.
# That might be a bit small on the PW, but Amazon/KG 2.5 still uses these values, even when delivered to a PW
# NOTE(review): "PW" presumably means Kindle Paperwhite and "KG" KindleGen - confirm
# Maximum size of a thumbnail (presumably in bytes - confirm against the code using it)
MAX_THUMB_SIZE = 16 * 1024
# Maximum thumbnail dimensions (presumably (width, height) in pixels - confirm)
MAX_THUMB_DIMEN = (180, 240)

View File

@@ -0,0 +1,108 @@
#!/usr/bin/env python2
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
from __future__ import absolute_import, division, print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'
'''
Decompress MOBI files compressed with the Huff/cdic algorithm. Code thanks to darkninja
and igorsk.
'''
import struct
from calibre.ebooks.mobi import MobiError
from polyglot.builtins import map
class Reader(object):
    """
    Decoder for Huff/cdic compressed data.

    load_huff() must be called first with the HUFF record, then load_cdic()
    once for every CDIC record. After that unpack() decompresses data.
    """

    def __init__(self):
        # Reads one big-endian unsigned 64 bit integer from a buffer
        self.q = struct.Struct(b'>Q').unpack_from

    def load_huff(self, huff):
        """
        Read the code tables from a HUFF record and reset the dictionary.

        Raises MobiError if the record does not start with the HUFF header.
        """
        if huff[0:8] != b'HUFF\x00\x00\x00\x18':
            raise MobiError('Invalid HUFF header')
        off1, off2 = struct.unpack_from(b'>LL', huff, 8)

        def dict1_unpack(v):
            # Low 5 bits: code length, bit 7: terminal flag, rest: max code
            codelen = v & 0x1f
            term = v & 0x80
            assert codelen != 0
            if codelen <= 8:
                assert term
            maxcode = (((v >> 8) + 1) << (32 - codelen)) - 1
            return (codelen, term, maxcode)

        # Lookup table indexed by the top 8 bits of a code
        self.dict1 = tuple(dict1_unpack(v) for v in struct.unpack_from(b'>256L', huff, off1))

        # 32 (mincode, maxcode) pairs, the tables built from them are
        # indexed by code length (index 0 is a placeholder)
        dict2 = struct.unpack_from(b'>64L', huff, off2)
        self.mincode = tuple(
            low << (32 - nbits) for nbits, low in enumerate((0,) + dict2[0::2]))
        self.maxcode = tuple(
            ((high + 1) << (32 - nbits)) - 1 for nbits, high in enumerate((0,) + dict2[1::2]))

        self.dictionary = []

    def load_cdic(self, cdic):
        """
        Append the phrases of a CDIC record to the dictionary.

        Raises MobiError if the record does not start with the CDIC header.
        """
        if cdic[0:8] != b'CDIC\x00\x00\x00\x10':
            raise MobiError('Invalid CDIC header')
        phrases, bits = struct.unpack_from(b'>LL', cdic, 8)
        n = min(1<<bits, phrases-len(self.dictionary))
        read_u16 = struct.Struct(b'>H').unpack_from

        def phrase_at(off):
            # Offsets are relative to the end of the 16 byte header. Every
            # phrase starts with a 16 bit value: the low 15 bits are its
            # length and the top bit is a flag stored next to the phrase.
            blen, = read_u16(cdic, 16+off)
            return (cdic[18+off:18+off+(blen&0x7fff)], blen&0x8000)

        offsets = struct.unpack_from(b'>%dH' % n, cdic, 16)
        self.dictionary.extend(phrase_at(off) for off in offsets)

    def unpack(self, data):
        """
        Decompress data and return the result as bytes.

        Dictionary phrases whose flag is not set are themselves unpacked on
        first use and the result is stored back into the dictionary.
        """
        read_u64 = self.q

        bits_remaining = len(data) * 8
        # Padding so that 8 bytes can always be read near the end of data
        data += b'\x00\x00\x00\x00\x00\x00\x00\x00'
        offset = 0
        window, = read_u64(data, offset)
        shift = 32

        out = []
        while True:
            if shift <= 0:
                # Move the 64 bit window forward by 4 bytes
                offset += 4
                window, = read_u64(data, offset)
                shift += 32
            code = (window >> shift) & 0xFFFFFFFF

            codelen, term, maxcode = self.dict1[code >> 24]
            if not term:
                # Not a terminal entry: find the real code length
                while code < self.mincode[codelen]:
                    codelen += 1
                maxcode = self.maxcode[codelen]

            shift -= codelen
            bits_remaining -= codelen
            if bits_remaining < 0:
                break

            index = (maxcode - code) >> (32 - codelen)
            phrase, expanded = self.dictionary[index]
            if not expanded:
                # The entry is None while it is being unpacked
                self.dictionary[index] = None
                phrase = self.unpack(phrase)
                self.dictionary[index] = (phrase, 1)
            out.append(phrase)
        return b''.join(out)
class HuffReader(object):
    """
    Convenience wrapper around Reader, set up from a list of records: the
    first one is the HUFF record, all following ones are CDIC records.
    """

    def __init__(self, huffs):
        reader = self.reader = Reader()
        reader.load_huff(huffs[0])
        for record in huffs[1:]:
            reader.load_cdic(record)

    def unpack(self, section):
        'Decompress section and return the result'
        return self.reader.unpack(section)

Some files were not shown because too many files have changed in this diff Show More