mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-03-26 12:33:32 +01:00
Initial import
This commit is contained in:
41
ebook_converter/ebooks/BeautifulSoup.py
Normal file
41
ebook_converter/ebooks/BeautifulSoup.py
Normal file
@@ -0,0 +1,41 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=utf-8
|
||||
# License: GPLv3 Copyright: 2019, Kovid Goyal <kovid at kovidgoyal.net>
|
||||
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
import bs4
|
||||
from bs4 import ( # noqa
|
||||
CData, Comment, Declaration, NavigableString, ProcessingInstruction,
|
||||
SoupStrainer, Tag, __version__
|
||||
)
|
||||
|
||||
from polyglot.builtins import unicode_type
|
||||
|
||||
|
||||
def parse_html(markup):
    """Parse HTML markup (str or bytes) into a BeautifulSoup-compatible tree.

    The markup is first normalized to unicode with encoding declarations
    stripped and entities resolved, then parsed with html5-parser's soup
    adapter.
    """
    from calibre.ebooks.chardet import strip_encoding_declarations, xml_to_unicode, substitute_entites
    from calibre.utils.cleantext import clean_xml_chars
    if isinstance(markup, unicode_type):
        # Already unicode: just strip declarations and resolve entities.
        markup = strip_encoding_declarations(markup)
        markup = substitute_entites(markup)
    else:
        # Bytes: detect the encoding, decode, strip and resolve in one step.
        markup = xml_to_unicode(markup, strip_encoding_pats=True, resolve_entities=True)[0]
    # Remove characters that are invalid in XML before parsing.
    markup = clean_xml_chars(markup)
    from html5_parser.soup import parse
    return parse(markup, return_root=False)
|
||||
|
||||
|
||||
def prettify(soup):
    """Return soup.prettify() as text, decoding UTF-8 bytes if necessary."""
    pretty = soup.prettify()
    return pretty.decode('utf-8') if isinstance(pretty, bytes) else pretty
|
||||
|
||||
|
||||
def BeautifulSoup(markup='', *a, **kw):
    # Drop-in replacement for bs4.BeautifulSoup that always uses
    # parse_html(); extra positional/keyword arguments are ignored.
    return parse_html(markup)
|
||||
|
||||
|
||||
def BeautifulStoneSoup(markup='', *a, **kw):
    # Compatibility shim for the old BeautifulStoneSoup API: parse as XML
    # with bs4; extra positional/keyword arguments are ignored.
    return bs4.BeautifulSoup(markup, 'xml')
|
||||
248
ebook_converter/ebooks/__init__.py
Normal file
248
ebook_converter/ebooks/__init__.py
Normal file
@@ -0,0 +1,248 @@
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
'''
|
||||
Code for the conversion of ebook formats and the reading of metadata
|
||||
from various formats.
|
||||
'''
|
||||
|
||||
import os, re, numbers, sys
|
||||
from calibre import prints
|
||||
from calibre.ebooks.chardet import xml_to_unicode
|
||||
from polyglot.builtins import unicode_type
|
||||
|
||||
|
||||
class ConversionError(Exception):
    """Raised when an ebook conversion fails.

    The ``only_msg`` flag indicates that only the message (not a full
    traceback) should be shown to the user.
    """

    def __init__(self, msg, only_msg=False):
        super(ConversionError, self).__init__(msg)
        self.only_msg = only_msg
|
||||
|
||||
|
||||
class UnknownFormatError(Exception):
    """Raised when the format of an input file cannot be determined."""
    pass
|
||||
|
||||
|
||||
class DRMError(ValueError):
    """Raised when a file cannot be processed because it is DRM protected."""
    pass
|
||||
|
||||
|
||||
class ParserError(ValueError):
    """Raised when parsing an input document fails."""
    pass
|
||||
|
||||
|
||||
# All file extensions (lower-case, without the leading dot) recognised as
# ebook files.
BOOK_EXTENSIONS = ['lrf', 'rar', 'zip', 'rtf', 'lit', 'txt', 'txtz', 'text', 'htm', 'xhtm',
                   'html', 'htmlz', 'xhtml', 'pdf', 'pdb', 'updb', 'pdr', 'prc', 'mobi', 'azw', 'doc',
                   'epub', 'fb2', 'fbz', 'djv', 'djvu', 'lrx', 'cbr', 'cbz', 'cbc', 'oebzip',
                   'rb', 'imp', 'odt', 'chm', 'tpz', 'azw1', 'pml', 'pmlz', 'mbp', 'tan', 'snb',
                   'xps', 'oxps', 'azw4', 'book', 'zbf', 'pobi', 'docx', 'docm', 'md',
                   'textile', 'markdown', 'ibook', 'ibooks', 'iba', 'azw3', 'ps', 'kepub', 'kfx', 'kpf']
|
||||
|
||||
|
||||
def return_raster_image(path):
    """Return the raw bytes of the image at *path* if it is a readable
    raster image; return None for unreadable paths, unrecognized formats
    and SVG files."""
    from calibre.utils.imghdr import what
    if os.access(path, os.R_OK):
        with open(path, 'rb') as f:
            raw = f.read()
        # what() returns None for unknown data; SVG is vector, not raster.
        if what(None, raw) not in (None, 'svg'):
            return raw
|
||||
|
||||
|
||||
def extract_cover_from_embedded_svg(html, base, log):
    """Extract a cover image from HTML whose body is a single SVG wrapping
    one <image> element, as produced by some cover pages.

    *base* is the directory used to resolve the image href. Returns the
    raw image bytes or None. *log* is currently unused.
    """
    from calibre.ebooks.oeb.base import XPath, SVG, XLINK
    from calibre.utils.xml_parse import safe_xml_fromstring
    root = safe_xml_fromstring(html)

    svg = XPath('//svg:svg')(root)
    # Only accept the simple case: exactly one <svg> containing exactly one
    # <image> child.
    if len(svg) == 1 and len(svg[0]) == 1 and svg[0][0].tag == SVG('image'):
        image = svg[0][0]
        href = image.get(XLINK('href'), None)
        if href:
            path = os.path.join(base, *href.split('/'))
            return return_raster_image(path)
|
||||
|
||||
|
||||
def extract_calibre_cover(raw, base, log):
    """Heuristically extract a cover image from an HTML cover page.

    Two shapes are recognised: a text-free page with a single <img>
    whose alt text is "cover", or a text-free <body> containing exactly
    one <img>. *base* resolves relative image paths. Returns raw image
    bytes or None. *log* is currently unused.
    """
    from calibre.ebooks.BeautifulSoup import BeautifulSoup
    soup = BeautifulSoup(raw)
    # Any of these tags means the page has real textual content.
    matches = soup.find(name=['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'span',
                              'font', 'br'])
    images = soup.findAll('img', src=True)
    if matches is None and len(images) == 1 and \
            images[0].get('alt', '').lower() == 'cover':
        img = images[0]
        img = os.path.join(base, *img['src'].split('/'))
        q = return_raster_image(img)
        if q is not None:
            return q

    # Look for a simple cover, i.e. a body with no text and only one <img> tag
    if matches is None:
        body = soup.find('body')
        if body is not None:
            text = u''.join(map(unicode_type, body.findAll(text=True)))
            if text.strip():
                # Body has text, abort
                return
            images = body.findAll('img', src=True)
            if len(images) == 1:
                img = os.path.join(base, *images[0]['src'].split('/'))
                return return_raster_image(img)
|
||||
|
||||
|
||||
def render_html_svg_workaround(path_to_html, log, width=590, height=750):
    """Obtain cover image data for an HTML cover page.

    Tries, in order: extracting from an embedded SVG, the heuristic
    HTML cover extractor, and finally rendering the page in a worker
    process. Returns image bytes or None.
    """
    from calibre.ebooks.oeb.base import SVG_NS
    with open(path_to_html, 'rb') as f:
        raw = f.read()
    raw = xml_to_unicode(raw, strip_encoding_pats=True)[0]
    data = None
    if SVG_NS in raw:
        # Extraction is best-effort: fall through to the next strategy on
        # any failure.
        try:
            data = extract_cover_from_embedded_svg(
                raw, os.path.dirname(path_to_html), log)
        except Exception:
            pass
    if data is None:
        try:
            data = extract_calibre_cover(raw, os.path.dirname(path_to_html), log)
        except Exception:
            pass

    if data is None:
        # NOTE(review): width/height are passed to the renderer but not
        # used by the extraction paths above.
        data = render_html_data(path_to_html, width, height)
    return data
|
||||
|
||||
|
||||
def render_html_data(path_to_html, width, height):
    """Render an HTML page to JPEG bytes in a forked worker process.

    Returns the rendered image bytes, or None on failure (errors are
    printed to stderr). NOTE(review): *width* and *height* are accepted
    but not forwarded to the worker.
    """
    from calibre.ptempfile import TemporaryDirectory
    from calibre.utils.ipc.simple_worker import fork_job, WorkerError
    result = {}

    def report_error(text=''):
        # Print the failure, any traceback text, and the worker's captured
        # stdout/stderr file if present.
        prints('Failed to render', path_to_html, 'with errors:', file=sys.stderr)
        if text:
            prints(text, file=sys.stderr)
        if result and result['stdout_stderr']:
            with open(result['stdout_stderr'], 'rb') as f:
                prints(f.read(), file=sys.stderr)

    with TemporaryDirectory('-render-html') as tdir:
        try:
            result = fork_job('calibre.ebooks.render_html', 'main', args=(path_to_html, tdir, 'jpeg'))
        except WorkerError as e:
            report_error(e.orig_tb)
        else:
            if result['result']:
                with open(os.path.join(tdir, 'rendered.jpeg'), 'rb') as f:
                    return f.read()
            else:
                report_error()
|
||||
|
||||
|
||||
def check_ebook_format(stream, current_guess):
    """Refine a guessed ebook format by sniffing the stream contents.

    MOBI-family files that actually begin with the ``b'TPZ'`` magic are
    Topaz books and are reported as ``'tpz'``. When sniffing happens the
    stream is rewound to the start before returning.
    """
    if current_guess.lower() not in ('prc', 'mobi', 'azw', 'azw1', 'azw3'):
        return current_guess
    stream.seek(0)
    magic = stream.read(3)
    stream.seek(0)
    return 'tpz' if magic == b'TPZ' else current_guess
|
||||
|
||||
|
||||
def normalize(x):
    """Return *x* NFC-normalized when it is a unicode string; any other
    value is passed through unchanged."""
    if not isinstance(x, unicode_type):
        return x
    import unicodedata
    return unicodedata.normalize('NFC', x)
|
||||
|
||||
|
||||
def calibre_cover(title, author_string, series_string=None,
                  output_format='jpg', title_size=46, author_size=36, logo_path=None):
    """Generate a cover image for the given title/author/series.

    Returns encoded image data in *output_format*. NOTE(review):
    title_size and author_size are unused; they are kept only for
    signature compatibility with older callers.
    """
    title = normalize(title)
    author_string = normalize(author_string)
    series_string = normalize(series_string)
    from calibre.ebooks.covers import calibre_cover2
    from calibre.utils.img import image_to_data
    ans = calibre_cover2(title, author_string or '', series_string or '', logo_path=logo_path, as_qimage=True)
    return image_to_data(ans, fmt=output_format)
|
||||
|
||||
|
||||
# Matches a CSS length: an optional-sign decimal number followed by a unit.
UNIT_RE = re.compile(r'^(-*[0-9]*[.]?[0-9]*)\s*(%|em|ex|en|px|mm|cm|in|pt|pc|rem|q)$')


def unit_convert(value, base, font, dpi, body_font_size=12):
    """Convert a CSS length *value* to points.

    :param value: a number (returned as-is), a bare numeric string
        (interpreted as pixels at *dpi*), or a CSS length string such
        as '1.5em'.
    :param base: reference length used for percentage values.
    :param font: current font size (pts), used for em/ex/en.
    :param dpi: device resolution, used for px and bare numbers.
    :param body_font_size: root font size (pts), used for rem.

    Unrecognized strings are returned unchanged.
    """
    if isinstance(value, numbers.Number):
        return value
    try:
        # A bare numeric string is treated as a pixel count at *dpi*.
        return float(value) * 72.0 / dpi
    except (TypeError, ValueError):
        # Not a plain number; fall through to unit parsing. (A bare
        # `except:` here used to swallow even KeyboardInterrupt.)
        pass
    result = value
    m = UNIT_RE.match(value)
    if m is not None and m.group(1):
        value = float(m.group(1))
        unit = m.group(2)
        if unit == '%':
            result = (value / 100.0) * base
        elif unit == 'px':
            result = value * 72.0 / dpi
        elif unit == 'in':
            result = value * 72.0
        elif unit == 'pt':
            result = value
        elif unit == 'em':
            result = value * font
        elif unit in ('ex', 'en'):
            # Hack: assume the x-height is half the font size, since real
            # font metrics are not available here.
            result = value * font * 0.5
        elif unit == 'pc':
            result = value * 12.0
        elif unit == 'mm':
            result = value * 2.8346456693
        elif unit == 'cm':
            result = value * 28.346456693
        elif unit == 'rem':
            result = value * body_font_size
        elif unit == 'q':
            # Quarter-millimeters.
            result = value * 0.708661417325
    return result
|
||||
|
||||
|
||||
def parse_css_length(value):
    """Split a CSS length such as '10.5em' into ``(10.5, 'em')``.

    Returns ``(None, None)`` when *value* is not a string or does not
    look like a CSS length.
    """
    try:
        match = UNIT_RE.match(value)
    except TypeError:
        return None, None
    if match is None or not match.group(1):
        return None, None
    return float(match.group(1)), match.group(2).lower()
|
||||
|
||||
|
||||
def generate_masthead(title, output_path=None, width=600, height=60):
    """Generate a periodical masthead image, using the masthead font the
    user configured for MOBI output (if any)."""
    from calibre.ebooks.conversion.config import load_defaults
    recs = load_defaults('mobi_output')
    masthead_font_family = recs.get('masthead_font', None)
    # Shadows this function's own name on purpose: delegate to the real
    # implementation in calibre.ebooks.covers.
    from calibre.ebooks.covers import generate_masthead
    return generate_masthead(title, output_path=output_path, width=width, height=height, font_family=masthead_font_family)
|
||||
|
||||
|
||||
def escape_xpath_attr(value):
    """Quote *value* for safe embedding in an XPath expression.

    Double quotes are used normally, single quotes when the value
    contains double quotes, and an XPath ``concat()`` of mixed-quote
    pieces when the value contains both quote characters.
    """
    if '"' not in value:
        return '"%s"' % value
    if "'" not in value:
        return "'%s'" % value

    def quoted(chunk):
        # Runs of double quotes get single-quoted; everything else gets
        # double quotes (such chunks may contain apostrophes).
        q = "'" if '"' in chunk else '"'
        return q + chunk + q

    pieces = [quoted(chunk) for chunk in re.split('("+)', value) if chunk]
    return 'concat(%s)' % ', '.join(pieces)
|
||||
189
ebook_converter/ebooks/chardet.py
Normal file
189
ebook_converter/ebooks/chardet.py
Normal file
@@ -0,0 +1,189 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import re, codecs
|
||||
from polyglot.builtins import unicode_type
|
||||
|
||||
# Regex sources (compiled lazily in both str and bytes flavours) matching
# the three places a document can declare its character encoding. Each
# pattern captures the encoding name in group 1.
_encoding_pats = (
    # XML declaration
    r'<\?[^<>]+encoding\s*=\s*[\'"](.*?)[\'"][^<>]*>',
    # HTML 5 charset
    r'''<meta\s+charset=['"]([-_a-z0-9]+)['"][^<>]*>(?:\s*</meta>){0,1}''',
    # HTML 4 Pragma directive
    r'''<meta\s+?[^<>]*?content\s*=\s*['"][^'"]*?charset=([-_a-z0-9]+)[^'"]*?['"][^<>]*>(?:\s*</meta>){0,1}''',
)
|
||||
|
||||
|
||||
def compile_pats(binary):
    """Yield the encoding-declaration regexes compiled case-insensitively,
    as bytes patterns when *binary* is True, else as str patterns."""
    for raw in _encoding_pats:
        if binary:
            raw = raw.encode('ascii')
        yield re.compile(raw, flags=re.IGNORECASE)
|
||||
|
||||
|
||||
class LazyEncodingPats(object):
    """Compile the encoding-declaration regexes on first use and cache
    them, separately for the bytes and unicode flavours."""

    def __call__(self, binary=False):
        attr = 'binary_pats' if binary else 'unicode_pats'
        pats = getattr(self, attr, None)
        if pats is None:
            # First call for this flavour: compile once and memoize on self.
            pats = tuple(compile_pats(binary))
            setattr(self, attr, pats)
        for pat in pats:
            yield pat
|
||||
|
||||
|
||||
# Shared lazily-compiled pattern cache used by all functions in this module.
lazy_encoding_pats = LazyEncodingPats()
# Matches an XML/HTML entity reference such as &amp; or &#160;
ENTITY_PATTERN = re.compile(r'&(\S+?);')
|
||||
|
||||
|
||||
def strip_encoding_declarations(raw, limit=50*1024, preserve_newlines=False):
    """Remove XML/HTML encoding declarations from *raw* (str or bytes).

    Only the first *limit* characters are scanned, since declarations
    must appear near the start of the document. When *preserve_newlines*
    is True each removed declaration is replaced by the newlines it
    contained, keeping line numbers stable.
    """
    prefix = raw[:limit]
    suffix = raw[limit:]
    is_binary = isinstance(raw, bytes)
    if preserve_newlines:
        if is_binary:
            sub = lambda m: b'\n' * m.group().count(b'\n')
        else:
            sub = lambda m: '\n' * m.group().count('\n')
    else:
        sub = b'' if is_binary else u''
    for pat in lazy_encoding_pats(is_binary):
        prefix = pat.sub(sub, prefix)
    raw = prefix + suffix
    return raw
|
||||
|
||||
|
||||
def replace_encoding_declarations(raw, enc='utf-8', limit=50*1024):
    """Rewrite encoding declarations in *raw* to declare *enc* instead.

    Only the first *limit* characters are scanned. Returns
    ``(new_raw, changed)`` where *changed* is True if any declaration
    actually differed from *enc*.
    """
    prefix = raw[:limit]
    suffix = raw[limit:]
    # Mutable cell so the nested sub() can record whether it changed
    # anything.
    changed = [False]
    is_binary = isinstance(raw, bytes)
    # Make enc's type (bytes vs str) match raw's so splicing works.
    if is_binary:
        if not isinstance(enc, bytes):
            enc = enc.encode('ascii')
    else:
        if isinstance(enc, bytes):
            enc = enc.decode('ascii')

    def sub(m):
        ans = m.group()
        if m.group(1).lower() != enc.lower():
            changed[0] = True
            # Splice enc over group 1, relative to the whole match.
            start, end = m.start(1) - m.start(0), m.end(1) - m.end(0)
            ans = ans[:start] + enc + ans[end:]
        return ans

    for pat in lazy_encoding_pats(is_binary):
        prefix = pat.sub(sub, prefix)
    raw = prefix + suffix
    return raw, changed[0]
|
||||
|
||||
|
||||
def find_declared_encoding(raw, limit=50*1024):
    """Return the encoding declared inside *raw* (str or bytes), or None
    if no declaration is found in the first *limit* characters."""
    prefix = raw[:limit]
    is_binary = isinstance(raw, bytes)
    for pat in lazy_encoding_pats(is_binary):
        m = pat.search(prefix)
        if m is not None:
            ans = m.group(1)
            if is_binary:
                ans = ans.decode('ascii', 'replace')
            return ans
|
||||
|
||||
|
||||
def substitute_entites(raw):
    """Replace &entity; references in *raw* with their unicode characters.

    NOTE: the historical misspelling of the name ("entites") is kept,
    since callers throughout the code base use it.
    """
    from calibre import xml_entity_to_unicode
    return ENTITY_PATTERN.sub(xml_entity_to_unicode, raw)
|
||||
|
||||
|
||||
# Map non-standard charset names (as reported by detection) to names that
# Python's codecs module understands.
_CHARSET_ALIASES = {"macintosh" : "mac-roman",
                    "x-sjis" : "shift-jis"}
|
||||
|
||||
|
||||
def detect(*args, **kwargs):
    # Thin wrapper so the chardet library is only imported when detection
    # is actually needed.
    from chardet import detect
    return detect(*args, **kwargs)
|
||||
|
||||
|
||||
def force_encoding(raw, verbose, assume_utf8=False):
    """Guess the encoding of the byte string *raw* using chardet, with
    several fixups applied.

    :param verbose: print a warning when detection confidence is < 100%.
    :param assume_utf8: prefer utf-8 whenever detection is not certain.

    Low-confidence or missing results fall back to the platform's
    preferred encoding; pure-ASCII results are widened to utf-8.
    """
    from calibre.constants import preferred_encoding

    try:
        # Only the first 50KB are examined, for speed.
        chardet = detect(raw[:1024*50])
    except Exception:
        # Was a bare `except:` that also swallowed KeyboardInterrupt.
        chardet = {'encoding':preferred_encoding, 'confidence':0}
    encoding = chardet['encoding']
    if chardet['confidence'] < 1 and assume_utf8:
        encoding = 'utf-8'
    if chardet['confidence'] < 1 and verbose:
        print('WARNING: Encoding detection confidence for %s is %d%%'%(
            chardet['encoding'], chardet['confidence']*100))
    if not encoding:
        encoding = preferred_encoding
    encoding = encoding.lower()
    encoding = _CHARSET_ALIASES.get(encoding, encoding)
    if encoding == 'ascii':
        # ASCII is a subset of utf-8; using utf-8 is strictly safer.
        encoding = 'utf-8'
    return encoding
|
||||
|
||||
|
||||
def detect_xml_encoding(raw, verbose=False, assume_utf8=False):
    """Determine the encoding of the XML/HTML byte string *raw*.

    Returns ``(raw, encoding)``. A leading BOM is stripped from *raw*
    and decides the encoding outright; otherwise an in-document
    declaration is used, and failing that chardet via force_encoding().
    Unicode input (or empty input) is returned with encoding None.
    """
    if not raw or isinstance(raw, unicode_type):
        return raw, None
    # Check for byte-order marks first; they are authoritative.
    for x in ('utf8', 'utf-16-le', 'utf-16-be'):
        # e.g. 'utf-16-le' -> codecs.BOM_UTF16_LE
        bom = getattr(codecs, 'BOM_'+x.upper().replace('-16', '16').replace(
            '-', '_'))
        if raw.startswith(bom):
            return raw[len(bom):], x
    encoding = None
    for pat in lazy_encoding_pats(True):
        match = pat.search(raw)
        if match:
            encoding = match.group(1)
            encoding = encoding.decode('ascii', 'replace')
            break
    if encoding is None:
        encoding = force_encoding(raw, verbose, assume_utf8=assume_utf8)
    if encoding.lower().strip() == 'macintosh':
        encoding = 'mac-roman'
    if encoding.lower().replace('_', '-').strip() in (
            'gb2312', 'chinese', 'csiso58gb231280', 'euc-cn', 'euccn',
            'eucgb2312-cn', 'gb2312-1980', 'gb2312-80', 'iso-ir-58'):
        # Microsoft Word exports to HTML with encoding incorrectly set to
        # gb2312 instead of gbk. gbk is a superset of gb2312, anyway.
        encoding = 'gbk'
    try:
        codecs.lookup(encoding)
    except LookupError:
        # Unknown codec name: fall back to utf-8 rather than crash later.
        encoding = 'utf-8'

    return raw, encoding
|
||||
|
||||
|
||||
def xml_to_unicode(raw, verbose=False, strip_encoding_pats=False,
                   resolve_entities=False, assume_utf8=False):
    '''
    Force conversion of byte string to unicode. Tries to look for XML/HTML
    encoding declaration first, if not found uses the chardet library and
    prints a warning if detection confidence is < 100%

    :param strip_encoding_pats: remove encoding declarations from the result
    :param resolve_entities: replace &entity; references with characters
    :param assume_utf8: prefer utf-8 when detection is uncertain

    @return: (unicode, encoding used)
    '''
    if not raw:
        return '', None
    raw, encoding = detect_xml_encoding(raw, verbose=verbose,
                                        assume_utf8=assume_utf8)
    if not isinstance(raw, unicode_type):
        # 'replace' ensures undecodable bytes never raise.
        raw = raw.decode(encoding, 'replace')

    if strip_encoding_pats:
        raw = strip_encoding_declarations(raw)
    if resolve_entities:
        raw = substitute_entites(raw)

    return raw, encoding
|
||||
6
ebook_converter/ebooks/compression/__init__.py
Normal file
6
ebook_converter/ebooks/compression/__init__.py
Normal file
@@ -0,0 +1,6 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
238
ebook_converter/ebooks/compression/palmdoc.c
Normal file
238
ebook_converter/ebooks/compression/palmdoc.c
Normal file
@@ -0,0 +1,238 @@
|
||||
/*
|
||||
:mod:`cPalmdoc` -- Palmdoc compression/decompression
|
||||
=====================================================
|
||||
|
||||
.. module:: cPalmdoc
|
||||
:platform: All
|
||||
:synopsis: Compression decompression of Palmdoc implemented in C for speed
|
||||
|
||||
.. moduleauthor:: Kovid Goyal <kovid@kovidgoyal.net> Copyright 2009
|
||||
|
||||
*/
|
||||
|
||||
#define PY_SSIZE_T_CLEAN
|
||||
#include <Python.h>
|
||||
#include <stdio.h>
|
||||
|
||||
/* Minimum size of the decompression output buffer. */
#define BUFFER 6000

#define MIN(x, y) ( ((x) < (y)) ? (x) : (y) )
#define MAX(x, y) ( ((x) > (y)) ? (x) : (y) )

/* NOTE: despite the name, Byte is 16 bits wide; the decompressor stores
 * two-byte compound codes (> 255) in Byte variables. */
typedef unsigned short int Byte;
typedef struct {
    Byte *data;
    Py_ssize_t len;
} buffer;

/* Local int-based bool, shadowing any platform definition. */
#ifdef bool
#undef bool
#endif
#define bool int

#ifdef false
#undef false
#endif
#define false 0

#ifdef true
#undef true
#endif
#define true 1

/* Convert an unsigned byte value (0-255) to the signed char range. */
#define CHAR(x) (( (x) > 127 ) ? (x)-256 : (x))

/* Python 2/3 differ in the PyArg_ParseTuple/Py_BuildValue format codes
 * used for byte buffers. */
#if PY_MAJOR_VERSION >= 3
#define BUFFER_FMT "y#"
#define BYTES_FMT "y#"
#else
#define BUFFER_FMT "t#"
#define BYTES_FMT "s#"
#endif
|
||||
|
||||
/* decompress(bytestring) -> decompressed bytestring
 *
 * Decompress PalmDoc (LZ77-style) compressed data. The input is widened
 * to unsigned byte values first; output is sized for the worst case of
 * 8x expansion.
 */
static PyObject *
cpalmdoc_decompress(PyObject *self, PyObject *args) {
    const char *_input = NULL; Py_ssize_t input_len = 0;
    Byte *input; char *output; Byte c; PyObject *ans;
    Py_ssize_t i = 0, o = 0, j = 0, di, n;
    if (!PyArg_ParseTuple(args, BUFFER_FMT, &_input, &input_len))
        return NULL;
    input = (Byte *) PyMem_Malloc(sizeof(Byte)*input_len);
    if (input == NULL) return PyErr_NoMemory();
    // Map chars to bytes
    for (j = 0; j < input_len; j++)
        input[j] = (_input[j] < 0) ? _input[j]+256 : _input[j];
    output = (char *)PyMem_Malloc(sizeof(char)*(MAX(BUFFER, 8*input_len)));
    if (output == NULL) {
        /* BUGFIX: the original leaked `input` on this failure path. */
        PyMem_Free(input);
        return PyErr_NoMemory();
    }

    while (i < input_len) {
        c = input[i++];
        if (c >= 1 && c <= 8) // copy 'c' bytes
            while (c--) output[o++] = (char)input[i++];

        else if (c <= 0x7F) // 0, 09-7F = self
            output[o++] = (char)c;

        else if (c >= 0xC0) { // space + ASCII char
            output[o++] = ' ';
            output[o++] = c ^ 0x80;
        }
        else { // 80-BF repeat sequences
            /* Two-byte compound: 14-bit field = distance:11, length-3:3 */
            c = (c << 8) + input[i++];
            di = (c & 0x3FFF) >> 3;
            for ( n = (c & 7) + 3; n--; ++o )
                output[o] = output[o - di];
        }
    }
    ans = Py_BuildValue(BYTES_FMT, output, o);
    if (output != NULL) PyMem_Free(output);
    if (input != NULL) PyMem_Free(input);
    return ans;
}
|
||||
|
||||
/* Return true iff the first len bytes of a and b are equal. */
static bool
cpalmdoc_memcmp( Byte *a, Byte *b, Py_ssize_t len) {
    Py_ssize_t i;
    for (i = 0; i < len; i++) if (a[i] != b[i]) return false;
    return true;
}
|
||||
|
||||
/* Find the last occurrence, before pos, of the chunk_length bytes that
 * start at data[pos]. Returns its index, or pos itself if not found. */
static Py_ssize_t
cpalmdoc_rfind(Byte *data, Py_ssize_t pos, Py_ssize_t chunk_length) {
    Py_ssize_t i;
    for (i = pos - chunk_length; i > -1; i--)
        if (cpalmdoc_memcmp(data+i, data+pos, chunk_length)) return i;
    return pos;
}
|
||||
|
||||
|
||||
/* Core PalmDoc compressor: write the compressed form of b into output
 * (which the caller has sized generously). Returns the number of bytes
 * written, or 0 on allocation failure. */
static Py_ssize_t
cpalmdoc_do_compress(buffer *b, char *output) {
    Py_ssize_t i = 0, j, chunk_len, dist;
    unsigned int compound;
    Byte c, n;
    bool found;
    char *head;
    buffer temp;
    head = output;
    /* Scratch buffer for runs of "binary" (non-literal) bytes, max 8. */
    temp.data = (Byte *)PyMem_Malloc(sizeof(Byte)*8); temp.len = 0;
    if (temp.data == NULL) return 0;
    while (i < b->len) {
        c = b->data[i];
        // do repeats: look for the longest (up to 10 byte) earlier match
        // within the 2047-byte back-reference window
        if ( i > 10 && (b->len - i) > 10) {
            found = false;
            for (chunk_len = 10; chunk_len > 2; chunk_len--) {
                j = cpalmdoc_rfind(b->data, i, chunk_len);
                dist = i - j;
                if (j < i && dist <= 2047) {
                    found = true;
                    /* compound = distance:11 bits, (length-3):3 bits */
                    compound = (unsigned int)((dist << 3) + chunk_len-3);
                    *(output++) = CHAR(0x80 + (compound >> 8 ));
                    *(output++) = CHAR(compound & 0xFF);
                    i += chunk_len;
                    break;
                }
            }
            if (found) continue;
        }

        // write single character
        i++;
        if (c == 32 && i < b->len) {
            /* Space followed by an ASCII char in 0x40-0x7F packs into a
             * single byte with the high bit set. */
            n = b->data[i];
            if ( n >= 0x40 && n <= 0x7F) {
                *(output++) = CHAR(n^0x80); i++; continue;
            }
        }
        if (c == 0 || (c > 8 && c < 0x80))
            /* These byte values represent themselves. */
            *(output++) = CHAR(c);
        else { // Write binary data: a length byte (1-8) then raw bytes
            j = i;
            temp.data[0] = c; temp.len = 1;
            while (j < b->len && temp.len < 8) {
                c = b->data[j];
                if (c == 0 || (c > 8 && c < 0x80)) break;
                temp.data[temp.len++] = c; j++;
            }
            i += temp.len - 1;
            *(output++) = (char)temp.len;
            for (j=0; j < temp.len; j++) *(output++) = (char)temp.data[j];
        }
    }
    PyMem_Free(temp.data);
    return output - head;
}
|
||||
|
||||
/* compress(bytestring) -> compressed bytestring
 *
 * PalmDoc-compress a byte string via cpalmdoc_do_compress().
 */
static PyObject *
cpalmdoc_compress(PyObject *self, PyObject *args) {
    const char *_input = NULL; Py_ssize_t input_len = 0;
    char *output; PyObject *ans;
    Py_ssize_t j = 0;
    buffer b;
    if (!PyArg_ParseTuple(args, BUFFER_FMT, &_input, &input_len))
        return NULL;
    b.data = (Byte *)PyMem_Malloc(sizeof(Byte)*input_len);
    if (b.data == NULL) return PyErr_NoMemory();
    // Map chars to bytes
    for (j = 0; j < input_len; j++)
        b.data[j] = (_input[j] < 0) ? _input[j]+256 : _input[j];
    b.len = input_len;
    // Make the output buffer larger than the input as sometimes
    // compression results in a larger block
    output = (char *)PyMem_Malloc(sizeof(char) * (int)(1.25*b.len));
    if (output == NULL) {
        /* BUGFIX: the original leaked b.data on this failure path. */
        PyMem_Free(b.data);
        return PyErr_NoMemory();
    }
    j = cpalmdoc_do_compress(&b, output);
    if ( j == 0) {
        /* BUGFIX: free both buffers before reporting the failure. */
        PyMem_Free(output);
        PyMem_Free(b.data);
        return PyErr_NoMemory();
    }
    ans = Py_BuildValue(BYTES_FMT, output, j);
    PyMem_Free(output);
    PyMem_Free(b.data);
    return ans;
}
|
||||
|
||||
static char cPalmdoc_doc[] = "Compress and decompress palmdoc strings.";

/* Method table exposing the two entry points to Python. */
static PyMethodDef cPalmdoc_methods[] = {
    {"decompress", cpalmdoc_decompress, METH_VARARGS,
     "decompress(bytestring) -> decompressed bytestring\n\n"
     "Decompress a palmdoc compressed byte string. "
    },

    {"compress", cpalmdoc_compress, METH_VARARGS,
     "compress(bytestring) -> compressed bytestring\n\n"
     "Palmdoc compress a byte string. "
    },

    {NULL, NULL, 0, NULL}
};

/* Module initialization, shared between Python 2 and 3: the version-
 * specific pieces (module creation and the init function's name and
 * return convention) are selected by the preprocessor. */
#if PY_MAJOR_VERSION >= 3
#define INITERROR return NULL
#define INITMODULE PyModule_Create(&cPalmdoc_module)
static struct PyModuleDef cPalmdoc_module = {
    /* m_base     */ PyModuleDef_HEAD_INIT,
    /* m_name     */ "cPalmdoc",
    /* m_doc      */ cPalmdoc_doc,
    /* m_size     */ -1,
    /* m_methods  */ cPalmdoc_methods,
    /* m_slots    */ 0,
    /* m_traverse */ 0,
    /* m_clear    */ 0,
    /* m_free     */ 0,
};
CALIBRE_MODINIT_FUNC PyInit_cPalmdoc(void) {
#else
#define INITERROR return
#define INITMODULE Py_InitModule3("cPalmdoc", cPalmdoc_methods, cPalmdoc_doc)
CALIBRE_MODINIT_FUNC initcPalmdoc(void) {
#endif

    PyObject *m;
    m = INITMODULE;
    if (m == NULL) {
        INITERROR;
    }

#if PY_MAJOR_VERSION >= 3
    return m;
#endif
}
|
||||
96
ebook_converter/ebooks/compression/palmdoc.py
Normal file
96
ebook_converter/ebooks/compression/palmdoc.py
Normal file
@@ -0,0 +1,96 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
import io
|
||||
from struct import pack
|
||||
|
||||
from calibre.constants import plugins
|
||||
from polyglot.builtins import range
|
||||
# The C speedup extension is mandatory: fail loudly at import time if it
# could not be loaded. plugins[name] is a (module, error_message) pair.
cPalmdoc = plugins['cPalmdoc'][0]
if not cPalmdoc:
    raise RuntimeError(('Failed to load required cPalmdoc module: '
                        '%s')%plugins['cPalmdoc'][1])
|
||||
|
||||
|
||||
def decompress_doc(data):
    """Decompress PalmDoc-compressed bytes using the C extension."""
    return cPalmdoc.decompress(data)
|
||||
|
||||
|
||||
def compress_doc(data):
    """PalmDoc-compress *data* using the C extension; empty or falsy
    input yields empty bytes."""
    return cPalmdoc.compress(data) if data else b''
|
||||
|
||||
|
||||
def py_compress_doc(data):
    """Pure-Python PalmDoc compressor, kept as a reference implementation
    for the C extension (see find_tests, which checks they agree).

    :param data: bytes to compress
    :return: the PalmDoc-compressed bytes
    """
    out = io.BytesIO()
    i = 0
    ldata = len(data)
    while i < ldata:
        # Back-reference: look for the longest (3-10 byte) earlier match
        # within the 2047-byte window.
        if i > 10 and (ldata - i) > 10:
            chunk = b''
            match = -1
            for j in range(10, 2, -1):
                chunk = data[i:i+j]
                try:
                    match = data.rindex(chunk, 0, i)
                except ValueError:
                    continue
                if (i - match) <= 2047:
                    break
                match = -1
            if match >= 0:
                n = len(chunk)
                m = i - match
                # 2-byte code: 0b10, distance (11 bits), length-3 (3 bits)
                code = 0x8000 + ((m << 3) & 0x3ff8) + (n - 3)
                out.write(pack('>H', code))
                i += n
                continue
        ch = data[i:i+1]
        och = ord(ch)
        i += 1
        # Space + ASCII 0x40-0x7F packs into one byte with the high bit set.
        if ch == b' ' and (i + 1) < ldata:
            onch = ord(data[i:i+1])
            if onch >= 0x40 and onch < 0x80:
                out.write(pack('>B', onch ^ 0x80))
                i += 1
                continue
        if och == 0 or (och > 8 and och < 0x80):
            # These byte values represent themselves.
            out.write(ch)
        else:
            # Binary run: a length byte (1-8) followed by the raw bytes.
            j = i
            binseq = [ch]
            while j < ldata and len(binseq) < 8:
                ch = data[j:j+1]
                och = ord(ch)
                if och == 0 or (och > 8 and och < 0x80):
                    break
                binseq.append(ch)
                j += 1
            out.write(pack('>B', len(binseq)))
            out.write(b''.join(binseq))
            i += len(binseq) - 1
    return out.getvalue()
|
||||
|
||||
|
||||
def find_tests():
    """Return a unittest suite verifying that the C and Python PalmDoc
    compressors agree and that compression round-trips."""
    import unittest

    class Test(unittest.TestCase):

        def test_palmdoc_compression(self):
            for test in [
                b'abc\x03\x04\x05\x06ms',  # Test binary writing
                b'a b c \xfed ',  # Test encoding of spaces
                b'0123456789axyz2bxyz2cdfgfo9iuyerh',
                b'0123456789asd0123456789asd|yyzzxxffhhjjkk',
                (b'ciewacnaq eiu743 r787q 0w% ; sa fd\xef\ffdxosac wocjp acoiecowei '
                 b'owaic jociowapjcivcjpoivjporeivjpoavca; p9aw8743y6r74%$^$^%8 ')
            ]:
                x = compress_doc(test)
                self.assertEqual(py_compress_doc(test), x)
                self.assertEqual(decompress_doc(x), test)

    return unittest.defaultTestLoader.loadTestsFromTestCase(Test)
|
||||
30
ebook_converter/ebooks/conversion/__init__.py
Normal file
30
ebook_converter/ebooks/conversion/__init__.py
Normal file
@@ -0,0 +1,30 @@
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
from polyglot.builtins import native_string_type
|
||||
|
||||
|
||||
class ConversionUserFeedBack(Exception):
    """Exception whose string payload is a JSON object describing a
    message that should be shown to the user."""

    def __init__(self, title, msg, level='info', det_msg=''):
        ''' Show a simple message to the user

        :param title: The title (very short description)
        :param msg: The message to show the user
        :param level: Must be one of 'info', 'warn' or 'error'
        :param det_msg: Optional detailed message to show the user
        '''
        import json
        payload = json.dumps({'msg':msg, 'level':level,
            'det_msg':det_msg, 'title':title})
        Exception.__init__(self, payload)
        self.title = title
        self.msg = msg
        self.det_msg = det_msg
        self.level = level
|
||||
|
||||
|
||||
# Ensure exception uses fully qualified name as this is used to detect it in
# the GUI. native_string_type presumably guarantees a str (not unicode)
# name under Python 2 -- TODO confirm.
ConversionUserFeedBack.__name__ = native_string_type('calibre.ebooks.conversion.ConversionUserFeedBack')
|
||||
428
ebook_converter/ebooks/conversion/cli.py
Normal file
428
ebook_converter/ebooks/conversion/cli.py
Normal file
@@ -0,0 +1,428 @@
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
'''
|
||||
Command line interface to conversion sub-system
|
||||
'''
|
||||
|
||||
import sys, os, numbers
|
||||
from optparse import OptionGroup, Option
|
||||
from collections import OrderedDict
|
||||
|
||||
from calibre.utils.config import OptionParser
|
||||
from calibre.utils.logging import Log
|
||||
from calibre.customize.conversion import OptionRecommendation
|
||||
from calibre import patheq
|
||||
from calibre.ebooks.conversion import ConversionUserFeedBack
|
||||
from calibre.utils.localization import localize_user_manual_link
|
||||
from polyglot.builtins import iteritems
|
||||
|
||||
USAGE = '%prog ' + _('''\
|
||||
input_file output_file [options]
|
||||
|
||||
Convert an e-book from one format to another.
|
||||
|
||||
input_file is the input and output_file is the output. Both must be \
|
||||
specified as the first two arguments to the command.
|
||||
|
||||
The output e-book format is guessed from the file extension of \
|
||||
output_file. output_file can also be of the special format .EXT where \
|
||||
EXT is the output file extension. In this case, the name of the output \
|
||||
file is derived from the name of the input file. Note that the filenames must \
|
||||
not start with a hyphen. Finally, if output_file has no extension, then \
|
||||
it is treated as a directory and an "open e-book" (OEB) consisting of HTML \
|
||||
files is written to that directory. These files are the files that would \
|
||||
normally have been passed to the output plugin.
|
||||
|
||||
After specifying the input \
|
||||
and output file you can customize the conversion by specifying various \
|
||||
options. The available options depend on the input and output file types. \
|
||||
To get help on them specify the input and output file and then use the -h \
|
||||
option.
|
||||
|
||||
For full documentation of the conversion system see
|
||||
''') + localize_user_manual_link('https://manual.calibre-ebook.com/conversion.html')
|
||||
|
||||
# Names of the individual heuristic-processing options.
HEURISTIC_OPTIONS = ['markup_chapter_headings',
                     'italicize_common_cases', 'fix_indents',
                     'html_unwrap_factor', 'unwrap_lines',
                     'delete_blank_paragraphs', 'format_scene_breaks',
                     'dehyphenate', 'renumber_headings',
                     'replace_scene_breaks']

# NOTE(review): presumably options whose recommended value is True, so
# the CLI exposes them as --disable-... switches -- confirm against the
# option-generation code.
DEFAULT_TRUE_OPTIONS = HEURISTIC_OPTIONS + ['remove_fake_margins']
|
||||
|
||||
|
||||
def print_help(parser, log):
    # *log* is accepted (and ignored) to match the call sites that pass a
    # Log instance alongside the parser.
    parser.print_help()
|
||||
|
||||
|
||||
def check_command_line_options(parser, args, log):
    """Validate the two positional file arguments.

    Returns the (input, output) pair with both entries made absolute,
    applying the ``.EXT`` output shorthand described in USAGE. Exits the
    process with status 1 when the arguments are unusable.
    """
    if len(args) < 3 or args[1].startswith('-') or args[2].startswith('-'):
        print_help(parser, log)
        log.error('\n\nYou must specify the input AND output files')
        raise SystemExit(1)

    input = os.path.abspath(args[1])
    wants_help = '-h' in args or '--help' in args
    is_recipe = input.endswith('.recipe')
    is_readable = os.access(input, os.R_OK)
    if not is_recipe and not is_readable and not wants_help:
        log.error('Cannot read from', input)
        raise SystemExit(1)
    if is_recipe and not is_readable:
        # Not a recipe file on disk: treat it as a builtin recipe name and
        # keep the argument exactly as the user typed it.
        input = args[1]

    output = args[2]
    bare_extension = (output.startswith('.') and
                      output[:2] not in {'..', '.'} and
                      '/' not in output and '\\' not in output)
    if bare_extension:
        # ".EXT" shorthand: derive the output name from the input name.
        output = os.path.splitext(os.path.basename(input))[0] + output
    output = os.path.abspath(output)

    return input, output
|
||||
|
||||
|
||||
def option_recommendation_to_cli_option(add_option, rec):
    """Register a conversion OptionRecommendation *rec* as a command line
    option via the *add_option* callable (``parser.add_option`` or
    ``group.add_option``).

    Boolean recommendations become toggle flags, numeric ones get a
    matching optparse ``type``, and options defaulting to True are exposed
    as --disable-* switches.
    """
    opt = rec.option
    switches = ['-'+opt.short_switch] if opt.short_switch else []
    switches.append('--'+opt.long_switch)
    attrs = dict(dest=opt.name, help=opt.help,
                 choices=opt.choices, default=rec.recommended_value)
    if isinstance(rec.recommended_value, type(True)):
        # A flag that toggles away from the recommended default.
        attrs['action'] = 'store_false' if rec.recommended_value else \
            'store_true'
    else:
        if isinstance(rec.recommended_value, numbers.Integral):
            attrs['type'] = 'int'
        elif isinstance(rec.recommended_value, numbers.Real):
            # BUG FIX: this used to be a second independent `if`. Since
            # numbers.Integral is a subclass of numbers.Real, every integer
            # default immediately had its type overwritten with 'float',
            # so integer options were parsed as floats. `elif` keeps ints
            # as ints while still typing genuine floats as 'float'.
            attrs['type'] = 'float'

    if opt.long_switch == 'verbose':
        # --verbose is repeatable: count occurrences instead of taking a value.
        attrs['action'] = 'count'
        attrs.pop('type', '')
    if opt.name == 'read_metadata_from_opf':
        switches.append('--from-opf')
    if opt.name == 'transform_css_rules':
        attrs['help'] = _(
            'Path to a file containing rules to transform the CSS styles'
            ' in this book. The easiest way to create such a file is to'
            ' use the wizard for creating rules in the calibre GUI. Access'
            ' it in the "Look & feel->Transform styles" section of the conversion'
            ' dialog. Once you create the rules, you can use the "Export" button'
            ' to save them to a file.'
        )
    if opt.name in DEFAULT_TRUE_OPTIONS and rec.recommended_value is True:
        # Options that default to on are exposed only as --disable-* switches.
        switches = ['--disable-'+opt.long_switch]
    add_option(Option(*switches, **attrs))
|
||||
|
||||
|
||||
def group_titles():
    """Return the titles of the input and output option groups.

    Computed at call time (not as module constants) so translation via
    ``_`` happens with the active locale.
    """
    return _('INPUT OPTIONS'), _('OUTPUT OPTIONS')
|
||||
|
||||
|
||||
def recipe_test(option, opt_str, value, parser):
    """optparse callback for the recipe --test switch.

    Consumes up to two integer arguments following the switch (stopping at
    anything that looks like another option), pads the result with the
    default of 2, and stores it on ``parser.values`` as a 2-tuple.
    """
    assert value is None
    consumed = []

    def is_floatable(token):
        try:
            float(token)
        except ValueError:
            return False
        return True

    for token in parser.rargs:
        if token.startswith("--"):
            break  # a long option ends the argument list
        # A short option also stops us, but negative numbers (-3, -3.0)
        # must still be accepted as arguments.
        if token.startswith("-") and len(token) > 1 and not is_floatable(token):
            break
        try:
            consumed.append(int(token))
        except (TypeError, ValueError, AttributeError):
            break
        if len(consumed) == 2:
            break
    # Remove exactly the tokens we successfully consumed.
    del parser.rargs[:len(consumed)]

    # Pad so the stored value is always a pair.
    while len(consumed) < 2:
        consumed.append(2)

    setattr(parser.values, option.dest, tuple(consumed))
|
||||
|
||||
|
||||
def add_input_output_options(parser, plumber):
    """Add the input- and output-plugin specific option groups to *parser*.

    The available options come from the plumber's selected input/output
    plugins, so they vary with the file formats being converted.
    """
    input_options, output_options = \
        plumber.input_options, plumber.output_options

    def add_options(group, options):
        for opt in options:
            if plumber.input_fmt == 'recipe' and opt.option.long_switch == 'test':
                # For recipes, --test takes up to two optional integer
                # arguments, which plain optparse options cannot express;
                # delegate parsing to the recipe_test callback instead.
                group(Option('--test', dest='test', action='callback', callback=recipe_test))
            else:
                option_recommendation_to_cli_option(group, opt)

    if input_options:
        title = group_titles()[0]
        io = OptionGroup(parser, title, _('Options to control the processing'
                                          ' of the input %s file')%plumber.input_fmt)
        add_options(io.add_option, input_options)
        parser.add_option_group(io)

    if output_options:
        title = group_titles()[1]
        oo = OptionGroup(parser, title, _('Options to control the processing'
                                          ' of the output %s')%plumber.output_fmt)
        add_options(oo.add_option, output_options)
        parser.add_option_group(oo)
|
||||
|
||||
|
||||
def add_pipeline_options(parser, plumber):
    """Add the format-independent conversion option groups to *parser*.

    Groups are emitted in the order of this OrderedDict; the empty-string
    key holds ungrouped options added directly to the parser. Only
    recommendations below HIGH level are exposed — HIGH ones are fixed by
    the plugins and may not be overridden from the command line.
    """
    groups = OrderedDict((
        ('' , ('',
            [
                'input_profile',
                'output_profile',
            ]
        )),
        (_('LOOK AND FEEL') , (
            _('Options to control the look and feel of the output'),
            [
                'base_font_size', 'disable_font_rescaling',
                'font_size_mapping', 'embed_font_family',
                'subset_embedded_fonts', 'embed_all_fonts',
                'line_height', 'minimum_line_height',
                'linearize_tables',
                'extra_css', 'filter_css', 'transform_css_rules', 'expand_css',
                'smarten_punctuation', 'unsmarten_punctuation',
                'margin_top', 'margin_left', 'margin_right',
                'margin_bottom', 'change_justification',
                'insert_blank_line', 'insert_blank_line_size',
                'remove_paragraph_spacing',
                'remove_paragraph_spacing_indent_size',
                'asciiize', 'keep_ligatures',
            ]
        )),

        (_('HEURISTIC PROCESSING') , (
            _('Modify the document text and structure using common'
              ' patterns. Disabled by default. Use %(en)s to enable. '
              ' Individual actions can be disabled with the %(dis)s options.')
            % dict(en='--enable-heuristics', dis='--disable-*'),
            ['enable_heuristics'] + HEURISTIC_OPTIONS
        )),

        (_('SEARCH AND REPLACE') , (
            _('Modify the document text and structure using user defined patterns.'),
            [
                'sr1_search', 'sr1_replace',
                'sr2_search', 'sr2_replace',
                'sr3_search', 'sr3_replace',
                'search_replace',
            ]
        )),

        (_('STRUCTURE DETECTION') , (
            _('Control auto-detection of document structure.'),
            [
                'chapter', 'chapter_mark',
                'prefer_metadata_cover', 'remove_first_image',
                'insert_metadata', 'page_breaks_before',
                'remove_fake_margins', 'start_reading_at',
            ]
        )),

        (_('TABLE OF CONTENTS') , (
            _('Control the automatic generation of a Table of Contents. By '
              'default, if the source file has a Table of Contents, it will '
              'be used in preference to the automatically generated one.'),
            [
                'level1_toc', 'level2_toc', 'level3_toc',
                'toc_threshold', 'max_toc_links', 'no_chapters_in_toc',
                'use_auto_toc', 'toc_filter', 'duplicate_links_in_toc',
            ]
        )),

        (_('METADATA') , (_('Options to set metadata in the output'),
                          plumber.metadata_option_names + ['read_metadata_from_opf'],
                          )),
        (_('DEBUG'), (_('Options to help with debugging the conversion'),
                      [
                          'verbose',
                          'debug_pipeline',
                      ])),

    ))

    for group, (desc, options) in iteritems(groups):
        if group:
            # Named groups get their own section in --help output.
            group = OptionGroup(parser, group, desc)
            parser.add_option_group(group)
        add_option = group.add_option if group != '' else parser.add_option

        for name in options:
            rec = plumber.get_option_by_name(name)
            if rec.level < rec.HIGH:
                option_recommendation_to_cli_option(add_option, rec)
|
||||
|
||||
|
||||
def option_parser():
    """Build the top-level OptionParser for ebook-convert.

    Format-specific and pipeline options are added later, once the input
    and output formats are known (see create_option_parser).
    """
    parser = OptionParser(usage=USAGE)
    parser.add_option('--list-recipes', default=False, action='store_true',
                      help=_('List builtin recipe names. You can create an e-book from '
                             'a builtin recipe like this: ebook-convert "Recipe Name.recipe" '
                             'output.epub'))
    return parser
|
||||
|
||||
|
||||
class ProgressBar(object):
    """Report conversion progress through a Log-style callable.

    Instances are passed to the Plumber as the progress reporter; they are
    called with a completion fraction and an optional message.
    """

    def __init__(self, log):
        self.log = log

    def __call__(self, frac, msg=''):
        # Only report steps that carry a message: bare fraction updates
        # would be too noisy for a line-oriented log.
        if msg:
            self.log('%d%% %s' % (int(frac * 100), msg))
|
||||
|
||||
|
||||
def create_option_parser(args, log):
    """Handle informational switches, validate the positional arguments and
    return the fully configured ``(parser, plumber)`` pair.

    Raises SystemExit for --version/--list-recipes, for missing arguments,
    and for unreadable input; raises ValueError when input and output refer
    to the same file.
    """
    if '--version' in args:
        from calibre.constants import __appname__, __version__, __author__
        log(os.path.basename(args[0]), '('+__appname__, __version__+')')
        log('Created by:', __author__)
        raise SystemExit(0)
    if '--list-recipes' in args:
        from calibre.web.feeds.recipes.collection import get_builtin_recipe_titles
        log('Available recipes:')
        titles = sorted(get_builtin_recipe_titles())
        for title in titles:
            try:
                log('\t'+title)
            except:
                # Fall back to repr if the log stream cannot encode the title.
                log('\t'+repr(title))
        log('%d recipes available'%len(titles))
        raise SystemExit(0)

    parser = option_parser()
    if len(args) < 3:
        print_help(parser, log)
        if any(x in args for x in ('-h', '--help')):
            raise SystemExit(0)
        else:
            raise SystemExit(1)

    input, output = check_command_line_options(parser, args, log)

    from calibre.ebooks.conversion.plumber import Plumber

    reporter = ProgressBar(log)
    if patheq(input, output):
        raise ValueError('Input file is the same as the output file')

    plumber = Plumber(input, output, log, reporter)
    # The available options depend on the input/output formats the plumber
    # has resolved, so they can only be added now.
    add_input_output_options(parser, plumber)
    add_pipeline_options(parser, plumber)

    return parser, plumber
|
||||
|
||||
|
||||
def abspath(x):
    """Return the absolute, user-expanded form of path *x*.

    http(s) URLs are returned untouched so URL-valued options (e.g. a
    remote cover) survive the normalization applied to path options.
    """
    # Tuple form of startswith replaces the two chained checks.
    if x.startswith(('http:', 'https:')):
        return x
    return os.path.abspath(os.path.expanduser(x))
|
||||
|
||||
|
||||
def escape_sr_pattern(exp):
    """Encode embedded newlines with a private-use marker so that a
    multi-line regex fits on a single line of a search/replace patterns
    file (read_sr_patterns reverses the encoding)."""
    return '\ue123'.join(exp.split('\n'))
|
||||
|
||||
|
||||
def read_sr_patterns(path, log=None):
    """Read a search/replace patterns file and return the pairs as JSON.

    The file alternates pattern lines and replacement lines. Blank lines
    before a pattern are skipped; '\ue123' markers inside patterns are
    decoded back to real newlines (see escape_sr_pattern). An invalid
    regex aborts with SystemExit(1) when *log* is supplied, otherwise
    raises ValueError.
    """
    import json
    import re
    with open(path, 'rb') as f:
        lines = f.read().decode('utf-8').splitlines()
    pairs = []
    pending = None
    for line in lines:
        if pending is not None:
            # The previous line was a pattern; this one (possibly empty)
            # is its replacement.
            pairs.append((pending, line))
            pending = None
            continue
        if not line.strip():
            continue
        candidate = line.replace('\ue123', '\n')
        try:
            re.compile(candidate)
        except:  # deliberately broad: reject any pattern that fails to compile
            msg = 'Invalid regular expression: %r from file: %r' % (
                candidate, path)
            if log is not None:
                log.error(msg)
                raise SystemExit(1)
            raise ValueError(msg)
        pending = candidate
    return json.dumps(pairs)
|
||||
|
||||
|
||||
def main(args=sys.argv):
    """Entry point for the ebook-convert command line tool.

    Parses the command line, configures the conversion Plumber, runs the
    conversion and returns a process exit code (0 on success, 1 on error).
    """
    log = Log()
    parser, plumber = create_option_parser(args, log)
    opts, leftover_args = parser.parse_args(args)
    if len(leftover_args) > 3:
        log.error('Extra arguments not understood:', u', '.join(leftover_args[3:]))
        return 1
    # Path-valued options must be made absolute before the conversion
    # pipeline changes the working directory.
    for x in ('read_metadata_from_opf', 'cover'):
        if getattr(opts, x, None) is not None:
            setattr(opts, x, abspath(getattr(opts, x)))
    if opts.search_replace:
        opts.search_replace = read_sr_patterns(opts.search_replace, log)
    if opts.transform_css_rules:
        from calibre.ebooks.css_transform_rules import import_rules, validate_rule
        with open(opts.transform_css_rules, 'rb') as tcr:
            opts.transform_css_rules = rules = list(import_rules(tcr.read()))
            for rule in rules:
                title, msg = validate_rule(rule)
                if title and msg:
                    log.error('Failed to parse CSS transform rules')
                    log.error(title)
                    log.error(msg)
                    return 1

    # Everything set on the command line overrides plugin recommendations.
    recommendations = [(n.dest, getattr(opts, n.dest),
                        OptionRecommendation.HIGH)
                       for n in parser.options_iter()
                       if n.dest]
    plumber.merge_ui_recommendations(recommendations)

    try:
        plumber.run()
    except ConversionUserFeedBack as e:
        ll = {'info': log.info, 'warn': log.warn,
              'error': log.error}.get(e.level, log.info)
        ll(e.title)
        if e.det_msg:
            # BUG FIX: this used to read ``e.detmsg`` (no underscore), which
            # raised AttributeError whenever a detailed message was present;
            # the attribute checked on the line above is ``det_msg``.
            log.debug(e.det_msg)
        ll(e.msg)
        raise SystemExit(1)

    log(_('Output saved to'), ' ', plumber.output)

    return 0
|
||||
|
||||
|
||||
def manual_index_strings():
    """Return the translated introduction used by the user manual's
    conversion-options index page; ``%s`` is filled in by the manual
    build with the relevant command invocation."""
    return _('''\
The options and default values for the options change depending on both the
input and output formats, so you should always check with::

    %s

Below are the options that are common to all conversion, followed by the
options specific to every input and output format.''')
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Allow running this module directly as the ebook-convert entry point.
    sys.exit(main())
|
||||
10
ebook_converter/ebooks/conversion/plugins/__init__.py
Normal file
10
ebook_converter/ebooks/conversion/plugins/__init__.py
Normal file
@@ -0,0 +1,10 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
|
||||
|
||||
29
ebook_converter/ebooks/conversion/plugins/azw4_input.py
Normal file
29
ebook_converter/ebooks/conversion/plugins/azw4_input.py
Normal file
@@ -0,0 +1,29 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin
|
||||
from polyglot.builtins import getcwd
|
||||
|
||||
|
||||
class AZW4Input(InputFormatPlugin):
    """Input plugin converting Amazon AZW4 (PDB-wrapped print replica)
    files to HTML for the conversion pipeline."""

    name = 'AZW4 Input'
    author = 'John Schember'
    description = 'Convert AZW4 to HTML'
    file_types = {'azw4'}
    commit_name = 'azw4_input'

    def convert(self, stream, options, file_ext, log,
                accelerators):
        """Extract the AZW4 content into the current working directory and
        return the path to the generated OPF."""
        # Imported lazily so plugin registration stays cheap.
        from calibre.ebooks.pdb.header import PdbHeaderReader
        from calibre.ebooks.azw4.reader import Reader

        header = PdbHeaderReader(stream)
        reader = Reader(header, stream, log, options)
        # The pipeline runs each plugin with a dedicated cwd; extraction
        # writes the OPF and content files there.
        opf = reader.extract_content(getcwd())

        return opf
|
||||
202
ebook_converter/ebooks/conversion/plugins/chm_input.py
Normal file
202
ebook_converter/ebooks/conversion/plugins/chm_input.py
Normal file
@@ -0,0 +1,202 @@
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
''' CHM File decoding support '''
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>,' \
|
||||
' and Alex Bramley <a.bramley at gmail.com>.'
|
||||
|
||||
import os
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin
|
||||
from calibre.ptempfile import TemporaryDirectory
|
||||
from calibre.constants import filesystem_encoding
|
||||
from polyglot.builtins import unicode_type, as_bytes
|
||||
|
||||
|
||||
class CHMInput(InputFormatPlugin):
    """Input plugin converting Microsoft Compiled HTML Help (CHM) files to
    an OEB book, delegating the HTML processing to the HTML input plugin."""

    name = 'CHM Input'
    author = 'Kovid Goyal and Alex Bramley'
    description = 'Convert CHM files to OEB'
    file_types = {'chm'}
    commit_name = 'chm_input'

    def _chmtohtml(self, output_dir, chm_path, no_images, log, debug_dump=False):
        """Extract the CHM archive into *output_dir* and return the path of
        its .hhc (table of contents) file. Stores the reader on self for
        later metadata/encoding queries. ``no_images`` is currently unused."""
        from calibre.ebooks.chm.reader import CHMReader
        log.debug('Opening CHM file')
        rdr = CHMReader(chm_path, log, input_encoding=self.opts.input_encoding)
        log.debug('Extracting CHM to %s' % output_dir)
        rdr.extract_content(output_dir, debug_dump=debug_dump)
        self._chm_reader = rdr
        return rdr.hhc_path

    def convert(self, stream, options, file_ext, log, accelerators):
        """Extract the CHM, build an HTML entry point from its TOC and run
        the result through the HTML input plugin, returning an OEB book."""
        from calibre.ebooks.chm.metadata import get_metadata_from_reader
        from calibre.customize.ui import plugin_for_input_format
        self.opts = options

        log.debug('Processing CHM...')
        with TemporaryDirectory('_chm2oeb') as tdir:
            if not isinstance(tdir, unicode_type):
                tdir = tdir.decode(filesystem_encoding)
            # Seed this conversion with the HTML input plugin's defaults,
            # since it does the actual OEB generation below.
            html_input = plugin_for_input_format('html')
            for opt in html_input.options:
                setattr(options, opt.option.name, opt.recommended_value)
            no_images = False  # options.no_images
            chm_name = stream.name
            # chm_data = stream.read()

            # closing stream so CHM can be opened by external library
            stream.close()
            log.debug('tdir=%s' % tdir)
            log.debug('stream.name=%s' % stream.name)
            debug_dump = False
            odi = options.debug_pipeline
            if odi:
                debug_dump = os.path.join(odi, 'input')
            mainname = self._chmtohtml(tdir, chm_name, no_images, log,
                                       debug_dump=debug_dump)
            mainpath = os.path.join(tdir, mainname)

            try:
                metadata = get_metadata_from_reader(self._chm_reader)
            except Exception:
                # Best effort: fall back to the file name as the title.
                log.exception('Failed to read metadata, using filename')
                from calibre.ebooks.metadata.book.base import Metadata
                metadata = Metadata(os.path.basename(chm_name))
            encoding = self._chm_reader.get_encoding() or options.input_encoding or 'cp1252'
            self._chm_reader.CloseCHM()
            # print((tdir, mainpath))
            # from calibre import ipython
            # ipython()

            # The extracted files are already decoded; force utf-8 for the
            # HTML input stage and restore debug_pipeline afterwards.
            options.debug_pipeline = None
            options.input_encoding = 'utf-8'
            uenc = encoding
            if os.path.abspath(mainpath) in self._chm_reader.re_encoded_files:
                uenc = 'utf-8'
            htmlpath, toc = self._create_html_root(mainpath, log, uenc)
            oeb = self._create_oebbook_html(htmlpath, tdir, options, log, metadata)
            options.debug_pipeline = odi
            if toc.count() > 1:
                # Replace the synthetic wrapper page with a proper TOC.
                oeb.toc = self.parse_html_toc(oeb.spine[0])
                oeb.manifest.remove(oeb.spine[0])
                oeb.auto_generated_toc = False
            return oeb

    def parse_html_toc(self, item):
        """Rebuild a TOC object from the nested div/a structure written by
        _create_html_root."""
        from calibre.ebooks.oeb.base import TOC, XPath
        dx = XPath('./h:div')
        ax = XPath('./h:a[1]')

        def do_node(parent, div):
            for child in dx(div):
                a = ax(child)[0]
                c = parent.add(a.text, a.attrib['href'])
                do_node(c, child)

        toc = TOC()
        root = XPath('//h:div[1]')(item.data)[0]
        do_node(toc, root)
        return toc

    def _create_oebbook_html(self, htmlpath, basedir, opts, log, mi):
        # use HTMLInput plugin to generate book
        from calibre.customize.builtins import HTMLInput
        opts.breadth_first = True
        htmlinput = HTMLInput(None)
        oeb = htmlinput.create_oebbook(htmlpath, basedir, opts, log, mi)
        return oeb

    def _create_html_root(self, hhcpath, log, encoding):
        """From the .hhc TOC file, write an HTML entry point next to it and
        return (html_path, toc). With multiple TOC entries a nested div/a
        index page is generated; otherwise the hhc content is used as-is."""
        from lxml import html
        from polyglot.urllib import unquote as _unquote
        from calibre.ebooks.oeb.base import urlquote
        from calibre.ebooks.chardet import xml_to_unicode
        hhcdata = self._read_file(hhcpath)
        hhcdata = hhcdata.decode(encoding)
        hhcdata = xml_to_unicode(hhcdata, verbose=True,
                                 strip_encoding_pats=True, resolve_entities=True)[0]
        hhcroot = html.fromstring(hhcdata)
        toc = self._process_nodes(hhcroot)
        # print("=============================")
        # print("Printing hhcroot")
        # print(etree.tostring(hhcroot, pretty_print=True))
        # print("=============================")
        log.debug('Found %d section nodes' % toc.count())
        htmlpath = os.path.splitext(hhcpath)[0] + ".html"
        base = os.path.dirname(os.path.abspath(htmlpath))

        def unquote(x):
            if isinstance(x, unicode_type):
                x = x.encode('utf-8')
            return _unquote(x).decode('utf-8')

        def unquote_path(x):
            # Prefer the unquoted form only when it actually exists on disk.
            y = unquote(x)
            if (not os.path.exists(os.path.join(base, x)) and os.path.exists(os.path.join(base, y))):
                x = y
            return x

        def donode(item, parent, base, subpath):
            # Recursively emit <div><a href=...>title</a>...</div> per entry.
            for child in item:
                title = child.title
                if not title:
                    continue
                raw = unquote_path(child.href or '')
                rsrcname = os.path.basename(raw)
                rsrcpath = os.path.join(subpath, rsrcname)
                if (not os.path.exists(os.path.join(base, rsrcpath)) and os.path.exists(os.path.join(base, raw))):
                    rsrcpath = raw

                if '%' not in rsrcpath:
                    rsrcpath = urlquote(rsrcpath)
                if not raw:
                    rsrcpath = ''
                c = DIV(A(title, href=rsrcpath))
                donode(child, c, base, subpath)
                parent.append(c)

        with open(htmlpath, 'wb') as f:
            if toc.count() > 1:
                from lxml.html.builder import HTML, BODY, DIV, A
                path0 = toc[0].href
                path0 = unquote_path(path0)
                subpath = os.path.dirname(path0)
                base = os.path.dirname(f.name)
                root = DIV()
                donode(toc, root, base, subpath)
                raw = html.tostring(HTML(BODY(root)), encoding='utf-8',
                                    pretty_print=True)
                f.write(raw)
            else:
                f.write(as_bytes(hhcdata))
        return htmlpath, toc

    def _read_file(self, name):
        """Return the raw bytes of file *name*."""
        with lopen(name, 'rb') as f:
            data = f.read()
        return data

    def add_node(self, node, toc, ancestor_map):
        """Add a single hhc <object type="text/sitemap"> entry to *toc*,
        attaching it under its nearest already-seen ancestor."""
        from calibre.ebooks.chm.reader import match_string
        if match_string(node.attrib.get('type', ''), 'text/sitemap'):
            p = node.xpath('ancestor::ul[1]/ancestor::li[1]/object[1]')
            parent = p[0] if p else None
            toc = ancestor_map.get(parent, toc)
            title = href = ''
            for param in node.xpath('./param'):
                if match_string(param.attrib['name'], 'name'):
                    title = param.attrib['value']
                elif match_string(param.attrib['name'], 'local'):
                    href = param.attrib['value']
            child = toc.add(title or _('Unknown'), href)
            ancestor_map[node] = child

    def _process_nodes(self, root):
        """Walk every <object> in the hhc document and build the TOC."""
        from calibre.ebooks.oeb.base import TOC
        toc = TOC()
        ancestor_map = {}
        for node in root.xpath('//object'):
            self.add_node(node, toc, ancestor_map)
        return toc
|
||||
310
ebook_converter/ebooks/conversion/plugins/comic_input.py
Normal file
310
ebook_converter/ebooks/conversion/plugins/comic_input.py
Normal file
@@ -0,0 +1,310 @@
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
'''
|
||||
Based on ideas from comiclrf created by FangornUK.
|
||||
'''
|
||||
|
||||
import shutil, textwrap, codecs, os
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
|
||||
from calibre import CurrentDir
|
||||
from calibre.ptempfile import PersistentTemporaryDirectory
|
||||
from polyglot.builtins import getcwd, map
|
||||
|
||||
|
||||
class ComicInput(InputFormatPlugin):
|
||||
|
||||
name = 'Comic Input'
|
||||
author = 'Kovid Goyal'
|
||||
description = 'Optimize comic files (.cbz, .cbr, .cbc) for viewing on portable devices'
|
||||
file_types = {'cbz', 'cbr', 'cbc'}
|
||||
is_image_collection = True
|
||||
commit_name = 'comic_input'
|
||||
core_usage = -1
|
||||
|
||||
options = {
|
||||
OptionRecommendation(name='colors', recommended_value=0,
|
||||
help=_('Reduce the number of colors used in the image. This works only'
|
||||
' if you choose the PNG output format. It is useful to reduce file sizes.'
|
||||
' Set to zero to turn off. Maximum value is 256. It is off by default.')),
|
||||
OptionRecommendation(name='dont_normalize', recommended_value=False,
|
||||
help=_('Disable normalize (improve contrast) color range '
|
||||
'for pictures. Default: False')),
|
||||
OptionRecommendation(name='keep_aspect_ratio', recommended_value=False,
|
||||
help=_('Maintain picture aspect ratio. Default is to fill the screen.')),
|
||||
OptionRecommendation(name='dont_sharpen', recommended_value=False,
|
||||
help=_('Disable sharpening.')),
|
||||
OptionRecommendation(name='disable_trim', recommended_value=False,
|
||||
help=_('Disable trimming of comic pages. For some comics, '
|
||||
'trimming might remove content as well as borders.')),
|
||||
OptionRecommendation(name='landscape', recommended_value=False,
|
||||
help=_("Don't split landscape images into two portrait images")),
|
||||
OptionRecommendation(name='wide', recommended_value=False,
|
||||
help=_("Keep aspect ratio and scale image using screen height as "
|
||||
"image width for viewing in landscape mode.")),
|
||||
OptionRecommendation(name='right2left', recommended_value=False,
|
||||
help=_('Used for right-to-left publications like manga. '
|
||||
'Causes landscape pages to be split into portrait pages '
|
||||
'from right to left.')),
|
||||
OptionRecommendation(name='despeckle', recommended_value=False,
|
||||
help=_('Enable Despeckle. Reduces speckle noise. '
|
||||
'May greatly increase processing time.')),
|
||||
OptionRecommendation(name='no_sort', recommended_value=False,
|
||||
help=_("Don't sort the files found in the comic "
|
||||
"alphabetically by name. Instead use the order they were "
|
||||
"added to the comic.")),
|
||||
OptionRecommendation(name='output_format', choices=['png', 'jpg'],
|
||||
recommended_value='png', help=_('The format that images in the created e-book '
|
||||
'are converted to. You can experiment to see which format gives '
|
||||
'you optimal size and look on your device.')),
|
||||
OptionRecommendation(name='no_process', recommended_value=False,
|
||||
help=_("Apply no processing to the image")),
|
||||
OptionRecommendation(name='dont_grayscale', recommended_value=False,
|
||||
help=_('Do not convert the image to grayscale (black and white)')),
|
||||
OptionRecommendation(name='comic_image_size', recommended_value=None,
|
||||
help=_('Specify the image size as widthxheight pixels. Normally,'
|
||||
' an image size is automatically calculated from the output '
|
||||
'profile, this option overrides it.')),
|
||||
OptionRecommendation(name='dont_add_comic_pages_to_toc', recommended_value=False,
|
||||
help=_('When converting a CBC do not add links to each page to'
|
||||
' the TOC. Note this only applies if the TOC has more than one'
|
||||
' section')),
|
||||
}
|
||||
|
||||
recommendations = {
|
||||
('margin_left', 0, OptionRecommendation.HIGH),
|
||||
('margin_top', 0, OptionRecommendation.HIGH),
|
||||
('margin_right', 0, OptionRecommendation.HIGH),
|
||||
('margin_bottom', 0, OptionRecommendation.HIGH),
|
||||
('insert_blank_line', False, OptionRecommendation.HIGH),
|
||||
('remove_paragraph_spacing', False, OptionRecommendation.HIGH),
|
||||
('change_justification', 'left', OptionRecommendation.HIGH),
|
||||
('dont_split_on_pagebreaks', True, OptionRecommendation.HIGH),
|
||||
('chapter', None, OptionRecommendation.HIGH),
|
||||
('page_breaks_brefore', None, OptionRecommendation.HIGH),
|
||||
('use_auto_toc', False, OptionRecommendation.HIGH),
|
||||
('page_breaks_before', None, OptionRecommendation.HIGH),
|
||||
('disable_font_rescaling', True, OptionRecommendation.HIGH),
|
||||
('linearize_tables', False, OptionRecommendation.HIGH),
|
||||
}
|
||||
|
||||
def get_comics_from_collection(self, stream):
|
||||
from calibre.libunzip import extract as zipextract
|
||||
tdir = PersistentTemporaryDirectory('_comic_collection')
|
||||
zipextract(stream, tdir)
|
||||
comics = []
|
||||
with CurrentDir(tdir):
|
||||
if not os.path.exists('comics.txt'):
|
||||
raise ValueError((
|
||||
'%s is not a valid comic collection'
|
||||
' no comics.txt was found in the file')
|
||||
%stream.name)
|
||||
with open('comics.txt', 'rb') as f:
|
||||
raw = f.read()
|
||||
if raw.startswith(codecs.BOM_UTF16_BE):
|
||||
raw = raw.decode('utf-16-be')[1:]
|
||||
elif raw.startswith(codecs.BOM_UTF16_LE):
|
||||
raw = raw.decode('utf-16-le')[1:]
|
||||
elif raw.startswith(codecs.BOM_UTF8):
|
||||
raw = raw.decode('utf-8')[1:]
|
||||
else:
|
||||
raw = raw.decode('utf-8')
|
||||
for line in raw.splitlines():
|
||||
line = line.strip()
|
||||
if not line:
|
||||
continue
|
||||
fname, title = line.partition(':')[0], line.partition(':')[-1]
|
||||
fname = fname.replace('#', '_')
|
||||
fname = os.path.join(tdir, *fname.split('/'))
|
||||
if not title:
|
||||
title = os.path.basename(fname).rpartition('.')[0]
|
||||
if os.access(fname, os.R_OK):
|
||||
comics.append([title, fname])
|
||||
if not comics:
|
||||
raise ValueError('%s has no comics'%stream.name)
|
||||
return comics
|
||||
|
||||
def get_pages(self, comic, tdir2):
|
||||
from calibre.ebooks.comic.input import (extract_comic, process_pages,
|
||||
find_pages)
|
||||
tdir = extract_comic(comic)
|
||||
new_pages = find_pages(tdir, sort_on_mtime=self.opts.no_sort,
|
||||
verbose=self.opts.verbose)
|
||||
thumbnail = None
|
||||
if not new_pages:
|
||||
raise ValueError('Could not find any pages in the comic: %s'
|
||||
%comic)
|
||||
if self.opts.no_process:
|
||||
n2 = []
|
||||
for i, page in enumerate(new_pages):
|
||||
n2.append(os.path.join(tdir2, '{} - {}' .format(i, os.path.basename(page))))
|
||||
shutil.copyfile(page, n2[-1])
|
||||
new_pages = n2
|
||||
else:
|
||||
new_pages, failures = process_pages(new_pages, self.opts,
|
||||
self.report_progress, tdir2)
|
||||
if failures:
|
||||
self.log.warning('Could not process the following pages '
|
||||
'(run with --verbose to see why):')
|
||||
for f in failures:
|
||||
self.log.warning('\t', f)
|
||||
if not new_pages:
|
||||
raise ValueError('Could not find any valid pages in comic: %s'
|
||||
% comic)
|
||||
thumbnail = os.path.join(tdir2,
|
||||
'thumbnail.'+self.opts.output_format.lower())
|
||||
if not os.access(thumbnail, os.R_OK):
|
||||
thumbnail = None
|
||||
return new_pages
|
||||
|
||||
def get_images(self):
|
||||
return self._images
|
||||
|
||||
def convert(self, stream, opts, file_ext, log, accelerators):
|
||||
from calibre.ebooks.metadata import MetaInformation
|
||||
from calibre.ebooks.metadata.opf2 import OPFCreator
|
||||
from calibre.ebooks.metadata.toc import TOC
|
||||
|
||||
self.opts, self.log= opts, log
|
||||
if file_ext == 'cbc':
|
||||
comics_ = self.get_comics_from_collection(stream)
|
||||
else:
|
||||
comics_ = [['Comic', os.path.abspath(stream.name)]]
|
||||
stream.close()
|
||||
comics = []
|
||||
for i, x in enumerate(comics_):
|
||||
title, fname = x
|
||||
cdir = 'comic_%d'%(i+1) if len(comics_) > 1 else '.'
|
||||
cdir = os.path.abspath(cdir)
|
||||
if not os.path.exists(cdir):
|
||||
os.makedirs(cdir)
|
||||
pages = self.get_pages(fname, cdir)
|
||||
if not pages:
|
||||
continue
|
||||
if self.for_viewer:
|
||||
comics.append((title, pages, [self.create_viewer_wrapper(pages)]))
|
||||
else:
|
||||
wrappers = self.create_wrappers(pages)
|
||||
comics.append((title, pages, wrappers))
|
||||
|
||||
if not comics:
|
||||
raise ValueError('No comic pages found in %s'%stream.name)
|
||||
|
||||
mi = MetaInformation(os.path.basename(stream.name).rpartition('.')[0],
|
||||
[_('Unknown')])
|
||||
opf = OPFCreator(getcwd(), mi)
|
||||
entries = []
|
||||
|
||||
def href(x):
|
||||
if len(comics) == 1:
|
||||
return os.path.basename(x)
|
||||
return '/'.join(x.split(os.sep)[-2:])
|
||||
|
||||
cover_href = None
|
||||
for comic in comics:
|
||||
pages, wrappers = comic[1:]
|
||||
page_entries = [(x, None) for x in map(href, pages)]
|
||||
entries += [(w, None) for w in map(href, wrappers)] + page_entries
|
||||
if cover_href is None and page_entries:
|
||||
cover_href = page_entries[0][0]
|
||||
opf.create_manifest(entries)
|
||||
spine = []
|
||||
for comic in comics:
|
||||
spine.extend(map(href, comic[2]))
|
||||
self._images = []
|
||||
for comic in comics:
|
||||
self._images.extend(comic[1])
|
||||
opf.create_spine(spine)
|
||||
if self.for_viewer and cover_href:
|
||||
opf.guide.set_cover(cover_href)
|
||||
toc = TOC()
|
||||
if len(comics) == 1:
|
||||
wrappers = comics[0][2]
|
||||
for i, x in enumerate(wrappers):
|
||||
toc.add_item(href(x), None, _('Page')+' %d'%(i+1),
|
||||
play_order=i)
|
||||
else:
|
||||
po = 0
|
||||
for comic in comics:
|
||||
po += 1
|
||||
wrappers = comic[2]
|
||||
stoc = toc.add_item(href(wrappers[0]),
|
||||
None, comic[0], play_order=po)
|
||||
if not opts.dont_add_comic_pages_to_toc:
|
||||
for i, x in enumerate(wrappers):
|
||||
stoc.add_item(href(x), None,
|
||||
_('Page')+' %d'%(i+1), play_order=po)
|
||||
po += 1
|
||||
opf.set_toc(toc)
|
||||
with open('metadata.opf', 'wb') as m, open('toc.ncx', 'wb') as n:
|
||||
opf.render(m, n, 'toc.ncx')
|
||||
return os.path.abspath('metadata.opf')
|
||||
|
||||
def create_wrappers(self, pages):
    """Write one XHTML wrapper document per comic page image.

    Each wrapper is written as page_N.xhtml next to the images and simply
    centers the corresponding image. Returns the list of wrapper paths, in
    page order.
    """
    from calibre.ebooks.oeb.base import XHTML_NS
    template = textwrap.dedent('''\
        <html xmlns="%s">
            <head>
                <meta charset="utf-8"/>
                <title>Page #%d</title>
                <style type="text/css">
                    @page { margin:0pt; padding: 0pt}
                    body { margin: 0pt; padding: 0pt}
                    div { text-align: center }
                </style>
            </head>
            <body>
                <div>
                    <img src="%s" alt="comic page #%d" />
                </div>
            </body>
        </html>
        ''')
    wrappers = []
    # All images live in the same directory; wrappers go next to them.
    base_dir = os.path.dirname(pages[0])
    for index, image in enumerate(pages):
        num = index + 1
        markup = template % (XHTML_NS, num, os.path.basename(image), num)
        wrapper_path = os.path.join(base_dir, 'page_%d.xhtml' % num)
        with open(wrapper_path, 'wb') as f:
            f.write(markup.encode('utf-8'))
        wrappers.append(wrapper_path)
    return wrappers
|
||||
|
||||
def create_viewer_wrapper(self, pages):
    """Write a single XHTML document showing every comic page image, one
    per viewport-sized block, and return its path.

    Used when converting for the viewer (see the self.for_viewer branch in
    convert()), which wants the whole comic in one flow.
    """
    from calibre.ebooks.oeb.base import XHTML_NS

    def page(src):
        return '<img src="{}"></img>'.format(os.path.basename(src))

    # BUG FIX: derive the output directory from the first image *path*
    # before `pages` is rebound to the joined markup string below.
    # Previously os.path.dirname(pages[0]) ran after the rebinding, i.e. on
    # the markup's first character ('<'), so `base` was always '' and
    # wrapper.xhtml landed in the current directory instead of next to the
    # images.
    base = os.path.dirname(pages[0])
    pages = '\n'.join(map(page, pages))
    # %% escapes are literal CSS percent signs in this %-formatted template.
    wrapper = '''
    <html xmlns="%s">
        <head>
            <meta charset="utf-8"/>
            <style type="text/css">
            html, body, img { height: 100vh; display: block; margin: 0; padding: 0; border-width: 0; }
            img {
                width: 100%%; height: 100%%;
                object-fit: contain;
                margin-left: auto; margin-right: auto;
                max-width: 100vw; max-height: 100vh;
                top: 50vh; transform: translateY(-50%%);
                position: relative;
                page-break-after: always;
            }
            </style>
        </head>
        <body>
        %s
        </body>
    </html>
    ''' % (XHTML_NS, pages)
    path = os.path.join(base, 'wrapper.xhtml')
    with open(path, 'wb') as f:
        f.write(wrapper.encode('utf-8'))
    return path
|
||||
67
ebook_converter/ebooks/conversion/plugins/djvu_input.py
Normal file
67
ebook_converter/ebooks/conversion/plugins/djvu_input.py
Normal file
@@ -0,0 +1,67 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2011, Anthon van der Neut <anthon@mnt.org>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os
|
||||
from io import BytesIO
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin
|
||||
from polyglot.builtins import getcwd
|
||||
|
||||
|
||||
class DJVUInput(InputFormatPlugin):
    """Input plugin that converts the embedded text layer of an OCR-ed DJVU
    file to HTML and feeds it through the regular HTML input pipeline.

    Image-only DJVU files (page scans with no text layer) are rejected with
    a ValueError.
    """

    name = 'DJVU Input'
    author = 'Anthon van der Neut'
    description = 'Convert OCR-ed DJVU files (.djvu) to HTML'
    file_types = {'djvu', 'djv'}
    commit_name = 'djvu_input'

    def convert(self, stream, options, file_ext, log, accelerators):
        """Extract the DJVU text layer, HTML-ize it, convert via the HTML
        input plugin, attach file metadata and return the OEB book."""
        from calibre.ebooks.txt.processor import convert_basic

        # Pull the raw text layer out of the DJVU container.
        stdout = BytesIO()
        from calibre.ebooks.djvu.djvu import DJVUFile
        x = DJVUFile(stream)
        x.get_text(stdout)
        raw_text = stdout.getvalue()
        if not raw_text:
            raise ValueError('The DJVU file contains no text, only images, probably page scans.'
                    ' calibre only supports conversion of DJVU files with actual text in them.')

        # Flatten intra-page newlines to spaces and turn \037 (unit
        # separator) into paragraph breaks — presumably \037 delimits
        # pages/sections in DJVUFile's output; confirm against DJVUFile.
        html = convert_basic(raw_text.replace(b"\n", b' ').replace(
            b'\037', b'\n\n'))
        # Run the HTMLized text through the html processing plugin.
        from calibre.customize.ui import plugin_for_input_format
        html_input = plugin_for_input_format('html')
        # Reset all HTML-input options to their recommended defaults before
        # delegating, so stale values from this plugin's run don't leak in.
        for opt in html_input.options:
            setattr(options, opt.option.name, opt.recommended_value)
        options.input_encoding = 'utf-8'
        base = getcwd()
        # Choose an index*.html name that does not clobber an existing file.
        htmlfile = os.path.join(base, 'index.html')
        c = 0
        while os.path.exists(htmlfile):
            c += 1
            htmlfile = os.path.join(base, 'index%d.html'%c)
        with open(htmlfile, 'wb') as f:
            f.write(html.encode('utf-8'))
        # Temporarily disable pipeline debugging for the nested conversion,
        # restoring the caller's setting afterwards.
        odi = options.debug_pipeline
        options.debug_pipeline = None
        # Generate oeb from html conversion.
        with open(htmlfile, 'rb') as f:
            oeb = html_input.convert(f, options, 'html', log,
                    {})
        options.debug_pipeline = odi
        os.remove(htmlfile)

        # Set metadata from file.
        from calibre.customize.ui import get_file_type_metadata
        from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
        mi = get_file_type_metadata(stream, file_ext)
        meta_info_to_oeb_metadata(mi, oeb.metadata, log)

        return oeb
|
||||
34
ebook_converter/ebooks/conversion/plugins/docx_input.py
Normal file
34
ebook_converter/ebooks/conversion/plugins/docx_input.py
Normal file
@@ -0,0 +1,34 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=utf-8
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
|
||||
|
||||
|
||||
class DOCXInput(InputFormatPlugin):
    """Input plugin that converts Word DOCX/DOCM documents to HTML."""

    name = 'DOCX Input'
    author = 'Kovid Goyal'
    description = _('Convert DOCX files (.docx and .docm) to HTML')
    file_types = {'docx', 'docm'}
    commit_name = 'docx_input'

    options = {
        OptionRecommendation(
            name='docx_no_cover', recommended_value=False,
            help=_('Normally, if a large image is present at the start of the document that looks like a cover, '
                   'it will be removed from the document and used as the cover for created e-book. This option '
                   'turns off that behavior.')),
        OptionRecommendation(
            name='docx_no_pagebreaks_between_notes', recommended_value=False,
            help=_('Do not insert a page break after every endnote.')),
        OptionRecommendation(
            name='docx_inline_subsup', recommended_value=False,
            help=_('Render superscripts and subscripts so that they do not affect the line height.')),
    }

    recommendations = {('page_breaks_before', '/', OptionRecommendation.MED)}

    def convert(self, stream, options, file_ext, log, accelerators):
        """Run the DOCX-to-HTML converter over *stream* and return the book."""
        from calibre.ebooks.docx.to_html import Convert
        converter = Convert(
            stream,
            detect_cover=not options.docx_no_cover,
            log=log,
            notes_nopb=options.docx_no_pagebreaks_between_notes,
            nosupsub=options.docx_inline_subsup,
        )
        return converter()
|
||||
93
ebook_converter/ebooks/conversion/plugins/docx_output.py
Normal file
93
ebook_converter/ebooks/conversion/plugins/docx_output.py
Normal file
@@ -0,0 +1,93 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=utf-8
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
from calibre.customize.conversion import OutputFormatPlugin, OptionRecommendation
|
||||
|
||||
# Paper sizes accepted by the docx_page_size option; also published via
# DOCXOutput.ui_data so front-ends can offer the same list.
PAGE_SIZES = ['a0', 'a1', 'a2', 'a3', 'a4', 'a5', 'a6', 'b0', 'b1',
              'b2', 'b3', 'b4', 'b5', 'b6', 'legal', 'letter']
|
||||
|
||||
|
||||
class DOCXOutput(OutputFormatPlugin):
    """Output plugin that renders an OEB book as a DOCX (Word) document."""

    name = 'DOCX Output'
    author = 'Kovid Goyal'
    file_type = 'docx'
    commit_name = 'docx_output'
    # Published so front-ends can present the supported page sizes.
    ui_data = {'page_sizes': PAGE_SIZES}

    options = {
        OptionRecommendation(name='docx_page_size', recommended_value='letter',
            level=OptionRecommendation.LOW, choices=PAGE_SIZES,
            help=_('The size of the page. Default is letter. Choices '
            'are %s') % PAGE_SIZES),

        OptionRecommendation(name='docx_custom_page_size', recommended_value=None,
            help=_('Custom size of the document. Use the form widthxheight '
            'EG. `123x321` to specify the width and height (in pts). '
            'This overrides any specified page-size.')),

        OptionRecommendation(name='docx_no_cover', recommended_value=False,
            help=_('Do not insert the book cover as an image at the start of the document.'
                ' If you use this option, the book cover will be discarded.')),

        OptionRecommendation(name='preserve_cover_aspect_ratio', recommended_value=False,
            help=_('Preserve the aspect ratio of the cover image instead of stretching'
                ' it out to cover the entire page.')),

        OptionRecommendation(name='docx_no_toc', recommended_value=False,
            help=_('Do not insert the table of contents as a page at the start of the document.')),

        OptionRecommendation(name='extract_to',
            help=_('Extract the contents of the generated %s file to the '
                'specified directory. The contents of the directory are first '
                'deleted, so be careful.') % 'DOCX'),

        OptionRecommendation(name='docx_page_margin_left', recommended_value=72.0,
            level=OptionRecommendation.LOW,
            help=_('The size of the left page margin, in pts. Default is 72pt.'
                ' Overrides the common left page margin setting.')
        ),

        OptionRecommendation(name='docx_page_margin_top', recommended_value=72.0,
            level=OptionRecommendation.LOW,
            help=_('The size of the top page margin, in pts. Default is 72pt.'
                ' Overrides the common top page margin setting, unless set to zero.')
        ),

        OptionRecommendation(name='docx_page_margin_right', recommended_value=72.0,
            level=OptionRecommendation.LOW,
            help=_('The size of the right page margin, in pts. Default is 72pt.'
                ' Overrides the common right page margin setting, unless set to zero.')
        ),

        OptionRecommendation(name='docx_page_margin_bottom', recommended_value=72.0,
            level=OptionRecommendation.LOW,
            help=_('The size of the bottom page margin, in pts. Default is 72pt.'
                ' Overrides the common bottom page margin setting, unless set to zero.')
        ),

    }

    def convert_metadata(self, oeb):
        """Populate self.mi with book metadata by round-tripping the OEB
        metadata through an in-memory OPF 2 package document."""
        from lxml import etree
        from calibre.ebooks.oeb.base import OPF, OPF2_NS
        from calibre.ebooks.metadata.opf2 import OPF as ReadOPF
        from io import BytesIO
        package = etree.Element(OPF('package'), attrib={'version': '2.0'}, nsmap={None: OPF2_NS})
        oeb.metadata.to_opf2(package)
        self.mi = ReadOPF(BytesIO(etree.tostring(package, encoding='utf-8')), populate_spine=False, try_to_guess_cover=False).to_book_metadata()

    def convert(self, oeb, output_path, input_plugin, opts, log):
        """Write the book as a DOCX file at output_path; optionally dump the
        generated document's contents to opts.extract_to for debugging."""
        from calibre.ebooks.docx.writer.container import DOCX
        from calibre.ebooks.docx.writer.from_html import Convert
        docx = DOCX(opts, log)
        # Must run before Convert/write below: both consume self.mi.
        self.convert_metadata(oeb)
        Convert(oeb, docx, self.mi, not opts.docx_no_cover, not opts.docx_no_toc)()
        docx.write(output_path, self.mi)
        if opts.extract_to:
            from calibre.ebooks.docx.dump import do_dump
            do_dump(output_path, opts.extract_to)
|
||||
438
ebook_converter/ebooks/conversion/plugins/epub_input.py
Normal file
438
ebook_converter/ebooks/conversion/plugins/epub_input.py
Normal file
@@ -0,0 +1,438 @@
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os, re, posixpath
|
||||
from itertools import cycle
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
|
||||
from polyglot.builtins import getcwd
|
||||
|
||||
# Algorithm URIs identifying the two standard EPUB font obfuscation schemes.
ADOBE_OBFUSCATION = 'http://ns.adobe.com/pdf/enc#RC'
IDPF_OBFUSCATION = 'http://www.idpf.org/2008/embedding'


def decrypt_font_data(key, data, algorithm):
    """De-obfuscate font *data* (bytes) obfuscated with *algorithm*.

    Both schemes XOR an obfuscated prefix of the font with a repeating key
    and leave the rest of the file untouched: Adobe obfuscates the first
    1024 bytes, IDPF the first 1040.

    :param key: the obfuscation key as bytes
    :param data: the complete font file contents
    :param algorithm: ADOBE_OBFUSCATION or IDPF_OBFUSCATION
    :return: the de-obfuscated font data as bytes
    """
    is_adobe = algorithm == ADOBE_OBFUSCATION
    crypt_len = 1024 if is_adobe else 1040
    # XOR the obfuscated prefix against the cyclically repeated key.
    # (Idiom cleanup: the old code wrapped the key in a redundant iter()
    # and converted the generator through an intermediate bytearray.)
    prefix = bytes(b ^ k for b, k in
                   zip(bytearray(data[:crypt_len]), cycle(bytearray(key))))
    return prefix + data[crypt_len:]
|
||||
|
||||
|
||||
def decrypt_font(key, path, algorithm):
    """De-obfuscate the font file at *path* in place using *key*."""
    with lopen(path, 'r+b') as f:
        plaintext = decrypt_font_data(key, f.read(), algorithm)
        f.seek(0)
        f.truncate()
        f.write(plaintext)
|
||||
|
||||
|
||||
class EPUBInput(InputFormatPlugin):
    """Input plugin that unpacks EPUB (2 and 3) files for the conversion
    pipeline.

    Handles font de-obfuscation (Adobe and IDPF schemes), rationalizing
    cover/titlepage entries into the OPF <guide>, and generating a legacy
    NCX table of contents from an EPUB 3 nav document.
    """

    name = 'EPUB Input'
    author = 'Kovid Goyal'
    description = 'Convert EPUB files (.epub) to HTML'
    file_types = {'epub'}
    output_encoding = None
    commit_name = 'epub_input'

    recommendations = {('page_breaks_before', '/', OptionRecommendation.MED)}
|
||||
|
||||
def process_encryption(self, encfile, opf, log):
    """Decrypt obfuscated fonts listed in META-INF/encryption.xml.

    Returns True if every EncryptionMethod entry used one of the two
    standard font obfuscation algorithms (decrypting each referenced font
    in place), False otherwise — which the caller treats as DRM.
    """
    from lxml import etree
    import uuid, hashlib
    # IDPF scheme: key is the SHA-1 of the package unique identifier with
    # whitespace characters removed.
    idpf_key = opf.raw_unique_identifier
    if idpf_key:
        idpf_key = re.sub('[\u0020\u0009\u000d\u000a]', '', idpf_key)
        idpf_key = hashlib.sha1(idpf_key.encode('utf-8')).digest()
    key = None
    # Adobe scheme: key is the raw 16 bytes of a UUID identifier.
    for item in opf.identifier_iter():
        scheme = None
        for xkey in item.attrib.keys():
            if xkey.endswith('scheme'):
                scheme = item.get(xkey)
        if (scheme and scheme.lower() == 'uuid') or \
                (item.text and item.text.startswith('urn:uuid:')):
            try:
                key = item.text.rpartition(':')[-1]
                key = uuid.UUID(key).bytes
            except Exception:
                # BUG FIX: was a bare except:, which would also swallow
                # KeyboardInterrupt/SystemExit.
                import traceback
                traceback.print_exc()
                key = None

    try:
        root = etree.parse(encfile)
        for em in root.xpath('descendant::*[contains(name(), "EncryptionMethod")]'):
            algorithm = em.get('Algorithm', '')
            if algorithm not in {ADOBE_OBFUSCATION, IDPF_OBFUSCATION}:
                # Any other algorithm is real DRM we cannot remove.
                return False
            cr = em.getparent().xpath('descendant::*[contains(name(), "CipherReference")]')[0]
            uri = cr.get('URI')
            path = os.path.abspath(os.path.join(os.path.dirname(encfile), '..', *uri.split('/')))
            tkey = (key if algorithm == ADOBE_OBFUSCATION else idpf_key)
            if (tkey and os.path.exists(path)):
                self._encrypted_font_uris.append(uri)
                decrypt_font(tkey, path, algorithm)
        return True
    except Exception:
        # BUG FIX: was a bare except: (see above).
        import traceback
        traceback.print_exc()
    return False
|
||||
|
||||
def set_guide_type(self, opf, gtype, href=None, title=''):
    """Replace all guide entries of type *gtype*; if *href* is given, add
    a fresh entry pointing at it (creating the <guide> if necessary)."""
    # Drop every existing entry of this type (snapshot first, since we
    # mutate while iterating).
    stale = [e for e in opf.iterguide() if e.get('type', '').lower() == gtype]
    for e in stale:
        e.getparent().remove(e)

    if href is None:
        return
    entry = opf.create_guide_item(gtype, title, href)
    existing_guides = opf.root.xpath('./*[local-name()="guide"]')
    if existing_guides:
        existing_guides[0].append(entry)
        return
    guide = opf.create_guide_element()
    opf.root.append(guide)
    guide.append(entry)
    return entry
|
||||
|
||||
def rationalize_cover3(self, opf, log):
    ''' If there is a reference to the cover/titlepage via manifest properties, convert to
    entries in the <guide> so that the rest of the pipeline picks it up.
    Returns the href of the titlepage removed from the spine, or None. '''
    from calibre.ebooks.metadata.opf3 import items_with_property
    removed = guide_titlepage_href = guide_titlepage_id = None

    # Look for titlepages incorrectly marked in the <guide> as covers
    guide_cover, guide_elem = None, None
    for guide_elem in opf.iterguide():
        if guide_elem.get('type', '').lower() == 'cover':
            guide_cover = guide_elem.get('href', '').partition('#')[0]
            break
    if guide_cover:
        # The guide "cover" is actually a titlepage if it is also the first
        # spine item.
        spine = list(opf.iterspine())
        if spine:
            idref = spine[0].get('idref', '')
            for x in opf.itermanifest():
                if x.get('id') == idref and x.get('href') == guide_cover:
                    guide_titlepage_href = guide_cover
                    guide_titlepage_id = idref
                    break

    # EPUB 3 identifies the raster cover via manifest properties.
    raster_cover_href = opf.epub3_raster_cover or opf.raster_cover
    if raster_cover_href:
        self.set_guide_type(opf, 'cover', raster_cover_href, 'Cover Image')
    # Prefer an explicit calibre:title-page manifest property; fall back to
    # the guide-derived titlepage found above.
    titlepage_id = titlepage_href = None
    for item in items_with_property(opf.root, 'calibre:title-page'):
        tid, href = item.get('id'), item.get('href')
        if href and tid:
            titlepage_id, titlepage_href = tid, href.partition('#')[0]
            break
    if titlepage_href is None:
        titlepage_href, titlepage_id = guide_titlepage_href, guide_titlepage_id
    if titlepage_href is not None:
        self.set_guide_type(opf, 'titlepage', titlepage_href, 'Title Page')
        spine = list(opf.iterspine())
        # Only remove the titlepage if it is not the sole spine item.
        if len(spine) > 1:
            for item in spine:
                if item.get('idref') == titlepage_id:
                    log('Found HTML cover', titlepage_href)
                    if self.for_viewer:
                        # The viewer keeps the titlepage; just ensure it is
                        # displayed (drop any linear='no').
                        item.attrib.pop('linear', None)
                    else:
                        item.getparent().remove(item)
                        removed = titlepage_href
    return removed
|
||||
|
||||
def rationalize_cover2(self, opf, log):
    ''' Ensure that the cover information in the guide is correct. That
    means, at most one entry with type="cover" that points to a raster
    cover and at most one entry with type="titlepage" that points to an
    HTML titlepage. Returns the href of the HTML cover removed from the
    spine, or None. '''
    from calibre.ebooks.oeb.base import OPF
    removed = None
    from lxml import etree
    # Find the first guide entry of type "cover".
    guide_cover, guide_elem = None, None
    for guide_elem in opf.iterguide():
        if guide_elem.get('type', '').lower() == 'cover':
            guide_cover = guide_elem.get('href', '').partition('#')[0]
            break
    if not guide_cover:
        # No guide cover: synthesize one from the OPF raster cover, if any,
        # then bail out.
        raster_cover = opf.raster_cover
        if raster_cover:
            if guide_elem is None:
                g = opf.root.makeelement(OPF('guide'))
                opf.root.append(g)
            else:
                g = guide_elem.getparent()
            guide_cover = raster_cover
            guide_elem = g.makeelement(OPF('reference'), attrib={'href':raster_cover, 'type':'cover'})
            g.append(guide_elem)
        return
    spine = list(opf.iterspine())
    if not spine:
        return
    # Check if the cover specified in the guide is also
    # the first element in spine
    idref = spine[0].get('idref', '')
    manifest = list(opf.itermanifest())
    if not manifest:
        return
    elem = [x for x in manifest if x.get('id', '') == idref]
    if not elem or elem[0].get('href', None) != guide_cover:
        return
    log('Found HTML cover', guide_cover)

    # Remove from spine as covers must be treated
    # specially
    if not self.for_viewer:
        if len(spine) == 1:
            log.warn('There is only a single spine item and it is marked as the cover. Removing cover marking.')
            for guide_elem in tuple(opf.iterguide()):
                if guide_elem.get('type', '').lower() == 'cover':
                    guide_elem.getparent().remove(guide_elem)
            return
        else:
            spine[0].getparent().remove(spine[0])
            removed = guide_cover
    else:
        # Ensure the cover is displayed as the first item in the book, some
        # epub files have it set with linear='no' which causes the cover to
        # display in the end
        spine[0].attrib.pop('linear', None)
        opf.spine[0].is_linear = True
    # Ensure that the guide has a cover entry pointing to a raster cover
    # and a titlepage entry pointing to the html titlepage. The titlepage
    # entry will be used by the epub output plugin, the raster cover entry
    # by other output plugins.

    # Search for a raster cover identified in the OPF
    raster_cover = opf.raster_cover

    # Set the cover guide entry
    if raster_cover is not None:
        guide_elem.set('href', raster_cover)
    else:
        # Render the titlepage to create a raster cover
        from calibre.ebooks import render_html_svg_workaround
        guide_elem.set('href', 'calibre_raster_cover.jpg')
        t = etree.SubElement(
            elem[0].getparent(), OPF('item'), href=guide_elem.get('href'), id='calibre_raster_cover')
        t.set('media-type', 'image/jpeg')
        if os.path.exists(guide_cover):
            renderer = render_html_svg_workaround(guide_cover, log)
            if renderer is not None:
                with lopen('calibre_raster_cover.jpg', 'wb') as f:
                    f.write(renderer)

    # Set the titlepage guide entry
    self.set_guide_type(opf, 'titlepage', guide_cover, 'Title Page')
    return removed
|
||||
|
||||
def find_opf(self):
    """Locate the package document via META-INF/container.xml, returning
    its absolute path or None if it cannot be resolved."""
    from calibre.utils.xml_parse import safe_xml_fromstring

    def ns_attr(node, suffix):
        # Attribute lookup that ignores any namespace prefix.
        for name, value in node.attrib.items():
            if name.endswith(suffix):
                return value

    try:
        with lopen('META-INF/container.xml', 'rb') as f:
            container = safe_xml_fromstring(f.read())
        for rootfile in container.xpath('//*[local-name()="rootfile"]'):
            if ns_attr(rootfile, 'media-type') != "application/oebps-package+xml":
                continue
            rel = ns_attr(rootfile, 'full-path')
            if not rel:
                continue
            candidate = os.path.join(getcwd(), *rel.split('/'))
            if os.path.exists(candidate):
                return candidate
    except Exception:
        import traceback
        traceback.print_exc()
|
||||
|
||||
def convert(self, stream, options, file_ext, log, accelerators):
    """Unpack the EPUB into the current directory, de-obfuscate fonts,
    normalize the OPF (covers, EPUB 3 nav, spine) and return the absolute
    path of the rewritten package document (content.opf)."""
    from calibre.utils.zipfile import ZipFile
    from calibre import walk
    from calibre.ebooks import DRMError
    from calibre.ebooks.metadata.opf2 import OPF
    try:
        zf = ZipFile(stream)
        zf.extractall(getcwd())
    except Exception:
        # BUG FIX: was a bare except:, which would also swallow
        # KeyboardInterrupt/SystemExit.
        log.exception('EPUB appears to be invalid ZIP file, trying a'
                ' more forgiving ZIP parser')
        from calibre.utils.localunzip import extractall
        stream.seek(0)
        extractall(stream)
    encfile = os.path.abspath(os.path.join('META-INF', 'encryption.xml'))
    opf = self.find_opf()
    if opf is None:
        # Fall back to scanning the archive for any plausible OPF file.
        for f in walk('.'):
            if f.lower().endswith('.opf') and '__MACOSX' not in f and \
                    not os.path.basename(f).startswith('.'):
                opf = os.path.abspath(f)
                break
    path = getattr(stream, 'name', 'stream')

    if opf is None:
        raise ValueError('%s is not a valid EPUB file (could not find opf)'%path)

    opf = os.path.relpath(opf, getcwd())
    parts = os.path.split(opf)
    opf = OPF(opf, os.path.dirname(os.path.abspath(opf)))

    self._encrypted_font_uris = []
    if os.path.exists(encfile):
        # Failure to handle encryption.xml means real DRM.
        if not self.process_encryption(encfile, opf, log):
            raise DRMError(os.path.basename(path))
    self.encrypted_fonts = self._encrypted_font_uris

    # If the OPF lives in a subdirectory, rebase manifest/guide hrefs onto
    # the archive root.
    if len(parts) > 1 and parts[0]:
        delta = '/'.join(parts[:-1])+'/'

        def normpath(x):
            # BUG FIX: this previously ignored its argument and read the
            # loop variable `elem` from the enclosing scope; it only worked
            # because every call site passed elem.get('href').
            return posixpath.normpath(delta + x)

        for elem in opf.itermanifest():
            elem.set('href', normpath(elem.get('href')))
        for elem in opf.iterguide():
            elem.set('href', normpath(elem.get('href')))

    f = self.rationalize_cover3 if opf.package_version >= 3.0 else self.rationalize_cover2
    self.removed_cover = f(opf, log)
    if self.removed_cover:
        self.removed_items_to_ignore = (self.removed_cover,)
    epub3_nav = opf.epub3_nav
    if epub3_nav is not None:
        self.convert_epub3_nav(epub3_nav, opf, log, options)

    for x in opf.itermanifest():
        if x.get('media-type', '') == 'application/x-dtbook+xml':
            raise ValueError(
                'EPUB files with DTBook markup are not supported')

    # Collect manifest items that must never appear in the spine.
    not_for_spine = set()
    for y in opf.itermanifest():
        id_ = y.get('id', None)
        if id_:
            mt = y.get('media-type', None)
            if mt in {
                    'application/vnd.adobe-page-template+xml',
                    'application/vnd.adobe.page-template+xml',
                    'application/adobe-page-template+xml',
                    'application/adobe.page-template+xml',
                    'application/text'
            }:
                not_for_spine.add(id_)
            ext = y.get('href', '').rpartition('.')[-1].lower()
            if mt == 'text/plain' and ext in {'otf', 'ttf'}:
                # some epub authoring software sets font mime types to
                # text/plain
                not_for_spine.add(id_)
                y.set('media-type', 'application/font')

    # Drop duplicate, empty and non-displayable spine entries.
    seen = set()
    for x in list(opf.iterspine()):
        ref = x.get('idref', None)
        if not ref or ref in not_for_spine or ref in seen:
            x.getparent().remove(x)
            continue
        seen.add(ref)

    if len(list(opf.iterspine())) == 0:
        raise ValueError('No valid entries in the spine of this EPUB')

    with lopen('content.opf', 'wb') as nopf:
        nopf.write(opf.render())

    return os.path.abspath('content.opf')
|
||||
|
||||
def convert_epub3_nav(self, nav_path, opf, log, opts):
    """Generate a legacy NCX table of contents from the EPUB 3 nav
    document so the (EPUB 2 era) downstream pipeline can consume it, and
    record the parsed nav on *opts* for later stages."""
    from lxml import etree
    from calibre.ebooks.chardet import xml_to_unicode
    from calibre.ebooks.oeb.polish.parsing import parse
    from calibre.ebooks.oeb.base import EPUB_NS, XHTML, NCX_MIME, NCX, urlnormalize, urlunquote, serialize
    from calibre.ebooks.oeb.polish.toc import first_child
    from calibre.utils.xml_parse import safe_xml_fromstring
    from tempfile import NamedTemporaryFile
    with lopen(nav_path, 'rb') as f:
        raw = f.read()
    raw = xml_to_unicode(raw, strip_encoding_pats=True, assume_utf8=True)[0]
    root = parse(raw, log=log)
    # Skeleton NCX document; navPoints are appended under its navMap.
    ncx = safe_xml_fromstring('<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" version="2005-1" xml:lang="eng"><navMap/></ncx>')
    navmap = ncx[0]
    et = '{%s}type' % EPUB_NS
    bn = os.path.basename(nav_path)

    def add_from_li(li, parent):
        # Build one NCX navPoint from an <li> of the nav list.
        href = text = None
        for x in li.iterchildren(XHTML('a'), XHTML('span')):
            text = etree.tostring(
                x, method='text', encoding='unicode', with_tail=False).strip() or ' '.join(
                    x.xpath('descendant-or-self::*/@title')).strip()
            href = x.get('href')
            if href:
                if href.startswith('#'):
                    # Fragment-only links are relative to the nav doc itself.
                    href = bn + href
                break
        np = parent.makeelement(NCX('navPoint'))
        parent.append(np)
        np.append(np.makeelement(NCX('navLabel')))
        np[0].append(np.makeelement(NCX('text')))
        np[0][0].text = text
        if href:
            np.append(np.makeelement(NCX('content'), attrib={'src':href}))
        return np

    def process_nav_node(node, toc_parent):
        # Recursively mirror nested <ol>/<li> lists as navPoint children.
        for li in node.iterchildren(XHTML('li')):
            child = add_from_li(li, toc_parent)
            ol = first_child(li, XHTML('ol'))
            if child is not None and ol is not None:
                process_nav_node(ol, child)

    # Only the nav with epub:type="toc" is converted; bail out (for-else)
    # if there is none.
    for nav in root.iterdescendants(XHTML('nav')):
        if nav.get(et) == 'toc':
            ol = first_child(nav, XHTML('ol'))
            if ol is not None:
                process_nav_node(ol, navmap)
            break
    else:
        return

    # Write the NCX next to the nav document and register it in the OPF.
    with NamedTemporaryFile(suffix='.ncx', dir=os.path.dirname(nav_path), delete=False) as f:
        f.write(etree.tostring(ncx, encoding='utf-8'))
    ncx_href = os.path.relpath(f.name, getcwd()).replace(os.sep, '/')
    ncx_id = opf.create_manifest_item(ncx_href, NCX_MIME, append=True).get('id')
    for spine in opf.root.xpath('//*[local-name()="spine"]'):
        spine.set('toc', ncx_id)
    opts.epub3_nav_href = urlnormalize(os.path.relpath(nav_path).replace(os.sep, '/'))
    opts.epub3_nav_parsed = root
    if getattr(self, 'removed_cover', None):
        # Mark (and persist) any nav links that pointed at the removed
        # titlepage so later stages can ignore them.
        changed = False
        base_path = os.path.dirname(nav_path)
        for elem in root.xpath('//*[@href]'):
            href, frag = elem.get('href').partition('#')[::2]
            link_path = os.path.relpath(os.path.join(base_path, urlunquote(href)), base_path)
            abs_href = urlnormalize(link_path)
            if abs_href == self.removed_cover:
                changed = True
                elem.set('data-calibre-removed-titlepage', '1')
        if changed:
            with lopen(nav_path, 'wb') as f:
                f.write(serialize(root, 'application/xhtml+xml'))
|
||||
|
||||
def postprocess_book(self, oeb, opts, log):
    """Record which TOC entry pointed at the removed HTML cover so later
    pipeline stages can special-case it."""
    rc = getattr(self, 'removed_cover', None)
    if rc:
        cover_toc_item = None
        for item in oeb.toc.iterdescendants():
            if item.href and item.href.partition('#')[0] == rc:
                cover_toc_item = item
                break
        spine = {x.href for x in oeb.spine}
        # NOTE(review): `cover_toc_item` is a TOC node while `spine`
        # contains href strings, so the membership test below can never be
        # True and the condition reduces to `cover_toc_item is not None`.
        # The intent was probably to compare the item's href against the
        # spine hrefs — confirm before changing, since fixing it would
        # alter behavior.
        if (cover_toc_item is not None and cover_toc_item not in spine):
            oeb.toc.item_that_refers_to_cover = cover_toc_item
|
||||
548
ebook_converter/ebooks/conversion/plugins/epub_output.py
Normal file
548
ebook_converter/ebooks/conversion/plugins/epub_output.py
Normal file
@@ -0,0 +1,548 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os, shutil, re
|
||||
|
||||
from calibre.customize.conversion import (OutputFormatPlugin,
|
||||
OptionRecommendation)
|
||||
from calibre.ptempfile import TemporaryDirectory
|
||||
from calibre import CurrentDir
|
||||
from polyglot.builtins import unicode_type, filter, map, zip, range, as_bytes
|
||||
|
||||
# Names of (X)HTML block-level elements. NOTE(review): the consumers of
# this tuple are outside this section of the file; presumably it is used
# when deciding how markup may be split/restructured — confirm at the use
# sites.
block_level_tags = (
    'address',
    'body',
    'blockquote',
    'center',
    'dir',
    'div',
    'dl',
    'fieldset',
    'form',
    'h1',
    'h2',
    'h3',
    'h4',
    'h5',
    'h6',
    'hr',
    'isindex',
    'menu',
    'noframes',
    'noscript',
    'ol',
    'p',
    'pre',
    'table',
    'ul',
)
|
||||
|
||||
|
||||
class EPUBOutput(OutputFormatPlugin):
    """Output plugin that writes the OEB book as an EPUB (version 2 or 3)
    file."""

    name = 'EPUB Output'
    author = 'Kovid Goyal'
    file_type = 'epub'
    commit_name = 'epub_output'
    # Published so front-ends can present the supported EPUB versions.
    ui_data = {'versions': ('2', '3')}

    options = {
        OptionRecommendation(name='extract_to',
            help=_('Extract the contents of the generated %s file to the '
                'specified directory. The contents of the directory are first '
                'deleted, so be careful.') % 'EPUB'),

        OptionRecommendation(name='dont_split_on_page_breaks',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Turn off splitting at page breaks. Normally, input '
                'files are automatically split at every page break into '
                'two files. This gives an output e-book that can be '
                'parsed faster and with less resources. However, '
                'splitting is slow and if your source file contains a '
                'very large number of page breaks, you should turn off '
                'splitting on page breaks.'
            )
        ),

        OptionRecommendation(name='flow_size', recommended_value=260,
            help=_('Split all HTML files larger than this size (in KB). '
                'This is necessary as most EPUB readers cannot handle large '
                'file sizes. The default of %defaultKB is the size required '
                'for Adobe Digital Editions. Set to 0 to disable size based splitting.')
        ),

        OptionRecommendation(name='no_default_epub_cover', recommended_value=False,
            help=_('Normally, if the input file has no cover and you don\'t'
            ' specify one, a default cover is generated with the title, '
            'authors, etc. This option disables the generation of this cover.')
        ),

        OptionRecommendation(name='no_svg_cover', recommended_value=False,
            help=_('Do not use SVG for the book cover. Use this option if '
                'your EPUB is going to be used on a device that does not '
                'support SVG, like the iPhone or the JetBook Lite. '
                'Without this option, such devices will display the cover '
                'as a blank page.')
        ),

        OptionRecommendation(name='preserve_cover_aspect_ratio',
            recommended_value=False, help=_(
            'When using an SVG cover, this option will cause the cover to scale '
            'to cover the available screen area, but still preserve its aspect ratio '
            '(ratio of width to height). That means there may be white borders '
            'at the sides or top and bottom of the image, but the image will '
            'never be distorted. Without this option the image may be slightly '
            'distorted, but there will be no borders.'
            )
        ),

        OptionRecommendation(name='epub_flatten', recommended_value=False,
            help=_('This option is needed only if you intend to use the EPUB'
                ' with FBReaderJ. It will flatten the file system inside the'
                ' EPUB, putting all files into the top level.')
        ),

        OptionRecommendation(name='epub_inline_toc', recommended_value=False,
            help=_('Insert an inline Table of Contents that will appear as part of the main book content.')
        ),

        OptionRecommendation(name='epub_toc_at_end', recommended_value=False,
            help=_('Put the inserted inline Table of Contents at the end of the book instead of the start.')
        ),

        OptionRecommendation(name='toc_title', recommended_value=None,
            help=_('Title for any generated in-line table of contents.')
        ),

        OptionRecommendation(name='epub_version', recommended_value='2', choices=ui_data['versions'],
            help=_('The version of the EPUB file to generate. EPUB 2 is the'
                ' most widely compatible, only use EPUB 3 if you know you'
                ' actually need it.')
        ),

    }

    recommendations = {('pretty_print', True, OptionRecommendation.HIGH)}
|
||||
|
||||
def workaround_webkit_quirks(self):  # {{{
    '''Convert completely empty <pre> tags into <div> tags, as WebKit
    based renderers mishandle them.'''
    from calibre.ebooks.oeb.base import XPath
    find_body = XPath('//h:body')
    find_pre = XPath('//h:pre')
    for item in self.oeb.spine:
        root = item.data
        matches = find_body(root)
        body = matches[0] if matches else matches
        if not hasattr(body, 'xpath'):
            # No usable <body> element in this document; skip it
            continue
        for pre in find_pre(body):
            if len(pre) == 0 and not pre.text:
                pre.tag = 'div'
    # }}}
|
||||
|
||||
def upshift_markup(self):  # {{{
    'Upgrade markup to comply with XHTML 1.1 where possible'
    from calibre.ebooks.oeb.base import XPath, XML
    for item in self.oeb.spine:
        root = item.data
        # Promote a plain lang attribute to xml:lang when the latter is missing
        plain_lang = root.get('lang')
        if plain_lang and not root.get(XML('lang')):
            root.set(XML('lang'), plain_lang)
        matches = XPath('//h:body')(root)
        body = matches[0] if matches else matches
        if not hasattr(body, 'xpath'):
            # No usable <body> element; nothing to upgrade here
            continue
        # <u> is not valid in XHTML 1.1; convert to a neutral <span>
        for underline in XPath('//h:u')(root):
            underline.tag = 'span'

        # Drop duplicate id/name attribute values, keeping only the first
        # occurrence so fragment targets remain unique in the document
        seen_ids, seen_names = set(), set()
        for elem in XPath('//*[@id or @name]')(root):
            eid = elem.get('id', None)
            name = elem.get('name', None)
            if eid:
                if eid in seen_ids:
                    del elem.attrib['id']
                else:
                    seen_ids.add(eid)
            if name:
                if name in seen_names:
                    del elem.attrib['name']
                else:
                    seen_names.add(name)
    # }}}
|
||||
|
||||
def convert(self, oeb, output_path, input_plugin, opts, log):
    '''
    Post-process the OEB book (inline ToC, filename normalization,
    renderer workarounds, splitting, cover management) and assemble it
    into an EPUB container at output_path.

    :param oeb: the OEB book object produced by the input plugin
    :param output_path: path of the EPUB file to create
    :param input_plugin: the input plugin instance (consulted for
        encrypted_fonts)
    :param opts: conversion options
    :param log: logger
    '''
    self.log, self.opts, self.oeb = log, opts, oeb

    if self.opts.epub_inline_toc:
        # Reuse the MOBI inline-ToC generator to insert a content ToC
        from calibre.ebooks.mobi.writer8.toc import TOCAdder
        opts.mobi_toc_at_start = not opts.epub_toc_at_end
        opts.mobi_passthrough = False
        opts.no_inline_toc = False
        TOCAdder(oeb, opts, replace_previous_inline_toc=True, ignore_existing_toc=True)

    if self.opts.epub_flatten:
        from calibre.ebooks.oeb.transforms.filenames import FlatFilenames
        FlatFilenames()(oeb, opts)
    else:
        from calibre.ebooks.oeb.transforms.filenames import UniqueFilenames
        UniqueFilenames()(oeb, opts)

    self.workaround_ade_quirks()
    self.workaround_webkit_quirks()
    self.upshift_markup()
    from calibre.ebooks.oeb.transforms.rescale import RescaleImages
    RescaleImages(check_colorspaces=True)(oeb, opts)

    from calibre.ebooks.oeb.transforms.split import Split
    split = Split(not self.opts.dont_split_on_page_breaks,
            max_flow_size=self.opts.flow_size*1024
            )
    split(self.oeb, self.opts)

    from calibre.ebooks.oeb.transforms.cover import CoverManager
    cm = CoverManager(
            no_default_cover=self.opts.no_default_epub_cover,
            no_svg_cover=self.opts.no_svg_cover,
            preserve_aspect_ratio=self.opts.preserve_cover_aspect_ratio)
    cm(self.oeb, self.opts, self.log)

    self.workaround_sony_quirks()

    if self.oeb.toc.count() == 0:
        self.log.warn('This EPUB file has no Table of Contents. '
                'Creating a default TOC')
        first = next(iter(self.oeb.spine))
        self.oeb.toc.add(_('Start'), first.href)

    from calibre.ebooks.oeb.base import OPF
    identifiers = oeb.metadata['identifier']
    uuid = None
    for x in identifiers:
        # Fix: x.get() may return None when the identifier carries no
        # scheme attribute; guard before calling .lower() to avoid an
        # AttributeError on such books.
        scheme = x.get(OPF('scheme'), None)
        if (scheme is not None and scheme.lower() == 'uuid') or unicode_type(x).startswith('urn:uuid:'):
            uuid = unicode_type(x).split(':')[-1]
            break
    encrypted_fonts = getattr(input_plugin, 'encrypted_fonts', [])

    if uuid is None:
        self.log.warn('No UUID identifier found')
        from uuid import uuid4
        uuid = unicode_type(uuid4())
        oeb.metadata.add('identifier', uuid, scheme='uuid', id=uuid)

    if encrypted_fonts and not uuid.startswith('urn:uuid:'):
        # Apparently ADE requires this value to start with urn:uuid:
        # for some absurd reason, or it will throw a hissy fit and refuse
        # to use the obfuscated fonts.
        for x in identifiers:
            if unicode_type(x) == uuid:
                x.content = 'urn:uuid:'+uuid

    with TemporaryDirectory('_epub_output') as tdir:
        from calibre.customize.ui import plugin_for_output_format
        metadata_xml = None
        extra_entries = []
        if self.is_periodical:
            if self.opts.output_profile.epub_periodical_format == 'sony':
                # Sony periodicals need extra metadata entries in the container
                from calibre.ebooks.epub.periodical import sony_metadata
                metadata_xml, atom_xml = sony_metadata(oeb)
                extra_entries = [('atom.xml', 'application/atom+xml', atom_xml)]
        oeb_output = plugin_for_output_format('oeb')
        oeb_output.convert(oeb, tdir, input_plugin, opts, log)
        opf = [x for x in os.listdir(tdir) if x.endswith('.opf')][0]
        self.condense_ncx([os.path.join(tdir, x) for x in os.listdir(tdir)
                if x.endswith('.ncx')][0])
        if self.opts.epub_version == '3':
            self.upgrade_to_epub3(tdir, opf)
        encryption = None
        if encrypted_fonts:
            encryption = self.encrypt_fonts(encrypted_fonts, tdir, uuid)

        from calibre.ebooks.epub import initialize_container
        with initialize_container(output_path, os.path.basename(opf),
                extra_entries=extra_entries) as epub:
            epub.add_dir(tdir)
            if encryption is not None:
                epub.writestr('META-INF/encryption.xml', as_bytes(encryption))
            if metadata_xml is not None:
                epub.writestr('META-INF/metadata.xml',
                        metadata_xml.encode('utf-8'))
        if opts.extract_to is not None:
            # Debugging aid: unpack the finished EPUB to a directory
            from calibre.utils.zipfile import ZipFile
            if os.path.exists(opts.extract_to):
                if os.path.isdir(opts.extract_to):
                    shutil.rmtree(opts.extract_to)
                else:
                    os.remove(opts.extract_to)
            os.mkdir(opts.extract_to)
            with ZipFile(output_path) as zf:
                zf.extractall(path=opts.extract_to)
            self.log.info('EPUB extracted to', opts.extract_to)
|
||||
|
||||
def upgrade_to_epub3(self, tdir, opf):
    # Upgrade the EPUB 2 files in tdir (as produced by the OEB output
    # plugin) to EPUB 3, in place, using the polish machinery.
    self.log.info('Upgrading to EPUB 3...')
    from calibre.ebooks.epub import simple_container_xml
    from calibre.ebooks.oeb.polish.cover import fix_conversion_titlepage_links_in_nav
    try:
        os.mkdir(os.path.join(tdir, 'META-INF'))
    except EnvironmentError:
        # Directory already exists
        pass
    # EpubContainer needs a META-INF/container.xml pointing at the OPF
    with open(os.path.join(tdir, 'META-INF', 'container.xml'), 'wb') as f:
        f.write(simple_container_xml(os.path.basename(opf)).encode('utf-8'))
    from calibre.ebooks.oeb.polish.container import EpubContainer
    container = EpubContainer(tdir, self.log)
    from calibre.ebooks.oeb.polish.upgrade import epub_2_to_3
    existing_nav = getattr(self.opts, 'epub3_nav_parsed', None)
    nav_href = getattr(self.opts, 'epub3_nav_href', None)
    previous_nav = (nav_href, existing_nav) if existing_nav and nav_href else None
    epub_2_to_3(container, self.log.info, previous_nav=previous_nav)
    fix_conversion_titlepage_links_in_nav(container)
    container.commit()
    # Remove the temporary container.xml again; NOTE(review): presumably
    # the final container.xml is written when the EPUB is assembled with
    # initialize_container -- confirm before changing this cleanup.
    os.remove(f.name)
    try:
        os.rmdir(os.path.join(tdir, 'META-INF'))
    except EnvironmentError:
        # Directory not empty; leave it for the container assembly
        pass
|
||||
|
||||
def encrypt_fonts(self, uris, tdir, uuid):  # {{{
    '''Obfuscate the font files referenced by *uris* (relative to
    *tdir*) by XOR-ing their first 1024 bytes with a key derived from
    the book *uuid*, in place. Returns the text of a
    META-INF/encryption.xml document describing the obfuscated files,
    or None when no font was processed.'''
    from polyglot.binary import from_hex_bytes

    # Derive a 16-byte key from the hex digits of the UUID
    key = re.sub(r'[^a-fA-F0-9]', '', uuid)
    if len(key) < 16:
        raise ValueError('UUID identifier %r is invalid'%uuid)
    key = bytearray(from_hex_bytes((key + key)[:32]))
    paths = []
    with CurrentDir(tdir):
        # Map each manifest URI to a native filesystem path
        paths = [os.path.join(*x.split('/')) for x in uris]
        uris = dict(zip(uris, paths))
        fonts = []
        for uri in list(uris.keys()):
            path = uris[uri]
            if not os.path.exists(path):
                uris.pop(uri)
                continue
            self.log.debug('Encrypting font:', uri)
            with lopen(path, 'r+b') as f:
                data = f.read(1024)
                if len(data) >= 1024:
                    # XOR the first 1KB of the font with the key, in place
                    data = bytearray(data)
                    f.seek(0)
                    f.write(bytes(bytearray(data[i] ^ key[i%16] for i in range(1024))))
                else:
                    # Real fonts are larger than 1KB; skip obviously bogus files
                    self.log.warn('Font', path, 'is invalid, ignoring')
            if not isinstance(uri, unicode_type):
                uri = uri.decode('utf-8')
            fonts.append('''
            <enc:EncryptedData>
                <enc:EncryptionMethod Algorithm="http://ns.adobe.com/pdf/enc#RC"/>
                <enc:CipherData>
                    <enc:CipherReference URI="%s"/>
                </enc:CipherData>
            </enc:EncryptedData>
            '''%(uri.replace('"', '\\"')))
    if fonts:
        ans = '''<encryption
            xmlns="urn:oasis:names:tc:opendocument:xmlns:container"
            xmlns:enc="http://www.w3.org/2001/04/xmlenc#"
            xmlns:deenc="http://ns.adobe.com/digitaleditions/enc">
            '''
        ans += '\n'.join(fonts)
        ans += '\n</encryption>'
        return ans
    # }}}
|
||||
|
||||
def condense_ncx(self, ncx_path):  # {{{
    '''Strip insignificant leading/trailing whitespace from every text
    node in the NCX file and rewrite it, unless pretty printed output
    was requested.'''
    from lxml import etree
    if self.opts.pretty_print:
        # User asked for pretty output; leave the NCX untouched
        return
    tree = etree.parse(ncx_path)
    root = tree.getroot()
    for elem in root.iter(tag=etree.Element):
        if elem.text:
            elem.text = elem.text.strip()
        if elem.tail:
            elem.tail = elem.tail.strip()
    condensed = etree.tostring(root, encoding='utf-8')
    with open(ncx_path, 'wb') as out:
        out.write(condensed)
    # }}}
|
||||
|
||||
def workaround_ade_quirks(self):  # {{{
    '''
    Perform various markup transforms to get the output to render correctly
    in the quirky ADE.
    '''
    from calibre.ebooks.oeb.base import XPath, XHTML, barename, urlunquote

    stylesheet = self.oeb.manifest.main_stylesheet

    # ADE cries big wet tears when it encounters an invalid fragment
    # identifier in the NCX toc.
    frag_pat = re.compile(r'[-A-Za-z0-9_:.]+$')
    for node in self.oeb.toc.iter():
        href = getattr(node, 'href', None)
        if hasattr(href, 'partition'):
            base, _, frag = href.partition('#')
            frag = urlunquote(frag)
            if frag and frag_pat.match(frag) is None:
                self.log.warn(
                    'Removing fragment identifier %r from TOC as Adobe Digital Editions cannot handle it'%frag)
                node.href = base

    for x in self.oeb.spine:
        root = x.data
        body = XPath('//h:body')(root)
        if body:
            body = body[0]

        if hasattr(body, 'xpath'):
            # remove <img> tags with empty src elements
            bad = []
            for x in XPath('//h:img')(body):
                src = x.get('src', '').strip()
                if src in ('', '#') or src.startswith('http:'):
                    bad.append(x)
            for img in bad:
                img.getparent().remove(img)

            # Add id attribute to <a> tags that have name
            for x in XPath('//h:a[@name]')(body):
                if not x.get('id', False):
                    x.set('id', x.get('name'))
                # The delightful epubcheck has started complaining about <a> tags that
                # have name attributes.
                x.attrib.pop('name')

            # Replace <br> that are children of <body> as ADE doesn't handle them
            for br in XPath('./h:br')(body):
                if br.getparent() is None:
                    continue
                try:
                    prior = next(br.itersiblings(preceding=True))
                    priortag = barename(prior.tag)
                    priortext = prior.tail
                except:
                    priortag = 'body'
                    priortext = body.text
                if priortext:
                    priortext = priortext.strip()
                br.tag = XHTML('p')
                br.text = '\u00a0'
                style = br.get('style', '').split(';')
                style = list(filter(None, map(lambda x: x.strip(), style)))
                style.append('margin:0pt; border:0pt')
                # If the prior tag is a block (including a <br> we replaced)
                # then this <br> replacement should have a 1-line height.
                # Otherwise it should have no height.
                if not priortext and priortag in block_level_tags:
                    style.append('height:1em')
                else:
                    style.append('height:0pt')
                br.set('style', '; '.join(style))

        for tag in XPath('//h:embed')(root):
            tag.getparent().remove(tag)
        for tag in XPath('//h:object')(root):
            if tag.get('type', '').lower().strip() in {'image/svg+xml', 'application/svg+xml'}:
                continue
            tag.getparent().remove(tag)

        for tag in XPath('//h:title|//h:style')(root):
            if not tag.text:
                tag.getparent().remove(tag)
        for tag in XPath('//h:script')(root):
            if (not tag.text and not tag.get('src', False) and tag.get('type', None) != 'text/x-mathjax-config'):
                tag.getparent().remove(tag)
        for tag in XPath('//h:body/descendant::h:script')(root):
            tag.getparent().remove(tag)

        formchildren = XPath('./h:input|./h:button|./h:textarea|'
                './h:label|./h:fieldset|./h:legend')
        for tag in XPath('//h:form')(root):
            if formchildren(tag):
                tag.getparent().remove(tag)
            else:
                # Not a real form
                tag.tag = XHTML('div')

        for tag in XPath('//h:center')(root):
            tag.tag = XHTML('div')
            tag.set('style', 'text-align:center')
        # ADE can't handle a raw & in an img url. Fix: escape the
        # ampersand as &amp; instead of deleting it, which corrupted any
        # URL containing query parameters (a bare & is also invalid in
        # XHTML attribute values).
        for tag in XPath('//h:img[@src]')(root):
            tag.set('src', tag.get('src', '').replace('&', '&amp;'))

        # ADE whimpers in fright when it encounters a <td> outside a
        # <table>
        in_table = XPath('ancestor::h:table')
        for tag in XPath('//h:td|//h:tr|//h:th')(root):
            if not in_table(tag):
                tag.tag = XHTML('div')

        # ADE fails to render non breaking hyphens/soft hyphens/zero width spaces
        special_chars = re.compile('[\u200b\u00ad]')
        for elem in root.iterdescendants('*'):
            if elem.text:
                elem.text = special_chars.sub('', elem.text)
                elem.text = elem.text.replace('\u2011', '-')
            if elem.tail:
                elem.tail = special_chars.sub('', elem.tail)
                elem.tail = elem.tail.replace('\u2011', '-')

        if stylesheet is not None:
            # ADE doesn't render lists correctly if they have left margins
            from css_parser.css import CSSRule
            for lb in XPath('//h:ul[@class]|//h:ol[@class]')(root):
                sel = '.'+lb.get('class')
                for rule in stylesheet.data.cssRules.rulesOfType(CSSRule.STYLE_RULE):
                    if sel == rule.selectorList.selectorText:
                        rule.style.removeProperty('margin-left')
                        # padding-left breaks rendering in webkit and gecko
                        rule.style.removeProperty('padding-left')
            # Change whitespace:pre to pre-wrap to accommodate readers that
            # cannot scroll horizontally
            for rule in stylesheet.data.cssRules.rulesOfType(CSSRule.STYLE_RULE):
                style = rule.style
                ws = style.getPropertyValue('white-space')
                if ws == 'pre':
                    style.setProperty('white-space', 'pre-wrap')
    # }}}
|
||||
|
||||
def workaround_sony_quirks(self):  # {{{
    '''
    Perform toc link transforms to alleviate slow loading.
    '''
    from calibre.ebooks.oeb.base import urldefrag, XPath
    from calibre.ebooks.oeb.polish.toc import item_at_top

    def anchor_is_at_top(root, fragment):
        # True when the element addressed by fragment sits at the top of
        # its document, so the fragment can be dropped from the link
        matches = XPath('//*[@id="%s" or @name="%s"]' % (fragment, fragment))(root)
        if not matches:
            return False
        return item_at_top(matches[0])

    def strip_redundant_anchors(node):
        if node.href:
            href, fragment = urldefrag(node.href)
            if fragment:
                for item in self.oeb.spine:
                    if item.href != href:
                        continue
                    if anchor_is_at_top(item.data, fragment):
                        self.log.debug('Removing anchor from TOC href:',
                                href + '#' + fragment)
                        node.href = href
                    break
        for child in node:
            strip_redundant_anchors(child)

    if self.oeb.toc:
        strip_redundant_anchors(self.oeb.toc)
    # }}}
|
||||
179
ebook_converter/ebooks/conversion/plugins/fb2_input.py
Normal file
179
ebook_converter/ebooks/conversion/plugins/fb2_input.py
Normal file
@@ -0,0 +1,179 @@
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Anatoly Shipitsin <norguhtar at gmail.com>'
|
||||
"""
|
||||
Convert .fb2 files to .lrf
|
||||
"""
|
||||
import os, re
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
|
||||
from calibre import guess_type
|
||||
from polyglot.builtins import iteritems, getcwd
|
||||
|
||||
FB2NS = 'http://www.gribuser.ru/xml/fictionbook/2.0'
|
||||
FB21NS = 'http://www.gribuser.ru/xml/fictionbook/2.1'
|
||||
|
||||
|
||||
class FB2Input(InputFormatPlugin):
    '''Input plugin that converts FB2/FBZ (FictionBook) files to an
    OPF + XHTML bundle by applying the bundled fb2.xsl XSLT
    stylesheet.'''

    name = 'FB2 Input'
    author = 'Anatoly Shipitsin'
    description = 'Convert FB2 and FBZ files to HTML'
    file_types = {'fb2', 'fbz'}
    commit_name = 'fb2_input'

    recommendations = {
        ('level1_toc', '//h:h1', OptionRecommendation.MED),
        ('level2_toc', '//h:h2', OptionRecommendation.MED),
        ('level3_toc', '//h:h3', OptionRecommendation.MED),
        }

    options = {
        OptionRecommendation(name='no_inline_fb2_toc',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Do not insert a Table of Contents at the beginning of the book.'
                )
            )}

    def convert(self, stream, options, file_ext, log,
            accelerators):
        '''Parse the FB2 document in *stream*, transform it to XHTML via
        XSLT, extract embedded binaries and metadata, and return the
        path to the generated metadata.opf in the current directory.'''
        from lxml import etree
        from calibre.utils.xml_parse import safe_xml_fromstring
        from calibre.ebooks.metadata.fb2 import ensure_namespace, get_fb2_data
        from calibre.ebooks.metadata.opf2 import OPFCreator
        from calibre.ebooks.metadata.meta import get_metadata
        from calibre.ebooks.oeb.base import XLINK_NS, XHTML_NS
        from calibre.ebooks.chardet import xml_to_unicode
        self.log = log
        log.debug('Parsing XML...')
        raw = get_fb2_data(stream)[0]
        raw = raw.replace(b'\0', b'')
        raw = xml_to_unicode(raw, strip_encoding_pats=True,
                assume_utf8=True, resolve_entities=True)[0]
        try:
            doc = safe_xml_fromstring(raw)
        except etree.XMLSyntaxError:
            # Fix: escape bare ampersands (invalid XML) as &amp; and
            # retry. The previous replacement merely deleted the space
            # after the ampersand and could not repair the syntax error.
            doc = safe_xml_fromstring(raw.replace('& ', '&amp; '))
        if doc is None:
            raise ValueError('The FB2 file is not valid XML')
        doc = ensure_namespace(doc)
        try:
            fb_ns = doc.nsmap[doc.prefix]
        except Exception:
            fb_ns = FB2NS

        NAMESPACES = {'f':fb_ns, 'l':XLINK_NS}
        # Collect all inline CSS stylesheets from the FB2 document
        stylesheets = doc.xpath('//*[local-name() = "stylesheet" and @type="text/css"]')
        css = ''
        for s in stylesheets:
            css += etree.tostring(s, encoding='unicode', method='text',
                    with_tail=False) + '\n\n'
        if css:
            import css_parser, logging
            parser = css_parser.CSSParser(fetcher=None,
                    log=logging.getLogger('calibre.css'))

            XHTML_CSS_NAMESPACE = '@namespace "%s";\n' % XHTML_NS
            text = XHTML_CSS_NAMESPACE + css
            log.debug('Parsing stylesheet...')
            stylesheet = parser.parseString(text)
            stylesheet.namespaces['h'] = XHTML_NS
            css = stylesheet.cssText
            if isinstance(css, bytes):
                css = css.decode('utf-8', 'replace')
            css = css.replace('h|style', 'h|span')
            css = re.sub(r'name\s*=\s*', 'class=', css)
        self.extract_embedded_content(doc)
        log.debug('Converting XML to HTML...')
        with open(P('templates/fb2.xsl'), 'rb') as f:
            ss = f.read().decode('utf-8')
        ss = ss.replace("__FB_NS__", fb_ns)
        if options.no_inline_fb2_toc:
            log('Disabling generation of inline FB2 TOC')
            ss = re.compile(r'<!-- BUILD TOC -->.*<!-- END BUILD TOC -->',
                    re.DOTALL).sub('', ss)

        styledoc = safe_xml_fromstring(ss)

        transform = etree.XSLT(styledoc)
        result = transform(doc)

        # Handle links of type note and cite
        notes = {a.get('href')[1:]: a for a in result.xpath('//a[@link_note and @href]') if a.get('href').startswith('#')}
        cites = {a.get('link_cite'): a for a in result.xpath('//a[@link_cite]') if not a.get('href', '')}
        all_ids = {x for x in result.xpath('//*/@id')}
        for cite, a in iteritems(cites):
            note = notes.get(cite, None)
            if note:
                # Generate a unique id for the note if it has none, then
                # point the citing link at it
                c = 1
                while 'cite%d' % c in all_ids:
                    c += 1
                if not note.get('id', None):
                    note.set('id', 'cite%d' % c)
                    all_ids.add(note.get('id'))
                a.set('href', '#%s' % note.get('id'))
        for x in result.xpath('//*[@link_note or @link_cite]'):
            x.attrib.pop('link_note', None)
            x.attrib.pop('link_cite', None)

        # Rewrite image references to the files written out by
        # extract_embedded_content()
        for img in result.xpath('//img[@src]'):
            src = img.get('src')
            img.set('src', self.binary_map.get(src, src))
        index = transform.tostring(result)
        with open('index.xhtml', 'wb') as f:
            f.write(index.encode('utf-8'))
        with open('inline-styles.css', 'wb') as f:
            f.write(css.encode('utf-8'))
        stream.seek(0)
        mi = get_metadata(stream, 'fb2')
        if not mi.title:
            mi.title = _('Unknown')
        if not mi.authors:
            mi.authors = [_('Unknown')]
        cpath = None
        if mi.cover_data and mi.cover_data[1]:
            with open('fb2_cover_calibre_mi.jpg', 'wb') as f:
                f.write(mi.cover_data[1])
            cpath = os.path.abspath('fb2_cover_calibre_mi.jpg')
        else:
            # Fall back to the cover image referenced from <coverpage>
            for img in doc.xpath('//f:coverpage/f:image', namespaces=NAMESPACES):
                href = img.get('{%s}href'%XLINK_NS, img.get('href', None))
                if href is not None:
                    if href.startswith('#'):
                        href = href[1:]
                    cpath = os.path.abspath(href)
                    break

        opf = OPFCreator(getcwd(), mi)
        entries = [(f2, guess_type(f2)[0]) for f2 in os.listdir(u'.')]
        opf.create_manifest(entries)
        opf.create_spine(['index.xhtml'])
        if cpath:
            opf.guide.set_cover(cpath)
        with open('metadata.opf', 'wb') as f:
            opf.render(f)
        return os.path.join(getcwd(), 'metadata.opf')

    def extract_embedded_content(self, doc):
        '''Write the base64 encoded <binary> children of *doc* out as
        image files in the current directory, recording the id ->
        filename mapping in self.binary_map for later src rewriting.'''
        from calibre.ebooks.fb2 import base64_decode
        self.binary_map = {}
        for elem in doc.xpath('./*'):
            if elem.text and 'binary' in elem.tag and 'id' in elem.attrib:
                ct = elem.get('content-type', '')
                fname = elem.attrib['id']
                ext = ct.rpartition('/')[-1].lower()
                if ext in ('png', 'jpeg', 'jpg'):
                    # Give the file a recognizable image extension when
                    # the id does not already carry one
                    if fname.lower().rpartition('.')[-1] not in {'jpg', 'jpeg',
                            'png'}:
                        fname += '.' + ext
                    self.binary_map[elem.get('id')] = fname
                raw = elem.text.strip()
                try:
                    data = base64_decode(raw)
                except TypeError:
                    self.log.exception('Binary data with id=%s is corrupted, ignoring'%(
                        elem.get('id')))
                else:
                    with open(fname, 'wb') as f:
                        f.write(data)
|
||||
203
ebook_converter/ebooks/conversion/plugins/fb2_output.py
Normal file
203
ebook_converter/ebooks/conversion/plugins/fb2_output.py
Normal file
@@ -0,0 +1,203 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os
|
||||
|
||||
from calibre.customize.conversion import OutputFormatPlugin, OptionRecommendation
|
||||
|
||||
|
||||
class FB2Output(OutputFormatPlugin):
    # Output plugin that serializes an OEB book into a single
    # FictionBook2 (.fb2) file via FB2MLizer.

    name = 'FB2 Output'
    author = 'John Schember'
    file_type = 'fb2'
    commit_name = 'fb2_output'

    # Genre identifiers defined by the FictionBook 2.1 specification;
    # the value of the fb2_genre option must be one of these.
    FB2_GENRES = [
        # Science Fiction & Fantasy
        'sf_history',  # Alternative history
        'sf_action',  # Action
        'sf_epic',  # Epic
        'sf_heroic',  # Heroic
        'sf_detective',  # Detective
        'sf_cyberpunk',  # Cyberpunk
        'sf_space',  # Space
        'sf_social',  # Social#philosophical
        'sf_horror',  # Horror & mystic
        'sf_humor',  # Humor
        'sf_fantasy',  # Fantasy
        'sf',  # Science Fiction
        # Detectives & Thrillers
        'det_classic',  # Classical detectives
        'det_police',  # Police Stories
        'det_action',  # Action
        'det_irony',  # Ironical detectives
        'det_history',  # Historical detectives
        'det_espionage',  # Espionage detectives
        'det_crime',  # Crime detectives
        'det_political',  # Political detectives
        'det_maniac',  # Maniacs
        'det_hard',  # Hard#boiled
        'thriller',  # Thrillers
        'detective',  # Detectives
        # Prose
        'prose_classic',  # Classics prose
        'prose_history',  # Historical prose
        'prose_contemporary',  # Contemporary prose
        'prose_counter',  # Counterculture
        'prose_rus_classic',  # Russial classics prose
        'prose_su_classics',  # Soviet classics prose
        # Romance
        'love_contemporary',  # Contemporary Romance
        'love_history',  # Historical Romance
        'love_detective',  # Detective Romance
        'love_short',  # Short Romance
        'love_erotica',  # Erotica
        # Adventure
        'adv_western',  # Western
        'adv_history',  # History
        'adv_indian',  # Indians
        'adv_maritime',  # Maritime Fiction
        'adv_geo',  # Travel & geography
        'adv_animal',  # Nature & animals
        'adventure',  # Other
        # Children's
        'child_tale',  # Fairy Tales
        'child_verse',  # Verses
        'child_prose',  # Prose
        'child_sf',  # Science Fiction
        'child_det',  # Detectives & Thrillers
        'child_adv',  # Adventures
        'child_education',  # Educational
        'children',  # Other
        # Poetry & Dramaturgy
        'poetry',  # Poetry
        'dramaturgy',  # Dramaturgy
        # Antique literature
        'antique_ant',  # Antique
        'antique_european',  # European
        'antique_russian',  # Old russian
        'antique_east',  # Old east
        'antique_myths',  # Myths. Legends. Epos
        'antique',  # Other
        # Scientific#educational
        'sci_history',  # History
        'sci_psychology',  # Psychology
        'sci_culture',  # Cultural science
        'sci_religion',  # Religious studies
        'sci_philosophy',  # Philosophy
        'sci_politics',  # Politics
        'sci_business',  # Business literature
        'sci_juris',  # Jurisprudence
        'sci_linguistic',  # Linguistics
        'sci_medicine',  # Medicine
        'sci_phys',  # Physics
        'sci_math',  # Mathematics
        'sci_chem',  # Chemistry
        'sci_biology',  # Biology
        'sci_tech',  # Technical
        'science',  # Other
        # Computers & Internet
        'comp_www',  # Internet
        'comp_programming',  # Programming
        'comp_hard',  # Hardware
        'comp_soft',  # Software
        'comp_db',  # Databases
        'comp_osnet',  # OS & Networking
        'computers',  # Other
        # Reference
        'ref_encyc',  # Encyclopedias
        'ref_dict',  # Dictionaries
        'ref_ref',  # Reference
        'ref_guide',  # Guidebooks
        'reference',  # Other
        # Nonfiction
        'nonf_biography',  # Biography & Memoirs
        'nonf_publicism',  # Publicism
        'nonf_criticism',  # Criticism
        'design',  # Art & design
        'nonfiction',  # Other
        # Religion & Inspiration
        'religion_rel',  # Religion
        'religion_esoterics',  # Esoterics
        'religion_self',  # Self#improvement
        'religion',  # Other
        # Humor
        'humor_anecdote',  # Anecdote (funny stories)
        'humor_prose',  # Prose
        'humor_verse',  # Verses
        'humor',  # Other
        # Home & Family
        'home_cooking',  # Cooking
        'home_pets',  # Pets
        'home_crafts',  # Hobbies & Crafts
        'home_entertain',  # Entertaining
        'home_health',  # Health
        'home_garden',  # Garden
        'home_diy',  # Do it yourself
        'home_sport',  # Sports
        'home_sex',  # Erotica & sex
        'home',  # Other
    ]
    # Data consumed by the GUI to present the option choices
    ui_data = {
        'sectionize': {
            'toc': _('Section per entry in the ToC'),
            'files': _('Section per file'),
            'nothing': _('A single section')
        },
        'genres': FB2_GENRES,
    }

    options = {
        OptionRecommendation(name='sectionize',
            recommended_value='files', level=OptionRecommendation.LOW,
            choices=list(ui_data['sectionize']),
            help=_('Specify how sections are created:\n'
                ' * nothing: {nothing}\n'
                ' * files: {files}\n'
                ' * toc: {toc}\n'
                'If ToC based generation fails, adjust the "Structure detection" and/or "Table of Contents" settings '
                '(turn on "Force use of auto-generated Table of Contents").').format(**ui_data['sectionize'])
        ),
        OptionRecommendation(name='fb2_genre',
            recommended_value='antique', level=OptionRecommendation.LOW,
            choices=FB2_GENRES,
            help=(_('Genre for the book. Choices: %s\n\n See: ') % ', '.join(FB2_GENRES)
                ) + 'http://www.fictionbook.org/index.php/Eng:FictionBook_2.1_genres ' + _('for a complete list with descriptions.')),
    }

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        # Serialize oeb_book as FB2 XML and write it to output_path,
        # which may be either a filesystem path or an open file-like
        # object with a write() method.
        from calibre.ebooks.oeb.transforms.jacket import linearize_jacket
        from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer, Unavailable
        from calibre.ebooks.fb2.fb2ml import FB2MLizer

        try:
            # FB2 has no SVG support; rasterize SVG images first
            rasterizer = SVGRasterizer()
            rasterizer(oeb_book, opts)
        except Unavailable:
            log.warn('SVG rasterizer unavailable, SVG will not be converted')

        linearize_jacket(oeb_book)

        fb2mlizer = FB2MLizer(log)
        fb2_content = fb2mlizer.extract_content(oeb_book, opts)

        close = False
        if not hasattr(output_path, 'write'):
            # output_path is a filesystem path; create parent dirs and open it
            close = True
            if not os.path.exists(os.path.dirname(output_path)) and os.path.dirname(output_path) != '':
                os.makedirs(os.path.dirname(output_path))
            out_stream = lopen(output_path, 'wb')
        else:
            out_stream = output_path

        # Overwrite any existing content in the stream
        out_stream.seek(0)
        out_stream.truncate()
        out_stream.write(fb2_content.encode('utf-8', 'replace'))

        if close:
            # Only close streams we opened ourselves
            out_stream.close()
||||
316
ebook_converter/ebooks/conversion/plugins/html_input.py
Normal file
316
ebook_converter/ebooks/conversion/plugins/html_input.py
Normal file
@@ -0,0 +1,316 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2012, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import re, tempfile, os
|
||||
from functools import partial
|
||||
|
||||
from calibre.constants import islinux, isbsd
|
||||
from calibre.customize.conversion import (InputFormatPlugin,
|
||||
OptionRecommendation)
|
||||
from calibre.utils.localization import get_lang
|
||||
from calibre.utils.filenames import ascii_filename
|
||||
from calibre.utils.imghdr import what
|
||||
from polyglot.builtins import unicode_type, zip, getcwd, as_unicode
|
||||
|
||||
|
||||
def sanitize_file_name(x):
    '''Return *x* reduced to a safe ASCII file name: URL-ish punctuation
    is replaced with underscores, runs of whitespace are collapsed, and
    stray dots around the extension are trimmed.'''
    cleaned = ascii_filename(x)
    cleaned = re.sub(r'[?&=;#]', '_', cleaned)
    cleaned = re.sub(r'\s+', ' ', cleaned).strip().rstrip('.')
    stem, _sep, ext = cleaned.rpartition('.')
    return (stem.strip() + '.' + ext.strip()).rstrip('.')
|
||||
|
||||
|
||||
class HTMLInput(InputFormatPlugin):
    """Input plugin that turns a root HTML file (plus everything it links
    to) or an OPF file into an OEB book for the conversion pipeline."""

    name = 'HTML Input'
    author = 'Kovid Goyal'
    description = 'Convert HTML and OPF files to an OEB'
    file_types = {'opf', 'html', 'htm', 'xhtml', 'xhtm', 'shtm', 'shtml'}
    commit_name = 'html_input'

    options = {
        OptionRecommendation(name='breadth_first',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Traverse links in HTML files breadth first. Normally, '
                   'they are traversed depth first.'
                   )
        ),

        OptionRecommendation(name='max_levels',
            recommended_value=5, level=OptionRecommendation.LOW,
            help=_('Maximum levels of recursion when following links in '
                   'HTML files. Must be non-negative. 0 implies that no '
                   'links in the root HTML file are followed. Default is '
                   '%default.'
                   )
        ),

        OptionRecommendation(name='dont_package',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Normally this input plugin re-arranges all the input '
                'files into a standard folder hierarchy. Only use this option '
                'if you know what you are doing as it can result in various '
                'nasty side effects in the rest of the conversion pipeline.'
                )
        ),

    }

    def convert(self, stream, opts, file_ext, log,
                accelerators):
        """Entry point called by the pipeline.

        For an OPF input, delegate straight to the plumber's
        ``create_oebbook``. For an HTML input, read metadata from the file
        (augmented by metadata guessed from the file name) and build the
        book via :meth:`create_oebbook`.
        """
        # Reset the cached case-sensitivity probe for this conversion run.
        self._is_case_sensitive = None
        basedir = getcwd()
        self.opts = opts

        fname = None
        if hasattr(stream, 'name'):
            basedir = os.path.dirname(stream.name)
            fname = os.path.basename(stream.name)

        if file_ext != 'opf':
            if opts.dont_package:
                raise ValueError('The --dont-package option is not supported for an HTML input file')
            from calibre.ebooks.metadata.html import get_metadata
            mi = get_metadata(stream)
            if fname:
                from calibre.ebooks.metadata.meta import metadata_from_filename
                # Filename-derived metadata wins only for fields the HTML
                # itself did not provide (smart_update fills in gaps).
                fmi = metadata_from_filename(fname)
                fmi.smart_update(mi)
                mi = fmi
            oeb = self.create_oebbook(stream.name, basedir, opts, log, mi)
            return oeb

        from calibre.ebooks.conversion.plumber import create_oebbook
        return create_oebbook(log, stream.name, opts,
                encoding=opts.input_encoding)

    def is_case_sensitive(self, path):
        """Return True if the filesystem holding *path* is case sensitive.

        Result is probed once (by comparing lower/upper-cased variants of an
        existing path) and cached for the rest of the conversion. When the
        path does not exist, fall back to the platform default.
        """
        if getattr(self, '_is_case_sensitive', None) is not None:
            return self._is_case_sensitive
        if not path or not os.path.exists(path):
            return islinux or isbsd
        self._is_case_sensitive = not (os.path.exists(path.lower()) and os.path.exists(path.upper()))
        return self._is_case_sensitive

    def create_oebbook(self, htmlpath, basedir, opts, log, mi):
        """Build and return an OEB book rooted at *htmlpath*.

        Steps: populate metadata (filling in language/creator/title and a
        uuid identifier when missing), gather the linked HTML file list,
        add each file to the manifest/spine, rewrite intra-book links via
        :meth:`resource_adder`, and auto-generate a TOC from titles or
        first headers.
        """
        import uuid
        from calibre.ebooks.conversion.plumber import create_oebbook
        from calibre.ebooks.oeb.base import (DirContainer,
            rewrite_links, urlnormalize, urldefrag, BINARY_MIME, OEB_STYLES,
            xpath, urlquote)
        from calibre import guess_type
        from calibre.ebooks.oeb.transforms.metadata import \
            meta_info_to_oeb_metadata
        from calibre.ebooks.html.input import get_filelist
        from calibre.ebooks.metadata import string_to_authors
        from calibre.utils.localization import canonicalize_lang
        import css_parser, logging
        css_parser.log.setLevel(logging.WARN)
        self.OEB_STYLES = OEB_STYLES
        oeb = create_oebbook(log, None, opts, self,
                encoding=opts.input_encoding, populate=False)
        self.oeb = oeb

        metadata = oeb.metadata
        meta_info_to_oeb_metadata(mi, metadata, log)
        if not metadata.language:
            l = canonicalize_lang(getattr(opts, 'language', None))
            if not l:
                oeb.logger.warn('Language not specified')
                l = get_lang().replace('_', '-')
            metadata.add('language', l)
        if not metadata.creator:
            a = getattr(opts, 'authors', None)
            if a:
                a = string_to_authors(a)
            if not a:
                oeb.logger.warn('Creator not specified')
                a = [self.oeb.translate(__('Unknown'))]
            for aut in a:
                metadata.add('creator', aut)
        if not metadata.title:
            oeb.logger.warn('Title not specified')
            metadata.add('title', self.oeb.translate(__('Unknown')))
        # Always add a uuid identifier, then pick the first identifier that
        # carries an id attribute as the book's unique id.
        bookid = unicode_type(uuid.uuid4())
        metadata.add('identifier', bookid, id='uuid_id', scheme='uuid')
        for ident in metadata.identifier:
            if 'id' in ident.attrib:
                self.oeb.uid = metadata.identifier[0]
                break

        # Collect the HTML files reachable from the root, skipping binaries.
        filelist = get_filelist(htmlpath, basedir, opts, log)
        filelist = [f for f in filelist if not f.is_binary]
        htmlfile_map = {}
        for f in filelist:
            path = f.path
            oeb.container = DirContainer(os.path.dirname(path), log,
                    ignore_opf=True)
            bname = os.path.basename(path)
            id, href = oeb.manifest.generate(id='html', href=sanitize_file_name(bname))
            htmlfile_map[path] = href
            item = oeb.manifest.add(id, href, 'text/html')
            if path == htmlpath and '%' in path:
                bname = urlquote(bname)
            item.html_input_href = bname
            oeb.spine.add(item, True)

        self.added_resources = {}
        self.log = log
        self.log('Normalizing filename cases')
        # On case-insensitive filesystems, key resources by lowercased path
        # so the same file linked with differing case is added only once.
        for path, href in htmlfile_map.items():
            if not self.is_case_sensitive(path):
                path = path.lower()
            self.added_resources[path] = href
        self.urlnormalize, self.DirContainer = urlnormalize, DirContainer
        self.urldefrag = urldefrag
        self.guess_type, self.BINARY_MIME = guess_type, BINARY_MIME

        self.log('Rewriting HTML links')
        for f in filelist:
            path = f.path
            dpath = os.path.dirname(path)
            oeb.container = DirContainer(dpath, log, ignore_opf=True)
            href = htmlfile_map[path]
            try:
                item = oeb.manifest.hrefs[href]
            except KeyError:
                item = oeb.manifest.hrefs[urlnormalize(href)]
            rewrite_links(item.data, partial(self.resource_adder, base=dpath))

        # Rewrite url(...) references inside stylesheets as well; resolve
        # them relative to the stylesheet's original on-disk directory.
        for item in oeb.manifest.values():
            if item.media_type in self.OEB_STYLES:
                dpath = None
                for path, href in self.added_resources.items():
                    if href == item.href:
                        dpath = os.path.dirname(path)
                        break
                css_parser.replaceUrls(item.data,
                        partial(self.resource_adder, base=dpath))

        # Auto-generate a TOC: prefer per-file <title>s, but fall back to
        # the first heading in each file when titles are not unique.
        toc = self.oeb.toc
        self.oeb.auto_generated_toc = True
        titles = []
        headers = []
        for item in self.oeb.spine:
            if not item.linear:
                continue
            html = item.data
            title = ''.join(xpath(html, '/h:html/h:head/h:title/text()'))
            title = re.sub(r'\s+', ' ', title.strip())
            if title:
                titles.append(title)
            # NOTE(review): '(unlabled)' is a typo for '(unlabeled)', but it
            # is a runtime string (appears in the generated TOC), so it is
            # preserved here.
            headers.append('(unlabled)')
            for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'):
                expr = '/h:html/h:body//h:%s[position()=1]/text()'
                header = ''.join(xpath(html, expr % tag))
                header = re.sub(r'\s+', ' ', header.strip())
                if header:
                    headers[-1] = header
                    break
        use = titles
        if len(titles) > len(set(titles)):
            use = headers
        for title, item in zip(use, self.oeb.spine):
            if not item.linear:
                continue
            toc.add(title, item.href)

        oeb.container = DirContainer(getcwd(), oeb.log, ignore_opf=True)
        return oeb

    def link_to_local_path(self, link_, base=None):
        """Resolve *link_* to an (os-path, fragment) pair.

        Returns ``(None, None)`` for anything that is not a usable local
        resource (undecodable bytes, remote URLs, empty paths, parse
        failures).
        """
        from calibre.ebooks.html.input import Link
        if not isinstance(link_, unicode_type):
            try:
                # NOTE(review): 'error' is not a standard codecs error
                # handler ('strict'/'ignore'/'replace'); on undecodable
                # input this raises LookupError rather than
                # UnicodeDecodeError — still caught below, but confirm
                # 'strict' was not intended.
                link_ = link_.decode('utf-8', 'error')
            except:
                self.log.warn('Failed to decode link %r. Ignoring'%link_)
                return None, None
        try:
            l = Link(link_, base if base else getcwd())
        except:
            self.log.exception('Failed to process link: %r'%link_)
            return None, None
        if l.path is None:
            # Not a local resource
            return None, None
        link = l.path.replace('/', os.sep).strip()
        frag = l.fragment
        if not link:
            return None, None
        return link, frag

    def resource_adder(self, link_, base=None):
        """rewrite_links callback: add the linked local file to the
        manifest (once) and return the rewritten href.

        Returns *link_* unchanged when it cannot be resolved/read, and
        ``None`` for links to plain-text files (dropped).
        """
        from polyglot.urllib import quote
        link, frag = self.link_to_local_path(link_, base=base)
        if link is None:
            return link_
        try:
            if base and not os.path.isabs(link):
                link = os.path.join(base, link)
            link = os.path.abspath(link)
        except:
            return link_
        if not os.access(link, os.R_OK):
            return link_
        if os.path.isdir(link):
            self.log.warn(link_, 'is a link to a directory. Ignoring.')
            return link_
        # Normalize case the same way the resource map was keyed; the temp
        # dir is used as a representative path for the probe.
        if not self.is_case_sensitive(tempfile.gettempdir()):
            link = link.lower()
        if link not in self.added_resources:
            bhref = os.path.basename(link)
            id, href = self.oeb.manifest.generate(id='added', href=sanitize_file_name(bhref))
            guessed = self.guess_type(href)[0]
            media_type = guessed or self.BINARY_MIME
            if media_type == 'text/plain':
                self.log.warn('Ignoring link to text file %r'%link_)
                return None
            if media_type == self.BINARY_MIME:
                # Check for the common case, images
                try:
                    img = what(link)
                except EnvironmentError:
                    pass
                else:
                    if img:
                        media_type = self.guess_type('dummy.'+img)[0] or self.BINARY_MIME

            self.oeb.log.debug('Added', link)
            self.oeb.container = self.DirContainer(os.path.dirname(link),
                    self.oeb.log, ignore_opf=True)
            # Load into memory
            item = self.oeb.manifest.add(id, href, media_type)
            # bhref refers to an already existing file. The read() method of
            # DirContainer will call unquote on it before trying to read the
            # file, therefore we quote it here.
            if isinstance(bhref, unicode_type):
                bhref = bhref.encode('utf-8')
            item.html_input_href = as_unicode(quote(bhref))
            if guessed in self.OEB_STYLES:
                item.override_css_fetch = partial(
                        self.css_import_handler, os.path.dirname(link))
            # Accessing .data forces the resource to actually be read now,
            # while the container still points at its source directory.
            item.data
            self.added_resources[link] = href

        nlink = self.added_resources[link]
        if frag:
            nlink = '#'.join((nlink, frag))
        return nlink

    def css_import_handler(self, base, href):
        """override_css_fetch hook: read an @import-ed stylesheet from disk
        (relative to *base*), run it through the CSS preprocessor, and
        return ``(None, raw_css)``; ``(None, None)`` on any failure."""
        link, frag = self.link_to_local_path(href, base=base)
        if link is None or not os.access(link, os.R_OK) or os.path.isdir(link):
            return (None, None)
        try:
            with open(link, 'rb') as f:
                raw = f.read().decode('utf-8', 'replace')
            raw = self.oeb.css_preprocessor(raw, add_namespace=False)
        except:
            self.log.exception('Failed to read CSS file: %r'%link)
            return (None, None)
        return (None, raw)
|
||||
226
ebook_converter/ebooks/conversion/plugins/html_output.py
Normal file
226
ebook_converter/ebooks/conversion/plugins/html_output.py
Normal file
@@ -0,0 +1,226 @@
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2010, Fabian Grassl <fg@jusmeum.de>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os, re, shutil
|
||||
from os.path import dirname, abspath, relpath as _relpath, exists, basename
|
||||
|
||||
from calibre.customize.conversion import OutputFormatPlugin, OptionRecommendation
|
||||
from calibre import CurrentDir
|
||||
from calibre.ptempfile import PersistentTemporaryDirectory
|
||||
from polyglot.builtins import unicode_type
|
||||
|
||||
|
||||
def relpath(*args):
    """Like :func:`os.path.relpath`, but always using '/' separators,
    so results are usable as URL hrefs on every platform."""
    rel = _relpath(*args)
    return rel.replace(os.sep, '/')
|
||||
|
||||
|
||||
class HTMLOutput(OutputFormatPlugin):
    """Output plugin that renders the book as a set of templated HTML
    pages (index + one page per spine item) packed into a ZIP file."""

    name = 'HTML Output'
    author = 'Fabian Grassl'
    file_type = 'zip'
    commit_name = 'html_output'

    options = {
        OptionRecommendation(name='template_css',
            help=_('CSS file used for the output instead of the default file')),

        OptionRecommendation(name='template_html_index',
            help=_('Template used for generation of the HTML index file instead of the default file')),

        OptionRecommendation(name='template_html',
            help=_('Template used for the generation of the HTML contents of the book instead of the default file')),

        OptionRecommendation(name='extract_to',
            help=_('Extract the contents of the generated ZIP file to the '
                'specified directory. WARNING: The contents of the directory '
                'will be deleted.')
        ),
    }

    recommendations = {('pretty_print', True, OptionRecommendation.HIGH)}

    def generate_toc(self, oeb_book, ref_url, output_dir):
        '''
        Generate the table of contents as a nested <ul>/<li> tree wrapped
        in a <div>, with hrefs made relative to *ref_url*.
        '''
        from lxml import etree
        from polyglot.urllib import unquote

        from calibre.ebooks.oeb.base import element
        from calibre.utils.cleantext import clean_xml_chars
        with CurrentDir(output_dir):
            def build_node(current_node, parent=None):
                # Recursively mirror the TOC tree; a nested <ul> is only
                # created for nodes that actually have children.
                if parent is None:
                    parent = etree.Element('ul')
                elif len(current_node.nodes):
                    parent = element(parent, ('ul'))
                for node in current_node.nodes:
                    point = element(parent, 'li')
                    href = relpath(abspath(unquote(node.href)), dirname(ref_url))
                    if isinstance(href, bytes):
                        href = href.decode('utf-8')
                    link = element(point, 'a', href=clean_xml_chars(href))
                    title = node.title
                    if isinstance(title, bytes):
                        title = title.decode('utf-8')
                    if title:
                        title = re.sub(r'\s+', ' ', title)
                    link.text = clean_xml_chars(title)
                    build_node(node, point)
                return parent
            wrap = etree.Element('div')
            wrap.append(build_node(oeb_book.toc))
            return wrap

    def generate_html_toc(self, oeb_book, ref_url, output_dir):
        """Return the TOC from :meth:`generate_toc` serialized as a
        pretty-printed HTML fragment (no XML declaration)."""
        from lxml import etree

        root = self.generate_toc(oeb_book, ref_url, output_dir)
        return etree.tostring(root, pretty_print=True, encoding='unicode',
                xml_declaration=False)

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        """Render the book to HTML in a temp dir, zip it to *output_path*,
        optionally extract the ZIP to ``opts.extract_to``.

        Layout: ``<name>.html`` (index with TOC) next to ``<name>_files/``
        containing the per-chapter pages, resources and the CSS file.
        """
        from lxml import etree
        from calibre.utils import zipfile
        from templite import Templite
        from polyglot.urllib import unquote
        from calibre.ebooks.html.meta import EasyMeta

        # read template files (user-supplied paths override the bundled
        # defaults fetched via calibre's P() resource lookup)
        if opts.template_html_index is not None:
            with open(opts.template_html_index, 'rb') as f:
                template_html_index_data = f.read()
        else:
            template_html_index_data = P('templates/html_export_default_index.tmpl', data=True)

        if opts.template_html is not None:
            with open(opts.template_html, 'rb') as f:
                template_html_data = f.read()
        else:
            template_html_data = P('templates/html_export_default.tmpl', data=True)

        if opts.template_css is not None:
            with open(opts.template_css, 'rb') as f:
                template_css_data = f.read()
        else:
            template_css_data = P('templates/html_export_default.css', data=True)

        template_html_index_data = template_html_index_data.decode('utf-8')
        template_html_data = template_html_data.decode('utf-8')
        template_css_data = template_css_data.decode('utf-8')

        self.log = log
        self.opts = opts
        meta = EasyMeta(oeb_book.metadata)

        tempdir = os.path.realpath(PersistentTemporaryDirectory())
        output_file = os.path.join(tempdir,
                basename(re.sub(r'\.zip', '', output_path)+'.html'))
        output_dir = re.sub(r'\.html', '', output_file)+'_files'

        if not exists(output_dir):
            os.makedirs(output_dir)

        css_path = output_dir+os.sep+'calibreHtmlOutBasicCss.css'
        with open(css_path, 'wb') as f:
            f.write(template_css_data.encode('utf-8'))

        # Render the index page (TOC + link to the first content page).
        with open(output_file, 'wb') as f:
            html_toc = self.generate_html_toc(oeb_book, output_file, output_dir)
            templite = Templite(template_html_index_data)
            nextLink = oeb_book.spine[0].href
            nextLink = relpath(output_dir+os.sep+nextLink, dirname(output_file))
            cssLink = relpath(abspath(css_path), dirname(output_file))
            tocUrl = relpath(output_file, dirname(output_file))
            t = templite.render(has_toc=bool(oeb_book.toc.count()),
                    toc=html_toc, meta=meta, nextLink=nextLink,
                    tocUrl=tocUrl, cssLink=cssLink,
                    firstContentPageLink=nextLink)
            if isinstance(t, unicode_type):
                t = t.encode('utf-8')
            f.write(t)

        with CurrentDir(output_dir):
            # First pass: write every non-spine resource to disk; spine
            # items only get an empty placeholder file here, they are
            # rendered in the second pass below.
            for item in oeb_book.manifest:
                path = abspath(unquote(item.href))
                dir = dirname(path)
                if not exists(dir):
                    os.makedirs(dir)
                if item.spine_position is not None:
                    with open(path, 'wb') as f:
                        pass
                else:
                    with open(path, 'wb') as f:
                        f.write(item.bytes_representation)
                    item.unload_data_from_memory(memory=path)

            # Second pass: render each spine item through the page template.
            for item in oeb_book.spine:
                path = abspath(unquote(item.href))
                dir = dirname(path)
                root = item.data.getroottree()

                # get & clean HTML <HEAD>-data: strip the <head> wrapper and
                # inline <style> blocks, and re-expand self-closed <title/>.
                head = root.xpath('//h:head', namespaces={'h': 'http://www.w3.org/1999/xhtml'})[0]
                head_content = etree.tostring(head, pretty_print=True, encoding='unicode')
                head_content = re.sub(r'\<\/?head.*\>', '', head_content)
                head_content = re.sub(re.compile(r'\<style.*\/style\>', re.M|re.S), '', head_content)
                head_content = re.sub(r'<(title)([^>]*)/>', r'<\1\2></\1>', head_content)

                # get & clean HTML <BODY>-data: strip the <body> wrapper and
                # re-expand self-closed tags browsers would mis-parse.
                body = root.xpath('//h:body', namespaces={'h': 'http://www.w3.org/1999/xhtml'})[0]
                ebook_content = etree.tostring(body, pretty_print=True, encoding='unicode')
                ebook_content = re.sub(r'\<\/?body.*\>', '', ebook_content)
                ebook_content = re.sub(r'<(div|a|span)([^>]*)/>', r'<\1\2></\1>', ebook_content)

                # generate link to next page
                if item.spine_position+1 < len(oeb_book.spine):
                    nextLink = oeb_book.spine[item.spine_position+1].href
                    nextLink = relpath(abspath(nextLink), dir)
                else:
                    nextLink = None

                # generate link to previous page
                if item.spine_position > 0:
                    prevLink = oeb_book.spine[item.spine_position-1].href
                    prevLink = relpath(abspath(prevLink), dir)
                else:
                    prevLink = None

                cssLink = relpath(abspath(css_path), dir)
                tocUrl = relpath(output_file, dir)
                firstContentPageLink = oeb_book.spine[0].href

                # render template (toc is lazy so it is only built when the
                # template actually references it)
                templite = Templite(template_html_data)
                toc = lambda: self.generate_html_toc(oeb_book, path, output_dir)
                t = templite.render(ebookContent=ebook_content,
                        prevLink=prevLink, nextLink=nextLink,
                        has_toc=bool(oeb_book.toc.count()), toc=toc,
                        tocUrl=tocUrl, head_content=head_content,
                        meta=meta, cssLink=cssLink,
                        firstContentPageLink=firstContentPageLink)

                # write html to file
                with open(path, 'wb') as f:
                    f.write(t.encode('utf-8'))
                item.unload_data_from_memory(memory=path)

        zfile = zipfile.ZipFile(output_path, "w")
        zfile.add_dir(output_dir, basename(output_dir))
        zfile.write(output_file, basename(output_file), zipfile.ZIP_DEFLATED)

        if opts.extract_to:
            if os.path.exists(opts.extract_to):
                shutil.rmtree(opts.extract_to)
            os.makedirs(opts.extract_to)
            zfile.extractall(opts.extract_to)
            self.log('Zip file extracted to', opts.extract_to)

        zfile.close()

        # cleanup temp dir
        shutil.rmtree(tempdir)
|
||||
133
ebook_converter/ebooks/conversion/plugins/htmlz_input.py
Normal file
133
ebook_converter/ebooks/conversion/plugins/htmlz_input.py
Normal file
@@ -0,0 +1,133 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os
|
||||
|
||||
from calibre import guess_type
|
||||
from calibre.customize.conversion import InputFormatPlugin
|
||||
from polyglot.builtins import getcwd
|
||||
|
||||
|
||||
class HTMLZInput(InputFormatPlugin):
    """Input plugin for HTMLZ archives: a ZIP containing a single top-level
    HTML file plus its resources (and optionally a metadata.opf/cover).

    The archive is extracted into the current working directory (the
    pipeline runs each conversion in its own scratch dir), then the HTML is
    fed through the regular HTML input plugin.
    """

    # NOTE(review): 'HTLZ' looks like a typo for 'HTMLZ', but the plugin
    # name is a runtime identifier, so it is preserved here.
    name = 'HTLZ Input'
    author = 'John Schember'
    description = 'Convert HTML files to HTML'
    file_types = {'htmlz'}
    commit_name = 'htmlz_input'

    def convert(self, stream, options, file_ext, log,
                accelerators):
        """Extract the archive, locate the single top-level HTML file,
        convert it via the HTML input plugin and return the resulting OEB
        book with metadata/cover applied from the archive's OPF."""
        from calibre.ebooks.chardet import xml_to_unicode
        from calibre.ebooks.metadata.opf2 import OPF
        from calibre.utils.zipfile import ZipFile

        self.log = log
        html = u''
        top_levels = []

        # Extract content from zip archive into the current directory.
        zf = ZipFile(stream)
        zf.extractall()

        # Find the HTML file in the archive. It needs to be
        # top level.
        index = u''
        multiple_html = False
        # Get a list of all top level files in the archive.
        for x in os.listdir(u'.'):
            if os.path.isfile(x):
                top_levels.append(x)
        # Try to find an index.* file first.
        for x in top_levels:
            if x.lower() in (u'index.html', u'index.xhtml', u'index.htm'):
                index = x
                break
        # Look for multiple HTML files in the archive. We look at the
        # top level files only as only they matter in HTMLZ.
        for x in top_levels:
            if os.path.splitext(x)[1].lower() in (u'.html', u'.xhtml', u'.htm'):
                # Set index to the first HTML file found if it's not
                # called index.
                if not index:
                    index = x
                else:
                    multiple_html = True
        # Warn the user if there are multiple HTML files in the archive.
        # HTMLZ supports a single HTML file; a conversion of a multi-file
        # archive probably won't turn out as the user expects. With
        # multiple HTML files, ZIP input should be used in place of HTMLZ.
        if multiple_html:
            log.warn(_('Multiple HTML files found in the archive. Only %s will be used.') % index)

        if index:
            with open(index, 'rb') as tf:
                html = tf.read()
        else:
            raise Exception(_('No top level HTML file found.'))

        if not html:
            raise Exception(_('Top level HTML file %s is empty') % index)

        # Encoding: explicit option wins, otherwise sniff from the first 4KB.
        if options.input_encoding:
            ienc = options.input_encoding
        else:
            ienc = xml_to_unicode(html[:4096])[-1]
        html = html.decode(ienc, 'replace')

        # Run the HTML through the html processing plugin, with that
        # plugin's recommended option values forced onto *options*.
        from calibre.customize.ui import plugin_for_input_format
        html_input = plugin_for_input_format('html')
        for opt in html_input.options:
            setattr(options, opt.option.name, opt.recommended_value)
        options.input_encoding = 'utf-8'
        base = getcwd()
        # Pick a file name that does not clobber an extracted file.
        htmlfile = os.path.join(base, u'index.html')
        c = 0
        while os.path.exists(htmlfile):
            c += 1
            htmlfile = u'index%d.html'%c
        with open(htmlfile, 'wb') as f:
            f.write(html.encode('utf-8'))
        # Temporarily disable debug_pipeline so the nested HTML conversion
        # does not dump its own intermediate files.
        odi = options.debug_pipeline
        options.debug_pipeline = None
        # Generate oeb from html conversion.
        with open(htmlfile, 'rb') as f:
            oeb = html_input.convert(f, options, 'html', log,
                    {})
        options.debug_pipeline = odi
        os.remove(htmlfile)

        # Set metadata from file.
        from calibre.customize.ui import get_file_type_metadata
        from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
        mi = get_file_type_metadata(stream, file_ext)
        meta_info_to_oeb_metadata(mi, oeb.metadata, log)

        # Get the cover path from the OPF.
        cover_path = None
        opf = None
        for x in top_levels:
            if os.path.splitext(x)[1].lower() == u'.opf':
                opf = x
                break
        if opf:
            opf = OPF(opf, basedir=getcwd())
            cover_path = opf.raster_cover or opf.cover
        # Set the cover.
        if cover_path:
            cdata = None
            with open(os.path.join(getcwd(), cover_path), 'rb') as cf:
                cdata = cf.read()
            cover_name = os.path.basename(cover_path)
            id, href = oeb.manifest.generate('cover', cover_name)
            oeb.manifest.add(id, href, guess_type(cover_name)[0], data=cdata)
            oeb.guide.add('cover', 'Cover', href)

        return oeb
|
||||
136
ebook_converter/ebooks/conversion/plugins/htmlz_output.py
Normal file
136
ebook_converter/ebooks/conversion/plugins/htmlz_output.py
Normal file
@@ -0,0 +1,136 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2011, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import io
|
||||
import os
|
||||
|
||||
from calibre.customize.conversion import OutputFormatPlugin, \
|
||||
OptionRecommendation
|
||||
from calibre.ptempfile import TemporaryDirectory
|
||||
from polyglot.builtins import unicode_type
|
||||
|
||||
|
||||
class HTMLZOutput(OutputFormatPlugin):
    """Output plugin that flattens the book into a single HTML file plus
    CSS/images/cover/metadata.opf, packed into an HTMLZ (ZIP) archive."""

    name = 'HTMLZ Output'
    author = 'John Schember'
    file_type = 'htmlz'
    commit_name = 'htmlz_output'
    # Human-readable labels for the option choices below; also consumed by
    # the GUI (hence kept as a class attribute named ui_data).
    ui_data = {
        'css_choices': {
            'class': _('Use CSS classes'),
            'inline': _('Use the style attribute'),
            'tag': _('Use HTML tags wherever possible')
        },
        'sheet_choices': {
            'external': _('Use an external CSS file'),
            'inline': _('Use a <style> tag in the HTML file')
        }
    }

    options = {
        OptionRecommendation(name='htmlz_css_type', recommended_value='class',
            level=OptionRecommendation.LOW,
            choices=list(ui_data['css_choices']),
            help=_('Specify the handling of CSS. Default is class.\n'
                   'class: {class}\n'
                   'inline: {inline}\n'
                   'tag: {tag}'
                ).format(**ui_data['css_choices'])),
        OptionRecommendation(name='htmlz_class_style', recommended_value='external',
            level=OptionRecommendation.LOW,
            choices=list(ui_data['sheet_choices']),
            help=_('How to handle the CSS when using css-type = \'class\'.\n'
                   'Default is external.\n'
                   'external: {external}\n'
                   'inline: {inline}'
                ).format(**ui_data['sheet_choices'])),
        OptionRecommendation(name='htmlz_title_filename',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('If set this option causes the file name of the HTML file'
                ' inside the HTMLZ archive to be based on the book title.')
        ),
    }

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        """Flatten *oeb_book* to HTML in a temp dir and zip it to
        *output_path*.

        The htmlz_css_type option selects the HTMLizer implementation
        (inline styles, bare tags, or CSS classes with an optional external
        style.css).
        """
        from lxml import etree
        from calibre.ebooks.oeb.base import OEB_IMAGES, SVG_MIME
        from calibre.ebooks.metadata.opf2 import OPF, metadata_to_opf
        from calibre.utils.zipfile import ZipFile
        from calibre.utils.filenames import ascii_filename

        # HTML: pick the serializer matching the requested CSS handling.
        if opts.htmlz_css_type == 'inline':
            from calibre.ebooks.htmlz.oeb2html import OEB2HTMLInlineCSSizer
            OEB2HTMLizer = OEB2HTMLInlineCSSizer
        elif opts.htmlz_css_type == 'tag':
            from calibre.ebooks.htmlz.oeb2html import OEB2HTMLNoCSSizer
            OEB2HTMLizer = OEB2HTMLNoCSSizer
        else:
            from calibre.ebooks.htmlz.oeb2html import OEB2HTMLClassCSSizer as OEB2HTMLizer

        with TemporaryDirectory(u'_htmlz_output') as tdir:
            htmlizer = OEB2HTMLizer(log)
            html = htmlizer.oeb2html(oeb_book, opts)

            fname = u'index'
            if opts.htmlz_title_filename:
                from calibre.utils.filenames import shorten_components_to
                fname = shorten_components_to(100, (ascii_filename(unicode_type(oeb_book.metadata.title[0])),))[0]
            with open(os.path.join(tdir, fname+u'.html'), 'wb') as tf:
                if isinstance(html, unicode_type):
                    html = html.encode('utf-8')
                tf.write(html)

            # CSS: only the class-based serializer produces an external sheet.
            if opts.htmlz_css_type == 'class' and opts.htmlz_class_style == 'external':
                with open(os.path.join(tdir, u'style.css'), 'wb') as tf:
                    tf.write(htmlizer.get_css(oeb_book))

            # Images: copy only the images actually referenced by the HTML.
            images = htmlizer.images
            if images:
                if not os.path.exists(os.path.join(tdir, u'images')):
                    os.makedirs(os.path.join(tdir, u'images'))
                for item in oeb_book.manifest:
                    if item.media_type in OEB_IMAGES and item.href in images:
                        if item.media_type == SVG_MIME:
                            # NOTE(review): encoding='unicode' yields str,
                            # but the file below is opened in binary mode —
                            # under Python 3 this write would raise
                            # TypeError for SVG images; confirm whether an
                            # .encode('utf-8') is missing here.
                            data = etree.tostring(item.data, encoding='unicode')
                        else:
                            data = item.data
                        fname = os.path.join(tdir, u'images', images[item.href])
                        with open(fname, 'wb') as img:
                            img.write(data)

            # Cover: best effort — any failure is logged and ignored.
            cover_path = None
            try:
                cover_data = None
                if oeb_book.metadata.cover:
                    term = oeb_book.metadata.cover[0].term
                    cover_data = oeb_book.guide[term].item.data
                if cover_data:
                    from calibre.utils.img import save_cover_data_to
                    cover_path = os.path.join(tdir, u'cover.jpg')
                    # lopen is calibre's builtin open wrapper; the empty
                    # write just pre-creates the file before the image
                    # library writes into it.
                    with lopen(cover_path, 'w') as cf:
                        cf.write('')
                    save_cover_data_to(cover_data, cover_path)
            except:
                import traceback
                traceback.print_exc()

            # Metadata: round-trip through OPF so the cover reference can
            # be injected before serializing metadata.opf.
            with open(os.path.join(tdir, u'metadata.opf'), 'wb') as mdataf:
                opf = OPF(io.BytesIO(etree.tostring(oeb_book.metadata.to_opf1(), encoding='UTF-8')))
                mi = opf.to_book_metadata()
                if cover_path:
                    mi.cover = u'cover.jpg'
                mdataf.write(metadata_to_opf(mi))

            # add_dir must run while tdir still exists (inside the with).
            htmlz = ZipFile(output_path, 'w')
            htmlz.add_dir(tdir)
|
||||
64
ebook_converter/ebooks/conversion/plugins/lit_input.py
Normal file
64
ebook_converter/ebooks/conversion/plugins/lit_input.py
Normal file
@@ -0,0 +1,64 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin
|
||||
|
||||
|
||||
class LITInput(InputFormatPlugin):
    """Input plugin for Microsoft Reader (LIT) files."""

    name = 'LIT Input'
    author = 'Marshall T. Vandegrift'
    description = 'Convert LIT files to HTML'
    file_types = {'lit'}
    commit_name = 'lit_input'

    def convert(self, stream, options, file_ext, log,
                accelerators):
        """Parse the LIT container and return an OEB book built from it."""
        from calibre.ebooks.lit.reader import LitReader
        from calibre.ebooks.conversion.plumber import create_oebbook
        self.log = log
        return create_oebbook(log, stream, options, reader=LitReader)

    def postprocess_book(self, oeb, opts, log):
        """Clean up quirks of LIT-produced HTML.

        Removes stray <metadata>/<guide> elements that leak into the
        markup, and reflows files whose entire text sits inside a single
        <pre> tag into real paragraphs.
        """
        from calibre.ebooks.oeb.base import XHTML_NS, XPath, XHTML
        for item in oeb.spine:
            root = item.data
            if not hasattr(root, 'xpath'):
                # Not parsed markup (e.g. raw bytes); nothing to clean.
                continue
            for bad in ('metadata', 'guide'):
                metadata = XPath('//h:'+bad)(root)
                if metadata:
                    for x in metadata:
                        x.getparent().remove(x)
            body = XPath('//h:body')(root)
            if body:
                body = body[0]
                if len(body) == 1 and body[0].tag == XHTML('pre'):
                    # The whole document is one preformatted text dump:
                    # convert it to paragraph-based HTML.
                    pre = body[0]
                    from calibre.ebooks.txt.processor import convert_basic, \
                        separate_paragraphs_single_line
                    from calibre.ebooks.chardet import xml_to_unicode
                    from calibre.utils.xml_parse import safe_xml_fromstring
                    import copy
                    # NOTE(review): 'singe' is a typo for 'single' in this
                    # log message (runtime string, preserved).
                    self.log('LIT file with all text in singe <pre> tag detected')
                    html = separate_paragraphs_single_line(pre.text)
                    html = convert_basic(html).replace('<html>',
                            '<html xmlns="%s">'%XHTML_NS)
                    html = xml_to_unicode(html, strip_encoding_pats=True,
                            resolve_entities=True)[0]
                    if opts.smarten_punctuation:
                        # SmartyPants skips text inside <pre> tags
                        from calibre.ebooks.conversion.preprocess import smarten_punctuation
                        html = smarten_punctuation(html, self.log)
                    root = safe_xml_fromstring(html)
                    body = XPath('//h:body')(root)
                    # Repurpose the old <pre> as a <div> and move the newly
                    # generated content into it.
                    pre.tag = XHTML('div')
                    pre.text = ''
                    for elem in body:
                        ne = copy.deepcopy(elem)
                        pre.append(ne)
|
||||
38
ebook_converter/ebooks/conversion/plugins/lit_output.py
Normal file
38
ebook_converter/ebooks/conversion/plugins/lit_output.py
Normal file
@@ -0,0 +1,38 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
|
||||
from calibre.customize.conversion import OutputFormatPlugin
|
||||
|
||||
|
||||
class LITOutput(OutputFormatPlugin):
    """Output plugin that writes the book as a Microsoft Reader (LIT)
    file."""

    name = 'LIT Output'
    author = 'Marshall T. Vandegrift'
    file_type = 'lit'
    commit_name = 'lit_output'

    def convert(self, oeb, output_path, input_plugin, opts, log):
        """Run the LIT-specific transform pipeline over *oeb* and write
        the result to *output_path*.

        Pipeline order matters: split flows on page breaks, add an inline
        HTML table of contents, normalize case mangling, rasterize SVG,
        then serialize with LitWriter.
        """
        from calibre.ebooks.lit.writer import LitWriter
        from calibre.ebooks.oeb.transforms.htmltoc import HTMLTOCAdder
        from calibre.ebooks.oeb.transforms.manglecase import CaseMangler
        from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer
        from calibre.ebooks.oeb.transforms.split import Split

        self.log, self.opts, self.oeb = log, opts, oeb

        Split(split_on_page_breaks=True, max_flow_size=0,
              remove_css_pagebreaks=False)(self.oeb, self.opts)
        HTMLTOCAdder()(oeb, opts)
        CaseMangler()(oeb, opts)
        SVGRasterizer()(oeb, opts)
        LitWriter(self.opts)(oeb, output_path)
|
||||
82
ebook_converter/ebooks/conversion/plugins/lrf_input.py
Normal file
82
ebook_converter/ebooks/conversion/plugins/lrf_input.py
Normal file
@@ -0,0 +1,82 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os, sys
|
||||
from calibre.customize.conversion import InputFormatPlugin
|
||||
|
||||
|
||||
class LRFInput(InputFormatPlugin):
|
||||
|
||||
name = 'LRF Input'
|
||||
author = 'Kovid Goyal'
|
||||
description = 'Convert LRF files to HTML'
|
||||
file_types = {'lrf'}
|
||||
commit_name = 'lrf_input'
|
||||
|
||||
def convert(self, stream, options, file_ext, log,
|
||||
accelerators):
|
||||
from calibre.ebooks.lrf.input import (MediaType, Styles, TextBlock,
|
||||
Canvas, ImageBlock, RuledLine)
|
||||
self.log = log
|
||||
self.log('Generating XML')
|
||||
from calibre.ebooks.lrf.lrfparser import LRFDocument
|
||||
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||
from lxml import etree
|
||||
d = LRFDocument(stream)
|
||||
d.parse()
|
||||
xml = d.to_xml(write_files=True)
|
||||
if options.verbose > 2:
|
||||
open(u'lrs.xml', 'wb').write(xml.encode('utf-8'))
|
||||
doc = safe_xml_fromstring(xml)
|
||||
|
||||
char_button_map = {}
|
||||
for x in doc.xpath('//CharButton[@refobj]'):
|
||||
ro = x.get('refobj')
|
||||
jump_button = doc.xpath('//*[@objid="%s"]'%ro)
|
||||
if jump_button:
|
||||
jump_to = jump_button[0].xpath('descendant::JumpTo[@refpage and @refobj]')
|
||||
if jump_to:
|
||||
char_button_map[ro] = '%s.xhtml#%s'%(jump_to[0].get('refpage'),
|
||||
jump_to[0].get('refobj'))
|
||||
plot_map = {}
|
||||
for x in doc.xpath('//Plot[@refobj]'):
|
||||
ro = x.get('refobj')
|
||||
image = doc.xpath('//Image[@objid="%s" and @refstream]'%ro)
|
||||
if image:
|
||||
imgstr = doc.xpath('//ImageStream[@objid="%s" and @file]'%
|
||||
image[0].get('refstream'))
|
||||
if imgstr:
|
||||
plot_map[ro] = imgstr[0].get('file')
|
||||
|
||||
self.log('Converting XML to HTML...')
|
||||
styledoc = safe_xml_fromstring(P('templates/lrf.xsl', data=True))
|
||||
media_type = MediaType()
|
||||
styles = Styles()
|
||||
text_block = TextBlock(styles, char_button_map, plot_map, log)
|
||||
canvas = Canvas(doc, styles, text_block, log)
|
||||
image_block = ImageBlock(canvas)
|
||||
ruled_line = RuledLine()
|
||||
extensions = {
|
||||
('calibre', 'media-type') : media_type,
|
||||
('calibre', 'text-block') : text_block,
|
||||
('calibre', 'ruled-line') : ruled_line,
|
||||
('calibre', 'styles') : styles,
|
||||
('calibre', 'canvas') : canvas,
|
||||
('calibre', 'image-block'): image_block,
|
||||
}
|
||||
transform = etree.XSLT(styledoc, extensions=extensions)
|
||||
try:
|
||||
result = transform(doc)
|
||||
except RuntimeError:
|
||||
sys.setrecursionlimit(5000)
|
||||
result = transform(doc)
|
||||
|
||||
with open('content.opf', 'wb') as f:
|
||||
f.write(result)
|
||||
styles.write()
|
||||
return os.path.abspath('content.opf')
|
||||
196
ebook_converter/ebooks/conversion/plugins/lrf_output.py
Normal file
196
ebook_converter/ebooks/conversion/plugins/lrf_output.py
Normal file
@@ -0,0 +1,196 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import sys, os
|
||||
|
||||
from calibre.customize.conversion import OutputFormatPlugin
|
||||
from calibre.customize.conversion import OptionRecommendation
|
||||
from polyglot.builtins import unicode_type
|
||||
|
||||
|
||||
class LRFOptions(object):
|
||||
|
||||
def __init__(self, output, opts, oeb):
|
||||
def f2s(f):
|
||||
try:
|
||||
return unicode_type(f[0])
|
||||
except:
|
||||
return ''
|
||||
m = oeb.metadata
|
||||
for x in ('left', 'top', 'right', 'bottom'):
|
||||
attr = 'margin_'+x
|
||||
val = getattr(opts, attr)
|
||||
if val < 0:
|
||||
setattr(opts, attr, 0)
|
||||
self.title = None
|
||||
self.author = self.publisher = _('Unknown')
|
||||
self.title_sort = self.author_sort = ''
|
||||
for x in m.creator:
|
||||
if x.role == 'aut':
|
||||
self.author = unicode_type(x)
|
||||
fa = unicode_type(getattr(x, 'file_as', ''))
|
||||
if fa:
|
||||
self.author_sort = fa
|
||||
for x in m.title:
|
||||
if unicode_type(x.file_as):
|
||||
self.title_sort = unicode_type(x.file_as)
|
||||
self.freetext = f2s(m.description)
|
||||
self.category = f2s(m.subject)
|
||||
self.cover = None
|
||||
self.use_metadata_cover = True
|
||||
self.output = output
|
||||
self.ignore_tables = opts.linearize_tables
|
||||
if opts.disable_font_rescaling:
|
||||
self.base_font_size = 0
|
||||
else:
|
||||
self.base_font_size = opts.base_font_size
|
||||
self.blank_after_para = opts.insert_blank_line
|
||||
self.use_spine = True
|
||||
self.font_delta = 0
|
||||
self.ignore_colors = False
|
||||
from calibre.ebooks.lrf import PRS500_PROFILE
|
||||
self.profile = PRS500_PROFILE
|
||||
self.link_levels = sys.maxsize
|
||||
self.link_exclude = '@'
|
||||
self.no_links_in_toc = True
|
||||
self.disable_chapter_detection = True
|
||||
self.chapter_regex = 'dsadcdswcdec'
|
||||
self.chapter_attr = '$,,$'
|
||||
self.override_css = self._override_css = ''
|
||||
self.page_break = 'h[12]'
|
||||
self.force_page_break = '$'
|
||||
self.force_page_break_attr = '$'
|
||||
self.add_chapters_to_toc = False
|
||||
self.baen = self.pdftohtml = self.book_designer = False
|
||||
self.verbose = opts.verbose
|
||||
self.encoding = 'utf-8'
|
||||
self.lrs = False
|
||||
self.minimize_memory_usage = False
|
||||
self.autorotation = opts.enable_autorotation
|
||||
self.header_separation = (self.profile.dpi/72.) * opts.header_separation
|
||||
self.headerformat = opts.header_format
|
||||
|
||||
for x in ('top', 'bottom', 'left', 'right'):
|
||||
setattr(self, x+'_margin',
|
||||
(self.profile.dpi/72.) * float(getattr(opts, 'margin_'+x)))
|
||||
|
||||
for x in ('wordspace', 'header', 'header_format',
|
||||
'minimum_indent', 'serif_family',
|
||||
'render_tables_as_images', 'sans_family', 'mono_family',
|
||||
'text_size_multiplier_for_rendered_tables'):
|
||||
setattr(self, x, getattr(opts, x))
|
||||
|
||||
|
||||
class LRFOutput(OutputFormatPlugin):
|
||||
|
||||
name = 'LRF Output'
|
||||
author = 'Kovid Goyal'
|
||||
file_type = 'lrf'
|
||||
commit_name = 'lrf_output'
|
||||
|
||||
options = {
|
||||
OptionRecommendation(name='enable_autorotation', recommended_value=False,
|
||||
help=_('Enable auto-rotation of images that are wider than the screen width.')
|
||||
),
|
||||
OptionRecommendation(name='wordspace',
|
||||
recommended_value=2.5, level=OptionRecommendation.LOW,
|
||||
help=_('Set the space between words in pts. Default is %default')
|
||||
),
|
||||
OptionRecommendation(name='header', recommended_value=False,
|
||||
help=_('Add a header to all the pages with title and author.')
|
||||
),
|
||||
OptionRecommendation(name='header_format', recommended_value="%t by %a",
|
||||
help=_('Set the format of the header. %a is replaced by the author '
|
||||
'and %t by the title. Default is %default')
|
||||
),
|
||||
OptionRecommendation(name='header_separation', recommended_value=0,
|
||||
help=_('Add extra spacing below the header. Default is %default pt.')
|
||||
),
|
||||
OptionRecommendation(name='minimum_indent', recommended_value=0,
|
||||
help=_('Minimum paragraph indent (the indent of the first line '
|
||||
'of a paragraph) in pts. Default: %default')
|
||||
),
|
||||
OptionRecommendation(name='render_tables_as_images',
|
||||
recommended_value=False,
|
||||
help=_('This option has no effect')
|
||||
),
|
||||
OptionRecommendation(name='text_size_multiplier_for_rendered_tables',
|
||||
recommended_value=1.0,
|
||||
help=_('Multiply the size of text in rendered tables by this '
|
||||
'factor. Default is %default')
|
||||
),
|
||||
OptionRecommendation(name='serif_family', recommended_value=None,
|
||||
help=_('The serif family of fonts to embed')
|
||||
),
|
||||
OptionRecommendation(name='sans_family', recommended_value=None,
|
||||
help=_('The sans-serif family of fonts to embed')
|
||||
),
|
||||
OptionRecommendation(name='mono_family', recommended_value=None,
|
||||
help=_('The monospace family of fonts to embed')
|
||||
),
|
||||
|
||||
}
|
||||
|
||||
recommendations = {
|
||||
('change_justification', 'original', OptionRecommendation.HIGH)}
|
||||
|
||||
def convert_images(self, pages, opts, wide):
|
||||
from calibre.ebooks.lrf.pylrs.pylrs import Book, BookSetting, ImageStream, ImageBlock
|
||||
from uuid import uuid4
|
||||
from calibre.constants import __appname__, __version__
|
||||
|
||||
width, height = (784, 1012) if wide else (584, 754)
|
||||
|
||||
ps = {}
|
||||
ps['topmargin'] = 0
|
||||
ps['evensidemargin'] = 0
|
||||
ps['oddsidemargin'] = 0
|
||||
ps['textwidth'] = width
|
||||
ps['textheight'] = height
|
||||
book = Book(title=opts.title, author=opts.author,
|
||||
bookid=uuid4().hex,
|
||||
publisher='%s %s'%(__appname__, __version__),
|
||||
category=_('Comic'), pagestyledefault=ps,
|
||||
booksetting=BookSetting(screenwidth=width, screenheight=height))
|
||||
for page in pages:
|
||||
imageStream = ImageStream(page)
|
||||
_page = book.create_page()
|
||||
_page.append(ImageBlock(refstream=imageStream,
|
||||
blockwidth=width, blockheight=height, xsize=width,
|
||||
ysize=height, x1=width, y1=height))
|
||||
book.append(_page)
|
||||
|
||||
book.renderLrf(open(opts.output, 'wb'))
|
||||
|
||||
def flatten_toc(self):
|
||||
from calibre.ebooks.oeb.base import TOC
|
||||
nroot = TOC()
|
||||
for x in self.oeb.toc.iterdescendants():
|
||||
nroot.add(x.title, x.href)
|
||||
self.oeb.toc = nroot
|
||||
|
||||
def convert(self, oeb, output_path, input_plugin, opts, log):
|
||||
self.log, self.opts, self.oeb = log, opts, oeb
|
||||
|
||||
lrf_opts = LRFOptions(output_path, opts, oeb)
|
||||
|
||||
if input_plugin.is_image_collection:
|
||||
self.convert_images(input_plugin.get_images(), lrf_opts,
|
||||
getattr(opts, 'wide', False))
|
||||
return
|
||||
|
||||
self.flatten_toc()
|
||||
|
||||
from calibre.ptempfile import TemporaryDirectory
|
||||
with TemporaryDirectory('_lrf_output') as tdir:
|
||||
from calibre.customize.ui import plugin_for_output_format
|
||||
oeb_output = plugin_for_output_format('oeb')
|
||||
oeb_output.convert(oeb, tdir, input_plugin, opts, log)
|
||||
opf = [x for x in os.listdir(tdir) if x.endswith('.opf')][0]
|
||||
from calibre.ebooks.lrf.html.convert_from import process_file
|
||||
process_file(os.path.join(tdir, opf), lrf_opts, self.log)
|
||||
66
ebook_converter/ebooks/conversion/plugins/mobi_input.py
Normal file
66
ebook_converter/ebooks/conversion/plugins/mobi_input.py
Normal file
@@ -0,0 +1,66 @@
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin
|
||||
from polyglot.builtins import unicode_type
|
||||
|
||||
|
||||
class MOBIInput(InputFormatPlugin):
|
||||
|
||||
name = 'MOBI Input'
|
||||
author = 'Kovid Goyal'
|
||||
description = 'Convert MOBI files (.mobi, .prc, .azw) to HTML'
|
||||
file_types = {'mobi', 'prc', 'azw', 'azw3', 'pobi'}
|
||||
commit_name = 'mobi_input'
|
||||
|
||||
def convert(self, stream, options, file_ext, log,
|
||||
accelerators):
|
||||
self.is_kf8 = False
|
||||
self.mobi_is_joint = False
|
||||
|
||||
from calibre.ebooks.mobi.reader.mobi6 import MobiReader
|
||||
from lxml import html
|
||||
parse_cache = {}
|
||||
try:
|
||||
mr = MobiReader(stream, log, options.input_encoding,
|
||||
options.debug_pipeline)
|
||||
if mr.kf8_type is None:
|
||||
mr.extract_content('.', parse_cache)
|
||||
|
||||
except:
|
||||
mr = MobiReader(stream, log, options.input_encoding,
|
||||
options.debug_pipeline, try_extra_data_fix=True)
|
||||
if mr.kf8_type is None:
|
||||
mr.extract_content('.', parse_cache)
|
||||
|
||||
if mr.kf8_type is not None:
|
||||
log('Found KF8 MOBI of type %r'%mr.kf8_type)
|
||||
if mr.kf8_type == 'joint':
|
||||
self.mobi_is_joint = True
|
||||
from calibre.ebooks.mobi.reader.mobi8 import Mobi8Reader
|
||||
mr = Mobi8Reader(mr, log)
|
||||
opf = os.path.abspath(mr())
|
||||
self.encrypted_fonts = mr.encrypted_fonts
|
||||
self.is_kf8 = True
|
||||
return opf
|
||||
|
||||
raw = parse_cache.pop('calibre_raw_mobi_markup', False)
|
||||
if raw:
|
||||
if isinstance(raw, unicode_type):
|
||||
raw = raw.encode('utf-8')
|
||||
with lopen('debug-raw.html', 'wb') as f:
|
||||
f.write(raw)
|
||||
from calibre.ebooks.oeb.base import close_self_closing_tags
|
||||
for f, root in parse_cache.items():
|
||||
raw = html.tostring(root, encoding='utf-8', method='xml',
|
||||
include_meta_content_type=False)
|
||||
raw = close_self_closing_tags(raw)
|
||||
with lopen(f, 'wb') as q:
|
||||
q.write(raw)
|
||||
accelerators['pagebreaks'] = '//h:div[@class="mbp_pagebreak"]'
|
||||
return mr.created_opf_path
|
||||
337
ebook_converter/ebooks/conversion/plugins/mobi_output.py
Normal file
337
ebook_converter/ebooks/conversion/plugins/mobi_output.py
Normal file
@@ -0,0 +1,337 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
from calibre.customize.conversion import (OutputFormatPlugin,
|
||||
OptionRecommendation)
|
||||
from polyglot.builtins import unicode_type
|
||||
|
||||
|
||||
def remove_html_cover(oeb, log):
|
||||
from calibre.ebooks.oeb.base import OEB_DOCS
|
||||
|
||||
if not oeb.metadata.cover \
|
||||
or 'cover' not in oeb.guide:
|
||||
return
|
||||
href = oeb.guide['cover'].href
|
||||
del oeb.guide['cover']
|
||||
item = oeb.manifest.hrefs[href]
|
||||
if item.spine_position is not None:
|
||||
log.warn('Found an HTML cover: ', item.href, 'removing it.',
|
||||
'If you find some content missing from the output MOBI, it '
|
||||
'is because you misidentified the HTML cover in the input '
|
||||
'document')
|
||||
oeb.spine.remove(item)
|
||||
if item.media_type in OEB_DOCS:
|
||||
oeb.manifest.remove(item)
|
||||
|
||||
|
||||
def extract_mobi(output_path, opts):
|
||||
if opts.extract_to is not None:
|
||||
from calibre.ebooks.mobi.debug.main import inspect_mobi
|
||||
ddir = opts.extract_to
|
||||
inspect_mobi(output_path, ddir=ddir)
|
||||
|
||||
|
||||
class MOBIOutput(OutputFormatPlugin):
|
||||
|
||||
name = 'MOBI Output'
|
||||
author = 'Kovid Goyal'
|
||||
file_type = 'mobi'
|
||||
commit_name = 'mobi_output'
|
||||
ui_data = {'file_types': ['old', 'both', 'new']}
|
||||
|
||||
options = {
|
||||
OptionRecommendation(name='prefer_author_sort',
|
||||
recommended_value=False, level=OptionRecommendation.LOW,
|
||||
help=_('When present, use author sort field as author.')
|
||||
),
|
||||
OptionRecommendation(name='no_inline_toc',
|
||||
recommended_value=False, level=OptionRecommendation.LOW,
|
||||
help=_('Don\'t add Table of Contents to the book. Useful if '
|
||||
'the book has its own table of contents.')),
|
||||
OptionRecommendation(name='toc_title', recommended_value=None,
|
||||
help=_('Title for any generated in-line table of contents.')
|
||||
),
|
||||
OptionRecommendation(name='dont_compress',
|
||||
recommended_value=False, level=OptionRecommendation.LOW,
|
||||
help=_('Disable compression of the file contents.')
|
||||
),
|
||||
OptionRecommendation(name='personal_doc', recommended_value='[PDOC]',
|
||||
help=_('Tag for MOBI files to be marked as personal documents.'
|
||||
' This option has no effect on the conversion. It is used'
|
||||
' only when sending MOBI files to a device. If the file'
|
||||
' being sent has the specified tag, it will be marked as'
|
||||
' a personal document when sent to the Kindle.')
|
||||
),
|
||||
OptionRecommendation(name='mobi_ignore_margins',
|
||||
recommended_value=False,
|
||||
help=_('Ignore margins in the input document. If False, then '
|
||||
'the MOBI output plugin will try to convert margins specified'
|
||||
' in the input document, otherwise it will ignore them.')
|
||||
),
|
||||
OptionRecommendation(name='mobi_toc_at_start',
|
||||
recommended_value=False,
|
||||
help=_('When adding the Table of Contents to the book, add it at the start of the '
|
||||
'book instead of the end. Not recommended.')
|
||||
),
|
||||
OptionRecommendation(name='extract_to',
|
||||
help=_('Extract the contents of the generated %s file to the '
|
||||
'specified directory. The contents of the directory are first '
|
||||
'deleted, so be careful.') % 'MOBI'
|
||||
),
|
||||
OptionRecommendation(name='share_not_sync', recommended_value=False,
|
||||
help=_('Enable sharing of book content via Facebook etc. '
|
||||
' on the Kindle. WARNING: Using this feature means that '
|
||||
' the book will not auto sync its last read position '
|
||||
' on multiple devices. Complain to Amazon.')
|
||||
),
|
||||
OptionRecommendation(name='mobi_keep_original_images',
|
||||
recommended_value=False,
|
||||
help=_('By default calibre converts all images to JPEG format '
|
||||
'in the output MOBI file. This is for maximum compatibility '
|
||||
'as some older MOBI viewers have problems with other image '
|
||||
'formats. This option tells calibre not to do this. '
|
||||
'Useful if your document contains lots of GIF/PNG images that '
|
||||
'become very large when converted to JPEG.')),
|
||||
OptionRecommendation(name='mobi_file_type', choices=ui_data['file_types'], recommended_value='old',
|
||||
help=_('By default calibre generates MOBI files that contain the '
|
||||
'old MOBI 6 format. This format is compatible with all '
|
||||
'devices. However, by changing this setting, you can tell '
|
||||
'calibre to generate MOBI files that contain both MOBI 6 and '
|
||||
'the new KF8 format, or only the new KF8 format. KF8 has '
|
||||
'more features than MOBI 6, but only works with newer Kindles. '
|
||||
'Allowed values: {}').format('old, both, new')),
|
||||
|
||||
}
|
||||
|
||||
def check_for_periodical(self):
|
||||
if self.is_periodical:
|
||||
self.periodicalize_toc()
|
||||
self.check_for_masthead()
|
||||
self.opts.mobi_periodical = True
|
||||
else:
|
||||
self.opts.mobi_periodical = False
|
||||
|
||||
def check_for_masthead(self):
|
||||
found = 'masthead' in self.oeb.guide
|
||||
if not found:
|
||||
from calibre.ebooks import generate_masthead
|
||||
self.oeb.log.debug('No masthead found in manifest, generating default mastheadImage...')
|
||||
raw = generate_masthead(unicode_type(self.oeb.metadata['title'][0]))
|
||||
id, href = self.oeb.manifest.generate('masthead', 'masthead')
|
||||
self.oeb.manifest.add(id, href, 'image/gif', data=raw)
|
||||
self.oeb.guide.add('masthead', 'Masthead Image', href)
|
||||
else:
|
||||
self.oeb.log.debug('Using mastheadImage supplied in manifest...')
|
||||
|
||||
def periodicalize_toc(self):
|
||||
from calibre.ebooks.oeb.base import TOC
|
||||
toc = self.oeb.toc
|
||||
if not toc or len(self.oeb.spine) < 3:
|
||||
return
|
||||
if toc and toc[0].klass != 'periodical':
|
||||
one, two = self.oeb.spine[0], self.oeb.spine[1]
|
||||
self.log('Converting TOC for MOBI periodical indexing...')
|
||||
|
||||
articles = {}
|
||||
if toc.depth() < 3:
|
||||
# single section periodical
|
||||
self.oeb.manifest.remove(one)
|
||||
self.oeb.manifest.remove(two)
|
||||
sections = [TOC(klass='section', title=_('All articles'),
|
||||
href=self.oeb.spine[0].href)]
|
||||
for x in toc:
|
||||
sections[0].nodes.append(x)
|
||||
else:
|
||||
# multi-section periodical
|
||||
self.oeb.manifest.remove(one)
|
||||
sections = list(toc)
|
||||
for i,x in enumerate(sections):
|
||||
x.klass = 'section'
|
||||
articles_ = list(x)
|
||||
if articles_:
|
||||
self.oeb.manifest.remove(self.oeb.manifest.hrefs[x.href])
|
||||
x.href = articles_[0].href
|
||||
|
||||
for sec in sections:
|
||||
articles[id(sec)] = []
|
||||
for a in list(sec):
|
||||
a.klass = 'article'
|
||||
articles[id(sec)].append(a)
|
||||
sec.nodes.remove(a)
|
||||
|
||||
root = TOC(klass='periodical', href=self.oeb.spine[0].href,
|
||||
title=unicode_type(self.oeb.metadata.title[0]))
|
||||
|
||||
for s in sections:
|
||||
if articles[id(s)]:
|
||||
for a in articles[id(s)]:
|
||||
s.nodes.append(a)
|
||||
root.nodes.append(s)
|
||||
|
||||
for x in list(toc.nodes):
|
||||
toc.nodes.remove(x)
|
||||
|
||||
toc.nodes.append(root)
|
||||
|
||||
# Fix up the periodical href to point to first section href
|
||||
toc.nodes[0].href = toc.nodes[0].nodes[0].href
|
||||
|
||||
def convert(self, oeb, output_path, input_plugin, opts, log):
|
||||
from calibre.ebooks.mobi.writer2.resources import Resources
|
||||
self.log, self.opts, self.oeb = log, opts, oeb
|
||||
|
||||
mobi_type = opts.mobi_file_type
|
||||
if self.is_periodical:
|
||||
mobi_type = 'old' # Amazon does not support KF8 periodicals
|
||||
create_kf8 = mobi_type in ('new', 'both')
|
||||
|
||||
remove_html_cover(self.oeb, self.log)
|
||||
resources = Resources(oeb, opts, self.is_periodical,
|
||||
add_fonts=create_kf8)
|
||||
self.check_for_periodical()
|
||||
|
||||
if create_kf8:
|
||||
from calibre.ebooks.mobi.writer8.cleanup import remove_duplicate_anchors
|
||||
remove_duplicate_anchors(self.oeb)
|
||||
# Split on pagebreaks so that the resulting KF8 is faster to load
|
||||
from calibre.ebooks.oeb.transforms.split import Split
|
||||
Split()(self.oeb, self.opts)
|
||||
|
||||
kf8 = self.create_kf8(resources, for_joint=mobi_type=='both'
|
||||
) if create_kf8 else None
|
||||
if mobi_type == 'new':
|
||||
kf8.write(output_path)
|
||||
extract_mobi(output_path, opts)
|
||||
return
|
||||
|
||||
self.log('Creating MOBI 6 output')
|
||||
self.write_mobi(input_plugin, output_path, kf8, resources)
|
||||
|
||||
def create_kf8(self, resources, for_joint=False):
|
||||
from calibre.ebooks.mobi.writer8.main import create_kf8_book
|
||||
return create_kf8_book(self.oeb, self.opts, resources,
|
||||
for_joint=for_joint)
|
||||
|
||||
def write_mobi(self, input_plugin, output_path, kf8, resources):
|
||||
from calibre.ebooks.mobi.mobiml import MobiMLizer
|
||||
from calibre.ebooks.oeb.transforms.manglecase import CaseMangler
|
||||
from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer, Unavailable
|
||||
from calibre.ebooks.oeb.transforms.htmltoc import HTMLTOCAdder
|
||||
from calibre.customize.ui import plugin_for_input_format
|
||||
|
||||
opts, oeb = self.opts, self.oeb
|
||||
if not opts.no_inline_toc:
|
||||
tocadder = HTMLTOCAdder(title=opts.toc_title, position='start' if
|
||||
opts.mobi_toc_at_start else 'end')
|
||||
tocadder(oeb, opts)
|
||||
mangler = CaseMangler()
|
||||
mangler(oeb, opts)
|
||||
try:
|
||||
rasterizer = SVGRasterizer()
|
||||
rasterizer(oeb, opts)
|
||||
except Unavailable:
|
||||
self.log.warn('SVG rasterizer unavailable, SVG will not be converted')
|
||||
else:
|
||||
# Add rasterized SVG images
|
||||
resources.add_extra_images()
|
||||
if hasattr(self.oeb, 'inserted_metadata_jacket'):
|
||||
self.workaround_fire_bugs(self.oeb.inserted_metadata_jacket)
|
||||
mobimlizer = MobiMLizer(ignore_tables=opts.linearize_tables)
|
||||
mobimlizer(oeb, opts)
|
||||
write_page_breaks_after_item = input_plugin is not plugin_for_input_format('cbz')
|
||||
from calibre.ebooks.mobi.writer2.main import MobiWriter
|
||||
writer = MobiWriter(opts, resources, kf8,
|
||||
write_page_breaks_after_item=write_page_breaks_after_item)
|
||||
writer(oeb, output_path)
|
||||
extract_mobi(output_path, opts)
|
||||
|
||||
def specialize_css_for_output(self, log, opts, item, stylizer):
|
||||
from calibre.ebooks.mobi.writer8.cleanup import CSSCleanup
|
||||
CSSCleanup(log, opts)(item, stylizer)
|
||||
|
||||
def workaround_fire_bugs(self, jacket):
|
||||
# The idiotic Fire crashes when trying to render the table used to
|
||||
# layout the jacket
|
||||
from calibre.ebooks.oeb.base import XHTML
|
||||
for table in jacket.data.xpath('//*[local-name()="table"]'):
|
||||
table.tag = XHTML('div')
|
||||
for tr in table.xpath('descendant::*[local-name()="tr"]'):
|
||||
cols = tr.xpath('descendant::*[local-name()="td"]')
|
||||
tr.tag = XHTML('div')
|
||||
for td in cols:
|
||||
td.tag = XHTML('span' if cols else 'div')
|
||||
|
||||
|
||||
class AZW3Output(OutputFormatPlugin):
|
||||
|
||||
name = 'AZW3 Output'
|
||||
author = 'Kovid Goyal'
|
||||
file_type = 'azw3'
|
||||
commit_name = 'azw3_output'
|
||||
|
||||
options = {
|
||||
OptionRecommendation(name='prefer_author_sort',
|
||||
recommended_value=False, level=OptionRecommendation.LOW,
|
||||
help=_('When present, use author sort field as author.')
|
||||
),
|
||||
OptionRecommendation(name='no_inline_toc',
|
||||
recommended_value=False, level=OptionRecommendation.LOW,
|
||||
help=_('Don\'t add Table of Contents to the book. Useful if '
|
||||
'the book has its own table of contents.')),
|
||||
OptionRecommendation(name='toc_title', recommended_value=None,
|
||||
help=_('Title for any generated in-line table of contents.')
|
||||
),
|
||||
OptionRecommendation(name='dont_compress',
|
||||
recommended_value=False, level=OptionRecommendation.LOW,
|
||||
help=_('Disable compression of the file contents.')
|
||||
),
|
||||
OptionRecommendation(name='mobi_toc_at_start',
|
||||
recommended_value=False,
|
||||
help=_('When adding the Table of Contents to the book, add it at the start of the '
|
||||
'book instead of the end. Not recommended.')
|
||||
),
|
||||
OptionRecommendation(name='extract_to',
|
||||
help=_('Extract the contents of the generated %s file to the '
|
||||
'specified directory. The contents of the directory are first '
|
||||
'deleted, so be careful.') % 'AZW3'),
|
||||
OptionRecommendation(name='share_not_sync', recommended_value=False,
|
||||
help=_('Enable sharing of book content via Facebook etc. '
|
||||
' on the Kindle. WARNING: Using this feature means that '
|
||||
' the book will not auto sync its last read position '
|
||||
' on multiple devices. Complain to Amazon.')
|
||||
),
|
||||
}
|
||||
|
||||
def convert(self, oeb, output_path, input_plugin, opts, log):
|
||||
from calibre.ebooks.mobi.writer2.resources import Resources
|
||||
from calibre.ebooks.mobi.writer8.main import create_kf8_book
|
||||
from calibre.ebooks.mobi.writer8.cleanup import remove_duplicate_anchors
|
||||
|
||||
self.oeb, self.opts, self.log = oeb, opts, log
|
||||
opts.mobi_periodical = self.is_periodical
|
||||
passthrough = getattr(opts, 'mobi_passthrough', False)
|
||||
remove_duplicate_anchors(oeb)
|
||||
|
||||
resources = Resources(self.oeb, self.opts, self.is_periodical,
|
||||
add_fonts=True, process_images=False)
|
||||
if not passthrough:
|
||||
remove_html_cover(self.oeb, self.log)
|
||||
|
||||
# Split on pagebreaks so that the resulting KF8 is faster to load
|
||||
from calibre.ebooks.oeb.transforms.split import Split
|
||||
Split()(self.oeb, self.opts)
|
||||
|
||||
kf8 = create_kf8_book(self.oeb, self.opts, resources, for_joint=False)
|
||||
|
||||
kf8.write(output_path)
|
||||
extract_mobi(output_path, opts)
|
||||
|
||||
def specialize_css_for_output(self, log, opts, item, stylizer):
|
||||
from calibre.ebooks.mobi.writer8.cleanup import CSSCleanup
|
||||
CSSCleanup(log, opts)(item, stylizer)
|
||||
25
ebook_converter/ebooks/conversion/plugins/odt_input.py
Normal file
25
ebook_converter/ebooks/conversion/plugins/odt_input.py
Normal file
@@ -0,0 +1,25 @@
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
'''
|
||||
Convert an ODT file into a Open Ebook
|
||||
'''
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin
|
||||
|
||||
|
||||
class ODTInput(InputFormatPlugin):
|
||||
|
||||
name = 'ODT Input'
|
||||
author = 'Kovid Goyal'
|
||||
description = 'Convert ODT (OpenOffice) files to HTML'
|
||||
file_types = {'odt'}
|
||||
commit_name = 'odt_input'
|
||||
|
||||
def convert(self, stream, options, file_ext, log,
|
||||
accelerators):
|
||||
from calibre.ebooks.odt.input import Extract
|
||||
return Extract()(stream, '.', log)
|
||||
122
ebook_converter/ebooks/conversion/plugins/oeb_output.py
Normal file
122
ebook_converter/ebooks/conversion/plugins/oeb_output.py
Normal file
@@ -0,0 +1,122 @@
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os, re
|
||||
|
||||
|
||||
from calibre.customize.conversion import (OutputFormatPlugin,
|
||||
OptionRecommendation)
|
||||
from calibre import CurrentDir
|
||||
|
||||
|
||||
class OEBOutput(OutputFormatPlugin):
|
||||
|
||||
name = 'OEB Output'
|
||||
author = 'Kovid Goyal'
|
||||
file_type = 'oeb'
|
||||
commit_name = 'oeb_output'
|
||||
|
||||
recommendations = {('pretty_print', True, OptionRecommendation.HIGH)}
|
||||
|
||||
def convert(self, oeb_book, output_path, input_plugin, opts, log):
|
||||
from polyglot.urllib import unquote
|
||||
from lxml import etree
|
||||
|
||||
self.log, self.opts = log, opts
|
||||
if not os.path.exists(output_path):
|
||||
os.makedirs(output_path)
|
||||
from calibre.ebooks.oeb.base import OPF_MIME, NCX_MIME, PAGE_MAP_MIME, OEB_STYLES
|
||||
from calibre.ebooks.oeb.normalize_css import condense_sheet
|
||||
with CurrentDir(output_path):
|
||||
results = oeb_book.to_opf2(page_map=True)
|
||||
for key in (OPF_MIME, NCX_MIME, PAGE_MAP_MIME):
|
||||
href, root = results.pop(key, [None, None])
|
||||
if root is not None:
|
||||
if key == OPF_MIME:
|
||||
try:
|
||||
self.workaround_nook_cover_bug(root)
|
||||
except:
|
||||
self.log.exception('Something went wrong while trying to'
|
||||
' workaround Nook cover bug, ignoring')
|
||||
try:
|
||||
self.workaround_pocketbook_cover_bug(root)
|
||||
except:
|
||||
self.log.exception('Something went wrong while trying to'
|
||||
' workaround Pocketbook cover bug, ignoring')
|
||||
self.migrate_lang_code(root)
|
||||
raw = etree.tostring(root, pretty_print=True,
|
||||
encoding='utf-8', xml_declaration=True)
|
||||
if key == OPF_MIME:
|
||||
# Needed as I can't get lxml to output opf:role and
|
||||
# not output <opf:metadata> as well
|
||||
raw = re.sub(br'(<[/]{0,1})opf:', br'\1', raw)
|
||||
with lopen(href, 'wb') as f:
|
||||
f.write(raw)
|
||||
|
||||
for item in oeb_book.manifest:
|
||||
if (
|
||||
not self.opts.expand_css and item.media_type in OEB_STYLES and hasattr(
|
||||
item.data, 'cssText') and 'nook' not in self.opts.output_profile.short_name):
|
||||
condense_sheet(item.data)
|
||||
path = os.path.abspath(unquote(item.href))
|
||||
dir = os.path.dirname(path)
|
||||
if not os.path.exists(dir):
|
||||
os.makedirs(dir)
|
||||
with lopen(path, 'wb') as f:
|
||||
f.write(item.bytes_representation)
|
||||
item.unload_data_from_memory(memory=path)
|
||||
|
||||
def workaround_nook_cover_bug(self, root):  # {{{
    """Ensure the cover image's manifest id is exactly ``cover``.

    The Nook Color only recognises the cover when the manifest item for
    the cover image has id="cover"; rename ids (and fix up idrefs) so
    that this holds.
    """
    cov = root.xpath('//*[local-name() = "meta" and @name="cover" and'
            ' @content != "cover"]')

    def manifest_items_with_id(id_):
        return root.xpath('//*[local-name() = "manifest"]/*[local-name() = "item" '
            ' and @id="%s"]'%id_)

    if len(cov) != 1:
        return
    cov = cov[0]
    covid = cov.get('content', '')
    if not covid:
        return

    manifest_item = manifest_items_with_id(covid)
    is_cover_image = (
        len(manifest_item) == 1 and
        manifest_item[0].get('media-type', '').startswith('image/'))
    if not is_cover_image:
        return

    self.log.warn('The cover image has an id != "cover". Renaming'
            ' to work around bug in Nook Color')

    # Free the "cover" id: move any current holders (and every reference
    # to them) onto a fresh unique id first.
    from calibre.ebooks.oeb.base import uuid_id
    newid = uuid_id()

    for item in manifest_items_with_id('cover'):
        item.set('id', newid)

    for x in root.xpath('//*[@idref="cover"]'):
        x.set('idref', newid)

    manifest_item = manifest_item[0]
    manifest_item.set('id', 'cover')
    cov.set('content', 'cover')
    # }}}
|
||||
|
||||
def workaround_pocketbook_cover_bug(self, root):  # {{{
    """Move the manifest item with id="cover" to the front of the manifest.

    Pocketbook firmware only picks up the cover when its manifest entry
    comes first.
    """
    matches = root.xpath('//*[local-name() = "manifest"]/*[local-name() = "item" '
            ' and @id="cover"]')
    if len(matches) != 1:
        return
    cover_item = matches[0]
    parent = cover_item.getparent()
    parent.remove(cover_item)
    parent.insert(0, cover_item)
    # }}}
|
||||
|
||||
def migrate_lang_code(self, root):  # {{{
    """Normalise every <dc:language> value to its ISO 639-1 code, when one exists."""
    from calibre.utils.localization import lang_as_iso639_1
    for lang_el in root.xpath('//*[local-name() = "language"]'):
        two_letter = lang_as_iso639_1(lang_el.text)
        if two_letter:
            lang_el.text = two_letter
    # }}}
|
||||
37
ebook_converter/ebooks/conversion/plugins/pdb_input.py
Normal file
37
ebook_converter/ebooks/conversion/plugins/pdb_input.py
Normal file
@@ -0,0 +1,37 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin
|
||||
from polyglot.builtins import getcwd
|
||||
|
||||
|
||||
class PDBInput(InputFormatPlugin):
    """Input plugin that converts PDB (Palm database) e-books to HTML."""

    name = 'PDB Input'
    author = 'John Schember'
    description = 'Convert PDB to HTML'
    file_types = {'pdb', 'updb'}
    commit_name = 'pdb_input'

    def convert(self, stream, options, file_ext, log,
                accelerators):
        from calibre.ebooks.pdb.header import PdbHeaderReader
        from calibre.ebooks.pdb import PDBError, IDENTITY_TO_NAME, get_reader

        header = PdbHeaderReader(stream)
        Reader = get_reader(header.ident)
        if Reader is None:
            raise PDBError('No reader available for format within container.\n Identity is %s. Book type is %s' %
                           (header.ident, IDENTITY_TO_NAME.get(header.ident, _('Unknown'))))

        log.debug('Detected ebook format as: %s with identity: %s' % (IDENTITY_TO_NAME[header.ident], header.ident))

        # The reader unpacks the book into the current working directory
        # and hands back the path to the generated OPF file.
        reader = Reader(header, stream, log, options)
        return reader.extract_content(getcwd())
|
||||
64
ebook_converter/ebooks/conversion/plugins/pdb_output.py
Normal file
64
ebook_converter/ebooks/conversion/plugins/pdb_output.py
Normal file
@@ -0,0 +1,64 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os
|
||||
|
||||
from calibre.customize.conversion import OutputFormatPlugin, \
|
||||
OptionRecommendation
|
||||
from calibre.ebooks.pdb import PDBError, get_writer, ALL_FORMAT_WRITERS
|
||||
|
||||
|
||||
class PDBOutput(OutputFormatPlugin):
    """Output plugin that wraps a book in a PDB (Palm database) container.

    The inner format is chosen via the ``format`` option; see
    ``ALL_FORMAT_WRITERS`` for the available choices.
    """

    name = 'PDB Output'
    author = 'John Schember'
    file_type = 'pdb'
    commit_name = 'pdb_output'
    ui_data = {'formats': tuple(ALL_FORMAT_WRITERS)}

    options = {
        OptionRecommendation(name='format', recommended_value='doc',
            level=OptionRecommendation.LOW,
            short_switch='f', choices=list(ALL_FORMAT_WRITERS),
            help=(_('Format to use inside the pdb container. Choices are:') + ' %s' % sorted(ALL_FORMAT_WRITERS))),
        OptionRecommendation(name='pdb_output_encoding', recommended_value='cp1252',
            level=OptionRecommendation.LOW,
            help=_('Specify the character encoding of the output document. '
            'The default is cp1252. Note: This option is not honored by all '
            'formats.')),
        OptionRecommendation(name='inline_toc',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Add Table of Contents to beginning of the book.')),
    }

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        close = False
        if not hasattr(output_path, 'write'):
            # output_path is a filesystem path: create the parent
            # directory if needed and open the file ourselves (and
            # remember to close it afterwards).
            close = True
            if not os.path.exists(os.path.dirname(output_path)) and os.path.dirname(output_path):
                os.makedirs(os.path.dirname(output_path))
            out_stream = lopen(output_path, 'wb')
        else:
            out_stream = output_path

        Writer = get_writer(opts.format)

        if Writer is None:
            # BUG FIX: the original interpolated the builtin ``format``
            # function here instead of the requested format name.
            raise PDBError('No writer available for format %s.' % opts.format)

        # Required by the underlying writers, but not exposed as user
        # options for PDB output.
        opts.max_line_length = 0
        opts.force_max_line_length = False

        writer = Writer(opts, log)

        out_stream.seek(0)
        out_stream.truncate()

        writer.write_content(oeb_book, out_stream, oeb_book.metadata)

        if close:
            out_stream.close()
|
||||
82
ebook_converter/ebooks/conversion/plugins/pdf_input.py
Normal file
82
ebook_converter/ebooks/conversion/plugins/pdf_input.py
Normal file
@@ -0,0 +1,82 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
|
||||
from polyglot.builtins import as_bytes, getcwd
|
||||
|
||||
|
||||
class PDFInput(InputFormatPlugin):
    """Input plugin that converts PDF files to an HTML based book."""

    name = 'PDF Input'
    author = 'Kovid Goyal and John Schember'
    description = 'Convert PDF files to HTML'
    file_types = {'pdf'}
    commit_name = 'pdf_input'

    options = {
        OptionRecommendation(name='no_images', recommended_value=False,
            help=_('Do not extract images from the document')),
        OptionRecommendation(name='unwrap_factor', recommended_value=0.45,
            help=_('Scale used to determine the length at which a line should '
            'be unwrapped. Valid values are a decimal between 0 and 1. The '
            'default is 0.45, just below the median line length.')),
        OptionRecommendation(name='new_pdf_engine', recommended_value=False,
            help=_('Use the new PDF conversion engine. Currently not operational.'))
    }

    def convert_new(self, stream, accelerators):
        """Experimental conversion path based on the XML output of pdftohtml."""
        from calibre.ebooks.pdf.pdftohtml import pdftohtml
        from calibre.utils.cleantext import clean_ascii_chars
        from calibre.ebooks.pdf.reflow import PDFDocument

        pdftohtml(getcwd(), stream.name, self.opts.no_images, as_xml=True)
        with lopen('index.xml', 'rb') as f:
            xml = clean_ascii_chars(f.read())
        # PDFDocument writes its results into the current directory.
        PDFDocument(xml, self.opts, self.log)
        return os.path.join(getcwd(), 'metadata.opf')

    def convert(self, stream, options, file_ext, log,
                accelerators):
        from calibre.ebooks.metadata.opf2 import OPFCreator
        from calibre.ebooks.pdf.pdftohtml import pdftohtml

        log.debug('Converting file to html...')
        # The main html file will be named index.html
        self.opts, self.log = options, log
        if options.new_pdf_engine:
            return self.convert_new(stream, accelerators)
        pdftohtml(getcwd(), stream.name, options.no_images)

        from calibre.ebooks.metadata.meta import get_metadata
        log.debug('Retrieving document metadata...')
        mi = get_metadata(stream, 'pdf')
        opf = OPFCreator(getcwd(), mi)

        # Everything left in the working directory besides index.html is
        # an extracted image; add it all to the manifest.
        images = os.listdir(getcwd())
        images.remove('index.html')
        manifest = [('index.html', None)] + [(image, None) for image in images]
        log.debug('Generating manifest...')
        opf.create_manifest(manifest)

        opf.create_spine(['index.html'])
        log.debug('Rendering manifest...')
        with lopen('metadata.opf', 'wb') as opffile:
            opf.render(opffile)
        if os.path.exists('toc.ncx'):
            ncxid = opf.manifest.id_for_path('toc.ncx')
            if ncxid:
                # OPFCreator cannot set the spine's toc attribute, so
                # patch the serialized OPF in place.
                with lopen('metadata.opf', 'r+b') as f:
                    raw = f.read().replace(b'<spine', b'<spine toc="%s"' % as_bytes(ncxid))
                    f.seek(0)
                    f.write(raw)

        return os.path.join(getcwd(), 'metadata.opf')
|
||||
256
ebook_converter/ebooks/conversion/plugins/pdf_output.py
Normal file
256
ebook_converter/ebooks/conversion/plugins/pdf_output.py
Normal file
@@ -0,0 +1,256 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2012, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
'''
|
||||
Convert OEB ebook format to PDF.
|
||||
'''
|
||||
|
||||
import glob, os
|
||||
|
||||
from calibre.customize.conversion import (OutputFormatPlugin,
|
||||
OptionRecommendation)
|
||||
from calibre.ptempfile import TemporaryDirectory
|
||||
from polyglot.builtins import iteritems, unicode_type
|
||||
|
||||
UNITS = ('millimeter', 'centimeter', 'point', 'inch' , 'pica' , 'didot',
|
||||
'cicero', 'devicepixel')
|
||||
|
||||
PAPER_SIZES = ('a0', 'a1', 'a2', 'a3', 'a4', 'a5', 'a6', 'b0', 'b1',
|
||||
'b2', 'b3', 'b4', 'b5', 'b6', 'legal', 'letter')
|
||||
|
||||
|
||||
class PDFOutput(OutputFormatPlugin):
    """Output plugin that renders an OEB book (or image collection) to PDF.

    Rendering is done with Qt WebEngine; ``specialize_options`` performs
    the Qt scheme registration that must happen before any Qt
    application is constructed.
    """

    name = 'PDF Output'
    author = 'Kovid Goyal'
    file_type = 'pdf'
    commit_name = 'pdf_output'
    ui_data = {'paper_sizes': PAPER_SIZES, 'units': UNITS, 'font_types': ('serif', 'sans', 'mono')}

    options = {
        OptionRecommendation(name='use_profile_size', recommended_value=False,
            help=_('Instead of using the paper size specified in the PDF Output options,'
                   ' use a paper size corresponding to the current output profile.'
                   ' Useful if you want to generate a PDF for viewing on a specific device.')),
        OptionRecommendation(name='unit', recommended_value='inch',
            level=OptionRecommendation.LOW, short_switch='u', choices=UNITS,
            help=_('The unit of measure for page sizes. Default is inch. Choices '
                   'are {} '
                   'Note: This does not override the unit for margins!').format(', '.join(UNITS))),
        OptionRecommendation(name='paper_size', recommended_value='letter',
            level=OptionRecommendation.LOW, choices=PAPER_SIZES,
            help=_('The size of the paper. This size will be overridden when a '
                   'non default output profile is used. Default is letter. Choices '
                   'are {}').format(', '.join(PAPER_SIZES))),
        OptionRecommendation(name='custom_size', recommended_value=None,
            help=_('Custom size of the document. Use the form widthxheight '
                   'e.g. `123x321` to specify the width and height. '
                   'This overrides any specified paper-size.')),
        OptionRecommendation(name='preserve_cover_aspect_ratio',
            recommended_value=False,
            help=_('Preserve the aspect ratio of the cover, instead'
                   ' of stretching it to fill the full first page of the'
                   ' generated pdf.')),
        OptionRecommendation(name='pdf_serif_family',
            recommended_value='Times', help=_(
                'The font family used to render serif fonts. Will work only if the font is available system-wide.')),
        OptionRecommendation(name='pdf_sans_family',
            recommended_value='Helvetica', help=_(
                'The font family used to render sans-serif fonts. Will work only if the font is available system-wide.')),
        OptionRecommendation(name='pdf_mono_family',
            recommended_value='Courier', help=_(
                'The font family used to render monospace fonts. Will work only if the font is available system-wide.')),
        # BUG FIX: the help text was copy-pasted from pdf_mono_family; this
        # option actually selects which family is used for standard text.
        OptionRecommendation(name='pdf_standard_font', choices=ui_data['font_types'],
            recommended_value='serif', help=_(
                'The font family used to render standard text')),
        OptionRecommendation(name='pdf_default_font_size',
            recommended_value=20, help=_(
                'The default font size')),
        OptionRecommendation(name='pdf_mono_font_size',
            recommended_value=16, help=_(
                'The default font size for monospaced text')),
        OptionRecommendation(name='pdf_hyphenate', recommended_value=False,
            help=_('Break long words at the end of lines. This can give the text at the right margin a more even appearance.')),
        OptionRecommendation(name='pdf_mark_links', recommended_value=False,
            help=_('Surround all links with a red box, useful for debugging.')),
        OptionRecommendation(name='pdf_page_numbers', recommended_value=False,
            help=_('Add page numbers to the bottom of every page in the generated PDF file. If you '
                   'specify a footer template, it will take precedence '
                   'over this option.')),
        OptionRecommendation(name='pdf_footer_template', recommended_value=None,
            help=_('An HTML template used to generate %s on every page.'
                   ' The strings _PAGENUM_, _TITLE_, _AUTHOR_ and _SECTION_ will be replaced by their current values.')%_('footers')),
        OptionRecommendation(name='pdf_header_template', recommended_value=None,
            help=_('An HTML template used to generate %s on every page.'
                   ' The strings _PAGENUM_, _TITLE_, _AUTHOR_ and _SECTION_ will be replaced by their current values.')%_('headers')),
        OptionRecommendation(name='pdf_add_toc', recommended_value=False,
            help=_('Add a Table of Contents at the end of the PDF that lists page numbers. '
                   'Useful if you want to print out the PDF. If this PDF is intended for electronic use, use the PDF Outline instead.')),
        OptionRecommendation(name='toc_title', recommended_value=None,
            help=_('Title for generated table of contents.')
        ),

        OptionRecommendation(name='pdf_page_margin_left', recommended_value=72.0,
            level=OptionRecommendation.LOW,
            help=_('The size of the left page margin, in pts. Default is 72pt.'
                   ' Overrides the common left page margin setting.')
        ),

        OptionRecommendation(name='pdf_page_margin_top', recommended_value=72.0,
            level=OptionRecommendation.LOW,
            help=_('The size of the top page margin, in pts. Default is 72pt.'
                   ' Overrides the common top page margin setting, unless set to zero.')
        ),

        OptionRecommendation(name='pdf_page_margin_right', recommended_value=72.0,
            level=OptionRecommendation.LOW,
            help=_('The size of the right page margin, in pts. Default is 72pt.'
                   ' Overrides the common right page margin setting, unless set to zero.')
        ),

        OptionRecommendation(name='pdf_page_margin_bottom', recommended_value=72.0,
            level=OptionRecommendation.LOW,
            help=_('The size of the bottom page margin, in pts. Default is 72pt.'
                   ' Overrides the common bottom page margin setting, unless set to zero.')
        ),
        OptionRecommendation(name='pdf_use_document_margins', recommended_value=False,
            help=_('Use the page margins specified in the input document via @page CSS rules.'
                   ' This will cause the margins specified in the conversion settings to be ignored.'
                   ' If the document does not specify page margins, the conversion settings will be used as a fallback.')
        ),
        OptionRecommendation(name='pdf_page_number_map', recommended_value=None,
            help=_('Adjust page numbers, as needed. Syntax is a JavaScript expression for the page number.'
                   ' For example, "if (n < 3) 0; else n - 3;", where n is current page number.')
        ),
        OptionRecommendation(name='uncompressed_pdf',
            recommended_value=False, help=_(
                'Generate an uncompressed PDF, useful for debugging.')
        ),
        OptionRecommendation(name='pdf_odd_even_offset', recommended_value=0.0,
            level=OptionRecommendation.LOW,
            help=_(
                'Shift the text horizontally by the specified offset (in pts).'
                ' On odd numbered pages, it is shifted to the right and on even'
                ' numbered pages to the left. Use negative numbers for the opposite'
                ' effect. Note that this setting is ignored on pages where the margins'
                ' are smaller than the specified offset. Shifting is done by setting'
                ' the PDF CropBox, not all software respects the CropBox.'
            )
        ),

    }

    def specialize_options(self, log, opts, input_fmt):
        # Ensure Qt is setup to be used with WebEngine
        # specialize_options is called early enough in the pipeline
        # that hopefully no Qt application has been constructed as yet
        from PyQt5.QtWebEngineCore import QWebEngineUrlScheme
        from PyQt5.QtWebEngineWidgets import QWebEnginePage  # noqa
        from calibre.gui2 import must_use_qt
        from calibre.constants import FAKE_PROTOCOL
        scheme = QWebEngineUrlScheme(FAKE_PROTOCOL.encode('ascii'))
        scheme.setSyntax(QWebEngineUrlScheme.Syntax.Host)
        scheme.setFlags(QWebEngineUrlScheme.SecureScheme)
        QWebEngineUrlScheme.registerScheme(scheme)
        must_use_qt()
        self.input_fmt = input_fmt

        if opts.pdf_use_document_margins:
            # Prevent the conversion pipeline from overwriting document margins
            opts.margin_left = opts.margin_right = opts.margin_top = opts.margin_bottom = -1

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        self.stored_page_margins = getattr(opts, '_stored_page_margins', {})

        self.oeb = oeb_book
        self.input_plugin, self.opts, self.log = input_plugin, opts, log
        self.output_path = output_path
        from calibre.ebooks.oeb.base import OPF, OPF2_NS
        from lxml import etree
        from io import BytesIO
        package = etree.Element(OPF('package'),
            attrib={'version': '2.0', 'unique-identifier': 'dummy'},
            nsmap={None: OPF2_NS})
        # NOTE: this import deliberately shadows the OPF element-maker
        # imported above; from here on OPF is the metadata parser class.
        from calibre.ebooks.metadata.opf2 import OPF
        self.oeb.metadata.to_opf2(package)
        self.metadata = OPF(BytesIO(etree.tostring(package))).to_book_metadata()
        self.cover_data = None

        if input_plugin.is_image_collection:
            log.debug('Converting input as an image collection...')
            self.convert_images(input_plugin.get_images())
        else:
            log.debug('Converting input as a text based book...')
            self.convert_text(oeb_book)

    def convert_images(self, images):
        from calibre.ebooks.pdf.image_writer import convert
        convert(images, self.output_path, self.opts, self.metadata, self.report_progress)

    def get_cover_data(self):
        # Stash the raw cover image data (if the book declares one) on
        # self.cover_data for the PDF writer.
        oeb = self.oeb
        if (oeb.metadata.cover and unicode_type(oeb.metadata.cover[0]) in oeb.manifest.ids):
            cover_id = unicode_type(oeb.metadata.cover[0])
            item = oeb.manifest.ids[cover_id]
            self.cover_data = item.data

    def process_fonts(self):
        ''' Make sure all fonts are embeddable '''
        from calibre.ebooks.oeb.base import urlnormalize
        from calibre.utils.fonts.utils import remove_embed_restriction

        processed = set()
        for item in list(self.oeb.manifest):
            if not hasattr(item.data, 'cssRules'):
                continue
            for i, rule in enumerate(item.data.cssRules):
                if rule.type == rule.FONT_FACE_RULE:
                    # BUG FIX: narrowed the two bare excepts below to
                    # "except Exception" so interrupts are not swallowed.
                    try:
                        s = rule.style
                        src = s.getProperty('src').propertyValue[0].uri
                    except Exception:
                        continue
                    path = item.abshref(src)
                    ff = self.oeb.manifest.hrefs.get(urlnormalize(path), None)
                    if ff is None:
                        continue

                    raw = nraw = ff.data
                    if path not in processed:
                        processed.add(path)
                        try:
                            nraw = remove_embed_restriction(raw)
                        except Exception:
                            continue
                        if nraw != raw:
                            ff.data = nraw
                            self.oeb.container.write(path, nraw)

    def convert_text(self, oeb_book):
        import json
        from calibre.ebooks.pdf.html_writer import convert
        self.get_cover_data()
        self.process_fonts()

        if self.opts.pdf_use_document_margins and self.stored_page_margins:
            # Re-attach the per-file @page margins collected earlier in
            # the pipeline so the PDF writer can honor them.
            for href, margins in iteritems(self.stored_page_margins):
                item = oeb_book.manifest.hrefs.get(href)
                if item is not None:
                    root = item.data
                    if hasattr(root, 'xpath') and margins:
                        root.set('data-calibre-pdf-output-page-margins', json.dumps(margins))

        with TemporaryDirectory('_pdf_out') as oeb_dir:
            from calibre.customize.ui import plugin_for_output_format
            oeb_dir = os.path.realpath(oeb_dir)
            oeb_output = plugin_for_output_format('oeb')
            oeb_output.convert(oeb_book, oeb_dir, self.input_plugin, self.opts, self.log)
            opfpath = glob.glob(os.path.join(oeb_dir, '*.opf'))[0]
            convert(
                opfpath, self.opts, metadata=self.metadata, output_path=self.output_path,
                log=self.log, cover_data=self.cover_data, report_progress=self.report_progress
            )
|
||||
165
ebook_converter/ebooks/conversion/plugins/pml_input.py
Normal file
165
ebook_converter/ebooks/conversion/plugins/pml_input.py
Normal file
@@ -0,0 +1,165 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import glob
|
||||
import os
|
||||
import shutil
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin
|
||||
from calibre.ptempfile import TemporaryDirectory
|
||||
from polyglot.builtins import getcwd
|
||||
|
||||
|
||||
class PMLInput(InputFormatPlugin):
    """Input plugin that converts PML (and zipped PMLZ) books to OEB."""

    name = 'PML Input'
    author = 'John Schember'
    description = 'Convert PML to OEB'
    # pmlz is a zip file containing pml files and png images.
    file_types = {'pml', 'pmlz'}
    commit_name = 'pml_input'

    def process_pml(self, pml_path, html_path, close_all=False):
        """Convert one PML file (path or stream) to HTML at *html_path*.

        Returns the TOC produced by the PML -> HTML converter.
        """
        from calibre.ebooks.pml.pmlconverter import PML_HTMLizer

        pclose = False
        hclose = False

        if not hasattr(pml_path, 'read'):
            pml_stream = lopen(pml_path, 'rb')
            pclose = True
        else:
            pml_stream = pml_path
            pml_stream.seek(0)

        if not hasattr(html_path, 'write'):
            html_stream = lopen(html_path, 'wb')
            hclose = True
        else:
            html_stream = html_path

        try:
            # Determine the input encoding: stream encoding, then the
            # cp1252 default, overridden by an explicit user option.
            ienc = getattr(pml_stream, 'encoding', None)
            if ienc is None:
                ienc = 'cp1252'
            if self.options.input_encoding:
                ienc = self.options.input_encoding

            self.log.debug('Converting PML to HTML...')
            hizer = PML_HTMLizer()
            html = hizer.parse_pml(pml_stream.read().decode(ienc), html_path)
            html = '<html><head><title></title></head><body>%s</body></html>'%html
            html_stream.write(html.encode('utf-8', 'replace'))
        finally:
            # BUG FIX: close the streams we opened even when conversion
            # raises, so file handles are not leaked.
            if pclose:
                pml_stream.close()
            if hclose:
                html_stream.close()

        return hizer.get_toc()

    def get_images(self, stream, tdir, top_level=False):
        """Copy the book's PNG images into ./images and return their hrefs."""
        images = []
        imgs = []

        if top_level:
            imgs = glob.glob(os.path.join(tdir, '*.png'))
        # Images not in top level try bookname_img directory because
        # that's where Dropbook likes to see them.
        if not imgs:
            if hasattr(stream, 'name'):
                imgs = glob.glob(os.path.join(tdir, os.path.splitext(os.path.basename(stream.name))[0] + '_img', '*.png'))
        # No images in Dropbook location try generic images directory
        if not imgs:
            imgs = glob.glob(os.path.join(os.path.join(tdir, 'images'), '*.png'))
        if imgs:
            os.makedirs(os.path.join(getcwd(), 'images'))
        for img in imgs:
            pimg_name = os.path.basename(img)
            pimg_path = os.path.join(getcwd(), 'images', pimg_name)

            images.append('images/' + pimg_name)

            shutil.copy(img, pimg_path)

        return images

    def convert(self, stream, options, file_ext, log,
                accelerators):
        from calibre.ebooks.metadata.toc import TOC
        from calibre.ebooks.metadata.opf2 import OPFCreator
        from calibre.utils.zipfile import ZipFile

        self.options = options
        self.log = log
        pages, images = [], []
        toc = TOC()

        if file_ext == 'pmlz':
            log.debug('De-compressing content to temporary directory...')
            with TemporaryDirectory('_unpmlz') as tdir:
                zf = ZipFile(stream)
                zf.extractall(tdir)

                pmls = glob.glob(os.path.join(tdir, '*.pml'))
                for pml in pmls:
                    html_name = os.path.splitext(os.path.basename(pml))[0]+'.html'
                    html_path = os.path.join(getcwd(), html_name)

                    pages.append(html_name)
                    log.debug('Processing PML item %s...' % pml)
                    ttoc = self.process_pml(pml, html_path)
                    toc += ttoc
                images = self.get_images(stream, tdir, True)
        else:
            toc = self.process_pml(stream, 'index.html')
            pages.append('index.html')

            if hasattr(stream, 'name'):
                images = self.get_images(stream, os.path.abspath(os.path.dirname(stream.name)))

        # We want pages to be ordered alphabetically.
        pages.sort()

        manifest_items = []
        for item in pages+images:
            manifest_items.append((item, None))

        from calibre.ebooks.metadata.meta import get_metadata
        log.debug('Reading metadata from input file...')
        mi = get_metadata(stream, 'pml')
        if 'images/cover.png' in images:
            mi.cover = 'images/cover.png'
        opf = OPFCreator(getcwd(), mi)
        log.debug('Generating manifest...')
        opf.create_manifest(manifest_items)
        opf.create_spine(pages)
        opf.set_toc(toc)
        with lopen('metadata.opf', 'wb') as opffile:
            with lopen('toc.ncx', 'wb') as tocfile:
                opf.render(opffile, tocfile, 'toc.ncx')

        return os.path.join(getcwd(), 'metadata.opf')

    def postprocess_book(self, oeb, opts, log):
        # Clean up heading markup produced by the PML converter: hoist
        # anchor ids out of empty spans and fold centered divs into the
        # heading itself.
        from calibre.ebooks.oeb.base import XHTML, barename
        for item in oeb.spine:
            if hasattr(item.data, 'xpath'):
                for heading in item.data.iterdescendants(*map(XHTML, 'h1 h2 h3 h4 h5 h6'.split())):
                    if not len(heading):
                        continue
                    span = heading[0]
                    if not heading.text and not span.text and not len(span) and barename(span.tag) == 'span':
                        if not heading.get('id') and span.get('id'):
                            heading.set('id', span.get('id'))
                            heading.text = span.tail
                            heading.remove(span)
                    if len(heading) == 1 and heading[0].get('style') == 'text-align: center; margin: auto;':
                        div = heading[0]
                        if barename(div.tag) == 'div' and not len(div) and not div.get('id') and not heading.get('style'):
                            heading.text = (heading.text or '') + (div.text or '') + (div.tail or '')
                            heading.remove(div)
                            heading.set('style', 'text-align: center')
|
||||
77
ebook_converter/ebooks/conversion/plugins/pml_output.py
Normal file
77
ebook_converter/ebooks/conversion/plugins/pml_output.py
Normal file
@@ -0,0 +1,77 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os, io
|
||||
|
||||
from calibre.customize.conversion import (OutputFormatPlugin,
|
||||
OptionRecommendation)
|
||||
from calibre.ptempfile import TemporaryDirectory
|
||||
from polyglot.builtins import unicode_type
|
||||
|
||||
|
||||
class PMLOutput(OutputFormatPlugin):
    """Output plugin that writes a zipped PML (PMLZ) book: index.pml plus
    an index_img directory of PNG images."""

    name = 'PML Output'
    author = 'John Schember'
    file_type = 'pmlz'
    commit_name = 'pml_output'

    options = {
        OptionRecommendation(name='pml_output_encoding', recommended_value='cp1252',
            level=OptionRecommendation.LOW,
            help=_('Specify the character encoding of the output document. '
            'The default is cp1252.')),
        OptionRecommendation(name='inline_toc',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Add Table of Contents to beginning of the book.')),
        OptionRecommendation(name='full_image_depth',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Do not reduce the size or bit depth of images. Images '
                   'have their size and depth reduced by default to accommodate '
                   'applications that can not convert images on their '
                   'own such as Dropbook.')),
    }

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        from calibre.ebooks.pml.pmlml import PMLMLizer
        from calibre.utils.zipfile import ZipFile

        with TemporaryDirectory('_pmlz_output') as tdir:
            pmlmlizer = PMLMLizer(log)
            pml = unicode_type(pmlmlizer.extract_content(oeb_book, opts))
            with lopen(os.path.join(tdir, 'index.pml'), 'wb') as out:
                out.write(pml.encode(opts.pml_output_encoding, 'replace'))

            img_path = os.path.join(tdir, 'index_img')
            if not os.path.exists(img_path):
                os.makedirs(img_path)
            self.write_images(oeb_book.manifest, pmlmlizer.image_hrefs, img_path, opts)

            log.debug('Compressing output...')
            # BUG FIX: close the archive so it is fully flushed to disk
            # (the original leaked the ZipFile handle).
            pmlz = ZipFile(output_path, 'w')
            try:
                pmlz.add_dir(tdir)
            finally:
                pmlz.close()

    def write_images(self, manifest, image_hrefs, out_dir, opts):
        """Write every referenced raster image into *out_dir* as PNG,
        downscaling to 300x300 8-bit unless full_image_depth is set."""
        from PIL import Image

        from calibre.ebooks.oeb.base import OEB_RASTER_IMAGES
        for item in manifest:
            if item.media_type in OEB_RASTER_IMAGES and item.href in image_hrefs.keys():
                if opts.full_image_depth:
                    im = Image.open(io.BytesIO(item.data))
                else:
                    im = Image.open(io.BytesIO(item.data)).convert('P')
                    # NOTE(review): Image.ANTIALIAS is removed in Pillow 10;
                    # switch to Image.LANCZOS if/when Pillow is upgraded.
                    im.thumbnail((300,300), Image.ANTIALIAS)

                data = io.BytesIO()
                im.save(data, 'PNG')
                data = data.getvalue()

                path = os.path.join(out_dir, image_hrefs[item.href])

                with lopen(path, 'wb') as out:
                    out.write(data)
|
||||
28
ebook_converter/ebooks/conversion/plugins/rb_input.py
Normal file
28
ebook_converter/ebooks/conversion/plugins/rb_input.py
Normal file
@@ -0,0 +1,28 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin
|
||||
from polyglot.builtins import getcwd
|
||||
|
||||
|
||||
class RBInput(InputFormatPlugin):
    """Input plugin that converts RocketBook (.rb) files to HTML."""

    name = 'RB Input'
    author = 'John Schember'
    description = 'Convert RB files to HTML'
    file_types = {'rb'}
    commit_name = 'rb_input'

    def convert(self, stream, options, file_ext, log,
                accelerators):
        from calibre.ebooks.rb.reader import Reader

        # The reader unpacks the book into the current working directory
        # and returns the path of the generated OPF file.
        reader = Reader(stream, log, options.input_encoding)
        return reader.extract_content(getcwd())
|
||||
45
ebook_converter/ebooks/conversion/plugins/rb_output.py
Normal file
45
ebook_converter/ebooks/conversion/plugins/rb_output.py
Normal file
@@ -0,0 +1,45 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os
|
||||
|
||||
from calibre.customize.conversion import OutputFormatPlugin, OptionRecommendation
|
||||
|
||||
|
||||
class RBOutput(OutputFormatPlugin):
|
||||
|
||||
name = 'RB Output'
|
||||
author = 'John Schember'
|
||||
file_type = 'rb'
|
||||
commit_name = 'rb_output'
|
||||
|
||||
options = {
|
||||
OptionRecommendation(name='inline_toc',
|
||||
recommended_value=False, level=OptionRecommendation.LOW,
|
||||
help=_('Add Table of Contents to beginning of the book.'))}
|
||||
|
||||
def convert(self, oeb_book, output_path, input_plugin, opts, log):
|
||||
from calibre.ebooks.rb.writer import RBWriter
|
||||
|
||||
close = False
|
||||
if not hasattr(output_path, 'write'):
|
||||
close = True
|
||||
if not os.path.exists(os.path.dirname(output_path)) and os.path.dirname(output_path):
|
||||
os.makedirs(os.path.dirname(output_path))
|
||||
out_stream = lopen(output_path, 'wb')
|
||||
else:
|
||||
out_stream = output_path
|
||||
|
||||
writer = RBWriter(opts, log)
|
||||
|
||||
out_stream.seek(0)
|
||||
out_stream.truncate()
|
||||
|
||||
writer.write_content(oeb_book, out_stream, oeb_book.metadata)
|
||||
|
||||
if close:
|
||||
out_stream.close()
|
||||
169
ebook_converter/ebooks/conversion/plugins/recipe_input.py
Normal file
169
ebook_converter/ebooks/conversion/plugins/recipe_input.py
Normal file
@@ -0,0 +1,169 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
|
||||
from calibre.constants import numeric_version
|
||||
from calibre import walk
|
||||
from polyglot.builtins import unicode_type
|
||||
|
||||
|
||||
class RecipeDisabled(Exception):
|
||||
pass
|
||||
|
||||
|
||||
class RecipeInput(InputFormatPlugin):
|
||||
|
||||
name = 'Recipe Input'
|
||||
author = 'Kovid Goyal'
|
||||
description = _('Download periodical content from the internet')
|
||||
file_types = {'recipe', 'downloaded_recipe'}
|
||||
commit_name = 'recipe_input'
|
||||
|
||||
recommendations = {
|
||||
('chapter', None, OptionRecommendation.HIGH),
|
||||
('dont_split_on_page_breaks', True, OptionRecommendation.HIGH),
|
||||
('use_auto_toc', False, OptionRecommendation.HIGH),
|
||||
('input_encoding', None, OptionRecommendation.HIGH),
|
||||
('input_profile', 'default', OptionRecommendation.HIGH),
|
||||
('page_breaks_before', None, OptionRecommendation.HIGH),
|
||||
('insert_metadata', False, OptionRecommendation.HIGH),
|
||||
}
|
||||
|
||||
options = {
|
||||
OptionRecommendation(name='test', recommended_value=False,
|
||||
help=_(
|
||||
'Useful for recipe development. Forces'
|
||||
' max_articles_per_feed to 2 and downloads at most 2 feeds.'
|
||||
' You can change the number of feeds and articles by supplying optional arguments.'
|
||||
' For example: --test 3 1 will download at most 3 feeds and only 1 article per feed.')),
|
||||
OptionRecommendation(name='username', recommended_value=None,
|
||||
help=_('Username for sites that require a login to access '
|
||||
'content.')),
|
||||
OptionRecommendation(name='password', recommended_value=None,
|
||||
help=_('Password for sites that require a login to access '
|
||||
'content.')),
|
||||
OptionRecommendation(name='dont_download_recipe',
|
||||
recommended_value=False,
|
||||
help=_('Do not download latest version of builtin recipes from the calibre server')),
|
||||
OptionRecommendation(name='lrf', recommended_value=False,
|
||||
help='Optimize fetching for subsequent conversion to LRF.'),
|
||||
}
|
||||
|
||||
def convert(self, recipe_or_file, opts, file_ext, log,
|
||||
accelerators):
|
||||
from calibre.web.feeds.recipes import compile_recipe
|
||||
opts.output_profile.flow_size = 0
|
||||
if file_ext == 'downloaded_recipe':
|
||||
from calibre.utils.zipfile import ZipFile
|
||||
zf = ZipFile(recipe_or_file, 'r')
|
||||
zf.extractall()
|
||||
zf.close()
|
||||
with lopen('download.recipe', 'rb') as f:
|
||||
self.recipe_source = f.read()
|
||||
recipe = compile_recipe(self.recipe_source)
|
||||
recipe.needs_subscription = False
|
||||
self.recipe_object = recipe(opts, log, self.report_progress)
|
||||
else:
|
||||
if os.environ.get('CALIBRE_RECIPE_URN'):
|
||||
from calibre.web.feeds.recipes.collection import get_custom_recipe, get_builtin_recipe_by_id
|
||||
urn = os.environ['CALIBRE_RECIPE_URN']
|
||||
log('Downloading recipe urn: ' + urn)
|
||||
rtype, recipe_id = urn.partition(':')[::2]
|
||||
if not recipe_id:
|
||||
raise ValueError('Invalid recipe urn: ' + urn)
|
||||
if rtype == 'custom':
|
||||
self.recipe_source = get_custom_recipe(recipe_id)
|
||||
else:
|
||||
self.recipe_source = get_builtin_recipe_by_id(urn, log=log, download_recipe=True)
|
||||
if not self.recipe_source:
|
||||
raise ValueError('Could not find recipe with urn: ' + urn)
|
||||
if not isinstance(self.recipe_source, bytes):
|
||||
self.recipe_source = self.recipe_source.encode('utf-8')
|
||||
recipe = compile_recipe(self.recipe_source)
|
||||
elif os.access(recipe_or_file, os.R_OK):
|
||||
with lopen(recipe_or_file, 'rb') as f:
|
||||
self.recipe_source = f.read()
|
||||
recipe = compile_recipe(self.recipe_source)
|
||||
log('Using custom recipe')
|
||||
else:
|
||||
from calibre.web.feeds.recipes.collection import (
|
||||
get_builtin_recipe_by_title, get_builtin_recipe_titles)
|
||||
title = getattr(opts, 'original_recipe_input_arg', recipe_or_file)
|
||||
title = os.path.basename(title).rpartition('.')[0]
|
||||
titles = frozenset(get_builtin_recipe_titles())
|
||||
if title not in titles:
|
||||
title = getattr(opts, 'original_recipe_input_arg', recipe_or_file)
|
||||
title = title.rpartition('.')[0]
|
||||
|
||||
raw = get_builtin_recipe_by_title(title, log=log,
|
||||
download_recipe=not opts.dont_download_recipe)
|
||||
builtin = False
|
||||
try:
|
||||
recipe = compile_recipe(raw)
|
||||
self.recipe_source = raw
|
||||
if recipe.requires_version > numeric_version:
|
||||
log.warn(
|
||||
'Downloaded recipe needs calibre version at least: %s' %
|
||||
('.'.join(recipe.requires_version)))
|
||||
builtin = True
|
||||
except:
|
||||
log.exception('Failed to compile downloaded recipe. Falling '
|
||||
'back to builtin one')
|
||||
builtin = True
|
||||
if builtin:
|
||||
log('Using bundled builtin recipe')
|
||||
raw = get_builtin_recipe_by_title(title, log=log,
|
||||
download_recipe=False)
|
||||
if raw is None:
|
||||
raise ValueError('Failed to find builtin recipe: '+title)
|
||||
recipe = compile_recipe(raw)
|
||||
self.recipe_source = raw
|
||||
else:
|
||||
log('Using downloaded builtin recipe')
|
||||
|
||||
if recipe is None:
|
||||
raise ValueError('%r is not a valid recipe file or builtin recipe' %
|
||||
recipe_or_file)
|
||||
|
||||
disabled = getattr(recipe, 'recipe_disabled', None)
|
||||
if disabled is not None:
|
||||
raise RecipeDisabled(disabled)
|
||||
ro = recipe(opts, log, self.report_progress)
|
||||
ro.download()
|
||||
self.recipe_object = ro
|
||||
|
||||
for key, val in self.recipe_object.conversion_options.items():
|
||||
setattr(opts, key, val)
|
||||
|
||||
for f in os.listdir('.'):
|
||||
if f.endswith('.opf'):
|
||||
return os.path.abspath(f)
|
||||
|
||||
for f in walk('.'):
|
||||
if f.endswith('.opf'):
|
||||
return os.path.abspath(f)
|
||||
|
||||
def postprocess_book(self, oeb, opts, log):
|
||||
if self.recipe_object is not None:
|
||||
self.recipe_object.internal_postprocess_book(oeb, opts, log)
|
||||
self.recipe_object.postprocess_book(oeb, opts, log)
|
||||
|
||||
def specialize(self, oeb, opts, log, output_fmt):
|
||||
if opts.no_inline_navbars:
|
||||
from calibre.ebooks.oeb.base import XPath
|
||||
for item in oeb.spine:
|
||||
for div in XPath('//h:div[contains(@class, "calibre_navbar")]')(item.data):
|
||||
div.getparent().remove(div)
|
||||
|
||||
def save_download(self, zf):
|
||||
raw = self.recipe_source
|
||||
if isinstance(raw, unicode_type):
|
||||
raw = raw.encode('utf-8')
|
||||
zf.writestr('download.recipe', raw)
|
||||
323
ebook_converter/ebooks/conversion/plugins/rtf_input.py
Normal file
323
ebook_converter/ebooks/conversion/plugins/rtf_input.py
Normal file
@@ -0,0 +1,323 @@
|
||||
from __future__ import with_statement, unicode_literals
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
import os, glob, re, textwrap
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
|
||||
from polyglot.builtins import iteritems, filter, getcwd, as_bytes
|
||||
|
||||
border_style_map = {
|
||||
'single' : 'solid',
|
||||
'double-thickness-border' : 'double',
|
||||
'shadowed-border': 'outset',
|
||||
'double-border': 'double',
|
||||
'dotted-border': 'dotted',
|
||||
'dashed': 'dashed',
|
||||
'hairline': 'solid',
|
||||
'inset': 'inset',
|
||||
'dash-small': 'dashed',
|
||||
'dot-dash': 'dotted',
|
||||
'dot-dot-dash': 'dotted',
|
||||
'outset': 'outset',
|
||||
'tripple': 'double',
|
||||
'triple': 'double',
|
||||
'thick-thin-small': 'solid',
|
||||
'thin-thick-small': 'solid',
|
||||
'thin-thick-thin-small': 'solid',
|
||||
'thick-thin-medium': 'solid',
|
||||
'thin-thick-medium': 'solid',
|
||||
'thin-thick-thin-medium': 'solid',
|
||||
'thick-thin-large': 'solid',
|
||||
'thin-thick-thin-large': 'solid',
|
||||
'wavy': 'ridge',
|
||||
'double-wavy': 'ridge',
|
||||
'striped': 'ridge',
|
||||
'emboss': 'inset',
|
||||
'engrave': 'inset',
|
||||
'frame': 'ridge',
|
||||
}
|
||||
|
||||
|
||||
class RTFInput(InputFormatPlugin):
|
||||
|
||||
name = 'RTF Input'
|
||||
author = 'Kovid Goyal'
|
||||
description = 'Convert RTF files to HTML'
|
||||
file_types = {'rtf'}
|
||||
commit_name = 'rtf_input'
|
||||
|
||||
options = {
|
||||
OptionRecommendation(name='ignore_wmf', recommended_value=False,
|
||||
help=_('Ignore WMF images instead of replacing them with a placeholder image.')),
|
||||
}
|
||||
|
||||
def generate_xml(self, stream):
|
||||
from calibre.ebooks.rtf2xml.ParseRtf import ParseRtf
|
||||
ofile = u'dataxml.xml'
|
||||
run_lev, debug_dir, indent_out = 1, None, 0
|
||||
if getattr(self.opts, 'debug_pipeline', None) is not None:
|
||||
try:
|
||||
os.mkdir(u'rtfdebug')
|
||||
debug_dir = u'rtfdebug'
|
||||
run_lev = 4
|
||||
indent_out = 1
|
||||
self.log('Running RTFParser in debug mode')
|
||||
except:
|
||||
self.log.warn('Impossible to run RTFParser in debug mode')
|
||||
parser = ParseRtf(
|
||||
in_file=stream,
|
||||
out_file=ofile,
|
||||
# Convert symbol fonts to unicode equivalents. Default
|
||||
# is 1
|
||||
convert_symbol=1,
|
||||
|
||||
# Convert Zapf fonts to unicode equivalents. Default
|
||||
# is 1.
|
||||
convert_zapf=1,
|
||||
|
||||
# Convert Wingding fonts to unicode equivalents.
|
||||
# Default is 1.
|
||||
convert_wingdings=1,
|
||||
|
||||
# Convert RTF caps to real caps.
|
||||
# Default is 1.
|
||||
convert_caps=1,
|
||||
|
||||
# Indent resulting XML.
|
||||
# Default is 0 (no indent).
|
||||
indent=indent_out,
|
||||
|
||||
# Form lists from RTF. Default is 1.
|
||||
form_lists=1,
|
||||
|
||||
# Convert headings to sections. Default is 0.
|
||||
headings_to_sections=1,
|
||||
|
||||
# Group paragraphs with the same style name. Default is 1.
|
||||
group_styles=1,
|
||||
|
||||
# Group borders. Default is 1.
|
||||
group_borders=1,
|
||||
|
||||
# Write or do not write paragraphs. Default is 0.
|
||||
empty_paragraphs=1,
|
||||
|
||||
# Debug
|
||||
deb_dir=debug_dir,
|
||||
|
||||
# Default encoding
|
||||
default_encoding=getattr(self.opts, 'input_encoding', 'cp1252') or 'cp1252',
|
||||
|
||||
# Run level
|
||||
run_level=run_lev,
|
||||
)
|
||||
parser.parse_rtf()
|
||||
with open(ofile, 'rb') as f:
|
||||
return f.read()
|
||||
|
||||
def extract_images(self, picts):
|
||||
from calibre.utils.imghdr import what
|
||||
from binascii import unhexlify
|
||||
self.log('Extracting images...')
|
||||
|
||||
with open(picts, 'rb') as f:
|
||||
raw = f.read()
|
||||
picts = filter(len, re.findall(br'\{\\pict([^}]+)\}', raw))
|
||||
hex_pat = re.compile(br'[^a-fA-F0-9]')
|
||||
encs = [hex_pat.sub(b'', pict) for pict in picts]
|
||||
|
||||
count = 0
|
||||
imap = {}
|
||||
for enc in encs:
|
||||
if len(enc) % 2 == 1:
|
||||
enc = enc[:-1]
|
||||
data = unhexlify(enc)
|
||||
fmt = what(None, data)
|
||||
if fmt is None:
|
||||
fmt = 'wmf'
|
||||
count += 1
|
||||
name = u'%04d.%s' % (count, fmt)
|
||||
with open(name, 'wb') as f:
|
||||
f.write(data)
|
||||
imap[count] = name
|
||||
# with open(name+'.hex', 'wb') as f:
|
||||
# f.write(enc)
|
||||
return self.convert_images(imap)
|
||||
|
||||
def convert_images(self, imap):
|
||||
self.default_img = None
|
||||
for count, val in iteritems(imap):
|
||||
try:
|
||||
imap[count] = self.convert_image(val)
|
||||
except:
|
||||
self.log.exception('Failed to convert', val)
|
||||
return imap
|
||||
|
||||
def convert_image(self, name):
|
||||
if not name.endswith('.wmf'):
|
||||
return name
|
||||
try:
|
||||
return self.rasterize_wmf(name)
|
||||
except Exception:
|
||||
self.log.exception('Failed to convert WMF image %r'%name)
|
||||
return self.replace_wmf(name)
|
||||
|
||||
def replace_wmf(self, name):
|
||||
if self.opts.ignore_wmf:
|
||||
os.remove(name)
|
||||
return '__REMOVE_ME__'
|
||||
from calibre.ebooks.covers import message_image
|
||||
if self.default_img is None:
|
||||
self.default_img = message_image('Conversion of WMF images is not supported.'
|
||||
' Use Microsoft Word or OpenOffice to save this RTF file'
|
||||
' as HTML and convert that in calibre.')
|
||||
name = name.replace('.wmf', '.jpg')
|
||||
with lopen(name, 'wb') as f:
|
||||
f.write(self.default_img)
|
||||
return name
|
||||
|
||||
def rasterize_wmf(self, name):
|
||||
from calibre.utils.wmf.parse import wmf_unwrap
|
||||
with open(name, 'rb') as f:
|
||||
data = f.read()
|
||||
data = wmf_unwrap(data)
|
||||
name = name.replace('.wmf', '.png')
|
||||
with open(name, 'wb') as f:
|
||||
f.write(data)
|
||||
return name
|
||||
|
||||
def write_inline_css(self, ic, border_styles):
|
||||
font_size_classes = ['span.fs%d { font-size: %spt }'%(i, x) for i, x in
|
||||
enumerate(ic.font_sizes)]
|
||||
color_classes = ['span.col%d { color: %s }'%(i, x) for i, x in
|
||||
enumerate(ic.colors) if x != 'false']
|
||||
css = textwrap.dedent('''
|
||||
span.none {
|
||||
text-decoration: none; font-weight: normal;
|
||||
font-style: normal; font-variant: normal
|
||||
}
|
||||
|
||||
span.italics { font-style: italic }
|
||||
|
||||
span.bold { font-weight: bold }
|
||||
|
||||
span.small-caps { font-variant: small-caps }
|
||||
|
||||
span.underlined { text-decoration: underline }
|
||||
|
||||
span.strike-through { text-decoration: line-through }
|
||||
|
||||
''')
|
||||
css += '\n'+'\n'.join(font_size_classes)
|
||||
css += '\n' +'\n'.join(color_classes)
|
||||
|
||||
for cls, val in iteritems(border_styles):
|
||||
css += '\n\n.%s {\n%s\n}'%(cls, val)
|
||||
|
||||
with open(u'styles.css', 'ab') as f:
|
||||
f.write(css.encode('utf-8'))
|
||||
|
||||
def convert_borders(self, doc):
|
||||
border_styles = []
|
||||
style_map = {}
|
||||
for elem in doc.xpath(r'//*[local-name()="cell"]'):
|
||||
style = ['border-style: hidden', 'border-width: 1px',
|
||||
'border-color: black']
|
||||
for x in ('bottom', 'top', 'left', 'right'):
|
||||
bs = elem.get('border-cell-%s-style'%x, None)
|
||||
if bs:
|
||||
cbs = border_style_map.get(bs, 'solid')
|
||||
style.append('border-%s-style: %s'%(x, cbs))
|
||||
bw = elem.get('border-cell-%s-line-width'%x, None)
|
||||
if bw:
|
||||
style.append('border-%s-width: %spt'%(x, bw))
|
||||
bc = elem.get('border-cell-%s-color'%x, None)
|
||||
if bc:
|
||||
style.append('border-%s-color: %s'%(x, bc))
|
||||
style = ';\n'.join(style)
|
||||
if style not in border_styles:
|
||||
border_styles.append(style)
|
||||
idx = border_styles.index(style)
|
||||
cls = 'border_style%d'%idx
|
||||
style_map[cls] = style
|
||||
elem.set('class', cls)
|
||||
return style_map
|
||||
|
||||
def convert(self, stream, options, file_ext, log,
|
||||
accelerators):
|
||||
from lxml import etree
|
||||
from calibre.ebooks.metadata.meta import get_metadata
|
||||
from calibre.ebooks.metadata.opf2 import OPFCreator
|
||||
from calibre.ebooks.rtf2xml.ParseRtf import RtfInvalidCodeException
|
||||
from calibre.ebooks.rtf.input import InlineClass
|
||||
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||
self.opts = options
|
||||
self.log = log
|
||||
self.log('Converting RTF to XML...')
|
||||
try:
|
||||
xml = self.generate_xml(stream.name)
|
||||
except RtfInvalidCodeException as e:
|
||||
self.log.exception('Unable to parse RTF')
|
||||
raise ValueError(_('This RTF file has a feature calibre does not '
|
||||
'support. Convert it to HTML first and then try it.\n%s')%e)
|
||||
|
||||
d = glob.glob(os.path.join('*_rtf_pict_dir', 'picts.rtf'))
|
||||
if d:
|
||||
imap = {}
|
||||
try:
|
||||
imap = self.extract_images(d[0])
|
||||
except:
|
||||
self.log.exception('Failed to extract images...')
|
||||
|
||||
self.log('Parsing XML...')
|
||||
doc = safe_xml_fromstring(xml)
|
||||
border_styles = self.convert_borders(doc)
|
||||
for pict in doc.xpath('//rtf:pict[@num]',
|
||||
namespaces={'rtf':'http://rtf2xml.sourceforge.net/'}):
|
||||
num = int(pict.get('num'))
|
||||
name = imap.get(num, None)
|
||||
if name is not None:
|
||||
pict.set('num', name)
|
||||
|
||||
self.log('Converting XML to HTML...')
|
||||
inline_class = InlineClass(self.log)
|
||||
styledoc = safe_xml_fromstring(P('templates/rtf.xsl', data=True), recover=False)
|
||||
extensions = {('calibre', 'inline-class') : inline_class}
|
||||
transform = etree.XSLT(styledoc, extensions=extensions)
|
||||
result = transform(doc)
|
||||
html = u'index.xhtml'
|
||||
with open(html, 'wb') as f:
|
||||
res = as_bytes(transform.tostring(result))
|
||||
# res = res[:100].replace('xmlns:html', 'xmlns') + res[100:]
|
||||
# clean multiple \n
|
||||
res = re.sub(b'\n+', b'\n', res)
|
||||
# Replace newlines inserted by the 'empty_paragraphs' option in rtf2xml with html blank lines
|
||||
# res = re.sub('\s*<body>', '<body>', res)
|
||||
# res = re.sub('(?<=\n)\n{2}',
|
||||
# u'<p>\u00a0</p>\n'.encode('utf-8'), res)
|
||||
f.write(res)
|
||||
self.write_inline_css(inline_class, border_styles)
|
||||
stream.seek(0)
|
||||
mi = get_metadata(stream, 'rtf')
|
||||
if not mi.title:
|
||||
mi.title = _('Unknown')
|
||||
if not mi.authors:
|
||||
mi.authors = [_('Unknown')]
|
||||
opf = OPFCreator(getcwd(), mi)
|
||||
opf.create_manifest([(u'index.xhtml', None)])
|
||||
opf.create_spine([u'index.xhtml'])
|
||||
opf.render(open(u'metadata.opf', 'wb'))
|
||||
return os.path.abspath(u'metadata.opf')
|
||||
|
||||
def postprocess_book(self, oeb, opts, log):
|
||||
for item in oeb.spine:
|
||||
for img in item.data.xpath('//*[local-name()="img" and @src="__REMOVE_ME__"]'):
|
||||
p = img.getparent()
|
||||
idx = p.index(img)
|
||||
p.remove(img)
|
||||
if img.tail:
|
||||
if idx == 0:
|
||||
p.text = (p.text or '') + img.tail
|
||||
else:
|
||||
p[idx-1].tail = (p[idx-1].tail or '') + img.tail
|
||||
40
ebook_converter/ebooks/conversion/plugins/rtf_output.py
Normal file
40
ebook_converter/ebooks/conversion/plugins/rtf_output.py
Normal file
@@ -0,0 +1,40 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os
|
||||
|
||||
from calibre.customize.conversion import OutputFormatPlugin
|
||||
|
||||
|
||||
class RTFOutput(OutputFormatPlugin):
|
||||
|
||||
name = 'RTF Output'
|
||||
author = 'John Schember'
|
||||
file_type = 'rtf'
|
||||
commit_name = 'rtf_output'
|
||||
|
||||
def convert(self, oeb_book, output_path, input_plugin, opts, log):
|
||||
from calibre.ebooks.rtf.rtfml import RTFMLizer
|
||||
|
||||
rtfmlitzer = RTFMLizer(log)
|
||||
content = rtfmlitzer.extract_content(oeb_book, opts)
|
||||
|
||||
close = False
|
||||
if not hasattr(output_path, 'write'):
|
||||
close = True
|
||||
if not os.path.exists(os.path.dirname(output_path)) and os.path.dirname(output_path) != '':
|
||||
os.makedirs(os.path.dirname(output_path))
|
||||
out_stream = lopen(output_path, 'wb')
|
||||
else:
|
||||
out_stream = output_path
|
||||
|
||||
out_stream.seek(0)
|
||||
out_stream.truncate()
|
||||
out_stream.write(content.encode('ascii', 'replace'))
|
||||
|
||||
if close:
|
||||
out_stream.close()
|
||||
122
ebook_converter/ebooks/conversion/plugins/snb_input.py
Normal file
122
ebook_converter/ebooks/conversion/plugins/snb_input.py
Normal file
@@ -0,0 +1,122 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2010, Li Fanxi <lifanxi@freemindworld.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin
|
||||
from calibre.ptempfile import TemporaryDirectory
|
||||
from calibre.utils.filenames import ascii_filename
|
||||
from polyglot.builtins import unicode_type
|
||||
|
||||
HTML_TEMPLATE = '<html><head><meta http-equiv="Content-Type" content="text/html; charset=utf-8"/><title>%s</title></head><body>\n%s\n</body></html>'
|
||||
|
||||
|
||||
def html_encode(s):
|
||||
return s.replace('&', '&').replace('<', '<').replace('>', '>').replace('"', '"').replace("'", ''').replace('\n', '<br/>').replace(' ', ' ') # noqa
|
||||
|
||||
|
||||
class SNBInput(InputFormatPlugin):
|
||||
|
||||
name = 'SNB Input'
|
||||
author = 'Li Fanxi'
|
||||
description = 'Convert SNB files to OEB'
|
||||
file_types = {'snb'}
|
||||
commit_name = 'snb_input'
|
||||
|
||||
options = set()
|
||||
|
||||
def convert(self, stream, options, file_ext, log,
|
||||
accelerators):
|
||||
import uuid
|
||||
|
||||
from calibre.ebooks.oeb.base import DirContainer
|
||||
from calibre.ebooks.snb.snbfile import SNBFile
|
||||
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||
|
||||
log.debug("Parsing SNB file...")
|
||||
snbFile = SNBFile()
|
||||
try:
|
||||
snbFile.Parse(stream)
|
||||
except:
|
||||
raise ValueError("Invalid SNB file")
|
||||
if not snbFile.IsValid():
|
||||
log.debug("Invalid SNB file")
|
||||
raise ValueError("Invalid SNB file")
|
||||
log.debug("Handle meta data ...")
|
||||
from calibre.ebooks.conversion.plumber import create_oebbook
|
||||
oeb = create_oebbook(log, None, options,
|
||||
encoding=options.input_encoding, populate=False)
|
||||
meta = snbFile.GetFileStream('snbf/book.snbf')
|
||||
if meta is not None:
|
||||
meta = safe_xml_fromstring(meta)
|
||||
l = {'title' : './/head/name',
|
||||
'creator' : './/head/author',
|
||||
'language' : './/head/language',
|
||||
'generator': './/head/generator',
|
||||
'publisher': './/head/publisher',
|
||||
'cover' : './/head/cover', }
|
||||
d = {}
|
||||
for item in l:
|
||||
node = meta.find(l[item])
|
||||
if node is not None:
|
||||
d[item] = node.text if node.text is not None else ''
|
||||
else:
|
||||
d[item] = ''
|
||||
|
||||
oeb.metadata.add('title', d['title'])
|
||||
oeb.metadata.add('creator', d['creator'], attrib={'role':'aut'})
|
||||
oeb.metadata.add('language', d['language'].lower().replace('_', '-'))
|
||||
oeb.metadata.add('generator', d['generator'])
|
||||
oeb.metadata.add('publisher', d['publisher'])
|
||||
if d['cover'] != '':
|
||||
oeb.guide.add('cover', 'Cover', d['cover'])
|
||||
|
||||
bookid = unicode_type(uuid.uuid4())
|
||||
oeb.metadata.add('identifier', bookid, id='uuid_id', scheme='uuid')
|
||||
for ident in oeb.metadata.identifier:
|
||||
if 'id' in ident.attrib:
|
||||
oeb.uid = oeb.metadata.identifier[0]
|
||||
break
|
||||
|
||||
with TemporaryDirectory('_snb2oeb', keep=True) as tdir:
|
||||
log.debug('Process TOC ...')
|
||||
toc = snbFile.GetFileStream('snbf/toc.snbf')
|
||||
oeb.container = DirContainer(tdir, log)
|
||||
if toc is not None:
|
||||
toc = safe_xml_fromstring(toc)
|
||||
i = 1
|
||||
for ch in toc.find('.//body'):
|
||||
chapterName = ch.text
|
||||
chapterSrc = ch.get('src')
|
||||
fname = 'ch_%d.htm' % i
|
||||
data = snbFile.GetFileStream('snbc/' + chapterSrc)
|
||||
if data is None:
|
||||
continue
|
||||
snbc = safe_xml_fromstring(data)
|
||||
lines = []
|
||||
for line in snbc.find('.//body'):
|
||||
if line.tag == 'text':
|
||||
lines.append('<p>%s</p>' % html_encode(line.text))
|
||||
elif line.tag == 'img':
|
||||
lines.append('<p><img src="%s" /></p>' % html_encode(line.text))
|
||||
with open(os.path.join(tdir, fname), 'wb') as f:
|
||||
f.write((HTML_TEMPLATE % (chapterName, '\n'.join(lines))).encode('utf-8', 'replace'))
|
||||
oeb.toc.add(ch.text, fname)
|
||||
id, href = oeb.manifest.generate(id='html',
|
||||
href=ascii_filename(fname))
|
||||
item = oeb.manifest.add(id, href, 'text/html')
|
||||
item.html_input_href = fname
|
||||
oeb.spine.add(item, True)
|
||||
i = i + 1
|
||||
imageFiles = snbFile.OutputImageFiles(tdir)
|
||||
for f, m in imageFiles:
|
||||
id, href = oeb.manifest.generate(id='image',
|
||||
href=ascii_filename(f))
|
||||
item = oeb.manifest.add(id, href, m)
|
||||
item.html_input_href = f
|
||||
|
||||
return oeb
|
||||
269
ebook_converter/ebooks/conversion/plugins/snb_output.py
Normal file
269
ebook_converter/ebooks/conversion/plugins/snb_output.py
Normal file
@@ -0,0 +1,269 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2010, Li Fanxi <lifanxi@freemindworld.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os
|
||||
|
||||
from calibre.customize.conversion import OutputFormatPlugin, OptionRecommendation
|
||||
from calibre.ptempfile import TemporaryDirectory
|
||||
from calibre.constants import __appname__, __version__
|
||||
from polyglot.builtins import unicode_type
|
||||
|
||||
|
||||
class SNBOutput(OutputFormatPlugin):
|
||||
|
||||
name = 'SNB Output'
|
||||
author = 'Li Fanxi'
|
||||
file_type = 'snb'
|
||||
commit_name = 'snb_output'
|
||||
|
||||
options = {
|
||||
OptionRecommendation(name='snb_output_encoding', recommended_value='utf-8',
|
||||
level=OptionRecommendation.LOW,
|
||||
help=_('Specify the character encoding of the output document. '
|
||||
'The default is utf-8.')),
|
||||
OptionRecommendation(name='snb_max_line_length',
|
||||
recommended_value=0, level=OptionRecommendation.LOW,
|
||||
help=_('The maximum number of characters per line. This splits on '
|
||||
'the first space before the specified value. If no space is found '
|
||||
'the line will be broken at the space after and will exceed the '
|
||||
'specified value. Also, there is a minimum of 25 characters. '
|
||||
'Use 0 to disable line splitting.')),
|
||||
OptionRecommendation(name='snb_insert_empty_line',
|
||||
recommended_value=False, level=OptionRecommendation.LOW,
|
||||
help=_('Specify whether or not to insert an empty line between '
|
||||
'two paragraphs.')),
|
||||
OptionRecommendation(name='snb_dont_indent_first_line',
|
||||
recommended_value=False, level=OptionRecommendation.LOW,
|
||||
help=_('Specify whether or not to insert two space characters '
|
||||
'to indent the first line of each paragraph.')),
|
||||
OptionRecommendation(name='snb_hide_chapter_name',
|
||||
recommended_value=False, level=OptionRecommendation.LOW,
|
||||
help=_('Specify whether or not to hide the chapter title for each '
|
||||
'chapter. Useful for image-only output (eg. comics).')),
|
||||
OptionRecommendation(name='snb_full_screen',
|
||||
recommended_value=False, level=OptionRecommendation.LOW,
|
||||
help=_('Resize all the images for full screen view. ')),
|
||||
}
|
||||
|
||||
def convert(self, oeb_book, output_path, input_plugin, opts, log):
|
||||
from lxml import etree
|
||||
from calibre.ebooks.snb.snbfile import SNBFile
|
||||
from calibre.ebooks.snb.snbml import SNBMLizer, ProcessFileName
|
||||
|
||||
self.opts = opts
|
||||
from calibre.ebooks.oeb.transforms.rasterize import SVGRasterizer, Unavailable
|
||||
try:
|
||||
rasterizer = SVGRasterizer()
|
||||
rasterizer(oeb_book, opts)
|
||||
except Unavailable:
|
||||
log.warn('SVG rasterizer unavailable, SVG will not be converted')
|
||||
|
||||
# Create temp dir
|
||||
with TemporaryDirectory('_snb_output') as tdir:
|
||||
# Create stub directories
|
||||
snbfDir = os.path.join(tdir, 'snbf')
|
||||
snbcDir = os.path.join(tdir, 'snbc')
|
||||
snbiDir = os.path.join(tdir, 'snbc/images')
|
||||
os.mkdir(snbfDir)
|
||||
os.mkdir(snbcDir)
|
||||
os.mkdir(snbiDir)
|
||||
|
||||
# Process Meta data
|
||||
meta = oeb_book.metadata
|
||||
if meta.title:
|
||||
title = unicode_type(meta.title[0])
|
||||
else:
|
||||
title = ''
|
||||
authors = [unicode_type(x) for x in meta.creator if x.role == 'aut']
|
||||
if meta.publisher:
|
||||
publishers = unicode_type(meta.publisher[0])
|
||||
else:
|
||||
publishers = ''
|
||||
if meta.language:
|
||||
lang = unicode_type(meta.language[0]).upper()
|
||||
else:
|
||||
lang = ''
|
||||
if meta.description:
|
||||
abstract = unicode_type(meta.description[0])
|
||||
else:
|
||||
abstract = ''
|
||||
|
||||
# Process Cover
|
||||
g, m, s = oeb_book.guide, oeb_book.manifest, oeb_book.spine
|
||||
href = None
|
||||
if 'titlepage' not in g:
|
||||
if 'cover' in g:
|
||||
href = g['cover'].href
|
||||
|
||||
# Output book info file
|
||||
bookInfoTree = etree.Element("book-snbf", version="1.0")
|
||||
headTree = etree.SubElement(bookInfoTree, "head")
|
||||
etree.SubElement(headTree, "name").text = title
|
||||
etree.SubElement(headTree, "author").text = ' '.join(authors)
|
||||
etree.SubElement(headTree, "language").text = lang
|
||||
etree.SubElement(headTree, "rights")
|
||||
etree.SubElement(headTree, "publisher").text = publishers
|
||||
etree.SubElement(headTree, "generator").text = __appname__ + ' ' + __version__
|
||||
etree.SubElement(headTree, "created")
|
||||
etree.SubElement(headTree, "abstract").text = abstract
|
||||
if href is not None:
|
||||
etree.SubElement(headTree, "cover").text = ProcessFileName(href)
|
||||
else:
|
||||
etree.SubElement(headTree, "cover")
|
||||
with open(os.path.join(snbfDir, 'book.snbf'), 'wb') as f:
|
||||
f.write(etree.tostring(bookInfoTree, pretty_print=True, encoding='utf-8'))
|
||||
|
||||
# Output TOC
|
||||
tocInfoTree = etree.Element("toc-snbf")
|
||||
tocHead = etree.SubElement(tocInfoTree, "head")
|
||||
tocBody = etree.SubElement(tocInfoTree, "body")
|
||||
outputFiles = {}
|
||||
if oeb_book.toc.count() == 0:
|
||||
log.warn('This SNB file has no Table of Contents. '
|
||||
'Creating a default TOC')
|
||||
first = next(iter(oeb_book.spine))
|
||||
oeb_book.toc.add(_('Start page'), first.href)
|
||||
else:
|
||||
first = next(iter(oeb_book.spine))
|
||||
if oeb_book.toc[0].href != first.href:
|
||||
# The pages before the fist item in toc will be stored as
|
||||
# "Cover Pages".
|
||||
# oeb_book.toc does not support "insert", so we generate
|
||||
# the tocInfoTree directly instead of modifying the toc
|
||||
ch = etree.SubElement(tocBody, "chapter")
|
||||
ch.set("src", ProcessFileName(first.href) + ".snbc")
|
||||
ch.text = _('Cover pages')
|
||||
outputFiles[first.href] = []
|
||||
outputFiles[first.href].append(("", _("Cover pages")))
|
||||
|
||||
for tocitem in oeb_book.toc:
|
||||
if tocitem.href.find('#') != -1:
|
||||
item = tocitem.href.split('#')
|
||||
if len(item) != 2:
|
||||
log.error('Error in TOC item: %s' % tocitem)
|
||||
else:
|
||||
if item[0] in outputFiles:
|
||||
outputFiles[item[0]].append((item[1], tocitem.title))
|
||||
else:
|
||||
outputFiles[item[0]] = []
|
||||
if "" not in outputFiles[item[0]]:
|
||||
outputFiles[item[0]].append(("", tocitem.title + _(" (Preface)")))
|
||||
ch = etree.SubElement(tocBody, "chapter")
|
||||
ch.set("src", ProcessFileName(item[0]) + ".snbc")
|
||||
ch.text = tocitem.title + _(" (Preface)")
|
||||
outputFiles[item[0]].append((item[1], tocitem.title))
|
||||
else:
|
||||
if tocitem.href in outputFiles:
|
||||
outputFiles[tocitem.href].append(("", tocitem.title))
|
||||
else:
|
||||
outputFiles[tocitem.href] = []
|
||||
outputFiles[tocitem.href].append(("", tocitem.title))
|
||||
ch = etree.SubElement(tocBody, "chapter")
|
||||
ch.set("src", ProcessFileName(tocitem.href) + ".snbc")
|
||||
ch.text = tocitem.title
|
||||
|
||||
etree.SubElement(tocHead, "chapters").text = '%d' % len(tocBody)
|
||||
|
||||
with open(os.path.join(snbfDir, 'toc.snbf'), 'wb') as f:
|
||||
f.write(etree.tostring(tocInfoTree, pretty_print=True, encoding='utf-8'))
|
||||
|
||||
# Output Files
|
||||
oldTree = None
|
||||
mergeLast = False
|
||||
lastName = None
|
||||
for item in s:
|
||||
from calibre.ebooks.oeb.base import OEB_DOCS, OEB_IMAGES
|
||||
if m.hrefs[item.href].media_type in OEB_DOCS:
|
||||
if item.href not in outputFiles:
|
||||
log.debug('File %s is unused in TOC. Continue in last chapter' % item.href)
|
||||
mergeLast = True
|
||||
else:
|
||||
if oldTree is not None and mergeLast:
|
||||
log.debug('Output the modified chapter again: %s' % lastName)
|
||||
with open(os.path.join(snbcDir, lastName), 'wb') as f:
|
||||
f.write(etree.tostring(oldTree, pretty_print=True, encoding='utf-8'))
|
||||
mergeLast = False
|
||||
|
||||
log.debug('Converting %s to snbc...' % item.href)
|
||||
snbwriter = SNBMLizer(log)
|
||||
snbcTrees = None
|
||||
if not mergeLast:
|
||||
snbcTrees = snbwriter.extract_content(oeb_book, item, outputFiles[item.href], opts)
|
||||
for subName in snbcTrees:
|
||||
postfix = ''
|
||||
if subName != '':
|
||||
postfix = '_' + subName
|
||||
lastName = ProcessFileName(item.href + postfix + ".snbc")
|
||||
oldTree = snbcTrees[subName]
|
||||
with open(os.path.join(snbcDir, lastName), 'wb') as f:
|
||||
f.write(etree.tostring(oldTree, pretty_print=True, encoding='utf-8'))
|
||||
else:
|
||||
log.debug('Merge %s with last TOC item...' % item.href)
|
||||
snbwriter.merge_content(oldTree, oeb_book, item, [('', _("Start"))], opts)
|
||||
|
||||
# Output the last one if needed
|
||||
log.debug('Output the last modified chapter again: %s' % lastName)
|
||||
if oldTree is not None and mergeLast:
|
||||
with open(os.path.join(snbcDir, lastName), 'wb') as f:
|
||||
f.write(etree.tostring(oldTree, pretty_print=True, encoding='utf-8'))
|
||||
mergeLast = False
|
||||
|
||||
for item in m:
|
||||
if m.hrefs[item.href].media_type in OEB_IMAGES:
|
||||
log.debug('Converting image: %s ...' % item.href)
|
||||
content = m.hrefs[item.href].data
|
||||
# Convert & Resize image
|
||||
self.HandleImage(content, os.path.join(snbiDir, ProcessFileName(item.href)))
|
||||
|
||||
# Package as SNB File
|
||||
snbFile = SNBFile()
|
||||
snbFile.FromDir(tdir)
|
||||
snbFile.Output(output_path)
|
||||
|
||||
    def HandleImage(self, imageData, imagePath):
        """Decode raw image bytes, downscale them to fit the target device
        screen if necessary, and write the result to ``imagePath``.

        :param imageData: raw image bytes taken from the OEB manifest.
        :param imagePath: destination file path; its extension selects the
            output format handed to ``image_to_data``.
        """
        from calibre.utils.img import image_from_data, resize_image, image_to_data
        img = image_from_data(imageData)
        x, y = img.width(), img.height()
        # Pick the target screen size from the conversion options when
        # available, otherwise fall back to a hard-coded default.
        if self.opts:
            if self.opts.snb_full_screen:
                SCREEN_X, SCREEN_Y = self.opts.output_profile.screen_size
            else:
                SCREEN_X, SCREEN_Y = self.opts.output_profile.comic_screen_size
        else:
            SCREEN_X = 540
            SCREEN_Y = 700
        # Handle big image only: images already fitting the screen are
        # written out unscaled.
        if x > SCREEN_X or y > SCREEN_Y:
            xScale = float(x) / SCREEN_X
            yScale = float(y) / SCREEN_Y
            # Scale by the dominant axis so the whole image fits.
            scale = max(xScale, yScale)
            # TODO : intelligent image rotation
            #        img = img.rotate(90)
            #        x,y = y,x
            # NOTE(review): scale is a float, so x // scale and y // scale
            # are floats — presumably resize_image accepts that; confirm.
            img = resize_image(img, x // scale, y // scale)
        # NOTE(review): lopen is a calibre-injected builtin; confirm it is
        # still provided in this extracted project.
        with lopen(imagePath, 'wb') as f:
            f.write(image_to_data(img, fmt=imagePath.rpartition('.')[-1]))
|
||||
|
||||
if __name__ == '__main__':
    # Ad-hoc manual test harness: reads an already-processed OEB book from a
    # hard-coded directory and converts it to an SNB file.  Not part of the
    # normal conversion pipeline; useful only for local debugging.
    from calibre.ebooks.oeb.reader import OEBReader
    from calibre.ebooks.oeb.base import OEBBook
    from calibre.ebooks.conversion.preprocess import HTMLPreProcessor
    from calibre.customize.profiles import HanlinV3Output

    class OptionValues(object):
        # Bare attribute namespace standing in for real conversion options.
        pass

    opts = OptionValues()
    opts.output_profile = HanlinV3Output(None)

    html_preprocessor = HTMLPreProcessor(None, None, opts)
    from calibre.utils.logging import default_log
    oeb = OEBBook(default_log, html_preprocessor)
    reader = OEBReader
    # Populate the OEBBook in place from the fixed input directory.
    reader()(oeb, '/tmp/bbb/processed/')
    SNBOutput(None).convert(oeb, '/tmp/test.snb', None, None, default_log)
39
ebook_converter/ebooks/conversion/plugins/tcr_input.py
Normal file
39
ebook_converter/ebooks/conversion/plugins/tcr_input.py
Normal file
@@ -0,0 +1,39 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
from io import BytesIO
|
||||
|
||||
from calibre.customize.conversion import InputFormatPlugin
|
||||
|
||||
|
||||
class TCRInput(InputFormatPlugin):

    """Input plugin that decompresses a TCR e-book and delegates the
    resulting plain text to the TXT input plugin for conversion to OEB."""

    name = 'TCR Input'
    author = 'John Schember'
    description = 'Convert TCR files to HTML'
    file_types = {'tcr'}
    commit_name = 'tcr_input'

    def convert(self, stream, options, file_ext, log, accelerators):
        """Decompress *stream* and run the TXT input plugin on the result.

        :param stream: open binary stream containing the TCR data.
        :param options: conversion options namespace; TXT-specific options
            missing from it are filled in with their recommended values.
        :return: the OEB book produced by the TXT plugin.
        """
        from calibre.ebooks.compression.tcr import decompress

        log.info('Decompressing text...')
        raw_txt = decompress(stream)

        log.info('Converting text to OEB...')
        stream = BytesIO(raw_txt)

        from calibre.customize.ui import plugin_for_input_format

        txt_plugin = plugin_for_input_format('txt')
        for opt in txt_plugin.options:
            # Fix: only fill in the recommended value when the option is
            # genuinely absent from the options namespace we mutate.  The
            # original tested hasattr(self.options, ...) — self.options is
            # the plugin's OptionRecommendation set, so the test was always
            # False and user-supplied TXT option values were clobbered.
            if not hasattr(options, opt.option.name):
                setattr(options, opt.option.name, opt.recommended_value)

        stream.seek(0)
        return txt_plugin.convert(stream, options,
                'txt', log, accelerators)
||||
56
ebook_converter/ebooks/conversion/plugins/tcr_output.py
Normal file
56
ebook_converter/ebooks/conversion/plugins/tcr_output.py
Normal file
@@ -0,0 +1,56 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os
|
||||
|
||||
from calibre.customize.conversion import OutputFormatPlugin, \
|
||||
OptionRecommendation
|
||||
|
||||
|
||||
class TCROutput(OutputFormatPlugin):

    """Output plugin that serialises an OEB book as TCR compressed text."""

    name = 'TCR Output'
    author = 'John Schember'
    file_type = 'tcr'
    commit_name = 'tcr_output'

    options = {
        OptionRecommendation(name='tcr_output_encoding', recommended_value='utf-8',
            level=OptionRecommendation.LOW,
            help=_('Specify the character encoding of the output document. '
            'The default is utf-8.'))}

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        """Extract plain text from *oeb_book*, TCR-compress it and write it
        to *output_path* (a path or an open writable stream)."""
        from calibre.ebooks.txt.txtml import TXTMLizer
        from calibre.ebooks.compression.tcr import compress

        # output_path may be a filesystem path or a file-like object; only
        # open (and later close) a stream ourselves in the former case.
        close = False
        if not hasattr(output_path, 'write'):
            close = True
            if not os.path.exists(os.path.dirname(output_path)) and os.path.dirname(output_path):
                os.makedirs(os.path.dirname(output_path))
            # Fix: use the plain builtin open() — the original called
            # lopen(), a calibre-injected builtin that is not defined in
            # this extracted project (sibling TXTOutput already uses open()).
            out_stream = open(output_path, 'wb')
        else:
            out_stream = output_path

        # Force TXTMLizer settings appropriate for TCR output: no paragraph
        # flushing, no line wrapping, no indentation.
        opts.flush_paras = False
        opts.max_line_length = 0
        opts.force_max_line_length = False
        opts.indent_paras = False

        writer = TXTMLizer(log)
        txt = writer.extract_content(oeb_book, opts).encode(opts.tcr_output_encoding, 'replace')

        log.info('Compressing text...')
        txt = compress(txt)

        # Overwrite any pre-existing content in the stream.
        out_stream.seek(0)
        out_stream.truncate()
        out_stream.write(txt)

        if close:
            out_stream.close()
||||
308
ebook_converter/ebooks/conversion/plugins/txt_input.py
Normal file
308
ebook_converter/ebooks/conversion/plugins/txt_input.py
Normal file
@@ -0,0 +1,308 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os
|
||||
|
||||
from calibre import _ent_pat, walk, xml_entity_to_unicode
|
||||
from calibre.customize.conversion import InputFormatPlugin, OptionRecommendation
|
||||
from polyglot.builtins import getcwd
|
||||
|
||||
# Human-readable descriptions of the Python-Markdown extensions that can be
# enabled via the ``markdown_extensions`` conversion option.  Keys are
# extension names as understood by the ``markdown`` package; values are
# translated strings shown in the UI and in --help output.
MD_EXTENSIONS = {
    'abbr': _('Abbreviations'),
    'admonition': _('Support admonitions'),
    'attr_list': _('Add attribute to HTML tags'),
    'codehilite': _('Add code highlighting via Pygments'),
    'def_list': _('Definition lists'),
    'extra': _('Enables various common extensions'),
    'fenced_code': _('Alternative code block syntax'),
    'footnotes': _('Footnotes'),
    'legacy_attrs': _('Use legacy element attributes'),
    'legacy_em': _('Use legacy underscore handling for connected words'),
    'meta': _('Metadata in the document'),
    'nl2br': _('Treat newlines as hard breaks'),
    'sane_lists': _('Do not allow mixing list types'),
    'smarty': _('Use markdown\'s internal smartypants parser'),
    'tables': _('Support tables'),
    'toc': _('Generate a table of contents'),
    'wikilinks': _('Wiki style links'),
}
||||
|
||||
|
||||
class TXTInput(InputFormatPlugin):

    """Input plugin converting plain text and its markup dialects (txt,
    txtz, markdown, textile) into an OEB book by first producing HTML and
    then delegating to the HTML input plugin."""

    name = 'TXT Input'
    author = 'John Schember'
    description = 'Convert TXT files to HTML'
    file_types = {'txt', 'txtz', 'text', 'md', 'textile', 'markdown'}
    commit_name = 'txt_input'
    # Data shared with the GUI: option value -> description mappings.
    ui_data = {
        'md_extensions': MD_EXTENSIONS,
        'paragraph_types': {
            'auto': _('Try to auto detect paragraph type'),
            'block': _('Treat a blank line as a paragraph break'),
            'single': _('Assume every line is a paragraph'),
            'print': _('Assume every line starting with 2+ spaces or a tab starts a paragraph'),
            'unformatted': _('Most lines have hard line breaks, few/no blank lines or indents'),
            'off': _('Don\'t modify the paragraph structure'),
        },
        'formatting_types': {
            'auto': _('Automatically decide which formatting processor to use'),
            'plain': _('No formatting'),
            'heuristic': _('Use heuristics to determine chapter headings, italics, etc.'),
            'textile': _('Use the TexTile markup language'),
            'markdown': _('Use the Markdown markup language')
        },
    }

    options = {
        OptionRecommendation(name='formatting_type', recommended_value='auto',
            choices=list(ui_data['formatting_types']),
            help=_('Formatting used within the document.\n'
                   '* auto: {auto}\n'
                   '* plain: {plain}\n'
                   '* heuristic: {heuristic}\n'
                   '* textile: {textile}\n'
                   '* markdown: {markdown}\n'
                   'To learn more about markdown see {url}').format(
                       url='https://daringfireball.net/projects/markdown/', **ui_data['formatting_types'])
            ),
        OptionRecommendation(name='paragraph_type', recommended_value='auto',
            choices=list(ui_data['paragraph_types']),
            help=_('Paragraph structure to assume. The value of "off" is useful for formatted documents such as Markdown or Textile. '
                   'Choices are:\n'
                   '* auto: {auto}\n'
                   '* block: {block}\n'
                   '* single: {single}\n'
                   '* print: {print}\n'
                   '* unformatted: {unformatted}\n'
                   '* off: {off}').format(**ui_data['paragraph_types'])
            ),
        OptionRecommendation(name='preserve_spaces', recommended_value=False,
            help=_('Normally extra spaces are condensed into a single space. '
                'With this option all spaces will be displayed.')),
        OptionRecommendation(name='txt_in_remove_indents', recommended_value=False,
            help=_('Normally extra space at the beginning of lines is retained. '
                   'With this option they will be removed.')),
        OptionRecommendation(name="markdown_extensions", recommended_value='footnotes, tables, toc',
            help=_('Enable extensions to markdown syntax. Extensions are formatting that is not part '
                   'of the standard markdown format. The extensions enabled by default: %default.\n'
                   'To learn more about markdown extensions, see {}\n'
                   'This should be a comma separated list of extensions to enable:\n'
                   ).format('https://python-markdown.github.io/extensions/') + '\n'.join('* %s: %s' % (k, MD_EXTENSIONS[k]) for k in sorted(MD_EXTENSIONS))),
    }

    def shift_file(self, fname, data):
        """Write *data* under self.output_dir as *fname*, appending a
        ``-N`` suffix if needed to avoid clobbering an existing file.
        Returns the path actually written.

        NOTE(review): files created here are never appended to
        self.shifted_files, so the cleanup loop in convert() removes
        nothing — confirm whether that is intentional.
        """
        name, ext = os.path.splitext(fname)
        candidate = os.path.join(self.output_dir, fname)
        c = 0
        while os.path.exists(candidate):
            c += 1
            candidate = os.path.join(self.output_dir, '{}-{}{}'.format(name, c, ext))
        ans = candidate
        with open(ans, 'wb') as f:
            f.write(data)
        return f.name

    def fix_resources(self, html, base_dir):
        """Copy relative local <img src> targets next to the output and
        rewrite the src attributes to point at the copies."""
        from html5_parser import parse
        root = parse(html)
        changed = False
        for img in root.xpath('//img[@src]'):
            src = img.get('src')
            prefix = src.split(':', 1)[0].lower()
            # Only touch local, relative references; leave URLs and
            # absolute paths alone.
            if prefix not in ('file', 'http', 'https', 'ftp') and not os.path.isabs(src):
                src = os.path.join(base_dir, src)
                if os.access(src, os.R_OK):
                    with open(src, 'rb') as f:
                        data = f.read()
                    f = self.shift_file(os.path.basename(src), data)
                    changed = True
                    img.set('src', os.path.basename(f))
        if changed:
            from lxml import etree
            html = etree.tostring(root, encoding='unicode')
        return html

    def convert(self, stream, options, file_ext, log,
                accelerators):
        """Convert the text in *stream* to an OEB book.

        Pipeline: read (unzipping txtz), detect/decode encoding, normalise
        entities and line endings, detect paragraph and formatting type,
        reflow paragraphs, run the chosen markup processor to HTML, then
        hand the HTML to the HTML input plugin.
        """
        from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
        from calibre.ebooks.chardet import detect
        from calibre.utils.zipfile import ZipFile
        from calibre.ebooks.txt.processor import (convert_basic,
                convert_markdown_with_metadata, separate_paragraphs_single_line,
                separate_paragraphs_print_formatted, preserve_spaces,
                detect_paragraph_type, detect_formatting_type,
                normalize_line_endings, convert_textile, remove_indents,
                block_to_single_line, separate_hard_scene_breaks)

        self.log = log
        txt = b''
        log.debug('Reading text from file...')
        length = 0
        base_dir = self.output_dir = getcwd()

        # Extract content from zip archive.
        if file_ext == 'txtz':
            zf = ZipFile(stream)
            zf.extractall('.')

            # Concatenate every text file found in the archive.
            for x in walk('.'):
                if os.path.splitext(x)[1].lower() in ('.txt', '.text'):
                    with open(x, 'rb') as tf:
                        txt += tf.read() + b'\n\n'
        else:
            if getattr(stream, 'name', None):
                base_dir = os.path.dirname(stream.name)
            txt = stream.read()
            if file_ext in {'md', 'textile', 'markdown'}:
                options.formatting_type = {'md': 'markdown'}.get(file_ext, file_ext)
                log.info('File extension indicates particular formatting. '
                        'Forcing formatting type to: %s'%options.formatting_type)
                options.paragraph_type = 'off'

        # Get the encoding of the document.
        if options.input_encoding:
            ienc = options.input_encoding
            log.debug('Using user specified input encoding of %s' % ienc)
        else:
            # Sniff only the first 4KB for speed.
            det_encoding = detect(txt[:4096])
            det_encoding, confidence = det_encoding['encoding'], det_encoding['confidence']
            if det_encoding and det_encoding.lower().replace('_', '-').strip() in (
                    'gb2312', 'chinese', 'csiso58gb231280', 'euc-cn', 'euccn',
                    'eucgb2312-cn', 'gb2312-1980', 'gb2312-80', 'iso-ir-58'):
                # Microsoft Word exports to HTML with encoding incorrectly set to
                # gb2312 instead of gbk. gbk is a superset of gb2312, anyway.
                det_encoding = 'gbk'
            ienc = det_encoding
            log.debug('Detected input encoding as %s with a confidence of %s%%' % (ienc, confidence * 100))
        if not ienc:
            ienc = 'utf-8'
            log.debug('No input encoding specified and could not auto detect using %s' % ienc)
        # Remove BOM from start of txt as its presence can confuse markdown
        import codecs
        for bom in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE, codecs.BOM_UTF8, codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE):
            if txt.startswith(bom):
                txt = txt[len(bom):]
                break
        txt = txt.decode(ienc, 'replace')

        # Replace entities
        txt = _ent_pat.sub(xml_entity_to_unicode, txt)

        # Normalize line endings
        txt = normalize_line_endings(txt)

        # Determine the paragraph type of the document.
        if options.paragraph_type == 'auto':
            options.paragraph_type = detect_paragraph_type(txt)
            if options.paragraph_type == 'unknown':
                log.debug('Could not reliably determine paragraph type using block')
                options.paragraph_type = 'block'
            else:
                log.debug('Auto detected paragraph type as %s' % options.paragraph_type)

        # Detect formatting
        if options.formatting_type == 'auto':
            options.formatting_type = detect_formatting_type(txt)
            log.debug('Auto detected formatting as %s' % options.formatting_type)

        if options.formatting_type == 'heuristic':
            setattr(options, 'enable_heuristics', True)
            setattr(options, 'unwrap_lines', False)
            setattr(options, 'smarten_punctuation', True)

        # Reformat paragraphs to block formatting based on the detected type.
        # We don't check for block because the processor assumes block.
        # single and print at transformed to block for processing.
        if options.paragraph_type == 'single':
            txt = separate_paragraphs_single_line(txt)
        elif options.paragraph_type == 'print':
            txt = separate_hard_scene_breaks(txt)
            txt = separate_paragraphs_print_formatted(txt)
            txt = block_to_single_line(txt)
        elif options.paragraph_type == 'unformatted':
            from calibre.ebooks.conversion.utils import HeuristicProcessor
            # unwrap lines based on punctuation
            docanalysis = DocAnalysis('txt', txt)
            length = docanalysis.line_length(.5)
            preprocessor = HeuristicProcessor(options, log=getattr(self, 'log', None))
            txt = preprocessor.punctuation_unwrap(length, txt, 'txt')
            txt = separate_paragraphs_single_line(txt)
        elif options.paragraph_type == 'block':
            txt = separate_hard_scene_breaks(txt)
            txt = block_to_single_line(txt)

        if getattr(options, 'enable_heuristics', False) and getattr(options, 'dehyphenate', False):
            docanalysis = DocAnalysis('txt', txt)
            if not length:
                length = docanalysis.line_length(.5)
            dehyphenator = Dehyphenator(options.verbose, log=self.log)
            txt = dehyphenator(txt,'txt', length)

        # User requested transformation on the text.
        if options.txt_in_remove_indents:
            txt = remove_indents(txt)

        # Preserve spaces will replace multiple spaces to a space
        # followed by the entity.
        if options.preserve_spaces:
            txt = preserve_spaces(txt)

        # Process the text using the appropriate text processor.
        self.shifted_files = []
        try:
            html = ''
            input_mi = None
            if options.formatting_type == 'markdown':
                log.debug('Running text through markdown conversion...')
                try:
                    input_mi, html = convert_markdown_with_metadata(txt, extensions=[x.strip() for x in options.markdown_extensions.split(',') if x.strip()])
                except RuntimeError:
                    raise ValueError('This txt file has malformed markup, it cannot be'
                        ' converted by calibre. See https://daringfireball.net/projects/markdown/syntax')
                html = self.fix_resources(html, base_dir)
            elif options.formatting_type == 'textile':
                log.debug('Running text through textile conversion...')
                html = convert_textile(txt)
                html = self.fix_resources(html, base_dir)
            else:
                log.debug('Running text through basic conversion...')
                flow_size = getattr(options, 'flow_size', 0)
                html = convert_basic(txt, epub_split_size_kb=flow_size)

            # Run the HTMLized text through the html processing plugin.
            from calibre.customize.ui import plugin_for_input_format
            html_input = plugin_for_input_format('html')
            for opt in html_input.options:
                setattr(options, opt.option.name, opt.recommended_value)
            options.input_encoding = 'utf-8'
            htmlfile = self.shift_file('index.html', html.encode('utf-8'))
            odi = options.debug_pipeline
            # Disable pipeline debugging for the nested HTML conversion.
            options.debug_pipeline = None
            # Generate oeb from html conversion.
            # NOTE(review): the file handle passed here is never closed
            # explicitly — confirm the HTML plugin takes ownership.
            oeb = html_input.convert(open(htmlfile, 'rb'), options, 'html', log, {})
            options.debug_pipeline = odi
        finally:
            for x in self.shifted_files:
                os.remove(x)

        # Set metadata from file.
        if input_mi is None:
            from calibre.customize.ui import get_file_type_metadata
            input_mi = get_file_type_metadata(stream, file_ext)
        from calibre.ebooks.oeb.transforms.metadata import meta_info_to_oeb_metadata
        meta_info_to_oeb_metadata(input_mi, oeb.metadata, log)
        self.html_postprocess_title = input_mi.title

        return oeb

    def postprocess_book(self, oeb, opts, log):
        """Replace placeholder 'Unknown' <title> elements in the spine with
        the title extracted during convert()."""
        for item in oeb.spine:
            if hasattr(item.data, 'xpath'):
                for title in item.data.xpath('//*[local-name()="title"]'):
                    if title.text == _('Unknown'):
                        title.text = self.html_postprocess_title
||||
165
ebook_converter/ebooks/conversion/plugins/txt_output.py
Normal file
165
ebook_converter/ebooks/conversion/plugins/txt_output.py
Normal file
@@ -0,0 +1,165 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2009, John Schember <john@nachtimwald.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os
|
||||
import shutil
|
||||
|
||||
|
||||
from calibre.customize.conversion import OutputFormatPlugin, \
|
||||
OptionRecommendation
|
||||
from calibre.ptempfile import TemporaryDirectory, TemporaryFile
|
||||
|
||||
# Valid values for the TXT output ``newline`` option; 'system' means use
# the newline convention of the OS running the conversion.
NEWLINE_TYPES = ['system', 'unix', 'old_mac', 'windows']
||||
|
||||
|
||||
class TXTOutput(OutputFormatPlugin):

    """Output plugin that serialises an OEB book as plain text, optionally
    using Markdown or Textile markup."""

    name = 'TXT Output'
    author = 'John Schember'
    file_type = 'txt'
    commit_name = 'txt_output'
    # Data shared with the GUI: option value -> description mappings.
    ui_data = {
        'newline_types': NEWLINE_TYPES,
        'formatting_types': {
            'plain': _('Plain text'),
            'markdown': _('Markdown formatted text'),
            'textile': _('TexTile formatted text')
        },
    }

    options = {
        OptionRecommendation(name='newline', recommended_value='system',
            level=OptionRecommendation.LOW,
            short_switch='n', choices=NEWLINE_TYPES,
            help=_('Type of newline to use. Options are %s. Default is \'system\'. '
                'Use \'old_mac\' for compatibility with Mac OS 9 and earlier. '
                'For macOS use \'unix\'. \'system\' will default to the newline '
                'type used by this OS.') % sorted(NEWLINE_TYPES)),
        OptionRecommendation(name='txt_output_encoding', recommended_value='utf-8',
            level=OptionRecommendation.LOW,
            help=_('Specify the character encoding of the output document. '
            'The default is utf-8.')),
        OptionRecommendation(name='inline_toc',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Add Table of Contents to beginning of the book.')),
        OptionRecommendation(name='max_line_length',
            recommended_value=0, level=OptionRecommendation.LOW,
            help=_('The maximum number of characters per line. This splits on '
            'the first space before the specified value. If no space is found '
            'the line will be broken at the space after and will exceed the '
            'specified value. Also, there is a minimum of 25 characters. '
            'Use 0 to disable line splitting.')),
        OptionRecommendation(name='force_max_line_length',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Force splitting on the max-line-length value when no space '
            'is present. Also allows max-line-length to be below the minimum')),
        OptionRecommendation(name='txt_output_formatting',
             recommended_value='plain',
             choices=list(ui_data['formatting_types']),
             help=_('Formatting used within the document.\n'
                    '* plain: {plain}\n'
                    '* markdown: {markdown}\n'
                    '* textile: {textile}').format(**ui_data['formatting_types'])),
        OptionRecommendation(name='keep_links',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Do not remove links within the document. This is only '
            'useful when paired with a txt-output-formatting option that '
            'is not none because links are always removed with plain text output.')),
        OptionRecommendation(name='keep_image_references',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Do not remove image references within the document. This is only '
            'useful when paired with a txt-output-formatting option that '
            'is not none because links are always removed with plain text output.')),
        OptionRecommendation(name='keep_color',
            recommended_value=False, level=OptionRecommendation.LOW,
            help=_('Do not remove font color from output. This is only useful when '
                   'txt-output-formatting is set to textile. Textile is the only '
                   'formatting that supports setting font color. If this option is '
                   'not specified font color will not be set and default to the '
                   'color displayed by the reader (generally this is black).')),
    }

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        """Serialise *oeb_book* as text to *output_path* (a path or an open
        writable stream), using the MLizer selected by
        ``opts.txt_output_formatting``."""
        from calibre.ebooks.txt.txtml import TXTMLizer
        from calibre.utils.cleantext import clean_ascii_chars
        from calibre.ebooks.txt.newlines import specified_newlines, TxtNewlines

        # Pick the writer for the requested markup flavour; the instance is
        # kept on self so subclasses (TXTZOutput) can inspect it afterwards.
        if opts.txt_output_formatting.lower() == 'markdown':
            from calibre.ebooks.txt.markdownml import MarkdownMLizer
            self.writer = MarkdownMLizer(log)
        elif opts.txt_output_formatting.lower() == 'textile':
            from calibre.ebooks.txt.textileml import TextileMLizer
            self.writer = TextileMLizer(log)
        else:
            self.writer = TXTMLizer(log)

        txt = self.writer.extract_content(oeb_book, opts)
        txt = clean_ascii_chars(txt)

        log.debug('\tReplacing newlines with selected type...')
        txt = specified_newlines(TxtNewlines(opts.newline).newline, txt)

        # output_path may be a path or a file-like object; only open (and
        # later close) a stream ourselves in the former case.
        close = False
        if not hasattr(output_path, 'write'):
            close = True
            if not os.path.exists(os.path.dirname(output_path)) and os.path.dirname(output_path) != '':
                os.makedirs(os.path.dirname(output_path))
            out_stream = open(output_path, 'wb')
        else:
            out_stream = output_path

        # Overwrite any pre-existing content in the stream.
        out_stream.seek(0)
        out_stream.truncate()
        out_stream.write(txt.encode(opts.txt_output_encoding, 'replace'))

        if close:
            out_stream.close()
|
||||
|
||||
class TXTZOutput(TXTOutput):

    """Output plugin producing a TXTZ archive: the text, its images and an
    OPF metadata file zipped together."""

    name = 'TXTZ Output'
    author = 'John Schember'
    file_type = 'txtz'

    def convert(self, oeb_book, output_path, input_plugin, opts, log):
        """Build the TXTZ archive at *output_path* from *oeb_book*."""
        from calibre.ebooks.oeb.base import OEB_IMAGES
        from calibre.utils.zipfile import ZipFile
        from lxml import etree

        with TemporaryDirectory('_txtz_output') as tdir:
            # TXT: reuse the parent class to write the text body.
            txt_name = 'index.txt'
            if opts.txt_output_formatting.lower() == 'textile':
                txt_name = 'index.text'
            with TemporaryFile(txt_name) as tf:
                TXTOutput.convert(self, oeb_book, tf, input_plugin, opts, log)
                shutil.copy(tf, os.path.join(tdir, txt_name))

            # Images: copy them under tdir, honouring any renaming done by
            # the markup writer (self.writer.images), and skipping images
            # the writer did not reference.
            for item in oeb_book.manifest:
                if item.media_type in OEB_IMAGES:
                    if hasattr(self.writer, 'images'):
                        path = os.path.join(tdir, 'images')
                        if item.href in self.writer.images:
                            href = self.writer.images[item.href]
                        else:
                            continue
                    else:
                        path = os.path.join(tdir, os.path.dirname(item.href))
                        href = os.path.basename(item.href)
                    if not os.path.exists(path):
                        os.makedirs(path)
                    with open(os.path.join(path, href), 'wb') as imgf:
                        imgf.write(item.data)

            # Metadata
            with open(os.path.join(tdir, 'metadata.opf'), 'wb') as mdataf:
                mdataf.write(etree.tostring(oeb_book.metadata.to_opf1()))

            # Fix: close the archive so all entries and the central
            # directory are flushed; the original leaked the ZipFile handle.
            txtz = ZipFile(output_path, 'w')
            try:
                txtz.add_dir(tdir)
            finally:
                txtz.close()
1330
ebook_converter/ebooks/conversion/plumber.py
Normal file
1330
ebook_converter/ebooks/conversion/plumber.py
Normal file
File diff suppressed because it is too large
Load Diff
646
ebook_converter/ebooks/conversion/preprocess.py
Normal file
646
ebook_converter/ebooks/conversion/preprocess.py
Normal file
@@ -0,0 +1,646 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import functools, re, json
|
||||
from math import ceil
|
||||
|
||||
from calibre import entity_to_unicode, as_unicode
|
||||
from polyglot.builtins import unicode_type, range
|
||||
|
||||
# Matches an XML declaration at the start of a document.
XMLDECL_RE = re.compile(r'^\s*<[?]xml.*?[?]>')
SVG_NS = 'http://www.w3.org/2000/svg'
XLINK_NS = 'http://www.w3.org/1999/xlink'

# Convert entities to unicode, but keep the XML-special characters encoded
# as named entities so the markup stays well formed.
# Fix: in the imported source these values had been HTML-decoded into the
# bare characters themselves ('<' : '<', ...), turning the mapping into an
# identity and re-introducing unescaped <, >, & etc. into the document.
convert_entities = functools.partial(entity_to_unicode,
        result_exceptions={
            '<' : '&lt;',
            '>' : '&gt;',
            "'" : '&apos;',
            '"' : '&quot;',
            '&' : '&amp;',
        })
_span_pat = re.compile('<span.*?</span>', re.DOTALL|re.IGNORECASE)

# Unicode typographic ligatures expanded back to their component letters.
# The commented-out entries are linguistically meaningful characters that
# must NOT be expanded.
LIGATURES = {
        # '\u00c6': 'AE',
        # '\u00e6': 'ae',
        # '\u0152': 'OE',
        # '\u0153': 'oe',
        # '\u0132': 'IJ',
        # '\u0133': 'ij',
        # '\u1D6B': 'ue',
        '\uFB00': 'ff',
        '\uFB01': 'fi',
        '\uFB02': 'fl',
        '\uFB03': 'ffi',
        '\uFB04': 'ffl',
        '\uFB05': 'ft',
        '\uFB06': 'st',
        }

# Single alternation pattern matching any of the ligatures above.
_ligpat = re.compile('|'.join(LIGATURES))
||||
|
||||
|
||||
def sanitize_head(match):
    """Rebuild a matched ``<head>`` body with stray <span> fragments
    stripped out."""
    content = _span_pat.sub('', match.group(1))
    return '<head>\n%s\n</head>' % content
||||
|
||||
|
||||
def chap_head(match):
    """Format a detected chapter heading (group 'chap') and its optional
    title (group 'title') as HTML heading markup."""
    chap = match.group('chap')
    title = match.group('title')
    if title:
        return '<h1>' + chap + '</h1>\n<h3>' + title + '</h3>\n'
    return '<h1>' + chap + '</h1><br/>\n'
||||
|
||||
|
||||
def wrap_lines(match):
    """Join hard-wrapped lines: keep a captured italic fragment (group
    'ital') followed by a space, otherwise collapse the break to a single
    space."""
    ital = match.group('ital')
    return (ital + ' ') if ital else ' '
||||
|
||||
|
||||
def smarten_punctuation(html, log=None):
    """Run smartypants over *html* to produce typographic quotes and
    dashes, shielding HTML comment delimiters from mangling, then
    substitute entities in the result."""
    from calibre.utils.smartypants import smartyPants
    from calibre.ebooks.chardet import substitute_entites
    from calibre.ebooks.conversion.utils import HeuristicProcessor
    from uuid import uuid4

    preprocessor = HeuristicProcessor(log=log)
    # Hide comment delimiters behind unique placeholders so smartyPants
    # cannot alter them; restore them afterwards.
    open_marker = 'calibre-smartypants-' + unicode_type(uuid4())
    close_marker = 'calibre-smartypants-' + unicode_type(uuid4())
    html = html.replace('<!--', open_marker)
    html = html.replace('-->', close_marker)
    html = preprocessor.fix_nbsp_indents(html)
    html = smartyPants(html)
    html = html.replace(open_marker, '<!--')
    html = html.replace(close_marker, '-->')
    return substitute_entites(html)
||||
|
||||
|
||||
class DocAnalysis(object):
    """Text-structure analysis helpers.

    ``format`` selects how individual lines are extracted from ``raw``
    ('html', 'pdf', 'spanned_html' or 'txt'); the extracted lines are
    used to estimate wrapping line length. Blank lines are excluded
    from the statistics.
    """

    def __init__(self, format='html', raw=''):
        # Normalise non-breaking spaces so they don't skew measurements.
        raw = raw.replace('\xa0', ' ')
        if format == 'html':
            splitter = re.compile(r'(?<=<p)(?![^>]*>\s*</p>).*?(?=</p>)', re.DOTALL)
        elif format == 'pdf':
            splitter = re.compile(r'(?<=<br>)(?!\s*<br>).*?(?=<br>)', re.DOTALL)
        elif format == 'spanned_html':
            splitter = re.compile('(?<=<span).*?(?=</span>)', re.DOTALL)
        elif format == 'txt':
            splitter = re.compile('.*?\n')
        self.lines = splitter.findall(raw)

    def line_length(self, percent):
        """Return a representative line length for the document.

        ``percent`` is a decimal 0-1 selecting how far into the ordered
        list of distinct line lengths to look (0.5 is the median).
        Lengths above twice the average of the distinct lengths are
        discarded as outliers. Returns 0 for an empty document.
        """
        lengths = [len(line) for line in self.lines if line]
        if not lengths:
            return 0

        distinct = list(set(lengths))
        mean = sum(distinct) / len(distinct)
        cutoff = ceil(mean * 2)
        kept = [length for length in sorted(distinct) if length <= cutoff]

        percent = min(max(percent, 0), 1)
        # NB: with small percent this index can be -1, selecting the
        # largest kept length — preserved historical behaviour.
        return kept[int(len(kept) * percent) - 1]

    def line_histogram(self, percent):
        """Return True when at least *percent* of the lines fall into a
        single 100-character bucket — the signature of a document with
        hard line breaks (most lines end up in 1-2 buckets)."""
        min_len = 20      # ignore lines under 20 chars (typically spaces)
        max_len = 1900    # discard larger lines to stay within range
        buckets = 20      # one bucket per 100 characters

        counts = [0] * buckets
        for line in self.lines:
            size = len(line)
            if min_len < size < max_len:
                counts[size // 100] += 1

        total = len(self.lines)
        fractions = [count / total for count in counts] if total > 0 else []
        peak = max(fractions) if fractions else 0
        return peak >= percent
|
||||
|
||||
|
||||
class Dehyphenator(object):
    '''
    Analyzes words to determine whether hyphens should be retained/removed. Uses the document
    itself as a dictionary. This method handles all languages along with uncommon, made-up, and
    scientific words. The primary disadvantage is that words appearing only once in the document
    retain hyphens.
    '''

    def __init__(self, verbose=0, log=None):
        # log: callable used for debug output; verbose > 2 enables tracing.
        self.log = log
        self.verbose = verbose
        # Add common suffixes to the regex below to increase the likelihood of a match -
        # don't add suffixes which are also complete words, such as 'able' or 'sex'
        # only remove if it's not already the point of hyphenation
        # NOTE(review): the '||' below leaves an empty alternative in the
        # alternation, so the suffix pattern can also match the empty
        # string at a position — confirm this is intended.
        self.suffix_string = (
            "((ed)?ly|'?e?s||a?(t|s)?ion(s|al(ly)?)?|ings?|er|(i)?ous|"
            "(i|a)ty|(it)?ies|ive|gence|istic(ally)?|(e|a)nce|m?ents?|ism|ated|"
            "(e|u)ct(ed)?|ed|(i|ed)?ness|(e|a)ncy|ble|ier|al|ex|ian)$")
        # Does the second half of a hyphenated word *start* with a known suffix?
        self.suffixes = re.compile(r"^%s" % self.suffix_string, re.IGNORECASE)
        # Strip a known suffix from the joined word before dictionary lookup.
        self.removesuffixes = re.compile(r"%s" % self.suffix_string, re.IGNORECASE)
        # remove prefixes if the prefix was not already the point of hyphenation
        self.prefix_string = '^(dis|re|un|in|ex)'
        self.prefixes = re.compile(r'%s$' % self.prefix_string, re.IGNORECASE)
        self.removeprefix = re.compile(r'%s' % self.prefix_string, re.IGNORECASE)

    def dehyphenate(self, match):
        # Regex replacement callback: decide between the hyphenated and the
        # joined form of a word split across a line/page break by looking
        # the joined (affix-stripped) form up in the document text
        # (self.html, set by __call__).
        firsthalf = match.group('firstpart')
        secondhalf = match.group('secondpart')
        try:
            wraptags = match.group('wraptags')
        except:
            # Not all formats capture intervening markup.
            wraptags = ''
        hyphenated = unicode_type(firsthalf) + "-" + unicode_type(secondhalf)
        dehyphenated = unicode_type(firsthalf) + unicode_type(secondhalf)
        # Strip a recognised suffix unless the hyphen already sat at the
        # suffix boundary.
        if self.suffixes.match(secondhalf) is None:
            lookupword = self.removesuffixes.sub('', dehyphenated)
        else:
            lookupword = dehyphenated
        # Likewise strip a recognised prefix from sufficiently long words.
        if len(firsthalf) > 4 and self.prefixes.match(firsthalf) is None:
            lookupword = self.removeprefix.sub('', lookupword)
        if self.verbose > 2:
            self.log("lookup word is: "+lookupword+", orig is: " + hyphenated)
        try:
            searchresult = self.html.find(lookupword.lower())
        except:
            # self.html not searchable — keep the hyphenated form.
            return hyphenated
        if self.format == 'html_cleanup' or self.format == 'txt_cleanup':
            if self.html.find(lookupword) != -1 or searchresult != -1:
                if self.verbose > 2:
                    self.log("     Cleanup:returned dehyphenated word: " + dehyphenated)
                return dehyphenated
            elif self.html.find(hyphenated) != -1:
                if self.verbose > 2:
                    self.log("        Cleanup:returned hyphenated word: " + hyphenated)
                return hyphenated
            else:
                if self.verbose > 2:
                    self.log("            Cleanup:returning original text "+firsthalf+" + linefeed "+secondhalf)
                # Unknown word in cleanup mode: keep both halves joined by
                # an em-dash plus the original intervening markup.
                return firsthalf+'\u2014'+wraptags+secondhalf

        else:
            if self.format == 'individual_words' and len(firsthalf) + len(secondhalf) <= 6:
                if self.verbose > 2:
                    self.log("too short, returned hyphenated word: " + hyphenated)
                return hyphenated
            if len(firsthalf) <= 2 and len(secondhalf) <= 2:
                if self.verbose > 2:
                    self.log("too short, returned hyphenated word: " + hyphenated)
                return hyphenated
            if self.html.find(lookupword) != -1 or searchresult != -1:
                if self.verbose > 2:
                    self.log("     returned dehyphenated word: " + dehyphenated)
                return dehyphenated
            else:
                if self.verbose > 2:
                    self.log("          returned hyphenated word: " + hyphenated)
                return hyphenated

    def __call__(self, html, format, length=1):
        # Apply dehyphenation to *html*. ``format`` selects the pattern used
        # to find candidate split words; ``length`` is the minimum number of
        # characters that must precede the hyphen (from line-length analysis).
        # The alternation (-|‐) matches ASCII '-' and U+2010 HYPHEN.
        self.html = html
        self.format = format
        if format == 'html':
            intextmatch = re.compile((
                r'(?<=.{%i})(?P<firstpart>[^\W\-]+)(-|‐)\s*(?=<)(?P<wraptags>(</span>)?'
                r'\s*(</[iubp]>\s*){1,2}(?P<up2threeblanks><(p|div)[^>]*>\s*(<p[^>]*>\s*</p>\s*)'
                r'?</(p|div)>\s+){0,3}\s*(<[iubp][^>]*>\s*){1,2}(<span[^>]*>)?)\s*(?P<secondpart>[\w\d]+)') % length)
        elif format == 'pdf':
            intextmatch = re.compile((
                r'(?<=.{%i})(?P<firstpart>[^\W\-]+)(-|‐)\s*(?P<wraptags><p>|'
                r'</[iub]>\s*<p>\s*<[iub]>)\s*(?P<secondpart>[\w\d]+)')% length)
        elif format == 'txt':
            intextmatch = re.compile(
                '(?<=.{%i})(?P<firstpart>[^\\W\\-]+)(-|‐)(\u0020|\u0009)*(?P<wraptags>(\n(\u0020|\u0009)*)+)(?P<secondpart>[\\w\\d]+)'% length)
        elif format == 'individual_words':
            intextmatch = re.compile(
                r'(?!<)(?P<firstpart>[^\W\-]+)(-|‐)\s*(?P<secondpart>\w+)(?![^<]*?>)', re.UNICODE)
        elif format == 'html_cleanup':
            intextmatch = re.compile(
                r'(?P<firstpart>[^\W\-]+)(-|‐)\s*(?=<)(?P<wraptags></span>\s*(</[iubp]>'
                r'\s*<[iubp][^>]*>\s*)?<span[^>]*>|</[iubp]>\s*<[iubp][^>]*>)?\s*(?P<secondpart>[\w\d]+)')
        elif format == 'txt_cleanup':
            intextmatch = re.compile(
                r'(?P<firstpart>[^\W\-]+)(-|‐)(?P<wraptags>\s+)(?P<secondpart>[\w\d]+)')

        html = intextmatch.sub(self.dehyphenate, html)
        return html
|
||||
|
||||
|
||||
class CSSPreProcessor(object):
    """Cleans stylesheet text before conversion: strips broken
    Microsoft-generated declarations and optionally prepends the
    XHTML CSS namespace rule."""

    # Remove some of the broken CSS Microsoft products create:
    # proprietary mso-*/panose-*/... declarations.
    MS_PAT = re.compile(r'''
        (?P<start>^|;|\{)\s*    # The end of the previous rule or block start
        (%s).+?                 # The invalid selectors
        (?P<end>$|;|\})         # The end of the declaration
        '''%'mso-|panose-|text-underline|tab-interval',
        re.MULTILINE|re.IGNORECASE|re.VERBOSE)

    def ms_sub(self, match):
        """Replacement callback: drop the matched MS declaration while
        keeping the surrounding delimiters sensible."""
        tail = match.group('end')
        try:
            head = match.group('start')
        except Exception:
            head = ''
        if tail == ';':
            tail = ''
        return head + tail

    def __call__(self, data, add_namespace=False):
        from calibre.ebooks.oeb.base import XHTML_CSS_NAMESPACE
        data = self.MS_PAT.sub(self.ms_sub, data)
        if not add_namespace:
            return data

        # Remove comments first: the namespace logic below would break on
        # commented lines preceding the first @import or @charset rule.
        # The conversion removes all stylesheets anyway, so nothing is lost.
        data = re.sub(r'/\*.*?\*/', '', data, flags=re.DOTALL)

        out = []
        inserted = False
        for line in data.splitlines():
            stripped = line.lstrip()
            # Insert the namespace rule before the first line that is not
            # blank and not an @import/@charset directive.
            if (not inserted and stripped and
                    not stripped.startswith('@import') and
                    not stripped.startswith('@charset')):
                out.append(XHTML_CSS_NAMESPACE.strip())
                inserted = True
            out.append(line)

        return '\n'.join(out)
|
||||
|
||||
|
||||
def accent_regex(accent_maps, letter_before=False):
    """Build a ``(pattern, substitution)`` pair that collapses a
    free-standing accent character plus a base letter into the
    precomposed accented letter.

    *accent_maps* maps each accent char to a ``"bases:accented"`` spec
    string, e.g. ``{'`': 'aA:àÀ'}``; each spec is parsed in place into
    a base->accented dict (the closure below relies on this). With
    ``letter_before=True`` the letter precedes the accent in the text.
    Raises ValueError when a spec's halves differ in length.
    """
    accents = set()
    bases = set()

    for accent_char in tuple(accent_maps):
        accents.add(accent_char)
        plain, accented = accent_maps[accent_char].split(':', 1)
        if len(plain) != len(accented):
            raise ValueError('Invalid mapping for: {} -> {}'.format(plain, accented))
        mapping = dict(zip(plain, accented))
        accent_maps[accent_char] = mapping
        bases.update(mapping)

    if letter_before:
        char_classes = ''.join(bases), ''.join(accents)
        accent_idx, letter_idx = 2, 1
    else:
        char_classes = ''.join(accents), ''.join(bases)
        accent_idx, letter_idx = 1, 2

    # An optional <br> may separate the two characters (pdftohtml output).
    pat = re.compile(r'([{}])\s*(?:<br[^>]*>){{0,1}}\s*([{}])'.format(*char_classes), re.UNICODE)

    def sub(m):
        mapping = accent_maps[m.group(accent_idx)]
        return mapping.get(m.group(letter_idx)) or m.group()

    return pat, sub
|
||||
|
||||
|
||||
def html_preprocess_rules():
    """Lazily build (and cache on the function object) the generic
    HTML pre-processing rules applied to every document.

    Returns a list of ``(compiled_pattern, replacement)`` tuples.
    """
    rules = getattr(html_preprocess_rules, 'ans', None)
    if rules is None:
        rules = html_preprocess_rules.ans = [
            # Huge runs of contiguous whitespace make the later regexes
            # pathologically slow; collapse them first.
            (re.compile(r'\s{10000,}'), ''),
            # Some HTML generators (Frontpage among them) put all sorts
            # of junk inside <head>, which confuses lxml.
            (re.compile(r'<head[^>]*>\n*(.*?)\n*</head>', re.IGNORECASE|re.DOTALL),
             sanitize_head),
            # Convert all entities up front, since lxml handles them poorly.
            (re.compile(r'&(\S+?);'), convert_entities),
            # Drop MS Word's <![if]/<![endif] conditional markers.
            (re.compile(r'</{0,1}!\[(end){0,1}if\]{0,1}>', re.IGNORECASE), ''),
        ]
    return rules
|
||||
|
||||
|
||||
def pdftohtml_rules():
    """Lazily build (and cache on the function object) the cleanup rules
    for pdftohtml output: re-attach free-standing accent characters,
    strip browser headers/footers and <hr> tags, and convert <br> line
    breaks into paragraphs."""
    ans = getattr(pdftohtml_rules, 'ans', None)
    if ans is None:
        ans = pdftohtml_rules.ans = [
            # Accent characters rendered as separate glyphs before the
            # letter they modify.
            accent_regex({
                '¨': 'aAeEiIoOuU:äÄëËïÏöÖüÜ',
                '`': 'aAeEiIoOuU:àÀèÈìÌòÒùÙ',
                '´': 'aAcCeEiIlLoOnNrRsSuUzZ:áÁćĆéÉíÍĺĹóÓńŃŕŔśŚúÚźŹ',
                'ˆ': 'aAeEiIoOuU:âÂêÊîÎôÔûÛ',
                '¸': 'cC:çÇ',
                '˛': 'aAeE:ąĄęĘ',
                '˙': 'zZ:żŻ',
                'ˇ': 'cCdDeElLnNrRsStTzZ:čČďĎěĚľĽňŇřŘšŠťŤžŽ',
                '°': 'uU:ůŮ',
            }),

            # Grave accents rendered after the letter.
            accent_regex({'`': 'aAeEiIoOuU:àÀèÈìÌòÒùÙ'}, letter_before=True),

            # If pdf printed from a browser then the header/footer has a reliable pattern
            (re.compile(r'((?<=</a>)\s*file:/{2,4}[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''),

            # Center separator lines
            (re.compile(r'<br>\s*(?P<break>([*#•✦=] *){3,})\s*<br>'), lambda match: '<p>\n<p style="text-align:center">' + match.group('break') + '</p>'),

            # Remove <hr> tags
            (re.compile(r'<hr.*?>', re.IGNORECASE), ''),

            # Remove gray background
            (re.compile(r'<BODY[^<>]+>'), '<BODY>'),

            # Convert line breaks to paragraphs
            (re.compile(r'<br[^>]*>\s*'), '</p>\n<p>'),
            (re.compile(r'<body[^>]*>\s*'), '<body>\n<p>'),
            (re.compile(r'\s*</body>'), '</p>\n</body>'),

            # Clean up spaces
            (re.compile(r'(?<=[\.,;\?!”"\'])[\s^ ]*(?=<)'), ' '),
            # Add space before and after italics
            (re.compile(r'(?<!“)<i>'), ' <i>'),
            (re.compile(r'</i>(?=\w)'), '</i> '),
        ]
    return ans
|
||||
|
||||
|
||||
def book_designer_rules():
    """Lazily build (and cache on the function object) the substitution
    rules used to clean up HTML produced by Book Designer.

    Returns a list of ``(compiled_pattern, replacement)`` tuples.
    """
    ans = getattr(book_designer_rules, 'ans', None)
    if ans is None:
        ans = book_designer_rules.ans = [
            # HR -> hard page break
            (re.compile('<hr>', re.IGNORECASE),
             lambda match : '<span style="page-break-after:always"> </span>'),
            # Create header tags from the BookTitle/BookAuthor markers,
            # preserving an explicit align= attribute when present.
            (re.compile(r'<h2[^><]*?id=BookTitle[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?</h2>', re.IGNORECASE),
             lambda match : '<h1 id="BookTitle" align="%s">%s</h1>'%(match.group(2) if match.group(2) else 'center', match.group(3))),
            (re.compile(r'<h2[^><]*?id=BookAuthor[^><]*?(align=)*(?(1)(\w+))*[^><]*?>[^><]*?</h2>', re.IGNORECASE),
             lambda match : '<h2 id="BookAuthor" align="%s">%s</h2>'%(match.group(2) if match.group(2) else 'center', match.group(3))),
            # Title / subtitle spans become real headings.
            (re.compile('<span[^><]*?id=title[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
             lambda match : '<h2 class="title">%s</h2>'%(match.group(1),)),
            (re.compile('<span[^><]*?id=subtitle[^><]*?>(.*?)</span>', re.IGNORECASE|re.DOTALL),
             lambda match : '<h3 class="subtitle">%s</h3>'%(match.group(1),)),
        ]
    # BUG FIX: this previously returned None, so callers always received
    # no rules and the function-object cache was useless.
    return ans
|
||||
|
||||
|
||||
class HTMLPreProcessor(object):
    """Applies regex-based cleanup to raw HTML before parsing.

    The rule set is selected from the detected source (Baen, Book
    Designer, pdftohtml output), augmented with user-supplied
    search/replace expressions from ``extra_opts``, and applied in
    order.
    """

    def __init__(self, log=None, extra_opts=None, regex_wizard_callback=None):
        self.log = log
        self.extra_opts = extra_opts
        # Called with (current_href, html) after the generic rules run.
        self.regex_wizard_callback = regex_wizard_callback
        self.current_href = None

    def is_baen(self, src):
        """True when *src* carries a Baen publisher <meta> tag."""
        return re.compile(r'<meta\s+name="Publisher"\s+content=".*?Baen.*?"',
                          re.IGNORECASE).search(src) is not None

    def is_book_designer(self, raw):
        """True when *raw* contains Book Designer's BookTitle marker."""
        return re.search('<H2[^><]*id=BookTitle', raw) is not None

    def is_pdftohtml(self, src):
        """True when *src* starts with calibre's pdftohtml marker comment."""
        return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]

    def __call__(self, html, remove_special_chars=None,
            get_preprocess_html=False):
        """Clean *html* and return the result.

        remove_special_chars: optional compiled regex whose matches are
        stripped first. get_preprocess_html: return early, after only the
        generic rules, for the regex wizard preview.
        """
        if remove_special_chars is not None:
            html = remove_special_chars.sub('', html)
        html = html.replace('\0', '')
        is_pdftohtml = self.is_pdftohtml(html)
        # Pick the source-specific rule set.
        if self.is_baen(html):
            rules = []
        elif self.is_book_designer(html):
            rules = book_designer_rules()
        elif is_pdftohtml:
            rules = pdftohtml_rules()
        else:
            rules = []

        start_rules = []

        if not getattr(self.extra_opts, 'keep_ligatures', False):
            html = _ligpat.sub(lambda m:LIGATURES[m.group()], html)

        user_sr_rules = {}
        # Function for processing search and replace

        def do_search_replace(search_pattern, replace_txt):
            from calibre.ebooks.conversion.search_replace import compile_regular_expression
            try:
                search_re = compile_regular_expression(search_pattern)
                if not replace_txt:
                    replace_txt = ''
                rules.insert(0, (search_re, replace_txt))
                user_sr_rules[(search_re, replace_txt)] = search_pattern
            except Exception as e:
                # BUG FIX: previously logged the outer loop variable
                # ``search`` (the option *name*, and possibly unbound when
                # called from the JSON loop below); report the pattern
                # that actually failed to compile.
                self.log.error('Failed to parse %r regexp because %s' %
                        (search_pattern, as_unicode(e)))

        # search / replace using the sr?_search / sr?_replace options
        for i in range(1, 4):
            search, replace = 'sr%d_search'%i, 'sr%d_replace'%i
            search_pattern = getattr(self.extra_opts, search, '')
            replace_txt = getattr(self.extra_opts, replace, '')
            if search_pattern:
                do_search_replace(search_pattern, replace_txt)

        # multi-search / replace using the search_replace option
        search_replace = getattr(self.extra_opts, 'search_replace', None)
        if search_replace:
            search_replace = json.loads(search_replace)
            for search_pattern, replace_txt in reversed(search_replace):
                do_search_replace(search_pattern, replace_txt)

        end_rules = []
        # delete soft hyphens - moved here so it's executed after header/footer removal
        if is_pdftohtml:
            # unwrap/delete soft hyphens ([\xad] is U+00AD SOFT HYPHEN)
            end_rules.append((re.compile(
                r'[\xad](</p>\s*<p>\s*)+\s*(?=[\[a-z\d])'), lambda match: ''))
            # unwrap/delete soft hyphens with formatting
            end_rules.append((re.compile(
                r'[\xad]\s*(</(i|u|b)>)+(</p>\s*<p>\s*)+\s*(<(i|u|b)>)+\s*(?=[\[a-z\d])'), lambda match: ''))

        length = -1
        if getattr(self.extra_opts, 'unwrap_factor', 0.0) > 0.01:
            docanalysis = DocAnalysis('pdf', html)
            length = docanalysis.line_length(getattr(self.extra_opts, 'unwrap_factor'))
            if length:
                # unwrap em/en dashes
                end_rules.append((re.compile(
                    r'(?<=.{%i}[–—])\s*<p>\s*(?=[\[a-z\d])' % length), lambda match: ''))
                end_rules.append(
                    # Un wrap using punctuation
                    (re.compile((
                        r'(?<=.{%i}([a-zäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\\IAß]'
                        r'|(?<!\&\w{4});))\s*(?P<ital></(i|b|u)>)?\s*(</p>\s*<p>\s*)+\s*(?=(<(i|b|u)>)?'
                        r'\s*[\w\d$(])') % length, re.UNICODE), wrap_lines),
                )

        for rule in html_preprocess_rules() + start_rules:
            html = rule[0].sub(rule[1], html)

        if self.regex_wizard_callback is not None:
            self.regex_wizard_callback(self.current_href, html)

        if get_preprocess_html:
            return html

        def dump(raw, where):
            # Debug helper: write intermediate HTML into
            # <debug_pipeline>/input/<where>/NNNN.html when enabled.
            import os
            dp = getattr(self.extra_opts, 'debug_pipeline', None)
            if dp and os.path.exists(dp):
                odir = os.path.join(dp, 'input')
                if os.path.exists(odir):
                    odir = os.path.join(odir, where)
                    if not os.path.exists(odir):
                        os.makedirs(odir)
                    name, i = None, 0
                    while not name or os.path.exists(os.path.join(odir, name)):
                        i += 1
                        name = '%04d.html'%i
                    with open(os.path.join(odir, name), 'wb') as f:
                        f.write(raw.encode('utf-8'))

        # dump(html, 'pre-preprocess')

        for rule in rules + end_rules:
            try:
                html = rule[0].sub(rule[1], html)
            except Exception as e:
                # A bad user rule must not kill the conversion.
                if rule in user_sr_rules:
                    self.log.error(
                        'User supplied search & replace rule: %s -> %s '
                        'failed with error: %s, ignoring.'%(
                            user_sr_rules[rule], rule[1], e))
                else:
                    raise

        if is_pdftohtml and length > -1:
            # Dehyphenate
            dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
            html = dehyphenator(html,'html', length)

        if is_pdftohtml:
            from calibre.ebooks.conversion.utils import HeuristicProcessor
            pdf_markup = HeuristicProcessor(self.extra_opts, None)
            totalwords = 0
            if pdf_markup.get_word_count(html) > 7000:
                html = pdf_markup.markup_chapters(html, totalwords, True)

        # dump(html, 'post-preprocess')

        # Handle broken XHTML w/ SVG (ugh)
        if 'svg:' in html and SVG_NS not in html:
            html = html.replace(
                '<html', '<html xmlns:svg="%s"' % SVG_NS, 1)
        if 'xlink:' in html and XLINK_NS not in html:
            html = html.replace(
                '<html', '<html xmlns:xlink="%s"' % XLINK_NS, 1)

        html = XMLDECL_RE.sub('', html)

        if getattr(self.extra_opts, 'asciiize', False):
            from calibre.utils.localization import get_udc
            from calibre.utils.mreplace import MReplace
            unihandecoder = get_udc()
            mr = MReplace(data={'«':'<'*3, '»':'>'*3})
            html = mr.mreplace(html)
            html = unihandecoder.decode(html)

        if getattr(self.extra_opts, 'enable_heuristics', False):
            from calibre.ebooks.conversion.utils import HeuristicProcessor
            preprocessor = HeuristicProcessor(self.extra_opts, self.log)
            html = preprocessor(html)

        if is_pdftohtml:
            html = html.replace('<!-- created by calibre\'s pdftohtml -->', '')

        if getattr(self.extra_opts, 'smarten_punctuation', False):
            html = smarten_punctuation(html, self.log)

        try:
            unsupported_unicode_chars = self.extra_opts.output_profile.unsupported_unicode_chars
        except AttributeError:
            unsupported_unicode_chars = ''
        if unsupported_unicode_chars:
            # Replace characters the output device cannot render with
            # ASCII transliterations.
            from calibre.utils.localization import get_udc
            unihandecoder = get_udc()
            for char in unsupported_unicode_chars:
                asciichar = unihandecoder.decode(char)
                html = html.replace(char, asciichar)

        return html
|
||||
881
ebook_converter/ebooks/conversion/utils.py
Normal file
881
ebook_converter/ebooks/conversion/utils.py
Normal file
@@ -0,0 +1,881 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import re
|
||||
from math import ceil
|
||||
from calibre.ebooks.conversion.preprocess import DocAnalysis, Dehyphenator
|
||||
from calibre.utils.logging import default_log
|
||||
from calibre.utils.wordcount import get_wordcount_obj
|
||||
from polyglot.builtins import unicode_type
|
||||
|
||||
|
||||
class HeuristicProcessor(object):
|
||||
|
||||
    def __init__(self, extra_opts=None, log=None):
        # Fall back to the shared default log when none is supplied.
        self.log = default_log if log is None else log
        self.html_preprocess_sections = 0    # headings marked so far
        self.found_indents = 0               # indents inserted so far
        self.extra_opts = extra_opts
        self.deleted_nbsps = False
        self.totalwords = 0
        self.min_chapters = 1                # minimum chapters expected
        self.chapters_no_title = 0
        self.chapters_with_title = 0
        self.blanks_deleted = False
        self.blanks_between_paragraphs = False
        # Contents of a <p> element.
        self.linereg = re.compile('(?<=<p).*?(?=</p>)', re.IGNORECASE|re.DOTALL)
        # Empty paragraphs, excluding deliberate soft-break/whitespace markers.
        self.blankreg = re.compile(r'\s*(?P<openline><p(?!\sclass=\"(softbreak|whitespace)\")[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
        # Any empty paragraph at all.
        self.anyblank = re.compile(r'\s*(?P<openline><p[^>]*>)\s*(?P<closeline></p>)', re.IGNORECASE)
        # Two or more consecutive blank paragraphs not followed by a heading.
        self.multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>(\s*<div[^>]*>\s*</div>\s*)*){2,}(?!\s*<h\d)', re.IGNORECASE)
        # Two or more consecutive blank paragraphs, regardless of what follows.
        self.any_multi_blank = re.compile(r'(\s*<p[^>]*>\s*</p>(\s*<div[^>]*>\s*</div>\s*)*){2,}', re.IGNORECASE)
        # Building blocks for line-matching regexes: an opening p/div with up
        # to three nested inline tags, and the matching close.
        self.line_open = (
            r"<(?P<outer>p|div)[^>]*>\s*(<(?P<inner1>font|span|[ibu])[^>]*>)?\s*"
            r"(<(?P<inner2>font|span|[ibu])[^>]*>)?\s*(<(?P<inner3>font|span|[ibu])[^>]*>)?\s*")
        self.line_close = "(</(?P=inner3)>)?\\s*(</(?P=inner2)>)?\\s*(</(?P=inner1)>)?\\s*</(?P=outer)>"
        self.single_blank = re.compile(r'(\s*<(p|div)[^>]*>\s*</(p|div)>)', re.IGNORECASE)
        # Markup inserted for detected scene breaks.
        self.scene_break_open = '<p class="scenebreak" style="text-align:center; text-indent:0%; margin-top:1em; margin-bottom:1em; page-break-before:avoid">'
        # Character classes for text that commonly ends / begins a paragraph.
        self.common_in_text_endings = '[\"\'—’”,\\.!\\?\\…\\)„\\w]'
        self.common_in_text_beginnings = '[\\w\'\"“‘‛]'
|
||||
|
||||
def is_pdftohtml(self, src):
|
||||
return '<!-- created by calibre\'s pdftohtml -->' in src[:1000]
|
||||
|
||||
def is_abbyy(self, src):
|
||||
return '<meta name="generator" content="ABBYY FineReader' in src[:1000]
|
||||
|
||||
def chapter_head(self, match):
|
||||
from calibre.utils.html2text import html2text
|
||||
chap = match.group('chap')
|
||||
title = match.group('title')
|
||||
if not title:
|
||||
self.html_preprocess_sections = self.html_preprocess_sections + 1
|
||||
self.log.debug("marked " + unicode_type(self.html_preprocess_sections) +
|
||||
" chapters. - " + unicode_type(chap))
|
||||
return '<h2>'+chap+'</h2>\n'
|
||||
else:
|
||||
delete_whitespace = re.compile('^\\s*(?P<c>.*?)\\s*$')
|
||||
delete_quotes = re.compile('\'\"')
|
||||
txt_chap = delete_quotes.sub('', delete_whitespace.sub('\\g<c>', html2text(chap)))
|
||||
txt_title = delete_quotes.sub('', delete_whitespace.sub('\\g<c>', html2text(title)))
|
||||
self.html_preprocess_sections = self.html_preprocess_sections + 1
|
||||
self.log.debug("marked " + unicode_type(self.html_preprocess_sections) +
|
||||
" chapters & titles. - " + unicode_type(chap) + ", " + unicode_type(title))
|
||||
return '<h2 title="'+txt_chap+', '+txt_title+'">'+chap+'</h2>\n<h3 class="sigilNotInTOC">'+title+'</h3>\n'
|
||||
|
||||
def chapter_break(self, match):
|
||||
chap = match.group('section')
|
||||
styles = match.group('styles')
|
||||
self.html_preprocess_sections = self.html_preprocess_sections + 1
|
||||
self.log.debug("marked " + unicode_type(self.html_preprocess_sections) +
|
||||
" section markers based on punctuation. - " + unicode_type(chap))
|
||||
return '<'+styles+' style="page-break-before:always">'+chap
|
||||
|
||||
def analyze_title_matches(self, match):
|
||||
# chap = match.group('chap')
|
||||
title = match.group('title')
|
||||
if not title:
|
||||
self.chapters_no_title = self.chapters_no_title + 1
|
||||
else:
|
||||
self.chapters_with_title = self.chapters_with_title + 1
|
||||
|
||||
def insert_indent(self, match):
|
||||
pstyle = match.group('formatting')
|
||||
tag = match.group('tagtype')
|
||||
span = match.group('span')
|
||||
self.found_indents = self.found_indents + 1
|
||||
if pstyle:
|
||||
if pstyle.lower().find('style') != -1:
|
||||
pstyle = re.sub(r'"$', '; text-indent:3%"', pstyle)
|
||||
else:
|
||||
pstyle = pstyle+' style="text-indent:3%"'
|
||||
if not span:
|
||||
return '<'+tag+' '+pstyle+'>'
|
||||
else:
|
||||
return '<'+tag+' '+pstyle+'>'+span
|
||||
else:
|
||||
if not span:
|
||||
return '<'+tag+' style="text-indent:3%">'
|
||||
else:
|
||||
return '<'+tag+' style="text-indent:3%">'+span
|
||||
|
||||
def no_markup(self, raw, percent):
|
||||
'''
|
||||
Detects total marked up line endings in the file. raw is the text to
|
||||
inspect. Percent is the minimum percent of line endings which should
|
||||
be marked up to return true.
|
||||
'''
|
||||
htm_end_ere = re.compile('</(p|div)>', re.DOTALL)
|
||||
line_end_ere = re.compile('(\n|\r|\r\n)', re.DOTALL)
|
||||
htm_end = htm_end_ere.findall(raw)
|
||||
line_end = line_end_ere.findall(raw)
|
||||
tot_htm_ends = len(htm_end)
|
||||
tot_ln_fds = len(line_end)
|
||||
# self.log.debug("There are " + unicode_type(tot_ln_fds) + " total Line feeds, and " +
|
||||
# unicode_type(tot_htm_ends) + " marked up endings")
|
||||
|
||||
if percent > 1:
|
||||
percent = 1
|
||||
if percent < 0:
|
||||
percent = 0
|
||||
|
||||
min_lns = tot_ln_fds * percent
|
||||
# self.log.debug("There must be fewer than " + unicode_type(min_lns) + " unmarked lines to add markup")
|
||||
return min_lns > tot_htm_ends
|
||||
|
||||
def dump(self, raw, where):
|
||||
import os
|
||||
dp = getattr(self.extra_opts, 'debug_pipeline', None)
|
||||
if dp and os.path.exists(dp):
|
||||
odir = os.path.join(dp, 'preprocess')
|
||||
if not os.path.exists(odir):
|
||||
os.makedirs(odir)
|
||||
if os.path.exists(odir):
|
||||
odir = os.path.join(odir, where)
|
||||
if not os.path.exists(odir):
|
||||
os.makedirs(odir)
|
||||
name, i = None, 0
|
||||
while not name or os.path.exists(os.path.join(odir, name)):
|
||||
i += 1
|
||||
name = '%04d.html'%i
|
||||
with open(os.path.join(odir, name), 'wb') as f:
|
||||
f.write(raw.encode('utf-8'))
|
||||
|
||||
def get_word_count(self, html):
|
||||
word_count_text = re.sub(r'(?s)<head[^>]*>.*?</head>', '', html)
|
||||
word_count_text = re.sub(r'<[^>]*>', '', word_count_text)
|
||||
wordcount = get_wordcount_obj(word_count_text)
|
||||
return wordcount.words
|
||||
|
||||
    def markup_italicis(self, html):
        """Wrap common Latin abbreviations and lightweight emphasis
        conventions (``*word*``, ``_word_``, ``/word/``, ``~word~`` ...)
        in <i> tags.

        (The method name's spelling is historical; kept for callers.)
        """
        # self.log.debug("\n\n\nitalicize debugging \n\n\n")
        ITALICIZE_WORDS = [
            'Etc.', 'etc.', 'viz.', 'ie.', 'i.e.', 'Ie.', 'I.e.', 'eg.',
            'e.g.', 'Eg.', 'E.g.', 'et al.', 'et cetera', 'n.b.', 'N.b.',
            'nota bene', 'Nota bene', 'Ste.', 'Mme.', 'Mdme.',
            'Mlle.', 'Mons.', 'PS.', 'PPS.',
        ]

        # Each pattern captures the emphasised text in the 'words' group;
        # the lookbehind restricts matches to word starts after whitespace,
        # a tag close, or an opening quote.
        ITALICIZE_STYLE_PATS = [
            unicode_type(r'(?msu)(?<=[\s>"“\'‘])_\*/(?P<words>[^\*_]+)/\*_'),
            unicode_type(r'(?msu)(?<=[\s>"“\'‘])~~(?P<words>[^~]+)~~'),
            unicode_type(r'(?msu)(?<=[\s>"“\'‘])_/(?P<words>[^/_]+)/_'),
            unicode_type(r'(?msu)(?<=[\s>"“\'‘])_\*(?P<words>[^\*_]+)\*_'),
            unicode_type(r'(?msu)(?<=[\s>"“\'‘])\*/(?P<words>[^/\*]+)/\*'),
            unicode_type(r'(?msu)(?<=[\s>"“\'‘])/:(?P<words>[^:/]+):/'),
            unicode_type(r'(?msu)(?<=[\s>"“\'‘])\|:(?P<words>[^:\|]+):\|'),
            unicode_type(r'(?msu)(?<=[\s>"“\'‘])\*(?P<words>[^\*]+)\*'),
            unicode_type(r'(?msu)(?<=[\s>"“\'‘])~(?P<words>[^~]+)~'),
            unicode_type(r'(?msu)(?<=[\s>"“\'‘])/(?P<words>[^/\*><]+)/'),
            unicode_type(r'(?msu)(?<=[\s>"“\'‘])_(?P<words>[^_]+)_'),
        ]

        for word in ITALICIZE_WORDS:
            html = re.sub(r'(?<=\s|>)' + re.escape(word) + r'(?=\s|<)', '<i>%s</i>' % word, html)

        # Search a tag-stripped copy so existing markup never matches, then
        # map each hit back onto the original HTML text.
        search_text = re.sub(r'(?s)<head[^>]*>.*?</head>', '', html)
        search_text = re.sub(r'<[^>]*>', '', search_text)
        for pat in ITALICIZE_STYLE_PATS:
            for match in re.finditer(pat, search_text):
                ital_string = unicode_type(match.group('words'))
                # self.log.debug("italicising "+unicode_type(match.group(0))+" with <i>"+ital_string+"</i>")
                try:
                    html = re.sub(re.escape(unicode_type(match.group(0))), '<i>%s</i>' % ital_string, html)
                except OverflowError:
                    # match.group(0) was too large to be compiled into a regex
                    continue
                except re.error:
                    # the match was not a valid regular expression
                    continue

        return html
|
||||
|
||||
def markup_chapters(self, html, wordcount, blanks_between_paragraphs):
    '''
    Searches for common chapter headings throughout the document
    attempts multiple patterns based on likelihood of a match
    with minimum false positives. Exits after finding a successful pattern

    :param html: the document markup to scan and annotate
    :param wordcount: total word count, used to estimate how many chapters to expect
    :param blanks_between_paragraphs: True when the doc interleaves blank paragraphs
    :return: html with detected headings marked up (via self.chapter_head)
    '''
    # Typical chapters are between 2000 and 7000 words, use the larger number to decide the
    # minimum of chapters to search for. A max limit is calculated to prevent things like OCR
    # or pdf page numbers from being treated as TOC markers
    max_chapters = 150
    typical_chapters = 7000.
    if wordcount > 7000:
        if wordcount > 200000:
            # very long books average longer chapters
            typical_chapters = 15000.
        self.min_chapters = int(ceil(wordcount / typical_chapters))
    self.log.debug("minimum chapters required are: "+unicode_type(self.min_chapters))
    heading = re.compile('<h[1-3][^>]*>', re.IGNORECASE)
    self.html_preprocess_sections = len(heading.findall(html))
    self.log.debug("found " + unicode_type(self.html_preprocess_sections) + " pre-existing headings")

    # Build the Regular Expressions in pieces
    init_lookahead = "(?=<(p|div))"
    chapter_line_open = self.line_open
    title_line_open = (r"<(?P<outer2>p|div)[^>]*>\s*(<(?P<inner4>font|span|[ibu])[^>]*>)?"
                       r"\s*(<(?P<inner5>font|span|[ibu])[^>]*>)?\s*(<(?P<inner6>font|span|[ibu])[^>]*>)?\s*")
    chapter_header_open = r"(?P<chap>"
    title_header_open = r"(?P<title>"
    chapter_header_close = ")\\s*"
    title_header_close = ")"
    chapter_line_close = self.line_close
    title_line_close = "(</(?P=inner6)>)?\\s*(</(?P=inner5)>)?\\s*(</(?P=inner4)>)?\\s*</(?P=outer2)>"

    is_pdftohtml = self.is_pdftohtml(html)
    if is_pdftohtml:
        # pdftohtml output only uses <p>, so the title wrappers can be simpler
        title_line_open = "<(?P<outer2>p)[^>]*>\\s*"
        title_line_close = "\\s*</(?P=outer2)>"

    if blanks_between_paragraphs:
        blank_lines = "(\\s*<p[^>]*>\\s*</p>){0,2}\\s*"
    else:
        blank_lines = ""
    opt_title_open = "("
    opt_title_close = ")?"
    n_lookahead_open = "(?!\\s*"
    n_lookahead_close = ")\\s*"

    default_title = r"(<[ibu][^>]*>)?\s{0,3}(?!Chapter)([\w\:\'’\"-]+\s{0,3}){1,5}?(</[ibu][^>]*>)?(?=<)"
    simple_title = r"(<[ibu][^>]*>)?\s{0,3}(?!(Chapter|\s+<)).{0,65}?(</[ibu][^>]*>)?(?=<)"

    analysis_result = []

    # Each entry: [pattern, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name]
    chapter_types = [
        [(
            r"[^'\"]?(Introduction|Synopsis|Acknowledgements|Epilogue|CHAPTER|Kapitel|Volume\b|Prologue|Book\b|Part\b|Dedication|Preface)"
            r"\s*([\d\w-]+\:?\'?\s*){0,5}"), True, True, True, False, "Searching for common section headings", 'common'],
        # Highest frequency headings which include titles
        [r"[^'\"]?(CHAPTER|Kapitel)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, True, True, False, "Searching for most common chapter headings", 'chapter'],
        [r"<b[^>]*>\s*(<span[^>]*>)?\s*(?!([*#•=]+\s*)+)(\s*(?=[\d.\w#\-*\s]+<)([\d.\w#-*]+\s*){1,5}\s*)(?!\.)(</span>)?\s*</b>",
         True, True, True, False, "Searching for emphasized lines", 'emphasized'],  # Emphasized lines
        [r"[^'\"]?(\d+(\.|:))\s*([\w\-\'\"#,]+\s*){0,7}\s*", True, True, True, False,
         "Searching for numeric chapter headings", 'numeric'],  # Numeric Chapters
        [r"([A-Z]\s+){3,}\s*([\d\w-]+\s*){0,3}\s*", True, True, True, False, "Searching for letter spaced headings", 'letter_spaced'],  # Spaced Lettering
        [r"[^'\"]?(\d+\.?\s+([\d\w-]+\:?\'?-?\s?){0,5})\s*", True, True, True, False,
         "Searching for numeric chapters with titles", 'numeric_title'],  # Numeric Titles
        [r"[^'\"]?(\d+)\s*([\dA-Z\-\'\"\?!#,]+\s*){0,7}\s*", True, True, True, False,
         "Searching for simple numeric headings", 'plain_number'],  # Numeric Chapters, no dot or colon
        [r"\s*[^'\"]?([A-Z#]+(\s|-){0,3}){1,5}\s*", False, True, False, False,
         "Searching for chapters with Uppercase Characters", 'uppercase']  # Uppercase Chapters
    ]

    def recurse_patterns(html, analyze):
        # Start with most typical chapter headings, get more aggressive until one works
        # analyze=True only counts/classifies hits into analysis_result;
        # analyze=False actually rewrites the markup via self.chapter_head.
        for [chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name] in chapter_types:
            n_lookahead = ''
            hits = 0
            self.chapters_no_title = 0
            self.chapters_with_title = 0

            if n_lookahead_req:
                lp_n_lookahead_open = n_lookahead_open
                lp_n_lookahead_close = n_lookahead_close
            else:
                lp_n_lookahead_open = ''
                lp_n_lookahead_close = ''

            if strict_title:
                lp_title = default_title
            else:
                lp_title = simple_title

            if ignorecase:
                arg_ignorecase = r'(?i)'
            else:
                arg_ignorecase = ''

            if title_req:
                lp_opt_title_open = ''
                lp_opt_title_close = ''
            else:
                lp_opt_title_open = opt_title_open
                lp_opt_title_close = opt_title_close

            if self.html_preprocess_sections >= self.min_chapters:
                # enough headings already found; stop trying patterns
                break
            full_chapter_line = chapter_line_open+chapter_header_open+chapter_type+chapter_header_close+chapter_line_close
            if n_lookahead_req:
                # rename the named groups so the lookahead copy doesn't clash
                # with the main pattern's group names
                n_lookahead = re.sub("(ou|in|cha)", "lookahead_", full_chapter_line)
            if not analyze:
                self.log.debug("Marked " + unicode_type(self.html_preprocess_sections) + " headings, " + log_message)

            chapter_marker = arg_ignorecase+init_lookahead+full_chapter_line+blank_lines+lp_n_lookahead_open+n_lookahead+lp_n_lookahead_close+ \
                lp_opt_title_open+title_line_open+title_header_open+lp_title+title_header_close+title_line_close+lp_opt_title_close
            chapdetect = re.compile(r'%s' % chapter_marker)

            if analyze:
                hits = len(chapdetect.findall(html))
                if hits:
                    # analyze_title_matches updates chapters_with_title/no_title counters
                    chapdetect.sub(self.analyze_title_matches, html)
                    if float(self.chapters_with_title) / float(hits) > .5:
                        title_req = True
                        strict_title = False
                    self.log.debug(
                        unicode_type(type_name)+" had "+unicode_type(hits)+
                        " hits - "+unicode_type(self.chapters_no_title)+" chapters with no title, "+
                        unicode_type(self.chapters_with_title)+" chapters with titles, "+
                        unicode_type(float(self.chapters_with_title) / float(hits))+" percent. ")
                    if type_name == 'common':
                        analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name])
                    elif self.min_chapters <= hits < max_chapters or self.min_chapters < 3 > hits:
                        analysis_result.append([chapter_type, n_lookahead_req, strict_title, ignorecase, title_req, log_message, type_name])
                        break
            else:
                html = chapdetect.sub(self.chapter_head, html)
        return html

    # First pass only analyzes; the surviving patterns are then applied for real.
    recurse_patterns(html, True)
    chapter_types = analysis_result
    html = recurse_patterns(html, False)

    words_per_chptr = wordcount
    if words_per_chptr > 0 and self.html_preprocess_sections > 0:
        words_per_chptr = wordcount // self.html_preprocess_sections
    self.log.debug("Total wordcount is: "+ unicode_type(wordcount)+", Average words per section is: "+
                   unicode_type(words_per_chptr)+", Marked up "+unicode_type(self.html_preprocess_sections)+" chapters")
    return html
|
||||
|
||||
def punctuation_unwrap(self, length, content, format):
    '''
    Unwraps lines based on line length and punctuation
    supports a range of html markup and text files

    the lookahead regex below is meant look for any non-full stop characters - punctuation
    characters which can be used as a full stop should *not* be added below - e.g. ?!“”. etc
    the reason for this is to prevent false positive wrapping. False positives are more
    difficult to detect than false negatives during a manual review of the doc

    This function intentionally leaves hyphenated content alone as that is handled by the
    dehyphenate routine in a separate step

    :param length: median line length; only lines at least this long are unwrapped
    :param content: the html or txt content to unwrap
    :param format: 'txt' for plain text input, anything else is treated as html
    :return: the unwrapped content
    '''
    def style_unwrap(match):
        # Re-join a line break while keeping any inline style span/tag balanced:
        # close-before-break and open-after-break are stitched around a space.
        style_close = match.group('style_close')
        style_open = match.group('style_open')
        if style_open and style_close:
            return style_close+' '+style_open
        elif style_open and not style_close:
            return ' '+style_open
        elif not style_open and style_close:
            return style_close+' '
        else:
            return ' '

    # define the pieces of the regex
    # (?<!\&\w{4});) is a semicolon not part of an entity
    lookahead = "(?<=.{"+unicode_type(length)+r"}([a-zა-ჰäëïöüàèìòùáćéíĺóŕńśúýâêîôûçąężıãõñæøþðßěľščťžňďřů,:)\\IAß]|(?<!\&\w{4});))"
    em_en_lookahead = "(?<=.{"+unicode_type(length)+"}[\u2013\u2014])"
    soft_hyphen = "\xad"
    line_ending = "\\s*(?P<style_close></(span|[iub])>)?\\s*(</(p|div)>)?"
    blanklines = "\\s*(?P<up2threeblanks><(p|span|div)[^>]*>\\s*(<(p|span|div)[^>]*>\\s*</(span|p|div)>\\s*)</(span|p|div)>\\s*){0,3}\\s*"
    line_opening = "<(p|div)[^>]*>\\s*(?P<style_open><(span|[iub])[^>]*>)?\\s*"
    txt_line_wrap = "((\u0020|\u0009)*\n){1,4}"

    if format == 'txt':
        unwrap_regex = lookahead+txt_line_wrap
        em_en_unwrap_regex = em_en_lookahead+txt_line_wrap
        shy_unwrap_regex = soft_hyphen+txt_line_wrap
    else:
        unwrap_regex = lookahead+line_ending+blanklines+line_opening
        em_en_unwrap_regex = em_en_lookahead+line_ending+blanklines+line_opening
        shy_unwrap_regex = soft_hyphen+line_ending+blanklines+line_opening

    unwrap = re.compile("%s" % unwrap_regex, re.UNICODE)
    em_en_unwrap = re.compile("%s" % em_en_unwrap_regex, re.UNICODE)
    shy_unwrap = re.compile("%s" % shy_unwrap_regex, re.UNICODE)

    if format == 'txt':
        # txt: a plain space re-joins wrapped words; em/en dashes and soft
        # hyphens join with no space at all
        content = unwrap.sub(' ', content)
        content = em_en_unwrap.sub('', content)
        content = shy_unwrap.sub('', content)
    else:
        content = unwrap.sub(style_unwrap, content)
        content = em_en_unwrap.sub(style_unwrap, content)
        content = shy_unwrap.sub(style_unwrap, content)

    return content
|
||||
|
||||
def txt_process(self, match):
    """Convert the plain text captured in *match* (group 'text') into basic
    HTML paragraphs; used as a re.sub callback by markup_pre."""
    from calibre.ebooks.txt.processor import convert_basic, separate_paragraphs_single_line
    raw_text = match.group('text')
    paragraphs = separate_paragraphs_single_line(raw_text)
    return convert_basic(paragraphs, epub_split_size_kb=0)
|
||||
|
||||
def markup_pre(self, html):
    """Add paragraph markup to documents that only carry text in <pre> blocks,
    or (when no <pre> is present) naively split on bare newlines."""
    pre = re.compile(r'<pre>', re.IGNORECASE)
    if len(pre.findall(html)) >= 1:
        self.log.debug("Running Text Processing")
        outerhtml = re.compile(r'.*?(?<=<pre>)(?P<text>.*?)</pre>', re.IGNORECASE|re.DOTALL)
        # txt_process converts each <pre> payload into basic html paragraphs
        html = outerhtml.sub(self.txt_process, html)
        from calibre.ebooks.conversion.preprocess import convert_entities
        html = re.sub(r'&(\S+?);', convert_entities, html)
    else:
        # Add markup naively
        # TODO - find out if there are cases where there are more than one <pre> tag or
        # other types of unmarked html and handle them in some better fashion
        add_markup = re.compile('(?<!>)(\n)')
        html = add_markup.sub('</p>\n<p>', html)
    return html
|
||||
|
||||
def arrange_htm_line_endings(self, html):
    """Normalise paragraph boundaries so every </p>/</div> ends a physical line
    and every <p>/<div> starts one; later line-based heuristics depend on this."""
    close_tag = r"\s*</(?P<tag>p|div)>"
    open_tag = r"\s*<(?P<tag>p|div)(?P<style>[^>]*)>\s*"
    html = re.sub(close_tag, "</\\g<tag>>\n", html)
    return re.sub(open_tag, "\n<\\g<tag>\\g<style>>", html)
|
||||
|
||||
def fix_nbsp_indents(self, html):
    """Replace runs of two or more non-breaking spaces at paragraph starts with
    inline text-indent styles (via self.insert_indent, which also counts them
    in self.found_indents)."""
    txtindent = re.compile(unicode_type(r'<(?P<tagtype>p|div)(?P<formatting>[^>]*)>\s*(?P<span>(<span[^>]*>\s*)+)?\s*(\u00a0){2,}'), re.IGNORECASE)
    html = txtindent.sub(self.insert_indent, html)
    if self.found_indents > 1:
        self.log.debug("replaced "+unicode_type(self.found_indents)+ " nbsp indents with inline styles")
    return html
|
||||
|
||||
def cleanup_markup(self, html):
    """Strip Microsoft-specific cruft and empty formatting tags from *html*.
    Sets self.deleted_nbsps so __call__ knows to restore nbsp in blank
    paragraphs at the end of processing."""
    # remove remaining non-breaking spaces
    html = re.sub(unicode_type(r'\u00a0'), ' ', html)
    # Get rid of various common microsoft specific tags which can cause issues later
    # Get rid of empty <o:p> tags to simplify other processing
    html = re.sub(unicode_type(r'\s*<o:p>\s*</o:p>'), ' ', html)
    # Delete microsoft 'smart' tags
    html = re.sub('(?i)</?st1:\\w+>', '', html)
    # Re-open self closing paragraph tags
    html = re.sub('<p[^>/]*/>', '<p> </p>', html)
    # Get rid of empty span, bold, font, em, & italics tags
    fmt_tags = 'font|[ibu]|em|strong'
    open_fmt_pat, close_fmt_pat = r'<(?:{})(?:\s[^>]*)?>'.format(fmt_tags), '</(?:{})>'.format(fmt_tags)
    # two passes so nesting uncovered by the first pass is caught by the second
    for i in range(2):
        html = re.sub(r"\s*<span[^>]*>\s*(<span[^>]*>\s*</span>){0,2}\s*</span>\s*", " ", html)
        html = re.sub(
            r"\s*{open}\s*({open}\s*{close}\s*){{0,2}}\s*{close}".format(open=open_fmt_pat, close=close_fmt_pat) , " ", html)
    # delete surrounding divs from empty paragraphs
    html = re.sub('<div[^>]*>\\s*<p[^>]*>\\s*</p>\\s*</div>', '<p> </p>', html)
    # Empty heading tags
    html = re.sub(r'(?i)<h\d+>\s*</h\d+>', '', html)
    self.deleted_nbsps = True
    return html
|
||||
|
||||
def analyze_line_endings(self, html):
    '''
    determines the type of html line ending used most commonly in a document
    use before calling docanalysis functions
    '''
    para_count = len(re.findall('<p[^>]*>', html, re.IGNORECASE))
    span_count = len(re.findall('<span[^>]*>', html, re.IGNORECASE))
    # Spans dominating paragraphs (fewer than 3 <p> per 4 <span>) means the
    # document uses spans for hard line breaks.
    if span_count > 1 and float(para_count) / float(span_count) < 0.75:
        return 'spanned_html'
    return 'html'
|
||||
|
||||
def analyze_blanks(self, html):
    """Return True when more than 40% of the document's paragraph lines are
    blank (per self.blankreg / self.linereg); falls through (None) for
    documents with at most one line."""
    blank_count = len(self.blankreg.findall(html))
    line_count = len(self.linereg.findall(html))
    if line_count > 1:
        ratio = float(blank_count) / float(line_count)
        self.log.debug("There are " + unicode_type(blank_count) + " blank lines. " +
                       unicode_type(ratio) + " percent blank")
        return ratio > 0.40
|
||||
|
||||
def cleanup_required(self):
    """Return True when any cleanup-related conversion option is enabled on
    self.extra_opts."""
    cleanup_options = ('unwrap_lines', 'markup_chapter_headings',
                       'format_scene_breaks', 'delete_blank_paragraphs')
    return any(getattr(self.extra_opts, option, False) for option in cleanup_options)
|
||||
|
||||
def merge_blanks(self, html, blanks_count=None):
    """Collapse each run of consecutive blank paragraphs into one spacer
    paragraph whose top margin grows with the number of blanks merged.

    :param html: markup in which self.any_multi_blank matches runs of blanks
    :param blanks_count: unused; kept for interface compatibility
    :return: html with blank runs merged
    """
    base_em = .5  # Baseline is 1.5em per blank line, 1st line is .5 em css and 1em for the nbsp
    em_per_line = 1.5  # Add another 1.5 em for each additional blank

    def merge_matches(match):
        to_merge = match.group(0)
        lines = float(len(self.single_blank.findall(to_merge))) - 1.
        em = base_em + (em_per_line * lines)
        # BUG FIX: was ``if to_merge.find('whitespace'):`` -- str.find() returns
        # -1 (truthy) when the substring is absent, so the condition was
        # effectively always True and the softbreak branch was dead code.
        if to_merge.find('whitespace') != -1:
            newline = self.any_multi_blank.sub('\n<p class="whitespace'+unicode_type(int(em * 10))+
            '" style="text-align:center; margin-top:'+unicode_type(em)+'em"> </p>', match.group(0))
        else:
            newline = self.any_multi_blank.sub('\n<p class="softbreak'+unicode_type(int(em * 10))+
            '" style="text-align:center; margin-top:'+unicode_type(em)+'em"> </p>', match.group(0))
        return newline

    html = self.any_multi_blank.sub(merge_matches, html)
    return html
|
||||
|
||||
def detect_whitespace(self, html):
    """Convert blank paragraphs that surround headings or scene breaks into
    explicit margins on those elements, and mark blank runs adjacent to
    ordinary punctuation-free paragraphs with the 'whitespace' class."""
    blanks_around_headings = re.compile(
        r'(?P<initparas>(<(p|div)[^>]*>\s*</(p|div)>\s*){1,}\s*)?'
        r'(?P<content><h(?P<hnum>\d+)[^>]*>.*?</h(?P=hnum)>)(?P<endparas>\s*(<(p|div)[^>]*>\s*</(p|div)>\s*){1,})?', re.IGNORECASE|re.DOTALL)
    blanks_around_scene_breaks = re.compile(
        r'(?P<initparas>(<(p|div)[^>]*>\s*</(p|div)>\s*){1,}\s*)?'
        r'(?P<content><p class="scenebreak"[^>]*>.*?</p>)(?P<endparas>\s*(<(p|div)[^>]*>\s*</(p|div)>\s*){1,})?', re.IGNORECASE|re.DOTALL)
    blanks_n_nopunct = re.compile(
        r'(?P<initparas>(<p[^>]*>\s*</p>\s*){1,}\s*)?<p[^>]*>\s*(<(span|[ibu]|em|strong|font)[^>]*>\s*)*'
        r'.{1,100}?[^\W](</(span|[ibu]|em|strong|font)>\s*)*</p>(?P<endparas>\s*(<p[^>]*>\s*</p>\s*){1,})?', re.IGNORECASE|re.DOTALL)

    def merge_header_whitespace(match):
        # Fold leading/trailing blank paragraphs into margin-top/margin-bottom
        # (1em per blank line) on the heading itself.
        initblanks = match.group('initparas')
        endblanks = match.group('endparas')
        content = match.group('content')
        top_margin = ''
        bottom_margin = ''
        if initblanks is not None:
            top_margin = 'margin-top:'+unicode_type(len(self.single_blank.findall(initblanks)))+'em;'
        if endblanks is not None:
            bottom_margin = 'margin-bottom:'+unicode_type(len(self.single_blank.findall(endblanks)))+'em;'

        if initblanks is None and endblanks is None:
            return content
        elif content.find('scenebreak') != -1:
            # scene breaks keep their own styling; leave untouched
            return content
        else:
            content = re.sub('(?i)<h(?P<hnum>\\d+)[^>]*>', '\n\n<h'+'\\g<hnum>'+' style="'+top_margin+bottom_margin+'">', content)
            return content

    html = blanks_around_headings.sub(merge_header_whitespace, html)
    html = blanks_around_scene_breaks.sub(merge_header_whitespace, html)

    def markup_whitespaces(match):
        blanks = match.group(0)
        blanks = self.blankreg.sub('\n<p class="whitespace" style="text-align:center; margin-top:0em; margin-bottom:0em"> </p>', blanks)
        return blanks

    html = blanks_n_nopunct.sub(markup_whitespaces, html)
    if self.html_preprocess_sections > self.min_chapters:
        # also mark any blanks that precede the first heading
        html = re.sub('(?si)^.*?(?=<h\\d)', markup_whitespaces, html)

    return html
|
||||
|
||||
def detect_soft_breaks(self, html):
    """Insert 'softbreak' spacer paragraphs: first for empty <div>s between two
    sentence-like lines, then for remaining blank lines (or multi-blank runs if
    blanks were not deleted earlier)."""
    line = '(?P<initline>'+self.line_open+'\\s*(?P<init_content>.*?)'+self.line_close+')'
    # rename group names in the second line's copy of the pattern to avoid clashes
    line_two = '(?P<line_two>'+re.sub('(ou|in|cha)', 'linetwo_', self.line_open)+ \
        '\\s*(?P<line_two_content>.*?)'+re.sub('(ou|in|cha)', 'linetwo_', self.line_close)+')'
    div_break_candidate_pattern = line+'\\s*<div[^>]*>\\s*</div>\\s*'+line_two
    div_break_candidate = re.compile(r'%s' % div_break_candidate_pattern, re.IGNORECASE|re.UNICODE)

    def convert_div_softbreaks(match):
        # only treat the empty div as a soft break when both neighbours look
        # like complete paragraphs (end in sentence punctuation)
        init_is_paragraph = self.check_paragraph(match.group('init_content'))
        line_two_is_paragraph = self.check_paragraph(match.group('line_two_content'))
        if init_is_paragraph and line_two_is_paragraph:
            return (match.group('initline')+
                    '\n<p class="softbreak" style="margin-top:.5em; page-break-before:avoid; text-align:center"> </p>\n'+
                    match.group('line_two'))
        else:
            return match.group(0)

    html = div_break_candidate.sub(convert_div_softbreaks, html)

    if not self.blanks_deleted and self.blanks_between_paragraphs:
        html = self.multi_blank.sub('\n<p class="softbreak" style="margin-top:1em; page-break-before:avoid; text-align:center"> </p>', html)
    else:
        html = self.blankreg.sub('\n<p class="softbreak" style="margin-top:.5em; page-break-before:avoid; text-align:center"> </p>', html)
    return html
|
||||
|
||||
def detect_scene_breaks(self, html):
    """Mark lines made only of repeated punctuation/symbol characters (e.g.
    '* * *') as scene breaks, excluding characters that commonly start or end
    normal text (per self.common_in_text_beginnings/endings)."""
    scene_break_regex = self.line_open+'(?!('+self.common_in_text_beginnings+'|.*?'+self.common_in_text_endings+ \
        '<))(?P<break>((?P<break_char>((?!\\s)\\W))\\s*(?P=break_char)?)+)\\s*'+self.line_close
    scene_breaks = re.compile(r'%s' % scene_break_regex, re.IGNORECASE|re.UNICODE)
    html = scene_breaks.sub(self.scene_break_open+'\\g<break>'+'</p>', html)
    return html
|
||||
|
||||
def markup_user_break(self, replacement_break):
    '''
    Takes string a user supplies and wraps it in markup that will be centered with
    appropriate margins. <hr> and <img> tags are allowed. If the user specifies
    a style with width attributes in the <hr> tag then the appropriate margins are
    applied to wrapping divs. This is because many ebook devices don't support margin:auto
    All other html is converted to text.

    :param replacement_break: the user-supplied scene break string
    :return: the wrapped scene break markup
    '''
    hr_open = '<div id="scenebreak" style="margin-left: 45%; margin-right: 45%; margin-top:1.5em; margin-bottom:1.5em; page-break-before:avoid">'
    if re.findall('(<|>)', replacement_break):
        if re.match('^<hr', replacement_break):
            if replacement_break.find('width') != -1:
                try:
                    width = int(re.sub('.*?width(:|=)(?P<wnum>\\d+).*', '\\g<wnum>', replacement_break))
                except Exception:
                    # FIX: was a bare ``except:`` which also swallowed
                    # SystemExit/KeyboardInterrupt; any parse failure still
                    # falls back to the default rule.
                    scene_break = hr_open+'<hr style="height: 3px; background:#505050" /></div>'
                    self.log.warn('Invalid replacement scene break'
                                  ' expression, using default')
                else:
                    # strip the explicit width and center via matching side margins
                    replacement_break = re.sub('(?i)(width=\\d+\\%?|width:\\s*\\d+(\\%|px|pt|em)?;?)', '', replacement_break)
                    divpercent = (100 - width) // 2
                    hr_open = re.sub('45', unicode_type(divpercent), hr_open)
                    scene_break = hr_open+replacement_break+'</div>'
            else:
                scene_break = hr_open+'<hr style="height: 3px; background:#505050" /></div>'
        elif re.match('^<img', replacement_break):
            scene_break = self.scene_break_open+replacement_break+'</p>'
        else:
            # any other markup is flattened to plain text
            from calibre.utils.html2text import html2text
            replacement_break = html2text(replacement_break)
            replacement_break = re.sub('\\s', ' ', replacement_break)
            scene_break = self.scene_break_open+replacement_break+'</p>'
    else:
        replacement_break = re.sub('\\s', ' ', replacement_break)
        scene_break = self.scene_break_open+replacement_break+'</p>'

    return scene_break
|
||||
|
||||
def check_paragraph(self, content):
    """Return True when *content* (ignoring span tags) ends like a sentence,
    i.e. with a quote, period, exclamation mark, question mark or colon."""
    stripped = re.sub('\\s*</?span[^>]*>\\s*', '', content)
    return re.match('.*[\"\'.!?:]$', stripped) is not None
|
||||
|
||||
def abbyy_processor(self, html):
    """Rewrite ABBYY FineReader OCR output: translate its inline pt-based
    styles (text-align, text-indent, padding) into simpler paragraph styles,
    wrapping heavily padded runs in <blockquote> and turning large vertical
    padding into empty spacer paragraphs. Anchor tags are removed first."""
    abbyy_line = re.compile('((?P<linestart><p\\sstyle="(?P<styles>[^\"]*?);?">)(?P<content>.*?)(?P<lineend></p>)|(?P<image><img[^>]*>))', re.IGNORECASE)
    empty_paragraph = '\n<p> </p>\n'
    # per-document state threaded through successive convert_styles calls
    self.in_blockquote = False
    self.previous_was_paragraph = False
    html = re.sub('</?a[^>]*>', '', html)

    def convert_styles(match):
        # print "raw styles are: "+match.group('styles')
        content = match.group('content')
        # print "raw content is: "+match.group('content')
        image = match.group('image')

        is_paragraph = False
        text_align = ''
        text_indent = ''
        paragraph_before = ''
        paragraph_after = ''
        blockquote_open = '\n<blockquote>\n'
        blockquote_close = '</blockquote>\n'
        indented_text = 'text-indent:3%;'
        blockquote_open_loop = ''
        blockquote_close_loop = ''
        debugabby = False

        if image:
            # an image closes any open blockquote and passes through unchanged
            debugabby = True
            if self.in_blockquote:
                self.in_blockquote = False
                blockquote_close_loop = blockquote_close
            self.previous_was_paragraph = False
            return blockquote_close_loop+'\n'+image+'\n'
        else:
            styles = match.group('styles').split(';')
            is_paragraph = self.check_paragraph(content)
            # print "styles for this line are: "+unicode_type(styles)
            split_styles = []
            for style in styles:
                # print "style is: "+unicode_type(style)
                newstyle = style.split(':')
                # print "newstyle is: "+unicode_type(newstyle)
                split_styles.append(newstyle)
            styles = split_styles
            for style, setting in styles:
                if style == 'text-align' and setting != 'left':
                    text_align = style+':'+setting+';'
                if style == 'text-indent':
                    setting = int(re.sub('\\s*pt\\s*', '', setting))
                    if 9 < setting < 14:
                        # a typical first-line indent; use the relative default
                        text_indent = indented_text
                    else:
                        text_indent = style+':'+unicode_type(setting)+'pt;'
                if style == 'padding':
                    # setting becomes [top, right, bottom, left] in pt
                    setting = re.sub('pt', '', setting).split(' ')
                    if int(setting[1]) < 16 and int(setting[3]) < 16:
                        if self.in_blockquote:
                            debugabby = True
                            if is_paragraph:
                                self.in_blockquote = False
                                blockquote_close_loop = blockquote_close
                        if int(setting[3]) > 8 and text_indent == '':
                            text_indent = indented_text
                        if int(setting[0]) > 5:
                            paragraph_before = empty_paragraph
                        if int(setting[2]) > 5:
                            paragraph_after = empty_paragraph
                    elif not self.in_blockquote and self.previous_was_paragraph:
                        # large side padding after a paragraph starts a blockquote
                        debugabby = True
                        self.in_blockquote = True
                        blockquote_open_loop = blockquote_open
                    if debugabby:
                        self.log.debug('\n\n******\n')
                        self.log.debug('padding top is: '+unicode_type(setting[0]))
                        self.log.debug('padding right is:' +unicode_type(setting[1]))
                        self.log.debug('padding bottom is: ' + unicode_type(setting[2]))
                        self.log.debug('padding left is: ' +unicode_type(setting[3]))

            # print "text-align is: "+unicode_type(text_align)
            # print "\n***\nline is:\n "+unicode_type(match.group(0))+'\n'
            if debugabby:
                # print "this line is a paragraph = "+unicode_type(is_paragraph)+", previous line was "+unicode_type(self.previous_was_paragraph)
                self.log.debug("styles for this line were:", styles)
                self.log.debug('newline is:')
                self.log.debug(blockquote_open_loop+blockquote_close_loop+
                               paragraph_before+'<p style="'+text_indent+text_align+
                               '">'+content+'</p>'+paragraph_after+'\n\n\n\n\n')
            # print "is_paragraph is "+unicode_type(is_paragraph)+", previous_was_paragraph is "+unicode_type(self.previous_was_paragraph)
            self.previous_was_paragraph = is_paragraph
            # print "previous_was_paragraph is now set to "+unicode_type(self.previous_was_paragraph)+"\n\n\n"
            return blockquote_open_loop+blockquote_close_loop+paragraph_before+'<p style="'+text_indent+text_align+'">'+content+'</p>'+paragraph_after

    html = abbyy_line.sub(convert_styles, html)
    return html
|
||||
|
||||
def __call__(self, html):
    """Run the full heuristic-processing pipeline over *html* and return the
    transformed markup. Individual stages are gated on self.extra_opts flags
    and on document statistics gathered along the way."""
    self.log.debug("********* Heuristic processing HTML *********")
    # Count the words in the document to estimate how many chapters to look for and whether
    # other types of processing are attempted
    try:
        self.totalwords = self.get_word_count(html)
    except:
        # NOTE(review): bare except swallows everything (incl. SystemExit);
        # also relies on self.totalwords having been initialised elsewhere,
        # otherwise the comparison below raises -- confirm against __init__.
        self.log.warn("Can't get wordcount")

    if self.totalwords < 50:
        self.log.warn("flow is too short, not running heuristics")
        return html

    is_abbyy = self.is_abbyy(html)
    if is_abbyy:
        html = self.abbyy_processor(html)

    # Arrange line feeds and </p> tags so the line_length and no_markup functions work correctly
    html = self.arrange_htm_line_endings(html)
    # self.dump(html, 'after_arrange_line_endings')
    if self.cleanup_required():
        # ##### Check Markup ######
        #
        # some lit files don't have any <p> tags or equivalent (generally just plain text between
        # <pre> tags), check and mark up line endings if required before proceeding
        # fix indents must run after this step
        if self.no_markup(html, 0.1):
            self.log.debug("not enough paragraph markers, adding now")
            # markup using text processing
            html = self.markup_pre(html)

    # Replace series of non-breaking spaces with text-indent
    if getattr(self.extra_opts, 'fix_indents', False):
        html = self.fix_nbsp_indents(html)

    if self.cleanup_required():
        # fix indents must run before this step, as it removes non-breaking spaces
        html = self.cleanup_markup(html)

    is_pdftohtml = self.is_pdftohtml(html)
    if is_pdftohtml:
        # pdftohtml only emits <p>; narrow the line open/close patterns
        self.line_open = "<(?P<outer>p)[^>]*>(\\s*<[ibu][^>]*>)?\\s*"
        self.line_close = "\\s*(</[ibu][^>]*>\\s*)?</(?P=outer)>"

    # ADE doesn't render <br />, change to empty paragraphs
    # html = re.sub('<br[^>]*>', u'<p>\u00a0</p>', html)

    # Determine whether the document uses interleaved blank lines
    self.blanks_between_paragraphs = self.analyze_blanks(html)

    # detect chapters/sections to match xpath or splitting logic

    if getattr(self.extra_opts, 'markup_chapter_headings', False):
        html = self.markup_chapters(html, self.totalwords, self.blanks_between_paragraphs)
    # self.dump(html, 'after_chapter_markup')

    if getattr(self.extra_opts, 'italicize_common_cases', False):
        html = self.markup_italicis(html)

    # If more than 40% of the lines are empty paragraphs and the user has enabled delete
    # blank paragraphs then delete blank lines to clean up spacing
    if self.blanks_between_paragraphs and getattr(self.extra_opts, 'delete_blank_paragraphs', False):
        self.log.debug("deleting blank lines")
        self.blanks_deleted = True
        html = self.multi_blank.sub('\n<p class="softbreak" style="margin-top:.5em; page-break-before:avoid; text-align:center"> </p>', html)
        html = self.blankreg.sub('', html)

    # Determine line ending type
    # Some OCR sourced files have line breaks in the html using a combination of span & p tags
    # span are used for hard line breaks, p for new paragraphs. Determine which is used so
    # that lines can be un-wrapped across page boundaries
    format = self.analyze_line_endings(html)

    # Check Line histogram to determine if the document uses hard line breaks, If 50% or
    # more of the lines break in the same region of the document then unwrapping is required
    docanalysis = DocAnalysis(format, html)
    hardbreaks = docanalysis.line_histogram(.50)
    self.log.debug("Hard line breaks check returned "+unicode_type(hardbreaks))

    # Calculate Length
    unwrap_factor = getattr(self.extra_opts, 'html_unwrap_factor', 0.4)
    length = docanalysis.line_length(unwrap_factor)
    self.log.debug("Median line length is " + unicode_type(length) + ", calculated with " + format + " format")

    # ##### Unwrap lines ######
    if getattr(self.extra_opts, 'unwrap_lines', False):
        # only go through unwrapping code if the histogram shows unwrapping is required or if the user decreased the default unwrap_factor
        if hardbreaks or unwrap_factor < 0.4:
            self.log.debug("Unwrapping required, unwrapping Lines")
            # Dehyphenate with line length limiters
            dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
            html = dehyphenator(html,'html', length)
            html = self.punctuation_unwrap(length, html, 'html')

    if getattr(self.extra_opts, 'dehyphenate', False):
        # dehyphenate in cleanup mode to fix anything previous conversions/editing missed
        self.log.debug("Fixing hyphenated content")
        dehyphenator = Dehyphenator(self.extra_opts.verbose, self.log)
        html = dehyphenator(html,'html_cleanup', length)
        html = dehyphenator(html, 'individual_words', length)

    # If still no sections after unwrapping mark split points on lines with no punctuation
    if self.html_preprocess_sections < self.min_chapters and getattr(self.extra_opts, 'markup_chapter_headings', False):
        self.log.debug("Looking for more split points based on punctuation,"
                       " currently have " + unicode_type(self.html_preprocess_sections))
        chapdetect3 = re.compile(
            r'<(?P<styles>(p|div)[^>]*)>\s*(?P<section>(<span[^>]*>)?\s*(?!([\W]+\s*)+)'
            r'(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*(<[ibu][^>]*>){0,2}\s*(<span[^>]*>)?\s*'
            r'.?(?=[a-z#\-*\s]+<)([a-z#-*]+\s*){1,5}\s*\s*(</span>)?(</[ibu]>){0,2}\s*'
            r'(</span>)?\s*(</[ibu]>){0,2}\s*(</span>)?\s*</(p|div)>)', re.IGNORECASE)
        html = chapdetect3.sub(self.chapter_break, html)

    if getattr(self.extra_opts, 'renumber_headings', False):
        # search for places where a first or second level heading is immediately followed by another
        # top level heading. demote the second heading to h3 to prevent splitting between chapter
        # headings and titles, images, etc
        doubleheading = re.compile(
            r'(?P<firsthead><h(1|2)[^>]*>.+?</h(1|2)>\s*(<(?!h\d)[^>]*>\s*)*)<h(1|2)(?P<secondhead>[^>]*>.+?)</h(1|2)>', re.IGNORECASE)
        html = doubleheading.sub('\\g<firsthead>'+'\n<h3'+'\\g<secondhead>'+'</h3>', html)

    # If scene break formatting is enabled, find all blank paragraphs that definitely aren't scenebreaks,
    # style it with the 'whitespace' class.  All remaining blank lines are styled as softbreaks.
    # Multiple sequential blank paragraphs are merged with appropriate margins
    # If non-blank scene breaks exist they are center aligned and styled with appropriate margins.
    if getattr(self.extra_opts, 'format_scene_breaks', False):
        self.log.debug('Formatting scene breaks')
        html = re.sub('(?i)<div[^>]*>\\s*<br(\\s?/)?>\\s*</div>', '<p></p>', html)
        html = self.detect_scene_breaks(html)
        html = self.detect_whitespace(html)
        html = self.detect_soft_breaks(html)
        blanks_count = len(self.any_multi_blank.findall(html))
        if blanks_count >= 1:
            html = self.merge_blanks(html, blanks_count)
        detected_scene_break = re.compile(r'<p class="scenebreak"[^>]*>.*?</p>')
        scene_break_count = len(detected_scene_break.findall(html))
        # If the user has enabled scene break replacement, then either softbreaks
        # or 'hard' scene breaks are replaced, depending on which is in use
        # Otherwise separator lines are centered, use a bit larger margin in this case
        replacement_break = getattr(self.extra_opts, 'replace_scene_breaks', None)
        if replacement_break:
            replacement_break = self.markup_user_break(replacement_break)
            if scene_break_count >= 1:
                html = detected_scene_break.sub(replacement_break, html)
                html = re.sub('<p\\s+class="softbreak"[^>]*>\\s*</p>', replacement_break, html)
            else:
                html = re.sub('<p\\s+class="softbreak"[^>]*>\\s*</p>', replacement_break, html)

    if self.deleted_nbsps:
        # put back non-breaking spaces in empty paragraphs so they render correctly
        html = self.anyblank.sub('\n'+r'\g<openline>'+'\u00a0'+r'\g<closeline>', html)
    return html
|
||||
11
ebook_converter/ebooks/docx/__init__.py
Normal file
11
ebook_converter/ebooks/docx/__init__.py
Normal file
@@ -0,0 +1,11 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=utf-8
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
|
||||
class InvalidDOCX(ValueError):
|
||||
pass
|
||||
|
||||
478
ebook_converter/ebooks/docx/block_styles.py
Normal file
478
ebook_converter/ebooks/docx/block_styles.py
Normal file
@@ -0,0 +1,478 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=utf-8
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
import numbers
|
||||
from collections import OrderedDict
|
||||
from polyglot.builtins import iteritems
|
||||
|
||||
|
||||
class Inherit(object):
    """Sentinel type whose single instance marks "inherit from parent style".

    Comparison semantics: equal only to itself, never less than anything,
    greater-than-or-equal to everything, and less-than-or-equal only to
    itself.
    """

    def __eq__(self, other):
        # Identity comparison: only the singleton equals itself
        return self is other

    def __hash__(self):
        return id(self)

    def __lt__(self, other):
        return False

    def __gt__(self, other):
        return self is not other

    def __ge__(self, other):
        # gt-or-eq covers every case, so this is unconditionally True
        # (the original branched on identity but returned True either way)
        return True

    def __le__(self, other):
        # lt is always False, so le reduces to equality (identity)
        return self is other


# Module-level singleton used throughout as the "inherited value" marker.
inherit = Inherit()
|
||||
|
||||
|
||||
def binary_property(parent, name, XPath, get):
    """Read an OOXML on/off (toggle) child element ``w:<name>`` of *parent*.

    Returns True/False when the element is present, or the module-level
    ``inherit`` sentinel when it is absent. A missing ``w:val`` attribute
    defaults to 'on', per the OOXML toggle-property convention.
    """
    matches = XPath('./w:%s' % name)(parent)
    if not matches:
        return inherit
    return get(matches[0], 'w:val', 'on') in {'on', '1', 'true'}
|
||||
|
||||
|
||||
def simple_color(col, auto='black'):
    """Convert a six-digit OOXML hex colour into CSS '#rrggbb' form.

    Anything empty, the literal 'auto', or not exactly six characters
    long falls back to *auto*.
    """
    if col and col != 'auto' and len(col) == 6:
        return '#' + col
    return auto
|
||||
|
||||
|
||||
def simple_float(val, mult=1.0):
    """Parse *val* as a float scaled by *mult*; return None on any failure."""
    try:
        return float(val) * mult
    except (ValueError, TypeError, AttributeError, KeyError):
        # Unparseable input yields None, mirroring an absent attribute
        return None
|
||||
|
||||
|
||||
def twips(val, mult=0.05):
    """Parse *val* as twentieths of a point (scaled by *mult*), or as a
    number with a 'pt' suffix meaning points.

    Returns None when the value cannot be parsed.
    """
    try:
        return float(val) * mult
    except (ValueError, TypeError, AttributeError, KeyError):
        pass
    # Only honour the 'pt' suffix on a top-level call (default mult)
    if mult == 0.05 and val and val.endswith('pt'):
        return twips(val[:-2], mult=1.0)
    return None
|
||||
|
||||
|
||||
# Maps OOXML w:val border-style tokens to their closest CSS border-style
# equivalents; art/compound styles collapse to the nearest plain CSS style.
LINE_STYLES = {  # {{{
    'basicBlackDashes': 'dashed',
    'basicBlackDots': 'dotted',
    'basicBlackSquares': 'dashed',
    'basicThinLines': 'solid',
    'dashDotStroked': 'groove',
    'dashed': 'dashed',
    'dashSmallGap': 'dashed',
    'dotDash': 'dashed',
    'dotDotDash': 'dashed',
    'dotted': 'dotted',
    'double': 'double',
    'inset': 'inset',
    'nil': 'none',
    'none': 'none',
    'outset': 'outset',
    'single': 'solid',
    'thick': 'solid',
    'thickThinLargeGap': 'double',
    'thickThinMediumGap': 'double',
    'thickThinSmallGap' : 'double',
    'thinThickLargeGap': 'double',
    'thinThickMediumGap': 'double',
    'thinThickSmallGap': 'double',
    'thinThickThinLargeGap': 'double',
    'thinThickThinMediumGap': 'double',
    'thinThickThinSmallGap': 'double',
    'threeDEmboss': 'ridge',
    'threeDEngrave': 'groove',
    'triple': 'double',
}  # }}}

# Read from XML {{{

# Attribute-name templates produced for each border edge ('%s' is the edge
# name), and the edge names themselves ('between' is the pseudo-edge for
# borders drawn between consecutive paragraphs).
border_props = ('padding_%s', 'border_%s_width', 'border_%s_style', 'border_%s_color')
border_edges = ('left', 'top', 'right', 'bottom', 'between')
|
||||
|
||||
|
||||
def read_single_border(parent, edge, XPath, get):
    """Parse the w:<edge> child of a border container element.

    Returns a dict mapping the border_props name templates to the parsed
    (padding, width, style, color) values; anything absent or unparseable
    stays None so callers can distinguish "not specified".
    """
    color = style = width = padding = None
    for elem in XPath('./w:%s' % edge)(parent):
        c = get(elem, 'w:color')
        if c is not None:
            color = simple_color(c)
        s = get(elem, 'w:val')
        if s is not None:
            # Unknown line styles fall back to a plain solid CSS border
            style = LINE_STYLES.get(s, 'solid')
        space = get(elem, 'w:space')
        if space is not None:
            try:
                padding = float(space)
            except (ValueError, TypeError):
                pass
        sz = get(elem, 'w:sz')
        if sz is not None:
            # we dont care about art borders (they are only used for page borders)
            # w:sz is in eighths of a point; clamp to Word's 2..96 range
            try:
                width = min(96, max(2, float(sz))) / 8
            except (ValueError, TypeError):
                pass
    return {p:v for p, v in zip(border_props, (padding, width, style, color))}
|
||||
|
||||
|
||||
def read_border(parent, dest, XPath, get, border_edges=border_edges, name='pBdr'):
    """Read a border container element (default w:pBdr) from *parent* and
    set the full border_*/padding_* attribute set on *dest*.

    Attributes not specified in the XML are set to ``inherit``.
    """
    # Start every edge/property combination at the inherit sentinel
    vals = {k % edge:inherit for edge in border_edges for k in border_props}

    for border in XPath('./w:' + name)(parent):
        for edge in border_edges:
            for prop, val in iteritems(read_single_border(border, edge, XPath, get)):
                if val is not None:
                    vals[prop % edge] = val

    for key, val in iteritems(vals):
        setattr(dest, key, val)
|
||||
|
||||
|
||||
def border_to_css(edge, style, css):
    """Translate the border_<edge>_{style,color,width} attributes of
    *style* into CSS border properties written into the *css* dict.

    Inherit-sentinel and None values are skipped entirely.
    """
    bs = getattr(style, 'border_%s_style' % edge)
    bc = getattr(style, 'border_%s_color' % edge)
    bw = getattr(style, 'border_%s_width' % edge)
    if isinstance(bw, numbers.Number):
        # WebKit needs at least 1pt to render borders and 3pt to render double borders
        bw = max(bw, (3 if bs == 'double' else 1))
    if bs is not inherit and bs is not None:
        css['border-%s-style' % edge] = bs
    if bc is not inherit and bc is not None:
        css['border-%s-color' % edge] = bc
    if bw is not inherit and bw is not None:
        if isinstance(bw, numbers.Number):
            bw = '%.3gpt' % bw
        css['border-%s-width' % edge] = bw
|
||||
|
||||
|
||||
def read_indent(parent, dest, XPath, get):
    """Read w:ind (indentation) into dest.margin_left, dest.margin_right
    and dest.text_indent.

    Twip-based values (w:left etc.) are scaled by 0.05 into points; the
    *Chars variants (hundredths of a character) are scaled by 0.01 and
    emitted in 'em' units. Hanging indents become negative text-indents.
    """
    padding_left = padding_right = text_indent = inherit
    for indent in XPath('./w:ind')(parent):
        # leftChars takes precedence over left when both are given
        l, lc = get(indent, 'w:left'), get(indent, 'w:leftChars')
        pl = simple_float(lc, 0.01) if lc is not None else simple_float(l, 0.05) if l is not None else None
        if pl is not None:
            padding_left = '%.3g%s' % (pl, 'em' if lc is not None else 'pt')

        r, rc = get(indent, 'w:right'), get(indent, 'w:rightChars')
        pr = simple_float(rc, 0.01) if rc is not None else simple_float(r, 0.05) if r is not None else None
        if pr is not None:
            padding_right = '%.3g%s' % (pr, 'em' if rc is not None else 'pt')

        # A hanging indent is negated; otherwise the firstLine value is used
        h, hc = get(indent, 'w:hanging'), get(indent, 'w:hangingChars')
        fl, flc = get(indent, 'w:firstLine'), get(indent, 'w:firstLineChars')
        h = h if h is None else '-'+h
        hc = hc if hc is None else '-'+hc
        ti = (simple_float(hc, 0.01) if hc is not None else simple_float(h, 0.05) if h is not None else
              simple_float(flc, 0.01) if flc is not None else simple_float(fl, 0.05) if fl is not None else None)
        if ti is not None:
            # 'em' only when a Chars variant actually supplied the value
            text_indent = '%.3g%s' % (ti, 'em' if hc is not None or (h is None and flc is not None) else 'pt')

    setattr(dest, 'margin_left', padding_left)
    setattr(dest, 'margin_right', padding_right)
    setattr(dest, 'text_indent', text_indent)
|
||||
|
||||
|
||||
def read_justification(parent, dest, XPath, get):
    """Read w:jc (paragraph justification) from *parent* and set
    dest.text_align to a CSS text-align value (or ``inherit``).

    'both'/'distribute' and the thai/kashida justification variants map
    to CSS 'justify'; the plain alignment keywords pass through as-is.
    """
    ans = inherit
    for jc in XPath('./w:jc[@w:val]')(parent):
        val = get(jc, 'w:val')
        if not val:
            continue
        if val in {'both', 'distribute'} or 'thai' in val or 'kashida' in val:
            ans = 'justify'
        elif val in {'left', 'center', 'right', 'start', 'end'}:
            # CSS understands start/end directly, so no remapping is needed.
            # NOTE: a third branch that remapped 'start'/'end' to
            # 'left'/'right' was shadowed by this one and could never run;
            # it has been removed as dead code.
            ans = val
    setattr(dest, 'text_align', ans)
|
||||
|
||||
|
||||
def read_spacing(parent, dest, XPath, get):
    """Read w:spacing into dest.margin_top, dest.margin_bottom and
    dest.line_height.

    Twip values scale by 0.05 into points; the *Lines variants
    (hundredths of a line) scale by 0.02 and are emitted in 'ex' units;
    autospacing suppresses the corresponding margin entirely.
    """
    padding_top = padding_bottom = line_height = inherit
    for s in XPath('./w:spacing')(parent):
        a, al, aa = get(s, 'w:after'), get(s, 'w:afterLines'), get(s, 'w:afterAutospacing')
        pb = None if aa in {'on', '1', 'true'} else simple_float(al, 0.02) if al is not None else simple_float(a, 0.05) if a is not None else None
        if pb is not None:
            padding_bottom = '%.3g%s' % (pb, 'ex' if al is not None else 'pt')

        b, bl, bb = get(s, 'w:before'), get(s, 'w:beforeLines'), get(s, 'w:beforeAutospacing')
        pt = None if bb in {'on', '1', 'true'} else simple_float(bl, 0.02) if bl is not None else simple_float(b, 0.05) if b is not None else None
        if pt is not None:
            padding_top = '%.3g%s' % (pt, 'ex' if bl is not None else 'pt')

        # 'exact'/'atLeast' line heights are twips (points); 'auto' is in
        # 240ths of a line and becomes a unitless CSS line-height multiplier
        l, lr = get(s, 'w:line'), get(s, 'w:lineRule', 'auto')
        if l is not None:
            lh = simple_float(l, 0.05) if lr in {'exact', 'atLeast'} else simple_float(l, 1/240.0)
            if lh is not None:
                line_height = '%.3g%s' % (lh, 'pt' if lr in {'exact', 'atLeast'} else '')

    setattr(dest, 'margin_top', padding_top)
    setattr(dest, 'margin_bottom', padding_bottom)
    setattr(dest, 'line_height', line_height)
|
||||
|
||||
|
||||
def read_shd(parent, dest, XPath, get):
    """Set dest.background_color from w:shd/@w:fill ('auto' maps to
    transparent, absence to ``inherit``)."""
    ans = inherit
    for shd in XPath('./w:shd[@w:fill]')(parent):
        val = get(shd, 'w:fill')
        if val:
            ans = simple_color(val, auto='transparent')
    setattr(dest, 'background_color', ans)
|
||||
|
||||
|
||||
def read_numbering(parent, dest, XPath, get):
    """Read list information (w:numPr): the numbering-definition id and
    the zero-based list level, stored on dest as numbering_id and
    numbering_level (``inherit`` when absent)."""
    lvl = num_id = inherit
    for np in XPath('./w:numPr')(parent):
        for ilvl in XPath('./w:ilvl[@w:val]')(np):
            try:
                lvl = int(get(ilvl, 'w:val'))
            except (ValueError, TypeError):
                pass
        for num in XPath('./w:numId[@w:val]')(np):
            # Kept as a string: resolved against numbering.xml later
            num_id = get(num, 'w:val')
    setattr(dest, 'numbering_id', num_id)
    setattr(dest, 'numbering_level', lvl)
|
||||
|
||||
|
||||
class Frame(object):
    """A Word text frame (w:framePr): a block positioned/floated on the
    page, also used by Word to implement drop caps.

    All length attributes are converted from twentieths of a point
    (twips) into points at parse time.
    """

    all_attributes = ('drop_cap', 'h', 'w', 'h_anchor', 'h_rule', 'v_anchor', 'wrap',
                      'h_space', 'v_space', 'lines', 'x_align', 'y_align', 'x', 'y')

    def __init__(self, fp, XPath, get):
        # 'drop' or 'margin' marks this frame as a drop cap
        self.drop_cap = get(fp, 'w:dropCap', 'none')
        try:
            self.h = int(get(fp, 'w:h'))/20
        except (ValueError, TypeError):
            self.h = 0
        try:
            self.w = int(get(fp, 'w:w'))/20
        except (ValueError, TypeError):
            # None means: no explicit width specified
            self.w = None
        try:
            self.x = int(get(fp, 'w:x'))/20
        except (ValueError, TypeError):
            self.x = 0
        try:
            self.y = int(get(fp, 'w:y'))/20
        except (ValueError, TypeError):
            self.y = 0

        self.h_anchor = get(fp, 'w:hAnchor', 'page')
        self.h_rule = get(fp, 'w:hRule', 'auto')
        self.v_anchor = get(fp, 'w:vAnchor', 'page')
        self.wrap = get(fp, 'w:wrap', 'around')
        self.x_align = get(fp, 'w:xAlign')
        self.y_align = get(fp, 'w:yAlign')

        try:
            self.h_space = int(get(fp, 'w:hSpace'))/20
        except (ValueError, TypeError):
            self.h_space = 0
        try:
            self.v_space = int(get(fp, 'w:vSpace'))/20
        except (ValueError, TypeError):
            self.v_space = 0
        try:
            # Number of lines a drop cap spans
            self.lines = int(get(fp, 'w:lines'))
        except (ValueError, TypeError):
            self.lines = 1

    def css(self, page):
        """Return a CSS property dict rendering this frame as a float.

        *page* supplies the page width, used to decide which side an
        explicitly positioned frame should float to.
        """
        is_dropcap = self.drop_cap in {'drop', 'margin'}
        ans = {'overflow': 'hidden'}

        if is_dropcap:
            ans['float'] = 'left'
            ans['margin'] = '0'
            ans['padding-right'] = '0.2em'
        else:
            if self.h_rule != 'auto':
                # 'atLeast' is a minimum height; 'exact' pins the height
                t = 'min-height' if self.h_rule == 'atLeast' else 'height'
                ans[t] = '%.3gpt' % self.h
            if self.w is not None:
                ans['width'] = '%.3gpt' % self.w
            ans['padding-top'] = ans['padding-bottom'] = '%.3gpt' % self.v_space
            if self.wrap not in {None, 'none'}:
                ans['padding-left'] = ans['padding-right'] = '%.3gpt' % self.h_space
                if self.x_align is None:
                    # Float towards whichever half of the page holds the frame
                    fl = 'left' if self.x/page.width < 0.5 else 'right'
                else:
                    fl = 'right' if self.x_align == 'right' else 'left'
                ans['float'] = fl
        return ans

    def __eq__(self, other):
        # Attribute-wise equality over all frame attributes; a missing
        # attribute on *other* reads as the inherit sentinel (never equal)
        for x in self.all_attributes:
            if getattr(other, x, inherit) != getattr(self, x):
                return False
        return True

    def __ne__(self, other):
        return not self.__eq__(other)
|
||||
|
||||
|
||||
def read_frame(parent, dest, XPath, get):
    """Set dest.frame to a Frame parsed from w:framePr, or ``inherit``
    when no frame properties are present."""
    ans = inherit
    for fp in XPath('./w:framePr')(parent):
        ans = Frame(fp, XPath, get)
    setattr(dest, 'frame', ans)

# }}}
|
||||
|
||||
|
||||
class ParagraphStyle(object):
    """Resolved paragraph-level formatting parsed from a w:pPr element.

    Every attribute named in ``all_properties`` holds either a concrete
    value or the module-level ``inherit`` sentinel; style cascading is
    implemented via update() and resolve_based_on().
    """

    all_properties = (
        'adjustRightInd', 'autoSpaceDE', 'autoSpaceDN', 'bidi',
        'contextualSpacing', 'keepLines', 'keepNext', 'mirrorIndents',
        'pageBreakBefore', 'snapToGrid', 'suppressLineNumbers',
        'suppressOverlap', 'topLinePunct', 'widowControl', 'wordWrap',

        # Border margins padding
        'border_left_width', 'border_left_style', 'border_left_color', 'padding_left',
        'border_top_width', 'border_top_style', 'border_top_color', 'padding_top',
        'border_right_width', 'border_right_style', 'border_right_color', 'padding_right',
        'border_bottom_width', 'border_bottom_style', 'border_bottom_color', 'padding_bottom',
        'border_between_width', 'border_between_style', 'border_between_color', 'padding_between',
        'margin_left', 'margin_top', 'margin_right', 'margin_bottom',

        # Misc.
        'text_indent', 'text_align', 'line_height', 'background_color',
        'numbering_id', 'numbering_level', 'font_family', 'font_size', 'color', 'frame',
        'cs_font_size', 'cs_font_family',
    )

    def __init__(self, namespace, pPr=None):
        """Parse *pPr* using the namespace's XPath/get helpers; with
        pPr=None every property is initialised to ``inherit``."""
        self.namespace = namespace
        self.linked_style = None
        if pPr is None:
            for p in self.all_properties:
                setattr(self, p, inherit)
        else:
            # Simple on/off toggle properties
            for p in (
                'adjustRightInd', 'autoSpaceDE', 'autoSpaceDN', 'bidi',
                'contextualSpacing', 'keepLines', 'keepNext', 'mirrorIndents',
                'pageBreakBefore', 'snapToGrid', 'suppressLineNumbers',
                'suppressOverlap', 'topLinePunct', 'widowControl', 'wordWrap',
            ):
                setattr(self, p, binary_property(pPr, p, namespace.XPath, namespace.get))

            # Dispatch to the module-level read_* helpers for the rest
            for x in ('border', 'indent', 'justification', 'spacing', 'shd', 'numbering', 'frame'):
                f = read_funcs[x]
                f(pPr, self, namespace.XPath, namespace.get)

            # Name of the linked named paragraph style (w:pStyle), if any
            for s in namespace.XPath('./w:pStyle[@w:val]')(pPr):
                self.linked_style = namespace.get(s, 'w:val')

        # Character-level properties are resolved later, elsewhere
        self.font_family = self.font_size = self.color = self.cs_font_size = self.cs_font_family = inherit

        self._css = None          # lazy cache for the css property
        self._border_key = None   # lazy cache for border_key

    def update(self, other):
        """Overlay every non-inherit property of *other* onto self."""
        for prop in self.all_properties:
            nval = getattr(other, prop)
            if nval is not inherit:
                setattr(self, prop, nval)
        if other.linked_style is not None:
            self.linked_style = other.linked_style

    def resolve_based_on(self, parent):
        """Fill in any still-inherit property from the *parent* style."""
        for p in self.all_properties:
            val = getattr(self, p)
            if val is inherit:
                setattr(self, p, getattr(parent, p))

    @property
    def css(self):
        """Lazily build and cache the CSS dict for this paragraph style."""
        if self._css is None:
            self._css = c = OrderedDict()
            if self.keepLines is True:
                c['page-break-inside'] = 'avoid'
            if self.pageBreakBefore is True:
                c['page-break-after'] = 'avoid' if False else c.get('page-break-after', None) or 'always'  # placeholder
            # (see below for actual logic)
        return self._css
|
||||
|
||||
|
||||
# Dispatch table: every read_* function in this module keyed by the name
# with the 'read_' prefix stripped, e.g. read_funcs['border'] is read_border.
read_funcs = {k[5:]:v for k, v in iteritems(globals()) if k.startswith('read_')}
|
||||
302
ebook_converter/ebooks/docx/char_styles.py
Normal file
302
ebook_converter/ebooks/docx/char_styles.py
Normal file
@@ -0,0 +1,302 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=utf-8
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
from collections import OrderedDict
|
||||
from calibre.ebooks.docx.block_styles import ( # noqa
|
||||
inherit, simple_color, LINE_STYLES, simple_float, binary_property, read_shd)
|
||||
|
||||
# Read from XML {{{
|
||||
|
||||
|
||||
def read_text_border(parent, dest, XPath, get):
    """Read a run-level text border (w:bdr) into dest.border_color,
    dest.border_style, dest.border_width and dest.padding.

    All four default to ``inherit`` when no border is present.
    """
    border_color = border_style = border_width = padding = inherit
    elems = XPath('./w:bdr')(parent)
    if elems and elems[0].attrib:
        # A w:bdr element with attributes implies a default border
        border_color = simple_color('auto')
        border_style = 'none'
        border_width = 1
    for elem in elems:
        color = get(elem, 'w:color')
        if color is not None:
            border_color = simple_color(color)
        style = get(elem, 'w:val')
        if style is not None:
            # Unknown line styles fall back to a solid border
            border_style = LINE_STYLES.get(style, 'solid')
        space = get(elem, 'w:space')
        if space is not None:
            try:
                padding = float(space)
            except (ValueError, TypeError):
                pass
        sz = get(elem, 'w:sz')
        if sz is not None:
            # we dont care about art borders (they are only used for page borders)
            try:
                # A border of less than 1pt is not rendered by WebKit
                border_width = min(96, max(8, float(sz))) / 8
            except (ValueError, TypeError):
                pass

    setattr(dest, 'border_color', border_color)
    setattr(dest, 'border_style', border_style)
    setattr(dest, 'border_width', border_width)
    setattr(dest, 'padding', padding)
|
||||
|
||||
|
||||
def read_color(parent, dest, XPath, get):
    """Set dest.color from w:color/@w:val, converted to a CSS colour
    (``inherit`` when absent or empty)."""
    ans = inherit
    for col in XPath('./w:color[@w:val]')(parent):
        val = get(col, 'w:val')
        if not val:
            continue
        ans = simple_color(val)
    setattr(dest, 'color', ans)
|
||||
|
||||
|
||||
def convert_highlight_color(val):
    """Map the OOXML highlight colour names that have no CSS keyword
    equivalent to hex values; any other name is returned unchanged."""
    translation = {
        'darkBlue': '#000080',
        'darkCyan': '#008080',
        'darkGray': '#808080',
        'darkGreen': '#008000',
        'darkMagenta': '#800080',
        'darkRed': '#800000',
        'darkYellow': '#808000',
        'lightGray': '#c0c0c0',
    }
    return translation.get(val, val)
|
||||
|
||||
|
||||
def read_highlight(parent, dest, XPath, get):
    """Set dest.highlight from w:highlight/@w:val, mapping Word colour
    names to CSS values ('none' becomes 'transparent')."""
    ans = inherit
    for col in XPath('./w:highlight[@w:val]')(parent):
        val = get(col, 'w:val')
        if not val:
            continue
        # NOTE(review): the 'not val' half of this test is dead — empty
        # values were already skipped by the continue above
        if not val or val == 'none':
            val = 'transparent'
        else:
            val = convert_highlight_color(val)
        ans = val
    setattr(dest, 'highlight', ans)
|
||||
|
||||
|
||||
def read_lang(parent, dest, XPath, get):
    """Set dest.lang from w:lang/@w:val.

    Hexadecimal values are treated as Windows LCID codes and translated
    to language codes via the lcid table when possible; anything else
    (e.g. 'en-US') is used verbatim.
    """
    ans = inherit
    for col in XPath('./w:lang[@w:val]')(parent):
        val = get(col, 'w:val')
        if not val:
            continue
        try:
            code = int(val, 16)
        except (ValueError, TypeError):
            # Not a hex LCID: use the raw value as the language tag
            ans = val
        else:
            from calibre.ebooks.docx.lcid import lcid
            val = lcid.get(code, None)
            if val:
                ans = val
    setattr(dest, 'lang', ans)
|
||||
|
||||
|
||||
def read_letter_spacing(parent, dest, XPath, get):
    """Set dest.letter_spacing from w:spacing/@w:val (twips scaled by
    0.05 into points; ``inherit`` when absent or unparseable)."""
    ans = inherit
    for col in XPath('./w:spacing[@w:val]')(parent):
        val = simple_float(get(col, 'w:val'), 0.05)
        if val is not None:
            ans = val
    setattr(dest, 'letter_spacing', ans)
|
||||
|
||||
|
||||
def read_underline(parent, dest, XPath, get):
    """Set dest.text_decoration from w:u/@w:val.

    Every underline variant other than 'none' is flattened to a plain
    CSS 'underline'.
    """
    ans = inherit
    for col in XPath('./w:u[@w:val]')(parent):
        val = get(col, 'w:val')
        if val:
            ans = val if val == 'none' else 'underline'
    setattr(dest, 'text_decoration', ans)
|
||||
|
||||
|
||||
def read_vert_align(parent, dest, XPath, get):
    """Set dest.vert_align from w:vertAlign/@w:val; only the three legal
    values (baseline/subscript/superscript) are accepted."""
    ans = inherit
    for col in XPath('./w:vertAlign[@w:val]')(parent):
        val = get(col, 'w:val')
        if val and val in {'baseline', 'subscript', 'superscript'}:
            ans = val
    setattr(dest, 'vert_align', ans)
|
||||
|
||||
|
||||
def read_position(parent, dest, XPath, get):
    """Set dest.position (vertical text offset) from w:position/@w:val,
    converting half-points to points (``inherit`` when unparseable)."""
    ans = inherit
    for col in XPath('./w:position[@w:val]')(parent):
        val = get(col, 'w:val')
        try:
            ans = float(val)/2.0
        except Exception:
            pass
    setattr(dest, 'position', ans)
|
||||
|
||||
|
||||
def read_font(parent, dest, XPath, get):
    """Read the ASCII font family (w:rFonts) and font size (w:sz) into
    dest.font_family and dest.font_size.

    Theme font references are wrapped in '|name|' so later resolution
    code can distinguish them from literal family names. w:sz values are
    half-points, hence the 0.5 multiplier.
    """
    ff = inherit
    for col in XPath('./w:rFonts')(parent):
        val = get(col, 'w:asciiTheme')
        if val:
            # Mark theme references for later resolution
            val = '|%s|' % val
        else:
            val = get(col, 'w:ascii')
        if val:
            ff = val
    setattr(dest, 'font_family', ff)
    for col in XPath('./w:sz[@w:val]')(parent):
        val = simple_float(get(col, 'w:val'), 0.5)
        if val is not None:
            setattr(dest, 'font_size', val)
            return
    setattr(dest, 'font_size', inherit)
|
||||
|
||||
|
||||
def read_font_cs(parent, dest, XPath, get):
    """Read the complex-script font family (w:rFonts @w:cs/@w:csTheme)
    and size (w:szCs) into dest.cs_font_family and dest.cs_font_size.

    Mirrors read_font(): theme references are wrapped in '|name|' and
    w:szCs values are half-points (0.5 multiplier).
    """
    ff = inherit
    for col in XPath('./w:rFonts')(parent):
        val = get(col, 'w:csTheme')
        if val:
            # Mark theme references for later resolution
            val = '|%s|' % val
        else:
            val = get(col, 'w:cs')
        if val:
            ff = val
    setattr(dest, 'cs_font_family', ff)
    # BUG FIX: the element is w:szCs (XPath is case-sensitive; it was
    # queried as 'szCS'), and the parsed size must go into cs_font_size —
    # it previously clobbered dest.font_size, contradicting the
    # cs_font_size fallback below and the read_font() parallel.
    for col in XPath('./w:szCs[@w:val]')(parent):
        val = simple_float(get(col, 'w:val'), 0.5)
        if val is not None:
            setattr(dest, 'cs_font_size', val)
            return
    setattr(dest, 'cs_font_size', inherit)
|
||||
|
||||
# }}}
|
||||
|
||||
|
||||
class RunStyle(object):
    """Resolved character-level (run) formatting parsed from a w:rPr
    element. Unset properties hold the ``inherit`` sentinel."""

    all_properties = {
        'b', 'bCs', 'caps', 'cs', 'dstrike', 'emboss', 'i', 'iCs', 'imprint',
        'rtl', 'shadow', 'smallCaps', 'strike', 'vanish', 'webHidden',

        'border_color', 'border_style', 'border_width', 'padding', 'color', 'highlight', 'background_color',
        'letter_spacing', 'font_size', 'text_decoration', 'vert_align', 'lang', 'font_family', 'position',
        'cs_font_size', 'cs_font_family'
    }

    # OOXML "toggle" properties: applying them twice cancels them out
    toggle_properties = {
        'b', 'bCs', 'caps', 'emboss', 'i', 'iCs', 'imprint', 'shadow', 'smallCaps', 'strike', 'vanish',
    }

    def __init__(self, namespace, rPr=None):
        """Parse *rPr*; with rPr=None every property becomes ``inherit``."""
        self.namespace = namespace
        self.linked_style = None
        if rPr is None:
            for p in self.all_properties:
                setattr(self, p, inherit)
        else:
            X, g = namespace.XPath, namespace.get
            # Simple on/off toggle properties
            for p in (
                'b', 'bCs', 'caps', 'cs', 'dstrike', 'emboss', 'i', 'iCs', 'imprint', 'rtl', 'shadow',
                'smallCaps', 'strike', 'vanish', 'webHidden',
            ):
                setattr(self, p, binary_property(rPr, p, X, g))

            # Structured properties delegated to the module-level readers
            read_font(rPr, self, X, g)
            read_font_cs(rPr, self, X, g)
            read_text_border(rPr, self, X, g)
            read_color(rPr, self, X, g)
            read_highlight(rPr, self, X, g)
            read_shd(rPr, self, X, g)
            read_letter_spacing(rPr, self, X, g)
            read_underline(rPr, self, X, g)
            read_vert_align(rPr, self, X, g)
            read_position(rPr, self, X, g)
            read_lang(rPr, self, X, g)

            # Name of the linked named character style (w:rStyle), if any
            for s in X('./w:rStyle[@w:val]')(rPr):
                self.linked_style = g(s, 'w:val')

        self._css = None  # lazy cache for the css property

    def update(self, other):
        """Overlay every non-inherit property of *other* onto self."""
        for prop in self.all_properties:
            nval = getattr(other, prop)
            if nval is not inherit:
                setattr(self, prop, nval)
        if other.linked_style is not None:
            self.linked_style = other.linked_style

    def resolve_based_on(self, parent):
        """Fill in any still-inherit property from the *parent* style."""
        for p in self.all_properties:
            val = getattr(self, p)
            if val is inherit:
                setattr(self, p, getattr(parent, p))

    def get_border_css(self, ans):
        """Write this run's border color/style/width into the *ans* dict,
        skipping inherit-sentinel values."""
        for x in ('color', 'style', 'width'):
            val = getattr(self, 'border_'+x)
            if x == 'width' and val is not inherit:
                val = '%.3gpt' % val
            if val is not inherit:
                ans['border-%s' % x] = val

    def clear_border_css(self):
        # Reset all border properties back to inherit
        for x in ('color', 'style', 'width'):
            setattr(self, 'border_'+x, inherit)

    @property
    def css(self):
        """Lazily build and cache the CSS dict for this run style."""
        if self._css is None:
            c = self._css = OrderedDict()
            # Both strike variants collapse to a line-through decoration
            td = set()
            if self.text_decoration is not inherit:
                td.add(self.text_decoration)
            if self.strike and self.strike is not inherit:
                td.add('line-through')
            if self.dstrike and self.dstrike is not inherit:
                td.add('line-through')
            if td:
                c['text-decoration'] = ' '.join(td)
            if self.caps is True:
                c['text-transform'] = 'uppercase'
            if self.i is True:
                c['font-style'] = 'italic'
            if self.shadow and self.shadow is not inherit:
                c['text-shadow'] = '2px 2px'
            if self.smallCaps is True:
                c['font-variant'] = 'small-caps'
            if self.vanish is True or self.webHidden is True:
                c['display'] = 'none'

            self.get_border_css(c)
            if self.padding is not inherit:
                c['padding'] = '%.3gpt' % self.padding

            for x in ('color', 'background_color'):
                val = getattr(self, x)
                if val is not inherit:
                    c[x.replace('_', '-')] = val

            for x in ('letter_spacing', 'font_size'):
                val = getattr(self, x)
                if val is not inherit:
                    c[x.replace('_', '-')] = '%.3gpt' % val

            if self.position is not inherit:
                # Vertical text offset maps to vertical-align in points
                c['vertical-align'] = '%.3gpt' % self.position

            if self.highlight is not inherit and self.highlight != 'transparent':
                c['background-color'] = self.highlight

            if self.b:
                c['font-weight'] = 'bold'

            if self.font_family is not inherit:
                c['font-family'] = self.font_family

        return self._css

    def same_border(self, other):
        """True if this run and *other* render identical borders."""
        return self.get_border_css({}) == other.get_border_css({})
||||
235
ebook_converter/ebooks/docx/cleanup.py
Normal file
235
ebook_converter/ebooks/docx/cleanup.py
Normal file
@@ -0,0 +1,235 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=utf-8
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
import os
|
||||
from polyglot.builtins import itervalues, range
|
||||
|
||||
NBSP = '\xa0'
|
||||
|
||||
|
||||
def mergeable(previous, current):
    """Return True if the span *current* can be merged into *previous*.

    They must be immediately adjacent siblings with no intervening tail
    text, carry identical class/style/lang/dir attributes, and *current*
    must not have an id (which could be an anchor target).
    """
    if previous.tail or current.tail:
        return False
    if previous.get('class', None) != current.get('class', None):
        return False
    if current.get('id', False):
        return False
    for attr in ('style', 'lang', 'dir'):
        if previous.get(attr) != current.get(attr):
            return False
    try:
        # Adjacency check: current must be previous's immediate next sibling
        return next(previous.itersiblings()) is current
    except StopIteration:
        return False
|
||||
|
||||
|
||||
def append_text(parent, text):
    """Append *text* to the rendered content of *parent*: onto the tail of
    its last child if it has children, otherwise onto its own text."""
    if len(parent) == 0:
        parent.text = (parent.text or '') + text
    else:
        last = parent[-1]
        last.tail = (last.tail or '') + text
|
||||
|
||||
|
||||
def merge(parent, span):
    """Fold *span*'s text, children and tail into *parent*, then remove
    *span* from the tree (used to coalesce adjacent identical spans)."""
    if span.text:
        append_text(parent, span.text)
    for child in span:
        # lxml append moves the child out of span into parent
        parent.append(child)
    if span.tail:
        append_text(parent, span.tail)
    span.getparent().remove(span)
|
||||
|
||||
|
||||
def merge_run(run):
    """Merge a run (list) of mergeable sibling spans into its first span."""
    parent = run[0]
    for span in run[1:]:
        merge(parent, span)
|
||||
|
||||
|
||||
def liftable(css):
    # A <span> is liftable if all its styling would work just as well if it is
    # specified on the parent element.
    allowed = {'text', 'font', 'letter', 'color', 'background'}
    return all(prop.partition('-')[0] in allowed for prop in css)
|
||||
|
||||
|
||||
def add_text(elem, attr, text):
    """Concatenate *text* onto elem.<attr> ('text' or 'tail'), treating a
    missing/None value as the empty string."""
    existing = getattr(elem, attr) or ''
    setattr(elem, attr, existing + text)
|
||||
|
||||
|
||||
def lift(span):
    """Replace *span* by its content: splice its text, children and tail
    into the parent at the same position, then delete the element."""
    parent = span.getparent()
    idx = parent.index(span)
    try:
        last_child = span[-1]
    except IndexError:
        last_child = None

    if span.text:
        if idx == 0:
            # First child: span.text joins the parent's own text
            add_text(parent, 'text', span.text)
        else:
            add_text(parent[idx - 1], 'tail', span.text)

    # Insert children in reverse so they land in original order at idx
    for child in reversed(span):
        parent.insert(idx, child)
    parent.remove(span)

    if span.tail:
        if last_child is None:
            # No children were lifted: the tail attaches where span was
            if idx == 0:
                add_text(parent, 'text', span.tail)
            else:
                add_text(parent[idx - 1], 'tail', span.tail)
        else:
            # Tail follows the last lifted child
            add_text(last_child, 'tail', span.tail)
|
||||
|
||||
|
||||
def before_count(root, tag, limit=10):
    """Count how many descendants of the first <body> precede *tag* in
    document order, capped at *limit*.

    Returns *limit* when there is no body or *tag* is at least *limit*
    elements in. NOTE(review): implicitly returns None if *tag* is not a
    descendant and the body holds fewer than *limit* elements — callers
    presumably never hit that case; confirm before relying on it.
    """
    body = root.xpath('//body[1]')
    if not body:
        return limit
    ans = 0
    for elem in body[0].iterdescendants():
        if elem is tag:
            return ans
        ans += 1
        if ans > limit:
            return limit
|
||||
|
||||
|
||||
def wrap_contents(tag_name, elem):
    """Move all of *elem*'s content (text and children) into a fresh child
    element named *tag_name*, which becomes elem's only child."""
    wrapper = elem.makeelement(tag_name)
    wrapper.text, elem.text = elem.text, ''
    # NOTE(review): this removes children while iterating *elem*, which in
    # lxml can skip alternate children; verify multi-child inputs behave
    # as intended before changing anything here.
    for child in elem:
        elem.remove(child)
        wrapper.append(child)
    elem.append(wrapper)
|
||||
|
||||
|
||||
def cleanup_markup(log, root, styles, dest_dir, detect_cover, XPath):
    """Post-process the HTML tree produced from a DOCX.

    Normalises span markup (vertical-align wrappers, mergeable runs,
    dir attributes, liftable spans, pure bold/italic spans), converts
    explicit page-break paragraphs, and optionally detects a cover
    image. Returns the cover image path when one is detected and
    removed, otherwise None.
    """
    # Apply vertical-align
    for span in root.xpath('//span[@data-docx-vert]'):
        wrap_contents(span.attrib.pop('data-docx-vert'), span)

    # Move <hr>s outside paragraphs, if possible.
    pancestor = XPath('|'.join('ancestor::%s[1]' % x for x in ('p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6')))
    for hr in root.xpath('//span/hr'):
        p = pancestor(hr)
        if p:
            p = p[0]
            descendants = tuple(p.iterdescendants())
            # Only move the <hr> when it is the very last thing in the
            # paragraph, so no content ends up after the rule.
            if descendants[-1] is hr:
                parent = p.getparent()
                idx = parent.index(p)
                parent.insert(idx+1, hr)
                hr.tail = '\n\t'

    # Merge consecutive spans that have the same styling
    # NOTE(review): a trailing run collected at the end of the loop is
    # never merged -- confirm whether that is intentional.
    current_run = []
    for span in root.xpath('//span'):
        if not current_run:
            current_run.append(span)
        else:
            last = current_run[-1]
            if mergeable(last, span):
                current_run.append(span)
            else:
                if len(current_run) > 1:
                    merge_run(current_run)
                current_run = [span]

    # Process dir attributes
    class_map = dict(itervalues(styles.classes))
    parents = ('p', 'div') + tuple('h%d' % i for i in range(1, 7))
    for parent in root.xpath('//*[(%s)]' % ' or '.join('name()="%s"' % t for t in parents)):
        # Ensure that children of rtl parents that are not rtl have an
        # explicit dir set. Also, remove dir from children if it is the same as
        # that of the parent.
        if len(parent):
            parent_dir = parent.get('dir')
            for child in parent.iterchildren('span'):
                child_dir = child.get('dir')
                if parent_dir == 'rtl' and child_dir != 'rtl':
                    child_dir = 'ltr'
                    child.set('dir', child_dir)
                if child_dir and child_dir == parent_dir:
                    child.attrib.pop('dir')

    # Remove unnecessary span tags that are the only child of a parent block
    # element
    for parent in root.xpath('//*[(%s) and count(span)=1]' % ' or '.join('name()="%s"' % t for t in parents)):
        if len(parent) == 1 and not parent.text and not parent[0].tail and not parent[0].get('id', None):
            # We have a block whose contents are entirely enclosed in a <span>
            span = parent[0]
            span_class = span.get('class', None)
            span_css = class_map.get(span_class, {})
            span_dir = span.get('dir')
            # Lift only when the span's styling works on the parent and
            # its direction does not conflict with the parent's.
            if liftable(span_css) and (not span_dir or span_dir == parent.get('dir')):
                pclass = parent.get('class', None)
                if span_class:
                    pclass = (pclass + ' ' + span_class) if pclass else span_class
                    parent.set('class', pclass)
                parent.text = span.text
                parent.remove(span)
                if span.get('lang'):
                    parent.set('lang', span.get('lang'))
                if span.get('dir'):
                    parent.set('dir', span.get('dir'))
                # lxml append() moves each child from span to parent.
                for child in span:
                    parent.append(child)

    # Make spans whose only styling is bold or italic into <b> and <i> tags
    for span in root.xpath('//span[@class and not(@style)]'):
        css = class_map.get(span.get('class', None), {})
        if len(css) == 1:
            if css == {'font-style':'italic'}:
                span.tag = 'i'
                del span.attrib['class']
            elif css == {'font-weight':'bold'}:
                span.tag = 'b'
                del span.attrib['class']

    # Get rid of <span>s that have no styling
    for span in root.xpath('//span[not(@class or @id or @style or @lang or @dir)]'):
        lift(span)

    # Convert <p><br style="page-break-after:always"> </p> style page breaks
    # into something the viewer will render as a page break
    for p in root.xpath('//p[br[@style="page-break-after:always"]]'):
        if len(p) == 1 and (not p[0].tail or not p[0].tail.strip()):
            p.remove(p[0])
            prefix = p.get('style', '')
            if prefix:
                prefix += '; '
            p.set('style', prefix + 'page-break-after:always')
            # Keep the paragraph non-empty so renderers do not collapse it.
            p.text = NBSP if not p.text else p.text

    if detect_cover:
        # Check if the first image in the document is possibly a cover
        img = root.xpath('//img[@src][1]')
        if img:
            img = img[0]
            path = os.path.join(dest_dir, img.get('src'))
            # Only treat it as a cover when it occurs near the start of
            # the document (fewer than 5 elements precede it).
            if os.path.exists(path) and before_count(root, img, limit=10) < 5:
                from calibre.utils.imghdr import identify
                try:
                    with lopen(path, 'rb') as imf:
                        fmt, width, height = identify(imf)
                except:
                    width, height, fmt = 0, 0, None  # noqa
                del fmt
                try:
                    # Cover heuristic: roughly portrait-to-square aspect
                    # ratio and at least ~160k pixels.
                    is_cover = 0.8 <= height/width <= 1.8 and height*width >= 160000
                except ZeroDivisionError:
                    is_cover = False
                if is_cover:
                    log.debug('Detected an image that looks like a cover')
                    img.getparent().remove(img)
                    return path
|
||||
268
ebook_converter/ebooks/docx/container.py
Normal file
268
ebook_converter/ebooks/docx/container.py
Normal file
@@ -0,0 +1,268 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=utf-8
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
import os, sys, shutil
|
||||
|
||||
from lxml import etree
|
||||
|
||||
from calibre import walk, guess_type
|
||||
from calibre.ebooks.metadata import string_to_authors, authors_to_sort_string
|
||||
from calibre.ebooks.metadata.book.base import Metadata
|
||||
from calibre.ebooks.docx import InvalidDOCX
|
||||
from calibre.ebooks.docx.names import DOCXNamespace
|
||||
from calibre.ptempfile import PersistentTemporaryDirectory
|
||||
from calibre.utils.localization import canonicalize_lang
|
||||
from calibre.utils.logging import default_log
|
||||
from calibre.utils.zipfile import ZipFile
|
||||
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||
|
||||
|
||||
def fromstring(raw, parser=None):
    # Thin wrapper so all XML in this module is parsed defensively.
    # NOTE: the ``parser`` argument is accepted for API compatibility
    # but deliberately ignored; safe_xml_fromstring chooses its own
    # hardened parser.
    return safe_xml_fromstring(raw)
|
||||
|
||||
# Read metadata {{{
|
||||
|
||||
|
||||
def read_doc_props(raw, mi, XPath):
    """Populate the Metadata object ``mi`` from docProps/core.xml.

    Reads title, subjects/keywords (as tags), creators (as authors),
    description (as comments) and languages. ``raw`` is the XML bytes,
    ``XPath`` a namespace-aware XPath factory.
    """
    root = fromstring(raw)
    titles = XPath('//dc:title')(root)
    if titles:
        title = titles[0].text
        if title and title.strip():
            mi.title = title.strip()
    tags = []
    for subject in XPath('//dc:subject')(root):
        if subject.text and subject.text.strip():
            # Commas would split tags downstream, so neutralise them.
            tags.append(subject.text.strip().replace(',', '_'))
    for keywords in XPath('//cp:keywords')(root):
        if keywords.text and keywords.text.strip():
            # Keywords may be separated by whitespace and/or commas.
            for x in keywords.text.split():
                tags.extend(y.strip() for y in x.split(',') if y.strip())
    if tags:
        mi.tags = tags
    authors = XPath('//dc:creator')(root)
    aut = []
    for author in authors:
        if author.text and author.text.strip():
            aut.extend(string_to_authors(author.text))
    if aut:
        mi.authors = aut
        mi.author_sort = authors_to_sort_string(aut)

    desc = XPath('//dc:description')(root)
    if desc:
        raw = etree.tostring(desc[0], method='text', encoding='unicode')
        raw = raw.replace('_x000d_', '')  # Word 2007 mangles newlines in the summary
        mi.comments = raw.strip()

    langs = []
    for lang in XPath('//dc:language')(root):
        if lang.text and lang.text.strip():
            l = canonicalize_lang(lang.text)
            if l:
                langs.append(l)
    if langs:
        mi.languages = langs
|
||||
|
||||
|
||||
def read_app_props(raw, mi):
    """Set ``mi.publisher`` from the <Company> element of docProps/app.xml,
    when present and non-blank."""
    root = fromstring(raw)
    matches = root.xpath('//*[local-name()="Company"]')
    if matches:
        text = matches[0].text
        if text and text.strip():
            mi.publisher = text.strip()
|
||||
|
||||
|
||||
def read_default_style_language(raw, mi, XPath):
    """Fall back to the document-default run language from styles.xml.

    Uses the first w:lang value found in the document defaults; sets
    ``mi.languages`` to a single-element list when it canonicalizes.
    """
    root = fromstring(raw)
    for lang in XPath('/w:styles/w:docDefaults/w:rPrDefault/w:rPr/w:lang/@w:val')(root):
        lang = canonicalize_lang(lang)
        if lang:
            mi.languages = [lang]
            break
|
||||
# }}}
|
||||
|
||||
|
||||
class DOCX(object):
    """Access layer over a .docx (OOXML) package.

    Either extracts the zip to a temporary directory (``extract=True``,
    tolerant of slightly broken zips) or reads members directly from the
    zip. Exposes content types, package relationships, the main document
    and package metadata.
    """

    def __init__(self, path_or_stream, log=None, extract=True):
        # Assume the common "transitional" OOXML flavour until the
        # package relationships prove otherwise.
        self.docx_is_transitional = True
        stream = path_or_stream if hasattr(path_or_stream, 'read') else open(path_or_stream, 'rb')
        self.name = getattr(stream, 'name', None) or '<stream>'
        self.log = log or default_log
        if extract:
            self.extract(stream)
        else:
            self.init_zipfile(stream)
        self.read_content_types()
        self.read_package_relationships()
        self.namespace = DOCXNamespace(self.docx_is_transitional)

    def init_zipfile(self, stream):
        # In-place mode: read members straight out of the zip.
        self.zipf = ZipFile(stream)
        self.names = frozenset(self.zipf.namelist())

    def extract(self, stream):
        """Unpack the package to a temp dir, falling back to a more
        forgiving unzipper when the zip is slightly corrupt."""
        self.tdir = PersistentTemporaryDirectory('docx_container')
        try:
            zf = ZipFile(stream)
            zf.extractall(self.tdir)
        except:
            self.log.exception('DOCX appears to be invalid ZIP file, trying a'
                    ' more forgiving ZIP parser')
            from calibre.utils.localunzip import extractall
            stream.seek(0)
            extractall(stream, self.tdir)

        # Map package member name -> extracted filesystem path.
        self.names = {}
        for f in walk(self.tdir):
            name = os.path.relpath(f, self.tdir).replace(os.sep, '/')
            self.names[name] = f

    def exists(self, name):
        # True if the package contains the named member.
        return name in self.names

    def read(self, name):
        """Return the raw bytes of a package member.

        Raises KeyError when the member does not exist (both the zip
        and the extracted-dir paths raise it).
        """
        if hasattr(self, 'zipf'):
            return self.zipf.open(name).read()
        path = self.names[name]
        with open(path, 'rb') as f:
            return f.read()

    def read_content_types(self):
        """Parse [Content_Types].xml into per-name and per-extension
        content-type maps. Raises InvalidDOCX when missing."""
        try:
            raw = self.read('[Content_Types].xml')
        except KeyError:
            raise InvalidDOCX('The file %s docx file has no [Content_Types].xml' % self.name)
        root = fromstring(raw)
        self.content_types = {}
        self.default_content_types = {}
        for item in root.xpath('//*[local-name()="Types"]/*[local-name()="Default" and @Extension and @ContentType]'):
            self.default_content_types[item.get('Extension').lower()] = item.get('ContentType')
        for item in root.xpath('//*[local-name()="Types"]/*[local-name()="Override" and @PartName and @ContentType]'):
            name = item.get('PartName').lstrip('/')
            self.content_types[name] = item.get('ContentType')

    def content_type(self, name):
        """Resolve the content type of a member: explicit override,
        then extension default, then a guess from the filename."""
        if name in self.content_types:
            return self.content_types[name]
        ext = name.rpartition('.')[-1].lower()
        if ext in self.default_content_types:
            return self.default_content_types[ext]
        return guess_type(name)[0]

    def read_package_relationships(self):
        """Parse _rels/.rels into type->target and target->type maps;
        also detects the strict vs. transitional OOXML flavour from the
        main-document relationship type. Raises InvalidDOCX when missing."""
        try:
            raw = self.read('_rels/.rels')
        except KeyError:
            raise InvalidDOCX('The file %s docx file has no _rels/.rels' % self.name)
        root = fromstring(raw)
        self.relationships = {}
        self.relationships_rmap = {}
        for item in root.xpath('//*[local-name()="Relationships"]/*[local-name()="Relationship" and @Type and @Target]'):
            target = item.get('Target').lstrip('/')
            typ = item.get('Type')
            if target == 'word/document.xml':
                # Strict OOXML uses the purl.oclc.org relationship type.
                self.docx_is_transitional = typ != 'http://purl.oclc.org/ooxml/officeDocument/relationships/officeDocument'
            self.relationships[typ] = target
            self.relationships_rmap[target] = typ

    @property
    def document_name(self):
        """Name of the main document part, via relationships or, failing
        that, by searching for a document.xml member."""
        name = self.relationships.get(self.namespace.names['DOCUMENT'], None)
        if name is None:
            names = tuple(n for n in self.names if n == 'document.xml' or n.endswith('/document.xml'))
            if not names:
                raise InvalidDOCX('The file %s docx file has no main document' % self.name)
            name = names[0]
        return name

    @property
    def document(self):
        # Parsed XML root of the main document part.
        return fromstring(self.read(self.document_name))

    @property
    def document_relationships(self):
        # (by_id, by_type) relationship maps for the main document part.
        return self.get_relationships(self.document_name)

    def get_relationships(self, name):
        """Return (by_id, by_type) relationship maps for the part
        ``name``, resolving non-external targets relative to the part's
        directory. Missing .rels yields two empty dicts."""
        base = '/'.join(name.split('/')[:-1])
        by_id, by_type = {}, {}
        parts = name.split('/')
        # The .rels file lives in a _rels subdir next to the part.
        name = '/'.join(parts[:-1] + ['_rels', parts[-1] + '.rels'])
        try:
            raw = self.read(name)
        except KeyError:
            pass
        else:
            root = fromstring(raw)
            for item in root.xpath('//*[local-name()="Relationships"]/*[local-name()="Relationship" and @Type and @Target]'):
                target = item.get('Target')
                if item.get('TargetMode', None) != 'External' and not target.startswith('#'):
                    target = '/'.join((base, target.lstrip('/')))
                typ = item.get('Type')
                Id = item.get('Id')
                by_id[Id] = by_type[typ] = target

        return by_id, by_type

    def get_document_properties_names(self):
        """Yield the core-properties part name then the app-properties
        part name (either may be None when absent)."""
        name = self.relationships.get(self.namespace.names['DOCPROPS'], None)
        if name is None:
            names = tuple(n for n in self.names if n.lower() == 'docprops/core.xml')
            if names:
                name = names[0]
        yield name
        name = self.relationships.get(self.namespace.names['APPPROPS'], None)
        if name is None:
            names = tuple(n for n in self.names if n.lower() == 'docprops/app.xml')
            if names:
                name = names[0]
        yield name

    @property
    def metadata(self):
        """Build a Metadata object from core props, default style
        language and app props."""
        mi = Metadata(_('Unknown'))
        dp_name, ap_name = self.get_document_properties_names()
        if dp_name:
            try:
                raw = self.read(dp_name)
            except KeyError:
                pass
            else:
                read_doc_props(raw, mi, self.namespace.XPath)
        if mi.is_null('language'):
            try:
                raw = self.read('word/styles.xml')
            except KeyError:
                pass
            else:
                read_default_style_language(raw, mi, self.namespace.XPath)

        # NOTE(review): this overwrites the ap_name yielded above, so the
        # docprops/app.xml filename fallback is discarded here -- confirm
        # whether that is intentional.
        ap_name = self.relationships.get(self.namespace.names['APPPROPS'], None)
        if ap_name:
            try:
                raw = self.read(ap_name)
            except KeyError:
                pass
            else:
                read_app_props(raw, mi)

        return mi

    def close(self):
        # Release the zip handle or best-effort remove the temp dir.
        if hasattr(self, 'zipf'):
            self.zipf.close()
        else:
            try:
                shutil.rmtree(self.tdir)
            except EnvironmentError:
                pass
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Manual check: print the metadata of the DOCX named on the command
    # line, reading it in place (without extracting).
    d = DOCX(sys.argv[-1], extract=False)
    print(d.metadata)
|
||||
276
ebook_converter/ebooks/docx/fields.py
Normal file
276
ebook_converter/ebooks/docx/fields.py
Normal file
@@ -0,0 +1,276 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=utf-8
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
import re
|
||||
|
||||
from calibre.ebooks.docx.index import process_index, polish_index_markup
|
||||
from polyglot.builtins import iteritems, native_string_type
|
||||
|
||||
|
||||
class Field(object):
    """A single Word field.

    Collects the instruction text found between the field's begin and
    end markers; the first word of that text is the field name (XE,
    HYPERLINK, ...), the remainder its raw instructions.
    """

    def __init__(self, start):
        self.start = start
        self.end = None
        self.contents = []
        self.buf = []
        self.instructions = None
        self.name = None

    def add_instr(self, elem):
        """Accumulate the text of a <w:instrText> element."""
        self.add_raw(elem.text)

    def add_raw(self, raw):
        """Append raw instruction text, peeling the field name off the
        first non-empty chunk."""
        if not raw:
            return
        if self.name is None:
            # There are cases where partial index entries end with
            # a significant space, along the lines of
            # <>Summary <> ... <>Hearing<>.
            # No known examples of starting with a space yet, so only
            # strip on the left before splitting off the field name.
            head, _sep, rest = raw.lstrip().partition(' ')
            self.name, raw = head, rest
        self.buf.append(raw)

    def finalize(self):
        """Freeze the accumulated chunks into ``instructions`` and drop
        the working buffer."""
        self.instructions = ''.join(self.buf)
        del self.buf
|
||||
|
||||
|
||||
# Token kinds produced by the field-instruction tokenizer below.
WORD, FLAG = 0, 1
scanner = re.Scanner([
    (r'\\\S{1}', lambda s, t: (t, FLAG)),  # A flag of the form \x
    (r'"[^"]*"', lambda s, t: (t[1:-1], WORD)),  # Quoted word
    (r'[^\s\\"]\S*', lambda s, t: (t, WORD)),  # A non-quoted word, must not start with a backslash or a space or a quote
    (r'\s+', None),
], flags=re.DOTALL)

# Sentinel used by parser() for flags that are not in a field map, so
# their argument can be collected and then discarded.
null = object()
|
||||
|
||||
|
||||
def parser(name, field_map, default_field_name=None):
    """Build a ``parse_<name>(raw, log)`` function for one field type.

    ``field_map`` is a space-separated list of ``flag:option`` pairs
    mapping single-letter field flags (``\\x``) to readable option
    names. A word that follows no flag is stored under
    ``default_field_name``.
    """

    field_map = dict((x.split(':') for x in field_map.split()))

    def parse(raw, log=None):
        ans = {}
        last_option = None
        # Protect escaped backslashes and quotes from the tokenizer,
        # restoring them per token afterwards.
        raw = raw.replace('\\\\', '\x01').replace('\\"', '\x02')
        for token, token_type in scanner.scan(raw)[0]:
            token = token.replace('\x01', '\\').replace('\x02', '"')
            if token_type is FLAG:
                # Unknown flags map to the ``null`` sentinel so their
                # argument is collected here and dropped below.
                last_option = field_map.get(token[1], null)
                if last_option is not None:
                    ans[last_option] = None
            elif token_type is WORD:
                if last_option is None:
                    ans[default_field_name] = token
                else:
                    ans[last_option] = token
                    last_option = None
        ans.pop(null, None)
        return ans

    parse.__name__ = native_string_type('parse_' + name)

    return parse
|
||||
|
||||
|
||||
# Concrete parsers for the field types handled below. The second
# argument maps single-letter flags to option names; the third names
# the key under which an unflagged word is stored.
parse_hyperlink = parser('hyperlink',
    'l:anchor m:image-map n:target o:title t:target', 'url')

parse_xe = parser('xe',
    'b:bold i:italic f:entry-type r:page-range-bookmark t:page-number-text y:yomi', 'text')

parse_index = parser('index',
    'b:bookmark c:columns-per-page d:sequence-separator e:first-page-number-separator'
    ' f:entry-type g:page-range-separator h:heading k:crossref-separator'
    ' l:page-number-separator p:letter-range s:sequence-name r:run-together y:yomi z:langcode')

parse_ref = parser('ref',
    'd:separator f:footnote h:hyperlink n:number p:position r:relative-number t:suppress w:number-full-context')

parse_noteref = parser('noteref',
    'f:footnote h:hyperlink p:position')
|
||||
|
||||
|
||||
class Fields(object):
    """Collects and interprets Word fields (hyperlinks, index entries,
    references, ...) found in a document.

    After calling the instance on a parsed document, the recognised
    fields are available as ``hyperlink_fields``, ``xe_fields``,
    ``index_fields``, ``ref_fields`` and ``noteref_fields``.
    """

    def __init__(self, namespace):
        self.namespace = namespace
        self.fields = []
        self.index_bookmark_counter = 0
        self.index_bookmark_prefix = 'index-'

    def __call__(self, doc, log):
        """Scan ``doc`` for field markers and dispatch each recognised
        field to its parse_* handler."""
        # Make sure our synthetic bookmark ids cannot collide with ids
        # already present in the document.
        all_ids = frozenset(self.namespace.XPath('//*/@w:id')(doc))
        c = 0
        while self.index_bookmark_prefix in all_ids:
            c += 1
            self.index_bookmark_prefix = self.index_bookmark_prefix.replace('-', '%d-' % c)
        # Walk markers in document order, pairing begin/end fldChars and
        # attaching intervening content to the innermost open field.
        stack = []
        for elem in self.namespace.XPath(
                '//*[name()="w:p" or name()="w:r" or'
                ' name()="w:instrText" or'
                ' (name()="w:fldChar" and (@w:fldCharType="begin" or @w:fldCharType="end") or'
                ' name()="w:fldSimple")]')(doc):
            if elem.tag.endswith('}fldChar'):
                typ = self.namespace.get(elem, 'w:fldCharType')
                if typ == 'begin':
                    stack.append(Field(elem))
                    self.fields.append(stack[-1])
                else:
                    try:
                        stack.pop().end = elem
                    except IndexError:
                        # Unbalanced end marker; ignore it.
                        pass
            elif elem.tag.endswith('}instrText'):
                if stack:
                    stack[-1].add_instr(elem)
            elif elem.tag.endswith('}fldSimple'):
                # Self-contained field: instruction is an attribute.
                field = Field(elem)
                instr = self.namespace.get(elem, 'w:instr')
                if instr:
                    field.add_raw(instr)
                self.fields.append(field)
                for r in self.namespace.XPath('descendant::w:r')(elem):
                    field.contents.append(r)
            else:
                if stack:
                    stack[-1].contents.append(elem)

        # Dispatch tables keyed by both upper-case and as-written names.
        field_types = ('hyperlink', 'xe', 'index', 'ref', 'noteref')
        parsers = {x.upper():getattr(self, 'parse_'+x) for x in field_types}
        parsers.update({x:getattr(self, 'parse_'+x) for x in field_types})
        field_parsers = {f.upper():globals()['parse_%s' % f] for f in field_types}
        field_parsers.update({f:globals()['parse_%s' % f] for f in field_types})

        for f in field_types:
            setattr(self, '%s_fields' % f, [])
        unknown_fields = {'TOC', 'toc', 'PAGEREF', 'pageref'}  # The TOC and PAGEREF fields are handled separately

        for field in self.fields:
            field.finalize()
            if field.instructions:
                func = parsers.get(field.name, None)
                if func is not None:
                    func(field, field_parsers[field.name], log)
                elif field.name not in unknown_fields:
                    # Warn once per unknown field name.
                    log.warn('Encountered unknown field: %s, ignoring it.' % field.name)
                    unknown_fields.add(field.name)

    def get_runs(self, field):
        """Group the field's <w:r> runs by containing paragraph."""
        all_runs = []
        current_runs = []
        # We only handle spans in a single paragraph
        # being wrapped in <a>
        for x in field.contents:
            if x.tag.endswith('}p'):
                if current_runs:
                    all_runs.append(current_runs)
                current_runs = []
            elif x.tag.endswith('}r'):
                current_runs.append(x)
        if current_runs:
            all_runs.append(current_runs)
        return all_runs

    def parse_hyperlink(self, field, parse_func, log):
        # Parse hyperlink fields
        hl = parse_func(field.instructions, log)
        if hl:
            if 'target' in hl and hl['target'] is None:
                # Bare \t flag means "open in a new window".
                hl['target'] = '_blank'
            for runs in self.get_runs(field):
                self.hyperlink_fields.append((hl, runs))

    def parse_ref(self, field, parse_func, log):
        # Only REF fields that are hyperlinks to a bookmark are supported.
        ref = parse_func(field.instructions, log)
        dest = ref.get(None, None)
        if dest is not None and 'hyperlink' in ref:
            for runs in self.get_runs(field):
                self.hyperlink_fields.append(({'anchor':dest}, runs))
        else:
            log.warn('Unsupported reference field (%s), ignoring: %r' % (field.name, ref))

    parse_noteref = parse_ref

    def parse_xe(self, field, parse_func, log):
        # Parse XE fields
        if None in (field.start, field.end):
            return
        xe = parse_func(field.instructions, log)
        if xe:
            # We insert a synthetic bookmark around this index item so that we
            # can link to it later
            def WORD(x):
                return self.namespace.expand('w:' + x)
            self.index_bookmark_counter += 1
            bmark = xe['anchor'] = '%s%d' % (self.index_bookmark_prefix, self.index_bookmark_counter)
            p = field.start.getparent()
            bm = p.makeelement(WORD('bookmarkStart'))
            bm.set(WORD('id'), bmark), bm.set(WORD('name'), bmark)
            p.insert(p.index(field.start), bm)
            p = field.end.getparent()
            bm = p.makeelement(WORD('bookmarkEnd'))
            bm.set(WORD('id'), bmark)
            p.insert(p.index(field.end) + 1, bm)
            xe['start_elem'] = field.start
            self.xe_fields.append(xe)

    def parse_index(self, field, parse_func, log):
        """Build index blocks from an INDEX field plus the XE entries
        collected so far; hyperlinks for the entries are recorded too."""
        if not field.contents:
            return
        idx = parse_func(field.instructions, log)
        hyperlinks, blocks = process_index(field, idx, self.xe_fields, log, self.namespace.XPath, self.namespace.expand)
        if not blocks:
            return
        for anchor, run in hyperlinks:
            self.hyperlink_fields.append(({'anchor':anchor}, [run]))

        self.index_fields.append((idx, blocks))

    def polish_markup(self, object_map):
        """Final pass over generated index markup, translating the
        internal blocks back through ``object_map``."""
        if not self.index_fields:
            return
        rmap = {v:k for k, v in iteritems(object_map)}
        for idx, blocks in self.index_fields:
            polish_index_markup(idx, [rmap[b] for b in blocks])
|
||||
|
||||
|
||||
def test_parse_fields(return_tests=False):
    """Unit tests for the field-instruction parsers.

    Returns the unittest suite when ``return_tests`` is True, otherwise
    runs it immediately.
    """
    import unittest

    class TestParseFields(unittest.TestCase):

        def test_hyperlink(self):
            ae = lambda x, y: self.assertEqual(parse_hyperlink(x, None), y)
            ae(r'\l anchor1', {'anchor':'anchor1'})
            ae(r'www.calibre-ebook.com', {'url':'www.calibre-ebook.com'})
            ae(r'www.calibre-ebook.com \t target \o tt', {'url':'www.calibre-ebook.com', 'target':'target', 'title': 'tt'})
            ae(r'"c:\\Some Folder"', {'url': 'c:\\Some Folder'})
            ae(r'xxxx \y yyyy', {'url': 'xxxx'})

        def test_xe(self):
            ae = lambda x, y: self.assertEqual(parse_xe(x, None), y)
            ae(r'"some name"', {'text':'some name'})
            ae(r'name \b \i', {'text':'name', 'bold':None, 'italic':None})
            ae(r'xxx \y a', {'text':'xxx', 'yomi':'a'})

        def test_index(self):
            ae = lambda x, y: self.assertEqual(parse_index(x, None), y)
            ae(r'', {})
            ae(r'\b \c 1', {'bookmark':None, 'columns-per-page': '1'})

    suite = unittest.TestLoader().loadTestsFromTestCase(TestParseFields)
    if return_tests:
        return suite
    unittest.TextTestRunner(verbosity=4).run(suite)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Run the field-parser unit tests directly.
    test_parse_fields()
|
||||
197
ebook_converter/ebooks/docx/fonts.py
Normal file
197
ebook_converter/ebooks/docx/fonts.py
Normal file
@@ -0,0 +1,197 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=utf-8
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
import os, re
|
||||
from collections import namedtuple
|
||||
|
||||
from calibre.ebooks.docx.block_styles import binary_property, inherit
|
||||
from calibre.utils.filenames import ascii_filename
|
||||
from calibre.utils.fonts.scanner import font_scanner, NoFonts
|
||||
from calibre.utils.fonts.utils import panose_to_css_generic_family, is_truetype_font
|
||||
from calibre.utils.icu import ord_string
|
||||
from polyglot.builtins import codepoint_to_chr, iteritems, range
|
||||
|
||||
Embed = namedtuple('Embed', 'name key subsetted')
|
||||
|
||||
|
||||
def has_system_fonts(name):
    # True if the system font scanner knows at least one font for the
    # family ``name``; NoFonts means the family is unavailable.
    try:
        return bool(font_scanner.fonts_for_family(name))
    except NoFonts:
        return False
|
||||
|
||||
|
||||
def get_variant(bold=False, italic=False):
    """Map a (bold, italic) pair to the DOCX embedded-font variant name."""
    if bold and italic:
        return 'BoldItalic'
    if bold:
        return 'Bold'
    if italic:
        return 'Italic'
    return 'Regular'
|
||||
|
||||
|
||||
def find_fonts_matching(fonts, style='normal', stretch='normal'):
    """Yield only those fonts whose font-style and font-stretch match
    the requested values exactly."""
    for candidate in fonts:
        is_match = (candidate['font-style'] == style and
                    candidate['font-stretch'] == stretch)
        if is_match:
            yield candidate
|
||||
|
||||
|
||||
def weight_key(font):
    """Sort key: distance of the font's weight from normal (400).

    Numeric weights are used directly; 'normal' and 'bold' map to 400
    and 700; anything else sorts last via a huge dummy weight.
    """
    raw = font['font-weight']
    try:
        weight = int(raw)
    except Exception:
        weight = {'normal': 400, 'bold': 700}.get(raw, 1000000)
    return abs(weight - 400)
|
||||
|
||||
|
||||
def get_best_font(fonts, style, stretch):
    # Among the fonts matching style/stretch, pick the one whose weight
    # is closest to normal (400). Returns None when nothing matches
    # (the broad except also swallows malformed font dicts).
    try:
        return sorted(find_fonts_matching(fonts, style, stretch), key=weight_key)[0]
    except Exception:
        pass
|
||||
|
||||
|
||||
class Family(object):
    """One font family declared in the DOCX fontTable.

    Resolves an available system family name (falling back through the
    declared alternates), collects embedded font variants, and derives a
    CSS generic family from the declared family type or PANOSE data.
    """

    def __init__(self, elem, embed_relationships, XPath, get):
        self.name = self.family_name = get(elem, 'w:name')
        self.alt_names = tuple(get(x, 'w:val') for x in XPath('./w:altName')(elem))
        # Prefer the first alternate that is actually installed when the
        # primary family is not available on this system.
        if self.alt_names and not has_system_fonts(self.name):
            for x in self.alt_names:
                if has_system_fonts(x):
                    self.family_name = x
                    break

        # variant name -> Embed(relationship target, obfuscation key,
        # subsetted flag) for fonts embedded in the package.
        self.embedded = {}
        for x in ('Regular', 'Bold', 'Italic', 'BoldItalic'):
            for y in XPath('./w:embed%s[@r:id]' % x)(elem):
                rid = get(y, 'r:id')
                key = get(y, 'w:fontKey')
                subsetted = get(y, 'w:subsetted') in {'1', 'true', 'on'}
                if rid in embed_relationships:
                    self.embedded[x] = Embed(embed_relationships[rid], key, subsetted)

        self.generic_family = 'auto'
        for x in XPath('./w:family[@w:val]')(elem):
            self.generic_family = get(x, 'w:val', 'auto')

        # notTrueType absent (inherit) or false means a TrueType font.
        ntt = binary_property(elem, 'notTrueType', XPath, get)
        self.is_ttf = ntt is inherit or not ntt

        # PANOSE classification: pairs of hex digits -> tuple of ints,
        # used as a fallback to guess the CSS generic family.
        self.panose1 = None
        self.panose_name = None
        for x in XPath('./w:panose1[@w:val]')(elem):
            try:
                v = get(x, 'w:val')
                v = tuple(int(v[i:i+2], 16) for i in range(0, len(v), 2))
            except (TypeError, ValueError, IndexError):
                pass
            else:
                self.panose1 = v
                self.panose_name = panose_to_css_generic_family(v)

        self.css_generic_family = {'roman':'serif', 'swiss':'sans-serif', 'modern':'monospace',
                'decorative':'fantasy', 'script':'cursive'}.get(self.generic_family, None)
        self.css_generic_family = self.css_generic_family or self.panose_name or 'serif'
|
||||
|
||||
|
||||
SYMBOL_MAPS = { # {{{
|
||||
'Wingdings': (' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '🖉', '✂', '✁', '👓', '🕭', '🕮', '🕯', '🕿', '✆', '🖂', '🖃', '📪', '📫', '📬', '📭', '🗀', '🗁', '🗎', '🗏', '🗐', '🗄', '⏳', '🖮', '🖰', '🖲', '🖳', '🖴', '🖫', '🖬', '✇', '✍', '🖎', '✌', '🖏', '👍', '👎', '☜', '☞', '☜', '🖗', '🖐', '☺', '😐', '☹', '💣', '🕱', '🏳', '🏱', '✈', '☼', '🌢', '❄', '🕆', '✞', '🕈', '✠', '✡', '☪', '☯', '🕉', '☸', '♈', '♉', '♊', '♋', '♌', '♍', '♎', '♏', '♐', '♑', '♒', '♓', '🙰', '🙵', '⚫', '🔾', '◼', '🞏', '🞐', '❑', '❒', '🞟', '⧫', '◆', '❖', '🞙', '⌧', '⮹', '⌘', '🏵', '🏶', '🙶', '🙷', ' ', '🄋', '➀', '➁', '➂', '➃', '➄', '➅', '➆', '➇', '➈', '➉', '🄌', '➊', '➋', '➌', '➍', '➎', '➏', '➐', '➑', '➒', '➓', '🙢', '🙠', '🙡', '🙣', '🙦', '🙤', '🙥', '🙧', '∙', '•', '⬝', '⭘', '🞆', '🞈', '🞊', '🞋', '🔿', '▪', '🞎', '🟀', '🟁', '★', '🟋', '🟏', '🟓', '🟑', '⯐', '⌖', '⯎', '⯏', '⯑', '✪', '✰', '🕐', '🕑', '🕒', '🕓', '🕔', '🕕', '🕖', '🕗', '🕘', '🕙', '🕚', '🕛', '⮰', '⮱', '⮲', '⮳', '⮴', '⮵', '⮶', '⮷', '🙪', '🙫', '🙕', '🙔', '🙗', '🙖', '🙐', '🙑', '🙒', '🙓', '⌫', '⌦', '⮘', '⮚', '⮙', '⮛', '⮈', '⮊', '⮉', '⮋', '🡨', '🡪', '🡩', '🡫', '🡬', '🡭', '🡯', '🡮', '🡸', '🡺', '🡹', '🡻', '🡼', '🡽', '🡿', '🡾', '⇦', '⇨', '⇧', '⇩', '⬄', '⇳', '⬁', '⬀', '⬃', '⬂', '🢬', '🢭', '🗶', '✓', '🗷', '🗹', ' '), # noqa
|
||||
|
||||
'Wingdings 2': (' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '🖊', '🖋', '🖌', '🖍', '✄', '✀', '🕾', '🕽', '🗅', '🗆', '🗇', '🗈', '🗉', '🗊', '🗋', '🗌', '🗍', '📋', '🗑', '🗔', '🖵', '🖶', '🖷', '🖸', '🖭', '🖯', '🖱', '🖒', '🖓', '🖘', '🖙', '🖚', '🖛', '👈', '👉', '🖜', '🖝', '🖞', '🖟', '🖠', '🖡', '👆', '👇', '🖢', '🖣', '🖑', '🗴', '🗸', '🗵', '☑', '⮽', '☒', '⮾', '⮿', '🛇', '⦸', '🙱', '🙴', '🙲', '🙳', '‽', '🙹', '🙺', '🙻', '🙦', '🙤', '🙥', '🙧', '🙚', '🙘', '🙙', '🙛', '⓪', '①', '②', '③', '④', '⑤', '⑥', '⑦', '⑧', '⑨', '⑩', '⓿', '❶', '❷', '❸', '❹', '❺', '❻', '❼', '❽', '❾', '❿', ' ', '☉', '🌕', '☽', '☾', '⸿', '✝', '🕇', '🕜', '🕝', '🕞', '🕟', '🕠', '🕡', '🕢', '🕣', '🕤', '🕥', '🕦', '🕧', '🙨', '🙩', '⋅', '🞄', '⦁', '●', '●', '🞅', '🞇', '🞉', '⊙', '⦿', '🞌', '🞍', '◾', '■', '□', '🞑', '🞒', '🞓', '🞔', '▣', '🞕', '🞖', '🞗', '🞘', '⬩', '⬥', '◇', '🞚', '◈', '🞛', '🞜', '🞝', '🞞', '⬪', '⬧', '◊', '🞠', '◖', '◗', '⯊', '⯋', '⯀', '⯁', '⬟', '⯂', '⬣', '⬢', '⯃', '⯄', '🞡', '🞢', '🞣', '🞤', '🞥', '🞦', '🞧', '🞨', '🞩', '🞪', '🞫', '🞬', '🞭', '🞮', '🞯', '🞰', '🞱', '🞲', '🞳', '🞴', '🞵', '🞶', '🞷', '🞸', '🞹', '🞺', '🞻', '🞼', '🞽', '🞾', '🞿', '🟀', '🟂', '🟄', '🟆', '🟉', '🟊', '✶', '🟌', '🟎', '🟐', '🟒', '✹', '🟃', '🟇', '✯', '🟍', '🟔', '⯌', '⯍', '※', '⁂', ' ', ' ', ' ', ' ', ' ', ' ',), # noqa
|
||||
|
||||
'Wingdings 3': (' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '⭠', '⭢', '⭡', '⭣', '⭤', '⭥', '⭧', '⭦', '⭰', '⭲', '⭱', '⭳', '⭶', '⭸', '⭻', '⭽', '⭤', '⭥', '⭪', '⭬', '⭫', '⭭', '⭍', '⮠', '⮡', '⮢', '⮣', '⮤', '⮥', '⮦', '⮧', '⮐', '⮑', '⮒', '⮓', '⮀', '⮃', '⭾', '⭿', '⮄', '⮆', '⮅', '⮇', '⮏', '⮍', '⮎', '⮌', '⭮', '⭯', '⎋', '⌤', '⌃', '⌥', '␣', '⍽', '⇪', '⮸', '🢠', '🢡', '🢢', '🢣', '🢤', '🢥', '🢦', '🢧', '🢨', '🢩', '🢪', '🢫', '🡐', '🡒', '🡑', '🡓', '🡔', '🡕', '🡗', '🡖', '🡘', '🡙', '▲', '▼', '△', '▽', '◀', '▶', '◁', '▷', '◣', '◢', '◤', '◥', '🞀', '🞂', '🞁', ' ', '🞃', '⯅', '⯆', '⯇', '⯈', '⮜', '⮞', '⮝', '⮟', '🠐', '🠒', '🠑', '🠓', '🠔', '🠖', '🠕', '🠗', '🠘', '🠚', '🠙', '🠛', '🠜', '🠞', '🠝', '🠟', '🠀', '🠂', '🠁', '🠃', '🠄', '🠆', '🠅', '🠇', '🠈', '🠊', '🠉', '🠋', '🠠', '🠢', '🠤', '🠦', '🠨', '🠪', '🠬', '🢜', '🢝', '🢞', '🢟', '🠮', '🠰', '🠲', '🠴', '🠶', '🠸', '🠺', '🠹', '🠻', '🢘', '🢚', '🢙', '🢛', '🠼', '🠾', '🠽', '🠿', '🡀', '🡂', '🡁', '🡃', '🡄', '🡆', '🡅', '🡇', '⮨', '⮩', '⮪', '⮫', '⮬', '⮭', '⮮', '⮯', '🡠', '🡢', '🡡', '🡣', '🡤', '🡥', '🡧', '🡦', '🡰', '🡲', '🡱', '🡳', '🡴', '🡵', '🡷', '🡶', '🢀', '🢂', '🢁', '🢃', '🢄', '🢅', '🢇', '🢆', '🢐', '🢒', '🢑', '🢓', '🢔', '🢕', '🢗', '🢖', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ',), # noqa
|
||||
|
||||
'Webdings': (' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '🕷', '🕸', '🕲', '🕶', '🏆', '🎖', '🖇', '🗨', '🗩', '🗰', '🗱', '🌶', '🎗', '🙾', '🙼', '🗕', '🗖', '🗗', '⏴', '⏵', '⏶', '⏷', '⏪', '⏩', '⏮', '⏭', '⏸', '⏹', '⏺', '🗚', '🗳', '🛠', '🏗', '🏘', '🏙', '🏚', '🏜', '🏭', '🏛', '🏠', '🏖', '🏝', '🛣', '🔍', '🏔', '👁', '👂', '🏞', '🏕', '🛤', '🏟', '🛳', '🕬', '🕫', '🕨', '🔈', '🎔', '🎕', '🗬', '🙽', '🗭', '🗪', '🗫', '⮔', '✔', '🚲', '⬜', '🛡', '📦', '🛱', '⬛', '🚑', '🛈', '🛩', '🛰', '🟈', '🕴', '⬤', '🛥', '🚔', '🗘', '🗙', '❓', '🛲', '🚇', '🚍', '⛳', '⦸', '⊖', '🚭', '🗮', '⏐', '🗯', '🗲', ' ', '🚹', '🚺', '🛉', '🛊', '🚼', '👽', '🏋', '⛷', '🏂', '🏌', '🏊', '🏄', '🏍', '🏎', '🚘', '🗠', '🛢', '📠', '🏷', '📣', '👪', '🗡', '🗢', '🗣', '✯', '🖄', '🖅', '🖃', '🖆', '🖹', '🖺', '🖻', '🕵', '🕰', '🖽', '🖾', '📋', '🗒', '🗓', '🕮', '📚', '🗞', '🗟', '🗃', '🗂', '🖼', '🎭', '🎜', '🎘', '🎙', '🎧', '💿', '🎞', '📷', '🎟', '🎬', '📽', '📹', '📾', '📻', '🎚', '🎛', '📺', '💻', '🖥', '🖦', '🖧', '🍹', '🎮', '🎮', '🕻', '🕼', '🖁', '🖀', '🖨', '🖩', '🖿', '🖪', '🗜', '🔒', '🔓', '🗝', '📥', '📤', '🕳', '🌣', '🌤', '🌥', '🌦', '☁', '🌨', '🌧', '🌩', '🌪', '🌬', '🌫', '🌜', '🌡', '🛋', '🛏', '🍽', '🍸', '🛎', '🛍', 'Ⓟ', '♿', '🛆', '🖈', '🎓', '🗤', '🗥', '🗦', '🗧', '🛪', '🐿', '🐦', '🐟', '🐕', '🐈', '🙬', '🙮', '🙭', '🙯', '🗺', '🌍', '🌏', '🌎', '🕊',), # noqa
|
||||
|
||||
'Symbol': (' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '!', '∀', '#', '∃', '%', '&', '∍', '(', ')', '*', '+', ',', '−', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '≅', 'Α', 'Β', 'Χ', 'Δ', 'Ε', 'Φ', 'Γ', 'Η', 'Ι', 'ϑ', 'Λ', 'Μ', 'Ν', 'Ξ', 'Ο', 'Π', 'Θ', 'Ρ', 'Σ', 'Τ', 'Υ', 'ς', 'Ω', 'Ξ', 'Ψ', 'Ζ', '[', '∴', ']', '⊥', '_', '', 'α', 'β', 'χ', 'δ', 'ε', 'φ', 'γ', 'η', 'ι', 'ϕ', 'λ', 'μ', 'ν', 'ξ', 'ο', 'π', 'θ', 'ρ', 'σ', 'τ', 'υ', 'ϖ', 'ω', 'ξ', 'ψ', 'ζ', '{', '|', '}', '~', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', '€', 'ϒ', '′', '≤', '⁄', '∞', 'ƒ', '♣', '♥', '♦', '♠', '↔', '←', '↑', '→', '↓', '°', '±', '″', '≥', '×', '∝', '∂', '•', '÷', '≠', '≡', '≈', '…', '⏐', '⎯', '↲', 'ℵ', 'ℑ', 'ℜ', '℘', '⊗', '⊕', '∅', '∩', '∪', '⊃', '⊇', '⊄', '⊂', '⊆', '∈', '∉', '∠', '∂', '®', '©', '™', '∏', '√', '⋅', '¬', '∦', '∧', '⇔', '⇐', '⇑', '⇒', '⇓', '◊', '〈', '®', '©', '™', '∑', '⎛', '⎜', '⎝', '⎡', '⎢', '⎣', '⎧', '⎨', '⎩', '⎪', ' ', '〉', '∫', '⌠', '⎮', '⌡', '⎞', '⎟', '⎠', '⎤', '⎥', '⎦', '⎪', '⎫', '⎬', ' ',), # noqa
|
||||
} # }}}
|
||||
|
||||
# Lower-cased family names for case-insensitive symbol-font detection.
SYMBOL_FONT_NAMES = frozenset(n.lower() for n in SYMBOL_MAPS)
|
||||
|
||||
|
||||
def is_symbol_font(family):
    """Return True iff *family* (compared case-insensitively) names one of
    the known symbol fonts in SYMBOL_MAPS."""
    try:
        lowered = family.lower()
    except AttributeError:
        # family is not a string (e.g. None)
        return False
    return lowered in SYMBOL_FONT_NAMES
|
||||
|
||||
|
||||
def do_map(m, points):
    """Yield characters for the given codepoints, translating those that lie
    strictly inside the private-use range (0xf000, 0xf000 + len(m)) used by
    symbol fonts into the corresponding entry of the map *m*; every other
    codepoint is passed through unchanged."""
    base = 0xf000
    limit = base + len(m)
    for cp in points:
        yield m[cp - base] if base < cp < limit else codepoint_to_chr(cp)
|
||||
|
||||
|
||||
def map_symbol_text(text, font):
    """Remap *text* (str or UTF-8 bytes) through the symbol map for the
    given *font* name and return the resulting unicode string."""
    mapping = SYMBOL_MAPS[font]
    if isinstance(text, bytes):
        text = text.decode('utf-8')
    return ''.join(do_map(mapping, ord_string(text)))
|
||||
|
||||
|
||||
class Fonts(object):
    """Collects the fonts declared in a DOCX font table and generates
    @font-face CSS rules for the ones that are embedded in the document."""

    def __init__(self, namespace):
        self.namespace = namespace
        # Font name -> Family object parsed from the font table XML
        self.fonts = {}
        # (name, variant) pairs actually requested via family_for()
        self.used = set()

    def __call__(self, root, embed_relationships, docx, dest_dir):
        # Parse every <w:font w:name="..."> declaration in the font table
        for elem in self.namespace.XPath('//w:font[@w:name]')(root):
            self.fonts[self.namespace.get(elem, 'w:name')] = Family(elem, embed_relationships, self.namespace.XPath, self.namespace.get)

    def family_for(self, name, bold=False, italic=False):
        """Return the CSS font-family value for the named font in the given
        bold/italic variant, falling back to 'serif' for unknown fonts."""
        f = self.fonts.get(name, None)
        if f is None:
            return 'serif'
        variant = get_variant(bold, italic)
        self.used.add((name, variant))
        # Use the raw name when this variant is embedded (it will be written
        # out by embed_fonts()), otherwise the family name from the table
        name = f.name if variant in f.embedded else f.family_name
        if is_symbol_font(name):
            return name
        return '"%s", %s' % (name.replace('"', ''), f.css_generic_family)

    def embed_fonts(self, dest_dir, docx):
        """Extract all used embedded fonts into dest_dir/fonts and return
        the corresponding @font-face CSS rules as a single string."""
        defs = []
        dest_dir = os.path.join(dest_dir, 'fonts')
        for name, variant in self.used:
            f = self.fonts[name]
            if variant in f.embedded:
                if not os.path.exists(dest_dir):
                    os.mkdir(dest_dir)
                fname = self.write(name, dest_dir, docx, variant)
                if fname is not None:
                    d = {'font-family':'"%s"' % name.replace('"', ''), 'src': 'url("fonts/%s")' % fname}
                    if 'Bold' in variant:
                        d['font-weight'] = 'bold'
                    if 'Italic' in variant:
                        d['font-style'] = 'italic'
                    d = ['%s: %s' % (k, v) for k, v in iteritems(d)]
                    d = ';\n\t'.join(d)
                    defs.append('@font-face {\n\t%s\n}\n' % d)
        return '\n'.join(defs)

    def write(self, name, dest_dir, docx, variant):
        """De-obfuscate (if a key is present) and write one embedded font
        file, returning the name used, or None when the (decoded) data is
        not a recognizable TrueType/OpenType font."""
        f = self.fonts[name]
        ef = f.embedded[variant]
        raw = docx.read(ef.name)
        prefix = raw[:32]
        if ef.key:
            # Obfuscated font: the first 32 bytes are XORed with the
            # byte-reversed hex key stored in the relationship
            key = re.sub(r'[^A-Fa-f0-9]', '', ef.key)
            key = bytearray(reversed(tuple(int(key[i:i+2], 16) for i in range(0, len(key), 2))))
            prefix = bytearray(prefix)
            prefix = bytes(bytearray(prefix[i]^key[i % len(key)] for i in range(len(prefix))))
        if not is_truetype_font(prefix):
            return None
        ext = 'otf' if prefix.startswith(b'OTTO') else 'ttf'
        fname = ascii_filename('%s - %s.%s' % (name, variant, ext))
        with open(os.path.join(dest_dir, fname), 'wb') as dest:
            # Write the (possibly de-obfuscated) prefix, then the rest as-is
            dest.write(prefix)
            dest.write(raw[32:])
        return fname
|
||||
65
ebook_converter/ebooks/docx/footnotes.py
Normal file
65
ebook_converter/ebooks/docx/footnotes.py
Normal file
@@ -0,0 +1,65 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=utf-8
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
from collections import OrderedDict
|
||||
from polyglot.builtins import iteritems, unicode_type
|
||||
|
||||
|
||||
class Note(object):
    """A single foot/endnote: wraps the note's XML element and yields its
    block-level (w:p / w:tbl) descendants on iteration."""

    def __init__(self, namespace, parent, rels):
        self.namespace = namespace
        self.parent = parent
        self.rels = rels
        # 'normal' for real notes; other values mark separator notes
        self.type = namespace.get(parent, 'w:type', 'normal')

    def __iter__(self):
        return iter(self.namespace.descendants(self.parent, 'w:p', 'w:tbl'))
|
||||
|
||||
|
||||
class Footnotes(object):
    """Maps w:id values to footnotes/endnotes and assigns sequential
    anchors/numbers to the notes actually referenced by the document."""

    def __init__(self, namespace):
        self.namespace = namespace
        # w:id -> Note, one map per note kind
        self.footnotes = {}
        self.endnotes = {}
        # Number of note references resolved so far (used for numbering)
        self.counter = 0
        # anchor -> (display number, Note), in reference order
        self.notes = OrderedDict()

    def __call__(self, footnotes, footnotes_rels, endnotes, endnotes_rels):
        # Parse footnotes.xml/endnotes.xml roots; either may be None when
        # the document has no notes of that kind
        XPath, get = self.namespace.XPath, self.namespace.get
        if footnotes is not None:
            for footnote in XPath('./w:footnote[@w:id]')(footnotes):
                fid = get(footnote, 'w:id')
                if fid:
                    self.footnotes[fid] = Note(self.namespace, footnote, footnotes_rels)

        if endnotes is not None:
            for endnote in XPath('./w:endnote[@w:id]')(endnotes):
                fid = get(endnote, 'w:id')
                if fid:
                    self.endnotes[fid] = Note(self.namespace, endnote, endnotes_rels)

    def get_ref(self, ref):
        """Resolve a footnoteReference/endnoteReference element, register
        the note and return (anchor, display number). Returns (None, None)
        for unknown ids or non-'normal' (separator) notes."""
        fid = self.namespace.get(ref, 'w:id')
        notes = self.footnotes if ref.tag.endswith('}footnoteReference') else self.endnotes
        note = notes.get(fid, None)
        if note is not None and note.type == 'normal':
            self.counter += 1
            anchor = 'note_%d' % self.counter
            self.notes[anchor] = (unicode_type(self.counter), note)
            return anchor, unicode_type(self.counter)
        return None, None

    def __iter__(self):
        # Yield (anchor, display number, Note) in reference order
        for anchor, (counter, note) in iteritems(self.notes):
            yield anchor, counter, note

    @property
    def has_notes(self):
        # True when at least one note was actually referenced
        return bool(self.notes)
|
||||
343
ebook_converter/ebooks/docx/images.py
Normal file
343
ebook_converter/ebooks/docx/images.py
Normal file
@@ -0,0 +1,343 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=utf-8
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
import os
|
||||
|
||||
from lxml.html.builder import IMG, HR
|
||||
|
||||
from calibre.constants import iswindows
|
||||
from calibre.ebooks.docx.names import barename
|
||||
from calibre.utils.filenames import ascii_filename
|
||||
from calibre.utils.img import resize_to_fit, image_to_data
|
||||
from calibre.utils.imghdr import what
|
||||
from polyglot.builtins import iteritems, itervalues
|
||||
|
||||
|
||||
class LinkedImageNotFound(ValueError):
    """Raised when an image referenced by the document (a linked file on
    disk or an archive member) cannot be located."""

    def __init__(self, fname):
        ValueError.__init__(self, fname)
        # Path/name of the missing image, used for the warning message
        self.fname = fname
|
||||
|
||||
|
||||
def image_filename(x):
    """Return a filesystem/URL-safe ASCII file name derived from *x*,
    with spaces and '#' replaced by underscores."""
    safe = ascii_filename(x)
    for ch in (' ', '#'):
        safe = safe.replace(ch, '_')
    return safe
|
||||
|
||||
|
||||
def emu_to_pt(x):
    """Convert English Metric Units to points (1 pt == 12700 EMU)."""
    emu_per_point = 12700
    return x / emu_per_point
|
||||
|
||||
|
||||
def pt_to_emu(x):
    """Convert points to English Metric Units, truncated to an int
    (1 pt == 12700 EMU)."""
    emu_per_point = 12700
    return int(x * emu_per_point)
|
||||
|
||||
|
||||
def get_image_properties(parent, XPath, get):
    """Read the size (wp:extent) and alt text/title (wp:docPr) of an
    inline or anchored drawing.

    Returns a tuple (css_style_dict, alt, title); the style dict may
    contain 'width', 'height' and 'display' entries.
    """
    width = height = None
    for extent in XPath('./wp:extent')(parent):
        # cx/cy are EMU amounts stored as strings; ignore malformed values
        try:
            width = emu_to_pt(int(extent.get('cx')))
        except (TypeError, ValueError):
            pass
        try:
            height = emu_to_pt(int(extent.get('cy')))
        except (TypeError, ValueError):
            pass
    ans = {}
    if width is not None:
        ans['width'] = '%.3gpt' % width
    if height is not None:
        ans['height'] = '%.3gpt' % height

    alt = None
    title = None
    for docPr in XPath('./wp:docPr')(parent):
        alt = docPr.get('descr') or alt
        title = docPr.get('title') or title
        if docPr.get('hidden', None) in {'true', 'on', '1'}:
            # Hidden drawings must not be rendered
            ans['display'] = 'none'

    return ans, alt, title
|
||||
|
||||
|
||||
def get_image_margins(elem):
    """Return CSS padding declarations derived from the distL/distT/distR/
    distB (distance from text) attributes of *elem*.

    The attribute values are strings of EMU amounts, so they must be
    converted with int() before emu_to_pt(); without the conversion the
    string division raises TypeError, which the except clause silently
    swallowed, dropping every margin. Malformed values are still skipped.
    """
    ans = {}
    for w, css in iteritems({'L':'left', 'T':'top', 'R':'right', 'B':'bottom'}):
        val = elem.get('dist%s' % w, None)
        if val is not None:
            try:
                val = emu_to_pt(int(val))
            except (TypeError, ValueError):
                continue
            ans['padding-%s' % css] = '%.3gpt' % val
    return ans
|
||||
|
||||
|
||||
def get_hpos(anchor, page_width, XPath, get, width_frac):
    """Return the horizontal position of a floating image as a fraction of
    the page width (0 = left, 1 = right), offset by *width_frac* (half the
    image width as a page fraction) so the result refers to the image
    center. Falls back to 0 when no positioning information is found.
    """
    for ph in XPath('./wp:positionH')(anchor):
        rp = ph.get('relativeFrom', None)
        if rp == 'leftMargin':
            return 0 + width_frac
        if rp == 'rightMargin':
            return 1 + width_frac
        al = None
        almap = {'left':0, 'center':0.5, 'right':1}
        for align in XPath('./wp:align')(ph):
            al = almap.get(align.text)
            if al is not None:
                if rp == 'page':
                    return al
                return al + width_frac
        for po in XPath('./wp:posOffset')(ph):
            try:
                pos = emu_to_pt(int(po.text))
            except (TypeError, ValueError):
                continue
            return pos/page_width + width_frac

    for sp in XPath('./wp:simplePos')(anchor):
        try:
            # The x attribute is an EMU amount stored as a string; it must
            # be converted with int() first, otherwise emu_to_pt() raises
            # TypeError and the position was always silently discarded
            x = emu_to_pt(int(sp.get('x', None)))
        except (TypeError, ValueError):
            continue
        return x/page_width + width_frac

    return 0
|
||||
|
||||
|
||||
class Images(object):
    """Extracts the images referenced by a DOCX document into the output
    directory and converts drawing/pict markup into HTML <img>/<hr> tags."""

    def __init__(self, namespace, log):
        self.namespace = namespace
        # Relationship id -> image file name inside the docx archive
        self.rid_map = {}
        # (fname, max_width, max_height) -> extracted image file name
        self.used = {}
        self.resized = {}
        self.names = set()
        # Output-relative paths ('images/...') of all extracted images
        self.all_images = set()
        # (img element, link dict, rid_map) for images wrapped in hyperlinks
        self.links = []
        self.log = log

    def __call__(self, relationships_by_id):
        self.rid_map = relationships_by_id

    def read_image_data(self, fname, base=None):
        """Return (raw_bytes, base_name) for an image, reading either a
        linked file on disk (file:// URL) or a member of the docx archive.

        Raises LinkedImageNotFound when the data cannot be located. EMF
        wrappers are unwrapped to their embedded raster image when possible,
        and the base name's extension is normalized to the detected format.
        """
        if fname.startswith('file://'):
            src = fname[len('file://'):]
            # file:///C:/... style URLs carry a leading slash on windows
            if iswindows and src and src[0] == '/':
                src = src[1:]
            if not src or not os.path.exists(src):
                raise LinkedImageNotFound(src)
            with open(src, 'rb') as rawsrc:
                raw = rawsrc.read()
        else:
            try:
                raw = self.docx.read(fname)
            except KeyError:
                raise LinkedImageNotFound(fname)
        base = base or image_filename(fname.rpartition('/')[-1]) or 'image'
        # Detect the real format from the data; fall back to the extension
        ext = what(None, raw) or base.rpartition('.')[-1] or 'jpeg'
        if ext == 'emf':
            # For an example, see: https://bugs.launchpad.net/bugs/1224849
            self.log('Found an EMF image: %s, trying to extract embedded raster image' % fname)
            from calibre.utils.wmf.emf import emf_unwrap
            try:
                raw = emf_unwrap(raw)
            except Exception:
                self.log.exception('Failed to extract embedded raster image from EMF')
            else:
                ext = 'png'
        base = base.rpartition('.')[0]
        if not base:
            base = 'image'
        base += '.' + ext
        return raw, base

    def unique_name(self, base):
        """Return *base*, suffixed with a counter when needed so it does not
        clash with any already-used image file name."""
        exists = frozenset(itervalues(self.used))
        c = 1
        name = base
        while name in exists:
            n, e = base.rpartition('.')[0::2]
            name = '%s-%d.%s' % (n, c, e)
            c += 1
        return name

    def resize_image(self, raw, base, max_width, max_height):
        # resized is True only when the image exceeded the given bounds;
        # the size is encoded into the file name to keep variants distinct
        resized, img = resize_to_fit(raw, max_width, max_height)
        if resized:
            base, ext = os.path.splitext(base)
            base = base + '-%dx%d%s' % (max_width, max_height, ext)
            raw = image_to_data(img, fmt=ext[1:])
        return raw, base, resized

    def generate_filename(self, rid, base=None, rid_map=None, max_width=None, max_height=None):
        """Extract the image for relationship *rid* into the output images
        directory (optionally resized to fit the given bounds) and return
        its file name. Results are cached per (image, bounds)."""
        rid_map = self.rid_map if rid_map is None else rid_map
        fname = rid_map[rid]
        key = (fname, max_width, max_height)
        ans = self.used.get(key)
        if ans is not None:
            return ans
        raw, base = self.read_image_data(fname, base=base)
        resized = False
        if max_width is not None and max_height is not None:
            raw, base, resized = self.resize_image(raw, base, max_width, max_height)
        name = self.unique_name(base)
        self.used[key] = name
        if max_width is not None and max_height is not None and not resized:
            # Within bounds: identical to the unresized variant, so share a
            # single extracted file between both cache keys
            okey = (fname, None, None)
            if okey in self.used:
                return self.used[okey]
            self.used[okey] = name
        with open(os.path.join(self.dest_dir, name), 'wb') as f:
            f.write(raw)
        self.all_images.add('images/' + name)
        return name

    def pic_to_img(self, pic, alt, parent, title):
        """Convert a <pic:pic> element into an HTML <img>, recording any
        hyperlink that wraps it. Returns None when no usable image data
        is found."""
        XPath, get = self.namespace.XPath, self.namespace.get
        name = None
        link = None
        for hl in XPath('descendant::a:hlinkClick[@r:id]')(parent):
            link = {'id':get(hl, 'r:id')}
            tgt = hl.get('tgtFrame', None)
            if tgt:
                link['target'] = tgt
            title = hl.get('tooltip', None)
            if title:
                link['title'] = title

        for pr in XPath('descendant::pic:cNvPr')(pic):
            name = pr.get('name', None)
            if name:
                name = image_filename(name)
            alt = pr.get('descr') or alt
        for a in XPath('descendant::a:blip[@r:embed or @r:link]')(pic):
            # Prefer embedded image data, fall back to a linked image
            rid = get(a, 'r:embed')
            if not rid:
                rid = get(a, 'r:link')
            if rid and rid in self.rid_map:
                try:
                    src = self.generate_filename(rid, name)
                except LinkedImageNotFound as err:
                    self.log.warn('Linked image: %s not found, ignoring' % err.fname)
                    continue
                img = IMG(src='images/%s' % src)
                img.set('alt', alt or 'Image')
                if title:
                    img.set('title', title)
                if link is not None:
                    self.links.append((img, link, self.rid_map))
                return img

    def drawing_to_html(self, drawing, page):
        """Yield <img> tags for a <w:drawing>: inline pictures first, then
        floating (anchored) ones with float/margin styling applied."""
        XPath, get = self.namespace.XPath, self.namespace.get
        # First process the inline pictures
        for inline in XPath('./wp:inline')(drawing):
            style, alt, title = get_image_properties(inline, XPath, get)
            for pic in XPath('descendant::pic:pic')(inline):
                ans = self.pic_to_img(pic, alt, inline, title)
                if ans is not None:
                    if style:
                        ans.set('style', '; '.join('%s: %s' % (k, v) for k, v in iteritems(style)))
                    yield ans

        # Now process the floats
        for anchor in XPath('./wp:anchor')(drawing):
            style, alt, title = get_image_properties(anchor, XPath, get)
            self.get_float_properties(anchor, style, page)
            for pic in XPath('descendant::pic:pic')(anchor):
                ans = self.pic_to_img(pic, alt, anchor, title)
                if ans is not None:
                    if style:
                        ans.set('style', '; '.join('%s: %s' % (k, v) for k, v in iteritems(style)))
                    yield ans

    def pict_to_html(self, pict, page):
        """Yield HTML for legacy VML <w:pict> markup: a styled <hr> when the
        pict is a horizontal rule, plus an <img> for each v:imagedata."""
        XPath, get = self.namespace.XPath, self.namespace.get
        # First see if we have an <hr>
        is_hr = len(pict) == 1 and get(pict[0], 'o:hr') in {'t', 'true'}
        if is_hr:
            style = {}
            hr = HR()
            try:
                pct = float(get(pict[0], 'o:hrpct'))
            except (ValueError, TypeError, AttributeError):
                pass
            else:
                if pct > 0:
                    style['width'] = '%.3g%%' % pct
            align = get(pict[0], 'o:hralign', 'center')
            if align in {'left', 'right'}:
                style['margin-left'] = '0' if align == 'left' else 'auto'
                style['margin-right'] = 'auto' if align == 'left' else '0'
            if style:
                hr.set('style', '; '.join(('%s:%s' % (k, v) for k, v in iteritems(style))))
            yield hr

        for imagedata in XPath('descendant::v:imagedata[@r:id]')(pict):
            rid = get(imagedata, 'r:id')
            if rid in self.rid_map:
                try:
                    src = self.generate_filename(rid)
                except LinkedImageNotFound as err:
                    self.log.warn('Linked image: %s not found, ignoring' % err.fname)
                    continue
                img = IMG(src='images/%s' % src, style="display:block")
                alt = get(imagedata, 'o:title')
                img.set('alt', alt or 'Image')
                yield img

    def get_float_properties(self, anchor, style, page):
        """Update *style* in place with CSS display/float/margin/padding
        rules for a floating (anchored) image."""
        XPath, get = self.namespace.XPath, self.namespace.get
        if 'display' not in style:
            style['display'] = 'block'
        padding = get_image_margins(anchor)
        width = float(style.get('width', '100pt')[:-2])

        page_width = page.width - page.margin_left - page.margin_right
        if page_width <= 0:
            # Ignore margins
            page_width = page.width

        # Fraction of the page width at which the image center sits
        hpos = get_hpos(anchor, page_width, XPath, get, width/(2*page_width))

        wrap_elem = None
        dofloat = False

        # Search the children in reverse document order for a wrap element
        for child in reversed(anchor):
            bt = barename(child.tag)
            if bt in {'wrapNone', 'wrapSquare', 'wrapThrough', 'wrapTight', 'wrapTopAndBottom'}:
                wrap_elem = child
                dofloat = bt not in {'wrapNone', 'wrapTopAndBottom'}
                break

        if wrap_elem is not None:
            padding.update(get_image_margins(wrap_elem))
            wt = wrap_elem.get('wrapText', None)
            # Text on only one side pushes the image to the other side
            hpos = 0 if wt == 'right' else 1 if wt == 'left' else hpos
            if dofloat:
                style['float'] = 'left' if hpos < 0.65 else 'right'
            else:
                # No text wrap: approximate the position with auto margins
                ml, mr = (None, None) if hpos < 0.34 else ('auto', None) if hpos > 0.65 else ('auto', 'auto')
                if ml is not None:
                    style['margin-left'] = ml
                if mr is not None:
                    style['margin-right'] = mr

        style.update(padding)

    def to_html(self, elem, page, docx, dest_dir):
        """Entry point: convert a drawing or pict element into HTML tags,
        creating dest_dir/images and remembering *docx* for later reads."""
        dest = os.path.join(dest_dir, 'images')
        if not os.path.exists(dest):
            os.mkdir(dest)
        self.dest_dir, self.docx = dest, docx
        if elem.tag.endswith('}drawing'):
            for tag in self.drawing_to_html(elem, page):
                yield tag
        else:
            for tag in self.pict_to_html(elem, page):
                yield tag
|
||||
273
ebook_converter/ebooks/docx/index.py
Normal file
273
ebook_converter/ebooks/docx/index.py
Normal file
@@ -0,0 +1,273 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=utf-8
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2014, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
from operator import itemgetter
|
||||
|
||||
from lxml import etree
|
||||
|
||||
from calibre.utils.icu import partition_by_first_letter, sort_key
|
||||
from polyglot.builtins import iteritems, filter
|
||||
|
||||
|
||||
def get_applicable_xe_fields(index, xe_fields, XPath, expand):
    """Filter the XE (index entry) fields down to those applicable to the
    given INDEX field: matching entry type, within the optional letter
    range, and contained in the optional bookmark.

    Fixes: lr.parition() was a typo for str.partition(), so any index with
    a letter-range crashed with AttributeError; entries with empty text now
    fail the range test instead of raising IndexError on text[0].
    """
    iet = index.get('entry-type', None)
    xe_fields = [xe for xe in xe_fields if xe.get('entry-type', None) == iet]

    lr = index.get('letter-range', None)
    if lr is not None:
        # Letter range like "a-c": keep only entries whose first character
        # falls inside the (inclusive) range
        sl, el = lr.partition('-')[0::2]
        sl, el = sl.strip(), el.strip()
        if sl and el:
            def inrange(text):
                return bool(text) and sl <= text[0] <= el
            xe_fields = [xe for xe in xe_fields if inrange(xe.get('text', ''))]

    bmark = index.get('bookmark', None)
    if bmark is None:
        return xe_fields
    attr = expand('w:name')
    bookmarks = {b for b in XPath('//w:bookmarkStart')(xe_fields[0]['start_elem']) if b.get(attr, None) == bmark}
    ancestors = XPath('ancestor::w:bookmarkStart')

    def contained(xe):
        # Check if the xe field is contained inside a bookmark with the
        # specified name
        return bool(set(ancestors(xe['start_elem'])) & bookmarks)

    return [xe for xe in xe_fields if contained(xe)]
|
||||
|
||||
|
||||
def make_block(expand, style, parent, pos):
    """Insert a new paragraph (with an optional paragraph style) into
    *parent* at index *pos*. Returns (paragraph, text-element); the text
    element has xml:space="preserve" set."""
    para = parent.makeelement(expand('w:p'))
    parent.insert(pos, para)
    if style is not None:
        props = para.makeelement(expand('w:pPr'))
        para.append(props)
        pstyle = props.makeelement(expand('w:pStyle'))
        props.append(pstyle)
        pstyle.set(expand('w:val'), style)
    run = para.makeelement(expand('w:r'))
    para.append(run)
    text = run.makeelement(expand('w:t'))
    text.set(expand('xml:space'), 'preserve')
    run.append(text)
    return para, text
|
||||
|
||||
|
||||
def add_xe(xe, t, expand):
    """Fill the text element *t* with the entry text of the XE field *xe*,
    append the entry's page-number-text (if any) in brackets, and add a
    line break after the entry. Returns (anchor, run) so the run can be
    turned into a hyperlink later."""
    run = t.getparent()
    idx = run.index(t)
    t.text = xe.get('text') or ' '
    pt = xe.get('page-number-text', None)

    if pt:
        # Show the custom page number text after the entry, in brackets
        p = t.getparent().getparent()
        r = p.makeelement(expand('w:r'))
        p.append(r)
        t2 = r.makeelement(expand('w:t'))
        t2.set(expand('xml:space'), 'preserve')
        t2.text = ' [%s]' % pt
        r.append(t2)
    # put separate entries on separate lines
    run.insert(idx + 1, run.makeelement(expand('w:br')))
    return xe['anchor'], run
|
||||
|
||||
|
||||
def process_index(field, index, xe_fields, log, XPath, expand):
    '''
    We remove all the word generated index markup and replace it with our own
    that is more suitable for an ebook.

    Returns (hyperlinks, blocks): the (anchor, run) pairs produced by
    add_xe() and the paragraph elements created for the index, in reverse
    order (polish_index_markup() relies on that).
    '''
    styles = []
    heading_text = index.get('heading', None)
    heading_style = 'IndexHeading'
    start_pos = None
    # Remove the generated index paragraphs, remembering the position of
    # the first one so our replacement blocks go in the same place
    for elem in field.contents:
        if elem.tag.endswith('}p'):
            s = XPath('descendant::pStyle/@w:val')(elem)
            if s:
                styles.append(s[0])
            p = elem.getparent()
            if start_pos is None:
                start_pos = (p, p.index(elem))
            p.remove(elem)

    xe_fields = get_applicable_xe_fields(index, xe_fields, XPath, expand)
    if not xe_fields:
        return [], []
    if heading_text is not None:
        # A heading was requested: group entries by first letter, with one
        # heading item (the letter key) preceding each group
        groups = partition_by_first_letter(xe_fields, key=itemgetter('text'))
        items = []
        for key, fields in iteritems(groups):
            items.append(key), items.extend(fields)
        if styles:
            heading_style = styles[0]
    else:
        items = sorted(xe_fields, key=lambda x:sort_key(x['text']))

    hyperlinks = []
    blocks = []
    for item in reversed(items):
        # Non-dict items are the group heading letters, dicts are entries
        is_heading = not isinstance(item, dict)
        style = heading_style if is_heading else None
        p, t = make_block(expand, style, *start_pos)
        if is_heading:
            text = heading_text
            if text.lower().startswith('a'):
                # Presumably the heading is a template like 'A' whose first
                # letter is replaced by the group letter — TODO confirm
                text = item + text[1:]
            t.text = text
        else:
            hyperlinks.append(add_xe(item, t, expand))
        blocks.append(p)

    return hyperlinks, blocks
|
||||
|
||||
|
||||
def split_up_block(block, a, text, parts, ldict):
    """Split a colon-separated index entry into nested <span> elements, one
    per part, indenting each level and recording every span's nesting level
    in *ldict*. The link *a* ends up inside the innermost span.

    Fixes a loop-variable shadowing bug: the original did
    ``for i, prefix in enumerate(prefix)`` and then ``ldict[span] =
    len(prefix)``, recording the length of the last prefix *string* instead
    of the nesting level, corrupting the level data used by find_match().
    """
    prefixes = parts[:-1]
    a.text = parts[-1]
    parent = a.getparent()
    style = 'display:block; margin-left: %.3gem'
    for i, prefix in enumerate(prefixes):
        m = 1.5 * i
        span = parent.makeelement('span', style=style % m)
        ldict[span] = i
        parent.append(span)
        span.text = prefix
    # Innermost span holds the link, one level deeper than the last prefix
    span = parent.makeelement('span', style=style % (len(prefixes) * 1.5))
    parent.append(span)
    span.append(a)
    ldict[span] = len(prefixes)
|
||||
|
||||
|
||||
"""
|
||||
The merge algorithm is a little tricky.
|
||||
We start with a list of elementary blocks. Each is an HtmlElement, a p node
|
||||
with a list of child nodes. The last child may be a link, and the earlier ones are
|
||||
just text.
|
||||
The list is in reverse order from what we want in the index.
|
||||
There is a dictionary ldict which records the level of each child node.
|
||||
|
||||
Now we want to do a reduce-like operation, combining all blocks with the same
|
||||
top level index entry into a single block representing the structure of all
|
||||
references, subentries, etc. under that top entry.
|
||||
Here's the algorithm.
|
||||
|
||||
Given a block p and the next block n, and the top level entries p1 and n1 in each
|
||||
block, which we assume have the same text:
|
||||
|
||||
Start with (p, p1) and (n, n1).
|
||||
|
||||
Given (p, p1, ..., pk) and (n, n1, ..., nk) which we want to merge:
|
||||
|
||||
If there are no more levels in n, and we have a link in nk,
|
||||
then add the link from nk to the links for pk.
|
||||
This might be the first link for pk, or we might get a list of references.
|
||||
|
||||
Otherwise nk+1 is the next level in n. Look for a matching entry in p. It must have
|
||||
the same text, it must follow pk, it must come before we find any other p entries at
|
||||
the same level as pk, and it must have the same level as nk+1.
|
||||
|
||||
If we find such a matching entry, go back to the start with (p ... pk+1) and (n ... nk+1).
|
||||
|
||||
If there is no matching entry, then because of the original reversed order we want
|
||||
to insert nk+1 and all following entries from n into p immediately following pk.
|
||||
"""
|
||||
|
||||
|
||||
def find_match(prev_block, pind, nextent, ldict):
    """Search *prev_block* after index *pind* for an entry exactly one
    nesting level deeper than prev_block[pind] with the same text as
    *nextent*. Returns its index, or -1 when no such entry exists before
    the nesting level pops back to (or above) the current level."""
    curlevel = ldict.get(prev_block[pind], -1)
    if curlevel < 0:
        return -1
    wanted = nextent.text_content()
    for cand in range(pind + 1, len(prev_block)):
        level = ldict.get(prev_block[cand], -1)
        if level <= curlevel:
            # Back at (or above) the current level: stop searching
            return -1
        if level == curlevel + 1 and prev_block[cand].text_content() == wanted:
            return cand
    return -1
|
||||
|
||||
|
||||
def add_link(pent, nent, ldict):
    """Move the reference link from *nent* into *pent*: appended after the
    existing links (comma separated) when *pent* already has some, or
    substituted for its plain text otherwise. Entries without a link are
    left as text."""
    found = nent.xpath('descendant::a[1]')
    # If there is no link, leave it as text
    if not found:
        return
    link = found[0]
    existing = pent.xpath('descendant::a')
    if existing:
        # Put on same line with a comma
        last = existing[-1]
        last.tail = ', '
        parent = last.getparent()
        parent.insert(parent.index(last) + 1, link)
    else:
        # substitute link for plain text in pent
        pent.text = ""
        pent.append(link)
|
||||
|
||||
|
||||
def merge_blocks(prev_block, next_block, pind, nind, next_path, ldict):
    """Recursively merge *next_block* into *prev_block*, which share the
    same top-level entry; see the algorithm description in the module-level
    comment above. *next_block* is removed from the tree afterwards."""
    # First elements match. Any more in next?
    if len(next_path) == (nind + 1):
        # next is exhausted: just merge its link into the matching entry
        nextent = next_block[nind]
        add_link(prev_block[pind], nextent, ldict)
        return

    nind = nind + 1
    nextent = next_block[nind]
    prevent = find_match(prev_block, pind, nextent, ldict)
    if prevent > 0:
        # Matching sub-entry exists: descend one level and keep merging
        merge_blocks(prev_block, next_block, prevent, nind, next_path, ldict)
        return

    # Want to insert elements into previous block
    while nind < len(next_block):
        # insert takes it out of old
        pind = pind + 1
        prev_block.insert(pind, next_block[nind])

    next_block.getparent().remove(next_block)
|
||||
|
||||
|
||||
def polish_index_markup(index, blocks):
    """Turn the flat index paragraphs into properly nested entries: split
    colon-separated entries into indented spans and merge consecutive
    blocks that share the same top-level entry. *blocks* is in reverse
    order (as produced by process_index())."""
    # Blocks are in reverse order at this point
    path_map = {}           # block -> list of entry path parts
    ldict = {}              # span -> nesting level
    for block in blocks:
        cls = block.get('class', '') or ''
        block.set('class', (cls + ' index-entry').lstrip())
        a = block.xpath('descendant::a[1]')
        text = ''
        if a:
            text = etree.tostring(a[0], method='text', with_tail=False, encoding='unicode').strip()
        if ':' in text:
            # Nested entry, e.g. "food:fruit:apple"
            path_map[block] = parts = list(filter(None, (x.strip() for x in text.split(':'))))
            if len(parts) > 1:
                split_up_block(block, a[0], text, parts, ldict)
        else:
            # try using a span all the time
            # NOTE(review): this branch indexes a[0] and so assumes every
            # entry block contains a link — confirm against process_index()
            path_map[block] = [text]
            parent = a[0].getparent()
            span = parent.makeelement('span', style='display:block; margin-left: 0em')
            parent.append(span)
            span.append(a[0])
            ldict[span] = 0

        for br in block.xpath('descendant::br'):
            br.tail = None

    # We want a single block for each main entry
    prev_block = blocks[0]
    for block in blocks[1:]:
        pp, pn = path_map[prev_block], path_map[block]
        if pp[0] == pn[0]:
            merge_blocks(prev_block, block, 0, 0, pn, ldict)
        else:
            prev_block = block
|
||||
144
ebook_converter/ebooks/docx/names.py
Normal file
144
ebook_converter/ebooks/docx/names.py
Normal file
@@ -0,0 +1,144 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=utf-8
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
import re
|
||||
|
||||
from lxml.etree import XPath as X
|
||||
|
||||
from calibre.utils.filenames import ascii_text
|
||||
from polyglot.builtins import iteritems
|
||||
|
||||
# Names {{{
|
||||
TRANSITIONAL_NAMES = {
|
||||
'DOCUMENT' : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument',
|
||||
'DOCPROPS' : 'http://schemas.openxmlformats.org/package/2006/relationships/metadata/core-properties',
|
||||
'APPPROPS' : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/extended-properties',
|
||||
'STYLES' : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/styles',
|
||||
'NUMBERING' : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/numbering',
|
||||
'FONTS' : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/fontTable',
|
||||
'EMBEDDED_FONT' : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/font',
|
||||
'IMAGES' : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/image',
|
||||
'LINKS' : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink',
|
||||
'FOOTNOTES' : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/footnotes',
|
||||
'ENDNOTES' : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/endnotes',
|
||||
'THEMES' : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/theme',
|
||||
'SETTINGS' : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/settings',
|
||||
'WEB_SETTINGS' : 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/webSettings',
|
||||
}
|
||||
|
||||
STRICT_NAMES = {
|
||||
k:v.replace('http://schemas.openxmlformats.org/officeDocument/2006', 'http://purl.oclc.org/ooxml/officeDocument')
|
||||
for k, v in iteritems(TRANSITIONAL_NAMES)
|
||||
}
|
||||
|
||||
TRANSITIONAL_NAMESPACES = {
|
||||
'mo': 'http://schemas.microsoft.com/office/mac/office/2008/main',
|
||||
'o': 'urn:schemas-microsoft-com:office:office',
|
||||
've': 'http://schemas.openxmlformats.org/markup-compatibility/2006',
|
||||
'mc': 'http://schemas.openxmlformats.org/markup-compatibility/2006',
|
||||
# Text Content
|
||||
'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main',
|
||||
'w10': 'urn:schemas-microsoft-com:office:word',
|
||||
'wne': 'http://schemas.microsoft.com/office/word/2006/wordml',
|
||||
'xml': 'http://www.w3.org/XML/1998/namespace',
|
||||
# Drawing
|
||||
'a': 'http://schemas.openxmlformats.org/drawingml/2006/main',
|
||||
'm': 'http://schemas.openxmlformats.org/officeDocument/2006/math',
|
||||
'mv': 'urn:schemas-microsoft-com:mac:vml',
|
||||
'pic': 'http://schemas.openxmlformats.org/drawingml/2006/picture',
|
||||
'v': 'urn:schemas-microsoft-com:vml',
|
||||
'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing',
|
||||
# Properties (core and extended)
|
||||
'cp': 'http://schemas.openxmlformats.org/package/2006/metadata/core-properties',
|
||||
'dc': 'http://purl.org/dc/elements/1.1/',
|
||||
'ep': 'http://schemas.openxmlformats.org/officeDocument/2006/extended-properties',
|
||||
'xsi': 'http://www.w3.org/2001/XMLSchema-instance',
|
||||
# Content Types
|
||||
'ct': 'http://schemas.openxmlformats.org/package/2006/content-types',
|
||||
# Package Relationships
|
||||
'r': 'http://schemas.openxmlformats.org/officeDocument/2006/relationships',
|
||||
'pr': 'http://schemas.openxmlformats.org/package/2006/relationships',
|
||||
# Dublin Core document properties
|
||||
'dcmitype': 'http://purl.org/dc/dcmitype/',
|
||||
'dcterms': 'http://purl.org/dc/terms/'
|
||||
}
|
||||
|
||||
STRICT_NAMESPACES = {
|
||||
k:v.replace(
|
||||
'http://schemas.openxmlformats.org/officeDocument/2006', 'http://purl.oclc.org/ooxml/officeDocument').replace(
|
||||
'http://schemas.openxmlformats.org/wordprocessingml/2006', 'http://purl.oclc.org/ooxml/wordprocessingml').replace(
|
||||
'http://schemas.openxmlformats.org/drawingml/2006', 'http://purl.oclc.org/ooxml/drawingml')
|
||||
for k, v in iteritems(TRANSITIONAL_NAMESPACES)
|
||||
}
|
||||
# }}}
|
||||
|
||||
|
||||
def barename(x):
    """Strip any '{namespace}' prefix from a Clark-notation tag name."""
    return x.split('}')[-1]
|
||||
|
||||
|
||||
def XML(x):
    """Expand *x* into Clark notation in the reserved xml: namespace, e.g.
    XML('lang') -> '{http://www.w3.org/XML/1998/namespace}lang'."""
    return '{%s}%s' % (TRANSITIONAL_NAMESPACES['xml'], x)
|
||||
|
||||
|
||||
def generate_anchor(name, existing):
    """Return a unique anchor id derived from *name*: ASCII-fied, stripped
    of non-identifier characters, and suffixed with a counter until it does
    not collide with anything in *existing*."""
    base = 'id_' + re.sub(r'[^0-9a-zA-Z_]', '', ascii_text(name)).lstrip('_')
    candidate = base
    counter = 1
    while candidate in existing:
        candidate = '%s_%d' % (base, counter)
        counter += 1
    return candidate
|
||||
|
||||
|
||||
class DOCXNamespace(object):
    """Holds the XML namespace maps (transitional or strict OOXML) together
    with helpers for namespace-aware XPath, tag expansion and element
    creation."""

    def __init__(self, transitional=True):
        # Compiled XPath expressions, keyed by expression text
        self.xpath_cache = {}
        if transitional:
            self.namespaces = TRANSITIONAL_NAMESPACES.copy()
            self.names = TRANSITIONAL_NAMES.copy()
        else:
            self.namespaces = STRICT_NAMESPACES.copy()
            self.names = STRICT_NAMES.copy()

    def XPath(self, expr):
        """Compile *expr* against this namespace map (results are cached)."""
        ans = self.xpath_cache.get(expr, None)
        if ans is None:
            self.xpath_cache[expr] = ans = X(expr, namespaces=self.namespaces)
        return ans

    def is_tag(self, x, q):
        """Return True if element (or tag string) *x* matches the prefixed
        name *q*, e.g. is_tag(elem, 'w:p')."""
        tag = getattr(x, 'tag', x)
        ns, name = q.partition(':')[0::2]
        return '{%s}%s' % (self.namespaces.get(ns, None), name) == tag

    def expand(self, name, sep=':'):
        """Expand a prefixed name like 'w:p' into Clark notation. Names
        containing no separator are returned unchanged."""
        ns, tag = name.partition(sep)[::2]
        if ns and tag:
            tag = '{%s}%s' % (self.namespaces[ns], tag)
        return tag or ns

    def get(self, x, attr, default=None):
        """Namespace-aware attribute lookup on element *x*."""
        return x.attrib.get(self.expand(attr), default)

    def ancestor(self, elem, name):
        """Return the nearest ancestor of *elem* matching *name*, or None."""
        try:
            return self.XPath('ancestor::%s[1]' % name)(elem)[0]
        except IndexError:
            return None

    def children(self, elem, *args):
        """All direct children of *elem* matching any of the given names."""
        return self.XPath('|'.join('child::%s' % a for a in args))(elem)

    def descendants(self, elem, *args):
        """All descendants of *elem* matching any of the given names."""
        return self.XPath('|'.join('descendant::%s' % a for a in args))(elem)

    def makeelement(self, root, tag, append=True, **attrs):
        """Create (and by default append) a child of *root*. Attribute
        names use '_' instead of ':' as the prefix separator, e.g. w_val."""
        ans = root.makeelement(self.expand(tag), **{self.expand(k, sep='_'):v for k, v in iteritems(attrs)})
        if append:
            root.append(ans)
        return ans
|
||||
388
ebook_converter/ebooks/docx/numbering.py
Normal file
388
ebook_converter/ebooks/docx/numbering.py
Normal file
@@ -0,0 +1,388 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=utf-8
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
import re, string
|
||||
from collections import Counter, defaultdict
|
||||
from functools import partial
|
||||
|
||||
from lxml.html.builder import OL, UL, SPAN
|
||||
|
||||
from calibre.ebooks.docx.block_styles import ParagraphStyle
|
||||
from calibre.ebooks.docx.char_styles import RunStyle, inherit
|
||||
from calibre.ebooks.metadata import roman
|
||||
from polyglot.builtins import iteritems, unicode_type
|
||||
|
||||
# Map DOCX w:numFmt values to their CSS list-style-type equivalents.
# Formats not listed here fall back to 'decimal' (see Level.read_from_xml).
STYLE_MAP = {
    'aiueo': 'hiragana',
    'aiueoFullWidth': 'hiragana',
    'hebrew1': 'hebrew',
    'iroha': 'katakana-iroha',
    'irohaFullWidth': 'katakana-iroha',
    'lowerLetter': 'lower-alpha',
    'lowerRoman': 'lower-roman',
    'none': 'none',
    'upperLetter': 'upper-alpha',
    'upperRoman': 'upper-roman',
    'chineseCounting': 'cjk-ideographic',
    'decimalZero': 'decimal-leading-zero',
}
|
||||
|
||||
|
||||
def alphabet(val, lower=True):
    """Render a 1-based counter value *val* as a single letter, wrapping
    around past 'z'/'Z' (27 -> 'a' again)."""
    letters = string.ascii_lowercase if lower else string.ascii_uppercase
    return letters[abs(val - 1) % len(letters)]
|
||||
|
||||
|
||||
# Counter formatters for numbered list styles: maps a CSS
# list-style-type to a callable rendering an integer counter value.
# Styles not listed here are rendered as plain decimal numbers
# (see Level.format_template).
alphabet_map = {
    'lower-alpha':alphabet, 'upper-alpha':partial(alphabet, lower=False),
    'lower-roman':lambda x:roman(x).lower(), 'upper-roman':roman,
    'decimal-leading-zero': lambda x: '0%d' % x
}
|
||||
|
||||
|
||||
class Level(object):
    """A single list level (a w:lvl element) from a DOCX numbering
    definition: start value, restart behavior, number/bullet format and
    the paragraph/character styles attached to the level."""

    def __init__(self, namespace, lvl=None):
        self.namespace = namespace
        # Level at which this counter restarts (None = Word default).
        self.restart = None
        self.start = 0
        # CSS list-style-type for this level.
        self.fmt = 'decimal'
        # style_id of a linked paragraph style (w:pStyle), if any.
        self.para_link = None
        self.paragraph_style = self.character_style = None
        self.is_numbered = False
        # Custom "%1.%2" style template for numbered levels.
        self.num_template = None
        # Literal bullet text for non-symbol bullets.
        self.bullet_template = None
        # w:numPicBulletId for picture bullets, if any.
        self.pic_id = None

        if lvl is not None:
            self.read_from_xml(lvl)

    def copy(self):
        """Shallow copy; style objects are shared, not duplicated."""
        ans = Level(self.namespace)
        for x in ('restart', 'pic_id', 'start', 'fmt', 'para_link', 'paragraph_style', 'character_style', 'is_numbered', 'num_template', 'bullet_template'):
            setattr(ans, x, getattr(self, x))
        return ans

    def format_template(self, counter, ilvl, template):
        """Expand a Word level-text template, replacing each %N
        placeholder with the formatted counter value for level N-1.
        Counters for levels above *ilvl* render their previous value
        (hence the -1). Ends with a non-breaking space as separator."""
        def sub(m):
            x = int(m.group(1)) - 1
            if x > ilvl or x not in counter:
                return ''
            val = counter[x] - (0 if x == ilvl else 1)
            formatter = alphabet_map.get(self.fmt, lambda x: '%d' % x)
            return formatter(val)
        return re.sub(r'%(\d+)', sub, template).rstrip() + '\xa0'

    def read_from_xml(self, lvl, override=False):
        """Populate this level from a w:lvl element. Can be called more
        than once (e.g. for w:lvlOverride), updating existing values."""
        XPath, get = self.namespace.XPath, self.namespace.get
        for lr in XPath('./w:lvlRestart[@w:val]')(lvl):
            try:
                self.restart = int(get(lr, 'w:val'))
            except (TypeError, ValueError):
                pass

        for lr in XPath('./w:start[@w:val]')(lvl):
            try:
                self.start = int(get(lr, 'w:val'))
            except (TypeError, ValueError):
                pass

        for rPr in XPath('./w:rPr')(lvl):
            ps = RunStyle(self.namespace, rPr)
            if self.character_style is None:
                self.character_style = ps
            else:
                self.character_style.update(ps)

        # Level text must be read before numFmt, which interprets it.
        lt = None
        for lr in XPath('./w:lvlText[@w:val]')(lvl):
            lt = get(lr, 'w:val')

        for lr in XPath('./w:numFmt[@w:val]')(lvl):
            val = get(lr, 'w:val')
            if val == 'bullet':
                self.is_numbered = False
                cs = self.character_style
                # Wingdings/Symbol glyphs (and the common \uf0a7/'o'
                # characters) map onto standard CSS bullet shapes.
                if lt in {'\uf0a7', 'o'} or (
                        cs is not None and cs.font_family is not inherit and cs.font_family.lower() in {'wingdings', 'symbol'}):
                    self.fmt = {'\uf0a7':'square', 'o':'circle'}.get(lt, 'disc')
                else:
                    self.bullet_template = lt
                for lpid in XPath('./w:lvlPicBulletId[@w:val]')(lvl):
                    self.pic_id = get(lpid, 'w:val')
            else:
                self.is_numbered = True
                self.fmt = STYLE_MAP.get(val, 'decimal')
                # A plain '%N.' template is the browser default rendering,
                # so only keep genuinely custom templates.
                if lt and re.match(r'%\d+\.$', lt) is None:
                    self.num_template = lt

        for lr in XPath('./w:pStyle[@w:val]')(lvl):
            self.para_link = get(lr, 'w:val')

        for pPr in XPath('./w:pPr')(lvl):
            ps = ParagraphStyle(self.namespace, pPr)
            if self.paragraph_style is None:
                self.paragraph_style = ps
            else:
                self.paragraph_style.update(ps)

    def css(self, images, pic_map, rid_map):
        """CSS properties for a list wrapper using this level; resolves
        picture bullets into a small list-style-image when possible."""
        ans = {'list-style-type': self.fmt}
        if self.pic_id:
            rid = pic_map.get(self.pic_id, None)
            if rid:
                try:
                    fname = images.generate_filename(rid, rid_map=rid_map, max_width=20, max_height=20)
                except Exception:
                    # Best effort: fall back to the plain bullet style.
                    fname = None
                else:
                    ans['list-style-image'] = 'url("images/%s")' % fname
        return ans

    def char_css(self):
        """CSS for the bullet/number text itself; font-family is dropped
        since it is handled separately from the marker character."""
        try:
            css = self.character_style.css
        except AttributeError:
            css = {}
        css.pop('font-family', None)
        return css
|
||||
|
||||
|
||||
class NumberingDefinition(object):
    """An abstract numbering definition (w:abstractNum): a mapping of
    level index -> Level, plus the definition's id."""

    def __init__(self, namespace, parent=None, an_id=None):
        self.namespace = namespace
        XPath, get = self.namespace.XPath, self.namespace.get
        self.levels = {}
        self.abstract_numbering_definition_id = an_id
        if parent is not None:
            for lvl_elem in XPath('./w:lvl')(parent):
                try:
                    index = int(get(lvl_elem, 'w:ilvl', 0))
                except (TypeError, ValueError):
                    index = 0
                self.levels[index] = Level(namespace, lvl_elem)

    def copy(self):
        """Deep-ish copy: levels are copied, the namespace is shared."""
        duplicate = NumberingDefinition(
            self.namespace, an_id=self.abstract_numbering_definition_id)
        for index, level in iteritems(self.levels):
            duplicate.levels[index] = level.copy()
        return duplicate
|
||||
|
||||
|
||||
class Numbering(object):
    """Reads numbering.xml and applies list markup (ol/ul/li) to the
    converted document body."""

    def __init__(self, namespace):
        self.namespace = namespace
        # abstractNumId -> NumberingDefinition
        self.definitions = {}
        # numId -> NumberingDefinition instance (with overrides applied)
        self.instances = {}
        # abstractNumId -> Counter of current value per level
        self.counters = defaultdict(Counter)
        # numId -> {level: start value}
        self.starts = {}
        # numPicBulletId -> image relationship id
        self.pic_map = {}

    def __call__(self, root, styles, rid_map):
        ' Read all numbering style definitions '
        XPath, get = self.namespace.XPath, self.namespace.get
        self.rid_map = rid_map
        for npb in XPath('./w:numPicBullet[@w:numPicBulletId]')(root):
            npbid = get(npb, 'w:numPicBulletId')
            for idata in XPath('descendant::v:imagedata[@r:id]')(npb):
                rid = get(idata, 'r:id')
                self.pic_map[npbid] = rid
        # Definitions that reference a numbering style via w:numStyleLink
        # cannot be resolved until all w:num instances have been read.
        lazy_load = {}
        for an in XPath('./w:abstractNum[@w:abstractNumId]')(root):
            an_id = get(an, 'w:abstractNumId')
            nsl = XPath('./w:numStyleLink[@w:val]')(an)
            if nsl:
                lazy_load[an_id] = get(nsl[0], 'w:val')
            else:
                nd = NumberingDefinition(self.namespace, an, an_id=an_id)
                self.definitions[an_id] = nd

        def create_instance(n, definition):
            # Copy the abstract definition and apply any per-instance
            # w:lvlOverride elements from the w:num element.
            nd = definition.copy()
            start_overrides = {}
            for lo in XPath('./w:lvlOverride')(n):
                try:
                    ilvl = int(get(lo, 'w:ilvl'))
                except (ValueError, TypeError):
                    ilvl = None
                for so in XPath('./w:startOverride[@w:val]')(lo):
                    try:
                        start_override = int(get(so, 'w:val'))
                    except (TypeError, ValueError):
                        pass
                    else:
                        start_overrides[ilvl] = start_override
                for lvl in XPath('./w:lvl')(lo)[:1]:
                    nilvl = get(lvl, 'w:ilvl')
                    # NOTE(review): nilvl is a string while nd.levels is
                    # keyed by int, and a newly created alvl is never
                    # stored back into nd.levels — confirm against
                    # upstream before changing.
                    ilvl = nilvl if ilvl is None else ilvl
                    alvl = nd.levels.get(ilvl, None)
                    if alvl is None:
                        alvl = Level(self.namespace)
                    alvl.read_from_xml(lvl, override=True)
            for ilvl, so in iteritems(start_overrides):
                try:
                    # BUG FIX: previously assigned `start_override`, the
                    # stale leftover from the parse loop above, which
                    # applied the *last* parsed override to every level.
                    # Each level must get its own override value `so`.
                    nd.levels[ilvl].start = so
                except KeyError:
                    pass
            return nd

        # Instances whose abstract definition is lazily loaded get a
        # second chance after numStyleLink resolution below.
        next_pass = {}
        for n in XPath('./w:num[@w:numId]')(root):
            an_id = None
            num_id = get(n, 'w:numId')
            for an in XPath('./w:abstractNumId[@w:val]')(n):
                an_id = get(an, 'w:val')
            d = self.definitions.get(an_id, None)
            if d is None:
                next_pass[num_id] = (an_id, n)
                continue
            self.instances[num_id] = create_instance(n, d)

        numbering_links = styles.numbering_style_links
        for an_id, style_link in iteritems(lazy_load):
            num_id = numbering_links[style_link]
            self.definitions[an_id] = self.instances[num_id].copy()

        for num_id, (an_id, n) in iteritems(next_pass):
            d = self.definitions.get(an_id, None)
            if d is not None:
                self.instances[num_id] = create_instance(n, d)

        for num_id, d in iteritems(self.instances):
            self.starts[num_id] = {lvl:d.levels[lvl].start for lvl in d.levels}

    def get_pstyle(self, num_id, style_id):
        """Level index whose pStyle links to *style_id*, or None."""
        d = self.instances.get(num_id, None)
        if d is not None:
            for ilvl, lvl in iteritems(d.levels):
                if lvl.para_link == style_id:
                    return ilvl

    def get_para_style(self, num_id, lvl):
        """ParagraphStyle attached to level *lvl* of *num_id*, or None."""
        d = self.instances.get(num_id, None)
        if d is not None:
            lvl = d.levels.get(lvl, None)
        return getattr(lvl, 'paragraph_style', None)

    def update_counter(self, counter, levelnum, levels):
        """Advance the counter for *levelnum* and reset any deeper
        levels that restart when this level increments."""
        counter[levelnum] += 1
        for ilvl, lvl in iteritems(levels):
            restart = lvl.restart
            if (restart is None and ilvl == levelnum + 1) or restart == levelnum + 1:
                counter[ilvl] = lvl.start

    def apply_markup(self, items, body, styles, object_map, images):
        """Turn numbered paragraphs into li elements, wrap runs of them
        in ol/ul, and convert custom-bullet lists into table layout."""
        # First pass: tag each numbered paragraph as an <li>, recording
        # its list id/level/rendered template on the element.
        seen_instances = set()
        for p, num_id, ilvl in items:
            d = self.instances.get(num_id, None)
            if d is not None:
                lvl = d.levels.get(ilvl, None)
                if lvl is not None:
                    an_id = d.abstract_numbering_definition_id
                    counter = self.counters[an_id]
                    if ilvl not in counter or num_id not in seen_instances:
                        counter[ilvl] = self.starts[num_id][ilvl]
                    seen_instances.add(num_id)
                    p.tag = 'li'
                    p.set('value', '%s' % counter[ilvl])
                    p.set('list-lvl', unicode_type(ilvl))
                    p.set('list-id', num_id)
                    if lvl.num_template is not None:
                        val = lvl.format_template(counter, ilvl, lvl.num_template)
                        p.set('list-template', val)
                    elif lvl.bullet_template is not None:
                        val = lvl.format_template(counter, ilvl, lvl.bullet_template)
                        p.set('list-template', val)
                    self.update_counter(counter, ilvl, d.levels)

        templates = {}

        def commit(current_run):
            # Wrap a contiguous run of sibling <li> elements (same list
            # id and level) in an <ol>/<ul> inserted at the run's start.
            if not current_run:
                return
            start = current_run[0]
            parent = start.getparent()
            idx = parent.index(start)

            d = self.instances[start.get('list-id')]
            ilvl = int(start.get('list-lvl'))
            lvl = d.levels[ilvl]
            lvlid = start.get('list-id') + start.get('list-lvl')
            has_template = 'list-template' in start.attrib
            wrap = (OL if lvl.is_numbered or has_template else UL)('\n\t')
            if has_template:
                wrap.set('lvlid', lvlid)
            else:
                wrap.set('class', styles.register(lvl.css(images, self.pic_map, self.rid_map), 'list'))
            ccss = lvl.char_css()
            if ccss:
                ccss = styles.register(ccss, 'bullet')
            parent.insert(idx, wrap)
            last_val = None
            for child in current_run:
                wrap.append(child)
                child.tail = '\n\t'
                if has_template:
                    # Move the item's content into a span and prepend a
                    # span holding the rendered number/bullet text.
                    span = SPAN()
                    span.text = child.text
                    child.text = None
                    for gc in child:
                        span.append(gc)
                    child.append(span)
                    span = SPAN(child.get('list-template'))
                    if ccss:
                        span.set('class', ccss)
                    last = templates.get(lvlid, '')
                    if span.text and len(span.text) > len(last):
                        templates[lvlid] = span.text
                    child.insert(0, span)
                for attr in ('list-lvl', 'list-id', 'list-template'):
                    child.attrib.pop(attr, None)
                val = int(child.get('value'))
                # Only keep an explicit value attribute when it breaks
                # the natural numbering sequence.
                if last_val == val - 1 or wrap.tag == 'ul' or (last_val is None and val == 1):
                    child.attrib.pop('value')
                last_val = val
            current_run[-1].tail = '\n'
            del current_run[:]

        parents = set()
        for child in body.iterdescendants('li'):
            parents.add(child.getparent())

        for parent in parents:
            current_run = []
            for child in parent:
                if child.tag == 'li':
                    if current_run:
                        last = current_run[-1]
                        if (last.get('list-id') , last.get('list-lvl')) != (child.get('list-id'), child.get('list-lvl')):
                            commit(current_run)
                    current_run.append(child)
                else:
                    commit(current_run)
            commit(current_run)

        # Convert the list items that use custom text for bullets into tables
        # so that they display correctly
        for wrap in body.xpath('//ol[@lvlid]'):
            wrap.attrib.pop('lvlid')
            wrap.tag = 'div'
            wrap.set('style', 'display:table')
            for i, li in enumerate(wrap.iterchildren('li')):
                li.tag = 'div'
                li.attrib.pop('value', None)
                li.set('style', 'display:table-row')
                obj = object_map[li]
                bs = styles.para_cache[obj]
                if i == 0:
                    # Promote the first item's left margin to the table
                    # wrapper so the whole list is indented together.
                    wrap.set('style', 'display:table; padding-left:%s' %
                             bs.css.get('margin-left', '0'))
                    bs.css.pop('margin-left', None)
                for child in li:
                    child.set('style', 'display:table-cell')
|
||||
21
ebook_converter/ebooks/docx/settings.py
Normal file
21
ebook_converter/ebooks/docx/settings.py
Normal file
@@ -0,0 +1,21 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=utf-8
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
|
||||
class Settings(object):
    """Reader for document-level DOCX settings (settings.xml).
    Currently only the default tab stop width is extracted."""

    def __init__(self, namespace):
        # Word's default tab stop is 720 twips; store it in points.
        self.default_tab_stop = 720 / 20
        self.namespace = namespace

    def __call__(self, root):
        """Read settings from the parsed settings.xml *root*; invalid or
        missing values leave the defaults untouched."""
        xpath, get = self.namespace.XPath, self.namespace.get
        for elem in xpath('//w:defaultTabStop[@w:val]')(root):
            try:
                self.default_tab_stop = int(get(elem, 'w:val')) / 20
            except (ValueError, TypeError, AttributeError):
                pass
||||
|
||||
504
ebook_converter/ebooks/docx/styles.py
Normal file
504
ebook_converter/ebooks/docx/styles.py
Normal file
@@ -0,0 +1,504 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=utf-8
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
import textwrap
|
||||
from collections import OrderedDict, Counter
|
||||
|
||||
from calibre.ebooks.docx.block_styles import ParagraphStyle, inherit, twips
|
||||
from calibre.ebooks.docx.char_styles import RunStyle
|
||||
from calibre.ebooks.docx.tables import TableStyle
|
||||
from polyglot.builtins import iteritems, itervalues
|
||||
|
||||
|
||||
class PageProperties(object):

    '''
    Class representing page level properties (page size/margins) read from
    sectPr elements.
    '''

    def __init__(self, namespace, elems=()):
        # A4 defaults, in points.
        self.width, self.height = 595.28, 841.89
        self.margin_left = self.margin_right = 72

        def assign(attr, raw):
            # twips() yields None for absent/unparsable values, in which
            # case the default is kept.
            converted = twips(raw)
            if converted is not None:
                setattr(self, attr, converted)

        for sectPr in elems:
            for pgSz in namespace.XPath('./w:pgSz')(sectPr):
                assign('width', namespace.get(pgSz, 'w:w'))
                assign('height', namespace.get(pgSz, 'w:h'))
            for pgMar in namespace.XPath('./w:pgMar')(sectPr):
                assign('margin_left', namespace.get(pgMar, 'w:left'))
                assign('margin_right', namespace.get(pgMar, 'w:right'))
||||
|
||||
class Style(object):
    '''
    Class representing a <w:style> element. Can contain block, character, etc. styles.
    '''

    def __init__(self, namespace, elem):
        self.namespace = namespace
        self.name_path = namespace.XPath('./w:name[@w:val]')
        self.based_on_path = namespace.XPath('./w:basedOn[@w:val]')
        # Set True once based-on inheritance has been applied (see
        # Styles.__call__'s resolve()).
        self.resolved = False
        self.style_id = namespace.get(elem, 'w:styleId')
        self.style_type = namespace.get(elem, 'w:type')
        names = self.name_path(elem)
        self.name = namespace.get(names[-1], 'w:val') if names else None
        based_on = self.based_on_path(elem)
        self.based_on = namespace.get(based_on[0], 'w:val') if based_on else None
        # Numbering styles do not participate in based-on inheritance.
        if self.style_type == 'numbering':
            self.based_on = None
        self.is_default = namespace.get(elem, 'w:default') in {'1', 'on', 'true'}

        self.paragraph_style = self.character_style = self.table_style = None

        if self.style_type in {'paragraph', 'character', 'table'}:
            if self.style_type == 'table':
                for tblPr in namespace.XPath('./w:tblPr')(elem):
                    ts = TableStyle(namespace, tblPr)
                    if self.table_style is None:
                        self.table_style = ts
                    else:
                        self.table_style.update(ts)
            # Table styles also carry paragraph-level formatting.
            if self.style_type in {'paragraph', 'table'}:
                for pPr in namespace.XPath('./w:pPr')(elem):
                    ps = ParagraphStyle(namespace, pPr)
                    if self.paragraph_style is None:
                        self.paragraph_style = ps
                    else:
                        self.paragraph_style.update(ps)

            for rPr in namespace.XPath('./w:rPr')(elem):
                rs = RunStyle(namespace, rPr)
                if self.character_style is None:
                    self.character_style = rs
                else:
                    self.character_style.update(rs)

        if self.style_type in {'numbering', 'paragraph'}:
            # Link to a numbering definition via w:numPr/w:numId, used by
            # Styles.numbering_style_links.
            self.numbering_style_link = None
            for x in namespace.XPath('./w:pPr/w:numPr/w:numId[@w:val]')(elem):
                self.numbering_style_link = namespace.get(x, 'w:val')

    def resolve_based_on(self, parent):
        """Inherit any table/paragraph/character formatting from the
        *parent* style that this style does not itself define."""
        if parent.table_style is not None:
            if self.table_style is None:
                self.table_style = TableStyle(self.namespace)
            self.table_style.resolve_based_on(parent.table_style)
        if parent.paragraph_style is not None:
            if self.paragraph_style is None:
                self.paragraph_style = ParagraphStyle(self.namespace)
            self.paragraph_style.resolve_based_on(parent.paragraph_style)
        if parent.character_style is not None:
            if self.character_style is None:
                self.character_style = RunStyle(self.namespace)
            self.character_style.resolve_based_on(parent.character_style)
||||
|
||||
|
||||
class Styles(object):
|
||||
|
||||
'''
|
||||
Collection of all styles defined in the document. Used to get the final styles applicable to elements in the document markup.
|
||||
'''
|
||||
|
||||
def __init__(self, namespace, tables):
|
||||
self.namespace = namespace
|
||||
self.id_map = OrderedDict()
|
||||
self.para_cache = {}
|
||||
self.para_char_cache = {}
|
||||
self.run_cache = {}
|
||||
self.classes = {}
|
||||
self.counter = Counter()
|
||||
self.default_styles = {}
|
||||
self.tables = tables
|
||||
self.numbering_style_links = {}
|
||||
self.default_paragraph_style = self.default_character_style = None
|
||||
|
||||
def __iter__(self):
|
||||
for s in itervalues(self.id_map):
|
||||
yield s
|
||||
|
||||
def __getitem__(self, key):
|
||||
return self.id_map[key]
|
||||
|
||||
def __len__(self):
|
||||
return len(self.id_map)
|
||||
|
||||
def get(self, key, default=None):
|
||||
return self.id_map.get(key, default)
|
||||
|
||||
def __call__(self, root, fonts, theme):
|
||||
self.fonts, self.theme = fonts, theme
|
||||
self.default_paragraph_style = self.default_character_style = None
|
||||
if root is not None:
|
||||
for s in self.namespace.XPath('//w:style')(root):
|
||||
s = Style(self.namespace, s)
|
||||
if s.style_id:
|
||||
self.id_map[s.style_id] = s
|
||||
if s.is_default:
|
||||
self.default_styles[s.style_type] = s
|
||||
if getattr(s, 'numbering_style_link', None) is not None:
|
||||
self.numbering_style_links[s.style_id] = s.numbering_style_link
|
||||
|
||||
for dd in self.namespace.XPath('./w:docDefaults')(root):
|
||||
for pd in self.namespace.XPath('./w:pPrDefault')(dd):
|
||||
for pPr in self.namespace.XPath('./w:pPr')(pd):
|
||||
ps = ParagraphStyle(self.namespace, pPr)
|
||||
if self.default_paragraph_style is None:
|
||||
self.default_paragraph_style = ps
|
||||
else:
|
||||
self.default_paragraph_style.update(ps)
|
||||
for pd in self.namespace.XPath('./w:rPrDefault')(dd):
|
||||
for pPr in self.namespace.XPath('./w:rPr')(pd):
|
||||
ps = RunStyle(self.namespace, pPr)
|
||||
if self.default_character_style is None:
|
||||
self.default_character_style = ps
|
||||
else:
|
||||
self.default_character_style.update(ps)
|
||||
|
||||
def resolve(s, p):
|
||||
if p is not None:
|
||||
if not p.resolved:
|
||||
resolve(p, self.get(p.based_on))
|
||||
s.resolve_based_on(p)
|
||||
s.resolved = True
|
||||
|
||||
for s in self:
|
||||
if not s.resolved:
|
||||
resolve(s, self.get(s.based_on))
|
||||
|
||||
def para_val(self, parent_styles, direct_formatting, attr):
|
||||
val = getattr(direct_formatting, attr)
|
||||
if val is inherit:
|
||||
for ps in reversed(parent_styles):
|
||||
pval = getattr(ps, attr)
|
||||
if pval is not inherit:
|
||||
val = pval
|
||||
break
|
||||
return val
|
||||
|
||||
def run_val(self, parent_styles, direct_formatting, attr):
|
||||
val = getattr(direct_formatting, attr)
|
||||
if val is not inherit:
|
||||
return val
|
||||
if attr in direct_formatting.toggle_properties:
|
||||
# The spec (section 17.7.3) does not make sense, so we follow the behavior
|
||||
# of Word, which seems to only consider the document default if the
|
||||
# property has not been defined in any styles.
|
||||
vals = [int(getattr(rs, attr)) for rs in parent_styles if rs is not self.default_character_style and getattr(rs, attr) is not inherit]
|
||||
if vals:
|
||||
return sum(vals) % 2 == 1
|
||||
if self.default_character_style is not None:
|
||||
return getattr(self.default_character_style, attr) is True
|
||||
return False
|
||||
for rs in reversed(parent_styles):
|
||||
rval = getattr(rs, attr)
|
||||
if rval is not inherit:
|
||||
return rval
|
||||
return val
|
||||
|
||||
def resolve_paragraph(self, p):
|
||||
ans = self.para_cache.get(p, None)
|
||||
if ans is None:
|
||||
linked_style = None
|
||||
ans = self.para_cache[p] = ParagraphStyle(self.namespace)
|
||||
ans.style_name = None
|
||||
direct_formatting = None
|
||||
is_section_break = False
|
||||
for pPr in self.namespace.XPath('./w:pPr')(p):
|
||||
ps = ParagraphStyle(self.namespace, pPr)
|
||||
if direct_formatting is None:
|
||||
direct_formatting = ps
|
||||
else:
|
||||
direct_formatting.update(ps)
|
||||
if self.namespace.XPath('./w:sectPr')(pPr):
|
||||
is_section_break = True
|
||||
|
||||
if direct_formatting is None:
|
||||
direct_formatting = ParagraphStyle(self.namespace)
|
||||
parent_styles = []
|
||||
if self.default_paragraph_style is not None:
|
||||
parent_styles.append(self.default_paragraph_style)
|
||||
ts = self.tables.para_style(p)
|
||||
if ts is not None:
|
||||
parent_styles.append(ts)
|
||||
|
||||
default_para = self.default_styles.get('paragraph', None)
|
||||
if direct_formatting.linked_style is not None:
|
||||
ls = linked_style = self.get(direct_formatting.linked_style)
|
||||
if ls is not None:
|
||||
ans.style_name = ls.name
|
||||
ps = ls.paragraph_style
|
||||
if ps is not None:
|
||||
parent_styles.append(ps)
|
||||
if ls.character_style is not None:
|
||||
self.para_char_cache[p] = ls.character_style
|
||||
elif default_para is not None:
|
||||
if default_para.paragraph_style is not None:
|
||||
parent_styles.append(default_para.paragraph_style)
|
||||
if default_para.character_style is not None:
|
||||
self.para_char_cache[p] = default_para.character_style
|
||||
|
||||
def has_numbering(block_style):
|
||||
num_id, lvl = getattr(block_style, 'numbering_id', inherit), getattr(block_style, 'numbering_level', inherit)
|
||||
return num_id is not None and num_id is not inherit and lvl is not None and lvl is not inherit
|
||||
|
||||
is_numbering = has_numbering(direct_formatting)
|
||||
is_section_break = is_section_break and not self.namespace.XPath('./w:r')(p)
|
||||
|
||||
if is_numbering and not is_section_break:
|
||||
num_id, lvl = direct_formatting.numbering_id, direct_formatting.numbering_level
|
||||
p.set('calibre_num_id', '%s:%s' % (lvl, num_id))
|
||||
ps = self.numbering.get_para_style(num_id, lvl)
|
||||
if ps is not None:
|
||||
parent_styles.append(ps)
|
||||
if (
|
||||
not is_numbering and not is_section_break and linked_style is not None and has_numbering(linked_style.paragraph_style)
|
||||
):
|
||||
num_id, lvl = linked_style.paragraph_style.numbering_id, linked_style.paragraph_style.numbering_level
|
||||
p.set('calibre_num_id', '%s:%s' % (lvl, num_id))
|
||||
is_numbering = True
|
||||
ps = self.numbering.get_para_style(num_id, lvl)
|
||||
if ps is not None:
|
||||
parent_styles.append(ps)
|
||||
|
||||
for attr in ans.all_properties:
|
||||
if not (is_numbering and attr == 'text_indent'): # skip text-indent for lists
|
||||
setattr(ans, attr, self.para_val(parent_styles, direct_formatting, attr))
|
||||
ans.linked_style = direct_formatting.linked_style
|
||||
return ans
|
||||
|
||||
def resolve_run(self, r):
|
||||
ans = self.run_cache.get(r, None)
|
||||
if ans is None:
|
||||
p = self.namespace.XPath('ancestor::w:p[1]')(r)
|
||||
p = p[0] if p else None
|
||||
ans = self.run_cache[r] = RunStyle(self.namespace)
|
||||
direct_formatting = None
|
||||
for rPr in self.namespace.XPath('./w:rPr')(r):
|
||||
rs = RunStyle(self.namespace, rPr)
|
||||
if direct_formatting is None:
|
||||
direct_formatting = rs
|
||||
else:
|
||||
direct_formatting.update(rs)
|
||||
|
||||
if direct_formatting is None:
|
||||
direct_formatting = RunStyle(self.namespace)
|
||||
|
||||
parent_styles = []
|
||||
default_char = self.default_styles.get('character', None)
|
||||
if self.default_character_style is not None:
|
||||
parent_styles.append(self.default_character_style)
|
||||
pstyle = self.para_char_cache.get(p, None)
|
||||
if pstyle is not None:
|
||||
parent_styles.append(pstyle)
|
||||
# As best as I can understand the spec, table overrides should be
|
||||
# applied before paragraph overrides, but word does it
|
||||
# this way, see the December 2007 table header in the demo
|
||||
# document.
|
||||
ts = self.tables.run_style(p)
|
||||
if ts is not None:
|
||||
parent_styles.append(ts)
|
||||
if direct_formatting.linked_style is not None:
|
||||
ls = getattr(self.get(direct_formatting.linked_style), 'character_style', None)
|
||||
if ls is not None:
|
||||
parent_styles.append(ls)
|
||||
elif default_char is not None and default_char.character_style is not None:
|
||||
parent_styles.append(default_char.character_style)
|
||||
|
||||
for attr in ans.all_properties:
|
||||
setattr(ans, attr, self.run_val(parent_styles, direct_formatting, attr))
|
||||
|
||||
if ans.font_family is not inherit:
|
||||
ff = self.theme.resolve_font_family(ans.font_family)
|
||||
ans.font_family = self.fonts.family_for(ff, ans.b, ans.i)
|
||||
|
||||
return ans
|
||||
|
||||
def resolve(self, obj):
|
||||
if obj.tag.endswith('}p'):
|
||||
return self.resolve_paragraph(obj)
|
||||
if obj.tag.endswith('}r'):
|
||||
return self.resolve_run(obj)
|
||||
|
||||
def cascade(self, layers):
|
||||
self.body_font_family = 'serif'
|
||||
self.body_font_size = '10pt'
|
||||
self.body_color = 'black'
|
||||
|
||||
def promote_property(char_styles, block_style, prop):
|
||||
vals = {getattr(s, prop) for s in char_styles}
|
||||
if len(vals) == 1:
|
||||
# All the character styles have the same value
|
||||
for s in char_styles:
|
||||
setattr(s, prop, inherit)
|
||||
setattr(block_style, prop, next(iter(vals)))
|
||||
|
||||
for p, runs in iteritems(layers):
|
||||
has_links = '1' in {r.get('is-link', None) for r in runs}
|
||||
char_styles = [self.resolve_run(r) for r in runs]
|
||||
block_style = self.resolve_paragraph(p)
|
||||
for prop in ('font_family', 'font_size', 'cs_font_family', 'cs_font_size', 'color'):
|
||||
if has_links and prop == 'color':
|
||||
# We cannot promote color as browser rendering engines will
|
||||
# override the link color setting it to blue, unless the
|
||||
# color is specified on the link element itself
|
||||
continue
|
||||
promote_property(char_styles, block_style, prop)
|
||||
for s in char_styles:
|
||||
if s.text_decoration == 'none':
|
||||
# The default text decoration is 'none'
|
||||
s.text_decoration = inherit
|
||||
|
||||
def promote_most_common(block_styles, prop, default):
|
||||
c = Counter()
|
||||
for s in block_styles:
|
||||
val = getattr(s, prop)
|
||||
if val is not inherit:
|
||||
c[val] += 1
|
||||
val = None
|
||||
if c:
|
||||
val = c.most_common(1)[0][0]
|
||||
for s in block_styles:
|
||||
oval = getattr(s, prop)
|
||||
if oval is inherit:
|
||||
if default != val:
|
||||
setattr(s, prop, default)
|
||||
elif oval == val:
|
||||
setattr(s, prop, inherit)
|
||||
return val
|
||||
|
||||
block_styles = tuple(self.resolve_paragraph(p) for p in layers)
|
||||
|
||||
ff = promote_most_common(block_styles, 'font_family', self.body_font_family)
|
||||
if ff is not None:
|
||||
self.body_font_family = ff
|
||||
|
||||
fs = promote_most_common(block_styles, 'font_size', int(self.body_font_size[:2]))
|
||||
if fs is not None:
|
||||
self.body_font_size = '%.3gpt' % fs
|
||||
|
||||
color = promote_most_common(block_styles, 'color', self.body_color)
|
||||
if color is not None:
|
||||
self.body_color = color
|
||||
|
||||
def resolve_numbering(self, numbering):
|
||||
# When a numPr element appears inside a paragraph style, the lvl info
|
||||
# must be discarded and pStyle used instead.
|
||||
self.numbering = numbering
|
||||
for style in self:
|
||||
ps = style.paragraph_style
|
||||
if ps is not None and ps.numbering_id is not inherit:
|
||||
lvl = numbering.get_pstyle(ps.numbering_id, style.style_id)
|
||||
if lvl is None:
|
||||
ps.numbering_id = ps.numbering_level = inherit
|
||||
else:
|
||||
ps.numbering_level = lvl
|
||||
|
||||
def apply_contextual_spacing(self, paras):
|
||||
last_para = None
|
||||
for p in paras:
|
||||
if last_para is not None:
|
||||
ls = self.resolve_paragraph(last_para)
|
||||
ps = self.resolve_paragraph(p)
|
||||
if ls.linked_style is not None and ls.linked_style == ps.linked_style:
|
||||
if ls.contextualSpacing is True:
|
||||
ls.margin_bottom = 0
|
||||
if ps.contextualSpacing is True:
|
||||
ps.margin_top = 0
|
||||
last_para = p
|
||||
|
||||
def apply_section_page_breaks(self, paras):
|
||||
for p in paras:
|
||||
ps = self.resolve_paragraph(p)
|
||||
ps.pageBreakBefore = True
|
||||
|
||||
def register(self, css, prefix):
|
||||
h = hash(frozenset(iteritems(css)))
|
||||
ans, _ = self.classes.get(h, (None, None))
|
||||
if ans is None:
|
||||
self.counter[prefix] += 1
|
||||
ans = '%s_%d' % (prefix, self.counter[prefix])
|
||||
self.classes[h] = (ans, css)
|
||||
return ans
|
||||
|
||||
def generate_classes(self):
|
||||
for bs in itervalues(self.para_cache):
|
||||
css = bs.css
|
||||
if css:
|
||||
self.register(css, 'block')
|
||||
for bs in itervalues(self.run_cache):
|
||||
css = bs.css
|
||||
if css:
|
||||
self.register(css, 'text')
|
||||
|
||||
def class_name(self, css):
|
||||
h = hash(frozenset(iteritems(css)))
|
||||
return self.classes.get(h, (None, None))[0]
|
||||
|
||||
    def generate_css(self, dest_dir, docx, notes_nopb, nosupsub):
        """Build the complete docx.css stylesheet text.

        Concatenates @font-face rules for embedded fonts, a dedented block
        of document-wide defaults (formatted with the resolved body font
        family/size/color), and one rule per registered class.
        """
        # Extract and embed fonts; ef is the @font-face CSS (may be empty)
        ef = self.fonts.embed_fonts(dest_dir, docx)

        s = '''\
            body { font-family: %s; font-size: %s; color: %s }

            /* In word all paragraphs have zero margins unless explicitly specified in a style */
            p, h1, h2, h3, h4, h5, h6, div { margin: 0; padding: 0 }
            /* In word headings only have bold font if explicitly specified,
            similarly the font size is the body font size, unless explicitly set. */
            h1, h2, h3, h4, h5, h6 { font-weight: normal; font-size: 1rem }
            /* Setting padding-left to zero breaks rendering of lists, so we only set the other values to zero and leave padding-left for the user-agent */
            ul, ol { margin: 0; padding-top: 0; padding-bottom: 0; padding-right: 0 }

            /* The word hyperlink styling will set text-decoration to underline if needed */
            a { text-decoration: none }

            sup.noteref a { text-decoration: none }

            h1.notes-header { page-break-before: always }

            dl.footnote dt { font-size: large }

            dl.footnote dt a { text-decoration: none }

            '''

        # Page-break each footnote block unless the user disabled it
        if not notes_nopb:
            s += '''\
            dl.footnote { page-break-after: always }
            dl.footnote:last-of-type { page-break-after: avoid }
            '''

        s = s + '''\
            span.tab { white-space: pre }

            p.index-entry { text-indent: 0pt; }
            p.index-entry a:visited { color: blue }
            p.index-entry a:hover { color: red }
            '''

        # Optionally neutralize sup/sub vertical shifts
        if nosupsub:
            s = s + '''\
            sup { vertical-align: top }
            sub { vertical-align: bottom }
            '''

        prefix = textwrap.dedent(s) % (self.body_font_family, self.body_font_size, self.body_color)
        if ef:
            prefix = ef + '\n' + prefix

        ans = []
        # One rule per registered class, sorted by class name for stability
        for (cls, css) in sorted(itervalues(self.classes), key=lambda x:x[0]):
            b = ('\t%s: %s;' % (k, v) for k, v in iteritems(css))
            b = '\n'.join(b)
            ans.append('.%s {\n%s\n}\n' % (cls, b.rstrip(';')))
        return prefix + '\n' + '\n'.join(ans)
700
ebook_converter/ebooks/docx/tables.py
Normal file
700
ebook_converter/ebooks/docx/tables.py
Normal file
@@ -0,0 +1,700 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=utf-8
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
from lxml.html.builder import TABLE, TR, TD
|
||||
|
||||
from calibre.ebooks.docx.block_styles import inherit, read_shd as rs, read_border, binary_property, border_props, ParagraphStyle, border_to_css
|
||||
from calibre.ebooks.docx.char_styles import RunStyle
|
||||
from polyglot.builtins import filter, iteritems, itervalues, range, unicode_type
|
||||
|
||||
# Read from XML {{{
|
||||
read_shd = rs
|
||||
edges = ('left', 'top', 'right', 'bottom')
|
||||
|
||||
|
||||
def _read_width(elem, get):
    """Convert an OOXML width element (w:w + w:type attributes) into a CSS
    width value string, or the inherit marker for unknown types."""
    try:
        raw = int(get(elem, 'w:w'))
    except (TypeError, ValueError):
        raw = 0
    width_type = get(elem, 'w:type', 'auto')
    if width_type == 'nil':
        return '0'
    if width_type == 'auto':
        return 'auto'
    if width_type == 'dxa':
        # dxa widths are in twentieths of a point
        return '%.3gpt' % (raw/20)
    if width_type == 'pct':
        # pct widths are in fiftieths of a percent
        return '%.3g%%' % (raw/50)
    return inherit
|
||||
|
||||
|
||||
def read_width(parent, dest, XPath, get):
    """Store the table width (w:tblW) on dest.width."""
    width = inherit
    for tblW in XPath('./w:tblW')(parent):
        width = _read_width(tblW, get)
    dest.width = width
|
||||
|
||||
|
||||
def read_cell_width(parent, dest, XPath, get):
    """Store the cell width (w:tcW) on dest.width."""
    width = inherit
    for tcW in XPath('./w:tcW')(parent):
        width = _read_width(tcW, get)
    dest.width = width
|
||||
|
||||
|
||||
def read_padding(parent, dest, XPath, get):
    """Read cell margins onto dest as cell_padding_<edge> attributes.

    Table-level properties use w:tblCellMar, cell-level use w:tcMar.
    """
    container = 'tblCellMar' if parent.tag.endswith('}tblPr') else 'tcMar'
    padding = dict.fromkeys(edges, inherit)
    for mar in XPath('./w:%s' % container)(parent):
        for side in edges:
            for edge in XPath('./w:%s' % side)(mar):
                padding[side] = _read_width(edge, get)
    for side in edges:
        setattr(dest, 'cell_padding_%s' % side, padding[side])
|
||||
|
||||
|
||||
def read_justification(parent, dest, XPath, get):
    """Map table justification (w:jc) onto CSS auto margins on dest."""
    margin_left = margin_right = inherit
    for jc in XPath('./w:jc[@w:val]')(parent):
        val = get(jc, 'w:val')
        if not val:
            continue
        if val == 'center':
            margin_left = margin_right = 'auto'
        elif val == 'left':
            margin_right = 'auto'
        elif val == 'right':
            margin_left = 'auto'
    dest.margin_left = margin_left
    dest.margin_right = margin_right
|
||||
|
||||
|
||||
def read_spacing(parent, dest, XPath, get):
    """Read inter-cell spacing (w:tblCellSpacing) into dest.spacing."""
    spacing = inherit
    for cs in XPath('./w:tblCellSpacing')(parent):
        spacing = _read_width(cs, get)
    dest.spacing = spacing
|
||||
|
||||
|
||||
def read_float(parent, dest, XPath, get):
    """Collect the floating-table positioning attributes (w:tblpPr) into a
    dict on dest.float, keyed by the attribute local name."""
    pos = inherit
    for tblpPr in XPath('./w:tblpPr')(parent):
        # Strip the namespace prefix from every attribute name
        pos = {name.rpartition('}')[-1]: value for name, value in iteritems(tblpPr.attrib)}
    dest.float = pos
|
||||
|
||||
|
||||
def read_indent(parent, dest, XPath, get):
    """Read the table indent (w:tblInd) into dest.indent."""
    indent = inherit
    for tblInd in XPath('./w:tblInd')(parent):
        indent = _read_width(tblInd, get)
    dest.indent = indent
|
||||
|
||||
|
||||
border_edges = ('left', 'top', 'right', 'bottom', 'insideH', 'insideV')
|
||||
|
||||
|
||||
def read_borders(parent, dest, XPath, get):
    """Read table or cell borders onto dest.

    Table-level properties use w:tblBorders, cell-level use w:tcBorders.
    """
    container = 'tblBorders' if parent.tag.endswith('}tblPr') else 'tcBorders'
    read_border(parent, dest, XPath, get, border_edges, container)
|
||||
|
||||
|
||||
def read_height(parent, dest, XPath, get):
    """Read the row height rule and value (w:trHeight) into dest.height
    as a (rule, raw-value) tuple."""
    height = inherit
    for trHeight in XPath('./w:trHeight')(parent):
        rule = get(trHeight, 'w:hRule', 'auto')
        if rule in ('auto', 'atLeast', 'exact'):
            height = (rule, get(trHeight, 'w:val'))
    dest.height = height
|
||||
|
||||
|
||||
def read_vertical_align(parent, dest, XPath, get):
    """Map w:vAlign onto a CSS vertical-align keyword; unrecognized values
    fall back to 'middle'."""
    align = inherit
    css_values = {'center': 'middle', 'top': 'top', 'bottom': 'bottom'}
    for vAlign in XPath('./w:vAlign')(parent):
        align = css_values.get(get(vAlign, 'w:val'), 'middle')
    dest.vertical_align = align
|
||||
|
||||
|
||||
def read_col_span(parent, dest, XPath, get):
    """Read the number of grid columns spanned (w:gridSpan) into
    dest.col_span, ignoring non-integer values."""
    span = inherit
    for gridSpan in XPath('./w:gridSpan')(parent):
        try:
            span = int(get(gridSpan, 'w:val'))
        except (TypeError, ValueError):
            continue
    dest.col_span = span
|
||||
|
||||
|
||||
def read_merge(parent, dest, XPath, get):
    """Read horizontal/vertical merge markers (w:hMerge, w:vMerge) onto
    dest; a merge element with no w:val means 'continue'."""
    for attr in ('hMerge', 'vMerge'):
        val = inherit
        for merge in XPath('./w:%s' % attr)(parent):
            val = get(merge, 'w:val', 'continue')
        setattr(dest, attr, val)
|
||||
|
||||
|
||||
def read_band_size(parent, dest, XPath, get):
    """Read the column/row banding sizes (w:tblStyleColBandSize,
    w:tblStyleRowBandSize) onto dest, defaulting to 1."""
    for kind in ('Col', 'Row'):
        size = 1
        for elem in XPath('./w:tblStyle%sBandSize' % kind)(parent):
            try:
                size = int(get(elem, 'w:val'))
            except (TypeError, ValueError):
                continue
        setattr(dest, '%s_band_size' % kind.lower(), size)
|
||||
|
||||
|
||||
def read_look(parent, dest, XPath, get):
    """Read the w:tblLook bitmask (hex string) that controls which
    conditional-format overrides apply, defaulting to 0."""
    look = 0
    for elem in XPath('./w:tblLook')(parent):
        try:
            look = int(get(elem, 'w:val'), 16)
        except (ValueError, TypeError):
            continue
    dest.look = look
|
||||
|
||||
# }}}
|
||||
|
||||
|
||||
def clone(style):
    """Return an independent copy of *style*, or None when *style* is None
    or its class cannot be re-instantiated from just a namespace."""
    if style is None:
        return None
    try:
        copied = type(style)(style.namespace)
    except TypeError:
        return None
    copied.update(style)
    return copied
|
||||
|
||||
|
||||
class Style(object):
    """Base class for table, row and cell styles.

    Provides property merging (update), bidi marking and the shared
    spacing/border to CSS conversions.
    """

    # True when this style belongs to a right-to-left table
    is_bidi = False

    def update(self, other):
        """Copy every property of *other* that is not inherit onto self."""
        for prop in self.all_properties:
            val = getattr(other, prop)
            if val is not inherit:
                setattr(self, prop, val)

    def apply_bidi(self):
        """Mark this style as right-to-left."""
        self.is_bidi = True

    def convert_spacing(self):
        """Translate cell spacing into CSS border-collapse/border-spacing."""
        css = {}
        if self.spacing is not inherit:
            if self.spacing in ('auto', '0'):
                css['border-collapse'] = 'collapse'
            else:
                css['border-collapse'] = 'separate'
                css['border-spacing'] = self.spacing
        return css

    def convert_border(self):
        """Translate border and padding properties into CSS, mirroring the
        left/right values when the table is bidi."""
        css = {}
        for edge in edges:
            border_to_css(edge, self, css)
            pad = getattr(self, 'padding_%s' % edge)
            if pad is not inherit:
                css['padding-%s' % edge] = '%.3gpt' % pad
        if self.is_bidi:
            # Swap horizontal padding/border values for RTL rendering
            for tmpl in ('padding-%s', 'border-%s-style', 'border-%s-color', 'border-%s-width'):
                left_val = css.get(tmpl % 'left')
                right_val = css.get(tmpl % 'right')
                if left_val is not None:
                    css[tmpl % 'right'] = left_val
                if right_val is not None:
                    css[tmpl % 'left'] = right_val
        return css
|
||||
|
||||
|
||||
class RowStyle(Style):
    """Style for a single table row, read from w:trPr."""

    all_properties = ('height', 'cantSplit', 'hidden', 'spacing',)

    def __init__(self, namespace, trPr=None):
        self.namespace = namespace
        if trPr is None:
            # No row properties element: everything inherits
            for p in self.all_properties:
                setattr(self, p, inherit)
        else:
            for p in ('hidden', 'cantSplit'):
                setattr(self, p, binary_property(trPr, p, namespace.XPath, namespace.get))
            for p in ('spacing', 'height'):
                # Dispatch to the module-level read_spacing/read_height helpers
                f = globals()['read_%s' % p]
                f(trPr, self, namespace.XPath, namespace.get)
        self._css = None  # lazily computed CSS cache

    @property
    def css(self):
        """CSS property dict for this row (computed once, then cached)."""
        if self._css is None:
            c = self._css = {}
            if self.hidden is True:
                c['display'] = 'none'
            if self.cantSplit is True:
                c['page-break-inside'] = 'avoid'
            if self.height is not inherit:
                rule, val = self.height
                if rule != 'auto':
                    try:
                        # Row heights are stored in twentieths of a point
                        c['min-height' if rule == 'atLeast' else 'height'] = '%.3gpt' % (int(val)/20)
                    except (ValueError, TypeError):
                        pass
            c.update(self.convert_spacing())
        return self._css
|
||||
|
||||
|
||||
class CellStyle(Style):
    """Style for a single table cell, read from w:tcPr."""

    all_properties = ('background_color', 'cell_padding_left', 'cell_padding_right', 'cell_padding_top',
        'cell_padding_bottom', 'width', 'vertical_align', 'col_span', 'vMerge', 'hMerge', 'row_span',
    ) + tuple(k % edge for edge in border_edges for k in border_props)

    def __init__(self, namespace, tcPr=None):
        self.namespace = namespace
        if tcPr is None:
            # No cell properties element: everything inherits
            for p in self.all_properties:
                setattr(self, p, inherit)
        else:
            for x in ('borders', 'shd', 'padding', 'cell_width', 'vertical_align', 'col_span', 'merge'):
                # Dispatch to the module-level read_* helpers
                f = globals()['read_%s' % x]
                f(tcPr, self, namespace.XPath, namespace.get)
            # row_span is only filled in later, when vertical merges are resolved
            self.row_span = inherit
        self._css = None  # lazily computed CSS cache

    @property
    def css(self):
        """CSS property dict for this cell (computed once, then cached)."""
        if self._css is None:
            self._css = c = {}
            if self.background_color is not inherit:
                c['background-color'] = self.background_color
            if self.width not in (inherit, 'auto'):
                c['width'] = self.width
            c['vertical-align'] = 'top' if self.vertical_align is inherit else self.vertical_align
            for x in edges:
                val = getattr(self, 'cell_padding_%s' % x)
                if val not in (inherit, 'auto'):
                    c['padding-%s' % x] = val
                elif val is inherit and x in {'left', 'right'}:
                    # Default horizontal cell margin (115 twips)
                    c['padding-%s' % x] = '%.3gpt' % (115/20)
            # In Word, tables are apparently rendered with some default top and
            # bottom padding irrespective of the cellMargin values. Simulate
            # that here.
            for x in ('top', 'bottom'):
                if c.get('padding-%s' % x, '0pt') == '0pt':
                    c['padding-%s' % x] = '0.5ex'
            c.update(self.convert_border())

        return self._css
|
||||
|
||||
|
||||
class TableStyle(Style):
    """Style for a whole table, read from w:tblPr.

    When the tblPr lives inside a w:style element, also collects the
    conditional formatting overrides (w:tblStylePr) keyed by their type.
    """

    all_properties = (
        'width', 'float', 'cell_padding_left', 'cell_padding_right', 'cell_padding_top',
        'cell_padding_bottom', 'margin_left', 'margin_right', 'background_color',
        'spacing', 'indent', 'overrides', 'col_band_size', 'row_band_size', 'look', 'bidi',
    ) + tuple(k % edge for edge in border_edges for k in border_props)

    def __init__(self, namespace, tblPr=None):
        self.namespace = namespace
        if tblPr is None:
            # No table properties element: everything inherits
            for p in self.all_properties:
                setattr(self, p, inherit)
        else:
            self.overrides = inherit
            self.bidi = binary_property(tblPr, 'bidiVisual', namespace.XPath, namespace.get)
            for x in ('width', 'float', 'padding', 'shd', 'justification', 'spacing', 'indent', 'borders', 'band_size', 'look'):
                # Dispatch to the module-level read_* helpers
                f = globals()['read_%s' % x]
                f(tblPr, self, self.namespace.XPath, self.namespace.get)
            parent = tblPr.getparent()
            if self.namespace.is_tag(parent, 'w:style'):
                # This tblPr belongs to a named style: gather its
                # conditional-format overrides (firstRow, band1Horz, ...)
                self.overrides = {}
                for tblStylePr in self.namespace.XPath('./w:tblStylePr[@w:type]')(parent):
                    otype = self.namespace.get(tblStylePr, 'w:type')
                    orides = self.overrides[otype] = {}
                    for tblPr in self.namespace.XPath('./w:tblPr')(tblStylePr):
                        orides['table'] = TableStyle(self.namespace, tblPr)
                    for trPr in self.namespace.XPath('./w:trPr')(tblStylePr):
                        orides['row'] = RowStyle(self.namespace, trPr)
                    for tcPr in self.namespace.XPath('./w:tcPr')(tblStylePr):
                        orides['cell'] = CellStyle(self.namespace, tcPr)
                    for pPr in self.namespace.XPath('./w:pPr')(tblStylePr):
                        orides['para'] = ParagraphStyle(self.namespace, pPr)
                    for rPr in self.namespace.XPath('./w:rPr')(tblStylePr):
                        orides['run'] = RunStyle(self.namespace, rPr)
        self._css = None  # lazily computed CSS cache

    def resolve_based_on(self, parent):
        """Fill every still-inherited property from the parent style."""
        for p in self.all_properties:
            val = getattr(self, p)
            if val is inherit:
                setattr(self, p, getattr(parent, p))

    @property
    def css(self):
        """CSS property dict for this table (computed once, then cached).
        NOTE: self.page must be set before first access when the table
        floats without an explicit tblpXSpec."""
        if self._css is None:
            c = self._css = {}
            if self.width not in (inherit, 'auto'):
                c['width'] = self.width
            for x in ('background_color', 'margin_left', 'margin_right'):
                val = getattr(self, x)
                if val is not inherit:
                    c[x.replace('_', '-')] = val
            if self.indent not in (inherit, 'auto') and self.margin_left != 'auto':
                c['margin-left'] = self.indent
            if self.float is not inherit:
                # Floating table: distances from surrounding text become margins
                for x in ('left', 'top', 'right', 'bottom'):
                    val = self.float.get('%sFromText' % x, 0)
                    try:
                        val = '%.3gpt' % (int(val) / 20)
                    except (ValueError, TypeError):
                        val = '0'
                    c['margin-%s' % x] = val
                if 'tblpXSpec' in self.float:
                    c['float'] = 'right' if self.float['tblpXSpec'] in {'right', 'outside'} else 'left'
                else:
                    # No explicit anchor: decide by horizontal position on the page
                    page = self.page
                    page_width = page.width - page.margin_left - page.margin_right
                    try:
                        x = int(self.float['tblpX']) / 20
                    except (KeyError, ValueError, TypeError):
                        x = 0
                    c['float'] = 'left' if (x/page_width) < 0.65 else 'right'
            c.update(self.convert_spacing())
            if 'border-collapse' not in c:
                c['border-collapse'] = 'collapse'
            c.update(self.convert_border())

        return self._css
|
||||
|
||||
|
||||
class Table(object):
    """A single DOCX table: resolves table/row/cell/paragraph styles
    (including conditional-format overrides and merged cells) and can emit
    the corresponding HTML <table> markup."""

    def __init__(self, namespace, tbl, styles, para_map, is_sub_table=False):
        self.namespace = namespace
        self.tbl = tbl
        self.styles = styles
        self.is_sub_table = is_sub_table

        # Read Table Style
        style = {'table':TableStyle(self.namespace)}
        for tblPr in self.namespace.XPath('./w:tblPr')(tbl):
            # Merge in the named table style referenced via w:tblStyle
            for ts in self.namespace.XPath('./w:tblStyle[@w:val]')(tblPr):
                style_id = self.namespace.get(ts, 'w:val')
                s = styles.get(style_id)
                if s is not None:
                    if s.table_style is not None:
                        style['table'].update(s.table_style)
                    if s.paragraph_style is not None:
                        if 'paragraph' in style:
                            style['paragraph'].update(s.paragraph_style)
                        else:
                            style['paragraph'] = s.paragraph_style
                    if s.character_style is not None:
                        if 'run' in style:
                            style['run'].update(s.character_style)
                        else:
                            style['run'] = s.character_style
            # Direct formatting on the table wins over the named style
            style['table'].update(TableStyle(self.namespace, tblPr))
        self.table_style, self.paragraph_style = style['table'], style.get('paragraph', None)
        self.run_style = style.get('run', None)
        self.overrides = self.table_style.overrides
        if self.overrides is inherit:
            self.overrides = {}
        if 'wholeTable' in self.overrides and 'table' in self.overrides['wholeTable']:
            self.table_style.update(self.overrides['wholeTable']['table'])

        self.style_map = {}   # lxml element -> resolved style(s)
        self.paragraphs = []  # paragraphs contained in this table, in order
        self.cell_map = []    # rows of cell elements

        rows = self.namespace.XPath('./w:tr')(tbl)
        for r, tr in enumerate(rows):
            overrides = self.get_overrides(r, None, len(rows), None)
            self.resolve_row_style(tr, overrides)
            cells = self.namespace.XPath('./w:tc')(tr)
            self.cell_map.append([])
            for c, tc in enumerate(cells):
                overrides = self.get_overrides(r, c, len(rows), len(cells))
                self.resolve_cell_style(tc, overrides, r, c, len(rows), len(cells))
                self.cell_map[-1].append(tc)
                for p in self.namespace.XPath('./w:p')(tc):
                    para_map[p] = self
                    self.paragraphs.append(p)
                    self.resolve_para_style(p, overrides)

        self.handle_merged_cells()
        # Tables nested inside cells are handled as sub-tables
        self.sub_tables = {x:Table(namespace, x, styles, para_map, is_sub_table=True) for x in self.namespace.XPath('./w:tr/w:tc/w:tbl')(tbl)}

    @property
    def bidi(self):
        # True when the table is laid out right-to-left
        return self.table_style.bidi is True

    def override_allowed(self, name):
        'Check if the named override is allowed by the tblLook element'
        if name.endswith('Cell') or name == 'wholeTable':
            return True
        look = self.table_style.look
        if (look & 0x0020 and name == 'firstRow') or (look & 0x0040 and name == 'lastRow') or \
                (look & 0x0080 and name == 'firstCol') or (look & 0x0100 and name == 'lastCol'):
            return True
        if name.startswith('band'):
            # Banding bits are *disable* flags
            if name.endswith('Horz'):
                return not bool(look & 0x0200)
            if name.endswith('Vert'):
                return not bool(look & 0x0400)
        return False

    def get_overrides(self, r, c, num_of_rows, num_of_cols_in_row):
        'List of possible overrides for the given para'
        overrides = ['wholeTable']

        def divisor(m, n):
            # Integer division used to compute the band index
            return (m - (m % n)) // n
        if c is not None:
            odd_column_band = (divisor(c, self.table_style.col_band_size) % 2) == 1
            overrides.append('band%dVert' % (1 if odd_column_band else 2))
        odd_row_band = (divisor(r, self.table_style.row_band_size) % 2) == 1
        overrides.append('band%dHorz' % (1 if odd_row_band else 2))

        # According to the OOXML spec columns should have higher override
        # priority than rows, but Word seems to do it the other way around.
        if c is not None:
            if c == 0:
                overrides.append('firstCol')
            if c >= num_of_cols_in_row - 1:
                overrides.append('lastCol')
        if r == 0:
            overrides.append('firstRow')
        if r >= num_of_rows - 1:
            overrides.append('lastRow')
        if c is not None:
            # Corner cells get their own override names
            if r == 0:
                if c == 0:
                    overrides.append('nwCell')
                if c == num_of_cols_in_row - 1:
                    overrides.append('neCell')
            if r == num_of_rows - 1:
                if c == 0:
                    overrides.append('swCell')
                if c == num_of_cols_in_row - 1:
                    overrides.append('seCell')
        return tuple(filter(self.override_allowed, overrides))

    def resolve_row_style(self, tr, overrides):
        """Compute and cache the effective RowStyle for *tr*."""
        rs = RowStyle(self.namespace)
        for o in overrides:
            if o in self.overrides:
                ovr = self.overrides[o]
                ors = ovr.get('row', None)
                if ors is not None:
                    rs.update(ors)

        # Direct row formatting wins over any override
        for trPr in self.namespace.XPath('./w:trPr')(tr):
            rs.update(RowStyle(self.namespace, trPr))
        if self.bidi:
            rs.apply_bidi()
        self.style_map[tr] = rs

    def resolve_cell_style(self, tc, overrides, row, col, rows, cols_in_row):
        """Compute and cache the effective CellStyle for *tc*, filling in
        padding/borders from the table style and insideH/insideV edges."""
        cs = CellStyle(self.namespace)
        for o in overrides:
            if o in self.overrides:
                ovr = self.overrides[o]
                ors = ovr.get('cell', None)
                if ors is not None:
                    cs.update(ors)

        # Direct cell formatting wins over any override
        for tcPr in self.namespace.XPath('./w:tcPr')(tc):
            cs.update(CellStyle(self.namespace, tcPr))

        for x in edges:
            p = 'cell_padding_%s' % x
            val = getattr(cs, p)
            if val is inherit:
                setattr(cs, p, getattr(self.table_style, p))

            is_inside_edge = (
                (x == 'left' and col > 0) or
                (x == 'top' and row > 0) or
                (x == 'right' and col < cols_in_row - 1) or
                (x == 'bottom' and row < rows -1)
            )
            inside_edge = ('insideH' if x in {'top', 'bottom'} else 'insideV') if is_inside_edge else None
            for prop in border_props:
                if not prop.startswith('border'):
                    continue
                eprop = prop % x
                iprop = (prop % inside_edge) if inside_edge else None
                val = getattr(cs, eprop)
                if val is inherit and iprop is not None:
                    # Use the insideX borders if the main cell borders are not
                    # specified
                    val = getattr(cs, iprop)
                    if val is inherit:
                        val = getattr(self.table_style, iprop)
                if not is_inside_edge and val == 'none':
                    # Cell borders must override table borders even when the
                    # table border is not null and the cell border is null.
                    val = 'hidden'
                setattr(cs, eprop, val)

        if self.bidi:
            cs.apply_bidi()
        self.style_map[tc] = cs

    def resolve_para_style(self, p, overrides):
        """Cache the (paragraph, run) style pair applying to paragraph *p*."""
        text_styles = [clone(self.paragraph_style), clone(self.run_style)]

        for o in overrides:
            if o in self.overrides:
                ovr = self.overrides[o]
                for i, name in enumerate(('para', 'run')):
                    ops = ovr.get(name, None)
                    if ops is not None:
                        if text_styles[i] is None:
                            text_styles[i] = ops
                        else:
                            text_styles[i].update(ops)
        self.style_map[p] = text_styles

    def handle_merged_cells(self):
        """Resolve vMerge/hMerge: set row_span/col_span on the first cell of
        each merged run and remove the continuation cells from the tree."""
        if not self.cell_map:
            return
        # Handle vMerge
        max_col_num = max(len(r) for r in self.cell_map)
        for c in range(max_col_num):
            # Walk one column; short rows contribute None placeholders
            cells = [row[c] if c < len(row) else None for row in self.cell_map]
            runs = [[]]
            for cell in cells:
                try:
                    s = self.style_map[cell]
                except KeyError:  # cell is None
                    s = CellStyle(self.namespace)
                if s.vMerge == 'restart':
                    runs.append([cell])
                elif s.vMerge == 'continue':
                    runs[-1].append(cell)
                else:
                    runs.append([])
            for run in runs:
                if len(run) > 1:
                    self.style_map[run[0]].row_span = len(run)
                    for tc in run[1:]:
                        tc.getparent().remove(tc)

        # Handle hMerge
        for cells in self.cell_map:
            runs = [[]]
            for cell in cells:
                try:
                    s = self.style_map[cell]
                except KeyError:  # cell is None
                    s = CellStyle(self.namespace)
                if s.col_span is not inherit:
                    # An explicit gridSpan breaks any hMerge run
                    runs.append([])
                    continue
                if s.hMerge == 'restart':
                    runs.append([cell])
                elif s.hMerge == 'continue':
                    runs[-1].append(cell)
                else:
                    runs.append([])

            for run in runs:
                if len(run) > 1:
                    self.style_map[run[0]].col_span = len(run)
                    for tc in run[1:]:
                        tc.getparent().remove(tc)

    def __iter__(self):
        # Yield every paragraph in this table and all nested sub-tables
        for p in self.paragraphs:
            yield p
        for t in itervalues(self.sub_tables):
            for p in t:
                yield p

    def apply_markup(self, rmap, page, parent=None):
        """Build the HTML <table> for this table, inserting it before the
        first converted paragraph (or appending to *parent* for nested
        tables), then register CSS classes for table/rows/cells."""
        table = TABLE('\n\t\t')
        if self.bidi:
            table.set('dir', 'rtl')
        self.table_style.page = page
        style_map = {}
        if parent is None:
            # Top-level table: replace position of its first paragraph
            try:
                first_para = rmap[next(iter(self))]
            except StopIteration:
                return
            parent = first_para.getparent()
            idx = parent.index(first_para)
            parent.insert(idx, table)
        else:
            parent.append(table)
        for row in self.namespace.XPath('./w:tr')(self.tbl):
            tr = TR('\n\t\t\t')
            style_map[tr] = self.style_map[row]
            tr.tail = '\n\t\t'
            table.append(tr)
            for tc in self.namespace.XPath('./w:tc')(row):
                td = TD()
                style_map[td] = s = self.style_map[tc]
                if s.col_span is not inherit:
                    td.set('colspan', unicode_type(s.col_span))
                if s.row_span is not inherit:
                    td.set('rowspan', unicode_type(s.row_span))
                td.tail = '\n\t\t\t'
                tr.append(td)
                for x in self.namespace.XPath('./w:p|./w:tbl')(tc):
                    if x.tag.endswith('}p'):
                        td.append(rmap[x])
                    else:
                        self.sub_tables[x].apply_markup(rmap, page, parent=td)
            if len(tr):
                tr[-1].tail = '\n\t\t'
        if len(table):
            table[-1].tail = '\n\t'

        table_style = self.table_style.css
        if table_style:
            table.set('class', self.styles.register(table_style, 'table'))
        for elem, style in iteritems(style_map):
            css = style.css
            if css:
                elem.set('class', self.styles.register(css, elem.tag))
|
||||
|
||||
|
||||
class Tables(object):
    """Registry of all top-level tables in the document, mapping contained
    paragraphs back to their owning Table."""

    def __init__(self, namespace):
        self.namespace = namespace
        self.tables = []       # top-level Table objects, in document order
        self.para_map = {}     # paragraph element -> owning Table
        self.sub_tables = set()  # nested tables, owned by their parent Table

    def register(self, tbl, styles):
        """Create a Table for *tbl* unless it is nested inside another table."""
        if tbl in self.sub_tables:
            return
        table = Table(self.namespace, tbl, styles, self.para_map)
        self.tables.append(table)
        self.sub_tables |= set(table.sub_tables)

    def apply_markup(self, object_map, page_map):
        """Generate HTML markup for every registered table."""
        rmap = {val: key for key, val in iteritems(object_map)}
        for table in self.tables:
            table.apply_markup(rmap, page_map[table.tbl])

    def para_style(self, p):
        """Paragraph style for *p* when it lives inside a table, else None."""
        owner = self.para_map.get(p, None)
        if owner is not None:
            return owner.style_map.get(p, (None, None))[0]

    def run_style(self, p):
        """Run style for *p* when it lives inside a table, else None."""
        owner = self.para_map.get(p, None)
        if owner is not None:
            return owner.style_map.get(p, (None, None))[1]
|
||||
29
ebook_converter/ebooks/docx/theme.py
Normal file
29
ebook_converter/ebooks/docx/theme.py
Normal file
@@ -0,0 +1,29 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=utf-8
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
|
||||
class Theme(object):
    """Font scheme from the DOCX theme part: supplies the major (heading)
    and minor (body) latin font families used to resolve theme font
    references like '|majorHAnsi|'."""

    def __init__(self, namespace):
        self.namespace = namespace
        # Word's defaults, used when the theme specifies nothing
        self.major_latin_font = 'Cambria'
        self.minor_latin_font = 'Calibri'

    def __call__(self, root):
        """Parse the a:fontScheme element(s) found under *root*."""
        for scheme in self.namespace.XPath('//a:fontScheme')(root):
            for major in self.namespace.XPath('./a:majorFont')(scheme):
                for latin in self.namespace.XPath('./a:latin[@typeface]')(major):
                    self.major_latin_font = latin.get('typeface')
            for minor in self.namespace.XPath('./a:minorFont')(scheme):
                for latin in self.namespace.XPath('./a:latin[@typeface]')(minor):
                    self.minor_latin_font = latin.get('typeface')

    def resolve_font_family(self, ff):
        """Map a theme font reference (e.g. '|majorHAnsi|') to an actual
        font family; non-references are returned unchanged."""
        if ff.startswith('|'):
            ff = ff[1:-1]
            ff = self.major_latin_font if ff.startswith('major') else self.minor_latin_font
        return ff
|
||||
839
ebook_converter/ebooks/docx/to_html.py
Normal file
839
ebook_converter/ebooks/docx/to_html.py
Normal file
@@ -0,0 +1,839 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=utf-8
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
import sys, os, re, math, errno, uuid, numbers
|
||||
from collections import OrderedDict, defaultdict
|
||||
|
||||
from lxml import html
|
||||
from lxml.html.builder import (
|
||||
HTML, HEAD, TITLE, BODY, LINK, META, P, SPAN, BR, DIV, A, DT, DL, DD, H1)
|
||||
|
||||
from calibre import guess_type
|
||||
from calibre.ebooks.docx.container import DOCX, fromstring
|
||||
from calibre.ebooks.docx.names import XML, generate_anchor
|
||||
from calibre.ebooks.docx.styles import Styles, inherit, PageProperties
|
||||
from calibre.ebooks.docx.numbering import Numbering
|
||||
from calibre.ebooks.docx.fonts import Fonts, is_symbol_font, map_symbol_text
|
||||
from calibre.ebooks.docx.images import Images
|
||||
from calibre.ebooks.docx.tables import Tables
|
||||
from calibre.ebooks.docx.footnotes import Footnotes
|
||||
from calibre.ebooks.docx.cleanup import cleanup_markup
|
||||
from calibre.ebooks.docx.theme import Theme
|
||||
from calibre.ebooks.docx.toc import create_toc
|
||||
from calibre.ebooks.docx.fields import Fields
|
||||
from calibre.ebooks.docx.settings import Settings
|
||||
from calibre.ebooks.metadata.opf2 import OPFCreator
|
||||
from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1
|
||||
from polyglot.builtins import iteritems, itervalues, filter, getcwd, map, unicode_type
|
||||
|
||||
|
||||
NBSP = '\xa0'
|
||||
|
||||
|
||||
class Text:
    """Accumulates text fragments destined for a single lxml text/tail
    slot, flushing the buffered text whenever a new element is added."""

    def __init__(self, elem, attr, buf):
        self.elem, self.attr, self.buf = elem, attr, buf
        self.elems = [elem]

    def add_elem(self, elem):
        """Flush the buffer onto the current slot and start accumulating
        into the tail of *elem*."""
        self.elems.append(elem)
        setattr(self.elem, self.attr, ''.join(self.buf))
        self.elem = elem
        self.attr = 'tail'
        self.buf = []

    def __iter__(self):
        return iter(self.elems)
|
||||
|
||||
|
||||
def html_lang(docx_lang):
    """Convert a DOCX language code into a value suitable for the HTML
    lang attribute, or None when no usable language is found."""
    lang = canonicalize_lang(docx_lang)
    if not lang or lang == 'und':
        return None
    iso = lang_as_iso639_1(lang)
    return iso if iso else None
|
||||
|
||||
|
||||
class Convert(object):
|
||||
|
||||
    def __init__(self, path_or_stream, dest_dir=None, log=None, detect_cover=True, notes_text=None, notes_nopb=False, nosupsub=False):
        """Prepare a DOCX→HTML conversion.

        :param path_or_stream: path to, or open stream of, the .docx file
        :param dest_dir: output directory (defaults to the current directory)
        :param log: logger; falls back to the one created by DOCX
        :param detect_cover: attempt to detect a cover image during cleanup
        :param notes_text: heading text for the foot/endnotes section
        :param notes_nopb: suppress page break before the notes section
        :param nosupsub: disable superscript/subscript rendering
        """
        self.docx = DOCX(path_or_stream, log=log)
        self.namespace = self.docx.namespace
        # Patterns used by convert_run() to decide if whitespace needs
        # explicit preservation in the output
        self.ms_pat = re.compile(r'\s{2,}')
        self.ws_pat = re.compile(r'[\n\r\t]')
        self.log = self.docx.log
        self.detect_cover = detect_cover
        self.notes_text = notes_text or _('Notes')
        self.notes_nopb = notes_nopb
        self.nosupsub = nosupsub
        self.dest_dir = dest_dir or getcwd()
        self.mi = self.docx.metadata
        self.body = BODY()
        # Helper objects, one per DOCX part/concern
        self.theme = Theme(self.namespace)
        self.settings = Settings(self.namespace)
        self.tables = Tables(self.namespace)
        self.fields = Fields(self.namespace)
        self.styles = Styles(self.namespace, self.tables)
        self.images = Images(self.namespace, self.log)
        # Maps generated HTML elements back to their source DOCX elements
        self.object_map = OrderedDict()
        self.html = HTML(
            HEAD(
                META(charset='utf-8'),
                TITLE(self.mi.title or _('Unknown')),
                LINK(rel='stylesheet', type='text/css', href='docx.css'),
            ),
            self.body
        )
        # Pretty-print whitespace for the skeleton document
        self.html.text='\n\t'
        self.html[0].text='\n\t\t'
        self.html[0].tail='\n'
        for child in self.html[0]:
            child.tail = '\n\t\t'
        self.html[0][-1].tail = '\n\t'
        self.html[1].text = self.html[1].tail = '\n'
        lang = html_lang(self.mi.language)
        if lang:
            self.html.set('lang', lang)
            self.doc_lang = lang
        else:
            self.doc_lang = None
|
||||
|
||||
    def __call__(self):
        """Run the full conversion pipeline and return the path to the
        generated metadata.opf.

        The stages are order-dependent: styles must be read before
        paragraphs are converted, anchors before links are resolved, and
        CSS class generation must come after all style mutations.
        """
        doc = self.docx.document
        relationships_by_id, relationships_by_type = self.docx.document_relationships
        self.resolve_alternate_content(doc)
        self.fields(doc, self.log)
        self.read_styles(relationships_by_type)
        self.images(relationships_by_id)
        # Per-document conversion state, reset on every call
        self.layers = OrderedDict()
        self.framed = [[]]
        self.frame_map = {}
        self.framed_map = {}
        self.anchor_map = {}
        self.link_map = defaultdict(list)
        self.link_source_map = {}
        self.toc_anchor = None
        self.block_runs = []
        paras = []

        self.log.debug('Converting Word markup to HTML')

        self.read_page_properties(doc)
        self.current_rels = relationships_by_id
        # Convert every top-level paragraph in document order
        for wp, page_properties in iteritems(self.page_map):
            self.current_page = page_properties
            if wp.tag.endswith('}p'):
                p = self.convert_p(wp)
                self.body.append(p)
                paras.append(wp)

        self.read_block_anchors(doc)
        self.styles.apply_contextual_spacing(paras)
        self.mark_block_runs(paras)
        # Apply page breaks at the start of every section, except the first
        # section (since that will be the start of the file)
        self.styles.apply_section_page_breaks(self.section_starts[1:])

        # Render foot/endnotes as a trailing section of <dl> entries
        notes_header = None
        orig_rid_map = self.images.rid_map
        if self.footnotes.has_notes:
            self.body.append(H1(self.notes_text))
            notes_header = self.body[-1]
            notes_header.set('class', 'notes-header')
            for anchor, text, note in self.footnotes:
                dl = DL(id=anchor)
                dl.set('class', 'footnote')
                self.body.append(dl)
                # Back-link from the note to its reference in the text
                dl.append(DT('[', A('←' + text, href='#back_%s' % anchor, title=text)))
                dl[-1][0].tail = ']'
                dl.append(DD())
                paras = []
                # Notes resolve relationships against their own part
                self.images.rid_map = self.current_rels = note.rels[0]
                for wp in note:
                    if wp.tag.endswith('}tbl'):
                        self.tables.register(wp, self.styles)
                        self.page_map[wp] = self.current_page
                    else:
                        p = self.convert_p(wp)
                        dl[-1].append(p)
                        paras.append(wp)
                self.styles.apply_contextual_spacing(paras)
                self.mark_block_runs(paras)

        for p, wp in iteritems(self.object_map):
            if len(p) > 0 and not p.text and len(p[0]) > 0 and not p[0].text and p[0][0].get('class', None) == 'tab':
                # Paragraph uses tabs for indentation, convert to text-indent
                parent = p[0]
                tabs = []
                # Collect the leading run of tab spans (stop at the first
                # tab that carries tail text, or at a non-tab child)
                for child in parent:
                    if child.get('class', None) == 'tab':
                        tabs.append(child)
                        if child.tail:
                            break
                    else:
                        break
                indent = len(tabs) * self.settings.default_tab_stop
                style = self.styles.resolve(wp)
                if style.text_indent is inherit or (hasattr(style.text_indent, 'endswith') and style.text_indent.endswith('pt')):
                    if style.text_indent is not inherit:
                        indent = float(style.text_indent[:-2]) + indent
                    style.text_indent = '%.3gpt' % indent
                    parent.text = tabs[-1].tail or ''
                    list(map(parent.remove, tabs))

        self.images.rid_map = orig_rid_map

        self.resolve_links()

        self.styles.cascade(self.layers)

        self.tables.apply_markup(self.object_map, self.page_map)

        # Collect paragraphs that belong to numbered lists; the marker was
        # stashed in the calibre_num_id attribute as 'level:num_id'
        numbered = []
        for html_obj, obj in iteritems(self.object_map):
            raw = obj.get('calibre_num_id', None)
            if raw is not None:
                lvl, num_id = raw.partition(':')[0::2]
                try:
                    lvl = int(lvl)
                except (TypeError, ValueError):
                    lvl = 0
                numbered.append((html_obj, num_id, lvl))
        self.numbering.apply_markup(numbered, self.body, self.styles, self.object_map, self.images)
        self.apply_frames()

        # Re-indent the body now that its children are final
        if len(self.body) > 0:
            self.body.text = '\n\t'
            for child in self.body:
                child.tail = '\n\t'
            self.body[-1].tail = '\n'

        self.log.debug('Converting styles to CSS')
        self.styles.generate_classes()
        for html_obj, obj in iteritems(self.object_map):
            style = self.styles.resolve(obj)
            if style is not None:
                css = style.css
                if css:
                    cls = self.styles.class_name(css)
                    if cls:
                        html_obj.set('class', cls)
        for html_obj, css in iteritems(self.framed_map):
            cls = self.styles.class_name(css)
            if cls:
                html_obj.set('class', cls)

        # Make the notes header match the first real heading's tag/class
        if notes_header is not None:
            for h in self.namespace.children(self.body, 'h1', 'h2', 'h3'):
                notes_header.tag = h.tag
                cls = h.get('class', None)
                if cls and cls != 'notes-header':
                    notes_header.set('class', '%s notes-header' % cls)
                break

        self.fields.polish_markup(self.object_map)

        self.log.debug('Cleaning up redundant markup generated by Word')
        self.cover_image = cleanup_markup(self.log, self.html, self.styles, self.dest_dir, self.detect_cover, self.namespace.XPath)

        return self.write(doc)
|
||||
|
||||
    def read_page_properties(self, doc):
        """Partition block-level elements into sections.

        Populates ``self.page_map`` (block element -> PageProperties for its
        section) and ``self.section_starts`` (first block of each section).
        A <w:sectPr> inside a paragraph closes the current section; blocks
        after the last such marker use the body-level <w:sectPr>.
        """
        current = []
        self.page_map = OrderedDict()
        self.section_starts = []

        for p in self.namespace.descendants(doc, 'w:p', 'w:tbl'):
            if p.tag.endswith('}tbl'):
                # Tables never carry section markers; just register them
                self.tables.register(p, self.styles)
                current.append(p)
                continue
            sect = tuple(self.namespace.descendants(p, 'w:sectPr'))
            if sect:
                # This paragraph ends a section; everything accumulated so
                # far (plus this paragraph) shares its page properties
                pr = PageProperties(self.namespace, sect)
                paras = current + [p]
                for x in paras:
                    self.page_map[x] = pr
                self.section_starts.append(paras[0])
                current = []
            else:
                current.append(p)

        if current:
            # Trailing blocks belong to the final, body-level section
            self.section_starts.append(current[0])
            last = self.namespace.XPath('./w:body/w:sectPr')(doc)
            pr = PageProperties(self.namespace, last)
            for x in current:
                self.page_map[x] = pr
|
||||
|
||||
def resolve_alternate_content(self, doc):
|
||||
# For proprietary extensions in Word documents use the fallback, spec
|
||||
# compliant form
|
||||
# See https://wiki.openoffice.org/wiki/OOXML/Markup_Compatibility_and_Extensibility
|
||||
for ac in self.namespace.descendants(doc, 'mc:AlternateContent'):
|
||||
choices = self.namespace.XPath('./mc:Choice')(ac)
|
||||
fallbacks = self.namespace.XPath('./mc:Fallback')(ac)
|
||||
if fallbacks:
|
||||
for choice in choices:
|
||||
ac.remove(choice)
|
||||
|
||||
def read_styles(self, relationships_by_type):
|
||||
|
||||
def get_name(rtype, defname):
|
||||
name = relationships_by_type.get(rtype, None)
|
||||
if name is None:
|
||||
cname = self.docx.document_name.split('/')
|
||||
cname[-1] = defname
|
||||
if self.docx.exists('/'.join(cname)):
|
||||
name = name
|
||||
if name and name.startswith('word/word') and not self.docx.exists(name):
|
||||
name = name.partition('/')[2]
|
||||
return name
|
||||
|
||||
nname = get_name(self.namespace.names['NUMBERING'], 'numbering.xml')
|
||||
sname = get_name(self.namespace.names['STYLES'], 'styles.xml')
|
||||
sename = get_name(self.namespace.names['SETTINGS'], 'settings.xml')
|
||||
fname = get_name(self.namespace.names['FONTS'], 'fontTable.xml')
|
||||
tname = get_name(self.namespace.names['THEMES'], 'theme1.xml')
|
||||
foname = get_name(self.namespace.names['FOOTNOTES'], 'footnotes.xml')
|
||||
enname = get_name(self.namespace.names['ENDNOTES'], 'endnotes.xml')
|
||||
numbering = self.numbering = Numbering(self.namespace)
|
||||
footnotes = self.footnotes = Footnotes(self.namespace)
|
||||
fonts = self.fonts = Fonts(self.namespace)
|
||||
|
||||
foraw = enraw = None
|
||||
forel, enrel = ({}, {}), ({}, {})
|
||||
if sename is not None:
|
||||
try:
|
||||
seraw = self.docx.read(sename)
|
||||
except KeyError:
|
||||
self.log.warn('Settings %s do not exist' % sename)
|
||||
except EnvironmentError as e:
|
||||
if e.errno != errno.ENOENT:
|
||||
raise
|
||||
self.log.warn('Settings %s file missing' % sename)
|
||||
else:
|
||||
self.settings(fromstring(seraw))
|
||||
|
||||
if foname is not None:
|
||||
try:
|
||||
foraw = self.docx.read(foname)
|
||||
except KeyError:
|
||||
self.log.warn('Footnotes %s do not exist' % foname)
|
||||
else:
|
||||
forel = self.docx.get_relationships(foname)
|
||||
if enname is not None:
|
||||
try:
|
||||
enraw = self.docx.read(enname)
|
||||
except KeyError:
|
||||
self.log.warn('Endnotes %s do not exist' % enname)
|
||||
else:
|
||||
enrel = self.docx.get_relationships(enname)
|
||||
footnotes(fromstring(foraw) if foraw else None, forel, fromstring(enraw) if enraw else None, enrel)
|
||||
|
||||
if fname is not None:
|
||||
embed_relationships = self.docx.get_relationships(fname)[0]
|
||||
try:
|
||||
raw = self.docx.read(fname)
|
||||
except KeyError:
|
||||
self.log.warn('Fonts table %s does not exist' % fname)
|
||||
else:
|
||||
fonts(fromstring(raw), embed_relationships, self.docx, self.dest_dir)
|
||||
|
||||
if tname is not None:
|
||||
try:
|
||||
raw = self.docx.read(tname)
|
||||
except KeyError:
|
||||
self.log.warn('Styles %s do not exist' % sname)
|
||||
else:
|
||||
self.theme(fromstring(raw))
|
||||
|
||||
styles_loaded = False
|
||||
if sname is not None:
|
||||
try:
|
||||
raw = self.docx.read(sname)
|
||||
except KeyError:
|
||||
self.log.warn('Styles %s do not exist' % sname)
|
||||
else:
|
||||
self.styles(fromstring(raw), fonts, self.theme)
|
||||
styles_loaded = True
|
||||
if not styles_loaded:
|
||||
self.styles(None, fonts, self.theme)
|
||||
|
||||
if nname is not None:
|
||||
try:
|
||||
raw = self.docx.read(nname)
|
||||
except KeyError:
|
||||
self.log.warn('Numbering styles %s do not exist' % nname)
|
||||
else:
|
||||
numbering(fromstring(raw), self.styles, self.docx.get_relationships(nname)[0])
|
||||
|
||||
self.styles.resolve_numbering(numbering)
|
||||
|
||||
    def write(self, doc):
        """Serialize the converted document to ``self.dest_dir``.

        Writes index.html, docx.css (if any CSS was generated), toc.ncx and
        metadata.opf, and returns the path to metadata.opf.
        """
        toc = create_toc(doc, self.body, self.resolved_link_map, self.styles, self.object_map, self.log, self.namespace)
        raw = html.tostring(self.html, encoding='utf-8', doctype='<!DOCTYPE html>')
        with lopen(os.path.join(self.dest_dir, 'index.html'), 'wb') as f:
            f.write(raw)
        css = self.styles.generate_css(self.dest_dir, self.docx, self.notes_nopb, self.nosupsub)
        if css:
            with lopen(os.path.join(self.dest_dir, 'docx.css'), 'wb') as f:
                f.write(css.encode('utf-8'))

        opf = OPFCreator(self.dest_dir, self.mi)
        opf.toc = toc
        opf.create_manifest_from_files_in([self.dest_dir])
        for item in opf.manifest:
            # The conversion pipeline expects XHTML, not plain HTML
            if item.media_type == 'text/html':
                item.media_type = guess_type('a.xhtml')[0]
        opf.create_spine(['index.html'])
        if self.cover_image is not None:
            opf.guide.set_cover(self.cover_image)

        def process_guide(E, guide):
            # Add a guide reference to the in-document TOC, if one was found
            if self.toc_anchor is not None:
                guide.append(E.reference(
                    href='index.html#' + self.toc_anchor, title=_('Table of Contents'), type='toc'))
        toc_file = os.path.join(self.dest_dir, 'toc.ncx')
        with lopen(os.path.join(self.dest_dir, 'metadata.opf'), 'wb') as of, open(toc_file, 'wb') as ncx:
            opf.render(of, ncx, 'toc.ncx', process_guide=process_guide)
        # An empty NCX means no TOC was generated; don't ship the stub
        if os.path.getsize(toc_file) == 0:
            os.remove(toc_file)
        return os.path.join(self.dest_dir, 'metadata.opf')
|
||||
|
||||
def read_block_anchors(self, doc):
|
||||
doc_anchors = frozenset(self.namespace.XPath('./w:body/w:bookmarkStart[@w:name]')(doc))
|
||||
if doc_anchors:
|
||||
current_bm = set()
|
||||
rmap = {v:k for k, v in iteritems(self.object_map)}
|
||||
for p in self.namespace.descendants(doc, 'w:p', 'w:bookmarkStart[@w:name]'):
|
||||
if p.tag.endswith('}p'):
|
||||
if current_bm and p in rmap:
|
||||
para = rmap[p]
|
||||
if 'id' not in para.attrib:
|
||||
para.set('id', generate_anchor(next(iter(current_bm)), frozenset(itervalues(self.anchor_map))))
|
||||
for name in current_bm:
|
||||
self.anchor_map[name] = para.get('id')
|
||||
current_bm = set()
|
||||
elif p in doc_anchors:
|
||||
anchor = self.namespace.get(p, 'w:name')
|
||||
if anchor:
|
||||
current_bm.add(anchor)
|
||||
|
||||
    def convert_p(self, p):
        """Convert a single <w:p> paragraph into an HTML element.

        Handles runs, bookmarks (anchors), hyperlinks and TOC field markers
        inside the paragraph, heading detection from the style name, rtl
        direction, run border grouping, and Word's trailing-<br> quirks.
        Returns the new element (usually <p>, or <h1>..<h6> for headings).
        """
        dest = P()
        self.object_map[dest] = p
        style = self.styles.resolve_paragraph(p)
        self.layers[p] = []
        self.frame_map[p] = style.frame
        self.add_frame(dest, style.frame)

        # Bookkeeping for anchors/hyperlinks seen before the next run
        current_anchor = None
        current_hyperlink = None
        hl_xpath = self.namespace.XPath('ancestor::w:hyperlink[1]')

        def p_parent(x):
            # Ensure that nested <w:p> tags are handled. These can occur if a
            # textbox is present inside a paragraph.
            while True:
                x = x.getparent()
                try:
                    if x.tag.endswith('}p'):
                        return x
                except AttributeError:
                    break

        for x in self.namespace.descendants(p, 'w:r', 'w:bookmarkStart', 'w:hyperlink', 'w:instrText'):
            # Skip descendants that belong to a nested paragraph (textbox)
            if p_parent(x) is not p:
                continue
            if x.tag.endswith('}r'):
                span = self.convert_run(x)
                if current_anchor is not None:
                    # Put the pending anchor on the paragraph itself if this
                    # is its first child, otherwise on the run's span
                    (dest if len(dest) == 0 else span).set('id', current_anchor)
                    current_anchor = None
                if current_hyperlink is not None:
                    try:
                        hl = hl_xpath(x)[0]
                        self.link_map[hl].append(span)
                        self.link_source_map[hl] = self.current_rels
                        x.set('is-link', '1')
                    except IndexError:
                        # Run is no longer inside the hyperlink; close it
                        current_hyperlink = None
                dest.append(span)
                self.layers[p].append(x)
            elif x.tag.endswith('}bookmarkStart'):
                anchor = self.namespace.get(x, 'w:name')
                if anchor and anchor not in self.anchor_map and anchor != '_GoBack':
                    # _GoBack is a special bookmark inserted by Word 2010 for
                    # the return to previous edit feature, we ignore it
                    old_anchor = current_anchor
                    self.anchor_map[anchor] = current_anchor = generate_anchor(anchor, frozenset(itervalues(self.anchor_map)))
                    if old_anchor is not None:
                        # The previous anchor was not applied to any element
                        for a, t in tuple(iteritems(self.anchor_map)):
                            if t == old_anchor:
                                self.anchor_map[a] = current_anchor
            elif x.tag.endswith('}hyperlink'):
                current_hyperlink = x
            elif x.tag.endswith('}instrText') and x.text and x.text.strip().startswith('TOC '):
                # A TOC field: synthesize an anchor so the guide can point
                # at the in-document Table of Contents
                old_anchor = current_anchor
                anchor = unicode_type(uuid.uuid4())
                self.anchor_map[anchor] = current_anchor = generate_anchor('toc', frozenset(itervalues(self.anchor_map)))
                self.toc_anchor = current_anchor
                if old_anchor is not None:
                    # The previous anchor was not applied to any element
                    for a, t in tuple(iteritems(self.anchor_map)):
                        if t == old_anchor:
                            self.anchor_map[a] = current_anchor
        if current_anchor is not None:
            # This paragraph had no <w:r> descendants
            dest.set('id', current_anchor)
            current_anchor = None

        # Style names like 'Heading 3' become <h3> (clamped to h1..h6)
        m = re.match(r'heading\s+(\d+)$', style.style_name or '', re.IGNORECASE)
        if m is not None:
            n = min(6, max(1, int(m.group(1))))
            dest.tag = 'h%d' % n
            dest.set('data-heading-level', unicode_type(n))

        if style.bidi is True:
            dest.set('dir', 'rtl')

        # Group consecutive runs that share identical border styling so a
        # single wrapper <span> can carry the border CSS.
        # NOTE(review): a trailing group of matching runs is never flushed
        # into common_borders after the loop — confirm this is intended.
        border_runs = []
        common_borders = []
        for span in dest:
            run = self.object_map[span]
            style = self.styles.resolve_run(run)
            if not border_runs or border_runs[-1][1].same_border(style):
                border_runs.append((span, style))
            elif border_runs:
                if len(border_runs) > 1:
                    common_borders.append(border_runs)
                border_runs = []

        for border_run in common_borders:
            spans = []
            bs = {}
            for span, style in border_run:
                style.get_border_css(bs)
                style.clear_border_css()
                spans.append(span)
            if bs:
                cls = self.styles.register(bs, 'text_border')
                wrapper = self.wrap_elems(spans, SPAN())
                wrapper.set('class', cls)

        if not dest.text and len(dest) == 0 and not style.has_visible_border():
            # Empty paragraph add a non-breaking space so that it is rendered
            # by WebKit
            dest.text = NBSP

        # If the last element in a block is a <br> the <br> is not rendered in
        # HTML, unless it is followed by a trailing space. Word, on the other
        # hand inserts a blank line for trailing <br>s.
        if len(dest) > 0 and not dest[-1].tail:
            if dest[-1].tag == 'br':
                dest[-1].tail = NBSP
            elif len(dest[-1]) > 0 and dest[-1][-1].tag == 'br' and not dest[-1][-1].tail:
                dest[-1][-1].tail = NBSP

        return dest
|
||||
|
||||
def wrap_elems(self, elems, wrapper):
|
||||
p = elems[0].getparent()
|
||||
idx = p.index(elems[0])
|
||||
p.insert(idx, wrapper)
|
||||
wrapper.tail = elems[-1].tail
|
||||
elems[-1].tail = None
|
||||
for elem in elems:
|
||||
try:
|
||||
p.remove(elem)
|
||||
except ValueError:
|
||||
# Probably a hyperlink that spans multiple
|
||||
# paragraphs,theoretically we should break this up into
|
||||
# multiple hyperlinks, but I can't be bothered.
|
||||
elem.getparent().remove(elem)
|
||||
wrapper.append(elem)
|
||||
return wrapper
|
||||
|
||||
    def resolve_links(self):
        """Turn collected hyperlink information into <a> elements.

        Three sources are handled: <w:hyperlink> elements, HYPERLINK
        fields, and image links. External targets come from the part's
        relationships; internal ones from ``anchor_map``.
        """
        self.resolved_link_map = {}
        for hyperlink, spans in iteritems(self.link_map):
            relationships_by_id = self.link_source_map[hyperlink]
            span = spans[0]
            if len(spans) > 1:
                # One hyperlink covering several runs: wrap them all
                span = self.wrap_elems(spans, SPAN())
            span.tag = 'a'
            self.resolved_link_map[hyperlink] = span
            tgt = self.namespace.get(hyperlink, 'w:tgtFrame')
            if tgt:
                span.set('target', tgt)
            tt = self.namespace.get(hyperlink, 'w:tooltip')
            if tt:
                span.set('title', tt)
            rid = self.namespace.get(hyperlink, 'r:id')
            if rid and rid in relationships_by_id:
                # External link resolved via the relationships part
                span.set('href', relationships_by_id[rid])
                continue
            anchor = self.namespace.get(hyperlink, 'w:anchor')
            if anchor and anchor in self.anchor_map:
                # Internal link to a bookmark
                span.set('href', '#' + self.anchor_map[anchor])
                continue
            self.log.warn('Hyperlink with unknown target (rid=%s, anchor=%s), ignoring' %
                          (rid, anchor))
            # hrefs that point nowhere give epubcheck a hernia. The element
            # should be styled explicitly by Word anyway.
            # span.set('href', '#')
        rmap = {v:k for k, v in iteritems(self.object_map)}
        # HYPERLINK field instructions (parsed earlier by self.fields)
        for hyperlink, runs in self.fields.hyperlink_fields:
            spans = [rmap[r] for r in runs if r in rmap]
            if not spans:
                continue
            span = spans[0]
            if len(spans) > 1:
                span = self.wrap_elems(spans, SPAN())
            span.tag = 'a'
            tgt = hyperlink.get('target', None)
            if tgt:
                span.set('target', tgt)
            tt = hyperlink.get('title', None)
            if tt:
                span.set('title', tt)
            url = hyperlink.get('url', None)
            if url is None:
                anchor = hyperlink.get('anchor', None)
                if anchor in self.anchor_map:
                    span.set('href', '#' + self.anchor_map[anchor])
                    continue
                self.log.warn('Hyperlink field with unknown anchor: %s' % anchor)
            else:
                if url in self.anchor_map:
                    # A "URL" that is actually an internal bookmark name
                    span.set('href', '#' + self.anchor_map[url])
                    continue
                span.set('href', url)

        # Images that act as links: wrap the <img> in an <a>
        for img, link, relationships_by_id in self.images.links:
            parent = img.getparent()
            idx = parent.index(img)
            a = A(img)
            a.tail, img.tail = img.tail, None
            parent.insert(idx, a)
            tgt = link.get('target', None)
            if tgt:
                a.set('target', tgt)
            tt = link.get('title', None)
            if tt:
                a.set('title', tt)
            rid = link['id']
            if rid in relationships_by_id:
                dest = relationships_by_id[rid]
                if dest.startswith('#'):
                    # Internal target: only link if the bookmark is known
                    if dest[1:] in self.anchor_map:
                        a.set('href', '#' + self.anchor_map[dest[1:]])
                else:
                    a.set('href', dest)
|
||||
|
||||
    def convert_run(self, run):
        """Convert a single <w:r> run into a <span>.

        Handles text (with whitespace-preservation rules), line/page
        breaks, images, foot/endnote references, tabs and special hyphens,
        then applies run-level styling (vertical align, lang, rtl, symbol
        font remapping).
        """
        ans = SPAN()
        self.object_map[ans] = run
        # Text accumulator: flushes buffered characters onto text/tail
        # attributes as child elements are added
        text = Text(ans, 'text', [])

        for child in run:
            if self.namespace.is_tag(child, 'w:t'):
                if not child.text:
                    continue
                space = child.get(XML('space'), None)
                preserve = False
                ctext = child.text
                if space != 'preserve':
                    # Remove leading and trailing whitespace. Word ignores
                    # leading and trailing whitespace without preserve
                    ctext = ctext.strip(' \n\r\t')
                # Only use a <span> with white-space:pre-wrap if this element
                # actually needs it, i.e. if it has more than one
                # consecutive space or it has newlines or tabs.
                multi_spaces = self.ms_pat.search(ctext) is not None
                preserve = multi_spaces or self.ws_pat.search(ctext) is not None
                if preserve:
                    text.add_elem(SPAN(ctext, style="white-space:pre-wrap"))
                    ans.append(text.elem)
                else:
                    text.buf.append(ctext)
            elif self.namespace.is_tag(child, 'w:cr'):
                # Carriage return: a plain line break
                text.add_elem(BR())
                ans.append(text.elem)
            elif self.namespace.is_tag(child, 'w:br'):
                typ = self.namespace.get(child, 'w:type')
                if typ in {'column', 'page'}:
                    br = BR(style='page-break-after:always')
                else:
                    clear = child.get('clear', None)
                    if clear in {'all', 'left', 'right'}:
                        br = BR(style='clear:%s'%('both' if clear == 'all' else clear))
                    else:
                        br = BR()
                text.add_elem(br)
                ans.append(text.elem)
            elif self.namespace.is_tag(child, 'w:drawing') or self.namespace.is_tag(child, 'w:pict'):
                for img in self.images.to_html(child, self.current_page, self.docx, self.dest_dir):
                    text.add_elem(img)
                    ans.append(text.elem)
            elif self.namespace.is_tag(child, 'w:footnoteReference') or self.namespace.is_tag(child, 'w:endnoteReference'):
                anchor, name = self.footnotes.get_ref(child)
                if anchor and name:
                    # Forward link to the note; resolve_links/notes section
                    # creates the matching back-link
                    l = A(name, id='back_%s' % anchor, href='#' + anchor, title=name)
                    l.set('class', 'noteref')
                    text.add_elem(l)
                    ans.append(text.elem)
            elif self.namespace.is_tag(child, 'w:tab'):
                # Approximate a tab with non-breaking spaces, sized from
                # the document's default tab stop
                spaces = int(math.ceil((self.settings.default_tab_stop / 36) * 6))
                text.add_elem(SPAN(NBSP * spaces))
                ans.append(text.elem)
                ans[-1].set('class', 'tab')
            elif self.namespace.is_tag(child, 'w:noBreakHyphen'):
                text.buf.append('\u2011')
            elif self.namespace.is_tag(child, 'w:softHyphen'):
                text.buf.append('\u00ad')
        if text.buf:
            # Flush any remaining buffered text
            setattr(text.elem, text.attr, ''.join(text.buf))

        style = self.styles.resolve_run(run)
        if style.vert_align in {'superscript', 'subscript'}:
            if ans.text or len(ans):
                ans.set('data-docx-vert', 'sup' if style.vert_align == 'superscript' else 'sub')
        if style.lang is not inherit:
            lang = html_lang(style.lang)
            if lang is not None and lang != self.doc_lang:
                ans.set('lang', lang)
        if style.rtl is True:
            ans.set('dir', 'rtl')
        if is_symbol_font(style.font_family):
            # Map symbol-font code points to their Unicode equivalents and
            # fall back to a normal font
            for elem in text:
                if elem.text:
                    elem.text = map_symbol_text(elem.text, style.font_family)
                if elem.tail:
                    elem.tail = map_symbol_text(elem.tail, style.font_family)
            style.font_family = 'sans-serif'
        return ans
|
||||
|
||||
def add_frame(self, html_obj, style):
|
||||
last_run = self.framed[-1]
|
||||
if style is inherit:
|
||||
if last_run:
|
||||
self.framed.append([])
|
||||
return
|
||||
|
||||
if last_run:
|
||||
if last_run[-1][1] == style:
|
||||
last_run.append((html_obj, style))
|
||||
else:
|
||||
self.framed[-1].append((html_obj, style))
|
||||
else:
|
||||
last_run.append((html_obj, style))
|
||||
|
||||
    def apply_frames(self):
        """Wrap framed paragraph runs and border runs in <div> elements.

        The frame/border CSS is recorded in ``framed_map`` so the class
        assignment pass in __call__ can attach the generated class names.
        """
        # Runs of paragraphs that share a frame style (built by add_frame)
        for run in filter(None, self.framed):
            style = run[0][1]
            paras = tuple(x[0] for x in run)
            parent = paras[0].getparent()
            idx = parent.index(paras[0])
            frame = DIV(*paras)
            parent.insert(idx, frame)
            self.framed_map[frame] = css = style.css(self.page_map[self.object_map[paras[0]]])
            self.styles.register(css, 'frame')

        if not self.block_runs:
            return
        # Runs of paragraphs with identical borders (built by mark_block_runs)
        rmap = {v:k for k, v in iteritems(self.object_map)}
        for border_style, blocks in self.block_runs:
            paras = tuple(rmap[p] for p in blocks)
            for p in paras:
                if p.tag == 'li':
                    has_li = True
                    break
            else:
                has_li = False
            parent = paras[0].getparent()
            if parent.tag in ('ul', 'ol'):
                # Whole list is bordered: wrap the list element itself
                ul = parent
                parent = ul.getparent()
                idx = parent.index(ul)
                frame = DIV(ul)
            elif has_li:
                def top_level_tag(x):
                    # Climb to the direct child of parent that contains x
                    while True:
                        q = x.getparent()
                        if q is parent or q is None:
                            break
                        x = q
                    return x
                paras = tuple(map(top_level_tag, paras))
                idx = parent.index(paras[0])
                frame = DIV(*paras)
            else:
                idx = parent.index(paras[0])
                frame = DIV(*paras)
            parent.insert(idx, frame)
            self.framed_map[frame] = css = border_style.css
            self.styles.register(css, 'frame')
|
||||
|
||||
    def mark_block_runs(self, paras):
        """Find runs of consecutive paragraphs with identical borders.

        For each run of more than one paragraph (in the same frame), the
        border/margin styling is hoisted onto a shared border style and
        stripped from the individual paragraphs; the (border_style, run)
        pairs are queued in ``self.block_runs`` for apply_frames().
        """

        def process_run(run):
            # Hoist borders from the paragraphs in `run` onto one style
            max_left = max_right = 0
            has_visible_border = None
            for p in run:
                style = self.styles.resolve_paragraph(p)
                if has_visible_border is None:
                    # Decided once, from the first paragraph of the run
                    has_visible_border = style.has_visible_border()
                if isinstance(style.margin_left, numbers.Number):
                    max_left = max(style.margin_left, max_left)
                if isinstance(style.margin_right, numbers.Number):
                    max_right = max(style.margin_right, max_right)
                if has_visible_border:
                    # Horizontal margins move to the wrapper <div>
                    style.margin_left = style.margin_right = inherit
                if p is not run[0]:
                    style.padding_top = 0
                else:
                    # First paragraph donates its border styles to the run
                    border_style = style.clone_border_styles()
                    if has_visible_border:
                        border_style.margin_top, style.margin_top = style.margin_top, inherit
                if p is not run[-1]:
                    style.padding_bottom = 0
                else:
                    if has_visible_border:
                        border_style.margin_bottom, style.margin_bottom = style.margin_bottom, inherit
                style.clear_borders()
                if p is not run[-1]:
                    style.apply_between_border()
            if has_visible_border:
                border_style.margin_left, border_style.margin_right = max_left,max_right
            self.block_runs.append((border_style, run))

        # Group consecutive paragraphs that live in the same frame and
        # have identical borders
        run = []
        for p in paras:
            if run and self.frame_map.get(p) == self.frame_map.get(run[-1]):
                style = self.styles.resolve_paragraph(p)
                last_style = self.styles.resolve_paragraph(run[-1])
                if style.has_identical_borders(last_style):
                    run.append(p)
                    continue
            if len(run) > 1:
                process_run(run)
            run = [p]
        if len(run) > 1:
            process_run(run)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Manual testing entry point: convert the .docx given as the last
    # command line argument into ./docx_input (recreated on every run).
    import shutil
    from calibre.utils.logging import default_log
    default_log.filter_level = default_log.DEBUG
    dest_dir = os.path.join(getcwd(), 'docx_input')
    if os.path.exists(dest_dir):
        shutil.rmtree(dest_dir)
    os.mkdir(dest_dir)
    Convert(sys.argv[-1], dest_dir=dest_dir, log=default_log)()
|
||||
143
ebook_converter/ebooks/docx/toc.py
Normal file
143
ebook_converter/ebooks/docx/toc.py
Normal file
@@ -0,0 +1,143 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=utf-8
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
from collections import namedtuple
|
||||
from itertools import count
|
||||
|
||||
from lxml.etree import tostring
|
||||
|
||||
from calibre.ebooks.metadata.toc import TOC
|
||||
from calibre.ebooks.oeb.polish.toc import elem_to_toc_text
|
||||
from polyglot.builtins import iteritems, range
|
||||
|
||||
|
||||
def from_headings(body, log, namespace, num_levels=3):
    ' Create a TOC from headings in the document '
    # Heading paragraphs were tagged with data-heading-level by convert_p()
    tocroot = TOC()
    all_heading_nodes = body.xpath('//*[@data-heading-level]')
    # level_prev[i] is the most recent TOC node created at level i;
    # level 0 is the root so every heading always finds a parent
    level_prev = {i+1:None for i in range(num_levels)}
    level_prev[0] = tocroot
    level_item_map = {i:frozenset(
        x for x in all_heading_nodes if int(x.get('data-heading-level')) == i)
        for i in range(1, num_levels+1)}
    item_level_map = {e:i for i, elems in iteritems(level_item_map) for e in elems}

    idcount = count()

    def ensure_id(elem):
        # TOC entries need an id to link to; generate one if missing
        ans = elem.get('id', None)
        if not ans:
            ans = 'toc_id_%d' % (next(idcount) + 1)
            elem.set('id', ans)
        return ans

    for item in all_heading_nodes:
        lvl = plvl = item_level_map.get(item, None)
        if lvl is None:
            # Heading level outside 1..num_levels: skip
            continue
        # Walk up until we find the nearest existing ancestor level
        parent = None
        while parent is None:
            plvl -= 1
            parent = level_prev[plvl]
        lvl = plvl + 1
        elem_id = ensure_id(item)
        text = elem_to_toc_text(item)
        toc = parent.add_item('index.html', elem_id, text)
        level_prev[lvl] = toc
        # Deeper levels must re-anchor under this new entry
        for i in range(lvl+1, num_levels+1):
            level_prev[i] = None

    # Only return a TOC when it has at least one real entry
    if len(tuple(tocroot.flat())) > 1:
        log('Generating Table of Contents from headings')
        return tocroot
|
||||
|
||||
|
||||
def structure_toc(entries):
    """Build a nested TOC from flat entries, inferring nesting depth from
    each entry's left indent.

    With more than six distinct indent values the indents are considered
    meaningless and a flat TOC is returned instead.
    """
    root = TOC()
    indent_vals = sorted({entry.indent for entry in entries})

    if len(indent_vals) > 6:
        # Too many distinct indents to form a sane hierarchy: flatten
        for entry in entries:
            root.add_item('index.html', entry.anchor, entry.text)
        return root

    last_found = [None] * len(indent_vals)

    def find_parent(level):
        # Nearest shallower entry already seen, else the root
        for candidate in reversed(last_found[:level]):
            if candidate is not None:
                return candidate
        return root

    for entry in entries:
        level = indent_vals.index(entry.indent)
        last_found[level] = find_parent(level).add_item(
            'index.html', entry.anchor, entry.text)
        # Deeper levels must re-anchor under this new entry
        for deeper in range(level + 1, len(last_found)):
            last_found[deeper] = None

    return root
|
||||
|
||||
|
||||
def link_to_txt(a, styles, object_map):
    """Return the plain text of hyperlink element *a*.

    Child runs whose resolved style is display:none (hidden text inside
    Word TOC links) are removed before extracting the text.
    """
    if len(a) > 1:
        # Fix: iterate over a snapshot — removing children while iterating
        # the live element skips the sibling following each removed child,
        # so consecutive hidden runs were not all removed.
        for child in tuple(a):
            run = object_map.get(child, None)
            if run is not None:
                rs = styles.resolve(run)
                if rs.css.get('display', None) == 'none':
                    a.remove(child)

    return tostring(a, method='text', with_tail=False, encoding='unicode').strip()
|
||||
|
||||
|
||||
def from_toc(docx, link_map, styles, object_map, log, namespace):
    """Extract a Table of Contents from a Word TOC field in ``docx``.

    Scans the document for field characters, instruction text and
    hyperlinks.  When a ``TOC`` field instruction is found, the hyperlinks
    nested inside that field are collected as TOC entries; the entry's
    indent is taken from the paragraph's left margin so that
    :func:`structure_toc` can reconstruct the nesting.  Returns a TOC root
    or ``None`` when no Word TOC is present.
    """
    XPath, get, ancestor = namespace.XPath, namespace.get, namespace.ancestor
    toc_level = None  # field-nesting depth at which the TOC field begins
    level = 0  # current w:fldChar begin/end nesting depth
    TI = namedtuple('TI', 'text anchor indent')
    toc = []
    for tag in XPath('//*[(@w:fldCharType and name()="w:fldChar") or name()="w:hyperlink" or name()="w:instrText"]')(docx):
        n = tag.tag.rpartition('}')[-1]  # local name without namespace
        if n == 'fldChar':
            t = get(tag, 'w:fldCharType')
            if t == 'begin':
                level += 1
            elif t == 'end':
                level -= 1
                # Leaving the field that contained the TOC: stop scanning.
                if toc_level is not None and level < toc_level:
                    break
        elif n == 'instrText':
            # A 'TOC ' instruction marks the enclosing field as the TOC.
            if level > 0 and tag.text and tag.text.strip().startswith('TOC '):
                toc_level = level
        elif n == 'hyperlink':
            # Only hyperlinks nested inside the TOC field become entries.
            if toc_level is not None and level >= toc_level and tag in link_map:
                a = link_map[tag]
                href = a.get('href', None)
                txt = link_to_txt(a, styles, object_map)
                p = ancestor(tag, 'w:p')
                if txt and href and p is not None:
                    ps = styles.resolve_paragraph(p)
                    try:
                        # Strip the 2-char unit suffix (e.g. 'pt') and use
                        # the left margin as the entry's indent.
                        ml = int(ps.margin_left[:-2])
                    except (TypeError, ValueError, AttributeError):
                        ml = 0
                    # Centered/right-aligned margins do not indicate depth.
                    if ps.text_align in {'center', 'right'}:
                        ml = 0
                    toc.append(TI(txt, href[1:], ml))
    if toc:
        log('Found Word Table of Contents, using it to generate the Table of Contents')
        return structure_toc(toc)
|
||||
|
||||
|
||||
def create_toc(docx, body, link_map, styles, object_map, log, namespace):
    """Generate the book TOC, preferring an embedded Word TOC field and
    falling back to the document headings."""
    toc = from_toc(docx, link_map, styles, object_map, log, namespace)
    if not toc:
        toc = from_headings(body, log, namespace)
    # The temporary heading-level markers are no longer needed.
    for elem in body.xpath('//*[@data-heading-level]'):
        del elem.attrib['data-heading-level']
    return toc
|
||||
7
ebook_converter/ebooks/html/__init__.py
Normal file
7
ebook_converter/ebooks/html/__init__.py
Normal file
@@ -0,0 +1,7 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
258
ebook_converter/ebooks/html/input.py
Normal file
258
ebook_converter/ebooks/html/input.py
Normal file
@@ -0,0 +1,258 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
|
||||
'''
|
||||
Input plugin for HTML or OPF ebooks.
|
||||
'''
|
||||
|
||||
import os, re, sys, errno as gerrno
|
||||
|
||||
from calibre.ebooks.oeb.base import urlunquote
|
||||
from calibre.ebooks.chardet import detect_xml_encoding
|
||||
from calibre.constants import iswindows
|
||||
from calibre import unicode_path, as_unicode, replace_entities
|
||||
from polyglot.builtins import is_py3, unicode_type
|
||||
from polyglot.urllib import urlparse, urlunparse
|
||||
|
||||
|
||||
class Link(object):
    """A link found in a HTML file, resolved against that file's directory."""

    @classmethod
    def url_to_local_path(cls, url, base):
        """Turn the parsed local *url* into a filesystem path, resolving
        relative paths against *base*."""
        raw, absolute = url.path, False
        if iswindows and raw.startswith('/'):
            # A windows file URL carries a leading slash before the drive.
            raw, absolute = raw[1:], True
        local = urlunquote(urlunparse(('', '', raw, url.params, url.query, '')))
        if absolute or os.path.isabs(local):
            return local
        return os.path.abspath(os.path.join(base, local))

    def __init__(self, url, base):
        """
        :param url: The url this link points to. Must be an unquoted unicode string.
        :param base: The base directory that relative URLs are with respect to.
                     Must be a unicode string.
        """
        assert isinstance(url, unicode_type) and isinstance(base, unicode_type)
        self.url = url
        self.parsed_url = urlparse(self.url)
        # Local links have no scheme or the explicit file:// scheme.
        self.is_local = self.parsed_url.scheme in ('', 'file')
        # Internal links point to an anchor in the same file (no path part).
        self.is_internal = self.is_local and not bool(self.parsed_url.path)
        self.fragment = urlunquote(self.parsed_url.fragment)
        self.path = None
        if self.is_local and not self.is_internal:
            self.path = self.url_to_local_path(self.parsed_url, base)

    def __hash__(self):
        # Hash on the resolved path when available so equal links collide.
        return hash(self.url if self.path is None else self.path)

    def __eq__(self, other):
        # Allows comparison against both Link objects and raw path strings.
        return self.path == getattr(other, 'path', other)

    def __str__(self):
        return 'Link: %s --> %s' % (self.url, self.path)

    if not is_py3:
        __unicode__ = __str__
|
||||
|
||||
|
||||
class IgnoreFile(Exception):
    """Raised when a linked file cannot be used and should be skipped."""

    def __init__(self, msg, errno):
        super(IgnoreFile, self).__init__(msg)
        self.errno = errno
        # Distinguish "file does not exist" from other failures.
        self.doesnt_exist = (errno == gerrno.ENOENT)
|
||||
|
||||
|
||||
class HTMLFile(object):
    '''
    Contains basic information about an HTML file. This
    includes a list of links to other files as well as
    the encoding of each file. Also tries to detect if the file is not a HTML
    file in which case :member:`is_binary` is set to True.

    The encoding of the file is available as :member:`encoding`.
    '''

    # Presence of an <html tag anywhere in the first 4K marks a HTML file.
    HTML_PAT = re.compile(r'<\s*html', re.IGNORECASE)
    TITLE_PAT = re.compile('<title>([^<>]+)</title>', re.IGNORECASE)
    # Captures the href of <a> tags in any of the three quoting styles.
    LINK_PAT = re.compile(
        r'<\s*a\s+.*?href\s*=\s*(?:(?:"(?P<url1>[^"]+)")|(?:\'(?P<url2>[^\']+)\')|(?P<url3>[^\s>]+))',
        re.DOTALL|re.IGNORECASE)

    def __init__(self, path_to_html_file, level, encoding, verbose, referrer=None):
        '''
        :param level: The level of this file. Should be 0 for the root file.
        :param encoding: Use `encoding` to decode HTML.
        :param referrer: The :class:`HTMLFile` that first refers to this file.
        '''
        self.path = unicode_path(path_to_html_file, abs=True)
        # Fallback title (basename); replaced by <title> content if found.
        self.title = os.path.splitext(os.path.basename(self.path))[0]
        self.base = os.path.dirname(self.path)
        self.level = level
        self.referrer = referrer
        self.links = []

        try:
            with open(self.path, 'rb') as f:
                # Sniff only the first 4K for encoding/binary detection.
                src = header = f.read(4096)
                # NOTE(review): this clobbers the caller-supplied encoding
                # whenever detection succeeds — confirm that is intended.
                encoding = detect_xml_encoding(src)[1]
                if encoding:
                    try:
                        header = header.decode(encoding)
                    except ValueError:
                        pass
                # Non-root files without an <html tag are treated as binary.
                self.is_binary = level > 0 and not bool(self.HTML_PAT.search(header))
                if not self.is_binary:
                    src += f.read()
        except IOError as err:
            msg = 'Could not read from file: %s with error: %s'%(self.path, as_unicode(err))
            if level == 0:
                # The root file must be readable; anything else is skippable.
                raise IOError(msg)
            raise IgnoreFile(msg, err.errno)

        if not src:
            if level == 0:
                raise ValueError('The file %s is empty'%self.path)
            self.is_binary = True

        if not self.is_binary:
            if not encoding:
                encoding = detect_xml_encoding(src[:4096], verbose=verbose)[1]
                self.encoding = encoding
            else:
                self.encoding = encoding

            # 'replace' so undecodable bytes never abort the conversion.
            src = src.decode(encoding, 'replace')
            match = self.TITLE_PAT.search(src)
            self.title = match.group(1) if match is not None else self.title
            self.find_links(src)

    def __eq__(self, other):
        # Allows comparison against Link objects and raw path strings too.
        return self.path == getattr(other, 'path', other)

    def __hash__(self):
        return hash(self.path)

    def __str__(self):
        return 'HTMLFile:%d:%s:%s'%(self.level, 'b' if self.is_binary else 'a', self.path)

    def __repr__(self):
        return unicode_type(self)

    def find_links(self, src):
        """Populate self.links with de-duplicated Link objects found in src."""
        for match in self.LINK_PAT.finditer(src):
            url = None
            for i in ('url1', 'url2', 'url3'):
                url = match.group(i)
                if url:
                    break
            url = replace_entities(url)
            try:
                link = self.resolve(url)
            except ValueError:
                # Unparseable URL, ignore
                continue
            if link not in self.links:
                self.links.append(link)

    def resolve(self, url):
        """Resolve *url* relative to this file's directory."""
        return Link(url, self.base)
|
||||
|
||||
|
||||
def depth_first(root, flat, visited=None):
    """Yield HTMLFile objects reachable from *root* in depth-first order.

    *flat* is the full list of discovered HTMLFile objects; links are
    mapped back to HTMLFile instances via ``flat.index`` (which relies on
    Link/HTMLFile ``__eq__`` comparing resolved paths).  *visited* is
    shared across the recursion to avoid yielding a file twice.
    """
    yield root
    if visited is None:
        visited = set()
    visited.add(root)
    for link in root.links:
        # ``link not in visited`` works because Link.__eq__ compares paths.
        if link.path is not None and link not in visited:
            try:
                index = flat.index(link)
            except ValueError:  # Can happen if max_levels is used
                continue
            hf = flat[index]
            if hf not in visited:
                yield hf
                visited.add(hf)
                # Recurse; inner yields are re-filtered against visited
                # because the recursive call yields its own root eagerly.
                for hf in depth_first(hf, flat, visited):
                    if hf not in visited:
                        yield hf
                        visited.add(hf)
|
||||
|
||||
|
||||
def traverse(path_to_html_file, max_levels=sys.maxsize, verbose=0, encoding=None):
    '''
    Recursively traverse all links in the HTML file.

    :param max_levels: Maximum levels of recursion. Must be non-negative. 0
                       implies that no links in the root HTML file are followed.
    :param encoding: Specify character encoding of HTML files. If `None` it is
                     auto-detected.
    :return: A pair of lists (breadth_first, depth_first). Each list contains
             :class:`HTMLFile` objects.
    '''
    assert max_levels >= 0
    level = 0
    # ``flat`` accumulates files in breadth-first discovery order.
    flat = [HTMLFile(path_to_html_file, level, encoding, verbose)]
    next_level = list(flat)
    while level < max_levels and len(next_level) > 0:
        level += 1
        nl = []
        for hf in next_level:
            rejects = []
            for link in hf.links:
                # ``link.path in flat`` compares resolved paths via __eq__,
                # skipping files that were already discovered.
                if link.path is None or link.path in flat:
                    continue
                try:
                    nf = HTMLFile(link.path, level, encoding, verbose, referrer=hf)
                    if nf.is_binary:
                        raise IgnoreFile('%s is a binary file'%nf.path, -1)
                    nl.append(nf)
                    flat.append(nf)
                except IgnoreFile as err:
                    # Remember bad links so they can be pruned below;
                    # don't mutate hf.links while iterating it.
                    rejects.append(link)
                    if not err.doesnt_exist or verbose > 1:
                        print(repr(err))
            for link in rejects:
                hf.links.remove(link)

        next_level = list(nl)
    # depth_first recurses once per file; deep link chains need a much
    # larger recursion limit than Python's default. Restore it afterwards.
    orec = sys.getrecursionlimit()
    sys.setrecursionlimit(500000)
    try:
        return flat, list(depth_first(flat[0], flat))
    finally:
        sys.setrecursionlimit(orec)
|
||||
|
||||
|
||||
def get_filelist(htmlfile, dir, opts, log):
    '''
    Build list of files referenced by html file or try to detect and use an
    OPF file instead.
    '''
    log.info('Building file list...')
    breadth, depth = traverse(htmlfile, max_levels=int(opts.max_levels),
                              verbose=opts.verbose,
                              encoding=opts.input_encoding)
    filelist = breadth if opts.breadth_first else depth
    if opts.verbose:
        log.debug('\tFound files...')
        for found in filelist:
            log.debug('\t\t', found)
    return filelist
|
||||
122
ebook_converter/ebooks/html/to_zip.py
Normal file
122
ebook_converter/ebooks/html/to_zip.py
Normal file
@@ -0,0 +1,122 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import textwrap, os, glob
|
||||
|
||||
from calibre.customize import FileTypePlugin
|
||||
from calibre.constants import numeric_version
|
||||
from polyglot.builtins import unicode_type
|
||||
|
||||
|
||||
class HTML2ZIP(FileTypePlugin):
    """File-type plugin that bundles a HTML file and everything it links
    to into a single ZIP, run automatically on import."""

    name = 'HTML to ZIP'
    author = 'Kovid Goyal'
    description = textwrap.dedent(_('''\
Follow all local links in an HTML file and create a ZIP \
file containing all linked files. This plugin is run \
every time you add an HTML file to the library.\
'''))
    version = numeric_version
    file_types = {'html', 'htm', 'xhtml', 'xhtm', 'shtm', 'shtml'}
    supported_platforms = ['windows', 'osx', 'linux']
    on_import = True  # run automatically whenever a matching file is added

    def run(self, htmlfile):
        """Convert *htmlfile* (following its local links) and return the
        path of the resulting temporary ZIP file."""
        import codecs
        from calibre import prints
        from calibre.ptempfile import TemporaryDirectory
        from calibre.gui2.convert.gui_conversion import gui_convert
        from calibre.customize.conversion import OptionRecommendation
        from calibre.ebooks.epub import initialize_container

        with TemporaryDirectory('_plugin_html2zip') as tdir:
            recs =[('debug_pipeline', tdir, OptionRecommendation.HIGH)]
            recs.append(['keep_ligatures', True, OptionRecommendation.HIGH])
            # Site customization is "encoding|bf": an optional input
            # encoding, optionally followed by the breadth-first flag.
            if self.site_customization and self.site_customization.strip():
                sc = self.site_customization.strip()
                enc, _, bf = sc.partition('|')
                if enc:
                    try:
                        codecs.lookup(enc)
                    except Exception:
                        prints('Ignoring invalid input encoding for HTML:', enc)
                    else:
                        recs.append(['input_encoding', enc, OptionRecommendation.HIGH])
                if bf == 'bf':
                    recs.append(['breadth_first', True,
                        OptionRecommendation.HIGH])
            # abort_after_input_dump leaves the expanded input in tdir/input.
            gui_convert(htmlfile, tdir, recs, abort_after_input_dump=True)
            of = self.temporary_file('_plugin_html2zip.zip')
            tdir = os.path.join(tdir, 'input')
            opf = glob.glob(os.path.join(tdir, '*.opf'))[0]
            ncx = glob.glob(os.path.join(tdir, '*.ncx'))
            if ncx:
                os.remove(ncx[0])
            epub = initialize_container(of.name, os.path.basename(opf))
            epub.add_dir(tdir)
            epub.close()

        return of.name

    def customization_help(self, gui=False):
        return _('Character encoding for the input HTML files. Common choices '
                 'include: cp1252, cp1251, latin1 and utf-8.')

    def do_user_config(self, parent=None):
        '''
        This method shows a configuration dialog for this plugin. It returns
        True if the user clicks OK, False otherwise. The changes are
        automatically applied.
        '''
        from PyQt5.Qt import (QDialog, QDialogButtonBox, QVBoxLayout,
                QLabel, Qt, QLineEdit, QCheckBox)

        config_dialog = QDialog(parent)
        button_box = QDialogButtonBox(QDialogButtonBox.Ok | QDialogButtonBox.Cancel)
        v = QVBoxLayout(config_dialog)

        def size_dialog():
            config_dialog.resize(config_dialog.sizeHint())

        button_box.accepted.connect(config_dialog.accept)
        button_box.rejected.connect(config_dialog.reject)
        config_dialog.setWindowTitle(_('Customize') + ' ' + self.name)
        from calibre.customize.ui import (plugin_customization,
                customize_plugin)
        help_text = self.customization_help(gui=True)
        help_text = QLabel(help_text, config_dialog)
        help_text.setWordWrap(True)
        help_text.setTextInteractionFlags(Qt.LinksAccessibleByMouse | Qt.LinksAccessibleByKeyboard)
        help_text.setOpenExternalLinks(True)
        v.addWidget(help_text)
        bf = QCheckBox(_('Add linked files in breadth first order'))
        bf.setToolTip(_('Normally, when following links in HTML files'
            ' calibre does it depth first, i.e. if file A links to B and '
            ' C, but B links to D, the files are added in the order A, B, D, C. '
            ' With this option, they will instead be added as A, B, C, D'))
        # Current customization is the same "encoding|bf" string read in run().
        sc = plugin_customization(self)
        if not sc:
            sc = ''
        sc = sc.strip()
        enc = sc.partition('|')[0]
        bfs = sc.partition('|')[-1]
        bf.setChecked(bfs == 'bf')
        sc = QLineEdit(enc, config_dialog)
        v.addWidget(sc)
        v.addWidget(bf)
        v.addWidget(button_box)
        size_dialog()
        config_dialog.exec_()

        if config_dialog.result() == QDialog.Accepted:
            # Re-assemble the "encoding|bf" customization string.
            sc = unicode_type(sc.text()).strip()
            if bf.isChecked():
                sc += '|bf'
            customize_plugin(self, sc)

        return config_dialog.result()
|
||||
2152
ebook_converter/ebooks/html_entities.py
Normal file
2152
ebook_converter/ebooks/html_entities.py
Normal file
File diff suppressed because it is too large
Load Diff
115
ebook_converter/ebooks/lrf/__init__.py
Normal file
115
ebook_converter/ebooks/lrf/__init__.py
Normal file
@@ -0,0 +1,115 @@
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
"""
|
||||
This package contains logic to read and write LRF files.
|
||||
The LRF file format is documented at U{http://www.sven.de/librie/Librie/LrfFormat}.
|
||||
"""
|
||||
|
||||
from calibre.ebooks.lrf.pylrs.pylrs import Book as _Book
|
||||
from calibre.ebooks.lrf.pylrs.pylrs import TextBlock, Header, \
|
||||
TextStyle, BlockStyle
|
||||
from calibre.ebooks.lrf.fonts import FONT_FILE_MAP
|
||||
from calibre.ebooks import ConversionError
|
||||
|
||||
__docformat__ = "epytext"
|
||||
|
||||
|
||||
class LRFParseError(Exception):
    """Raised when an LRF file cannot be parsed."""
|
||||
|
||||
|
||||
class PRS500_PROFILE(object):
    """Output profile describing the Sony PRS-500 reader's screen."""

    screen_width = 600  # px
    screen_height = 775  # px
    dpi = 166
    # Number of pixels to subtract from screen_height when calculating height of text area
    fudge = 0
    font_size = 10  #: Default (in pt)
    parindent = 10  #: Default (in pt)
    line_space = 1.2  # : Default (in pt)
    header_font_size = 6  #: In pt
    header_height = 30  # : In px
    # Device font used when the user supplies no font for a generic family.
    default_fonts = {'sans': "Swis721 BT Roman", 'mono': "Courier10 BT Roman",
                     'serif': "Dutch801 Rm BT Roman"}

    name = 'prs500'
|
||||
|
||||
|
||||
def find_custom_fonts(options, logger):
    """Locate user-specified fonts for the three generic families.

    :param options: must provide ``serif_family``, ``sans_family`` and
        ``mono_family`` attributes (empty/None when unspecified).
    :param logger: receives a warning for each family that cannot be found.
    :return: dict mapping 'serif'/'sans'/'mono' to the scanner's legacy
        font data, or ``None`` for families the user did not specify.
    """
    from calibre.utils.fonts.scanner import font_scanner
    fonts = {'serif': None, 'sans': None, 'mono': None}

    def family(cmd):
        # The option may hold a CSS-style font list; the real family is last.
        return cmd.split(',')[-1].strip()

    # One data-driven loop instead of three copy-pasted branches; iteration
    # order matches the original serif/sans/mono warning order.
    requested = (('serif', options.serif_family),
                 ('sans', options.sans_family),
                 ('mono', options.mono_family))
    for kind, value in requested:
        if not value:
            continue
        f = family(value)
        fonts[kind] = font_scanner.legacy_fonts_for_family(f)
        if not fonts[kind]:
            logger.warn('Unable to find %s family %s' % (kind, f))
    return fonts
|
||||
|
||||
|
||||
def Book(options, logger, font_delta=0, header=None,
         profile=PRS500_PROFILE, **settings):
    """Create a pylrs Book configured for *profile* plus the fonts to use.

    :param font_delta: adjustment applied to the base font size.
    :param header: optional text placed in a page header block.
    :return: ``(book, fonts)`` where *fonts* maps each generic family to
        font data (custom or the profile's device default).
    :raises ConversionError: when a custom family lacks a 'normal' face.
    """
    from uuid import uuid4
    # Page style: margins/text area in device units.
    ps = {}
    ps['topmargin'] = options.top_margin
    ps['evensidemargin'] = options.left_margin
    ps['oddsidemargin'] = options.left_margin
    ps['textwidth'] = profile.screen_width - (options.left_margin + options.right_margin)
    ps['textheight'] = profile.screen_height - (options.top_margin + options.bottom_margin) \
                            - profile.fudge
    if header:
        hdr = Header()
        hb = TextBlock(textStyle=TextStyle(align='foot',
            fontsize=int(profile.header_font_size*10)),
            blockStyle=BlockStyle(blockwidth=ps['textwidth']))
        hb.append(header)
        hdr.PutObj(hb)
        # The header consumes vertical space, so recompute the text area.
        ps['headheight'] = profile.header_height
        ps['headsep'] = options.header_separation
        ps['header'] = hdr
        ps['topmargin'] = 0
        ps['textheight'] = profile.screen_height - (options.bottom_margin + ps['topmargin']) \
                - ps['headheight'] - ps['headsep'] - profile.fudge

    # NOTE(review): sizes appear to be in tenths of a point (the x10
    # scaling throughout) — confirm against the pylrs docs.
    fontsize = int(10*profile.font_size+font_delta*20)
    baselineskip = fontsize + 20
    fonts = find_custom_fonts(options, logger)
    tsd = dict(fontsize=fontsize,
               parindent=int(10*profile.parindent),
               linespace=int(10*profile.line_space),
               baselineskip=baselineskip,
               wordspace=10*options.wordspace)
    if fonts['serif'] and 'normal' in fonts['serif']:
        tsd['fontfacename'] = fonts['serif']['normal'][1]

    book = _Book(textstyledefault=tsd,
                 pagestyledefault=ps,
                 blockstyledefault=dict(blockwidth=ps['textwidth']),
                 bookid=uuid4().hex,
                 **settings)
    # Embed every custom font and record its file for later measurement.
    for family in fonts.keys():
        if fonts[family]:
            for font in fonts[family].values():
                book.embed_font(*font)
                FONT_FILE_MAP[font[1]] = font[0]

    # Fall back to the device's built-in fonts for unspecified families.
    for family in ['serif', 'sans', 'mono']:
        if not fonts[family]:
            fonts[family] = {'normal' : (None, profile.default_fonts[family])}
        elif 'normal' not in fonts[family]:
            raise ConversionError('Could not find the normal version of the ' + family + ' font')
    return book, fonts
|
||||
33
ebook_converter/ebooks/lrf/fonts.py
Normal file
33
ebook_converter/ebooks/lrf/fonts.py
Normal file
@@ -0,0 +1,33 @@
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
from PIL import ImageFont
|
||||
|
||||
'''
|
||||
Default fonts used in the PRS500
|
||||
'''
|
||||
|
||||
|
||||
# Map the PRS500's built-in font names to the bundled Liberation fonts
# used as substitutes for rendering/measurement.
LIBERATION_FONT_MAP = {
            'Swis721 BT Roman'     : 'LiberationSans-Regular',
            'Dutch801 Rm BT Roman' : 'LiberationSerif-Regular',
            'Courier10 BT Roman'   : 'LiberationMono-Regular',
        }

# Font face name -> font file path, populated as custom fonts are embedded
# (see Book() in calibre.ebooks.lrf).
FONT_FILE_MAP = {}
|
||||
|
||||
|
||||
def get_font(name, size, encoding='unic'):
    '''
    Get an ImageFont object by name.
    @param size: Font height in pixels. To convert from pts:
                 sz in pixels = (dpi/72) * size in pts
    @param encoding: Font encoding to use. E.g. 'unic', 'symbol', 'ADOB', 'ADBE', 'aprm'
    @param manager: A dict that will store the PersistentTemporary
    '''
    # P() resolves a calibre resource path for the bundled Liberation fonts.
    if name in LIBERATION_FONT_MAP:
        return ImageFont.truetype(P('fonts/liberation/%s.ttf' % LIBERATION_FONT_MAP[name]), size, encoding=encoding)
    elif name in FONT_FILE_MAP:
        return ImageFont.truetype(FONT_FILE_MAP[name], size, encoding=encoding)
    # NOTE(review): unknown names fall through to an implicit None —
    # callers presumably only pass known face names; confirm.
|
||||
10
ebook_converter/ebooks/lrf/html/__init__.py
Normal file
10
ebook_converter/ebooks/lrf/html/__init__.py
Normal file
@@ -0,0 +1,10 @@
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
"""
|
||||
This package contains code to convert HTML ebooks to LRF ebooks.
|
||||
"""
|
||||
|
||||
__docformat__ = "epytext"
|
||||
__author__ = "Kovid Goyal <kovid@kovidgoyal.net>"
|
||||
115
ebook_converter/ebooks/lrf/html/color_map.py
Normal file
115
ebook_converter/ebooks/lrf/html/color_map.py
Normal file
@@ -0,0 +1,115 @@
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
import re
|
||||
|
||||
# HTML color name -> '#RRGGBB'. Keys are lowercase; lrs_color() lowercases
# its input before lookup. (Fixed: 'indianred'/'indigo' previously had
# trailing spaces and could never match; 'lime' and 'sandybrown' carried
# the wrong values, copied from lightskyblue and saddlebrown.)
NAME_MAP = {
            'aliceblue': '#F0F8FF',
            'antiquewhite': '#FAEBD7',
            'aqua': '#00FFFF',
            'aquamarine': '#7FFFD4',
            'azure': '#F0FFFF',
            'beige': '#F5F5DC',
            'bisque': '#FFE4C4',
            'black': '#000000',
            'blanchedalmond': '#FFEBCD',
            'blue': '#0000FF',
            'brown': '#A52A2A',
            'burlywood': '#DEB887',
            'cadetblue': '#5F9EA0',
            'chartreuse': '#7FFF00',
            'chocolate': '#D2691E',
            'coral': '#FF7F50',
            'crimson': '#DC143C',
            'cyan': '#00FFFF',
            'darkblue': '#00008B',
            'darkgoldenrod': '#B8860B',
            'darkgreen': '#006400',
            'darkkhaki': '#BDB76B',
            'darkmagenta': '#8B008B',
            'darkolivegreen': '#556B2F',
            'darkorange': '#FF8C00',
            'darkorchid': '#9932CC',
            'darkred': '#8B0000',
            'darksalmon': '#E9967A',
            'darkslateblue': '#483D8B',
            'darkslategrey': '#2F4F4F',
            'darkviolet': '#9400D3',
            'deeppink': '#FF1493',
            'dodgerblue': '#1E90FF',
            'firebrick': '#B22222',
            'floralwhite': '#FFFAF0',
            'forestgreen': '#228B22',
            'fuchsia': '#FF00FF',
            'gainsboro': '#DCDCDC',
            'ghostwhite': '#F8F8FF',
            'gold': '#FFD700',
            'goldenrod': '#DAA520',
            'indianred': '#CD5C5C',
            'indigo': '#4B0082',
            'khaki': '#F0E68C',
            'lavenderblush': '#FFF0F5',
            'lawngreen': '#7CFC00',
            'lightblue': '#ADD8E6',
            'lightcoral': '#F08080',
            'lightgoldenrodyellow': '#FAFAD2',
            'lightgray': '#D3D3D3',
            'lightgrey': '#D3D3D3',
            'lightskyblue': '#87CEFA',
            'lightslategrey': '#778899',
            'lightsteelblue': '#B0C4DE',
            'lime': '#00FF00',
            'linen': '#FAF0E6',
            'magenta': '#FF00FF',
            'maroon': '#800000',
            'mediumaquamarine': '#66CDAA',
            'mediumblue': '#0000CD',
            'mediumorchid': '#BA55D3',
            'mediumpurple': '#9370D8',
            'mediumseagreen': '#3CB371',
            'mediumslateblue': '#7B68EE',
            'midnightblue': '#191970',
            'moccasin': '#FFE4B5',
            'navajowhite': '#FFDEAD',
            'navy': '#000080',
            'oldlace': '#FDF5E6',
            'olive': '#808000',
            'orange': '#FFA500',
            'orangered': '#FF4500',
            'orchid': '#DA70D6',
            'paleturquoise': '#AFEEEE',
            'papayawhip': '#FFEFD5',
            'peachpuff': '#FFDAB9',
            'powderblue': '#B0E0E6',
            'rosybrown': '#BC8F8F',
            'royalblue': '#4169E1',
            'saddlebrown': '#8B4513',
            'sandybrown': '#F4A460',
            'seashell': '#FFF5EE',
            'sienna': '#A0522D',
            'silver': '#C0C0C0',
            'skyblue': '#87CEEB',
            'slategrey': '#708090',
            'snow': '#FFFAFA',
            'springgreen': '#00FF7F',
            'violet': '#EE82EE',
            'yellowgreen': '#9ACD32'
            }

# Fixed: the original pattern used \d{2}, which never matched hex colors
# containing the letters a-f (e.g. '#ff0000').
hex_pat = re.compile(r'#([0-9a-fA-F]{2})([0-9a-fA-F]{2})([0-9a-fA-F]{2})')
rgb_pat = re.compile(r'rgb\(\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*\)', re.IGNORECASE)


def lrs_color(html_color):
    """Convert an HTML color ('#rrggbb', 'rgb(r, g, b)' or a color name)
    into the LRS '0x00RRGGBB' form. Unknown colors map to black."""
    hcol = html_color.lower()
    match = hex_pat.search(hcol)
    if match:
        return '0x00'+match.group(1)+match.group(2)+match.group(3)
    match = rgb_pat.search(hcol)
    if match:
        # %02x zero-pads each channel; the original hex()[2:] produced
        # malformed single-digit components for values below 16.
        # Channels are assumed to be 0-255 as in CSS.
        return '0x00' + ''.join('%02x' % int(match.group(i)) for i in (1, 2, 3))
    if hcol in NAME_MAP:
        return NAME_MAP[hcol].replace('#', '0x00')
    return '0x00000000'
|
||||
1951
ebook_converter/ebooks/lrf/html/convert_from.py
Normal file
1951
ebook_converter/ebooks/lrf/html/convert_from.py
Normal file
File diff suppressed because it is too large
Load Diff
386
ebook_converter/ebooks/lrf/html/table.py
Normal file
386
ebook_converter/ebooks/lrf/html/table.py
Normal file
@@ -0,0 +1,386 @@
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
import math, sys, re, numbers
|
||||
|
||||
from calibre.ebooks.lrf.fonts import get_font
|
||||
from calibre.ebooks.lrf.pylrs.pylrs import TextBlock, Text, CR, Span, \
|
||||
CharButton, Plot, Paragraph, \
|
||||
LrsTextTag
|
||||
from polyglot.builtins import string_or_bytes, range, native_string_type
|
||||
|
||||
|
||||
def ceil(num):
    """Return *num* rounded up to the nearest whole number, as an int."""
    rounded = math.ceil(num)
    return int(rounded)
|
||||
|
||||
|
||||
def print_xml(elem):
    """Debug helper: serialize the pylrs element *elem* to stdout."""
    from calibre.ebooks.lrf.pylrs.pylrs import ElementWriter
    # native_string_type because pylrs expects a native str on both py2/py3.
    elem = elem.toElement(native_string_type('utf8'))
    ew = ElementWriter(elem, sourceEncoding=native_string_type('utf8'))
    ew.write(sys.stdout)
    print()
|
||||
|
||||
|
||||
def cattrs(base, extra):
    """Return a new dict of *base* overlaid with *extra* (extra wins);
    neither input is modified."""
    merged = dict(base)
    merged.update(extra)
    return merged
|
||||
|
||||
|
||||
def tokens(tb):
    '''
    Return the next token. A token is :
    1. A string
    a block of text that has the same style
    '''
    # Tokens yielded are (text, attrs) pairs, (Plot, None) for images, or
    # integer markers with None attrs: 1 for a paragraph-level CR, 2 for a
    # CR nested inside a paragraph. Callers (e.g. Cell.text_block_size)
    # treat integer tokens as para/line breaks.
    def process_element(x, attrs):
        if isinstance(x, CR):
            yield 2, None
        elif isinstance(x, Text):
            yield x.text, cattrs(attrs, {})
        elif isinstance(x, string_or_bytes):
            yield x, cattrs(attrs, {})
        elif isinstance(x, (CharButton, LrsTextTag)):
            # Only the first child of button/text-tag wrappers is inspected.
            if x.contents:
                if hasattr(x.contents[0], 'text'):
                    yield x.contents[0].text, cattrs(attrs, {})
                elif hasattr(x.contents[0], 'attrs'):
                    for z in process_element(x.contents[0], x.contents[0].attrs):
                        yield z
        elif isinstance(x, Plot):
            yield x, None
        elif isinstance(x, Span):
            # Span styling is merged into the inherited attrs for children.
            attrs = cattrs(attrs, x.attrs)
            for y in x.contents:
                for z in process_element(y, attrs):
                    yield z

    for i in tb.contents:
        if isinstance(i, CR):
            yield 1, None
        elif isinstance(i, Paragraph):
            for j in i.contents:
                attrs = {}
                if hasattr(j, 'attrs'):
                    attrs = j.attrs
                for k in process_element(j, attrs):
                    yield k
|
||||
|
||||
|
||||
class Cell(object):
|
||||
|
||||
def __init__(self, conv, tag, css):
|
||||
self.conv = conv
|
||||
self.tag = tag
|
||||
self.css = css
|
||||
self.text_blocks = []
|
||||
self.pwidth = -1.
|
||||
if tag.has_attr('width') and '%' in tag['width']:
|
||||
try:
|
||||
self.pwidth = float(tag['width'].replace('%', ''))
|
||||
except ValueError:
|
||||
pass
|
||||
if 'width' in css and '%' in css['width']:
|
||||
try:
|
||||
self.pwidth = float(css['width'].replace('%', ''))
|
||||
except ValueError:
|
||||
pass
|
||||
if self.pwidth > 100:
|
||||
self.pwidth = -1
|
||||
self.rowspan = self.colspan = 1
|
||||
try:
|
||||
self.colspan = int(tag['colspan']) if tag.has_attr('colspan') else 1
|
||||
self.rowspan = int(tag['rowspan']) if tag.has_attr('rowspan') else 1
|
||||
except:
|
||||
pass
|
||||
|
||||
pp = conv.current_page
|
||||
conv.book.allow_new_page = False
|
||||
conv.current_page = conv.book.create_page()
|
||||
conv.parse_tag(tag, css)
|
||||
conv.end_current_block()
|
||||
for item in conv.current_page.contents:
|
||||
if isinstance(item, TextBlock):
|
||||
self.text_blocks.append(item)
|
||||
conv.current_page = pp
|
||||
conv.book.allow_new_page = True
|
||||
if not self.text_blocks:
|
||||
tb = conv.book.create_text_block()
|
||||
tb.Paragraph(' ')
|
||||
self.text_blocks.append(tb)
|
||||
for tb in self.text_blocks:
|
||||
tb.parent = None
|
||||
tb.objId = 0
|
||||
# Needed as we have to eventually change this BlockStyle's width and
|
||||
# height attributes. This blockstyle may be shared with other
|
||||
# elements, so doing that causes havoc.
|
||||
tb.blockStyle = conv.book.create_block_style()
|
||||
ts = conv.book.create_text_style(**tb.textStyle.attrs)
|
||||
ts.attrs['parindent'] = 0
|
||||
tb.textStyle = ts
|
||||
if ts.attrs['align'] == 'foot':
|
||||
if isinstance(tb.contents[-1], Paragraph):
|
||||
tb.contents[-1].append(' ')
|
||||
|
||||
def pts_to_pixels(self, pts):
|
||||
pts = int(pts)
|
||||
return ceil((float(self.conv.profile.dpi)/72)*(pts/10))
|
||||
|
||||
def minimum_width(self):
|
||||
return max([self.minimum_tb_width(tb) for tb in self.text_blocks])
|
||||
|
||||
def minimum_tb_width(self, tb):
|
||||
ts = tb.textStyle.attrs
|
||||
default_font = get_font(ts['fontfacename'], self.pts_to_pixels(ts['fontsize']))
|
||||
parindent = self.pts_to_pixels(ts['parindent'])
|
||||
mwidth = 0
|
||||
for token, attrs in tokens(tb):
|
||||
font = default_font
|
||||
if isinstance(token, numbers.Integral): # Handle para and line breaks
|
||||
continue
|
||||
if isinstance(token, Plot):
|
||||
return self.pts_to_pixels(token.xsize)
|
||||
ff = attrs.get('fontfacename', ts['fontfacename'])
|
||||
fs = attrs.get('fontsize', ts['fontsize'])
|
||||
if (ff, fs) != (ts['fontfacename'], ts['fontsize']):
|
||||
font = get_font(ff, self.pts_to_pixels(fs))
|
||||
if not token.strip():
|
||||
continue
|
||||
word = token.split()
|
||||
word = word[0] if word else ""
|
||||
width = font.getsize(word)[0]
|
||||
if width > mwidth:
|
||||
mwidth = width
|
||||
return parindent + mwidth + 2
|
||||
|
||||
def text_block_size(self, tb, maxwidth=sys.maxsize, debug=False):
    """Return ``(width, height)`` in pixels needed to render text block
    *tb*, wrapping lines at *maxwidth*.

    Width includes the paragraph indent and a small right margin.
    """
    ts = tb.textStyle.attrs
    default_font = get_font(ts['fontfacename'], self.pts_to_pixels(ts['fontsize']))
    parindent = self.pts_to_pixels(ts['parindent'])
    top, bottom, left, right = 0, 0, parindent, parindent

    def add_word(width, height, left, right, top, bottom, ls, ws):
        # Advance the layout cursor by one word, wrapping to a new
        # line when the word would overflow maxwidth.
        if left + width > maxwidth:
            left = width + ws
            top += ls
            bottom = top+ls if top+ls > bottom else bottom
        else:
            left += (width + ws)
            right = left if left > right else right
            bottom = top+ls if top+ls > bottom else bottom
        return left, right, top, bottom

    for token, attrs in tokens(tb):
        if attrs is None:
            attrs = {}
        font = default_font
        # Line height = baselineskip + linespace (both in deci-points).
        ls = self.pts_to_pixels(attrs.get('baselineskip', ts['baselineskip']))+\
             self.pts_to_pixels(attrs.get('linespace', ts['linespace']))
        ws = self.pts_to_pixels(attrs.get('wordspace', ts['wordspace']))
        if isinstance(token, numbers.Integral):  # Handle para and line breaks
            if top != bottom:  # Previous element not a line break
                top = bottom
            else:
                top += ls
                bottom += ls
            # BUG FIX: was ``int == 1`` which compares the builtin type
            # to 1 and is always False.  Intent (presumably; token values
            # are produced by tokens()): a paragraph break (token == 1)
            # restarts the line at the paragraph indent, a line break at 0.
            left = parindent if token == 1 else 0
            continue
        if isinstance(token, Plot):
            width, height = self.pts_to_pixels(token.xsize), self.pts_to_pixels(token.ysize)
            left, right, top, bottom = add_word(width, height, left, right, top, bottom, height, ws)
            continue
        ff = attrs.get('fontfacename', ts['fontfacename'])
        fs = attrs.get('fontsize', ts['fontsize'])
        if (ff, fs) != (ts['fontfacename'], ts['fontsize']):
            font = get_font(ff, self.pts_to_pixels(fs))
        for word in token.split():
            width, height = font.getsize(word)
            left, right, top, bottom = add_word(width, height, left, right, top, bottom, ls, ws)
    return right+3+max(parindent, 10), bottom
|
||||
|
||||
def text_block_preferred_width(self, tb, debug=False):
    """Width the text block would occupy with no line wrapping at all."""
    width, _height = self.text_block_size(tb, sys.maxsize, debug=debug)
    return width
|
||||
|
||||
def preferred_width(self, debug=False):
    """Largest unwrapped width over all text blocks, rounded up."""
    widest = max(self.text_block_preferred_width(tb, debug=debug)
                 for tb in self.text_blocks)
    return ceil(widest)
|
||||
|
||||
def height(self, width):
    """Total height of all text blocks laid out at the given *width*."""
    return sum(self.text_block_size(tb, width)[1] for tb in self.text_blocks)
|
||||
|
||||
|
||||
class Row(object):
    """A single table row: an ordered list of Cell objects plus the
    anchor names/ids found anywhere in the row (for link targets)."""

    def __init__(self, conv, row, css, colpad):
        self.cells = []
        self.colpad = colpad
        cells = row.findAll(re.compile('td|th', re.IGNORECASE))
        self.targets = []
        for cell in cells:
            # Cascade the row's CSS onto each cell before wrapping it.
            ccss = conv.tag_css(cell, css)[0]
            self.cells.append(Cell(conv, cell, ccss))
        for a in row.findAll(id=True) + row.findAll(name=True):
            name = a['name'] if a.has_attr('name') else a['id'] if a.has_attr('id') else None
            if name is not None:
                self.targets.append(name.replace('#', ''))

    def number_of_cells(self):
        '''Number of cells in this row. Respects colspan'''
        ans = 0
        for cell in self.cells:
            ans += cell.colspan
        return ans

    def height(self, widths):
        # Row height is the tallest cell; each cell is given the sum of
        # the column widths it spans.
        i, heights = 0, []
        for cell in self.cells:
            width = sum(widths[i:i+cell.colspan])
            heights.append(cell.height(width))
            i += cell.colspan
        if not heights:
            return 0
        return max(heights)

    def cell_from_index(self, col):
        # Map a column index to the cell occupying it (colspan-aware).
        # If col is past the end of the row, the last cell is returned;
        # None only when the row has no cells at all.
        i = -1
        cell = None
        for cell in self.cells:
            for k in range(0, cell.colspan):
                if i == col:
                    break
                i += 1
            if i == col:
                break
        return cell

    def minimum_width(self, col):
        # Minimum width of the cell occupying column *col* (0 if none).
        cell = self.cell_from_index(col)
        if not cell:
            return 0
        return cell.minimum_width()

    def preferred_width(self, col):
        # Preferred width of the cell in column *col*; spanning cells do
        # not constrain a single column, so they report 0.
        cell = self.cell_from_index(col)
        if not cell:
            return 0
        return 0 if cell.colspan > 1 else cell.preferred_width()

    def width_percent(self, col):
        # Explicit percentage width of the cell in column *col*, or -1
        # when unknown / the cell spans multiple columns.
        cell = self.cell_from_index(col)
        if not cell:
            return -1
        return -1 if cell.colspan > 1 else cell.pwidth

    def cell_iterator(self):
        # Yield the cells in document order.
        for c in self.cells:
            yield c
|
||||
|
||||
|
||||
class Table(object):
    """Layout engine for an HTML <table>: computes column widths and row
    heights, then yields positioned LRF text blocks for rendering."""

    def __init__(self, conv, table, css, rowpad=10, colpad=10):
        self.rows = []
        self.conv = conv
        self.rowpad = rowpad
        self.colpad = colpad
        rows = table.findAll('tr')
        # Flag the converter so nested processing knows it is in a table.
        conv.in_table = True
        for row in rows:
            rcss = conv.tag_css(row, css)[0]
            self.rows.append(Row(conv, row, rcss, colpad))
        conv.in_table = False

    def number_of_columns(self):
        # Column count is the widest row (colspan-aware).
        # NOTE: the local name shadows the builtin max(); kept as-is.
        max = 0
        for row in self.rows:
            max = row.number_of_cells() if row.number_of_cells() > max else max
        return max

    def number_or_rows(self):
        # NOTE: historical typo for "number_of_rows"; name kept for callers.
        return len(self.rows)

    def height(self, maxwidth):
        ''' Return row heights + self.rowpad'''
        widths = self.get_widths(maxwidth)
        return sum([row.height(widths) + self.rowpad for row in self.rows]) - self.rowpad

    def minimum_width(self, col):
        # Largest minimum width of column *col* over all rows.
        return max([row.minimum_width(col) for row in self.rows])

    def width_percent(self, col):
        # Largest explicit percentage width of column *col* (-1 if none).
        return max([row.width_percent(col) for row in self.rows])

    def get_widths(self, maxwidth):
        '''
        Return widths of columns + self.colpad
        '''
        rows, cols = self.number_or_rows(), self.number_of_columns()
        widths = list(range(cols))
        # Start each column from its preferred (unwrapped) width.
        for c in range(cols):
            cellwidths = [0 for i in range(rows)]
            for r in range(rows):
                try:
                    cellwidths[r] = self.rows[r].preferred_width(c)
                except IndexError:
                    continue
            widths[c] = max(cellwidths)

        min_widths = [self.minimum_width(i)+10 for i in range(cols)]
        # Honour explicit percentage widths, but never go below minimum.
        for i in range(len(widths)):
            wp = self.width_percent(i)
            if wp >= 0:
                widths[i] = max(min_widths[i], ceil((wp/100) * (maxwidth - (cols-1)*self.colpad)))

        itercount = 0

        # Shrink columns by 5% per pass (never below minimum) until the
        # table fits, giving up after 100 iterations.
        while sum(widths) > maxwidth-((len(widths)-1)*self.colpad) and itercount < 100:
            for i in range(cols):
                widths[i] = ceil((95/100)*widths[i]) if \
                    ceil((95/100)*widths[i]) >= min_widths[i] else widths[i]
            itercount += 1

        return [i+self.colpad for i in widths]

    def blocks(self, maxwidth, maxheight):
        # Generator of (text_block, x, y, delta, targets) tuples.  A
        # leading (None, 0, row_height, 0, targets) tuple announces each
        # row.  NOTE: maxheight is currently unused here.
        rows, cols = self.number_or_rows(), self.number_of_columns()
        cellmatrix = [[None for c in range(cols)] for r in range(rows)]
        # rowpos tracks the next free column in each row, accounting for
        # rowspans from the rows above.
        rowpos = [0 for i in range(rows)]
        for r in range(rows):
            nc = self.rows[r].cell_iterator()
            try:
                while True:
                    cell = next(nc)
                    cellmatrix[r][rowpos[r]] = cell
                    rowpos[r] += cell.colspan
                    for k in range(1, cell.rowspan):
                        try:
                            rowpos[r+k] += 1
                        except IndexError:
                            break
            except StopIteration:  # No more cells in this row
                continue

        widths = self.get_widths(maxwidth)
        heights = [row.height(widths) for row in self.rows]

        xpos = [sum(widths[:i]) for i in range(cols)]
        # delta is the leftover horizontal space (never negative).
        delta = maxwidth - sum(widths)
        if delta < 0:
            delta = 0
        for r in range(len(cellmatrix)):
            yield None, 0, heights[r], 0, self.rows[r].targets
            for c in range(len(cellmatrix[r])):
                cell = cellmatrix[r][c]
                if not cell:
                    continue
                width = sum(widths[c:c+cell.colspan])-self.colpad*cell.colspan
                sypos = 0
                for tb in cell.text_blocks:
                    # Each text block gets a fixed-size block style sized
                    # to its cell's column span.
                    tb.blockStyle = self.conv.book.create_block_style(
                        blockwidth=width,
                        blockheight=cell.text_block_size(tb, width)[1],
                        blockrule='horz-fixed')

                    yield tb, xpos[c], sypos, delta, None
                    sypos += tb.blockStyle.attrs['blockheight']
|
||||
7
ebook_converter/ebooks/lrf/pylrs/__init__.py
Normal file
7
ebook_converter/ebooks/lrf/pylrs/__init__.py
Normal file
@@ -0,0 +1,7 @@
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
"""
|
||||
This package contains code to generate ebooks in the SONY LRS/LRF format. It was
|
||||
originally developed by Mike Higgins and has been extended and modified by Kovid
|
||||
Goyal.
|
||||
"""
|
||||
78
ebook_converter/ebooks/lrf/pylrs/elements.py
Normal file
78
ebook_converter/ebooks/lrf/pylrs/elements.py
Normal file
@@ -0,0 +1,78 @@
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
""" elements.py -- replacements and helpers for ElementTree """
|
||||
|
||||
from polyglot.builtins import unicode_type, string_or_bytes
|
||||
|
||||
|
||||
class ElementWriter(object):
    """Serialise an ElementTree element to XML text.

    Handles cdata/attribute escaping, nested elements, tails, and an
    optional XML declaration header.
    """

    def __init__(self, e, header=False, sourceEncoding="ascii",
                 spaceBeforeClose=True, outputEncodingName="UTF-16"):
        self.header = header
        self.e = e
        self.sourceEncoding = sourceEncoding
        self.spaceBeforeClose = spaceBeforeClose
        self.outputEncodingName = outputEncodingName

    def _encodeCdata(self, rawText):
        # Escape XML special characters in character data.
        # BUG FIX: the replacements had been reduced to no-ops
        # (e.g. replace("&", "&")); restore the proper XML entities.
        if isinstance(rawText, bytes):
            rawText = rawText.decode(self.sourceEncoding)

        text = rawText.replace("&", "&amp;")
        text = text.replace("<", "&lt;")
        text = text.replace(">", "&gt;")
        return text

    def _writeAttribute(self, f, name, value):
        # Write one ` name="value"` pair with full escaping.
        f.write(' %s="' % str(name))
        if not isinstance(value, (str, bytes)):
            value = str(value)
        value = self._encodeCdata(value)
        # BUG FIX: was a no-op replace('"', '"'); quotes must become &quot;
        # inside a double-quoted attribute value.
        value = value.replace('"', '&quot;')
        f.write(value)
        f.write('"')

    def _writeText(self, f, rawText):
        text = self._encodeCdata(rawText)
        f.write(text)

    def _write(self, f, e):
        # Recursively serialise element *e* and its subtree.
        f.write('<' + str(e.tag))

        # BUG FIX: dict views have no .sort() on Python 3; sort the
        # attribute pairs explicitly for deterministic output.
        for name, value in sorted(e.items()):
            self._writeAttribute(f, name, value)

        if e.text is not None or len(e) > 0:
            f.write('>')

            if e.text:
                self._writeText(f, e.text)

            for e2 in e:
                self._write(f, e2)

            f.write('</%s>' % e.tag)
        else:
            # Empty element: `<tag />` (space configurable).
            if self.spaceBeforeClose:
                f.write(' ')
            f.write('/>')

        if e.tail is not None:
            self._writeText(f, e.tail)

    def toString(self):
        """Return the serialised document as a single string."""
        class x:
            pass
        buffer = []
        x.write = buffer.append
        self.write(x)
        return ''.join(buffer)

    def write(self, f):
        """Write the document to the file-like object *f*."""
        if self.header:
            f.write('<?xml version="1.0" encoding="%s"?>\n' % self.outputEncodingName)

        self._write(f, self.e)
|
||||
773
ebook_converter/ebooks/lrf/pylrs/pylrf.py
Normal file
773
ebook_converter/ebooks/lrf/pylrs/pylrf.py
Normal file
@@ -0,0 +1,773 @@
|
||||
#!/usr/bin/env python2
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
"""
|
||||
pylrf.py -- very low level interface to create lrf files. See pylrs for
|
||||
higher level interface that can use this module to render books to lrf.
|
||||
"""
|
||||
import struct
|
||||
import zlib
|
||||
import io
|
||||
import codecs
|
||||
import os
|
||||
|
||||
from .pylrfopt import tagListOptimizer
|
||||
from polyglot.builtins import iteritems, string_or_bytes, unicode_type
|
||||
|
||||
PYLRF_VERSION = "1.0"
|
||||
|
||||
#
|
||||
# Acknowledgement:
|
||||
# This software would not have been possible without the pioneering
|
||||
# efforts of the author of lrf2lrs.py, Igor Skochinsky.
|
||||
#
|
||||
# Copyright (c) 2007 Mike Higgins (Falstaff)
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a
|
||||
# copy of this software and associated documentation files (the "Software"),
|
||||
# to deal in the Software without restriction, including without limitation
|
||||
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
# and/or sell copies of the Software, and to permit persons to whom the
|
||||
# Software is furnished to do so, subject to the following conditions:
|
||||
|
||||
# The above copyright notice and this permission notice shall be included in
|
||||
# all copies or substantial portions of the Software.
|
||||
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
# DEALINGS IN THE SOFTWARE.
|
||||
|
||||
#
|
||||
# Change History:
|
||||
#
|
||||
# V1.0 06 Feb 2007
|
||||
# Initial Release.
|
||||
|
||||
#
|
||||
# Current limitations and bugs:
|
||||
# Never "scrambles" any streams (even if asked to). This does not seem
|
||||
# to hurt anything.
|
||||
#
|
||||
# Not based on any official documentation, so many assumptions had to be made.
|
||||
#
|
||||
# Can be used to create lrf files that can lock up an eBook reader.
|
||||
# This is your only warning.
|
||||
#
|
||||
# Unsupported objects: Canvas, Window, PopUpWindow, Sound, Import,
|
||||
# SoundStream, ObjectInfo
|
||||
#
|
||||
# The only button type supported is JumpButton.
|
||||
#
|
||||
# Unsupported tags: SoundStop, Wait, pos on BlockSpace (and those used by
|
||||
# unsupported objects).
|
||||
#
|
||||
# Tags supporting Japanese text and Asian layout have not been tested.
|
||||
#
|
||||
# Tested on Python 2.4 and 2.5, Windows XP and Sony PRS-500.
|
||||
#
|
||||
# Commented even less than pylrs, but not very useful when called directly,
|
||||
# anyway.
|
||||
#
|
||||
|
||||
|
||||
class LrfError(Exception):
    """Raised for any error while building or writing an LRF file."""
    pass
|
||||
|
||||
|
||||
def writeByte(f, byte):
    """Write *byte* as a single unsigned byte."""
    packed = struct.pack("<B", byte)
    f.write(packed)
|
||||
|
||||
|
||||
def writeWord(f, word):
    """Write *word* as an unsigned little-endian 16-bit value.

    Raises LrfError when the value does not fit in a word.
    """
    value = int(word)
    if value > 65535:
        raise LrfError('Cannot encode a number greater than 65535 in a word.')
    if value < 0:
        raise LrfError('Cannot encode a number < 0 in a word: '+unicode_type(word))
    f.write(struct.pack("<H", value))
|
||||
|
||||
|
||||
def writeSignedWord(f, sword):
    """Write *sword* (possibly a numeric string) as a signed 16-bit LE value."""
    value = int(float(sword))
    f.write(struct.pack("<h", value))
|
||||
|
||||
|
||||
def writeWords(f, *words):
    """Write each value in *words* as an unsigned 16-bit LE integer."""
    fmt = "<%dH" % len(words)
    f.write(struct.pack(fmt, *words))
|
||||
|
||||
|
||||
def writeDWord(f, dword):
    """Write *dword* as an unsigned 32-bit LE integer."""
    value = int(dword)
    f.write(struct.pack("<I", value))
|
||||
|
||||
|
||||
def writeDWords(f, *dwords):
    """Write each value in *dwords* as an unsigned 32-bit LE integer."""
    fmt = "<%dI" % len(dwords)
    f.write(struct.pack(fmt, *dwords))
|
||||
|
||||
|
||||
def writeQWord(f, qword):
    """Write *qword* as an unsigned 64-bit LE integer."""
    packed = struct.pack("<Q", qword)
    f.write(packed)
|
||||
|
||||
|
||||
def writeZeros(f, nZeros):
    """Write *nZeros* NUL bytes of padding."""
    padding = b"\0" * nZeros
    f.write(padding)
|
||||
|
||||
|
||||
def writeString(f, s):
    # Raw pass-through: *s* must already be bytes in the target encoding.
    f.write(s)
|
||||
|
||||
|
||||
def writeIdList(f, idList):
    # Object-id list: a 16-bit count followed by 32-bit ids.
    writeWord(f, len(idList))
    writeDWords(f, *idList)
|
||||
|
||||
|
||||
def writeColor(f, color):
    """Write a colour given as a numeric string (e.g. '0x00FF0000'),
    stored big-endian."""
    # TODO: allow color names, web format
    value = int(color, 0)
    f.write(struct.pack(">I", value))
|
||||
|
||||
|
||||
def writeLineWidth(f, width):
    # Line widths are encoded as a plain unsigned word.
    writeWord(f, int(width))
|
||||
|
||||
|
||||
def writeUnicode(f, string, encoding):
    """Write *string* as UTF-16-LE, prefixed with its byte length as a word."""
    text = string.decode(encoding) if isinstance(string, bytes) else string
    data = text.encode("utf-16-le")
    length = len(data)
    if length > 65535:
        raise LrfError('Cannot write strings longer than 65535 characters.')
    writeWord(f, length)
    writeString(f, data)
|
||||
|
||||
|
||||
def writeRaw(f, string, encoding):
    """Write *string* as UTF-16-LE with no length prefix."""
    text = string.decode(encoding) if isinstance(string, bytes) else string
    writeString(f, text.encode("utf-16-le"))
|
||||
|
||||
|
||||
def writeRubyAA(f, rubyAA):
    """Encode a (ruby-align, ruby-adjust) pair into one packed word."""
    ralign, radjust = rubyAA
    adjust_code = {"line-edge": 0x10, "none": 0}[radjust]
    align_code = {"start": 1, "center": 2}[ralign]
    writeWord(f, align_code | adjust_code)
|
||||
|
||||
|
||||
def writeBgImage(f, bgInfo):
    """Write a background-image reference: mode word then image object id."""
    imode, iid = bgInfo
    mode_code = {"pfix": 0, "fix": 1, "tile": 2, "centering": 3}[imode]
    writeWord(f, mode_code)
    writeDWord(f, iid)
|
||||
|
||||
|
||||
def writeEmpDots(f, dotsInfo, encoding):
    """Write emphasis-dots info: font object ref, face-name tag, char code."""
    ref_font, face_name, char_code = dotsInfo
    writeDWord(f, ref_font)
    LrfTag("fontfacename", face_name).write(f, encoding)
    writeWord(f, int(char_code, 0))
|
||||
|
||||
|
||||
def writeRuledLine(f, lineInfo):
    """Write a ruled-line spec: length, type code, width, then colour."""
    length, line_type, width, color = lineInfo
    writeWord(f, length)
    writeWord(f, LINE_TYPE_ENCODING[line_type])
    writeWord(f, width)
    writeColor(f, color)
|
||||
|
||||
|
||||
LRF_SIGNATURE = b"L\x00R\x00F\x00\x00\x00"
|
||||
|
||||
# XOR_KEY = 48
|
||||
XOR_KEY = 65024 # that's what lrf2lrs says -- not used, anyway...
|
||||
|
||||
LRF_VERSION = 1000 # is 999 for librie? lrf2lrs uses 1000
|
||||
|
||||
IMAGE_TYPE_ENCODING = dict(GIF=0x14, PNG=0x12, BMP=0x13, JPEG=0x11, JPG=0x11)
|
||||
|
||||
OBJECT_TYPE_ENCODING = dict(
|
||||
PageTree=0x01,
|
||||
Page=0x02,
|
||||
Header=0x03,
|
||||
Footer=0x04,
|
||||
PageAtr=0x05, PageStyle=0x05,
|
||||
Block=0x06,
|
||||
BlockAtr=0x07, BlockStyle=0x07,
|
||||
MiniPage=0x08,
|
||||
TextBlock=0x0A, Text=0x0A,
|
||||
TextAtr=0x0B, TextStyle=0x0B,
|
||||
ImageBlock=0x0C, Image=0x0C,
|
||||
Canvas=0x0D,
|
||||
ESound=0x0E,
|
||||
ImageStream=0x11,
|
||||
Import=0x12,
|
||||
Button=0x13,
|
||||
Window=0x14,
|
||||
PopUpWindow=0x15,
|
||||
Sound=0x16,
|
||||
SoundStream=0x17,
|
||||
Font=0x19,
|
||||
ObjectInfo=0x1A,
|
||||
BookAtr=0x1C, BookStyle=0x1C,
|
||||
SimpleTextBlock=0x1D,
|
||||
TOC=0x1E
|
||||
)
|
||||
|
||||
LINE_TYPE_ENCODING = {
|
||||
'none':0, 'solid':0x10, 'dashed':0x20, 'double':0x30, 'dotted':0x40
|
||||
}
|
||||
|
||||
BINDING_DIRECTION_ENCODING = dict(Lr=1, Rl=16)
|
||||
|
||||
|
||||
TAG_INFO = dict(
|
||||
rawtext=(0, writeRaw),
|
||||
ObjectStart=(0xF500, "<IH"),
|
||||
ObjectEnd=(0xF501,),
|
||||
# InfoLink (0xF502)
|
||||
Link=(0xF503, "<I"),
|
||||
StreamSize=(0xF504, writeDWord),
|
||||
StreamData=(0xF505, writeString),
|
||||
StreamEnd=(0xF506,),
|
||||
oddheaderid=(0xF507, writeDWord),
|
||||
evenheaderid=(0xF508, writeDWord),
|
||||
oddfooterid=(0xF509, writeDWord),
|
||||
evenfooterid=(0xF50A, writeDWord),
|
||||
ObjectList=(0xF50B, writeIdList),
|
||||
fontsize=(0xF511, writeSignedWord),
|
||||
fontwidth=(0xF512, writeSignedWord),
|
||||
fontescapement=(0xF513, writeSignedWord),
|
||||
fontorientation=(0xF514, writeSignedWord),
|
||||
fontweight=(0xF515, writeWord),
|
||||
fontfacename=(0xF516, writeUnicode),
|
||||
textcolor=(0xF517, writeColor),
|
||||
textbgcolor=(0xF518, writeColor),
|
||||
wordspace=(0xF519, writeSignedWord),
|
||||
letterspace=(0xF51A, writeSignedWord),
|
||||
baselineskip=(0xF51B, writeSignedWord),
|
||||
linespace=(0xF51C, writeSignedWord),
|
||||
parindent=(0xF51D, writeSignedWord),
|
||||
parskip=(0xF51E, writeSignedWord),
|
||||
# F51F, F520
|
||||
topmargin=(0xF521, writeWord),
|
||||
headheight=(0xF522, writeWord),
|
||||
headsep=(0xF523, writeWord),
|
||||
oddsidemargin=(0xF524, writeWord),
|
||||
textheight=(0xF525, writeWord),
|
||||
textwidth=(0xF526, writeWord),
|
||||
canvaswidth=(0xF551, writeWord),
|
||||
canvasheight=(0xF552, writeWord),
|
||||
footspace=(0xF527, writeWord),
|
||||
footheight=(0xF528, writeWord),
|
||||
bgimage=(0xF529, writeBgImage),
|
||||
setemptyview=(0xF52A, {'show':1, 'empty':0}, writeWord),
|
||||
pageposition=(0xF52B, {'any':0,'upper':1, 'lower':2}, writeWord),
|
||||
evensidemargin=(0xF52C, writeWord),
|
||||
framemode=(0xF52E,
|
||||
{'None':0, 'curve':2, 'square':1}, writeWord),
|
||||
blockwidth=(0xF531, writeWord),
|
||||
blockheight=(0xF532, writeWord),
|
||||
blockrule=(0xF533, {"horz-fixed":0x14, "horz-adjustable":0x12,
|
||||
"vert-fixed":0x41, "vert-adjustable":0x21,
|
||||
"block-fixed":0x44, "block-adjustable":0x22},
|
||||
writeWord),
|
||||
bgcolor=(0xF534, writeColor),
|
||||
layout=(0xF535, {'TbRl':0x41, 'LrTb':0x34}, writeWord),
|
||||
framewidth=(0xF536, writeWord),
|
||||
framecolor=(0xF537, writeColor),
|
||||
topskip=(0xF538, writeWord),
|
||||
sidemargin=(0xF539, writeWord),
|
||||
footskip=(0xF53A, writeWord),
|
||||
align=(0xF53C, {'head':1, 'center':4, 'foot':8}, writeWord),
|
||||
column=(0xF53D, writeWord),
|
||||
columnsep=(0xF53E, writeSignedWord),
|
||||
minipagewidth=(0xF541, writeWord),
|
||||
minipageheight=(0xF542, writeWord),
|
||||
yspace=(0xF546, writeWord),
|
||||
xspace=(0xF547, writeWord),
|
||||
PutObj=(0xF549, "<HHI"),
|
||||
ImageRect=(0xF54A, "<HHHH"),
|
||||
ImageSize=(0xF54B, "<HH"),
|
||||
RefObjId=(0xF54C, "<I"),
|
||||
PageDiv=(0xF54E, "<HIHI"),
|
||||
StreamFlags=(0xF554, writeWord),
|
||||
Comment=(0xF555, writeUnicode),
|
||||
FontFilename=(0xF559, writeUnicode),
|
||||
PageList=(0xF55C, writeIdList),
|
||||
FontFacename=(0xF55D, writeUnicode),
|
||||
buttonflags=(0xF561, writeWord),
|
||||
PushButtonStart=(0xF566,),
|
||||
PushButtonEnd=(0xF567,),
|
||||
buttonactions=(0xF56A,),
|
||||
endbuttonactions=(0xF56B,),
|
||||
jumpto=(0xF56C, "<II"),
|
||||
RuledLine=(0xF573, writeRuledLine),
|
||||
rubyaa=(0xF575, writeRubyAA),
|
||||
rubyoverhang=(0xF576, {'none':0, 'auto':1}, writeWord),
|
||||
empdotsposition=(0xF577, {'before':1, 'after':2}, writeWord),
|
||||
empdots=(0xF578, writeEmpDots),
|
||||
emplineposition=(0xF579, {'before':1, 'after':2}, writeWord),
|
||||
emplinetype=(0xF57A, LINE_TYPE_ENCODING, writeWord),
|
||||
ChildPageTree=(0xF57B, "<I"),
|
||||
ParentPageTree=(0xF57C, "<I"),
|
||||
Italic=(0xF581,),
|
||||
ItalicEnd=(0xF582,),
|
||||
pstart=(0xF5A1, writeDWord), # what goes in the dword? refesound
|
||||
pend=(0xF5A2,),
|
||||
CharButton=(0xF5A7, writeDWord),
|
||||
CharButtonEnd=(0xF5A8,),
|
||||
Rubi=(0xF5A9,),
|
||||
RubiEnd=(0xF5AA,),
|
||||
Oyamoji=(0xF5AB,),
|
||||
OyamojiEnd=(0xF5AC,),
|
||||
Rubimoji=(0xF5AD,),
|
||||
RubimojiEnd=(0xF5AE,),
|
||||
Yoko=(0xF5B1,),
|
||||
YokoEnd=(0xF5B2,),
|
||||
Tate=(0xF5B3,),
|
||||
TateEnd=(0xF5B4,),
|
||||
Nekase=(0xF5B5,),
|
||||
NekaseEnd=(0xF5B6,),
|
||||
Sup=(0xF5B7,),
|
||||
SupEnd=(0xF5B8,),
|
||||
Sub=(0xF5B9,),
|
||||
SubEnd=(0xF5BA,),
|
||||
NoBR=(0xF5BB,),
|
||||
NoBREnd=(0xF5BC,),
|
||||
EmpDots=(0xF5BD,),
|
||||
EmpDotsEnd=(0xF5BE,),
|
||||
EmpLine=(0xF5C1,),
|
||||
EmpLineEnd=(0xF5C2,),
|
||||
DrawChar=(0xF5C3, '<H'),
|
||||
DrawCharEnd=(0xF5C4,),
|
||||
Box=(0xF5C6, LINE_TYPE_ENCODING, writeWord),
|
||||
BoxEnd=(0xF5C7,),
|
||||
Space=(0xF5CA, writeSignedWord),
|
||||
textstring=(0xF5CC, writeUnicode),
|
||||
Plot=(0xF5D1, "<HHII"),
|
||||
CR=(0xF5D2,),
|
||||
RegisterFont=(0xF5D8, writeDWord),
|
||||
setwaitprop=(0xF5DA, {'replay':1, 'noreplay':2}, writeWord),
|
||||
charspace=(0xF5DD, writeSignedWord),
|
||||
textlinewidth=(0xF5F1, writeLineWidth),
|
||||
linecolor=(0xF5F2, writeColor)
|
||||
)
|
||||
|
||||
|
||||
class ObjectTableEntry(object):
    """One record of the LRF object table: object id, byte offset, size."""

    def __init__(self, objId, offset, size):
        self.objId = objId
        self.offset = offset
        self.size = size

    def write(self, f):
        # Four little-endian dwords; the trailing field is reserved/zero.
        writeDWords(f, self.objId, self.offset, self.size, 0)
|
||||
|
||||
|
||||
class LrfTag(object):
    """A single LRF tag: a 16-bit tag id plus at most one parameter,
    encoded according to its TAG_INFO entry."""

    def __init__(self, name, *parameters):
        try:
            tagInfo = TAG_INFO[name]
        except KeyError:
            raise LrfError("tag name %s not recognized" % name)

        self.name = name
        self.type = tagInfo[0]      # 16-bit tag id; 0 means raw text (no id written)
        self.format = tagInfo[1:]   # encoder pipeline: dicts, struct formats, callables

        if len(parameters) > 1:
            raise LrfError("only one parameter allowed on tag %s" % name)

        if len(parameters) == 0:
            self.parameter = None
        else:
            self.parameter = parameters[0]

    def write(self, lrf, encoding=None):
        """Write the tag id and its encoded parameter to *lrf*.

        *encoding* is required by the text encoders
        (writeUnicode / writeRaw / writeEmpDots).
        """
        if self.type != 0:
            writeWord(lrf, self.type)

        p = self.parameter
        if p is None:
            return

        # print " Writing tag", self.name
        for f in self.format:
            if isinstance(f, dict):
                # Symbolic value -> numeric code lookup.
                p = f[p]
            elif isinstance(f, string_or_bytes):
                # struct format string; tuple parameters are unpacked.
                if isinstance(p, tuple):
                    writeString(lrf, struct.pack(f, *p))
                else:
                    writeString(lrf, struct.pack(f, p))
            else:
                # Writer callable; some require the source encoding.
                if f in [writeUnicode, writeRaw, writeEmpDots]:
                    if encoding is None:
                        raise LrfError("Tag requires encoding")
                    f(lrf, p, encoding)
                else:
                    f(lrf, p)
|
||||
|
||||
|
||||
STREAM_SCRAMBLED = 0x200
|
||||
STREAM_COMPRESSED = 0x100
|
||||
STREAM_FORCE_COMPRESSED = 0x8100
|
||||
STREAM_TOC = 0x0051
|
||||
|
||||
|
||||
class LrfStreamBase(object):
    """Base class for LRF streams: holds flags plus a raw payload and
    emits the Stream* tag sequence (optionally zlib-compressed)."""

    def __init__(self, streamFlags, streamData=None):
        self.streamFlags = streamFlags
        self.streamData = streamData

    def setStreamData(self, streamData):
        self.streamData = streamData

    def getStreamTags(self, optimize=False):
        """Return the tag list for this stream.  With *optimize*, skip
        compression when it would not actually shrink the payload."""
        # tags:
        #   StreamFlags
        #   StreamSize
        #   StreamStart
        #   (data)
        #   StreamEnd
        #
        # if flags & 0x200, stream is scrambled
        # if flags & 0x100, stream is compressed

        flags = self.streamFlags
        streamBuffer = self.streamData

        # implement scramble? I never scramble anything...

        if flags & STREAM_FORCE_COMPRESSED == STREAM_FORCE_COMPRESSED:
            # Forced compression disables the "only if smaller" check.
            optimize = False

        if flags & STREAM_COMPRESSED == STREAM_COMPRESSED:
            uncompLen = len(streamBuffer)
            compStreamBuffer = zlib.compress(streamBuffer)
            if optimize and uncompLen <= len(compStreamBuffer) + 4:
                # Compression didn't help; clear the flag instead.
                flags &= ~STREAM_COMPRESSED
            else:
                # Compressed payload is prefixed with the uncompressed size.
                streamBuffer = struct.pack("<I", uncompLen) + compStreamBuffer

        return [LrfTag("StreamFlags", flags & 0x01FF),
                LrfTag("StreamSize", len(streamBuffer)),
                LrfTag("StreamData", streamBuffer),
                LrfTag("StreamEnd")]
|
||||
|
||||
|
||||
class LrfTagStream(LrfStreamBase):
    """A stream whose payload is built by serialising a list of LrfTags."""

    def __init__(self, streamFlags, streamTags=None):
        LrfStreamBase.__init__(self, streamFlags)
        if streamTags is None:
            self.tags = []
        else:
            # Copy so later mutation of the caller's list has no effect.
            self.tags = streamTags[:]

    def appendLrfTag(self, tag):
        self.tags.append(tag)

    def getStreamTags(self, encoding,
            optimizeTags=False, optimizeCompression=False):
        """Serialise the collected tags into streamData, then delegate to
        the base class for the Stream* tag wrapping."""
        stream = io.BytesIO()
        if optimizeTags:
            tagListOptimizer(self.tags)

        for tag in self.tags:
            tag.write(stream, encoding)

        self.streamData = stream.getvalue()
        stream.close()
        return LrfStreamBase.getStreamTags(self, optimize=optimizeCompression)
|
||||
|
||||
|
||||
class LrfFileStream(LrfStreamBase):
    """A stream whose payload is the contents of a file on disk."""

    def __init__(self, streamFlags, filename):
        with open(filename, "rb") as f:
            data = f.read()
        LrfStreamBase.__init__(self, streamFlags, data)
|
||||
|
||||
|
||||
class LrfObject(object):
    """A single LRF object: a typed, id'd container of LrfTags."""

    def __init__(self, name, objId):
        if objId <= 0:
            raise LrfError("invalid objId for " + name)

        self.name = name
        self.objId = objId
        self.tags = []
        try:
            self.type = OBJECT_TYPE_ENCODING[name]
        except KeyError:
            raise LrfError("object name %s not recognized" % name)

    def __str__(self):
        return 'LRFObject: ' + self.name + ", " + unicode_type(self.objId)

    def appendLrfTag(self, tag):
        self.tags.append(tag)

    def appendLrfTags(self, tagList):
        self.tags.extend(tagList)

    # deprecated old name
    append = appendLrfTag

    def appendTagDict(self, tagDict, genClass=None):
        """Append tags from a name->value dict, combining the attributes
        that map onto composite LRF tags (ruby, bgimage, empdots)."""
        #
        # This code does not really belong here, I think. But it
        # belongs somewhere, so here it is.
        #
        composites = {}
        for name, value in iteritems(tagDict):
            if name == 'rubyAlignAndAdjust':
                continue
            if name in {
                    "bgimagemode", "bgimageid", "rubyalign", "rubyadjust",
                    "empdotscode", "empdotsfontname", "refempdotsfont"}:
                # Collected here, emitted below as combined tags.
                composites[name] = value
            else:
                self.append(LrfTag(name, value))

        if "rubyalign" in composites or "rubyadjust" in composites:
            ralign = composites.get("rubyalign", "none")
            radjust = composites.get("rubyadjust", "start")
            self.append(LrfTag("rubyaa", (ralign, radjust)))

        if "bgimagemode" in composites or "bgimageid" in composites:
            imode = composites.get("bgimagemode", "fix")
            iid = composites.get("bgimageid", 0)

            # for some reason, page style uses 0 for "fix"
            # we call this pfix to differentiate it
            if genClass == "PageStyle" and imode == "fix":
                imode = "pfix"

            self.append(LrfTag("bgimage", (imode, iid)))

        if "empdotscode" in composites or "empdotsfontname" in composites or \
                "refempdotsfont" in composites:
            dotscode = composites.get("empdotscode", "0x002E")
            dotsfontname = composites.get("empdotsfontname",
                    "Dutch801 Rm BT Roman")
            refdotsfont = composites.get("refempdotsfont", 0)
            self.append(LrfTag("empdots", (refdotsfont, dotsfontname,
                                           dotscode)))

    def write(self, lrf, encoding=None):
        """Write ObjectStart, all collected tags, then ObjectEnd."""
        # print "Writing object", self.name
        LrfTag("ObjectStart", (self.objId, self.type)).write(lrf)

        for tag in self.tags:
            tag.write(lrf, encoding)

        LrfTag("ObjectEnd").write(lrf)
|
||||
|
||||
|
||||
class LrfToc(LrfObject):
    """
    Table of contents. Format of toc is:
        [ (pageid, objid, string)...]
    """

    def __init__(self, objId, toc, se):
        LrfObject.__init__(self, "TOC", objId)
        streamData = self._makeTocStream(toc, se)
        self._makeStreamTags(streamData)

    def _makeStreamTags(self, streamData):
        # Wrap the raw TOC bytes in a standard LRF stream.
        stream = LrfStreamBase(STREAM_TOC, streamData)
        self.tags.extend(stream.getStreamTags())

    def _makeTocStream(self, toc, se):
        # Serialise the TOC: entry count, offset table, then the entries.
        stream = io.BytesIO()
        nEntries = len(toc)

        writeDWord(stream, nEntries)

        # Offset table: byte offset of each entry relative to the first;
        # entry size is pageId(4) + objId(4) + length word(2) + UTF-16 label.
        lastOffset = 0
        writeDWord(stream, lastOffset)
        for i in range(nEntries - 1):
            pageId, objId, label = toc[i]
            entryLen = 4 + 4 + 2 + len(label)*2
            lastOffset += entryLen
            writeDWord(stream, lastOffset)

        for entry in toc:
            pageId, objId, label = entry
            if pageId <= 0:
                raise LrfError("page id invalid in toc: " + label)
            if objId <= 0:
                raise LrfError("textblock id invalid in toc: " + label)

            writeDWord(stream, pageId)
            writeDWord(stream, objId)
            writeUnicode(stream, label, se)

        streamData = stream.getvalue()
        stream.close()
        return streamData
|
||||
|
||||
|
||||
class LrfWriter(object):
    """Assembles and serializes a complete LRF file.

    Objects are collected via append(); writeFile() emits the fixed
    binary header, the object stream, back-patches the two header
    offsets, and finally writes the object table.
    """

    def __init__(self, sourceEncoding):
        # Encoding of caller-supplied text; used by toUnicode() and
        # passed to each object's write() method.
        self.sourceEncoding = sourceEncoding

        # The following flags are just to have a place to remember these
        # values. The flags must still be passed to the appropriate classes
        # in order to have them work.

        self.saveStreamTags = False  # used only in testing -- hogs memory

        # highly experimental -- set to True at your own risk
        self.optimizeTags = False
        self.optimizeCompression = False

        # End of placeholders

        self.rootObjId = 0      # 0 means "not set yet"
        self.rootObj = None
        self.binding = 1  # 1=front to back, 16=back to front
        self.dpi = 1600
        self.width = 600
        self.height = 800
        self.colorDepth = 24
        self.tocObjId = 0       # 0 means "no TOC object"
        self.docInfoXml = ""
        self.thumbnailEncoding = "JPEG"
        self.thumbnailData = b""
        self.objects = []       # objects in write order
        self.objectTable = []   # ObjectTableEntry list, built by writeObjects()

    def getSourceEncoding(self):
        return self.sourceEncoding

    def toUnicode(self, string):
        # Decode byte strings with the configured source encoding;
        # text strings pass through unchanged.
        if isinstance(string, bytes):
            string = string.decode(self.sourceEncoding)

        return string

    def getDocInfoXml(self):
        return self.docInfoXml

    def setPageTreeId(self, objId):
        self.pageTreeId = objId

    def getPageTreeId(self):
        return self.pageTreeId

    def setRootObject(self, obj):
        # The root object may only be set once per file.
        if self.rootObjId != 0:
            raise LrfError("root object already set")

        self.rootObjId = obj.objId
        self.rootObj = obj

    def registerFontId(self, id):
        # Fonts are registered by appending a RegisterFont tag to the
        # root object, which must therefore already have been set.
        if self.rootObj is None:
            raise LrfError("can't register font -- no root object")

        self.rootObj.append(LrfTag("RegisterFont", id))

    def setTocObject(self, obj):
        # Only the object id is remembered; its file offset is patched
        # into the header later by updateTocObjectOffset().
        if self.tocObjId != 0:
            raise LrfError("toc object already set")

        self.tocObjId = obj.objId

    def setThumbnailFile(self, filename, encoding=None):
        # Load the thumbnail image. The image type defaults to the file
        # extension and must be one of the known LRF image encodings.
        with open(filename, "rb") as f:
            self.thumbnailData = f.read()

        if encoding is None:
            encoding = os.path.splitext(filename)[1][1:]

        encoding = encoding.upper()
        if encoding not in IMAGE_TYPE_ENCODING:
            raise LrfError("unknown image type: " + encoding)

        self.thumbnailEncoding = encoding

    def append(self, obj):
        self.objects.append(obj)

    def addLrfObject(self, objId):
        # Intentionally a no-op here; kept for interface compatibility.
        pass

    def writeFile(self, lrf):
        # Serialize the whole document to the open binary file `lrf`.
        if self.rootObjId == 0:
            raise LrfError("no root object has been set")

        self.writeHeader(lrf)
        self.writeObjects(lrf)
        self.updateObjectTableOffset(lrf)
        self.updateTocObjectOffset(lrf)
        self.writeObjectTable(lrf)

    def writeHeader(self, lrf):
        # Fixed binary header; hex comments give the field offsets.
        # The two offsets written as 0 are back-patched once known.
        writeString(lrf, LRF_SIGNATURE)
        writeWord(lrf, LRF_VERSION)
        writeWord(lrf, XOR_KEY)
        writeDWord(lrf, self.rootObjId)
        writeQWord(lrf, len(self.objects))
        writeQWord(lrf, 0)  # 0x18 objectTableOffset -- will be updated
        writeZeros(lrf, 4)  # 0x20 unknown
        writeWord(lrf, self.binding)
        writeDWord(lrf, self.dpi)
        writeWords(lrf, self.width, self.height, self.colorDepth)
        writeZeros(lrf, 20)  # 0x30 unknown
        writeDWord(lrf, self.tocObjId)
        writeDWord(lrf, 0)  # 0x48 tocObjectOffset -- will be updated
        # Doc info XML is stored zlib-compressed, preceded by its sizes.
        docInfoXml = codecs.BOM_UTF8 + self.docInfoXml.encode("utf-8")
        compDocInfo = zlib.compress(docInfoXml)
        writeWord(lrf, len(compDocInfo) + 4)
        writeWord(lrf, IMAGE_TYPE_ENCODING[self.thumbnailEncoding])
        writeDWord(lrf, len(self.thumbnailData))
        writeDWord(lrf, len(docInfoXml))
        writeString(lrf, compDocInfo)
        writeString(lrf, self.thumbnailData)

    def writeObjects(self, lrf):
        # also appends object entries to the object table
        self.objectTable = []
        for obj in self.objects:
            objStart = lrf.tell()
            obj.write(lrf, self.sourceEncoding)
            objEnd = lrf.tell()
            self.objectTable.append(
                    ObjectTableEntry(obj.objId, objStart, objEnd-objStart))

    def updateObjectTableOffset(self, lrf):
        # update the offset of the object table
        tableOffset = lrf.tell()
        lrf.seek(0x18, 0)
        writeQWord(lrf, tableOffset)
        lrf.seek(0, 2)  # return to end of file

    def updateTocObjectOffset(self, lrf):
        # Patch the header (at 0x48) with the TOC object's file offset.
        if self.tocObjId == 0:
            return

        for entry in self.objectTable:
            if entry.objId == self.tocObjId:
                lrf.seek(0x48, 0)
                writeDWord(lrf, entry.offset)
                lrf.seek(0, 2)
                break
        else:
            raise LrfError("toc object not in object table")

    def writeObjectTable(self, lrf):
        for tableEntry in self.objectTable:
            tableEntry.write(lrf)
44
ebook_converter/ebooks/lrf/pylrs/pylrfopt.py
Normal file
44
ebook_converter/ebooks/lrf/pylrs/pylrfopt.py
Normal file
@@ -0,0 +1,44 @@
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
|
||||
def _optimize(tagList, tagName, conversion):
    # Remove redundant `tagName` tags from `tagList` IN PLACE.
    # `conversion` maps a tag's parameter to a comparable value (e.g. int).
    # NOTE: tagList.remove() drops the first element that compares equal --
    # assumes tag objects compare by identity here; TODO confirm.

    # Pass 1: a setting immediately followed (ignoring nothing, i.e. with
    # no intervening "rawtext") by another setting of the same tag has no
    # effect, so only the last of such a run survives.
    # copy the tag of interest plus any text
    newTagList = []
    for tag in tagList:
        if tag.name == tagName or tag.name == "rawtext":
            newTagList.append(tag)

    # now, eliminate any duplicates (leaving the last one)
    for i, newTag in enumerate(newTagList[:-1]):
        if newTag.name == tagName and newTagList[i+1].name == tagName:
            tagList.remove(newTag)

    # Pass 2: eliminate redundant settings to same value across text strings
    newTagList = []
    for tag in tagList:
        if tag.name == tagName:
            newTagList.append(tag)

    for i, newTag in enumerate(newTagList[:-1]):
        value = conversion(newTag.parameter)
        nextValue = conversion(newTagList[i+1].parameter)
        if value == nextValue:
            tagList.remove(newTagList[i+1])

    # Pass 3: eliminate any setting that don't have text after them
    while len(tagList) > 0 and tagList[-1].name == tagName:
        del tagList[-1]
||||
def tagListOptimizer(tagList):
    """Strip redundant font tags from *tagList* in place.

    Removes "fontsize"/"fontweight" settings that are overridden before
    any text is emitted. For example,
        fontsize=100, fontsize=200, text, fontsize=100, fontsize=200
    collapses to
        fontsize=200, text
    Returns the number of tags removed.
    """
    originalLength = len(tagList)
    for fontTag in ("fontsize", "fontweight"):
        _optimize(tagList, fontTag, int)
    return originalLength - len(tagList)
2442
ebook_converter/ebooks/lrf/pylrs/pylrs.py
Normal file
2442
ebook_converter/ebooks/lrf/pylrs/pylrs.py
Normal file
File diff suppressed because it is too large
Load Diff
440
ebook_converter/ebooks/metadata/__init__.py
Normal file
440
ebook_converter/ebooks/metadata/__init__.py
Normal file
@@ -0,0 +1,440 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=utf-8
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
"""
|
||||
Provides abstraction for metadata reading.writing from a variety of ebook formats.
|
||||
"""
|
||||
import os, sys, re
|
||||
|
||||
from calibre import relpath, guess_type, prints, force_unicode
|
||||
from calibre.utils.config_base import tweaks
|
||||
from polyglot.builtins import codepoint_to_chr, unicode_type, range, map, zip, getcwd, iteritems, itervalues, as_unicode
|
||||
from polyglot.urllib import quote, unquote, urlparse
|
||||
|
||||
|
||||
# Compile the user-configurable author-splitting regular expression once
# at import time; fall back to the default ", and"/", with" pattern when
# the tweak value is not a valid regex.
try:
    _author_pat = re.compile(tweaks['authors_split_regex'])
except Exception:
    prints('Author split regexp:', tweaks['authors_split_regex'],
            'is invalid, using default')
    _author_pat = re.compile(r'(?i),?\s+(and|with)\s+')
||||
def string_to_authors(raw):
    """Split a raw author string into a list of author names.

    A literal ``&&`` escapes an ampersand inside a single name; the
    configured split regex (see _author_pat) is first normalised to a
    plain ``&`` separator. Empty entries are dropped.
    """
    if not raw:
        return []
    escaped = raw.replace('&&', '\uffff')
    normalized = _author_pat.sub('&', escaped)
    names = (part.strip().replace('\uffff', '&')
             for part in normalized.split('&'))
    return [name for name in names if name]
||||
def authors_to_string(authors):
    """Join author names into a single ``' & '``-separated string.

    Ampersands inside a name are escaped as ``&&`` (the inverse of
    string_to_authors). Empty entries are skipped; None yields ''.
    """
    if authors is None:
        return ''
    return ' & '.join(name.replace('&', '&&') for name in authors if name)
||||
def remove_bracketed_text(src, brackets=None):
    """Return *src* with all bracketed spans (brackets included) removed.

    Handles nesting and multiple bracket types; unbalanced closing
    brackets are ignored. Defaults to (), [] and {}.
    """
    if brackets is None:
        brackets = {'(': ')', '[': ']', '{': '}'}
    from collections import Counter
    src = force_unicode(src)
    closers = {close: open_ for open_, close in iteritems(brackets)}
    open_counts = Counter()
    kept = []
    for ch in src:
        if ch in brackets:
            open_counts[ch] += 1
        elif ch in closers:
            opener = closers[ch]
            if open_counts[opener] > 0:
                open_counts[opener] -= 1
        elif sum(itervalues(open_counts)) < 1:
            # Only keep characters that are outside every bracket pair.
            kept.append(ch)
    return ''.join(kept)
||||
def author_to_author_sort(author, method=None):
    # Compute the sortable form of one author name ("Jane Q. Doe" ->
    # "Doe, Jane Q." by default). `method` selects the style: 'copy'
    # returns the name unchanged, 'comma' inverts unless the name already
    # contains a comma, 'nocomma' inverts without inserting a comma, and
    # any other value inverts with a comma. When None, the
    # 'author_sort_copy_method' tweak decides.
    if not author:
        return ''
    # Strip bracketed annotations like "(ed.)" so they do not confuse
    # the token logic below.
    sauthor = remove_bracketed_text(author).strip()
    tokens = sauthor.split()
    if len(tokens) < 2:
        # Single-word names sort as themselves.
        return author
    if method is None:
        method = tweaks['author_sort_copy_method']

    # Names containing any configured "copy word" are copied verbatim
    # (typically corporate/organisational names).
    ltoks = frozenset(x.lower() for x in tokens)
    copy_words = frozenset(x.lower() for x in tweaks['author_name_copywords'])
    if ltoks.intersection(copy_words):
        method = 'copy'

    if method == 'copy':
        return author

    # Strip leading honorifics, with or without a trailing period.
    prefixes = {force_unicode(y).lower() for y in tweaks['author_name_prefixes']}
    prefixes |= {y+'.' for y in prefixes}
    while True:
        if not tokens:
            # Name consisted entirely of prefixes; give up and copy.
            return author
        tok = tokens[0].lower()
        if tok in prefixes:
            tokens = tokens[1:]
        else:
            break

    # Collect trailing suffixes (e.g. generational or qualification
    # suffixes from the tweak), preserving their original order.
    suffixes = {force_unicode(y).lower() for y in tweaks['author_name_suffixes']}
    suffixes |= {y+'.' for y in suffixes}

    suffix = ''
    while True:
        if not tokens:
            # Name consisted entirely of suffixes; give up and copy.
            return author
        last = tokens[-1].lower()
        if last in suffixes:
            suffix = tokens[-1] + ' ' + suffix
            tokens = tokens[:-1]
        else:
            break
    suffix = suffix.strip()

    # 'comma' mode leaves names already containing a comma untouched.
    if method == 'comma' and ',' in ''.join(tokens):
        return author

    # Move the last name to the front, re-attach any suffix at the end.
    atokens = tokens[-1:] + tokens[:-1]
    num_toks = len(atokens)
    if suffix:
        atokens.append(suffix)

    if method != 'nocomma' and num_toks > 1:
        atokens[0] += ','

    return ' '.join(atokens)
||||
def authors_to_sort_string(authors):
    """Return the ' & '-joined author-sort forms of *authors*."""
    return ' & '.join(author_to_author_sort(author) for author in authors)
||||
# Per-language cache of compiled leading-article patterns.
_title_pats = {}


def get_title_sort_pat(lang=None):
    """Return a compiled case-insensitive regex matching leading articles
    ("A ", "The ", ...) for *lang*; used by title_sort().

    The article list comes from the 'per_language_title_sort_articles'
    tweak, falling back to the English set when the tweak is missing or
    invalid. Results are cached per language code in _title_pats.
    """
    ans = _title_pats.get(lang, None)
    if ans is not None:
        return ans
    q = lang
    from calibre.utils.localization import canonicalize_lang, get_lang
    if lang is None:
        q = tweaks['default_language_for_title_sort']
        if q is None:
            q = get_lang()
    q = canonicalize_lang(q) if q else q
    data = tweaks['per_language_title_sort_articles']
    try:
        ans = data.get(q, None)
    except AttributeError:
        ans = None  # invalid tweak value
    try:
        ans = frozenset(ans) if ans else frozenset(data['eng'])
    except Exception:
        # Bug fix: was a bare `except:`, which also swallowed
        # SystemExit/KeyboardInterrupt. Fall back to English articles.
        ans = frozenset((r'A\s+', r'The\s+', r'An\s+'))
    ans = '|'.join(ans)
    ans = '^(%s)'%ans
    try:
        ans = re.compile(ans, re.IGNORECASE)
    except Exception:
        # Bug fix: narrowed from a bare `except:`. A user-supplied article
        # fragment may not be a valid regex; use the default pattern.
        ans = re.compile(r'^(A|The|An)\s+', re.IGNORECASE)
    _title_pats[lang] = ans
    return ans
||||
# Quote-like characters ignored at the start of a title when sorting.
_ignore_starts = '\'"'+''.join(codepoint_to_chr(x) for x in
        list(range(0x2018, 0x201e))+[0x2032, 0x2033])


def title_sort(title, order=None, lang=None):
    """Return the sortable form of *title*.

    A leading quote character is dropped and a leading article ("The",
    "A", ... per language) is moved to the end ("The Title" -> "Title,
    The"), unless the 'title_series_sorting' tweak requests strictly
    alphabetic sorting.
    """
    if order is None:
        order = tweaks['title_series_sorting']
    title = title.strip()
    if order == 'strictly_alphabetic':
        return title
    if title and title[0] in _ignore_starts:
        title = title[1:]
    match = get_title_sort_pat(lang).search(title)
    if match is not None:
        try:
            article = match.group(1)
        except IndexError:
            pass
        else:
            title = title[len(article):] + ', ' + article
            if title[0] in _ignore_starts:
                title = title[1:]
    return title.strip()
||||
# (value, numeral) pairs for roman(), ordered largest-first.
coding = list(zip(
    [1000, 900, 500, 400, 100, 90, 50, 40, 10, 9, 5, 4, 1],
    ["M", "CM", "D", "CD", "C", "XC", "L", "XL", "X", "IX", "V", "IV", "I"]
))


def roman(num):
    """Return *num* as an upper-case Roman numeral string.

    Values outside 1..3999, or non-integral values, are returned as
    their plain decimal string representation instead.
    """
    if num <= 0 or num >= 4000 or int(num) != num:
        return unicode_type(num)
    pieces = []
    for value, numeral in coding:
        while num >= value:
            pieces.append(numeral)
            num -= value
    return ''.join(pieces)
||||
def fmt_sidx(i, fmt='%.2f', use_roman=False):
    """Format a series index for display.

    None and '' are treated as 1. Integral values are shown without a
    decimal part (as Roman numerals when *use_roman*); other numbers are
    formatted with *fmt*. Values that cannot be converted to float are
    returned as their string representation.
    """
    if i is None or i == '':
        i = 1
    try:
        i = float(i)
    except (TypeError, ValueError):
        # Bug fix: float() raises ValueError for non-numeric strings
        # (e.g. 'one'); previously only TypeError was caught, so such
        # input crashed instead of falling back to str().
        return unicode_type(i)
    if int(i) == float(i):
        return roman(int(i)) if use_roman else '%d'%int(i)
    return fmt%i
||||
class Resource(object):

    '''
    Represents a resource (usually a file on the filesystem or a URL pointing
    to the web. Such resources are commonly referred to in OPF files.

    They have the interface:

    :member:`path`
    :member:`mime_type`
    :method:`href`

    '''

    def __init__(self, href_or_path, basedir=getcwd(), is_path=True):
        # NOTE(review): the basedir default is evaluated once at import
        # time, freezing the process cwd at module load -- confirm this
        # is intended before relying on it.
        self._href = None
        self._basedir = basedir
        self.path = None
        self.fragment = ''
        try:
            self.mime_type = guess_type(href_or_path)[0]
        except:
            self.mime_type = None
        if self.mime_type is None:
            # Generic fallback when the type cannot be guessed.
            self.mime_type = 'application/octet-stream'
        if is_path:
            # Filesystem path: make it absolute relative to basedir and
            # ensure it is a unicode string.
            path = href_or_path
            if not os.path.isabs(path):
                path = os.path.abspath(os.path.join(basedir, path))
            if isinstance(path, bytes):
                path = path.decode(sys.getfilesystemencoding())
            self.path = path
        else:
            # URL: non-file schemes are stored as opaque hrefs; file and
            # scheme-less URLs are resolved to a local path + fragment.
            url = urlparse(href_or_path)
            if url[0] not in ('', 'file'):
                self._href = href_or_path
            else:
                pc = url[2]  # the URL's path component
                if isinstance(pc, unicode_type):
                    pc = pc.encode('utf-8')
                pc = unquote(pc).decode('utf-8')
                self.path = os.path.abspath(os.path.join(basedir, pc.replace('/', os.sep)))
                self.fragment = unquote(url[-1])

    def href(self, basedir=None):
        '''
        Return a URL pointing to this resource. If it is a file on the filesystem
        the URL is relative to `basedir`.

        `basedir`: If None, the basedir of this resource is used (see :method:`set_basedir`).
        If this resource has no basedir, then the current working directory is used as the basedir.
        '''
        if basedir is None:
            if self._basedir:
                basedir = self._basedir
            else:
                basedir = getcwd()
        if self.path is None:
            # Remote resource: return the stored href untouched.
            return self._href
        f = self.fragment.encode('utf-8') if isinstance(self.fragment, unicode_type) else self.fragment
        frag = '#'+as_unicode(quote(f)) if self.fragment else ''
        if self.path == basedir:
            return ''+frag
        try:
            rpath = relpath(self.path, basedir)
        except OSError:  # On windows path and basedir could be on different drives
            rpath = self.path
        if isinstance(rpath, unicode_type):
            rpath = rpath.encode('utf-8')
        # NOTE(review): rpath is bytes here but os.sep and '/' are str;
        # on Python 3 this .replace() looks like it would raise TypeError
        # -- verify this code path is exercised/ported correctly.
        return as_unicode(quote(rpath.replace(os.sep, '/')))+frag

    def set_basedir(self, path):
        self._basedir = path

    def basedir(self):
        return self._basedir

    def __repr__(self):
        return 'Resource(%s, %s)'%(repr(self.path), repr(self.href()))
|
||||
class ResourceCollection(object):
    """An ordered collection of Resource objects with list-like access."""

    def __init__(self):
        self._resources = []

    def __iter__(self):
        for r in self._resources:
            yield r

    def __len__(self):
        return len(self._resources)

    def __getitem__(self, index):
        return self._resources[index]

    def __bool__(self):
        return len(self._resources) > 0

    def __str__(self):
        resources = map(repr, self)
        return '[%s]'%', '.join(resources)

    def __repr__(self):
        return unicode_type(self)

    def append(self, resource):
        # Only genuine Resource objects may be appended.
        if not isinstance(resource, Resource):
            raise ValueError('Can only append objects of type Resource')
        self._resources.append(resource)

    def remove(self, resource):
        self._resources.remove(resource)

    def replace(self, start, end, items):
        'Same as list[start:end] = items'
        self._resources[start:end] = items

    @staticmethod
    def from_directory_contents(top, topdown=True):
        """Build a collection from every file under directory *top*.

        Bug fix: the previous implementation did
        ``os.path.join(spec[0], spec[1])`` -- but ``spec[1]`` from
        os.walk is the *list* of sub-directory names, so the join raised
        TypeError; it also called the non-existent ``Resource.from_path``.
        We now iterate the filenames element of each walk tuple and
        construct Resource directly.
        """
        collection = ResourceCollection()
        for dirpath, dirnames, filenames in os.walk(top, topdown=topdown):
            for fname in filenames:
                path = os.path.abspath(os.path.join(dirpath, fname))
                res = Resource(path)
                res.set_basedir(top)
                collection.append(res)
        return collection

    def set_basedir(self, path):
        for res in self:
            res.set_basedir(path)
|
||||
def MetaInformation(title, authors=(_('Unknown'),)):
    ''' Convenient encapsulation of book metadata, needed for compatibility
    @param title: title or ``_('Unknown')`` or a MetaInformation object
    @param authors: List of strings or []
    '''
    from calibre.ebooks.metadata.book.base import Metadata
    mi = None
    if hasattr(title, 'title') and hasattr(title, 'authors'):
        # A metadata-like object was passed in; copy title/authors from it.
        mi = title
        title, authors = mi.title, mi.authors
    return Metadata(title, authors, other=mi)
||||
def check_isbn10(isbn):
    """Return *isbn* if it is a valid ISBN-10, else None.

    Uses the standard position-weighted checksum mod 11; an 'X' in the
    last position stands for a check value of 10. Any malformed input
    (non-digits, too short) yields None.
    """
    try:
        weighted = sum((pos + 1) * int(digit)
                       for pos, digit in enumerate(isbn[:9]))
        check = weighted % 11
        if (check == 10 and isbn[9] == 'X') or check == int(isbn[9]):
            return isbn
    except Exception:
        pass
    return None
|
||||
def check_isbn13(isbn):
    """Return *isbn* if it is a valid ISBN-13, else None.

    Digits in even positions weigh 1, odd positions weigh 3; the check
    digit makes the total a multiple of 10. Malformed input yields None.
    """
    try:
        weighted = sum((1 if pos % 2 == 0 else 3) * int(digit)
                       for pos, digit in enumerate(isbn[:12]))
        check = 10 - (weighted % 10)
        if check == 10:
            check = 0
        if unicode_type(check) == isbn[12]:
            return isbn
    except Exception:
        pass
    return None
|
||||
def check_isbn(isbn):
    """Normalise *isbn* and validate it as ISBN-10 or ISBN-13.

    Everything except digits and 'X' is stripped first. Strings made of
    a single repeated digit are rejected outright. Returns the
    normalised ISBN on success, None otherwise.
    """
    if not isbn:
        return None
    isbn = re.sub(r'[^0-9X]', '', isbn.upper())
    if re.match(r'(\d)\1{9,12}$', isbn) is not None:
        return None
    if len(isbn) == 10:
        return check_isbn10(isbn)
    if len(isbn) == 13:
        return check_isbn13(isbn)
    return None
|
||||
def check_issn(issn):
    """Return the normalised *issn* if its check digit is valid, else None.

    Everything except digits and 'X' is stripped first; the ISSN weights
    are 8..2 over the first seven digits, mod 11, with 'X' representing
    a check value of 10.
    """
    if not issn:
        return None
    issn = re.sub(r'[^0-9X]', '', issn.upper())
    try:
        weighted = sum((8 - pos) * int(digit)
                       for pos, digit in enumerate(issn[:7]))
        check = 11 - weighted % 11
        if (check == 10 and issn[7] == 'X') or check == int(issn[7]):
            return issn
    except Exception:
        pass
    return None
||||
def format_isbn(isbn):
    """Return *isbn* hyphenated into its standard display groups.

    ISBN-10 becomes x-xxxx-xxx-x; ISBN-13 becomes xxx-xx-xxxx-xxx-x.
    Invalid ISBNs are returned unchanged.
    """
    valid = check_isbn(isbn)
    if not valid:
        return isbn
    if len(valid) == 10:
        parts = (valid[:2], valid[2:6], valid[6:9], valid[9])
    else:
        parts = (valid[:3], valid[3:5], valid[5:9], valid[9:12], valid[12])
    return '-'.join(parts)
|
||||
def check_doi(doi):
    'Check if something that looks like a DOI is present anywhere in the string'
    if not doi:
        return None
    found = re.search(r'10\.\d{4}/\S+', doi)
    return found.group() if found is not None else None
||||
def rating_to_stars(value, allow_half_stars=False, star='★', half='½'):
    """Render a 0-10 rating as a string of star characters.

    The rating is clamped to [0, 10]; each full star represents two
    points, and an odd remainder contributes a half star when
    *allow_half_stars* is set. None/falsy values render as ''.
    """
    rating = max(0, min(int(value or 0), 10))
    full, remainder = divmod(rating, 2)
    stars = star * full
    if allow_half_stars and remainder:
        stars += half
    return stars
203
ebook_converter/ebooks/metadata/archive.py
Normal file
203
ebook_converter/ebooks/metadata/archive.py
Normal file
@@ -0,0 +1,203 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import os
|
||||
from contextlib import closing
|
||||
|
||||
from calibre.customize import FileTypePlugin
|
||||
from calibre.utils.localization import canonicalize_lang
|
||||
from polyglot.builtins import filter, unicode_type
|
||||
|
||||
|
||||
def is_comic(list_of_names):
    """Return True when every file in *list_of_names* looks like a comic page.

    Only jpg/jpeg/png extensions count as pages; names without an
    extension and Windows 'thumbs.db' entries are ignored entirely.
    """
    comic_extensions = {'jpg', 'jpeg', 'png'}
    extensions = {name.rpartition('.')[-1].lower() for name in list_of_names
                  if '.' in name and name.lower().rpartition('/')[-1] != 'thumbs.db'}
    return len(extensions - comic_extensions) == 0
||||
|
||||
def archive_type(stream):
    """Detect the archive format of *stream* from its magic bytes.

    Returns 'zip', 'rar' or None, restoring the stream position where
    the stream supports seeking.
    """
    from calibre.utils.zipfile import stringFileHeader
    try:
        pos = stream.tell()
    except Exception:
        # Bug fix: was a bare `except:`, which would also swallow
        # SystemExit/KeyboardInterrupt. Unseekable streams start at 0.
        pos = 0
    id_ = stream.read(4)
    ans = None
    if id_ == stringFileHeader:
        ans = 'zip'
    elif id_.startswith(b'Rar'):
        ans = 'rar'
    try:
        stream.seek(pos)
    except Exception:
        pass
    return ans
||||
class KPFExtract(FileTypePlugin):
    # On-import file type plugin: when a Kindle Create KPF file is
    # added, extract the embedded source DOCX instead.

    name = 'KPF Extract'
    author = 'Kovid Goyal'
    description = _('Extract the source DOCX file from Amazon Kindle Create KPF files.'
            ' Note this will not contain any edits made in the Kindle Create program itself.')
    file_types = {'kpf'}
    supported_platforms = ['windows', 'osx', 'linux']
    on_import = True

    def run(self, archive):
        # KPF files are ZIP containers. Return a temporary copy of the
        # first .docx member, or the archive unchanged when none exists.
        from calibre.utils.zipfile import ZipFile
        with ZipFile(archive, 'r') as zf:
            fnames = zf.namelist()
            candidates = [x for x in fnames if x.lower().endswith('.docx')]
            if not candidates:
                return archive
            of = self.temporary_file('_kpf_extract.docx')
            with closing(of):
                of.write(zf.read(candidates[0]))
            return of.name
||||
class ArchiveExtract(FileTypePlugin):
    """On-import plugin: extract a single e-book from a ZIP/RAR archive,
    or rename the archive to CBZ/CBR when it only contains comic pages."""

    name = 'Archive Extract'
    author = 'Kovid Goyal'
    description = _('Extract common e-book formats from archive files '
            '(ZIP/RAR). Also try to autodetect if they are actually '
            'CBZ/CBR files.')
    file_types = {'zip', 'rar'}
    supported_platforms = ['windows', 'osx', 'linux']
    on_import = True

    def run(self, archive):
        from calibre.utils.zipfile import ZipFile
        is_rar = archive.lower().endswith('.rar')
        zf = None
        if is_rar:
            from calibre.utils.unrar import extract_member, names
            fnames = list(names(archive))
        else:
            # Bug fix: the ZipFile was previously never closed (resource
            # leak on every ZIP import); it is now closed in the finally
            # block below.
            zf = ZipFile(archive, 'r')
        try:
            if zf is not None:
                fnames = zf.namelist()

            def fname_ok(fname):
                # Ignore OS cruft and scene-release metadata files.
                bn = os.path.basename(fname).lower()
                if bn == 'thumbs.db':
                    return False
                if '.' not in bn:
                    return False
                if bn.rpartition('.')[-1] in {'diz', 'nfo'}:
                    return False
                if '__MACOSX' in fname.split('/'):
                    return False
                return True

            fnames = list(filter(fname_ok, fnames))
            if is_comic(fnames):
                # Pure image archive: hand back a copy renamed to cbz/cbr.
                ext = '.cbr' if is_rar else '.cbz'
                of = self.temporary_file('_archive_extract'+ext)
                with open(archive, 'rb') as f:
                    of.write(f.read())
                of.close()
                return of.name
            if len(fnames) > 1 or not fnames:
                # Zero or several candidate files: leave the archive alone.
                return archive
            fname = fnames[0]
            ext = os.path.splitext(fname)[1][1:]
            if ext.lower() not in {
                    'lit', 'epub', 'mobi', 'prc', 'rtf', 'pdf', 'mp3', 'pdb',
                    'azw', 'azw1', 'azw3', 'fb2', 'docx', 'doc', 'odt'}:
                return archive

            of = self.temporary_file('_archive_extract.'+ext)
            with closing(of):
                if is_rar:
                    data = extract_member(archive, match=None, name=fname)[1]
                    of.write(data)
                else:
                    of.write(zf.read(fname))
            return of.name
        finally:
            if zf is not None:
                zf.close()
||||
|
||||
def get_comic_book_info(d, mi, series_index='volume'):
    """Populate the metadata object *mi* from a ComicBookInfo dict *d*.

    *series_index* selects which key ('volume' or 'issue') supplies the
    series index, with the other key as a fallback.
    See http://code.google.com/p/comicbookinfo/wiki/Example
    """
    series = d.get('series', '')
    if series.strip():
        mi.series = series
        si = d.get(series_index, None)
        if si is None:
            si = d.get('issue' if series_index == 'volume' else 'volume', None)
        if si is not None:
            try:
                mi.series_index = float(si)
            except Exception:
                mi.series_index = 1
    if d.get('language', None):
        # Bug fix: the value was previously read from the non-existent
        # 'lang' key while the guard checked 'language', so the language
        # was never actually applied.
        lang = canonicalize_lang(d.get('language'))
        if lang:
            mi.languages = [lang]
    if d.get('rating', -1) > -1:
        mi.rating = d['rating']
    for x in ('title', 'publisher'):
        y = d.get(x, '').strip()
        if y:
            setattr(mi, x, y)
    tags = d.get('tags', [])
    if tags:
        mi.tags = tags
    authors = []
    for credit in d.get('credits', []):
        if credit.get('role', '') in ('Writer', 'Artist', 'Cartoonist',
                                      'Creator'):
            x = credit.get('person', '')
            if x:
                # "Last, First" -> "First Last"
                x = ' '.join((reversed(x.split(', '))))
                authors.append(x)
    if authors:
        mi.authors = authors
    comments = d.get('comments', '')
    if comments and comments.strip():
        mi.comments = comments.strip()
    pubm, puby = d.get('publicationMonth', None), d.get('publicationYear', None)
    if puby is not None:
        from calibre.utils.date import parse_only_date
        from datetime import date
        try:
            # Default to mid-month/mid-year when the month is unknown.
            dt = date(puby, 6 if pubm is None else pubm, 15)
            dt = parse_only_date(unicode_type(dt))
            mi.pubdate = dt
        except Exception:
            pass
|
||||
def parse_comic_comment(comment, series_index='volume'):
    """Build a MetaInformation from a JSON ComicBookInfo archive comment."""
    # See http://code.google.com/p/comicbookinfo/wiki/Example
    import json
    from calibre.ebooks.metadata import MetaInformation
    mi = MetaInformation(None, None)
    decoded = json.loads(comment)
    if isinstance(decoded, dict):
        for key in decoded:
            if key.startswith('ComicBookInfo'):
                get_comic_book_info(decoded[key], mi, series_index=series_index)
                break
    return mi
||||
def get_comic_metadata(stream, stream_type, series_index='volume'):
    """Read ComicBookInfo metadata from a CBZ/CBR *stream*.

    The metadata lives in the archive comment as JSON; a missing or
    empty comment yields a blank MetaInformation.
    """
    comment = None
    if stream_type == 'cbz':
        from calibre.utils.zipfile import ZipFile
        # Bug fix: the ZipFile was previously left open (resource leak);
        # close it as soon as the comment has been read.
        with closing(ZipFile(stream)) as zf:
            comment = zf.comment
    elif stream_type == 'cbr':
        from calibre.utils.unrar import comment as get_comment
        comment = get_comment(stream)

    return parse_comic_comment(comment or b'{}', series_index=series_index)
132
ebook_converter/ebooks/metadata/book/__init__.py
Normal file
132
ebook_converter/ebooks/metadata/book/__init__.py
Normal file
@@ -0,0 +1,132 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
'''
|
||||
All fields must have a NULL value represented as None for simple types,
|
||||
an empty list/dictionary for complex types and (None, None) for cover_data
|
||||
'''
|
||||
|
||||
# Field-name sets partitioning all book metadata by origin/usage; the
# composite sets below are unions/differences of these primitives.

SOCIAL_METADATA_FIELDS = frozenset((
    'tags',             # Ordered list
    'rating',           # A floating point number between 0 and 10
    'comments',         # A simple HTML enabled string
    'series',           # A simple string
    'series_index',     # A floating point number
    # Of the form { scheme1:value1, scheme2:value2}
    # For example: {'isbn':'123456789', 'doi':'xxxx', ... }
    'identifiers',
))

'''
The list of names that convert to identifiers when in get and set.
'''

TOP_LEVEL_IDENTIFIERS = frozenset((
    'isbn',
))

PUBLICATION_METADATA_FIELDS = frozenset((
    'title',            # title must never be None. Should be _('Unknown')
    # Pseudo field that can be set, but if not set is auto generated
    # from title and languages
    'title_sort',
    'authors',          # Ordered list. Must never be None, can be [_('Unknown')]
    'author_sort_map',  # Map of sort strings for each author
    # Pseudo field that can be set, but if not set is auto generated
    # from authors and languages
    'author_sort',
    'book_producer',
    'timestamp',        # Dates and times must be timezone aware
    'pubdate',
    'last_modified',
    'rights',
    # So far only known publication type is periodical:calibre
    # If None, means book
    'publication_type',
    'uuid',             # A UUID usually of type 4
    'languages',        # ordered list of languages in this publication
    'publisher',        # Simple string, no special semantics
    # Absolute path to image file encoded in filesystem_encoding
    'cover',
    # Of the form (format, data) where format is, for e.g. 'jpeg', 'png', 'gif'...
    'cover_data',
    # Either thumbnail data, or an object with the attribute
    # image_path which is the path to an image file, encoded
    # in filesystem_encoding
    'thumbnail',
))

BOOK_STRUCTURE_FIELDS = frozenset((
    # These are used by code, Null values are None.
    'toc', 'spine', 'guide', 'manifest',
))

USER_METADATA_FIELDS = frozenset((
    # A dict of dicts similar to field_metadata. Each field description dict
    # also contains a value field with the key #value#.
    'user_metadata',
))

DEVICE_METADATA_FIELDS = frozenset((
    'device_collections',   # Ordered list of strings
    'lpath',                # Unicode, / separated
    'size',                 # In bytes
    'mime',                 # Mimetype of the book file being represented
))

CALIBRE_METADATA_FIELDS = frozenset((
    'application_id',   # An application id, currently set to the db_id.
    'db_id',            # the calibre primary key of the item.
    'formats',          # list of formats (extensions) for this book
    # a dict of user category names, where the value is a list of item names
    # from the book that are in that category
    'user_categories',
    # a dict of author to an associated hyperlink
    'author_link_map',
))

# Union of every metadata field known to the application.
ALL_METADATA_FIELDS = SOCIAL_METADATA_FIELDS.union(
                      PUBLICATION_METADATA_FIELDS).union(
                      BOOK_STRUCTURE_FIELDS).union(
                      USER_METADATA_FIELDS).union(
                      DEVICE_METADATA_FIELDS).union(
                      CALIBRE_METADATA_FIELDS)

# All fields except custom fields
STANDARD_METADATA_FIELDS = SOCIAL_METADATA_FIELDS.union(
                           PUBLICATION_METADATA_FIELDS).union(
                           BOOK_STRUCTURE_FIELDS).union(
                           DEVICE_METADATA_FIELDS).union(
                           CALIBRE_METADATA_FIELDS)

# Metadata fields that smart update must do special processing to copy.
SC_FIELDS_NOT_COPIED = frozenset(('title', 'title_sort', 'authors',
                                  'author_sort', 'author_sort_map',
                                  'cover_data', 'tags', 'languages',
                                  'identifiers'))

# Metadata fields that smart update should copy only if the source is not None
SC_FIELDS_COPY_NOT_NULL = frozenset(('device_collections', 'lpath', 'size', 'comments', 'thumbnail'))

# Metadata fields that smart update should copy without special handling
SC_COPYABLE_FIELDS = SOCIAL_METADATA_FIELDS.union(
                     PUBLICATION_METADATA_FIELDS).union(
                     BOOK_STRUCTURE_FIELDS).union(
                     DEVICE_METADATA_FIELDS).union(
                     CALIBRE_METADATA_FIELDS) - \
                     SC_FIELDS_NOT_COPIED.union(
                     SC_FIELDS_COPY_NOT_NULL)

# Fields that survive (de)serialization; excluded entries are rebuilt
# when needed.
SERIALIZABLE_FIELDS = SOCIAL_METADATA_FIELDS.union(
                      USER_METADATA_FIELDS).union(
                      PUBLICATION_METADATA_FIELDS).union(
                      CALIBRE_METADATA_FIELDS).union(
                      DEVICE_METADATA_FIELDS) - \
                      frozenset(('device_collections', 'formats',
                                 'cover_data'))
# these are rebuilt when needed
841
ebook_converter/ebooks/metadata/book/base.py
Normal file
841
ebook_converter/ebooks/metadata/book/base.py
Normal file
@@ -0,0 +1,841 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2010, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import copy, traceback
|
||||
|
||||
from calibre import prints
|
||||
from calibre.constants import DEBUG, ispy3
|
||||
from calibre.ebooks.metadata.book import (SC_COPYABLE_FIELDS,
|
||||
SC_FIELDS_COPY_NOT_NULL, STANDARD_METADATA_FIELDS,
|
||||
TOP_LEVEL_IDENTIFIERS, ALL_METADATA_FIELDS)
|
||||
from calibre.library.field_metadata import FieldMetadata
|
||||
from calibre.utils.icu import sort_key
|
||||
from polyglot.builtins import iteritems, unicode_type, filter, map
|
||||
|
||||
# Special sets used to optimize the performance of getting and setting
# attributes on Metadata objects
SIMPLE_GET = frozenset(STANDARD_METADATA_FIELDS - TOP_LEVEL_IDENTIFIERS)
# 'identifiers' needs custom set logic, so it is excluded from the fast set path
SIMPLE_SET = frozenset(SIMPLE_GET - {'identifiers'})
|
||||
|
||||
|
||||
def human_readable(size, precision=2):
    """Convert a size in bytes into a megabyte string, e.g. '1.00MB'.

    :param size: size in bytes
    :param precision: number of digits after the decimal point
    :return: formatted string with an 'MB' suffix
    """
    # Use printf '*' precision instead of concatenating a format string
    # from str(precision) pieces -- same output, clearer and stdlib-only.
    return '%.*fMB' % (precision, size/(1024*1024))
|
||||
|
||||
|
||||
# Canonical "empty" value for each field; used both to initialize new
# Metadata objects and to decide whether a field is null.
NULL_VALUES = {
                'user_metadata': {},
                'cover_data'   : (None, None),
                'tags'         : [],
                'identifiers'  : {},
                'languages'    : [],
                'device_collections': [],
                'author_sort_map': {},
                'authors'      : [_('Unknown')],
                'author_sort'  : _('Unknown'),
                'title'        : _('Unknown'),
                'user_categories' : {},
                'author_link_map' : {},
                'language'     : 'und'
}

# Shared module-level FieldMetadata instance describing the standard fields
field_metadata = FieldMetadata()


def reset_field_metadata():
    # Replace the module-level field_metadata with a fresh instance
    global field_metadata
    field_metadata = FieldMetadata()
|
||||
|
||||
|
||||
def ck(typ):
    """Normalize an identifier type: lowercased (ICU), stripped, with ':' and ',' removed."""
    return icu_lower(typ).strip().replace(':', '').replace(',', '')


def cv(val):
    """Normalize an identifier value: stripped, with ',' replaced by '|'."""
    return val.strip().replace(',', '|')
|
||||
|
||||
|
||||
class Metadata(object):

    '''
    A class representing all the metadata for a book. The various standard metadata
    fields are available as attributes of this object. You can also stick
    arbitrary attributes onto this object.

    Metadata from custom columns should be accessed via the get() method,
    passing in the lookup name for the column, for example: "#mytags".

    Use the :meth:`is_null` method to test if a field is null.

    This object also has functions to format fields into strings.

    The list of standard metadata fields grows with time is in
    :data:`STANDARD_METADATA_FIELDS`.

    Please keep the method based API of this class to a minimum. Every method
    becomes a reserved field name.
    '''
    # Marks instances as safe for calibre's serialization machinery
    __calibre_serializable__ = True

    def __init__(self, title, authors=(_('Unknown'),), other=None, template_cache=None,
                 formatter=None):
        '''
        @param title: title or ``_('Unknown')``
        @param authors: List of strings or []
        @param other: None or a metadata object
        '''
        # All standard field values live in the _data dict; set it with
        # object.__setattr__ to bypass this class's custom __setattr__.
        _data = copy.deepcopy(NULL_VALUES)
        _data.pop('language')  # 'language' is a computed view over 'languages'
        object.__setattr__(self, '_data', _data)
        if other is not None:
            self.smart_update(other)
        else:
            if title:
                self.title = title
            if authors:
                # List of strings or []
                self.author = list(authors) if authors else []  # Needed for backward compatibility
                self.authors = list(authors) if authors else []
        from calibre.ebooks.metadata.book.formatter import SafeFormat
        self.formatter = SafeFormat() if formatter is None else formatter
        self.template_cache = template_cache
|
||||
|
||||
def is_null(self, field):
|
||||
'''
|
||||
Return True if the value of field is null in this object.
|
||||
'null' means it is unknown or evaluates to False. So a title of
|
||||
_('Unknown') is null or a language of 'und' is null.
|
||||
|
||||
Be careful with numeric fields since this will return True for zero as
|
||||
well as None.
|
||||
|
||||
Also returns True if the field does not exist.
|
||||
'''
|
||||
try:
|
||||
null_val = NULL_VALUES.get(field, None)
|
||||
val = getattr(self, field, None)
|
||||
return not val or val == null_val
|
||||
except:
|
||||
return True
|
||||
|
||||
def set_null(self, field):
|
||||
null_val = copy.copy(NULL_VALUES.get(field))
|
||||
setattr(self, field, null_val)
|
||||
|
||||
    def __getattribute__(self, field):
        # Fetch _data via object.__getattribute__ to avoid recursing into
        # this method.
        _data = object.__getattribute__(self, '_data')
        if field in SIMPLE_GET:
            # Plain standard fields are stored directly in _data
            return _data.get(field, None)
        if field in TOP_LEVEL_IDENTIFIERS:
            # e.g. 'isbn' lives inside the identifiers dict
            return _data.get('identifiers').get(field, None)
        if field == 'language':
            # 'language' is a view of the first entry of 'languages'
            try:
                return _data.get('languages', [])[0]
            except:
                return NULL_VALUES['language']
        try:
            return object.__getattribute__(self, field)
        except AttributeError:
            pass
        if field in _data['user_metadata']:
            # Custom column: the value is cached under '#value#'
            d = _data['user_metadata'][field]
            val = d['#value#']
            if d['datatype'] != 'composite':
                return val
            if val is None:
                # Composite templates are evaluated lazily; the marker value
                # below is stored first to break infinite recursion should the
                # template reference this same field while being evaluated
                d['#value#'] = 'RECURSIVE_COMPOSITE FIELD (Metadata) ' + field
                val = d['#value#'] = self.formatter.safe_format(
                                        d['display']['composite_template'],
                                        self,
                                        _('TEMPLATE ERROR'),
                                        self, column_name=field,
                                        template_cache=self.template_cache).strip()
            return val
        if field.startswith('#') and field.endswith('_index'):
            # '#foo_index' maps to the series-index extra of custom column '#foo'
            try:
                return self.get_extra(field[:-6])
            except:
                pass
        raise AttributeError(
                'Metadata object has no attribute named: '+ repr(field))
|
||||
|
||||
    def __setattr__(self, field, val, extra=None):
        # *extra* carries auxiliary data (e.g. a custom series index); only
        # reachable via self.set(), since attribute assignment passes two args.
        _data = object.__getattribute__(self, '_data')
        if field in SIMPLE_SET:
            # Plain standard field: None is normalized to the canonical null
            if val is None:
                val = copy.copy(NULL_VALUES.get(field, None))
            _data[field] = val
        elif field in TOP_LEVEL_IDENTIFIERS:
            # e.g. mi.isbn = ... writes into the identifiers dict
            field, val = self._clean_identifier(field, val)
            identifiers = _data['identifiers']
            identifiers.pop(field, None)
            if val:
                identifiers[field] = val
        elif field == 'identifiers':
            if not val:
                val = copy.copy(NULL_VALUES.get('identifiers', None))
            self.set_identifiers(val)
        elif field == 'language':
            # 'language' is a view over 'languages'; 'und' means undefined
            langs = []
            if val and val.lower() != 'und':
                langs = [val]
            _data['languages'] = langs
        elif field in _data['user_metadata']:
            # Custom column: store the value and the optional extra
            _data['user_metadata'][field]['#value#'] = val
            _data['user_metadata'][field]['#extra#'] = extra
        else:
            # You are allowed to stick arbitrary attributes onto this object as
            # long as they don't conflict with global or user metadata names
            # Don't abuse this privilege
            self.__dict__[field] = val
|
||||
|
||||
    def __iter__(self):
        # Iterate over the standard field names stored in _data
        return iter(object.__getattribute__(self, '_data'))

    def has_key(self, key):
        # dict-style membership test against the stored fields
        return key in object.__getattribute__(self, '_data')
|
||||
|
||||
    def deepcopy(self, class_generator=lambda : Metadata(None)):
        ''' Do not use this method unless you know what you are doing, if you
        want to create a simple clone of this object, use :meth:`deepcopy_metadata`
        instead. Class_generator must be a function that returns an instance
        of Metadata or a subclass of it.'''
        m = class_generator()
        if not isinstance(m, Metadata):
            return None
        # Copy the instance __dict__ wholesale (includes _data and any
        # ad-hoc attributes stuck onto this object)
        object.__setattr__(m, '__dict__', copy.deepcopy(self.__dict__))
        return m

    def deepcopy_metadata(self):
        # Clone only the metadata (_data); ad-hoc attributes are not copied
        m = Metadata(None)
        object.__setattr__(m, '_data', copy.deepcopy(object.__getattribute__(self, '_data')))
        return m
|
||||
|
||||
def get(self, field, default=None):
|
||||
try:
|
||||
return self.__getattribute__(field)
|
||||
except AttributeError:
|
||||
return default
|
||||
|
||||
def get_extra(self, field, default=None):
|
||||
_data = object.__getattribute__(self, '_data')
|
||||
if field in _data['user_metadata']:
|
||||
try:
|
||||
return _data['user_metadata'][field]['#extra#']
|
||||
except:
|
||||
return default
|
||||
raise AttributeError(
|
||||
'Metadata object has no attribute named: '+ repr(field))
|
||||
|
||||
    def set(self, field, val, extra=None):
        # Explicit setter; *extra* carries e.g. a custom series index
        self.__setattr__(field, val, extra)

    def get_identifiers(self):
        '''
        Return a copy of the identifiers dictionary.
        The dict is small, and the penalty for using a reference where a copy is
        needed is large. Also, we don't want any manipulations of the returned
        dict to show up in the book.
        '''
        ans = object.__getattribute__(self,
            '_data')['identifiers']
        if not ans:
            ans = {}
        return copy.deepcopy(ans)

    def _clean_identifier(self, typ, val):
        # Normalize identifier type/value via the module-level ck()/cv()
        if typ:
            typ = ck(typ)
        if val:
            val = cv(val)
        return typ, val

    def set_identifiers(self, identifiers):
        '''
        Set all identifiers. Note that if you previously set ISBN, calling
        this method will delete it.
        '''
        # Drop entries with an empty key or value, normalizing the rest
        cleaned = {ck(k):cv(v) for k, v in iteritems(identifiers) if k and v}
        object.__getattribute__(self, '_data')['identifiers'] = cleaned

    def set_identifier(self, typ, val):
        'If val is empty, deletes identifier of type typ'
        typ, val = self._clean_identifier(typ, val)
        if not typ:
            return
        identifiers = object.__getattribute__(self,
            '_data')['identifiers']

        identifiers.pop(typ, None)
        if val:
            identifiers[typ] = val

    def has_identifier(self, typ):
        # True if an identifier of type *typ* (e.g. 'isbn') is present
        identifiers = object.__getattribute__(self,
            '_data')['identifiers']
        return typ in identifiers
|
||||
|
||||
    # field-oriented interface. Intended to be the same as in LibraryDatabase

    def standard_field_keys(self):
        '''
        return a list of all possible keys, even if this book doesn't have them
        '''
        return STANDARD_METADATA_FIELDS

    def custom_field_keys(self):
        '''
        return a list of the custom fields in this book
        '''
        return iter(object.__getattribute__(self, '_data')['user_metadata'])

    def all_field_keys(self):
        '''
        All field keys known by this instance, even if their value is None
        '''
        _data = object.__getattribute__(self, '_data')
        return frozenset(ALL_METADATA_FIELDS.union(frozenset(_data['user_metadata'])))

    def metadata_for_field(self, key):
        '''
        return metadata describing a standard or custom field.
        '''
        if key not in self.custom_field_keys():
            return self.get_standard_metadata(key, make_copy=False)
        return self.get_user_metadata(key, make_copy=False)
|
||||
|
||||
    def all_non_none_fields(self):
        '''
        Return a dictionary containing all non-None metadata fields, including
        the custom ones.
        '''
        result = {}
        _data = object.__getattribute__(self, '_data')
        for attr in STANDARD_METADATA_FIELDS:
            v = _data.get(attr, None)
            if v is not None:
                result[attr] = v
        # separate these because it uses the self.get(), not _data.get()
        for attr in TOP_LEVEL_IDENTIFIERS:
            v = self.get(attr, None)
            if v is not None:
                result[attr] = v
        for attr in _data['user_metadata']:
            v = self.get(attr, None)
            if v is not None:
                result[attr] = v
                # Series columns also expose their index under '<attr>_index'
                if _data['user_metadata'][attr]['datatype'] == 'series':
                    result[attr+'_index'] = _data['user_metadata'][attr]['#extra#']
        return result
|
||||
|
||||
# End of field-oriented interface
|
||||
|
||||
# Extended interfaces. These permit one to get copies of metadata dictionaries, and to
|
||||
# get and set custom field metadata
|
||||
|
||||
def get_standard_metadata(self, field, make_copy):
|
||||
'''
|
||||
return field metadata from the field if it is there. Otherwise return
|
||||
None. field is the key name, not the label. Return a copy if requested,
|
||||
just in case the user wants to change values in the dict.
|
||||
'''
|
||||
if field in field_metadata and field_metadata[field]['kind'] == 'field':
|
||||
if make_copy:
|
||||
return copy.deepcopy(field_metadata[field])
|
||||
return field_metadata[field]
|
||||
return None
|
||||
|
||||
def get_all_standard_metadata(self, make_copy):
|
||||
'''
|
||||
return a dict containing all the standard field metadata associated with
|
||||
the book.
|
||||
'''
|
||||
if not make_copy:
|
||||
return field_metadata
|
||||
res = {}
|
||||
for k in field_metadata:
|
||||
if field_metadata[k]['kind'] == 'field':
|
||||
res[k] = copy.deepcopy(field_metadata[k])
|
||||
return res
|
||||
|
||||
def get_all_user_metadata(self, make_copy):
|
||||
'''
|
||||
return a dict containing all the custom field metadata associated with
|
||||
the book.
|
||||
'''
|
||||
_data = object.__getattribute__(self, '_data')
|
||||
user_metadata = _data['user_metadata']
|
||||
if not make_copy:
|
||||
return user_metadata
|
||||
res = {}
|
||||
for k in user_metadata:
|
||||
res[k] = copy.deepcopy(user_metadata[k])
|
||||
return res
|
||||
|
||||
def get_user_metadata(self, field, make_copy):
|
||||
'''
|
||||
return field metadata from the object if it is there. Otherwise return
|
||||
None. field is the key name, not the label. Return a copy if requested,
|
||||
just in case the user wants to change values in the dict.
|
||||
'''
|
||||
_data = object.__getattribute__(self, '_data')
|
||||
_data = _data['user_metadata']
|
||||
if field in _data:
|
||||
if make_copy:
|
||||
return copy.deepcopy(_data[field])
|
||||
return _data[field]
|
||||
return None
|
||||
|
||||
    def set_all_user_metadata(self, metadata):
        '''
        store custom field metadata into the object. Field is the key name
        not the label
        '''
        if metadata is None:
            # A None argument indicates a caller bug: dump the stack for
            # debugging, but do not raise
            traceback.print_stack()
            return

        um = {}
        for key, meta in iteritems(metadata):
            m = meta.copy()
            # Ensure every column has a '#value#' slot with the correct
            # empty value for its datatype
            if '#value#' not in m:
                if m['datatype'] == 'text' and m['is_multiple']:
                    m['#value#'] = []
                else:
                    m['#value#'] = None
            um[key] = m
        _data = object.__getattribute__(self, '_data')
        _data['user_metadata'] = um

    def set_user_metadata(self, field, metadata):
        '''
        store custom field metadata for one column into the object. Field is
        the key name not the label
        '''
        if field is not None:
            if not field.startswith('#'):
                raise AttributeError(
                        'Custom field name %s must begin with \'#\''%repr(field))
            if metadata is None:
                traceback.print_stack()
                return
            m = dict(metadata)
            # Copying the elements should not be necessary. The objects referenced
            # in the dict should not change. Of course, they can be replaced.
            # for k,v in iteritems(metadata):
            #     m[k] = copy.copy(v)
            if '#value#' not in m:
                if m['datatype'] == 'text' and m['is_multiple']:
                    m['#value#'] = []
                else:
                    m['#value#'] = None
            _data = object.__getattribute__(self, '_data')
            _data['user_metadata'][field] = m
|
||||
|
||||
def template_to_attribute(self, other, ops):
|
||||
'''
|
||||
Takes a list [(src,dest), (src,dest)], evaluates the template in the
|
||||
context of other, then copies the result to self[dest]. This is on a
|
||||
best-efforts basis. Some assignments can make no sense.
|
||||
'''
|
||||
if not ops:
|
||||
return
|
||||
from calibre.ebooks.metadata.book.formatter import SafeFormat
|
||||
formatter = SafeFormat()
|
||||
for op in ops:
|
||||
try:
|
||||
src = op[0]
|
||||
dest = op[1]
|
||||
val = formatter.safe_format(src, other, 'PLUGBOARD TEMPLATE ERROR', other)
|
||||
if dest == 'tags':
|
||||
self.set(dest, [f.strip() for f in val.split(',') if f.strip()])
|
||||
elif dest == 'authors':
|
||||
self.set(dest, [f.strip() for f in val.split('&') if f.strip()])
|
||||
else:
|
||||
self.set(dest, val)
|
||||
except:
|
||||
if DEBUG:
|
||||
traceback.print_exc()
|
||||
|
||||
    # Old Metadata API {{{
    def print_all_attributes(self):
        # Debug helper: print every standard and custom field to stdout
        for x in STANDARD_METADATA_FIELDS:
            prints('%s:'%x, getattr(self, x, 'None'))
        for x in self.custom_field_keys():
            meta = self.get_user_metadata(x, make_copy=False)
            if meta is not None:
                prints(x, meta)
        prints('--------------')
|
||||
|
||||
    def smart_update(self, other, replace_metadata=False):
        '''
        Merge the information in `other` into self. In case of conflicts, the information
        in `other` takes precedence, unless the information in `other` is NULL.

        :param other: the Metadata-like object merged into self
        :param replace_metadata: when True, copyable fields are overwritten
            wholesale from `other` instead of being merged field by field
        '''
        def copy_not_none(dest, src, attr):
            # Copy attr from src to dest only when it is neither None nor
            # the canonical null value for that field
            v = getattr(src, attr, None)
            if v not in (None, NULL_VALUES.get(attr, None)):
                setattr(dest, attr, copy.deepcopy(v))

        unknown = _('Unknown')
        # Title/authors get special treatment: an 'Unknown' in other does
        # not clobber real data in self
        if other.title and other.title != unknown:
            self.title = other.title
            if hasattr(other, 'title_sort'):
                self.title_sort = other.title_sort

        if other.authors and (
                other.authors[0] != unknown or (
                    not self.authors or (
                        len(self.authors) == 1 and self.authors[0] == unknown and
                        getattr(self, 'author_sort', None) == unknown
                    )
                )
        ):
            self.authors = list(other.authors)
            if hasattr(other, 'author_sort_map'):
                self.author_sort_map = dict(other.author_sort_map)
            if hasattr(other, 'author_sort'):
                self.author_sort = other.author_sort

        if replace_metadata:
            # SPECIAL_FIELDS = frozenset(['lpath', 'size', 'comments', 'thumbnail'])
            for attr in SC_COPYABLE_FIELDS:
                setattr(self, attr, getattr(other, attr, 1.0 if
                        attr == 'series_index' else None))
            self.tags = other.tags
            self.cover_data = getattr(other, 'cover_data',
                                      NULL_VALUES['cover_data'])
            self.set_all_user_metadata(other.get_all_user_metadata(make_copy=True))
            for x in SC_FIELDS_COPY_NOT_NULL:
                copy_not_none(self, other, x)
            if callable(getattr(other, 'get_identifiers', None)):
                self.set_identifiers(other.get_identifiers())
            # language is handled below
        else:
            for attr in SC_COPYABLE_FIELDS:
                copy_not_none(self, other, attr)
            for x in SC_FIELDS_COPY_NOT_NULL:
                copy_not_none(self, other, x)

            if other.tags:
                # Case-insensitive but case preserving merging
                lotags = [t.lower() for t in other.tags]
                lstags = [t.lower() for t in self.tags]
                ot, st = map(frozenset, (lotags, lstags))
                for t in st.intersection(ot):
                    sidx = lstags.index(t)
                    oidx = lotags.index(t)
                    self.tags[sidx] = other.tags[oidx]
                self.tags += [t for t in other.tags if t.lower() in ot-st]

            if getattr(other, 'cover_data', False):
                # Keep whichever cover has more bytes
                other_cover = other.cover_data[-1]
                self_cover = self.cover_data[-1] if self.cover_data else b''
                if not self_cover:
                    self_cover = b''
                if not other_cover:
                    other_cover = b''
                if len(other_cover) > len(self_cover):
                    self.cover_data = other.cover_data

            if callable(getattr(other, 'custom_field_keys', None)):
                for x in other.custom_field_keys():
                    meta = other.get_user_metadata(x, make_copy=True)
                    if meta is not None:
                        self_tags = self.get(x, [])
                        self.set_user_metadata(x, meta)  # get... did the deepcopy
                        other_tags = other.get(x, [])
                        if meta['datatype'] == 'text' and meta['is_multiple']:
                            # Case-insensitive but case preserving merging
                            lotags = [t.lower() for t in other_tags]
                            try:
                                lstags = [t.lower() for t in self_tags]
                            except TypeError:
                                # Happens if x is not a text, is_multiple field
                                # on self
                                lstags = []
                                self_tags = []
                            ot, st = map(frozenset, (lotags, lstags))
                            for t in st.intersection(ot):
                                sidx = lstags.index(t)
                                oidx = lotags.index(t)
                                self_tags[sidx] = other_tags[oidx]
                            self_tags += [t for t in other_tags if t.lower() in ot-st]
                            setattr(self, x, self_tags)

            # Keep the longer of the two comments
            my_comments = getattr(self, 'comments', '')
            other_comments = getattr(other, 'comments', '')
            if not my_comments:
                my_comments = ''
            if not other_comments:
                other_comments = ''
            if len(other_comments.strip()) > len(my_comments.strip()):
                self.comments = other_comments

            # Copy all the non-none identifiers
            if callable(getattr(other, 'get_identifiers', None)):
                d = self.get_identifiers()
                s = other.get_identifiers()
                d.update([v for v in iteritems(s) if v[1] is not None])
                self.set_identifiers(d)
            else:
                # other structure not Metadata. Copy the top-level identifiers
                for attr in TOP_LEVEL_IDENTIFIERS:
                    copy_not_none(self, other, attr)

        other_lang = getattr(other, 'languages', [])
        if other_lang and other_lang != ['und']:
            self.languages = list(other_lang)
        # A series index without a series makes no sense
        if not getattr(self, 'series', None):
            self.series_index = None
|
||||
|
||||
def format_series_index(self, val=None):
|
||||
from calibre.ebooks.metadata import fmt_sidx
|
||||
v = self.series_index if val is None else val
|
||||
try:
|
||||
x = float(v)
|
||||
except Exception:
|
||||
x = 1
|
||||
return fmt_sidx(x)
|
||||
|
||||
    def authors_from_string(self, raw):
        # Parse an ampersand-separated author string into the authors list
        from calibre.ebooks.metadata import string_to_authors
        self.authors = string_to_authors(raw)

    def format_authors(self):
        # Join the authors list back into a single display string
        from calibre.ebooks.metadata import authors_to_string
        return authors_to_string(self.authors)

    def format_tags(self):
        # Comma-separated tags, sorted with locale-aware collation (sort_key)
        return ', '.join([unicode_type(t) for t in sorted(self.tags, key=sort_key)])

    def format_rating(self, v=None, divide_by=1):
        # Render a rating as a string; divide_by rescales the stored value
        if v is None:
            if self.rating is not None:
                return unicode_type(self.rating/divide_by)
            return 'None'
        return unicode_type(v/divide_by)
|
||||
|
||||
    def format_field(self, key, series_with_index=True):
        '''
        Returns the tuple (display_name, formatted_value)
        '''
        # Delegate to the extended variant and drop the extra items
        name, val, ign, ign = self.format_field_extended(key, series_with_index)
        return (name, val)
|
||||
|
||||
    def format_field_extended(self, key, series_with_index=True):
        from calibre.ebooks.metadata import authors_to_string
        '''
        returns the tuple (display_name, formatted_value, original_value,
        field_metadata)
        '''
        from calibre.utils.date import format_date

        # Handle custom series index
        if key.startswith('#') and key.endswith('_index'):
            tkey = key[:-6]  # strip the _index
            cmeta = self.get_user_metadata(tkey, make_copy=False)
            if cmeta and cmeta['datatype'] == 'series':
                if self.get(tkey):
                    res = self.get_extra(tkey)
                    return (unicode_type(cmeta['name']+'_index'),
                            self.format_series_index(res), res, cmeta)
                else:
                    return (unicode_type(cmeta['name']+'_index'), '', '', cmeta)

        # Custom columns: format according to the column's declared datatype
        if key in self.custom_field_keys():
            res = self.get(key, None)  # get evaluates all necessary composites
            cmeta = self.get_user_metadata(key, make_copy=False)
            name = unicode_type(cmeta['name'])
            if res is None or res == '':  # can't check "not res" because of numeric fields
                return (name, res, None, None)
            orig_res = res
            datatype = cmeta['datatype']
            if datatype == 'text' and cmeta['is_multiple']:
                res = cmeta['is_multiple']['list_to_ui'].join(res)
            elif datatype == 'series' and series_with_index:
                if self.get_extra(key) is not None:
                    res = res + \
                        ' [%s]'%self.format_series_index(val=self.get_extra(key))
            elif datatype == 'datetime':
                res = format_date(res, cmeta['display'].get('date_format','dd MMM yyyy'))
            elif datatype == 'bool':
                res = _('Yes') if res else _('No')
            elif datatype == 'rating':
                # stored ratings are halved for display
                res = '%.2g'%(res/2)
            elif datatype in ['int', 'float']:
                try:
                    fmt = cmeta['display'].get('number_format', None)
                    res = fmt.format(res)
                except:
                    pass
            return (name, unicode_type(res), orig_res, cmeta)

        # convert top-level ids into their value
        if key in TOP_LEVEL_IDENTIFIERS:
            fmeta = field_metadata['identifiers']
            name = key
            res = self.get(key, None)
            return (name, res, res, fmeta)

        # Translate aliases into the standard field name
        fmkey = field_metadata.search_term_to_field_key(key)
        if fmkey in field_metadata and field_metadata[fmkey]['kind'] == 'field':
            res = self.get(key, None)
            fmeta = field_metadata[fmkey]
            name = unicode_type(fmeta['name'])
            if res is None or res == '':
                return (name, res, None, None)
            orig_res = res
            name = unicode_type(fmeta['name'])
            datatype = fmeta['datatype']
            if key == 'authors':
                res = authors_to_string(res)
            elif key == 'series_index':
                res = self.format_series_index(res)
            elif datatype == 'text' and fmeta['is_multiple']:
                if isinstance(res, dict):
                    res = [k + ':' + v for k,v in res.items()]
                res = fmeta['is_multiple']['list_to_ui'].join(sorted(filter(None, res), key=sort_key))
            elif datatype == 'series' and series_with_index:
                res = res + ' [%s]'%self.format_series_index()
            elif datatype == 'datetime':
                res = format_date(res, fmeta['display'].get('date_format','dd MMM yyyy'))
            elif datatype == 'rating':
                res = '%.2g'%(res/2)
            elif key == 'size':
                res = human_readable(res)
            return (name, unicode_type(res), orig_res, fmeta)

        # Unknown field
        return (None, None, None, None)
|
||||
|
||||
    def __unicode__representation__(self):
        '''
        A string representation of this object, suitable for printing to
        console
        '''
        from calibre.utils.date import isoformat
        from calibre.ebooks.metadata import authors_to_string
        ans = []

        def fmt(x, y):
            # Append one left-aligned 'Label : value' line
            ans.append('%-20s: %s'%(unicode_type(x), unicode_type(y)))

        fmt('Title', self.title)
        if self.title_sort:
            fmt('Title sort', self.title_sort)
        if self.authors:
            fmt('Author(s)', authors_to_string(self.authors) +
                ((' [' + self.author_sort + ']')
                 if self.author_sort and self.author_sort != _('Unknown') else ''))
        if self.publisher:
            fmt('Publisher', self.publisher)
        if getattr(self, 'book_producer', False):
            fmt('Book Producer', self.book_producer)
        if self.tags:
            fmt('Tags', ', '.join([unicode_type(t) for t in self.tags]))
        if self.series:
            fmt('Series', self.series + ' #%s'%self.format_series_index())
        if not self.is_null('languages'):
            fmt('Languages', ', '.join(self.languages))
        if self.rating is not None:
            fmt('Rating', ('%.2g'%(float(self.rating)/2)) if self.rating
                    else '')
        if self.timestamp is not None:
            fmt('Timestamp', isoformat(self.timestamp))
        if self.pubdate is not None:
            fmt('Published', isoformat(self.pubdate))
        if self.rights is not None:
            fmt('Rights', unicode_type(self.rights))
        if self.identifiers:
            fmt('Identifiers', ', '.join(['%s:%s'%(k, v) for k, v in
                iteritems(self.identifiers)]))
        if self.comments:
            fmt('Comments', self.comments)

        # Custom columns, formatted via format_field()
        for key in self.custom_field_keys():
            val = self.get(key, None)
            if val:
                (name, val) = self.format_field(key)
                fmt(name, unicode_type(val))
        return '\n'.join(ans)
|
||||
|
||||
    def to_html(self):
        '''
        A HTML representation of this object.
        '''
        from calibre.ebooks.metadata import authors_to_string
        from calibre.utils.date import isoformat
        # Build (label, value) pairs, then render them as table rows below
        ans = [(_('Title'), unicode_type(self.title))]
        ans += [(_('Author(s)'), (authors_to_string(self.authors) if self.authors else _('Unknown')))]
        ans += [(_('Publisher'), unicode_type(self.publisher))]
        ans += [(_('Producer'), unicode_type(self.book_producer))]
        ans += [(_('Comments'), unicode_type(self.comments))]
        ans += [('ISBN', unicode_type(self.isbn))]
        ans += [(_('Tags'), ', '.join([unicode_type(t) for t in self.tags]))]
        if self.series:
            ans += [(_('Series'), unicode_type(self.series) + ' #%s'%self.format_series_index())]
        ans += [(_('Languages'), ', '.join(self.languages))]
        if self.timestamp is not None:
            ans += [(_('Timestamp'), unicode_type(isoformat(self.timestamp, as_utc=False, sep=' ')))]
        if self.pubdate is not None:
            ans += [(_('Published'), unicode_type(isoformat(self.pubdate, as_utc=False, sep=' ')))]
        if self.rights is not None:
            ans += [(_('Rights'), unicode_type(self.rights))]
        for key in self.custom_field_keys():
            val = self.get(key, None)
            if val:
                (name, val) = self.format_field(key)
                ans += [(name, val)]
        # Render label/value pairs as rows of a two-column HTML table
        for i, x in enumerate(ans):
            ans[i] = '<tr><td><b>%s</b></td><td>%s</td></tr>'%x
        return '<table>%s</table>'%'\n'.join(ans)
|
||||
|
||||
    # Under py3 the text representation is __str__; under py2 it is
    # __unicode__, with __str__ returning UTF-8 encoded bytes
    if ispy3:
        __str__ = __unicode__representation__
    else:
        __unicode__ = __unicode__representation__

        def __str__(self):
            return self.__unicode__().encode('utf-8')

    def __nonzero__(self):
        # Truthy when the book carries any identifying information
        return bool(self.title or self.author or self.comments or self.tags)
    __bool__ = __nonzero__

    # }}}
|
||||
|
||||
|
||||
def field_from_string(field, raw, field_metadata):
    ''' Parse the string raw to return an object that is suitable for calling
    set() on a Metadata object. '''
    dt = field_metadata['datatype']
    if dt == 'int':
        return int(raw)
    if dt == 'float':
        return float(raw)
    if dt == 'rating':
        # Ratings are stored at twice their displayed value
        return float(raw) * 2
    if dt == 'datetime':
        from calibre.utils.date import parse_only_date
        return parse_only_date(raw)
    if dt == 'bool':
        lraw = raw.lower()
        if lraw in {'true', 'yes', 'y'}:
            return True
        if lraw in {'false', 'no', 'n'}:
            return False
        raise ValueError('Unknown value for %s: %s'%(field, raw))
    if dt == 'text':
        ism = field_metadata['is_multiple']
        if ism:
            parts = [x.strip() for x in raw.split(ism['ui_to_list'])]
            if field == 'identifiers':
                return {p.partition(':')[0]: p.partition(':')[-1] for p in parts}
            if field == 'languages':
                from calibre.utils.localization import canonicalize_lang
                return [lang for lang in (canonicalize_lang(x) for x in parts) if lang]
            return parts
    # Unrecognized datatype (or single-valued text): pass the raw string through
    return raw
|
||||
46
ebook_converter/ebooks/metadata/book/formatter.py
Normal file
46
ebook_converter/ebooks/metadata/book/formatter.py
Normal file
@@ -0,0 +1,46 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=utf-8
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
from calibre.ebooks.metadata.book import TOP_LEVEL_IDENTIFIERS, ALL_METADATA_FIELDS
|
||||
|
||||
from calibre.utils.formatter import TemplateFormatter
|
||||
|
||||
|
||||
class SafeFormat(TemplateFormatter):
    '''
    A TemplateFormatter that resolves field names against a Metadata object
    (self.book) and returns the empty string rather than failing on
    missing or empty values.
    '''

    def __init__(self):
        TemplateFormatter.__init__(self)

    def get_value(self, orig_key, args, kwargs):
        # Resolve *orig_key* to a formatted field value from self.book
        if not orig_key:
            return ''
        key = orig_key = orig_key.lower()
        if (key != 'title_sort' and key not in TOP_LEVEL_IDENTIFIERS and
                key not in ALL_METADATA_FIELDS):
            # Not a standard field: try treating it as a search-term alias
            from calibre.ebooks.metadata.book.base import field_metadata
            key = field_metadata.search_term_to_field_key(key)
            if key is None or (self.book and
                               key not in self.book.all_field_keys()):
                # Last resort: a raw attribute on the book object
                if hasattr(self.book, orig_key):
                    key = orig_key
                else:
                    raise ValueError(_('Value: unknown field ') + orig_key)
        try:
            b = self.book.get_user_metadata(key, False)
        except:
            b = None
        # Numeric custom columns with no value render as the empty string
        if b and b['datatype'] in {'int', 'float'} and self.book.get(key, None) is None:
            v = ''
        else:
            v = self.book.format_field(key, series_with_index=False)[1]
        if v is None:
            return ''
        if v == '':
            return ''
        return v
|
||||
|
||||
|
||||
218
ebook_converter/ebooks/metadata/book/json_codec.py
Normal file
218
ebook_converter/ebooks/metadata/book/json_codec.py
Normal file
@@ -0,0 +1,218 @@
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
'''
|
||||
Created on 4 Jun 2010
|
||||
|
||||
@author: charles
|
||||
'''
|
||||
|
||||
import json, traceback
|
||||
from datetime import datetime, time
|
||||
|
||||
from calibre.ebooks.metadata.book import SERIALIZABLE_FIELDS
|
||||
from calibre.constants import filesystem_encoding, preferred_encoding
|
||||
from calibre.library.field_metadata import FieldMetadata
|
||||
from calibre import isbytestring
|
||||
from polyglot.builtins import iteritems, itervalues, as_bytes
|
||||
from polyglot.binary import as_base64_unicode, from_base64_bytes
|
||||
|
||||
# Translate datetimes to and from strings. The string form is the datetime in
|
||||
# UTC. The returned date is also UTC
|
||||
|
||||
|
||||
def string_to_datetime(src):
    """Parse an ISO-8601 string (as produced by datetime_to_string) back
    into a UTC datetime, or None for the literal "None" / unparseable input."""
    from calibre.utils.iso8601 import parse_iso8601
    if src != "None":
        try:
            return parse_iso8601(src)
        except Exception:
            pass
    return None
||||
|
||||
|
||||
def datetime_to_string(dateval):
    """Serialize a date/datetime to an ISO string; None and undefined dates
    become the literal string "None"."""
    from calibre.utils.date import isoformat, UNDEFINED_DATE, local_tz
    if dateval is None:
        return "None"
    if not isinstance(dateval, datetime):
        # Promote a bare date to a datetime at midnight
        dateval = datetime.combine(dateval, time())
    if hasattr(dateval, 'tzinfo') and dateval.tzinfo is None:
        # Naive datetimes are assumed to be in the local timezone
        dateval = dateval.replace(tzinfo=local_tz)
    if dateval <= UNDEFINED_DATE:
        return "None"
    return isoformat(dateval)
|
||||
|
||||
|
||||
def encode_thumbnail(thumbnail):
    '''
    Encode the image part of a thumbnail, then return the 3 part tuple
    (width, height, base64-encoded image data), or None on failure.
    '''
    from calibre.utils.imghdr import identify
    if thumbnail is None:
        return None
    if not isinstance(thumbnail, (tuple, list)):
        # Raw image data: probe its dimensions to build the tuple
        try:
            width, height = identify(as_bytes(thumbnail))[1:]
            if width < 0 or height < 0:
                return None
            thumbnail = (width, height, thumbnail)
        except Exception:
            return None
    return (thumbnail[0], thumbnail[1], as_base64_unicode(thumbnail[2]))
|
||||
|
||||
|
||||
def decode_thumbnail(tup):
    '''
    Decode an encoded thumbnail into its 3 component parts
    (width, height, raw image bytes). Returns None for None input.
    '''
    if tup is None:
        return None
    return (tup[0], tup[1], from_base64_bytes(tup[2]))
|
||||
|
||||
|
||||
def object_to_unicode(obj, enc=preferred_encoding):
    """Recursively decode byte strings inside *obj* (scalars, lists/tuples
    and dicts) to unicode using *enc*, replacing undecodable bytes."""

    def dec(x):
        return x.decode(enc, 'replace')

    if isbytestring(obj):
        return dec(obj)
    if isinstance(obj, (list, tuple)):
        return [dec(x) if isbytestring(x) else object_to_unicode(x) for x in obj]
    if isinstance(obj, dict):
        ans = {}
        for k, v in obj.items():
            k = object_to_unicode(k)
            v = object_to_unicode(v)
            ans[k] = v
        return ans
    return obj
|
||||
|
||||
|
||||
def encode_is_multiple(fm):
    """Mutate field-metadata dict *fm* for serialization: stash the modern
    is_multiple dict in 'is_multiple2' and set 'is_multiple' to the legacy
    single-character separator."""
    if fm.get('is_multiple', None):
        # migrate is_multiple back to a character
        fm['is_multiple2'] = fm.get('is_multiple', {})
        dt = fm.get('datatype', None)
        if dt == 'composite':
            fm['is_multiple'] = ','
        else:
            fm['is_multiple'] = '|'
    else:
        fm['is_multiple'] = None
        fm['is_multiple2'] = {}
|
||||
|
||||
|
||||
def decode_is_multiple(fm):
    """Mutate field-metadata dict *fm* after deserialization so that
    'is_multiple' is always a separator dict, migrating from either the
    'is_multiple2' stash or the legacy single-character form."""
    im = fm.get('is_multiple2', None)
    if im:
        fm['is_multiple'] = im
        del fm['is_multiple2']
    else:
        # Must migrate the is_multiple from char to dict
        im = fm.get('is_multiple', {})
        if im:
            dt = fm.get('datatype', None)
            if dt == 'composite':
                im = {'cache_to_list': ',', 'ui_to_list': ',',
                      'list_to_ui': ', '}
            elif fm.get('display', {}).get('is_names', False):
                im = {'cache_to_list': '|', 'ui_to_list': '&',
                      'list_to_ui': ', '}
            else:
                im = {'cache_to_list': '|', 'ui_to_list': ',',
                      'list_to_ui': ', '}
        elif im is None:
            im = {}
        fm['is_multiple'] = im
|
||||
|
||||
|
||||
class JsonCodec(object):
    """Serialize and deserialize lists of book metadata objects to/from JSON."""

    def __init__(self, field_metadata=None):
        self.field_metadata = field_metadata or FieldMetadata()

    def encode_to_file(self, file_, booklist):
        """Write the JSON encoding of *booklist* to binary file *file_*."""
        data = json.dumps(self.encode_booklist_metadata(booklist), indent=2)
        if not isinstance(data, bytes):
            data = data.encode('utf-8')
        file_.write(data)

    def encode_booklist_metadata(self, booklist):
        result = []
        for book in booklist:
            result.append(self.encode_book_metadata(book))
        return result

    def encode_book_metadata(self, book):
        result = {}
        for key in SERIALIZABLE_FIELDS:
            result[key] = self.encode_metadata_attr(book, key)
        return result

    def encode_metadata_attr(self, book, key):
        """Return the JSON-safe value of attribute *key* on *book*."""
        if key == 'user_metadata':
            meta = book.get_all_user_metadata(make_copy=True)
            for fm in itervalues(meta):
                if fm['datatype'] == 'datetime':
                    fm['#value#'] = datetime_to_string(fm['#value#'])
                encode_is_multiple(fm)
            return meta
        if key in self.field_metadata:
            datatype = self.field_metadata[key]['datatype']
        else:
            datatype = None
        value = book.get(key)
        if key == 'thumbnail':
            return encode_thumbnail(value)
        elif isbytestring(value):  # str includes bytes
            enc = filesystem_encoding if key == 'lpath' else preferred_encoding
            return object_to_unicode(value, enc=enc)
        elif datatype == 'datetime':
            return datetime_to_string(value)
        else:
            return object_to_unicode(value)

    def decode_from_file(self, file_, booklist, book_class, prefix):
        """Decode JSON from *file_*, appending book_class instances to
        *booklist*. Failures are logged, never raised."""
        js = []
        try:
            # NOTE: json.load()'s ``encoding`` argument was removed in
            # Python 3.9; file_ must yield text or UTF-8/16/32 bytes.
            js = json.load(file_)
            for item in js:
                entry = self.raw_to_book(item, book_class, prefix)
                if entry is not None:
                    booklist.append(entry)
        except Exception:
            print('exception during JSON decode_from_file')
            traceback.print_exc()

    def raw_to_book(self, json_book, book_class, prefix):
        """Convert one decoded JSON dict into a book object, or None on error."""
        try:
            book = book_class(prefix, json_book.get('lpath', None))
            for key, val in iteritems(json_book):
                meta = self.decode_metadata(key, val)
                if key == 'user_metadata':
                    book.set_all_user_metadata(meta)
                else:
                    if key == 'classifiers':
                        key = 'identifiers'  # migrate legacy field name
                    setattr(book, key, meta)
            return book
        except Exception:
            print('exception during JSON decoding')
            traceback.print_exc()

    def decode_metadata(self, key, value):
        """Reverse the encoding performed by encode_metadata_attr()."""
        if key == 'classifiers':
            key = 'identifiers'  # migrate legacy field name
        if key == 'user_metadata':
            for fm in itervalues(value):
                if fm['datatype'] == 'datetime':
                    fm['#value#'] = string_to_datetime(fm['#value#'])
                decode_is_multiple(fm)
            return value
        elif key in self.field_metadata:
            if self.field_metadata[key]['datatype'] == 'datetime':
                return string_to_datetime(value)
        if key == 'thumbnail':
            return decode_thumbnail(value)
        return value
|
||||
412
ebook_converter/ebooks/metadata/html.py
Normal file
412
ebook_converter/ebooks/metadata/html.py
Normal file
@@ -0,0 +1,412 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=utf-8
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
'''
|
||||
Try to read metadata from an HTML file.
|
||||
'''
|
||||
|
||||
import re
|
||||
import unittest
|
||||
|
||||
from collections import defaultdict
|
||||
from html5_parser import parse
|
||||
from lxml.etree import Comment
|
||||
|
||||
from calibre.ebooks.metadata import string_to_authors, authors_to_string
|
||||
from calibre.ebooks.metadata.book.base import Metadata
|
||||
from calibre.ebooks.chardet import xml_to_unicode
|
||||
from calibre import replace_entities, isbytestring
|
||||
from calibre.utils.date import parse_date, is_date_undefined
|
||||
from polyglot.builtins import iteritems
|
||||
|
||||
|
||||
def get_metadata(stream):
    """Read all of *stream* and extract metadata from the HTML it contains."""
    src = stream.read()
    return get_metadata_(src)
|
||||
|
||||
|
||||
# Metadata field -> name used in <!-- NAME="value" --> comment tags
COMMENT_NAMES = {
    'title': 'TITLE',
    'authors': 'AUTHOR',
    'publisher': 'PUBLISHER',
    'isbn': 'ISBN',
    'languages': 'LANGUAGE',
    'pubdate': 'PUBDATE',
    'timestamp': 'TIMESTAMP',
    'series': 'SERIES',
    'series_index': 'SERIESNUMBER',
    'rating': 'RATING',
    'comments': 'COMMENTS',
    'tags': 'TAGS',
}

# Metadata field -> recognized <meta name="..."> values (lowercase)
META_NAMES = {
    'title': ('dc.title', 'dcterms.title', 'title'),
    'authors': ('author', 'dc.creator.aut', 'dcterms.creator.aut', 'dc.creator'),
    'publisher': ('publisher', 'dc.publisher', 'dcterms.publisher'),
    'isbn': ('isbn',),
    'languages': ('dc.language', 'dcterms.language'),
    'pubdate': ('pubdate', 'date of publication', 'dc.date.published', 'dc.date.publication', 'dc.date.issued', 'dcterms.issued'),
    'timestamp': ('timestamp', 'date of creation', 'dc.date.created', 'dc.date.creation', 'dcterms.created'),
    'series': ('series',),
    'series_index': ('seriesnumber', 'series_index', 'series.index'),
    'rating': ('rating',),
    'comments': ('comments', 'dc.description'),
    'tags': ('tags',),
}

# Reverse maps: recognized external name -> canonical metadata field
rmap_comment = {v: k for k, v in iteritems(COMMENT_NAMES)}
rmap_meta = {v: k for k, l in iteritems(META_NAMES) for v in l}


# Extract an HTML attribute value, supports both single and double quotes and
# single quotes inside double quotes and vice versa.
attr_pat = r'''(?:(?P<sq>')|(?P<dq>"))(?P<content>(?(sq)[^']+|[^"]+))(?(sq)'|")'''
|
||||
|
||||
|
||||
def handle_comment(data, comment_tags):
    """Parse NAME="value" pairs from HTML comment text *data*, appending
    recognized fields (see COMMENT_NAMES) to the *comment_tags* multi-dict."""
    if not hasattr(handle_comment, 'pat'):
        # Compile once and cache the pattern on the function object
        handle_comment.pat = re.compile(r'''(?P<name>\S+)\s*=\s*%s''' % attr_pat)
    for match in handle_comment.pat.finditer(data):
        x = match.group('name')
        field = None
        try:
            field = rmap_comment[x]
        except KeyError:
            pass
        if field:
            comment_tags[field].append(replace_entities(match.group('content')))
|
||||
|
||||
|
||||
def parse_metadata(src):
    """Parse HTML text *src* and collect metadata candidates.

    Returns (comment_tags, meta_tags, meta_tag_ids, title): multi-dicts of
    values from comment tags, <meta> tags, identifier <meta> tags keyed by
    scheme, plus the first non-empty <title> text.
    """
    root = parse(src)
    comment_tags = defaultdict(list)
    meta_tags = defaultdict(list)
    meta_tag_ids = defaultdict(list)
    title = ''
    identifier_pat = re.compile(r'(?:dc|dcterms)[.:]identifier(?:\.|$)', flags=re.IGNORECASE)
    id_pat2 = re.compile(r'(?:dc|dcterms)[.:]identifier$', flags=re.IGNORECASE)

    for comment in root.iterdescendants(tag=Comment):
        if comment.text:
            handle_comment(comment.text, comment_tags)

    for q in root.iterdescendants(tag='title'):
        if q.text:
            title = q.text
            break

    for meta in root.iterdescendants(tag='meta'):
        name, content = meta.get('name'), meta.get('content')
        if not name or not content:
            continue
        if identifier_pat.match(name) is not None:
            # An identifier; scheme comes from the scheme= attribute or the
            # third dotted component of the name, but never both.
            scheme = None
            if id_pat2.match(name) is not None:
                scheme = meta.get('scheme')
            else:
                elements = re.split(r'[.:]', name)
                if len(elements) == 3 and not meta.get('scheme'):
                    scheme = elements[2].strip()
            if scheme:
                meta_tag_ids[scheme.lower()].append(content)
        else:
            x = name.lower()
            field = None
            try:
                field = rmap_meta[x]
            except KeyError:
                try:
                    # Also accept colon-separated variants, e.g. dc:title
                    field = rmap_meta[x.replace(':', '.')]
                except KeyError:
                    pass
            if field:
                meta_tags[field].append(content)

    return comment_tags, meta_tags, meta_tag_ids, title
|
||||
|
||||
|
||||
def get_metadata_(src, encoding=None):
    """Build a Metadata object from HTML source *src* (text or bytes).

    Comment tags take precedence over <meta> tags; the <title> tag is the
    last resort for the title.
    """
    # Meta data definitions as in
    # https://www.mobileread.com/forums/showpost.php?p=712544&postcount=9

    if isbytestring(src):
        if not encoding:
            src = xml_to_unicode(src)[0]
        else:
            src = src.decode(encoding, 'replace')
    src = src[:150000]  # Searching shouldn't take too long
    comment_tags, meta_tags, meta_tag_ids, title_tag = parse_metadata(src)

    def get_all(field):
        # Comment-tag values win over meta-tag values
        ans = comment_tags.get(field, meta_tags.get(field, None))
        if ans:
            ans = [x.strip() for x in ans if x.strip()]
        if not ans:
            ans = None
        return ans

    def get(field):
        ans = get_all(field)
        if ans:
            ans = ans[0]
        return ans

    # Title
    title = get('title') or title_tag.strip() or _('Unknown')

    # Author
    authors = authors_to_string(get_all('authors')) or _('Unknown')

    # Create MetaInformation with Title and Author
    mi = Metadata(title, string_to_authors(authors))

    # Single-value text fields
    for field in ('publisher', 'isbn'):
        val = get(field)
        if val:
            setattr(mi, field, val)

    # Multi-value text fields
    for field in ('languages',):
        val = get_all(field)
        if val:
            setattr(mi, field, val)

    # HTML fields: escape markup-significant characters
    # (the scraped original showed this chain with its entities decoded,
    # which made it a no-op; restored to real HTML escaping)
    for field in ('comments',):
        val = get(field)
        if val:
            setattr(mi, field, val.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;').replace('"', '&quot;').replace("'", '&#39;'))

    # Date fields
    for field in ('pubdate', 'timestamp'):
        try:
            val = parse_date(get(field))
        except Exception:
            pass
        else:
            if not is_date_undefined(val):
                setattr(mi, field, val)

    # SERIES
    series = get('series')
    if series:
        # A trailing [n] on the series name is the series index
        pat = re.compile(r'\[([.0-9]+)\]$')
        match = pat.search(series)
        series_index = None
        if match is not None:
            try:
                series_index = float(match.group(1))
            except Exception:
                pass
            series = series.replace(match.group(), '').strip()
        mi.series = series
        if series_index is None:
            series_index = get('series_index')
            try:
                series_index = float(series_index)
            except Exception:
                pass
        if series_index is not None:
            mi.series_index = series_index

    # RATING
    rating = get('rating')
    if rating:
        try:
            mi.rating = float(rating)
            if mi.rating < 0:
                mi.rating = 0
            if mi.rating > 10:
                mi.rating = 0
        except Exception:
            pass

    # TAGS
    tags = get_all('tags')
    if tags:
        tags = [x.strip() for s in tags for x in s.split(',') if x.strip()]
        if tags:
            mi.tags = tags

    # IDENTIFIERS
    for (k, v) in iteritems(meta_tag_ids):
        v = [x.strip() for x in v if x.strip()]
        if v:
            mi.set_identifier(k, v[0])

    return mi
|
||||
|
||||
|
||||
class MetadataHtmlTest(unittest.TestCase):
    """Exercise get_metadata() against synthetic HTML documents."""

    def compare_metadata(self, meta_a, meta_b):
        # Compare only the fields the HTML reader is expected to populate
        for attr in (
            'title', 'authors', 'publisher', 'isbn', 'languages', 'pubdate', 'timestamp', 'series',
            'series_index', 'rating', 'comments', 'tags', 'identifiers'
        ):
            self.assertEqual(getattr(meta_a, attr), getattr(meta_b, attr))

    def get_stream(self, test):
        from io import BytesIO

        # NOTE(review): the scraped original carried raw non-ASCII bytes and
        # once-decoded entities inside these byte literals; they are
        # reconstructed here as HTML entities (&#9400; = circled C,
        # &hearts; = heart) so the literals stay ASCII -- verify against the
        # upstream source.
        raw = b'''\
<html>
    <head>
'''

        if test in {'title', 'meta_single', 'meta_multi', 'comment_single', 'comment_multi'}:
            raw += b'''\
        <title>A Title Tag &amp; Title &#9400;</title>
'''

        if test in {'meta_single', 'meta_multi', 'comment_single', 'comment_multi'}:
            raw += b'''\
        <meta name="dc:title" content="A Meta Tag &amp; Title &#9400;" />
        <meta name="dcterms.creator.aut" content="George Washington" />
        <meta name="dc.publisher" content="Publisher A" />
        <meta name="isbn" content="1234567890" />
        <meta name="dc.language" content="English" />
        <meta name="dc.date.published" content="2019-01-01" />
        <meta name="dcterms.created" content="2018-01-01" />
        <meta name="series" content="Meta Series" />
        <meta name="seriesnumber" content="1" />
        <meta name="rating" content="" />
        <meta name="dc.description" content="" />
        <meta name="tags" content="tag a, tag b" />
        <meta name="dc.identifier.url" content="" />
        <meta name="dc.identifier" scheme="" content="invalid" />
        <meta name="dc.identifier." content="still invalid" />
        <meta name="dc.identifier.conflicting" scheme="schemes" content="are also invalid" />
        <meta name="dc.identifier.custom.subid" content="invalid too" />
'''

        if test in {'meta_multi', 'comment_single', 'comment_multi'}:
            raw += b'''\
        <meta name="title" content="A Different Meta Tag &amp; Title &#9400;" />
        <meta name="author" content="John Adams with Thomas Jefferson" />
        <meta name="publisher" content="Publisher B" />
        <meta name="isbn" content="2345678901" />
        <meta name="dcterms.language" content="Spanish" />
        <meta name="date of publication" content="2017-01-01" />
        <meta name="timestamp" content="2016-01-01" />
        <meta name="series" content="Another Meta Series" />
        <meta name="series.index" content="2" />
        <meta name="rating" content="8" />
        <meta name="comments" content="meta &quot;comments&quot; &hearts; HTML &amp;amp;" />
        <meta name="tags" content="tag c" />
        <meta name="dc.identifier.url" content="http://google.com/search?q=calibre" />
'''

        if test in {'comment_single', 'comment_multi'}:
            raw += b'''\
        <!-- TITLE="A Comment Tag &amp; Title &#9400;" -->
        <!-- AUTHOR="James Madison and James Monroe" -->
        <!-- PUBLISHER="Publisher C" -->
        <!-- ISBN="3456789012" -->
        <!-- LANGUAGE="French" -->
        <!-- PUBDATE="2015-01-01" -->
        <!-- TIMESTAMP="2014-01-01" -->
        <!-- SERIES="Comment Series" -->
        <!-- SERIESNUMBER="3" -->
        <!-- RATING="20" -->
        <!-- COMMENTS="comment &quot;comments&quot; &hearts; HTML -- too &amp;amp;" -->
        <!-- TAGS="tag d" -->
'''

        if test in {'comment_multi'}:
            raw += b'''\
        <!-- TITLE="Another Comment Tag &amp; Title &#9400;" -->
        <!-- AUTHOR="John Quincy Adams" -->
        <!-- PUBLISHER="Publisher D" -->
        <!-- ISBN="4567890123" -->
        <!-- LANGUAGE="Japanese" -->
        <!-- PUBDATE="2013-01-01" -->
        <!-- TIMESTAMP="2012-01-01" -->
        <!-- SERIES="Comment Series 2" -->
        <!-- SERIESNUMBER="4" -->
        <!-- RATING="1" -->
        <!-- COMMENTS="comment &quot;comments&quot; &hearts; HTML -- too &amp;amp; for sure" -->
        <!-- TAGS="tag e, tag f" -->
'''

        raw += b'''\
    </head>
    <body>
    </body>
</html>
'''
        return BytesIO(raw)

    def test_input_title(self):
        stream_meta = get_metadata(self.get_stream('title'))
        canon_meta = Metadata('A Title Tag & Title \u24b8', [_('Unknown')])
        self.compare_metadata(stream_meta, canon_meta)

    def test_input_meta_single(self):
        stream_meta = get_metadata(self.get_stream('meta_single'))
        canon_meta = Metadata('A Meta Tag & Title \u24b8', ['George Washington'])
        canon_meta.publisher = 'Publisher A'
        canon_meta.languages = ['English']
        canon_meta.pubdate = parse_date('2019-01-01')
        canon_meta.timestamp = parse_date('2018-01-01')
        canon_meta.series = 'Meta Series'
        canon_meta.series_index = float(1)
        # canon_meta.rating = float(0)
        # canon_meta.comments = ''
        canon_meta.tags = ['tag a', 'tag b']
        canon_meta.set_identifiers({'isbn': '1234567890'})
        self.compare_metadata(stream_meta, canon_meta)

    def test_input_meta_multi(self):
        stream_meta = get_metadata(self.get_stream('meta_multi'))
        canon_meta = Metadata('A Meta Tag & Title \u24b8', ['George Washington', 'John Adams', 'Thomas Jefferson'])
        canon_meta.publisher = 'Publisher A'
        canon_meta.languages = ['English', 'Spanish']
        canon_meta.pubdate = parse_date('2019-01-01')
        canon_meta.timestamp = parse_date('2018-01-01')
        canon_meta.series = 'Meta Series'
        canon_meta.series_index = float(1)
        canon_meta.rating = float(8)
        canon_meta.comments = 'meta "comments" \u2665 HTML &amp;'
        canon_meta.tags = ['tag a', 'tag b', 'tag c']
        canon_meta.set_identifiers({'isbn': '1234567890', 'url': 'http://google.com/search?q=calibre'})
        self.compare_metadata(stream_meta, canon_meta)

    def test_input_comment_single(self):
        stream_meta = get_metadata(self.get_stream('comment_single'))
        canon_meta = Metadata('A Comment Tag & Title \u24b8', ['James Madison', 'James Monroe'])
        canon_meta.publisher = 'Publisher C'
        canon_meta.languages = ['French']
        canon_meta.pubdate = parse_date('2015-01-01')
        canon_meta.timestamp = parse_date('2014-01-01')
        canon_meta.series = 'Comment Series'
        canon_meta.series_index = float(3)
        canon_meta.rating = float(0)
        canon_meta.comments = 'comment "comments" \u2665 HTML -- too &amp;'
        canon_meta.tags = ['tag d']
        canon_meta.set_identifiers({'isbn': '3456789012', 'url': 'http://google.com/search?q=calibre'})
        self.compare_metadata(stream_meta, canon_meta)

    def test_input_comment_multi(self):
        stream_meta = get_metadata(self.get_stream('comment_multi'))
        canon_meta = Metadata('A Comment Tag & Title \u24b8', ['James Madison', 'James Monroe', 'John Quincy Adams'])
        canon_meta.publisher = 'Publisher C'
        canon_meta.languages = ['French', 'Japanese']
        canon_meta.pubdate = parse_date('2015-01-01')
        canon_meta.timestamp = parse_date('2014-01-01')
        canon_meta.series = 'Comment Series'
        canon_meta.series_index = float(3)
        canon_meta.rating = float(0)
        canon_meta.comments = 'comment "comments" \u2665 HTML -- too &amp;'
        canon_meta.tags = ['tag d', 'tag e', 'tag f']
        canon_meta.set_identifiers({'isbn': '3456789012', 'url': 'http://google.com/search?q=calibre'})
        self.compare_metadata(stream_meta, canon_meta)
|
||||
|
||||
|
||||
def find_tests():
    """Return a unittest suite containing all tests in this module."""
    return unittest.TestLoader().loadTestsFromTestCase(MetadataHtmlTest)
|
||||
243
ebook_converter/ebooks/metadata/meta.py
Normal file
243
ebook_converter/ebooks/metadata/meta.py
Normal file
@@ -0,0 +1,243 @@
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
import os, re, collections
|
||||
|
||||
from calibre.utils.config import prefs
|
||||
from calibre.constants import filesystem_encoding
|
||||
from calibre.ebooks.metadata.opf2 import OPF
|
||||
from calibre import isbytestring
|
||||
from calibre.customize.ui import get_file_type_metadata, set_file_type_metadata
|
||||
from calibre.ebooks.metadata import MetaInformation, string_to_authors
|
||||
from polyglot.builtins import getcwd, unicode_type
|
||||
|
||||
# The priorities for loading metadata from different file types
# Higher values should be used to update metadata from lower values
# Unknown extensions get priority 0 via the defaultdict.
METADATA_PRIORITIES = collections.defaultdict(lambda: 0)
for i, ext in enumerate((
        'html', 'htm', 'xhtml', 'xhtm',
        'rtf', 'fb2', 'pdf', 'prc', 'odt',
        'epub', 'lit', 'lrx', 'lrf', 'mobi',
        'azw', 'azw3', 'azw1', 'rb', 'imp', 'snb'
)):
    METADATA_PRIORITIES[ext] = i + 1
|
||||
|
||||
|
||||
def path_to_ext(path):
    """Return the lowercase file extension of *path* without the dot
    ('' when there is no extension)."""
    return os.path.splitext(path)[1][1:].lower()
|
||||
|
||||
|
||||
def metadata_from_formats(formats, force_read_metadata=False, pattern=None):
    """Merge metadata from the given format files; on any failure fall back
    to parsing the first file's name."""
    try:
        return _metadata_from_formats(formats, force_read_metadata, pattern)
    except Exception:
        mi = metadata_from_filename(list(iter(formats))[0], pat=pattern)
        if not mi.authors:
            mi.authors = [_('Unknown')]
        return mi
|
||||
|
||||
|
||||
def _metadata_from_formats(formats, force_read_metadata=False, pattern=None):
    """Read metadata from each format in priority order, smart-updating a
    single MetaInformation. An OPF file with a title short-circuits."""
    mi = MetaInformation(None, None)
    formats.sort(key=lambda x: METADATA_PRIORITIES[path_to_ext(x)])
    extensions = list(map(path_to_ext, formats))
    if 'opf' in extensions:
        opf = formats[extensions.index('opf')]
        mi2 = opf_metadata(opf)
        if mi2 is not None and mi2.title:
            return mi2

    for path, ext in zip(formats, extensions):
        with lopen(path, 'rb') as stream:
            try:
                newmi = get_metadata(stream, stream_type=ext,
                                     use_libprs_metadata=True,
                                     force_read_metadata=force_read_metadata,
                                     pattern=pattern)
                mi.smart_update(newmi)
            except Exception:
                continue
            # An application_id means authoritative metadata was found
            if getattr(mi, 'application_id', None) is not None:
                return mi

    if not mi.title:
        mi.title = _('Unknown')
    if not mi.authors:
        mi.authors = [_('Unknown')]

    return mi
|
||||
|
||||
|
||||
def get_metadata(stream, stream_type='lrf', use_libprs_metadata=False,
                 force_read_metadata=False, pattern=None):
    """Read metadata from *stream*, restoring its position afterwards."""
    pos = 0
    if hasattr(stream, 'tell'):
        pos = stream.tell()
    try:
        return _get_metadata(stream, stream_type, use_libprs_metadata,
                             force_read_metadata, pattern)
    finally:
        # Leave the stream where we found it, even on error
        if hasattr(stream, 'seek'):
            stream.seek(pos)
|
||||
|
||||
|
||||
def _get_metadata(stream, stream_type, use_libprs_metadata,
                  force_read_metadata=False, pattern=None):
    """Core metadata reader: combine file-name metadata, file-type plugin
    metadata and a sibling OPF file."""
    if stream_type:
        stream_type = stream_type.lower()
    # Normalize extension aliases to canonical stream types.
    # The scraped original listed 'html' twice; 'htm' is clearly intended.
    if stream_type in ('html', 'htm', 'xhtml', 'xhtm', 'xml'):
        stream_type = 'html'
    if stream_type in ('mobi', 'prc', 'azw'):
        stream_type = 'mobi'
    if stream_type in ('odt', 'ods', 'odp', 'odg', 'odf'):
        stream_type = 'odt'

    # Look for a sibling .opf file next to the stream on disk
    opf = None
    if hasattr(stream, 'name'):
        c = os.path.splitext(stream.name)[0] + '.opf'
        if os.access(c, os.R_OK):
            opf = opf_metadata(os.path.abspath(c))

    if use_libprs_metadata and getattr(opf, 'application_id', None) is not None:
        return opf

    name = os.path.basename(getattr(stream, 'name', ''))
    # The fallback pattern matches the default filename format produced by calibre
    base = metadata_from_filename(name, pat=pattern, fallback_pat=re.compile(
        r'^(?P<title>.+) - (?P<author>[^-]+)$'))
    if not base.authors:
        base.authors = [_('Unknown')]
    if not base.title:
        base.title = _('Unknown')
    mi = MetaInformation(None, None)
    if force_read_metadata or prefs['read_file_metadata']:
        mi = get_file_type_metadata(stream, stream_type)
    base.smart_update(mi)
    if opf is not None:
        base.smart_update(opf)

    return base
|
||||
|
||||
|
||||
def set_metadata(stream, mi, stream_type='lrf', report_error=None):
    """Write metadata *mi* into *stream* via the file-type plugin for
    *stream_type* (case-insensitive)."""
    if stream_type:
        stream_type = stream_type.lower()
    set_file_type_metadata(stream, mi, stream_type, report_error=report_error)
|
||||
|
||||
|
||||
def metadata_from_filename(name, pat=None, fallback_pat=None):
    """Build a MetaInformation from a file name using the regex *pat*
    (default: the user's filename_pattern pref), trying *fallback_pat*
    when *pat* does not match. Unknown fields are silently skipped."""
    if isbytestring(name):
        name = name.decode(filesystem_encoding, 'replace')
    name = name.rpartition('.')[0]  # drop the extension
    mi = MetaInformation(None, None)
    if pat is None:
        pat = re.compile(prefs.get('filename_pattern'))
    name = name.replace('_', ' ')
    match = pat.search(name)
    if match is None and fallback_pat is not None:
        match = fallback_pat.search(name)
    if match is not None:
        try:
            mi.title = match.group('title')
        except IndexError:
            pass
        try:
            au = match.group('author')
            aus = string_to_authors(au)
            if aus:
                mi.authors = aus
                if prefs['swap_author_names'] and mi.authors:
                    def swap(a):
                        # "Last, First" or "First Last" -> "First Last" swapped
                        if ',' in a:
                            parts = a.split(',', 1)
                        else:
                            parts = a.split(None, 1)
                        if len(parts) > 1:
                            t = parts[-1]
                            parts = parts[:-1]
                            parts.insert(0, t)
                        return ' '.join(parts)
                    mi.authors = [swap(x) for x in mi.authors]
        except (IndexError, ValueError):
            pass
        try:
            mi.series = match.group('series')
        except IndexError:
            pass
        try:
            si = match.group('series_index')
            mi.series_index = float(si)
        except (IndexError, ValueError, TypeError):
            pass
        try:
            si = match.group('isbn')
            mi.isbn = si
        except (IndexError, ValueError):
            pass
        try:
            publisher = match.group('publisher')
            mi.publisher = publisher
        except (IndexError, ValueError):
            pass
        try:
            pubdate = match.group('published')
            if pubdate:
                from calibre.utils.date import parse_only_date
                mi.pubdate = parse_only_date(pubdate)
        except Exception:
            pass
        try:
            comments = match.group('comments')
            mi.comments = comments
        except (IndexError, ValueError):
            pass

    if mi.is_null('title'):
        # Fall back to the whole (extension-stripped) file name
        mi.title = name
    return mi
|
||||
|
||||
|
||||
def opf_metadata(opfpath):
    """Read book metadata (and cover data, if readable) from an OPF file
    path or open stream. Returns None when the OPF has no application_id
    or on any error (which is printed, not raised)."""
    if hasattr(opfpath, 'read'):
        f = opfpath
        opfpath = getattr(f, 'name', getcwd())
    else:
        f = open(opfpath, 'rb')
    try:
        opf = OPF(f, os.path.dirname(opfpath))
        if opf.application_id is not None:
            mi = opf.to_book_metadata()
            if hasattr(opf, 'cover') and opf.cover:
                cpath = os.path.join(os.path.dirname(opfpath), opf.cover)
                if os.access(cpath, os.R_OK):
                    fmt = cpath.rpartition('.')[-1]
                    # Use a distinct name so the OPF handle f is not shadowed
                    with open(cpath, 'rb') as cf:
                        data = cf.read()
                    mi.cover_data = (fmt, data)
            return mi
    except Exception:
        import traceback
        traceback.print_exc()
|
||||
|
||||
|
||||
def forked_read_metadata(path, tdir):
    """Read metadata from *path* and write size.txt, cover.jpg (if any) and
    metadata.opf into the directory *tdir* (for use by a worker process)."""
    from calibre.ebooks.metadata.opf2 import metadata_to_opf
    with lopen(path, 'rb') as f:
        fmt = os.path.splitext(path)[1][1:].lower()
        f.seek(0, 2)
        sz = f.tell()  # file size in bytes
        with lopen(os.path.join(tdir, 'size.txt'), 'wb') as s:
            s.write(unicode_type(sz).encode('ascii'))
        f.seek(0)
        mi = get_metadata(f, fmt)
    if mi.cover_data and mi.cover_data[1]:
        # Externalize the cover so the OPF stays small
        with lopen(os.path.join(tdir, 'cover.jpg'), 'wb') as f:
            f.write(mi.cover_data[1])
        mi.cover_data = (None, None)
        mi.cover = 'cover.jpg'
    opf = metadata_to_opf(mi, default_lang='und')
    with lopen(os.path.join(tdir, 'metadata.opf'), 'wb') as f:
        f.write(opf)
|
||||
302
ebook_converter/ebooks/metadata/odt.py
Normal file
302
ebook_converter/ebooks/metadata/odt.py
Normal file
@@ -0,0 +1,302 @@
|
||||
#!/usr/bin/python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
|
||||
#
|
||||
# Copyright (C) 2006 Søren Roug, European Environment Agency
|
||||
#
|
||||
# This is free software. You may redistribute it under the terms
|
||||
# of the Apache license and the GNU General Public License Version
|
||||
# 2 or at your option any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public
|
||||
# License along with this program; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
#
|
||||
# Contributor(s):
|
||||
#
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
import io
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
|
||||
from lxml.etree import fromstring, tostring
|
||||
|
||||
from calibre.ebooks.metadata import (
|
||||
MetaInformation, authors_to_string, check_isbn, string_to_authors
|
||||
)
|
||||
from calibre.utils.date import isoformat, parse_date
|
||||
from calibre.utils.imghdr import identify
|
||||
from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1
|
||||
from calibre.utils.zipfile import ZipFile, safe_replace
|
||||
from odf.draw import Frame as odFrame, Image as odImage
|
||||
from odf.namespaces import DCNS, METANS, OFFICENS
|
||||
from odf.opendocument import load as odLoad
|
||||
from polyglot.builtins import as_unicode
|
||||
|
||||
# Map of logical metadata field name -> (XML namespace, tag name) used to
# look up elements inside an ODF document's meta.xml.
fields = {
    'title':            (DCNS, 'title'),
    'description':      (DCNS, 'description'),
    'subject':          (DCNS, 'subject'),
    'creator':          (DCNS, 'creator'),
    'date':             (DCNS, 'date'),
    'language':         (DCNS, 'language'),
    'generator':        (METANS, 'generator'),
    'initial-creator':  (METANS, 'initial-creator'),
    'keyword':          (METANS, 'keyword'),
    'keywords':         (METANS, 'keywords'),
    'editing-duration': (METANS, 'editing-duration'),
    'editing-cycles':   (METANS, 'editing-cycles'),
    'printed-by':       (METANS, 'printed-by'),
    'print-date':       (METANS, 'print-date'),
    'creation-date':    (METANS, 'creation-date'),
    'user-defined':     (METANS, 'user-defined'),
    # 'template':       (METANS, 'template'),
}
|
||||
|
||||
|
||||
def get_metadata(stream, extract_cover=True):
    """Read metadata from an ODF (ODT) document.

    Parses meta.xml inside the zip container. Standard ODF fields (title,
    creator, description, language, keywords) are read first; if the custom
    user-defined property ``opf.metadata`` is true, additional ``opf.*``
    properties override/extend them. Finally a cover is looked for unless
    ``opf.nocover`` is set.

    :param stream: seekable binary stream of the ODT file.
    :param extract_cover: when True, the cover image bytes are attached to
        the returned metadata object (via read_cover).
    """
    whitespace = re.compile(r'\s+')

    def normalize(s):
        # Collapse internal whitespace runs to single spaces.
        return whitespace.sub(' ', s).strip()

    with ZipFile(stream) as zf:
        meta = zf.read('meta.xml')
        root = fromstring(meta)

        def find(field):
            # Return the normalized text of the first matching element,
            # or None when the element is absent.
            ns, tag = fields[field]
            ans = root.xpath('//ns0:{}'.format(tag), namespaces={'ns0': ns})
            if ans:
                return normalize(tostring(ans[0], method='text', encoding='unicode', with_tail=False)).strip()

        mi = MetaInformation(None, [])
        title = find('title')
        if title:
            mi.title = title
        creator = find('initial-creator') or find('creator')
        if creator:
            mi.authors = string_to_authors(creator)
        desc = find('description')
        if desc:
            mi.comments = desc
        lang = find('language')
        if lang and canonicalize_lang(lang):
            mi.languages = [canonicalize_lang(lang)]
        kw = find('keyword') or find('keywords')
        if kw:
            mi.tags = [x.strip() for x in kw.split(',') if x.strip()]
        # Collect all meta:user-defined properties into a plain dict,
        # lower-casing names; booleans are the only typed values honored.
        data = {}
        for tag in root.xpath('//ns0:user-defined', namespaces={'ns0': fields['user-defined'][0]}):
            name = (tag.get('{%s}name' % METANS) or '').lower()
            vtype = tag.get('{%s}value-type' % METANS) or 'string'
            val = tag.text
            if name and val:
                if vtype == 'boolean':
                    val = val == 'true'
                data[name] = val
        opfmeta = False  # we need this later for the cover
        opfnocover = False
        if data.get('opf.metadata'):
            # custom metadata contains OPF information
            opfmeta = True
            if data.get('opf.titlesort', ''):
                mi.title_sort = data['opf.titlesort']
            if data.get('opf.authors', ''):
                mi.authors = string_to_authors(data['opf.authors'])
            if data.get('opf.authorsort', ''):
                mi.author_sort = data['opf.authorsort']
            if data.get('opf.isbn', ''):
                isbn = check_isbn(data['opf.isbn'])
                if isbn is not None:
                    mi.isbn = isbn
            if data.get('opf.publisher', ''):
                mi.publisher = data['opf.publisher']
            if data.get('opf.pubdate', ''):
                mi.pubdate = parse_date(data['opf.pubdate'], assume_utc=True)
            if data.get('opf.identifiers'):
                # Identifiers are stored as a JSON object; ignore bad JSON.
                try:
                    mi.identifiers = json.loads(data['opf.identifiers'])
                except Exception:
                    pass
            if data.get('opf.rating'):
                # Clamp the rating to the 0..10 range.
                try:
                    mi.rating = max(0, min(float(data['opf.rating']), 10))
                except Exception:
                    pass
            if data.get('opf.series', ''):
                mi.series = data['opf.series']
                if data.get('opf.seriesindex', ''):
                    try:
                        mi.series_index = float(data['opf.seriesindex'])
                    except Exception:
                        mi.series_index = 1.0
            if data.get('opf.language', ''):
                cl = canonicalize_lang(data['opf.language'])
                if cl:
                    mi.languages = [cl]
            opfnocover = data.get('opf.nocover', False)
        if not opfnocover:
            try:
                read_cover(stream, zf, mi, opfmeta, extract_cover)
            except Exception:
                pass  # Do not let an error reading the cover prevent reading other data

    return mi
|
||||
|
||||
|
||||
def set_metadata(stream, mi):
    """Write metadata *mi* into the ODF document in *stream* in place.

    Rewrites meta.xml inside the zip container via safe_replace.

    Fix vs. original: ``stream.seek(os.SEEK_SET)`` passed the *whence*
    constant (0) as the *offset*, which only worked by coincidence; the
    intent — rewind to the start of the stream — is now explicit.
    """
    with ZipFile(stream) as zf:
        raw = _set_metadata(zf.open('meta.xml').read(), mi)
        # print(raw.decode('utf-8'))

    stream.seek(0, os.SEEK_SET)
    safe_replace(stream, "meta.xml", io.BytesIO(raw))
|
||||
|
||||
|
||||
def _set_metadata(raw, mi):
    """Return the bytes of meta.xml (*raw*) updated with metadata from *mi*.

    Standard fields are written as dc:/meta: elements; calibre-specific
    fields are written as meta:user-defined properties named ``opf.*``.
    Only non-null fields of *mi* are written; existing values for those
    fields are removed first.
    """
    root = fromstring(raw)
    namespaces = {'office': OFFICENS, 'meta': METANS, 'dc': DCNS}
    nsrmap = {v: k for k, v in namespaces.items()}

    def xpath(expr, parent=root):
        return parent.xpath(expr, namespaces=namespaces)

    def remove(*tag_names):
        # Drop all existing elements for the given logical field names.
        for tag_name in tag_names:
            ns = fields[tag_name][0]
            tag_name = '{}:{}'.format(nsrmap[ns], tag_name)
            for x in xpath('descendant::' + tag_name, meta):
                x.getparent().remove(x)

    def add(tag, val=None):
        # Append a new element for the logical field *tag* with text *val*.
        ans = meta.makeelement('{%s}%s' % fields[tag])
        ans.text = val
        meta.append(ans)
        return ans

    def remove_user_metadata(*names):
        # Drop meta:user-defined properties whose (lower-cased) name matches.
        for x in xpath('//meta:user-defined'):
            q = (x.get('{%s}name' % METANS) or '').lower()
            if q in names:
                x.getparent().remove(x)

    def add_um(name, val, vtype='string'):
        ans = add('user-defined', val)
        ans.set('{%s}value-type' % METANS, vtype)
        ans.set('{%s}name' % METANS, name)

    def add_user_metadata(name, val):
        # On first use, mark the document as carrying OPF metadata by
        # (re)writing the opf.metadata boolean sentinel. The function
        # attribute makes this a one-time side effect per call of
        # _set_metadata (the inner function is recreated each call).
        if not hasattr(add_user_metadata, 'sentinel_added'):
            add_user_metadata.sentinel_added = True
            remove_user_metadata('opf.metadata')
            add_um('opf.metadata', 'true', 'boolean')
        val_type = 'string'
        if hasattr(val, 'strftime'):
            # Dates are serialized as UTC ISO dates (date part only).
            val = isoformat(val, as_utc=True).split('T')[0]
            val_type = 'date'
        add_um(name, val, val_type)

    meta = xpath('//office:meta')[0]

    if not mi.is_null('title'):
        remove('title')
        add('title', mi.title)
    if not mi.is_null('title_sort'):
        remove_user_metadata('opf.titlesort')
        add_user_metadata('opf.titlesort', mi.title_sort)
    if not mi.is_null('authors'):
        remove('initial-creator', 'creator')
        val = authors_to_string(mi.authors)
        add('initial-creator', val), add('creator', val)
        remove_user_metadata('opf.authors')
        add_user_metadata('opf.authors', val)
    if not mi.is_null('author_sort'):
        remove_user_metadata('opf.authorsort')
        add_user_metadata('opf.authorsort', mi.author_sort)
    if not mi.is_null('comments'):
        remove('description')
        add('description', mi.comments)
    if not mi.is_null('tags'):
        remove('keyword')
        add('keyword', ', '.join(mi.tags))
    if not mi.is_null('languages'):
        lang = lang_as_iso639_1(mi.languages[0])
        if lang:
            remove('language')
            add('language', lang)
    if not mi.is_null('pubdate'):
        remove_user_metadata('opf.pubdate')
        add_user_metadata('opf.pubdate', mi.pubdate)
    if not mi.is_null('publisher'):
        remove_user_metadata('opf.publisher')
        add_user_metadata('opf.publisher', mi.publisher)
    if not mi.is_null('series'):
        remove_user_metadata('opf.series', 'opf.seriesindex')
        add_user_metadata('opf.series', mi.series)
        add_user_metadata('opf.seriesindex', '{}'.format(mi.series_index))
    if not mi.is_null('identifiers'):
        remove_user_metadata('opf.identifiers')
        add_user_metadata('opf.identifiers', as_unicode(json.dumps(mi.identifiers)))
    if not mi.is_null('rating'):
        remove_user_metadata('opf.rating')
        add_user_metadata('opf.rating', '%.2g' % mi.rating)

    return tostring(root, encoding='utf-8', pretty_print=True)
|
||||
|
||||
|
||||
def read_cover(stream, zin, mi, opfmeta, extract_cover):
    # search for an draw:image in a draw:frame with the name 'opf.cover'
    # if opf.metadata prop is false, just use the first image that
    # has a proper size (borrowed from docx)
    #
    # Results are attached to *mi*: mi.cover (href inside the zip),
    # mi.odf_cover_frame (frame name, only for an explicit opf.cover frame)
    # and, when extract_cover is true, mi.cover_data = (fmt, bytes).
    otext = odLoad(stream)
    cover_href = None
    cover_data = None
    cover_frame = None
    imgnum = 0
    for frm in otext.topnode.getElementsByType(odFrame):
        img = frm.getElementsByType(odImage)
        if len(img) == 0:
            continue
        i_href = img[0].getAttribute('href')
        try:
            raw = zin.read(i_href)
        except KeyError:
            # Image referenced but not present in the archive.
            continue
        try:
            fmt, width, height = identify(raw)
        except Exception:
            # Not a recognizable image format.
            continue
        imgnum += 1
        if opfmeta and frm.getAttribute('name').lower() == 'opf.cover':
            cover_href = i_href
            cover_data = (fmt, raw)
            cover_frame = frm.getAttribute('name')  # could have upper case
            break
        # NOTE(review): width/height here come from the current image;
        # a zero width would raise ZeroDivisionError — presumably identify
        # never reports 0 for a valid image, but this is not guarded.
        if cover_href is None and imgnum == 1 and 0.8 <= height/width <= 1.8 and height*width >= 12000:
            # Pick the first image as the cover if it is of a suitable size
            cover_href = i_href
            cover_data = (fmt, raw)
            if not opfmeta:
                break

    if cover_href is not None:
        mi.cover = cover_href
        mi.odf_cover_frame = cover_frame
        if extract_cover:
            if not cover_data:
                raw = zin.read(cover_href)
                try:
                    fmt = identify(raw)[0]
                except Exception:
                    pass
                else:
                    cover_data = (fmt, raw)
            mi.cover_data = cover_data
|
||||
1829
ebook_converter/ebooks/metadata/opf2.py
Normal file
1829
ebook_converter/ebooks/metadata/opf2.py
Normal file
File diff suppressed because it is too large
Load Diff
1118
ebook_converter/ebooks/metadata/opf3.py
Normal file
1118
ebook_converter/ebooks/metadata/opf3.py
Normal file
File diff suppressed because it is too large
Load Diff
251
ebook_converter/ebooks/metadata/rtf.py
Normal file
251
ebook_converter/ebooks/metadata/rtf.py
Normal file
@@ -0,0 +1,251 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=utf-8
|
||||
# License: GPLv3 Copyright: 2008, Kovid Goyal <kovid at kovidgoyal.net>
|
||||
|
||||
"""
|
||||
Edit metadata in RTF files.
|
||||
"""
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
import codecs
|
||||
import re
|
||||
|
||||
from calibre import force_unicode
|
||||
from calibre.ebooks.metadata import MetaInformation
|
||||
from polyglot.builtins import codepoint_to_chr, string_or_bytes, unicode_type, int_to_byte, filter
|
||||
|
||||
# Regexes that locate metadata destinations inside the RTF \info group.
# Group 1 captures the field text; the (?<!\\) lookbehind ensures the
# closing brace is not an escaped literal '\}'.
title_pat = re.compile(br'\{\\info.*?\{\\title(.*?)(?<!\\)\}', re.DOTALL)
author_pat = re.compile(br'\{\\info.*?\{\\author(.*?)(?<!\\)\}', re.DOTALL)
comment_pat = re.compile(br'\{\\info.*?\{\\subject(.*?)(?<!\\)\}', re.DOTALL)
tags_pat = re.compile(br'\{\\info.*?\{\\category(.*?)(?<!\\)\}', re.DOTALL)
publisher_pat = re.compile(br'\{\\info.*?\{\\manager(.*?)(?<!\\)\}', re.DOTALL)
|
||||
|
||||
|
||||
def get_document_info(stream):
    """
    Extract the \\info block from an RTF file.
    Return the info block as a string and the position in the file at which it
    starts.
    @param stream: File like object pointing to the RTF file.

    Returns (None, 0) when no \\info group is found before the first \\sect
    or before end of file.
    """
    block_size = 4096
    stream.seek(0)
    found, block = False, b""
    while not found:
        # Keep the last 6 bytes of the previous block so that a '{\info'
        # marker straddling a block boundary is still found.
        prefix = block[-6:]
        block = prefix + stream.read(block_size)
        actual_block_size = len(block) - len(prefix)
        if len(block) == len(prefix):
            # End of file reached without finding the marker.
            break
        idx = block.find(br'{\info')
        if idx >= 0:
            found = True
            # Translate the in-block index back to an absolute file offset.
            pos = stream.tell() - actual_block_size + idx - len(prefix)
            stream.seek(pos)
        else:
            # \info must appear before the document body starts.
            if block.find(br'\sect') > -1:
                break
    if not found:
        return None, 0
    # Copy the balanced { ... } group byte by byte, honoring \-escapes so
    # that escaped braces do not affect the nesting count.
    # NOTE(review): a truncated file (EOF before braces balance) would loop
    # reading b'' forever — presumably inputs are well-formed; confirm.
    data, count, = [], 0
    pos = stream.tell()
    while True:
        ch = stream.read(1)
        if ch == b'\\':
            data.append(ch + stream.read(1))
            continue
        if ch == b'{':
            count += 1
        elif ch == b'}':
            count -= 1
        data.append(ch)
        if count == 0:
            break
    return b''.join(data), pos
|
||||
|
||||
|
||||
def detect_codepage(stream):
    """Sniff the ANSI codepage declared in the first 512 bytes of an RTF
    stream (the \\ansicpgN control word).

    Returns a codec name like ``'cp1252'`` that the codecs module can
    resolve, or None when no usable declaration is found.
    """
    header = stream.read(512)
    m = re.search(br'\\ansicpg(\d+)', header)
    if m is None:
        return None
    num = m.group(1)
    if num == b'0':
        # ansicpg0 means "default ANSI codepage"; treat it as cp1252.
        num = b'1252'
    try:
        codec = (b'cp' + num).decode('ascii')
        codecs.lookup(codec)
    except Exception:
        return None
    return codec
|
||||
|
||||
|
||||
def encode(unistr):
    """Escape *unistr* for embedding in RTF: every non-ASCII character is
    replaced with the \\uN? sequence (decimal codepoint, '?' fallback)."""
    if not isinstance(unistr, unicode_type):
        unistr = force_unicode(unistr)
    pieces = []
    for ch in unistr:
        cp = ord(ch)
        pieces.append(ch if cp < 128 else '\\u{}?'.format(cp))
    return ''.join(pieces)
|
||||
|
||||
|
||||
def decode(raw, codec):
    """Decode RTF escape sequences in *raw* back to unicode text.

    Handles \\'xx codepage escapes (using *codec*, when given) and \\uN
    unicode escapes; undecodable escapes become '?'.
    """
    # https://en.wikipedia.org/wiki/Rich_Text_Format#Character_encoding

    def expand_codepage(m):
        try:
            return int_to_byte(int(m.group(1), 16)).decode(codec)
        except ValueError:
            return '?'

    def expand_unicode(m):
        try:
            return codepoint_to_chr(int(m.group(1)))
        except Exception:
            return '?'

    if isinstance(raw, bytes):
        raw = raw.decode('ascii', 'replace')

    # Codepage escapes must be expanded before unicode escapes.
    if codec is not None:
        raw = re.sub(r"\\'([a-fA-F0-9]{2})", expand_codepage, raw)
    return re.sub(r'\\u([0-9]{3,5}).', expand_unicode, raw)
|
||||
|
||||
|
||||
def get_metadata(stream):
    """
    Return metadata as a L{MetaInfo} object

    Reads title/author/comments/tags/publisher from the RTF \\info group,
    decoding values with the codepage declared in the file header.
    Returns a metadata object with title 'Unknown' when the stream is not
    RTF or carries no \\info group.
    """
    stream.seek(0)
    if stream.read(5) != br'{\rtf':
        return MetaInformation(_('Unknown'))
    block = get_document_info(stream)[0]
    if not block:
        return MetaInformation(_('Unknown'))

    stream.seek(0)
    cpg = detect_codepage(stream)
    stream.seek(0)

    title_match = title_pat.search(block)
    if title_match is not None:
        title = decode(title_match.group(1).strip(), cpg)
    else:
        title = _('Unknown')
    author_match = author_pat.search(block)
    if author_match is not None:
        author = decode(author_match.group(1).strip(), cpg)
    else:
        author = None
    mi = MetaInformation(title)
    if author:
        # Authors are comma-separated inside the \author destination.
        mi.authors = [x.strip() for x in author.split(',')]

    comment_match = comment_pat.search(block)
    if comment_match is not None:
        comment = decode(comment_match.group(1).strip(), cpg)
        mi.comments = comment
    tags_match = tags_pat.search(block)
    if tags_match is not None:
        tags = decode(tags_match.group(1).strip(), cpg)
        mi.tags = list(filter(None, (x.strip() for x in tags.split(','))))
    publisher_match = publisher_pat.search(block)
    if publisher_match is not None:
        publisher = decode(publisher_match.group(1).strip(), cpg)
        mi.publisher = publisher

    return mi
|
||||
|
||||
|
||||
def create_metadata(stream, options):
    """Insert a fresh RTF \\info metadata group right after the opening
    '{\\rtf1' of the document in *stream*.

    :param options: object with title/authors/comments (or comment)/
        publisher/tags attributes; falsy attributes are skipped.

    Fix vs. original: when no metadata field was set, the original still
    inserted an unbalanced '{\\info' (the closing brace was guarded by
    ``len(md) > 1`` but the opener was not), corrupting the RTF. Now
    nothing is written unless at least one field is present.
    """
    md = [r'{\info']
    if options.title:
        title = encode(options.title)
        md.append(r'{\title %s}'%(title,))
    if options.authors:
        au = options.authors
        if not isinstance(au, string_or_bytes):
            au = ', '.join(au)
        author = encode(au)
        md.append(r'{\author %s}'%(author,))
    # Some callers use .comment, others .comments.
    comp = options.comment if hasattr(options, 'comment') else options.comments
    if comp:
        comment = encode(comp)
        md.append(r'{\subject %s}'%(comment,))
    if options.publisher:
        publisher = encode(options.publisher)
        md.append(r'{\manager %s}'%(publisher,))
    if options.tags:
        tags = u', '.join(options.tags)
        tags = encode(tags)
        md.append(r'{\category %s}'%(tags,))
    if len(md) == 1:
        # No metadata at all: leave the stream untouched.
        return
    md.append('}')
    stream.seek(0)
    src = stream.read()
    # Splice the info group in right after the 6-byte '{\rtf1' header.
    ans = src[:6] + ''.join(md).encode('ascii') + src[6:]
    stream.seek(0)
    stream.write(ans)
|
||||
|
||||
|
||||
def set_metadata(stream, options):
    '''
    Modify/add RTF metadata in stream
    @param options: Object with metadata attributes title, author, comment, category
    '''
    def add_metadata_item(src, name, val):
        # Insert '{\name val}' just before the closing brace of the
        # \info group ('}}' closes both the new item and the group).
        index = src.rindex('}')
        return src[:index] + r'{\ '[:-1] + name + ' ' + val + '}}'

    src, pos = get_document_info(stream)
    if src is None:
        # No \info group yet: create one from scratch.
        create_metadata(stream, options)
    else:
        src = src.decode('ascii')
        olen = len(src)

        # Template matching one '{\<name> ...}' destination; 'name' is
        # textually substituted per field below.
        base_pat = r'\{\\name(.*?)(?<!\\)\}'

        def replace_or_create(src, name, val):
            # Replace an existing destination in place, or append a new one.
            val = encode(val)
            pat = re.compile(base_pat.replace('name', name), re.DOTALL)
            src, num = pat.subn('{\\' + name + ' ' + val + '}', src)
            if num == 0:
                src = add_metadata_item(src, name, val)
            return src

        if options.title is not None:
            src = replace_or_create(src, 'title', options.title)
        if options.comments is not None:
            src = replace_or_create(src, 'subject', options.comments)
        if options.authors is not None:
            src = replace_or_create(src, 'author', ', '.join(options.authors))
        if options.tags is not None:
            src = replace_or_create(src, 'category', ', '.join(options.tags))
        if options.publisher is not None:
            src = replace_or_create(src, 'manager', options.publisher)
        # Splice the (possibly longer) info group back into the file:
        # keep everything after the old group, truncate, rewrite.
        stream.seek(pos + olen)
        after = stream.read()
        stream.seek(pos)
        stream.truncate()
        stream.write(src.encode('ascii'))
        stream.write(after)
|
||||
|
||||
|
||||
def find_tests():
    """Build and return the unittest suite for RTF metadata round-tripping."""
    import unittest
    from io import BytesIO
    from calibre.ebooks.metadata.book.base import Metadata

    class Test(unittest.TestCase):

        def test_rtf_metadata(self):
            # Round-trip non-ASCII metadata through set_metadata/get_metadata.
            stream = BytesIO(br'{\rtf1\ansi\ansicpg1252}')
            m = Metadata('Test ø̄title', ['Author One', 'Author БTwo'])
            m.tags = 'tag1 見tag2'.split()
            m.comments = '<p>some ⊹comments</p>'
            m.publisher = 'publiSher'
            set_metadata(stream, m)
            stream.seek(0)
            o = get_metadata(stream)
            for attr in 'title authors publisher comments tags'.split():
                self.assertEqual(getattr(m, attr), getattr(o, attr))

    return unittest.defaultTestLoader.loadTestsFromTestCase(Test)
|
||||
296
ebook_converter/ebooks/metadata/toc.py
Normal file
296
ebook_converter/ebooks/metadata/toc.py
Normal file
@@ -0,0 +1,296 @@
|
||||
#!/usr/bin/env python2
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2010, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
import os, glob, re, functools
|
||||
from collections import Counter
|
||||
|
||||
from lxml import etree
|
||||
from lxml.builder import ElementMaker
|
||||
|
||||
from calibre.constants import __appname__, __version__
|
||||
from calibre.ebooks.chardet import xml_to_unicode
|
||||
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||
from calibre.utils.cleantext import clean_xml_chars
|
||||
from polyglot.builtins import unicode_type, getcwd
|
||||
from polyglot.urllib import unquote, urlparse
|
||||
|
||||
NCX_NS = "http://www.daisy.org/z3986/2005/ncx/"
|
||||
CALIBRE_NS = "http://calibre.kovidgoyal.net/2009/metadata"
|
||||
NSMAP = {None: NCX_NS, 'calibre':CALIBRE_NS}
|
||||
E = ElementMaker(namespace=NCX_NS, nsmap=NSMAP)
|
||||
C = ElementMaker(namespace=CALIBRE_NS, nsmap=NSMAP)
|
||||
|
||||
|
||||
def parse_html_toc(data):
    """Yield (href, fragment, link_text) triples for every <a href> in an
    HTML table of contents.

    :param data: HTML as bytes or unicode; bytes are decoded first.
    fragment is None when the URL has no fragment part.
    """
    from html5_parser import parse
    from calibre.utils.cleantext import clean_xml_chars
    from lxml import etree
    if isinstance(data, bytes):
        data = xml_to_unicode(data, strip_encoding_pats=True, resolve_entities=True)[0]
    root = parse(clean_xml_chars(data), maybe_xhtml=True, keep_doctype=False, sanitize_names=True)
    for a in root.xpath('//*[@href and local-name()="a"]'):
        purl = urlparse(unquote(a.get('href')))
        # purl[2] is the path, purl[5] the fragment.
        href, fragment = purl[2], purl[5]
        if not fragment:
            fragment = None
        else:
            fragment = fragment.strip()
        href = href.strip()

        txt = etree.tostring(a, method='text', encoding='unicode')
        yield href, fragment, txt
|
||||
|
||||
|
||||
class TOC(list):
    """A table-of-contents tree. Each node is itself a TOC whose list
    contents are its children; the root typically has href=None."""

    def __init__(self, href=None, fragment=None, text=None, parent=None,
            play_order=0, base_path=getcwd(), type='unknown', author=None,
            description=None, toc_thumbnail=None):
        # href: document path this entry points at (relative to base_path).
        self.href = href
        self.fragment = fragment
        if not self.fragment:
            self.fragment = None
        self.text = text
        self.parent = parent
        self.base_path = base_path
        self.play_order = play_order
        self.type = type
        self.author = author
        self.description = description
        self.toc_thumbnail = toc_thumbnail

    def __str__(self):
        # Render the subtree, one entry per line, children indented by tabs.
        lines = ['TOC: %s#%s %s'%(self.href, self.fragment, self.text)]
        for child in self:
            c = unicode_type(child).splitlines()
            for l in c:
                lines.append('\t'+l)
        return '\n'.join(lines)

    def count(self, type):
        # NOTE: overrides list.count with different semantics — counts
        # entries of the given type in the whole subtree.
        return len([i for i in self.flat() if i.type == type])

    def purge(self, types, max=0):
        """Remove entries whose type is in *types*, keeping the first
        *max* matches; return the removed entries."""
        remove = []
        for entry in self.flat():
            if entry.type in types:
                remove.append(entry)
        remove = remove[max:]
        for entry in remove:
            if entry.parent is None:
                continue
            entry.parent.remove(entry)
        return remove

    def remove(self, entry):
        # Detach *entry* from this node (and clear its parent link).
        list.remove(self, entry)
        entry.parent = None

    def add_item(self, href, fragment, text, play_order=None, type='unknown',
            author=None, description=None, toc_thumbnail=None):
        """Append a child entry and return it. When play_order is None it
        continues from the last sibling's play order."""
        if play_order is None:
            play_order = (self[-1].play_order if len(self) else self.play_order) + 1
        self.append(TOC(href=href, fragment=fragment, text=text, parent=self,
            base_path=self.base_path, play_order=play_order,
            type=type, author=author, description=description, toc_thumbnail=toc_thumbnail))
        return self[-1]

    def top_level_items(self):
        # Direct children that carry visible text.
        for item in self:
            if item.text is not None:
                yield item

    def depth(self):
        # Height of the subtree; a leaf has depth 1.
        depth = 1
        for obj in self:
            c = obj.depth()
            if c > depth - 1:
                depth = c + 1
        return depth

    def flat(self):
        'Depth first iteration over the tree rooted at self'
        yield self
        for obj in self:
            for i in obj.flat():
                yield i

    @property
    def abspath(self):
        'Return the file this toc entry points to as a absolute path to a file on the system.'
        if self.href is None:
            return None
        path = self.href.replace('/', os.sep)
        if not os.path.isabs(path):
            path = os.path.join(self.base_path, path)
        return path

    def read_from_opf(self, opfreader):
        """Populate this TOC from an OPF: prefer the spine's toc attribute,
        fall back to the guide reference or a manifest item named *toc*.
        HTML TOCs and NCX TOCs are both supported."""
        toc = opfreader.soup.find('spine', toc=True)
        if toc is not None:
            toc = toc['toc']
        if toc is None:
            try:
                toc = opfreader.soup.find('guide').find('reference', attrs={'type':'toc'})['href']
            except:
                for item in opfreader.manifest:
                    if 'toc' in item.href().lower():
                        toc = item.href()
                        break

        if toc is not None:
            if toc.lower() not in ('ncx', 'ncxtoc'):
                # An HTML TOC referenced by href.
                toc = urlparse(unquote(toc))[2]
                toc = toc.replace('/', os.sep)
                if not os.path.isabs(toc):
                    toc = os.path.join(self.base_path, toc)
                try:
                    if not os.path.exists(toc):
                        bn = os.path.basename(toc)
                        bn = bn.replace('_top.htm', '_toc.htm')  # Bug in BAEN OPF files
                        toc = os.path.join(os.path.dirname(toc), bn)

                    self.read_html_toc(toc)
                except:
                    print('WARNING: Could not read Table of Contents. Continuing anyway.')
            else:
                # An NCX TOC referenced as a manifest item id.
                path = opfreader.manifest.item(toc.lower())
                path = getattr(path, 'path', path)
                if path and os.access(path, os.R_OK):
                    try:
                        self.read_ncx_toc(path)
                    except Exception as err:
                        print('WARNING: Invalid NCX file:', err)
                    return
                # Last resort: any .ncx file next to the OPF.
                cwd = os.path.abspath(self.base_path)
                m = glob.glob(os.path.join(cwd, '*.ncx'))
                if m:
                    toc = m[0]
                    self.read_ncx_toc(toc)

    def read_ncx_toc(self, toc, root=None):
        """Populate this TOC from the NCX file at path *toc* (or from an
        already parsed *root* element). Tag matching is case-insensitive
        and namespace-agnostic via exslt regex XPath."""
        self.base_path = os.path.dirname(toc)
        if root is None:
            with open(toc, 'rb') as f:
                raw = xml_to_unicode(f.read(), assume_utf8=True,
                        strip_encoding_pats=True)[0]
            root = safe_xml_fromstring(raw)
        xpn = {'re': 'http://exslt.org/regular-expressions'}
        XPath = functools.partial(etree.XPath, namespaces=xpn)

        def get_attr(node, default=None, attr='playorder'):
            # Attribute lookup ignoring namespace prefixes and case.
            for name, val in node.attrib.items():
                if name and val and name.lower().endswith(attr):
                    return val
            return default

        nl_path = XPath('./*[re:match(local-name(), "navlabel$", "i")]')
        txt_path = XPath('./*[re:match(local-name(), "text$", "i")]')
        content_path = XPath('./*[re:match(local-name(), "content$", "i")]')
        np_path = XPath('./*[re:match(local-name(), "navpoint$", "i")]')

        def process_navpoint(np, dest):
            # Recursively convert a navPoint (and its children) into entries.
            try:
                play_order = int(get_attr(np, 1))
            except:
                play_order = 1
            href = fragment = text = None
            nd = dest
            nl = nl_path(np)
            if nl:
                nl = nl[0]
                text = ''
                for txt in txt_path(nl):
                    text += etree.tostring(txt, method='text',
                            encoding='unicode', with_tail=False)
                content = content_path(np)
                if content and text:
                    content = content[0]
                    # if get_attr(content, attr='src'):
                    purl = urlparse(content.get('src'))
                    href, fragment = unquote(purl[2]), unquote(purl[5])
                    nd = dest.add_item(href, fragment, text)
                    nd.play_order = play_order

            for c in np_path(np):
                process_navpoint(c, nd)

        nm = XPath('//*[re:match(local-name(), "navmap$", "i")]')(root)
        if not nm:
            raise ValueError('NCX files must have a <navmap> element.')
        nm = nm[0]

        for child in np_path(nm):
            process_navpoint(child, self)

    def read_html_toc(self, toc):
        """Populate this TOC from the HTML TOC file at path *toc*,
        skipping duplicate (href, fragment) targets."""
        self.base_path = os.path.dirname(toc)
        with lopen(toc, 'rb') as f:
            parsed_toc = parse_html_toc(f.read())
            for href, fragment, txt in parsed_toc:
                add = True
                for i in self.flat():
                    if i.href == href and i.fragment == fragment:
                        add = False
                        break
                if add:
                    self.add_item(href, fragment, txt)

    def render(self, stream, uid):
        """Serialize this TOC as an NCX document and write it to *stream*
        (binary). *uid* becomes the dtb:uid metadata value."""
        root = E.ncx(
            E.head(
                E.meta(name='dtb:uid', content=unicode_type(uid)),
                E.meta(name='dtb:depth', content=unicode_type(self.depth())),
                E.meta(name='dtb:generator', content='%s (%s)'%(__appname__,
                    __version__)),
                E.meta(name='dtb:totalPageCount', content='0'),
                E.meta(name='dtb:maxPageNumber', content='0'),
            ),
            E.docTitle(E.text('Table of Contents')),
        )
        navmap = E.navMap()
        root.append(navmap)
        root.set('{http://www.w3.org/XML/1998/namespace}lang', 'en')
        # Counter used to hand out sequential element ids across the tree.
        c = Counter()

        def navpoint(parent, np):
            text = np.text
            if not text:
                text = ''
            c[1] += 1
            item_id = 'num_%d'%c[1]
            text = clean_xml_chars(text)
            elem = E.navPoint(
                E.navLabel(E.text(re.sub(r'\s+', ' ', text))),
                E.content(src=unicode_type(np.href)+(('#' + unicode_type(np.fragment))
                    if np.fragment else '')),
                id=item_id,
                playOrder=unicode_type(np.play_order)
            )
            au = getattr(np, 'author', None)
            if au:
                au = re.sub(r'\s+', ' ', au)
                elem.append(C.meta(au, name='author'))
            desc = getattr(np, 'description', None)
            if desc:
                desc = re.sub(r'\s+', ' ', desc)
                try:
                    elem.append(C.meta(desc, name='description'))
                except ValueError:
                    # Retry with invalid XML characters stripped.
                    elem.append(C.meta(clean_xml_chars(desc), name='description'))
            idx = getattr(np, 'toc_thumbnail', None)
            if idx:
                elem.append(C.meta(idx, name='toc_thumbnail'))
            parent.append(elem)
            for np2 in np:
                navpoint(elem, np2)

        for np in self:
            navpoint(navmap, np)
        raw = etree.tostring(root, encoding='utf-8', xml_declaration=True,
                pretty_print=True)
        stream.write(raw)
|
||||
104
ebook_converter/ebooks/metadata/utils.py
Normal file
104
ebook_converter/ebooks/metadata/utils.py
Normal file
@@ -0,0 +1,104 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=utf-8
|
||||
# License: GPLv3 Copyright: 2016, Kovid Goyal <kovid at kovidgoyal.net>
|
||||
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
from collections import namedtuple
|
||||
|
||||
from calibre.ebooks.chardet import xml_to_unicode
|
||||
from calibre.ebooks.oeb.base import OPF
|
||||
from calibre.ebooks.oeb.polish.utils import guess_type
|
||||
from calibre.spell import parse_lang_code
|
||||
from calibre.utils.cleantext import clean_xml_chars
|
||||
from calibre.utils.localization import lang_as_iso639_1
|
||||
from calibre.utils.xml_parse import safe_xml_fromstring
|
||||
from polyglot.builtins import filter, map
|
||||
|
||||
# (major, minor, patch) triple describing an OPF package version.
OPFVersion = namedtuple('OPFVersion', 'major minor patch')


def parse_opf_version(raw):
    """Parse an OPF version string like '3.0' into an OPFVersion triple.

    Falls back to OPFVersion(2, 0, 0) when the major component is not an
    integer, and to (major, 0, 0) when any later component is malformed.
    Extra components beyond three are ignored; missing ones default to 0.

    Fix vs. original: the later components are now parsed from the already
    computed ``parts`` instead of re-splitting ``raw`` (which bypassed the
    ``raw or ''`` normalization applied to the first split).
    """
    parts = (raw or '').split('.')
    try:
        major = int(parts[0])
    except Exception:
        # Unparseable major version: assume the common OPF 2.0.
        return OPFVersion(2, 0, 0)
    try:
        v = [int(p) for p in parts]
    except Exception:
        v = [major, 0, 0]
    # Pad/trim to exactly three components.
    v = (v + [0, 0, 0])[:3]
    return OPFVersion(*v)
|
||||
|
||||
|
||||
def parse_opf(stream_or_path):
    """Parse an OPF document from a path or an open binary stream and
    return the lxml root element.

    :raises ValueError: when the input is empty or not parseable as OPF.

    Fix vs. original: a file opened here from a path was never closed
    (resource leak); it is now closed as soon as its bytes are read.
    """
    stream = stream_or_path
    opened_here = False
    if not hasattr(stream, 'read'):
        stream = open(stream, 'rb')
        opened_here = True
    try:
        raw = stream.read()
    finally:
        if opened_here:
            stream.close()
    if not raw:
        raise ValueError('Empty file: '+getattr(stream, 'name', 'stream'))
    raw, encoding = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True, assume_utf8=True)
    # Drop any leading junk (BOM remnants, whitespace) before the first tag.
    raw = raw[raw.find('<'):]
    root = safe_xml_fromstring(clean_xml_chars(raw))
    if root is None:
        raise ValueError('Not an OPF file')
    return root
|
||||
|
||||
|
||||
def normalize_languages(opf_languages, mi_languages):
    ' Preserve original country codes and use 2-letter lang codes where possible '

    def safe_parse(code):
        # Invalid language codes are silently dropped.
        try:
            return parse_lang_code(code)
        except ValueError:
            return None

    # Remember which country code the OPF originally carried for each
    # language, so it can be re-attached after normalization.
    parsed_opf = [p for p in (safe_parse(c) for c in opf_languages) if p]
    country_by_lang = {p.langcode: p.countrycode for p in parsed_opf}

    result = []
    for parsed in (safe_parse(c) for c in mi_languages):
        if not parsed:
            continue
        country = parsed.countrycode or country_by_lang.get(parsed.langcode, None)
        # Prefer the 2-letter ISO 639-1 form when one exists.
        lang = lang_as_iso639_1(parsed.langcode) or parsed.langcode
        if country:
            lang = lang + '-' + country
        result.append(lang)
    return result
|
||||
|
||||
|
||||
def ensure_unique(template, existing):
    """Return *template*, or a 'base-N.ext' variant of it, such that the
    result is not a member of *existing*."""
    base, dot, ext = template.rpartition('.')
    if base and ext:
        # Keep the extension (with its dot) so the counter goes before it.
        ext = dot + ext
    else:
        # No real extension ('noext', '.hidden'): append the counter at the end.
        base, ext = template, ''
    candidate = template
    counter = 0
    while candidate in existing:
        counter += 1
        candidate = '{0}-{1}{2}'.format(base, counter, ext)
    return candidate
|
||||
|
||||
|
||||
def create_manifest_item(root, href_template, id_template, media_type=None):
    """Add a new <item> to the OPF <manifest> with an href and id made
    unique against every href/id already present anywhere in the tree.

    Returns the new element, or None when the OPF has no <manifest>.
    """
    used_ids = frozenset(root.xpath('//*/@id'))
    used_hrefs = frozenset(root.xpath('//*/@href'))
    href = ensure_unique(href_template, used_hrefs)
    item_id = ensure_unique(id_template, used_ids)
    manifest = root.find(OPF('manifest'))
    if manifest is None:
        return None
    item = manifest.makeelement(OPF('item'))
    item.set('href', href)
    item.set('id', item_id)
    # Guess the media type from the template (not the uniquified href)
    # when the caller did not supply one.
    item.set('media-type', media_type or guess_type(href_template))
    manifest.append(item)
    return item
|
||||
|
||||
|
||||
def pretty_print_opf(root):
    """Pretty-print the OPF tree *root* in place."""
    # NOTE(review): local import -- presumably to avoid an import cycle
    # with the polish machinery; confirm against the module import graph.
    from calibre.ebooks.oeb.polish.pretty import pretty_opf, pretty_xml_tree
    pretty_opf(root)
    pretty_xml_tree(root)
|
||||
15
ebook_converter/ebooks/mobi/__init__.py
Normal file
15
ebook_converter/ebooks/mobi/__init__.py
Normal file
@@ -0,0 +1,15 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
||||
|
||||
|
||||
class MobiError(Exception):
    """Generic error raised by the MOBI handling code, e.g. when a
    record has a malformed header."""
    pass
|
||||
|
||||
|
||||
# That might be a bit small on the PW, but Amazon/KG 2.5 still uses these values, even when delivered to a PW
# Maximum embedded thumbnail size in bytes (16 KiB).
MAX_THUMB_SIZE = 16 * 1024
# Maximum thumbnail dimensions in pixels -- presumably (width, height);
# confirm at the use site.
MAX_THUMB_DIMEN = (180, 240)
|
||||
108
ebook_converter/ebooks/mobi/huffcdic.py
Normal file
108
ebook_converter/ebooks/mobi/huffcdic.py
Normal file
@@ -0,0 +1,108 @@
|
||||
#!/usr/bin/env python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
'''
|
||||
Decompress MOBI files compressed with the Huff/cdic algorithm. Code thanks to darkninja
|
||||
and igorsk.
|
||||
'''
|
||||
|
||||
import struct
|
||||
|
||||
from calibre.ebooks.mobi import MobiError
|
||||
from polyglot.builtins import map
|
||||
|
||||
|
||||
class Reader(object):
    """Decompressor for the Huff/cdic scheme used by MOBI files.

    Usage: call load_huff() once with the HUFF record, then load_cdic()
    for each CDIC record, then unpack() for each compressed section.
    """

    def __init__(self):
        # Pre-compiled big-endian 64-bit unsigned reader, used in the
        # hot loop of unpack().
        self.q = struct.Struct(b'>Q').unpack_from

    def load_huff(self, huff):
        """Parse a HUFF record into the per-byte and per-length code tables."""
        if huff[0:8] != b'HUFF\x00\x00\x00\x18':
            raise MobiError('Invalid HUFF header')
        # Offsets of the two tables within the record.
        off1, off2 = struct.unpack_from(b'>LL', huff, 8)

        def dict1_unpack(v):
            # Each 32-bit entry packs: bits 0-4 = code length, bit 7 =
            # terminal flag, bits 8+ = max code value for this prefix byte.
            codelen, term, maxcode = v&0x1f, v&0x80, v>>8
            assert codelen != 0
            if codelen <= 8:
                # Codes of 8 bits or fewer must resolve directly (terminal).
                assert term
            # Left-justify the max code to 32 bits for direct comparison
            # against the bit window in unpack().
            maxcode = ((maxcode + 1) << (32 - codelen)) - 1
            return (codelen, term, maxcode)
        # 256-entry fast-path table indexed by the top byte of the window.
        self.dict1 = tuple(map(dict1_unpack, struct.unpack_from(b'>256L', huff, off1)))

        # 32 (mincode, maxcode) pairs, one per code length 1..32,
        # interleaved; index 0 of each tuple below is a dummy entry.
        dict2 = struct.unpack_from(b'>64L', huff, off2)
        self.mincode, self.maxcode = (), ()
        for codelen, mincode in enumerate((0,) + dict2[0::2]):
            self.mincode += (mincode << (32 - codelen), )
        for codelen, maxcode in enumerate((0,) + dict2[1::2]):
            self.maxcode += (((maxcode + 1) << (32 - codelen)) - 1, )

        # Phrase dictionary, filled in by subsequent load_cdic() calls.
        self.dictionary = []

    def load_cdic(self, cdic):
        """Append the phrase slices from one CDIC record to the dictionary."""
        if cdic[0:8] != b'CDIC\x00\x00\x00\x10':
            raise MobiError('Invalid CDIC header')
        phrases, bits = struct.unpack_from(b'>LL', cdic, 8)
        # A CDIC record holds at most 1<<bits entries; the last record
        # holds whatever remains of the total phrase count.
        n = min(1<<bits, phrases-len(self.dictionary))
        h = struct.Struct(b'>H').unpack_from

        def getslice(off):
            # Entry layout: 16-bit length whose high bit flags an already
            # fully-expanded phrase, followed by the phrase bytes.
            blen, = h(cdic, 16+off)
            slice = cdic[18+off:18+off+(blen&0x7fff)]
            return (slice, blen&0x8000)
        self.dictionary += map(getslice, struct.unpack_from(b'>%dH' % n, cdic, 16))

    def unpack(self, data):
        """Decompress one Huff/cdic-compressed section, returning bytes."""
        q = self.q

        bitsleft = len(data) * 8
        # Zero padding so the 64-bit reads below never run past the end.
        data += b'\x00\x00\x00\x00\x00\x00\x00\x00'
        pos = 0
        x, = q(data, pos)
        n = 32

        s = []
        while True:
            if n <= 0:
                # Bit window exhausted: advance 4 bytes and refill.
                pos += 4
                x, = q(data, pos)
                n += 32
            # Current 32-bit window of the bit stream.
            code = (x >> n) & ((1 << 32) - 1)

            # Fast path: top byte resolves terminal codes directly.
            codelen, term, maxcode = self.dict1[code >> 24]
            if not term:
                # Longer code: increase the length until the window falls
                # at or above mincode for that length.
                while code < self.mincode[codelen]:
                    codelen += 1
                maxcode = self.maxcode[codelen]

            n -= codelen
            bitsleft -= codelen
            if bitsleft < 0:
                # Consumed into the zero padding: the real data is done.
                break

            # Index of the decoded symbol in the phrase dictionary.
            r = (maxcode - code) >> (32 - codelen)
            slice_, flag = self.dictionary[r]
            if not flag:
                # Phrase is itself compressed: expand once and memoize.
                # Poison the slot first so a self-referential phrase fails
                # fast instead of recursing forever.
                self.dictionary[r] = None
                slice_ = self.unpack(slice_)
                self.dictionary[r] = (slice_, 1)
            s.append(slice_)
        return b''.join(s)
|
||||
|
||||
|
||||
class HuffReader(object):
    """Convenience wrapper that wires a HUFF record and its CDIC records
    into a single ready-to-use decompressor."""

    def __init__(self, huffs):
        # huffs[0] is the HUFF record; every following record is a CDIC.
        head, tail = huffs[0], huffs[1:]
        decoder = Reader()
        decoder.load_huff(head)
        for cdic_record in tail:
            decoder.load_cdic(cdic_record)
        self.reader = decoder

    def unpack(self, section):
        """Decompress a single compressed text section to bytes."""
        return self.reader.unpack(section)
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user