mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-03-01 06:05:55 +01:00
Here is the first batch of modules, which are needed for converting several formats to LRF. Some of the logic has been changed; more cleanups will follow.
303 lines
11 KiB
Python
303 lines
11 KiB
Python
#!/usr/bin/python2
|
|
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
|
|
#
|
|
# Copyright (C) 2006 Søren Roug, European Environment Agency
|
|
#
|
|
# This is free software. You may redistribute it under the terms
|
|
# of the Apache license and the GNU General Public License Version
|
|
# 2 or at your option any later version.
|
|
#
|
|
# This program is distributed in the hope that it will be useful,
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
# GNU General Public License for more details.
|
|
#
|
|
# You should have received a copy of the GNU General Public
|
|
# License along with this program; if not, write to the Free Software
|
|
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
#
|
|
# Contributor(s):
|
|
#
|
|
from __future__ import absolute_import, division, print_function, unicode_literals
|
|
|
|
import io
|
|
import json
|
|
import os
|
|
import re
|
|
|
|
from lxml.etree import fromstring, tostring
|
|
|
|
from ebook_converter.ebooks.metadata import (
|
|
MetaInformation, authors_to_string, check_isbn, string_to_authors
|
|
)
|
|
from ebook_converter.utils.date import isoformat, parse_date
|
|
from ebook_converter.utils.imghdr import identify
|
|
from ebook_converter.utils.localization import canonicalize_lang, lang_as_iso639_1
|
|
from ebook_converter.utils.zipfile import ZipFile, safe_replace
|
|
from odf.draw import Frame as odFrame, Image as odImage
|
|
from odf.namespaces import DCNS, METANS, OFFICENS
|
|
from odf.opendocument import load as odLoad
|
|
from ebook_converter.polyglot.builtins import as_unicode
|
|
|
|
# Maps a metadata field name to the (namespace, local tag name) pair used to
# address the corresponding element of an ODF meta.xml. The tag name is always
# identical to the field name; only the namespace differs.
fields = dict(
    (dc_name, (DCNS, dc_name))
    for dc_name in (
        'title',
        'description',
        'subject',
        'creator',
        'date',
        'language',
    )
)
fields.update(
    (meta_name, (METANS, meta_name))
    for meta_name in (
        'generator',
        'initial-creator',
        'keyword',
        'keywords',
        'editing-duration',
        'editing-cycles',
        'printed-by',
        'print-date',
        'creation-date',
        'user-defined',
        # 'template',
    )
)
|
|
|
|
|
|
def get_metadata(stream, extract_cover=True):
    """Read book metadata from the ``meta.xml`` of an ODF archive.

    The plain elements listed in ``fields`` (title, creator, description,
    language, keywords) are read first. If the document carries a truthy
    ``opf.metadata`` user-defined entry, the ``opf.*`` user-defined entries
    are then applied on top of them. Unless ``opf.nocover`` is set, the cover
    is looked up through ``read_cover``; any exception raised there is
    swallowed so the remaining metadata is still returned.

    :param stream: file-like object holding the ODF zip archive.
    :param extract_cover: forwarded unchanged to ``read_cover``.
    :return: the populated ``MetaInformation`` instance.
    """
    collapse_ws = re.compile(r'\s+')

    with ZipFile(stream) as zf:
        root = fromstring(zf.read('meta.xml'))

        def find(field):
            # Text content of the first matching element with whitespace runs
            # collapsed to single spaces; None when no element matches.
            ns, tag = fields[field]
            matches = root.xpath('//ns0:{}'.format(tag), namespaces={'ns0': ns})
            if not matches:
                return None
            text = tostring(matches[0], method='text', encoding='unicode', with_tail=False)
            return collapse_ws.sub(' ', text).strip()

        mi = MetaInformation(None, [])

        title = find('title')
        if title:
            mi.title = title

        creator = find('initial-creator') or find('creator')
        if creator:
            mi.authors = string_to_authors(creator)

        description = find('description')
        if description:
            mi.comments = description

        raw_lang = find('language')
        canonical = canonicalize_lang(raw_lang) if raw_lang else None
        if canonical:
            mi.languages = [canonical]

        keywords = find('keyword') or find('keywords')
        if keywords:
            stripped = (part.strip() for part in keywords.split(','))
            mi.tags = [part for part in stripped if part]

        # Collect the user-defined entries into a dict keyed by lower-cased
        # name; entries typed 'boolean' are converted to real booleans.
        name_attr = '{%s}name' % METANS
        type_attr = '{%s}value-type' % METANS
        data = {}
        for elem in root.xpath('//ns0:user-defined', namespaces={'ns0': fields['user-defined'][0]}):
            key = (elem.get(name_attr) or '').lower()
            value = elem.text
            if not (key and value):
                continue
            if (elem.get(type_attr) or 'string') == 'boolean':
                value = value == 'true'
            data[key] = value

        has_opf = bool(data.get('opf.metadata'))  # also needed for the cover
        skip_cover = False
        if has_opf:
            # The custom metadata contains OPF information
            title_sort = data.get('opf.titlesort')
            if title_sort:
                mi.title_sort = title_sort

            opf_authors = data.get('opf.authors')
            if opf_authors:
                mi.authors = string_to_authors(opf_authors)

            author_sort = data.get('opf.authorsort')
            if author_sort:
                mi.author_sort = author_sort

            raw_isbn = data.get('opf.isbn')
            if raw_isbn:
                isbn = check_isbn(raw_isbn)
                if isbn is not None:
                    mi.isbn = isbn

            publisher = data.get('opf.publisher')
            if publisher:
                mi.publisher = publisher

            pubdate = data.get('opf.pubdate')
            if pubdate:
                mi.pubdate = parse_date(pubdate, assume_utc=True)

            identifiers = data.get('opf.identifiers')
            if identifiers:
                try:
                    mi.identifiers = json.loads(identifiers)
                except Exception:
                    pass

            rating = data.get('opf.rating')
            if rating:
                try:
                    # Clamp into the 0..10 range
                    mi.rating = max(0, min(float(rating), 10))
                except Exception:
                    pass

            series = data.get('opf.series')
            if series:
                mi.series = series
                series_index = data.get('opf.seriesindex')
                if series_index:
                    try:
                        mi.series_index = float(series_index)
                    except Exception:
                        mi.series_index = 1.0

            opf_lang = data.get('opf.language')
            if opf_lang:
                cl = canonicalize_lang(opf_lang)
                if cl:
                    mi.languages = [cl]

            skip_cover = data.get('opf.nocover', False)

        if not skip_cover:
            try:
                read_cover(stream, zf, mi, has_opf, extract_cover)
            except Exception:
                pass  # Do not let an error reading the cover prevent reading other data

    return mi
|
|
|
|
|
|
def set_metadata(stream, mi):
    """Rewrite the ``meta.xml`` of the ODF archive in ``stream`` from ``mi``.

    The current ``meta.xml`` is read from the archive, updated by
    ``_set_metadata`` and written back with ``safe_replace``.

    :param stream: seekable file-like object holding the ODF zip archive.
    :param mi: metadata object whose non-null fields are written.
    """
    with ZipFile(stream) as zf:
        # zf.read() returns the member bytes directly, so no member file
        # handle is left open (zf.open(...).read() never closed it).
        raw = _set_metadata(zf.read('meta.xml'), mi)

    # Rewind to the start of the archive before replacing the member. The
    # offset is 0; os.SEEK_SET is the whence argument, not the offset.
    stream.seek(0, os.SEEK_SET)
    safe_replace(stream, "meta.xml", io.BytesIO(raw))
|
|
|
|
|
|
def _set_metadata(raw, mi):
    """Return the serialized ``meta.xml`` in ``raw`` updated from ``mi``.

    Title, authors, comments, tags and language are written as the plain
    elements listed in ``fields``. The remaining values are written as
    ``user-defined`` elements whose names start with ``opf.``. The first time
    such an entry is added, an ``opf.metadata`` boolean marker is (re)written
    as well. Fields for which ``mi.is_null`` is true are left untouched.

    :param raw: content of the existing ``meta.xml``.
    :param mi: metadata object providing ``is_null`` and the field values.
    :return: the updated document serialized as pretty-printed UTF-8 bytes.
    """
    root = fromstring(raw)
    namespaces = {'office': OFFICENS, 'meta': METANS, 'dc': DCNS}
    nsrmap = {v: k for k, v in namespaces.items()}
    name_attr = '{%s}name' % METANS
    value_type_attr = '{%s}value-type' % METANS

    def xpath(expr, parent=root):
        return parent.xpath(expr, namespaces=namespaces)

    # Container that receives every element written below. A document without
    # an office:meta element gets an empty one appended to the root instead of
    # failing with IndexError.
    found = xpath('//office:meta')
    if found:
        meta = found[0]
    else:
        meta = root.makeelement('{%s}meta' % OFFICENS)
        root.append(meta)

    # Tracks whether the 'opf.metadata' marker was already written during
    # this call (a dict is used so the nested function can update it).
    state = {'sentinel_added': False}

    def remove(*tag_names):
        # Drop every descendant of the container with one of the given names.
        for tag_name in tag_names:
            ns = fields[tag_name][0]
            qualified = '{}:{}'.format(nsrmap[ns], tag_name)
            for x in xpath('descendant::' + qualified, meta):
                x.getparent().remove(x)

    def add(tag, val=None):
        # Append a new element for the given field to the container.
        ans = meta.makeelement('{%s}%s' % fields[tag])
        ans.text = val
        meta.append(ans)
        return ans

    def remove_user_metadata(*names):
        # Names are compared lower-cased, so stored names match regardless
        # of their case.
        for x in xpath('//meta:user-defined'):
            q = (x.get(name_attr) or '').lower()
            if q in names:
                x.getparent().remove(x)

    def add_um(name, val, vtype='string'):
        ans = add('user-defined', val)
        ans.set(value_type_attr, vtype)
        ans.set(name_attr, name)

    def add_user_metadata(name, val):
        if not state['sentinel_added']:
            state['sentinel_added'] = True
            remove_user_metadata('opf.metadata')
            add_um('opf.metadata', 'true', 'boolean')
        val_type = 'string'
        if hasattr(val, 'strftime'):
            # Date-like values are stored as the date part of the ISO string
            val = isoformat(val, as_utc=True).split('T')[0]
            val_type = 'date'
        add_um(name, val, val_type)

    if not mi.is_null('title'):
        remove('title')
        add('title', mi.title)
        if not mi.is_null('title_sort'):
            remove_user_metadata('opf.titlesort')
            add_user_metadata('opf.titlesort', mi.title_sort)
    if not mi.is_null('authors'):
        remove('initial-creator', 'creator')
        val = authors_to_string(mi.authors)
        add('initial-creator', val)
        add('creator', val)
        remove_user_metadata('opf.authors')
        add_user_metadata('opf.authors', val)
        if not mi.is_null('author_sort'):
            remove_user_metadata('opf.authorsort')
            add_user_metadata('opf.authorsort', mi.author_sort)
    if not mi.is_null('comments'):
        remove('description')
        add('description', mi.comments)
    if not mi.is_null('tags'):
        remove('keyword')
        add('keyword', ', '.join(mi.tags))
    if not mi.is_null('languages'):
        lang = lang_as_iso639_1(mi.languages[0])
        if lang:
            remove('language')
            add('language', lang)
    if not mi.is_null('pubdate'):
        remove_user_metadata('opf.pubdate')
        add_user_metadata('opf.pubdate', mi.pubdate)
    if not mi.is_null('publisher'):
        remove_user_metadata('opf.publisher')
        add_user_metadata('opf.publisher', mi.publisher)
    if not mi.is_null('series'):
        remove_user_metadata('opf.series', 'opf.seriesindex')
        add_user_metadata('opf.series', mi.series)
        add_user_metadata('opf.seriesindex', '{}'.format(mi.series_index))
    if not mi.is_null('identifiers'):
        remove_user_metadata('opf.identifiers')
        add_user_metadata('opf.identifiers', as_unicode(json.dumps(mi.identifiers)))
    if not mi.is_null('rating'):
        remove_user_metadata('opf.rating')
        add_user_metadata('opf.rating', '%.2g' % mi.rating)

    return tostring(root, encoding='utf-8', pretty_print=True)
|
|
|
|
|
|
def read_cover(stream, zin, mi, opfmeta, extract_cover):
    """Find the cover image of an ODF document and record it on ``mi``.

    Every frame that contains an image is examined in document order:

    * when ``opfmeta`` is true, a frame whose name is ``opf.cover``
      (compared case-insensitively) is the cover and ends the search;
    * otherwise the first readable image is used if its height/width ratio
      lies within 0.8..1.8 and it covers at least 12000 pixels. With
      ``opfmeta`` true the search continues afterwards, so a later
      ``opf.cover`` frame still takes precedence.

    Images without an href, images missing from the archive and images that
    ``identify`` cannot handle are skipped.

    :param stream: the ODF document, loaded with ``odLoad``.
    :param zin: open zip archive used to read image bytes by href.
    :param mi: metadata object; ``cover`` and ``odf_cover_frame`` are set
        when a cover is found, plus ``cover_data`` as ``(format, bytes)``
        when ``extract_cover`` is true.
    :param opfmeta: whether the document carries OPF custom metadata.
    :param extract_cover: whether to store the image bytes on ``mi``.
    """
    otext = odLoad(stream)
    cover_href = None
    cover_data = None
    cover_frame = None
    imgnum = 0
    for frm in otext.topnode.getElementsByType(odFrame):
        images = frm.getElementsByType(odImage)
        if not images:
            continue
        i_href = images[0].getAttribute('href')
        if not i_href:
            # Nothing to look up in the archive for this image
            continue
        try:
            raw = zin.read(i_href)
        except KeyError:
            continue
        try:
            fmt, width, height = identify(raw)
        except Exception:
            continue
        imgnum += 1

        # The name attribute may be missing; calling .lower() on None would
        # abort the whole cover search.
        frame_name = frm.getAttribute('name')
        if opfmeta and frame_name and frame_name.lower() == 'opf.cover':
            cover_href = i_href
            cover_data = (fmt, raw)
            cover_frame = frame_name  # could have upper case
            break

        # Require positive dimensions so the ratio test cannot divide by zero
        if (cover_href is None and imgnum == 1 and width > 0 and height > 0
                and 0.8 <= height / width <= 1.8 and height * width >= 12000):
            # Pick the first image as the cover if it is of a suitable size
            cover_href = i_href
            cover_data = (fmt, raw)
            if not opfmeta:
                break

    if cover_href is not None:
        mi.cover = cover_href
        mi.odf_cover_frame = cover_frame
        if extract_cover:
            # cover_data is always set together with cover_href above
            mi.cover_data = cover_data
|