mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-01-29 09:35:45 +01:00
298 lines
11 KiB
Python
298 lines
11 KiB
Python
# Copyright (C) 2006 Søren Roug, European Environment Agency
|
|
#
|
|
# This is free software. You may redistribute it under the terms
|
|
# of the Apache license and the GNU General Public License Version
|
|
# 2 or at your option any later version.
|
|
#
|
|
# This program is distributed in the hope that it will be useful,
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
# GNU General Public License for more details.
|
|
#
|
|
# You should have received a copy of the GNU General Public
|
|
# License along with this program; if not, write to the Free Software
|
|
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
#
|
|
# Contributor(s):
|
|
#
|
|
import io
|
|
import json
|
|
import os
|
|
import re
|
|
|
|
from lxml.etree import fromstring, tostring
|
|
|
|
from ebook_converter.ebooks.metadata import (
|
|
MetaInformation, authors_to_string, check_isbn, string_to_authors
|
|
)
|
|
from ebook_converter.utils.date import isoformat, parse_date
|
|
from ebook_converter.utils.imghdr import identify
|
|
from ebook_converter.utils.localization import canonicalize_lang, lang_as_iso639_1
|
|
from ebook_converter.utils.zipfile import ZipFile, safe_replace
|
|
from odf.draw import Frame as odFrame, Image as odImage
|
|
from odf.namespaces import DCNS, METANS, OFFICENS
|
|
from odf.opendocument import load as odLoad
|
|
|
|
|
|
# Maps each supported metadata field name to the (namespace, tag) pair used
# to locate or create the matching element in meta.xml. Every field name is
# identical to its tag name, so the map is generated from the two tag lists.
fields = dict(
    [(tag, (DCNS, tag)) for tag in (
        'title',
        'description',
        'subject',
        'creator',
        'date',
        'language',
    )] +
    [(tag, (METANS, tag)) for tag in (
        'generator',
        'initial-creator',
        'keyword',
        'keywords',
        'editing-duration',
        'editing-cycles',
        'printed-by',
        'print-date',
        'creation-date',
        'user-defined',
        # 'template',
    )]
)
|
|
|
|
|
|
def _apply_opf_user_metadata(mi, data):
    """Copy the ``opf.*`` user-defined properties from *data* onto *mi*.

    *data* maps lower-cased user-defined property names to their values
    (text, or a bool for boolean-typed properties). Missing or empty
    properties are skipped. Unparsable identifiers and ratings are ignored,
    and an unparsable series index falls back to 1.0.
    """
    if data.get('opf.titlesort'):
        mi.title_sort = data['opf.titlesort']
    if data.get('opf.authors'):
        mi.authors = string_to_authors(data['opf.authors'])
    if data.get('opf.authorsort'):
        mi.author_sort = data['opf.authorsort']
    if data.get('opf.isbn'):
        isbn = check_isbn(data['opf.isbn'])
        if isbn is not None:
            mi.isbn = isbn
    if data.get('opf.publisher'):
        mi.publisher = data['opf.publisher']
    if data.get('opf.pubdate'):
        mi.pubdate = parse_date(data['opf.pubdate'], assume_utc=True)
    if data.get('opf.identifiers'):
        try:
            mi.identifiers = json.loads(data['opf.identifiers'])
        except Exception:
            # Best effort: malformed identifier JSON must not abort the read.
            pass
    if data.get('opf.rating'):
        try:
            # Clamp the rating into the 0..10 range.
            mi.rating = max(0, min(float(data['opf.rating']), 10))
        except Exception:
            # Best effort: a non-numeric rating is simply skipped.
            pass
    if data.get('opf.series'):
        mi.series = data['opf.series']
        if data.get('opf.seriesindex'):
            try:
                mi.series_index = float(data['opf.seriesindex'])
            except Exception:
                mi.series_index = 1.0
    if data.get('opf.language'):
        cl = canonicalize_lang(data['opf.language'])
        if cl:
            mi.languages = [cl]


def get_metadata(stream, extract_cover=True):
    """Read metadata from the ODF archive in *stream*.

    The standard fields (title, creator, description, language, keywords)
    are read from ``meta.xml``. If the document carries a truthy
    ``opf.metadata`` user-defined property, the other ``opf.*`` user-defined
    properties are applied on top of them. Unless ``opf.nocover`` is set, a
    cover is looked up via ``read_cover``; failures there are ignored.

    :param stream: file-like object containing the ODF zip archive.
    :param extract_cover: passed through to ``read_cover``; when true the
        cover image data is stored on the result as well as its path.
    :return: a ``MetaInformation`` instance holding whatever was found.
    """
    whitespace = re.compile(r'\s+')

    with ZipFile(stream) as zf:
        root = fromstring(zf.read('meta.xml'))

        def find(field):
            # Whitespace-normalised text of the first element for *field*,
            # or None when the document has no such element.
            ns, tag = fields[field]
            matches = root.xpath('//ns0:{}'.format(tag),
                                 namespaces={'ns0': ns})
            if not matches:
                return None
            text = tostring(matches[0], method='text', encoding='unicode',
                            with_tail=False)
            return whitespace.sub(' ', text).strip()

        mi = MetaInformation(None, [])
        title = find('title')
        if title:
            mi.title = title
        creator = find('initial-creator') or find('creator')
        if creator:
            mi.authors = string_to_authors(creator)
        desc = find('description')
        if desc:
            mi.comments = desc
        lang = find('language')
        if lang:
            # Canonicalise once and reuse the result.
            canonical = canonicalize_lang(lang)
            if canonical:
                mi.languages = [canonical]
        kw = find('keyword') or find('keywords')
        if kw:
            mi.tags = [x.strip() for x in kw.split(',') if x.strip()]

        # Collect user-defined properties keyed by lower-cased name.
        data = {}
        for tag in root.xpath('//ns0:user-defined',
                              namespaces={'ns0': fields['user-defined'][0]}):
            name = (tag.get('{%s}name' % METANS) or '').lower()
            vtype = tag.get('{%s}value-type' % METANS) or 'string'
            val = tag.text
            if name and val:
                if vtype == 'boolean':
                    val = val == 'true'
                data[name] = val

        # opfmeta is needed again below when picking the cover.
        opfmeta = bool(data.get('opf.metadata'))
        opfnocover = False
        if opfmeta:
            # Custom metadata contains OPF information.
            _apply_opf_user_metadata(mi, data)
            opfnocover = data.get('opf.nocover', False)

        if not opfnocover:
            try:
                read_cover(stream, zf, mi, opfmeta, extract_cover)
            except Exception:
                # Do not let an error reading the cover prevent reading
                # other data.
                pass

    return mi
|
|
|
|
|
|
def set_metadata(stream, mi):
    """Write the metadata in *mi* into the ODF archive held by *stream*.

    The current ``meta.xml`` is read from the archive, updated by
    ``_set_metadata`` and then written back with ``safe_replace``.

    :param stream: seekable file-like object containing the ODF zip archive.
    :param mi: metadata object whose non-null fields should be stored.
    """
    with ZipFile(stream) as zf:
        # zf.read() does not leave an open member file object behind and
        # matches how get_metadata() reads the same entry.
        raw = _set_metadata(zf.read('meta.xml'), mi)

    # Rewind to the start of the archive before replacing the entry. The
    # offset is spelled out instead of passing a whence constant as offset.
    stream.seek(0, os.SEEK_SET)
    safe_replace(stream, "meta.xml", io.BytesIO(raw))
|
|
|
|
|
|
def _set_metadata(raw, mi):
    """Return the ``meta.xml`` document *raw* updated from *mi*.

    Only fields of *mi* that are not null are written. Title, authors,
    description, keywords and language replace the matching standard
    elements under ``office:meta``. The remaining fields are stored as
    ``meta:user-defined`` elements named ``opf.*``; the first such element
    written is preceded by a boolean ``opf.metadata`` marker.

    :param raw: serialized ``meta.xml`` content accepted by ``fromstring``.
    :param mi: metadata object providing ``is_null`` and the field values.
    :return: the updated document serialized as pretty-printed UTF-8 bytes.
    """
    root = fromstring(raw)
    namespaces = {'office': OFFICENS, 'meta': METANS, 'dc': DCNS}
    prefix_for = {uri: prefix for prefix, uri in namespaces.items()}
    name_attr = '{%s}name' % METANS
    value_type_attr = '{%s}value-type' % METANS
    sentinel_added = False

    def xpath(expr, parent=root):
        return parent.xpath(expr, namespaces=namespaces)

    # All standard and user-defined elements are written under this node.
    meta = xpath('//office:meta')[0]

    def remove(*tag_names):
        # Drop every existing element for the given standard field names.
        for tag_name in tag_names:
            prefix = prefix_for[fields[tag_name][0]]
            stale = xpath('descendant::' + '{}:{}'.format(prefix, tag_name),
                          meta)
            for node in stale:
                node.getparent().remove(node)

    def add(tag, val=None):
        # Append a new element for field *tag* with text *val*.
        node = meta.makeelement('{%s}%s' % fields[tag])
        node.text = val
        meta.append(node)
        return node

    def remove_user_metadata(*names):
        # Drop user-defined elements whose lower-cased name is in *names*.
        for node in xpath('//meta:user-defined'):
            if (node.get(name_attr) or '').lower() in names:
                node.getparent().remove(node)

    def add_um(name, val, vtype='string'):
        node = add('user-defined', val)
        node.set(value_type_attr, vtype)
        node.set(name_attr, name)

    def add_user_metadata(name, val):
        nonlocal sentinel_added
        if not sentinel_added:
            # First opf.* value written: (re)create the marker element.
            sentinel_added = True
            remove_user_metadata('opf.metadata')
            add_um('opf.metadata', 'true', 'boolean')
        if hasattr(val, 'strftime'):
            # Date-like values are stored as the date part of the ISO form.
            add_um(name, isoformat(val, as_utc=True).split('T')[0], 'date')
        else:
            add_um(name, val, 'string')

    def replace_user_metadata(name, val):
        remove_user_metadata(name)
        add_user_metadata(name, val)

    def present(field):
        return not mi.is_null(field)

    if present('title'):
        remove('title')
        add('title', mi.title)
    if present('title_sort'):
        replace_user_metadata('opf.titlesort', mi.title_sort)
    if present('authors'):
        remove('initial-creator', 'creator')
        author_text = authors_to_string(mi.authors)
        add('initial-creator', author_text)
        add('creator', author_text)
        replace_user_metadata('opf.authors', author_text)
    if present('author_sort'):
        replace_user_metadata('opf.authorsort', mi.author_sort)
    if present('comments'):
        remove('description')
        add('description', mi.comments)
    if present('tags'):
        remove('keyword')
        add('keyword', ', '.join(mi.tags))
    if present('languages'):
        lang = lang_as_iso639_1(mi.languages[0])
        if lang:
            remove('language')
            add('language', lang)
    if present('pubdate'):
        replace_user_metadata('opf.pubdate', mi.pubdate)
    if present('publisher'):
        replace_user_metadata('opf.publisher', mi.publisher)
    if present('series'):
        remove_user_metadata('opf.series', 'opf.seriesindex')
        add_user_metadata('opf.series', mi.series)
        add_user_metadata('opf.seriesindex', '{}'.format(mi.series_index))
    if present('identifiers'):
        replace_user_metadata('opf.identifiers', json.dumps(mi.identifiers))
    if present('rating'):
        replace_user_metadata('opf.rating', '%.2g' % mi.rating)

    return tostring(root, encoding='utf-8', pretty_print=True)
|
|
|
|
|
|
def read_cover(stream, zin, mi, opfmeta, extract_cover):
    """Locate the cover image of an ODF document and record it on *mi*.

    Frames containing an image are scanned in document order. When *opfmeta*
    is true, a frame named ``opf.cover`` (case-insensitive) wins. Otherwise,
    or as a fallback, the first readable image is used if it has a suitable
    size and aspect ratio (an approach borrowed from docx).

    :param stream: file-like object with the ODF archive, parsed via odfpy.
    :param zin: open zip archive used to read the image bytes.
    :param mi: metadata object; ``cover`` and ``odf_cover_frame`` are set
        when a cover is found, plus ``cover_data`` if *extract_cover*.
    :param opfmeta: true when the document carries OPF custom metadata.
    :param extract_cover: when true, store ``(format, bytes)`` of the cover.
    """
    otext = odLoad(stream)
    cover_href = None
    cover_data = None
    cover_frame = None
    imgnum = 0
    for frm in otext.topnode.getElementsByType(odFrame):
        img = frm.getElementsByType(odImage)
        if not img:
            continue
        i_href = img[0].getAttribute('href')
        try:
            raw = zin.read(i_href)
        except KeyError:
            # The image is not stored inside the archive.
            continue
        try:
            fmt, width, height = identify(raw)
        except Exception:
            # Not something identify() can parse as an image; skip it.
            continue
        imgnum += 1
        # A frame may have no name attribute at all; treat that as empty so
        # an unnamed frame cannot abort the whole search.
        frame_name = frm.getAttribute('name') or ''
        if opfmeta and frame_name.lower() == 'opf.cover':
            cover_href = i_href
            cover_data = (fmt, raw)
            cover_frame = frame_name  # could have upper case
            break
        # width > 0 guards the aspect-ratio division against zero-width
        # images.
        if (cover_href is None and imgnum == 1 and width > 0
                and 0.8 <= height / width <= 1.8
                and height * width >= 12000):
            # Pick the first image as the cover if it is of a suitable size
            cover_href = i_href
            cover_data = (fmt, raw)
            if not opfmeta:
                break

    if cover_href is not None:
        mi.cover = cover_href
        mi.odf_cover_frame = cover_frame
        if extract_cover:
            # cover_data is always assigned together with cover_href above,
            # so no second read of the image is needed here.
            mi.cover_data = cover_data
|