mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-04-17 19:43:34 +02:00
Initial import
This commit is contained in:
302
ebook_converter/ebooks/metadata/odt.py
Normal file
302
ebook_converter/ebooks/metadata/odt.py
Normal file
@@ -0,0 +1,302 @@
|
||||
#!/usr/bin/python2
|
||||
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:fdm=marker:ai
|
||||
#
|
||||
# Copyright (C) 2006 Søren Roug, European Environment Agency
|
||||
#
|
||||
# This is free software. You may redistribute it under the terms
|
||||
# of the Apache license and the GNU General Public License Version
|
||||
# 2 or at your option any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public
|
||||
# License along with this program; if not, write to the Free Software
|
||||
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
||||
#
|
||||
# Contributor(s):
|
||||
#
|
||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||
|
||||
import io
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
|
||||
from lxml.etree import fromstring, tostring
|
||||
|
||||
from calibre.ebooks.metadata import (
|
||||
MetaInformation, authors_to_string, check_isbn, string_to_authors
|
||||
)
|
||||
from calibre.utils.date import isoformat, parse_date
|
||||
from calibre.utils.imghdr import identify
|
||||
from calibre.utils.localization import canonicalize_lang, lang_as_iso639_1
|
||||
from calibre.utils.zipfile import ZipFile, safe_replace
|
||||
from odf.draw import Frame as odFrame, Image as odImage
|
||||
from odf.namespaces import DCNS, METANS, OFFICENS
|
||||
from odf.opendocument import load as odLoad
|
||||
from polyglot.builtins import as_unicode
|
||||
|
||||
# Map of short field names to (XML-namespace, tag-name) pairs used to look
# up metadata elements in the ODT document's meta.xml.  DCNS entries are
# Dublin Core elements, METANS entries are OpenDocument <meta:*> elements.
fields = {
    'title': (DCNS, 'title'),
    'description': (DCNS, 'description'),
    'subject': (DCNS, 'subject'),
    'creator': (DCNS, 'creator'),
    'date': (DCNS, 'date'),
    'language': (DCNS, 'language'),
    'generator': (METANS, 'generator'),
    'initial-creator': (METANS, 'initial-creator'),
    'keyword': (METANS, 'keyword'),
    'keywords': (METANS, 'keywords'),
    'editing-duration': (METANS, 'editing-duration'),
    'editing-cycles': (METANS, 'editing-cycles'),
    'printed-by': (METANS, 'printed-by'),
    'print-date': (METANS, 'print-date'),
    'creation-date': (METANS, 'creation-date'),
    'user-defined': (METANS, 'user-defined'),
    # 'template': (METANS, 'template'),
}
|
||||
|
||||
|
||||
def get_metadata(stream, extract_cover=True):
    """Read metadata from an ODT document.

    :param stream: a seekable file-like object holding the ODT (zip) data
    :param extract_cover: when True, also extract the cover image data
    :return: a MetaInformation object populated from meta.xml
    """
    whitespace = re.compile(r'\s+')

    def normalize(s):
        # Collapse internal whitespace runs to single spaces.
        return whitespace.sub(' ', s).strip()

    with ZipFile(stream) as zf:
        meta = zf.read('meta.xml')
        root = fromstring(meta)

        def find(field):
            # Return the normalized text of the first element matching
            # `field` (see the module-level `fields` map), or None.
            ns, tag = fields[field]
            ans = root.xpath('//ns0:{}'.format(tag), namespaces={'ns0': ns})
            if ans:
                return normalize(tostring(
                    ans[0], method='text', encoding='unicode',
                    with_tail=False)).strip()

        mi = MetaInformation(None, [])
        title = find('title')
        if title:
            mi.title = title
        creator = find('initial-creator') or find('creator')
        if creator:
            mi.authors = string_to_authors(creator)
        desc = find('description')
        if desc:
            mi.comments = desc
        lang = find('language')
        if lang:
            # Fix: compute the canonical language once instead of calling
            # canonicalize_lang() twice (condition and assignment).
            cl = canonicalize_lang(lang)
            if cl:
                mi.languages = [cl]
        kw = find('keyword') or find('keywords')
        if kw:
            mi.tags = [x.strip() for x in kw.split(',') if x.strip()]

        # Collect <meta:user-defined> properties; calibre stores its own
        # OPF-style metadata here under names prefixed with "opf.".
        data = {}
        for tag in root.xpath(
                '//ns0:user-defined',
                namespaces={'ns0': fields['user-defined'][0]}):
            name = (tag.get('{%s}name' % METANS) or '').lower()
            vtype = tag.get('{%s}value-type' % METANS) or 'string'
            val = tag.text
            if name and val:
                if vtype == 'boolean':
                    val = val == 'true'
                data[name] = val

        opfmeta = False  # we need this later for the cover
        opfnocover = False
        if data.get('opf.metadata'):
            # custom metadata contains OPF information
            opfmeta = True
            if data.get('opf.titlesort', ''):
                mi.title_sort = data['opf.titlesort']
            if data.get('opf.authors', ''):
                mi.authors = string_to_authors(data['opf.authors'])
            if data.get('opf.authorsort', ''):
                mi.author_sort = data['opf.authorsort']
            if data.get('opf.isbn', ''):
                isbn = check_isbn(data['opf.isbn'])
                if isbn is not None:
                    mi.isbn = isbn
            if data.get('opf.publisher', ''):
                mi.publisher = data['opf.publisher']
            if data.get('opf.pubdate', ''):
                mi.pubdate = parse_date(data['opf.pubdate'], assume_utc=True)
            if data.get('opf.identifiers'):
                try:
                    mi.identifiers = json.loads(data['opf.identifiers'])
                except Exception:
                    pass
            if data.get('opf.rating'):
                try:
                    # Clamp to the valid 0-10 range.
                    mi.rating = max(0, min(float(data['opf.rating']), 10))
                except Exception:
                    pass
            if data.get('opf.series', ''):
                mi.series = data['opf.series']
                if data.get('opf.seriesindex', ''):
                    try:
                        mi.series_index = float(data['opf.seriesindex'])
                    except Exception:
                        mi.series_index = 1.0
            if data.get('opf.language', ''):
                cl = canonicalize_lang(data['opf.language'])
                if cl:
                    mi.languages = [cl]
            opfnocover = data.get('opf.nocover', False)

        if not opfnocover:
            try:
                read_cover(stream, zf, mi, opfmeta, extract_cover)
            except Exception:
                pass  # Do not let an error reading the cover prevent reading other data

    return mi
|
||||
|
||||
|
||||
def set_metadata(stream, mi):
    """Write the metadata in `mi` into the ODT file held by `stream`.

    The archive member meta.xml is rewritten in place via safe_replace().
    """
    with ZipFile(stream) as zf:
        # Fix: use ZipFile.read() rather than open(...).read(), which left
        # the member file handle unclosed.
        raw = _set_metadata(zf.read('meta.xml'), mi)

    # Fix: the original called stream.seek(os.SEEK_SET), relying on the
    # constant's value (0) being a valid offset; make the rewind explicit.
    stream.seek(0, os.SEEK_SET)
    safe_replace(stream, "meta.xml", io.BytesIO(raw))
|
||||
|
||||
|
||||
def _set_metadata(raw, mi):
    """Return a new meta.xml (bytes) with the metadata from `mi` applied.

    :param raw: the original meta.xml content (bytes)
    :param mi: a MetaInformation-like object; only non-null fields are written
    """
    root = fromstring(raw)
    namespaces = {'office': OFFICENS, 'meta': METANS, 'dc': DCNS}
    nsrmap = {v: k for k, v in namespaces.items()}

    def xpath(expr, parent=root):
        return parent.xpath(expr, namespaces=namespaces)

    def remove(*tag_names):
        # Drop every existing element for each named field from <office:meta>.
        for tag_name in tag_names:
            ns = fields[tag_name][0]
            tag_name = '{}:{}'.format(nsrmap[ns], tag_name)
            for x in xpath('descendant::' + tag_name, meta):
                x.getparent().remove(x)

    def add(tag, val=None):
        # Append a new element for `tag` to <office:meta> and return it.
        ans = meta.makeelement('{%s}%s' % fields[tag])
        ans.text = val
        meta.append(ans)
        return ans

    def remove_user_metadata(*names):
        # Drop <meta:user-defined> entries whose meta:name matches any of
        # `names` (case-insensitive).
        for x in xpath('//meta:user-defined'):
            q = (x.get('{%s}name' % METANS) or '').lower()
            if q in names:
                x.getparent().remove(x)

    def add_um(name, val, vtype='string'):
        ans = add('user-defined', val)
        ans.set('{%s}value-type' % METANS, vtype)
        ans.set('{%s}name' % METANS, name)

    # Fix: the original tracked "sentinel already written" via an attribute
    # set on the add_user_metadata function object; use an explicit mutable
    # flag instead (py2-compatible stand-in for `nonlocal`).
    sentinel_added = []

    def add_user_metadata(name, val):
        # The first user-defined write also (re)writes the opf.metadata
        # marker so readers know OPF-style metadata is present.
        if not sentinel_added:
            sentinel_added.append(True)
            remove_user_metadata('opf.metadata')
            add_um('opf.metadata', 'true', 'boolean')
        val_type = 'string'
        if hasattr(val, 'strftime'):
            # Store dates as ISO date-only strings in UTC.
            val = isoformat(val, as_utc=True).split('T')[0]
            val_type = 'date'
        add_um(name, val, val_type)

    meta = xpath('//office:meta')[0]

    if not mi.is_null('title'):
        remove('title')
        add('title', mi.title)
    if not mi.is_null('title_sort'):
        remove_user_metadata('opf.titlesort')
        add_user_metadata('opf.titlesort', mi.title_sort)
    if not mi.is_null('authors'):
        remove('initial-creator', 'creator')
        val = authors_to_string(mi.authors)
        add('initial-creator', val), add('creator', val)
        remove_user_metadata('opf.authors')
        add_user_metadata('opf.authors', val)
    if not mi.is_null('author_sort'):
        remove_user_metadata('opf.authorsort')
        add_user_metadata('opf.authorsort', mi.author_sort)
    if not mi.is_null('comments'):
        remove('description')
        add('description', mi.comments)
    if not mi.is_null('tags'):
        remove('keyword')
        add('keyword', ', '.join(mi.tags))
    if not mi.is_null('languages'):
        lang = lang_as_iso639_1(mi.languages[0])
        if lang:
            remove('language')
            add('language', lang)
    if not mi.is_null('pubdate'):
        remove_user_metadata('opf.pubdate')
        add_user_metadata('opf.pubdate', mi.pubdate)
    if not mi.is_null('publisher'):
        remove_user_metadata('opf.publisher')
        add_user_metadata('opf.publisher', mi.publisher)
    if not mi.is_null('series'):
        remove_user_metadata('opf.series', 'opf.seriesindex')
        add_user_metadata('opf.series', mi.series)
        add_user_metadata('opf.seriesindex', '{}'.format(mi.series_index))
    if not mi.is_null('identifiers'):
        remove_user_metadata('opf.identifiers')
        add_user_metadata('opf.identifiers', as_unicode(json.dumps(mi.identifiers)))
    if not mi.is_null('rating'):
        remove_user_metadata('opf.rating')
        add_user_metadata('opf.rating', '%.2g' % mi.rating)

    return tostring(root, encoding='utf-8', pretty_print=True)
|
||||
|
||||
|
||||
def read_cover(stream, zin, mi, opfmeta, extract_cover):
    """Locate (and optionally extract) the cover image of an ODT document.

    Searches for a draw:image in a draw:frame named 'opf.cover'.  If the
    opf.metadata property is false, just use the first image that has a
    proper size (borrowed from docx).  Sets mi.cover, mi.odf_cover_frame
    and (when extract_cover is true) mi.cover_data.
    """
    otext = odLoad(stream)
    cover_href = None
    cover_data = None
    cover_frame = None
    imgnum = 0
    for frm in otext.topnode.getElementsByType(odFrame):
        img = frm.getElementsByType(odImage)
        if len(img) == 0:
            continue
        i_href = img[0].getAttribute('href')
        try:
            raw = zin.read(i_href)
        except KeyError:
            # Image reference not present in the archive; skip it.
            continue
        try:
            fmt, width, height = identify(raw)
        except Exception:
            continue
        imgnum += 1
        if opfmeta and frm.getAttribute('name').lower() == 'opf.cover':
            cover_href = i_href
            cover_data = (fmt, raw)
            cover_frame = frm.getAttribute('name')  # could have upper case
            break
        # Fix: guard width > 0 so a degenerate image cannot raise
        # ZeroDivisionError in the aspect-ratio test.
        if (cover_href is None and imgnum == 1 and width > 0 and
                0.8 <= height/width <= 1.8 and height*width >= 12000):
            # Pick the first image as the cover if it is of a suitable size
            cover_href = i_href
            cover_data = (fmt, raw)
            if not opfmeta:
                break

    if cover_href is not None:
        mi.cover = cover_href
        mi.odf_cover_frame = cover_frame
        if extract_cover:
            if not cover_data:
                raw = zin.read(cover_href)
                try:
                    fmt = identify(raw)[0]
                except Exception:
                    pass
                else:
                    cover_data = (fmt, raw)
            mi.cover_data = cover_data
|
||||
Reference in New Issue
Block a user