1
0
mirror of https://github.com/gryf/ebook-converter.git synced 2026-01-26 23:55:46 +01:00
Files
ebook-converter/ebook_converter/ebooks/oeb/transforms/metadata.py
gryf 1465e4267f Sorted out mime initialization.
Every mime-related function in the main __init__.py checks a flag to see
whether initialization has already been done. This is nonsense, since
initialization should happen implicitly, early on, when the converter starts.

This commit straightens things out: initialization is now done in the cli
module.

Also, the function guess_type was removed, since it was just a proxy for
the mimetypes.guess_type function.
2020-06-14 15:41:18 +02:00

218 lines
8.4 KiB
Python

import mimetypes
import os
import re
from ebook_converter.ebooks.oeb import base
from ebook_converter.utils.date import isoformat, now
def meta_info_to_oeb_metadata(mi, m, log, override_input_metadata=False):
    """Copy the fields that are set on ``mi`` into the OEB metadata ``m``.

    A field counts as set when ``mi.is_null(<field>)`` is false
    (``title_sort`` is tested for truthiness instead). For every set field
    the matching term in ``m`` is replaced with the value from ``mi``.

    :param mi: source metadata; must provide ``is_null()``,
        ``get_identifiers()``, ``format_series_index()`` and the field
        attributes read below.
    :param m: destination metadata container; modified in place through its
        ``clear()``, ``add()`` and ``filter()`` methods.
    :param log: not used by this function; kept so existing callers keep
        working.
    :param override_input_metadata: when true, terms whose field is *not*
        set on ``mi`` are removed from ``m`` instead of being left alone.
        This applies to the book producer, description, publisher, series,
        series index, rating and subjects, and to ISBN identifiers when
        ``mi`` carries no ISBN.
    :return: ``None``.
    """
    def replace(term, field, clearable=False):
        # Plain one-to-one copy of ``mi.<field>`` into ``term``. With
        # ``clearable`` a missing field wipes the term when overriding.
        if not mi.is_null(field):
            m.clear(term)
            m.add(term, getattr(mi, field))
        elif clearable and override_input_metadata:
            m.clear(term)

    def is_book_producer(x):
        return x.role.lower() == 'bkp'

    replace('title', 'title')
    if mi.title_sort:
        # Fall back to the sort title when there is no title at all.
        if not m.title:
            m.add('title', mi.title_sort)
        m.clear('title_sort')
        m.add('title_sort', mi.title_sort)

    if not mi.is_null('authors'):
        m.filter('creator', lambda x: x.role.lower() in ['aut', ''])
        for a in mi.authors:
            attrib = {'role': 'aut'}
            if mi.author_sort:
                attrib[base.tag('opf', 'file-as')] = mi.author_sort
            m.add('creator', a, attrib=attrib)

    if not mi.is_null('book_producer'):
        m.filter('contributor', is_book_producer)
        m.add('contributor', mi.book_producer, role='bkp')
    elif override_input_metadata:
        m.filter('contributor', is_book_producer)

    replace('description', 'comments', clearable=True)
    replace('publisher', 'publisher', clearable=True)
    replace('series', 'series', clearable=True)

    # Update identifiers whose scheme already exists in ``m`` (compared
    # case-insensitively); add the remaining ones with an upper-cased scheme.
    set_isbn = False
    for typ, val in mi.get_identifiers().items():
        scheme = typ.lower()
        if scheme == 'isbn':
            set_isbn = True
        has = False
        for x in m.identifier:
            if x.scheme.lower() == scheme:
                x.content = val
                has = True
        if not has:
            m.add('identifier', val, scheme=typ.upper())
    if override_input_metadata and not set_isbn:
        m.filter('identifier', lambda x: x.scheme.lower() == 'isbn')

    if not mi.is_null('languages'):
        m.clear('language')
        for lang in mi.languages:
            # Skip empty entries and the "undetermined" code.
            if lang and lang.lower() not in ('und', ''):
                m.add('language', lang)

    if not mi.is_null('series_index'):
        m.clear('series_index')
        m.add('series_index', mi.format_series_index())
    elif override_input_metadata:
        m.clear('series_index')

    if not mi.is_null('rating'):
        m.clear('rating')
        m.add('rating', '%.2f' % mi.rating)
    elif override_input_metadata:
        m.clear('rating')

    if not mi.is_null('tags'):
        m.clear('subject')
        for t in mi.tags:
            m.add('subject', t)
    elif override_input_metadata:
        m.clear('subject')

    if not mi.is_null('pubdate'):
        m.clear('date')
        m.add('date', isoformat(mi.pubdate))
    if not mi.is_null('timestamp'):
        m.clear('timestamp')
        m.add('timestamp', isoformat(mi.timestamp))

    replace('rights', 'rights')
    replace('publication_type', 'publication_type')

    # Make sure the result always carries a timestamp.
    if not m.timestamp:
        m.add('timestamp', isoformat(now()))
class MergeMetadata:
    """Merge in user metadata, including cover"""

    def __call__(self, oeb, mi, opts, override_input_metadata=False):
        """Merge the user metadata ``mi`` into the book ``oeb``.

        :param oeb: book container; its ``metadata``, ``guide``, ``manifest``
            and ``spine`` are modified in place and its ``log`` is used for
            messages.
        :param mi: user supplied metadata.
        :param opts: conversion options; only ``prefer_metadata_cover`` is
            read.
        :param override_input_metadata: passed through to
            ``meta_info_to_oeb_metadata``.
        """
        self.oeb, self.log = oeb, oeb.log
        m = self.oeb.metadata
        self.log('Merging user specified metadata...')
        meta_info_to_oeb_metadata(
            mi, m, oeb.log, override_input_metadata=override_input_metadata)

        cover_id = self.set_cover(mi, opts.prefer_metadata_cover)
        m.clear('cover')
        if cover_id is not None:
            m.add('cover', cover_id)

        if mi.uuid is not None:
            m.filter('identifier', lambda x: x.id == 'uuid_id')
            m.add('identifier', mi.uuid, id='uuid_id', scheme='uuid')
            # The identifier that was just added becomes the book's uid.
            self.oeb.uid = m.identifier[-1]
        if mi.application_id is not None:
            m.filter('identifier', lambda x: x.scheme == 'calibre')
            m.add('identifier', mi.application_id, scheme='calibre')

    def set_cover(self, mi, prefer_metadata_cover):
        """Install the cover described by ``mi`` and return its manifest id.

        The cover bytes come from the readable file ``mi.cover`` or, failing
        that, from ``mi.cover_data`` (an ``(extension, data)`` pair). When
        ``prefer_metadata_cover`` is true and the guide already has a
        ``cover`` entry, the user supplied cover is ignored.

        :return: the manifest id of the cover to use, or ``None`` when there
            is neither a new cover nor an existing one.
        """
        cdata, ext = b'', 'jpg'
        if mi.cover and os.access(mi.cover, os.R_OK):
            with open(mi.cover, 'rb') as f:
                cdata = f.read()
            ext = mi.cover.rpartition('.')[-1].lower().strip()
        elif mi.cover_data and mi.cover_data[-1]:
            cdata = mi.cover_data[1]
            ext = mi.cover_data[0]
        if ext not in ('png', 'jpg', 'jpeg'):
            ext = 'jpg'

        guide, manifest = self.oeb.guide, self.oeb.manifest
        old_cover = guide['cover'] if 'cover' in guide else None
        if prefer_metadata_cover and old_cover is not None:
            cdata = b''

        if cdata:
            guide.remove('cover')
            guide.remove('titlepage')
        elif (self.oeb.plumber_output_format in {'mobi', 'azw3'} and
                old_cover is not None):
            # The amazon formats don't support html cover pages, so remove
            # them even if no cover was specified.
            guide.remove('titlepage')

        old_item = None
        if old_cover is not None:
            if old_cover.href in manifest.hrefs:
                old_item = manifest.hrefs[old_cover.href]
                if not cdata:
                    # Nothing new to install: keep the existing cover.
                    return old_item.id
            elif not cdata:
                # The guide points at a file missing from the manifest;
                # register it so that the cover can be referenced by id.
                cover_id = manifest.generate(id='cover')[0]
                manifest.add(cover_id, old_cover.href, 'image/jpeg')
                return cover_id

        if not cdata:
            return None

        cover_id, href = manifest.generate('cover', 'cover.' + ext)
        new_cover_item = manifest.add(
            cover_id, href, mimetypes.guess_type('cover.' + ext)[0],
            data=cdata)
        guide.add('cover', 'Cover', href)
        if old_item is not None:
            self.remove_old_cover(old_item, new_cover_item.href)
        return cover_id

    def remove_old_cover(self, cover_item, new_cover_href=None):
        """Remove ``cover_item`` from the book and fix references to it.

        Images in spine items that point at the old cover are re-targeted to
        ``new_cover_href`` when it is given, otherwise they are deleted.
        Spine items that are left with neither text nor images after a
        deletion are removed from the spine, manifest and guide.
        """
        from lxml import etree

        self.oeb.manifest.remove(cover_item)

        # Remove any references to the cover in the HTML
        affected_items = set()
        find_images = base.XPath('//h:img[@src]|//svg:image[@xl:href]')
        xlink_href = base.XLINK('href')
        for item in self.oeb.spine:
            try:
                images = find_images(item.data)
            except Exception:
                # Best effort: an item whose data cannot be queried is
                # treated as containing no images.
                images = ()
            removed = False
            for img in images:
                href = img.get('src') or img.get(xlink_href)
                try:
                    href = item.abshref(href)
                except Exception:
                    continue  # Invalid URL, ignore
                if href != cover_item.href:
                    continue
                if new_cover_href is not None:
                    attr = 'src' if img.tag.endswith('img') else xlink_href
                    img.set(attr, item.relhref(new_cover_href))
                else:
                    parent = img.getparent()
                    if parent.tag.endswith('}svg'):
                        # Drop the whole <svg> wrapper, not just the image.
                        parent.getparent().remove(parent)
                    else:
                        parent.remove(img)
                    removed = True
            if removed:
                affected_items.add(item)

        # Check if the resulting HTML has no content, if so remove it
        find_body = base.XPath('//h:body')
        find_visuals = base.XPath('//h:img|//svg:svg')
        for item in affected_items:
            body = find_body(item.data)
            if body:
                text = etree.tostring(body[0], method='text',
                                      encoding='unicode')
            else:
                text = ''
            if re.sub(r'\s+', '', text) or find_visuals(item.data):
                continue
            self.log('Removing %s as it is a wrapper around the cover '
                     'image' % item.href)
            self.oeb.spine.remove(item)
            self.oeb.manifest.remove(item)
            self.oeb.guide.remove_by_href(item.href)