1
0
mirror of https://github.com/gryf/ebook-converter.git synced 2026-02-23 10:35:49 +01:00
Files
ebook-converter/ebook_converter/ebooks/oeb/transforms/metadata.py
gryf ce89f5c9d1 Use the real constants module.
This is an ongoing refactoring of the calibre code to make it more
readable, and to transform it into something more coherent.

In this patch, there are changes regarding imports for some modules,
instead of polluting the namespace of each module with symbols from other
modules, which were often themselves imported from yet other modules. Yuck.
2020-05-29 17:04:53 +02:00

219 lines
8.5 KiB
Python

import os
import re
from ebook_converter.ebooks.oeb import base
from ebook_converter.utils.date import isoformat, now
from ebook_converter import guess_type
def meta_info_to_oeb_metadata(mi, m, log, override_input_metadata=False):
    """Copy the fields that are set on ``mi`` into the OEB metadata ``m``.

    For each field that is not null on ``mi`` the corresponding term in
    ``m`` is cleared (or filtered) and re-added with the value from ``mi``.
    The calls on ``m`` are issued in a fixed order: title, title sort,
    creators, book producer, description, publisher, series, identifiers,
    languages, series index, rating, subjects, date, timestamp, rights and
    publication type.

    :param mi: Source metadata. Must provide ``is_null(name)``,
        ``get_identifiers()``, ``format_series_index()`` and the attributes
        read below (``title``, ``authors``, ``tags``, ...).
    :param m: Target metadata container. Must provide ``clear(term)``,
        ``add(term, value, **attrs)``, ``filter(term, predicate)`` and the
        ``title``, ``identifier`` and ``timestamp`` attributes.
    :param log: Not used by this function; kept so existing callers keep
        working.
    :param override_input_metadata: When true, the book producer,
        description, publisher, series, series index, rating, subjects and
        ISBN identifier already present in ``m`` are dropped even though
        ``mi`` carries no replacement for them.
    :returns: ``None``; ``m`` is modified in place.
    """
    # NOTE(review): m.filter() is assumed to remove the entries for which
    # the predicate is true -- confirm against the metadata container.
    if not mi.is_null('title'):
        m.clear('title')
        m.add('title', mi.title)
    if mi.title_sort:
        if not m.title:
            # No title at all so far: use the sort title as the title too.
            m.add('title', mi.title_sort)
        m.clear('title_sort')
        m.add('title_sort', mi.title_sort)
    if not mi.is_null('authors'):
        m.filter('creator', lambda x: x.role.lower() in ['aut', ''])
        for a in mi.authors:
            attrib = {'role': 'aut'}
            if mi.author_sort:
                attrib[base.tag('opf', 'file-as')] = mi.author_sort
            m.add('creator', a, attrib=attrib)
    if not mi.is_null('book_producer'):
        m.filter('contributor', lambda x: x.role.lower() == 'bkp')
        m.add('contributor', mi.book_producer, role='bkp')
    elif override_input_metadata:
        m.filter('contributor', lambda x: x.role.lower() == 'bkp')
    if not mi.is_null('comments'):
        m.clear('description')
        m.add('description', mi.comments)
    elif override_input_metadata:
        m.clear('description')
    if not mi.is_null('publisher'):
        m.clear('publisher')
        m.add('publisher', mi.publisher)
    elif override_input_metadata:
        m.clear('publisher')
    if not mi.is_null('series'):
        m.clear('series')
        m.add('series', mi.series)
    elif override_input_metadata:
        m.clear('series')

    # Identifiers are merged rather than replaced: an existing entry with
    # the same scheme (compared case-insensitively) is updated in place,
    # anything else is appended with an upper-cased scheme.
    identifiers = mi.get_identifiers()
    set_isbn = False
    for typ, val in identifiers.items():
        scheme = typ.lower()
        if scheme == 'isbn':
            set_isbn = True
        has = False
        for x in m.identifier:
            if x.scheme.lower() == scheme:
                x.content = val
                has = True
        if not has:
            m.add('identifier', val, scheme=typ.upper())
    if override_input_metadata and not set_isbn:
        m.filter('identifier', lambda x: x.scheme.lower() == 'isbn')

    if not mi.is_null('languages'):
        m.clear('language')
        for lang in mi.languages:
            # Skip empty entries and the "undetermined" language code.
            if lang and lang.lower() != 'und':
                m.add('language', lang)
    if not mi.is_null('series_index'):
        m.clear('series_index')
        m.add('series_index', mi.format_series_index())
    elif override_input_metadata:
        m.clear('series_index')
    if not mi.is_null('rating'):
        m.clear('rating')
        m.add('rating', '%.2f' % mi.rating)
    elif override_input_metadata:
        m.clear('rating')
    if not mi.is_null('tags'):
        m.clear('subject')
        for t in mi.tags:
            m.add('subject', t)
    elif override_input_metadata:
        m.clear('subject')
    if not mi.is_null('pubdate'):
        m.clear('date')
        m.add('date', isoformat(mi.pubdate))
    if not mi.is_null('timestamp'):
        m.clear('timestamp')
        m.add('timestamp', isoformat(mi.timestamp))
    if not mi.is_null('rights'):
        m.clear('rights')
        m.add('rights', mi.rights)
    if not mi.is_null('publication_type'):
        m.clear('publication_type')
        m.add('publication_type', mi.publication_type)

    # Make sure the result always carries a timestamp.
    if not m.timestamp:
        m.add('timestamp', isoformat(now()))
class MergeMetadata(object):
    """Merge in user metadata, including cover.

    Instances are callable; calling one stores the book and its logger on
    ``self.oeb`` / ``self.log`` and then updates the book's metadata,
    cover entry and identifiers from the supplied metadata object.
    """

    def __call__(self, oeb, mi, opts, override_input_metadata=False):
        """Apply the user metadata ``mi`` to the book ``oeb``.

        :param oeb: Book container; its ``metadata``, ``guide``,
            ``manifest``, ``spine``, ``log`` and ``uid`` are used.
        :param mi: User supplied metadata, including ``uuid``,
            ``application_id`` and the cover fields read by ``set_cover``.
        :param opts: Conversion options; only ``prefer_metadata_cover`` is
            read.
        :param override_input_metadata: Forwarded unchanged to
            ``meta_info_to_oeb_metadata``.
        """
        _oim = override_input_metadata
        self.oeb, self.log = oeb, oeb.log
        m = self.oeb.metadata
        self.log('Merging user specified metadata...')
        meta_info_to_oeb_metadata(mi, m, oeb.log,
                                  override_input_metadata=_oim)
        cover_id = self.set_cover(mi, opts.prefer_metadata_cover)
        m.clear('cover')
        if cover_id is not None:
            m.add('cover', cover_id)
        # NOTE(review): filter() is assumed to drop the entries matching
        # the predicate, so the identifiers added below replace old ones.
        if mi.uuid is not None:
            m.filter('identifier', lambda x: x.id == 'uuid_id')
            self.oeb.metadata.add('identifier', mi.uuid, id='uuid_id',
                                  scheme='uuid')
            # The identifier that was just appended becomes the book uid.
            self.oeb.uid = self.oeb.metadata.identifier[-1]
        if mi.application_id is not None:
            m.filter('identifier', lambda x: x.scheme == 'calibre')
            self.oeb.metadata.add('identifier', mi.application_id,
                                  scheme='calibre')

    def set_cover(self, mi, prefer_metadata_cover):
        """Install the cover described by ``mi`` and return its manifest id.

        Cover bytes are taken from the readable file ``mi.cover`` or,
        failing that, from ``mi.cover_data`` (a ``(extension, bytes)``
        pair). When ``prefer_metadata_cover`` is true and the book guide
        already has a ``cover`` entry, the supplied bytes are ignored.

        :returns: The manifest id of the cover to use: the newly added
            item, the already present item, or a freshly registered id for
            a guide cover that was missing from the manifest. ``None``
            when there is neither new cover data nor an existing cover.
        """
        cdata, ext = b'', 'jpg'
        if mi.cover and os.access(mi.cover, os.R_OK):
            with open(mi.cover, 'rb') as f:
                cdata = f.read()
            ext = mi.cover.rpartition('.')[-1].lower().strip()
        elif mi.cover_data and mi.cover_data[-1]:
            cdata = mi.cover_data[1]
            ext = mi.cover_data[0]
        if ext not in ('png', 'jpg', 'jpeg'):
            ext = 'jpg'

        old_cover = None
        if 'cover' in self.oeb.guide:
            old_cover = self.oeb.guide['cover']
        if prefer_metadata_cover and old_cover is not None:
            # Keep the cover that came with the book.
            cdata = b''

        if cdata:
            self.oeb.guide.remove('cover')
            self.oeb.guide.remove('titlepage')
        elif (self.oeb.plumber_output_format in {'mobi', 'azw3'} and
                old_cover is not None):
            # The amazon formats dont support html cover pages, so remove them
            # even if no cover was specified.
            self.oeb.guide.remove('titlepage')

        # Manifest item of the previous cover; only set when it has to be
        # replaced by the new cover further down.
        old_item = None
        if old_cover is not None:
            if old_cover.href in self.oeb.manifest.hrefs:
                old_item = self.oeb.manifest.hrefs[old_cover.href]
                if not cdata:
                    return old_item.id
            elif not cdata:
                # The guide points at a cover the manifest does not know
                # about: register it so that it gets an id.
                cover_id = self.oeb.manifest.generate(id='cover')[0]
                self.oeb.manifest.add(cover_id, old_cover.href, 'image/jpeg')
                return cover_id

        if not cdata:
            return None

        cover_id, href = self.oeb.manifest.generate('cover', 'cover.'+ext)
        new_cover_item = self.oeb.manifest.add(cover_id, href,
                                               guess_type('cover.'+ext)[0],
                                               data=cdata)
        self.oeb.guide.add('cover', 'Cover', href)
        if old_item is not None:
            self.remove_old_cover(old_item, new_cover_item.href)
        return cover_id

    def remove_old_cover(self, cover_item, new_cover_href=None):
        """Remove ``cover_item`` from the book and fix references to it.

        Every ``<img src>`` / ``<svg:image xlink:href>`` in the spine that
        resolves to the old cover is pointed at ``new_cover_href``, or
        removed from its document when ``new_cover_href`` is ``None``.
        Documents that end up with no text and no images after such a
        removal are dropped from the spine, manifest and guide.

        :param cover_item: Manifest item of the cover being replaced.
        :param new_cover_href: href of the replacement cover, if any.
        """
        from ebook_converter.ebooks.oeb.base import XPath, XLINK
        from lxml import etree

        self.oeb.manifest.remove(cover_item)

        # Remove any references to the cover in the HTML
        affected_items = set()
        xp = XPath('//h:img[@src]|//svg:image[@xl:href]')
        for item in self.oeb.spine:
            try:
                images = xp(item.data)
            except Exception:
                # Best effort: items whose data cannot be queried are
                # left untouched.
                images = ()
            removed = False
            for img in images:
                href = img.get('src') or img.get(XLINK('href'))
                try:
                    href = item.abshref(href)
                except Exception:
                    continue  # Invalid URL, ignore
                if href != cover_item.href:
                    continue
                if new_cover_href is not None:
                    replacement_href = item.relhref(new_cover_href)
                    attr = ('src' if img.tag.endswith('img')
                            else XLINK('href'))
                    img.set(attr, replacement_href)
                else:
                    p = img.getparent()
                    if p.tag.endswith('}svg'):
                        # Drop the whole <svg> wrapper, not just the image.
                        p.getparent().remove(p)
                    else:
                        p.remove(img)
                    removed = True
            if removed:
                affected_items.add(item)

        # Check if the resulting HTML has no content, if so remove it
        find_body = XPath('//h:body')
        find_images = XPath('//h:img|//svg:svg')
        for item in affected_items:
            body = find_body(item.data)
            if body:
                text = etree.tostring(body[0], method='text',
                                      encoding='unicode')
            else:
                text = ''
            text = re.sub(r'\s+', '', text)
            if not text and not find_images(item.data):
                self.log('Removing %s as it is a wrapper around the cover '
                         'image' % item.href)
                self.oeb.spine.remove(item)
                self.oeb.manifest.remove(item)
                self.oeb.guide.remove_by_href(item.href)