mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-02-21 01:05:52 +01:00
Here is the first batch of modules, which are needed for converting several formats to LRF. Some of the logic has been changed; more cleanups will follow.
244 lines
8.2 KiB
Python
244 lines
8.2 KiB
Python
from __future__ import absolute_import, division, print_function, unicode_literals
|
|
|
|
__license__ = 'GPL v3'
|
|
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
|
|
|
|
import os, re, collections
|
|
|
|
from ebook_converter.utils.config import prefs
|
|
from ebook_converter.constants import filesystem_encoding
|
|
from ebook_converter.ebooks.metadata.opf2 import OPF
|
|
from ebook_converter import isbytestring
|
|
from ebook_converter.customize.ui import get_file_type_metadata, set_file_type_metadata
|
|
from ebook_converter.ebooks.metadata import MetaInformation, string_to_authors
|
|
from ebook_converter.polyglot.builtins import getcwd, unicode_type
|
|
|
|
# The priorities for loading metadata from different file types.
# Higher values should be used to update metadata from lower values;
# unknown extensions get priority 0 via the defaultdict factory.
_PRIORITY_ORDER = (
    'html', 'htm', 'xhtml', 'xhtm',
    'rtf', 'fb2', 'pdf', 'prc', 'odt',
    'epub', 'lit', 'lrx', 'lrf', 'mobi',
    'azw', 'azw3', 'azw1', 'rb', 'imp', 'snb',
)
METADATA_PRIORITIES = collections.defaultdict(
    int, {ext: prio for prio, ext in enumerate(_PRIORITY_ORDER, start=1)})
|
|
|
|
|
|
def path_to_ext(path):
    """Return the lower-cased file extension of *path*, without the dot."""
    _, ext = os.path.splitext(path)
    return ext[1:].lower()
|
|
|
|
|
|
def metadata_from_formats(formats, force_read_metadata=False, pattern=None):
    """Read merged metadata for one book from a list of format files.

    Delegates to :func:`_metadata_from_formats`; if that fails for any
    reason, falls back to guessing metadata from the first file name.

    :param formats: paths to e-book files for the same book
    :param force_read_metadata: read file metadata even when the
        ``read_file_metadata`` preference is off
    :param pattern: compiled regex used to extract metadata from the
        file name, or None for the configured default
    :return: a MetaInformation instance (authors default to 'Unknown')
    """
    try:
        return _metadata_from_formats(formats, force_read_metadata, pattern)
    # BUG FIX: was a bare ``except:``, which also swallowed SystemExit
    # and KeyboardInterrupt. Still deliberately best-effort otherwise.
    except Exception:
        mi = metadata_from_filename(list(iter(formats))[0], pat=pattern)
        if not mi.authors:
            mi.authors = [_('Unknown')]
        return mi
|
|
|
|
|
|
def _metadata_from_formats(formats, force_read_metadata=False, pattern=None):
    """Merge metadata from every file in *formats*.

    Formats are processed in METADATA_PRIORITIES order so that more
    trustworthy formats update fields read from less trustworthy ones.
    A sibling OPF file with a title short-circuits everything else.

    :param formats: list of paths; sorted in place by priority
    :return: a MetaInformation instance with merged fields
    """
    mi = MetaInformation(None, None)
    formats.sort(key=lambda x: METADATA_PRIORITIES[path_to_ext(x)])
    extensions = list(map(path_to_ext, formats))
    if 'opf' in extensions:
        opf = formats[extensions.index('opf')]
        mi2 = opf_metadata(opf)
        if mi2 is not None and mi2.title:
            return mi2

    for path, ext in zip(formats, extensions):
        # BUG FIX: original called calibre's ``lopen``, which is neither
        # defined nor imported in this module; plain open() is equivalent.
        with open(path, 'rb') as stream:
            try:
                newmi = get_metadata(stream, stream_type=ext,
                                     use_libprs_metadata=True,
                                     force_read_metadata=force_read_metadata,
                                     pattern=pattern)
                mi.smart_update(newmi)
            except Exception:
                # Best effort: a broken file must not abort the merge.
                continue
        # An application_id means authoritative (calibre-written)
        # metadata was found; no need to look further.
        if getattr(mi, 'application_id', None) is not None:
            return mi

    if not mi.title:
        mi.title = _('Unknown')
    if not mi.authors:
        mi.authors = [_('Unknown')]

    return mi
|
|
|
|
|
|
def get_metadata(stream, stream_type='lrf', use_libprs_metadata=False,
                 force_read_metadata=False, pattern=None):
    """Read metadata from *stream*, restoring its position afterwards.

    Thin wrapper around :func:`_get_metadata` that remembers the stream
    position before reading and seeks back to it when done, even on error.
    """
    pos = stream.tell() if hasattr(stream, 'tell') else 0
    try:
        return _get_metadata(stream, stream_type, use_libprs_metadata,
                             force_read_metadata, pattern)
    finally:
        if hasattr(stream, 'seek'):
            stream.seek(pos)
|
|
|
|
|
|
def _get_metadata(stream, stream_type, use_libprs_metadata,
                  force_read_metadata=False, pattern=None):
    """Read metadata from *stream* of type *stream_type*.

    Combines, in increasing order of authority: metadata guessed from
    the file name, metadata read from the file contents (when enabled),
    and metadata from a sibling .opf file on disk.
    """
    if stream_type:
        stream_type = stream_type.lower()
    # Normalize related extensions onto their canonical reader type.
    # BUG FIX: the tuple listed 'html' twice instead of 'htm', so .htm
    # files were never mapped to the html reader (METADATA_PRIORITIES
    # itself lists 'htm' as a supported extension).
    if stream_type in ('html', 'htm', 'xhtml', 'xhtm', 'xml'):
        stream_type = 'html'
    if stream_type in ('mobi', 'prc', 'azw'):
        stream_type = 'mobi'
    if stream_type in ('odt', 'ods', 'odp', 'odg', 'odf'):
        stream_type = 'odt'

    # Look for a sibling OPF file next to the stream on disk.
    opf = None
    if hasattr(stream, 'name'):
        c = os.path.splitext(stream.name)[0] + '.opf'
        if os.access(c, os.R_OK):
            opf = opf_metadata(os.path.abspath(c))

    # An OPF with an application_id is authoritative when the caller
    # asked for library metadata.
    if use_libprs_metadata and getattr(opf, 'application_id', None) is not None:
        return opf

    name = os.path.basename(getattr(stream, 'name', ''))
    # The fallback pattern matches the default filename format produced
    # by calibre: "Title - Author".
    base = metadata_from_filename(name, pat=pattern, fallback_pat=re.compile(
        r'^(?P<title>.+) - (?P<author>[^-]+)$'))
    if not base.authors:
        base.authors = [_('Unknown')]
    if not base.title:
        base.title = _('Unknown')
    mi = MetaInformation(None, None)
    if force_read_metadata or prefs['read_file_metadata']:
        mi = get_file_type_metadata(stream, stream_type)
    base.smart_update(mi)
    if opf is not None:
        base.smart_update(opf)

    return base
|
|
|
|
|
|
def set_metadata(stream, mi, stream_type='lrf', report_error=None):
    """Write the metadata object *mi* into *stream*.

    Dispatches to the writer plugin registered for *stream_type*
    (lower-cased); errors are routed through *report_error* when given.
    """
    normalized = stream_type.lower() if stream_type else stream_type
    set_file_type_metadata(stream, mi, normalized, report_error=report_error)
|
|
|
|
|
|
def metadata_from_filename(name, pat=None, fallback_pat=None):
    """Guess book metadata from a file name.

    :param name: the file name; the extension is stripped and
        underscores are treated as spaces
    :param pat: compiled regex with any of the named groups ``title``,
        ``author``, ``series``, ``series_index``, ``isbn``,
        ``publisher``, ``published``, ``comments``; when None the user's
        configured ``filename_pattern`` preference is compiled and used
    :param fallback_pat: regex tried only when *pat* does not match
    :return: a MetaInformation instance; title falls back to the
        (extension-less) file name when no title group matched
    """
    if isbytestring(name):
        name = name.decode(filesystem_encoding, 'replace')
    name = name.rpartition('.')[0]
    mi = MetaInformation(None, None)
    if pat is None:
        pat = re.compile(prefs.get('filename_pattern'))
    name = name.replace('_', ' ')
    match = pat.search(name)
    if match is None and fallback_pat is not None:
        match = fallback_pat.search(name)
    if match is not None:
        # Each group is optional: a missing group raises IndexError,
        # which is deliberately ignored per field.
        try:
            mi.title = match.group('title')
        except IndexError:
            pass
        try:
            au = match.group('author')
            aus = string_to_authors(au)
            if aus:
                mi.authors = aus
                if prefs['swap_author_names'] and mi.authors:
                    def swap(a):
                        # "Last, First" (or "First Last") -> move the
                        # final part to the front.
                        if ',' in a:
                            parts = a.split(',', 1)
                        else:
                            parts = a.split(None, 1)
                        if len(parts) > 1:
                            t = parts[-1]
                            parts = parts[:-1]
                            parts.insert(0, t)
                        return ' '.join(parts)
                    mi.authors = [swap(x) for x in mi.authors]
        except (IndexError, ValueError):
            pass
        try:
            mi.series = match.group('series')
        except IndexError:
            pass
        try:
            si = match.group('series_index')
            mi.series_index = float(si)
        except (IndexError, ValueError, TypeError):
            pass
        try:
            si = match.group('isbn')
            mi.isbn = si
        except (IndexError, ValueError):
            pass
        try:
            publisher = match.group('publisher')
            mi.publisher = publisher
        except (IndexError, ValueError):
            pass
        try:
            pubdate = match.group('published')
            if pubdate:
                from ebook_converter.utils.date import parse_only_date
                mi.pubdate = parse_only_date(pubdate)
        # BUG FIX: was a bare ``except:``; date parsing stays
        # best-effort but no longer traps SystemExit/KeyboardInterrupt.
        except Exception:
            pass
        try:
            comments = match.group('comments')
            mi.comments = comments
        except (IndexError, ValueError):
            pass

    if mi.is_null('title'):
        mi.title = name
    return mi
|
|
|
|
|
|
def opf_metadata(opfpath):
    """Read book metadata from an OPF file.

    :param opfpath: path to an OPF file, or an open file-like object
        (its ``name`` attribute, when present, anchors relative paths)
    :return: a metadata object when the OPF carries an application_id,
        with cover data attached if the referenced cover is readable;
        None otherwise or on any parse error (which is printed, not
        propagated)
    """
    if hasattr(opfpath, 'read'):
        f = opfpath
        opfpath = getattr(f, 'name', getcwd())
        close_f = False  # caller owns the stream
    else:
        f = open(opfpath, 'rb')
        close_f = True  # BUG FIX: this handle was previously leaked
    try:
        opf = OPF(f, os.path.dirname(opfpath))
        if opf.application_id is not None:
            mi = opf.to_book_metadata()
            if hasattr(opf, 'cover') and opf.cover:
                cpath = os.path.join(os.path.dirname(opfpath), opf.cover)
                if os.access(cpath, os.R_OK):
                    fmt = cpath.rpartition('.')[-1]
                    # Use a distinct name: the original shadowed the
                    # outer ``f`` here.
                    with open(cpath, 'rb') as cf:
                        data = cf.read()
                    mi.cover_data = (fmt, data)
            return mi
    except Exception:
        # Best effort: report malformed OPF files but do not propagate.
        import traceback
        traceback.print_exc()
    finally:
        if close_f:
            f.close()
|
|
|
|
|
|
def forked_read_metadata(path, tdir):
    """Read metadata from *path* and dump the results into *tdir*.

    Writes ``size.txt`` (file size in bytes), ``cover.jpg`` (when cover
    data is present) and ``metadata.opf`` into *tdir*. Presumably run in
    a forked worker process, hence the file-based hand-off.
    """
    from ebook_converter.ebooks.metadata.opf2 import metadata_to_opf
    # BUG FIX: original called calibre's ``lopen``, which is neither
    # defined nor imported in this module; plain open() is equivalent.
    with open(path, 'rb') as f:
        fmt = os.path.splitext(path)[1][1:].lower()
        f.seek(0, 2)  # seek to EOF to measure the file size
        sz = f.tell()
        with open(os.path.join(tdir, 'size.txt'), 'wb') as s:
            s.write(unicode_type(sz).encode('ascii'))
        f.seek(0)
        mi = get_metadata(f, fmt)
    if mi.cover_data and mi.cover_data[1]:
        # Distinct name: the original reused ``f`` for these writes.
        with open(os.path.join(tdir, 'cover.jpg'), 'wb') as cover:
            cover.write(mi.cover_data[1])
        mi.cover_data = (None, None)
        mi.cover = 'cover.jpg'
    opf = metadata_to_opf(mi, default_lang='und')
    with open(os.path.join(tdir, 'metadata.opf'), 'wb') as out:
        out.write(opf)
|