mirror of
https://github.com/gryf/ebook-converter.git
synced 2026-01-08 04:34:12 +01:00
Here is the first batch of modules, which are needed for converting several formats to LRF. Some of the logic has been changed; more cleanups will follow.
413 lines
15 KiB
Python
413 lines
15 KiB
Python
#!/usr/bin/env python2
|
|
# vim:fileencoding=utf-8
|
|
from __future__ import absolute_import, division, print_function, unicode_literals
|
|
|
|
__license__ = 'GPL v3'
|
|
__copyright__ = '2013, Kovid Goyal <kovid at kovidgoyal.net>'
|
|
'''
|
|
Try to read metadata from an HTML file.
|
|
'''
|
|
|
|
import re
|
|
import unittest
|
|
|
|
from collections import defaultdict
|
|
from html5_parser import parse
|
|
from lxml.etree import Comment
|
|
|
|
from ebook_converter.ebooks.metadata import string_to_authors, authors_to_string
|
|
from ebook_converter.ebooks.metadata.book.base import Metadata
|
|
from ebook_converter.ebooks.chardet import xml_to_unicode
|
|
from ebook_converter import replace_entities, isbytestring
|
|
from ebook_converter.utils.date import parse_date, is_date_undefined
|
|
from ebook_converter.polyglot.builtins import iteritems
|
|
|
|
|
|
def get_metadata(stream):
    """Extract metadata from an HTML document held in a file-like object.

    :param stream: object exposing ``read()``; everything it returns is
        handed to :func:`get_metadata_` unchanged.
    :return: the value returned by :func:`get_metadata_`.
    """
    return get_metadata_(stream.read())
|
|
|
|
|
|
# Metadata field name -> the NAME used for it in ``<!-- NAME="value" -->``
# style HTML comments.
COMMENT_NAMES = {
    'title': 'TITLE',
    'authors': 'AUTHOR',
    'publisher': 'PUBLISHER',
    'isbn': 'ISBN',
    'languages': 'LANGUAGE',
    'pubdate': 'PUBDATE',
    'timestamp': 'TIMESTAMP',
    'series': 'SERIES',
    'series_index': 'SERIESNUMBER',
    'rating': 'RATING',
    'comments': 'COMMENTS',
    'tags': 'TAGS',
}

# Metadata field name -> every ``<meta name="...">`` value (lower case)
# accepted as a source for that field.
META_NAMES = {
    'title': ('dc.title', 'dcterms.title', 'title'),
    'authors': (
        'author', 'dc.creator.aut', 'dcterms.creator.aut', 'dc.creator',
    ),
    'publisher': ('publisher', 'dc.publisher', 'dcterms.publisher'),
    'isbn': ('isbn',),
    'languages': ('dc.language', 'dcterms.language'),
    'pubdate': (
        'pubdate', 'date of publication', 'dc.date.published',
        'dc.date.publication', 'dc.date.issued', 'dcterms.issued',
    ),
    'timestamp': (
        'timestamp', 'date of creation', 'dc.date.created',
        'dc.date.creation', 'dcterms.created',
    ),
    'series': ('series',),
    'series_index': ('seriesnumber', 'series_index', 'series.index'),
    'rating': ('rating',),
    'comments': ('comments', 'dc.description'),
    'tags': ('tags',),
}

# Reverse lookups: comment NAME -> field, and meta name alias -> field.
rmap_comment = dict(
    (comment_name, field)
    for field, comment_name in iteritems(COMMENT_NAMES)
)
rmap_meta = dict(
    (alias, field)
    for field, aliases in iteritems(META_NAMES)
    for alias in aliases
)
|
|
|
|
|
|
# Regex fragment that matches a quoted HTML attribute value. The opening
# quote may be single or double; the conditional groups then make the value
# (captured as ``content``) exclude that same quote character and require the
# matching closing quote, so single quotes may appear inside double quotes
# and vice versa. The value must contain at least one character.
attr_pat = r'''(?:(?P<sq>')|(?P<dq>"))(?P<content>(?(sq)[^']+|[^"]+))(?(sq)'|")'''
|
|
|
|
|
|
def handle_comment(data, comment_tags):
    """Collect ``NAME="value"`` metadata pairs from the text of a comment.

    Every pair whose NAME is a key of ``rmap_comment`` has its value passed
    through ``replace_entities`` and appended to the list stored under the
    corresponding field name in ``comment_tags``. Other pairs are ignored.

    :param data: text of one HTML comment.
    :param comment_tags: mapping of field name to list of values; it is
        indexed with ``[]`` and so must create missing entries itself.
    """
    # The compiled pattern is cached on the function object after first use.
    try:
        pattern = handle_comment.pat
    except AttributeError:
        pattern = handle_comment.pat = re.compile(
            r'''(?P<name>\S+)\s*=\s*%s''' % attr_pat)

    for found in pattern.finditer(data):
        field = rmap_comment.get(found.group('name'))
        if field:
            value = replace_entities(found.group('content'))
            comment_tags[field].append(value)
|
|
|
|
|
|
def parse_metadata(src):
    """Parse HTML source and gather the raw metadata values it declares.

    :param src: HTML source handed to ``parse``.
    :return: tuple ``(comment_tags, meta_tags, meta_tag_ids, title)`` where
        the first three are ``defaultdict(list)`` instances keyed by field
        name (or by lower-cased identifier scheme for ``meta_tag_ids``) and
        ``title`` is the text of the first ``<title>`` element that has
        text, or ``''``.
    """
    identifier_pat = re.compile(
        r'(?:dc|dcterms)[.:]identifier(?:\.|$)', flags=re.IGNORECASE)
    id_pat2 = re.compile(
        r'(?:dc|dcterms)[.:]identifier$', flags=re.IGNORECASE)

    root = parse(src)
    comment_tags = defaultdict(list)
    meta_tags = defaultdict(list)
    meta_tag_ids = defaultdict(list)

    # Metadata embedded in comments.
    for node in root.iterdescendants(tag=Comment):
        if node.text:
            handle_comment(node.text, comment_tags)

    # First <title> element that actually carries text.
    title = next(
        (node.text for node in root.iterdescendants(tag='title')
         if node.text),
        '')

    for meta in root.iterdescendants(tag='meta'):
        name = meta.get('name')
        content = meta.get('content')
        if not (name and content):
            continue

        if identifier_pat.match(name) is None:
            # Ordinary field: look the name up as written (lower-cased),
            # then with ':' separators normalised to '.'.
            key = name.lower()
            field = rmap_meta.get(key)
            if field is None:
                field = rmap_meta.get(key.replace(':', '.'))
            if field:
                meta_tags[field].append(content)
            continue

        # Identifier: the scheme comes either from the ``scheme`` attribute
        # (plain "dc.identifier") or from a third name component
        # ("dc.identifier.<scheme>"), but the latter only when no
        # ``scheme`` attribute is present as well.
        scheme = None
        if id_pat2.match(name) is not None:
            scheme = meta.get('scheme')
        else:
            parts = re.split(r'[.:]', name)
            if len(parts) == 3 and not meta.get('scheme'):
                scheme = parts[2].strip()
        if scheme:
            meta_tag_ids[scheme.lower()].append(content)

    return comment_tags, meta_tags, meta_tag_ids, title
|
|
|
|
|
|
def get_metadata_(src, encoding=None):
    """Build a ``Metadata`` object from HTML source.

    Values are taken from the result of :func:`parse_metadata`. For each
    field, values found in HTML comments take precedence over values found
    in ``<meta>`` tags. The title falls back to the ``<title>`` element and
    then to ``_('Unknown')``; authors fall back to ``_('Unknown')``.

    :param src: HTML source, as text or as a byte string.
    :param encoding: if given and ``src`` is a byte string, the encoding
        used to decode it (undecodable bytes are replaced); otherwise
        ``xml_to_unicode`` performs the decoding.
    :return: the populated ``Metadata`` instance.

    NOTE(review): ``_`` is not defined in this block; presumably it is the
    translation function installed elsewhere -- confirm.
    """
    # Meta data definitions as in
    # https://www.mobileread.com/forums/showpost.php?p=712544&postcount=9

    if isbytestring(src):
        if encoding:
            src = src.decode(encoding, 'replace')
        else:
            src = xml_to_unicode(src)[0]
    src = src[:150000]  # Searching shouldn't take too long
    comment_tags, meta_tags, meta_tag_ids, title_tag = parse_metadata(src)

    def get_all(field):
        """Return the non-blank, stripped values of ``field`` or None."""
        ans = comment_tags.get(field, meta_tags.get(field, None))
        if ans:
            ans = [x.strip() for x in ans if x.strip()]
        return ans or None

    def get(field):
        """Return the first value of ``field`` or None."""
        ans = get_all(field)
        return ans[0] if ans else ans

    # Title
    title = get('title') or title_tag.strip() or _('Unknown')

    # Author
    authors = authors_to_string(get_all('authors')) or _('Unknown')

    # Create MetaInformation with Title and Author
    mi = Metadata(title, string_to_authors(authors))

    # Single-value text fields
    for field in ('publisher', 'isbn'):
        val = get(field)
        if val:
            setattr(mi, field, val)

    # Multi-value text fields
    for field in ('languages',):
        val = get_all(field)
        if val:
            setattr(mi, field, val)

    # HTML fields: the extracted value is plain text, so escape it before
    # storing it. '&' must be replaced first so that the entities produced
    # by the later replacements are not escaped a second time.
    for field in ('comments',):
        val = get(field)
        if val:
            val = (val.replace('&', '&amp;')
                      .replace('<', '&lt;')
                      .replace('>', '&gt;')
                      .replace('"', '&quot;')
                      .replace("'", '&apos;'))
            setattr(mi, field, val)

    # Date fields
    for field in ('pubdate', 'timestamp'):
        try:
            val = parse_date(get(field))
        except Exception:
            # Best effort: a missing or unparsable date is simply skipped.
            continue
        if not is_date_undefined(val):
            setattr(mi, field, val)

    # SERIES
    series = get('series')
    if series:
        series_index = None
        # A trailing "[number]" in the series name carries the index.
        match = re.search(r'\[([.0-9]+)\]$', series)
        if match is not None:
            try:
                series_index = float(match.group(1))
            except ValueError:
                pass  # e.g. "[1.2.3]"; fall back to the explicit field
            series = series.replace(match.group(), '').strip()
        mi.series = series
        if series_index is None:
            try:
                series_index = float(get('series_index'))
            except (TypeError, ValueError):
                # Missing (None) or non-numeric: leave the index unset
                # rather than storing the raw string.
                series_index = None
        if series_index is not None:
            mi.series_index = series_index

    # RATING
    rating = get('rating')
    if rating:
        try:
            rating = float(rating)
        except ValueError:
            rating = None
        if rating is not None:
            # Anything outside 0..10 (including NaN) is reset to 0.
            if not 0 <= rating <= 10:
                rating = 0
            mi.rating = rating

    # TAGS
    tags = get_all('tags')
    if tags:
        tags = [x.strip() for s in tags for x in s.split(',') if x.strip()]
        if tags:
            mi.tags = tags

    # IDENTIFIERS: the first non-blank value of each scheme wins.
    for scheme, values in iteritems(meta_tag_ids):
        values = [x.strip() for x in values if x.strip()]
        if values:
            mi.set_identifier(scheme, values[0])

    return mi
|
|
|
|
|
|
class MetadataHtmlTest(unittest.TestCase):
    """Tests for metadata extraction from <title>, <meta> tags and comments.

    The HTML fixtures are bytes literals and therefore must be pure ASCII:
    non-ASCII characters and double quotes nested inside attribute or
    comment values are written as character references (``&#9400;``,
    ``&#9829;``, ``&quot;``).
    """

    def compare_metadata(self, meta_a, meta_b):
        """Assert that two metadata objects agree on every checked field."""
        for attr in (
            'title', 'authors', 'publisher', 'isbn', 'languages', 'pubdate',
            'timestamp', 'series', 'series_index', 'rating', 'comments',
            'tags', 'identifiers'
        ):
            self.assertEqual(getattr(meta_a, attr), getattr(meta_b, attr))

    def get_stream(self, test):
        """Return a BytesIO holding the HTML fixture for scenario ``test``.

        The scenarios are cumulative: 'title' adds a <title>, 'meta_single'
        additionally adds one set of <meta> tags, 'meta_multi' a second set
        of <meta> tags, 'comment_single' one set of metadata comments and
        'comment_multi' a second set of comments. Any other value yields an
        empty <head>.
        """
        from io import BytesIO

        raw = b'''\
<html>
<head>
'''

        # NOTE(review): the stray "}" line below is reproduced from the
        # original fixture; whether it is deliberate is not visible here.
        if test in {'title', 'meta_single', 'meta_multi', 'comment_single', 'comment_multi'}:
            raw += b'''\
}
<title>A Title Tag &amp; Title &#9400;</title>
'''

        if test in {'meta_single', 'meta_multi', 'comment_single', 'comment_multi'}:
            raw += b'''\
<meta name="dc:title" content="A Meta Tag &amp; Title &#9400;" />
<meta name="dcterms.creator.aut" content="George Washington" />
<meta name="dc.publisher" content="Publisher A" />
<meta name="isbn" content="1234567890" />
<meta name="dc.language" content="English" />
<meta name="dc.date.published" content="2019-01-01" />
<meta name="dcterms.created" content="2018-01-01" />
<meta name="series" content="Meta Series" />
<meta name="seriesnumber" content="1" />
<meta name="rating" content="" />
<meta name="dc.description" content="" />
<meta name="tags" content="tag a, tag b" />
<meta name="dc.identifier.url" content="" />
<meta name="dc.identifier" scheme="" content="invalid" />
<meta name="dc.identifier." content="still invalid" />
<meta name="dc.identifier.conflicting" scheme="schemes" content="are also invalid" />
<meta name="dc.identifier.custom.subid" content="invalid too" />
'''

        if test in {'meta_multi', 'comment_single', 'comment_multi'}:
            raw += b'''\
<meta name="title" content="A Different Meta Tag &amp; Title &#9400;" />
<meta name="author" content="John Adams with Thomas Jefferson" />
<meta name="publisher" content="Publisher B" />
<meta name="isbn" content="2345678901" />
<meta name="dcterms.language" content="Spanish" />
<meta name="date of publication" content="2017-01-01" />
<meta name="timestamp" content="2016-01-01" />
<meta name="series" content="Another Meta Series" />
<meta name="series.index" content="2" />
<meta name="rating" content="8" />
<meta name="comments" content="meta &quot;comments&quot; &#9829; HTML &amp;" />
<meta name="tags" content="tag c" />
<meta name="dc.identifier.url" content="http://google.com/search?q=calibre" />
'''

        if test in {'comment_single', 'comment_multi'}:
            raw += b'''\
<!-- TITLE="A Comment Tag &amp; Title &#9400;" -->
<!-- AUTHOR="James Madison and James Monroe" -->
<!-- PUBLISHER="Publisher C" -->
<!-- ISBN="3456789012" -->
<!-- LANGUAGE="French" -->
<!-- PUBDATE="2015-01-01" -->
<!-- TIMESTAMP="2014-01-01" -->
<!-- SERIES="Comment Series" -->
<!-- SERIESNUMBER="3" -->
<!-- RATING="20" -->
<!-- COMMENTS="comment &quot;comments&quot; &#9829; HTML -- too &amp;" -->
<!-- TAGS="tag d" -->
'''

        if test in {'comment_multi'}:
            raw += b'''\
<!-- TITLE="Another Comment Tag &amp; Title &#9400;" -->
<!-- AUTHOR="John Quincy Adams" -->
<!-- PUBLISHER="Publisher D" -->
<!-- ISBN="4567890123" -->
<!-- LANGUAGE="Japanese" -->
<!-- PUBDATE="2013-01-01" -->
<!-- TIMESTAMP="2012-01-01" -->
<!-- SERIES="Comment Series 2" -->
<!-- SERIESNUMBER="4" -->
<!-- RATING="1" -->
<!-- COMMENTS="comment &quot;comments&quot; &#9829; HTML -- too &amp; for sure" -->
<!-- TAGS="tag e, tag f" -->
'''

        raw += b'''\
</head>
<body>
</body>
</html>
'''
        return BytesIO(raw)

    def test_input_title(self):
        """Only a <title> element: it supplies the title, author unknown."""
        stream_meta = get_metadata(self.get_stream('title'))
        canon_meta = Metadata('A Title Tag & Title Ⓒ', [_('Unknown')])
        self.compare_metadata(stream_meta, canon_meta)

    def test_input_meta_single(self):
        """One set of <meta> tags; blank contents must be ignored."""
        stream_meta = get_metadata(self.get_stream('meta_single'))
        canon_meta = Metadata('A Meta Tag & Title Ⓒ', ['George Washington'])
        canon_meta.publisher = 'Publisher A'
        canon_meta.languages = ['English']
        canon_meta.pubdate = parse_date('2019-01-01')
        canon_meta.timestamp = parse_date('2018-01-01')
        canon_meta.series = 'Meta Series'
        canon_meta.series_index = float(1)
        # canon_meta.rating = float(0)
        # canon_meta.comments = ''
        canon_meta.tags = ['tag a', 'tag b']
        canon_meta.set_identifiers({'isbn': '1234567890'})
        self.compare_metadata(stream_meta, canon_meta)

    def test_input_meta_multi(self):
        """Two sets of <meta> tags: first value wins for single fields."""
        stream_meta = get_metadata(self.get_stream('meta_multi'))
        canon_meta = Metadata('A Meta Tag & Title Ⓒ', ['George Washington', 'John Adams', 'Thomas Jefferson'])
        canon_meta.publisher = 'Publisher A'
        canon_meta.languages = ['English', 'Spanish']
        canon_meta.pubdate = parse_date('2019-01-01')
        canon_meta.timestamp = parse_date('2018-01-01')
        canon_meta.series = 'Meta Series'
        canon_meta.series_index = float(1)
        canon_meta.rating = float(8)
        # The comments field is expected in HTML-escaped form.
        canon_meta.comments = 'meta &quot;comments&quot; ♥ HTML &amp;'
        canon_meta.tags = ['tag a', 'tag b', 'tag c']
        canon_meta.set_identifiers({'isbn': '1234567890', 'url': 'http://google.com/search?q=calibre'})
        self.compare_metadata(stream_meta, canon_meta)

    def test_input_comment_single(self):
        """Comment metadata overrides <meta> tags; rating 20 becomes 0."""
        stream_meta = get_metadata(self.get_stream('comment_single'))
        canon_meta = Metadata('A Comment Tag & Title Ⓒ', ['James Madison', 'James Monroe'])
        canon_meta.publisher = 'Publisher C'
        canon_meta.languages = ['French']
        canon_meta.pubdate = parse_date('2015-01-01')
        canon_meta.timestamp = parse_date('2014-01-01')
        canon_meta.series = 'Comment Series'
        canon_meta.series_index = float(3)
        canon_meta.rating = float(0)
        # The comments field is expected in HTML-escaped form.
        canon_meta.comments = 'comment &quot;comments&quot; ♥ HTML -- too &amp;'
        canon_meta.tags = ['tag d']
        canon_meta.set_identifiers({'isbn': '3456789012', 'url': 'http://google.com/search?q=calibre'})
        self.compare_metadata(stream_meta, canon_meta)

    def test_input_comment_multi(self):
        """Two sets of comments: first value wins for single fields."""
        stream_meta = get_metadata(self.get_stream('comment_multi'))
        canon_meta = Metadata('A Comment Tag & Title Ⓒ', ['James Madison', 'James Monroe', 'John Quincy Adams'])
        canon_meta.publisher = 'Publisher C'
        canon_meta.languages = ['French', 'Japanese']
        canon_meta.pubdate = parse_date('2015-01-01')
        canon_meta.timestamp = parse_date('2014-01-01')
        canon_meta.series = 'Comment Series'
        canon_meta.series_index = float(3)
        canon_meta.rating = float(0)
        # The comments field is expected in HTML-escaped form.
        canon_meta.comments = 'comment &quot;comments&quot; ♥ HTML -- too &amp;'
        canon_meta.tags = ['tag d', 'tag e', 'tag f']
        canon_meta.set_identifiers({'isbn': '3456789012', 'url': 'http://google.com/search?q=calibre'})
        self.compare_metadata(stream_meta, canon_meta)
|
|
|
|
|
|
def find_tests():
    """Return a test suite holding every test of ``MetadataHtmlTest``."""
    loader = unittest.TestLoader()
    return loader.loadTestsFromTestCase(MetadataHtmlTest)
|