""" Try to read metadata from an HTML file. """ import re import unittest from collections import defaultdict from html5_parser import parse from lxml.etree import Comment from ebook_converter.ebooks.metadata import string_to_authors, authors_to_string from ebook_converter.ebooks.metadata.book.base import Metadata from ebook_converter.ebooks.chardet import xml_to_unicode from ebook_converter import replace_entities from ebook_converter.utils.date import parse_date, is_date_undefined __license__ = 'GPL v3' __copyright__ = '2013, Kovid Goyal ' def get_metadata(stream): src = stream.read() return get_metadata_(src) COMMENT_NAMES = { 'title': 'TITLE', 'authors': 'AUTHOR', 'publisher': 'PUBLISHER', 'isbn': 'ISBN', 'languages': 'LANGUAGE', 'pubdate': 'PUBDATE', 'timestamp': 'TIMESTAMP', 'series': 'SERIES', 'series_index': 'SERIESNUMBER', 'rating': 'RATING', 'comments': 'COMMENTS', 'tags': 'TAGS', } META_NAMES = { 'title' : ('dc.title', 'dcterms.title', 'title'), 'authors': ('author', 'dc.creator.aut', 'dcterms.creator.aut', 'dc.creator'), 'publisher': ('publisher', 'dc.publisher', 'dcterms.publisher'), 'isbn': ('isbn',), 'languages': ('dc.language', 'dcterms.language'), 'pubdate': ('pubdate', 'date of publication', 'dc.date.published', 'dc.date.publication', 'dc.date.issued', 'dcterms.issued'), 'timestamp': ('timestamp', 'date of creation', 'dc.date.created', 'dc.date.creation', 'dcterms.created'), 'series': ('series',), 'series_index': ('seriesnumber', 'series_index', 'series.index'), 'rating': ('rating',), 'comments': ('comments', 'dc.description'), 'tags': ('tags',), } rmap_comment = {v:k for k, v in COMMENT_NAMES.items()} rmap_meta = {v:k for k, l in META_NAMES.items() for v in l} # Extract an HTML attribute value, supports both single and double quotes and # single quotes inside double quotes and vice versa. attr_pat = r'''(?:(?P')|(?P"))(?P(?(sq)[^']+|[^"]+))(?(sq)'|")''' def handle_comment(data, comment_tags): if not hasattr(handle_comment, 'pat'): handle_comment.pat = re.compile(r'''(?P\S+)\s*=\s*%s''' % attr_pat) for match in handle_comment.pat.finditer(data): x = match.group('name') field = None try: field = rmap_comment[x] except KeyError: pass if field: comment_tags[field].append(replace_entities(match.group('content'))) def parse_metadata(src): root = parse(src) comment_tags = defaultdict(list) meta_tags = defaultdict(list) meta_tag_ids = defaultdict(list) title = '' identifier_pat = re.compile(r'(?:dc|dcterms)[.:]identifier(?:\.|$)', flags=re.IGNORECASE) id_pat2 = re.compile(r'(?:dc|dcterms)[.:]identifier$', flags=re.IGNORECASE) for comment in root.iterdescendants(tag=Comment): if comment.text: handle_comment(comment.text, comment_tags) for q in root.iterdescendants(tag='title'): if q.text: title = q.text break for meta in root.iterdescendants(tag='meta'): name, content = meta.get('name'), meta.get('content') if not name or not content: continue if identifier_pat.match(name) is not None: scheme = None if id_pat2.match(name) is not None: scheme = meta.get('scheme') else: elements = re.split(r'[.:]', name) if len(elements) == 3 and not meta.get('scheme'): scheme = elements[2].strip() if scheme: meta_tag_ids[scheme.lower()].append(content) else: x = name.lower() field = None try: field = rmap_meta[x] except KeyError: try: field = rmap_meta[x.replace(':', '.')] except KeyError: pass if field: meta_tags[field].append(content) return comment_tags, meta_tags, meta_tag_ids, title def get_metadata_(src, encoding=None): # Meta data definitions as in # https://www.mobileread.com/forums/showpost.php?p=712544&postcount=9 if isinstance(src, bytes): if not encoding: src = xml_to_unicode(src)[0] else: src = src.decode(encoding, 'replace') src = src[:150000] # Searching shouldn't take too long comment_tags, meta_tags, meta_tag_ids, title_tag = parse_metadata(src) def get_all(field): ans = comment_tags.get(field, meta_tags.get(field, None)) if ans: ans = [x.strip() for x in ans if x.strip()] if not ans: ans = None return ans def get(field): ans = get_all(field) if ans: ans = ans[0] return ans # Title title = get('title') or title_tag.strip() or 'Unknown' # Author authors = authors_to_string(get_all('authors')) or 'Unknown' # Create MetaInformation with Title and Author mi = Metadata(title, string_to_authors(authors)) # Single-value text fields for field in ('publisher', 'isbn'): val = get(field) if val: setattr(mi, field, val) # Multi-value text fields for field in ('languages',): val = get_all(field) if val: setattr(mi, field, val) # HTML fields for field in ('comments',): val = get(field) if val: setattr(mi, field, val.replace('&', '&').replace('<', '<').replace('>', '>').replace('"', '"').replace("'", ''')) # Date fields for field in ('pubdate', 'timestamp'): try: val = parse_date(get(field)) except: pass else: if not is_date_undefined(val): setattr(mi, field, val) # SERIES series = get('series') if series: pat = re.compile(r'\[([.0-9]+)\]$') match = pat.search(series) series_index = None if match is not None: try: series_index = float(match.group(1)) except: pass series = series.replace(match.group(), '').strip() mi.series = series if series_index is None: series_index = get('series_index') try: series_index = float(series_index) except: pass if series_index is not None: mi.series_index = series_index # RATING rating = get('rating') if rating: try: mi.rating = float(rating) if mi.rating < 0: mi.rating = 0 if mi.rating > 10: mi.rating = 0 except: pass # TAGS tags = get_all('tags') if tags: tags = [x.strip() for s in tags for x in s.split(',') if x.strip()] if tags: mi.tags = tags # IDENTIFIERS for (k,v) in meta_tag_ids.items(): v = [x.strip() for x in v if x.strip()] if v: mi.set_identifier(k, v[0]) return mi class MetadataHtmlTest(unittest.TestCase): def compare_metadata(self, meta_a, meta_b): for attr in ( 'title', 'authors', 'publisher', 'isbn', 'languages', 'pubdate', 'timestamp', 'series', 'series_index', 'rating', 'comments', 'tags', 'identifiers' ): self.assertEqual(getattr(meta_a, attr), getattr(meta_b, attr)) def get_stream(self, test): from io import BytesIO raw = b'''\ ''' if test in {'title', 'meta_single', 'meta_multi', 'comment_single', 'comment_multi'}: raw += b'''\ } A Title Tag &amp; Title Ⓒ ''' if test in {'meta_single', 'meta_multi', 'comment_single', 'comment_multi'}: raw += b'''\ ''' if test in {'meta_multi', 'comment_single', 'comment_multi'}: raw += b'''\ ''' if test in {'comment_single', 'comment_multi'}: raw += b'''\ ''' if test in {'comment_multi'}: raw += b'''\ ''' raw += b'''\ ''' return BytesIO(raw) def test_input_title(self): stream_meta = get_metadata(self.get_stream('title')) canon_meta = Metadata('A Title Tag & Title Ⓒ', ['Unknown']) self.compare_metadata(stream_meta, canon_meta) def test_input_meta_single(self): stream_meta = get_metadata(self.get_stream('meta_single')) canon_meta = Metadata('A Meta Tag & Title Ⓒ', ['George Washington']) canon_meta.publisher = 'Publisher A' canon_meta.languages = ['English'] canon_meta.pubdate = parse_date('2019-01-01') canon_meta.timestamp = parse_date('2018-01-01') canon_meta.series = 'Meta Series' canon_meta.series_index = float(1) # canon_meta.rating = float(0) # canon_meta.comments = '' canon_meta.tags = ['tag a', 'tag b'] canon_meta.set_identifiers({'isbn': '1234567890'}) self.compare_metadata(stream_meta, canon_meta) def test_input_meta_multi(self): stream_meta = get_metadata(self.get_stream('meta_multi')) canon_meta = Metadata('A Meta Tag & Title Ⓒ', ['George Washington', 'John Adams', 'Thomas Jefferson']) canon_meta.publisher = 'Publisher A' canon_meta.languages = ['English', 'Spanish'] canon_meta.pubdate = parse_date('2019-01-01') canon_meta.timestamp = parse_date('2018-01-01') canon_meta.series = 'Meta Series' canon_meta.series_index = float(1) canon_meta.rating = float(8) canon_meta.comments = 'meta "comments" ♥ HTML &amp;' canon_meta.tags = ['tag a', 'tag b', 'tag c'] canon_meta.set_identifiers({'isbn': '1234567890', 'url': 'http://google.com/search?q=calibre'}) self.compare_metadata(stream_meta, canon_meta) def test_input_comment_single(self): stream_meta = get_metadata(self.get_stream('comment_single')) canon_meta = Metadata('A Comment Tag & Title Ⓒ', ['James Madison', 'James Monroe']) canon_meta.publisher = 'Publisher C' canon_meta.languages = ['French'] canon_meta.pubdate = parse_date('2015-01-01') canon_meta.timestamp = parse_date('2014-01-01') canon_meta.series = 'Comment Series' canon_meta.series_index = float(3) canon_meta.rating = float(0) canon_meta.comments = 'comment "comments" ♥ HTML -- too &amp;' canon_meta.tags = ['tag d'] canon_meta.set_identifiers({'isbn': '3456789012', 'url': 'http://google.com/search?q=calibre'}) self.compare_metadata(stream_meta, canon_meta) def test_input_comment_multi(self): stream_meta = get_metadata(self.get_stream('comment_multi')) canon_meta = Metadata('A Comment Tag & Title Ⓒ', ['James Madison', 'James Monroe', 'John Quincy Adams']) canon_meta.publisher = 'Publisher C' canon_meta.languages = ['French', 'Japanese'] canon_meta.pubdate = parse_date('2015-01-01') canon_meta.timestamp = parse_date('2014-01-01') canon_meta.series = 'Comment Series' canon_meta.series_index = float(3) canon_meta.rating = float(0) canon_meta.comments = 'comment "comments" ♥ HTML -- too &amp;' canon_meta.tags = ['tag d', 'tag e', 'tag f'] canon_meta.set_identifiers({'isbn': '3456789012', 'url': 'http://google.com/search?q=calibre'}) self.compare_metadata(stream_meta, canon_meta) def find_tests(): return unittest.TestLoader().loadTestsFromTestCase(MetadataHtmlTest)