diff --git a/rtv/mime_handlers.py b/rtv/mime_handlers.py
index 839457b..470267c 100644
--- a/rtv/mime_handlers.py
+++ b/rtv/mime_handlers.py
@@ -1,88 +1,20 @@
import re
+import logging
import mimetypes
-from six.moves.html_parser import HTMLParser
import requests
+from six.moves.html_parser import HTMLParser
+_logger = logging.getLogger(__name__)
+
+# HTML Parsers
class HTMLParsed(Exception):
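+    """
+    Raised by the HTML parsers below to abort parsing as soon as the data of
+    interest has been found. The scraped value is carried in self.data.
+    """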
def __init__(self, data):
self.data = data
-class BaseHandler(object):
- """
- BaseHandler can be sub-classed to define custom handlers for determining
- the MIME type of external urls.
- """
-
- # URL regex pattern that the handler will be triggered on
- pattern = re.compile(r'.*$')
-
- @staticmethod
- def get_mimetype(url):
- """
- Args:
- url (text): Web url that was linked to by a reddit submission.
-
- Returns:
- modified_url (text): The url (or filename) that will be used when
- constructing the command to run.
- content_type (text): The mime-type that will be used when
- constructing the command to run. If the mime-type is unknown,
- return None and the program will fallback to using the web
- browser.
- """
-
- # Guess based on the file extension
- filename = url.split('?')[0]
- content_type, _ = mimetypes.guess_type(filename)
- return url, content_type
-
-
-class YoutubeHandler(BaseHandler):
- """
- Youtube videos can be streamed with vlc or downloaded with youtube-dl.
- Assign a custom mime-type so they can be referenced in mailcap.
- """
-
- pattern = re.compile(
- r'(?:https?://)?(m\.)?(?:youtu\.be/|(?:www\.)?youtube\.com/watch'
- r'(?:\.php)?\'?.*v=)([a-zA-Z0-9\-_]+)')
-
- @staticmethod
- def get_mimetype(url):
- return url, 'video/x-youtube'
-
-
-class GifvHandler(BaseHandler):
- """
- Special case for .gifv, which is a custom video format for imgur that is
- incorrectly (or on purpose?) returned with a Content-Type of text/html.
- """
- pattern = re.compile(r'.*[.]gifv$')
-
- @staticmethod
- def get_mimetype(url):
- modified_url = url[:-4] + 'webm'
- return modified_url, 'video/webm'
-
-
-class RedditUploadsHandler(BaseHandler):
- """
- Reddit uploads do not have a file extension, but we can grab the mime-type
- from the page header.
- """
- pattern = re.compile(r'https://i.reddituploads.com/.+$')
-
- @staticmethod
- def get_mimetype(url):
- page = requests.head(url)
- content_type = page.headers.get('Content-Type', '')
- content_type = content_type.split(';')[0] # Strip out the encoding
- return url, content_type
-
-
class ImgurHTMLParser(HTMLParser):
"""
Scrape the actual image url from an imgur landing page. Imgur intentionally
@@ -96,19 +28,117 @@ class ImgurHTMLParser(HTMLParser):
Note:
BeautifulSoup or lxml would be faster here but I wanted to skip adding
- an extra dependency for something as trivial as this.
+ an extra dependency for something this trivial.
"""
-
def handle_starttag(self, tag, attr):
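+        # The landing page is assumed to expose the direct image link in a
+        # meta tag of the form
+        #     <meta name="twitter:image" content="https://i.imgur.com/...">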
if tag == 'meta' and attr[0] == ('name', 'twitter:image'):
raise HTMLParsed(attr[1][1])
-class ImgurHandler(BaseHandler):
+class ImgurAlbumHTMLParser(HTMLParser):
+ """
+ Scrape the complete list of images from an imgur album. The HTML parser is
+ very limited, so this assumes the following html structure:
+
+        <div class="post-image">
+            <a href="//i.imgur.com/<image_id>.jpg"> ... </a>
+        </div>
+    """
+ def reset(self):
+ super(ImgurAlbumHTMLParser, self).reset()
+ self.primed = False
+ self.hrefs = []
+
+ def handle_starttag(self, tag, attr):
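+        # 'primed' is set when a <div class="post-image"> wrapper is seen, so
+        # that the very next start tag (expected to be the <a href="...">
+        # pointing at the full-size image) can be captured.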
+ if tag == 'div' and ('class', 'post-image') in attr:
+ self.primed = True
+ elif self.primed:
+ self.primed = False
+ if tag == 'a' and attr[0][0] == 'href':
+ self.hrefs.append(attr[0][1])
+
+
+# MIME Parsers
+
+class BaseMIMEParser(object):
+ """
+    BaseMIMEParser can be sub-classed to define custom parsers for determining
+ the MIME type of external urls.
+ """
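+    # The default pattern matches everything, so BaseMIMEParser doubles as
+    # the catch-all fallback when listed last in the parsers list below.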
+ pattern = re.compile(r'.*$')
+
+ @staticmethod
+ def get_mimetype(url):
+ """
+ Guess based on the file extension.
+
+ Args:
+ url (text): Web url that was linked to by a reddit submission.
+
+ Returns:
+ modified_url (text): The url (or filename) that will be used when
+ constructing the command to run.
+ content_type (text): The mime-type that will be used when
+ constructing the command to run. If the mime-type is unknown,
+                return None and the program will fall back to using the web
+ browser.
+ """
+ filename = url.split('?')[0]
+ content_type, _ = mimetypes.guess_type(filename)
+ return url, content_type
+
+
+class YoutubeMIMEParser(BaseMIMEParser):
+ """
+ Youtube videos can be streamed with vlc or downloaded with youtube-dl.
+ Assign a custom mime-type so they can be referenced in mailcap.
+ """
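+    # Matches both the long and short url forms, e.g.
+    #   https://www.youtube.com/watch?v=<video_id>
+    #   https://youtu.be/<video_id>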
+ pattern = re.compile(
+ r'(?:https?://)?(m\.)?(?:youtu\.be/|(?:www\.)?youtube\.com/watch'
+ r'(?:\.php)?\'?.*v=)([a-zA-Z0-9\-_]+)')
+
+ @staticmethod
+ def get_mimetype(url):
+ return url, 'video/x-youtube'
+
+
+class GifvMIMEParser(BaseMIMEParser):
+ """
+ Special case for .gifv, which is a custom video format for imgur that is
+ incorrectly (or on purpose?) returned with a Content-Type of text/html.
+ """
+ pattern = re.compile(r'.*[.]gifv$')
+
+ @staticmethod
+ def get_mimetype(url):
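+        # Swap the .gifv suffix for .webm; imgur serves the same clip directly
+        # as a .webm file at the same path.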
+ modified_url = url[:-4] + 'webm'
+        return modified_url, 'video/webm'
+
+
+class RedditUploadsMIMEParser(BaseMIMEParser):
+ """
+ Reddit uploads do not have a file extension, but we can grab the mime-type
+ from the page header.
+ """
+ pattern = re.compile(r'https://i.reddituploads.com/.+$')
+
+ @staticmethod
+ def get_mimetype(url):
+ page = requests.head(url)
+ content_type = page.headers.get('Content-Type', '')
+ content_type = content_type.split(';')[0] # Strip out the encoding
+ return url, content_type
+
+
+class ImgurMIMEParser(BaseMIMEParser):
"""
The majority of imgur links don't point directly to the image, so we need
- to open the provided url and scrape the page for the link. For galleries,
- this method only returns the first image.
+ to open the provided url and scrape the page for the link.
"""
pattern = re.compile(r'https?://(w+\.)?(m\.)?imgur\.com/[^.]+$')
@@ -121,16 +151,43 @@ class ImgurHandler(BaseHandler):
except HTMLParsed as data:
# We found a link
url = data.data
- if GifvHandler.pattern.match(url):
- return GifvHandler.get_mimetype(url)
+ if GifvMIMEParser.pattern.match(url):
+ return GifvMIMEParser.get_mimetype(url)
- return BaseHandler.get_mimetype(url)
+ return BaseMIMEParser.get_mimetype(url)
-# Handlers should be defined in the order they will be checked
-handlers = [
- ImgurHandler,
- RedditUploadsHandler,
- YoutubeHandler,
- GifvHandler,
- BaseHandler]
+class ImgurAlbumMIMEParser(BaseMIMEParser):
+ """
+ Imgur albums can contain several images, which need to be scraped from the
+ landing page.
+ """
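+    # Album urls look like https://imgur.com/a/<album_id>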
+ pattern = re.compile(r'https?://(w+\.)?(m\.)?imgur\.com/a/[^.]+$')
+
+ @staticmethod
+ def get_mimetype(url):
+ imgur_page = requests.get(url)
+ parser = ImgurAlbumHTMLParser(convert_charrefs=True)
+
+ try:
+ parser.feed(imgur_page.text)
+ except Exception as e:
+ _logger.warning(e)
+ urls = []
+ else:
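+            # The scraped hrefs are protocol-relative ("//i.imgur.com/..."),
+            # so prepend a scheme before handing them off.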
+ urls = ['http:' + href for href in parser.hrefs]
+
+ if urls:
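+            # Joining with "' '" presumably lets a single-quoted %s in the
+            # mailcap entry expand into one quoted argument per image url.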
+ return "' '".join(urls), 'image/x-imgur-album'
+ else:
+ return url, None
+
+
+# Parsers should be listed in the order they will be checked
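+# (ImgurAlbumMIMEParser must come before ImgurMIMEParser, whose broader
+# pattern would otherwise swallow album urls.)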
+parsers = [
+ ImgurAlbumMIMEParser,
+ ImgurMIMEParser,
+ RedditUploadsMIMEParser,
+ YoutubeMIMEParser,
+ GifvMIMEParser,
+ BaseMIMEParser]
\ No newline at end of file
diff --git a/rtv/terminal.py b/rtv/terminal.py
index 2d867d9..ebd048a 100644
--- a/rtv/terminal.py
+++ b/rtv/terminal.py
@@ -317,9 +317,9 @@ class Terminal(object):
return self.open_browser(url)
command = None
- for handler in mime_handlers.handlers:
- if handler.pattern.match(url):
- modified_url, content_type = handler.get_mimetype(url)
+ for parser in mime_handlers.parsers:
+ if parser.pattern.match(url):
+ modified_url, content_type = parser.get_mimetype(url)
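+                # get_mimetype() may rewrite the url (e.g. .gifv -> .webm)
+                # in addition to reporting the content type.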
_logger.info('MIME type: %s', content_type)
_logger.info('Modified url: %s', modified_url)
if not content_type or content_type == 'text/html':