Added support for imgur albums.
This commit is contained in:
@@ -1,88 +1,20 @@
|
|||||||
import re
|
import re
|
||||||
|
import logging
|
||||||
import mimetypes
|
import mimetypes
|
||||||
|
|
||||||
from six.moves.html_parser import HTMLParser
|
|
||||||
import requests
|
import requests
|
||||||
|
from six.moves.html_parser import HTMLParser
|
||||||
|
from html import parser
|
||||||
|
|
||||||
|
_logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# HTML Parsers
|
||||||
|
|
||||||
class HTMLParsed(Exception):
|
class HTMLParsed(Exception):
|
||||||
def __init__(self, data):
|
def __init__(self, data):
|
||||||
self.data = data
|
self.data = data
|
||||||
|
|
||||||
|
|
||||||
class BaseHandler(object):
|
|
||||||
"""
|
|
||||||
BaseHandler can be sub-classed to define custom handlers for determining
|
|
||||||
the MIME type of external urls.
|
|
||||||
"""
|
|
||||||
|
|
||||||
# URL regex pattern that the handler will be triggered on
|
|
||||||
pattern = re.compile(r'.*$')
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def get_mimetype(url):
|
|
||||||
"""
|
|
||||||
Args:
|
|
||||||
url (text): Web url that was linked to by a reddit submission.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
modified_url (text): The url (or filename) that will be used when
|
|
||||||
constructing the command to run.
|
|
||||||
content_type (text): The mime-type that will be used when
|
|
||||||
constructing the command to run. If the mime-type is unknown,
|
|
||||||
return None and the program will fallback to using the web
|
|
||||||
browser.
|
|
||||||
"""
|
|
||||||
|
|
||||||
# Guess based on the file extension
|
|
||||||
filename = url.split('?')[0]
|
|
||||||
content_type, _ = mimetypes.guess_type(filename)
|
|
||||||
return url, content_type
|
|
||||||
|
|
||||||
|
|
||||||
class YoutubeHandler(BaseHandler):
|
|
||||||
"""
|
|
||||||
Youtube videos can be streamed with vlc or downloaded with youtube-dl.
|
|
||||||
Assign a custom mime-type so they can be referenced in mailcap.
|
|
||||||
"""
|
|
||||||
|
|
||||||
pattern = re.compile(
|
|
||||||
r'(?:https?://)?(m\.)?(?:youtu\.be/|(?:www\.)?youtube\.com/watch'
|
|
||||||
r'(?:\.php)?\'?.*v=)([a-zA-Z0-9\-_]+)')
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def get_mimetype(url):
|
|
||||||
return url, 'video/x-youtube'
|
|
||||||
|
|
||||||
|
|
||||||
class GifvHandler(BaseHandler):
|
|
||||||
"""
|
|
||||||
Special case for .gifv, which is a custom video format for imgur that is
|
|
||||||
incorrectly (or on purpose?) returned with a Content-Type of text/html.
|
|
||||||
"""
|
|
||||||
pattern = re.compile(r'.*[.]gifv$')
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def get_mimetype(url):
|
|
||||||
modified_url = url[:-4] + 'webm'
|
|
||||||
return modified_url, 'video/webm'
|
|
||||||
|
|
||||||
|
|
||||||
class RedditUploadsHandler(BaseHandler):
|
|
||||||
"""
|
|
||||||
Reddit uploads do not have a file extension, but we can grab the mime-type
|
|
||||||
from the page header.
|
|
||||||
"""
|
|
||||||
pattern = re.compile(r'https://i.reddituploads.com/.+$')
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def get_mimetype(url):
|
|
||||||
page = requests.head(url)
|
|
||||||
content_type = page.headers.get('Content-Type', '')
|
|
||||||
content_type = content_type.split(';')[0] # Strip out the encoding
|
|
||||||
return url, content_type
|
|
||||||
|
|
||||||
|
|
||||||
class ImgurHTMLParser(HTMLParser):
|
class ImgurHTMLParser(HTMLParser):
|
||||||
"""
|
"""
|
||||||
Scrape the actual image url from an imgur landing page. Imgur intentionally
|
Scrape the actual image url from an imgur landing page. Imgur intentionally
|
||||||
@@ -96,19 +28,117 @@ class ImgurHTMLParser(HTMLParser):
|
|||||||
|
|
||||||
Note:
|
Note:
|
||||||
BeautifulSoup or lxml would be faster here but I wanted to skip adding
|
BeautifulSoup or lxml would be faster here but I wanted to skip adding
|
||||||
an extra dependency for something as trivial as this.
|
an extra dependency for something this trivial.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def handle_starttag(self, tag, attr):
|
def handle_starttag(self, tag, attr):
|
||||||
if tag == 'meta' and attr[0] == ('name', 'twitter:image'):
|
if tag == 'meta' and attr[0] == ('name', 'twitter:image'):
|
||||||
raise HTMLParsed(attr[1][1])
|
raise HTMLParsed(attr[1][1])
|
||||||
|
|
||||||
|
|
||||||
class ImgurHandler(BaseHandler):
|
class ImgurAlbumHTMLParser(HTMLParser):
|
||||||
|
"""
|
||||||
|
Scrape the complete list of images from an imgur album. The HTML parser is
|
||||||
|
very limited, so this assumes the following html structure:
|
||||||
|
|
||||||
|
<div class="post-image">
|
||||||
|
<a href="//i.imgur.com/L3Lfp1O.jpg" class="zoom">
|
||||||
|
<img class="post-image-placeholder"
|
||||||
|
src="//i.imgur.com/L3Lfp1Og.jpg" alt="Close up">
|
||||||
|
<img class="js-post-image-thumb"
|
||||||
|
src="//i.imgur.com/L3Lfp1Og.jpg" alt="Close up">
|
||||||
|
</a>
|
||||||
|
</div>
|
||||||
|
"""
|
||||||
|
def reset(self):
|
||||||
|
super(ImgurAlbumHTMLParser, self).reset()
|
||||||
|
self.primed = False
|
||||||
|
self.hrefs = []
|
||||||
|
|
||||||
|
def handle_starttag(self, tag, attr):
|
||||||
|
if tag == 'div' and ('class', 'post-image') in attr:
|
||||||
|
self.primed = True
|
||||||
|
elif self.primed:
|
||||||
|
self.primed = False
|
||||||
|
if tag == 'a' and attr[0][0] == 'href':
|
||||||
|
self.hrefs.append(attr[0][1])
|
||||||
|
|
||||||
|
|
||||||
|
# MIME Parsers
|
||||||
|
|
||||||
|
class BaseMIMEParser(object):
|
||||||
|
"""
|
||||||
|
BaseMIMEParser can be sub-classed to define custom handlers for determining
|
||||||
|
the MIME type of external urls.
|
||||||
|
"""
|
||||||
|
pattern = re.compile(r'.*$')
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def get_mimetype(url):
|
||||||
|
"""
|
||||||
|
Guess based on the file extension.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
url (text): Web url that was linked to by a reddit submission.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
modified_url (text): The url (or filename) that will be used when
|
||||||
|
constructing the command to run.
|
||||||
|
content_type (text): The mime-type that will be used when
|
||||||
|
constructing the command to run. If the mime-type is unknown,
|
||||||
|
return None and the program will fallback to using the web
|
||||||
|
browser.
|
||||||
|
"""
|
||||||
|
filename = url.split('?')[0]
|
||||||
|
content_type, _ = mimetypes.guess_type(filename)
|
||||||
|
return url, content_type
|
||||||
|
|
||||||
|
|
||||||
|
class YoutubeMIMEParser(BaseMIMEParser):
|
||||||
|
"""
|
||||||
|
Youtube videos can be streamed with vlc or downloaded with youtube-dl.
|
||||||
|
Assign a custom mime-type so they can be referenced in mailcap.
|
||||||
|
"""
|
||||||
|
pattern = re.compile(
|
||||||
|
r'(?:https?://)?(m\.)?(?:youtu\.be/|(?:www\.)?youtube\.com/watch'
|
||||||
|
r'(?:\.php)?\'?.*v=)([a-zA-Z0-9\-_]+)')
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def get_mimetype(url):
|
||||||
|
return url, 'video/x-youtube'
|
||||||
|
|
||||||
|
|
||||||
|
class GifvMIMEParser(BaseMIMEParser):
|
||||||
|
"""
|
||||||
|
Special case for .gifv, which is a custom video format for imgur that is
|
||||||
|
incorrectly (or on purpose?) returned with a Content-Type of text/html.
|
||||||
|
"""
|
||||||
|
pattern = re.compile(r'.*[.]gifv$')
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def get_mimetype(url):
|
||||||
|
modified_url = url[:-4] + 'webm'
|
||||||
|
return modified_url, 'image/webm'
|
||||||
|
|
||||||
|
|
||||||
|
class RedditUploadsMIMEParser(BaseMIMEParser):
|
||||||
|
"""
|
||||||
|
Reddit uploads do not have a file extension, but we can grab the mime-type
|
||||||
|
from the page header.
|
||||||
|
"""
|
||||||
|
pattern = re.compile(r'https://i.reddituploads.com/.+$')
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def get_mimetype(url):
|
||||||
|
page = requests.head(url)
|
||||||
|
content_type = page.headers.get('Content-Type', '')
|
||||||
|
content_type = content_type.split(';')[0] # Strip out the encoding
|
||||||
|
return url, content_type
|
||||||
|
|
||||||
|
|
||||||
|
class ImgurMIMEParser(BaseMIMEParser):
|
||||||
"""
|
"""
|
||||||
The majority of imgur links don't point directly to the image, so we need
|
The majority of imgur links don't point directly to the image, so we need
|
||||||
to open the provided url and scrape the page for the link. For galleries,
|
to open the provided url and scrape the page for the link.
|
||||||
this method only returns the first image.
|
|
||||||
"""
|
"""
|
||||||
pattern = re.compile(r'https?://(w+\.)?(m\.)?imgur\.com/[^.]+$')
|
pattern = re.compile(r'https?://(w+\.)?(m\.)?imgur\.com/[^.]+$')
|
||||||
|
|
||||||
@@ -121,16 +151,43 @@ class ImgurHandler(BaseHandler):
|
|||||||
except HTMLParsed as data:
|
except HTMLParsed as data:
|
||||||
# We found a link
|
# We found a link
|
||||||
url = data.data
|
url = data.data
|
||||||
if GifvHandler.pattern.match(url):
|
if GifvMIMEParser.pattern.match(url):
|
||||||
return GifvHandler.get_mimetype(url)
|
return GifvMIMEParser.get_mimetype(url)
|
||||||
|
|
||||||
return BaseHandler.get_mimetype(url)
|
return BaseMIMEParser.get_mimetype(url)
|
||||||
|
|
||||||
|
|
||||||
# Handlers should be defined in the order they will be checked
|
class ImgurAlbumMIMEParser(BaseMIMEParser):
|
||||||
handlers = [
|
"""
|
||||||
ImgurHandler,
|
Imgur albums can contain several images, which need to be scraped from the
|
||||||
RedditUploadsHandler,
|
landing page.
|
||||||
YoutubeHandler,
|
"""
|
||||||
GifvHandler,
|
pattern = re.compile(r'https?://(w+\.)?(m\.)?imgur\.com/a/[^.]+$')
|
||||||
BaseHandler]
|
|
||||||
|
@staticmethod
|
||||||
|
def get_mimetype(url):
|
||||||
|
imgur_page = requests.get(url)
|
||||||
|
parser = ImgurAlbumHTMLParser(convert_charrefs=True)
|
||||||
|
|
||||||
|
try:
|
||||||
|
parser.feed(imgur_page.text)
|
||||||
|
except Exception as e:
|
||||||
|
_logger.warning(e)
|
||||||
|
urls = []
|
||||||
|
else:
|
||||||
|
urls = ['http:' + href for href in parser.hrefs]
|
||||||
|
|
||||||
|
if urls:
|
||||||
|
return "' '".join(urls), 'image/x-imgur-album'
|
||||||
|
else:
|
||||||
|
return url, None
|
||||||
|
|
||||||
|
|
||||||
|
# Parsers should be listed in the order they will be checked
|
||||||
|
parsers = [
|
||||||
|
ImgurAlbumMIMEParser,
|
||||||
|
ImgurMIMEParser,
|
||||||
|
RedditUploadsMIMEParser,
|
||||||
|
YoutubeMIMEParser,
|
||||||
|
GifvMIMEParser,
|
||||||
|
BaseMIMEParser]
|
||||||
@@ -317,9 +317,9 @@ class Terminal(object):
|
|||||||
return self.open_browser(url)
|
return self.open_browser(url)
|
||||||
|
|
||||||
command = None
|
command = None
|
||||||
for handler in mime_handlers.handlers:
|
for parser in mime_handlers.parsers:
|
||||||
if handler.pattern.match(url):
|
if parser.pattern.match(url):
|
||||||
modified_url, content_type = handler.get_mimetype(url)
|
modified_url, content_type = parser.get_mimetype(url)
|
||||||
_logger.info('MIME type: %s', content_type)
|
_logger.info('MIME type: %s', content_type)
|
||||||
_logger.info('Modified url: %s', modified_url)
|
_logger.info('Modified url: %s', modified_url)
|
||||||
if not content_type or content_type == 'text/html':
|
if not content_type or content_type == 'text/html':
|
||||||
|
|||||||
Reference in New Issue
Block a user