import re
import logging
import mimetypes
import requests
from bs4 import BeautifulSoup
# Module-level logger, named after this module via ``__name__``.
_logger = logging.getLogger(__name__)
class BaseMIMEParser(object):
    """
    Fallback MIME parser that other parsers can subclass.

    Subclasses override ``pattern`` to select the urls they handle and
    ``get_mimetype`` to customise how the MIME type is determined.
    """
    # Matches every url, so this parser can always be used as a fallback.
    pattern = re.compile(r'.*$')

    @staticmethod
    def get_mimetype(url):
        """
        Guess the MIME type from the url's file extension.

        Args:
            url (text): Web url that was linked to by a reddit submission.

        Returns:
            modified_url (text): The url (or filename) that will be used
                when constructing the command to run. Here it is the
                unmodified input url.
            content_type (text): The mime-type that will be used when
                constructing the command to run. If the mime-type is
                unknown, None is returned and the program will fall back
                to using the web browser.
        """
        # Drop the query string first, then any fragment, so that only the
        # path (and its extension) is inspected.
        bare_path = url.partition('?')[0].partition('#')[0]
        guessed_type = mimetypes.guess_type(bare_path)[0]
        return url, guessed_type
class OpenGraphMIMEParser(BaseMIMEParser):
    """
    Open graph protocol is used on many web pages.

    The page is downloaded and its ``<meta property="...">`` tags are
    inspected, in this order:

        og:video:secure_url, og:video, og:image:secure_url, og:image

    If the page is a video page both video and image tags may be present,
    and priority is given to video content.

    see http://ogp.me
    """
    pattern = re.compile(r'.*$')

    @staticmethod
    def get_mimetype(url):
        """
        Resolve the media url advertised by the page's Open Graph tags.

        Args:
            url (text): Web page url to download and inspect.

        Returns:
            modified_url (text): The ``content`` of the first usable Open
                Graph tag, or the original url when none is found.
            content_type (text): MIME type guessed from the media url's
                extension by ``BaseMIMEParser.get_mimetype``, or None.
        """
        page = requests.get(url)
        soup = BeautifulSoup(page.content, 'html.parser')

        for og_type in ['video', 'image']:
            # Prefer the https variant, then fall back to the plain tag.
            for prop in ('og:' + og_type + ':secure_url', 'og:' + og_type):
                tag = soup.find('meta', attrs={'property': prop})
                if tag is None:
                    continue
                content = tag.get('content')
                if content:
                    return BaseMIMEParser.get_mimetype(content)
                # A tag without a ``content`` attribute used to crash with
                # AttributeError on ``None.split``; keep looking instead.

        return url, None
class VideoTagMIMEParser(BaseMIMEParser):
    """
    Resolve media from the first HTML5 ``<video>`` element on a page.

    The ``<source>`` children of that element are tried in order of
    preference: one with ``res="HD"``, then one with ``type="video/mp4"``,
    then the first ``<source>`` of any kind.
    """
    pattern = re.compile(r'.*$')

    @staticmethod
    def get_mimetype(url):
        """
        Download the page and pick a ``<source>`` from its first video.

        Args:
            url (text): Web page url to download and inspect.

        Returns:
            modified_url (text): The ``src`` of the chosen ``<source>``, or
                the original url when no usable source is found.
            content_type (text): The ``type`` attribute of the chosen
                ``<source>`` (may be None), or None when falling back to
                the original url.
        """
        page = requests.get(url)
        soup = BeautifulSoup(page.content, 'html.parser')

        # TODO: Handle pages with multiple videos
        video = soup.find('video')
        source = None
        if video:
            # The keyword must be ``attrs``; the previous ``attr=`` spelling
            # was treated as a filter on an attribute literally named
            # "attr", so these two preferences never matched.
            source = video.find('source', attrs={'res': 'HD'})
            source = source or video.find(
                'source', attrs={'type': 'video/mp4'})
            source = source or video.find('source')

        # Only hand back a source that actually carries a url; otherwise
        # fall back to the page itself with an unknown type.
        if source and source.get('src'):
            return source.get('src'), source.get('type')
        return url, None
class GfycatMIMEParser(BaseMIMEParser):
    """
    Resolve gfycat page links to a direct mp4 url through the gfycat json
    api. Gfycat serves several formats (gif, mp4, webm, mjpg); mp4 is used
    because it's fast and works with VLC.

    https://gfycat.com/api

    Example: for https://gfycat.com/UntidyAcidicIberianemeraldlizard the
    identifier ``UntidyAcidicIberianemeraldlizard`` is looked up and the
    ``mp4Url`` reported by the api is returned.
    """
    pattern = re.compile(r'https?://(www\.)?gfycat\.com/[^.]+$')

    @staticmethod
    def get_mimetype(url):
        """
        Query the gfycat api for the mp4 url of the linked item.

        Args:
            url (text): A gfycat page url; its last path segment is used as
                the item identifier.

        Returns:
            modified_url (text): The ``mp4Url`` field of the api response.
            content_type (text): Always ``'video/mp4'``.
        """
        # Everything after the final slash is the gfycat identifier.
        gfy_id = url.rsplit('/', 1)[-1]
        endpoint = 'https://api.gfycat.com/v1/gfycats/{}'.format(gfy_id)
        payload = requests.get(endpoint).json()
        return payload['gfyItem']['mp4Url'], 'video/mp4'
class YoutubeMIMEParser(BaseMIMEParser):
    """
    Tag youtube links with a custom mime-type.

    Youtube videos can be streamed with vlc or downloaded with youtube-dl,
    so they are given their own type that a mailcap entry can refer to.
    """
    # Optional scheme and "m." prefix, then either a youtu.be short link or
    # a youtube.com/watch url containing "v=", followed by the video id.
    pattern = re.compile(
        r'(?:https?://)?(m\.)?'
        r'(?:youtu\.be/|(?:www\.)?youtube\.com/watch(?:\.php)?\'?.*v=)'
        r'([a-zA-Z0-9\-_]+)')

    @staticmethod
    def get_mimetype(url):
        """
        Return the url unchanged together with the custom youtube type.

        Args:
            url (text): A youtube url.

        Returns:
            modified_url (text): The input url, unmodified.
            content_type (text): Always ``'video/x-youtube'``.
        """
        custom_type = 'video/x-youtube'
        return url, custom_type
class VimeoMIMEParser(BaseMIMEParser):
    """
    Tag vimeo links with a custom mime-type.

    Vimeo videos can be streamed with vlc or downloaded with youtube-dl,
    so they are given a custom type that a mailcap entry can refer to.
    The type returned is the same ``video/x-youtube`` value that
    YoutubeMIMEParser returns.
    """
    # Only numeric video pages, e.g. https://vimeo.com/<digits>
    pattern = re.compile(r'https?://(www\.)?vimeo\.com/\d+$')

    @staticmethod
    def get_mimetype(url):
        """
        Return the url unchanged together with the custom video type.

        Args:
            url (text): A vimeo url.

        Returns:
            modified_url (text): The input url, unmodified.
            content_type (text): Always ``'video/x-youtube'``.
        """
        custom_type = 'video/x-youtube'
        return url, custom_type
class GifvMIMEParser(BaseMIMEParser):
"""
Special case for .gifv, which is a custom video format for imgur serves
as html with a special