import re import logging import mimetypes import requests from bs4 import BeautifulSoup _logger = logging.getLogger(__name__) class BaseMIMEParser(object): """ BaseMIMEParser can be sub-classed to define custom handlers for determining the MIME type of external urls. """ pattern = re.compile(r'.*$') @staticmethod def get_mimetype(url): """ Guess based on the file extension. Args: url (text): Web url that was linked to by a reddit submission. Returns: modified_url (text): The url (or filename) that will be used when constructing the command to run. content_type (text): The mime-type that will be used when constructing the command to run. If the mime-type is unknown, return None and the program will fallback to using the web browser. """ filename = url.split('?')[0] filename = filename.split('#')[0] content_type, _ = mimetypes.guess_type(filename) return url, content_type class OpenGraphMIMEParser(BaseMIMEParser): """ Open graph protocol is used on many web pages. If the page is a video page both of the above tags will be present and priority is given to video content. see http://ogp.me """ pattern = re.compile(r'.*$') @staticmethod def get_mimetype(url): page = requests.get(url) soup = BeautifulSoup(page.content, 'html.parser') for og_type in ['video', 'image']: prop = 'og:' + og_type + ':secure_url' tag = soup.find('meta', attrs={'property': prop}) if not tag: prop = 'og:' + og_type tag = soup.find('meta', attrs={'property': prop}) if tag: return BaseMIMEParser.get_mimetype(tag.get('content')) return url, None class VideoTagMIMEParser(BaseMIMEParser): """ """ pattern = re.compile(r'.*$') @staticmethod def get_mimetype(url): page = requests.get(url) soup = BeautifulSoup(page.content, 'html.parser') # TODO: Handle pages with multiple videos video = soup.find('video') source = None if video: source = video.find('source', attr={'res': 'HD'}) source = source or video.find('source', attr={'type': 'video/mp4'}) source = source or video.find('source') if source: return source.get('src'), source.get('type') else: return url, None class GfycatMIMEParser(BaseMIMEParser): """ Gfycat provides a primitive json api to generate image links. URLs can be downloaded as either gif, mp4, webm, or mjpg. Mp4 was selected because it's fast and works with VLC. https://gfycat.com/api https://gfycat.com/UntidyAcidicIberianemeraldlizard --> https://giant.gfycat.com/UntidyAcidicIberianemeraldlizard.webm """ pattern = re.compile(r'https?://(www\.)?gfycat\.com/[^.]+$') @staticmethod def get_mimetype(url): identifier = url.split('/')[-1] api_url = 'https://api.gfycat.com/v1/gfycats/{}'.format(identifier) resp = requests.get(api_url) image_url = resp.json()['gfyItem']['mp4Url'] return image_url, 'video/mp4' class YoutubeMIMEParser(BaseMIMEParser): """ Youtube videos can be streamed with vlc or downloaded with youtube-dl. Assign a custom mime-type so they can be referenced in mailcap. """ pattern = re.compile( r'(?:https?://)?(m\.)?(?:youtu\.be/|(?:www\.)?youtube\.com/watch' r'(?:\.php)?\'?.*v=)([a-zA-Z0-9\-_]+)') @staticmethod def get_mimetype(url): return url, 'video/x-youtube' class VimeoMIMEParser(BaseMIMEParser): """ Vimeo videos can be streamed with vlc or downloaded with youtube-dl. Assign a custom mime-type so they can be referenced in mailcap. """ pattern = re.compile(r'https?://(www\.)?vimeo\.com/\d+$') @staticmethod def get_mimetype(url): return url, 'video/x-youtube' class GifvMIMEParser(BaseMIMEParser): """ Special case for .gifv, which is a custom video format for imgur serves as html with a special