Merge branch 'InstagramMIMEParser' of https://github.com/woorst/rtv into woorst-InstagramMIMEParser

2016-08-06 16:59:45 -07:00
parent 33727be682 cc1d208b37
commit 71713bdbfd
1 changed file with 25 additions and 0 deletions
@@ -164,8 +164,33 @@ class ImgurAlbumMIMEParser(BaseMIMEParser):
            return url, None


+class InstagramMIMEParser(BaseMIMEParser):
+    """
+    Instagram pages can contain either an embedded image or video. The <meta>
+    tags below provide the relevant info.
+
+    <meta property="og:image" content="https://xxxx.jpg?ig_cache_key=xxxxx" />
+    <meta property="og:video:secure_url" content="https://xxxxx.mp4" />
+
+    If the page is a video page both of the above tags will be present, so the
+    video tag is tried first. With neither usable, (url, None) is returned.
+    """
+    pattern = re.compile(r'https?://(www\.)?instagr((am\.com)|\.am)/p/[^.]+$')
+
+    @staticmethod
+    def get_mimetype(url):
+        """Fetch url and delegate to BaseMIMEParser for the embedded media."""
+        soup = BeautifulSoup(requests.get(url).content, 'html.parser')
+        for prop in ('og:video:secure_url', 'og:image'):  # video wins
+            tag = soup.find('meta', attrs={'property': prop})
+            if tag is not None and tag.get('content'):
+                return BaseMIMEParser.get_mimetype(tag.get('content'))
+        return url, None  # neither tag carried a content url
+
+
 # Parsers should be listed in the order they will be checked
 parsers = [
+    InstagramMIMEParser,
    GfycatMIMEParser,
    ImgurAlbumMIMEParser,
    ImgurMIMEParser,