From 5da32c258a8e98eb0dde00cc1dd7e4a18a7d707c Mon Sep 17 00:00:00 2001 From: woorst Date: Sun, 28 May 2017 00:25:00 -0500 Subject: [PATCH 1/4] Use imgur api to retrieve direct links to images. --- rtv/mime_parsers.py | 72 +++++++++++++++------------------------------ 1 file changed, 23 insertions(+), 49 deletions(-) diff --git a/rtv/mime_parsers.py b/rtv/mime_parsers.py index 59b978d..b8450de 100644 --- a/rtv/mime_parsers.py +++ b/rtv/mime_parsers.py @@ -4,6 +4,7 @@ import mimetypes import requests from bs4 import BeautifulSoup +import json _logger = logging.getLogger(__name__) @@ -104,64 +105,38 @@ class RedditUploadsMIMEParser(BaseMIMEParser): class ImgurMIMEParser(BaseMIMEParser): """ - The majority of imgur links don't point directly to the image, so we need - to open the provided url and scrape the page for the link. + Imgur provides a json api exposing its entire infrastructure. Each imgur + page has an associated hash and can either contain an album, a gallery, or single image. - Scrape the actual image url from an imgur landing page. Imgur intentionally - obscures this on most reddit links in order to draw more traffic for their - advertisements. 
- - There are a couple of tags that supply the relevant info: - - - + see https://apidocs.imgur.com """ pattern = re.compile(r'https?://(w+\.)?(m\.)?imgur\.com/[^.]+$') @staticmethod def get_mimetype(url): - page = requests.get(url) - soup = BeautifulSoup(page.content, 'html.parser') - tag = soup.find('meta', attrs={'name': 'twitter:image'}) - if tag: - url = tag.get('content') - if GifvMIMEParser.pattern.match(url): - return GifvMIMEParser.get_mimetype(url) - return BaseMIMEParser.get_mimetype(url) + endpoint = 'https://api.imgur.com/3/{domain}/{page_hash}' + header = {'authorization': 'Client-ID {}'.format('d8842d573e8b9dd')} + pattern = re.compile(r'https?://(w+\.)?(m\.)?imgur\.com/((?P<domain>a|album|gallery)/)?(?P<hash>.+)$') + m = pattern.match(url) + page_hash = m.group('hash') + domain = 'album' if m.group('domain') in ['a', 'album'] else 'gallery' -class ImgurAlbumMIMEParser(BaseMIMEParser): - """ - Imgur albums can contain several images, which need to be scraped from the - landing page. Assumes the following html structure: + r = requests.get(endpoint.format(domain=domain, page_hash=page_hash), + headers=header) + if r.status_code == 404: + r = requests.get(endpoint.format(domain='image', + page_hash=page_hash), headers=header) -
- - Close up - Close up - -
- """ - pattern = re.compile(r'https?://(w+\.)?(m\.)?imgur\.com/a(lbum)?/[^.]+$') - - @staticmethod - def get_mimetype(url): - page = requests.get(url) - soup = BeautifulSoup(page.content, 'html.parser') - - urls = [] - for div in soup.find_all('div', class_='post-image'): - img = div.find('img') - src = img.get('src') if img else None - if src: - urls.append('http:{0}'.format(src)) - - if urls: - return " ".join(urls), 'image/x-imgur-album' + data = json.loads(r.text)['data'] + if 'images' in data: + # TODO: handle imgur albums with mixed content, i.e. jpeg and gifv + urls = ' '.join([d['link'] for d in data['images'] if not d['animated']]) + return urls, 'image/x-imgur-album' else: - return url, None + return (data['mp4'], 'video/mp4') if data['animated'] else (data['link'], data['type']) + + return url, None class InstagramMIMEParser(BaseMIMEParser): @@ -192,7 +167,6 @@ class InstagramMIMEParser(BaseMIMEParser): parsers = [ InstagramMIMEParser, GfycatMIMEParser, - ImgurAlbumMIMEParser, ImgurMIMEParser, RedditUploadsMIMEParser, YoutubeMIMEParser, From d123986d4f1db4de111f1068a6a6c0aebb4dbaf6 Mon Sep 17 00:00:00 2001 From: woorst Date: Sun, 28 May 2017 02:13:39 -0500 Subject: [PATCH 2/4] pass unhandled imgur page to browser --- rtv/mime_parsers.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/rtv/mime_parsers.py b/rtv/mime_parsers.py index b8450de..43ef191 100644 --- a/rtv/mime_parsers.py +++ b/rtv/mime_parsers.py @@ -124,20 +124,20 @@ class ImgurMIMEParser(BaseMIMEParser): r = requests.get(endpoint.format(domain=domain, page_hash=page_hash), headers=header) - if r.status_code == 404: + if r.status_code != 200: r = requests.get(endpoint.format(domain='image', page_hash=page_hash), headers=header) + if r.status_code != 200: + return url, None data = json.loads(r.text)['data'] if 'images' in data: # TODO: handle imgur albums with mixed content, i.e. 
jpeg and gifv urls = ' '.join([d['link'] for d in data['images'] if not d['animated']]) return urls, 'image/x-imgur-album' - else: + else : return (data['mp4'], 'video/mp4') if data['animated'] else (data['link'], data['type']) - return url, None - class InstagramMIMEParser(BaseMIMEParser): """ From 371f8db06aa61373cb2920b9d6f5737ac2021e9b Mon Sep 17 00:00:00 2001 From: woorst Date: Sun, 28 May 2017 09:25:28 -0500 Subject: [PATCH 3/4] Fix test for imgur gif link --- rtv/mime_parsers.py | 8 +++++--- tests/test_mime_parsers.py | 8 ++++---- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/rtv/mime_parsers.py b/rtv/mime_parsers.py index 43ef191..469f6ec 100644 --- a/rtv/mime_parsers.py +++ b/rtv/mime_parsers.py @@ -133,10 +133,12 @@ class ImgurMIMEParser(BaseMIMEParser): data = json.loads(r.text)['data'] if 'images' in data: # TODO: handle imgur albums with mixed content, i.e. jpeg and gifv - urls = ' '.join([d['link'] for d in data['images'] if not d['animated']]) - return urls, 'image/x-imgur-album' + links = ' '.join([d['link'] for d in data['images'] if not d['animated']]) + return links.replace('http://', 'https://'), 'image/x-imgur-album' else : - return (data['mp4'], 'video/mp4') if data['animated'] else (data['link'], data['type']) + link = data['mp4'] if data['animated'] else data['link'] + mime = 'video/mp4' if data['animated'] else data['type'] + return link.replace('http://', 'https://'), mime class InstagramMIMEParser(BaseMIMEParser): diff --git a/tests/test_mime_parsers.py b/tests/test_mime_parsers.py index f281215..d7a5d9a 100644 --- a/tests/test_mime_parsers.py +++ b/tests/test_mime_parsers.py @@ -41,12 +41,12 @@ URLS = OrderedDict([ 'https://i.imgur.com/yW0kbMi.jpg', 'image/jpeg')), ('imgur_2', ( - 'http://imgur.com/yjP1v4B', - 'https://i.imgur.com/yjP1v4Bh.jpg', - 'image/jpeg')), + 'http://imgur.com/gallery/yjP1v4B', + 'https://i.imgur.com/yjP1v4B.mp4', + 'video/mp4')), ('imgur_album', ( 'http://imgur.com/a/qx9t5', - 
'http://i.imgur.com/uEt0YLI.jpg', + 'https://i.imgur.com/uEt0YLI.jpg', 'image/x-imgur-album')), ('instagram_image', ( 'https://www.instagram.com/p/BIxQ0vrBN2Y/?taken-by=kimchi_chic', From cb4e56e0c6fa805301b590b390f8238bdc9e2a42 Mon Sep 17 00:00:00 2001 From: woorst Date: Mon, 29 May 2017 11:46:18 -0500 Subject: [PATCH 4/4] use request json method --- rtv/mime_parsers.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/rtv/mime_parsers.py b/rtv/mime_parsers.py index 469f6ec..e358082 100644 --- a/rtv/mime_parsers.py +++ b/rtv/mime_parsers.py @@ -4,7 +4,6 @@ import mimetypes import requests from bs4 import BeautifulSoup -import json _logger = logging.getLogger(__name__) @@ -130,7 +129,7 @@ class ImgurMIMEParser(BaseMIMEParser): if r.status_code != 200: return url, None - data = json.loads(r.text)['data'] + data = r.json()['data'] if 'images' in data: # TODO: handle imgur albums with mixed content, i.e. jpeg and gifv links = ' '.join([d['link'] for d in data['images'] if not d['animated']])