Use imgur api to retrieve direct links to images.

This commit is contained in:
woorst
2017-05-28 00:25:00 -05:00
parent 84824f0352
commit 5da32c258a

View File

@@ -4,6 +4,7 @@ import mimetypes
import requests import requests
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
import json
_logger = logging.getLogger(__name__) _logger = logging.getLogger(__name__)
@@ -104,64 +105,38 @@ class RedditUploadsMIMEParser(BaseMIMEParser):
class ImgurMIMEParser(BaseMIMEParser): class ImgurMIMEParser(BaseMIMEParser):
""" """
The majority of imgur links don't point directly to the image, so we need Imgur provides a json api exposing its entire infrastructure. Each imgur
to open the provided url and scrape the page for the link. page has an associated hash and can either contain an album, a gallery, or single image.
Scrape the actual image url from an imgur landing page. Imgur intentionally see https://apidocs.imgur.com
obscures this on most reddit links in order to draw more traffic for their
advertisements.
There are a couple of <meta> tags that supply the relevant info:
<meta name="twitter:image" content="https://i.imgur.com/xrqQ4LEh.jpg">
<meta property="og:image" content="http://i.imgur.com/xrqQ4LE.jpg?fb">
<link rel="image_src" href="http://i.imgur.com/xrqQ4LE.jpg">
""" """
pattern = re.compile(r'https?://(w+\.)?(m\.)?imgur\.com/[^.]+$') pattern = re.compile(r'https?://(w+\.)?(m\.)?imgur\.com/[^.]+$')
@staticmethod @staticmethod
def get_mimetype(url): def get_mimetype(url):
page = requests.get(url) endpoint = 'https://api.imgur.com/3/{domain}/{page_hash}'
soup = BeautifulSoup(page.content, 'html.parser') header = {'authorization': 'Client-ID {}'.format('d8842d573e8b9dd')}
tag = soup.find('meta', attrs={'name': 'twitter:image'})
if tag:
url = tag.get('content')
if GifvMIMEParser.pattern.match(url):
return GifvMIMEParser.get_mimetype(url)
return BaseMIMEParser.get_mimetype(url)
pattern = re.compile(r'https?://(w+\.)?(m\.)?imgur\.com/((?P<domain>a|album|gallery)/)?(?P<hash>.+)$')
m = pattern.match(url)
page_hash = m.group('hash')
domain = 'album' if m.group('domain') in ['a', 'album'] else 'gallery'
class ImgurAlbumMIMEParser(BaseMIMEParser): r = requests.get(endpoint.format(domain=domain, page_hash=page_hash),
""" headers=header)
Imgur albums can contain several images, which need to be scraped from the if r.status_code == 404:
landing page. Assumes the following html structure: r = requests.get(endpoint.format(domain='image',
page_hash=page_hash), headers=header)
<div class="post-image"> data = json.loads(r.text)['data']
<a href="//i.imgur.com/L3Lfp1O.jpg" class="zoom"> if 'images' in data:
<img class="post-image-placeholder" # TODO: handle imgur albums with mixed content, i.e. jpeg and gifv
src="//i.imgur.com/L3Lfp1Og.jpg" alt="Close up"> urls = ' '.join([d['link'] for d in data['images'] if not d['animated']])
<img class="js-post-image-thumb" return urls, 'image/x-imgur-album'
src="//i.imgur.com/L3Lfp1Og.jpg" alt="Close up">
</a>
</div>
"""
pattern = re.compile(r'https?://(w+\.)?(m\.)?imgur\.com/a(lbum)?/[^.]+$')
@staticmethod
def get_mimetype(url):
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
urls = []
for div in soup.find_all('div', class_='post-image'):
img = div.find('img')
src = img.get('src') if img else None
if src:
urls.append('http:{0}'.format(src))
if urls:
return " ".join(urls), 'image/x-imgur-album'
else: else:
return url, None return (data['mp4'], 'video/mp4') if data['animated'] else (data['link'], data['type'])
return url, None
class InstagramMIMEParser(BaseMIMEParser): class InstagramMIMEParser(BaseMIMEParser):
@@ -192,7 +167,6 @@ class InstagramMIMEParser(BaseMIMEParser):
parsers = [ parsers = [
InstagramMIMEParser, InstagramMIMEParser,
GfycatMIMEParser, GfycatMIMEParser,
ImgurAlbumMIMEParser,
ImgurMIMEParser, ImgurMIMEParser,
RedditUploadsMIMEParser, RedditUploadsMIMEParser,
YoutubeMIMEParser, YoutubeMIMEParser,