Merge branch 'imgur_api' of https://github.com/woorst/rtv into woorst-imgur_api
This commit is contained in:
@@ -130,65 +130,41 @@ class RedditUploadsMIMEParser(BaseMIMEParser):
|
|||||||
|
|
||||||
class ImgurMIMEParser(BaseMIMEParser):
|
class ImgurMIMEParser(BaseMIMEParser):
|
||||||
"""
|
"""
|
||||||
The majority of imgur links don't point directly to the image, so we need
|
Imgur provides a JSON API exposing its entire infrastructure. Each imgur
|
||||||
to open the provided url and scrape the page for the link.
|
page has an associated hash and can contain an album, a gallery, or a single image.
|
||||||
|
|
||||||
Scrape the actual image url from an imgur landing page. Imgur intentionally
|
see https://apidocs.imgur.com
|
||||||
obscures this on most reddit links in order to draw more traffic for their
|
|
||||||
advertisements.
|
|
||||||
|
|
||||||
There are a couple of <meta> tags that supply the relevant info:
|
|
||||||
<meta name="twitter:image" content="https://i.imgur.com/xrqQ4LEh.jpg">
|
|
||||||
<meta property="og:image" content="http://i.imgur.com/xrqQ4LE.jpg?fb">
|
|
||||||
<link rel="image_src" href="http://i.imgur.com/xrqQ4LE.jpg">
|
|
||||||
"""
|
"""
|
||||||
pattern = re.compile(r'https?://(w+\.)?(m\.)?imgur\.com/[^.]+$')
|
pattern = re.compile(r'https?://(w+\.)?(m\.)?imgur\.com/[^.]+$')
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def get_mimetype(url):
|
def get_mimetype(url):
|
||||||
page = requests.get(url)
|
endpoint = 'https://api.imgur.com/3/{domain}/{page_hash}'
|
||||||
soup = BeautifulSoup(page.content, 'html.parser')
|
header = {'authorization': 'Client-ID {}'.format('d8842d573e8b9dd')}
|
||||||
tag = soup.find('meta', attrs={'name': 'twitter:image'})
|
|
||||||
if tag:
|
|
||||||
url = tag.get('content')
|
|
||||||
if GifvMIMEParser.pattern.match(url):
|
|
||||||
return GifvMIMEParser.get_mimetype(url)
|
|
||||||
return BaseMIMEParser.get_mimetype(url)
|
|
||||||
|
|
||||||
|
pattern = re.compile(r'https?://(w+\.)?(m\.)?imgur\.com/((?P<domain>a|album|gallery)/)?(?P<hash>.+)$')
|
||||||
|
m = pattern.match(url)
|
||||||
|
page_hash = m.group('hash')
|
||||||
|
domain = 'album' if m.group('domain') in ['a', 'album'] else 'gallery'
|
||||||
|
|
||||||
class ImgurAlbumMIMEParser(BaseMIMEParser):
|
r = requests.get(endpoint.format(domain=domain, page_hash=page_hash),
|
||||||
"""
|
headers=header)
|
||||||
Imgur albums can contain several images, which need to be scraped from the
|
if r.status_code != 200:
|
||||||
landing page. Assumes the following html structure:
|
r = requests.get(endpoint.format(domain='image',
|
||||||
|
page_hash=page_hash), headers=header)
|
||||||
<div class="post-image">
|
if r.status_code != 200:
|
||||||
<a href="//i.imgur.com/L3Lfp1O.jpg" class="zoom">
|
|
||||||
<img class="post-image-placeholder"
|
|
||||||
src="//i.imgur.com/L3Lfp1Og.jpg" alt="Close up">
|
|
||||||
<img class="js-post-image-thumb"
|
|
||||||
src="//i.imgur.com/L3Lfp1Og.jpg" alt="Close up">
|
|
||||||
</a>
|
|
||||||
</div>
|
|
||||||
"""
|
|
||||||
pattern = re.compile(r'https?://(w+\.)?(m\.)?imgur\.com/a(lbum)?/[^.]+$')
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def get_mimetype(url):
|
|
||||||
page = requests.get(url)
|
|
||||||
soup = BeautifulSoup(page.content, 'html.parser')
|
|
||||||
|
|
||||||
urls = []
|
|
||||||
for div in soup.find_all('div', class_='post-image'):
|
|
||||||
img = div.find('img')
|
|
||||||
src = img.get('src') if img else None
|
|
||||||
if src:
|
|
||||||
urls.append('http:{0}'.format(src))
|
|
||||||
|
|
||||||
if urls:
|
|
||||||
return " ".join(urls), 'image/x-imgur-album'
|
|
||||||
else:
|
|
||||||
return url, None
|
return url, None
|
||||||
|
|
||||||
|
data = r.json()['data']
|
||||||
|
if 'images' in data:
|
||||||
|
# TODO: handle imgur albums with mixed content, i.e. jpeg and gifv
|
||||||
|
links = ' '.join([d['link'] for d in data['images'] if not d['animated']])
|
||||||
|
return links.replace('http://', 'https://'), 'image/x-imgur-album'
|
||||||
|
else :
|
||||||
|
link = data['mp4'] if data['animated'] else data['link']
|
||||||
|
mime = 'video/mp4' if data['animated'] else data['type']
|
||||||
|
return link.replace('http://', 'https://'), mime
|
||||||
|
|
||||||
|
|
||||||
class InstagramMIMEParser(OpenGraphMIMEParser):
|
class InstagramMIMEParser(OpenGraphMIMEParser):
|
||||||
"""
|
"""
|
||||||
@@ -227,7 +203,6 @@ parsers = [
|
|||||||
VidmeMIMEParser,
|
VidmeMIMEParser,
|
||||||
InstagramMIMEParser,
|
InstagramMIMEParser,
|
||||||
GfycatMIMEParser,
|
GfycatMIMEParser,
|
||||||
ImgurAlbumMIMEParser,
|
|
||||||
ImgurMIMEParser,
|
ImgurMIMEParser,
|
||||||
RedditUploadsMIMEParser,
|
RedditUploadsMIMEParser,
|
||||||
YoutubeMIMEParser,
|
YoutubeMIMEParser,
|
||||||
|
|||||||
@@ -46,12 +46,12 @@ URLS = OrderedDict([
|
|||||||
'https://i.imgur.com/yW0kbMi.jpg',
|
'https://i.imgur.com/yW0kbMi.jpg',
|
||||||
'image/jpeg')),
|
'image/jpeg')),
|
||||||
('imgur_2', (
|
('imgur_2', (
|
||||||
'http://imgur.com/yjP1v4B',
|
'http://imgur.com/gallery/yjP1v4B',
|
||||||
'https://i.imgur.com/yjP1v4Bh.jpg',
|
'https://i.imgur.com/yjP1v4B.mp4',
|
||||||
'image/jpeg')),
|
'video/mp4')),
|
||||||
('imgur_album', (
|
('imgur_album', (
|
||||||
'http://imgur.com/a/qx9t5',
|
'http://imgur.com/a/qx9t5',
|
||||||
'http://i.imgur.com/uEt0YLI.jpg',
|
'https://i.imgur.com/uEt0YLI.jpg',
|
||||||
'image/x-imgur-album')),
|
'image/x-imgur-album')),
|
||||||
('instagram_image', (
|
('instagram_image', (
|
||||||
'https://www.instagram.com/p/BIxQ0vrBN2Y/?taken-by=kimchi_chic',
|
'https://www.instagram.com/p/BIxQ0vrBN2Y/?taken-by=kimchi_chic',
|
||||||
|
|||||||
Reference in New Issue
Block a user