Fixing mime parsers (#580)

mime parser cleanup
2018-08-04 23:24:32 -04:00
parent b3343e2579
commit b164b5a6ba
29 changed files with 6385 additions and 6912 deletions
--- a/rtv/mime_parsers.py
+++ b/rtv/mime_parsers.py
@@ -84,6 +84,7 @@ class VideoTagMIMEParser(BaseMIMEParser):

        # TODO: Handle pages with multiple videos
        video = soup.find('video')
+        source = None
        if video:
            source = video.find('source', attr={'res': 'HD'})
            source = source or video.find('source', attr={'type': 'video/mp4'})
@@ -185,11 +186,12 @@ class RedditVideoMIMEParser(BaseMIMEParser):
        page = requests.get(request_url)
        soup = BeautifulSoup(page.content, 'html.parser')
        if not soup.find('representation', attrs={'mimetype': 'audio/mp4'}):
-            reps = soup.find_all('representation',
-                                 attrs={'mimetype': 'video/mp4'})
-            rep = sorted(reps, reverse=True,
-                         key=lambda t: int(t.get('bandwidth')))[0]
-            return url + '/' + rep.find('baseurl').text, 'video/mp4'
+            reps = soup.find_all('representation', attrs={'mimetype': 'video/mp4'})
+            reps = sorted(reps, reverse=True, key=lambda t: int(t.get('bandwidth')))
+            if reps:
+                url_suffix = reps[0].find('baseurl')
+                if url_suffix:
+                    return url + '/' + url_suffix.text, 'video/mp4'

        return request_url, 'video/x-youtube'

@@ -270,7 +272,9 @@ class ImgurApiMIMEParser(BaseMIMEParser):
        Attempt to use one of the scrapers if the API doesn't work
        """
        if domain == 'album':
-            return ImgurScrapeAlbumMIMEParser.get_mimetype(url)
+            # The old Imgur album scraper has stopped working and I haven't
+            # put in the effort to figure out why
+            return url, None
        else:
            return ImgurScrapeMIMEParser.get_mimetype(url)

@@ -303,40 +307,6 @@ class ImgurScrapeMIMEParser(BaseMIMEParser):
        return BaseMIMEParser.get_mimetype(url)


-class ImgurScrapeAlbumMIMEParser(BaseMIMEParser):
-    """
-    Imgur albums can contain several images, which need to be scraped from the
-    landing page. Assumes the following html structure:
-
-        <div class="post-image">
-            <a href="//i.imgur.com/L3Lfp1O.jpg" class="zoom">
-                <img class="post-image-placeholder"
-                     src="//i.imgur.com/L3Lfp1Og.jpg" alt="Close up">
-                <img class="js-post-image-thumb"
-                     src="//i.imgur.com/L3Lfp1Og.jpg" alt="Close up">
-            </a>
-        </div>
-    """
-    pattern = re.compile(r'https?://(w+\.)?(m\.)?imgur\.com/a(lbum)?/[^.]+$')
-
-    @staticmethod
-    def get_mimetype(url):
-        page = requests.get(url)
-        soup = BeautifulSoup(page.content, 'html.parser')
-
-        urls = []
-        for div in soup.find_all('div', class_='post-image'):
-            img = div.find('img')
-            src = img.get('src') if img else None
-            if src:
-                urls.append('http:{0}'.format(src))
-
-        if urls:
-            return " ".join(urls), 'image/x-imgur-album'
-
-        return url, None
-
-
 class InstagramMIMEParser(OpenGraphMIMEParser):
    """
    Instagram uses the Open Graph protocol
@@ -351,49 +321,6 @@ class StreamableMIMEParser(OpenGraphMIMEParser):
    pattern = re.compile(r'https?://(www\.)?streamable\.com/[^.]+$')


-class TwitchMIMEParser(BaseMIMEParser):
-    """
-    Non-streaming videos hosted by twitch.tv
-    """
-    pattern = re.compile(r'https?://clips\.?twitch\.tv/[^.]+$')
-
-    @staticmethod
-    def get_mimetype(url):
-        page = requests.get(url)
-        soup = BeautifulSoup(page.content, 'html.parser')
-        tag = soup.find('meta', attrs={'name': 'twitter:image'})
-        thumbnail = tag.get('content')
-        suffix = '-preview.jpg'
-        if thumbnail.endswith(suffix):
-            return thumbnail.replace(suffix, '.mp4'), 'video/mp4'
-
-        return url, None
-
-
-class OddshotMIMEParser(OpenGraphMIMEParser):
-    """
-    Oddshot uses the Open Graph protocol
-    """
-    pattern = re.compile(r'https?://oddshot\.tv/s(hot)?/[^.]+$')
-
-
-class VidmeMIMEParser(BaseMIMEParser):
-    """
-    Vidme provides a json api.
-
-    https://doc.vid.me
-    """
-    pattern = re.compile(r'https?://(www\.)?vid\.me/[^.]+$')
-
-    @staticmethod
-    def get_mimetype(url):
-        resp = requests.get('https://api.vid.me/videoByUrl?url=' + url)
-        if resp.status_code == 200 and resp.json()['status']:
-            return resp.json()['video']['complete_url'], 'video/mp4'
-
-        return url, None
-
-
 class LiveleakMIMEParser(BaseMIMEParser):
    """
    https://www.liveleak.com/view?i=12c_3456789
@@ -442,9 +369,14 @@ class ClippitUserMIMEParser(BaseMIMEParser):
    def get_mimetype(url):
        page = requests.get(url)
        soup = BeautifulSoup(page.content, 'html.parser')
-        tag = soup.find(id='jwplayer-container')
-        quality = ['data-{}-file'.format(_) for _ in ['hd', 'sd']]
-        return tag.get(quality[0]), 'video/mp4'
+        tag = soup.find(id='player-container')
+        if tag:
+            quality = ['data-{}-file'.format(_) for _ in ['hd', 'sd']]
+            new_url = tag.get(quality[0])
+            if new_url:
+                return new_url, 'video/mp4'
+
+        return url, None


 class GifsMIMEParser(OpenGraphMIMEParser):
@@ -461,13 +393,6 @@ class GiphyMIMEParser(OpenGraphMIMEParser):
    pattern = re.compile(r'https?://(www\.)?giphy\.com/gifs/.+$')


-class ImgtcMIMEParser(OpenGraphMIMEParser):
-    """
-    imgtc.com uses the Open Graph protocol
-    """
-    pattern = re.compile(r'https?://(www\.)?imgtc\.com/w/.+$')
-
-
 class ImgflipMIMEParser(OpenGraphMIMEParser):
    """
    imgflip.com uses the Open Graph protocol
@@ -540,9 +465,7 @@ class WorldStarHipHopMIMEParser(BaseMIMEParser):
 parsers = [
    StreamjaMIMEParser,
    ClippitUserMIMEParser,
-    OddshotMIMEParser,
    StreamableMIMEParser,
-    VidmeMIMEParser,
    InstagramMIMEParser,
    GfycatMIMEParser,
    ImgurApiMIMEParser,
@@ -551,11 +474,9 @@ parsers = [
    YoutubeMIMEParser,
    VimeoMIMEParser,
    LiveleakMIMEParser,
-    TwitchMIMEParser,
    FlickrMIMEParser,
    GifsMIMEParser,
    GiphyMIMEParser,
-    ImgtcMIMEParser,
    ImgflipMIMEParser,
    LivememeMIMEParser,
    MakeamemeMIMEParser,