Refactoring and adding tests

Michael Lazar
2017-09-13 23:35:40 -04:00
parent 5e82811918
commit 031b58b3b4
9 changed files with 2500 additions and 109 deletions


@@ -2,8 +2,10 @@
from __future__ import unicode_literals
import re
import time
import logging
from datetime import datetime
from timeit import default_timer as timer
import six
from kitchen.text.display import wrap
@@ -11,6 +13,8 @@ from kitchen.text.display import wrap
from . import exceptions
from .packages import praw
from .packages.praw.errors import InvalidSubreddit
from .packages.praw.helpers import normalize_url
from .packages.praw.handlers import DefaultHandler
_logger = logging.getLogger(__name__)
@@ -777,3 +781,172 @@ class SubscriptionContent(Content):
        data['h_offset'] = 0
        return data


class RequestHeaderRateLimiter(DefaultHandler):
    """Custom PRAW request handler for rate-limiting requests.

    This is an alternative to PRAW 3's DefaultHandler that follows Reddit's
    modern API guidelines and rate-limits requests based on the
    X-Ratelimit-* headers returned by Reddit. Most of these methods are
    copied from, or derived from, the DefaultHandler.

    References:
        https://github.com/reddit/reddit/wiki/API
        https://github.com/praw-dev/prawcore/blob/master/prawcore/rate_limit.py
    """

    def __init__(self):
        # In PRAW's convention, these variables were bound to the class so
        # the cache could be shared among all of the ``reddit`` instances.
        # In RTV's use-case there is only ever a single reddit instance, so
        # it made sense to clean up the globals and turn them into instance
        # attributes.
        self.cache = {}
        self.cache_hit_callback = None
        self.timeouts = {}

        # These are used for the header-based rate limiting
        self.used = None
        self.remaining = None
        self.seconds_to_reset = None
        self.next_request_timestamp = None

        super(RequestHeaderRateLimiter, self).__init__()

    def _delay(self):
        """
        Pause before making the next HTTP request.
        """
        if self.next_request_timestamp is None:
            return

        sleep_seconds = self.next_request_timestamp - time.time()
        if sleep_seconds <= 0:
            return
        time.sleep(sleep_seconds)

    def _update(self, response_headers):
        """
        Update the state of the rate limiter based on the response headers:

            X-Ratelimit-Used: Approximate number of requests used this period
            X-Ratelimit-Remaining: Approximate number of requests left to use
            X-Ratelimit-Reset: Approximate number of seconds to end of period

        PRAW 5's rate-limiting logic is structured around making hundreds of
        evenly-spaced API requests, which makes sense for running something
        like a bot or a crawler.

        This handler's logic, on the other hand, is geared more towards
        interactive usage. It allows for short, sporadic bursts of requests.
        The assumption is that actual users browsing reddit shouldn't ever
        be in danger of hitting the rate limit. If they do hit the limit,
        they will be cut off until the period resets.
        """
        if 'x-ratelimit-remaining' not in response_headers:
            # This could be because the API returned an error response, or
            # it could be because we're using something like read-only
            # credentials, which Reddit doesn't appear to rate limit.
            return

        self.used = float(response_headers['x-ratelimit-used'])
        self.remaining = float(response_headers['x-ratelimit-remaining'])
        self.seconds_to_reset = int(response_headers['x-ratelimit-reset'])
        _logger.debug('Rate limit: %s used, %s remaining, %s reset',
                      self.used, self.remaining, self.seconds_to_reset)

        if self.remaining <= 0:
            self.next_request_timestamp = time.time() + self.seconds_to_reset
        else:
            self.next_request_timestamp = None
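
    # An illustrative walk-through with hypothetical header values: a
    # response carrying {'x-ratelimit-used': '598', 'x-ratelimit-remaining':
    # '2', 'x-ratelimit-reset': '45'} updates the counters but leaves
    # next_request_timestamp set to None, so requests continue unthrottled.
    # Once a response reports 'x-ratelimit-remaining': '0',
    # next_request_timestamp becomes time.time() + 45 and _delay() will
    # sleep until the period resets before the next request is sent.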

    def _clear_timeouts(self, cache_timeout):
        """
        Clear the cache of timed out results.
        """
        for key in list(self.timeouts):
            if timer() - self.timeouts[key] > cache_timeout:
                del self.timeouts[key]
                del self.cache[key]

    def clear_cache(self):
        """Remove all items from the cache."""
        self.cache = {}
        self.timeouts = {}

    def evict(self, urls):
        """Remove items from cache matching URLs.

        Return the number of items removed.
        """
        if isinstance(urls, six.text_type):
            urls = [urls]
        urls = set(normalize_url(url) for url in urls)

        retval = 0
        for key in list(self.cache):
            if key[0] in urls:
                retval += 1
                del self.cache[key]
                del self.timeouts[key]
        return retval
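
    # Usage note (the URL is hypothetical): when a cached listing goes
    # stale, handler.evict('https://www.reddit.com/r/python/hot') removes
    # any cached responses whose normalized URL matches, and returns the
    # number of cache entries that were removed.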

    def request(self, _cache_key, _cache_ignore, _cache_timeout, **kwargs):
        """
        This is a wrapper function that handles the caching of the request.

        See DefaultHandler.with_cache for reference.
        """
        if _cache_key:
            # Pop the request's session cookies from the cache key.
            # These appear to be unreliable and change with every request.
            # Also, with the introduction of OAuth I don't think that
            # cookies are being used to store anything that differentiates
            # API requests anyway.
            url, items = _cache_key
            _cache_key = (url, (items[0], items[1], items[3], items[4]))

        if kwargs['request'].method != 'GET':
            # I added this check for RTV; I have no idea why PRAW would
            # ever want to cache POST/PUT/DELETE requests.
            _cache_ignore = True

        if _cache_ignore:
            return self._request(**kwargs)

        self._clear_timeouts(_cache_timeout)
        if _cache_key in self.cache:
            if self.cache_hit_callback:
                self.cache_hit_callback(_cache_key)
            return self.cache[_cache_key]

        result = self._request(**kwargs)

        # The handlers don't call `raise_for_status`, so avoid caching
        # responses whose status codes will later be raised as exceptions.
        if result.status_code not in (200, 302):
            return result

        self.timeouts[_cache_key] = timer()
        self.cache[_cache_key] = result
        return result

    def _request(self, request, proxies, timeout, verify, **_):
        """
        This is where we apply rate limiting and make the HTTP request.
        """
        settings = self.http.merge_environment_settings(
            request.url, proxies, False, verify, None)

        self._delay()
        response = self.http.send(
            request, timeout=timeout, allow_redirects=False, **settings)
        self._update(response.headers)
        return response
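
A minimal sketch of how a handler like this can be attached to the bundled PRAW 3 client, which accepts a handler keyword argument. The import path, user agent string, subreddit, and URL below are illustrative assumptions, not code from this commit.

    from rtv.content import RequestHeaderRateLimiter  # assumed import path
    from rtv.packages import praw

    # Route every HTTP request through the rate-limiting, caching handler
    handler = RequestHeaderRateLimiter()
    reddit = praw.Reddit(user_agent='rate-limit example (illustrative)',
                         handler=handler)

    # GET responses are cached by the handler; repeating this call within
    # the cache timeout returns the stored response instead of hitting
    # reddit again
    for submission in reddit.get_subreddit('python').get_hot(limit=5):
        print(submission.title)

    # When a cached listing goes stale, evict it so the next GET re-fetches
    handler.evict('https://www.reddit.com/r/python/hot')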