diff --git a/ebook_converter/__init__.py b/ebook_converter/__init__.py
index 4b9a076..53e8dd8 100644
--- a/ebook_converter/__init__.py
+++ b/ebook_converter/__init__.py
@@ -267,131 +267,6 @@ def extract(path, dir):
     extractor(path, dir)
 
 
-def get_proxies(debug=True):
-    proxies = urllib.request.getproxies()
-    for key, proxy in list(proxies.items()):
-        if not proxy or '..' in proxy or key == 'auto':
-            del proxies[key]
-            continue
-        if proxy.startswith(key+'://'):
-            proxy = proxy[len(key)+3:]
-        if key == 'https' and proxy.startswith('http://'):
-            proxy = proxy[7:]
-        if proxy.endswith('/'):
-            proxy = proxy[:-1]
-        if len(proxy) > 4:
-            proxies[key] = proxy
-        else:
-            prints('Removing invalid', key, 'proxy:', proxy)
-            del proxies[key]
-
-    if proxies and debug:
-        prints('Using proxies:', proxies)
-    return proxies
-
-
-def get_parsed_proxy(typ='http', debug=True):
-    proxies = get_proxies(debug)
-    proxy = proxies.get(typ, None)
-    if proxy:
-        pattern = re.compile((
-            '(?:ptype://)?'
-            '(?:(?P<user>\\w+):(?P<pass>.*)@)?'
-            '(?P<host>[\\w\\-\\.]+)'
-            '(?::(?P<port>\\d+))?').replace('ptype', typ)
-        )
-
-        match = pattern.match(proxies[typ])
-        if match:
-            try:
-                ans = {
-                    'host' : match.group('host'),
-                    'port' : match.group('port'),
-                    'user' : match.group('user'),
-                    'pass' : match.group('pass')
-                }
-                if ans['port']:
-                    ans['port'] = int(ans['port'])
-            except:
-                if debug:
-                    import traceback
-                    traceback.print_exc()
-            else:
-                if debug:
-                    prints('Using http proxy', str(ans))
-                return ans
-
-
-def get_proxy_info(proxy_scheme, proxy_string):
-    '''
-    Parse all proxy information from a proxy string (as returned by
-    get_proxies). The returned dict will have members set to None when the info
-    is not available in the string. If an exception occurs parsing the string
-    this method returns None.
-    '''
-    try:
-        proxy_url = '%s://%s'%(proxy_scheme, proxy_string)
-        urlinfo = urllib.parse.urlparse(proxy_url)
-        ans = {
-            'scheme': urlinfo.scheme,
-            'hostname': urlinfo.hostname,
-            'port': urlinfo.port,
-            'username': urlinfo.username,
-            'password': urlinfo.password,
-        }
-    except Exception:
-        return None
-    return ans
-
-
-# IE 11 on windows 7
-USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko'
-USER_AGENT_MOBILE = 'Mozilla/5.0 (Windows; U; Windows CE 5.1; rv:1.8.1a3) Gecko/20060610 Minimo/0.016'
-
-
-def is_mobile_ua(ua):
-    return 'Mobile/' in ua or 'Mobile ' in ua
-
-
-def random_user_agent(choose=None, allow_ie=True):
-    from ebook_converter.utils.random_ua import common_user_agents
-    ua_list = common_user_agents()
-    ua_list = [x for x in ua_list if not is_mobile_ua(x)]
-    if not allow_ie:
-        ua_list = [x for x in ua_list if 'Trident/' not in x and 'Edge/' not in x]
-    return random.choice(ua_list) if choose is None else ua_list[choose]
-
-
-def browser(honor_time=True, max_time=2, mobile_browser=False, user_agent=None, verify_ssl_certificates=True, handle_refresh=True):
-    '''
-    Create a mechanize browser for web scraping. The browser handles cookies,
-    refresh requests and ignores robots.txt. Also uses proxy if available.
-
-    :param honor_time: If True honors pause time in refresh requests
-    :param max_time: Maximum time in seconds to wait during a refresh request
-    :param verify_ssl_certificates: If false SSL certificate errors are ignored
-    '''
-    from ebook_converter.utils.browser import Browser
-    opener = Browser(verify_ssl=verify_ssl_certificates)
-    opener.set_handle_refresh(handle_refresh, max_time=max_time, honor_time=honor_time)
-    opener.set_handle_robots(False)
-    if user_agent is None:
-        user_agent = USER_AGENT_MOBILE if mobile_browser else USER_AGENT
-    opener.addheaders = [('User-agent', user_agent)]
-    proxies = get_proxies()
-    to_add = {}
-    http_proxy = proxies.get('http', None)
-    if http_proxy:
-        to_add['http'] = http_proxy
-    https_proxy = proxies.get('https', None)
-    if https_proxy:
-        to_add['https'] = https_proxy
-    if to_add:
-        opener.set_proxies(to_add)
-
-    return opener
-
-
 def fit_image(width, height, pwidth, pheight):
     '''
     Fit image in box of width pwidth and height pheight.
diff --git a/ebook_converter/ebooks/conversion/plumber.py b/ebook_converter/ebooks/conversion/plumber.py
index e294835..a15a1cf 100644
--- a/ebook_converter/ebooks/conversion/plumber.py
+++ b/ebook_converter/ebooks/conversion/plumber.py
@@ -889,21 +889,6 @@ OptionRecommendation(name='search_replace',
                     continue
             setattr(mi, x, val)
 
-    def download_cover(self, url):
-        from ebook_converter import browser
-        from PIL import Image
-        import io
-        from ebook_converter.ptempfile import PersistentTemporaryFile
-        self.log('Downloading cover from %r'%url)
-        br = browser()
-        raw = br.open_novisit(url).read()
-        buf = io.BytesIO(raw)
-        pt = PersistentTemporaryFile('.jpg')
-        pt.close()
-        img = Image.open(buf)
-        img.convert('RGB').save(pt.name)
-        return pt.name
-
     def read_user_metadata(self):
         '''
         Read all metadata specified by the user. Command line options override
@@ -921,7 +906,8 @@ OptionRecommendation(name='search_replace',
             self.opts_to_mi(mi)
         if mi.cover:
             if mi.cover.startswith('http:') or mi.cover.startswith('https:'):
-                mi.cover = self.download_cover(mi.cover)
+                self.log.warn("TODO: Cover image is on remote server, "
+                              "implement downloading using requests")
             ext = mi.cover.rpartition('.')[-1].lower().strip()
             if ext not in ('png', 'jpg', 'jpeg', 'gif'):
                 ext = 'jpg'
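A requests-based replacement for the removed download_cover, as suggested by the TODO in read_user_metadata, could look roughly like the sketch below. This is only an illustration of the intended direction; the module-level function shape and the timeout parameter are assumptions, not part of this diff.

import io

import requests
from PIL import Image

from ebook_converter.ptempfile import PersistentTemporaryFile


def download_cover(url, timeout=30):
    # Fetch the cover bytes; raise_for_status() surfaces HTTP errors
    # instead of silently treating an error page as image data.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    # Normalize whatever format the server returned to JPEG, as the
    # removed mechanize-based method did.
    img = Image.open(io.BytesIO(response.content))
    pt = PersistentTemporaryFile('.jpg')
    pt.close()
    img.convert('RGB').save(pt.name)
    return pt.name

Whatever shape the final helper takes, its result would need to be assigned back to mi.cover: as patched, the warning path leaves the remote URL in mi.cover, so the ext = mi.cover.rpartition('.') logic that follows operates on the URL string rather than a local file path.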
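On the proxy and user-agent helpers removed from ebook_converter/__init__.py: requests makes most of that machinery unnecessary, since it consults the http_proxy/https_proxy/no_proxy environment variables on its own when trust_env is True (the default). A minimal sketch; the explicit proxy URL and the User-Agent value below are placeholders, not taken from this diff.

import requests

# Environment-derived proxies are applied automatically, replacing the
# removed get_proxies()/get_parsed_proxy() parsing.
session = requests.Session()

# Explicit per-scheme proxies can still be set when the environment is
# not enough (placeholder URL for illustration only).
session.proxies.update({
    'http': 'http://proxy.example.com:8080',
    'https': 'http://proxy.example.com:8080',
})

# A session-wide header stands in for the removed USER_AGENT constants.
session.headers['User-Agent'] = 'ebook-converter'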