diff --git a/.gitignore b/.gitignore
index 9b364e6b05..6a108bef5d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -26,7 +26,7 @@ resources/content-server/locales.zip
 resources/content-server/mathjax.zip.xz
 resources/content-server/mathjax.version
 resources/mozilla-ca-certs.pem
-resources/common-user-agents.txt
+resources/user-agent-data.json
 icons/icns/*.iconset
 setup/installer/windows/calibre/build.log
 tags
diff --git a/setup/browser_data.py b/setup/browser_data.py
new file mode 100644
index 0000000000..16b3f1fbab
--- /dev/null
+++ b/setup/browser_data.py
@@ -0,0 +1,138 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+# License: GPLv3 Copyright: 2017, Kovid Goyal
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import os
+import re
+from datetime import datetime
+
+from setup import download_securely
+
+# When the CI environment variable is "true" canned data is used instead of the network
+is_ci = os.environ.get('CI', '').lower() == 'true'
+
+
+def filter_ans(ans):
+    ''' Return the non-empty, whitespace stripped strings from ans as a list '''
+    # A real list (rather than a lazy filter object) keeps the emptiness
+    # checks in the callers meaningful on Python 3 as well
+    return [x for x in (item.strip() for item in ans) if x]
+
+
+def common_user_agents():
+    ''' Return the list of common user agent strings (a canned list on CI) '''
+    if is_ci:
+        return [
+            # IE 11 - windows 10
+            'Mozilla/5.0 (Windows NT 10.0; Trident/7.0; rv:11.0) like Gecko',
+            # IE 11 - windows 8.1
+            'Mozilla/5.0 (Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko',
+            # IE 11 - windows 8
+            'Mozilla/5.0 (Windows NT 6.2; Trident/7.0; rv:11.0) like Gecko',
+            # IE 11 - windows 7
+            'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko',
+            # 32bit IE 11 on 64 bit win 10
+            'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
+            # 32bit IE 11 on 64 bit win 8.1
+            'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
+            # 32bit IE 11 on 64 bit win 7
+            'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
+        ]
+    print('Getting recent UAs...')
+    raw = download_securely(
+        'https://techblog.willshouse.com/2012/01/03/most-common-user-agents/').decode('utf-8')
+    # NOTE(review): as written this captures the first run of text that
+    # contains no '<'; presumably the pattern should be anchored on the
+    # element holding the list -- confirm against the live page
+    m = re.search(r'([^<]+)', raw)
+    # No match is reported as the ValueError below, not an AttributeError
+    ans = filter_ans(m.group(1).splitlines()) if m is not None else []
+    if not ans:
+        raise ValueError('Failed to download list of common UAs')
+    return ans
+
+
+def firefox_versions():
+    ''' Return the Firefox version strings listed on the mozilla.org releases page '''
+    if is_ci:
+        return '51.0 50.0'.split()
+    print('Getting firefox versions...')
+    import html5lib
+    raw = download_securely(
+        'https://www.mozilla.org/en-US/firefox/releases/').decode('utf-8')
+    root = html5lib.parse(raw, treebuilder='lxml', namespaceHTMLElements=False)
+    lists = root.xpath('//div[@id="main-content"]/ol')
+    # A missing list is reported as the ValueError below, not an IndexError
+    ans = filter_ans(lists[0].xpath('descendant::li/strong/a[@href]/text()')) if lists else []
+    if not ans:
+        raise ValueError('Failed to download list of firefox versions')
+    return ans
+
+
+def chrome_versions():
+    '''
+    Return a list of dicts with the keys date, chrome_version and
+    webkit_version, scraped from the Wikipedia Chrome version history page.
+    The rows are returned in the reverse of their order in the table.
+    '''
+    if is_ci:
+        return []
+    print('Getting chrome versions...')
+    import html5lib
+    raw = download_securely(
+        'https://en.wikipedia.org/wiki/Google_Chrome_version_history').decode('utf-8')
+    root = html5lib.parse(raw, treebuilder='lxml', namespaceHTMLElements=False)
+    table = root.xpath('//*[@id="mw-content-text"]//tbody')[-1]
+    ans = []
+    for tr in table.iterchildren('tr'):
+        cells = tuple(tr.iterchildren('td'))
+        # Rows with fewer than three td children cannot be parsed below
+        if len(cells) < 3:
+            continue
+        if not cells[2].text or not cells[2].text.strip():
+            continue
+        # A cell without a style attribute carries neither colour marker
+        s = cells[0].get('style') or ''
+        if '#a0e75a' not in s and 'salmon' not in s:
+            break
+        chrome_version = cells[0].text.strip()
+        # strptime() raises if the leading token is not a YYYY-MM-DD date
+        day = cells[1].text.strip().split()[0]
+        ts = datetime.strptime(day, '%Y-%m-%d').date().strftime('%Y-%m-%d')
+        try:
+            webkit_version = cells[2].text.strip().split()[1]
+        except IndexError:
+            continue
+        ans.append({'date': ts, 'chrome_version': chrome_version,
+                    'webkit_version': webkit_version})
+    return list(reversed(ans))
+
+
+def all_desktop_platforms(user_agents):
+    '''
+    Return the set of platform strings (the text between the first pair of
+    parentheses) found in the non-mobile Firefox and Chrome user agents
+    '''
+    ans = set()
+    for ua in user_agents:
+        if 'Mobile/' not in ua and ('Firefox/' in ua or 'Chrome/' in ua):
+            plat = ua.partition('(')[2].partition(')')[0]
+            parts = plat.split(';')
+            if 'Firefox/' in ua:
+                # Drop the last field, presumably the rv: token -- verify
+                del parts[-1]
+            ans.add(';'.join(parts))
+    return ans
+
+
+def get_data():
+    ''' Collect all the scraped data into a single dict '''
+    ans = {
+        'chrome_versions': chrome_versions(),
+        'firefox_versions': firefox_versions(),
+        'common_user_agents': common_user_agents(),
+    }
+    ans['desktop_platforms'] = list(all_desktop_platforms(ans['common_user_agents']))
+    return ans
diff --git a/setup/resources.py b/setup/resources.py
index de104eb5a9..7fb34bd89d 100644
--- a/setup/resources.py
+++ b/setup/resources.py
@@ -257,38 +257,15 @@ class CACerts(Command):  # {{{
 
 class RecentUAs(Command):  # {{{
 
-    description = 'Get updated list of recent browser user agents'
-    UA_PATH = os.path.join(Command.RESOURCES, 'common-user-agents.txt')
-
-    def get_list(self):
-        if is_ci:
-            # Dont hammer the server from CI
-            return [
-                # IE 11 - windows 10
-                'Mozilla/5.0 (Windows NT 10.0; Trident/7.0; rv:11.0) like Gecko',
-                # IE 11 - windows 8.1
-                'Mozilla/5.0 (Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko',
-                # IE 11 - windows 8
-                'Mozilla/5.0 (Windows NT 6.2; Trident/7.0; rv:11.0) like Gecko',
-                # IE 11 - windows 7
-                'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko',
-                # 32bit IE 11 on 64 bit win 10
-                'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
-                # 32bit IE 11 on 64 bit win 8.1
-                'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
-                # 32bit IE 11 on 64 bit win 7
-                'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
-            ]
-        raw = download_securely('https://techblog.willshouse.com/2012/01/03/most-common-user-agents/').decode('utf-8')
-        lines = re.search(r'([^<]+)', raw).group(1).splitlines()
-        return [x.strip() for x in lines if x.strip()]
+    description = 'Get updated list of common browser user agents'
+    UA_PATH = os.path.join(Command.RESOURCES, 'user-agent-data.json')
 
     def run(self, opts):
-        lines = self.get_list()[:10]
-        if not lines:
-            raise RuntimeError('Failed to download list of common user agents')
+        from setup.browser_data import get_data
+        data = get_data()
         with open(self.UA_PATH, 'wb') as f:
-            f.write('\n'.join(lines).encode('ascii'))
+            # The file is opened in binary mode, so encode the JSON text explicitly
+            f.write(json.dumps(data, indent=2).encode('utf-8'))
 # }}}
 
 
diff --git a/src/calibre/__init__.py b/src/calibre/__init__.py
index dfd897ba07..faa54dd0f9 100644
--- a/src/calibre/__init__.py
+++ b/src/calibre/__init__.py
@@ -397,12 +397,11 @@ USER_AGENT_MOBILE = 'Mozilla/5.0 (Windows; U; Windows CE 5.1; rv:1.8.1a3) Gecko/
 
 
 def random_user_agent(choose=None, allow_ie=True):
-    try:
-        ua_list = random_user_agent.ua_list
-    except AttributeError:
-        ua_list = random_user_agent.ua_list = P('common-user-agents.txt', data=True, allow_user_override=False).decode('utf-8').splitlines()
+    from calibre.utils.random_ua import common_user_agents
+    # Build real lists, random.choice() and indexing need a sequence on Python 3
+    ua_list = [x for x in common_user_agents() if 'Mobile/' not in x]
     if not allow_ie:
-        ua_list = filter(lambda x: 'Firefox/' in x or 'Chrome/' in x, ua_list)
+        ua_list = [x for x in ua_list if 'Trident/' not in x and 'Edge/' not in x]
     return random.choice(ua_list) if choose is None else ua_list[choose]
 
 
diff --git a/src/calibre/utils/random_ua.py b/src/calibre/utils/random_ua.py
new file mode 100644
index 0000000000..ffde047c8b
--- /dev/null
+++ b/src/calibre/utils/random_ua.py
@@ -0,0 +1,59 @@
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+# License: GPLv3 Copyright: 2017, Kovid Goyal
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import json
+import random
+
+
+def user_agent_data():
+    ''' Return the parsed user-agent-data.json resource, loading it on first use '''
+    ans = getattr(user_agent_data, 'ans', None)
+    if ans is None:
+        raw = P('user-agent-data.json', data=True, allow_user_override=False)
+        # Decode explicitly, json.loads() accepts bytes only from Python 3.6 on
+        ans = user_agent_data.ans = json.loads(raw.decode('utf-8'))
+    return ans
+
+
+def common_user_agents():
+    ''' Return the list of common user agent strings '''
+    return user_agent_data()['common_user_agents']
+
+
+def random_firefox_version():
+    ''' Return a random choice from the first seven listed Firefox versions '''
+    versions = user_agent_data()['firefox_versions'][:7]
+    return random.choice(versions)
+
+
+def random_desktop_platform():
+    ''' Return a random desktop platform string '''
+    return random.choice(user_agent_data()['desktop_platforms'])
+
+
+def random_firefox_ua():
+    ''' Return a Firefox user agent with a random platform and version '''
+    # https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent/Firefox
+    return 'Mozilla/5.0 ({p}; rv:{ver}) Gecko/20100101 Firefox/{ver}'.format(
+        p=random_desktop_platform(), ver=random_firefox_version())
+
+
+def random_chrome_version():
+    ''' Return a random choice from the first seven listed Chrome version dicts '''
+    versions = user_agent_data()['chrome_versions'][:7]
+    return random.choice(versions)
+
+
+def random_chrome_ua():
+    ''' Return a Chrome user agent with a random platform and version '''
+    v = random_chrome_version()
+    return 'Mozilla/5.0 ({p}) AppleWebKit/{wv} (KHTML, like Gecko) Chrome/{cv} Safari/{wv}'.format(
+        p=random_desktop_platform(), wv=v['webkit_version'], cv=v['chrome_version'])
+
+
+def random_user_agent():
+    ''' Return a random Firefox or Chrome user agent string '''
+    return random.choice((random_chrome_ua, random_firefox_ua))()