From 18a3d945c6dce5c7633413fdd406e110792a8efd Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sun, 27 Dec 2020 10:21:14 +0530 Subject: [PATCH] Get common user agents from the calibre website logs --- setup/browser_data.py | 43 ++++++++++++++++-------------- src/calibre/utils/random_ua.py | 48 ---------------------------------- 2 files changed, 24 insertions(+), 67 deletions(-) diff --git a/setup/browser_data.py b/setup/browser_data.py index 7646ea9da2..a167055f1e 100644 --- a/setup/browser_data.py +++ b/setup/browser_data.py @@ -3,19 +3,24 @@ # License: GPLv3 Copyright: 2017, Kovid Goyal +import bz2 import os -import json -import gzip -import io +import sys from datetime import datetime - -from setup import download_securely +from urllib.request import urlopen from polyglot.builtins import filter +from setup import download_securely is_ci = os.environ.get('CI', '').lower() == 'true' +def download_from_calibre_server(url): + ca = os.path.join(sys.resources_location, 'calibre-ebook-root-CA.crt') + with urlopen(url, cafile=ca) as f: + return f.read() + + def filter_ans(ans): return list(filter(None, (x.strip() for x in ans))) @@ -39,18 +44,15 @@ def common_user_agents(): 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko', ] print('Getting recent UAs...') - raw = download_securely( - 'https://raw.githubusercontent.com/intoli/user-agents/master/src/user-agents.json.gz') - data = json.loads(gzip.GzipFile(fileobj=io.BytesIO(raw)).read()) - uas = [] - for item in data: - ua = item['userAgent'] - if not ua.startswith('Opera'): - uas.append(ua) - ans = filter_ans(uas)[:256] - if not ans: - raise ValueError('Failed to download list of common UAs') - return ans + raw = download_from_calibre_server('https://code.calibre-ebook.com/ua-popularity') + ans = {} + for line in bz2.decompress(raw).decode('utf-8').splitlines(): + count, ua = line.partition(':')[::2] + count = int(count.strip()) + ua = ua.strip() + if len(ua) > 20: + ans[ua] = count + return ans, list(sorted(ans, reverse=True, key=ans.__getitem__)) def firefox_versions(): @@ -103,7 +105,7 @@ def chrome_versions(): def all_desktop_platforms(user_agents): ans = set() for ua in user_agents: - if 'Mobile/' not in ua and ('Firefox/' in ua or 'Chrome/' in ua): + if ' Mobile ' not in ua and 'Mobile/' not in ua and ('Firefox/' in ua or 'Chrome/' in ua): plat = ua.partition('(')[2].partition(')')[0] parts = plat.split(';') if 'Firefox/' in ua: @@ -113,10 +115,13 @@ def all_desktop_platforms(user_agents): def get_data(): + ua_freq_map, common = common_user_agents() ans = { 'chrome_versions': chrome_versions(), 'firefox_versions': firefox_versions(), - 'common_user_agents': common_user_agents(), + 'common_user_agents': common, + 'user_agents_popularity': ua_freq_map, + 'timestamp': datetime.utcnow().isoformat() + '+00:00', } ans['desktop_platforms'] = list(all_desktop_platforms(ans['common_user_agents'])) return ans diff --git a/src/calibre/utils/random_ua.py b/src/calibre/utils/random_ua.py index b825c0922f..79ebdee0b8 100644 --- a/src/calibre/utils/random_ua.py +++ b/src/calibre/utils/random_ua.py @@ -23,62 +23,14 @@ def all_firefox_versions(limit=10): return user_agent_data()['firefox_versions'][:limit] -def random_firefox_version(): - return random.choice(all_firefox_versions()) - - def random_desktop_platform(): return random.choice(user_agent_data()['desktop_platforms']) -def render_firefox_ua(platform, version): - # https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent/Firefox - return 'Mozilla/5.0 ({p}; rv:{ver}) Gecko/20100101 Firefox/{ver}'.format( - p=platform, ver=version) - - -def random_firefox_ua(): - render_firefox_ua(random_desktop_platform(), random_firefox_version()) - - def all_chrome_versions(limit=10): return user_agent_data()['chrome_versions'][:limit] -def random_chrome_version(): - return random.choice(all_chrome_versions()) - - -def render_chrome_ua(platform, version): - return 'Mozilla/5.0 ({p}) AppleWebKit/{wv} (KHTML, like Gecko) Chrome/{cv} Safari/{wv}'.format( - p=platform, wv=version['webkit_version'], cv=version['chrome_version']) - - -def random_chrome_ua(): - return render_chrome_ua(random_desktop_platform(), random_chrome_version()) - - -def all_user_agents(): - ans = getattr(all_user_agents, 'ans', None) - if ans is None: - uas = [] - g = globals() - platforms = user_agent_data()['desktop_platforms'] - for b in ('chrome', 'firefox'): - versions = g['all_%s_versions' % b]() - func = g['render_%s_ua' % b] - for v in versions: - for p in platforms: - uas.append(func(p, v)) - random.shuffle(uas) - ans = all_user_agents.ans = tuple(uas) - return ans - - -def random_user_agent(): - return random.choice(all_user_agents()) - - def accept_header_for_ua(ua): if 'Firefox/' in ua: return 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'