Get common user agents from the calibre website logs

This commit is contained in:
Kovid Goyal 2020-12-27 10:21:14 +05:30
parent ffdf794246
commit 18a3d945c6
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 24 additions and 67 deletions

View File

@ -3,19 +3,24 @@
# License: GPLv3 Copyright: 2017, Kovid Goyal <kovid at kovidgoyal.net> # License: GPLv3 Copyright: 2017, Kovid Goyal <kovid at kovidgoyal.net>
import bz2
import os import os
import json import sys
import gzip
import io
from datetime import datetime from datetime import datetime
from urllib.request import urlopen
from setup import download_securely
from polyglot.builtins import filter from polyglot.builtins import filter
from setup import download_securely
# True when the CI environment variable is set to 'true' (case-insensitive).
is_ci = os.getenv('CI', '').lower() == 'true'
def download_from_calibre_server(url):
    """Download *url* and return the response body as bytes.

    The server certificate is verified against the CA certificate file
    ``calibre-ebook-root-CA.crt`` found under ``sys.resources_location``.
    """
    # Imported locally so this function carries its own dependency.
    import ssl
    ca = os.path.join(sys.resources_location, 'calibre-ebook-root-CA.crt')
    # The ``cafile`` argument of urlopen() was deprecated in Python 3.6 and
    # removed in 3.12; building the SSL context explicitly is the supported
    # equivalent and works on every Python 3 version.
    ctx = ssl.create_default_context(cafile=ca)
    with urlopen(url, context=ctx) as f:
        return f.read()
def filter_ans(ans):
    """Strip every entry of *ans* and return the non-empty results as a list."""
    stripped = (entry.strip() for entry in ans)
    return [entry for entry in stripped if entry]
@ -39,18 +44,15 @@ def common_user_agents():
'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko', 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
] ]
print('Getting recent UAs...') print('Getting recent UAs...')
raw = download_securely( raw = download_from_calibre_server('https://code.calibre-ebook.com/ua-popularity')
'https://raw.githubusercontent.com/intoli/user-agents/master/src/user-agents.json.gz') ans = {}
data = json.loads(gzip.GzipFile(fileobj=io.BytesIO(raw)).read()) for line in bz2.decompress(raw).decode('utf-8').splitlines():
uas = [] count, ua = line.partition(':')[::2]
for item in data: count = int(count.strip())
ua = item['userAgent'] ua = ua.strip()
if not ua.startswith('Opera'): if len(ua) > 20:
uas.append(ua) ans[ua] = count
ans = filter_ans(uas)[:256] return ans, list(sorted(ans, reverse=True, key=ans.__getitem__))
if not ans:
raise ValueError('Failed to download list of common UAs')
return ans
def firefox_versions(): def firefox_versions():
@ -103,7 +105,7 @@ def chrome_versions():
def all_desktop_platforms(user_agents): def all_desktop_platforms(user_agents):
ans = set() ans = set()
for ua in user_agents: for ua in user_agents:
if 'Mobile/' not in ua and ('Firefox/' in ua or 'Chrome/' in ua): if ' Mobile ' not in ua and 'Mobile/' not in ua and ('Firefox/' in ua or 'Chrome/' in ua):
plat = ua.partition('(')[2].partition(')')[0] plat = ua.partition('(')[2].partition(')')[0]
parts = plat.split(';') parts = plat.split(';')
if 'Firefox/' in ua: if 'Firefox/' in ua:
@ -113,10 +115,13 @@ def all_desktop_platforms(user_agents):
def get_data():
    """Gather all user-agent related data into one dict.

    The returned dict has the keys ``chrome_versions``, ``firefox_versions``,
    ``common_user_agents``, ``user_agents_popularity``, ``timestamp`` (an
    ISO 8601 UTC time ending in ``+00:00``) and ``desktop_platforms``.
    """
    # datetime.utcnow() is deprecated since Python 3.12; an aware "now" in UTC
    # serialises to the same string, including the '+00:00' suffix.
    from datetime import timezone
    ua_freq_map, common = common_user_agents()
    ans = {
        'chrome_versions': chrome_versions(),
        'firefox_versions': firefox_versions(),
        'common_user_agents': common,
        'user_agents_popularity': ua_freq_map,
        'timestamp': datetime.now(timezone.utc).isoformat(),
    }
    # Desktop platforms are derived from the common user agents gathered above.
    ans['desktop_platforms'] = list(all_desktop_platforms(ans['common_user_agents']))
    return ans

View File

@ -23,62 +23,14 @@ def all_firefox_versions(limit=10):
return user_agent_data()['firefox_versions'][:limit] return user_agent_data()['firefox_versions'][:limit]
def random_firefox_version():
    """Pick one entry at random from all_firefox_versions()."""
    candidates = all_firefox_versions()
    return random.choice(candidates)
def random_desktop_platform():
    """Pick one entry at random from the stored ``desktop_platforms`` list."""
    platforms = user_agent_data()['desktop_platforms']
    return random.choice(platforms)
def render_firefox_ua(platform, version):
    """Build a Firefox User-Agent string for *platform* and *version*."""
    # https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/User-Agent/Firefox
    template = 'Mozilla/5.0 ({p}; rv:{ver}) Gecko/20100101 Firefox/{ver}'
    return template.format(p=platform, ver=version)
def random_firefox_ua():
    """Return a Firefox User-Agent string for a random platform and version.

    The rendered string was previously computed but never returned, so
    callers always received ``None``.
    """
    return render_firefox_ua(random_desktop_platform(), random_firefox_version())
def all_chrome_versions(limit=10):
    """Return the ``[:limit]`` slice of the stored ``chrome_versions`` list."""
    versions = user_agent_data()['chrome_versions']
    return versions[:limit]
def random_chrome_version():
    """Pick one entry at random from all_chrome_versions()."""
    candidates = all_chrome_versions()
    return random.choice(candidates)
def render_chrome_ua(platform, version):
    """Build a Chrome User-Agent string for *platform*.

    *version* is a mapping providing the ``webkit_version`` and
    ``chrome_version`` entries that are substituted into the string.
    """
    fields = {
        'p': platform,
        'wv': version['webkit_version'],
        'cv': version['chrome_version'],
    }
    template = 'Mozilla/5.0 ({p}) AppleWebKit/{wv} (KHTML, like Gecko) Chrome/{cv} Safari/{wv}'
    return template.format(**fields)
def random_chrome_ua():
    """Return a Chrome User-Agent string for a random platform and version."""
    platform = random_desktop_platform()
    version = random_chrome_version()
    return render_chrome_ua(platform, version)
def all_user_agents():
    """Return a tuple of rendered chrome and firefox User-Agent strings.

    On first use the tuple is built by rendering every browser version
    against every desktop platform, shuffled once, and stored on the
    function object as ``all_user_agents.ans``; later calls return that
    stored tuple.
    """
    cached = getattr(all_user_agents, 'ans', None)
    if cached is not None:
        return cached
    namespace = globals()
    platforms = user_agent_data()['desktop_platforms']
    rendered = []
    for browser in ('chrome', 'firefox'):
        # Look up the per-browser helpers by name in this module.
        versions = namespace['all_%s_versions' % browser]()
        render = namespace['render_%s_ua' % browser]
        rendered.extend(render(plat, ver) for ver in versions for plat in platforms)
    random.shuffle(rendered)
    all_user_agents.ans = cached = tuple(rendered)
    return cached
def random_user_agent():
    """Pick one entry at random from all_user_agents()."""
    pool = all_user_agents()
    return random.choice(pool)
def accept_header_for_ua(ua): def accept_header_for_ua(ua):
if 'Firefox/' in ua: if 'Firefox/' in ua:
return 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8' return 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'