Start working on using Chromium's network stack to download HTML

A bunch of websites have started using TLS fingerprinting to deny
access. Bloody morons.
This commit is contained in:
Kovid Goyal 2022-03-24 17:35:48 +05:30
parent b5370e0270
commit 2c8e5a3a36
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
4 changed files with 71 additions and 7 deletions

View File

@ -3,7 +3,7 @@ __license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid@kovidgoyal.net>' __copyright__ = '2008, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import sys, os, re, time, random, warnings import sys, os, re, time, warnings
from polyglot.builtins import codepoint_to_chr, hasenv, native_string_type from polyglot.builtins import codepoint_to_chr, hasenv, native_string_type
from math import floor from math import floor
from functools import partial from functools import partial
@ -290,18 +290,14 @@ def is_mobile_ua(ua):
def random_user_agent(choose=None, allow_ie=True): def random_user_agent(choose=None, allow_ie=True):
from calibre.utils.random_ua import common_user_agents, user_agents_popularity_map from calibre.utils.random_ua import common_user_agents, choose_randomly_by_popularity
ua_list = common_user_agents() ua_list = common_user_agents()
ua_list = tuple(x for x in ua_list if not is_mobile_ua(x)) ua_list = tuple(x for x in ua_list if not is_mobile_ua(x))
if not allow_ie: if not allow_ie:
ua_list = tuple(x for x in ua_list if 'Trident/' not in x) ua_list = tuple(x for x in ua_list if 'Trident/' not in x)
if choose is not None: if choose is not None:
return ua_list[choose] return ua_list[choose]
pm = user_agents_popularity_map() return choose_randomly_by_popularity(ua_list)
weights = None
if pm:
weights = tuple(map(pm.__getitem__, ua_list))
return random.choices(ua_list, weights=weights)[0]
def browser(honor_time=True, max_time=2, user_agent=None, verify_ssl_certificates=True, handle_refresh=True, **kw): def browser(honor_time=True, max_time=2, user_agent=None, verify_ssl_certificates=True, handle_refresh=True, **kw):

View File

View File

@ -0,0 +1,50 @@
#!/usr/bin/env python
# vim:fileencoding=utf-8
# License: GPL v3 Copyright: 2022, Kovid Goyal <kovid at kovidgoyal.net>
import os
from functools import lru_cache
from qt.core import QApplication
from qt.webengine import QWebEnginePage, QWebEngineProfile, QWebEngineSettings
from calibre.constants import cache_dir
@lru_cache(maxsize=4)
def create_profile(cache_name='simple', allow_js=False):
    """Build a restricted QWebEngineProfile for scraping and memoize it.

    The result is cached per ``(cache_name, allow_js)`` pair by ``lru_cache``,
    so repeated calls with the same arguments return the same profile object.

    :param cache_name: Name given to the profile; also used as the last
        component of the on-disk cache path under ``cache_dir()/scraper``.
    :param allow_js: Whether JavaScript execution is enabled for pages that
        use this profile.
    :return: The configured ``QWebEngineProfile``.
    """
    from calibre.utils.random_ua import random_common_chrome_user_agent
    ans = QWebEngineProfile(cache_name, QApplication.instance())
    ans.setHttpUserAgent(random_common_chrome_user_agent())
    ans.setHttpCacheMaximumSize(0)  # managed by webengine
    ans.setCachePath(os.path.join(cache_dir(), 'scraper', cache_name))
    s = ans.settings()
    a = s.setAttribute
    a(QWebEngineSettings.WebAttribute.PluginsEnabled, False)
    a(QWebEngineSettings.WebAttribute.JavascriptEnabled, allow_js)
    s.setUnknownUrlSchemePolicy(QWebEngineSettings.UnknownUrlSchemePolicy.DisallowUnknownUrlSchemes)
    a(QWebEngineSettings.WebAttribute.JavascriptCanOpenWindows, False)
    a(QWebEngineSettings.WebAttribute.JavascriptCanAccessClipboard, False)
    # ensure javascript cannot read from local files
    a(QWebEngineSettings.WebAttribute.LocalContentCanAccessFileUrls, False)
    a(QWebEngineSettings.WebAttribute.AllowWindowActivationFromJavaScript, False)
    # Return the profile itself, not its settings object: the caller hands
    # this value to QWebEnginePage.__init__ as the page's profile.
    return ans
class SimpleScraper(QWebEnginePage):
    """QWebEnginePage bound to the cached profile for ``source``.

    Audio is muted on construction, and the JavaScript dialog/console hooks
    are overridden so that they are answered without any user interaction.
    """

    def __init__(self, source, parent=None):
        # One profile per source name, obtained from create_profile().
        profile = create_profile(source)
        super().__init__(profile, parent=parent)
        self.setAudioMuted(True)

    def javaScriptAlert(self, url, msg):
        """Ignore alert() requests."""
        return None

    def javaScriptConfirm(self, url, msg):
        """Answer every confirm() request affirmatively."""
        return True

    def javaScriptPrompt(self, url, msg, defval):
        """Accept every prompt() request, replying with its default value."""
        return (True, defval)

    def javaScriptConsoleMessage(self, level, message, line_num, source_id):
        """Discard console messages."""
        return None

View File

@ -25,6 +25,24 @@ def common_user_agents():
return user_agent_data()['common_user_agents'] return user_agent_data()['common_user_agents']
def common_chrome_user_agents():
    """Yield every common user agent string that contains ``'Chrome/'``."""
    agents = user_agent_data()['common_user_agents']
    yield from (agent for agent in agents if 'Chrome/' in agent)
def choose_randomly_by_popularity(ua_list):
    """Pick one entry from ``ua_list`` at random.

    When the popularity map is non-empty, each entry is weighted by its value
    in that map; otherwise the choice is unweighted.

    :param ua_list: Sequence of user agent strings to choose from.
    :return: The selected user agent string.
    """
    popularity = user_agents_popularity_map()
    if popularity:
        weights = tuple(popularity[agent] for agent in ua_list)
    else:
        weights = None
    picked = random.choices(ua_list, weights=weights)
    return picked[0]
def random_common_chrome_user_agent():
    """Return a randomly chosen common Chrome user agent string."""
    chrome_agents = tuple(common_chrome_user_agents())
    return choose_randomly_by_popularity(chrome_agents)
def user_agents_popularity_map(): def user_agents_popularity_map():
return user_agent_data().get('user_agents_popularity', {}) return user_agent_data().get('user_agents_popularity', {})