Switch over to the Qt browser, rather than the WebEngine download-based browser, as the alternative to mechanize

Seems to work better. I am guessing the download() implementation in
QtWebEngine restricts itself to HTTP/1.1.
This commit is contained in:
Kovid Goyal 2024-08-14 12:45:23 +05:30
parent e28f9c3ed3
commit e8453ed590
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@@ -428,10 +428,9 @@ class BasicNewsRecipe(Recipe):
recipe_specific_options = None recipe_specific_options = None
#: The simulated browser engine to use when downloading from servers. The default is to use the Python mechanize #: The simulated browser engine to use when downloading from servers. The default is to use the Python mechanize
#: browser engine. An alternate is "chromium" which will use the network engine from the Chromium web browser instead. #: browser engine. An alternate is "qt" which will use the network engine from the Qt toolkit.
#: The mechanize engine supports logging in, the Chromium engine does not. However, the Chromium engine supports HTTP/2 and #: The mechanize engine supports logging in, the Qt engine does not. However, the Qt engine supports HTTP/2 and
#: similar technologies and also is harder for bot interception services to fingerprint. To customize the Chromium based #: similar technologies and also is harder for bot interception services to fingerprint.
#: browser, such as adding headers or cookies override the get_chromium_browser() method in your recipe.
browser_type = 'mechanize' browser_type = 'mechanize'
#: Set to False if you do not want to use gzipped transfers with the mechanize browser. #: Set to False if you do not want to use gzipped transfers with the mechanize browser.
@@ -541,14 +540,23 @@ class BasicNewsRecipe(Recipe):
Return a browser instance used to fetch documents from the web. By default Return a browser instance used to fetch documents from the web. By default
it returns a `mechanize <https://mechanize.readthedocs.io/en/latest/>`_ it returns a `mechanize <https://mechanize.readthedocs.io/en/latest/>`_
browser instance that supports cookies, ignores robots.txt, handles browser instance that supports cookies, ignores robots.txt, handles
refreshes and has a mozilla firefox user agent. refreshes and has a random common user agent.
If your recipe requires that you login first, override this method To customize the browser override this method in your sub-class as::
in your subclass. For example, the following code is used in the New York
Times recipe to login for full access::
def get_browser(self): def get_browser(self, *a, **kw):
br = BasicNewsRecipe.get_browser(self) br = super().get_browser(*a, **kw)
# Add some headers
br.addheaders += [
('My-Header', 'one'),
('My-Header2', 'two'),
]
# Set some cookies
br.set_simple_cookie('name', 'value')
br.set_simple_cookie('name2', 'value2', domain='.mydomain.com')
# Make a POST request with some data
br.open('https://someurl.com', {'username': 'def', 'password': 'pwd'}).read()
# Do a login via a simple web form (only supported with mechanize browsers)
if self.username is not None and self.password is not None: if self.username is not None and self.password is not None:
br.open('https://www.nytimes.com/auth/login') br.open('https://www.nytimes.com/auth/login')
br.select_form(name='login') br.select_form(name='login')
@@ -558,38 +566,20 @@ class BasicNewsRecipe(Recipe):
return br return br
''' '''
if self.browser_type == 'chromium':
return self.get_chromium_browser()
if 'user_agent' not in kwargs: if 'user_agent' not in kwargs:
# More and more news sites are serving JPEG XR images to IE # More and more news sites are serving JPEG XR images to IE
ua = getattr(self, 'last_used_user_agent', None) or self.calibre_most_common_ua or random_user_agent(allow_ie=False) ua = getattr(self, 'last_used_user_agent', None) or self.calibre_most_common_ua or random_user_agent(allow_ie=False)
kwargs['user_agent'] = self.last_used_user_agent = ua kwargs['user_agent'] = self.last_used_user_agent = ua
self.log('Using user agent:', kwargs['user_agent']) self.log('Using user agent:', kwargs['user_agent'])
if self.browser_type == 'qt':
from calibre.scraper.qt import Browser
return Browser(user_agent=kwargs['user_agent'])
br = browser(*args, **kwargs) br = browser(*args, **kwargs)
br.addheaders += [('Accept', '*/*')] br.addheaders += [('Accept', '*/*')]
if self.handle_gzip: if self.handle_gzip:
br.set_handle_gzip(True) br.set_handle_gzip(True)
return br return br
def get_chromium_browser(self, *a, **kw):
    '''
    Get a "browser" that uses the Chromium network stack for support of HTTP/2 and HTTP/3 and a TLS fingerprint identical
    to that of a normal browser. Customizing the browser is simple::

        br = super().get_chromium_browser()
        # Adding headers that are added to every network request
        br.addheaders += [
            ('My-Header', 'Some value'),
            ('Another-Header', 'another value'),
        ]
        # Changing the user agent
        br.set_user_agent('some user agent')
        # Adding cookies
        br.set_simple_cookie('cookie-name', 'cookie-value')

    Any positional or keyword arguments are accepted but are not used by
    this implementation: the browser is always created with no arguments.

    :return: A new ``Browser`` instance from ``calibre.scraper.fetch``.
    '''
    # Function-scope import: calibre.scraper.fetch is only imported when
    # this method is actually called.
    from calibre.scraper.fetch import Browser
    return Browser()
def clone_browser(self, br): def clone_browser(self, br):
''' '''
Clone the browser br. Cloned browsers are used for multi-threaded Clone the browser br. Cloned browsers are used for multi-threaded