Switch over to the Qt browser, rather than the WebEngine download-based browser, as the alternative to mechanize

Seems to work better. I am guessing the download() implementation in QtWebEngine restricts itself to HTTP/1.1.
2025-07-09 03:04:10 -04:00 · 2024-08-14 12:45:23 +05:30 · 2024-08-14 12:45:23 +05:30 · e8453ed590
commit e8453ed590
parent e28f9c3ed3
1 changed files with 21 additions and 31 deletions
--- a/src/calibre/web/feeds/news.py
+++ b/src/calibre/web/feeds/news.py
@ -428,10 +428,9 @@ class BasicNewsRecipe(Recipe):
    recipe_specific_options = None
    #: The simulated browser engine to use when downloading from servers. The default is to use the Python mechanize
-    #: browser engine. An alternate is "chromium" which will use the network engine from the Chromium web browser instead.
+    #: browser engine. An alternate is "qt" which will use the network engine from the Qt toolkit.
-    #: The mechanize engine supports logging in, the Chromium engine does not. However, the Chromium engine supports HTTP/2 and
+    #: The mechanize engine supports logging in, the Qt engine does not. However, the Qt engine supports HTTP/2 and
-    #: similar technologies and also is harder for bot interception services to fingerprint. To customize the Chromium based
+    #: similar technologies and also is harder for bot interception services to fingerprint.
    #: browser, such as adding headers or cookies override the get_chromium_browser() method in your recipe.
    browser_type = 'mechanize'
    #: Set to False if you do not want to use gzipped transfers with the mechanize browser.
@ -541,14 +540,23 @@ class BasicNewsRecipe(Recipe):
        Return a browser instance used to fetch documents from the web. By default
        it returns a `mechanize <https://mechanize.readthedocs.io/en/latest/>`_
        browser instance that supports cookies, ignores robots.txt, handles
-        refreshes and has a mozilla firefox user agent.
+        refreshes and has a random common user agent.
-        If your recipe requires that you login first, override this method
+        To customize the browser override this method in your sub-class as::
        in your subclass. For example, the following code is used in the New York
        Times recipe to login for full access::
-            def get_browser(self):
+            def get_browser(self, *a, **kw):
-                br = BasicNewsRecipe.get_browser(self)
+                br = super().get_browser(*a, **kw)
                # Add some headers
                br.addheaders += [
                    ('My-Header', 'one'),
                    ('My-Header2', 'two'),
                ]
                # Set some cookies
                br.set_simple_cookie('name', 'value')
                br.set_simple_cookie('name2', 'value2', domain='.mydomain.com')
                # Make a POST request with some data
                br.open('https://someurl.com', {'username': 'def', 'password': 'pwd'}).read()
                # Do a login via a simple web form (only supported with mechanize browsers)
                if self.username is not None and self.password is not None:
                    br.open('https://www.nytimes.com/auth/login')
                    br.select_form(name='login')
@ -558,38 +566,20 @@ class BasicNewsRecipe(Recipe):
                return br
        '''
        if self.browser_type == 'chromium':
            return self.get_chromium_browser()
        if 'user_agent' not in kwargs:
            # More and more news sites are serving JPEG XR images to IE
            ua = getattr(self, 'last_used_user_agent', None) or self.calibre_most_common_ua or random_user_agent(allow_ie=False)
            kwargs['user_agent'] = self.last_used_user_agent = ua
        self.log('Using user agent:', kwargs['user_agent'])
        if self.browser_type == 'qt':
            from calibre.scraper.qt import Browser
            return Browser(user_agent=kwargs['user_agent'])
        br = browser(*args, **kwargs)
        br.addheaders += [('Accept', '*/*')]
        if self.handle_gzip:
            br.set_handle_gzip(True)
        return br
    def get_chromium_browser(self, *a, **kw):
        '''
        Get a "browser" that uses the Chromium network stack for support of HTTP/2 and HTTP/3 and a TLS fingerprint identical
        to that of a normal browser.

        Any positional or keyword arguments are accepted but are not passed on
        to the underlying ``Browser`` constructor.

        Customizing the browser is simple::

            br = super().get_chromium_browser()
            # Adding headers that are added to every network request
            br.addheaders += [
                ('My-Header', 'Some value'),
                ('Another-Header', 'another value'),
            ]
            # Changing the user agent
            br.set_user_agent('some user agent')
            # Adding cookies
            br.set_simple_cookie('cookie-name', 'cookie-value')

        :return: A new ``calibre.scraper.fetch.Browser`` instance created with no arguments.
        '''
        # Imported at call time rather than at module import time
        from calibre.scraper.fetch import Browser
        return Browser()
    def clone_browser(self, br):
        '''
        Clone the browser br. Cloned browsers are used for multi-threaded