Fetch news: Allow using the chromium network stack to make HTTP requests

2025-08-11 09:13:57 -04:00 · 2024-08-13 18:43:37 +05:30 · 2024-08-13 18:43:37 +05:30 · 66f8ae20fc
commit 66f8ae20fc
parent f40950d1ff
2 changed files with 34 additions and 2 deletions
--- a/src/calibre/scraper/fetch.py
+++ b/src/calibre/scraper/fetch.py
@ -106,6 +106,9 @@ class Browser:
        self.user_agent = val
        self._send_command({'action': 'set_user_agent', 'user_agent': val})

+    def clone_browser(self):
+        return self
+
    def _send_command(self, cmd):
        self.worker.stdin.write(json.dumps(cmd).encode())
        self.worker.stdin.write(b'\n')
--- a/src/calibre/web/feeds/news.py
+++ b/src/calibre/web/feeds/news.py
@ -427,7 +427,15 @@ class BasicNewsRecipe(Recipe):
    #: If no default is specified, the option will not be in the dict at all, when unspecified by the user.
    recipe_specific_options = None

-    #: Set to False if you do not want to use gzipped transfers. Note that some old servers flake out with gzip
+    #: The simulated browser engine to use when downloading from servers. The default is to use the Python mechanize
+    #: browser engine. An alternative is "chromium", which will use the network engine from the Chromium web browser instead.
+    #: The mechanize engine supports logging in, the Chromium engine does not. However, the Chromium engine supports HTTP/2 and
+    #: similar technologies and is also harder for bot interception services to fingerprint. To customize the Chromium-based
+    #: browser, for example by adding headers or cookies, override the get_chromium_browser() method in your recipe.
+    browser_type = 'mechanize'
+
+    #: Set to False if you do not want to use gzipped transfers with the mechanize browser.
+    #: Note that some old servers flake out with gzip.
    handle_gzip = True

    # See the built-in recipes for examples of these settings.
@ -550,6 +558,8 @@ class BasicNewsRecipe(Recipe):
                return br

        '''
+        if self.browser_type == 'chromium':
+            return self.get_chromium_browser()
        if 'user_agent' not in kwargs:
            # More and more news sites are serving JPEG XR images to IE
            ua = getattr(self, 'last_used_user_agent', None) or self.calibre_most_common_ua or random_user_agent(allow_ie=False)
@ -561,6 +571,25 @@ class BasicNewsRecipe(Recipe):
            br.set_handle_gzip(True)
        return br

+    def get_chromium_browser(self, *a, **kw):
+        '''
+        Get a "browser" that uses the Chromium network stack for support of HTTP/2 and HTTP/3 and a TLS fingerprint identical
+        to that of a normal browser. Customizing the browser is simple::
+
+            br = super().get_chromium_browser()
+            # Adding headers that are added to every network request
+            br.addheaders += [
+                ('My-Header', 'Some value'),
+                ('Another-Header', 'another value'),
+            ]
+            # Changing the user agent
+            br.set_user_agent('some user agent')
+            # Adding cookies
+            br.set_simple_cookie('cookie-name', 'cookie-value')
+        '''
+        from calibre.scraper.fetch import Browser
+        return Browser()
+
    def clone_browser(self, br):
        '''
        Clone the browser br. Cloned browsers are used for multi-threaded
@ -580,7 +609,7 @@ class BasicNewsRecipe(Recipe):

    @property
    def cloned_browser(self):
-        if hasattr(self.get_browser, 'is_base_class_implementation'):
+        if hasattr(self.get_browser, 'is_base_class_implementation') and self.browser_type == 'mechanize':
            # We are using the default get_browser, which means no need to
            # clone
            br = BasicNewsRecipe.get_browser(self)