Use a user agent (UA) based on random English words

Cloudflare appears to block HTTP requests that use common browser user agents; it probably checks some other header field together with the user agent.
2025-06-23 15:30:45 -04:00 · 2021-12-14 12:52:34 +05:30 · 2021-12-14 12:52:34 +05:30 · 1dfe4bd1c0
commit 1dfe4bd1c0
parent 07f72d2d94
4 changed files with 3022 additions and 2 deletions
--- a/recipes/hindu.recipe
+++ b/recipes/hindu.recipe
@ -37,8 +37,9 @@ class TheHindu(BasicNewsRecipe):
    ]

    def get_browser(self):
-        br = BasicNewsRecipe.get_browser(self)
+        br = BasicNewsRecipe.get_browser(self, user_agent='common_words/based')  # sentinel value, not a literal UA; presumably forwarded to calibre's browser(), which swaps it for a random word pair -- confirm
        br.addheaders += [('Referer', self.epaper_url)]     # needed for fetching cover
+        # br.set_debug_http(True)
        return br

    def get_cover_url(self):
@ -99,7 +100,7 @@ class TheHindu(BasicNewsRecipe):
        #     {'title':'xxx', 'url':'http://www.thehindu.com/opinion/op-ed/rohingya-bangladeshs-burden-to-bear/article19694058.ece'},
        #     {'title':'yyy', 'url':'http://www.thehindu.com/sci-tech/energy-and-environment/on-river-washed-antique-plains/article19699327.ece'}
        # ])]
-        soup = self.index_to_soup('http://www.thehindu.com/todays-paper/')
+        soup = self.index_to_soup('https://www.thehindu.com/todays-paper/')
        nav_div = soup.find(id='subnav-tpbar-latest')
        section_list = []

--- a/resources/common-english-words.txt
+++ b/resources/common-english-words.txt
--- a/src/calibre/init.py
+++ b/src/calibre/init.py
@ -319,6 +319,9 @@ def browser(honor_time=True, max_time=2, user_agent=None, verify_ssl_certificate
    opener.set_handle_robots(False)
    if user_agent is None:
        user_agent = random_user_agent(0, allow_ie=False)
+    elif user_agent == 'common_words/based':
+        from calibre.utils.random_ua import common_english_word_ua
+        user_agent = common_english_word_ua()
    opener.addheaders = [('User-agent', user_agent)]
    proxies = get_proxies()
    to_add = {}
--- a/src/calibre/utils/random_ua.py
+++ b/src/calibre/utils/random_ua.py
@ -15,6 +15,13 @@ def user_agent_data():
    return ans


+def common_english_words():  # Return the word list as a tuple, reading the resource only on the first call
+    ans = getattr(common_english_words, 'ans', None)  # the cache is stored as an attribute on the function object
+    if ans is None:
+        ans = common_english_words.ans = tuple(x.strip() for x in P('common-english-words.txt', data=True).decode('utf-8').splitlines())  # one entry per line, whitespace-stripped; blank lines stay as '' entries. P() is defined outside this hunk -- presumably returns the resource bytes, confirm
+    return ans
+
+
 def common_user_agents():  # Return the 'common_user_agents' entry of user_agent_data()
    return user_agent_data()['common_user_agents']

@ -39,3 +46,12 @@ def accept_header_for_ua(ua):
    if 'Firefox/' in ua:
        return 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
    return 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
+
+
+def common_english_word_ua():  # Build a 'word1/word2' user agent string from two different randomly chosen words
+    words = common_english_words()
+    w1 = random.choice(words)  # random.choice raises IndexError if the word list is empty
+    w2 = w1  # start equal to w1 so the loop below draws at least once
+    while w2 == w1:  # redraw until the second word differs; cannot terminate if every entry in words is identical
+        w2 = random.choice(words)
+    return f'{w1}/{w2}'