Use a UA based on random English words

Cloudflare appears to block HTTP requests that use common browser user
agents; it probably checks some other header field together with the
user agent.
This commit is contained in:
Kovid Goyal 2021-12-14 12:52:34 +05:30
parent 07f72d2d94
commit 1dfe4bd1c0
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
4 changed files with 3022 additions and 2 deletions

View File

@ -37,8 +37,9 @@ class TheHindu(BasicNewsRecipe):
] ]
def get_browser(self): def get_browser(self):
br = BasicNewsRecipe.get_browser(self) br = BasicNewsRecipe.get_browser(self, user_agent='common_words/based')
br.addheaders += [('Referer', self.epaper_url)] # needed for fetching cover br.addheaders += [('Referer', self.epaper_url)] # needed for fetching cover
# br.set_debug_http(True)
return br return br
def get_cover_url(self): def get_cover_url(self):
@ -99,7 +100,7 @@ class TheHindu(BasicNewsRecipe):
# {'title':'xxx', 'url':'http://www.thehindu.com/opinion/op-ed/rohingya-bangladeshs-burden-to-bear/article19694058.ece'}, # {'title':'xxx', 'url':'http://www.thehindu.com/opinion/op-ed/rohingya-bangladeshs-burden-to-bear/article19694058.ece'},
# {'title':'yyy', 'url':'http://www.thehindu.com/sci-tech/energy-and-environment/on-river-washed-antique-plains/article19699327.ece'} # {'title':'yyy', 'url':'http://www.thehindu.com/sci-tech/energy-and-environment/on-river-washed-antique-plains/article19699327.ece'}
# ])] # ])]
soup = self.index_to_soup('http://www.thehindu.com/todays-paper/') soup = self.index_to_soup('https://www.thehindu.com/todays-paper/')
nav_div = soup.find(id='subnav-tpbar-latest') nav_div = soup.find(id='subnav-tpbar-latest')
section_list = [] section_list = []

File diff suppressed because it is too large Load Diff

View File

@ -319,6 +319,9 @@ def browser(honor_time=True, max_time=2, user_agent=None, verify_ssl_certificate
opener.set_handle_robots(False) opener.set_handle_robots(False)
if user_agent is None: if user_agent is None:
user_agent = random_user_agent(0, allow_ie=False) user_agent = random_user_agent(0, allow_ie=False)
elif user_agent == 'common_words/based':
from calibre.utils.random_ua import common_english_word_ua
user_agent = common_english_word_ua()
opener.addheaders = [('User-agent', user_agent)] opener.addheaders = [('User-agent', user_agent)]
proxies = get_proxies() proxies = get_proxies()
to_add = {} to_add = {}

View File

@ -15,6 +15,13 @@ def user_agent_data():
return ans return ans
def common_english_words():
    """Return the words from ``common-english-words.txt`` as a tuple.

    The resource is read through ``P(..., data=True)``, decoded as UTF-8 and
    split into lines on the first call only. The resulting tuple is cached
    as the ``ans`` attribute of this function and returned unchanged by
    every later call.

    Each line is stripped of surrounding whitespace. Lines that are empty
    after stripping are skipped, so callers never receive an empty string
    as a "word" (which would otherwise produce user agents such as
    ``/word``).
    """
    ans = getattr(common_english_words, 'ans', None)
    if ans is None:
        raw = P('common-english-words.txt', data=True).decode('utf-8')
        stripped = (line.strip() for line in raw.splitlines())
        # Drop blank lines instead of storing '' entries in the word list.
        ans = common_english_words.ans = tuple(w for w in stripped if w)
    return ans
def common_user_agents(): def common_user_agents():
return user_agent_data()['common_user_agents'] return user_agent_data()['common_user_agents']
@ -39,3 +46,12 @@ def accept_header_for_ua(ua):
if 'Firefox/' in ua: if 'Firefox/' in ua:
return 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8' return 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
return 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' return 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
def common_english_word_ua():
    """Return a random User-Agent string of the form ``word1/word2``.

    Both words are picked at random from ``common_english_words()`` and are
    guaranteed to be different from each other.

    Raises:
        IndexError: if the word list is empty (raised by ``random.choice``).
        ValueError: if the word list does not contain a second, distinct
            word; without this check the retry loop below would never end.
    """
    words = common_english_words()
    w1 = random.choice(words)
    # Short-circuits on the first differing word, so this is cheap in the
    # normal case and only scans the whole list when it is degenerate.
    if all(w == w1 for w in words):
        raise ValueError(
            'Need at least two distinct words to build a user agent')
    w2 = w1
    while w2 == w1:
        w2 = random.choice(words)
    return f'{w1}/{w2}'