mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
Use a user agent (UA) based on random English words
Cloudflare appears to block HTTP requests that use common browser user agents; it probably checks some other header field in combination with the user agent.
This commit is contained in:
parent
07f72d2d94
commit
1dfe4bd1c0
@ -37,8 +37,9 @@ class TheHindu(BasicNewsRecipe):
|
||||
]
|
||||
|
||||
def get_browser(self):
|
||||
br = BasicNewsRecipe.get_browser(self)
|
||||
br = BasicNewsRecipe.get_browser(self, user_agent='common_words/based')
|
||||
br.addheaders += [('Referer', self.epaper_url)] # needed for fetching cover
|
||||
# br.set_debug_http(True)
|
||||
return br
|
||||
|
||||
def get_cover_url(self):
|
||||
@ -99,7 +100,7 @@ class TheHindu(BasicNewsRecipe):
|
||||
# {'title':'xxx', 'url':'http://www.thehindu.com/opinion/op-ed/rohingya-bangladeshs-burden-to-bear/article19694058.ece'},
|
||||
# {'title':'yyy', 'url':'http://www.thehindu.com/sci-tech/energy-and-environment/on-river-washed-antique-plains/article19699327.ece'}
|
||||
# ])]
|
||||
soup = self.index_to_soup('http://www.thehindu.com/todays-paper/')
|
||||
soup = self.index_to_soup('https://www.thehindu.com/todays-paper/')
|
||||
nav_div = soup.find(id='subnav-tpbar-latest')
|
||||
section_list = []
|
||||
|
||||
|
3000
resources/common-english-words.txt
Normal file
3000
resources/common-english-words.txt
Normal file
File diff suppressed because it is too large
Load Diff
@ -319,6 +319,9 @@ def browser(honor_time=True, max_time=2, user_agent=None, verify_ssl_certificate
|
||||
opener.set_handle_robots(False)
|
||||
if user_agent is None:
|
||||
user_agent = random_user_agent(0, allow_ie=False)
|
||||
elif user_agent == 'common_words/based':
|
||||
from calibre.utils.random_ua import common_english_word_ua
|
||||
user_agent = common_english_word_ua()
|
||||
opener.addheaders = [('User-agent', user_agent)]
|
||||
proxies = get_proxies()
|
||||
to_add = {}
|
||||
|
@ -15,6 +15,13 @@ def user_agent_data():
|
||||
return ans
|
||||
|
||||
|
||||
def common_english_words():
    """Return the tuple of words read from ``common-english-words.txt``.

    The resource is fetched with ``P('common-english-words.txt', data=True)``,
    decoded as UTF-8 and split into lines. Each line is stripped of
    surrounding whitespace; lines that are empty after stripping are dropped
    so that no caller can ever receive an empty "word".

    The resulting tuple is cached on the function object as
    ``common_english_words.ans``, so the resource is only read on the first
    call and later calls return the cached tuple.
    """
    ans = getattr(common_english_words, 'ans', None)
    if ans is None:
        raw = P('common-english-words.txt', data=True).decode('utf-8')
        stripped = (line.strip() for line in raw.splitlines())
        # Blank or whitespace-only lines would otherwise become '' entries
        # and could later produce a malformed user agent such as '/word'.
        ans = common_english_words.ans = tuple(word for word in stripped if word)
    return ans
|
||||
|
||||
|
||||
def common_user_agents():
    """Return the ``'common_user_agents'`` entry of ``user_agent_data()``."""
    data = user_agent_data()
    return data['common_user_agents']
|
||||
|
||||
@ -39,3 +46,12 @@ def accept_header_for_ua(ua):
|
||||
if 'Firefox/' in ua:
|
||||
return 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
|
||||
return 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
|
||||
|
||||
|
||||
def common_english_word_ua():
    """Build a user agent string of the form ``'<word1>/<word2>'``.

    Both words are drawn at random from ``common_english_words()``. The
    second word is chosen uniformly from the entries that differ from the
    first one, so the two words are different whenever the list contains at
    least two distinct words.

    If every entry in the list is the same word, that word is used twice
    (``'word/word'``) instead of retrying forever. An empty word list makes
    ``random.choice`` raise ``IndexError``.
    """
    words = common_english_words()
    first = random.choice(words)
    # Choosing from the entries that differ from ``first`` gives the same
    # distribution as a "re-draw until different" loop, but is guaranteed to
    # terminate even when no different word exists.
    others = [word for word in words if word != first]
    second = random.choice(others) if others else first
    return f'{first}/{second}'
|
||||
|
Loading…
x
Reference in New Issue
Block a user