diff --git a/src/calibre/__init__.py b/src/calibre/__init__.py
index a4f7439405..221f5911c6 100644
--- a/src/calibre/__init__.py
+++ b/src/calibre/__init__.py
@@ -241,7 +241,7 @@ def get_parsed_proxy(typ='http', debug=True):
     return ans
 
 
-def browser(honor_time=True, max_time=2, mobile_browser=False):
+def browser(honor_time=True, max_time=2, mobile_browser=False, user_agent=None):
     '''
     Create a mechanize browser for web scraping. The browser handles cookies,
     refresh requests and ignores robots.txt. Also uses proxy if avaialable.
@@ -253,8 +253,10 @@ def browser(honor_time=True, max_time=2, mobile_browser=False):
     opener = Browser()
     opener.set_handle_refresh(True, max_time=max_time, honor_time=honor_time)
     opener.set_handle_robots(False)
-    opener.addheaders = [('User-agent', ' Mozilla/5.0 (Windows; U; Windows CE 5.1; rv:1.8.1a3) Gecko/20060610 Minimo/0.016' if mobile_browser else \
-                          'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.13) Gecko/20101210 Gentoo Firefox/3.6.13')]
+    if user_agent is None:
+        user_agent = ' Mozilla/5.0 (Windows; U; Windows CE 5.1; rv:1.8.1a3) Gecko/20060610 Minimo/0.016' if mobile_browser else \
+                          'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.13) Gecko/20101210 Gentoo Firefox/3.6.13'
+    opener.addheaders = [('User-agent', user_agent)]
     http_proxy = get_proxies().get('http', None)
     if http_proxy:
         opener.set_proxies({'http':http_proxy})
diff --git a/src/calibre/ebooks/metadata/library_thing.py b/src/calibre/ebooks/metadata/library_thing.py
index 7f312da1d9..d956747a2b 100644
--- a/src/calibre/ebooks/metadata/library_thing.py
+++ b/src/calibre/ebooks/metadata/library_thing.py
@@ -4,7 +4,7 @@ __copyright__ = '2008, Kovid Goyal '
 Fetch cover from LibraryThing.com based on ISBN number.
 '''
 
-import sys, socket, os, re
+import sys, socket, os, re, random
 
 from lxml import html
 import mechanize
@@ -16,13 +16,27 @@ from calibre.ebooks.chardet import strip_encoding_declarations
 
 OPENLIBRARY = 'http://covers.openlibrary.org/b/isbn/%s-L.jpg?default=false'
 
+def get_ua():
+    ''' Return one of several browser User-Agent strings, chosen at random. '''
+    choices = [
+        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.2.11) Gecko/20101012 Firefox/3.6.11',
+        'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)',
+        'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)',
+        'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1)',
+        'Mozilla/5.0 (iPhone; U; CPU iPhone OS 3_0 like Mac OS X; en-us) AppleWebKit/528.18 (KHTML, like Gecko) Version/4.0 Mobile/7A341 Safari/528.16',
+        'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.2.153.1 Safari/525.19',
+        'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.2.11) Gecko/20101012 Firefox/3.6.11',
+    ]
+    return random.choice(choices)
+
+
 class HeadRequest(mechanize.Request):
 
     def get_method(self):
         return 'HEAD'
 
 def check_for_cover(isbn, timeout=5.):
-    br = browser()
+    br = browser(user_agent=get_ua())
     br.set_handle_redirect(False)
     try:
         br.open_novisit(HeadRequest(OPENLIBRARY%isbn), timeout=timeout)
@@ -51,7 +65,7 @@ def login(br, username, password, force=True):
 
 def cover_from_isbn(isbn, timeout=5., username=None, password=None):
     src = None
-    br = browser()
+    br = browser(user_agent=get_ua())
     try:
         return br.open(OPENLIBRARY%isbn, timeout=timeout).read(), 'jpg'
     except:
@@ -100,7 +114,7 @@ def get_social_metadata(title, authors, publisher, isbn, username=None,
     from calibre.ebooks.metadata import MetaInformation
     mi = MetaInformation(title, authors)
     if isbn:
-        br = browser()
+        br = browser(user_agent=get_ua())
         if username and password:
             try:
                 login(br, username, password, force=False)