Fix #8477 (Series/Sequence Info no longer being downloaded)

This commit is contained in:
Kovid Goyal 2011-01-20 11:59:19 -07:00
parent f573d07e81
commit 1820832fa8
2 changed files with 22 additions and 7 deletions

View File

@ -241,7 +241,7 @@ def get_parsed_proxy(typ='http', debug=True):
return ans
def browser(honor_time=True, max_time=2, mobile_browser=False):
def browser(honor_time=True, max_time=2, mobile_browser=False, user_agent=None):
'''
Create a mechanize browser for web scraping. The browser handles cookies,
refresh requests and ignores robots.txt. Also uses proxy if avaialable.
@ -253,8 +253,10 @@ def browser(honor_time=True, max_time=2, mobile_browser=False):
opener = Browser()
opener.set_handle_refresh(True, max_time=max_time, honor_time=honor_time)
opener.set_handle_robots(False)
opener.addheaders = [('User-agent', ' Mozilla/5.0 (Windows; U; Windows CE 5.1; rv:1.8.1a3) Gecko/20060610 Minimo/0.016' if mobile_browser else \
'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.13) Gecko/20101210 Gentoo Firefox/3.6.13')]
if user_agent is None:
user_agent = ' Mozilla/5.0 (Windows; U; Windows CE 5.1; rv:1.8.1a3) Gecko/20060610 Minimo/0.016' if mobile_browser else \
'Mozilla/5.0 (X11; U; Linux x86_64; en-US; rv:1.9.2.13) Gecko/20101210 Gentoo Firefox/3.6.13'
opener.addheaders = [('User-agent', user_agent)]
http_proxy = get_proxies().get('http', None)
if http_proxy:
opener.set_proxies({'http':http_proxy})

View File

@ -4,7 +4,7 @@ __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
Fetch cover from LibraryThing.com based on ISBN number.
'''
import sys, socket, os, re
import sys, socket, os, re, random
from lxml import html
import mechanize
@ -16,13 +16,26 @@ from calibre.ebooks.chardet import strip_encoding_declarations
OPENLIBRARY = 'http://covers.openlibrary.org/b/isbn/%s-L.jpg?default=false'
def get_ua():
choices = [
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.2.11) Gecko/20101012 Firefox/3.6.11'
'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)'
'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)'
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1)'
'Mozilla/5.0 (iPhone; U; CPU iPhone OS 3_0 like Mac OS X; en-us) AppleWebKit/528.18 (KHTML, like Gecko) Version/4.0 Mobile/7A341 Safari/528.16'
'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.19 (KHTML, like Gecko) Chrome/0.2.153.1 Safari/525.19'
'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.2.11) Gecko/20101012 Firefox/3.6.11'
]
return choices[random.randint(0, len(choices)-1)]
class HeadRequest(mechanize.Request):
def get_method(self):
return 'HEAD'
def check_for_cover(isbn, timeout=5.):
br = browser()
br = browser(user_agent=get_ua())
br.set_handle_redirect(False)
try:
br.open_novisit(HeadRequest(OPENLIBRARY%isbn), timeout=timeout)
@ -51,7 +64,7 @@ def login(br, username, password, force=True):
def cover_from_isbn(isbn, timeout=5., username=None, password=None):
src = None
br = browser()
br = browser(user_agent=get_ua())
try:
return br.open(OPENLIBRARY%isbn, timeout=timeout).read(), 'jpg'
except:
@ -100,7 +113,7 @@ def get_social_metadata(title, authors, publisher, isbn, username=None,
from calibre.ebooks.metadata import MetaInformation
mi = MetaInformation(title, authors)
if isbn:
br = browser()
br = browser(user_agent=get_ua())
if username and password:
try:
login(br, username, password, force=False)