This commit is contained in:
Lee 2011-04-18 18:17:54 +08:00
parent 09da88b6d1
commit aa30f306b5

View File

@ -45,7 +45,6 @@ class OverDrive(Source):
isbn = identifiers.get('isbn', None)
br = self.browser
print "in identify, calling to_ovrdrv_data"
ovrdrv_data = self.to_ovrdrv_data(br, title, authors, ovrdrv_id)
if ovrdrv_data:
title = ovrdrv_data[8]
@ -83,7 +82,6 @@ class OverDrive(Source):
if cached_url is None:
log.info('No cached cover found, running identify')
rq = Queue()
print "inside download cover, calling identify"
self.identify(log, rq, abort, title=title, authors=authors,
identifiers=identifiers)
if abort.is_set():
@ -110,7 +108,6 @@ class OverDrive(Source):
ovrdrv_id = identifiers.get('overdrive', None)
br = self.browser
referer = self.get_base_referer()+'ContentDetails-Cover.htm?ID='+ovrdrv_id
print "downloading cover, referer is "+str(referer)
req = mechanize.Request(cached_url)
req.add_header('referer', referer)
log('Downloading cover from:', cached_url)
@ -124,7 +121,6 @@ class OverDrive(Source):
def get_cached_cover_url(self, identifiers): # {{{
url = None
ovrdrv_id = identifiers.get('overdrive', None)
print "inside get_cached_cover_url, ovrdrv_id is "+str(ovrdrv_id)
if ovrdrv_id is None:
isbn = identifiers.get('isbn', None)
if isbn is not None:
@ -217,13 +213,8 @@ class OverDrive(Source):
q_init_search = q+'SearchResults.aspx'
# get first author as string - convert this to a proper cleanup function later
s = Source(None)
print "printing list with author "+str(author)+":"
author_tokens = list(s.get_author_tokens(author))
print list(author_tokens)
title_tokens = list(s.get_title_tokens(title, False, True))
print "there are "+str(len(title_tokens))+" title tokens"
for token in title_tokens:
print "cleaned up title token is: "+str(token)
if len(title_tokens) >= len(author_tokens):
initial_q = ' '.join(title_tokens)
@ -232,8 +223,6 @@ class OverDrive(Source):
initial_q = ' '.join(author_tokens)
xref_q = '+'.join(title_tokens)
print "initial query is "+str(initial_q)
print "cross reference query is "+str(xref_q)
q_xref = q+'SearchResults.svc/GetResults?iDisplayLength=50&sSearch='+xref_q
query = '{"szKeyword":"'+initial_q+'"}'
@ -242,7 +231,6 @@ class OverDrive(Source):
req.add_header('Content-Type', 'application/json; charset=utf-8')
br.open_novisit(req, query)
print "q_init_search is "+q_init_search
# initiate the search without messing up the cookiejar
self.safe_query(br, q_init_search)
@ -254,7 +242,6 @@ class OverDrive(Source):
xreq.add_header('Referer', q_init_search)
xreq.add_header('Accept', 'application/json, text/javascript, */*')
raw = br.open_novisit(xreq).read()
print "overdrive search result is:\n"+raw
for m in re.finditer(ur'"iTotalDisplayRecords":(?P<displayrecords>\d+).*?"iTotalRecords":(?P<totalrecords>\d+)', raw):
if int(m.group('displayrecords')) >= 1:
results = True
@ -264,54 +251,40 @@ class OverDrive(Source):
elif int(m.group('totalrecords')) == 0:
return ''
print "\n\nsorting results"
return self.sort_ovrdrv_results(raw, title, title_tokens, author, author_tokens)
def sort_ovrdrv_results(self, raw, title=None, title_tokens=None, author=None, author_tokens=None, ovrdrv_id=None):
print "\ntitle to search for is "+str(title)+"\nauthor to search for is "+str(author)
close_matches = []
raw = re.sub('.*?\[\[(?P<content>.*?)\]\].*', '[[\g<content>]]', raw)
results = eval(raw)
print "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n"
#print results
# The search results are either from a keyword search or a multi-format list from a single ID,
# sort through the results for closest match/format
if results:
for reserveid, od_title, subtitle, edition, series, publisher, format, formatid, creators, \
thumbimage, shortdescription, worldcatlink, excerptlink, creatorfile, sorttitle, \
availabletolibrary, availabletoretailer, relevancyrank, unknown1, unknown2, unknown3 in results:
print "this record's title is "+od_title+", subtitle is "+subtitle+", author[s] are "+creators+", series is "+series
if ovrdrv_id is not None and int(formatid) in [1, 50, 410, 900]:
print "overdrive id is not None, searching based on format type priority"
return self.format_results(reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid)
else:
creators = creators.split(', ')
print "split creators from results are: "+str(creators)
# if an exact match in a preferred format occurs
if creators[0] == author[0] and od_title == title and int(formatid) in [1, 50, 410, 900]:
print "Got Exact Match!!!"
return self.format_results(reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid)
else:
close_title_match = False
close_author_match = False
print "format id is "+str(formatid)
for token in title_tokens:
print "attempting to find "+str(token)+" title token"
if od_title.lower().find(token.lower()) != -1:
print "matched token"
close_title_match = True
else:
print "token didn't match"
close_title_match = False
break
for token in author_tokens:
print "attempting to find "+str(token)+" author token"
if creators[0].lower().find(token.lower()) != -1:
print "matched token"
close_author_match = True
else:
print "token didn't match"
close_author_match = False
break
if close_title_match and close_author_match and int(formatid) in [1, 50, 410, 900]:
@ -350,7 +323,6 @@ class OverDrive(Source):
def find_ovrdrv_data(self, br, title, author, isbn, ovrdrv_id=None):
print "in find_ovrdrv_data, title is "+str(title)+", author is "+str(author)+", overdrive id is "+str(ovrdrv_id)
q = base_url
if ovrdrv_id is None:
return self.overdrive_search(br, q, title, author)
@ -364,28 +336,22 @@ class OverDrive(Source):
Takes either a title/author combo or an Overdrive ID. One of these
two must be passed to this function.
'''
print "starting to_ovrdrv_data"
if ovrdrv_id is not None:
with cache_lock:
ans = ovrdrv_data_cache.get(ovrdrv_id, None)
if ans:
print "inside to_ovrdrv_data, cache lookup successful, ans is "+str(ans)
return ans
elif ans is False:
print "inside to_ovrdrv_data, ans returned False"
return None
else:
ovrdrv_data = self.find_ovrdrv_data(br, title, author, ovrdrv_id)
else:
try:
print "trying to retrieve data, running find_ovrdrv_data"
ovrdrv_data = self.find_ovrdrv_data(br, title, author, ovrdrv_id)
print "ovrdrv_data is "+str(ovrdrv_data)
except:
import traceback
traceback.print_exc()
ovrdrv_data = None
print "writing results to ovrdrv_data cache"
with cache_lock:
ovrdrv_data_cache[ovrdrv_id] = ovrdrv_data if ovrdrv_data else False
@ -402,7 +368,6 @@ class OverDrive(Source):
publisher[5], creators[6], reserveid[7], title[8]]
'''
print "inside parse_search_results, writing the metadata results"
ovrdrv_id = ovrdrv_data[7]
mi.set_identifier('overdrive', ovrdrv_id)
@ -445,7 +410,7 @@ class OverDrive(Source):
mi.pubdate = parse_date(pub_date[0].strip())
if lang:
mi.language = lang[0].strip()
print "languages is "+str(mi.language)
#if ebook_isbn:
# print "ebook isbn is "+str(ebook_isbn[0])
# isbn = check_isbn(ebook_isbn[0].strip())
@ -454,7 +419,7 @@ class OverDrive(Source):
# mi.isbn = isbn
if subjects:
mi.tags = [tag.strip() for tag in subjects[0].split(',')]
print "tags are "+str(mi.tags)
if desc:
desc = desc[0]
desc = html.tostring(desc, method='html', encoding=unicode).strip()
@ -468,7 +433,6 @@ class OverDrive(Source):
def main(args=sys.argv):
print "running through main tests"
import tempfile, os, time
tdir = tempfile.gettempdir()
br = browser()
@ -490,19 +454,19 @@ def main(args=sys.argv):
(None, '9780345509741', 'The Horror Stories of Robert E. Howard', ['Robert E. Howard']), # Complex title with initials/dots stripped, some results don't have a cover
]:
cpath = os.path.join(tdir, title+'.jpg')
print "cpath is "+cpath
#print "cpath is "+cpath
st = time.time()
curl = get_cover_url(isbn, title, author, br, ovrdrv_id)
print '\n\n Took ', time.time() - st, ' to get basic metadata\n\n'
if curl is None:
print 'No cover found for', title
else:
print "curl is "+curl
#open(cpath, 'wb').write(br.open_novisit(curl).read())
#print 'Cover for', title, 'saved to', cpath
#print '\n\n Took ', time.time() - st, ' to get basic metadata\n\n'
#if curl is None:
# print 'No cover found for', title
#else:
# print "curl is "+curl
# open(cpath, 'wb').write(br.open_novisit(curl).read())
# print 'Cover for', title, 'saved to', cpath
st = time.time()
print get_social_metadata(title, author, isbn, ovrdrv_id)
print '\n\n Took ', time.time() - st, ' to get detailed metadata\n\n'
#print get_social_metadata(title, author, isbn, ovrdrv_id)
#print '\n\n Took ', time.time() - st, ' to get detailed metadata\n\n'
return 0