mirror of https://github.com/kovidgoyal/calibre.git

commit c6a2c8e82e (parent ed93d74e98)

    further work on the overdrive plugin
@@ -161,14 +161,7 @@ class OverdriveCovers(CoverDownload): # {{{
     def has_cover(self, mi, ans, timeout=5.):
         if not mi.authors or not mi.title:
             return False
-        from calibre.ebooks.metadata.overdrive import get_cover_url
-        br = browser()
-        try:
-            get_cover_url(mi.isbn, mi.title, mi.authors, br)
-            self.debug('cover for', mi.isbn, 'found')
-            ans.set()
-        except Exception, e:
-            self.debug(e)
+        return True

     def get_covers(self, mi, result_queue, abort, timeout=5.):
         if not mi.isbn:
@@ -261,10 +261,10 @@ class Overdrive(MetadataSource): # {{{
     def fetch(self):
         if not self.isbn:
             return
-        from calibre.ebooks.metadata.overdrive import get_metadata
+        from calibre.ebooks.metadata.overdrive import get_social_metadata
         try:
-            self.results = get_metadata(self.title, self.book_author,
-                    self.publisher, self.isbn)
+            self.results = get_social_metadata(self.title, self.book_author, self.isbn)
         except Exception, e:
             self.exception = e
             self.tb = traceback.format_exc()
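fetch() now delegates to the new three-argument get_social_metadata(title, authors, isbn) helper in place of the old four-argument get_metadata(). A minimal sketch of the new call shape (the book data is borrowed from the test harness later in this commit; the surrounding plugin scaffolding is assumed):

    from calibre.ebooks.metadata.overdrive import get_social_metadata

    # returns a Metadata object, populated when Overdrive has a matching record
    mi = get_social_metadata('The Two Towers', ['J. R. R. Tolkien'], '9780061952838')
    print mi.title, mi.authors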
@@ -25,13 +25,12 @@ cache_lock = RLock()
 base_url = 'http://search.overdrive.com/'
 
 
-
 def create_query(self, title=None, authors=None, identifiers={}):
     q = ''
     if title or authors:
         def build_term(prefix, parts):
             return ' '.join('in'+prefix + ':' + x for x in parts)
-        title_tokens = list(self.get_title_tokens(title))
+        title_tokens = list(self.get_title_tokens(title, False))
         if title_tokens:
             q += build_term('title', title_tokens)
         author_tokens = self.get_author_tokens(authors,
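create_query builds Overdrive's fielded keyword syntax: build_term expands each token into an in<field>:token term and joins them with spaces. Its behavior in isolation (build_term is copied verbatim from the hunk; the sample tokens are illustrative):

    def build_term(prefix, parts):
        return ' '.join('in'+prefix + ':' + x for x in parts)

    print build_term('title', ['two', 'towers'])
    # intitle:two intitle:towers
    print build_term('author', ['tolkien'])
    # inauthor:tolkien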
@@ -58,7 +57,7 @@ def get_base_referer():
     'http://spl.lib.overdrive.com/5875E082-4CB2-4689-9426-8509F354AFEF/10/335/en/'
     ]
     return choices[random.randint(0, len(choices)-1)]
 
 def format_results(reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid):
     fix_slashes = re.compile(r'\\/')
     thumbimage = fix_slashes.sub('/', thumbimage)
@@ -67,8 +66,10 @@ def format_results(reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid):
     social_metadata_url = base_url+'TitleInfo.aspx?ReserveID='+reserveid+'&FormatID='+formatid
     series_num = ''
     if not series:
         if subtitle:
             title = od_title+': '+subtitle
+        else:
+            title = od_title
     else:
         title = od_title
         m = re.search("([0-9]+$)", subtitle)
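When a series name is present, format_results recovers the series index from trailing digits in the subtitle. The regex in the surrounding lines works like this (the sample subtitle is illustrative):

    import re

    m = re.search("([0-9]+$)", 'The Lord of the Rings, Book 2')
    if m:
        series_num = float(m.group(1))   # 2.0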
@@ -76,36 +77,12 @@ def format_results(reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid):
             series_num = float(m.group(1))
     return [cover_url, social_metadata_url, worldcatlink, series, series_num, publisher, creators, reserveid, title]
 
-def overdrive_search(br, q, title, author):
-    q_query = q+'default.aspx/SearchByKeyword'
-    q_init_search = q+'SearchResults.aspx'
-    # get first author as string - convert this to a proper cleanup function later
-    s = Source(None)
-    print "printing list with string:"
-    print list(s.get_author_tokens(['J. R. R. Tolkien']))
-    print "printing list with author "+str(author)+":"
-    print list(s.get_author_tokens(author))
-    author = list(s.get_author_tokens(author))
-    for token in author:
-        print "cleaned up author is: "+str(token)
-    author_q = '+'.join(author)
-    #author_q = separator.join(for x in author)
-    # query terms
-    #author_q = re.sub('\s', '+', author_q)
-    print "final author query is "+str(author_q)
-    q_xref = q+'SearchResults.svc/GetResults?iDisplayLength=10&sSearch='+author_q
-    query = '{"szKeyword":"'+title+'"}'
-
-    # main query, requires specific Content Type header
-    req = mechanize.Request(q_query)
-    req.add_header('Content-Type', 'application/json; charset=utf-8')
-    br.open_novisit(req, query)
-
-    print "q_init_search is "+q_init_search
-
-    # the query must be initialized by loading an empty search results page
-    # this page attempts to set a cookie that Mechanize doesn't like
-    # copy the cookiejar to a separate instance and make a one-off request with the temp cookiejar
+def safe_query(br, query_url):
+    '''
+    The query must be initialized by loading an empty search results page
+    this page attempts to set a cookie that Mechanize doesn't like
+    copy the cookiejar to a separate instance and make a one-off request with the temp cookiejar
+    '''
     goodcookies = br._ua_handlers['_cookies'].cookiejar
     clean_cj = mechanize.CookieJar()
     cookies_to_copy = []
@@ -115,10 +92,46 @@ def overdrive_search(br, q, title, author):
     for copied_cookie in cookies_to_copy:
         clean_cj.set_cookie(copied_cookie)
 
-    br.open_novisit(q_init_search)
+    br.open_novisit(query_url)
 
     br.set_cookiejar(clean_cj)
 
+
+def overdrive_search(br, q, title, author):
+    q_query = q+'default.aspx/SearchByKeyword'
+    q_init_search = q+'SearchResults.aspx'
+    # get first author as string - convert this to a proper cleanup function later
+    s = Source(None)
+    print "printing list with string:"
+    #print list(s.get_author_tokens(['J. R. R. Tolkien']))
+    print "printing list with author "+str(author)+":"
+    print list(s.get_author_tokens(author))
+    author_tokens = list(s.get_author_tokens(author))
+    for token in author_tokens:
+        print "cleaned up author token is: "+str(token)
+    author_q = ' '.join(author_tokens)
+
+    title_tokens = list(s.get_title_tokens(title))
+    for token in title_tokens:
+        print "cleaned up title token is: "+str(token)
+    title_q = '+'.join(title_tokens)
+    #author_q = separator.join(for x in author)
+    # query terms
+    #author_q = re.sub('\s', '+', author_q)
+    print "final author query is "+str(author_q)
+    print "final title query is "+str(title_q)
+    q_xref = q+'SearchResults.svc/GetResults?iDisplayLength=20&sSearch='+title_q
+    query = '{"szKeyword":"'+author_q+'"}'
+
+    # main query, requires specific Content Type header
+    req = mechanize.Request(q_query)
+    req.add_header('Content-Type', 'application/json; charset=utf-8')
+    br.open_novisit(req, query)
+
+    print "q_init_search is "+q_init_search
+    # initiate the search without messing up the cookiejar
+    safe_query(br, q_init_search)
+
     # get the search results object
     xreq = mechanize.Request(q_xref)
     xreq.add_header('X-Requested-With', 'XMLHttpRequest')
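The jar-copying trick in safe_query reads cleanly in isolation: rebuild the browser's known-good cookies in a fresh jar, then install that jar so the bad Set-Cookie from the search results page never contaminates the original session. A standalone sketch using the stdlib cookielib (mechanize's CookieJar mirrors this API; copy_cookies is an illustrative name, not from the commit):

    import cookielib

    def copy_cookies(src_jar):
        # rebuild each cookie in a fresh jar, leaving the source jar untouched
        clean_cj = cookielib.CookieJar()
        for cookie in src_jar:
            clean_cj.set_cookie(cookie)
        return clean_cj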
@@ -126,83 +139,102 @@ def overdrive_search(br, q, title, author):
     xreq.add_header('Accept', 'application/json, text/javascript, */*')
     raw = br.open_novisit(xreq).read()
     print "overdrive search result is:\n"+raw
+    print "\n\nsorting results"
+    return sort_ovrdrv_results(raw, title, title_tokens, author, author_tokens)
+
+
+def sort_ovrdrv_results(raw, title=None, title_tokens=None, author=None, author_tokens=None, ovrdrv_id=None):
+    print "\ntitle to search for is "+str(title)+"\nauthor to search for is "+str(author)
+    close_matches = []
     raw = re.sub('.*?\[\[(?P<content>.*?)\]\].*', '[[\g<content>]]', raw)
     results = eval(raw)
     print "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n"
-    print results
-    # The search results are from a keyword search (overdrive's advanced search is broken),
+    #print results
+    # The search results are either from a keyword search or a multi-format list from a single ID,
     # sort through the results for closest match/format
     for result in results:
         print "\n\n\nthis result is "+str(result)
         for reserveid, od_title, subtitle, edition, series, publisher, format, formatid, creators, \
                 thumbimage, shortdescription, worldcatlink, excerptlink, creatorfile, sorttitle, \
                 availabletolibrary, availabletoretailer, relevancyrank, unknown1, unknown2, unknown3 in results:
-            creators = creators.split(', ')
-            print "fixed creators are: "+str(creators)
-            # if an exact match occurs
-            if creators[0] == author and od_title == title and int(formatid) in [1, 50, 410, 900]:
-                print "Got Exact Match!!!"
-                return format_results(reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid)
-
-
-def library_search(br, q, title, author):
-    q_search = q+'AdvancedSearch.htm'
-    q_query = q+'BANGSearch.dll'
-    br.open(q_search)
-    # Search for cover with audiobooks lowest priority
-    for format in ['410','50','900','25','425']:
-        query = 'Title='+title+'&Creator='+author+'&Keyword=&ISBN=&Format='+format+'&Language=&Publisher=&Subject=&Award=&CollDate=&PerPage=10&Sort=SortBy%3Dtitle'
-        query = re.sub('\s', '+', query)
-        #print "search url is "+str(q_search)
-        print "query is "+str(query)
-        raw = br.open(q_query, query).read()
-        #print "raw html is:\n"+str(raw)
-        raw = xml_to_unicode(raw, strip_encoding_pats=True,
-                resolve_entities=True)[0]
-        root = html.fromstring(raw)
-        revs = root.xpath("//img[@class='blackborder']")
-        if revs:
-            #print "revs are "+str(revs)
-            # get the first match, as it's the most likely candidate
-            x = revs[0]
-            id = urllib.unquote(re.sub('.*?/(?P<i>%7B.*?%7D).*', '\g<i>', x.get('src')))
-            curl = re.sub('(?P<img>(Ima?g(eType-)?))200', '\g<img>100', x.get('src'))
-            murl = root.xpath("//img[@class='blackborder']/parent::*")
-            if murl:
-                murl = [y.get('href') for y in murl]
-                print "murl is"+str(murl)
-                murl = q+murl[0]
-            else:
-                print "didn't get metadata URL"
-            print "curl is "+str(curl)+", id is "+str(id)+", murl is "+str(murl)
-            ovrdrv_data = [id, curl, murl]
-            print "revs final are "+str(revs)
-            return ovrdrv_data
+            if ovrdrv_id is not None and int(formatid) in [1, 50, 410, 900]:
+                print "overdrive id is not None, searching based on format type priority"
+                return format_results(reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid)
+            else:
+                creators = creators.split(', ')
+                print "fixed creators are: "+str(creators)
+                # if an exact match in a preferred format occurs
+                if creators[0] == author[0] and od_title == title and int(formatid) in [1, 50, 410, 900]:
+                    print "Got Exact Match!!!"
+                    return format_results(reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid)
+                else:
+                    close_title_match = False
+                    close_author_match = False
+                    for token in title_tokens:
+                        if od_title.lower().find(token.lower()) != -1:
+                            close_title_match = True
+                        else:
+                            close_title_match = False
+                            break
+                    for token in author_tokens:
+                        if creators[0].lower().find(token.lower()) != -1:
+                            close_author_match = True
+                        else:
+                            close_author_match = False
+                            break
+                    if close_title_match and close_author_match and int(formatid) in [1, 50, 410, 900]:
+                        close_matches.append(format_results(reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid))
+    if close_matches:
+        return close_matches[0]
+    else:
+        return None
+
+
+def overdrive_get_record(br, q, ovrdrv_id):
+    search_url = q+'SearchResults.aspx?ReserveID={'+ovrdrv_id+'}'
+    results_url = q+'SearchResults.svc/GetResults?sEcho=1&iColumns=18&sColumns=ReserveID%2CTitle%2CSubtitle%2CEdition%2CSeries%2CPublisher%2CFormat%2CFormatID%2CCreators%2CThumbImage%2CShortDescription%2CWorldCatLink%2CExcerptLink%2CCreatorFile%2CSortTitle%2CAvailableToLibrary%2CAvailableToRetailer%2CRelevancyRank&iDisplayStart=0&iDisplayLength=10&sSearch=&bEscapeRegex=true&iSortingCols=1&iSortCol_0=17&sSortDir_0=asc'
+
+    # get the base url to set the proper session cookie
+    br.open_novisit(q)
+
+    # initialize the search
+    safe_query(br, search_url)
+
+    # get the results
+    req = mechanize.Request(results_url)
+    req.add_header('X-Requested-With', 'XMLHttpRequest')
+    req.add_header('Referer', search_url)
+    req.add_header('Accept', 'application/json, text/javascript, */*')
+    raw = br.open_novisit(req)
+    raw = str(list(raw))
+    return sort_ovrdrv_results(raw, None, None, None, ovrdrv_id)
+
+
-def find_ovrdrv_data(br, title, author, isbn):
-    print "in fnd_ovrdrv_data, title is "+str(title)+", author is "+str(author)
+def find_ovrdrv_data(br, title, author, isbn, ovrdrv_id=None):
+    print "in find_ovrdrv_data, title is "+str(title)+", author is "+str(author)+", overdrive id is "+str(ovrdrv_id)
     q = base_url
-    if re.match('http://search\.overdrive\.', q):
+    if ovrdrv_id is None:
         return overdrive_search(br, q, title, author)
     else:
-        return library_search(br, q, title, author)
+        return overdrive_get_record(br, q, ovrdrv_id)
 
 
-def to_ovrdrv_data(br, title, author, isbn):
+def to_ovrdrv_data(br, title, author, isbn, ovrdrv_id=None):
     print "starting to_ovrdrv_data"
     with cache_lock:
         ans = ovrdrv_data_cache.get(isbn, None)
     if ans:
-        print "inside to_ovrdrv_data, ans returned positive, ans is"+str(ans)
+        print "inside to_ovrdrv_data, cache lookup successful, ans is "+str(ans)
         return ans
     if ans is False:
         print "inside to_ovrdrv_data, ans returned False"
         return None
     try:
-        ovrdrv_data = find_ovrdrv_data(br, title, author, isbn)
-        print "ovrdrv_data = "+str(ovrdrv_data)
+        print "trying to retrieve data, running find_ovrdrv_data"
+        ovrdrv_data = find_ovrdrv_data(br, title, author, isbn, ovrdrv_id)
+        print "ovrdrv_data is "+str(ovrdrv_data)
     except:
         import traceback
         traceback.print_exc()
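The close-match branch added to sort_ovrdrv_results reduces to an every-token-must-appear test: each title token must occur as a substring of the candidate title, and each author token in the first creator, case-insensitively. Distilled into a single helper (an illustrative refactoring, not code from the commit):

    def all_tokens_match(tokens, text):
        # True only when every query token occurs somewhere in text
        text = text.lower()
        for token in tokens:
            if text.find(token.lower()) == -1:
                return False
        return True

    # all_tokens_match(['two', 'towers'], 'The Two Towers') -> True

A candidate passing both tests in one of the preferred formats (1, 50, 410, 900) lands in close_matches, and the first collected match is returned.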
@@ -210,66 +242,69 @@ def to_ovrdrv_data(br, title, author, isbn):
 
     with cache_lock:
         ovrdrv_data_cache[isbn] = ovrdrv_data if ovrdrv_data else False
+    if ovrdrv_data:
+        from calibre.ebooks.metadata.xisbn import xisbn
+        for i in xisbn.get_associated_isbns(isbn):
+            with cache_lock:
+                ovrdrv_data_cache[i] = ovrdrv_data
+
     return ovrdrv_data
 
 
-def get_social_metadata(title, authors, publisher, isbn):
+def get_social_metadata(title, authors, isbn, ovrdrv_id=None):
     author = authors[0]
     mi = Metadata(title, authors)
-    if not isbn:
-        return mi
-    isbn = check_isbn(isbn)
-    if not isbn:
-        return mi
     br = browser()
-    ovrdrv_data = to_ovrdrv_data(br, title, authors, isbn)
-    if ovrdrv_data and get_metadata_detail_ovrdrv(br, ovrdrv_data, mi):
+    print "calling to_ovrdrv_data from inside get_social_metadata"
+    ovrdrv_data = to_ovrdrv_data(br, title, authors, isbn, ovrdrv_id)
+
+    #[cover_url, social_metadata_url, worldcatlink, series, series_num, publisher, creators, reserveid, title]
+
+    if len(ovrdrv_data[3]) > 1:
+        mi.series = ovrdrv_data[3]
+        if ovrdrv_data[4]:
+            mi.series_index = ovrdrv_data[4]
+    mi.publisher = ovrdrv_data[5]
+    mi.authors = ovrdrv_data[6]
+    if ovrdrv_id is None:
+        ovrdrv_id = ovrdrv_data[7]
+    mi.set_identifier('overdrive', ovrdrv_id)
+    mi.title = ovrdrv_data[8]
+
+    if ovrdrv_data and get_metadata_detail(br, ovrdrv_data[1], mi, isbn):
         return mi
-    #from calibre.ebooks.metadata.xisbn import xisbn
-    #for i in xisbn.get_associated_isbns(isbn):
-    #    print "xisbn isbn is "+str(i)
-    #    ovrdrv_data = to_ovrdrv_data(br, title, author, i)
-    #    if ovrdrv_data and get_metadata_detail(br, ovrdrv_data, mi):
-    #        return mi
     return mi
 
 
-def get_cover_url(isbn, title, author, br):
+def get_cover_url(isbn, title, author, br, ovrdrv_id=None):
     print "starting get_cover_url"
-    isbn = check_isbn(isbn)
-    print "isbn is "+str(isbn)
     print "title is "+str(title)
     print "author is "+str(author[0])
+    print "isbn is "+str(isbn)
+    print "ovrdrv_id is "+str(ovrdrv_id)
+
     with cache_lock:
         ans = cover_url_cache.get(isbn, None)
+        #ans = cover_url_cache.get(ovrdrv_id, None)
     if ans:
-        print "ans returned positive"
+        print "cover url cache lookup returned positive, ans is "+str(ans)
         return ans
     if ans is False:
-        "ans returned false"
+        "cover url cache lookup returned false"
        return None
-    print "in get_cover_url, running through ovrdrv_data function"
-    ovrdrv_data = to_ovrdrv_data(br, title, author, isbn)
-    print "ovrdrv_id is "+str(ovrdrv_data)
+    print "in get_cover_url, calling to_ovrdrv_data function"
+    ovrdrv_data = to_ovrdrv_data(br, title, author, isbn, ovrdrv_id)
     if ovrdrv_data:
         ans = ovrdrv_data[0]
-        print "inside get_cover_url, ans is "+str(ans)
+        print "inside get_cover_url, got url from to_ovrdrv_data, ans is "+str(ans)
         if ans:
+            print "writing cover url to url cache"
             with cache_lock:
                 cover_url_cache[isbn] = ans
+                #cover_url_cache[ovrdrv_id] = ans
             return ans
-    #from calibre.ebooks.metadata.xisbn import xisbn
-    #for i in xisbn.get_associated_isbns(isbn):
-    #    print "in get_cover_url, using xisbn list to associate other books"
-    #    ovrdrv_data = to_ovrdrv_data(br, title, author, i)
-    #    if ovrdrv_data:
-    #        ans = _get_cover_url(br, ovrdrv_data)
-    #        if ans:
-    #            with cache_lock:
-    #                cover_url_cache[isbn] = ans
-    #                cover_url_cache[i] = ans
-    #            return ans
     with cache_lock:
+        print "marking cover url cache for this isbn false"
         cover_url_cache[isbn] = False
     return None
 
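Both ovrdrv_data_cache and cover_url_cache follow a three-state convention under cache_lock: a missing key means never tried, False means a previous attempt failed (so callers return None without re-querying), and any other value is a hit. The pattern, factored out (cached_fetch and compute are illustrative names, not part of the commit):

    from threading import RLock

    cache_lock = RLock()
    cache = {}

    def cached_fetch(key, compute):
        with cache_lock:
            ans = cache.get(key, None)
        if ans:
            return ans          # hit
        if ans is False:
            return None         # known failure, do not retry
        ans = compute(key)
        with cache_lock:
            cache[key] = ans if ans else False
        return ans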
@@ -303,18 +338,14 @@ def _get_cover_url(br, ovrdrv_data):
             return ('/'.join(parts[:-1]))+'/'+bn
     return None
 
-def get_metadata_detail(br, ovrdrv_data, mi):
-    q = ovrdrv_data[2]
+def get_metadata_detail(br, metadata_url, mi, isbn=None):
     try:
-        raw = br.open_novisit(q).read()
+        raw = br.open_novisit(metadata_url).read()
     except Exception, e:
         if callable(getattr(e, 'getcode', None)) and \
                 e.getcode() == 404:
             return False
         raise
-    if '<title>404 - ' in raw:
-        return False
     raw = xml_to_unicode(raw, strip_encoding_pats=True,
             resolve_entities=True)[0]
     try:
@@ -322,26 +353,28 @@ def get_metadata_detail(br, ovrdrv_data, mi):
     except:
         return False
 
-    # Check for series name and retrieve it
-    series_name = root.xpath("//td/script[re:test(text(), 'szSeries', 'i')]",
-            namespaces={"re": "http://exslt.org/regular-expressions"})
-    if series_name:
-        series = html.tostring(series_name[0], method='html', encoding=unicode).strip()
-        series = re.sub('(?s).*?szSeries\s*=\s*\"(?P<series>.*?)\";.*', '\g<series>', series)
-        if len(series) > 1:
-            mi.series = series
-            # If series was successful attempt to get the series number
-            series_num = root.xpath("//div/strong[re:test(text(), ',\s(Book|Part|Volume)')]",
-                    namespaces={"re": "http://exslt.org/regular-expressions"})
-            if series_num:
-                series_num = float(re.sub('(?s).*?,\s*(Book|Part|Volume)\s*(?P<num>\d+).*', '\g<num>',
-                    etree.tostring(series_num[0])))
-                if series_num >= 1:
-                    mi.series_index = series_num
-                    print "series_num is "+str(series_num)
+    isbn = check_isbn(isbn)
 
-    desc = root.xpath("//td[@class='collection' and re:test(., 'Description', 'i')]/following::div[1]",
-            namespaces={"re": "http://exslt.org/regular-expressions"})
+    pub_date = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblPubDate']/text()")
+    lang = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblLanguage']/text()")
+    subjects = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblSubjects']/text()")
+    ebook_isbn = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblIdentifier']/text()")
+    desc = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblDescription']/ancestor::div[1]")
+
+    if pub_date:
+        from calibre.utils.date import parse_date
+        mi.pubdate = parse_date(pub_date[0].strip())
+    if lang:
+        mi.language = lang[0].strip()
+        print "languages is "+str(mi.language)
+    if ebook_isbn and isbn is None:
+        print "ebook isbn is "+str(ebook_isbn[0])
+        mi.set_identifier('isbn', ebook_isbn)
+    #elif isbn is not None:
+    #    mi.set_identifier('isbn', isbn)
+    if subjects:
+        mi.tags = subjects
+        print "tags are "+str(mi.tags)
     if desc:
         desc = desc[0]
         desc = html.tostring(desc, method='html', encoding=unicode).strip()
@@ -351,36 +384,6 @@ def get_metadata_detail(br, ovrdrv_data, mi):
         desc = re.sub(r'(?s)<!--.*?-->', '', desc)
         mi.comments = sanitize_comments_html(desc)
 
-    publisher = root.xpath("//td/strong[re:test(text(), 'Publisher\:', 'i')]/ancestor::td[1]/following-sibling::td/text()",
-            namespaces={"re": "http://exslt.org/regular-expressions"})
-    if publisher:
-        mi.publisher = re.sub('^\s*(?P<pub>.*?)\s*$', '\g<pub>', publisher[0])
-        print "publisher is "+str(mi.publisher)
-
-    lang = root.xpath("//td/strong[re:test(text(), 'Language\(s\):', 'i')]/ancestor::td[1]/following-sibling::td/text()",
-            namespaces={"re": "http://exslt.org/regular-expressions"})
-    if lang:
-        mi.language = re.sub('^\s*(?P<lang>.*?)\s*$', '\g<lang>', lang[0])
-        print "languages is "+str(mi.language)
-
-    isbn = root.xpath("//tr/td[re:test(text(), 'ISBN:', 'i')]/following::td/text()",
-            namespaces={"re": "http://exslt.org/regular-expressions"})
-    if isbn:
-        mi.isbn = re.sub('^\s*(?P<isbn>.*?)\s*$', '\g<isbn>', isbn[0])
-        print "ISBN is "+str(mi.isbn)
-
-    subjects = root.xpath("//td/strong[re:test(text(), 'Subject', 'i')]/ancestor::td[1]/following-sibling::td/a/text()",
-            namespaces={"re": "http://exslt.org/regular-expressions"})
-    if subjects:
-        mi.tags = subjects
-        print "tags are "+str(mi.tags)
-
-    creators = root.xpath("//table/tr/td[re:test(text(), '\s*by', 'i')]/ancestor::tr[1]/td[2]/table/tr/td/a/text()",
-            namespaces={"re": "http://exslt.org/regular-expressions"})
-    if creators:
-        print "authors are "+str(creators)
-        mi.authors = creators
-
     return True
 
 def main(args=sys.argv):
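The rewritten get_metadata_detail drops the brittle regex-over-layout xpaths in favor of the stable ASP.NET control ids on Overdrive's TitleInfo.aspx page, so each lookup reduces to an exact id match. For instance (the markup sample is hypothetical; the xpath is the one used above):

    from lxml import html

    raw = '<div><label id="ctl00_ContentPlaceHolder1_lblPubDate"> 01/08/2009 </label></div>'
    root = html.fromstring(raw)
    pub_date = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblPubDate']/text()")
    if pub_date:
        print pub_date[0].strip()   # 01/08/2009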
@@ -388,19 +391,26 @@ def main(args=sys.argv):
     import tempfile, os, time
     tdir = tempfile.gettempdir()
     br = browser()
-    for isbn, title, author in [
-        #('0899661343', 'On the Road', ['Jack Kerouac']), # basic test, no series, single author
-        #('9780061952838', 'The Fellowship of the Ring', ['J. R. R. Tolkien']), # Series test, multi-author
-        ('9780061952838', 'The Two Towers', ['J. R. R. Tolkien']), # Series test, book 2
-        #('9780345505057', 'Deluge', ['Anne McCaffrey']) # Multiple authors
-        #('', 'Deluge', ['Anne McCaffrey']) # Empty ISBN
-        #(None, 'On the Road', ['Jack Kerouac']) # Nonetype ISBN
+    for ovrdrv_id, isbn, title, author in [
+        #(None, '0899661343', 'On the Road', ['Jack Kerouac']), # basic test, no series, single author
+        #(None, '9780061952838', 'The Fellowship of the Ring', ['J. R. R. Tolkien']), # Series test, multi-author
+        #(None, '9780061952838', 'The Two Towers', ['J. R. R. Tolkien']), # Series test, book 2
+        #('57844706-20fa-4ace-b5ee-3470b1b52173', None, 'The Two Towers', ['J. R. R. Tolkien']), # Series test, w/ ovrdrv id
+        #(None, '9780345505057', 'Deluge', ['Anne McCaffrey']) # Multiple authors
+        #(None, None, 'Deluge', ['Anne McCaffrey']) # Empty ISBN
+        #(None, None, 'On the Road', ['Jack Kerouac']), # Nonetype ISBN
+        #(None, '9780345435279', 'A Caress of Twilight', ['Laurell K. Hamilton']),
+        #(None, '9780606087230', 'The Omnivore\'s Dilemma : A Natural History of Four Meals', ['Michael Pollan']), # Subtitle colon
+        #(None, '9780061747649', 'Mental_Floss Presents: Condensed Knowledge', ['Will Pearson', 'Mangesh Hattikudur']),
+        #(None, '9781400050802', 'The Zombie Survival Guide', ['Max Brooks']), # Two books with this title by this author
+        #(None, '9781775414315', 'The Worst Journey in the World / Antarctic 1910-1913', ['Apsley Cherry-Garrard']), # Garbage sub-title
+        (None, '9780440335160', 'Outlander', ['Diana Gabaldon']), # Returns lots of results to sort through to get the best match
         ]:
         cpath = os.path.join(tdir, title+'.jpg')
         print "cpath is "+cpath
         st = time.time()
-        curl = get_cover_url(isbn, title, author, br)
-        print '\n\n Took ', time.time() - st, ' to get metadata\n\n'
+        curl = get_cover_url(isbn, title, author, br, ovrdrv_id)
+        print '\n\n Took ', time.time() - st, ' to get basic metadata\n\n'
         if curl is None:
             print 'No cover found for', title
         else:
|
|||||||
#open(cpath, 'wb').write(br.open_novisit(curl).read())
|
#open(cpath, 'wb').write(br.open_novisit(curl).read())
|
||||||
#print 'Cover for', title, 'saved to', cpath
|
#print 'Cover for', title, 'saved to', cpath
|
||||||
|
|
||||||
#import time
|
print get_social_metadata(title, author, isbn, ovrdrv_id)
|
||||||
|
|
||||||
#print get_social_metadata(title, author, None, isbn)
|
|
||||||
#print '\n\n', time.time() - st, '\n\n'
|
#print '\n\n', time.time() - st, '\n\n'
|
||||||
|
|
||||||
return 0
|
return 0
|
||||||
|
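Each harness iteration now exercises both entry points with the same tuple: the cover fetch first, then the social metadata. In isolation, with the one active Outlander entry from the list above:

    ovrdrv_id, isbn, title, author = (None, '9780440335160', 'Outlander', ['Diana Gabaldon'])
    curl = get_cover_url(isbn, title, author, br, ovrdrv_id)
    print get_social_metadata(title, author, isbn, ovrdrv_id)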
@@ -87,32 +87,40 @@ class Source(Plugin):
 
         if authors:
             # Leave ' in there for Irish names
-            pat = re.compile(r'[-,:;+!@#$%^&*(){}.`~"\s\[\]/]')
+            remove_pat = re.compile(r'[,:;!@#$%^&*(){}.`~"\s\[\]/]')
+            replace_pat = re.compile(r'-+')
             if only_first_author:
                 authors = authors[:1]
             for au in authors:
+                au = replace_pat.sub(' ', au)
                 parts = au.split()
                 if ',' in au:
                     # au probably in ln, fn form
                     parts = parts[1:] + parts[:1]
                 for tok in parts:
-                    tok = pat.sub('', tok).strip()
+                    tok = remove_pat.sub('', tok).strip()
                     if len(tok) > 2 and tok.lower() not in ('von', ):
                         yield tok
 
 
-    def get_title_tokens(self, title):
+    def get_title_tokens(self, title, strip_joiners=True):
         '''
         Take a title and return a list of tokens useful for an AND search query.
         Excludes connectives and punctuation.
         '''
         if title:
-            pat = re.compile(r'''[-,:;+!@#$%^&*(){}.`~"'\s\[\]/]''')
+            # strip sub-titles
+            subtitle = re.compile(r'([\(\[\{].*?[\)\]\}]|[/:\\].*$)')
+            if len(subtitle.sub('', title)) > 1:
+                title = subtitle.sub('', title)
+            pat = re.compile(r'''([-,:;+!@#$%^&*(){}.`~"\s\[\]/]|'(?!s))''')
             title = pat.sub(' ', title)
             tokens = title.split()
             for token in tokens:
                 token = token.strip()
-                if token and token.lower() not in ('a', 'and', 'the'):
+                if token and token.lower() not in ('a', 'and', 'the') and strip_joiners:
+                    yield token
+                elif token:
                     yield token
 
     def split_jobs(self, jobs, num):
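get_title_tokens now strips sub-titles before tokenizing: bracketed spans and anything after a slash, colon, or backslash are dropped, and the new pat keeps an apostrophe only when it is followed by s, so possessives survive tokenization. The sub-title regex in isolation (sample titles come from the test harness above):

    import re

    subtitle = re.compile(r'([\(\[\{].*?[\)\]\}]|[/:\\].*$)')
    print subtitle.sub('', "The Omnivore's Dilemma : A Natural History of Four Meals")
    # The Omnivore's Dilemma
    print subtitle.sub('', 'The Worst Journey in the World / Antarctic 1910-1913')
    # The Worst Journey in the World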