diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py
index f4a8c6b6bc..75c02c7e00 100644
--- a/src/calibre/customize/builtins.py
+++ b/src/calibre/customize/builtins.py
@@ -633,14 +633,14 @@ if test_eight_code:
# }}}
else:
from calibre.ebooks.metadata.fetch import GoogleBooks, ISBNDB, Amazon, \
- KentDistrictLibrary, Overdrive
+ KentDistrictLibrary
from calibre.ebooks.metadata.douban import DoubanBooks
from calibre.ebooks.metadata.nicebooks import NiceBooks, NiceBooksCovers
from calibre.ebooks.metadata.covers import OpenLibraryCovers, \
- AmazonCovers, DoubanCovers, OverdriveCovers
+ AmazonCovers, DoubanCovers
- plugins += [GoogleBooks, ISBNDB, Amazon, Overdrive,
- OpenLibraryCovers, AmazonCovers, DoubanCovers, OverdriveCovers,
+ plugins += [GoogleBooks, ISBNDB, Amazon,
+ OpenLibraryCovers, AmazonCovers, DoubanCovers,
NiceBooksCovers, KentDistrictLibrary, DoubanBooks, NiceBooks]
plugins += [
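The old-style Overdrive plugins are unregistered here because the rewritten downloader (the `test_eight_code` branch above) uses the new Source API. A minimal sketch of how the replacement class, added below in src/calibre/ebooks/metadata/sources/overdrive.py, could be registered in that branch; whether this commit wires it up itself is an assumption:

    # hypothetical registration in the test_eight_code branch of builtins.py
    from calibre.ebooks.metadata.sources.overdrive import OverDrive
    plugins += [OverDrive]
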
diff --git a/src/calibre/ebooks/metadata/covers.py b/src/calibre/ebooks/metadata/covers.py
index f705317f59..10acff4e61 100644
--- a/src/calibre/ebooks/metadata/covers.py
+++ b/src/calibre/ebooks/metadata/covers.py
@@ -151,33 +151,6 @@ class AmazonCovers(CoverDownload): # {{{
# }}}
-class OverdriveCovers(CoverDownload): # {{{
-
- name = 'overdrive.com covers'
- description = _('Download covers from Overdrive')
- author = 'Kovid Goyal'
-
-
- def has_cover(self, mi, ans, timeout=5.):
- if not mi.authors or not mi.title:
- return False
- return True
-
- def get_covers(self, mi, result_queue, abort, timeout=5.):
- if not mi.isbn:
- return
- from calibre.ebooks.metadata.overdrive import get_cover_url
- br = browser()
- try:
- url = get_cover_url(mi.isbn, mi.title, mi.authors, br)
- cover_data = br.open_novisit(url).read()
- result_queue.put((True, cover_data, 'jpg', self.name))
- except Exception, e:
- result_queue.put((False, self.exception_to_string(e),
- traceback.format_exc(), self.name))
-
-# }}}
-
def check_for_cover(mi, timeout=5.): # {{{
from calibre.customize.ui import cover_sources
ans = Event()
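The deleted class used the CoverDownload result-queue protocol that the remaining cover plugins above still use: a 4-tuple of (ok, cover data or error string, file extension or traceback, plugin name). A minimal consumer sketch of that protocol (the helper name is illustrative, not from the source):

    from Queue import Queue, Empty

    def collect_covers(result_queue):
        # drain the queue, splitting successes from failures
        covers, errors = [], []
        try:
            while True:
                ok, data, info, who = result_queue.get_nowait()
                if ok:
                    covers.append((who, data, info))   # info is the extension, e.g. 'jpg'
                else:
                    errors.append((who, data, info))   # info is the formatted traceback
        except Empty:
            pass
        return covers, errors
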
diff --git a/src/calibre/ebooks/metadata/fetch.py b/src/calibre/ebooks/metadata/fetch.py
index fb01c5dd71..e1fac50d16 100644
--- a/src/calibre/ebooks/metadata/fetch.py
+++ b/src/calibre/ebooks/metadata/fetch.py
@@ -250,27 +250,6 @@ class Amazon(MetadataSource): # {{{
# }}}
-class Overdrive(MetadataSource): # {{{
-
- name = 'Overdrive'
- metadata_type = 'social'
- description = _('Downloads metadata from the Overdrive library network')
-
- has_html_comments = True
-
- def fetch(self):
- if not self.isbn:
- return
- from calibre.ebooks.metadata.overdrive import get_social_metadata
- try:
- self.results = get_social_metadata(self.title, self.book_author, self.isbn)
-
- except Exception, e:
- self.exception = e
- self.tb = traceback.format_exc()
-
- # }}}
-
class KentDistrictLibrary(MetadataSource): # {{{
name = 'Kent District Library'
diff --git a/src/calibre/ebooks/metadata/overdrive.py b/src/calibre/ebooks/metadata/overdrive.py
deleted file mode 100644
index 38d6d730ff..0000000000
--- a/src/calibre/ebooks/metadata/overdrive.py
+++ /dev/null
@@ -1,459 +0,0 @@
-#!/usr/bin/env python
-__license__ = 'GPL v3'
-__copyright__ = '2008, Kovid Goyal <kovid@kovidgoyal.net>'
-__docformat__ = 'restructuredtext en'
-
-'''
-Fetch metadata using Overdrive Content Reserve
-'''
-import sys, re, random, urllib, mechanize, copy
-from threading import RLock
-
-from lxml import html, etree
-from lxml.html import soupparser
-
-from calibre import browser
-from calibre.ebooks.metadata import check_isbn
-from calibre.ebooks.metadata.sources.base import Source
-from calibre.ebooks.metadata.book.base import Metadata
-from calibre.ebooks.chardet import xml_to_unicode
-from calibre.library.comments import sanitize_comments_html
-
-ovrdrv_data_cache = {}
-cover_url_cache = {}
-cache_lock = RLock()
-base_url = 'http://search.overdrive.com/'
-
-
-def create_query(self, title=None, authors=None, identifiers={}):
- q = ''
- if title or authors:
- def build_term(prefix, parts):
- return ' '.join('in'+prefix + ':' + x for x in parts)
- title_tokens = list(self.get_title_tokens(title, False))
- if title_tokens:
- q += build_term('title', title_tokens)
- author_tokens = self.get_author_tokens(authors,
- only_first_author=True)
- if author_tokens:
- q += ('+' if q else '') + build_term('author',
- author_tokens)
-
- if isinstance(q, unicode):
- q = q.encode('utf-8')
- if not q:
- return None
- return BASE_URL+urlencode({
- 'q':q,
- })
-
-
-def get_base_referer():
- choices = [
- 'http://overdrive.chipublib.org/82DC601D-7DDE-4212-B43A-09D821935B01/10/375/en/',
- 'http://emedia.clevnet.org/9D321DAD-EC0D-490D-BFD8-64AE2C96ECA8/10/241/en/',
- 'http://singapore.lib.overdrive.com/F11D55BE-A917-4D63-8111-318E88B29740/10/382/en/',
- 'http://ebooks.nypl.org/20E48048-A377-4520-BC43-F8729A42A424/10/257/en/',
- 'http://spl.lib.overdrive.com/5875E082-4CB2-4689-9426-8509F354AFEF/10/335/en/'
- ]
- return choices[random.randint(0, len(choices)-1)]
-
-def format_results(reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid):
- fix_slashes = re.compile(r'\\/')
- thumbimage = fix_slashes.sub('/', thumbimage)
- worldcatlink = fix_slashes.sub('/', worldcatlink)
-    cover_url = re.sub('(?P<img>(Ima?g(eType-)?))200', '\g<img>100', thumbimage)
- social_metadata_url = base_url+'TitleInfo.aspx?ReserveID='+reserveid+'&FormatID='+formatid
- series_num = ''
- if not series:
- if subtitle:
- title = od_title+': '+subtitle
- else:
- title = od_title
- else:
- title = od_title
- m = re.search("([0-9]+$)", subtitle)
- if m:
- series_num = float(m.group(1))
- return [cover_url, social_metadata_url, worldcatlink, series, series_num, publisher, creators, reserveid, title]
-
-def safe_query(br, query_url):
- '''
-    The query must be initialized by loading an empty search results page;
-    that page attempts to set a cookie that Mechanize doesn't like, so we
-    copy the cookiejar to a separate instance and make a one-off request with the temp cookiejar.
- '''
- goodcookies = br._ua_handlers['_cookies'].cookiejar
- clean_cj = mechanize.CookieJar()
- cookies_to_copy = []
- for cookie in goodcookies:
- copied_cookie = copy.deepcopy(cookie)
- cookies_to_copy.append(copied_cookie)
- for copied_cookie in cookies_to_copy:
- clean_cj.set_cookie(copied_cookie)
-
- br.open_novisit(query_url)
-
- br.set_cookiejar(clean_cj)
-
-
-def overdrive_search(br, q, title, author):
- q_query = q+'default.aspx/SearchByKeyword'
- q_init_search = q+'SearchResults.aspx'
- # get first author as string - convert this to a proper cleanup function later
- s = Source(None)
- print "printing list with string:"
- #print list(s.get_author_tokens(['J. R. R. Tolkien']))
- print "printing list with author "+str(author)+":"
- print list(s.get_author_tokens(author))
- author_tokens = list(s.get_author_tokens(author))
- print "there are "+str(len(author_tokens))+" author tokens"
- for token in author_tokens:
- print "cleaned up author token is: "+str(token)
-
-
- title_tokens = list(s.get_title_tokens(title))
- print "there are "+str(len(title_tokens))+" title tokens"
- for token in title_tokens:
- print "cleaned up title token is: "+str(token)
-
- if len(title_tokens) >= len(author_tokens):
- initial_q = ' '.join(title_tokens)
- xref_q = '+'.join(author_tokens)
- else:
- initial_q = ' '.join(author_tokens)
- xref_q = '+'.join(title_tokens)
-
- print "initial query is "+str(initial_q)
- print "cross reference query is "+str(xref_q)
- q_xref = q+'SearchResults.svc/GetResults?iDisplayLength=50&sSearch='+xref_q
- query = '{"szKeyword":"'+initial_q+'"}'
-
- # main query, requires specific Content Type header
- req = mechanize.Request(q_query)
- req.add_header('Content-Type', 'application/json; charset=utf-8')
- br.open_novisit(req, query)
-
- print "q_init_search is "+q_init_search
- # initiate the search without messing up the cookiejar
- safe_query(br, q_init_search)
-
- # get the search results object
- results = False
- while results == False:
- xreq = mechanize.Request(q_xref)
- xreq.add_header('X-Requested-With', 'XMLHttpRequest')
- xreq.add_header('Referer', q_init_search)
- xreq.add_header('Accept', 'application/json, text/javascript, */*')
- raw = br.open_novisit(xreq).read()
- print "overdrive search result is:\n"+raw
-        for m in re.finditer(ur'"iTotalDisplayRecords":(?P<displayrecords>\d+).*?"iTotalRecords":(?P<totalrecords>\d+)', raw):
- if int(m.group('displayrecords')) >= 1:
- results = True
- elif int(m.group('totalrecords')) >= 1:
- xref_q = ''
- q_xref = q+'SearchResults.svc/GetResults?iDisplayLength=50&sSearch='+xref_q
- elif int(m.group('totalrecords')) == 0:
- return ''
-
- print "\n\nsorting results"
- return sort_ovrdrv_results(raw, title, title_tokens, author, author_tokens)
-
-
-def sort_ovrdrv_results(raw, title=None, title_tokens=None, author=None, author_tokens=None, ovrdrv_id=None):
- print "\ntitle to search for is "+str(title)+"\nauthor to search for is "+str(author)
- close_matches = []
-    raw = re.sub('.*?\[\[(?P<content>.*?)\]\].*', '[[\g<content>]]', raw)
- results = eval(raw)
- print "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n"
- #print results
- # The search results are either from a keyword search or a multi-format list from a single ID,
- # sort through the results for closest match/format
- if results:
- for reserveid, od_title, subtitle, edition, series, publisher, format, formatid, creators, \
- thumbimage, shortdescription, worldcatlink, excerptlink, creatorfile, sorttitle, \
- availabletolibrary, availabletoretailer, relevancyrank, unknown1, unknown2, unknown3 in results:
- print "this record's title is "+od_title+", subtitle is "+subtitle+", author[s] are "+creators+", series is "+series
- if ovrdrv_id is not None and int(formatid) in [1, 50, 410, 900]:
- print "overdrive id is not None, searching based on format type priority"
- return format_results(reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid)
- else:
- creators = creators.split(', ')
- print "split creators from results are: "+str(creators)
- # if an exact match in a preferred format occurs
- if creators[0] == author[0] and od_title == title and int(formatid) in [1, 50, 410, 900]:
- print "Got Exact Match!!!"
- return format_results(reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid)
- else:
- close_title_match = False
- close_author_match = False
- print "format id is "+str(formatid)
- for token in title_tokens:
- print "attempting to find "+str(token)+" title token"
- if od_title.lower().find(token.lower()) != -1:
- print "matched token"
- close_title_match = True
- else:
- print "token didn't match"
- close_title_match = False
- break
- for token in author_tokens:
- print "attempting to find "+str(token)+" author token"
- if creators[0].lower().find(token.lower()) != -1:
- print "matched token"
- close_author_match = True
- else:
- print "token didn't match"
- close_author_match = False
- break
- if close_title_match and close_author_match and int(formatid) in [1, 50, 410, 900]:
- if subtitle and series:
- close_matches.insert(0, format_results(reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid))
- else:
- close_matches.append(format_results(reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid))
- if close_matches:
- return close_matches[0]
- else:
- return ''
- else:
- return ''
-
-
-
-def overdrive_get_record(br, q, ovrdrv_id):
- search_url = q+'SearchResults.aspx?ReserveID={'+ovrdrv_id+'}'
- results_url = q+'SearchResults.svc/GetResults?sEcho=1&iColumns=18&sColumns=ReserveID%2CTitle%2CSubtitle%2CEdition%2CSeries%2CPublisher%2CFormat%2CFormatID%2CCreators%2CThumbImage%2CShortDescription%2CWorldCatLink%2CExcerptLink%2CCreatorFile%2CSortTitle%2CAvailableToLibrary%2CAvailableToRetailer%2CRelevancyRank&iDisplayStart=0&iDisplayLength=10&sSearch=&bEscapeRegex=true&iSortingCols=1&iSortCol_0=17&sSortDir_0=asc'
-
- # get the base url to set the proper session cookie
- br.open_novisit(q)
-
- # initialize the search
- safe_query(br, search_url)
-
- # get the results
- req = mechanize.Request(results_url)
- req.add_header('X-Requested-With', 'XMLHttpRequest')
- req.add_header('Referer', search_url)
- req.add_header('Accept', 'application/json, text/javascript, */*')
- raw = br.open_novisit(req)
- raw = str(list(raw))
- return sort_ovrdrv_results(raw, None, None, None, ovrdrv_id)
-
-
-def find_ovrdrv_data(br, title, author, isbn, ovrdrv_id=None):
- print "in find_ovrdrv_data, title is "+str(title)+", author is "+str(author)+", overdrive id is "+str(ovrdrv_id)
- q = base_url
- if ovrdrv_id is None:
- return overdrive_search(br, q, title, author)
- else:
- return overdrive_get_record(br, q, ovrdrv_id)
-
-
-
-def to_ovrdrv_data(br, title, author, isbn, ovrdrv_id=None):
- print "starting to_ovrdrv_data"
- with cache_lock:
- ans = ovrdrv_data_cache.get(isbn, None)
- if ans:
- print "inside to_ovrdrv_data, cache lookup successful, ans is "+str(ans)
- return ans
- if ans is False:
- print "inside to_ovrdrv_data, ans returned False"
- return None
- try:
- print "trying to retrieve data, running find_ovrdrv_data"
- ovrdrv_data = find_ovrdrv_data(br, title, author, isbn, ovrdrv_id)
- print "ovrdrv_data is "+str(ovrdrv_data)
- except:
- import traceback
- traceback.print_exc()
- ovrdrv_data = None
-
- with cache_lock:
- ovrdrv_data_cache[isbn] = ovrdrv_data if ovrdrv_data else False
- if ovrdrv_data:
- from calibre.ebooks.metadata.xisbn import xisbn
- for i in xisbn.get_associated_isbns(isbn):
- with cache_lock:
- ovrdrv_data_cache[i] = ovrdrv_data
-
- return ovrdrv_data
-
-
-def get_social_metadata(title, authors, isbn, ovrdrv_id=None):
- author = authors[0]
- mi = Metadata(title, authors)
- br = browser()
- print "calling to_ovrdrv_data from inside get_social_metadata"
- ovrdrv_data = to_ovrdrv_data(br, title, authors, isbn, ovrdrv_id)
-
- #[cover_url, social_metadata_url, worldcatlink, series, series_num, publisher, creators, reserveid, title]
-
- if len(ovrdrv_data[3]) > 1:
- mi.series = ovrdrv_data[3]
- if ovrdrv_data[4]:
- mi.series_index = ovrdrv_data[4]
- mi.publisher = ovrdrv_data[5]
- mi.authors = ovrdrv_data[6]
- if ovrdrv_id is None:
- ovrdrv_id = ovrdrv_data[7]
- mi.set_identifier('overdrive', ovrdrv_id)
- mi.title = ovrdrv_data[8]
- print "populated basic social metadata, getting detailed metadata"
- if ovrdrv_data and get_metadata_detail(br, ovrdrv_data[1], mi, isbn):
- return mi
- print "failed to get detailed metadata, returning basic info"
- return mi
-
-def get_cover_url(isbn, title, author, br, ovrdrv_id=None):
- print "starting get_cover_url"
- print "title is "+str(title)
- print "author is "+str(author[0])
- print "isbn is "+str(isbn)
- print "ovrdrv_id is "+str(ovrdrv_id)
-
- with cache_lock:
- ans = cover_url_cache.get(isbn, None)
- #ans = cover_url_cache.get(ovrdrv_id, None)
- if ans:
- print "cover url cache lookup returned positive, ans is "+str(ans)
- return ans
- if ans is False:
- "cover url cache lookup returned false"
- return None
- print "in get_cover_url, calling to_ovrdrv_data function"
- ovrdrv_data = to_ovrdrv_data(br, title, author, isbn, ovrdrv_id)
- if ovrdrv_data:
- ans = ovrdrv_data[0]
- print "inside get_cover_url, got url from to_ovrdrv_data, ans is "+str(ans)
- if ans:
- print "writing cover url to url cache"
- with cache_lock:
- cover_url_cache[isbn] = ans
- #cover_url_cache[ovrdrv_id] = ans
- return ans
-
- with cache_lock:
- print "marking cover url cache for this isbn false"
- cover_url_cache[isbn] = False
- return None
-
-def _get_cover_url(br, ovrdrv_data):
- q = ovrdrv_data[1]
- try:
- raw = br.open_novisit(q).read()
- except Exception, e:
- if callable(getattr(e, 'getcode', None)) and \
- e.getcode() == 404:
- return None
- raise
- if '404 - ' in raw:
- return None
- raw = xml_to_unicode(raw, strip_encoding_pats=True,
- resolve_entities=True)[0]
- try:
- root = soupparser.fromstring(raw)
- except:
- return False
-
- imgs = root.xpath('//img[@id="prodImage" and @src]')
- if imgs:
- src = imgs[0].get('src')
- parts = src.split('/')
- if len(parts) > 3:
- bn = parts[-1]
- sparts = bn.split('_')
- if len(sparts) > 2:
- bn = sparts[0] + sparts[-1]
- return ('/'.join(parts[:-1]))+'/'+bn
- return None
-
-def get_metadata_detail(br, metadata_url, mi, isbn=None):
- try:
- raw = br.open_novisit(metadata_url).read()
- except Exception, e:
- if callable(getattr(e, 'getcode', None)) and \
- e.getcode() == 404:
- return False
- raise
- raw = xml_to_unicode(raw, strip_encoding_pats=True,
- resolve_entities=True)[0]
- try:
- root = soupparser.fromstring(raw)
- except:
- return False
-
- isbn = check_isbn(isbn)
-
- pub_date = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblPubDate']/text()")
- lang = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblLanguage']/text()")
- subjects = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblSubjects']/text()")
- ebook_isbn = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblIdentifier']/text()")
- desc = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblDescription']/ancestor::div[1]")
-
- if pub_date:
- from calibre.utils.date import parse_date
- mi.pubdate = parse_date(pub_date[0].strip())
- if lang:
- mi.language = lang[0].strip()
- print "languages is "+str(mi.language)
- if ebook_isbn and isbn is None:
- print "ebook isbn is "+str(ebook_isbn[0])
- mi.set_identifier('isbn', ebook_isbn)
- #elif isbn is not None:
- # mi.set_identifier('isbn', isbn)
- if subjects:
- mi.tags = [tag.strip() for tag in subjects[0].split(',')]
- print "tags are "+str(mi.tags)
- if desc:
- desc = desc[0]
- desc = html.tostring(desc, method='html', encoding=unicode).strip()
- # remove all attributes from tags
- desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
- # Remove comments
-        desc = re.sub(r'(?s)<!--.*?-->', '', desc)
- mi.comments = sanitize_comments_html(desc)
-
- return True
-
-def main(args=sys.argv):
- print "running through main tests"
- import tempfile, os, time
- tdir = tempfile.gettempdir()
- br = browser()
- for ovrdrv_id, isbn, title, author in [
- #(None, '0899661343', 'On the Road', ['Jack Kerouac']), # basic test, no series, single author
- #(None, '9780061952838', 'The Fellowship of the Ring', ['J. R. R. Tolkien']), # Series test, multi-author
- #(None, '9780061952838', 'The Two Towers (The Lord of the Rings, Book II)', ['J. R. R. Tolkien']), # Series test, book 2
- #(None, '9780618153985', 'The Fellowship of the Ring (The Lord of the Rings, Part 1)', ['J.R.R. Tolkien']),
- #('57844706-20fa-4ace-b5ee-3470b1b52173', None, 'The Two Towers', ['J. R. R. Tolkien']), # Series test, w/ ovrdrv id
- #(None, '9780345505057', 'Deluge', ['Anne McCaffrey']) # Multiple authors
- #(None, None, 'Deluge', ['Anne McCaffrey']) # Empty ISBN
- #(None, None, 'On the Road', ['Jack Kerouac']), # Nonetype ISBN
- #(None, '9780345435279', 'A Caress of Twilight', ['Laurell K. Hamilton']),
- #(None, '9780606087230', 'The Omnivore\'s Dilemma : A Natural History of Four Meals', ['Michael Pollan']), # Subtitle colon
- #(None, '9780061747649', 'Mental_Floss Presents: Condensed Knowledge', ['Will Pearson', 'Mangesh Hattikudur']),
- #(None, '9781400050802', 'The Zombie Survival Guide', ['Max Brooks']), # Two books with this title by this author
- #(None, '9781775414315', 'The Worst Journey in the World / Antarctic 1910-1913', ['Apsley Cherry-Garrard']), # Garbage sub-title
- #(None, '9780440335160', 'Outlander', ['Diana Gabaldon']), # Returns lots of results to sort through to get the best match
- (None, '9780345509741', 'The Horror Stories of Robert E. Howard', ['Robert E. Howard']), # Complex title with initials/dots stripped, some results don't have a cover
- ]:
- cpath = os.path.join(tdir, title+'.jpg')
- print "cpath is "+cpath
- st = time.time()
- curl = get_cover_url(isbn, title, author, br, ovrdrv_id)
- print '\n\n Took ', time.time() - st, ' to get basic metadata\n\n'
- if curl is None:
- print 'No cover found for', title
- else:
- print "curl is "+curl
- #open(cpath, 'wb').write(br.open_novisit(curl).read())
- #print 'Cover for', title, 'saved to', cpath
- st = time.time()
- print get_social_metadata(title, author, isbn, ovrdrv_id)
- print '\n\n Took ', time.time() - st, ' to get detailed metadata\n\n'
-
- return 0
-
-if __name__ == '__main__':
- sys.exit(main())
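Both the module deleted above and its replacement added below memoize lookups in module-level dicts guarded by an RLock, storing False for a failed lookup so it is not retried. The pattern, distilled into a standalone sketch (names are illustrative, not from the source):

    from threading import RLock

    _cache = {}
    _lock = RLock()

    def cached_lookup(key, fetch):
        # fetch() runs only on a true cache miss; a failed lookup is
        # recorded as False so later calls return None without refetching
        with _lock:
            ans = _cache.get(key, None)
        if ans:
            return ans
        if ans is False:
            return None
        result = fetch()
        with _lock:
            _cache[key] = result if result else False
        return result
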
diff --git a/src/calibre/ebooks/metadata/sources/base.py b/src/calibre/ebooks/metadata/sources/base.py
index 5911a357ac..53fe9a4c2d 100644
--- a/src/calibre/ebooks/metadata/sources/base.py
+++ b/src/calibre/ebooks/metadata/sources/base.py
@@ -313,8 +313,8 @@ class Source(Plugin):
(r'(\d+),(\d+)', r'\1\2'),
# Remove hyphens only if they have whitespace before them
(r'(\s-)', ' '),
- # Remove single quotes
- (r"'", ''),
+ # Remove single quotes not followed by 's'
+ (r"'(?!s)", ''),
# Replace other special chars with a space
(r'''[:,;+!@#$%^&*(){}.`~"\s\[\]/]''', ' ')
]]
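The effect of the new lookahead is easiest to see on a title containing both a possessive and a contraction: the old rule strips every apostrophe, while `'(?!s)` keeps any apostrophe directly before an `s`, so possessives survive tokenization. A minimal sketch of just this substitution (the full get_title_tokens pipeline applies the other patterns as well):

    import re

    title = "The Omnivore's Dilemma: Don't Panic"
    print re.sub(r"'", '', title)       # old rule: The Omnivores Dilemma: Dont Panic
    print re.sub(r"'(?!s)", '', title)  # new rule: The Omnivore's Dilemma: Dont Panic
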
diff --git a/src/calibre/ebooks/metadata/sources/overdrive.py b/src/calibre/ebooks/metadata/sources/overdrive.py
new file mode 100755
index 0000000000..6950711da4
--- /dev/null
+++ b/src/calibre/ebooks/metadata/sources/overdrive.py
@@ -0,0 +1,510 @@
+#!/usr/bin/env python
+__license__ = 'GPL v3'
+__copyright__ = '2008, Kovid Goyal <kovid@kovidgoyal.net>'
+__docformat__ = 'restructuredtext en'
+
+'''
+Fetch metadata using Overdrive Content Reserve
+'''
+import sys, re, random, urllib, mechanize, copy
+from threading import RLock
+from Queue import Queue, Empty
+
+from lxml import html, etree
+from lxml.html import soupparser
+
+from calibre import browser
+from calibre.ebooks.metadata import check_isbn
+from calibre.ebooks.metadata.sources.base import Source
+from calibre.ebooks.metadata.book.base import Metadata
+from calibre.ebooks.chardet import xml_to_unicode
+from calibre.library.comments import sanitize_comments_html
+
+ovrdrv_data_cache = {}
+cover_url_cache = {}
+cache_lock = RLock()
+base_url = 'http://search.overdrive.com/'
+
+
+class OverDrive(Source):
+
+ name = 'Overdrive'
+ description = _('Downloads metadata from Overdrive\'s Content Reserve')
+
+ capabilities = frozenset(['identify', 'cover'])
+ touched_fields = frozenset(['title', 'authors', 'tags', 'pubdate',
+ 'comments', 'publisher', 'identifier:isbn', 'series', 'series_num',
+ 'language', 'identifier:overdrive'])
+ has_html_comments = True
+ supports_gzip_transfer_encoding = False
+ cached_cover_url_is_reliable = True
+
+ def identify(self, log, result_queue, abort, title=None, authors=None, # {{{
+ identifiers={}, timeout=30):
+ ovrdrv_id = identifiers.get('overdrive', None)
+ isbn = identifiers.get('isbn', None)
+
+ br = self.browser
+ print "in identify, calling to_ovrdrv_data"
+ ovrdrv_data = self.to_ovrdrv_data(br, title, authors, ovrdrv_id)
+ if ovrdrv_data:
+ title = ovrdrv_data[8]
+ authors = ovrdrv_data[6]
+ mi = Metadata(title, authors)
+ self.parse_search_results(ovrdrv_data, mi)
+ if ovrdrv_id is None:
+ ovrdrv_id = ovrdrv_data[7]
+ if isbn is not None:
+ self.cache_isbn_to_identifier(isbn, ovrdrv_id)
+
+ self.get_book_detail(br, ovrdrv_data[1], mi, ovrdrv_id, log)
+
+ result_queue.put(mi)
+
+ return None
+ # }}}
+
+
+ def get_book_url(self, identifiers): # {{{
+ ovrdrv_id = identifiers.get('overdrive', None)
+ if ovrdrv_id is not None:
+ ovrdrv_data = ovrdrv_data_cache.get(ovrdrv_id, None)
+ if ovrdrv_data:
+ return ovrdrv_data[1]
+ else:
+ br = browser()
+ ovrdrv_data = self.to_ovrdrv_data(br, None, None, ovrdrv_id)
+ return ovrdrv_data[1]
+ # }}}
+
+ def download_cover(self, log, result_queue, abort, # {{{
+ title=None, authors=None, identifiers={}, timeout=30):
+ cached_url = self.get_cached_cover_url(identifiers)
+ if cached_url is None:
+ log.info('No cached cover found, running identify')
+ rq = Queue()
+ print "inside download cover, calling identify"
+ self.identify(log, rq, abort, title=title, authors=authors,
+ identifiers=identifiers)
+ if abort.is_set():
+ return
+ results = []
+ while True:
+ try:
+ results.append(rq.get_nowait())
+ except Empty:
+ break
+ results.sort(key=self.identify_results_keygen(
+ title=title, authors=authors, identifiers=identifiers))
+ for mi in results:
+ cached_url = self.get_cached_cover_url(mi.identifiers)
+ if cached_url is not None:
+ break
+ if cached_url is None:
+ log.info('No cover found')
+ return
+
+ if abort.is_set():
+ return
+
+ ovrdrv_id = identifiers.get('overdrive', None)
+ br = self.browser
+ referer = self.get_base_referer()+'ContentDetails-Cover.htm?ID='+ovrdrv_id
+ print "downloading cover, referer is "+str(referer)
+ req = mechanize.Request(cached_url)
+ req.add_header('referer', referer)
+ log('Downloading cover from:', cached_url)
+ try:
+ cdata = br.open_novisit(req, timeout=timeout).read()
+ result_queue.put((self, cdata))
+ except:
+ log.exception('Failed to download cover from:', cached_url)
+ # }}}
+
+ def get_cached_cover_url(self, identifiers): # {{{
+ url = None
+ ovrdrv_id = identifiers.get('overdrive', None)
+ print "inside get_cached_cover_url, ovrdrv_id is "+str(ovrdrv_id)
+ if ovrdrv_id is None:
+ isbn = identifiers.get('isbn', None)
+ if isbn is not None:
+ ovrdrv_id = self.cached_isbn_to_identifier(isbn)
+ if ovrdrv_id is not None:
+ url = self.cached_identifier_to_cover_url(ovrdrv_id)
+
+ return url
+ # }}}
+
+ def create_query(self, title=None, authors=None, identifiers={}):
+ q = ''
+ if title or authors:
+ def build_term(prefix, parts):
+ return ' '.join('in'+prefix + ':' + x for x in parts)
+ title_tokens = list(self.get_title_tokens(title, False, True))
+ if title_tokens:
+ q += build_term('title', title_tokens)
+ author_tokens = self.get_author_tokens(authors,
+ only_first_author=True)
+ if author_tokens:
+ q += ('+' if q else '') + build_term('author',
+ author_tokens)
+
+ if isinstance(q, unicode):
+ q = q.encode('utf-8')
+ if not q:
+ return None
+        return base_url+urllib.urlencode({
+ 'q':q,
+ })
+
+ def get_base_referer(self): # to be used for passing referrer headers to cover download
+ choices = [
+ 'http://overdrive.chipublib.org/82DC601D-7DDE-4212-B43A-09D821935B01/10/375/en/',
+ 'http://emedia.clevnet.org/9D321DAD-EC0D-490D-BFD8-64AE2C96ECA8/10/241/en/',
+ 'http://singapore.lib.overdrive.com/F11D55BE-A917-4D63-8111-318E88B29740/10/382/en/',
+ 'http://ebooks.nypl.org/20E48048-A377-4520-BC43-F8729A42A424/10/257/en/',
+ 'http://spl.lib.overdrive.com/5875E082-4CB2-4689-9426-8509F354AFEF/10/335/en/'
+ ]
+ return choices[random.randint(0, len(choices)-1)]
+
+ def format_results(self, reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid):
+ fix_slashes = re.compile(r'\\/')
+ thumbimage = fix_slashes.sub('/', thumbimage)
+ worldcatlink = fix_slashes.sub('/', worldcatlink)
+        cover_url = re.sub('(?P<img>(Ima?g(eType-)?))200', '\g<img>100', thumbimage)
+ social_metadata_url = base_url+'TitleInfo.aspx?ReserveID='+reserveid+'&FormatID='+formatid
+ series_num = ''
+ if not series:
+ if subtitle:
+ title = od_title+': '+subtitle
+ else:
+ title = od_title
+ else:
+ title = od_title
+ m = re.search("([0-9]+$)", subtitle)
+ if m:
+ series_num = float(m.group(1))
+ return [cover_url, social_metadata_url, worldcatlink, series, series_num, publisher, creators, reserveid, title]
+
+ def safe_query(self, br, query_url, post=''):
+ '''
+        The query must be initialized by loading an empty search results page;
+        that page attempts to set a cookie that Mechanize doesn't like, so we
+        copy the cookiejar to a separate instance and make a one-off request with the temp cookiejar.
+ '''
+ goodcookies = br._ua_handlers['_cookies'].cookiejar
+ clean_cj = mechanize.CookieJar()
+ cookies_to_copy = []
+ for cookie in goodcookies:
+ copied_cookie = copy.deepcopy(cookie)
+ cookies_to_copy.append(copied_cookie)
+ for copied_cookie in cookies_to_copy:
+ clean_cj.set_cookie(copied_cookie)
+
+ if post:
+ br.open_novisit(query_url, post)
+ else:
+ br.open_novisit(query_url)
+
+ br.set_cookiejar(clean_cj)
+
+
+ def overdrive_search(self, br, q, title, author):
+        # re-initialize the cookiejar so that it's clean
+ clean_cj = mechanize.CookieJar()
+ br.set_cookiejar(clean_cj)
+ q_query = q+'default.aspx/SearchByKeyword'
+ q_init_search = q+'SearchResults.aspx'
+ # get first author as string - convert this to a proper cleanup function later
+ s = Source(None)
+ print "printing list with author "+str(author)+":"
+ author_tokens = list(s.get_author_tokens(author))
+ print list(author_tokens)
+ title_tokens = list(s.get_title_tokens(title, False, True))
+ print "there are "+str(len(title_tokens))+" title tokens"
+ for token in title_tokens:
+ print "cleaned up title token is: "+str(token)
+
+ if len(title_tokens) >= len(author_tokens):
+ initial_q = ' '.join(title_tokens)
+ xref_q = '+'.join(author_tokens)
+ else:
+ initial_q = ' '.join(author_tokens)
+ xref_q = '+'.join(title_tokens)
+
+ print "initial query is "+str(initial_q)
+ print "cross reference query is "+str(xref_q)
+ q_xref = q+'SearchResults.svc/GetResults?iDisplayLength=50&sSearch='+xref_q
+ query = '{"szKeyword":"'+initial_q+'"}'
+
+ # main query, requires specific Content Type header
+ req = mechanize.Request(q_query)
+ req.add_header('Content-Type', 'application/json; charset=utf-8')
+ br.open_novisit(req, query)
+
+ print "q_init_search is "+q_init_search
+ # initiate the search without messing up the cookiejar
+ self.safe_query(br, q_init_search)
+
+ # get the search results object
+ results = False
+ while results == False:
+ xreq = mechanize.Request(q_xref)
+ xreq.add_header('X-Requested-With', 'XMLHttpRequest')
+ xreq.add_header('Referer', q_init_search)
+ xreq.add_header('Accept', 'application/json, text/javascript, */*')
+ raw = br.open_novisit(xreq).read()
+ print "overdrive search result is:\n"+raw
+            for m in re.finditer(ur'"iTotalDisplayRecords":(?P<displayrecords>\d+).*?"iTotalRecords":(?P<totalrecords>\d+)', raw):
+ if int(m.group('displayrecords')) >= 1:
+ results = True
+ elif int(m.group('totalrecords')) >= 1:
+ xref_q = ''
+ q_xref = q+'SearchResults.svc/GetResults?iDisplayLength=50&sSearch='+xref_q
+ elif int(m.group('totalrecords')) == 0:
+ return ''
+
+ print "\n\nsorting results"
+ return self.sort_ovrdrv_results(raw, title, title_tokens, author, author_tokens)
+
+
+ def sort_ovrdrv_results(self, raw, title=None, title_tokens=None, author=None, author_tokens=None, ovrdrv_id=None):
+ print "\ntitle to search for is "+str(title)+"\nauthor to search for is "+str(author)
+ close_matches = []
+        raw = re.sub('.*?\[\[(?P<content>.*?)\]\].*', '[[\g<content>]]', raw)
+ results = eval(raw)
+ print "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n"
+ #print results
+ # The search results are either from a keyword search or a multi-format list from a single ID,
+ # sort through the results for closest match/format
+ if results:
+ for reserveid, od_title, subtitle, edition, series, publisher, format, formatid, creators, \
+ thumbimage, shortdescription, worldcatlink, excerptlink, creatorfile, sorttitle, \
+ availabletolibrary, availabletoretailer, relevancyrank, unknown1, unknown2, unknown3 in results:
+ print "this record's title is "+od_title+", subtitle is "+subtitle+", author[s] are "+creators+", series is "+series
+ if ovrdrv_id is not None and int(formatid) in [1, 50, 410, 900]:
+ print "overdrive id is not None, searching based on format type priority"
+ return self.format_results(reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid)
+ else:
+ creators = creators.split(', ')
+ print "split creators from results are: "+str(creators)
+ # if an exact match in a preferred format occurs
+ if creators[0] == author[0] and od_title == title and int(formatid) in [1, 50, 410, 900]:
+ print "Got Exact Match!!!"
+ return self.format_results(reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid)
+ else:
+ close_title_match = False
+ close_author_match = False
+ print "format id is "+str(formatid)
+ for token in title_tokens:
+ print "attempting to find "+str(token)+" title token"
+ if od_title.lower().find(token.lower()) != -1:
+ print "matched token"
+ close_title_match = True
+ else:
+ print "token didn't match"
+ close_title_match = False
+ break
+ for token in author_tokens:
+ print "attempting to find "+str(token)+" author token"
+ if creators[0].lower().find(token.lower()) != -1:
+ print "matched token"
+ close_author_match = True
+ else:
+ print "token didn't match"
+ close_author_match = False
+ break
+ if close_title_match and close_author_match and int(formatid) in [1, 50, 410, 900]:
+ if subtitle and series:
+ close_matches.insert(0, self.format_results(reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid))
+ else:
+ close_matches.append(self.format_results(reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid))
+ if close_matches:
+ return close_matches[0]
+ else:
+ return ''
+ else:
+ return ''
+
+
+ def overdrive_get_record(self, br, q, ovrdrv_id):
+ search_url = q+'SearchResults.aspx?ReserveID={'+ovrdrv_id+'}'
+ results_url = q+'SearchResults.svc/GetResults?sEcho=1&iColumns=18&sColumns=ReserveID%2CTitle%2CSubtitle%2CEdition%2CSeries%2CPublisher%2CFormat%2CFormatID%2CCreators%2CThumbImage%2CShortDescription%2CWorldCatLink%2CExcerptLink%2CCreatorFile%2CSortTitle%2CAvailableToLibrary%2CAvailableToRetailer%2CRelevancyRank&iDisplayStart=0&iDisplayLength=10&sSearch=&bEscapeRegex=true&iSortingCols=1&iSortCol_0=17&sSortDir_0=asc'
+
+ # get the base url to set the proper session cookie
+ br.open_novisit(q)
+
+ # initialize the search
+ self.safe_query(br, search_url)
+
+ # get the results
+ req = mechanize.Request(results_url)
+ req.add_header('X-Requested-With', 'XMLHttpRequest')
+ req.add_header('Referer', search_url)
+ req.add_header('Accept', 'application/json, text/javascript, */*')
+ raw = br.open_novisit(req)
+ raw = str(list(raw))
+ clean_cj = mechanize.CookieJar()
+ br.set_cookiejar(clean_cj)
+ return self.sort_ovrdrv_results(raw, None, None, None, ovrdrv_id)
+
+
+    def find_ovrdrv_data(self, br, title, author, ovrdrv_id=None):
+ print "in find_ovrdrv_data, title is "+str(title)+", author is "+str(author)+", overdrive id is "+str(ovrdrv_id)
+ q = base_url
+ if ovrdrv_id is None:
+ return self.overdrive_search(br, q, title, author)
+ else:
+ return self.overdrive_get_record(br, q, ovrdrv_id)
+
+
+
+ def to_ovrdrv_data(self, br, title=None, author=None, ovrdrv_id=None):
+ '''
+ Takes either a title/author combo or an Overdrive ID. One of these
+ two must be passed to this function.
+ '''
+ print "starting to_ovrdrv_data"
+ if ovrdrv_id is not None:
+ with cache_lock:
+ ans = ovrdrv_data_cache.get(ovrdrv_id, None)
+ if ans:
+ print "inside to_ovrdrv_data, cache lookup successful, ans is "+str(ans)
+ return ans
+ elif ans is False:
+ print "inside to_ovrdrv_data, ans returned False"
+ return None
+ else:
+ ovrdrv_data = self.find_ovrdrv_data(br, title, author, ovrdrv_id)
+ else:
+ try:
+ print "trying to retrieve data, running find_ovrdrv_data"
+ ovrdrv_data = self.find_ovrdrv_data(br, title, author, ovrdrv_id)
+ print "ovrdrv_data is "+str(ovrdrv_data)
+ except:
+ import traceback
+ traceback.print_exc()
+ ovrdrv_data = None
+ print "writing results to ovrdrv_data cache"
+ with cache_lock:
+ ovrdrv_data_cache[ovrdrv_id] = ovrdrv_data if ovrdrv_data else False
+
+ return ovrdrv_data if ovrdrv_data else False
+
+
+ def parse_search_results(self, ovrdrv_data, mi):
+ '''
+ Parse the formatted search results from the initial Overdrive query and
+        add the values to the metadata.
+
+ The list object has these values:
+ [cover_url[0], social_metadata_url[1], worldcatlink[2], series[3], series_num[4],
+ publisher[5], creators[6], reserveid[7], title[8]]
+
+ '''
+ print "inside parse_search_results, writing the metadata results"
+ ovrdrv_id = ovrdrv_data[7]
+ mi.set_identifier('overdrive', ovrdrv_id)
+
+ if len(ovrdrv_data[3]) > 1:
+ mi.series = ovrdrv_data[3]
+ if ovrdrv_data[4]:
+ mi.series_index = ovrdrv_data[4]
+ mi.publisher = ovrdrv_data[5]
+ mi.authors = ovrdrv_data[6]
+ mi.title = ovrdrv_data[8]
+ cover_url = ovrdrv_data[0]
+ if cover_url:
+ self.cache_identifier_to_cover_url(ovrdrv_id,
+ cover_url)
+
+
+ def get_book_detail(self, br, metadata_url, mi, ovrdrv_id, log):
+ try:
+ raw = br.open_novisit(metadata_url).read()
+ except Exception, e:
+ if callable(getattr(e, 'getcode', None)) and \
+ e.getcode() == 404:
+ return False
+ raise
+ raw = xml_to_unicode(raw, strip_encoding_pats=True,
+ resolve_entities=True)[0]
+ try:
+ root = soupparser.fromstring(raw)
+ except:
+ return False
+
+ pub_date = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblPubDate']/text()")
+ lang = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblLanguage']/text()")
+ subjects = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblSubjects']/text()")
+ ebook_isbn = root.xpath("//td/label[@id='ctl00_ContentPlaceHolder1_lblIdentifier']/text()")
+ desc = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblDescription']/ancestor::div[1]")
+
+ if pub_date:
+ from calibre.utils.date import parse_date
+ mi.pubdate = parse_date(pub_date[0].strip())
+ if lang:
+ mi.language = lang[0].strip()
+ print "languages is "+str(mi.language)
+ #if ebook_isbn:
+ # print "ebook isbn is "+str(ebook_isbn[0])
+ # isbn = check_isbn(ebook_isbn[0].strip())
+ # if isbn:
+ # self.cache_isbn_to_identifier(isbn, ovrdrv_id)
+ # mi.isbn = isbn
+ if subjects:
+ mi.tags = [tag.strip() for tag in subjects[0].split(',')]
+ print "tags are "+str(mi.tags)
+ if desc:
+ desc = desc[0]
+ desc = html.tostring(desc, method='html', encoding=unicode).strip()
+ # remove all attributes from tags
+ desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
+ # Remove comments
+            desc = re.sub(r'(?s)<!--.*?-->', '', desc)
+ mi.comments = sanitize_comments_html(desc)
+
+ return None
+
+
+def main(args=sys.argv):
+ print "running through main tests"
+ import tempfile, os, time
+ tdir = tempfile.gettempdir()
+    br = browser()
+    ov = OverDrive(None)  # exercise the plugin class directly for these tests
+ for ovrdrv_id, isbn, title, author in [
+ #(None, '0899661343', 'On the Road', ['Jack Kerouac']), # basic test, no series, single author
+ #(None, '9780061952838', 'The Fellowship of the Ring', ['J. R. R. Tolkien']), # Series test, multi-author
+ #(None, '9780061952838', 'The Two Towers (The Lord of the Rings, Book II)', ['J. R. R. Tolkien']), # Series test, book 2
+ #(None, '9780618153985', 'The Fellowship of the Ring (The Lord of the Rings, Part 1)', ['J.R.R. Tolkien']),
+ #('57844706-20fa-4ace-b5ee-3470b1b52173', None, 'The Two Towers', ['J. R. R. Tolkien']), # Series test, w/ ovrdrv id
+ #(None, '9780345505057', 'Deluge', ['Anne McCaffrey']) # Multiple authors
+ #(None, None, 'Deluge', ['Anne McCaffrey']) # Empty ISBN
+ #(None, None, 'On the Road', ['Jack Kerouac']), # Nonetype ISBN
+ #(None, '9780345435279', 'A Caress of Twilight', ['Laurell K. Hamilton']),
+ #(None, '9780606087230', 'The Omnivore\'s Dilemma : A Natural History of Four Meals', ['Michael Pollan']), # Subtitle colon
+ #(None, '9780061747649', 'Mental_Floss Presents: Condensed Knowledge', ['Will Pearson', 'Mangesh Hattikudur']),
+ #(None, '9781400050802', 'The Zombie Survival Guide', ['Max Brooks']), # Two books with this title by this author
+ #(None, '9781775414315', 'The Worst Journey in the World / Antarctic 1910-1913', ['Apsley Cherry-Garrard']), # Garbage sub-title
+ #(None, '9780440335160', 'Outlander', ['Diana Gabaldon']), # Returns lots of results to sort through to get the best match
+ (None, '9780345509741', 'The Horror Stories of Robert E. Howard', ['Robert E. Howard']), # Complex title with initials/dots stripped, some results don't have a cover
+ ]:
+ cpath = os.path.join(tdir, title+'.jpg')
+ print "cpath is "+cpath
+ st = time.time()
+        ovrdrv_data = ov.to_ovrdrv_data(br, title, author, ovrdrv_id)
+        curl = ovrdrv_data[0] if ovrdrv_data else None
+ print '\n\n Took ', time.time() - st, ' to get basic metadata\n\n'
+ if curl is None:
+ print 'No cover found for', title
+ else:
+ print "curl is "+curl
+ #open(cpath, 'wb').write(br.open_novisit(curl).read())
+ #print 'Cover for', title, 'saved to', cpath
+ st = time.time()
+        if ovrdrv_data:
+            mi = Metadata(ovrdrv_data[8], ovrdrv_data[6])
+            ov.parse_search_results(ovrdrv_data, mi)
+            ov.get_book_detail(br, ovrdrv_data[1], mi, ovrdrv_data[7], None)
+            print mi
+ print '\n\n Took ', time.time() - st, ' to get detailed metadata\n\n'
+
+ return 0
+
+if __name__ == '__main__':
+ sys.exit(main())
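For reference, a minimal sketch of driving the new plugin through its public identify() entry point instead of main() above; default_log is assumed to be importable from calibre.utils.logging (any object with the log interface should work):

    from Queue import Queue
    from threading import Event
    from calibre.utils.logging import default_log  # assumption: any log-like object works
    from calibre.ebooks.metadata.sources.overdrive import OverDrive

    plugin = OverDrive(None)        # same plugin_path=None idiom as Source(None) above
    rq, abort = Queue(), Event()
    plugin.identify(default_log, rq, abort,
            title='The Fellowship of the Ring', authors=['J. R. R. Tolkien'])
    while not rq.empty():
        mi = rq.get_nowait()
        print mi.title, mi.authors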