further work on the overdrive plugin
commit c6a2c8e82e (parent ed93d74e98)
@@ -161,14 +161,7 @@ class OverdriveCovers(CoverDownload): # {{{

    def has_cover(self, mi, ans, timeout=5.):
        if not mi.authors or not mi.title:
            return False
        from calibre.ebooks.metadata.overdrive import get_cover_url
        br = browser()
        try:
            get_cover_url(mi.isbn, mi.title, mi.authors, br)
            self.debug('cover for', mi.isbn, 'found')
            ans.set()
        except Exception, e:
            self.debug(e)
        return True

    def get_covers(self, mi, result_queue, abort, timeout=5.):
        if not mi.isbn:
@@ -261,10 +261,10 @@ class Overdrive(MetadataSource): # {{{

    def fetch(self):
        if not self.isbn:
            return
-        from calibre.ebooks.metadata.overdrive import get_metadata
+        from calibre.ebooks.metadata.overdrive import get_social_metadata
        try:
-            self.results = get_metadata(self.title, self.book_author,
-                self.publisher, self.isbn)
+            self.results = get_social_metadata(self.title, self.book_author, self.isbn)
        except Exception, e:
            self.exception = e
            self.tb = traceback.format_exc()
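
fetch() keeps the plugin's usual error-capture contract: the worker stores either results or the raised exception plus its formatted traceback for the caller to report later. A minimal sketch of that pattern; Worker and fetch_func are illustrative names, not calibre API:

import traceback

class Worker(object):
    # Hypothetical stand-in for a MetadataSource-style worker.
    def __init__(self, fetch_func):
        self.fetch_func = fetch_func
        self.results = None
        self.exception = None
        self.tb = None

    def run(self):
        try:
            self.results = self.fetch_func()
        except Exception, e:
            # Keep both the exception object and the formatted
            # traceback so the caller can display them later.
            self.exception = e
            self.tb = traceback.format_exc()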
@@ -25,13 +25,12 @@ cache_lock = RLock()

base_url = 'http://search.overdrive.com/'


    def create_query(self, title=None, authors=None, identifiers={}):
        q = ''
        if title or authors:
            def build_term(prefix, parts):
                return ' '.join('in'+prefix + ':' + x for x in parts)
-            title_tokens = list(self.get_title_tokens(title))
+            title_tokens = list(self.get_title_tokens(title, False))
            if title_tokens:
                q += build_term('title', title_tokens)
            author_tokens = self.get_author_tokens(authors,
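
build_term just prefixes every token with an in<field>: qualifier, so the query string becomes a space-separated AND of field terms. A worked example; the sample tokens are illustrative:

def build_term(prefix, parts):
    # 'title' + ['two', 'towers'] -> 'intitle:two intitle:towers'
    return ' '.join('in'+prefix + ':' + x for x in parts)

print build_term('title', ['two', 'towers'])   # intitle:two intitle:towers
print build_term('author', ['tolkien'])        # inauthor:tolkien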
@@ -71,41 +70,19 @@ def format_results(reserveid, od_title, subtitle, series, publisher, creators, t

            title = od_title+': '+subtitle
        else:
            title = od_title
    else:
        title = od_title
    m = re.search("([0-9]+$)", subtitle)
    if m:
        series_num = float(m.group(1))
    return [cover_url, social_metadata_url, worldcatlink, series, series_num, publisher, creators, reserveid, title]

-def overdrive_search(br, q, title, author):
-    q_query = q+'default.aspx/SearchByKeyword'
-    q_init_search = q+'SearchResults.aspx'
-    # get first author as string - convert this to a proper cleanup function later
-    s = Source(None)
-    print "printing list with string:"
-    print list(s.get_author_tokens(['J. R. R. Tolkien']))
-    print "printing list with author "+str(author)+":"
-    print list(s.get_author_tokens(author))
-    author = list(s.get_author_tokens(author))
-    for token in author:
-        print "cleaned up author is: "+str(token)
-    author_q = '+'.join(author)
-    #author_q = separator.join(for x in author)
-    # query terms
-    #author_q = re.sub('\s', '+', author_q)
-    print "final author query is "+str(author_q)
-    q_xref = q+'SearchResults.svc/GetResults?iDisplayLength=10&sSearch='+author_q
-    query = '{"szKeyword":"'+title+'"}'
-
-    # main query, requires specific Content Type header
-    req = mechanize.Request(q_query)
-    req.add_header('Content-Type', 'application/json; charset=utf-8')
-    br.open_novisit(req, query)
-
-    print "q_init_search is "+q_init_search
-
-    # the query must be initialized by loading an empty search results page
-    # this page attempts to set a cookie that Mechanize doesn't like
-    # copy the cookiejar to a separate instance and make a one-off request with the temp cookiejar
+def safe_query(br, query_url):
+    '''
+    The query must be initialized by loading an empty search results page
+    this page attempts to set a cookie that Mechanize doesn't like
+    copy the cookiejar to a separate instance and make a one-off request with the temp cookiejar
+    '''
    goodcookies = br._ua_handlers['_cookies'].cookiejar
    clean_cj = mechanize.CookieJar()
    cookies_to_copy = []
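
safe_query's body continues in the next hunk; the whole cookie-isolation trick in one self-contained sketch: snapshot the browser's good cookies into a fresh jar, make the request whose Set-Cookie we want to discard, then install the snapshot. This relies on the same mechanize internals the commit touches (br._ua_handlers['_cookies']); plain open() stands in for calibre's open_novisit, and the URL is only illustrative:

import mechanize

def isolated_fetch(br, url):
    # Snapshot the cookies collected so far.
    goodcookies = br._ua_handlers['_cookies'].cookiejar
    clean_cj = mechanize.CookieJar()
    for cookie in goodcookies:
        clean_cj.set_cookie(cookie)
    # Fire the request whose cookie we do not want to keep.
    br.open(url)
    # Restore the pre-request snapshot, dropping the unwanted cookie.
    br.set_cookiejar(clean_cj)

br = mechanize.Browser()
# isolated_fetch(br, 'http://search.overdrive.com/SearchResults.aspx')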
@@ -115,10 +92,46 @@ def overdrive_search(br, q, title, author):

    for copied_cookie in cookies_to_copy:
        clean_cj.set_cookie(copied_cookie)

-    br.open_novisit(q_init_search)
+    br.open_novisit(query_url)

    br.set_cookiejar(clean_cj)


+def overdrive_search(br, q, title, author):
+    q_query = q+'default.aspx/SearchByKeyword'
+    q_init_search = q+'SearchResults.aspx'
+    # get first author as string - convert this to a proper cleanup function later
+    s = Source(None)
+    print "printing list with string:"
+    #print list(s.get_author_tokens(['J. R. R. Tolkien']))
+    print "printing list with author "+str(author)+":"
+    print list(s.get_author_tokens(author))
+    author_tokens = list(s.get_author_tokens(author))
+    for token in author_tokens:
+        print "cleaned up author token is: "+str(token)
+    author_q = ' '.join(author_tokens)
+
+    title_tokens = list(s.get_title_tokens(title))
+    for token in title_tokens:
+        print "cleaned up title token is: "+str(token)
+    title_q = '+'.join(title_tokens)
+    #author_q = separator.join(for x in author)
+    # query terms
+    #author_q = re.sub('\s', '+', author_q)
+    print "final author query is "+str(author_q)
+    print "final title query is "+str(title_q)
+    q_xref = q+'SearchResults.svc/GetResults?iDisplayLength=20&sSearch='+title_q
+    query = '{"szKeyword":"'+author_q+'"}'
+
+    # main query, requires specific Content Type header
+    req = mechanize.Request(q_query)
+    req.add_header('Content-Type', 'application/json; charset=utf-8')
+    br.open_novisit(req, query)
+
+    print "q_init_search is "+q_init_search
+    # initiate the search without messing up the cookiejar
+    safe_query(br, q_init_search)

    # get the search results object
    xreq = mechanize.Request(q_xref)
    xreq.add_header('X-Requested-With', 'XMLHttpRequest')
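
The keyword query is POSTed as a JSON body, and the endpoint only accepts it with an explicit JSON Content-Type header; supplying a body is what turns the mechanize request into a POST. A sketch of the same request shape with stock mechanize; the keyword is illustrative and json.dumps stands in for the hand-built szKeyword string:

import json
import mechanize

br = mechanize.Browser()
url = 'http://search.overdrive.com/default.aspx/SearchByKeyword'
body = json.dumps({'szKeyword': 'tolkien'})

req = mechanize.Request(url)
# Without this header the ASP.NET JSON endpoint rejects the body.
req.add_header('Content-Type', 'application/json; charset=utf-8')
# response = br.open(req, body)   # passing data makes this a POST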
@@ -126,83 +139,102 @@ def overdrive_search(br, q, title, author):

    xreq.add_header('Accept', 'application/json, text/javascript, */*')
    raw = br.open_novisit(xreq).read()
    print "overdrive search result is:\n"+raw
    print "\n\nsorting results"
    return sort_ovrdrv_results(raw, title, title_tokens, author, author_tokens)


def sort_ovrdrv_results(raw, title=None, title_tokens=None, author=None, author_tokens=None, ovrdrv_id=None):
    print "\ntitle to search for is "+str(title)+"\nauthor to search for is "+str(author)
    close_matches = []
    raw = re.sub('.*?\[\[(?P<content>.*?)\]\].*', '[[\g<content>]]', raw)
    results = eval(raw)
-    print "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n"
-    print results
-    # The search results are from a keyword search (overdrive's advanced search is broken),
+    #print results
+    # The search results are either from a keyword search or a multi-format list from a single ID,
    # sort through the results for closest match/format
-    for result in results:
-        print "\n\n\nthis result is "+str(result)
    for reserveid, od_title, subtitle, edition, series, publisher, format, formatid, creators, \
            thumbimage, shortdescription, worldcatlink, excerptlink, creatorfile, sorttitle, \
            availabletolibrary, availabletoretailer, relevancyrank, unknown1, unknown2, unknown3 in results:
+        if ovrdrv_id is not None and int(formatid) in [1, 50, 410, 900]:
+            print "overdrive id is not None, searching based on format type priority"
+            return format_results(reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid)
+        else:
            creators = creators.split(', ')
            print "fixed creators are: "+str(creators)
-            # if an exact match occurs
-            if creators[0] == author and od_title == title and int(formatid) in [1, 50, 410, 900]:
+            # if an exact match in a preferred format occurs
+            if creators[0] == author[0] and od_title == title and int(formatid) in [1, 50, 410, 900]:
                print "Got Exact Match!!!"
                return format_results(reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid)
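
sort_ovrdrv_results trims the response down to its outer [[...]] results array and then eval()s it. If the trimmed payload is valid JSON (an assumption about the service), json.loads is a safer drop-in for that step:

import json, re

raw = 'sEcho(1); [["id-1", "The Two Towers", "", "", "", "Pub"]] trailer'  # illustrative
raw = re.sub('.*?\[\[(?P<content>.*?)\]\].*', '[[\g<content>]]', raw)
results = json.loads(raw)   # avoids executing untrusted input, unlike eval()
print results[0][1]         # The Two Towers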
-def library_search(br, q, title, author):
-    q_search = q+'AdvancedSearch.htm'
-    q_query = q+'BANGSearch.dll'
-    br.open(q_search)
-    # Search for cover with audiobooks lowest priority
-    for format in ['410','50','900','25','425']:
-        query = 'Title='+title+'&Creator='+author+'&Keyword=&ISBN=&Format='+format+'&Language=&Publisher=&Subject=&Award=&CollDate=&PerPage=10&Sort=SortBy%3Dtitle'
-        query = re.sub('\s', '+', query)
-        #print "search url is "+str(q_search)
-        print "query is "+str(query)
-        raw = br.open(q_query, query).read()
-        #print "raw html is:\n"+str(raw)
-        raw = xml_to_unicode(raw, strip_encoding_pats=True,
-            resolve_entities=True)[0]
-        root = html.fromstring(raw)
-        revs = root.xpath("//img[@class='blackborder']")
-        if revs:
-            #print "revs are "+str(revs)
-            # get the first match, as it's the most likely candidate
-            x = revs[0]
-            id = urllib.unquote(re.sub('.*?/(?P<i>%7B.*?%7D).*', '\g<i>', x.get('src')))
-            curl = re.sub('(?P<img>(Ima?g(eType-)?))200', '\g<img>100', x.get('src'))
-            murl = root.xpath("//img[@class='blackborder']/parent::*")
-            if murl:
-                murl = [y.get('href') for y in murl]
-                print "murl is"+str(murl)
-                murl = q+murl[0]
-            else:
-                print "didn't get metadata URL"
-            print "curl is "+str(curl)+", id is "+str(id)+", murl is "+str(murl)
-            ovrdrv_data = [id, curl, murl]
-            print "revs final are "+str(revs)
-            return ovrdrv_data
+            close_title_match = False
+            close_author_match = False
+            for token in title_tokens:
+                if od_title.lower().find(token.lower()) != -1:
+                    close_title_match = True
+                else:
+                    close_title_match = False
+                    break
+            for token in author_tokens:
+                if creators[0].lower().find(token.lower()) != -1:
+                    close_author_match = True
+                else:
+                    close_author_match = False
+                    break
+            if close_title_match and close_author_match and int(formatid) in [1, 50, 410, 900]:
+                close_matches.append(format_results(reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid))
+    if close_matches:
+        return close_matches[0]
+    else:
+        return None

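
The close-match test requires every query token to appear, case-insensitively, as a substring of the candidate field, bailing on the first miss. The same check pulled into a helper; the name is mine, not the commit's:

def tokens_all_present(tokens, text):
    # True only if every token occurs, case-insensitively, in text.
    text = text.lower()
    for token in tokens:
        if text.find(token.lower()) == -1:
            return False
    return True

print tokens_all_present(['two', 'towers'], 'The Two Towers')   # True
print tokens_all_present(['two', 'towers'], 'The Hobbit')       # False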
-def find_ovrdrv_data(br, title, author, isbn):
-    print "in fnd_ovrdrv_data, title is "+str(title)+", author is "+str(author)

+def overdrive_get_record(br, q, ovrdrv_id):
+    search_url = q+'SearchResults.aspx?ReserveID={'+ovrdrv_id+'}'
+    results_url = q+'SearchResults.svc/GetResults?sEcho=1&iColumns=18&sColumns=ReserveID%2CTitle%2CSubtitle%2CEdition%2CSeries%2CPublisher%2CFormat%2CFormatID%2CCreators%2CThumbImage%2CShortDescription%2CWorldCatLink%2CExcerptLink%2CCreatorFile%2CSortTitle%2CAvailableToLibrary%2CAvailableToRetailer%2CRelevancyRank&iDisplayStart=0&iDisplayLength=10&sSearch=&bEscapeRegex=true&iSortingCols=1&iSortCol_0=17&sSortDir_0=asc'
+
+    # get the base url to set the proper session cookie
+    br.open_novisit(q)
+
+    # initialize the search
+    safe_query(br, search_url)
+
+    # get the results
+    req = mechanize.Request(results_url)
+    req.add_header('X-Requested-With', 'XMLHttpRequest')
+    req.add_header('Referer', search_url)
+    req.add_header('Accept', 'application/json, text/javascript, */*')
+    raw = br.open_novisit(req)
+    raw = str(list(raw))
+    return sort_ovrdrv_results(raw, None, None, None, ovrdrv_id)


+def find_ovrdrv_data(br, title, author, isbn, ovrdrv_id=None):
+    print "in find_ovrdrv_data, title is "+str(title)+", author is "+str(author)+", overdrive id is "+str(ovrdrv_id)
    q = base_url
    if re.match('http://search\.overdrive\.', q):
+        if ovrdrv_id is None:
            return overdrive_search(br, q, title, author)
        else:
-            return library_search(br, q, title, author)
+            return overdrive_get_record(br, q, ovrdrv_id)


-def to_ovrdrv_data(br, title, author, isbn):
+def to_ovrdrv_data(br, title, author, isbn, ovrdrv_id=None):
    print "starting to_ovrdrv_data"
    with cache_lock:
        ans = ovrdrv_data_cache.get(isbn, None)
    if ans:
-        print "inside to_ovrdrv_data, ans returned positive, ans is"+str(ans)
+        print "inside to_ovrdrv_data, cache lookup successful, ans is "+str(ans)
        return ans
    if ans is False:
        print "inside to_ovrdrv_data, ans returned False"
        return None
    try:
-        ovrdrv_data = find_ovrdrv_data(br, title, author, isbn)
-        print "ovrdrv_data = "+str(ovrdrv_data)
+        print "trying to retrieve data, running find_ovrdrv_data"
+        ovrdrv_data = find_ovrdrv_data(br, title, author, isbn, ovrdrv_id)
+        print "ovrdrv_data is "+str(ovrdrv_data)
    except:
        import traceback
        traceback.print_exc()
@@ -210,66 +242,69 @@ def to_ovrdrv_data(br, title, author, isbn):

    with cache_lock:
        ovrdrv_data_cache[isbn] = ovrdrv_data if ovrdrv_data else False
    if ovrdrv_data:
        from calibre.ebooks.metadata.xisbn import xisbn
        for i in xisbn.get_associated_isbns(isbn):
            with cache_lock:
                ovrdrv_data_cache[i] = ovrdrv_data

    return ovrdrv_data
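
to_ovrdrv_data caches per ISBN with False as a negative sentinel, so a failed lookup is remembered and distinguished from a cache miss (None), and positive results are fanned out to the related ISBNs that xisbn reports. The core pattern in isolation, with illustrative names:

from threading import RLock

cache_lock = RLock()
data_cache = {}

def cached_lookup(key, fetch):
    # None means 'never tried'; False means 'tried and failed'.
    with cache_lock:
        ans = data_cache.get(key, None)
    if ans:
        return ans
    if ans is False:
        return None
    data = fetch(key)   # may return None on failure
    with cache_lock:
        # Store False so the failure is remembered and not retried.
        data_cache[key] = data if data else False
    return data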

-def get_social_metadata(title, authors, publisher, isbn):
+def get_social_metadata(title, authors, isbn, ovrdrv_id=None):
    author = authors[0]
    mi = Metadata(title, authors)
    if not isbn:
        return mi
    isbn = check_isbn(isbn)
    if not isbn:
        return mi
    br = browser()
-    ovrdrv_data = to_ovrdrv_data(br, title, authors, isbn)
-    if ovrdrv_data and get_metadata_detail_ovrdrv(br, ovrdrv_data, mi):
+    print "calling to_ovrdrv_data from inside get_social_metadata"
+    ovrdrv_data = to_ovrdrv_data(br, title, authors, isbn, ovrdrv_id)
+
+    #[cover_url, social_metadata_url, worldcatlink, series, series_num, publisher, creators, reserveid, title]
+
+    if len(ovrdrv_data[3]) > 1:
+        mi.series = ovrdrv_data[3]
+        if ovrdrv_data[4]:
+            mi.series_index = ovrdrv_data[4]
+    mi.publisher = ovrdrv_data[5]
+    mi.authors = ovrdrv_data[6]
+    if ovrdrv_id is None:
+        ovrdrv_id = ovrdrv_data[7]
+    mi.set_identifier('overdrive', ovrdrv_id)
+    mi.title = ovrdrv_data[8]
+
+    if ovrdrv_data and get_metadata_detail(br, ovrdrv_data[1], mi, isbn):
        return mi
    #from calibre.ebooks.metadata.xisbn import xisbn
    #for i in xisbn.get_associated_isbns(isbn):
    #    print "xisbn isbn is "+str(i)
    #    ovrdrv_data = to_ovrdrv_data(br, title, author, i)
    #    if ovrdrv_data and get_metadata_detail(br, ovrdrv_data, mi):
    #        return mi
    return mi
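
The commented index map is the contract here: ovrdrv_data is a nine-element list read positionally. A hedged sketch that unpacks it into names once instead of indexing, in the same order as the comment:

def unpack_ovrdrv_data(ovrdrv_data):
    # Positional layout from the comment above; order matters.
    (cover_url, social_metadata_url, worldcatlink, series, series_num,
        publisher, creators, reserveid, title) = ovrdrv_data
    return {'cover_url': cover_url, 'metadata_url': social_metadata_url,
            'worldcat': worldcatlink, 'series': series,
            'series_num': series_num, 'publisher': publisher,
            'creators': creators, 'overdrive_id': reserveid, 'title': title}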

-def get_cover_url(isbn, title, author, br):
+def get_cover_url(isbn, title, author, br, ovrdrv_id=None):
    print "starting get_cover_url"
    isbn = check_isbn(isbn)
-    print "isbn is "+str(isbn)
+    print "title is "+str(title)
+    print "author is "+str(author[0])
+    print "isbn is "+str(isbn)
+    print "ovrdrv_id is "+str(ovrdrv_id)

    with cache_lock:
        ans = cover_url_cache.get(isbn, None)
+        #ans = cover_url_cache.get(ovrdrv_id, None)
    if ans:
-        print "ans returned positive"
+        print "cover url cache lookup returned positive, ans is "+str(ans)
        return ans
    if ans is False:
-        "ans returned false"
+        "cover url cache lookup returned false"
        return None
-    print "in get_cover_url, running through ovrdrv_data function"
-    ovrdrv_data = to_ovrdrv_data(br, title, author, isbn)
-    print "ovrdrv_id is "+str(ovrdrv_data)
+    print "in get_cover_url, calling to_ovrdrv_data function"
+    ovrdrv_data = to_ovrdrv_data(br, title, author, isbn, ovrdrv_id)
    if ovrdrv_data:
        ans = ovrdrv_data[0]
-        print "inside get_cover_url, ans is "+str(ans)
+        print "inside get_cover_url, got url from to_ovrdrv_data, ans is "+str(ans)
        if ans:
            print "writing cover url to url cache"
            with cache_lock:
                cover_url_cache[isbn] = ans
+                #cover_url_cache[ovrdrv_id] = ans
            return ans
    #from calibre.ebooks.metadata.xisbn import xisbn
    #for i in xisbn.get_associated_isbns(isbn):
    #    print "in get_cover_url, using xisbn list to associate other books"
    #    ovrdrv_data = to_ovrdrv_data(br, title, author, i)
    #    if ovrdrv_data:
    #        ans = _get_cover_url(br, ovrdrv_data)
    #        if ans:
    #            with cache_lock:
    #                cover_url_cache[isbn] = ans
    #                cover_url_cache[i] = ans
    #            return ans

    with cache_lock:
        print "marking cover url cache for this isbn false"
        cover_url_cache[isbn] = False
    return None
@@ -303,18 +338,14 @@ def _get_cover_url(br, ovrdrv_data):

        return ('/'.join(parts[:-1]))+'/'+bn
    return None


-def get_metadata_detail(br, ovrdrv_data, mi):
-    q = ovrdrv_data[2]
+def get_metadata_detail(br, metadata_url, mi, isbn=None):
    try:
-        raw = br.open_novisit(q).read()
+        raw = br.open_novisit(metadata_url).read()
    except Exception, e:
        if callable(getattr(e, 'getcode', None)) and \
                e.getcode() == 404:
            return False
        raise
    if '<title>404 - ' in raw:
        return False
    raw = xml_to_unicode(raw, strip_encoding_pats=True,
        resolve_entities=True)[0]
    try:
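
The except clause can't assume the exception is an HTTPError, so it duck-types: if the object has a callable getcode() that reports 404, the page is treated as missing; anything else propagates. A compact sketch of that check; the FakeHTTPError class is illustrative:

def is_http_404(exc):
    # Duck-typed check: urllib2/mechanize HTTP errors expose getcode().
    getcode = getattr(exc, 'getcode', None)
    return callable(getcode) and getcode() == 404

class FakeHTTPError(Exception):
    def __init__(self, code):
        Exception.__init__(self)
        self.code = code
    def getcode(self):
        return self.code

print is_http_404(FakeHTTPError(404))    # True
print is_http_404(ValueError('boom'))    # False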
@@ -322,26 +353,28 @@ def get_metadata_detail(br, ovrdrv_data, mi):

    except:
        return False

    # Check for series name and retrieve it
    series_name = root.xpath("//td/script[re:test(text(), 'szSeries', 'i')]",
        namespaces={"re": "http://exslt.org/regular-expressions"})
    if series_name:
        series = html.tostring(series_name[0], method='html', encoding=unicode).strip()
        series = re.sub('(?s).*?szSeries\s*=\s*\"(?P<series>.*?)\";.*', '\g<series>', series)
        if len(series) > 1:
            mi.series = series
            # If series was successful attempt to get the series number
            series_num = root.xpath("//div/strong[re:test(text(), ',\s(Book|Part|Volume)')]",
                namespaces={"re": "http://exslt.org/regular-expressions"})
            if series_num:
                series_num = float(re.sub('(?s).*?,\s*(Book|Part|Volume)\s*(?P<num>\d+).*', '\g<num>',
                    etree.tostring(series_num[0])))
                if series_num >= 1:
                    mi.series_index = series_num
                    print "series_num is "+str(series_num)
    isbn = check_isbn(isbn)

-    desc = root.xpath("//td[@class='collection' and re:test(., 'Description', 'i')]/following::div[1]",
-        namespaces={"re": "http://exslt.org/regular-expressions"})
    pub_date = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblPubDate']/text()")
    lang = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblLanguage']/text()")
    subjects = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblSubjects']/text()")
    ebook_isbn = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblIdentifier']/text()")
+    desc = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblDescription']/ancestor::div[1]")

    if pub_date:
        from calibre.utils.date import parse_date
        mi.pubdate = parse_date(pub_date[0].strip())
    if lang:
        mi.language = lang[0].strip()
        print "languages is "+str(mi.language)
    if ebook_isbn and isbn is None:
        print "ebook isbn is "+str(ebook_isbn[0])
        mi.set_identifier('isbn', ebook_isbn)
    #elif isbn is not None:
    #    mi.set_identifier('isbn', isbn)
    if subjects:
        mi.tags = subjects
        print "tags are "+str(mi.tags)
    if desc:
        desc = desc[0]
        desc = html.tostring(desc, method='html', encoding=unicode).strip()
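
The series-number extraction leans on a single regex: capture the digits after ', Book/Part/Volume' anywhere in the rendered element and discard the rest. A worked example with an illustrative input string:

import re

text = '<strong>The Lord of the Rings, Book 2</strong>'  # illustrative
num = re.sub('(?s).*?,\s*(Book|Part|Volume)\s*(?P<num>\d+).*', '\g<num>', text)
print float(num)   # 2.0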
@@ -351,36 +384,6 @@ def get_metadata_detail(br, ovrdrv_data, mi):

        desc = re.sub(r'(?s)<!--.*?-->', '', desc)
        mi.comments = sanitize_comments_html(desc)

-    publisher = root.xpath("//td/strong[re:test(text(), 'Publisher\:', 'i')]/ancestor::td[1]/following-sibling::td/text()",
-        namespaces={"re": "http://exslt.org/regular-expressions"})
-    if publisher:
-        mi.publisher = re.sub('^\s*(?P<pub>.*?)\s*$', '\g<pub>', publisher[0])
-        print "publisher is "+str(mi.publisher)
-
-    lang = root.xpath("//td/strong[re:test(text(), 'Language\(s\):', 'i')]/ancestor::td[1]/following-sibling::td/text()",
-        namespaces={"re": "http://exslt.org/regular-expressions"})
-    if lang:
-        mi.language = re.sub('^\s*(?P<lang>.*?)\s*$', '\g<lang>', lang[0])
-        print "languages is "+str(mi.language)
-
-    isbn = root.xpath("//tr/td[re:test(text(), 'ISBN:', 'i')]/following::td/text()",
-        namespaces={"re": "http://exslt.org/regular-expressions"})
-    if isbn:
-        mi.isbn = re.sub('^\s*(?P<isbn>.*?)\s*$', '\g<isbn>', isbn[0])
-        print "ISBN is "+str(mi.isbn)
-
-    subjects = root.xpath("//td/strong[re:test(text(), 'Subject', 'i')]/ancestor::td[1]/following-sibling::td/a/text()",
-        namespaces={"re": "http://exslt.org/regular-expressions"})
-    if subjects:
-        mi.tags = subjects
-        print "tags are "+str(mi.tags)
-
-    creators = root.xpath("//table/tr/td[re:test(text(), '\s*by', 'i')]/ancestor::tr[1]/td[2]/table/tr/td/a/text()",
-        namespaces={"re": "http://exslt.org/regular-expressions"})
-    if creators:
-        print "authors are "+str(creators)
-        mi.authors = creators

    return True

def main(args=sys.argv):
@@ -388,19 +391,26 @@ def main(args=sys.argv):

    import tempfile, os, time
    tdir = tempfile.gettempdir()
    br = browser()
-    for isbn, title, author in [
-        #('0899661343', 'On the Road', ['Jack Kerouac']), # basic test, no series, single author
-        #('9780061952838', 'The Fellowship of the Ring', ['J. R. R. Tolkien']), # Series test, multi-author
-        ('9780061952838', 'The Two Towers', ['J. R. R. Tolkien']), # Series test, book 2
-        #('9780345505057', 'Deluge', ['Anne McCaffrey']) # Multiple authors
-        #('', 'Deluge', ['Anne McCaffrey']) # Empty ISBN
-        #(None, 'On the Road', ['Jack Kerouac']) # Nonetype ISBN
+    for ovrdrv_id, isbn, title, author in [
+        #(None, '0899661343', 'On the Road', ['Jack Kerouac']), # basic test, no series, single author
+        #(None, '9780061952838', 'The Fellowship of the Ring', ['J. R. R. Tolkien']), # Series test, multi-author
+        #(None, '9780061952838', 'The Two Towers', ['J. R. R. Tolkien']), # Series test, book 2
+        #('57844706-20fa-4ace-b5ee-3470b1b52173', None, 'The Two Towers', ['J. R. R. Tolkien']), # Series test, w/ ovrdrv id
+        #(None, '9780345505057', 'Deluge', ['Anne McCaffrey']) # Multiple authors
+        #(None, None, 'Deluge', ['Anne McCaffrey']) # Empty ISBN
+        #(None, None, 'On the Road', ['Jack Kerouac']), # Nonetype ISBN
+        #(None, '9780345435279', 'A Caress of Twilight', ['Laurell K. Hamilton']),
+        #(None, '9780606087230', 'The Omnivore\'s Dilemma : A Natural History of Four Meals', ['Michael Pollan']), # Subtitle colon
+        #(None, '9780061747649', 'Mental_Floss Presents: Condensed Knowledge', ['Will Pearson', 'Mangesh Hattikudur']),
+        #(None, '9781400050802', 'The Zombie Survival Guide', ['Max Brooks']), # Two books with this title by this author
+        #(None, '9781775414315', 'The Worst Journey in the World / Antarctic 1910-1913', ['Apsley Cherry-Garrard']), # Garbage sub-title
+        (None, '9780440335160', 'Outlander', ['Diana Gabaldon']), # Returns lots of results to sort through to get the best match
        ]:
        cpath = os.path.join(tdir, title+'.jpg')
        print "cpath is "+cpath
        st = time.time()
-        curl = get_cover_url(isbn, title, author, br)
-        print '\n\n Took ', time.time() - st, ' to get metadata\n\n'
+        curl = get_cover_url(isbn, title, author, br, ovrdrv_id)
+        print '\n\n Took ', time.time() - st, ' to get basic metadata\n\n'
        if curl is None:
            print 'No cover found for', title
        else:
@@ -408,9 +418,7 @@ def main(args=sys.argv):

        #open(cpath, 'wb').write(br.open_novisit(curl).read())
        #print 'Cover for', title, 'saved to', cpath

-        #import time

-        #print get_social_metadata(title, author, None, isbn)
+        print get_social_metadata(title, author, isbn, ovrdrv_id)
        #print '\n\n', time.time() - st, '\n\n'

    return 0
@@ -87,32 +87,40 @@ class Source(Plugin):

        if authors:
            # Leave ' in there for Irish names
-            pat = re.compile(r'[-,:;+!@#$%^&*(){}.`~"\s\[\]/]')
+            remove_pat = re.compile(r'[,:;!@#$%^&*(){}.`~"\s\[\]/]')
+            replace_pat = re.compile(r'-+')
            if only_first_author:
                authors = authors[:1]
            for au in authors:
+                au = replace_pat.sub(' ', au)
                parts = au.split()
                if ',' in au:
                    # au probably in ln, fn form
                    parts = parts[1:] + parts[:1]
                for tok in parts:
-                    tok = pat.sub('', tok).strip()
+                    tok = remove_pat.sub('', tok).strip()
                    if len(tok) > 2 and tok.lower() not in ('von', ):
                        yield tok


-    def get_title_tokens(self, title):
+    def get_title_tokens(self, title, strip_joiners=True):
        '''
        Take a title and return a list of tokens useful for an AND search query.
        Excludes connectives and punctuation.
        '''
        if title:
-            pat = re.compile(r'''[-,:;+!@#$%^&*(){}.`~"'\s\[\]/]''')
+            # strip sub-titles
+            subtitle = re.compile(r'([\(\[\{].*?[\)\]\}]|[/:\\].*$)')
+            if len(subtitle.sub('', title)) > 1:
+                title = subtitle.sub('', title)
+            pat = re.compile(r'''([-,:;+!@#$%^&*(){}.`~"\s\[\]/]|'(?!s))''')
            title = pat.sub(' ', title)
            tokens = title.split()
            for token in tokens:
                token = token.strip()
-                if token and token.lower() not in ('a', 'and', 'the'):
+                if token and token.lower() not in ('a', 'and', 'the') and strip_joiners:
                    yield token
+                elif token:
+                    yield token

    def split_jobs(self, jobs, num):
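
Reading the new author tokenizer end to end: hyphens become separators, punctuation is stripped, tokens shorter than three characters (initials) are dropped, and 'ln, fn' names are rotated into natural order. Expected output, worked out by hand from the code above rather than captured from a run; Source is the class from this hunk:

s = Source(None)
print list(s.get_author_tokens(['J. R. R. Tolkien']))
# ['Tolkien'] -- the one-letter initials fail the len(tok) > 2 check
print list(s.get_author_tokens(['Tolkien, J. R. R.']))
# ['Tolkien'] -- 'ln, fn' order is rotated before filtering
print list(s.get_author_tokens(['Jean-Luc Picard']))
# ['Jean', 'Luc', 'Picard'] -- hyphens are treated as separators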