diff --git a/src/calibre/ebooks/metadata/covers.py b/src/calibre/ebooks/metadata/covers.py
index 9f5958f1ad..6ea292aa93 100644
--- a/src/calibre/ebooks/metadata/covers.py
+++ b/src/calibre/ebooks/metadata/covers.py
@@ -161,14 +161,7 @@ class OverdriveCovers(CoverDownload): # {{{
     def has_cover(self, mi, ans, timeout=5.):
         if not mi.authors or not mi.title:
             return False
-        from calibre.ebooks.metadata.overdrive import get_cover_url
-        br = browser()
-        try:
-            get_cover_url(mi.isbn, mi.title, mi.authors, br)
-            self.debug('cover for', mi.isbn, 'found')
-            ans.set()
-        except Exception, e:
-            self.debug(e)
+        return True
 
     def get_covers(self, mi, result_queue, abort, timeout=5.):
         if not mi.isbn:
diff --git a/src/calibre/ebooks/metadata/fetch.py b/src/calibre/ebooks/metadata/fetch.py
index 1f584bc107..0401ee78c5 100644
--- a/src/calibre/ebooks/metadata/fetch.py
+++ b/src/calibre/ebooks/metadata/fetch.py
@@ -261,10 +261,10 @@ class Overdrive(MetadataSource): # {{{
     def fetch(self):
         if not self.isbn:
             return
-        from calibre.ebooks.metadata.overdrive import get_metadata
+        from calibre.ebooks.metadata.overdrive import get_social_metadata
         try:
-            self.results = get_metadata(self.title, self.book_author,
-                    self.publisher, self.isbn)
+            self.results = get_social_metadata(self.title, self.book_author, self.isbn)
+
         except Exception, e:
             self.exception = e
             self.tb = traceback.format_exc()
diff --git a/src/calibre/ebooks/metadata/overdrive.py b/src/calibre/ebooks/metadata/overdrive.py
index 5afb875fad..e72d168146 100644
--- a/src/calibre/ebooks/metadata/overdrive.py
+++ b/src/calibre/ebooks/metadata/overdrive.py
@@ -25,13 +25,12 @@ cache_lock = RLock()
 
 base_url = 'http://search.overdrive.com/'
 
-
 def create_query(self, title=None, authors=None, identifiers={}):
     q = ''
     if title or authors:
         def build_term(prefix, parts):
             return ' '.join('in'+prefix + ':' + x for x in parts)
-        title_tokens = list(self.get_title_tokens(title))
+        title_tokens = list(self.get_title_tokens(title, False))
         if title_tokens:
             q += build_term('title', title_tokens)
         author_tokens = self.get_author_tokens(authors,
@@ -58,7 +57,7 @@ def get_base_referer():
        'http://spl.lib.overdrive.com/5875E082-4CB2-4689-9426-8509F354AFEF/10/335/en/'
     ]
     return choices[random.randint(0, len(choices)-1)]
-    
+
 def format_results(reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid):
     fix_slashes = re.compile(r'\\/')
     thumbimage = fix_slashes.sub('/', thumbimage)
@@ -67,8 +66,10 @@ def format_results(reserveid, od_title, subtitle, series, publisher, creators, t
     social_metadata_url = base_url+'TitleInfo.aspx?ReserveID='+reserveid+'&FormatID='+formatid
     series_num = ''
     if not series:
-        if subtitle:
-            title = od_title+': '+subtitle
+        if subtitle:
+            title = od_title+': '+subtitle
+        else:
+            title = od_title
     else:
         title = od_title
         m = re.search("([0-9]+$)", subtitle)
@@ -76,36 +77,12 @@ def format_results(reserveid, od_title, subtitle, series, publisher, creators, t
             series_num = float(m.group(1))
     return [cover_url, social_metadata_url, worldcatlink, series, series_num, publisher, creators, reserveid, title]
 
-def overdrive_search(br, q, title, author):
-    q_query = q+'default.aspx/SearchByKeyword'
-    q_init_search = q+'SearchResults.aspx'
-    # get first author as string - convert this to a proper cleanup function later
-    s = Source(None)
-    print "printing list with string:"
-    print list(s.get_author_tokens(['J. R. R. Tolkien']))
-    print "printing list with author "+str(author)+":"
-    print list(s.get_author_tokens(author))
-    author = list(s.get_author_tokens(author))
-    for token in author:
-        print "cleaned up author is: "+str(token)
-    author_q = '+'.join(author)
-    #author_q = separator.join(for x in author)
-    # query terms
-    #author_q = re.sub('\s', '+', author_q)
-    print "final author query is "+str(author_q)
-    q_xref = q+'SearchResults.svc/GetResults?iDisplayLength=10&sSearch='+author_q
-    query = '{"szKeyword":"'+title+'"}'
-
-    # main query, requires specific Content Type header
-    req = mechanize.Request(q_query)
-    req.add_header('Content-Type', 'application/json; charset=utf-8')
-    br.open_novisit(req, query)
-
-    print "q_init_search is "+q_init_search
-
-    # the query must be initialized by loading an empty search results page
-    # this page attempts to set a cookie that Mechanize doesn't like
-    # copy the cookiejar to a separate instance and make a one-off request with the temp cookiejar
+def safe_query(br, query_url):
+    '''
+    The query must be initialized by loading an empty search results page
+    this page attempts to set a cookie that Mechanize doesn't like
+    copy the cookiejar to a separate instance and make a one-off request with the temp cookiejar
+    '''
     goodcookies = br._ua_handlers['_cookies'].cookiejar
     clean_cj = mechanize.CookieJar()
     cookies_to_copy = []
@@ -115,10 +92,46 @@ def overdrive_search(br, q, title, author):
     for copied_cookie in cookies_to_copy:
         clean_cj.set_cookie(copied_cookie)
 
-    br.open_novisit(q_init_search)
+    br.open_novisit(query_url)
     br.set_cookiejar(clean_cj)
 
+
+def overdrive_search(br, q, title, author):
+    q_query = q+'default.aspx/SearchByKeyword'
+    q_init_search = q+'SearchResults.aspx'
+    # get first author as string - convert this to a proper cleanup function later
+    s = Source(None)
+    print "printing list with string:"
+    #print list(s.get_author_tokens(['J. R. R. Tolkien']))
+    print "printing list with author "+str(author)+":"
+    print list(s.get_author_tokens(author))
+    author_tokens = list(s.get_author_tokens(author))
+    for token in author_tokens:
+        print "cleaned up author token is: "+str(token)
+    author_q = ' '.join(author_tokens)
+
+    title_tokens = list(s.get_title_tokens(title))
+    for token in title_tokens:
+        print "cleaned up title token is: "+str(token)
+    title_q = '+'.join(title_tokens)
+    #author_q = separator.join(for x in author)
+    # query terms
+    #author_q = re.sub('\s', '+', author_q)
+    print "final author query is "+str(author_q)
+    print "final title query is "+str(title_q)
+    q_xref = q+'SearchResults.svc/GetResults?iDisplayLength=20&sSearch='+title_q
+    query = '{"szKeyword":"'+author_q+'"}'
+
+    # main query, requires specific Content Type header
+    req = mechanize.Request(q_query)
+    req.add_header('Content-Type', 'application/json; charset=utf-8')
+    br.open_novisit(req, query)
+
+    print "q_init_search is "+q_init_search
+    # initiate the search without messing up the cookiejar
+    safe_query(br, q_init_search)
+
     # get the search results object
     xreq = mechanize.Request(q_xref)
     xreq.add_header('X-Requested-With', 'XMLHttpRequest')
@@ -126,83 +139,102 @@ def overdrive_search(br, q, title, author):
     xreq.add_header('Accept', 'application/json, text/javascript, */*')
     raw = br.open_novisit(xreq).read()
     print "overdrive search result is:\n"+raw
+    print "\n\nsorting results"
+    return sort_ovrdrv_results(raw, title, title_tokens, author, author_tokens)
+
+
+def sort_ovrdrv_results(raw, title=None, title_tokens=None, author=None, author_tokens=None, ovrdrv_id=None):
+    print "\ntitle to search for is "+str(title)+"\nauthor to search for is "+str(author)
+    close_matches = []
     raw = re.sub('.*?\[\[(?P<content>.*?)\]\].*', '[[\g<content>]]', raw)
     results = eval(raw)
     print "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n"
-    print results
-    # The search results are from a keyword search (overdrive's advanced search is broken),
+    #print results
+    # The search results are either from a keyword search or a multi-format list from a single ID,
     # sort through the results for closest match/format
     for result in results:
         print "\n\n\nthis result is "+str(result)
         for reserveid, od_title, subtitle, edition, series, publisher, format, formatid, creators, \
             thumbimage, shortdescription, worldcatlink, excerptlink, creatorfile, sorttitle, \
             availabletolibrary, availabletoretailer, relevancyrank, unknown1, unknown2, unknown3 in results:
-            creators = creators.split(', ')
-            print "fixed creators are: "+str(creators)
-            # if an exact match occurs
-            if creators[0] == author and od_title == title and int(formatid) in [1, 50, 410, 900]:
-                print "Got Exact Match!!!"
-                return format_results(reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid)
-
-
-def library_search(br, q, title, author):
-    q_search = q+'AdvancedSearch.htm'
-    q_query = q+'BANGSearch.dll'
-    br.open(q_search)
-    # Search for cover with audiobooks lowest priority
-    for format in ['410','50','900','25','425']:
-        query = 'Title='+title+'&Creator='+author+'&Keyword=&ISBN=&Format='+format+'&Language=&Publisher=&Subject=&Award=&CollDate=&PerPage=10&Sort=SortBy%3Dtitle'
-        query = re.sub('\s', '+', query)
-        #print "search url is "+str(q_search)
-        print "query is "+str(query)
-        raw = br.open(q_query, query).read()
-        #print "raw html is:\n"+str(raw)
-        raw = xml_to_unicode(raw, strip_encoding_pats=True,
-            resolve_entities=True)[0]
-        root = html.fromstring(raw)
-        revs = root.xpath("//img[@class='blackborder']")
-        if revs:
-            #print "revs are "+str(revs)
-            # get the first match, as it's the most likely candidate
-            x = revs[0]
-            id = urllib.unquote(re.sub('.*?/(?P<id>%7B.*?%7D).*', '\g<id>', x.get('src')))
-            curl = re.sub('(?P<img>(Ima?g(eType-)?))200', '\g<img>100', x.get('src'))
-            murl = root.xpath("//img[@class='blackborder']/parent::*")
-            if murl:
-                murl = [y.get('href') for y in murl]
-                print "murl is"+str(murl)
-                murl = q+murl[0]
+            if ovrdrv_id is not None and int(formatid) in [1, 50, 410, 900]:
+                print "overdrive id is not None, searching based on format type priority"
+                return format_results(reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid)
             else:
-                print "didn't get metadata URL"
-            print "curl is "+str(curl)+", id is "+str(id)+", murl is "+str(murl)
-            ovrdrv_data = [id, curl, murl]
-            print "revs final are "+str(revs)
-            return ovrdrv_data
+                creators = creators.split(', ')
+                print "fixed creators are: "+str(creators)
+                # if an exact match in a preferred format occurs
+                if creators[0] == author[0] and od_title == title and int(formatid) in [1, 50, 410, 900]:
+                    print "Got Exact Match!!!"
+                    return format_results(reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid)
+                else:
+                    close_title_match = False
+                    close_author_match = False
+                    for token in title_tokens:
+                        if od_title.lower().find(token.lower()) != -1:
+                            close_title_match = True
+                        else:
+                            close_title_match = False
+                            break
+                    for token in author_tokens:
+                        if creators[0].lower().find(token.lower()) != -1:
+                            close_author_match = True
+                        else:
+                            close_author_match = False
+                            break
+                    if close_title_match and close_author_match and int(formatid) in [1, 50, 410, 900]:
+                        close_matches.append(format_results(reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid))
+    if close_matches:
+        return close_matches[0]
+    else:
+        return None
 
-def find_ovrdrv_data(br, title, author, isbn):
-    print "in fnd_ovrdrv_data, title is "+str(title)+", author is "+str(author)
+
+def overdrive_get_record(br, q, ovrdrv_id):
+    search_url = q+'SearchResults.aspx?ReserveID={'+ovrdrv_id+'}'
+    results_url = q+'SearchResults.svc/GetResults?sEcho=1&iColumns=18&sColumns=ReserveID%2CTitle%2CSubtitle%2CEdition%2CSeries%2CPublisher%2CFormat%2CFormatID%2CCreators%2CThumbImage%2CShortDescription%2CWorldCatLink%2CExcerptLink%2CCreatorFile%2CSortTitle%2CAvailableToLibrary%2CAvailableToRetailer%2CRelevancyRank&iDisplayStart=0&iDisplayLength=10&sSearch=&bEscapeRegex=true&iSortingCols=1&iSortCol_0=17&sSortDir_0=asc'
+
+    # get the base url to set the proper session cookie
+    br.open_novisit(q)
+
+    # initialize the search
+    safe_query(br, search_url)
+
+    # get the results
+    req = mechanize.Request(results_url)
+    req.add_header('X-Requested-With', 'XMLHttpRequest')
+    req.add_header('Referer', search_url)
+    req.add_header('Accept', 'application/json, text/javascript, */*')
+    raw = br.open_novisit(req)
+    raw = str(list(raw))
+    return sort_ovrdrv_results(raw, None, None, None, ovrdrv_id)
+
+
+def find_ovrdrv_data(br, title, author, isbn, ovrdrv_id=None):
+    print "in find_ovrdrv_data, title is "+str(title)+", author is "+str(author)+", overdrive id is "+str(ovrdrv_id)
     q = base_url
-    if re.match('http://search\.overdrive\.', q):
+    if ovrdrv_id is None:
         return overdrive_search(br, q, title, author)
     else:
-        return library_search(br, q, title, author)
-
+        return overdrive_get_record(br, q, ovrdrv_id)
 
-def to_ovrdrv_data(br, title, author, isbn):
+
+def to_ovrdrv_data(br, title, author, isbn, ovrdrv_id=None):
     print "starting to_ovrdrv_data"
     with cache_lock:
         ans = ovrdrv_data_cache.get(isbn, None)
         if ans:
-            print "inside to_ovrdrv_data, ans returned positive, ans is"+str(ans)
+            print "inside to_ovrdrv_data, cache lookup successful, ans is "+str(ans)
             return ans
         if ans is False:
             print "inside to_ovrdrv_data, ans returned False"
             return None
     try:
-        ovrdrv_data = find_ovrdrv_data(br, title, author, isbn)
-        print "ovrdrv_data = "+str(ovrdrv_data)
+        print "trying to retrieve data, running find_ovrdrv_data"
+        ovrdrv_data = find_ovrdrv_data(br, title, author, isbn, ovrdrv_id)
+        print "ovrdrv_data is "+str(ovrdrv_data)
     except:
         import traceback
         traceback.print_exc()
@@ -210,66 +242,69 @@ def to_ovrdrv_data(br, title, author, isbn):
 
     with cache_lock:
         ovrdrv_data_cache[isbn] = ovrdrv_data if ovrdrv_data else False
+    if ovrdrv_data:
+        from calibre.ebooks.metadata.xisbn import xisbn
+        for i in xisbn.get_associated_isbns(isbn):
+            with cache_lock:
+                ovrdrv_data_cache[i] = ovrdrv_data
+
     return ovrdrv_data
 
-def get_social_metadata(title, authors, publisher, isbn):
+def get_social_metadata(title, authors, isbn, ovrdrv_id=None):
     author = authors[0]
     mi = Metadata(title, authors)
-    if not isbn:
-        return mi
-    isbn = check_isbn(isbn)
-    if not isbn:
-        return mi
     br = browser()
-    ovrdrv_data = to_ovrdrv_data(br, title, authors, isbn)
-    if ovrdrv_data and get_metadata_detail_ovrdrv(br, ovrdrv_data, mi):
+    print "calling to_ovrdrv_data from inside get_social_metadata"
+    ovrdrv_data = to_ovrdrv_data(br, title, authors, isbn, ovrdrv_id)
+
+    #[cover_url, social_metadata_url, worldcatlink, series, series_num, publisher, creators, reserveid, title]
+
+    if len(ovrdrv_data[3]) > 1:
+        mi.series = ovrdrv_data[3]
+        if ovrdrv_data[4]:
+            mi.series_index = ovrdrv_data[4]
+    mi.publisher = ovrdrv_data[5]
+    mi.authors = ovrdrv_data[6]
+    if ovrdrv_id is None:
+        ovrdrv_id = ovrdrv_data[7]
+    mi.set_identifier('overdrive', ovrdrv_id)
+    mi.title = ovrdrv_data[8]
+
+    if ovrdrv_data and get_metadata_detail(br, ovrdrv_data[1], mi, isbn):
         return mi
-    #from calibre.ebooks.metadata.xisbn import xisbn
-    #for i in xisbn.get_associated_isbns(isbn):
-    #    print "xisbn isbn is "+str(i)
-    #    ovrdrv_data = to_ovrdrv_data(br, title, author, i)
-    #    if ovrdrv_data and get_metadata_detail(br, ovrdrv_data, mi):
-    #        return mi
     return mi
 
-def get_cover_url(isbn, title, author, br):
+def get_cover_url(isbn, title, author, br, ovrdrv_id=None):
     print "starting get_cover_url"
-    isbn = check_isbn(isbn)
-    print "isbn is "+str(isbn)
     print "title is "+str(title)
     print "author is "+str(author[0])
+    print "isbn is "+str(isbn)
+    print "ovrdrv_id is "+str(ovrdrv_id)
     with cache_lock:
         ans = cover_url_cache.get(isbn, None)
+        #ans = cover_url_cache.get(ovrdrv_id, None)
         if ans:
-            print "ans returned positive"
+            print "cover url cache lookup returned positive, ans is "+str(ans)
            return ans
         if ans is False:
-            "ans returned false"
+            "cover url cache lookup returned false"
            return None
-    print "in get_cover_url, running through ovrdrv_data function"
-    ovrdrv_data = to_ovrdrv_data(br, title, author, isbn)
-    print "ovrdrv_id is "+str(ovrdrv_data)
+    print "in get_cover_url, calling to_ovrdrv_data function"
+    ovrdrv_data = to_ovrdrv_data(br, title, author, isbn, ovrdrv_id)
     if ovrdrv_data:
         ans = ovrdrv_data[0]
-        print "inside get_cover_url, ans is "+str(ans)
+        print "inside get_cover_url, got url from to_ovrdrv_data, ans is "+str(ans)
         if ans:
+            print "writing cover url to url cache"
             with cache_lock:
                 cover_url_cache[isbn] = ans
+                #cover_url_cache[ovrdrv_id] = ans
             return ans
-    #from calibre.ebooks.metadata.xisbn import xisbn
-    #for i in xisbn.get_associated_isbns(isbn):
-    #    print "in get_cover_url, using xisbn list to associate other books"
-    #    ovrdrv_data = to_ovrdrv_data(br, title, author, i)
-    #    if ovrdrv_data:
-    #        ans = _get_cover_url(br, ovrdrv_data)
-    #        if ans:
-    #            with cache_lock:
-    #                cover_url_cache[isbn] = ans
-    #                cover_url_cache[i] = ans
-    #            return ans
+
     with cache_lock:
+        print "marking cover url cache for this isbn false"
         cover_url_cache[isbn] = False
     return None
 
@@ -303,18 +338,14 @@ def _get_cover_url(br, ovrdrv_data):
                 return ('/'.join(parts[:-1]))+'/'+bn
     return None
 
-
-def get_metadata_detail(br, ovrdrv_data, mi):
-    q = ovrdrv_data[2]
+def get_metadata_detail(br, metadata_url, mi, isbn=None):
     try:
-        raw = br.open_novisit(q).read()
+        raw = br.open_novisit(metadata_url).read()
     except Exception, e:
         if callable(getattr(e, 'getcode', None)) and \
                 e.getcode() == 404:
             return False
-        raise
-    if '404 - ' in raw:
-        return False
+        raise
     raw = xml_to_unicode(raw, strip_encoding_pats=True,
             resolve_entities=True)[0]
     try:
@@ -322,26 +353,28 @@ def get_metadata_detail(br, ovrdrv_data, mi):
         root = html.fromstring(raw)
     except:
         return False
 
-    # Check for series name and retrieve it
-    series_name = root.xpath("//td/script[re:test(text(), 'szSeries', 'i')]",
-        namespaces={"re": "http://exslt.org/regular-expressions"})
-    if series_name:
-        series = html.tostring(series_name[0], method='html', encoding=unicode).strip()
-        series = re.sub('(?s).*?szSeries\s*=\s*\"(?P<series>.*?)\";.*', '\g<series>', series)
-        if len(series) > 1:
-            mi.series = series
-            # If series was successful attempt to get the series number
-            series_num = root.xpath("//div/strong[re:test(text(), ',\s(Book|Part|Volume)')]",
-                namespaces={"re": "http://exslt.org/regular-expressions"})
-            if series_num:
-                series_num = float(re.sub('(?s).*?,\s*(Book|Part|Volume)\s*(?P<num>\d+).*', '\g<num>',
-                    etree.tostring(series_num[0])))
-                if series_num >= 1:
-                    mi.series_index = series_num
-                    print "series_num is "+str(series_num)
+    isbn = check_isbn(isbn)
 
-    desc = root.xpath("//td[@class='collection' and re:test(., 'Description', 'i')]/following::div[1]",
-        namespaces={"re": "http://exslt.org/regular-expressions"})
+    pub_date = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblPubDate']/text()")
+    lang = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblLanguage']/text()")
+    subjects = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblSubjects']/text()")
+    ebook_isbn = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblIdentifier']/text()")
+    desc = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblDescription']/ancestor::div[1]")
+
+    if pub_date:
+        from calibre.utils.date import parse_date
+        mi.pubdate = parse_date(pub_date[0].strip())
+    if lang:
+        mi.language = lang[0].strip()
+        print "languages is "+str(mi.language)
+    if ebook_isbn and isbn is None:
+        print "ebook isbn is "+str(ebook_isbn[0])
+        mi.set_identifier('isbn', ebook_isbn[0])
+    #elif isbn is not None:
+    #    mi.set_identifier('isbn', isbn)
+    if subjects:
+        mi.tags = subjects
+        print "tags are "+str(mi.tags)
     if desc:
         desc = desc[0]
         desc = html.tostring(desc, method='html', encoding=unicode).strip()
@@ -351,36 +384,6 @@ def get_metadata_detail(br, ovrdrv_data, mi):
         desc = re.sub(r'(?s)<!--.*?-->', '', desc)
         mi.comments = sanitize_comments_html(desc)
 
-    publisher = root.xpath("//td/strong[re:test(text(), 'Publisher\:', 'i')]/ancestor::td[1]/following-sibling::td/text()",
-        namespaces={"re": "http://exslt.org/regular-expressions"})
-    if publisher:
-        mi.publisher = re.sub('^\s*(?P<pub>.*?)\s*$', '\g<pub>', publisher[0])
-        print "publisher is "+str(mi.publisher)
-
-    lang = root.xpath("//td/strong[re:test(text(), 'Language\(s\):', 'i')]/ancestor::td[1]/following-sibling::td/text()",
-        namespaces={"re": "http://exslt.org/regular-expressions"})
-    if lang:
-        mi.language = re.sub('^\s*(?P<lang>.*?)\s*$', '\g<lang>', lang[0])
-        print "languages is "+str(mi.language)
-
-    isbn = root.xpath("//tr/td[re:test(text(), 'ISBN:', 'i')]/following::td/text()",
-        namespaces={"re": "http://exslt.org/regular-expressions"})
-    if isbn:
-        mi.isbn = re.sub('^\s*(?P<isbn>.*?)\s*$', '\g<isbn>', isbn[0])
-        print "ISBN is "+str(mi.isbn)
-
-    subjects = root.xpath("//td/strong[re:test(text(), 'Subject', 'i')]/ancestor::td[1]/following-sibling::td/a/text()",
-        namespaces={"re": "http://exslt.org/regular-expressions"})
-    if subjects:
-        mi.tags = subjects
-        print "tags are "+str(mi.tags)
-
-    creators = root.xpath("//table/tr/td[re:test(text(), '\s*by', 'i')]/ancestor::tr[1]/td[2]/table/tr/td/a/text()",
-        namespaces={"re": "http://exslt.org/regular-expressions"})
-    if creators:
-        print "authors are "+str(creators)
-        mi.authors = creators
-
     return True
 
 
@@ -388,19 +391,26 @@ def main(args=sys.argv):
     import tempfile, os, time
     tdir = tempfile.gettempdir()
     br = browser()
-    for isbn, title, author in [
-        #('0899661343', 'On the Road', ['Jack Kerouac']), # basic test, no series, single author
-        #('9780061952838', 'The Fellowship of the Ring', ['J. R. R. Tolkien']), # Series test, multi-author
-        ('9780061952838', 'The Two Towers', ['J. R. R. Tolkien']), # Series test, book 2
-        #('9780345505057', 'Deluge', ['Anne McCaffrey']) # Multiple authors
-        #('', 'Deluge', ['Anne McCaffrey']) # Empty ISBN
-        #(None, 'On the Road', ['Jack Kerouac']) # Nonetype ISBN
+    for ovrdrv_id, isbn, title, author in [
+        #(None, '0899661343', 'On the Road', ['Jack Kerouac']), # basic test, no series, single author
+        #(None, '9780061952838', 'The Fellowship of the Ring', ['J. R. R. Tolkien']), # Series test, multi-author
+        #(None, '9780061952838', 'The Two Towers', ['J. R. R. Tolkien']), # Series test, book 2
+        #('57844706-20fa-4ace-b5ee-3470b1b52173', None, 'The Two Towers', ['J. R. R. Tolkien']), # Series test, w/ ovrdrv id
+        #(None, '9780345505057', 'Deluge', ['Anne McCaffrey']) # Multiple authors
+        #(None, None, 'Deluge', ['Anne McCaffrey']) # Empty ISBN
+        #(None, None, 'On the Road', ['Jack Kerouac']), # Nonetype ISBN
+        #(None, '9780345435279', 'A Caress of Twilight', ['Laurell K. Hamilton']),
+        #(None, '9780606087230', 'The Omnivore\'s Dilemma : A Natural History of Four Meals', ['Michael Pollan']), # Subtitle colon
+        #(None, '9780061747649', 'Mental_Floss Presents: Condensed Knowledge', ['Will Pearson', 'Mangesh Hattikudur']),
+        #(None, '9781400050802', 'The Zombie Survival Guide', ['Max Brooks']), # Two books with this title by this author
+        #(None, '9781775414315', 'The Worst Journey in the World / Antarctic 1910-1913', ['Apsley Cherry-Garrard']), # Garbage sub-title
+        (None, '9780440335160', 'Outlander', ['Diana Gabaldon']), # Returns lots of results to sort through to get the best match
         ]:
         cpath = os.path.join(tdir, title+'.jpg')
         print "cpath is "+cpath
         st = time.time()
-        curl = get_cover_url(isbn, title, author, br)
-        print '\n\n Took ', time.time() - st, ' to get metadata\n\n'
+        curl = get_cover_url(isbn, title, author, br, ovrdrv_id)
+        print '\n\n Took ', time.time() - st, ' to get basic metadata\n\n'
         if curl is None:
             print 'No cover found for', title
         else:
@@ -408,9 +418,7 @@
             #open(cpath, 'wb').write(br.open_novisit(curl).read())
             #print 'Cover for', title, 'saved to', cpath
 
-    #import time
-
-    #print get_social_metadata(title, author, None, isbn)
+    print get_social_metadata(title, author, isbn, ovrdrv_id)
     #print '\n\n', time.time() - st, '\n\n'
     return 0
 
diff --git a/src/calibre/ebooks/metadata/sources/base.py b/src/calibre/ebooks/metadata/sources/base.py
index 55cc996cf7..b600eafaf2 100644
--- a/src/calibre/ebooks/metadata/sources/base.py
+++ b/src/calibre/ebooks/metadata/sources/base.py
@@ -87,32 +87,40 @@ class Source(Plugin):
 
         if authors:
             # Leave ' in there for Irish names
-            pat = re.compile(r'[-,:;+!@#$%^&*(){}.`~"\s\[\]/]')
+            remove_pat = re.compile(r'[,:;!@#$%^&*(){}.`~"\s\[\]/]')
+            replace_pat = re.compile(r'-+')
             if only_first_author:
                 authors = authors[:1]
             for au in authors:
+                au = replace_pat.sub(' ', au)
                 parts = au.split()
                 if ',' in au:
                     # au probably in ln, fn form
                     parts = parts[1:] + parts[:1]
                 for tok in parts:
-                    tok = pat.sub('', tok).strip()
+                    tok = remove_pat.sub('', tok).strip()
                     if len(tok) > 2 and tok.lower() not in ('von', ):
                         yield tok
 
-    def get_title_tokens(self, title):
+    def get_title_tokens(self, title, strip_joiners=True):
         '''
         Take a title and return a list of tokens useful for an AND search query.
         Excludes connectives and punctuation.
         '''
         if title:
-            pat = re.compile(r'''[-,:;+!@#$%^&*(){}.`~"'\s\[\]/]''')
+            # strip sub-titles
+            subtitle = re.compile(r'([\(\[\{].*?[\)\]\}]|[/:\\].*$)')
+            if len(subtitle.sub('', title)) > 1:
+                title = subtitle.sub('', title)
+            pat = re.compile(r'''([-,:;+!@#$%^&*(){}.`~"\s\[\]/]|'(?!s))''')
             title = pat.sub(' ', title)
             tokens = title.split()
             for token in tokens:
                 token = token.strip()
-                if token and token.lower() not in ('a', 'and', 'the'):
+                if token and not strip_joiners:
+                    yield token
+                elif token and token.lower() not in ('a', 'and', 'the'):
                     yield token
 
     def split_jobs(self, jobs, num):
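
Note: the tokenizing rules this patch changes are easier to see in isolation. Below is a minimal, standalone sketch (plain Python, with hypothetical helper names rather than the calibre Source API) of what the patched get_title_tokens()/get_author_tokens() feed into the Overdrive keyword search; the sample title and author are taken from the test list in main() above.

import re

def title_tokens(title, strip_joiners=True):
    # Mirror the patched get_title_tokens(): drop bracketed or ':'/'/' sub-titles
    # when something is left over, replace punctuation, optionally drop joiner words.
    subtitle = re.compile(r'([\(\[\{].*?[\)\]\}]|[/:\\].*$)')
    if len(subtitle.sub('', title)) > 1:
        title = subtitle.sub('', title)
    title = re.sub(r'''([-,:;+!@#$%^&*(){}.`~"\s\[\]/]|'(?!s))''', ' ', title)
    for token in title.split():
        token = token.strip()
        if token and (not strip_joiners or token.lower() not in ('a', 'and', 'the')):
            yield token

def author_tokens(authors):
    # Mirror the patched get_author_tokens(): hyphens become spaces, most other
    # punctuation is removed, and very short tokens (or 'von') are dropped.
    remove_pat = re.compile(r'[,:;!@#$%^&*(){}.`~"\s\[\]/]')
    replace_pat = re.compile(r'-+')
    for au in authors:
        for tok in replace_pat.sub(' ', au).split():
            tok = remove_pat.sub('', tok).strip()
            if len(tok) > 2 and tok.lower() not in ('von',):
                yield tok

if __name__ == '__main__':
    title = "The Omnivore's Dilemma : A Natural History of Four Meals"
    authors = ['Michael Pollan']
    title_q = '+'.join(title_tokens(title))      # becomes the sSearch query parameter
    author_q = ' '.join(author_tokens(authors))  # becomes the szKeyword JSON payload
    print(title_q)   # Omnivore's+Dilemma
    print(author_q)  # Michael Pollan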
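And a hedged usage sketch of the two entry points touched in overdrive.py, intended to be run inside a calibre dev environment (for example via calibre-debug -e); the ISBN, title and ReserveID values are just the examples already listed in main() above, not guaranteed to still resolve.

from calibre import browser
from calibre.ebooks.metadata.overdrive import get_cover_url, get_social_metadata

br = browser()

# Direct-record path: a known Overdrive ReserveID skips the keyword search entirely.
print(get_cover_url(None, 'The Two Towers', ['J. R. R. Tolkien'], br,
        '57844706-20fa-4ace-b5ee-3470b1b52173'))

# Keyword-search path: no ReserveID, so title/author tokens drive the match.
print(get_social_metadata('Outlander', ['Diana Gabaldon'], '9780440335160'))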