From a81e601428b80f31d1d992d6d78ba88936a8c10d Mon Sep 17 00:00:00 2001 From: Lee Date: Sun, 6 Mar 2011 17:18:54 +0800 Subject: [PATCH 01/30] added initial Overdrive support --- src/calibre/customize/builtins.py | 8 +- src/calibre/ebooks/metadata/covers.py | 34 ++ src/calibre/ebooks/metadata/fetch.py | 21 ++ src/calibre/ebooks/metadata/overdrive.py | 386 +++++++++++++++++++++++ 4 files changed, 445 insertions(+), 4 deletions(-) create mode 100644 src/calibre/ebooks/metadata/overdrive.py diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index cd4c866562..0c71317f8f 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -581,19 +581,19 @@ from calibre.devices.kobo.driver import KOBO from calibre.devices.bambook.driver import BAMBOOK from calibre.ebooks.metadata.fetch import GoogleBooks, ISBNDB, Amazon, \ - KentDistrictLibrary + KentDistrictLibrary, Overdrive from calibre.ebooks.metadata.douban import DoubanBooks from calibre.ebooks.metadata.nicebooks import NiceBooks, NiceBooksCovers from calibre.ebooks.metadata.covers import OpenLibraryCovers, \ - AmazonCovers, DoubanCovers + AmazonCovers, DoubanCovers, OverdriveCovers from calibre.library.catalog import CSV_XML, EPUB_MOBI, BIBTEX from calibre.ebooks.epub.fix.unmanifested import Unmanifested from calibre.ebooks.epub.fix.epubcheck import Epubcheck -plugins = [HTML2ZIP, PML2PMLZ, TXT2TXTZ, ArchiveExtract, GoogleBooks, ISBNDB, Amazon, +plugins = [HTML2ZIP, PML2PMLZ, TXT2TXTZ, ArchiveExtract, GoogleBooks, ISBNDB, Amazon, Overdrive, KentDistrictLibrary, DoubanBooks, NiceBooks, CSV_XML, EPUB_MOBI, BIBTEX, Unmanifested, Epubcheck, OpenLibraryCovers, AmazonCovers, DoubanCovers, - NiceBooksCovers] + NiceBooksCovers, OverdriveCovers] plugins += [ ComicInput, EPUBInput, diff --git a/src/calibre/ebooks/metadata/covers.py b/src/calibre/ebooks/metadata/covers.py index 15e0a05c1e..280ca077ef 100644 --- a/src/calibre/ebooks/metadata/covers.py +++ 
b/src/calibre/ebooks/metadata/covers.py @@ -145,6 +145,40 @@ class AmazonCovers(CoverDownload): # {{{ # }}} +class OverdriveCovers(CoverDownload): # {{{ + + name = 'overdrive.com covers' + description = _('Download covers from Overdrive') + author = 'Kovid Goyal' + + + def has_cover(self, mi, ans, timeout=5.): + if not mi.authors or not mi.title: + return False + from calibre.ebooks.metadata.overdrive import get_cover_url + br = browser() + try: + get_cover_url(mi.isbn, mi.title, mi.authors, br) + self.debug('cover for', mi.isbn, 'found') + ans.set() + except Exception, e: + self.debug(e) + + def get_covers(self, mi, result_queue, abort, timeout=5.): + if not mi.isbn: + return + from calibre.ebooks.metadata.overdrive import get_cover_url + br = browser() + try: + url = get_cover_url(mi.isbn, mi.title, mi.authors, br) + cover_data = br.open_novisit(url).read() + result_queue.put((True, cover_data, 'jpg', self.name)) + except Exception, e: + result_queue.put((False, self.exception_to_string(e), + traceback.format_exc(), self.name)) + +# }}} + def check_for_cover(mi, timeout=5.): # {{{ from calibre.customize.ui import cover_sources ans = Event() diff --git a/src/calibre/ebooks/metadata/fetch.py b/src/calibre/ebooks/metadata/fetch.py index 667b4f4d7c..1f584bc107 100644 --- a/src/calibre/ebooks/metadata/fetch.py +++ b/src/calibre/ebooks/metadata/fetch.py @@ -250,6 +250,27 @@ class Amazon(MetadataSource): # {{{ # }}} +class Overdrive(MetadataSource): # {{{ + + name = 'Overdrive' + metadata_type = 'social' + description = _('Downloads metadata from the Overdrive library network') + + has_html_comments = True + + def fetch(self): + if not self.isbn: + return + from calibre.ebooks.metadata.overdrive import get_metadata + try: + self.results = get_metadata(self.title, self.book_author, + self.publisher, self.isbn) + except Exception, e: + self.exception = e + self.tb = traceback.format_exc() + + # }}} + class KentDistrictLibrary(MetadataSource): # {{{ name = 'Kent District 
Library' diff --git a/src/calibre/ebooks/metadata/overdrive.py b/src/calibre/ebooks/metadata/overdrive.py new file mode 100644 index 0000000000..ad512579d7 --- /dev/null +++ b/src/calibre/ebooks/metadata/overdrive.py @@ -0,0 +1,386 @@ +#!/usr/bin/env python +__license__ = 'GPL v3' +__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' +__docformat__ = 'restructuredtext en' + +''' +Fetch metadata using Adobe Overdrive +''' +import sys, re, random, urllib, mechanize, copy +from threading import RLock + +from lxml import html, etree +from lxml.html import soupparser + +from calibre import browser +from calibre.ebooks.metadata import check_isbn +from calibre.ebooks.metadata.sources.base import Source +from calibre.ebooks.metadata.book.base import Metadata +from calibre.ebooks.chardet import xml_to_unicode +from calibre.library.comments import sanitize_comments_html + +ovrdrv_data_cache = {} +cover_url_cache = {} +cache_lock = RLock() +base_url = 'http://search.overdrive.com/' + +def get_base_referer(): + choices = [ + 'http://overdrive.chipublib.org/82DC601D-7DDE-4212-B43A-09D821935B01/10/375/en/', + 'http://emedia.clevnet.org/9D321DAD-EC0D-490D-BFD8-64AE2C96ECA8/10/241/en/', + 'http://singapore.lib.overdrive.com/F11D55BE-A917-4D63-8111-318E88B29740/10/382/en/', + 'http://ebooks.nypl.org/20E48048-A377-4520-BC43-F8729A42A424/10/257/en/', + 'http://spl.lib.overdrive.com/5875E082-4CB2-4689-9426-8509F354AFEF/10/335/en/' + ] + return choices[random.randint(0, len(choices)-1)] + +def format_results(reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid): + fix_slashes = re.compile(r'\\/') + thumbimage = fix_slashes.sub('/', thumbimage) + worldcatlink = fix_slashes.sub('/', worldcatlink) + cover_url = re.sub('(?P(Ima?g(eType-)?))200', '\g100', thumbimage) + social_metadata_url = base_url+'TitleInfo.aspx?ReserveID='+reserveid+'&FormatID='+formatid + series_num = '' + if not series: + if subtitle: + title = od_title+': '+subtitle + 
else: + title = od_title + m = re.search("([0-9]+$)", subtitle) + if m: + series_num = float(m.group(1)) + return [cover_url, social_metadata_url, worldcatlink, series, series_num, publisher, creators, reserveid, title] + +def overdrive_search(br, q, title, author): + q_query = q+'default.aspx/SearchByKeyword' + q_init_search = q+'SearchResults.aspx' + + # query terms + author_q = re.sub('\s', '+', author) + q_xref = q+'SearchResults.svc/GetResults?iDisplayLength=10&sSearch='+author_q + query = '{"szKeyword":"'+title+'"}' + + # main query, requires specific Content Type header + req = mechanize.Request(q_query) + req.add_header('Content-Type', 'application/json; charset=utf-8') + br.open_novisit(req, query) + + print "q_init_search is "+q_init_search + + # the query must be initialized by loading an empty search results page + # this page attempts to set a cookie that Mechanize doesn't like + # copy the cookiejar to a separate instance and make a one-off request with the temp cookiejar + goodcookies = br._ua_handlers['_cookies'].cookiejar + clean_cj = mechanize.CookieJar() + cookies_to_copy = [] + for cookie in goodcookies: + copied_cookie = copy.deepcopy(cookie) + cookies_to_copy.append(copied_cookie) + for copied_cookie in cookies_to_copy: + clean_cj.set_cookie(copied_cookie) + + br.open_novisit(q_init_search) + + br.set_cookiejar(clean_cj) + + # get the search results object + xreq = mechanize.Request(q_xref) + xreq.add_header('X-Requested-With', 'XMLHttpRequest') + xreq.add_header('Referer', q_init_search) + xreq.add_header('Accept', 'application/json, text/javascript, */*') + raw = br.open_novisit(xreq).read() + print "overdrive search result is:\n"+raw + raw = re.sub('.*?\[\[(?P.*?)\]\].*', '[[\g]]', raw) + results = eval(raw) + print "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n" + print results + # The search results are from a keyword search (overdrive's advanced search is broken), + # sort through the results for closest match/format + for result in 
results: + print "\n\n\nthis result is "+str(result) + for reserveid, od_title, subtitle, edition, series, publisher, format, formatid, creators, \ + thumbimage, shortdescription, worldcatlink, excerptlink, creatorfile, sorttitle, \ + availabletolibrary, availabletoretailer, relevancyrank, unknown1, unknown2, unknown3 in results: + creators = creators.split(', ') + print "fixed creators are: "+str(creators) + # if an exact match occurs + if creators[0] == author and od_title == title and int(formatid) in [1, 50, 410, 900]: + print "Got Exact Match!!!" + return format_results(reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid) + + +def library_search(br, q, title, author): + q_search = q+'AdvancedSearch.htm' + q_query = q+'BANGSearch.dll' + br.open(q_search) + # Search for cover with audiobooks lowest priority + for format in ['410','50','900','25','425']: + query = 'Title='+title+'&Creator='+author+'&Keyword=&ISBN=&Format='+format+'&Language=&Publisher=&Subject=&Award=&CollDate=&PerPage=10&Sort=SortBy%3Dtitle' + query = re.sub('\s', '+', query) + #print "search url is "+str(q_search) + print "query is "+str(query) + raw = br.open(q_query, query).read() + #print "raw html is:\n"+str(raw) + raw = xml_to_unicode(raw, strip_encoding_pats=True, + resolve_entities=True)[0] + root = html.fromstring(raw) + revs = root.xpath("//img[@class='blackborder']") + if revs: + #print "revs are "+str(revs) + # get the first match, as it's the most likely candidate + x = revs[0] + id = urllib.unquote(re.sub('.*?/(?P%7B.*?%7D).*', '\g', x.get('src'))) + curl = re.sub('(?P(Ima?g(eType-)?))200', '\g100', x.get('src')) + murl = root.xpath("//img[@class='blackborder']/parent::*") + if murl: + murl = [y.get('href') for y in murl] + print "murl is"+str(murl) + murl = q+murl[0] + else: + print "didn't get metadata URL" + print "curl is "+str(curl)+", id is "+str(id)+", murl is "+str(murl) + ovrdrv_data = [id, curl, murl] + print "revs final are 
"+str(revs) + return ovrdrv_data + + +def find_ovrdrv_data(br, title, author, isbn): + print "in fnd_ovrdrv_data, title is "+str(title)+", author is "+str(author) + q = base_url + if re.match('http://search\.overdrive\.', q): + return overdrive_search(br, q, title, author) + else: + return library_search(br, q, title, author) + + + +def to_ovrdrv_data(br, title, author, isbn): + print "starting to_ovrdrv_data" + with cache_lock: + ans = ovrdrv_data_cache.get(isbn, None) + if ans: + print "inside to_ovrdrv_data, ans returned positive, ans is"+str(ans) + return ans + if ans is False: + print "inside to_ovrdrv_data, ans returned False" + return None + try: + ovrdrv_data = find_ovrdrv_data(br, title, author, isbn) + print "ovrdrv_data = "+str(ovrdrv_data) + except: + import traceback + traceback.print_exc() + ovrdrv_data = None + + with cache_lock: + ovrdrv_data_cache[isbn] = ovrdrv_data if ovrdrv_data else False + return ovrdrv_data + + +def get_social_metadata(title, authors, publisher, isbn): + author = authors[0] + mi = Metadata(title, authors) + if not isbn: + return mi + isbn = check_isbn(isbn) + if not isbn: + return mi + br = browser() + ovrdrv_data = to_ovrdrv_data(br, title, authors, isbn) + if ovrdrv_data and get_metadata_detail_ovrdrv(br, ovrdrv_data, mi): + return mi + #from calibre.ebooks.metadata.xisbn import xisbn + #for i in xisbn.get_associated_isbns(isbn): + # print "xisbn isbn is "+str(i) + # ovrdrv_data = to_ovrdrv_data(br, title, author, i) + # if ovrdrv_data and get_metadata_detail(br, ovrdrv_data, mi): + # return mi + return mi + +def get_cover_url(isbn, title, author, br): + print "starting get_cover_url" + isbn = check_isbn(isbn) + print "isbn is "+str(isbn) + print "title is "+str(title) + print "author is "+str(author[0]) + cleanup = Source() + author = cleanup.get_author_tokens(author) + print "cleansed author is "+str(author) + + with cache_lock: + ans = cover_url_cache.get(isbn, None) + if ans: + print "ans returned positive" + return ans 
+ if ans is False: + "ans returned false" + return None + print "in get_cover_url, running through ovrdrv_data function" + ovrdrv_data = to_ovrdrv_data(br, title, author, isbn) + print "ovrdrv_id is "+str(ovrdrv_data) + if ovrdrv_data: + ans = ovrdrv_data[0] + print "inside get_cover_url, ans is "+str(ans) + if ans: + with cache_lock: + cover_url_cache[isbn] = ans + return ans + #from calibre.ebooks.metadata.xisbn import xisbn + #for i in xisbn.get_associated_isbns(isbn): + # print "in get_cover_url, using xisbn list to associate other books" + # ovrdrv_data = to_ovrdrv_data(br, title, author, i) + # if ovrdrv_data: + # ans = _get_cover_url(br, ovrdrv_data) + # if ans: + # with cache_lock: + # cover_url_cache[isbn] = ans + # cover_url_cache[i] = ans + # return ans + with cache_lock: + cover_url_cache[isbn] = False + return None + +def _get_cover_url(br, ovrdrv_data): + q = ovrdrv_data[1] + try: + raw = br.open_novisit(q).read() + except Exception, e: + if callable(getattr(e, 'getcode', None)) and \ + e.getcode() == 404: + return None + raise + if '404 - ' in raw: + return None + raw = xml_to_unicode(raw, strip_encoding_pats=True, + resolve_entities=True)[0] + try: + root = soupparser.fromstring(raw) + except: + return False + + imgs = root.xpath('//img[@id="prodImage" and @src]') + if imgs: + src = imgs[0].get('src') + parts = src.split('/') + if len(parts) > 3: + bn = parts[-1] + sparts = bn.split('_') + if len(sparts) > 2: + bn = sparts[0] + sparts[-1] + return ('/'.join(parts[:-1]))+'/'+bn + return None + + +def get_metadata_detail(br, ovrdrv_data, mi): + q = ovrdrv_data[2] + try: + raw = br.open_novisit(q).read() + except Exception, e: + if callable(getattr(e, 'getcode', None)) and \ + e.getcode() == 404: + return False + raise + if '<title>404 - ' in raw: + return False + raw = xml_to_unicode(raw, strip_encoding_pats=True, + resolve_entities=True)[0] + try: + root = soupparser.fromstring(raw) + except: + return False + + # Check for series name and retrieve it 
+ series_name = root.xpath("//td/script[re:test(text(), 'szSeries', 'i')]", + namespaces={"re": "http://exslt.org/regular-expressions"}) + if series_name: + series = html.tostring(series_name[0], method='html', encoding=unicode).strip() + series = re.sub('(?s).*?szSeries\s*=\s*\"(?P<series>.*?)\";.*', '\g<series>', series) + if len(series) > 1: + mi.series = series + # If series was successful attempt to get the series number + series_num = root.xpath("//div/strong[re:test(text(), ',\s(Book|Part|Volume)')]", + namespaces={"re": "http://exslt.org/regular-expressions"}) + if series_num: + series_num = float(re.sub('(?s).*?,\s*(Book|Part|Volume)\s*(?P<num>\d+).*', '\g<num>', + etree.tostring(series_num[0]))) + if series_num >= 1: + mi.series_index = series_num + print "series_num is "+str(series_num) + + desc = root.xpath("//td[@class='collection' and re:test(., 'Description', 'i')]/following::div[1]", + namespaces={"re": "http://exslt.org/regular-expressions"}) + if desc: + desc = desc[0] + desc = html.tostring(desc, method='html', encoding=unicode).strip() + # remove all attributes from tags + desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc) + # Remove comments + desc = re.sub(r'(?s)<!--.*?-->', '', desc) + mi.comments = sanitize_comments_html(desc) + + publisher = root.xpath("//td/strong[re:test(text(), 'Publisher\:', 'i')]/ancestor::td[1]/following-sibling::td/text()", + namespaces={"re": "http://exslt.org/regular-expressions"}) + if publisher: + mi.publisher = re.sub('^\s*(?P<pub>.*?)\s*$', '\g<pub>', publisher[0]) + print "publisher is "+str(mi.publisher) + + lang = root.xpath("//td/strong[re:test(text(), 'Language\(s\):', 'i')]/ancestor::td[1]/following-sibling::td/text()", + namespaces={"re": "http://exslt.org/regular-expressions"}) + if lang: + mi.language = re.sub('^\s*(?P<lang>.*?)\s*$', '\g<lang>', lang[0]) + print "languages is "+str(mi.language) + + isbn = root.xpath("//tr/td[re:test(text(), 'ISBN:', 'i')]/following::td/text()", + 
namespaces={"re": "http://exslt.org/regular-expressions"}) + if isbn: + mi.isbn = re.sub('^\s*(?P<isbn>.*?)\s*$', '\g<isbn>', isbn[0]) + print "ISBN is "+str(mi.isbn) + + subjects = root.xpath("//td/strong[re:test(text(), 'Subject', 'i')]/ancestor::td[1]/following-sibling::td/a/text()", + namespaces={"re": "http://exslt.org/regular-expressions"}) + if subjects: + mi.tags = subjects + print "tags are "+str(mi.tags) + + creators = root.xpath("//table/tr/td[re:test(text(), '\s*by', 'i')]/ancestor::tr[1]/td[2]/table/tr/td/a/text()", + namespaces={"re": "http://exslt.org/regular-expressions"}) + if creators: + print "authors are "+str(creators) + mi.authors = creators + + return True + +def main(args=sys.argv): + print "running through main tests" + import tempfile, os, time + tdir = tempfile.gettempdir() + br = browser() + for isbn, title, author in [ + #('0899661343', 'On the Road', ['Jack Kerouac']), # basic test, no series, single author + #('9780061952838', 'The Fellowship of the Ring', ['J. R. R. Tolkien']), # Series test, multi-author + #('9780061952838', 'The Two Towers', ['J. R. R. 
Tolkien']), # Series test, book 2 + ('9780345505057', 'Deluge', ['Anne McCaffrey']) # Multiple authors + #('', 'Deluge', ['Anne McCaffrey']) # Empty ISBN + #(None, 'On the Road', ['Jack Kerouac']) # Nonetype ISBN + ]: + cpath = os.path.join(tdir, title+'.jpg') + print "cpath is "+cpath + st = time.time() + curl = get_cover_url(isbn, title, author, br) + print '\n\n Took ', time.time() - st, ' to get metadata\n\n' + if curl is None: + print 'No cover found for', title + else: + print "curl is "+curl + #open(cpath, 'wb').write(br.open_novisit(curl).read()) + #print 'Cover for', title, 'saved to', cpath + + #import time + + #print get_social_metadata(title, author, None, isbn) + #print '\n\n', time.time() - st, '\n\n' + + return 0 + +if __name__ == '__main__': + sys.exit(main()) From 4e428219c94c05df0597c54aab9849f227928094 Mon Sep 17 00:00:00 2001 From: Lee <ldolse@yahoo.com> Date: Mon, 7 Mar 2011 04:37:47 +0800 Subject: [PATCH 02/30] ... --- src/calibre/ebooks/metadata/overdrive.py | 67 +++++++++++++----------- 1 file changed, 36 insertions(+), 31 deletions(-) diff --git a/src/calibre/ebooks/metadata/overdrive.py b/src/calibre/ebooks/metadata/overdrive.py index cb9ab2c9c9..5afb875fad 100644 --- a/src/calibre/ebooks/metadata/overdrive.py +++ b/src/calibre/ebooks/metadata/overdrive.py @@ -24,32 +24,29 @@ cover_url_cache = {} cache_lock = RLock() base_url = 'http://search.overdrive.com/' -class ContentReserve(Source): - def create_query(self, title=None, authors=None, identifiers={}): - q = '' - if title or authors: - def build_term(prefix, parts): - return ' '.join('in'+prefix + ':' + x for x in parts) - title_tokens = list(self.get_title_tokens(title)) - if title_tokens: - q += build_term('title', title_tokens) - author_tokens = self.get_author_tokens(authors, - only_first_author=True) - if author_tokens: - q += ('+' if q else '') + build_term('author', - author_tokens) - if isinstance(q, unicode): - q = q.encode('utf-8') - if not q: - return None - return 
BASE_URL+urlencode({ - 'q':q, - 'max-results':20, - 'start-index':1, - 'min-viewability':'none', - }) +def create_query(self, title=None, authors=None, identifiers={}): + q = '' + if title or authors: + def build_term(prefix, parts): + return ' '.join('in'+prefix + ':' + x for x in parts) + title_tokens = list(self.get_title_tokens(title)) + if title_tokens: + q += build_term('title', title_tokens) + author_tokens = self.get_author_tokens(authors, + only_first_author=True) + if author_tokens: + q += ('+' if q else '') + build_term('author', + author_tokens) + + if isinstance(q, unicode): + q = q.encode('utf-8') + if not q: + return None + return BASE_URL+urlencode({ + 'q':q, + }) def get_base_referer(): @@ -82,9 +79,20 @@ def format_results(reserveid, od_title, subtitle, series, publisher, creators, t def overdrive_search(br, q, title, author): q_query = q+'default.aspx/SearchByKeyword' q_init_search = q+'SearchResults.aspx' - + # get first author as string - convert this to a proper cleanup function later + s = Source(None) + print "printing list with string:" + print list(s.get_author_tokens(['J. R. R. 
Tolkien'])) + print "printing list with author "+str(author)+":" + print list(s.get_author_tokens(author)) + author = list(s.get_author_tokens(author)) + for token in author: + print "cleaned up author is: "+str(token) + author_q = '+'.join(author) + #author_q = separator.join(for x in author) # query terms - author_q = re.sub('\s', '+', author) + #author_q = re.sub('\s', '+', author_q) + print "final author query is "+str(author_q) q_xref = q+'SearchResults.svc/GetResults?iDisplayLength=10&sSearch='+author_q query = '{"szKeyword":"'+title+'"}' @@ -231,9 +239,6 @@ def get_cover_url(isbn, title, author, br): print "isbn is "+str(isbn) print "title is "+str(title) print "author is "+str(author[0]) - cleanup = ContentReserve() - query = cleanup.create_query(author, title) - print "cleansed query is "+str(author) with cache_lock: ans = cover_url_cache.get(isbn, None) @@ -386,8 +391,8 @@ def main(args=sys.argv): for isbn, title, author in [ #('0899661343', 'On the Road', ['Jack Kerouac']), # basic test, no series, single author #('9780061952838', 'The Fellowship of the Ring', ['J. R. R. Tolkien']), # Series test, multi-author - #('9780061952838', 'The Two Towers', ['J. R. R. Tolkien']), # Series test, book 2 - ('9780345505057', 'Deluge', ['Anne McCaffrey']) # Multiple authors + ('9780061952838', 'The Two Towers', ['J. R. R. 
Tolkien']), # Series test, book 2 + #('9780345505057', 'Deluge', ['Anne McCaffrey']) # Multiple authors #('', 'Deluge', ['Anne McCaffrey']) # Empty ISBN #(None, 'On the Road', ['Jack Kerouac']) # Nonetype ISBN ]: From c6a2c8e82e5dcd64f0bfb605b10f3a590eb41a08 Mon Sep 17 00:00:00 2001 From: Lee <ldolse@yahoo.com> Date: Tue, 22 Mar 2011 13:53:09 +0800 Subject: [PATCH 03/30] further work on the overdrive plugin --- src/calibre/ebooks/metadata/covers.py | 9 +- src/calibre/ebooks/metadata/fetch.py | 6 +- src/calibre/ebooks/metadata/overdrive.py | 386 ++++++++++---------- src/calibre/ebooks/metadata/sources/base.py | 18 +- 4 files changed, 214 insertions(+), 205 deletions(-) diff --git a/src/calibre/ebooks/metadata/covers.py b/src/calibre/ebooks/metadata/covers.py index 9f5958f1ad..6ea292aa93 100644 --- a/src/calibre/ebooks/metadata/covers.py +++ b/src/calibre/ebooks/metadata/covers.py @@ -161,14 +161,7 @@ class OverdriveCovers(CoverDownload): # {{{ def has_cover(self, mi, ans, timeout=5.): if not mi.authors or not mi.title: return False - from calibre.ebooks.metadata.overdrive import get_cover_url - br = browser() - try: - get_cover_url(mi.isbn, mi.title, mi.authors, br) - self.debug('cover for', mi.isbn, 'found') - ans.set() - except Exception, e: - self.debug(e) + return True def get_covers(self, mi, result_queue, abort, timeout=5.): if not mi.isbn: diff --git a/src/calibre/ebooks/metadata/fetch.py b/src/calibre/ebooks/metadata/fetch.py index 1f584bc107..0401ee78c5 100644 --- a/src/calibre/ebooks/metadata/fetch.py +++ b/src/calibre/ebooks/metadata/fetch.py @@ -261,10 +261,10 @@ class Overdrive(MetadataSource): # {{{ def fetch(self): if not self.isbn: return - from calibre.ebooks.metadata.overdrive import get_metadata + from calibre.ebooks.metadata.overdrive import get_social_metadata try: - self.results = get_metadata(self.title, self.book_author, - self.publisher, self.isbn) + self.results = get_social_metadata(self.title, self.book_author, self.isbn) + except 
Exception, e: self.exception = e self.tb = traceback.format_exc() diff --git a/src/calibre/ebooks/metadata/overdrive.py b/src/calibre/ebooks/metadata/overdrive.py index 5afb875fad..e72d168146 100644 --- a/src/calibre/ebooks/metadata/overdrive.py +++ b/src/calibre/ebooks/metadata/overdrive.py @@ -25,13 +25,12 @@ cache_lock = RLock() base_url = 'http://search.overdrive.com/' - def create_query(self, title=None, authors=None, identifiers={}): q = '' if title or authors: def build_term(prefix, parts): return ' '.join('in'+prefix + ':' + x for x in parts) - title_tokens = list(self.get_title_tokens(title)) + title_tokens = list(self.get_title_tokens(title, False)) if title_tokens: q += build_term('title', title_tokens) author_tokens = self.get_author_tokens(authors, @@ -58,7 +57,7 @@ def get_base_referer(): 'http://spl.lib.overdrive.com/5875E082-4CB2-4689-9426-8509F354AFEF/10/335/en/' ] return choices[random.randint(0, len(choices)-1)] - + def format_results(reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid): fix_slashes = re.compile(r'\\/') thumbimage = fix_slashes.sub('/', thumbimage) @@ -67,8 +66,10 @@ def format_results(reserveid, od_title, subtitle, series, publisher, creators, t social_metadata_url = base_url+'TitleInfo.aspx?ReserveID='+reserveid+'&FormatID='+formatid series_num = '' if not series: - if subtitle: - title = od_title+': '+subtitle + if subtitle: + title = od_title+': '+subtitle + else: + title = od_title else: title = od_title m = re.search("([0-9]+$)", subtitle) @@ -76,36 +77,12 @@ def format_results(reserveid, od_title, subtitle, series, publisher, creators, t series_num = float(m.group(1)) return [cover_url, social_metadata_url, worldcatlink, series, series_num, publisher, creators, reserveid, title] -def overdrive_search(br, q, title, author): - q_query = q+'default.aspx/SearchByKeyword' - q_init_search = q+'SearchResults.aspx' - # get first author as string - convert this to a proper cleanup function 
later - s = Source(None) - print "printing list with string:" - print list(s.get_author_tokens(['J. R. R. Tolkien'])) - print "printing list with author "+str(author)+":" - print list(s.get_author_tokens(author)) - author = list(s.get_author_tokens(author)) - for token in author: - print "cleaned up author is: "+str(token) - author_q = '+'.join(author) - #author_q = separator.join(for x in author) - # query terms - #author_q = re.sub('\s', '+', author_q) - print "final author query is "+str(author_q) - q_xref = q+'SearchResults.svc/GetResults?iDisplayLength=10&sSearch='+author_q - query = '{"szKeyword":"'+title+'"}' - - # main query, requires specific Content Type header - req = mechanize.Request(q_query) - req.add_header('Content-Type', 'application/json; charset=utf-8') - br.open_novisit(req, query) - - print "q_init_search is "+q_init_search - - # the query must be initialized by loading an empty search results page - # this page attempts to set a cookie that Mechanize doesn't like - # copy the cookiejar to a separate instance and make a one-off request with the temp cookiejar +def safe_query(br, query_url): + ''' + The query must be initialized by loading an empty search results page + this page attempts to set a cookie that Mechanize doesn't like + copy the cookiejar to a separate instance and make a one-off request with the temp cookiejar + ''' goodcookies = br._ua_handlers['_cookies'].cookiejar clean_cj = mechanize.CookieJar() cookies_to_copy = [] @@ -115,10 +92,46 @@ def overdrive_search(br, q, title, author): for copied_cookie in cookies_to_copy: clean_cj.set_cookie(copied_cookie) - br.open_novisit(q_init_search) + br.open_novisit(query_url) br.set_cookiejar(clean_cj) + +def overdrive_search(br, q, title, author): + q_query = q+'default.aspx/SearchByKeyword' + q_init_search = q+'SearchResults.aspx' + # get first author as string - convert this to a proper cleanup function later + s = Source(None) + print "printing list with string:" + #print 
list(s.get_author_tokens(['J. R. R. Tolkien'])) + print "printing list with author "+str(author)+":" + print list(s.get_author_tokens(author)) + author_tokens = list(s.get_author_tokens(author)) + for token in author_tokens: + print "cleaned up author token is: "+str(token) + author_q = ' '.join(author_tokens) + + title_tokens = list(s.get_title_tokens(title)) + for token in title_tokens: + print "cleaned up title token is: "+str(token) + title_q = '+'.join(title_tokens) + #author_q = separator.join(for x in author) + # query terms + #author_q = re.sub('\s', '+', author_q) + print "final author query is "+str(author_q) + print "final title query is "+str(title_q) + q_xref = q+'SearchResults.svc/GetResults?iDisplayLength=20&sSearch='+title_q + query = '{"szKeyword":"'+author_q+'"}' + + # main query, requires specific Content Type header + req = mechanize.Request(q_query) + req.add_header('Content-Type', 'application/json; charset=utf-8') + br.open_novisit(req, query) + + print "q_init_search is "+q_init_search + # initiate the search without messing up the cookiejar + safe_query(br, q_init_search) + # get the search results object xreq = mechanize.Request(q_xref) xreq.add_header('X-Requested-With', 'XMLHttpRequest') @@ -126,83 +139,102 @@ def overdrive_search(br, q, title, author): xreq.add_header('Accept', 'application/json, text/javascript, */*') raw = br.open_novisit(xreq).read() print "overdrive search result is:\n"+raw + print "\n\nsorting results" + return sort_ovrdrv_results(raw, title, title_tokens, author, author_tokens) + + +def sort_ovrdrv_results(raw, title=None, title_tokens=None, author=None, author_tokens=None, ovrdrv_id=None): + print "\ntitle to search for is "+str(title)+"\nauthor to search for is "+str(author) + close_matches = [] raw = re.sub('.*?\[\[(?P<content>.*?)\]\].*', '[[\g<content>]]', raw) results = eval(raw) print "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n" - print results - # The search results are from a keyword search (overdrive's 
advanced search is broken), + #print results + # The search results are either from a keyword search or a multi-format list from a single ID, # sort through the results for closest match/format for result in results: print "\n\n\nthis result is "+str(result) for reserveid, od_title, subtitle, edition, series, publisher, format, formatid, creators, \ thumbimage, shortdescription, worldcatlink, excerptlink, creatorfile, sorttitle, \ availabletolibrary, availabletoretailer, relevancyrank, unknown1, unknown2, unknown3 in results: - creators = creators.split(', ') - print "fixed creators are: "+str(creators) - # if an exact match occurs - if creators[0] == author and od_title == title and int(formatid) in [1, 50, 410, 900]: - print "Got Exact Match!!!" - return format_results(reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid) - - -def library_search(br, q, title, author): - q_search = q+'AdvancedSearch.htm' - q_query = q+'BANGSearch.dll' - br.open(q_search) - # Search for cover with audiobooks lowest priority - for format in ['410','50','900','25','425']: - query = 'Title='+title+'&Creator='+author+'&Keyword=&ISBN=&Format='+format+'&Language=&Publisher=&Subject=&Award=&CollDate=&PerPage=10&Sort=SortBy%3Dtitle' - query = re.sub('\s', '+', query) - #print "search url is "+str(q_search) - print "query is "+str(query) - raw = br.open(q_query, query).read() - #print "raw html is:\n"+str(raw) - raw = xml_to_unicode(raw, strip_encoding_pats=True, - resolve_entities=True)[0] - root = html.fromstring(raw) - revs = root.xpath("//img[@class='blackborder']") - if revs: - #print "revs are "+str(revs) - # get the first match, as it's the most likely candidate - x = revs[0] - id = urllib.unquote(re.sub('.*?/(?P<i>%7B.*?%7D).*', '\g<i>', x.get('src'))) - curl = re.sub('(?P<img>(Ima?g(eType-)?))200', '\g<img>100', x.get('src')) - murl = root.xpath("//img[@class='blackborder']/parent::*") - if murl: - murl = [y.get('href') for y in murl] - 
print "murl is"+str(murl) - murl = q+murl[0] + if ovrdrv_id is not None and int(formatid) in [1, 50, 410, 900]: + print "overdrive id is not None, searching based on format type priority" + return format_results(reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid) else: - print "didn't get metadata URL" - print "curl is "+str(curl)+", id is "+str(id)+", murl is "+str(murl) - ovrdrv_data = [id, curl, murl] - print "revs final are "+str(revs) - return ovrdrv_data + creators = creators.split(', ') + print "fixed creators are: "+str(creators) + # if an exact match in a preferred format occurs + if creators[0] == author[0] and od_title == title and int(formatid) in [1, 50, 410, 900]: + print "Got Exact Match!!!" + return format_results(reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid) + else: + close_title_match = False + close_author_match = False + for token in title_tokens: + if od_title.lower().find(token.lower()) != -1: + close_title_match = True + else: + close_title_match = False + break + for token in author_tokens: + if creators[0].lower().find(token.lower()) != -1: + close_author_match = True + else: + close_author_match = False + break + if close_title_match and close_author_match and int(formatid) in [1, 50, 410, 900]: + close_matches.append(format_results(reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid)) + if close_matches: + return close_matches[0] + else: + return None -def find_ovrdrv_data(br, title, author, isbn): - print "in fnd_ovrdrv_data, title is "+str(title)+", author is "+str(author) + +def overdrive_get_record(br, q, ovrdrv_id): + search_url = q+'SearchResults.aspx?ReserveID={'+ovrdrv_id+'}' + results_url = 
q+'SearchResults.svc/GetResults?sEcho=1&iColumns=18&sColumns=ReserveID%2CTitle%2CSubtitle%2CEdition%2CSeries%2CPublisher%2CFormat%2CFormatID%2CCreators%2CThumbImage%2CShortDescription%2CWorldCatLink%2CExcerptLink%2CCreatorFile%2CSortTitle%2CAvailableToLibrary%2CAvailableToRetailer%2CRelevancyRank&iDisplayStart=0&iDisplayLength=10&sSearch=&bEscapeRegex=true&iSortingCols=1&iSortCol_0=17&sSortDir_0=asc' + + # get the base url to set the proper session cookie + br.open_novisit(q) + + # initialize the search + safe_query(br, search_url) + + # get the results + req = mechanize.Request(results_url) + req.add_header('X-Requested-With', 'XMLHttpRequest') + req.add_header('Referer', search_url) + req.add_header('Accept', 'application/json, text/javascript, */*') + raw = br.open_novisit(req) + raw = str(list(raw)) + return sort_ovrdrv_results(raw, None, None, None, ovrdrv_id) + + +def find_ovrdrv_data(br, title, author, isbn, ovrdrv_id=None): + print "in find_ovrdrv_data, title is "+str(title)+", author is "+str(author)+", overdrive id is "+str(ovrdrv_id) q = base_url - if re.match('http://search\.overdrive\.', q): + if ovrdrv_id is None: return overdrive_search(br, q, title, author) else: - return library_search(br, q, title, author) - + return overdrive_get_record(br, q, ovrdrv_id) -def to_ovrdrv_data(br, title, author, isbn): + +def to_ovrdrv_data(br, title, author, isbn, ovrdrv_id=None): print "starting to_ovrdrv_data" with cache_lock: ans = ovrdrv_data_cache.get(isbn, None) if ans: - print "inside to_ovrdrv_data, ans returned positive, ans is"+str(ans) + print "inside to_ovrdrv_data, cache lookup successful, ans is "+str(ans) return ans if ans is False: print "inside to_ovrdrv_data, ans returned False" return None try: - ovrdrv_data = find_ovrdrv_data(br, title, author, isbn) - print "ovrdrv_data = "+str(ovrdrv_data) + print "trying to retrieve data, running find_ovrdrv_data" + ovrdrv_data = find_ovrdrv_data(br, title, author, isbn, ovrdrv_id) + print "ovrdrv_data is 
"+str(ovrdrv_data) except: import traceback traceback.print_exc() @@ -210,66 +242,69 @@ def to_ovrdrv_data(br, title, author, isbn): with cache_lock: ovrdrv_data_cache[isbn] = ovrdrv_data if ovrdrv_data else False + if ovrdrv_data: + from calibre.ebooks.metadata.xisbn import xisbn + for i in xisbn.get_associated_isbns(isbn): + with cache_lock: + ovrdrv_data_cache[i] = ovrdrv_data + return ovrdrv_data -def get_social_metadata(title, authors, publisher, isbn): +def get_social_metadata(title, authors, isbn, ovrdrv_id=None): author = authors[0] mi = Metadata(title, authors) - if not isbn: - return mi - isbn = check_isbn(isbn) - if not isbn: - return mi br = browser() - ovrdrv_data = to_ovrdrv_data(br, title, authors, isbn) - if ovrdrv_data and get_metadata_detail_ovrdrv(br, ovrdrv_data, mi): + print "calling to_ovrdrv_data from inside get_social_metadata" + ovrdrv_data = to_ovrdrv_data(br, title, authors, isbn, ovrdrv_id) + + #[cover_url, social_metadata_url, worldcatlink, series, series_num, publisher, creators, reserveid, title] + + if len(ovrdrv_data[3]) > 1: + mi.series = ovrdrv_data[3] + if ovrdrv_data[4]: + mi.series_index = ovrdrv_data[4] + mi.publisher = ovrdrv_data[5] + mi.authors = ovrdrv_data[6] + if ovrdrv_id is None: + ovrdrv_id = ovrdrv_data[7] + mi.set_identifier('overdrive', ovrdrv_id) + mi.title = ovrdrv_data[8] + + if ovrdrv_data and get_metadata_detail(br, ovrdrv_data[1], mi, isbn): return mi - #from calibre.ebooks.metadata.xisbn import xisbn - #for i in xisbn.get_associated_isbns(isbn): - # print "xisbn isbn is "+str(i) - # ovrdrv_data = to_ovrdrv_data(br, title, author, i) - # if ovrdrv_data and get_metadata_detail(br, ovrdrv_data, mi): - # return mi return mi -def get_cover_url(isbn, title, author, br): +def get_cover_url(isbn, title, author, br, ovrdrv_id=None): print "starting get_cover_url" - isbn = check_isbn(isbn) - print "isbn is "+str(isbn) print "title is "+str(title) print "author is "+str(author[0]) + print "isbn is "+str(isbn) + print 
"ovrdrv_id is "+str(ovrdrv_id) with cache_lock: ans = cover_url_cache.get(isbn, None) + #ans = cover_url_cache.get(ovrdrv_id, None) if ans: - print "ans returned positive" + print "cover url cache lookup returned positive, ans is "+str(ans) return ans if ans is False: - "ans returned false" + "cover url cache lookup returned false" return None - print "in get_cover_url, running through ovrdrv_data function" - ovrdrv_data = to_ovrdrv_data(br, title, author, isbn) - print "ovrdrv_id is "+str(ovrdrv_data) + print "in get_cover_url, calling to_ovrdrv_data function" + ovrdrv_data = to_ovrdrv_data(br, title, author, isbn, ovrdrv_id) if ovrdrv_data: ans = ovrdrv_data[0] - print "inside get_cover_url, ans is "+str(ans) + print "inside get_cover_url, got url from to_ovrdrv_data, ans is "+str(ans) if ans: + print "writing cover url to url cache" with cache_lock: cover_url_cache[isbn] = ans + #cover_url_cache[ovrdrv_id] = ans return ans - #from calibre.ebooks.metadata.xisbn import xisbn - #for i in xisbn.get_associated_isbns(isbn): - # print "in get_cover_url, using xisbn list to associate other books" - # ovrdrv_data = to_ovrdrv_data(br, title, author, i) - # if ovrdrv_data: - # ans = _get_cover_url(br, ovrdrv_data) - # if ans: - # with cache_lock: - # cover_url_cache[isbn] = ans - # cover_url_cache[i] = ans - # return ans + with cache_lock: + print "marking cover url cache for this isbn false" cover_url_cache[isbn] = False return None @@ -303,18 +338,14 @@ def _get_cover_url(br, ovrdrv_data): return ('/'.join(parts[:-1]))+'/'+bn return None - -def get_metadata_detail(br, ovrdrv_data, mi): - q = ovrdrv_data[2] +def get_metadata_detail(br, metadata_url, mi, isbn=None): try: - raw = br.open_novisit(q).read() + raw = br.open_novisit(metadata_url).read() except Exception, e: if callable(getattr(e, 'getcode', None)) and \ e.getcode() == 404: return False - raise - if '<title>404 - ' in raw: - return False + raise raw = xml_to_unicode(raw, strip_encoding_pats=True, 
resolve_entities=True)[0] try: @@ -322,26 +353,28 @@ def get_metadata_detail(br, ovrdrv_data, mi): except: return False - # Check for series name and retrieve it - series_name = root.xpath("//td/script[re:test(text(), 'szSeries', 'i')]", - namespaces={"re": "http://exslt.org/regular-expressions"}) - if series_name: - series = html.tostring(series_name[0], method='html', encoding=unicode).strip() - series = re.sub('(?s).*?szSeries\s*=\s*\"(?P<series>.*?)\";.*', '\g<series>', series) - if len(series) > 1: - mi.series = series - # If series was successful attempt to get the series number - series_num = root.xpath("//div/strong[re:test(text(), ',\s(Book|Part|Volume)')]", - namespaces={"re": "http://exslt.org/regular-expressions"}) - if series_num: - series_num = float(re.sub('(?s).*?,\s*(Book|Part|Volume)\s*(?P<num>\d+).*', '\g<num>', - etree.tostring(series_num[0]))) - if series_num >= 1: - mi.series_index = series_num - print "series_num is "+str(series_num) + isbn = check_isbn(isbn) - desc = root.xpath("//td[@class='collection' and re:test(., 'Description', 'i')]/following::div[1]", - namespaces={"re": "http://exslt.org/regular-expressions"}) + pub_date = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblPubDate']/text()") + lang = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblLanguage']/text()") + subjects = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblSubjects']/text()") + ebook_isbn = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblIdentifier']/text()") + desc = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblDescription']/ancestor::div[1]") + + if pub_date: + from calibre.utils.date import parse_date + mi.pubdate = parse_date(pub_date[0].strip()) + if lang: + mi.language = lang[0].strip() + print "languages is "+str(mi.language) + if ebook_isbn and isbn is None: + print "ebook isbn is "+str(ebook_isbn[0]) + mi.set_identifier('isbn', ebook_isbn) + #elif isbn is not None: + # mi.set_identifier('isbn', isbn) + if 
subjects: + mi.tags = subjects + print "tags are "+str(mi.tags) if desc: desc = desc[0] desc = html.tostring(desc, method='html', encoding=unicode).strip() @@ -351,36 +384,6 @@ def get_metadata_detail(br, ovrdrv_data, mi): desc = re.sub(r'(?s)<!--.*?-->', '', desc) mi.comments = sanitize_comments_html(desc) - publisher = root.xpath("//td/strong[re:test(text(), 'Publisher\:', 'i')]/ancestor::td[1]/following-sibling::td/text()", - namespaces={"re": "http://exslt.org/regular-expressions"}) - if publisher: - mi.publisher = re.sub('^\s*(?P<pub>.*?)\s*$', '\g<pub>', publisher[0]) - print "publisher is "+str(mi.publisher) - - lang = root.xpath("//td/strong[re:test(text(), 'Language\(s\):', 'i')]/ancestor::td[1]/following-sibling::td/text()", - namespaces={"re": "http://exslt.org/regular-expressions"}) - if lang: - mi.language = re.sub('^\s*(?P<lang>.*?)\s*$', '\g<lang>', lang[0]) - print "languages is "+str(mi.language) - - isbn = root.xpath("//tr/td[re:test(text(), 'ISBN:', 'i')]/following::td/text()", - namespaces={"re": "http://exslt.org/regular-expressions"}) - if isbn: - mi.isbn = re.sub('^\s*(?P<isbn>.*?)\s*$', '\g<isbn>', isbn[0]) - print "ISBN is "+str(mi.isbn) - - subjects = root.xpath("//td/strong[re:test(text(), 'Subject', 'i')]/ancestor::td[1]/following-sibling::td/a/text()", - namespaces={"re": "http://exslt.org/regular-expressions"}) - if subjects: - mi.tags = subjects - print "tags are "+str(mi.tags) - - creators = root.xpath("//table/tr/td[re:test(text(), '\s*by', 'i')]/ancestor::tr[1]/td[2]/table/tr/td/a/text()", - namespaces={"re": "http://exslt.org/regular-expressions"}) - if creators: - print "authors are "+str(creators) - mi.authors = creators - return True def main(args=sys.argv): @@ -388,19 +391,26 @@ def main(args=sys.argv): import tempfile, os, time tdir = tempfile.gettempdir() br = browser() - for isbn, title, author in [ - #('0899661343', 'On the Road', ['Jack Kerouac']), # basic test, no series, single author - #('9780061952838', 'The 
Fellowship of the Ring', ['J. R. R. Tolkien']), # Series test, multi-author - ('9780061952838', 'The Two Towers', ['J. R. R. Tolkien']), # Series test, book 2 - #('9780345505057', 'Deluge', ['Anne McCaffrey']) # Multiple authors - #('', 'Deluge', ['Anne McCaffrey']) # Empty ISBN - #(None, 'On the Road', ['Jack Kerouac']) # Nonetype ISBN + for ovrdrv_id, isbn, title, author in [ + #(None, '0899661343', 'On the Road', ['Jack Kerouac']), # basic test, no series, single author + #(None, '9780061952838', 'The Fellowship of the Ring', ['J. R. R. Tolkien']), # Series test, multi-author + #(None, '9780061952838', 'The Two Towers', ['J. R. R. Tolkien']), # Series test, book 2 + #('57844706-20fa-4ace-b5ee-3470b1b52173', None, 'The Two Towers', ['J. R. R. Tolkien']), # Series test, w/ ovrdrv id + #(None, '9780345505057', 'Deluge', ['Anne McCaffrey']) # Multiple authors + #(None, None, 'Deluge', ['Anne McCaffrey']) # Empty ISBN + #(None, None, 'On the Road', ['Jack Kerouac']), # Nonetype ISBN + #(None, '9780345435279', 'A Caress of Twilight', ['Laurell K. 
Hamilton']), + #(None, '9780606087230', 'The Omnivore\'s Dilemma : A Natural History of Four Meals', ['Michael Pollan']), # Subtitle colon + #(None, '9780061747649', 'Mental_Floss Presents: Condensed Knowledge', ['Will Pearson', 'Mangesh Hattikudur']), + #(None, '9781400050802', 'The Zombie Survival Guide', ['Max Brooks']), # Two books with this title by this author + #(None, '9781775414315', 'The Worst Journey in the World / Antarctic 1910-1913', ['Apsley Cherry-Garrard']), # Garbage sub-title + (None, '9780440335160', 'Outlander', ['Diana Gabaldon']), # Returns lots of results to sort through to get the best match ]: cpath = os.path.join(tdir, title+'.jpg') print "cpath is "+cpath st = time.time() - curl = get_cover_url(isbn, title, author, br) - print '\n\n Took ', time.time() - st, ' to get metadata\n\n' + curl = get_cover_url(isbn, title, author, br, ovrdrv_id) + print '\n\n Took ', time.time() - st, ' to get basic metadata\n\n' if curl is None: print 'No cover found for', title else: @@ -408,9 +418,7 @@ def main(args=sys.argv): #open(cpath, 'wb').write(br.open_novisit(curl).read()) #print 'Cover for', title, 'saved to', cpath - #import time - - #print get_social_metadata(title, author, None, isbn) + print get_social_metadata(title, author, isbn, ovrdrv_id) #print '\n\n', time.time() - st, '\n\n' return 0 diff --git a/src/calibre/ebooks/metadata/sources/base.py b/src/calibre/ebooks/metadata/sources/base.py index 55cc996cf7..b600eafaf2 100644 --- a/src/calibre/ebooks/metadata/sources/base.py +++ b/src/calibre/ebooks/metadata/sources/base.py @@ -87,32 +87,40 @@ class Source(Plugin): if authors: # Leave ' in there for Irish names - pat = re.compile(r'[-,:;+!@#$%^&*(){}.`~"\s\[\]/]') + remove_pat = re.compile(r'[,:;!@#$%^&*(){}.`~"\s\[\]/]') + replace_pat = re.compile(r'-+') if only_first_author: authors = authors[:1] for au in authors: + au = replace_pat.sub(' ', au) parts = au.split() if ',' in au: # au probably in ln, fn form parts = parts[1:] + parts[:1] for 
tok in parts: - tok = pat.sub('', tok).strip() + tok = remove_pat.sub('', tok).strip() if len(tok) > 2 and tok.lower() not in ('von', ): yield tok - def get_title_tokens(self, title): + def get_title_tokens(self, title, strip_joiners=True): ''' Take a title and return a list of tokens useful for an AND search query. Excludes connectives and punctuation. ''' if title: - pat = re.compile(r'''[-,:;+!@#$%^&*(){}.`~"'\s\[\]/]''') + # strip sub-titles + subtitle = re.compile(r'([\(\[\{].*?[\)\]\}]|[/:\\].*$)') + if len(subtitle.sub('', title)) > 1: + title = subtitle.sub('', title) + pat = re.compile(r'''([-,:;+!@#$%^&*(){}.`~"\s\[\]/]|'(?!s))''') title = pat.sub(' ', title) tokens = title.split() for token in tokens: token = token.strip() - if token and token.lower() not in ('a', 'and', 'the'): + if token and token.lower() not in ('a', 'and', 'the') and strip_joiners: + yield token + elif token: yield token def split_jobs(self, jobs, num): From 6f9fff63e03f2392c6c0e646530b5a16e804ffb2 Mon Sep 17 00:00:00 2001 From: Lee <ldolse@yahoo.com> Date: Wed, 23 Mar 2011 22:38:29 +0800 Subject: [PATCH 04/30] ... 
--- src/calibre/ebooks/metadata/overdrive.py | 19 +++++++++++++------ src/calibre/ebooks/metadata/sources/base.py | 8 ++++---- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/src/calibre/ebooks/metadata/overdrive.py b/src/calibre/ebooks/metadata/overdrive.py index e72d168146..61ff2ee7ae 100644 --- a/src/calibre/ebooks/metadata/overdrive.py +++ b/src/calibre/ebooks/metadata/overdrive.py @@ -120,7 +120,7 @@ def overdrive_search(br, q, title, author): #author_q = re.sub('\s', '+', author_q) print "final author query is "+str(author_q) print "final title query is "+str(title_q) - q_xref = q+'SearchResults.svc/GetResults?iDisplayLength=20&sSearch='+title_q + q_xref = q+'SearchResults.svc/GetResults?iDisplayLength=50&sSearch='+title_q query = '{"szKeyword":"'+author_q+'"}' # main query, requires specific Content Type header @@ -152,11 +152,11 @@ def sort_ovrdrv_results(raw, title=None, title_tokens=None, author=None, author_ #print results # The search results are either from a keyword search or a multi-format list from a single ID, # sort through the results for closest match/format - for result in results: - print "\n\n\nthis result is "+str(result) + if results: for reserveid, od_title, subtitle, edition, series, publisher, format, formatid, creators, \ thumbimage, shortdescription, worldcatlink, excerptlink, creatorfile, sorttitle, \ availabletolibrary, availabletoretailer, relevancyrank, unknown1, unknown2, unknown3 in results: + print "this record's title is "+od_title+", subtitle is "+subtitle+", author[s] are "+creators+", series is "+series if ovrdrv_id is not None and int(formatid) in [1, 50, 410, 900]: print "overdrive id is not None, searching based on format type priority" return format_results(reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid) @@ -183,11 +183,16 @@ def sort_ovrdrv_results(raw, title=None, title_tokens=None, author=None, author_ close_author_match = False break if close_title_match 
and close_author_match and int(formatid) in [1, 50, 410, 900]: - close_matches.append(format_results(reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid)) + if subtitle and series: + close_matches.insert(0, format_results(reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid)) + else: + close_matches.append(format_results(reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid)) if close_matches: return close_matches[0] else: - return None + return '' + else: + return '' @@ -394,7 +399,8 @@ def main(args=sys.argv): for ovrdrv_id, isbn, title, author in [ #(None, '0899661343', 'On the Road', ['Jack Kerouac']), # basic test, no series, single author #(None, '9780061952838', 'The Fellowship of the Ring', ['J. R. R. Tolkien']), # Series test, multi-author - #(None, '9780061952838', 'The Two Towers', ['J. R. R. Tolkien']), # Series test, book 2 + #(None, '9780061952838', 'The Two Towers (The Lord of the Rings, Book II)', ['J. R. R. Tolkien']), # Series test, book 2 + #(None, '9780618153985', 'The Fellowship of the Ring (The Lord of the Rings, Part 1)', ['J.R.R. Tolkien']), #('57844706-20fa-4ace-b5ee-3470b1b52173', None, 'The Two Towers', ['J. R. R. Tolkien']), # Series test, w/ ovrdrv id #(None, '9780345505057', 'Deluge', ['Anne McCaffrey']) # Multiple authors #(None, None, 'Deluge', ['Anne McCaffrey']) # Empty ISBN @@ -405,6 +411,7 @@ def main(args=sys.argv): #(None, '9781400050802', 'The Zombie Survival Guide', ['Max Brooks']), # Two books with this title by this author #(None, '9781775414315', 'The Worst Journey in the World / Antarctic 1910-1913', ['Apsley Cherry-Garrard']), # Garbage sub-title (None, '9780440335160', 'Outlander', ['Diana Gabaldon']), # Returns lots of results to sort through to get the best match + (None, '9780345509741', 'The Horror Stories of Robert E. Howard', ['Robert E. 
Howard']), # Complex title with initials/dots stripped, some results don't have a cover ]: cpath = os.path.join(tdir, title+'.jpg') print "cpath is "+cpath diff --git a/src/calibre/ebooks/metadata/sources/base.py b/src/calibre/ebooks/metadata/sources/base.py index 464d08032b..6fc52eb88b 100644 --- a/src/calibre/ebooks/metadata/sources/base.py +++ b/src/calibre/ebooks/metadata/sources/base.py @@ -102,8 +102,8 @@ class Source(Plugin): if authors: # Leave ' in there for Irish names - remove_pat = re.compile(r'[,:;!@#$%^&*(){}.`~"\s\[\]/]') - replace_pat = re.compile(r'-+') + remove_pat = re.compile(r'[,!@#$%^&*(){}`~"\s\[\]/]') + replace_pat = re.compile(r'[-+.:;]') if only_first_author: authors = authors[:1] for au in authors: @@ -128,12 +128,12 @@ class Source(Plugin): subtitle = re.compile(r'([\(\[\{].*?[\)\]\}]|[/:\\].*$)') if len(subtitle.sub('', title)) > 1: title = subtitle.sub('', title) - pat = re.compile(r'''([-,:;+!@#$%^&*(){}.`~"\s\[\]/]|'(?!s))''') + pat = re.compile(r'''([-,:;+!@#$%^*(){}.`~"\s\[\]/]|'(?!s))''') title = pat.sub(' ', title) tokens = title.split() for token in tokens: token = token.strip() - if token and token.lower() not in ('a', 'and', 'the') and strip_joiners: + if token and token.lower() not in ('a', 'and', 'the', '&') and strip_joiners: yield token elif token: yield token From 433270f20ead59bc013855d5b1403e43e1f50a02 Mon Sep 17 00:00:00 2001 From: Lee <ldolse@yahoo.com> Date: Mon, 28 Mar 2011 17:24:45 +0800 Subject: [PATCH 05/30] add another type of scene break to the scene break formatting logic --- src/calibre/ebooks/conversion/utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/calibre/ebooks/conversion/utils.py b/src/calibre/ebooks/conversion/utils.py index f1f2f87293..1546644f95 100644 --- a/src/calibre/ebooks/conversion/utils.py +++ b/src/calibre/ebooks/conversion/utils.py @@ -764,6 +764,7 @@ class HeuristicProcessor(object): # Multiple sequential blank paragraphs are merged with appropriate margins # If non-blank 
scene breaks exist they are center aligned and styled with appropriate margins. if getattr(self.extra_opts, 'format_scene_breaks', False): + html = re.sub('(?i)<div[^>]*>\s*<br(\s?/)?>\s*</div>', '<p></p>', html) html = self.detect_whitespace(html) html = self.detect_soft_breaks(html) blanks_count = len(self.any_multi_blank.findall(html)) From 07733b2fc800fb135bbebbeae33153434b82daf3 Mon Sep 17 00:00:00 2001 From: Lee <ldolse@yahoo.com> Date: Fri, 1 Apr 2011 12:59:07 +0800 Subject: [PATCH 06/30] overdrive tweaks --- src/calibre/ebooks/metadata/overdrive.py | 58 +++++++++++++++--------- 1 file changed, 37 insertions(+), 21 deletions(-) diff --git a/src/calibre/ebooks/metadata/overdrive.py b/src/calibre/ebooks/metadata/overdrive.py index 61ff2ee7ae..289d6bea0e 100644 --- a/src/calibre/ebooks/metadata/overdrive.py +++ b/src/calibre/ebooks/metadata/overdrive.py @@ -107,21 +107,27 @@ def overdrive_search(br, q, title, author): print "printing list with author "+str(author)+":" print list(s.get_author_tokens(author)) author_tokens = list(s.get_author_tokens(author)) + print "there are "+str(len(author_tokens))+" author tokens" for token in author_tokens: print "cleaned up author token is: "+str(token) - author_q = ' '.join(author_tokens) + title_tokens = list(s.get_title_tokens(title)) + print "there are "+str(len(title_tokens))+" title tokens" for token in title_tokens: print "cleaned up title token is: "+str(token) - title_q = '+'.join(title_tokens) - #author_q = separator.join(for x in author) - # query terms - #author_q = re.sub('\s', '+', author_q) - print "final author query is "+str(author_q) - print "final title query is "+str(title_q) - q_xref = q+'SearchResults.svc/GetResults?iDisplayLength=50&sSearch='+title_q - query = '{"szKeyword":"'+author_q+'"}' + + if len(title_tokens) >= len(author_tokens): + initial_q = ' '.join(title_tokens) + xref_q = '+'.join(author_tokens) + else: + initial_q = ' '.join(author_tokens) + xref_q = '+'.join(title_tokens) + + print 
"initial query is "+str(initial_q) + print "cross reference query is "+str(xref_q) + q_xref = q+'SearchResults.svc/GetResults?iDisplayLength=50&sSearch='+xref_q + query = '{"szKeyword":"'+initial_q+'"}' # main query, requires specific Content Type header req = mechanize.Request(q_query) @@ -133,12 +139,21 @@ def overdrive_search(br, q, title, author): safe_query(br, q_init_search) # get the search results object - xreq = mechanize.Request(q_xref) - xreq.add_header('X-Requested-With', 'XMLHttpRequest') - xreq.add_header('Referer', q_init_search) - xreq.add_header('Accept', 'application/json, text/javascript, */*') - raw = br.open_novisit(xreq).read() - print "overdrive search result is:\n"+raw + results = False + while results == False: + xreq = mechanize.Request(q_xref) + xreq.add_header('X-Requested-With', 'XMLHttpRequest') + xreq.add_header('Referer', q_init_search) + xreq.add_header('Accept', 'application/json, text/javascript, */*') + raw = br.open_novisit(xreq).read() + print "overdrive search result is:\n"+raw + for m in re.finditer(ur'"iTotalDisplayRecords":(?P<displayrecords>\d+).*?"iTotalRecords":(?P<totalrecords>\d+)', raw): + if int(m.group('displayrecords')) >= 1: + results = True + elif int(m.group('totalrecords')) >= 1: + xref_q = '' + q_xref = q+'SearchResults.svc/GetResults?iDisplayLength=50&sSearch='+xref_q + print "\n\nsorting results" return sort_ovrdrv_results(raw, title, title_tokens, author, author_tokens) @@ -162,7 +177,7 @@ def sort_ovrdrv_results(raw, title=None, title_tokens=None, author=None, author_ return format_results(reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid) else: creators = creators.split(', ') - print "fixed creators are: "+str(creators) + print "split creators from results are: "+str(creators) # if an exact match in a preferred format occurs if creators[0] == author[0] and od_title == title and int(formatid) in [1, 50, 410, 900]: print "Got Exact Match!!!" 
@@ -275,9 +290,10 @@ def get_social_metadata(title, authors, isbn, ovrdrv_id=None): ovrdrv_id = ovrdrv_data[7] mi.set_identifier('overdrive', ovrdrv_id) mi.title = ovrdrv_data[8] - + print "populated basic social metadata, getting detailed metadata" if ovrdrv_data and get_metadata_detail(br, ovrdrv_data[1], mi, isbn): return mi + print "failed to get detailed metadata, returning basic info" return mi def get_cover_url(isbn, title, author, br, ovrdrv_id=None): @@ -378,7 +394,7 @@ def get_metadata_detail(br, metadata_url, mi, isbn=None): #elif isbn is not None: # mi.set_identifier('isbn', isbn) if subjects: - mi.tags = subjects + mi.tags = [tag.strip() for tag in subjects[0].split(',')] print "tags are "+str(mi.tags) if desc: desc = desc[0] @@ -410,7 +426,7 @@ def main(args=sys.argv): #(None, '9780061747649', 'Mental_Floss Presents: Condensed Knowledge', ['Will Pearson', 'Mangesh Hattikudur']), #(None, '9781400050802', 'The Zombie Survival Guide', ['Max Brooks']), # Two books with this title by this author #(None, '9781775414315', 'The Worst Journey in the World / Antarctic 1910-1913', ['Apsley Cherry-Garrard']), # Garbage sub-title - (None, '9780440335160', 'Outlander', ['Diana Gabaldon']), # Returns lots of results to sort through to get the best match + #(None, '9780440335160', 'Outlander', ['Diana Gabaldon']), # Returns lots of results to sort through to get the best match (None, '9780345509741', 'The Horror Stories of Robert E. Howard', ['Robert E. 
Howard']), # Complex title with initials/dots stripped, some results don't have a cover ]: cpath = os.path.join(tdir, title+'.jpg') @@ -424,9 +440,9 @@ def main(args=sys.argv): print "curl is "+curl #open(cpath, 'wb').write(br.open_novisit(curl).read()) #print 'Cover for', title, 'saved to', cpath - + st = time.time() print get_social_metadata(title, author, isbn, ovrdrv_id) - #print '\n\n', time.time() - st, '\n\n' + print '\n\n Took ', time.time() - st, ' to get detailed metadata\n\n' return 0 From c4b5c8c91665d108cceadcd648c36d1e2888c4ef Mon Sep 17 00:00:00 2001 From: Lee <ldolse@yahoo.com> Date: Thu, 7 Apr 2011 13:31:41 +0800 Subject: [PATCH 07/30] ... --- src/calibre/ebooks/metadata/overdrive.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/calibre/ebooks/metadata/overdrive.py b/src/calibre/ebooks/metadata/overdrive.py index 289d6bea0e..38d6d730ff 100644 --- a/src/calibre/ebooks/metadata/overdrive.py +++ b/src/calibre/ebooks/metadata/overdrive.py @@ -153,6 +153,8 @@ def overdrive_search(br, q, title, author): elif int(m.group('totalrecords')) >= 1: xref_q = '' q_xref = q+'SearchResults.svc/GetResults?iDisplayLength=50&sSearch='+xref_q + elif int(m.group('totalrecords')) == 0: + return '' print "\n\nsorting results" return sort_ovrdrv_results(raw, title, title_tokens, author, author_tokens) @@ -185,16 +187,23 @@ def sort_ovrdrv_results(raw, title=None, title_tokens=None, author=None, author_ else: close_title_match = False close_author_match = False + print "format id is "+str(formatid) for token in title_tokens: + print "attempting to find "+str(token)+" title token" if od_title.lower().find(token.lower()) != -1: + print "matched token" close_title_match = True else: + print "token didn't match" close_title_match = False break for token in author_tokens: + print "attempting to find "+str(token)+" author token" if creators[0].lower().find(token.lower()) != -1: + print "matched token" close_author_match = True else: + print "token didn't match" 
close_author_match = False break if close_title_match and close_author_match and int(formatid) in [1, 50, 410, 900]: From 361e86c6ff04cf0d6a3cb07226309e99df373128 Mon Sep 17 00:00:00 2001 From: Lee <ldolse@yahoo.com> Date: Thu, 7 Apr 2011 23:04:32 +0800 Subject: [PATCH 08/30] ... --- src/calibre/customize/builtins.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index 475cb36687..5e50f81173 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -10,6 +10,7 @@ from calibre.constants import numeric_version from calibre.ebooks.metadata.archive import ArchiveExtract, get_cbz_metadata from calibre.ebooks.metadata.opf2 import metadata_to_opf from calibre.ebooks.oeb.base import OEB_IMAGES +from calibre.utils.config import test_eight_code # To archive plugins {{{ class HTML2ZIP(FileTypePlugin): @@ -166,6 +167,14 @@ class ComicMetadataReader(MetadataReaderPlugin): description = _('Extract cover from comic files') def get_metadata(self, stream, ftype): + if hasattr(stream, 'seek') and hasattr(stream, 'tell'): + pos = stream.tell() + id_ = stream.read(3) + stream.seek(pos) + if id_ == b'Rar': + ftype = 'cbr' + elif id.startswith(b'PK'): + ftype = 'cbz' if ftype == 'cbr': from calibre.libunrar import extract_first_alphabetically as extract_first extract_first From ddb3d935d4c311382615dd646eae1f97e512c973 Mon Sep 17 00:00:00 2001 From: Lee <ldolse@yahoo.com> Date: Fri, 8 Apr 2011 08:37:38 +0800 Subject: [PATCH 09/30] ... 
--- src/calibre/customize/builtins.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index 5e50f81173..8dbc72f8ac 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -637,7 +637,7 @@ else: from calibre.ebooks.metadata.covers import OpenLibraryCovers, \ AmazonCovers, DoubanCovers, OverdriveCovers - plugins += [GoogleBooks, ISBNDB, Amazon, + plugins += [GoogleBooks, ISBNDB, Amazon, Overdrive, OpenLibraryCovers, AmazonCovers, DoubanCovers, OverdriveCovers, NiceBooksCovers, KentDistrictLibrary, DoubanBooks, NiceBooks] From 330d12c5eb8f41295990945d7a74ff1524825ba1 Mon Sep 17 00:00:00 2001 From: Lee <ldolse@yahoo.com> Date: Wed, 13 Apr 2011 23:24:34 +0800 Subject: [PATCH 10/30] ... --- src/calibre/ebooks/mobi/mobiml.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/mobi/mobiml.py b/src/calibre/ebooks/mobi/mobiml.py index 40ad5e9e78..3feef7b6f5 100644 --- a/src/calibre/ebooks/mobi/mobiml.py +++ b/src/calibre/ebooks/mobi/mobiml.py @@ -464,9 +464,10 @@ class MobiMLizer(object): valign = style['vertical-align'] not_baseline = valign in ('super', 'sub', 'text-top', 'text-bottom') or ( - isinstance(valign, (float, int)) and abs(valign) != 0) + isinstance(valign, (float, int)) and abs(valign) != 0) or ( + tag in ('sup', 'sub')) issup = valign in ('super', 'text-top') or ( - isinstance(valign, (float, int)) and valign > 0) + isinstance(valign, (float, int)) and valign > 0) or tag == 'sup' vtag = 'sup' if issup else 'sub' if not_baseline and not ignore_valign and tag not in NOT_VTAGS and not isblock: nroot = etree.Element(XHTML('html'), nsmap=MOBI_NSMAP) From ecf21962d5fac590f9a8103fda8608e6cade3843 Mon Sep 17 00:00:00 2001 From: Lee <ldolse@yahoo.com> Date: Thu, 14 Apr 2011 18:34:49 +0800 Subject: [PATCH 11/30] ... 
--- src/calibre/ebooks/mobi/mobiml.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/calibre/ebooks/mobi/mobiml.py b/src/calibre/ebooks/mobi/mobiml.py index 8c7b740cdb..1e626cf916 100644 --- a/src/calibre/ebooks/mobi/mobiml.py +++ b/src/calibre/ebooks/mobi/mobiml.py @@ -464,10 +464,9 @@ class MobiMLizer(object): valign = style['vertical-align'] not_baseline = valign in ('super', 'sub', 'text-top', 'text-bottom') or ( - isinstance(valign, (float, int)) and abs(valign) != 0) or ( - tag in ('sup', 'sub')) + isinstance(valign, (float, int)) and abs(valign) != 0) issup = valign in ('super', 'text-top') or ( - isinstance(valign, (float, int)) and valign > 0) or tag == 'sup' + isinstance(valign, (float, int)) and valign > 0) vtag = 'sup' if issup else 'sub' if not_baseline and not ignore_valign and tag not in NOT_VTAGS and not isblock: nroot = etree.Element(XHTML('html'), nsmap=MOBI_NSMAP) From 04b543e854a409b73a7da8555815a10b5669e3d7 Mon Sep 17 00:00:00 2001 From: Lee <ldolse@yahoo.com> Date: Sun, 17 Apr 2011 22:42:57 +0800 Subject: [PATCH 12/30] start porting overdrive to 8 --- src/calibre/customize/builtins.py | 4 +++- src/calibre/ebooks/conversion/preprocess.py | 2 +- src/calibre/ebooks/metadata/sources/base.py | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index 8956780e2c..6131c03f9c 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -626,8 +626,9 @@ if test_eight_code: from calibre.ebooks.metadata.sources.amazon import Amazon from calibre.ebooks.metadata.sources.openlibrary import OpenLibrary from calibre.ebooks.metadata.sources.isbndb import ISBNDB + from calibre.ebooks.metadata.sources.overdrive import OverDrive - plugins += [GoogleBooks, Amazon, OpenLibrary, ISBNDB] + plugins += [GoogleBooks, Amazon, OpenLibrary, ISBNDB, OverDrive] # }}} else: @@ -1097,6 +1098,7 @@ if test_eight_code: from 
calibre.ebooks.metadata.sources.google import GoogleBooks from calibre.ebooks.metadata.sources.amazon import Amazon from calibre.ebooks.metadata.sources.openlibrary import OpenLibrary +from calibre.ebooks.metadata.sources.overdrive import OverDrive plugins += [GoogleBooks, Amazon, OpenLibrary] diff --git a/src/calibre/ebooks/conversion/preprocess.py b/src/calibre/ebooks/conversion/preprocess.py index a1d5fa94d8..8822a39b87 100644 --- a/src/calibre/ebooks/conversion/preprocess.py +++ b/src/calibre/ebooks/conversion/preprocess.py @@ -399,7 +399,7 @@ class HTMLPreProcessor(object): (re.compile(u'˙\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: u'Ż'), # If pdf printed from a browser then the header/footer has a reliable pattern - (re.compile(r'((?<=</a>)\s*file:////?[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''), + (re.compile(r'((?<=</a>)\s*file:/{2,4}[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''), # Center separator lines (re.compile(u'<br>\s*(?P<break>([*#•✦=]+\s*)+)\s*<br>'), lambda match: '<p>\n<p style="text-align:center">' + match.group(1) + '</p>'), diff --git a/src/calibre/ebooks/metadata/sources/base.py b/src/calibre/ebooks/metadata/sources/base.py index 56f82641ab..bfc3e498eb 100644 --- a/src/calibre/ebooks/metadata/sources/base.py +++ b/src/calibre/ebooks/metadata/sources/base.py @@ -293,7 +293,7 @@ class Source(Plugin): def get_title_tokens(self, title, strip_joiners=True): ''' Take a title and return a list of tokens useful for an AND search query. - Excludes connectives and punctuation. + Excludes connectives(optionally) and punctuation. 
''' if title: # strip sub-titles From 4ea961ba6298a905bc50136e8054117d77a18575 Mon Sep 17 00:00:00 2001 From: Charles Haley <> Date: Mon, 18 Apr 2011 08:34:22 +0100 Subject: [PATCH 13/30] From Greg --- src/calibre/customize/builtins.py | 5 +- src/calibre/devices/apple/driver.py | 51 +++++++------ .../devices/content_server/__init__.py | 10 +++ src/calibre/devices/content_server/driver.py | 74 +++++++++++++++++++ src/calibre/gui2/actions/catalog.py | 2 +- src/calibre/gui2/device.py | 2 +- src/calibre/gui2/dialogs/tweak_epub.py | 11 ++- src/calibre/library/server/content.py | 27 ++++++- 8 files changed, 154 insertions(+), 28 deletions(-) create mode 100644 src/calibre/devices/content_server/__init__.py create mode 100644 src/calibre/devices/content_server/driver.py diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index d3b0b8409d..458bfec3fd 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -582,6 +582,7 @@ from calibre.ebooks.snb.output import SNBOutput from calibre.customize.profiles import input_profiles, output_profiles from calibre.devices.apple.driver import ITUNES +from calibre.devices.content_server.driver import CONTENT_SERVER_FOR_CONFIG from calibre.devices.hanlin.driver import HANLINV3, HANLINV5, BOOX, SPECTRA from calibre.devices.blackberry.driver import BLACKBERRY from calibre.devices.cybook.driver import CYBOOK, ORIZON @@ -753,7 +754,9 @@ plugins += [ EEEREADER, NEXTBOOK, ITUNES, -] + CONTENT_SERVER_FOR_CONFIG + ] + plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ x.__name__.endswith('MetadataReader')] plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ diff --git a/src/calibre/devices/apple/driver.py b/src/calibre/devices/apple/driver.py index 2cc478603a..d7811f0a22 100644 --- a/src/calibre/devices/apple/driver.py +++ b/src/calibre/devices/apple/driver.py @@ -201,8 +201,9 @@ class ITUNES(DriverBase): # 0x1294 iPhone 3GS # 0x1297 
iPhone 4 # 0x129a iPad + # 0x12a2 iPad2 VENDOR_ID = [0x05ac] - PRODUCT_ID = [0x1292,0x1293,0x1294,0x1297,0x1299,0x129a] + PRODUCT_ID = [0x1292,0x1293,0x1294,0x1297,0x1299,0x129a,0x12a2] BCD = [0x01] # Plugboard ID @@ -421,7 +422,7 @@ class ITUNES(DriverBase): cached_books[this_book.path] = { 'title':book.name(), - 'author':[book.artist()], + 'author':book.artist().split(' & '), 'lib_book':library_books[this_book.path] if this_book.path in library_books else None, 'dev_book':book, 'uuid': book.composer() @@ -459,7 +460,7 @@ class ITUNES(DriverBase): cached_books[this_book.path] = { 'title':book.Name, - 'author':book.Artist, + 'author':book.Artist.split(' & '), 'lib_book':library_books[this_book.path] if this_book.path in library_books else None, 'uuid': book.Composer, 'format': 'pdf' if book.KindAsString.startswith('PDF') else 'epub' @@ -1021,7 +1022,9 @@ class ITUNES(DriverBase): if isosx: for (i,file) in enumerate(files): format = file.rpartition('.')[2].lower() - path = self.path_template % (metadata[i].title, metadata[i].author[0],format) + path = self.path_template % (metadata[i].title, + authors_to_string(metadata[i].authors), + format) self._remove_existing_copy(path, metadata[i]) fpath = self._get_fpath(file, metadata[i], format, update_md=True) db_added, lb_added = self._add_new_copy(fpath, metadata[i]) @@ -1034,9 +1037,11 @@ class ITUNES(DriverBase): if DEBUG: self.log.info("ITUNES.upload_books()") self.log.info(" adding '%s' by '%s' uuid:%s to self.cached_books" % - ( metadata[i].title, metadata[i].author, metadata[i].uuid)) + (metadata[i].title, + authors_to_string(metadata[i].authors), + metadata[i].uuid)) self.cached_books[this_book.path] = { - 'author': metadata[i].author, + 'author': authors_to_string(metadata[i].authors), 'dev_book': db_added, 'format': format, 'lib_book': lb_added, @@ -1055,7 +1060,9 @@ class ITUNES(DriverBase): for (i,file) in enumerate(files): format = file.rpartition('.')[2].lower() - path = self.path_template %
(metadata[i].title, metadata[i].author[0],format) + path = self.path_template % (metadata[i].title, + authors_to_string(metadata[i].authors), + format) self._remove_existing_copy(path, metadata[i]) fpath = self._get_fpath(file, metadata[i],format, update_md=True) db_added, lb_added = self._add_new_copy(fpath, metadata[i]) @@ -1075,9 +1082,11 @@ class ITUNES(DriverBase): if DEBUG: self.log.info("ITUNES.upload_books()") self.log.info(" adding '%s' by '%s' uuid:%s to self.cached_books" % - ( metadata[i].title, metadata[i].author, metadata[i].uuid)) + (metadata[i].title, + authors_to_string(metadata[i].authors), + metadata[i].uuid)) self.cached_books[this_book.path] = { - 'author': metadata[i].author[0], + 'author': authors_to_string(metadata[i].authors), 'dev_book': db_added, 'format': format, 'lib_book': lb_added, @@ -1190,7 +1199,7 @@ class ITUNES(DriverBase): base_fn = base_fn.rpartition('.')[0] db_added = self._find_device_book( { 'title': base_fn if format == 'pdf' else metadata.title, - 'author': metadata.authors[0], + 'author': authors_to_string(metadata.authors), 'uuid': metadata.uuid, 'format': format}) return db_added @@ -1255,7 +1264,7 @@ class ITUNES(DriverBase): base_fn = base_fn.rpartition('.')[0] added = self._find_library_book( { 'title': base_fn if format == 'pdf' else metadata.title, - 'author': metadata.author[0], + 'author': authors_to_string(metadata.authors), 'uuid': metadata.uuid, 'format': format}) return added @@ -1314,7 +1323,7 @@ class ITUNES(DriverBase): with open(metadata.cover,'r+b') as cd: cover_data = cd.read() except: - self.problem_titles.append("'%s' by %s" % (metadata.title, metadata.author[0])) + self.problem_titles.append("'%s' by %s" % (metadata.title, authors_to_string(metadata.authors))) self.log.error(" error scaling '%s' for '%s'" % (metadata.cover,metadata.title)) import traceback @@ -1389,7 +1398,7 @@ class ITUNES(DriverBase): thumb_path = path.rpartition('.')[0] + '.jpg' zfw.writestr(thumb_path, thumb) except: - 
self.problem_titles.append("'%s' by %s" % (metadata.title, metadata.author[0])) + self.problem_titles.append("'%s' by %s" % (metadata.title, authors_to_string(metadata.authors))) self.log.error(" error converting '%s' to thumb for '%s'" % (metadata.cover,metadata.title)) finally: try: @@ -1407,7 +1416,7 @@ class ITUNES(DriverBase): if DEBUG: self.log.info(" ITUNES._create_new_book()") - this_book = Book(metadata.title, authors_to_string(metadata.author)) + this_book = Book(metadata.title, authors_to_string(metadata.authors)) this_book.datetime = time.gmtime() this_book.db_id = None this_book.device_collections = [] @@ -2451,7 +2460,7 @@ class ITUNES(DriverBase): for book in self.cached_books: if self.cached_books[book]['uuid'] == metadata.uuid or \ (self.cached_books[book]['title'] == metadata.title and \ - self.cached_books[book]['author'] == metadata.authors[0]): + self.cached_books[book]['author'] == authors_to_string(metadata.authors)): self.update_list.append(self.cached_books[book]) self._remove_from_device(self.cached_books[book]) if DEBUG: @@ -2470,7 +2479,7 @@ class ITUNES(DriverBase): for book in self.cached_books: if self.cached_books[book]['uuid'] == metadata.uuid or \ (self.cached_books[book]['title'] == metadata.title and \ - self.cached_books[book]['author'] == metadata.authors[0]): + self.cached_books[book]['author'] == authors_to_string(metadata.authors)): self.update_list.append(self.cached_books[book]) self._remove_from_iTunes(self.cached_books[book]) if DEBUG: @@ -2939,13 +2948,13 @@ class ITUNES(DriverBase): def _xform_metadata_via_plugboard(self, book, format): ''' Transform book metadata from plugboard templates ''' if DEBUG: - self.log.info(" ITUNES._xform_metadata_via_plugboard()") + self.log.info(" ITUNES._xform_metadata_via_plugboard()") if self.plugboard_func: pb = self.plugboard_func(self.DEVICE_PLUGBOARD_NAME, format, self.plugboards) newmi = book.deepcopy_metadata() newmi.template_to_attribute(book, pb) - if DEBUG: + if pb is not None 
and DEBUG: self.log.info(" transforming %s using %s:" % (format, pb)) self.log.info(" title: %s %s" % (book.title, ">>> %s" % newmi.title if book.title != newmi.title else '')) @@ -3062,7 +3071,7 @@ class ITUNES_ASYNC(ITUNES): cached_books[this_book.path] = { 'title':library_books[book].name(), - 'author':[library_books[book].artist()], + 'author':library_books[book].artist().split(' & '), 'lib_book':library_books[book], 'dev_book':None, 'uuid': library_books[book].composer(), @@ -3102,7 +3111,7 @@ class ITUNES_ASYNC(ITUNES): cached_books[this_book.path] = { 'title':library_books[book].Name, - 'author':library_books[book].Artist, + 'author':library_books[book].Artist.split(' & '), 'lib_book':library_books[book], 'uuid': library_books[book].Composer, 'format': format @@ -3288,7 +3297,7 @@ class Book(Metadata): See ebooks.metadata.book.base ''' def __init__(self,title,author): - Metadata.__init__(self, title, authors=[author]) + Metadata.__init__(self, title, authors=author.split(' & ')) @property def title_sorter(self): diff --git a/src/calibre/devices/content_server/__init__.py b/src/calibre/devices/content_server/__init__.py new file mode 100644 index 0000000000..3d1a86922e --- /dev/null +++ b/src/calibre/devices/content_server/__init__.py @@ -0,0 +1,10 @@ +#!/usr/bin/env python +# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai +from __future__ import with_statement + +__license__ = 'GPL v3' +__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>' +__docformat__ = 'restructuredtext en' + + + diff --git a/src/calibre/devices/content_server/driver.py b/src/calibre/devices/content_server/driver.py new file mode 100644 index 0000000000..84b14f8e62 --- /dev/null +++ b/src/calibre/devices/content_server/driver.py @@ -0,0 +1,74 @@ +''' +Created on 17 Apr 2011 + +@author: GRiker, modeled on charles's Folder Device + +''' + +from calibre.constants import DEBUG +from calibre.devices.interface import DevicePlugin +from calibre.devices.usbms.deviceconfig import 
DeviceConfig +from calibre.devices.usbms.driver import USBMS, BookList + +class DriverBase(DeviceConfig, DevicePlugin): + # Reduce to just the formats eligible for plugboard xforms + # These formats are shown in the customization dialog + FORMATS = ['epub', 'mobi'] + USER_CAN_ADD_NEW_FORMATS = False + + # Hide the standard customization widgets + SUPPORTS_SUB_DIRS = False + MUST_READ_METADATA = True + SUPPORTS_USE_AUTHOR_SORT = False + + +# This class is added to the standard device plugin chain, so that it can +# be configured. It has invalid vendor_id etc, so it will never match a +# device. The 'real' CONTENT_SERVER will use the config from it. +class CONTENT_SERVER_FOR_CONFIG(USBMS): + name = 'Content Server Interface' + gui_name = 'Content Server' + description = _('Enables metadata plugboards to be used with Content Server.') + author = 'GRiker' + supported_platforms = ['windows', 'osx', 'linux'] + + VENDOR_ID = [0xffff] + PRODUCT_ID = [0xffff] + BCD = [0xffff] + DEVICE_PLUGBOARD_NAME = 'CONTENT_SERVER' + + def config_widget(cls): + ''' + Configure a minimal QWidget + Better to simply disable the config_widget altogether + ''' + cw = DriverBase.config_widget() + # Turn off the Save template + cw.opt_save_template.setVisible(False) + cw.label.setVisible(False) + # Hide the up/down arrows + cw.column_up.setVisible(False) + cw.column_down.setVisible(False) + # Retitle + cw.groupBox.setTitle(_("Enable metadata plugboards for the following formats:")) + return cw + +class CONTENT_SERVER(USBMS): + + FORMATS = CONTENT_SERVER_FOR_CONFIG.FORMATS + DEVICE_PLUGBOARD_NAME = 'CONTENT_SERVER' + + def __init__(self): + if DEBUG: + print("CONTENT_SERVER.init()") + pass + + def set_plugboards(self, plugboards, pb_func): + # This method is called with the plugboard that matches the format + # declared in use_plugboard_ext and a device name of CONTENT_SERVER + if DEBUG: + print("CONTENT_SERVER.set_plugboards()") + print(' using plugboard %s' % plugboards) + self.plugboards = 
plugboards + self.plugboard_func = pb_func + diff --git a/src/calibre/gui2/actions/catalog.py b/src/calibre/gui2/actions/catalog.py index fad6e59294..093985d041 100644 --- a/src/calibre/gui2/actions/catalog.py +++ b/src/calibre/gui2/actions/catalog.py @@ -17,7 +17,7 @@ from calibre.gui2.actions import InterfaceAction class GenerateCatalogAction(InterfaceAction): name = 'Generate Catalog' - action_spec = (_('Create a catalog of the books in your calibre library'), None, None, None) + action_spec = (_('Create a catalog of the books in your calibre library'), 'catalog.png', 'Catalog builder', None) dont_add_to = frozenset(['menubar-device', 'toolbar-device', 'context-menu-device']) def generate_catalog(self): diff --git a/src/calibre/gui2/device.py b/src/calibre/gui2/device.py index 4d4f66eab1..8f21c17eaf 100644 --- a/src/calibre/gui2/device.py +++ b/src/calibre/gui2/device.py @@ -892,7 +892,7 @@ class DeviceMixin(object): # {{{ sub_dest_parts.append('') to = sub_dest_parts[0] fmts = sub_dest_parts[1] - subject = ';'.join(sub_dest_parts[2:]) + subject = ';'.join(sub_dest_parts[2:]) fmts = [x.strip().lower() for x in fmts.split(',')] self.send_by_mail(to, fmts, delete, subject=subject) diff --git a/src/calibre/gui2/dialogs/tweak_epub.py b/src/calibre/gui2/dialogs/tweak_epub.py index db6e93fd7a..a42fb07e40 100755 --- a/src/calibre/gui2/dialogs/tweak_epub.py +++ b/src/calibre/gui2/dialogs/tweak_epub.py @@ -12,6 +12,7 @@ from zipfile import ZipFile, ZIP_DEFLATED, ZIP_STORED from PyQt4.Qt import QDialog +from calibre.constants import isosx, iswindows from calibre.gui2 import open_local_file from calibre.gui2.dialogs.tweak_epub_ui import Ui_Dialog from calibre.libunzip import extract as zipextract @@ -42,11 +43,19 @@ class TweakEpub(QDialog, Ui_Dialog): self.move(parent_loc.x(),parent_loc.y()) def cleanup(self): + if isosx: + try: + import appscript + self.finder = appscript.app('Finder') + self.finder.Finder_windows[os.path.basename(self._exploded)].close() + except: + # 
appscript fails to load on 10.4 + pass + # Delete directory containing exploded ePub if self._exploded is not None: shutil.rmtree(self._exploded, ignore_errors=True) - def display_exploded(self): ''' Generic subprocess launch of native file browser diff --git a/src/calibre/library/server/content.py b/src/calibre/library/server/content.py index 0c3edd1627..faa0a61baf 100644 --- a/src/calibre/library/server/content.py +++ b/src/calibre/library/server/content.py @@ -183,16 +183,37 @@ class ContentServer(object): if fmt is None: raise cherrypy.HTTPError(404, 'book: %d does not have format: %s'%(id, format)) if format == 'EPUB': + # Get the original metadata + mi = self.db.get_metadata(id, index_is_id=True) + + # Instantiate the CONTENT_SERVER driver + from calibre.devices.content_server.driver import CONTENT_SERVER + cs = CONTENT_SERVER() + + # Get any EPUB plugboards for the content server + from calibre.gui2.device import find_plugboard, device_name_for_plugboards + plugboards = self.db.prefs.get('plugboards', {}) + + # Transform the metadata via the plugboard + if hasattr(cs, 'set_plugboards') and callable(cs.set_plugboards): + cs.set_plugboards(plugboards, find_plugboard) + cpb = find_plugboard(device_name_for_plugboards(cs), format.lower(), plugboards) + if cpb: + newmi = mi.deepcopy_metadata() + newmi.template_to_attribute(mi, cpb) + else: + newmi = mi + + # Write the updated file from tempfile import TemporaryFile from calibre.ebooks.metadata.meta import set_metadata raw = fmt.read() fmt = TemporaryFile() fmt.write(raw) fmt.seek(0) - set_metadata(fmt, self.db.get_metadata(id, index_is_id=True, - get_cover=True), - 'epub') + set_metadata(fmt, newmi, 'epub') fmt.seek(0) + mt = guess_type('dummy.'+format.lower())[0] if mt is None: mt = 'application/octet-stream' From 91c5356ac5d14ef807cce610431ec44aa6ab0ff0 Mon Sep 17 00:00:00 2001 From: Charles Haley <> Date: Mon, 18 Apr 2011 09:26:34 +0100 Subject: [PATCH 14/30] Suggested content server plugboard implementation 
for Greg --- src/calibre/customize/builtins.py | 2 - .../devices/content_server/__init__.py | 10 --- src/calibre/devices/content_server/driver.py | 74 ------------------- src/calibre/gui2/device.py | 20 +---- src/calibre/gui2/preferences/plugboard.py | 4 +- src/calibre/library/save_to_disk.py | 32 ++++---- src/calibre/library/server/content.py | 27 +++---- 7 files changed, 34 insertions(+), 135 deletions(-) delete mode 100644 src/calibre/devices/content_server/__init__.py delete mode 100644 src/calibre/devices/content_server/driver.py diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index 458bfec3fd..8f50481f84 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -582,7 +582,6 @@ from calibre.ebooks.snb.output import SNBOutput from calibre.customize.profiles import input_profiles, output_profiles from calibre.devices.apple.driver import ITUNES -from calibre.devices.content_server.driver import CONTENT_SERVER_FOR_CONFIG from calibre.devices.hanlin.driver import HANLINV3, HANLINV5, BOOX, SPECTRA from calibre.devices.blackberry.driver import BLACKBERRY from calibre.devices.cybook.driver import CYBOOK, ORIZON @@ -754,7 +753,6 @@ plugins += [ EEEREADER, NEXTBOOK, ITUNES, - CONTENT_SERVER_FOR_CONFIG ] plugins += [x for x in list(locals().values()) if isinstance(x, type) and \ diff --git a/src/calibre/devices/content_server/__init__.py b/src/calibre/devices/content_server/__init__.py deleted file mode 100644 index 3d1a86922e..0000000000 --- a/src/calibre/devices/content_server/__init__.py +++ /dev/null @@ -1,10 +0,0 @@ -#!/usr/bin/env python -# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai -from __future__ import with_statement - -__license__ = 'GPL v3' -__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>' -__docformat__ = 'restructuredtext en' - - - diff --git a/src/calibre/devices/content_server/driver.py b/src/calibre/devices/content_server/driver.py deleted file mode 100644 index 
84b14f8e62..0000000000 --- a/src/calibre/devices/content_server/driver.py +++ /dev/null @@ -1,74 +0,0 @@ -''' -Created on 17 Apr 2011 - -@author: GRiker, modeled on charles's Folder Device - -''' - -from calibre.constants import DEBUG -from calibre.devices.interface import DevicePlugin -from calibre.devices.usbms.deviceconfig import DeviceConfig -from calibre.devices.usbms.driver import USBMS, BookList - -class DriverBase(DeviceConfig, DevicePlugin): - # Reduce to just the formats eligible for plugboard xforms - # These formats are shown in the customization dialog - FORMATS = ['epub', 'mobi'] - USER_CAN_ADD_NEW_FORMATS = False - - # Hide the standard customization widgets - SUPPORTS_SUB_DIRS = False - MUST_READ_METADATA = True - SUPPORTS_USE_AUTHOR_SORT = False - - -# This class is added to the standard device plugin chain, so that it can -# be configured. It has invalid vendor_id etc, so it will never match a -# device. The 'real' CONTENT_SERVER will use the config from it. -class CONTENT_SERVER_FOR_CONFIG(USBMS): - name = 'Content Server Interface' - gui_name = 'Content Server' - description = _('Enables metadata plugboards to be used with Content Server.') - author = 'GRiker' - supported_platforms = ['windows', 'osx', 'linux'] - - VENDOR_ID = [0xffff] - PRODUCT_ID = [0xffff] - BCD = [0xffff] - DEVICE_PLUGBOARD_NAME = 'CONTENT_SERVER' - - def config_widget(cls): - ''' - Configure a minimal QWidget - Better to simply disable the config_widget altogether - ''' - cw = DriverBase.config_widget() - # Turn off the Save template - cw.opt_save_template.setVisible(False) - cw.label.setVisible(False) - # Hide the up/down arrows - cw.column_up.setVisible(False) - cw.column_down.setVisible(False) - # Retitle - cw.groupBox.setTitle(_("Enable metadata plugboards for the following formats:")) - return cw - -class CONTENT_SERVER(USBMS): - - FORMATS = CONTENT_SERVER_FOR_CONFIG.FORMATS - DEVICE_PLUGBOARD_NAME = 'CONTENT_SERVER' - - def __init__(self): - if DEBUG: - 
print("CONTENT_SERVER.init()") - pass - - def set_plugboards(self, plugboards, pb_func): - # This method is called with the plugboard that matches the format - # declared in use_plugboard_ext and a device name of CONTENT_SERVER - if DEBUG: - print("CONTENT_SERVER.set_plugboards()") - print(' using plugboard %s' % plugboards) - self.plugboards = plugboards - self.plugboard_func = pb_func - diff --git a/src/calibre/gui2/device.py b/src/calibre/gui2/device.py index 8f21c17eaf..2e252047af 100644 --- a/src/calibre/gui2/device.py +++ b/src/calibre/gui2/device.py @@ -29,8 +29,7 @@ from calibre.ebooks.metadata.meta import set_metadata from calibre.constants import DEBUG from calibre.utils.config import prefs, tweaks from calibre.utils.magick.draw import thumbnail -from calibre.library.save_to_disk import plugboard_any_device_value, \ - plugboard_any_format_value +from calibre.library.save_to_disk import find_plugboard # }}} class DeviceJob(BaseJob): # {{{ @@ -93,23 +92,6 @@ class DeviceJob(BaseJob): # {{{ # }}} -def find_plugboard(device_name, format, plugboards): - cpb = None - if format in plugboards: - cpb = plugboards[format] - elif plugboard_any_format_value in plugboards: - cpb = plugboards[plugboard_any_format_value] - if cpb is not None: - if device_name in cpb: - cpb = cpb[device_name] - elif plugboard_any_device_value in cpb: - cpb = cpb[plugboard_any_device_value] - else: - cpb = None - if DEBUG: - prints('Device using plugboard', format, device_name, cpb) - return cpb - def device_name_for_plugboards(device_class): if hasattr(device_class, 'DEVICE_PLUGBOARD_NAME'): return device_class.DEVICE_PLUGBOARD_NAME diff --git a/src/calibre/gui2/preferences/plugboard.py b/src/calibre/gui2/preferences/plugboard.py index 8f2b084d76..c5db7074dc 100644 --- a/src/calibre/gui2/preferences/plugboard.py +++ b/src/calibre/gui2/preferences/plugboard.py @@ -15,6 +15,7 @@ from calibre.gui2.preferences.plugboard_ui import Ui_Form from calibre.customize.ui import metadata_writers, 
device_plugins from calibre.library.save_to_disk import plugboard_any_format_value, \ plugboard_any_device_value, plugboard_save_to_disk_value +from calibre.library.server.content import plugboard_content_server_value from calibre.utils.formatter import validation_formatter @@ -74,7 +75,8 @@ class ConfigWidget(ConfigWidgetBase, Ui_Form): self.devices.append(n) self.devices.sort(cmp=lambda x, y: cmp(x.lower(), y.lower())) self.devices.insert(1, plugboard_save_to_disk_value) - self.devices.insert(2, plugboard_any_device_value) + self.devices.insert(1, plugboard_content_server_value) + self.devices.insert(1, plugboard_any_device_value) self.new_device.addItems(self.devices) self.formats = [''] diff --git a/src/calibre/library/save_to_disk.py b/src/calibre/library/save_to_disk.py index 96c42e6e0e..3c57af40a8 100644 --- a/src/calibre/library/save_to_disk.py +++ b/src/calibre/library/save_to_disk.py @@ -51,6 +51,23 @@ for x in FORMAT_ARG_DESCS: FORMAT_ARGS[x] = '' +def find_plugboard(device_name, format, plugboards): + cpb = None + if format in plugboards: + cpb = plugboards[format] + elif plugboard_any_format_value in plugboards: + cpb = plugboards[plugboard_any_format_value] + if cpb is not None: + if device_name in cpb: + cpb = cpb[device_name] + elif plugboard_any_device_value in cpb: + cpb = cpb[plugboard_any_device_value] + else: + cpb = None + if DEBUG: + prints('Device using plugboard', format, device_name, cpb) + return cpb + def config(defaults=None): if defaults is None: c = Config('save_to_disk', _('Options to control saving to disk')) @@ -279,20 +296,7 @@ def do_save_book_to_disk(id_, mi, cover, plugboards, written = False for fmt in formats: global plugboard_save_to_disk_value, plugboard_any_format_value - dev_name = plugboard_save_to_disk_value - cpb = None - if fmt in plugboards: - cpb = plugboards[fmt] - if dev_name in cpb: - cpb = cpb[dev_name] - else: - cpb = None - if cpb is None and plugboard_any_format_value in plugboards: - cpb = 
plugboards[plugboard_any_format_value] - if dev_name in cpb: - cpb = cpb[dev_name] - else: - cpb = None + cpb = find_plugboard(plugboard_save_to_disk_value, fmt, plugboards) # Leave this here for a while, in case problems arise. if cpb is not None: prints('Save-to-disk using plugboard:', fmt, cpb) diff --git a/src/calibre/library/server/content.py b/src/calibre/library/server/content.py index faa0a61baf..8d9e71c528 100644 --- a/src/calibre/library/server/content.py +++ b/src/calibre/library/server/content.py @@ -12,9 +12,13 @@ import cherrypy from calibre import fit_image, guess_type from calibre.utils.date import fromtimestamp from calibre.library.caches import SortKeyGenerator +from calibre.library.save_to_disk import find_plugboard + from calibre.utils.magick.draw import save_cover_data_to, Image, \ thumbnail as generate_thumbnail +plugboard_content_server_value = 'content_server' + class CSSortKeyGenerator(SortKeyGenerator): def __init__(self, fields, fm, db_prefs): @@ -186,23 +190,16 @@ class ContentServer(object): # Get the original metadata mi = self.db.get_metadata(id, index_is_id=True) - # Instantiate the CONTENT_SERVER driver - from calibre.devices.content_server.driver import CONTENT_SERVER - cs = CONTENT_SERVER() - # Get any EPUB plugboards for the content server - from calibre.gui2.device import find_plugboard, device_name_for_plugboards plugboards = self.db.prefs.get('plugboards', {}) - - # Transform the metadata via the plugboard - if hasattr(cs, 'set_plugboards') and callable(cs.set_plugboards): - cs.set_plugboards(plugboards, find_plugboard) - cpb = find_plugboard(device_name_for_plugboards(cs), format.lower(), plugboards) - if cpb: - newmi = mi.deepcopy_metadata() - newmi.template_to_attribute(mi, cpb) - else: - newmi = mi + cpb = find_plugboard(plugboard_content_server_value, + 'epub', plugboards) + if cpb: + # Transform the metadata via the plugboard + newmi = mi.deepcopy_metadata() + newmi.template_to_attribute(mi, cpb) + else: + newmi = mi # 
Write the updated file from tempfile import TemporaryFile From 3709dcbc621e152472184bacbb056e16e1aef1fe Mon Sep 17 00:00:00 2001 From: Charles Haley <> Date: Mon, 18 Apr 2011 10:43:20 +0100 Subject: [PATCH 15/30] Add check for valid formats --- src/calibre/gui2/preferences/plugboard.py | 16 +++++++++++++++- src/calibre/library/server/content.py | 1 + 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/src/calibre/gui2/preferences/plugboard.py b/src/calibre/gui2/preferences/plugboard.py index c5db7074dc..7036ddf8f3 100644 --- a/src/calibre/gui2/preferences/plugboard.py +++ b/src/calibre/gui2/preferences/plugboard.py @@ -15,7 +15,8 @@ from calibre.gui2.preferences.plugboard_ui import Ui_Form from calibre.customize.ui import metadata_writers, device_plugins from calibre.library.save_to_disk import plugboard_any_format_value, \ plugboard_any_device_value, plugboard_save_to_disk_value -from calibre.library.server.content import plugboard_content_server_value +from calibre.library.server.content import plugboard_content_server_value, \ + plugboard_content_server_formats from calibre.utils.formatter import validation_formatter @@ -69,13 +70,17 @@ class ConfigWidget(ConfigWidgetBase, Ui_Form): self.device_label.setText(_('Device currently connected: None')) self.devices = ['', 'APPLE', 'FOLDER_DEVICE'] + self.device_to_formats_map = {} for device in device_plugins(): n = device_name_for_plugboards(device) + self.device_to_formats_map[n] = device.FORMATS if n not in self.devices: self.devices.append(n) self.devices.sort(cmp=lambda x, y: cmp(x.lower(), y.lower())) self.devices.insert(1, plugboard_save_to_disk_value) self.devices.insert(1, plugboard_content_server_value) + self.device_to_formats_map[plugboard_content_server_value] = \ + plugboard_content_server_formats self.devices.insert(1, plugboard_any_device_value) self.new_device.addItems(self.devices) @@ -232,6 +237,15 @@ class ConfigWidget(ConfigWidgetBase, Ui_Form): show=True) 
self.new_device.setCurrentIndex(0) return + if self.current_device in self.device_to_formats_map: + allowable_formats = self.device_to_formats_map[self.current_device] + if self.current_format not in allowable_formats: + error_dialog(self, '', + _('The {0} device does not support the {1} format.'). + format(self.current_device, self.current_format), + show=True) + self.new_device.setCurrentIndex(0) + return self.set_fields() def new_format_changed(self, txt): diff --git a/src/calibre/library/server/content.py b/src/calibre/library/server/content.py index 8d9e71c528..08de4faecd 100644 --- a/src/calibre/library/server/content.py +++ b/src/calibre/library/server/content.py @@ -18,6 +18,7 @@ from calibre.utils.magick.draw import save_cover_data_to, Image, \ thumbnail as generate_thumbnail plugboard_content_server_value = 'content_server' +plugboard_content_server_formats = ['epub'] class CSSortKeyGenerator(SortKeyGenerator): From 09da88b6d18d0c4bf09e126f6af7195069b15863 Mon Sep 17 00:00:00 2001 From: Lee <ldolse@yahoo.com> Date: Mon, 18 Apr 2011 18:06:10 +0800 Subject: [PATCH 16/30] port overdrive plugin to 8.x framework, remove from 7.x framework --- src/calibre/customize/builtins.py | 8 +- src/calibre/ebooks/metadata/covers.py | 27 - src/calibre/ebooks/metadata/fetch.py | 21 - src/calibre/ebooks/metadata/overdrive.py | 459 ---------------- src/calibre/ebooks/metadata/sources/base.py | 4 +- .../ebooks/metadata/sources/overdrive.py | 510 ++++++++++++++++++ 6 files changed, 516 insertions(+), 513 deletions(-) delete mode 100644 src/calibre/ebooks/metadata/overdrive.py create mode 100755 src/calibre/ebooks/metadata/sources/overdrive.py diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index f4a8c6b6bc..75c02c7e00 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -633,14 +633,14 @@ if test_eight_code: # }}} else: from calibre.ebooks.metadata.fetch import GoogleBooks, ISBNDB, Amazon, \ - 
KentDistrictLibrary, Overdrive + KentDistrictLibrary from calibre.ebooks.metadata.douban import DoubanBooks from calibre.ebooks.metadata.nicebooks import NiceBooks, NiceBooksCovers from calibre.ebooks.metadata.covers import OpenLibraryCovers, \ - AmazonCovers, DoubanCovers, OverdriveCovers + AmazonCovers, DoubanCovers - plugins += [GoogleBooks, ISBNDB, Amazon, Overdrive, - OpenLibraryCovers, AmazonCovers, DoubanCovers, OverdriveCovers, + plugins += [GoogleBooks, ISBNDB, Amazon, + OpenLibraryCovers, AmazonCovers, DoubanCovers, NiceBooksCovers, KentDistrictLibrary, DoubanBooks, NiceBooks] plugins += [ diff --git a/src/calibre/ebooks/metadata/covers.py b/src/calibre/ebooks/metadata/covers.py index f705317f59..10acff4e61 100644 --- a/src/calibre/ebooks/metadata/covers.py +++ b/src/calibre/ebooks/metadata/covers.py @@ -151,33 +151,6 @@ class AmazonCovers(CoverDownload): # {{{ # }}} -class OverdriveCovers(CoverDownload): # {{{ - - name = 'overdrive.com covers' - description = _('Download covers from Overdrive') - author = 'Kovid Goyal' - - - def has_cover(self, mi, ans, timeout=5.): - if not mi.authors or not mi.title: - return False - return True - - def get_covers(self, mi, result_queue, abort, timeout=5.): - if not mi.isbn: - return - from calibre.ebooks.metadata.overdrive import get_cover_url - br = browser() - try: - url = get_cover_url(mi.isbn, mi.title, mi.authors, br) - cover_data = br.open_novisit(url).read() - result_queue.put((True, cover_data, 'jpg', self.name)) - except Exception, e: - result_queue.put((False, self.exception_to_string(e), - traceback.format_exc(), self.name)) - -# }}} - def check_for_cover(mi, timeout=5.): # {{{ from calibre.customize.ui import cover_sources ans = Event() diff --git a/src/calibre/ebooks/metadata/fetch.py b/src/calibre/ebooks/metadata/fetch.py index fb01c5dd71..e1fac50d16 100644 --- a/src/calibre/ebooks/metadata/fetch.py +++ b/src/calibre/ebooks/metadata/fetch.py @@ -250,27 +250,6 @@ class Amazon(MetadataSource): # {{{ # }}} 
-class Overdrive(MetadataSource): # {{{ - - name = 'Overdrive' - metadata_type = 'social' - description = _('Downloads metadata from the Overdrive library network') - - has_html_comments = True - - def fetch(self): - if not self.isbn: - return - from calibre.ebooks.metadata.overdrive import get_social_metadata - try: - self.results = get_social_metadata(self.title, self.book_author, self.isbn) - - except Exception, e: - self.exception = e - self.tb = traceback.format_exc() - - # }}} - class KentDistrictLibrary(MetadataSource): # {{{ name = 'Kent District Library' diff --git a/src/calibre/ebooks/metadata/overdrive.py b/src/calibre/ebooks/metadata/overdrive.py deleted file mode 100644 index 38d6d730ff..0000000000 --- a/src/calibre/ebooks/metadata/overdrive.py +++ /dev/null @@ -1,459 +0,0 @@ -#!/usr/bin/env python -__license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' -__docformat__ = 'restructuredtext en' - -''' -Fetch metadata using Overdrive Content Reserve -''' -import sys, re, random, urllib, mechanize, copy -from threading import RLock - -from lxml import html, etree -from lxml.html import soupparser - -from calibre import browser -from calibre.ebooks.metadata import check_isbn -from calibre.ebooks.metadata.sources.base import Source -from calibre.ebooks.metadata.book.base import Metadata -from calibre.ebooks.chardet import xml_to_unicode -from calibre.library.comments import sanitize_comments_html - -ovrdrv_data_cache = {} -cover_url_cache = {} -cache_lock = RLock() -base_url = 'http://search.overdrive.com/' - - -def create_query(self, title=None, authors=None, identifiers={}): - q = '' - if title or authors: - def build_term(prefix, parts): - return ' '.join('in'+prefix + ':' + x for x in parts) - title_tokens = list(self.get_title_tokens(title, False)) - if title_tokens: - q += build_term('title', title_tokens) - author_tokens = self.get_author_tokens(authors, - only_first_author=True) - if author_tokens: - q += ('+' if q else '') + 
build_term('author', - author_tokens) - - if isinstance(q, unicode): - q = q.encode('utf-8') - if not q: - return None - return BASE_URL+urlencode({ - 'q':q, - }) - - -def get_base_referer(): - choices = [ - 'http://overdrive.chipublib.org/82DC601D-7DDE-4212-B43A-09D821935B01/10/375/en/', - 'http://emedia.clevnet.org/9D321DAD-EC0D-490D-BFD8-64AE2C96ECA8/10/241/en/', - 'http://singapore.lib.overdrive.com/F11D55BE-A917-4D63-8111-318E88B29740/10/382/en/', - 'http://ebooks.nypl.org/20E48048-A377-4520-BC43-F8729A42A424/10/257/en/', - 'http://spl.lib.overdrive.com/5875E082-4CB2-4689-9426-8509F354AFEF/10/335/en/' - ] - return choices[random.randint(0, len(choices)-1)] - -def format_results(reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid): - fix_slashes = re.compile(r'\\/') - thumbimage = fix_slashes.sub('/', thumbimage) - worldcatlink = fix_slashes.sub('/', worldcatlink) - cover_url = re.sub('(?P<img>(Ima?g(eType-)?))200', '\g<img>100', thumbimage) - social_metadata_url = base_url+'TitleInfo.aspx?ReserveID='+reserveid+'&FormatID='+formatid - series_num = '' - if not series: - if subtitle: - title = od_title+': '+subtitle - else: - title = od_title - else: - title = od_title - m = re.search("([0-9]+$)", subtitle) - if m: - series_num = float(m.group(1)) - return [cover_url, social_metadata_url, worldcatlink, series, series_num, publisher, creators, reserveid, title] - -def safe_query(br, query_url): - ''' - The query must be initialized by loading an empty search results page - this page attempts to set a cookie that Mechanize doesn't like - copy the cookiejar to a separate instance and make a one-off request with the temp cookiejar - ''' - goodcookies = br._ua_handlers['_cookies'].cookiejar - clean_cj = mechanize.CookieJar() - cookies_to_copy = [] - for cookie in goodcookies: - copied_cookie = copy.deepcopy(cookie) - cookies_to_copy.append(copied_cookie) - for copied_cookie in cookies_to_copy: - 
clean_cj.set_cookie(copied_cookie) - - br.open_novisit(query_url) - - br.set_cookiejar(clean_cj) - - -def overdrive_search(br, q, title, author): - q_query = q+'default.aspx/SearchByKeyword' - q_init_search = q+'SearchResults.aspx' - # get first author as string - convert this to a proper cleanup function later - s = Source(None) - print "printing list with string:" - #print list(s.get_author_tokens(['J. R. R. Tolkien'])) - print "printing list with author "+str(author)+":" - print list(s.get_author_tokens(author)) - author_tokens = list(s.get_author_tokens(author)) - print "there are "+str(len(author_tokens))+" author tokens" - for token in author_tokens: - print "cleaned up author token is: "+str(token) - - - title_tokens = list(s.get_title_tokens(title)) - print "there are "+str(len(title_tokens))+" title tokens" - for token in title_tokens: - print "cleaned up title token is: "+str(token) - - if len(title_tokens) >= len(author_tokens): - initial_q = ' '.join(title_tokens) - xref_q = '+'.join(author_tokens) - else: - initial_q = ' '.join(author_tokens) - xref_q = '+'.join(title_tokens) - - print "initial query is "+str(initial_q) - print "cross reference query is "+str(xref_q) - q_xref = q+'SearchResults.svc/GetResults?iDisplayLength=50&sSearch='+xref_q - query = '{"szKeyword":"'+initial_q+'"}' - - # main query, requires specific Content Type header - req = mechanize.Request(q_query) - req.add_header('Content-Type', 'application/json; charset=utf-8') - br.open_novisit(req, query) - - print "q_init_search is "+q_init_search - # initiate the search without messing up the cookiejar - safe_query(br, q_init_search) - - # get the search results object - results = False - while results == False: - xreq = mechanize.Request(q_xref) - xreq.add_header('X-Requested-With', 'XMLHttpRequest') - xreq.add_header('Referer', q_init_search) - xreq.add_header('Accept', 'application/json, text/javascript, */*') - raw = br.open_novisit(xreq).read() - print "overdrive search result 
is:\n"+raw - for m in re.finditer(ur'"iTotalDisplayRecords":(?P<displayrecords>\d+).*?"iTotalRecords":(?P<totalrecords>\d+)', raw): - if int(m.group('displayrecords')) >= 1: - results = True - elif int(m.group('totalrecords')) >= 1: - xref_q = '' - q_xref = q+'SearchResults.svc/GetResults?iDisplayLength=50&sSearch='+xref_q - elif int(m.group('totalrecords')) == 0: - return '' - - print "\n\nsorting results" - return sort_ovrdrv_results(raw, title, title_tokens, author, author_tokens) - - -def sort_ovrdrv_results(raw, title=None, title_tokens=None, author=None, author_tokens=None, ovrdrv_id=None): - print "\ntitle to search for is "+str(title)+"\nauthor to search for is "+str(author) - close_matches = [] - raw = re.sub('.*?\[\[(?P<content>.*?)\]\].*', '[[\g<content>]]', raw) - results = eval(raw) - print "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n" - #print results - # The search results are either from a keyword search or a multi-format list from a single ID, - # sort through the results for closest match/format - if results: - for reserveid, od_title, subtitle, edition, series, publisher, format, formatid, creators, \ - thumbimage, shortdescription, worldcatlink, excerptlink, creatorfile, sorttitle, \ - availabletolibrary, availabletoretailer, relevancyrank, unknown1, unknown2, unknown3 in results: - print "this record's title is "+od_title+", subtitle is "+subtitle+", author[s] are "+creators+", series is "+series - if ovrdrv_id is not None and int(formatid) in [1, 50, 410, 900]: - print "overdrive id is not None, searching based on format type priority" - return format_results(reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid) - else: - creators = creators.split(', ') - print "split creators from results are: "+str(creators) - # if an exact match in a preferred format occurs - if creators[0] == author[0] and od_title == title and int(formatid) in [1, 50, 410, 900]: - print "Got Exact Match!!!" 
- return format_results(reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid) - else: - close_title_match = False - close_author_match = False - print "format id is "+str(formatid) - for token in title_tokens: - print "attempting to find "+str(token)+" title token" - if od_title.lower().find(token.lower()) != -1: - print "matched token" - close_title_match = True - else: - print "token didn't match" - close_title_match = False - break - for token in author_tokens: - print "attempting to find "+str(token)+" author token" - if creators[0].lower().find(token.lower()) != -1: - print "matched token" - close_author_match = True - else: - print "token didn't match" - close_author_match = False - break - if close_title_match and close_author_match and int(formatid) in [1, 50, 410, 900]: - if subtitle and series: - close_matches.insert(0, format_results(reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid)) - else: - close_matches.append(format_results(reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid)) - if close_matches: - return close_matches[0] - else: - return '' - else: - return '' - - - -def overdrive_get_record(br, q, ovrdrv_id): - search_url = q+'SearchResults.aspx?ReserveID={'+ovrdrv_id+'}' - results_url = q+'SearchResults.svc/GetResults?sEcho=1&iColumns=18&sColumns=ReserveID%2CTitle%2CSubtitle%2CEdition%2CSeries%2CPublisher%2CFormat%2CFormatID%2CCreators%2CThumbImage%2CShortDescription%2CWorldCatLink%2CExcerptLink%2CCreatorFile%2CSortTitle%2CAvailableToLibrary%2CAvailableToRetailer%2CRelevancyRank&iDisplayStart=0&iDisplayLength=10&sSearch=&bEscapeRegex=true&iSortingCols=1&iSortCol_0=17&sSortDir_0=asc' - - # get the base url to set the proper session cookie - br.open_novisit(q) - - # initialize the search - safe_query(br, search_url) - - # get the results - req = mechanize.Request(results_url) - req.add_header('X-Requested-With', 
'XMLHttpRequest') - req.add_header('Referer', search_url) - req.add_header('Accept', 'application/json, text/javascript, */*') - raw = br.open_novisit(req) - raw = str(list(raw)) - return sort_ovrdrv_results(raw, None, None, None, ovrdrv_id) - - -def find_ovrdrv_data(br, title, author, isbn, ovrdrv_id=None): - print "in find_ovrdrv_data, title is "+str(title)+", author is "+str(author)+", overdrive id is "+str(ovrdrv_id) - q = base_url - if ovrdrv_id is None: - return overdrive_search(br, q, title, author) - else: - return overdrive_get_record(br, q, ovrdrv_id) - - - -def to_ovrdrv_data(br, title, author, isbn, ovrdrv_id=None): - print "starting to_ovrdrv_data" - with cache_lock: - ans = ovrdrv_data_cache.get(isbn, None) - if ans: - print "inside to_ovrdrv_data, cache lookup successful, ans is "+str(ans) - return ans - if ans is False: - print "inside to_ovrdrv_data, ans returned False" - return None - try: - print "trying to retrieve data, running find_ovrdrv_data" - ovrdrv_data = find_ovrdrv_data(br, title, author, isbn, ovrdrv_id) - print "ovrdrv_data is "+str(ovrdrv_data) - except: - import traceback - traceback.print_exc() - ovrdrv_data = None - - with cache_lock: - ovrdrv_data_cache[isbn] = ovrdrv_data if ovrdrv_data else False - if ovrdrv_data: - from calibre.ebooks.metadata.xisbn import xisbn - for i in xisbn.get_associated_isbns(isbn): - with cache_lock: - ovrdrv_data_cache[i] = ovrdrv_data - - return ovrdrv_data - - -def get_social_metadata(title, authors, isbn, ovrdrv_id=None): - author = authors[0] - mi = Metadata(title, authors) - br = browser() - print "calling to_ovrdrv_data from inside get_social_metadata" - ovrdrv_data = to_ovrdrv_data(br, title, authors, isbn, ovrdrv_id) - - #[cover_url, social_metadata_url, worldcatlink, series, series_num, publisher, creators, reserveid, title] - - if len(ovrdrv_data[3]) > 1: - mi.series = ovrdrv_data[3] - if ovrdrv_data[4]: - mi.series_index = ovrdrv_data[4] - mi.publisher = ovrdrv_data[5] - mi.authors = 
ovrdrv_data[6] - if ovrdrv_id is None: - ovrdrv_id = ovrdrv_data[7] - mi.set_identifier('overdrive', ovrdrv_id) - mi.title = ovrdrv_data[8] - print "populated basic social metadata, getting detailed metadata" - if ovrdrv_data and get_metadata_detail(br, ovrdrv_data[1], mi, isbn): - return mi - print "failed to get detailed metadata, returning basic info" - return mi - -def get_cover_url(isbn, title, author, br, ovrdrv_id=None): - print "starting get_cover_url" - print "title is "+str(title) - print "author is "+str(author[0]) - print "isbn is "+str(isbn) - print "ovrdrv_id is "+str(ovrdrv_id) - - with cache_lock: - ans = cover_url_cache.get(isbn, None) - #ans = cover_url_cache.get(ovrdrv_id, None) - if ans: - print "cover url cache lookup returned positive, ans is "+str(ans) - return ans - if ans is False: - "cover url cache lookup returned false" - return None - print "in get_cover_url, calling to_ovrdrv_data function" - ovrdrv_data = to_ovrdrv_data(br, title, author, isbn, ovrdrv_id) - if ovrdrv_data: - ans = ovrdrv_data[0] - print "inside get_cover_url, got url from to_ovrdrv_data, ans is "+str(ans) - if ans: - print "writing cover url to url cache" - with cache_lock: - cover_url_cache[isbn] = ans - #cover_url_cache[ovrdrv_id] = ans - return ans - - with cache_lock: - print "marking cover url cache for this isbn false" - cover_url_cache[isbn] = False - return None - -def _get_cover_url(br, ovrdrv_data): - q = ovrdrv_data[1] - try: - raw = br.open_novisit(q).read() - except Exception, e: - if callable(getattr(e, 'getcode', None)) and \ - e.getcode() == 404: - return None - raise - if '<title>404 - ' in raw: - return None - raw = xml_to_unicode(raw, strip_encoding_pats=True, - resolve_entities=True)[0] - try: - root = soupparser.fromstring(raw) - except: - return False - - imgs = root.xpath('//img[@id="prodImage" and @src]') - if imgs: - src = imgs[0].get('src') - parts = src.split('/') - if len(parts) > 3: - bn = parts[-1] - sparts = bn.split('_') - if 
len(sparts) > 2: - bn = sparts[0] + sparts[-1] - return ('/'.join(parts[:-1]))+'/'+bn - return None - -def get_metadata_detail(br, metadata_url, mi, isbn=None): - try: - raw = br.open_novisit(metadata_url).read() - except Exception, e: - if callable(getattr(e, 'getcode', None)) and \ - e.getcode() == 404: - return False - raise - raw = xml_to_unicode(raw, strip_encoding_pats=True, - resolve_entities=True)[0] - try: - root = soupparser.fromstring(raw) - except: - return False - - isbn = check_isbn(isbn) - - pub_date = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblPubDate']/text()") - lang = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblLanguage']/text()") - subjects = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblSubjects']/text()") - ebook_isbn = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblIdentifier']/text()") - desc = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblDescription']/ancestor::div[1]") - - if pub_date: - from calibre.utils.date import parse_date - mi.pubdate = parse_date(pub_date[0].strip()) - if lang: - mi.language = lang[0].strip() - print "languages is "+str(mi.language) - if ebook_isbn and isbn is None: - print "ebook isbn is "+str(ebook_isbn[0]) - mi.set_identifier('isbn', ebook_isbn) - #elif isbn is not None: - # mi.set_identifier('isbn', isbn) - if subjects: - mi.tags = [tag.strip() for tag in subjects[0].split(',')] - print "tags are "+str(mi.tags) - if desc: - desc = desc[0] - desc = html.tostring(desc, method='html', encoding=unicode).strip() - # remove all attributes from tags - desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc) - # Remove comments - desc = re.sub(r'(?s)<!--.*?-->', '', desc) - mi.comments = sanitize_comments_html(desc) - - return True - -def main(args=sys.argv): - print "running through main tests" - import tempfile, os, time - tdir = tempfile.gettempdir() - br = browser() - for ovrdrv_id, isbn, title, author in [ - #(None, '0899661343', 'On the Road', 
['Jack Kerouac']), # basic test, no series, single author - #(None, '9780061952838', 'The Fellowship of the Ring', ['J. R. R. Tolkien']), # Series test, multi-author - #(None, '9780061952838', 'The Two Towers (The Lord of the Rings, Book II)', ['J. R. R. Tolkien']), # Series test, book 2 - #(None, '9780618153985', 'The Fellowship of the Ring (The Lord of the Rings, Part 1)', ['J.R.R. Tolkien']), - #('57844706-20fa-4ace-b5ee-3470b1b52173', None, 'The Two Towers', ['J. R. R. Tolkien']), # Series test, w/ ovrdrv id - #(None, '9780345505057', 'Deluge', ['Anne McCaffrey']) # Multiple authors - #(None, None, 'Deluge', ['Anne McCaffrey']) # Empty ISBN - #(None, None, 'On the Road', ['Jack Kerouac']), # Nonetype ISBN - #(None, '9780345435279', 'A Caress of Twilight', ['Laurell K. Hamilton']), - #(None, '9780606087230', 'The Omnivore\'s Dilemma : A Natural History of Four Meals', ['Michael Pollan']), # Subtitle colon - #(None, '9780061747649', 'Mental_Floss Presents: Condensed Knowledge', ['Will Pearson', 'Mangesh Hattikudur']), - #(None, '9781400050802', 'The Zombie Survival Guide', ['Max Brooks']), # Two books with this title by this author - #(None, '9781775414315', 'The Worst Journey in the World / Antarctic 1910-1913', ['Apsley Cherry-Garrard']), # Garbage sub-title - #(None, '9780440335160', 'Outlander', ['Diana Gabaldon']), # Returns lots of results to sort through to get the best match - (None, '9780345509741', 'The Horror Stories of Robert E. Howard', ['Robert E. 
Howard']), # Complex title with initials/dots stripped, some results don't have a cover - ]: - cpath = os.path.join(tdir, title+'.jpg') - print "cpath is "+cpath - st = time.time() - curl = get_cover_url(isbn, title, author, br, ovrdrv_id) - print '\n\n Took ', time.time() - st, ' to get basic metadata\n\n' - if curl is None: - print 'No cover found for', title - else: - print "curl is "+curl - #open(cpath, 'wb').write(br.open_novisit(curl).read()) - #print 'Cover for', title, 'saved to', cpath - st = time.time() - print get_social_metadata(title, author, isbn, ovrdrv_id) - print '\n\n Took ', time.time() - st, ' to get detailed metadata\n\n' - - return 0 - -if __name__ == '__main__': - sys.exit(main()) diff --git a/src/calibre/ebooks/metadata/sources/base.py b/src/calibre/ebooks/metadata/sources/base.py index 5911a357ac..53fe9a4c2d 100644 --- a/src/calibre/ebooks/metadata/sources/base.py +++ b/src/calibre/ebooks/metadata/sources/base.py @@ -313,8 +313,8 @@ class Source(Plugin): (r'(\d+),(\d+)', r'\1\2'), # Remove hyphens only if they have whitespace before them (r'(\s-)', ' '), - # Remove single quotes - (r"'", ''), + # Remove single quotes not followed by 's' + (r"'(?!s)", ''), # Replace other special chars with a space (r'''[:,;+!@#$%^&*(){}.`~"\s\[\]/]''', ' ') ]] diff --git a/src/calibre/ebooks/metadata/sources/overdrive.py b/src/calibre/ebooks/metadata/sources/overdrive.py new file mode 100755 index 0000000000..6950711da4 --- /dev/null +++ b/src/calibre/ebooks/metadata/sources/overdrive.py @@ -0,0 +1,510 @@ +#!/usr/bin/env python +__license__ = 'GPL v3' +__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' +__docformat__ = 'restructuredtext en' + +''' +Fetch metadata using Overdrive Content Reserve +''' +import sys, re, random, urllib, mechanize, copy +from threading import RLock +from Queue import Queue, Empty + +from lxml import html, etree +from lxml.html import soupparser + +from calibre import browser +from calibre.ebooks.metadata import check_isbn 
+from calibre.ebooks.metadata.sources.base import Source +from calibre.ebooks.metadata.book.base import Metadata +from calibre.ebooks.chardet import xml_to_unicode +from calibre.library.comments import sanitize_comments_html + +ovrdrv_data_cache = {} +cover_url_cache = {} +cache_lock = RLock() +base_url = 'http://search.overdrive.com/' + + +class OverDrive(Source): + + name = 'Overdrive' + description = _('Downloads metadata from Overdrive\'s Content Reserve') + + capabilities = frozenset(['identify', 'cover']) + touched_fields = frozenset(['title', 'authors', 'tags', 'pubdate', + 'comments', 'publisher', 'identifier:isbn', 'series', 'series_num', + 'language', 'identifier:overdrive']) + has_html_comments = True + supports_gzip_transfer_encoding = False + cached_cover_url_is_reliable = True + + def identify(self, log, result_queue, abort, title=None, authors=None, # {{{ + identifiers={}, timeout=30): + ovrdrv_id = identifiers.get('overdrive', None) + isbn = identifiers.get('isbn', None) + + br = self.browser + print "in identify, calling to_ovrdrv_data" + ovrdrv_data = self.to_ovrdrv_data(br, title, authors, ovrdrv_id) + if ovrdrv_data: + title = ovrdrv_data[8] + authors = ovrdrv_data[6] + mi = Metadata(title, authors) + self.parse_search_results(ovrdrv_data, mi) + if ovrdrv_id is None: + ovrdrv_id = ovrdrv_data[7] + if isbn is not None: + self.cache_isbn_to_identifier(isbn, ovrdrv_id) + + self.get_book_detail(br, ovrdrv_data[1], mi, ovrdrv_id, log) + + result_queue.put(mi) + + return None + # }}} + + + def get_book_url(self, identifiers): # {{{ + ovrdrv_id = identifiers.get('overdrive', None) + if ovrdrv_id is not None: + ovrdrv_data = ovrdrv_data_cache.get(ovrdrv_id, None) + if ovrdrv_data: + return ovrdrv_data[1] + else: + br = browser() + ovrdrv_data = self.to_ovrdrv_data(br, None, None, ovrdrv_id) + return ovrdrv_data[1] + # }}} + + def download_cover(self, log, result_queue, abort, # {{{ + title=None, authors=None, identifiers={}, timeout=30): + cached_url = 
self.get_cached_cover_url(identifiers) + if cached_url is None: + log.info('No cached cover found, running identify') + rq = Queue() + print "inside download cover, calling identify" + self.identify(log, rq, abort, title=title, authors=authors, + identifiers=identifiers) + if abort.is_set(): + return + results = [] + while True: + try: + results.append(rq.get_nowait()) + except Empty: + break + results.sort(key=self.identify_results_keygen( + title=title, authors=authors, identifiers=identifiers)) + for mi in results: + cached_url = self.get_cached_cover_url(mi.identifiers) + if cached_url is not None: + break + if cached_url is None: + log.info('No cover found') + return + + if abort.is_set(): + return + + ovrdrv_id = identifiers.get('overdrive', None) + br = self.browser + referer = self.get_base_referer()+'ContentDetails-Cover.htm?ID='+ovrdrv_id + print "downloading cover, referer is "+str(referer) + req = mechanize.Request(cached_url) + req.add_header('referer', referer) + log('Downloading cover from:', cached_url) + try: + cdata = br.open_novisit(req, timeout=timeout).read() + result_queue.put((self, cdata)) + except: + log.exception('Failed to download cover from:', cached_url) + # }}} + + def get_cached_cover_url(self, identifiers): # {{{ + url = None + ovrdrv_id = identifiers.get('overdrive', None) + print "inside get_cached_cover_url, ovrdrv_id is "+str(ovrdrv_id) + if ovrdrv_id is None: + isbn = identifiers.get('isbn', None) + if isbn is not None: + ovrdrv_id = self.cached_isbn_to_identifier(isbn) + if ovrdrv_id is not None: + url = self.cached_identifier_to_cover_url(ovrdrv_id) + + return url + # }}} + + def create_query(self, title=None, authors=None, identifiers={}): + q = '' + if title or authors: + def build_term(prefix, parts): + return ' '.join('in'+prefix + ':' + x for x in parts) + title_tokens = list(self.get_title_tokens(title, False, True)) + if title_tokens: + q += build_term('title', title_tokens) + author_tokens = 
self.get_author_tokens(authors, + only_first_author=True) + if author_tokens: + q += ('+' if q else '') + build_term('author', + author_tokens) + + if isinstance(q, unicode): + q = q.encode('utf-8') + if not q: + return None + return BASE_URL+urlencode({ + 'q':q, + }) + + def get_base_referer(self): # to be used for passing referrer headers to cover download + choices = [ + 'http://overdrive.chipublib.org/82DC601D-7DDE-4212-B43A-09D821935B01/10/375/en/', + 'http://emedia.clevnet.org/9D321DAD-EC0D-490D-BFD8-64AE2C96ECA8/10/241/en/', + 'http://singapore.lib.overdrive.com/F11D55BE-A917-4D63-8111-318E88B29740/10/382/en/', + 'http://ebooks.nypl.org/20E48048-A377-4520-BC43-F8729A42A424/10/257/en/', + 'http://spl.lib.overdrive.com/5875E082-4CB2-4689-9426-8509F354AFEF/10/335/en/' + ] + return choices[random.randint(0, len(choices)-1)] + + def format_results(self, reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid): + fix_slashes = re.compile(r'\\/') + thumbimage = fix_slashes.sub('/', thumbimage) + worldcatlink = fix_slashes.sub('/', worldcatlink) + cover_url = re.sub('(?P<img>(Ima?g(eType-)?))200', '\g<img>100', thumbimage) + social_metadata_url = base_url+'TitleInfo.aspx?ReserveID='+reserveid+'&FormatID='+formatid + series_num = '' + if not series: + if subtitle: + title = od_title+': '+subtitle + else: + title = od_title + else: + title = od_title + m = re.search("([0-9]+$)", subtitle) + if m: + series_num = float(m.group(1)) + return [cover_url, social_metadata_url, worldcatlink, series, series_num, publisher, creators, reserveid, title] + + def safe_query(self, br, query_url, post=''): + ''' + The query must be initialized by loading an empty search results page + this page attempts to set a cookie that Mechanize doesn't like + copy the cookiejar to a separate instance and make a one-off request with the temp cookiejar + ''' + goodcookies = br._ua_handlers['_cookies'].cookiejar + clean_cj = mechanize.CookieJar() + 
cookies_to_copy = [] + for cookie in goodcookies: + copied_cookie = copy.deepcopy(cookie) + cookies_to_copy.append(copied_cookie) + for copied_cookie in cookies_to_copy: + clean_cj.set_cookie(copied_cookie) + + if post: + br.open_novisit(query_url, post) + else: + br.open_novisit(query_url) + + br.set_cookiejar(clean_cj) + + + def overdrive_search(self, br, q, title, author): + # re-initialize the cookiejar to so that it's clean + clean_cj = mechanize.CookieJar() + br.set_cookiejar(clean_cj) + q_query = q+'default.aspx/SearchByKeyword' + q_init_search = q+'SearchResults.aspx' + # get first author as string - convert this to a proper cleanup function later + s = Source(None) + print "printing list with author "+str(author)+":" + author_tokens = list(s.get_author_tokens(author)) + print list(author_tokens) + title_tokens = list(s.get_title_tokens(title, False, True)) + print "there are "+str(len(title_tokens))+" title tokens" + for token in title_tokens: + print "cleaned up title token is: "+str(token) + + if len(title_tokens) >= len(author_tokens): + initial_q = ' '.join(title_tokens) + xref_q = '+'.join(author_tokens) + else: + initial_q = ' '.join(author_tokens) + xref_q = '+'.join(title_tokens) + + print "initial query is "+str(initial_q) + print "cross reference query is "+str(xref_q) + q_xref = q+'SearchResults.svc/GetResults?iDisplayLength=50&sSearch='+xref_q + query = '{"szKeyword":"'+initial_q+'"}' + + # main query, requires specific Content Type header + req = mechanize.Request(q_query) + req.add_header('Content-Type', 'application/json; charset=utf-8') + br.open_novisit(req, query) + + print "q_init_search is "+q_init_search + # initiate the search without messing up the cookiejar + self.safe_query(br, q_init_search) + + # get the search results object + results = False + while results == False: + xreq = mechanize.Request(q_xref) + xreq.add_header('X-Requested-With', 'XMLHttpRequest') + xreq.add_header('Referer', q_init_search) + xreq.add_header('Accept', 
'application/json, text/javascript, */*') + raw = br.open_novisit(xreq).read() + print "overdrive search result is:\n"+raw + for m in re.finditer(ur'"iTotalDisplayRecords":(?P<displayrecords>\d+).*?"iTotalRecords":(?P<totalrecords>\d+)', raw): + if int(m.group('displayrecords')) >= 1: + results = True + elif int(m.group('totalrecords')) >= 1: + xref_q = '' + q_xref = q+'SearchResults.svc/GetResults?iDisplayLength=50&sSearch='+xref_q + elif int(m.group('totalrecords')) == 0: + return '' + + print "\n\nsorting results" + return self.sort_ovrdrv_results(raw, title, title_tokens, author, author_tokens) + + + def sort_ovrdrv_results(self, raw, title=None, title_tokens=None, author=None, author_tokens=None, ovrdrv_id=None): + print "\ntitle to search for is "+str(title)+"\nauthor to search for is "+str(author) + close_matches = [] + raw = re.sub('.*?\[\[(?P<content>.*?)\]\].*', '[[\g<content>]]', raw) + results = eval(raw) + print "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n" + #print results + # The search results are either from a keyword search or a multi-format list from a single ID, + # sort through the results for closest match/format + if results: + for reserveid, od_title, subtitle, edition, series, publisher, format, formatid, creators, \ + thumbimage, shortdescription, worldcatlink, excerptlink, creatorfile, sorttitle, \ + availabletolibrary, availabletoretailer, relevancyrank, unknown1, unknown2, unknown3 in results: + print "this record's title is "+od_title+", subtitle is "+subtitle+", author[s] are "+creators+", series is "+series + if ovrdrv_id is not None and int(formatid) in [1, 50, 410, 900]: + print "overdrive id is not None, searching based on format type priority" + return self.format_results(reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid) + else: + creators = creators.split(', ') + print "split creators from results are: "+str(creators) + # if an exact match in a preferred format occurs + if creators[0] 
== author[0] and od_title == title and int(formatid) in [1, 50, 410, 900]: + print "Got Exact Match!!!" + return self.format_results(reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid) + else: + close_title_match = False + close_author_match = False + print "format id is "+str(formatid) + for token in title_tokens: + print "attempting to find "+str(token)+" title token" + if od_title.lower().find(token.lower()) != -1: + print "matched token" + close_title_match = True + else: + print "token didn't match" + close_title_match = False + break + for token in author_tokens: + print "attempting to find "+str(token)+" author token" + if creators[0].lower().find(token.lower()) != -1: + print "matched token" + close_author_match = True + else: + print "token didn't match" + close_author_match = False + break + if close_title_match and close_author_match and int(formatid) in [1, 50, 410, 900]: + if subtitle and series: + close_matches.insert(0, self.format_results(reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid)) + else: + close_matches.append(self.format_results(reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid)) + if close_matches: + return close_matches[0] + else: + return '' + else: + return '' + + + def overdrive_get_record(self, br, q, ovrdrv_id): + search_url = q+'SearchResults.aspx?ReserveID={'+ovrdrv_id+'}' + results_url = q+'SearchResults.svc/GetResults?sEcho=1&iColumns=18&sColumns=ReserveID%2CTitle%2CSubtitle%2CEdition%2CSeries%2CPublisher%2CFormat%2CFormatID%2CCreators%2CThumbImage%2CShortDescription%2CWorldCatLink%2CExcerptLink%2CCreatorFile%2CSortTitle%2CAvailableToLibrary%2CAvailableToRetailer%2CRelevancyRank&iDisplayStart=0&iDisplayLength=10&sSearch=&bEscapeRegex=true&iSortingCols=1&iSortCol_0=17&sSortDir_0=asc' + + # get the base url to set the proper session cookie + br.open_novisit(q) + + # initialize the search + 
self.safe_query(br, search_url) + + # get the results + req = mechanize.Request(results_url) + req.add_header('X-Requested-With', 'XMLHttpRequest') + req.add_header('Referer', search_url) + req.add_header('Accept', 'application/json, text/javascript, */*') + raw = br.open_novisit(req) + raw = str(list(raw)) + clean_cj = mechanize.CookieJar() + br.set_cookiejar(clean_cj) + return self.sort_ovrdrv_results(raw, None, None, None, ovrdrv_id) + + + def find_ovrdrv_data(self, br, title, author, isbn, ovrdrv_id=None): + print "in find_ovrdrv_data, title is "+str(title)+", author is "+str(author)+", overdrive id is "+str(ovrdrv_id) + q = base_url + if ovrdrv_id is None: + return self.overdrive_search(br, q, title, author) + else: + return self.overdrive_get_record(br, q, ovrdrv_id) + + + + def to_ovrdrv_data(self, br, title=None, author=None, ovrdrv_id=None): + ''' + Takes either a title/author combo or an Overdrive ID. One of these + two must be passed to this function. + ''' + print "starting to_ovrdrv_data" + if ovrdrv_id is not None: + with cache_lock: + ans = ovrdrv_data_cache.get(ovrdrv_id, None) + if ans: + print "inside to_ovrdrv_data, cache lookup successful, ans is "+str(ans) + return ans + elif ans is False: + print "inside to_ovrdrv_data, ans returned False" + return None + else: + ovrdrv_data = self.find_ovrdrv_data(br, title, author, ovrdrv_id) + else: + try: + print "trying to retrieve data, running find_ovrdrv_data" + ovrdrv_data = self.find_ovrdrv_data(br, title, author, ovrdrv_id) + print "ovrdrv_data is "+str(ovrdrv_data) + except: + import traceback + traceback.print_exc() + ovrdrv_data = None + print "writing results to ovrdrv_data cache" + with cache_lock: + ovrdrv_data_cache[ovrdrv_id] = ovrdrv_data if ovrdrv_data else False + + return ovrdrv_data if ovrdrv_data else False + + + def parse_search_results(self, ovrdrv_data, mi): + ''' + Parse the formatted search results from the initial Overdrive query and + add the values to the metadta. 
+ + The list object has these values: + [cover_url[0], social_metadata_url[1], worldcatlink[2], series[3], series_num[4], + publisher[5], creators[6], reserveid[7], title[8]] + + ''' + print "inside parse_search_results, writing the metadata results" + ovrdrv_id = ovrdrv_data[7] + mi.set_identifier('overdrive', ovrdrv_id) + + if len(ovrdrv_data[3]) > 1: + mi.series = ovrdrv_data[3] + if ovrdrv_data[4]: + mi.series_index = ovrdrv_data[4] + mi.publisher = ovrdrv_data[5] + mi.authors = ovrdrv_data[6] + mi.title = ovrdrv_data[8] + cover_url = ovrdrv_data[0] + if cover_url: + self.cache_identifier_to_cover_url(ovrdrv_id, + cover_url) + + + def get_book_detail(self, br, metadata_url, mi, ovrdrv_id, log): + try: + raw = br.open_novisit(metadata_url).read() + except Exception, e: + if callable(getattr(e, 'getcode', None)) and \ + e.getcode() == 404: + return False + raise + raw = xml_to_unicode(raw, strip_encoding_pats=True, + resolve_entities=True)[0] + try: + root = soupparser.fromstring(raw) + except: + return False + + pub_date = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblPubDate']/text()") + lang = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblLanguage']/text()") + subjects = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblSubjects']/text()") + ebook_isbn = root.xpath("//td/label[@id='ctl00_ContentPlaceHolder1_lblIdentifier']/text()") + desc = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblDescription']/ancestor::div[1]") + + if pub_date: + from calibre.utils.date import parse_date + mi.pubdate = parse_date(pub_date[0].strip()) + if lang: + mi.language = lang[0].strip() + print "languages is "+str(mi.language) + #if ebook_isbn: + # print "ebook isbn is "+str(ebook_isbn[0]) + # isbn = check_isbn(ebook_isbn[0].strip()) + # if isbn: + # self.cache_isbn_to_identifier(isbn, ovrdrv_id) + # mi.isbn = isbn + if subjects: + mi.tags = [tag.strip() for tag in subjects[0].split(',')] + print "tags are "+str(mi.tags) + if desc: + 
desc = desc[0] + desc = html.tostring(desc, method='html', encoding=unicode).strip() + # remove all attributes from tags + desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc) + # Remove comments + desc = re.sub(r'(?s)<!--.*?-->', '', desc) + mi.comments = sanitize_comments_html(desc) + + return None + + +def main(args=sys.argv): + print "running through main tests" + import tempfile, os, time + tdir = tempfile.gettempdir() + br = browser() + for ovrdrv_id, isbn, title, author in [ + #(None, '0899661343', 'On the Road', ['Jack Kerouac']), # basic test, no series, single author + #(None, '9780061952838', 'The Fellowship of the Ring', ['J. R. R. Tolkien']), # Series test, multi-author + #(None, '9780061952838', 'The Two Towers (The Lord of the Rings, Book II)', ['J. R. R. Tolkien']), # Series test, book 2 + #(None, '9780618153985', 'The Fellowship of the Ring (The Lord of the Rings, Part 1)', ['J.R.R. Tolkien']), + #('57844706-20fa-4ace-b5ee-3470b1b52173', None, 'The Two Towers', ['J. R. R. Tolkien']), # Series test, w/ ovrdrv id + #(None, '9780345505057', 'Deluge', ['Anne McCaffrey']) # Multiple authors + #(None, None, 'Deluge', ['Anne McCaffrey']) # Empty ISBN + #(None, None, 'On the Road', ['Jack Kerouac']), # Nonetype ISBN + #(None, '9780345435279', 'A Caress of Twilight', ['Laurell K. Hamilton']), + #(None, '9780606087230', 'The Omnivore\'s Dilemma : A Natural History of Four Meals', ['Michael Pollan']), # Subtitle colon + #(None, '9780061747649', 'Mental_Floss Presents: Condensed Knowledge', ['Will Pearson', 'Mangesh Hattikudur']), + #(None, '9781400050802', 'The Zombie Survival Guide', ['Max Brooks']), # Two books with this title by this author + #(None, '9781775414315', 'The Worst Journey in the World / Antarctic 1910-1913', ['Apsley Cherry-Garrard']), # Garbage sub-title + #(None, '9780440335160', 'Outlander', ['Diana Gabaldon']), # Returns lots of results to sort through to get the best match + (None, '9780345509741', 'The Horror Stories of Robert E. 
Howard', ['Robert E. Howard']), # Complex title with initials/dots stripped, some results don't have a cover + ]: + cpath = os.path.join(tdir, title+'.jpg') + print "cpath is "+cpath + st = time.time() + curl = get_cover_url(isbn, title, author, br, ovrdrv_id) + print '\n\n Took ', time.time() - st, ' to get basic metadata\n\n' + if curl is None: + print 'No cover found for', title + else: + print "curl is "+curl + #open(cpath, 'wb').write(br.open_novisit(curl).read()) + #print 'Cover for', title, 'saved to', cpath + st = time.time() + print get_social_metadata(title, author, isbn, ovrdrv_id) + print '\n\n Took ', time.time() - st, ' to get detailed metadata\n\n' + + return 0 + +if __name__ == '__main__': + sys.exit(main()) From aa30f306b5f8894641bc5536559b427c02c5303d Mon Sep 17 00:00:00 2001 From: Lee <ldolse@yahoo.com> Date: Mon, 18 Apr 2011 18:17:54 +0800 Subject: [PATCH 17/30] ... --- .../ebooks/metadata/sources/overdrive.py | 64 ++++--------------- 1 file changed, 14 insertions(+), 50 deletions(-) diff --git a/src/calibre/ebooks/metadata/sources/overdrive.py b/src/calibre/ebooks/metadata/sources/overdrive.py index 6950711da4..dd2e8b2a85 100755 --- a/src/calibre/ebooks/metadata/sources/overdrive.py +++ b/src/calibre/ebooks/metadata/sources/overdrive.py @@ -45,7 +45,6 @@ class OverDrive(Source): isbn = identifiers.get('isbn', None) br = self.browser - print "in identify, calling to_ovrdrv_data" ovrdrv_data = self.to_ovrdrv_data(br, title, authors, ovrdrv_id) if ovrdrv_data: title = ovrdrv_data[8] @@ -83,7 +82,6 @@ class OverDrive(Source): if cached_url is None: log.info('No cached cover found, running identify') rq = Queue() - print "inside download cover, calling identify" self.identify(log, rq, abort, title=title, authors=authors, identifiers=identifiers) if abort.is_set(): @@ -110,7 +108,6 @@ class OverDrive(Source): ovrdrv_id = identifiers.get('overdrive', None) br = self.browser referer = self.get_base_referer()+'ContentDetails-Cover.htm?ID='+ovrdrv_id - 
print "downloading cover, referer is "+str(referer) req = mechanize.Request(cached_url) req.add_header('referer', referer) log('Downloading cover from:', cached_url) @@ -124,7 +121,6 @@ class OverDrive(Source): def get_cached_cover_url(self, identifiers): # {{{ url = None ovrdrv_id = identifiers.get('overdrive', None) - print "inside get_cached_cover_url, ovrdrv_id is "+str(ovrdrv_id) if ovrdrv_id is None: isbn = identifiers.get('isbn', None) if isbn is not None: @@ -217,14 +213,9 @@ class OverDrive(Source): q_init_search = q+'SearchResults.aspx' # get first author as string - convert this to a proper cleanup function later s = Source(None) - print "printing list with author "+str(author)+":" author_tokens = list(s.get_author_tokens(author)) - print list(author_tokens) title_tokens = list(s.get_title_tokens(title, False, True)) - print "there are "+str(len(title_tokens))+" title tokens" - for token in title_tokens: - print "cleaned up title token is: "+str(token) - + if len(title_tokens) >= len(author_tokens): initial_q = ' '.join(title_tokens) xref_q = '+'.join(author_tokens) @@ -232,8 +223,6 @@ class OverDrive(Source): initial_q = ' '.join(author_tokens) xref_q = '+'.join(title_tokens) - print "initial query is "+str(initial_q) - print "cross reference query is "+str(xref_q) q_xref = q+'SearchResults.svc/GetResults?iDisplayLength=50&sSearch='+xref_q query = '{"szKeyword":"'+initial_q+'"}' @@ -242,7 +231,6 @@ class OverDrive(Source): req.add_header('Content-Type', 'application/json; charset=utf-8') br.open_novisit(req, query) - print "q_init_search is "+q_init_search # initiate the search without messing up the cookiejar self.safe_query(br, q_init_search) @@ -254,7 +242,6 @@ class OverDrive(Source): xreq.add_header('Referer', q_init_search) xreq.add_header('Accept', 'application/json, text/javascript, */*') raw = br.open_novisit(xreq).read() - print "overdrive search result is:\n"+raw for m in 
re.finditer(ur'"iTotalDisplayRecords":(?P<displayrecords>\d+).*?"iTotalRecords":(?P<totalrecords>\d+)', raw): if int(m.group('displayrecords')) >= 1: results = True @@ -264,54 +251,40 @@ class OverDrive(Source): elif int(m.group('totalrecords')) == 0: return '' - print "\n\nsorting results" return self.sort_ovrdrv_results(raw, title, title_tokens, author, author_tokens) def sort_ovrdrv_results(self, raw, title=None, title_tokens=None, author=None, author_tokens=None, ovrdrv_id=None): - print "\ntitle to search for is "+str(title)+"\nauthor to search for is "+str(author) close_matches = [] raw = re.sub('.*?\[\[(?P<content>.*?)\]\].*', '[[\g<content>]]', raw) results = eval(raw) - print "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n" - #print results + # The search results are either from a keyword search or a multi-format list from a single ID, # sort through the results for closest match/format if results: for reserveid, od_title, subtitle, edition, series, publisher, format, formatid, creators, \ thumbimage, shortdescription, worldcatlink, excerptlink, creatorfile, sorttitle, \ availabletolibrary, availabletoretailer, relevancyrank, unknown1, unknown2, unknown3 in results: - print "this record's title is "+od_title+", subtitle is "+subtitle+", author[s] are "+creators+", series is "+series if ovrdrv_id is not None and int(formatid) in [1, 50, 410, 900]: - print "overdrive id is not None, searching based on format type priority" return self.format_results(reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid) else: creators = creators.split(', ') - print "split creators from results are: "+str(creators) # if an exact match in a preferred format occurs if creators[0] == author[0] and od_title == title and int(formatid) in [1, 50, 410, 900]: - print "Got Exact Match!!!" 
return self.format_results(reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid) else: close_title_match = False close_author_match = False - print "format id is "+str(formatid) for token in title_tokens: - print "attempting to find "+str(token)+" title token" if od_title.lower().find(token.lower()) != -1: - print "matched token" close_title_match = True else: - print "token didn't match" close_title_match = False break for token in author_tokens: - print "attempting to find "+str(token)+" author token" if creators[0].lower().find(token.lower()) != -1: - print "matched token" close_author_match = True else: - print "token didn't match" close_author_match = False break if close_title_match and close_author_match and int(formatid) in [1, 50, 410, 900]: @@ -350,7 +323,6 @@ class OverDrive(Source): def find_ovrdrv_data(self, br, title, author, isbn, ovrdrv_id=None): - print "in find_ovrdrv_data, title is "+str(title)+", author is "+str(author)+", overdrive id is "+str(ovrdrv_id) q = base_url if ovrdrv_id is None: return self.overdrive_search(br, q, title, author) @@ -364,28 +336,22 @@ class OverDrive(Source): Takes either a title/author combo or an Overdrive ID. One of these two must be passed to this function. 
''' - print "starting to_ovrdrv_data" if ovrdrv_id is not None: with cache_lock: ans = ovrdrv_data_cache.get(ovrdrv_id, None) if ans: - print "inside to_ovrdrv_data, cache lookup successful, ans is "+str(ans) return ans elif ans is False: - print "inside to_ovrdrv_data, ans returned False" return None else: ovrdrv_data = self.find_ovrdrv_data(br, title, author, ovrdrv_id) else: try: - print "trying to retrieve data, running find_ovrdrv_data" ovrdrv_data = self.find_ovrdrv_data(br, title, author, ovrdrv_id) - print "ovrdrv_data is "+str(ovrdrv_data) except: import traceback traceback.print_exc() ovrdrv_data = None - print "writing results to ovrdrv_data cache" with cache_lock: ovrdrv_data_cache[ovrdrv_id] = ovrdrv_data if ovrdrv_data else False @@ -402,7 +368,6 @@ class OverDrive(Source): publisher[5], creators[6], reserveid[7], title[8]] ''' - print "inside parse_search_results, writing the metadata results" ovrdrv_id = ovrdrv_data[7] mi.set_identifier('overdrive', ovrdrv_id) @@ -445,7 +410,7 @@ class OverDrive(Source): mi.pubdate = parse_date(pub_date[0].strip()) if lang: mi.language = lang[0].strip() - print "languages is "+str(mi.language) + #if ebook_isbn: # print "ebook isbn is "+str(ebook_isbn[0]) # isbn = check_isbn(ebook_isbn[0].strip()) @@ -454,7 +419,7 @@ class OverDrive(Source): # mi.isbn = isbn if subjects: mi.tags = [tag.strip() for tag in subjects[0].split(',')] - print "tags are "+str(mi.tags) + if desc: desc = desc[0] desc = html.tostring(desc, method='html', encoding=unicode).strip() @@ -468,7 +433,6 @@ class OverDrive(Source): def main(args=sys.argv): - print "running through main tests" import tempfile, os, time tdir = tempfile.gettempdir() br = browser() @@ -490,19 +454,19 @@ def main(args=sys.argv): (None, '9780345509741', 'The Horror Stories of Robert E. Howard', ['Robert E. 
Howard']), # Complex title with initials/dots stripped, some results don't have a cover ]: cpath = os.path.join(tdir, title+'.jpg') - print "cpath is "+cpath + #print "cpath is "+cpath st = time.time() curl = get_cover_url(isbn, title, author, br, ovrdrv_id) - print '\n\n Took ', time.time() - st, ' to get basic metadata\n\n' - if curl is None: - print 'No cover found for', title - else: - print "curl is "+curl - #open(cpath, 'wb').write(br.open_novisit(curl).read()) - #print 'Cover for', title, 'saved to', cpath + #print '\n\n Took ', time.time() - st, ' to get basic metadata\n\n' + #if curl is None: + # print 'No cover found for', title + #else: + # print "curl is "+curl + # open(cpath, 'wb').write(br.open_novisit(curl).read()) + # print 'Cover for', title, 'saved to', cpath st = time.time() - print get_social_metadata(title, author, isbn, ovrdrv_id) - print '\n\n Took ', time.time() - st, ' to get detailed metadata\n\n' + #print get_social_metadata(title, author, isbn, ovrdrv_id) + #print '\n\n Took ', time.time() - st, ' to get detailed metadata\n\n' return 0 From 2b82d4944859d22e56daf21f030586f4fb8977b3 Mon Sep 17 00:00:00 2001 From: Lee <ldolse@yahoo.com> Date: Mon, 18 Apr 2011 20:56:11 +0800 Subject: [PATCH 18/30] fixed multiple author sorting --- .../ebooks/metadata/sources/overdrive.py | 43 ++++++++++++++----- 1 file changed, 33 insertions(+), 10 deletions(-) diff --git a/src/calibre/ebooks/metadata/sources/overdrive.py b/src/calibre/ebooks/metadata/sources/overdrive.py index dd2e8b2a85..42b320745a 100755 --- a/src/calibre/ebooks/metadata/sources/overdrive.py +++ b/src/calibre/ebooks/metadata/sources/overdrive.py @@ -212,10 +212,15 @@ class OverDrive(Source): q_query = q+'default.aspx/SearchByKeyword' q_init_search = q+'SearchResults.aspx' # get first author as string - convert this to a proper cleanup function later - s = Source(None) - author_tokens = list(s.get_author_tokens(author)) - title_tokens = list(s.get_title_tokens(title, False, True)) - + 
print "printing list with author "+str(author)+":" + author_tokens = list(self.get_author_tokens(author, + only_first_author=True)) + print list(author_tokens) + title_tokens = list(self.get_title_tokens(title, False, True)) + print "there are "+str(len(title_tokens))+" title tokens" + for token in title_tokens: + print "cleaned up title token is: "+str(token) + if len(title_tokens) >= len(author_tokens): initial_q = ' '.join(title_tokens) xref_q = '+'.join(author_tokens) @@ -251,41 +256,59 @@ class OverDrive(Source): elif int(m.group('totalrecords')) == 0: return '' + print "\n\nsorting results" return self.sort_ovrdrv_results(raw, title, title_tokens, author, author_tokens) def sort_ovrdrv_results(self, raw, title=None, title_tokens=None, author=None, author_tokens=None, ovrdrv_id=None): + print "\ntitle to search for is "+str(title)+"\nauthor to search for is "+str(author) close_matches = [] raw = re.sub('.*?\[\[(?P<content>.*?)\]\].*', '[[\g<content>]]', raw) results = eval(raw) - + print "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n" + #print results # The search results are either from a keyword search or a multi-format list from a single ID, # sort through the results for closest match/format if results: for reserveid, od_title, subtitle, edition, series, publisher, format, formatid, creators, \ thumbimage, shortdescription, worldcatlink, excerptlink, creatorfile, sorttitle, \ availabletolibrary, availabletoretailer, relevancyrank, unknown1, unknown2, unknown3 in results: + print "this record's title is "+od_title+", subtitle is "+subtitle+", author[s] are "+creators+", series is "+series if ovrdrv_id is not None and int(formatid) in [1, 50, 410, 900]: + print "overdrive id is not None, searching based on format type priority" return self.format_results(reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid) else: creators = creators.split(', ') + print "split creators from results are: "+str(creators)+", there are 
"+str(len(creators))+" total" # if an exact match in a preferred format occurs if creators[0] == author[0] and od_title == title and int(formatid) in [1, 50, 410, 900]: + print "Got Exact Match!!!" return self.format_results(reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid) else: close_title_match = False close_author_match = False + print "format id is "+str(formatid) for token in title_tokens: + print "attempting to find "+str(token)+" title token" if od_title.lower().find(token.lower()) != -1: + print "matched token" close_title_match = True else: + print "token didn't match" close_title_match = False break - for token in author_tokens: - if creators[0].lower().find(token.lower()) != -1: - close_author_match = True - else: - close_author_match = False + for author in creators: + print "matching tokens for "+str(author) + for token in author_tokens: + print "attempting to find "+str(token)+" author token" + if author.lower().find(token.lower()) != -1: + print "matched token" + close_author_match = True + else: + print "token didn't match" + close_author_match = False + break + if close_author_match: break if close_title_match and close_author_match and int(formatid) in [1, 50, 410, 900]: if subtitle and series: From 7b196c762bb63b482bef111fcabe553d74ed8395 Mon Sep 17 00:00:00 2001 From: Lee <ldolse@yahoo.com> Date: Mon, 18 Apr 2011 21:07:26 +0800 Subject: [PATCH 19/30] prioritized results with covers, cleaned up print statements --- .../ebooks/metadata/sources/overdrive.py | 69 ++++++++----------- 1 file changed, 27 insertions(+), 42 deletions(-) diff --git a/src/calibre/ebooks/metadata/sources/overdrive.py b/src/calibre/ebooks/metadata/sources/overdrive.py index 42b320745a..4fc8dbab1b 100755 --- a/src/calibre/ebooks/metadata/sources/overdrive.py +++ b/src/calibre/ebooks/metadata/sources/overdrive.py @@ -55,9 +55,9 @@ class OverDrive(Source): ovrdrv_id = ovrdrv_data[7] if isbn is not None: 
self.cache_isbn_to_identifier(isbn, ovrdrv_id) - + self.get_book_detail(br, ovrdrv_data[1], mi, ovrdrv_id, log) - + result_queue.put(mi) return None @@ -144,7 +144,7 @@ class OverDrive(Source): if author_tokens: q += ('+' if q else '') + build_term('author', author_tokens) - + if isinstance(q, unicode): q = q.encode('utf-8') if not q: @@ -162,7 +162,7 @@ class OverDrive(Source): 'http://spl.lib.overdrive.com/5875E082-4CB2-4689-9426-8509F354AFEF/10/335/en/' ] return choices[random.randint(0, len(choices)-1)] - + def format_results(self, reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid): fix_slashes = re.compile(r'\\/') thumbimage = fix_slashes.sub('/', thumbimage) @@ -181,7 +181,7 @@ class OverDrive(Source): if m: series_num = float(m.group(1)) return [cover_url, social_metadata_url, worldcatlink, series, series_num, publisher, creators, reserveid, title] - + def safe_query(self, br, query_url, post=''): ''' The query must be initialized by loading an empty search results page @@ -212,33 +212,29 @@ class OverDrive(Source): q_query = q+'default.aspx/SearchByKeyword' q_init_search = q+'SearchResults.aspx' # get first author as string - convert this to a proper cleanup function later - print "printing list with author "+str(author)+":" author_tokens = list(self.get_author_tokens(author, only_first_author=True)) - print list(author_tokens) - title_tokens = list(self.get_title_tokens(title, False, True)) - print "there are "+str(len(title_tokens))+" title tokens" - for token in title_tokens: - print "cleaned up title token is: "+str(token) - + title_tokens = list(self.get_title_tokens(title, + strip_joiners=False, strip_subtitle=True)) + if len(title_tokens) >= len(author_tokens): initial_q = ' '.join(title_tokens) xref_q = '+'.join(author_tokens) else: initial_q = ' '.join(author_tokens) xref_q = '+'.join(title_tokens) - + q_xref = q+'SearchResults.svc/GetResults?iDisplayLength=50&sSearch='+xref_q query = 
'{"szKeyword":"'+initial_q+'"}' - + # main query, requires specific Content Type header req = mechanize.Request(q_query) req.add_header('Content-Type', 'application/json; charset=utf-8') br.open_novisit(req, query) - + # initiate the search without messing up the cookiejar self.safe_query(br, q_init_search) - + # get the search results object results = False while results == False: @@ -256,16 +252,13 @@ class OverDrive(Source): elif int(m.group('totalrecords')) == 0: return '' - print "\n\nsorting results" return self.sort_ovrdrv_results(raw, title, title_tokens, author, author_tokens) - - + + def sort_ovrdrv_results(self, raw, title=None, title_tokens=None, author=None, author_tokens=None, ovrdrv_id=None): - print "\ntitle to search for is "+str(title)+"\nauthor to search for is "+str(author) close_matches = [] raw = re.sub('.*?\[\[(?P<content>.*?)\]\].*', '[[\g<content>]]', raw) results = eval(raw) - print "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n" #print results # The search results are either from a keyword search or a multi-format list from a single ID, # sort through the results for closest match/format @@ -273,44 +266,36 @@ class OverDrive(Source): for reserveid, od_title, subtitle, edition, series, publisher, format, formatid, creators, \ thumbimage, shortdescription, worldcatlink, excerptlink, creatorfile, sorttitle, \ availabletolibrary, availabletoretailer, relevancyrank, unknown1, unknown2, unknown3 in results: - print "this record's title is "+od_title+", subtitle is "+subtitle+", author[s] are "+creators+", series is "+series + #print "this record's title is "+od_title+", subtitle is "+subtitle+", author[s] are "+creators+", series is "+series if ovrdrv_id is not None and int(formatid) in [1, 50, 410, 900]: - print "overdrive id is not None, searching based on format type priority" - return self.format_results(reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid) + #print "overdrive id is not None, searching 
based on format type priority" + return self.format_results(reserveid, od_title, subtitle, series, publisher, + creators, thumbimage, worldcatlink, formatid) else: creators = creators.split(', ') - print "split creators from results are: "+str(creators)+", there are "+str(len(creators))+" total" # if an exact match in a preferred format occurs if creators[0] == author[0] and od_title == title and int(formatid) in [1, 50, 410, 900]: - print "Got Exact Match!!!" - return self.format_results(reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid) + return self.format_results(reserveid, od_title, subtitle, series, publisher, + creators, thumbimage, worldcatlink, formatid) else: close_title_match = False close_author_match = False - print "format id is "+str(formatid) for token in title_tokens: - print "attempting to find "+str(token)+" title token" if od_title.lower().find(token.lower()) != -1: - print "matched token" close_title_match = True else: - print "token didn't match" close_title_match = False break for author in creators: - print "matching tokens for "+str(author) for token in author_tokens: - print "attempting to find "+str(token)+" author token" if author.lower().find(token.lower()) != -1: - print "matched token" close_author_match = True else: - print "token didn't match" close_author_match = False break if close_author_match: break - if close_title_match and close_author_match and int(formatid) in [1, 50, 410, 900]: + if close_title_match and close_author_match and int(formatid) in [1, 50, 410, 900] and thumbimage: if subtitle and series: close_matches.insert(0, self.format_results(reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid)) else: @@ -321,18 +306,18 @@ class OverDrive(Source): return '' else: return '' - - + + def overdrive_get_record(self, br, q, ovrdrv_id): search_url = q+'SearchResults.aspx?ReserveID={'+ovrdrv_id+'}' results_url = 
q+'SearchResults.svc/GetResults?sEcho=1&iColumns=18&sColumns=ReserveID%2CTitle%2CSubtitle%2CEdition%2CSeries%2CPublisher%2CFormat%2CFormatID%2CCreators%2CThumbImage%2CShortDescription%2CWorldCatLink%2CExcerptLink%2CCreatorFile%2CSortTitle%2CAvailableToLibrary%2CAvailableToRetailer%2CRelevancyRank&iDisplayStart=0&iDisplayLength=10&sSearch=&bEscapeRegex=true&iSortingCols=1&iSortCol_0=17&sSortDir_0=asc' - + # get the base url to set the proper session cookie br.open_novisit(q) - + # initialize the search self.safe_query(br, search_url) - + # get the results req = mechanize.Request(results_url) req.add_header('X-Requested-With', 'XMLHttpRequest') @@ -385,7 +370,7 @@ class OverDrive(Source): ''' Parse the formatted search results from the initial Overdrive query and add the values to the metadta. - + The list object has these values: [cover_url[0], social_metadata_url[1], worldcatlink[2], series[3], series_num[4], publisher[5], creators[6], reserveid[7], title[8]] From f7535a51edf98b802218bffd2d2eccbbc5bf913f Mon Sep 17 00:00:00 2001 From: Lee <ldolse@yahoo.com> Date: Mon, 18 Apr 2011 21:30:35 +0800 Subject: [PATCH 20/30] re-initialize the cookie jar for book records that already have an overdrive identifier --- src/calibre/ebooks/metadata/sources/overdrive.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/calibre/ebooks/metadata/sources/overdrive.py b/src/calibre/ebooks/metadata/sources/overdrive.py index 4fc8dbab1b..1b237ad683 100755 --- a/src/calibre/ebooks/metadata/sources/overdrive.py +++ b/src/calibre/ebooks/metadata/sources/overdrive.py @@ -312,6 +312,9 @@ class OverDrive(Source): search_url = q+'SearchResults.aspx?ReserveID={'+ovrdrv_id+'}' results_url = 
q+'SearchResults.svc/GetResults?sEcho=1&iColumns=18&sColumns=ReserveID%2CTitle%2CSubtitle%2CEdition%2CSeries%2CPublisher%2CFormat%2CFormatID%2CCreators%2CThumbImage%2CShortDescription%2CWorldCatLink%2CExcerptLink%2CCreatorFile%2CSortTitle%2CAvailableToLibrary%2CAvailableToRetailer%2CRelevancyRank&iDisplayStart=0&iDisplayLength=10&sSearch=&bEscapeRegex=true&iSortingCols=1&iSortCol_0=17&sSortDir_0=asc' + # re-initialize the cookiejar to so that it's clean + clean_cj = mechanize.CookieJar() + br.set_cookiejar(clean_cj) # get the base url to set the proper session cookie br.open_novisit(q) From 0991d2ca3ccccf5a9226007e9fd8d7eafdcd624d Mon Sep 17 00:00:00 2001 From: Lee <ldolse@yahoo.com> Date: Mon, 18 Apr 2011 22:53:19 +0800 Subject: [PATCH 21/30] get rid of test code that applied to the old plugin, set defaults --- .../ebooks/metadata/sources/overdrive.py | 40 +++---------------- 1 file changed, 5 insertions(+), 35 deletions(-) diff --git a/src/calibre/ebooks/metadata/sources/overdrive.py b/src/calibre/ebooks/metadata/sources/overdrive.py index 1b237ad683..56a905de03 100755 --- a/src/calibre/ebooks/metadata/sources/overdrive.py +++ b/src/calibre/ebooks/metadata/sources/overdrive.py @@ -39,6 +39,11 @@ class OverDrive(Source): supports_gzip_transfer_encoding = False cached_cover_url_is_reliable = True + def __init__(self, *args, **kwargs): + Source.__init__(self, *args, **kwargs) + self.prefs.defaults['ignore_fields'] =['tags', 'pubdate', 'comments', 'identifier:isbn', 'language'] + + def identify(self, log, result_queue, abort, title=None, authors=None, # {{{ identifiers={}, timeout=30): ovrdrv_id = identifiers.get('overdrive', None) @@ -444,41 +449,6 @@ class OverDrive(Source): def main(args=sys.argv): - import tempfile, os, time - tdir = tempfile.gettempdir() - br = browser() - for ovrdrv_id, isbn, title, author in [ - #(None, '0899661343', 'On the Road', ['Jack Kerouac']), # basic test, no series, single author - #(None, '9780061952838', 'The Fellowship of the 
Ring', ['J. R. R. Tolkien']), # Series test, multi-author - #(None, '9780061952838', 'The Two Towers (The Lord of the Rings, Book II)', ['J. R. R. Tolkien']), # Series test, book 2 - #(None, '9780618153985', 'The Fellowship of the Ring (The Lord of the Rings, Part 1)', ['J.R.R. Tolkien']), - #('57844706-20fa-4ace-b5ee-3470b1b52173', None, 'The Two Towers', ['J. R. R. Tolkien']), # Series test, w/ ovrdrv id - #(None, '9780345505057', 'Deluge', ['Anne McCaffrey']) # Multiple authors - #(None, None, 'Deluge', ['Anne McCaffrey']) # Empty ISBN - #(None, None, 'On the Road', ['Jack Kerouac']), # Nonetype ISBN - #(None, '9780345435279', 'A Caress of Twilight', ['Laurell K. Hamilton']), - #(None, '9780606087230', 'The Omnivore\'s Dilemma : A Natural History of Four Meals', ['Michael Pollan']), # Subtitle colon - #(None, '9780061747649', 'Mental_Floss Presents: Condensed Knowledge', ['Will Pearson', 'Mangesh Hattikudur']), - #(None, '9781400050802', 'The Zombie Survival Guide', ['Max Brooks']), # Two books with this title by this author - #(None, '9781775414315', 'The Worst Journey in the World / Antarctic 1910-1913', ['Apsley Cherry-Garrard']), # Garbage sub-title - #(None, '9780440335160', 'Outlander', ['Diana Gabaldon']), # Returns lots of results to sort through to get the best match - (None, '9780345509741', 'The Horror Stories of Robert E. Howard', ['Robert E. 
Howard']), # Complex title with initials/dots stripped, some results don't have a cover - ]: - cpath = os.path.join(tdir, title+'.jpg') - #print "cpath is "+cpath - st = time.time() - curl = get_cover_url(isbn, title, author, br, ovrdrv_id) - #print '\n\n Took ', time.time() - st, ' to get basic metadata\n\n' - #if curl is None: - # print 'No cover found for', title - #else: - # print "curl is "+curl - # open(cpath, 'wb').write(br.open_novisit(curl).read()) - # print 'Cover for', title, 'saved to', cpath - st = time.time() - #print get_social_metadata(title, author, isbn, ovrdrv_id) - #print '\n\n Took ', time.time() - st, ' to get detailed metadata\n\n' - return 0 if __name__ == '__main__': From 9278da958c420cfe4cfaeb37b5302ccef8d1c358 Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Mon, 18 Apr 2011 11:06:19 -0600 Subject: [PATCH 22/30] Bulk metadata download: Make the confirm dialog more useful --- src/calibre/gui2/actions/edit_metadata.py | 10 +-- src/calibre/gui2/metadata/bulk_download2.py | 81 +++++++++++++++++---- 2 files changed, 71 insertions(+), 20 deletions(-) diff --git a/src/calibre/gui2/actions/edit_metadata.py b/src/calibre/gui2/actions/edit_metadata.py index 18a73fb282..9d4d3891ca 100644 --- a/src/calibre/gui2/actions/edit_metadata.py +++ b/src/calibre/gui2/actions/edit_metadata.py @@ -37,8 +37,6 @@ class EditMetadataAction(InterfaceAction): md.addSeparator() if test_eight_code: dall = self.download_metadata - dident = partial(self.download_metadata, covers=False) - dcovers = partial(self.download_metadata, identify=False) else: dall = partial(self.download_metadata_old, False, covers=True) dident = partial(self.download_metadata_old, False, covers=False) @@ -47,9 +45,9 @@ class EditMetadataAction(InterfaceAction): md.addAction(_('Download metadata and covers'), dall, Qt.ControlModifier+Qt.Key_D) - md.addAction(_('Download only metadata'), dident) - md.addAction(_('Download only covers'), dcovers) if not test_eight_code: + 
md.addAction(_('Download only metadata'), dident) + md.addAction(_('Download only covers'), dcovers) md.addAction(_('Download only social metadata'), partial(self.download_metadata_old, False, covers=False, set_metadata=False, set_social_metadata=True)) @@ -80,7 +78,7 @@ class EditMetadataAction(InterfaceAction): self.qaction.setEnabled(enabled) self.action_merge.setEnabled(enabled) - def download_metadata(self, identify=True, covers=True, ids=None): + def download_metadata(self, ids=None): if ids is None: rows = self.gui.library_view.selectionModel().selectedRows() if not rows or len(rows) == 0: @@ -90,7 +88,7 @@ class EditMetadataAction(InterfaceAction): ids = [db.id(row.row()) for row in rows] from calibre.gui2.metadata.bulk_download2 import start_download start_download(self.gui, ids, - Dispatcher(self.bulk_metadata_downloaded), identify, covers) + Dispatcher(self.bulk_metadata_downloaded)) def bulk_metadata_downloaded(self, job): if job.failed: diff --git a/src/calibre/gui2/metadata/bulk_download2.py b/src/calibre/gui2/metadata/bulk_download2.py index 5f0af1b316..11cbc65680 100644 --- a/src/calibre/gui2/metadata/bulk_download2.py +++ b/src/calibre/gui2/metadata/bulk_download2.py @@ -12,7 +12,8 @@ from functools import partial from itertools import izip from PyQt4.Qt import (QIcon, QDialog, QVBoxLayout, QTextBrowser, QSize, - QDialogButtonBox, QApplication, QTimer, QLabel, QProgressBar) + QDialogButtonBox, QApplication, QTimer, QLabel, QProgressBar, + QGridLayout, QPixmap, Qt) from calibre.gui2.dialogs.message_box import MessageBox from calibre.gui2.threaded_jobs import ThreadedJob @@ -25,37 +26,86 @@ from calibre.ebooks.metadata.book.base import Metadata from calibre.customize.ui import metadata_plugins from calibre.ptempfile import PersistentTemporaryFile +# Start download {{{ def show_config(gui, parent): from calibre.gui2.preferences import show_config_widget show_config_widget('Sharing', 'Metadata download', parent=parent, gui=gui, never_shutdown=True) 
-def start_download(gui, ids, callback, identify, covers): - q = MessageBox(MessageBox.QUESTION, _('Schedule download?'), +class ConfirmDialog(QDialog): + + def __init__(self, ids, parent): + QDialog.__init__(self, parent) + self.setWindowTitle(_('Schedule download?')) + self.setWindowIcon(QIcon(I('dialog_question.png'))) + + l = self.l = QGridLayout() + self.setLayout(l) + + i = QLabel(self) + i.setPixmap(QPixmap(I('dialog_question.png'))) + l.addWidget(i, 0, 0) + + t = QLabel( '<p>'+_('The download of metadata for the <b>%d selected book(s)</b> will' ' run in the background. Proceed?')%len(ids) + '<p>'+_('You can monitor the progress of the download ' 'by clicking the rotating spinner in the bottom right ' 'corner.') + '<p>'+_('When the download completes you will be asked for' - ' confirmation before calibre applies the downloaded metadata.'), - show_copy_button=False, parent=gui) - b = q.bb.addButton(_('Configure download'), q.bb.ActionRole) - b.setIcon(QIcon(I('config.png'))) - b.clicked.connect(partial(show_config, gui, q)) - q.det_msg_toggle.setVisible(False) + ' confirmation before calibre applies the downloaded metadata.') + ) + t.setWordWrap(True) + l.addWidget(t, 0, 1) + l.setColumnStretch(0, 1) + l.setColumnStretch(1, 100) - ret = q.exec_() - b.clicked.disconnect() - if ret != q.Accepted: + self.identify = self.covers = True + self.bb = QDialogButtonBox(QDialogButtonBox.Cancel) + self.bb.rejected.connect(self.reject) + b = self.bb.addButton(_('Download only metadata'), + self.bb.AcceptRole) + b.clicked.connect(self.only_metadata) + b.setIcon(QIcon(I('edit_input.png'))) + b = self.bb.addButton(_('Download only covers'), + self.bb.AcceptRole) + b.clicked.connect(self.only_covers) + b.setIcon(QIcon(I('default_cover.png'))) + b = self.b = self.bb.addButton(_('Configure download'), self.bb.ActionRole) + b.setIcon(QIcon(I('config.png'))) + b.clicked.connect(partial(show_config, parent, self)) + l.addWidget(self.bb, 1, 0, 1, 2) + b = 
self.bb.addButton(_('Download both'), + self.bb.AcceptRole) + b.clicked.connect(self.accept) + b.setDefault(True) + b.setAutoDefault(True) + b.setIcon(QIcon(I('ok.png'))) + + self.resize(self.sizeHint()) + b.setFocus(Qt.OtherFocusReason) + + def only_metadata(self): + self.covers = False + self.accept() + + def only_covers(self): + self.identify = False + self.accept() + +def start_download(gui, ids, callback): + d = ConfirmDialog(ids, gui) + ret = d.exec_() + d.b.clicked.disconnect() + if ret != d.Accepted: return job = ThreadedJob('metadata bulk download', _('Download metadata for %d books')%len(ids), - download, (ids, gui.current_db, identify, covers), {}, callback) + download, (ids, gui.current_db, d.identify, d.covers), {}, callback) gui.job_manager.run_threaded_job(job) gui.status_bar.show_message(_('Metadata download started'), 3000) - +# }}} class ViewLog(QDialog): # {{{ @@ -93,6 +143,7 @@ def view_log(job, parent): # }}} +# Apply downloaded metadata {{{ class ApplyDialog(QDialog): def __init__(self, id_map, gui): @@ -248,6 +299,8 @@ def proceed(gui, job): q.show() q.finished.connect(partial(apply_metadata, job, gui, q)) +# }}} + def merge_result(oldmi, newmi): dummy = Metadata(_('Unknown')) for f in msprefs['ignore_fields']: From ab1ad20dba92e3be931a80ae2eaf50625341564c Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Mon, 18 Apr 2011 11:11:11 -0600 Subject: [PATCH 23/30] ... 
--- src/calibre/gui2/metadata/bulk_download2.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/calibre/gui2/metadata/bulk_download2.py b/src/calibre/gui2/metadata/bulk_download2.py index 11cbc65680..a95c8b52c7 100644 --- a/src/calibre/gui2/metadata/bulk_download2.py +++ b/src/calibre/gui2/metadata/bulk_download2.py @@ -63,19 +63,19 @@ class ConfirmDialog(QDialog): self.identify = self.covers = True self.bb = QDialogButtonBox(QDialogButtonBox.Cancel) self.bb.rejected.connect(self.reject) - b = self.bb.addButton(_('Download only metadata'), + b = self.bb.addButton(_('Download only &metadata'), self.bb.AcceptRole) b.clicked.connect(self.only_metadata) b.setIcon(QIcon(I('edit_input.png'))) - b = self.bb.addButton(_('Download only covers'), + b = self.bb.addButton(_('Download only &covers'), self.bb.AcceptRole) b.clicked.connect(self.only_covers) b.setIcon(QIcon(I('default_cover.png'))) - b = self.b = self.bb.addButton(_('Configure download'), self.bb.ActionRole) + b = self.b = self.bb.addButton(_('&Configure download'), self.bb.ActionRole) b.setIcon(QIcon(I('config.png'))) b.clicked.connect(partial(show_config, parent, self)) l.addWidget(self.bb, 1, 0, 1, 2) - b = self.bb.addButton(_('Download both'), + b = self.bb.addButton(_('Download &both'), self.bb.AcceptRole) b.clicked.connect(self.accept) b.setDefault(True) From 8611632ea4a110f0b2003b994dd05bab96d29597 Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Mon, 18 Apr 2011 11:51:09 -0600 Subject: [PATCH 24/30] Nicer implementation of apply metadata dialog --- src/calibre/gui2/metadata/bulk_download2.py | 40 ++++++++++++--------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/src/calibre/gui2/metadata/bulk_download2.py b/src/calibre/gui2/metadata/bulk_download2.py index a95c8b52c7..4aa4561078 100644 --- a/src/calibre/gui2/metadata/bulk_download2.py +++ b/src/calibre/gui2/metadata/bulk_download2.py @@ -146,7 +146,7 @@ def view_log(job, parent): # 
Apply downloaded metadata {{{ class ApplyDialog(QDialog): - def __init__(self, id_map, gui): + def __init__(self, gui): QDialog.__init__(self, gui) self.l = l = QVBoxLayout() @@ -155,27 +155,33 @@ class ApplyDialog(QDialog): self.pb = QProgressBar(self) l.addWidget(self.pb) - self.pb.setMinimum(0) - self.pb.setMaximum(len(id_map)) self.bb = QDialogButtonBox(QDialogButtonBox.Cancel) self.bb.rejected.connect(self.reject) - self.bb.accepted.connect(self.accept) l.addWidget(self.bb) self.gui = gui + self.timer = QTimer(self) + self.timer.timeout.connect(self.do_one) + + def start(self, id_map): self.id_map = list(id_map.iteritems()) self.current_idx = 0 - self.failures = [] self.ids = [] self.canceled = False - - QTimer.singleShot(20, self.do_one) + self.pb.setMinimum(0) + self.pb.setMaximum(len(id_map)) + self.timer.start(50) def do_one(self): if self.canceled: return + if self.current_idx >= len(self.id_map): + self.timer.stop() + self.finalize() + return + i, mi = self.id_map[self.current_idx] db = self.gui.current_db try: @@ -195,15 +201,11 @@ class ApplyDialog(QDialog): pass self.pb.setValue(self.pb.value()+1) - - if self.current_idx >= len(self.id_map) - 1: - self.finalize() - else: - self.current_idx += 1 - QTimer.singleShot(20, self.do_one) + self.current_idx += 1 def reject(self): self.canceled = True + self.timer.stop() QDialog.reject(self) def finalize(self): @@ -220,17 +222,18 @@ class ApplyDialog(QDialog): title += ' - ' + authors_to_string(authors) msg.append(title+'\n\n'+tb+'\n'+('*'*80)) - error_dialog(self, _('Some failures'), + parent = self if self.isVisible() else self.parent() + error_dialog(parent, _('Some failures'), _('Failed to apply updated metadata for some books' ' in your library. 
Click "Show Details" to see ' 'details.'), det_msg='\n\n'.join(msg), show=True) - self.accept() if self.ids: cr = self.gui.library_view.currentIndex().row() self.gui.library_view.model().refresh_ids( self.ids, cr) if self.gui.cover_flow: self.gui.cover_flow.dataChanged() + self.accept() _amd = None def apply_metadata(job, gui, q, result): @@ -268,8 +271,11 @@ def apply_metadata(job, gui, q, result): 'Do you want to proceed?'), det_msg='\n'.join(modified)): return - _amd = ApplyDialog(id_map, gui) - _amd.exec_() + if _amd is None: + _amd = ApplyDialog(gui) + _amd.start(id_map) + if len(id_map) > 3: + _amd.exec_() def proceed(gui, job): gui.status_bar.show_message(_('Metadata download completed'), 3000) From b0ec35f0d310d861aa72a423a6337acd0bb25da5 Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Mon, 18 Apr 2011 14:06:07 -0600 Subject: [PATCH 25/30] ... --- src/calibre/gui2/metadata/single_download.py | 11 ++++------- src/calibre/gui2/preferences/__init__.py | 8 +++++++- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/src/calibre/gui2/metadata/single_download.py b/src/calibre/gui2/metadata/single_download.py index 7e30f02420..c4e13a90f8 100644 --- a/src/calibre/gui2/metadata/single_download.py +++ b/src/calibre/gui2/metadata/single_download.py @@ -30,7 +30,6 @@ from calibre.ebooks.metadata.book.base import Metadata from calibre.gui2 import error_dialog, NONE from calibre.utils.date import utcnow, fromordinal, format_date from calibre.library.comments import comments_to_html -from calibre.constants import islinux from calibre import force_unicode # }}} @@ -117,12 +116,10 @@ class CoverDelegate(QStyledItemDelegate): # {{{ def paint(self, painter, option, index): QStyledItemDelegate.paint(self, painter, option, index) - if islinux: - # On linux for some reason the selected color is drawn on top of - # the decoration - style = QApplication.style() - style.drawItemPixmap(painter, option.rect, Qt.AlignTop|Qt.AlignHCenter, - 
QPixmap(index.data(Qt.DecorationRole))) + # Ensure the cover is rendered over any selection rect + style = QApplication.style() + style.drawItemPixmap(painter, option.rect, Qt.AlignTop|Qt.AlignHCenter, + QPixmap(index.data(Qt.DecorationRole))) if self.timer.isActive() and index.data(Qt.UserRole).toBool(): rect = QRect(0, 0, self.spinner_width, self.spinner_width) rect.moveCenter(option.rect.center()) diff --git a/src/calibre/gui2/preferences/__init__.py b/src/calibre/gui2/preferences/__init__.py index 649a58448d..5b0a05ba40 100644 --- a/src/calibre/gui2/preferences/__init__.py +++ b/src/calibre/gui2/preferences/__init__.py @@ -337,7 +337,13 @@ def show_config_widget(category, name, gui=None, show_restart_msg=False, bb.button(bb.RestoreDefaults).setEnabled(w.supports_restoring_to_defaults) bb.button(bb.Apply).setEnabled(False) bb.button(bb.Apply).clicked.connect(d.accept) - w.changed_signal.connect(lambda : bb.button(bb.Apply).setEnabled(True)) + def onchange(): + b = bb.button(bb.Apply) + b.setEnabled(True) + b.setDefault(True) + b.setAutoDefault(True) + w.changed_signal.connect(onchange) + bb.button(bb.Cancel).setFocus(True) l = QVBoxLayout() d.setLayout(l) l.addWidget(w) From 97c5bf39c13ec466712869526bc82d9f4566ef62 Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Mon, 18 Apr 2011 14:08:03 -0600 Subject: [PATCH 26/30] ... 
--- src/calibre/devices/android/driver.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/calibre/devices/android/driver.py b/src/calibre/devices/android/driver.py index 44d9bc1e49..7fe246f450 100644 --- a/src/calibre/devices/android/driver.py +++ b/src/calibre/devices/android/driver.py @@ -108,10 +108,10 @@ class ANDROID(USBMS): 'SGH-T849', '_MB300', 'A70S', 'S_ANDROID', 'A101IT', 'A70H', 'IDEOS_TABLET', 'MYTOUCH_4G', 'UMS_COMPOSITE', 'SCH-I800_CARD', '7', 'A956', 'A955', 'A43', 'ANDROID_PLATFORM', 'TEGRA_2', - 'MB860', 'MULTI-CARD', 'MID7015A'] + 'MB860', 'MULTI-CARD', 'MID7015A', 'INCREDIBLE'] WINDOWS_CARD_A_MEM = ['ANDROID_PHONE', 'GT-I9000_CARD', 'SGH-I897', 'FILE-STOR_GADGET', 'SGH-T959', 'SAMSUNG_ANDROID', 'GT-P1000_CARD', - 'A70S', 'A101IT', '7'] + 'A70S', 'A101IT', '7', 'INCREDIBLE'] OSX_MAIN_MEM = 'Android Device Main Memory' From b79faeff5691fb11e110c24bad69cb60fc05ce82 Mon Sep 17 00:00:00 2001 From: Charles Haley <> Date: Mon, 18 Apr 2011 21:38:27 +0100 Subject: [PATCH 27/30] Change author_sort_copy_method default from invert to comma. --- resources/default_tweaks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/resources/default_tweaks.py b/resources/default_tweaks.py index c4c951f980..091aa9a34d 100644 --- a/resources/default_tweaks.py +++ b/resources/default_tweaks.py @@ -48,7 +48,7 @@ authors_completer_append_separator = False # When this tweak is changed, the author_sort values stored with each author # must be recomputed by right-clicking on an author in the left-hand tags pane, # selecting 'manage authors', and pressing 'Recalculate all author sort values'. 
-author_sort_copy_method = 'invert' +author_sort_copy_method = 'comma' #: Use author sort in Tag Browser # Set which author field to display in the tags pane (the list of authors, From b6f44d0b7c0de9b2b9a6bfbb29d2874ea9718e7b Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Mon, 18 Apr 2011 18:52:14 -0400 Subject: [PATCH 28/30] Store: Search allows for main window location and boolean filtering. --- src/calibre/gui2/store/search.py | 91 +++++++++++++++++++++++++++++++- 1 file changed, 89 insertions(+), 2 deletions(-) diff --git a/src/calibre/gui2/store/search.py b/src/calibre/gui2/store/search.py index 1d263959ef..ce74d52547 100644 --- a/src/calibre/gui2/store/search.py +++ b/src/calibre/gui2/store/search.py @@ -8,6 +8,7 @@ __docformat__ = 'restructuredtext en' import re import time +import traceback from contextlib import closing from random import shuffle from threading import Thread @@ -20,9 +21,12 @@ from calibre import browser from calibre.gui2 import NONE from calibre.gui2.progress_indicator import ProgressIndicator from calibre.gui2.store.search_ui import Ui_Dialog +from calibre.library.caches import _match, CONTAINS_MATCH, EQUALS_MATCH, \ + REGEXP_MATCH from calibre.utils.config import DynamicConfig from calibre.utils.icu import sort_key from calibre.utils.magick.draw import thumbnail +from calibre.utils.search_query_parser import SearchQueryParser HANG_TIME = 75000 # milliseconds seconds TIMEOUT = 75 # seconds @@ -290,11 +294,15 @@ class SearchThread(Thread): while self._run and not self.tasks.empty(): try: query, store_name, store_plugin, timeout = self.tasks.get() - for res in store_plugin.search(query, timeout=timeout): + squery = query + for loc in SearchFilter.USABLE_LOCATIONS: + squery = re.sub(r'%s:"?(?P<a>[^\s"]+)"?' 
% loc, '\g<a>', squery) + for res in store_plugin.search(squery, timeout=timeout): if not self._run: return res.store_name = store_name - self.results.put(res) + if SearchFilter(res).parse(query): + self.results.put(res) self.tasks.task_done() except: pass @@ -450,3 +458,82 @@ class Matches(QAbstractItemModel): if reset: self.reset() + +class SearchFilter(SearchQueryParser): + + USABLE_LOCATIONS = [ + 'all', + 'author', + 'authors', + 'cover', + 'price', + 'title', + 'store', + ] + + def __init__(self, search_result): + SearchQueryParser.__init__(self, locations=self.USABLE_LOCATIONS) + self.search_result = search_result + + def universal_set(self): + return set([self.search_result]) + + def get_matches(self, location, query): + location = location.lower().strip() + if location == 'authors': + location = 'author' + + matchkind = CONTAINS_MATCH + if len(query) > 1: + if query.startswith('\\'): + query = query[1:] + elif query.startswith('='): + matchkind = EQUALS_MATCH + query = query[1:] + elif query.startswith('~'): + matchkind = REGEXP_MATCH + query = query[1:] + if matchkind != REGEXP_MATCH: ### leave case in regexps because it can be significant e.g. 
\S \W \D + query = query.lower() + + if location not in self.USABLE_LOCATIONS: + return set([]) + matches = set([]) + all_locs = set(self.USABLE_LOCATIONS) - set(['all']) + locations = all_locs if location == 'all' else [location] + q = { + 'author': self.search_result.author.lower(), + 'cover': self.search_result.cover_url, + 'format': '', + 'price': self.search_result.price, + 'store': self.search_result.store_name.lower(), + 'title': self.search_result.title.lower(), + } + for x in ('author', 'format'): + q[x+'s'] = q[x] + for locvalue in locations: + ac_val = q[locvalue] + if query == 'true': + if ac_val is not None: + matches.add(self.search_result) + continue + if query == 'false': + if ac_val is None: + matches.add(self.search_result) + continue + try: + ### Can't separate authors because comma is used for name sep and author sep + ### Exact match might not get what you want. For that reason, turn author + ### exactmatch searches into contains searches. + if locvalue == 'author' and matchkind == EQUALS_MATCH: + m = CONTAINS_MATCH + else: + m = matchkind + + vals = [ac_val] + if _match(query, vals, m): + matches.add(self.search_result) + break + except ValueError: # Unicode errors + traceback.print_exc() + return matches From 234248cd23826764240e6ea1a4ac91c02cc23371 Mon Sep 17 00:00:00 2001 From: John Schember <john@nachtimwald.com> Date: Mon, 18 Apr 2011 18:57:28 -0400 Subject: [PATCH 29/30] Store: Fix issue with using proxy when an arument is None. 
--- src/calibre/gui2/store/web_control.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/calibre/gui2/store/web_control.py b/src/calibre/gui2/store/web_control.py index 874328f872..0b79c526a8 100644 --- a/src/calibre/gui2/store/web_control.py +++ b/src/calibre/gui2/store/web_control.py @@ -31,10 +31,14 @@ class NPWebView(QWebView): proxy_parts = urlparse(http_proxy) proxy = QNetworkProxy() proxy.setType(QNetworkProxy.HttpProxy) - proxy.setUser(proxy_parts.username) - proxy.setPassword(proxy_parts.password) - proxy.setHostName(proxy_parts.hostname) - proxy.setPort(proxy_parts.port) + if proxy_parts.username: + proxy.setUser(proxy_parts.username) + if proxy_parts.password: + proxy.setPassword(proxy_parts.password) + if proxy_parts.hostname: + proxy.setHostName(proxy_parts.hostname) + if proxy_parts.port: + proxy.setPort(proxy_parts.port) self.page().networkAccessManager().setProxy(proxy) self.page().setForwardUnsupportedContent(True) From 8d174eaffdfcda971885b80e0705bd221ea11f79 Mon Sep 17 00:00:00 2001 From: Kovid Goyal <kovid@kovidgoyal.net> Date: Mon, 18 Apr 2011 20:56:56 -0600 Subject: [PATCH 30/30] ... --- src/calibre/ebooks/metadata/sources/base.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/calibre/ebooks/metadata/sources/base.py b/src/calibre/ebooks/metadata/sources/base.py index 37407a0656..86a9fe1133 100644 --- a/src/calibre/ebooks/metadata/sources/base.py +++ b/src/calibre/ebooks/metadata/sources/base.py @@ -377,8 +377,9 @@ class Source(Plugin): This URL must be browseable to by a human using a browser. It is meant to provide a clickable link for the user to easily visit the books page at this source. - If no URL is found, return None. This method must be quick, either it - should construct the URL using a known URL scheme or use a cached URL. + If no URL is found, return None. 
This method must be quick, and + consistent, so only implement it if it is possible to construct the URL + from a known scheme given identifiers. ''' return None