port overdrive plugin to 8.x framework, remove from 7.x framework

Lee 2011-04-18 18:06:10 +08:00
parent 34986d6774
commit 09da88b6d1
6 changed files with 516 additions and 513 deletions

src/calibre/customize/builtins.py

@@ -633,14 +633,14 @@ if test_eight_code:
     # }}}
 else:
     from calibre.ebooks.metadata.fetch import GoogleBooks, ISBNDB, Amazon, \
-        KentDistrictLibrary, Overdrive
+        KentDistrictLibrary
     from calibre.ebooks.metadata.douban import DoubanBooks
     from calibre.ebooks.metadata.nicebooks import NiceBooks, NiceBooksCovers
     from calibre.ebooks.metadata.covers import OpenLibraryCovers, \
-            AmazonCovers, DoubanCovers, OverdriveCovers
-    plugins += [GoogleBooks, ISBNDB, Amazon, Overdrive,
-            OpenLibraryCovers, AmazonCovers, DoubanCovers, OverdriveCovers,
+            AmazonCovers, DoubanCovers
+    plugins += [GoogleBooks, ISBNDB, Amazon,
+            OpenLibraryCovers, AmazonCovers, DoubanCovers,
             NiceBooksCovers, KentDistrictLibrary, DoubanBooks, NiceBooks]

 plugins += [

src/calibre/ebooks/metadata/covers.py

@@ -151,33 +151,6 @@ class AmazonCovers(CoverDownload): # {{{

 # }}}

-class OverdriveCovers(CoverDownload): # {{{
-
-    name = 'overdrive.com covers'
-    description = _('Download covers from Overdrive')
-    author = 'Kovid Goyal'
-
-    def has_cover(self, mi, ans, timeout=5.):
-        if not mi.authors or not mi.title:
-            return False
-        return True
-
-    def get_covers(self, mi, result_queue, abort, timeout=5.):
-        if not mi.isbn:
-            return
-        from calibre.ebooks.metadata.overdrive import get_cover_url
-        br = browser()
-        try:
-            url = get_cover_url(mi.isbn, mi.title, mi.authors, br)
-            cover_data = br.open_novisit(url).read()
-            result_queue.put((True, cover_data, 'jpg', self.name))
-        except Exception, e:
-            result_queue.put((False, self.exception_to_string(e),
-                traceback.format_exc(), self.name))
-
-# }}}
-
 def check_for_cover(mi, timeout=5.): # {{{
     from calibre.customize.ui import cover_sources
     ans = Event()
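
For context, the 7.x cover pipeline drove a CoverDownload plugin like the one removed above by handing it a Metadata object and a result queue. A minimal sketch of that calling convention, assuming only the signatures shown here (the instantiation style and test values are illustrative):

    from Queue import Queue
    from threading import Event
    from calibre.ebooks.metadata.book.base import Metadata

    mi = Metadata('On the Road', ['Jack Kerouac'])
    mi.isbn = '0899661343'                    # hypothetical test book
    plugin = OverdriveCovers(None)            # constructed with None, like Source(None) below
    rq, ans, abort = Queue(), Event(), Event()
    if plugin.has_cover(mi, ans):             # only checks that title and authors exist
        plugin.get_covers(mi, rq, abort)
        ok, data, fmt, name = rq.get()        # (True, jpeg bytes, 'jpg', plugin name) on success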

src/calibre/ebooks/metadata/fetch.py

@@ -250,27 +250,6 @@ class Amazon(MetadataSource): # {{{

 # }}}

-class Overdrive(MetadataSource): # {{{
-
-    name = 'Overdrive'
-    metadata_type = 'social'
-    description = _('Downloads metadata from the Overdrive library network')
-
-    has_html_comments = True
-
-    def fetch(self):
-        if not self.isbn:
-            return
-        from calibre.ebooks.metadata.overdrive import get_social_metadata
-        try:
-            self.results = get_social_metadata(self.title, self.book_author, self.isbn)
-        except Exception, e:
-            self.exception = e
-            self.tb = traceback.format_exc()
-
-# }}}
-
 class KentDistrictLibrary(MetadataSource): # {{{

     name = 'Kent District Library'
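
Likewise, the removed 7.x MetadataSource subclass was driven by attribute injection: the framework set title/author/isbn on the instance and then called fetch(). A rough sketch of that flow, with the attribute names taken from the class above and the surrounding setup assumed:

    plugin = Overdrive(None)
    plugin.title = 'On the Road'              # hypothetical test values
    plugin.book_author = 'Jack Kerouac'
    plugin.isbn = '0899661343'
    plugin.results = plugin.exception = plugin.tb = None
    plugin.fetch()
    if plugin.exception is None:
        print plugin.results                  # a Metadata object on success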

src/calibre/ebooks/metadata/overdrive.py (deleted)

@@ -1,459 +0,0 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
'''
Fetch metadata using Overdrive Content Reserve
'''
import sys, re, random, urllib, mechanize, copy
from threading import RLock
from lxml import html, etree
from lxml.html import soupparser
from calibre import browser
from calibre.ebooks.metadata import check_isbn
from calibre.ebooks.metadata.sources.base import Source
from calibre.ebooks.metadata.book.base import Metadata
from calibre.ebooks.chardet import xml_to_unicode
from calibre.library.comments import sanitize_comments_html
ovrdrv_data_cache = {}
cover_url_cache = {}
cache_lock = RLock()
base_url = 'http://search.overdrive.com/'
def create_query(self, title=None, authors=None, identifiers={}):
q = ''
if title or authors:
def build_term(prefix, parts):
return ' '.join('in'+prefix + ':' + x for x in parts)
title_tokens = list(self.get_title_tokens(title, False))
if title_tokens:
q += build_term('title', title_tokens)
author_tokens = self.get_author_tokens(authors,
only_first_author=True)
if author_tokens:
q += ('+' if q else '') + build_term('author',
author_tokens)
if isinstance(q, unicode):
q = q.encode('utf-8')
if not q:
return None
return BASE_URL+urlencode({
'q':q,
})
def get_base_referer():
choices = [
'http://overdrive.chipublib.org/82DC601D-7DDE-4212-B43A-09D821935B01/10/375/en/',
'http://emedia.clevnet.org/9D321DAD-EC0D-490D-BFD8-64AE2C96ECA8/10/241/en/',
'http://singapore.lib.overdrive.com/F11D55BE-A917-4D63-8111-318E88B29740/10/382/en/',
'http://ebooks.nypl.org/20E48048-A377-4520-BC43-F8729A42A424/10/257/en/',
'http://spl.lib.overdrive.com/5875E082-4CB2-4689-9426-8509F354AFEF/10/335/en/'
]
return choices[random.randint(0, len(choices)-1)]
def format_results(reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid):
fix_slashes = re.compile(r'\\/')
thumbimage = fix_slashes.sub('/', thumbimage)
worldcatlink = fix_slashes.sub('/', worldcatlink)
cover_url = re.sub('(?P<img>(Ima?g(eType-)?))200', '\g<img>100', thumbimage)
social_metadata_url = base_url+'TitleInfo.aspx?ReserveID='+reserveid+'&FormatID='+formatid
series_num = ''
if not series:
if subtitle:
title = od_title+': '+subtitle
else:
title = od_title
else:
title = od_title
m = re.search("([0-9]+$)", subtitle)
if m:
series_num = float(m.group(1))
return [cover_url, social_metadata_url, worldcatlink, series, series_num, publisher, creators, reserveid, title]
def safe_query(br, query_url):
'''
The query must be initialized by loading an empty search results page;
this page attempts to set a cookie that Mechanize doesn't like.
Copy the cookiejar to a separate instance and make a one-off request with the temp cookiejar.
'''
goodcookies = br._ua_handlers['_cookies'].cookiejar
clean_cj = mechanize.CookieJar()
cookies_to_copy = []
for cookie in goodcookies:
copied_cookie = copy.deepcopy(cookie)
cookies_to_copy.append(copied_cookie)
for copied_cookie in cookies_to_copy:
clean_cj.set_cookie(copied_cookie)
br.open_novisit(query_url)
br.set_cookiejar(clean_cj)
def overdrive_search(br, q, title, author):
q_query = q+'default.aspx/SearchByKeyword'
q_init_search = q+'SearchResults.aspx'
# get first author as string - convert this to a proper cleanup function later
s = Source(None)
print "printing list with string:"
#print list(s.get_author_tokens(['J. R. R. Tolkien']))
print "printing list with author "+str(author)+":"
print list(s.get_author_tokens(author))
author_tokens = list(s.get_author_tokens(author))
print "there are "+str(len(author_tokens))+" author tokens"
for token in author_tokens:
print "cleaned up author token is: "+str(token)
title_tokens = list(s.get_title_tokens(title))
print "there are "+str(len(title_tokens))+" title tokens"
for token in title_tokens:
print "cleaned up title token is: "+str(token)
if len(title_tokens) >= len(author_tokens):
initial_q = ' '.join(title_tokens)
xref_q = '+'.join(author_tokens)
else:
initial_q = ' '.join(author_tokens)
xref_q = '+'.join(title_tokens)
print "initial query is "+str(initial_q)
print "cross reference query is "+str(xref_q)
q_xref = q+'SearchResults.svc/GetResults?iDisplayLength=50&sSearch='+xref_q
query = '{"szKeyword":"'+initial_q+'"}'
# main query, requires specific Content Type header
req = mechanize.Request(q_query)
req.add_header('Content-Type', 'application/json; charset=utf-8')
br.open_novisit(req, query)
print "q_init_search is "+q_init_search
# initiate the search without messing up the cookiejar
safe_query(br, q_init_search)
# get the search results object
results = False
while results == False:
xreq = mechanize.Request(q_xref)
xreq.add_header('X-Requested-With', 'XMLHttpRequest')
xreq.add_header('Referer', q_init_search)
xreq.add_header('Accept', 'application/json, text/javascript, */*')
raw = br.open_novisit(xreq).read()
print "overdrive search result is:\n"+raw
for m in re.finditer(ur'"iTotalDisplayRecords":(?P<displayrecords>\d+).*?"iTotalRecords":(?P<totalrecords>\d+)', raw):
if int(m.group('displayrecords')) >= 1:
results = True
elif int(m.group('totalrecords')) >= 1:
xref_q = ''
q_xref = q+'SearchResults.svc/GetResults?iDisplayLength=50&sSearch='+xref_q
elif int(m.group('totalrecords')) == 0:
return ''
print "\n\nsorting results"
return sort_ovrdrv_results(raw, title, title_tokens, author, author_tokens)
def sort_ovrdrv_results(raw, title=None, title_tokens=None, author=None, author_tokens=None, ovrdrv_id=None):
print "\ntitle to search for is "+str(title)+"\nauthor to search for is "+str(author)
close_matches = []
raw = re.sub('.*?\[\[(?P<content>.*?)\]\].*', '[[\g<content>]]', raw)
results = eval(raw)
print "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n"
#print results
# The search results are either from a keyword search or a multi-format list from a single ID,
# sort through the results for closest match/format
if results:
for reserveid, od_title, subtitle, edition, series, publisher, format, formatid, creators, \
thumbimage, shortdescription, worldcatlink, excerptlink, creatorfile, sorttitle, \
availabletolibrary, availabletoretailer, relevancyrank, unknown1, unknown2, unknown3 in results:
print "this record's title is "+od_title+", subtitle is "+subtitle+", author[s] are "+creators+", series is "+series
if ovrdrv_id is not None and int(formatid) in [1, 50, 410, 900]:
print "overdrive id is not None, searching based on format type priority"
return format_results(reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid)
else:
creators = creators.split(', ')
print "split creators from results are: "+str(creators)
# if an exact match in a preferred format occurs
if creators[0] == author[0] and od_title == title and int(formatid) in [1, 50, 410, 900]:
print "Got Exact Match!!!"
return format_results(reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid)
else:
close_title_match = False
close_author_match = False
print "format id is "+str(formatid)
for token in title_tokens:
print "attempting to find "+str(token)+" title token"
if od_title.lower().find(token.lower()) != -1:
print "matched token"
close_title_match = True
else:
print "token didn't match"
close_title_match = False
break
for token in author_tokens:
print "attempting to find "+str(token)+" author token"
if creators[0].lower().find(token.lower()) != -1:
print "matched token"
close_author_match = True
else:
print "token didn't match"
close_author_match = False
break
if close_title_match and close_author_match and int(formatid) in [1, 50, 410, 900]:
if subtitle and series:
close_matches.insert(0, format_results(reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid))
else:
close_matches.append(format_results(reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid))
if close_matches:
return close_matches[0]
else:
return ''
else:
return ''
def overdrive_get_record(br, q, ovrdrv_id):
search_url = q+'SearchResults.aspx?ReserveID={'+ovrdrv_id+'}'
results_url = q+'SearchResults.svc/GetResults?sEcho=1&iColumns=18&sColumns=ReserveID%2CTitle%2CSubtitle%2CEdition%2CSeries%2CPublisher%2CFormat%2CFormatID%2CCreators%2CThumbImage%2CShortDescription%2CWorldCatLink%2CExcerptLink%2CCreatorFile%2CSortTitle%2CAvailableToLibrary%2CAvailableToRetailer%2CRelevancyRank&iDisplayStart=0&iDisplayLength=10&sSearch=&bEscapeRegex=true&iSortingCols=1&iSortCol_0=17&sSortDir_0=asc'
# get the base url to set the proper session cookie
br.open_novisit(q)
# initialize the search
safe_query(br, search_url)
# get the results
req = mechanize.Request(results_url)
req.add_header('X-Requested-With', 'XMLHttpRequest')
req.add_header('Referer', search_url)
req.add_header('Accept', 'application/json, text/javascript, */*')
raw = br.open_novisit(req)
raw = str(list(raw))
return sort_ovrdrv_results(raw, None, None, None, ovrdrv_id)
def find_ovrdrv_data(br, title, author, isbn, ovrdrv_id=None):
print "in find_ovrdrv_data, title is "+str(title)+", author is "+str(author)+", overdrive id is "+str(ovrdrv_id)
q = base_url
if ovrdrv_id is None:
return overdrive_search(br, q, title, author)
else:
return overdrive_get_record(br, q, ovrdrv_id)
def to_ovrdrv_data(br, title, author, isbn, ovrdrv_id=None):
print "starting to_ovrdrv_data"
with cache_lock:
ans = ovrdrv_data_cache.get(isbn, None)
if ans:
print "inside to_ovrdrv_data, cache lookup successful, ans is "+str(ans)
return ans
if ans is False:
print "inside to_ovrdrv_data, ans returned False"
return None
try:
print "trying to retrieve data, running find_ovrdrv_data"
ovrdrv_data = find_ovrdrv_data(br, title, author, isbn, ovrdrv_id)
print "ovrdrv_data is "+str(ovrdrv_data)
except:
import traceback
traceback.print_exc()
ovrdrv_data = None
with cache_lock:
ovrdrv_data_cache[isbn] = ovrdrv_data if ovrdrv_data else False
if ovrdrv_data:
from calibre.ebooks.metadata.xisbn import xisbn
for i in xisbn.get_associated_isbns(isbn):
with cache_lock:
ovrdrv_data_cache[i] = ovrdrv_data
return ovrdrv_data
def get_social_metadata(title, authors, isbn, ovrdrv_id=None):
author = authors[0]
mi = Metadata(title, authors)
br = browser()
print "calling to_ovrdrv_data from inside get_social_metadata"
ovrdrv_data = to_ovrdrv_data(br, title, authors, isbn, ovrdrv_id)
#[cover_url, social_metadata_url, worldcatlink, series, series_num, publisher, creators, reserveid, title]
if len(ovrdrv_data[3]) > 1:
mi.series = ovrdrv_data[3]
if ovrdrv_data[4]:
mi.series_index = ovrdrv_data[4]
mi.publisher = ovrdrv_data[5]
mi.authors = ovrdrv_data[6]
if ovrdrv_id is None:
ovrdrv_id = ovrdrv_data[7]
mi.set_identifier('overdrive', ovrdrv_id)
mi.title = ovrdrv_data[8]
print "populated basic social metadata, getting detailed metadata"
if ovrdrv_data and get_metadata_detail(br, ovrdrv_data[1], mi, isbn):
return mi
print "failed to get detailed metadata, returning basic info"
return mi
def get_cover_url(isbn, title, author, br, ovrdrv_id=None):
print "starting get_cover_url"
print "title is "+str(title)
print "author is "+str(author[0])
print "isbn is "+str(isbn)
print "ovrdrv_id is "+str(ovrdrv_id)
with cache_lock:
ans = cover_url_cache.get(isbn, None)
#ans = cover_url_cache.get(ovrdrv_id, None)
if ans:
print "cover url cache lookup returned positive, ans is "+str(ans)
return ans
if ans is False:
"cover url cache lookup returned false"
return None
print "in get_cover_url, calling to_ovrdrv_data function"
ovrdrv_data = to_ovrdrv_data(br, title, author, isbn, ovrdrv_id)
if ovrdrv_data:
ans = ovrdrv_data[0]
print "inside get_cover_url, got url from to_ovrdrv_data, ans is "+str(ans)
if ans:
print "writing cover url to url cache"
with cache_lock:
cover_url_cache[isbn] = ans
#cover_url_cache[ovrdrv_id] = ans
return ans
with cache_lock:
print "marking cover url cache for this isbn false"
cover_url_cache[isbn] = False
return None
def _get_cover_url(br, ovrdrv_data):
q = ovrdrv_data[1]
try:
raw = br.open_novisit(q).read()
except Exception, e:
if callable(getattr(e, 'getcode', None)) and \
e.getcode() == 404:
return None
raise
if '<title>404 - ' in raw:
return None
raw = xml_to_unicode(raw, strip_encoding_pats=True,
resolve_entities=True)[0]
try:
root = soupparser.fromstring(raw)
except:
return False
imgs = root.xpath('//img[@id="prodImage" and @src]')
if imgs:
src = imgs[0].get('src')
parts = src.split('/')
if len(parts) > 3:
bn = parts[-1]
sparts = bn.split('_')
if len(sparts) > 2:
bn = sparts[0] + sparts[-1]
return ('/'.join(parts[:-1]))+'/'+bn
return None
def get_metadata_detail(br, metadata_url, mi, isbn=None):
try:
raw = br.open_novisit(metadata_url).read()
except Exception, e:
if callable(getattr(e, 'getcode', None)) and \
e.getcode() == 404:
return False
raise
raw = xml_to_unicode(raw, strip_encoding_pats=True,
resolve_entities=True)[0]
try:
root = soupparser.fromstring(raw)
except:
return False
isbn = check_isbn(isbn)
pub_date = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblPubDate']/text()")
lang = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblLanguage']/text()")
subjects = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblSubjects']/text()")
ebook_isbn = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblIdentifier']/text()")
desc = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblDescription']/ancestor::div[1]")
if pub_date:
from calibre.utils.date import parse_date
mi.pubdate = parse_date(pub_date[0].strip())
if lang:
mi.language = lang[0].strip()
print "languages is "+str(mi.language)
if ebook_isbn and isbn is None:
print "ebook isbn is "+str(ebook_isbn[0])
mi.set_identifier('isbn', ebook_isbn)
#elif isbn is not None:
# mi.set_identifier('isbn', isbn)
if subjects:
mi.tags = [tag.strip() for tag in subjects[0].split(',')]
print "tags are "+str(mi.tags)
if desc:
desc = desc[0]
desc = html.tostring(desc, method='html', encoding=unicode).strip()
# remove all attributes from tags
desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
# Remove comments
desc = re.sub(r'(?s)<!--.*?-->', '', desc)
mi.comments = sanitize_comments_html(desc)
return True
def main(args=sys.argv):
print "running through main tests"
import tempfile, os, time
tdir = tempfile.gettempdir()
br = browser()
for ovrdrv_id, isbn, title, author in [
#(None, '0899661343', 'On the Road', ['Jack Kerouac']), # basic test, no series, single author
#(None, '9780061952838', 'The Fellowship of the Ring', ['J. R. R. Tolkien']), # Series test, multi-author
#(None, '9780061952838', 'The Two Towers (The Lord of the Rings, Book II)', ['J. R. R. Tolkien']), # Series test, book 2
#(None, '9780618153985', 'The Fellowship of the Ring (The Lord of the Rings, Part 1)', ['J.R.R. Tolkien']),
#('57844706-20fa-4ace-b5ee-3470b1b52173', None, 'The Two Towers', ['J. R. R. Tolkien']), # Series test, w/ ovrdrv id
#(None, '9780345505057', 'Deluge', ['Anne McCaffrey']) # Multiple authors
#(None, None, 'Deluge', ['Anne McCaffrey']) # Empty ISBN
#(None, None, 'On the Road', ['Jack Kerouac']), # Nonetype ISBN
#(None, '9780345435279', 'A Caress of Twilight', ['Laurell K. Hamilton']),
#(None, '9780606087230', 'The Omnivore\'s Dilemma : A Natural History of Four Meals', ['Michael Pollan']), # Subtitle colon
#(None, '9780061747649', 'Mental_Floss Presents: Condensed Knowledge', ['Will Pearson', 'Mangesh Hattikudur']),
#(None, '9781400050802', 'The Zombie Survival Guide', ['Max Brooks']), # Two books with this title by this author
#(None, '9781775414315', 'The Worst Journey in the World / Antarctic 1910-1913', ['Apsley Cherry-Garrard']), # Garbage sub-title
#(None, '9780440335160', 'Outlander', ['Diana Gabaldon']), # Returns lots of results to sort through to get the best match
(None, '9780345509741', 'The Horror Stories of Robert E. Howard', ['Robert E. Howard']), # Complex title with initials/dots stripped, some results don't have a cover
]:
cpath = os.path.join(tdir, title+'.jpg')
print "cpath is "+cpath
st = time.time()
curl = get_cover_url(isbn, title, author, br, ovrdrv_id)
print '\n\n Took ', time.time() - st, ' to get basic metadata\n\n'
if curl is None:
print 'No cover found for', title
else:
print "curl is "+curl
#open(cpath, 'wb').write(br.open_novisit(curl).read())
#print 'Cover for', title, 'saved to', cpath
st = time.time()
print get_social_metadata(title, author, isbn, ovrdrv_id)
print '\n\n Took ', time.time() - st, ' to get detailed metadata\n\n'
return 0
if __name__ == '__main__':
sys.exit(main())
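
The safe_query() helper above captures the one non-obvious piece of session handling: Overdrive's empty search-results page tries to set a cookie that Mechanize rejects, so the cookies collected so far are deep-copied into a fresh jar that is swapped in after the one-off request. A standalone sketch of the same technique, against a placeholder URL:

    import copy, mechanize

    br = mechanize.Browser()
    br.set_cookiejar(mechanize.CookieJar())
    # snapshot whatever cookies have been collected so far
    clean_cj = mechanize.CookieJar()
    for cookie in br._ua_handlers['_cookies'].cookiejar:
        clean_cj.set_cookie(copy.deepcopy(cookie))
    br.open('http://www.example.com/SearchResults.aspx')  # placeholder; may set a bad cookie
    br.set_cookiejar(clean_cj)                # discard anything that request added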

src/calibre/ebooks/metadata/sources/base.py

@@ -313,8 +313,8 @@ class Source(Plugin):
             (r'(\d+),(\d+)', r'\1\2'),
             # Remove hyphens only if they have whitespace before them
             (r'(\s-)', ' '),
-            # Remove single quotes
-            (r"'", ''),
+            # Remove single quotes not followed by 's'
+            (r"'(?!s)", ''),
             # Replace other special chars with a space
             (r'''[:,;+!@#$%^&*(){}.`~"\s\[\]/]''', ' ')
             ]]
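
A quick illustration of what the tightened pattern changes: a quote followed by 's' (possessives) now survives token cleanup, while other quotes are still stripped (the sample titles are made up):

    import re

    for title in ["The Omnivore's Dilemma", "Don't Look Now"]:
        print re.sub(r"'", '', title)         # old rule: The Omnivores Dilemma / Dont Look Now
        print re.sub(r"'(?!s)", '', title)    # new rule: The Omnivore's Dilemma / Dont Look Now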

src/calibre/ebooks/metadata/sources/overdrive.py (new)

@@ -0,0 +1,510 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
'''
Fetch metadata using Overdrive Content Reserve
'''
import sys, re, random, urllib, mechanize, copy
from threading import RLock
from Queue import Queue, Empty
from lxml import html, etree
from lxml.html import soupparser
from calibre import browser
from calibre.ebooks.metadata import check_isbn
from calibre.ebooks.metadata.sources.base import Source
from calibre.ebooks.metadata.book.base import Metadata
from calibre.ebooks.chardet import xml_to_unicode
from calibre.library.comments import sanitize_comments_html
ovrdrv_data_cache = {}
cover_url_cache = {}
cache_lock = RLock()
base_url = 'http://search.overdrive.com/'
class OverDrive(Source):
name = 'Overdrive'
description = _('Downloads metadata from Overdrive\'s Content Reserve')
capabilities = frozenset(['identify', 'cover'])
touched_fields = frozenset(['title', 'authors', 'tags', 'pubdate',
'comments', 'publisher', 'identifier:isbn', 'series', 'series_num',
'language', 'identifier:overdrive'])
has_html_comments = True
supports_gzip_transfer_encoding = False
cached_cover_url_is_reliable = True
def identify(self, log, result_queue, abort, title=None, authors=None, # {{{
identifiers={}, timeout=30):
ovrdrv_id = identifiers.get('overdrive', None)
isbn = identifiers.get('isbn', None)
br = self.browser
print "in identify, calling to_ovrdrv_data"
ovrdrv_data = self.to_ovrdrv_data(br, title, authors, ovrdrv_id)
if ovrdrv_data:
title = ovrdrv_data[8]
authors = ovrdrv_data[6]
mi = Metadata(title, authors)
self.parse_search_results(ovrdrv_data, mi)
if ovrdrv_id is None:
ovrdrv_id = ovrdrv_data[7]
if isbn is not None:
self.cache_isbn_to_identifier(isbn, ovrdrv_id)
self.get_book_detail(br, ovrdrv_data[1], mi, ovrdrv_id, log)
result_queue.put(mi)
return None
# }}}
def get_book_url(self, identifiers): # {{{
ovrdrv_id = identifiers.get('overdrive', None)
if ovrdrv_id is not None:
ovrdrv_data = ovrdrv_data_cache.get(ovrdrv_id, None)
if ovrdrv_data:
return ovrdrv_data[1]
else:
br = browser()
ovrdrv_data = self.to_ovrdrv_data(br, None, None, ovrdrv_id)
return ovrdrv_data[1]
# }}}
def download_cover(self, log, result_queue, abort, # {{{
title=None, authors=None, identifiers={}, timeout=30):
cached_url = self.get_cached_cover_url(identifiers)
if cached_url is None:
log.info('No cached cover found, running identify')
rq = Queue()
print "inside download cover, calling identify"
self.identify(log, rq, abort, title=title, authors=authors,
identifiers=identifiers)
if abort.is_set():
return
results = []
while True:
try:
results.append(rq.get_nowait())
except Empty:
break
results.sort(key=self.identify_results_keygen(
title=title, authors=authors, identifiers=identifiers))
for mi in results:
cached_url = self.get_cached_cover_url(mi.identifiers)
if cached_url is not None:
break
if cached_url is None:
log.info('No cover found')
return
if abort.is_set():
return
ovrdrv_id = identifiers.get('overdrive', None)
br = self.browser
referer = self.get_base_referer()+'ContentDetails-Cover.htm?ID='+ovrdrv_id
print "downloading cover, referer is "+str(referer)
req = mechanize.Request(cached_url)
req.add_header('referer', referer)
log('Downloading cover from:', cached_url)
try:
cdata = br.open_novisit(req, timeout=timeout).read()
result_queue.put((self, cdata))
except:
log.exception('Failed to download cover from:', cached_url)
# }}}
def get_cached_cover_url(self, identifiers): # {{{
url = None
ovrdrv_id = identifiers.get('overdrive', None)
print "inside get_cached_cover_url, ovrdrv_id is "+str(ovrdrv_id)
if ovrdrv_id is None:
isbn = identifiers.get('isbn', None)
if isbn is not None:
ovrdrv_id = self.cached_isbn_to_identifier(isbn)
if ovrdrv_id is not None:
url = self.cached_identifier_to_cover_url(ovrdrv_id)
return url
# }}}
def create_query(self, title=None, authors=None, identifiers={}):
q = ''
if title or authors:
def build_term(prefix, parts):
return ' '.join('in'+prefix + ':' + x for x in parts)
title_tokens = list(self.get_title_tokens(title, False, True))
if title_tokens:
q += build_term('title', title_tokens)
author_tokens = self.get_author_tokens(authors,
only_first_author=True)
if author_tokens:
q += ('+' if q else '') + build_term('author',
author_tokens)
if isinstance(q, unicode):
q = q.encode('utf-8')
if not q:
return None
return BASE_URL+urlencode({
'q':q,
})
def get_base_referer(self): # to be used for passing referrer headers to cover download
choices = [
'http://overdrive.chipublib.org/82DC601D-7DDE-4212-B43A-09D821935B01/10/375/en/',
'http://emedia.clevnet.org/9D321DAD-EC0D-490D-BFD8-64AE2C96ECA8/10/241/en/',
'http://singapore.lib.overdrive.com/F11D55BE-A917-4D63-8111-318E88B29740/10/382/en/',
'http://ebooks.nypl.org/20E48048-A377-4520-BC43-F8729A42A424/10/257/en/',
'http://spl.lib.overdrive.com/5875E082-4CB2-4689-9426-8509F354AFEF/10/335/en/'
]
return choices[random.randint(0, len(choices)-1)]
def format_results(self, reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid):
fix_slashes = re.compile(r'\\/')
thumbimage = fix_slashes.sub('/', thumbimage)
worldcatlink = fix_slashes.sub('/', worldcatlink)
cover_url = re.sub('(?P<img>(Ima?g(eType-)?))200', '\g<img>100', thumbimage)
social_metadata_url = base_url+'TitleInfo.aspx?ReserveID='+reserveid+'&FormatID='+formatid
series_num = ''
if not series:
if subtitle:
title = od_title+': '+subtitle
else:
title = od_title
else:
title = od_title
m = re.search("([0-9]+$)", subtitle)
if m:
series_num = float(m.group(1))
return [cover_url, social_metadata_url, worldcatlink, series, series_num, publisher, creators, reserveid, title]
def safe_query(self, br, query_url, post=''):
'''
The query must be initialized by loading an empty search results page;
this page attempts to set a cookie that Mechanize doesn't like.
Copy the cookiejar to a separate instance and make a one-off request with the temp cookiejar.
'''
goodcookies = br._ua_handlers['_cookies'].cookiejar
clean_cj = mechanize.CookieJar()
cookies_to_copy = []
for cookie in goodcookies:
copied_cookie = copy.deepcopy(cookie)
cookies_to_copy.append(copied_cookie)
for copied_cookie in cookies_to_copy:
clean_cj.set_cookie(copied_cookie)
if post:
br.open_novisit(query_url, post)
else:
br.open_novisit(query_url)
br.set_cookiejar(clean_cj)
def overdrive_search(self, br, q, title, author):
# re-initialize the cookiejar so that it's clean
clean_cj = mechanize.CookieJar()
br.set_cookiejar(clean_cj)
q_query = q+'default.aspx/SearchByKeyword'
q_init_search = q+'SearchResults.aspx'
# get first author as string - convert this to a proper cleanup function later
s = Source(None)
print "printing list with author "+str(author)+":"
author_tokens = list(s.get_author_tokens(author))
print list(author_tokens)
title_tokens = list(s.get_title_tokens(title, False, True))
print "there are "+str(len(title_tokens))+" title tokens"
for token in title_tokens:
print "cleaned up title token is: "+str(token)
if len(title_tokens) >= len(author_tokens):
initial_q = ' '.join(title_tokens)
xref_q = '+'.join(author_tokens)
else:
initial_q = ' '.join(author_tokens)
xref_q = '+'.join(title_tokens)
print "initial query is "+str(initial_q)
print "cross reference query is "+str(xref_q)
q_xref = q+'SearchResults.svc/GetResults?iDisplayLength=50&sSearch='+xref_q
query = '{"szKeyword":"'+initial_q+'"}'
# main query, requires specific Content Type header
req = mechanize.Request(q_query)
req.add_header('Content-Type', 'application/json; charset=utf-8')
br.open_novisit(req, query)
print "q_init_search is "+q_init_search
# initiate the search without messing up the cookiejar
self.safe_query(br, q_init_search)
# get the search results object
results = False
while results == False:
xreq = mechanize.Request(q_xref)
xreq.add_header('X-Requested-With', 'XMLHttpRequest')
xreq.add_header('Referer', q_init_search)
xreq.add_header('Accept', 'application/json, text/javascript, */*')
raw = br.open_novisit(xreq).read()
print "overdrive search result is:\n"+raw
for m in re.finditer(ur'"iTotalDisplayRecords":(?P<displayrecords>\d+).*?"iTotalRecords":(?P<totalrecords>\d+)', raw):
if int(m.group('displayrecords')) >= 1:
results = True
elif int(m.group('totalrecords')) >= 1:
xref_q = ''
q_xref = q+'SearchResults.svc/GetResults?iDisplayLength=50&sSearch='+xref_q
elif int(m.group('totalrecords')) == 0:
return ''
print "\n\nsorting results"
return self.sort_ovrdrv_results(raw, title, title_tokens, author, author_tokens)
def sort_ovrdrv_results(self, raw, title=None, title_tokens=None, author=None, author_tokens=None, ovrdrv_id=None):
print "\ntitle to search for is "+str(title)+"\nauthor to search for is "+str(author)
close_matches = []
raw = re.sub('.*?\[\[(?P<content>.*?)\]\].*', '[[\g<content>]]', raw)
results = eval(raw)
print "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n"
#print results
# The search results are either from a keyword search or a multi-format list from a single ID,
# sort through the results for closest match/format
if results:
for reserveid, od_title, subtitle, edition, series, publisher, format, formatid, creators, \
thumbimage, shortdescription, worldcatlink, excerptlink, creatorfile, sorttitle, \
availabletolibrary, availabletoretailer, relevancyrank, unknown1, unknown2, unknown3 in results:
print "this record's title is "+od_title+", subtitle is "+subtitle+", author[s] are "+creators+", series is "+series
if ovrdrv_id is not None and int(formatid) in [1, 50, 410, 900]:
print "overdrive id is not None, searching based on format type priority"
return self.format_results(reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid)
else:
creators = creators.split(', ')
print "split creators from results are: "+str(creators)
# if an exact match in a preferred format occurs
if creators[0] == author[0] and od_title == title and int(formatid) in [1, 50, 410, 900]:
print "Got Exact Match!!!"
return self.format_results(reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid)
else:
close_title_match = False
close_author_match = False
print "format id is "+str(formatid)
for token in title_tokens:
print "attempting to find "+str(token)+" title token"
if od_title.lower().find(token.lower()) != -1:
print "matched token"
close_title_match = True
else:
print "token didn't match"
close_title_match = False
break
for token in author_tokens:
print "attempting to find "+str(token)+" author token"
if creators[0].lower().find(token.lower()) != -1:
print "matched token"
close_author_match = True
else:
print "token didn't match"
close_author_match = False
break
if close_title_match and close_author_match and int(formatid) in [1, 50, 410, 900]:
if subtitle and series:
close_matches.insert(0, self.format_results(reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid))
else:
close_matches.append(self.format_results(reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid))
if close_matches:
return close_matches[0]
else:
return ''
else:
return ''
def overdrive_get_record(self, br, q, ovrdrv_id):
search_url = q+'SearchResults.aspx?ReserveID={'+ovrdrv_id+'}'
results_url = q+'SearchResults.svc/GetResults?sEcho=1&iColumns=18&sColumns=ReserveID%2CTitle%2CSubtitle%2CEdition%2CSeries%2CPublisher%2CFormat%2CFormatID%2CCreators%2CThumbImage%2CShortDescription%2CWorldCatLink%2CExcerptLink%2CCreatorFile%2CSortTitle%2CAvailableToLibrary%2CAvailableToRetailer%2CRelevancyRank&iDisplayStart=0&iDisplayLength=10&sSearch=&bEscapeRegex=true&iSortingCols=1&iSortCol_0=17&sSortDir_0=asc'
# get the base url to set the proper session cookie
br.open_novisit(q)
# initialize the search
self.safe_query(br, search_url)
# get the results
req = mechanize.Request(results_url)
req.add_header('X-Requested-With', 'XMLHttpRequest')
req.add_header('Referer', search_url)
req.add_header('Accept', 'application/json, text/javascript, */*')
raw = br.open_novisit(req)
raw = str(list(raw))
clean_cj = mechanize.CookieJar()
br.set_cookiejar(clean_cj)
return self.sort_ovrdrv_results(raw, None, None, None, ovrdrv_id)
def find_ovrdrv_data(self, br, title, author, isbn, ovrdrv_id=None):
print "in find_ovrdrv_data, title is "+str(title)+", author is "+str(author)+", overdrive id is "+str(ovrdrv_id)
q = base_url
if ovrdrv_id is None:
return self.overdrive_search(br, q, title, author)
else:
return self.overdrive_get_record(br, q, ovrdrv_id)
def to_ovrdrv_data(self, br, title=None, author=None, ovrdrv_id=None):
'''
Takes either a title/author combo or an Overdrive ID. One of these
two must be passed to this function.
'''
print "starting to_ovrdrv_data"
if ovrdrv_id is not None:
with cache_lock:
ans = ovrdrv_data_cache.get(ovrdrv_id, None)
if ans:
print "inside to_ovrdrv_data, cache lookup successful, ans is "+str(ans)
return ans
elif ans is False:
print "inside to_ovrdrv_data, ans returned False"
return None
else:
ovrdrv_data = self.find_ovrdrv_data(br, title, author, ovrdrv_id)
else:
try:
print "trying to retrieve data, running find_ovrdrv_data"
ovrdrv_data = self.find_ovrdrv_data(br, title, author, ovrdrv_id)
print "ovrdrv_data is "+str(ovrdrv_data)
except:
import traceback
traceback.print_exc()
ovrdrv_data = None
print "writing results to ovrdrv_data cache"
with cache_lock:
ovrdrv_data_cache[ovrdrv_id] = ovrdrv_data if ovrdrv_data else False
return ovrdrv_data if ovrdrv_data else False
def parse_search_results(self, ovrdrv_data, mi):
'''
Parse the formatted search results from the initial Overdrive query and
add the values to the metadata.
The list object has these values:
[cover_url[0], social_metadata_url[1], worldcatlink[2], series[3], series_num[4],
publisher[5], creators[6], reserveid[7], title[8]]
'''
print "inside parse_search_results, writing the metadata results"
ovrdrv_id = ovrdrv_data[7]
mi.set_identifier('overdrive', ovrdrv_id)
if len(ovrdrv_data[3]) > 1:
mi.series = ovrdrv_data[3]
if ovrdrv_data[4]:
mi.series_index = ovrdrv_data[4]
mi.publisher = ovrdrv_data[5]
mi.authors = ovrdrv_data[6]
mi.title = ovrdrv_data[8]
cover_url = ovrdrv_data[0]
if cover_url:
self.cache_identifier_to_cover_url(ovrdrv_id,
cover_url)
def get_book_detail(self, br, metadata_url, mi, ovrdrv_id, log):
try:
raw = br.open_novisit(metadata_url).read()
except Exception, e:
if callable(getattr(e, 'getcode', None)) and \
e.getcode() == 404:
return False
raise
raw = xml_to_unicode(raw, strip_encoding_pats=True,
resolve_entities=True)[0]
try:
root = soupparser.fromstring(raw)
except:
return False
pub_date = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblPubDate']/text()")
lang = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblLanguage']/text()")
subjects = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblSubjects']/text()")
ebook_isbn = root.xpath("//td/label[@id='ctl00_ContentPlaceHolder1_lblIdentifier']/text()")
desc = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblDescription']/ancestor::div[1]")
if pub_date:
from calibre.utils.date import parse_date
mi.pubdate = parse_date(pub_date[0].strip())
if lang:
mi.language = lang[0].strip()
print "languages is "+str(mi.language)
#if ebook_isbn:
# print "ebook isbn is "+str(ebook_isbn[0])
# isbn = check_isbn(ebook_isbn[0].strip())
# if isbn:
# self.cache_isbn_to_identifier(isbn, ovrdrv_id)
# mi.isbn = isbn
if subjects:
mi.tags = [tag.strip() for tag in subjects[0].split(',')]
print "tags are "+str(mi.tags)
if desc:
desc = desc[0]
desc = html.tostring(desc, method='html', encoding=unicode).strip()
# remove all attributes from tags
desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
# Remove comments
desc = re.sub(r'(?s)<!--.*?-->', '', desc)
mi.comments = sanitize_comments_html(desc)
return None
def main(args=sys.argv):
print "running through main tests"
import tempfile, os, time
tdir = tempfile.gettempdir()
br = browser()
for ovrdrv_id, isbn, title, author in [
#(None, '0899661343', 'On the Road', ['Jack Kerouac']), # basic test, no series, single author
#(None, '9780061952838', 'The Fellowship of the Ring', ['J. R. R. Tolkien']), # Series test, multi-author
#(None, '9780061952838', 'The Two Towers (The Lord of the Rings, Book II)', ['J. R. R. Tolkien']), # Series test, book 2
#(None, '9780618153985', 'The Fellowship of the Ring (The Lord of the Rings, Part 1)', ['J.R.R. Tolkien']),
#('57844706-20fa-4ace-b5ee-3470b1b52173', None, 'The Two Towers', ['J. R. R. Tolkien']), # Series test, w/ ovrdrv id
#(None, '9780345505057', 'Deluge', ['Anne McCaffrey']) # Multiple authors
#(None, None, 'Deluge', ['Anne McCaffrey']) # Empty ISBN
#(None, None, 'On the Road', ['Jack Kerouac']), # Nonetype ISBN
#(None, '9780345435279', 'A Caress of Twilight', ['Laurell K. Hamilton']),
#(None, '9780606087230', 'The Omnivore\'s Dilemma : A Natural History of Four Meals', ['Michael Pollan']), # Subtitle colon
#(None, '9780061747649', 'Mental_Floss Presents: Condensed Knowledge', ['Will Pearson', 'Mangesh Hattikudur']),
#(None, '9781400050802', 'The Zombie Survival Guide', ['Max Brooks']), # Two books with this title by this author
#(None, '9781775414315', 'The Worst Journey in the World / Antarctic 1910-1913', ['Apsley Cherry-Garrard']), # Garbage sub-title
#(None, '9780440335160', 'Outlander', ['Diana Gabaldon']), # Returns lots of results to sort through to get the best match
(None, '9780345509741', 'The Horror Stories of Robert E. Howard', ['Robert E. Howard']), # Complex title with initials/dots stripped, some results don't have a cover
]:
cpath = os.path.join(tdir, title+'.jpg')
print "cpath is "+cpath
st = time.time()
curl = get_cover_url(isbn, title, author, br, ovrdrv_id)
print '\n\n Took ', time.time() - st, ' to get basic metadata\n\n'
if curl is None:
print 'No cover found for', title
else:
print "curl is "+curl
#open(cpath, 'wb').write(br.open_novisit(curl).read())
#print 'Cover for', title, 'saved to', cpath
st = time.time()
print get_social_metadata(title, author, isbn, ovrdrv_id)
print '\n\n Took ', time.time() - st, ' to get detailed metadata\n\n'
return 0
if __name__ == '__main__':
sys.exit(main())
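
Finally, a hedged sketch of how the ported source would be exercised under the 8.x identify/cover API, mirroring the Queue and Event plumbing used in download_cover() above (default_log stands in for the framework-supplied logger; the test values are hypothetical):

    from Queue import Queue
    from threading import Event

    from calibre.utils.logging import default_log
    from calibre.ebooks.metadata.sources.overdrive import OverDrive

    plugin = OverDrive(None)
    rq, abort = Queue(), Event()
    plugin.identify(default_log, rq, abort, title='On the Road',
            authors=['Jack Kerouac'], identifiers={'isbn': '0899661343'})
    while not rq.empty():
        mi = rq.get_nowait()
        print mi.title, mi.authors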