Add Overdrive as a metadata download source

Kovid Goyal 2011-04-18 20:53:15 -06:00
commit 727ada1ac4
8 changed files with 503 additions and 31 deletions

View File

@ -19,10 +19,23 @@ class Slashdot(BasicNewsRecipe):
__author__ = 'floweros edited by Huan T'
no_stylesheets = True
# keep_only_tags = [
# dict(name='div',attrs={'class':'article'}),
# dict(name='div',attrs={'class':'commentTop'}),
# ]
keep_only_tags = [
dict(name='div',attrs={'id':'article'}),
dict(name='div',attrs={'class':['postBody', 'details']}),
dict(name='footer',attrs={'class':['clearfix meta article-foot']}),
dict(name='article',attrs={'class':['fhitem fhitem-story article usermode thumbs grid_24']}),
dict(name='dl',attrs={'class':'relatedPosts'}),
dict(name='h2',attrs={'class':'story'}),
dict(name='span',attrs={'class':'comments'}),
]
remove_tags = [
dict(name='aside',attrs={'id':'slashboxes'}),
dict(name='div',attrs={'class':'paginate'}),
dict(name='section',attrs={'id':'comments'}),
dict(name='span',attrs={'class':'topic'}),
]
feeds = [
(u'Slashdot',
@ -37,5 +50,3 @@ class Slashdot(BasicNewsRecipe):
u'http://rss.slashdot.org/Slashdot/slashdotYourRightsOnline')
]
def get_article_url(self, article):
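# prefer FeedBurner's pointer to the original article when it is present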
return article.get('feedburner_origlink', None)

View File

@ -626,8 +626,9 @@ if test_eight_code:
from calibre.ebooks.metadata.sources.amazon import Amazon
from calibre.ebooks.metadata.sources.openlibrary import OpenLibrary
from calibre.ebooks.metadata.sources.isbndb import ISBNDB
from calibre.ebooks.metadata.sources.overdrive import OverDrive
plugins += [GoogleBooks, Amazon, OpenLibrary, ISBNDB]
plugins += [GoogleBooks, Amazon, OpenLibrary, ISBNDB, OverDrive]
# }}}
else:

View File

@ -399,7 +399,7 @@ class HTMLPreProcessor(object):
(re.compile(u'˙\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: u'Ż'),
# If pdf printed from a browser then the header/footer has a reliable pattern
(re.compile(r'((?<=</a>)\s*file:////?[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''),
(re.compile(r'((?<=</a>)\s*file:/{2,4}[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''),
# Center separator lines
(re.compile(u'<br>\s*(?P<break>([*#•✦=]+\s*)+)\s*<br>'), lambda match: '<p>\n<p style="text-align:center">' + match.group(1) + '</p>'),
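A quick check of the widened footer pattern (a minimal sketch; the sample line is invented). The old expression required three or four slashes after `file:`, while `file:/{2,4}` also catches the two-slash form some browsers emit:

    import re
    pat = re.compile(r'((?<=</a>)\s*file:/{2,4}[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE)
    print(pat.sub('', '</a> file://C:/Users/me/book.html<br>'))
    # -> '</a>'  (the two-slash form was missed by the old file:////? pattern)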

View File

@ -764,6 +764,7 @@ class HeuristicProcessor(object):
# Multiple sequential blank paragraphs are merged with appropriate margins
# If non-blank scene breaks exist they are center aligned and styled with appropriate margins.
if getattr(self.extra_opts, 'format_scene_breaks', False):
html = re.sub('(?i)<div[^>]*>\s*<br(\s?/)?>\s*</div>', '<p></p>', html)
html = self.detect_whitespace(html)
html = self.detect_soft_breaks(html)
blanks_count = len(self.any_multi_blank.findall(html))
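For illustration (a minimal sketch; the snippet is invented), the added substitution turns an empty spacer div into a plain paragraph before whitespace and soft-break detection run:

    import re
    html = '<p>He left.</p><div class="spacer"> <br /> </div><p>Years passed.</p>'
    html = re.sub('(?i)<div[^>]*>\s*<br(\s?/)?>\s*</div>', '<p></p>', html)
    # html == '<p>He left.</p><p></p><p>Years passed.</p>'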

View File

@ -274,26 +274,34 @@ class Source(Plugin):
if authors:
# Leave ' in there for Irish names
pat = re.compile(r'[-,:;+!@#$%^&*(){}.`~"\s\[\]/]')
remove_pat = re.compile(r'[,!@#$%^&*(){}`~"\s\[\]/]')
replace_pat = re.compile(r'[-+.:;]')
if only_first_author:
authors = authors[:1]
for au in authors:
au = replace_pat.sub(' ', au)
parts = au.split()
if ',' in au:
# au probably in ln, fn form
parts = parts[1:] + parts[:1]
for tok in parts:
tok = pat.sub('', tok).strip()
tok = remove_pat.sub('', tok).strip()
if len(tok) > 2 and tok.lower() not in ('von', ):
yield tok
def get_title_tokens(self, title):
def get_title_tokens(self, title, strip_joiners=True, strip_subtitle=False):
'''
Take a title and return a list of tokens useful for an AND search query.
Excludes connectives and punctuation.
Excludes connectives (optionally) and punctuation.
'''
if title:
# strip sub-titles
if strip_subtitle:
subtitle = re.compile(r'([\(\[\{].*?[\)\]\}]|[/:\\].*$)')
if len(subtitle.sub('', title)) > 1:
title = subtitle.sub('', title)
title_patterns = [(re.compile(pat, re.IGNORECASE), repl) for pat, repl in
[
# Remove things like: (2010) (Omnibus) etc.
@ -305,17 +313,20 @@ class Source(Plugin):
(r'(\d+),(\d+)', r'\1\2'),
# Remove hyphens only if they have whitespace before them
(r'(\s-)', ' '),
# Remove single quotes
(r"'", ''),
# Remove single quotes not followed by 's'
(r"'(?!s)", ''),
# Replace other special chars with a space
(r'''[:,;+!@#$%^&*(){}.`~"\s\[\]/]''', ' ')
]]
for pat, repl in title_patterns:
title = pat.sub(repl, title)
tokens = title.split()
for token in tokens:
token = token.strip()
if token and token.lower() not in ('a', 'and', 'the'):
if token and (not strip_joiners or token.lower() not in ('a',
'and', 'the', '&')):
yield token
def split_jobs(self, jobs, num):
@ -363,7 +374,11 @@ class Source(Plugin):
def get_book_url(self, identifiers):
'''
Return the URL for the book identified by identifiers at this source.
If no URL is found, return None.
This URL must be browsable by a human using a browser. It is meant
to provide a clickable link for the user to easily visit the book's
page at this source.
If no URL is found, return None. This method must be quick; it
should either construct the URL using a known URL scheme or use a
cached URL.
'''
return None
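A rough usage sketch of the new keyword arguments (context invented; `self` stands for any Source subclass):

    # hypothetical plugin code
    title = 'The Invisible Gorilla: And Other Ways Our Intuitions Deceive Us'
    print(list(self.get_title_tokens(title, strip_joiners=False, strip_subtitle=True)))
    # -> ['The', 'Invisible', 'Gorilla']  (subtitle stripped, joiners kept)
    print(list(self.get_author_tokens(['Chabris, Christopher'], only_first_author=True)))
    # -> ['Christopher', 'Chabris']  (the "ln, fn" form is swapped back)

And a hypothetical get_book_url override built on the SearchResults URL scheme used elsewhere in this commit:

    def get_book_url(self, identifiers):  # hypothetical override
        ovrdrv_id = identifiers.get('overdrive', None)
        if ovrdrv_id:
            return 'http://search.overdrive.com/SearchResults.aspx?ReserveID={%s}' % ovrdrv_id
        return None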

View File

@ -433,7 +433,7 @@ def urls_from_identifiers(identifiers): # {{{
pass
isbn = identifiers.get('isbn', None)
if isbn:
ans.append(('ISBN',
ans.append((isbn,
'http://www.worldcat.org/search?q=bn%%3A%s&qt=advanced'%isbn))
return ans
# }}}
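With this change the link text is the ISBN itself rather than the literal string 'ISBN'. Roughly (assuming no other identifiers are present):

    # urls_from_identifiers({'isbn': '9780307459671'})
    # -> [('9780307459671',
    #      'http://www.worldcat.org/search?q=bn%3A9780307459671&qt=advanced')]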
@ -444,13 +444,18 @@ if __name__ == '__main__': # tests {{{
from calibre.ebooks.metadata.sources.test import (test_identify,
title_test, authors_test)
tests = [
(
{'title':'Magykal Papers',
'authors':['Sage']},
[title_test('The Magykal Papers', exact=True)],
),
( # An e-book ISBN not on Amazon; one of the authors is
# unknown to Amazon
{'identifiers':{'isbn': '9780307459671'},
'title':'Invisible Gorilla', 'authors':['Christopher Chabris']},
[title_test('The Invisible Gorilla',
exact=True), authors_test(['Christopher Chabris', 'Daniel Simons'])]
[title_test('The Invisible Gorilla', exact=True)]
),

View File

@ -0,0 +1,439 @@
#!/usr/bin/env python
from __future__ import (unicode_literals, division, absolute_import,
print_function)
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
'''
Fetch metadata using Overdrive Content Reserve
'''
import re, random, mechanize, copy
from threading import RLock
from Queue import Queue, Empty
from lxml import html
from lxml.html import soupparser
from calibre.ebooks.metadata import check_isbn
from calibre.ebooks.metadata.sources.base import Source
from calibre.ebooks.metadata.book.base import Metadata
from calibre.ebooks.chardet import xml_to_unicode
from calibre.library.comments import sanitize_comments_html
ovrdrv_data_cache = {}
cover_url_cache = {}
cache_lock = RLock()
base_url = 'http://search.overdrive.com/'
class OverDrive(Source):
name = 'Overdrive'
description = _('Downloads metadata from Overdrive\'s Content Reserve')
capabilities = frozenset(['identify', 'cover'])
touched_fields = frozenset(['title', 'authors', 'tags', 'pubdate',
'comments', 'publisher', 'identifier:isbn', 'series', 'series_index',
'language', 'identifier:overdrive'])
has_html_comments = True
supports_gzip_transfer_encoding = False
cached_cover_url_is_reliable = True
def __init__(self, *args, **kwargs):
Source.__init__(self, *args, **kwargs)
self.prefs.defaults['ignore_fields'] = ['tags', 'pubdate', 'comments', 'identifier:isbn', 'language']
def identify(self, log, result_queue, abort, title=None, authors=None, # {{{
identifiers={}, timeout=30):
ovrdrv_id = identifiers.get('overdrive', None)
isbn = identifiers.get('isbn', None)
br = self.browser
ovrdrv_data = self.to_ovrdrv_data(br, title, authors, ovrdrv_id)
if ovrdrv_data:
title = ovrdrv_data[8]
authors = ovrdrv_data[6]
mi = Metadata(title, authors)
self.parse_search_results(ovrdrv_data, mi)
if ovrdrv_id is None:
ovrdrv_id = ovrdrv_data[7]
if isbn is not None:
self.cache_isbn_to_identifier(isbn, ovrdrv_id)
self.get_book_detail(br, ovrdrv_data[1], mi, ovrdrv_id, log)
result_queue.put(mi)
return None
# }}}
def download_cover(self, log, result_queue, abort, # {{{
title=None, authors=None, identifiers={}, timeout=30):
cached_url = self.get_cached_cover_url(identifiers)
if cached_url is None:
log.info('No cached cover found, running identify')
rq = Queue()
self.identify(log, rq, abort, title=title, authors=authors,
identifiers=identifiers)
if abort.is_set():
return
results = []
while True:
try:
results.append(rq.get_nowait())
except Empty:
break
results.sort(key=self.identify_results_keygen(
title=title, authors=authors, identifiers=identifiers))
for mi in results:
cached_url = self.get_cached_cover_url(mi.identifiers)
if cached_url is not None:
break
if cached_url is None:
log.info('No cover found')
return
if abort.is_set():
return
ovrdrv_id = identifiers.get('overdrive', None)
br = self.browser
referer = self.get_base_referer()+'ContentDetails-Cover.htm?ID='+ovrdrv_id
req = mechanize.Request(cached_url)
req.add_header('referer', referer)
log('Downloading cover from:', cached_url)
try:
cdata = br.open_novisit(req, timeout=timeout).read()
result_queue.put((self, cdata))
except:
log.exception('Failed to download cover from:', cached_url)
# }}}
def get_cached_cover_url(self, identifiers): # {{{
url = None
ovrdrv_id = identifiers.get('overdrive', None)
if ovrdrv_id is None:
isbn = identifiers.get('isbn', None)
if isbn is not None:
ovrdrv_id = self.cached_isbn_to_identifier(isbn)
if ovrdrv_id is not None:
url = self.cached_identifier_to_cover_url(ovrdrv_id)
return url
# }}}
def get_base_referer(self): # to be used for passing referrer headers to cover download
choices = [
'http://overdrive.chipublib.org/82DC601D-7DDE-4212-B43A-09D821935B01/10/375/en/',
'http://emedia.clevnet.org/9D321DAD-EC0D-490D-BFD8-64AE2C96ECA8/10/241/en/',
'http://singapore.lib.overdrive.com/F11D55BE-A917-4D63-8111-318E88B29740/10/382/en/',
'http://ebooks.nypl.org/20E48048-A377-4520-BC43-F8729A42A424/10/257/en/',
'http://spl.lib.overdrive.com/5875E082-4CB2-4689-9426-8509F354AFEF/10/335/en/'
]
return random.choice(choices)
def format_results(self, reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid):
fix_slashes = re.compile(r'\\/')
thumbimage = fix_slashes.sub('/', thumbimage)
worldcatlink = fix_slashes.sub('/', worldcatlink)
cover_url = re.sub('(?P<img>(Ima?g(eType-)?))200', '\g<img>100', thumbimage)
social_metadata_url = base_url+'TitleInfo.aspx?ReserveID='+reserveid+'&FormatID='+formatid
series_num = ''
if not series:
if subtitle:
title = od_title+': '+subtitle
else:
title = od_title
else:
title = od_title
m = re.search("([0-9]+$)", subtitle)
if m:
series_num = float(m.group(1))
return [cover_url, social_metadata_url, worldcatlink, series, series_num, publisher, creators, reserveid, title]
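# Illustration of the 200 -> 100 rewrite above (URL invented):
#   '.../ImageType-200/0011-1/ABC/Img200.jpg'
# becomes
#   '.../ImageType-100/0011-1/ABC/Img100.jpg'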
def safe_query(self, br, query_url, post=''):
'''
The query must be initialized by loading an empty search results page;
this page attempts to set a cookie that Mechanize doesn't like. Copy
the cookiejar to a separate instance and make a one-off request with
the temporary cookiejar.
'''
goodcookies = br._ua_handlers['_cookies'].cookiejar
clean_cj = mechanize.CookieJar()
cookies_to_copy = []
for cookie in goodcookies:
copied_cookie = copy.deepcopy(cookie)
cookies_to_copy.append(copied_cookie)
for copied_cookie in cookies_to_copy:
clean_cj.set_cookie(copied_cookie)
if post:
br.open_novisit(query_url, post)
else:
br.open_novisit(query_url)
br.set_cookiejar(clean_cj)
def overdrive_search(self, br, q, title, author):
# re-initialize the cookiejar so that it's clean
clean_cj = mechanize.CookieJar()
br.set_cookiejar(clean_cj)
q_query = q+'default.aspx/SearchByKeyword'
q_init_search = q+'SearchResults.aspx'
# get first author as string - convert this to a proper cleanup function later
author_tokens = list(self.get_author_tokens(author,
only_first_author=True))
title_tokens = list(self.get_title_tokens(title,
strip_joiners=False, strip_subtitle=True))
if len(title_tokens) >= len(author_tokens):
initial_q = ' '.join(title_tokens)
xref_q = '+'.join(author_tokens)
else:
initial_q = ' '.join(author_tokens)
xref_q = '+'.join(title_tokens)
q_xref = q+'SearchResults.svc/GetResults?iDisplayLength=50&sSearch='+xref_q
query = '{"szKeyword":"'+initial_q+'"}'
# main query, requires specific Content Type header
req = mechanize.Request(q_query)
req.add_header('Content-Type', 'application/json; charset=utf-8')
br.open_novisit(req, query)
# initiate the search without messing up the cookiejar
self.safe_query(br, q_init_search)
# get the search results object
results = False
while results is False:
xreq = mechanize.Request(q_xref)
xreq.add_header('X-Requested-With', 'XMLHttpRequest')
xreq.add_header('Referer', q_init_search)
xreq.add_header('Accept', 'application/json, text/javascript, */*')
raw = br.open_novisit(xreq).read()
for m in re.finditer(ur'"iTotalDisplayRecords":(?P<displayrecords>\d+).*?"iTotalRecords":(?P<totalrecords>\d+)', raw):
if int(m.group('displayrecords')) >= 1:
results = True
elif int(m.group('totalrecords')) >= 1:
xref_q = ''
q_xref = q+'SearchResults.svc/GetResults?iDisplayLength=50&sSearch='+xref_q
elif int(m.group('totalrecords')) == 0:
return ''
return self.sort_ovrdrv_results(raw, title, title_tokens, author, author_tokens)
def sort_ovrdrv_results(self, raw, title=None, title_tokens=None, author=None, author_tokens=None, ovrdrv_id=None):
close_matches = []
raw = re.sub('.*?\[\[(?P<content>.*?)\]\].*', '[[\g<content>]]', raw)
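# raw is now a bracketed list literal, e.g. '[[...], [...]]'; evaluate it
# into the result rows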
results = eval(raw)
#print results
# The search results are either from a keyword search or a multi-format
# list from a single ID; sort through the results for the closest match/format
if results:
for reserveid, od_title, subtitle, edition, series, publisher, format, formatid, creators, \
thumbimage, shortdescription, worldcatlink, excerptlink, creatorfile, sorttitle, \
availabletolibrary, availabletoretailer, relevancyrank, unknown1, unknown2, unknown3 in results:
#print "this record's title is "+od_title+", subtitle is "+subtitle+", author[s] are "+creators+", series is "+series
if ovrdrv_id is not None and int(formatid) in [1, 50, 410, 900]:
#print "overdrive id is not None, searching based on format type priority"
return self.format_results(reserveid, od_title, subtitle, series, publisher,
creators, thumbimage, worldcatlink, formatid)
else:
creators = creators.split(', ')
# if an exact match in a preferred format occurs
if (author and creators[0] == author[0]) and od_title == title and int(formatid) in [1, 50, 410, 900]:
return self.format_results(reserveid, od_title, subtitle, series, publisher,
creators, thumbimage, worldcatlink, formatid)
else:
close_title_match = False
close_author_match = False
for token in title_tokens:
if od_title.lower().find(token.lower()) != -1:
close_title_match = True
else:
close_title_match = False
break
for creator in creators:
for token in author_tokens:
if creator.lower().find(token.lower()) != -1:
close_author_match = True
else:
close_author_match = False
break
if close_author_match:
break
if close_title_match and close_author_match and int(formatid) in [1, 50, 410, 900] and thumbimage:
if subtitle and series:
close_matches.insert(0, self.format_results(reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid))
else:
close_matches.append(self.format_results(reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid))
if close_matches:
return close_matches[0]
else:
return ''
else:
return ''
def overdrive_get_record(self, br, q, ovrdrv_id):
search_url = q+'SearchResults.aspx?ReserveID={'+ovrdrv_id+'}'
results_url = q+'SearchResults.svc/GetResults?sEcho=1&iColumns=18&sColumns=ReserveID%2CTitle%2CSubtitle%2CEdition%2CSeries%2CPublisher%2CFormat%2CFormatID%2CCreators%2CThumbImage%2CShortDescription%2CWorldCatLink%2CExcerptLink%2CCreatorFile%2CSortTitle%2CAvailableToLibrary%2CAvailableToRetailer%2CRelevancyRank&iDisplayStart=0&iDisplayLength=10&sSearch=&bEscapeRegex=true&iSortingCols=1&iSortCol_0=17&sSortDir_0=asc'
# re-initialize the cookiejar so that it's clean
clean_cj = mechanize.CookieJar()
br.set_cookiejar(clean_cj)
# get the base url to set the proper session cookie
br.open_novisit(q)
# initialize the search
self.safe_query(br, search_url)
# get the results
req = mechanize.Request(results_url)
req.add_header('X-Requested-With', 'XMLHttpRequest')
req.add_header('Referer', search_url)
req.add_header('Accept', 'application/json, text/javascript, */*')
raw = br.open_novisit(req)
raw = str(list(raw))
clean_cj = mechanize.CookieJar()
br.set_cookiejar(clean_cj)
return self.sort_ovrdrv_results(raw, None, None, None, ovrdrv_id)
def find_ovrdrv_data(self, br, title, author, isbn, ovrdrv_id=None):
q = base_url
if ovrdrv_id is None:
return self.overdrive_search(br, q, title, author)
else:
return self.overdrive_get_record(br, q, ovrdrv_id)
def to_ovrdrv_data(self, br, title=None, author=None, ovrdrv_id=None):
'''
Takes either a title/author combo or an Overdrive ID. One of these
two must be passed to this function.
'''
if ovrdrv_id is not None:
with cache_lock:
ans = ovrdrv_data_cache.get(ovrdrv_id, None)
if ans:
return ans
elif ans is False:
return None
else:
ovrdrv_data = self.find_ovrdrv_data(br, title, author, ovrdrv_id)
else:
try:
ovrdrv_data = self.find_ovrdrv_data(br, title, author, ovrdrv_id)
except:
import traceback
traceback.print_exc()
ovrdrv_data = None
with cache_lock:
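# cache False (rather than None) on failure so the miss is remembered
# and the same ovrdrv_id is not retried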
ovrdrv_data_cache[ovrdrv_id] = ovrdrv_data if ovrdrv_data else False
return ovrdrv_data if ovrdrv_data else False
def parse_search_results(self, ovrdrv_data, mi):
'''
Parse the formatted search results from the initial Overdrive query and
add the values to the metadata.
The list object has these values:
[cover_url[0], social_metadata_url[1], worldcatlink[2], series[3], series_num[4],
publisher[5], creators[6], reserveid[7], title[8]]
'''
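# A hypothetical ovrdrv_data list (all values invented):
# ['.../Img100.jpg',                          # [0] cover_url
#  'http://search.overdrive.com/TitleInfo.aspx?ReserveID=...&FormatID=410',
#  'http://www.worldcat.org/oclc/...',        # [2] worldcatlink
#  'Foundation', 2.0, 'Random House',         # [3]-[5] series, series_num, publisher
#  ['Isaac Asimov'], '{GUID}', 'Foundation and Empire']  # [6]-[8] creators, reserveid, title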
ovrdrv_id = ovrdrv_data[7]
mi.set_identifier('overdrive', ovrdrv_id)
if len(ovrdrv_data[3]) > 1:
mi.series = ovrdrv_data[3]
if ovrdrv_data[4]:
try:
mi.series_index = float(ovrdrv_data[4])
except:
pass
mi.publisher = ovrdrv_data[5]
mi.authors = ovrdrv_data[6]
mi.title = ovrdrv_data[8]
cover_url = ovrdrv_data[0]
if cover_url:
self.cache_identifier_to_cover_url(ovrdrv_id,
cover_url)
def get_book_detail(self, br, metadata_url, mi, ovrdrv_id, log):
try:
raw = br.open_novisit(metadata_url).read()
except Exception, e:
if callable(getattr(e, 'getcode', None)) and \
e.getcode() == 404:
return False
raise
raw = xml_to_unicode(raw, strip_encoding_pats=True,
resolve_entities=True)[0]
try:
root = soupparser.fromstring(raw)
except:
return False
pub_date = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblPubDate']/text()")
lang = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblLanguage']/text()")
subjects = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblSubjects']/text()")
ebook_isbn = root.xpath("//td/label[@id='ctl00_ContentPlaceHolder1_lblIdentifier']/text()")
desc = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblDescription']/ancestor::div[1]")
if pub_date:
from calibre.utils.date import parse_date
mi.pubdate = parse_date(pub_date[0].strip())
if lang:
mi.language = lang[0].strip()
if ebook_isbn:
#print "ebook isbn is "+str(ebook_isbn[0])
isbn = check_isbn(ebook_isbn[0].strip())
if isbn:
self.cache_isbn_to_identifier(isbn, ovrdrv_id)
mi.isbn = isbn
if subjects:
mi.tags = [tag.strip() for tag in subjects[0].split(',')]
if desc:
desc = desc[0]
desc = html.tostring(desc, method='html', encoding=unicode).strip()
# remove all attributes from tags
desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
# Remove comments
desc = re.sub(r'(?s)<!--.*?-->', '', desc)
mi.comments = sanitize_comments_html(desc)
return None
if __name__ == '__main__':
# To run these tests, use:
# calibre-debug -e src/calibre/ebooks/metadata/sources/overdrive.py
from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
title_test, authors_test)
test_identify_plugin(OverDrive.name,
[
(
{'title':'Foundation and Earth',
'authors':['Asimov']},
[title_test('Foundation and Earth', exact=True),
authors_test(['Isaac Asimov'])]
),
(
{'title': 'Elephants', 'authors':['Agatha']},
[title_test('Elephants Can Remember', exact=False),
authors_test(['Agatha Christie'])]
),
])

View File

@ -949,7 +949,7 @@ class CoverFetch(QDialog): # {{{
# }}}
if __name__ == '__main__':
DEBUG_DIALOG = True
#DEBUG_DIALOG = True
app = QApplication([])
d = FullFetch()
d.start(title='great gatsby', authors=['fitzgerald'])