Add Overdrive as a metadata download source

2025-07-09 03:04:10 -04:00 · 2011-04-18 20:53:15 -06:00 · 2011-04-18 20:53:15 -06:00 · 727ada1ac4
commit 727ada1ac4
parent dd0cabf0aa 0991d2ca3c
8 changed files with 503 additions and 31 deletions
--- a/recipes/slashdot.recipe
+++ b/recipes/slashdot.recipe
@ -19,10 +19,23 @@ class Slashdot(BasicNewsRecipe):
    __author__ = 'floweros edited by Huan T'
    no_stylesheets = True
-#             keep_only_tags = [
+    keep_only_tags = [
-#                 dict(name='div',attrs={'class':'article'}),
+        dict(name='div',attrs={'id':'article'}),
-#                 dict(name='div',attrs={'class':'commentTop'}),
+        dict(name='div',attrs={'class':['postBody' 'details']}),
-#                 ]
+        dict(name='footer',attrs={'class':['clearfix meta article-foot']}),
        dict(name='article',attrs={'class':['fhitem fhitem-story article usermode thumbs grid_24']}),
        dict(name='dl',attrs={'class':'relatedPosts'}),
        dict(name='h2',attrs={'class':'story'}),
        dict(name='span',attrs={'class':'comments'}),
        ]
    remove_tags    = [
        dict(name='aside',attrs={'id':'slashboxes'}),
        dict(name='div',attrs={'class':'paginate'}),
        dict(name='section',attrs={'id':'comments'}),
        dict(name='span',attrs={'class':'topic'}),
        ]
    feeds          = [
                 (u'Slashdot',
@ -37,5 +50,3 @@ class Slashdot(BasicNewsRecipe):
 u'http://rss.slashdot.org/Slashdot/slashdotYourRightsOnline')
                 ]
             def get_article_url(self, article):
                          return article.get('feedburner_origlink', None)
--- a/src/calibre/customize/builtins.py
+++ b/src/calibre/customize/builtins.py
@ -626,8 +626,9 @@ if test_eight_code:
    from calibre.ebooks.metadata.sources.amazon import Amazon
    from calibre.ebooks.metadata.sources.openlibrary import OpenLibrary
    from calibre.ebooks.metadata.sources.isbndb import ISBNDB
    from calibre.ebooks.metadata.sources.overdrive import OverDrive
-    plugins += [GoogleBooks, Amazon, OpenLibrary, ISBNDB]
+    plugins += [GoogleBooks, Amazon, OpenLibrary, ISBNDB, OverDrive]
 # }}}
 else:
--- a/src/calibre/ebooks/conversion/preprocess.py
+++ b/src/calibre/ebooks/conversion/preprocess.py
@ -399,7 +399,7 @@ class HTMLPreProcessor(object):
                  (re.compile(u'˙\s*(<br.*?>)*\s*Z', re.UNICODE), lambda match: u'Ż'),
                  # If pdf printed from a browser then the header/footer has a reliable pattern
-                  (re.compile(r'((?<=</a>)\s*file:////?[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''),
+                  (re.compile(r'((?<=</a>)\s*file:/{2,4}[A-Z].*<br>|file:////?[A-Z].*<br>(?=\s*<hr>))', re.IGNORECASE), lambda match: ''),
                  # Center separator lines
                  (re.compile(u'<br>\s*(?P<break>([*#•✦=]+\s*)+)\s*<br>'), lambda match: '<p>\n<p style="text-align:center">' + match.group(1) + '</p>'),
--- a/src/calibre/ebooks/conversion/utils.py
+++ b/src/calibre/ebooks/conversion/utils.py
@ -764,6 +764,7 @@ class HeuristicProcessor(object):
        # Multiple sequential blank paragraphs are merged with appropriate margins
        # If non-blank scene breaks exist they are center aligned and styled with appropriate margins.
        if getattr(self.extra_opts, 'format_scene_breaks', False):
            html = re.sub('(?i)<div[^>]*>\s*<br(\s?/)?>\s*</div>', '<p></p>', html)
            html = self.detect_whitespace(html)
            html = self.detect_soft_breaks(html)
            blanks_count = len(self.any_multi_blank.findall(html))
--- a/src/calibre/ebooks/metadata/sources/base.py
+++ b/src/calibre/ebooks/metadata/sources/base.py
@ -274,26 +274,34 @@ class Source(Plugin):
        if authors:
            # Leave ' in there for Irish names
-            pat = re.compile(r'[-,:;+!@#$%^&*(){}.`~"\s\[\]/]')
+            remove_pat = re.compile(r'[,!@#$%^&*(){}`~"\s\[\]/]')
            replace_pat = re.compile(r'[-+.:;]')
            if only_first_author:
                authors = authors[:1]
            for au in authors:
                au = replace_pat.sub(' ', au)
                parts = au.split()
                if ',' in au:
                    # au probably in ln, fn form
                    parts = parts[1:] + parts[:1]
                for tok in parts:
-                    tok = pat.sub('', tok).strip()
+                    tok = remove_pat.sub('', tok).strip()
                    if len(tok) > 2 and tok.lower() not in ('von', ):
                        yield tok
-    def get_title_tokens(self, title):
+    def get_title_tokens(self, title, strip_joiners=True, strip_subtitle=False):
        '''
        Take a title and return a list of tokens useful for an AND search query.
-        Excludes connectives and punctuation.
+        Excludes connectives(optionally) and punctuation.
        '''
        if title:
            # strip sub-titles
            if strip_subtitle:
                subtitle = re.compile(r'([\(\[\{].*?[\)\]\}]|[/:\\].*$)')
                if len(subtitle.sub('', title)) > 1:
                    title = subtitle.sub('', title)
            title_patterns = [(re.compile(pat, re.IGNORECASE), repl) for pat, repl in
            [
                # Remove things like: (2010) (Omnibus) etc.
@ -305,17 +313,20 @@ class Source(Plugin):
                (r'(\d+),(\d+)', r'\1\2'),
                # Remove hyphens only if they have whitespace before them
                (r'(\s-)', ' '),
-                # Remove single quotes
+                # Remove single quotes not followed by 's'
-                (r"'", ''),
+                (r"'(?!s)", ''),
                # Replace other special chars with a space
                (r'''[:,;+!@#$%^&*(){}.`~"\s\[\]/]''', ' ')
            ]]
            for pat, repl in title_patterns:
                title = pat.sub(repl, title)
            tokens = title.split()
            for token in tokens:
                token = token.strip()
-                if token and token.lower() not in ('a', 'and', 'the'):
+                if token and (not strip_joiners or token.lower() not in ('a',
                    'and', 'the', '&')):
                    yield token
    def split_jobs(self, jobs, num):
@ -363,7 +374,11 @@ class Source(Plugin):
    def get_book_url(self, identifiers):
        '''
        Return the URL for the book identified by identifiers at this source.
-        If no URL is found, return None.
+        This URL must be browsable by a human using a browser. It is meant
        to provide a clickable link for the user to easily visit the books page
        at this source.
        If no URL is found, return None. This method must be quick, either it
        should construct the URL using a known URL scheme or use a cached URL.
        '''
        return None
--- a/src/calibre/ebooks/metadata/sources/identify.py
+++ b/src/calibre/ebooks/metadata/sources/identify.py
@ -433,7 +433,7 @@ def urls_from_identifiers(identifiers): # {{{
            pass
    isbn = identifiers.get('isbn', None)
    if isbn:
-        ans.append(('ISBN',
+        ans.append((isbn,
            'http://www.worldcat.org/search?q=bn%%3A%s&qt=advanced'%isbn))
    return ans
 # }}}
@ -444,13 +444,18 @@ if __name__ == '__main__': # tests {{{
    from calibre.ebooks.metadata.sources.test import (test_identify,
            title_test, authors_test)
    tests = [
            (
                {'title':'Magykal Papers',
                    'authors':['Sage']},
                [title_test('The Magykal Papers', exact=True)],
            ),
            ( # An e-book ISBN not on Amazon, one of the authors is
              # unknown to Amazon
                {'identifiers':{'isbn': '9780307459671'},
                    'title':'Invisible Gorilla', 'authors':['Christopher Chabris']},
-                [title_test('The Invisible Gorilla',
+                [title_test('The Invisible Gorilla', exact=True)]
                    exact=True), authors_test(['Christopher Chabris', 'Daniel Simons'])]
            ),
--- a/src/calibre/ebooks/metadata/sources/overdrive.py
+++ b/src/calibre/ebooks/metadata/sources/overdrive.py
@ -0,0 +1,439 @@
 #!/usr/bin/env  python
 from __future__ import (unicode_literals, division, absolute_import,
                        print_function)
 __license__   = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
 __docformat__ = 'restructuredtext en'
 '''
 Fetch metadata using Overdrive Content Reserve
 '''
 import re, random, mechanize, copy
 from threading import RLock
 from Queue import Queue, Empty
 from lxml import html
 from lxml.html import soupparser
 from calibre.ebooks.metadata import check_isbn
 from calibre.ebooks.metadata.sources.base import Source
 from calibre.ebooks.metadata.book.base import Metadata
 from calibre.ebooks.chardet import xml_to_unicode
 from calibre.library.comments import sanitize_comments_html
 # Results of Overdrive lookups, keyed by Overdrive id. to_ovrdrv_data stores
 # False for a lookup that found nothing. Access it while holding cache_lock.
 ovrdrv_data_cache = {}
 # NOTE(review): not referenced by any other code visible in this module --
 # confirm whether it is still needed.
 cover_url_cache = {}
 # Held around every read and write of ovrdrv_data_cache
 cache_lock = RLock()
 # Root URL of the Overdrive search site; the search, record and title-info
 # URLs used in this module are all built by appending to it.
 base_url = 'http://search.overdrive.com/'
 class OverDrive(Source):
    # Metadata source plugin that fetches book metadata and cover images
    # from Overdrive's Content Reserve search site (see base_url).
    name = 'Overdrive'
    description = _('Downloads metadata from Overdrive\'s Content Reserve')
    # This plugin implements both identify() and download_cover()
    capabilities = frozenset(['identify', 'cover'])
    # Metadata fields that identify()/get_book_detail() may fill in
    touched_fields = frozenset(['title', 'authors', 'tags', 'pubdate',
        'comments', 'publisher', 'identifier:isbn', 'series', 'series_index',
        'language', 'identifier:overdrive'])
    # get_book_detail() stores the description as sanitized HTML
    has_html_comments = True
    # NOTE(review): the two flags below are interpreted by the Source base
    # class, which is not visible here -- confirm their exact semantics.
    supports_gzip_transfer_encoding = False
    cached_cover_url_is_reliable = True
    def __init__(self, *args, **kwargs):
        '''
        Initialise the base plugin, then set this plugin's default list of
        ignored metadata fields.
        '''
        Source.__init__(self, *args, **kwargs)
        ignored_by_default = ['tags', 'pubdate', 'comments',
                'identifier:isbn', 'language']
        self.prefs.defaults['ignore_fields'] = ignored_by_default
    def identify(self, log, result_queue, abort, title=None, authors=None, # {{{
            identifiers={}, timeout=30):
        '''
        Look the book up on Overdrive and, if a record is found, put one
        Metadata object describing it on result_queue.

        The Overdrive id in identifiers (key 'overdrive') is handed to
        to_ovrdrv_data together with title and authors. If identifiers holds
        an ISBN it is cached against the Overdrive id of the record found.
        abort and timeout are accepted for interface compatibility but are
        not used by this method.

        Always returns None.
        '''
        known_id = identifiers.get('overdrive', None)
        known_isbn = identifiers.get('isbn', None)
        browser = self.browser

        record = self.to_ovrdrv_data(browser, title, authors, known_id)
        if not record:
            return None

        # record layout is documented in parse_search_results:
        # [8] is the title, [6] the creators, [7] the reserve id, [1] the
        # URL of the detail page
        mi = Metadata(record[8], record[6])
        self.parse_search_results(record, mi)
        reserve_id = record[7] if known_id is None else known_id
        if known_isbn is not None:
            self.cache_isbn_to_identifier(known_isbn, reserve_id)
        self.get_book_detail(browser, record[1], mi, reserve_id, log)
        result_queue.put(mi)
        return None
    # }}}
    def download_cover(self, log, result_queue, abort, # {{{
            title=None, authors=None, identifiers={}, timeout=30):
        '''
        Download the cover for the book and put ``(self, cover_data)`` on
        result_queue.

        The cover URL is taken from the cache when possible; otherwise
        identify() is run first and the best result with a cached cover URL
        is used. Nothing is put on the queue if no cover URL can be found,
        if abort is set, or if the download fails (the failure is logged).
        '''
        cached_url = self.get_cached_cover_url(identifiers)

        # The Overdrive id is needed for the Referer header. It may be absent
        # from identifiers when the book is only known by ISBN or by
        # title/author, so resolve it the same way the cover URL is resolved
        # instead of assuming identifiers['overdrive'] exists.
        ovrdrv_id = identifiers.get('overdrive', None)
        if ovrdrv_id is None:
            isbn = identifiers.get('isbn', None)
            if isbn is not None:
                ovrdrv_id = self.cached_isbn_to_identifier(isbn)

        if cached_url is None:
            log.info('No cached cover found, running identify')
            rq = Queue()
            self.identify(log, rq, abort, title=title, authors=authors,
                    identifiers=identifiers, timeout=timeout)
            if abort.is_set():
                return
            results = []
            while True:
                try:
                    results.append(rq.get_nowait())
                except Empty:
                    break
            results.sort(key=self.identify_results_keygen(
                title=title, authors=authors, identifiers=identifiers))
            for mi in results:
                cached_url = self.get_cached_cover_url(mi.identifiers)
                if cached_url is not None:
                    # Remember which record the cover belongs to
                    ovrdrv_id = mi.identifiers.get('overdrive', ovrdrv_id)
                    break
        if cached_url is None:
            log.info('No cover found')
            return

        if abort.is_set():
            return

        br = self.browser
        req = mechanize.Request(cached_url)
        if ovrdrv_id is not None:
            # The request is sent with the Referer of a library's cover page
            referer = self.get_base_referer()+'ContentDetails-Cover.htm?ID='+ovrdrv_id
            req.add_header('referer', referer)
        log('Downloading cover from:', cached_url)
        try:
            cdata = br.open_novisit(req, timeout=timeout).read()
            result_queue.put((self, cdata))
        except Exception:
            # Best effort: a failed cover download must not abort the caller
            log.exception('Failed to download cover from:', cached_url)
    # }}}
    def get_cached_cover_url(self, identifiers): # {{{
        url = None
        ovrdrv_id = identifiers.get('overdrive', None)
        if ovrdrv_id is None:
            isbn = identifiers.get('isbn', None)
            if isbn is not None:
                ovrdrv_id = self.cached_isbn_to_identifier(isbn)
        if ovrdrv_id is not None:
            url = self.cached_identifier_to_cover_url(ovrdrv_id)
        return url
    # }}}
    def get_base_referer(self): # to be used for passing referrer headers to cover download
        '''
        Return the base URL of one of a fixed set of library Overdrive sites,
        picked at random.
        '''
        library_sites = (
            'http://overdrive.chipublib.org/82DC601D-7DDE-4212-B43A-09D821935B01/10/375/en/',
            'http://emedia.clevnet.org/9D321DAD-EC0D-490D-BFD8-64AE2C96ECA8/10/241/en/',
            'http://singapore.lib.overdrive.com/F11D55BE-A917-4D63-8111-318E88B29740/10/382/en/',
            'http://ebooks.nypl.org/20E48048-A377-4520-BC43-F8729A42A424/10/257/en/',
            'http://spl.lib.overdrive.com/5875E082-4CB2-4689-9426-8509F354AFEF/10/335/en/',
        )
        pick = random.randrange(len(library_sites))
        return library_sites[pick]
    def format_results(self, reserveid, od_title, subtitle, series, publisher, creators, thumbimage, worldcatlink, formatid):
        '''
        Condense one raw search-result row into the nine element list used by
        the rest of this plugin:

        [cover_url, social_metadata_url, worldcatlink, series, series_num,
        publisher, creators, reserveid, title]

        When the row has no series, a non-empty subtitle is appended to the
        title. When it has a series, a number at the end of the subtitle is
        used as the series number; series_num is '' if there is none.
        '''
        # The URLs arrive with their slashes escaped as \/
        thumbimage = thumbimage.replace('\\/', '/')
        worldcatlink = worldcatlink.replace('\\/', '/')
        # Rewrite the "200" suffix of the image name/type to "100"
        cover_url = re.sub(r'(?P<img>(Ima?g(eType-)?))200', r'\g<img>100', thumbimage)
        social_metadata_url = base_url+'TitleInfo.aspx?ReserveID='+reserveid+'&FormatID='+formatid

        title = od_title
        series_num = ''
        if series:
            trailing_number = re.search("([0-9]+$)", subtitle)
            if trailing_number:
                series_num = float(trailing_number.group(1))
        elif subtitle:
            title = od_title+': '+subtitle

        return [cover_url, social_metadata_url, worldcatlink, series, series_num, publisher, creators, reserveid, title]
    def safe_query(self, br, query_url, post=''):
        '''
        Open query_url (as a POST when post is non-empty) without letting the
        response change the browser's cookies.

        The query must be initialized by loading an empty search results page;
        this page attempts to set a cookie that Mechanize doesn't like. The
        current cookies are therefore copied into a separate cookiejar before
        the request is made, and the browser is switched to that copy
        afterwards.
        '''
        snapshot = mechanize.CookieJar()
        for cookie in br._ua_handlers['_cookies'].cookiejar:
            snapshot.set_cookie(copy.deepcopy(cookie))

        request_args = (query_url, post) if post else (query_url,)
        br.open_novisit(*request_args)

        br.set_cookiejar(snapshot)
    def overdrive_search(self, br, q, title, author):
        '''
        Run a keyword search on the Overdrive site rooted at q and return the
        best matching record as formatted by format_results, or '' if nothing
        matches.

        The longer of the title/author token lists is used as the keyword
        query and the other as a filter on the results. If that filter leaves
        no rows although the keyword query matched some, the results are
        fetched once more without the filter.
        '''
        import json

        # re-initialize the cookiejar so that it's clean
        br.set_cookiejar(mechanize.CookieJar())
        q_query = q+'default.aspx/SearchByKeyword'
        q_init_search = q+'SearchResults.aspx'
        q_results = q+'SearchResults.svc/GetResults?iDisplayLength=50&sSearch='

        # get first author as string - convert this to a proper cleanup function later
        author_tokens = list(self.get_author_tokens(author,
                only_first_author=True))
        title_tokens = list(self.get_title_tokens(title,
                strip_joiners=False, strip_subtitle=True))

        if len(title_tokens) >= len(author_tokens):
            initial_q = ' '.join(title_tokens)
            xref_q = '+'.join(author_tokens)
        else:
            initial_q = ' '.join(author_tokens)
            xref_q = '+'.join(title_tokens)

        # Serialise with json so that quotes or backslashes in the keywords
        # cannot produce an invalid request body
        query = json.dumps({'szKeyword': initial_q}, separators=(',', ':'))

        # main query, requires specific Content Type header
        req = mechanize.Request(q_query)
        req.add_header('Content-Type', 'application/json; charset=utf-8')
        br.open_novisit(req, query)

        # initiate the search without messing up the cookiejar
        self.safe_query(br, q_init_search)

        counts_pat = re.compile(
            r'"iTotalDisplayRecords":(?P<displayrecords>\d+).*?"iTotalRecords":(?P<totalrecords>\d+)')
        # First try with the cross-reference filter, then (only if needed)
        # once without it. Bounding the attempts guarantees termination even
        # if the server keeps answering with zero displayed rows.
        filters = [xref_q]
        if xref_q:
            filters.append('')
        raw = None
        for search_filter in filters:
            xreq = mechanize.Request(q_results+search_filter)
            xreq.add_header('X-Requested-With', 'XMLHttpRequest')
            xreq.add_header('Referer', q_init_search)
            xreq.add_header('Accept', 'application/json, text/javascript, */*')
            raw = br.open_novisit(xreq).read()
            m = counts_pat.search(raw)
            if m is None:
                # Unrecognised response; retrying cannot help
                return ''
            if int(m.group('displayrecords')) >= 1:
                break
            if int(m.group('totalrecords')) == 0:
                return ''
        else:
            # Even the unfiltered request displayed no rows
            return ''

        return self.sort_ovrdrv_results(raw, title, title_tokens, author, author_tokens)
    def sort_ovrdrv_results(self, raw, title=None, title_tokens=None, author=None, author_tokens=None, ovrdrv_id=None):
        '''
        Pick the best record out of a raw search response and return it as
        formatted by format_results, or '' if no record qualifies.

        raw is the response text; the rows are taken from the first [[...]]
        section in it. Only rows whose format id is one of 1, 50, 410 or 900
        are considered. Selection order:

          * if ovrdrv_id is given, the first such row;
          * otherwise the first row whose title equals title and whose first
            creator equals author[0];
          * otherwise the first row with a thumbnail whose title contains
            every title token and one of whose creators contains every
            author token. Rows that have both a subtitle and a series are
            preferred.

        Raises ValueError if the rows are not plain literals or do not have
        the expected 21 columns.
        '''
        import ast

        preferred_formats = (1, 50, 410, 900)
        # Callers looking a record up by id pass no tokens at all
        title_tokens = title_tokens or ()
        author_tokens = author_tokens or ()
        close_matches = []
        raw = re.sub(r'.*?\[\[(?P<content>.*?)\]\].*', r'[[\g<content>]]', raw)
        # SECURITY: raw comes from the network, so it must never be handed to
        # eval(). literal_eval only accepts literals. Compiling here (rather
        # than passing the string) keeps this module's unicode_literals
        # setting in effect for the parsed strings.
        results = ast.literal_eval(
            compile(raw, '<overdrive results>', 'eval', ast.PyCF_ONLY_AST))
        # The search results are either from a keyword search or a multi-format list from a single ID,
        # sort through the results for closest match/format
        if not results:
            return ''

        for (reserveid, od_title, subtitle, edition, series, publisher, _format, formatid, creators,
                thumbimage, shortdescription, worldcatlink, excerptlink, creatorfile, sorttitle,
                availabletolibrary, availabletoretailer, relevancyrank, unknown1, unknown2,
                unknown3) in results:
            # Every way of selecting a row requires a preferred format
            if int(formatid) not in preferred_formats:
                continue
            # Always hand a list of creators to format_results, whichever
            # branch selects the row
            creators = creators.split(', ')

            if ovrdrv_id is not None:
                return self.format_results(reserveid, od_title, subtitle, series, publisher,
                        creators, thumbimage, worldcatlink, formatid)

            # if an exact match in a preferred format occurs
            if author and creators[0] == author[0] and od_title == title:
                return self.format_results(reserveid, od_title, subtitle, series, publisher,
                        creators, thumbimage, worldcatlink, formatid)

            lowered_title = od_title.lower()
            close_title_match = bool(title_tokens) and all(
                    token.lower() in lowered_title for token in title_tokens)
            # NB: the loop variable must not be called "author", that would
            # clobber the author argument used by the exact match test above
            close_author_match = bool(author_tokens) and any(
                    all(token.lower() in creator.lower() for token in author_tokens)
                    for creator in creators)

            if close_title_match and close_author_match and thumbimage:
                formatted = self.format_results(reserveid, od_title, subtitle, series,
                        publisher, creators, thumbimage, worldcatlink, formatid)
                if subtitle and series:
                    close_matches.insert(0, formatted)
                else:
                    close_matches.append(formatted)

        if close_matches:
            return close_matches[0]
        return ''
    def overdrive_get_record(self, br, q, ovrdrv_id):
        '''
        Fetch the record(s) for a single Overdrive id from the site rooted at
        q and return the preferred one as formatted by format_results, or ''
        if none qualifies.

        The browser's cookiejar is replaced with an empty one both before and
        after the lookup.
        '''
        search_url = q+'SearchResults.aspx?ReserveID={'+ovrdrv_id+'}'
        results_url = q+'SearchResults.svc/GetResults?sEcho=1&iColumns=18&sColumns=ReserveID%2CTitle%2CSubtitle%2CEdition%2CSeries%2CPublisher%2CFormat%2CFormatID%2CCreators%2CThumbImage%2CShortDescription%2CWorldCatLink%2CExcerptLink%2CCreatorFile%2CSortTitle%2CAvailableToLibrary%2CAvailableToRetailer%2CRelevancyRank&iDisplayStart=0&iDisplayLength=10&sSearch=&bEscapeRegex=true&iSortingCols=1&iSortCol_0=17&sSortDir_0=asc'

        # re-initialize the cookiejar so that it's clean
        br.set_cookiejar(mechanize.CookieJar())
        # get the base url to set the proper session cookie
        br.open_novisit(q)

        # initialize the search
        self.safe_query(br, search_url)

        # get the results
        req = mechanize.Request(results_url)
        req.add_header('X-Requested-With', 'XMLHttpRequest')
        req.add_header('Referer', search_url)
        req.add_header('Accept', 'application/json, text/javascript, */*')
        # Read the body as text, exactly as overdrive_search does; the repr
        # of a list of lines would double every backslash escape in it
        raw = br.open_novisit(req).read()
        br.set_cookiejar(mechanize.CookieJar())
        # ovrdrv_id must be passed by keyword: positionally it would land in
        # one of the title/author parameters of sort_ovrdrv_results
        return self.sort_ovrdrv_results(raw, ovrdrv_id=ovrdrv_id)
    def find_ovrdrv_data(self, br, title, author, isbn, ovrdrv_id=None):
        '''
        Fetch the record for ovrdrv_id when one is given, otherwise search by
        title and author. isbn is accepted but not used.
        '''
        q = base_url
        if ovrdrv_id is not None:
            return self.overdrive_get_record(br, q, ovrdrv_id)
        return self.overdrive_search(br, q, title, author)
    def to_ovrdrv_data(self, br, title=None, author=None, ovrdrv_id=None):
        '''
        Takes either a title/author combo or an Overdrive ID.  One of these
        two must be passed to this function.

        Returns the formatted record (see format_results) on success. When
        nothing is found the return value is falsy: None for an id that is
        cached as not found, False otherwise.

        An id lookup that fails or finds nothing falls back to a title/author
        search when a title or author is available. Errors raised by a lookup
        are printed and treated as "nothing found".
        '''
        import traceback

        ovrdrv_data = None
        found_by_id = False

        if ovrdrv_id is not None:
            with cache_lock:
                ans = ovrdrv_data_cache.get(ovrdrv_id, None)
            if ans:
                return ans
            elif ans is False:
                return None
            try:
                # find_ovrdrv_data's fourth positional parameter is isbn, so
                # the id has to be passed by keyword
                ovrdrv_data = self.find_ovrdrv_data(br, title, author, None,
                        ovrdrv_id=ovrdrv_id)
            except Exception:
                traceback.print_exc()
                ovrdrv_data = None
            found_by_id = bool(ovrdrv_data)

        if not ovrdrv_data and (ovrdrv_id is None or title or author):
            try:
                ovrdrv_data = self.find_ovrdrv_data(br, title, author, None)
            except Exception:
                traceback.print_exc()
                ovrdrv_data = None

        with cache_lock:
            if ovrdrv_data:
                # Index 7 is the reserve id of the record that was found
                ovrdrv_data_cache[ovrdrv_data[7]] = ovrdrv_data
                if found_by_id:
                    ovrdrv_data_cache[ovrdrv_id] = ovrdrv_data
            elif ovrdrv_id is not None:
                # Remember the miss; never store anything under the key None
                ovrdrv_data_cache[ovrdrv_id] = False

        return ovrdrv_data if ovrdrv_data else False
    def parse_search_results(self, ovrdrv_data, mi):
        '''
        Copy the values of a formatted Overdrive search result into the
        metadata object mi and cache its cover URL.

        ovrdrv_data is the list built by format_results:
        [cover_url[0], social_metadata_url[1], worldcatlink[2], series[3], series_num[4],
        publisher[5], creators[6], reserveid[7], title[8]]
        '''
        reserve_id = ovrdrv_data[7]
        mi.set_identifier('overdrive', reserve_id)

        series_name = ovrdrv_data[3]
        if len(series_name) > 1:
            mi.series = series_name
            series_number = ovrdrv_data[4]
            if series_number:
                try:
                    mi.series_index = float(series_number)
                except:
                    # Best effort: leave series_index unset if it is unusable
                    pass

        mi.publisher = ovrdrv_data[5]
        mi.authors = ovrdrv_data[6]
        mi.title = ovrdrv_data[8]

        cover = ovrdrv_data[0]
        if cover:
            self.cache_identifier_to_cover_url(reserve_id, cover)
    def get_book_detail(self, br, metadata_url, mi, ovrdrv_id, log):
        '''
        Download the title-info page at metadata_url and copy the publication
        date, language, ISBN, subjects and description found on it into mi.

        Returns False if the page gives a 404 or cannot be parsed, None
        otherwise. Any other download error is re-raised. log is accepted
        but not used.
        '''
        try:
            raw = br.open_novisit(metadata_url).read()
        except Exception as e:
            getcode = getattr(e, 'getcode', None)
            if callable(getcode) and e.getcode() == 404:
                return False
            raise
        raw = xml_to_unicode(raw, strip_encoding_pats=True,
                resolve_entities=True)[0]
        try:
            root = soupparser.fromstring(raw)
        except:
            return False

        find = root.xpath
        pub_date = find("//div/label[@id='ctl00_ContentPlaceHolder1_lblPubDate']/text()")
        lang = find("//div/label[@id='ctl00_ContentPlaceHolder1_lblLanguage']/text()")
        subjects = find("//div/label[@id='ctl00_ContentPlaceHolder1_lblSubjects']/text()")
        ebook_isbn = find("//td/label[@id='ctl00_ContentPlaceHolder1_lblIdentifier']/text()")
        desc = find("//div/label[@id='ctl00_ContentPlaceHolder1_lblDescription']/ancestor::div[1]")

        if pub_date:
            from calibre.utils.date import parse_date
            mi.pubdate = parse_date(pub_date[0].strip())

        if lang:
            mi.language = lang[0].strip()

        if ebook_isbn:
            isbn = check_isbn(ebook_isbn[0].strip())
            if isbn:
                # Remember which Overdrive record this ISBN belongs to
                self.cache_isbn_to_identifier(isbn, ovrdrv_id)
                mi.isbn = isbn

        if subjects:
            mi.tags = [subject.strip() for subject in subjects[0].split(',')]

        if desc:
            markup = html.tostring(desc[0], method='html', encoding=unicode).strip()
            # remove all attributes from tags
            markup = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', markup)
            # Remove comments
            markup = re.sub(r'(?s)<!--.*?-->', '', markup)
            mi.comments = sanitize_comments_html(markup)

        return None
 if __name__ == '__main__':
    # Manual test driver for this plugin; the identify calls below run
    # against the live Overdrive site.
    # To run these tests use:
    # calibre-debug -e src/calibre/ebooks/metadata/sources/overdrive.py
    from calibre.ebooks.metadata.sources.test import (test_identify_plugin,
            title_test, authors_test)
    # Each case is a pair: the keyword arguments for identify, and the
    # checks to run on the result.
    test_identify_plugin(OverDrive.name,
        [
            (
                # Full title with the author's surname only
                {'title':'Foundation and Earth',
                    'authors':['Asimov']},
                [title_test('Foundation and Earth', exact=True),
                    authors_test(['Isaac Asimov'])]
            ),
            (
                # Partial title with the author's first name only
                {'title': 'Elephants', 'authors':['Agatha']},
                [title_test('Elephants Can Remember', exact=False),
                    authors_test(['Agatha Christie'])]
            ),
    ])
--- a/src/calibre/gui2/metadata/single_download.py
+++ b/src/calibre/gui2/metadata/single_download.py
@ -949,7 +949,7 @@ class CoverFetch(QDialog): # {{{
 # }}}
 if __name__ == '__main__':
-    DEBUG_DIALOG = True
+    #DEBUG_DIALOG = True
    app = QApplication([])
    d = FullFetch()
    d.start(title='great gatsby', authors=['fitzgerald'])