Mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-07-09 03:04:10 -04:00)
Initial import of new metadata download framework
This commit is contained in:
parent
5d4c738862
commit
d2ba1812bb
61
src/calibre/ebooks/metadata/sources/base.py
Normal file
61
src/calibre/ebooks/metadata/sources/base.py
Normal file
@ -0,0 +1,61 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||||
|
|
||||||
|
__license__ = 'GPL v3'
|
||||||
|
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||||
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
from calibre.customize import Plugin
|
||||||
|
|
||||||
|
class Source(Plugin):
    '''
    Base class for all metadata download source plugins.
    '''

    # Plugin type label shown in the GUI. _() is calibre's translation
    # function, injected into builtins at application startup.
    type = _('Metadata source')
    author = 'Kovid Goyal'

    supported_platforms = ['windows', 'osx', 'linux']

    # If True, the Metadata objects put on the result queue by identify()
    # are complete; no further per-result download step is required.
    result_of_identify_is_complete = True

    def get_author_tokens(self, authors):
        '''
        Take a list of authors and generate tokens useful for an
        AND search query.

        Punctuation is stripped from every whitespace-separated token
        (the apostrophe is kept, as it is significant in Irish names).
        Tokens that become empty after stripping are skipped, so the
        search query is never polluted with empty terms.
        '''
        # Leave ' in there for Irish names
        pat = re.compile(r'[-,:;+!@#$%^&*(){}.`~"\s\[\]/]')
        for au in authors:
            for tok in au.split():
                tok = pat.sub('', tok)
                if tok:
                    yield tok

    def split_jobs(self, jobs, num):
        '''
        Split a list of jobs into at most num groups, as evenly as
        possible. Empty groups are dropped from the result.
        '''
        if num < 1:
            # Guard against an infinite loop: with no groups to fill, the
            # round-robin below could never drain a non-empty job list.
            return []
        groups = [[] for i in range(num)]
        jobs = list(jobs)
        while jobs:
            for gr in groups:
                try:
                    job = jobs.pop()
                except IndexError:
                    break
                gr.append(job)
        return [g for g in groups if g]

    def identify(self, log, result_queue, abort, title=None, authors=None,
            identifiers={}):
        '''
        Identify a book by its title/author/isbn/etc.

        :param log: A log object, use it to output debugging information/errors
        :param result_queue: A result Queue, results should be put into it.
                             Each result is a Metadata object
        :param abort: If abort.is_set() returns True, abort further processing
                      and return as soon as possible
        :param title: The title of the book, can be None
        :param authors: A list of authors of the book, can be None
        :param identifiers: A dictionary of other identifiers, most commonly
                            {'isbn':'1234...'}
        :return: None if no errors occurred, otherwise a unicode representation
                 of the error suitable for showing to the user
        '''
        return None
|
||||||
|
|
215
src/calibre/ebooks/metadata/sources/google.py
Normal file
215
src/calibre/ebooks/metadata/sources/google.py
Normal file
@ -0,0 +1,215 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||||
|
|
||||||
|
__license__ = 'GPL v3'
|
||||||
|
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||||
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
|
import time
|
||||||
|
from urllib import urlencode
|
||||||
|
from functools import partial
|
||||||
|
from threading import Thread
|
||||||
|
|
||||||
|
from lxml import etree
|
||||||
|
|
||||||
|
from calibre.ebooks.metadata.sources import Source
|
||||||
|
from calibre.ebooks.metadata.book.base import Metadata
|
||||||
|
from calibre.utils.date import parse_date, utcnow
|
||||||
|
from calibre import browser, as_unicode
|
||||||
|
|
||||||
|
# XML namespaces used by the Google Books data API atom feeds.
NAMESPACES = {
    'openSearch':'http://a9.com/-/spec/opensearchrss/1.0/',
    'atom' : 'http://www.w3.org/2005/Atom',
    'dc': 'http://purl.org/dc/terms'
}
# Factory for compiled XPath expressions that understand the prefixes above.
XPath = partial(etree.XPath, namespaces=NAMESPACES)

# Pre-compiled XPath accessors for the feed-level and entry-level fields
# read by to_metadata() and GoogleBooks.identify() below.
total_results = XPath('//openSearch:totalResults')
start_index = XPath('//openSearch:startIndex')
items_per_page = XPath('//openSearch:itemsPerPage')
entry = XPath('//atom:entry')
entry_id = XPath('descendant::atom:id')
creator = XPath('descendant::dc:creator')
identifier = XPath('descendant::dc:identifier')
title = XPath('descendant::dc:title')
date = XPath('descendant::dc:date')
publisher = XPath('descendant::dc:publisher')
subject = XPath('descendant::dc:subject')
description = XPath('descendant::dc:description')
language = XPath('descendant::dc:language')
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def to_metadata(browser, log, entry_):
    '''
    Convert a single atom:entry element from a Google Books feed into a
    Metadata object, fetching the entry's own detail feed for the extra
    fields (comments, publisher, author sort, ISBN, tags, pubdate).

    :param browser: Browser object used to fetch the entry's detail feed
    :param log: Log object for error reporting
    :param entry_: lxml element for the atom:entry
    :return: A Metadata object, or None if the entry lacks an id URL or
             a title. If fetching the detail feed fails, the partially
             filled Metadata (title/authors only) is returned.
    '''

    def get_text(extra, x):
        # Evaluate the compiled XPath x against extra and return the
        # stripped text of the first match, or None.
        try:
            ans = x(extra)
            if ans:
                ans = ans[0].text
                if ans and ans.strip():
                    return ans.strip()
        except:
            log.exception('Programming error:')
        return None

    id_url = entry_id(entry_)[0].text
    title_ = ': '.join([x.text for x in title(entry_)]).strip()
    authors = [x.text.strip() for x in creator(entry_) if x.text]
    if not authors:
        authors = [_('Unknown')]
    # BUG FIX: the original tested ``not title`` — the module-level XPath
    # callable, which is always truthy — so entries without a title were
    # never discarded. Test the extracted title string instead.
    if not id_url or not title_:
        # Silently discard this entry
        return None

    mi = Metadata(title_, authors)
    try:
        raw = browser.open(id_url).read()
        feed = etree.fromstring(raw)
        extra = entry(feed)[0]
    except:
        log.exception('Failed to get additional details for', mi.title)
        return mi

    mi.comments = get_text(extra, description)
    #mi.language = get_text(extra, language)
    mi.publisher = get_text(extra, publisher)

    # Author sort: Google marks it with a *file-as attribute on dc:creator
    for x in creator(extra):
        for key, val in x.attrib.items():
            if key.endswith('file-as') and val and val.strip():
                mi.author_sort = val
                break

    # ISBN: identifiers come prefixed (ISBN:, LCCN:, OCLC:); only ISBNs
    # are collected here
    isbns = []
    for x in identifier(extra):
        t = str(x.text).strip()
        if t[:5].upper() == 'ISBN:':
            isbns.append(t[5:])
    if isbns:
        # Prefer the longest identifier, i.e. ISBN-13 over ISBN-10
        mi.isbn = sorted(isbns, key=len)[-1]

    # Tags: Google subjects are '/'-separated hierarchies; flatten and
    # de-duplicate them
    try:
        btags = [x.text for x in subject(extra) if x.text]
        tags = []
        for t in btags:
            tags.extend([y.strip() for y in t.split('/')])
        tags = list(sorted(list(set(tags))))
    except:
        log.exception('Failed to parse tags:')
        tags = []
    if tags:
        # Commas would be interpreted as tag separators by calibre
        mi.tags = [x.replace(',', ';') for x in tags]

    # pubdate
    pubdate = get_text(extra, date)
    if pubdate:
        try:
            # Feeds often carry only year/month; default the day to
            # mid-month so timezone conversion cannot shift the month
            default = utcnow().replace(day=15)
            mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
        except:
            log.exception('Failed to parse pubdate')

    return mi
|
||||||
|
|
||||||
|
class Worker(Thread):
    '''
    Daemon thread that converts a batch of feed entries into Metadata
    objects and puts each successful result onto the shared queue,
    stopping early when the abort event is set.
    '''

    def __init__(self, log, entries, abort, result_queue):
        Thread.__init__(self)
        self.daemon = True
        self.browser = browser()
        self.log = log
        self.entries = entries
        self.abort = abort
        self.result_queue = result_queue

    def run(self):
        for raw_entry in self.entries:
            try:
                mi = to_metadata(self.browser, self.log, raw_entry)
                if mi is not None:
                    self.result_queue.put(mi)
            except:
                self.log.exception(
                    'Failed to get metadata for identify entry:',
                    etree.tostring(raw_entry))
            if self.abort.is_set():
                break
|
||||||
|
|
||||||
|
|
||||||
|
class GoogleBooks(Source):
|
||||||
|
|
||||||
|
name = 'Google Books'
|
||||||
|
|
||||||
|
def create_query(self, log, title=None, authors=None, identifiers={},
|
||||||
|
start_index=1):
|
||||||
|
BASE_URL = 'http://books.google.com/books/feeds/volumes?'
|
||||||
|
isbn = identifiers.get('isbn', None)
|
||||||
|
q = ''
|
||||||
|
if isbn is not None:
|
||||||
|
q += 'isbn:'+isbn
|
||||||
|
elif title or authors:
|
||||||
|
def build_term(prefix, parts):
|
||||||
|
return ' '.join('in'+prefix + ':' + x for x in parts)
|
||||||
|
if title is not None:
|
||||||
|
q += build_term('title', title.split())
|
||||||
|
if authors:
|
||||||
|
q += ('+' if q else '')+build_term('author',
|
||||||
|
self.get_author_tokens(authors))
|
||||||
|
|
||||||
|
if isinstance(q, unicode):
|
||||||
|
q = q.encode('utf-8')
|
||||||
|
if not q:
|
||||||
|
return None
|
||||||
|
return BASE_URL+urlencode({
|
||||||
|
'q':q,
|
||||||
|
'max-results':20,
|
||||||
|
'start-index':start_index,
|
||||||
|
'min-viewability':'none',
|
||||||
|
})
|
||||||
|
|
||||||
|
|
||||||
|
def identify(self, log, result_queue, abort, title=None, authors=None, identifiers={}):
|
||||||
|
query = self.create_query(log, title=title, authors=authors,
|
||||||
|
identifiers=identifiers)
|
||||||
|
try:
|
||||||
|
raw = browser().open_novisit(query).read()
|
||||||
|
except Exception, e:
|
||||||
|
log.exception('Failed to make identify query: %r'%query)
|
||||||
|
return as_unicode(e)
|
||||||
|
|
||||||
|
try:
|
||||||
|
parser = etree.XMLParser(recover=True, no_network=True)
|
||||||
|
feed = etree.fromstring(raw, parser=parser)
|
||||||
|
entries = entry(feed)
|
||||||
|
except Exception, e:
|
||||||
|
log.exception('Failed to parse identify results')
|
||||||
|
return as_unicode(e)
|
||||||
|
|
||||||
|
|
||||||
|
groups = self.split_jobs(entries, 5) # At most 5 threads
|
||||||
|
if not groups:
|
||||||
|
return
|
||||||
|
workers = [Worker(log, entries, abort, result_queue) for entries in
|
||||||
|
groups]
|
||||||
|
|
||||||
|
if abort.is_set():
|
||||||
|
return
|
||||||
|
|
||||||
|
for worker in workers: worker.start()
|
||||||
|
|
||||||
|
has_alive_worker = True
|
||||||
|
while has_alive_worker and not abort.is_set():
|
||||||
|
has_alive_worker = False
|
||||||
|
for worker in workers:
|
||||||
|
if worker.is_alive():
|
||||||
|
has_alive_worker = True
|
||||||
|
time.sleep(0.1)
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
x
Reference in New Issue
Block a user