Initial import of new metadata download framework

Kovid Goyal 2011-01-31 20:09:26 -07:00
parent 5d4c738862
commit d2ba1812bb
2 changed files with 276 additions and 0 deletions


@@ -0,0 +1,61 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai

__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

import re

from calibre.customize import Plugin

class Source(Plugin):

    type = _('Metadata source')
    author = 'Kovid Goyal'
    supported_platforms = ['windows', 'osx', 'linux']

    result_of_identify_is_complete = True

    def get_author_tokens(self, authors):
        'Take a list of authors and generate tokens useful for an AND search query'
        # Leave ' in there for Irish names
        pat = re.compile(r'[-,:;+!@#$%^&*(){}.`~"\s\[\]/]')
        for au in authors:
            for tok in au.split():
                yield pat.sub('', tok)

    def split_jobs(self, jobs, num):
        'Split a list of jobs into at most num groups, as evenly as possible'
        groups = [[] for i in range(num)]
        jobs = list(jobs)
        while jobs:
            for gr in groups:
                try:
                    job = jobs.pop()
                except IndexError:
                    break
                gr.append(job)
        return [g for g in groups if g]

    def identify(self, log, result_queue, abort, title=None, authors=None,
            identifiers={}):
        '''
        Identify a book by its title/author/isbn/etc.

        :param log: A log object, use it to output debugging information/errors
        :param result_queue: A result Queue, results should be put into it.
                             Each result is a Metadata object
        :param abort: If abort.is_set() returns True, abort further processing
                      and return as soon as possible
        :param title: The title of the book, can be None
        :param authors: A list of authors of the book, can be None
        :param identifiers: A dictionary of other identifiers, most commonly
                            {'isbn':'1234...'}
        :return: None if no errors occurred, otherwise a unicode representation
                 of the error suitable for showing to the user
        '''
        return None
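
A minimal sketch of how a concrete source might subclass this API; the
ExampleSource class, its fabricated result, and the direct helper calls
below are illustrative assumptions, not part of this commit:

    from calibre.ebooks.metadata.book.base import Metadata
    from calibre.ebooks.metadata.sources import Source

    class ExampleSource(Source):  # hypothetical plugin

        name = 'Example'

        def identify(self, log, result_queue, abort, title=None,
                authors=None, identifiers={}):
            if abort.is_set():
                return None
            # A real source would query a web service here; this fabricates
            # one result to show the queue protocol.
            result_queue.put(Metadata(title or 'Unknown', authors))
            return None

    src = ExampleSource(None)  # Plugin instances take a plugin_path argument
    print list(src.get_author_tokens(["O'Brien, Flann"]))
    # ["O'Brien", 'Flann'] -- punctuation other than ' is stripped
    print src.split_jobs(range(5), 2)
    # [[4, 2, 0], [3, 1]] -- jobs are popped from the end, so order changes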


@@ -0,0 +1,215 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai

__license__ = 'GPL v3'
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
__docformat__ = 'restructuredtext en'

import time
from urllib import urlencode
from functools import partial
from threading import Thread

from lxml import etree

from calibre.ebooks.metadata.sources import Source
from calibre.ebooks.metadata.book.base import Metadata
from calibre.utils.date import parse_date, utcnow
from calibre import browser, as_unicode

NAMESPACES = {
    'openSearch': 'http://a9.com/-/spec/opensearchrss/1.0/',
    'atom': 'http://www.w3.org/2005/Atom',
    'dc': 'http://purl.org/dc/terms'
}
XPath = partial(etree.XPath, namespaces=NAMESPACES)

total_results = XPath('//openSearch:totalResults')
start_index = XPath('//openSearch:startIndex')
items_per_page = XPath('//openSearch:itemsPerPage')
entry = XPath('//atom:entry')
entry_id = XPath('descendant::atom:id')
creator = XPath('descendant::dc:creator')
identifier = XPath('descendant::dc:identifier')
title = XPath('descendant::dc:title')
date = XPath('descendant::dc:date')
publisher = XPath('descendant::dc:publisher')
subject = XPath('descendant::dc:subject')
description = XPath('descendant::dc:description')
language = XPath('descendant::dc:language')
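
# Each name above is a pre-compiled XPath expression with the feed
# namespaces already bound, so e.g. creator(entry_) returns the dc:creator
# elements found under a given atom entry.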

def to_metadata(browser, log, entry_):

    def get_text(extra, x):
        try:
            ans = x(extra)
            if ans:
                ans = ans[0].text
                if ans and ans.strip():
                    return ans.strip()
        except:
            log.exception('Programming error:')
        return None

    id_url = entry_id(entry_)[0].text
    title_ = ': '.join([x.text for x in title(entry_) if x.text]).strip()
    authors = [x.text.strip() for x in creator(entry_) if x.text]
    if not authors:
        authors = [_('Unknown')]
    if not id_url or not title_:
        # Silently discard this entry
        return None

    mi = Metadata(title_, authors)
    try:
        raw = browser.open(id_url).read()
        feed = etree.fromstring(raw)
        extra = entry(feed)[0]
    except:
        log.exception('Failed to get additional details for', mi.title)
        return mi

    mi.comments = get_text(extra, description)
    #mi.language = get_text(extra, language)
    mi.publisher = get_text(extra, publisher)

    # Author sort
    for x in creator(extra):
        for key, val in x.attrib.items():
            if key.endswith('file-as') and val and val.strip():
                mi.author_sort = val
                break

    # ISBN
    isbns = []
    for x in identifier(extra):
        t = str(x.text).strip()
        if t[:5].upper() in ('ISBN:', 'LCCN:', 'OCLC:'):
            if t[:5].upper() == 'ISBN:':
                isbns.append(t[5:])
    if isbns:
        mi.isbn = sorted(isbns, key=len)[-1]

    # Tags
    try:
        btags = [x.text for x in subject(extra) if x.text]
        tags = []
        for t in btags:
            tags.extend([y.strip() for y in t.split('/')])
        tags = list(sorted(list(set(tags))))
    except:
        log.exception('Failed to parse tags:')
        tags = []
    if tags:
        mi.tags = [x.replace(',', ';') for x in tags]

    # pubdate
    pubdate = get_text(extra, date)
    if pubdate:
        try:
            default = utcnow().replace(day=15)
            mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
        except:
            log.exception('Failed to parse pubdate')

    return mi

class Worker(Thread):

    def __init__(self, log, entries, abort, result_queue):
        self.browser, self.log, self.entries = browser(), log, entries
        self.abort, self.result_queue = abort, result_queue
        Thread.__init__(self)
        self.daemon = True

    def run(self):
        for i in self.entries:
            try:
                ans = to_metadata(self.browser, self.log, i)
                if ans is not None:
                    self.result_queue.put(ans)
            except:
                self.log.exception(
                    'Failed to get metadata for identify entry:',
                    etree.tostring(i))
            if self.abort.is_set():
                break

class GoogleBooks(Source):

    name = 'Google Books'

    def create_query(self, log, title=None, authors=None, identifiers={},
            start_index=1):
        BASE_URL = 'http://books.google.com/books/feeds/volumes?'
        isbn = identifiers.get('isbn', None)
        q = ''
        if isbn is not None:
            q += 'isbn:' + isbn
        elif title or authors:
            def build_term(prefix, parts):
                return ' '.join('in' + prefix + ':' + x for x in parts)
            if title is not None:
                q += build_term('title', title.split())
            if authors:
                q += ('+' if q else '') + build_term('author',
                        self.get_author_tokens(authors))
        if isinstance(q, unicode):
            q = q.encode('utf-8')
        if not q:
            return None
        return BASE_URL + urlencode({
            'q': q,
            'max-results': 20,
            'start-index': start_index,
            'min-viewability': 'none',
        })
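
    # Illustrative example: with title='Great Expectations' and
    # authors=['Charles Dickens'], the URL returned above is roughly
    # (parameter order may vary):
    #
    #   http://books.google.com/books/feeds/volumes?
    #       q=intitle%3AGreat+intitle%3AExpectations%2B
    #       inauthor%3ACharles+inauthor%3ADickens
    #       &max-results=20&start-index=1&min-viewability=none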

    def identify(self, log, result_queue, abort, title=None, authors=None,
            identifiers={}):
        query = self.create_query(log, title=title, authors=authors,
                identifiers=identifiers)
        if query is None:
            return _('Insufficient metadata to construct a query')
        try:
            raw = browser().open_novisit(query).read()
        except Exception, e:
            log.exception('Failed to make identify query: %r'%query)
            return as_unicode(e)

        try:
            parser = etree.XMLParser(recover=True, no_network=True)
            feed = etree.fromstring(raw, parser=parser)
            entries = entry(feed)
        except Exception, e:
            log.exception('Failed to parse identify results')
            return as_unicode(e)

        groups = self.split_jobs(entries, 5) # At most 5 threads
        if not groups:
            return None
        workers = [Worker(log, entries, abort, result_queue) for entries in
                groups]

        if abort.is_set():
            return None

        for worker in workers:
            worker.start()

        has_alive_worker = True
        while has_alive_worker and not abort.is_set():
            has_alive_worker = False
            for worker in workers:
                if worker.is_alive():
                    has_alive_worker = True
            time.sleep(0.1)

        return None
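
To show how the pieces connect, here is a minimal, hypothetical driver for
this source; the Queue/Event wiring and the use of default_log are
assumptions for illustration, not part of this commit:

    from Queue import Queue
    from threading import Event

    from calibre.utils.logging import default_log

    def fetch_google_books(title=None, authors=None):
        # Hypothetical helper: drives GoogleBooks.identify() synchronously
        result_queue = Queue()
        abort = Event()  # another thread may call abort.set() to cancel
        err = GoogleBooks(None).identify(default_log, result_queue, abort,
                title=title, authors=authors)
        if err:
            raise ValueError(err)
        # identify() blocks until its workers finish, so the queue is complete
        results = []
        while not result_queue.empty():
            results.append(result_queue.get_nowait())
        return results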