...

2025-07-09 03:04:10 -04:00 · 2011-02-15 19:58:27 -07:00 · 2011-02-15 19:58:27 -07:00 · 6db09a6dc1
commit 6db09a6dc1
parent d8ee793cd4
2 changed files with 61 additions and 27 deletions
--- a/src/calibre/ebooks/metadata/sources/base.py
+++ b/src/calibre/ebooks/metadata/sources/base.py
@ -18,14 +18,42 @@ class Source(Plugin):
    result_of_identify_is_complete = True
-    def get_author_tokens(self, authors):
+    def get_author_tokens(self, authors, only_first_author=True):
-        'Take a list of authors and return a list of tokens useful for a '
+        '''
-        'AND search query'
+        Take a list of authors and return a list of tokens useful for an
        AND search query. This function tries to return tokens in
        first name middle names last name order, by assuming that if a comma is
        in the author name, the name is in lastname, other names form.
        '''
        if authors:
            # Leave ' in there for Irish names
            pat = re.compile(r'[-,:;+!@#$%^&*(){}.`~"\s\[\]/]')
            if only_first_author:
                authors = authors[:1]
            for au in authors:
-            for tok in au.split():
+                parts = au.split()
-                yield pat.sub('', tok)
+                if ',' in au:
                    # au probably in ln, fn form
                    parts = parts[1:] + parts[:1]
                for tok in parts:
                    tok = pat.sub('', tok).strip()
                    yield tok
    def get_title_tokens(self, title):
        '''
        Yield the words of ``title`` that are useful in an AND search query.

        Punctuation characters (apostrophes included) are treated as word
        separators, and the connectives "a", "and" and "the" are dropped,
        compared case-insensitively. Nothing is yielded when ``title`` is
        empty or None.
        '''
        if not title:
            return
        connectives = ('a', 'and', 'the')
        # Each punctuation character becomes a space, so it splits words
        # instead of gluing its neighbours together.
        punctuation = re.compile(r'''[-,:;+!@#$%^&*(){}.`~"'\s\[\]/]''')
        for word in punctuation.sub(' ', title).split():
            word = word.strip()
            if word and word.lower() not in connectives:
                yield word
    def split_jobs(self, jobs, num):
        'Split a list of jobs into at most num groups, as evenly as possible'
--- a/src/calibre/ebooks/metadata/sources/google.py
+++ b/src/calibre/ebooks/metadata/sources/google.py
@ -8,7 +8,7 @@ __docformat__ = 'restructuredtext en'
 import time
 from urllib import urlencode
 from functools import partial
-from threading import Thread
+from threading import Thread, RLock
 from lxml import etree
@ -38,7 +38,7 @@ subject        = XPath('descendant::dc:subject')
 description    = XPath('descendant::dc:description')
 language       = XPath('descendant::dc:language')
-
+_log_lock = RLock()
 def to_metadata(browser, log, entry_):
@ -50,6 +50,7 @@ def to_metadata(browser, log, entry_):
                if ans and ans.strip():
                    return ans.strip()
        except:
            with _log_lock:
                log.exception('Programming error:')
        return None
@ -69,6 +70,7 @@ def to_metadata(browser, log, entry_):
        feed = etree.fromstring(raw)
        extra = entry(feed)[0]
    except:
        with _log_lock:
            log.exception('Failed to get additional details for', mi.title)
        return mi
@ -100,6 +102,7 @@ def to_metadata(browser, log, entry_):
            tags.extend([y.strip() for y in t.split('/')])
        tags = list(sorted(list(set(tags))))
    except:
        with _log_lock:
            log.exception('Failed to parse tags:')
        tags = []
    if tags:
@ -112,6 +115,7 @@ def to_metadata(browser, log, entry_):
            default = utcnow().replace(day=15)
            mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
        except:
            with _log_lock:
                log.exception('Failed to parse pubdate')
@ -132,6 +136,7 @@ class Worker(Thread):
                if isinstance(ans, Metadata):
                    self.result_queue.put(ans)
            except:
                with _log_lock:
                    self.log.exception(
                        'Failed to get metadata for identify entry:',
                        etree.tostring(i))
@ -153,11 +158,14 @@ class GoogleBooks(Source):
        elif title or authors:
            def build_term(prefix, parts):
                return ' '.join('in'+prefix + ':' + x for x in parts)
-            if title is not None:
+            title_tokens = list(self.get_title_tokens(title))
-                q += build_term('title', title.split())
+            if title_tokens:
-            if authors:
+                q += build_term('title', title_tokens)
-                q += ('+' if q else '')+build_term('author',
+            author_tokens = self.get_author_tokens(authors,
-                        self.get_author_tokens(authors))
+                    only_first_author=True)
            if author_tokens:
                q += ('+' if q else '') + build_term('author',
                        author_tokens)
        if isinstance(q, unicode):
            q = q.encode('utf-8')
@ -191,25 +199,23 @@ class GoogleBooks(Source):
        groups = self.split_jobs(entries, 5) # At most 5 threads
        if not groups:
-            return
+            return None
        workers = [Worker(log, entries, abort, result_queue) for entries in
                groups]
        if abort.is_set():
-            return
+            return None
        for worker in workers: worker.start()
        has_alive_worker = True
        while has_alive_worker and not abort.is_set():
            time.sleep(0.1)
            has_alive_worker = False
            for worker in workers:
                if worker.is_alive():
                    has_alive_worker = True
            time.sleep(0.1)
        return None