From 6db09a6dc13fe70bf308a9d8d4b87dbcb4a884b9 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Tue, 15 Feb 2011 19:58:27 -0700
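Subject: [PATCH] Metadata sources: tokenize title/author for search queries; serialize log.exception calls in the Google Books plugin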

---
 src/calibre/ebooks/metadata/sources/base.py   | 44 +++++++++++++++----
 src/calibre/ebooks/metadata/sources/google.py | 44 +++++++++++--------
 2 files changed, 61 insertions(+), 27 deletions(-)

diff --git a/src/calibre/ebooks/metadata/sources/base.py b/src/calibre/ebooks/metadata/sources/base.py
index 89ad8a7956..937245cfa9 100644
--- a/src/calibre/ebooks/metadata/sources/base.py
+++ b/src/calibre/ebooks/metadata/sources/base.py
@@ -18,14 +18,42 @@ class Source(Plugin):
 
     result_of_identify_is_complete = True
 
-    def get_author_tokens(self, authors):
-        'Take a list of authors and return a list of tokens useful for a '
-        'AND search query'
-        # Leave ' in there for Irish names
-        pat = re.compile(r'[-,:;+!@#$%^&*(){}.`~"\s\[\]/]')
-        for au in authors:
-            for tok in au.split():
-                yield pat.sub('', tok)
+    def get_author_tokens(self, authors, only_first_author=True):
+        '''
+        Take a list of authors and return a list of tokens useful for an
+        AND search query. This function tries to return tokens in
+        first name middle names last name order, by assuming that if a comma is
+        in the author name, the name is in lastname, other names form.
+        Tokens that are empty once punctuation is stripped are not returned.
+        '''
+
+        if authors:
+            # Leave ' in there for Irish names
+            pat = re.compile(r'[-,:;+!@#$%^&*(){}.`~"\s\[\]/]')
+            if only_first_author:
+                authors = authors[:1]
+            for au in authors:
+                # au probably in ln, fn form if it has a comma: everything
+                # before the first comma is the (possibly multi-word) last name
+                last, sep, rest = au.partition(',')
+                for tok in rest.split() + last.split():
+                    tok = pat.sub('', tok).strip()
+                    if tok:
+                        yield tok
+
+    def get_title_tokens(self, title):
+        '''
+        Yield the words of title that are worth using in an AND search
+        query. Punctuation (apostrophes included) is treated as a word
+        separator and the connectives a, and, the are dropped.
+        '''
+        if not title:
+            return
+        stop_words = ('a', 'and', 'the')
+        separators = re.compile(r'''[-,:;+!@#$%^&*(){}.`~"'\s\[\]/]''')
+        for word in separators.sub(' ', title).split():
+            if word.lower() not in stop_words:
+                yield word
 
     def split_jobs(self, jobs, num):
         'Split a list of jobs into at most num groups, as evenly as possible'
diff --git a/src/calibre/ebooks/metadata/sources/google.py b/src/calibre/ebooks/metadata/sources/google.py
index d9efb65ae0..7e0e3a0901 100644
--- a/src/calibre/ebooks/metadata/sources/google.py
+++ b/src/calibre/ebooks/metadata/sources/google.py
@@ -8,7 +8,7 @@ __docformat__ = 'restructuredtext en'
 import time
 from urllib import urlencode
 from functools import partial
-from threading import Thread
+from threading import Thread, RLock
 
 from lxml import etree
 
@@ -38,7 +38,7 @@ subject        = XPath('descendant::dc:subject')
 description    = XPath('descendant::dc:description')
 language       = XPath('descendant::dc:language')
 
-
+_log_lock = RLock()
 
 def to_metadata(browser, log, entry_):
 
@@ -50,7 +50,8 @@ def to_metadata(browser, log, entry_):
                 if ans and ans.strip():
                     return ans.strip()
         except:
-            log.exception('Programming error:')
+            with _log_lock:
+                log.exception('Programming error:')
         return None
 
 
@@ -69,7 +70,8 @@ def to_metadata(browser, log, entry_):
         feed = etree.fromstring(raw)
         extra = entry(feed)[0]
     except:
-        log.exception('Failed to get additional details for', mi.title)
+        with _log_lock:
+            log.exception('Failed to get additional details for', mi.title)
         return mi
 
     mi.comments = get_text(extra, description)
@@ -100,7 +102,8 @@ def to_metadata(browser, log, entry_):
             tags.extend([y.strip() for y in t.split('/')])
         tags = list(sorted(list(set(tags))))
     except:
-        log.exception('Failed to parse tags:')
+        with _log_lock:
+            log.exception('Failed to parse tags:')
         tags = []
     if tags:
         mi.tags = [x.replace(',', ';') for x in tags]
@@ -112,7 +115,8 @@ def to_metadata(browser, log, entry_):
             default = utcnow().replace(day=15)
             mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
         except:
-            log.exception('Failed to parse pubdate')
+            with _log_lock:
+                log.exception('Failed to parse pubdate')
 
 
     return mi
@@ -132,9 +136,10 @@ class Worker(Thread):
                 if isinstance(ans, Metadata):
                     self.result_queue.put(ans)
             except:
-                self.log.exception(
-                    'Failed to get metadata for identify entry:',
-                    etree.tostring(i))
+                with _log_lock:
+                    self.log.exception(
+                        'Failed to get metadata for identify entry:',
+                        etree.tostring(i))
             if self.abort.is_set():
                 break
 
@@ -153,11 +158,14 @@ class GoogleBooks(Source):
         elif title or authors:
             def build_term(prefix, parts):
                 return ' '.join('in'+prefix + ':' + x for x in parts)
-            if title is not None:
-                q += build_term('title', title.split())
-            if authors:
-                q += ('+' if q else '')+build_term('author',
-                        self.get_author_tokens(authors))
+            title_tokens = list(self.get_title_tokens(title))
+            if title_tokens:
+                q += build_term('title', title_tokens)
+            # list(): a generator object is truthy even when it is empty
+            author_tokens = list(self.get_author_tokens(authors,
+                    only_first_author=True))
+            if author_tokens:
+                q += ('+' if q else '') + build_term('author', author_tokens)
 
         if isinstance(q, unicode):
             q = q.encode('utf-8')
@@ -191,25 +199,23 @@ class GoogleBooks(Source):
 
         groups = self.split_jobs(entries, 5) # At most 5 threads
         if not groups:
-            return
+            return None
         workers = [Worker(log, entries, abort, result_queue) for entries in
                 groups]
 
         if abort.is_set():
-            return
+            return None
 
         for worker in workers: worker.start()
 
         has_alive_worker = True
         while has_alive_worker and not abort.is_set():
+            time.sleep(0.1)
             has_alive_worker = False
             for worker in workers:
                 if worker.is_alive():
                     has_alive_worker = True
-            time.sleep(0.1)
 
         return None
 
 
-
-