This commit is contained in:
Kovid Goyal 2011-02-15 19:58:27 -07:00
parent d8ee793cd4
commit 6db09a6dc1
2 changed files with 61 additions and 27 deletions

View File

@ -18,14 +18,42 @@ class Source(Plugin):
result_of_identify_is_complete = True
def get_author_tokens(self, authors):
    '''
    Take a list of authors and return a list of tokens useful for an
    AND search query.
    '''
    # Fix: the original "docstring" was two adjacent single-quoted string
    # statements on separate lines; only the first acted as the docstring
    # and the second was a dead-code expression. Merged into one real
    # docstring.
    # Leave ' in there for Irish names
    pat = re.compile(r'[-,:;+!@#$%^&*(){}.`~"\s\[\]/]')
    for au in authors:
        for tok in au.split():
            # Strip punctuation from each whitespace-separated token.
            # NOTE: may yield '' for a punctuation-only token.
            yield pat.sub('', tok)
def get_author_tokens(self, authors, only_first_author=True):
    '''
    Take a list of authors and return a list of tokens useful for an
    AND search query. This function tries to return tokens in
    first name middle names last name order, by assuming that if a comma is
    in the author name, the name is in lastname, other names form.
    '''
    if authors:
        # Leave ' in there for Irish names
        pat = re.compile(r'[-,:;+!@#$%^&*(){}.`~"\s\[\]/]')
        if only_first_author:
            authors = authors[:1]
        for au in authors:
            parts = au.split()
            if ',' in au:
                # au probably in ln, fn form
                parts = parts[1:] + parts[:1]
            for tok in parts:
                tok = pat.sub('', tok).strip()
                # Fix: skip tokens that become empty after punctuation
                # removal (e.g. a bare '-'), consistent with
                # get_title_tokens, so search queries are not polluted
                # with empty terms.
                if tok:
                    yield tok
def get_title_tokens(self, title):
'''
Take a title and return a list of tokens useful for an AND search query.
Excludes connectives and punctuation.
'''
if title:
pat = re.compile(r'''[-,:;+!@#$%^&*(){}.`~"'\s\[\]/]''')
title = pat.sub(' ', title)
tokens = title.split()
for token in tokens:
token = token.strip()
if token and token.lower() not in ('a', 'and', 'the'):
yield token
def split_jobs(self, jobs, num):
'Split a list of jobs into at most num groups, as evenly as possible'

View File

@ -8,7 +8,7 @@ __docformat__ = 'restructuredtext en'
import time
from urllib import urlencode
from functools import partial
from threading import Thread
from threading import Thread, RLock
from lxml import etree
@ -38,7 +38,7 @@ subject = XPath('descendant::dc:subject')
description = XPath('descendant::dc:description')
language = XPath('descendant::dc:language')
_log_lock = RLock()
def to_metadata(browser, log, entry_):
@ -50,7 +50,8 @@ def to_metadata(browser, log, entry_):
if ans and ans.strip():
return ans.strip()
except:
log.exception('Programming error:')
with _log_lock:
log.exception('Programming error:')
return None
@ -69,7 +70,8 @@ def to_metadata(browser, log, entry_):
feed = etree.fromstring(raw)
extra = entry(feed)[0]
except:
log.exception('Failed to get additional details for', mi.title)
with _log_lock:
log.exception('Failed to get additional details for', mi.title)
return mi
mi.comments = get_text(extra, description)
@ -100,7 +102,8 @@ def to_metadata(browser, log, entry_):
tags.extend([y.strip() for y in t.split('/')])
tags = list(sorted(list(set(tags))))
except:
log.exception('Failed to parse tags:')
with _log_lock:
log.exception('Failed to parse tags:')
tags = []
if tags:
mi.tags = [x.replace(',', ';') for x in tags]
@ -112,7 +115,8 @@ def to_metadata(browser, log, entry_):
default = utcnow().replace(day=15)
mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
except:
log.exception('Failed to parse pubdate')
with _log_lock:
log.exception('Failed to parse pubdate')
return mi
@ -132,9 +136,10 @@ class Worker(Thread):
if isinstance(ans, Metadata):
self.result_queue.put(ans)
except:
self.log.exception(
'Failed to get metadata for identify entry:',
etree.tostring(i))
with _log_lock:
self.log.exception(
'Failed to get metadata for identify entry:',
etree.tostring(i))
if self.abort.is_set():
break
@ -153,11 +158,14 @@ class GoogleBooks(Source):
elif title or authors:
def build_term(prefix, parts):
return ' '.join('in'+prefix + ':' + x for x in parts)
if title is not None:
q += build_term('title', title.split())
if authors:
q += ('+' if q else '')+build_term('author',
self.get_author_tokens(authors))
title_tokens = list(self.get_title_tokens())
if title_tokens:
q += build_term('title', title_tokens)
author_tokens = self.get_author_tokens(authors,
only_first_author=True)
if author_tokens:
q += ('+' if q else '') + build_term('author',
author_tokens)
if isinstance(q, unicode):
q = q.encode('utf-8')
@ -191,25 +199,23 @@ class GoogleBooks(Source):
groups = self.split_jobs(entries, 5) # At most 5 threads
if not groups:
return
return None
workers = [Worker(log, entries, abort, result_queue) for entries in
groups]
if abort.is_set():
return
return None
for worker in workers: worker.start()
has_alive_worker = True
while has_alive_worker and not abort.is_set():
time.sleep(0.1)
has_alive_worker = False
for worker in workers:
if worker.is_alive():
has_alive_worker = True
time.sleep(0.1)
return None