From 6db09a6dc13fe70bf308a9d8d4b87dbcb4a884b9 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Tue, 15 Feb 2011 19:58:27 -0700
Subject: [PATCH] ...

---
Review notes (text between the "---" line and the diffstat is ignored by
"git am"):
 * base.py: get_author_tokens() gains only_first_author and, when the name
   contains a comma, moves the first whitespace-separated word to the end;
   get_title_tokens() is new and drops punctuation plus the words
   a/and/the.
 * google.py: log.exception() calls are wrapped in a module-level RLock,
   the search query is built from the two token helpers, and the wait loop
   now sleeps before polling the workers instead of after.
 * Changed relative to the patch as submitted: get_title_tokens() is now
   passed the title (it was called with no argument, a TypeError), and the
   author token generator is wrapped in list() so the "if author_tokens:"
   test can actually be false. Added-line counts are unchanged; the
   google.py post-image blob id on the "index" line was not recomputed.

 src/calibre/ebooks/metadata/sources/base.py   | 44 +++++++++++++++----
 src/calibre/ebooks/metadata/sources/google.py | 44 +++++++++++--------
 2 files changed, 61 insertions(+), 27 deletions(-)

diff --git a/src/calibre/ebooks/metadata/sources/base.py b/src/calibre/ebooks/metadata/sources/base.py
index 89ad8a7956..937245cfa9 100644
--- a/src/calibre/ebooks/metadata/sources/base.py
+++ b/src/calibre/ebooks/metadata/sources/base.py
@@ -18,14 +18,42 @@ class Source(Plugin):
 
     result_of_identify_is_complete = True
 
-    def get_author_tokens(self, authors):
-        'Take a list of authors and return a list of tokens useful for a '
-        'AND search query'
-        # Leave ' in there for Irish names
-        pat = re.compile(r'[-,:;+!@#$%^&*(){}.`~"\s\[\]/]')
-        for au in authors:
-            for tok in au.split():
-                yield pat.sub('', tok)
+    def get_author_tokens(self, authors, only_first_author=True):
+        '''
+        Take a list of authors and return a list of tokens useful for an
+        AND search query. This function tries to return tokens in
+        first name middle names last name order, by assuming that if a comma is
+        in the author name, the name is in lastname, other names form.
+        '''
+
+        if authors:
+            # Leave ' in there for Irish names
+            pat = re.compile(r'[-,:;+!@#$%^&*(){}.`~"\s\[\]/]')
+            if only_first_author:
+                authors = authors[:1]
+            for au in authors:
+                parts = au.split()
+                if ',' in au:
+                    # au probably in ln, fn form
+                    parts = parts[1:] + parts[:1]
+                for tok in parts:
+                    tok = pat.sub('', tok).strip()
+                    yield tok
+
+
+    def get_title_tokens(self, title):
+        '''
+        Take a title and return a list of tokens useful for an AND search query.
+        Excludes connectives and punctuation.
+        '''
+        if title:
+            pat = re.compile(r'''[-,:;+!@#$%^&*(){}.`~"'\s\[\]/]''')
+            title = pat.sub(' ', title)
+            tokens = title.split()
+            for token in tokens:
+                token = token.strip()
+                if token and token.lower() not in ('a', 'and', 'the'):
+                    yield token
 
     def split_jobs(self, jobs, num):
         'Split a list of jobs into at most num groups, as evenly as possible'
diff --git a/src/calibre/ebooks/metadata/sources/google.py b/src/calibre/ebooks/metadata/sources/google.py
index d9efb65ae0..7e0e3a0901 100644
--- a/src/calibre/ebooks/metadata/sources/google.py
+++ b/src/calibre/ebooks/metadata/sources/google.py
@@ -8,7 +8,7 @@ __docformat__ = 'restructuredtext en'
 import time
 from urllib import urlencode
 from functools import partial
-from threading import Thread
+from threading import Thread, RLock
 
 from lxml import etree
 
@@ -38,7 +38,7 @@ subject = XPath('descendant::dc:subject')
 description = XPath('descendant::dc:description')
 language = XPath('descendant::dc:language')
 
-
+_log_lock = RLock()
 
 def to_metadata(browser, log, entry_):
 
@@ -50,7 +50,8 @@ def to_metadata(browser, log, entry_):
                 if ans and ans.strip():
                     return ans.strip()
         except:
-            log.exception('Programming error:')
+            with _log_lock:
+                log.exception('Programming error:')
         return None
 
 
@@ -69,7 +70,8 @@ def to_metadata(browser, log, entry_):
         feed = etree.fromstring(raw)
         extra = entry(feed)[0]
     except:
-        log.exception('Failed to get additional details for', mi.title)
+        with _log_lock:
+            log.exception('Failed to get additional details for', mi.title)
         return mi
 
     mi.comments = get_text(extra, description)
@@ -100,7 +102,8 @@ def to_metadata(browser, log, entry_):
             tags.extend([y.strip() for y in t.split('/')])
         tags = list(sorted(list(set(tags))))
     except:
-        log.exception('Failed to parse tags:')
+        with _log_lock:
+            log.exception('Failed to parse tags:')
         tags = []
     if tags:
         mi.tags = [x.replace(',', ';') for x in tags]
@@ -112,7 +115,8 @@ def to_metadata(browser, log, entry_):
             default = utcnow().replace(day=15)
             mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
         except:
-            log.exception('Failed to parse pubdate')
+            with _log_lock:
+                log.exception('Failed to parse pubdate')
 
 
     return mi
@@ -132,9 +136,10 @@ class Worker(Thread):
                 if isinstance(ans, Metadata):
                     self.result_queue.put(ans)
             except:
-                self.log.exception(
-                    'Failed to get metadata for identify entry:',
-                    etree.tostring(i))
+                with _log_lock:
+                    self.log.exception(
+                        'Failed to get metadata for identify entry:',
+                        etree.tostring(i))
             if self.abort.is_set():
                 break
 
@@ -153,11 +158,14 @@ class GoogleBooks(Source):
         elif title or authors:
             def build_term(prefix, parts):
                 return ' '.join('in'+prefix + ':' + x for x in parts)
-            if title is not None:
-                q += build_term('title', title.split())
-            if authors:
-                q += ('+' if q else '')+build_term('author',
-                        self.get_author_tokens(authors))
+            title_tokens = list(self.get_title_tokens(title))
+            if title_tokens:
+                q += build_term('title', title_tokens)
+            author_tokens = list(self.get_author_tokens(authors,
+                    only_first_author=True))
+            if author_tokens:
+                q += ('+' if q else '') + build_term('author',
+                        author_tokens)
 
         if isinstance(q, unicode):
             q = q.encode('utf-8')
@@ -191,25 +199,23 @@ class GoogleBooks(Source):
 
         groups = self.split_jobs(entries, 5) # At most 5 threads
         if not groups:
-            return
+            return None
         workers = [Worker(log, entries, abort, result_queue) for entries in
                 groups]
 
         if abort.is_set():
-            return
+            return None
 
         for worker in workers: worker.start()
 
         has_alive_worker = True
         while has_alive_worker and not abort.is_set():
+            time.sleep(0.1)
             has_alive_worker = False
             for worker in workers:
                 if worker.is_alive():
                     has_alive_worker = True
-            time.sleep(0.1)
 
         return None
 
 
-
-