This commit is contained in:
Kovid Goyal 2011-02-15 19:58:27 -07:00
parent d8ee793cd4
commit 6db09a6dc1
2 changed files with 61 additions and 27 deletions

View File

@ -18,14 +18,42 @@ class Source(Plugin):
result_of_identify_is_complete = True
def get_author_tokens(self, authors):
    '''
    Take a list of authors and return a list of tokens useful for an
    AND search query.
    '''
    # Fix: the original "docstring" was two adjacent single-quoted string
    # statements on separate lines; only the first acted as the docstring
    # and the second was a dead-code expression. Merged into one real
    # docstring.
    # Leave ' in there for Irish names
    pat = re.compile(r'[-,:;+!@#$%^&*(){}.`~"\s\[\]/]')
    for au in authors:
        for tok in au.split():
            # Strip punctuation from each whitespace-separated token.
            # NOTE: may yield '' for a punctuation-only token.
            yield pat.sub('', tok)
def get_author_tokens(self, authors, only_first_author=True):
    '''
    Take a list of authors and return a list of tokens useful for an
    AND search query. This function tries to return tokens in
    first name middle names last name order, by assuming that if a comma is
    in the author name, the name is in lastname, other names form.
    '''
    if authors:
        # Leave ' in there for Irish names
        pat = re.compile(r'[-,:;+!@#$%^&*(){}.`~"\s\[\]/]')
        if only_first_author:
            authors = authors[:1]
        for au in authors:
            parts = au.split()
            if ',' in au:
                # au probably in ln, fn form
                parts = parts[1:] + parts[:1]
            for tok in parts:
                tok = pat.sub('', tok).strip()
                # Fix: skip tokens that become empty after punctuation
                # removal (e.g. a bare '-'), consistent with
                # get_title_tokens, so search queries are not polluted
                # with empty terms.
                if tok:
                    yield tok
def get_title_tokens(self, title):
'''
Take a title and return a list of tokens useful for an AND search query.
Excludes connectives and punctuation.
'''
if title:
pat = re.compile(r'''[-,:;+!@#$%^&*(){}.`~"'\s\[\]/]''')
title = pat.sub(' ', title)
tokens = title.split()
for token in tokens:
token = token.strip()
if token and token.lower() not in ('a', 'and', 'the'):
yield token
def split_jobs(self, jobs, num):
'Split a list of jobs into at most num groups, as evenly as possible'

View File

@ -8,7 +8,7 @@ __docformat__ = 'restructuredtext en'
import time
from urllib import urlencode
from functools import partial
from threading import Thread
from threading import Thread, RLock
from lxml import etree
@ -38,7 +38,7 @@ subject = XPath('descendant::dc:subject')
description = XPath('descendant::dc:description')
language = XPath('descendant::dc:language')
_log_lock = RLock()
def to_metadata(browser, log, entry_):
@ -50,7 +50,8 @@ def to_metadata(browser, log, entry_):
if ans and ans.strip():
return ans.strip()
except:
log.exception('Programming error:')
with _log_lock:
log.exception('Programming error:')
return None
@ -69,7 +70,8 @@ def to_metadata(browser, log, entry_):
feed = etree.fromstring(raw)
extra = entry(feed)[0]
except:
log.exception('Failed to get additional details for', mi.title)
with _log_lock:
log.exception('Failed to get additional details for', mi.title)
return mi
mi.comments = get_text(extra, description)
@ -100,7 +102,8 @@ def to_metadata(browser, log, entry_):
tags.extend([y.strip() for y in t.split('/')])
tags = list(sorted(list(set(tags))))
except:
log.exception('Failed to parse tags:')
with _log_lock:
log.exception('Failed to parse tags:')
tags = []
if tags:
mi.tags = [x.replace(',', ';') for x in tags]
@ -112,7 +115,8 @@ def to_metadata(browser, log, entry_):
default = utcnow().replace(day=15)
mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
except:
log.exception('Failed to parse pubdate')
with _log_lock:
log.exception('Failed to parse pubdate')
return mi
@ -132,9 +136,10 @@ class Worker(Thread):
if isinstance(ans, Metadata):
self.result_queue.put(ans)
except:
self.log.exception(
'Failed to get metadata for identify entry:',
etree.tostring(i))
with _log_lock:
self.log.exception(
'Failed to get metadata for identify entry:',
etree.tostring(i))
if self.abort.is_set():
break
@ -153,11 +158,14 @@ class GoogleBooks(Source):
elif title or authors:
def build_term(prefix, parts):
return ' '.join('in'+prefix + ':' + x for x in parts)
if title is not None:
q += build_term('title', title.split())
if authors:
q += ('+' if q else '')+build_term('author',
self.get_author_tokens(authors))
title_tokens = list(self.get_title_tokens())
if title_tokens:
q += build_term('title', title_tokens)
author_tokens = self.get_author_tokens(authors,
only_first_author=True)
if author_tokens:
q += ('+' if q else '') + build_term('author',
author_tokens)
if isinstance(q, unicode):
q = q.encode('utf-8')
@ -191,25 +199,23 @@ class GoogleBooks(Source):
groups = self.split_jobs(entries, 5) # At most 5 threads
if not groups:
return
return None
workers = [Worker(log, entries, abort, result_queue) for entries in
groups]
if abort.is_set():
return
return None
for worker in workers: worker.start()
has_alive_worker = True
while has_alive_worker and not abort.is_set():
time.sleep(0.1)
has_alive_worker = False
for worker in workers:
if worker.is_alive():
has_alive_worker = True
time.sleep(0.1)
return None