commit 6db09a6dc1
parent d8ee793cd4
Author: Kovid Goyal
Date:   2011-02-15 19:58:27 -07:00

2 changed files with 61 additions and 27 deletions

File 1 of 2:

@@ -18,14 +18,42 @@ class Source(Plugin):
     result_of_identify_is_complete = True
 
-    def get_author_tokens(self, authors):
-        'Take a list of authors and return a list of tokens useful for a '
-        'AND search query'
-        # Leave ' in there for Irish names
-        pat = re.compile(r'[-,:;+!@#$%^&*(){}.`~"\s\[\]/]')
-        for au in authors:
-            for tok in au.split():
-                yield pat.sub('', tok)
+    def get_author_tokens(self, authors, only_first_author=True):
+        '''
+        Take a list of authors and return a list of tokens useful for an
+        AND search query. This function tries to return tokens in
+        first name middle names last name order, by assuming that if a comma is
+        in the author name, the name is in lastname, other names form.
+        '''
+        if authors:
+            # Leave ' in there for Irish names
+            pat = re.compile(r'[-,:;+!@#$%^&*(){}.`~"\s\[\]/]')
+            if only_first_author:
+                authors = authors[:1]
+            for au in authors:
+                parts = au.split()
+                if ',' in au:
+                    # au probably in ln, fn form
+                    parts = parts[1:] + parts[:1]
+                for tok in parts:
+                    tok = pat.sub('', tok).strip()
+                    yield tok
+
+    def get_title_tokens(self, title):
+        '''
+        Take a title and return a list of tokens useful for an AND search query.
+        Excludes connectives and punctuation.
+        '''
+        if title:
+            pat = re.compile(r'''[-,:;+!@#$%^&*(){}.`~"'\s\[\]/]''')
+            title = pat.sub(' ', title)
+            tokens = title.split()
+            for token in tokens:
+                token = token.strip()
+                if token and token.lower() not in ('a', 'and', 'the'):
+                    yield token
 
     def split_jobs(self, jobs, num):
         'Split a list of jobs into at most num groups, as evenly as possible'
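
For reference, the behaviour of the two new tokenizers on some hypothetical inputs (an illustrative doctest-style sketch assuming a Source instance s; not part of the commit):

    >>> list(s.get_author_tokens(['Doyle, Arthur Conan']))
    ['Arthur', 'Conan', 'Doyle']
    >>> list(s.get_author_tokens(["Flann O'Brien", 'Second Author']))
    ['Flann', "O'Brien"]
    >>> list(s.get_title_tokens('A Study in Scarlet'))
    ['Study', 'in', 'Scarlet']

The comma form is flipped to first-name-first order, only_first_author=True drops 'Second Author', the apostrophe survives for Irish names, and 'in' is kept because only 'a', 'and' and 'the' are excluded.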

File 2 of 2:

@@ -8,7 +8,7 @@ __docformat__ = 'restructuredtext en'
 import time
 from urllib import urlencode
 from functools import partial
-from threading import Thread
+from threading import Thread, RLock
 
 from lxml import etree
@@ -38,7 +38,8 @@ subject = XPath('descendant::dc:subject')
 description = XPath('descendant::dc:description')
 language = XPath('descendant::dc:language')
+_log_lock = RLock()
 
 def to_metadata(browser, log, entry_):
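
Every hunk below applies the same pattern: each log.exception() call that can fire from a worker thread is wrapped in the new module-level lock, so multi-line tracebacks from concurrent threads do not interleave in the output. A minimal standalone sketch of the pattern (the helper name is hypothetical; the commit inlines the with-block at each call site):

    from threading import RLock

    _log_lock = RLock()

    def locked_log_exception(log, *args):
        # Hypothetical helper, for illustration only. Holding the lock
        # around the whole call keeps the traceback lines of one
        # exception together; an RLock (rather than a plain Lock) also
        # remains safe if the logging call re-enters on the same thread.
        with _log_lock:
            log.exception(*args)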
@@ -50,7 +50,8 @@ def to_metadata(browser, log, entry_):
             if ans and ans.strip():
                 return ans.strip()
         except:
-            log.exception('Programming error:')
+            with _log_lock:
+                log.exception('Programming error:')
         return None
@@ -69,7 +70,8 @@ def to_metadata(browser, log, entry_):
         feed = etree.fromstring(raw)
         extra = entry(feed)[0]
     except:
-        log.exception('Failed to get additional details for', mi.title)
+        with _log_lock:
+            log.exception('Failed to get additional details for', mi.title)
         return mi
 
     mi.comments = get_text(extra, description)
@@ -100,7 +102,8 @@ def to_metadata(browser, log, entry_):
                 tags.extend([y.strip() for y in t.split('/')])
         tags = list(sorted(list(set(tags))))
     except:
-        log.exception('Failed to parse tags:')
+        with _log_lock:
+            log.exception('Failed to parse tags:')
         tags = []
     if tags:
         mi.tags = [x.replace(',', ';') for x in tags]
@@ -112,7 +115,8 @@ def to_metadata(browser, log, entry_):
             default = utcnow().replace(day=15)
             mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
         except:
-            log.exception('Failed to parse pubdate')
+            with _log_lock:
+                log.exception('Failed to parse pubdate')
 
     return mi
@@ -132,9 +136,10 @@ class Worker(Thread):
                 if isinstance(ans, Metadata):
                     self.result_queue.put(ans)
             except:
-                self.log.exception(
-                    'Failed to get metadata for identify entry:',
-                    etree.tostring(i))
+                with _log_lock:
+                    self.log.exception(
+                        'Failed to get metadata for identify entry:',
+                        etree.tostring(i))
             if self.abort.is_set():
                 break
@@ -153,11 +158,14 @@ class GoogleBooks(Source):
         elif title or authors:
             def build_term(prefix, parts):
                 return ' '.join('in'+prefix + ':' + x for x in parts)
-            if title is not None:
-                q += build_term('title', title.split())
-            if authors:
-                q += ('+' if q else '')+build_term('author',
-                        self.get_author_tokens(authors))
+            title_tokens = list(self.get_title_tokens(title))
+            if title_tokens:
+                q += build_term('title', title_tokens)
+            author_tokens = list(self.get_author_tokens(authors,
+                only_first_author=True))
+            if author_tokens:
+                q += ('+' if q else '') + build_term('author',
+                        author_tokens)
         if isinstance(q, unicode):
             q = q.encode('utf-8')
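
To see what the rewritten query construction produces, assume the hypothetical inputs title='A Study in Scarlet' and authors=['Doyle, Arthur Conan']. With the tokenizers from the first file, q comes out as (sketch, not from the commit):

    # intitle:/inauthor: terms joined by spaces, the two groups by '+'
    q = ('intitle:Study intitle:in intitle:Scarlet'
         '+inauthor:Arthur inauthor:Conan inauthor:Doyle')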
@@ -191,25 +199,23 @@ class GoogleBooks(Source):
         groups = self.split_jobs(entries, 5) # At most 5 threads
         if not groups:
-            return
+            return None
         workers = [Worker(log, entries, abort, result_queue) for entries in
             groups]
 
         if abort.is_set():
-            return
+            return None
 
         for worker in workers: worker.start()
 
         has_alive_worker = True
         while has_alive_worker and not abort.is_set():
+            time.sleep(0.1)
             has_alive_worker = False
             for worker in workers:
                 if worker.is_alive():
                     has_alive_worker = True
-            time.sleep(0.1)
 
         return None
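
The loop now sleeps before polling rather than after, so a full interval passes before the first liveness check. The generic shape of the resulting wait loop (an illustrative sketch under assumed names, not code from the commit):

    import time

    def wait_for_workers(workers, abort, interval=0.1):
        # Sleep first, then poll liveness; the abort event is re-checked
        # on every pass, and workers are polled at most once per interval.
        alive = True
        while alive and not abort.is_set():
            time.sleep(interval)
            alive = any(w.is_alive() for w in workers)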