mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
...
This commit is contained in:
parent
d8ee793cd4
commit
6db09a6dc1
@ -18,14 +18,42 @@ class Source(Plugin):
|
|||||||
|
|
||||||
result_of_identify_is_complete = True
|
result_of_identify_is_complete = True
|
||||||
|
|
||||||
def get_author_tokens(self, authors, only_first_author=True):
    '''
    Take a list of authors and return a list of tokens useful for an
    AND search query. This function tries to return tokens in
    first name middle names last name order, by assuming that if a comma is
    in the author name, the name is in lastname, other names form.

    :param authors: List of author name strings; may be empty or None, in
                    which case nothing is yielded.
    :param only_first_author: If True (the default) only the first entry of
                              ``authors`` is tokenized.
    :return: A generator of non-empty tokens with punctuation removed.
    '''
    if not authors:
        return
    # Leave ' in there for Irish names
    pat = re.compile(r'[-,:;+!@#$%^&*(){}.`~"\s\[\]/]')
    if only_first_author:
        authors = authors[:1]
    for au in authors:
        if ',' in au:
            # au probably in ln, fn form. Split on the comma rather than
            # rotating the first whitespace token, so that multi-word last
            # names ("van der Berg, Jan") stay together and in order.
            last_name, sep, other_names = au.partition(',')
            parts = other_names.split() + last_name.split()
        else:
            parts = au.split()
        for tok in parts:
            tok = pat.sub('', tok).strip()
            # A token made only of punctuation (e.g. "&") becomes empty;
            # yielding it would produce a useless, empty search term.
            if tok:
                yield tok
|
||||||
|
|
||||||
|
|
||||||
|
def get_title_tokens(self, title, strip_joiners=True):
    '''
    Take a title and return a list of tokens useful for an AND search query.
    Excludes connectives and punctuation.

    :param title: The title string; may be empty or None, in which case
                  nothing is yielded.
    :param strip_joiners: If True (the default) the connectives "a", "and"
                          and "the" are dropped (case-insensitively). Pass
                          False to keep every word of the title.
    :return: A generator of non-empty tokens.
    '''
    if not title:
        return
    pat = re.compile(r'''[-,:;+!@#$%^&*(){}.`~"'\s\[\]/]''')
    joiners = ('a', 'and', 'the')
    # Punctuation is replaced by a space (not removed) so that words joined
    # only by punctuation, e.g. "Title:Subtitle", become separate tokens.
    # split() with no arguments never returns empty or padded strings, so
    # no further stripping of the tokens is needed.
    for token in pat.sub(' ', title).split():
        if strip_joiners and token.lower() in joiners:
            continue
        yield token
|
||||||
|
|
||||||
def split_jobs(self, jobs, num):
|
def split_jobs(self, jobs, num):
|
||||||
'Split a list of jobs into at most num groups, as evenly as possible'
|
'Split a list of jobs into at most num groups, as evenly as possible'
|
||||||
|
@ -8,7 +8,7 @@ __docformat__ = 'restructuredtext en'
|
|||||||
import time
|
import time
|
||||||
from urllib import urlencode
|
from urllib import urlencode
|
||||||
from functools import partial
|
from functools import partial
|
||||||
from threading import Thread
|
from threading import Thread, RLock
|
||||||
|
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
|
|
||||||
@ -38,7 +38,7 @@ subject = XPath('descendant::dc:subject')
|
|||||||
description = XPath('descendant::dc:description')
|
description = XPath('descendant::dc:description')
|
||||||
language = XPath('descendant::dc:language')
|
language = XPath('descendant::dc:language')
|
||||||
|
|
||||||
|
_log_lock = RLock()
|
||||||
|
|
||||||
def to_metadata(browser, log, entry_):
|
def to_metadata(browser, log, entry_):
|
||||||
|
|
||||||
@ -50,6 +50,7 @@ def to_metadata(browser, log, entry_):
|
|||||||
if ans and ans.strip():
|
if ans and ans.strip():
|
||||||
return ans.strip()
|
return ans.strip()
|
||||||
except:
|
except:
|
||||||
|
with _log_lock:
|
||||||
log.exception('Programming error:')
|
log.exception('Programming error:')
|
||||||
return None
|
return None
|
||||||
|
|
||||||
@ -69,6 +70,7 @@ def to_metadata(browser, log, entry_):
|
|||||||
feed = etree.fromstring(raw)
|
feed = etree.fromstring(raw)
|
||||||
extra = entry(feed)[0]
|
extra = entry(feed)[0]
|
||||||
except:
|
except:
|
||||||
|
with _log_lock:
|
||||||
log.exception('Failed to get additional details for', mi.title)
|
log.exception('Failed to get additional details for', mi.title)
|
||||||
return mi
|
return mi
|
||||||
|
|
||||||
@ -100,6 +102,7 @@ def to_metadata(browser, log, entry_):
|
|||||||
tags.extend([y.strip() for y in t.split('/')])
|
tags.extend([y.strip() for y in t.split('/')])
|
||||||
tags = list(sorted(list(set(tags))))
|
tags = list(sorted(list(set(tags))))
|
||||||
except:
|
except:
|
||||||
|
with _log_lock:
|
||||||
log.exception('Failed to parse tags:')
|
log.exception('Failed to parse tags:')
|
||||||
tags = []
|
tags = []
|
||||||
if tags:
|
if tags:
|
||||||
@ -112,6 +115,7 @@ def to_metadata(browser, log, entry_):
|
|||||||
default = utcnow().replace(day=15)
|
default = utcnow().replace(day=15)
|
||||||
mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
|
mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
|
||||||
except:
|
except:
|
||||||
|
with _log_lock:
|
||||||
log.exception('Failed to parse pubdate')
|
log.exception('Failed to parse pubdate')
|
||||||
|
|
||||||
|
|
||||||
@ -132,6 +136,7 @@ class Worker(Thread):
|
|||||||
if isinstance(ans, Metadata):
|
if isinstance(ans, Metadata):
|
||||||
self.result_queue.put(ans)
|
self.result_queue.put(ans)
|
||||||
except:
|
except:
|
||||||
|
with _log_lock:
|
||||||
self.log.exception(
|
self.log.exception(
|
||||||
'Failed to get metadata for identify entry:',
|
'Failed to get metadata for identify entry:',
|
||||||
etree.tostring(i))
|
etree.tostring(i))
|
||||||
@ -153,11 +158,14 @@ class GoogleBooks(Source):
|
|||||||
elif title or authors:
|
elif title or authors:
|
||||||
def build_term(prefix, parts):
|
def build_term(prefix, parts):
|
||||||
return ' '.join('in'+prefix + ':' + x for x in parts)
|
return ' '.join('in'+prefix + ':' + x for x in parts)
|
||||||
if title is not None:
|
title_tokens = list(self.get_title_tokens())
|
||||||
q += build_term('title', title.split())
|
if title_tokens:
|
||||||
if authors:
|
q += build_term('title', title_tokens)
|
||||||
q += ('+' if q else '')+build_term('author',
|
author_tokens = self.get_author_tokens(authors,
|
||||||
self.get_author_tokens(authors))
|
only_first_author=True)
|
||||||
|
if author_tokens:
|
||||||
|
q += ('+' if q else '') + build_term('author',
|
||||||
|
author_tokens)
|
||||||
|
|
||||||
if isinstance(q, unicode):
|
if isinstance(q, unicode):
|
||||||
q = q.encode('utf-8')
|
q = q.encode('utf-8')
|
||||||
@ -191,25 +199,23 @@ class GoogleBooks(Source):
|
|||||||
|
|
||||||
groups = self.split_jobs(entries, 5) # At most 5 threads
|
groups = self.split_jobs(entries, 5) # At most 5 threads
|
||||||
if not groups:
|
if not groups:
|
||||||
return
|
return None
|
||||||
workers = [Worker(log, entries, abort, result_queue) for entries in
|
workers = [Worker(log, entries, abort, result_queue) for entries in
|
||||||
groups]
|
groups]
|
||||||
|
|
||||||
if abort.is_set():
|
if abort.is_set():
|
||||||
return
|
return None
|
||||||
|
|
||||||
for worker in workers: worker.start()
|
for worker in workers: worker.start()
|
||||||
|
|
||||||
has_alive_worker = True
|
has_alive_worker = True
|
||||||
while has_alive_worker and not abort.is_set():
|
while has_alive_worker and not abort.is_set():
|
||||||
|
time.sleep(0.1)
|
||||||
has_alive_worker = False
|
has_alive_worker = False
|
||||||
for worker in workers:
|
for worker in workers:
|
||||||
if worker.is_alive():
|
if worker.is_alive():
|
||||||
has_alive_worker = True
|
has_alive_worker = True
|
||||||
time.sleep(0.1)
|
|
||||||
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user