Mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-07-09 03:04:10 -04:00)
Initial import of new metadata download framework
This commit is contained in:
parent
5d4c738862
commit
d2ba1812bb
61
src/calibre/ebooks/metadata/sources/base.py
Normal file
61
src/calibre/ebooks/metadata/sources/base.py
Normal file
@ -0,0 +1,61 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||||
|
|
||||||
|
__license__ = 'GPL v3'
|
||||||
|
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||||
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
from calibre.customize import Plugin
|
||||||
|
|
||||||
|
class Source(Plugin):
    '''
    Base class for all metadata download source plugins.
    '''

    # Plugin type label shown in the GUI. _() is calibre's translation
    # function, injected into builtins at application startup.
    type = _('Metadata source')
    author = 'Kovid Goyal'

    supported_platforms = ['windows', 'osx', 'linux']

    # If True, the Metadata objects put on the result queue by identify()
    # are complete; no further per-result download step is required.
    result_of_identify_is_complete = True

    def get_author_tokens(self, authors):
        '''
        Take a list of authors and generate tokens useful for an
        AND search query.

        Punctuation is stripped from every whitespace-separated token
        (the apostrophe is kept, as it is significant in Irish names).
        Tokens that become empty after stripping are skipped, so the
        search query is never polluted with empty terms.
        '''
        # Leave ' in there for Irish names
        pat = re.compile(r'[-,:;+!@#$%^&*(){}.`~"\s\[\]/]')
        for au in authors:
            for tok in au.split():
                tok = pat.sub('', tok)
                if tok:
                    yield tok

    def split_jobs(self, jobs, num):
        '''
        Split a list of jobs into at most num groups, as evenly as
        possible. Empty groups are dropped from the result.
        '''
        if num < 1:
            # Guard against an infinite loop: with no groups to fill, the
            # round-robin below could never drain a non-empty job list.
            return []
        groups = [[] for i in range(num)]
        jobs = list(jobs)
        while jobs:
            for gr in groups:
                try:
                    job = jobs.pop()
                except IndexError:
                    break
                gr.append(job)
        return [g for g in groups if g]

    def identify(self, log, result_queue, abort, title=None, authors=None,
            identifiers={}):
        '''
        Identify a book by its title/author/isbn/etc.

        :param log: A log object, use it to output debugging information/errors
        :param result_queue: A result Queue, results should be put into it.
                             Each result is a Metadata object
        :param abort: If abort.is_set() returns True, abort further processing
                      and return as soon as possible
        :param title: The title of the book, can be None
        :param authors: A list of authors of the book, can be None
        :param identifiers: A dictionary of other identifiers, most commonly
                            {'isbn':'1234...'}
        :return: None if no errors occurred, otherwise a unicode representation
                 of the error suitable for showing to the user
        '''
        return None
|
||||||
|
|
215
src/calibre/ebooks/metadata/sources/google.py
Normal file
215
src/calibre/ebooks/metadata/sources/google.py
Normal file
@ -0,0 +1,215 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
|
||||||
|
|
||||||
|
__license__ = 'GPL v3'
|
||||||
|
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||||
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
|
import time
|
||||||
|
from urllib import urlencode
|
||||||
|
from functools import partial
|
||||||
|
from threading import Thread
|
||||||
|
|
||||||
|
from lxml import etree
|
||||||
|
|
||||||
|
from calibre.ebooks.metadata.sources import Source
|
||||||
|
from calibre.ebooks.metadata.book.base import Metadata
|
||||||
|
from calibre.utils.date import parse_date, utcnow
|
||||||
|
from calibre import browser, as_unicode
|
||||||
|
|
||||||
|
# XML namespaces used by the Google Books data API atom feeds.
NAMESPACES = {
    'openSearch':'http://a9.com/-/spec/opensearchrss/1.0/',
    'atom' : 'http://www.w3.org/2005/Atom',
    'dc': 'http://purl.org/dc/terms'
}
# Factory for compiled XPath expressions that understand the prefixes above.
XPath = partial(etree.XPath, namespaces=NAMESPACES)

# Pre-compiled XPath accessors for the feed-level and entry-level fields
# read by to_metadata() and GoogleBooks.identify() below.
total_results = XPath('//openSearch:totalResults')
start_index = XPath('//openSearch:startIndex')
items_per_page = XPath('//openSearch:itemsPerPage')
entry = XPath('//atom:entry')
entry_id = XPath('descendant::atom:id')
creator = XPath('descendant::dc:creator')
identifier = XPath('descendant::dc:identifier')
title = XPath('descendant::dc:title')
date = XPath('descendant::dc:date')
publisher = XPath('descendant::dc:publisher')
subject = XPath('descendant::dc:subject')
description = XPath('descendant::dc:description')
language = XPath('descendant::dc:language')
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def to_metadata(browser, log, entry_):
    '''
    Convert a single atom:entry element from a Google Books feed into a
    Metadata object, fetching the entry's own detail feed for the extra
    fields (comments, publisher, author sort, ISBN, tags, pubdate).

    :param browser: Browser object used to fetch the entry's detail feed
    :param log: Log object for error reporting
    :param entry_: lxml element for the atom:entry
    :return: A Metadata object, or None if the entry lacks an id URL or
             a title. If fetching the detail feed fails, the partially
             filled Metadata (title/authors only) is returned.
    '''

    def get_text(extra, x):
        # Evaluate the compiled XPath x against extra and return the
        # stripped text of the first match, or None.
        try:
            ans = x(extra)
            if ans:
                ans = ans[0].text
                if ans and ans.strip():
                    return ans.strip()
        except:
            log.exception('Programming error:')
        return None

    id_url = entry_id(entry_)[0].text
    title_ = ': '.join([x.text for x in title(entry_)]).strip()
    authors = [x.text.strip() for x in creator(entry_) if x.text]
    if not authors:
        authors = [_('Unknown')]
    # BUG FIX: the original tested ``not title`` — the module-level XPath
    # callable, which is always truthy — so entries without a title were
    # never discarded. Test the extracted title string instead.
    if not id_url or not title_:
        # Silently discard this entry
        return None

    mi = Metadata(title_, authors)
    try:
        raw = browser.open(id_url).read()
        feed = etree.fromstring(raw)
        extra = entry(feed)[0]
    except:
        log.exception('Failed to get additional details for', mi.title)
        return mi

    mi.comments = get_text(extra, description)
    #mi.language = get_text(extra, language)
    mi.publisher = get_text(extra, publisher)

    # Author sort: Google marks it with a *file-as attribute on dc:creator
    for x in creator(extra):
        for key, val in x.attrib.items():
            if key.endswith('file-as') and val and val.strip():
                mi.author_sort = val
                break

    # ISBN: identifiers come prefixed (ISBN:, LCCN:, OCLC:); only ISBNs
    # are collected here
    isbns = []
    for x in identifier(extra):
        t = str(x.text).strip()
        if t[:5].upper() == 'ISBN:':
            isbns.append(t[5:])
    if isbns:
        # Prefer the longest identifier, i.e. ISBN-13 over ISBN-10
        mi.isbn = sorted(isbns, key=len)[-1]

    # Tags: Google subjects are '/'-separated hierarchies; flatten and
    # de-duplicate them
    try:
        btags = [x.text for x in subject(extra) if x.text]
        tags = []
        for t in btags:
            tags.extend([y.strip() for y in t.split('/')])
        tags = list(sorted(list(set(tags))))
    except:
        log.exception('Failed to parse tags:')
        tags = []
    if tags:
        # Commas would be interpreted as tag separators by calibre
        mi.tags = [x.replace(',', ';') for x in tags]

    # pubdate
    pubdate = get_text(extra, date)
    if pubdate:
        try:
            # Feeds often carry only year/month; default the day to
            # mid-month so timezone conversion cannot shift the month
            default = utcnow().replace(day=15)
            mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
        except:
            log.exception('Failed to parse pubdate')

    return mi
|
||||||
|
|
||||||
|
class Worker(Thread):
    '''
    Daemon thread that converts a batch of feed entries into Metadata
    objects and puts each successful result onto the shared queue,
    stopping early when the abort event is set.
    '''

    def __init__(self, log, entries, abort, result_queue):
        Thread.__init__(self)
        self.daemon = True
        self.browser = browser()
        self.log = log
        self.entries = entries
        self.abort = abort
        self.result_queue = result_queue

    def run(self):
        for raw_entry in self.entries:
            try:
                mi = to_metadata(self.browser, self.log, raw_entry)
                if mi is not None:
                    self.result_queue.put(mi)
            except:
                self.log.exception(
                    'Failed to get metadata for identify entry:',
                    etree.tostring(raw_entry))
            if self.abort.is_set():
                break
|
||||||
|
|
||||||
|
|
||||||
|
class GoogleBooks(Source):
|
||||||
|
|
||||||
|
name = 'Google Books'
|
||||||
|
|
||||||
|
def create_query(self, log, title=None, authors=None, identifiers={},
|
||||||
|
start_index=1):
|
||||||
|
BASE_URL = 'http://books.google.com/books/feeds/volumes?'
|
||||||
|
isbn = identifiers.get('isbn', None)
|
||||||
|
q = ''
|
||||||
|
if isbn is not None:
|
||||||
|
q += 'isbn:'+isbn
|
||||||
|
elif title or authors:
|
||||||
|
def build_term(prefix, parts):
|
||||||
|
return ' '.join('in'+prefix + ':' + x for x in parts)
|
||||||
|
if title is not None:
|
||||||
|
q += build_term('title', title.split())
|
||||||
|
if authors:
|
||||||
|
q += ('+' if q else '')+build_term('author',
|
||||||
|
self.get_author_tokens(authors))
|
||||||
|
|
||||||
|
if isinstance(q, unicode):
|
||||||
|
q = q.encode('utf-8')
|
||||||
|
if not q:
|
||||||
|
return None
|
||||||
|
return BASE_URL+urlencode({
|
||||||
|
'q':q,
|
||||||
|
'max-results':20,
|
||||||
|
'start-index':start_index,
|
||||||
|
'min-viewability':'none',
|
||||||
|
})
|
||||||
|
|
||||||
|
|
||||||
|
def identify(self, log, result_queue, abort, title=None, authors=None, identifiers={}):
|
||||||
|
query = self.create_query(log, title=title, authors=authors,
|
||||||
|
identifiers=identifiers)
|
||||||
|
try:
|
||||||
|
raw = browser().open_novisit(query).read()
|
||||||
|
except Exception, e:
|
||||||
|
log.exception('Failed to make identify query: %r'%query)
|
||||||
|
return as_unicode(e)
|
||||||
|
|
||||||
|
try:
|
||||||
|
parser = etree.XMLParser(recover=True, no_network=True)
|
||||||
|
feed = etree.fromstring(raw, parser=parser)
|
||||||
|
entries = entry(feed)
|
||||||
|
except Exception, e:
|
||||||
|
log.exception('Failed to parse identify results')
|
||||||
|
return as_unicode(e)
|
||||||
|
|
||||||
|
|
||||||
|
groups = self.split_jobs(entries, 5) # At most 5 threads
|
||||||
|
if not groups:
|
||||||
|
return
|
||||||
|
workers = [Worker(log, entries, abort, result_queue) for entries in
|
||||||
|
groups]
|
||||||
|
|
||||||
|
if abort.is_set():
|
||||||
|
return
|
||||||
|
|
||||||
|
for worker in workers: worker.start()
|
||||||
|
|
||||||
|
has_alive_worker = True
|
||||||
|
while has_alive_worker and not abort.is_set():
|
||||||
|
has_alive_worker = False
|
||||||
|
for worker in workers:
|
||||||
|
if worker.is_alive():
|
||||||
|
has_alive_worker = True
|
||||||
|
time.sleep(0.1)
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
x
Reference in New Issue
Block a user