Metadata compatibility

This commit is contained in:
Sengian 2011-03-09 22:21:02 +01:00
parent e3ec837fd1
commit 888aaec88f
3 changed files with 52 additions and 234 deletions

View File

@ -580,12 +580,12 @@ from calibre.devices.folder_device.driver import FOLDER_DEVICE_FOR_CONFIG
from calibre.devices.kobo.driver import KOBO
from calibre.devices.bambook.driver import BAMBOOK
from calibre.ebooks.metadata.fetch import KentDistrictLibrary
from calibre.ebooks.metadata.fetch import KentDistrictLibrary, Amazon
from calibre.ebooks.metadata.douban import DoubanBooks
from calibre.ebooks.metadata.isbndb import ISBNDB
from calibre.ebooks.metadata.google_books import GoogleBooks
from calibre.ebooks.metadata.nicebooks import NiceBooks, NiceBooksCovers
from calibre.ebooks.metadata.amazon import Amazon, AmazonSocial
# from calibre.ebooks.metadata.amazon import Amazon , AmazonSocial
from calibre.ebooks.metadata.fictionwise import Fictionwise
from calibre.ebooks.metadata.covers import OpenLibraryCovers, \
AmazonCovers, DoubanCovers, LibrarythingCovers
@ -593,7 +593,7 @@ from calibre.library.catalog import CSV_XML, EPUB_MOBI, BIBTEX
from calibre.ebooks.epub.fix.unmanifested import Unmanifested
from calibre.ebooks.epub.fix.epubcheck import Epubcheck
plugins = [HTML2ZIP, PML2PMLZ, TXT2TXTZ, ArchiveExtract, GoogleBooks, ISBNDB, Amazon, AmazonSocial,
plugins = [HTML2ZIP, PML2PMLZ, TXT2TXTZ, ArchiveExtract, GoogleBooks, ISBNDB, Amazon, #AmazonSocial,
KentDistrictLibrary, DoubanBooks, NiceBooks, CSV_XML, EPUB_MOBI, BIBTEX, Unmanifested,
Epubcheck, OpenLibraryCovers, AmazonCovers, DoubanCovers, LibrarythingCovers,
NiceBooksCovers]

View File

@ -1,7 +1,11 @@
from __future__ import with_statement
__license__ = 'GPL 3'
__copyright__ = '2010, sengian <sengian1@gmail.com>'
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
'''
Fetch metadata using Amazon AWS
'''
import sys, re
from threading import RLock
@ -12,10 +16,6 @@ from calibre import browser
from calibre.ebooks.metadata import check_isbn
from calibre.ebooks.metadata.book.base import Metadata
from calibre.ebooks.chardet import xml_to_unicode
from calibre.ebooks.metadata import MetaInformation, check_isbn, \
authors_to_sort_string
from calibre.ebooks.metadata.fetch import MetadataSource
from calibre.utils.config import OptionParser
from calibre.library.comments import sanitize_comments_html
asin_cache = {}
@ -160,229 +160,31 @@ def get_metadata(br, asin, mi):
m = pat.match(t)
if m is not None:
try:
default = utcnow().replace(day=15)
if self.lang != 'all':
d = replace_months(d, self.lang)
d = parse_date(d, assume_utc=True, default=default)
mi.pubdate = d
mi.rating = float(m.group(1))/float(m.group(2)) * 5
break
except:
report(verbose)
#ISBN
elt = filter(lambda x: self.reisbn.search(x.find('b').text), elts)
if elt:
isbn = elt[0].find('b').tail.replace('-', '').strip()
if check_isbn(isbn):
mi.isbn = unicode(isbn)
elif len(elt) > 1:
isbnone = elt[1].find('b').tail.replace('-', '').strip()
if check_isbn(isbnone):
mi.isbn = unicode(isbnone)
else:
#assume ASIN-> find a check for asin
mi.isbn = unicode(isbn)
#Langue
elt = filter(lambda x: self.relang.search(x.find('b').text), elts)
if elt:
langue = elt[0].find('b').tail.strip()
if langue:
mi.language = unicode(langue)
#ratings
elt = filter(lambda x: self.reratelt.search(x.find('b').text), elts)
if elt:
ratings = elt[0].find_class('swSprite')
if ratings:
ratings = self.rerat.findall(ratings[0].get('title'))
if len(ratings) == 2:
mi.rating = float(ratings[0])/float(ratings[1]) * 5
return mi
pass
def fill_MI(self, entry, verbose):
try:
title = self.get_title(entry)
authors = self.get_authors(entry)
except Exception, e:
if verbose:
print _('Failed to get all details for an entry')
print e
print _('URL who failed: %s') % x
report(verbose)
return None
mi = MetaInformation(title, authors)
mi.author_sort = authors_to_sort_string(authors)
try:
mi.comments = self.get_description(entry, verbose)
mi = self.get_book_info(entry, mi, verbose)
except:
pass
return mi
desc = root.xpath('//div[@id="productDescription"]/*[@class="content"]')
if desc:
desc = desc[0]
for c in desc.xpath('descendant::*[@class="seeAll" or'
' @class="emptyClear" or @href]'):
c.getparent().remove(c)
desc = html.tostring(desc, method='html', encoding=unicode).strip()
# remove all attributes from tags
desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
# Collapse whitespace
#desc = re.sub('\n+', '\n', desc)
#desc = re.sub(' +', ' ', desc)
# Remove the notice about text referring to out of print editions
desc = re.sub(r'(?s)<em>--This text ref.*?</em>', '', desc)
# Remove comments
desc = re.sub(r'(?s)<!--.*?-->', '', desc)
mi.comments = sanitize_comments_html(desc)
def get_individual_metadata(self, url, br, verbose):
try:
raw = br.open_novisit(url).read()
except Exception, e:
import socket
report(verbose)
if callable(getattr(e, 'getcode', None)) and \
e.getcode() == 404:
return None
attr = getattr(e, 'args', [None])
attr = attr if attr else [None]
if isinstance(attr[0], socket.timeout):
raise AmazonError(_('Amazon timed out. Try again later.'))
raise AmazonError(_('Amazon encountered an error.'))
if '<title>404 - ' in raw:
report(verbose)
return None
raw = xml_to_unicode(raw, strip_encoding_pats=True,
resolve_entities=True)[0]
try:
return soupparser.fromstring(raw)
except:
try:
#remove ASCII invalid chars
return soupparser.fromstring(clean_ascii_chars(raw))
except:
report(verbose)
return None
return True
def fetchdatathread(self, qbr, qsync, nb, url, verbose):
try:
browser = qbr.get(True)
entry = self.get_individual_metadata(url, browser, verbose)
except:
report(verbose)
entry = None
finally:
qbr.put(browser, True)
qsync.put((nb, entry), True)
def producer(self, sync, urls, br, verbose=False):
for i in xrange(len(urls)):
thread = Thread(target=self.fetchdatathread,
args=(br, sync, i, urls[i], verbose))
thread.start()
def consumer(self, sync, syncbis, br, total_entries, verbose=False):
i=0
self.extend([None]*total_entries)
while i < total_entries:
rq = sync.get(True)
nb = int(rq[0])
entry = rq[1]
i+=1
if entry is not None:
mi = self.fill_MI(entry, verbose)
if mi is not None:
mi.tags, atag = self.get_tags(entry, verbose)
self[nb] = mi
if atag:
thread = Thread(target=self.fetchdatathread,
args=(br, syncbis, nb, mi.tags, verbose))
thread.start()
else:
syncbis.put((nb, None), True)
def final(self, sync, total_entries, verbose):
i=0
while i < total_entries:
rq = sync.get(True)
nb = int(rq[0])
tags = rq[1]
i+=1
if tags is not None:
self[nb].tags = self.get_tags(tags, verbose)[0]
def populate(self, entries, ibr, verbose=False, brcall=3):
br = Queue(brcall)
cbr = Queue(brcall-1)
syncp = Queue(1)
syncc = Queue(1)
for i in xrange(brcall-1):
br.put(browser(), True)
cbr.put(browser(), True)
br.put(ibr, True)
prod_thread = Thread(target=self.producer, args=(syncp, entries, br, verbose))
cons_thread = Thread(target=self.consumer, args=(syncp, syncc, cbr, len(entries), verbose))
fin_thread = Thread(target=self.final, args=(syncc, len(entries), verbose))
prod_thread.start()
cons_thread.start()
fin_thread.start()
prod_thread.join()
cons_thread.join()
fin_thread.join()
def search(title=None, author=None, publisher=None, isbn=None,
max_results=5, verbose=False, keywords=None, lang='all'):
br = browser()
entries, baseurl = Query(title=title, author=author, isbn=isbn, publisher=publisher,
keywords=keywords, max_results=max_results,rlang=lang)(br, verbose)
if entries is None or len(entries) == 0:
return None
#List of entry
ans = ResultList(baseurl, lang)
ans.populate(entries, br, verbose)
return [x for x in ans if x is not None]
def get_social_metadata(title, authors, publisher, isbn, verbose=False,
max_results=1, lang='all'):
mi = MetaInformation(title, authors)
if not isbn or not check_isbn(isbn):
return [mi]
amazresults = search(isbn=isbn, verbose=verbose,
max_results=max_results, lang=lang)
if amazresults is None or amazresults[0] is None:
from calibre.ebooks.metadata.xisbn import xisbn
for i in xisbn.get_associated_isbns(isbn):
amazresults = search(isbn=i, verbose=verbose,
max_results=max_results, lang=lang)
if amazresults is not None and amazresults[0] is not None:
break
if amazresults is None or amazresults[0] is None:
return [mi]
miaz = amazresults[0]
if miaz.rating is not None:
mi.rating = miaz.rating
if miaz.comments is not None:
mi.comments = miaz.comments
if miaz.tags is not None:
mi.tags = miaz.tags
return [mi]
def option_parser():
import textwrap
parser = OptionParser(textwrap.dedent(\
_('''\
%prog [options]
Fetch book metadata from Amazon. You must specify one of title, author,
ISBN, publisher or keywords. Will fetch a maximum of 20 matches,
so you should make your query as specific as possible.
You can chose the language for metadata retrieval:
english & french & german
'''
)))
parser.add_option('-t', '--title', help=_('Book title'))
parser.add_option('-a', '--author', help=_('Book author(s)'))
parser.add_option('-p', '--publisher', help=_('Book publisher'))
parser.add_option('-i', '--isbn', help=_('Book ISBN'))
parser.add_option('-k', '--keywords', help=_('Keywords'))
parser.add_option('-s', '--social', default=0, action='count',
help=_('Get social data only'))
parser.add_option('-m', '--max-results', default=10,
help=_('Maximum number of results to fetch'))
parser.add_option('-l', '--lang', default='all',
help=_('Chosen language for metadata search (en, fr, de)'))
parser.add_option('-v', '--verbose', default=0, action='count',
help=_('Be more verbose about errors'))
return parser
def main(args=sys.argv):
import tempfile, os
@ -412,8 +214,3 @@ def main(args=sys.argv):
if __name__ == '__main__':
sys.exit(main())
# import cProfile
# sys.exit(cProfile.run("import calibre.ebooks.metadata.amazonbis; calibre.ebooks.metadata.amazonbis.main()"))
# sys.exit(cProfile.run("import calibre.ebooks.metadata.amazonbis; calibre.ebooks.metadata.amazonbis.main()", "profile"))
# calibre-debug -e "D:\Mes eBooks\Developpement\calibre\src\calibre\ebooks\metadata\amazon.py" -m 5 -a gore -v>data.html

View File

@ -212,6 +212,27 @@ class MetadataSource(Plugin): # {{{
# }}}
class Amazon(MetadataSource): # {{{
name = 'Amazon'
metadata_type = 'social'
description = _('Downloads social metadata from amazon.com')
has_html_comments = True
def fetch(self):
if not self.isbn:
return
from calibre.ebooks.metadata.amazon import get_social_metadata
try:
self.results = get_social_metadata(self.title, self.book_author,
self.publisher, self.isbn)
except Exception, e:
self.exception = e
self.tb = traceback.format_exc()
# }}}
class KentDistrictLibrary(MetadataSource): # {{{
name = 'Kent District Library'