Metadata compatibility

This commit is contained in:
Sengian 2011-03-09 22:21:02 +01:00
parent e3ec837fd1
commit 888aaec88f
3 changed files with 52 additions and 234 deletions

View File

@ -580,12 +580,12 @@ from calibre.devices.folder_device.driver import FOLDER_DEVICE_FOR_CONFIG
from calibre.devices.kobo.driver import KOBO from calibre.devices.kobo.driver import KOBO
from calibre.devices.bambook.driver import BAMBOOK from calibre.devices.bambook.driver import BAMBOOK
from calibre.ebooks.metadata.fetch import KentDistrictLibrary from calibre.ebooks.metadata.fetch import KentDistrictLibrary, Amazon
from calibre.ebooks.metadata.douban import DoubanBooks from calibre.ebooks.metadata.douban import DoubanBooks
from calibre.ebooks.metadata.isbndb import ISBNDB from calibre.ebooks.metadata.isbndb import ISBNDB
from calibre.ebooks.metadata.google_books import GoogleBooks from calibre.ebooks.metadata.google_books import GoogleBooks
from calibre.ebooks.metadata.nicebooks import NiceBooks, NiceBooksCovers from calibre.ebooks.metadata.nicebooks import NiceBooks, NiceBooksCovers
from calibre.ebooks.metadata.amazon import Amazon, AmazonSocial # from calibre.ebooks.metadata.amazon import Amazon , AmazonSocial
from calibre.ebooks.metadata.fictionwise import Fictionwise from calibre.ebooks.metadata.fictionwise import Fictionwise
from calibre.ebooks.metadata.covers import OpenLibraryCovers, \ from calibre.ebooks.metadata.covers import OpenLibraryCovers, \
AmazonCovers, DoubanCovers, LibrarythingCovers AmazonCovers, DoubanCovers, LibrarythingCovers
@ -593,7 +593,7 @@ from calibre.library.catalog import CSV_XML, EPUB_MOBI, BIBTEX
from calibre.ebooks.epub.fix.unmanifested import Unmanifested from calibre.ebooks.epub.fix.unmanifested import Unmanifested
from calibre.ebooks.epub.fix.epubcheck import Epubcheck from calibre.ebooks.epub.fix.epubcheck import Epubcheck
plugins = [HTML2ZIP, PML2PMLZ, TXT2TXTZ, ArchiveExtract, GoogleBooks, ISBNDB, Amazon, AmazonSocial, plugins = [HTML2ZIP, PML2PMLZ, TXT2TXTZ, ArchiveExtract, GoogleBooks, ISBNDB, Amazon, #AmazonSocial,
KentDistrictLibrary, DoubanBooks, NiceBooks, CSV_XML, EPUB_MOBI, BIBTEX, Unmanifested, KentDistrictLibrary, DoubanBooks, NiceBooks, CSV_XML, EPUB_MOBI, BIBTEX, Unmanifested,
Epubcheck, OpenLibraryCovers, AmazonCovers, DoubanCovers, LibrarythingCovers, Epubcheck, OpenLibraryCovers, AmazonCovers, DoubanCovers, LibrarythingCovers,
NiceBooksCovers] NiceBooksCovers]

View File

@ -1,7 +1,11 @@
from __future__ import with_statement #!/usr/bin/env python
__license__ = 'GPL 3' __license__ = 'GPL v3'
__copyright__ = '2010, sengian <sengian1@gmail.com>' __copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net'
__docformat__ = 'restructuredtext en'
'''
Fetch metadata using Amazon AWS
'''
import sys, re import sys, re
from threading import RLock from threading import RLock
@ -12,10 +16,6 @@ from calibre import browser
from calibre.ebooks.metadata import check_isbn from calibre.ebooks.metadata import check_isbn
from calibre.ebooks.metadata.book.base import Metadata from calibre.ebooks.metadata.book.base import Metadata
from calibre.ebooks.chardet import xml_to_unicode from calibre.ebooks.chardet import xml_to_unicode
from calibre.ebooks.metadata import MetaInformation, check_isbn, \
authors_to_sort_string
from calibre.ebooks.metadata.fetch import MetadataSource
from calibre.utils.config import OptionParser
from calibre.library.comments import sanitize_comments_html from calibre.library.comments import sanitize_comments_html
asin_cache = {} asin_cache = {}
@ -160,229 +160,31 @@ def get_metadata(br, asin, mi):
m = pat.match(t) m = pat.match(t)
if m is not None: if m is not None:
try: try:
default = utcnow().replace(day=15) mi.rating = float(m.group(1))/float(m.group(2)) * 5
if self.lang != 'all': break
d = replace_months(d, self.lang)
d = parse_date(d, assume_utc=True, default=default)
mi.pubdate = d
except:
report(verbose)
#ISBN
elt = filter(lambda x: self.reisbn.search(x.find('b').text), elts)
if elt:
isbn = elt[0].find('b').tail.replace('-', '').strip()
if check_isbn(isbn):
mi.isbn = unicode(isbn)
elif len(elt) > 1:
isbnone = elt[1].find('b').tail.replace('-', '').strip()
if check_isbn(isbnone):
mi.isbn = unicode(isbnone)
else:
#assume ASIN-> find a check for asin
mi.isbn = unicode(isbn)
#Langue
elt = filter(lambda x: self.relang.search(x.find('b').text), elts)
if elt:
langue = elt[0].find('b').tail.strip()
if langue:
mi.language = unicode(langue)
#ratings
elt = filter(lambda x: self.reratelt.search(x.find('b').text), elts)
if elt:
ratings = elt[0].find_class('swSprite')
if ratings:
ratings = self.rerat.findall(ratings[0].get('title'))
if len(ratings) == 2:
mi.rating = float(ratings[0])/float(ratings[1]) * 5
return mi
def fill_MI(self, entry, verbose):
try:
title = self.get_title(entry)
authors = self.get_authors(entry)
except Exception, e:
if verbose:
print _('Failed to get all details for an entry')
print e
print _('URL who failed: %s') % x
report(verbose)
return None
mi = MetaInformation(title, authors)
mi.author_sort = authors_to_sort_string(authors)
try:
mi.comments = self.get_description(entry, verbose)
mi = self.get_book_info(entry, mi, verbose)
except: except:
pass pass
return mi
def get_individual_metadata(self, url, br, verbose): desc = root.xpath('//div[@id="productDescription"]/*[@class="content"]')
try: if desc:
raw = br.open_novisit(url).read() desc = desc[0]
except Exception, e: for c in desc.xpath('descendant::*[@class="seeAll" or'
import socket ' @class="emptyClear" or @href]'):
report(verbose) c.getparent().remove(c)
if callable(getattr(e, 'getcode', None)) and \ desc = html.tostring(desc, method='html', encoding=unicode).strip()
e.getcode() == 404: # remove all attributes from tags
return None desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
attr = getattr(e, 'args', [None]) # Collapse whitespace
attr = attr if attr else [None] #desc = re.sub('\n+', '\n', desc)
if isinstance(attr[0], socket.timeout): #desc = re.sub(' +', ' ', desc)
raise AmazonError(_('Amazon timed out. Try again later.')) # Remove the notice about text referring to out of print editions
raise AmazonError(_('Amazon encountered an error.')) desc = re.sub(r'(?s)<em>--This text ref.*?</em>', '', desc)
if '<title>404 - ' in raw: # Remove comments
report(verbose) desc = re.sub(r'(?s)<!--.*?-->', '', desc)
return None mi.comments = sanitize_comments_html(desc)
raw = xml_to_unicode(raw, strip_encoding_pats=True,
resolve_entities=True)[0]
try:
return soupparser.fromstring(raw)
except:
try:
#remove ASCII invalid chars
return soupparser.fromstring(clean_ascii_chars(raw))
except:
report(verbose)
return None
def fetchdatathread(self, qbr, qsync, nb, url, verbose): return True
try:
browser = qbr.get(True)
entry = self.get_individual_metadata(url, browser, verbose)
except:
report(verbose)
entry = None
finally:
qbr.put(browser, True)
qsync.put((nb, entry), True)
def producer(self, sync, urls, br, verbose=False):
    """Start one download thread per result URL.

    Every thread runs ``self.fetchdatathread`` and receives the browser
    queue *br*, the result queue *sync*, the position of the URL inside
    *urls*, the URL itself and *verbose*. Threads are started but not
    joined here.
    """
    # enumerate() keeps position and URL paired without indexing by hand.
    for position, url in enumerate(urls):
        Thread(target=self.fetchdatathread,
               args=(br, sync, position, url, verbose)).start()
def consumer(self, sync, syncbis, br, total_entries, verbose=False):
    """Convert downloaded result pages into metadata entries.

    Reserves ``total_entries`` slots in ``self`` (initialised to None),
    then reads that many ``(position, entry)`` pairs from *sync*. For each
    usable entry the metadata built by ``self.fill_MI`` is stored at its
    position; when ``self.get_tags`` reports a separate tag page, a thread
    is started to download it using the browser queue *br*.

    Exactly one message per entry is delivered on *syncbis*: either by the
    tag download thread or, when no thread is started, as
    ``(position, None)``. The reader of *syncbis* waits for
    ``total_entries`` messages, so an entry that failed to download or
    parse must still be acknowledged or that reader would block forever.
    """
    self.extend([None] * total_entries)
    for _ in range(total_entries):
        rq = sync.get(True)
        nb = int(rq[0])
        entry = rq[1]
        tag_fetch_started = False
        if entry is not None:
            mi = self.fill_MI(entry, verbose)
            if mi is not None:
                mi.tags, atag = self.get_tags(entry, verbose)
                self[nb] = mi
                if atag:
                    # mi.tags holds what get_tags returned for the tag
                    # page; it is handed to the fetch thread as its URL.
                    Thread(target=self.fetchdatathread,
                           args=(br, syncbis, nb, mi.tags, verbose)).start()
                    tag_fetch_started = True
        if not tag_fetch_started:
            # Nothing else will report for this slot: acknowledge it here.
            syncbis.put((nb, None), True)
def final(self, sync, total_entries, verbose):
    """Fill in the tags of stored entries from downloaded tag pages.

    Reads ``total_entries`` ``(position, page)`` pairs from *sync*. When
    the page is not None, the first element returned by
    ``self.get_tags(page, verbose)`` replaces the ``tags`` attribute of
    the entry stored at that position; a None page leaves the entry
    untouched.
    """
    # A bounded for loop replaces the hand-maintained counter.
    for _ in range(total_entries):
        rq = sync.get(True)
        nb = int(rq[0])
        tag_page = rq[1]
        if tag_page is not None:
            self[nb].tags = self.get_tags(tag_page, verbose)[0]
def populate(self, entries, ibr, verbose=False, brcall=3):
    """Download and parse every entry using three cooperating threads.

    Two browser queues are prepared: one of size *brcall* (``brcall - 1``
    fresh browsers plus the caller's *ibr*) and one of size
    ``brcall - 1`` holding fresh browsers only. The ``producer``,
    ``consumer`` and ``final`` methods then run in their own threads,
    connected by two single-slot queues, and this call returns once all
    three have finished.
    """
    page_browsers = Queue(brcall)
    tag_browsers = Queue(brcall-1)
    pages = Queue(1)
    tag_pages = Queue(1)
    for _ in xrange(brcall-1):
        page_browsers.put(browser(), True)
        tag_browsers.put(browser(), True)
    page_browsers.put(ibr, True)

    stages = [
        Thread(target=self.producer,
               args=(pages, entries, page_browsers, verbose)),
        Thread(target=self.consumer,
               args=(pages, tag_pages, tag_browsers, len(entries), verbose)),
        Thread(target=self.final,
               args=(tag_pages, len(entries), verbose)),
    ]
    for stage in stages:
        stage.start()
    for stage in stages:
        stage.join()
def search(title=None, author=None, publisher=None, isbn=None,
           max_results=5, verbose=False, keywords=None, lang='all'):
    """Run a query and return the metadata of the matching books.

    Returns None when the query yields no entries; otherwise a list of
    the populated results with the None placeholders removed (the list
    may therefore be empty).
    """
    br = browser()
    query = Query(title=title, author=author, isbn=isbn, publisher=publisher,
                  keywords=keywords, max_results=max_results, rlang=lang)
    entries, baseurl = query(br, verbose)
    if entries is None or len(entries) == 0:
        return None

    # Result container, filled in place by populate()
    found = ResultList(baseurl, lang)
    found.populate(entries, br, verbose)
    return [mi for mi in found if mi is not None]
def get_social_metadata(title, authors, publisher, isbn, verbose=False,
                        max_results=1, lang='all'):
    """Return ``[mi]`` carrying the rating, comments and tags found by ISBN.

    ``mi`` is a MetaInformation built from *title* and *authors*. It is
    returned unchanged when *isbn* is missing or fails ``check_isbn``, or
    when neither *isbn* nor any ISBN associated with it through xisbn
    gives a search result. Otherwise every non-None ``rating``,
    ``comments`` and ``tags`` value of the first result is copied onto it.
    *publisher* is accepted but not used.
    """
    mi = MetaInformation(title, authors)
    if not isbn or not check_isbn(isbn):
        return [mi]

    def first_hit(candidate):
        hits = search(isbn=candidate, verbose=verbose,
                      max_results=max_results, lang=lang)
        # search() returns None or a list that may be empty once failed
        # entries are filtered out; indexing [0] blindly raised IndexError.
        return hits[0] if hits else None

    miaz = first_hit(isbn)
    if miaz is None:
        from calibre.ebooks.metadata.xisbn import xisbn
        for other_isbn in xisbn.get_associated_isbns(isbn):
            miaz = first_hit(other_isbn)
            if miaz is not None:
                break
    if miaz is None:
        return [mi]

    if miaz.rating is not None:
        mi.rating = miaz.rating
    if miaz.comments is not None:
        mi.comments = miaz.comments
    if miaz.tags is not None:
        mi.tags = miaz.tags
    return [mi]
def option_parser():
    """Build and return the OptionParser for the command-line interface.

    Options: title, author, publisher, ISBN, keywords, a counted
    ``--social`` flag, ``--max-results``, ``--lang`` and a counted
    ``--verbose`` flag.
    """
    import textwrap
    parser = OptionParser(textwrap.dedent(\
    _('''\
%prog [options]
Fetch book metadata from Amazon. You must specify one of title, author,
ISBN, publisher or keywords. Will fetch a maximum of 20 matches,
so you should make your query as specific as possible.
You can chose the language for metadata retrieval:
english & french & german
'''
    )))
    parser.add_option('-t', '--title', help=_('Book title'))
    parser.add_option('-a', '--author', help=_('Book author(s)'))
    parser.add_option('-p', '--publisher', help=_('Book publisher'))
    parser.add_option('-i', '--isbn', help=_('Book ISBN'))
    parser.add_option('-k', '--keywords', help=_('Keywords'))
    parser.add_option('-s', '--social', default=0, action='count',
            help=_('Get social data only'))
    # NOTE(review): no type= is given, so a value passed on the command line
    # presumably arrives as a string while the default is the int 10; the
    # usage text above also mentions a maximum of 20 -- confirm with callers.
    parser.add_option('-m', '--max-results', default=10,
            help=_('Maximum number of results to fetch'))
    parser.add_option('-l', '--lang', default='all',
            help=_('Chosen language for metadata search (en, fr, de)'))
    parser.add_option('-v', '--verbose', default=0, action='count',
            help=_('Be more verbose about errors'))
    return parser
def main(args=sys.argv): def main(args=sys.argv):
import tempfile, os import tempfile, os
@ -412,8 +214,3 @@ def main(args=sys.argv):
if __name__ == '__main__': if __name__ == '__main__':
sys.exit(main()) sys.exit(main())
# import cProfile
# sys.exit(cProfile.run("import calibre.ebooks.metadata.amazonbis; calibre.ebooks.metadata.amazonbis.main()"))
# sys.exit(cProfile.run("import calibre.ebooks.metadata.amazonbis; calibre.ebooks.metadata.amazonbis.main()", "profile"))
# calibre-debug -e "D:\Mes eBooks\Developpement\calibre\src\calibre\ebooks\metadata\amazon.py" -m 5 -a gore -v>data.html

View File

@ -212,6 +212,27 @@ class MetadataSource(Plugin): # {{{
# }}} # }}}
class Amazon(MetadataSource): # {{{
    """Metadata source plugin that downloads social metadata from Amazon.

    The lookup is keyed on the ISBN: ``fetch`` does nothing when
    ``self.isbn`` is empty.
    """

    name = 'Amazon'
    metadata_type = 'social'
    description = _('Downloads social metadata from amazon.com')

    has_html_comments = True

    def fetch(self):
        """Store the Amazon social metadata for this book in ``self.results``.

        Any exception raised by the lookup is kept in ``self.exception``,
        with its formatted traceback in ``self.tb``, instead of
        propagating.
        """
        if not self.isbn:
            return
        from calibre.ebooks.metadata.amazon import get_social_metadata
        try:
            self.results = get_social_metadata(self.title, self.book_author,
                    self.publisher, self.isbn)
        # "except X as e" is accepted by Python 2.6+ and Python 3, unlike
        # the Python-2-only comma form.
        except Exception as e:
            self.exception = e
            self.tb = traceback.format_exc()

    # }}}
class KentDistrictLibrary(MetadataSource): # {{{ class KentDistrictLibrary(MetadataSource): # {{{
name = 'Kent District Library' name = 'Kent District Library'