diff --git a/src/calibre/customize/builtins.py b/src/calibre/customize/builtins.py index ba90d20dcc..74f1f9eafe 100644 --- a/src/calibre/customize/builtins.py +++ b/src/calibre/customize/builtins.py @@ -580,12 +580,12 @@ from calibre.devices.folder_device.driver import FOLDER_DEVICE_FOR_CONFIG from calibre.devices.kobo.driver import KOBO from calibre.devices.bambook.driver import BAMBOOK -from calibre.ebooks.metadata.fetch import KentDistrictLibrary +from calibre.ebooks.metadata.fetch import KentDistrictLibrary, Amazon from calibre.ebooks.metadata.douban import DoubanBooks from calibre.ebooks.metadata.isbndb import ISBNDB from calibre.ebooks.metadata.google_books import GoogleBooks from calibre.ebooks.metadata.nicebooks import NiceBooks, NiceBooksCovers -from calibre.ebooks.metadata.amazon import Amazon, AmazonSocial +# from calibre.ebooks.metadata.amazon import Amazon , AmazonSocial from calibre.ebooks.metadata.fictionwise import Fictionwise from calibre.ebooks.metadata.covers import OpenLibraryCovers, \ AmazonCovers, DoubanCovers, LibrarythingCovers @@ -593,7 +593,7 @@ from calibre.library.catalog import CSV_XML, EPUB_MOBI, BIBTEX from calibre.ebooks.epub.fix.unmanifested import Unmanifested from calibre.ebooks.epub.fix.epubcheck import Epubcheck -plugins = [HTML2ZIP, PML2PMLZ, TXT2TXTZ, ArchiveExtract, GoogleBooks, ISBNDB, Amazon, AmazonSocial, +plugins = [HTML2ZIP, PML2PMLZ, TXT2TXTZ, ArchiveExtract, GoogleBooks, ISBNDB, Amazon, #AmazonSocial, KentDistrictLibrary, DoubanBooks, NiceBooks, CSV_XML, EPUB_MOBI, BIBTEX, Unmanifested, Epubcheck, OpenLibraryCovers, AmazonCovers, DoubanCovers, LibrarythingCovers, NiceBooksCovers] diff --git a/src/calibre/ebooks/metadata/amazon.py b/src/calibre/ebooks/metadata/amazon.py index a2ddc22770..c87249ed39 100644 --- a/src/calibre/ebooks/metadata/amazon.py +++ b/src/calibre/ebooks/metadata/amazon.py @@ -1,7 +1,11 @@ -from __future__ import with_statement -__license__ = 'GPL 3' -__copyright__ = '2010, sengian ' +#!/usr/bin/env python +__license__ = 'GPL v3' +__copyright__ = '2008, Kovid Goyal kovid@kovidgoyal.net' +__docformat__ = 'restructuredtext en' +''' +Fetch metadata using Amazon AWS +''' import sys, re from threading import RLock @@ -12,10 +16,6 @@ from calibre import browser from calibre.ebooks.metadata import check_isbn from calibre.ebooks.metadata.book.base import Metadata from calibre.ebooks.chardet import xml_to_unicode -from calibre.ebooks.metadata import MetaInformation, check_isbn, \ - authors_to_sort_string -from calibre.ebooks.metadata.fetch import MetadataSource -from calibre.utils.config import OptionParser from calibre.library.comments import sanitize_comments_html asin_cache = {} @@ -160,229 +160,31 @@ def get_metadata(br, asin, mi): m = pat.match(t) if m is not None: try: - default = utcnow().replace(day=15) - if self.lang != 'all': - d = replace_months(d, self.lang) - d = parse_date(d, assume_utc=True, default=default) - mi.pubdate = d + mi.rating = float(m.group(1))/float(m.group(2)) * 5 + break except: - report(verbose) - #ISBN - elt = filter(lambda x: self.reisbn.search(x.find('b').text), elts) - if elt: - isbn = elt[0].find('b').tail.replace('-', '').strip() - if check_isbn(isbn): - mi.isbn = unicode(isbn) - elif len(elt) > 1: - isbnone = elt[1].find('b').tail.replace('-', '').strip() - if check_isbn(isbnone): - mi.isbn = unicode(isbnone) - else: - #assume ASIN-> find a check for asin - mi.isbn = unicode(isbn) - #Langue - elt = filter(lambda x: self.relang.search(x.find('b').text), elts) - if elt: - langue = elt[0].find('b').tail.strip() - if langue: - mi.language = unicode(langue) - #ratings - elt = filter(lambda x: self.reratelt.search(x.find('b').text), elts) - if elt: - ratings = elt[0].find_class('swSprite') - if ratings: - ratings = self.rerat.findall(ratings[0].get('title')) - if len(ratings) == 2: - mi.rating = float(ratings[0])/float(ratings[1]) * 5 - return mi + pass - def fill_MI(self, entry, verbose): - try: - title = self.get_title(entry) - authors = self.get_authors(entry) - except Exception, e: - if verbose: - print _('Failed to get all details for an entry') - print e - print _('URL who failed: %s') % x - report(verbose) - return None - mi = MetaInformation(title, authors) - mi.author_sort = authors_to_sort_string(authors) - try: - mi.comments = self.get_description(entry, verbose) - mi = self.get_book_info(entry, mi, verbose) - except: - pass - return mi + desc = root.xpath('//div[@id="productDescription"]/*[@class="content"]') + if desc: + desc = desc[0] + for c in desc.xpath('descendant::*[@class="seeAll" or' + ' @class="emptyClear" or @href]'): + c.getparent().remove(c) + desc = html.tostring(desc, method='html', encoding=unicode).strip() + # remove all attributes from tags + desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc) + # Collapse whitespace + #desc = re.sub('\n+', '\n', desc) + #desc = re.sub(' +', ' ', desc) + # Remove the notice about text referring to out of print editions + desc = re.sub(r'(?s)--This text ref.*?', '', desc) + # Remove comments + desc = re.sub(r'(?s)', '', desc) + mi.comments = sanitize_comments_html(desc) - def get_individual_metadata(self, url, br, verbose): - try: - raw = br.open_novisit(url).read() - except Exception, e: - import socket - report(verbose) - if callable(getattr(e, 'getcode', None)) and \ - e.getcode() == 404: - return None - attr = getattr(e, 'args', [None]) - attr = attr if attr else [None] - if isinstance(attr[0], socket.timeout): - raise AmazonError(_('Amazon timed out. Try again later.')) - raise AmazonError(_('Amazon encountered an error.')) - if '404 - ' in raw: - report(verbose) - return None - raw = xml_to_unicode(raw, strip_encoding_pats=True, - resolve_entities=True)[0] - try: - return soupparser.fromstring(raw) - except: - try: - #remove ASCII invalid chars - return soupparser.fromstring(clean_ascii_chars(raw)) - except: - report(verbose) - return None + return True - def fetchdatathread(self, qbr, qsync, nb, url, verbose): - try: - browser = qbr.get(True) - entry = self.get_individual_metadata(url, browser, verbose) - except: - report(verbose) - entry = None - finally: - qbr.put(browser, True) - qsync.put((nb, entry), True) - - def producer(self, sync, urls, br, verbose=False): - for i in xrange(len(urls)): - thread = Thread(target=self.fetchdatathread, - args=(br, sync, i, urls[i], verbose)) - thread.start() - - def consumer(self, sync, syncbis, br, total_entries, verbose=False): - i=0 - self.extend([None]*total_entries) - while i < total_entries: - rq = sync.get(True) - nb = int(rq[0]) - entry = rq[1] - i+=1 - if entry is not None: - mi = self.fill_MI(entry, verbose) - if mi is not None: - mi.tags, atag = self.get_tags(entry, verbose) - self[nb] = mi - if atag: - thread = Thread(target=self.fetchdatathread, - args=(br, syncbis, nb, mi.tags, verbose)) - thread.start() - else: - syncbis.put((nb, None), True) - - def final(self, sync, total_entries, verbose): - i=0 - while i < total_entries: - rq = sync.get(True) - nb = int(rq[0]) - tags = rq[1] - i+=1 - if tags is not None: - self[nb].tags = self.get_tags(tags, verbose)[0] - - def populate(self, entries, ibr, verbose=False, brcall=3): - br = Queue(brcall) - cbr = Queue(brcall-1) - - syncp = Queue(1) - syncc = Queue(1) - - for i in xrange(brcall-1): - br.put(browser(), True) - cbr.put(browser(), True) - br.put(ibr, True) - - prod_thread = Thread(target=self.producer, args=(syncp, entries, br, verbose)) - cons_thread = Thread(target=self.consumer, args=(syncp, syncc, cbr, len(entries), verbose)) - fin_thread = Thread(target=self.final, args=(syncc, len(entries), verbose)) - prod_thread.start() - cons_thread.start() - fin_thread.start() - prod_thread.join() - cons_thread.join() - fin_thread.join() - - -def search(title=None, author=None, publisher=None, isbn=None, - max_results=5, verbose=False, keywords=None, lang='all'): - br = browser() - entries, baseurl = Query(title=title, author=author, isbn=isbn, publisher=publisher, - keywords=keywords, max_results=max_results,rlang=lang)(br, verbose) - - if entries is None or len(entries) == 0: - return None - - #List of entry - ans = ResultList(baseurl, lang) - ans.populate(entries, br, verbose) - return [x for x in ans if x is not None] - -def get_social_metadata(title, authors, publisher, isbn, verbose=False, - max_results=1, lang='all'): - mi = MetaInformation(title, authors) - if not isbn or not check_isbn(isbn): - return [mi] - - amazresults = search(isbn=isbn, verbose=verbose, - max_results=max_results, lang=lang) - if amazresults is None or amazresults[0] is None: - from calibre.ebooks.metadata.xisbn import xisbn - for i in xisbn.get_associated_isbns(isbn): - amazresults = search(isbn=i, verbose=verbose, - max_results=max_results, lang=lang) - if amazresults is not None and amazresults[0] is not None: - break - if amazresults is None or amazresults[0] is None: - return [mi] - - miaz = amazresults[0] - if miaz.rating is not None: - mi.rating = miaz.rating - if miaz.comments is not None: - mi.comments = miaz.comments - if miaz.tags is not None: - mi.tags = miaz.tags - return [mi] - -def option_parser(): - import textwrap - parser = OptionParser(textwrap.dedent(\ - _('''\ - %prog [options] - - Fetch book metadata from Amazon. You must specify one of title, author, - ISBN, publisher or keywords. Will fetch a maximum of 20 matches, - so you should make your query as specific as possible. - You can chose the language for metadata retrieval: - english & french & german - ''' - ))) - parser.add_option('-t', '--title', help=_('Book title')) - parser.add_option('-a', '--author', help=_('Book author(s)')) - parser.add_option('-p', '--publisher', help=_('Book publisher')) - parser.add_option('-i', '--isbn', help=_('Book ISBN')) - parser.add_option('-k', '--keywords', help=_('Keywords')) - parser.add_option('-s', '--social', default=0, action='count', - help=_('Get social data only')) - parser.add_option('-m', '--max-results', default=10, - help=_('Maximum number of results to fetch')) - parser.add_option('-l', '--lang', default='all', - help=_('Chosen language for metadata search (en, fr, de)')) - parser.add_option('-v', '--verbose', default=0, action='count', - help=_('Be more verbose about errors')) - return parser def main(args=sys.argv): import tempfile, os @@ -412,8 +214,3 @@ def main(args=sys.argv): if __name__ == '__main__': sys.exit(main()) - # import cProfile - # sys.exit(cProfile.run("import calibre.ebooks.metadata.amazonbis; calibre.ebooks.metadata.amazonbis.main()")) - # sys.exit(cProfile.run("import calibre.ebooks.metadata.amazonbis; calibre.ebooks.metadata.amazonbis.main()", "profile")) - -# calibre-debug -e "D:\Mes eBooks\Developpement\calibre\src\calibre\ebooks\metadata\amazon.py" -m 5 -a gore -v>data.html \ No newline at end of file diff --git a/src/calibre/ebooks/metadata/fetch.py b/src/calibre/ebooks/metadata/fetch.py index 5936222e24..978e460190 100644 --- a/src/calibre/ebooks/metadata/fetch.py +++ b/src/calibre/ebooks/metadata/fetch.py @@ -212,6 +212,27 @@ class MetadataSource(Plugin): # {{{ # }}} +class Amazon(MetadataSource): # {{{ + + name = 'Amazon' + metadata_type = 'social' + description = _('Downloads social metadata from amazon.com') + + has_html_comments = True + + def fetch(self): + if not self.isbn: + return + from calibre.ebooks.metadata.amazon import get_social_metadata + try: + self.results = get_social_metadata(self.title, self.book_author, + self.publisher, self.isbn) + except Exception, e: + self.exception = e + self.tb = traceback.format_exc() + + # }}} + class KentDistrictLibrary(MetadataSource): # {{{ name = 'Kent District Library'