def get_parsed_proxy(typ='http', debug=True, proxies=None):
    '''
    Return the configured proxy of type `typ` split into its components.

    :param typ: Proxy scheme to look up, for example ``'http'`` or ``'ftp'``.
    :param debug: If True, report the parsed proxy via `prints`. The flag is
                  also forwarded to `get_proxies` when that is called.
    :param proxies: Optional mapping of scheme -> proxy URL. When None (the
                    default) the mapping is obtained from `get_proxies`.
    :return: A dict with the keys ``'host'``, ``'port'``, ``'user'`` and
             ``'pass'``. ``'port'`` is an int; every component that is
             missing from the URL is None. Returns None when no proxy of
             the requested type exists or its URL cannot be parsed.
    '''
    if proxies is None:
        proxies = get_proxies(debug)
    if typ not in proxies:
        return None

    # The scheme is escaped so that it is always matched literally.
    pattern = re.compile((
        r'(?:%s://)?'
        r'(?:(?P<user>\w+):(?P<pass>.*)@)?'
        r'(?P<host>[\w\-\.]+)'
        r'(?::(?P<port>\d+))?') % re.escape(typ))

    # Index with the requested type itself, not the literal string 'typ'.
    match = pattern.match(proxies[typ])
    if match is None:
        # Without this guard the result dict would be unbound below.
        return None

    # groupdict() yields exactly the keys user, pass, host and port.
    ans = match.groupdict()
    if ans['port']:
        # The group only matches digits, so the conversion cannot fail.
        ans['port'] = int(ans['port'])
    if debug:
        prints('Using %s proxy' % typ, ans)
    return ans
import sys, re
from datetime import datetime

AWS_NS = 'http://webservices.amazon.com/AWSECommerceService/2005-10-05'

def AWS(tag):
    '''Return `tag` qualified with the AWS namespace, as ``{namespace}tag``.'''
    return '{%s}%s'%(AWS_NS, tag)

def check_for_errors(root):
    '''
    Raise an Exception if the parsed response `root` contains an AWS
    ``Error`` element anywhere below it; otherwise return None.

    The exception message carries the text content of the error element.
    '''
    err = root.find('.//'+AWS('Error'))
    if err is not None:
        raise Exception('Failed to get metadata with error: '
                + ''.join(err.itertext()).strip())

def _parse_pubdate(text):
    '''Parse a publication date string into a datetime.

    Fields missing from `text` are taken from the current UTC year and
    month, with the day defaulting to the 15th.
    '''
    from dateutil import parser
    now = datetime.utcnow()
    default = datetime(now.year, now.month, 15)
    return parser.parse(text, default=default)

def _reliable_rating(root):
    '''Return the average rating as a float, or None if it is unusable.

    A rating is used only when there are more than 4 reviews and the value
    lies strictly between 0 and 5.
    '''
    try:
        rating = float(root.findtext('.//'+AWS('AverageRating')))
        num_of_reviews = int(root.findtext('.//'+AWS('TotalReviews')))
    except (TypeError, ValueError):
        # findtext() returned None or the text is not a number.
        return None
    if num_of_reviews > 4 and 0 < rating < 5:
        return rating
    return None

def _subject_tags(root):
    '''Return the list of tags from the Subject elements, split on "/".'''
    tags = []
    for node in root.findall('.//%s/%s'%(AWS('Subjects'), AWS('Subject'))):
        if node.text:  # an empty element has text None
            tags.extend(y.strip() for y in node.text.split('/'))
    return [t for t in tags if t]

def _review_to_text(markup):
    '''Convert the HTML-ish markup of an editorial review to plain text.

    Paragraph, DIV and BR tags become blank lines, italic and bold tags
    become * and ** respectively, and all other tags are dropped.
    '''
    markup = re.sub(r'<([pP]|DIV)>', '\n\n', markup)
    markup = re.sub(r'</?[iI]>', '*', markup)
    markup = re.sub(r'</?[bB]>', '**', markup)
    markup = re.sub(r'<BR>', '\n\n', markup)
    markup = re.sub(r'<[^>]+>', '', markup)
    return markup.strip()

def get_social_metadata(title, authors, publisher, isbn):
    '''
    Fetch metadata for `isbn` from the calibre AWS metadata service.

    :param title: Title used to create the returned metadata object.
    :param authors: Authors used to create the returned metadata object.
    :param publisher: Not used; the publisher is read from the response.
    :param isbn: ISBN to look up. If empty, no request is made.
    :return: A MetaInformation object. When a lookup happens, title,
             authors, publisher, pubdate, rating, tags and comments are
             filled in from whichever of them the response provides.
    :raises Exception: If the response contains an AWS error element.
    '''
    # Project and third-party modules are imported on first use.
    from calibre import browser
    from calibre.ebooks.metadata import MetaInformation, string_to_authors

    mi = MetaInformation(title, authors)
    if not isbn:
        return mi

    from lxml import etree
    br = browser()
    response_xml = br.open('http://status.calibre-ebook.com/aws/metadata/'+isbn).read()
    root = etree.fromstring(response_xml)
    check_for_errors(root)

    # Do not clobber the supplied title if the response has none.
    found_title = root.findtext('.//'+AWS('Title'))
    if found_title:
        mi.title = found_title
    found_authors = [x.text for x in root.findall('.//'+AWS('Author'))
            if x.text]
    if found_authors:
        mi.authors = []
        for x in found_authors:
            mi.authors.extend(string_to_authors(x))
    found_publisher = root.findtext('.//'+AWS('Publisher'))
    if found_publisher:
        mi.publisher = found_publisher

    # findtext() returns a string, which is parsed directly.
    raw_date = root.findtext('.//'+AWS('PublicationDate'))
    if raw_date:
        try:
            mi.pubdate = _parse_pubdate(raw_date)
        except (ValueError, OverflowError):
            pass  # unparseable date: best effort, leave pubdate unset

    rating = _reliable_rating(root)
    if rating is not None:
        mi.rating = rating

    tags = _subject_tags(root)
    if tags:
        mi.tags = tags

    comments = root.find('.//%s/%s'%(AWS('EditorialReview'),
        AWS('Content')))
    if comments is not None:
        review = etree.tostring(comments, method='text', encoding=unicode)
        mi.comments = _('EDITORIAL REVIEW')+':\n\n'+_review_to_text(review)

    return mi


def main(args=sys.argv):
    '''Print the social metadata for the ISBN given as the first argument.

    A sample ISBN is used when no argument is supplied.
    '''
    isbn = args[1] if len(args) > 1 else '9781416551720'
    print(get_social_metadata(None, None, None, isbn))
    return 0

if __name__ == '__main__':
    sys.exit(main())
class Amazon(MetadataSource):
    '''
    Metadata source that fetches social metadata for a book by ISBN.

    Unlike the results of a 'basic' source, `self.results` is set to the
    single object returned by
    `calibre.ebooks.metadata.amazon.get_social_metadata`, not a list.
    '''

    name = 'Amazon'
    metadata_type = 'social'

    def fetch(self):
        '''
        Look up `self.isbn` and store the outcome on this object.

        Does nothing when there is no ISBN. On failure the exception and
        its formatted traceback are stored in `self.exception` and
        `self.tb` instead of being raised.
        '''
        if not self.isbn:
            return
        from calibre.ebooks.metadata.amazon import get_social_metadata
        try:
            self.results = get_social_metadata(self.title, self.author,
                    self.publisher, self.isbn)
        except Exception:
            # sys.exc_info() avoids version-specific "except ... , e" syntax.
            self.exception = sys.exc_info()[1]
            self.tb = traceback.format_exc()

def get_social_metadata(mi, verbose=0):
    '''
    Fetch social metadata from all 'social' metadata sources and merge it
    into `mi` in place.

    Merge rules:

      * rating: the mean of the downloaded ratings; averaged with the
        existing `mi.rating` if there is one.
      * tags: the sorted union of existing and downloaded tags.
      * pubdate: filled in only when `mi.pubdate` is None.
      * comments: existing comments are kept, and each downloaded comment
        not already contained in them is appended, separated by a blank
        line, in source order.

    :param mi: Metadata object providing title, authors, publisher and
               isbn for the lookup; it is updated in place.
    :param verbose: Verbosity level passed on to every source.
    :return: A list of ``(name, exception, traceback)`` tuples, one per
             source.
    '''
    from calibre.customize.ui import metadata_sources
    fetchers = list(metadata_sources(metadata_type='social'))
    # Start every source first, then wait for all of them.
    for fetcher in fetchers:
        fetcher(mi.title, mi.authors, mi.publisher, mi.isbn, verbose)
    for fetcher in fetchers:
        fetcher.join()

    # Comments are kept in a list so that the merged text is deterministic.
    ratings, tags, comments = [], set(), []
    for fetcher in fetchers:
        dmi = fetcher.results
        if not dmi:
            continue
        if dmi.rating is not None:
            ratings.append(dmi.rating)
        if dmi.tags:
            tags.update(dmi.tags)
        if mi.pubdate is None and dmi.pubdate is not None:
            mi.pubdate = dmi.pubdate
        if dmi.comments and dmi.comments not in comments:
            comments.append(dmi.comments)

    if ratings:
        rating = sum(ratings)/float(len(ratings))
        if mi.rating is None:
            mi.rating = rating
        else:
            mi.rating = (mi.rating + rating)/2.0
    if tags:
        mi.tags = sorted(tags.union(mi.tags or []))
    if comments:
        # Keep what is already there and join without a leading separator.
        existing = mi.comments.strip() if mi.comments else ''
        parts = [existing] if existing else []
        parts.extend(c for c in comments if c not in existing)
        mi.comments = '\n\n'.join(parts)

    return [(x.name, x.exception, x.tb) for x in fetchers]
100644 --- a/src/calibre/ebooks/metadata/isbndb.py +++ b/src/calibre/ebooks/metadata/isbndb.py @@ -125,7 +125,16 @@ def create_books(opts, args, timeout=5.): if opts.verbose: print ('ISBNDB query: '+url) - return [ISBNDBMetadata(book) for book in fetch_metadata(url, timeout=timeout)] + tans = [ISBNDBMetadata(book) for book in fetch_metadata(url, timeout=timeout)] + ans = [] + for x in tans: + add = True + for y in ans: + if y.isbn == x.isbn: + add = False + if add: + ans.append(x) + return ans def main(args=sys.argv): parser = option_parser() diff --git a/src/calibre/manual/plugins.rst b/src/calibre/manual/plugins.rst index 8ba33e036d..1969a70a4b 100644 --- a/src/calibre/manual/plugins.rst +++ b/src/calibre/manual/plugins.rst @@ -122,6 +122,8 @@ Metadata download plugins :class:`MetaInformation` objects. If there is an error, it should be stored in `self.exception` and `self.tb` (for the traceback). +.. automember:: calibre.ebooks.metadata.fetch.MetadataSource.metadata_type + .. automethod:: calibre.ebooks.metadata.fetch.MetadataSource.fetch .. automethod:: calibre.ebooks.metadata.fetch.MetadataSource.is_ok