diff --git a/src/calibre/ebooks/metadata/sources/amazon.py b/src/calibre/ebooks/metadata/sources/amazon.py index a62a9683cb..33ea24c421 100644 --- a/src/calibre/ebooks/metadata/sources/amazon.py +++ b/src/calibre/ebooks/metadata/sources/amazon.py @@ -7,7 +7,7 @@ __license__ = 'GPL v3' __copyright__ = '2011, Kovid Goyal ' __docformat__ = 'restructuredtext en' -import socket, time +import socket, time, re from urllib import urlencode from threading import Thread @@ -18,9 +18,15 @@ from calibre.ebooks.metadata import check_isbn from calibre.ebooks.metadata.sources.base import Source from calibre.utils.cleantext import clean_ascii_chars from calibre.ebooks.chardet import xml_to_unicode +from calibre.ebooks.metadata.book.base import Metadata +from calibre.library.comments import sanitize_comments_html class Worker(Thread): + ''' + Get book details from amazons book page in a separate thread + ''' + def __init__(self, url, result_queue, browser, log, timeout=20): self.url, self.result_queue = url, result_queue self.log, self.timeout = log, timeout @@ -75,7 +81,117 @@ class Worker(Thread): self.parse_details(root) def parse_details(self, root): - pass + try: + asin = self.parse_asin(root) + except: + self.log.exception('Error parsing asin for url: %r'%self.url) + asin = None + + try: + title = self.parse_title(root) + except: + self.log.exception('Error parsing title for url: %r'%self.url) + title = None + + try: + authors = self.parse_authors(root) + except: + self.log.exception('Error parsing authors for url: %r'%self.url) + authors = [] + + + if not title or not authors or not asin: + self.log.error('Could not find title/authors/asin for %r'%self.url) + self.log.error('ASIN: %r Title: %r Authors: %r'%(asin, title, + authors)) + return + + mi = Metadata(title, authors) + mi.set_identifier('amazon', asin) + self.amazon_id = asin + + try: + mi.rating = self.parse_ratings(root) + except: + self.log.exception('Error parsing ratings for url: %r'%self.url) + + try: + 
mi.comments = self.parse_comments(root) + except: + self.log.exception('Error parsing comments for url: %r'%self.url) + + try: + self.cover_url = self.parse_cover(root) + except: + self.log.exception('Error parsing cover for url: %r'%self.url) + + self.result_queue.put(mi) + + def parse_asin(self, root): + link = root.xpath('//link[@rel="canonical" and @href]') + for l in link: + return l.get('href').rpartition('/')[-1] + + def parse_title(self, root): + tdiv = root.xpath('//h1[@class="parseasinTitle"]')[0] + actual_title = tdiv.xpath('descendant::*[@id="btAsinTitle"]') + if actual_title: + title = tostring(actual_title[0], encoding=unicode, + method='text').strip() + else: + title = tostring(tdiv, encoding=unicode, method='text').strip() + return re.sub(r'[([].*[)]]', '', title).strip() + + def parse_authors(self, root): + bdiv = root.xpath('//div[@class="buying"]')[0] + aname = bdiv.xpath('descendant::span[@class="contributorNameTrigger"]') + authors = [tostring(x, encoding=unicode, method='text').strip() for x + in aname] + return authors + + def parse_ratings(self, root): + ratings = root.xpath('//form[@id="handleBuy"]/descendant::*[@class="asinReviewsSummary"]') + pat = re.compile(r'([0-9.]+) out of (\d+) stars') + if ratings: + for elem in ratings[0].xpath('descendant::*[@title]'): + t = elem.get('title') + m = pat.match(t) + if m is not None: + try: + return float(m.group(1))/float(m.group(2)) * 5 + except: + pass + + def parse_comments(self, root): + desc = root.xpath('//div[@id="productDescription"]/*[@class="content"]') + if desc: + desc = desc[0] + for c in desc.xpath('descendant::*[@class="seeAll" or' + ' @class="emptyClear" or @href]'): + c.getparent().remove(c) + desc = tostring(desc, method='html', encoding=unicode).strip() + # remove all attributes from tags + desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc) + # Collapse whitespace + #desc = re.sub('\n+', '\n', desc) + #desc = re.sub(' +', ' ', desc) + # Remove the notice about text 
referring to out of print editions + desc = re.sub(r'(?s)--This text ref.*?', '', desc) + # Remove comments + desc = re.sub(r'(?s)', '', desc) + return sanitize_comments_html(desc) + + def parse_cover(self, root): + imgs = root.xpath('//img[@id="prodImage" and @src]') + if imgs: + src = imgs[0].get('src') + parts = src.split('/') + if len(parts) > 3: + bn = parts[-1] + sparts = bn.split('_') + if len(sparts) > 2: + bn = sparts[0] + sparts[-1] + return ('/'.join(parts[:-1]))+'/'+bn class Amazon(Source): diff --git a/src/calibre/manual/faq.rst b/src/calibre/manual/faq.rst index a3d4332fd0..948611f775 100644 --- a/src/calibre/manual/faq.rst +++ b/src/calibre/manual/faq.rst @@ -508,9 +508,9 @@ You have two choices: 1. Create a patch by hacking on |app| and send it to me for review and inclusion. See `Development `_. 2. `Open a ticket `_ (you have to register and login first). Remember that |app| development is done by volunteers, so if you get no response to your feature request, it means no one feels like implementing it. -Can I include |app| on a CD to be distributed with my product/magazine? +How is |app| licensed? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -|app| is licensed under the GNU General Public License v3 (an open source license). This means that you are free to redistribute |app| as long as you make the source code available. So if you want to put |app| on a CD with your product, you must also put the |app| source code on the CD. The source code is available for download `from googlecode `_. +|app| is licensed under the GNU General Public License v3 (an open source license). This means that you are free to redistribute |app| as long as you make the source code available. So if you want to put |app| on a CD with your product, you must also put the |app| source code on the CD. The source code is available for download `from googlecode `_. You are free to use the results of conversions from |app| however you want. 
You cannot use code or libraries from |app| in your software without making your software open source. For details, see `The GNU GPL v3 <http://www.gnu.org/licenses/gpl.html>`_. How do I run calibre from my USB stick? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~