mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
More work on the new Amazon metadata download plugin
This commit is contained in:
parent
ca5703b250
commit
1edc4f5a80
@ -7,7 +7,7 @@ __license__ = 'GPL v3'
|
|||||||
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
|
__copyright__ = '2011, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||||
__docformat__ = 'restructuredtext en'
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
import socket, time
|
import socket, time, re
|
||||||
from urllib import urlencode
|
from urllib import urlencode
|
||||||
from threading import Thread
|
from threading import Thread
|
||||||
|
|
||||||
@ -18,9 +18,15 @@ from calibre.ebooks.metadata import check_isbn
|
|||||||
from calibre.ebooks.metadata.sources.base import Source
|
from calibre.ebooks.metadata.sources.base import Source
|
||||||
from calibre.utils.cleantext import clean_ascii_chars
|
from calibre.utils.cleantext import clean_ascii_chars
|
||||||
from calibre.ebooks.chardet import xml_to_unicode
|
from calibre.ebooks.chardet import xml_to_unicode
|
||||||
|
from calibre.ebooks.metadata.book.base import Metadata
|
||||||
|
from calibre.library.comments import sanitize_comments_html
|
||||||
|
|
||||||
class Worker(Thread):
|
class Worker(Thread):
|
||||||
|
|
||||||
|
'''
|
||||||
|
Get book details from Amazon's book page in a separate thread
|
||||||
|
'''
|
||||||
|
|
||||||
def __init__(self, url, result_queue, browser, log, timeout=20):
|
def __init__(self, url, result_queue, browser, log, timeout=20):
|
||||||
self.url, self.result_queue = url, result_queue
|
self.url, self.result_queue = url, result_queue
|
||||||
self.log, self.timeout = log, timeout
|
self.log, self.timeout = log, timeout
|
||||||
@ -75,7 +81,117 @@ class Worker(Thread):
|
|||||||
self.parse_details(root)
|
self.parse_details(root)
|
||||||
|
|
||||||
def parse_details(self, root):
|
def parse_details(self, root):
    '''
    Build a Metadata object from the parsed book page and queue it.

    ASIN, title and authors are mandatory: if any of them cannot be
    extracted the failure is logged and nothing is put on the result
    queue. Rating, comments and cover are best effort: a failure in
    one of them is logged and the remaining fields are still processed.

    :param root: parsed document for the page at ``self.url``; it is
                 handed unchanged to the individual ``parse_*`` methods
    '''
    # ``except Exception`` (rather than a bare ``except``) keeps the
    # best-effort behaviour for parsing errors while letting
    # KeyboardInterrupt/SystemExit propagate.
    try:
        asin = self.parse_asin(root)
    except Exception:
        self.log.exception('Error parsing asin for url: %r'%self.url)
        asin = None

    try:
        title = self.parse_title(root)
    except Exception:
        self.log.exception('Error parsing title for url: %r'%self.url)
        title = None

    try:
        authors = self.parse_authors(root)
    except Exception:
        self.log.exception('Error parsing authors for url: %r'%self.url)
        authors = []

    if not title or not authors or not asin:
        self.log.error('Could not find title/authors/asin for %r'%self.url)
        self.log.error('ASIN: %r Title: %r Authors: %r'%(asin, title,
            authors))
        return

    mi = Metadata(title, authors)
    mi.set_identifier('amazon', asin)
    self.amazon_id = asin

    try:
        mi.rating = self.parse_ratings(root)
    except Exception:
        self.log.exception('Error parsing ratings for url: %r'%self.url)

    try:
        mi.comments = self.parse_comments(root)
    except Exception:
        self.log.exception('Error parsing comments for url: %r'%self.url)

    try:
        # Only assigned when the cover could be parsed without error
        self.cover_url = self.parse_cover(root)
    except Exception:
        self.log.exception('Error parsing cover for url: %r'%self.url)

    self.result_queue.put(mi)
|
||||||
|
|
||||||
|
def parse_asin(self, root):
    '''
    Return the last path component of the page's canonical URL.

    :param root: parsed document; must support ``xpath()``
    :return: the final non-empty path segment of the first canonical
             ``<link>`` that yields one, or ``None`` if there is none
    '''
    for link in root.xpath('//link[@rel="canonical" and @href]'):
        # Strip trailing slashes first, otherwise a URL such as
        # ".../dp/XXXX/" would yield an empty identifier
        asin = link.get('href').rstrip('/').rpartition('/')[-1]
        if asin:
            return asin
    return None
|
||||||
|
|
||||||
|
def parse_title(self, root):
    '''
    Return the book title with any bracketed annotation removed.

    The text is taken from the element with id ``btAsinTitle`` inside
    the ``parseasinTitle`` heading when present, otherwise from the
    heading itself.

    :param root: parsed document; must support ``xpath()``
    :return: the stripped title text
    :raises IndexError: if the page has no ``parseasinTitle`` heading
    '''
    tdiv = root.xpath('//h1[@class="parseasinTitle"]')[0]
    actual_title = tdiv.xpath('descendant::*[@id="btAsinTitle"]')
    source = actual_title[0] if actual_title else tdiv
    title = tostring(source, encoding=unicode, method='text').strip()
    # Remove everything from the first "(" or "[" up to the last ")" or
    # "]", e.g. "[Paperback]". The brackets are escaped inside the
    # character classes: the previous pattern '[([].*[)]]' parsed as
    # "')' followed by a literal ']'" and so almost never matched.
    return re.sub(r'[(\[].*[)\]]', '', title).strip()
|
||||||
|
|
||||||
|
def parse_authors(self, root):
    '''
    Return the list of contributor names shown in the "buying" div.

    :param root: parsed document; must support ``xpath()``
    :return: stripped text of every ``contributorNameTrigger`` span
             below the first ``div`` with class ``buying``
    :raises IndexError: if the page has no such ``div``
    '''
    buying = root.xpath('//div[@class="buying"]')[0]
    names = []
    for span in buying.xpath(
            'descendant::span[@class="contributorNameTrigger"]'):
        names.append(tostring(span, encoding=unicode, method='text').strip())
    return names
|
||||||
|
|
||||||
|
def parse_ratings(self, root):
    '''
    Return the average customer rating scaled to a maximum of 5.

    Looks at the ``title`` attributes inside the first
    ``asinReviewsSummary`` element of the ``handleBuy`` form for text
    starting with "<x> out of <y> stars".

    :param root: parsed document; must support ``xpath()``
    :return: ``x / y * 5`` as a float for the first usable title, or
             ``None`` when no rating can be found
    '''
    ratings = root.xpath('//form[@id="handleBuy"]/descendant::*[@class="asinReviewsSummary"]')
    if not ratings:
        return None
    pat = re.compile(r'([0-9.]+) out of (\d+) stars')
    for elem in ratings[0].xpath('descendant::*[@title]'):
        m = pat.match(elem.get('title'))
        if m is None:
            continue
        try:
            return float(m.group(1))/float(m.group(2)) * 5
        except (ValueError, ZeroDivisionError):
            # '[0-9.]+' also matches non-numbers such as '...', and the
            # denominator may be 0; skip this element and keep looking
            continue
    return None
|
||||||
|
|
||||||
|
def parse_comments(self, root):
    '''
    Return the product description as sanitized HTML.

    :param root: parsed document; must support ``xpath()``
    :return: the result of ``sanitize_comments_html`` on the cleaned
             description markup, or ``None`` when the page has no
             ``productDescription`` content element
    '''
    found = root.xpath('//div[@id="productDescription"]/*[@class="content"]')
    if not found:
        return None
    content = found[0]

    # Drop "see all"/spacer elements and anything carrying an href
    unwanted = content.xpath('descendant::*[@class="seeAll" or'
            ' @class="emptyClear" or @href]')
    for node in unwanted:
        node.getparent().remove(node)

    html = tostring(content, method='html', encoding=unicode).strip()
    cleanups = (
        # remove all attributes from tags
        (r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>'),
        # Remove the notice about text referring to out of print editions
        (r'(?s)<em>--This text ref.*?</em>', ''),
        # Remove comments
        (r'(?s)<!--.*?-->', ''),
    )
    for pattern, replacement in cleanups:
        html = re.sub(pattern, replacement, html)
    return sanitize_comments_html(html)
|
||||||
|
|
||||||
|
def parse_cover(self, root):
    '''
    Return the cover image URL with the size modifiers removed.

    The ``src`` of the first ``prodImage`` image is used. Its file name
    is split on ``_``; when there are more than two pieces, only the
    first and last piece are kept (dropping whatever lies between the
    underscores).

    :param root: parsed document; must support ``xpath()``
    :return: the rewritten URL, or ``None`` when there is no such
             image, the ``src`` has three or fewer ``/``-separated
             parts, or the file name has no modifiers to remove
    '''
    # NOTE(review): indentation was lost in the source this was taken
    # from; the return is assumed to belong to the "has modifiers"
    # branch -- confirm against the repository.
    imgs = root.xpath('//img[@id="prodImage" and @src]')
    if not imgs:
        return None
    parts = imgs[0].get('src').split('/')
    if len(parts) <= 3:
        return None
    sparts = parts[-1].split('_')
    if len(sparts) <= 2:
        return None
    stem, suffix = sparts[0], sparts[-1]
    # "name._SL500_AA300_.jpg" splits into "name." and ".jpg"; joining
    # them verbatim would produce "name..jpg", so drop one of the dots
    if stem.endswith('.') and suffix.startswith('.'):
        suffix = suffix[1:]
    return '/'.join(parts[:-1]) + '/' + stem + suffix
|
||||||
|
|
||||||
|
|
||||||
class Amazon(Source):
|
class Amazon(Source):
|
||||||
|
@ -508,9 +508,9 @@ You have two choices:
|
|||||||
1. Create a patch by hacking on |app| and send it to me for review and inclusion. See `Development <http://calibre-ebook.com/get-involved>`_.
|
1. Create a patch by hacking on |app| and send it to me for review and inclusion. See `Development <http://calibre-ebook.com/get-involved>`_.
|
||||||
2. `Open a ticket <http://bugs.calibre-ebook.com/newticket>`_ (you have to register and login first). Remember that |app| development is done by volunteers, so if you get no response to your feature request, it means no one feels like implementing it.
|
2. `Open a ticket <http://bugs.calibre-ebook.com/newticket>`_ (you have to register and login first). Remember that |app| development is done by volunteers, so if you get no response to your feature request, it means no one feels like implementing it.
|
||||||
|
|
||||||
Can I include |app| on a CD to be distributed with my product/magazine?
|
How is |app| licensed?
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|app| is licensed under the GNU General Public License v3 (an open source license). This means that you are free to redistribute |app| as long as you make the source code available. So if you want to put |app| on a CD with your product, you must also put the |app| source code on the CD. The source code is available for download `from googlecode <http://code.google.com/p/calibre-ebook/downloads/list>`_.
|
|app| is licensed under the GNU General Public License v3 (an open source license). This means that you are free to redistribute |app| as long as you make the source code available. So if you want to put |app| on a CD with your product, you must also put the |app| source code on the CD. The source code is available for download `from googlecode <http://code.google.com/p/calibre-ebook/downloads/list>`_. You are free to use the results of conversions from |app| however you want. You cannot use code or libraries from |app| in your software without making your software open source. For details, see `The GNU GPL v3 <http://www.gnu.org/licenses/gpl.html>`_.
|
||||||
|
|
||||||
How do I run calibre from my USB stick?
|
How do I run calibre from my USB stick?
|
||||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
Loading…
x
Reference in New Issue
Block a user