mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Add threading to Amazon (still lagging like hell)
This commit is contained in:
parent
bb2d2a6641
commit
f766eb871c
@ -3,11 +3,12 @@ __license__ = 'GPL 3'
|
|||||||
__copyright__ = '2010, sengian <sengian1@gmail.com>'
|
__copyright__ = '2010, sengian <sengian1@gmail.com>'
|
||||||
|
|
||||||
import sys, textwrap, re, traceback
|
import sys, textwrap, re, traceback
|
||||||
|
from threading import Thread
|
||||||
|
from Queue import Queue
|
||||||
from urllib import urlencode
|
from urllib import urlencode
|
||||||
from math import ceil
|
from math import ceil
|
||||||
|
|
||||||
from lxml import html
|
from lxml.html import soupparser, tostring
|
||||||
from lxml.html import soupparser
|
|
||||||
|
|
||||||
from calibre.utils.date import parse_date, utcnow, replace_months
|
from calibre.utils.date import parse_date, utcnow, replace_months
|
||||||
from calibre.utils.cleantext import clean_ascii_chars
|
from calibre.utils.cleantext import clean_ascii_chars
|
||||||
@ -116,6 +117,48 @@ def report(verbose):
|
|||||||
if verbose:
|
if verbose:
|
||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
|
|
||||||
|
class AmazonError(Exception):
    """Error raised when a request to Amazon times out or otherwise fails."""
|
||||||
|
|
||||||
|
class BrowserThread(Thread):
    """Thread that downloads one URL and parses the page into a tree.

    The parsed document is stored in ``self.result``; it stays ``None`` when
    the page is a 404 or cannot be parsed. Read it with ``get_result()``
    once the thread has been joined.
    """

    def __init__(self, url, verbose=False, timeout=10., ex=Exception, name='Meta'):
        """Store the request parameters; nothing is fetched until start().

        :param url: address opened by ``run()``
        :param verbose: forwarded to ``report()`` when something goes wrong
        :param timeout: timeout handed to ``open_novisit``
        :param ex: exception class raised by ``run()`` on timeouts and on
                   errors other than a 404
        :param name: plugin name interpolated into the error messages
        """
        Thread.__init__(self)
        self.url = url
        self.ex = ex
        self.plugname = name
        self.verbose = verbose
        self.timeout = timeout
        self.result = None

    def get_result(self):
        """Return the parsed document, or ``None`` if there is none."""
        return self.result

    def run(self):
        """Fetch ``self.url`` and store the parsed page in ``self.result``.

        A 404 (HTTP status or a '404' page title) leaves the result at
        ``None`` and returns quietly. Any other download failure raises
        ``self.ex`` with a translated message.
        """
        # Imported here so the timeout check below does not depend on a
        # module-level ``import socket``.
        import socket

        try:
            raw = browser().open_novisit(self.url, timeout=self.timeout).read()
        except Exception as e:
            report(self.verbose)
            if callable(getattr(e, 'getcode', None)) and \
                    e.getcode() == 404:
                # A missing page is not an error: stop here instead of
                # falling through to the generic failure below.
                self.result = None
                return
            # e.args may be an empty tuple; never index it blindly.
            err_args = getattr(e, 'args', None) or (None,)
            if isinstance(err_args[0], socket.timeout):
                raise self.ex(_('%s timed out. Try again later.') % self.plugname)
            raise self.ex(_('%s encountered an error.') % self.plugname)

        if '<title>404 - ' in raw:
            report(self.verbose)
            # Do not parse the error page as if it were a real result.
            self.result = None
            return

        raw = xml_to_unicode(raw, strip_encoding_pats=True,
                             resolve_entities=True)[0]
        try:
            self.result = soupparser.fromstring(raw)
        except Exception:
            try:
                # remove ASCII invalid chars
                self.result = soupparser.fromstring(clean_ascii_chars(raw))
            except Exception:
                self.result = None
|
||||||
|
|
||||||
|
|
||||||
class Query(object):
|
class Query(object):
|
||||||
|
|
||||||
@ -189,7 +232,7 @@ class Query(object):
|
|||||||
|
|
||||||
def __call__(self, browser, verbose, timeout = 5.):
|
def __call__(self, browser, verbose, timeout = 5.):
|
||||||
if verbose:
|
if verbose:
|
||||||
print 'Query:', self.urldata
|
print _('Query: %s') % self.urldata
|
||||||
|
|
||||||
try:
|
try:
|
||||||
raw = browser.open_novisit(self.urldata, timeout=timeout).read()
|
raw = browser.open_novisit(self.urldata, timeout=timeout).read()
|
||||||
@ -197,10 +240,12 @@ class Query(object):
|
|||||||
report(verbose)
|
report(verbose)
|
||||||
if callable(getattr(e, 'getcode', None)) and \
|
if callable(getattr(e, 'getcode', None)) and \
|
||||||
e.getcode() == 404:
|
e.getcode() == 404:
|
||||||
return
|
return None, self.urldata
|
||||||
raise
|
if isinstance(getattr(e, 'args', [None])[0], socket.timeout):
|
||||||
|
raise AmazonError(_('Amazon timed out. Try again later.'))
|
||||||
|
raise AmazonError(_('Amazon encountered an error.'))
|
||||||
if '<title>404 - ' in raw:
|
if '<title>404 - ' in raw:
|
||||||
return
|
return None, self.urldata
|
||||||
raw = xml_to_unicode(raw, strip_encoding_pats=True,
|
raw = xml_to_unicode(raw, strip_encoding_pats=True,
|
||||||
resolve_entities=True)[0]
|
resolve_entities=True)[0]
|
||||||
|
|
||||||
@ -315,7 +360,7 @@ class ResultList(list):
|
|||||||
inv_class = ('seeAll', 'emptyClear')
|
inv_class = ('seeAll', 'emptyClear')
|
||||||
inv_tags ={'img': True, 'a': False}
|
inv_tags ={'img': True, 'a': False}
|
||||||
self.clean_entry(description, invalid_tags=inv_tags, invalid_class=inv_class)
|
self.clean_entry(description, invalid_tags=inv_tags, invalid_class=inv_class)
|
||||||
description = html.tostring(description, method='html', encoding=unicode).strip()
|
description = tostring(description, method='html', encoding=unicode).strip()
|
||||||
# remove all attributes from tags
|
# remove all attributes from tags
|
||||||
description = self.reattr.sub(r'<\1>', description)
|
description = self.reattr.sub(r'<\1>', description)
|
||||||
# Remove the notice about text referring to out of print editions
|
# Remove the notice about text referring to out of print editions
|
||||||
@ -327,7 +372,7 @@ class ResultList(list):
|
|||||||
report(verbose)
|
report(verbose)
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def get_tags(self, entry, browser, verbose):
|
def get_tags(self, entry, verbose):
|
||||||
try:
|
try:
|
||||||
tags = entry.get_element_by_id('tagContentHolder')
|
tags = entry.get_element_by_id('tagContentHolder')
|
||||||
testptag = tags.find_class('see-all')
|
testptag = tags.find_class('see-all')
|
||||||
@ -338,7 +383,7 @@ class ResultList(list):
|
|||||||
if alink[0].get('class') == 'tgJsActive':
|
if alink[0].get('class') == 'tgJsActive':
|
||||||
continue
|
continue
|
||||||
link = self.baseurl + alink[0].get('href')
|
link = self.baseurl + alink[0].get('href')
|
||||||
entry = self.get_individual_metadata(browser, link, verbose)
|
entry = self.get_individual_metadata(link, verbose)
|
||||||
tags = entry.get_element_by_id('tagContentHolder')
|
tags = entry.get_element_by_id('tagContentHolder')
|
||||||
break
|
break
|
||||||
tags = [a.text for a in tags.getiterator('a') if a.get('rel') == 'tag']
|
tags = [a.text for a in tags.getiterator('a') if a.get('rel') == 'tag']
|
||||||
@ -402,26 +447,41 @@ class ResultList(list):
|
|||||||
mi.rating = float(ratings[0])/float(ratings[1]) * 5
|
mi.rating = float(ratings[0])/float(ratings[1]) * 5
|
||||||
return mi
|
return mi
|
||||||
|
|
||||||
def fill_MI(self, entry, title, authors, browser, verbose):
|
def fill_MI(self, entry, verbose, url=None):
    """Build a MetaInformation object from one parsed result page.

    Title and authors are mandatory: if either cannot be extracted the
    failure is reported and ``None`` is returned. Comments, book info and
    tags are best effort; a failure there is reported and whatever was
    collected so far is returned.

    :param entry: parsed page handed to the ``get_*`` extraction helpers
    :param verbose: when true, print diagnostics and tracebacks
    :param url: optional address of the page, only used in the failure
                message (existing callers do not pass it)
    :return: the filled MetaInformation, or ``None``
    """
    try:
        title = self.get_title(entry)
        authors = self.get_authors(entry)
    except Exception as e:
        if verbose:
            print(_('Failed to get all details for an entry'))
            print(e)
            # The URL is not known inside this method unless the caller
            # supplies it; referencing an undefined name here would raise
            # NameError and mask the real failure.
            if url is not None:
                print(_('URL who failed: %s') % url)
        report(verbose)
        return None

    mi = MetaInformation(title, authors)
    mi.author_sort = authors_to_sort_string(authors)
    try:
        mi.comments = self.get_description(entry, verbose)
        mi = self.get_book_info(entry, mi, verbose)
        mi.tags = self.get_tags(entry, verbose)
    except Exception:
        # Optional details only: keep what we have, but leave a trace
        # instead of swallowing the error silently.
        report(verbose)
    return mi
|
||||||
|
|
||||||
def get_individual_metadata(self, browser, linkdata, verbose):
|
def get_individual_metadata(self, url, verbose):
|
||||||
try:
|
try:
|
||||||
raw = browser.open_novisit(linkdata).read()
|
raw = browser().open_novisit(url).read()
|
||||||
except Exception, e:
|
except Exception, e:
|
||||||
report(verbose)
|
report(verbose)
|
||||||
if callable(getattr(e, 'getcode', None)) and \
|
if callable(getattr(e, 'getcode', None)) and \
|
||||||
e.getcode() == 404:
|
e.getcode() == 404:
|
||||||
return
|
return None
|
||||||
raise
|
if isinstance(getattr(e, 'args', [None])[0], socket.timeout):
|
||||||
|
raise AmazonError(_('Amazon timed out. Try again later.'))
|
||||||
|
raise AmazonError(_('Amazon encountered an error.'))
|
||||||
if '<title>404 - ' in raw:
|
if '<title>404 - ' in raw:
|
||||||
report(verbose)
|
report(verbose)
|
||||||
return
|
return None
|
||||||
raw = xml_to_unicode(raw, strip_encoding_pats=True,
|
raw = xml_to_unicode(raw, strip_encoding_pats=True,
|
||||||
resolve_entities=True)[0]
|
resolve_entities=True)[0]
|
||||||
try:
|
try:
|
||||||
@ -432,27 +492,34 @@ class ResultList(list):
|
|||||||
return soupparser.fromstring(clean_ascii_chars(raw))
|
return soupparser.fromstring(clean_ascii_chars(raw))
|
||||||
except:
|
except:
|
||||||
report(verbose)
|
report(verbose)
|
||||||
return
|
return None
|
||||||
|
|
||||||
def populate(self, entries, browser, verbose=False):
|
def producer(self, q, data, verbose=False):
    """Start one BrowserThread per URL in ``data`` and queue it on ``q``.

    Each thread is started before it is put on the queue; the put is a
    blocking one.
    """
    for url in data:
        worker = BrowserThread(url, verbose=verbose, ex=AmazonError,
                               name='Amazon')
        worker.start()
        q.put(worker, True)
|
||||||
# inv_class = ('buyingDetailsGrid', 'productImageGrid')
|
|
||||||
# inv_tags ={'script': True, 'style': True, 'form': False}
|
def consumer(self, q, total_entries, verbose=False):
    """Collect fetched pages from ``q`` until ``total_entries`` are stored.

    Each queued thread is joined and its result is converted with
    ``fill_MI`` and appended to this list. ``None`` is appended when the
    thread produced no result or when the conversion fails, so the list
    always grows by one item per queued thread.

    :param q: queue of started threads exposing ``join()``/``get_result()``
    :param total_entries: number of items to append before returning
    :param verbose: forwarded to ``fill_MI``; also enables the traceback
                    printed when a conversion fails
    """
    while len(self) < total_entries:
        thread = q.get(True)
        thread.join()
        entry = thread.get_result()
        mi = None
        if entry is not None:
            try:
                mi = self.fill_MI(entry, verbose)
            except Exception:
                # One broken entry must not end this loop early: the list
                # would then never reach total_entries and the thread
                # feeding the queue could stay blocked on a full queue.
                if verbose:
                    traceback.print_exc()
        self.append(mi)
|
||||||
report(verbose)
|
|
||||||
continue
|
def populate(self, entries, verbose=False, brcall=5):
    """Fill this list from ``entries`` using a producer and a consumer thread.

    A queue of size ``brcall`` links ``self.producer`` (fed ``entries``)
    to ``self.consumer`` (told to expect ``len(entries)`` items). Both
    threads are started, then both are joined before returning.
    """
    # multiple entries
    pending = Queue(brcall)
    workers = (
        Thread(target=self.producer, args=(pending, entries, verbose)),
        Thread(target=self.consumer, args=(pending, len(entries), verbose)),
    )
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
|
||||||
|
|
||||||
|
|
||||||
def search(title=None, author=None, publisher=None, isbn=None,
|
def search(title=None, author=None, publisher=None, isbn=None,
|
||||||
@ -466,8 +533,8 @@ def search(title=None, author=None, publisher=None, isbn=None,
|
|||||||
|
|
||||||
#List of entry
|
#List of entry
|
||||||
ans = ResultList(baseurl, lang)
|
ans = ResultList(baseurl, lang)
|
||||||
ans.populate(entries, br, verbose)
|
ans.populate(entries, verbose)
|
||||||
return ans
|
return [x for x in ans if x is not None]
|
||||||
|
|
||||||
def option_parser():
|
def option_parser():
|
||||||
parser = OptionParser(textwrap.dedent(\
|
parser = OptionParser(textwrap.dedent(\
|
||||||
@ -506,7 +573,7 @@ def main(args=sys.argv):
|
|||||||
parser.print_help()
|
parser.print_help()
|
||||||
return 1
|
return 1
|
||||||
if results is None or len(results) == 0:
|
if results is None or len(results) == 0:
|
||||||
print 'No result found for this search!'
|
print _('No result found for this search!')
|
||||||
return 0
|
return 0
|
||||||
for result in results:
|
for result in results:
|
||||||
print unicode(result).encode(preferred_encoding, 'replace')
|
print unicode(result).encode(preferred_encoding, 'replace')
|
||||||
@ -514,3 +581,5 @@ def main(args=sys.argv):
|
|||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
sys.exit(main())
|
sys.exit(main())
|
||||||
|
|
||||||
|
# calibre-debug -e "H:\Mes eBooks\Developpement\calibre\src\calibre\ebooks\metadata\amazonfr.py" -m 5 -a gore -v>data.html
|
@ -80,11 +80,11 @@ class BrowserThread(Thread):
|
|||||||
except:
|
except:
|
||||||
self.result = None
|
self.result = None
|
||||||
|
|
||||||
|
|
||||||
def report(verbose):
    """Print the traceback of the exception being handled when verbose is set."""
    if not verbose:
        return
    traceback.print_exc()
|
||||||
|
|
||||||
|
|
||||||
class Query(object):
|
class Query(object):
|
||||||
|
|
||||||
BASE_URL = 'http://www.fictionwise.com/servlet/mw'
|
BASE_URL = 'http://www.fictionwise.com/servlet/mw'
|
||||||
@ -322,15 +322,18 @@ class ResultList(list):
|
|||||||
print e
|
print e
|
||||||
return None
|
return None
|
||||||
mi = MetaInformation(title, authors)
|
mi = MetaInformation(title, authors)
|
||||||
ratings = entry.xpath("./p/table")
|
|
||||||
if len(ratings) >= 2:
|
|
||||||
mi.rating = self.get_rating(ratings[1], verbose)
|
|
||||||
mi.comments = self.get_description(entry)
|
|
||||||
mi.publisher = self.get_publisher(entry)
|
|
||||||
mi.tags = self.get_tags(entry)
|
|
||||||
mi.pubdate = self.get_date(entry, verbose)
|
|
||||||
mi.isbn = self.get_ISBN(entry)
|
|
||||||
mi.author_sort = authors_to_sort_string(authors)
|
mi.author_sort = authors_to_sort_string(authors)
|
||||||
|
try:
|
||||||
|
ratings = entry.xpath("./p/table")
|
||||||
|
if len(ratings) >= 2:
|
||||||
|
mi.rating = self.get_rating(ratings[1], verbose)
|
||||||
|
mi.comments = self.get_description(entry)
|
||||||
|
mi.publisher = self.get_publisher(entry)
|
||||||
|
mi.tags = self.get_tags(entry)
|
||||||
|
mi.pubdate = self.get_date(entry, verbose)
|
||||||
|
mi.isbn = self.get_ISBN(entry)
|
||||||
|
except:
|
||||||
|
pass
|
||||||
return mi
|
return mi
|
||||||
|
|
||||||
def producer(self, q, data, verbose=False):
|
def producer(self, q, data, verbose=False):
|
||||||
|
@ -279,8 +279,12 @@ class ResultList(list):
|
|||||||
return None
|
return None
|
||||||
mi = MetaInformation(title, authors)
|
mi = MetaInformation(title, authors)
|
||||||
mi.author_sort = authors_to_sort_string(authors)
|
mi.author_sort = authors_to_sort_string(authors)
|
||||||
mi.comments = self.get_description(entry, verbose)
|
try:
|
||||||
return self.get_book_info(entry, mi, verbose)
|
mi.comments = self.get_description(entry, verbose)
|
||||||
|
mi = self.get_book_info(entry, mi, verbose)
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
return mi
|
||||||
|
|
||||||
def producer(self, q, data, verbose=False):
|
def producer(self, q, data, verbose=False):
|
||||||
for x in data:
|
for x in data:
|
||||||
|
Loading…
x
Reference in New Issue
Block a user