This commit is contained in:
Sengian 2010-12-13 23:24:12 +01:00
parent d374b36e97
commit 81af8382d6
3 changed files with 39 additions and 37 deletions

View File

@ -2,7 +2,7 @@ from __future__ import with_statement
__license__ = 'GPL 3' __license__ = 'GPL 3'
__copyright__ = '2010, sengian <sengian1@gmail.com>' __copyright__ = '2010, sengian <sengian1@gmail.com>'
import sys, textwrap, re, traceback, socket import sys, re
from threading import Thread from threading import Thread
from Queue import Queue from Queue import Queue
from urllib import urlencode from urllib import urlencode
@ -61,6 +61,7 @@ class Amazon(MetadataSource):
tempres.extend(tmpnoloc) tempres.extend(tmpnoloc)
self.results = tempres self.results = tempres
except Exception, e: except Exception, e:
import traceback
self.exception = e self.exception = e
self.tb = traceback.format_exc() self.tb = traceback.format_exc()
@ -107,12 +108,14 @@ class AmazonSocial(MetadataSource):
tmploc.tags = tmpnoloc.tags tmploc.tags = tmpnoloc.tags
self.results = tmploc self.results = tmploc
except Exception, e: except Exception, e:
import traceback
self.exception = e self.exception = e
self.tb = traceback.format_exc() self.tb = traceback.format_exc()
def report(verbose): def report(verbose):
if verbose: if verbose:
import traceback
traceback.print_exc() traceback.print_exc()
class AmazonError(Exception): class AmazonError(Exception):
@ -208,33 +211,40 @@ class Query(object):
q = q.encode('utf-8') q = q.encode('utf-8')
self.urldata += '/gp/search/ref=sr_adv_b/?' + urlencode(q) self.urldata += '/gp/search/ref=sr_adv_b/?' + urlencode(q)
def __call__(self, browser, verbose, timeout = 5.): def brcall(self, browser, url, verbose, timeout):
if verbose: if verbose:
print _('Query: %s') % self.urldata print _('Query: %s') % url
try: try:
raw = browser.open_novisit(self.urldata, timeout=timeout).read() raw = browser.open_novisit(url, timeout=timeout).read()
except Exception, e: except Exception, e:
import socket
report(verbose) report(verbose)
if callable(getattr(e, 'getcode', None)) and \ if callable(getattr(e, 'getcode', None)) and \
e.getcode() == 404: e.getcode() == 404:
return None, self.urldata return None
if isinstance(getattr(e, 'args', [None])[0], socket.timeout): attr = getattr(e, 'args', [None])
raise AmazonError(_('Amazon timed out. Try again later.')) attr = attr if attr else [None]
raise AmazonError(_('Amazon encountered an error.')) if isinstance(attr[0], socket.timeout):
raise NiceBooksError(_('Nicebooks timed out. Try again later.'))
raise NiceBooksError(_('Nicebooks encountered an error.'))
if '<title>404 - ' in raw: if '<title>404 - ' in raw:
return None, self.urldata return
raw = xml_to_unicode(raw, strip_encoding_pats=True, raw = xml_to_unicode(raw, strip_encoding_pats=True,
resolve_entities=True)[0] resolve_entities=True)[0]
try: try:
feed = soupparser.fromstring(raw) return soupparser.fromstring(raw)
except: except:
try: try:
#remove ASCII invalid chars #remove ASCII invalid chars
return soupparser.fromstring(clean_ascii_chars(raw)) return soupparser.fromstring(clean_ascii_chars(raw))
except: except:
return None, self.urldata return None
def __call__(self, browser, verbose, timeout = 5.):
feed = self.brcall(browser, self.urldata, verbose, timeout)
if feed is None:
return None, self.urldata
#nb of page #nb of page
try: try:
@ -247,23 +257,10 @@ class Query(object):
if len(nbresults) > 1: if len(nbresults) > 1:
nbpagetoquery = int(ceil(float(min(int(nbresults[2]), self.max_results))/ int(nbresults[1]))) nbpagetoquery = int(ceil(float(min(int(nbresults[2]), self.max_results))/ int(nbresults[1])))
for i in xrange(2, nbpagetoquery + 1): for i in xrange(2, nbpagetoquery + 1):
try: urldata = self.urldata + '&page=' + str(i)
urldata = self.urldata + '&page=' + str(i) feed = self.brcall(browser, urldata, verbose, timeout)
raw = browser.open_novisit(urldata, timeout=timeout).read() if feed is None:
except Exception, e:
continue continue
if '<title>404 - ' in raw:
continue
raw = xml_to_unicode(raw, strip_encoding_pats=True,
resolve_entities=True)[0]
try:
feed = soupparser.fromstring(raw)
except:
try:
#remove ASCII invalid chars
return soupparser.fromstring(clean_ascii_chars(raw))
except:
continue
pages.append(feed) pages.append(feed)
results = [] results = []
@ -453,11 +450,14 @@ class ResultList(object):
try: try:
raw = br.open_novisit(url).read() raw = br.open_novisit(url).read()
except Exception, e: except Exception, e:
import socket
report(verbose) report(verbose)
if callable(getattr(e, 'getcode', None)) and \ if callable(getattr(e, 'getcode', None)) and \
e.getcode() == 404: e.getcode() == 404:
return None return None
if isinstance(getattr(e, 'args', [None])[0], socket.timeout): attr = getattr(e, 'args', [None])
attr = attr if attr else [None]
if isinstance(attr[0], socket.timeout):
raise AmazonError(_('Amazon timed out. Try again later.')) raise AmazonError(_('Amazon timed out. Try again later.'))
raise AmazonError(_('Amazon encountered an error.')) raise AmazonError(_('Amazon encountered an error.'))
if '<title>404 - ' in raw: if '<title>404 - ' in raw:
@ -584,6 +584,7 @@ def get_social_metadata(title, authors, publisher, isbn, verbose=False,
return [mi] return [mi]
def option_parser(): def option_parser():
import textwrap
parser = OptionParser(textwrap.dedent(\ parser = OptionParser(textwrap.dedent(\
_('''\ _('''\
%prog [options] %prog [options]
@ -648,6 +649,6 @@ if __name__ == '__main__':
sys.exit(main()) sys.exit(main())
# import cProfile # import cProfile
# sys.exit(cProfile.run("import calibre.ebooks.metadata.amazonbis; calibre.ebooks.metadata.amazonbis.main()")) # sys.exit(cProfile.run("import calibre.ebooks.metadata.amazonbis; calibre.ebooks.metadata.amazonbis.main()"))
# sys.exit(cProfile.run("import calibre.ebooks.metadata.amazonbis; calibre.ebooks.metadata.amazonbis.main()", "profile_tmp_2")) # sys.exit(cProfile.run("import calibre.ebooks.metadata.amazonbis; calibre.ebooks.metadata.amazonbis.main()", "profile"))
# calibre-debug -e "H:\Mes eBooks\Developpement\calibre\src\calibre\ebooks\metadata\amazonbis.py" -m 5 -a gore -v>data.html # calibre-debug -e "H:\Mes eBooks\Developpement\calibre\src\calibre\ebooks\metadata\amazon.py" -m 5 -a gore -v>data.html

View File

@ -14,11 +14,12 @@ from calibre import browser, preferred_encoding
from calibre.ebooks.chardet import xml_to_unicode from calibre.ebooks.chardet import xml_to_unicode
from calibre.ebooks.metadata import MetaInformation, check_isbn, \ from calibre.ebooks.metadata import MetaInformation, check_isbn, \
authors_to_sort_string authors_to_sort_string
from calibre.library.comments import sanitize_comments_html
from calibre.ebooks.metadata.fetch import MetadataSource from calibre.ebooks.metadata.fetch import MetadataSource
from calibre.library.comments import sanitize_comments_html
from calibre.utils.config import OptionParser from calibre.utils.config import OptionParser
from calibre.utils.date import parse_date, utcnow
from calibre.utils.cleantext import clean_ascii_chars, unescape from calibre.utils.cleantext import clean_ascii_chars, unescape
from calibre.utils.date import parse_date, utcnow
class Fictionwise(MetadataSource): class Fictionwise(MetadataSource):

View File

@ -1,6 +1,6 @@
from __future__ import with_statement from __future__ import with_statement
__license__ = 'GPL 3' __license__ = 'GPL 3'
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>' __copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>, 2010, sengian <sengian1@gmail.com>'
__docformat__ = 'restructuredtext en' __docformat__ = 'restructuredtext en'
import sys import sys
@ -12,13 +12,13 @@ from functools import partial
from lxml import etree from lxml import etree
from calibre import browser, preferred_encoding from calibre import browser, preferred_encoding
from calibre.ebooks.chardet import xml_to_unicode
from calibre.ebooks.metadata import MetaInformation, check_isbn, \ from calibre.ebooks.metadata import MetaInformation, check_isbn, \
authors_to_sort_string authors_to_sort_string
from calibre.ebooks.metadata.fetch import MetadataSource from calibre.ebooks.metadata.fetch import MetadataSource
from calibre.ebooks.chardet import xml_to_unicode
from calibre.utils.config import OptionParser from calibre.utils.config import OptionParser
from calibre.utils.date import parse_date, utcnow
from calibre.utils.cleantext import clean_ascii_chars from calibre.utils.cleantext import clean_ascii_chars
from calibre.utils.date import parse_date, utcnow
NAMESPACES = { NAMESPACES = {
'openSearch':'http://a9.com/-/spec/opensearchrss/1.0/', 'openSearch':'http://a9.com/-/spec/opensearchrss/1.0/',