mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
cleaning
This commit is contained in:
parent
d374b36e97
commit
81af8382d6
@ -2,7 +2,7 @@ from __future__ import with_statement
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2010, sengian <sengian1@gmail.com>'
|
||||
|
||||
import sys, textwrap, re, traceback, socket
|
||||
import sys, re
|
||||
from threading import Thread
|
||||
from Queue import Queue
|
||||
from urllib import urlencode
|
||||
@ -61,6 +61,7 @@ class Amazon(MetadataSource):
|
||||
tempres.extend(tmpnoloc)
|
||||
self.results = tempres
|
||||
except Exception, e:
|
||||
import traceback
|
||||
self.exception = e
|
||||
self.tb = traceback.format_exc()
|
||||
|
||||
@ -107,12 +108,14 @@ class AmazonSocial(MetadataSource):
|
||||
tmploc.tags = tmpnoloc.tags
|
||||
self.results = tmploc
|
||||
except Exception, e:
|
||||
import traceback
|
||||
self.exception = e
|
||||
self.tb = traceback.format_exc()
|
||||
|
||||
|
||||
def report(verbose):
|
||||
if verbose:
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
|
||||
class AmazonError(Exception):
|
||||
@ -208,33 +211,40 @@ class Query(object):
|
||||
q = q.encode('utf-8')
|
||||
self.urldata += '/gp/search/ref=sr_adv_b/?' + urlencode(q)
|
||||
|
||||
def __call__(self, browser, verbose, timeout = 5.):
|
||||
def brcall(self, browser, url, verbose, timeout):
|
||||
if verbose:
|
||||
print _('Query: %s') % self.urldata
|
||||
print _('Query: %s') % url
|
||||
|
||||
try:
|
||||
raw = browser.open_novisit(self.urldata, timeout=timeout).read()
|
||||
raw = browser.open_novisit(url, timeout=timeout).read()
|
||||
except Exception, e:
|
||||
import socket
|
||||
report(verbose)
|
||||
if callable(getattr(e, 'getcode', None)) and \
|
||||
e.getcode() == 404:
|
||||
return None, self.urldata
|
||||
if isinstance(getattr(e, 'args', [None])[0], socket.timeout):
|
||||
raise AmazonError(_('Amazon timed out. Try again later.'))
|
||||
raise AmazonError(_('Amazon encountered an error.'))
|
||||
return None
|
||||
attr = getattr(e, 'args', [None])
|
||||
attr = attr if attr else [None]
|
||||
if isinstance(attr[0], socket.timeout):
|
||||
raise NiceBooksError(_('Nicebooks timed out. Try again later.'))
|
||||
raise NiceBooksError(_('Nicebooks encountered an error.'))
|
||||
if '<title>404 - ' in raw:
|
||||
return None, self.urldata
|
||||
return
|
||||
raw = xml_to_unicode(raw, strip_encoding_pats=True,
|
||||
resolve_entities=True)[0]
|
||||
|
||||
try:
|
||||
feed = soupparser.fromstring(raw)
|
||||
return soupparser.fromstring(raw)
|
||||
except:
|
||||
try:
|
||||
#remove ASCII invalid chars
|
||||
return soupparser.fromstring(clean_ascii_chars(raw))
|
||||
except:
|
||||
return None, self.urldata
|
||||
return None
|
||||
|
||||
def __call__(self, browser, verbose, timeout = 5.):
|
||||
feed = self.brcall(browser, self.urldata, verbose, timeout)
|
||||
if feed is None:
|
||||
return None, self.urldata
|
||||
|
||||
#nb of page
|
||||
try:
|
||||
@ -247,23 +257,10 @@ class Query(object):
|
||||
if len(nbresults) > 1:
|
||||
nbpagetoquery = int(ceil(float(min(int(nbresults[2]), self.max_results))/ int(nbresults[1])))
|
||||
for i in xrange(2, nbpagetoquery + 1):
|
||||
try:
|
||||
urldata = self.urldata + '&page=' + str(i)
|
||||
raw = browser.open_novisit(urldata, timeout=timeout).read()
|
||||
except Exception, e:
|
||||
urldata = self.urldata + '&page=' + str(i)
|
||||
feed = self.brcall(browser, urldata, verbose, timeout)
|
||||
if feed is None:
|
||||
continue
|
||||
if '<title>404 - ' in raw:
|
||||
continue
|
||||
raw = xml_to_unicode(raw, strip_encoding_pats=True,
|
||||
resolve_entities=True)[0]
|
||||
try:
|
||||
feed = soupparser.fromstring(raw)
|
||||
except:
|
||||
try:
|
||||
#remove ASCII invalid chars
|
||||
return soupparser.fromstring(clean_ascii_chars(raw))
|
||||
except:
|
||||
continue
|
||||
pages.append(feed)
|
||||
|
||||
results = []
|
||||
@ -453,11 +450,14 @@ class ResultList(object):
|
||||
try:
|
||||
raw = br.open_novisit(url).read()
|
||||
except Exception, e:
|
||||
import socket
|
||||
report(verbose)
|
||||
if callable(getattr(e, 'getcode', None)) and \
|
||||
e.getcode() == 404:
|
||||
return None
|
||||
if isinstance(getattr(e, 'args', [None])[0], socket.timeout):
|
||||
attr = getattr(e, 'args', [None])
|
||||
attr = attr if attr else [None]
|
||||
if isinstance(attr[0], socket.timeout):
|
||||
raise AmazonError(_('Amazon timed out. Try again later.'))
|
||||
raise AmazonError(_('Amazon encountered an error.'))
|
||||
if '<title>404 - ' in raw:
|
||||
@ -584,6 +584,7 @@ def get_social_metadata(title, authors, publisher, isbn, verbose=False,
|
||||
return [mi]
|
||||
|
||||
def option_parser():
|
||||
import textwrap
|
||||
parser = OptionParser(textwrap.dedent(\
|
||||
_('''\
|
||||
%prog [options]
|
||||
@ -648,6 +649,6 @@ if __name__ == '__main__':
|
||||
sys.exit(main())
|
||||
# import cProfile
|
||||
# sys.exit(cProfile.run("import calibre.ebooks.metadata.amazonbis; calibre.ebooks.metadata.amazonbis.main()"))
|
||||
# sys.exit(cProfile.run("import calibre.ebooks.metadata.amazonbis; calibre.ebooks.metadata.amazonbis.main()", "profile_tmp_2"))
|
||||
# sys.exit(cProfile.run("import calibre.ebooks.metadata.amazonbis; calibre.ebooks.metadata.amazonbis.main()", "profile"))
|
||||
|
||||
# calibre-debug -e "H:\Mes eBooks\Developpement\calibre\src\calibre\ebooks\metadata\amazonbis.py" -m 5 -a gore -v>data.html
|
||||
# calibre-debug -e "H:\Mes eBooks\Developpement\calibre\src\calibre\ebooks\metadata\amazon.py" -m 5 -a gore -v>data.html
|
@ -14,11 +14,12 @@ from calibre import browser, preferred_encoding
|
||||
from calibre.ebooks.chardet import xml_to_unicode
|
||||
from calibre.ebooks.metadata import MetaInformation, check_isbn, \
|
||||
authors_to_sort_string
|
||||
from calibre.library.comments import sanitize_comments_html
|
||||
from calibre.ebooks.metadata.fetch import MetadataSource
|
||||
from calibre.library.comments import sanitize_comments_html
|
||||
from calibre.utils.config import OptionParser
|
||||
from calibre.utils.date import parse_date, utcnow
|
||||
from calibre.utils.cleantext import clean_ascii_chars, unescape
|
||||
from calibre.utils.date import parse_date, utcnow
|
||||
|
||||
|
||||
class Fictionwise(MetadataSource):
|
||||
|
||||
|
@ -1,6 +1,6 @@
|
||||
from __future__ import with_statement
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>'
|
||||
__copyright__ = '2009, Kovid Goyal <kovid@kovidgoyal.net>, 2010, sengian <sengian1@gmail.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import sys
|
||||
@ -12,13 +12,13 @@ from functools import partial
|
||||
from lxml import etree
|
||||
|
||||
from calibre import browser, preferred_encoding
|
||||
from calibre.ebooks.chardet import xml_to_unicode
|
||||
from calibre.ebooks.metadata import MetaInformation, check_isbn, \
|
||||
authors_to_sort_string
|
||||
from calibre.ebooks.metadata.fetch import MetadataSource
|
||||
from calibre.ebooks.chardet import xml_to_unicode
|
||||
from calibre.utils.config import OptionParser
|
||||
from calibre.utils.date import parse_date, utcnow
|
||||
from calibre.utils.cleantext import clean_ascii_chars
|
||||
from calibre.utils.date import parse_date, utcnow
|
||||
|
||||
NAMESPACES = {
|
||||
'openSearch':'http://a9.com/-/spec/opensearchrss/1.0/',
|
||||
|
Loading…
x
Reference in New Issue
Block a user