mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Various fix in amazon metadata, add german site for german users
Add a clean ascii trial: this should be corrected everytime soupparser is used from lxml.html due to problems with xml_to_unicode output
This commit is contained in:
parent
61db7b02b6
commit
8af48a9d06
@ -10,6 +10,7 @@ from lxml import html
|
|||||||
from lxml.html import soupparser
|
from lxml.html import soupparser
|
||||||
|
|
||||||
from calibre.utils.date import parse_date, utcnow, replace_months
|
from calibre.utils.date import parse_date, utcnow, replace_months
|
||||||
|
from calibre.utils.cleantext import clean_ascii_char
|
||||||
from calibre import browser, preferred_encoding
|
from calibre import browser, preferred_encoding
|
||||||
from calibre.ebooks.chardet import xml_to_unicode
|
from calibre.ebooks.chardet import xml_to_unicode
|
||||||
from calibre.ebooks.metadata import MetaInformation, check_isbn, \
|
from calibre.ebooks.metadata import MetaInformation, check_isbn, \
|
||||||
@ -53,9 +54,9 @@ class AmazonEs(MetadataSource):
|
|||||||
self.exception = e
|
self.exception = e
|
||||||
self.tb = traceback.format_exc()
|
self.tb = traceback.format_exc()
|
||||||
|
|
||||||
class AmazonUS(MetadataSource):
|
class AmazonEn(MetadataSource):
|
||||||
|
|
||||||
name = 'Amazon US english'
|
name = 'Amazon english'
|
||||||
description = _('Downloads social metadata from amazon.com in english')
|
description = _('Downloads social metadata from amazon.com in english')
|
||||||
supported_platforms = ['windows', 'osx', 'linux']
|
supported_platforms = ['windows', 'osx', 'linux']
|
||||||
author = 'Sengian'
|
author = 'Sengian'
|
||||||
@ -65,7 +66,7 @@ class AmazonUS(MetadataSource):
|
|||||||
def fetch(self):
|
def fetch(self):
|
||||||
try:
|
try:
|
||||||
self.results = search(self.title, self.book_author, self.publisher,
|
self.results = search(self.title, self.book_author, self.publisher,
|
||||||
self.isbn, max_results=10, verbose=self.verbose, lang='us')
|
self.isbn, max_results=10, verbose=self.verbose, lang='en')
|
||||||
except Exception, e:
|
except Exception, e:
|
||||||
self.exception = e
|
self.exception = e
|
||||||
self.tb = traceback.format_exc()
|
self.tb = traceback.format_exc()
|
||||||
@ -97,6 +98,8 @@ class Amazon(MetadataSource):
|
|||||||
has_html_comments = True
|
has_html_comments = True
|
||||||
|
|
||||||
def fetch(self):
|
def fetch(self):
|
||||||
|
# if not self.site_customization:
|
||||||
|
# return
|
||||||
try:
|
try:
|
||||||
self.results = search(self.title, self.book_author, self.publisher,
|
self.results = search(self.title, self.book_author, self.publisher,
|
||||||
self.isbn, max_results=10, verbose=self.verbose, lang='all')
|
self.isbn, max_results=10, verbose=self.verbose, lang='all')
|
||||||
@ -104,17 +107,20 @@ class Amazon(MetadataSource):
|
|||||||
self.exception = e
|
self.exception = e
|
||||||
self.tb = traceback.format_exc()
|
self.tb = traceback.format_exc()
|
||||||
|
|
||||||
|
# @property
|
||||||
|
# def string_customization_help(self):
|
||||||
|
# return _('You can select here the language for metadata search with amazon.com')
|
||||||
|
|
||||||
|
|
||||||
def report(verbose):
|
def report(verbose):
|
||||||
if verbose:
|
if verbose:
|
||||||
import traceback
|
|
||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
|
|
||||||
|
|
||||||
class Query(object):
|
class Query(object):
|
||||||
|
|
||||||
BASE_URL_FR = 'http://www.amazon.fr'
|
|
||||||
BASE_URL_ALL = 'http://www.amazon.com'
|
BASE_URL_ALL = 'http://www.amazon.com'
|
||||||
|
BASE_URL_FR = 'http://www.amazon.fr'
|
||||||
BASE_URL_DE = 'http://www.amazon.de'
|
BASE_URL_DE = 'http://www.amazon.de'
|
||||||
|
|
||||||
def __init__(self, title=None, author=None, publisher=None, isbn=None, keywords=None,
|
def __init__(self, title=None, author=None, publisher=None, isbn=None, keywords=None,
|
||||||
@ -153,7 +159,7 @@ class Query(object):
|
|||||||
q['sort'] = 'relevanceexprank'
|
q['sort'] = 'relevanceexprank'
|
||||||
q['field-language'] = 'Spanish'
|
q['field-language'] = 'Spanish'
|
||||||
self.urldata = self.BASE_URL_ALL
|
self.urldata = self.BASE_URL_ALL
|
||||||
elif rlang =='us':
|
elif rlang =='en':
|
||||||
q['sort'] = 'relevanceexprank'
|
q['sort'] = 'relevanceexprank'
|
||||||
q['field-language'] = 'English'
|
q['field-language'] = 'English'
|
||||||
self.urldata = self.BASE_URL_ALL
|
self.urldata = self.BASE_URL_ALL
|
||||||
@ -197,24 +203,25 @@ class Query(object):
|
|||||||
return
|
return
|
||||||
raw = xml_to_unicode(raw, strip_encoding_pats=True,
|
raw = xml_to_unicode(raw, strip_encoding_pats=True,
|
||||||
resolve_entities=True)[0]
|
resolve_entities=True)[0]
|
||||||
|
|
||||||
try:
|
try:
|
||||||
feed = soupparser.fromstring(raw)
|
feed = soupparser.fromstring(raw)
|
||||||
except:
|
except:
|
||||||
return None, self.urldata
|
try:
|
||||||
|
#remove ASCII invalid chars
|
||||||
|
return soupparser.fromstring(clean_ascii_char(raw))
|
||||||
|
except:
|
||||||
|
return None, self.urldata
|
||||||
|
|
||||||
#nb of page
|
#nb of page
|
||||||
try:
|
try:
|
||||||
nbresults = self.renbres.findall(feed.xpath("//*[@class='resultCount']")[0].text)
|
nbresults = self.renbres.findall(feed.xpath("//*[@class='resultCount']")[0].text)
|
||||||
rpp = 0
|
|
||||||
if len(nbresults) > 1:
|
|
||||||
rpp = int(nbresults[1])
|
|
||||||
nbresults = int(nbresults[2])
|
|
||||||
except:
|
except:
|
||||||
return None, self.urldata
|
return None, self.urldata
|
||||||
|
|
||||||
pages =[feed]
|
pages =[feed]
|
||||||
if rpp:
|
if len(nbresults) > 1:
|
||||||
nbpagetoquery = int(ceil(float(min(nbresults, self.max_results))/ rpp))
|
nbpagetoquery = int(ceil(float(min(int(nbresults[2]), self.max_results))/ int(nbresults[1])))
|
||||||
for i in xrange(2, nbpagetoquery + 1):
|
for i in xrange(2, nbpagetoquery + 1):
|
||||||
try:
|
try:
|
||||||
urldata = self.urldata + '&page=' + str(i)
|
urldata = self.urldata + '&page=' + str(i)
|
||||||
@ -228,7 +235,11 @@ class Query(object):
|
|||||||
try:
|
try:
|
||||||
feed = soupparser.fromstring(raw)
|
feed = soupparser.fromstring(raw)
|
||||||
except:
|
except:
|
||||||
continue
|
try:
|
||||||
|
#remove ASCII invalid chars
|
||||||
|
return soupparser.fromstring(clean_ascii_char(raw))
|
||||||
|
except:
|
||||||
|
continue
|
||||||
pages.append(feed)
|
pages.append(feed)
|
||||||
|
|
||||||
results = []
|
results = []
|
||||||
@ -416,7 +427,12 @@ class ResultList(list):
|
|||||||
try:
|
try:
|
||||||
return soupparser.fromstring(raw)
|
return soupparser.fromstring(raw)
|
||||||
except:
|
except:
|
||||||
return
|
try:
|
||||||
|
#remove ASCII invalid chars
|
||||||
|
return soupparser.fromstring(clean_ascii_char(raw))
|
||||||
|
except:
|
||||||
|
report(verbose)
|
||||||
|
return
|
||||||
|
|
||||||
def populate(self, entries, browser, verbose=False):
|
def populate(self, entries, browser, verbose=False):
|
||||||
for x in entries:
|
for x in entries:
|
||||||
@ -433,6 +449,8 @@ class ResultList(list):
|
|||||||
if verbose:
|
if verbose:
|
||||||
print 'Failed to get all details for an entry'
|
print 'Failed to get all details for an entry'
|
||||||
print e
|
print e
|
||||||
|
print 'URL who failed:', x
|
||||||
|
report(verbose)
|
||||||
continue
|
continue
|
||||||
self.append(self.fill_MI(entry, title, authors, browser, verbose))
|
self.append(self.fill_MI(entry, title, authors, browser, verbose))
|
||||||
|
|
||||||
@ -453,16 +471,16 @@ def search(title=None, author=None, publisher=None, isbn=None,
|
|||||||
|
|
||||||
def option_parser():
|
def option_parser():
|
||||||
parser = OptionParser(textwrap.dedent(\
|
parser = OptionParser(textwrap.dedent(\
|
||||||
'''\
|
_('''\
|
||||||
%prog [options]
|
%prog [options]
|
||||||
|
|
||||||
Fetch book metadata from Amazon. You must specify one of title, author,
|
Fetch book metadata from Amazon. You must specify one of title, author,
|
||||||
ISBN, publisher or keywords. Will fetch a maximum of 10 matches,
|
ISBN, publisher or keywords. Will fetch a maximum of 10 matches,
|
||||||
so you should make your query as specific as possible.
|
so you should make your query as specific as possible.
|
||||||
You can chose the language for metadata retrieval:
|
You can chose the language for metadata retrieval:
|
||||||
All & US english & french & german & spanish
|
All & english & french & german & spanish
|
||||||
'''
|
'''
|
||||||
))
|
)))
|
||||||
parser.add_option('-t', '--title', help='Book title')
|
parser.add_option('-t', '--title', help='Book title')
|
||||||
parser.add_option('-a', '--author', help='Book author(s)')
|
parser.add_option('-a', '--author', help='Book author(s)')
|
||||||
parser.add_option('-p', '--publisher', help='Book publisher')
|
parser.add_option('-p', '--publisher', help='Book publisher')
|
||||||
@ -471,7 +489,7 @@ def option_parser():
|
|||||||
parser.add_option('-m', '--max-results', default=10,
|
parser.add_option('-m', '--max-results', default=10,
|
||||||
help='Maximum number of results to fetch')
|
help='Maximum number of results to fetch')
|
||||||
parser.add_option('-l', '--lang', default='all',
|
parser.add_option('-l', '--lang', default='all',
|
||||||
help='Chosen language for metadata search (all, us, fr, es , de)')
|
help='Chosen language for metadata search (all, en, fr, es, de)')
|
||||||
parser.add_option('-v', '--verbose', default=0, action='count',
|
parser.add_option('-v', '--verbose', default=0, action='count',
|
||||||
help='Be more verbose about errors')
|
help='Be more verbose about errors')
|
||||||
return parser
|
return parser
|
||||||
|
15
src/calibre/utils/cleantext.py
Normal file
15
src/calibre/utils/cleantext.py
Normal file
@ -0,0 +1,15 @@
|
|||||||
|
from __future__ import with_statement
|
||||||
|
__license__ = 'GPL 3'
|
||||||
|
__copyright__ = '2010, sengian <sengian1@gmail.com>'
|
||||||
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
def clean_ascii_char(txt, charlist = None):
|
||||||
|
#remove ASCII invalid chars : 0 to 8 and 11-14 to 24-26-27 by default
|
||||||
|
chars = list(range(8)) + [0x0B, 0x0E, 0x0F] + list(range(0x10, 0x19)) \
|
||||||
|
+ [0x1A, 0x1B]
|
||||||
|
if charlist is not None:
|
||||||
|
chars = charlist
|
||||||
|
illegal_chars = re.compile(u'|'.join(map(unichr, chars)))
|
||||||
|
return illegal_chars.sub('', txt)
|
Loading…
x
Reference in New Issue
Block a user