mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Initial implementation of basic metadata Amazon plugin
This commit is contained in:
commit
d43af28fcf
@ -11,9 +11,9 @@ from calibre.ebooks.metadata.book.base import Metadata
|
||||
from calibre.devices.mime import mime_type_ext
|
||||
from calibre.devices.interface import BookList as _BookList
|
||||
from calibre.constants import preferred_encoding
|
||||
from calibre import isbytestring
|
||||
from calibre import isbytestring, force_unicode
|
||||
from calibre.utils.config import prefs, tweaks
|
||||
from calibre.utils.icu import sort_key, strcmp as icu_strcmp
|
||||
from calibre.utils.icu import strcmp
|
||||
|
||||
class Book(Metadata):
|
||||
def __init__(self, prefix, lpath, size=None, other=None):
|
||||
@ -241,7 +241,7 @@ class CollectionsBookList(BookList):
|
||||
if y is None:
|
||||
return -1
|
||||
if isinstance(x, (unicode, str)):
|
||||
c = strcmp(x, y)
|
||||
c = strcmp(force_unicode(x), force_unicode(y))
|
||||
else:
|
||||
c = cmp(x, y)
|
||||
if c != 0:
|
||||
|
516
src/calibre/ebooks/metadata/amazonfr.py
Normal file
516
src/calibre/ebooks/metadata/amazonfr.py
Normal file
@ -0,0 +1,516 @@
|
||||
from __future__ import with_statement
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2010, sengian <sengian1@gmail.com>'
|
||||
|
||||
import sys, textwrap, re, traceback
|
||||
from urllib import urlencode
|
||||
from math import ceil
|
||||
|
||||
from lxml import html
|
||||
from lxml.html import soupparser
|
||||
|
||||
from calibre.utils.date import parse_date, utcnow, replace_months
|
||||
from calibre.utils.cleantext import clean_ascii_chars
|
||||
from calibre import browser, preferred_encoding
|
||||
from calibre.ebooks.chardet import xml_to_unicode
|
||||
from calibre.ebooks.metadata import MetaInformation, check_isbn, \
|
||||
authors_to_sort_string
|
||||
from calibre.ebooks.metadata.fetch import MetadataSource
|
||||
from calibre.utils.config import OptionParser
|
||||
from calibre.library.comments import sanitize_comments_html
|
||||
|
||||
|
||||
class AmazonFr(MetadataSource):
    '''Metadata source plugin that runs the module level search() with lang='fr'.'''

    name = 'Amazon French'
    description = _('Downloads metadata from amazon.fr')
    author = 'Sengian'
    version = (1, 0, 0)
    supported_platforms = ['windows', 'osx', 'linux']
    has_html_comments = True

    def fetch(self):
        '''
        Run the search and store the outcome on the instance.

        On success the matches are stored in self.results; on any failure
        the exception and its formatted traceback are stored in
        self.exception and self.tb instead of being raised.
        '''
        try:
            found = search(self.title, self.book_author, self.publisher,
                    self.isbn, max_results=10, verbose=self.verbose,
                    lang='fr')
        except Exception as err:
            self.exception = err
            self.tb = traceback.format_exc()
        else:
            self.results = found
|
||||
|
||||
class AmazonEs(MetadataSource):
    '''Metadata source plugin that runs the module level search() with lang='es'.'''

    name = 'Amazon Spanish'
    description = _('Downloads metadata from amazon.com in spanish')
    author = 'Sengian'
    version = (1, 0, 0)
    supported_platforms = ['windows', 'osx', 'linux']
    has_html_comments = True

    def fetch(self):
        '''
        Run the search and store the outcome on the instance.

        On success the matches are stored in self.results; on any failure
        the exception and its formatted traceback are stored in
        self.exception and self.tb instead of being raised.
        '''
        try:
            found = search(self.title, self.book_author, self.publisher,
                    self.isbn, max_results=10, verbose=self.verbose,
                    lang='es')
        except Exception as err:
            self.exception = err
            self.tb = traceback.format_exc()
        else:
            self.results = found
|
||||
|
||||
class AmazonEn(MetadataSource):
    '''Metadata source plugin that runs the module level search() with lang='en'.'''

    name = 'Amazon English'
    description = _('Downloads metadata from amazon.com in english')
    author = 'Sengian'
    version = (1, 0, 0)
    supported_platforms = ['windows', 'osx', 'linux']
    has_html_comments = True

    def fetch(self):
        '''
        Run the search and store the outcome on the instance.

        On success the matches are stored in self.results; on any failure
        the exception and its formatted traceback are stored in
        self.exception and self.tb instead of being raised.
        '''
        try:
            found = search(self.title, self.book_author, self.publisher,
                    self.isbn, max_results=10, verbose=self.verbose,
                    lang='en')
        except Exception as err:
            self.exception = err
            self.tb = traceback.format_exc()
        else:
            self.results = found
|
||||
|
||||
class AmazonDe(MetadataSource):
    '''Metadata source plugin that runs the module level search() with lang='de'.'''

    name = 'Amazon German'
    description = _('Downloads metadata from amazon.de')
    author = 'Sengian'
    version = (1, 0, 0)
    supported_platforms = ['windows', 'osx', 'linux']
    has_html_comments = True

    def fetch(self):
        '''
        Run the search and store the outcome on the instance.

        On success the matches are stored in self.results; on any failure
        the exception and its formatted traceback are stored in
        self.exception and self.tb instead of being raised.
        '''
        try:
            found = search(self.title, self.book_author, self.publisher,
                    self.isbn, max_results=10, verbose=self.verbose,
                    lang='de')
        except Exception as err:
            self.exception = err
            self.tb = traceback.format_exc()
        else:
            self.results = found
|
||||
|
||||
class Amazon(MetadataSource):
    '''Metadata source plugin that runs the module level search() with lang='all'.'''

    name = 'Amazon'
    description = _('Downloads metadata from amazon.com')
    author = 'Kovid Goyal & Sengian'
    version = (1, 1, 0)
    supported_platforms = ['windows', 'osx', 'linux']
    has_html_comments = True

    def fetch(self):
        '''
        Run the search and store the outcome on the instance.

        On success the matches are stored in self.results; on any failure
        the exception and its formatted traceback are stored in
        self.exception and self.tb instead of being raised.
        '''
        # Disabled check kept from the original implementation:
        # if not self.site_customization:
            # return
        try:
            found = search(self.title, self.book_author, self.publisher,
                    self.isbn, max_results=10, verbose=self.verbose,
                    lang='all')
        except Exception as err:
            self.exception = err
            self.tb = traceback.format_exc()
        else:
            self.results = found
|
||||
|
||||
# @property
|
||||
# def string_customization_help(self):
|
||||
# return _('You can select here the language for metadata search with amazon.com')
|
||||
|
||||
|
||||
def report(verbose):
    '''Print the traceback of the exception being handled, but only when verbose is truthy.'''
    if not verbose:
        return
    traceback.print_exc()
|
||||
|
||||
|
||||
class Query(object):
    '''
    Build an Amazon advanced search URL and, when called, download the
    result pages and collect the links to the individual book pages.
    '''

    BASE_URL_ALL = 'http://www.amazon.com'
    BASE_URL_FR = 'http://www.amazon.fr'
    BASE_URL_DE = 'http://www.amazon.de'

    def __init__(self, title=None, author=None, publisher=None, isbn=None, keywords=None,
        max_results=20, rlang='all'):
        '''
        At least one of title, author, publisher, isbn or keywords is
        required and max_results must be below 21 (both enforced with
        assert). rlang is one of 'all', 'en', 'es', 'fr' or 'de'; it selects
        the site to query and, for 'en'/'es', a language filter.
        '''
        assert not(title is None and author is None and publisher is None
            and isbn is None and keywords is None)
        # Convert before comparing: the command line hands over a string and
        # on Python 2 any str compares greater than any int.
        max_results = int(max_results)
        assert (max_results < 21)

        self.max_results = max_results
        self.renbres = re.compile(u'\s*(\d+)\s*')

        q = {   'search-alias' : 'stripbooks' ,
                'unfiltered' : '1',
                'field-keywords' : '',
                'field-author' : '',
                'field-title' : '',
                'field-isbn' : '',
                'field-publisher' : ''
                #get to amazon detailed search page to get all options
                # 'node' : '',
                # 'field-binding' : '',
                #before, during, after
                # 'field-dateop' : '',
                #month as number
                # 'field-datemod' : '',
                # 'field-dateyear' : '',
                #french only
                # 'field-collection' : '',
                #many options available
            }

        if rlang =='all':
            q['sort'] = 'relevanceexprank'
            self.urldata = self.BASE_URL_ALL
        elif rlang =='es':
            q['sort'] = 'relevanceexprank'
            q['field-language'] = 'Spanish'
            self.urldata = self.BASE_URL_ALL
        elif rlang =='en':
            q['sort'] = 'relevanceexprank'
            q['field-language'] = 'English'
            self.urldata = self.BASE_URL_ALL
        elif rlang =='fr':
            q['sort'] = 'relevancerank'
            self.urldata = self.BASE_URL_FR
        elif rlang =='de':
            q['sort'] = 'relevancerank'
            self.urldata = self.BASE_URL_DE
        self.baseurl = self.urldata

        # An ISBN identifies the book on its own, so the other fields are
        # only used when no ISBN was given.
        if isbn is not None:
            q['field-isbn'] = isbn.replace('-', '')
        else:
            if title is not None:
                q['field-title'] = title
            if author is not None:
                q['field-author'] = author
            if publisher is not None:
                q['field-publisher'] = publisher
            if keywords is not None:
                q['field-keywords'] = keywords

        # The original tested the dict itself for being unicode, which is
        # never true. urlencode() calls str() on each value, which fails for
        # non-ASCII unicode, so encode the values here.
        q = dict((k, v.encode('utf-8') if isinstance(v, unicode) else v)
                for k, v in q.items())
        self.urldata += '/gp/search/ref=sr_adv_b/?' + urlencode(q)

    def _parse(self, raw):
        '''Return raw parsed into an lxml tree, or None when it can not be parsed.'''
        raw = xml_to_unicode(raw, strip_encoding_pats=True,
                resolve_entities=True)[0]
        try:
            return soupparser.fromstring(raw)
        except Exception:
            try:
                #remove ASCII invalid chars
                return soupparser.fromstring(clean_ascii_chars(raw))
            except Exception:
                return None

    def __call__(self, browser, verbose, timeout = 5.):
        '''
        Run the query with browser.

        Always returns a pair: (list of result links, base url of the site)
        on success, or (None, query url) when the page is missing, can not
        be parsed or has no result count. Errors other than a 404 while
        fetching the first page are re-raised.
        '''
        if verbose:
            print('Query: %s' % self.urldata)

        try:
            raw = browser.open_novisit(self.urldata, timeout=timeout).read()
        except Exception as e:
            report(verbose)
            if callable(getattr(e, 'getcode', None)) and \
                    e.getcode() == 404:
                # a pair, not a bare None: the caller unpacks the result
                return None, self.urldata
            raise
        if '<title>404 - ' in raw:
            return None, self.urldata

        feed = self._parse(raw)
        if feed is None:
            return None, self.urldata

        #nb of page
        try:
            nbresults = self.renbres.findall(feed.xpath("//*[@class='resultCount']")[0].text)
        except Exception:
            return None, self.urldata

        pages = [feed]
        # Index 1 is read as the page size and index 2 as the total, so at
        # least three numbers are needed (the original only checked for two).
        if len(nbresults) > 2 and int(nbresults[1]) > 0:
            wanted = min(int(nbresults[2]), self.max_results)
            nbpagetoquery = int(ceil(float(wanted) / int(nbresults[1])))
            for i in xrange(2, nbpagetoquery + 1):
                try:
                    urldata = self.urldata + '&page=' + str(i)
                    raw = browser.open_novisit(urldata, timeout=timeout).read()
                except Exception:
                    # extra pages are best effort
                    continue
                if '<title>404 - ' in raw:
                    continue
                feed = self._parse(raw)
                if feed is not None:
                    pages.append(feed)

        results = []
        for x in pages:
            results.extend([i.getparent().get('href')
                    for i in x.xpath("//a/span[@class='srTitle']")])
        return results[:self.max_results], self.baseurl
|
||||
|
||||
class ResultList(list):
    '''
    List of MetaInformation objects, one per Amazon book page that could
    be downloaded and parsed by populate().
    '''

    def __init__(self, baseurl, lang = 'all'):
        '''baseurl is prepended to relative tag links, lang drives month name replacement in dates.'''
        self.baseurl = baseurl
        self.lang = lang
        self.repub = re.compile(u'\((.*)\)')
        self.rerat = re.compile(u'([0-9.]+)')
        self.reattr = re.compile(r'<([a-zA-Z0-9]+)\s[^>]+>')
        self.reoutp = re.compile(r'(?s)<em>--This text ref.*?</em>')
        self.recom = re.compile(r'(?s)<!--.*?-->')
        self.republi = re.compile(u'(Editeur|Publisher|Verlag)', re.I)
        # the original listed ISBN-10 twice; the second one was evidently
        # meant to be ISBN-13
        self.reisbn = re.compile(u'(ISBN-10|ISBN-13|ASIN)', re.I)
        self.relang = re.compile(u'(Language|Langue|Sprache)', re.I)
        self.reratelt = re.compile(u'(Average\s*Customer\s*Review|Moyenne\s*des\s*commentaires\s*client|Durchschnittliche\s*Kundenbewertung)', re.I)
        self.reprod = re.compile(u'(Product\s*Details|D.tails\s*sur\s*le\s*produit|Produktinformation)', re.I)

    def strip_tags_etree(self, etreeobj, invalid_tags):
        '''
        Remove tags from etreeobj in place. invalid_tags maps a tag name to
        True (drop the element and its content) or False (drop only the
        tag, keep the content).
        '''
        for (itag, rmv) in invalid_tags.items():
            # take a snapshot first: dropping elements while the iterator
            # is still walking the tree can make it skip elements
            for elt in list(etreeobj.getiterator(itag)):
                if rmv:
                    elt.drop_tree()
                else:
                    elt.drop_tag()

    def clean_entry(self, entry, invalid_tags = {'script': True},
                invalid_id = (), invalid_class=()):
        '''
        Remove unwanted content from entry in place: the tags in
        invalid_tags (see strip_tags_etree), the elements whose id is in
        invalid_id and the elements having a class in invalid_class.
        '''
        #invalid_tags: remove tag and keep content if False else remove
        #remove tags
        if invalid_tags:
            self.strip_tags_etree(entry, invalid_tags)
        #remove id
        for eltid in invalid_id:
            # without a default get_element_by_id raises KeyError for a
            # missing id, which made the None check unreachable
            elt = entry.get_element_by_id(eltid, None)
            if elt is not None:
                elt.drop_tree()
        #remove class
        for eltclass in invalid_class:
            for elt in entry.find_class(eltclass) or ():
                elt.drop_tree()

    def get_title(self, entry):
        '''Return the text of the element with id btAsinTitle, newlines removed and stripped.'''
        title = entry.get_element_by_id('btAsinTitle')
        if title is not None:
            title = title.text
        return unicode(title.replace('\n', '').strip())

    def get_authors(self, entry):
        '''
        Return the text of every link found in the parent of the closest
        div enclosing the title element.
        '''
        author = entry.get_element_by_id('btAsinTitle')
        while author.getparent().tag != 'div':
            author = author.getparent()
        author = author.getparent()
        return [unicode(x.text_content().strip())
                for x in author.getiterator('a')]

    def get_description(self, entry, verbose):
        '''Return the sanitized product description as HTML, or None on any failure.'''
        try:
            description = entry.get_element_by_id("productDescription").find("div[@class='content']")
            inv_class = ('seeAll', 'emptyClear')
            inv_tags ={'img': True, 'a': False}
            self.clean_entry(description, invalid_tags=inv_tags, invalid_class=inv_class)
            description = html.tostring(description, method='html', encoding=unicode).strip()
            # remove all attributes from tags
            description = self.reattr.sub(r'<\1>', description)
            # Remove the notice about text referring to out of print editions
            description = self.reoutp.sub('', description)
            # Remove comments
            description = self.recom.sub('', description)
            return unicode(sanitize_comments_html(description))
        except Exception:
            report(verbose)
            return None

    def get_tags(self, entry, browser, verbose):
        '''
        Return the text of the rel="tag" links of the tag holder, following
        the first usable "see-all" link to the full tag page when there is
        one. Returns [] on any failure.
        '''
        try:
            tags = entry.get_element_by_id('tagContentHolder')
            testptag = tags.find_class('see-all')
            if testptag:
                for x in testptag:
                    alink = x.xpath('descendant-or-self::a')
                    if alink:
                        if alink[0].get('class') == 'tgJsActive':
                            continue
                        link = self.baseurl + alink[0].get('href')
                        entry = self.get_individual_metadata(browser, link, verbose)
                        tags = entry.get_element_by_id('tagContentHolder')
                        break
            tags = [a.text for a in tags.getiterator('a') if a.get('rel') == 'tag']
        except Exception:
            report(verbose)
            tags = []
        return tags

    def _labelled_items(self, pattern, elts):
        '''Return the elements of elts whose <b> child has text matching pattern.'''
        found = []
        for li in elts:
            label = li.find('b')
            # items without a <b> label or with an empty one are skipped
            # instead of raising
            if label is not None and label.text and pattern.search(label.text):
                found.append(li)
        return found

    def _label_value(self, li):
        '''Return the text that follows the <b> label of li, '' when there is none.'''
        return li.find('b').tail or ''

    def get_book_info(self, entry, mi, verbose):
        '''
        Fill mi.publisher, mi.pubdate, mi.isbn, mi.language and mi.rating
        from the product details list of entry, as far as they are found.
        Returns mi.
        '''
        try:
            entry = entry.get_element_by_id('SalesRank').getparent()
        except Exception:
            # no SalesRank element: look for the list under the product
            # details heading instead
            try:
                for z in entry.getiterator('h2'):
                    if self.reprod.search(z.text_content()):
                        entry = z.getparent().find("div[@class='content']/ul")
                        break
            except Exception:
                report(verbose)
                return mi
        if entry is None:
            return mi
        elts = entry.findall('li')

        #pub & date
        elt = self._labelled_items(self.republi, elts)
        if elt:
            pub = self._label_value(elt[0])
            publisher = self.repub.sub('', pub).strip()
            if publisher:
                mi.publisher = unicode(publisher)
            d = self.repub.search(pub)
            if d is not None:
                d = d.group(1)
                try:
                    default = utcnow().replace(day=15)
                    if self.lang != 'all':
                        d = replace_months(d, self.lang)
                    d = parse_date(d, assume_utc=True, default=default)
                    mi.pubdate = d
                except Exception:
                    report(verbose)

        #ISBN
        for li in self._labelled_items(self.reisbn, elts):
            # the first candidate that validates wins
            isbn = self._label_value(li).replace('-', '').strip()
            if check_isbn(isbn):
                mi.isbn = unicode(isbn)
                break

        #Langue
        elt = self._labelled_items(self.relang, elts)
        if elt:
            langue = self._label_value(elt[0]).strip()
            if langue:
                mi.language = unicode(langue)

        #ratings
        elt = self._labelled_items(self.reratelt, elts)
        if elt:
            ratings = elt[0].find_class('swSprite')
            if ratings:
                ratings = self.rerat.findall(ratings[0].get('title') or '')
                if len(ratings) == 2:
                    try:
                        mi.rating = float(ratings[0])/float(ratings[1]) * 5
                    except (ValueError, ZeroDivisionError):
                        # e.g. a lone '.' matched or a zero scale
                        report(verbose)
        return mi

    def fill_MI(self, entry, title, authors, browser, verbose):
        '''Build the MetaInformation of one book page from its parsed tree.'''
        mi = MetaInformation(title, authors)
        mi.author_sort = authors_to_sort_string(authors)
        mi.comments = self.get_description(entry, verbose)
        mi = self.get_book_info(entry, mi, verbose)
        mi.tags = self.get_tags(entry, browser, verbose)
        return mi

    def get_individual_metadata(self, browser, linkdata, verbose):
        '''
        Download linkdata and return it as a parsed tree. Returns None for
        a 404 or an unparsable page; other download errors are re-raised.
        '''
        try:
            raw = browser.open_novisit(linkdata).read()
        except Exception as e:
            report(verbose)
            if callable(getattr(e, 'getcode', None)) and \
                    e.getcode() == 404:
                return
            raise
        if '<title>404 - ' in raw:
            report(verbose)
            return
        raw = xml_to_unicode(raw, strip_encoding_pats=True,
                resolve_entities=True)[0]
        try:
            return soupparser.fromstring(raw)
        except Exception:
            try:
                #remove ASCII invalid chars
                return soupparser.fromstring(clean_ascii_chars(raw))
            except Exception:
                report(verbose)
                return

    def populate(self, entries, browser, verbose=False):
        '''
        Download every link in entries and append the metadata of each page
        whose title and authors could be read. Failing pages are skipped.
        '''
        for x in entries:
            try:
                entry = self.get_individual_metadata(browser, x, verbose)
                # clean results
                # inv_ids = ('divsinglecolumnminwidth', 'sims.purchase', 'AutoBuyXGetY', 'A9AdsMiddleBoxTop')
                # inv_class = ('buyingDetailsGrid', 'productImageGrid')
                # inv_tags ={'script': True, 'style': True, 'form': False}
                # self.clean_entry(entry, invalid_id=inv_ids)
                title = self.get_title(entry)
                authors = self.get_authors(entry)
            except Exception as e:
                if verbose:
                    print('Failed to get all details for an entry')
                    print(e)
                    print('URL who failed: %s' % (x,))
                report(verbose)
                continue
            self.append(self.fill_MI(entry, title, authors, browser, verbose))
|
||||
|
||||
|
||||
def search(title=None, author=None, publisher=None, isbn=None,
           max_results=5, verbose=False, keywords=None, lang='all'):
    '''
    Query Amazon and return a ResultList with the metadata of the matching
    books, or None when nothing was found.

    lang is passed to Query as rlang and to ResultList; see Query for the
    accepted values and for the constraints on the other arguments.
    '''
    br = browser()
    found = Query(title=title, author=author, isbn=isbn, publisher=publisher,
        keywords=keywords, max_results=max_results, rlang=lang)(br, verbose)

    # Query can hand back a bare None (e.g. after a 404) instead of the
    # (entries, baseurl) pair; unpacking that would raise TypeError.
    if not isinstance(found, tuple):
        return
    entries, baseurl = found
    if not entries:
        return

    #List of entry
    ans = ResultList(baseurl, lang)
    ans.populate(entries, br, verbose)
    return ans
|
||||
|
||||
def option_parser():
    '''Return the command line option parser used by main().'''
    parser = OptionParser(textwrap.dedent(\
    _('''\
        %prog [options]

        Fetch book metadata from Amazon. You must specify one of title, author,
        ISBN, publisher or keywords. Will fetch a maximum of 10 matches,
        so you should make your query as specific as possible.
        You can chose the language for metadata retrieval:
        All & english & french & german & spanish
    '''
    )))
    parser.add_option('-t', '--title', help='Book title')
    parser.add_option('-a', '--author', help='Book author(s)')
    parser.add_option('-p', '--publisher', help='Book publisher')
    parser.add_option('-i', '--isbn', help='Book ISBN')
    parser.add_option('-k', '--keywords', help='Keywords')
    # type='int': without it a value given on the command line stays a
    # string, and on Python 2 a string never compares lower than the int
    # limit asserted by Query
    parser.add_option('-m', '--max-results', default=10, type='int',
                      help='Maximum number of results to fetch')
    parser.add_option('-l', '--lang', default='all',
                      help='Chosen language for metadata search (all, en, fr, es, de)')
    parser.add_option('-v', '--verbose', default=0, action='count',
                      help='Be more verbose about errors')
    return parser
|
||||
|
||||
def main(args=sys.argv):
    '''
    Command line entry point: parse args, run the search and print every
    result followed by an empty line.

    Returns 1 (after printing the traceback and the help) when search()
    fails an assertion, 0 when nothing was found, None otherwise.
    '''
    parser = option_parser()
    opts, args = parser.parse_args(args)
    try:
        results = search(opts.title, opts.author, isbn=opts.isbn,
            publisher=opts.publisher, keywords=opts.keywords,
            verbose=opts.verbose, max_results=opts.max_results,
            lang=opts.lang)
    except AssertionError:
        report(True)
        parser.print_help()
        return 1

    # covers both None and an empty result list
    if not results:
        print('No result found for this search!')
        return 0

    for result in results:
        encoded = unicode(result).encode(preferred_encoding, 'replace')
        print(encoded)
        print('')
|
||||
|
||||
if __name__ == '__main__':
|
||||
sys.exit(main())
|
390
src/calibre/ebooks/metadata/fictionwise.py
Normal file
390
src/calibre/ebooks/metadata/fictionwise.py
Normal file
@ -0,0 +1,390 @@
|
||||
from __future__ import with_statement
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2010, sengian <sengian1@gmail.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import sys, textwrap, re, traceback, socket
|
||||
from urllib import urlencode
|
||||
|
||||
from lxml.html import soupparser, tostring
|
||||
|
||||
from calibre import browser, preferred_encoding
|
||||
from calibre.ebooks.chardet import xml_to_unicode
|
||||
from calibre.ebooks.metadata import MetaInformation, check_isbn, \
|
||||
authors_to_sort_string
|
||||
from calibre.library.comments import sanitize_comments_html
|
||||
from calibre.ebooks.metadata.fetch import MetadataSource
|
||||
from calibre.utils.config import OptionParser
|
||||
from calibre.utils.date import parse_date, utcnow
|
||||
from calibre.utils.cleantext import clean_ascii_chars
|
||||
|
||||
class Fictionwise(MetadataSource): # {{{
    '''Metadata source plugin that runs the module level search() for Fictionwise.'''

    name = 'Fictionwise'
    author = 'Sengian'
    description = _('Downloads metadata from Fictionwise')

    has_html_comments = True

    def fetch(self):
        '''
        Run the search and store the outcome on the instance.

        On success the matches are stored in self.results; on any failure
        the exception and its formatted traceback are stored in
        self.exception and self.tb instead of being raised.
        '''
        try:
            found = search(self.title, self.book_author, self.publisher,
                    self.isbn, max_results=10, verbose=self.verbose)
        except Exception as err:
            self.exception = err
            self.tb = traceback.format_exc()
        else:
            self.results = found
|
||||
|
||||
# }}}
|
||||
|
||||
class FictionwiseError(Exception):
    '''Raised when a request to Fictionwise times out or fails for another reason.'''
|
||||
|
||||
def report(verbose):
    '''Print the traceback of the exception being handled, but only when verbose is truthy.'''
    if not verbose:
        return
    traceback.print_exc()
|
||||
|
||||
class Query(object):
    '''
    Build the form data of a Fictionwise advanced search and, when called,
    post it and collect the links to the individual book pages.
    '''

    BASE_URL = 'http://www.fictionwise.com/servlet/mw'

    def __init__(self, title=None, author=None, publisher=None, keywords=None, max_results=20):
        '''
        At least one of title, author, publisher or keywords is required
        and max_results must be below 21 (both enforced with assert).
        '''
        assert not(title is None and author is None and publisher is None and keywords is None)
        # Convert before comparing: the command line hands over a string and
        # on Python 2 any str compares greater than any int.
        max_results = int(max_results)
        assert (max_results < 21)

        self.max_results = max_results
        q = {   'template' : 'searchresults_adv.htm' ,
                'searchtitle' : '',
                'searchauthor' : '',
                'searchpublisher' : '',
                'searchkeyword' : '',
                #possibilities startoflast, fullname, lastfirst
                'searchauthortype' : 'startoflast',
                'searchcategory' : '',
                'searchcategory2' : '',
                'searchprice_s' : '0',
                'searchprice_e' : 'ANY',
                'searchformat' : '',
                'searchgeo' : 'US',
                'searchfwdatetype' : '',
                #maybe use dates fields if needed?
                #'sortorder' : 'DESC',
                #many options available: b.SortTitle, a.SortName,
                #b.DateFirstPublished, b.FWPublishDate
                'sortby' : 'b.SortTitle'
            }
        if title is not None:
            q['searchtitle'] = title
        if author is not None:
            q['searchauthor'] = author
        if publisher is not None:
            q['searchpublisher'] = publisher
        if keywords is not None:
            q['searchkeyword'] = keywords

        # The original tested the dict itself for being unicode, which is
        # never true. urlencode() calls str() on each value, which fails for
        # non-ASCII unicode, so encode the values here.
        q = dict((k, v.encode('utf-8') if isinstance(v, unicode) else v)
                for k, v in q.items())
        self.urldata = urlencode(q)

    def __call__(self, browser, verbose, timeout = 5.):
        '''
        Post the query with browser.

        Returns the list of result links, or a list holding only the parsed
        page when it contains no result link (normally a single book page
        or nothing). Returns None for a 404 or an unparsable page and
        raises FictionwiseError for any other download failure.
        '''
        if verbose:
            print(_('Query: %s') % self.BASE_URL + self.urldata)

        try:
            raw = browser.open_novisit(self.BASE_URL, self.urldata, timeout=timeout).read()
        except Exception as e:
            report(verbose)
            if callable(getattr(e, 'getcode', None)) and \
                    e.getcode() == 404:
                return None
            # "or [None]" also covers an exception with empty args
            first_arg = (getattr(e, 'args', None) or [None])[0]
            if isinstance(first_arg, socket.timeout):
                raise FictionwiseError(_('Fictionwise timed out. Try again later.'))
            raise FictionwiseError(_('Fictionwise encountered an error.'))
        if '<title>404 - ' in raw:
            return None
        raw = xml_to_unicode(raw, strip_encoding_pats=True,
                resolve_entities=True)[0]
        try:
            feed = soupparser.fromstring(raw)
        except Exception:
            try:
                #remove ASCII invalid chars
                feed = soupparser.fromstring(clean_ascii_chars(raw))
            except Exception:
                return None

        # get list of results as links
        rows = feed.xpath("//table[3]/tr/td[2]/table/tr/td/p/table[2]/tr[@valign]")
        results = []
        for row in rows[:self.max_results]:
            anchors = row.xpath('descendant-or-self::a')
            # a row without any link is skipped instead of raising IndexError
            if anchors:
                results.append(anchors[0].get('href'))
        #return feed if no links ie normally a single book or nothing
        if not results:
            results = [feed]
        return results
|
||||
|
||||
class ResultList(list):
    '''
    List of MetaInformation objects, one per Fictionwise book page that
    could be parsed by populate().
    '''

    BASE_URL = 'http://www.fictionwise.com'
    COLOR_VALUES = {'BLUE': 4, 'GREEN': 3, 'YELLOW': 2, 'RED': 1, 'NA': 0}

    def __init__(self):
        self.retitle = re.compile(r'\[[^\[\]]+\]')
        self.rechkauth = re.compile(r'.*book\s*by', re.I)
        self.redesc = re.compile(r'book\s*description\s*:\s*(<br[^>]+>)*(?P<desc>.*)<br[^>]*>.{,15}publisher\s*:', re.I)
        self.repub = re.compile(r'.*publisher\s*:\s*', re.I)
        self.redate = re.compile(r'.*release\s*date\s*:\s*', re.I)
        self.retag = re.compile(r'.*book\s*category\s*:\s*', re.I)
        self.resplitbr = re.compile(r'<br[^>]*>', re.I)
        self.recomment = re.compile(r'(?s)<!--.*?-->')
        self.reimg = re.compile(r'<img[^>]*>', re.I)
        self.resanitize = re.compile(r'\[HTML_REMOVED\]\s*', re.I)
        self.renbcom = re.compile('(?P<nbcom>\d+)\s*Reader Ratings:')
        self.recolor = re.compile('(?P<ncolor>[^/]+).gif')
        self.resplitbrdiv = re.compile(r'(<br[^>]+>|</?div[^>]*>)', re.I)
        self.reisbn = re.compile(r'.*ISBN\s*:\s*', re.I)

    def strip_tags_etree(self, etreeobj, invalid_tags):
        '''
        Remove tags from etreeobj in place. invalid_tags maps a tag name to
        True (drop the element and its content) or False (drop only the
        tag, keep the content).
        '''
        for (itag, rmv) in invalid_tags.items():
            # take a snapshot first: dropping elements while the iterator
            # is still walking the tree can make it skip elements
            for elt in list(etreeobj.getiterator(itag)):
                if rmv:
                    elt.drop_tree()
                else:
                    elt.drop_tag()

    def clean_entry(self, entry, invalid_tags = {'script': True},
                invalid_id = (), invalid_class=(), invalid_xpath = ()):
        '''
        Remove unwanted content from entry in place: the tags in
        invalid_tags (see strip_tags_etree), the elements matched by the
        expressions in invalid_xpath, the elements whose id is in
        invalid_id and the elements having a class in invalid_class.
        '''
        #invalid_tags: remove tag and keep content if False else remove
        #remove tags
        if invalid_tags:
            self.strip_tags_etree(entry, invalid_tags)
        #remove xpath
        for eltid in invalid_xpath:
            for el in entry.xpath(eltid):
                el.drop_tree()
        #remove id
        for eltid in invalid_id:
            # without a default get_element_by_id raises KeyError for a
            # missing id, which made the None check unreachable
            elt = entry.get_element_by_id(eltid, None)
            if elt is not None:
                elt.drop_tree()
        #remove class
        for eltclass in invalid_class:
            for elt in entry.find_class(eltclass) or ():
                elt.drop_tree()

    def output_entry(self, entry, prettyout = True, htmlrm="\d+"):
        '''
        Serialize entry and strip newlines, tabs, carriage returns and the
        numeric character references matching htmlrm.
        '''
        out = tostring(entry, pretty_print=prettyout)
        #try to work around tostring to remove this encoding for example
        reclean = re.compile('(\n+|\t+|\r+|&#'+htmlrm+';)')
        return reclean.sub('', out)

    def get_title(self, entry):
        '''Return the title text of entry without its bracketed parts.'''
        title = entry.findtext('./')
        return self.retitle.sub('', title).strip()

    def get_authors(self, entry):
        '''Return the authors named after "book by" in the text following the first <br>, [] when absent.'''
        authortext = entry.find('./br').tail
        if not self.rechkauth.search(authortext):
            return []
        authortext = self.rechkauth.sub('', authortext)
        return [a.strip() for a in authortext.split('&')]

    def get_rating(self, entrytable, verbose):
        '''
        Return the rating on a 0-5 scale computed from the heights of the
        colored bar images in entrytable, or None when it can not be
        computed.
        '''
        try:
            nbcomment = tostring(entrytable.getprevious())
            nbcomment = self.renbcom.search(nbcomment).group("nbcom")
            hval = dict((self.COLOR_VALUES[self.recolor.search(image.get('src', default='NA.gif')).group("ncolor")],
                    float(image.get('height', default=0)))
                    for image in entrytable.getiterator('img'))
        except Exception:
            # a missing or unexpected rating table only costs the rating,
            # not the whole entry
            report(verbose)
            return None
        total = sum(hval.values())
        if not total:
            # no bar has any height: nothing to average
            return None
        #ratings as x/5
        return float(1.25*sum(k*v for (k, v) in hval.items())/total)

    def get_description(self, entry):
        '''Return the sanitized book description prefixed with a SUMMARY label, or None when not found.'''
        description = self.output_entry(entry.xpath('./p')[1],htmlrm="")
        description = self.redesc.search(description)
        if not description or not description.group("desc"):
            return None
        #remove invalid tags
        description = self.reimg.sub('', description.group("desc"))
        description = self.recomment.sub('', description)
        description = self.resanitize.sub('', sanitize_comments_html(description))
        return _('SUMMARY:\n %s') % re.sub(r'\n\s+</p>','\n</p>', description)

    def get_publisher(self, entry):
        '''Return the publisher name (text up to the first comma), or None when not found.'''
        publisher = self.output_entry(entry.xpath('./p')[1])
        publisher = [x for x in self.resplitbr.split(publisher)
                if self.repub.search(x) is not None]
        if not publisher:
            return None
        publisher = self.repub.sub('', publisher[0])
        return publisher.split(',')[0].strip()

    def get_tags(self, entry):
        '''Return the "/" separated book categories as a list, [] when not found.'''
        tag = self.output_entry(entry.xpath('./p')[1])
        tag = [x for x in self.resplitbr.split(tag)
                if self.retag.search(x) is not None]
        if not tag:
            return []
        return [x.strip() for x in self.retag.sub('', tag[0]).split('/')]

    def get_date(self, entry, verbose):
        '''Return the parsed release date, or None when it is missing or can not be parsed.'''
        date = self.output_entry(entry.xpath('./p')[1])
        date = [x for x in self.resplitbr.split(date)
                if self.redate.search(x) is not None]
        if not date:
            return None
        try:
            d = self.redate.sub('', date[0])
            if d:
                default = utcnow().replace(day=15)
                d = parse_date(d, assume_utc=True, default=default)
            else:
                d = None
        except Exception:
            report(verbose)
            d = None
        return d

    def get_ISBN(self, entry):
        '''Return the longest valid ISBN listed in the third paragraph of entry, or None.'''
        paragraphs = entry.xpath('./p')
        if len(paragraphs) < 3:
            return None
        isbns = self.output_entry(paragraphs[2])
        # None parts come from the split pattern's capture group
        isbns = [self.reisbn.sub('', x) for x in self.resplitbrdiv.split(isbns)
                if x and self.reisbn.search(x) is not None]
        isbns = [x for x in isbns if check_isbn(x)]
        if not isbns:
            # the original indexed [-1] on a possibly empty list
            return None
        # stable sort then last item: same pick as the original cmp based sort
        return sorted(isbns, key=len)[-1]

    def fill_MI(self, entry, title, authors, ratings, verbose):
        '''Build the MetaInformation of one book from its cleaned page fragment.'''
        mi = MetaInformation(title, authors)
        mi.rating = ratings
        mi.comments = self.get_description(entry)
        mi.publisher = self.get_publisher(entry)
        mi.tags = self.get_tags(entry)
        mi.pubdate = self.get_date(entry, verbose)
        mi.isbn = self.get_ISBN(entry)
        mi.author_sort = authors_to_sort_string(authors)
        return mi

    def get_individual_metadata(self, browser, linkdata, verbose):
        '''
        Download BASE_URL + linkdata and return it as a parsed tree.
        Returns None for a 404 or an unparsable page and raises
        FictionwiseError for any other download failure.
        '''
        try:
            raw = browser.open_novisit(self.BASE_URL + linkdata).read()
        except Exception as e:
            report(verbose)
            if callable(getattr(e, 'getcode', None)) and \
                    e.getcode() == 404:
                return
            # "or [None]" also covers an exception with empty args
            first_arg = (getattr(e, 'args', None) or [None])[0]
            if isinstance(first_arg, socket.timeout):
                raise FictionwiseError(_('Fictionwise timed out. Try again later.'))
            raise FictionwiseError(_('Fictionwise encountered an error.'))
        if '<title>404 - ' in raw:
            report(verbose)
            return
        raw = xml_to_unicode(raw, strip_encoding_pats=True,
                resolve_entities=True)[0]
        try:
            return soupparser.fromstring(raw)
        except Exception:
            try:
                #remove ASCII invalid chars
                return soupparser.fromstring(clean_ascii_chars(raw))
            except Exception:
                return None

    def populate(self, entries, browser, verbose=False):
        '''
        Append the metadata of every book in entries.

        entries is either a list of links to book pages or a list holding
        one already parsed page (a search that led straight to a single
        book). Entries that can not be processed are skipped; None or an
        empty list is accepted and adds nothing.
        '''
        if not entries:
            return
        inv_tags ={'script': True, 'a': False, 'font': False, 'strong': False, 'b': False,
            'ul': False, 'span': False}
        inv_xpath =('./table',)
        # hrefs may be str or unicode, hence basestring; anything else is
        # the parsed page itself
        single = len(entries) == 1 and not isinstance(entries[0], basestring)
        for x in entries:
            try:
                if single:
                    # the original called xpath on the list and did not
                    # index the result, so this path always failed
                    page = x
                else:
                    page = self.get_individual_metadata(browser, x, verbose)
                entry = page.xpath("//table[3]/tr/td[2]/table[1]/tr/td/font/table/tr/td")[0]
                self.clean_entry(entry, invalid_tags=inv_tags, invalid_xpath=inv_xpath)
                title = self.get_title(entry)
                #maybe strengthen the search
                ratings = self.get_rating(entry.xpath("./p/table")[1], verbose)
                authors = self.get_authors(entry)
                # built inside the try so that one malformed page does not
                # abort the whole search
                mi = self.fill_MI(entry, title, authors, ratings, verbose)
            except Exception as e:
                if verbose:
                    print(_('Failed to get all details for an entry'))
                    print(e)
                continue
            self.append(mi)
|
||||
|
||||
|
||||
def search(title=None, author=None, publisher=None, isbn=None,
           min_viewability='none', verbose=False, max_results=5,
           keywords=None):
    '''
    Query Fictionwise and return a ResultList of the metadata found.

    ``title``, ``author``, ``publisher``, ``keywords`` and ``max_results``
    are forwarded to ``Query``; ``isbn`` and ``min_viewability`` are accepted
    but not used by this function. The query is run with a 15 second timeout.

    Always returns a ResultList; it is empty when the query produced nothing.
    '''
    br = browser()
    query = Query(title=title, author=author, publisher=publisher,
                  keywords=keywords, max_results=max_results)
    entries = query(br, verbose, timeout=15.)

    # List of entry
    ans = ResultList()
    # populate() calls len(entries), so a query that yields no result object
    # (None) must not reach it; an empty result needs no work either.
    if entries is not None and len(entries) > 0:
        ans.populate(entries, br, verbose)
    return ans
|
||||
|
||||
|
||||
def option_parser():
    '''
    Build the command-line option parser for the Fictionwise metadata fetcher.

    Returns an OptionParser with the title, author, publisher, keywords,
    max-results and verbose options registered.
    '''
    usage = textwrap.dedent(
        _('''\
        %prog [options]

        Fetch book metadata from Fictionwise. You must specify one of title, author,
        or keywords. No ISBN specification possible. Will fetch a maximum of 20 matches,
        so you should make your query as specific as possible.
        '''))
    parser = OptionParser(usage)
    parser.add_option('-t', '--title', help=_('Book title'))
    parser.add_option('-a', '--author', help=_('Book author(s)'))
    parser.add_option('-p', '--publisher', help=_('Book publisher'))
    parser.add_option('-k', '--keywords', help=_('Keywords'))
    # type='int' so a value given on the command line has the same type as
    # the integer default instead of arriving as a string.
    parser.add_option('-m', '--max-results', default=20, type='int',
                      help=_('Maximum number of results to fetch'))
    parser.add_option('-v', '--verbose', default=0, action='count',
                      help=_('Be more verbose about errors'))
    return parser
|
||||
|
||||
def main(args=sys.argv):
    '''
    Command-line entry point: parse ``args``, run the search, print results.

    Returns 1 (after printing the traceback and the help text) when the
    search raises AssertionError, 0 when nothing was found; after printing
    the results the function ends without an explicit return value.
    '''
    parser = option_parser()
    opts, args = parser.parse_args(args)
    try:
        results = search(opts.title, opts.author, publisher=opts.publisher,
                         keywords=opts.keywords, verbose=opts.verbose,
                         max_results=opts.max_results)
    except AssertionError:
        report(True)
        parser.print_help()
        return 1

    found = results is not None and len(results) > 0
    if not found:
        print(_('No result found for this search!'))
        return 0

    for result in results:
        text = unicode(result)
        print(text.encode(preferred_encoding, 'replace'))
        # blank line between results
        print('')
|
||||
# Run the command-line interface when this module is executed as a script;
# the value returned by main() is passed to sys.exit().
if __name__ == '__main__':
    sys.exit(main())
|
@ -10,7 +10,8 @@ from copy import deepcopy
|
||||
|
||||
from lxml.html import soupparser
|
||||
|
||||
from calibre.utils.date import parse_date, utcnow
|
||||
from calibre.utils.date import parse_date, utcnow, replace_months
|
||||
from calibre.utils.cleantext import clean_ascii_chars
|
||||
from calibre import browser, preferred_encoding
|
||||
from calibre.ebooks.chardet import xml_to_unicode
|
||||
from calibre.ebooks.metadata import MetaInformation, check_isbn, \
|
||||
@ -71,31 +72,16 @@ class NiceBooksCovers(CoverDownload):
|
||||
traceback.format_exc(), self.name))
|
||||
|
||||
|
||||
class NiceBooksError(Exception):
|
||||
pass
|
||||
|
||||
class ISBNNotFound(NiceBooksError):
|
||||
pass
|
||||
|
||||
def report(verbose):
|
||||
if verbose:
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
|
||||
def replace_monthsfr(datefr):
|
||||
# Replace french months by english equivalent for parse_date
|
||||
frtoen = {
|
||||
u'[jJ]anvier': u'jan',
|
||||
u'[fF].vrier': u'feb',
|
||||
u'[mM]ars': u'mar',
|
||||
u'[aA]vril': u'apr',
|
||||
u'[mM]ai': u'may',
|
||||
u'[jJ]uin': u'jun',
|
||||
u'[jJ]uillet': u'jul',
|
||||
u'[aA]o.t': u'aug',
|
||||
u'[sS]eptembre': u'sep',
|
||||
u'[Oo]ctobre': u'oct',
|
||||
u'[nN]ovembre': u'nov',
|
||||
u'[dD].cembre': u'dec' }
|
||||
for k in frtoen.iterkeys():
|
||||
tmp = re.sub(k, frtoen[k], datefr)
|
||||
if tmp <> datefr: break
|
||||
return tmp
|
||||
|
||||
class Query(object):
|
||||
|
||||
BASE_URL = 'http://fr.nicebooks.com/'
|
||||
@ -119,7 +105,7 @@ class Query(object):
|
||||
|
||||
def __call__(self, browser, verbose, timeout = 5.):
|
||||
if verbose:
|
||||
print 'Query:', self.BASE_URL+self.urldata
|
||||
print _('Query: %s') % self.BASE_URL+self.urldata
|
||||
|
||||
try:
|
||||
raw = browser.open_novisit(self.BASE_URL+self.urldata, timeout=timeout).read()
|
||||
@ -128,7 +114,9 @@ class Query(object):
|
||||
if callable(getattr(e, 'getcode', None)) and \
|
||||
e.getcode() == 404:
|
||||
return
|
||||
raise
|
||||
if isinstance(getattr(e, 'args', [None])[0], socket.timeout):
|
||||
raise NiceBooksError(_('Nicebooks timed out. Try again later.'))
|
||||
raise NiceBooksError(_('Nicebooks encountered an error.'))
|
||||
if '<title>404 - ' in raw:
|
||||
return
|
||||
raw = xml_to_unicode(raw, strip_encoding_pats=True,
|
||||
@ -136,7 +124,11 @@ class Query(object):
|
||||
try:
|
||||
feed = soupparser.fromstring(raw)
|
||||
except:
|
||||
return
|
||||
try:
|
||||
#remove ASCII invalid chars
|
||||
feed = soupparser.fromstring(clean_ascii_chars(raw))
|
||||
except:
|
||||
return None
|
||||
|
||||
#nb of page to call
|
||||
try:
|
||||
@ -160,6 +152,10 @@ class Query(object):
|
||||
resolve_entities=True)[0]
|
||||
try:
|
||||
feed = soupparser.fromstring(raw)
|
||||
except:
|
||||
try:
|
||||
#remove ASCII invalid chars
|
||||
feed = soupparser.fromstring(clean_ascii_chars(raw))
|
||||
except:
|
||||
continue
|
||||
pages.append(feed)
|
||||
@ -180,14 +176,12 @@ class ResultList(list):
|
||||
self.reautclean = re.compile(u'\s*\(.*\)\s*')
|
||||
|
||||
def get_title(self, entry):
|
||||
# title = deepcopy(entry.find("div[@id='book-info']"))
|
||||
title = deepcopy(entry)
|
||||
title.remove(title.find("dl[@title='Informations sur le livre']"))
|
||||
title = ' '.join([i.text_content() for i in title.iterchildren()])
|
||||
return unicode(title.replace('\n', ''))
|
||||
|
||||
def get_authors(self, entry):
|
||||
# author = entry.find("div[@id='book-info']/dl[@title='Informations sur le livre']")
|
||||
author = entry.find("dl[@title='Informations sur le livre']")
|
||||
authortext = []
|
||||
for x in author.getiterator('dt'):
|
||||
@ -223,7 +217,7 @@ class ResultList(list):
|
||||
d = x.getnext().text_content()
|
||||
try:
|
||||
default = utcnow().replace(day=15)
|
||||
d = replace_monthsfr(d)
|
||||
d = replace_months(d, 'fr')
|
||||
d = parse_date(d, assume_utc=True, default=default)
|
||||
mi.pubdate = d
|
||||
except:
|
||||
@ -234,11 +228,6 @@ class ResultList(list):
|
||||
mi = MetaInformation(title, authors)
|
||||
mi.author_sort = authors_to_sort_string(authors)
|
||||
mi.comments = self.get_description(entry, verbose)
|
||||
# entry = entry.find("dl[@title='Informations sur le livre']")
|
||||
# mi.publisher = self.get_publisher(entry)
|
||||
# mi.pubdate = self.get_date(entry, verbose)
|
||||
# mi.isbn = self.get_ISBN(entry)
|
||||
# mi.language = self.get_language(entry)
|
||||
return self.get_book_info(entry, mi, verbose)
|
||||
|
||||
def get_individual_metadata(self, browser, linkdata, verbose):
|
||||
@ -249,7 +238,9 @@ class ResultList(list):
|
||||
if callable(getattr(e, 'getcode', None)) and \
|
||||
e.getcode() == 404:
|
||||
return
|
||||
raise
|
||||
if isinstance(getattr(e, 'args', [None])[0], socket.timeout):
|
||||
raise NiceBooksError(_('Nicebooks timed out. Try again later.'))
|
||||
raise NiceBooksError(_('Nicebooks encountered an error.'))
|
||||
if '<title>404 - ' in raw:
|
||||
report(verbose)
|
||||
return
|
||||
@ -258,7 +249,11 @@ class ResultList(list):
|
||||
try:
|
||||
feed = soupparser.fromstring(raw)
|
||||
except:
|
||||
return
|
||||
try:
|
||||
#remove ASCII invalid chars
|
||||
feed = soupparser.fromstring(clean_ascii_chars(raw))
|
||||
except:
|
||||
return None
|
||||
|
||||
# get results
|
||||
return feed.xpath("//div[@id='container']")[0]
|
||||
@ -292,13 +287,6 @@ class ResultList(list):
|
||||
continue
|
||||
self.append(self.fill_MI(entry, title, authors, verbose))
|
||||
|
||||
|
||||
class NiceBooksError(Exception):
|
||||
pass
|
||||
|
||||
class ISBNNotFound(NiceBooksError):
|
||||
pass
|
||||
|
||||
class Covers(object):
|
||||
|
||||
def __init__(self, isbn = None):
|
||||
@ -329,11 +317,10 @@ class Covers(object):
|
||||
return cover, ext if ext else 'jpg'
|
||||
except Exception, err:
|
||||
if isinstance(getattr(err, 'args', [None])[0], socket.timeout):
|
||||
err = NiceBooksError(_('Nicebooks timed out. Try again later.'))
|
||||
raise err
|
||||
raise NiceBooksError(_('Nicebooks timed out. Try again later.'))
|
||||
if not len(self.urlimg):
|
||||
if not self.isbnf:
|
||||
raise ISBNNotFound('ISBN: '+self.isbn+_(' not found.'))
|
||||
raise ISBNNotFound(_('ISBN: %s not found.') % self.isbn)
|
||||
raise NiceBooksError(_('An errror occured with Nicebooks cover fetcher'))
|
||||
|
||||
|
||||
@ -341,10 +328,10 @@ def search(title=None, author=None, publisher=None, isbn=None,
|
||||
max_results=5, verbose=False, keywords=None):
|
||||
br = browser()
|
||||
entries = Query(title=title, author=author, isbn=isbn, publisher=publisher,
|
||||
keywords=keywords, max_results=max_results)(br, verbose)
|
||||
keywords=keywords, max_results=max_results)(br, verbose,timeout = 10.)
|
||||
|
||||
if entries is None or len(entries) == 0:
|
||||
return
|
||||
return None
|
||||
|
||||
#List of entry
|
||||
ans = ResultList()
|
||||
@ -364,28 +351,28 @@ def cover_from_isbn(isbn, timeout = 5.):
|
||||
|
||||
def option_parser():
|
||||
parser = OptionParser(textwrap.dedent(\
|
||||
'''\
|
||||
_('''\
|
||||
%prog [options]
|
||||
|
||||
Fetch book metadata from Nicebooks. You must specify one of title, author,
|
||||
ISBN, publisher or keywords. Will fetch a maximum of 20 matches,
|
||||
so you should make your query as specific as possible.
|
||||
It can also get covers if the option is activated.
|
||||
'''
|
||||
''')
|
||||
))
|
||||
parser.add_option('-t', '--title', help='Book title')
|
||||
parser.add_option('-a', '--author', help='Book author(s)')
|
||||
parser.add_option('-p', '--publisher', help='Book publisher')
|
||||
parser.add_option('-i', '--isbn', help='Book ISBN')
|
||||
parser.add_option('-k', '--keywords', help='Keywords')
|
||||
parser.add_option('-t', '--title', help=_('Book title'))
|
||||
parser.add_option('-a', '--author', help=_('Book author(s)'))
|
||||
parser.add_option('-p', '--publisher', help=_('Book publisher'))
|
||||
parser.add_option('-i', '--isbn', help=_('Book ISBN'))
|
||||
parser.add_option('-k', '--keywords', help=_('Keywords'))
|
||||
parser.add_option('-c', '--covers', default=0,
|
||||
help='Covers: 1-Check/ 2-Download')
|
||||
help=_('Covers: 1-Check/ 2-Download'))
|
||||
parser.add_option('-p', '--coverspath', default='',
|
||||
help='Covers files path')
|
||||
help=_('Covers files path'))
|
||||
parser.add_option('-m', '--max-results', default=20,
|
||||
help='Maximum number of results to fetch')
|
||||
help=_('Maximum number of results to fetch'))
|
||||
parser.add_option('-v', '--verbose', default=0, action='count',
|
||||
help='Be more verbose about errors')
|
||||
help=_('Be more verbose about errors'))
|
||||
return parser
|
||||
|
||||
def main(args=sys.argv):
|
||||
@ -400,15 +387,15 @@ def main(args=sys.argv):
|
||||
parser.print_help()
|
||||
return 1
|
||||
if results is None or len(results) == 0:
|
||||
print 'No result found for this search!'
|
||||
print _('No result found for this search!')
|
||||
return 0
|
||||
for result in results:
|
||||
print unicode(result).encode(preferred_encoding, 'replace')
|
||||
covact = int(opts.covers)
|
||||
if covact == 1:
|
||||
textcover = 'No cover found!'
|
||||
textcover = _('No cover found!')
|
||||
if check_for_cover(result.isbn):
|
||||
textcover = 'A cover was found for this book'
|
||||
textcover = _('A cover was found for this book')
|
||||
print textcover
|
||||
elif covact == 2:
|
||||
cover_data, ext = cover_from_isbn(result.isbn)
|
||||
@ -417,7 +404,7 @@ def main(args=sys.argv):
|
||||
cpath = os.path.normpath(opts.coverspath + '/' + result.isbn)
|
||||
oname = os.path.abspath(cpath+'.'+ext)
|
||||
open(oname, 'wb').write(cover_data)
|
||||
print 'Cover saved to file ', oname
|
||||
print _('Cover saved to file '), oname
|
||||
print
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
23
src/calibre/utils/cleantext.py
Normal file
23
src/calibre/utils/cleantext.py
Normal file
@ -0,0 +1,23 @@
|
||||
from __future__ import with_statement
|
||||
__license__ = 'GPL 3'
|
||||
__copyright__ = '2010, sengian <sengian1@gmail.com>'
|
||||
__docformat__ = 'restructuredtext en'
|
||||
|
||||
import re
|
||||
|
||||
# Compiled pattern for the default character set, built on first use.
_ascii_pat = None


def clean_ascii_chars(txt, charlist=None):
    '''
    Remove invalid ASCII control characters from ``txt``.

    By default the code points 0 to 8, 11, 14 to 24, 26 and 27 are removed;
    tab, newline and carriage return are kept.

    :param txt: the text to clean.
    :param charlist: optional iterable of integer code points to remove
        instead of the default set. An empty ``charlist`` leaves ``txt``
        unchanged.
    :return: ``txt`` with the selected characters deleted.
    '''
    global _ascii_pat
    try:
        to_char = unichr  # Python 2
    except NameError:
        to_char = chr  # Python 3: chr() already returns text

    def compile_for(codes):
        # re.escape keeps code points that happen to be regex
        # metacharacters (e.g. '.' or '|') literal in the alternation.
        return re.compile(u'|'.join(re.escape(to_char(c)) for c in codes))

    if charlist is None:
        if _ascii_pat is None:
            # range(9) so that code point 8 is covered, as documented.
            default = list(range(9)) + [0x0B] + list(range(0x0E, 0x19)) \
                + [0x1A, 0x1B]
            _ascii_pat = compile_for(default)
        pat = _ascii_pat
    else:
        pat = compile_for(charlist)
    return pat.sub('', txt)
|
||||
|
@ -151,3 +151,45 @@ def format_date(dt, format, assume_utc=False, as_utc=False):
|
||||
format = re.sub('d{1,4}', format_day, format)
|
||||
format = re.sub('M{1,4}', format_month, format)
|
||||
return re.sub('yyyy|yy', format_year, format)
|
||||
|
||||
def replace_months(datestr, clang):
    '''
    Replace a month name in ``datestr`` by its English abbreviation so that
    the string can be handed to parse_date.

    :param datestr: the date text to convert.
    :param clang: language code of the month names; ``'fr'`` and ``'de'``
        are supported, any other value returns ``datestr`` unchanged.
    :return: ``datestr`` with every occurrence of the first matching month
        pattern replaced, or ``datestr`` itself when no month is found.

    Patterns are tried in calendar order. The ``.`` in some patterns stands
    for the accented letter of the month name.
    '''
    tables = {
        'fr': (
            (u'[jJ]anvier', u'jan'),
            (u'[fF].vrier', u'feb'),
            (u'[mM]ars', u'mar'),
            (u'[aA]vril', u'apr'),
            (u'[mM]ai', u'may'),
            (u'[jJ]uin', u'jun'),
            (u'[jJ]uillet', u'jul'),
            (u'[aA]o.t', u'aug'),
            (u'[sS]eptembre', u'sep'),
            (u'[Oo]ctobre', u'oct'),
            (u'[nN]ovembre', u'nov'),
            (u'[dD].cembre', u'dec'),
        ),
        'de': (
            (u'[jJ]anuar', u'jan'),
            (u'[fF]ebruar', u'feb'),
            (u'[mM].rz', u'mar'),
            (u'[aA]pril', u'apr'),
            (u'[mM]ai', u'may'),
            (u'[jJ]uni', u'jun'),
            (u'[jJ]uli', u'jul'),
            (u'[aA]ugust', u'aug'),
            (u'[sS]eptember', u'sep'),
            (u'[Oo]ktober', u'oct'),
            (u'[nN]ovember', u'nov'),
            (u'[dD]ezember', u'dec'),
        ),
    }

    table = tables.get(clang)
    if table is None:
        return datestr

    for pattern, english in table:
        replaced = re.sub(pattern, english, datestr)
        # Stop at the first month that actually occurs in the string.
        if replaced != datestr:
            return replaced
    return datestr
|
||||
|
||||
|
@ -284,7 +284,7 @@ icu_upper(PyObject *self, PyObject *args) {
|
||||
PyMem_Free(input);
|
||||
|
||||
return ret;
|
||||
}
|
||||
} // }}}
|
||||
|
||||
// lower {{{
|
||||
static PyObject *
|
||||
|
Loading…
x
Reference in New Issue
Block a user