mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Update of fictionwise.py
This commit is contained in:
parent
085e0af936
commit
57e0e1820a
@ -53,7 +53,6 @@ class Query(object):
|
|||||||
assert (max_results < 21)
|
assert (max_results < 21)
|
||||||
|
|
||||||
self.max_results = int(max_results)
|
self.max_results = int(max_results)
|
||||||
|
|
||||||
q = { 'template' : 'searchresults_adv.htm' ,
|
q = { 'template' : 'searchresults_adv.htm' ,
|
||||||
'searchtitle' : '',
|
'searchtitle' : '',
|
||||||
'searchauthor' : '',
|
'searchauthor' : '',
|
||||||
@ -131,12 +130,11 @@ class ResultList(list):
|
|||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.retitle = re.compile(r'\[[^\[\]]+\]')
|
self.retitle = re.compile(r'\[[^\[\]]+\]')
|
||||||
self.rechkauth = re.compile(r'.*book\s*by', re.I)
|
self.rechkauth = re.compile(r'.*book\s*by', re.I)
|
||||||
self.redesc = re.compile(r'book\s*description\s*:\s*(<br[^>]+>)*(?P<desc>.*)' \
|
self.redesc = re.compile(r'book\s*description\s*:\s*(<br[^>]+>)*(?P<desc>.*)<br[^>]*>.{,15}publisher\s*:', re.I)
|
||||||
+ '<br[^>]+>.{,15}publisher\s*:', re.I)
|
|
||||||
self.repub = re.compile(r'.*publisher\s*:\s*', re.I)
|
self.repub = re.compile(r'.*publisher\s*:\s*', re.I)
|
||||||
self.redate = re.compile(r'.*release\s*date\s*:\s*', re.I)
|
self.redate = re.compile(r'.*release\s*date\s*:\s*', re.I)
|
||||||
self.retag = re.compile(r'.*book\s*category\s*:\s*', re.I)
|
self.retag = re.compile(r'.*book\s*category\s*:\s*', re.I)
|
||||||
self.resplitbr = re.compile(r'<br[^>]+>', re.I)
|
self.resplitbr = re.compile(r'<br[^>]*>', re.I)
|
||||||
self.recomment = re.compile(r'(?s)<!--.*?-->')
|
self.recomment = re.compile(r'(?s)<!--.*?-->')
|
||||||
self.reimg = re.compile(r'<img[^>]*>', re.I)
|
self.reimg = re.compile(r'<img[^>]*>', re.I)
|
||||||
self.resanitize = re.compile(r'\[HTML_REMOVED\]\s*', re.I)
|
self.resanitize = re.compile(r'\[HTML_REMOVED\]\s*', re.I)
|
||||||
@ -180,21 +178,9 @@ class ResultList(list):
|
|||||||
for elt in elts:
|
for elt in elts:
|
||||||
elt.drop_tree()
|
elt.drop_tree()
|
||||||
|
|
||||||
def clean_entry_dffdfbdjbf(self, entry,
|
|
||||||
invalid_tags = ('font', 'strong', 'b', 'ul', 'span', 'a'),
|
|
||||||
remove_tags_trees = ('script',)):
|
|
||||||
for it in entry[0].iterchildren(tag='table'):
|
|
||||||
entry[0].remove(it)
|
|
||||||
entry[0].remove(entry[0].xpath( 'descendant-or-self::p[1]')[0])
|
|
||||||
entry = entry[0]
|
|
||||||
cleantree = self.strip_tags_etree(entry, invalid_tags)
|
|
||||||
for itag in remove_tags_trees:
|
|
||||||
for elts in cleantree.getiterator(itag):
|
|
||||||
elts.drop_tree()
|
|
||||||
return cleantree
|
|
||||||
|
|
||||||
def output_entry(self, entry, prettyout = True, htmlrm="\d+"):
|
def output_entry(self, entry, prettyout = True, htmlrm="\d+"):
|
||||||
out = tostring(entry, pretty_print=prettyout)
|
out = tostring(entry, pretty_print=prettyout)
|
||||||
|
#try to work around tostring to remove this encoding for exemle
|
||||||
reclean = re.compile('(\n+|\t+|\r+|&#'+htmlrm+';)')
|
reclean = re.compile('(\n+|\t+|\r+|&#'+htmlrm+';)')
|
||||||
return reclean.sub('', out)
|
return reclean.sub('', out)
|
||||||
|
|
||||||
@ -223,18 +209,18 @@ class ResultList(list):
|
|||||||
return float(1.25*sum(k*v for (k, v) in hval.iteritems())/sum(hval.itervalues()))
|
return float(1.25*sum(k*v for (k, v) in hval.iteritems())/sum(hval.itervalues()))
|
||||||
|
|
||||||
def get_description(self, entry):
|
def get_description(self, entry):
|
||||||
description = self.output_entry(entry.find('./p'),htmlrm="")
|
description = self.output_entry(entry.xpath('./p')[1],htmlrm="")
|
||||||
description = self.redesc.search(description)
|
description = self.redesc.search(description)
|
||||||
if not description and not description.group("desc"):
|
if not description or not description.group("desc"):
|
||||||
return None
|
return None
|
||||||
#remove invalid tags
|
#remove invalid tags
|
||||||
description = self.reimg.sub('', description.group("desc"))
|
description = self.reimg.sub('', description.group("desc"))
|
||||||
description = self.recomment.sub('', description)
|
description = self.recomment.sub('', description)
|
||||||
description = self.resanitize.sub('', sanitize_comments_html(description))
|
description = self.resanitize.sub('', sanitize_comments_html(description))
|
||||||
return 'SUMMARY:\n' + re.sub(r'\n\s+</p>','\n</p>', description)
|
return _('SUMMARY:\n %s') % re.sub(r'\n\s+</p>','\n</p>', description)
|
||||||
|
|
||||||
def get_publisher(self, entry):
|
def get_publisher(self, entry):
|
||||||
publisher = self.output_entry(entry.find('./p'))
|
publisher = self.output_entry(entry.xpath('./p')[1])
|
||||||
publisher = filter(lambda x: self.repub.search(x) is not None,
|
publisher = filter(lambda x: self.repub.search(x) is not None,
|
||||||
self.resplitbr.split(publisher))
|
self.resplitbr.split(publisher))
|
||||||
if not len(publisher):
|
if not len(publisher):
|
||||||
@ -243,7 +229,7 @@ class ResultList(list):
|
|||||||
return publisher.split(',')[0].strip()
|
return publisher.split(',')[0].strip()
|
||||||
|
|
||||||
def get_tags(self, entry):
|
def get_tags(self, entry):
|
||||||
tag = self.output_entry(entry.find('./p'))
|
tag = self.output_entry(entry.xpath('./p')[1])
|
||||||
tag = filter(lambda x: self.retag.search(x) is not None,
|
tag = filter(lambda x: self.retag.search(x) is not None,
|
||||||
self.resplitbr.split(tag))
|
self.resplitbr.split(tag))
|
||||||
if not len(tag):
|
if not len(tag):
|
||||||
@ -251,7 +237,7 @@ class ResultList(list):
|
|||||||
return map(lambda x: x.strip(), self.retag.sub('', tag[0]).split('/'))
|
return map(lambda x: x.strip(), self.retag.sub('', tag[0]).split('/'))
|
||||||
|
|
||||||
def get_date(self, entry, verbose):
|
def get_date(self, entry, verbose):
|
||||||
date = self.output_entry(entry.find('./p'))
|
date = self.output_entry(entry.xpath('./p')[1])
|
||||||
date = filter(lambda x: self.redate.search(x) is not None,
|
date = filter(lambda x: self.redate.search(x) is not None,
|
||||||
self.resplitbr.split(date))
|
self.resplitbr.split(date))
|
||||||
if not len(date):
|
if not len(date):
|
||||||
@ -269,12 +255,11 @@ class ResultList(list):
|
|||||||
return d
|
return d
|
||||||
|
|
||||||
def get_ISBN(self, entry):
|
def get_ISBN(self, entry):
|
||||||
isbns = self.output_entry(entry.getchildren()[2])
|
isbns = self.output_entry(entry.xpath('./p')[2])
|
||||||
isbns = filter(lambda x: self.reisbn.search(x) is not None,
|
isbns = filter(lambda x: self.reisbn.search(x) is not None,
|
||||||
self.resplitbrdiv.split(isbns))
|
self.resplitbrdiv.split(isbns))
|
||||||
if not len(isbns):
|
if not len(isbns):
|
||||||
return None
|
return None
|
||||||
#TODO: parse all tag if necessary
|
|
||||||
isbns = [self.reisbn.sub('', x) for x in isbns if check_isbn(self.reisbn.sub('', x))]
|
isbns = [self.reisbn.sub('', x) for x in isbns if check_isbn(self.reisbn.sub('', x))]
|
||||||
return sorted(isbns, cmp=lambda x,y:cmp(len(x), len(y)))[-1]
|
return sorted(isbns, cmp=lambda x,y:cmp(len(x), len(y)))[-1]
|
||||||
|
|
||||||
@ -287,7 +272,6 @@ class ResultList(list):
|
|||||||
mi.pubdate = self.get_date(entry, verbose)
|
mi.pubdate = self.get_date(entry, verbose)
|
||||||
mi.isbn = self.get_ISBN(entry)
|
mi.isbn = self.get_ISBN(entry)
|
||||||
mi.author_sort = authors_to_sort_string(authors)
|
mi.author_sort = authors_to_sort_string(authors)
|
||||||
# mi.language = self.get_language(x, verbose)
|
|
||||||
return mi
|
return mi
|
||||||
|
|
||||||
def get_individual_metadata(self, browser, linkdata, verbose):
|
def get_individual_metadata(self, browser, linkdata, verbose):
|
||||||
@ -298,36 +282,35 @@ class ResultList(list):
|
|||||||
if callable(getattr(e, 'getcode', None)) and \
|
if callable(getattr(e, 'getcode', None)) and \
|
||||||
e.getcode() == 404:
|
e.getcode() == 404:
|
||||||
return
|
return
|
||||||
raise
|
if isinstance(getattr(e, 'args', [None])[0], socket.timeout):
|
||||||
|
raise FictionwiseError(_('Fictionwise timed out. Try again later.'))
|
||||||
|
raise FictionwiseError(_('Fictionwise encountered an error.'))
|
||||||
if '<title>404 - ' in raw:
|
if '<title>404 - ' in raw:
|
||||||
report(verbose)
|
report(verbose)
|
||||||
return
|
return
|
||||||
raw = xml_to_unicode(raw, strip_encoding_pats=True,
|
raw = xml_to_unicode(raw, strip_encoding_pats=True,
|
||||||
resolve_entities=True)[0]
|
resolve_entities=True)[0]
|
||||||
try:
|
try:
|
||||||
feed = soupparser.fromstring(raw)
|
return soupparser.fromstring(raw)
|
||||||
except:
|
except:
|
||||||
return
|
try:
|
||||||
|
#remove ASCII invalid chars
|
||||||
# get results
|
return soupparser.fromstring(clean_ascii_char(raw))
|
||||||
return feed.xpath("//table[3]/tr/td[2]/table[1]/tr/td/font/table/tr/td")
|
except:
|
||||||
|
return None
|
||||||
|
|
||||||
def populate(self, entries, browser, verbose=False):
|
def populate(self, entries, browser, verbose=False):
|
||||||
inv_tags ={'script': True, 'a': False, 'font': False, 'strong': False, 'b': False,
|
inv_tags ={'script': True, 'a': False, 'font': False, 'strong': False, 'b': False,
|
||||||
'ul': False, 'span': False, 'table': True}
|
'ul': False, 'span': False}
|
||||||
inv_xpath =('descendant-or-self::p[1]',)
|
inv_xpath =('./table',)
|
||||||
#single entry
|
#single entry
|
||||||
if len(entries) == 1 and not isinstance(entries[0], str):
|
if len(entries) == 1 and not isinstance(entries[0], str):
|
||||||
try:
|
try:
|
||||||
entry = entries.xpath("//table[3]/tr/td[2]/table[1]/tr/td/font/table/tr/td")
|
entry = entries.xpath("//table[3]/tr/td[2]/table[1]/tr/td/font/table/tr/td")
|
||||||
self.clean_entry(entry, invalid_tags=inv_tags, invalid_xpath=inv_xpath)
|
self.clean_entry(entry, invalid_tags=inv_tags, invalid_xpath=inv_xpath)
|
||||||
entry = self.clean_entry(entry)
|
|
||||||
title = self.get_title(entry)
|
title = self.get_title(entry)
|
||||||
#ratings: get table for rating then drop
|
#maybe strenghten the search
|
||||||
for elt in entry.getiterator('table'):
|
ratings = self.get_rating(entry.xpath("./p/table")[1], verbose)
|
||||||
ratings = self.get_rating(elt, verbose)
|
|
||||||
elt.getprevious().drop_tree()
|
|
||||||
elt.drop_tree()
|
|
||||||
authors = self.get_authors(entry)
|
authors = self.get_authors(entry)
|
||||||
except Exception, e:
|
except Exception, e:
|
||||||
if verbose:
|
if verbose:
|
||||||
@ -340,13 +323,11 @@ class ResultList(list):
|
|||||||
for x in entries:
|
for x in entries:
|
||||||
try:
|
try:
|
||||||
entry = self.get_individual_metadata(browser, x, verbose)
|
entry = self.get_individual_metadata(browser, x, verbose)
|
||||||
|
entry = entry.xpath("//table[3]/tr/td[2]/table[1]/tr/td/font/table/tr/td")[0]
|
||||||
self.clean_entry(entry, invalid_tags=inv_tags, invalid_xpath=inv_xpath)
|
self.clean_entry(entry, invalid_tags=inv_tags, invalid_xpath=inv_xpath)
|
||||||
title = self.get_title(entry)
|
title = self.get_title(entry)
|
||||||
#ratings: get table for rating then drop
|
#maybe strenghten the search
|
||||||
for elt in entry.getiterator('table'):
|
ratings = self.get_rating(entry.xpath("./p/table")[1], verbose)
|
||||||
ratings = self.get_rating(elt, verbose)
|
|
||||||
elt.getprevious().drop_tree()
|
|
||||||
elt.drop_tree()
|
|
||||||
authors = self.get_authors(entry)
|
authors = self.get_authors(entry)
|
||||||
except Exception, e:
|
except Exception, e:
|
||||||
if verbose:
|
if verbose:
|
||||||
@ -361,7 +342,7 @@ def search(title=None, author=None, publisher=None, isbn=None,
|
|||||||
keywords=None):
|
keywords=None):
|
||||||
br = browser()
|
br = browser()
|
||||||
entries = Query(title=title, author=author, publisher=publisher,
|
entries = Query(title=title, author=author, publisher=publisher,
|
||||||
keywords=keywords, max_results=max_results)(br, verbose, timeout = 10.)
|
keywords=keywords, max_results=max_results)(br, verbose, timeout = 15.)
|
||||||
|
|
||||||
#List of entry
|
#List of entry
|
||||||
ans = ResultList()
|
ans = ResultList()
|
||||||
|
Loading…
x
Reference in New Issue
Block a user