mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Threading in fictionwise and some cleanup
This commit is contained in:
parent
7e7eb2cad3
commit
1610a739af
@ -4,6 +4,7 @@ __copyright__ = '2010, sengian <sengian1@gmail.com>'
|
|||||||
__docformat__ = 'restructuredtext en'
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
import sys, textwrap, re, traceback, socket
|
import sys, textwrap, re, traceback, socket
|
||||||
|
from threading import Thread
|
||||||
from Queue import Queue
|
from Queue import Queue
|
||||||
from urllib import urlencode
|
from urllib import urlencode
|
||||||
|
|
||||||
@ -17,7 +18,7 @@ from calibre.library.comments import sanitize_comments_html
|
|||||||
from calibre.ebooks.metadata.fetch import MetadataSource
|
from calibre.ebooks.metadata.fetch import MetadataSource
|
||||||
from calibre.utils.config import OptionParser
|
from calibre.utils.config import OptionParser
|
||||||
from calibre.utils.date import parse_date, utcnow
|
from calibre.utils.date import parse_date, utcnow
|
||||||
from calibre.utils.cleantext import clean_ascii_chars
|
from calibre.utils.cleantext import clean_ascii_chars, unescape
|
||||||
|
|
||||||
class Fictionwise(MetadataSource): # {{{
|
class Fictionwise(MetadataSource): # {{{
|
||||||
|
|
||||||
@ -40,6 +41,44 @@ class Fictionwise(MetadataSource): # {{{
|
|||||||
class FictionwiseError(Exception):
|
class FictionwiseError(Exception):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
class BrowserThread(Thread):
    """Fetch a single URL in a worker thread and parse the response.

    The tree returned by ``soupparser.fromstring`` is stored in
    ``self.result``.  ``self.result`` stays ``None`` when the page is
    missing (HTTP 404 or a "404" title in the body) or cannot be parsed.
    Read it with :meth:`get_result` once the thread has been joined.
    """

    def __init__(self, url, verbose=False, timeout=10., ex=Exception, name='Meta'):
        """Store the fetch parameters; no network access happens here.

        :param url: address handed to ``browser().open_novisit``.
        :param verbose: forwarded to ``report()`` when a fetch fails.
        :param timeout: timeout passed to ``open_novisit``.
        :param ex: exception class raised from :meth:`run` on fetch errors.
        :param name: plugin name interpolated into the error messages.
        """
        Thread.__init__(self)
        self.url = url
        self.ex = ex
        self.plugname = name
        self.verbose = verbose
        self.timeout = timeout
        self.result = None

    def get_result(self):
        """Return the parsed page, or ``None`` if nothing usable was fetched."""
        return self.result

    def run(self):
        """Download ``self.url`` and store the parsed document in ``self.result``.

        :raises self.ex: when the download fails for any reason other than
            an HTTP 404 (a distinct message is used for socket timeouts).
        """
        try:
            raw = browser().open_novisit(self.url, timeout=self.timeout).read()
        except Exception as e:
            report(self.verbose)
            if callable(getattr(e, 'getcode', None)) and e.getcode() == 404:
                # A missing page is not an error: leave result as None and
                # stop here instead of falling through to the generic raise.
                self.result = None
                return
            # ``args`` may be missing or empty; never index into nothing.
            args = getattr(e, 'args', None) or (None,)
            if isinstance(args[0], socket.timeout):
                raise self.ex(_('%s timed out. Try again later.') % self.plugname)
            raise self.ex(_('%s encountered an error.') % self.plugname)

        if '<title>404 - ' in raw:
            # The server answered with a "404" page body; do not parse it
            # as if it were a real entry.
            report(self.verbose)
            self.result = None
            return

        raw = xml_to_unicode(raw, strip_encoding_pats=True,
                resolve_entities=True)[0]
        try:
            self.result = soupparser.fromstring(raw)
        except Exception:
            try:
                # Retry after removing invalid ASCII characters.
                self.result = soupparser.fromstring(clean_ascii_chars(raw))
            except Exception:
                self.result = None
|
||||||
|
|
||||||
|
|
||||||
def report(verbose):
|
def report(verbose):
|
||||||
@ -180,10 +219,13 @@ class ResultList(list):
|
|||||||
for elt in elts:
|
for elt in elts:
|
||||||
elt.drop_tree()
|
elt.drop_tree()
|
||||||
|
|
||||||
def output_entry(self, entry, prettyout = True, htmlrm="\d+"):
|
def output_entry(self, entry, prettyout = True, rmhtmlchar=True):
|
||||||
out = tostring(entry, pretty_print=prettyout)
|
out = tostring(entry, pretty_print=prettyout)
|
||||||
#try to work around tostring to remove this encoding for exemle
|
#remove html chars
|
||||||
reclean = re.compile('(\n+|\t+|\r+|&#'+htmlrm+';)')
|
if rmhtmlchar:
|
||||||
|
out = unescape(out, rm=True)
|
||||||
|
# Remove \n\t\r.
|
||||||
|
reclean = re.compile('(\n+|\t+|\r+)')
|
||||||
return reclean.sub('', out)
|
return reclean.sub('', out)
|
||||||
|
|
||||||
def get_title(self, entry):
|
def get_title(self, entry):
|
||||||
@ -211,7 +253,7 @@ class ResultList(list):
|
|||||||
return float(1.25*sum(k*v for (k, v) in hval.iteritems())/sum(hval.itervalues()))
|
return float(1.25*sum(k*v for (k, v) in hval.iteritems())/sum(hval.itervalues()))
|
||||||
|
|
||||||
def get_description(self, entry):
|
def get_description(self, entry):
|
||||||
description = self.output_entry(entry.xpath('./p')[1],htmlrm="")
|
description = self.output_entry(entry.xpath('./p')[1],rmhtmlchar=False)
|
||||||
description = self.redesc.search(description)
|
description = self.redesc.search(description)
|
||||||
if not description or not description.group("desc"):
|
if not description or not description.group("desc"):
|
||||||
return None
|
return None
|
||||||
@ -265,9 +307,24 @@ class ResultList(list):
|
|||||||
isbns = [self.reisbn.sub('', x) for x in isbns if check_isbn(self.reisbn.sub('', x))]
|
isbns = [self.reisbn.sub('', x) for x in isbns if check_isbn(self.reisbn.sub('', x))]
|
||||||
return sorted(isbns, cmp=lambda x,y:cmp(len(x), len(y)))[-1]
|
return sorted(isbns, cmp=lambda x,y:cmp(len(x), len(y)))[-1]
|
||||||
|
|
||||||
def fill_MI(self, entry, title, authors, ratings, verbose):
|
def fill_MI(self, data, verbose):
|
||||||
|
inv_tags ={'script': True, 'a': False, 'font': False, 'strong': False, 'b': False,
|
||||||
|
'ul': False, 'span': False}
|
||||||
|
inv_xpath =('./table',)
|
||||||
|
try:
|
||||||
|
entry = data.xpath("//table[3]/tr/td[2]/table[1]/tr/td/font/table/tr/td")[0]
|
||||||
|
self.clean_entry(entry, invalid_tags=inv_tags, invalid_xpath=inv_xpath)
|
||||||
|
title = self.get_title(entry)
|
||||||
|
authors = self.get_authors(entry)
|
||||||
|
except Exception, e:
|
||||||
|
if verbose:
|
||||||
|
print _('Failed to get all details for an entry')
|
||||||
|
print e
|
||||||
|
return None
|
||||||
mi = MetaInformation(title, authors)
|
mi = MetaInformation(title, authors)
|
||||||
mi.rating = ratings
|
ratings = entry.xpath("./p/table")
|
||||||
|
if len(ratings) >= 2:
|
||||||
|
mi.rating = self.get_rating(ratings[1], verbose)
|
||||||
mi.comments = self.get_description(entry)
|
mi.comments = self.get_description(entry)
|
||||||
mi.publisher = self.get_publisher(entry)
|
mi.publisher = self.get_publisher(entry)
|
||||||
mi.tags = self.get_tags(entry)
|
mi.tags = self.get_tags(entry)
|
||||||
@ -276,67 +333,36 @@ class ResultList(list):
|
|||||||
mi.author_sort = authors_to_sort_string(authors)
|
mi.author_sort = authors_to_sort_string(authors)
|
||||||
return mi
|
return mi
|
||||||
|
|
||||||
def get_individual_metadata(self, browser, linkdata, verbose):
|
def producer(self, q, data, verbose=False):
|
||||||
try:
|
for x in data:
|
||||||
raw = browser.open_novisit(self.BASE_URL + linkdata).read()
|
thread = BrowserThread(self.BASE_URL+x, verbose=verbose, ex=FictionwiseError,
|
||||||
except Exception, e:
|
name='Fictionwise')
|
||||||
report(verbose)
|
thread.start()
|
||||||
if callable(getattr(e, 'getcode', None)) and \
|
q.put(thread, True)
|
||||||
e.getcode() == 404:
|
|
||||||
return
|
|
||||||
if isinstance(getattr(e, 'args', [None])[0], socket.timeout):
|
|
||||||
raise FictionwiseError(_('Fictionwise timed out. Try again later.'))
|
|
||||||
raise FictionwiseError(_('Fictionwise encountered an error.'))
|
|
||||||
if '<title>404 - ' in raw:
|
|
||||||
report(verbose)
|
|
||||||
return
|
|
||||||
raw = xml_to_unicode(raw, strip_encoding_pats=True,
|
|
||||||
resolve_entities=True)[0]
|
|
||||||
try:
|
|
||||||
return soupparser.fromstring(raw)
|
|
||||||
except:
|
|
||||||
try:
|
|
||||||
#remove ASCII invalid chars
|
|
||||||
return soupparser.fromstring(clean_ascii_chars(raw))
|
|
||||||
except:
|
|
||||||
return None
|
|
||||||
|
|
||||||
def populate(self, entries, browser, verbose=False):
|
def consumer(self, q, total_entries, verbose=False):
|
||||||
inv_tags ={'script': True, 'a': False, 'font': False, 'strong': False, 'b': False,
|
while len(self) < total_entries:
|
||||||
'ul': False, 'span': False}
|
thread = q.get(True)
|
||||||
inv_xpath =('./table',)
|
thread.join()
|
||||||
#single entry
|
mi = thread.get_result()
|
||||||
|
if mi is None:
|
||||||
|
self.append(None)
|
||||||
|
else:
|
||||||
|
self.append(self.fill_MI(mi, verbose))
|
||||||
|
|
||||||
|
def populate(self, entries, verbose=False, brcall=3):
|
||||||
if len(entries) == 1 and not isinstance(entries[0], str):
|
if len(entries) == 1 and not isinstance(entries[0], str):
|
||||||
try:
|
#single entry
|
||||||
entry = entries.xpath("//table[3]/tr/td[2]/table[1]/tr/td/font/table/tr/td")
|
self.append(self.fill_MI(entries[0], verbose))
|
||||||
self.clean_entry(entry, invalid_tags=inv_tags, invalid_xpath=inv_xpath)
|
|
||||||
title = self.get_title(entry)
|
|
||||||
#maybe strenghten the search
|
|
||||||
ratings = self.get_rating(entry.xpath("./p/table")[1], verbose)
|
|
||||||
authors = self.get_authors(entry)
|
|
||||||
except Exception, e:
|
|
||||||
if verbose:
|
|
||||||
print _('Failed to get all details for an entry')
|
|
||||||
print e
|
|
||||||
return
|
|
||||||
self.append(self.fill_MI(entry, title, authors, ratings, verbose))
|
|
||||||
else:
|
else:
|
||||||
#multiple entries
|
#multiple entries
|
||||||
for x in entries:
|
q = Queue(brcall)
|
||||||
try:
|
prod_thread = Thread(target=self.producer, args=(q, entries, verbose))
|
||||||
entry = self.get_individual_metadata(browser, x, verbose)
|
cons_thread = Thread(target=self.consumer, args=(q, len(entries), verbose))
|
||||||
entry = entry.xpath("//table[3]/tr/td[2]/table[1]/tr/td/font/table/tr/td")[0]
|
prod_thread.start()
|
||||||
self.clean_entry(entry, invalid_tags=inv_tags, invalid_xpath=inv_xpath)
|
cons_thread.start()
|
||||||
title = self.get_title(entry)
|
prod_thread.join()
|
||||||
#maybe strenghten the search
|
cons_thread.join()
|
||||||
ratings = self.get_rating(entry.xpath("./p/table")[1], verbose)
|
|
||||||
authors = self.get_authors(entry)
|
|
||||||
except Exception, e:
|
|
||||||
if verbose:
|
|
||||||
print _('Failed to get all details for an entry')
|
|
||||||
print e
|
|
||||||
continue
|
|
||||||
self.append(self.fill_MI(entry, title, authors, ratings, verbose))
|
|
||||||
|
|
||||||
|
|
||||||
def search(title=None, author=None, publisher=None, isbn=None,
|
def search(title=None, author=None, publisher=None, isbn=None,
|
||||||
@ -349,7 +375,7 @@ def search(title=None, author=None, publisher=None, isbn=None,
|
|||||||
#List of entry
|
#List of entry
|
||||||
ans = ResultList()
|
ans = ResultList()
|
||||||
ans.populate(entries, br, verbose)
|
ans.populate(entries, br, verbose)
|
||||||
return ans
|
return [x for x in ans if x is not None]
|
||||||
|
|
||||||
|
|
||||||
def option_parser():
|
def option_parser():
|
||||||
@ -391,3 +417,5 @@ def main(args=sys.argv):
|
|||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
sys.exit(main())
|
sys.exit(main())
|
||||||
|
|
||||||
|
# calibre-debug -e "H:\Mes eBooks\Developpement\calibre\src\calibre\ebooks\metadata\fictionwise.py" -m 5 -a gore -v>data.html
|
@ -302,9 +302,7 @@ class ResultList(list):
|
|||||||
def populate(self, entries, verbose=False, brcall=3):
|
def populate(self, entries, verbose=False, brcall=3):
|
||||||
if len(entries) == 1 and not isinstance(entries[0], str):
|
if len(entries) == 1 and not isinstance(entries[0], str):
|
||||||
#single entry
|
#single entry
|
||||||
mi = self.fill_MI(entries[0], verbose)
|
self.append(self.fill_MI(entries[0], verbose))
|
||||||
if mi:
|
|
||||||
self.append(mi)
|
|
||||||
else:
|
else:
|
||||||
#multiple entries
|
#multiple entries
|
||||||
q = Queue(brcall)
|
q = Queue(brcall)
|
||||||
@ -364,7 +362,7 @@ def search(title=None, author=None, publisher=None, isbn=None,
|
|||||||
#List of entry
|
#List of entry
|
||||||
ans = ResultList()
|
ans = ResultList()
|
||||||
ans.populate(entries, verbose)
|
ans.populate(entries, verbose)
|
||||||
return [x for x in ans if x]
|
return [x for x in ans if x is not None]
|
||||||
|
|
||||||
def check_for_cover(isbn):
|
def check_for_cover(isbn):
|
||||||
br = browser()
|
br = browser()
|
||||||
|
@ -3,7 +3,8 @@ __license__ = 'GPL 3'
|
|||||||
__copyright__ = '2010, sengian <sengian1@gmail.com>'
|
__copyright__ = '2010, sengian <sengian1@gmail.com>'
|
||||||
__docformat__ = 'restructuredtext en'
|
__docformat__ = 'restructuredtext en'
|
||||||
|
|
||||||
import re
|
import re, htmlentitydefs
|
||||||
|
from functools import partial
|
||||||
|
|
||||||
_ascii_pat = None
|
_ascii_pat = None
|
||||||
|
|
||||||
@ -21,3 +22,32 @@ def clean_ascii_chars(txt, charlist=None):
|
|||||||
pat = re.compile(u'|'.join(map(unichr, charlist)))
|
pat = re.compile(u'|'.join(map(unichr, charlist)))
|
||||||
return pat.sub('', txt)
|
return pat.sub('', txt)
|
||||||
|
|
||||||
|
##
|
||||||
|
# Fredrik Lundh: http://effbot.org/zone/re-sub.htm#unescape-html
|
||||||
|
# Removes HTML or XML character references and entities from a text string.
|
||||||
|
#
|
||||||
|
# @param text The HTML (or XML) source text.
|
||||||
|
# @return The plain text, as a Unicode string, if necessary.
|
||||||
|
|
||||||
|
def unescape(text, rm=False, rchar=u''):
    """Replace HTML/XML character references and entities in *text*.

    Adapted from Fredrik Lundh's ``unescape`` recipe.

    :param text: the HTML (or XML) source text.
    :param rm: when true, every match that is not returned as a decoded
        numeric reference is replaced by *rchar*.
    :param rchar: replacement string used when *rm* is true.
    :return: *text* with each ``&...;`` match substituted.

    With ``rm`` false, numeric references and known named entities are
    decoded and anything unrecognised is left untouched.
    """
    # NOTE(review): with rm=True valid numeric references are still decoded,
    # while named entities and malformed references are replaced by rchar;
    # confirm this asymmetry is what callers expect.
    def fixup(m, rm=rm, rchar=rchar):
        text = m.group(0)
        if text[:2] == "&#":
            # character reference
            try:
                if text[:3].lower() == "&#x":
                    return unichr(int(text[3:-1], 16))
                return unichr(int(text[2:-1]))
            except (ValueError, OverflowError):
                # Not a number, or a code point no character exists for
                # (huge values raise OverflowError rather than ValueError).
                pass
        else:
            # named entity
            try:
                text = unichr(htmlentitydefs.name2codepoint[text[1:-1]])
            except KeyError:
                pass
        if rm:
            return rchar  # replace by char
        return text  # leave as is
    return re.sub(r"&#?\w+;", fixup, text)
|
Loading…
x
Reference in New Issue
Block a user