From 41b4a5dd96c5f129a915445e9a715e2d1988fd3f Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Thu, 24 Jun 2010 08:30:18 -0600
Subject: [PATCH 1/4] Metadata download: Filter out non-book results. Also
 sort results by availability of covers for the isbn. Fixes #5946 (fix file
 plugin postprocessing and update metadata download sorting)

---
 src/calibre/ebooks/metadata/fetch.py | 102 +++++++++++++++++++++++----
 1 file changed, 90 insertions(+), 12 deletions(-)

diff --git a/src/calibre/ebooks/metadata/fetch.py b/src/calibre/ebooks/metadata/fetch.py
index d12c668e0d..db6ad0278d 100644
--- a/src/calibre/ebooks/metadata/fetch.py
+++ b/src/calibre/ebooks/metadata/fetch.py
@@ -3,17 +3,18 @@ __license__ = 'GPL 3'
 __copyright__ = '2009, Kovid Goyal '
 __docformat__ = 'restructuredtext en'
 
-import traceback, sys, textwrap, re
+import traceback, sys, textwrap, re, urllib2
 from threading import Thread
 
-from calibre import prints
+from calibre import prints, browser
 from calibre.utils.config import OptionParser
 from calibre.utils.logging import default_log
 from calibre.customize import Plugin
+from calibre.ebooks.metadata.library_thing import OPENLIBRARY
 
 metadata_config = None
 
-class MetadataSource(Plugin):
+class MetadataSource(Plugin): # {{{
 
     author = 'Kovid Goyal'
 
@@ -130,7 +131,9 @@ class MetadataSource(Plugin):
     def customization_help(self):
         return 'This plugin can only be customized using the GUI'
 
-class GoogleBooks(MetadataSource):
+    # }}}
+
+class GoogleBooks(MetadataSource): # {{{
 
     name = 'Google Books'
     description = _('Downloads metadata from Google Books')
@@ -145,8 +148,9 @@ class GoogleBooks(MetadataSource):
             self.exception = e
             self.tb = traceback.format_exc()
 
+    # }}}
 
-class ISBNDB(MetadataSource):
+class ISBNDB(MetadataSource): # {{{
 
     name = 'IsbnDB'
     description = _('Downloads metadata from isbndb.com')
@@ -181,7 +185,9 @@ class ISBNDB(MetadataSource):
             'and enter your access key below.')
         return '<p>'+ans%('<a href="http://www.isbndb.com">', '</a>')
 
-class Amazon(MetadataSource):
+    # }}}
+
+class Amazon(MetadataSource): # {{{
 
     name = 'Amazon'
     metadata_type = 'social'
@@ -198,7 +204,9 @@ class Amazon(MetadataSource):
             self.exception = e
             self.tb = traceback.format_exc()
 
-class LibraryThing(MetadataSource):
+    # }}}
+
+class LibraryThing(MetadataSource): # {{{
 
     name = 'LibraryThing'
     metadata_type = 'social'
@@ -207,7 +215,6 @@ class LibraryThing(MetadataSource):
     def fetch(self):
         if not self.isbn:
             return
-        from calibre import browser
         from calibre.ebooks.metadata import MetaInformation
         import json
         br = browser()
@@ -228,6 +235,7 @@ class LibraryThing(MetadataSource):
         except Exception, e:
             self.exception = e
             self.tb = traceback.format_exc()
+    # }}}
 
 
 def result_index(source, result):
@@ -268,6 +276,27 @@ class MetadataSources(object):
         for s in self.sources:
             s.join()
 
+def filter_metadata_results(item):
+    keywords = ["audio", "tape", "cassette", "abridged", "playaway"]
+    for keyword in keywords:
+        if item.publisher and keyword in item.publisher.lower():
+            return False
+    return True
+
+class HeadRequest(urllib2.Request):
+    def get_method(self):
+        return "HEAD"
+
+def check_for_covers(items):
+    opener = browser()
+    for item in items:
+        item.has_cover = False
+        try:
+            opener.open(HeadRequest(OPENLIBRARY%item.isbn), timeout=5)
+            item.has_cover = True
+        except:
+            pass # Cover not found
+
 def search(title=None, author=None, publisher=None, isbn=None, isbndb_key=None,
         verbose=0):
     assert not(title is None and author is None and publisher is None and \
@@ -285,10 +314,59 @@ def search(title=None, author=None, publisher=None, isbn=None, isbndb_key=None,
     for fetcher in fetchers[1:]:
         merge_results(results, fetcher.results)
 
-    results = sorted(results, cmp=lambda x, y : cmp(
-        (x.comments.strip() if x.comments else ''),
-        (y.comments.strip() if y.comments else '')
-        ), reverse=True)
+    results = list(filter(filter_metadata_results, results))
+
+    check_for_covers(results)
+
+    words = ("the", "a", "an", "of", "and")
+    prefix_pat = re.compile(r'^(%s)\s+'%("|".join(words)))
+    trailing_paren_pat = re.compile(r'\(.*\)$')
+    whitespace_pat = re.compile(r'\s+')
+
+    def sort_func(x, y):
+        def cleanup_title(s):
+            s = s.strip().lower()
+            s = prefix_pat.sub(' ', s)
+            s = trailing_paren_pat.sub('', s)
+            s = whitespace_pat.sub(' ', s)
+            return s.strip()
+
+        t = cleanup_title(title)
+        x_title = cleanup_title(x.title)
+        y_title = cleanup_title(y.title)
+
+        # prefer titles that start with the search title
+        tx = cmp(t, x_title)
+        ty = cmp(t, y_title)
+        result = 0 if abs(tx) == abs(ty) else abs(tx) - abs(ty)
+
+        # then prefer titles that have a cover image
+        if result == 0:
+            result = -cmp(x.has_cover, y.has_cover)
+
+        # then prefer titles with the longest comment, within 10%
+        if result == 0:
+            cx = len(x.comments.strip() if x.comments else '')
+            cy = len(y.comments.strip() if y.comments else '')
+            t = (cx + cy) / 20
+            result = cy - cx
+            if abs(result) < t:
+                result = 0
+
+        return result
+
+    results = sorted(results, cmp=sort_func)
+
+    # if for some reason there is no comment in the top selection, go looking for one
+    if len(results) > 1:
+        if not results[0].comments or len(results[0].comments) == 0:
+            for r in results[1:]:
+                if title.lower() == r.title[:len(title)].lower() and r.comments and len(r.comments):
+                    results[0].comments = r.comments
+                    break
+
+    # for r in results:
+    #     print "{0:14.14} {1:30.30} {2:20.20} {3:6} {4}".format(r.isbn, r.title, r.publisher, len(r.comments if r.comments else ''), r.has_cover)
 
     return results, [(x.name, x.exception, x.tb) for x in
             fetchers]
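The sort introduced by this patch composes three comparisons: an exact match against the cleaned search title, then cover availability on openlibrary.org, then comment length with a ten-percent dead band. Below is a minimal standalone sketch of the filter-then-sort pipeline, written in the codebase's Python 2 idiom (cmp-style comparators); the Result class and the sample records are hypothetical stand-ins for the MetaInformation objects the fetchers actually return, while filter_metadata_results, cleanup_title and the comparator mirror the hunks above.

    import re

    class Result(object):
        # Hypothetical stand-in for the fetchers' MetaInformation objects
        def __init__(self, title, publisher, has_cover, comments):
            self.title, self.publisher = title, publisher
            self.has_cover, self.comments = has_cover, comments

    def filter_metadata_results(item):
        # Drop results whose publisher marks them as audio editions
        for keyword in ("audio", "tape", "cassette", "abridged", "playaway"):
            if item.publisher and keyword in item.publisher.lower():
                return False
        return True

    words = ("the", "a", "an", "of", "and")
    prefix_pat = re.compile(r'^(%s)\s+' % ("|".join(words)))
    trailing_paren_pat = re.compile(r'\(.*\)$')
    whitespace_pat = re.compile(r'\s+')

    def cleanup_title(s):
        # Lowercase, drop a leading article and a trailing "(...)", squeeze spaces
        s = prefix_pat.sub(' ', s.strip().lower())
        s = trailing_paren_pat.sub('', s)
        return whitespace_pat.sub(' ', s).strip()

    def make_sort_func(title):
        t = cleanup_title(title)
        def sort_func(x, y):
            # 1) an exact cleaned-title match beats any non-match
            #    (abs(cmp(...)) is 0 or 1 under CPython 2)
            tx, ty = cmp(t, cleanup_title(x.title)), cmp(t, cleanup_title(y.title))
            result = 0 if abs(tx) == abs(ty) else abs(tx) - abs(ty)
            # 2) break ties with cover availability
            if result == 0:
                result = -cmp(x.has_cover, y.has_cover)
            # 3) break remaining ties with comment length, within 10%
            if result == 0:
                cx = len(x.comments.strip() if x.comments else '')
                cy = len(y.comments.strip() if y.comments else '')
                result = cy - cx
                if abs(result) < (cx + cy) / 20:
                    result = 0
            return result
        return sort_func

    results = [
        Result('Dune (Audio Cassette)', 'Books on Tape', False, ''),
        Result('The Dune Encyclopedia', 'Berkley', False, 'Companion volume.'),
        Result('Dune', 'Ace Books', True, 'The classic desert-planet epic.'),
    ]
    results = list(filter(filter_metadata_results, results))
    results.sort(cmp=make_sort_func('Dune'))
    print [r.title for r in results]

Run under Python 2 this prints ['Dune', 'The Dune Encyclopedia']: the audio cassette edition is filtered out, and the exact title match with a cover sorts first.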

From bb5ab06f3b9e7791b9793c64c6e486b950e3b441 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Thu, 24 Jun 2010 11:56:54 -0600
Subject: [PATCH 2/4] Fix #5951 (unable to retrieve news item)

---
 resources/recipes/national_post.recipe | 39 ++++++++------------------
 1 file changed, 11 insertions(+), 28 deletions(-)

diff --git a/resources/recipes/national_post.recipe b/resources/recipes/national_post.recipe
index 4fe188934c..00eb918d02 100644
--- a/resources/recipes/national_post.recipe
+++ b/resources/recipes/national_post.recipe
@@ -7,18 +7,18 @@ class NYTimes(BasicNewsRecipe):
     __author__ = 'Krittika Goyal'
     description = 'Canadian national newspaper'
     timefmt = ' [%d %b, %Y]'
-    needs_subscription = False
     language = 'en_CA'
+    needs_subscription = False
 
     no_stylesheets = True
     #remove_tags_before = dict(name='h1', attrs={'class':'heading'})
-    #remove_tags_after = dict(name='td', attrs={'class':'newptool1'})
+    remove_tags_after = dict(name='div', attrs={'class':'npStoryTools npWidth1-6 npRight npTxtStrong'})
    remove_tags = [
       dict(name='iframe'),
-      dict(name='div', attrs={'class':'story-tools'}),
+      dict(name='div', attrs={'class':['story-tools', 'npStoryTools npWidth1-6 npRight npTxtStrong']}),
       #dict(name='div', attrs={'id':['qrformdiv', 'inSection', 'alpha-inner']}),
       #dict(name='form', attrs={'onsubmit':''}),
-      #dict(name='table', attrs={'cellspacing':'0'}),
+      dict(name='ul', attrs={'class':'npTxtAlt npGroup npTxtCentre npStoryShare npTxtStrong npTxtDim'}),
    ]
 
 #    def preprocess_html(self, soup):
@@ -37,7 +37,7 @@ class NYTimes(BasicNewsRecipe):
 
     def parse_index(self):
         soup = self.nejm_get_index()
-        div = soup.find(id='LegoText4')
+        div = soup.find(id='npContentMain')
 
         current_section = None
         current_articles = []
@@ -50,7 +50,7 @@ class NYTimes(BasicNewsRecipe):
                 current_section = self.tag_to_string(x)
                 current_articles = []
                 self.log('\tFound section:', current_section)
-            if current_section is not None and x.name == 'h3':
+            if current_section is not None and x.name == 'h5':
                 # Article found
                 title = self.tag_to_string(x)
                 a = x.find('a', href=lambda x: x and 'story' in x)
@@ -59,8 +59,8 @@ class NYTimes(BasicNewsRecipe):
                 url = a.get('href', False)
                 if not url or not title:
                     continue
-                if url.startswith('story'):
-                    url = 'http://www.nationalpost.com/todays-paper/'+url
+                #if url.startswith('story'):
+                url = 'http://www.nationalpost.com/todays-paper/'+url
                 self.log('\t\tFound article:', title)
                 self.log('\t\t\t', url)
                 current_articles.append({'title': title, 'url':url,
@@ -70,28 +70,11 @@ class NYTimes(BasicNewsRecipe):
             feeds.append((current_section, current_articles))
         return feeds
 
-
     def preprocess_html(self, soup):
-        story = soup.find(name='div', attrs={'class':'triline'})
-        page2_link = soup.find('p','pagenav')
-        if page2_link:
-            atag = page2_link.find('a',href=True)
-            if atag:
-                page2_url = atag['href']
-                if page2_url.startswith('story'):
-                    page2_url = 'http://www.nationalpost.com/todays-paper/'+page2_url
-                elif page2_url.startswith( '/todays-paper/story.html'):
-                    page2_url = 'http://www.nationalpost.com/'+page2_url
-                page2_soup = self.index_to_soup(page2_url)
-                if page2_soup:
-                    page2_content = page2_soup.find('div','story-content')
-                    if page2_content:
-                        full_story = BeautifulSoup('<div></div>')
-                        full_story.insert(0,story)
-                        full_story.insert(1,page2_content)
-                        story = full_story
+        story = soup.find(name='div', attrs={'id':'npContentMain'})
+        ##td = heading.findParent(name='td')
+        ##td.extract()
         soup = BeautifulSoup('<html><head><title>t</title></head><body></body></html>')
         body = soup.find(name='body')
         body.insert(0, story)
         return soup
-
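The parse_index changes above track a nationalpost.com redesign: the index now sits under div#npContentMain and headlines moved from h3 to h5 tags. Below is a minimal Python 2 sketch of the section/article walk the method performs, run against invented markup; treating h1 as the section marker is an assumption (the real section test lives in context lines outside the hunk), and calibre's tag_to_string is approximated with findAll(text=True). The import is calibre's bundled BeautifulSoup 3; outside calibre, `from BeautifulSoup import BeautifulSoup` from the BeautifulSoup 3 package behaves the same.

    from calibre.ebooks.BeautifulSoup import BeautifulSoup

    # Invented markup shaped like the redesigned front page
    html = '''
    <div id="npContentMain">
      <h1>News</h1>
      <h5><a href="story.html?id=1">First article</a></h5>
      <h5><a href="story.html?id=2">Second article</a></h5>
      <h1>Arts</h1>
      <h5><a href="story.html?id=3">Third article</a></h5>
    </div>
    '''

    soup = BeautifulSoup(html)
    div = soup.find(id='npContentMain')

    feeds, current_section, current_articles = [], None, []
    for x in div.findAll(['h1', 'h5']):
        text = ''.join(x.findAll(text=True)).strip()
        if x.name == 'h1':                 # assumed section marker
            if current_section and current_articles:
                feeds.append((current_section, current_articles))
            current_section, current_articles = text, []
        elif current_section is not None:  # an h5 under a section is an article
            a = x.find('a', href=lambda h: h and 'story' in h)
            if a is None:
                continue
            # urls are now always made absolute, as in the hunk above
            current_articles.append({
                'title': text,
                'url': 'http://www.nationalpost.com/todays-paper/' + a['href'],
                'date': '', 'description': ''})
    if current_section and current_articles:
        feeds.append((current_section, current_articles))
    print feeds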
From 985e65d3864fa6a8575c9bb0f76ea8089eab72fc Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Thu, 24 Jun 2010 12:06:34 -0600
Subject: [PATCH 3/4] Metadata download: Make cover check multithreaded

---
 src/calibre/ebooks/metadata/fetch.py | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/src/calibre/ebooks/metadata/fetch.py b/src/calibre/ebooks/metadata/fetch.py
index e7883d3757..0fd671f86a 100644
--- a/src/calibre/ebooks/metadata/fetch.py
+++ b/src/calibre/ebooks/metadata/fetch.py
@@ -287,15 +287,19 @@ class HeadRequest(urllib2.Request):
     def get_method(self):
         return "HEAD"
 
-def check_for_covers(items):
+def do_cover_check(item):
     opener = browser()
-    for item in items:
-        item.has_cover = False
-        try:
-            opener.open(HeadRequest(OPENLIBRARY%item.isbn), timeout=5)
-            item.has_cover = True
-        except:
-            pass # Cover not found
+    item.has_cover = False
+    try:
+        opener.open(HeadRequest(OPENLIBRARY%item.isbn), timeout=5)
+        item.has_cover = True
+    except:
+        pass # Cover not found
+
+def check_for_covers(items):
+    threads = [Thread(target=do_cover_check, args=(item,)) for item in items]
+    for t in threads: t.start()
+    for t in threads: t.join()
 
 def search(title=None, author=None, publisher=None, isbn=None, isbndb_key=None,
         verbose=0):
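This turns the serial cover probes into one short-lived thread per result, so the total wait shrinks from the sum of the HEAD requests to roughly the slowest one, each bounded by its 5 second timeout. Below is a minimal offline sketch of the fan-out/join pattern in Python 2; Item is a hypothetical stand-in for a metadata result and the OpenLibrary probe is stubbed out, so only the threading mirrors the patch.

    from threading import Thread

    class Item(object):
        # Hypothetical stand-in for a metadata result
        def __init__(self, isbn):
            self.isbn = isbn
            self.has_cover = False

    def do_cover_check(item):
        # The real code HEADs OPENLIBRARY % item.isbn with timeout=5;
        # stubbed here so the sketch runs without network access.
        item.has_cover = item.isbn.endswith('2')

    items = [Item('9780441013593'), Item('9780441013602')]
    threads = [Thread(target=do_cover_check, args=(item,)) for item in items]
    for t in threads:
        t.start()
    for t in threads:
        t.join()  # wait for every probe; the slowest one bounds the total
    print [(i.isbn, i.has_cover) for i in items]

A thread per item is reasonable here because a search yields only a handful of results; a bounded pool would be the safer choice for larger batches.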

From e7eb5b69657de4d051bd1900a27f16f501afb5b3 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Thu, 24 Jun 2010 12:07:33 -0600
Subject: [PATCH 4/4] Fix #5937 ("New Scientist" recipe problems)

---
 resources/recipes/new_scientist.recipe | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/resources/recipes/new_scientist.recipe b/resources/recipes/new_scientist.recipe
index 1727a926ed..b40be458bc 100644
--- a/resources/recipes/new_scientist.recipe
+++ b/resources/recipes/new_scientist.recipe
@@ -32,15 +32,16 @@ class NewScientist(BasicNewsRecipe):
                         }
 
     preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL|re.IGNORECASE),lambda match: '')]
-    keep_only_tags = [dict(name='div', attrs={'id':['pgtop','maincol','nsblgposts','hldgalcols']})]
+    keep_only_tags = [dict(name='div', attrs={'id':['pgtop','maincol','blgmaincol','nsblgposts','hldgalcols']})]
     remove_tags = [
                      dict(name='div' , attrs={'class':['hldBd','adline','pnl','infotext' ]})
-                    ,dict(name='div' , attrs={'id'   :['compnl','artIssueInfo','artTools']})
+                    ,dict(name='div' , attrs={'id'   :['compnl','artIssueInfo','artTools','comments','blgsocial']})
                     ,dict(name='p'   , attrs={'class':['marker','infotext'               ]})
                     ,dict(name='meta' , attrs={'name' :'description'                     })
+                    ,dict(name='a'   , attrs={'rel'  :'tag'                              })
                   ]
-    remove_tags_after = dict(attrs={'class':'nbpcopy'})
+    remove_tags_after = dict(attrs={'class':['nbpcopy','comments']})
     remove_attributes = ['height','width']
 
     feeds = [