diff --git a/resources/recipes/national_post.recipe b/resources/recipes/national_post.recipe
index 4fe188934c..00eb918d02 100644
--- a/resources/recipes/national_post.recipe
+++ b/resources/recipes/national_post.recipe
@@ -7,18 +7,18 @@ class NYTimes(BasicNewsRecipe):
     __author__ = 'Krittika Goyal'
     description = 'Canadian national newspaper'
     timefmt = ' [%d %b, %Y]'
-    needs_subscription = False
     language = 'en_CA'
+    needs_subscription = False
 
     no_stylesheets = True
     #remove_tags_before = dict(name='h1', attrs={'class':'heading'})
-    #remove_tags_after = dict(name='td', attrs={'class':'newptool1'})
+    remove_tags_after = dict(name='div', attrs={'class':'npStoryTools npWidth1-6 npRight npTxtStrong'})
     remove_tags = [
        dict(name='iframe'),
-       dict(name='div', attrs={'class':'story-tools'}),
+       dict(name='div', attrs={'class':['story-tools', 'npStoryTools npWidth1-6 npRight npTxtStrong']}),
        #dict(name='div', attrs={'id':['qrformdiv', 'inSection', 'alpha-inner']}),
        #dict(name='form', attrs={'onsubmit':''}),
-       #dict(name='table', attrs={'cellspacing':'0'}),
+       dict(name='ul', attrs={'class':'npTxtAlt npGroup npTxtCentre npStoryShare npTxtStrong npTxtDim'}),
     ]
 
     # def preprocess_html(self, soup):
@@ -37,7 +37,7 @@ class NYTimes(BasicNewsRecipe):
     def parse_index(self):
         soup = self.nejm_get_index()
 
-        div = soup.find(id='LegoText4')
+        div = soup.find(id='npContentMain')
 
         current_section = None
         current_articles = []
@@ -50,7 +50,7 @@ class NYTimes(BasicNewsRecipe):
                 current_section = self.tag_to_string(x)
                 current_articles = []
                 self.log('\tFound section:', current_section)
-            if current_section is not None and x.name == 'h3':
+            if current_section is not None and x.name == 'h5':
                 # Article found
                 title = self.tag_to_string(x)
                 a = x.find('a', href=lambda x: x and 'story' in x)
@@ -59,8 +59,8 @@ class NYTimes(BasicNewsRecipe):
                 url = a.get('href', False)
                 if not url or not title:
                     continue
-                if url.startswith('story'):
-                    url = 'http://www.nationalpost.com/todays-paper/'+url
+                #if url.startswith('story'):
+                url = 'http://www.nationalpost.com/todays-paper/'+url
                 self.log('\t\tFound article:', title)
                 self.log('\t\t\t', url)
                 current_articles.append({'title': title, 'url':url,
@@ -70,28 +70,11 @@ class NYTimes(BasicNewsRecipe):
             feeds.append((current_section, current_articles))
         return feeds
 
     def preprocess_html(self, soup):
-        story = soup.find(name='div', attrs={'class':'triline'})
-        page2_link = soup.find('p','pagenav')
-        if page2_link:
-            atag = page2_link.find('a',href=True)
-            if atag:
-                page2_url = atag['href']
-                if page2_url.startswith('story'):
-                    page2_url = 'http://www.nationalpost.com/todays-paper/'+page2_url
-                elif page2_url.startswith( '/todays-paper/story.html'):
-                    page2_url = 'http://www.nationalpost.com/'+page2_url
-                page2_soup = self.index_to_soup(page2_url)
-                if page2_soup:
-                    page2_content = page2_soup.find('div','story-content')
-                    if page2_content:
-                        full_story = BeautifulSoup('<div/>')
-                        full_story.insert(0,story)
-                        full_story.insert(1,page2_content)
-                        story = full_story
+        story = soup.find(name='div', attrs={'id':'npContentMain'})
+        ##td = heading.findParent(name='td')
+        ##td.extract()
         soup = BeautifulSoup('<html><head><title>t</title></head><body></body></html>')
         body = soup.find(name='body')
         body.insert(0, story)
         return soup
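(Note: the recipe changes above appear to track a nationalpost.com markup change: the index and article bodies now live under np-prefixed ids/classes such as npContentMain, headlines moved from h3 to h5 tags, and the old two-page story stitching in preprocess_html is dropped. For reference, parse_index() must return the standard calibre feed structure; a minimal sketch with placeholder values only:)

    # Shape expected from BasicNewsRecipe.parse_index(); values are illustrative.
    feeds = [
        ('Front Page', [
            {'title': 'Example headline',
             'url': 'http://www.nationalpost.com/todays-paper/story.html?id=123',
             'description': '',
             'date': ''},
        ]),
    ]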
diff --git a/src/calibre/ebooks/metadata/fetch.py b/src/calibre/ebooks/metadata/fetch.py
--- a/src/calibre/ebooks/metadata/fetch.py
+++ b/src/calibre/ebooks/metadata/fetch.py
-class Amazon(MetadataSource):
+    # }}}
+
+class Amazon(MetadataSource): # {{{
 
     name = 'Amazon'
     metadata_type = 'social'
@@ -198,7 +204,9 @@ class Amazon(MetadataSource):
             self.exception = e
             self.tb = traceback.format_exc()
 
-class LibraryThing(MetadataSource):
+    # }}}
+
+class LibraryThing(MetadataSource): # {{{
 
     name = 'LibraryThing'
     metadata_type = 'social'
@@ -207,7 +215,6 @@ class LibraryThing(MetadataSource):
     def fetch(self):
         if not self.isbn:
             return
-        from calibre import browser
         from calibre.ebooks.metadata import MetaInformation
         import json
         br = browser()
@@ -228,6 +235,7 @@ class LibraryThing(MetadataSource):
         except Exception, e:
             self.exception = e
             self.tb = traceback.format_exc()
+    # }}}
 
 
 def result_index(source, result):
@@ -268,6 +276,31 @@ class MetadataSources(object):
         for s in self.sources:
             s.join()
 
+def filter_metadata_results(item):
+    keywords = ["audio", "tape", "cassette", "abridged", "playaway"]
+    for keyword in keywords:
+        if item.publisher and keyword in item.publisher.lower():
+            return False
+    return True
+
+class HeadRequest(urllib2.Request):
+    def get_method(self):
+        return "HEAD"
+
+def do_cover_check(item):
+    opener = browser()
+    item.has_cover = False
+    try:
+        opener.open(HeadRequest(OPENLIBRARY%item.isbn), timeout=5)
+        item.has_cover = True
+    except:
+        pass # Cover not found
+
+def check_for_covers(items):
+    threads = [Thread(target=do_cover_check, args=(item,)) for item in items]
+    for t in threads: t.start()
+    for t in threads: t.join()
+
 def search(title=None, author=None, publisher=None, isbn=None, isbndb_key=None,
         verbose=0):
     assert not(title is None and author is None and publisher is None and \
@@ -285,10 +318,60 @@ def search(title=None, author=None, publisher=None, isbn=None, isbndb_key=None,
     for fetcher in fetchers[1:]:
         merge_results(results, fetcher.results)
 
-    results = sorted(results, cmp=lambda x, y : cmp(
-        (x.comments.strip() if x.comments else ''),
-        (y.comments.strip() if y.comments else '')
-        ), reverse=True)
+    results = list(filter(filter_metadata_results, results))
+
+    check_for_covers(results)
+
+    words = ("the", "a", "an", "of", "and")
+    prefix_pat = re.compile(r'^(%s)\s+'%("|".join(words)))
+    trailing_paren_pat = re.compile(r'\(.*\)$')
+    whitespace_pat = re.compile(r'\s+')
+
+    def sort_func(x, y):
+
+        def cleanup_title(s):
+            s = s.strip().lower()
+            s = prefix_pat.sub(' ', s)
+            s = trailing_paren_pat.sub('', s)
+            s = whitespace_pat.sub(' ', s)
+            return s.strip()
+
+        t = cleanup_title(title)
+        x_title = cleanup_title(x.title)
+        y_title = cleanup_title(y.title)
+
+        # prefer titles that exactly match the cleaned-up search title
+        tx = cmp(t, x_title)
+        ty = cmp(t, y_title)
+        result = 0 if abs(tx) == abs(ty) else abs(tx) - abs(ty)
+
+        # then prefer titles that have a cover image
+        if result == 0:
+            result = -cmp(x.has_cover, y.has_cover)
+
+        # then prefer titles with the longest comment, within 10%
+        if result == 0:
+            cx = len(x.comments.strip() if x.comments else '')
+            cy = len(y.comments.strip() if y.comments else '')
+            t = (cx + cy) / 20
+            result = cy - cx
+            if abs(result) < t:
+                result = 0
+
+        return result
+
+    results = sorted(results, cmp=sort_func)
+
+    # if for some reason there is no comment in the top selection, go looking for one
+    if len(results) > 1:
+        if not results[0].comments or len(results[0].comments) == 0:
+            for r in results[1:]:
+                if title.lower() == r.title[:len(title)].lower() and r.comments and len(r.comments):
+                    results[0].comments = r.comments
+                    break
+
+    # for r in results:
+    #     print "{0:14.14} {1:30.30} {2:20.20} {3:6} {4}".format(r.isbn, r.title, r.publisher, len(r.comments if r.comments else ''), r.has_cover)
 
     return results, [(x.name, x.exception, x.tb) for x in fetchers]
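(Note on the ranking changes above: the old code ordered results by comment length alone; the new sort_func ranks an exact match against the cleaned-up search title first, then the presence of a cover, then comment length, treating lengths within roughly 10% of the average as a tie. The sketch below restates that pipeline in modern Python 3 for clarity. The Result class is a hypothetical stand-in for calibre's fetcher results, the OPENLIBRARY template follows the public Open Library covers API, and cmp_to_key replaces the Python 2 cmp= argument the patch itself relies on.)

    import re
    import urllib.request
    from dataclasses import dataclass
    from functools import cmp_to_key
    from threading import Thread

    # Open Library answers HTTP 404 for a missing cover when default=false is set.
    OPENLIBRARY = 'https://covers.openlibrary.org/b/isbn/%s-L.jpg?default=false'

    @dataclass
    class Result:  # hypothetical stand-in for a fetcher result
        title: str
        isbn: str
        publisher: str = ''
        comments: str = ''
        has_cover: bool = False

    def is_book_edition(r):
        # Mirrors filter_metadata_results: drop likely audio editions.
        pub = (r.publisher or '').lower()
        return not any(k in pub for k in
                       ('audio', 'tape', 'cassette', 'abridged', 'playaway'))

    def check_cover(r):
        # Mirrors do_cover_check: a HEAD request is enough to test existence.
        req = urllib.request.Request(OPENLIBRARY % r.isbn, method='HEAD')
        try:
            urllib.request.urlopen(req, timeout=5)
            r.has_cover = True
        except Exception:
            pass  # cover not found, or a network error

    def rank(results, query_title):
        results = [r for r in results if is_book_edition(r)]
        threads = [Thread(target=check_cover, args=(r,)) for r in results]
        for t in threads:
            t.start()
        for t in threads:
            t.join()

        def clean(s):
            return re.sub(r'\s+', ' ', s).strip().lower()

        q = clean(query_title)

        def compare(x, y):
            # 1. exact (cleaned) title match, 2. has a cover, 3. longer comments,
            #    where lengths within ~10% of the average count as a tie.
            xm, ym = clean(x.title) == q, clean(y.title) == q
            if xm != ym:
                return -1 if xm else 1
            if x.has_cover != y.has_cover:
                return -1 if x.has_cover else 1
            cx, cy = len(x.comments or ''), len(y.comments or '')
            return 0 if abs(cy - cx) < (cx + cy) / 20 else cy - cx

        return sorted(results, key=cmp_to_key(compare))

(The patch's cleanup_title additionally strips leading articles ("the", "a", "an", "of", "and") and a trailing parenthetical before comparing; the sketch keeps only the whitespace and case normalisation to stay short.)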
diff --git a/src/calibre/gui2/dialogs/config/add_save.ui b/src/calibre/gui2/dialogs/config/add_save.ui
index a29c0fd2e6..64a8137aa1 100644
--- a/src/calibre/gui2/dialogs/config/add_save.ui
+++ b/src/calibre/gui2/dialogs/config/add_save.ui
@@ -181,14 +181,14 @@ Title match ignores leading indefinite articles ("the", "a",