diff --git a/resources/recipes/national_post.recipe b/resources/recipes/national_post.recipe
index 660f638e63..7fe00032a5 100644
--- a/resources/recipes/national_post.recipe
+++ b/resources/recipes/national_post.recipe
@@ -1,20 +1,87 @@
-from calibre.web.feeds.news import BasicNewsRecipe
+import string, re
+from calibre import strftime
+from calibre.web.feeds.recipes import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
 
-class AdvancedUserRecipe1261379503(BasicNewsRecipe):
-    title = u'National Post'
-    language = 'en_CA'
-    __author__ = 'Nick Redding'
-    description = u"News from Canada"
-    oldest_article = 2
-    max_articles_per_feed = 25
+class NYTimes(BasicNewsRecipe):
 
-    keep_only_tags = [dict(name='div', attrs={'id':'content'})]
-    remove_tags = [dict(name='div', attrs={'class':'story-tools'}),dict(name='div', attrs={'class':'newsblock'}),dict(name='p', attrs={'class':'border-top'}),dict(name='div', attrs={'id':'footer'})]
+    # NOTE(review): the class name looks copied from another recipe; it is
+    # kept unchanged here so the identifier stays stable.
+    title = 'National Post'
+    __author__ = 'Krittika Goyal'
+    description = 'Canadian national newspaper'
+    language = 'en_CA'
+    timefmt = ' [%d %b, %Y]'
+    needs_subscription = False
+
+    no_stylesheets = True
+    remove_tags = [
+        dict(name='iframe'),
+        dict(name='div', attrs={'class':'story-tools'}),
+    ]
 
-    feeds = [(u'News Headlines', u'http://www.nationalpost.com/scripts/sp6query.aspx?catalog=ntnp&type=stry&tags=section| news'),
-    (u'FP Headlines', u'http://www.nationalpost.com/scripts/sp6query.aspx?catalog=ntnp&type=stry&tags=section| financial%20post|storytype|business'),
-    (u'Arts & Life Headlines', u'http://www.nationalpost.com/scripts/sp6query.aspx?catalog=ntnp&type=stry&tags=section| arts%20%26%20life|storytype|news'),
-    (u'Canada News', u'http://www.nationalpost.com/scripts/sp6query.aspx?catalog=ntnp&type=stry&tags=storytyp e|webcanada&feed=rss'),
-    (u'World News', u'http://www.nationalpost.com/scripts/sp6query.aspx?catalog=ntnp&type=stry&tags=storytyp e|webworld&feed=rss'),(u'Editorial', u'http://www.nationalpost.com/scripts/sp6query.aspx?catalog=ntnp&type=stry&tags=section| editorial'),
-    (u'FP Opinion', u'http://www.nationalpost.com/scripts/columnists.aspx?publication=national+post&columnty pe=fp')]
+    def nejm_get_index(self):
+        """Download and parse the table of contents page of the paper."""
+        return self.index_to_soup('http://www.nationalpost.com/todays-paper/index.html')
+
+    def parse_index(self):
+        """
+        Build the list of (section title, articles) pairs from the TOC.
+
+        Every h4 inside the LegoText4 element starts a section; each h3
+        after it that links to a story becomes an article of that section.
+        """
+        soup = self.nejm_get_index()
+
+        div = soup.find(id='LegoText4')
+        if div is None:
+            # TOC container missing: return no feeds instead of failing
+            # with an AttributeError on None below.
+            self.log('\tCould not find the table of contents')
+            return []
+
+        current_section = None
+        current_articles = []
+        feeds = []
+        for x in div.findAll(True):
+            if x.name == 'h4':
+                # Section found; flush the previous one first
+                if current_articles and current_section:
+                    feeds.append((current_section, current_articles))
+                current_section = self.tag_to_string(x)
+                current_articles = []
+                self.log('\tFound section:', current_section)
+            if current_section is not None and x.name == 'h3':
+                # Article found
+                title = self.tag_to_string(x)
+                a = x.find('a', href=lambda href: href and 'story' in href)
+                if a is None:
+                    continue
+                url = a.get('href', False)
+                if not url or not title:
+                    continue
+                if url.startswith('story'):
+                    # Relative link: make it absolute
+                    url = 'http://www.nationalpost.com/todays-paper/'+url
+                self.log('\t\tFound article:', title)
+                self.log('\t\t\t', url)
+                current_articles.append({'title': title, 'url':url,
+                    'description':'', 'date':''})
+
+        if current_articles and current_section:
+            feeds.append((current_section, current_articles))
+
+        return feeds
+
+    def preprocess_html(self, soup):
+        """Return a new document whose body holds only the story div."""
+        story = soup.find(name='div', attrs={'class':'triline'})
+        if story is None:
+            # No story container found: keep the page as downloaded.
+            return soup
+        # soup.find('body') only succeeds if the new document has a body tag.
+        soup = BeautifulSoup('<html><head><title>t</title></head><body></body></html>')
+        body = soup.find(name='body')
+        body.insert(0, story)
+        return soup
 
diff --git a/src/calibre/gui2/library.py b/src/calibre/gui2/library.py
index 8c43e25a42..488d535c73 100644
--- a/src/calibre/gui2/library.py
+++ b/src/calibre/gui2/library.py
@@ -580,7 +580,7 @@ class BooksModel(QAbstractTableModel):
         def tags(r):
             tags = self.db.data[r][tgdx]
             if tags:
-                return ', '.join(tags.split(','))
+                return ', '.join(sorted(tags.split(',')))
 
         def series(r):
             series = self.db.data[r][srdx]
diff --git a/src/calibre/web/feeds/recipes/collection.py b/src/calibre/web/feeds/recipes/collection.py
index fb59dd08f7..478947fcd9 100644
--- a/src/calibre/web/feeds/recipes/collection.py
+++ b/src/calibre/web/feeds/recipes/collection.py
@@ -20,7 +20,8 @@ NS = 'http://calibre-ebook.com/recipe_collection'
 E = ElementMaker(namespace=NS, nsmap={None:NS})
 
 def iterate_over_builtin_recipe_files():
-    exclude = ['craigslist', 'iht', 'outlook_india', 'toronto_sun']
+    exclude = ['craigslist', 'iht', 'outlook_india', 'toronto_sun',
+            'indian_express', 'india_today', 'toi']
     d = os.path.dirname
     base = os.path.join(d(d(d(d(d(d(os.path.abspath(__file__))))))), 'resources', 'recipes')
     for x in os.walk(base):