diff --git a/resources/recipes/globe_and_mail.recipe b/resources/recipes/globe_and_mail.recipe index b2a9915250..b6e6b5c25b 100644 --- a/resources/recipes/globe_and_mail.recipe +++ b/resources/recipes/globe_and_mail.recipe @@ -26,31 +26,12 @@ class GlobeAndMail(BasicNewsRecipe): #credit {margin-top:0px;} .tag {font-size: 22pt;}''' description = 'Canada\'s national newspaper' - remove_tags_before = dict(id="article-top") - remove_tags = [ - {'id':['util', 'article-tabs', 'comments', 'article-relations', - 'gallery-controls', 'video', 'galleryLoading','deck','header', - 'toolsBottom'] }, - {'class':['credit','inline-img-caption','tab-pointer'] }, - dict(name='div', attrs={'id':['lead-photo', 'most-popular-story']}), - dict(name='div', attrs={'class':'right'}), - dict(name='div', attrs={'id':'footer'}), - dict(name='div', attrs={'id':'beta-msg'}), - dict(name='img', attrs={'class':'headshot'}), - dict(name='div', attrs={'class':'brand'}), - dict(name='div', attrs={'id':'nav-wrap'}), - dict(name='div', attrs={'id':'featureTopics'}), - dict(name='div', attrs={'id':'videoNav'}), - dict(name='div', attrs={'id':'blog-header'}), - dict(name='div', attrs={'id':'right-rail'}), - dict(name='div', attrs={'id':'group-footer-container'}), - dict(name=['iframe', 'style']) - ] - remove_attributes = ['style'] - remove_tags_after = [{'id':['article-content']}, - {'class':['pull','inline-img'] }, - dict(name='img', attrs={'class':'inline-media-embed'}), - ] + keep_only_tags = [dict(name='article')] + remove_tags = [dict(name='aside'), + dict(name='footer'), + dict(name='div', attrs={'class':(lambda x: isinstance(x, (str,unicode)) and 'articlecommentcountholder' in x.split(' '))}), + dict(name='ul', attrs={'class':(lambda x: isinstance(x, (str,unicode)) and 'articletoolbar' in x.split(' '))}), + ] feeds = [ (u'Latest headlines', u'http://www.theglobeandmail.com/?service=rss'), (u'Top stories', u'http://www.theglobeandmail.com/?service=rss&feed=topstories'), diff --git a/resources/recipes/volksrant.recipe b/resources/recipes/volksrant.recipe index dcc8c042ee..6f3ec4ce0d 100644 --- a/resources/recipes/volksrant.recipe +++ b/resources/recipes/volksrant.recipe @@ -11,6 +11,7 @@ __docformat__ = 'restructuredtext en' on 10/10/10 to include function to grab print version of articles ''' +from datetime import date from calibre.web.feeds.news import BasicNewsRecipe ''' added by Tony Stegall @@ -27,7 +28,6 @@ class AdvancedUserRecipe1249039563(BasicNewsRecipe): no_stylesheets = True language = 'nl' - extra_css = ''' body{font-family:Arial,Helvetica,sans-serif; font-size:small;} h1{font-size:large;} @@ -43,14 +43,16 @@ class AdvancedUserRecipe1249039563(BasicNewsRecipe): def get_obfuscated_article(self, url): br = self.get_browser() + print 'THE CURRENT URL IS: ', url br.open(url) + year = date.today().year try: - response = br.follow_link(url_regex='.*?(2010)(\\/)(article)(\\/)(print)(\\/)', nr = 0) - html = response.read() + response = br.follow_link(url_regex='.*?(%d)(\\/)(article)(\\/)(print)(\\/)'%year, nr = 0) + html = response.read() except: - response = br.open(url) - html = response.read() + response = br.open(url) + html = response.read() self.temp_files.append(PersistentTemporaryFile('_fa.html')) self.temp_files[-1].write(html) @@ -59,19 +61,22 @@ class AdvancedUserRecipe1249039563(BasicNewsRecipe): ############################################################################################################### - feeds = [ - (u'Laatste Nieuws', u'http://volkskrant.nl/rss/laatstenieuws.rss'), - (u'Binnenlands nieuws', u'http://volkskrant.nl/rss/nederland.rss'), - (u'Buitenlands nieuws', u'http://volkskrant.nl/rss/internationaal.rss'), - (u'Economisch nieuws', u'http://volkskrant.nl/rss/economie.rss'), - (u'Sportnieuws', u'http://volkskrant.nl/rss/sport.rss'), - (u'Kunstnieuws', u'http://volkskrant.nl/rss/kunst.rss'), + ''' + Change Log: + Date: 10/15/2010 + Feeds updated by Martin Tarenskeen + ''' + + feeds = [ + (u'Laatste Nieuws', u'http://www.volkskrant.nl/rss/laatstenieuws.rss'), + (u'Binnenland', u'http://www.volkskrant.nl/rss/nederland.rss'), + (u'Buitenland', u'http://www.volkskrant.nl/rss/internationaal.rss'), + (u'Economie', u'http://www.volkskrant.nl/rss/economie.rss'), + (u'Sport', u'http://www.volkskrant.nl/rss/sport.rss'), + (u'Cultuur', u'http://www.volkskrant.nl/rss/kunst.rss'), + (u'Gezondheid & Wetenschap', u'http://www.volkskrant.nl/rss/wetenschap.rss'), + (u'Internet & Media', u'http://www.volkskrant.nl/rss/media.rss') ] - #both of these rss feeds link back to the main volksrant.nl url a.k.a Broken - #If someone happens to know the correct paths then they can put them in here - #(u'Wetenschapsnieuws', u'http://feeds.feedburner.com/DeVolkskrantWetenschap'), - #(u'Technologienieuws', u'http://feeds.feedburner.com/vkmedia') - ] ''' example for formating