diff --git a/resources/images/news/discover_magazine.png b/resources/images/news/discover_magazine.png
new file mode 100644
index 0000000000..b63cf518d1
Binary files /dev/null and b/resources/images/news/discover_magazine.png differ
diff --git a/resources/recipes/ap.recipe b/resources/recipes/ap.recipe
index 0118cf0726..2b9f9e5571 100644
--- a/resources/recipes/ap.recipe
+++ b/resources/recipes/ap.recipe
@@ -6,31 +6,38 @@ class AssociatedPress(BasicNewsRecipe):
     title = u'Associated Press'
     description = 'Global news'
-    __author__ = 'Kovid Goyal'
+    __author__ = 'Kovid Goyal and Sujata Raman'
     use_embedded_content = False
     language = 'en'
-
+    no_stylesheets = True
     max_articles_per_feed = 15
     html2lrf_options = ['--force-page-break-before-tag="chapter"']
     preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
         [
-            (r'.*?' , lambda match : ''),
-            (r'.*?', lambda match : ''),
-            (r'.*?', lambda match : ''),
-            (r'.*?', lambda match : ''),
-            (r'.*?', lambda match : ''),
-            (r'.*?', lambda match : ''),
-            (r'', lambda match : ''),
-            (r'Learn more about our Privacy Policy.*?', lambda match : ''),
+            (r'', lambda match : ''),
         ]
     ]
+    keep_only_tags = [ dict(name='div', attrs={'class':['body']}),
+                       dict(name='div', attrs={'class':['entry-content']}),
+                     ]
+    remove_tags = [dict(name='table', attrs={'class':['ap-video-table','ap-htmlfragment-table','ap-htmltable-table']}),
+                   dict(name='span', attrs={'class':['apCaption','tabletitle']}),
+                   dict(name='td', attrs={'bgcolor':['#333333']}),
+                  ]
+    extra_css = '''
+                .headline{font-family:Verdana,Arial,Helvetica,sans-serif;font-weight:bold;}
+                .bline{color:#003366;}
+                body{font-family:Arial,Helvetica,sans-serif;}
+                '''
-    feeds = [ ('AP Headlines', 'http://hosted.ap.org/lineups/TOPHEADS-rss_2.0.xml?SITE=ORAST&SECTION=HOME'),
-              ('AP US News', 'http://hosted.ap.org/lineups/USHEADS-rss_2.0.xml?SITE=CAVIC&SECTION=HOME'),
+
+    feeds = [
+             ('AP Headlines', 'http://hosted.ap.org/lineups/TOPHEADS-rss_2.0.xml?SITE=ORAST&SECTION=HOME'),
+             ('AP US News', 'http://hosted.ap.org/lineups/USHEADS-rss_2.0.xml?SITE=CAVIC&SECTION=HOME'),
              ('AP World News', 'http://hosted.ap.org/lineups/WORLDHEADS-rss_2.0.xml?SITE=SCAND&SECTION=HOME'),
             ('AP Political News', 'http://hosted.ap.org/lineups/POLITICSHEADS-rss_2.0.xml?SITE=ORMED&SECTION=HOME'),
             ('AP Washington State News', 'http://hosted.ap.org/lineups/WASHINGTONHEADS-rss_2.0.xml?SITE=NYPLA&SECTION=HOME'),
@@ -38,4 +45,5 @@ class AssociatedPress(BasicNewsRecipe):
             ('AP Health News', 'http://hosted.ap.org/lineups/HEALTHHEADS-rss_2.0.xml?SITE=FLDAY&SECTION=HOME'),
             ('AP Science News', 'http://hosted.ap.org/lineups/SCIENCEHEADS-rss_2.0.xml?SITE=OHCIN&SECTION=HOME'),
             ('AP Strange News', 'http://hosted.ap.org/lineups/STRANGEHEADS-rss_2.0.xml?SITE=WCNC&SECTION=HOME'),
-            ]
+    ]
+
diff --git a/resources/recipes/discover_magazine.recipe b/resources/recipes/discover_magazine.recipe
index 02cdb952b5..a777ff51fe 100644
--- a/resources/recipes/discover_magazine.recipe
+++ b/resources/recipes/discover_magazine.recipe
@@ -14,56 +14,35 @@ class DiscoverMagazine(BasicNewsRecipe):
     title = u'Discover Magazine'
     description = u'Science, Technology and the Future'
-    __author__ = 'Starson17'
+    __author__ = 'Starson17 and Sujata Raman'
     language = 'en'
     oldest_article = 33
     max_articles_per_feed = 20
+    no_stylesheets = True
     remove_javascript = True
     use_embedded_content = False
+    linearize_tables = True
     encoding = 'utf-8'
+    extra_css = '.headline {font-size: x-large;} \n .fact {padding-top: 10pt}'
+
+    keep_only_tags = [ dict(name='div', attrs={'class':['content']}),]
     remove_tags = [
-                   dict(name='div', attrs={'id':['searchModule', 'mainMenu', 'tool-box']}),
-                   dict(name='div', attrs={'id':['footer','teaser','already-subscriber','teaser-suite','related-articles']}),
-                   dict(name='div', attrs={'class':['column']}),
-                   dict(name='img', attrs={'src':'http://discovermagazine.com/onebyone.gif'})]
-
-    remove_tags_after = [dict(name='div', attrs={'class':'listingBar'})]
-
-    def append_page(self, soup, appendtag, position):
-        pager = soup.find('span',attrs={'class':'next'})
-        if pager:
-            nexturl = pager.a['href']
-            soup2 = self.index_to_soup(nexturl)
-            texttag = soup2.find('div', attrs={'class':'articlebody'})
-            newpos = len(texttag.contents)
-            self.append_page(soup2,texttag,newpos)
-            texttag.extract()
-            appendtag.insert(position,texttag)
-
-    def preprocess_html(self, soup):
-        mtag = '\n'
-        soup.head.insert(0,mtag)
-        self.append_page(soup, soup.body, 3)
-        pager = soup.find('div',attrs={'class':'listingBar'})
-        if pager:
-            pager.extract()
-        return soup
+                   dict(name='div', attrs={'class':['navigation','socialcontainer']}),
+                   dict(name='span', attrs={'class':['sociableButton']}),
+                   dict(name='p', attrs={'class':'footerBlogResume'}),
+                   dict(name='h3', attrs={'id':['comments','respond']}),
+                   dict(name='ol', attrs={'class':'commentlist'}),
+                  ]
     def postprocess_html(self, soup, first_fetch):
-        for tag in soup.findAll(text=re.compile('^This article is a sample')):
+
+        for tag in soup.findAll(text=re.compile('Related content')):
             tag.parent.extract()
-        for tag in soup.findAll(['table', 'tr', 'td']):
-            tag.name = 'div'
-        for tag in soup.findAll('div', attrs={'class':'discreet advert'}):
-            tag.extract()
-        for tag in soup.findAll('hr', attrs={'size':'1'}):
-            tag.extract()
-        for tag in soup.findAll('br'):
-            tag.extract()
+
         return soup
     feeds = [