From af0000ad029c5bc0802c339ae539c3afe1de28fa Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 3 Aug 2010 20:31:06 -0600 Subject: [PATCH] Updated Associated Press and Discover magazine --- resources/images/news/discover_magazine.png | Bin 0 -> 1014 bytes resources/recipes/ap.recipe | 34 ++++++++----- resources/recipes/discover_magazine.recipe | 51 ++++++-------------- 3 files changed, 36 insertions(+), 49 deletions(-) create mode 100644 resources/images/news/discover_magazine.png diff --git a/resources/images/news/discover_magazine.png b/resources/images/news/discover_magazine.png new file mode 100644 index 0000000000000000000000000000000000000000..b63cf518d1521d82cc7f4eed29a9224b8f773595 GIT binary patch literal 1014 zcmViOLQn>>S>b_UX85uYW2TibwpE+UVLIpIbKlp?ef>J<^Dlf%Oj*_#a$SNP zMQfP)#1?YfZ7`<`tAgD80~>@}xeac=hsszQsAF*k9a&L_D7vtlP^$vVu8 zxd?}?Zs11_kPkhIh`0?C;Mx!le4SKQlZxA|{IORRs;Jn!s_N{!iicLK>g$&O`8~zf zl;W*Y@z^U$yO8&#Qpz%QOyb}p#JiusJ^eZL^dNlm45_^Xx9|n9BLDd{G$)z4_7w5X z$1nq%Vf+Vd?|Fz*L#lQgg(y{5&dPs`!r%^tP|3%j=#yfM@`GDdePk7`4k-AnxZ9~3 z`%%Qcm5LWXk)N7SwYNilaGQL%C4b|x;*lPOqdlso#}r$&{P`Vf?MD9QxWdIR6%Riv zTer<4F5WWnotxu^liCSwrRO%IX$6f9ha8U6xm9gXgt#07>N4FXsQb@wi6Q&Wt7 zOPs$N>I$x1!0P5WvqlfY1DmkZm(kEZa{FRT?=WuFdQA6o6dV7-&AuOo4#3slDIR(q zJN!MiIS#2gBEM&rVvMTBPpkaUDF3=op^EY$DTMHU=Y{f5P3h9DDA4%%#1U7snu^sBr7L z{2o-E+#>J2QfpG>z}u?&SE@R^LVn;K`Ro6x^i74+Z^~~+scb5xB%@RdD||VP>t96< zS1?cRpzd6ck;Pg|4lNk_4s+@aXq$t2&LKC81{ULEP@`}_#?%F|HA+7I61Hstn!OBg zaPt?Ko}gI%7G}{}qz1-j!15nwa0yz26zEw1001R)MObuXVRU6WV{&C-bY%cCFflMK zFgPtRGgL7+Ix{&sGB7PLHaajcN&GU@0000bbVXQnWMOn=I&E)cX=Zr.*?' , lambda match : ''), - (r'.*?', lambda match : ''), - (r'.*?', lambda match : ''), - (r'.*?', lambda match : ''), - (r'.*?', lambda match : ''), - (r'
.*?

', lambda match : '

'), - (r'

', lambda match : '

'), - (r'Learn more about our Privacy Policy.*?', lambda match : ''), + (r'', lambda match : '

'), ] ] + keep_only_tags = [ dict(name='div', attrs={'class':['body']}), + dict(name='div', attrs={'class':['entry-content']}), + ] + remove_tags = [dict(name='table', attrs={'class':['ap-video-table','ap-htmlfragment-table','ap-htmltable-table']}), + dict(name='span', attrs={'class':['apCaption','tabletitle']}), + dict(name='td', attrs={'bgcolor':['#333333']}), + ] + extra_css = ''' + .headline{font-family:Verdana,Arial,Helvetica,sans-serif;font-weight:bold;} + .bline{color:#003366;} + body{font-family:Arial,Helvetica,sans-serif;} + ''' - feeds = [ ('AP Headlines', 'http://hosted.ap.org/lineups/TOPHEADS-rss_2.0.xml?SITE=ORAST&SECTION=HOME'), - ('AP US News', 'http://hosted.ap.org/lineups/USHEADS-rss_2.0.xml?SITE=CAVIC&SECTION=HOME'), + + feeds = [ + ('AP Headlines', 'http://hosted.ap.org/lineups/TOPHEADS-rss_2.0.xml?SITE=ORAST&SECTION=HOME'), + ('AP US News', 'http://hosted.ap.org/lineups/USHEADS-rss_2.0.xml?SITE=CAVIC&SECTION=HOME'), ('AP World News', 'http://hosted.ap.org/lineups/WORLDHEADS-rss_2.0.xml?SITE=SCAND&SECTION=HOME'), ('AP Political News', 'http://hosted.ap.org/lineups/POLITICSHEADS-rss_2.0.xml?SITE=ORMED&SECTION=HOME'), ('AP Washington State News', 'http://hosted.ap.org/lineups/WASHINGTONHEADS-rss_2.0.xml?SITE=NYPLA&SECTION=HOME'), @@ -38,4 +45,5 @@ class AssociatedPress(BasicNewsRecipe): ('AP Health News', 'http://hosted.ap.org/lineups/HEALTHHEADS-rss_2.0.xml?SITE=FLDAY&SECTION=HOME'), ('AP Science News', 'http://hosted.ap.org/lineups/SCIENCEHEADS-rss_2.0.xml?SITE=OHCIN&SECTION=HOME'), ('AP Strange News', 'http://hosted.ap.org/lineups/STRANGEHEADS-rss_2.0.xml?SITE=WCNC&SECTION=HOME'), - ] + ] + diff --git a/resources/recipes/discover_magazine.recipe b/resources/recipes/discover_magazine.recipe index 02cdb952b5..a777ff51fe 100644 --- a/resources/recipes/discover_magazine.recipe +++ b/resources/recipes/discover_magazine.recipe @@ -14,56 +14,35 @@ class DiscoverMagazine(BasicNewsRecipe): title = u'Discover Magazine' description = u'Science, Technology and the Future' - __author__ = 'Starson17' + __author__ = 'Starson17 and Sujata Raman' language = 'en' oldest_article = 33 max_articles_per_feed = 20 + no_stylesheets = True remove_javascript = True use_embedded_content = False + linearize_tables = True encoding = 'utf-8' + extra_css = '.headline {font-size: x-large;} \n .fact {padding-top: 10pt}' + + keep_only_tags = [ dict(name='div', attrs={'class':['content']}),] remove_tags = [ - dict(name='div', attrs={'id':['searchModule', 'mainMenu', 'tool-box']}), - dict(name='div', attrs={'id':['footer','teaser','already-subscriber','teaser-suite','related-articles']}), - dict(name='div', attrs={'class':['column']}), - dict(name='img', attrs={'src':'http://discovermagazine.com/onebyone.gif'})] - - remove_tags_after = [dict(name='div', attrs={'class':'listingBar'})] - - def append_page(self, soup, appendtag, position): - pager = soup.find('span',attrs={'class':'next'}) - if pager: - nexturl = pager.a['href'] - soup2 = self.index_to_soup(nexturl) - texttag = soup2.find('div', attrs={'class':'articlebody'}) - newpos = len(texttag.contents) - self.append_page(soup2,texttag,newpos) - texttag.extract() - appendtag.insert(position,texttag) - - def preprocess_html(self, soup): - mtag = '\n' - soup.head.insert(0,mtag) - self.append_page(soup, soup.body, 3) - pager = soup.find('div',attrs={'class':'listingBar'}) - if pager: - pager.extract() - return soup + dict(name='div', attrs={'class':['navigation','socialcontainer']}), + dict(name='span', attrs={'class':['sociableButton']}), + dict(name='p', attrs={'class':'footerBlogResume'}), + dict(name='h3', attrs={'id':['comments','respond']}), + dict(name='ol', attrs={'class':'commentlist'}), + ] def postprocess_html(self, soup, first_fetch): - for tag in soup.findAll(text=re.compile('^This article is a sample')): + + for tag in soup.findAll(text=re.compile('Related content')): tag.parent.extract() - for tag in soup.findAll(['table', 'tr', 'td']): - tag.name = 'div' - for tag in soup.findAll('div', attrs={'class':'discreet advert'}): - tag.extract() - for tag in soup.findAll('hr', attrs={'size':'1'}): - tag.extract() - for tag in soup.findAll('br'): - tag.extract() + return soup feeds = [