Switch Newsweek recipe to using the current issue to fetch articles.

2026-06-07 06:25:26 -04:00 · 2008-03-19 20:29:43 +00:00
parent ce8a03f00f
commit 39c3bd160f
1 changed files with 37 additions and 29 deletions
@@ -14,7 +14,7 @@
 ##    You should have received a copy of the GNU General Public License along
 ##    with this program; if not, write to the Free Software Foundation, Inc.,
 ##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-import re
+import re, string, time
 from libprs500.web.feeds.news import BasicNewsRecipe
 from libprs500.ebooks.BeautifulSoup import BeautifulSoup

@@ -23,25 +23,6 @@ class Newsweek(BasicNewsRecipe):
    title          = 'Newsweek'
    __author__     = 'Kovid Goyal'
    no_stylesheets = True
-    oldest_article = 11
-    
-    feeds = [
-             ('Top News', 'http://feeds.newsweek.com/newsweek/TopNews',),
-             'http://feeds.newsweek.com/newsweek/columnists/StevenLevy',
-             ('Politics', 'http://feeds.newsweek.com/headlines/politics'),
-             ('Health', 'http://feeds.newsweek.com/headlines/health'),
-             ('Business', 'http://feeds.newsweek.com/headlines/business'),
-             ('Science and Technology', 'http://feeds.newsweek.com/headlines/technology/science'),
-             ('National News', 'http://feeds.newsweek.com/newsweek/NationalNews'),
-             ('World News', 'http://feeds.newsweek.com/newsweek/WorldNews'),
-             'http://feeds.newsweek.com/newsweek/Columnists/ChristopherDickey',
-             'http://feeds.newsweek.com/newsweek/Columnists/FareedZakaria',
-             ('Iraq', 'http://feeds.newsweek.com/newsweek/iraq'),
-             ('Society', 'http://feeds.newsweek.com/newsweek/society'),
-             ('Entertainment', 'http://feeds.newsweek.com/newsweek/entertainment'),
-             'http://feeds.newsweek.com/newsweek/columnists/GeorgeFWill',
-             'http://feeds.newsweek.com/newsweek/columnists/AnnaQuindlen',
-             ]
    
    extra_css = '#content { font:serif 12pt; }\n.story {font:12pt}\n.HorizontalHeader {font:18pt}\n.deck {font:16pt}'
    keep_only_tags = [dict(name='div', id='content')]
@@ -57,6 +38,41 @@ class Newsweek(BasicNewsRecipe):
    match_regexps = [r'http://www.newsweek.com/id/\S+/page/\d+']
    
    
+    def parse_index(self):
+        soup = self.index_to_soup(self.get_current_issue())
+        img = soup.find(alt='Cover')
+        if img is not None and img.has_key('src'):
+            small = img['src']
+            self.cover_url = small.replace('coversmall', 'coverlarge')
+            
+        articles = {}
+        key = None
+        for tag in soup.findAll(['h5', 'h6']):
+            if tag.name == 'h6':
+                if key and not articles[key]:
+                    articles.pop(key)
+                key = self.tag_to_string(tag)
+                if not key or not key.strip():
+                    key = 'uncategorized'
+                key = string.capwords(key)
+                articles[key] = []
+            elif tag.name == 'h5' and key is not None:
+                a = tag.find('a', href=True)
+                if a is not None:
+                    title = self.tag_to_string(a)
+                    if not title:
+                        a = 'Untitled article'
+                    art = {
+                           'title' : title,
+                           'url'   : a['href'],
+                           'description':'', 'content':'',
+                           'date': time.strftime('%a, %d %b', time.localtime())
+                           }
+                    if art['title'] and art['url']:
+                        articles[key].append(art)
+        return articles
+        
+    
    def postprocess_html(self,  soup):
        divs = list(soup.findAll('div', 'pagination'))
        divs[0].extract()
@@ -80,12 +96,4 @@ class Newsweek(BasicNewsRecipe):
        img  = soup.find('img', alt='Current Magazine')
        if img and img.parent.has_key('href'):
            return urlopen(img.parent['href']).read()
-        
-    def get_cover_url(self):
-        ci = self.get_current_issue()
-        if ci is not None:
-            soup = BeautifulSoup(ci)
-            img = soup.find(alt='Cover')
-            if img is not None and img.has_key('src'):
-                small = img['src']
-                return small.replace('coversmall', 'coverlarge')
+