Switch Newsweek recipe to using the current issue to fetch articles.

2025-11-25 07:45:01 -05:00 · 2008-03-19 20:29:43 +00:00 · 2008-03-19 20:29:43 +00:00 · 39c3bd160f
commit 39c3bd160f
parent ce8a03f00f
1 changed files with 37 additions and 29 deletions
--- a/src/libprs500/web/feeds/recipes/newsweek.py
+++ b/src/libprs500/web/feeds/recipes/newsweek.py
@ -14,7 +14,7 @@
 ##    You should have received a copy of the GNU General Public License along
 ##    with this program; if not, write to the Free Software Foundation, Inc.,
 ##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-import re
+import re, string, time
 from libprs500.web.feeds.news import BasicNewsRecipe
 from libprs500.ebooks.BeautifulSoup import BeautifulSoup
@ -23,25 +23,6 @@ class Newsweek(BasicNewsRecipe):
    title          = 'Newsweek'
    __author__     = 'Kovid Goyal'
    no_stylesheets = True
    oldest_article = 11
    feeds = [
             ('Top News', 'http://feeds.newsweek.com/newsweek/TopNews',),
             'http://feeds.newsweek.com/newsweek/columnists/StevenLevy',
             ('Politics', 'http://feeds.newsweek.com/headlines/politics'),
             ('Health', 'http://feeds.newsweek.com/headlines/health'),
             ('Business', 'http://feeds.newsweek.com/headlines/business'),
             ('Science and Technology', 'http://feeds.newsweek.com/headlines/technology/science'),
             ('National News', 'http://feeds.newsweek.com/newsweek/NationalNews'),
             ('World News', 'http://feeds.newsweek.com/newsweek/WorldNews'),
             'http://feeds.newsweek.com/newsweek/Columnists/ChristopherDickey',
             'http://feeds.newsweek.com/newsweek/Columnists/FareedZakaria',
             ('Iraq', 'http://feeds.newsweek.com/newsweek/iraq'),
             ('Society', 'http://feeds.newsweek.com/newsweek/society'),
             ('Entertainment', 'http://feeds.newsweek.com/newsweek/entertainment'),
             'http://feeds.newsweek.com/newsweek/columnists/GeorgeFWill',
             'http://feeds.newsweek.com/newsweek/columnists/AnnaQuindlen',
             ]
    extra_css = '#content { font:serif 12pt; }\n.story {font:12pt}\n.HorizontalHeader {font:18pt}\n.deck {font:16pt}'
    keep_only_tags = [dict(name='div', id='content')]
@ -57,6 +38,41 @@ class Newsweek(BasicNewsRecipe):
    match_regexps = [r'http://www.newsweek.com/id/\S+/page/\d+']
    def parse_index(self):
        soup = self.index_to_soup(self.get_current_issue())
        img = soup.find(alt='Cover')
        if img is not None and img.has_key('src'):
            small = img['src']
            self.cover_url = small.replace('coversmall', 'coverlarge')
        articles = {}
        key = None
        for tag in soup.findAll(['h5', 'h6']):
            if tag.name == 'h6':
                if key and not articles[key]:
                    articles.pop(key)
                key = self.tag_to_string(tag)
                if not key or not key.strip():
                    key = 'uncategorized'
                key = string.capwords(key)
                articles[key] = []
            elif tag.name == 'h5' and key is not None:
                a = tag.find('a', href=True)
                if a is not None:
                    title = self.tag_to_string(a)
                    if not title:
                        a = 'Untitled article'
                    art = {
                           'title' : title,
                           'url'   : a['href'],
                           'description':'', 'content':'',
                           'date': time.strftime('%a, %d %b', time.localtime())
                           }
                    if art['title'] and art['url']:
                        articles[key].append(art)
        return articles
    def postprocess_html(self,  soup):
        divs = list(soup.findAll('div', 'pagination'))
        divs[0].extract()
@ -81,11 +97,3 @@ class Newsweek(BasicNewsRecipe):
        if img and img.parent.has_key('href'):
            return urlopen(img.parent['href']).read()
    def get_cover_url(self):
        ci = self.get_current_issue()
        if ci is not None:
            soup = BeautifulSoup(ci)
            img = soup.find(alt='Cover')
            if img is not None and img.has_key('src'):
                small = img['src']
                return small.replace('coversmall', 'coverlarge')