Update Newsweek recipe for new site

2025-11-29 17:55:01 -05:00 · 2010-06-01 11:08:16 -06:00 · 2010-06-01 11:08:16 -06:00 · ef0af86b19
commit ef0af86b19
parent ce67fe9797
1 changed files with 58 additions and 171 deletions
--- a/resources/recipes/newsweek.recipe
+++ b/resources/recipes/newsweek.recipe
@ -1,189 +1,76 @@
-__license__   = 'GPL v3'
+import string
 __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
 import re
 from calibre import strftime
 from calibre.ebooks.BeautifulSoup import BeautifulSoup
 from calibre.web.feeds.news import BasicNewsRecipe
 class Newsweek(BasicNewsRecipe):
    title          = 'Newsweek'
-    __author__     = 'Kovid Goyal and Sujata Raman'
+    __author__     = 'Kovid Goyal'
    description    = 'Weekly news and current affairs in the US'
    language       = 'en'
    encoding       = 'utf-8'
    no_stylesheets = True
-    extra_css = '''
+    BASE_URL = 'http://www.newsweek.com'
-                    h1{font-family:Arial,Helvetica,sans-serif; font-size:large; color:#383733;}
+    INDEX = BASE_URL+'/topics.html'
                    .deck{font-family:Georgia,sans-serif; color:#383733;}
                    .bylineDate{font-family:georgia ; color:#58544A; font-size:x-small;}
                    .authorInfo{font-family:arial,helvetica,sans-serif; color:#0066CC; font-size:x-small;}
                    .articleUpdated{font-family:arial,helvetica,sans-serif; color:#73726C; font-size:x-small;}
                    .issueDate{font-family:arial,helvetica,sans-serif; color:#73726C; font-size:x-small; font-style:italic;}
                    h5{font-family:arial,helvetica,sans-serif; color:#73726C; font-size:x-small;}
                    h6{font-family:arial,helvetica,sans-serif; color:#73726C; font-size:x-small;}
                    .story{font-family:georgia,sans-serif ;color:black;}
                    .photoCredit{color:#999999; font-family:Arial,Helvetica,sans-serif;font-size:x-small;}
                    .photoCaption{color:#0A0A09;font-family:Arial,Helvetica,sans-serif;font-size:x-small;}
                    .fwArticle{font-family:Arial,Helvetica,sans-serif;font-size:x-small;font-weight:bold;}
                    '''
-    encoding       = 'utf-8'
+    keep_only_tags = dict(name='article', attrs={'class':'article-text'})
-    language = 'en'
+    remove_tags = [dict(attrs={'data-dartad':True})]
    remove_attributes = ['property']
-    remove_tags = [
+    def postprocess_html(self, soup, first):
-            {'class':['fwArticle noHr','fwArticle','hdlBulletItem','head-content','navbar','link', 'ad', 'sponsorLinksArticle', 'mm-content',
+        for tag in soup.findAll(name=['article', 'header']):
-                'inline-social-links-wrapper', 'email-article','ToolBox',
+            tag.name = 'div'
-                'inline-promo-link', 'sponsorship',
+        return soup
-                'inlineComponentRight',
+
-                'comments-and-social-links-wrapper', 'EmailArticleBlock']},
+    def newsweek_sections(self):
-            {'id' : ['footer', 'ticker-data', 'topTenVertical',
+        soup = self.index_to_soup(self.INDEX)
-                'digg-top-five', 'mesothorax', 'nw-comments', 'my-take-landing',
+        for a in soup.findAll('a', title='Primary tag', href=True):
-                'ToolBox', 'EmailMain']},
+            yield (string.capitalize(self.tag_to_string(a)),
-            {'class': re.compile('related-cloud')},
+                    self.BASE_URL+a['href'])
            dict(name='li', attrs={'id':['slug_bigbox']})
            ]
-    keep_only_tags = [{'class':['article HorizontalHeader',
+    def newsweek_parse_section_page(self, soup):
-        'articlecontent','photoBox', 'article columnist first']}, ]
+        for article in soup.findAll('article', about=True,
-    recursions = 1
+                attrs={'class':'stream-item'}):
-    match_regexps = [r'http://www.newsweek.com/id/\S+/page/\d+']
+            title = article.find(attrs={'property': 'dc:title'})
-    preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')]
+            if title is None: continue
-
+            title = self.tag_to_string(title)
-    def find_title(self, section):
+            url = self.BASE_URL + article['about']
-        d = {'scope':'Scope', 'thetake':'The Take', 'features':'Features',
+            desc = ''
-                None:'Departments', 'culture':'Culture'}
+            author = article.find({'property':'dc:creator'})
-        ans = None
+            if author:
-        a = section.find('a', attrs={'name':True})
+                desc = u'by %s. '%self.tag_to_string(author)
-        if a is not None:
+            p = article.find(attrs={'property':'dc:abstract'})
-            ans = a['name']
+            if p is not None:
-        return d.get(ans, ans)
+                for a in p.find('a'): a.extract()
-
+                desc += self.tag_to_string(p)
-
+            t = article.find('time', attrs={'property':'dc:created'})
-    def find_articles(self, section):
+            date = ''
-        ans = []
+            if t is not None:
-        for x in section.findAll('h5'):
+                date = u' [%s]'%self.tag_to_string(t)
-            title = ' '.join(x.findAll(text=True)).strip()
+            self.log('\tFound article:', title, 'at', url)
-            a = x.find('a')
+            self.log('\t\t', desc)
-            if not a: continue
+            yield {'title':title, 'url':url, 'description':desc, 'date':date}
            href = a['href']
            ans.append({'title':title, 'url':href, 'description':'', 'date': strftime('%a, %d %b')})
        if not ans:
            for x in section.findAll('div', attrs={'class':'hdlItem'}):
                a = x.find('a', href=True)
                if not a : continue
                title = ' '.join(a.findAll(text=True)).strip()
                href = a['href']
                if 'http://xtra.newsweek.com' in href: continue
                ans.append({'title':title, 'url':href, 'description':'', 'date': strftime('%a, %d %b')})
        #for x in ans:
        #    x['url'] += '/output/print'
        return ans
    def parse_index(self):
-        soup = self.get_current_issue()
+        sections = []
-        if not soup:
+        for section, shref in self.newsweek_sections():
-            raise RuntimeError('Unable to connect to newsweek.com. Try again later.')
+            self.log('Processing section', section, shref)
-        sections = soup.findAll('div', attrs={'class':'featurewell'})
+            articles = []
-        titles = map(self.find_title, sections)
+            soups = [self.index_to_soup(shref)]
-        articles = map(self.find_articles, sections)
+            na = soups[0].find('a', rel='next')
-        ans = list(zip(titles, articles))
+            if na:
-        def fcmp(x, y):
+                soups.append(self.index_to_soup(self.BASE_URL+na['href']))
-            tx, ty = x[0], y[0]
+            for soup in soups:
-            if tx == "Features": return cmp(1, 2)
+                articles.extend(self.newsweek_parse_section_page(soup))
-            if ty == "Features": return cmp(2, 1)
+                if self.test and len(articles) > 1:
-            return cmp(tx, ty)
+                    break
-        return sorted(ans, cmp=fcmp)
+            if articles:
-
+                sections.append((section, articles))
-    def ensure_html(self, soup):
+            if self.test and len(sections) > 1:
-        root = soup.find(name=True)
+                break
-        if root.name == 'html': return soup
+        return sections
        nsoup = BeautifulSoup('<html><head></head><body/></html>')
        nroot = nsoup.find(name='body')
        for x in soup.contents:
            if getattr(x, 'name', False):
                x.extract()
                nroot.insert(len(nroot), x)
        return nsoup
    def postprocess_html(self, soup, first_fetch):
        """Tidy a downloaded page in place and return it via ``ensure_html``.

        :param soup: parsed page; elements are extracted and attributes are
            changed directly on this object.
        :param first_fetch: when falsy, the element with id ``headline`` and
            the ``articleInfo`` element are removed as well (presumably this
            marks follow-up pages of a multi-page article -- confirm against
            the caller).
        :return: whatever ``self.ensure_html(soup)`` returns.
        """
        if not first_fetch:
            # Drop the headline and the article-info block, when present.
            h1 = soup.find(id='headline')
            if h1:
                h1.extract()
            div = soup.find(attrs={'class':'articleInfo'})
            if div:
                div.extract()
        # Collect the pagination <div>s (the bare string argument is
        # presumably matched against the CSS class -- confirm).
        divs = list(soup.findAll('div', 'pagination'))
        if not divs:
            # No pagination on this page: nothing else to clean up.
            return self.ensure_html(soup)
        # Keep only the first pagination block; remove every later one.
        for div in divs[1:]: div.extract()
        # Links of the surviving pagination block, captured before it is hidden.
        all_a = divs[0].findAll('a', href=True)
        divs[0]['style']="display:none"
        if len(all_a) > 1:
            # With more than one link, the last one is removed from the page
            # (presumably a "next"-style link -- TODO confirm).
            all_a[-1].extract()
        # Any other link whose href matches the first entry of match_regexps
        # loses its href, so only the pagination links still point there.
        test = re.compile(self.match_regexps[0])
        for a in soup.findAll('a', href=test):
            if a not in all_a:
                del a['href']
        return self.ensure_html(soup)
    def get_current_issue(self):
        """Fetch the page linked from the home page's "more-from-mag" block.

        Loads ``http://www.newsweek.com``, finds the first ``div`` whose class
        matches ``more-from-mag`` and follows the first ``<a>`` inside it,
        with any ``#fragment`` stripped from the link.

        :return: the soup of the linked page, or ``None`` when either the
            ``div`` or the link is missing.
        """
        home = self.index_to_soup('http://www.newsweek.com')
        issue_box = home.find('div', attrs={'class':re.compile('more-from-mag')})
        if issue_box is None:
            return None
        link = issue_box.find('a')
        if link is None:
            return None
        # Only the part before the fragment identifies the page to fetch.
        issue_url = link['href'].split('#')[0]
        return self.index_to_soup(issue_url)
    def get_cover_url(self):
        """Return the cover image URL found on the Newsweek home page.

        Looks for ``<div class="cover-image">`` containing ``<a><img></a>``
        and returns the image's ``src`` attribute.

        :return: the ``src`` value, or ``None`` when the div, the link or the
            image is missing.
        """
        home = self.index_to_soup('http://www.newsweek.com')
        cover_box = home.find('div',attrs={'class':'cover-image'})
        if not (cover_box and cover_box.a and cover_box.a.img):
            return None
        return cover_box.a.img['src']
    def postprocess_book(self, oeb, opts, log) :
        """Fill in missing author and description on every TOC article.

        For each article in ``oeb.toc`` whose ``author`` is ``None`` the
        byline is taken from the article's HTML; likewise for a ``None``
        ``description``. Articles that already have a value are left alone.

        :param oeb: the book; ``oeb.toc`` is iterated and
            ``oeb.manifest.hrefs`` is used to look up each article's content.
        :param opts: unused here.
        :param log: unused here.
        :return: ``None``.
        """
        def load_soup(href) :
            # Parse the manifest item behind ``href``.
            return BeautifulSoup(str(oeb.manifest.hrefs[href]))

        def class_text(soup, css_class) :
            # Text of the first element carrying ``css_class``, '' if absent.
            tag = soup.find(True,attrs={'class':css_class})
            return self.tag_to_string(tag) if tag is not None else ''

        def extractByline(href) :
            # "<author info> | <issue date>"; the separator only appears
            # when both parts are non-empty.
            soup = load_soup(href)
            byline = class_text(soup, 'authorInfo')
            issueDate = class_text(soup, 'issueDate').replace(',', '')
            return ' | '.join(part for part in (byline, issueDate) if part)

        def extractDescription(href) :
            # Prefer the description meta tag unless it starts with the
            # 'Newsweek magazine online plus' text; otherwise fall back to
            # the first paragraph.
            soup = load_soup(href)
            meta = soup.find(True,attrs={'name':'description'})
            if meta is not None and meta.has_key('content'):
                content = meta['content']
                if not content.startswith('Newsweek magazine online plus') :
                    return content
            # NOTE(review): this is the first <p> of the whole document. An
            # earlier lookup of the 'story' element was never used -- confirm
            # whether the paragraph should be searched inside that element.
            return self.tag_to_string(soup.find('p'))

        for section in oeb.toc :
            for article in section :
                if article.author is None :
                    article.author = extractByline(article.href)
                if article.description is None :
                    article.description = extractDescription(article.href)