Update Newsweek

2026-05-31 02:55:19 -04:00 · 2014-06-09 15:21:26 +05:30
parent 7b284b949f
commit b9bcb7008c
1 changed files with 93 additions and 74 deletions
@@ -1,94 +1,113 @@
-from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.web.feeds.jsnews import JavascriptRecipe
+from cssselect import HTMLTranslator
+from lxml.etree import XPath
+import datetime

-class Newsweek(BasicNewsRecipe):
+def CSSSelect(expr):
+    ''' Translate the CSS selector expr to XPath and return it wrapped in
+    an lxml XPath object, which can then be called on an element tree. '''
+    xpath_expr = HTMLTranslator().css_to_xpath(expr)
+    return XPath(xpath_expr)
+
+BASE = 'http://www.newsweek.com'
+def href_to_url(a):
+    ''' Return the URL that the link a points to, with the piano_t=1 query
+    parameter added.
+
+    a is any object with a get('href') method, such as an lxml <a> element.
+    The href is resolved against BASE, so site relative, page relative and
+    absolute hrefs are all handled. If the href already has a query string,
+    piano_t=1 is appended to it instead of starting a second one, and any
+    fragment stays at the end of the URL.
+
+    Raises ValueError if a has no href attribute. '''
+    try:
+        from urllib.parse import urljoin, urlsplit, urlunsplit
+    except ImportError:  # Python 2
+        from urlparse import urljoin, urlsplit, urlunsplit
+    href = a.get('href')
+    if href is None:
+        # Fail with a clear message rather than the opaque error that
+        # concatenating a string with None gives
+        raise ValueError('Link has no href attribute')
+    parts = urlsplit(urljoin(BASE, href))
+    query = parts.query + '&piano_t=1' if parts.query else 'piano_t=1'
+    return urlunsplit(parts._replace(query=query))
+
+class Newsweek(JavascriptRecipe):
+    # Recipe for the weekly edition of Newsweek. As the description says, a
+    # subscription is required; do_login() submits the credentials.

    title          = 'Newsweek'
    __author__     = 'Kovid Goyal'
-    description    = 'Weekly news and current affairs in the US'
+    description    = 'Weekly news and current affairs in the US. Requires a subscription.'
    language       = 'en'
    encoding       = 'utf-8'
    no_stylesheets = True
-    recipe_disabled = ('Newsweek was taken over by The Daily Beast,'
-            ' newsweek.com no longer exists, so this recipe '
-            ' has been disabled.')
+    # NOTE(review): presumably the minimum calibre version able to run this
+    # recipe -- confirm against the JavascriptRecipe documentation
+    requires_version = (1, 40, 0)

-    BASE_URL = 'http://www.newsweek.com'
+    # CSS style selectors. NOTE(review): presumably the base class keeps only
+    # the elements matching keep_only_tags and drops those matching
+    # remove_tags -- confirm
+    keep_only_tags = ['article.content-fullwidth']
+    remove_tags = [
+        'meta', '.block-openadstream', '.block-ibtmedia-social', '.issue-next',
+        '.most-popular', '.ibt-media-stories', '.user-btn-group',
+        '#taboola-below-main-column', '.trc_related_container',
+    ]
+    # Address of the login page that do_login() visits
+    LOGIN = 'https://bar.piano-media.com/lite/authent/login//custom/newsweek/?service_id=25&loc=http%3A%2F%2Fwww.newsweek.com%2F'  # noqa

-    topics = {
-        'Culture' : '/tag/culture.html',
-        'Business' : '/tag/business.html',
-        'Society' : '/tag/society.html',
-        'Science' : '/tag/science.html',
-        'Education' : '/tag/education.html',
-        'Politics' : '/tag/politics.html',
-        'Health' : '/tag/health.html',
-        'World' : '/tag/world.html',
-        'Nation' : '/tag/nation.html',
-        'Technology' : '/tag/technology.html',
-        'Game Changers' : '/tag/game-changers.html',
-    }
+    # NOTE(review): presumably what makes calibre ask for the username and
+    # password that are passed to do_login() -- confirm
+    needs_subscription = True
+    def do_login(self, br, username, password):
+        # Open the LOGIN page in the browser br, fill username and password
+        # into the #pianomedia_login_form form and submit it.
+        br.visit(self.LOGIN)
+        login_form = br.select_form('#pianomedia_login_form')
+        login_form['login'], login_form['password'] = username, password
+        br.submit()

-    keep_only_tags = dict(name='article', attrs={'class':'article-text'})
-    remove_tags = [dict(attrs={'data-dartad':True})]
-    remove_attributes = ['property']
+    def get_publication_data(self, browser):
+        # Find the "This Week's Edition" link in the main navigation menu of
+        # the page loaded in browser and return the publication data built
+        # from the page that link points to.
+        menu_links = 'nav.main-menu a[href]'
+        browser.wait_for_element(menu_links)
+        root = self.index_to_soup(browser.html)
+        for link in CSSSelect(menu_links)(root):
+            label = (link.text or '').strip()
+            if label != 'This Week\'s Edition':
+                continue
+            edition_url = href_to_url(link)
+            return self.get_newsweek_publication_data(browser, edition_url)
+        # NOTE(review): there is no return after the loop, so None is
+        # returned when no menu link carries that label

-    def postprocess_html(self, soup, first):
-        for tag in soup.findAll(name=['article', 'header']):
-            tag.name = 'div'
-        return soup
+    def get_newsweek_publication_data(self, browser, url):
+        # Parse the issue page at url and return a dict with the key 'index',
+        # a list of (section title, list of article dicts) tuples, plus the
+        # key 'cover' when a cover image is found. Also sets self.timefmt
+        # from the text of the page <title>.
+        root = self.index_to_soup(url)
+        sel = lambda expr: CSSSelect(expr)(root)
+        ans = {}

-    def newsweek_sections(self):
-        for topic_name, topic_url in self.topics.iteritems():
-            yield (topic_name,
-                    self.BASE_URL+topic_url)
+        # The cover is the first cover story image whose title attribute
+        # contains '_Cover_'
+        for img in sel('div.cover-story div.info img[src]'):
+            if '_Cover_' in img.get('title', ''):
+                ans['cover'] = browser.get_resource(img.get('src'))
+                break
+        # The <title> text is parsed as a year/month/day date (presumably the
+        # issue date). strptime() raises ValueError for text in any other form.
+        for title in root.xpath('//title'):
+            raw = title.text
+            if raw:
+                self.timefmt = datetime.datetime.strptime(raw, '%Y/%m/%d').strftime(' [%b %d]')

-
-    def newsweek_parse_section_page(self, soup):
-        for article in soup.findAll('article', about=True,
-                attrs={'class':'stream-item'}):
-            title = article.find(attrs={'property': 'dc:title'})
-            if title is None: continue
-            title = self.tag_to_string(title)
-            url = self.BASE_URL + article['about']
-            desc = ''
-            author = article.find({'property':'dc:creator'})
-            if author:
-                desc = u'by %s. '%self.tag_to_string(author)
-            p = article.find(attrs={'property':'dc:abstract'})
-            if p is not None:
-                for a in p.find('a'): a.extract()
-                desc += self.tag_to_string(p)
-            t = article.find('time', attrs={'property':'dc:created'})
-            date = ''
-            if t is not None:
-                date = u' [%s]'%self.tag_to_string(t)
-            self.log('\tFound article:', title, 'at', url)
-            self.log('\t\t', desc)
-            yield {'title':title, 'url':url, 'description':desc, 'date':date}
-
-
-    def parse_index(self):
        sections = []
-        for section, shref in self.newsweek_sections():
-            self.log('Processing section', section, shref)
-            articles = []
-            try:
-                soups = [self.index_to_soup(shref)]
-            except:
-                self.log.warn('Section %s not found, skipping'%section)
+        # Cover story: the first link in the block gives the URL (it stays
+        # None if there is no link) and the first summary div gives the
+        # description. No section is added if there is no summary div.
+        for div in sel('div.cover-story div.info'):
+            url = None
+            for a in div.xpath('descendant::a[@href]'):
+                url = href_to_url(a)
+                break
+            for s in div.xpath('descendant::div[@class="summary"]'):
+                sections.append(('Cover Story', [{'title':'Cover Story', 'date':'', 'url':url, 'description':self.tag_to_string(s)}]))
+                break
+        # Features: one article per <li>, taken from its first article-link
+        # anchor
+        features = []
+        for li in sel('div.features li'):
+            url = None
+            for a in li.xpath('descendant::a[@class="article-link"]'):
+                url = href_to_url(a)
+                features.append({'title':self.tag_to_string(a), 'url':url})
+                break
+        if features:
+            sections.append(('Features', features))
+
+        # Every remaining block becomes a section named after its first
+        # block-title div. The else clause of the inner for loop skips
+        # blocks that have no such div.
+        for div in sel('div.issue-list-block'):
+            for d in div.xpath('descendant::div[@class="block-title"]'):
+                section_title = self.tag_to_string(d)
+                articles = []
+                break
+            else:
                continue
-            na = soups[0].find('a', rel='next')
-            if na:
-                soups.append(self.index_to_soup(self.BASE_URL+na['href']))
-            for soup in soups:
-                articles.extend(self.newsweek_parse_section_page(soup))
-                if self.test and len(articles) > 1:
+            # Each <li> gives at most one article: its first article-link
+            # anchor, described by its first summary div (or '' if none)
+            for li in div.xpath('descendant::li'):
+                desc = ''
+                for d in li.xpath('descendant::div[@class="summary"]'):
+                    desc = self.tag_to_string(d)
+                    break
+                for a in li.xpath('descendant::a[@class="article-link"]'):
+                    articles.append({'title':self.tag_to_string(a), 'url':href_to_url(a), 'description':desc})
                    break
            if articles:
-                sections.append((section, articles))
-            if self.test and len(sections) > 1:
-                break
-        return sections
+                sections.append((section_title, articles))

+        ans['index'] = sections
+        return ans

+    def preprocess_stage1(self, article, browser, url, recursion_level):
+        # Parallax images in the articles are loaded as background images
+        # on <span> tags. Convert them to normal images.
+        for span in browser.css_select('span.parallax-image', all=True):
+            bg = unicode(span.styleProperty('background-image', span.InlineStyle))
+            # The value looks like url(...), where the address may be quoted
+            # and the whole value may be padded with whitespace. Take what
+            # lies between the outermost parentheses and drop the quotes. A
+            # value without parentheses, such as none, gives an empty string.
+            img_url = bg.strip().partition('(')[-1].rpartition(')')[0].strip().strip('\'"')
+            if img_url:
+                # Named img_url so that the url argument is not overwritten
+                span.appendInside('<img src="%s"></img>' % img_url)
+            # The inline style is cleared whether or not an image was added
+            span.setAttribute('style', '')

+    def postprocess_html(self, article, root, url, recursion_level):
+        # Detach every element whose id is piano-root from its parent and
+        # return the resulting tree.
+        piano_nodes = root.xpath('//*[@id="piano-root"]')
+        for node in piano_nodes:
+            node.getparent().remove(node)
+        return root