Update Newsweek

2025-12-03 11:45:01 -05:00 · 2016-04-08 16:31:27 +05:30 · 2016-04-08 16:31:27 +05:30 · 1b413d27f9
commit 1b413d27f9
parent 3e70e190dd
1 changed files with 63 additions and 96 deletions
--- a/recipes/newsweek.recipe
+++ b/recipes/newsweek.recipe
@ -1,116 +1,83 @@
-from calibre.web.feeds.jsnews import JavascriptRecipe
-import datetime
+from calibre.web.feeds.news import BasicNewsRecipe
+from collections import defaultdict

 BASE = 'http://www.newsweek.com'
 def href_to_url(a, add_piano=False):
-    return BASE + a.get('href') + ('?piano_t=1' if add_piano else '')
+    return BASE + a.get('href') + ('?piano_d=1' if add_piano else '')

-class Newsweek(JavascriptRecipe):
+def class_sels(*args):
+    q = set(args)
+    return dict(attrs={'class':lambda x: x and set(x.split()).intersection(q)})
+
+class Newsweek(BasicNewsRecipe):

    title          = 'Newsweek'
    __author__     = 'Kovid Goyal'
-    description    = 'Weekly news and current affairs in the US. Requires a subscription.'
+    description    = 'Weekly news and current affairs in the US'
    language       = 'en'
    encoding       = 'utf-8'
    no_stylesheets = True
    requires_version = (1, 40, 0)

-    keep_only_tags = ['article.content-fullwidth']
+    keep_only_tags = class_sels('article-header', 'article-body', 'header-image')
    remove_tags = [
-        'meta', '.block-openadstream', '.block-ibtmedia-social', '.issue-next',
-        '.most-popular', '.ibt-media-stories', '.user-btn-group',
-        '#taboola-below-main-column', '.trc_related_container',
-        '#block-nw-magazine-magazine-more-from-issue', '.block-ibtmedia-top-stories',
+        dict(name='meta'),
+        class_sels(
+            'block-openadstream', 'block-ibtmedia-social', 'issue-next',
+            'most-popular', 'ibt-media-stories', 'user-btn-group',
+            'trial-link', 'trc_related_container',
+            'block-ibtmedia-top-stories'
+        ),
+        dict(id=['taboola-below-main-column', 'piano-root', 'block-nw-magazine-magazine-more-from-issue']),
    ]
-    LOGIN = 'https://bar.piano-media.com/lite/authent/login//custom/newsweek/?service_id=25&loc=http%3A%2F%2Fwww.newsweek.com%2F'  # noqa
+    remove_attributes = ['style']

-    needs_subscription = True
-    def do_login(self, br, username, password):
-        br.visit(self.LOGIN)
-        form = br.select_form('#pianomedia_login_form')
-        form['login'] = username
-        form['password'] = password
-        br.submit()
-
-    def get_publication_data(self, browser):
-        browser.wait_for_element('nav.main-menu a[href]')
-        root = self.index_to_soup(browser.html)
-        for a in root.xpath('''descendant-or-self::nav[@class and contains(concat(' ', normalize-space(@class), ' '), ' main-menu ')]/descendant-or-self::*/a[@href]'''):
-            if a.text and a.text.strip() == 'This Week\'s Edition':
-                return self.get_newsweek_publication_data(browser, href_to_url(a, True))
-
-    def get_newsweek_publication_data(self, browser, url):
-        root = self.index_to_soup(url)
-        sel = lambda expr: root.xpath(expr)
-        ans = {}
-
-        for img in sel('''descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' cover-story ')]/descendant-or-self::*/div[@class and contains(concat(' ', normalize-space(@class), ' '), ' info ')]/descendant-or-self::*/img[@src]'''):
-            if '_Cover_' in img.get('title', ''):
-                ans['cover'] = browser.get_resource(img.get('src'))
-                break
-        for title in root.xpath('//title'):
-            raw = title.text
-            if raw:
-                self.timefmt = datetime.datetime.strptime(raw, '%Y/%m/%d').strftime(' [%b %d]')
-
-        sections = []
-        for div in sel('''descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' cover-story ')]/descendant-or-self::*/div[@class and contains(concat(' ', normalize-space(@class), ' '), ' info ')]'''):
-            url = None
-            for a in div.xpath('descendant::a[@href]'):
-                url = href_to_url(a)
-                break
-            for s in div.xpath('descendant::div[@class="summary"]'):
-                sections.append(('Cover Story', [{'title':'Cover Story', 'date':'', 'url':url, 'description':self.tag_to_string(s)}]))
-                break
-        features = []
-        for li in sel('''descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' features ')]/descendant-or-self::*/li'''):
-            url = None
-            for a in li.xpath('descendant::a[@class="article-link"]'):
-                url = href_to_url(a)
-                features.append({'title':self.tag_to_string(a), 'url':url})
-                break
-        if features:
-            sections.append(('Features', features))
-
-        for div in sel('''descendant-or-self::div[@class and contains(concat(' ', normalize-space(@class), ' '), ' issue-list-block ')]'''):
-            for d in div.xpath('descendant::div[@class="block-title"]'):
-                section_title = self.tag_to_string(d)
-                articles = []
-                break
+    def parse_index(self):
+        root = self.index_to_soup('http://www.newsweek.com/archive', as_tree=True)
+        li = root.xpath('//ul[contains(@class, "magazine-archive-items")]/li')[0]
+        a = li.xpath('descendant::a[@href]')[0]
+        url = href_to_url(a, add_piano=True)
+        self.timefmt = self.tag_to_string(a)
+        img = li.xpath('descendant::a[@href]/img[@src]')[0]
+        self.cover_url = img.get('src')
+        root = self.index_to_soup(url, as_tree=True)
+        div = root.xpath('//div[@id="block-nw-magazine-magazine-cover-story"]')[0]
+        a = div.xpath('descendant::a[@href]')[0]
+        index = [('Cover', [{'title':'Cover story', 'url':href_to_url(a)}])]
+        sections = defaultdict(list)
+        div = root.xpath('//div[@id="block-nw-magazine-magazine-issue-story-list"]')[0]
+        for a in div.xpath('descendant::h3/a[@href and contains(@class, "article-link")]'):
+            title = self.tag_to_string(a)
+            li = a.xpath('ancestor::li')[0]
+            desc = ''
+            s = li.xpath('descendant::div[@class="summary"]')
+            if s:
+                desc = self.tag_to_string(s[0])
+            sec = li.xpath('descendant::div[@class="category"]')
+            if sec:
+                sec = self.tag_to_string(sec[0])
            else:
-                continue
-            for li in div.xpath('descendant::li'):
-                desc = ''
-                for d in li.xpath('descendant::div[@class="summary"]'):
-                    desc = self.tag_to_string(d)
-                    break
-                for a in li.xpath('descendant::a[@class="article-link"]'):
-                    articles.append({'title':self.tag_to_string(a), 'url':href_to_url(a), 'description':desc})
-                    break
-            if articles:
-                sections.append((section_title, articles))
+                sec = 'Articles'
+            sections[sec].append({'title':title, 'url':href_to_url(a), 'description':desc})
+            self.log(title, url)
+            if desc:
+                self.log('\t' + desc)
+            self.log('')
+        for k in sorted(sections):
+            index.append((k, sections[k]))
+        return index

-        ans['index'] = sections
-        return ans
+    def print_version(self, url):
+        return url + '?piano_d=1'

-    def load_complete(self, browser, url, recursion_level):
-        browser.wait_for_element('div.article-body')
-        return browser.load_completed  # This is needed to allow the parallax images to load
-
-    def preprocess_stage1(self, article, browser, url, recursion_level):
+    def preprocess_html(self, soup):
        # Parallax images in the articles are loaded as background images
        # on <span> tags. Convert them to normal images.
-        for span in browser.css_select('span.parallax-image', all=True):
-            bg = unicode(span.styleProperty('background-image', span.InlineStyle))
-            if bg:
-                url = bg.strip().partition('(')[-1][:-1]
-                span.appendInside('<img src="%s"></img>' % url)
-            span.setAttribute('style', '')
-        browser.run_for_a_time(0.1)  # This is needed to give the DOM time to update
-
-    def postprocess_html(self, article, root, url, recursion_level):
-        for x in root.xpath('//*[local-name()="body" and @style]'):
-            del x.attrib['style']  # body has a fixed height, which causes problems with epub viewers
-        for x in root.xpath('//*[@id="piano-root"]'):
-            x.getparent().remove(x)
-        return root
+        for span in soup.findAll('span', attrs={'class':lambda x: x and 'parallax' in x.split()}):
+            s = span.find(style=True)
+            if s is not None:
+                url = s['style'].partition('(')[-1][:-1]
+                s['style'] = 'display: block'
+                s.name = 'img'
+                s['src'] = url
+        return soup