Update Economic and Political Weekly

Fixes #1532410 [Economic and Political Weekly recipe unable to download all sections in calibre](https://bugs.launchpad.net/calibre/+bug/1532410)
This commit is contained in:
Kovid Goyal 2016-01-09 16:34:06 +05:30
parent 3a1bb4ad0a
commit 3e5f0e4bc7

View File

@ -1,11 +1,18 @@
#!/usr/bin/env python2
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2015, Kovid Goyal <kovid at kovidgoyal.net>
from collections import OrderedDict
from calibre.web.feeds.news import BasicNewsRecipe
def absurl(x):
    """Return *x* as an absolute URL on the epw.in site.

    A site-relative path (one starting with '/') gets the scheme and host
    prefixed; any other value is returned unchanged.
    """
    return 'http://www.epw.in' + x if x.startswith('/') else x
class EconomicAndPoliticalWeekly(BasicNewsRecipe):
    title = 'Economic and Political Weekly'
    __author__ = 'Kovid Goyal'
    description = 'Economic and Political news from India'
    publisher = 'epw.in'
    category = 'news, finances, politics, India'
@ -18,29 +25,48 @@ class EconomicAndPoliticalWeekly(BasicNewsRecipe):
    language = 'en_IN'
    publication_type = 'newspaper'
    masthead_url = 'http://www.epw.in/system/files/epw_masthead.png'
extra_css = """
body{font-family: Arial,Helvetica,sans-serif}
"""
conversion_options = {'comment' : description,
'tags' : category,
'publisher' : publisher,
'language' : language
}
remove_tags_before = dict(name='h1', attrs={'class':'print-title'})
remove_tags_after = dict(name='div', attrs={'class':'print-content'})
remove_tags = [dict(name='div', attrs={'class':'terms'}),
dict(name='span', attrs={'class':'print-link'})]
feeds = [(u'Editorials', u'http://www.epw.in/taxonomy/term/1/feed'),
(u'Commentary', u'http://www.epw.in/taxonomy/term/3/feed'),
(u'Insight', u'http://www.epw.in/taxonomy/term/14/feed'),
(u'Book Reviews', u'http://www.epw.in/taxonomy/term/4/feed'),
(u'Perspectives', u'http://www.epw.in/taxonomy/term/5/feed'),
(u'Special Articles', u'http://www.epw.in/taxonomy/term/6/feed'),
(u'Discussion', u'http://www.epw.in/taxonomy/term/7/feed'),
(u'Web Exclusives', u'http://www.epw.in/taxonomy/term/11087/feed')]
# Keep only the article body and title when fetching individual pages.
keep_only_tags = [
    dict(id=['block-system-main', 'page-title']),
]

def parse_index(self):
    """Build the section -> article index by scraping the EPW homepage.

    Scraping the homepage replaces the old per-section RSS feeds, which
    failed to download all sections (launchpad bug #1532410).

    Returns a list of (section_title, [article_dict, ...]) tuples, with
    each article dict carrying 'title', 'url' and 'description'.
    """
    soup = self.index_to_soup('http://www.epw.in/')
    main = soup.find('div', id='main-content')
    sections = OrderedDict()  # preserve homepage section order
    current_section = None
    for div in main.findAll(attrs={'class': lambda x: x and 'views-field-title' in x.split()}):
        # The section heading is the <h2> of the enclosing block.
        section = self.tag_to_string(div.findParent(attrs={'class': 'block-inner'}).find('h2'))
        if section != current_section:
            current_section = section
            if section not in sections:
                sections[section] = []
            self.log('\n\n' + section)
        title = self.tag_to_string(div)
        a = div.find('a', href=True)
        url = absurl(a['href'])
        desc = ''
        if a.get('title'):
            desc = a['title']
        else:
            # Fall back to the sibling body snippet when the link has no
            # title attribute.
            d = div.findNextSibling(attrs={'class': lambda x: x and 'views-field-body' in x.split()})
            if d is not None:
                desc = self.tag_to_string(d)
        self.log('\t', title, url)
        self.log('\t\t', desc)
        sections[current_section].append({'title': title, 'url': url, 'description': desc})
    # Web-exclusive articles use a different markup: a views-field-nothing
    # block whose first link holds the description and last link the title.
    current_section = 'Web Exclusive'
    sections[current_section] = []
    self.log('\n\n' + current_section)
    for div in main.findAll(attrs={'class': lambda x: x and 'views-field-nothing' in x.split()}):
        elems = div.findAll('a', href=True)
        if not elems:
            continue  # malformed block with no links; skip rather than crash
        # Original unpacked exactly two links and crashed otherwise; taking
        # (first, last) is identical for two links and tolerant of more.
        desc, title = map(self.tag_to_string, (elems[0], elems[-1]))
        url = absurl(elems[-1]['href'])
        self.log('\t', title, url)
        self.log('\t\t', desc)
        sections[current_section].append({'title': title, 'url': url, 'description': desc})
    # .items() behaves the same as the original py2-only iteritems() here
    # and keeps the recipe portable to python3.
    return [(t, articles) for t, articles in sections.items() if articles]