Update Economic and Political Weekly
Fixes #1532410 [Economic and Political Weekly recipe unable to download all sections in calibre](https://bugs.launchpad.net/calibre/+bug/1532410)
commit 3e5f0e4bc7 (parent 3a1bb4ad0a)
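In brief: the old recipe fetched a fixed list of per-section taxonomy RSS feeds and rewrote article URLs through print_version(), which, per bug #1532410, failed to download all sections; the new version scrapes the www.epw.in front page in parse_index() and builds the section list from what is actually published there. For orientation, a minimal sketch of the return shape calibre's BasicNewsRecipe expects back from parse_index() (the section name, title and URL below are placeholders, not values from this commit):

    # Sketch only, with placeholder values: parse_index() must return a
    # list of (section_title, article_list) tuples; each article is a
    # dict where 'title' and 'url' are required, while 'description',
    # 'date' and 'content' are optional.
    index = [
        ('Some section', [
            {'title': 'Some article',
             'url': 'http://www.epw.in/example',
             'description': ''},
        ]),
    ]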
@@ -1,11 +1,18 @@
-__license__ = 'GPL v3'
-__copyright__ = '2014-2015, Karthik <hashkendistro@gmail.com>, Krittika Goyal'
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+# License: GPLv3 Copyright: 2015, Kovid Goyal <kovid at kovidgoyal.net>
 
+from collections import OrderedDict
 from calibre.web.feeds.news import BasicNewsRecipe
 
+def absurl(x):
+    if x.startswith('/'):
+        x = 'http://www.epw.in' + x
+    return x
+
 class EconomicAndPoliticalWeekly(BasicNewsRecipe):
     title = 'Economic and Political Weekly'
-    __author__ = 'Karthik K, Krittika Goyal'
+    __author__ = 'Kovid Goyal'
     description = 'Economic and Political news from India'
     publisher = 'epw.in'
     category = 'news, finances, politics, India'
@@ -18,29 +25,48 @@ class EconomicAndPoliticalWeekly(BasicNewsRecipe):
     language = 'en_IN'
     publication_type = 'newspaper'
     masthead_url = 'http://www.epw.in/system/files/epw_masthead.png'
-    extra_css = """
-        body{font-family: Arial,Helvetica,sans-serif}
-    """
-    conversion_options = {'comment' : description,
-                          'tags' : category,
-                          'publisher' : publisher,
-                          'language' : language
-                          }
-    remove_tags_before = dict(name='h1', attrs={'class':'print-title'})
-    remove_tags_after = dict(name='div', attrs={'class':'print-content'})
-    remove_tags = [dict(name='div', attrs={'class':'terms'}),
-                   dict(name='span', attrs={'class':'print-link'})]
-    feeds = [(u'Editorials', u'http://www.epw.in/taxonomy/term/1/feed'),
-             (u'Commentary', u'http://www.epw.in/taxonomy/term/3/feed'),
-             (u'Insight', u'http://www.epw.in/taxonomy/term/14/feed'),
-             (u'Book Reviews', u'http://www.epw.in/taxonomy/term/4/feed'),
-             (u'Perspectives', u'http://www.epw.in/taxonomy/term/5/feed'),
-             (u'Special Articles', u'http://www.epw.in/taxonomy/term/6/feed'),
-             (u'Discussion', u'http://www.epw.in/taxonomy/term/7/feed'),
-             (u'Web Exclusives', u'http://www.epw.in/taxonomy/term/11087/feed')]
 
-    def print_version(self, url):
-        return url.replace('http://www.epw.in', 'http://www.epw.in/print')
+    keep_only_tags = [
+        dict(id=['block-system-main', 'page-title']),
+    ]
 
-    def postprocess_html(self, soup, first_fetch):
-        return self.adeify_images(soup)
+    def parse_index(self):
+        soup = self.index_to_soup('http://www.epw.in/')
+        main = soup.find('div', id='main-content')
+        sections = OrderedDict()
+        current_section = None
+
+        for div in main.findAll(attrs={'class':lambda x: x and 'views-field-title' in x.split()}):
+            section = self.tag_to_string(div.findParent(attrs={'class':'block-inner'}).find('h2'))
+            if section != current_section:
+                current_section = section
+                if section not in sections:
+                    sections[section] = []
+                    self.log('\n\n' + section)
+            title = self.tag_to_string(div)
+            a = div.find('a', href=True)
+            url = absurl(a['href'])
+            desc = ''
+            if a.get('title'):
+                desc = a['title']
+            else:
+                d = div.findNextSibling(attrs={'class':lambda x: x and 'views-field-body' in x.split()})
+                if d is not None:
+                    desc = self.tag_to_string(d)
+            self.log('\t', title, url)
+            self.log('\t\t', desc)
+            sections[current_section].append({'title':title, 'url':url, 'description':desc})
+
+        current_section = 'Web Exclusive'
+        sections[current_section] = []
+        self.log('\n\n' + current_section)
+        for div in main.findAll(attrs={'class':lambda x: x and 'views-field-nothing' in x.split()}):
+            title = self.tag_to_string(div)
+            elems = div.findAll('a', href=True)
+            desc, title = map(self.tag_to_string, elems)
+            url = absurl(elems[-1]['href'])
+            self.log('\t', title, url)
+            self.log('\t\t', desc)
+            sections[current_section].append({'title':title, 'url':url, 'description':desc})
+
+        return [(t, articles) for t, articles in sections.iteritems() if articles]
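A portability note on the recipe's last line: dict.iteritems() exists only on Python 2, consistent with the python2 shebang the commit adds; under Python 3 the same expression would use items(). A sketch of the Python 3 spelling (not part of the commit):

    # Python 3 spelling of the recipe's final line; behaviour is
    # unchanged, items() simply replaces the Python-2-only iteritems().
    return [(t, articles) for t, articles in sections.items() if articles]

To verify the fix, the usual calibre recipe workflow should apply: save the file as, say, epw.recipe (a stand-in name, not from the commit) and run ebook-convert epw.recipe .epub --test -vv, which downloads a couple of articles from the first couple of sections with verbose logging.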