From 3e5f0e4bc75e0962b2a7a8ae6e8292317895eccf Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sat, 9 Jan 2016 16:34:06 +0530
Subject: [PATCH] Update Economic and Political Weekly

Fixes #1532410 [Economic and Political Weekly recipe unable to download all sections in calibre](https://bugs.launchpad.net/calibre/+bug/1532410)
---
 recipes/epw.recipe | 80 ++++++++++++++++++++++++++++++----------------
 1 file changed, 53 insertions(+), 27 deletions(-)

diff --git a/recipes/epw.recipe b/recipes/epw.recipe
index e57014a9d4..779b96ef24 100644
--- a/recipes/epw.recipe
+++ b/recipes/epw.recipe
@@ -1,11 +1,18 @@
-__license__ = 'GPL v3'
-__copyright__ = '2014-2015, Karthik , Krittika Goyal'
+#!/usr/bin/env python2
+# vim:fileencoding=utf-8
+# License: GPLv3 Copyright: 2015, Kovid Goyal
 
+from collections import OrderedDict
 from calibre.web.feeds.news import BasicNewsRecipe
 
+def absurl(x):
+    if x.startswith('/'):
+        x = 'http://www.epw.in' + x
+    return x
+
 class EconomicAndPoliticalWeekly(BasicNewsRecipe):
     title = 'Economic and Political Weekly'
-    __author__ = 'Karthik K, Krittika Goyal'
+    __author__ = 'Kovid Goyal'
     description = 'Economic and Political news from India'
     publisher = 'epw.in'
     category = 'news, finances, politics, India'
@@ -18,29 +25,48 @@ class EconomicAndPoliticalWeekly(BasicNewsRecipe):
     language = 'en_IN'
     publication_type = 'newspaper'
     masthead_url = 'http://www.epw.in/system/files/epw_masthead.png'
-    extra_css = """
-                  body{font-family: Arial,Helvetica,sans-serif}
-                """
-    conversion_options = {'comment' : description,
-                          'tags' : category,
-                          'publisher' : publisher,
-                          'language' : language
-                          }
-    remove_tags_before = dict(name='h1', attrs={'class':'print-title'})
-    remove_tags_after = dict(name='div', attrs={'class':'print-content'})
-    remove_tags = [dict(name='div', attrs={'class':'terms'}),
-                   dict(name='span', attrs={'class':'print-link'})]
-    feeds = [(u'Editorials', u'http://www.epw.in/taxonomy/term/1/feed'),
-             (u'Commentary', u'http://www.epw.in/taxonomy/term/3/feed'),
-             (u'Insight', u'http://www.epw.in/taxonomy/term/14/feed'),
-             (u'Book Reviews', u'http://www.epw.in/taxonomy/term/4/feed'),
-             (u'Perspectives', u'http://www.epw.in/taxonomy/term/5/feed'),
-             (u'Special Articles', u'http://www.epw.in/taxonomy/term/6/feed'),
-             (u'Discussion', u'http://www.epw.in/taxonomy/term/7/feed'),
-             (u'Web Exclusives', u'http://www.epw.in/taxonomy/term/11087/feed')]
 
-    def print_version(self, url):
-        return url.replace('http://www.epw.in', 'http://www.epw.in/print')
+    keep_only_tags = [
+        dict(id=['block-system-main', 'page-title']),
+    ]
 
-    def postprocess_html(self, soup, first_fetch):
-        return self.adeify_images(soup)
+    def parse_index(self):
+        soup = self.index_to_soup('http://www.epw.in/')
+        main = soup.find('div', id='main-content')
+        sections = OrderedDict()
+        current_section = None
+
+        for div in main.findAll(attrs={'class':lambda x: x and 'views-field-title' in x.split()}):
+            section = self.tag_to_string(div.findParent(attrs={'class':'block-inner'}).find('h2'))
+            if section != current_section:
+                current_section = section
+                if section not in sections:
+                    sections[section] = []
+                self.log('\n\n' + section)
+            title = self.tag_to_string(div)
+            a = div.find('a', href=True)
+            url = absurl(a['href'])
+            desc = ''
+            if a.get('title'):
+                desc = a['title']
+            else:
+                d = div.findNextSibling(attrs={'class':lambda x: x and 'views-field-body' in x.split()})
+                if d is not None:
+                    desc = self.tag_to_string(d)
+            self.log('\t', title, url)
+            self.log('\t\t', desc)
+            sections[current_section].append({'title':title, 'url':url, 'description':desc})
+
+        current_section = 'Web Exclusive'
+        sections[current_section] = []
+        self.log('\n\n' + current_section)
+        for div in main.findAll(attrs={'class':lambda x: x and 'views-field-nothing' in x.split()}):
+            title = self.tag_to_string(div)
+            elems = div.findAll('a', href=True)
+            desc, title = map(self.tag_to_string, elems)
+            url = absurl(elems[-1]['href'])
+            self.log('\t', title, url)
+            self.log('\t\t', desc)
+            sections[current_section].append({'title':title, 'url':url, 'description':desc})
+
+        return [(t, articles) for t, articles in sections.iteritems() if articles]