From 383e16cff2aba5c7a998c3dce0ee6727bd5c8ae7 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 20 Jan 2016 14:59:39 +0530 Subject: [PATCH] Update Business World Magazine Fixes #1534463 [Business World Indian Recipe not working](https://bugs.launchpad.net/calibre/+bug/1534463) --- recipes/businessworldin.recipe | 73 ++++++++-------------------------- 1 file changed, 16 insertions(+), 57 deletions(-) diff --git a/recipes/businessworldin.recipe b/recipes/businessworldin.recipe index cb5f443e9f..02f58f6757 100644 --- a/recipes/businessworldin.recipe +++ b/recipes/businessworldin.recipe @@ -4,73 +4,32 @@ __copyright__ = '2009-2010, Darko Miletic ' www.businessworld.in ''' -import re from calibre.web.feeds.news import BasicNewsRecipe class BusinessWorldMagazine(BasicNewsRecipe): title = 'Business World Magazine' __author__ = 'Kovid Goyal' description = 'News from India' - publisher = 'ABP Pvt Ltd Publication' category = 'news, politics, finances, India, Asia' - delay = 1 no_stylesheets = True - INDEX = 'http://www.businessworld.in/businessworld/magazine_latest_issue.php' - ROOT = 'http://www.businessworld.in' encoding = 'utf-8' language = 'en_IN' - auto_cleanup = True - - def parse_index(self): - br = self.browser - br.open(self.ROOT) - raw = br.open(br.click_link(text_regex=re.compile('Current.*Issue', - re.I))).read() - soup = self.index_to_soup(raw) - mc = soup.find(attrs={'class':'mag_cover'}) - if mc is not None: - img = mc.find('img', src=True) - if img is not None: - self.cover_url = img['src'] - - feeds = [] - current_section = None - articles = [] - for tag in soup.findAll(['h3', 'h2']): - inner_a = tag.find('a') - if tag.name == 'h3' and inner_a is not None: - continue - if tag.name == 'h2' and (inner_a is None or current_section is - None): - continue - - if tag.name == 'h3': - if current_section is not None and articles: - feeds.append((current_section, articles)) - current_section = self.tag_to_string(tag) - self.log('Found section:', current_section) - articles = [] - elif tag.name == 'h2': - url = inner_a.get('href', None) - if url is None: continue - if url.startswith('/'): url = self.ROOT + url - title = self.tag_to_string(inner_a) - h1 = tag.findPreviousSibling('h1') - if h1 is not None: - title = self.tag_to_string(h1) + title - self.log('\tFound article:', title) - articles.append({'title':title, 'url':url, 'date':'', - 'description':''}) - - if current_section and articles: - feeds.append((current_section, articles)) - - return feeds - - - - - + oldest_article = 2 + keep_only_tags = [ + dict(attrs={'class':['main-article']}), + ] + remove_tags = [ + dict(id='video_n_ad_div'), + dict(attrs={'class':['meta-tools', 'social-article']}), + ] + remove_tags_after = dict(attrs={'class':'social-article'}) + feeds = ['http://www.businessworld.in/rss/all-article.xml'] + def preprocess_html(self, soup): + for img in soup.findAll('img', attrs={'data-original':True}): + img['src'] = img['data-original'] + for ins in soup.findAll(attrs={'class':'adsbygoogle'}): + ins.parent.extract() + return soup