from calibre.web.feeds.news import BasicNewsRecipe

try:
    from urllib.parse import quote
except ImportError:
    # Python 2 fallback: quote lives directly in urllib there.
    from urllib import quote


class EconomiaMagazine(BasicNewsRecipe):
    """Recipe for Economia magazine, scraped from economia.icaew.com.

    The home page is searched for the magazine cover image; the link wrapping
    that image is followed to the issue page, whose ``articlePreview`` blocks
    are turned into sections of articles.
    """

    title = u'Economia Magazine'
    __author__ = 'Kovid Goyal'
    description = 'Economia - Intelligence & Insight for ICAEW Members'
    language = 'en_GB'
    # Site root; relative hrefs/srcs found in the pages are appended to it.
    BASE = 'http://economia.icaew.com/'

    no_stylesheets = True
    # Matchers for the article headline, any <figure> carrying the "figure"
    # class, and elements whose class is "intro" or "articleCopy".
    keep_only_tags = [
        dict(name='h1'),
        dict(name='figure', attrs={
            'class': lambda x: x and 'figure' in x.split()}),
        dict(attrs={'class': 'intro articleCopy'.split()})
    ]

    @classmethod
    def image_url_processor(cls, baseurl, iurl):
        """Return the absolute URL to fetch for an image reference.

        :param baseurl: URL prefix the image reference is appended to.
        :param iurl: Image reference as found in the page; may be empty.
        :return: ``baseurl`` plus the percent-quoted ``iurl``, or
            ``baseurl + '404.jpeg'`` when ``iurl`` is empty/None.
        """
        if iurl:
            return baseurl + quote(iurl)
        # No usable source: point at a placeholder name rather than at the
        # bare base URL.
        return baseurl + '404.jpeg'

    def preprocess_raw_html(self, raw_html, url):
        """Strip empty ``src=""`` attributes from the raw page markup.

        :param raw_html: Page markup as a string.
        :param url: URL the markup came from (unused here).
        :return: ``raw_html`` with every ``src=""`` occurrence removed.
        """
        return raw_html.replace('src=""', '')

    def parse_index(self):
        """Build the list of sections and articles for the current issue.

        Side effects: sets ``self.cover_url`` from the cover image found on
        the home page and ``self.timefmt`` from the issue page's ``<title>``
        (the part before the first ``|``).

        :return: List of ``(section_title, articles)`` tuples, where each
            article is a dict with ``title``, ``url`` and ``description``.
            Sections without any usable article are omitted.
        :raises ValueError: If the cover image (and therefore the link to
            the issue page) cannot be found on the home page.
        """
        soup = self.index_to_soup(self.BASE)
        img = soup.find('img', src=lambda x: x and 'Magazine covers' in x)
        if img is None:
            # Without the cover image there is no link to the issue page;
            # fail with a message that says so instead of a TypeError.
            raise ValueError(
                'Could not find the magazine cover image on ' + self.BASE)
        self.cover_url = self.BASE + quote(img['src'].encode('utf-8'))

        # The cover image's parent element carries the issue page link.
        soup = self.index_to_soup(self.BASE + img.parent['href'])
        self.timefmt = ' [%s]' % self.tag_to_string(
            soup.find('title')).split('|')[0].strip()

        ans = []
        for div in soup.findAll('div', attrs={'class': 'articlePreview'}):
            h2 = div.find('h2')
            section_title = self.tag_to_string(h2).strip()
            self.log('Found section:', section_title)
            articles = []
            for li in div.findAll('li'):
                h3 = li.find('h3')
                a = h3.find('a', href=True) if h3 is not None else None
                if a is None:
                    # List item without a linked headline: nothing to
                    # download, so skip it rather than abort the whole run.
                    continue
                title = self.tag_to_string(h3)
                url = self.BASE + a['href']
                p = li.find('p')
                self.log('\t', title, 'at', url)
                articles.append({'title': title, 'url': url,
                                 'description': self.tag_to_string(p)})
            if articles:
                ans.append((section_title, articles))
        return ans