From afe7d69681c4e23f9ad740b4d10fefa7679cb33d Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Tue, 5 Oct 2021 10:03:10 +0530
Subject: [PATCH] Update Entrepreneur Magazine

Fixes #1945569 [Fetching news from Entrepreneur Magazine fails](https://bugs.launchpad.net/calibre/+bug/1945569)
---
NOTE(review): line structure restored from a whitespace-collapsed copy of this
patch. Blank context lines were not recoverable from that copy; they are
placed to satisfy the hunk header counts (12 context lines) -- verify their
positions against the target file before applying.

 recipes/entrepeneur.recipe | 47 +++++++++++++++-----------------------
 1 file changed, 19 insertions(+), 28 deletions(-)

diff --git a/recipes/entrepeneur.recipe b/recipes/entrepeneur.recipe
index b6aa5d52a9..5a95f204f3 100644
--- a/recipes/entrepeneur.recipe
+++ b/recipes/entrepeneur.recipe
@@ -23,40 +23,31 @@ class EntrepeneurMagRecipe(BasicNewsRecipe):
     remove_javascript = True
 
     keep_only_tags = [
-        dict(attrs={'class': ['headline', 'hero topimage']}),
-        dict(itemprop='articlebody'),
+        dict(attrs={'data-word-count': True}),
     ]
     remove_tags = [
         dict(attrs={'class': ['related-content']}),
     ]
     remove_attributes = ['style']
 
-    INDEX = 'http://www.entrepreneur.com'
+    INDEX = 'https://www.entrepreneur.com'
 
     def parse_index(self):
-        root = self.index_to_soup(
-            self.INDEX + '/magazine/index.html', as_tree=True)
-        for href in root.xpath('//h2[@class="sectiontitle nb"]/a/@href'):
-            return self.parse_ent_index(self.INDEX + href)
+        soup = self.index_to_soup(self.INDEX + '/latest')
+        articles = []
+        for h3 in soup.findAll('h3'):
+            a = h3.parent
+            if a.name == 'a' and a.get('href'):
+                url = self.INDEX + a['href']
+                title = self.tag_to_string(h3)
+                desc = ''
+                if a.next_sibling and a.next_sibling.name == 'p':
+                    desc = self.tag_to_string(a.next_sibling)
+                articles.append({'title': title, 'url': url, 'description': desc})
+                self.log(title, url)
+        return [('Articles', articles)]
 
-    def parse_ent_index(self, url):
-        root = self.index_to_soup(url, as_tree=True)
-        img = root.xpath('//a[@class="hero"]/img[@class="lazy"]')[0]
-        self.cover_url = img.get('data-original')
-        self.timefmt = ' [%s]' % img.get('alt').rpartition('-')[-1].strip()
-        body = root.xpath('//div[@id="latest"]')[0]
-        ans = []
-        for x in body.xpath('descendant::h3'):
-            title = self.tag_to_string(x)
-            try:
-                a = x.xpath('./a')[0]
-            except IndexError:
-                continue
-            url = self.INDEX + a.get('href')
-            d = x.getnext()
-            desc = self.tag_to_string(d) if d is not None else ''
-            self.log('\t', title, 'at:', url)
-            self.log('\t\t', desc)
-            ans.append({'title': title, 'url': url, 'description': desc})
-
-        return [('Articles', ans)]
+    def preprocess_html(self, soup):
+        for img in soup.findAll('img', attrs={'data-src': True}):
+            img['src'] = img['data-src']
+        return soup