From c0d533f7ec29e78bbf2fff4521ec9b46fb88453d Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Sat, 15 Apr 2023 21:24:44 +0530 Subject: [PATCH] Update Frontline and Outlook Magazine --- recipes/frontline.recipe | 56 +++++++++++++++++++----------------- recipes/outlook_india.recipe | 3 ++ 2 files changed, 33 insertions(+), 26 deletions(-) diff --git a/recipes/frontline.recipe b/recipes/frontline.recipe index b3bd2de3a2..7a95524c8d 100644 --- a/recipes/frontline.recipe +++ b/recipes/frontline.recipe @@ -1,7 +1,6 @@ from collections import defaultdict from calibre.web.feeds.news import BasicNewsRecipe, classes - class Frontline(BasicNewsRecipe): title = u'Frontline' __author__ = 'unkn0wn' @@ -16,18 +15,21 @@ class Frontline(BasicNewsRecipe): remove_attributes = ['height', 'width'] resolve_internal_links = True extra_css = ''' - .overline{ font-size:small; color:#404040; } - .person-name { font-size:small; font-weight:bold; } - .lead-img-caption, .caption-cont { font-size:small; text-align:center; } + .environment, .publish-time, .author { font-size:small; color:#404040; } + .caption { font-size:small; text-align:center; } + img { display:block; margin:0 auto; } + .question {font-weight:bold;} ''' keep_only_tags = [ - classes('article') + dict(name='div', attrs={'class':'container article-section'}) ] remove_tags = [ - classes('shareicon-article articleBottomLine secheader mobilesocialicons'), - dict(name='h2', attrs={'class':'title'}) + classes( + 'breadcrumb comments-shares share-page article-video ' + 'referpara slide-mobile title-patch hide-mobile related-stories' + ), ] def preprocess_html(self, soup): @@ -36,11 +38,11 @@ class Frontline(BasicNewsRecipe): source = img.findPrevious('source', srcset=True) img.extract() if source: - source['src'] = source['srcset'] + source['src'] = source['srcset'].replace('_320','_1200') source.name = 'img' else: img['src'] = img['data-original'] - for cap in soup.findAll(**classes('caption-cont')): + for cap in soup.findAll(**classes('caption')): cap.name = 'figcaption' return soup @@ -50,30 +52,32 @@ class Frontline(BasicNewsRecipe): return soup def parse_index(self): - soup = self.index_to_soup('https://frontline.thehindu.com/magazine/') - issue = soup.find(**classes('sptar-archive-item')).find('a')['href'] - self.log(issue) - soup = self.index_to_soup(issue) - time = soup.find(**classes('date')).findNext('h3') - if time: - self.timefmt = ' ' + self.tag_to_string(time) - self.log('Downloading Issue:', self.timefmt) - self.cover_url = soup.find(**classes('sptar-cover-item')).find('img')['data-original'].replace('FREE_320', 'FREE_810') + soup = self.index_to_soup('https://frontline.thehindu.com/current-issue/') + + if cover := soup.find('div', attrs={'class':'magazine'}): + self.cover_url = cover.find(**classes('sptar-image')).img['data-original'].replace('_320', '_1200') + self.log('Cover ', self.cover_url) + if desc := cover.find(**classes('sub-text')): + self.description = self.tag_to_string(desc) + feeds_dict = defaultdict(list) - for div in soup.findAll('div', attrs={'class':'brief-list-item'}): - a = div.find(**classes('brief-title')).find('a') + + mag = soup.find(**classes('section-magazine')) + for div in mag.findAll('div', attrs={'class':'content'}): + a = div.find(**classes('title')).find('a') url = a['href'] title = self.tag_to_string(a) section = 'Articles' - cat = div.find(**classes('brief-cat')) - if cat: + if cat := div.find(**classes('label')): section = self.tag_to_string(cat) desc = '' - art = div.find(**classes('artbody')) - if art: + + if art := div.find(**classes('sub-text')): desc = self.tag_to_string(art) + if auth := div.find(**classes('author')): + desc = self.tag_to_string(auth) + ' | ' + desc if not url or not title: continue self.log(section, '\n\t', title, '\n\t', desc, '\n\t\t', url) - feeds_dict[section].append({"title": title, "url": url}) - return [(section, articles) for section, articles in feeds_dict.items()] + feeds_dict[section].append({"title": title, "url": url, "description": desc}) + return [(section, articles) for section, articles in feeds_dict.items()] \ No newline at end of file diff --git a/recipes/outlook_india.recipe b/recipes/outlook_india.recipe index f88714f386..a50f177c9d 100644 --- a/recipes/outlook_india.recipe +++ b/recipes/outlook_india.recipe @@ -32,6 +32,9 @@ class outlook(BasicNewsRecipe): ) ] + def get_browser(self): + return BasicNewsRecipe.get_browser(self, user_agent='common_words/based') + def parse_index(self): soup = self.index_to_soup('https://www.outlookindia.com/magazine') div = soup.find('div', attrs={'class':'wrapper'})