From 8f1573b628bc337884de02c84df2305f5ed7308c Mon Sep 17 00:00:00 2001 From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com> Date: Wed, 8 Nov 2023 10:37:49 +0530 Subject: [PATCH] TOI Print update --- ...heeconomictimes_india_print_edition.recipe | 3 +- recipes/toiprint.recipe | 45 ++++++++++--------- recipes/wsj.recipe | 1 + recipes/wsj_free.recipe | 1 + 4 files changed, 27 insertions(+), 23 deletions(-) diff --git a/recipes/theeconomictimes_india_print_edition.recipe b/recipes/theeconomictimes_india_print_edition.recipe index c610386a3e..293eb63e0d 100644 --- a/recipes/theeconomictimes_india_print_edition.recipe +++ b/recipes/theeconomictimes_india_print_edition.recipe @@ -79,11 +79,10 @@ class TheEconomicTimes(BasicNewsRecipe): for h3 in section.findAll(("h1", "h3", "h4", "h5")): span = h3.find( 'span', - href=lambda x: x and x.startswith('/epaper/'), + href=lambda x: x and x.startswith('https://economictimes.indiatimes.com/epaper/'), attrs={'class': 'banner'} ) url = span['href'] - url = 'https://economictimes.indiatimes.com' + url title = self.tag_to_string(span) div = h3.find_next_sibling('div', attrs={'class': 'dsc'}) if div is not None: diff --git a/recipes/toiprint.recipe b/recipes/toiprint.recipe index b2a961cf82..9f3127b14e 100644 --- a/recipes/toiprint.recipe +++ b/recipes/toiprint.recipe @@ -1,6 +1,7 @@ from calibre.web.feeds.news import BasicNewsRecipe import json from datetime import date +from collections import defaultdict # default edition is Delhi i.e., 'cap' @@ -54,33 +55,32 @@ class toiprint(BasicNewsRecipe): url = index + '/DayIndex/' + date_ + '_' + le + '.json' raw = self.index_to_soup(url, raw=True) data = json.loads(raw) - if 'DigitalIndex' not in data: + if 'DayIndex' not in data: raise ValueError( 'The Times of India Newspaper is not published today.' ) - data = data['DigitalIndex'] - feeds = [] + data = data['DayIndex'] + feeds_dict = defaultdict(list) for link in data: sec_name = link['PageTitle'] + if sec_name == 'Advertisement': + continue self.log(sec_name) articles = [] - if 'Views' in link: - for sec in link['Views']: - if 'Articles' in sec: - for art in sec['Articles']: - if 'ArticleName' not in art: - continue - url = art['ArticleName'] - title = art.get('ArticleTitle', 'unknown').replace('
', '') - if art.get('ColumnTitle', '') == '': - desc = 'Page No.' + url.split('_')[-3] + ' | ' + art.get('ArticleBody', '') - else: - desc = 'Page No.' + url.split('_')[-3] + ' | ' + art.get('ColumnTitle', '') - self.log('\t', title, '\n\t', desc, '\n\t\t', url) - articles.append({'title': title, 'description':desc, 'url': url}) - if articles: - feeds.append((sec_name, articles)) - return feeds + if 'Articles' in link: + for art in link['Articles']: + section = sec_name + if 'ArticleName' not in art: + continue + url = art['ArticleName'] + title = art.get('ArticleTitle', 'unknown').replace('
', '') + if art.get('ColumnTitle', '') == '': + desc = 'Page No.' + url.split('_')[-3] + ' | ' + art.get('ArticleBody', '') + else: + desc = 'Page No.' + url.split('_')[-3] + ' | ' + art.get('ColumnTitle', '') + self.log('\t', title, '\n\t', desc.replace('\n', '')) + feeds_dict[section].append({"title": title, "url": url, "description": desc}) + return [(section, articles) for section, articles in feeds_dict.items()] def preprocess_raw_html(self, raw, *a): data = json.loads(raw) @@ -107,8 +107,11 @@ class toiprint(BasicNewsRecipe): elif 'ZoneText' in x: body += '

' + x['ZoneText'] + '

' return '
' \ - + body.replace('
', '

').replace('
', '

').replace('<br>', '

').replace('\n', '

') \ + + body.replace('
', '

').replace('
', '

').replace('<br>', '

').replace('\n', '
') \ + '

' def print_version(self, url): return index + '/ArticleZoneJson/' + url.split('_')[-3] + '/' + url + '.json' + + def populate_article_metadata(self, article, soup, first): + article.url = '***' diff --git a/recipes/wsj.recipe b/recipes/wsj.recipe index ceeb4fd08f..9a8dbbff09 100644 --- a/recipes/wsj.recipe +++ b/recipes/wsj.recipe @@ -35,6 +35,7 @@ class WSJ(BasicNewsRecipe): __author__ = 'Kovid Goyal' description = 'News and current affairs' language = 'en' + masthead_url = 'https://s.wsj.net/media/wsj_amp_masthead_lg.png' compress_news_images = True compress_news_images_auto_size = 7 diff --git a/recipes/wsj_free.recipe b/recipes/wsj_free.recipe index 6b5a264c50..8d45770393 100644 --- a/recipes/wsj_free.recipe +++ b/recipes/wsj_free.recipe @@ -35,6 +35,7 @@ class WSJ(BasicNewsRecipe): __author__ = 'Kovid Goyal' description = 'News and current affairs' language = 'en' + masthead_url = 'https://s.wsj.net/media/wsj_amp_masthead_lg.png' compress_news_images = True compress_news_images_auto_size = 7