TOI Print update

2025-12-21 12:27:20 -05:00 · 2023-11-08 10:37:49 +05:30 · 2023-11-08 10:37:49 +05:30 · 8f1573b628
commit 8f1573b628
parent cb1f70e816
4 changed files with 27 additions and 23 deletions
--- a/recipes/theeconomictimes_india_print_edition.recipe
+++ b/recipes/theeconomictimes_india_print_edition.recipe
@ -79,11 +79,10 @@ class TheEconomicTimes(BasicNewsRecipe):
            for h3 in section.findAll(("h1", "h3", "h4", "h5")):
                span = h3.find(
                    'span',
-                    href=lambda x: x and x.startswith('/epaper/'),
+                    href=lambda x: x and x.startswith('https://economictimes.indiatimes.com/epaper/'),
                    attrs={'class': 'banner'}
                )
                url = span['href']
-                url = 'https://economictimes.indiatimes.com' + url
                title = self.tag_to_string(span)
                div = h3.find_next_sibling('div', attrs={'class': 'dsc'})
                if div is not None:
--- a/recipes/toiprint.recipe
+++ b/recipes/toiprint.recipe
@ -1,6 +1,7 @@
 from calibre.web.feeds.news import BasicNewsRecipe
 import json
 from datetime import date
+from collections import defaultdict


 # default edition is Delhi i.e., 'cap'
@ -54,33 +55,32 @@ class toiprint(BasicNewsRecipe):
        url = index + '/DayIndex/' + date_ + '_' + le + '.json'
        raw = self.index_to_soup(url, raw=True)
        data = json.loads(raw)
-        if 'DigitalIndex' not in data:
+        if 'DayIndex' not in data:
            raise ValueError(
                    'The Times of India Newspaper is not published today.'
                )
-        data = data['DigitalIndex']
-        feeds = []
+        data = data['DayIndex']
+        feeds_dict = defaultdict(list)
        for link in data:
            sec_name = link['PageTitle']
+            if sec_name == 'Advertisement':
+                continue
            self.log(sec_name)
            articles = []
-            if 'Views' in link:
-                for sec in link['Views']:
-                    if 'Articles' in sec:
-                        for art in sec['Articles']:
-                            if 'ArticleName' not in art:
-                                continue
-                            url = art['ArticleName']
-                            title = art.get('ArticleTitle', 'unknown').replace('<br>', '')
-                            if art.get('ColumnTitle', '') == '':
-                                desc = 'Page No.' + url.split('_')[-3] + ' | ' + art.get('ArticleBody', '')
-                            else:
-                                desc = 'Page No.' + url.split('_')[-3] + ' | ' + art.get('ColumnTitle', '')
-                            self.log('\t', title, '\n\t', desc, '\n\t\t', url)
-                            articles.append({'title': title, 'description':desc, 'url': url})
-            if articles:
-                feeds.append((sec_name, articles))
-        return feeds
+            if 'Articles' in link:
+                for art in link['Articles']:
+                    section = sec_name
+                    if 'ArticleName' not in art:
+                        continue
+                    url = art['ArticleName']
+                    title = art.get('ArticleTitle', 'unknown').replace('<br>', '')
+                    if art.get('ColumnTitle', '') == '':
+                        desc = 'Page No.' + url.split('_')[-3] + ' | ' + art.get('ArticleBody', '')
+                    else:
+                        desc = 'Page No.' + url.split('_')[-3] + ' | ' + art.get('ColumnTitle', '')
+                    self.log('\t', title, '\n\t', desc.replace('\n', ''))
+                    feeds_dict[section].append({"title": title, "url": url, "description": desc})
+        return [(section, articles) for section, articles in feeds_dict.items()]

    def preprocess_raw_html(self, raw, *a):
        data = json.loads(raw)
@ -107,8 +107,11 @@ class toiprint(BasicNewsRecipe):
            elif 'ZoneText' in x:
                body += '<p><i>' + x['ZoneText'] + '</i></p>'
        return '<html><body><div>' \
-                    + body.replace('<br>', '<p>').replace('<br/>', '<p>').replace('&lt;br&gt;', '<p>').replace('\n', '<div>') \
+                    + body.replace('<br>', '<p>').replace('<br/>', '<p>').replace('&lt;br&gt;', '<p>').replace('\n', '<br>') \
                        + '</div></body></html>'

    def print_version(self, url):
        return index + '/ArticleZoneJson/' + url.split('_')[-3] + '/' + url + '.json'
+
+    def populate_article_metadata(self, article, soup, first):
+        article.url = '***'
--- a/recipes/wsj.recipe
+++ b/recipes/wsj.recipe
@ -35,6 +35,7 @@ class WSJ(BasicNewsRecipe):
    __author__ = 'Kovid Goyal'
    description = 'News and current affairs'
    language = 'en'
+    masthead_url = 'https://s.wsj.net/media/wsj_amp_masthead_lg.png'

    compress_news_images = True
    compress_news_images_auto_size = 7
--- a/recipes/wsj_free.recipe
+++ b/recipes/wsj_free.recipe
@ -35,6 +35,7 @@ class WSJ(BasicNewsRecipe):
    __author__ = 'Kovid Goyal'
    description = 'News and current affairs'
    language = 'en'
+    masthead_url = 'https://s.wsj.net/media/wsj_amp_masthead_lg.png'

    compress_news_images = True
    compress_news_images_auto_size = 7