Merge branch 'master' of https://github.com/unkn0w7n/calibre

2025-08-11 09:13:57 -04:00 · 2024-07-23 19:23:49 +05:30 · 2024-07-23 19:23:49 +05:30 · ab5ff807af
commit ab5ff807af
parent ab9cb22eac 07d32a87a5
3 changed files with 51 additions and 38 deletions
--- a/recipes/business_standard.recipe
+++ b/recipes/business_standard.recipe
@ -1,7 +1,8 @@
+#!/usr/bin/env python
+# vim:fileencoding=utf-8
 import json
 from datetime import datetime

-from calibre.ptempfile import PersistentTemporaryFile
 from calibre.web.feeds.news import BasicNewsRecipe
 from html5_parser import parse

@ -28,7 +29,22 @@ class BusinessStandard(BasicNewsRecipe):
    ignore_duplicate_articles = {'title', 'url'}
    remove_empty_feeds = True
    resolve_internal_links = True
-    max_articles_per_feed = 20
+    max_articles_per_feed = 50
+    oldest_article = 1.15
+
+    # Declares one recipe-specific option, 'days'. Its default is the string
+    # form of the class-level oldest_article defined just above (1.15).
+    recipe_specific_options = {
+        'days': {
+            'short': 'Oldest article to download from this news source. In days ',
+            'long': 'For example, 0.5, gives you articles from the past 12 hours',
+            'default': str(oldest_article)
+        }
+    }
+
+    def __init__(self, *args, **kwargs):
+        # Run the base-class initializer first, forwarding every argument unchanged.
+        BasicNewsRecipe.__init__(self, *args, **kwargs)
+        # Read the 'days' entry from the instance's recipe_specific_options.
+        # NOTE(review): presumably the base initializer has replaced the
+        # class-level definition dict with the user's value as a string by
+        # this point -- confirm against BasicNewsRecipe.
+        d = self.recipe_specific_options.get('days')
+        # Only a non-empty string overrides oldest_article; anything else
+        # (missing key, dict, empty string) leaves the class default in place.
+        # float() raises ValueError if the string is not numeric.
+        if d and isinstance(d, str):
+            self.oldest_article = float(d)

    extra_css = '''
        img {display:block; margin:0 auto;}
@ -36,35 +52,21 @@ class BusinessStandard(BasicNewsRecipe):
        .cap { font-size:small; text-align:center; }
    '''

-    articles_are_obfuscated = True
-
-    def get_obfuscated_article(self, url):
-        br = self.get_browser()
-        soup = self.index_to_soup(url)
-        link = soup.find('a', attrs={'href':lambda x: x and x.startswith('https://www.business-standard.com')})
-        skip_sections =[ # add sections you want to skip
-            '/video/', '/videos/', '/multimedia/',
+    # (section title, RSS URL) pairs; see https://www.business-standard.com/rss-feeds/listing
+    feeds = [
+        ('Top Stories', 'https://www.business-standard.com/rss/home_page_top_stories.rss'),
+        ('Todays Paper', 'https://www.business-standard.com/rss/todays-paper.rss'),
+        ('Budget', 'https://www.business-standard.com/rss/budget-110.rss'),
+        ('Economy', 'https://www.business-standard.com/rss/economy-102.rss'),
+        ('Opinion', 'https://www.business-standard.com/rss/opinion-105.rss'),
+        ('Companies', 'https://www.business-standard.com/rss/companies-101.rss'),
+        ('Industries', 'https://www.business-standard.com/rss/industry-217.rss'),
+        ('Market', 'https://www.business-standard.com/rss/markets-106.rss'),
+        ('Politics', 'https://www.business-standard.com/rss/politics-155.rss'),
+        ('World', 'https://www.business-standard.com/rss/world-news-212.rss'),
+        ('Technology', 'https://www.business-standard.com/rss/technology-108.rss'),
+        ('Latest', 'https://www.business-standard.com/rss/latest.rss')
    ]
-        if any(x in link['href'] for x in skip_sections):
-            self.abort_article('skipping video links ', link['href'])
-        self.log('Found ', link['href'])
-        html = br.open(link['href']).read()
-        pt = PersistentTemporaryFile('.html')
-        pt.write(html)
-        pt.close()
-        return pt.name
-
-    feeds = []
-
-    sections = [
-        'india-news', 'economy', 'opinion', 'markets', 'companies', 'industry', 'finance', 'world-news',
-    #    'politics', 'cricket', 'sports', 'technology', 'book', 'education', 'specials'
-    ]
-
-    for sec in sections:
-        a = 'https://news.google.com/rss/search?q=when:27h+allinurl:business-standard.com{}&hl=en-IN&gl=IN&ceid=IN:en'
-        feeds.append((sec.capitalize(), a.format('%2F' + sec + '%2F')))
-    # feeds.append(('Others', a.format('')))

    def preprocess_raw_html(self, raw, *a):
        root = parse(raw)
@ -112,9 +114,3 @@ class BusinessStandard(BasicNewsRecipe):
        body = data['htmlContent']

        return '<html><body>' + cat + title + subhead + auth + lede + caption + '<div><p></p>' + body + '</div></body></html>'
-
-    def populate_article_metadata(self, article, soup, first):
-        article.url = soup.find('h1')['title']
-        article.summary = self.tag_to_string(soup.find('h3'))
-        article.text_summary = self.tag_to_string(soup.find('h3'))
-        article.title = article.title.replace(' - Business Standard', '')
--- a/recipes/business_today.recipe
+++ b/recipes/business_today.recipe
@ -1,3 +1,5 @@
+#!/usr/bin/env python
+# vim:fileencoding=utf-8
 from calibre.web.feeds.news import BasicNewsRecipe, classes


@ -29,6 +31,14 @@ class BT(BasicNewsRecipe):
            'banner_content'
        )
    ]
+
+    # Declares one recipe-specific option, 'date' (YYYY-MM-DD); no default
+    # value is given for it.
+    recipe_specific_options = {
+        'date': {
+            'short': 'The date of the edition to download (YYYY-MM-DD format)',
+            'long': 'For example, 2024-07-07'
+        }
+    }
+
    extra_css = '''
        img {display:block; margin:0 auto;}
        em { color:#202020; }
@ -43,8 +53,14 @@ class BT(BasicNewsRecipe):
        )
        soup = self.index_to_soup('https://www.businesstoday.in')
        a = soup.findAll('a', attrs={'class':'mag_sld_img'})[1]
-        self.cover_url = a.img['data-src'].split('?')[0]
        url = a['href']
+
+        d = self.recipe_specific_options.get('date')
+        if d and isinstance(d, str):
+            url = 'https://www.businesstoday.in/magazine/issue/' + d
+        else:
+            self.cover_url = a.img['data-src'].split('?')[0]
+
        self.log('issue =', url)
        self.timefmt = ' [' + url.split('/')[-1] + ']'
        soup = self.index_to_soup(url)
--- a/recipes/livemint.recipe
+++ b/recipes/livemint.recipe
@ -113,7 +113,8 @@ class LiveMint(BasicNewsRecipe):
                'trendingSimilarHeight moreNews mobAppDownload label msgError msgOk taboolaHeight gadgetSlider'
                ' socialHolder imgbig disclamerText disqus-comment-count openinApp2 lastAdSlot bs_logo author-widget'
                ' datePublish sepStory premiumSlider moreStory Joinus moreAbout milestone benefitText checkCibilBtn'
-            )
+            ),
+            dict(attrs={'class':lambda x: x and x.startswith('storyPage_alsoRead__')})
        ]

        feeds = [