This commit is contained in:
Kovid Goyal 2024-07-24 11:47:19 +05:30
commit d9561e2321
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
7 changed files with 88 additions and 28 deletions

View File

@@ -1,3 +1,5 @@
#!/usr/bin/env python
# vim:fileencoding=utf-8
import re
from collections import defaultdict
from datetime import date
@@ -77,12 +79,23 @@ class barrons(BasicNewsRecipe):
]
return br
recipe_specific_options = {
'date': {
'short': 'The date of the edition to download (YYYYMMDD format)',
'long': 'For example, 20240722.\nIf it didn\'t work, try again later.'
}
}
def parse_index(self):
self.log(
'\n***\nif this recipe fails, report it on: '
'https://www.mobileread.com/forums/forumdisplay.php?f=228\n***\n'
)
archive = self.index_to_soup('https://www.barrons.com/magazine?archives=' + date.today().strftime('%Y'))
issue_url = 'https://www.barrons.com/magazine?archives=' + date.today().strftime('%Y')
d = self.recipe_specific_options.get('date')
if d and isinstance(d, str):
issue_url = 'https://www.barrons.com/magazine?archives=' + d
archive = self.index_to_soup(issue_url)
issue = archive.find(**prefixed_classes('BarronsTheme--archive-box--'))
self.timefmt = ' [' + self.tag_to_string(issue.find(**prefixed_classes('BarronsTheme--date--'))) + ']'
self.description = self.tag_to_string(issue.find(**prefixed_classes('BarronsTheme--headline--')))

View File

@@ -51,14 +51,14 @@ class BT(BasicNewsRecipe):
'\n***\nif this recipe fails, report it on: '
'https://www.mobileread.com/forums/forumdisplay.php?f=228\n***\n'
)
soup = self.index_to_soup('https://www.businesstoday.in')
a = soup.findAll('a', attrs={'class':'mag_sld_img'})[1]
url = a['href']
d = self.recipe_specific_options.get('date')
if d and isinstance(d, str):
url = 'https://www.businesstoday.in/magazine/issue/' + d
else:
soup = self.index_to_soup('https://www.businesstoday.in')
a = soup.findAll('a', attrs={'class':'mag_sld_img'})[1]
url = a['href']
self.cover_url = a.img['data-src'].split('?')[0]
self.log('issue =', url)

View File

@@ -55,7 +55,10 @@ class ht(BasicNewsRecipe):
if p and isinstance(p, str):
today = p
self.timefmt = ' [%s]' % today
day, month, year = (int(x) for x in today.split('/'))
dt = date(year, month, day)
self.timefmt = ' [%s]' % dt.strftime('%b %d, %Y')
today = today.replace('/', '%2F')

View File

@@ -1,3 +1,5 @@
#!/usr/bin/env python
# vim:fileencoding=utf-8
'''
https://www.cirsd.org/en/horizons
'''
@@ -28,23 +30,34 @@ class horizons(BasicNewsRecipe):
classes('back-link'),
dict(name='div', attrs={'class':'single-post-footer'})
]
recipe_specific_options = {
'issue_url': {
'short': 'The issue URL ',
'long': 'For example, https://www.cirsd.org/en/horizons/horizons-winter-2024--issue-no-25',
}
}
def get_browser(self):
return BasicNewsRecipe.get_browser(self, verify_ssl_certificates=False)
def parse_index(self):
soup = self.index_to_soup('https://www.cirsd.org/en/horizons')
a = soup.findAll('a', href=True, attrs={'class':'horizon-gallery-box'})[0] #use 1 for previous edition
url = a['href']
if url.startswith('/'):
url = 'https://www.cirsd.org' + url
self.cover_url = a.find('img')['src']
self.log(self.cover_url)
issue = a.find('div', attrs={'class':'horizon-gallery-title'})
if issue:
self.title = self.tag_to_string(issue).strip()
self.timefmt = ' [' + self.tag_to_string(issue).strip().replace('Horizons ', '') + ']'
self.log('Downloading Issue: ', self.timefmt, self.title)
d = self.recipe_specific_options.get('issue_url')
if d and isinstance(d, str):
url = d
else:
soup = self.index_to_soup('https://www.cirsd.org/en/horizons')
a = soup.findAll('a', href=True, attrs={'class':'horizon-gallery-box'})[0] #use 1 for previous edition
url = a['href']
if url.startswith('/'):
url = 'https://www.cirsd.org' + url
self.cover_url = a.find('img')['src']
self.log(self.cover_url)
issue = a.find('div', attrs={'class':'horizon-gallery-title'})
if issue:
self.title = self.tag_to_string(issue).strip()
self.timefmt = ' [' + self.tag_to_string(issue).strip().replace('Horizons ', '') + ']'
self.log('Downloading Issue: ', self.timefmt, self.title)
soup = self.index_to_soup(url)
feeds = []

View File

@@ -97,9 +97,8 @@ class LiveMint(BasicNewsRecipe):
.summary, .highlights, .synopsis {
font-weight:normal !important; font-style:italic; color:#202020;
}
h2 {font-size:normal !important;}
em, blockquote {color:#202020;}
.moreAbout, .articleInfo, .metaData, .psTopicsHeading, .topicsTag {font-size:small;}
.moreAbout, .articleInfo, .metaData, .psTopicsHeading, .topicsTag, .auth {font-size:small;}
'''
keep_only_tags = [
@@ -109,12 +108,15 @@ class LiveMint(BasicNewsRecipe):
]
remove_tags = [
dict(name=['meta', 'link', 'svg', 'button', 'iframe']),
dict(attrs={'class':lambda x: x and x.startswith(
('storyPage_alsoRead__', 'storyPage_firstPublishDate__', 'storyPage_bcrumb__')
)}),
dict(attrs={'id':['faqSection', 'seoText', 'ellipsisId']}),
classes(
'trendingSimilarHeight moreNews mobAppDownload label msgError msgOk taboolaHeight gadgetSlider'
'trendingSimilarHeight moreNews mobAppDownload label msgError msgOk taboolaHeight gadgetSlider ninSec'
' socialHolder imgbig disclamerText disqus-comment-count openinApp2 lastAdSlot bs_logo author-widget'
' datePublish sepStory premiumSlider moreStory Joinus moreAbout milestone benefitText checkCibilBtn'
),
dict(attrs={'class':lambda x: x and x.startswith('storyPage_alsoRead__')})
' datePublish sepStory premiumSlider moreStory Joinus moreAbout milestone benefitText checkCibilBtn trade'
)
]
feeds = [
@@ -160,22 +162,36 @@ class LiveMint(BasicNewsRecipe):
return raw
def preprocess_html(self, soup):
for h2 in soup.findAll('h2'):
h2.name = 'h4'
auth = soup.find(attrs={'class':lambda x: x and x.startswith(('storyPage_authorInfo__', 'storyPage_authorSocial__'))})
if auth:
auth['class'] = 'auth'
summ = soup.find(attrs={'class':lambda x: x and x.startswith('storyPage_summary__')})
if summ:
summ['class'] = 'summary'
for strong in soup.findAll('strong'):
if strong.find('p'):
strong.name = 'div'
for embed in soup.findAll('div', attrs={'class':'embed'}):
if nos := embed.find('noscript'):
nos = embed.find('noscript')
if nos:
nos.name = 'span'
for span in soup.findAll('figcaption'):
span['id'] = 'img-cap'
for auth in soup.findAll('span', attrs={'class':lambda x: x and 'articleInfo' in x.split()}):
auth.name = 'div'
for span in soup.findAll('span', attrs={'class':'exclusive'}):
span.extract()
for img in soup.findAll('img', attrs={'data-src': True}):
img['src'] = img['data-src']
for span in soup.findAll('span', attrs={'class':'exclusive'}):
span.extract()
for al in soup.findAll('a', attrs={'class':'manualbacklink'}):
pa = al.findParent('p')
if pa:
pa.extract()
if wa := soup.find(**classes('autobacklink-topic')):
if p := wa.findParent('p'):
p = wa.findParent('p')
if p:
p.extract()
return soup

View File

@@ -67,7 +67,18 @@ class LondonReviewOfBooksPayed(BasicNewsRecipe):
img['src'] = 'https://www.lrb.co.uk/storage/400_filter/images/' + img['data-appsrc'].split('/images/')[-1]
return soup
recipe_specific_options = {
'issue_url': {
'short': 'The issue URL ',
'long': 'For example, https://www.lrb.co.uk/the-paper/v46/n01',
'default': 'https://www.lrb.co.uk/the-paper/'
}
}
def parse_index(self):
d = self.recipe_specific_options.get('issue_url')
if d and isinstance(d, str):
self.INDEX = d
soup = self.index_to_soup(self.INDEX)
container = soup.find('div', attrs={'class': 'article-issue-cover-image'})
if container:

View File

@@ -168,7 +168,11 @@ class WSJ(BasicNewsRecipe):
sec_parse = json.loads(self.index_to_soup(index + v, raw=True))
data = sec_parse['articles']
for art in data:
dt = datetime.fromtimestamp(data[art]['pubdateNumber'] + time.timezone)
try:
tme = data[art]['pubdateNumber']
except Exception:
tme = data[art]['origPubdateNumber']
dt = datetime.fromtimestamp(tme + time.timezone)
if (datetime.now() - dt) > timedelta(self.oldest_article):
continue
title = data[art]['headline']