Merge branch 'master' of https://github.com/unkn0w7n/calibre

2025-06-23 15:30:45 -04:00 · 2023-11-11 11:48:50 +05:30 · 2023-11-11 11:48:50 +05:30 · d266fe5447
commit d266fe5447
parent dc7667d02d b3486b0c07
5 changed files with 33 additions and 2 deletions
--- a/recipes/natgeo.recipe
+++ b/recipes/natgeo.recipe
@ -135,6 +135,11 @@ class NatGeo(BasicNewsRecipe):
        .auth, .time { font-size:small; color:#5c5c5c; }
    '''

+    def get_cover_url(self):  # cover image URL scraped from the magazine page, or None
+        page = self.index_to_soup('https://www.nationalgeographic.com/magazine/').decode('utf-8')
+        covers = re.findall(r'https://i\.natgeofe\.com\S+?national-geographic-\S+?\.jpg', page)
+        return covers[0] + '?w=1000&h=1000' if covers else None  # first hit + size query
+
    def parse_index(self):
        pages = [
            'https://www.nationalgeographic.com/animals',
@ -176,3 +181,9 @@ class NatGeo(BasicNewsRecipe):
            # for high res images use '?w=2000&h=2000'
            img['src'] = img['src'] + '?w=1000&h=1000'
        return soup
+
+    def populate_article_metadata(self, article, soup, first):
+        '''Copy the text of the 'byline' element, if present, into the article summaries.'''
+        byline = soup.find(attrs={'class': 'byline'})
+        if byline:
+            article.summary = article.text_summary = self.tag_to_string(byline)
--- a/recipes/natgeohis.recipe
+++ b/recipes/natgeohis.recipe
@ -160,3 +160,9 @@ class NatGeo(BasicNewsRecipe):
            # for high res images use '?w=2000&h=2000'
            img['src'] = img['src'] + '?w=1000&h=1000'
        return soup
+
+    def populate_article_metadata(self, article, soup, first):
+        '''Copy the text of the 'byline' element, if present, into the article summaries.'''
+        byline = soup.find(attrs={'class': 'byline'})
+        if byline:
+            article.summary = article.text_summary = self.tag_to_string(byline)
--- a/recipes/natgeomag.recipe
+++ b/recipes/natgeomag.recipe
@ -143,7 +143,7 @@ class NatGeo(BasicNewsRecipe):
        self.log('Downloading ', url)
        self.timefmt = ' [' + edition + ']'
        soup = self.index_to_soup(url)
-        png = re.findall('https://i\.natgeofe\.com\S+?national-geographic-magazine-\S+?\.jpg', soup.decode('utf-8'))
+        png = re.findall('https://i\.natgeofe\.com\S+?national-geographic-\S+?\.jpg', soup.decode('utf-8'))
        self.cover_url = png[0] + '?w=1000&h=1000'

        name = soup.find(attrs={'class':lambda x: x and 'Header__Description' in x.split()})
@ -179,3 +179,9 @@ class NatGeo(BasicNewsRecipe):
            # for high res images use '?w=2000&h=2000'
            img['src'] = img['src'] + '?w=1200&h=1200'
        return soup
+
+    def populate_article_metadata(self, article, soup, first):
+        '''Copy the text of the 'byline' element, if present, into the article summaries.'''
+        byline = soup.find(attrs={'class': 'byline'})
+        if byline:
+            article.summary = article.text_summary = self.tag_to_string(byline)
--- a/recipes/theeconomictimes_india_print_edition.recipe
+++ b/recipes/theeconomictimes_india_print_edition.recipe
@ -58,6 +58,10 @@ class TheEconomicTimes(BasicNewsRecipe):
    ]

    def parse_index(self):
+        self.log(
+            '\n***\nif this recipe fails, report it on: '
+            'https://www.mobileread.com/forums/forumdisplay.php?f=228\n***\n'
+        )
        soup = self.index_to_soup(
            'https://economictimes.indiatimes.com/print_edition.cms'
        )
--- a/recipes/toiprint.recipe
+++ b/recipes/toiprint.recipe
@ -52,6 +52,10 @@ class toiprint(BasicNewsRecipe):
        return cover

    def parse_index(self):
+        self.log(
+            '\n***\nif this recipe fails, report it on: '
+            'https://www.mobileread.com/forums/forumdisplay.php?f=228\n***\n'
+        )
        url = index + '/DayIndex/' + date_ + '_' + le + '.json'
        raw = self.index_to_soup(url, raw=True)
        data = json.loads(raw)
@ -73,7 +77,7 @@ class toiprint(BasicNewsRecipe):
                    if 'ArticleName' not in art:
                        continue
                    url = art['ArticleName']
-                    title = art.get('ArticleTitle', 'unknown').replace('<br>', '')
+                    title = art.get('ArticleTitle', 'unknown').replace('<br>', '').replace('<br/>', '')
                    if art.get('ColumnTitle', '') == '':
                        desc = 'Page No.' + url.split('_')[-3] + ' | ' + art.get('ArticleBody', '')
                    else: