Merge branch 'master' of https://github.com/unkn0w7n/calibre

2026-02-18 17:20:07 -05:00 · 2024-02-02 18:58:06 +05:30 · 2024-02-02 18:58:06 +05:30 · eebec37dc8
commit eebec37dc8
parent c48d0700a4 26f78ac70a
4 changed files with 19 additions and 28 deletions
--- a/recipes/natgeo.recipe
+++ b/recipes/natgeo.recipe
@@ -167,6 +167,8 @@ class NatGeo(BasicNewsRecipe):
        for article in soup.findAll('article'):
            a = article.find('a')
            url = a['href']
+            if url.startswith('/'):
+                url = 'https://www.nationalgeographic.com' + url
            section = self.tag_to_string(article.find(**classes('SectionLabel')))
            if section.startswith('Paid Content'):
                continue
--- a/recipes/natgeohis.recipe
+++ b/recipes/natgeohis.recipe
@@ -146,6 +146,8 @@ class NatGeo(BasicNewsRecipe):
        for article in soup.findAll('article'):
            a = article.find('a')
            url = a['href']
+            if url.startswith('/'):
+                url = 'https://www.nationalgeographic.com' + url
            title = self.tag_to_string(article.find(**classes('PromoTile__Title--truncated')))
            ans.append({'title': title, 'url': url})
            self.log(title, '  ', url)
--- a/recipes/natgeomag.recipe
+++ b/recipes/natgeomag.recipe
@@ -156,12 +156,14 @@ class NatGeo(BasicNewsRecipe):
            title = self.tag_to_string(photoart)
            url = photoart['href']
            if url.startswith('/'):
-                url = 'https://www.nationalgeographic.com' + photoart['href']
+                url = 'https://www.nationalgeographic.com' + url
            ans2.append(('Photo Essay', [{'title': title, 'url': url}]))
        for gird in soup.findAll(attrs={'class':'GridPromoTile'}):
            for article in soup.findAll('article'):
                a = article.find('a')
                url = a['href']
+                if url.startswith('/'):
+                    url = 'https://www.nationalgeographic.com' + url
                if '/graphics/' in url:
                    continue
                section = self.tag_to_string(article.find(**classes('SectionLabel')))
--- a/recipes/science_news.recipe
+++ b/recipes/science_news.recipe
@@ -5,7 +5,7 @@ __license__ = 'GPL v3'
 sciencenews.org
 '''

-from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.web.feeds.news import BasicNewsRecipe, prefixed_classes
 import datetime
 import re

@@ -16,13 +16,13 @@ class ScienceNewsIssue(BasicNewsRecipe):
                   " in all fields of science. This recipe downloads all the articles from the latest issue.")
    category = u'Science, Technology, News'
    publisher = u'Society for Science & the Public'
-    oldest_article = 14
    language = 'en'
-    max_articles_per_feed = 50
    no_stylesheets = True
    use_embedded_content = False
-    timefmt = ' [%A, %d %B, %Y]'
    auto_cleanup = False
+    remove_attributes = ['height', 'width', 'style']
+    ignore_duplicate_articles = {'url'}
+    resolve_internal_links = True

    keep_only_tags = [
        dict(
@@ -37,6 +37,7 @@ class ScienceNewsIssue(BasicNewsRecipe):
        )
    ]
    remove_tags = [
+        dict(name=['svg', 'button']),
        dict(
            attrs={'class': lambda x: x and ('newsletter-signup__wrapper___' in x)}
        )
@@ -44,13 +45,15 @@ class ScienceNewsIssue(BasicNewsRecipe):

    def parse_index(self):

-        # Get URL of latest mag page
-        ld = self._get_mag_date()
-        url = f"https://www.sciencenews.org/sn-magazine/{ld:%B}-{ld.day}-{ld.year}"
-        url = url.lower()
+        index = self.index_to_soup('https://www.sciencenews.org/sn-magazine')
+        a = index.find(**prefixed_classes('magazine-archive__issue-thumbnail___'))
+        url = a['href']
+        self.timefmt = ' [' + url.split('/')[-1] + ']'
+        self.cover_url = a.img['src']

        # Get articles
        soup = self.index_to_soup(url)
+        soup = soup.find('main', attrs={'id':'content'})
        re_article = re.compile("https://www.sciencenews.org/article/")
        stories = []
        past_urls = set()
@@ -68,6 +71,7 @@ class ScienceNewsIssue(BasicNewsRecipe):
                continue

            past_urls.add(article_url)
+            self.log('\t', article_title, ' ', article_url)
            article_info = {
                "url": article_url,
                "title": article_title,
@@ -78,22 +82,3 @@ class ScienceNewsIssue(BasicNewsRecipe):
            ("Articles", stories),
        ]
        return index
-
-    def _get_mag_date(self):
-        """Return date of latest magazine issue.
-        It is published every 2 weeks."""
-
-        d = datetime.date(2022, 6, 18)
-        t = datetime.date.today()
-        ld = None
-        while d <= t:
-            ld = d
-            d += datetime.timedelta(days=14)
-        return ld
-
-    def get_cover_url(self):
-        ld = self._get_mag_date()
-        url = ld.strftime(
-            "https://www.sciencenews.org/wp-content/uploads/%Y/%m/%m%d%y_cover.jpg"
-        )
-        return url