Merge branch 'master' of https://github.com/unkn0w7n/calibre

2025-08-30 23:00:21 -04:00 · 2025-03-27 22:15:42 +05:30 · 2025-03-27 22:15:42 +05:30 · 6833c07410
commit 6833c07410
parent d6a1cbf72d 55d965cffb
2 changed files with 57 additions and 61 deletions
--- a/recipes/ap.recipe
+++ b/recipes/ap.recipe
@@ -1,82 +1,78 @@
 #!/usr/bin/env python
 # vim:fileencoding=utf-8
-# License: GPLv3 Copyright: 2017, Kovid Goyal <kovid at kovidgoyal.net>
+"""
+https://apnews.com
+"""

-import json
-
-from calibre.utils.date import parse_date, utcnow
 from calibre.web.feeds.news import BasicNewsRecipe, classes


 class AssociatedPress(BasicNewsRecipe):
-
-    title = u'Associated Press'
-    description = 'Global news'
-    __author__ = 'Kovid Goyal'
-    use_embedded_content = False
+    title = 'Associated Press'
+    description = (
+        'Read the latest headlines, breaking news, and videos at APNews.com, the definitive '
+        'source for independent journalism from every corner of the globe. Articles from Front Page.'
+    )
+    __author__ = 'unkn0wn'
    language = 'en'
    encoding = 'utf-8'
    no_stylesheets = True
-    ignore_duplicate_articles = {'title', 'url'}
+    remove_javascript = True
+    ignore_duplicate_articles = {'url'}  # now keyed on 'url' only; 'title' is no longer in the set
    remove_empty_feeds = False
-    oldest_article = 1.5
+    remove_attributes = ['style', 'height', 'width']  # replaces the old ['srcset'] value; preprocess_html below reads srcset
+    simultaneous_downloads = 1
+    cover_url = 'https://upload.wikimedia.org/wikipedia/commons/thumb/0/0c/Associated_Press_logo_2012.svg/662px-Associated_Press_logo_2012.svg.png'

    keep_only_tags = [
-        classes('Page-headline Page-lead Page-storyBody Page-authorinfo'),
+        classes('StoryPage-lede-content Page-lead Page-byline-info RichTextStoryBody'),
    ]
    remove_tags = [
-        classes('Page-actions Enhancement'),
-        dict(name='source'),
+        classes('displayNone Advertisement HTMLModuleEnhancement videoSlide'),
+        dict(
+            name=[
+                'source',
+                'button',
+                'svg',
+                'template',
+                'bsp-jw-player',
+                'astro-island',
+                'iframe',
+                'document',
+            ]
+        ),
+        dict(attrs={'data-parsely-title': 'Related Stories'}),
    ]
-    remove_attributes = ['srcset']
+
    extra_css = '''
-    .Figure-caption {
-    font-style: italic;
-    font-size: smaller;
-    margin-left: 1rem; margin-right: 1rem;
-    }
+        .Page-byline-info, .Page-breadcrumbs, .CarouselSlide-info, .Figure-caption { font-size:small; }
+        img {display:block; margin:0 auto;}
+        em { color: #202020; }
    '''

    def parse_index(self):
        feeds = []
-        limit = self.test[0] if self.test else 100
-        for front in (
-            'topnews sports politics entertainment usnews oddities'
-            ' Travel technology lifestyle business Health science intlnews'.split()
+        soup = self.index_to_soup('https://apnews.com')  # the index is built from the site's front page, not the old JSON API
+        for a in soup.findAll(
+            'a',
+            attrs={'href': lambda x: x and x.startswith('https://apnews.com/article/')},  # `x and` guards a missing/empty href; only absolute article links match
        ):
-            name = {
-                'topnews': 'Top News',
-                'intlnews': 'International',
-                'usnews': 'U.S. News'
-            }.get(front, front).capitalize()
-            feeds.append([name, self.parse_section(front)])
-            if len(feeds) >= limit:
-                break
-        return feeds
+            url = a['href']
+            title = self.tag_to_string(a)  # link text is used as the article title
+            self.log(title, '\n\t', url)
+            feeds.append({'title': title, 'url': url})  # despite its name, `feeds` now holds article dicts
+        return [('Articles', feeds)]  # a single section named 'Articles' containing every link found

-    def parse_section(self, front):
-        url = 'https://afs-prod.appspot.com/api/v2/feed/tag?tags=apf-' + front
-        self.log('Processing section:', front, 'at', url)
-        data = self.index_to_soup(url, raw=True)
-        data = json.loads(data)
-        cards = data.get('cards', ())
-        articles = []
-
-        for card in cards:
-            for article in card['contents']:
-                url = article['localLinkUrl']
-                title = article.get('headline', article.get('flattenedFirstWords'))
-                if not title:
-                    continue
-                title = title.split('\u2014')[-1]
-                updated = article.get('updated')
-                if updated:
-                    updated = parse_date(updated, assume_utc=True)
-                    delta = utcnow() - updated
-                    if (delta.days*24*3600 + delta.seconds) > 24*3600*self.oldest_article:
-                        self.log('Skipping', title, 'as it is too old')
-                        continue
-                self.log('\tFound article:', title, 'at', url)
-                articles.append({'title': title, 'url': url})
-        self.log('')
-        return articles
+    def preprocess_html(self, soup):  # mutates `soup` in place and returns the same object
+        for st in soup.findAll(**classes('CarouselSlide-infoDescription')):
+            if p := st.find('p'):  # skip descriptions that contain no <p>
+                p.name = 'span'  # rename the tag so the description is no longer a paragraph element
+        for h in soup.findAll(['h2', 'h3']):
+            h.name = 'h4'  # demote every h2/h3 to h4
+        for img in soup.findAll('img', attrs={'srcset': True}):
+            img['src'] = img['srcset'].split()[0]  # first whitespace-separated token of srcset; NOTE(review): IndexError if srcset is blank - confirm that cannot match
+        for img_ in soup.findAll(
+            'img', attrs={'data-flickity-lazyload-srcset': True, 'srcset': False}  # presumably "has the lazy-load attribute but no srcset" - confirm
+        ):
+            img_['src'] = img_['data-flickity-lazyload-srcset'].split()[0]  # same first-token rule, applied to the lazy-load attribute
+        return soup
--- a/recipes/horizons.recipe
+++ b/recipes/horizons.recipe
@@ -23,7 +23,7 @@ class horizons(BasicNewsRecipe):
    remove_attributes = ['style', 'height', 'width']
    masthead_url = 'https://www.cirsd.org/bundles/olpublic/images/horizons-logo.jpg'
    ignore_duplicate_articles = {'url'}
-    extra_css = 'em{color:#404040;}'
+    extra_css = 'em{color:#202020;}'
    simultaneous_downloads = 1

    keep_only_tags = [dict(name='div', attrs={'class': 'article'})]
@@ -40,7 +40,7 @@ class horizons(BasicNewsRecipe):
    }

    def preprocess_raw_html(self, raw, *a):
-        return raw.replace('<p>&nbsp;</p>', '')
+        return raw.replace('<p>&nbsp;</p>', '').replace('<p dir="ltr">&nbsp;</p>', '')  # also drop the dir="ltr" variant of the nbsp-only paragraph

    def get_browser(self):
        return BasicNewsRecipe.get_browser(self, verify_ssl_certificates=False)