Merge branch 'master' of https://github.com/unkn0w7n/calibre

2025-07-09 03:04:10 -04:00 · 2025-03-27 22:15:42 +05:30 · 2025-03-27 22:15:42 +05:30 · 6833c07410
commit 6833c07410
parent d6a1cbf72d 55d965cffb
2 changed files with 57 additions and 61 deletions
--- a/recipes/ap.recipe
+++ b/recipes/ap.recipe
@@ -1,82 +1,78 @@
 #!/usr/bin/env python
 # vim:fileencoding=utf-8
-# License: GPLv3 Copyright: 2017, Kovid Goyal <kovid at kovidgoyal.net>
+"""
 https://apnews.com
 """
 import json
 from calibre.utils.date import parse_date, utcnow
 from calibre.web.feeds.news import BasicNewsRecipe, classes
 class AssociatedPress(BasicNewsRecipe):
-
+    title = 'Associated Press'
-    title = u'Associated Press'
+    description = (
-    description = 'Global news'
+        'Read the latest headlines, breaking news, and videos at APNews.com, the definitive '
-    __author__ = 'Kovid Goyal'
+        'source for independent journalism from every corner of the globe. Articles from Front Page.'
-    use_embedded_content = False
+    )
    __author__ = 'unkn0wn'
    language = 'en'
    encoding = 'utf-8'
    no_stylesheets = True
-    ignore_duplicate_articles = {'title', 'url'}
+    remove_javascript = True
    ignore_duplicate_articles = {'url'}
    remove_empty_feeds = False
-    oldest_article = 1.5
+    remove_attributes = ['style', 'height', 'width']
    simultaneous_downloads = 1
    cover_url = 'https://upload.wikimedia.org/wikipedia/commons/thumb/0/0c/Associated_Press_logo_2012.svg/662px-Associated_Press_logo_2012.svg.png'
    keep_only_tags = [
-        classes('Page-headline Page-lead Page-storyBody Page-authorinfo'),
+        classes('StoryPage-lede-content Page-lead Page-byline-info RichTextStoryBody'),
    ]
    remove_tags = [
-        classes('Page-actions Enhancement'),
+        classes('displayNone Advertisement HTMLModuleEnhancement videoSlide'),
-        dict(name='source'),
+        dict(
            name=[
                'source',
                'button',
                'svg',
                'template',
                'bsp-jw-player',
                'astro-island',
                'iframe',
                'document',
            ]
        ),
        dict(attrs={'data-parsely-title': 'Related Stories'}),
    ]
-    remove_attributes = ['srcset']
+
    extra_css = '''
-    .Figure-caption {
+        .Page-byline-info, .Page-breadcrumbs, .CarouselSlide-info, .Figure-caption { font-size:small; }
-    font-style: italic;
+        img {display:block; margin:0 auto;}
-    font-size: smaller;
+        em { color: #202020; }
    margin-left: 1rem; margin-right: 1rem;
    }
    '''
    def parse_index(self):
        feeds = []
-        limit = self.test[0] if self.test else 100
+        soup = self.index_to_soup('https://apnews.com')
-        for front in (
+        for a in soup.findAll(
-            'topnews sports politics entertainment usnews oddities'
+            'a',
-            ' Travel technology lifestyle business Health science intlnews'.split()
+            attrs={'href': lambda x: x and x.startswith('https://apnews.com/article/')},
        ):
-            name = {
+            url = a['href']
-                'topnews': 'Top News',
+            title = self.tag_to_string(a)
-                'intlnews': 'International',
+            self.log(title, '\n\t', url)
-                'usnews': 'U.S. News'
+            feeds.append({'title': title, 'url': url})
-            }.get(front, front).capitalize()
+        return [('Articles', feeds)]
            feeds.append([name, self.parse_section(front)])
            if len(feeds) >= limit:
                break
        return feeds
-    def parse_section(self, front):
+    def preprocess_html(self, soup):
-        url = 'https://afs-prod.appspot.com/api/v2/feed/tag?tags=apf-' + front
+        for st in soup.findAll(**classes('CarouselSlide-infoDescription')):
-        self.log('Processing section:', front, 'at', url)
+            if p := st.find('p'):
-        data = self.index_to_soup(url, raw=True)
+                p.name = 'span'
-        data = json.loads(data)
+        for h in soup.findAll(['h2', 'h3']):
-        cards = data.get('cards', ())
+            h.name = 'h4'
-        articles = []
+        for img in soup.findAll('img', attrs={'srcset': True}):
-
+            img['src'] = img['srcset'].split()[0]
-        for card in cards:
+        for img_ in soup.findAll(
-            for article in card['contents']:
+            'img', attrs={'data-flickity-lazyload-srcset': True, 'srcset': False}
-                url = article['localLinkUrl']
+        ):
-                title = article.get('headline', article.get('flattenedFirstWords'))
+            img_['src'] = img_['data-flickity-lazyload-srcset'].split()[0]
-                if not title:
+        return soup
                    continue
                title = title.split('\u2014')[-1]
                updated = article.get('updated')
                if updated:
                    updated = parse_date(updated, assume_utc=True)
                    delta = utcnow() - updated
                    if (delta.days*24*3600 + delta.seconds) > 24*3600*self.oldest_article:
                        self.log('Skipping', title, 'as it is too old')
                        continue
                self.log('\tFound article:', title, 'at', url)
                articles.append({'title': title, 'url': url})
        self.log('')
        return articles
--- a/recipes/horizons.recipe
+++ b/recipes/horizons.recipe
@@ -23,7 +23,7 @@ class horizons(BasicNewsRecipe):
    remove_attributes = ['style', 'height', 'width']
    masthead_url = 'https://www.cirsd.org/bundles/olpublic/images/horizons-logo.jpg'
    ignore_duplicate_articles = {'url'}
-    extra_css = 'em{color:#404040;}'
+    extra_css = 'em{color:#202020;}'
    simultaneous_downloads = 1
    keep_only_tags = [dict(name='div', attrs={'class': 'article'})]
@@ -40,7 +40,7 @@ class horizons(BasicNewsRecipe):
    }
    def preprocess_raw_html(self, raw, *a):
-        return raw.replace('<p>&nbsp;</p>', '')
+        return raw.replace('<p>&nbsp;</p>', '').replace('<p dir="ltr">&nbsp;</p>', '')
    def get_browser(self):
        return BasicNewsRecipe.get_browser(self, verify_ssl_certificates=False)