commit 8eae5df87d
Author: Kovid Goyal
Date:   2025-03-30 11:16:48 +05:30
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C


@@ -31,10 +31,12 @@ class Reuters(BasicNewsRecipe):
     no_stylesheets = True
     remove_attributes = ['style', 'height', 'width']
     resolve_internal_links = True
-    ignore_duplicate_articles = {'url', 'title'}
+    ignore_duplicate_articles = {'url'}
+    remove_empty_feeds = True
     extra_css = '''
         .label, .auth { font-size:small; color:#202020; }
+        .desc { font-style: italic; }
         .figc { font-size:small; }
         img {display:block; margin:0 auto;}
     '''
@@ -48,8 +50,8 @@ class Reuters(BasicNewsRecipe):
         'res': {
             'short': 'For hi-res images, select a resolution from the\nfollowing options: 960, 1080, 1200',
             'long': 'This is useful for non e-ink devices',
-            'default': '480'
-        }
+            'default': '480',
+        },
     }

     def __init__(self, *args, **kwargs):
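The hunk above only adds trailing commas to the 'res' option definition. For context, that option is consumed in preprocess_raw_html further down, where everything after the first '&' in a Reuters resizer URL is dropped and the requested width is appended. A minimal sketch of that rewrite, with an invented sample URL:

    # Sketch only: the URL below is made up; the split/append logic mirrors the recipe.
    resizer_url = 'https://www.reuters.com/resizer/v2/ABC.jpg?auth=xyz&width=640'
    res = '&width=' + '1080'  # built from recipe_specific_options['res']
    print(resizer_url.split('&')[0] + res)
    # https://www.reuters.com/resizer/v2/ABC.jpg?auth=xyz&width=1080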
@@ -61,58 +63,55 @@ class Reuters(BasicNewsRecipe):
     def parse_index(self):
         index = 'https://www.reuters.com'
         today = datetime.now()
-        feed_api = (
-            index
-            + '/arc/outboundfeeds/v3/mobile/section/{}/?from=0&size=50&outputType=json'
-        )
-        path_api = index + '/arc/outboundfeeds/v3/mobile{}?outputType=json'
-        sections = [
-            'world',
-            'business',
-            'markets',
-            'sustainability',
-            'legal',
-            'breakingviews',
-            'technology',
-            # 'sports',
-            'science',
-            'lifestyle',
-        ]
+
+        sections = []
+
+        sec_api = json.loads(
+            self.index_to_soup(index + '/mobile/api/v1/menu/?outputType=json', raw=True)
+        )
+
+        for s in sec_api[0]['data']['hierarchy']['children']:
+            if s.get('type', '') == 'section':
+                sections.append((s['name'], s['id']))
+                sections.extend(
+                    (s['name'] + ' - ' + s2['name'], s2['id'])
+                    for s2 in s.get('children', [])
+                    if s2.get('type', '') == 'section'
+                )

         feeds = []

-        for sec in sections:
-            section = sec.capitalize()
-            self.log(section)
+        for sec, link in sections:
+            self.log(sec)

             articles = []

-            data = json.loads(self.index_to_soup(feed_api.format(sec), raw=True))[
-                'wireitems'
-            ]
+            data = json.loads(
+                self.index_to_soup(
+                    index + '/mobile/v1' + link + '?outputType=json', raw=True
+                )
+            )

-            for x in data:
-                if x.get('wireitem_type', '') == 'story':
-                    for y in x['templates']:
-                        if y.get('type', '') == 'story':
-                            title = y['story']['hed']
-
-                            date = datetime.fromisoformat(
-                                y['story']['updated_at'][:-1]
-                            ) + timedelta(seconds=time.timezone)
-                            if (today - date) > timedelta(self.oldest_article):
-                                continue
-
-                            desc = y['story']['lede']
-                            path = y['template_action']
-                            if path.get('type', '') == 'article':
-                                url = path_api.format(path['api_path_native'])
-                            self.log(' ', title, '\n\t', desc)
-                            articles.append(
-                                {'title': title, 'description': desc, 'url': url}
-                            )
+            for st in (
+                story
+                for x in data
+                if isinstance(x, dict)
+                for story in x.get('data', {}).get('stories', [])
+            ):
+                title = st['title']
+                date = datetime.fromisoformat(st['display_time'][:-1]) + timedelta(
+                    seconds=time.timezone
+                )
+                if (today - date) > timedelta(self.oldest_article):
+                    continue
+                desc = st['description']
+                url = index + st['url']
+                self.log(' ', title, '\n\t', desc, '\n\t', url)
+                articles.append({'title': title, 'description': desc, 'url': url})
             if articles:
-                feeds.append((section, articles))
+                feeds.append((sec, articles))
         return feeds

     def preprocess_raw_html(self, raw, url):
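For reference, a minimal sketch of the menu payload shape the rewritten parse_index assumes, with the flattening walk lifted from the diff above; the section names and ids here are invented:

    # Sketch only: field names mirror the recipe, sample values are made up.
    menu = [{
        'data': {
            'hierarchy': {
                'children': [
                    {
                        'type': 'section', 'name': 'World', 'id': '/world/',
                        'children': [
                            {'type': 'section', 'name': 'Africa', 'id': '/world/africa/'},
                            {'type': 'interstitial'},  # non-section nodes are skipped
                        ],
                    },
                ]
            }
        }
    }]

    sections = []
    for s in menu[0]['data']['hierarchy']['children']:
        if s.get('type', '') == 'section':
            sections.append((s['name'], s['id']))
            sections.extend(
                (s['name'] + ' - ' + s2['name'], s2['id'])
                for s2 in s.get('children', [])
                if s2.get('type', '') == 'section'
            )

    print(sections)  # [('World', '/world/'), ('World - Africa', '/world/africa/')]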
@@ -120,75 +119,84 @@ class Reuters(BasicNewsRecipe):
         w = self.recipe_specific_options.get('res')
         if w and isinstance(w, str):
             res = '&width=' + w
-        js = json.loads(raw)
-        data = js['wireitems']
         body = ''
-        for x in data:
-            if x.get('wireitem_type', '') == 'story':
-                for y in x['templates']:
-                    if 'label' in y['cid']:
-                        body += '<div class="label">' + y['title'] + '</div>'
-                        break
-                for y in x['templates']:
-                    if 'title' in y['cid']:
-                        body += (
-                            '<h1 title="{}">'.format(js['share_url'])
-                            + y['content']
-                            + '</h1>'
-                        )
-                        break
-                for y in x['templates']:
-                    if 'author' in y['cid']:
-                        body += '<p>'
-                        auths = list(y.get('authors_names', []))
-                        if auths:
-                            body += (
-                                '<div class="auth">' + 'By ' + ', '.join(auths) + '</div>'
-                            )
-                        break
-                for y in x['templates']:
-                    if 'datetime' in y['cid']:
-                        body += (
-                            '<div class="auth">'
-                            + str(y['read_minutes'])
-                            + ' minute read | '
-                            + p_dt(y['display_time'])
-                            + '</div>'
-                        )
-                        body += '</p>'
-                        break
-                for y in x['templates']:
-                    if 'paragraph' in y['cid']:
-                        body += '<p>' + y['content'] + '</p>'
-                    if 'header' in y['cid']:
-                        body += '<h4>' + y['content'] + '</h4>'
-                    if 'image' in y['cid']:
-                        if 'renditions' in y['image']:
-                            body += '<img src="{}"><div class="figc">{}</div>'.format(
-                                y['image']['url'].split('&')[0] + res,
-                                y['image']['caption'],
-                            )
-                        else:
-                            body += '<img src="{}"><div class="figc">{}</div>'.format(
-                                y['image']['url'], y['image']['caption']
-                            )
-                    if 'gallery' in y['cid']:
-                        for imgs in y['images']:
-                            if 'renditions' in imgs:
-                                body += '<img src="{}"><div class="figc">{}</div>'.format(
-                                    imgs['url'].split('&')[0] + res,
-                                    imgs['caption'],
-                                )
-                            else:
-                                body += '<img src="{}"><div class="figc">{}</div>'.format(
-                                    imgs['url'], imgs['caption']
-                                )
-                    if 'video' in y['cid']:
-                        body += '<img src="{}"><div class="figc">{}</div>'.format(
-                            y['video']['thumbnail']['url'],
-                            y['video']['thumbnail']['caption'],
-                        )
+
+        for det in json.loads(raw):
+            if not det.get('type', '') == 'article_detail':
+                continue
+            data = det['data']['article']
+            body += '<h1>' + data['title'] + '</h1>'
+            if data.get('description'):
+                body += '<p class="desc">' + data['description'] + '</p>'
+            if data.get('authors'):
+                body += (
+                    '<p class="auth">'
+                    + 'By '
+                    + ', '.join(at.get('byline', '') for at in data.get('authors', []))
+                    + '</p>'
+                )
+
+            if data.get('thumbnail') and data['thumbnail'].get('type', '') == 'image':
+                th = data['thumbnail']
+                body += '<img src="{}"><div class="figc">{}</div>'.format(
+                    th['resizer_url'].split('&')[0] + res,
+                    th.get('caption', ''),
+                )
+
+            body += (
+                '<p class="auth">'
+                + str(data['read_minutes'])
+                + ' minute read | '
+                + str(data['word_count'])
+                + ' words | '
+                + p_dt(
+                    data['updated_time']
+                    if data.get('updated_time')
+                    else data['display_time']
+                )
+                + '</p>'
+            )
+
+            if data.get('summary'):
+                body += (
+                    '<blockquote>'
+                    + ''.join(f'<li>{su["description"]}</li>' for su in data['summary'])
+                    + '</blockquote>'
+                )
+
+            for y in data['content_elements']:
+                ty = y.get('type', '')
+                if ty == 'placeholder':
+                    continue
+                elif ty == 'paragraph':
+                    body += '<p>' + y['content'] + '</p>'
+                elif ty == 'header':
+                    body += '<h4>' + y['content'] + '</h4>'
+                elif ty == 'graphic':
+                    body += '<img src="{}"><div class="figc">{}</div>'.format(
+                        y['resizer_url'].split('&')[0] + res,
+                        y.get('description', ''),
+                    )
+                else:
+                    self.log('**', ty)
+
+            if data.get('sign_off'):
+                body += '<p class="auth">' + data['sign_off'] + '</p>'
+
         return '<html><body><div>' + body + '</div></body></html>'

-    def populate_article_metadata(self, article, soup, first):
-        article.url = soup.find('h1')['title']
+    def get_browser(self, *args, **kwargs):
+        kwargs['user_agent'] = (
+            'ReutersNews/7.11.0.1742843009 Mozilla/5.0 (Linux; Android 14) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.6422.165 Mobile Safari/537.36'
+        )
+        br = BasicNewsRecipe.get_browser(self, *args, **kwargs)
+        br.addheaders += [('cookie', 'reuters-geo={"country":"-"; "region":"-"}')]
+        return br
+
+    def print_version(self, url):
+        return (
+            url.replace('https://www.reuters.com', 'https://www.reuters.com/mobile/v1')
+            + '?outputType=json'
+        )
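To make the new data flow concrete: print_version maps each story URL to the mobile JSON endpoint (for example, https://www.reuters.com/world/some-story-2025-03-30/ becomes https://www.reuters.com/mobile/v1/world/some-story-2025-03-30/?outputType=json), and preprocess_raw_html keeps only the 'article_detail' blocks from that response. A hedged sketch of the assumed payload shape, with key names mirroring the code above and every value invented:

    # Sketch only: keys match what preprocess_raw_html reads; values are made up.
    payload = [
        {'type': 'navigation'},  # non-article blocks are skipped
        {
            'type': 'article_detail',
            'data': {
                'article': {
                    'title': 'Sample headline',
                    'description': 'Sample standfirst',
                    'authors': [{'byline': 'Jane Doe'}],
                    'read_minutes': 3,
                    'word_count': 612,
                    'display_time': '2025-03-30T05:46:48Z',
                    'content_elements': [
                        {'type': 'paragraph', 'content': 'Body text.'},
                    ],
                }
            },
        },
    ]

    for det in payload:
        if det.get('type', '') == 'article_detail':
            print(det['data']['article']['title'])  # Sample headline

Fields such as 'thumbnail', 'summary', 'updated_time', and 'sign_off' are read with .get() in the recipe, so they can be absent from a given article.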