Merge branch 'master' of https://github.com/unkn0w7n/calibre

2025-08-11 09:13:57 -04:00 · 2023-09-15 11:11:28 +05:30 · 2023-09-15 11:11:28 +05:30 · 06752c03cc
commit 06752c03cc
parent fe29b9a34d 1a068abc52
1 changed file with 35 additions and 6 deletions
--- a/recipes/financial_times.recipe
+++ b/recipes/financial_times.recipe
@ -1,6 +1,7 @@
 import json
 import re
 from urllib.parse import quote
+from html5_parser import parse

 from calibre.web.feeds.news import BasicNewsRecipe

@ -16,9 +17,29 @@ class ft(BasicNewsRecipe):
    remove_javascript = True
    remove_empty_feeds = True
    ignore_duplicate_articles = {'url'}
+    resolve_internal_links = True
    remove_attributes = ['style', 'width', 'height']
    masthead_url = 'https://im.ft-static.com/m/img/masthead_main.jpg'
-    extra_css = '#fig-cap{font-style:italic; text-align:left; font-size:small;}'
+
+    extra_css = '''
+        .article-info__time-byline {font-size:small; font-weight:bold; }
+        .o-topper__visual, #fig, .main-image, .n-content-image { text-align:center; font-size:small; }
+        blockquote, i { color:#5c5c5c; }
+        .o-topper__standfirst { font-weight:bold; color:#202020; } 
+        .o-topper__topic { font-size:small; color:#5c5c5c; }
+    '''
+
+    keep_only_tags = [
+        classes(
+            'body_json o-topper__topic o-topper__headline o-topper__standfirst o-topper__visual article-info__time-byline main-image'
+        ),
+        dict(name='article', attrs={'id':'article-body'})
+    ]
+    
+    remove_tags = [
+        dict(name='aside', attrs={'class':'n-content-recommended--single-story'}),
+        classes('in-article-advert')
+    ]

    # needs_subscription = 'optional'
    #
@ -40,6 +61,10 @@ class ft(BasicNewsRecipe):
        br.set_current_header('Referer',  'https://www.google.com/')
        return br

+    # Fetching through print_version (Google's web cache) loads all articles, but it may sometimes fail due to too many requests.
+    # def print_version(self, url):
+    #     return 'https://webcache.googleusercontent.com/search?q=cache:' + quote(url, safe='')
+
    def get_cover_url(self):
        from datetime import date
        cover = 'http://img.kiosko.net/' + str(
@ -74,6 +99,11 @@ class ft(BasicNewsRecipe):
    def preprocess_raw_html(self, raw, *a):
        # with open('/t/raw.html', 'w') as f:
        #     f.write(raw)
+        root = parse(raw)
+        if x := root.xpath('//article[@id="article-body"]'):
+            self.log('**has article content')
+            return raw
+        self.log('**no article content')
        m = re.search(r'type="application/ld\+json">[^<]+?"@type":"NewsArticle"', raw)
        raw = raw[m.start():]
        raw = raw.split('>', 1)[1]
@ -114,12 +144,11 @@ class ft(BasicNewsRecipe):
        body = re.sub(r'\[https://\S+?\]', insert_image, body)
        if data.get('description'):
            desc = '<h2>' + data['description'] + '</h2>'
-        html = '<html><body><h1>' + title + '</h1>' + desc + '<h3>' + author + '</h3>' + image + '<p>' + body
+        html = '<html><body><div class="body_json"><h1>' + title + '</h1>' + desc + '<h3>' + author + '</h3>' + image + '<p>' + body
        return html

    def preprocess_html(self, soup):
-        for span in soup.findAll('span'):
-            p = span.findParent('p')
-            if p:
-                p['id'] = 'fig-cap'
+        for con in soup.findAll(attrs={'class':'n-content-layout__slot'}):
+            if con.find('figure'):
+                con['id'] = 'fig'
        return soup