Merge branch 'master' of https://github.com/unkn0w7n/calibre

2025-12-14 17:15:06 -05:00 · 2024-02-11 05:22:39 +05:30 · 2024-02-11 05:22:39 +05:30 · 1dc703da20
commit 1dc703da20
parent 48327b08a9 9288eeea00
1 changed files with 51 additions and 37 deletions
--- a/recipes/rtnews.recipe
+++ b/recipes/rtnews.recipe
@@ -1,62 +1,76 @@
 __license__ = 'GPL v3'
 __copyright__ = '2011, Darko Miletic <darko.miletic at gmail.com>'
 '''
 rt.com
 '''
-from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.web.feeds.news import BasicNewsRecipe, classes
 class RT_eng(BasicNewsRecipe):
-    title = 'RT in English'
+    title = 'Russia Today'
-    __author__ = 'Darko Miletic'
+    __author__ = 'unkn0wn'
-    description = 'RT is the first Russian 24/7 English-language news channel which brings the Russian view on global news.'
+    description = '''
        RT creates news with an edge for viewers who want to Question More. RT covers stories overlooked by the mainstream 
        media, provides alternative perspectives on current affairs, and acquaints international audiences with a Russian 
        viewpoint on major global events.
    '''
    publisher = 'Autonomous Nonprofit Organization "TV-Novosti"'
    category = 'news, politics, economy, finances, Russia, world'
-    oldest_article = 2
+    oldest_article = 1.2
    no_stylesheets = True
-    encoding = 'utf8'
+    encoding = 'utf-8'
-    masthead_url = 'http://rt.com/s/css/img/printlogo.gif'
+    ignore_duplicate_articles = {'url', 'title'}
    use_embedded_content = False
    remove_empty_feeds = True
-    language = 'en_RU'
+    remove_javascript = True
    language = 'en'
    remove_attributes = ['height', 'width', 'style']
    publication_type = 'newsportal'
-    extra_css             = """
+
-                                body{font-family: Arial,Helvetica,sans-serif}
+    extra_css = '''
-                                h1{font-family: Georgia,"Times New Roman",Times,serif}
+        img {display:block; margin:0 auto;}
-                                .grey{color: gray}
+        em { color:#202020; }
-                                .fs12{font-size: small}
+        .date { font-size:small; color:#404040; }
-                            """
+        .article__summary { font-style:italic; color:#202020; }
        .media__footer { font-size:small; text-align:center; }
    '''
    conversion_options = {
        'comment': description, 'tags': category, 'publisher': publisher, 'language': language
    }
-    keep_only_tags = [dict(name='div', attrs={'class': 'all'})]
+    keep_only_tags = [
-    remove_tags = [
+        dict(name='div', attrs={'class':'article'})
-        dict(name=['object', 'link', 'embed', 'iframe', 'meta', 'link']), dict(
+    ]
-            attrs={'class': 'crumbs oh'})
+
    remove_tags = [
        dict(name=['meta', 'link', 'svg', 'button', 'style', 'iframe', 'noscript']),
        classes(
        	'update_date_visible breadcrumbs read-more Read-more-text-only article__share '
            'article__social-wrapper article__share_bottom'
        )
    ]
    remove_attributes = ['clear']
    feeds = [
-
+        ('Russia', 'https://www.rt.com/rss/russia/'),
-    (u'Politics', u'http://rt.com/politics/rss/'),
+        ('India', 'https://www.rt.com/rss/india/'),
-    (u'USA', u'http://rt.com/usa/news/rss/'),
+        ('Africa', 'https://www.rt.com/rss/africa/'),
-    (u'Business', u'http://rt.com/business/news/rss/'),
+        ('World News', 'https://www.rt.com/rss/news/'),
-    (u'Sport', u'http://rt.com/sport/rss/'),
+        ('Business', 'https://www.rt.com/rss/business/'),
-    (u'Art&Culture', u'http://rt.com/art-and-culture/news/rss/')
+        ('Opinion', 'https://www.rt.com/rss/op-ed/'),
        ('Culture', 'https://www.rt.com/rss/pop-culture/'),
        ('Others', 'https://www.rt.com/rss/')
    ]
-    def print_version(self, url):
+    def get_article_url(self, article):
-        return url + 'print/'
+        url = BasicNewsRecipe.get_article_url(self, article)
        return url.split('?')[0]
    def preprocess_html(self, soup):
-        for item in soup.findAll(style=True):
+        for img in soup.findAll('img'):
-            del item['style']
+            srcset = img.find_previous_sibling('source', attrs={'data-srcset':True})
-        for item in soup.findAll('a'):
+            if srcset:
-            str = item.string
+                for x in srcset['data-srcset'].split(','):
-            if str is None:
+                    if '/l/' in x.split()[0].strip():
-                str = self.tag_to_string(item)
+                        img['src'] = x.split()[0].strip()
-            item.replaceWith(str)
+        for src in soup.findAll('source'):
            src.decompose()
        return soup