Update Telegraph UK

2026-04-27 03:09:56 -04:00 · 2016-05-11 16:57:47 +05:30 · 2016-05-11 16:57:47 +05:30 · 7128a9327c
commit 7128a9327c
parent 77ef24afde
1 changed file with 27 additions and 34 deletions
--- a/recipes/telegraph_uk.recipe
+++ b/recipes/telegraph_uk.recipe
@@ -4,8 +4,13 @@ __copyright__ = '2008-2010, Darko Miletic <darko.miletic at gmail.com>'
 telegraph.co.uk
 '''

+import json
 from calibre.web.feeds.news import BasicNewsRecipe

+def classes(classes):
+    q = frozenset(classes.split(' '))
+    return dict(attrs={'class':lambda x:x and frozenset(x.split()).intersection(q)})
+
 class TelegraphUK(BasicNewsRecipe):
    title                 = 'Telegraph.co.uk'
    __author__            = 'Darko Miletic and Sujata Raman'
@@ -13,41 +18,16 @@ class TelegraphUK(BasicNewsRecipe):
    oldest_article        = 2
    category              = 'news, politics, UK'
    publisher             = 'Telegraph Media Group ltd.'
-    compress_news_images  = True
    max_articles_per_feed = 100
    no_stylesheets        = True
    language              = 'en_GB'
+    encoding = 'utf-8'
+    ignore_duplicate_articles = {'title', 'url'}
    remove_empty_feeds    = True
    use_embedded_content  = False

-    extra_css           = '''
-                        h1{font-family :Arial,Helvetica,sans-serif; font-size:large; }
-                        h2{font-family :Arial,Helvetica,sans-serif; font-size:x-small; color:#444444;}
-                        .story{font-family :Arial,Helvetica,sans-serif; font-size: x-small;}
-                        .byline{color:#666666; font-family :Arial,Helvetica,sans-serif; font-size: xx-small;}
-                        a{color:#234B7B; }
-                        .imageExtras{color:#666666; font-family :Arial,Helvetica,sans-serif; font-size: xx-small;}
-                        '''
-
-    conversion_options = {
-                          'comment'   : description
-                        , 'tags'      : category
-                        , 'publisher' : publisher
-                        , 'language'  : language
-                        }
-
-    keep_only_tags      = [
-                           dict(name='div', attrs={'class':['storyHead','byline']})
-                          ,dict(name='div', attrs={'id':'mainBodyArea'})
-                          ]
-    remove_tags         = [dict(name='div', attrs={
-        'class':['related_links_inline',"imgindex","next","prev","gutterUnder",'ssImgHide','imageExtras','ssImg hide','related_links_video']})
-                          ,dict(name='ul' , attrs={'class':['shareThis shareBottom']})
-                          ,dict(name='span', attrs={'class':['num','placeComment']})
-                          ]
-
    feeds               = [
-                         (u'UK News'        , u'http://www.telegraph.co.uk/news/uknews/rss')
+                        (u'UK News'        , u'http://www.telegraph.co.uk/news/uknews/rss')
                        ,(u'World News'     , u'http://www.telegraph.co.uk/news/worldnews/rss')
                        ,(u'Politics'       , u'http://www.telegraph.co.uk/news/newstopics/politics/rss')
                        ,(u'Finance'        , u'http://www.telegraph.co.uk/finance/rss')
@@ -59,12 +39,17 @@ class TelegraphUK(BasicNewsRecipe):
                        ,(u'Comment'        , u'http://www.telegraph.co.uk/comment/rss')
                        ,(u'Travel'        , u'http://www.telegraph.co.uk/travel/rss')
                        ,(u'How about that?', u'http://www.telegraph.co.uk/news/newstopics/howaboutthat/rss')
-                         ]
-    def populate_article_metadata(self, article, soup, first):
-        if first and hasattr(self, 'add_toc_thumbnail'):
-            picdiv = soup.find('img')
-            if picdiv is not None:
-                self.add_toc_thumbnail(article,picdiv['src'])
+    ]
+
+    keep_only_tags = [
+        classes('lead-asset-image-container headline__heading footer-author'),
+        dict(itemprop='articleBody'),
+    ]
+    remove_tags = [
+        dict(name=['link', 'meta', 'style']),
+        classes('videoPlayer'),
+    ]
+    remove_attributes = 'width height'.split()

    def get_article_url(self, article):
        url = article.get('link', None)
@@ -72,3 +57,11 @@ class TelegraphUK(BasicNewsRecipe):
            url = None
        return url

+    def preprocess_html(self, soup):
+        for img in soup.findAll('img', attrs={'data-frz-src-array':True}):
+            d = json.loads(img['data-frz-src-array'].replace("'", '"'))
+            for item in d:
+                if int(item.get('width', 0)) > 700:
+                    img['src'] = item['src']
+                    break
+        return soup