Update Telegraph UK

2025-07-09 03:04:10 -04:00 · 2017-03-15 16:57:45 +05:30 · 2017-03-15 16:57:45 +05:30 · 7e23a26063
commit 7e23a26063
parent b2e38cd0d4
1 changed file with 31 additions and 16 deletions
--- a/recipes/telegraph_uk.recipe
+++ b/recipes/telegraph_uk.recipe
@@ -5,12 +5,21 @@ telegraph.co.uk
 '''

 import json
+from calibre import random_user_agent
 from calibre.web.feeds.news import BasicNewsRecipe


 def classes(classes):
    q = frozenset(classes.split(' '))
-    return dict(attrs={'class': lambda x: x and frozenset(x.split()).intersection(q)})
+    return dict(
+        attrs={'class': lambda x: x and frozenset(x.split()).intersection(q)}
+    )
+
+
+def absolutize(url):
+    if url.startswith('/'):
+        url = 'http://www.telegraph.co.uk' + url
+    return url


 class TelegraphUK(BasicNewsRecipe):
@@ -29,7 +38,6 @@ class TelegraphUK(BasicNewsRecipe):
    use_embedded_content = False

    feeds = [
-
        (u'UK News', u'http://www.telegraph.co.uk/news/uknews/rss'),
        (u'World News', u'http://www.telegraph.co.uk/news/worldnews/rss'),
        (u'Politics', u'http://www.telegraph.co.uk/news/newstopics/politics/rss'),
@@ -54,6 +62,11 @@ class TelegraphUK(BasicNewsRecipe):
    ]
    remove_attributes = 'width height'.split()

+    def get_browser(self):
+        return BasicNewsRecipe.get_browser(
+            self, user_agent=random_user_agent(allow_ie=False)
+        )
+
    def get_article_url(self, article):
        url = article.get('link', None)
        if 'picture-galleries' in url or 'pictures' in url or 'picturegalleries' in url:
@@ -61,10 +74,12 @@ class TelegraphUK(BasicNewsRecipe):
        return url

    def preprocess_html(self, soup):
-        for img in soup.findAll('img', attrs={'data-frz-src-array': True}):
+        for img in soup.findAll(attrs={'data-frz-src-array': True}):
+            img['style'] = ''
+            img.name = 'img'
            d = json.loads(img['data-frz-src-array'].replace("'", '"'))
            for item in d:
                if int(item.get('width', 0)) > 700:
-                    img['src'] = item['src']
+                    img['src'] = absolutize(item['src'])
                    break
        return soup