Update Irish Times. Fixes #1159553 (Updated news recipe for the Irish Times)

2025-12-19 03:25:01 -05:00 · 2013-03-25 08:59:17 +05:30 · 2013-03-25 08:59:17 +05:30 · 3eacc9cadb
commit 3eacc9cadb
parent 8535e21694
1 changed files with 41 additions and 44 deletions
--- a/recipes/irish_times.recipe
+++ b/recipes/irish_times.recipe
@@ -1,65 +1,62 @@
 __license__  = 'GPL v3'
-__copyright__ = "2008, Derry FitzGerald. 2009 Modified by Ray Kinsella and David O'Callaghan, 2011 Modified by Phil Burns"
+__copyright__ = "2008, Derry FitzGerald. 2009 Modified by Ray Kinsella and David O'Callaghan, 2011 Modified by Phil Burns, 2013 Tom Scholl"
 '''
 irishtimes.com
 '''
-import re
+import urlparse, re
 from calibre.web.feeds.news import BasicNewsRecipe
 from calibre.ptempfile import PersistentTemporaryFile
 class IrishTimes(BasicNewsRecipe):
    title          = u'The Irish Times'
-    encoding  = 'ISO-8859-15'
+    __author__    = "Derry FitzGerald, Ray Kinsella, David O'Callaghan and Phil Burns, Tom Scholl"
    __author__    = "Derry FitzGerald, Ray Kinsella, David O'Callaghan and Phil Burns"
    language = 'en_IE'
    timefmt = ' (%A, %B %d, %Y)'
    masthead_url = 'http://www.irishtimes.com/assets/images/generic/website/logo_theirishtimes.png'
    encoding = 'utf-8'
    oldest_article = 1.0
-    max_articles_per_feed  = 100
+    max_articles_per_feed = 100
    remove_empty_feeds = True
    no_stylesheets = True
-    simultaneous_downloads= 5
+    temp_files = []
-
+    articles_are_obfuscated = True
    r = re.compile('.*(?P<url>http:\/\/(www.irishtimes.com)|(rss.feedsportal.com\/c)\/.*\.html?).*')
    remove_tags    = [dict(name='div', attrs={'class':'footer'})]
    extra_css      = 'p, div { margin: 0pt; border: 0pt; text-indent: 0.5em } .headline {font-size: large;} \n .fact { padding-top: 10pt  }'
    feeds          = [
-                      ('Frontpage', 'http://www.irishtimes.com/feeds/rss/newspaper/index.rss'),
+                      ('News', 'http://www.irishtimes.com/cmlink/the-irish-times-news-1.1319192'),
-                      ('Ireland', 'http://www.irishtimes.com/feeds/rss/newspaper/ireland.rss'),
+                      ('World', 'http://www.irishtimes.com/cmlink/irishtimesworldfeed-1.1321046'),
-                      ('World', 'http://www.irishtimes.com/feeds/rss/newspaper/world.rss'),
+                      ('Politics', 'http://www.irishtimes.com/cmlink/irish-times-politics-rss-1.1315953'),
-                      ('Finance', 'http://www.irishtimes.com/feeds/rss/newspaper/finance.rss'),
+                      ('Business', 'http://www.irishtimes.com/cmlink/the-irish-times-business-1.1319195'),
-                      ('Features', 'http://www.irishtimes.com/feeds/rss/newspaper/features.rss'),
+                      ('Culture', 'http://www.irishtimes.com/cmlink/the-irish-times-culture-1.1319213'),
-                      ('Sport', 'http://www.irishtimes.com/feeds/rss/newspaper/sport.rss'),
+                      ('Sport', 'http://www.irishtimes.com/cmlink/the-irish-times-sport-1.1319194'),
-                      ('Opinion', 'http://www.irishtimes.com/feeds/rss/newspaper/opinion.rss'),
+                      ('Debate', 'http://www.irishtimes.com/cmlink/debate-1.1319211'),
-                      ('Letters', 'http://www.irishtimes.com/feeds/rss/newspaper/letters.rss'),
+                      ('Life & Style', 'http://www.irishtimes.com/cmlink/the-irish-times-life-style-1.1319214'),
                      ('Magazine', 'http://www.irishtimes.com/feeds/rss/newspaper/magazine.rss'),
                      ('Health', 'http://www.irishtimes.com/feeds/rss/newspaper/health.rss'),
                      ('Education & Parenting', 'http://www.irishtimes.com/feeds/rss/newspaper/education.rss'),
                      ('Motors', 'http://www.irishtimes.com/feeds/rss/newspaper/motors.rss'),
                      ('An Teanga Bheo', 'http://www.irishtimes.com/feeds/rss/newspaper/anteangabheo.rss'),
                      ('Commercial Property', 'http://www.irishtimes.com/feeds/rss/newspaper/commercialproperty.rss'),
                      ('Science Today', 'http://www.irishtimes.com/feeds/rss/newspaper/sciencetoday.rss'),
                      ('Property', 'http://www.irishtimes.com/feeds/rss/newspaper/property.rss'),
                      ('The Tickets', 'http://www.irishtimes.com/feeds/rss/newspaper/theticket.rss'),
                      ('Weekend', 'http://www.irishtimes.com/feeds/rss/newspaper/weekend.rss'),
                      ('News features', 'http://www.irishtimes.com/feeds/rss/newspaper/newsfeatures.rss'),
                      ('Obituaries', 'http://www.irishtimes.com/feeds/rss/newspaper/obituaries.rss'),
                    ]
-    def print_version(self, url):
+    def get_obfuscated_article(self, url):
-        if url.count('rss.feedsportal.com'):
+        # Insert a pic from the original url, but use content from the print url
-            #u = url.replace('0Bhtml/story01.htm','_pf0Bhtml/story01.htm')
+        pic = None
-            u = url.find('irishtimes')
+        pics = self.index_to_soup(url)
-            u = 'http://www.irishtimes.com' + url[u + 12:]
+        div = pics.find('div', {'class' : re.compile('image-carousel')})
-            u = u.replace('0C', '/')
+        if div:
-            u = u.replace('A', '')
+            pic = div.img
-            u = u.replace('0Bhtml/story01.htm', '_pf.html')
+            if pic:
-        else:
+                try:
-            u = url.replace('.html','_pf.html')
+                    pic['src'] = urlparse.urljoin(url, pic['src'])
-        return u
+                    pic.extract()
                except:
                    pic = None
        content = self.index_to_soup(url + '?mode=print&ot=example.AjaxPageLayout.ot')
        if pic:
            content.p.insert(0, pic)
        self.temp_files.append(PersistentTemporaryFile('_fa.html'))
        self.temp_files[-1].write(content.prettify())
        self.temp_files[-1].close()
        return self.temp_files[-1].name
    def get_article_url(self, article):
        return article.link