Add recipes for The Irish Times and The International Herald Tribune

2025-12-17 10:35:02 -05:00 · 2008-08-17 09:55:25 -07:00 · 2008-08-17 09:55:25 -07:00 · 6334cc850c
commit 6334cc850c
parent c31e509de2
4 changed files with 115 additions and 6 deletions
--- a/src/calibre/web/feeds/news.py
+++ b/src/calibre/web/feeds/news.py
@ -98,6 +98,10 @@ class BasicNewsRecipe(object, LoggingInterface):
    #: embedded content.
    use_embedded_content   = None
    
+    #: Set to True and implement :meth:`get_obfuscated_article` to handle
+    #: websites that try to make it difficult to scrape content.
+    articles_are_obfuscated = False
+    
    #: Specify any extra :term:`CSS` that should be addded to downloaded :term:`HTML` files
    #: It will be inserted into `<style>` tags, just before the closing
    #: `</head>` tag thereby overrinding all :term:`CSS` except that which is
@ -360,12 +364,25 @@ class BasicNewsRecipe(object, LoggingInterface):
        '''
        raise NotImplementedError
    
+    def get_obfuscated_article(self, url, logger):
+        '''
+        If you set :attr:`articles_are_obfuscated` this method is called with
+        every article URL. It should return the path to a file on the filesystem
+        that contains the article HTML. That file is processed by the recursive
+        HTML fetching engine, so it can contain links to pages/images on the web.
+        
+        This method is typically useful for sites that try to make it difficult to
+        access article content automatically. See for example the 
+        :mod:`calibre.web.feeds.recipes.iht` recipe. The base implementation raises NotImplementedError.
+        '''
+        raise NotImplementedError
+    
    def __init__(self, options, parser, progress_reporter):
        '''
        Initialize the recipe.
-        @param options: Parsed commandline options 
-        @param parser:  Command line option parser. Used to intelligently merge options.
-        @param progress_reporter: A Callable that takes two arguments: progress (a number between 0 and 1) and a string message. The message should be optional.
+        :param options: Parsed commandline options 
+        :param parser:  Command line option parser. Used to intelligently merge options.
+        :param progress_reporter: A Callable that takes two arguments: progress (a number between 0 and 1) and a string message. The message should be optional.
        '''
        LoggingInterface.__init__(self, logging.getLogger('feeds2disk'))
        if not isinstance(self.title, unicode):
@ -564,7 +581,11 @@ class BasicNewsRecipe(object, LoggingInterface):
    
    def fetch_article(self, url, dir, logger, f, a, num_of_feeds):
        return self._fetch_article(url, dir, logger, f, a, num_of_feeds)
-        
+    
+    def fetch_obfuscated_article(self, url, dir, logger, f, a, num_of_feeds):
+        path = os.path.abspath(self.get_obfuscated_article(url, logger))  # recipe hook returns the path of a local file holding the article HTML
+        url = ('file:'+path) if iswindows else ('file://'+path)  # wrap that path as a file URL; the prefix differs when iswindows is set
+        return self._fetch_article(url, dir, logger, f, a, num_of_feeds)  # same call fetch_article makes, but on the local file URL
    
    def fetch_embedded_article(self, article, dir, logger, f, a, num_of_feeds):
        pt = PersistentTemporaryFile('_feeds2disk.html')
@ -620,7 +641,8 @@ class BasicNewsRecipe(object, LoggingInterface):
                    continue
                    
                func, arg = (self.fetch_embedded_article, article) if self.use_embedded_content else \
-                            (self.fetch_article, url)
+                            ((self.fetch_obfuscated_article if self.articles_are_obfuscated \
+                              else self.fetch_article), url)
                req = WorkRequest(func, (arg, art_dir, logger, f, a, len(feed)), 
                                      {}, (f, a), self.article_downloaded, 
                                      self.error_in_article_download)
--- a/src/calibre/web/feeds/recipes/init.py
+++ b/src/calibre/web/feeds/recipes/init.py
@ -8,7 +8,7 @@ recipes = [
           'newsweek', 'atlantic', 'economist', 'portfolio', 
           'nytimes', 'usatoday', 'outlook_india', 'bbc', 'greader', 'wsj',
           'wired', 'globe_and_mail', 'smh', 'espn', 'business_week',
-           'ars_technica', 'upi', 'new_yorker',
+           'ars_technica', 'upi', 'new_yorker', 'irish_times', 'iht',
          ]

 import re, imp, inspect, time
--- a/src/calibre/web/feeds/recipes/iht.py
+++ b/src/calibre/web/feeds/recipes/iht.py
@ -0,0 +1,51 @@
+__license__   = 'GPL v3'
+__copyright__ = '2008, Derry FitzGerald'
+'''
+iht.com
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ptempfile import PersistentTemporaryFile
+
+
+class InternationalHeraldTribune(BasicNewsRecipe):  # recipe for iht.com; article HTML is obtained by submitting each page's 'printFriendly' form
+    title          = u'The International Herald Tribune'
+    __author__     = 'Derry FitzGerald'
+    oldest_article = 1
+    max_articles_per_feed = 10
+    no_stylesheets = True
+
+    remove_tags    = [dict(name='div', attrs={'class':'footer'})]
+    extra_css      = '.headline {font-size: x-large;} \n .fact { padding-top: 10pt  }' 
+
+    feeds          = [
+                      (u'Frontpage', u'http://www.iht.com/rss/frontpage.xml'), 
+                      (u'Business', u'http://www.iht.com/rss/business.xml'),
+                      (u'Americas', u'http://www.iht.com/rss/america.xml'),
+                      (u'Europe', u'http://www.iht.com/rss/europe.xml'),
+                      (u'Asia', u'http://www.iht.com/rss/asia.xml'),
+                      (u'Africa and Middle East', u'http://www.iht.com/rss/africa.xml'),
+                      (u'Opinion', u'http://www.iht.com/rss/opinion.xml'),
+                      (u'Technology', u'http://www.iht.com/rss/technology.xml'),
+                      (u'Health and Science', u'http://www.iht.com/rss/healthscience.xml'),
+                      (u'Sports', u'http://www.iht.com/rss/sports.xml'),
+                      (u'Culture', u'http://www.iht.com/rss/arts.xml'),
+                      (u'Style and Design', u'http://www.iht.com/rss/style.xml'),
+                      (u'Travel', u'http://www.iht.com/rss/travel.xml'),
+                      (u'At Home Abroad', u'http://www.iht.com/rss/athome.xml'),
+                      (u'Your Money', u'http://www.iht.com/rss/yourmoney.xml'),
+                      (u'Properties', u'http://www.iht.com/rss/properties.xml')
+                    ]
+    temp_files = []  # NOTE: class-level list, so it is shared by every instance; holds the temp file objects created below
+    articles_are_obfuscated = True  # makes the framework download articles via fetch_obfuscated_article / get_obfuscated_article
+    
+    def get_obfuscated_article(self, url, logger):  # returns the name of a temp file containing the form response; logger is unused here
+        br = self.get_browser()
+        br.open(url)  # load the regular article page first; its form is what gets submitted
+        br.select_form(name='printFriendly')  # choose the form named 'printFriendly' on that page
+        res = br.submit()
+        html = res.read()  # body of the form response, written out verbatim below
+        self.temp_files.append(PersistentTemporaryFile('_iht.html'))  # keep a reference to the temp file object in the shared list
+        self.temp_files[-1].write(html)
+        self.temp_files[-1].close()  # close before returning so the caller can read the file by name
+        return self.temp_files[-1].name
--- a/src/calibre/web/feeds/recipes/irish_times.py
+++ b/src/calibre/web/feeds/recipes/irish_times.py
@ -0,0 +1,36 @@
+__license__   = 'GPL v3'
+__copyright__ = '2008, Derry FitzGerald'
+'''
+irishtimes.com
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class IrishTimes(BasicNewsRecipe):  # recipe for the irishtimes.com newspaper RSS feeds listed below
+    title          = u'The Irish Times'
+    __author__     = 'Derry FitzGerald'
+    no_stylesheets = True
+
+    remove_tags    = [dict(name='div', attrs={'class':'footer'})]
+    extra_css      = '.headline {font-size: x-large;} \n .fact { padding-top: 10pt  }' 
+
+    feeds          = [
+                      ('Frontpage', 'http://www.irishtimes.com/feeds/rss/newspaper/index.rss'), 
+                      ('Ireland', 'http://www.irishtimes.com/feeds/rss/newspaper/ireland.rss'),
+                      ('World', 'http://www.irishtimes.com/feeds/rss/newspaper/world.rss'),
+                      ('Finance', 'http://www.irishtimes.com/feeds/rss/newspaper/finance.rss'),
+                      ('Features', 'http://www.irishtimes.com/feeds/rss/newspaper/features.rss'),
+                      ('Sport', 'http://www.irishtimes.com/feeds/rss/newspaper/sport.rss'),
+                      ('Opinion', 'http://www.irishtimes.com/feeds/rss/newspaper/opinion.rss'),
+                      ('Letters', 'http://www.irishtimes.com/feeds/rss/newspaper/letters.rss'),
+                      ('Health', 'http://www.irishtimes.com/feeds/rss/newspaper/health.rss'),
+                      ('Education and Parenting', 'http://www.irishtimes.com/feeds/rss/newspaper/education.rss'),
+                      ('Science Today', 'http://www.irishtimes.com/feeds/rss/newspaper/sciencetoday.rss'),
+                      ('The Ticket', 'http://www.irishtimes.com/feeds/rss/newspaper/theticket.rss'),
+                      ('Weekend', 'http://www.irishtimes.com/feeds/rss/newspaper/weekend.rss'),
+                      ('News Features', 'http://www.irishtimes.com/feeds/rss/newspaper/newsfeatures.rss'),
+                      ('Magazine', 'http://www.irishtimes.com/feeds/rss/newspaper/magazine.rss'),
+                    ]
+
+    def print_version(self, url):  # maps an article URL to the URL that should be downloaded instead
+        return url.replace('.html', '_pf.html')  # rewrites every '.html' occurrence; '_pf' is presumably the print-friendly page - confirm