Add recipe for xkcd thanks to Martin Pitt. Also make Time magazine recipe a little more efficient.

2025-11-18 12:33:03 -05:00 · 2009-01-03 20:19:47 -08:00 · 2009-01-03 20:19:47 -08:00 · 8f2dd9cf84
commit 8f2dd9cf84
parent bca2a86168
3 changed files with 58 additions and 13 deletions
--- a/src/calibre/web/feeds/recipes/__init__.py
+++ b/src/calibre/web/feeds/recipes/__init__.py
@@ -20,7 +20,7 @@ recipe_modules = ['recipe_' + r for r in (
           'science_news', 'the_nation', 'lrb', 'harpers_full', 'liberation',
           'linux_magazine', 'telegraph_uk', 'utne', 'sciencedaily', 'forbes',
           'time_magazine', 'endgadget', 'fudzilla', 'nspm_int', 'nspm', 'pescanik',
-           'spiegel_int', 'themarketticker', 'tomshardware',
+           'spiegel_int', 'themarketticker', 'tomshardware', 'xkcd',
          )]

 import re, imp, inspect, time, os
--- a/src/calibre/web/feeds/recipes/recipe_time_magazine.py
+++ b/src/calibre/web/feeds/recipes/recipe_time_magazine.py
@@ -6,22 +6,28 @@ __copyright__ = '2008, Darko Miletic <darko.miletic at gmail.com>'
 time.com
 '''

-from calibre.ebooks.BeautifulSoup import BeautifulSoup
 from calibre.web.feeds.news import BasicNewsRecipe

 class Time(BasicNewsRecipe):
    title                 = u'Time'
-    __author__            = 'Darko Miletic'
+    __author__            = 'Kovid Goyal'
    description           = 'Weekly magazine'    
    oldest_article        = 7
    max_articles_per_feed = 100
    no_stylesheets        = True
    use_embedded_content  = False
    
-    #cover_url = 'http://img.timeinc.net/time/rd/trunk/www/web/feds/i/logo_time_home.gif'
-    
    keep_only_tags = [dict(name='div', attrs={'class':'tout1'})]
-    remove_tags    = [dict(name='ul', attrs={'class':['button', 'find']})]
+    remove_tags_after = [dict(id='connectStory')]
+    remove_tags    = [
+                      dict(name='ul', attrs={'class':['button', 'find']}),
+                      dict(name='div', attrs={'class':['nav', 'header', 'sectheader', 
+                                                       'searchWrap', 'subNav', 
+                                                       'artTools', 'connect',
+                                                       'similarrecs']}),
+                      dict(name='div', id=['articleSideBar', 'connectStory']),
+                      dict(name='dl', id=['links']),                                 
+                      ]

    feeds          = [
                       (u'Top Stories', u'http://feedproxy.google.com/time/topstories')
@@ -34,17 +40,20 @@ class Time(BasicNewsRecipe):
                       ,(u'Travel', u'http://feedproxy.google.com/time/travel')
                     ]
    
+    def get_article_url(self, article):
+        return article.get('guid',  article['link'])
+    
    def get_cover_url(self):
        soup = self.index_to_soup('http://www.time.com/time/')
        img = soup.find('img', alt='Current Time.com Cover', width='107')
        if img is not None:
            return img.get('src', None)
        
-    
    def print_version(self, url):
-        raw = self.browser.open(url).read()
-        soup = BeautifulSoup(raw.decode('utf8', 'replace'))
-        print_link = soup.find('a', {'id':'prt'})
-        if print_link is None:
-            return ''
-        return 'http://www.time.com' + print_link['href']
+        try:
+            soup = self.index_to_soup(url)
+            print_link = soup.find('a', {'id':'prt'})
+            return 'http://www.time.com' + print_link['href']
+        except:
+            self.log_exception('Failed to find print version for '+url)
+        return ''
--- a/src/calibre/web/feeds/recipes/recipe_xkcd.py
+++ b/src/calibre/web/feeds/recipes/recipe_xkcd.py
@@ -0,0 +1,36 @@
+__license__   = 'GPL v3'
+__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
+
+'''
+Fetch xkcd.
+'''
+
+import time
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class XkcdCom(BasicNewsRecipe):
+    title = 'xkcd'
+    description = 'A webcomic of romance and math humor.'
+    __author__ = 'Martin Pitt'
+    use_embedded_content   = False
+    oldest_article = 60
+    keep_only_tags = [dict(id='middleContent')]
+    remove_tags = [dict(name='ul'), dict(name='h3'), dict(name='br')]
+    no_stylesheets = True
+    
+    def parse_index(self):
+        INDEX = 'http://xkcd.com/archive/'
+
+        soup = self.index_to_soup(INDEX) 
+        articles = []
+        for item in soup.findAll('a', title=True):
+            articles.append({
+                'date': item['title'],
+                'timestamp': time.mktime(time.strptime(item['title'], '%Y-%m-%d'))+1,
+                'url': 'http://xkcd.com' + item['href'],
+                'title': self.tag_to_string(item).encode('UTF-8'),
+                'description': '',
+                'content': '',
+            })
+
+        return [('xkcd', articles)]