Fix Dilbert feed for new Dilbert website

Kovid Goyal 2008-01-29 04:43:22 +00:00
parent 178936b977
commit 997367ed56
2 changed files with 48 additions and 19 deletions

View File

@ -15,7 +15,7 @@
'''
'''
import tempfile, time, calendar, re, operator
import tempfile, time, calendar, re, operator, atexit, shutil, os
from htmlentitydefs import name2codepoint
from libprs500 import __appname__, iswindows, browser
@ -100,15 +100,10 @@ class DefaultProfile(object):
self.url = 'file:'+ ('' if iswindows else '//') + self.build_index()
except NotImplementedError:
self.url = None
def __del__(self):
import os, shutil
if os.path.isdir(self.temp_dir):
shutil.rmtree(self.temp_dir)
atexit.register(cleanup, self.temp_dir)
def build_index(self):
'''Build an RSS based index.html'''
import os
articles = self.parse_feeds()
@ -168,6 +163,8 @@ class DefaultProfile(object):
'''
if not tag:
return ''
if isinstance(tag, basestring):
return tag
strings = []
for item in tag.contents:
if isinstance(item, (NavigableString, CData)):
@ -180,6 +177,19 @@ class DefaultProfile(object):
strings.append(item['alt'])
return u''.join(strings)
def get_article_url(self, item):
'''
Return the article URL given an item Tag from a feed, or None if no valid URL is found.
@param item: A BeautifulSoup Tag instance corresponding to the <item> tag from a feed.
'''
url = None
for element in self.url_search_order:
url = item.find(element)
if url:
break
return url
def parse_feeds(self, require_url=True):
'''
Create list of articles from a list of feeds.
@ -220,15 +230,14 @@ class DefaultProfile(object):
continue
pubdate = self.tag_to_string(pubdate)
pubdate = pubdate.replace('+0000', 'GMT')
for element in self.url_search_order:
url = item.find(element)
if url:
break
if require_url and (not url or not url.string):
url = self.get_article_url(item)
url = self.tag_to_string(url)
if require_url and not url:
self.logger.debug('Skipping article as it does not have a link url')
continue
url = self.tag_to_string(url)
content = item.find('content:encoded')
if not content:
@ -362,7 +371,6 @@ class FullContentProfile(DefaultProfile):
def build_index(self):
'''Build an RSS based index.html'''
import os
articles = self.parse_feeds(require_url=False)
def build_sub_index(title, items):
@ -449,3 +457,10 @@ def create_class(src):
if item.__name__ not in ['DefaultProfile', 'FullContentProfile']:
return item
def cleanup(tdir):
try:
if os.path.isdir(tdir):
shutil.rmtree(tdir)
except:
# ignore errors: the directory may already be gone or the interpreter may be shutting down
pass
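
Read together, the profiles.py hunks make two changes: temp-directory removal moves out of __del__ (which is not guaranteed to run, and may find os and shutil already torn down at interpreter exit) into a module-level cleanup() registered with atexit, and the per-item URL lookup is factored into an overridable get_article_url() hook so that profiles such as Dilbert below can supply their own. A minimal, self-contained sketch of the atexit pattern follows; the variable name temp_dir and the mkdtemp prefix are illustrative, not taken from the commit.

import atexit, os, shutil, tempfile

def cleanup(tdir):
    # Remove the temp dir if it still exists; swallow errors, since this may
    # run while the interpreter is shutting down.
    try:
        if os.path.isdir(tdir):
            shutil.rmtree(tdir)
    except:
        pass

temp_dir = tempfile.mkdtemp(prefix='web_profile_')
atexit.register(cleanup, temp_dir)   # runs at exit even when __del__ never would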

View File

@ -19,7 +19,7 @@
'''
Fetch Dilbert.
'''
import os
from libprs500.ebooks.lrf.web.profiles import DefaultProfile
@ -27,7 +27,7 @@ class Dilbert(DefaultProfile):
title = 'Dilbert'
timefmt = ' [%d %b %Y]'
max_recursions = 1
max_recursions = 2
max_articles_per_feed = 6
html_description = True
no_stylesheets = True
@ -35,3 +35,17 @@ class Dilbert(DefaultProfile):
def get_feeds(self):
return [ ('Dilbert', 'http://feeds.feedburner.com/tapestrydilbert') ]
def get_article_url(self, item):
return item.find('enclosure')['url']
def build_index(self):
index = os.path.join(self.temp_dir, 'index.html')
articles = list(self.parse_feeds(require_url=False).values())[0]
res = ''
for item in articles:
res += '<h3>%s</h3><img style="page-break-after:always" src="%s" />\n'%(item['title'], item['url'])
res = '<html><body><h1>Dilbert</h1>%s</body></html>'%res
open(index, 'wb').write(res)
return index
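
The feedburner Dilbert feed carries each strip as an <enclosure url="..."/> on the item rather than in <link>, which is why the profile overrides get_article_url() to return enclosure['url'] and why build_index() only has to emit one <img> per article. Below is a stand-alone sketch of that lookup, using the standard library's ElementTree instead of the bundled BeautifulSoup, with a made-up sample item (the real feed's markup will differ).

import xml.etree.ElementTree as ET

# Illustrative item only; not copied from the real feedburner feed.
SAMPLE_ITEM = '''
<item>
  <title>Dilbert strip for 2008-01-28</title>
  <enclosure url="http://example.com/strips/2008-01-28.gif" type="image/gif"/>
</item>
'''

def get_article_url(item):
    # Mirrors the profile's override: the strip URL lives in the enclosure's
    # url attribute; return None if the item has no enclosure.
    enc = item.find('enclosure')
    return enc.get('url') if enc is not None else None

item = ET.fromstring(SAMPLE_ITEM)
print(get_article_url(item))   # http://example.com/strips/2008-01-28.gif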