Fix Dilbert feed for new Dilbert website

2025-07-09 03:04:10 -04:00 · 2008-01-29 04:43:22 +00:00 · 2008-01-29 04:43:22 +00:00 · 997367ed56
commit 997367ed56
parent 178936b977
2 changed files with 48 additions and 19 deletions
--- a/src/libprs500/ebooks/lrf/web/profiles/init.py
+++ b/src/libprs500/ebooks/lrf/web/profiles/init.py
@ -15,7 +15,7 @@
 '''
 '''
-import tempfile, time, calendar, re, operator
+import tempfile, time, calendar, re, operator, atexit, shutil, os
 from htmlentitydefs import name2codepoint
 from libprs500 import __appname__, iswindows, browser
@ -100,15 +100,10 @@ class DefaultProfile(object):
            self.url = 'file:'+ ('' if iswindows else '//') + self.build_index()
        except NotImplementedError:
            self.url = None
-    
+        atexit.register(cleanup, self.temp_dir)
    def __del__(self):
        '''Delete this profile's temporary directory if it still exists on disk.'''
        # Imported locally, as in the original, rather than relying on module globals.
        import os, shutil
        tdir = self.temp_dir
        if not os.path.isdir(tdir):
            return
        shutil.rmtree(tdir)
    def build_index(self):
        '''Build an RSS based index.html'''
        import os
        articles = self.parse_feeds()
@ -168,6 +163,8 @@ class DefaultProfile(object):
        '''
        if not tag:
            return ''
        if isinstance(tag, basestring):
            return tag
        strings = []
        for item in tag.contents:
            if isinstance(item, (NavigableString, CData)):
@ -180,6 +177,19 @@ class DefaultProfile(object):
                    strings.append(item['alt'])
        return u''.join(strings) 
    def get_article_url(self, item):
        '''
        Look up the article URL element for a feed item.

        Each name in C{self.url_search_order} is tried in turn with C{item.find};
        the first truthy result is returned. If nothing matches, the result of
        the last lookup is returned (None when the search order is empty).
        @param: A BeautifulSoup Tag instance corresponding to the <item> tag from a feed.
        '''
        found = None
        for name in self.url_search_order:
            found = item.find(name)
            if not found:
                continue
            return found
        return found
    def parse_feeds(self, require_url=True):
        '''
        Create list of articles from a list of feeds.
@ -220,15 +230,14 @@ class DefaultProfile(object):
                            continue
                        pubdate = self.tag_to_string(pubdate)
                        pubdate = pubdate.replace('+0000', 'GMT')
-                    for element in self.url_search_order:
+                    
-                        url = item.find(element)
+                    url = self.get_article_url(item)
-                        if url:
+                    
-                            break
+                    
-                        
+                    url = self.tag_to_string(url)
-                    if require_url and (not url or not url.string):
+                    if require_url and not url:
                        self.logger.debug('Skipping article as it does not have a link url')
                        continue
                    url = self.tag_to_string(url)
                    content = item.find('content:encoded')
                    if not content:
@ -362,7 +371,6 @@ class FullContentProfile(DefaultProfile):
    def build_index(self):
        '''Build an RSS based index.html'''
        import os
        articles = self.parse_feeds(require_url=False)
        def build_sub_index(title, items):
@ -448,4 +456,11 @@ def create_class(src):
        if hasattr(item, 'build_index'):
            if item.__name__ not in ['DefaultProfile', 'FullContentProfile']:
                return item
-    
+   
 def cleanup(tdir):
    '''
    Best-effort removal of the directory tree at C{tdir}.

    Does nothing if C{tdir} is not an existing directory. Ordinary errors
    raised while checking or deleting are ignored so that a failed cleanup
    never surfaces to the caller.
    @param tdir: Path of the directory to delete.
    '''
    try:
        if os.path.isdir(tdir):
            shutil.rmtree(tdir)
    except Exception:
        # Deliberately ignored: cleanup is best-effort. Catching Exception
        # rather than using a bare except lets KeyboardInterrupt and
        # SystemExit propagate instead of being silently swallowed.
        pass
--- a/src/libprs500/ebooks/lrf/web/profiles/dilbert.py
+++ b/src/libprs500/ebooks/lrf/web/profiles/dilbert.py
@ -19,7 +19,7 @@
 '''
 Fetch Dilbert.
 '''
-
+import os
 from libprs500.ebooks.lrf.web.profiles import DefaultProfile
@ -27,11 +27,25 @@ class Dilbert(DefaultProfile):
    title = 'Dilbert'
    timefmt = ' [%d %b %Y]'
-    max_recursions = 1
+    max_recursions = 2
    max_articles_per_feed = 6
    html_description = True
    no_stylesheets = True
    def get_feeds(self): 
-        return [ ('Dilbert', 'http://feeds.feedburner.com/tapestrydilbert') ] 
+        return [ ('Dilbert', 'http://feeds.feedburner.com/tapestrydilbert') ]
    def get_article_url(self, item):
        '''
        Return the value of the url attribute of the item's <enclosure> tag.

        Returns None when the item has no <enclosure> element or when the
        enclosure carries no url attribute, instead of raising.
        @param: The <item> tag from the feed; only its find() method is used here.
        '''
        enclosure = item.find('enclosure')
        if enclosure is None:
            # Indexing None would raise TypeError; report "no URL" instead.
            return None
        try:
            return enclosure['url']
        except KeyError:
            return None
    def build_index(self):
        '''
        Write an index.html that shows every fetched strip as an inline image.

        The articles are taken from the first value of the mapping returned by
        C{self.parse_feeds(require_url=False)}. Each article contributes an
        <h3> with its title followed by an <img> whose src is its url.
        @return: Path of the index.html created inside C{self.temp_dir}.
        '''
        index = os.path.join(self.temp_dir, 'index.html')
        articles = list(self.parse_feeds(require_url=False).values())[0]
        strips = ''.join(
            '<h3>%s</h3><img style="page-break-after:always" src="%s" />\n'%(item['title'], item['url'])
            for item in articles)
        # The closing tag used to be written as '</html' (missing '>').
        res = '<html><body><h1>Dilbert</h1>%s</body></html>'%strips
        if isinstance(res, type(u'')):
            # The file is opened in binary mode, so text has to be encoded
            # explicitly; relying on the implicit ASCII conversion fails as
            # soon as a title or url contains a non-ASCII character.
            res = res.encode('utf-8')
        f = open(index, 'wb')
        try:
            f.write(res)
        finally:
            # Close explicitly so the data is flushed before the path is used.
            f.close()
        return index