Implement support for full-content feeds via a FullContentProfile

2025-11-18 12:33:03 -05:00 · 2008-01-22 06:34:05 +00:00 · 2008-01-22 06:34:05 +00:00 · f79fd7a15c
commit f79fd7a15c
parent 2080d9e52d
2 changed files with 110 additions and 15 deletions
--- a/src/libprs500/ebooks/lrf/web/profiles/init.py
+++ b/src/libprs500/ebooks/lrf/web/profiles/init.py
@ -153,9 +153,10 @@ class DefaultProfile(object):
        return index
-    def parse_feeds(self):
+    def parse_feeds(self, require_url=True):
        '''
        Create list of articles from a list of feeds.
        @param require_url: If True, skip articles that don't have a link to an HTML page with the full article contents.
        @return: A dictionary whose keys are feed titles and whose values are each
        a list of dictionaries. Each list contains dictionaries of the form:
        {
@ -163,6 +164,7 @@ class DefaultProfile(object):
            'url'         : URL of print version,
            'date'        : The publication date of the article as a string,
            'description' : A summary of the article
            'content'     : The full article (can be an empty string). This is unused in DefaultProfile
        }
        '''
        added_articles = {}
@ -196,14 +198,25 @@ class DefaultProfile(object):
                        if url:
                            break
-                    if not url or not url.string:
+                    if require_url and (not url or not url.string):
                        self.logger.debug('Skipping article as it does not have a link url')
                        continue
                    url = url.string if (url and url.string) else ''
                    content = item.find('content:encoded')
                    if not content:
                        content = item.find('description')
                    if content:
                        content = self.process_html_description(content, strip_links=False)
                    else:
                        content = ''
                    d = { 
                        'title'    : item.find('title').string,                 
-                        'url'      : self.print_version(url.string),
+                        'url'      : self.print_version(url),
                        'timestamp': self.strptime(pubdate) if self.use_pubdate else time.time(),
-                        'date'     : pubdate if self.use_pubdate else time.ctime()
+                        'date'     : pubdate if self.use_pubdate else time.ctime(),
                        'content'  : content,
                        }
                    delta = time.time() - d['timestamp']
                    if not self.allow_duplicates:
@ -240,13 +253,19 @@ class DefaultProfile(object):
        pass
    @classmethod
-    def process_html_description(cls, tag):
+    def process_html_description(cls, tag, strip_links=True):
        src = '\n'.join(tag.contents)
-        replaced_entities = [ 'amp', 'lt', 'gt' , 'ldquo', 'rdquo', 'lsquo', 'rsquo' ]
+        match = re.match(r'<\!\[CDATA\[(.*)\]\]>', src.lstrip())
-        for e in replaced_entities:
+        if match:
-            ent = '&'+e+';'
+            src = match.group(1)
-            src = src.replace(ent, unichr(name2codepoint[e]))
+        else:
-        return re.compile(r'<a.*?>(.*?)</a>', re.IGNORECASE|re.DOTALL).sub(r'\1', src)
+            replaced_entities = [ 'amp', 'lt', 'gt' , 'ldquo', 'rdquo', 'lsquo', 'rsquo' ]
            for e in replaced_entities:
                ent = '&'+e+';'
                src = src.replace(ent, unichr(name2codepoint[e]))
        if strip_links:
            src = re.compile(r'<a.*?>(.*?)</a>', re.IGNORECASE|re.DOTALL).sub(r'\1', src)
        return src 
    DAY_MAP        = dict(Sun=0, Mon=1, Tue=2, Wed=3, Thu=4, Fri=5, Sat=6)
@ -301,3 +320,81 @@ class DefaultProfile(object):
        return args
 class FullContentProfile(DefaultProfile):
    '''
    Profile that builds its index from the article content carried in the feeds.

    The 'content' field of every article returned by parse_feeds is written to
    a local HTML file inside self.temp_dir and the generated index pages link
    to those local files.
    '''

    max_recursions = 3
    summary_length = 500 # Max number of characters in the short description
    article_counter = 0  # Number of article files written so far, used to name them

    def _write_html(self, path, markup):
        '''
        Write markup to the file at path, encoded as UTF-8.

        The file is closed explicitly, even if encoding or writing fails,
        instead of relying on garbage collection to release the handle.

        @param path: Path of the file to create or overwrite.
        @param markup: The text to write.
        '''
        f = open(path, 'wb')
        try:
            f.write(markup.encode('utf-8'))
        finally:
            f.close()

    def build_index(self):
        '''
        Build an RSS based index.html

        Writes one HTML file per article that has content, one index file per
        feed (category) and a top level index.html linking to the feed
        indices. All files are created in self.temp_dir.

        @return: Path to the generated index.html
        '''
        import os
        articles = self.parse_feeds(require_url=False)

        def build_sub_index(title, items):
            '''
            Write the article files for one feed and return the HTML source
            of that feed's index page. Articles without content are skipped.
            '''
            ilist = ''
            li = u'<li><a href="%(url)s">%(title)s</a> <span style="font-size: x-small">[%(date)s]</span><br/>\n'+\
                u'<div style="font-size:small; font-family:sans">%(description)s<br /></div></li>\n'
            for item in items:
                content = item['content']
                if not content:
                    self.logger.debug('Skipping article as it has no content:%s'%item['title'])
                    continue
                # Only mark the summary as elided when something was actually cut off
                summary = item['description']
                if len(summary) > self.summary_length:
                    summary = summary[:self.summary_length]+'&hellip;'
                item['description'] = summary
                self.article_counter += 1
                url = os.path.join(self.temp_dir, 'article%d.html'%self.article_counter)
                # The index entry must point at the locally generated file
                item['url'] = url
                self._write_html(url, u'''\
                    <html>
                    <body>
                    <h2>%s</h2>
                    <div>
                    %s
                    </div>
                    </body>
                    </html>'''%(item['title'], content))
                ilist += li%item
            return u'''\
            <html>
            <body>
            <h2>%(title)s</h2>
            <ul>
            %(items)s
            </ul>
            </body>
            </html>
            '''%dict(title=title, items=ilist.rstrip())

        cnum = 0
        clist = ''
        # sorted() gives the same order as keys() followed by sort(), without
        # depending on keys() returning a list
        for category in sorted(articles.keys()):
            cnum  += 1
            cfile = os.path.join(self.temp_dir, 'category'+str(cnum)+'.html')
            prefix = 'file:' if iswindows else ''
            clist += u'<li><a href="%s">%s</a></li>\n'%(prefix+cfile, category)
            self._write_html(cfile, build_sub_index(category, articles[category]))

        src = '''\
        <html>
        <body>
        <h1>%(title)s</h1>
        <div style='text-align: right; font-weight: bold'>%(date)s</div>
        <ul>
        %(categories)s
        </ul>
        </body>
        </html>
        '''%dict(date=time.strftime('%a, %d %B, %Y', time.localtime()), 
                 categories=clist, title=self.title)
        index = os.path.join(self.temp_dir, 'index.html')
        self._write_html(index, src)
        return index
--- a/src/libprs500/ebooks/lrf/web/profiles/portfolio.py
+++ b/src/libprs500/ebooks/lrf/web/profiles/portfolio.py
@ -4,12 +4,11 @@
 ''' 
 ''' 
-from libprs500.ebooks.lrf.web.profiles import DefaultProfile  
+from libprs500.ebooks.lrf.web.profiles import FullContentProfile  
-class Portfolio(DefaultProfile): 
+class Portfolio(FullContentProfile): 
        title = 'Portfolio'
        max_recursions = 0
        max_articles_per_feed = 50
        timefmt  = ' [%a, %b %d, %Y]' 
        html_description = True 
@ -17,7 +16,6 @@ class Portfolio(DefaultProfile):
        html2lrf_options = ['--ignore-tables']
        ##delay = 1
        ## Don't grab articles more than 30 days old
        oldest_article = 30
 ## Comment out the feeds you don't want retrieved.