From f79fd7a15c719426b428cd12cef865e2f261daa5 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Tue, 22 Jan 2008 06:34:05 +0000
Subject: [PATCH] Implement support for full-content feeds via a FullContentProfile

---
 .../ebooks/lrf/web/profiles/__init__.py   | 117 ++++++++++++++++--
 .../ebooks/lrf/web/profiles/portfolio.py  |   8 +-
 2 files changed, 110 insertions(+), 15 deletions(-)

diff --git a/src/libprs500/ebooks/lrf/web/profiles/__init__.py b/src/libprs500/ebooks/lrf/web/profiles/__init__.py
index 490bf29df6..33b605e60f 100644
--- a/src/libprs500/ebooks/lrf/web/profiles/__init__.py
+++ b/src/libprs500/ebooks/lrf/web/profiles/__init__.py
@@ -153,9 +153,10 @@ class DefaultProfile(object):
 
         return index
 
-    def parse_feeds(self):
+    def parse_feeds(self, require_url=True):
         '''
         Create list of articles from a list of feeds.
+        @param require_url: If True, skip articles that do not have a link to an HTML page with the full article contents.
         @return: A dictionary whose keys are feed titles and whose values are each
         a list of dictionaries. Each list contains dictionaries of the form:
         {
@@ -163,6 +164,7 @@ class DefaultProfile(object):
         'url'         : URL of print version,
         'date'        : The publication date of the article as a string,
         'description' : A summary of the article
+        'content'     : The full article (can be an empty string). This is unused in DefaultProfile.
         }
         '''
         added_articles = {}
@@ -196,14 +198,25 @@ class DefaultProfile(object):
                         if url:
                             break
 
-                    if not url or not url.string:
+                    if require_url and (not url or not url.string):
                         self.logger.debug('Skipping article as it does not have a link url')
                         continue
+                    url = url.string if (url and url.string) else ''
+
+                    content = item.find('content:encoded')
+                    if not content:
+                        content = item.find('description')
+                    if content:
+                        content = self.process_html_description(content, strip_links=False)
+                    else:
+                        content = ''
+
                     d = {
                         'title'    : item.find('title').string,
-                        'url'      : self.print_version(url.string),
+                        'url'      : self.print_version(url),
                         'timestamp': self.strptime(pubdate) if self.use_pubdate else time.time(),
-                        'date'     : pubdate if self.use_pubdate else time.ctime()
+                        'date'     : pubdate if self.use_pubdate else time.ctime(),
+                        'content'  : content,
                         }
                     delta = time.time() - d['timestamp']
                     if not self.allow_duplicates:
@@ -240,13 +253,19 @@ class DefaultProfile(object):
         pass
 
     @classmethod
-    def process_html_description(cls, tag):
+    def process_html_description(cls, tag, strip_links=True):
         src = '\n'.join(tag.contents)
-        replaced_entities = [ 'amp', 'lt', 'gt' , 'ldquo', 'rdquo', 'lsquo', 'rsquo' ]
-        for e in replaced_entities:
-            ent = '&'+e+';'
-            src = src.replace(ent, unichr(name2codepoint[e]))
-        return re.compile(r'<a.*?>(.*?)</a>', re.IGNORECASE|re.DOTALL).sub(r'\1', src)
+        match = re.match(r'<\!\[CDATA\[(.*)\]\]>', src.lstrip())
+        if match:
+            src = match.group(1)
+        else:
+            replaced_entities = [ 'amp', 'lt', 'gt' , 'ldquo', 'rdquo', 'lsquo', 'rsquo' ]
+            for e in replaced_entities:
+                ent = '&'+e+';'
+                src = src.replace(ent, unichr(name2codepoint[e]))
+        if strip_links:
+            src = re.compile(r'<a.*?>(.*?)</a>', re.IGNORECASE|re.DOTALL).sub(r'\1', src)
+        return src
 
     DAY_MAP = dict(Sun=0, Mon=1, Tue=2, Wed=3, Thu=4, Fri=5, Sat=6)
@@ -301,3 +320,81 @@ class DefaultProfile(object):
 
         return args
 
+
+class FullContentProfile(DefaultProfile):
+
+    max_recursions = 3
+    summary_length = 500  # Max number of characters in the short description
+
+    article_counter = 0
+
+    def build_index(self):
+        '''Build an RSS based index.html'''
+        import os
+        articles = self.parse_feeds(require_url=False)
+
+        def build_sub_index(title, items):
+            ilist = ''
+            li = u'<li><a href="%(url)s">%(title)s</a> [%(date)s]<br/>\n'+\
+                 u'<div>%(description)s</div></li>\n'
+            for item in items:
+                content = item['content']
+                if not content:
+                    self.logger.debug('Skipping article as it has no content:%s'%item['title'])
+                    continue
+                item['description'] = item['description'][:self.summary_length]+'…'
+                self.article_counter = self.article_counter + 1
+                url = os.path.join(self.temp_dir, 'article%d.html'%self.article_counter)
+                item['url'] = url
+                open(url, 'wb').write((u'''\
+                <html>
+                <body>
+                <h2>%s</h2>
+                <div>
+                %s
+                </div>
+                </body>
+                </html>
+                '''%(item['title'], content)).encode('utf-8')
+                )
+                ilist += li%item
+            return u'''\
+            <html>
+            <body>
+            <h2>%(title)s</h2>
+            <ul>
+            %(items)s
+            </ul>
+            </body>
+            </html>
+            '''%dict(title=title, items=ilist.rstrip())
+
+        cnum = 0
+        clist = ''
+        categories = articles.keys()
+        categories.sort()
+        for category in categories:
+            cnum += 1
+            cfile = os.path.join(self.temp_dir, 'category'+str(cnum)+'.html')
+            prefix = 'file:' if iswindows else ''
+            clist += u'<li><a href="%s">%s</a></li>\n'%(prefix+cfile, category)
+            src = build_sub_index(category, articles[category])
+            open(cfile, 'wb').write(src.encode('utf-8'))
+
+        src = '''\
+        <html>
+        <body>
+        <h1>%(title)s</h1>
+        <div>%(date)s</div>
+        <ul>
+        %(categories)s
+        </ul>
+        </body>
+        </html>
+        '''%dict(date=time.strftime('%a, %d %B, %Y', time.localtime()),
+                 categories=clist, title=self.title)
+        index = os.path.join(self.temp_dir, 'index.html')
+        open(index, 'wb').write(src.encode('utf-8'))
+        return index
+
+
\ No newline at end of file
diff --git a/src/libprs500/ebooks/lrf/web/profiles/portfolio.py b/src/libprs500/ebooks/lrf/web/profiles/portfolio.py
index a955402fdc..c57455ba80 100644
--- a/src/libprs500/ebooks/lrf/web/profiles/portfolio.py
+++ b/src/libprs500/ebooks/lrf/web/profiles/portfolio.py
@@ -4,12 +4,11 @@
 '''
 '''
 
-from libprs500.ebooks.lrf.web.profiles import DefaultProfile
+from libprs500.ebooks.lrf.web.profiles import FullContentProfile
 
-class Portfolio(DefaultProfile):
+class Portfolio(FullContentProfile):
 
-    title = 'Portfolio'
-    max_recursions = 0
+    title = 'Portfolio'
     max_articles_per_feed = 50
     timefmt = ' [%a, %b %d, %Y]'
     html_description = True
@@ -17,7 +16,6 @@
     html2lrf_options = ['--ignore-tables']
     ##delay = 1
 
-    ## Don't grab articles more than 7 days old
     oldest_article = 30
 
     ## Comment out the feeds you don't want retrieved.
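
Usage notes:

With this patch, a full-content feed yields article dictionaries of the
shape documented in parse_feeds() above. For illustration, one possible
entry; all values below are invented:

    feeds = {
        'Top Stories': [              # feed title -> list of article dicts
            {
             'title'      : 'Example headline',
             'url'        : '',       # may be empty when require_url=False
             'timestamp'  : 1200981245.0,
             'date'       : 'Tue, 22 Jan 2008 06:34:05 +0000',
             'description': 'A short summary of the article',
             'content'    : '<p>The full article body, taken from the '
                            'content:encoded or description element.</p>',
            },
        ],
    }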
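The new description handling in process_html_description() can be
exercised on its own. This sketch re-creates the same logic as a
standalone function for current Python (a plain string argument instead
of a BeautifulSoup tag, chr instead of unichr, name2codepoint from
html.entities); process_description is a stand-in name, not the patched
API:

    import re
    from html.entities import name2codepoint

    def process_description(src, strip_links=True):
        # CDATA-wrapped descriptions already contain literal HTML, so only
        # the wrapper is removed; otherwise a few common entities are
        # replaced with their unicode characters, as in the patch.
        match = re.match(r'<!\[CDATA\[(.*)\]\]>', src.lstrip())
        if match:
            src = match.group(1)
        else:
            for e in ('amp', 'lt', 'gt', 'ldquo', 'rdquo', 'lsquo', 'rsquo'):
                src = src.replace('&'+e+';', chr(name2codepoint[e]))
        if strip_links:
            # Replace each <a ...>text</a> pair with just its text.
            src = re.sub(r'<a.*?>(.*?)</a>', r'\1', src,
                         flags=re.IGNORECASE|re.DOTALL)
        return src

    print(process_description('<![CDATA[Read <a href="http://example.com">more</a>]]>'))
    # -> Read more
    print(process_description('Fish &amp; chips'))
    # -> Fish & chips

Note that parse_feeds() calls this with strip_links=False when building
the 'content' field, so links inside full articles are preserved.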
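Converting an existing profile mirrors the Portfolio change above: switch
the base class from DefaultProfile to FullContentProfile and drop the
max_recursions override. A minimal hypothetical sketch; the class name,
feed title, and URL are invented, and it assumes the profile supplies its
feeds via get_feeds() in the same way other libprs500 profiles do:

    from libprs500.ebooks.lrf.web.profiles import FullContentProfile

    class ExampleFullFeed(FullContentProfile):

        title = 'Example Full Feed'
        max_articles_per_feed = 50
        html_description = True
        oldest_article = 30

        def get_feeds(self):
            # One (section title, feed url) tuple per feed.
            return [('Example', 'http://feeds.example.com/full.rss')]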