Support for content embedded feeds in feeds2disk

2025-07-09 03:04:10 -04:00 · 2008-03-18 06:31:40 +00:00 · 2008-03-18 06:31:40 +00:00 · 7ddfcd5711
commit 7ddfcd5711
parent ef40f3abab
5 changed files with 145 additions and 9 deletions
--- a/src/libprs500/web/feeds/__init__.py
+++ b/src/libprs500/web/feeds/__init__.py
@ -163,6 +163,15 @@ class Feed(object):
            if getattr(article, 'downloaded', False):
                return True
        return False
+    
+    def has_embedded_content(self):
+        '''
+        Guess whether the articles of this feed carry their full text.
+
+        For every article the longer of its C{content} and C{summary} is
+        measured. The feed is deemed to have embedded content when the sum
+        of those lengths exceeds 2000 times the number of articles.
+
+        @return: C{True} if the feed looks like a full content feed
+        '''
+        total = 0
+        for article in self:
+            content_len = len(article.content) if article.content else 0
+            summary_len = len(article.summary) if article.summary else 0
+            total += max(content_len, summary_len)
+        return total > 2000 * len(self)


 def feed_from_xml(raw_xml, title=None, oldest_article=7, 
--- a/src/libprs500/web/feeds/news.py
+++ b/src/libprs500/web/feeds/news.py
@ -13,6 +13,8 @@
 ##    You should have received a copy of the GNU General Public License along
 ##    with this program; if not, write to the Free Software Foundation, Inc.,
 ##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+from libprs500.ebooks.lrf.web.profiles import FullContentProfile
+from libprs500.ptempfile import PersistentTemporaryFile
 '''
 The backend to parse feeds and create HTML that can then be converted
 to an ebook.
@ -100,7 +102,18 @@ class BasicNewsRecipe(object):
    #: using cp1252. If None, try to detect the encoding. 
    encoding = None
    
+    #: Normally we try to guess if a feed has full articles embedded in it
+    #: based on the length of the embedded content. If C{None}, then the
+    #: default guessing is used. If C{True} then we always assume the feed has
+    #: embedded content and if C{False} we always assume the feed does not have
+    #: embedded content.
+    use_embedded_content = None
+    
    #: Specify any extra CSS that should be added to downloaded HTML files.
+    #: It will be inserted into C{<style></style>} just before the closing
+    #: C{</head>} tag thereby overriding all CSS except that which is
+    #: declared using the style attribute on individual HTML tags.
+    #: type: string
    extra_css = None
    
    #: List of regular expressions that determines which links to follow
@ -388,6 +401,24 @@ class BasicNewsRecipe(object):
        templ = templates.IndexTemplate()
        return templ.generate(self.title, self.timefmt, feeds).render(doctype='xhtml')
    
+    @classmethod
+    def description_limiter(cls, src):
+        '''
+        Shorten an article summary for display in a feed index.
+
+        The text is cut at C{cls.summary_length}. If a C{;} or a C{>} occurs
+        at most 50 characters after that position, the cut is moved to just
+        after it (the later of the two if both qualify), presumably to avoid
+        splitting an HTML entity or tag in two. An ellipsis is appended only
+        if something was actually cut off.
+
+        @param src: The summary text. May be empty or C{None}.
+        @return: The possibly shortened summary; an empty string for
+        empty input
+        '''
+        if not src:
+            return u''
+        pos = cls.summary_length
+        fuzz = 50
+        si = src.find(';', pos)
+        if si > 0 and si-pos > fuzz:
+            si = -1
+        gi = src.find('>', pos)
+        if gi > 0 and gi-pos > fuzz:
+            gi = -1
+        npos = max(si, gi)
+        if npos < 0:
+            npos = pos
+        ans = src[:npos+1]
+        if len(ans) < len(src):
+            # Only mark the summary as truncated when text was really dropped
+            ans += u'\u2026'
+        return ans
+
+        
+    
    def feed2index(self, feed):
        if feed.image_url is not None: # Download feed image
            imgdir = os.path.join(self.output_dir, 'images')
@ -408,7 +439,7 @@ class BasicNewsRecipe(object):
                        self.image_map[feed.image_url] = img
                
        templ = templates.FeedTemplate()
-        return templ.generate(feed).render(doctype='xhtml')
+        return templ.generate(feed, self.description_limiter).render(doctype='xhtml')
        
    
    def create_logger(self, feed_number, article_number):
@ -422,7 +453,7 @@ class BasicNewsRecipe(object):
        logger.addHandler(handler)
        return logger, out
    
-    def fetch_article(self, url, dir, logger, f, a, num_of_feeds):
+    def _fetch_article(self, url, dir, logger, f, a, num_of_feeds):
        fetcher = RecursiveFetcher(self.web2disk_options, logger, self.image_map, self.css_map, (url, f, a, num_of_feeds))
        fetcher.base_dir = dir
        fetcher.current_dir = dir
@ -432,6 +463,20 @@ class BasicNewsRecipe(object):
            raise Exception(_('Could not fetch article. Run with --debug to see the reason'))
        return res, path, failures
    
+    def fetch_article(self, url, dir, logger, f, a, num_of_feeds):
+        '''
+        Fetch the article at C{url}, using C{dir} as the download directory.
+
+        Thin public wrapper: all arguments are passed unchanged to
+        L{_fetch_article} and its result is returned as is.
+        '''
+        return self._fetch_article(url, dir, logger, f, a, num_of_feeds)
+        
+    
+    def fetch_embedded_article(self, article, dir, logger, f, a, num_of_feeds):
+        '''
+        Fetch an article whose text is embedded in the feed itself.
+
+        The article is rendered with C{templates.EmbeddedContent} into a
+        temporary HTML file, and that file is then fetched through
+        L{_fetch_article} via a C{file:} URL.
+
+        @param article: The feed article to render
+        @return: Whatever L{_fetch_article} returns
+        '''
+        # Render first, so no temporary file is created if rendering fails
+        templ = templates.EmbeddedContent()
+        raw = templ.generate(article).render('html')
+        pt = PersistentTemporaryFile('_feeds2disk.html')
+        try:
+            out = open(pt.name, 'wb')
+            try:
+                out.write(raw)
+            finally:
+                # Close explicitly so that the data is flushed to disk
+                # before the fetcher reads the file
+                out.close()
+        finally:
+            pt.close()
+        url = ('file:'+pt.name) if iswindows else ('file://'+pt.name)
+        return self._fetch_article(url, dir, logger, f, a, num_of_feeds)
+        
+    
    def build_index(self):
        self.report_progress(0, _('Fetching feeds...'))
        try:
@ -447,6 +492,9 @@ class BasicNewsRecipe(object):
            feeds = feeds[:2]
        self.has_single_feed = len(feeds) == 1
        
+        if self.use_embedded_content is None:
+            self.use_embedded_content = feeds[0].has_embedded_content()
+        
        index = os.path.join(self.output_dir, 'index.html') 
        
        html = self.feeds2index(feeds)
@ -459,6 +507,8 @@ class BasicNewsRecipe(object):
                os.makedirs(feed_dir)
                
            for a, article in enumerate(feed):
+                if a >= self.max_articles_per_feed:
+                    break
                art_dir = os.path.join(feed_dir, 'article_%d'%a)
                if not os.path.isdir(art_dir):
                    os.makedirs(art_dir)
@ -467,9 +517,12 @@ class BasicNewsRecipe(object):
                    url = self.print_version(article.url)
                except NotImplementedError:
                    url = article.url
-                req = WorkRequest(self.fetch_article, (url, art_dir, logger, f, a, len(feed)), 
-                                  {}, (f, a), self.article_downloaded, 
-                                  self.error_in_article_download)
+                    
+                func, arg = (self.fetch_embedded_article, article) if self.use_embedded_content else \
+                            (self.fetch_article, url)
+                req = WorkRequest(func, (arg, art_dir, logger, f, a, len(feed)), 
+                                      {}, (f, a), self.article_downloaded, 
+                                      self.error_in_article_download)
                req.stream = stream
                req.feed = feed
                req.article = article
@ -674,6 +727,7 @@ class Profile2Recipe(BasicNewsRecipe):
        self.simultaneous_downloads = 1
        BasicNewsRecipe.__init__(self, options, parser, progress_reporter)
        self.browser = self.old_profile.browser
+        self.use_embedded_content = isinstance(self.old_profile, FullContentProfile) 
        
    def parse_index(self):
        return self.old_profile.parse_feeds()
--- a/src/libprs500/web/feeds/recipes/__init__.py
+++ b/src/libprs500/web/feeds/recipes/__init__.py
@ -17,7 +17,7 @@
 '''
 Builtin recipes.
 '''
-recipes = ['newsweek', 'atlantic', 'economist', 'dilbert']
+recipes = ['newsweek', 'atlantic', 'economist', 'dilbert', 'portfolio']

 import re
 from libprs500.web.feeds.news import BasicNewsRecipe, CustomIndexRecipe
--- a/src/libprs500/web/feeds/recipes/portfolio.py
+++ b/src/libprs500/web/feeds/recipes/portfolio.py
@ -0,0 +1,45 @@
+#!/usr/bin/env  python
+
+##    Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
+##    This program is free software; you can redistribute it and/or modify
+##    it under the terms of the GNU General Public License as published by
+##    the Free Software Foundation; either version 2 of the License, or
+##    (at your option) any later version.
+##
+##    This program is distributed in the hope that it will be useful,
+##    but WITHOUT ANY WARRANTY; without even the implied warranty of
+##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+##    GNU General Public License for more details.
+##
+##    You should have received a copy of the GNU General Public License along
+##    with this program; if not, write to the Free Software Foundation, Inc.,
+##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+'''
+portfolio.com
+'''
+
+from libprs500.web.feeds.news import BasicNewsRecipe
+
+class Portfolio(BasicNewsRecipe):
+    '''
+    Recipe for portfolio.com, built from the feeds published at
+    feeds.portfolio.com.
+    '''
+    
+    title                = 'Portfolio'
+    # Always use the content embedded in the feeds instead of guessing
+    use_embedded_content = True
+    timefmt              = ' [%a, %b %d, %Y]'
+    # NOTE(review): presumably passed on to html2lrf during conversion -- confirm
+    html2lrf_options     = ['--ignore-tables']
+    
+    # (feed title, feed URL) pairs
+    feeds = [ 
+                ('Business Travel', 'http://feeds.portfolio.com/portfolio/businesstravel'), 
+                ('Careers', 'http://feeds.portfolio.com/portfolio/careers'), 
+                ('Culture and Lifestyle', 'http://feeds.portfolio.com/portfolio/cultureandlifestyle'), 
+                ('Executives','http://feeds.portfolio.com/portfolio/executives'), 
+                ('News and Markets', 'http://feeds.portfolio.com/portfolio/news'), 
+                ('Business Spin', 'http://feeds.portfolio.com/portfolio/businessspin'), 
+                ('Capital', 'http://feeds.portfolio.com/portfolio/capital'), 
+                ('Daily Brief', 'http://feeds.portfolio.com/portfolio/dailybrief'), 
+                ('Market Movers', 'http://feeds.portfolio.com/portfolio/marketmovers'), 
+                ('Mixed Media', 'http://feeds.portfolio.com/portfolio/mixedmedia'), 
+                ('Odd Numbers', 'http://feeds.portfolio.com/portfolio/oddnumbers'), 
+                ('Playbook', 'http://feeds.portfolio.com/portfolio/playbook'), 
+                ('Tech Observer', 'http://feeds.portfolio.com/portfolio/thetechobserver'), 
+                ('World According to ...', 'http://feeds.portfolio.com/portfolio/theworldaccordingto'), 
+            ]
--- a/src/libprs500/web/feeds/templates.py
+++ b/src/libprs500/web/feeds/templates.py
@ -163,7 +163,7 @@ class FeedTemplate(Template):
                <a class="article" href="${article.url}">${article.title}</a>
                <span class="article_date">${article.localtime.strftime(" [%a, %d %b %H:%M]")}</span>
                <p class="article_decription" py:if="article.summary">
-                    ${Markup(article.summary)}
+                    ${Markup(cutoff(article.summary))}
                </p>
            </li>
            </py:for>
@ -172,5 +172,33 @@ class FeedTemplate(Template):
 </html>
 ''')
        
-    def generate(self, feed):
-        return Template.generate(self, feed=feed)
+    def generate(self, feed, cutoff):
+        '''
+        Generate the index page of a single feed.
+
+        @param feed: The feed object, exposed to the template as C{feed}
+        @param cutoff: Callable exposed to the template as C{cutoff}; the
+        template applies it to C{article.summary} before display
+        '''
+        return Template.generate(self, feed=feed, cutoff=cutoff)
+
+class EmbeddedContent(Template):
+    '''
+    Template that wraps the content embedded in a feed article in a minimal
+    XHTML page. C{article.content} is used as the body when it is longer
+    than C{article.summary}, otherwise C{article.summary} is used.
+    '''
+    
+    def __init__(self):
+        Template.__init__(self, '''\
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" 
+    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" 
+      xml:lang="en"
+      xmlns:xi="http://www.w3.org/2001/XInclude"
+      xmlns:py="http://genshi.edgewall.org/" 
+       
+>
+    <head>
+        <title>${article.title}</title>
+    </head>
+    
+    <body>
+        <h2>${article.title}</h2>
+        <div>
+            ${Markup(article.content if len(article.content if article.content else '') > len(article.summary if article.summary else '') else article.summary)}
+        </div>
+    </body>
+</html> 
+''')
+    
+    def generate(self, article):
+        '''
+        Generate the page for a single article.
+
+        @param article: The article object, exposed to the template as
+        C{article}; its C{title}, C{content} and C{summary} are read
+        '''
+        return Template.generate(self, article=article)