Mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-07-07 10:14:46 -04:00)
Various bug fixes/minor improvements to feeds2disk
This commit is contained in:
parent c74cfc55cd
commit daa495e78a
@@ -190,7 +190,7 @@ def feeds_from_index(index, oldest_article=7, max_articles_per_feed=100):
     @rtype: list
     '''
     feeds = []
-    for title, articles in index.items():
+    for title, articles in index:
         pfeed = Feed()
         pfeed.populate_from_preparsed_feed(title, articles, oldest_article=oldest_article,
                                            max_articles_per_feed=max_articles_per_feed)
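With this change feeds_from_index iterates over the index directly instead of calling .items(), so the index is expected to be an ordered sequence of (title, articles) pairs rather than a dictionary. A minimal sketch of that shape (the titles, URLs and the restriction to 'title'/'url' keys are illustrative only):

    # Hypothetical index in the new ordered form: a list of
    # ('feed title', list of article dicts) pairs instead of a dict.
    index = [
        ('Front Page', [{'title': 'Lead story', 'url': 'http://example.com/1'}]),
        ('Business',   [{'title': 'Markets',    'url': 'http://example.com/2'}]),
    ]
    for title, articles in index:  # mirrors the new loop above
        assert isinstance(articles, list)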
@@ -13,24 +13,25 @@
 ## You should have received a copy of the GNU General Public License along
 ## with this program; if not, write to the Free Software Foundation, Inc.,
 ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-from libprs500.ebooks.lrf.web.profiles import FullContentProfile
-from libprs500.ptempfile import PersistentTemporaryFile
 '''
 The backend to parse feeds and create HTML that can then be converted
 to an ebook.
 '''
-import logging, os, cStringIO, time, traceback, re
-import urlparse
+import logging, os, cStringIO, time, traceback, re, urlparse
+from collections import defaultdict
 
 from libprs500 import browser, __appname__, iswindows
 from libprs500.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Tag
 from libprs500.ebooks.metadata.opf import OPFCreator
 from libprs500.ebooks.lrf import entity_to_unicode
 from libprs500.ebooks.metadata.toc import TOC
 from libprs500.ebooks.metadata import MetaInformation
 from libprs500.web.feeds import feed_from_xml, templates, feeds_from_index
 from libprs500.web.fetch.simple import option_parser as web2disk_option_parser
 from libprs500.web.fetch.simple import RecursiveFetcher
 from libprs500.threadpool import WorkRequest, ThreadPool, NoResultsPending
+from libprs500.ebooks.lrf.web.profiles import FullContentProfile
+from libprs500.ptempfile import PersistentTemporaryFile
 
+
 class BasicNewsRecipe(object):
@@ -252,6 +253,36 @@ class BasicNewsRecipe(object):
         '''
         pass
 
+    def index_to_soup(self, url_or_raw):
+        '''
+        Convenience method that takes an URL to the index page and returns
+        a BeautifulSoup of it.
+        @param url_or_raw: Either a URL or the downloaded index page as a string
+        '''
+        if re.match(r'\w+://', url_or_raw):
+            raw = self.browser.open(url_or_raw).read()
+        else:
+            raw = url_or_raw
+        if not isinstance(raw, unicode) and self.encoding:
+            raw = raw.decode(self.encoding)
+        raw = re.sub(r'&(\S+?);',
+                     lambda match: entity_to_unicode(match, encoding=self.encoding),
+                     raw)
+        return BeautifulSoup(raw)
+
+
+    def sort_index_by(self, index, weights):
+        '''
+        Convenience method to sort the titles in index according to weights.
+        @param index: A list of titles.
+        @param weights: A dictionary that maps weights to titles. If any titles
+        in index are not in weights, they are assumed to have a weight of 0.
+        @return: Sorted index
+        '''
+        weights = defaultdict(lambda : 0, weights)
+        index.sort(cmp=lambda x, y: cmp(weights[x], weights[y]))
+        return index
+
     def parse_index(self):
         '''
         This method should be implemented in recipes that parse a website
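A short illustration of the two helpers added above. The weight-based sort can be shown with plain Python (the section titles and weights are made up); index_to_soup is normally called from inside a recipe, so it appears only as a commented hint:

    from collections import defaultdict

    # sort_index_by orders titles by their weight; titles missing from the
    # weight map default to 0, i.e. they sort ahead of weighted ones.
    titles  = ['Letters', 'The World This Week', 'Leaders']
    weights = defaultdict(lambda: 0, {'Leaders': 1, 'Letters': 2})
    titles.sort(key=lambda t: weights[t])
    # titles is now ['The World This Week', 'Leaders', 'Letters']

    # Inside a recipe (placeholder URL; needs libprs500 and network access):
    #   soup = self.index_to_soup('http://www.example.com/print_edition')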
@@ -259,9 +290,9 @@ class BasicNewsRecipe(object):
         news sources that have a "Print Edition" webpage that lists all the
         articles in the current print edition. If this function is implemented,
         it will be used in preference to L{parse_feeds}.
-        @rtype: dictionary
-        @return: A dictionary whose keys are feed titles and whose values are each
-        a list of dictionaries. Each list contains dictionaries of the form::
+        @rtype: list
+        @return: A list of two element tuples of the form ('feed title', list of articles).
+        Each list of articles contains dictionaries of the form::
             {
                 'title' : article title,
                 'url'   : URL of print version,
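Under the new contract documented above, parse_index returns an ordered list of ('feed title', articles) tuples instead of a dictionary keyed by feed title. A minimal sketch of a recipe following it; the class name, URL and the way links are scraped are hypothetical, and it assumes the libprs500 package is importable:

    from libprs500.web.feeds.news import BasicNewsRecipe

    class ExamplePrintEditionRecipe(BasicNewsRecipe):
        title = 'Example Print Edition'

        def parse_index(self):
            # Placeholder index page URL.
            soup = self.index_to_soup('http://www.example.com/print_edition')
            articles = []
            for a in soup.findAll('a', href=True):
                articles.append({'title': ''.join(a.findAll(text=True)),
                                 'url': a['href']})
            # New contract: an ordered list of ('feed title', articles)
            # tuples, not a dict keyed by feed title.
            return [('Current Issue', articles)]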
@@ -658,7 +689,7 @@ class BasicNewsRecipe(object):
         self.logger.debug(traceback)
         self.logger.debug('\n')
         self.report_progress(float(self.jobs_done)/len(self.jobs), _('Article download failed: %s')%request.article.title)
-        self.failed_downloads.append((request.feed.title, request.article, debug))
+        self.failed_downloads.append((request.feed, request.article, debug))
 
     def parse_feeds(self):
         '''
@@ -731,6 +762,9 @@ class Profile2Recipe(BasicNewsRecipe):
         self.use_embedded_content = isinstance(self.old_profile, FullContentProfile)
 
     def parse_index(self):
+        feeds = []
+        for key, val in self.old_profile.parse_feeds().items():
+            feeds.append((key, val))
         return self.old_profile.parse_feeds()
 
 class CustomIndexRecipe(BasicNewsRecipe):
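Profile2Recipe bridges the older Profile classes, whose parse_feeds returns a dictionary, to the tuple-based index used elsewhere in this commit. A plain-Python sketch of that dict-to-ordered-list conversion (the feed data is made up):

    # Old-style profiles produce {title: articles}; the new pipeline wants an
    # ordered list of (title, articles) pairs.
    old_style = {
        'World': [{'title': 'A', 'url': 'http://example.com/a'}],
        'Sport': [{'title': 'B', 'url': 'http://example.com/b'}],
    }
    new_style = [(key, val) for key, val in old_style.items()]
    # Note: on the Python 2 of this era a plain dict does not preserve
    # insertion order, which is why order-sensitive recipes build a list.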
@@ -17,7 +17,7 @@
 '''
 Builtin recipes.
 '''
-recipes = ['newsweek', 'atlantic', 'economist', 'dilbert', 'portfolio']
+recipes = ['newsweek', 'atlantic', 'economist', 'dilbert', 'portfolio', 'nytimes']
 
 import re
 from libprs500.web.feeds.news import BasicNewsRecipe, CustomIndexRecipe
@@ -63,4 +63,4 @@ class TheAtlantic(BasicNewsRecipe):
                 })
 
 
-        return {'Current Issue' : articles }
+        return [('Current Issue', articles)]
@@ -20,7 +20,7 @@ economist.com
 from libprs500.web.feeds.news import BasicNewsRecipe
 from libprs500.ebooks.BeautifulSoup import BeautifulSoup
 
-import mechanize
+import mechanize, string
 from urllib2 import quote
 
 class Economist(BasicNewsRecipe):
@@ -47,6 +47,7 @@ class Economist(BasicNewsRecipe):
                             convertEntities=BeautifulSoup.HTML_ENTITIES)
         index_started = False
         feeds = {}
+        ans = []
         key = None
         for tag in soup.findAll(['h1', 'h2']):
             text = ''.join(tag.findAll(text=True))
@@ -57,7 +58,9 @@ class Economist(BasicNewsRecipe):
                     index_started = True
                 if not index_started:
                     continue
+                text = string.capwords(text)
                 feeds[text] = []
+                ans.append(text)
                 key = text
                 continue
             if key is None:
@@ -68,4 +71,6 @@ class Economist(BasicNewsRecipe):
                               url='http://www.economist.com'+a['href'].replace('displaystory', 'PrinterFriendly'),
                               description='', content='', date='')
             feeds[key].append(article)
-        return feeds
+
+        ans = [(key, feeds[key]) for key in ans if feeds.has_key(key)]
+        return ans
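The Economist recipe now records section names in a parallel ans list as they are first seen, so the final index preserves the order of the print edition instead of whatever order the feeds dict happens to yield. The same pattern in isolation (section names and article ids are placeholders):

    # Group articles by section while remembering first-seen section order.
    feeds, ans = {}, []
    for section, article in [('Leaders', 'a1'), ('Asia', 'a2'), ('Leaders', 'a3')]:
        if section not in feeds:
            feeds[section] = []
            ans.append(section)
        feeds[section].append(article)
    ordered = [(key, feeds[key]) for key in ans if key in feeds]
    # ordered == [('Leaders', ['a1', 'a3']), ('Asia', ['a2'])]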
@@ -35,6 +35,7 @@ class Template(MarkupTemplate):
             a.feed {
                 font-weight: bold; font-size: large;
             }
+
         '''
 
     def generate(self, *args, **kwargs):
@@ -64,6 +65,9 @@ class NavBarTemplate(Template):
             <py:if test="art != num - 1 and not bottom">
                 | <a href="${prefix}../article_${str(art+1)}/index.html">Next</a>
             </py:if>
+            <py:if test="art == num - 1 and not bottom">
+                | <a href="${prefix}../../feed_${str(feed+1)}/index.html">Next</a>
+            </py:if>
             | <a href="${prefix}../index.html#article_${str(art)}">Up one level</a>
             <py:if test="two_levels">
                 | <a href="${prefix}../../index.html#feed_${str(feed)}">Up two levels</a>
@@ -168,6 +172,9 @@ class FeedTemplate(Template):
                 </li>
             </py:for>
             </ul>
+            <div class="navbar" style="text-align:center; font-family:monospace; font-size:8pt">
+                | <a href="../index.html">Up one level</a> |
+            </div>
         </body>
         </html>
         ''')