From daa495e78a3dda51c9700a56b6d6d9abfa715e3c Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Wed, 19 Mar 2008 21:23:44 +0000
Subject: [PATCH] Various bug fixes/minor improvements to feeds2disk

---
 src/libprs500/web/feeds/__init__.py          |  2 +-
 src/libprs500/web/feeds/news.py              | 50 ++++++++++++++++----
 src/libprs500/web/feeds/recipes/__init__.py  |  2 +-
 src/libprs500/web/feeds/recipes/atlantic.py  |  2 +-
 src/libprs500/web/feeds/recipes/economist.py |  9 +++-
 src/libprs500/web/feeds/templates.py         |  7 +++
 6 files changed, 59 insertions(+), 13 deletions(-)

diff --git a/src/libprs500/web/feeds/__init__.py b/src/libprs500/web/feeds/__init__.py
index bda3ed586e..eb5037ea4c 100644
--- a/src/libprs500/web/feeds/__init__.py
+++ b/src/libprs500/web/feeds/__init__.py
@@ -190,7 +190,7 @@ def feeds_from_index(index, oldest_article=7, max_articles_per_feed=100):
     @rtype: list
     '''
     feeds = []
-    for title, articles in index.items():
+    for title, articles in index:
         pfeed = Feed()
         pfeed.populate_from_preparsed_feed(title, articles, oldest_article=oldest_article, 
                                        max_articles_per_feed=max_articles_per_feed)
diff --git a/src/libprs500/web/feeds/news.py b/src/libprs500/web/feeds/news.py
index 1372d45083..cdc7ce3307 100644
--- a/src/libprs500/web/feeds/news.py
+++ b/src/libprs500/web/feeds/news.py
@@ -13,24 +13,25 @@
 ##    You should have received a copy of the GNU General Public License along
 ##    with this program; if not, write to the Free Software Foundation, Inc.,
 ##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
-from libprs500.ebooks.lrf.web.profiles import FullContentProfile
-from libprs500.ptempfile import PersistentTemporaryFile
 '''
 The backend to parse feeds and create HTML that can then be converted
 to an ebook.
 '''
-import logging, os, cStringIO, time, traceback, re
-import urlparse
+import logging, os, cStringIO, time, traceback, re, urlparse
+from collections import defaultdict
 
 from libprs500 import browser, __appname__, iswindows
 from libprs500.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Tag
 from libprs500.ebooks.metadata.opf import OPFCreator
+from libprs500.ebooks.lrf import entity_to_unicode
 from libprs500.ebooks.metadata.toc import TOC
 from libprs500.ebooks.metadata import MetaInformation
 from libprs500.web.feeds import feed_from_xml, templates, feeds_from_index
 from libprs500.web.fetch.simple import option_parser as web2disk_option_parser
 from libprs500.web.fetch.simple import RecursiveFetcher
 from libprs500.threadpool import WorkRequest, ThreadPool, NoResultsPending
+from libprs500.ebooks.lrf.web.profiles import FullContentProfile
+from libprs500.ptempfile import PersistentTemporaryFile
 
 
 class BasicNewsRecipe(object):
@@ -252,6 +253,36 @@ class BasicNewsRecipe(object):
         '''
         pass
     
+    def index_to_soup(self, url_or_raw):
+        '''
+        Convenience method that parses an index page into a BeautifulSoup.
+        The page is decoded with self.encoding (when one is set) and every
+        '&...;' match is replaced via entity_to_unicode before parsing.
+        @param url_or_raw: Either a URL or the downloaded index page as a string
+        @return: BeautifulSoup of the index page
+        '''
+        is_url = re.match(r'\w+://', url_or_raw)
+        raw = self.browser.open(url_or_raw).read() if is_url else url_or_raw
+        if self.encoding and not isinstance(raw, unicode):
+            raw = raw.decode(self.encoding)
+        # re.sub callback: hands each '&...;' match to entity_to_unicode
+        def resolve(match):
+            return entity_to_unicode(match, encoding=self.encoding)
+        return BeautifulSoup(re.sub(r'&(\S+?);', resolve, raw))
+        
+    
+    def sort_index_by(self, index, weights):
+        '''
+        Convenience method to sort the titles in index according to weights.
+        @param index: A list of titles. The list is sorted in place.
+        @param weights: A dictionary that maps titles to weights. If any titles
+        in index are not in weights, they are assumed to have a weight of 0.
+        @return: Sorted index (the same list object that was passed in)
+        '''
+        weights = defaultdict(lambda : 0, weights)
+        index.sort(key=lambda title: weights[title])
+        return index
+    
     def parse_index(self):
         '''
         This method should be implemented in recipes that parse a website
@@ -259,9 +290,9 @@ class BasicNewsRecipe(object):
         news sources that have a "Print Edition" webpage that lists all the 
         articles in the current print edition. If this function is implemented,
         it will be used in preference to L{parse_feeds}.
-        @rtype: dictionary
-        @return: A dictionary whose keys are feed titles and whose values are each
-        a list of dictionaries. Each list contains dictionaries of the form::
+        @rtype: list
+        @return: A list of two element tuples of the form ('feed title', list of articles). 
+        Each list of articles contains dictionaries of the form::
             {
             'title'       : article title,
             'url'         : URL of print version,
@@ -658,7 +689,7 @@ class BasicNewsRecipe(object):
         self.logger.debug(traceback)
         self.logger.debug('\n')
         self.report_progress(float(self.jobs_done)/len(self.jobs), _('Article download failed: %s')%request.article.title)
-        self.failed_downloads.append((request.feed.title, request.article, debug))
+        self.failed_downloads.append((request.feed, request.article, debug))
         
     def parse_feeds(self):
         '''
@@ -731,6 +762,9 @@ class Profile2Recipe(BasicNewsRecipe):
         self.use_embedded_content = isinstance(self.old_profile, FullContentProfile) 
         
     def parse_index(self):
-        return self.old_profile.parse_feeds()
+        # parse_index() must return a list of (title, articles) tuples, so
+        # the dictionary produced by the old-style profile is converted here.
+        feeds = self.old_profile.parse_feeds()
+        return [(key, val) for key, val in feeds.items()]
         
 class CustomIndexRecipe(BasicNewsRecipe):
diff --git a/src/libprs500/web/feeds/recipes/__init__.py b/src/libprs500/web/feeds/recipes/__init__.py
index ba1a702590..01b1873c60 100644
--- a/src/libprs500/web/feeds/recipes/__init__.py
+++ b/src/libprs500/web/feeds/recipes/__init__.py
@@ -17,7 +17,7 @@
 '''
 Builtin recipes.
 '''
-recipes = ['newsweek', 'atlantic', 'economist', 'dilbert', 'portfolio']
+recipes = ['newsweek', 'atlantic', 'economist', 'dilbert', 'portfolio', 'nytimes']
 
 import re
 from libprs500.web.feeds.news import BasicNewsRecipe, CustomIndexRecipe
diff --git a/src/libprs500/web/feeds/recipes/atlantic.py b/src/libprs500/web/feeds/recipes/atlantic.py
index 7cea7fa510..05ea60512f 100644
--- a/src/libprs500/web/feeds/recipes/atlantic.py
+++ b/src/libprs500/web/feeds/recipes/atlantic.py
@@ -63,4 +63,4 @@ class TheAtlantic(BasicNewsRecipe):
                                 })
                 
         
-        return {'Current Issue' : articles }
\ No newline at end of file
+        return [('Current Issue', articles)]
\ No newline at end of file
diff --git a/src/libprs500/web/feeds/recipes/economist.py b/src/libprs500/web/feeds/recipes/economist.py
index 8190ab50e3..d447603e74 100644
--- a/src/libprs500/web/feeds/recipes/economist.py
+++ b/src/libprs500/web/feeds/recipes/economist.py
@@ -20,7 +20,7 @@ economist.com
 from libprs500.web.feeds.news import BasicNewsRecipe
 from libprs500.ebooks.BeautifulSoup import BeautifulSoup
 
-import mechanize
+import mechanize, string
 from urllib2 import quote
 
 class Economist(BasicNewsRecipe):
@@ -47,6 +47,7 @@ class Economist(BasicNewsRecipe):
                              convertEntities=BeautifulSoup.HTML_ENTITIES)
         index_started = False
         feeds = {}
+        ans = []
         key = None
         for tag in soup.findAll(['h1', 'h2']):
             text = ''.join(tag.findAll(text=True))                
@@ -57,7 +58,9 @@ class Economist(BasicNewsRecipe):
                     index_started = True
                 if not index_started:
                     continue
+                text = string.capwords(text)
                 feeds[text] = []
+                ans.append(text)
                 key = text
                 continue
             if key is None:
@@ -68,4 +71,6 @@ class Economist(BasicNewsRecipe):
                     url='http://www.economist.com'+a['href'].replace('displaystory', 'PrinterFriendly'), 
                     description='', content='', date='')
                 feeds[key].append(article)
-        return feeds
\ No newline at end of file
+                
+        ans = [(key, feeds[key]) for key in ans if feeds.has_key(key)]
+        return ans
\ No newline at end of file
diff --git a/src/libprs500/web/feeds/templates.py b/src/libprs500/web/feeds/templates.py
index 66c734a3a5..968388ede2 100644
--- a/src/libprs500/web/feeds/templates.py
+++ b/src/libprs500/web/feeds/templates.py
@@ -35,6 +35,7 @@ class Template(MarkupTemplate):
             a.feed {
                 font-weight: bold; font-size: large;
             }
+            
 '''
     
     def generate(self, *args, **kwargs):
@@ -64,6 +65,9 @@ class NavBarTemplate(Template):
             <py:if test="art != num - 1 and not bottom">
             | <a href="${prefix}../article_${str(art+1)}/index.html">Next</a>
             </py:if>
+            <py:if test="art == num - 1 and not bottom">
+            | <a href="${prefix}../../feed_${str(feed+1)}/index.html">Next</a>
+            </py:if>
             | <a href="${prefix}../index.html#article_${str(art)}">Up one level</a> 
             <py:if test="two_levels">
             | <a href="${prefix}../../index.html#feed_${str(feed)}">Up two levels</a>
@@ -168,6 +172,9 @@ class FeedTemplate(Template):
             </li>
             </py:for>
         </ul>
+        <div class="navbar" style="text-align:center; font-family:monospace; font-size:8pt">
+            | <a href="../index.html">Up one level</a> |
+        </div>
     </body>
 </html>
 ''')