From daa495e78a3dda51c9700a56b6d6d9abfa715e3c Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Wed, 19 Mar 2008 21:23:44 +0000 Subject: [PATCH] Various bug fixes/minor improvements to feeds2disk --- src/libprs500/web/feeds/__init__.py | 2 +- src/libprs500/web/feeds/news.py | 50 ++++++++++++++++---- src/libprs500/web/feeds/recipes/__init__.py | 2 +- src/libprs500/web/feeds/recipes/atlantic.py | 2 +- src/libprs500/web/feeds/recipes/economist.py | 9 +++- src/libprs500/web/feeds/templates.py | 7 +++ 6 files changed, 59 insertions(+), 13 deletions(-) diff --git a/src/libprs500/web/feeds/__init__.py b/src/libprs500/web/feeds/__init__.py index bda3ed586e..eb5037ea4c 100644 --- a/src/libprs500/web/feeds/__init__.py +++ b/src/libprs500/web/feeds/__init__.py @@ -190,7 +190,7 @@ def feeds_from_index(index, oldest_article=7, max_articles_per_feed=100): @rtype: list ''' feeds = [] - for title, articles in index.items(): + for title, articles in index: pfeed = Feed() pfeed.populate_from_preparsed_feed(title, articles, oldest_article=oldest_article, max_articles_per_feed=max_articles_per_feed) diff --git a/src/libprs500/web/feeds/news.py b/src/libprs500/web/feeds/news.py index 1372d45083..cdc7ce3307 100644 --- a/src/libprs500/web/feeds/news.py +++ b/src/libprs500/web/feeds/news.py @@ -13,24 +13,25 @@ ## You should have received a copy of the GNU General Public License along ## with this program; if not, write to the Free Software Foundation, Inc., ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. -from libprs500.ebooks.lrf.web.profiles import FullContentProfile -from libprs500.ptempfile import PersistentTemporaryFile ''' The backend to parse feeds and create HTML that can then be converted to an ebook. 
''' -import logging, os, cStringIO, time, traceback, re -import urlparse +import logging, os, cStringIO, time, traceback, re, urlparse +from collections import defaultdict from libprs500 import browser, __appname__, iswindows from libprs500.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Tag from libprs500.ebooks.metadata.opf import OPFCreator +from libprs500.ebooks.lrf import entity_to_unicode from libprs500.ebooks.metadata.toc import TOC from libprs500.ebooks.metadata import MetaInformation from libprs500.web.feeds import feed_from_xml, templates, feeds_from_index from libprs500.web.fetch.simple import option_parser as web2disk_option_parser from libprs500.web.fetch.simple import RecursiveFetcher from libprs500.threadpool import WorkRequest, ThreadPool, NoResultsPending +from libprs500.ebooks.lrf.web.profiles import FullContentProfile +from libprs500.ptempfile import PersistentTemporaryFile class BasicNewsRecipe(object): @@ -252,6 +253,36 @@ class BasicNewsRecipe(object): ''' pass + def index_to_soup(self, url_or_raw): + ''' + Convenience method that takes a URL to the index page and returns + a BeautifulSoup of it. + @param url_or_raw: Either a URL or the downloaded index page as a string + ''' + if re.match(r'\w+://', url_or_raw): + raw = self.browser.open(url_or_raw).read() + else: + raw = url_or_raw + if not isinstance(raw, unicode) and self.encoding: + raw = raw.decode(self.encoding) + raw = re.sub(r'&(\S+?);', + lambda match: entity_to_unicode(match, encoding=self.encoding), + raw) + return BeautifulSoup(raw) + + + def sort_index_by(self, index, weights): + ''' + Convenience method to sort the titles in index according to weights. + @param index: A list of titles. + @param weights: A dictionary that maps titles to weights. If any titles + in index are not in weights, they are assumed to have a weight of 0. 
+ @return: Sorted index + ''' + weights = defaultdict(lambda : 0, weights) + index.sort(cmp=lambda x, y: cmp(weights[x], weights[y])) + return index + def parse_index(self): ''' This method should be implemented in recipes that parse a website @@ -259,9 +290,9 @@ class BasicNewsRecipe(object): news sources that have a "Print Edition" webpage that lists all the articles in the current print edition. If this function is implemented, it will be used in preference to L{parse_feeds}. - @rtype: dictionary - @return: A dictionary whose keys are feed titles and whose values are each - a list of dictionaries. Each list contains dictionaries of the form:: + @rtype: list + @return: A list of two element tuples of the form ('feed title', list of articles). + Each list of articles contains dictionaries of the form:: { 'title' : article title, 'url' : URL of print version, @@ -658,7 +689,7 @@ class BasicNewsRecipe(object): self.logger.debug(traceback) self.logger.debug('\n') self.report_progress(float(self.jobs_done)/len(self.jobs), _('Article download failed: %s')%request.article.title) - self.failed_downloads.append((request.feed.title, request.article, debug)) + self.failed_downloads.append((request.feed, request.article, debug)) def parse_feeds(self): ''' @@ -731,6 +762,9 @@ class Profile2Recipe(BasicNewsRecipe): self.use_embedded_content = isinstance(self.old_profile, FullContentProfile) def parse_index(self): + feeds = [] + for key, val in self.old_profile.parse_feeds().items(): + feeds.append((key, val)) return self.old_profile.parse_feeds() class CustomIndexRecipe(BasicNewsRecipe): diff --git a/src/libprs500/web/feeds/recipes/__init__.py b/src/libprs500/web/feeds/recipes/__init__.py index ba1a702590..01b1873c60 100644 --- a/src/libprs500/web/feeds/recipes/__init__.py +++ b/src/libprs500/web/feeds/recipes/__init__.py @@ -17,7 +17,7 @@ ''' Builtin recipes. 
''' -recipes = ['newsweek', 'atlantic', 'economist', 'dilbert', 'portfolio'] +recipes = ['newsweek', 'atlantic', 'economist', 'dilbert', 'portfolio', 'nytimes'] import re from libprs500.web.feeds.news import BasicNewsRecipe, CustomIndexRecipe diff --git a/src/libprs500/web/feeds/recipes/atlantic.py b/src/libprs500/web/feeds/recipes/atlantic.py index 7cea7fa510..05ea60512f 100644 --- a/src/libprs500/web/feeds/recipes/atlantic.py +++ b/src/libprs500/web/feeds/recipes/atlantic.py @@ -63,4 +63,4 @@ class TheAtlantic(BasicNewsRecipe): }) - return {'Current Issue' : articles } \ No newline at end of file + return [('Current Issue', articles)] \ No newline at end of file diff --git a/src/libprs500/web/feeds/recipes/economist.py b/src/libprs500/web/feeds/recipes/economist.py index 8190ab50e3..d447603e74 100644 --- a/src/libprs500/web/feeds/recipes/economist.py +++ b/src/libprs500/web/feeds/recipes/economist.py @@ -20,7 +20,7 @@ economist.com from libprs500.web.feeds.news import BasicNewsRecipe from libprs500.ebooks.BeautifulSoup import BeautifulSoup -import mechanize +import mechanize, string from urllib2 import quote class Economist(BasicNewsRecipe): @@ -47,6 +47,7 @@ class Economist(BasicNewsRecipe): convertEntities=BeautifulSoup.HTML_ENTITIES) index_started = False feeds = {} + ans = [] key = None for tag in soup.findAll(['h1', 'h2']): text = ''.join(tag.findAll(text=True)) @@ -57,7 +58,9 @@ class Economist(BasicNewsRecipe): index_started = True if not index_started: continue + text = string.capwords(text) feeds[text] = [] + ans.append(text) key = text continue if key is None: @@ -68,4 +71,6 @@ class Economist(BasicNewsRecipe): url='http://www.economist.com'+a['href'].replace('displaystory', 'PrinterFriendly'), description='', content='', date='') feeds[key].append(article) - return feeds \ No newline at end of file + + ans = [(key, feeds[key]) for key in ans if feeds.has_key(key)] + return ans \ No newline at end of file diff --git 
a/src/libprs500/web/feeds/templates.py b/src/libprs500/web/feeds/templates.py index 66c734a3a5..968388ede2 100644 --- a/src/libprs500/web/feeds/templates.py +++ b/src/libprs500/web/feeds/templates.py @@ -35,6 +35,7 @@ class Template(MarkupTemplate): a.feed { font-weight: bold; font-size: large; } + ''' def generate(self, *args, **kwargs): @@ -64,6 +65,9 @@ class NavBarTemplate(Template): | Next + + | Next + | Up one level | Up two levels @@ -168,6 +172,9 @@ class FeedTemplate(Template): + ''')