Various bug fixes/minor improvements to feeds2disk

This commit is contained in:
Kovid Goyal 2008-03-19 21:23:44 +00:00
parent c74cfc55cd
commit daa495e78a
6 changed files with 59 additions and 13 deletions

View File

@ -190,7 +190,7 @@ def feeds_from_index(index, oldest_article=7, max_articles_per_feed=100):
@rtype: list
'''
feeds = []
for title, articles in index.items():
for title, articles in index:
pfeed = Feed()
pfeed.populate_from_preparsed_feed(title, articles, oldest_article=oldest_article,
max_articles_per_feed=max_articles_per_feed)

View File

@ -13,24 +13,25 @@
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
from libprs500.ebooks.lrf.web.profiles import FullContentProfile
from libprs500.ptempfile import PersistentTemporaryFile
'''
The backend to parse feeds and create HTML that can then be converted
to an ebook.
'''
import logging, os, cStringIO, time, traceback, re
import urlparse
import logging, os, cStringIO, time, traceback, re, urlparse
from collections import defaultdict
from libprs500 import browser, __appname__, iswindows
from libprs500.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Tag
from libprs500.ebooks.metadata.opf import OPFCreator
from libprs500.ebooks.lrf import entity_to_unicode
from libprs500.ebooks.metadata.toc import TOC
from libprs500.ebooks.metadata import MetaInformation
from libprs500.web.feeds import feed_from_xml, templates, feeds_from_index
from libprs500.web.fetch.simple import option_parser as web2disk_option_parser
from libprs500.web.fetch.simple import RecursiveFetcher
from libprs500.threadpool import WorkRequest, ThreadPool, NoResultsPending
from libprs500.ebooks.lrf.web.profiles import FullContentProfile
from libprs500.ptempfile import PersistentTemporaryFile
class BasicNewsRecipe(object):
@ -252,6 +253,36 @@ class BasicNewsRecipe(object):
'''
pass
def index_to_soup(self, url_or_raw):
    '''
    Convenience method: fetch (or accept pre-downloaded) index page and
    parse it into a BeautifulSoup.
    @param url_or_raw: Either a URL or the downloaded index page as a string
    @return: BeautifulSoup of the (entity-demangled) index page
    '''
    # Anything that looks like scheme:// is treated as a URL to fetch.
    looks_like_url = re.match(r'\w+://', url_or_raw) is not None
    raw = self.browser.open(url_or_raw).read() if looks_like_url else url_or_raw
    # Decode to unicode first if a source encoding was declared on the recipe.
    if self.encoding and not isinstance(raw, unicode):
        raw = raw.decode(self.encoding)
    # Replace HTML entities with their unicode equivalents before parsing.
    demangle = lambda match: entity_to_unicode(match, encoding=self.encoding)
    raw = re.sub(r'&(\S+?);', demangle, raw)
    return BeautifulSoup(raw)
def sort_index_by(self, index, weights):
    '''
    Convenience method to sort the titles in index according to weights.
    The sort is stable: titles with equal weight keep their relative order.
    @param index: A list of titles. Sorted in place.
    @param weights: A dictionary that maps titles to weights. If any titles
    in index are not in weights, they are assumed to have a weight of 0.
    @return: Sorted index (the same list object, for call-chaining convenience)
    '''
    # Missing titles default to weight 0.
    weights = defaultdict(lambda: 0, weights)
    # key= is clearer and faster than the deprecated cmp= form
    # (cmp= was removed in Python 3; key= exists since Python 2.4).
    index.sort(key=lambda title: weights[title])
    return index
def parse_index(self):
'''
This method should be implemented in recipes that parse a website
@ -259,9 +290,9 @@ class BasicNewsRecipe(object):
news sources that have a "Print Edition" webpage that lists all the
articles in the current print edition. If this function is implemented,
it will be used in preference to L{parse_feeds}.
@rtype: dictionary
@return: A dictionary whose keys are feed titles and whose values are each
a list of dictionaries. Each list contains dictionaries of the form::
@rtype: list
@return: A list of two element tuples of the form ('feed title', list of articles).
Each list of articles contains dictionaries of the form::
{
'title' : article title,
'url' : URL of print version,
@ -658,7 +689,7 @@ class BasicNewsRecipe(object):
self.logger.debug(traceback)
self.logger.debug('\n')
self.report_progress(float(self.jobs_done)/len(self.jobs), _('Article download failed: %s')%request.article.title)
self.failed_downloads.append((request.feed.title, request.article, debug))
self.failed_downloads.append((request.feed, request.article, debug))
def parse_feeds(self):
'''
@ -731,6 +762,9 @@ class Profile2Recipe(BasicNewsRecipe):
self.use_embedded_content = isinstance(self.old_profile, FullContentProfile)
def parse_index(self):
    '''
    Adapt the old profile's feed dictionary to the new parse_index contract.
    @return: A list of two element tuples of the form
    ('feed title', list of articles), built from the old profile's
    parse_feeds() dictionary.
    '''
    feeds = []
    for key, val in self.old_profile.parse_feeds().items():
        feeds.append((key, val))
    # Return the converted list; returning the raw dict from
    # self.old_profile.parse_feeds() would discard the loop above and
    # violate the list-of-tuples contract of parse_index.
    return feeds
class CustomIndexRecipe(BasicNewsRecipe):

View File

@ -17,7 +17,7 @@
'''
Builtin recipes.
'''
recipes = ['newsweek', 'atlantic', 'economist', 'dilbert', 'portfolio']
recipes = ['newsweek', 'atlantic', 'economist', 'dilbert', 'portfolio', 'nytimes']
import re
from libprs500.web.feeds.news import BasicNewsRecipe, CustomIndexRecipe

View File

@ -63,4 +63,4 @@ class TheAtlantic(BasicNewsRecipe):
})
return {'Current Issue' : articles }
return [('Current Issue', articles)]

View File

@ -20,7 +20,7 @@ economist.com
from libprs500.web.feeds.news import BasicNewsRecipe
from libprs500.ebooks.BeautifulSoup import BeautifulSoup
import mechanize
import mechanize, string
from urllib2 import quote
class Economist(BasicNewsRecipe):
@ -47,6 +47,7 @@ class Economist(BasicNewsRecipe):
convertEntities=BeautifulSoup.HTML_ENTITIES)
index_started = False
feeds = {}
ans = []
key = None
for tag in soup.findAll(['h1', 'h2']):
text = ''.join(tag.findAll(text=True))
@ -57,7 +58,9 @@ class Economist(BasicNewsRecipe):
index_started = True
if not index_started:
continue
text = string.capwords(text)
feeds[text] = []
ans.append(text)
key = text
continue
if key is None:
@ -68,4 +71,6 @@ class Economist(BasicNewsRecipe):
url='http://www.economist.com'+a['href'].replace('displaystory', 'PrinterFriendly'),
description='', content='', date='')
feeds[key].append(article)
return feeds
ans = [(key, feeds[key]) for key in ans if feeds.has_key(key)]
return ans

View File

@ -35,6 +35,7 @@ class Template(MarkupTemplate):
a.feed {
font-weight: bold; font-size: large;
}
'''
def generate(self, *args, **kwargs):
@ -64,6 +65,9 @@ class NavBarTemplate(Template):
<py:if test="art != num - 1 and not bottom">
| <a href="${prefix}../article_${str(art+1)}/index.html">Next</a>
</py:if>
<py:if test="art == num - 1 and not bottom">
| <a href="${prefix}../../feed_${str(feed+1)}/index.html">Next</a>
</py:if>
| <a href="${prefix}../index.html#article_${str(art)}">Up one level</a>
<py:if test="two_levels">
| <a href="${prefix}../../index.html#feed_${str(feed)}">Up two levels</a>
@ -168,6 +172,9 @@ class FeedTemplate(Template):
</li>
</py:for>
</ul>
<div class="navbar" style="text-align:center; font-family:monospace; font-size:8pt">
| <a href="../index.html">Up one level</a> |
</div>
</body>
</html>
''')