Added recipes for The Atlantic and Economist to feeds2disk

Kovid Goyal 2008-03-15 20:44:25 +00:00
parent 0dde91062f
commit 5a76f5c2e1
7 changed files with 266 additions and 26 deletions

View File

@@ -64,7 +64,7 @@ class Feed(object):
max_articles_per_feed=100):
entries = feed.entries
feed = feed.feed
self.title = feed.get('title', 'Unknown feed') if not title else title
self.title = feed.get('title', _('Unknown feed')) if not title else title
self.description = feed.get('description', '')
image = feed.get('image', {})
self.image_url = image.get('href', None)
@@ -83,6 +83,38 @@ class Feed(object):
break
self.parse_article(item)
def populate_from_preparsed_feed(self, title, articles, oldest_article=7,
max_articles_per_feed=100):
self.title = title if title else _('Unknown feed')
self.description = ''
self.image_url = None
self.articles = []
self.added_articles = []
self.oldest_article = oldest_article
self.id_counter = 0
for item in articles:
if len(self.articles) >= max_articles_per_feed:
break
id = item.get('id', 'internal id#'+str(self.id_counter))
if id in self.added_articles:
continue
self.added_articles.append(id)
self.id_counter += 1
published = time.gmtime(item.get('timestamp', time.time()))
title = item.get('title', _('Untitled article'))
link = item.get('url', None)
description = item.get('description', '')
content = item.get('content', '')
article = Article(id, title, link, description, published, content)
delta = datetime.utcnow() - article.utctime
if delta.days*24*3600 + delta.seconds <= 24*3600*self.oldest_article:
self.articles.append(article)
else:
self.logger.debug('Skipping article %s (%s) from feed %s as it is too old.'%(title, article.localtime.strftime('%a, %d %b, %Y %H:%M'), self.title))
def parse_article(self, item):
id = item.get('id', 'internal id#'+str(self.id_counter))
if id in self.added_articles:
@@ -91,7 +123,7 @@ class Feed(object):
self.id_counter += 1
self.added_articles.append(id)
title = item.get('title', 'Untitled article')
title = item.get('title', _('Untitled article'))
link = item.get('link', None)
description = item.get('summary', None)
@@ -134,3 +166,17 @@ def feed_from_xml(raw_xml, title=None, oldest_article=7, max_articles_per_feed=1
oldest_article=oldest_article,
max_articles_per_feed=max_articles_per_feed)
return pfeed
def feeds_from_index(index, oldest_article=7, max_articles_per_feed=100):
'''
@param index: A parsed index as returned by L{BasicNewsRecipe.parse_index}.
@return: A list of L{Feed} objects.
@rtype: list
'''
feeds = []
for title, articles in index.items():
pfeed = Feed()
pfeed.populate_from_preparsed_feed(title, articles, oldest_article=oldest_article,
max_articles_per_feed=max_articles_per_feed)
feeds.append(pfeed)
return feeds
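An illustrative sketch of how feeds_from_index is meant to be driven: the index dictionary below is invented, but it follows the {feed title: [article dict, ...]} shape documented for BasicNewsRecipe.parse_index.

from libprs500.web.feeds import feeds_from_index

# Hypothetical index: keys are feed titles, values are lists of article dicts.
index = {
    'Current Issue': [
        {'title'       : 'An example article',
         'url'         : 'http://example.com/print/1',   # invented URL
         'date'        : '15 Mar 2008',
         'description' : 'A one-line summary.',
         'content'     : ''},                             # empty: fetched from 'url' instead
    ],
}

feeds = feeds_from_index(index, oldest_article=7, max_articles_per_feed=100)
for feed in feeds:
    print feed.title, len(feed.articles)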

View File

@@ -21,11 +21,11 @@ import logging, os, cStringIO, time, traceback
import urlparse
from libprs500 import browser, __appname__
from libprs500.ebooks.BeautifulSoup import BeautifulSoup
from libprs500.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Tag
from libprs500.ebooks.metadata.opf import OPFCreator
from libprs500.ebooks.metadata.toc import TOC
from libprs500.ebooks.metadata import MetaInformation
from libprs500.web.feeds import feed_from_xml, templates
from libprs500.web.feeds import feed_from_xml, templates, feeds_from_index
from libprs500.web.fetch.simple import option_parser as web2disk_option_parser
from libprs500.web.fetch.simple import RecursiveFetcher
from libprs500.threadpool import WorkRequest, ThreadPool, NoResultsPending
@@ -74,6 +74,11 @@ class BasicNewsRecipe(object):
#: @type: string
timefmt = ' [%a, %d %b %Y]'
#: List of feeds to download
#: Can be either C{[url1, url2, ...]} or C{[('title1', url1), ('title2', url2),...]}
#: @type: List of strings or list of 2-tuples
feeds = None
#: Max number of characters in the short description.
#: @type: integer
summary_length = 500
@@ -112,7 +117,7 @@ class BasicNewsRecipe(object):
#: List of options to pass to html2lrf, to customize generation of LRF ebooks.
#: @type: list of strings
html2lrf_options = ['--page-break-before', '$']
html2lrf_options = []
#: List of tags to be removed. Specified tags are removed from downloaded HTML.
#: A tag is specified as a dictionary of the form::
@@ -134,6 +139,12 @@ class BasicNewsRecipe(object):
#: tags after the element with id C{content}.
remove_tags_after = None
#: Remove all tags that occur before the specified tag.
#: For the format for specifying a tag see L{remove_tags}.
#: For example, C{remove_tags_before = [dict(id='content')]} will remove all
#: tags before the element with id C{content}.
remove_tags_before = None
#: Keep only the specified tags and their children.
#: For the format for specifying tags see L{remove_tags}.
#: If this list is not empty, then the <body> element will be emptied and re-filled with
@@ -220,6 +231,26 @@ class BasicNewsRecipe(object):
'''
pass
def parse_index(self):
'''
This method should be implemented in recipes that parse a website
instead of feeds to generate a list of articles. Typical uses are for
news sources that have a "Print Edition" webpage that lists all the
articles in the current print edition. If this function is implemented,
it will be used in preference to L{parse_feeds}.
@rtype: dictionary
@return: A dictionary whose keys are feed titles and whose values are lists of
articles. Each article is a dictionary of the form::
{
'title' : article title,
'url' : URL of print version,
'date' : The publication date of the article as a string,
'description' : A summary of the article,
'content' : The full article (can be an empty string). This is used by FullContentProfile
}
'''
raise NotImplementedError
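A minimal sketch of a recipe that implements parse_index; the site, URL and CSS class below are hypothetical, only the return format follows the docstring above.

from libprs500.web.feeds.news import BasicNewsRecipe
from libprs500.ebooks.BeautifulSoup import BeautifulSoup

class ExamplePrintEdition(BasicNewsRecipe):
    title = 'Example Print Edition'
    INDEX = 'http://example.com/print-edition'      # hypothetical index page

    def parse_index(self):
        soup = BeautifulSoup(self.browser.open(self.INDEX).read())
        articles = []
        # hypothetical markup: each article link carries class="article"
        for a in soup.findAll('a', attrs={'class': 'article'}):
            articles.append({
                'title'       : self.tag_to_string(a),
                'url'         : a['href'],
                'date'        : '',
                'description' : '',
                'content'     : '',
            })
        return {'Current Issue': articles}

Because parse_index is implemented, build_index will use it instead of parse_feeds (see the NotImplementedError fallback added to build_index).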
def __init__(self, options, parser, progress_reporter):
'''
Initialize the recipe.
@@ -285,7 +316,7 @@ class BasicNewsRecipe(object):
self.web2disk_options = web2disk_option_parser().parse_args(web2disk_cmdline)[0]
for extra in ('keep_only_tags', 'remove_tags', 'preprocess_regexps',
'preprocess_html', 'remove_tags_after'):
'preprocess_html', 'remove_tags_after', 'remove_tags_before'):
setattr(self.web2disk_options, extra, getattr(self, extra))
self.web2disk_options.postprocess_html = [self._postprocess_html, self.postprocess_html]
@@ -293,7 +324,7 @@ class BasicNewsRecipe(object):
self.simultaneous_downloads = 1
self.navbar = templates.NavBarTemplate()
self.html2lrf_options.append('--use-spine')
self.html2lrf_options.extend(['--page-break-before', '$', '--use-spine'])
self.failed_downloads = []
self.partial_failures = []
@@ -389,7 +420,13 @@ class BasicNewsRecipe(object):
def build_index(self):
self.report_progress(0, _('Fetching feeds...'))
try:
feeds = feeds_from_index(self.parse_index(), oldest_article=self.oldest_article,
max_articles_per_feed=self.max_articles_per_feed)
self.report_progress(0, _('Got feeds from index page'))
except NotImplementedError:
feeds = self.parse_feeds()
if self.test:
feeds = feeds[:2]
self.has_single_feed = len(feeds) == 1
@@ -485,28 +522,31 @@ class BasicNewsRecipe(object):
entries = ['index.html']
toc = TOC(base_path=dir)
for i, f in enumerate(feeds):
entries.append('feed_%d/index.html'%i)
feed = toc.add_item('feed_%d/index.html'%i, None, f.title)
def feed_index(num, parent):
f = feeds[num]
for j, a in enumerate(f):
if getattr(a, 'downloaded', False):
adir = 'feed_%d/article_%d/'%(i, j)
adir = 'feed_%d/article_%d/'%(num, j)
entries.append('%sindex.html'%adir)
feed.add_item('%sindex.html'%adir, None, a.title if a.title else 'Untitled article')
parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'))
for sp in a.sub_pages:
prefix = os.path.commonprefix([opf_path, sp])
relp = sp[len(prefix):]
entries.append(relp.replace(os.sep, '/'))
if len(feeds) > 1:
for i, f in enumerate(feeds):
entries.append('feed_%d/index.html'%i)
feed = toc.add_item('feed_%d/index.html'%i, None, f.title)
feed_index(i, feed)
else:
entries.append('feed_%d/index.html'%0)
feed_index(0, toc)
opf.create_spine(entries)
opf.set_toc(toc)
for i, f in enumerate(feeds):
for j, a in enumerate(f):
if getattr(a, 'downloaded', False):
adir = 'feed_%d/article_%d/'%(i, j)
opf.render(open(opf_path, 'wb'), open(ncx_path, 'wb'))
@@ -525,7 +565,7 @@ class BasicNewsRecipe(object):
article = request.article
self.logger.debug(_('\nDownloaded article %s from %s\n%s')%(article.title, article.url, request.stream.getvalue().decode('utf-8', 'ignore')))
article.url = result[0]
article.url = 'article_%d/index.html'%a
article.downloaded = True
article.sub_pages = result[1][1:]
self.jobs_done += 1
@@ -563,3 +603,29 @@ class BasicNewsRecipe(object):
max_articles_per_feed=self.max_articles_per_feed))
return parsed_feeds
@classmethod
def tag_to_string(cls, tag, use_alt=True):
'''
Convenience method that recursively extracts the text from a BeautifulSoup Tag,
including any CDATA sections and the alt attributes of tags.
@param use_alt: If True, try to use the alt attribute for tags that have no textual content
@type use_alt: boolean
@return: A unicode object (possibly empty)
@rtype: unicode string
'''
if not tag:
return ''
if isinstance(tag, basestring):
return tag
strings = []
for item in tag.contents:
if isinstance(item, (NavigableString, CData)):
strings.append(item.string)
elif isinstance(item, Tag):
res = cls.tag_to_string(item)
if res:
strings.append(res)
elif use_alt and item.has_key('alt'):
strings.append(item['alt'])
return u''.join(strings)
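A small usage sketch for tag_to_string; the markup is invented for illustration.

from libprs500.web.feeds.news import BasicNewsRecipe
from libprs500.ebooks.BeautifulSoup import BeautifulSoup

soup = BeautifulSoup('<p>Hello <b>world</b> <img alt="(photo)"/></p>')
# Collects plain text, the text of nested tags, and the alt attribute of the
# image, which has no textual content of its own.
print BasicNewsRecipe.tag_to_string(soup.find('p'))
# prints: Hello world (photo)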

View File

@@ -17,7 +17,7 @@
'''
Builtin recipes.
'''
recipes = ['newsweek']
recipes = ['newsweek', 'atlantic', 'economist']
import re
from libprs500.web.feeds.news import BasicNewsRecipe

View File

@@ -0,0 +1,60 @@
#!/usr/bin/env python
## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
'''
theatlantic.com
'''
from libprs500.web.feeds.news import BasicNewsRecipe
from libprs500.ebooks.BeautifulSoup import BeautifulSoup
class TheAtlantic(BasicNewsRecipe):
title = 'The Atlantic'
INDEX = 'http://www.theatlantic.com/doc/current'
remove_tags_before = dict(name='div', id='storytop')
remove_tags = [dict(name='div', id='seealso')]
extra_css = '#bodytext {line-height: 1}'
def parse_index(self):
articles = []
src = self.browser.open(self.INDEX).read()
soup = BeautifulSoup(src, convertEntities=BeautifulSoup.HTML_ENTITIES)
issue = soup.find('span', attrs={'class':'issue'})
if issue:
self.timefmt = ' [%s]'%self.tag_to_string(issue).rpartition('|')[-1].strip().replace('/', '-')
for item in soup.findAll('div', attrs={'class':'item'}):
a = item.find('a')
if a and a.has_key('href'):
url = a['href']
url = 'http://www.theatlantic.com/'+url.replace('/doc', 'doc/print')
title = self.tag_to_string(a)
byline = item.find(attrs={'class':'byline'})
date = self.tag_to_string(byline) if byline else ''
description = ''
articles.append({
'title':title,
'date':date,
'url':url,
'description':description
})
return {'Current Issue' : articles }

View File

@@ -0,0 +1,57 @@
#!/usr/bin/env python
## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
'''
economist.com
'''
from libprs500.web.feeds.news import BasicNewsRecipe
from libprs500.ebooks.BeautifulSoup import BeautifulSoup
class Economist(BasicNewsRecipe):
title = 'The Economist'
oldest_article = 7.0
INDEX = 'http://www.economist.com/printedition'
remove_tags = [dict(name=['script', 'noscript', 'title'])]
remove_tags_before = dict(name=lambda tag: tag.name=='title' and tag.parent.name=='body')
def parse_index(self):
soup = BeautifulSoup(self.browser.open(self.INDEX).read(),
convertEntities=BeautifulSoup.HTML_ENTITIES)
index_started = False
feeds = {}
key = None
for tag in soup.findAll(['h1', 'h2']):
text = ''.join(tag.findAll(text=True))
if tag.name == 'h1':
if 'Classified ads' in text:
break
if 'The world this week' in text:
index_started = True
if not index_started:
continue
feeds[text] = []
key = text
continue
if key is None:
continue
a = tag.find('a', href=True)
if a is not None:
article = dict(title=text,
url='http://www.economist.com'+a['href'].replace('displaystory', 'PrinterFriendly'),
description='', content='', date='')
feeds[key].append(article)
return feeds

View File

@@ -102,7 +102,7 @@ class IndexTemplate(Template):
<?python
from datetime import datetime
?>
<p style="text-align:right">${datetime.now().strftime(datefmt)}</p>
<p style="text-align:right">${datetime.now().strftime(str(datefmt))}</p>
<ul>
<py:for each="i, feed in enumerate(feeds)">
<li py:if="feed" id="feed_${str(i)}">
@@ -144,6 +144,9 @@ class FeedTemplate(Template):
<img alt="${feed.image_alt}" src="${feed.image_url}" />
</div>
</py:if>
<div py:if="feed.description">
${feed.description}
</div>
<ul>
<py:for each="i, article in enumerate(feed.articles)">
<li id="${'article_%d'%i}" py:if="getattr(article, 'downloaded', False)">

View File

@@ -85,6 +85,7 @@ class RecursiveFetcher(object):
self.preprocess_regexps = getattr(options, 'preprocess_regexps', [])
self.remove_tags = getattr(options, 'remove_tags', [])
self.remove_tags_after = getattr(options, 'remove_tags_after', None)
self.remove_tags_before = getattr(options, 'remove_tags_before', None)
self.keep_only_tags = getattr(options, 'keep_only_tags', [])
self.preprocess_html_ext = getattr(options, 'preprocess_html', lambda soup: soup)
self.postprocess_html_ext= getattr(options, 'postprocess_html', [])
@@ -105,16 +106,23 @@ class RecursiveFetcher(object):
body.insert(len(body.contents), tag)
soup.find('body').replaceWith(body)
if self.remove_tags_after is not None:
tag = soup.find(**self.remove_tags_after)
def remove_beyond(tag, next):
while tag is not None and tag.name != 'body':
after = tag.nextSibling
after = getattr(tag, next)
while after is not None:
ns = after.nextSibling
ns = getattr(tag, next)
after.extract()
after = ns
tag = tag.parent
if self.remove_tags_after is not None:
tag = soup.find(**self.remove_tags_after)
remove_beyond(tag, 'nextSibling')
if self.remove_tags_before is not None:
tag = soup.find(**self.remove_tags_before)
remove_beyond(tag, 'previousSibling')
for kwds in self.remove_tags:
for tag in soup.findAll(**kwds):
tag.extract()
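A rough, self-contained sketch of the effect of the new remove_tags_before option; the HTML is invented and the loop is a simplified equivalent of remove_beyond above, specialised to the previousSibling direction.

from libprs500.ebooks.BeautifulSoup import BeautifulSoup

html = ('<body><div id="banner">ads</div>'
        '<div id="storytop">headline</div>'
        '<div id="bodytext">the article</div></body>')
soup = BeautifulSoup(html)

# remove_tags_before = dict(id='storytop'): strip everything that precedes
# the matched tag, walking up through its ancestors as remove_beyond does.
tag = soup.find(id='storytop')
while tag is not None and tag.name != 'body':
    sib = tag.previousSibling
    while sib is not None:
        prev = sib.previousSibling
        sib.extract()
        sib = prev
    tag = tag.parent

print soup.body.renderContents()
# prints: <div id="storytop">headline</div><div id="bodytext">the article</div>

Walking up through tag.parent matters: it removes the earlier siblings of every ancestor of the matched tag, not just the siblings at the matched tag's own level.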