Added recipes for The Atlantic and Economist to feeds2disk

Kovid Goyal 2008-03-15 20:44:25 +00:00
parent 0dde91062f
commit 5a76f5c2e1
7 changed files with 266 additions and 26 deletions

View File

@@ -64,7 +64,7 @@ class Feed(object):
                            max_articles_per_feed=100):
         entries = feed.entries
         feed = feed.feed
-        self.title = feed.get('title', 'Unknown feed') if not title else title
+        self.title = feed.get('title', _('Unknown feed')) if not title else title
         self.description = feed.get('description', '')
         image = feed.get('image', {})
         self.image_url = image.get('href', None)
@@ -83,6 +83,38 @@ class Feed(object):
                 break
             self.parse_article(item)

+    def populate_from_preparsed_feed(self, title, articles, oldest_article=7,
+                                     max_articles_per_feed=100):
+        self.title = title if title else _('Unknown feed')
+        self.description = ''
+        self.image_url = None
+        self.articles = []
+        self.added_articles = []
+        self.oldest_article = oldest_article
+        self.id_counter = 0
+
+        for item in articles:
+            if len(self.articles) >= max_articles_per_feed:
+                break
+            id = item.get('id', 'internal id#'+str(self.id_counter))
+            if id in self.added_articles:
+                return
+            self.added_articles.append(id)
+            self.id_counter += 1
+            published = time.gmtime(item.get('timestamp', time.time()))
+            title = item.get('title', _('Untitled article'))
+            link = item.get('url', None)
+            description = item.get('description', '')
+            content = item.get('content', '')
+            article = Article(id, title, link, description, published, content)
+            delta = datetime.utcnow() - article.utctime
+            if delta.days*24*3600 + delta.seconds <= 24*3600*self.oldest_article:
+                self.articles.append(article)
+            else:
+                self.logger.debug('Skipping article %s (%s) from feed %s as it is too old.'%(title, article.localtime.strftime('%a, %d %b, %Y %H:%M'), self.title))
+
     def parse_article(self, item):
         id = item.get('id', 'internal id#'+str(self.id_counter))
         if id in self.added_articles:
@@ -91,7 +123,7 @@ class Feed(object):
         self.id_counter += 1
         self.added_articles.append(id)

-        title = item.get('title', 'Untitled article')
+        title = item.get('title', _('Untitled article'))
         link = item.get('link', None)
         description = item.get('summary', None)
@@ -134,3 +166,17 @@ def feed_from_xml(raw_xml, title=None, oldest_article=7, max_articles_per_feed=1
                               oldest_article=oldest_article,
                               max_articles_per_feed=max_articles_per_feed)
     return pfeed
+
+def feeds_from_index(index, oldest_article=7, max_articles_per_feed=100):
+    '''
+    @param index: A parsed index as returned by L{BasicNewsRecipe.parse_index}.
+    @return: A list of L{Feed} objects.
+    @rtype: list
+    '''
+    feeds = []
+    for title, articles in index.items():
+        pfeed = Feed()
+        pfeed.populate_from_preparsed_feed(title, articles, oldest_article=oldest_article,
+                                           max_articles_per_feed=max_articles_per_feed)
+        feeds.append(pfeed)
+    return feeds
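
For reference, a minimal sketch (not part of the commit) of how a pre-parsed index flows through feeds_from_index(). The index dictionary, URL, and timestamp below are invented for illustration; the optional 'timestamp' key defaults to the current time when omitted.

import time
from libprs500.web.feeds import feeds_from_index

# Invented data in the shape BasicNewsRecipe.parse_index() is documented to return.
index = {
    'Current Issue': [
        {'title': 'An example article',
         'url': 'http://example.com/print/1',
         'date': '', 'description': 'A short summary', 'content': '',
         'timestamp': time.time()},
    ],
}

feeds = feeds_from_index(index, oldest_article=7, max_articles_per_feed=100)
for feed in feeds:
    print feed.title, len(feed.articles)   # one Feed object per index key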

View File

@@ -21,11 +21,11 @@ import logging, os, cStringIO, time, traceback
 import urlparse

 from libprs500 import browser, __appname__
-from libprs500.ebooks.BeautifulSoup import BeautifulSoup
+from libprs500.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Tag
 from libprs500.ebooks.metadata.opf import OPFCreator
 from libprs500.ebooks.metadata.toc import TOC
 from libprs500.ebooks.metadata import MetaInformation
-from libprs500.web.feeds import feed_from_xml, templates
+from libprs500.web.feeds import feed_from_xml, templates, feeds_from_index
 from libprs500.web.fetch.simple import option_parser as web2disk_option_parser
 from libprs500.web.fetch.simple import RecursiveFetcher
 from libprs500.threadpool import WorkRequest, ThreadPool, NoResultsPending
@@ -74,6 +74,11 @@ class BasicNewsRecipe(object):
     #: @type: string
     timefmt = ' [%a, %d %b %Y]'

+    #: List of feeds to download
+    #: Can be either C{[url1, url2, ...]} or C{[('title1', url1), ('title2', url2),...]}
+    #: @type: List of strings or list of 2-tuples
+    feeds = None
+
     #: Max number of characters in the short description.
     #: @type: integer
     summary_length = 500
@@ -112,7 +117,7 @@ class BasicNewsRecipe(object):

     #: List of options to pass to html2lrf, to customize generation of LRF ebooks.
     #: @type: list of strings
-    html2lrf_options = ['--page-break-before', '$']
+    html2lrf_options = []

     #: List of tags to be removed. Specified tags are removed from downloaded HTML.
     #: A tag is specified as a dictionary of the form::
@@ -134,6 +139,12 @@ class BasicNewsRecipe(object):
     #: tags after the element with id C{content}.
     remove_tags_after = None

+    #: Remove all tags that occur before the specified tag.
+    #: For the format for specifying a tag see L{remove_tags}.
+    #: For example, C{remove_tags_before = dict(id='content')} will remove all
+    #: tags before the element with id C{content}.
+    remove_tags_before = None
+
     #: Keep only the specified tags and their children.
     #: For the format for specifying tags see L{remove_tags}.
     #: If this list is not empty, then the <body> element will be emptied and re-filled with
@@ -220,6 +231,26 @@ class BasicNewsRecipe(object):
         '''
         pass

+    def parse_index(self):
+        '''
+        This method should be implemented in recipes that parse a website
+        instead of feeds to generate a list of articles. Typical uses are for
+        news sources that have a "Print Edition" webpage that lists all the
+        articles in the current print edition. If this function is implemented,
+        it will be used in preference to L{parse_feeds}.
+
+        @rtype: dictionary
+        @return: A dictionary whose keys are feed titles and whose values are each
+                 a list of dictionaries. Each list contains dictionaries of the form::
+            {
+                'title'       : article title,
+                'url'         : URL of print version,
+                'date'        : The publication date of the article as a string,
+                'description' : A summary of the article,
+                'content'     : The full article (can be an empty string). This is used by FullContentProfile.
+            }
+        '''
+        raise NotImplementedError
+
     def __init__(self, options, parser, progress_reporter):
         '''
         Initialize the recipe.
@@ -285,7 +316,7 @@ class BasicNewsRecipe(object):

         self.web2disk_options = web2disk_option_parser().parse_args(web2disk_cmdline)[0]
         for extra in ('keep_only_tags', 'remove_tags', 'preprocess_regexps',
-                      'preprocess_html', 'remove_tags_after'):
+                      'preprocess_html', 'remove_tags_after', 'remove_tags_before'):
             setattr(self.web2disk_options, extra, getattr(self, extra))

         self.web2disk_options.postprocess_html = [self._postprocess_html, self.postprocess_html]
@@ -293,7 +324,7 @@ class BasicNewsRecipe(object):
             self.simultaneous_downloads = 1

         self.navbar = templates.NavBarTemplate()
-        self.html2lrf_options.append('--use-spine')
+        self.html2lrf_options.extend(['--page-break-before', '$', '--use-spine'])
         self.failed_downloads = []
         self.partial_failures = []
@@ -389,7 +420,13 @@ class BasicNewsRecipe(object):
     def build_index(self):
         self.report_progress(0, _('Fetching feeds...'))
-        feeds = self.parse_feeds()
+        try:
+            feeds = feeds_from_index(self.parse_index(), oldest_article=self.oldest_article,
+                                     max_articles_per_feed=self.max_articles_per_feed)
+            self.report_progress(0, _('Got feeds from index page'))
+        except NotImplementedError:
+            feeds = self.parse_feeds()
+
         if self.test:
             feeds = feeds[:2]
         self.has_single_feed = len(feeds) == 1
@@ -485,28 +522,31 @@ class BasicNewsRecipe(object):
         entries = ['index.html']
         toc = TOC(base_path=dir)

-        for i, f in enumerate(feeds):
-            entries.append('feed_%d/index.html'%i)
-            feed = toc.add_item('feed_%d/index.html'%i, None, f.title)
+        def feed_index(num, parent):
+            f = feeds[num]
             for j, a in enumerate(f):
                 if getattr(a, 'downloaded', False):
-                    adir = 'feed_%d/article_%d/'%(i, j)
+                    adir = 'feed_%d/article_%d/'%(num, j)
                     entries.append('%sindex.html'%adir)
-                    feed.add_item('%sindex.html'%adir, None, a.title if a.title else 'Untitled article')
+                    parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'))
                     for sp in a.sub_pages:
                         prefix = os.path.commonprefix([opf_path, sp])
                         relp = sp[len(prefix):]
                         entries.append(relp.replace(os.sep, '/'))

+        if len(feeds) > 1:
+            for i, f in enumerate(feeds):
+                entries.append('feed_%d/index.html'%i)
+                feed = toc.add_item('feed_%d/index.html'%i, None, f.title)
+                feed_index(i, feed)
+        else:
+            entries.append('feed_%d/index.html'%0)
+            feed_index(0, toc)
+
         opf.create_spine(entries)
         opf.set_toc(toc)

-        for i, f in enumerate(feeds):
-            for j, a in enumerate(f):
-                if getattr(a, 'downloaded', False):
-                    adir = 'feed_%d/article_%d/'%(i, j)
-
         opf.render(open(opf_path, 'wb'), open(ncx_path, 'wb'))
@@ -525,7 +565,7 @@ class BasicNewsRecipe(object):
         article = request.article
         self.logger.debug(_('\nDownloaded article %s from %s\n%s')%(article.title, article.url, request.stream.getvalue().decode('utf-8', 'ignore')))
-        article.url = result[0]
+        article.url = 'article_%d/index.html'%a
         article.downloaded = True
         article.sub_pages = result[1][1:]
         self.jobs_done += 1
@@ -563,3 +603,29 @@ class BasicNewsRecipe(object):
                                            max_articles_per_feed=self.max_articles_per_feed))
         return parsed_feeds

+    @classmethod
+    def tag_to_string(cls, tag, use_alt=True):
+        '''
+        Convenience method to take a BeautifulSoup Tag and extract the text from it
+        recursively, including any CDATA sections and alt tag attributes.
+        @param use_alt: If True try to use the alt attribute for tags that don't have any textual content
+        @type use_alt: boolean
+        @return: A unicode (possibly empty) object
+        @rtype: unicode string
+        '''
+        if not tag:
+            return ''
+        if isinstance(tag, basestring):
+            return tag
+        strings = []
+        for item in tag.contents:
+            if isinstance(item, (NavigableString, CData)):
+                strings.append(item.string)
+            elif isinstance(item, Tag):
+                res = cls.tag_to_string(item)
+                if res:
+                    strings.append(res)
+                elif use_alt and item.has_key('alt'):
+                    strings.append(item['alt'])
+        return u''.join(strings)
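
As a quick illustration of the parse_index() contract documented above, here is a hypothetical minimal recipe (not part of this commit). The class name, URL, and link selection are invented, and error handling is omitted:

from libprs500.web.feeds.news import BasicNewsRecipe
from libprs500.ebooks.BeautifulSoup import BeautifulSoup

class ExamplePrintEdition(BasicNewsRecipe):

    title = 'Example Print Edition'
    INDEX = 'http://example.com/printedition'   # invented index page

    def parse_index(self):
        soup = BeautifulSoup(self.browser.open(self.INDEX).read(),
                             convertEntities=BeautifulSoup.HTML_ENTITIES)
        articles = []
        # Turn every link on the index page into one article dictionary.
        for a in soup.findAll('a', href=True):
            articles.append({
                'title'       : self.tag_to_string(a),
                'url'         : a['href'],
                'date'        : '',
                'description' : '',
                'content'     : '',
            })
        return {'Front Page': articles}

Because parse_index() is implemented, build_index() uses it instead of parse_feeds().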

View File

@@ -17,7 +17,7 @@
 '''
 Builtin recipes.
 '''
-recipes = ['newsweek']
+recipes = ['newsweek', 'atlantic', 'economist']

 import re
 from libprs500.web.feeds.news import BasicNewsRecipe

View File

@@ -0,0 +1,60 @@
+#!/usr/bin/env python
+## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2 of the License, or
+## (at your option) any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License along
+## with this program; if not, write to the Free Software Foundation, Inc.,
+## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+'''
+theatlantic.com
+'''
+from libprs500.web.feeds.news import BasicNewsRecipe
+from libprs500.ebooks.BeautifulSoup import BeautifulSoup
+
+class TheAtlantic(BasicNewsRecipe):
+
+    title = 'The Atlantic'
+    INDEX = 'http://www.theatlantic.com/doc/current'
+
+    remove_tags_before = dict(name='div', id='storytop')
+    remove_tags = [dict(name='div', id='seealso')]
+    extra_css = '#bodytext {line-height: 1}'
+
+    def parse_index(self):
+        articles = []
+
+        src = self.browser.open(self.INDEX).read()
+        soup = BeautifulSoup(src, convertEntities=BeautifulSoup.HTML_ENTITIES)
+
+        issue = soup.find('span', attrs={'class':'issue'})
+        if issue:
+            self.timefmt = ' [%s]'%self.tag_to_string(issue).rpartition('|')[-1].strip().replace('/', '-')
+
+        for item in soup.findAll('div', attrs={'class':'item'}):
+            a = item.find('a')
+            if a and a.has_key('href'):
+                url = a['href']
+                url = 'http://www.theatlantic.com/'+url.replace('/doc', 'doc/print')
+                title = self.tag_to_string(a)
+                byline = item.find(attrs={'class':'byline'})
+                date = self.tag_to_string(byline) if byline else ''
+                description = ''
+                articles.append({
+                        'title':title,
+                        'date':date,
+                        'url':url,
+                        'description':description
+                        })
+
+        return {'Current Issue' : articles }
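
A small illustration (not from the commit) of the timefmt manipulation in parse_index() above; the issue text is invented:

issue_text = 'The Atlantic | 2008/04'
# Keep the text after the last '|' and make it filesystem/date friendly.
timefmt = ' [%s]'%issue_text.rpartition('|')[-1].strip().replace('/', '-')
print timefmt   # prints ' [2008-04]'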

View File

@@ -0,0 +1,57 @@
+#!/usr/bin/env python
+## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2 of the License, or
+## (at your option) any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License along
+## with this program; if not, write to the Free Software Foundation, Inc.,
+## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+'''
+economist.com
+'''
+from libprs500.web.feeds.news import BasicNewsRecipe
+from libprs500.ebooks.BeautifulSoup import BeautifulSoup
+
+class Economist(BasicNewsRecipe):
+
+    title = 'The Economist'
+    oldest_article = 7.0
+    INDEX = 'http://www.economist.com/printedition'
+
+    remove_tags = [dict(name=['script', 'noscript', 'title'])]
+    remove_tags_before = dict(name=lambda tag: tag.name=='title' and tag.parent.name=='body')
+
+    def parse_index(self):
+        soup = BeautifulSoup(self.browser.open(self.INDEX).read(),
+                             convertEntities=BeautifulSoup.HTML_ENTITIES)
+        index_started = False
+        feeds = {}
+        key = None
+        for tag in soup.findAll(['h1', 'h2']):
+            text = ''.join(tag.findAll(text=True))
+            if tag.name == 'h1':
+                if 'Classified ads' in text:
+                    break
+                if 'The world this week' in text:
+                    index_started = True
+                if not index_started:
+                    continue
+                feeds[text] = []
+                key = text
+                continue
+            if key is None:
+                continue
+            a = tag.find('a', href=True)
+            if a is not None:
+                article = dict(title=text,
+                               url='http://www.economist.com'+a['href'].replace('displaystory', 'PrinterFriendly'),
+                               description='', content='', date='')
+                feeds[key].append(article)
+        return feeds
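
For orientation, the dictionary shape this parse_index() returns: each h1 on the print-edition page becomes a feed key and each h2 that links to an article contributes one entry. The section title, article title, and URL below are invented:

feeds = {
    'The world this week': [
        {'title': 'An example article',
         'url': 'http://www.economist.com/PrinterFriendly-example',   # invented URL
         'description': '', 'content': '', 'date': ''},
    ],
    'Letters': [],   # a section whose h2 headings carried no links stays empty
}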

View File

@@ -102,7 +102,7 @@ class IndexTemplate(Template):
                 <?python
                     from datetime import datetime
                 ?>
-                <p style="text-align:right">${datetime.now().strftime(datefmt)}</p>
+                <p style="text-align:right">${datetime.now().strftime(str(datefmt))}</p>
                 <ul>
                 <py:for each="i, feed in enumerate(feeds)">
                     <li py:if="feed" id="feed_${str(i)}">
@@ -144,6 +144,9 @@ class FeedTemplate(Template):
                     <img alt="${feed.image_alt}" src="${feed.image_url}" />
                 </div>
                 </py:if>
+                <div py:if="feed.description">
+                    ${feed.description}
+                </div>
                 <ul>
                 <py:for each="i, article in enumerate(feed.articles)">
                     <li id="${'article_%d'%i}" py:if="getattr(article, 'downloaded', False)">

View File

@@ -85,6 +85,7 @@ class RecursiveFetcher(object):
         self.preprocess_regexps  = getattr(options, 'preprocess_regexps', [])
         self.remove_tags         = getattr(options, 'remove_tags', [])
         self.remove_tags_after   = getattr(options, 'remove_tags_after', None)
+        self.remove_tags_before  = getattr(options, 'remove_tags_before', None)
         self.keep_only_tags      = getattr(options, 'keep_only_tags', [])
         self.preprocess_html_ext = getattr(options, 'preprocess_html', lambda soup: soup)
         self.postprocess_html_ext= getattr(options, 'postprocess_html', [])
@@ -105,16 +106,23 @@
                     body.insert(len(body.contents), tag)
                 soup.find('body').replaceWith(body)

-        if self.remove_tags_after is not None:
-            tag = soup.find(**self.remove_tags_after)
+        def remove_beyond(tag, next):
             while tag is not None and tag.name != 'body':
-                after = tag.nextSibling
+                after = getattr(tag, next)
                 while after is not None:
-                    ns = after.nextSibling
+                    ns = getattr(tag, next)
                     after.extract()
                     after = ns
                 tag = tag.parent

+        if self.remove_tags_after is not None:
+            tag = soup.find(**self.remove_tags_after)
+            remove_beyond(tag, 'nextSibling')
+
+        if self.remove_tags_before is not None:
+            tag = soup.find(**self.remove_tags_before)
+            remove_beyond(tag, 'previousSibling')
+
         for kwds in self.remove_tags:
             for tag in soup.findAll(**kwds):
                 tag.extract()
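
A self-contained sketch (not part of the commit) of the effect of remove_tags_before: it performs the same previousSibling walk that remove_beyond() runs when a match is found. The HTML snippet is invented:

from libprs500.ebooks.BeautifulSoup import BeautifulSoup

html = '<body><div>masthead</div><div id="storytop"><p>story</p></div><div>footer</div></body>'
soup = BeautifulSoup(html)

tag = soup.find('div', id='storytop')
# Walk up from the matching tag, stripping everything before it at each level.
while tag is not None and tag.name != 'body':
    before = tag.previousSibling
    while before is not None:
        ns = before.previousSibling
        before.extract()
        before = ns
    tag = tag.parent

print soup.body
# -> <body><div id="storytop"><p>story</p></div><div>footer</div></body>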