From 5a76f5c2e1fd161898de5ccdde9131288e0820d2 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sat, 15 Mar 2008 20:44:25 +0000
Subject: [PATCH] Added recipes for The Atlantic and Economist to feeds2disk

---
 src/libprs500/web/feeds/__init__.py          |  50 ++++++++-
 src/libprs500/web/feeds/news.py              | 102 +++++++++++++++----
 src/libprs500/web/feeds/recipes/__init__.py  |   2 +-
 src/libprs500/web/feeds/recipes/atlantic.py  |  60 +++++++++++
 src/libprs500/web/feeds/recipes/economist.py |  57 +++++++++++
 src/libprs500/web/feeds/templates.py         |   5 +-
 src/libprs500/web/fetch/simple.py            |  16 ++-
 7 files changed, 266 insertions(+), 26 deletions(-)
 create mode 100644 src/libprs500/web/feeds/recipes/atlantic.py
 create mode 100644 src/libprs500/web/feeds/recipes/economist.py

diff --git a/src/libprs500/web/feeds/__init__.py b/src/libprs500/web/feeds/__init__.py
index fb551febd4..1ebbfd78d0 100644
--- a/src/libprs500/web/feeds/__init__.py
+++ b/src/libprs500/web/feeds/__init__.py
@@ -64,7 +64,7 @@ class Feed(object):
                  max_articles_per_feed=100):
         entries = feed.entries
         feed = feed.feed
-        self.title = feed.get('title', 'Unknown feed') if not title else title
+        self.title = feed.get('title', _('Unknown feed')) if not title else title
         self.description = feed.get('description', '')
         image = feed.get('image', {})
         self.image_url = image.get('href', None)
@@ -83,6 +83,38 @@ class Feed(object):
                 break
             self.parse_article(item)
 
+    def populate_from_preparsed_feed(self, title, articles, oldest_article=7,
+                                     max_articles_per_feed=100):
+        self.title          = title if title else _('Unknown feed')
+        self.description    = ''
+        self.image_url      = None
+        self.articles       = []
+        self.added_articles = []
+
+        self.oldest_article = oldest_article
+        self.id_counter     = 0
+
+        for item in articles:
+            if len(self.articles) >= max_articles_per_feed:
+                break
+            id = item.get('id', 'internal id#'+str(self.id_counter))
+            if id in self.added_articles:
+                continue
+            self.added_articles.append(id)
+            self.id_counter += 1
+            published   = time.gmtime(item.get('timestamp', time.time()))
+            title       = item.get('title', _('Untitled article'))
+            link        = item.get('url', None)
+            description = item.get('description', '')
+            content     = item.get('content', '')
+            article = Article(id, title, link, description, published, content)
+            delta = datetime.utcnow() - article.utctime
+            if delta.days*24*3600 + delta.seconds <= 24*3600*self.oldest_article:
+                self.articles.append(article)
+            else:
+                self.logger.debug('Skipping article %s (%s) from feed %s as it is too old.'%(title, article.localtime.strftime('%a, %d %b, %Y %H:%M'), self.title))
+
+
     def parse_article(self, item):
         id = item.get('id', 'internal id#'+str(self.id_counter))
         if id in self.added_articles:
@@ -91,7 +123,7 @@ class Feed(object):
         self.id_counter += 1
         self.added_articles.append(id)
 
-        title = item.get('title', 'Untitled article')
+        title = item.get('title', _('Untitled article'))
         link  = item.get('link', None)
         description = item.get('summary', None)
 
@@ -134,3 +166,17 @@ def feed_from_xml(raw_xml, title=None, oldest_article=7, max_articles_per_feed=1
                            oldest_article=oldest_article,
                            max_articles_per_feed=max_articles_per_feed)
     return pfeed
+
+def feeds_from_index(index, oldest_article=7, max_articles_per_feed=100):
+    '''
+    @param index: A parsed index as returned by L{BasicNewsRecipe.parse_index}.
+    @return: A list of L{Feed} objects.
+    @rtype: list
+    '''
+    feeds = []
+    for title, articles in index.items():
+        pfeed = Feed()
+        pfeed.populate_from_preparsed_feed(title, articles, oldest_article=oldest_article,
+                                           max_articles_per_feed=max_articles_per_feed)
+        feeds.append(pfeed)
+    return feeds
\ No newline at end of file
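
The index structure consumed by feeds_from_index() is the same one parse_index() returns: a mapping of feed titles to lists of article dictionaries. A minimal sketch of driving it directly; the titles, URLs and values below are invented for illustration:

    from libprs500.web.feeds import feeds_from_index

    # Hypothetical index of the {feed title: [article dicts]} shape.
    index = {
        'Current Issue': [
            {'title'       : 'A sample article',            # invented
             'url'         : 'http://example.com/print/1',  # invented
             'date'        : 'Mar 2008',
             'description' : 'One-line summary.',
             'content'     : ''},
        ],
    }

    feeds = feeds_from_index(index, oldest_article=7, max_articles_per_feed=100)
    for feed in feeds:
        for article in feed.articles:
            print feed.title, article.title, article.url

Note that articles without a 'timestamp' key default to the current time in populate_from_preparsed_feed, so they always pass the oldest_article filter.
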
diff --git a/src/libprs500/web/feeds/news.py b/src/libprs500/web/feeds/news.py
index 4550e34fcc..3b82418f0d 100644
--- a/src/libprs500/web/feeds/news.py
+++ b/src/libprs500/web/feeds/news.py
@@ -21,11 +21,11 @@ import logging, os, cStringIO, time, traceback
 import urlparse
 
 from libprs500 import browser, __appname__
-from libprs500.ebooks.BeautifulSoup import BeautifulSoup
+from libprs500.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Tag
 from libprs500.ebooks.metadata.opf import OPFCreator
 from libprs500.ebooks.metadata.toc import TOC
 from libprs500.ebooks.metadata import MetaInformation
-from libprs500.web.feeds import feed_from_xml, templates
+from libprs500.web.feeds import feed_from_xml, templates, feeds_from_index
 from libprs500.web.fetch.simple import option_parser as web2disk_option_parser
 from libprs500.web.fetch.simple import RecursiveFetcher
 from libprs500.threadpool import WorkRequest, ThreadPool, NoResultsPending
@@ -74,6 +74,11 @@ class BasicNewsRecipe(object):
     #: @type: string
     timefmt = ' [%a, %d %b %Y]'
 
+    #: List of feeds to download.
+    #: Can be either C{[url1, url2, ...]} or C{[('title1', url1), ('title2', url2), ...]}.
+    #: @type: list of strings or list of 2-tuples
+    feeds = None
+
     #: Max number of characters in the short description.
     #: @type: integer
     summary_length = 500
@@ -112,7 +117,7 @@ class BasicNewsRecipe(object):
 
     #: List of options to pass to html2lrf, to customize generation of LRF ebooks.
     #: @type: list of strings
-    html2lrf_options = ['--page-break-before', '$']
+    html2lrf_options = []
 
     #: List of tags to be removed. Specified tags are removed from downloaded HTML.
     #: A tag is specified as a dictionary of the form::
@@ -134,6 +139,12 @@ class BasicNewsRecipe(object):
     #: tags after the element with id C{content}.
     remove_tags_after = None
 
+    #: Remove all tags that occur before the specified tag.
+    #: For the format for specifying a tag see L{remove_tags}.
+    #: For example, C{remove_tags_before = [dict(id='content')]} will remove all
+    #: tags before the element with id C{content}.
+    remove_tags_before = None
+
     #: Keep only the specified tags and their children.
     #: For the format for specifying tags see L{remove_tags}.
     #: If this list is not empty, then the <body> element will be emptied and re-filled with
@@ -220,6 +231,26 @@ class BasicNewsRecipe(object):
         '''
         pass
 
+    def parse_index(self):
+        '''
+        This method should be implemented in recipes that parse a website
+        instead of feeds to generate a list of articles. Typical uses are for
+        news sources that have a "Print Edition" webpage that lists all the
+        articles in the current print edition. If this function is implemented,
+        it will be used in preference to L{parse_feeds}.
+        @rtype: dictionary
+        @return: A dictionary whose keys are feed titles and whose values are
+        lists of articles. Each article is a dictionary of the form::
+            {
+             'title'       : article title,
+             'url'         : URL of print version,
+             'date'        : the publication date of the article as a string,
+             'description' : a summary of the article,
+             'content'     : the full article (can be an empty string). Used by FullContentProfile.
+            }
+        '''
+        raise NotImplementedError
+
     def __init__(self, options, parser, progress_reporter):
         '''
         Initialize the recipe.
@@ -285,7 +316,7 @@ class BasicNewsRecipe(object):
         self.web2disk_options = web2disk_option_parser().parse_args(web2disk_cmdline)[0]
         for extra in ('keep_only_tags', 'remove_tags', 'preprocess_regexps',
-                      'preprocess_html', 'remove_tags_after'):
+                      'preprocess_html', 'remove_tags_after', 'remove_tags_before'):
             setattr(self.web2disk_options, extra, getattr(self, extra))
         self.web2disk_options.postprocess_html = [self._postprocess_html, self.postprocess_html]
 
@@ -293,7 +324,7 @@ class BasicNewsRecipe(object):
             self.simultaneous_downloads = 1
 
         self.navbar = templates.NavBarTemplate()
-        self.html2lrf_options.append('--use-spine')
+        self.html2lrf_options.extend(['--page-break-before', '$', '--use-spine'])
         self.failed_downloads = []
         self.partial_failures = []
 
@@ -389,7 +420,13 @@ class BasicNewsRecipe(object):
 
     def build_index(self):
         self.report_progress(0, _('Fetching feeds...'))
-        feeds = self.parse_feeds()
+        try:
+            feeds = feeds_from_index(self.parse_index(), oldest_article=self.oldest_article,
+                                     max_articles_per_feed=self.max_articles_per_feed)
+            self.report_progress(0, _('Got feeds from index page'))
+        except NotImplementedError:
+            feeds = self.parse_feeds()
+
         if self.test:
             feeds = feeds[:2]
         self.has_single_feed = len(feeds) == 1
@@ -485,28 +522,31 @@ class BasicNewsRecipe(object):
         entries = ['index.html']
         toc = TOC(base_path=dir)
-        for i, f in enumerate(feeds):
-            entries.append('feed_%d/index.html'%i)
-            feed = toc.add_item('feed_%d/index.html'%i, None, f.title)
+
+        def feed_index(num, parent):
+            f = feeds[num]
             for j, a in enumerate(f):
                 if getattr(a, 'downloaded', False):
-                    adir = 'feed_%d/article_%d/'%(i, j)
+                    adir = 'feed_%d/article_%d/'%(num, j)
                     entries.append('%sindex.html'%adir)
-                    feed.add_item('%sindex.html'%adir, None, a.title if a.title else 'Untitled article')
+                    parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'))
                     for sp in a.sub_pages:
                         prefix = os.path.commonprefix([opf_path, sp])
                         relp = sp[len(prefix):]
                         entries.append(relp.replace(os.sep, '/'))
+
+        if len(feeds) > 1:
+            for i, f in enumerate(feeds):
+                entries.append('feed_%d/index.html'%i)
+                feed = toc.add_item('feed_%d/index.html'%i, None, f.title)
+                feed_index(i, feed)
+        else:
+            entries.append('feed_%d/index.html'%0)
+            feed_index(0, toc)
 
         opf.create_spine(entries)
         opf.set_toc(toc)
-        for i, f in enumerate(feeds):
-
-            for j, a in enumerate(f):
-                if getattr(a, 'downloaded', False):
-                    adir = 'feed_%d/article_%d/'%(i, j)
-
         opf.render(open(opf_path, 'wb'), open(ncx_path, 'wb'))
 
@@ -525,7 +565,7 @@ class BasicNewsRecipe(object):
         article = request.article
         self.logger.debug(_('\nDownloaded article %s from %s\n%s')%(article.title, article.url, request.stream.getvalue().decode('utf-8', 'ignore')))
-        article.url = result[0]
+        article.url = 'article_%d/index.html'%a
         article.downloaded = True
         article.sub_pages = result[1][1:]
         self.jobs_done += 1
@@ -563,3 +603,29 @@ class BasicNewsRecipe(object):
                               max_articles_per_feed=self.max_articles_per_feed))
 
         return parsed_feeds
+
+    @classmethod
+    def tag_to_string(cls, tag, use_alt=True):
+        '''
+        Convenience method to take a BeautifulSoup Tag and extract the text from it
+        recursively, including any CDATA sections and alt tag attributes.
+        @param use_alt: If True, try to use the alt attribute for tags that don't have any textual content.
+        @type use_alt: boolean
+        @return: A unicode (possibly empty) object.
+        @rtype: unicode string
+        '''
+        if not tag:
+            return ''
+        if isinstance(tag, basestring):
+            return tag
+        strings = []
+        for item in tag.contents:
+            if isinstance(item, (NavigableString, CData)):
+                strings.append(item.string)
+            elif isinstance(item, Tag):
+                res = cls.tag_to_string(item)
+                if res:
+                    strings.append(res)
+                elif use_alt and item.has_key('alt'):
+                    strings.append(item['alt'])
+        return u''.join(strings)
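
The parse_index()/tag_to_string() pair is the scaffolding the two new recipes below build on. A minimal sketch of a recipe using them; the site markup, class name and URL here are invented, so a real recipe needs real selectors:

    from libprs500.web.feeds.news import BasicNewsRecipe
    from libprs500.ebooks.BeautifulSoup import BeautifulSoup

    class ExamplePrintEdition(BasicNewsRecipe):

        title = 'Example Print Edition'
        INDEX = 'http://www.example.com/current-issue'  # invented URL

        def parse_index(self):
            src  = self.browser.open(self.INDEX).read()
            soup = BeautifulSoup(src, convertEntities=BeautifulSoup.HTML_ENTITIES)
            articles = []
            # Assumes each article on the index page sits in <div class="story">
            for div in soup.findAll('div', attrs={'class':'story'}):
                a = div.find('a')
                if a is None or not a.has_key('href'):
                    continue
                articles.append({
                    'title'       : self.tag_to_string(a),
                    'url'         : a['href'],
                    'date'        : '',
                    'description' : '',
                    'content'     : '',
                })
            return {'Current Issue': articles}
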
diff --git a/src/libprs500/web/feeds/recipes/__init__.py b/src/libprs500/web/feeds/recipes/__init__.py
index ed7d7a3d9a..4fb593a371 100644
--- a/src/libprs500/web/feeds/recipes/__init__.py
+++ b/src/libprs500/web/feeds/recipes/__init__.py
@@ -17,7 +17,7 @@
 '''
 Builtin recipes.
 '''
-recipes = ['newsweek']
+recipes = ['newsweek', 'atlantic', 'economist']
 
 import re
 from libprs500.web.feeds.news import BasicNewsRecipe
diff --git a/src/libprs500/web/feeds/recipes/atlantic.py b/src/libprs500/web/feeds/recipes/atlantic.py
new file mode 100644
index 0000000000..6632e83e12
--- /dev/null
+++ b/src/libprs500/web/feeds/recipes/atlantic.py
@@ -0,0 +1,60 @@
+#!/usr/bin/env python
+
+##    Copyright (C) 2008 Kovid Goyal <kovid@kovidgoyal.net>
+##    This program is free software; you can redistribute it and/or modify
+##    it under the terms of the GNU General Public License as published by
+##    the Free Software Foundation; either version 2 of the License, or
+##    (at your option) any later version.
+##
+##    This program is distributed in the hope that it will be useful,
+##    but WITHOUT ANY WARRANTY; without even the implied warranty of
+##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+##    GNU General Public License for more details.
+##
+##    You should have received a copy of the GNU General Public License along
+##    with this program; if not, write to the Free Software Foundation, Inc.,
+##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+'''
+theatlantic.com
+'''
+
+from libprs500.web.feeds.news import BasicNewsRecipe
+from libprs500.ebooks.BeautifulSoup import BeautifulSoup
+
+class TheAtlantic(BasicNewsRecipe):
+
+    title = 'The Atlantic'
+    INDEX = 'http://www.theatlantic.com/doc/current'
+
+    remove_tags_before = dict(name='div', id='storytop')
+    remove_tags        = [dict(name='div', id='seealso')]
+    extra_css          = '#bodytext {line-height: 1}'
+
+    def parse_index(self):
+        articles = []
+
+        src = self.browser.open(self.INDEX).read()
+        soup = BeautifulSoup(src, convertEntities=BeautifulSoup.HTML_ENTITIES)
+
+        issue = soup.find('span', attrs={'class':'issue'})
+        if issue:
+            self.timefmt = ' [%s]'%self.tag_to_string(issue).rpartition('|')[-1].strip().replace('/', '-')
+
+        for item in soup.findAll('div', attrs={'class':'item'}):
+            a = item.find('a')
+            if a and a.has_key('href'):
+                url = a['href']
+                url = 'http://www.theatlantic.com/'+url.replace('/doc', 'doc/print')
+                title = self.tag_to_string(a)
+                byline = item.find(attrs={'class':'byline'})
+                date = self.tag_to_string(byline) if byline else ''
+                description = ''
+                articles.append({
+                        'title':title,
+                        'date':date,
+                        'url':url,
+                        'description':description
+                        })
+
+
+        return {'Current Issue' : articles }
\ No newline at end of file
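
The timefmt assignment in TheAtlantic.parse_index is dense; a worked example of the string chain, on an invented issue banner:

    issue_text = u'The Atlantic Monthly | April 2008'   # invented sample
    fmt = issue_text.rpartition('|')[-1].strip().replace('/', '-')
    # rpartition('|') -> (u'The Atlantic Monthly ', u'|', u' April 2008')
    # fmt == u'April 2008', so timefmt becomes ' [April 2008]'

The replace('/', '-') only matters for banners that carry a date of the form '04/2008'.
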
diff --git a/src/libprs500/web/feeds/recipes/economist.py b/src/libprs500/web/feeds/recipes/economist.py
new file mode 100644
index 0000000000..33407fa04a
--- /dev/null
+++ b/src/libprs500/web/feeds/recipes/economist.py
@@ -0,0 +1,57 @@
+#!/usr/bin/env python
+
+##    Copyright (C) 2008 Kovid Goyal <kovid@kovidgoyal.net>
+##    This program is free software; you can redistribute it and/or modify
+##    it under the terms of the GNU General Public License as published by
+##    the Free Software Foundation; either version 2 of the License, or
+##    (at your option) any later version.
+##
+##    This program is distributed in the hope that it will be useful,
+##    but WITHOUT ANY WARRANTY; without even the implied warranty of
+##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+##    GNU General Public License for more details.
+##
+##    You should have received a copy of the GNU General Public License along
+##    with this program; if not, write to the Free Software Foundation, Inc.,
+##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+'''
+economist.com
+'''
+from libprs500.web.feeds.news import BasicNewsRecipe
+from libprs500.ebooks.BeautifulSoup import BeautifulSoup
+
+class Economist(BasicNewsRecipe):
+
+    title = 'The Economist'
+    oldest_article = 7.0
+    INDEX = 'http://www.economist.com/printedition'
+    remove_tags = [dict(name=['script', 'noscript', 'title'])]
+    remove_tags_before = dict(name=lambda tag: tag.name=='title' and tag.parent.name=='body')
+
+    def parse_index(self):
+        soup = BeautifulSoup(self.browser.open(self.INDEX).read(),
+                             convertEntities=BeautifulSoup.HTML_ENTITIES)
+        index_started = False
+        feeds = {}
+        key = None
+        for tag in soup.findAll(['h1', 'h2']):
+            text = ''.join(tag.findAll(text=True))
+            if tag.name == 'h1':
+                if 'Classified ads' in text:
+                    break
+                if 'The world this week' in text:
+                    index_started = True
+                if not index_started:
+                    continue
+                feeds[text] = []
+                key = text
+                continue
+            if key is None:
+                continue
+            a = tag.find('a', href=True)
+            if a is not None:
+                article = dict(title=text,
+                               url='http://www.economist.com'+a['href'].replace('displaystory', 'PrinterFriendly'),
+                               description='', content='', date='')
+                feeds[key].append(article)
+        return feeds
\ No newline at end of file
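
Economist.parse_index relies on document order: each h1 starts a new section and every h2 that follows contributes an article to it. A toy illustration of the same bucketing pattern; the markup is invented and it assumes the BeautifulSoup 3.x API used throughout this patch:

    from BeautifulSoup import BeautifulSoup

    html = '''<h1>The world this week</h1>
              <h2><a href="/story/1">First story</a></h2>
              <h1>Business</h1>
              <h2><a href="/story/2">Second story</a></h2>'''
    soup = BeautifulSoup(html)
    sections, key = {}, None
    for tag in soup.findAll(['h1', 'h2']):
        text = ''.join(tag.findAll(text=True))
        if tag.name == 'h1':
            sections[text] = []
            key = text
        elif key is not None and tag.find('a', href=True) is not None:
            sections[key].append(text)
    # sections == {u'The world this week': [u'First story'],
    #              u'Business': [u'Second story']}
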
diff --git a/src/libprs500/web/feeds/templates.py b/src/libprs500/web/feeds/templates.py
index 1d1becbb51..03432ec151 100644
--- a/src/libprs500/web/feeds/templates.py
+++ b/src/libprs500/web/feeds/templates.py
@@ -102,7 +102,7 @@ class IndexTemplate(Template):
-                        ${datetime.now().strftime(datefmt)}
+                        ${datetime.now().strftime(str(datefmt))}
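
The one-line templates.py change coerces datefmt with str() before calling strftime. The patch does not say why, but a plausible reason: recipes can now derive timefmt from page content via tag_to_string (TheAtlantic does exactly this), which yields a unicode object, and strftime on Python 2 expects a byte string. A sketch of the coercion, assuming an ASCII-safe format:

    from datetime import datetime

    datefmt = u' [%a, %d %b %Y]'                 # may now arrive as unicode
    print datetime.now().strftime(str(datefmt))  # normalized to a byte string
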