From 5a76f5c2e1fd161898de5ccdde9131288e0820d2 Mon Sep 17 00:00:00 2001
From: Kovid Goyal <kovid@kovidgoyal.net>
Date: Sat, 15 Mar 2008 20:44:25 +0000
Subject: [PATCH] Added recipes for The Atlantic and Economist to feeds2disk

---
 src/libprs500/web/feeds/__init__.py          |  50 ++++++++-
 src/libprs500/web/feeds/news.py              | 102 +++++++++++++++----
 src/libprs500/web/feeds/recipes/__init__.py  |   2 +-
 src/libprs500/web/feeds/recipes/atlantic.py  |  60 +++++++++++
 src/libprs500/web/feeds/recipes/economist.py |  57 +++++++++++
 src/libprs500/web/feeds/templates.py         |   5 +-
 src/libprs500/web/fetch/simple.py            |  16 ++-
 7 files changed, 266 insertions(+), 26 deletions(-)
 create mode 100644 src/libprs500/web/feeds/recipes/atlantic.py
 create mode 100644 src/libprs500/web/feeds/recipes/economist.py

diff --git a/src/libprs500/web/feeds/__init__.py b/src/libprs500/web/feeds/__init__.py
index fb551febd4..1ebbfd78d0 100644
--- a/src/libprs500/web/feeds/__init__.py
+++ b/src/libprs500/web/feeds/__init__.py
@@ -64,7 +64,7 @@ class Feed(object):
                  max_articles_per_feed=100):
         entries = feed.entries
         feed = feed.feed
-        self.title = feed.get('title', 'Unknown feed') if not title else title
+        self.title = feed.get('title', _('Unknown feed')) if not title else title
         self.description = feed.get('description', '')
         image = feed.get('image', {})
         self.image_url = image.get('href', None)
@@ -83,6 +83,38 @@ class Feed(object):
                 break
             self.parse_article(item)
 
+    def populate_from_preparsed_feed(self, title, articles, oldest_article=7,
+                                     max_articles_per_feed=100):
+        self.title          = title if title else _('Unknown feed')
+        self.description    = ''
+        self.image_url      = None
+        self.articles       = []
+        self.added_articles = []
+
+        self.oldest_article = oldest_article
+        self.id_counter     = 0
+
+        for item in articles:
+            if len(self.articles) >= max_articles_per_feed:
+                break
+            id = item.get('id', 'internal id#'+str(self.id_counter))
+            if id in self.added_articles:
+                continue
+            self.added_articles.append(id)
+            self.id_counter += 1
+            published   = time.gmtime(item.get('timestamp', time.time()))
+            title       = item.get('title', _('Untitled article'))
+            link        = item.get('url', None)
+            description = item.get('description', '')
+            content     = item.get('content', '')
+            article = Article(id, title, link, description, published, content)
+            delta = datetime.utcnow() - article.utctime
+            if delta.days*24*3600 + delta.seconds <= 24*3600*self.oldest_article:
+                self.articles.append(article)
+            else:
+                self.logger.debug('Skipping article %s (%s) from feed %s as it is too old.'%(title, article.localtime.strftime('%a, %d %b, %Y %H:%M'), self.title))
+
+
     def parse_article(self, item):
         id = item.get('id', 'internal id#'+str(self.id_counter))
         if id in self.added_articles:
@@ -91,7 +123,7 @@ class Feed(object):
         self.id_counter += 1
         self.added_articles.append(id)
 
-        title = item.get('title', 'Untitled article')
+        title = item.get('title', _('Untitled article'))
         link  = item.get('link', None)
         description = item.get('summary', None)
 
@@ -134,3 +166,17 @@ def feed_from_xml(raw_xml, title=None, oldest_article=7, max_articles_per_feed=1
                            oldest_article=oldest_article,
                            max_articles_per_feed=max_articles_per_feed)
     return pfeed
+
+def feeds_from_index(index, oldest_article=7, max_articles_per_feed=100):
+    '''
+    @param index: A parsed index as returned by L{BasicNewsRecipe.parse_index}.
+    @return: A list of L{Feed} objects.
+    @rtype: list
+    '''
+    feeds = []
+    for title, articles in index.items():
+        pfeed = Feed()
+        pfeed.populate_from_preparsed_feed(title, articles, oldest_article=oldest_article,
+                                           max_articles_per_feed=max_articles_per_feed)
+        feeds.append(pfeed)
+    return feeds
\ No newline at end of file
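
The index structure consumed by feeds_from_index() is the same one parse_index() returns: a mapping of feed titles to lists of article dictionaries. A minimal sketch of driving it directly; the titles, URLs and values below are invented for illustration:

    from libprs500.web.feeds import feeds_from_index

    # Hypothetical index of the {feed title: [article dicts]} shape.
    index = {
        'Current Issue': [
            {'title'       : 'A sample article',            # invented
             'url'         : 'http://example.com/print/1',  # invented
             'date'        : 'Mar 2008',
             'description' : 'One-line summary.',
             'content'     : ''},
        ],
    }

    feeds = feeds_from_index(index, oldest_article=7, max_articles_per_feed=100)
    for feed in feeds:
        for article in feed.articles:
            print feed.title, article.title, article.url

Note that articles without a 'timestamp' key default to the current time in populate_from_preparsed_feed, so they always pass the oldest_article filter.
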
diff --git a/src/libprs500/web/feeds/news.py b/src/libprs500/web/feeds/news.py
index 4550e34fcc..3b82418f0d 100644
--- a/src/libprs500/web/feeds/news.py
+++ b/src/libprs500/web/feeds/news.py
@@ -21,11 +21,11 @@ import logging, os, cStringIO, time, traceback
 import urlparse
 
 from libprs500 import browser, __appname__
-from libprs500.ebooks.BeautifulSoup import BeautifulSoup
+from libprs500.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Tag
 from libprs500.ebooks.metadata.opf import OPFCreator
 from libprs500.ebooks.metadata.toc import TOC
 from libprs500.ebooks.metadata import MetaInformation
-from libprs500.web.feeds import feed_from_xml, templates
+from libprs500.web.feeds import feed_from_xml, templates, feeds_from_index
 from libprs500.web.fetch.simple import option_parser as web2disk_option_parser
 from libprs500.web.fetch.simple import RecursiveFetcher
 from libprs500.threadpool import WorkRequest, ThreadPool, NoResultsPending
@@ -74,6 +74,11 @@ class BasicNewsRecipe(object):
     #: @type: string
     timefmt = ' [%a, %d %b %Y]'
 
+    #: List of feeds to download.
+    #: Can be either C{[url1, url2, ...]} or C{[('title1', url1), ('title2', url2), ...]}.
+    #: @type: list of strings or list of 2-tuples
+    feeds = None
+
     #: Max number of characters in the short description.
     #: @type: integer
     summary_length = 500
@@ -112,7 +117,7 @@ class BasicNewsRecipe(object):
 
     #: List of options to pass to html2lrf, to customize generation of LRF ebooks.
     #: @type: list of strings
-    html2lrf_options = ['--page-break-before', '$']
+    html2lrf_options = []
 
     #: List of tags to be removed. Specified tags are removed from downloaded HTML.
     #: A tag is specified as a dictionary of the form::
@@ -134,6 +139,12 @@ class BasicNewsRecipe(object):
     #: tags after the element with id C{content}.
     remove_tags_after = None
 
+    #: Remove all tags that occur before the specified tag.
+    #: For the format for specifying a tag see L{remove_tags}.
+    #: For example, C{remove_tags_before = [dict(id='content')]} will remove all
+    #: tags before the element with id C{content}.
+    remove_tags_before = None
+
     #: Keep only the specified tags and their children.
     #: For the format for specifying tags see L{remove_tags}.
     #: If this list is not empty, then the <body> element will be emptied and re-filled with
@@ -220,6 +231,26 @@ class BasicNewsRecipe(object):
         '''
         pass
 
+    def parse_index(self):
+        '''
+        This method should be implemented in recipes that parse a website
+        instead of feeds to generate a list of articles. Typical uses are for
+        news sources that have a "Print Edition" webpage that lists all the
+        articles in the current print edition. If this function is implemented,
+        it will be used in preference to L{parse_feeds}.
+        @rtype: dictionary
+        @return: A dictionary whose keys are feed titles and whose values are
+        lists of articles. Each article is a dictionary of the form::
+            {
+             'title'       : article title,
+             'url'         : URL of print version,
+             'date'        : the publication date of the article as a string,
+             'description' : a summary of the article,
+             'content'     : the full article (can be an empty string). Used by FullContentProfile.
+            }
+        '''
+        raise NotImplementedError
+
     def __init__(self, options, parser, progress_reporter):
         '''
         Initialize the recipe.
@@ -285,7 +316,7 @@ class BasicNewsRecipe(object):
         self.web2disk_options = web2disk_option_parser().parse_args(web2disk_cmdline)[0]
         for extra in ('keep_only_tags', 'remove_tags', 'preprocess_regexps',
-                      'preprocess_html', 'remove_tags_after'):
+                      'preprocess_html', 'remove_tags_after', 'remove_tags_before'):
             setattr(self.web2disk_options, extra, getattr(self, extra))
         self.web2disk_options.postprocess_html = [self._postprocess_html, self.postprocess_html]
 
@@ -293,7 +324,7 @@ class BasicNewsRecipe(object):
             self.simultaneous_downloads = 1
 
         self.navbar = templates.NavBarTemplate()
-        self.html2lrf_options.append('--use-spine')
+        self.html2lrf_options.extend(['--page-break-before', '$', '--use-spine'])
         self.failed_downloads = []
         self.partial_failures = []
 
@@ -389,7 +420,13 @@ class BasicNewsRecipe(object):
 
     def build_index(self):
         self.report_progress(0, _('Fetching feeds...'))
-        feeds = self.parse_feeds()
+        try:
+            feeds = feeds_from_index(self.parse_index(), oldest_article=self.oldest_article,
+                                     max_articles_per_feed=self.max_articles_per_feed)
+            self.report_progress(0, _('Got feeds from index page'))
+        except NotImplementedError:
+            feeds = self.parse_feeds()
+
         if self.test:
             feeds = feeds[:2]
         self.has_single_feed = len(feeds) == 1
@@ -485,28 +522,31 @@ class BasicNewsRecipe(object):
         entries = ['index.html']
         toc = TOC(base_path=dir)
-        for i, f in enumerate(feeds):
-            entries.append('feed_%d/index.html'%i)
-            feed = toc.add_item('feed_%d/index.html'%i, None, f.title)
+
+        def feed_index(num, parent):
+            f = feeds[num]
             for j, a in enumerate(f):
                 if getattr(a, 'downloaded', False):
-                    adir = 'feed_%d/article_%d/'%(i, j)
+                    adir = 'feed_%d/article_%d/'%(num, j)
                     entries.append('%sindex.html'%adir)
-                    feed.add_item('%sindex.html'%adir, None, a.title if a.title else 'Untitled article')
+                    parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'))
                     for sp in a.sub_pages:
                         prefix = os.path.commonprefix([opf_path, sp])
                         relp = sp[len(prefix):]
                         entries.append(relp.replace(os.sep, '/'))
+
+        if len(feeds) > 1:
+            for i, f in enumerate(feeds):
+                entries.append('feed_%d/index.html'%i)
+                feed = toc.add_item('feed_%d/index.html'%i, None, f.title)
+                feed_index(i, feed)
+        else:
+            entries.append('feed_%d/index.html'%0)
+            feed_index(0, toc)
 
         opf.create_spine(entries)
         opf.set_toc(toc)
-        for i, f in enumerate(feeds):
-
-            for j, a in enumerate(f):
-                if getattr(a, 'downloaded', False):
-                    adir = 'feed_%d/article_%d/'%(i, j)
-
         opf.render(open(opf_path, 'wb'), open(ncx_path, 'wb'))
 
@@ -525,7 +565,7 @@ class BasicNewsRecipe(object):
         article = request.article
         self.logger.debug(_('\nDownloaded article %s from %s\n%s')%(article.title, article.url, request.stream.getvalue().decode('utf-8', 'ignore')))
-        article.url = result[0]
+        article.url = 'article_%d/index.html'%a
         article.downloaded = True
         article.sub_pages = result[1][1:]
         self.jobs_done += 1
@@ -563,3 +603,29 @@ class BasicNewsRecipe(object):
                               max_articles_per_feed=self.max_articles_per_feed))
 
         return parsed_feeds
+
+    @classmethod
+    def tag_to_string(cls, tag, use_alt=True):
+        '''
+        Convenience method to take a BeautifulSoup Tag and extract the text from it
+        recursively, including any CDATA sections and alt tag attributes.
+        @param use_alt: If True, try to use the alt attribute for tags that don't have any textual content.
+        @type use_alt: boolean
+        @return: A unicode (possibly empty) object.
+        @rtype: unicode string
+        '''
+        if not tag:
+            return ''
+        if isinstance(tag, basestring):
+            return tag
+        strings = []
+        for item in tag.contents:
+            if isinstance(item, (NavigableString, CData)):
+                strings.append(item.string)
+            elif isinstance(item, Tag):
+                res = cls.tag_to_string(item)
+                if res:
+                    strings.append(res)
+                elif use_alt and item.has_key('alt'):
+                    strings.append(item['alt'])
+        return u''.join(strings)
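
The parse_index()/tag_to_string() pair is the scaffolding the two new recipes below build on. A minimal sketch of a recipe using them; the site markup, class name and URL here are invented, so a real recipe needs real selectors:

    from libprs500.web.feeds.news import BasicNewsRecipe
    from libprs500.ebooks.BeautifulSoup import BeautifulSoup

    class ExamplePrintEdition(BasicNewsRecipe):

        title = 'Example Print Edition'
        INDEX = 'http://www.example.com/current-issue'  # invented URL

        def parse_index(self):
            src  = self.browser.open(self.INDEX).read()
            soup = BeautifulSoup(src, convertEntities=BeautifulSoup.HTML_ENTITIES)
            articles = []
            # Assumes each article on the index page sits in <div class="story">
            for div in soup.findAll('div', attrs={'class':'story'}):
                a = div.find('a')
                if a is None or not a.has_key('href'):
                    continue
                articles.append({
                    'title'       : self.tag_to_string(a),
                    'url'         : a['href'],
                    'date'        : '',
                    'description' : '',
                    'content'     : '',
                })
            return {'Current Issue': articles}
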
diff --git a/src/libprs500/web/feeds/recipes/__init__.py b/src/libprs500/web/feeds/recipes/__init__.py
index ed7d7a3d9a..4fb593a371 100644
--- a/src/libprs500/web/feeds/recipes/__init__.py
+++ b/src/libprs500/web/feeds/recipes/__init__.py
@@ -17,7 +17,7 @@
 '''
 Builtin recipes.
 '''
-recipes = ['newsweek']
+recipes = ['newsweek', 'atlantic', 'economist']
 
 import re
 from libprs500.web.feeds.news import BasicNewsRecipe
diff --git a/src/libprs500/web/feeds/recipes/atlantic.py b/src/libprs500/web/feeds/recipes/atlantic.py
new file mode 100644
index 0000000000..6632e83e12
--- /dev/null
+++ b/src/libprs500/web/feeds/recipes/atlantic.py
@@ -0,0 +1,60 @@
+#!/usr/bin/env python
+
+##    Copyright (C) 2008 Kovid Goyal <kovid@kovidgoyal.net>
+##    This program is free software; you can redistribute it and/or modify
+##    it under the terms of the GNU General Public License as published by
+##    the Free Software Foundation; either version 2 of the License, or
+##    (at your option) any later version.
+##
+##    This program is distributed in the hope that it will be useful,
+##    but WITHOUT ANY WARRANTY; without even the implied warranty of
+##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+##    GNU General Public License for more details.
+##
+##    You should have received a copy of the GNU General Public License along
+##    with this program; if not, write to the Free Software Foundation, Inc.,
+##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+'''
+theatlantic.com
+'''
+
+from libprs500.web.feeds.news import BasicNewsRecipe
+from libprs500.ebooks.BeautifulSoup import BeautifulSoup
+
+class TheAtlantic(BasicNewsRecipe):
+
+    title = 'The Atlantic'
+    INDEX = 'http://www.theatlantic.com/doc/current'
+
+    remove_tags_before = dict(name='div', id='storytop')
+    remove_tags        = [dict(name='div', id='seealso')]
+    extra_css          = '#bodytext {line-height: 1}'
+
+    def parse_index(self):
+        articles = []
+
+        src = self.browser.open(self.INDEX).read()
+        soup = BeautifulSoup(src, convertEntities=BeautifulSoup.HTML_ENTITIES)
+
+        issue = soup.find('span', attrs={'class':'issue'})
+        if issue:
+            self.timefmt = ' [%s]'%self.tag_to_string(issue).rpartition('|')[-1].strip().replace('/', '-')
+
+        for item in soup.findAll('div', attrs={'class':'item'}):
+            a = item.find('a')
+            if a and a.has_key('href'):
+                url = a['href']
+                url = 'http://www.theatlantic.com/'+url.replace('/doc', 'doc/print')
+                title = self.tag_to_string(a)
+                byline = item.find(attrs={'class':'byline'})
+                date = self.tag_to_string(byline) if byline else ''
+                description = ''
+                articles.append({
+                        'title':title,
+                        'date':date,
+                        'url':url,
+                        'description':description
+                        })
+
+
+        return {'Current Issue' : articles }
\ No newline at end of file
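
The timefmt assignment in TheAtlantic.parse_index is dense; a worked example of the string chain, on an invented issue banner:

    issue_text = u'The Atlantic Monthly | April 2008'   # invented sample
    fmt = issue_text.rpartition('|')[-1].strip().replace('/', '-')
    # rpartition('|') -> (u'The Atlantic Monthly ', u'|', u' April 2008')
    # fmt == u'April 2008', so timefmt becomes ' [April 2008]'

The replace('/', '-') only matters for banners that carry a date of the form '04/2008'.
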
diff --git a/src/libprs500/web/feeds/recipes/economist.py b/src/libprs500/web/feeds/recipes/economist.py
new file mode 100644
index 0000000000..33407fa04a
--- /dev/null
+++ b/src/libprs500/web/feeds/recipes/economist.py
@@ -0,0 +1,57 @@
+#!/usr/bin/env python
+
+##    Copyright (C) 2008 Kovid Goyal <kovid@kovidgoyal.net>
+##    This program is free software; you can redistribute it and/or modify
+##    it under the terms of the GNU General Public License as published by
+##    the Free Software Foundation; either version 2 of the License, or
+##    (at your option) any later version.
+##
+##    This program is distributed in the hope that it will be useful,
+##    but WITHOUT ANY WARRANTY; without even the implied warranty of
+##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+##    GNU General Public License for more details.
+##
+##    You should have received a copy of the GNU General Public License along
+##    with this program; if not, write to the Free Software Foundation, Inc.,
+##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+'''
+economist.com
+'''
+from libprs500.web.feeds.news import BasicNewsRecipe
+from libprs500.ebooks.BeautifulSoup import BeautifulSoup
+
+class Economist(BasicNewsRecipe):
+
+    title = 'The Economist'
+    oldest_article = 7.0
+    INDEX = 'http://www.economist.com/printedition'
+    remove_tags = [dict(name=['script', 'noscript', 'title'])]
+    remove_tags_before = dict(name=lambda tag: tag.name=='title' and tag.parent.name=='body')
+
+    def parse_index(self):
+        soup = BeautifulSoup(self.browser.open(self.INDEX).read(),
+                             convertEntities=BeautifulSoup.HTML_ENTITIES)
+        index_started = False
+        feeds = {}
+        key = None
+        for tag in soup.findAll(['h1', 'h2']):
+            text = ''.join(tag.findAll(text=True))
+            if tag.name == 'h1':
+                if 'Classified ads' in text:
+                    break
+                if 'The world this week' in text:
+                    index_started = True
+                if not index_started:
+                    continue
+                feeds[text] = []
+                key = text
+                continue
+            if key is None:
+                continue
+            a = tag.find('a', href=True)
+            if a is not None:
+                article = dict(title=text,
+                               url='http://www.economist.com'+a['href'].replace('displaystory', 'PrinterFriendly'),
+                               description='', content='', date='')
+                feeds[key].append(article)
+        return feeds
\ No newline at end of file
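
Economist.parse_index relies on document order: each h1 starts a new section and every h2 that follows contributes an article to it. A toy illustration of the same bucketing pattern; the markup is invented and it assumes the BeautifulSoup 3.x API used throughout this patch:

    from BeautifulSoup import BeautifulSoup

    html = '''<h1>The world this week</h1>
              <h2><a href="/story/1">First story</a></h2>
              <h1>Business</h1>
              <h2><a href="/story/2">Second story</a></h2>'''
    soup = BeautifulSoup(html)
    sections, key = {}, None
    for tag in soup.findAll(['h1', 'h2']):
        text = ''.join(tag.findAll(text=True))
        if tag.name == 'h1':
            sections[text] = []
            key = text
        elif key is not None and tag.find('a', href=True) is not None:
            sections[key].append(text)
    # sections == {u'The world this week': [u'First story'],
    #              u'Business': [u'Second story']}
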
diff --git a/src/libprs500/web/feeds/templates.py b/src/libprs500/web/feeds/templates.py
index 1d1becbb51..03432ec151 100644
--- a/src/libprs500/web/feeds/templates.py
+++ b/src/libprs500/web/feeds/templates.py
@@ -102,7 +102,7 @@ class IndexTemplate(Template):
-                        ${datetime.now().strftime(datefmt)}
+                        ${datetime.now().strftime(str(datefmt))}
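
The one-line templates.py change coerces datefmt with str() before calling strftime. The patch does not say why, but a plausible reason: recipes can now derive timefmt from page content via tag_to_string (TheAtlantic does exactly this), which yields a unicode object, and strftime on Python 2 expects a byte string. A sketch of the coercion, assuming an ASCII-safe format:

    from datetime import datetime

    datefmt = u' [%a, %d %b %Y]'                 # may now arrive as unicode
    print datetime.now().strftime(str(datefmt))  # normalized to a byte string
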