Migrate Dilbert feed to new infrastructure

Kovid Goyal 2008-03-17 23:07:31 +00:00
parent 0983dbeafd
commit d834a8facc
7 changed files with 150 additions and 22 deletions

View File

@@ -52,6 +52,7 @@ def xml_to_unicode(raw, verbose=False):
             print 'WARNING: Encoding detection confidence %d%%'%(chardet['confidence']*100)
     CHARSET_ALIASES = { "macintosh" : "mac-roman",
                         "x-sjis" : "shift-jis" }
+    if encoding:
         encoding = encoding.lower()
         if CHARSET_ALIASES.has_key(encoding):
             encoding = CHARSET_ALIASES[encoding]
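
The new "if encoding:" guard matters because encoding detection can fail and
leave encoding set to None; calling .lower() on None would raise an
AttributeError before the aliases are ever consulted. A minimal standalone
sketch of the guarded normalization (the helper name is mine, not calibre's):

    CHARSET_ALIASES = {"macintosh": "mac-roman", "x-sjis": "shift-jis"}

    def normalize_encoding(encoding):
        # encoding may be None when detection failed; only lowercase
        # and alias it when we actually have a name.
        if encoding:
            encoding = encoding.lower()
            if encoding in CHARSET_ALIASES:
                encoding = CHARSET_ALIASES[encoding]
        return encoding

    print normalize_encoding('X-SJIS')  # shift-jis
    print normalize_encoding(None)      # None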

View File

@@ -24,13 +24,14 @@
         <reference py:if="mi.cover" type="cover" href="${mi.cover}" />
     </guide>
-    <manifest>
+    <manifest py:if="getattr(mi, 'manifest', None)">
         <py:for each="i, m in enumerate(mi.manifest)">
             <item id="${str(i)}" href="${m[0]}" media-type="${m[1]}" />
         </py:for>
     </manifest>
-    <spine py:with="attrs={'toc':'ncx' if mi.toc else None}" py:attrs="attrs">
+    <spine py:if="getattr(mi, 'manifest', None)"
+           py:with="attrs={'toc':'ncx' if mi.toc else None}" py:attrs="attrs">
         <py:for each="idref in mi.spine">
             <itemref idref="${str(idref)}" />
         </py:for>

View File

@@ -17,7 +17,7 @@
 '''
 Contains the logic for parsing feeds.
 '''
-import time, logging
+import time, logging, traceback
 from datetime import datetime
 from libprs500.web.feeds.feedparser import parse

@@ -54,11 +54,12 @@ Has content : %s
 class Feed(object):

-    def __init__(self):
+    def __init__(self, get_article_url=lambda item: item.get('link', None)):
         '''
         Parse a feed into articles.
         '''
         self.logger = logging.getLogger('feeds2disk')
+        self.get_article_url = get_article_url

     def populate_from_feed(self, feed, title=None, oldest_article=7,
                            max_articles_per_feed=100):

@@ -124,7 +125,12 @@ class Feed(object):
         self.added_articles.append(id)
         title = item.get('title', _('Untitled article'))
-        link = item.get('link', None)
+        try:
+            link = self.get_article_url(item)
+        except:
+            self.logger.warning('Failed to get link for %s'%title)
+            self.logger.debug(traceback.format_exc())
+            link = None
         description = item.get('summary', None)
         content = '\n'.join(i.value for i in item.get('content', []))

@@ -159,9 +165,10 @@ class Feed(object):
     return False

-def feed_from_xml(raw_xml, title=None, oldest_article=7, max_articles_per_feed=100):
+def feed_from_xml(raw_xml, title=None, oldest_article=7,
+        max_articles_per_feed=100, get_article_url=lambda item: item.get('link', None)):
     feed = parse(raw_xml)
-    pfeed = Feed()
+    pfeed = Feed(get_article_url=get_article_url)
     pfeed.populate_from_feed(feed, title=title,
                              oldest_article=oldest_article,
                              max_articles_per_feed=max_articles_per_feed)

View File

@@ -20,6 +20,7 @@ from libprs500.web.feeds.news import BasicNewsRecipe
 import sys, os, logging
 from libprs500.web.feeds.recipes import get_builtin_recipe, compile_recipe, titles
 from libprs500.web.fetch.simple import option_parser as _option_parser
+from libprs500.web.feeds.news import Profile2Recipe

 def option_parser(usage='''\

@@ -110,7 +111,7 @@ def run_recipe(opts, recipe_arg, parser, notification=None, handler=None):
         else:
             raise Exception('not file')
     except:
-        recipe = get_builtin_recipe(recipe_arg)
+        recipe, is_profile = get_builtin_recipe(recipe_arg)
         if recipe is None:
             recipe = compile_recipe(recipe_arg)

@@ -125,6 +126,9 @@ def run_recipe(opts, recipe_arg, parser, notification=None, handler=None):
         handler.setFormatter(ColoredFormatter('%(levelname)s: %(message)s\n')) # The trailing newline is needed because of the progress bar
         logging.getLogger('feeds2disk').addHandler(handler)

+    if is_profile:
+        recipe = Profile2Recipe(recipe, opts, parser, notification)
+    else:
         recipe = recipe(opts, parser, notification)
     if not os.path.exists(recipe.output_dir):
         os.makedirs(recipe.output_dir)

View File

@@ -20,7 +20,7 @@ to an ebook.
 import logging, os, cStringIO, time, traceback, re
 import urlparse

-from libprs500 import browser, __appname__
+from libprs500 import browser, __appname__, iswindows
 from libprs500.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Tag
 from libprs500.ebooks.metadata.opf import OPFCreator
 from libprs500.ebooks.metadata.toc import TOC

@@ -41,7 +41,7 @@ class BasicNewsRecipe(object):
     title = _('Unknown News Source')

     #: The author of this recipe
-    __author__ = _('Unknown')
+    __author__ = _(__appname__)

     #: Maximum number of articles to download from each feed
     #: @type: integer

@@ -198,6 +198,14 @@ class BasicNewsRecipe(object):
         '''
         return browser()

+    def get_article_url(self, item):
+        '''
+        Override to perform extraction of URL for each article.
+        @param item: An article instance from L{feedparser}.
+        @type item: L{FeedParserDict}
+        '''
+        return item.get('link', None)
+
     def preprocess_html(self, soup):
         '''
         This function is called with the source of each downloaded HTML file, before
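
get_article_url is the per-article counterpart of the hook added to Feed
above: BasicNewsRecipe forwards its bound method into feed_from_xml (see the
parse_feeds hunk below), so a subclass only has to override this one method to
change how article URLs are extracted. A sketch of such an override (the
recipe itself is hypothetical; the Dilbert recipe at the end of this commit
does the same thing for real):

    from libprs500.web.feeds.news import BasicNewsRecipe

    class EnclosureNews(BasicNewsRecipe):
        title = 'Enclosure News'
        feeds = [('Example', 'http://example.com/feed.xml')]

        def get_article_url(self, item):
            # Articles live behind the first enclosure; fall back to
            # the default <link> behaviour when there is none.
            enclosures = item.get('enclosures', [])
            if enclosures:
                return enclosures[0].get('url')
            return BasicNewsRecipe.get_article_url(self, item)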
@@ -335,7 +343,7 @@ class BasicNewsRecipe(object):
         if head:
             style = BeautifulSoup(u'<style type="text/css">%s</style>'%self.extra_css).find('style')
             head.insert(len(head.contents), style)
-        if first_fetch:
+        if first_fetch and job_info:
             url, f, a, feed_len = job_info
             body = soup.find('body')
             if body is not None:

@@ -615,7 +623,8 @@ class BasicNewsRecipe(object):
             parsed_feeds.append(feed_from_xml(self.browser.open(url).read(),
                                               title=title,
                                               oldest_article=self.oldest_article,
-                                              max_articles_per_feed=self.max_articles_per_feed))
+                                              max_articles_per_feed=self.max_articles_per_feed,
+                                              get_article_url=self.get_article_url))

         return parsed_feeds
@@ -644,3 +653,55 @@ class BasicNewsRecipe(object):
         elif use_alt and item.has_key('alt'):
             strings.append(item['alt'])
     return u''.join(strings)
+
+
+class Profile2Recipe(BasicNewsRecipe):
+    '''
+    Used to migrate the old news Profiles to the new Recipes. Uses the settings
+    from the old Profile to populate the settings in the Recipe. Also uses the
+    Profile's get_browser and parse_feeds.
+    '''
+    def __init__(self, profile_class, options, parser, progress_reporter):
+        self.old_profile = profile_class(logging.getLogger('feeds2disk'),
+                                         username=options.username,
+                                         password=options.password,
+                                         lrf=options.lrf)
+        for attr in ('preprocess_regexps', 'oldest_article', 'delay', 'timeout',
+                     'match_regexps', 'filter_regexps', 'html2lrf_options',
+                     'timefmt', 'needs_subscription', 'summary_length',
+                     'max_articles_per_feed', 'title', 'no_stylesheets', 'encoding'):
+            setattr(self, attr, getattr(self.old_profile, attr))
+        self.simultaneous_downloads = 1
+        BasicNewsRecipe.__init__(self, options, parser, progress_reporter)
+        self.browser = self.old_profile.browser
+
+    def parse_index(self):
+        return self.old_profile.parse_feeds()
+
+
+class CustomIndexRecipe(BasicNewsRecipe):
+
+    def custom_index(self):
+        '''
+        Return the path to a custom HTML document that will serve as the index for
+        this recipe.
+        @rtype: string
+        '''
+        raise NotImplementedError
+
+    def create_opf(self):
+        mi = MetaInformation(self.title, [__appname__])
+        mi = OPFCreator(self.output_dir, mi)
+        mi.create_manifest_from_files_in([self.output_dir])
+        mi.create_spine(['index.html'])
+        mi.render(open(os.path.join(self.output_dir, 'index.opf'), 'wb'))
+
+    def download(self):
+        index = os.path.abspath(self.custom_index())
+        url = 'file:'+index if iswindows else 'file://'+index
+        fetcher = RecursiveFetcher(self.web2disk_options, self.logger)
+        fetcher.base_dir = self.output_dir
+        fetcher.current_dir = self.output_dir
+        fetcher.show_progress = False
+        res = fetcher.start_fetch(url)
+        self.create_opf()
+        return res

View File

@@ -17,14 +17,14 @@
 '''
 Builtin recipes.
 '''
-recipes = ['newsweek', 'atlantic', 'economist']
+recipes = ['newsweek', 'atlantic', 'economist', 'dilbert']

 import re
-from libprs500.web.feeds.news import BasicNewsRecipe
+from libprs500.web.feeds.news import BasicNewsRecipe, CustomIndexRecipe
 from libprs500.ebooks.lrf.web.profiles import DefaultProfile, FullContentProfile
-from libprs500.ebooks.lrf.web import available_profiles
+from libprs500.ebooks.lrf.web import builtin_profiles

-basic_recipes = (BasicNewsRecipe, DefaultProfile, FullContentProfile)
+basic_recipes = (BasicNewsRecipe, CustomIndexRecipe, DefaultProfile, FullContentProfile)
 basic_recipe_names = (i.__name__ for i in basic_recipes)

@@ -51,6 +51,8 @@ recipes = [load_recipe(i) for i in recipes]
 def compile_recipe(src):
     '''
     Compile the code in src and return the first object that is a recipe or profile.
+    @param src: Python source code
+    @type src: string
     @return: Recipe/Profile class or None, if no such class was found in C{src}
     '''
     locals = {}
@@ -67,13 +69,20 @@ def compile_recipe(src):
 def get_builtin_recipe(title):
     '''
     Return a builtin recipe/profile class whose title == C{title} or None if no such
-    recipe exists.
+    recipe exists. Also returns a flag that is True iff the found recipe is really
+    an old-style Profile.
     @type title: string
-    @rtype: class or None
+    @rtype: class or None, boolean
     '''
     for r in recipes:
         if r.title == title:
-            return r
+            return r, False
+    for p in builtin_profiles:
+        if p.title == title:
+            return p, True
+    return None, False

-titles = set([r.title for r in recipes])
+_titles = list(frozenset([r.title for r in recipes] + [p.title for p in builtin_profiles]))
+_titles.sort()
+titles = _titles

View File

@@ -0,0 +1,45 @@
+#!/usr/bin/env python
+##    Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
+##    This program is free software; you can redistribute it and/or modify
+##    it under the terms of the GNU General Public License as published by
+##    the Free Software Foundation; either version 2 of the License, or
+##    (at your option) any later version.
+##
+##    This program is distributed in the hope that it will be useful,
+##    but WITHOUT ANY WARRANTY; without even the implied warranty of
+##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+##    GNU General Public License for more details.
+##
+##    You should have received a copy of the GNU General Public License along
+##    with this program; if not, write to the Free Software Foundation, Inc.,
+##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+'''
+Daily Dilbert
+'''
+import os
+from libprs500.web.feeds.news import CustomIndexRecipe
+from libprs500.ptempfile import PersistentTemporaryDirectory
+
+class Dilbert(CustomIndexRecipe):
+
+    title = 'Dilbert'
+    timefmt = ' [%d %b %Y]'
+
+    feeds = [('Dilbert', 'http://feeds.feedburner.com/tapestrydilbert')]
+
+    def get_article_url(self, item):
+        return item.get('enclosures')[0].get('url')
+
+    def custom_index(self):
+        tdir = PersistentTemporaryDirectory('feeds2disk_dilbert')
+        index = os.path.join(tdir, 'index.html')
+        feed = self.parse_feeds()[0]
+        res = ''
+        for item in feed:
+            res += '<h3>%s</h3><img style="page-break-after:always" src="%s" />\n'%(item.title, item.url)
+        res = '<html><body><h1>Dilbert</h1>%s</body></html>'%res
+        open(index, 'wb').write(res)
+        return index
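
With the Profile/Recipe dispatch wired up in main.py above, the new recipe
should be reachable by title from the feeds2disk command line (the binary
name is an assumption based on the 'feeds2disk' logger used throughout this
commit), e.g.:

    feeds2disk 'Dilbert'

custom_index() writes the day's strips into a temporary index.html,
download() mirrors that page into recipe.output_dir, and create_opf() then
adds an index.opf wrapping everything that was fetched.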