Migrate Dilbert feed to new infrastructure

Kovid Goyal 2008-03-17 23:07:31 +00:00
parent 0983dbeafd
commit d834a8facc
7 changed files with 150 additions and 22 deletions

View File

@@ -52,6 +52,7 @@ def xml_to_unicode(raw, verbose=False):
                 print 'WARNING: Encoding detection confidence %d%%'%(chardet['confidence']*100)
     CHARSET_ALIASES = { "macintosh" : "mac-roman",
                         "x-sjis" : "shift-jis" }
     if encoding:
+        encoding = encoding.lower()
         if CHARSET_ALIASES.has_key(encoding):
             encoding = CHARSET_ALIASES[encoding]
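
A minimal sketch of what the normalization in this hunk does, with the
surrounding xml_to_unicode machinery stubbed out (the helper name
normalize_encoding is hypothetical):

    CHARSET_ALIASES = { "macintosh" : "mac-roman",
                        "x-sjis" : "shift-jis" }

    def normalize_encoding(encoding):
        # lower-case the detected charset name, then map known aliases
        # onto names Python's codec machinery understands
        if encoding:
            encoding = encoding.lower()
            if CHARSET_ALIASES.has_key(encoding):
                encoding = CHARSET_ALIASES[encoding]
        return encoding

    print normalize_encoding('X-SJIS')   # prints: shift-jis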

View File

@@ -24,13 +24,14 @@
         <reference py:if="mi.cover" type="cover" href="${mi.cover}" />
     </guide>

-    <manifest>
+    <manifest py:if="getattr(mi, 'manifest', None)">
         <py:for each="i, m in enumerate(mi.manifest)">
             <item id="${str(i)}" href="${m[0]}" media-type="${m[1]}" />
         </py:for>
     </manifest>

-    <spine py:with="attrs={'toc':'ncx' if mi.toc else None}" py:attrs="attrs">
+    <spine py:if="getattr(mi, 'manifest', None)"
+           py:with="attrs={'toc':'ncx' if mi.toc else None}" py:attrs="attrs">
         <py:for each="idref in mi.spine">
             <itemref idref="${str(idref)}" />
         </py:for>
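
The new py:if guards let the template cope with metadata objects that
carry no manifest. A hedged sketch of the shape the template expects;
the MetaInformation stand-in and the attribute values are illustrative
assumptions, only the attribute names come from the template:

    class MetaInformation(object):   # stand-in for the real class
        pass

    mi = MetaInformation()
    mi.cover = None
    mi.toc = None
    mi.manifest = [('index.html', 'text/html'),    # m[0], m[1] above
                   ('strip.png', 'image/png')]
    mi.spine = [0]    # itemrefs are manifest indices, hence str(idref)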

View File

@@ -17,7 +17,7 @@
 '''
 Contains the logic for parsing feeds.
 '''
-import time, logging
+import time, logging, traceback
 from datetime import datetime

 from libprs500.web.feeds.feedparser import parse
@@ -54,11 +54,12 @@ Has content : %s
 class Feed(object):

-    def __init__(self):
+    def __init__(self, get_article_url=lambda item: item.get('link', None)):
         '''
         Parse a feed into articles.
         '''
         self.logger = logging.getLogger('feeds2disk')
+        self.get_article_url = get_article_url

     def populate_from_feed(self, feed, title=None, oldest_article=7,
                            max_articles_per_feed=100):
@@ -124,7 +125,12 @@ class Feed(object):
             self.added_articles.append(id)

             title = item.get('title', _('Untitled article'))
-            link = item.get('link', None)
+            try:
+                link = self.get_article_url(item)
+            except:
+                self.logger.warning('Failed to get link for %s'%title)
+                self.logger.debug(traceback.format_exc())
+                link = None
             description = item.get('summary', None)

             content = '\n'.join(i.value for i in item.get('content', []))
@@ -159,9 +165,10 @@ class Feed(object):
         return False

-def feed_from_xml(raw_xml, title=None, oldest_article=7, max_articles_per_feed=100):
+def feed_from_xml(raw_xml, title=None, oldest_article=7,
+                  max_articles_per_feed=100, get_article_url=lambda item: item.get('link', None)):
     feed = parse(raw_xml)
-    pfeed = Feed()
+    pfeed = Feed(get_article_url=get_article_url)
     pfeed.populate_from_feed(feed, title=title,
                              oldest_article=oldest_article,
                              max_articles_per_feed=max_articles_per_feed)
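
A hedged usage sketch of the new hook: feed_from_xml can now be handed
a callable that digs the article URL out of the feedparser item, for
feeds whose <link> does not point at the content. The feed file name
and the enclosure-based extractor are illustrative assumptions:

    def url_from_enclosure(item):
        # items without enclosures raise here; Feed catches the
        # exception, logs it, and leaves the link unset
        return item['enclosures'][0]['url']

    raw_xml = open('tapestrydilbert.xml', 'rb').read()
    feed = feed_from_xml(raw_xml, title='Dilbert',
                         get_article_url=url_from_enclosure)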

View File

@@ -20,6 +20,7 @@ from libprs500.web.feeds.news import BasicNewsRecipe
 import sys, os, logging

 from libprs500.web.feeds.recipes import get_builtin_recipe, compile_recipe, titles
 from libprs500.web.fetch.simple import option_parser as _option_parser
+from libprs500.web.feeds.news import Profile2Recipe

 def option_parser(usage='''\
@@ -110,7 +111,7 @@ def run_recipe(opts, recipe_arg, parser, notification=None, handler=None):
             else:
                 raise Exception('not file')
         except:
-            recipe = get_builtin_recipe(recipe_arg)
+            recipe, is_profile = get_builtin_recipe(recipe_arg)
             if recipe is None:
                 recipe = compile_recipe(recipe_arg)
@@ -125,6 +126,9 @@ def run_recipe(opts, recipe_arg, parser, notification=None, handler=None):
         handler.setFormatter(ColoredFormatter('%(levelname)s: %(message)s\n')) # The trailing newline is needed because of the progress bar
         logging.getLogger('feeds2disk').addHandler(handler)

-    recipe = recipe(opts, parser, notification)
+    if is_profile:
+        recipe = Profile2Recipe(recipe, opts, parser, notification)
+    else:
+        recipe = recipe(opts, parser, notification)

     if not os.path.exists(recipe.output_dir):
         os.makedirs(recipe.output_dir)

View File

@@ -20,7 +20,7 @@ to an ebook.
 import logging, os, cStringIO, time, traceback, re
 import urlparse

-from libprs500 import browser, __appname__
+from libprs500 import browser, __appname__, iswindows
 from libprs500.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Tag
 from libprs500.ebooks.metadata.opf import OPFCreator
 from libprs500.ebooks.metadata.toc import TOC
@@ -41,7 +41,7 @@ class BasicNewsRecipe(object):
     title = _('Unknown News Source')

     #: The author of this recipe
-    __author__ = _('Unknown')
+    __author__ = _(__appname__)

     #: Maximum number of articles to download from each feed
     #: @type: integer
@@ -198,6 +198,14 @@ class BasicNewsRecipe(object):
         '''
         return browser()

+    def get_article_url(self, item):
+        '''
+        Override to perform extraction of the URL for each article.
+        @param item: An article instance from L{feedparser}.
+        @type item: L{FeedParserDict}
+        '''
+        return item.get('link', None)
+
     def preprocess_html(self, soup):
         '''
         This function is called with the source of each downloaded HTML file, before
@@ -335,7 +343,7 @@ class BasicNewsRecipe(object):
         if head:
             style = BeautifulSoup(u'<style type="text/css">%s</style>'%self.extra_css).find('style')
             head.insert(len(head.contents), style)
-        if first_fetch:
+        if first_fetch and job_info:
             url, f, a, feed_len = job_info
             body = soup.find('body')
             if body is not None:
@@ -615,7 +623,8 @@ class BasicNewsRecipe(object):
                 parsed_feeds.append(feed_from_xml(self.browser.open(url).read(),
                                       title=title,
                                       oldest_article=self.oldest_article,
-                                      max_articles_per_feed=self.max_articles_per_feed))
+                                      max_articles_per_feed=self.max_articles_per_feed,
+                                      get_article_url=self.get_article_url))

         return parsed_feeds
@@ -644,3 +653,55 @@ class BasicNewsRecipe(object):
         elif use_alt and item.has_key('alt'):
             strings.append(item['alt'])
     return u''.join(strings)
+
+
+class Profile2Recipe(BasicNewsRecipe):
+    '''
+    Used to migrate the old news Profiles to the new Recipes. Uses the settings
+    from the old Profile to populate the settings in the Recipe. Also uses the
+    Profile's get_browser and parse_feeds.
+    '''
+    def __init__(self, profile_class, options, parser, progress_reporter):
+        self.old_profile = profile_class(logging.getLogger('feeds2disk'),
+                                         username=options.username,
+                                         password=options.password,
+                                         lrf=options.lrf)
+        for attr in ('preprocess_regexps', 'oldest_article', 'delay', 'timeout',
+                     'match_regexps', 'filter_regexps', 'html2lrf_options',
+                     'timefmt', 'needs_subscription', 'summary_length',
+                     'max_articles_per_feed', 'title', 'no_stylesheets', 'encoding'):
+            setattr(self, attr, getattr(self.old_profile, attr))
+        self.simultaneous_downloads = 1
+        BasicNewsRecipe.__init__(self, options, parser, progress_reporter)
+        self.browser = self.old_profile.browser
+
+    def parse_index(self):
+        return self.old_profile.parse_feeds()
+
+
+class CustomIndexRecipe(BasicNewsRecipe):
+
+    def custom_index(self):
+        '''
+        Return the path to a custom HTML document that will serve as the index for
+        this recipe.
+        @rtype: string
+        '''
+        raise NotImplementedError
+
+    def create_opf(self):
+        mi = MetaInformation(self.title, [__appname__])
+        mi = OPFCreator(self.output_dir, mi)
+        mi.create_manifest_from_files_in([self.output_dir])
+        mi.create_spine(['index.html'])
+        mi.render(open(os.path.join(self.output_dir, 'index.opf'), 'wb'))
+
+    def download(self):
+        index = os.path.abspath(self.custom_index())
+        url = 'file:'+index if iswindows else 'file://'+index
+        fetcher = RecursiveFetcher(self.web2disk_options, self.logger)
+        fetcher.base_dir = self.output_dir
+        fetcher.current_dir = self.output_dir
+        fetcher.show_progress = False
+        res = fetcher.start_fetch(url)
+        self.create_opf()
+        return res
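
A hedged sketch of the CustomIndexRecipe contract introduced above: a
subclass only has to produce a local HTML file, and download() mirrors
it through a file:// URL before create_opf() wraps the result. The
class name, file name, and page content here are illustrative
assumptions:

    import os
    from libprs500.web.feeds.news import CustomIndexRecipe

    class HelloIndex(CustomIndexRecipe):

        title = 'Hello'

        def custom_index(self):
            path = os.path.abspath('hello_index.html')
            open(path, 'wb').write('<html><body><h1>Hello</h1></body></html>')
            return path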

View File

@@ -17,14 +17,14 @@
 '''
 Builtin recipes.
 '''
-recipes = ['newsweek', 'atlantic', 'economist']
+recipes = ['newsweek', 'atlantic', 'economist', 'dilbert']

 import re
-from libprs500.web.feeds.news import BasicNewsRecipe
+from libprs500.web.feeds.news import BasicNewsRecipe, CustomIndexRecipe
 from libprs500.ebooks.lrf.web.profiles import DefaultProfile, FullContentProfile
-from libprs500.ebooks.lrf.web import available_profiles
+from libprs500.ebooks.lrf.web import builtin_profiles

-basic_recipes = (BasicNewsRecipe, DefaultProfile, FullContentProfile)
+basic_recipes = (BasicNewsRecipe, CustomIndexRecipe, DefaultProfile, FullContentProfile)
 basic_recipe_names = (i.__name__ for i in basic_recipes)
@@ -51,6 +51,8 @@ recipes = [load_recipe(i) for i in recipes]
 def compile_recipe(src):
     '''
     Compile the code in src and return the first object that is a recipe or profile.
+    @param src: Python source code
+    @type src: string
+    @return: Recipe/Profile class or None, if no such class was found in C{src}
     '''
     locals = {}
@@ -67,13 +69,20 @@ def compile_recipe(src):
 def get_builtin_recipe(title):
     '''
     Return a builtin recipe/profile class whose title == C{title} or None if no such
-    recipe exists.
+    recipe exists. Also returns a flag that is True iff the found recipe is really
+    an old-style Profile.
     @type title: string
-    @rtype: class or None
+    @rtype: class or None, boolean
     '''
     for r in recipes:
         if r.title == title:
-            return r
+            return r, False
+    for p in builtin_profiles:
+        if p.title == title:
+            return p, True
+    return None, False

-titles = set([r.title for r in recipes])
+_titles = list(frozenset([r.title for r in recipes] + [p.title for p in builtin_profiles]))
+_titles.sort()
+titles = _titles
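
Sketch of the new calling convention: get_builtin_recipe now returns a
(class, is_profile) pair, and callers such as run_recipe dispatch on
the flag. The None case is omitted, and opts, parser, and notification
are assumed to come from the surrounding run_recipe machinery:

    recipe, is_profile = get_builtin_recipe('Dilbert')
    if is_profile:
        recipe = Profile2Recipe(recipe, opts, parser, notification)
    else:
        recipe = recipe(opts, parser, notification)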

View File

@@ -0,0 +1,45 @@
+#!/usr/bin/env python
+##    Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
+##    This program is free software; you can redistribute it and/or modify
+##    it under the terms of the GNU General Public License as published by
+##    the Free Software Foundation; either version 2 of the License, or
+##    (at your option) any later version.
+##
+##    This program is distributed in the hope that it will be useful,
+##    but WITHOUT ANY WARRANTY; without even the implied warranty of
+##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+##    GNU General Public License for more details.
+##
+##    You should have received a copy of the GNU General Public License along
+##    with this program; if not, write to the Free Software Foundation, Inc.,
+##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+'''
+Daily Dilbert
+'''
+import os
+
+from libprs500.web.feeds.news import CustomIndexRecipe
+from libprs500.ptempfile import PersistentTemporaryDirectory
+
+
+class Dilbert(CustomIndexRecipe):
+
+    title = 'Dilbert'
+    timefmt = ' [%d %b %Y]'
+
+    feeds = [('Dilbert', 'http://feeds.feedburner.com/tapestrydilbert')]
+
+    def get_article_url(self, item):
+        return item.get('enclosures')[0].get('url')
+
+    def custom_index(self):
+        tdir = PersistentTemporaryDirectory('feeds2disk_dilbert')
+        index = os.path.join(tdir, 'index.html')
+        feed = self.parse_feeds()[0]
+        res = ''
+        for item in feed:
+            res += '<h3>%s</h3><img style="page-break-after:always" src="%s" />\n'%(item.title, item.url)
+        res = '<html><body><h1>Dilbert</h1>%s</body></html>'%res
+        open(index, 'wb').write(res)
+        return index
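
One caveat worth noting: get_article_url above raises when an item
carries no enclosure, which Feed.populate_from_feed now catches, logs,
and survives; that is what keeps the terse form safe. A more defensive
drop-in sketch of the same method, assuming the feedparser item shape
used above:

    def get_article_url(self, item):
        # tolerate items without enclosures instead of relying on the
        # caller's try/except
        enclosures = item.get('enclosures') or []
        return enclosures[0].get('url') if enclosures else None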