Migrate Dilbert feed to new infrastructure

Kovid Goyal 2008-03-17 23:07:31 +00:00
parent 0983dbeafd
commit d834a8facc
7 changed files with 150 additions and 22 deletions

View File

@@ -52,6 +52,7 @@ def xml_to_unicode(raw, verbose=False):
             print 'WARNING: Encoding detection confidence %d%%'%(chardet['confidence']*100)
     CHARSET_ALIASES = { "macintosh" : "mac-roman",
                         "x-sjis" : "shift-jis" }
+    if encoding:
         encoding = encoding.lower()
         if CHARSET_ALIASES.has_key(encoding):
             encoding = CHARSET_ALIASES[encoding]
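
The new "if encoding:" guard matters because encoding detection can fail and
leave encoding set to None; calling .lower() on None would raise an
AttributeError before the aliases are ever consulted. A minimal standalone
sketch of the guarded normalization (the helper name is mine, not calibre's):

    CHARSET_ALIASES = {"macintosh": "mac-roman", "x-sjis": "shift-jis"}

    def normalize_encoding(encoding):
        # encoding may be None when detection failed; only lowercase
        # and alias it when we actually have a name.
        if encoding:
            encoding = encoding.lower()
            if encoding in CHARSET_ALIASES:
                encoding = CHARSET_ALIASES[encoding]
        return encoding

    print normalize_encoding('X-SJIS')  # shift-jis
    print normalize_encoding(None)      # None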

View File

@@ -24,13 +24,14 @@
         <reference py:if="mi.cover" type="cover" href="${mi.cover}" />
     </guide>
-    <manifest>
+    <manifest py:if="getattr(mi, 'manifest', None)">
         <py:for each="i, m in enumerate(mi.manifest)">
             <item id="${str(i)}" href="${m[0]}" media-type="${m[1]}" />
         </py:for>
     </manifest>
-    <spine py:with="attrs={'toc':'ncx' if mi.toc else None}" py:attrs="attrs">
+    <spine py:if="getattr(mi, 'manifest', None)"
+           py:with="attrs={'toc':'ncx' if mi.toc else None}" py:attrs="attrs">
         <py:for each="idref in mi.spine">
             <itemref idref="${str(idref)}" />
         </py:for>

View File

@@ -17,7 +17,7 @@
 '''
 Contains the logic for parsing feeds.
 '''
-import time, logging
+import time, logging, traceback
 from datetime import datetime
 from libprs500.web.feeds.feedparser import parse

@@ -54,11 +54,12 @@ Has content : %s
 class Feed(object):

-    def __init__(self):
+    def __init__(self, get_article_url=lambda item: item.get('link', None)):
         '''
         Parse a feed into articles.
         '''
         self.logger = logging.getLogger('feeds2disk')
+        self.get_article_url = get_article_url

     def populate_from_feed(self, feed, title=None, oldest_article=7,
                            max_articles_per_feed=100):

@@ -124,7 +125,12 @@ class Feed(object):
         self.added_articles.append(id)
         title = item.get('title', _('Untitled article'))
-        link = item.get('link', None)
+        try:
+            link = self.get_article_url(item)
+        except:
+            self.logger.warning('Failed to get link for %s'%title)
+            self.logger.debug(traceback.format_exc())
+            link = None
         description = item.get('summary', None)
         content = '\n'.join(i.value for i in item.get('content', []))

@@ -159,9 +165,10 @@ class Feed(object):
     return False

-def feed_from_xml(raw_xml, title=None, oldest_article=7, max_articles_per_feed=100):
+def feed_from_xml(raw_xml, title=None, oldest_article=7,
+        max_articles_per_feed=100, get_article_url=lambda item: item.get('link', None)):
     feed = parse(raw_xml)
-    pfeed = Feed()
+    pfeed = Feed(get_article_url=get_article_url)
     pfeed.populate_from_feed(feed, title=title,
                              oldest_article=oldest_article,
                              max_articles_per_feed=max_articles_per_feed)

View File

@@ -20,6 +20,7 @@ from libprs500.web.feeds.news import BasicNewsRecipe
 import sys, os, logging
 from libprs500.web.feeds.recipes import get_builtin_recipe, compile_recipe, titles
 from libprs500.web.fetch.simple import option_parser as _option_parser
+from libprs500.web.feeds.news import Profile2Recipe

 def option_parser(usage='''\

@@ -110,7 +111,7 @@ def run_recipe(opts, recipe_arg, parser, notification=None, handler=None):
         else:
             raise Exception('not file')
     except:
-        recipe = get_builtin_recipe(recipe_arg)
+        recipe, is_profile = get_builtin_recipe(recipe_arg)
         if recipe is None:
             recipe = compile_recipe(recipe_arg)

@@ -125,6 +126,9 @@ def run_recipe(opts, recipe_arg, parser, notification=None, handler=None):
         handler.setFormatter(ColoredFormatter('%(levelname)s: %(message)s\n')) # The trailing newline is needed because of the progress bar
         logging.getLogger('feeds2disk').addHandler(handler)

+    if is_profile:
+        recipe = Profile2Recipe(recipe, opts, parser, notification)
+    else:
         recipe = recipe(opts, parser, notification)
     if not os.path.exists(recipe.output_dir):
         os.makedirs(recipe.output_dir)

View File

@@ -20,7 +20,7 @@ to an ebook.
 import logging, os, cStringIO, time, traceback, re
 import urlparse

-from libprs500 import browser, __appname__
+from libprs500 import browser, __appname__, iswindows
 from libprs500.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Tag
 from libprs500.ebooks.metadata.opf import OPFCreator
 from libprs500.ebooks.metadata.toc import TOC

@@ -41,7 +41,7 @@ class BasicNewsRecipe(object):
     title = _('Unknown News Source')

     #: The author of this recipe
-    __author__ = _('Unknown')
+    __author__ = _(__appname__)

     #: Maximum number of articles to download from each feed
     #: @type: integer

@@ -198,6 +198,14 @@ class BasicNewsRecipe(object):
         '''
         return browser()

+    def get_article_url(self, item):
+        '''
+        Override to perform extraction of URL for each article.
+        @param item: An article instance from L{feedparser}.
+        @type item: L{FeedParserDict}
+        '''
+        return item.get('link', None)
+
     def preprocess_html(self, soup):
         '''
         This function is called with the source of each downloaded HTML file, before
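
get_article_url is the per-article counterpart of the hook added to Feed
above: BasicNewsRecipe forwards its bound method into feed_from_xml (see the
parse_feeds hunk below), so a subclass only has to override this one method to
change how article URLs are extracted. A sketch of such an override (the
recipe itself is hypothetical; the Dilbert recipe at the end of this commit
does the same thing for real):

    from libprs500.web.feeds.news import BasicNewsRecipe

    class EnclosureNews(BasicNewsRecipe):
        title = 'Enclosure News'
        feeds = [('Example', 'http://example.com/feed.xml')]

        def get_article_url(self, item):
            # Articles live behind the first enclosure; fall back to
            # the default <link> behaviour when there is none.
            enclosures = item.get('enclosures', [])
            if enclosures:
                return enclosures[0].get('url')
            return BasicNewsRecipe.get_article_url(self, item)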
@@ -335,7 +343,7 @@ class BasicNewsRecipe(object):
         if head:
             style = BeautifulSoup(u'<style type="text/css">%s</style>'%self.extra_css).find('style')
             head.insert(len(head.contents), style)
-        if first_fetch:
+        if first_fetch and job_info:
             url, f, a, feed_len = job_info
             body = soup.find('body')
             if body is not None:

@@ -615,7 +623,8 @@ class BasicNewsRecipe(object):
             parsed_feeds.append(feed_from_xml(self.browser.open(url).read(),
                                               title=title,
                                               oldest_article=self.oldest_article,
-                                              max_articles_per_feed=self.max_articles_per_feed))
+                                              max_articles_per_feed=self.max_articles_per_feed,
+                                              get_article_url=self.get_article_url))

         return parsed_feeds
@@ -644,3 +653,55 @@ class BasicNewsRecipe(object):
         elif use_alt and item.has_key('alt'):
             strings.append(item['alt'])
     return u''.join(strings)
+
+
+class Profile2Recipe(BasicNewsRecipe):
+    '''
+    Used to migrate the old news Profiles to the new Recipes. Uses the settings
+    from the old Profile to populate the settings in the Recipe. Also uses the
+    Profile's get_browser and parse_feeds.
+    '''
+    def __init__(self, profile_class, options, parser, progress_reporter):
+        self.old_profile = profile_class(logging.getLogger('feeds2disk'),
+                                         username=options.username,
+                                         password=options.password,
+                                         lrf=options.lrf)
+        for attr in ('preprocess_regexps', 'oldest_article', 'delay', 'timeout',
+                     'match_regexps', 'filter_regexps', 'html2lrf_options',
+                     'timefmt', 'needs_subscription', 'summary_length',
+                     'max_articles_per_feed', 'title', 'no_stylesheets', 'encoding'):
+            setattr(self, attr, getattr(self.old_profile, attr))
+        self.simultaneous_downloads = 1
+        BasicNewsRecipe.__init__(self, options, parser, progress_reporter)
+        self.browser = self.old_profile.browser
+
+    def parse_index(self):
+        return self.old_profile.parse_feeds()
+
+
+class CustomIndexRecipe(BasicNewsRecipe):
+
+    def custom_index(self):
+        '''
+        Return the path to a custom HTML document that will serve as the index for
+        this recipe.
+        @rtype: string
+        '''
+        raise NotImplementedError
+
+    def create_opf(self):
+        mi = MetaInformation(self.title, [__appname__])
+        mi = OPFCreator(self.output_dir, mi)
+        mi.create_manifest_from_files_in([self.output_dir])
+        mi.create_spine(['index.html'])
+        mi.render(open(os.path.join(self.output_dir, 'index.opf'), 'wb'))
+
+    def download(self):
+        index = os.path.abspath(self.custom_index())
+        url = 'file:'+index if iswindows else 'file://'+index
+        fetcher = RecursiveFetcher(self.web2disk_options, self.logger)
+        fetcher.base_dir = self.output_dir
+        fetcher.current_dir = self.output_dir
+        fetcher.show_progress = False
+        res = fetcher.start_fetch(url)
+        self.create_opf()
+        return res

View File

@@ -17,14 +17,14 @@
 '''
 Builtin recipes.
 '''
-recipes = ['newsweek', 'atlantic', 'economist']
+recipes = ['newsweek', 'atlantic', 'economist', 'dilbert']

 import re
-from libprs500.web.feeds.news import BasicNewsRecipe
+from libprs500.web.feeds.news import BasicNewsRecipe, CustomIndexRecipe
 from libprs500.ebooks.lrf.web.profiles import DefaultProfile, FullContentProfile
-from libprs500.ebooks.lrf.web import available_profiles
+from libprs500.ebooks.lrf.web import builtin_profiles

-basic_recipes = (BasicNewsRecipe, DefaultProfile, FullContentProfile)
+basic_recipes = (BasicNewsRecipe, CustomIndexRecipe, DefaultProfile, FullContentProfile)
 basic_recipe_names = (i.__name__ for i in basic_recipes)

@@ -51,6 +51,8 @@ recipes = [load_recipe(i) for i in recipes]
 def compile_recipe(src):
     '''
     Compile the code in src and return the first object that is a recipe or profile.
+    @param src: Python source code
+    @type src: string
     @return: Recipe/Profile class or None, if no such class was found in C{src}
     '''
     locals = {}
@@ -67,13 +69,20 @@ def compile_recipe(src):
 def get_builtin_recipe(title):
     '''
     Return a builtin recipe/profile class whose title == C{title} or None if no such
-    recipe exists.
+    recipe exists. Also returns a flag that is True iff the found recipe is really
+    an old-style Profile.
     @type title: string
-    @rtype: class or None
+    @rtype: class or None, boolean
     '''
     for r in recipes:
         if r.title == title:
-            return r
+            return r, False
+    for p in builtin_profiles:
+        if p.title == title:
+            return p, True
+    return None, False

-titles = set([r.title for r in recipes])
+_titles = list(frozenset([r.title for r in recipes] + [p.title for p in builtin_profiles]))
+_titles.sort()
+titles = _titles

View File

@@ -0,0 +1,45 @@
+#!/usr/bin/env python
+##    Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
+##    This program is free software; you can redistribute it and/or modify
+##    it under the terms of the GNU General Public License as published by
+##    the Free Software Foundation; either version 2 of the License, or
+##    (at your option) any later version.
+##
+##    This program is distributed in the hope that it will be useful,
+##    but WITHOUT ANY WARRANTY; without even the implied warranty of
+##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+##    GNU General Public License for more details.
+##
+##    You should have received a copy of the GNU General Public License along
+##    with this program; if not, write to the Free Software Foundation, Inc.,
+##    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+'''
+Daily Dilbert
+'''
+import os
+from libprs500.web.feeds.news import CustomIndexRecipe
+from libprs500.ptempfile import PersistentTemporaryDirectory
+
+class Dilbert(CustomIndexRecipe):
+
+    title = 'Dilbert'
+    timefmt = ' [%d %b %Y]'
+
+    feeds = [('Dilbert', 'http://feeds.feedburner.com/tapestrydilbert')]
+
+    def get_article_url(self, item):
+        return item.get('enclosures')[0].get('url')
+
+    def custom_index(self):
+        tdir = PersistentTemporaryDirectory('feeds2disk_dilbert')
+        index = os.path.join(tdir, 'index.html')
+        feed = self.parse_feeds()[0]
+        res = ''
+        for item in feed:
+            res += '<h3>%s</h3><img style="page-break-after:always" src="%s" />\n'%(item.title, item.url)
+        res = '<html><body><h1>Dilbert</h1>%s</body></html>'%res
+        open(index, 'wb').write(res)
+        return index
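
With the Profile/Recipe dispatch wired up in main.py above, the new recipe
should be reachable by title from the feeds2disk command line (the binary
name is an assumption based on the 'feeds2disk' logger used throughout this
commit), e.g.:

    feeds2disk 'Dilbert'

custom_index() writes the day's strips into a temporary index.html,
download() mirrors that page into recipe.output_dir, and create_opf() then
adds an index.opf wrapping everything that was fetched.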