diff --git a/src/libprs500/ebooks/chardet/__init__.py b/src/libprs500/ebooks/chardet/__init__.py
index c54c945708..9c851480a8 100644
--- a/src/libprs500/ebooks/chardet/__init__.py
+++ b/src/libprs500/ebooks/chardet/__init__.py
@@ -52,7 +52,8 @@ def xml_to_unicode(raw, verbose=False):
print 'WARNING: Encoding detection confidence %d%%'%(chardet['confidence']*100)
CHARSET_ALIASES = { "macintosh" : "mac-roman",
"x-sjis" : "shift-jis" }
- encoding = encoding.lower()
+ if encoding:
+ encoding = encoding.lower()
if CHARSET_ALIASES.has_key(encoding):
encoding = CHARSET_ALIASES[encoding]
return raw.decode(encoding, 'ignore'), encoding
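The guard above matters because chardet reports a None encoding when detection
fails outright, and the old code called .lower() on it unconditionally. A
minimal sketch of the failure mode (values illustrative):

    # chardet's detect() can return {'encoding': None, 'confidence': 0.0}
    # for input it cannot classify; calling .lower() on that None raised
    # AttributeError before this patch.
    detected = {'encoding': None, 'confidence': 0.0}  # illustrative result
    encoding = detected['encoding']
    if encoding:  # patched behaviour: only normalise a real encoding name
        encoding = encoding.lower()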
diff --git a/src/libprs500/ebooks/metadata/opf.xml b/src/libprs500/ebooks/metadata/opf.xml
index 822a5dae76..3fe98111e7 100644
--- a/src/libprs500/ebooks/metadata/opf.xml
+++ b/src/libprs500/ebooks/metadata/opf.xml
@@ -24,13 +24,14 @@
-
+
-
+
diff --git a/src/libprs500/web/feeds/__init__.py b/src/libprs500/web/feeds/__init__.py
index 1ebbfd78d0..92dc4e9bfe 100644
--- a/src/libprs500/web/feeds/__init__.py
+++ b/src/libprs500/web/feeds/__init__.py
@@ -17,7 +17,7 @@
'''
Contains the logic for parsing feeds.
'''
-import time, logging
+import time, logging, traceback
from datetime import datetime
from libprs500.web.feeds.feedparser import parse
@@ -54,11 +54,12 @@ Has content : %s
class Feed(object):
- def __init__(self):
+ def __init__(self, get_article_url=lambda item: item.get('link', None)):
'''
Parse a feed into articles.
'''
self.logger = logging.getLogger('feeds2disk')
+ self.get_article_url = get_article_url
def populate_from_feed(self, feed, title=None, oldest_article=7,
max_articles_per_feed=100):
@@ -124,7 +125,12 @@ class Feed(object):
self.added_articles.append(id)
title = item.get('title', _('Untitled article'))
- link = item.get('link', None)
+ try:
+ link = self.get_article_url(item)
+ except:
+ self.logger.warning('Failed to get link for %s'%title)
+ self.logger.debug(traceback.format_exc())
+ link = None
description = item.get('summary', None)
content = '\n'.join(i.value for i in item.get('content', []))
@@ -159,9 +165,10 @@ class Feed(object):
return False
-def feed_from_xml(raw_xml, title=None, oldest_article=7, max_articles_per_feed=100):
+def feed_from_xml(raw_xml, title=None, oldest_article=7,
+ max_articles_per_feed=100, get_article_url=lambda item: item.get('link', None)):
feed = parse(raw_xml)
- pfeed = Feed()
+ pfeed = Feed(get_article_url=get_article_url)
pfeed.populate_from_feed(feed, title=title,
oldest_article=oldest_article,
max_articles_per_feed=max_articles_per_feed)
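The new get_article_url hook defaults to the old behaviour (the item's link),
so existing callers of feed_from_xml are unaffected. A sketch of a caller
supplying a custom extractor, for feeds that carry the article as an enclosure
(helper name illustrative, and raw_xml is assumed to hold the feed XML):

    # Extract the article URL from the first enclosure instead of <link>.
    def url_from_enclosure(item):
        enclosures = item.get('enclosures', [])
        return enclosures[0].get('url') if enclosures else None

    feed = feed_from_xml(raw_xml, title='Comics',
                         get_article_url=url_from_enclosure)

Because populate_from_feed now wraps the call in try/except, a misbehaving
extractor costs only that article's link, not the whole feed.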
diff --git a/src/libprs500/web/feeds/main.py b/src/libprs500/web/feeds/main.py
index 11d60b6a49..8dcb0b5029 100644
--- a/src/libprs500/web/feeds/main.py
+++ b/src/libprs500/web/feeds/main.py
@@ -20,6 +20,7 @@ from libprs500.web.feeds.news import BasicNewsRecipe
import sys, os, logging
from libprs500.web.feeds.recipes import get_builtin_recipe, compile_recipe, titles
from libprs500.web.fetch.simple import option_parser as _option_parser
+from libprs500.web.feeds.news import Profile2Recipe
def option_parser(usage='''\
@@ -110,7 +111,7 @@ def run_recipe(opts, recipe_arg, parser, notification=None, handler=None):
else:
raise Exception('not file')
except:
- recipe = get_builtin_recipe(recipe_arg)
+ recipe, is_profile = get_builtin_recipe(recipe_arg)
if recipe is None:
recipe = compile_recipe(recipe_arg)
@@ -125,7 +126,10 @@ def run_recipe(opts, recipe_arg, parser, notification=None, handler=None):
handler.setFormatter(ColoredFormatter('%(levelname)s: %(message)s\n')) # The trailing newline is needed because of the progress bar
logging.getLogger('feeds2disk').addHandler(handler)
- recipe = recipe(opts, parser, notification)
+ if is_profile:
+ recipe = Profile2Recipe(recipe, opts, parser, notification)
+ else:
+ recipe = recipe(opts, parser, notification)
if not os.path.exists(recipe.output_dir):
os.makedirs(recipe.output_dir)
recipe.download()
diff --git a/src/libprs500/web/feeds/news.py b/src/libprs500/web/feeds/news.py
index e4ffb2aadf..da67309595 100644
--- a/src/libprs500/web/feeds/news.py
+++ b/src/libprs500/web/feeds/news.py
@@ -20,7 +20,7 @@ to an ebook.
import logging, os, cStringIO, time, traceback, re
import urlparse
-from libprs500 import browser, __appname__
+from libprs500 import browser, __appname__, iswindows
from libprs500.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Tag
from libprs500.ebooks.metadata.opf import OPFCreator
from libprs500.ebooks.metadata.toc import TOC
@@ -41,7 +41,7 @@ class BasicNewsRecipe(object):
title = _('Unknown News Source')
#: The author of this recipe
- __author__ = _('Unknown')
+ __author__ = _(__appname__)
#: Maximum number of articles to download from each feed
#: @type: integer
@@ -198,6 +198,14 @@ class BasicNewsRecipe(object):
'''
return browser()
+ def get_article_url(self, item):
+ '''
+        Override in a subclass to customise extraction of the URL for each article.
+ @param item: An article instance from L{feedparser}.
+ @type item: L{FeedParserDict}
+ '''
+ return item.get('link', None)
+
def preprocess_html(self, soup):
'''
This function is called with the source of each downloaded HTML file, before
@@ -335,7 +343,7 @@ class BasicNewsRecipe(object):
if head:
style = BeautifulSoup(u'<style type="text/css">%s</style>'%self.extra_css).find('style')
head.insert(len(head.contents), style)
- if first_fetch:
+ if first_fetch and job_info:
url, f, a, feed_len = job_info
body = soup.find('body')
if body is not None:
@@ -615,7 +623,8 @@ class BasicNewsRecipe(object):
parsed_feeds.append(feed_from_xml(self.browser.open(url).read(),
title=title,
oldest_article=self.oldest_article,
- max_articles_per_feed=self.max_articles_per_feed))
+ max_articles_per_feed=self.max_articles_per_feed,
+ get_article_url=self.get_article_url))
return parsed_feeds
@@ -644,3 +653,55 @@ class BasicNewsRecipe(object):
elif use_alt and item.has_key('alt'):
strings.append(item['alt'])
return u''.join(strings)
+
+class Profile2Recipe(BasicNewsRecipe):
+ '''
+ Used to migrate the old news Profiles to the new Recipes. Uses the settings
+    from the old Profile to populate the settings in the Recipe. It also uses
+    the Profile's get_browser and parse_feeds.
+ '''
+ def __init__(self, profile_class, options, parser, progress_reporter):
+ self.old_profile = profile_class(logging.getLogger('feeds2disk'),
+ username=options.username,
+ password=options.password,
+ lrf=options.lrf)
+ for attr in ('preprocess_regexps', 'oldest_article', 'delay', 'timeout',
+ 'match_regexps', 'filter_regexps', 'html2lrf_options',
+ 'timefmt', 'needs_subscription', 'summary_length',
+ 'max_articles_per_feed', 'title','no_stylesheets', 'encoding'):
+ setattr(self, attr, getattr(self.old_profile, attr))
+
+ self.simultaneous_downloads = 1
+ BasicNewsRecipe.__init__(self, options, parser, progress_reporter)
+ self.browser = self.old_profile.browser
+
+ def parse_index(self):
+ return self.old_profile.parse_feeds()
+
+class CustomIndexRecipe(BasicNewsRecipe):
+
+ def custom_index(self):
+ '''
+ Return the path to a custom HTML document that will serve as the index for
+ this recipe.
+ @rtype: string
+ '''
+ raise NotImplementedError
+
+ def create_opf(self):
+ mi = MetaInformation(self.title, [__appname__])
+ mi = OPFCreator(self.output_dir, mi)
+ mi.create_manifest_from_files_in([self.output_dir])
+ mi.create_spine(['index.html'])
+ mi.render(open(os.path.join(self.output_dir, 'index.opf'), 'wb'))
+
+ def download(self):
+ index = os.path.abspath(self.custom_index())
+ url = 'file:'+index if iswindows else 'file://'+index
+ fetcher = RecursiveFetcher(self.web2disk_options, self.logger)
+ fetcher.base_dir = self.output_dir
+ fetcher.current_dir = self.output_dir
+ fetcher.show_progress = False
+ res = fetcher.start_fetch(url)
+ self.create_opf()
+ return res
\ No newline at end of file
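With the hook on BasicNewsRecipe, a recipe can override get_article_url
directly instead of threading a callable through feed_from_xml. A minimal
sketch of a recipe using it (class name and feed URL illustrative):

    # A recipe preferring enclosure URLs, falling back to the default <link>.
    class EnclosureFeedRecipe(BasicNewsRecipe):
        title = 'Enclosure-based feed'
        feeds = [('Example', 'http://example.com/feed')]

        def get_article_url(self, item):
            enclosures = item.get('enclosures', [])
            if enclosures:
                return enclosures[0].get('url')
            return BasicNewsRecipe.get_article_url(self, item)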
diff --git a/src/libprs500/web/feeds/recipes/__init__.py b/src/libprs500/web/feeds/recipes/__init__.py
index 4fb593a371..3efcde84d9 100644
--- a/src/libprs500/web/feeds/recipes/__init__.py
+++ b/src/libprs500/web/feeds/recipes/__init__.py
@@ -17,14 +17,14 @@
'''
Builtin recipes.
'''
-recipes = ['newsweek', 'atlantic', 'economist']
+recipes = ['newsweek', 'atlantic', 'economist', 'dilbert']
import re
-from libprs500.web.feeds.news import BasicNewsRecipe
+from libprs500.web.feeds.news import BasicNewsRecipe, CustomIndexRecipe
from libprs500.ebooks.lrf.web.profiles import DefaultProfile, FullContentProfile
-from libprs500.ebooks.lrf.web import available_profiles
+from libprs500.ebooks.lrf.web import builtin_profiles
-basic_recipes = (BasicNewsRecipe, DefaultProfile, FullContentProfile)
+basic_recipes = (BasicNewsRecipe, CustomIndexRecipe, DefaultProfile, FullContentProfile)
basic_recipe_names = (i.__name__ for i in basic_recipes)
@@ -51,6 +51,8 @@ recipes = [load_recipe(i) for i in recipes]
def compile_recipe(src):
'''
Compile the code in src and return the first object that is a recipe or profile.
+ @param src: Python source code
+ @type src: string
@return: Recipe/Profile class or None, if no such class was found in C{src}
'''
locals = {}
@@ -67,13 +69,20 @@ def compile_recipe(src):
def get_builtin_recipe(title):
'''
Return a builtin recipe/profile class whose title == C{title} or None if no such
- recipe exists.
+ recipe exists. Also returns a flag that is True iff the found recipe is really
+ an old-style Profile.
@type title: string
- @rtype: class or None
+ @rtype: class or None, boolean
'''
for r in recipes:
if r.title == title:
- return r
+ return r, False
+ for p in builtin_profiles:
+ if p.title == title:
+ return p, True
+ return None, False
-titles = set([r.title for r in recipes])
\ No newline at end of file
+_titles = list(frozenset([r.title for r in recipes] + [p.title for p in builtin_profiles]))
+_titles.sort()
+titles = _titles
\ No newline at end of file
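Since get_builtin_recipe now returns a (class, is_profile) pair, every call
site has to unpack it, as run_recipe above does. A sketch of the new lookup
contract (title illustrative):

    # is_profile is True only when the match came from builtin_profiles.
    recipe_class, is_profile = get_builtin_recipe('Dilbert')
    if recipe_class is None:
        print 'No builtin recipe or profile with that title'

The sorted titles list now covers both recipes and old-style profiles, so
listings built from it show every builtin news source.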
diff --git a/src/libprs500/web/feeds/recipes/dilbert.py b/src/libprs500/web/feeds/recipes/dilbert.py
new file mode 100644
index 0000000000..5daa8be964
--- /dev/null
+++ b/src/libprs500/web/feeds/recipes/dilbert.py
@@ -0,0 +1,45 @@
+#!/usr/bin/env python
+
+## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2 of the License, or
+## (at your option) any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License along
+## with this program; if not, write to the Free Software Foundation, Inc.,
+## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+'''
+Daily Dilbert
+'''
+import os
+from libprs500.web.feeds.news import CustomIndexRecipe
+from libprs500.ptempfile import PersistentTemporaryDirectory
+
+class Dilbert(CustomIndexRecipe):
+
+ title = 'Dilbert'
+ timefmt = ' [%d %b %Y]'
+
+ feeds = [('Dilbert', 'http://feeds.feedburner.com/tapestrydilbert')]
+
+    def get_article_url(self, item):
+        # The strip is carried as an enclosure, not a <link>; the bare indexing
+        # is safe because Feed.populate_from_feed wraps this call in try/except.
+        return item.get('enclosures')[0].get('url')
+
+ def custom_index(self):
+ tdir = PersistentTemporaryDirectory('feeds2disk_dilbert')
+ index = os.path.join(tdir, 'index.html')
+ feed = self.parse_feeds()[0]
+
+        res = ''
+        for item in feed:
+            res += '<h3>%s</h3><img src="%s" />\n'%(item.title, item.url)
+        res = '<html><body><h1>Dilbert</h1>%s</body></html>'%res
+        open(index, 'wb').write(res)
+        return index