diff --git a/src/libprs500/ebooks/chardet/__init__.py b/src/libprs500/ebooks/chardet/__init__.py
index c54c945708..9c851480a8 100644
--- a/src/libprs500/ebooks/chardet/__init__.py
+++ b/src/libprs500/ebooks/chardet/__init__.py
@@ -52,7 +52,8 @@ def xml_to_unicode(raw, verbose=False):
         print 'WARNING: Encoding detection confidence %d%%'%(chardet['confidence']*100)
     CHARSET_ALIASES = { "macintosh" : "mac-roman", "x-sjis" : "shift-jis" }
-    encoding = encoding.lower()
+    if encoding:
+        encoding = encoding.lower()
     if CHARSET_ALIASES.has_key(encoding):
         encoding = CHARSET_ALIASES[encoding]
     return raw.decode(encoding, 'ignore'), encoding
diff --git a/src/libprs500/ebooks/metadata/opf.xml b/src/libprs500/ebooks/metadata/opf.xml
index 822a5dae76..3fe98111e7 100644
--- a/src/libprs500/ebooks/metadata/opf.xml
+++ b/src/libprs500/ebooks/metadata/opf.xml
@@ -24,13 +24,14 @@
-
+
-
+
diff --git a/src/libprs500/web/feeds/__init__.py b/src/libprs500/web/feeds/__init__.py
index 1ebbfd78d0..92dc4e9bfe 100644
--- a/src/libprs500/web/feeds/__init__.py
+++ b/src/libprs500/web/feeds/__init__.py
@@ -17,7 +17,7 @@
 '''
 Contains the logic for parsing feeds.
 '''
-import time, logging
+import time, logging, traceback
 from datetime import datetime

 from libprs500.web.feeds.feedparser import parse
@@ -54,11 +54,12 @@ Has content : %s

 class Feed(object):

-    def __init__(self):
+    def __init__(self, get_article_url=lambda item: item.get('link', None)):
         '''
         Parse a feed into articles.
         '''
         self.logger = logging.getLogger('feeds2disk')
+        self.get_article_url = get_article_url

     def populate_from_feed(self, feed, title=None, oldest_article=7,
                            max_articles_per_feed=100):
@@ -124,7 +125,12 @@ class Feed(object):
         self.added_articles.append(id)

         title = item.get('title', _('Untitled article'))
-        link = item.get('link', None)
+        try:
+            link = self.get_article_url(item)
+        except:
+            self.logger.warning('Failed to get link for %s'%title)
+            self.logger.debug(traceback.format_exc())
+            link = None
         description = item.get('summary', None)

         content = '\n'.join(i.value for i in item.get('content', []))
@@ -159,9 +165,10 @@ class Feed(object):
         return False


-def feed_from_xml(raw_xml, title=None, oldest_article=7, max_articles_per_feed=100):
+def feed_from_xml(raw_xml, title=None, oldest_article=7,
+                  max_articles_per_feed=100, get_article_url=lambda item: item.get('link', None)):
     feed = parse(raw_xml)
-    pfeed = Feed()
+    pfeed = Feed(get_article_url=get_article_url)
     pfeed.populate_from_feed(feed, title=title,
                              oldest_article=oldest_article,
                              max_articles_per_feed=max_articles_per_feed)
diff --git a/src/libprs500/web/feeds/main.py b/src/libprs500/web/feeds/main.py
index 11d60b6a49..8dcb0b5029 100644
--- a/src/libprs500/web/feeds/main.py
+++ b/src/libprs500/web/feeds/main.py
@@ -20,6 +20,7 @@ from libprs500.web.feeds.news import BasicNewsRecipe
 import sys, os, logging
 from libprs500.web.feeds.recipes import get_builtin_recipe, compile_recipe, titles
 from libprs500.web.fetch.simple import option_parser as _option_parser
+from libprs500.web.feeds.news import Profile2Recipe


 def option_parser(usage='''\
@@ -110,7 +111,7 @@ def run_recipe(opts, recipe_arg, parser, notification=None, handler=None):
             else:
                 raise Exception('not file')
         except:
-            recipe = get_builtin_recipe(recipe_arg)
+            recipe, is_profile = get_builtin_recipe(recipe_arg)
             if recipe is None:
                 recipe = compile_recipe(recipe_arg)
@@ -125,7 +126,10 @@ def run_recipe(opts, recipe_arg, parser, notification=None, handler=None):
     handler.setFormatter(ColoredFormatter('%(levelname)s: %(message)s\n')) # The trailing newline is need because of the progress bar
     logging.getLogger('feeds2disk').addHandler(handler)

-    recipe = recipe(opts, parser, notification)
+    if is_profile:
+        recipe = Profile2Recipe(recipe, opts, parser, notification)
+    else:
+        recipe = recipe(opts, parser, notification)
     if not os.path.exists(recipe.output_dir):
         os.makedirs(recipe.output_dir)
     recipe.download()
diff --git a/src/libprs500/web/feeds/news.py b/src/libprs500/web/feeds/news.py
index e4ffb2aadf..da67309595 100644
--- a/src/libprs500/web/feeds/news.py
+++ b/src/libprs500/web/feeds/news.py
@@ -20,7 +20,7 @@ to an ebook.
 import logging, os, cStringIO, time, traceback, re
 import urlparse

-from libprs500 import browser, __appname__
+from libprs500 import browser, __appname__, iswindows
 from libprs500.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Tag
 from libprs500.ebooks.metadata.opf import OPFCreator
 from libprs500.ebooks.metadata.toc import TOC
@@ -41,7 +41,7 @@ class BasicNewsRecipe(object):
     title = _('Unknown News Source')

     #: The author of this recipe
-    __author__ = _('Unknown')
+    __author__ = _(__appname__)

     #: Maximum number of articles to download from each feed
     #: @type: integer
@@ -198,6 +198,14 @@ class BasicNewsRecipe(object):
         '''
         return browser()

+    def get_article_url(self, item):
+        '''
+        Override to perform extraction of the URL for each article.
+        @param item: An article instance from L{feedparser}.
+        @type item: L{FeedParserDict}
+        '''
+        return item.get('link', None)
+
     def preprocess_html(self, soup):
         '''
         This function is called with the source of each downloaded HTML file, before
@@ -335,7 +343,7 @@ class BasicNewsRecipe(object):
             if head:
                 style = BeautifulSoup(u'<style type="text/css">%s</style>'%self.extra_css).find('style')
                 head.insert(len(head.contents), style)
-        if first_fetch:
+        if first_fetch and job_info:
             url, f, a, feed_len = job_info
             body = soup.find('body')
             if body is not None:
@@ -615,7 +623,8 @@ class BasicNewsRecipe(object):
             parsed_feeds.append(feed_from_xml(self.browser.open(url).read(),
                                               title=title,
                                               oldest_article=self.oldest_article,
-                                              max_articles_per_feed=self.max_articles_per_feed))
+                                              max_articles_per_feed=self.max_articles_per_feed,
+                                              get_article_url=self.get_article_url))

         return parsed_feeds

@@ -644,3 +653,55 @@ class BasicNewsRecipe(object):
             elif use_alt and item.has_key('alt'):
                 strings.append(item['alt'])
         return u''.join(strings)
+
+class Profile2Recipe(BasicNewsRecipe):
+    '''
+    Used to migrate the old news Profiles to the new Recipes. Uses the settings
+    from the old Profile to populate the settings in the Recipe. Also uses the
+    Profile's get_browser and parse_feeds.
+    '''
+    def __init__(self, profile_class, options, parser, progress_reporter):
+        self.old_profile = profile_class(logging.getLogger('feeds2disk'),
+                                         username=options.username,
+                                         password=options.password,
+                                         lrf=options.lrf)
+        for attr in ('preprocess_regexps', 'oldest_article', 'delay', 'timeout',
+                     'match_regexps', 'filter_regexps', 'html2lrf_options',
+                     'timefmt', 'needs_subscription', 'summary_length',
+                     'max_articles_per_feed', 'title', 'no_stylesheets', 'encoding'):
+            setattr(self, attr, getattr(self.old_profile, attr))
+
+        self.simultaneous_downloads = 1
+        BasicNewsRecipe.__init__(self, options, parser, progress_reporter)
+        self.browser = self.old_profile.browser
+
+    def parse_index(self):
+        return self.old_profile.parse_feeds()
+
+class CustomIndexRecipe(BasicNewsRecipe):
+
+    def custom_index(self):
+        '''
+        Return the path to a custom HTML document that will serve as the index for
+        this recipe.
+        @rtype: string
+        '''
+        raise NotImplementedError
+
+    def create_opf(self):
+        mi = MetaInformation(self.title, [__appname__])
+        mi = OPFCreator(self.output_dir, mi)
+        mi.create_manifest_from_files_in([self.output_dir])
+        mi.create_spine(['index.html'])
+        mi.render(open(os.path.join(self.output_dir, 'index.opf'), 'wb'))
+
+    def download(self):
+        index = os.path.abspath(self.custom_index())
+        url = 'file:'+index if iswindows else 'file://'+index
+        fetcher = RecursiveFetcher(self.web2disk_options, self.logger)
+        fetcher.base_dir = self.output_dir
+        fetcher.current_dir = self.output_dir
+        fetcher.show_progress = False
+        res = fetcher.start_fetch(url)
+        self.create_opf()
+        return res
\ No newline at end of file
diff --git a/src/libprs500/web/feeds/recipes/__init__.py b/src/libprs500/web/feeds/recipes/__init__.py
index 4fb593a371..3efcde84d9 100644
--- a/src/libprs500/web/feeds/recipes/__init__.py
+++ b/src/libprs500/web/feeds/recipes/__init__.py
@@ -17,14 +17,14 @@
 '''
 Builtin recipes.
 '''
-recipes = ['newsweek', 'atlantic', 'economist']
+recipes = ['newsweek', 'atlantic', 'economist', 'dilbert']

 import re
-from libprs500.web.feeds.news import BasicNewsRecipe
+from libprs500.web.feeds.news import BasicNewsRecipe, CustomIndexRecipe
 from libprs500.ebooks.lrf.web.profiles import DefaultProfile, FullContentProfile
-from libprs500.ebooks.lrf.web import available_profiles
+from libprs500.ebooks.lrf.web import builtin_profiles

-basic_recipes = (BasicNewsRecipe, DefaultProfile, FullContentProfile)
+basic_recipes = (BasicNewsRecipe, CustomIndexRecipe, DefaultProfile, FullContentProfile)
 basic_recipe_names = (i.__name__ for i in basic_recipes)


@@ -51,6 +51,8 @@ recipes = [load_recipe(i) for i in recipes]
 def compile_recipe(src):
     '''
     Compile the code in src and return the first object that is a recipe or profile.
+    @param src: Python source code
+    @type src: string
     @return: Recipe/Profile class or None, if no such class was found in C{src}
     '''
     locals = {}
@@ -67,13 +69,20 @@ def compile_recipe(src):
 def get_builtin_recipe(title):
     '''
     Return a builtin recipe/profile class whoose title == C{title} or None if no such
-    recipe exists.
+    recipe exists. Also returns a flag that is True iff the found recipe is really
+    an old-style Profile.
     @type title: string
-    @rtype: class or None
+    @rtype: class or None, boolean
     '''
     for r in recipes:
         if r.title == title:
-            return r
+            return r, False
+    for p in builtin_profiles:
+        if p.title == title:
+            return p, True
+    return None, False


-titles = set([r.title for r in recipes])
\ No newline at end of file
+_titles = list(frozenset([r.title for r in recipes] + [p.title for p in builtin_profiles]))
+_titles.sort()
+titles = _titles
\ No newline at end of file
diff --git a/src/libprs500/web/feeds/recipes/dilbert.py b/src/libprs500/web/feeds/recipes/dilbert.py
new file mode 100644
index 0000000000..5daa8be964
--- /dev/null
+++ b/src/libprs500/web/feeds/recipes/dilbert.py
@@ -0,0 +1,45 @@
+#!/usr/bin/env python
+
+## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2 of the License, or
+## (at your option) any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License along
+## with this program; if not, write to the Free Software Foundation, Inc.,
+## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+'''
+Daily Dilbert
+'''
+import os
+from libprs500.web.feeds.news import CustomIndexRecipe
+from libprs500.ptempfile import PersistentTemporaryDirectory
+
+class Dilbert(CustomIndexRecipe):
+
+    title = 'Dilbert'
+    timefmt = ' [%d %b %Y]'
+
+    feeds = [('Dilbert', 'http://feeds.feedburner.com/tapestrydilbert')]
+
+    def get_article_url(self, item):
+        return item.get('enclosures')[0].get('url')
+
+    def custom_index(self):
+        tdir = PersistentTemporaryDirectory('feeds2disk_dilbert')
+        index = os.path.join(tdir, 'index.html')
+        feed = self.parse_feeds()[0]
+
+        res = ''
+        for item in feed:
+            res += '<h3>%s</h3><img src="%s" />\n'%(item.title, item.url)
+        res = '<html><body><h1>Dilbert</h1>%s</body></html>'%res
+        open(index, 'wb').write(res)
+        return index
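
For recipe authors, the new get_article_url hook is the intended way to pull an article URL out of non-standard feed fields. Feed wraps the call in a try/except, so an override that raises only produces a logged warning and a None link, which is why the Dilbert recipe above can index enclosures[0] without guarding. A minimal defensive sketch; the class name and feed URL here are hypothetical:

    from libprs500.web.feeds.news import BasicNewsRecipe

    class EnclosureFeed(BasicNewsRecipe):
        title = 'Enclosure Feed'  # hypothetical recipe
        feeds = [('Comics', 'http://example.com/rss')]  # hypothetical feed URL

        def get_article_url(self, item):
            # Prefer the first enclosure's URL, falling back to the regular link.
            enclosures = item.get('enclosures', [])
            if enclosures:
                return enclosures[0].get('url')
            return item.get('link', None)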
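
Likewise, get_builtin_recipe now returns a (class, is_profile) pair instead of a bare class, and callers must unpack it and route old-style Profiles through the Profile2Recipe adapter, as run_recipe does above. A brief sketch of the calling convention; opts, parser and notification stand in for the values run_recipe receives:

    from libprs500.web.feeds.recipes import get_builtin_recipe
    from libprs500.web.feeds.news import Profile2Recipe

    recipe, is_profile = get_builtin_recipe('Dilbert')
    if is_profile:
        # Old-style Profiles are adapted to the Recipe interface.
        recipe = Profile2Recipe(recipe, opts, parser, notification)
    else:
        recipe = recipe(opts, parser, notification)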