diff --git a/src/libprs500/web/feeds/__init__.py b/src/libprs500/web/feeds/__init__.py index 92dc4e9bfe..bda3ed586e 100644 --- a/src/libprs500/web/feeds/__init__.py +++ b/src/libprs500/web/feeds/__init__.py @@ -163,6 +163,15 @@ class Feed(object): if getattr(article, 'downloaded', False): return True return False + + def has_embedded_content(self): + length = 0 + for a in self: + if a.content or a.summary: + length += max(len(a.content if a.content else ''), + len(a.summary if a.summary else '')) + + return length > 2000 * len(self) def feed_from_xml(raw_xml, title=None, oldest_article=7, diff --git a/src/libprs500/web/feeds/news.py b/src/libprs500/web/feeds/news.py index da67309595..d14f6c9b93 100644 --- a/src/libprs500/web/feeds/news.py +++ b/src/libprs500/web/feeds/news.py @@ -13,6 +13,8 @@ ## You should have received a copy of the GNU General Public License along ## with this program; if not, write to the Free Software Foundation, Inc., ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +from libprs500.ebooks.lrf.web.profiles import FullContentProfile +from libprs500.ptempfile import PersistentTemporaryFile ''' The backend to parse feeds and create HTML that can then be converted to an ebook. @@ -100,7 +102,18 @@ class BasicNewsRecipe(object): #: using cp1252. If None, try to detect the encoding. encoding = None + #: Normally we try to guess if a feed has full articles embedded in it + #: based on the length of the embedded content. If C{None}, then the + #: default guessing is used. If C{True} then we always assume the feed has + #: embedded content and if False we always assume the feed does not have + #: embedded content. + use_embedded_content = None + #: Specify any extra CSS that should be added to downloaded HTML files + #: It will be inserted into C{} just before the closing + #: C{} tag thereby overriding all CSS except that which is + #: declared using the style attribute on individual HTML tags. 
+ #: type: string extra_css = None #: List of regular expressions that determines which links to follow @@ -388,6 +401,24 @@ class BasicNewsRecipe(object): templ = templates.IndexTemplate() return templ.generate(self.title, self.timefmt, feeds).render(doctype='xhtml') + @classmethod + def description_limiter(cls, src): + pos = cls.summary_length + fuzz = 50 + si = src.find(';', pos) + if si > 0 and si-pos > fuzz: + si = -1 + gi = src.find('>', pos) + if gi > 0 and gi-pos > fuzz: + gi = -1 + npos = max(si, gi) + if npos < 0: + npos = pos + + return src[:npos+1]+u'\u2026' + + + def feed2index(self, feed): if feed.image_url is not None: # Download feed image imgdir = os.path.join(self.output_dir, 'images') @@ -408,7 +439,7 @@ class BasicNewsRecipe(object): self.image_map[feed.image_url] = img templ = templates.FeedTemplate() - return templ.generate(feed).render(doctype='xhtml') + return templ.generate(feed, self.description_limiter).render(doctype='xhtml') def create_logger(self, feed_number, article_number): @@ -422,7 +453,7 @@ class BasicNewsRecipe(object): logger.addHandler(handler) return logger, out - def fetch_article(self, url, dir, logger, f, a, num_of_feeds): + def _fetch_article(self, url, dir, logger, f, a, num_of_feeds): fetcher = RecursiveFetcher(self.web2disk_options, logger, self.image_map, self.css_map, (url, f, a, num_of_feeds)) fetcher.base_dir = dir fetcher.current_dir = dir @@ -432,6 +463,20 @@ class BasicNewsRecipe(object): raise Exception(_('Could not fetch article. 
Run with --debug to see the reason')) return res, path, failures + def fetch_article(self, url, dir, logger, f, a, num_of_feeds): + return self._fetch_article(url, dir, logger, f, a, num_of_feeds) + + + def fetch_embedded_article(self, article, dir, logger, f, a, num_of_feeds): + pt = PersistentTemporaryFile('_feeds2disk.html') + templ = templates.EmbeddedContent() + raw = templ.generate(article).render('html') + open(pt.name, 'wb').write(raw) + pt.close() + url = ('file:'+pt.name) if iswindows else ('file://'+pt.name) + return self._fetch_article(url, dir, logger, f, a, num_of_feeds) + + def build_index(self): self.report_progress(0, _('Fetching feeds...')) try: @@ -447,6 +492,9 @@ class BasicNewsRecipe(object): feeds = feeds[:2] self.has_single_feed = len(feeds) == 1 + if self.use_embedded_content is None: + self.use_embedded_content = feeds[0].has_embedded_content() + index = os.path.join(self.output_dir, 'index.html') html = self.feeds2index(feeds) @@ -459,6 +507,8 @@ class BasicNewsRecipe(object): os.makedirs(feed_dir) for a, article in enumerate(feed): + if a >= self.max_articles_per_feed: + break art_dir = os.path.join(feed_dir, 'article_%d'%a) if not os.path.isdir(art_dir): os.makedirs(art_dir) @@ -467,9 +517,12 @@ class BasicNewsRecipe(object): url = self.print_version(article.url) except NotImplementedError: url = article.url - req = WorkRequest(self.fetch_article, (url, art_dir, logger, f, a, len(feed)), - {}, (f, a), self.article_downloaded, - self.error_in_article_download) + + func, arg = (self.fetch_embedded_article, article) if self.use_embedded_content else \ + (self.fetch_article, url) + req = WorkRequest(func, (arg, art_dir, logger, f, a, len(feed)), + {}, (f, a), self.article_downloaded, + self.error_in_article_download) req.stream = stream req.feed = feed req.article = article @@ -674,6 +727,7 @@ class Profile2Recipe(BasicNewsRecipe): self.simultaneous_downloads = 1 BasicNewsRecipe.__init__(self, options, parser, progress_reporter) self.browser 
= self.old_profile.browser + self.use_embedded_content = isinstance(self.old_profile, FullContentProfile) def parse_index(self): return self.old_profile.parse_feeds() diff --git a/src/libprs500/web/feeds/recipes/__init__.py b/src/libprs500/web/feeds/recipes/__init__.py index 3efcde84d9..ba1a702590 100644 --- a/src/libprs500/web/feeds/recipes/__init__.py +++ b/src/libprs500/web/feeds/recipes/__init__.py @@ -17,7 +17,7 @@ ''' Builtin recipes. ''' -recipes = ['newsweek', 'atlantic', 'economist', 'dilbert'] +recipes = ['newsweek', 'atlantic', 'economist', 'dilbert', 'portfolio'] import re from libprs500.web.feeds.news import BasicNewsRecipe, CustomIndexRecipe diff --git a/src/libprs500/web/feeds/recipes/portfolio.py b/src/libprs500/web/feeds/recipes/portfolio.py new file mode 100644 index 0000000000..5931b9b3a4 --- /dev/null +++ b/src/libprs500/web/feeds/recipes/portfolio.py @@ -0,0 +1,45 @@ +#!/usr/bin/env python + +## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net +## This program is free software; you can redistribute it and/or modify +## it under the terms of the GNU General Public License as published by +## the Free Software Foundation; either version 2 of the License, or +## (at your option) any later version. +## +## This program is distributed in the hope that it will be useful, +## but WITHOUT ANY WARRANTY; without even the implied warranty of +## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +## GNU General Public License for more details. +## +## You should have received a copy of the GNU General Public License along +## with this program; if not, write to the Free Software Foundation, Inc., +## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
+''' +portfolio.com +''' + +from libprs500.web.feeds.news import BasicNewsRecipe + +class Portfolio(BasicNewsRecipe): + + title = 'Portfolio' + use_embedded_content = True + timefmt = ' [%a, %b %d, %Y]' + html2lrf_options = ['--ignore-tables'] + + feeds = [ + ('Business Travel', 'http://feeds.portfolio.com/portfolio/businesstravel'), + ('Careers', 'http://feeds.portfolio.com/portfolio/careers'), + ('Culture and Lifestyle', 'http://feeds.portfolio.com/portfolio/cultureandlifestyle'), + ('Executives','http://feeds.portfolio.com/portfolio/executives'), + ('News and Markets', 'http://feeds.portfolio.com/portfolio/news'), + ('Business Spin', 'http://feeds.portfolio.com/portfolio/businessspin'), + ('Capital', 'http://feeds.portfolio.com/portfolio/capital'), + ('Daily Brief', 'http://feeds.portfolio.com/portfolio/dailybrief'), + ('Market Movers', 'http://feeds.portfolio.com/portfolio/marketmovers'), + ('Mixed Media', 'http://feeds.portfolio.com/portfolio/mixedmedia'), + ('Odd Numbers', 'http://feeds.portfolio.com/portfolio/oddnumbers'), + ('Playbook', 'http://feeds.portfolio.com/portfolio/playbook'), + ('Tech Observer', 'http://feeds.portfolio.com/portfolio/thetechobserver'), + ('World According to ...', 'http://feeds.portfolio.com/portfolio/theworldaccordingto'), + ] \ No newline at end of file diff --git a/src/libprs500/web/feeds/templates.py b/src/libprs500/web/feeds/templates.py index 8947944092..66c734a3a5 100644 --- a/src/libprs500/web/feeds/templates.py +++ b/src/libprs500/web/feeds/templates.py @@ -163,7 +163,7 @@ class FeedTemplate(Template): ${article.title} ${article.localtime.strftime(" [%a, %d %b %H:%M]")}
- ${Markup(article.summary)} + ${Markup(cutoff(article.summary))}
@@ -172,5 +172,33 @@ class FeedTemplate(Template):