Support for content embedded feeds in feeds2disk

This commit is contained in:
Kovid Goyal 2008-03-18 06:31:40 +00:00
parent ef40f3abab
commit 7ddfcd5711
5 changed files with 145 additions and 9 deletions

View File

@ -164,6 +164,15 @@ class Feed(object):
return True return True
return False return False
def has_embedded_content(self):
    '''
    Guess whether the articles in this feed carry their full text.

    For every article the longer of its C{content} and C{summary} is
    measured (a missing/empty field counts as zero length). The feed is
    considered to have embedded content when the lengths add up to more
    than 2000 characters per article.

    @return: C{True} if the total length exceeds C{2000 * len(self)}
    '''
    total = sum(
        max(len(article.content or ''), len(article.summary or ''))
        for article in self
    )
    return total > 2000 * len(self)
def feed_from_xml(raw_xml, title=None, oldest_article=7, def feed_from_xml(raw_xml, title=None, oldest_article=7,
max_articles_per_feed=100, get_article_url=lambda item: item.get('link', None)): max_articles_per_feed=100, get_article_url=lambda item: item.get('link', None)):

View File

@ -13,6 +13,8 @@
## You should have received a copy of the GNU General Public License along ## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc., ## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
from libprs500.ebooks.lrf.web.profiles import FullContentProfile
from libprs500.ptempfile import PersistentTemporaryFile
''' '''
The backend to parse feeds and create HTML that can then be converted The backend to parse feeds and create HTML that can then be converted
to an ebook. to an ebook.
@ -100,7 +102,18 @@ class BasicNewsRecipe(object):
#: using cp1252. If None, try to detect the encoding. #: using cp1252. If None, try to detect the encoding.
encoding = None encoding = None
#: Normally we try to guess if a feed has full articles embedded in it
#: based on the length of the embedded content. If C{None}, then the
#: default guessing is used. If C{True} then we always assume the feeds have
#: embedded content and if C{False} we always assume the feeds do not have
#: embedded content.
use_embedded_content = None
#: Specify any extra CSS that should be addded to downloaded HTML files #: Specify any extra CSS that should be addded to downloaded HTML files
#: It will be inserted into C{<style></style>} just before the closing
#: C{</head>} tag thereby overriding all CSS except that which is
#: declared using the style attribute on individual HTML tags.
#: type: string
extra_css = None extra_css = None
#: List of regular expressions that determines which links to follow #: List of regular expressions that determines which links to follow
@ -388,6 +401,24 @@ class BasicNewsRecipe(object):
templ = templates.IndexTemplate() templ = templates.IndexTemplate()
return templ.generate(self.title, self.timefmt, feeds).render(doctype='xhtml') return templ.generate(self.title, self.timefmt, feeds).render(doctype='xhtml')
@classmethod
def description_limiter(cls, src):
    '''
    Shorten an article summary to roughly C{cls.summary_length} characters.

    The cut point is moved forward to the first C{';'} or C{'>'} found at
    or after C{cls.summary_length}, provided it lies no more than 50
    characters beyond it (presumably so that an HTML entity or tag is not
    cut in half -- the summary is rendered as markup by the feed template).
    If both characters are found within range, the later one is used.

    An ellipsis is appended only when text was actually removed, so
    summaries that already fit are returned unchanged.

    @param src: the summary text; may be empty or C{None}
    @return: the (possibly truncated) summary, or an empty string when
             C{src} is empty or C{None}
    '''
    if not src:
        return u''
    pos = cls.summary_length
    fuzz = 50  # how far past pos a ';' or '>' may be and still be used
    si = src.find(';', pos)
    if si > 0 and si - pos > fuzz:
        si = -1
    gi = src.find('>', pos)
    if gi > 0 and gi - pos > fuzz:
        gi = -1
    npos = max(si, gi)
    if npos < 0:
        # No usable terminator nearby: cut at the nominal length
        npos = pos
    ans = src[:npos+1]
    if len(ans) < len(src):
        ans += u'\u2026'
    return ans
def feed2index(self, feed): def feed2index(self, feed):
if feed.image_url is not None: # Download feed image if feed.image_url is not None: # Download feed image
imgdir = os.path.join(self.output_dir, 'images') imgdir = os.path.join(self.output_dir, 'images')
@ -408,7 +439,7 @@ class BasicNewsRecipe(object):
self.image_map[feed.image_url] = img self.image_map[feed.image_url] = img
templ = templates.FeedTemplate() templ = templates.FeedTemplate()
return templ.generate(feed).render(doctype='xhtml') return templ.generate(feed, self.description_limiter).render(doctype='xhtml')
def create_logger(self, feed_number, article_number): def create_logger(self, feed_number, article_number):
@ -422,7 +453,7 @@ class BasicNewsRecipe(object):
logger.addHandler(handler) logger.addHandler(handler)
return logger, out return logger, out
def fetch_article(self, url, dir, logger, f, a, num_of_feeds): def _fetch_article(self, url, dir, logger, f, a, num_of_feeds):
fetcher = RecursiveFetcher(self.web2disk_options, logger, self.image_map, self.css_map, (url, f, a, num_of_feeds)) fetcher = RecursiveFetcher(self.web2disk_options, logger, self.image_map, self.css_map, (url, f, a, num_of_feeds))
fetcher.base_dir = dir fetcher.base_dir = dir
fetcher.current_dir = dir fetcher.current_dir = dir
@ -432,6 +463,20 @@ class BasicNewsRecipe(object):
raise Exception(_('Could not fetch article. Run with --debug to see the reason')) raise Exception(_('Could not fetch article. Run with --debug to see the reason'))
return res, path, failures return res, path, failures
def fetch_article(self, url, dir, logger, f, a, num_of_feeds):
    '''
    Fetch the article located at C{url}.

    All arguments are handed unchanged to C{self._fetch_article} and its
    result is returned as is.
    '''
    result = self._fetch_article(url, dir, logger, f, a, num_of_feeds)
    return result
def fetch_embedded_article(self, article, dir, logger, f, a, num_of_feeds):
    '''
    Fetch an article whose text is embedded in the feed itself.

    The article is rendered to HTML with C{templates.EmbeddedContent},
    written to a persistent temporary file, and that file is then passed
    through C{self._fetch_article} as a C{file:} URL.

    @param article: the feed article to render
    @return: whatever C{self._fetch_article} returns for the temporary file
    '''
    pt = PersistentTemporaryFile('_feeds2disk.html')
    try:
        templ = templates.EmbeddedContent()
        raw = templ.generate(article).render('html')
        out = open(pt.name, 'wb')
        try:
            out.write(raw)
        finally:
            # Close explicitly instead of relying on garbage collection
            out.close()
    finally:
        # Release the temporary file handle even if rendering/writing fails
        pt.close()
    url = ('file:'+pt.name) if iswindows else ('file://'+pt.name)
    return self._fetch_article(url, dir, logger, f, a, num_of_feeds)
def build_index(self): def build_index(self):
self.report_progress(0, _('Fetching feeds...')) self.report_progress(0, _('Fetching feeds...'))
try: try:
@ -447,6 +492,9 @@ class BasicNewsRecipe(object):
feeds = feeds[:2] feeds = feeds[:2]
self.has_single_feed = len(feeds) == 1 self.has_single_feed = len(feeds) == 1
if self.use_embedded_content is None:
self.use_embedded_content = feeds[0].has_embedded_content()
index = os.path.join(self.output_dir, 'index.html') index = os.path.join(self.output_dir, 'index.html')
html = self.feeds2index(feeds) html = self.feeds2index(feeds)
@ -459,6 +507,8 @@ class BasicNewsRecipe(object):
os.makedirs(feed_dir) os.makedirs(feed_dir)
for a, article in enumerate(feed): for a, article in enumerate(feed):
if a >= self.max_articles_per_feed:
break
art_dir = os.path.join(feed_dir, 'article_%d'%a) art_dir = os.path.join(feed_dir, 'article_%d'%a)
if not os.path.isdir(art_dir): if not os.path.isdir(art_dir):
os.makedirs(art_dir) os.makedirs(art_dir)
@ -467,7 +517,10 @@ class BasicNewsRecipe(object):
url = self.print_version(article.url) url = self.print_version(article.url)
except NotImplementedError: except NotImplementedError:
url = article.url url = article.url
req = WorkRequest(self.fetch_article, (url, art_dir, logger, f, a, len(feed)),
func, arg = (self.fetch_embedded_article, article) if self.use_embedded_content else \
(self.fetch_article, url)
req = WorkRequest(func, (arg, art_dir, logger, f, a, len(feed)),
{}, (f, a), self.article_downloaded, {}, (f, a), self.article_downloaded,
self.error_in_article_download) self.error_in_article_download)
req.stream = stream req.stream = stream
@ -674,6 +727,7 @@ class Profile2Recipe(BasicNewsRecipe):
self.simultaneous_downloads = 1 self.simultaneous_downloads = 1
BasicNewsRecipe.__init__(self, options, parser, progress_reporter) BasicNewsRecipe.__init__(self, options, parser, progress_reporter)
self.browser = self.old_profile.browser self.browser = self.old_profile.browser
self.use_embedded_content = isinstance(self.old_profile, FullContentProfile)
def parse_index(self): def parse_index(self):
return self.old_profile.parse_feeds() return self.old_profile.parse_feeds()

View File

@ -17,7 +17,7 @@
''' '''
Builtin recipes. Builtin recipes.
''' '''
recipes = ['newsweek', 'atlantic', 'economist', 'dilbert'] recipes = ['newsweek', 'atlantic', 'economist', 'dilbert', 'portfolio']
import re import re
from libprs500.web.feeds.news import BasicNewsRecipe, CustomIndexRecipe from libprs500.web.feeds.news import BasicNewsRecipe, CustomIndexRecipe

View File

@ -0,0 +1,45 @@
#!/usr/bin/env python
## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
'''
portfolio.com
'''
from libprs500.web.feeds.news import BasicNewsRecipe
class Portfolio(BasicNewsRecipe):
    # Recipe that builds its content from the portfolio.com RSS feeds.

    title = 'Portfolio'
    timefmt = ' [%a, %b %d, %Y]'

    # Always treat the feeds as carrying embedded article content
    # instead of guessing from the content length.
    use_embedded_content = True

    html2lrf_options = ['--ignore-tables']

    # (feed title, feed URL) pairs
    feeds = [
        ('Business Travel',
         'http://feeds.portfolio.com/portfolio/businesstravel'),
        ('Careers',
         'http://feeds.portfolio.com/portfolio/careers'),
        ('Culture and Lifestyle',
         'http://feeds.portfolio.com/portfolio/cultureandlifestyle'),
        ('Executives',
         'http://feeds.portfolio.com/portfolio/executives'),
        ('News and Markets',
         'http://feeds.portfolio.com/portfolio/news'),
        ('Business Spin',
         'http://feeds.portfolio.com/portfolio/businessspin'),
        ('Capital',
         'http://feeds.portfolio.com/portfolio/capital'),
        ('Daily Brief',
         'http://feeds.portfolio.com/portfolio/dailybrief'),
        ('Market Movers',
         'http://feeds.portfolio.com/portfolio/marketmovers'),
        ('Mixed Media',
         'http://feeds.portfolio.com/portfolio/mixedmedia'),
        ('Odd Numbers',
         'http://feeds.portfolio.com/portfolio/oddnumbers'),
        ('Playbook',
         'http://feeds.portfolio.com/portfolio/playbook'),
        ('Tech Observer',
         'http://feeds.portfolio.com/portfolio/thetechobserver'),
        ('World According to ...',
         'http://feeds.portfolio.com/portfolio/theworldaccordingto'),
    ]

View File

@ -163,7 +163,7 @@ class FeedTemplate(Template):
<a class="article" href="${article.url}">${article.title}</a> <a class="article" href="${article.url}">${article.title}</a>
<span class="article_date">${article.localtime.strftime(" [%a, %d %b %H:%M]")}</span> <span class="article_date">${article.localtime.strftime(" [%a, %d %b %H:%M]")}</span>
<p class="article_decription" py:if="article.summary"> <p class="article_decription" py:if="article.summary">
${Markup(article.summary)} ${Markup(cutoff(article.summary))}
</p> </p>
</li> </li>
</py:for> </py:for>
@ -172,5 +172,33 @@ class FeedTemplate(Template):
</html> </html>
''') ''')
def generate(self, feed): def generate(self, feed, cutoff):
return Template.generate(self, feed=feed) return Template.generate(self, feed=feed, cutoff=cutoff)
class EmbeddedContent(Template):
    # Template for a standalone page built from a single feed article:
    # the article title followed by whichever of its content/summary is
    # longer, inserted as markup.

    def __init__(self):
        # The page source is handed to the Template base class unchanged.
        page_source = '''\
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml"
xml:lang="en"
xmlns:xi="http://www.w3.org/2001/XInclude"
xmlns:py="http://genshi.edgewall.org/"
>
<head>
<title>${article.title}</title>
</head>
<body>
<h2>${article.title}</h2>
<div>
${Markup(article.content if len(article.content if article.content else '') > len(article.summary if article.summary else '') else article.summary)}
</div>
</body>
</html>
'''
        Template.__init__(self, page_source)

    def generate(self, article):
        # Expose the article to the template under the name "article".
        return Template.generate(self, article=article)