mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Support for content embedded feeds in feeds2disk
This commit is contained in:
parent
ef40f3abab
commit
7ddfcd5711
@ -164,6 +164,15 @@ class Feed(object):
|
|||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
def has_embedded_content(self):
    '''
    Guess whether this feed carries full articles inline.

    For every article the longer of its content and its summary is
    measured (a missing value counts as zero length). The feed is judged
    to have embedded content when the total exceeds 2000 characters per
    article, i.e. when the average is above 2000.
    '''
    total = sum(
        max(len(article.content or ''), len(article.summary or ''))
        for article in self
    )
    threshold = 2000 * len(self)
    return total > threshold
|
||||||
|
|
||||||
|
|
||||||
def feed_from_xml(raw_xml, title=None, oldest_article=7,
|
def feed_from_xml(raw_xml, title=None, oldest_article=7,
|
||||||
max_articles_per_feed=100, get_article_url=lambda item: item.get('link', None)):
|
max_articles_per_feed=100, get_article_url=lambda item: item.get('link', None)):
|
||||||
|
@ -13,6 +13,8 @@
|
|||||||
## You should have received a copy of the GNU General Public License along
|
## You should have received a copy of the GNU General Public License along
|
||||||
## with this program; if not, write to the Free Software Foundation, Inc.,
|
## with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
from libprs500.ebooks.lrf.web.profiles import FullContentProfile
|
||||||
|
from libprs500.ptempfile import PersistentTemporaryFile
|
||||||
'''
|
'''
|
||||||
The backend to parse feeds and create HTML that can then be converted
|
The backend to parse feeds and create HTML that can then be converted
|
||||||
to an ebook.
|
to an ebook.
|
||||||
@ -100,7 +102,18 @@ class BasicNewsRecipe(object):
|
|||||||
#: using cp1252. If None, try to detect the encoding.
|
#: using cp1252. If None, try to detect the encoding.
|
||||||
encoding = None
|
encoding = None
|
||||||
|
|
||||||
|
#: Normally we try to guess if a feed has full articles embedded in it
|
||||||
|
#: based on the length of the embedded content. If C{None}, then the
|
||||||
|
#: default guessing is used. If C{True} then we always assume the feed has
|
||||||
|
#: embedded content and if False we always assume the feed does not have
|
||||||
|
#: embedded content.
|
||||||
|
use_embedded_content = None
|
||||||
|
|
||||||
#: Specify any extra CSS that should be added to downloaded HTML files
|
#: Specify any extra CSS that should be added to downloaded HTML files
|
||||||
|
#: It will be inserted into C{<style></style>} just before the closing
|
||||||
|
#: C{</head>} tag thereby overriding all CSS except that which is
|
||||||
|
#: declared using the style attribute on individual HTML tags.
|
||||||
|
#: type: string
|
||||||
extra_css = None
|
extra_css = None
|
||||||
|
|
||||||
#: List of regular expressions that determines which links to follow
|
#: List of regular expressions that determines which links to follow
|
||||||
@ -388,6 +401,24 @@ class BasicNewsRecipe(object):
|
|||||||
templ = templates.IndexTemplate()
|
templ = templates.IndexTemplate()
|
||||||
return templ.generate(self.title, self.timefmt, feeds).render(doctype='xhtml')
|
return templ.generate(self.title, self.timefmt, feeds).render(doctype='xhtml')
|
||||||
|
|
||||||
|
@classmethod
def description_limiter(cls, src):
    '''
    Shorten an article summary to roughly C{cls.summary_length} characters.

    The cut position is moved forward to the first C{;} or C{>} found at
    or after C{summary_length}, as long as that character lies within 50
    characters of it (presumably so that an HTML entity or tag is not
    split in half -- confirm against the feed template). If no such
    character is close enough, the text is cut right after
    C{summary_length}.

    An ellipsis is appended only when something was actually removed, so
    summaries that already fit are returned unchanged.

    @param src: The summary text; may be empty or C{None}.
    @return: The (possibly truncated) summary, or an empty string when
             C{src} is empty or C{None}.
    '''
    if not src:
        return u''
    pos = cls.summary_length
    fuzz = 50
    # Candidate cut points; discard any that lie too far past the limit.
    si = src.find(';', pos)
    if si > 0 and si - pos > fuzz:
        si = -1
    gi = src.find('>', pos)
    if gi > 0 and gi - pos > fuzz:
        gi = -1
    npos = max(si, gi)
    if npos < 0:
        npos = pos
    ans = src[:npos+1]
    # Only mark the text as truncated when characters were really dropped.
    if len(ans) < len(src):
        ans += u'\u2026'
    return ans
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def feed2index(self, feed):
|
def feed2index(self, feed):
|
||||||
if feed.image_url is not None: # Download feed image
|
if feed.image_url is not None: # Download feed image
|
||||||
imgdir = os.path.join(self.output_dir, 'images')
|
imgdir = os.path.join(self.output_dir, 'images')
|
||||||
@ -408,7 +439,7 @@ class BasicNewsRecipe(object):
|
|||||||
self.image_map[feed.image_url] = img
|
self.image_map[feed.image_url] = img
|
||||||
|
|
||||||
templ = templates.FeedTemplate()
|
templ = templates.FeedTemplate()
|
||||||
return templ.generate(feed).render(doctype='xhtml')
|
return templ.generate(feed, self.description_limiter).render(doctype='xhtml')
|
||||||
|
|
||||||
|
|
||||||
def create_logger(self, feed_number, article_number):
|
def create_logger(self, feed_number, article_number):
|
||||||
@ -422,7 +453,7 @@ class BasicNewsRecipe(object):
|
|||||||
logger.addHandler(handler)
|
logger.addHandler(handler)
|
||||||
return logger, out
|
return logger, out
|
||||||
|
|
||||||
def fetch_article(self, url, dir, logger, f, a, num_of_feeds):
|
def _fetch_article(self, url, dir, logger, f, a, num_of_feeds):
|
||||||
fetcher = RecursiveFetcher(self.web2disk_options, logger, self.image_map, self.css_map, (url, f, a, num_of_feeds))
|
fetcher = RecursiveFetcher(self.web2disk_options, logger, self.image_map, self.css_map, (url, f, a, num_of_feeds))
|
||||||
fetcher.base_dir = dir
|
fetcher.base_dir = dir
|
||||||
fetcher.current_dir = dir
|
fetcher.current_dir = dir
|
||||||
@ -432,6 +463,20 @@ class BasicNewsRecipe(object):
|
|||||||
raise Exception(_('Could not fetch article. Run with --debug to see the reason'))
|
raise Exception(_('Could not fetch article. Run with --debug to see the reason'))
|
||||||
return res, path, failures
|
return res, path, failures
|
||||||
|
|
||||||
|
def fetch_article(self, url, dir, logger, f, a, num_of_feeds):
    '''
    Fetch the article at C{url} by delegating to L{_fetch_article} with
    the same arguments, returning whatever it returns.
    '''
    result = self._fetch_article(url, dir, logger, f, a, num_of_feeds)
    return result
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_embedded_article(self, article, dir, logger, f, a, num_of_feeds):
    '''
    Fetch an article whose content is embedded in the feed itself.

    The article is rendered through the C{EmbeddedContent} template,
    written to a persistent temporary HTML file, and that file is then
    fetched through L{_fetch_article} using a C{file:} URL.

    @param article: The feed article to render.
    @return: Whatever L{_fetch_article} returns for the temporary file.
    '''
    pt = PersistentTemporaryFile('_feeds2disk.html')
    templ = templates.EmbeddedContent()
    raw = templ.generate(article).render('html')
    # Close the handle explicitly so the data is on disk before the file
    # is fetched and the handle is not leaked if the write fails.
    out = open(pt.name, 'wb')
    try:
        out.write(raw)
    finally:
        out.close()
    pt.close()
    url = ('file:'+pt.name) if iswindows else ('file://'+pt.name)
    return self._fetch_article(url, dir, logger, f, a, num_of_feeds)
|
||||||
|
|
||||||
|
|
||||||
def build_index(self):
|
def build_index(self):
|
||||||
self.report_progress(0, _('Fetching feeds...'))
|
self.report_progress(0, _('Fetching feeds...'))
|
||||||
try:
|
try:
|
||||||
@ -447,6 +492,9 @@ class BasicNewsRecipe(object):
|
|||||||
feeds = feeds[:2]
|
feeds = feeds[:2]
|
||||||
self.has_single_feed = len(feeds) == 1
|
self.has_single_feed = len(feeds) == 1
|
||||||
|
|
||||||
|
if self.use_embedded_content is None:
|
||||||
|
self.use_embedded_content = feeds[0].has_embedded_content()
|
||||||
|
|
||||||
index = os.path.join(self.output_dir, 'index.html')
|
index = os.path.join(self.output_dir, 'index.html')
|
||||||
|
|
||||||
html = self.feeds2index(feeds)
|
html = self.feeds2index(feeds)
|
||||||
@ -459,6 +507,8 @@ class BasicNewsRecipe(object):
|
|||||||
os.makedirs(feed_dir)
|
os.makedirs(feed_dir)
|
||||||
|
|
||||||
for a, article in enumerate(feed):
|
for a, article in enumerate(feed):
|
||||||
|
if a >= self.max_articles_per_feed:
|
||||||
|
break
|
||||||
art_dir = os.path.join(feed_dir, 'article_%d'%a)
|
art_dir = os.path.join(feed_dir, 'article_%d'%a)
|
||||||
if not os.path.isdir(art_dir):
|
if not os.path.isdir(art_dir):
|
||||||
os.makedirs(art_dir)
|
os.makedirs(art_dir)
|
||||||
@ -467,7 +517,10 @@ class BasicNewsRecipe(object):
|
|||||||
url = self.print_version(article.url)
|
url = self.print_version(article.url)
|
||||||
except NotImplementedError:
|
except NotImplementedError:
|
||||||
url = article.url
|
url = article.url
|
||||||
req = WorkRequest(self.fetch_article, (url, art_dir, logger, f, a, len(feed)),
|
|
||||||
|
func, arg = (self.fetch_embedded_article, article) if self.use_embedded_content else \
|
||||||
|
(self.fetch_article, url)
|
||||||
|
req = WorkRequest(func, (arg, art_dir, logger, f, a, len(feed)),
|
||||||
{}, (f, a), self.article_downloaded,
|
{}, (f, a), self.article_downloaded,
|
||||||
self.error_in_article_download)
|
self.error_in_article_download)
|
||||||
req.stream = stream
|
req.stream = stream
|
||||||
@ -674,6 +727,7 @@ class Profile2Recipe(BasicNewsRecipe):
|
|||||||
self.simultaneous_downloads = 1
|
self.simultaneous_downloads = 1
|
||||||
BasicNewsRecipe.__init__(self, options, parser, progress_reporter)
|
BasicNewsRecipe.__init__(self, options, parser, progress_reporter)
|
||||||
self.browser = self.old_profile.browser
|
self.browser = self.old_profile.browser
|
||||||
|
self.use_embedded_content = isinstance(self.old_profile, FullContentProfile)
|
||||||
|
|
||||||
def parse_index(self):
|
def parse_index(self):
|
||||||
return self.old_profile.parse_feeds()
|
return self.old_profile.parse_feeds()
|
||||||
|
@ -17,7 +17,7 @@
|
|||||||
'''
|
'''
|
||||||
Builtin recipes.
|
Builtin recipes.
|
||||||
'''
|
'''
|
||||||
recipes = ['newsweek', 'atlantic', 'economist', 'dilbert']
|
recipes = ['newsweek', 'atlantic', 'economist', 'dilbert', 'portfolio']
|
||||||
|
|
||||||
import re
|
import re
|
||||||
from libprs500.web.feeds.news import BasicNewsRecipe, CustomIndexRecipe
|
from libprs500.web.feeds.news import BasicNewsRecipe, CustomIndexRecipe
|
||||||
|
45
src/libprs500/web/feeds/recipes/portfolio.py
Normal file
45
src/libprs500/web/feeds/recipes/portfolio.py
Normal file
@ -0,0 +1,45 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
|
||||||
|
## This program is free software; you can redistribute it and/or modify
|
||||||
|
## it under the terms of the GNU General Public License as published by
|
||||||
|
## the Free Software Foundation; either version 2 of the License, or
|
||||||
|
## (at your option) any later version.
|
||||||
|
##
|
||||||
|
## This program is distributed in the hope that it will be useful,
|
||||||
|
## but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
## GNU General Public License for more details.
|
||||||
|
##
|
||||||
|
## You should have received a copy of the GNU General Public License along
|
||||||
|
## with this program; if not, write to the Free Software Foundation, Inc.,
|
||||||
|
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
'''
|
||||||
|
portfolio.com
|
||||||
|
'''
|
||||||
|
|
||||||
|
from libprs500.web.feeds.news import BasicNewsRecipe
|
||||||
|
|
||||||
|
class Portfolio(BasicNewsRecipe):
    '''
    Recipe for portfolio.com, built from the site's per-section feeds.
    '''

    title = 'Portfolio'
    # Always treat the feeds as carrying the article content inline
    # instead of guessing from the content length.
    use_embedded_content = True
    timefmt = ' [%a, %b %d, %Y]'
    html2lrf_options = ['--ignore-tables']

    # (section title, feed URL) pairs
    feeds = [
        ('Business Travel',
         'http://feeds.portfolio.com/portfolio/businesstravel'),
        ('Careers',
         'http://feeds.portfolio.com/portfolio/careers'),
        ('Culture and Lifestyle',
         'http://feeds.portfolio.com/portfolio/cultureandlifestyle'),
        ('Executives',
         'http://feeds.portfolio.com/portfolio/executives'),
        ('News and Markets',
         'http://feeds.portfolio.com/portfolio/news'),
        ('Business Spin',
         'http://feeds.portfolio.com/portfolio/businessspin'),
        ('Capital',
         'http://feeds.portfolio.com/portfolio/capital'),
        ('Daily Brief',
         'http://feeds.portfolio.com/portfolio/dailybrief'),
        ('Market Movers',
         'http://feeds.portfolio.com/portfolio/marketmovers'),
        ('Mixed Media',
         'http://feeds.portfolio.com/portfolio/mixedmedia'),
        ('Odd Numbers',
         'http://feeds.portfolio.com/portfolio/oddnumbers'),
        ('Playbook',
         'http://feeds.portfolio.com/portfolio/playbook'),
        ('Tech Observer',
         'http://feeds.portfolio.com/portfolio/thetechobserver'),
        ('World According to ...',
         'http://feeds.portfolio.com/portfolio/theworldaccordingto'),
    ]
|
@ -163,7 +163,7 @@ class FeedTemplate(Template):
|
|||||||
<a class="article" href="${article.url}">${article.title}</a>
|
<a class="article" href="${article.url}">${article.title}</a>
|
||||||
<span class="article_date">${article.localtime.strftime(" [%a, %d %b %H:%M]")}</span>
|
<span class="article_date">${article.localtime.strftime(" [%a, %d %b %H:%M]")}</span>
|
||||||
<p class="article_decription" py:if="article.summary">
|
<p class="article_decription" py:if="article.summary">
|
||||||
${Markup(article.summary)}
|
${Markup(cutoff(article.summary))}
|
||||||
</p>
|
</p>
|
||||||
</li>
|
</li>
|
||||||
</py:for>
|
</py:for>
|
||||||
@ -172,5 +172,33 @@ class FeedTemplate(Template):
|
|||||||
</html>
|
</html>
|
||||||
''')
|
''')
|
||||||
|
|
||||||
def generate(self, feed):
|
def generate(self, feed, cutoff):
|
||||||
return Template.generate(self, feed=feed)
|
return Template.generate(self, feed=feed, cutoff=cutoff)
|
||||||
|
|
||||||
|
class EmbeddedContent(Template):
    '''
    Template that wraps a single article's embedded feed content in a
    standalone XHTML page. The longer of the article's content and
    summary is used as the page body.
    '''

    def __init__(self):
        page = '''\
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml"
xml:lang="en"
xmlns:xi="http://www.w3.org/2001/XInclude"
xmlns:py="http://genshi.edgewall.org/"

>
<head>
<title>${article.title}</title>
</head>

<body>
<h2>${article.title}</h2>
<div>
${Markup(article.content if len(article.content if article.content else '') > len(article.summary if article.summary else '') else article.summary)}
</div>
</body>
</html>
'''
        Template.__init__(self, page)

    def generate(self, article):
        '''
        Generate the page for C{article}, which is exposed to the
        template under the name C{article}.
        '''
        return Template.generate(self, article=article)
|
Loading…
x
Reference in New Issue
Block a user