Mirror of https://github.com/kovidgoyal/calibre.git

Added recipes for The Atlantic and Economist to feeds2disk

commit 5a76f5c2e1 (parent 0dde91062f)
@@ -64,7 +64,7 @@ class Feed(object):
                            max_articles_per_feed=100):
         entries = feed.entries
         feed = feed.feed
-        self.title = feed.get('title', 'Unknown feed') if not title else title
+        self.title = feed.get('title', _('Unknown feed')) if not title else title
         self.description = feed.get('description', '')
         image = feed.get('image', {})
         self.image_url = image.get('href', None)
@@ -83,6 +83,38 @@ class Feed(object):
                 break
             self.parse_article(item)
 
+    def populate_from_preparsed_feed(self, title, articles, oldest_article=7,
+                                     max_articles_per_feed=100):
+        self.title = title if title else _('Unknown feed')
+        self.description = ''
+        self.image_url = None
+        self.articles = []
+        self.added_articles = []
+
+        self.oldest_article = oldest_article
+        self.id_counter = 0
+
+        for item in articles:
+            if len(self.articles) >= max_articles_per_feed:
+                break
+            id = item.get('id', 'internal id#'+str(self.id_counter))
+            if id in self.added_articles:
+                return
+            self.added_articles.append(id)
+            self.id_counter += 1
+            published = time.gmtime(item.get('timestamp', time.time()))
+            title = item.get('title', _('Untitled article'))
+            link = item.get('url', None)
+            description = item.get('description', '')
+            content = item.get('content', '')
+            article = Article(id, title, link, description, published, content)
+            delta = datetime.utcnow() - article.utctime
+            if delta.days*24*3600 + delta.seconds <= 24*3600*self.oldest_article:
+                self.articles.append(article)
+            else:
+                self.logger.debug('Skipping article %s (%s) from feed %s as it is too old.'%(title, article.localtime.strftime('%a, %d %b, %Y %H:%M'), self.title))
+
+
     def parse_article(self, item):
         id = item.get('id', 'internal id#'+str(self.id_counter))
         if id in self.added_articles:
@@ -91,7 +123,7 @@ class Feed(object):
         self.id_counter += 1
         self.added_articles.append(id)
 
-        title = item.get('title', 'Untitled article')
+        title = item.get('title', _('Untitled article'))
         link = item.get('link', None)
         description = item.get('summary', None)
 
@@ -134,3 +166,17 @@ def feed_from_xml(raw_xml, title=None, oldest_article=7, max_articles_per_feed=1
                                  oldest_article=oldest_article,
                                  max_articles_per_feed=max_articles_per_feed)
     return pfeed
+
+def feeds_from_index(index, oldest_article=7, max_articles_per_feed=100):
+    '''
+    @param index: A parsed index as returned by L{BasicNewsRecipe.parse_index}.
+    @return: A list of L{Feed} objects.
+    @rtype: list
+    '''
+    feeds = []
+    for title, articles in index.items():
+        pfeed = Feed()
+        pfeed.populate_from_preparsed_feed(title, articles, oldest_article=oldest_article,
+                                           max_articles_per_feed=max_articles_per_feed)
+        feeds.append(pfeed)
+    return feeds
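For reference, the index argument is the dictionary produced by a recipe's parse_index (see the docstring added further down): feed titles mapped to lists of article dictionaries. A minimal sketch of calling it; the feed title, article fields and URL below are made-up examples:

    from libprs500.web.feeds import feeds_from_index

    # Hypothetical index, shaped like the return value of BasicNewsRecipe.parse_index
    index = {
        'Current Issue': [
            {'title': 'A sample article', 'url': 'http://example.com/print/1',
             'date': '1 Jan 2008', 'description': 'Short summary', 'content': ''},
        ],
    }

    # Each key becomes a Feed object populated via populate_from_preparsed_feed
    for feed in feeds_from_index(index, oldest_article=7, max_articles_per_feed=100):
        print feed.title, len(feed.articles)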
@@ -21,11 +21,11 @@ import logging, os, cStringIO, time, traceback
 import urlparse
 
 from libprs500 import browser, __appname__
-from libprs500.ebooks.BeautifulSoup import BeautifulSoup
+from libprs500.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Tag
 from libprs500.ebooks.metadata.opf import OPFCreator
 from libprs500.ebooks.metadata.toc import TOC
 from libprs500.ebooks.metadata import MetaInformation
-from libprs500.web.feeds import feed_from_xml, templates
+from libprs500.web.feeds import feed_from_xml, templates, feeds_from_index
 from libprs500.web.fetch.simple import option_parser as web2disk_option_parser
 from libprs500.web.fetch.simple import RecursiveFetcher
 from libprs500.threadpool import WorkRequest, ThreadPool, NoResultsPending
@@ -74,6 +74,11 @@ class BasicNewsRecipe(object):
     #: @type: string
     timefmt = ' [%a, %d %b %Y]'
 
+    #: List of feeds to download
+    #: Can be either C{[url1, url2, ...]} or C{[('title1', url1), ('title2', url2),...]}
+    #: @type: List of strings or list of 2-tuples
+    feeds = None
+
     #: Max number of characters in the short description.
     #: @type: integer
     summary_length = 500
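In a recipe, the two accepted shapes of the feeds attribute look like this (the class names and URLs are placeholders):

    from libprs500.web.feeds.news import BasicNewsRecipe

    class URLOnlyRecipe(BasicNewsRecipe):
        title = 'Example (plain URLs)'
        # Feed titles are taken from the feeds themselves
        feeds = ['http://example.com/rss.xml',
                 'http://example.com/other.xml']

    class TitledRecipe(BasicNewsRecipe):
        title = 'Example (explicit titles)'
        # The given titles override whatever the feeds call themselves
        feeds = [('News', 'http://example.com/rss.xml'),
                 ('Opinion', 'http://example.com/other.xml')]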
@@ -112,7 +117,7 @@ class BasicNewsRecipe(object):
 
     #: List of options to pass to html2lrf, to customize generation of LRF ebooks.
     #: @type: list of strings
-    html2lrf_options = ['--page-break-before', '$']
+    html2lrf_options = []
 
     #: List of tags to be removed. Specified tags are removed from downloaded HTML.
     #: A tag is specified as a dictionary of the form::
@@ -134,6 +139,12 @@ class BasicNewsRecipe(object):
     #: tags after the element with id C{content}.
     remove_tags_after = None
 
+    #: Remove all tags that occur before the specified tag.
+    #: For the format for specifying a tag see L{remove_tags}.
+    #: For example, C{remove_tags_before = dict(id='content')} will remove all
+    #: tags before the element with id C{content}.
+    remove_tags_before = None
+
     #: Keep only the specified tags and their children.
     #: For the format for specifying tags see L{remove_tags}.
     #: If this list is not empty, then the <body> element will be emptied and re-filled with
@@ -220,6 +231,26 @@ class BasicNewsRecipe(object):
         '''
         pass
 
+    def parse_index(self):
+        '''
+        This method should be implemented in recipes that parse a website
+        instead of feeds to generate a list of articles. Typical uses are for
+        news sources that have a "Print Edition" webpage that lists all the
+        articles in the current print edition. If this function is implemented,
+        it will be used in preference to L{parse_feeds}.
+        @rtype: dictionary
+        @return: A dictionary whose keys are feed titles and whose values are each
+        a list of dictionaries. Each list contains dictionaries of the form::
+            {
+            'title' : article title,
+            'url' : URL of print version,
+            'date' : The publication date of the article as a string,
+            'description' : A summary of the article
+            'content' : The full article (can be an empty string). This is used by FullContentProfile
+            }
+        '''
+        raise NotImplementedError
+
     def __init__(self, options, parser, progress_reporter):
         '''
         Initialize the recipe.
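A minimal sketch of a recipe that implements parse_index instead of listing feeds, modeled on the Atlantic and Economist recipes added later in this commit; the site URL and the class="item" markup are hypothetical:

    from libprs500.web.feeds.news import BasicNewsRecipe
    from libprs500.ebooks.BeautifulSoup import BeautifulSoup

    class PrintEditionExample(BasicNewsRecipe):

        title = 'Print Edition Example'
        INDEX = 'http://example.com/printedition'   # hypothetical index page

        def parse_index(self):
            src  = self.browser.open(self.INDEX).read()
            soup = BeautifulSoup(src, convertEntities=BeautifulSoup.HTML_ENTITIES)
            articles = []
            # Assume every article is linked from a div with class="item"
            for item in soup.findAll('div', attrs={'class':'item'}):
                a = item.find('a')
                if a and a.has_key('href'):
                    articles.append({
                        'title'       : self.tag_to_string(a),
                        'url'         : a['href'],
                        'date'        : '',
                        'description' : '',
                        'content'     : '',
                    })
            return {'Current Issue': articles}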
@@ -285,7 +316,7 @@ class BasicNewsRecipe(object):
 
         self.web2disk_options = web2disk_option_parser().parse_args(web2disk_cmdline)[0]
         for extra in ('keep_only_tags', 'remove_tags', 'preprocess_regexps',
-                      'preprocess_html', 'remove_tags_after'):
+                      'preprocess_html', 'remove_tags_after', 'remove_tags_before'):
             setattr(self.web2disk_options, extra, getattr(self, extra))
         self.web2disk_options.postprocess_html = [self._postprocess_html, self.postprocess_html]
 
@@ -293,7 +324,7 @@ class BasicNewsRecipe(object):
             self.simultaneous_downloads = 1
 
         self.navbar = templates.NavBarTemplate()
-        self.html2lrf_options.append('--use-spine')
+        self.html2lrf_options.extend(['--page-break-before', '$', '--use-spine'])
         self.failed_downloads = []
         self.partial_failures = []
 
@@ -389,7 +420,13 @@ class BasicNewsRecipe(object):
 
     def build_index(self):
         self.report_progress(0, _('Fetching feeds...'))
-        feeds = self.parse_feeds()
+        try:
+            feeds = feeds_from_index(self.parse_index(), oldest_article=self.oldest_article,
+                                     max_articles_per_feed=self.max_articles_per_feed)
+            self.report_progress(0, _('Got feeds from index page'))
+        except NotImplementedError:
+            feeds = self.parse_feeds()
+
         if self.test:
             feeds = feeds[:2]
         self.has_single_feed = len(feeds) == 1
@@ -485,28 +522,31 @@ class BasicNewsRecipe(object):
 
         entries = ['index.html']
         toc = TOC(base_path=dir)
-        for i, f in enumerate(feeds):
-            entries.append('feed_%d/index.html'%i)
-            feed = toc.add_item('feed_%d/index.html'%i, None, f.title)
+
+        def feed_index(num, parent):
+            f = feeds[num]
             for j, a in enumerate(f):
                 if getattr(a, 'downloaded', False):
-                    adir = 'feed_%d/article_%d/'%(i, j)
+                    adir = 'feed_%d/article_%d/'%(num, j)
                     entries.append('%sindex.html'%adir)
-                    feed.add_item('%sindex.html'%adir, None, a.title if a.title else 'Untitled article')
+                    parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'))
                     for sp in a.sub_pages:
                         prefix = os.path.commonprefix([opf_path, sp])
                         relp = sp[len(prefix):]
                         entries.append(relp.replace(os.sep, '/'))
 
+        if len(feeds) > 1:
+            for i, f in enumerate(feeds):
+                entries.append('feed_%d/index.html'%i)
+                feed = toc.add_item('feed_%d/index.html'%i, None, f.title)
+                feed_index(i, feed)
+        else:
+            entries.append('feed_%d/index.html'%0)
+            feed_index(0, toc)
+
         opf.create_spine(entries)
         opf.set_toc(toc)
 
         for i, f in enumerate(feeds):
 
             for j, a in enumerate(f):
                 if getattr(a, 'downloaded', False):
                     adir = 'feed_%d/article_%d/'%(i, j)
 
         opf.render(open(opf_path, 'wb'), open(ncx_path, 'wb'))
@@ -525,7 +565,7 @@ class BasicNewsRecipe(object):
 
             article = request.article
             self.logger.debug(_('\nDownloaded article %s from %s\n%s')%(article.title, article.url, request.stream.getvalue().decode('utf-8', 'ignore')))
-            article.url = result[0]
+            article.url = 'article_%d/index.html'%a
             article.downloaded = True
             article.sub_pages = result[1][1:]
             self.jobs_done += 1
@@ -563,3 +603,29 @@ class BasicNewsRecipe(object):
                                       max_articles_per_feed=self.max_articles_per_feed))
 
         return parsed_feeds
+
+    @classmethod
+    def tag_to_string(cls, tag, use_alt=True):
+        '''
+        Convenience method to take a BeautifulSoup Tag and extract the text from it
+        recursively, including any CDATA sections and alt tag attributes.
+        @param use_alt: If True try to use the alt attribute for tags that don't have any textual content
+        @type use_alt: boolean
+        @return: A unicode (possibly empty) object
+        @rtype: unicode string
+        '''
+        if not tag:
+            return ''
+        if isinstance(tag, basestring):
+            return tag
+        strings = []
+        for item in tag.contents:
+            if isinstance(item, (NavigableString, CData)):
+                strings.append(item.string)
+            elif isinstance(item, Tag):
+                res = cls.tag_to_string(item)
+                if res:
+                    strings.append(res)
+            elif use_alt and item.has_key('alt'):
+                strings.append(item['alt'])
+        return u''.join(strings)
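Typical use of tag_to_string from recipe code, on a made-up snippet:

    from libprs500.ebooks.BeautifulSoup import BeautifulSoup
    from libprs500.web.feeds.news import BasicNewsRecipe

    soup = BeautifulSoup('<p>An <b>example</b> headline</p>')
    # Recursively collects the text of the tag and its children
    print BasicNewsRecipe.tag_to_string(soup.find('p'))
    # -> u'An example headline'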
@@ -17,7 +17,7 @@
 '''
 Builtin recipes.
 '''
-recipes = ['newsweek']
+recipes = ['newsweek', 'atlantic', 'economist']
 
 import re
 from libprs500.web.feeds.news import BasicNewsRecipe
 
src/libprs500/web/feeds/recipes/atlantic.py (new file, 60 lines)
@@ -0,0 +1,60 @@
+#!/usr/bin/env python
+
+## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2 of the License, or
+## (at your option) any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License along
+## with this program; if not, write to the Free Software Foundation, Inc.,
+## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+'''
+theatlantic.com
+'''
+
+from libprs500.web.feeds.news import BasicNewsRecipe
+from libprs500.ebooks.BeautifulSoup import BeautifulSoup
+
+class TheAtlantic(BasicNewsRecipe):
+
+    title = 'The Atlantic'
+    INDEX = 'http://www.theatlantic.com/doc/current'
+
+    remove_tags_before = dict(name='div', id='storytop')
+    remove_tags = [dict(name='div', id='seealso')]
+    extra_css = '#bodytext {line-height: 1}'
+
+    def parse_index(self):
+        articles = []
+
+        src = self.browser.open(self.INDEX).read()
+        soup = BeautifulSoup(src, convertEntities=BeautifulSoup.HTML_ENTITIES)
+
+        issue = soup.find('span', attrs={'class':'issue'})
+        if issue:
+            self.timefmt = ' [%s]'%self.tag_to_string(issue).rpartition('|')[-1].strip().replace('/', '-')
+
+        for item in soup.findAll('div', attrs={'class':'item'}):
+            a = item.find('a')
+            if a and a.has_key('href'):
+                url = a['href']
+                url = 'http://www.theatlantic.com/'+url.replace('/doc', 'doc/print')
+                title = self.tag_to_string(a)
+                byline = item.find(attrs={'class':'byline'})
+                date = self.tag_to_string(byline) if byline else ''
+                description = ''
+                articles.append({
+                    'title':title,
+                    'date':date,
+                    'url':url,
+                    'description':description
+                })
+
+
+        return {'Current Issue' : articles }
src/libprs500/web/feeds/recipes/economist.py (new file, 57 lines)
@@ -0,0 +1,57 @@
+#!/usr/bin/env python
+
+## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
+## This program is free software; you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation; either version 2 of the License, or
+## (at your option) any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License along
+## with this program; if not, write to the Free Software Foundation, Inc.,
+## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+'''
+economist.com
+'''
+from libprs500.web.feeds.news import BasicNewsRecipe
+from libprs500.ebooks.BeautifulSoup import BeautifulSoup
+
+class Economist(BasicNewsRecipe):
+
+    title = 'The Economist'
+    oldest_article = 7.0
+    INDEX = 'http://www.economist.com/printedition'
+    remove_tags = [dict(name=['script', 'noscript', 'title'])]
+    remove_tags_before = dict(name=lambda tag: tag.name=='title' and tag.parent.name=='body')
+
+    def parse_index(self):
+        soup = BeautifulSoup(self.browser.open(self.INDEX).read(),
+                             convertEntities=BeautifulSoup.HTML_ENTITIES)
+        index_started = False
+        feeds = {}
+        key = None
+        for tag in soup.findAll(['h1', 'h2']):
+            text = ''.join(tag.findAll(text=True))
+            if tag.name == 'h1':
+                if 'Classified ads' in text:
+                    break
+                if 'The world this week' in text:
+                    index_started = True
+                if not index_started:
+                    continue
+                feeds[text] = []
+                key = text
+                continue
+            if key is None:
+                continue
+            a = tag.find('a', href=True)
+            if a is not None:
+                article = dict(title=text,
+                               url='http://www.economist.com'+a['href'].replace('displaystory', 'PrinterFriendly'),
+                               description='', content='', date='')
+                feeds[key].append(article)
+        return feeds
@@ -102,7 +102,7 @@ class IndexTemplate(Template):
                 <?python
                     from datetime import datetime
                 ?>
-                <p style="text-align:right">${datetime.now().strftime(datefmt)}</p>
+                <p style="text-align:right">${datetime.now().strftime(str(datefmt))}</p>
                 <ul>
                     <py:for each="i, feed in enumerate(feeds)">
                         <li py:if="feed" id="feed_${str(i)}">
@@ -144,6 +144,9 @@ class FeedTemplate(Template):
                     <img alt="${feed.image_alt}" src="${feed.image_url}" />
                 </div>
                 </py:if>
+                <div py:if="feed.description">
+                    ${feed.description}
+                </div>
                 <ul>
                     <py:for each="i, article in enumerate(feed.articles)">
                         <li id="${'article_%d'%i}" py:if="getattr(article, 'downloaded', False)">
@@ -85,6 +85,7 @@ class RecursiveFetcher(object):
         self.preprocess_regexps = getattr(options, 'preprocess_regexps', [])
         self.remove_tags = getattr(options, 'remove_tags', [])
         self.remove_tags_after = getattr(options, 'remove_tags_after', None)
+        self.remove_tags_before = getattr(options, 'remove_tags_before', None)
         self.keep_only_tags = getattr(options, 'keep_only_tags', [])
         self.preprocess_html_ext = getattr(options, 'preprocess_html', lambda soup: soup)
         self.postprocess_html_ext = getattr(options, 'postprocess_html', [])

@@ -105,15 +106,22 @@ class RecursiveFetcher(object):
                 body.insert(len(body.contents), tag)
             soup.find('body').replaceWith(body)
 
-        if self.remove_tags_after is not None:
-            tag = soup.find(**self.remove_tags_after)
+        def remove_beyond(tag, next):
             while tag is not None and tag.name != 'body':
-                after = tag.nextSibling
+                after = getattr(tag, next)
                 while after is not None:
-                    ns = after.nextSibling
+                    ns = getattr(tag, next)
                     after.extract()
                     after = ns
                 tag = tag.parent
 
+        if self.remove_tags_after is not None:
+            tag = soup.find(**self.remove_tags_after)
+            remove_beyond(tag, 'nextSibling')
+
+        if self.remove_tags_before is not None:
+            tag = soup.find(**self.remove_tags_before)
+            remove_beyond(tag, 'previousSibling')
+
         for kwds in self.remove_tags:
             for tag in soup.findAll(**kwds):
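A small standalone sketch of what the new remove_beyond helper does for remove_tags_after/remove_tags_before; the HTML and the id='content' selector are made up. Siblings on the given side of the matched tag are extracted, at every level up to <body>:

    from libprs500.ebooks.BeautifulSoup import BeautifulSoup

    soup = BeautifulSoup('<body><p id="a">before</p>'
                         '<div id="content">keep</div>'
                         '<p id="b">after</p></body>')

    def remove_beyond(tag, next):
        # Same logic as above: walk up towards <body>, extracting every
        # sibling on the 'next' side at each level
        while tag is not None and tag.name != 'body':
            after = getattr(tag, next)
            while after is not None:
                ns = getattr(tag, next)
                after.extract()
                after = ns
            tag = tag.parent

    # Equivalent of remove_tags_after = dict(id='content')
    remove_beyond(soup.find(id='content'), 'nextSibling')
    print soup.body
    # -> <body><p id="a">before</p><div id="content">keep</div></body>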