feeds2disk improved to the point of being able to download Newsweek. Added new Newsweek recipe.
parent 756de168fe
commit 2ccf260f7d
@@ -23,13 +23,14 @@ from gettext import GNUTranslations
from math import floor
from optparse import OptionParser as _OptionParser
from optparse import IndentedHelpFormatter
from logging import Formatter

from ttfquery import findsystem, describe

from libprs500.translations.msgfmt import make
from libprs500.ebooks.chardet import detect
from libprs500.terminfo import TerminalController
terminal_controller = TerminalController()
terminal_controller = TerminalController(sys.stdout)

iswindows = 'win32' in sys.platform.lower() or 'win64' in sys.platform.lower()
isosx = 'darwin' in sys.platform.lower()
@@ -51,6 +52,25 @@ __builtin__.__dict__['_'] = lambda s: s
class CommandLineError(Exception):
    pass

class ColoredFormatter(Formatter):

    def format(self, record):
        ln = record.__dict__['levelname']
        col = ''
        if ln == 'CRITICAL':
            col = terminal_controller.YELLOW
        elif ln == 'ERROR':
            col = terminal_controller.RED
        elif ln in ['WARN', 'WARNING']:
            col = terminal_controller.BLUE
        elif ln == 'INFO':
            col = terminal_controller.GREEN
        elif ln == 'DEBUG':
            col = terminal_controller.CYAN
        record.__dict__['levelname'] = col + record.__dict__['levelname'] + terminal_controller.NORMAL
        return Formatter.format(self, record)


def setup_cli_handlers(logger, level):
    logger.setLevel(level)
    if level == logging.WARNING:
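A usage sketch (not part of the commit itself) showing how the new ColoredFormatter is meant to be attached to a logging handler, mirroring the feeds2disk main() hunk further down; the message text is illustrative only:

    import logging, sys
    from libprs500 import ColoredFormatter   # class added in this hunk

    handler = logging.StreamHandler(sys.stdout)
    # format() wraps the level name in terminal color escapes before delegating
    # to logging.Formatter, so INFO renders green, ERROR red, and so on.
    handler.setFormatter(ColoredFormatter('%(levelname)s: %(message)s'))
    logger = logging.getLogger('feeds2disk')
    logger.addHandler(handler)
    logger.setLevel(logging.DEBUG)
    logger.info('feeds2disk starting')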
@@ -187,9 +207,9 @@ def extract(path, dir):
        raise Exception('Unknown archive type')
    extractor(path, dir)

def browser():
def browser(honor_time=False):
    opener = mechanize.Browser()
    opener.set_handle_refresh(True)
    opener.set_handle_refresh(True, honor_time=honor_time)
    opener.set_handle_robots(False)
    opener.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; i686 Linux; en_US; rv:1.8.0.4) Gecko/20060508 Firefox/1.5.0.4')]
    return opener

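A short sketch (not from the commit) of how the patched browser() helper is typically driven; the URL is just an example taken from the Newsweek recipe below:

    from libprs500 import browser

    # honor_time=False makes mechanize ignore Refresh-header delays, which is
    # what a news downloader fetching many pages wants.
    br = browser(honor_time=False)
    raw = br.open('http://www.newsweek.com').read()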
@@ -147,7 +147,7 @@ class Delegator(object):
            d.parent = self
            methods = d.getMethods()
            self.delegatedMethods += methods
            for m in methods:
            for m in methods:
                setattr(self, m, getattr(d, m))

    """
@@ -595,6 +595,11 @@ class OPFCreator(OPF):
            self.uid = mi.uid

    def create_manifest(self, entries):
        '''
        Create <manifest>
        @param entries: List of (URL, mime-type)
        @type entries: list of 2-tuples
        '''
        doc = dom.parseString(self.soup.__str__('UTF-8').strip())
        package = doc.documentElement
        manifest = doc.createElement('manifest')
@@ -616,6 +621,11 @@ class OPFCreator(OPF):


    def create_spine(self, entries):
        '''
        Create the <spine> element. Must first call L{create_manifest}.
        @param: List of paths
        @type param: list of strings
        '''
        doc = dom.parseString(self.soup.__str__('UTF-8').strip())
        package = doc.documentElement
        spine = doc.createElement('spine')

@@ -74,6 +74,34 @@ def options(option_parser):
        opts.extend(opt._long_opts)
    return opts

def opts_and_words(name, op, words):
    opts = ' '.join(options(op))
    words = [repr(w) for w in words]
    words = ' '.join(words)
    return '_'+name+'()'+\
'''
{
    local cur prev opts
    COMPREPLY=()
    cur="${COMP_WORDS[COMP_CWORD]}"
    opts="%s"
    words="%s"

    case "${cur}" in
        -* )
            COMPREPLY=( $(compgen -W "${opts}" -- ${cur}) )
            return 0
            ;;
        * )
            COMPREPLY=( $(compgen -W "${words}" -- ${cur}) )
            return 0
            ;;
    esac

}
complete -F _'''%(opts, words) + name + ' ' + name +"\n\n"


def opts_and_exts(name, op, exts):
    opts = ' '.join(options(op))
    exts.extend([i.upper() for i in exts])
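A sketch (not from the commit) of what the new opts_and_words() helper produces, mirroring the setup_completion() call in the next hunk; opts_and_words itself is assumed to be in scope since it is defined in this file, and the print is purely illustrative:

    from libprs500.web.feeds.main import option_parser as feeds2disk
    from libprs500.web.feeds.recipes import titles as feed_titles

    # Returns a bash function _feeds2disk() that completes option switches for
    # words starting with '-' and builtin recipe titles otherwise, followed by
    # a 'complete -F _feeds2disk feeds2disk' registration line. setup_completion()
    # writes it to /etc/bash_completion.d/libprs500.
    snippet = opts_and_words('feeds2disk', feeds2disk, feed_titles)
    print snippet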
@@ -135,6 +163,8 @@ def setup_completion(fatal_errors):
        from libprs500.gui2.lrf_renderer.main import option_parser as lrfviewerop
        from libprs500.ebooks.lrf.pdf.reflow import option_parser as pdfhtmlop
        from libprs500.ebooks.mobi.reader import option_parser as mobioeb
        from libprs500.web.feeds.main import option_parser as feeds2disk
        from libprs500.web.feeds.recipes import titles as feed_titles

        f = open_file('/etc/bash_completion.d/libprs500')

@@ -159,6 +189,7 @@ def setup_completion(fatal_errors):
        f.write(opts_and_exts('lrfviewer', lrfviewerop, ['lrf']))
        f.write(opts_and_exts('pdfrelow', pdfhtmlop, ['pdf']))
        f.write(opts_and_exts('mobi2oeb', mobioeb, ['mobi', 'prc']))
        f.write(opts_and_words('feeds2disk', feeds2disk, feed_titles))
        f.write('''
_prs500_ls()
{

@@ -15,7 +15,7 @@
'''
Used to run jobs in parallel in separate processes.
'''
import re, sys, tempfile, os, cPickle, cStringIO, traceback, atexit, binascii, time, subprocess
import re, sys, tempfile, os, cPickle, traceback, atexit, binascii, time, subprocess
from functools import partial


@@ -106,7 +106,7 @@ class TerminalController:
        except: return

        # If the stream isn't a tty, then assume it has no capabilities.
        if not term_stream.isatty(): return
        if not hasattr(term_stream, 'isatty') or not term_stream.isatty(): return

        # Check the terminal type. If we fail, then assume that the
        # terminal has no capabilities.
@@ -97,7 +97,8 @@ class WorkerThread(threading.Thread):
                    )
            except:
                request.exception = True
                self.resultQueue.put((request, sys.exc_info()))
                import traceback
                self.resultQueue.put((request, traceback.format_exc()))

    def dismiss(self):
        """Sets a flag to tell the thread to exit when done with current job.
@@ -27,6 +27,7 @@ class Article(object):
    time_offset = datetime.now() - datetime.utcnow()

    def __init__(self, id, title, url, summary, published, content):
        self.downloaded = False
        self.id = id
        self.title = title
        self.url = url
@@ -103,7 +104,7 @@ class Feed(object):
            if delta.days*24*3600 + delta.seconds <= 24*3600*self.oldest_article:
                self.articles.append(article)
            else:
                self.logger.debug('Skipping article %s as it is too old.'%title)
                self.logger.debug('Skipping article %s (%s) from feed %s as it is too old.'%(title, article.localtime.strftime('%a, %d %b, %Y %H:%M'), self.title))

    def __iter__(self):
        return iter(self.articles)
@@ -118,6 +119,12 @@ class Feed(object):

    def __str__(self):
        return repr(self)

    def __bool__(self):
        for article in self:
            if getattr(article, 'downloaded', False):
                return True
        return False


def feed_from_xml(raw_xml, title=None, oldest_article=7, max_articles_per_feed=100):
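An observational sketch, not part of the commit: the new __bool__ mirrors the py:if="feed" guard added to IndexTemplate further down, so feeds with no downloaded articles are skipped in the index; note that Python 2 truth testing calls __nonzero__, so an alias would be needed for a plain `if feed:` to reach this method.

    # Equivalent check spelled out with only attributes shown in this hunk:
    def feed_has_content(feed):
        # True only if at least one article in the feed was actually downloaded
        return any(getattr(article, 'downloaded', False) for article in feed)

    # Inside Feed, `__nonzero__ = __bool__` would make `if feed:` work on Python 2.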
@@ -18,21 +18,24 @@ from libprs500.web.feeds.news import BasicNewsRecipe
''''''

import sys, os, logging
from libprs500.web.recipes import get_feed, compile_recipe
from libprs500.web.feeds.recipes import get_builtin_recipe, compile_recipe, titles
from libprs500.web.fetch.simple import option_parser as _option_parser


def option_parser(usage='''\
%prog [options] ARG
%%prog [options] ARG

%prog parsers an online source of articles, like an RSS or ATOM feed and
%%prog parsers an online source of articles, like an RSS or ATOM feed and
fetches the article contents organized in a nice hierarchy.

ARG can be one of:
file name - %prog will try to load a recipe from the file
builtin recipe title - %prog will load the builtin recipe and use it to fetch the feed. For e.g. Newsweek or "The BBC" or "The New York Times"
recipe as a string - $prog will load the recipe directly from the string arg.
'''):
file name - %%prog will try to load a recipe from the file
builtin recipe title - %%prog will load the builtin recipe and use it to fetch the feed. For e.g. Newsweek or "The BBC" or "The New York Times"
recipe as a string - %%prog will load the recipe directly from the string arg.

Available builtin recipes are:
%s
'''%(unicode(list(titles))[1:-1])):
    p = _option_parser(usage=usage)
    p.remove_option('--max-recursions')
    p.remove_option('--base-dir')
@@ -86,7 +89,7 @@ def main(args=sys.argv, notification=None, handler=None):
    else:
        notification = no_progress_bar

    if len(args) != 2:
    if len(args) != 2 and opts.feeds is None:
        p.print_help()
        return 1

@@ -96,11 +99,16 @@ def main(args=sys.argv, notification=None, handler=None):
    else:
        try:
            if os.access(args[1], os.R_OK):
                recipe = compile_recipe(open(args[1]).read())
                try:
                    recipe = compile_recipe(open(args[1]).read())
                except:
                    import traceback
                    traceback.print_exc()
                    return 1
            else:
                raise Exception('')
                raise Exception('not file')
        except:
            recipe = get_feed(args[1])
            recipe = get_builtin_recipe(args[1])
            if recipe is None:
                recipe = compile_recipe(args[1])

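In short, the patched main() resolves ARG in three steps; a condensed sketch (not the literal code) using the same helpers:

    import os
    from libprs500.web.feeds.recipes import get_builtin_recipe, compile_recipe

    def resolve_recipe(arg):                       # condensed sketch of the logic above
        if os.access(arg, os.R_OK):                # 1. a readable file: compile its source
            return compile_recipe(open(arg).read())
        recipe = get_builtin_recipe(arg)           # 2. a builtin recipe title, e.g. 'Newsweek'
        if recipe is None:
            recipe = compile_recipe(arg)           # 3. the argument itself is recipe source
        return recipe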
@@ -111,9 +119,10 @@ def main(args=sys.argv, notification=None, handler=None):
        return 1

    if handler is None:
        from libprs500 import ColoredFormatter
        handler = logging.StreamHandler(sys.stdout)
        handler.setLevel(logging.DEBUG if opts.debug else logging.INFO if opts.verbose else logging.WARN)
        handler.setFormatter(logging.Formatter('%(levelname)s: %(message)s'))
        handler.setFormatter(ColoredFormatter('%(levelname)s: %(message)s\n')) # The trailing newline is need because of the progress bar
        logging.getLogger('feeds2disk').addHandler(handler)

    recipe = recipe(opts, p, notification)

@@ -17,11 +17,13 @@
The backend to parse feeds and create HTML that can then be converted
to an ebook.
'''
import logging, os, cStringIO, traceback, time
import logging, os, cStringIO, time, itertools, traceback
import urlparse

from libprs500 import browser
from libprs500 import browser, __appname__
from libprs500.ebooks.BeautifulSoup import BeautifulSoup
from libprs500.ebooks.metadata.opf import OPFCreator
from libprs500.ebooks.metadata import MetaInformation
from libprs500.web.feeds import feed_from_xml, templates
from libprs500.web.fetch.simple import option_parser as web2disk_option_parser
from libprs500.web.fetch.simple import RecursiveFetcher
@@ -35,7 +37,10 @@ class BasicNewsRecipe(object):

    #: The title to use for the ebook
    #: @type: string
    title = 'Unknown News Source'
    title = _('Unknown News Source')

    #: The author of this recipe
    __author__ = _('Unknown')

    #: Maximum number of articles to download from each feed
    #: @type: integer
@@ -55,17 +60,18 @@ class BasicNewsRecipe(object):
    delay = 0

    #: Number of simultaneous downloads. Set to 1 if the server is picky.
    #: Automatically reduced to 1 if L{delay} > 0
    #: @type: integer
    simultaneous_downloads = 5

    #: Timeout for fetching files from server in seconds
    #: @type: integer
    timeout = 10
    timeout = 120

    #: The format string for the date shown on the first page
    #: By default: Day Name Day Number Month Name Year
    #: @type: string
    timefmt = ' %a, %d %b %Y'
    timefmt = ' [%a, %d %b %Y]'

    #: Max number of characters in the short description.
    #: @type: integer
@@ -102,7 +108,7 @@ class BasicNewsRecipe(object):

    #: List of options to pass to html2lrf, to customize generation of LRF ebooks.
    #: @type: list of strings
    html2lrf_options = []
    html2lrf_options = ['--page-break-before', '$']

    #: List of tags to be removed. Specified tags are removed from downloaded HTML.
    #: A tag is specified as a dictionary of the form::
@@ -114,9 +120,23 @@ class BasicNewsRecipe(object):
    #: U{http://www.crummy.com/software/BeautifulSoup/documentation.html#The basic find method: findAll(name, attrs, recursive, text, limit, **kwargs)}
    #: A common example::
    #: remove_tags = [dict(name='div', attrs={'class':'advert'})]
    #: This will remove all <div class="advert"> tags and all their children from the downloaded HTML.
    #: This will remove all <div class="advert"> tags and all their children from the downloaded HTML.
    #: @type: list
    remove_tags = []

    #: Remove all tags that occur after the specified tag.
    #: For the format for specifying a tag see L{remove_tags}.
    #: For example, C{remove_tags_after = [dict(id='content')]} will remove all
    #: tags after the element with id C{content}.
    remove_tags_after = None

    #: Keep only the specified tags and their children.
    #: For the format for specifying tags see L{remove_tags}.
    #: If this list is not empty, then the <body> element will be emptied and re-filled with
    #: the tags that match the entries in this list.
    #: @type: list
    keep_only_tags = []

    #: List of regexp substitution rules to run on the downloaded HTML. Each element of the
    #: list should be a two element tuple. The first element of the tuple should
    #: be a compiled regular expression and the second a callable that takes
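A minimal illustrative recipe (not part of the commit) exercising the new tag-filtering attributes in the same style as the Newsweek recipe added below; the site name, feed URL and CSS classes are placeholders:

    import re
    from libprs500.web.feeds.news import BasicNewsRecipe

    class ExampleSite(BasicNewsRecipe):        # hypothetical recipe, for illustration
        title = 'Example Site'
        feeds = [('Front Page', 'http://example.com/rss')]   # placeholder feed

        # Empty <body> and keep only the main article container
        keep_only_tags = [dict(name='div', id='content')]
        # Strip scripts and anything whose class contains 'ad'
        remove_tags = [dict(name=['script', 'noscript']),
                       dict(name='div', attrs={'class': re.compile('ad')})]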
@@ -126,6 +146,13 @@ class BasicNewsRecipe(object):

    # See the built-in profiles for examples of these settings.

    def get_cover_url(self):
        '''
        Return a URL to the cover image for this issue or None.
        @rtype: string or None
        '''
        return getattr(self, 'cover_url', None)

    def get_feeds(self):
        '''
        Return a list of RSS feeds to fetch for this profile. Each element of the list
@@ -156,7 +183,21 @@ class BasicNewsRecipe(object):

    def preprocess_html(self, soup):
        '''
        This function is called with the source of each downloaded HTML file.
        This function is called with the source of each downloaded HTML file, before
        it is parsed for links and images.
        It can be used to do arbitrarily powerful pre-processing on the HTML.
        @param soup: A U{BeautifulSoup<http://www.crummy.com/software/BeautifulSoup/documentation.html>}
                     instance containing the downloaded HTML.
        @type soup: A U{BeautifulSoup<http://www.crummy.com/software/BeautifulSoup/documentation.html>} instance
        @return: It must return soup (after having done any needed preprocessing)
        @rtype: A U{BeautifulSoup<http://www.crummy.com/software/BeautifulSoup/documentation.html>} instance
        '''
        return soup

    def postprocess_html(self, soup):
        '''
        This function is called with the source of each downloaded HTML file, after
        it is parsed for links and images.
        It can be used to do arbitrarily powerful pre-processing on the HTML.
        @param soup: A U{BeautifulSoup<http://www.crummy.com/software/BeautifulSoup/documentation.html>}
                     instance containing the downloaded HTML.
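A small override sketch (not from the commit) of the new postprocess_html hook, in the spirit of the Newsweek recipe added below; the 'pagination' class is taken from that recipe:

    from libprs500.web.feeds.news import BasicNewsRecipe

    class CleanedRecipe(BasicNewsRecipe):          # hypothetical subclass
        def postprocess_html(self, soup):
            # Remove pager widgets after links and images have been processed
            for div in soup.findAll('div', 'pagination'):
                div.extract()
            return soup                            # the hook must return the soup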
@@ -210,6 +251,7 @@ class BasicNewsRecipe(object):

        self.browser = self.get_browser()
        self.image_map, self.image_counter = {}, 1
        self.css_map = {}

        web2disk_cmdline = [ 'web2disk',
                             '--timeout', str(self.timeout),
@@ -233,14 +275,18 @@ class BasicNewsRecipe(object):
            web2disk_cmdline.extend(['--filter-regexp', reg])

        self.web2disk_options = web2disk_option_parser().parse_args(web2disk_cmdline)[0]
        self.web2disk_options.remove_tags = self.remove_tags
        self.web2disk_options.preprocess_regexps = self.preprocess_regexps
        self.web2disk_options.preprocess_html = self.preprocess_html
        for extra in ('keep_only_tags', 'remove_tags', 'preprocess_regexps',
                      'preprocess_html', 'remove_tags_after', 'postprocess_html'):
            setattr(self.web2disk_options, extra, getattr(self, extra))

        if self.delay > 0:
            self.simultaneous_downloads = 1

        self.navbar = templates.NavBarTemplate()
        self.max_articles_per_feed -= 1
        self.html2lrf_options.append('--use-spine')
        self.failed_downloads = []
        self.partial_failures = []

    def download(self):
        '''
@@ -250,9 +296,26 @@ class BasicNewsRecipe(object):
        @return: Path to index.html
        @rtype: string
        '''
        self.report_progress(0, _('Initialized'))
        self.report_progress(0, _('Trying to download cover...'))
        self.download_cover()
        res = self.build_index()
        self.cleanup()
        self.report_progress(1, _('Download finished'))
        if self.failed_downloads:
            self.logger.warning(_('Failed to download the following articles:'))
            for feed, article, debug in self.failed_downloads:
                self.logger.warning(article.title+_(' from ')+feed.title)
                self.logger.debug(article.url)
                self.logger.debug(debug)
        if self.partial_failures:
            self.logger.warning(_('Failed to download parts of the following articles:'))
            for feed, atitle, aurl, debug in self.partial_failures:
                self.logger.warning(atitle + _(' from ') + feed)
                self.logger.debug(aurl)
                self.logger.warning(_('\tFailed links:'))
                for l, tb in debug:
                    self.logger.warning(l)
                    self.logger.debug(tb)
        return res

    def feeds2index(self, feeds):
@@ -294,11 +357,14 @@ class BasicNewsRecipe(object):
        return logger, out

    def fetch_article(self, url, dir, logger):
        fetcher = RecursiveFetcher(self.web2disk_options, logger, self.image_map)
        fetcher = RecursiveFetcher(self.web2disk_options, logger, self.image_map, self.css_map)
        fetcher.base_dir = dir
        fetcher.current_dir = dir
        fetcher.show_progress = False
        return fetcher.start_fetch(url)
        res, path, failures = fetcher.start_fetch(url), fetcher.downloaded_paths, fetcher.failed_links
        if not res:
            raise Exception(_('Could not fetch article. Run with --debug to see the reason'))
        return res, path, failures

    def build_index(self):
        self.report_progress(0, _('Fetching feeds...'))
@@ -331,58 +397,111 @@ class BasicNewsRecipe(object):
                req.stream = stream
                req.feed = feed
                req.article = article
                req.feed_dir = feed_dir
                self.jobs.append(req)


        self.jobs_done = 0
        tp = ThreadPool(self.simultaneous_downloads)
        for req in self.jobs:
            tp.putRequest(req, block=True, timeout=0)


        self.report_progress(0, _('Starting download [%d thread(s)]...')%self.simultaneous_downloads)
        while True:
            try:
                tp.poll(True)
                tp.poll()
                time.sleep(0.1)
            except NoResultsPending:
                break

        html = self.feed2index(feed)
        open(os.path.join(feed_dir, 'index.html'), 'wb').write(html)
        for f, feed in enumerate(feeds):
            html = self.feed2index(feed)
            feed_dir = os.path.join(self.output_dir, 'feed_%d'%f)
            open(os.path.join(feed_dir, 'index.html'), 'wb').write(html)

        self.create_opf(feeds)
        self.report_progress(1, _('Feeds downloaded to %s')%index)
        return index

    def download_cover(self):
        self.cover_path = None
        try:
            cu = self.get_cover_url()
        except Exception, err:
            cu = None
            self.logger.error(_('Could not download cover: %s')%str(err))
            self.logger.debug(traceback.format_exc())
        if cu is not None:
            ext = cu.rpartition('.')[-1]
            ext = ext.lower() if ext else 'jpg'
            self.report_progress(1, _('Downloading cover from %s')%cu)
            cpath = os.path.join(self.output_dir, 'cover.'+ext)
            cfile = open(cpath, 'wb')
            cfile.write(self.browser.open(cu).read())
            self.cover_path = cpath



    def create_opf(self, feeds, dir=None):
        if dir is None:
            dir = self.output_dir
        mi = MetaInformation(self.title + time.strftime(self.timefmt), [__appname__])
        opf = OPFCreator(mi)
        opf_path = os.path.join(dir, 'index.opf')

        cpath = getattr(self, 'cover_path', None)
        if cpath is not None and os.access(cpath, os.R_OK):
            opf.cover = cpath

        entries = ['index.html']
        for i, f in enumerate(feeds):
            entries.append('feed_%d/index.html'%i)
            for j, a in enumerate(f):
                if getattr(a, 'downloaded', False):
                    adir = 'feed_%d/article_%d/'%(i, j)
                    entries.append('%sindex.html'%adir)
                    for sp in a.sub_pages:
                        prefix = os.path.commonprefix([opf_path, sp])
                        relp = sp[len(prefix):]
                        entries.append(relp.replace(os.sep, '/'))

        opf.create_manifest(itertools.izip(entries, itertools.repeat('text/html')))
        opf.create_spine(entries)
        opf.write(open(opf_path, 'wb'))


    def article_downloaded(self, request, result):
        index = os.path.join(os.path.dirname(result), 'index.html')
        os.rename(result, index)
        index = os.path.join(os.path.dirname(result[0]), 'index.html')
        os.rename(result[0], index)
        src = open(index, 'rb').read().decode('utf-8')
        f, a = request.requestID
        soup = BeautifulSoup(src)
        body = soup.find('body')
        if body is not None:
            top = self.navbar.generate(False, f, a, len(request.feed), not self.has_single_feed).render(doctype='xhtml')
            bottom = self.navbar.generate(True, f, a, len(request.feed), not self.has_single_feed).render(doctype='xhtml')
            top = BeautifulSoup(top).find('div')
            bottom = BeautifulSoup(bottom).find('div')
            body.insert(0, top)
            body.insert(len(body.contents), bottom)
            open(index, 'wb').write(unicode(soup).encode('utf-8'))

        article = request.article
        self.logger.debug(_('\nDownloaded article %s from %s\n%s')%(article.title, article.url, request.stream.getvalue()))
        article.url = result
        self.logger.debug(_('\nDownloaded article %s from %s\n%s')%(article.title, article.url, request.stream.getvalue().decode('utf-8', 'ignore')))
        article.url = result[0]
        article.downloaded = True
        article.sub_pages = result[1][1:]
        self.jobs_done += 1
        self.report_progress(float(self.jobs_done)/len(self.jobs), _('Article downloaded: %s')%article.title)
        if result[2]:
            self.partial_failures.append((request.feed.title, article.title, article.url, result[2]))

    def error_in_article_download(self, request, exc_info):
    def error_in_article_download(self, request, traceback):
        self.jobs_done += 1
        self.logger.error(_('Failed to download article: %s from %s')%(request.article.title, request.article.url))
        self.logger.debug(traceback.format_exc(*exc_info))
        self.logger.debug(request.stream.getvalue())
        self.logger.error(_('Failed to download article: %s from %s\n')%(request.article.title, request.article.url))
        debug = request.stream.getvalue().decode('utf-8', 'ignore')
        self.logger.debug(debug)
        self.logger.debug(traceback)
        self.logger.debug('\n')
        self.report_progress(float(self.jobs_done)/len(self.jobs), _('Article download failed: %s')%request.article.title)

        self.failed_downloads.append((request.feed.title, request.article, debug))

    def parse_feeds(self):
        '''
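For orientation, a hedged sketch of the data create_opf() hands to the OPF writer; the paths follow the feed_%d/article_%d layout used above:

    import itertools

    # Hypothetical download: two feeds, one downloaded article each.
    entries = ['index.html',
               'feed_0/index.html', 'feed_0/article_0/index.html',
               'feed_1/index.html', 'feed_1/article_0/index.html']
    # Every entry is declared in the OPF manifest as text/html...
    manifest = itertools.izip(entries, itertools.repeat('text/html'))
    # ...and the same ordering becomes the reading order in the spine.
    spine = list(entries)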
@@ -404,5 +523,3 @@ class BasicNewsRecipe(object):
                max_articles_per_feed=self.max_articles_per_feed))

        return parsed_feeds


src/libprs500/web/feeds/recipes/__init__.py (new file, 79 lines)
@@ -0,0 +1,79 @@
#!/usr/bin/env python

## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
'''
Builtin recipes.
'''
recipes = ['newsweek']

import re
from libprs500.web.feeds.news import BasicNewsRecipe
from libprs500.ebooks.lrf.web.profiles import DefaultProfile, FullContentProfile
from libprs500.ebooks.lrf.web import available_profiles

basic_recipes = (BasicNewsRecipe, DefaultProfile, FullContentProfile)
basic_recipe_names = (i.__name__ for i in basic_recipes)


#: Compiled builtin recipe/profile classes
def load_recipe(module, package='libprs500.web.feeds.recipes'):
    module = __import__(package+'.'+module, fromlist=[''])
    for attr in dir(module):
        obj = getattr(module, attr)
        if type(obj) is not type:
            continue
        recipe = False
        for b in obj.__bases__:
            if b in basic_recipes:
                recipe = True
                break
        if not recipe:
            continue
        if obj not in basic_recipes:
            return obj


recipes = [load_recipe(i) for i in recipes]

def compile_recipe(src):
    '''
    Compile the code in src and return the first object that is a recipe or profile.
    @return: Recipe/Profile class or None, if no such class was found in C{src}
    '''
    locals = {}
    exec src in globals(), locals
    for obj in locals.values():
        if type(obj) is type and obj.__name__ not in basic_recipe_names:
            for base in obj.__bases__:
                if base in basic_recipes:
                    return obj

    return None


def get_builtin_recipe(title):
    '''
    Return a builtin recipe/profile class whoose title == C{title} or None if no such
    recipe exists.

    @type title: string
    @rtype: class or None
    '''
    for r in recipes:
        if r.title == title:
            return r

titles = set([r.title for r in recipes])
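A brief sketch (not in the commit) of how the new helpers in this module are meant to be called; the inline recipe source is a placeholder in the style of the old module's test string:

    from libprs500.web.feeds.recipes import compile_recipe, get_builtin_recipe, titles

    print sorted(titles)                          # e.g. ['Newsweek'] with this commit
    recipe_class = get_builtin_recipe('Newsweek')

    # compile_recipe() also accepts recipe source code passed as a string
    src = 'class Temp(BasicNewsRecipe):\n    title = "temp"'   # placeholder recipe
    temp_class = compile_recipe(src)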
src/libprs500/web/feeds/recipes/newsweek.py (new file, 90 lines)
@@ -0,0 +1,90 @@
#!/usr/bin/env python

## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
import re
from libprs500.web.feeds.news import BasicNewsRecipe
from libprs500.ebooks.BeautifulSoup import BeautifulSoup

class Newsweek(BasicNewsRecipe):

    title = 'Newsweek'
    __author__ = 'Kovid Goyal'

    feeds = [
        ('Top News', 'http://feeds.newsweek.com/newsweek/TopNews',),
        'http://feeds.newsweek.com/newsweek/columnists/StevenLevy',
        ('Politics', 'http://feeds.newsweek.com/headlines/politics'),
        ('Health', 'http://feeds.newsweek.com/headlines/health'),
        ('Business', 'http://feeds.newsweek.com/headlines/business'),
        ('Science and Technology', 'http://feeds.newsweek.com/headlines/technology/science'),
        ('National News', 'http://feeds.newsweek.com/newsweek/NationalNews'),
        ('World News', 'http://feeds.newsweek.com/newsweek/WorldNews'),
        'http://feeds.newsweek.com/newsweek/Columnists/ChristopherDickey',
        'http://feeds.newsweek.com/newsweek/Columnists/FareedZakaria',
        ('Iraq', 'http://feeds.newsweek.com/newsweek/iraq'),
        ('Society', 'http://feeds.newsweek.com/newsweek/society'),
        ('Entertainment', 'http://feeds.newsweek.com/newsweek/entertainment'),
        'http://feeds.newsweek.com/newsweek/columnists/GeorgeFWill',
        'http://feeds.newsweek.com/newsweek/columnists/AnnaQuindlen',
        ]
    # For testing
    feeds = feeds[:2]
    max_articles_per_feed = 1

    keep_only_tags = [dict(name='div', id='content')]

    remove_tags = [
        dict(name=['script', 'noscript']),
        dict(name='div', attrs={'class':['ad', 'SocialLinks', 'SocialLinksDiv', 'channel', 'bot', 'nav', 'top', 'EmailArticleBlock']}),
        dict(name='div', attrs={'class':re.compile('box')}),
        dict(id=['ToolBox', 'EmailMain', 'EmailArticle', ])
        ]

    recursions = 1
    match_regexps = [r'http://www.newsweek.com/id/\S+/page/\d+']

    def postprocess_html(self, soup):
        divs = list(soup.findAll('div', 'pagination'))
        divs[0].extract()
        if len(divs) > 1:
            soup.find('body')['style'] = 'page-break-after:avoid'
            divs[1].extract()

            h1 = soup.find('h1')
            if h1:
                h1.extract()
            ai = soup.find('div', 'articleInfo')
            ai.extract()
        else:
            soup.find('body')['style'] = 'page-break-before:always; page-break-after:avoid;'
        return soup

    def get_current_issue(self):
        from urllib2 import urlopen # For some reason mechanize fails
        home = urlopen('http://www.newsweek.com').read()
        soup = BeautifulSoup(home)
        img = soup.find('img', alt='Current Magazine')
        if img and img.parent.has_key('href'):
            return urlopen(img.parent['href']).read()

    def get_cover_url(self):
        ci = self.get_current_issue()
        if ci is not None:
            soup = BeautifulSoup(ci)
            img = soup.find(alt='Cover')
            if img is not None and img.has_key('src'):
                small = img['src']
                return small.replace('coversmall', 'coverlarge')
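A hedged usage sketch, not part of the diff, of driving the new recipe through the feeds2disk entry point defined in main() above:

    import sys
    from libprs500.web.feeds.main import main as feeds2disk

    # Roughly equivalent to running `feeds2disk Newsweek` from the shell.
    sys.exit(feeds2disk(['feeds2disk', 'Newsweek']))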
@@ -104,7 +104,7 @@ class IndexTemplate(Template):
                <p style="text-align:right">${datetime.now().strftime(datefmt)}</p>
                <ul>
                    <py:for each="i, feed in enumerate(feeds)">
                        <li id="feed_${str(i)}">
                        <li py:if="feed" id="feed_${str(i)}">
                            <a class="feed" href="${'feed_%d/index.html'%i}">${feed.title}</a>
                        </li>
                    </py:for>
@@ -136,7 +136,7 @@ class FeedTemplate(Template):
                ${style}
                </style>
            </head>
            <body>
            <body style="page-break-before:always">
                <h2>${feed.title}</h2>
                <py:if test="feed.image">
                    <div class="feed_image">
@@ -144,7 +144,7 @@ class FeedTemplate(Template):
                    </div>
                </py:if>
                <ul>
                    <py:for each="i, article in enumerate(feed)">
                    <py:for each="i, article in enumerate(feed.articles)">
                        <li id="${'article_%d'%i}" py:if="getattr(article, 'downloaded', False)">
                            <a class="article" href="${article.url}">${article.title}</a>
                            <span class="article_date">${article.localtime.strftime(" [%a, %d %b %H:%M]")}</span>

@@ -17,12 +17,12 @@ Fetch a webpage and its links recursively. The webpages are saved to disk in
UTF-8 encoding with any charset declarations removed.
'''
from __future__ import with_statement
import sys, socket, os, urlparse, codecs, logging, re, time, copy, urllib2, threading
import sys, socket, os, urlparse, codecs, logging, re, time, copy, urllib2, threading, traceback
from urllib import url2pathname
from httplib import responses

from libprs500 import setup_cli_handlers, browser, sanitize_file_name, OptionParser
from libprs500.ebooks.BeautifulSoup import BeautifulSoup
from libprs500.ebooks.BeautifulSoup import BeautifulSoup, Tag
from libprs500.ebooks.chardet import xml_to_unicode

class FetchError(Exception):
@@ -37,10 +37,11 @@ def basename(url):
    return res

def save_soup(soup, target):
    for meta in soup.findAll('meta', content=True):
    nm = Tag(soup, '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />')
    for meta in soup.find('meta', content=True):
        if 'charset' in meta['content']:
            meta.extract()
    f = codecs.open(target, 'w', 'utf8')
            meta.replaceWith(nm)
    f = codecs.open(target, 'w', 'utf-8')
    f.write(unicode(soup))
    f.close()

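A condensed sketch (not the committed code) of the intent of the reworked save_soup(): swap any charset-bearing meta tag for an explicit UTF-8 declaration and write the document out as UTF-8. Only BeautifulSoup calls that appear in the hunk above are used:

    import codecs
    from libprs500.ebooks.BeautifulSoup import BeautifulSoup, Tag

    def save_soup_sketch(soup, target):
        nm = Tag(soup, '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />')
        for meta in soup.findAll('meta', content=True):
            if 'charset' in meta['content']:
                meta.replaceWith(nm)       # normalize the declared encoding
        out = codecs.open(target, 'w', 'utf-8')
        out.write(unicode(soup))
        out.close()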
@@ -55,7 +56,7 @@ class RecursiveFetcher(object):
    #               )
    CSS_IMPORT_PATTERN = re.compile(r'\@import\s+url\((.*?)\)', re.IGNORECASE)

    def __init__(self, options, logger, image_map={}):
    def __init__(self, options, logger, image_map={}, css_map={}):
        self.logger = logger
        self.base_dir = os.path.abspath(os.path.expanduser(options.dir))
        if not os.path.exists(self.base_dir):
@@ -74,20 +75,44 @@ class RecursiveFetcher(object):
        self.filemap = {}
        self.imagemap = image_map
        self.imagemap_lock = threading.RLock()
        self.stylemap = {}
        self.stylemap = css_map
        self.stylemap_lock = threading.RLock()
        self.downloaded_paths = []
        self.current_dir = self.base_dir
        self.files = 0
        self.preprocess_regexps = getattr(options, 'preprocess_regexps', [])
        self.remove_tags = getattr(options, 'remove_tags', [])
        self.remove_tags_after = getattr(options, 'remove_tags_after', None)
        self.keep_only_tags = getattr(options, 'keep_only_tags', [])
        self.preprocess_html_ext = getattr(options, 'preprocess_html', lambda soup: soup)
        self.postprocess_html_ext= getattr(options, 'postprocess_html', lambda soup: soup)
        self.download_stylesheets = not options.no_stylesheets
        self.show_progress = True
        self.failed_links = []


    def get_soup(self, src):
        nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
        nmassage.extend(self.preprocess_regexps)
        soup = BeautifulSoup(xml_to_unicode(src, self.verbose)[0], markupMassage=nmassage)

        if self.keep_only_tags:
            body = Tag(soup, 'body')
            for spec in self.keep_only_tags:
                for tag in soup.find('body').findAll(**spec):
                    body.insert(len(body.contents), tag)
            soup.find('body').replaceWith(body)

        if self.remove_tags_after is not None:
            tag = soup.find(**self.remove_tags_after)
            while tag is not None and tag.name != 'body':
                after = tag.nextSibling
                while after is not None:
                    ns = after.nextSibling
                    after.extract()
                    after = ns
                tag = tag.parent

        for kwds in self.remove_tags:
            for tag in soup.findAll(**kwds):
                tag.extract()
@@ -105,7 +130,12 @@ class RecursiveFetcher(object):
        except urllib2.URLError, err:
            if hasattr(err, 'code') and responses.has_key(err.code):
                raise FetchError, responses[err.code]
            raise err
            if err.reason[0] == 104: # Connection reset by peer
                self.logger.debug('Connection reset by peer retrying in 1 second.')
                time.sleep(1)
                f = self.browser.open(url)
            else:
                raise err
        finally:
            self.last_fetch_at = time.time()
        return f
@@ -146,9 +176,10 @@ class RecursiveFetcher(object):
                iurl = tag['href']
                if not urlparse.urlsplit(iurl).scheme:
                    iurl = urlparse.urljoin(baseurl, iurl, False)
                if self.stylemap.has_key(iurl):
                    tag['href'] = self.stylemap[iurl]
                    continue
                with self.stylemap_lock:
                    if self.stylemap.has_key(iurl):
                        tag['href'] = self.stylemap[iurl]
                        continue
                try:
                    f = self.fetch_url(iurl)
                except Exception, err:
@@ -157,7 +188,8 @@ class RecursiveFetcher(object):
                    continue
                c += 1
                stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
                self.stylemap[iurl] = stylepath
                with self.stylemap_lock:
                    self.stylemap[iurl] = stylepath
                open(stylepath, 'wb').write(f.read())
                tag['href'] = stylepath
            else:
@@ -168,9 +200,10 @@ class RecursiveFetcher(object):
                    iurl = m.group(1)
                    if not urlparse.urlsplit(iurl).scheme:
                        iurl = urlparse.urljoin(baseurl, iurl, False)
                    if self.stylemap.has_key(iurl):
                        ns.replaceWith(src.replace(m.group(1), self.stylemap[iurl]))
                        continue
                    with self.stylemap_lock:
                        if self.stylemap.has_key(iurl):
                            ns.replaceWith(src.replace(m.group(1), self.stylemap[iurl]))
                            continue
                    try:
                        f = self.fetch_url(iurl)
                    except Exception, err:
@@ -179,7 +212,8 @@ class RecursiveFetcher(object):
                        continue
                    c += 1
                    stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
                    self.stylemap[iurl] = stylepath
                    with self.stylemap_lock:
                        self.stylemap[iurl] = stylepath
                    open(stylepath, 'wb').write(f.read())
                    ns.replaceWith(src.replace(m.group(1), stylepath))

@@ -214,7 +248,7 @@ class RecursiveFetcher(object):
                open(imgpath, 'wb').write(f.read())
                tag['src'] = imgpath

    def absurl(self, baseurl, tag, key):
    def absurl(self, baseurl, tag, key, filter=True):
        iurl = tag[key]
        parts = urlparse.urlsplit(iurl)
        if not parts.netloc and not parts.path:
@@ -224,7 +258,7 @@ class RecursiveFetcher(object):
        if not self.is_link_ok(iurl):
            self.logger.debug('Skipping invalid link: %s', iurl)
            return None
        if not self.is_link_wanted(iurl):
        if filter and not self.is_link_wanted(iurl):
            self.logger.debug('Filtered link: '+iurl)
            return None
        return iurl
@@ -256,12 +290,12 @@ class RecursiveFetcher(object):
        prev_dir = self.current_dir
        try:
            self.current_dir = diskpath
            for tag in soup.findAll(lambda tag: tag.name.lower()=='a' and tag.has_key('href')):
            for tag in soup.findAll('a', href=True):
                if self.show_progress:
                    print '.',
                    sys.stdout.flush()
                sys.stdout.flush()
                iurl = self.absurl(baseurl, tag, 'href')
                iurl = self.absurl(baseurl, tag, 'href', filter=recursion_level != 0)
                if not iurl:
                    continue
                nurl = self.normurl(iurl)
@@ -293,6 +327,7 @@ class RecursiveFetcher(object):
                        self.process_stylesheets(soup, f.geturl())

                        res = os.path.join(linkdiskpath, basename(iurl))
                        self.downloaded_paths.append(res)
                        self.filemap[nurl] = res
                        if recursion_level < self.max_recursions:
                            self.logger.debug('Processing links...')
@@ -301,9 +336,11 @@ class RecursiveFetcher(object):
                            self.process_return_links(soup, iurl)
                            self.logger.debug('Recursion limit reached. Skipping links in %s', iurl)

                        save_soup(soup, res)
                        save_soup(self.postprocess_html_ext(soup), res)

                        self.localize_link(tag, 'href', res)
                except Exception, err:
                    self.failed_links.append((iurl, traceback.format_exc()))
                    self.logger.warning('Could not fetch link %s', iurl)
                    self.logger.debug('Error: %s', str(err), exc_info=True)
                finally:
@@ -1,63 +0,0 @@
#!/usr/bin/env python
## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
'''
Contains recipes for various common news sources and websites.
'''
import re
from libprs500.web.feeds.news import BasicNewsRecipe

_basic_recipes = (BasicNewsRecipe,)
_basic_recipe_names = (i.__name__ for i in _basic_recipes)

def compile_recipe(src):
    '''
    Compile the code in src and return the first object that is
    '''
    locals = {}
    exec src in globals(), locals
    for obj in locals.values():
        if type(obj) is type and obj.__name__ not in _basic_recipe_names:
            for base in obj.__bases__:
                if base in _basic_recipes:
                    return obj

    return None


def get_feed(title):
    '''
    Return a builtin recipe class whoose title == C{title} or None if no such
    recipe exists.

    @type title: string
    @rtype: class or None
    '''
    if isinstance(_feeds[0], basestring):
        for i, val in enumerate(_feeds):
            recipe = compile_recipe(val)
            if recipe is None:
                raise RuntimeError('The builtin Recipe #%d is invalid.'%i)
            _feeds[i] = recipe

    for recipe in _feeds:
        if recipe.title == title:
            return recipe

    return None


#: Recipes to be used with feeds2disk
_feeds = ['class Temp(BasicNewsRecipe):\n\ttitle="temp"']