Initial implementation of feeds2disk

This commit is contained in:
parent 829267da44
commit ae28c0a164
@@ -1,29 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
    "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" >

<head>
    <meta name="author" content="Kovid Goyal" />
    <meta name="copyright" content="© 2008 Kovid Goyal" />
    <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
    <title></title>
    <link rel="stylesheet" type="text/css" href="styles/common.css" />
</head>

<body>

%body

<hr />
<div class="footer">
    <p>
        <a href="http://validator.w3.org/check?uri=referer">
            <img src="images/valid.png" alt="Valid XHTML 1.1" height="31" width="88" />
        </a><br />
        Created by Kovid Goyal © 2008
    </p>
</div>
</body>

</html>
@@ -190,14 +190,17 @@ class ProgressBar:
         self.cleared = 1 #: true if we haven't drawn the bar yet.

     def update(self, percent, message=''):
+        if isinstance(message, unicode):
+            message = message.encode('utf-8', 'ignore')
         if self.cleared:
             sys.stdout.write(self.header)
             self.cleared = 0
         n = int((self.width-10)*percent)
+        msg = message.center(self.width)
         sys.stdout.write(
             self.term.BOL + self.term.UP + self.term.CLEAR_EOL +
             (self.bar % (100*percent, '='*n, '-'*(self.width-10-n))) +
-            self.term.CLEAR_EOL + message.center(self.width))
+            self.term.CLEAR_EOL + msg)

     def clear(self):
         if not self.cleared:
331 src/libprs500/threadpool.py Normal file
@@ -0,0 +1,331 @@
"""Easy to use object-oriented thread pool framework.

A thread pool is an object that maintains a pool of worker threads to perform
time consuming operations in parallel. It assigns jobs to the threads
by putting them in a work request queue, where they are picked up by the
next available thread. This then performs the requested operation in the
background and puts the results in another queue.

The thread pool object can then collect the results from all threads from
this queue as soon as they become available or after all threads have
finished their work. It's also possible to define callbacks to handle
each result as it comes in.

The basic concept and some code was taken from the book "Python in a Nutshell"
by Alex Martelli, copyright 2003, ISBN 0-596-00188-6, from section 14.5
"Threaded Program Architecture". I wrapped the main program logic in the
ThreadPool class, added the WorkRequest class and the callback system and
tweaked the code here and there. Kudos also to Florent Aide for the exception
handling mechanism.

Basic usage:

>>> pool = ThreadPool(poolsize)
>>> requests = makeRequests(some_callable, list_of_args, callback)
>>> [pool.putRequest(req) for req in requests]
>>> pool.wait()

See the end of the module code for a brief, annotated usage example.

Website : http://chrisarndt.de/en/software/python/threadpool/
"""

__all__ = [
    'makeRequests',
    'NoResultsPending',
    'NoWorkersAvailable',
    'ThreadPool',
    'WorkRequest',
    'WorkerThread'
]

__author__ = "Christopher Arndt"
__version__ = "1.2.3"
__revision__ = "$Revision: 1.5 $"
__date__ = "$Date: 2006/06/23 12:32:25 $"
__license__ = 'Python license'

# standard library modules
import sys
import threading
import time  # needed by ThreadPool.wait()
import Queue

# exceptions
class NoResultsPending(Exception):
    """All work requests have been processed."""
    pass

class NoWorkersAvailable(Exception):
    """No worker threads available to process remaining requests."""
    pass

# classes
class WorkerThread(threading.Thread):
    """Background thread connected to the requests/results queues.

    A worker thread sits in the background and picks up work requests from
    one queue and puts the results in another until it is dismissed.
    """

    def __init__(self, requestsQueue, resultsQueue, **kwds):
        """Set up thread in daemonic mode and start it immediately.

        requestsQueue and resultsQueue are instances of Queue.Queue passed
        by the ThreadPool class when it creates a new worker thread.
        """

        threading.Thread.__init__(self, **kwds)
        self.setDaemon(1)
        self.workRequestQueue = requestsQueue
        self.resultQueue = resultsQueue
        self._dismissed = threading.Event()
        self.start()

    def run(self):
        """Repeatedly process the job queue until told to exit."""

        while not self._dismissed.isSet():
            # thread blocks here, if queue empty
            request = self.workRequestQueue.get()
            if self._dismissed.isSet():
                # if told to exit, return the work request we just picked up
                self.workRequestQueue.put(request)
                break # and exit
            try:
                self.resultQueue.put(
                    (request, request.callable(*request.args, **request.kwds))
                )
            except:
                request.exception = True
                self.resultQueue.put((request, sys.exc_info()))

    def dismiss(self):
        """Sets a flag to tell the thread to exit when done with current job.
        """

        self._dismissed.set()


class WorkRequest:
    """A request to execute a callable for putting in the request queue later.

    See the module function makeRequests() for the common case
    where you want to build several WorkRequests for the same callable
    but with different arguments for each call.
    """

    def __init__(self, callable, args=None, kwds=None, requestID=None,
            callback=None, exc_callback=None):
        """Create a work request for a callable and attach callbacks.

        A work request consists of a callable to be executed by a
        worker thread, a list of positional arguments, a dictionary
        of keyword arguments.

        A callback function can be specified, that is called when the results
        of the request are picked up from the result queue. It must accept
        two arguments, the request object and the results of the callable,
        in that order. If you want to pass additional information to the
        callback, just stick it on the request object.

        You can also give a callback for when an exception occurs. It should
        also accept two arguments, the work request and a tuple with the
        exception details as returned by sys.exc_info().

        requestID, if given, must be hashable since it is used by the
        ThreadPool object to store the results of that work request in a
        dictionary. It defaults to the return value of id(self).
        """

        if requestID is None:
            self.requestID = id(self)
        else:
            try:
                hash(requestID)
            except TypeError:
                raise TypeError("requestID must be hashable.")
            self.requestID = requestID
        self.exception = False
        self.callback = callback
        self.exc_callback = exc_callback
        self.callable = callable
        self.args = args or []
        self.kwds = kwds or {}


class ThreadPool:
    """A thread pool, distributing work requests and collecting results.

    See the module docstring for more information.
    """

    def __init__(self, num_workers, q_size=0):
        """Set up the thread pool and start num_workers worker threads.

        num_workers is the number of worker threads to start initially.
        If q_size > 0 the size of the work request queue is limited and
        the thread pool blocks when the queue is full and it tries to put
        more work requests in it (see putRequest method).
        """

        self.requestsQueue = Queue.Queue(q_size)
        self.resultsQueue = Queue.Queue()
        self.workers = []
        self.workRequests = {}
        self.createWorkers(num_workers)

    def createWorkers(self, num_workers):
        """Add num_workers worker threads to the pool."""

        for i in range(num_workers):
            self.workers.append(WorkerThread(self.requestsQueue,
                self.resultsQueue))

    def dismissWorkers(self, num_workers):
        """Tell num_workers worker threads to quit after their current task.
        """

        for i in range(min(num_workers, len(self.workers))):
            worker = self.workers.pop()
            worker.dismiss()

    def putRequest(self, request, block=True, timeout=0):
        """Put work request into work queue and save its id for later."""

        assert isinstance(request, WorkRequest)
        self.requestsQueue.put(request, block, timeout)
        self.workRequests[request.requestID] = request

    def poll(self, block=False):
        """Process any new results in the queue."""

        while True:
            # still results pending?
            if not self.workRequests:
                raise NoResultsPending
            # are there still workers to process remaining requests?
            elif block and not self.workers:
                raise NoWorkersAvailable
            try:
                # get back next results
                request, result = self.resultsQueue.get(block=block)
                # has an exception occurred?
                if request.exception and request.exc_callback:
                    request.exc_callback(request, result)
                # hand results to callback, if any
                if request.callback and not \
                       (request.exception and request.exc_callback):
                    request.callback(request, result)
                del self.workRequests[request.requestID]
            except Queue.Empty:
                break

    def wait(self, sleep=0):
        """Wait for results, blocking until all have arrived."""

        while 1:
            try:
                self.poll(True)
                time.sleep(sleep)
            except NoResultsPending:
                break

# helper functions
def makeRequests(callable, args_list, callback=None, exc_callback=None):
    """Create several work requests for same callable with different arguments.

    Convenience function for creating several work requests for the same
    callable where each invocation of the callable receives different values
    for its arguments.

    args_list contains the parameters for each invocation of callable.
    Each item in 'args_list' should be either a 2-item tuple of the list of
    positional arguments and a dictionary of keyword arguments or a single,
    non-tuple argument.

    See docstring for WorkRequest for info on callback and exc_callback.
    """

    requests = []
    for item in args_list:
        if isinstance(item, tuple):
            requests.append(
                WorkRequest(callable, item[0], item[1], callback=callback,
                    exc_callback=exc_callback)
            )
        else:
            requests.append(
                WorkRequest(callable, [item], None, callback=callback,
                    exc_callback=exc_callback)
            )
    return requests


################
# USAGE EXAMPLE
################

if __name__ == '__main__':
    import random
    import time

    # the work the threads will have to do (rather trivial in our example)
    def do_something(data):
        time.sleep(random.randint(1,5))
        result = round(random.random() * data, 5)
        # just to show off, we throw an exception once in a while
        if result > 3:
            raise RuntimeError("Something extraordinary happened!")
        return result

    # this will be called each time a result is available
    def print_result(request, result):
        print "**Result: %s from request #%s" % (result, request.requestID)

    # this will be called when an exception occurs within a thread
    def handle_exception(request, exc_info):
        print "Exception occurred in request #%s: %s" % \
          (request.requestID, exc_info[1])

    # assemble the arguments for each job to a list...
    data = [random.randint(1,10) for i in range(20)]
    # ... and build a WorkRequest object for each item in data
    requests = makeRequests(do_something, data, print_result, handle_exception)

    # or the other form of args_lists accepted by makeRequests: ((,), {})
    data = [((random.randint(1,10),), {}) for i in range(20)]
    requests.extend(
        makeRequests(do_something, data, print_result, handle_exception)
    )

    # we create a pool of 3 worker threads
    main = ThreadPool(3)

    # then we put the work requests in the queue...
    for req in requests:
        main.putRequest(req)
        print "Work request #%s added." % req.requestID
    # or shorter:
    # [main.putRequest(req) for req in requests]

    # ...and wait for the results to arrive in the result queue
    # by using ThreadPool.wait(). This would block until results for
    # all work requests have arrived:
    # main.wait()

    # instead we can poll for results while doing something else:
    i = 0
    while 1:
        try:
            main.poll()
            print "Main thread working..."
            time.sleep(0.5)
            if i == 10:
                print "Adding 3 more worker threads..."
                main.createWorkers(3)
            i += 1
        except KeyboardInterrupt:
            print "Interrupted!"
            break
        except NoResultsPending:
            print "All results collected."
            break
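The module above ships its own makeRequests() demo; the direct WorkRequest pattern that feeds2disk relies on later in this commit looks roughly like the hedged sketch below (the fetch/on_done/on_error names and the URLs are invented for illustration):

from libprs500.threadpool import ThreadPool, WorkRequest

def fetch(url):
    return 'contents of ' + url              # stand-in for a real download

def on_done(request, result):
    print 'done', request.requestID, result   # result callback: (request, result)

def on_error(request, exc_info):
    print 'failed', request.requestID, exc_info[1]  # exc_info as from sys.exc_info()

pool = ThreadPool(5)
for i, url in enumerate(['http://example.com/a', 'http://example.com/b']):
    # requestID must be hashable; it keys the pool's workRequests dictionary
    pool.putRequest(WorkRequest(fetch, (url,), {}, i, on_done, on_error))
pool.wait()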
@@ -17,4 +17,113 @@
 '''
 Contains the logic for parsing feeds.
 '''
+import time, logging
+from datetime import datetime
+
+from libprs500.web.feeds.feedparser import parse
+
+class Article(object):
+
+    time_offset = datetime.now() - datetime.utcnow()
+
+    def __init__(self, id, title, url, summary, published, content):
+        self.id = id
+        self.title = title
+        self.url = url
+        self.summary = summary
+        self.content = content
+        self.date = published
+        self.utctime = datetime(*self.date[:6])
+        self.localtime = self.utctime + self.time_offset
+
+    def __repr__(self):
+        return \
+(u'''\
+Title       : %s
+URL         : %s
+Summary     : %s
+Date        : %s
+Has content : %s
+'''%(self.title, self.url, self.summary[:20]+'...', self.localtime.strftime('%a, %d %b, %Y %H:%M'),
+     bool(self.content))).encode('utf-8')
+
+    def __str__(self):
+        return repr(self)
+
+
+class Feed(object):
+
+    def __init__(self):
+        '''
+        Parse a feed into articles.
+        '''
+        self.logger = logging.getLogger('feeds2disk')
+
+    def populate_from_feed(self, feed, title=None, oldest_article=7,
+                           max_articles_per_feed=100):
+        entries = feed.entries
+        feed = feed.feed
+        self.title = feed.get('title', 'Unknown feed') if not title else title
+        self.description = feed.get('description', '')
+        image = feed.get('image', {})
+        self.image_url = image.get('href', None)
+        self.image_width = image.get('width', 88)
+        self.image_height = image.get('height', 31)
+        self.image_alt = image.get('title', '')
+
+        self.articles = []
+        self.id_counter = 0
+        self.added_articles = []
+
+        self.oldest_article = oldest_article
+
+        for item in entries:
+            if len(self.articles) > max_articles_per_feed:
+                break
+            self.parse_article(item)
+
+    def parse_article(self, item):
+        id = item.get('id', 'internal id#'+str(self.id_counter))
+        if id in self.added_articles:
+            return
+        published = item.get('date_parsed', time.gmtime())
+        self.id_counter += 1
+        self.added_articles.append(id)
+
+        title = item.get('title', 'Untitled article')
+        link = item.get('link', None)
+        description = item.get('summary', None)
+
+        content = '\n'.join(i.value for i in item.get('content', []))
+        if not content.strip():
+            content = None
+
+        article = Article(id, title, link, description, published, content)
+        delta = datetime.utcnow() - article.utctime
+        if delta.days*24*3600 + delta.seconds <= 24*3600*self.oldest_article:
+            self.articles.append(article)
+        else:
+            self.logger.debug('Skipping article %s as it is too old.'%title)
+
+    def __iter__(self):
+        return iter(self.articles)
+
+    def __len__(self):
+        return len(self.articles)
+
+    def __repr__(self):
+        res = [('%20s\n'%'').replace(' ', '_')+repr(art) for art in self]
+
+        return '\n'+'\n'.join(res)+'\n'
+
+    def __str__(self):
+        return repr(self)
+
+
+def feed_from_xml(raw_xml, title=None, oldest_article=7, max_articles_per_feed=100):
+    feed = parse(raw_xml)
+    pfeed = Feed()
+    pfeed.populate_from_feed(feed, title=title,
+                             oldest_article=oldest_article,
+                             max_articles_per_feed=max_articles_per_feed)
+    return pfeed
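A hedged usage sketch of the parsing layer above (the feed.xml filename is invented; feed_from_xml, Feed and Article are the objects defined in this hunk):

from libprs500.web.feeds import feed_from_xml

raw = open('feed.xml', 'rb').read()    # hypothetical file holding RSS/Atom XML
feed = feed_from_xml(raw, title='Example', oldest_article=2, max_articles_per_feed=10)
for article in feed:                    # Feed.__iter__ yields Article objects
    print article.title, article.url, article.localtime.strftime('%d %b %Y')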
@@ -17,7 +17,7 @@ from libprs500.web.feeds.news import BasicNewsRecipe
 ## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 ''''''

-import sys, os
+import sys, os, logging
 from libprs500.web.recipes import get_feed, compile_recipe
 from libprs500.web.fetch.simple import option_parser as _option_parser
@@ -53,26 +53,38 @@ If you specify this option, any argument to %prog is ignored and a default recip
     p.add_option('--lrf', default=False, action='store_true', help='Optimize fetching for subsequent conversion to LRF.')
     p.add_option('--recursions', default=0, type='int',
                  help=_('Number of levels of links to follow on webpages that are linked to from feeds. Default: %default'))
+    p.add_option('--output-dir', default=os.getcwd(),
+                 help=_('The directory in which to store the downloaded feeds. Defaults to the current directory.'))
+    p.add_option('--no-progress-bar', dest='progress_bar', default=True, action='store_false',
+                 help=_("Don't show the progress bar"))
+    p.add_option('--debug', action='store_true', default=False,
+                 help='Very verbose output, useful for debugging.')
+
     return p

-def simple_progress_bar(*args):
-    print '%d%%'%(args[0]*100),
+def simple_progress_bar(percent, msg):
+    print '%d%%'%(percent*100),
     sys.stdout.flush()

-def main(args=sys.argv, notification=None):
+def no_progress_bar(percent, msg):
+    print msg
+
+def main(args=sys.argv, notification=None, handler=None):
     p = option_parser()
     opts, args = p.parse_args(args)

     if notification is None:
         from libprs500.terminfo import TerminalController, ProgressBar
         term = TerminalController(sys.stdout)
-        try:
-            pb = ProgressBar(term, _('Fetching feeds...'))
-            notification = pb.update
-        except ValueError:
-            notification = simple_progress_bar
-            print _('Fetching feeds...')
+        if opts.progress_bar:
+            try:
+                pb = ProgressBar(term, _('Fetching feeds...'))
+                notification = pb.update
+            except ValueError:
+                notification = simple_progress_bar
+                print _('Fetching feeds...')
+        else:
+            notification = no_progress_bar

     if len(args) != 2:
         p.print_help()
@@ -98,11 +110,15 @@ def main(args=sys.argv, notification=None):
         print args[1], 'is an invalid recipe'
         return 1

-    recipe = recipe(opts, p, notification)
-    index = recipe.download()
+    if handler is None:
+        handler = logging.StreamHandler(sys.stdout)
+        handler.setLevel(logging.DEBUG if opts.debug else logging.INFO if opts.verbose else logging.WARN)
+        handler.setFormatter(logging.Formatter('%(levelname)s: %(message)s'))
+    logging.getLogger('feeds2disk').addHandler(handler)
+
+    recipe = recipe(opts, p, notification)
+    recipe.download()

     return 0

 if __name__ == '__main__':
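Taken together, the new options and the handler parameter allow both of the following, sketched here under the assumption that the script is installed as feeds2disk and that this file lives at libprs500.web.feeds.main (the recipe filename is hypothetical):

# command line: write feeds to a directory, without the progress bar
#   feeds2disk --output-dir /tmp/news --no-progress-bar --debug my_recipe.py

# programmatic use: supply a notification callable and a pre-built log handler
import sys, logging
from libprs500.web.feeds.main import main   # assumed module path

def quiet(percent, msg=''):
    pass    # notification contract: fraction in [0, 1] plus an optional message

handler = logging.StreamHandler(sys.stderr)
main(['feeds2disk', 'my_recipe.py'], notification=quiet, handler=handler)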
@@ -1,5 +1,4 @@
 #!/usr/bin/env python
-
 ## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
 ## This program is free software; you can redistribute it and/or modify
 ## it under the terms of the GNU General Public License as published by
@@ -18,9 +17,16 @@
 The backend to parse feeds and create HTML that can then be converted
 to an ebook.
 '''
-import logging
+import logging, os, cStringIO, traceback, time
+import urlparse

 from libprs500 import browser
+from libprs500.ebooks.BeautifulSoup import BeautifulSoup
+from libprs500.web.feeds import feed_from_xml, templates
+from libprs500.web.fetch.simple import option_parser as web2disk_option_parser
+from libprs500.web.fetch.simple import RecursiveFetcher
+from libprs500.threadpool import WorkRequest, ThreadPool, NoResultsPending
+

 class BasicNewsRecipe(object):
     '''
@@ -48,6 +54,10 @@ class BasicNewsRecipe(object):
     #: @type: integer
     delay = 0
+
+    #: Number of simultaneous downloads. Set to 1 if the server is picky.
+    #: @type: integer
+    simultaneous_downloads = 5

     #: Timeout for fetching files from server in seconds
     #: @type: integer
     timeout = 10
@@ -55,7 +65,7 @@ class BasicNewsRecipe(object):
     #: The format string for the date shown on the first page
     #: By default: Day Name Day Number Month Name Year
     #: @type: string
-    timefmt = ' [%a %d %b %Y]'
+    timefmt = ' %a, %d %b %Y'

     #: Max number of characters in the short description.
     #: @type: integer
@@ -94,6 +104,19 @@ class BasicNewsRecipe(object):
     #: @type: list of strings
     html2lrf_options = []

+    #: List of tags to be removed. Specified tags are removed from downloaded HTML.
+    #: A tag is specified as a dictionary of the form::
+    #: {
+    #:    name  : 'tag name',   #e.g. 'div'
+    #:    attrs : a dictionary, #e.g. {class: 'advertisement'}
+    #: }
+    #: All keys are optional. For a full explanation of the search criteria, see
+    #: U{http://www.crummy.com/software/BeautifulSoup/documentation.html#The basic find method: findAll(name, attrs, recursive, text, limit, **kwargs)}
+    #: A common example::
+    #:   remove_tags = [dict(name='div', attrs={'class':'advert'})]
+    #: This will remove all <div class="advert"> tags and all their children from the downloaded HTML.
+    remove_tags = []
+
     #: List of regexp substitution rules to run on the downloaded HTML. Each element of the
     #: list should be a two element tuple. The first element of the tuple should
     #: be a compiled regular expression and the second a callable that takes
@@ -131,6 +154,25 @@ class BasicNewsRecipe(object):
         '''
         return browser()

+    def preprocess_html(self, soup):
+        '''
+        This function is called with the source of each downloaded HTML file.
+        It can be used to do arbitrarily powerful pre-processing on the HTML.
+        @param soup: A U{BeautifulSoup<http://www.crummy.com/software/BeautifulSoup/documentation.html>}
+                     instance containing the downloaded HTML.
+        @type soup: A U{BeautifulSoup<http://www.crummy.com/software/BeautifulSoup/documentation.html>} instance
+        @return: It must return soup (after having done any needed preprocessing)
+        @rtype: A U{BeautifulSoup<http://www.crummy.com/software/BeautifulSoup/documentation.html>} instance
+        '''
+        return soup
+
+    def cleanup(self):
+        '''
+        Called after all articles have been downloaded. Use it to do any cleanup like
+        logging out of subscription sites, etc.
+        '''
+        pass
+
     def __init__(self, options, parser, progress_reporter):
         '''
         Initialize the recipe.
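As a hedged illustration of the two hooks added above (the recipe name and the advert selector are invented, not part of this commit), a subclass might combine remove_tags and preprocess_html like this:

class ExampleRecipe(BasicNewsRecipe):    # hypothetical recipe, for illustration only
    title = 'Example news source'
    remove_tags = [dict(name='div', attrs={'class': 'advert'})]

    def preprocess_html(self, soup):
        # drop <script> blocks before the cleaned HTML is written to disk
        for script in soup.findAll('script'):
            script.extract()
        return soup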
@@ -138,9 +180,15 @@ class BasicNewsRecipe(object):
         @param parser: Command line option parser. Used to intelligently merge options.
         @param progress_reporter: A Callable that takes two arguments: progress (a number between 0 and 1) and a string message. The message should be optional.
         '''
-        for attr in ('username', 'password', 'lrf'):
+        for attr in ('username', 'password', 'lrf', 'output_dir', 'verbose', 'debug'):
             setattr(self, attr, getattr(options, attr))
+        self.output_dir = os.path.abspath(self.output_dir)
+
         self.logger = logging.getLogger('feeds2disk')
+
+        if self.debug:
+            self.logger.setLevel(logging.DEBUG)
+            self.verbose = True
         self.report_progress = progress_reporter

         self.username = self.password = None
@@ -160,24 +208,201 @@
         if self.needs_subscription and (self.username is None or self.password is None):
             raise ValueError('The %s recipe needs a username and password.'%self.title)

+        self.browser = self.get_browser()
+        self.image_map, self.image_counter = {}, 1
+
+        web2disk_cmdline = [ 'web2disk',
+            '--timeout', str(self.timeout),
+            '--max-recursions', str(self.recursions),
+            '--delay', str(self.delay),
+            '--timeout', str(self.timeout),
+            ]
+        if self.encoding is not None:
+            web2disk_cmdline.extend(['--encoding', self.encoding])
+
+        if self.verbose:
+            web2disk_cmdline.append('--verbose')
+
+        if self.no_stylesheets:
+            web2disk_cmdline.append('--dont-download-stylesheets')
+
+        for reg in self.match_regexps:
+            web2disk_cmdline.extend(['--match-regexp', reg])
+
+        for reg in self.filter_regexps:
+            web2disk_cmdline.extend(['--filter-regexp', reg])
+
+        self.web2disk_options = web2disk_option_parser().parse_args(web2disk_cmdline)[0]
+        self.web2disk_options.remove_tags = self.remove_tags
+        self.web2disk_options.preprocess_regexps = self.preprocess_regexps
+        self.web2disk_options.preprocess_html = self.preprocess_html
+
+        if self.delay > 0:
+            self.simultaneous_downloads = 1
+
+        self.navbar = templates.NavBarTemplate()
+
     def download(self):
-        self.report_progress(0, 'Starting download...')
-        return self.build_index()
+        '''
+        Download and pre-process all articles from the feeds in this recipe.
+        This method should be called only once on a particular Recipe instance.
+        Calling it more than once will lead to undefined behavior.
+        @return: Path to index.html
+        @rtype: string
+        '''
+        self.report_progress(0, _('Initialized'))
+        res = self.build_index()
+        self.cleanup()
+        return res
+
+    def feeds2index(self, feeds):
+        templ = templates.IndexTemplate()
+        return templ.generate(self.title, self.timefmt, feeds).render(doctype='xhtml')
+
+    def feed2index(self, feed):
+        if feed.image_url is not None: # Download feed image
+            imgdir = os.path.join(self.output_dir, 'images')
+            if not os.path.isdir(imgdir):
+                os.makedirs(imgdir)
+
+            if self.image_map.has_key(feed.image_url):
+                feed.image_url = self.image_map[feed.image_url]
+            else:
+                bn = urlparse.urlsplit(feed.image_url).path
+                if bn:
+                    bn = bn.rpartition('/')[-1]
+                    if bn:
+                        img = os.path.join(imgdir, 'feed_image_%d%s'%(self.image_counter, os.path.splitext(bn)[1]))
+                        open(img, 'wb').write(self.browser.open(feed.image_url).read())
+                        self.image_counter += 1
+                        feed.image_url = img
+                        self.image_map[feed.image_url] = img
+
+        templ = templates.FeedTemplate()
+        return templ.generate(feed).render(doctype='xhtml')
+
+
+    def create_logger(self, feed_number, article_number):
+        logger = logging.getLogger('feeds2disk.article_%d_%d'%(feed_number, article_number))
+        out = cStringIO.StringIO()
+        handler = logging.StreamHandler(out)
+        handler.setFormatter(logging.Formatter('%(levelname)s: %(message)s'))
+        handler.setLevel(logging.INFO if self.verbose else logging.WARNING)
+        if self.debug:
+            handler.setLevel(logging.DEBUG)
+        logger.addHandler(handler)
+        return logger, out
+
+    def fetch_article(self, url, dir, logger):
+        fetcher = RecursiveFetcher(self.web2disk_options, logger, self.image_map)
+        fetcher.base_dir = dir
+        fetcher.current_dir = dir
+        fetcher.show_progress = False
+        return fetcher.start_fetch(url)
+
     def build_index(self):
-        self.parse_feeds()
+        self.report_progress(0, _('Fetching feeds...'))
+        feeds = self.parse_feeds()
+        self.has_single_feed = len(feeds) == 1
+
+        index = os.path.join(self.output_dir, 'index.html')
+
+        html = self.feeds2index(feeds)
+        open(index, 'wb').write(html)
+
+        self.jobs = []
+        for f, feed in enumerate(feeds):
+            feed_dir = os.path.join(self.output_dir, 'feed_%d'%f)
+            if not os.path.isdir(feed_dir):
+                os.makedirs(feed_dir)
+
+            for a, article in enumerate(feed):
+                art_dir = os.path.join(feed_dir, 'article_%d'%a)
+                if not os.path.isdir(art_dir):
+                    os.makedirs(art_dir)
+                logger, stream = self.create_logger(f, a)
+                try:
+                    url = self.print_version(article.url)
+                except NotImplementedError:
+                    url = article.url
+                req = WorkRequest(self.fetch_article, (url, art_dir, logger),
+                                  {}, (f, a), self.article_downloaded,
+                                  self.error_in_article_download)
+                req.stream = stream
+                req.feed = feed
+                req.article = article
+                self.jobs.append(req)
+
+        self.jobs_done = 0
+        tp = ThreadPool(self.simultaneous_downloads)
+        for req in self.jobs:
+            tp.putRequest(req, block=True, timeout=0)
+
+        self.report_progress(0, _('Starting download [%d thread(s)]...')%self.simultaneous_downloads)
+        while True:
+            try:
+                tp.poll(True)
+                time.sleep(0.1)
+            except NoResultsPending:
+                break
+
+        html = self.feed2index(feed)
+        open(os.path.join(feed_dir, 'index.html'), 'wb').write(html)
+        self.report_progress(1, _('Feeds downloaded to %s')%index)
+        return index
+
+
+    def article_downloaded(self, request, result):
+        index = os.path.join(os.path.dirname(result), 'index.html')
+        os.rename(result, index)
+        src = open(index, 'rb').read().decode('utf-8')
+        f, a = request.requestID
+        soup = BeautifulSoup(src)
+        body = soup.find('body')
+        if body is not None:
+            top = self.navbar.generate(False, f, a, len(request.feed), not self.has_single_feed).render(doctype='xhtml')
+            bottom = self.navbar.generate(True, f, a, len(request.feed), not self.has_single_feed).render(doctype='xhtml')
+            top = BeautifulSoup(top).find('div')
+            bottom = BeautifulSoup(bottom).find('div')
+            body.insert(0, top)
+            body.insert(len(body.contents), bottom)
+            open(index, 'wb').write(unicode(soup).encode('utf-8'))
+
+        article = request.article
+        self.logger.debug(_('\nDownloaded article %s from %s\n%s')%(article.title, article.url, request.stream.getvalue()))
+        article.url = result
+        article.downloaded = True
+        self.jobs_done += 1
+        self.report_progress(float(self.jobs_done)/len(self.jobs), _('Article downloaded: %s')%article.title)
+
+    def error_in_article_download(self, request, exc_info):
+        self.jobs_done += 1
+        self.logger.error(_('Failed to download article: %s from %s')%(request.article.title, request.article.url))
+        self.logger.debug(''.join(traceback.format_exception(*exc_info)))
+        self.logger.debug(request.stream.getvalue())
+        self.logger.debug('\n')
+        self.report_progress(float(self.jobs_done)/len(self.jobs), _('Article download failed: %s')%request.article.title)
+
+
     def parse_feeds(self):
         '''
-        Create list of articles from a list of feeds.
+        Create a list of articles from a list of feeds.
         @rtype: list
-        @return: A list whose items are 2-tuples C{('feed title', articles)},
-        where C{articles} is a list of dictionaries each of the form::
-        {
-            'title' : article title,
-            'url' : URL of print version,
-            'date' : The publication date of the article as a string,
-            'description' : A summary of the article
-            'content' : The full article (can be an empty string). This is used by FullContentProfile
-        }
+        @return: A list of L{Feed}s.
         '''
+        feeds = self.get_feeds()
+        parsed_feeds = []
+        for obj in feeds:
+            if isinstance(obj, basestring):
+                title, url = None, obj
+            else:
+                title, url = obj
+            self.report_progress(0, _('Fetching feed %s...'%(title if title else url)))
+            parsed_feeds.append(feed_from_xml(self.browser.open(url).read(),
+                                              title=title,
+                                              oldest_article=self.oldest_article,
+                                              max_articles_per_feed=self.max_articles_per_feed))
+
+        return parsed_feeds
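build_index() above asks the recipe for a printer-friendly URL via print_version() and falls back to article.url when that raises NotImplementedError; a hedged sketch of an override (the URL rewrite rule and class name are invented):

class PrintVersionRecipe(BasicNewsRecipe):   # hypothetical subclass
    title = 'Example with print pages'

    def print_version(self, url):
        # e.g. turn http://site/story/1234 into http://site/story/1234/print
        return url.rstrip('/') + '/print'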
162 src/libprs500/web/feeds/templates.py Normal file
@@ -0,0 +1,162 @@
#!/usr/bin/env python

## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

from genshi.template import MarkupTemplate

class Template(MarkupTemplate):

    STYLE = '''\
    .article_date {
        font-size: x-small; color: gray; font-family: monospace;
    }

    .article_description {
        font-size: small; font-family: sans; text-indent: 0pt;
    }

    a.article {
        font-weight: bold; font-size: large;
    }

    a.feed {
        font-weight: bold; font-size: large;
    }
    '''

    def generate(self, *args, **kwargs):
        if not kwargs.has_key('style'):
            kwargs['style'] = self.STYLE
        return MarkupTemplate.generate(self, *args, **kwargs)

class NavBarTemplate(Template):

    def __init__(self):
        Template.__init__(self, '''\
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml"
    xml:lang="en"
    xmlns:xi="http://www.w3.org/2001/XInclude"
    xmlns:py="http://genshi.edgewall.org/"

>
    <body>
        <div class="navbar" style="text-align:center">
            <hr py:if="bottom" />
            <a href="../index.html#article_${str(art)}">Up one level</a>
            <py:if test="two_levels">
            | <a href="../../index.html#_${str(feed)}">Up two levels</a>
            </py:if>
            <py:if test="art != 0">
            | <a href="../article_${str(art-1)}/index.html">Previous</a>
            </py:if>
            <py:if test="art != num - 1">
            | <a href="../article_${str(art+1)}/index.html">Next</a>
            </py:if>
            <hr py:if="not bottom" />
        </div>
    </body>
</html>
''')

    def generate(self, bottom, feed, art, number_of_articles_in_feed, two_levels):
        return Template.generate(self, bottom=bottom, feed=feed, art=art, num=number_of_articles_in_feed, two_levels=two_levels)


class IndexTemplate(Template):

    def __init__(self):
        Template.__init__(self, '''\
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml"
    xml:lang="en"
    xmlns:xi="http://www.w3.org/2001/XInclude"
    xmlns:py="http://genshi.edgewall.org/"

>
    <head>
        <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
        <title>${title}</title>
        <style type="text/css">
            ${style}
        </style>
    </head>
    <body>
        <h1>${title}</h1>
        <?python
            from datetime import datetime
        ?>
        <p style="text-align:right">${datetime.now().strftime(datefmt)}</p>
        <ul>
            <py:for each="i, feed in enumerate(feeds)">
                <li id="feed_${str(i)}">
                    <a class="feed" href="${'feed_%d/index.html'%i}">${feed.title}</a>
                </li>
            </py:for>
        </ul>
    </body>
</html>
''')

    def generate(self, title, datefmt, feeds):
        return Template.generate(self, title=title, datefmt=datefmt, feeds=feeds)


class FeedTemplate(Template):

    def __init__(self):
        Template.__init__(self, '''\
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml"
    xml:lang="en"
    xmlns:xi="http://www.w3.org/2001/XInclude"
    xmlns:py="http://genshi.edgewall.org/"

>
    <head>
        <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
        <title>${feed.title}</title>
        <style type="text/css">
            ${style}
        </style>
    </head>
    <body>
        <h2>${feed.title}</h2>
        <py:if test="feed.image_url">
            <div class="feed_image">
                <img alt="${feed.image_alt}" src="${feed.image_url}" />
            </div>
        </py:if>
        <ul>
            <py:for each="i, article in enumerate(feed)">
                <li id="${'article_%d'%i}" py:if="getattr(article, 'downloaded', False)">
                    <a class="article" href="${article.url}">${article.title}</a>
                    <span class="article_date">${article.localtime.strftime(" [%a, %d %b %H:%M]")}</span>
                    <p class="article_description" py:if="article.summary">
                        ${Markup(article.summary)}
                    </p>
                </li>
            </py:for>
        </ul>
    </body>
</html>
''')

    def generate(self, feed):
        return Template.generate(self, feed=feed)
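A hedged rendering sketch for the Genshi templates above (the stub feed object is invented; generate()/render(doctype='xhtml') are the same calls news.py makes):

from libprs500.web.feeds.templates import IndexTemplate

class StubFeed(object):          # minimal stand-in for a parsed Feed
    title = 'Example feed'

xhtml = IndexTemplate().generate('My news', ' %a, %d %b %Y', [StubFeed()]).render(doctype='xhtml')
open('index.html', 'wb').write(xhtml)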
@@ -77,14 +77,22 @@ class RecursiveFetcher(object):
         self.stylemap = {}
         self.current_dir = self.base_dir
         self.files = 0
-        self.preprocess_regexps = []
+        self.preprocess_regexps = getattr(options, 'preprocess_regexps', [])
+        self.remove_tags = getattr(options, 'remove_tags', [])
+        self.preprocess_html_ext = getattr(options, 'preprocess_html', lambda soup: soup)
         self.download_stylesheets = not options.no_stylesheets
+        self.show_progress = True

     def get_soup(self, src):
         nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
         nmassage.extend(self.preprocess_regexps)
-        return BeautifulSoup(xml_to_unicode(src, self.verbose)[0], markupMassage=nmassage)
+        soup = BeautifulSoup(xml_to_unicode(src, self.verbose)[0], markupMassage=nmassage)
+        for kwds in self.remove_tags:
+            for tag in soup.findAll(**kwds):
+                tag.extract()
+        return self.preprocess_html_ext(soup)

     def fetch_url(self, url):
         f = None
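get_soup() above expands each remove_tags entry into a BeautifulSoup findAll() call and extracts every match; a hedged standalone illustration (the HTML snippet is invented):

from libprs500.ebooks.BeautifulSoup import BeautifulSoup

html = '<body><div class="advert">buy now</div><p>the story</p></body>'
soup = BeautifulSoup(html)
for kwds in [dict(name='div', attrs={'class': 'advert'})]:
    for tag in soup.findAll(**kwds):
        tag.extract()            # removes the tag and all its children
print soup                        # -> <body><p>the story</p></body>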
@@ -249,7 +257,9 @@
             try:
                 self.current_dir = diskpath
                 for tag in soup.findAll(lambda tag: tag.name.lower()=='a' and tag.has_key('href')):
-                    print '.',
-                    sys.stdout.flush()
+                    if self.show_progress:
+                        print '.',
+                        sys.stdout.flush()
                     iurl = self.absurl(baseurl, tag, 'href')
                     if not iurl:
@@ -301,7 +311,8 @@
                 self.files += 1
             finally:
                 self.current_dir = prev_dir
-        print
+        if self.show_progress:
+            print
         return res

     def __del__(self):
@@ -327,7 +338,6 @@ def option_parser(usage='%prog URL\n\nWhere URL is for example http://google.com
                       help='Any link that matches this regular expression will be ignored. This option can be specified multiple times, in which case as long as any regexp matches a link, it will be ignored. By default, no links are ignored. If both --filter-regexp and --match-regexp are specified, then --filter-regexp is applied first.')
     parser.add_option('--dont-download-stylesheets', action='store_true', default=False,
                       help='Do not download CSS stylesheets.', dest='no_stylesheets')
-
     parser.add_option('--verbose', help='Show detailed output information. Useful for debugging',
                       default=False, action='store_true', dest='verbose')
     return parser