feeds2disk improved to the point of being able to download Newsweek. Added new Newsweek recipe.

Kovid Goyal 2008-03-12 20:59:29 +00:00
parent 756de168fe
commit 2ccf260f7d
15 changed files with 477 additions and 139 deletions

View File

@@ -23,13 +23,14 @@ from gettext import GNUTranslations
 from math import floor
 from optparse import OptionParser as _OptionParser
 from optparse import IndentedHelpFormatter
+from logging import Formatter
 from ttfquery import findsystem, describe
 from libprs500.translations.msgfmt import make
 from libprs500.ebooks.chardet import detect
 from libprs500.terminfo import TerminalController
-terminal_controller = TerminalController()
+terminal_controller = TerminalController(sys.stdout)
 iswindows = 'win32' in sys.platform.lower() or 'win64' in sys.platform.lower()
 isosx = 'darwin' in sys.platform.lower()
@@ -51,6 +52,25 @@ __builtin__.__dict__['_'] = lambda s: s
 class CommandLineError(Exception):
     pass
+class ColoredFormatter(Formatter):
+    def format(self, record):
+        ln = record.__dict__['levelname']
+        col = ''
+        if ln == 'CRITICAL':
+            col = terminal_controller.YELLOW
+        elif ln == 'ERROR':
+            col = terminal_controller.RED
+        elif ln in ['WARN', 'WARNING']:
+            col = terminal_controller.BLUE
+        elif ln == 'INFO':
+            col = terminal_controller.GREEN
+        elif ln == 'DEBUG':
+            col = terminal_controller.CYAN
+        record.__dict__['levelname'] = col + record.__dict__['levelname'] + terminal_controller.NORMAL
+        return Formatter.format(self, record)
 def setup_cli_handlers(logger, level):
     logger.setLevel(level)
     if level == logging.WARNING:
@@ -187,9 +207,9 @@ def extract(path, dir):
         raise Exception('Unknown archive type')
     extractor(path, dir)
-def browser():
+def browser(honor_time=False):
     opener = mechanize.Browser()
-    opener.set_handle_refresh(True)
+    opener.set_handle_refresh(True, honor_time=honor_time)
     opener.set_handle_robots(False)
     opener.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; i686 Linux; en_US; rv:1.8.0.4) Gecko/20060508 Firefox/1.5.0.4')]
     return opener
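
For reference, the new ColoredFormatter is wired up the same way feeds2disk does further down in this commit; a minimal sketch (logger name and format string taken from main.py below):

    import logging, sys
    from libprs500 import ColoredFormatter

    # Colorize the level name on a tty; when stdout is not a terminal the
    # TerminalController(sys.stdout) capabilities are empty, so output stays plain.
    handler = logging.StreamHandler(sys.stdout)
    handler.setFormatter(ColoredFormatter('%(levelname)s: %(message)s'))
    log = logging.getLogger('feeds2disk')
    log.addHandler(handler)
    log.warning('printed with a colored WARNING prefix on a tty')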

View File

@@ -147,7 +147,7 @@ class Delegator(object):
             d.parent = self
             methods = d.getMethods()
             self.delegatedMethods += methods
             for m in methods:
                 setattr(self, m, getattr(d, m))
     """

View File

@@ -595,6 +595,11 @@ class OPFCreator(OPF):
         self.uid = mi.uid
     def create_manifest(self, entries):
+        '''
+        Create <manifest>
+        @param entries: List of (URL, mime-type)
+        @type entries: list of 2-tuples
+        '''
         doc = dom.parseString(self.soup.__str__('UTF-8').strip())
         package = doc.documentElement
         manifest = doc.createElement('manifest')
@@ -616,6 +621,11 @@ class OPFCreator(OPF):
     def create_spine(self, entries):
+        '''
+        Create the <spine> element. Must first call L{create_manifest}.
+        @param: List of paths
+        @type param: list of strings
+        '''
         doc = dom.parseString(self.soup.__str__('UTF-8').strip())
         package = doc.documentElement
         spine = doc.createElement('spine')
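
The new docstrings describe the calling convention used by create_opf() in news.py later in this commit; a minimal sketch, with placeholder entry paths assumed to be relative to the OPF file:

    from libprs500.ebooks.metadata import MetaInformation
    from libprs500.ebooks.metadata.opf import OPFCreator

    mi = MetaInformation('Newsweek [Wed, 12 Mar 2008]', ['libprs500'])
    opf = OPFCreator(mi)
    entries = ['index.html', 'feed_0/index.html', 'feed_0/article_0/index.html']
    # create_manifest() takes (URL, mime-type) 2-tuples; create_spine() takes the
    # bare paths and must be called after create_manifest().
    opf.create_manifest([(e, 'text/html') for e in entries])
    opf.create_spine(entries)
    opf.write(open('index.opf', 'wb'))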

View File

@@ -74,6 +74,34 @@ def options(option_parser):
         opts.extend(opt._long_opts)
     return opts
+def opts_and_words(name, op, words):
+    opts = ' '.join(options(op))
+    words = [repr(w) for w in words]
+    words = ' '.join(words)
+    return '_'+name+'()'+\
+'''
+{
+    local cur prev opts
+    COMPREPLY=()
+    cur="${COMP_WORDS[COMP_CWORD]}"
+    opts="%s"
+    words="%s"
+    case "${cur}" in
+        -* )
+            COMPREPLY=( $(compgen -W "${opts}" -- ${cur}) )
+            return 0
+            ;;
+        * )
+            COMPREPLY=( $(compgen -W "${words}" -- ${cur}) )
+            return 0
+            ;;
+    esac
+}
+complete -F _'''%(opts, words) + name + ' ' + name +"\n\n"
 def opts_and_exts(name, op, exts):
     opts = ' '.join(options(op))
     exts.extend([i.upper() for i in exts])
@@ -135,6 +163,8 @@ def setup_completion(fatal_errors):
         from libprs500.gui2.lrf_renderer.main import option_parser as lrfviewerop
         from libprs500.ebooks.lrf.pdf.reflow import option_parser as pdfhtmlop
         from libprs500.ebooks.mobi.reader import option_parser as mobioeb
+        from libprs500.web.feeds.main import option_parser as feeds2disk
+        from libprs500.web.feeds.recipes import titles as feed_titles
         f = open_file('/etc/bash_completion.d/libprs500')
@@ -159,6 +189,7 @@ def setup_completion(fatal_errors):
         f.write(opts_and_exts('lrfviewer', lrfviewerop, ['lrf']))
         f.write(opts_and_exts('pdfrelow', pdfhtmlop, ['pdf']))
         f.write(opts_and_exts('mobi2oeb', mobioeb, ['mobi', 'prc']))
+        f.write(opts_and_words('feeds2disk', feeds2disk, feed_titles))
         f.write('''
 _prs500_ls()
 {

View File

@@ -15,7 +15,7 @@
 '''
 Used to run jobs in parallel in separate processes.
 '''
-import re, sys, tempfile, os, cPickle, cStringIO, traceback, atexit, binascii, time, subprocess
+import re, sys, tempfile, os, cPickle, traceback, atexit, binascii, time, subprocess
 from functools import partial

View File

@@ -106,7 +106,7 @@ class TerminalController:
         except: return
         # If the stream isn't a tty, then assume it has no capabilities.
-        if not term_stream.isatty(): return
+        if not hasattr(term_stream, 'isatty') or not term_stream.isatty(): return
         # Check the terminal type. If we fail, then assume that the
         # terminal has no capabilities.

View File

@@ -97,7 +97,8 @@ class WorkerThread(threading.Thread):
                 )
             except:
                 request.exception = True
-                self.resultQueue.put((request, sys.exc_info()))
+                import traceback
+                self.resultQueue.put((request, traceback.format_exc()))
     def dismiss(self):
         """Sets a flag to tell the thread to exit when done with current job.

View File

@@ -27,6 +27,7 @@ class Article(object):
     time_offset = datetime.now() - datetime.utcnow()
     def __init__(self, id, title, url, summary, published, content):
+        self.downloaded = False
         self.id = id
         self.title = title
         self.url = url
@@ -103,7 +104,7 @@ class Feed(object):
         if delta.days*24*3600 + delta.seconds <= 24*3600*self.oldest_article:
             self.articles.append(article)
         else:
-            self.logger.debug('Skipping article %s as it is too old.'%title)
+            self.logger.debug('Skipping article %s (%s) from feed %s as it is too old.'%(title, article.localtime.strftime('%a, %d %b, %Y %H:%M'), self.title))
     def __iter__(self):
         return iter(self.articles)
@@ -118,6 +119,12 @@ class Feed(object):
     def __str__(self):
         return repr(self)
+    def __bool__(self):
+        for article in self:
+            if getattr(article, 'downloaded', False):
+                return True
+        return False
 def feed_from_xml(raw_xml, title=None, oldest_article=7, max_articles_per_feed=100):
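
To illustrate how this module is driven (parse_feeds() in news.py below does exactly this), a small sketch; the feed URL is just a placeholder:

    from libprs500 import browser
    from libprs500.web.feeds import feed_from_xml

    raw = browser().open('http://feeds.newsweek.com/newsweek/TopNews').read()  # placeholder feed
    feed = feed_from_xml(raw, title='Top News', oldest_article=7, max_articles_per_feed=100)
    # Articles older than oldest_article days are dropped with the debug message above;
    # article.downloaded starts out False and is flipped by the news downloader.
    for article in feed:
        print article.title, article.url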

View File

@@ -18,21 +18,24 @@ from libprs500.web.feeds.news import BasicNewsRecipe
 '''
 import sys, os, logging
-from libprs500.web.recipes import get_feed, compile_recipe
+from libprs500.web.feeds.recipes import get_builtin_recipe, compile_recipe, titles
 from libprs500.web.fetch.simple import option_parser as _option_parser
 def option_parser(usage='''\
-%prog [options] ARG
+%%prog [options] ARG
-%prog parsers an online source of articles, like an RSS or ATOM feed and
+%%prog parsers an online source of articles, like an RSS or ATOM feed and
 fetches the article contents organized in a nice hierarchy.
 ARG can be one of:
-file name - %prog will try to load a recipe from the file
+file name - %%prog will try to load a recipe from the file
-builtin recipe title - %prog will load the builtin recipe and use it to fetch the feed. For e.g. Newsweek or "The BBC" or "The New York Times"
+builtin recipe title - %%prog will load the builtin recipe and use it to fetch the feed. For e.g. Newsweek or "The BBC" or "The New York Times"
-recipe as a string - $prog will load the recipe directly from the string arg.
+recipe as a string - %%prog will load the recipe directly from the string arg.
-'''):
+Available builtin recipes are:
+%s
+'''%(unicode(list(titles))[1:-1])):
     p = _option_parser(usage=usage)
     p.remove_option('--max-recursions')
     p.remove_option('--base-dir')
@@ -86,7 +89,7 @@ def main(args=sys.argv, notification=None, handler=None):
     else:
         notification = no_progress_bar
-    if len(args) != 2:
+    if len(args) != 2 and opts.feeds is None:
         p.print_help()
         return 1
@@ -96,11 +99,16 @@ def main(args=sys.argv, notification=None, handler=None):
     else:
         try:
             if os.access(args[1], os.R_OK):
-                recipe = compile_recipe(open(args[1]).read())
+                try:
+                    recipe = compile_recipe(open(args[1]).read())
+                except:
+                    import traceback
+                    traceback.print_exc()
+                    return 1
             else:
-                raise Exception('')
+                raise Exception('not file')
         except:
-            recipe = get_feed(args[1])
+            recipe = get_builtin_recipe(args[1])
             if recipe is None:
                 recipe = compile_recipe(args[1])
@@ -111,9 +119,10 @@ def main(args=sys.argv, notification=None, handler=None):
         return 1
     if handler is None:
+        from libprs500 import ColoredFormatter
         handler = logging.StreamHandler(sys.stdout)
         handler.setLevel(logging.DEBUG if opts.debug else logging.INFO if opts.verbose else logging.WARN)
-        handler.setFormatter(logging.Formatter('%(levelname)s: %(message)s'))
+        handler.setFormatter(ColoredFormatter('%(levelname)s: %(message)s\n')) # The trailing newline is need because of the progress bar
         logging.getLogger('feeds2disk').addHandler(handler)
     recipe = recipe(opts, p, notification)
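
Putting the pieces together, invoking the downloader from Python is equivalent to running feeds2disk from a shell; a minimal sketch (the '--verbose'/'--debug' option names are assumed from the opts.verbose/opts.debug checks above):

    import sys
    from libprs500.web.feeds.main import main

    # Same as running `feeds2disk Newsweek` (or `feeds2disk my_recipe.py` for a
    # recipe file); prepend '--verbose' or '--debug' to the argument list for more logging.
    sys.exit(main(['feeds2disk', 'Newsweek']))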

View File

@@ -17,11 +17,13 @@
 The backend to parse feeds and create HTML that can then be converted
 to an ebook.
 '''
-import logging, os, cStringIO, traceback, time
+import logging, os, cStringIO, time, itertools, traceback
 import urlparse
-from libprs500 import browser
+from libprs500 import browser, __appname__
 from libprs500.ebooks.BeautifulSoup import BeautifulSoup
+from libprs500.ebooks.metadata.opf import OPFCreator
+from libprs500.ebooks.metadata import MetaInformation
 from libprs500.web.feeds import feed_from_xml, templates
 from libprs500.web.fetch.simple import option_parser as web2disk_option_parser
 from libprs500.web.fetch.simple import RecursiveFetcher
@@ -35,7 +37,10 @@ class BasicNewsRecipe(object):
     #: The title to use for the ebook
     #: @type: string
-    title = 'Unknown News Source'
+    title = _('Unknown News Source')
+    #: The author of this recipe
+    __author__ = _('Unknown')
     #: Maximum number of articles to download from each feed
     #: @type: integer
@@ -55,17 +60,18 @@ class BasicNewsRecipe(object):
     delay = 0
     #: Number of simultaneous downloads. Set to 1 if the server is picky.
+    #: Automatically reduced to 1 if L{delay} > 0
     #: @type: integer
     simultaneous_downloads = 5
     #: Timeout for fetching files from server in seconds
     #: @type: integer
-    timeout = 10
+    timeout = 120
     #: The format string for the date shown on the first page
     #: By default: Day Name Day Number Month Name Year
     #: @type: string
-    timefmt = ' %a, %d %b %Y'
+    timefmt = ' [%a, %d %b %Y]'
     #: Max number of characters in the short description.
     #: @type: integer
@@ -102,7 +108,7 @@ class BasicNewsRecipe(object):
     #: List of options to pass to html2lrf, to customize generation of LRF ebooks.
     #: @type: list of strings
-    html2lrf_options = []
+    html2lrf_options = ['--page-break-before', '$']
     #: List of tags to be removed. Specified tags are removed from downloaded HTML.
     #: A tag is specified as a dictionary of the form::
@@ -114,9 +120,23 @@ class BasicNewsRecipe(object):
     #: U{http://www.crummy.com/software/BeautifulSoup/documentation.html#The basic find method: findAll(name, attrs, recursive, text, limit, **kwargs)}
     #: A common example::
     #: remove_tags = [dict(name='div', attrs={'class':'advert'})]
     #: This will remove all <div class="advert"> tags and all their children from the downloaded HTML.
+    #: @type: list
     remove_tags = []
+    #: Remove all tags that occur after the specified tag.
+    #: For the format for specifying a tag see L{remove_tags}.
+    #: For example, C{remove_tags_after = [dict(id='content')]} will remove all
+    #: tags after the element with id C{content}.
+    remove_tags_after = None
+    #: Keep only the specified tags and their children.
+    #: For the format for specifying tags see L{remove_tags}.
+    #: If this list is not empty, then the <body> element will be emptied and re-filled with
+    #: the tags that match the entries in this list.
+    #: @type: list
+    keep_only_tags = []
     #: List of regexp substitution rules to run on the downloaded HTML. Each element of the
     #: list should be a two element tuple. The first element of the tuple should
     #: be a compiled regular expression and the second a callable that takes
@@ -126,6 +146,13 @@ class BasicNewsRecipe(object):
     # See the built-in profiles for examples of these settings.
+    def get_cover_url(self):
+        '''
+        Return a URL to the cover image for this issue or None.
+        @rtype: string or None
+        '''
+        return getattr(self, 'cover_url', None)
     def get_feeds(self):
         '''
         Return a list of RSS feeds to fetch for this profile. Each element of the list
@@ -156,7 +183,21 @@ class BasicNewsRecipe(object):
     def preprocess_html(self, soup):
         '''
-        This function is called with the source of each downloaded HTML file.
+        This function is called with the source of each downloaded HTML file, before
+        it is parsed for links and images.
+        It can be used to do arbitrarily powerful pre-processing on the HTML.
+        @param soup: A U{BeautifulSoup<http://www.crummy.com/software/BeautifulSoup/documentation.html>}
+            instance containing the downloaded HTML.
+        @type soup: A U{BeautifulSoup<http://www.crummy.com/software/BeautifulSoup/documentation.html>} instance
+        @return: It must return soup (after having done any needed preprocessing)
+        @rtype: A U{BeautifulSoup<http://www.crummy.com/software/BeautifulSoup/documentation.html>} instance
+        '''
+        return soup
+    def postprocess_html(self, soup):
+        '''
+        This function is called with the source of each downloaded HTML file, after
+        it is parsed for links and images.
         It can be used to do arbitrarily powerful pre-processing on the HTML.
         @param soup: A U{BeautifulSoup<http://www.crummy.com/software/BeautifulSoup/documentation.html>}
             instance containing the downloaded HTML.
@@ -210,6 +251,7 @@ class BasicNewsRecipe(object):
         self.browser = self.get_browser()
         self.image_map, self.image_counter = {}, 1
+        self.css_map = {}
         web2disk_cmdline = [ 'web2disk',
             '--timeout', str(self.timeout),
@@ -233,14 +275,18 @@ class BasicNewsRecipe(object):
             web2disk_cmdline.extend(['--filter-regexp', reg])
         self.web2disk_options = web2disk_option_parser().parse_args(web2disk_cmdline)[0]
-        self.web2disk_options.remove_tags = self.remove_tags
-        self.web2disk_options.preprocess_regexps = self.preprocess_regexps
-        self.web2disk_options.preprocess_html = self.preprocess_html
+        for extra in ('keep_only_tags', 'remove_tags', 'preprocess_regexps',
+                      'preprocess_html', 'remove_tags_after', 'postprocess_html'):
+            setattr(self.web2disk_options, extra, getattr(self, extra))
         if self.delay > 0:
             self.simultaneous_downloads = 1
         self.navbar = templates.NavBarTemplate()
+        self.max_articles_per_feed -= 1
+        self.html2lrf_options.append('--use-spine')
+        self.failed_downloads = []
+        self.partial_failures = []
     def download(self):
         '''
@@ -250,9 +296,26 @@ class BasicNewsRecipe(object):
         @return: Path to index.html
         @rtype: string
         '''
-        self.report_progress(0, _('Initialized'))
+        self.report_progress(0, _('Trying to download cover...'))
+        self.download_cover()
         res = self.build_index()
         self.cleanup()
+        self.report_progress(1, _('Download finished'))
+        if self.failed_downloads:
+            self.logger.warning(_('Failed to download the following articles:'))
+            for feed, article, debug in self.failed_downloads:
+                self.logger.warning(article.title+_(' from ')+feed.title)
+                self.logger.debug(article.url)
+                self.logger.debug(debug)
+        if self.partial_failures:
+            self.logger.warning(_('Failed to download parts of the following articles:'))
+            for feed, atitle, aurl, debug in self.partial_failures:
+                self.logger.warning(atitle + _(' from ') + feed)
+                self.logger.debug(aurl)
+                self.logger.warning(_('\tFailed links:'))
+                for l, tb in debug:
+                    self.logger.warning(l)
+                    self.logger.debug(tb)
         return res
     def feeds2index(self, feeds):
@@ -294,11 +357,14 @@ class BasicNewsRecipe(object):
         return logger, out
     def fetch_article(self, url, dir, logger):
-        fetcher = RecursiveFetcher(self.web2disk_options, logger, self.image_map)
+        fetcher = RecursiveFetcher(self.web2disk_options, logger, self.image_map, self.css_map)
         fetcher.base_dir = dir
         fetcher.current_dir = dir
         fetcher.show_progress = False
-        return fetcher.start_fetch(url)
+        res, path, failures = fetcher.start_fetch(url), fetcher.downloaded_paths, fetcher.failed_links
+        if not res:
+            raise Exception(_('Could not fetch article. Run with --debug to see the reason'))
+        return res, path, failures
     def build_index(self):
         self.report_progress(0, _('Fetching feeds...'))
@@ -331,58 +397,111 @@
                 req.stream = stream
                 req.feed = feed
                 req.article = article
+                req.feed_dir = feed_dir
                 self.jobs.append(req)
         self.jobs_done = 0
         tp = ThreadPool(self.simultaneous_downloads)
         for req in self.jobs:
             tp.putRequest(req, block=True, timeout=0)
         self.report_progress(0, _('Starting download [%d thread(s)]...')%self.simultaneous_downloads)
         while True:
             try:
-                tp.poll(True)
+                tp.poll()
                 time.sleep(0.1)
             except NoResultsPending:
                 break
-        html = self.feed2index(feed)
-        open(os.path.join(feed_dir, 'index.html'), 'wb').write(html)
+        for f, feed in enumerate(feeds):
+            html = self.feed2index(feed)
+            feed_dir = os.path.join(self.output_dir, 'feed_%d'%f)
+            open(os.path.join(feed_dir, 'index.html'), 'wb').write(html)
+        self.create_opf(feeds)
         self.report_progress(1, _('Feeds downloaded to %s')%index)
         return index
+    def download_cover(self):
+        self.cover_path = None
+        try:
+            cu = self.get_cover_url()
+        except Exception, err:
+            cu = None
+            self.logger.error(_('Could not download cover: %s')%str(err))
+            self.logger.debug(traceback.format_exc())
+        if cu is not None:
+            ext = cu.rpartition('.')[-1]
+            ext = ext.lower() if ext else 'jpg'
+            self.report_progress(1, _('Downloading cover from %s')%cu)
+            cpath = os.path.join(self.output_dir, 'cover.'+ext)
+            cfile = open(cpath, 'wb')
+            cfile.write(self.browser.open(cu).read())
+            self.cover_path = cpath
+    def create_opf(self, feeds, dir=None):
+        if dir is None:
+            dir = self.output_dir
+        mi = MetaInformation(self.title + time.strftime(self.timefmt), [__appname__])
+        opf = OPFCreator(mi)
+        opf_path = os.path.join(dir, 'index.opf')
+        cpath = getattr(self, 'cover_path', None)
+        if cpath is not None and os.access(cpath, os.R_OK):
+            opf.cover = cpath
+        entries = ['index.html']
+        for i, f in enumerate(feeds):
+            entries.append('feed_%d/index.html'%i)
+            for j, a in enumerate(f):
+                if getattr(a, 'downloaded', False):
+                    adir = 'feed_%d/article_%d/'%(i, j)
+                    entries.append('%sindex.html'%adir)
+                    for sp in a.sub_pages:
+                        prefix = os.path.commonprefix([opf_path, sp])
+                        relp = sp[len(prefix):]
+                        entries.append(relp.replace(os.sep, '/'))
+        opf.create_manifest(itertools.izip(entries, itertools.repeat('text/html')))
+        opf.create_spine(entries)
+        opf.write(open(opf_path, 'wb'))
     def article_downloaded(self, request, result):
-        index = os.path.join(os.path.dirname(result), 'index.html')
-        os.rename(result, index)
+        index = os.path.join(os.path.dirname(result[0]), 'index.html')
+        os.rename(result[0], index)
         src = open(index, 'rb').read().decode('utf-8')
         f, a = request.requestID
         soup = BeautifulSoup(src)
         body = soup.find('body')
         if body is not None:
             top = self.navbar.generate(False, f, a, len(request.feed), not self.has_single_feed).render(doctype='xhtml')
+            bottom = self.navbar.generate(True, f, a, len(request.feed), not self.has_single_feed).render(doctype='xhtml')
             top = BeautifulSoup(top).find('div')
+            bottom = BeautifulSoup(bottom).find('div')
             body.insert(0, top)
+            body.insert(len(body.contents), bottom)
             open(index, 'wb').write(unicode(soup).encode('utf-8'))
         article = request.article
-        self.logger.debug(_('\nDownloaded article %s from %s\n%s')%(article.title, article.url, request.stream.getvalue()))
+        self.logger.debug(_('\nDownloaded article %s from %s\n%s')%(article.title, article.url, request.stream.getvalue().decode('utf-8', 'ignore')))
-        article.url = result
+        article.url = result[0]
         article.downloaded = True
+        article.sub_pages = result[1][1:]
         self.jobs_done += 1
         self.report_progress(float(self.jobs_done)/len(self.jobs), _('Article downloaded: %s')%article.title)
+        if result[2]:
+            self.partial_failures.append((request.feed.title, article.title, article.url, result[2]))
-    def error_in_article_download(self, request, exc_info):
+    def error_in_article_download(self, request, traceback):
         self.jobs_done += 1
-        self.logger.error(_('Failed to download article: %s from %s')%(request.article.title, request.article.url))
+        self.logger.error(_('Failed to download article: %s from %s\n')%(request.article.title, request.article.url))
-        self.logger.debug(traceback.format_exc(*exc_info))
+        debug = request.stream.getvalue().decode('utf-8', 'ignore')
-        self.logger.debug(request.stream.getvalue())
+        self.logger.debug(debug)
+        self.logger.debug(traceback)
         self.logger.debug('\n')
         self.report_progress(float(self.jobs_done)/len(self.jobs), _('Article download failed: %s')%request.article.title)
+        self.failed_downloads.append((request.feed.title, request.article, debug))
     def parse_feeds(self):
         '''
@@ -404,5 +523,3 @@ class BasicNewsRecipe(object):
                                         max_articles_per_feed=self.max_articles_per_feed))
         return parsed_feeds
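
As a usage illustration of the attributes documented above, a minimal custom recipe might look like the sketch below; the site name, feed URL, tag ids and cover URL are placeholders, not a real profile:

    from libprs500.web.feeds.news import BasicNewsRecipe

    class ExampleNews(BasicNewsRecipe):
        title          = 'Example News'                          # used for the ebook title
        feeds          = [('Front Page', 'http://example.com/rss.xml')]
        keep_only_tags = [dict(name='div', id='content')]        # empty <body>, keep only these
        remove_tags    = [dict(name='div', attrs={'class':'advert'})]
        cover_url      = 'http://example.com/cover.jpg'          # returned by get_cover_url()

        def postprocess_html(self, soup):
            # runs after links and images have been processed; must return soup
            return soup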

View File

@@ -0,0 +1,79 @@
#!/usr/bin/env python
## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
'''
Builtin recipes.
'''
recipes = ['newsweek']
import re
from libprs500.web.feeds.news import BasicNewsRecipe
from libprs500.ebooks.lrf.web.profiles import DefaultProfile, FullContentProfile
from libprs500.ebooks.lrf.web import available_profiles
basic_recipes = (BasicNewsRecipe, DefaultProfile, FullContentProfile)
basic_recipe_names = (i.__name__ for i in basic_recipes)
#: Compiled builtin recipe/profile classes
def load_recipe(module, package='libprs500.web.feeds.recipes'):
module = __import__(package+'.'+module, fromlist=[''])
for attr in dir(module):
obj = getattr(module, attr)
if type(obj) is not type:
continue
recipe = False
for b in obj.__bases__:
if b in basic_recipes:
recipe = True
break
if not recipe:
continue
if obj not in basic_recipes:
return obj
recipes = [load_recipe(i) for i in recipes]
def compile_recipe(src):
'''
Compile the code in src and return the first object that is a recipe or profile.
@return: Recipe/Profile class or None, if no such class was found in C{src}
'''
locals = {}
exec src in globals(), locals
for obj in locals.values():
if type(obj) is type and obj.__name__ not in basic_recipe_names:
for base in obj.__bases__:
if base in basic_recipes:
return obj
return None
def get_builtin_recipe(title):
'''
Return a builtin recipe/profile class whoose title == C{title} or None if no such
recipe exists.
@type title: string
@rtype: class or None
'''
for r in recipes:
if r.title == title:
return r
titles = set([r.title for r in recipes])
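
For illustration, compile_recipe() is the mechanism feeds2disk uses when ARG is a recipe file or a literal recipe string; the sample source below is the old builtin test recipe string from the removed web/recipes.py:

    from libprs500.web.feeds.recipes import compile_recipe, get_builtin_recipe

    src = 'class Temp(BasicNewsRecipe):\n\ttitle="temp"'
    recipe_class = compile_recipe(src)      # None if src defines no recipe class
    print recipe_class.title                # temp
    print get_builtin_recipe('Newsweek')    # the Newsweek class added below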

View File

@@ -0,0 +1,90 @@
#!/usr/bin/env python
## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
import re
from libprs500.web.feeds.news import BasicNewsRecipe
from libprs500.ebooks.BeautifulSoup import BeautifulSoup
class Newsweek(BasicNewsRecipe):
title = 'Newsweek'
__author__ = 'Kovid Goyal'
feeds = [
('Top News', 'http://feeds.newsweek.com/newsweek/TopNews',),
'http://feeds.newsweek.com/newsweek/columnists/StevenLevy',
('Politics', 'http://feeds.newsweek.com/headlines/politics'),
('Health', 'http://feeds.newsweek.com/headlines/health'),
('Business', 'http://feeds.newsweek.com/headlines/business'),
('Science and Technology', 'http://feeds.newsweek.com/headlines/technology/science'),
('National News', 'http://feeds.newsweek.com/newsweek/NationalNews'),
('World News', 'http://feeds.newsweek.com/newsweek/WorldNews'),
'http://feeds.newsweek.com/newsweek/Columnists/ChristopherDickey',
'http://feeds.newsweek.com/newsweek/Columnists/FareedZakaria',
('Iraq', 'http://feeds.newsweek.com/newsweek/iraq'),
('Society', 'http://feeds.newsweek.com/newsweek/society'),
('Entertainment', 'http://feeds.newsweek.com/newsweek/entertainment'),
'http://feeds.newsweek.com/newsweek/columnists/GeorgeFWill',
'http://feeds.newsweek.com/newsweek/columnists/AnnaQuindlen',
]
# For testing
feeds = feeds[:2]
max_articles_per_feed = 1
keep_only_tags = [dict(name='div', id='content')]
remove_tags = [
dict(name=['script', 'noscript']),
dict(name='div', attrs={'class':['ad', 'SocialLinks', 'SocialLinksDiv', 'channel', 'bot', 'nav', 'top', 'EmailArticleBlock']}),
dict(name='div', attrs={'class':re.compile('box')}),
dict(id=['ToolBox', 'EmailMain', 'EmailArticle', ])
]
recursions = 1
match_regexps = [r'http://www.newsweek.com/id/\S+/page/\d+']
def postprocess_html(self, soup):
divs = list(soup.findAll('div', 'pagination'))
divs[0].extract()
if len(divs) > 1:
soup.find('body')['style'] = 'page-break-after:avoid'
divs[1].extract()
h1 = soup.find('h1')
if h1:
h1.extract()
ai = soup.find('div', 'articleInfo')
ai.extract()
else:
soup.find('body')['style'] = 'page-break-before:always; page-break-after:avoid;'
return soup
def get_current_issue(self):
from urllib2 import urlopen # For some reason mechanize fails
home = urlopen('http://www.newsweek.com').read()
soup = BeautifulSoup(home)
img = soup.find('img', alt='Current Magazine')
if img and img.parent.has_key('href'):
return urlopen(img.parent['href']).read()
def get_cover_url(self):
ci = self.get_current_issue()
if ci is not None:
soup = BeautifulSoup(ci)
img = soup.find(alt='Cover')
if img is not None and img.has_key('src'):
small = img['src']
return small.replace('coversmall', 'coverlarge')
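
A hedged sketch of driving just this recipe from Python, mirroring what main() in feeds/main.py does; the default options and the notification callback signature are assumptions based on the report_progress() calls in news.py:

    from libprs500.web.feeds.main import option_parser
    from libprs500.web.feeds.recipes import get_builtin_recipe

    parser = option_parser()
    opts = parser.parse_args([])[0]                     # default web2disk/feeds2disk options
    recipe = get_builtin_recipe('Newsweek')(opts, parser, lambda fraction, msg: None)
    print recipe.download()                             # presumably the path to index.html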

View File

@@ -104,7 +104,7 @@ class IndexTemplate(Template):
                 <p style="text-align:right">${datetime.now().strftime(datefmt)}</p>
                 <ul>
                 <py:for each="i, feed in enumerate(feeds)">
-                    <li id="feed_${str(i)}">
+                    <li py:if="feed" id="feed_${str(i)}">
                         <a class="feed" href="${'feed_%d/index.html'%i}">${feed.title}</a>
                     </li>
                 </py:for>
@@ -136,7 +136,7 @@ class FeedTemplate(Template):
                 ${style}
                 </style>
             </head>
-            <body>
+            <body style="page-break-before:always">
                 <h2>${feed.title}</h2>
                 <py:if test="feed.image">
                     <div class="feed_image">
@@ -144,7 +144,7 @@ class FeedTemplate(Template):
                     </div>
                 </py:if>
                 <ul>
-                <py:for each="i, article in enumerate(feed)">
+                <py:for each="i, article in enumerate(feed.articles)">
                     <li id="${'article_%d'%i}" py:if="getattr(article, 'downloaded', False)">
                         <a class="article" href="${article.url}">${article.title}</a>
                         <span class="article_date">${article.localtime.strftime(" [%a, %d %b %H:%M]")}</span>

View File

@@ -17,12 +17,12 @@ Fetch a webpage and its links recursively. The webpages are saved to disk in
 UTF-8 encoding with any charset declarations removed.
 '''
 from __future__ import with_statement
-import sys, socket, os, urlparse, codecs, logging, re, time, copy, urllib2, threading
+import sys, socket, os, urlparse, codecs, logging, re, time, copy, urllib2, threading, traceback
 from urllib import url2pathname
 from httplib import responses
 from libprs500 import setup_cli_handlers, browser, sanitize_file_name, OptionParser
-from libprs500.ebooks.BeautifulSoup import BeautifulSoup
+from libprs500.ebooks.BeautifulSoup import BeautifulSoup, Tag
 from libprs500.ebooks.chardet import xml_to_unicode
 class FetchError(Exception):
@@ -37,10 +37,11 @@ def basename(url):
     return res
 def save_soup(soup, target):
-    for meta in soup.findAll('meta', content=True):
+    nm = Tag(soup, '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />')
+    for meta in soup.find('meta', content=True):
         if 'charset' in meta['content']:
-            meta.extract()
-    f = codecs.open(target, 'w', 'utf8')
+            meta.replaceWith(nm)
+    f = codecs.open(target, 'w', 'utf-8')
     f.write(unicode(soup))
     f.close()
@@ -55,7 +56,7 @@ class RecursiveFetcher(object):
     #               )
     CSS_IMPORT_PATTERN = re.compile(r'\@import\s+url\((.*?)\)', re.IGNORECASE)
-    def __init__(self, options, logger, image_map={}):
+    def __init__(self, options, logger, image_map={}, css_map={}):
         self.logger = logger
         self.base_dir = os.path.abspath(os.path.expanduser(options.dir))
         if not os.path.exists(self.base_dir):
@@ -74,20 +75,44 @@ class RecursiveFetcher(object):
         self.filemap = {}
         self.imagemap = image_map
         self.imagemap_lock = threading.RLock()
-        self.stylemap = {}
+        self.stylemap = css_map
+        self.stylemap_lock = threading.RLock()
+        self.downloaded_paths = []
         self.current_dir = self.base_dir
         self.files = 0
         self.preprocess_regexps = getattr(options, 'preprocess_regexps', [])
         self.remove_tags = getattr(options, 'remove_tags', [])
+        self.remove_tags_after = getattr(options, 'remove_tags_after', None)
+        self.keep_only_tags = getattr(options, 'keep_only_tags', [])
         self.preprocess_html_ext = getattr(options, 'preprocess_html', lambda soup: soup)
+        self.postprocess_html_ext= getattr(options, 'postprocess_html', lambda soup: soup)
         self.download_stylesheets = not options.no_stylesheets
         self.show_progress = True
+        self.failed_links = []
     def get_soup(self, src):
         nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
         nmassage.extend(self.preprocess_regexps)
         soup = BeautifulSoup(xml_to_unicode(src, self.verbose)[0], markupMassage=nmassage)
+        if self.keep_only_tags:
+            body = Tag(soup, 'body')
+            for spec in self.keep_only_tags:
+                for tag in soup.find('body').findAll(**spec):
+                    body.insert(len(body.contents), tag)
+            soup.find('body').replaceWith(body)
+        if self.remove_tags_after is not None:
+            tag = soup.find(**self.remove_tags_after)
+            while tag is not None and tag.name != 'body':
+                after = tag.nextSibling
+                while after is not None:
+                    ns = after.nextSibling
+                    after.extract()
+                    after = ns
+                tag = tag.parent
         for kwds in self.remove_tags:
             for tag in soup.findAll(**kwds):
                 tag.extract()
@@ -105,7 +130,12 @@ class RecursiveFetcher(object):
         except urllib2.URLError, err:
             if hasattr(err, 'code') and responses.has_key(err.code):
                 raise FetchError, responses[err.code]
-            raise err
+            if err.reason[0] == 104: # Connection reset by peer
+                self.logger.debug('Connection reset by peer retrying in 1 second.')
+                time.sleep(1)
+                f = self.browser.open(url)
+            else:
+                raise err
         finally:
             self.last_fetch_at = time.time()
         return f
@@ -146,9 +176,10 @@ class RecursiveFetcher(object):
                 iurl = tag['href']
                 if not urlparse.urlsplit(iurl).scheme:
                     iurl = urlparse.urljoin(baseurl, iurl, False)
-                if self.stylemap.has_key(iurl):
-                    tag['href'] = self.stylemap[iurl]
-                    continue
+                with self.stylemap_lock:
+                    if self.stylemap.has_key(iurl):
+                        tag['href'] = self.stylemap[iurl]
+                        continue
                 try:
                     f = self.fetch_url(iurl)
                 except Exception, err:
@@ -157,7 +188,8 @@ class RecursiveFetcher(object):
                     continue
                 c += 1
                 stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
-                self.stylemap[iurl] = stylepath
+                with self.stylemap_lock:
+                    self.stylemap[iurl] = stylepath
                 open(stylepath, 'wb').write(f.read())
                 tag['href'] = stylepath
             else:
@@ -168,9 +200,10 @@ class RecursiveFetcher(object):
                     iurl = m.group(1)
                     if not urlparse.urlsplit(iurl).scheme:
                         iurl = urlparse.urljoin(baseurl, iurl, False)
-                    if self.stylemap.has_key(iurl):
-                        ns.replaceWith(src.replace(m.group(1), self.stylemap[iurl]))
-                        continue
+                    with self.stylemap_lock:
+                        if self.stylemap.has_key(iurl):
+                            ns.replaceWith(src.replace(m.group(1), self.stylemap[iurl]))
+                            continue
                     try:
                         f = self.fetch_url(iurl)
                     except Exception, err:
@@ -179,7 +212,8 @@ class RecursiveFetcher(object):
                         continue
                     c += 1
                     stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
-                    self.stylemap[iurl] = stylepath
+                    with self.stylemap_lock:
+                        self.stylemap[iurl] = stylepath
                     open(stylepath, 'wb').write(f.read())
                     ns.replaceWith(src.replace(m.group(1), stylepath))
@@ -214,7 +248,7 @@ class RecursiveFetcher(object):
                     open(imgpath, 'wb').write(f.read())
                     tag['src'] = imgpath
-    def absurl(self, baseurl, tag, key):
+    def absurl(self, baseurl, tag, key, filter=True):
         iurl = tag[key]
         parts = urlparse.urlsplit(iurl)
         if not parts.netloc and not parts.path:
@@ -224,7 +258,7 @@ class RecursiveFetcher(object):
         if not self.is_link_ok(iurl):
             self.logger.debug('Skipping invalid link: %s', iurl)
             return None
-        if not self.is_link_wanted(iurl):
+        if filter and not self.is_link_wanted(iurl):
             self.logger.debug('Filtered link: '+iurl)
             return None
         return iurl
@@ -256,12 +290,12 @@ class RecursiveFetcher(object):
         prev_dir = self.current_dir
         try:
             self.current_dir = diskpath
-            for tag in soup.findAll(lambda tag: tag.name.lower()=='a' and tag.has_key('href')):
+            for tag in soup.findAll('a', href=True):
                 if self.show_progress:
                     print '.',
                     sys.stdout.flush()
                 sys.stdout.flush()
-                iurl = self.absurl(baseurl, tag, 'href')
+                iurl = self.absurl(baseurl, tag, 'href', filter=recursion_level != 0)
                 if not iurl:
                     continue
                 nurl = self.normurl(iurl)
@@ -293,6 +327,7 @@ class RecursiveFetcher(object):
                         self.process_stylesheets(soup, f.geturl())
                     res = os.path.join(linkdiskpath, basename(iurl))
+                    self.downloaded_paths.append(res)
                     self.filemap[nurl] = res
                     if recursion_level < self.max_recursions:
                         self.logger.debug('Processing links...')
@@ -301,9 +336,11 @@ class RecursiveFetcher(object):
                             self.process_return_links(soup, iurl)
                             self.logger.debug('Recursion limit reached. Skipping links in %s', iurl)
-                    save_soup(soup, res)
+                    save_soup(self.postprocess_html_ext(soup), res)
                     self.localize_link(tag, 'href', res)
                 except Exception, err:
+                    self.failed_links.append((iurl, traceback.format_exc()))
                     self.logger.warning('Could not fetch link %s', iurl)
                     self.logger.debug('Error: %s', str(err), exc_info=True)
                 finally:
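
The keep_only_tags and remove_tags specs handled in get_soup() above are passed straight to BeautifulSoup's findAll(), so a spec behaves as in this standalone sketch with inline placeholder HTML:

    from libprs500.ebooks.BeautifulSoup import BeautifulSoup

    soup = BeautifulSoup('<body><div id="content">keep</div><div class="ad">drop</div></body>')
    print soup.findAll(**dict(name='div', id='content'))            # the div a recipe keeps
    print soup.findAll(**dict(name='div', attrs={'class':'ad'}))    # the div remove_tags extracts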

View File

@ -1,63 +0,0 @@
#!/usr/bin/env python
## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
'''
Contains recipes for various common news sources and websites.
'''
import re
from libprs500.web.feeds.news import BasicNewsRecipe
_basic_recipes = (BasicNewsRecipe,)
_basic_recipe_names = (i.__name__ for i in _basic_recipes)
def compile_recipe(src):
'''
Compile the code in src and return the first object that is
'''
locals = {}
exec src in globals(), locals
for obj in locals.values():
if type(obj) is type and obj.__name__ not in _basic_recipe_names:
for base in obj.__bases__:
if base in _basic_recipes:
return obj
return None
def get_feed(title):
'''
Return a builtin recipe class whoose title == C{title} or None if no such
recipe exists.
@type title: string
@rtype: class or None
'''
if isinstance(_feeds[0], basestring):
for i, val in enumerate(_feeds):
recipe = compile_recipe(val)
if recipe is None:
raise RuntimeError('The builtin Recipe #%d is invalid.'%i)
_feeds[i] = recipe
for recipe in _feeds:
if recipe.title == title:
return recipe
return None
#: Recipes to be used with feeds2disk
_feeds = ['class Temp(BasicNewsRecipe):\n\ttitle="temp"']