feeds2disk improved to the point of being able to download Newsweek. Added new Newsweek recipe.

Kovid Goyal 2008-03-12 20:59:29 +00:00
parent 756de168fe
commit 2ccf260f7d
15 changed files with 477 additions and 139 deletions

View File

@ -23,13 +23,14 @@ from gettext import GNUTranslations
from math import floor
from optparse import OptionParser as _OptionParser
from optparse import IndentedHelpFormatter
from logging import Formatter
from ttfquery import findsystem, describe
from libprs500.translations.msgfmt import make
from libprs500.ebooks.chardet import detect
from libprs500.terminfo import TerminalController
terminal_controller = TerminalController()
terminal_controller = TerminalController(sys.stdout)
iswindows = 'win32' in sys.platform.lower() or 'win64' in sys.platform.lower()
isosx = 'darwin' in sys.platform.lower()
@ -51,6 +52,25 @@ __builtin__.__dict__['_'] = lambda s: s
class CommandLineError(Exception):
pass
class ColoredFormatter(Formatter):
def format(self, record):
ln = record.__dict__['levelname']
col = ''
if ln == 'CRITICAL':
col = terminal_controller.YELLOW
elif ln == 'ERROR':
col = terminal_controller.RED
elif ln in ['WARN', 'WARNING']:
col = terminal_controller.BLUE
elif ln == 'INFO':
col = terminal_controller.GREEN
elif ln == 'DEBUG':
col = terminal_controller.CYAN
record.__dict__['levelname'] = col + record.__dict__['levelname'] + terminal_controller.NORMAL
return Formatter.format(self, record)
def setup_cli_handlers(logger, level):
logger.setLevel(level)
if level == logging.WARNING:
@ -187,9 +207,9 @@ def extract(path, dir):
raise Exception('Unknown archive type')
extractor(path, dir)
def browser():
def browser(honor_time=False):
opener = mechanize.Browser()
opener.set_handle_refresh(True)
opener.set_handle_refresh(True, honor_time=honor_time)
opener.set_handle_robots(False)
opener.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; i686 Linux; en_US; rv:1.8.0.4) Gecko/20060508 Firefox/1.5.0.4')]
return opener
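
For reference, a minimal stand-alone sketch of the ColoredFormatter idea above, using hard-coded ANSI escapes instead of TerminalController (the escape codes are assumptions; the real class queries the terminfo database):

import logging

# Hard-coded ANSI escapes standing in for the TerminalController attributes.
COLORS = {'DEBUG': '\033[36m', 'INFO': '\033[32m', 'WARNING': '\033[34m',
          'ERROR': '\033[31m', 'CRITICAL': '\033[33m'}
NORMAL = '\033[0m'

class SimpleColoredFormatter(logging.Formatter):
    def format(self, record):
        # Wrap the level name in a colour escape, as ColoredFormatter does.
        record.levelname = COLORS.get(record.levelname, '') + record.levelname + NORMAL
        return logging.Formatter.format(self, record)

handler = logging.StreamHandler()
handler.setFormatter(SimpleColoredFormatter('%(levelname)s: %(message)s'))
logging.getLogger('demo').addHandler(handler)
logging.getLogger('demo').warning('colourized output')
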

View File

@ -147,7 +147,7 @@ class Delegator(object):
d.parent = self
methods = d.getMethods()
self.delegatedMethods += methods
for m in methods:
for m in methods:
setattr(self, m, getattr(d, m))
"""

View File

@ -595,6 +595,11 @@ class OPFCreator(OPF):
self.uid = mi.uid
def create_manifest(self, entries):
'''
Create <manifest>
@param entries: List of (URL, mime-type)
@type entries: list of 2-tuples
'''
doc = dom.parseString(self.soup.__str__('UTF-8').strip())
package = doc.documentElement
manifest = doc.createElement('manifest')
@ -616,6 +621,11 @@ class OPFCreator(OPF):
def create_spine(self, entries):
'''
Create the <spine> element. Must first call L{create_manifest}.
@param entries: List of paths
@type entries: list of strings
'''
doc = dom.parseString(self.soup.__str__('UTF-8').strip())
package = doc.documentElement
spine = doc.createElement('spine')
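
A hedged usage sketch of the two calls documented above; the paths and metadata are illustrative, and the imports mirror the ones used elsewhere in this commit:

from libprs500.ebooks.metadata import MetaInformation
from libprs500.ebooks.metadata.opf import OPFCreator

mi = MetaInformation('Newsweek [Wed, 12 Mar 2008]', ['libprs500'])
opf = OPFCreator(mi)
entries = ['index.html', 'feed_0/index.html', 'feed_0/article_0/index.html']
opf.create_manifest([(e, 'text/html') for e in entries])  # (URL, mime-type) pairs
opf.create_spine(entries)                                 # must follow create_manifest
opf.write(open('index.opf', 'wb'))
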

View File

@ -74,6 +74,34 @@ def options(option_parser):
opts.extend(opt._long_opts)
return opts
def opts_and_words(name, op, words):
opts = ' '.join(options(op))
words = [repr(w) for w in words]
words = ' '.join(words)
return '_'+name+'()'+\
'''
{
local cur prev opts
COMPREPLY=()
cur="${COMP_WORDS[COMP_CWORD]}"
opts="%s"
words="%s"
case "${cur}" in
-* )
COMPREPLY=( $(compgen -W "${opts}" -- ${cur}) )
return 0
;;
* )
COMPREPLY=( $(compgen -W "${words}" -- ${cur}) )
return 0
;;
esac
}
complete -F _'''%(opts, words) + name + ' ' + name +"\n\n"
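
A hedged, self-contained sketch of the two strings opts_and_words() interpolates into the bash function above, with a toy OptionParser standing in for the real feeds2disk parser:

from optparse import OptionParser

def long_opts(op):
    # Simplified version of the options() helper defined earlier in this file.
    opts = []
    for opt in op.option_list:
        opts.extend(opt._long_opts)
    return opts

p = OptionParser()
p.add_option('--verbose', action='store_true')
p.add_option('--test', action='store_true')

# Switches are offered when the current word starts with '-', recipe titles otherwise.
print(' '.join(long_opts(p)))                    # --help --verbose --test
print(' '.join(repr(t) for t in ['Newsweek']))   # 'Newsweek'
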
def opts_and_exts(name, op, exts):
opts = ' '.join(options(op))
exts.extend([i.upper() for i in exts])
@ -135,6 +163,8 @@ def setup_completion(fatal_errors):
from libprs500.gui2.lrf_renderer.main import option_parser as lrfviewerop
from libprs500.ebooks.lrf.pdf.reflow import option_parser as pdfhtmlop
from libprs500.ebooks.mobi.reader import option_parser as mobioeb
from libprs500.web.feeds.main import option_parser as feeds2disk
from libprs500.web.feeds.recipes import titles as feed_titles
f = open_file('/etc/bash_completion.d/libprs500')
@ -159,6 +189,7 @@ def setup_completion(fatal_errors):
f.write(opts_and_exts('lrfviewer', lrfviewerop, ['lrf']))
f.write(opts_and_exts('pdfrelow', pdfhtmlop, ['pdf']))
f.write(opts_and_exts('mobi2oeb', mobioeb, ['mobi', 'prc']))
f.write(opts_and_words('feeds2disk', feeds2disk, feed_titles))
f.write('''
_prs500_ls()
{

View File

@ -15,7 +15,7 @@
'''
Used to run jobs in parallel in separate processes.
'''
import re, sys, tempfile, os, cPickle, cStringIO, traceback, atexit, binascii, time, subprocess
import re, sys, tempfile, os, cPickle, traceback, atexit, binascii, time, subprocess
from functools import partial

View File

@ -106,7 +106,7 @@ class TerminalController:
except: return
# If the stream isn't a tty, then assume it has no capabilities.
if not term_stream.isatty(): return
if not hasattr(term_stream, 'isatty') or not term_stream.isatty(): return
# Check the terminal type. If we fail, then assume that the
# terminal has no capabilities.

View File

@ -97,7 +97,8 @@ class WorkerThread(threading.Thread):
)
except:
request.exception = True
self.resultQueue.put((request, sys.exc_info()))
import traceback
self.resultQueue.put((request, traceback.format_exc()))
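
The change above hands a pre-formatted traceback string to the result queue instead of a live sys.exc_info() tuple; a minimal stand-alone sketch of the pattern (stdlib only, not the project's ThreadPool):

import threading, traceback
try:
    from Queue import Queue        # Python 2
except ImportError:
    from queue import Queue        # Python 3

results = Queue()

def worker(job):
    try:
        results.put(('ok', job()))
    except Exception:
        # Format inside the worker: a plain string crosses threads safely and
        # does not keep the exception's frames alive.
        results.put(('error', traceback.format_exc()))

t = threading.Thread(target=worker, args=(lambda: 1 / 0,))
t.start()
t.join()
status, payload = results.get()
print(status)     # 'error'
print(payload)    # formatted ZeroDivisionError traceback
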
def dismiss(self):
"""Sets a flag to tell the thread to exit when done with current job.

View File

@ -27,6 +27,7 @@ class Article(object):
time_offset = datetime.now() - datetime.utcnow()
def __init__(self, id, title, url, summary, published, content):
self.downloaded = False
self.id = id
self.title = title
self.url = url
@ -103,7 +104,7 @@ class Feed(object):
if delta.days*24*3600 + delta.seconds <= 24*3600*self.oldest_article:
self.articles.append(article)
else:
self.logger.debug('Skipping article %s as it is too old.'%title)
self.logger.debug('Skipping article %s (%s) from feed %s as it is too old.'%(title, article.localtime.strftime('%a, %d %b, %Y %H:%M'), self.title))
def __iter__(self):
return iter(self.articles)
@ -118,6 +119,12 @@ class Feed(object):
def __str__(self):
return repr(self)
def __bool__(self):
for article in self:
if getattr(article, 'downloaded', False):
return True
return False
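
One note on the truth test just added: Python 2 consults __nonzero__ rather than __bool__, so a version of the same idea that works on both interpreter lines would alias the two. A minimal sketch (the class here is a stand-in, not the real Feed):

class FeedSketch(object):
    def __init__(self, articles=()):
        self.articles = list(articles)

    def __iter__(self):
        return iter(self.articles)

    def __bool__(self):
        # A feed is "truthy" only if at least one article was downloaded.
        return any(getattr(a, 'downloaded', False) for a in self.articles)

    __nonzero__ = __bool__   # Python 2 name for the same hook

The index template later in this commit uses the same truth test (py:if="feed") to skip feeds that downloaded nothing.
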
def feed_from_xml(raw_xml, title=None, oldest_article=7, max_articles_per_feed=100):

View File

@ -18,21 +18,24 @@ from libprs500.web.feeds.news import BasicNewsRecipe
''''''
import sys, os, logging
from libprs500.web.recipes import get_feed, compile_recipe
from libprs500.web.feeds.recipes import get_builtin_recipe, compile_recipe, titles
from libprs500.web.fetch.simple import option_parser as _option_parser
def option_parser(usage='''\
%prog [options] ARG
%%prog [options] ARG
%prog parses an online source of articles, like an RSS or ATOM feed and
%%prog parses an online source of articles, like an RSS or ATOM feed and
fetches the article contents organized in a nice hierarchy.
ARG can be one of:
file name - %prog will try to load a recipe from the file
builtin recipe title - %prog will load the builtin recipe and use it to fetch the feed. For e.g. Newsweek or "The BBC" or "The New York Times"
recipe as a string - $prog will load the recipe directly from the string arg.
'''):
file name - %%prog will try to load a recipe from the file
builtin recipe title - %%prog will load the builtin recipe and use it to fetch the feed. For e.g. Newsweek or "The BBC" or "The New York Times"
recipe as a string - %%prog will load the recipe directly from the string arg.
Available builtin recipes are:
%s
'''%(unicode(list(titles))[1:-1])):
p = _option_parser(usage=usage)
p.remove_option('--max-recursions')
p.remove_option('--base-dir')
@ -86,7 +89,7 @@ def main(args=sys.argv, notification=None, handler=None):
else:
notification = no_progress_bar
if len(args) != 2:
if len(args) != 2 and opts.feeds is None:
p.print_help()
return 1
@ -96,11 +99,16 @@ def main(args=sys.argv, notification=None, handler=None):
else:
try:
if os.access(args[1], os.R_OK):
recipe = compile_recipe(open(args[1]).read())
try:
recipe = compile_recipe(open(args[1]).read())
except:
import traceback
traceback.print_exc()
return 1
else:
raise Exception('')
raise Exception('not file')
except:
recipe = get_feed(args[1])
recipe = get_builtin_recipe(args[1])
if recipe is None:
recipe = compile_recipe(args[1])
@ -111,9 +119,10 @@ def main(args=sys.argv, notification=None, handler=None):
return 1
if handler is None:
from libprs500 import ColoredFormatter
handler = logging.StreamHandler(sys.stdout)
handler.setLevel(logging.DEBUG if opts.debug else logging.INFO if opts.verbose else logging.WARN)
handler.setFormatter(logging.Formatter('%(levelname)s: %(message)s'))
handler.setFormatter(ColoredFormatter('%(levelname)s: %(message)s\n')) # The trailing newline is needed because of the progress bar
logging.getLogger('feeds2disk').addHandler(handler)
recipe = recipe(opts, p, notification)
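
To summarize the lookup order implemented above, a hedged sketch; the two helpers are passed in rather than imported, since their real home is libprs500.web.feeds.recipes:

import os

def resolve_recipe(arg, compile_recipe, get_builtin_recipe):
    # 1. a readable file containing recipe source,
    # 2. the title of a builtin recipe,
    # 3. recipe source passed directly as a string.
    if os.access(arg, os.R_OK):
        return compile_recipe(open(arg).read())
    recipe = get_builtin_recipe(arg)
    if recipe is None:
        recipe = compile_recipe(arg)
    return recipe
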

View File

@ -17,11 +17,13 @@
The backend to parse feeds and create HTML that can then be converted
to an ebook.
'''
import logging, os, cStringIO, traceback, time
import logging, os, cStringIO, time, itertools, traceback
import urlparse
from libprs500 import browser
from libprs500 import browser, __appname__
from libprs500.ebooks.BeautifulSoup import BeautifulSoup
from libprs500.ebooks.metadata.opf import OPFCreator
from libprs500.ebooks.metadata import MetaInformation
from libprs500.web.feeds import feed_from_xml, templates
from libprs500.web.fetch.simple import option_parser as web2disk_option_parser
from libprs500.web.fetch.simple import RecursiveFetcher
@ -35,7 +37,10 @@ class BasicNewsRecipe(object):
#: The title to use for the ebook
#: @type: string
title = 'Unknown News Source'
title = _('Unknown News Source')
#: The author of this recipe
__author__ = _('Unknown')
#: Maximum number of articles to download from each feed
#: @type: integer
@ -55,17 +60,18 @@ class BasicNewsRecipe(object):
delay = 0
#: Number of simultaneous downloads. Set to 1 if the server is picky.
#: Automatically reduced to 1 if L{delay} > 0
#: @type: integer
simultaneous_downloads = 5
#: Timeout for fetching files from server in seconds
#: @type: integer
timeout = 10
timeout = 120
#: The format string for the date shown on the first page
#: By default: Day Name Day Number Month Name Year
#: @type: string
timefmt = ' %a, %d %b %Y'
timefmt = ' [%a, %d %b %Y]'
#: Max number of characters in the short description.
#: @type: integer
@ -102,7 +108,7 @@ class BasicNewsRecipe(object):
#: List of options to pass to html2lrf, to customize generation of LRF ebooks.
#: @type: list of strings
html2lrf_options = []
html2lrf_options = ['--page-break-before', '$']
#: List of tags to be removed. Specified tags are removed from downloaded HTML.
#: A tag is specified as a dictionary of the form::
@ -114,9 +120,23 @@ class BasicNewsRecipe(object):
#: U{http://www.crummy.com/software/BeautifulSoup/documentation.html#The basic find method: findAll(name, attrs, recursive, text, limit, **kwargs)}
#: A common example::
#: remove_tags = [dict(name='div', attrs={'class':'advert'})]
#: This will remove all <div class="advert"> tags and all their children from the downloaded HTML.
#: This will remove all <div class="advert"> tags and all their children from the downloaded HTML.
#: @type: list
remove_tags = []
#: Remove all tags that occur after the specified tag.
#: For the format for specifying a tag see L{remove_tags}.
#: For example, C{remove_tags_after = dict(id='content')} will remove all
#: tags after the element with id C{content}.
remove_tags_after = None
#: Keep only the specified tags and their children.
#: For the format for specifying tags see L{remove_tags}.
#: If this list is not empty, then the <body> element will be emptied and re-filled with
#: the tags that match the entries in this list.
#: @type: list
keep_only_tags = []
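
A hedged example of how these tag options might be combined in a recipe (as the fetcher changes later in this commit show, each entry is a BeautifulSoup findAll specification); the site, class names and ids are hypothetical:

from libprs500.web.feeds.news import BasicNewsRecipe

class ExampleRecipe(BasicNewsRecipe):
    title = 'Example News Source'
    feeds = [('Front page', 'http://example.com/rss.xml')]

    # Empty <body> and keep only the article container and its children.
    keep_only_tags = [dict(name='div', id='article-body')]
    # Drop everything that follows the byline block.
    remove_tags_after = dict(id='byline')
    # Remove ad containers wherever they appear.
    remove_tags = [dict(name='div', attrs={'class': 'advert'})]
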
#: List of regexp substitution rules to run on the downloaded HTML. Each element of the
#: list should be a two element tuple. The first element of the tuple should
#: be a compiled regular expression and the second a callable that takes
@ -126,6 +146,13 @@ class BasicNewsRecipe(object):
# See the built-in profiles for examples of these settings.
def get_cover_url(self):
'''
Return a URL to the cover image for this issue or None.
@rtype: string or None
'''
return getattr(self, 'cover_url', None)
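
Since get_cover_url() above simply falls back to a cover_url attribute, a recipe with a static cover can just declare one; a hypothetical recipe, with a placeholder URL:

from libprs500.web.feeds.news import BasicNewsRecipe

class StaticCoverRecipe(BasicNewsRecipe):
    title = 'Example News Source'
    cover_url = 'http://example.com/current-cover.jpg'   # picked up by get_cover_url()
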
def get_feeds(self):
'''
Return a list of RSS feeds to fetch for this profile. Each element of the list
@ -156,7 +183,21 @@ class BasicNewsRecipe(object):
def preprocess_html(self, soup):
'''
This function is called with the source of each downloaded HTML file.
This function is called with the source of each downloaded HTML file, before
it is parsed for links and images.
It can be used to do arbitrarily powerful pre-processing on the HTML.
@param soup: A U{BeautifulSoup<http://www.crummy.com/software/BeautifulSoup/documentation.html>}
instance containing the downloaded HTML.
@type soup: A U{BeautifulSoup<http://www.crummy.com/software/BeautifulSoup/documentation.html>} instance
@return: It must return soup (after having done any needed preprocessing)
@rtype: A U{BeautifulSoup<http://www.crummy.com/software/BeautifulSoup/documentation.html>} instance
'''
return soup
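
For instance, a hedged override that strips share widgets before the fetcher resolves links and images ('shareTools' is a placeholder class name):

from libprs500.web.feeds.news import BasicNewsRecipe

class CleaningRecipe(BasicNewsRecipe):
    title = 'Example News Source'

    def preprocess_html(self, soup):
        # Runs on each page before its links and images are downloaded.
        for div in soup.findAll('div', attrs={'class': 'shareTools'}):
            div.extract()
        return soup
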
def postprocess_html(self, soup):
'''
This function is called with the source of each downloaded HTML file, after
it is parsed for links and images.
It can be used to do arbitrarily powerful post-processing on the HTML.
@param soup: A U{BeautifulSoup<http://www.crummy.com/software/BeautifulSoup/documentation.html>}
instance containing the downloaded HTML.
@ -210,6 +251,7 @@ class BasicNewsRecipe(object):
self.browser = self.get_browser()
self.image_map, self.image_counter = {}, 1
self.css_map = {}
web2disk_cmdline = [ 'web2disk',
'--timeout', str(self.timeout),
@ -233,14 +275,18 @@ class BasicNewsRecipe(object):
web2disk_cmdline.extend(['--filter-regexp', reg])
self.web2disk_options = web2disk_option_parser().parse_args(web2disk_cmdline)[0]
self.web2disk_options.remove_tags = self.remove_tags
self.web2disk_options.preprocess_regexps = self.preprocess_regexps
self.web2disk_options.preprocess_html = self.preprocess_html
for extra in ('keep_only_tags', 'remove_tags', 'preprocess_regexps',
'preprocess_html', 'remove_tags_after', 'postprocess_html'):
setattr(self.web2disk_options, extra, getattr(self, extra))
if self.delay > 0:
self.simultaneous_downloads = 1
self.navbar = templates.NavBarTemplate()
self.max_articles_per_feed -= 1
self.html2lrf_options.append('--use-spine')
self.failed_downloads = []
self.partial_failures = []
def download(self):
'''
@ -250,9 +296,26 @@ class BasicNewsRecipe(object):
@return: Path to index.html
@rtype: string
'''
self.report_progress(0, _('Initialized'))
self.report_progress(0, _('Trying to download cover...'))
self.download_cover()
res = self.build_index()
self.cleanup()
self.report_progress(1, _('Download finished'))
if self.failed_downloads:
self.logger.warning(_('Failed to download the following articles:'))
for feed, article, debug in self.failed_downloads:
self.logger.warning(article.title+_(' from ')+feed)
self.logger.debug(article.url)
self.logger.debug(debug)
if self.partial_failures:
self.logger.warning(_('Failed to download parts of the following articles:'))
for feed, atitle, aurl, debug in self.partial_failures:
self.logger.warning(atitle + _(' from ') + feed)
self.logger.debug(aurl)
self.logger.warning(_('\tFailed links:'))
for l, tb in debug:
self.logger.warning(l)
self.logger.debug(tb)
return res
def feeds2index(self, feeds):
@ -294,11 +357,14 @@ class BasicNewsRecipe(object):
return logger, out
def fetch_article(self, url, dir, logger):
fetcher = RecursiveFetcher(self.web2disk_options, logger, self.image_map)
fetcher = RecursiveFetcher(self.web2disk_options, logger, self.image_map, self.css_map)
fetcher.base_dir = dir
fetcher.current_dir = dir
fetcher.show_progress = False
return fetcher.start_fetch(url)
res, path, failures = fetcher.start_fetch(url), fetcher.downloaded_paths, fetcher.failed_links
if not res:
raise Exception(_('Could not fetch article. Run with --debug to see the reason'))
return res, path, failures
def build_index(self):
self.report_progress(0, _('Fetching feeds...'))
@ -331,58 +397,111 @@ class BasicNewsRecipe(object):
req.stream = stream
req.feed = feed
req.article = article
req.feed_dir = feed_dir
self.jobs.append(req)
self.jobs_done = 0
tp = ThreadPool(self.simultaneous_downloads)
for req in self.jobs:
tp.putRequest(req, block=True, timeout=0)
self.report_progress(0, _('Starting download [%d thread(s)]...')%self.simultaneous_downloads)
while True:
try:
tp.poll(True)
tp.poll()
time.sleep(0.1)
except NoResultsPending:
break
html = self.feed2index(feed)
open(os.path.join(feed_dir, 'index.html'), 'wb').write(html)
for f, feed in enumerate(feeds):
html = self.feed2index(feed)
feed_dir = os.path.join(self.output_dir, 'feed_%d'%f)
open(os.path.join(feed_dir, 'index.html'), 'wb').write(html)
self.create_opf(feeds)
self.report_progress(1, _('Feeds downloaded to %s')%index)
return index
def download_cover(self):
self.cover_path = None
try:
cu = self.get_cover_url()
except Exception, err:
cu = None
self.logger.error(_('Could not download cover: %s')%str(err))
self.logger.debug(traceback.format_exc())
if cu is not None:
ext = cu.rpartition('.')[-1]
ext = ext.lower() if ext else 'jpg'
self.report_progress(1, _('Downloading cover from %s')%cu)
cpath = os.path.join(self.output_dir, 'cover.'+ext)
cfile = open(cpath, 'wb')
cfile.write(self.browser.open(cu).read())
self.cover_path = cpath
def create_opf(self, feeds, dir=None):
if dir is None:
dir = self.output_dir
mi = MetaInformation(self.title + time.strftime(self.timefmt), [__appname__])
opf = OPFCreator(mi)
opf_path = os.path.join(dir, 'index.opf')
cpath = getattr(self, 'cover_path', None)
if cpath is not None and os.access(cpath, os.R_OK):
opf.cover = cpath
entries = ['index.html']
for i, f in enumerate(feeds):
entries.append('feed_%d/index.html'%i)
for j, a in enumerate(f):
if getattr(a, 'downloaded', False):
adir = 'feed_%d/article_%d/'%(i, j)
entries.append('%sindex.html'%adir)
for sp in a.sub_pages:
prefix = os.path.commonprefix([opf_path, sp])
relp = sp[len(prefix):]
entries.append(relp.replace(os.sep, '/'))
opf.create_manifest(itertools.izip(entries, itertools.repeat('text/html')))
opf.create_spine(entries)
opf.write(open(opf_path, 'wb'))
def article_downloaded(self, request, result):
index = os.path.join(os.path.dirname(result), 'index.html')
os.rename(result, index)
index = os.path.join(os.path.dirname(result[0]), 'index.html')
os.rename(result[0], index)
src = open(index, 'rb').read().decode('utf-8')
f, a = request.requestID
soup = BeautifulSoup(src)
body = soup.find('body')
if body is not None:
top = self.navbar.generate(False, f, a, len(request.feed), not self.has_single_feed).render(doctype='xhtml')
bottom = self.navbar.generate(True, f, a, len(request.feed), not self.has_single_feed).render(doctype='xhtml')
top = BeautifulSoup(top).find('div')
bottom = BeautifulSoup(bottom).find('div')
body.insert(0, top)
body.insert(len(body.contents), bottom)
open(index, 'wb').write(unicode(soup).encode('utf-8'))
article = request.article
self.logger.debug(_('\nDownloaded article %s from %s\n%s')%(article.title, article.url, request.stream.getvalue()))
article.url = result
self.logger.debug(_('\nDownloaded article %s from %s\n%s')%(article.title, article.url, request.stream.getvalue().decode('utf-8', 'ignore')))
article.url = result[0]
article.downloaded = True
article.sub_pages = result[1][1:]
self.jobs_done += 1
self.report_progress(float(self.jobs_done)/len(self.jobs), _('Article downloaded: %s')%article.title)
if result[2]:
self.partial_failures.append((request.feed.title, article.title, article.url, result[2]))
def error_in_article_download(self, request, exc_info):
def error_in_article_download(self, request, traceback):
self.jobs_done += 1
self.logger.error(_('Failed to download article: %s from %s')%(request.article.title, request.article.url))
self.logger.debug(traceback.format_exc(*exc_info))
self.logger.debug(request.stream.getvalue())
self.logger.error(_('Failed to download article: %s from %s\n')%(request.article.title, request.article.url))
debug = request.stream.getvalue().decode('utf-8', 'ignore')
self.logger.debug(debug)
self.logger.debug(traceback)
self.logger.debug('\n')
self.report_progress(float(self.jobs_done)/len(self.jobs), _('Article download failed: %s')%request.article.title)
self.failed_downloads.append((request.feed.title, request.article, debug))
def parse_feeds(self):
'''
@ -404,5 +523,3 @@ class BasicNewsRecipe(object):
max_articles_per_feed=self.max_articles_per_feed))
return parsed_feeds

View File

@ -0,0 +1,79 @@
#!/usr/bin/env python
## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
'''
Builtin recipes.
'''
recipes = ['newsweek']
import re
from libprs500.web.feeds.news import BasicNewsRecipe
from libprs500.ebooks.lrf.web.profiles import DefaultProfile, FullContentProfile
from libprs500.ebooks.lrf.web import available_profiles
basic_recipes = (BasicNewsRecipe, DefaultProfile, FullContentProfile)
basic_recipe_names = (i.__name__ for i in basic_recipes)
#: Compiled builtin recipe/profile classes
def load_recipe(module, package='libprs500.web.feeds.recipes'):
module = __import__(package+'.'+module, fromlist=[''])
for attr in dir(module):
obj = getattr(module, attr)
if type(obj) is not type:
continue
recipe = False
for b in obj.__bases__:
if b in basic_recipes:
recipe = True
break
if not recipe:
continue
if obj not in basic_recipes:
return obj
recipes = [load_recipe(i) for i in recipes]
def compile_recipe(src):
'''
Compile the code in src and return the first object that is a recipe or profile.
@return: Recipe/Profile class or None, if no such class was found in C{src}
'''
locals = {}
exec src in globals(), locals
for obj in locals.values():
if type(obj) is type and obj.__name__ not in basic_recipe_names:
for base in obj.__bases__:
if base in basic_recipes:
return obj
return None
def get_builtin_recipe(title):
'''
Return a builtin recipe/profile class whose title == C{title} or None if no such
recipe exists.
@type title: string
@rtype: class or None
'''
for r in recipes:
if r.title == title:
return r
titles = set([r.title for r in recipes])
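
A hedged usage sketch of the helpers above; the recipe source mirrors the old _feeds placeholder that this commit removes:

from libprs500.web.feeds.recipes import compile_recipe, get_builtin_recipe, titles

src = 'class Temp(BasicNewsRecipe):\n\ttitle = "temp"'
temp_class = compile_recipe(src)           # first recipe class defined in src
newsweek = get_builtin_recipe('Newsweek')  # None if no builtin has that title
print(sorted(titles))                      # titles of all builtin recipes
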

View File

@ -0,0 +1,90 @@
#!/usr/bin/env python
## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
import re
from libprs500.web.feeds.news import BasicNewsRecipe
from libprs500.ebooks.BeautifulSoup import BeautifulSoup
class Newsweek(BasicNewsRecipe):
title = 'Newsweek'
__author__ = 'Kovid Goyal'
feeds = [
('Top News', 'http://feeds.newsweek.com/newsweek/TopNews',),
'http://feeds.newsweek.com/newsweek/columnists/StevenLevy',
('Politics', 'http://feeds.newsweek.com/headlines/politics'),
('Health', 'http://feeds.newsweek.com/headlines/health'),
('Business', 'http://feeds.newsweek.com/headlines/business'),
('Science and Technology', 'http://feeds.newsweek.com/headlines/technology/science'),
('National News', 'http://feeds.newsweek.com/newsweek/NationalNews'),
('World News', 'http://feeds.newsweek.com/newsweek/WorldNews'),
'http://feeds.newsweek.com/newsweek/Columnists/ChristopherDickey',
'http://feeds.newsweek.com/newsweek/Columnists/FareedZakaria',
('Iraq', 'http://feeds.newsweek.com/newsweek/iraq'),
('Society', 'http://feeds.newsweek.com/newsweek/society'),
('Entertainment', 'http://feeds.newsweek.com/newsweek/entertainment'),
'http://feeds.newsweek.com/newsweek/columnists/GeorgeFWill',
'http://feeds.newsweek.com/newsweek/columnists/AnnaQuindlen',
]
# For testing
feeds = feeds[:2]
max_articles_per_feed = 1
keep_only_tags = [dict(name='div', id='content')]
remove_tags = [
dict(name=['script', 'noscript']),
dict(name='div', attrs={'class':['ad', 'SocialLinks', 'SocialLinksDiv', 'channel', 'bot', 'nav', 'top', 'EmailArticleBlock']}),
dict(name='div', attrs={'class':re.compile('box')}),
dict(id=['ToolBox', 'EmailMain', 'EmailArticle', ])
]
recursions = 1
match_regexps = [r'http://www.newsweek.com/id/\S+/page/\d+']
def postprocess_html(self, soup):
divs = list(soup.findAll('div', 'pagination'))
divs[0].extract()
if len(divs) > 1:
soup.find('body')['style'] = 'page-break-after:avoid'
divs[1].extract()
h1 = soup.find('h1')
if h1:
h1.extract()
ai = soup.find('div', 'articleInfo')
ai.extract()
else:
soup.find('body')['style'] = 'page-break-before:always; page-break-after:avoid;'
return soup
def get_current_issue(self):
from urllib2 import urlopen # For some reason mechanize fails
home = urlopen('http://www.newsweek.com').read()
soup = BeautifulSoup(home)
img = soup.find('img', alt='Current Magazine')
if img and img.parent.has_key('href'):
return urlopen(img.parent['href']).read()
def get_cover_url(self):
ci = self.get_current_issue()
if ci is not None:
soup = BeautifulSoup(ci)
img = soup.find(alt='Cover')
if img is not None and img.has_key('src'):
small = img['src']
return small.replace('coversmall', 'coverlarge')
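
A hedged sketch of exercising the new recipe through feeds2disk's main() rather than the command line; the exact argument convention is an assumption based on the len(args) != 2 check in main.py above:

from libprs500.web.feeds.main import main

# Roughly equivalent to running: feeds2disk Newsweek
main(['feeds2disk', 'Newsweek'])
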

View File

@ -104,7 +104,7 @@ class IndexTemplate(Template):
<p style="text-align:right">${datetime.now().strftime(datefmt)}</p>
<ul>
<py:for each="i, feed in enumerate(feeds)">
<li id="feed_${str(i)}">
<li py:if="feed" id="feed_${str(i)}">
<a class="feed" href="${'feed_%d/index.html'%i}">${feed.title}</a>
</li>
</py:for>
@ -136,7 +136,7 @@ class FeedTemplate(Template):
${style}
</style>
</head>
<body>
<body style="page-break-before:always">
<h2>${feed.title}</h2>
<py:if test="feed.image">
<div class="feed_image">
@ -144,7 +144,7 @@ class FeedTemplate(Template):
</div>
</py:if>
<ul>
<py:for each="i, article in enumerate(feed)">
<py:for each="i, article in enumerate(feed.articles)">
<li id="${'article_%d'%i}" py:if="getattr(article, 'downloaded', False)">
<a class="article" href="${article.url}">${article.title}</a>
<span class="article_date">${article.localtime.strftime(" [%a, %d %b %H:%M]")}</span>

View File

@ -17,12 +17,12 @@ Fetch a webpage and its links recursively. The webpages are saved to disk in
UTF-8 encoding with any charset declarations removed.
'''
from __future__ import with_statement
import sys, socket, os, urlparse, codecs, logging, re, time, copy, urllib2, threading
import sys, socket, os, urlparse, codecs, logging, re, time, copy, urllib2, threading, traceback
from urllib import url2pathname
from httplib import responses
from libprs500 import setup_cli_handlers, browser, sanitize_file_name, OptionParser
from libprs500.ebooks.BeautifulSoup import BeautifulSoup
from libprs500.ebooks.BeautifulSoup import BeautifulSoup, Tag
from libprs500.ebooks.chardet import xml_to_unicode
class FetchError(Exception):
@ -37,10 +37,11 @@ def basename(url):
return res
def save_soup(soup, target):
for meta in soup.findAll('meta', content=True):
nm = Tag(soup, '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />')
for meta in soup.find('meta', content=True):
if 'charset' in meta['content']:
meta.extract()
f = codecs.open(target, 'w', 'utf8')
meta.replaceWith(nm)
f = codecs.open(target, 'w', 'utf-8')
f.write(unicode(soup))
f.close()
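
The charset normalisation can be illustrated stand-alone with the bundled BeautifulSoup 3; in this sketch the content attribute is rewritten in place, which differs from the tag replacement above:

from libprs500.ebooks.BeautifulSoup import BeautifulSoup

html = ('<html><head><meta http-equiv="Content-Type" '
        'content="text/html; charset=iso-8859-1" /></head><body>x</body></html>')
soup = BeautifulSoup(html)
for meta in soup.findAll('meta', content=True):
    if 'charset' in meta['content'].lower():
        meta['content'] = 'text/html; charset=UTF-8'
print(unicode(soup))   # the declaration now advertises UTF-8
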
@ -55,7 +56,7 @@ class RecursiveFetcher(object):
# )
CSS_IMPORT_PATTERN = re.compile(r'\@import\s+url\((.*?)\)', re.IGNORECASE)
def __init__(self, options, logger, image_map={}):
def __init__(self, options, logger, image_map={}, css_map={}):
self.logger = logger
self.base_dir = os.path.abspath(os.path.expanduser(options.dir))
if not os.path.exists(self.base_dir):
@ -74,20 +75,44 @@ class RecursiveFetcher(object):
self.filemap = {}
self.imagemap = image_map
self.imagemap_lock = threading.RLock()
self.stylemap = {}
self.stylemap = css_map
self.stylemap_lock = threading.RLock()
self.downloaded_paths = []
self.current_dir = self.base_dir
self.files = 0
self.preprocess_regexps = getattr(options, 'preprocess_regexps', [])
self.remove_tags = getattr(options, 'remove_tags', [])
self.remove_tags_after = getattr(options, 'remove_tags_after', None)
self.keep_only_tags = getattr(options, 'keep_only_tags', [])
self.preprocess_html_ext = getattr(options, 'preprocess_html', lambda soup: soup)
self.postprocess_html_ext= getattr(options, 'postprocess_html', lambda soup: soup)
self.download_stylesheets = not options.no_stylesheets
self.show_progress = True
self.failed_links = []
def get_soup(self, src):
nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
nmassage.extend(self.preprocess_regexps)
soup = BeautifulSoup(xml_to_unicode(src, self.verbose)[0], markupMassage=nmassage)
if self.keep_only_tags:
body = Tag(soup, 'body')
for spec in self.keep_only_tags:
for tag in soup.find('body').findAll(**spec):
body.insert(len(body.contents), tag)
soup.find('body').replaceWith(body)
if self.remove_tags_after is not None:
tag = soup.find(**self.remove_tags_after)
while tag is not None and tag.name != 'body':
after = tag.nextSibling
while after is not None:
ns = after.nextSibling
after.extract()
after = ns
tag = tag.parent
for kwds in self.remove_tags:
for tag in soup.findAll(**kwds):
tag.extract()
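
A self-contained illustration of the keep_only_tags body rebuild performed above, run against a toy page with the bundled BeautifulSoup 3 (tag names and ids are made up):

from libprs500.ebooks.BeautifulSoup import BeautifulSoup, Tag

html = ('<html><body><div id="nav">navigation</div>'
        '<div id="content">story text</div>'
        '<div class="comments">comments</div></body></html>')
soup = BeautifulSoup(html)

# New <body> filled only with the matching tags, then swapped into the tree.
body = Tag(soup, 'body')
for tag in soup.find('body').findAll(id='content'):
    body.insert(len(body.contents), tag)
soup.find('body').replaceWith(body)

print(unicode(soup))   # only <div id="content"> survives inside <body>
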
@ -105,7 +130,12 @@ class RecursiveFetcher(object):
except urllib2.URLError, err:
if hasattr(err, 'code') and responses.has_key(err.code):
raise FetchError, responses[err.code]
raise err
if err.reason[0] == 104: # Connection reset by peer
self.logger.debug('Connection reset by peer, retrying in 1 second.')
time.sleep(1)
f = self.browser.open(url)
else:
raise err
finally:
self.last_fetch_at = time.time()
return f
@ -146,9 +176,10 @@ class RecursiveFetcher(object):
iurl = tag['href']
if not urlparse.urlsplit(iurl).scheme:
iurl = urlparse.urljoin(baseurl, iurl, False)
if self.stylemap.has_key(iurl):
tag['href'] = self.stylemap[iurl]
continue
with self.stylemap_lock:
if self.stylemap.has_key(iurl):
tag['href'] = self.stylemap[iurl]
continue
try:
f = self.fetch_url(iurl)
except Exception, err:
@ -157,7 +188,8 @@ class RecursiveFetcher(object):
continue
c += 1
stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
self.stylemap[iurl] = stylepath
with self.stylemap_lock:
self.stylemap[iurl] = stylepath
open(stylepath, 'wb').write(f.read())
tag['href'] = stylepath
else:
@ -168,9 +200,10 @@ class RecursiveFetcher(object):
iurl = m.group(1)
if not urlparse.urlsplit(iurl).scheme:
iurl = urlparse.urljoin(baseurl, iurl, False)
if self.stylemap.has_key(iurl):
ns.replaceWith(src.replace(m.group(1), self.stylemap[iurl]))
continue
with self.stylemap_lock:
if self.stylemap.has_key(iurl):
ns.replaceWith(src.replace(m.group(1), self.stylemap[iurl]))
continue
try:
f = self.fetch_url(iurl)
except Exception, err:
@ -179,7 +212,8 @@ class RecursiveFetcher(object):
continue
c += 1
stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
self.stylemap[iurl] = stylepath
with self.stylemap_lock:
self.stylemap[iurl] = stylepath
open(stylepath, 'wb').write(f.read())
ns.replaceWith(src.replace(m.group(1), stylepath))
@ -214,7 +248,7 @@ class RecursiveFetcher(object):
open(imgpath, 'wb').write(f.read())
tag['src'] = imgpath
def absurl(self, baseurl, tag, key):
def absurl(self, baseurl, tag, key, filter=True):
iurl = tag[key]
parts = urlparse.urlsplit(iurl)
if not parts.netloc and not parts.path:
@ -224,7 +258,7 @@ class RecursiveFetcher(object):
if not self.is_link_ok(iurl):
self.logger.debug('Skipping invalid link: %s', iurl)
return None
if not self.is_link_wanted(iurl):
if filter and not self.is_link_wanted(iurl):
self.logger.debug('Filtered link: '+iurl)
return None
return iurl
@ -256,12 +290,12 @@ class RecursiveFetcher(object):
prev_dir = self.current_dir
try:
self.current_dir = diskpath
for tag in soup.findAll(lambda tag: tag.name.lower()=='a' and tag.has_key('href')):
for tag in soup.findAll('a', href=True):
if self.show_progress:
print '.',
sys.stdout.flush()
sys.stdout.flush()
iurl = self.absurl(baseurl, tag, 'href')
iurl = self.absurl(baseurl, tag, 'href', filter=recursion_level != 0)
if not iurl:
continue
nurl = self.normurl(iurl)
@ -293,6 +327,7 @@ class RecursiveFetcher(object):
self.process_stylesheets(soup, f.geturl())
res = os.path.join(linkdiskpath, basename(iurl))
self.downloaded_paths.append(res)
self.filemap[nurl] = res
if recursion_level < self.max_recursions:
self.logger.debug('Processing links...')
@ -301,9 +336,11 @@ class RecursiveFetcher(object):
self.process_return_links(soup, iurl)
self.logger.debug('Recursion limit reached. Skipping links in %s', iurl)
save_soup(soup, res)
save_soup(self.postprocess_html_ext(soup), res)
self.localize_link(tag, 'href', res)
except Exception, err:
self.failed_links.append((iurl, traceback.format_exc()))
self.logger.warning('Could not fetch link %s', iurl)
self.logger.debug('Error: %s', str(err), exc_info=True)
finally:

View File

@ -1,63 +0,0 @@
#!/usr/bin/env python
## Copyright (C) 2008 Kovid Goyal kovid@kovidgoyal.net
## This program is free software; you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 2 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License along
## with this program; if not, write to the Free Software Foundation, Inc.,
## 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
'''
Contains recipes for various common news sources and websites.
'''
import re
from libprs500.web.feeds.news import BasicNewsRecipe
_basic_recipes = (BasicNewsRecipe,)
_basic_recipe_names = (i.__name__ for i in _basic_recipes)
def compile_recipe(src):
'''
Compile the code in src and return the first object that is
'''
locals = {}
exec src in globals(), locals
for obj in locals.values():
if type(obj) is type and obj.__name__ not in _basic_recipe_names:
for base in obj.__bases__:
if base in _basic_recipes:
return obj
return None
def get_feed(title):
'''
Return a builtin recipe class whose title == C{title} or None if no such
recipe exists.
@type title: string
@rtype: class or None
'''
if isinstance(_feeds[0], basestring):
for i, val in enumerate(_feeds):
recipe = compile_recipe(val)
if recipe is None:
raise RuntimeError('The builtin Recipe #%d is invalid.'%i)
_feeds[i] = recipe
for recipe in _feeds:
if recipe.title == title:
return recipe
return None
#: Recipes to be used with feeds2disk
_feeds = ['class Temp(BasicNewsRecipe):\n\ttitle="temp"']