Make web2disk use relative paths. Add navbar at bottom of articles as well.

Kovid Goyal 2008-03-16 10:14:23 +00:00
parent 1a1fd62a2c
commit e537e6a12d
6 changed files with 96 additions and 43 deletions

View File

@@ -31,6 +31,8 @@
</manifest>
<spine py:with="attrs={'toc':'ncx' if mi.toc else None}" py:attrs="attrs">
<itemref py:for="idref in mi.spine" idref="${str(idref)}" />
<py:for each="idref in mi.spine">
<itemref idref="${str(idref)}" />
</py:for>
</spine>
</package>

View File

@@ -68,7 +68,10 @@ If you specify this option, any argument to %prog is ignored and a default recipe
return p
def simple_progress_bar(percent, msg):
print '%d%%'%(percent*100),
if not msg:
print '%d%%'%(percent*100),
else:
print '%d%%'%(percent*100), msg
sys.stdout.flush()
def no_progress_bar(percent, msg):
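For reference, a quick sketch of the new callback's behaviour at the interpreter (illustrative values; an empty msg takes the first branch, and the trailing comma suppresses the newline):

>>> simple_progress_bar(0.5, '')
50%
>>> simple_progress_bar(0.5, 'Fetching feeds...')
50% Fetching feeds...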

View File

@@ -17,7 +17,7 @@
The backend to parse feeds and create HTML that can then be converted
to an ebook.
'''
import logging, os, cStringIO, time, traceback
import logging, os, cStringIO, time, traceback, re
import urlparse
from libprs500 import browser, __appname__
@@ -329,17 +329,21 @@ class BasicNewsRecipe(object):
self.partial_failures = []
def _postprocess_html(self, soup, last_fetch, article_url):
def _postprocess_html(self, soup, first_fetch, job_info):
if self.extra_css is not None:
head = soup.find('head')
if head:
style = BeautifulSoup(u'<style type="text/css">%s</style>'%self.extra_css).find('style')
head.insert(len(head.contents), style)
if last_fetch:
if first_fetch:
url, f, a, feed_len = job_info
body = soup.find('body')
if body:
div = BeautifulSoup('<div style="font:8pt monospace"><hr />This article was downloaded by <b>%s</b> from <a href="%s">%s</a></div>'%(__appname__, article_url, article_url)).find('div')
body.insert(len(body.contents), div)
if body is not None:
templ = self.navbar.generate(False, f, a, feed_len,
not self.has_single_feed,
url, __appname__)
elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
body.insert(0, elem)
return self.postprocess_html(soup)
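The job_info tuple consumed here is assembled in fetch_article below; a note on its layout, since the names shift slightly between call sites:

# job_info = (url, f, a, num_of_feeds)
#   url          -> the article's original URL
#   f, a         -> feed index and article index within that feed
#   num_of_feeds -> despite the name, this carries len(feed), the number of
#                   articles in the feed, unpacked above as feed_len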
@@ -410,8 +414,8 @@ class BasicNewsRecipe(object):
logger.addHandler(handler)
return logger, out
def fetch_article(self, url, dir, logger):
fetcher = RecursiveFetcher(self.web2disk_options, logger, self.image_map, self.css_map, url)
def fetch_article(self, url, dir, logger, f, a, num_of_feeds):
fetcher = RecursiveFetcher(self.web2disk_options, logger, self.image_map, self.css_map, (url, f, a, num_of_feeds))
fetcher.base_dir = dir
fetcher.current_dir = dir
fetcher.show_progress = False
@@ -455,7 +459,7 @@ class BasicNewsRecipe(object):
url = self.print_version(article.url)
except NotImplementedError:
url = article.url
req = WorkRequest(self.fetch_article, (url, art_dir, logger),
req = WorkRequest(self.fetch_article, (url, art_dir, logger, f, a, len(feed)),
{}, (f, a), self.article_downloaded,
self.error_in_article_download)
req.stream = stream
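Schematically, the three new arguments ride along in the WorkRequest and end up as the fetcher's job_info:

# WorkRequest(self.fetch_article, (url, art_dir, logger, f, a, len(feed)), ...)
#   -> fetch_article(url, dir, logger, f, a, num_of_feeds)
#   -> RecursiveFetcher(..., (url, f, a, num_of_feeds))   # stored as job_info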
@@ -534,16 +538,29 @@ class BasicNewsRecipe(object):
adir = 'feed_%d/article_%d/'%(num, j)
entries.append('%sindex.html'%adir)
parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'))
last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
for sp in a.sub_pages:
prefix = os.path.commonprefix([opf_path, sp])
relp = sp[len(prefix):]
entries.append(relp.replace(os.sep, '/'))
last = sp
src = open(last, 'rb').read()
soup = BeautifulSoup(src)
body = soup.find('body')
if body is not None:
prefix = '/'.join('..' for i in range(2*len(re.findall(r'link\d+', last))))
templ = self.navbar.generate(True, num, j, len(f),
not self.has_single_feed,
a.orig_url, __appname__, prefix=prefix)
elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
body.insert(len(body.contents), elem)
open(last, 'wb').write(unicode(soup).encode('utf-8'))
if len(feeds) > 1:
for i, f in enumerate(feeds):
entries.append('feed_%d/index.html'%i)
feed = toc.add_item('feed_%d/index.html'%i, None, f.title)
feed_index(i, feed)
feed_index(i, toc.add_item('feed_%d/index.html'%i, None, f.title))
else:
entries.append('feed_%d/index.html'%0)
feed_index(0, toc)
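The prefix expression emits two '..' segments for every linkNN component in the saved path; a quick illustration with a hypothetical sub-page location:

>>> import re
>>> last = 'feed_2/article_0/link1/link4/index.html'   # hypothetical path
>>> '/'.join('..' for i in range(2*len(re.findall(r'link\d+', last))))
'../../../..'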
@@ -556,19 +573,13 @@ class BasicNewsRecipe(object):
def article_downloaded(self, request, result):
index = os.path.join(os.path.dirname(result[0]), 'index.html')
os.rename(result[0], index)
src = open(index, 'rb').read().decode('utf-8')
f, a = request.requestID
soup = BeautifulSoup(src)
body = soup.find('body')
if body is not None:
top = self.navbar.generate(False, f, a, len(request.feed), not self.has_single_feed).render(doctype='xhtml')
top = BeautifulSoup(top).find('div')
body.insert(0, top)
open(index, 'wb').write(unicode(soup).encode('utf-8'))
if index != result[0]:
os.rename(result[0], index)
a = request.requestID[1]
article = request.article
self.logger.debug(_('\nDownloaded article %s from %s\n%s')%(article.title, article.url, request.stream.getvalue().decode('utf-8', 'ignore')))
article.orig_url = article.url
article.url = 'article_%d/index.html'%a
article.downloaded = True
article.sub_pages = result[1][1:]

View File

@@ -20,8 +20,10 @@ from libprs500.ebooks.BeautifulSoup import BeautifulSoup
class Newsweek(BasicNewsRecipe):
title = 'Newsweek'
__author__ = 'Kovid Goyal'
no_stylesheets = True
oldest_article = 11
feeds = [
('Top News', 'http://feeds.newsweek.com/newsweek/TopNews',),

View File

@@ -55,17 +55,21 @@ class NavBarTemplate(Template):
>
<body>
<div class="navbar" style="text-align:center">
<div class="navbar" style="text-align:center; font-family:monospace; font-size:10pt">
<hr py:if="bottom" />
<py:if test="art != num - 1">
| <a href="../article_${str(art+1)}/index.html">Next</a>
<p py:if="bottom" style="font-size:8pt; text-align:left">
This article was downloaded by <b>${__appname__}</b> from <a href="${url}">${url}</a>
</p>
<br py:if="bottom" /><br py:if="bottom" />
<py:if test="art != num - 1 and not bottom">
| <a href="${prefix}/../article_${str(art+1)}/index.html">Next</a>
</py:if>
| <a href="../index.html#article_${str(art)}">Up one level</a>
| <a href="${prefix}/../index.html#article_${str(art)}">Up one level</a>
<py:if test="two_levels">
| <a href="../../index.html#_${str(feed)}">Up two levels</a>
| <a href="${prefix}/../../index.html#feed_${str(feed)}">Up two levels</a>
</py:if>
<py:if test="art != 0">
| <a href="../article_${str(art-1)}/index.html">Previous</a>
<py:if test="art != 0 and not bottom">
| <a href="${prefix}/../article_${str(art-1)}/index.html">Previous</a>
</py:if>
|
<hr py:if="not bottom" />
@@ -74,8 +78,12 @@ class NavBarTemplate(Template):
</html>
''')
def generate(self, bottom, feed, art, number_of_articles_in_feed, two_levels):
return Template.generate(self, bottom=bottom, art=art, num=number_of_articles_in_feed, two_levels=two_levels)
def generate(self, bottom, feed, art, number_of_articles_in_feed,
two_levels, url, __appname__, prefix=''):
return Template.generate(self, bottom=bottom, art=art, feed=feed,
num=number_of_articles_in_feed,
two_levels=two_levels, url=url,
__appname__=__appname__, prefix=prefix)
class IndexTemplate(Template):
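A minimal sketch of a call with the widened signature, mirroring the call sites in the recipe code above (values are illustrative):

navbar = NavBarTemplate()
templ = navbar.generate(True, 0, 2, 10, True,
                        'http://example.com/story', 'libprs500',
                        prefix='../..')
div = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')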
@@ -145,7 +153,7 @@ class FeedTemplate(Template):
</div>
</py:if>
<div py:if="feed.description">
${feed.description}
${feed.description}<br />
</div>
<ul>
<py:for each="i, article in enumerate(feed.articles)">

View File

@@ -43,6 +43,30 @@ def save_soup(soup, target):
for meta in metas:
if 'charset' in meta['content']:
meta.replaceWith(nm)
selfdir = os.path.dirname(target)
def abs2rel(path, base):
prefix = os.path.commonprefix([path, base])
if not os.path.exists(prefix) or not os.path.isdir(prefix):
prefix = os.path.dirname(prefix)
prefix = os.path.normpath(prefix)
if prefix.startswith(selfdir): # path is in a subdirectory
return path[len(prefix)+1:]
from_prefix = path[len(prefix)+1:]
left = base
ups = []
while left != prefix:
left = os.path.split(left)[0]
ups.append('..')
ups.append(from_prefix)
return os.path.join(*ups)
for tag in soup.findAll(['img', 'link', 'a']):
for key in ('src', 'href'):
path = tag.get(key, None)
if path and os.path.exists(path) and os.path.isabs(path):
tag[key] = abs2rel(path, selfdir).replace(os.sep, '/')
f = codecs.open(target, 'w', 'utf-8')
f.write(unicode(soup))
f.close()
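To make the rewrite concrete, a worked sketch under a hypothetical POSIX output tree (abs2rel probes the disk with os.path.isdir, so all directories are assumed to exist):

# selfdir = '/out/feed_0/article_0'   (directory of the target file)
#
# A resource below selfdir keeps only its child path:
#   abs2rel('/out/feed_0/article_0/images/img1.jpg', selfdir)
#     -> 'images/img1.jpg'
#
# A shared resource in a sibling tree walks up to the common prefix:
#   abs2rel('/out/feed_0/stylesheets/style1.css', selfdir)
#     -> '../stylesheets/style1.css'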
@@ -92,8 +116,8 @@ class RecursiveFetcher(object):
self.download_stylesheets = not options.no_stylesheets
self.show_progress = True
self.failed_links = []
self.job_info = job_info
def get_soup(self, src):
nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
nmassage.extend(self.preprocess_regexps)
@@ -180,8 +204,7 @@ class RecursiveFetcher(object):
diskpath = os.path.join(self.current_dir, 'stylesheets')
if not os.path.exists(diskpath):
os.mkdir(diskpath)
c = 0
for tag in soup.findAll(lambda tag: tag.name.lower() in ['link', 'style'] and tag.has_key('type') and tag['type'].lower() == 'text/css'):
for c, tag in enumerate(soup.findAll(lambda tag: tag.name.lower() in ['link', 'style'] and tag.has_key('type') and tag['type'].lower() == 'text/css')):
if tag.has_key('href'):
iurl = tag['href']
if not urlparse.urlsplit(iurl).scheme:
@@ -196,7 +219,6 @@ class RecursiveFetcher(object):
self.logger.warning('Could not fetch stylesheet %s', iurl)
self.logger.debug('Error: %s', str(err), exc_info=True)
continue
c += 1
stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
with self.stylemap_lock:
self.stylemap[iurl] = stylepath
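One behavioural nuance of the enumerate rewrite, noted for completeness:

# Before: c += 1 ran only after a successful fetch, so styleN.css names were
#   consecutive. Now enumerate advances c for every matching tag, so a failed
#   stylesheet leaves a gap in the numbering -- the names stay unique either way.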
@@ -301,6 +323,7 @@ class RecursiveFetcher(object):
try:
self.current_dir = diskpath
tags = list(soup.findAll('a', href=True))
for c, tag in enumerate(tags):
if self.show_progress:
print '.',
@@ -324,7 +347,7 @@ class RecursiveFetcher(object):
f = self.fetch_url(iurl)
dsrc = f.read()
if len(dsrc) == 0:
raise Exception('No content')
raise ValueError('No content at URL %s'%iurl)
if self.encoding is not None:
dsrc = dsrc.decode(self.encoding, 'ignore')
else:
@@ -347,9 +370,13 @@ class RecursiveFetcher(object):
self.logger.debug('Recursion limit reached. Skipping links in %s', iurl)
if callable(self.postprocess_html_ext):
soup = self.postprocess_html_ext(soup, c == len(tags)-1, self.job_info)
save_soup(soup, res)
soup = self.postprocess_html_ext(soup,
c==0 and recursion_level==0 and not getattr(self, 'called_first', False),
self.job_info)
if c==0 and recursion_level == 0:
self.called_first = True
save_soup(soup, res)
self.localize_link(tag, 'href', res)
except Exception, err:
self.failed_links.append((iurl, traceback.format_exc()))
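A short gloss on the replacement of the old last_fetch flag: the condition passed as first_fetch can only ever be true once per article.

# c == 0               -> first link handled in this loop
# recursion_level == 0 -> top of the recursion, not a fetched sub-page
# called_first guard   -> set immediately afterwards, so later passes
#                         (and re-entries) see first_fetch == False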