Make web2disk use relative paths. Add navbar at bottom of articles as well.

This commit is contained in:
Kovid Goyal 2008-03-16 10:14:23 +00:00
parent 1a1fd62a2c
commit e537e6a12d
6 changed files with 96 additions and 43 deletions
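
A note on the first half of the commit message: after web2disk saves a page, any src or href attribute that points at an absolute path on disk is rewritten relative to the directory the page was saved in, so the downloaded tree keeps working when moved. A minimal sketch of that idea (soup is assumed to be an already-parsed BeautifulSoup tree; os.path.relpath, added to Python after this code was written, computes what the hand-rolled abs2rel helper in the diff below computes):

    import os

    def make_links_relative(soup, target):
        # Directory the HTML file is written to; local links become relative to it.
        base = os.path.dirname(target)
        for tag in soup.findAll(['img', 'link', 'a']):
            for key in ('src', 'href'):
                path = tag.get(key, None)
                # Only rewrite references that are absolute paths to files on disk.
                if path and os.path.isabs(path) and os.path.exists(path):
                    tag[key] = os.path.relpath(path, base).replace(os.sep, '/')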

View File

@@ -31,6 +31,8 @@
     </manifest>
     <spine py:with="attrs={'toc':'ncx' if mi.toc else None}" py:attrs="attrs">
-        <itemref py:for="idref in mi.spine" idref="${str(idref)}" />
+        <py:for each="idref in mi.spine">
+            <itemref idref="${str(idref)}" />
+        </py:for>
     </spine>
 </package>

View File

@@ -68,7 +68,10 @@ If you specify this option, any argument to %prog is ignored and a default recipe
     return p
 
 def simple_progress_bar(percent, msg):
-    print '%d%%'%(percent*100),
+    if not msg:
+        print '%d%%'%(percent*100),
+    else:
+        print '%d%%'%(percent*100), msg
     sys.stdout.flush()
 
 def no_progress_bar(percent, msg):

View File

@@ -17,7 +17,7 @@
 The backend to parse feeds and create HTML that can then be converted
 to an ebook.
 '''
-import logging, os, cStringIO, time, traceback
+import logging, os, cStringIO, time, traceback, re
 import urlparse
 
 from libprs500 import browser, __appname__
@@ -329,17 +329,21 @@ class BasicNewsRecipe(object):
         self.partial_failures = []
 
-    def _postprocess_html(self, soup, last_fetch, article_url):
+    def _postprocess_html(self, soup, first_fetch, job_info):
         if self.extra_css is not None:
             head = soup.find('head')
             if head:
                 style = BeautifulSoup(u'<style type="text/css">%s</style>'%self.extra_css).find('style')
                 head.insert(len(head.contents), style)
-        if last_fetch:
+        if first_fetch:
+            url, f, a, feed_len = job_info
             body = soup.find('body')
-            if body:
-                div = BeautifulSoup('<div style="font:8pt monospace"><hr />This article was downloaded by <b>%s</b> from <a href="%s">%s</a></div>'%(__appname__, article_url, article_url)).find('div')
-                body.insert(len(body.contents), div)
+            if body is not None:
+                templ = self.navbar.generate(False, f, a, feed_len,
+                                             not self.has_single_feed,
+                                             url, __appname__)
+                elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
+                body.insert(0, elem)
         return self.postprocess_html(soup)
@@ -410,8 +414,8 @@ class BasicNewsRecipe(object):
             logger.addHandler(handler)
         return logger, out
 
-    def fetch_article(self, url, dir, logger):
-        fetcher = RecursiveFetcher(self.web2disk_options, logger, self.image_map, self.css_map, url)
+    def fetch_article(self, url, dir, logger, f, a, num_of_feeds):
+        fetcher = RecursiveFetcher(self.web2disk_options, logger, self.image_map, self.css_map, (url, f, a, num_of_feeds))
         fetcher.base_dir = dir
         fetcher.current_dir = dir
         fetcher.show_progress = False
@@ -455,7 +459,7 @@ class BasicNewsRecipe(object):
                     url = self.print_version(article.url)
                 except NotImplementedError:
                     url = article.url
-                req = WorkRequest(self.fetch_article, (url, art_dir, logger),
+                req = WorkRequest(self.fetch_article, (url, art_dir, logger, f, a, len(feed)),
                                   {}, (f, a), self.article_downloaded,
                                   self.error_in_article_download)
                 req.stream = stream
@@ -534,16 +538,29 @@ class BasicNewsRecipe(object):
                 adir = 'feed_%d/article_%d/'%(num, j)
                 entries.append('%sindex.html'%adir)
                 parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'))
+                last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
                 for sp in a.sub_pages:
                     prefix = os.path.commonprefix([opf_path, sp])
                     relp = sp[len(prefix):]
                     entries.append(relp.replace(os.sep, '/'))
+                    last = sp
+
+                src = open(last, 'rb').read()
+                soup = BeautifulSoup(src)
+                body = soup.find('body')
+                if body is not None:
+                    prefix = '/'.join('..' for i in range(2*len(re.findall(r'link\d+', last))))
+                    templ = self.navbar.generate(True, num, j, len(f),
+                                                 not self.has_single_feed,
+                                                 a.orig_url, __appname__, prefix=prefix)
+                    elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
+                    body.insert(len(body.contents), elem)
+                    open(last, 'wb').write(unicode(soup).encode('utf-8'))
 
         if len(feeds) > 1:
             for i, f in enumerate(feeds):
                 entries.append('feed_%d/index.html'%i)
-                feed = toc.add_item('feed_%d/index.html'%i, None, f.title)
-                feed_index(i, feed)
+                feed_index(i, toc.add_item('feed_%d/index.html'%i, None, f.title))
         else:
             entries.append('feed_%d/index.html'%0)
             feed_index(0, toc)
@@ -556,19 +573,13 @@ class BasicNewsRecipe(object):
     def article_downloaded(self, request, result):
         index = os.path.join(os.path.dirname(result[0]), 'index.html')
-        os.rename(result[0], index)
-        src = open(index, 'rb').read().decode('utf-8')
-        f, a = request.requestID
-        soup = BeautifulSoup(src)
-        body = soup.find('body')
-        if body is not None:
-            top = self.navbar.generate(False, f, a, len(request.feed), not self.has_single_feed).render(doctype='xhtml')
-            top = BeautifulSoup(top).find('div')
-            body.insert(0, top)
-        open(index, 'wb').write(unicode(soup).encode('utf-8'))
+        if index != result[0]:
+            os.rename(result[0], index)
+        a = request.requestID[1]
         article = request.article
         self.logger.debug(_('\nDownloaded article %s from %s\n%s')%(article.title, article.url, request.stream.getvalue().decode('utf-8', 'ignore')))
+        article.orig_url = article.url
         article.url = 'article_%d/index.html'%a
         article.downloaded = True
         article.sub_pages = result[1][1:]

View File

@@ -20,8 +20,10 @@ from libprs500.ebooks.BeautifulSoup import BeautifulSoup
 class Newsweek(BasicNewsRecipe):
 
     title = 'Newsweek'
     __author__ = 'Kovid Goyal'
+    no_stylesheets = True
+    oldest_article = 11
 
     feeds = [
              ('Top News', 'http://feeds.newsweek.com/newsweek/TopNews',),

View File

@@ -55,17 +55,21 @@ class NavBarTemplate(Template):
        >
            <body>
-               <div class="navbar" style="text-align:center">
+               <div class="navbar" style="text-align:center; font-family:monospace; font-size:10pt">
                    <hr py:if="bottom" />
-                   <py:if test="art != num - 1">
-                       | <a href="../article_${str(art+1)}/index.html">Next</a>
+                   <p py:if="bottom" style="font-size:8pt; text-align:left">
+                       This article was downloaded by <b>${__appname__}</b> from <a href="${url}">${url}</a>
+                   </p>
+                   <br py:if="bottom" /><br py:if="bottom" />
+                   <py:if test="art != num - 1 and not bottom">
+                       | <a href="${prefix}/../article_${str(art+1)}/index.html">Next</a>
                    </py:if>
-                   | <a href="../index.html#article_${str(art)}">Up one level</a>
+                   | <a href="${prefix}/../index.html#article_${str(art)}">Up one level</a>
                    <py:if test="two_levels">
-                   | <a href="../../index.html#_${str(feed)}">Up two levels</a>
+                   | <a href="${prefix}/../../index.html#feed_${str(feed)}">Up two levels</a>
                    </py:if>
-                   <py:if test="art != 0">
-                       | <a href="../article_${str(art-1)}/index.html">Previous</a>
+                   <py:if test="art != 0 and not bottom">
+                       | <a href="${prefix}/../article_${str(art-1)}/index.html">Previous</a>
                    </py:if>
                    |
                    <hr py:if="not bottom" />
@@ -74,8 +78,12 @@ class NavBarTemplate(Template):
        </html>
        ''')
 
-    def generate(self, bottom, feed, art, number_of_articles_in_feed, two_levels):
-        return Template.generate(self, bottom=bottom, art=art, num=number_of_articles_in_feed, two_levels=two_levels)
+    def generate(self, bottom, feed, art, number_of_articles_in_feed,
+                 two_levels, url, __appname__, prefix=''):
+        return Template.generate(self, bottom=bottom, art=art, feed=feed,
+                                 num=number_of_articles_in_feed,
+                                 two_levels=two_levels, url=url,
+                                 __appname__=__appname__, prefix=prefix)
 
 
 class IndexTemplate(Template):
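
For orientation, the widened generate() signature can be exercised on its own. A rough usage sketch follows; the article and feed numbers and the URL are made up, and the import path is an assumption based on how news.py refers to the template module:

    from libprs500 import __appname__
    from libprs500.web.feeds.templates import NavBarTemplate

    # Bottom-of-article navbar for article index 2 of a 5-article feed in a
    # multi-feed (two_levels=True) download; prefix is left at its default ''.
    navbar = NavBarTemplate()
    templ = navbar.generate(True, 0, 2, 5, True,
                            'http://example.com/story', __appname__)
    print templ.render(doctype='xhtml').decode('utf-8')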
@@ -145,7 +153,7 @@ class FeedTemplate(Template):
                    </div>
                </py:if>
                <div py:if="feed.description">
-                   ${feed.description}
+                   ${feed.description}<br />
                </div>
                <ul>
                    <py:for each="i, article in enumerate(feed.articles)">

View File

@@ -43,6 +43,30 @@ def save_soup(soup, target):
     for meta in metas:
         if 'charset' in meta['content']:
             meta.replaceWith(nm)
+
+    selfdir = os.path.dirname(target)
+
+    def abs2rel(path, base):
+        prefix = os.path.commonprefix([path, base])
+        if not os.path.exists(prefix) or not os.path.isdir(prefix):
+            prefix = os.path.dirname(prefix)
+        prefix = os.path.normpath(prefix)
+        if prefix.startswith(selfdir): # path is in a subdirectory
+            return path[len(prefix)+1:]
+        from_prefix = path[len(prefix)+1:]
+        left = base
+        ups = []
+        while left != prefix:
+            left = os.path.split(left)[0]
+            ups.append('..')
+        ups.append(from_prefix)
+        return os.path.join(*ups)
+
+    for tag in soup.findAll(['img', 'link', 'a']):
+        for key in ('src', 'href'):
+            path = tag.get(key, None)
+            if path and os.path.exists(path) and os.path.isabs(path):
+                tag[key] = abs2rel(path, selfdir).replace(os.sep, '/')
+
     f = codecs.open(target, 'w', 'utf-8')
     f.write(unicode(soup))
     f.close()
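
To make the intent of the new abs2rel helper concrete, here are two hypothetical calls and the values it is designed to return (it consults the filesystem, so the directories are assumed to exist; base is the directory the HTML is being saved into):

    # base = '/out/feed_0/article_3'
    # abs2rel('/out/feed_0/article_3/images/0.jpg', base)  ->  'images/0.jpg'
    # abs2rel('/out/feed_0/stylesheets/style0.css', base)  ->  '../stylesheets/style0.css'
    # Resources below the target directory keep only their tail; anything else
    # is reached by climbing with '..' to the common prefix, then descending.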
@@ -92,8 +116,8 @@ class RecursiveFetcher(object):
         self.download_stylesheets = not options.no_stylesheets
         self.show_progress = True
         self.failed_links = []
         self.job_info = job_info
 
     def get_soup(self, src):
         nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
         nmassage.extend(self.preprocess_regexps)
@@ -180,8 +204,7 @@
         diskpath = os.path.join(self.current_dir, 'stylesheets')
         if not os.path.exists(diskpath):
             os.mkdir(diskpath)
-        c = 0
-        for tag in soup.findAll(lambda tag: tag.name.lower() in ['link', 'style'] and tag.has_key('type') and tag['type'].lower() == 'text/css'):
+        for c, tag in enumerate(soup.findAll(lambda tag: tag.name.lower() in ['link', 'style'] and tag.has_key('type') and tag['type'].lower() == 'text/css')):
            if tag.has_key('href'):
                iurl = tag['href']
                if not urlparse.urlsplit(iurl).scheme:
@@ -196,7 +219,6 @@
                    self.logger.warning('Could not fetch stylesheet %s', iurl)
                    self.logger.debug('Error: %s', str(err), exc_info=True)
                    continue
-               c += 1
                stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
                with self.stylemap_lock:
                    self.stylemap[iurl] = stylepath
@@ -301,6 +323,7 @@
            try:
                self.current_dir = diskpath
                tags = list(soup.findAll('a', href=True))
+
                for c, tag in enumerate(tags):
                    if self.show_progress:
                        print '.',
@@ -324,7 +347,7 @@
                        f = self.fetch_url(iurl)
                        dsrc = f.read()
                        if len(dsrc) == 0:
-                           raise Exception('No content')
+                           raise ValueError('No content at URL %s'%iurl)
                        if self.encoding is not None:
                            dsrc = dsrc.decode(self.encoding, 'ignore')
                        else:
@@ -347,9 +370,13 @@
                            self.logger.debug('Recursion limit reached. Skipping links in %s', iurl)
                        if callable(self.postprocess_html_ext):
-                           soup = self.postprocess_html_ext(soup, c == len(tags)-1, self.job_info)
-                       save_soup(soup, res)
+                           soup = self.postprocess_html_ext(soup,
+                                   c==0 and recursion_level==0 and not getattr(self, 'called_first', False),
+                                   self.job_info)
+                           if c==0 and recursion_level == 0:
+                               self.called_first = True
+                       save_soup(soup, res)
                        self.localize_link(tag, 'href', res)
                    except Exception, err:
                        self.failed_links.append((iurl, traceback.format_exc()))