Mirror of https://github.com/kovidgoyal/calibre.git
Make web2disk use relative paths. Add navbar at bottom of articles as well.
commit e537e6a12d
parent 1a1fd62a2c
@@ -31,6 +31,8 @@
     </manifest>
 
     <spine py:with="attrs={'toc':'ncx' if mi.toc else None}" py:attrs="attrs">
-        <itemref py:for="idref in mi.spine" idref="${str(idref)}" />
+        <py:for each="idref in mi.spine">
+            <itemref idref="${str(idref)}" />
+        </py:for>
     </spine>
 </package>
@@ -68,7 +68,10 @@ If you specify this option, any argument to %prog is ignored and a default recipe
     return p
 
 def simple_progress_bar(percent, msg):
-    print '%d%%'%(percent*100),
+    if not msg:
+        print '%d%%'%(percent*100),
+    else:
+        print '%d%%'%(percent*100), msg
     sys.stdout.flush()
 
 def no_progress_bar(percent, msg):
@@ -17,7 +17,7 @@
 The backend to parse feeds and create HTML that can then be converted
 to an ebook.
 '''
-import logging, os, cStringIO, time, traceback
+import logging, os, cStringIO, time, traceback, re
 import urlparse
 
 from libprs500 import browser, __appname__
@@ -329,17 +329,21 @@ class BasicNewsRecipe(object):
         self.partial_failures = []
 
 
-    def _postprocess_html(self, soup, last_fetch, article_url):
+    def _postprocess_html(self, soup, first_fetch, job_info):
         if self.extra_css is not None:
             head = soup.find('head')
             if head:
                 style = BeautifulSoup(u'<style type="text/css">%s</style>'%self.extra_css).find('style')
                 head.insert(len(head.contents), style)
-        if last_fetch:
+        if first_fetch:
+            url, f, a, feed_len = job_info
             body = soup.find('body')
-            if body:
-                div = BeautifulSoup('<div style="font:8pt monospace"><hr />This article was downloaded by <b>%s</b> from <a href="%s">%s</a></div>'%(__appname__, article_url, article_url)).find('div')
-                body.insert(len(body.contents), div)
+            if body is not None:
+                templ = self.navbar.generate(False, f, a, feed_len,
+                                             not self.has_single_feed,
+                                             url, __appname__)
+                elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
+                body.insert(0, elem)
 
         return self.postprocess_html(soup)
 
@@ -410,8 +414,8 @@ class BasicNewsRecipe(object):
         logger.addHandler(handler)
         return logger, out
 
-    def fetch_article(self, url, dir, logger):
-        fetcher = RecursiveFetcher(self.web2disk_options, logger, self.image_map, self.css_map, url)
+    def fetch_article(self, url, dir, logger, f, a, num_of_feeds):
+        fetcher = RecursiveFetcher(self.web2disk_options, logger, self.image_map, self.css_map, (url, f, a, num_of_feeds))
         fetcher.base_dir = dir
         fetcher.current_dir = dir
         fetcher.show_progress = False
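For orientation, the tuple passed to RecursiveFetcher here is the job_info that _postprocess_html unpacks above; a minimal sketch with made-up values:

    # Illustrative only: the URL, feed index, article index and feed length are invented.
    job_info = ('http://example.com/story', 1, 4, 9)   # packed as in fetch_article
    url, f, a, feed_len = job_info                      # unpacked as in _postprocess_html
    print url, f, a, feed_len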
@@ -455,7 +459,7 @@ class BasicNewsRecipe(object):
                     url = self.print_version(article.url)
                 except NotImplementedError:
                     url = article.url
-                req = WorkRequest(self.fetch_article, (url, art_dir, logger),
+                req = WorkRequest(self.fetch_article, (url, art_dir, logger, f, a, len(feed)),
                                   {}, (f, a), self.article_downloaded,
                                   self.error_in_article_download)
                 req.stream = stream
@@ -534,16 +538,29 @@ class BasicNewsRecipe(object):
                 adir = 'feed_%d/article_%d/'%(num, j)
                 entries.append('%sindex.html'%adir)
                 parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'))
+                last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
                 for sp in a.sub_pages:
                     prefix = os.path.commonprefix([opf_path, sp])
                     relp = sp[len(prefix):]
                     entries.append(relp.replace(os.sep, '/'))
+                    last = sp
+
+                src = open(last, 'rb').read()
+                soup = BeautifulSoup(src)
+                body = soup.find('body')
+                if body is not None:
+                    prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last))))
+                    templ = self.navbar.generate(True, num, j, len(f),
+                                                 not self.has_single_feed,
+                                                 a.orig_url, __appname__, prefix=prefix)
+                    elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
+                    body.insert(len(body.contents), elem)
+                    open(last, 'wb').write(unicode(soup).encode('utf-8'))
 
         if len(feeds) > 1:
             for i, f in enumerate(feeds):
                 entries.append('feed_%d/index.html'%i)
-                feed = toc.add_item('feed_%d/index.html'%i, None, f.title)
-                feed_index(i, feed)
+                feed_index(i, toc.add_item('feed_%d/index.html'%i, None, f.title))
         else:
             entries.append('feed_%d/index.html'%0)
             feed_index(0, toc)
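To make the prefix computation above concrete: the code emits two '..' segments for every linkN component in the saved page's path, so the bottom navbar's links climb back out of nested sub-page directories. A small sketch with a made-up path:

    # Sketch of the '..'-prefix logic from the hunk above; the path is hypothetical.
    import re
    last = 'feed_0/article_3/link1/link2/index.xhtml'
    prefix = '/'.join('..' for i in range(2*len(re.findall(r'link\d+', last))))
    print prefix   # -> ../../../..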
@@ -556,19 +573,13 @@ class BasicNewsRecipe(object):
 
     def article_downloaded(self, request, result):
         index = os.path.join(os.path.dirname(result[0]), 'index.html')
-        os.rename(result[0], index)
-        src = open(index, 'rb').read().decode('utf-8')
-        f, a = request.requestID
-        soup = BeautifulSoup(src)
-        body = soup.find('body')
-        if body is not None:
-            top = self.navbar.generate(False, f, a, len(request.feed), not self.has_single_feed).render(doctype='xhtml')
-            top = BeautifulSoup(top).find('div')
-            body.insert(0, top)
-        open(index, 'wb').write(unicode(soup).encode('utf-8'))
+        if index != result[0]:
+            os.rename(result[0], index)
+        a = request.requestID[1]
+
         article = request.article
         self.logger.debug(_('\nDownloaded article %s from %s\n%s')%(article.title, article.url, request.stream.getvalue().decode('utf-8', 'ignore')))
         article.orig_url = article.url
         article.url = 'article_%d/index.html'%a
         article.downloaded = True
         article.sub_pages = result[1][1:]
@@ -20,8 +20,10 @@ from libprs500.ebooks.BeautifulSoup import BeautifulSoup
 
 class Newsweek(BasicNewsRecipe):
 
-    title = 'Newsweek'
-    __author__ = 'Kovid Goyal'
+    title          = 'Newsweek'
+    __author__     = 'Kovid Goyal'
+    no_stylesheets = True
+    oldest_article = 11
 
     feeds = [
              ('Top News', 'http://feeds.newsweek.com/newsweek/TopNews',),
@@ -55,17 +55,21 @@ class NavBarTemplate(Template):
 
         >
         <body>
-            <div class="navbar" style="text-align:center">
+            <div class="navbar" style="text-align:center; font-family:monospace; font-size:10pt">
                 <hr py:if="bottom" />
-                <py:if test="art != num - 1">
-                    | <a href="../article_${str(art+1)}/index.html">Next</a>
+                <p py:if="bottom" style="font-size:8pt; text-align:left">
+                    This article was downloaded by <b>${__appname__}</b> from <a href="${url}">${url}</a>
+                </p>
+                <br py:if="bottom" /><br py:if="bottom" />
+                <py:if test="art != num - 1 and not bottom">
+                    | <a href="${prefix}/../article_${str(art+1)}/index.html">Next</a>
                 </py:if>
-                | <a href="../index.html#article_${str(art)}">Up one level</a>
+                | <a href="${prefix}/../index.html#article_${str(art)}">Up one level</a>
                 <py:if test="two_levels">
-                | <a href="../../index.html#_${str(feed)}">Up two levels</a>
+                | <a href="${prefix}/../../index.html#feed_${str(feed)}">Up two levels</a>
                 </py:if>
-                <py:if test="art != 0">
-                    | <a href="../article_${str(art-1)}/index.html">Previous</a>
+                <py:if test="art != 0 and not bottom">
+                    | <a href="${prefix}/../article_${str(art-1)}/index.html">Previous</a>
                 </py:if>
 
                 <hr py:if="not bottom" />
@@ -74,8 +78,12 @@ class NavBarTemplate(Template):
 </html>
 ''')
 
-    def generate(self, bottom, feed, art, number_of_articles_in_feed, two_levels):
-        return Template.generate(self, bottom=bottom, art=art, num=number_of_articles_in_feed, two_levels=two_levels)
+    def generate(self, bottom, feed, art, number_of_articles_in_feed,
+                 two_levels, url, __appname__, prefix=''):
+        return Template.generate(self, bottom=bottom, art=art, feed=feed,
+                                 num=number_of_articles_in_feed,
+                                 two_levels=two_levels, url=url,
+                                 __appname__=__appname__, prefix=prefix)
 
 
 class IndexTemplate(Template):
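A minimal sketch of the new call signature, assuming NavBarTemplate() needs no constructor arguments; the feed index, article index, URL, application name and prefix below are invented, mirroring the bottom-navbar call added above:

    # Hypothetical values only; renders a bottom navbar for article 5 of a 10-article feed.
    navbar = NavBarTemplate()
    templ = navbar.generate(True, 2, 5, 10,          # bottom, feed, art, articles in feed
                            True,                    # two_levels
                            'http://example.com/story', 'libprs500',
                            prefix='../..')
    print templ.render(doctype='xhtml').decode('utf-8')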
@@ -145,7 +153,7 @@ class FeedTemplate(Template):
                 </div>
                 </py:if>
                 <div py:if="feed.description">
-                    ${feed.description}
+                    ${feed.description}<br />
                 </div>
                 <ul>
                     <py:for each="i, article in enumerate(feed.articles)">
@@ -43,6 +43,30 @@ def save_soup(soup, target):
     for meta in metas:
         if 'charset' in meta['content']:
             meta.replaceWith(nm)
+
+    selfdir = os.path.dirname(target)
+    def abs2rel(path, base):
+        prefix = os.path.commonprefix([path, base])
+        if not os.path.exists(prefix) or not os.path.isdir(prefix):
+            prefix = os.path.dirname(prefix)
+        prefix = os.path.normpath(prefix)
+        if prefix.startswith(selfdir): # path is in a subdirectory
+            return path[len(prefix)+1:]
+        from_prefix = path[len(prefix)+1:]
+        left = base
+        ups = []
+        while left != prefix:
+            left = os.path.split(left)[0]
+            ups.append('..')
+        ups.append(from_prefix)
+        return os.path.join(*ups)
+
+    for tag in soup.findAll(['img', 'link', 'a']):
+        for key in ('src', 'href'):
+            path = tag.get(key, None)
+            if path and os.path.exists(path) and os.path.isabs(path):
+                tag[key] = abs2rel(path, selfdir).replace(os.sep, '/')
+
     f = codecs.open(target, 'w', 'utf-8')
     f.write(unicode(soup))
     f.close()
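The hand-rolled abs2rel above covers what os.path.relpath (added in Python 2.6) provides; a rough sketch of the same src/href rewrite on a newer interpreter, with made-up paths:

    # Rough equivalent of the abs2rel conversion, assuming os.path.relpath is available.
    import os
    selfdir = '/tmp/feed_0/article_3'
    path = '/tmp/feed_0/article_3/images/pic.jpg'
    print os.path.relpath(path, selfdir).replace(os.sep, '/')   # -> images/pic.jpg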
@@ -92,8 +116,8 @@ class RecursiveFetcher(object):
         self.download_stylesheets = not options.no_stylesheets
         self.show_progress = True
         self.failed_links = []
-        self.job_info = job_info
 
+        self.job_info = job_info
 
     def get_soup(self, src):
         nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
         nmassage.extend(self.preprocess_regexps)
@@ -180,8 +204,7 @@ class RecursiveFetcher(object):
         diskpath = os.path.join(self.current_dir, 'stylesheets')
         if not os.path.exists(diskpath):
             os.mkdir(diskpath)
-        c = 0
-        for tag in soup.findAll(lambda tag: tag.name.lower()in ['link', 'style'] and tag.has_key('type') and tag['type'].lower() == 'text/css'):
+        for c, tag in enumerate(soup.findAll(lambda tag: tag.name.lower()in ['link', 'style'] and tag.has_key('type') and tag['type'].lower() == 'text/css')):
             if tag.has_key('href'):
                 iurl = tag['href']
                 if not urlparse.urlsplit(iurl).scheme:
@@ -196,7 +219,6 @@ class RecursiveFetcher(object):
                     self.logger.warning('Could not fetch stylesheet %s', iurl)
                     self.logger.debug('Error: %s', str(err), exc_info=True)
                     continue
-                c += 1
                 stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
                 with self.stylemap_lock:
                     self.stylemap[iurl] = stylepath
@@ -301,6 +323,7 @@ class RecursiveFetcher(object):
         try:
             self.current_dir = diskpath
             tags = list(soup.findAll('a', href=True))
+
             for c, tag in enumerate(tags):
                 if self.show_progress:
                     print '.',
@@ -324,7 +347,7 @@ class RecursiveFetcher(object):
                         f = self.fetch_url(iurl)
                         dsrc = f.read()
                         if len(dsrc) == 0:
-                            raise Exception('No content')
+                            raise ValueError('No content at URL %s'%iurl)
                         if self.encoding is not None:
                             dsrc = dsrc.decode(self.encoding, 'ignore')
                         else:
@@ -347,9 +370,13 @@ class RecursiveFetcher(object):
                             self.logger.debug('Recursion limit reached. Skipping links in %s', iurl)
 
                         if callable(self.postprocess_html_ext):
-                            soup = self.postprocess_html_ext(soup, c == len(tags)-1, self.job_info)
-                        save_soup(soup, res)
+                            soup = self.postprocess_html_ext(soup,
+                                    c==0 and recursion_level==0 and not getattr(self, 'called_first', False),
+                                    self.job_info)
+                            if c==0 and recursion_level == 0:
+                                self.called_first = True
+
+                        save_soup(soup, res)
                         self.localize_link(tag, 'href', res)
                     except Exception, err:
                         self.failed_links.append((iurl, traceback.format_exc()))