Mirror of https://github.com/kovidgoyal/calibre.git

Make web2disk use relative paths. Add navbar at bottom of articles as well.

parent 1a1fd62a2c
commit e537e6a12d
@@ -31,6 +31,8 @@
     </manifest>
 
     <spine py:with="attrs={'toc':'ncx' if mi.toc else None}" py:attrs="attrs">
-        <itemref py:for="idref in mi.spine" idref="${str(idref)}" />
+        <py:for each="idref in mi.spine">
+            <itemref idref="${str(idref)}" />
+        </py:for>
     </spine>
 </package>
@@ -68,7 +68,10 @@ If you specify this option, any argument to %prog is ignored and a default recipe
     return p
 
 def simple_progress_bar(percent, msg):
-    print '%d%%'%(percent*100),
+    if not msg:
+        print '%d%%'%(percent*100),
+    else:
+        print '%d%%'%(percent*100), msg
     sys.stdout.flush()
 
 def no_progress_bar(percent, msg):
@@ -17,7 +17,7 @@
 The backend to parse feeds and create HTML that can then be converted
 to an ebook.
 '''
-import logging, os, cStringIO, time, traceback
+import logging, os, cStringIO, time, traceback, re
 import urlparse
 
 from libprs500 import browser, __appname__
@@ -329,17 +329,21 @@ class BasicNewsRecipe(object):
         self.partial_failures = []
 
 
-    def _postprocess_html(self, soup, last_fetch, article_url):
+    def _postprocess_html(self, soup, first_fetch, job_info):
         if self.extra_css is not None:
             head = soup.find('head')
             if head:
                 style = BeautifulSoup(u'<style type="text/css">%s</style>'%self.extra_css).find('style')
                 head.insert(len(head.contents), style)
-        if last_fetch:
+        if first_fetch:
+            url, f, a, feed_len = job_info
             body = soup.find('body')
-            if body:
-                div = BeautifulSoup('<div style="font:8pt monospace"><hr />This article was downloaded by <b>%s</b> from <a href="%s">%s</a></div>'%(__appname__, article_url, article_url)).find('div')
-                body.insert(len(body.contents), div)
+            if body is not None:
+                templ = self.navbar.generate(False, f, a, feed_len,
+                                             not self.has_single_feed,
+                                             url, __appname__)
+                elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
+                body.insert(0, elem)
 
         return self.postprocess_html(soup)
 
@@ -410,8 +414,8 @@ class BasicNewsRecipe(object):
         logger.addHandler(handler)
         return logger, out
 
-    def fetch_article(self, url, dir, logger):
-        fetcher = RecursiveFetcher(self.web2disk_options, logger, self.image_map, self.css_map, url)
+    def fetch_article(self, url, dir, logger, f, a, num_of_feeds):
+        fetcher = RecursiveFetcher(self.web2disk_options, logger, self.image_map, self.css_map, (url, f, a, num_of_feeds))
         fetcher.base_dir = dir
         fetcher.current_dir = dir
         fetcher.show_progress = False
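
Note: fetch_article() now receives the feed index, the article index and the feed length, and packs them together with the url into the tuple that RecursiveFetcher stores as job_info; the fetcher later hands that tuple back to _postprocess_html() (the -329 hunk above) along with the first_fetch flag. A minimal sketch of that flow, with made-up stand-in names and values, not the committed code:

def postprocess_html(soup, first_fetch, job_info):
    url, feed_idx, art_idx, feed_len = job_info
    if first_fetch:
        # this is the point where the real code generates and inserts the top navbar
        print('top navbar: article %d of %d in feed %d, source %s'
              % (art_idx + 1, feed_len, feed_idx, url))
    return soup

class FakeFetcher(object):
    def __init__(self, postprocess, job_info):
        self.postprocess_html_ext = postprocess
        self.job_info = job_info

    def run(self):
        soup = '<html/>'  # stands in for the parsed BeautifulSoup tree
        return self.postprocess_html_ext(soup, True, self.job_info)

FakeFetcher(postprocess_html, ('http://example.com/story', 0, 3, 12)).run()
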
@@ -455,7 +459,7 @@
                 url = self.print_version(article.url)
             except NotImplementedError:
                 url = article.url
-            req = WorkRequest(self.fetch_article, (url, art_dir, logger),
+            req = WorkRequest(self.fetch_article, (url, art_dir, logger, f, a, len(feed)),
                               {}, (f, a), self.article_downloaded,
                               self.error_in_article_download)
             req.stream = stream
@@ -534,16 +538,29 @@ class BasicNewsRecipe(object):
                     adir = 'feed_%d/article_%d/'%(num, j)
                     entries.append('%sindex.html'%adir)
                     parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'))
+                    last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
                     for sp in a.sub_pages:
                         prefix = os.path.commonprefix([opf_path, sp])
                         relp = sp[len(prefix):]
                         entries.append(relp.replace(os.sep, '/'))
+                        last = sp
+
+                    src = open(last, 'rb').read()
+                    soup = BeautifulSoup(src)
+                    body = soup.find('body')
+                    if body is not None:
+                        prefix = '/'.join('..'for i in range(2*len(re.findall(r'link\d+', last))))
+                        templ = self.navbar.generate(True, num, j, len(f),
+                                                     not self.has_single_feed,
+                                                     a.orig_url, __appname__, prefix=prefix)
+                        elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
+                        body.insert(len(body.contents), elem)
+                        open(last, 'wb').write(unicode(soup).encode('utf-8'))
 
         if len(feeds) > 1:
             for i, f in enumerate(feeds):
                 entries.append('feed_%d/index.html'%i)
-                feed = toc.add_item('feed_%d/index.html'%i, None, f.title)
-                feed_index(i, feed)
+                feed_index(i, toc.add_item('feed_%d/index.html'%i, None, f.title))
         else:
             entries.append('feed_%d/index.html'%0)
             feed_index(0, toc)
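
Note: the prefix above is a chain of '..' derived from the linkN components in the path of the last saved page of the article; the bottom navbar prepends it so its relative links still resolve from inside the fetcher's nested sub-page directories. Illustrative only, with a hypothetical path:

import re

last = '/tmp/out/feed_0/article_3/link1/link2/index.xhtml'
prefix = '/'.join('..' for i in range(2 * len(re.findall(r'link\d+', last))))
print(prefix)  # -> ../../../..
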
@@ -556,19 +573,13 @@ class BasicNewsRecipe(object):
 
     def article_downloaded(self, request, result):
         index = os.path.join(os.path.dirname(result[0]), 'index.html')
-        os.rename(result[0], index)
-        src = open(index, 'rb').read().decode('utf-8')
-        f, a = request.requestID
-        soup = BeautifulSoup(src)
-        body = soup.find('body')
-        if body is not None:
-            top = self.navbar.generate(False, f, a, len(request.feed), not self.has_single_feed).render(doctype='xhtml')
-            top = BeautifulSoup(top).find('div')
-            body.insert(0, top)
-        open(index, 'wb').write(unicode(soup).encode('utf-8'))
+        if index != result[0]:
+            os.rename(result[0], index)
+        a = request.requestID[1]
 
         article = request.article
         self.logger.debug(_('\nDownloaded article %s from %s\n%s')%(article.title, article.url, request.stream.getvalue().decode('utf-8', 'ignore')))
+        article.orig_url = article.url
        article.url = 'article_%d/index.html'%a
         article.downloaded = True
         article.sub_pages = result[1][1:]
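
Note: article.url is rewritten here to the local path of the downloaded copy, so the remote address is now preserved first as article.orig_url; the index-building code above passes a.orig_url to the bottom navbar so it can still cite the source. A trivial illustration with made-up values:

article_url = 'http://example.com/2007/story.html'
orig_url = article_url                         # new: remember the remote address
article_url = 'article_%d/index.html' % 3      # local path used inside the book
print(orig_url + ' -> ' + article_url)
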
@@ -22,6 +22,8 @@ class Newsweek(BasicNewsRecipe):
 
     title = 'Newsweek'
     __author__ = 'Kovid Goyal'
+    no_stylesheets = True
+    oldest_article = 11
 
     feeds = [
              ('Top News', 'http://feeds.newsweek.com/newsweek/TopNews',),
@@ -55,17 +55,21 @@ class NavBarTemplate(Template):
 
     >
     <body>
-        <div class="navbar" style="text-align:center">
+        <div class="navbar" style="text-align:center; font-family:monospace; font-size:10pt">
            <hr py:if="bottom" />
-           <py:if test="art != num - 1">
-               | <a href="../article_${str(art+1)}/index.html">Next</a>
+           <p py:if="bottom" style="font-size:8pt; text-align:left">
+               This article was downloaded by <b>${__appname__}</b> from <a href="${url}">${url}</a>
+           </p>
+           <br py:if="bottom" /><br py:if="bottom" />
+           <py:if test="art != num - 1 and not bottom">
+               | <a href="${prefix}/../article_${str(art+1)}/index.html">Next</a>
            </py:if>
-           | <a href="../index.html#article_${str(art)}">Up one level</a>
+           | <a href="${prefix}/../index.html#article_${str(art)}">Up one level</a>
            <py:if test="two_levels">
-               | <a href="../../index.html#_${str(feed)}">Up two levels</a>
+               | <a href="${prefix}/../../index.html#feed_${str(feed)}">Up two levels</a>
            </py:if>
-           <py:if test="art != 0">
-               | <a href="../article_${str(art-1)}/index.html">Previous</a>
+           <py:if test="art != 0 and not bottom">
+               | <a href="${prefix}/../article_${str(art-1)}/index.html">Previous</a>
            </py:if>
 
            <hr py:if="not bottom" />
@@ -74,8 +78,12 @@ class NavBarTemplate(Template):
     </html>
     ''')
 
-    def generate(self, bottom, feed, art, number_of_articles_in_feed, two_levels):
-        return Template.generate(self, bottom=bottom, art=art, num=number_of_articles_in_feed, two_levels=two_levels)
+    def generate(self, bottom, feed, art, number_of_articles_in_feed,
+                 two_levels, url, __appname__, prefix=''):
+        return Template.generate(self, bottom=bottom, art=art, feed=feed,
+                                 num=number_of_articles_in_feed,
+                                 two_levels=two_levels, url=url,
+                                 __appname__=__appname__, prefix=prefix)
 
 
 class IndexTemplate(Template):
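
Note: the two call sites in the recipe code line up with the new signature as follows; a sketch with made-up values, not the committed code:

def generate(bottom, feed, art, number_of_articles_in_feed,
             two_levels, url, __appname__, prefix=''):
    # mirrors the keyword mapping NavBarTemplate.generate performs above
    return dict(bottom=bottom, feed=feed, art=art,
                num=number_of_articles_in_feed, two_levels=two_levels,
                url=url, __appname__=__appname__, prefix=prefix)

# top navbar, inserted by _postprocess_html on the first fetch (prefix stays '')
print(generate(False, 0, 3, 12, True, 'http://example.com/story', 'libprs500'))
# bottom navbar, appended by the index builder to the article's last page
print(generate(True, 0, 3, 12, True, 'http://example.com/story', 'libprs500',
               prefix='../..'))
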
@@ -145,7 +153,7 @@ class FeedTemplate(Template):
                 </div>
             </py:if>
             <div py:if="feed.description">
-                ${feed.description}
+                ${feed.description}<br />
             </div>
             <ul>
                 <py:for each="i, article in enumerate(feed.articles)">
@@ -43,6 +43,30 @@ def save_soup(soup, target):
     for meta in metas:
         if 'charset' in meta['content']:
             meta.replaceWith(nm)
+
+    selfdir = os.path.dirname(target)
+    def abs2rel(path, base):
+        prefix = os.path.commonprefix([path, base])
+        if not os.path.exists(prefix) or not os.path.isdir(prefix):
+            prefix = os.path.dirname(prefix)
+        prefix = os.path.normpath(prefix)
+        if prefix.startswith(selfdir): # path is in a subdirectory
+            return path[len(prefix)+1:]
+        from_prefix = path[len(prefix)+1:]
+        left = base
+        ups = []
+        while left != prefix:
+            left = os.path.split(left)[0]
+            ups.append('..')
+        ups.append(from_prefix)
+        return os.path.join(*ups)
+
+    for tag in soup.findAll(['img', 'link', 'a']):
+        for key in ('src', 'href'):
+            path = tag.get(key, None)
+            if path and os.path.exists(path) and os.path.isabs(path):
+                tag[key] = abs2rel(path, selfdir).replace(os.sep, '/')
+
     f = codecs.open(target, 'w', 'utf-8')
     f.write(unicode(soup))
     f.close()
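
Note: abs2rel rewrites absolute local paths found in img/link/a attributes so they are relative to the directory the HTML file is being saved into, which keeps the downloaded tree working when it is moved or packaged. A minimal sketch of the same idea using os.path.relpath, with hypothetical paths, not the committed helper:

import os

def to_relative(path, selfdir):
    # equivalent in spirit to abs2rel(path, selfdir) above
    return os.path.relpath(path, selfdir).replace(os.sep, '/')

selfdir = '/tmp/out/feed_0/article_3'
print(to_relative('/tmp/out/feed_0/article_3/images/1.jpg', selfdir))   # images/1.jpg
print(to_relative('/tmp/out/feed_0/stylesheets/style2.css', selfdir))   # ../stylesheets/style2.css
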
@@ -180,8 +204,7 @@ class RecursiveFetcher(object):
         diskpath = os.path.join(self.current_dir, 'stylesheets')
         if not os.path.exists(diskpath):
             os.mkdir(diskpath)
-        c = 0
-        for tag in soup.findAll(lambda tag: tag.name.lower()in ['link', 'style'] and tag.has_key('type') and tag['type'].lower() == 'text/css'):
+        for c, tag in enumerate(soup.findAll(lambda tag: tag.name.lower()in ['link', 'style'] and tag.has_key('type') and tag['type'].lower() == 'text/css')):
             if tag.has_key('href'):
                 iurl = tag['href']
                 if not urlparse.urlsplit(iurl).scheme:
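
Note: the manual counter removed here (together with the c += 1 dropped in the next hunk) advanced only after a stylesheet was fetched successfully, whereas enumerate() numbers every matching tag, so the generated styleN.css names now follow the tag index even when a fetch is skipped. Illustrative only:

items = ['ok', 'broken', 'ok']

old_names = []
c = 0
for item in items:
    if item == 'broken':
        continue
    c += 1
    old_names.append('style%d.css' % c)

new_names = []
for c, item in enumerate(items):
    if item == 'broken':
        continue
    new_names.append('style%d.css' % c)

print(old_names)  # ['style1.css', 'style2.css']
print(new_names)  # ['style0.css', 'style2.css']
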
@@ -196,7 +219,6 @@ class RecursiveFetcher(object):
                     self.logger.warning('Could not fetch stylesheet %s', iurl)
                     self.logger.debug('Error: %s', str(err), exc_info=True)
                     continue
-                c += 1
                 stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
                 with self.stylemap_lock:
                     self.stylemap[iurl] = stylepath
@@ -301,6 +323,7 @@ class RecursiveFetcher(object):
         try:
             self.current_dir = diskpath
             tags = list(soup.findAll('a', href=True))
+
             for c, tag in enumerate(tags):
                 if self.show_progress:
                     print '.',
@@ -324,7 +347,7 @@ class RecursiveFetcher(object):
                         f = self.fetch_url(iurl)
                         dsrc = f.read()
                         if len(dsrc) == 0:
-                            raise Exception('No content')
+                            raise ValueError('No content at URL %s'%iurl)
                         if self.encoding is not None:
                             dsrc = dsrc.decode(self.encoding, 'ignore')
                         else:
@@ -347,9 +370,13 @@ class RecursiveFetcher(object):
                             self.logger.debug('Recursion limit reached. Skipping links in %s', iurl)
 
                         if callable(self.postprocess_html_ext):
-                            soup = self.postprocess_html_ext(soup, c == len(tags)-1, self.job_info)
-
-                        save_soup(soup, res)
+                            soup = self.postprocess_html_ext(soup,
+                                    c==0 and recursion_level==0 and not getattr(self, 'called_first', False),
+                                    self.job_info)
+                            if c==0 and recursion_level == 0:
+                                self.called_first = True
+
+                        save_soup(soup, res)
                         self.localize_link(tag, 'href', res)
                     except Exception, err:
                         self.failed_links.append((iurl, traceback.format_exc()))
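
Note: the new condition flags the very first page fetched for an article (first link, recursion level zero, called_first not yet set) as first_fetch; _postprocess_html above uses that flag to decide when to insert the top navbar. A simplified sketch of the run-once guard, not the committed code:

class Fetcher(object):
    def is_first_fetch(self, c, recursion_level):
        first = (c == 0 and recursion_level == 0
                 and not getattr(self, 'called_first', False))
        if c == 0 and recursion_level == 0:
            self.called_first = True
        return first

f = Fetcher()
print(f.is_first_fetch(0, 0))  # True: this page gets the navbar
print(f.is_first_fetch(0, 0))  # False: already done
print(f.is_first_fetch(2, 1))  # False: a deeper sub-page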