diff --git a/src/libprs500/ebooks/metadata/opf.xml b/src/libprs500/ebooks/metadata/opf.xml
index a847bae2c8..822a5dae76 100644
--- a/src/libprs500/ebooks/metadata/opf.xml
+++ b/src/libprs500/ebooks/metadata/opf.xml
@@ -31,6 +31,8 @@
-
+
+
+
diff --git a/src/libprs500/web/feeds/main.py b/src/libprs500/web/feeds/main.py
index 9c34614e28..11d60b6a49 100644
--- a/src/libprs500/web/feeds/main.py
+++ b/src/libprs500/web/feeds/main.py
@@ -68,7 +68,10 @@ If you specify this option, any argument to %prog is ignored and a default recip
return p
def simple_progress_bar(percent, msg):
- print '%d%%'%(percent*100),
+ if not msg:
+ print '%d%%'%(percent*100),
+ else:
+ print '%d%%'%(percent*100), msg
sys.stdout.flush()
def no_progress_bar(percent, msg):
diff --git a/src/libprs500/web/feeds/news.py b/src/libprs500/web/feeds/news.py
index 4effa12f6e..e4ffb2aadf 100644
--- a/src/libprs500/web/feeds/news.py
+++ b/src/libprs500/web/feeds/news.py
@@ -17,7 +17,7 @@
The backend to parse feeds and create HTML that can then be converted
to an ebook.
'''
-import logging, os, cStringIO, time, traceback
+import logging, os, cStringIO, time, traceback, re
import urlparse
from libprs500 import browser, __appname__
@@ -329,17 +329,21 @@ class BasicNewsRecipe(object):
self.partial_failures = []
- def _postprocess_html(self, soup, last_fetch, article_url):
+ def _postprocess_html(self, soup, first_fetch, job_info):
if self.extra_css is not None:
head = soup.find('head')
if head:
style = BeautifulSoup(u'<style type="text/css">%s</style>'%self.extra_css).find('style')
head.insert(len(head.contents), style)
- if last_fetch:
+ if first_fetch:
+ url, f, a, feed_len = job_info
body = soup.find('body')
- if body:
- div = BeautifulSoup('<div>This article was downloaded by <b>%s</b> from <a href="%s">%s</a></div>'%(__appname__, article_url, article_url)).find('div')
- body.insert(len(body.contents), div)
+ if body is not None:
+ templ = self.navbar.generate(False, f, a, feed_len,
+                              not self.has_single_feed,
+                              url, __appname__)
+ elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
+ body.insert(0, elem)
return self.postprocess_html(soup)
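
For clarity, the new first_fetch branch can be read as the standalone helper below: unpack the job info tuple, render the navigation bar for the top of the page, and prepend the resulting div to the body. This is a hedged sketch that mirrors the hunk above; the helper name insert_top_navbar is made up, and navbar, has_single_feed and appname are passed in explicitly instead of being read off the recipe instance.

    # Sketch only: mirrors the first_fetch branch of _postprocess_html above.
    # Assumes the BeautifulSoup 3.x API bundled with libprs500.
    from libprs500.ebooks.BeautifulSoup import BeautifulSoup

    def insert_top_navbar(soup, navbar, job_info, has_single_feed, appname):
        url, f, a, feed_len = job_info   # article URL, feed index, article index, articles in feed
        body = soup.find('body')
        if body is not None:
            templ = navbar.generate(False, f, a, feed_len,   # bottom=False: top-of-page navbar
                                    not has_single_feed, url, appname)
            elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
            body.insert(0, elem)         # prepend so the navbar appears before the article content
        return soup
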
@@ -410,8 +414,8 @@ class BasicNewsRecipe(object):
logger.addHandler(handler)
return logger, out
- def fetch_article(self, url, dir, logger):
- fetcher = RecursiveFetcher(self.web2disk_options, logger, self.image_map, self.css_map, url)
+ def fetch_article(self, url, dir, logger, f, a, num_of_feeds):
+ fetcher = RecursiveFetcher(self.web2disk_options, logger, self.image_map, self.css_map, (url, f, a, num_of_feeds))
fetcher.base_dir = dir
fetcher.current_dir = dir
fetcher.show_progress = False
@@ -455,7 +459,7 @@ class BasicNewsRecipe(object):
url = self.print_version(article.url)
except NotImplementedError:
url = article.url
- req = WorkRequest(self.fetch_article, (url, art_dir, logger),
+ req = WorkRequest(self.fetch_article, (url, art_dir, logger, f, a, len(feed)),
{}, (f, a), self.article_downloaded,
self.error_in_article_download)
req.stream = stream
@@ -534,16 +538,29 @@ class BasicNewsRecipe(object):
adir = 'feed_%d/article_%d/'%(num, j)
entries.append('%sindex.html'%adir)
parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'))
+ last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
for sp in a.sub_pages:
prefix = os.path.commonprefix([opf_path, sp])
relp = sp[len(prefix):]
entries.append(relp.replace(os.sep, '/'))
+ last = sp
+
+ src = open(last, 'rb').read()
+ soup = BeautifulSoup(src)
+ body = soup.find('body')
+ if body is not None:
+ prefix = '/'.join('..' for i in range(2*len(re.findall(r'link\d+', last))))
+ templ = self.navbar.generate(True, num, j, len(f),
+                              not self.has_single_feed,
+                              a.orig_url, __appname__, prefix=prefix)
+ elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
+ body.insert(len(body.contents), elem)
+ open(last, 'wb').write(unicode(soup).encode('utf-8'))
if len(feeds) > 1:
for i, f in enumerate(feeds):
entries.append('feed_%d/index.html'%i)
- feed = toc.add_item('feed_%d/index.html'%i, None, f.title)
- feed_index(i, feed)
+ feed_index(i, toc.add_item('feed_%d/index.html'%i, None, f.title))
else:
entries.append('feed_%d/index.html'%0)
feed_index(0, toc)
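
The prefix computed above compensates for how deep the last sub-page sits below the article directory: each linkNN component in its saved path is assumed to contribute two directory levels, so the bottom navbar's links are rewritten relative to the article directory. A small illustration of that calculation, using hypothetical paths:

    # Hedged illustration of the '..' prefix built in the hunk above.
    import re

    def navbar_prefix(saved_path):
        depth = 2 * len(re.findall(r'link\d+', saved_path))
        return '/'.join('..' for i in range(depth))

    print navbar_prefix('feed_0/article_3/index.html')               # '' (main page, no prefix)
    print navbar_prefix('feed_0/article_3/link2/link0/index.xhtml')  # '../../../..'
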
@@ -556,19 +573,13 @@ class BasicNewsRecipe(object):
def article_downloaded(self, request, result):
index = os.path.join(os.path.dirname(result[0]), 'index.html')
- os.rename(result[0], index)
- src = open(index, 'rb').read().decode('utf-8')
- f, a = request.requestID
- soup = BeautifulSoup(src)
- body = soup.find('body')
- if body is not None:
- top = self.navbar.generate(False, f, a, len(request.feed), not self.has_single_feed).render(doctype='xhtml')
- top = BeautifulSoup(top).find('div')
- body.insert(0, top)
- open(index, 'wb').write(unicode(soup).encode('utf-8'))
+ if index != result[0]:
+ os.rename(result[0], index)
+ a = request.requestID[1]
article = request.article
self.logger.debug(_('\nDownloaded article %s from %s\n%s')%(article.title, article.url, request.stream.getvalue().decode('utf-8', 'ignore')))
+ article.orig_url = article.url
article.url = 'article_%d/index.html'%a
article.downloaded = True
article.sub_pages = result[1][1:]
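
The slimmed-down callback now only renames the downloaded file and records bookkeeping on the article; the navbar work has moved to _postprocess_html and the TOC pass above. A hedged sketch of that bookkeeping with a stand-in article object (record_download and FakeArticle are illustrative names, not part of the module):

    # Sketch of the bookkeeping done in article_downloaded, using stand-in objects.
    class FakeArticle(object):
        def __init__(self, url):
            self.url = url

    def record_download(article, request_id, fetched_files):
        a = request_id[1]                      # article index within its feed
        article.orig_url = article.url         # keep the original URL for the bottom navbar
        article.url = 'article_%d/index.html' % a
        article.downloaded = True
        article.sub_pages = fetched_files[1:]  # everything fetched after the main page

    art = FakeArticle('http://example.com/story')
    record_download(art, (0, 3), ['feed_0/article_3/index.html'])
    print art.url, art.orig_url                # article_3/index.html http://example.com/story
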
diff --git a/src/libprs500/web/feeds/recipes/newsweek.py b/src/libprs500/web/feeds/recipes/newsweek.py
index 8772e79325..db4d998ab5 100644
--- a/src/libprs500/web/feeds/recipes/newsweek.py
+++ b/src/libprs500/web/feeds/recipes/newsweek.py
@@ -20,8 +20,10 @@ from libprs500.ebooks.BeautifulSoup import BeautifulSoup
class Newsweek(BasicNewsRecipe):
- title = 'Newsweek'
- __author__ = 'Kovid Goyal'
+ title = 'Newsweek'
+ __author__ = 'Kovid Goyal'
+ no_stylesheets = True
+ oldest_article = 11
feeds = [
('Top News', 'http://feeds.newsweek.com/newsweek/TopNews',),
diff --git a/src/libprs500/web/feeds/templates.py b/src/libprs500/web/feeds/templates.py
index 03432ec151..81e24db951 100644
--- a/src/libprs500/web/feeds/templates.py
+++ b/src/libprs500/web/feeds/templates.py
@@ -55,17 +55,21 @@ class NavBarTemplate(Template):
>
-
+
-
- | Next
+
+ This article was downloaded by ${__appname__} from ${url}
+
+
+
+ | Next
- | Up one level
+ | Up one level
- | Up two levels
+ | Up two levels
-
- | Previous
+
+ | Previous
|
@@ -74,8 +78,12 @@ class NavBarTemplate(Template):
''')
- def generate(self, bottom, feed, art, number_of_articles_in_feed, two_levels):
- return Template.generate(self, bottom=bottom, art=art, num=number_of_articles_in_feed, two_levels=two_levels)
+ def generate(self, bottom, feed, art, number_of_articles_in_feed,
+              two_levels, url, __appname__, prefix=''):
+ return Template.generate(self, bottom=bottom, art=art, feed=feed,
+                          num=number_of_articles_in_feed,
+                          two_levels=two_levels, url=url,
+                          __appname__=__appname__, prefix=prefix)
class IndexTemplate(Template):
@@ -145,7 +153,7 @@ class FeedTemplate(Template):
- ${feed.description}
+ ${feed.description}
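
The navbar markup itself did not survive in the hunk above, but the generate()/render() calls follow the usual Genshi pattern: keyword arguments passed to generate() are substituted into the markup and the resulting stream is serialized. A minimal, purely illustrative Genshi sketch follows; the markup is invented for this example and is not the real NavBarTemplate source:

    # Minimal Genshi sketch of the generate()/render() pattern used by NavBarTemplate.
    # Assumes the genshi package is installed; the markup below is illustrative only.
    from genshi.template import MarkupTemplate

    markup = '''<div xmlns:py="http://genshi.edgewall.org/">
    This article was downloaded by <b>${__appname__}</b> from <a href="${url}">${url}</a>
    <a href="${prefix}/../article_${art+1}/index.html" py:if="art != num - 1">| Next</a>
    </div>'''

    templ = MarkupTemplate(markup)
    print templ.generate(__appname__='libprs500', url='http://example.com/story',
                         prefix='..', art=0, num=5).render('xhtml')
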
diff --git a/src/libprs500/web/fetch/simple.py b/src/libprs500/web/fetch/simple.py
index 3a2ab9ec19..9fe122219f 100644
--- a/src/libprs500/web/fetch/simple.py
+++ b/src/libprs500/web/fetch/simple.py
@@ -43,6 +43,30 @@ def save_soup(soup, target):
for meta in metas:
if 'charset' in meta['content']:
meta.replaceWith(nm)
+
+ selfdir = os.path.dirname(target)
+ def abs2rel(path, base):
+ prefix = os.path.commonprefix([path, base])
+ if not os.path.exists(prefix) or not os.path.isdir(prefix):
+ prefix = os.path.dirname(prefix)
+ prefix = os.path.normpath(prefix)
+ if prefix.startswith(selfdir): # path is in a subdirectory
+ return path[len(prefix)+1:]
+ from_prefix = path[len(prefix)+1:]
+ left = base
+ ups = []
+ while left != prefix:
+ left = os.path.split(left)[0]
+ ups.append('..')
+ ups.append(from_prefix)
+ return os.path.join(*ups)
+
+ for tag in soup.findAll(['img', 'link', 'a']):
+ for key in ('src', 'href'):
+ path = tag.get(key, None)
+ if path and os.path.exists(path) and os.path.isabs(path):
+ tag[key] = abs2rel(path, selfdir).replace(os.sep, '/')
+
f = codecs.open(target, 'w', 'utf-8')
f.write(unicode(soup))
f.close()
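
The loop above rewrites absolute src/href values so the saved HTML keeps working if the download directory is later moved: abs2rel turns each on-disk path into one relative to the directory containing the file being written. On Python 2.6+ the same intent can be expressed with os.path.relpath; the sketch below is only an illustration of that intent, not the function above (which avoids the relpath dependency):

    # Hedged sketch of what abs2rel achieves, expressed with os.path.relpath (Python 2.6+).
    import os

    def make_relative(path, selfdir):
        rel = os.path.relpath(path, selfdir)
        return rel.replace(os.sep, '/')     # saved HTML always uses forward slashes

    selfdir = os.path.join('downloads', 'feed_0', 'article_2')
    print make_relative(os.path.join('downloads', 'feed_0', 'article_2', 'images', '1.jpg'), selfdir)
    # -> images/1.jpg (resource inside the article directory)
    print make_relative(os.path.join('downloads', 'feed_0', 'stylesheets', 'style3.css'), selfdir)
    # -> ../stylesheets/style3.css (resource shared at the feed level)
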
@@ -92,8 +116,8 @@ class RecursiveFetcher(object):
self.download_stylesheets = not options.no_stylesheets
self.show_progress = True
self.failed_links = []
- self.job_info = job_info
-
+ self.job_info = job_info
+
def get_soup(self, src):
nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
nmassage.extend(self.preprocess_regexps)
@@ -180,8 +204,7 @@ class RecursiveFetcher(object):
diskpath = os.path.join(self.current_dir, 'stylesheets')
if not os.path.exists(diskpath):
os.mkdir(diskpath)
- c = 0
- for tag in soup.findAll(lambda tag: tag.name.lower()in ['link', 'style'] and tag.has_key('type') and tag['type'].lower() == 'text/css'):
+ for c, tag in enumerate(soup.findAll(lambda tag: tag.name.lower()in ['link', 'style'] and tag.has_key('type') and tag['type'].lower() == 'text/css')):
if tag.has_key('href'):
iurl = tag['href']
if not urlparse.urlsplit(iurl).scheme:
@@ -196,7 +219,6 @@ class RecursiveFetcher(object):
self.logger.warning('Could not fetch stylesheet %s', iurl)
self.logger.debug('Error: %s', str(err), exc_info=True)
continue
- c += 1
stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
with self.stylemap_lock:
self.stylemap[iurl] = stylepath
@@ -301,6 +323,7 @@ class RecursiveFetcher(object):
try:
self.current_dir = diskpath
tags = list(soup.findAll('a', href=True))
+
for c, tag in enumerate(tags):
if self.show_progress:
print '.',
@@ -324,7 +347,7 @@ class RecursiveFetcher(object):
f = self.fetch_url(iurl)
dsrc = f.read()
if len(dsrc) == 0:
- raise Exception('No content')
+ raise ValueError('No content at URL %s'%iurl)
if self.encoding is not None:
dsrc = dsrc.decode(self.encoding, 'ignore')
else:
@@ -347,9 +370,13 @@ class RecursiveFetcher(object):
self.logger.debug('Recursion limit reached. Skipping links in %s', iurl)
if callable(self.postprocess_html_ext):
- soup = self.postprocess_html_ext(soup, c == len(tags)-1, self.job_info)
- save_soup(soup, res)
+ soup = self.postprocess_html_ext(soup,
+ c==0 and recursion_level==0 and not getattr(self, 'called_first', False),
+ self.job_info)
+ if c==0 and recursion_level == 0:
+ self.called_first = True
+ save_soup(soup, res)
self.localize_link(tag, 'href', res)
except Exception, err:
self.failed_links.append((iurl, traceback.format_exc()))
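
The replacement condition arranges for postprocess_html_ext to receive a True first-fetch flag exactly once, for the first link handled at recursion depth zero; the called_first attribute stops later depth-zero links from qualifying. A hedged sketch of that one-shot flag in isolation (FirstFetchTracker is an illustrative name, not part of the fetcher):

    # Sketch of the one-shot "first fetch" detection added above.
    class FirstFetchTracker(object):
        def __init__(self):
            self.called_first = False

        def is_first(self, c, recursion_level):
            first = c == 0 and recursion_level == 0 and not self.called_first
            if c == 0 and recursion_level == 0:
                self.called_first = True
            return first

    t = FirstFetchTracker()
    print t.is_first(0, 0)   # True  -> the top navbar gets inserted
    print t.is_first(1, 0)   # False -> later links at depth zero no longer qualify
    print t.is_first(0, 1)   # False -> deeper recursion levels never count as first
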