Make web2disk use relative paths. Add navbar at bottom of articles as well.

This commit is contained in:
Kovid Goyal 2008-03-16 10:14:23 +00:00
parent 1a1fd62a2c
commit e537e6a12d
6 changed files with 96 additions and 43 deletions
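
A note on the first half of the commit message: after web2disk saves a page, any src or href attribute that points at an absolute path on disk is rewritten relative to the directory the page was saved in, so the downloaded tree keeps working when moved. A minimal sketch of that idea (soup is assumed to be an already-parsed BeautifulSoup tree; os.path.relpath, added to Python after this code was written, computes what the hand-rolled abs2rel helper in the diff below computes):

    import os

    def make_links_relative(soup, target):
        # Directory the HTML file is written to; local links become relative to it.
        base = os.path.dirname(target)
        for tag in soup.findAll(['img', 'link', 'a']):
            for key in ('src', 'href'):
                path = tag.get(key, None)
                # Only rewrite references that are absolute paths to files on disk.
                if path and os.path.isabs(path) and os.path.exists(path):
                    tag[key] = os.path.relpath(path, base).replace(os.sep, '/')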

View File

@@ -31,6 +31,8 @@
     </manifest>
     <spine py:with="attrs={'toc':'ncx' if mi.toc else None}" py:attrs="attrs">
-        <itemref py:for="idref in mi.spine" idref="${str(idref)}" />
+        <py:for each="idref in mi.spine">
+            <itemref idref="${str(idref)}" />
+        </py:for>
     </spine>
 </package>

View File

@@ -68,7 +68,10 @@ If you specify this option, any argument to %prog is ignored and a default recipe
     return p
 
 def simple_progress_bar(percent, msg):
-    print '%d%%'%(percent*100),
+    if not msg:
+        print '%d%%'%(percent*100),
+    else:
+        print '%d%%'%(percent*100), msg
     sys.stdout.flush()
 
 def no_progress_bar(percent, msg):

View File

@@ -17,7 +17,7 @@
 The backend to parse feeds and create HTML that can then be converted
 to an ebook.
 '''
-import logging, os, cStringIO, time, traceback
+import logging, os, cStringIO, time, traceback, re
 import urlparse
 
 from libprs500 import browser, __appname__
@@ -329,17 +329,21 @@ class BasicNewsRecipe(object):
         self.partial_failures = []
 
-    def _postprocess_html(self, soup, last_fetch, article_url):
+    def _postprocess_html(self, soup, first_fetch, job_info):
         if self.extra_css is not None:
             head = soup.find('head')
             if head:
                 style = BeautifulSoup(u'<style type="text/css">%s</style>'%self.extra_css).find('style')
                 head.insert(len(head.contents), style)
-        if last_fetch:
+        if first_fetch:
+            url, f, a, feed_len = job_info
             body = soup.find('body')
-            if body:
-                div = BeautifulSoup('<div style="font:8pt monospace"><hr />This article was downloaded by <b>%s</b> from <a href="%s">%s</a></div>'%(__appname__, article_url, article_url)).find('div')
-                body.insert(len(body.contents), div)
+            if body is not None:
+                templ = self.navbar.generate(False, f, a, feed_len,
+                                             not self.has_single_feed,
+                                             url, __appname__)
+                elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
+                body.insert(0, elem)
         return self.postprocess_html(soup)
@@ -410,8 +414,8 @@ class BasicNewsRecipe(object):
             logger.addHandler(handler)
         return logger, out
 
-    def fetch_article(self, url, dir, logger):
-        fetcher = RecursiveFetcher(self.web2disk_options, logger, self.image_map, self.css_map, url)
+    def fetch_article(self, url, dir, logger, f, a, num_of_feeds):
+        fetcher = RecursiveFetcher(self.web2disk_options, logger, self.image_map, self.css_map, (url, f, a, num_of_feeds))
         fetcher.base_dir = dir
         fetcher.current_dir = dir
         fetcher.show_progress = False
@@ -455,7 +459,7 @@ class BasicNewsRecipe(object):
                     url = self.print_version(article.url)
                 except NotImplementedError:
                     url = article.url
-                req = WorkRequest(self.fetch_article, (url, art_dir, logger),
+                req = WorkRequest(self.fetch_article, (url, art_dir, logger, f, a, len(feed)),
                                   {}, (f, a), self.article_downloaded,
                                   self.error_in_article_download)
                 req.stream = stream
@@ -534,16 +538,29 @@ class BasicNewsRecipe(object):
                 adir = 'feed_%d/article_%d/'%(num, j)
                 entries.append('%sindex.html'%adir)
                 parent.add_item('%sindex.html'%adir, None, a.title if a.title else _('Untitled Article'))
+                last = os.path.join(self.output_dir, ('%sindex.html'%adir).replace('/', os.sep))
                 for sp in a.sub_pages:
                     prefix = os.path.commonprefix([opf_path, sp])
                     relp = sp[len(prefix):]
                     entries.append(relp.replace(os.sep, '/'))
+                    last = sp
+
+                src = open(last, 'rb').read()
+                soup = BeautifulSoup(src)
+                body = soup.find('body')
+                if body is not None:
+                    prefix = '/'.join('..' for i in range(2*len(re.findall(r'link\d+', last))))
+                    templ = self.navbar.generate(True, num, j, len(f),
+                                                 not self.has_single_feed,
+                                                 a.orig_url, __appname__, prefix=prefix)
+                    elem = BeautifulSoup(templ.render(doctype='xhtml').decode('utf-8')).find('div')
+                    body.insert(len(body.contents), elem)
+                    open(last, 'wb').write(unicode(soup).encode('utf-8'))
 
         if len(feeds) > 1:
             for i, f in enumerate(feeds):
                 entries.append('feed_%d/index.html'%i)
-                feed = toc.add_item('feed_%d/index.html'%i, None, f.title)
-                feed_index(i, feed)
+                feed_index(i, toc.add_item('feed_%d/index.html'%i, None, f.title))
         else:
             entries.append('feed_%d/index.html'%0)
             feed_index(0, toc)
@@ -556,19 +573,13 @@ class BasicNewsRecipe(object):
     def article_downloaded(self, request, result):
         index = os.path.join(os.path.dirname(result[0]), 'index.html')
-        os.rename(result[0], index)
-        src = open(index, 'rb').read().decode('utf-8')
-        f, a = request.requestID
-        soup = BeautifulSoup(src)
-        body = soup.find('body')
-        if body is not None:
-            top = self.navbar.generate(False, f, a, len(request.feed), not self.has_single_feed).render(doctype='xhtml')
-            top = BeautifulSoup(top).find('div')
-            body.insert(0, top)
-        open(index, 'wb').write(unicode(soup).encode('utf-8'))
+        if index != result[0]:
+            os.rename(result[0], index)
+        a = request.requestID[1]
         article = request.article
         self.logger.debug(_('\nDownloaded article %s from %s\n%s')%(article.title, article.url, request.stream.getvalue().decode('utf-8', 'ignore')))
+        article.orig_url = article.url
         article.url = 'article_%d/index.html'%a
         article.downloaded = True
         article.sub_pages = result[1][1:]

View File

@@ -20,8 +20,10 @@ from libprs500.ebooks.BeautifulSoup import BeautifulSoup
 class Newsweek(BasicNewsRecipe):
 
     title = 'Newsweek'
     __author__ = 'Kovid Goyal'
+    no_stylesheets = True
+    oldest_article = 11
 
     feeds = [
              ('Top News', 'http://feeds.newsweek.com/newsweek/TopNews',),

View File

@@ -55,17 +55,21 @@ class NavBarTemplate(Template):
        >
            <body>
-               <div class="navbar" style="text-align:center">
+               <div class="navbar" style="text-align:center; font-family:monospace; font-size:10pt">
                    <hr py:if="bottom" />
-                   <py:if test="art != num - 1">
-                       | <a href="../article_${str(art+1)}/index.html">Next</a>
+                   <p py:if="bottom" style="font-size:8pt; text-align:left">
+                       This article was downloaded by <b>${__appname__}</b> from <a href="${url}">${url}</a>
+                   </p>
+                   <br py:if="bottom" /><br py:if="bottom" />
+                   <py:if test="art != num - 1 and not bottom">
+                       | <a href="${prefix}/../article_${str(art+1)}/index.html">Next</a>
                    </py:if>
-                   | <a href="../index.html#article_${str(art)}">Up one level</a>
+                   | <a href="${prefix}/../index.html#article_${str(art)}">Up one level</a>
                    <py:if test="two_levels">
-                   | <a href="../../index.html#_${str(feed)}">Up two levels</a>
+                   | <a href="${prefix}/../../index.html#feed_${str(feed)}">Up two levels</a>
                    </py:if>
-                   <py:if test="art != 0">
-                       | <a href="../article_${str(art-1)}/index.html">Previous</a>
+                   <py:if test="art != 0 and not bottom">
+                       | <a href="${prefix}/../article_${str(art-1)}/index.html">Previous</a>
                    </py:if>
                    |
                    <hr py:if="not bottom" />
@@ -74,8 +78,12 @@ class NavBarTemplate(Template):
        </html>
        ''')
 
-    def generate(self, bottom, feed, art, number_of_articles_in_feed, two_levels):
-        return Template.generate(self, bottom=bottom, art=art, num=number_of_articles_in_feed, two_levels=two_levels)
+    def generate(self, bottom, feed, art, number_of_articles_in_feed,
+                 two_levels, url, __appname__, prefix=''):
+        return Template.generate(self, bottom=bottom, art=art, feed=feed,
+                                 num=number_of_articles_in_feed,
+                                 two_levels=two_levels, url=url,
+                                 __appname__=__appname__, prefix=prefix)
 
 
 class IndexTemplate(Template):
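
For orientation, the widened generate() signature can be exercised on its own. A rough usage sketch follows; the article and feed numbers and the URL are made up, and the import path is an assumption based on how news.py refers to the template module:

    from libprs500 import __appname__
    from libprs500.web.feeds.templates import NavBarTemplate

    # Bottom-of-article navbar for article index 2 of a 5-article feed in a
    # multi-feed (two_levels=True) download; prefix is left at its default ''.
    navbar = NavBarTemplate()
    templ = navbar.generate(True, 0, 2, 5, True,
                            'http://example.com/story', __appname__)
    print templ.render(doctype='xhtml').decode('utf-8')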
@@ -145,7 +153,7 @@ class FeedTemplate(Template):
                    </div>
                </py:if>
                <div py:if="feed.description">
-                   ${feed.description}
+                   ${feed.description}<br />
                </div>
                <ul>
                    <py:for each="i, article in enumerate(feed.articles)">

View File

@@ -43,6 +43,30 @@ def save_soup(soup, target):
     for meta in metas:
         if 'charset' in meta['content']:
             meta.replaceWith(nm)
+
+    selfdir = os.path.dirname(target)
+
+    def abs2rel(path, base):
+        prefix = os.path.commonprefix([path, base])
+        if not os.path.exists(prefix) or not os.path.isdir(prefix):
+            prefix = os.path.dirname(prefix)
+        prefix = os.path.normpath(prefix)
+        if prefix.startswith(selfdir): # path is in a subdirectory
+            return path[len(prefix)+1:]
+        from_prefix = path[len(prefix)+1:]
+        left = base
+        ups = []
+        while left != prefix:
+            left = os.path.split(left)[0]
+            ups.append('..')
+        ups.append(from_prefix)
+        return os.path.join(*ups)
+
+    for tag in soup.findAll(['img', 'link', 'a']):
+        for key in ('src', 'href'):
+            path = tag.get(key, None)
+            if path and os.path.exists(path) and os.path.isabs(path):
+                tag[key] = abs2rel(path, selfdir).replace(os.sep, '/')
+
     f = codecs.open(target, 'w', 'utf-8')
     f.write(unicode(soup))
     f.close()
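
To make the intent of the new abs2rel helper concrete, here are two hypothetical calls and the values it is designed to return (it consults the filesystem, so the directories are assumed to exist; base is the directory the HTML is being saved into):

    # base = '/out/feed_0/article_3'
    # abs2rel('/out/feed_0/article_3/images/0.jpg', base)  ->  'images/0.jpg'
    # abs2rel('/out/feed_0/stylesheets/style0.css', base)  ->  '../stylesheets/style0.css'
    # Resources below the target directory keep only their tail; anything else
    # is reached by climbing with '..' to the common prefix, then descending.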
@@ -92,8 +116,8 @@ class RecursiveFetcher(object):
         self.download_stylesheets = not options.no_stylesheets
         self.show_progress = True
         self.failed_links = []
         self.job_info = job_info
 
     def get_soup(self, src):
         nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
         nmassage.extend(self.preprocess_regexps)
@@ -180,8 +204,7 @@
         diskpath = os.path.join(self.current_dir, 'stylesheets')
         if not os.path.exists(diskpath):
             os.mkdir(diskpath)
-        c = 0
-        for tag in soup.findAll(lambda tag: tag.name.lower() in ['link', 'style'] and tag.has_key('type') and tag['type'].lower() == 'text/css'):
+        for c, tag in enumerate(soup.findAll(lambda tag: tag.name.lower() in ['link', 'style'] and tag.has_key('type') and tag['type'].lower() == 'text/css')):
            if tag.has_key('href'):
                iurl = tag['href']
                if not urlparse.urlsplit(iurl).scheme:
@@ -196,7 +219,6 @@
                    self.logger.warning('Could not fetch stylesheet %s', iurl)
                    self.logger.debug('Error: %s', str(err), exc_info=True)
                    continue
-               c += 1
                stylepath = os.path.join(diskpath, 'style'+str(c)+'.css')
                with self.stylemap_lock:
                    self.stylemap[iurl] = stylepath
@@ -301,6 +323,7 @@
            try:
                self.current_dir = diskpath
                tags = list(soup.findAll('a', href=True))
+
                for c, tag in enumerate(tags):
                    if self.show_progress:
                        print '.',
@@ -324,7 +347,7 @@
                        f = self.fetch_url(iurl)
                        dsrc = f.read()
                        if len(dsrc) == 0:
-                           raise Exception('No content')
+                           raise ValueError('No content at URL %s'%iurl)
                        if self.encoding is not None:
                            dsrc = dsrc.decode(self.encoding, 'ignore')
                        else:
@@ -347,9 +370,13 @@
                            self.logger.debug('Recursion limit reached. Skipping links in %s', iurl)
                        if callable(self.postprocess_html_ext):
-                           soup = self.postprocess_html_ext(soup, c == len(tags)-1, self.job_info)
-                       save_soup(soup, res)
+                           soup = self.postprocess_html_ext(soup,
+                                   c==0 and recursion_level==0 and not getattr(self, 'called_first', False),
+                                   self.job_info)
+                           if c==0 and recursion_level == 0:
+                               self.called_first = True
+                       save_soup(soup, res)
                        self.localize_link(tag, 'href', res)
                    except Exception, err:
                        self.failed_links.append((iurl, traceback.format_exc()))