From 1a1fd62a2c143191c51d77489e6aba1a940e3aa2 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sat, 15 Mar 2008 23:49:12 +0000
Subject: [PATCH] Added article source link to bottom of articles

---
 src/libprs500/web/feeds/news.py   | 15 +++++++++++----
 src/libprs500/web/fetch/simple.py | 16 ++++++++--------
 2 files changed, 19 insertions(+), 12 deletions(-)

diff --git a/src/libprs500/web/feeds/news.py b/src/libprs500/web/feeds/news.py
index a4d94e5427..4effa12f6e 100644
--- a/src/libprs500/web/feeds/news.py
+++ b/src/libprs500/web/feeds/news.py
@@ -318,7 +318,7 @@ class BasicNewsRecipe(object):
         for extra in ('keep_only_tags', 'remove_tags', 'preprocess_regexps',
                       'preprocess_html', 'remove_tags_after', 'remove_tags_before'):
             setattr(self.web2disk_options, extra, getattr(self, extra))
-        self.web2disk_options.postprocess_html = [self._postprocess_html, self.postprocess_html]
+        self.web2disk_options.postprocess_html = self._postprocess_html
 
         if self.delay > 0:
             self.simultaneous_downloads = 1
@@ -329,13 +329,20 @@ class BasicNewsRecipe(object):
 
         self.partial_failures = []
 
-    def _postprocess_html(self, soup):
+    def _postprocess_html(self, soup, last_fetch, article_url):
         if self.extra_css is not None:
             head = soup.find('head')
             if head:
                 style = BeautifulSoup(u'<style type="text/css">%s</style>'%self.extra_css).find('style')
                 head.insert(len(head.contents), style)
-        return soup
+        if last_fetch:
+            body = soup.find('body')
+            if body:
+                div = BeautifulSoup('<div>This article was downloaded by %s from <a href="%s">%s</a></div>'%(__appname__, article_url, article_url)).find('div')
+                body.insert(len(body.contents), div)
+
+        return self.postprocess_html(soup)
+
 
     def download(self):
         '''
@@ -404,7 +411,7 @@ class BasicNewsRecipe(object):
         return logger, out
 
     def fetch_article(self, url, dir, logger):
-        fetcher = RecursiveFetcher(self.web2disk_options, logger, self.image_map, self.css_map)
+        fetcher = RecursiveFetcher(self.web2disk_options, logger, self.image_map, self.css_map, url)
         fetcher.base_dir = dir
         fetcher.current_dir = dir
         fetcher.show_progress = False
diff --git a/src/libprs500/web/fetch/simple.py b/src/libprs500/web/fetch/simple.py
index a16c619190..3a2ab9ec19 100644
--- a/src/libprs500/web/fetch/simple.py
+++ b/src/libprs500/web/fetch/simple.py
@@ -58,7 +58,7 @@ class RecursiveFetcher(object):
     #                )
     CSS_IMPORT_PATTERN = re.compile(r'\@import\s+url\((.*?)\)', re.IGNORECASE)
 
-    def __init__(self, options, logger, image_map={}, css_map={}):
+    def __init__(self, options, logger, image_map={}, css_map={}, job_info=None):
         self.logger = logger
         self.base_dir = os.path.abspath(os.path.expanduser(options.dir))
         if not os.path.exists(self.base_dir):
@@ -88,11 +88,11 @@ class RecursiveFetcher(object):
         self.remove_tags_before = getattr(options, 'remove_tags_before', None)
         self.keep_only_tags = getattr(options, 'keep_only_tags', [])
         self.preprocess_html_ext = getattr(options, 'preprocess_html', lambda soup: soup)
-        self.postprocess_html_ext= getattr(options, 'postprocess_html', [])
+        self.postprocess_html_ext= getattr(options, 'postprocess_html', None)
         self.download_stylesheets = not options.no_stylesheets
         self.show_progress = True
         self.failed_links = []
-
+        self.job_info = job_info
 
     def get_soup(self, src):
         nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
@@ -293,14 +293,15 @@ class RecursiveFetcher(object):
                 self.localize_link(tag, 'href', self.filemap[nurl])
 
     def process_links(self, soup, baseurl, recursion_level, into_dir='links'):
-        c, res = 0, ''
+        res = ''
         diskpath = os.path.join(self.current_dir, into_dir)
         if not os.path.exists(diskpath):
             os.mkdir(diskpath)
         prev_dir = self.current_dir
         try:
             self.current_dir = diskpath
-            for tag in soup.findAll('a', href=True):
+            tags = list(soup.findAll('a', href=True))
+            for c, tag in enumerate(tags):
                 if self.show_progress:
                     print '.',
                     sys.stdout.flush()
@@ -314,7 +315,6 @@ class RecursiveFetcher(object):
                     continue
                 if self.files > self.max_files:
                     return res
-                c += 1
                 linkdir = 'link'+str(c) if into_dir else ''
                 linkdiskpath = os.path.join(diskpath, linkdir)
                 if not os.path.exists(linkdiskpath):
@@ -346,8 +346,8 @@ class RecursiveFetcher(object):
                                 self.process_return_links(soup, iurl)
                             self.logger.debug('Recursion limit reached. Skipping links in %s', iurl)
 
-                        for func in self.postprocess_html_ext:
-                            soup = func(soup)
+                        if callable(self.postprocess_html_ext):
+                            soup = self.postprocess_html_ext(soup, c == len(tags)-1, self.job_info)
                         save_soup(soup, res)
                         self.localize_link(tag, 'href', res)
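
Note on the new hook contract: web2disk_options.postprocess_html changes here from a list of callables to a single callable invoked as postprocess_html(soup, last_fetch, job_info). RecursiveFetcher passes last_fetch=True only for the final link tag it processes (c == len(tags)-1) and forwards job_info, which fetch_article now sets to the article URL, so the source link is appended exactly once per article. Below is a minimal standalone sketch of that footer step, written against the modern bs4 package rather than the BeautifulSoup 3 copy bundled with libprs500; append_source_link, APP_NAME and the sample URL are illustrative stand-ins, not names from the patch.

# Sketch only: mirrors the footer that _postprocess_html appends above.
# Assumes the modern bs4 package, not the bundled BeautifulSoup 3;
# APP_NAME stands in for __appname__ and the URL is a placeholder.
from bs4 import BeautifulSoup

APP_NAME = 'libprs500'

def append_source_link(html, article_url):
    # Parse the article and locate its body; bail out if there is none.
    soup = BeautifulSoup(html, 'html.parser')
    body = soup.find('body')
    if body is None:
        return soup
    # Build the footer div in a throwaway soup, then move it into place.
    footer = BeautifulSoup(
        '<div>This article was downloaded by %s from <a href="%s">%s</a></div>'
        % (APP_NAME, article_url, article_url), 'html.parser').find('div')
    body.append(footer)  # same effect as body.insert(len(body.contents), div)
    return soup

if __name__ == '__main__':
    doc = '<html><head></head><body><p>Article text</p></body></html>'
    print(append_source_link(doc, 'http://example.com/article'))

Building the footer in a scratch soup and moving it into the target document is the same pattern the patch uses; bs4's append() is the one-line equivalent of the body.insert(len(body.contents), div) call in news.py.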