Added article source link to bottom of articles

Kovid Goyal, 2008-03-15 23:49:12 +00:00
commit 1a1fd62a2c (parent 8da62e7383)
2 changed files with 19 additions and 12 deletions

@@ -318,7 +318,7 @@ class BasicNewsRecipe(object):
         for extra in ('keep_only_tags', 'remove_tags', 'preprocess_regexps',
                       'preprocess_html', 'remove_tags_after', 'remove_tags_before'):
             setattr(self.web2disk_options, extra, getattr(self, extra))
-        self.web2disk_options.postprocess_html = [self._postprocess_html, self.postprocess_html]
+        self.web2disk_options.postprocess_html = self._postprocess_html
         if self.delay > 0:
             self.simultaneous_downloads = 1
@@ -329,13 +329,20 @@ class BasicNewsRecipe(object):
         self.partial_failures = []
 
-    def _postprocess_html(self, soup):
+    def _postprocess_html(self, soup, last_fetch, article_url):
         if self.extra_css is not None:
             head = soup.find('head')
             if head:
                 style = BeautifulSoup(u'<style type="text/css">%s</style>'%self.extra_css).find('style')
                 head.insert(len(head.contents), style)
-        return soup
+        if last_fetch:
+            body = soup.find('body')
+            if body:
+                div = BeautifulSoup('<div style="font:8pt monospace"><hr />This article was downloaded by <b>%s</b> from <a href="%s">%s</a></div>'%(__appname__, article_url, article_url)).find('div')
+                body.insert(len(body.contents), div)
+        return self.postprocess_html(soup)
 
     def download(self):
         '''
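
The heart of the commit is the new branch in _postprocess_html: when last_fetch is
true, i.e. the soup being processed belongs to the last page fetched for the article,
a small footer crediting the application and linking back to the source URL is
appended to the body. Note also that the method now ends by delegating to the
recipe's overridable postprocess_html hook, which is why web2disk_options above
carries a single bound method instead of a list. A minimal standalone sketch of the
footer insertion, assuming BeautifulSoup 3 (the parser this codebase uses) and
placeholder values for __appname__ and article_url:

    from BeautifulSoup import BeautifulSoup

    __appname__ = 'app'                        # assumption: normally supplied by the package
    article_url = 'http://example.com/story'   # assumption: the article's source URL

    soup = BeautifulSoup('<html><body><p>Article text</p></body></html>')
    div = BeautifulSoup('<div style="font:8pt monospace"><hr />This article was '
            'downloaded by <b>%s</b> from <a href="%s">%s</a></div>'
            % (__appname__, article_url, article_url)).find('div')
    body = soup.find('body')
    if body is not None:
        body.insert(len(body.contents), div)   # append as the last child of <body>
    print soup
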
@@ -404,7 +411,7 @@ class BasicNewsRecipe(object):
         return logger, out
 
     def fetch_article(self, url, dir, logger):
-        fetcher = RecursiveFetcher(self.web2disk_options, logger, self.image_map, self.css_map)
+        fetcher = RecursiveFetcher(self.web2disk_options, logger, self.image_map, self.css_map, url)
         fetcher.base_dir = dir
         fetcher.current_dir = dir
         fetcher.show_progress = False
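
fetch_article now hands the article's URL to the fetcher as a fifth positional
argument; RecursiveFetcher (the second file in this commit, below) stores it as
job_info and passes it back to the postprocess callback. A toy model of that
threading, runnable on its own; every name here is hypothetical except the ones
taken from the diff:

    class Fetcher(object):
        # Toy stand-in for RecursiveFetcher: stores job_info at construction
        # and hands it back to the postprocess hook.
        def __init__(self, postprocess, job_info=None):
            self.postprocess_html_ext = postprocess
            self.job_info = job_info

        def run(self, soup, last_fetch):
            if callable(self.postprocess_html_ext):
                soup = self.postprocess_html_ext(soup, last_fetch, self.job_info)
            return soup

    def report(soup, last_fetch, article_url):
        if last_fetch:
            print 'downloaded from', article_url
        return soup

    f = Fetcher(report, job_info='http://example.com/story')
    f.run('<html/>', True)      # prints: downloaded from http://example.com/story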

@@ -58,7 +58,7 @@ class RecursiveFetcher(object):
     #                )
     CSS_IMPORT_PATTERN = re.compile(r'\@import\s+url\((.*?)\)', re.IGNORECASE)
 
-    def __init__(self, options, logger, image_map={}, css_map={}):
+    def __init__(self, options, logger, image_map={}, css_map={}, job_info=None):
         self.logger = logger
         self.base_dir = os.path.abspath(os.path.expanduser(options.dir))
         if not os.path.exists(self.base_dir):
@@ -88,11 +88,11 @@ class RecursiveFetcher(object):
         self.remove_tags_before = getattr(options, 'remove_tags_before', None)
         self.keep_only_tags = getattr(options, 'keep_only_tags', [])
         self.preprocess_html_ext = getattr(options, 'preprocess_html', lambda soup: soup)
-        self.postprocess_html_ext= getattr(options, 'postprocess_html', [])
+        self.postprocess_html_ext= getattr(options, 'postprocess_html', None)
         self.download_stylesheets = not options.no_stylesheets
         self.show_progress = True
         self.failed_links = []
+        self.job_info = job_info
 
     def get_soup(self, src):
         nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
@@ -293,14 +293,15 @@ class RecursiveFetcher(object):
                     self.localize_link(tag, 'href', self.filemap[nurl])
 
     def process_links(self, soup, baseurl, recursion_level, into_dir='links'):
-        c, res = 0, ''
+        res = ''
         diskpath = os.path.join(self.current_dir, into_dir)
         if not os.path.exists(diskpath):
             os.mkdir(diskpath)
         prev_dir = self.current_dir
         try:
             self.current_dir = diskpath
-            for tag in soup.findAll('a', href=True):
+            tags = list(soup.findAll('a', href=True))
+            for c, tag in enumerate(tags):
                 if self.show_progress:
                     print '.',
                     sys.stdout.flush()
@@ -314,7 +315,6 @@ class RecursiveFetcher(object):
                     continue
                 if self.files > self.max_files:
                     return res
-                c += 1
                 linkdir = 'link'+str(c) if into_dir else ''
                 linkdiskpath = os.path.join(diskpath, linkdir)
                 if not os.path.exists(linkdiskpath):
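
The two hunks above are one refactor: findAll's result is materialized into a list so
that len(tags) is known up front, and enumerate supplies the index that the removed
c += 1 used to maintain. One observable side effect worth flagging: enumerate counts
from zero while the old counter was incremented before first use, so the link
subdirectories shift by one (the first saved link now lands in link0 rather than
link1). A tiny illustration:

    # enumerate() replaces the hand-maintained counter; it is zero-based.
    tags = ['a', 'b', 'c']              # stand-in for soup.findAll('a', href=True)
    for c, tag in enumerate(tags):
        print 'link' + str(c), c == len(tags) - 1   # True only on the last link
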
@@ -346,8 +346,8 @@ class RecursiveFetcher(object):
                             self.process_return_links(soup, iurl)
                             self.logger.debug('Recursion limit reached. Skipping links in %s', iurl)
-                        for func in self.postprocess_html_ext:
-                            soup = func(soup)
+                        if callable(self.postprocess_html_ext):
+                            soup = self.postprocess_html_ext(soup, c == len(tags)-1, self.job_info)
                         save_soup(soup, res)
                         self.localize_link(tag, 'href', res)
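
Net effect on the hook's contract: postprocess_html_ext used to be a list of
soup -> soup functions applied in order; it is now a single optional callable (hence
the default changing from [] to None, and the callable() guard), invoked with the
soup, a flag that is true only for the last link processed, and the opaque job_info,
which BasicNewsRecipe sets to the article URL. A sketch of a hook matching the new
signature; the body is illustrative, not from the commit:

    def my_postprocess(soup, last_fetch, job_info):
        # soup:       the parsed page about to be saved to disk
        # last_fetch: True only for the final page fetched for this article
        # job_info:   whatever was passed to RecursiveFetcher; here, the article URL
        if last_fetch:
            pass    # e.g. append per-article material, as _postprocess_html does
        return soup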