From 1a1fd62a2c143191c51d77489e6aba1a940e3aa2 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sat, 15 Mar 2008 23:49:12 +0000
Subject: [PATCH] Added article source link to bottom of articles

---
 src/libprs500/web/feeds/news.py   | 15 +++++++++++----
 src/libprs500/web/fetch/simple.py | 16 ++++++++--------
 2 files changed, 19 insertions(+), 12 deletions(-)

diff --git a/src/libprs500/web/feeds/news.py b/src/libprs500/web/feeds/news.py
index a4d94e5427..4effa12f6e 100644
--- a/src/libprs500/web/feeds/news.py
+++ b/src/libprs500/web/feeds/news.py
@@ -318,7 +318,7 @@ class BasicNewsRecipe(object):
         for extra in ('keep_only_tags', 'remove_tags', 'preprocess_regexps',
                       'preprocess_html', 'remove_tags_after', 'remove_tags_before'):
             setattr(self.web2disk_options, extra, getattr(self, extra))
-        self.web2disk_options.postprocess_html = [self._postprocess_html, self.postprocess_html]
+        self.web2disk_options.postprocess_html = self._postprocess_html
 
         if self.delay > 0:
             self.simultaneous_downloads = 1
@@ -329,13 +329,20 @@ class BasicNewsRecipe(object):
 
         self.partial_failures = []
 
-    def _postprocess_html(self, soup):
+    def _postprocess_html(self, soup, last_fetch, article_url):
         if self.extra_css is not None:
             head = soup.find('head')
             if head:
                 style = BeautifulSoup(u'<style type="text/css">%s</style>'%self.extra_css).find('style')
                 head.insert(len(head.contents), style)
-        return soup
+        if last_fetch:
+            body = soup.find('body')
+            if body:
+                div = BeautifulSoup('<div>This article was downloaded by %s from <a href="%s">%s</a></div>'%(__appname__, article_url, article_url)).find('div')
+                body.insert(len(body.contents), div)
+
+        return self.postprocess_html(soup)
+
 
     def download(self):
         '''
@@ -404,7 +411,7 @@ class BasicNewsRecipe(object):
         return logger, out
 
     def fetch_article(self, url, dir, logger):
-        fetcher = RecursiveFetcher(self.web2disk_options, logger, self.image_map, self.css_map)
+        fetcher = RecursiveFetcher(self.web2disk_options, logger, self.image_map, self.css_map, url)
         fetcher.base_dir = dir
         fetcher.current_dir = dir
         fetcher.show_progress = False
diff --git a/src/libprs500/web/fetch/simple.py b/src/libprs500/web/fetch/simple.py
index a16c619190..3a2ab9ec19 100644
--- a/src/libprs500/web/fetch/simple.py
+++ b/src/libprs500/web/fetch/simple.py
@@ -58,7 +58,7 @@ class RecursiveFetcher(object):
     #                )
     CSS_IMPORT_PATTERN = re.compile(r'\@import\s+url\((.*?)\)', re.IGNORECASE)
 
-    def __init__(self, options, logger, image_map={}, css_map={}):
+    def __init__(self, options, logger, image_map={}, css_map={}, job_info=None):
         self.logger = logger
         self.base_dir = os.path.abspath(os.path.expanduser(options.dir))
         if not os.path.exists(self.base_dir):
@@ -88,11 +88,11 @@ class RecursiveFetcher(object):
         self.remove_tags_before = getattr(options, 'remove_tags_before', None)
         self.keep_only_tags = getattr(options, 'keep_only_tags', [])
         self.preprocess_html_ext = getattr(options, 'preprocess_html', lambda soup: soup)
-        self.postprocess_html_ext= getattr(options, 'postprocess_html', [])
+        self.postprocess_html_ext= getattr(options, 'postprocess_html', None)
         self.download_stylesheets = not options.no_stylesheets
         self.show_progress = True
         self.failed_links = []
-
+        self.job_info = job_info
 
     def get_soup(self, src):
         nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
@@ -293,14 +293,15 @@ class RecursiveFetcher(object):
                 self.localize_link(tag, 'href', self.filemap[nurl])
 
     def process_links(self, soup, baseurl, recursion_level, into_dir='links'):
-        c, res = 0, ''
+        res = ''
         diskpath = os.path.join(self.current_dir, into_dir)
         if not os.path.exists(diskpath):
             os.mkdir(diskpath)
         prev_dir = self.current_dir
         try:
             self.current_dir = diskpath
-            for tag in soup.findAll('a', href=True):
+            tags = list(soup.findAll('a', href=True))
+            for c, tag in enumerate(tags):
                 if self.show_progress:
                     print '.',
                     sys.stdout.flush()
@@ -314,7 +315,6 @@ class RecursiveFetcher(object):
                     continue
                 if self.files > self.max_files:
                     return res
-                c += 1
                 linkdir = 'link'+str(c) if into_dir else ''
                 linkdiskpath = os.path.join(diskpath, linkdir)
                 if not os.path.exists(linkdiskpath):
@@ -346,8 +346,8 @@ class RecursiveFetcher(object):
                                 self.process_return_links(soup, iurl)
                             self.logger.debug('Recursion limit reached. Skipping links in %s', iurl)
 
-                        for func in self.postprocess_html_ext:
-                            soup = func(soup)
+                        if callable(self.postprocess_html_ext):
+                            soup = self.postprocess_html_ext(soup, c == len(tags)-1, self.job_info)
                         save_soup(soup, res)
                         self.localize_link(tag, 'href', res)
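
Note on the new hook contract: web2disk_options.postprocess_html changes here from a list of callables to a single callable invoked as postprocess_html(soup, last_fetch, job_info). RecursiveFetcher passes last_fetch=True only for the final link tag it processes (c == len(tags)-1) and forwards job_info, which fetch_article now sets to the article URL, so the source link is appended exactly once per article. Below is a minimal standalone sketch of that footer step, written against the modern bs4 package rather than the BeautifulSoup 3 copy bundled with libprs500; append_source_link, APP_NAME and the sample URL are illustrative stand-ins, not names from the patch.

# Sketch only: mirrors the footer that _postprocess_html appends above.
# Assumes the modern bs4 package, not the bundled BeautifulSoup 3;
# APP_NAME stands in for __appname__ and the URL is a placeholder.
from bs4 import BeautifulSoup

APP_NAME = 'libprs500'

def append_source_link(html, article_url):
    # Parse the article and locate its body; bail out if there is none.
    soup = BeautifulSoup(html, 'html.parser')
    body = soup.find('body')
    if body is None:
        return soup
    # Build the footer div in a throwaway soup, then move it into place.
    footer = BeautifulSoup(
        '<div>This article was downloaded by %s from <a href="%s">%s</a></div>'
        % (APP_NAME, article_url, article_url), 'html.parser').find('div')
    body.append(footer)  # same effect as body.insert(len(body.contents), div)
    return soup

if __name__ == '__main__':
    doc = '<html><head></head><body><p>Article text</p></body></html>'
    print(append_source_link(doc, 'http://example.com/article'))

Building the footer in a scratch soup and moving it into the target document is the same pattern the patch uses; bs4's append() is the one-line equivalent of the body.insert(len(body.contents), div) call in news.py.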