Added article source link to bottom of articles

Kovid Goyal, 2008-03-15 23:49:12 +00:00
commit 1a1fd62a2c (parent 8da62e7383)
2 changed files with 19 additions and 12 deletions

@@ -318,7 +318,7 @@ class BasicNewsRecipe(object):
         for extra in ('keep_only_tags', 'remove_tags', 'preprocess_regexps',
                       'preprocess_html', 'remove_tags_after', 'remove_tags_before'):
             setattr(self.web2disk_options, extra, getattr(self, extra))
-        self.web2disk_options.postprocess_html = [self._postprocess_html, self.postprocess_html]
+        self.web2disk_options.postprocess_html = self._postprocess_html
         if self.delay > 0:
             self.simultaneous_downloads = 1
@@ -329,13 +329,20 @@ class BasicNewsRecipe(object):
         self.partial_failures = []
 
-    def _postprocess_html(self, soup):
+    def _postprocess_html(self, soup, last_fetch, article_url):
         if self.extra_css is not None:
             head = soup.find('head')
             if head:
                 style = BeautifulSoup(u'<style type="text/css">%s</style>'%self.extra_css).find('style')
                 head.insert(len(head.contents), style)
-        return soup
+        if last_fetch:
+            body = soup.find('body')
+            if body:
+                div = BeautifulSoup('<div style="font:8pt monospace"><hr />This article was downloaded by <b>%s</b> from <a href="%s">%s</a></div>'%(__appname__, article_url, article_url)).find('div')
+                body.insert(len(body.contents), div)
+        return self.postprocess_html(soup)
 
     def download(self):
         '''
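
The heart of the commit is the new branch in _postprocess_html: when last_fetch is
true, i.e. the soup being processed belongs to the last page fetched for the article,
a small footer crediting the application and linking back to the source URL is
appended to the body. Note also that the method now ends by delegating to the
recipe's overridable postprocess_html hook, which is why web2disk_options above
carries a single bound method instead of a list. A minimal standalone sketch of the
footer insertion, assuming BeautifulSoup 3 (the parser this codebase uses) and
placeholder values for __appname__ and article_url:

    from BeautifulSoup import BeautifulSoup

    __appname__ = 'app'                        # assumption: normally supplied by the package
    article_url = 'http://example.com/story'   # assumption: the article's source URL

    soup = BeautifulSoup('<html><body><p>Article text</p></body></html>')
    div = BeautifulSoup('<div style="font:8pt monospace"><hr />This article was '
            'downloaded by <b>%s</b> from <a href="%s">%s</a></div>'
            % (__appname__, article_url, article_url)).find('div')
    body = soup.find('body')
    if body is not None:
        body.insert(len(body.contents), div)   # append as the last child of <body>
    print soup
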
@@ -404,7 +411,7 @@ class BasicNewsRecipe(object):
         return logger, out
 
     def fetch_article(self, url, dir, logger):
-        fetcher = RecursiveFetcher(self.web2disk_options, logger, self.image_map, self.css_map)
+        fetcher = RecursiveFetcher(self.web2disk_options, logger, self.image_map, self.css_map, url)
         fetcher.base_dir = dir
         fetcher.current_dir = dir
         fetcher.show_progress = False
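
fetch_article now hands the article's URL to the fetcher as a fifth positional
argument; RecursiveFetcher (the second file in this commit, below) stores it as
job_info and passes it back to the postprocess callback. A toy model of that
threading, runnable on its own; every name here is hypothetical except the ones
taken from the diff:

    class Fetcher(object):
        # Toy stand-in for RecursiveFetcher: stores job_info at construction
        # and hands it back to the postprocess hook.
        def __init__(self, postprocess, job_info=None):
            self.postprocess_html_ext = postprocess
            self.job_info = job_info

        def run(self, soup, last_fetch):
            if callable(self.postprocess_html_ext):
                soup = self.postprocess_html_ext(soup, last_fetch, self.job_info)
            return soup

    def report(soup, last_fetch, article_url):
        if last_fetch:
            print 'downloaded from', article_url
        return soup

    f = Fetcher(report, job_info='http://example.com/story')
    f.run('<html/>', True)      # prints: downloaded from http://example.com/story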

@@ -58,7 +58,7 @@ class RecursiveFetcher(object):
     #                )
     CSS_IMPORT_PATTERN = re.compile(r'\@import\s+url\((.*?)\)', re.IGNORECASE)
 
-    def __init__(self, options, logger, image_map={}, css_map={}):
+    def __init__(self, options, logger, image_map={}, css_map={}, job_info=None):
         self.logger = logger
         self.base_dir = os.path.abspath(os.path.expanduser(options.dir))
         if not os.path.exists(self.base_dir):
@@ -88,11 +88,11 @@ class RecursiveFetcher(object):
         self.remove_tags_before = getattr(options, 'remove_tags_before', None)
         self.keep_only_tags = getattr(options, 'keep_only_tags', [])
         self.preprocess_html_ext = getattr(options, 'preprocess_html', lambda soup: soup)
-        self.postprocess_html_ext= getattr(options, 'postprocess_html', [])
+        self.postprocess_html_ext= getattr(options, 'postprocess_html', None)
         self.download_stylesheets = not options.no_stylesheets
         self.show_progress = True
         self.failed_links = []
+        self.job_info = job_info
 
     def get_soup(self, src):
         nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
@@ -293,14 +293,15 @@ class RecursiveFetcher(object):
                     self.localize_link(tag, 'href', self.filemap[nurl])
 
     def process_links(self, soup, baseurl, recursion_level, into_dir='links'):
-        c, res = 0, ''
+        res = ''
         diskpath = os.path.join(self.current_dir, into_dir)
         if not os.path.exists(diskpath):
             os.mkdir(diskpath)
         prev_dir = self.current_dir
         try:
             self.current_dir = diskpath
-            for tag in soup.findAll('a', href=True):
+            tags = list(soup.findAll('a', href=True))
+            for c, tag in enumerate(tags):
                 if self.show_progress:
                     print '.',
                     sys.stdout.flush()
@@ -314,7 +315,6 @@ class RecursiveFetcher(object):
                     continue
                 if self.files > self.max_files:
                     return res
-                c += 1
                 linkdir = 'link'+str(c) if into_dir else ''
                 linkdiskpath = os.path.join(diskpath, linkdir)
                 if not os.path.exists(linkdiskpath):
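
The two hunks above are one refactor: findAll's result is materialized into a list so
that len(tags) is known up front, and enumerate supplies the index that the removed
c += 1 used to maintain. One observable side effect worth flagging: enumerate counts
from zero while the old counter was incremented before first use, so the link
subdirectories shift by one (the first saved link now lands in link0 rather than
link1). A tiny illustration:

    # enumerate() replaces the hand-maintained counter; it is zero-based.
    tags = ['a', 'b', 'c']              # stand-in for soup.findAll('a', href=True)
    for c, tag in enumerate(tags):
        print 'link' + str(c), c == len(tags) - 1   # True only on the last link
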
@@ -346,8 +346,8 @@ class RecursiveFetcher(object):
                             self.process_return_links(soup, iurl)
                             self.logger.debug('Recursion limit reached. Skipping links in %s', iurl)
-                        for func in self.postprocess_html_ext:
-                            soup = func(soup)
+                        if callable(self.postprocess_html_ext):
+                            soup = self.postprocess_html_ext(soup, c == len(tags)-1, self.job_info)
                         save_soup(soup, res)
                         self.localize_link(tag, 'href', res)
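
Net effect on the hook's contract: postprocess_html_ext used to be a list of
soup -> soup functions applied in order; it is now a single optional callable (hence
the default changing from [] to None, and the callable() guard), invoked with the
soup, a flag that is true only for the last link processed, and the opaque job_info,
which BasicNewsRecipe sets to the article URL. A sketch of a hook matching the new
signature; the body is illustrative, not from the commit:

    def my_postprocess(soup, last_fetch, job_info):
        # soup:       the parsed page about to be saved to disk
        # last_fetch: True only for the final page fetched for this article
        # job_info:   whatever was passed to RecursiveFetcher; here, the article URL
        if last_fetch:
            pass    # e.g. append per-article material, as _postprocess_html does
        return soup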