mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-07-09 03:04:10 -04:00)
Added article source link to bottom of articles

commit 1a1fd62a2c
parent 8da62e7383
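
In short: BasicNewsRecipe now registers a single postprocess hook with web2disk instead of a list of callbacks, each article's URL is threaded into RecursiveFetcher as job_info, and once the last page of an article has been fetched a small footer div linking back to the original source is appended to the page body.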
@@ -318,7 +318,7 @@ class BasicNewsRecipe(object):
         for extra in ('keep_only_tags', 'remove_tags', 'preprocess_regexps',
                       'preprocess_html', 'remove_tags_after', 'remove_tags_before'):
             setattr(self.web2disk_options, extra, getattr(self, extra))
-        self.web2disk_options.postprocess_html = [self._postprocess_html, self.postprocess_html]
+        self.web2disk_options.postprocess_html = self._postprocess_html

         if self.delay > 0:
             self.simultaneous_downloads = 1
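
web2disk now receives the bound _postprocess_html method directly rather than a two-element callback list; the recipe's own postprocess_html is called from inside that method, and the fetcher's call site changes to match in the final hunk below.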
@@ -329,13 +329,20 @@ class BasicNewsRecipe(object):
         self.partial_failures = []


-    def _postprocess_html(self, soup):
+    def _postprocess_html(self, soup, last_fetch, article_url):
         if self.extra_css is not None:
             head = soup.find('head')
             if head:
                 style = BeautifulSoup(u'<style type="text/css">%s</style>'%self.extra_css).find('style')
                 head.insert(len(head.contents), style)
-        return soup
+        if last_fetch:
+            body = soup.find('body')
+            if body:
+                div = BeautifulSoup('<div style="font:8pt monospace"><hr />This article was downloaded by <b>%s</b> from <a href="%s">%s</a></div>'%(__appname__, article_url, article_url)).find('div')
+                body.insert(len(body.contents), div)
+
+        return self.postprocess_html(soup)


     def download(self):
         '''
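
For reference, the appended footer can be reproduced in isolation. A minimal sketch, using the modern bs4 API rather than the BeautifulSoup 3 API the patch targets; append_source_link and the appname parameter are illustrative stand-ins (the patch itself uses the __appname__ global):

    from bs4 import BeautifulSoup

    def append_source_link(html, article_url, appname='calibre'):
        # Parse the page and locate <body>, as _postprocess_html does.
        soup = BeautifulSoup(html, 'html.parser')
        body = soup.find('body')
        if body is not None:
            # Build the same footer div as the patch and append it as the
            # last child of <body>.
            div = BeautifulSoup(
                '<div style="font:8pt monospace"><hr />This article was '
                'downloaded by <b>%s</b> from <a href="%s">%s</a></div>'
                % (appname, article_url, article_url),
                'html.parser').find('div')
            body.insert(len(body.contents), div)
        return soup

    print(append_source_link('<html><body><p>story</p></body></html>',
                             'http://example.com/a'))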
@@ -404,7 +411,7 @@ class BasicNewsRecipe(object):
         return logger, out

     def fetch_article(self, url, dir, logger):
-        fetcher = RecursiveFetcher(self.web2disk_options, logger, self.image_map, self.css_map)
+        fetcher = RecursiveFetcher(self.web2disk_options, logger, self.image_map, self.css_map, url)
         fetcher.base_dir = dir
         fetcher.current_dir = dir
         fetcher.show_progress = False
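
fetch_article passes the article's url through as the fetcher's new job_info argument; _postprocess_html eventually receives it back as article_url.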
@@ -58,7 +58,7 @@ class RecursiveFetcher(object):
    # )
    CSS_IMPORT_PATTERN = re.compile(r'\@import\s+url\((.*?)\)', re.IGNORECASE)

-    def __init__(self, options, logger, image_map={}, css_map={}):
+    def __init__(self, options, logger, image_map={}, css_map={}, job_info=None):
         self.logger = logger
         self.base_dir = os.path.abspath(os.path.expanduser(options.dir))
         if not os.path.exists(self.base_dir):
@@ -88,11 +88,11 @@ class RecursiveFetcher(object):
         self.remove_tags_before = getattr(options, 'remove_tags_before', None)
         self.keep_only_tags = getattr(options, 'keep_only_tags', [])
         self.preprocess_html_ext = getattr(options, 'preprocess_html', lambda soup: soup)
-        self.postprocess_html_ext= getattr(options, 'postprocess_html', [])
+        self.postprocess_html_ext= getattr(options, 'postprocess_html', None)
         self.download_stylesheets = not options.no_stylesheets
         self.show_progress = True
         self.failed_links = []
+        self.job_info = job_info

     def get_soup(self, src):
         nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE)
@@ -293,14 +293,15 @@ class RecursiveFetcher(object):
                     self.localize_link(tag, 'href', self.filemap[nurl])

     def process_links(self, soup, baseurl, recursion_level, into_dir='links'):
-        c, res = 0, ''
+        res = ''
         diskpath = os.path.join(self.current_dir, into_dir)
         if not os.path.exists(diskpath):
             os.mkdir(diskpath)
         prev_dir = self.current_dir
         try:
             self.current_dir = diskpath
-            for tag in soup.findAll('a', href=True):
+            tags = list(soup.findAll('a', href=True))
+            for c, tag in enumerate(tags):
                 if self.show_progress:
                     print '.',
                     sys.stdout.flush()
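
The manual counter is replaced by enumerate over a materialized list of link tags, which lets the loop detect its final iteration. A minimal sketch of the idiom, with stand-in data:

    tags = ['a1', 'a2', 'a3']  # stand-ins for the <a href=...> tags
    for c, tag in enumerate(tags):
        # True only on the final link; passed to the hook as last_fetch.
        last = (c == len(tags) - 1)
        print('%s %s' % (tag, last))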
@@ -314,7 +315,6 @@ class RecursiveFetcher(object):
                     continue
                 if self.files > self.max_files:
                     return res
-                c += 1
                 linkdir = 'link'+str(c) if into_dir else ''
                 linkdiskpath = os.path.join(diskpath, linkdir)
                 if not os.path.exists(linkdiskpath):
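
Note that enumerate counts from 0 and also advances past links skipped by continue, whereas the old counter started the first directory at 'link1' and only advanced for fetched links, so on-disk link directory names can shift under this change.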
@@ -346,8 +346,8 @@ class RecursiveFetcher(object):
                             self.process_return_links(soup, iurl)
                             self.logger.debug('Recursion limit reached. Skipping links in %s', iurl)

-                    for func in self.postprocess_html_ext:
-                        soup = func(soup)
+                    if callable(self.postprocess_html_ext):
+                        soup = self.postprocess_html_ext(soup, c == len(tags)-1, self.job_info)
                     save_soup(soup, res)

                 self.localize_link(tag, 'href', res)
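
Taken together, the fetcher now treats postprocess_html as one optional hook with a richer signature, rather than a list of one-argument callbacks. A minimal sketch of the new calling convention, with illustrative names and stand-in objects:

    def _postprocess_html(soup, last_fetch, article_url):
        # Recipe-side hook: act only once per article, on the last fetch.
        if last_fetch:
            print('append source link for %s' % article_url)
        return soup

    postprocess_html_ext = _postprocess_html    # was: a list of callbacks
    job_info = 'http://example.com/article'     # the article URL in this patch

    soup, tags = 'soup', ['only-link']          # stand-ins for real objects
    for c, tag in enumerate(tags):
        if callable(postprocess_html_ext):
            soup = postprocess_html_ext(soup, c == len(tags) - 1, job_info)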