#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2012, watou'
'''
southernstar.ie
'''
import re
import tempfile
import os
import codecs

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag, NavigableString


class TheSouthernStar(BasicNewsRecipe):
    '''Recipe for The Southern Star (southernstar.ie).

    Builds four sections -- News, Local Notes, Sport and Classifieds.
    News/Sport are scraped directly from their index pages; Local Notes
    and Classifieds are written out to temporary HTML files (removed in
    cleanup()) and referenced with file:// URLs.
    '''

    title = 'The Southern Star'
    __author__ = 'watou'
    description = 'West Cork\'s leading news and information provider since 1889'
    NEWS_INDEX = 'http://www.southernstar.ie/news.php'
    LOCAL_NOTES = 'http://www.southernstar.ie/localnotes.php'
    SPORT_INDEX = 'http://www.southernstar.ie/sport.php'
    CLASSIFIEDS = 'http://www.southernstar.ie/classifieds.php'
    language = 'en_IE'
    encoding = 'cp1252'

    publication_type = 'newspaper'
    masthead_url = 'http://www.southernstar.ie/images/logo.gif'
    remove_tags_before = dict(name='div', attrs={'class':'article'})
    remove_tags_after = dict(name='div', attrs={'class':'article'})
    remove_tags = [dict(name='div', attrs={'style':'width:300px; position:relative'}),
                   dict(name='form'),
                   dict(name='div', attrs={'class':'endpanel'})]
    no_stylesheets = True
    # Temporary HTML files created by fetch_ss_notes(); unlinked in cleanup().
    tempfiles = []
    pubdate = ''

    # Strip HTML comments from fetched pages.
    # NOTE(review): the pattern literal was lost when this file was mangled
    # (it appeared as r''); the standard calibre idiom r'<!--.*?-->' is
    # restored here -- confirm against the original recipe.
    preprocess_regexps = [(re.compile(r'<!--.*?-->', re.DOTALL), lambda m: '')]

    def parse_index(self):
        '''Return the list of (section name, articles) feeds.

        Sections that yield no articles are omitted.  A single seen_titles
        set is shared between News and Sport so an article appearing on
        both index pages is listed only once.
        '''
        feeds = []
        seen_titles = set()

        articles = self.fetch_ss_articles(self.NEWS_INDEX, seen_titles)
        if articles:
            feeds.append(('News', articles))

        articles = self.fetch_ss_notes(self.LOCAL_NOTES)
        if articles:
            feeds.append(('Local Notes', articles))

        articles = self.fetch_ss_articles(self.SPORT_INDEX, seen_titles)
        if articles:
            feeds.append(('Sport', articles))

        articles = self.fetch_ss_notes(self.CLASSIFIEDS)
        if articles:
            feeds.append(('Classifieds', articles))

        return feeds

    def fetch_ss_articles(self, index, seen_titles):
        '''Scrape article entries from a section index page.

        :param index: URL of the section index (news.php or sport.php)
        :param seen_titles: set of titles already collected; duplicates are
            skipped and new titles are added to the set (mutated in place)
        :return: list of article dicts (title/url/description/date)
        '''
        articles = []
        soup = self.index_to_soup(index)
        ts = soup.find('div', {'class':'article'})
        # The first <strong> inside the article div holds the issue date;
        # remember it for the masthead timefmt and per-article 'date'.
        ds = self.tag_to_string(ts.find('strong'))
        self.pubdate = ' ['+ds+']'
        self.timefmt = ' [%s]'%ds

        for post in ts.findAll('h1'):
            a = post.find('a', href=True)
            title = self.tag_to_string(a)
            if title in seen_titles:
                continue
            seen_titles.add(title)
            url = a['href']
            if url.startswith('article'):
                # Site uses relative links ('article...'); make them absolute.
                url = 'http://www.southernstar.ie/'+url
            self.log('\tFound article:', title, 'at', url)
            # The paragraph following the headline is the article teaser.
            p = post.findNextSibling('p')
            desc = None
            if p is not None:
                desc = str(p)
            articles.append({'title':title, 'url':url, 'description':desc,
                             'date':self.pubdate})

        return articles

    def fetch_ss_notes(self, page):
        '''Scrape note-style pages (Local Notes, Classifieds).

        Each note is wrapped in a minimal HTML document and written to a
        temporary file in the site's encoding; the returned article entries
        point at those files via file:// URLs.  The files are tracked in
        self.tempfiles and removed by cleanup().
        '''
        articles = []

        soup = self.index_to_soup(page)
        ts = soup.find('div', {'class':'content'})
        for post in ts.findAll('h1'):
            title = self.tag_to_string(post)
            self.log('\tFound note:', title)
            # Create the file closed (delete=False keeps it on disk), then
            # reopen via codecs so the note is encoded as cp1252 with
            # unencodable characters replaced.
            f = tempfile.NamedTemporaryFile(suffix='.html', delete=False)
            f.close()
            f = codecs.open(f.name, 'w+b', self.encoding, 'replace')
            url = 'file://' + f.name
            # NOTE(review): the HTML wrapper literals below were stripped when
            # this file was mangled; reconstructed here -- confirm against the
            # original recipe.
            f.write(u'<html><head><title>' + title + u'</title></head>'
                    u'<body><h1>' + title + u'</h1>')
            f.write(str(post.findNextSibling('p')))
            f.write(u'</body></html>')
            self.log('\tWrote note to', f.name)
            f.close()
            self.tempfiles.append(f)
            articles.append({'title':title, 'url':url, 'date':self.pubdate})

        return articles

    def postprocess_html(self, soup, first):
        '''Replace right-aligned image tables with centred divs.

        Each matching table is rewritten as a <div> containing the image,
        a line break, and the table's remaining text as a caption.
        '''
        for table in soup.findAll('table', align='right'):
            img = table.find('img')
            if img is not None:
                img.extract()
                # Whatever text remains in the table is its caption.
                caption = self.tag_to_string(table).strip()
                div = Tag(soup, 'div')
                div['style'] = 'text-align:center'
                div.insert(0, img)
                div.insert(1, Tag(soup, 'br'))
                if caption:
                    div.insert(2, NavigableString(caption))
                table.replaceWith(div)

        return soup

    def image_url_processor(self, baseurl, url):
        '''Percent-encode spaces in image URLs so they fetch correctly.'''
        return url.replace(' ', '%20')

    def cleanup(self):
        '''Delete the temporary note files created by fetch_ss_notes().'''
        self.log('cleaning up')
        for f in self.tempfiles:
            os.unlink(f.name)
        self.tempfiles = []