diff --git a/resources/recipes/guardian.recipe b/resources/recipes/guardian.recipe index 194e20d478..9105d17937 100644 --- a/resources/recipes/guardian.recipe +++ b/resources/recipes/guardian.recipe @@ -6,10 +6,7 @@ __docformat__ = 'restructuredtext en' ''' www.guardian.co.uk ''' -import string -import re from calibre import strftime -from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag from calibre.web.feeds.news import BasicNewsRecipe class Guardian(BasicNewsRecipe): @@ -45,58 +42,58 @@ class Guardian(BasicNewsRecipe): #full-contents{font-size:small; font-family:Arial,Helvetica,sans-serif;font-weight:normal;} #match-stats-summary{font-size:small; font-family:Arial,Helvetica,sans-serif;font-weight:normal;} ''' - - def parse_index(self): - + + def parse_index(self): + soup = self.index_to_soup('http://www.guardian.co.uk/theguardian') # find cover pic - img = soup.find( 'img',attrs ={'alt':'Guardian digital edition'}) + img = soup.find( 'img',attrs ={'alt':'Guardian digital edition'}) if img is None: return None else: - self.cover_url = img['src'] + self.cover_url = img['src'] # end find cover pic sections = [] ans = [] for li in soup.findAll( 'li'): section = '' articles = [] - - if li.a and li.a.has_key('href'): - url = li.a['href'] - if 'mainsection' in url: + + if li.a and li.a.has_key('href'): + url = li.a['href'] + if 'mainsection' in url: section = self.tag_to_string(url) i = len(section) - + index1 = section.rfind('/',0,i) - section = section[index1+1:i] + section = section[index1+1:i] sections.append(section) - - #find the articles in the Main Section start + + #find the articles in the Main Section start soup = self.index_to_soup(url) date = strftime('%a, %d %b') descl = [] - + for desclist in soup.findAll(name='div',attrs={'class':"trailtext"}): descl.append(self.tag_to_string(desclist).strip()) - + t = -1 for tag in soup.findAll('h3'): t = t+1 - + for a in tag.findAll('a'): - - if t < len(descl): + + if t < len(descl): desc = descl[t] else: - desc 
= '' - if a and a.has_key('href'): - url2 = a['href'] + desc = '' + if a and a.has_key('href'): + url2 = a['href'] else: - url2 ='' + url2 ='' title = self.tag_to_string(a) - + if len(articles) == 0: #First article - + articles.append({ 'title':title, 'date':date, @@ -105,41 +102,41 @@ class Guardian(BasicNewsRecipe): }) else: #eliminate duplicates start - if {'title':title,'date':date,'url':url2,'description':desc} in articles : + if {'title':title,'date':date,'url':url2,'description':desc} in articles : url2 = '' #eliminate duplicates end else: if 'http://jobs.guardian.co.uk/' in url2: url2 = '' else: - + articles.append({ 'title':title, 'date':date, 'url':url2, - 'description':desc, - }) + 'description':desc, + }) #find the articles in the Main Section end ans.append( articles) - + else: url ='' - - + + titles = map(self.find_title, sections) ans1 = list(zip(titles,ans)) - + return ans1[2:] - + def find_title(self, section): d = {'topstories':'Top Stories', 'international':'International', 'editorialsandreply':'Editorials and Reply', 'commentanddebate':'Comment and Debate','uknews':'UK News','saturday':'Saturday','sunday':'Sunday', - 'reviews':'Reviews', 'obituaries':'Obituaries'} - + 'reviews':'Reviews', 'obituaries':'Obituaries'} + return d.get(section, section) def preprocess_html(self, soup): - + for item in soup.findAll(style=True): del item['style'] diff --git a/resources/recipes/irish_times.recipe b/resources/recipes/irish_times.recipe index f536895903..7c5772eaa5 100644 --- a/resources/recipes/irish_times.recipe +++ b/resources/recipes/irish_times.recipe @@ -1,5 +1,5 @@ __license__ = 'GPL v3' -__copyright__ = '2008, Derry FitzGerald. 2009 Modified by Ray Kinsella' +__copyright__ = "2008, Derry FitzGerald. 
2009 Modified by Ray Kinsella and David O'Callaghan" ''' irishtimes.com ''' @@ -9,18 +9,21 @@ from calibre.web.feeds.news import BasicNewsRecipe class IrishTimes(BasicNewsRecipe): title = u'The Irish Times' - __author__ = 'Derry FitzGerald and Ray Kinsella' + __author__ = "Derry FitzGerald, Ray Kinsella and David O'Callaghan" language = 'en' + timefmt = ' (%A, %B %e, %Y)' + + oldest_article = 3 no_stylesheets = True simultaneous_downloads= 1 - - r = re.compile('.*(?P<url>http:\/\/www.irishtimes.com\/.*\.html).*') + + r = re.compile('.*(?P<url>http:\/\/(www.irishtimes.com)|(rss.feedsportal.com\/c)\/.*\.html?).*') remove_tags = [dict(name='div', attrs={'class':'footer'})] - extra_css = '.headline {font-size: x-large;} \n .fact { padding-top: 10pt }' + extra_css = '.headline {font-size: x-large;} \n .fact { padding-top: 10pt }' feeds = [ - ('Frontpage', 'http://www.irishtimes.com/feeds/rss/newspaper/index.rss'), + ('Frontpage', 'http://www.irishtimes.com/feeds/rss/newspaper/index.rss'), ('Ireland', 'http://www.irishtimes.com/feeds/rss/newspaper/ireland.rss'), ('World', 'http://www.irishtimes.com/feeds/rss/newspaper/world.rss'), ('Finance', 'http://www.irishtimes.com/feeds/rss/newspaper/finance.rss'), @@ -29,12 +32,14 @@ class IrishTimes(BasicNewsRecipe): ('Opinion', 'http://www.irishtimes.com/feeds/rss/newspaper/opinion.rss'), ('Letters', 'http://www.irishtimes.com/feeds/rss/newspaper/letters.rss'), ] - + def print_version(self, url): - return url.replace('.html', '_pf.html') - + if url.count('rss.feedsportal.com'): + u = url.replace('0Bhtml/story01.htm','_pf0Bhtml/story01.htm') + else: + u = url.replace('.html','_pf.html') + return u + def get_article_url(self, article): - m = self.r.match(article.get('description', None)) - print m.group('url') - return m.group('url') \ No newline at end of file + return article.link