Fix #3972 (The Irish Times feeds have changed)

This commit is contained in:
Kovid Goyal 2009-11-09 09:05:04 -07:00
parent 2f43bc64ea
commit 569dbeb2b9
2 changed files with 53 additions and 51 deletions

View File

@ -6,10 +6,7 @@ __docformat__ = 'restructuredtext en'
''' '''
www.guardian.co.uk www.guardian.co.uk
''' '''
import string
import re
from calibre import strftime from calibre import strftime
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class Guardian(BasicNewsRecipe): class Guardian(BasicNewsRecipe):
@ -45,58 +42,58 @@ class Guardian(BasicNewsRecipe):
#full-contents{font-size:small; font-family:Arial,Helvetica,sans-serif;font-weight:normal;} #full-contents{font-size:small; font-family:Arial,Helvetica,sans-serif;font-weight:normal;}
#match-stats-summary{font-size:small; font-family:Arial,Helvetica,sans-serif;font-weight:normal;} #match-stats-summary{font-size:small; font-family:Arial,Helvetica,sans-serif;font-weight:normal;}
''' '''
def parse_index(self): def parse_index(self):
soup = self.index_to_soup('http://www.guardian.co.uk/theguardian') soup = self.index_to_soup('http://www.guardian.co.uk/theguardian')
# find cover pic # find cover pic
img = soup.find( 'img',attrs ={'alt':'Guardian digital edition'}) img = soup.find( 'img',attrs ={'alt':'Guardian digital edition'})
if img is None: return None if img is None: return None
else: else:
self.cover_url = img['src'] self.cover_url = img['src']
# end find cover pic # end find cover pic
sections = [] sections = []
ans = [] ans = []
for li in soup.findAll( 'li'): for li in soup.findAll( 'li'):
section = '' section = ''
articles = [] articles = []
if li.a and li.a.has_key('href'): if li.a and li.a.has_key('href'):
url = li.a['href'] url = li.a['href']
if 'mainsection' in url: if 'mainsection' in url:
section = self.tag_to_string(url) section = self.tag_to_string(url)
i = len(section) i = len(section)
index1 = section.rfind('/',0,i) index1 = section.rfind('/',0,i)
section = section[index1+1:i] section = section[index1+1:i]
sections.append(section) sections.append(section)
#find the articles in the Main Section start #find the articles in the Main Section start
soup = self.index_to_soup(url) soup = self.index_to_soup(url)
date = strftime('%a, %d %b') date = strftime('%a, %d %b')
descl = [] descl = []
for desclist in soup.findAll(name='div',attrs={'class':"trailtext"}): for desclist in soup.findAll(name='div',attrs={'class':"trailtext"}):
descl.append(self.tag_to_string(desclist).strip()) descl.append(self.tag_to_string(desclist).strip())
t = -1 t = -1
for tag in soup.findAll('h3'): for tag in soup.findAll('h3'):
t = t+1 t = t+1
for a in tag.findAll('a'): for a in tag.findAll('a'):
if t < len(descl): if t < len(descl):
desc = descl[t] desc = descl[t]
else: else:
desc = '' desc = ''
if a and a.has_key('href'): if a and a.has_key('href'):
url2 = a['href'] url2 = a['href']
else: else:
url2 ='' url2 =''
title = self.tag_to_string(a) title = self.tag_to_string(a)
if len(articles) == 0: #First article if len(articles) == 0: #First article
articles.append({ articles.append({
'title':title, 'title':title,
'date':date, 'date':date,
@ -105,41 +102,41 @@ class Guardian(BasicNewsRecipe):
}) })
else: else:
#eliminate duplicates start #eliminate duplicates start
if {'title':title,'date':date,'url':url2,'description':desc} in articles : if {'title':title,'date':date,'url':url2,'description':desc} in articles :
url2 = '' url2 = ''
#eliminate duplicates end #eliminate duplicates end
else: else:
if 'http://jobs.guardian.co.uk/' in url2: if 'http://jobs.guardian.co.uk/' in url2:
url2 = '' url2 = ''
else: else:
articles.append({ articles.append({
'title':title, 'title':title,
'date':date, 'date':date,
'url':url2, 'url':url2,
'description':desc, 'description':desc,
}) })
#find the articles in the Main Section end #find the articles in the Main Section end
ans.append( articles) ans.append( articles)
else: else:
url ='' url =''
titles = map(self.find_title, sections) titles = map(self.find_title, sections)
ans1 = list(zip(titles,ans)) ans1 = list(zip(titles,ans))
return ans1[2:] return ans1[2:]
def find_title(self, section): def find_title(self, section):
d = {'topstories':'Top Stories', 'international':'International', 'editorialsandreply':'Editorials and Reply', d = {'topstories':'Top Stories', 'international':'International', 'editorialsandreply':'Editorials and Reply',
'commentanddebate':'Comment and Debate','uknews':'UK News','saturday':'Saturday','sunday':'Sunday', 'commentanddebate':'Comment and Debate','uknews':'UK News','saturday':'Saturday','sunday':'Sunday',
'reviews':'Reviews', 'obituaries':'Obituaries'} 'reviews':'Reviews', 'obituaries':'Obituaries'}
return d.get(section, section) return d.get(section, section)
def preprocess_html(self, soup): def preprocess_html(self, soup):
for item in soup.findAll(style=True): for item in soup.findAll(style=True):
del item['style'] del item['style']

View File

@ -1,5 +1,5 @@
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2008, Derry FitzGerald. 2009 Modified by Ray Kinsella' __copyright__ = "2008, Derry FitzGerald. 2009 Modified by Ray Kinsella and David O'Callaghan"
''' '''
irishtimes.com irishtimes.com
''' '''
@ -9,18 +9,21 @@ from calibre.web.feeds.news import BasicNewsRecipe
class IrishTimes(BasicNewsRecipe): class IrishTimes(BasicNewsRecipe):
title = u'The Irish Times' title = u'The Irish Times'
__author__ = 'Derry FitzGerald and Ray Kinsella' __author__ = "Derry FitzGerald, Ray Kinsella and David O'Callaghan"
language = 'en' language = 'en'
timefmt = ' (%A, %B %e, %Y)'
oldest_article = 3
no_stylesheets = True no_stylesheets = True
simultaneous_downloads= 1 simultaneous_downloads= 1
r = re.compile('.*(?P<url>http:\/\/www.irishtimes.com\/.*\.html).*') r = re.compile('.*(?P<url>http:\/\/(www.irishtimes.com)|(rss.feedsportal.com\/c)\/.*\.html?).*')
remove_tags = [dict(name='div', attrs={'class':'footer'})] remove_tags = [dict(name='div', attrs={'class':'footer'})]
extra_css = '.headline {font-size: x-large;} \n .fact { padding-top: 10pt }' extra_css = '.headline {font-size: x-large;} \n .fact { padding-top: 10pt }'
feeds = [ feeds = [
('Frontpage', 'http://www.irishtimes.com/feeds/rss/newspaper/index.rss'), ('Frontpage', 'http://www.irishtimes.com/feeds/rss/newspaper/index.rss'),
('Ireland', 'http://www.irishtimes.com/feeds/rss/newspaper/ireland.rss'), ('Ireland', 'http://www.irishtimes.com/feeds/rss/newspaper/ireland.rss'),
('World', 'http://www.irishtimes.com/feeds/rss/newspaper/world.rss'), ('World', 'http://www.irishtimes.com/feeds/rss/newspaper/world.rss'),
('Finance', 'http://www.irishtimes.com/feeds/rss/newspaper/finance.rss'), ('Finance', 'http://www.irishtimes.com/feeds/rss/newspaper/finance.rss'),
@ -29,12 +32,14 @@ class IrishTimes(BasicNewsRecipe):
('Opinion', 'http://www.irishtimes.com/feeds/rss/newspaper/opinion.rss'), ('Opinion', 'http://www.irishtimes.com/feeds/rss/newspaper/opinion.rss'),
('Letters', 'http://www.irishtimes.com/feeds/rss/newspaper/letters.rss'), ('Letters', 'http://www.irishtimes.com/feeds/rss/newspaper/letters.rss'),
] ]
def print_version(self, url): def print_version(self, url):
return url.replace('.html', '_pf.html') if url.count('rss.feedsportal.com'):
u = url.replace('0Bhtml/story01.htm','_pf0Bhtml/story01.htm')
else:
u = url.replace('.html','_pf.html')
return u
def get_article_url(self, article): def get_article_url(self, article):
m = self.r.match(article.get('description', None)) return article.link
print m.group('url')
return m.group('url')