IGN:Split new guardian recipe into sections

This commit is contained in:
Kovid Goyal 2009-11-07 08:42:47 -07:00
parent 7f4365e159
commit ab72fba13b
2 changed files with 79 additions and 76 deletions

Binary file not shown.

After

Width:  |  Height:  |  Size: 548 B

View File

@ -6,7 +6,10 @@ __docformat__ = 'restructuredtext en'
''' '''
www.guardian.co.uk www.guardian.co.uk
''' '''
import string
import re
from calibre import strftime
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class Guardian(BasicNewsRecipe): class Guardian(BasicNewsRecipe):
@ -43,67 +46,57 @@ class Guardian(BasicNewsRecipe):
#match-stats-summary{font-size:small; font-family:Arial,Helvetica,sans-serif;font-weight:normal;} #match-stats-summary{font-size:small; font-family:Arial,Helvetica,sans-serif;font-weight:normal;}
''' '''
# feeds = [
# ('Front Page', 'http://www.guardian.co.uk/rss'),
# ('Business', 'http://www.guardian.co.uk/business/rss'),
# ('Sport', 'http://www.guardian.co.uk/sport/rss'),
# ('Culture', 'http://www.guardian.co.uk/culture/rss'),
# ('Money', 'http://www.guardian.co.uk/money/rss'),
# ('Life & Style', 'http://www.guardian.co.uk/lifeandstyle/rss'),
# ('Travel', 'http://www.guardian.co.uk/travel/rss'),
# ('Environment', 'http://www.guardian.co.uk/environment/rss'),
# ('Comment','http://www.guardian.co.uk/commentisfree/rss'),
# ]
# def get_article_url(self, article):
# url = article.get('guid', None)
# if '/video/' in url or '/flyer/' in url or '/quiz/' in url or \
# '/gallery/' in url or 'ivebeenthere' in url or \
# 'pickthescore' in url or 'audioslideshow' in url :
# url = None
# return url
def parse_index(self): def parse_index(self):
articles = []
soup = self.index_to_soup('http://www.guardian.co.uk/theguardian') soup = self.index_to_soup('http://www.guardian.co.uk/theguardian')
# find cover pic # find cover pic
img = soup.find( 'img',attrs ={'alt':'Guardian digital edition'}) img = soup.find( 'img',attrs ={'alt':'Guardian digital edition'})
if img is None: return None
if img is not None: else:
self.cover_url = img['src'] self.cover_url = img['src']
# end find cover pic # end find cover pic
sections = []
ans = []
for li in soup.findAll( 'li'): for li in soup.findAll( 'li'):
section = ''
articles = []
if li.a and li.a.has_key('href'): if li.a and li.a.has_key('href'):
url = li.a['href'] url = li.a['href']
if 'mainsection' in url: if 'mainsection' in url:
section = self.tag_to_string(url)
i = len(section)
index1 = section.rfind('/',0,i)
section = section[index1+1:i]
sections.append(section)
#find the articles in the Main Section #find the articles in the Main Section start
soup = self.index_to_soup(url) soup = self.index_to_soup(url)
date = strftime('%a, %d %b')
descl = []
for desclist in soup.findAll(name='div',attrs={'class':"trailtext"}):
descl.append(self.tag_to_string(desclist).strip())
t = -1
for tag in soup.findAll('h3'): for tag in soup.findAll('h3'):
t = t+1
for a in tag.findAll('a'): for a in tag.findAll('a'):
if t < len(descl):
desc = descl[t]
else:
desc = ''
if a and a.has_key('href'): if a and a.has_key('href'):
url2 = a['href'] url2 = a['href']
else: else:
url2 ='' url2 =''
title = self.tag_to_string(a) title = self.tag_to_string(a)
#eliminate duplicates
if len(articles) == 0: if len(articles) == 0: #First article
desc = 'Main Section'
date = ''
articles.append({ articles.append({
'title':title, 'title':title,
'date':date, 'date':date,
@ -111,29 +104,39 @@ class Guardian(BasicNewsRecipe):
'description':desc, 'description':desc,
}) })
else: else:
if len(articles) > 0: #eliminate duplicates start
if {'title':title,'date':date,'url':url2,'description':desc} in articles: if {'title':title,'date':date,'url':url2,'description':desc} in articles :
ulrl2 = '' url2 = ''
#eliminate duplicates #eliminate duplicates end
else:
if 'http://jobs.guardian.co.uk/' in url2:
url2 = ''
else: else:
desc = 'Main Section'
date = ''
articles.append({ articles.append({
'title':title, 'title':title,
'date':date, 'date':date,
'url':url2, 'url':url2,
'description':desc, 'description':desc,
}) })
#find the articles in the Main Section #find the articles in the Main Section end
ans.append( articles)
else: else:
url ='' url =''
titles = map(self.find_title, sections)
ans1 = list(zip(titles,ans))
return ans1[2:]
return [('Current Issue', articles)] def find_title(self, section):
d = {'topstories':'Top Stories', 'international':'International', 'editorialsandreply':'Editorials and Reply',
'commentanddebate':'Comment and Debate','uknews':'UK News','saturday':'Saturday','sunday':'Sunday',
'reviews':'Reviews', 'obituaries':'Obituaries'}
return d.get(section, section)
def preprocess_html(self, soup): def preprocess_html(self, soup):