Update The Sun

This commit is contained in:
Kovid Goyal 2013-05-04 14:16:17 +05:30
parent 22f95c8678
commit 4037971bde

View File

@ -1,4 +1,4 @@
import re, random import random
from calibre import browser from calibre import browser
from calibre.web.feeds.recipes import BasicNewsRecipe from calibre.web.feeds.recipes import BasicNewsRecipe
@ -8,7 +8,7 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe):
title = u'The Sun UK' title = u'The Sun UK'
description = 'Articles from The Sun tabloid UK' description = 'Articles from The Sun tabloid UK'
__author__ = 'Dave Asbury' __author__ = 'Dave Asbury'
# last updated 19/10/12 better cover fetch # last updated 5/5/13 better cover fetch
language = 'en_GB' language = 'en_GB'
oldest_article = 1 oldest_article = 1
max_articles_per_feed = 15 max_articles_per_feed = 15
@ -29,10 +29,6 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe):
dict(name='div',attrs={'class' : 'intro'}), dict(name='div',attrs={'class' : 'intro'}),
dict(name='h3'), dict(name='h3'),
dict(name='div',attrs={'id' : 'articlebody'}), dict(name='div',attrs={'id' : 'articlebody'}),
#dict(attrs={'class' : ['right_col_branding','related-stories','mystery-meat-link','ltbx-container','ltbx-var ltbx-hbxpn','ltbx-var ltbx-nav-loop','ltbx-var ltbx-url']}),
# dict(name='div',attrs={'class' : 'cf'}),
# dict(attrs={'title' : 'download flash'}),
# dict(attrs={'style' : 'padding: 5px'})
] ]
remove_tags_after = [dict(id='bodyText')] remove_tags_after = [dict(id='bodyText')]
@ -52,7 +48,6 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe):
feeds = BasicNewsRecipe.parse_feeds(self) feeds = BasicNewsRecipe.parse_feeds(self)
for feed in feeds: for feed in feeds:
for article in feed.articles[:]: for article in feed.articles[:]:
print 'article.title is: ', article.title
if 'Try out The Sun' in article.title.upper() or 'Try-out-The-Suns' in article.url: if 'Try out The Sun' in article.title.upper() or 'Try-out-The-Suns' in article.url:
feed.articles.remove(article) feed.articles.remove(article)
if 'Web porn harms kids' in article.title.upper() or 'Sun-says-Web-porn' in article.url: if 'Web porn harms kids' in article.title.upper() or 'Sun-says-Web-porn' in article.url:
@ -60,27 +55,12 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe):
return feeds return feeds
def get_cover_url(self): def get_cover_url(self):
soup = self.index_to_soup('http://www.politicshome.com/uk/latest_frontpage.html')
# look for the block containing the sun button and url
cov = soup.find(attrs={'style' : 'background-image: url(http://www.politicshome.com/images/sources/source_frontpage_button_84.gif);'})
#cov = soup.find(attrs={'id' : 'large'})
cov2 = str(cov)
cov2='http://www.politicshome.com'+cov2[9:-133]
#cov2 now contains url of the page containing pic
#cov2 now contains url of the page containing pic
soup = self.index_to_soup(cov2)
cov = soup.find(attrs={'id' : 'large'})
cov=str(cov)
cov2 = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', cov)
cov2 = str(cov2)
cov2=cov2[2:len(cov2)-2]
br = browser() br = browser()
br.set_handle_redirect(False) br.set_handle_redirect(False)
cover_url = 'http://www.thepaperboy.com/frontpages/current/The_Sun_newspaper_front_page.jpg'
try: try:
br.open_novisit(cov2) br.open_novisit('http://www.thepaperboy.com/frontpages/current/The_Sun_newspaper_front_page.jpg')
cover_url = cov2
except: except:
cover_url = random.choice([ cover_url = random.choice([
'http://img.thesun.co.uk/multimedia/archive/00905/errorpage6_677961a_905507a.jpg' 'http://img.thesun.co.uk/multimedia/archive/00905/errorpage6_677961a_905507a.jpg'