This commit is contained in:
Kovid Goyal 2011-11-25 19:21:33 +05:30
parent fc0a33c7ee
commit 7316bcad84

View File

@ -9,6 +9,7 @@ www.guardian.co.uk
from calibre import strftime from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from datetime import date from datetime import date
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
class Guardian(BasicNewsRecipe): class Guardian(BasicNewsRecipe):
@ -16,16 +17,19 @@ class Guardian(BasicNewsRecipe):
if date.today().weekday() == 6: if date.today().weekday() == 6:
base_url = "http://www.guardian.co.uk/theobserver" base_url = "http://www.guardian.co.uk/theobserver"
cover_pic = 'Observer digital edition' cover_pic = 'Observer digital edition'
masthead_url = 'http://static.guim.co.uk/sys-images/Guardian/Pix/site_furniture/2010/10/19/1287478087992/The-Observer-001.gif'
else: else:
base_url = "http://www.guardian.co.uk/theguardian" base_url = "http://www.guardian.co.uk/theguardian"
cover_pic = 'Guardian digital edition' cover_pic = 'Guardian digital edition'
masthead_url = 'http://static.guim.co.uk/static/f76b43f9dcfd761f0ecf7099a127b603b2922118/common/images/logos/the-guardian/titlepiece.gif'
__author__ = 'Seabound and Sujata Raman' __author__ = 'Seabound and Sujata Raman'
language = 'en_GB' language = 'en_GB'
oldest_article = 7 oldest_article = 7
max_articles_per_feed = 100 max_articles_per_feed = 100
remove_javascript = True remove_javascript = True
encoding = 'utf-8'
# List of section titles to ignore # List of section titles to ignore
# For example: ['Sport'] # For example: ['Sport']
@ -41,6 +45,16 @@ class Guardian(BasicNewsRecipe):
dict(name='div', attrs={'class':["guardian-tickets promo-component",]}), dict(name='div', attrs={'class':["guardian-tickets promo-component",]}),
dict(name='ul', attrs={'class':["pagination"]}), dict(name='ul', attrs={'class':["pagination"]}),
dict(name='ul', attrs={'id':["content-actions"]}), dict(name='ul', attrs={'id':["content-actions"]}),
# article history link
dict(name='a', attrs={'class':["rollover history-link"]}),
# "a version of this article ..." speil
dict(name='div' , attrs = { 'class' : ['section']}),
# "about this article" js dialog
dict(name='div', attrs={'class':["share-top",]}),
# author picture
dict(name='img', attrs={'class':["contributor-pic-small"]}),
# embedded videos/captions
dict(name='span',attrs={'class' : ['inline embed embed-media']}),
#dict(name='img'), #dict(name='img'),
] ]
use_embedded_content = False use_embedded_content = False
@ -67,6 +81,13 @@ class Guardian(BasicNewsRecipe):
def preprocess_html(self, soup): def preprocess_html(self, soup):
# multiple html sections in soup, useful stuff in the first
html = soup.find('html')
soup2 = BeautifulSoup()
soup2.insert(0,html)
soup = soup2
for item in soup.findAll(style=True): for item in soup.findAll(style=True):
del item['style'] del item['style']
@ -75,6 +96,17 @@ class Guardian(BasicNewsRecipe):
for tag in soup.findAll(name=['ul','li']): for tag in soup.findAll(name=['ul','li']):
tag.name = 'div' tag.name = 'div'
# removes number next to rating stars
items_to_remove = []
rating_container = soup.find('div', attrs = {'class': ['rating-container']})
if rating_container:
for item in rating_container:
if isinstance(item, Tag) and str(item.name) == 'span':
items_to_remove.append(item)
for item in items_to_remove:
item.extract()
return soup return soup
def find_sections(self): def find_sections(self):