Update The Guardian and The Observer

Kovid Goyal 2015-11-26 17:03:01 +05:30
parent 3869c4764b
commit ad5825d995

@@ -6,10 +6,8 @@ __docformat__ = 'restructuredtext en'
 '''
 www.guardian.co.uk
 '''
-from calibre import strftime
 from calibre.web.feeds.news import BasicNewsRecipe
 from datetime import date
-from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag
 
 
 class Guardian(BasicNewsRecipe):
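
Note: from datetime import date survives the import cleanup because the
recipe picks its index page by weekday. A minimal sketch of that switch,
assuming the Sunday/Observer convention implied by the recipe's title and
the theobserver URL referenced further down (the Guardian weekday URL here
is an assumption; the real assignment lives outside this hunk):

    from datetime import date

    # Sunday (weekday() == 6) is The Observer's publication day;
    # every other day fetches The Guardian's index page.
    if date.today().weekday() == 6:
        base_url = 'http://www.guardian.co.uk/theobserver'   # from the old code below
    else:
        base_url = 'http://www.guardian.co.uk/theguardian'   # assumed counterpart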
@@ -23,15 +21,16 @@ class Guardian(BasicNewsRecipe):
     cover_pic = 'Guardian digital edition'
     masthead_url = 'http://static.guim.co.uk/static/f76b43f9dcfd761f0ecf7099a127b603b2922118/common/images/logos/the-guardian/titlepiece.gif'
 
-    __author__ = 'Seabound and Sujata Raman'
+    __author__ = 'Kovid Goyal'
     language = 'en_GB'
     oldest_article = 7
     max_articles_per_feed = 100
     remove_javascript = True
     encoding = 'utf-8'
-    compress_news_images = True
-    compress_news_images_auto_size = 8
+    remove_empty_feeds = True
+    no_stylesheets = True
+    remove_attributes = ['style']
 
     # List of section titles to ignore
     # For example: ['Sport']
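
Note: the new remove_attributes = ['style'] setting takes over what the old
recipe did by hand in preprocess_html (the style-stripping loop deleted in
the hunk below). A rough sketch of the equivalent cleanup, not calibre's
actual implementation:

    def strip_style_attrs(soup):
        # delete the inline style attribute from every tag that carries one,
        # mirroring the loop the old preprocess_html ran on each page
        for tag in soup.findAll(style=True):
            del tag['style']
        return soup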
@@ -40,129 +39,37 @@ class Guardian(BasicNewsRecipe):
     timefmt = ' [%a, %d %b %Y]'
 
     keep_only_tags = [
-        dict(name='div', attrs={'id':["content","article_header","main-article-info",]}),
+        dict(attrs={'class': lambda x: x and 'content__main-column' in x.split()}),
+        dict(attrs={'class':lambda x: x and set(x.split()).intersection({'content__head', 'content__main'})}),
     ]
     remove_tags = [
-        dict(name='div', attrs={'class':[
-            "video-content","videos-third-column", 'meta__extras', 'submeta-container submeta-container--break-at-leftcol ']}),
-        dict(name='div', attrs={'id':["article-toolbox","subscribe-feeds",]}),
-        dict(name='div', attrs={'class':["guardian-tickets promo-component",]}),
-        dict(name='ul', attrs={'class':["pagination"]}),
-        dict(name='ul', attrs={'id':["content-actions"]}),
-        # article history link
-        dict(name='a', attrs={'class':["rollover history-link"]}),
-        # "a version of this article ..." speil
-        dict(name='div' , attrs={'class' : ['section']}),
-        # "about this article" js dialog
-        dict(name='div', attrs={'class':["share-top",]}),
-        # author picture
-        dict(name='img', attrs={'class':["contributor-pic-small"]}),
-        # embedded videos/captions
-        dict(name='span',attrs={'class' : ['inline embed embed-media']}),
+        dict(attrs={'class': lambda x:x and '--twitter' in x}),
+        dict(attrs={'data-component': ['share', 'social']}),
+        dict(attrs={'data-link-name': 'block share'}),
+        dict(attrs={'class': lambda x:x and 'inline-expand-image' in x}),
+        dict(attrs={'class': lambda x:x and 'modern-visible' in x.split()}),
+    ]
+    remove_tags_after = [
+        dict(attrs={'class': lambda x: x and 'content__article-body' in x.split()}),
     ]
-    use_embedded_content = False
-    no_stylesheets = True
-    extra_css = '''
-        .article-attributes{font-size: x-small; font-family:Arial,Helvetica,sans-serif;}
-        .h1{font-size: large ;font-family:georgia,serif; font-weight:bold;}
-        .stand-first-alone{color:#666666; font-size:small; font-family:Arial,Helvetica,sans-serif;}
-        .caption{color:#666666; font-size:x-small; font-family:Arial,Helvetica,sans-serif;}
-        #article-wrapper{font-size:small; font-family:Arial,Helvetica,sans-serif;font-weight:normal;}
-        .main-article-info{font-family:Arial,Helvetica,sans-serif;}
-        #full-contents{font-size:small; font-family:Arial,Helvetica,sans-serif;font-weight:normal;}
-        #match-stats-summary{font-size:small; font-family:Arial,Helvetica,sans-serif;font-weight:normal;}
-    '''
-
-    def get_article_url(self, article):
-        url = article.get('guid', None)
-        if '/video/' in url or '/flyer/' in url or '/quiz/' in url or \
-                '/gallery/' in url or 'ivebeenthere' in url or \
-                'pickthescore' in url or 'audioslideshow' in url :
-            url = None
-        return url
-
-    def populate_article_metadata(self, article, soup, first):
-        if first and hasattr(self, 'add_toc_thumbnail'):
-            picdiv = soup.find('img')
-            if picdiv is not None:
-                self.add_toc_thumbnail(article,picdiv['src'])
 
     def preprocess_html(self, soup):
-        # multiple html sections in soup, useful stuff in the first
-        html = soup.find('html')
-        soup2 = BeautifulSoup()
-        soup2.insert(0,html)
-        soup = soup2
-        for item in soup.findAll(style=True):
-            del item['style']
-        for item in soup.findAll(face=True):
-            del item['face']
-        for tag in soup.findAll(name=['ul','li']):
-            tag.name = 'div'
-        # removes number next to rating stars
-        items_to_remove = []
-        rating_container = soup.find('div', attrs={'class': ['rating-container']})
-        if rating_container:
-            for item in rating_container:
-                if isinstance(item, Tag) and str(item.name) == 'span':
-                    items_to_remove.append(item)
-        for item in items_to_remove:
-            item.extract()
+        for img in soup.findAll('img', srcset=True):
+            img['src'] = img['srcset'].partition(' ')[0]
+            img['srcset'] = ''
         return soup
 
-    def find_sections(self):
-        # soup = self.index_to_soup("http://www.guardian.co.uk/theobserver")
-        soup = self.index_to_soup(self.base_url)
-        # find cover pic
-        img = soup.find('img', attrs={'alt':self.cover_pic})
-        if img is not None:
-            self.cover_url = img['src']
-        # end find cover pic
-        idx = soup.find('div', id='book-index')
-        for s in idx.findAll('strong', attrs={'class':'book'}):
-            a = s.find('a', href=True)
-            section_title = self.tag_to_string(a)
-            if section_title not in self.ignore_sections:
-                prefix = ''
-                if section_title != 'Main section':
-                    prefix = section_title + ': '
-                for subsection in s.parent.findAll('a', attrs={'class':'book-section'}):
-                    yield (prefix + self.tag_to_string(subsection), subsection['href'])
-
-    def find_articles(self, url):
-        soup = self.index_to_soup(url)
-        div = soup.find('div', attrs={'class':'book-index'})
-        for ul in div.findAll('ul', attrs={'class':'trailblock'}):
-            for li in ul.findAll('li'):
-                a = li.find(href=True)
-                if not a:
-                    continue
-                title = self.tag_to_string(a)
-                url = a['href']
-                if not title or not url:
-                    continue
-                tt = li.find('div', attrs={'class':'trailtext'})
-                if tt is not None:
-                    for da in tt.findAll('a'):
-                        da.extract()
-                desc = self.tag_to_string(tt).strip()
-                yield {
-                    'title': title, 'url':url, 'description':desc,
-                    'date' : strftime('%a, %d %b'),
-                }
-
     def parse_index(self):
         feeds = []
-        for title, href in self.find_sections():
-            feeds.append((title, list(self.find_articles(href))))
+        soup = self.index_to_soup(self.base_url)
+        for section in soup.findAll('section'):
+            title = self.tag_to_string(section.find(attrs={'class':'fc-container__header__title'})).strip().capitalize()
+            self.log('\nFound section:', title)
+            feeds.append((title, []))
+            for li in section.findAll('li'):
+                for a in li.findAll('a', attrs={'data-link-name':'article'}, href=True):
+                    title = self.tag_to_string(a).strip()
+                    url = a['href']
+                    self.log(' ', title, url)
+                    feeds[-1][1].append({'title':title, 'url':url})
+                    break
         return feeds
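
Note: the rewritten preprocess_html keeps only the first candidate URL from
each responsive srcset, since e-book conversion wants a single src per image.
A standalone illustration of the partition idiom (the sample srcset value is
invented):

    # A srcset lists "URL width" candidates separated by commas.
    srcset = 'https://i.guim.co.uk/img/a.jpg 300w, https://i.guim.co.uk/img/b.jpg 620w'

    # str.partition(' ') splits at the first space, so index 0 is the first
    # URL; the recipe copies it into src and blanks srcset.
    src = srcset.partition(' ')[0]
    assert src == 'https://i.guim.co.uk/img/a.jpg'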