mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-07-09 03:04:10 -04:00)

Improved recipe for Slate

parent e77eafa751
commit aef7433160
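
At a glance, the change replaces the old single extract_sections pass with two extractors selected by a new slate_complete switch, and swaps the get_browser override for a masthead fallback. A condensed sketch of how those new pieces fit together, with the import, class context and indentation assumed (the full method bodies are in the diff below):

    from calibre.web.feeds.recipes import BasicNewsRecipe

    class Slate(BasicNewsRecipe):
        # Toggle between the complete edition and the smaller weekly, date-based edition
        slate_complete = True
        if slate_complete:
            title = 'Slate (complete)'
        else:
            title = 'Slate (weekly)'

        def parse_index(self):
            # Pick the section extractor that matches the toggle, then gather the articles
            if self.slate_complete:
                sections = self.extract_named_sections()
            else:
                sections = self.extract_dated_sections()
            return self.extract_section_articles(sections)

        def get_masthead_url(self):
            # Use the Slate logo if it can be fetched, otherwise fall back to no masthead
            masthead = 'http://img.slate.com/images/redesign2008/slate_logo.gif'
            br = BasicNewsRecipe.get_browser()
            try:
                br.open(masthead)
            except:
                self.log("\nMasthead unavailable")
                masthead = None
            return masthead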
@@ -1,7 +1,8 @@
 #!/usr/bin/env python
+__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'

 __license__ = 'GPL v3'
-__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
 '''
 calibre recipe for slate.com
 '''
@@ -10,13 +11,12 @@ import re
 from calibre.web.feeds.recipes import BasicNewsRecipe
 from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Comment, Tag

-class PeriodicalNameHere(BasicNewsRecipe):
+class Slate(BasicNewsRecipe):
 # Method variables for customizing downloads
-title = 'Slate'
 description = 'A general-interest publication offering analysis and commentary about politics, news and culture.'
-__author__ = 'GRiker and Sujata Raman'
-max_articles_per_feed = 20
-oldest_article = 7.0
+__author__ = 'GRiker, Sujata Raman and Nick Redding'
+max_articles_per_feed = 100
+oldest_article = 14
 recursions = 0
 delay = 0
 simultaneous_downloads = 5
@@ -27,6 +27,12 @@ class PeriodicalNameHere(BasicNewsRecipe):
 encoding = None
 language = 'en'

+slate_complete = True
+if slate_complete:
+title = 'Slate (complete)'
+else:
+title = 'Slate (weekly)'
+
 # Method variables for customizing feed parsing
 summary_length = 250
 use_embedded_content = None
@@ -42,26 +48,15 @@ class PeriodicalNameHere(BasicNewsRecipe):
 match_regexps = []

 # The second entry is for 'Big Money', which comes from a different site, uses different markup
-keep_only_tags = [dict(attrs={ 'id':['article_top', 'article_body', 'story']}),
+keep_only_tags = [dict(attrs={ 'id':['article_top', 'article_body']}),
 dict(attrs={ 'id':['content']}) ]

 # The second entry is for 'Big Money', which comes from a different site, uses different markup
-remove_tags = [dict(attrs={ 'id':[
-'add_comments_button',
-'article_bottom_tools',
-'article_bottom_tools_cntr',
-'bizbox_links_bottom',
-'BOXXLE',
-'comments_button',
-'comments-to-fray',
-'fbog_article_bottom_cntr',
-'fray_article_discussion', 'fray_article_links','bottom_sponsored_links','author_bio',
-'insider_ad_wrapper',
-'js_kit_cntr',
-'recommend_tab',
-'ris_links_wrapper',
-'toolbox',
-]}),
+remove_tags = [dict(attrs={ 'id':['toolbox','recommend_tab','insider_ad_wrapper',
+'article_bottom_tools_cntr','fray_article_discussion','fray_article_links','bottom_sponsored_links','author_bio',
+'bizbox_links_bottom','ris_links_wrapper','BOXXLE',
+'comments_button','add_comments_button','comments-to-fray','marriott_ad',
+'article_bottom_tools','recommend_tab2','fbog_article_bottom_cntr']}),
 dict(attrs={ 'id':['content-top','service-links-bottom','hed']}) ]

 excludedDescriptionKeywords = ['Slate V','Twitter feed','podcast']
@@ -72,16 +67,15 @@ class PeriodicalNameHere(BasicNewsRecipe):
 extra_css = '''
 .h1_subhead{font-family:Arial; font-size:small; }
 h1{font-family:Verdana; font-size:large; }
-.byline {font-family:Georgia; margin-bottom: 0px; color: #660033;}
-.dateline {font-family:Arial; font-size: smaller; height: 0pt; color:#666666;}
+.byline {font-family:Georgia; margin-bottom: 0px; }
+.dateline {font-family:Arial; font-size: smaller; height: 0pt;}
 .imagewrapper {font-family:Verdana;font-size:x-small; }
 .source {font-family:Verdana; font-size:x-small;}
 .credit {font-family:Verdana; font-size: smaller;}
 #article_body {font-family:Verdana; }
 #content {font-family:Arial; }
 .caption{font-family:Verdana;font-style:italic; font-size:x-small;}
-h3{font-family:Arial; color:#666666; font-size:small}
-a{color:#0066CC;}
+h3{font-family:Arial; font-size:small}
 '''

 # Local variables to extend class
@@ -99,32 +93,59 @@ class PeriodicalNameHere(BasicNewsRecipe):
 if isinstance(item, (NavigableString, CData)):
 strings.append(item.string)
 elif isinstance(item, Tag):
-res = self.tag_to_string(item)
+res = self.tag_to_string(item,use_alt=False)
 if res:
 strings.append(res)
 return strings

-def extract_sections(self):
+def extract_named_sections(self):
 soup = self.index_to_soup( self.baseURL )
-soup_top_stories = soup.find(True, attrs={'class':'tap2_topic entry-content'})
+soup_nav_bar = soup.find(True, attrs={'id':'nav'})
+briefing_nav = soup.find('li')
+briefing_url = briefing_nav.a['href']
+for section_nav in soup_nav_bar.findAll('li'):
+section_name = self.tag_to_string(section_nav,use_alt=False)
+self.section_dates.append(section_name)
+
+soup = self.index_to_soup(briefing_url)
+
+self.log("Briefing url = %s " % briefing_url)
+section_lists = soup.findAll('ul','view_links_list')
+
+sections = []
+for section in section_lists :
+sections.append(section)
+return sections
+
+
+def extract_dated_sections(self):
+soup = self.index_to_soup( self.baseURL )
+soup_top_stories = soup.find(True, attrs={'id':'tap3_cntr'})
+if soup_top_stories:
+self.section_dates.append("Top Stories")
+self.log("SELECTION TOP STORIES %s" % "Top Stories")
+
 soup = soup.find(True, attrs={'id':'toc_links_container'})

 todays_section = soup.find(True, attrs={'class':'todaydateline'})
 self.section_dates.append(self.tag_to_string(todays_section,use_alt=False))
+self.log("SELECTION DATE %s" % self.tag_to_string(todays_section,use_alt=False))

 older_section_dates = soup.findAll(True, attrs={'class':'maindateline'})
 for older_section in older_section_dates :
 self.section_dates.append(self.tag_to_string(older_section,use_alt=False))
+self.log("SELECTION DATE %s" % self.tag_to_string(older_section,use_alt=False))

 if soup_top_stories:
-headline_stories = soup_top_stories.find('ul')
+headline_stories = soup_top_stories
+self.log("HAVE top_stories")
 else:
 headline_stories = None
+self.log("NO top_stories")
 section_lists = soup.findAll('ul')
 # Prepend the headlines to the first section
 if headline_stories:
-section_lists[0].insert(0,headline_stories)
+section_lists.insert(0,headline_stories)

 sections = []
 for section in section_lists :
@@ -133,9 +154,8 @@ class PeriodicalNameHere(BasicNewsRecipe):


 def extract_section_articles(self, sections_html) :
 # Find the containers with section content
-soup = self.index_to_soup(str(sections_html))
-sections = soup.findAll('ul')
+sections = sections_html

 articles = {}
 key = None
@@ -145,10 +165,25 @@ class PeriodicalNameHere(BasicNewsRecipe):

 # Get the section name
 if section.has_key('id') :
+self.log("PROCESSING SECTION id = %s" % section['id'])
 key = self.section_dates[i]
+if key.startswith("Pod"):
+continue
+if key.startswith("Blog"):
+continue
+articles[key] = []
+ans.append(key)
+elif self.slate_complete:
+key = self.section_dates[i]
+if key.startswith("Pod"):
+continue
+if key.startswith("Blog"):
+continue
+self.log("PROCESSING SECTION name = %s" % key)
 articles[key] = []
 ans.append(key)
 else :
+self.log("SECTION %d HAS NO id" % i);
 continue

 # Get the section article_list
@@ -159,8 +194,10 @@ class PeriodicalNameHere(BasicNewsRecipe):
 bylines = self.tag_to_strings(article)
 url = article.a['href']
 title = bylines[0]
-full_title = self.tag_to_string(article)
+full_title = self.tag_to_string(article,use_alt=False)
+#self.log("ARTICLE TITLE%s" % title)
+#self.log("ARTICLE FULL_TITLE%s" % full_title)
+#self.log("URL %s" % url)
 author = None
 description = None
 pubdate = None
@@ -191,7 +228,7 @@ class PeriodicalNameHere(BasicNewsRecipe):
 excluded = re.compile('|'.join(self.excludedDescriptionKeywords))
 found_excluded = excluded.search(description)
 if found_excluded :
-if self.verbose : self.log(" >>> skipping %s (description keyword exclusion: %s) <<<\n" % (title, found_excluded.group(0)))
+self.log(" >>> skipping %s (description keyword exclusion: %s) <<<\n" % (title, found_excluded.group(0)))
 continue

 # Skip articles whose title contain excluded keywords
@@ -200,7 +237,7 @@ class PeriodicalNameHere(BasicNewsRecipe):
 #self.log("evaluating full_title: %s" % full_title)
 found_excluded = excluded.search(full_title)
 if found_excluded :
-if self.verbose : self.log(" >>> skipping %s (title keyword exclusion: %s) <<<\n" % (title, found_excluded.group(0)))
+self.log(" >>> skipping %s (title keyword exclusion: %s) <<<\n" % (title, found_excluded.group(0)))
 continue

 # Skip articles whose author contain excluded keywords
@@ -208,7 +245,7 @@ class PeriodicalNameHere(BasicNewsRecipe):
 excluded = re.compile('|'.join(self.excludedAuthorKeywords))
 found_excluded = excluded.search(author)
 if found_excluded :
-if self.verbose : self.log(" >>> skipping %s (author keyword exclusion: %s) <<<\n" % (title, found_excluded.group(0)))
+self.log(" >>> skipping %s (author keyword exclusion: %s) <<<\n" % (title, found_excluded.group(0)))
 continue

 skip_this_article = False
@@ -216,6 +253,7 @@ class PeriodicalNameHere(BasicNewsRecipe):
 for article in articles[key] :
 if article['url'] == url :
 skip_this_article = True
+self.log("SKIPPING DUP %s" % url)
 break

 if skip_this_article :
@@ -227,6 +265,8 @@ class PeriodicalNameHere(BasicNewsRecipe):
 articles[feed] = []
 articles[feed].append(dict(title=title, url=url, date=pubdate, description=description,
 author=author, content=''))
+#self.log("KEY %s" % feed)
+#self.log("APPENDED %s" % url)
 # Promote 'newspapers' to top
 for (i,article) in enumerate(articles[feed]) :
 if article['description'] is not None :
@@ -235,32 +275,6 @@ class PeriodicalNameHere(BasicNewsRecipe):


 ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
-ans = self.remove_duplicates(ans)
-return ans
-
-def flatten_document(self, ans):
-flat_articles = []
-for (i,section) in enumerate(ans) :
-#self.log("flattening section %s: " % section[0])
-for article in section[1] :
-#self.log("moving %s to flat_articles[]" % article['title'])
-flat_articles.append(article)
-flat_section = ['All Articles', flat_articles]
-flat_ans = [flat_section]
-return flat_ans
-
-def remove_duplicates(self, ans):
-# Return a stripped ans
-for (i,section) in enumerate(ans) :
-#self.log("section %s: " % section[0])
-for article in section[1] :
-#self.log("\t%s" % article['title'])
-#self.log("\looking for %s" % article['url'])
-for (j,subsequent_section) in enumerate(ans[i+1:]) :
-for (k,subsequent_article) in enumerate(subsequent_section[1]) :
-if article['url'] == subsequent_article['url'] :
-#self.log( "removing %s (%s) from %s" % (subsequent_article['title'], subsequent_article['url'], subsequent_section[0]) )
-del subsequent_section[1][k]
 return ans

 def print_version(self, url) :
@@ -268,13 +282,22 @@ class PeriodicalNameHere(BasicNewsRecipe):

 # Class methods
 def parse_index(self) :
-sections = self.extract_sections()
+if self.slate_complete:
+sections = self.extract_named_sections()
+else:
+sections = self.extract_dated_sections()
 section_list = self.extract_section_articles(sections)
-section_list = self.flatten_document(section_list)
 return section_list

-def get_browser(self) :
-return BasicNewsRecipe.get_browser()
+def get_masthead_url(self):
+masthead = 'http://img.slate.com/images/redesign2008/slate_logo.gif'
+br = BasicNewsRecipe.get_browser()
+try:
+br.open(masthead)
+except:
+self.log("\nMasthead unavailable")
+masthead = None
+return masthead

 def stripAnchors(self,soup):
 body = soup.find('div',attrs={'id':['article_body','content']})
@@ -304,8 +327,8 @@ class PeriodicalNameHere(BasicNewsRecipe):
 excluded = re.compile('|'.join(self.excludedContentKeywords))
 found_excluded = excluded.search(str(soup))
 if found_excluded :
-print "no allowed content found, removing article"
-raise Exception('String error')
+print "No allowed content found, removing article"
+raise Exception('Rejected article')

 # Articles from www.thebigmoney.com use different tagging for byline, dateline and body
 head = soup.find('head')
@@ -338,7 +361,6 @@ class PeriodicalNameHere(BasicNewsRecipe):
 dept_kicker = soup.find('div', attrs={'class':'department_kicker'})
 if dept_kicker is not None :
 kicker_strings = self.tag_to_strings(dept_kicker)
-#kicker = kicker_strings[2] + kicker_strings[3]
 kicker = ''.join(kicker_strings[2:])
 kicker = re.sub('\.','',kicker)
 h3Tag = Tag(soup, "h3")
@@ -346,25 +368,11 @@ class PeriodicalNameHere(BasicNewsRecipe):
 emTag.insert(0,NavigableString(kicker))
 h3Tag.insert(0, emTag)
 dept_kicker.replaceWith(h3Tag)
+else:
+self.log("No kicker--return null")
+return None

-# Change <h1> to <h2>
-headline = soup.find("h1")
-#tag = headline.find("span")
-#tag.name = 'div'
-
-if headline is not None :
-h2tag = Tag(soup, "h2")
-h2tag['class'] = "headline"
-strs = self.tag_to_strings(headline)
-result = ''
-for (i,substr) in enumerate(strs) :
-result += substr
-if i < len(strs) -1 :
-result += '<br />'
-#h2tag.insert(0, result)
-#headline.replaceWith(h2tag)
-
-# Fix up the concatenated byline and dateline
+# Fix up the concatenated byline and dateline
 byline = soup.find(True,attrs={'class':'byline'})
 if byline is not None :
 bylineTag = Tag(soup,'div')