Repository: https://github.com/kovidgoyal/calibre.git

commit 26d217611f (parent 192abdd179)

    New recipe for Slate by GRiker
@@ -52,7 +52,7 @@ recipe_modules = ['recipe_' + r for r in (
     'diagonales', 'miradasalsur', 'newsweek_argentina', 'veintitres',
     'gva_be', 'hln', 'tijd', 'degentenaar', 'inquirer_net', 'uncrate',
     'fastcompany', 'accountancyage', 'laprensa_hn', 'latribuna',
-    'eltiempo_hn',
+    'eltiempo_hn', 'slate',
     )]
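Note: each entry in recipe_modules is expanded to a module name by the list comprehension shown in the hunk header, so the new 'slate' entry must be matched by a module file named recipe_slate.py (the file added below). A minimal sketch of that mapping, using only names that appear in this hunk:

    # Sketch: how a registry entry maps to the module/file name
    entries = ('eltiempo_hn', 'slate')
    recipe_modules = ['recipe_' + r for r in entries]
    print recipe_modules   # ['recipe_eltiempo_hn', 'recipe_slate']
    # calibre imports each of these modules from src/calibre/web/feeds/recipes/,
    # so 'slate' resolves to the new recipe_slate.py added in this commit.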
src/calibre/web/feeds/recipes/recipe_slate.py (new file, 329 lines)
@@ -0,0 +1,329 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
Fetches the last 7 days of featured articles from slate.com
'''

import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Tag

class Slate(BasicNewsRecipe):
    # Method variables for customizing downloads
    title = 'Slate'
    description = 'A daily magazine on the Web, offering analysis and commentary about politics, news and culture.'
    __author__ = 'GRiker@hotmail.com'
    max_articles_per_feed = 40
    oldest_article = 7.0
    recursions = 0
    delay = 0
    simultaneous_downloads = 5
    timeout = 120.0
    timefmt = ''
    feeds = None
    no_stylesheets = True
    encoding = None

    # Method variables for customizing feed parsing
    summary_length = 250
    use_embedded_content = None

    # Method variables for pre/post processing of HTML
    remove_tags = [ dict(name=['link','style']),
                    dict(id=['toolbox','site_navigation','article_bottom_tools_cntr',
                             'article_bottom_tools','recommend_tab2','bottom_sponsored_links',
                             'fray_article_discussion','bizbox_sponsored_links_bottom',
                             'page_rightcol','top_banner','also_in_slate_bottom','articlefooter',
                             'article_top_wedge','content-top','page-title',
                             'block-today039s-business-press-archives','block-blog-roll',
                             'block-also-in-tbm','block-most-popular-on-tbm','block-the-best-of-tbm',
                             'service-links-bottom','comments','ft']),
                    dict(attrs={'class':['fray_article_links','clearing','nav',
                                         'service-links service-links-stack','yui-b last',
                                         'read-more-comments']})]
    extra_css = '.headline {text-align:left;}\n\
                 .byline {font:monospace; text-align:left; margin-bottom:0pt;}\n\
                 .dateline {text-align:left; height:0pt;}\n\
                 .source {align:left;}\n\
                 .credit {text-align:right;font-size:smaller;}\n'

    baseURL = 'http://slate.com'
    section_dates = []

    def tag_to_strings(self, tag):
        # Like tag_to_string(), but returns a list with one entry per child
        # of the tag instead of a single concatenated string
        if not tag:
            return ''
        if isinstance(tag, basestring):
            return tag
        strings = []
        for item in tag.contents:
            if isinstance(item, (NavigableString, CData)):
                strings.append(item.string)
            elif isinstance(item, Tag):
                res = self.tag_to_string(item)
                if res:
                    strings.append(res)
        return strings

    def extract_sections(self):
        soup = self.index_to_soup( self.baseURL )

        soup_top_stories = soup.find(True, attrs={'class':'tap2_topic entry-content'})
        soup = soup.find(True, attrs={'id':'toc_links_container'})

        todays_section = soup.find(True, attrs={'class':'todaydateline'})
        self.section_dates.append(self.tag_to_string(todays_section,use_alt=False))
        self.section_dates.append(self.tag_to_string(todays_section,use_alt=False))

        older_section_dates = soup.findAll(True, attrs={'class':'maindateline'})
        for older_section in older_section_dates :
            self.section_dates.append(self.tag_to_string(older_section,use_alt=False))

        headline_stories = soup_top_stories.find('ul')
        section_lists = soup.findAll('ul')
        # Prepend the headlines to the first section
        section_lists[0].insert(0,headline_stories)

        sections = []
        for section in section_lists :
            sections.append(section)

        return sections

    def extract_section_articles(self, sections_html) :
        soup = self.index_to_soup(str(sections_html))
        sections = soup.findAll('ul')
        articles = {}
        key = None
        ans = []

        for (i,section) in enumerate(sections) :

            # Get the section name
            if section.has_key('id') :
                key = self.section_dates[i]
                articles[key] = []
                ans.append(key)
            else :
                continue

            # Get the section article_list
            article_list = section.findAll('li')

            excludedDescriptionKeywords = ['Slate V','Twitter feed','podcast']
            excludedTitleKeywords = ['Gabfest','Slate V']
            excludedAuthorKeywords = ['Prudence']

            # Extract the article attributes
            for article in article_list :
                bylines = self.tag_to_strings(article)
                url = article.a['href']
                title = bylines[0]
                full_title = self.tag_to_string(article)

                author = None
                description = None
                pubdate = None

                if len(bylines) == 2 and self.tag_to_string(article).find("Today's Papers") > 0 :
                    description = "A summary of what's in the major U.S. newspapers."

                if len(bylines) == 3 :
                    author = bylines[2].strip()
                    author = re.sub('[\r][\n][\t][\t\t]','', author)
                    author = re.sub(',','', author)
                    if bylines[1] is not None :
                        description = bylines[1]
                        full_byline = self.tag_to_string(article)
                        if full_byline.find('major U.S. newspapers') > 0 :
                            description = "A summary of what's in the major U.S. newspapers."

                if len(bylines) > 3 and author is not None:
                    author += " | "
                    for (i,substring) in enumerate(bylines[3:]) :
                        #print "substring: %s" % substring.encode('cp1252')
                        author += substring.strip()
                        # Separate the remaining byline fragments, without a trailing ' | '
                        if i < len(bylines[3:]) - 1 :
                            author += " | "

                # Skip articles whose descriptions contain excluded keywords
                if description is not None :
                    excluded = re.compile('|'.join(excludedDescriptionKeywords))
                    found_excluded = excluded.search(description)
                    if found_excluded :
                        continue

                # Skip articles whose titles contain excluded keywords
                if full_title is not None :
                    excluded = re.compile('|'.join(excludedTitleKeywords))
                    #self.log("evaluating full_title: %s" % full_title)
                    found_excluded = excluded.search(full_title)
                    if found_excluded :
                        continue

                # Skip articles whose authors contain excluded keywords
                if author is not None :
                    excluded = re.compile('|'.join(excludedAuthorKeywords))
                    found_excluded = excluded.search(author)
                    if found_excluded :
                        continue

                skip_this_article = False
                # Check to make sure we're not adding a duplicate
                for article in articles[key] :
                    if article['url'] == url :
                        skip_this_article = True
                        break

                if skip_this_article :
                    continue

                # Build the dictionary entry for this article
                feed = key
                if not articles.has_key(feed) :
                    articles[feed] = []
                articles[feed].append(dict(title=title, url=url, date=pubdate, description=description,
                                           author=author, content=''))

            # Promote 'newspapers' to top
            for (i,article) in enumerate(articles[feed]) :
                if article['description'] is not None :
                    if article['description'].find('newspapers') > 0 :
                        articles[feed].insert(0,articles[feed].pop(i))

        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
        ans = self.remove_duplicates(ans)
        return ans

    def flatten_document(self, ans):
        flat_articles = []
        for (i,section) in enumerate(ans) :
            for article in section[1] :
                flat_articles.append(article)
        flat_section = ['All Articles', flat_articles]
        flat_ans = [flat_section]

        return flat_ans

    def remove_duplicates(self, ans):
        # Drop articles from later sections that already appeared in an earlier section
        for (i,section) in enumerate(ans) :
            for article in section[1] :
                for subsequent_section in ans[i+1:] :
                    # Iterate over a copy so removal doesn't skip entries
                    for subsequent_article in subsequent_section[1][:] :
                        if article['url'] == subsequent_article['url'] :
                            subsequent_section[1].remove(subsequent_article)
        return ans

    def print_version(self, url) :
        # Fetch the single-page version of each article
        return url + 'pagenum/all/'

    # Class methods
    def parse_index(self) :
        sections = self.extract_sections()
        section_list = self.extract_section_articles(sections)
        section_list = self.flatten_document(section_list)
        return section_list

    def postprocess_html(self, soup, first_fetch) :
        # Fix up dept_kicker as <h3><em>
        dept_kicker = soup.find(True, attrs={'class':'department_kicker'})
        if dept_kicker is not None :
            kicker_strings = self.tag_to_strings(dept_kicker)
            kicker = kicker_strings[2] + kicker_strings[3]
            # Strip periods (an unescaped '.' would match every character)
            kicker = re.sub('\.','',kicker)
            h3Tag = Tag(soup, "h3")
            emTag = Tag(soup, "em")
            h3Tag.insert(0, emTag)
            emTag.insert(0,kicker)
            dept_kicker.replaceWith(h3Tag)

        # Change <h1> to <h2>
        headline = soup.find("h1")
        if headline is not None :
            h2tag = Tag(soup, "h2")
            h2tag['class'] = "headline"
            strs = self.tag_to_strings(headline)
            result = ''
            for (i,substr) in enumerate(strs) :
                result += substr
                if i < len(strs) - 1 :
                    result += '<br />'
            h2tag.insert(0, result)
            headline.replaceWith(h2tag)

        # Fix up the concatenated byline and dateline
        byline = soup.find(True,attrs={'class':'byline'})
        if byline is not None :
            bylineTag = Tag(soup,'div')
            bylineTag['class'] = 'byline'
            bylineTag.insert(0,self.tag_to_string(byline))
            byline.replaceWith(bylineTag)

        dateline = soup.find(True, attrs={'class':'dateline'})
        if dateline is not None :
            datelineTag = Tag(soup, 'div')
            datelineTag['class'] = 'dateline'
            datelineTag.insert(0,self.tag_to_string(dateline))
            dateline.replaceWith(datelineTag)

        # Change captions to italic, add <hr>
        for caption in soup.findAll(True, {'class':'caption'}) :
            if caption is not None:
                emTag = Tag(soup, "em")
                emTag.insert(0, '<br />' + self.tag_to_string(caption))
                hrTag = Tag(soup, 'hr')
                emTag.insert(1, hrTag)
                caption.replaceWith(emTag)

        return soup

    def postprocess_book(self, oeb, opts, log) :

        def extract_byline(href) :
            soup = BeautifulSoup(str(oeb.manifest.hrefs[href]))
            byline = soup.find(True,attrs={'class':'byline'})
            if byline is not None:
                return self.tag_to_string(byline,use_alt=False)
            else :
                return None

        def extract_description(href) :
            soup = BeautifulSoup(str(oeb.manifest.hrefs[href]))
            paragraphs = soup.findAll('p')
            for p in paragraphs :
                if self.tag_to_string(p,use_alt=False).startswith('By ') or \
                   self.tag_to_string(p,use_alt=False).startswith('Posted '):
                    continue

                images = p.findAll(True, attrs={'class':'imagewrapper'})
                for image in images :
                    image.extract()
                return self.tag_to_string(p,use_alt=False)[:200] + '...'

            return None

        if oeb.toc.depth() == 2 :
            for article in oeb.toc :
                if article.author is None :
                    article.author = extract_byline(article.href)

                if article.description is None :
                    article.description = extract_description(article.href)

        elif oeb.toc.depth() == 3 :
            for section in oeb.toc :
                for article in section :
                    if article.author is None :
                        article.author = extract_byline(article.href)

                    if article.description is None :
                        article.description = extract_description(article.href)
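For readers unfamiliar with calibre recipes: parse_index() is expected to return a list of (section title, list of article dicts) pairs, and after flatten_document() this recipe always yields a single 'All Articles' section whose article dicts are built in extract_section_articles(). A sketch of that return shape, with invented placeholder values rather than real Slate data:

    # Shape of the data parse_index() returns (illustrative values only)
    example_index = [
        ('All Articles', [
            {'title': 'Example headline',
             'url': 'http://slate.com/id/0000000/',
             'date': None,
             'description': 'First part of the article teaser...',
             'author': 'A. Writer',
             'content': ''},
        ]),
    ]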