From 26d217611f838b7f312bf393aea45415993d2a7e Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Tue, 21 Jul 2009 19:08:29 -0600
Subject: [PATCH] New recipe for Slate by GRiker

---
 src/calibre/web/feeds/recipes/__init__.py     |   2 +-
 src/calibre/web/feeds/recipes/recipe_slate.py | 329 ++++++++++++++++++
 2 files changed, 330 insertions(+), 1 deletion(-)
 create mode 100644 src/calibre/web/feeds/recipes/recipe_slate.py

diff --git a/src/calibre/web/feeds/recipes/__init__.py b/src/calibre/web/feeds/recipes/__init__.py
index cf0e16ecf0..acb1d967b0 100644
--- a/src/calibre/web/feeds/recipes/__init__.py
+++ b/src/calibre/web/feeds/recipes/__init__.py
@@ -52,7 +52,7 @@ recipe_modules = ['recipe_' + r for r in (
                     'diagonales', 'miradasalsur', 'newsweek_argentina', 'veintitres',
                     'gva_be', 'hln', 'tijd', 'degentenaar', 'inquirer_net', 'uncrate',
                     'fastcompany', 'accountancyage', 'laprensa_hn', 'latribuna',
-                    'eltiempo_hn',
+                    'eltiempo_hn', 'slate',
                   )]
diff --git a/src/calibre/web/feeds/recipes/recipe_slate.py b/src/calibre/web/feeds/recipes/recipe_slate.py
new file mode 100644
index 0000000000..570cf54542
--- /dev/null
+++ b/src/calibre/web/feeds/recipes/recipe_slate.py
@@ -0,0 +1,329 @@
+#!/usr/bin/env python
+
+__license__ = 'GPL v3'
+__copyright__ = '2008, Kovid Goyal '
+'''
+Fetches the last 7 days of featured articles from slate.com
+'''
+
+import re
+from calibre.web.feeds.recipes import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Tag
+
+class Slate(BasicNewsRecipe):
+    # Method variables for customizing downloads
+    title = 'Slate'
+    description = 'A daily magazine on the Web, offering analysis and commentary about politics, news and culture.'
+    __author__ = 'GRiker@hotmail.com'
+    max_articles_per_feed = 40
+    oldest_article = 7.0
+    recursions = 0
+    delay = 0
+    simultaneous_downloads = 5
+    timeout = 120.0
+    timefmt = ''
+    feeds = None
+    no_stylesheets = True
+    encoding = None
+
+    # Method variables for customizing feed parsing
+    summary_length = 250
+    use_embedded_content = None
+
+    # Method variables for pre/post processing of HTML
+    remove_tags = [ dict(name=['link','style']),
+                    dict(id=['toolbox','site_navigation','article_bottom_tools_cntr',
+                             'article_bottom_tools','recommend_tab2','bottom_sponsored_links',
+                             'fray_article_discussion','bizbox_sponsored_links_bottom',
+                             'page_rightcol','top_banner','also_in_slate_bottom','articlefooter',
+                             'article_top_wedge','content-top','page-title',
+                             'block-today039s-business-press-archives','block-blog-roll',
+                             'block-also-in-tbm','block-most-popular-on-tbm','block-the-best-of-tbm',
+                             'service-links-bottom','comments','ft']),
+                    dict(attrs={'class':['fray_article_links','clearing','nav',
+                                         'service-links service-links-stack','yui-b last',
+                                         'read-more-comments']})]
+    extra_css = '.headline {text-align:left;}\n\
+                 .byline {font:monospace; text-align:left; margin-bottom:0pt;}\n\
+                 .dateline {text-align:left; height:0pt;}\n\
+                 .source {align:left;}\n\
+                 .credit {text-align:right;font-size:smaller;}\n'
+
+    baseURL = 'http://slate.com'
+    section_dates = []
+
+    def tag_to_strings(self, tag):
+        if not tag:
+            return ''
+        if isinstance(tag, basestring):
+            return tag
+        strings = []
+        for item in tag.contents:
+            if isinstance(item, (NavigableString, CData)):
+                strings.append(item.string)
+            elif isinstance(item, Tag):
+                res = self.tag_to_string(item)
+                if res:
+                    strings.append(res)
+        return strings
+
+    def extract_sections(self):
+        soup = self.index_to_soup( self.baseURL )
+
+        soup_top_stories = soup.find(True,
+            attrs={'class':'tap2_topic entry-content'})
+        soup = soup.find(True, attrs={'id':'toc_links_container'})
+
+        todays_section = soup.find(True, attrs={'class':'todaydateline'})
+        self.section_dates.append(self.tag_to_string(todays_section,use_alt=False))
+        self.section_dates.append(self.tag_to_string(todays_section,use_alt=False))
+
+        older_section_dates = soup.findAll(True, attrs={'class':'maindateline'})
+        for older_section in older_section_dates :
+            self.section_dates.append(self.tag_to_string(older_section,use_alt=False))
+
+        headline_stories = soup_top_stories.find('ul')
+        section_lists = soup.findAll('ul')
+        # Prepend the headlines to the first section
+        section_lists[0].insert(0,headline_stories)
+
+        sections = []
+        for section in section_lists :
+            sections.append(section)
+
+        return sections
+
+
+    def extract_section_articles(self, sections_html) :
+        soup = self.index_to_soup(str(sections_html))
+        sections = soup.findAll('ul')
+        articles = {}
+        key = None
+        ans = []
+
+        for (i,section) in enumerate(sections) :
+
+            # Get the section name
+            if section.has_key('id') :
+                key = self.section_dates[i]
+                articles[key] = []
+                ans.append(key)
+            else :
+                continue
+
+            # Get the section article_list
+            article_list = section.findAll('li')
+
+            excludedDescriptionKeywords = ['Slate V','Twitter feed','podcast']
+            excludedTitleKeywords = ['Gabfest','Slate V']
+            excludedAuthorKeywords = ['Prudence']
+
+            # Extract the article attributes
+            for article in article_list :
+                bylines = self.tag_to_strings(article)
+                url = article.a['href']
+                title = bylines[0]
+                full_title = self.tag_to_string(article)
+
+                author = None
+                description = None
+                pubdate = None
+
+                if len(bylines) == 2 and self.tag_to_string(article).find("Today's Papers") > 0 :
+                    description = "A summary of what's in the major U.S. newspapers."
+
+                if len(bylines) == 3 :
+                    author = bylines[2].strip()
+                    author = re.sub('[\r][\n][\t][\t\t]','', author)
+                    author = re.sub(',','', author)
+                    if bylines[1] is not None :
+                        description = bylines[1]
+                        full_byline = self.tag_to_string(article)
+                        if full_byline.find('major U.S. newspapers') > 0 :
+                            description = "A summary of what's in the major U.S. newspapers."
+
+
+                if len(bylines) > 3 and author is not None:
+                    author += " | "
+                    for (i,substring) in enumerate(bylines[3:]) :
+                        #print "substring: %s" % substring.encode('cp1252')
+                        author += substring.strip()
+                        if i < len(bylines[3:]) :
+                            author += " | "
+
+                # Skip articles whose descriptions contain excluded keywords
+                if description is not None :
+                    excluded = re.compile('|'.join(excludedDescriptionKeywords))
+                    found_excluded = excluded.search(description)
+                    if found_excluded :
+                        continue
+
+                # Skip articles whose title contains excluded keywords
+                if full_title is not None :
+                    excluded = re.compile('|'.join(excludedTitleKeywords))
+                    #self.log("evaluating full_title: %s" % full_title)
+                    found_excluded = excluded.search(full_title)
+                    if found_excluded :
+                        continue
+
+                # Skip articles whose author contains excluded keywords
+                if author is not None :
+                    excluded = re.compile('|'.join(excludedAuthorKeywords))
+                    found_excluded = excluded.search(author)
+                    if found_excluded :
+                        continue
+
+                skip_this_article = False
+                # Check to make sure we're not adding a duplicate
+                for article in articles[key] :
+                    if article['url'] == url :
+                        skip_this_article = True
+                        break
+
+                if skip_this_article :
+                    continue
+
+                # Build the dictionary entry for this article
+                feed = key
+                if not articles.has_key(feed) :
+                    articles[feed] = []
+                articles[feed].append(dict(title=title, url=url, date=pubdate, description=description,
+                                           author=author, content=''))
+                # Promote 'newspapers' to top
+                for (i,article) in enumerate(articles[feed]) :
+                    if article['description'] is not None :
+                        if article['description'].find('newspapers') > 0 :
+                            articles[feed].insert(0,articles[feed].pop(i))
+
+        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
+        ans = self.remove_duplicates(ans)
+        return ans
+
+    def flatten_document(self, ans):
+        flat_articles = []
+        for (i,section) in enumerate(ans) :
+            for article in section[1] :
+                flat_articles.append(article)
+        flat_section = ['All Articles', flat_articles]
+        flat_ans = [flat_section]
+
+        return flat_ans
+
+    def remove_duplicates(self, ans):
+        for (i,section) in enumerate(ans) :
+            for article in section[1] :
+                for (j,subsequent_section) in enumerate(ans[i+1:]) :
+                    for (k,subsequent_article) in enumerate(subsequent_section[1]) :
+                        if article['url'] == subsequent_article['url'] :
+                            del subsequent_section[1][k]
+        return ans
+
+    def print_version(self, url) :
+        return url + 'pagenum/all/'
+
+    # Class methods
+    def parse_index(self) :
+        sections = self.extract_sections()
+        section_list = self.extract_section_articles(sections)
+        section_list = self.flatten_document(section_list)
+        return section_list
+
+
+    def postprocess_html(self, soup, first_fetch) :
+        # Fix up dept_kicker as <h3><em>
+        dept_kicker = soup.find(True, attrs={'class':'department_kicker'})
+        if dept_kicker is not None :
+            kicker_strings = self.tag_to_strings(dept_kicker)
+            kicker = kicker_strings[2] + kicker_strings[3]
+            kicker = re.sub('\.','',kicker)
+            h3Tag = Tag(soup, "h3")
+            emTag = Tag(soup, "em")
+            h3Tag.insert(0, emTag)
+            emTag.insert(0,kicker)
+            dept_kicker.replaceWith(h3Tag)
+
+        # Change <h1> to <h2 class="headline">
+        headline = soup.find("h1")
+        if headline is not None :
+            h2tag = Tag(soup, "h2")
+            h2tag['class'] = "headline"
+            strs = self.tag_to_strings(headline)
+            result = ''
+            for (i,substr) in enumerate(strs) :
+                result += substr
+                if i < len(strs) -1 :
+                    result += '<br />'
+            h2tag.insert(0, result)
+            headline.replaceWith(h2tag)
+
+        # Fix up the concatenated byline and dateline
+        byline = soup.find(True,attrs={'class':'byline'})
+        if byline is not None :
+            bylineTag = Tag(soup,'div')
+            bylineTag['class'] = 'byline'
+            bylineTag.insert(0,self.tag_to_string(byline))
+            byline.replaceWith(bylineTag)
+
+        dateline = soup.find(True, attrs={'class':'dateline'})
+        if dateline is not None :
+            datelineTag = Tag(soup, 'div')
+            datelineTag['class'] = 'dateline'
+            datelineTag.insert(0,self.tag_to_string(dateline))
+            dateline.replaceWith(datelineTag)
+
+        # Change captions to italic, add <hr>
+        for caption in soup.findAll(True, {'class':'caption'}) :
+            if caption is not None:
+                emTag = Tag(soup, "em")
+                emTag.insert(0, '<br />' + self.tag_to_string(caption))
+                hrTag = Tag(soup, 'hr')
+                emTag.insert(1, hrTag)
+                caption.replaceWith(emTag)
+
+        return soup
+
+    def postprocess_book(self, oeb, opts, log) :
+
+        def extract_byline(href) :
+            soup = BeautifulSoup(str(oeb.manifest.hrefs[href]))
+            byline = soup.find(True,attrs={'class':'byline'})
+            if byline is not None:
+                return self.tag_to_string(byline,use_alt=False)
+            else :
+                return None
+
+        def extract_description(href) :
+            soup = BeautifulSoup(str(oeb.manifest.hrefs[href]))
+            paragraphs = soup.findAll('p')
+            for p in paragraphs :
+                if self.tag_to_string(p,use_alt=False).startswith('By ') or \
+                   self.tag_to_string(p,use_alt=False).startswith('Posted '):
+                    continue
+
+                images = p.findAll(True, attrs={'class':'imagewrapper'})
+                for image in images :
+                    image.extract()
+                return self.tag_to_string(p,use_alt=False)[:200] + '...'
+
+            return None
+
+        if oeb.toc.depth() == 2 :
+            for article in oeb.toc :
+                if article.author is None :
+                    article.author = extract_byline(article.href)
+
+                if article.description is None :
+                    article.description = extract_description(article.href)
+
+        elif oeb.toc.depth() == 3 :
+            for section in oeb.toc :
+                for article in section :
+                    if article.author is None :
+                        article.author = extract_byline(article.href)
+
+                    if article.description is None :
+                        article.description = extract_description(article.href)
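
One quick way to exercise the new recipe once the patch is applied, a sketch assuming a working
calibre development install whose ebook-convert accepts built-in recipe titles; the --test switch,
where supported, limits the fetch to a couple of articles per feed so the run stays short:

    ebook-convert "Slate.recipe" slate.epub --test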