Repository: https://github.com/kovidgoyal/calibre.git

commit 26d217611f (parent 192abdd179)

    New recipe for Slate by GRiker
@@ -52,7 +52,7 @@ recipe_modules = ['recipe_' + r for r in (
     'diagonales', 'miradasalsur', 'newsweek_argentina', 'veintitres',
     'gva_be', 'hln', 'tijd', 'degentenaar', 'inquirer_net', 'uncrate',
     'fastcompany', 'accountancyage', 'laprensa_hn', 'latribuna',
-    'eltiempo_hn',
+    'eltiempo_hn', 'slate',
     )]
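Note: each entry in recipe_modules is expanded to a module name by the list comprehension shown in the hunk header, so the new 'slate' entry must be matched by a module file named recipe_slate.py (the file added below). A minimal sketch of that mapping, using only names that appear in this hunk:

    # Sketch: how a registry entry maps to the module/file name
    entries = ('eltiempo_hn', 'slate')
    recipe_modules = ['recipe_' + r for r in entries]
    print recipe_modules   # ['recipe_eltiempo_hn', 'recipe_slate']
    # calibre imports each of these modules from src/calibre/web/feeds/recipes/,
    # so 'slate' resolves to the new recipe_slate.py added in this commit.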
src/calibre/web/feeds/recipes/recipe_slate.py (new file, 329 lines)
@@ -0,0 +1,329 @@
#!/usr/bin/env python

__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
Fetches the last 7 days of featured articles from slate.com
'''

import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Tag

class Slate(BasicNewsRecipe):
    # Method variables for customizing downloads
    title = 'Slate'
    description = 'A daily magazine on the Web, offering analysis and commentary about politics, news and culture.'
    __author__ = 'GRiker@hotmail.com'
    max_articles_per_feed = 40
    oldest_article = 7.0
    recursions = 0
    delay = 0
    simultaneous_downloads = 5
    timeout = 120.0
    timefmt = ''
    feeds = None
    no_stylesheets = True
    encoding = None

    # Method variables for customizing feed parsing
    summary_length = 250
    use_embedded_content = None

    # Method variables for pre/post processing of HTML
    remove_tags = [ dict(name=['link','style']),
                    dict(id=['toolbox','site_navigation','article_bottom_tools_cntr',
                             'article_bottom_tools','recommend_tab2','bottom_sponsored_links',
                             'fray_article_discussion','bizbox_sponsored_links_bottom',
                             'page_rightcol','top_banner','also_in_slate_bottom','articlefooter',
                             'article_top_wedge','content-top','page-title',
                             'block-today039s-business-press-archives','block-blog-roll',
                             'block-also-in-tbm','block-most-popular-on-tbm','block-the-best-of-tbm',
                             'service-links-bottom','comments','ft']),
                    dict(attrs={'class':['fray_article_links','clearing','nav',
                                         'service-links service-links-stack','yui-b last',
                                         'read-more-comments']})]
    extra_css = '.headline {text-align:left;}\n\
                 .byline {font:monospace; text-align:left; margin-bottom:0pt;}\n\
                 .dateline {text-align:left; height:0pt;}\n\
                 .source {align:left;}\n\
                 .credit {text-align:right;font-size:smaller;}\n'

    baseURL = 'http://slate.com'
    section_dates = []

    def tag_to_strings(self, tag):
        # Like tag_to_string(), but returns a list with one entry per child
        # of the tag instead of a single concatenated string
        if not tag:
            return ''
        if isinstance(tag, basestring):
            return tag
        strings = []
        for item in tag.contents:
            if isinstance(item, (NavigableString, CData)):
                strings.append(item.string)
            elif isinstance(item, Tag):
                res = self.tag_to_string(item)
                if res:
                    strings.append(res)
        return strings

    def extract_sections(self):
        soup = self.index_to_soup( self.baseURL )

        soup_top_stories = soup.find(True, attrs={'class':'tap2_topic entry-content'})
        soup = soup.find(True, attrs={'id':'toc_links_container'})

        todays_section = soup.find(True, attrs={'class':'todaydateline'})
        self.section_dates.append(self.tag_to_string(todays_section,use_alt=False))
        self.section_dates.append(self.tag_to_string(todays_section,use_alt=False))

        older_section_dates = soup.findAll(True, attrs={'class':'maindateline'})
        for older_section in older_section_dates :
            self.section_dates.append(self.tag_to_string(older_section,use_alt=False))

        headline_stories = soup_top_stories.find('ul')
        section_lists = soup.findAll('ul')
        # Prepend the headlines to the first section
        section_lists[0].insert(0,headline_stories)

        sections = []
        for section in section_lists :
            sections.append(section)

        return sections

    def extract_section_articles(self, sections_html) :
        soup = self.index_to_soup(str(sections_html))
        sections = soup.findAll('ul')
        articles = {}
        key = None
        ans = []

        for (i,section) in enumerate(sections) :

            # Get the section name
            if section.has_key('id') :
                key = self.section_dates[i]
                articles[key] = []
                ans.append(key)
            else :
                continue

            # Get the section article_list
            article_list = section.findAll('li')

            excludedDescriptionKeywords = ['Slate V','Twitter feed','podcast']
            excludedTitleKeywords = ['Gabfest','Slate V']
            excludedAuthorKeywords = ['Prudence']

            # Extract the article attributes
            for article in article_list :
                bylines = self.tag_to_strings(article)
                url = article.a['href']
                title = bylines[0]
                full_title = self.tag_to_string(article)

                author = None
                description = None
                pubdate = None

                if len(bylines) == 2 and self.tag_to_string(article).find("Today's Papers") > 0 :
                    description = "A summary of what's in the major U.S. newspapers."

                if len(bylines) == 3 :
                    author = bylines[2].strip()
                    author = re.sub('[\r][\n][\t][\t\t]','', author)
                    author = re.sub(',','', author)
                    if bylines[1] is not None :
                        description = bylines[1]
                        full_byline = self.tag_to_string(article)
                        if full_byline.find('major U.S. newspapers') > 0 :
                            description = "A summary of what's in the major U.S. newspapers."

                if len(bylines) > 3 and author is not None:
                    author += " | "
                    for (i,substring) in enumerate(bylines[3:]) :
                        #print "substring: %s" % substring.encode('cp1252')
                        author += substring.strip()
                        # Separate the remaining byline fragments, without a trailing ' | '
                        if i < len(bylines[3:]) - 1 :
                            author += " | "

                # Skip articles whose descriptions contain excluded keywords
                if description is not None :
                    excluded = re.compile('|'.join(excludedDescriptionKeywords))
                    found_excluded = excluded.search(description)
                    if found_excluded :
                        continue

                # Skip articles whose titles contain excluded keywords
                if full_title is not None :
                    excluded = re.compile('|'.join(excludedTitleKeywords))
                    #self.log("evaluating full_title: %s" % full_title)
                    found_excluded = excluded.search(full_title)
                    if found_excluded :
                        continue

                # Skip articles whose authors contain excluded keywords
                if author is not None :
                    excluded = re.compile('|'.join(excludedAuthorKeywords))
                    found_excluded = excluded.search(author)
                    if found_excluded :
                        continue

                skip_this_article = False
                # Check to make sure we're not adding a duplicate
                for article in articles[key] :
                    if article['url'] == url :
                        skip_this_article = True
                        break

                if skip_this_article :
                    continue

                # Build the dictionary entry for this article
                feed = key
                if not articles.has_key(feed) :
                    articles[feed] = []
                articles[feed].append(dict(title=title, url=url, date=pubdate, description=description,
                                           author=author, content=''))

            # Promote 'newspapers' to top
            for (i,article) in enumerate(articles[feed]) :
                if article['description'] is not None :
                    if article['description'].find('newspapers') > 0 :
                        articles[feed].insert(0,articles[feed].pop(i))

        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
        ans = self.remove_duplicates(ans)
        return ans

    def flatten_document(self, ans):
        flat_articles = []
        for (i,section) in enumerate(ans) :
            for article in section[1] :
                flat_articles.append(article)
        flat_section = ['All Articles', flat_articles]
        flat_ans = [flat_section]

        return flat_ans

    def remove_duplicates(self, ans):
        # Drop articles from later sections that already appeared in an earlier section
        for (i,section) in enumerate(ans) :
            for article in section[1] :
                for subsequent_section in ans[i+1:] :
                    # Iterate over a copy so removal doesn't skip entries
                    for subsequent_article in subsequent_section[1][:] :
                        if article['url'] == subsequent_article['url'] :
                            subsequent_section[1].remove(subsequent_article)
        return ans

    def print_version(self, url) :
        # Fetch the single-page version of each article
        return url + 'pagenum/all/'

    # Class methods
    def parse_index(self) :
        sections = self.extract_sections()
        section_list = self.extract_section_articles(sections)
        section_list = self.flatten_document(section_list)
        return section_list

    def postprocess_html(self, soup, first_fetch) :
        # Fix up dept_kicker as <h3><em>
        dept_kicker = soup.find(True, attrs={'class':'department_kicker'})
        if dept_kicker is not None :
            kicker_strings = self.tag_to_strings(dept_kicker)
            kicker = kicker_strings[2] + kicker_strings[3]
            # Strip periods (an unescaped '.' would match every character)
            kicker = re.sub('\.','',kicker)
            h3Tag = Tag(soup, "h3")
            emTag = Tag(soup, "em")
            h3Tag.insert(0, emTag)
            emTag.insert(0,kicker)
            dept_kicker.replaceWith(h3Tag)

        # Change <h1> to <h2>
        headline = soup.find("h1")
        if headline is not None :
            h2tag = Tag(soup, "h2")
            h2tag['class'] = "headline"
            strs = self.tag_to_strings(headline)
            result = ''
            for (i,substr) in enumerate(strs) :
                result += substr
                if i < len(strs) - 1 :
                    result += '<br />'
            h2tag.insert(0, result)
            headline.replaceWith(h2tag)

        # Fix up the concatenated byline and dateline
        byline = soup.find(True,attrs={'class':'byline'})
        if byline is not None :
            bylineTag = Tag(soup,'div')
            bylineTag['class'] = 'byline'
            bylineTag.insert(0,self.tag_to_string(byline))
            byline.replaceWith(bylineTag)

        dateline = soup.find(True, attrs={'class':'dateline'})
        if dateline is not None :
            datelineTag = Tag(soup, 'div')
            datelineTag['class'] = 'dateline'
            datelineTag.insert(0,self.tag_to_string(dateline))
            dateline.replaceWith(datelineTag)

        # Change captions to italic, add <hr>
        for caption in soup.findAll(True, {'class':'caption'}) :
            if caption is not None:
                emTag = Tag(soup, "em")
                emTag.insert(0, '<br />' + self.tag_to_string(caption))
                hrTag = Tag(soup, 'hr')
                emTag.insert(1, hrTag)
                caption.replaceWith(emTag)

        return soup

    def postprocess_book(self, oeb, opts, log) :

        def extract_byline(href) :
            soup = BeautifulSoup(str(oeb.manifest.hrefs[href]))
            byline = soup.find(True,attrs={'class':'byline'})
            if byline is not None:
                return self.tag_to_string(byline,use_alt=False)
            else :
                return None

        def extract_description(href) :
            soup = BeautifulSoup(str(oeb.manifest.hrefs[href]))
            paragraphs = soup.findAll('p')
            for p in paragraphs :
                if self.tag_to_string(p,use_alt=False).startswith('By ') or \
                   self.tag_to_string(p,use_alt=False).startswith('Posted '):
                    continue

                images = p.findAll(True, attrs={'class':'imagewrapper'})
                for image in images :
                    image.extract()
                return self.tag_to_string(p,use_alt=False)[:200] + '...'

            return None

        if oeb.toc.depth() == 2 :
            for article in oeb.toc :
                if article.author is None :
                    article.author = extract_byline(article.href)

                if article.description is None :
                    article.description = extract_description(article.href)

        elif oeb.toc.depth() == 3 :
            for section in oeb.toc :
                for article in section :
                    if article.author is None :
                        article.author = extract_byline(article.href)

                    if article.description is None :
                        article.description = extract_description(article.href)
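For readers unfamiliar with calibre recipes: parse_index() is expected to return a list of (section title, list of article dicts) pairs, and after flatten_document() this recipe always yields a single 'All Articles' section whose article dicts are built in extract_section_articles(). A sketch of that return shape, with invented placeholder values rather than real Slate data:

    # Shape of the data parse_index() returns (illustrative values only)
    example_index = [
        ('All Articles', [
            {'title': 'Example headline',
             'url': 'http://slate.com/id/0000000/',
             'date': None,
             'description': 'First part of the article teaser...',
             'author': 'A. Writer',
             'content': ''},
        ]),
    ]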