New recipe for Slate by GRiker

2025-08-30 23:00:21 -04:00 · 2009-07-21 19:08:29 -06:00 · 2009-07-21 19:08:29 -06:00 · 26d217611f
commit 26d217611f
parent 192abdd179
2 changed files with 330 additions and 1 deletions
--- a/src/calibre/web/feeds/recipes/init.py
+++ b/src/calibre/web/feeds/recipes/init.py
@ -52,7 +52,7 @@ recipe_modules = ['recipe_' + r for r in (
           'diagonales', 'miradasalsur', 'newsweek_argentina', 'veintitres',
           'gva_be', 'hln', 'tijd', 'degentenaar', 'inquirer_net', 'uncrate',
           'fastcompany', 'accountancyage', 'laprensa_hn', 'latribuna',
-           'eltiempo_hn',
+           'eltiempo_hn', 'slate',
          )]
--- a/src/calibre/web/feeds/recipes/recipe_slate.py
+++ b/src/calibre/web/feeds/recipes/recipe_slate.py
@ -0,0 +1,329 @@
 #!/usr/bin/env  python
 __license__   = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
 '''
 Fetches the last 7 days of featured articles from slate.com
 '''
 import re
 from calibre.web.feeds.recipes import BasicNewsRecipe
 from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Tag
 class Slate(BasicNewsRecipe):
    # Class-level settings for customizing downloads.
    # NOTE(review): these names are presumably consumed by BasicNewsRecipe, which
    # is not visible in this file - confirm their exact semantics against the
    # base class before changing any value.
    title                   = 'Slate'
    description             = 'A daily magazine on the Web, offering analysis and commentary about politics, news and culture.'
    __author__              = 'GRiker@hotmail.com'
    max_articles_per_feed   = 40
    # The module docstring describes the recipe as fetching the last 7 days
    oldest_article          = 7.0
    recursions              = 0
    delay                   = 0
    simultaneous_downloads  = 5
    timeout                 = 120.0
    timefmt                 = ''
    # No feed list is declared here; this class builds its article index in
    # parse_index() instead.
    feeds                   = None
    no_stylesheets          = True
    encoding                = None
    # Class-level settings for customizing feed parsing
    summary_length          = 250
    use_embedded_content    = None
    # Class-level settings for pre/post processing of HTML.
    # remove_tags holds three matchers: by tag name, by element id, and by
    # class attribute value.
    remove_tags             = [ dict(name=['link','style']),
                                dict(id=['toolbox','site_navigation','article_bottom_tools_cntr',
                                         'article_bottom_tools','recommend_tab2','bottom_sponsored_links',
                                         'fray_article_discussion','bizbox_sponsored_links_bottom',
                                         'page_rightcol','top_banner','also_in_slate_bottom','articlefooter',
                                         'article_top_wedge','content-top','page-title',
                                         'block-today039s-business-press-archives','block-blog-roll',
                                         'block-also-in-tbm','block-most-popular-on-tbm','block-the-best-of-tbm',
                                         'service-links-bottom','comments','ft']),
                                dict(attrs={'class':['fray_article_links','clearing','nav',
                                            'service-links service-links-stack','yui-b last',
                                            'read-more-comments']})]
    # CSS rules for the headline, byline, dateline, source and credit classes.
    # postprocess_html() assigns the 'headline', 'byline' and 'dateline' classes
    # to the elements it rebuilds.
    extra_css = '.headline  {text-align:left;}\n\
                 .byline    {font:monospace; text-align:left; margin-bottom:0pt;}\n\
                 .dateline  {text-align:left; height:0pt;}\n\
                 .source    {align:left;}\n\
                 .credit    {text-align:right;font-size:smaller;}\n'
    # Front page URL fetched by extract_sections()
    baseURL = 'http://slate.com'
    # Date labels filled in by extract_sections() and looked up by position in
    # extract_section_articles(). Declared at class level, so this list object
    # is shared between instances unless an instance rebinds the attribute.
    section_dates = []
    def tag_to_strings(self, tag):
        '''
        Break a parsed tag into a list of its top-level text pieces.

        Each direct child of `tag` contributes at most one entry: text nodes
        (NavigableString/CData) contribute their own string, and child tags
        contribute the text returned by self.tag_to_string() when that text
        is non-empty.

        The result is always a list, so callers can index, measure and
        iterate it uniformly: a falsy `tag` (None, or a tag that evaluates
        false) yields [], and a plain string is wrapped as a one-element
        list instead of being handed back as a bare string.
        '''
        if not tag:
            return []
        if isinstance(tag, basestring):
            return [tag]
        strings = []
        for item in tag.contents:
            if isinstance(item, (NavigableString, CData)):
                strings.append(item.string)
            elif isinstance(item, Tag):
                res = self.tag_to_string(item)
                # Child tags with no text contribute nothing
                if res:
                    strings.append(res)
        return strings
    def extract_sections(self):
        '''
        Fetch the page at self.baseURL and return its section lists.

        Side effect: self.section_dates is rebuilt with the text of the
        'todaydateline' element (recorded twice) followed by the text of
        every 'maindateline' element, in document order.

        Returns a list of the <ul> elements found inside the element with id
        'toc_links_container'; the first <ul> of the 'tap2_topic
        entry-content' element is inserted at the front of the first one.
        '''
        # Start from a fresh per-instance list. section_dates is declared on the
        # class, so appending to it directly would grow a list shared by every
        # instance and pile up stale labels if this method ran more than once.
        self.section_dates = []
        soup = self.index_to_soup( self.baseURL )
        soup_top_stories = soup.find(True, attrs={'class':'tap2_topic entry-content'})
        soup = soup.find(True, attrs={'id':'toc_links_container'})
        todays_section = soup.find(True, attrs={'class':'todaydateline'})
        # Today's label is deliberately recorded twice.
        # NOTE(review): presumably this keeps section_dates aligned with <ul>
        # positions once the headline list inserted below shows up as an extra
        # <ul> when extract_section_articles() re-parses the markup - confirm.
        todays_label = self.tag_to_string(todays_section,use_alt=False)
        self.section_dates.append(todays_label)
        self.section_dates.append(todays_label)
        for older_section in soup.findAll(True, attrs={'class':'maindateline'}) :
            self.section_dates.append(self.tag_to_string(older_section,use_alt=False))
        headline_stories = soup_top_stories.find('ul')
        section_lists = soup.findAll('ul')
        # Prepend the headlines to the first section
        section_lists[0].insert(0,headline_stories)
        return list(section_lists)
    def extract_section_articles(self, sections_html) :
        '''
        Build the per-section article index from the extracted section lists.

        `sections_html` is converted with str() and re-parsed; every <ul> that
        carries an 'id' attribute becomes a section whose label is
        self.section_dates[i], where i is the position of that <ul> among all
        <ul> elements of the re-parsed markup.

        Each <li> of a section yields a dict with the keys title, url, date,
        description, author and content. Items are skipped when they have no
        text or no link, when their description, full title or author matches
        an exclusion keyword, or when their URL is already in the section.
        Articles whose description mentions 'newspapers' are moved to the
        front of their section.

        Returns a list of (section label, article list) tuples, passed through
        self.remove_duplicates().
        '''
        soup = self.index_to_soup(str(sections_html))
        sections = soup.findAll('ul')
        articles = {}
        ans = []
        # The exclusion filters never change, so compile them once up front
        # instead of once per article.
        excluded_description = re.compile('|'.join(['Slate V','Twitter feed','podcast']))
        excluded_title = re.compile('|'.join(['Gabfest','Slate V']))
        excluded_author = re.compile('|'.join(['Prudence']))
        newspapers_summary = "A summary of what's in the major U.S. newspapers."
        for (i,section) in enumerate(sections) :
            # Only lists carrying an id are treated as sections
            if not section.has_key('id') :
                continue
            key = self.section_dates[i]
            articles[key] = []
            ans.append(key)
            # Extract the article attributes
            for article in section.findAll('li') :
                bylines = self.tag_to_strings(article)
                link = article.a
                # A list item without text or without a link cannot become an
                # article; skip it rather than abort the whole index.
                if not bylines or link is None :
                    continue
                url = link['href']
                title = bylines[0]
                full_title = self.tag_to_string(article)
                author = None
                description = None
                pubdate = None
                # Substring tests use 'in' so that a match at the very start of
                # the text counts too (find(...) > 0 ignored position 0).
                if len(bylines) == 2 and "Today's Papers" in full_title :
                    description = newspapers_summary
                # Three or more pieces: the second is the description and the
                # third the author. Using >= keeps the multi-author handling
                # below reachable; with an exact == 3 test it never ran.
                if len(bylines) >= 3 :
                    author = bylines[2].strip()
                    author = re.sub('[\r][\n][\t][\t\t]','', author)
                    author = re.sub(',','', author)
                    if bylines[1] is not None :
                        description = bylines[1]
                        if 'major U.S. newspapers' in full_title :
                            description = newspapers_summary
                if len(bylines) > 3 :
                    # Join any further pieces onto the author, with no trailing
                    # separator.
                    extra_authors = [substring.strip() for substring in bylines[3:]]
                    author = " | ".join([author] + extra_authors)
                # Skip articles whose descriptions contain excluded keywords
                if description is not None and excluded_description.search(description) :
                    continue
                # Skip articles whose title contain excluded keywords
                if full_title is not None and excluded_title.search(full_title) :
                    continue
                # Skip articles whose author contain excluded keywords
                if author is not None and excluded_author.search(author) :
                    continue
                # Check to make sure we're not adding a duplicate
                if any(existing['url'] == url for existing in articles[key]) :
                    continue
                # Build the dictionary entry for this article
                articles[key].append(dict(title=title, url=url, date=pubdate, description=description,
                                          author=author, content=''))
            # Promote 'newspapers' to top. This works on the current section's
            # own list (never a name left over from an earlier section) and
            # iterates a snapshot, because the list is reordered inside the loop.
            entries = articles[key]
            for (position,entry) in enumerate(list(entries)) :
                if entry['description'] is not None and 'newspapers' in entry['description'] :
                    entries.insert(0,entries.pop(position))
        ans = [(label, articles[label]) for label in ans if label in articles]
        ans = self.remove_duplicates(ans)
        return ans
    def flatten_document(self, ans):
        '''
        Collapse the per-section index into a single section.

        `ans` is a sequence of sections whose second element is a list of
        articles. Returns a one-element list holding ['All Articles', merged],
        where merged contains every article in its original order.
        '''
        merged = [entry for section in ans for entry in section[1]]
        return [['All Articles', merged]]
    def remove_duplicates(self, ans):
        '''
        Drop articles whose URL already appeared in an earlier section.

        `ans` is a sequence of sections whose second element is a list of
        article dicts, each with a 'url' key. The article lists are edited in
        place and `ans` itself is returned. Repeats inside a single section
        are left alone; only repeats of URLs from earlier sections are
        removed.
        '''
        seen_urls = set()
        for section in ans :
            entries = section[1]
            # Rebuild through slice assignment instead of deleting while
            # enumerating: deleting mid-iteration skips the element after each
            # removal, so consecutive duplicates used to survive. The slice
            # assignment keeps the same list object for existing references.
            entries[:] = [entry for entry in entries if entry['url'] not in seen_urls]
            seen_urls.update(entry['url'] for entry in entries)
        return ans
    def print_version(self, url) :
        '''
        Return the URL of the all-pages view of an article.

        Appends 'pagenum/all/' to `url`, first adding a '/' separator when
        `url` does not already end with one so the suffix is never glued onto
        the last path segment.
        '''
        if not url.endswith('/') :
            url += '/'
        return url + 'pagenum/all/'
    # Class methods
    def parse_index(self) :
        '''
        Build the article index for this recipe.

        Runs the three stages in order - extract_sections(),
        extract_section_articles() and flatten_document() - and returns the
        flattened result.
        '''
        return self.flatten_document(
            self.extract_section_articles(
                self.extract_sections()))
    def postprocess_html(self, soup, first_fetch) :
        '''
        Restyle selected elements of a downloaded page and return the soup.

        The following replacements are made, each only when the element is
        present:
          * the first 'department_kicker' element becomes <h3><em>kicker</em></h3>
          * the first <h1> becomes <h2 class="headline">, its text pieces
            separated by '<br />'
          * the first 'byline' and 'dateline' elements become <div>s holding
            only their text
          * every 'caption' element becomes an <em> that starts with '<br />'
            and ends with an <hr>

        `first_fetch` is accepted but not used here.
        '''
        # Fix up dept_kicker as <h3><em>
        dept_kicker = soup.find(True, attrs={'class':'department_kicker'})
        if dept_kicker is not None :
            kicker_strings = self.tag_to_strings(dept_kicker)
            # The kicker text is built from the third and fourth pieces; leave the
            # element untouched when the markup does not provide them instead
            # of failing with an IndexError.
            if len(kicker_strings) > 3 :
                kicker = kicker_strings[2] + kicker_strings[3]
                # Strip literal periods. The dot has to be escaped: an unescaped
                # '.' matches every character and would blank the whole kicker.
                kicker = re.sub(r'\.','',kicker)
                h3Tag = Tag(soup, "h3")
                emTag = Tag(soup, "em")
                h3Tag.insert(0, emTag)
                emTag.insert(0,kicker)
                dept_kicker.replaceWith(h3Tag)
        # Change <h1> to <h2>
        headline = soup.find("h1")
        if headline is not None :
            h2tag = Tag(soup, "h2")
            h2tag['class'] = "headline"
            # One line break between consecutive text pieces, none after the last
            h2tag.insert(0, '<br />'.join(self.tag_to_strings(headline)))
            headline.replaceWith(h2tag)
        # Fix up the concatenated byline and dateline
        byline = soup.find(True,attrs={'class':'byline'})
        if byline is not None :
            bylineTag = Tag(soup,'div')
            bylineTag['class'] = 'byline'
            bylineTag.insert(0,self.tag_to_string(byline))
            byline.replaceWith(bylineTag)
        dateline = soup.find(True, attrs={'class':'dateline'})
        if dateline is not None :
            datelineTag = Tag(soup, 'div')
            datelineTag['class'] = 'dateline'
            datelineTag.insert(0,self.tag_to_string(dateline))
            dateline.replaceWith(datelineTag)
        # Change captions to italic, add <hr>
        for caption in soup.findAll(True, {'class':'caption'}) :
            emTag = Tag(soup, "em")
            emTag.insert(0, '<br />' + self.tag_to_string(caption))
            hrTag = Tag(soup, 'hr')
            emTag.insert(1, hrTag)
            caption.replaceWith(emTag)
        return soup
    def postprocess_book(self, oeb, opts, log) :
        '''
        Fill in missing author and description fields of the book's TOC.

        For every article entry whose author is None, the text of the first
        'byline' element of the article is used. For every entry whose
        description is None, the first 200 characters of the first paragraph
        that does not start with 'By ' or 'Posted ' are used, followed by
        '...'. Entries are visited directly when the TOC depth is 2 and one
        level down when it is 3; other depths are left untouched.

        `opts` and `log` are accepted but not used here.
        '''
        def parse_item(href) :
            # Parse the manifest item registered under this href
            return BeautifulSoup(str(oeb.manifest.hrefs[href]))

        def extract_byline(href) :
            byline_tag = parse_item(href).find(True,attrs={'class':'byline'})
            if byline_tag is None :
                return None
            return self.tag_to_string(byline_tag,use_alt=False)

        def extract_description(href) :
            for paragraph in parse_item(href).findAll('p') :
                text = self.tag_to_string(paragraph,use_alt=False)
                # Byline and dateline paragraphs are not descriptions
                if text.startswith('By ') or text.startswith('Posted '):
                    continue
                # Drop image wrappers before taking the paragraph text
                for wrapper in paragraph.findAll(True, attrs={'class':'imagewrapper'}) :
                    wrapper.extract()
                return self.tag_to_string(paragraph,use_alt=False)[:200] + '...'
            return None

        def fill_missing(entry) :
            if entry.author is None :
                entry.author = extract_byline(entry.href)
            if entry.description is None :
                entry.description = extract_description(entry.href)

        if oeb.toc.depth() == 2 :
            for entry in oeb.toc :
                fill_missing(entry)
        elif oeb.toc.depth() == 3 :
            for section in oeb.toc :
                for entry in section :
                    fill_missing(entry)