New recipe for The New York Magazine by Kovid Goyal. Fixes #405 (Request for new news feeds)

2025-12-18 19:15:01 -05:00 · 2009-12-28 12:57:51 -07:00 · 2009-12-28 12:57:51 -07:00 · d45a7879c1
commit d45a7879c1
parent 83951981ba
1 changed files with 74 additions and 0 deletions
--- a/resources/recipes/nymag.recipe
+++ b/resources/recipes/nymag.recipe
@@ -0,0 +1,74 @@
 #!/usr/bin/env  python
 __license__   = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
 '''
 nymag.com
 '''
 from calibre.web.feeds.news import BasicNewsRecipe
 class NewYorkMagazine(BasicNewsRecipe):
    '''
    Recipe that builds an issue of New York Magazine from the table of
    contents page at http://nymag.com/includes/tableofcontents.htm.

    Sections are taken from the ``h4`` headings of that page and articles
    from the ``h5`` headings that follow each of them.
    '''

    title       = 'New York Magazine'
    __author__  = 'Kovid Goyal'
    description = 'Food, culture, arts and entertainment in New York'
    language    = 'en'
    no_stylesheets = True
    remove_javascript = True
    encoding = 'iso-8859-1'
    # Follow links one level deep, but only those matching match_regexps
    # (presumably the index2.html, index3.html, ... continuation pages of
    # multi-page articles -- confirm against the live site).
    recursions = 1
    # The dots are escaped so that they only match a literal '.'.
    match_regexps = [r'http://nymag\.com/.+/index[0-9]{1,2}\.html$']
    keep_only_tags = [dict(id='main')]
    remove_tags = [
            dict(attrs={'class':['start-discussion']}),
            dict(id=['minibrowserbox', 'article-related', 'article-tools'])
            ]

    # Prepended to site-relative links found in the table of contents.
    PREFIX = 'http://nymag.com'

    def nymag_get_index(self):
        '''Download the table of contents page and return it as a soup.'''
        return self.index_to_soup('http://nymag.com/includes/tableofcontents.htm')

    def _nymag_set_cover(self, soup):
        '''
        Set ``self.cover_url`` from the image inside the element with class
        ``cover``. If there is no such element or image, ``cover_url`` is
        left untouched instead of aborting the whole download.
        '''
        cover = soup.find(attrs={'class':'cover'})
        img = None
        if cover is not None:
            img = cover.find('img', src=True)
        if img is None:
            self.log('\tNo cover image found')
            return
        src = img.get('src')
        # Treat a site-relative cover link the same way as article links.
        if src.startswith('/'):
            src = self.PREFIX + src
        self.cover_url = src

    def _nymag_parse_article(self, h):
        '''
        Turn an ``h5`` heading into an article dictionary with the keys
        ``title``, ``url``, ``date`` and ``description``.

        Returns None when the heading has no link or no title text.
        '''
        title = self.tag_to_string(h)
        a = h.find('a', href=True)
        if a is None:
            return None
        url = a.get('href')
        if url.startswith('/'):
            url = self.PREFIX + url
        if not (title and url):
            return None
        self.log('\t\tFound article:', title)
        self.log('\t\t\t', url)
        desc = ''
        # The description is taken from the next <p> sibling of the heading.
        p = h.findNextSibling('p')
        if p is not None:
            desc = self.tag_to_string(p)
            self.log('\t\t\t', desc)
        return {'title':title, 'url':url, 'date':'', 'description':desc}

    def parse_index(self):
        '''
        Parse the table of contents into feeds.

        Returns a list of ``(section title, list of article dicts)`` tuples.
        Articles that appear before the first ``h4`` heading are filed under
        'Cover Story'. Sections without any articles are omitted.
        '''
        soup = self.nymag_get_index()
        self._nymag_set_cover(soup)
        feeds = []
        current_section = 'Cover Story'
        current_articles = []
        for h in soup.findAll(['h4', 'h5']):
            if h.name == 'h4':
                # A new section starts: store the one collected so far.
                if current_section and current_articles:
                    feeds.append((current_section, current_articles))
                current_section = self.tag_to_string(h)
                self.log('\tFound section:', current_section)
                current_articles = []
            elif h.name == 'h5':
                article = self._nymag_parse_article(h)
                if article is not None:
                    current_articles.append(article)
        # The last section is not followed by another h4, so it has to be
        # stored here; otherwise its articles would be silently dropped.
        if current_section and current_articles:
            feeds.append((current_section, current_articles))
        return feeds

    def postprocess_html(self, soup, first):
        '''
        Remove every element with class ``page-navigation`` and, when
        ``first`` is false, also every element with class ``header-spacing``.

        Returns the modified soup.
        '''
        for x in soup.findAll(attrs={'class':'page-navigation'}):
            x.extract()
        if not first:
            for x in soup.findAll(attrs={'class':'header-spacing'}):
                x.extract()
        return soup