# calibre/recipes/villagevoice.recipe

#!/usr/bin/env python

import re

from calibre.web.feeds.news import BasicNewsRecipe


class VillageVoice(BasicNewsRecipe):
    """Recipe that fetches Village Voice articles through their print pages.

    Every article URL taken from the feeds is rewritten by
    ``print_version`` to the site's "printVersion" URL, because the regular
    article pages are split across several pages.
    """

    title = 'Village Voice'
    feeds = [
        ('Complete Issue', 'http://villagevoice.com/syndication/issue'),
        ('News', 'http://villagevoice.com/syndication/section/news'),
        ('Music', 'http://villagevoice.com/syndication/section/music'),
        ('Movies', 'http://villagevoice.com/syndication/section/film'),
        # Feeds below are disabled; the URLs are kept for reference.
        # ("Restaurants", "http://villagevoice.com/syndication/section/dining"),
        # ("Music Events", "http://villagevoice.com/syndication/events?type=music"),
        # ("Calendar Events", "http://villagevoice.com/syndication/events"),
        # ("Promotional Events", "http://villagevoice.com/syndication/promoEvents"),
        # ("Restaurant Guide", "http://villagevoice.com/syndication/restaurants/search")
    ]

    auto_cleanup = True
    max_articles_per_feed = 50
    masthead_url = 'http://assets.villagevoice.com/img/citylogo.png'
    language = 'en'
    __author__ = 'Barty'

    # URLs already passed to print_version(). Deliberately None at class
    # level: a mutable container defined here would be shared by every
    # instance of the recipe, so each instance builds its own set on first
    # use instead.
    seen_urls = None

    # Village Voice breaks the article up into multiple pages, so the
    # article page is parsed and the print URL is grabbed from it.
    url_regex = re.compile(r'\/content\/printVersion\/\d+', re.I)

    def print_version(self, url):
        """Return the print-version URL for the article at *url*.

        The article page is downloaded and searched for a link whose
        ``href`` matches ``url_regex``; the matched path is appended to the
        ``http://www.villagevoice.com`` host.

        Returns:
            ``None`` if *url* was already processed by this instance
            (presumably so the caller skips the duplicate -- confirm against
            BasicNewsRecipe); the original *url* if no print link is found
            (a warning is logged); otherwise the print-version URL.
        """
        if self.seen_urls is None:
            # First call on this instance: create its private container.
            self.seen_urls = set()
        if url in self.seen_urls:
            return None
        self.seen_urls.add(url)

        soup = self.index_to_soup(url)
        atag = soup.find('a', attrs={'href': self.url_regex})
        if atag is None:
            self.log('Warning: no print url found for ' + url)
            return url

        # Keep only the matched path so any host or query string that is
        # part of the href is dropped.
        m = self.url_regex.search(atag['href'])
        if m is None:
            return url
        return 'http://www.villagevoice.com' + m.group(0)