#!/usr/bin/env python
# Recovered from a patch that adds recipes/villagevoice.recipe as a new file.

import re
from calibre.web.feeds.news import BasicNewsRecipe


class VillageVoice(BasicNewsRecipe):
    """News recipe for the Village Voice syndication feeds.

    Each article URL coming from the feeds is swapped for its
    single-page print URL when one can be found on the article page.
    """

    title = 'Village Voice'
    __author__ = 'Barty'
    language = 'en'
    masthead_url = "http://assets.villagevoice.com/img/citylogo.png"

    max_articles_per_feed = 50
    auto_cleanup = True

    # (feed title, feed URL) pairs; the commented-out entries are
    # additional syndication endpoints that are currently disabled.
    feeds = [
        ("Complete Issue", "http://villagevoice.com/syndication/issue"),
        ("News", "http://villagevoice.com/syndication/section/news"),
        ("Music", "http://villagevoice.com/syndication/section/music"),
        ("Movies", "http://villagevoice.com/syndication/section/film"),
        #("Restaurants", "http://villagevoice.com/syndication/section/dining"),
        #("Music Events", "http://villagevoice.com/syndication/events?type=music"),
        #("Calendar Events", "http://villagevoice.com/syndication/events"),
        #("Promotional Events", "http://villagevoice.com/syndication/promoEvents"),
        #("Restaurant Guide", "http://villagevoice.com/syndication/restaurants/search")
    ]

    # URLs already handed to print_version(). Declared on the class, so
    # the same list object is shared by everything that reads
    # self.seen_urls without rebinding it.
    seen_urls = []

    # The site splits an article across several pages, so the article
    # page is parsed for a link to the print version, which this
    # pattern recognises (case-insensitively).
    url_regex = re.compile(r'\/content\/printVersion\/\d+', re.I)

    def print_version(self, url):
        """Return the print URL for the article at *url*.

        Returns None when *url* has been passed in before (presumably
        so the framework drops the duplicate -- confirm against
        BasicNewsRecipe). Returns *url* unchanged when the page has no
        print link.
        """
        if url in self.seen_urls:
            return None
        self.seen_urls.append(url)

        page = self.index_to_soup(url)
        print_link = page.find('a', attrs={'href': self.url_regex})
        if print_link is None:
            self.log('Warning: no print url found for '+url)
            return url

        # Keep only the matched path so the result is always an
        # absolute URL on the www host, whatever form the href took.
        found = self.url_regex.search(print_link['href'])
        if found is None:
            return url
        return 'http://www.villagevoice.com' + found.group(0)