#!/usr/bin/env python
import re

from calibre.web.feeds.news import BasicNewsRecipe


class VillageVoice(BasicNewsRecipe):
    """Calibre news recipe that downloads Village Voice syndication feeds.

    Each article URL is rewritten to the site's single-page print version
    (see ``print_version``), and an article URL is processed only once.
    """

    title = 'Village Voice'
    feeds = [
        ("Complete Issue", "http://villagevoice.com/syndication/issue"),
        ("News", "http://villagevoice.com/syndication/section/news"),
        ("Music", "http://villagevoice.com/syndication/section/music"),
        ("Movies", "http://villagevoice.com/syndication/section/film"),
        # Additional feeds, disabled by default:
        # ("Restaurants", "http://villagevoice.com/syndication/section/dining"),
        # ("Music Events", "http://villagevoice.com/syndication/events?type=music"),
        # ("Calendar Events", "http://villagevoice.com/syndication/events"),
        # ("Promotional Events", "http://villagevoice.com/syndication/promoEvents"),
        # ("Restaurant Guide", "http://villagevoice.com/syndication/restaurants/search")
    ]

    auto_cleanup = True
    max_articles_per_feed = 50
    masthead_url = "http://assets.villagevoice.com/img/citylogo.png"
    language = 'en'
    __author__ = 'Barty'

    # Article URLs already handed to print_version(). A set gives O(1)
    # membership tests. NOTE: this is a class attribute, so it is shared by
    # every instance of the recipe created in the same process.
    seen_urls = set()

    # Village Voice breaks an article up into multiple pages, so the article
    # page is parsed and the link to its print version is used instead.
    url_regex = re.compile(r'\/content\/printVersion\/\d+', re.I)

    def print_version(self, url):
        """Return the print-version URL for the article at *url*.

        The article page is fetched and searched for a link whose ``href``
        matches ``url_regex``. When one is found, the matched path is
        appended to the ``http://www.villagevoice.com`` host.

        Returns:
            ``None`` if *url* was already seen (presumably so that calibre
            skips the duplicate -- confirm against BasicNewsRecipe);
            the print-version URL when the page links to one;
            otherwise *url* unchanged, after logging a warning.
        """
        if url in self.seen_urls:
            return None
        self.seen_urls.add(url)

        soup = self.index_to_soup(url)
        atag = soup.find('a', attrs={'href': self.url_regex})
        if atag is None:
            self.log('Warning: no print url found for ' + url)
            return url

        # Keep only the matched path; the href may carry a different host
        # or extra components around it.
        m = self.url_regex.search(atag['href'])
        if m:
            url = 'http://www.villagevoice.com' + m.group(0)
        return url