diff --git a/recipes/prospectmaguk.recipe b/recipes/prospectmaguk.recipe index 24f8f6ab1f..cd2fa62162 100644 --- a/recipes/prospectmaguk.recipe +++ b/recipes/prospectmaguk.recipe @@ -1,4 +1,4 @@ -#!/usr/bin/env python2 +#!/usr/bin/env python2 __copyright__ = '2008, Kovid Goyal ' __license__ = 'GPL v3' @@ -7,8 +7,8 @@ __license__ = 'GPL v3' calibre recipe for prospectmagazine.co.uk (subscription) ''' -import re from calibre.web.feeds.recipes import BasicNewsRecipe +from css_selectors import Select class ProspectMagUK(BasicNewsRecipe): @@ -18,16 +18,13 @@ class ProspectMagUK(BasicNewsRecipe): timefmt = ' [%d %B %Y]' no_stylesheets = True publication_type = 'magazine' - masthead_url = 'http://www.prospectmagazine.co.uk/wp-content/themes/prospect/images/titleMain.jpg' category = 'news, UK' language = 'en_GB' max_articles_per_feed = 100 - auto_cleanup = True needs_subscription = True - auto_cleanup_keep = '//div[@class="lead_image"]' - INDEX = 'http://www.prospectmagazine.co.uk/issue/' + keep_only_tags = [dict(id='post_content')] def get_browser(self): br = BasicNewsRecipe.get_browser(self) @@ -40,41 +37,19 @@ class ProspectMagUK(BasicNewsRecipe): return br def parse_index(self): - soup = self.index_to_soup(self.INDEX) - div = soup.find('div', id='cover_image') - if div is not None: - img = div.find('img', src=True) - if img is not None: - src = img['src'] - if src.startswith('/'): - src = 'http://www.prospectmagazine.co.uk' + src - self.cover_url = src + root = self.index_to_soup(self.INDEX, as_tree=True) + sel = Select(root) + for img in sel('.block_this_month .img_wrap img'): + self.cover_url = img.get('src').partition('?')[0] feeds = [] - # loop through sections - for sect in soup.findAll('div', attrs={'class': 'sectionheading'}): - fname = self.tag_to_string(sect).replace('>', '').strip() - self.log('Found section', fname) + for h2 in sel('h2.block-title'): + current_section = self.tag_to_string(h2) articles = [] - - # note: can't just find siblings with class='post' because that will also - # grab all the articles belonging to the sections that follow. - for item in sect.findNextSiblings('div', attrs={'class': True}): - if 'post' not in item['class']: - break - a = item.find('a', href=True) - if a is None: - continue - url = a['href'] - title = self.tag_to_string(a) - p = item.find('p') - desc = self.tag_to_string(p) if p is not None else '' - art = {'title': title, 'description': desc, - 'date': ' ', 'url': url} - p = item.find(attrs={'class': re.compile('author')}) - self.log('\tFound article:', title, '::', url) - if p is not None: - art['author'] = self.tag_to_string(p).strip() - articles.append(art) - - feeds.append((fname, articles)) + self.log('Found section:', current_section) + for div in sel('div.block_home_post', h2.getparent()): + for a in sel('div.title a[href]', div): + articles.append({'title':self.tag_to_string(a), 'url':a.get('href')}) + self.log('\tFound article:', articles[-1]['title']) + if articles: + feeds.append((current_section, articles)) return feeds