diff --git a/resources/recipes/scientific_american.recipe b/resources/recipes/scientific_american.recipe index 3970684788..8896121092 100644 --- a/resources/recipes/scientific_american.recipe +++ b/resources/recipes/scientific_american.recipe @@ -12,96 +12,53 @@ from calibre.web.feeds.news import BasicNewsRecipe class ScientificAmerican(BasicNewsRecipe): title = u'Scientific American' description = u'Popular science. Monthly magazine.' - __author__ = 'Kovid Goyal and Sujata Raman' + __author__ = 'Kovid Goyal' language = 'en' remove_javascript = True - oldest_article = 30 - max_articles_per_feed = 100 - no_stylesheets = True - use_embedded_content = False - extra_css = ''' - p{font-weight: normal; font-size:small} - li{font-weight: normal; font-size:small} - .headline p{font-size:x-small; font-family:Arial,Helvetica,sans-serif;} - h2{font-size:x-small;} - h3{font-size:x-small;font-family:Arial,Helvetica,sans-serif;} - ''' - remove_tags_before = dict(name='div', attrs={'class':'headline'}) + encoding = 'utf-8' - remove_tags_after = dict(id=['article']) - remove_tags = [ - dict(id=['sharetools', 'reddit']), - #dict(name='script'), - {'class':['float_left', 'atools']}, - {"class": re.compile(r'also-in-this')}, - dict(name='a',title = ["Get the Rest of the Article","Subscribe","Buy this Issue"]), - dict(name = 'img',alt = ["Graphic - Get the Rest of the Article"]), - dict(name='div', attrs={'class':['commentbox']}), - dict(name='h2', attrs={'class':['discuss_h2']}), - ] - - html2lrf_options = ['--base-font-size', '8'] - recursions = 1 - match_regexps = [r'article.cfm.id=\S+page=(2|3|4|5|6|7|8|9|10|11|12|13|14|15)'] + def print_version(self, url): + return url + '&print=true' def parse_index(self): soup = self.index_to_soup('http://www.scientificamerican.com/sciammag/') - monthtag = soup.find('div',attrs={'id':'magazine-main_col2'}) - month = self.tag_to_string(monthtag.contents[1]) - - - self.timefmt = ' [%s]'%(self.tag_to_string(month)) + month = self.tag_to_string(soup.find('p',attrs={'id':'articleDek'})) + self.timefmt = ' [%s]'%(' '.join(month.strip().split()[:2])) img = soup.find('img', alt='Scientific American Magazine', src=True) if img is not None: self.cover_url = img['src'] - features, feeds = [], [] - for p in soup.find(id='magazine-main_col2').findAll('p') : - a = p.find('a', href=True) - - if a is None: continue - desc = '' - s = p.find('span', attrs={'class':"sub"}) - desc = self.tag_to_string(s) - - article = { - 'url' : a['href'], - 'title' : self.tag_to_string(a), - 'date' : '', - 'description' : desc, - } - features.append(article) - feeds.append(('Features', features)) - - section = [] - title = None - - for x in soup.find(id='magazine-main_col1').findAll(['div', 'a']): - - if x.name == 'div': - - if section: - feeds.append((title, section)) - - title = self.tag_to_string(x) - section = [] - else: - - if 'article.cfm' in x['href']: - article = { - 'url' : x['href'], - 'title' : self.tag_to_string(x), - 'date': '', - 'description': '', - } - - section.append(article) - - if section: - feeds.append((title, section)) + feeds = [] + for div in soup.findAll('div', attrs={'class':['primaryCol', + 'secondaryCol']}): + current_section = None + for tag in div.findAll(['h2', 'ul']): + if tag.name == 'h2': + current_section = self.tag_to_string(tag).strip() + self.log('\tFound section:', current_section) + elif current_section is not None and tag.name == 'ul': + articles = [] + for li in tag.findAll('li'): + t = li.findAll('a', + attrs={'class':lambda x: x != 'thumb'}, + href=lambda x: x and 'article.cfm' in x) + if not t: + continue + t = t[-1] + title = self.tag_to_string(t) + url = t['href'] + desc = '' + p = li.find(attrs={'class':'dek'}) + if p is not None: + desc = self.tag_to_string(p) + articles.append({'title':title, 'url':url, + 'description':desc, 'date':''}) + self.log('\t\tFound article:', title, '\n\t\tat', url) + if articles: + feeds.append((current_section, articles)) + current_section = None return feeds - def postprocess_html(self, soup, first_fetch): if soup is not None: for span in soup.findAll('span', attrs={'class':'pagination'}):