diff --git a/src/calibre/web/feeds/recipes/recipe_newsweek.py b/src/calibre/web/feeds/recipes/recipe_newsweek.py
index 6f47019f46..863bbb10a4 100644
--- a/src/calibre/web/feeds/recipes/recipe_newsweek.py
+++ b/src/calibre/web/feeds/recipes/recipe_newsweek.py
@@ -2,10 +2,9 @@
 __license__   = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal '

-import re, string, time
+import re, time
 from calibre import strftime
 from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup


 class Newsweek(BasicNewsRecipe):
@@ -14,59 +13,53 @@ class Newsweek(BasicNewsRecipe):
     description = 'Weekly news and current affairs in the US'
     no_stylesheets = True
     language = _('English')
-
-    extra_css = '#content { font:serif 12pt; }\n.story {font:12pt}\n.HorizontalHeader {font:18pt}\n.deck {font:16pt}'
-    keep_only_tags = [dict(name='div', id='content')]
-    remove_tags = [
-        dict(name=['script', 'noscript']),
-        dict(name='div', attrs={'class':['ad', 'SocialLinks', 'SocialLinksDiv',
-                                         'channel', 'bot', 'nav', 'top',
-                                         'EmailArticleBlock',
-                                         'comments-and-social-links-wrapper',
-                                         'inline-social-links-wrapper',
-                                         'inline-social-links',
-                                         ]}),
-        dict(name='div', attrs={'class':re.compile('box')}),
-        dict(id=['ToolBox', 'EmailMain', 'EmailArticle', 'comment-box',
-                 'nw-comments'])
-        ]
-
+    remove_tags = [
+            {'class':['navbar', 'ad', 'sponsorLinksArticle', 'mm-content',
+                'inline-social-links-wrapper', 'email-article',
+                'comments-and-social-links-wrapper', 'EmailArticleBlock']},
+            {'id' : ['footer', 'ticker-data', 'topTenVertical',
+                'digg-top-five', 'mesothorax', 'nw-comments',
+                'ToolBox', 'EmailMain']},
+            {'class': re.compile('related-cloud')},
+            ]
+    keep_only_tags = [{'class':['article HorizontalHeader', 'articlecontent']}]
+
+    recursions = 1
     match_regexps = [r'http://www.newsweek.com/id/\S+/page/\d+']
-
-
-    def get_sections(self, soup):
-        sections = []
-
-        def process_section(img):
-            articles = []
-            match = re.search(r'label_([^_.]+)', img['src'])
-            if match is None:
-                return
-            title = match.group(1)
-            if title in ['coverstory', 'more', 'tipsheet']:
-                return
-            title = string.capwords(title)
-
-            for a in img.parent.findAll('a', href=True):
-                art, href = a.string, a['href']
-                if not re.search('\d+$', href) or not art or 'Preview Article' in art:
-                    continue
-                articles.append({
-                    'title':art, 'url':href, 'description':'',
-                    'content':'', 'date':''
-                    })
-            sections.append((title, articles))
-
-            img.parent.extract()
-
-        for img in soup.findAll(src=re.compile('/label_')):
-            process_section(img)
-
-        return sections
+    def find_title(self, section):
+        d = {'scope':'Scope', 'thetake':'The Take', 'features':'Features',
+             None:'Departments'}
+        ans = None
+        a = section.find('a', attrs={'name':True})
+        if a is not None:
+            ans = a['name']
+        return d[ans]
+
+
+    def find_articles(self, section):
+        ans = []
+        for x in section.findAll('h5'):
+            title = ' '.join(x.findAll(text=True)).strip()
+            a = x.find('a')
+            if not a: continue
+            href = a['href']
+            ans.append({'title':title, 'url':href, 'description':'', 'date': strftime('%a, %d %b')})
+        if not ans:
+            for x in section.findAll('div', attrs={'class':'hdlItem'}):
+                a = x.find('a', href=True)
+                if not a : continue
+                title = ' '.join(a.findAll(text=True)).strip()
+                href = a['href']
+                if 'http://xtra.newsweek.com' in href: continue
+                ans.append({'title':title, 'url':href, 'description':'', 'date': strftime('%a, %d %b')})
+
+        #for x in ans:
+        #    x['url'] += '/output/print'
+        return ans
+
     def parse_index(self):
         soup = self.get_current_issue()
         if not soup:
@@ -78,50 +71,46 @@ class Newsweek(BasicNewsRecipe):
         if match is not None:
             self.timefmt = strftime(' [%d %b, %Y]', time.strptime(match.group(1), '%y%m%d'))
         self.cover_url = small.replace('coversmall', 'coverlarge')
-
-        sections = self.get_sections(soup)
-        sections.insert(0, ('Main articles', []))
-
-        for tag in soup.findAll('h5'):
-            a = tag.find('a', href=True)
-            if a is not None:
-                title = self.tag_to_string(a)
-                if not title:
-                    a = 'Untitled article'
-                art = {
-                    'title' : title,
-                    'url'   : a['href'],
-                    'description':'', 'content':'',
-                    'date': strftime('%a, %d %b')
-                    }
-                if art['title'] and art['url']:
-                    sections[0][1].append(art)
-        return sections
-
-
+
+        sections = soup.findAll('div', attrs={'class':'featurewell'})
+        titles = map(self.find_title, sections)
+        articles = map(self.find_articles, sections)
+        ans = list(zip(titles, articles))
+        def fcmp(x, y):
+            tx, ty = x[0], y[0]
+            if tx == "Features": return cmp(1, 2)
+            if ty == "Features": return cmp(2, 1)
+            return cmp(tx, ty)
+        return sorted(ans, cmp=fcmp)
+
     def postprocess_html(self, soup, first_fetch):
-        divs = list(soup.findAll('div', 'pagination'))
-        if not divs:
-            return
-        divs[0].extract()
-        if len(divs) > 1:
-            soup.find('body')['style'] = 'page-break-after:avoid'
-            divs[1].extract()
-
-            h1 = soup.find('h1')
+        if not first_fetch:
+            h1 = soup.find(id='headline')
             if h1: h1.extract()
-            ai = soup.find('div', 'articleInfo')
-            ai.extract()
-        else:
-            soup.find('body')['style'] = 'page-break-before:always; page-break-after:avoid;'
+            div = soup.find(attrs={'class':'articleInfo'})
+            if div:
+                div.extract()
+        divs = list(soup.findAll('div', 'pagination'))
+        if not divs:
+            return soup
+        for div in divs[1:]: div.extract()
+        all_a = divs[0].findAll('a', href=True)
+        divs[0]['style']="display:none"
+        if len(all_a) > 1:
+            all_a[-1].extract()
+        test = re.compile(self.match_regexps[0])
+        for a in soup.findAll('a', href=test):
+            if a not in all_a:
+                del a['href']
         return soup
-
+
     def get_current_issue(self):
-        #from urllib2 import urlopen # For some reason mechanize fails
-        #home = urlopen('http://www.newsweek.com').read()
-        soup = self.index_to_soup('http://www.newsweek.com')#BeautifulSoup(home)
-        img = soup.find('img', alt='Current Magazine')
-        if img and img.parent.has_key('href'):
-            return self.index_to_soup(img.parent['href'])
-
+        soup = self.index_to_soup('http://www.newsweek.com')
+        div = soup.find('div', attrs={'class':re.compile('more-from-mag')})
+        if div is None: return None
+        a = div.find('a')
+        if a is not None:
+            href = a['href'].split('#')[0]
+            return self.index_to_soup(href)
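
Note on the sorted(ans, cmp=fcmp) call added to parse_index: this recipe targets the Python 2 interpreter that calibre ran on at the time, where cmp() and the cmp= argument to sorted() exist. Both were removed in Python 3. The following is a minimal sketch of the same ordering with a key function; it is not part of the diff, and section_key is a hypothetical name introduced only for illustration.

# Sketch: Python 3 equivalent of the fcmp ordering, assuming ans is the
# list of (section_title, article_list) tuples built in parse_index.
# 'Features' sorts first; every other section sorts alphabetically.
def section_key(entry):
    title = entry[0]
    # (0, '') orders before any (1, title), so Features always leads.
    return (0, '') if title == 'Features' else (1, title)

ans = [('Scope', []), ('Features', []), ('Departments', [])]
print(sorted(ans, key=section_key))
# -> [('Features', []), ('Departments', []), ('Scope', [])]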