diff --git a/src/calibre/ebooks/oeb/transforms/guide.py b/src/calibre/ebooks/oeb/transforms/guide.py index 82bcaca72a..c1f0dd6669 100644 --- a/src/calibre/ebooks/oeb/transforms/guide.py +++ b/src/calibre/ebooks/oeb/transforms/guide.py @@ -34,7 +34,8 @@ class Clean(object): for x in list(self.oeb.guide): href = urldefrag(self.oeb.guide[x].href)[0] - if x.lower() not in ('cover', 'titlepage', 'masthead'): + if x.lower() not in ('cover', 'titlepage', 'masthead', 'toc', + 'title-page', 'copyright-page'): self.oeb.guide.remove(x) diff --git a/src/calibre/web/feeds/recipes/recipe_nytimes_sub.py b/src/calibre/web/feeds/recipes/recipe_nytimes_sub.py index 5d91dbae38..4449ba1aa2 100644 --- a/src/calibre/web/feeds/recipes/recipe_nytimes_sub.py +++ b/src/calibre/web/feeds/recipes/recipe_nytimes_sub.py @@ -11,7 +11,7 @@ from calibre.web.feeds.recipes import BasicNewsRecipe from calibre.ebooks.BeautifulSoup import BeautifulSoup class NYTimes(BasicNewsRecipe): - + title = 'The New York Times (subscription)' __author__ = 'Kovid Goyal' language = _('English') @@ -20,13 +20,13 @@ class NYTimes(BasicNewsRecipe): needs_subscription = True remove_tags_before = dict(id='article') remove_tags_after = dict(id='article') - remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool', 'nextArticleLink clearfix']}), - dict(id=['footer', 'toolsRight', 'articleInline', 'navigation', 'archive', 'side_search', 'blog_sidebar', 'side_tool', 'side_index']), + remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool', 'nextArticleLink clearfix']}), + dict(id=['footer', 'toolsRight', 'articleInline', 'navigation', 'archive', 'side_search', 'blog_sidebar', 'side_tool', 'side_index']), dict(name=['script', 'noscript', 'style'])] encoding = 'cp1252' no_stylesheets = True extra_css = 'h1 {font: sans-serif large;}\n.byline {font:monospace;}' - + def get_browser(self): br = BasicNewsRecipe.get_browser() if self.username is not None and self.password is not None: @@ -36,24 +36,24 @@ class NYTimes(BasicNewsRecipe): br['PASSWORD'] = self.password br.submit() return br - + def parse_index(self): soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html') - + def feed_title(div): return ''.join(div.findAll(text=True, recursive=False)).strip() - + articles = {} key = None ans = [] - for div in soup.findAll(True, + for div in soup.findAll(True, attrs={'class':['section-headline', 'story', 'story headline']}): - + if div['class'] == 'section-headline': key = string.capwords(feed_title(div)) articles[key] = [] ans.append(key) - + elif div['class'] in ['story', 'story headline']: a = div.find('a', href=True) if not a: @@ -66,21 +66,21 @@ class NYTimes(BasicNewsRecipe): summary = div.find(True, attrs={'class':'summary'}) if summary: description = self.tag_to_string(summary, use_alt=False) - + feed = key if key is not None else 'Uncategorized' if not articles.has_key(feed): articles[feed] = [] if not 'podcasts' in url: articles[feed].append( - dict(title=title, url=url, date=pubdate, + dict(title=title, url=url, date=pubdate, description=description, content='')) - ans = self.sort_index_by(ans, {'The Front Page':-1, - 'Dining In, Dining Out':1, + ans = self.sort_index_by(ans, {'The Front Page':-1, + 'Dining In, Dining Out':1, 'Obituaries':2}) ans = [(key, articles[key]) for key in ans if articles.has_key(key)] return ans - + def preprocess_html(self, soup): refresh = soup.find('meta', {'http-equiv':'refresh'}) if refresh is None: