diff --git a/src/calibre/constants.py b/src/calibre/constants.py
index b02f92bc17..9e89f08978 100644
--- a/src/calibre/constants.py
+++ b/src/calibre/constants.py
@@ -2,7 +2,7 @@
 __license__   = 'GPL v3'
 __copyright__ = '2008, Kovid Goyal <kovid@kovidgoyal.net>'
 __docformat__ = 'restructuredtext en'
 __appname__   = 'calibre'
-__version__   = '0.6.0b12'
+__version__   = '0.6.0b13'
 __author__    = "Kovid Goyal <kovid@kovidgoyal.net>"
 import re
diff --git a/src/calibre/web/feeds/__init__.py b/src/calibre/web/feeds/__init__.py
index 5bf1260df4..13f1b387bc 100644
--- a/src/calibre/web/feeds/__init__.py
+++ b/src/calibre/web/feeds/__init__.py
@@ -19,14 +19,14 @@ class Article(object):
     def __init__(self, id, title, url, author, summary, published, content):
         self.downloaded = False
         self.id = id
-        self.title = title.strip() if title else title
+        self._title = title.strip() if title else title
         try:
-            self.title = re.sub(r'&(\S+);',
-                entity_to_unicode, self.title)
+            self._title = re.sub(r'&(\S+);',
+                entity_to_unicode, self._title)
         except:
             pass
-        if not isinstance(self.title, unicode):
-            self.title = self.title.decode('utf-8', 'replace')
+        if not isinstance(self._title, unicode):
+            self._title = self._title.decode('utf-8', 'replace')
         self.url = url
         self.author = author
         if author and not isinstance(author, unicode):
@@ -50,6 +50,17 @@ class Article(object):
             self.utctime = datetime(*self.date[:6])
             self.localtime = self.utctime + self.time_offset

+    @dynamic_property
+    def title(self):
+        def fget(self):
+            t = self._title
+            if not isinstance(t, unicode) and hasattr(t, 'decode'):
+                t = t.decode('utf-8', 'replace')
+            return t
+        def fset(self, val):
+            self._title = val
+        return property(fget=fget, fset=fset)
+
     def __repr__(self):
         return \
diff --git a/src/calibre/web/feeds/recipes/recipe_nytimes.py b/src/calibre/web/feeds/recipes/recipe_nytimes.py
index e50702ede5..04313a1172 100644
--- a/src/calibre/web/feeds/recipes/recipe_nytimes.py
+++ b/src/calibre/web/feeds/recipes/recipe_nytimes.py
@@ -6,6 +6,7 @@ __copyright__ = '2008, Kovid Goyal <kovid@kovidgoyal.net>'
 nytimes.com
 '''
 import re
+from calibre import entity_to_unicode
 from calibre.web.feeds.recipes import BasicNewsRecipe
 from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag

@@ -19,15 +20,16 @@ class NYTimes(BasicNewsRecipe):
     timefmt = ''
     needs_subscription = True
     remove_tags_after  = dict(attrs={'id':['comments']})
-    remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool', 'nextArticleLink',
+    remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool', 'nextArticleLink',
                                 'clearfix', 'nextArticleLink clearfix','inlineSearchControl',
                                 'columnGroup','entry-meta','entry-response module','jumpLink','nav',
-                                'columnGroup advertisementColumnGroup']}),
-                   dict(id=['footer', 'toolsRight', 'articleInline', 'navigation', 'archive',
+                                'columnGroup advertisementColumnGroup', 'kicker entry-category']}),
+                   dict(id=['footer', 'toolsRight', 'articleInline', 'navigation', 'archive',
                             'side_search', 'blog_sidebar', 'side_tool', 'side_index', 'login',
-                            'blog-header','searchForm','NYTLogo','insideNYTimes']),
+                            'blog-header','searchForm','NYTLogo','insideNYTimes','adxToolSponsor',
+                            'adxLeaderboard']),
                    dict(name=['script', 'noscript', 'style','hr'])]
-    encoding = None
+    encoding = 'cp1252'
     no_stylesheets = True
     #extra_css = 'h1 {font: sans-serif large;}\n.byline {font:monospace;}'
     extra_css = '.headline {text-align:left;}\n\
@@ -37,6 +39,8 @@ class NYTimes(BasicNewsRecipe):

     flatPeriodical = True
+    feed = None
+    ans = []

     def get_browser(self):
         br = BasicNewsRecipe.get_browser()
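
A note on the Article change in src/calibre/web/feeds/__init__.py above: the
title is now stored in a private _title attribute and decoded each time it is
read, so a bytestring assigned after construction can no longer escape
undecoded. Below is a minimal standalone sketch of that behaviour (Python 2;
the two-line dynamic_property stand-in mirrors what I believe calibre's helper
of the same name does, an assumption worth checking against
src/calibre/__init__.py):

    def dynamic_property(func):
        # stand-in for calibre's helper: call the decorated function once
        # to build and return the property object
        return func(None)

    class Article(object):
        def __init__(self, title):
            self._title = title

        @dynamic_property
        def title(self):
            def fget(self):
                t = self._title
                # decode raw bytes on every access; never store them back
                if not isinstance(t, unicode) and hasattr(t, 'decode'):
                    t = t.decode('utf-8', 'replace')
                return t
            def fset(self, val):
                self._title = val
            return property(fget=fget, fset=fset)

    a = Article('Caf\xc3\xa9 society')   # raw UTF-8 bytes
    a.title = 'm\xc3\xa9nage'            # the setter accepts bytes too
    print repr(a.title)                  # u'm\xe9nage'
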
@@ -48,31 +52,76 @@ class NYTimes(BasicNewsRecipe):
         br.submit()
         return br

+    def index_to_soup(self, url_or_raw, raw=False):
+        '''
+        Convenience method that takes an URL to the index page and returns
+        a `BeautifulSoup <http://www.crummy.com/software/BeautifulSoup/documentation.html>`_
+        of it.
+
+        This is an OVERRIDE of the method provided in news.py to solve an encoding problem
+        with NYTimes index pages which seem to be encoded in a wonderful blend
+
+        `url_or_raw`: Either a URL or the downloaded index page as a string
+        '''
+        def get_the_soup(docEncoding, url_or_raw, raw=False) :
+            if re.match(r'\w+://', url_or_raw):
+                f = self.browser.open(url_or_raw)
+                _raw = f.read()
+                f.close()
+                if not _raw:
+                    raise RuntimeError('Could not fetch index from %s'%url_or_raw)
+            else:
+                _raw = url_or_raw
+            if raw:
+                return _raw
+
+            if not isinstance(_raw, unicode) and self.encoding:
+                _raw = _raw.decode(docEncoding, 'replace')
+            massage = list(BeautifulSoup.MARKUP_MASSAGE)
+            massage.append((re.compile(r'&(\S+?);'), lambda match: entity_to_unicode(match, encoding=self.encoding)))
+            return BeautifulSoup(_raw, markupMassage=massage)
+
+        # Entry point
+        soup = get_the_soup( self.encoding, url_or_raw )
+        contentType = soup.find(True,attrs={'http-equiv':'Content-Type'})
+        docEncoding = str(contentType)[str(contentType).find('charset=') + len('charset='):str(contentType).rfind('"')]
+        if docEncoding == '' :
+            docEncoding = self.encoding
+
+        if self.verbose :
+            self.log( " document encoding: '%s'" % docEncoding)
+        if docEncoding != self.encoding :
+            soup = get_the_soup(docEncoding, url_or_raw)
+
+        return soup
+
     def parse_index(self):
-        soup = self.index_to_soup('http://www.nytimes.com/pages/todaysheadlines/')
-
-        def feed_title(div):
-            return ''.join(div.findAll(text=True, recursive=False)).strip()
-
         articles = {}
-        ans = []

         if self.flatPeriodical :
-            feed = key = 'All Top Stories'
+            self.feed = key = 'All Top Stories'
             articles[key] = []
-            ans.append(key)
+            self.ans.append(key)
         else :
             key = None

+        '''
+        def feed_title(div):
+            return ''.join(div.findAll(text=True, recursive=False)).strip()
+        '''
+
         sections = { 'arts'             : 'Arts',
                      'business'         : 'Business',
                      'editorials'       : 'Editorials',
                      'health'           : 'Health',
                      'magazine'         : 'Magazine',
                      'mediaadvertising' : 'Media & Advertising',
                      'newyorkregion'    : 'New York/Region',
                      'oped'             : 'Op-Ed',
                      'politics'         : 'Politics',
                      'science'          : 'Science',
                      'sports'           : 'Sports',
                      'technology'       : 'Technology',
                      'topstories'       : 'Top Stories',
@@ -81,8 +130,18 @@ class NYTimes(BasicNewsRecipe):
                      'world'            : 'World'
                    }

-        #excludeSectionKeywords = ['World','U.S.', 'Politics','Business','Technology','Sports','Arts','New York','Travel', 'Editorials', 'Op-Ed']
-        excludeSectionKeywords = []
+        '''
+        excludeSectionKeywords = ['Arts','Business','Editorials','Health','Magazine','Media',
+                                  'New York','Op-Ed','Politics','Science','Sports','Technology',
+                                  'Top Stories','Travel','U.S.','World']
+        '''
+        excludeSectionKeywords = ['Arts','Business','Editorials','Health','Magazine','Media',
+                                  'New York','Politics','Science','Sports','Technology',
+                                  'Top Stories','Travel','U.S.','World']
+
+        #excludeSectionKeywords = []
+
+        soup = self.index_to_soup('http://www.nytimes.com/pages/todaysheadlines/')

         # Fetch the outer table
         table = soup.find('table')
@@ -164,7 +223,7 @@ class NYTimes(BasicNewsRecipe):

             if not self.flatPeriodical :
                 articles[key] = []
-                ans.append(key)
+                self.ans.append(key)

         # Get the bylines and descriptions
         if not skipThisSection :
@@ -192,7 +251,7 @@ class NYTimes(BasicNewsRecipe):
                     title = self.tag_to_string(a, use_alt=True)
                     if self.flatPeriodical :
                         # prepend the section name
-                        title = sections[section] + " : " + title
+                        title = sections[section] + " · " + title
                     if not isinstance(title, unicode):
                         title = title.decode('utf-8', 'replace')
                     description = descriptions[i]
@@ -201,28 +260,43 @@ class NYTimes(BasicNewsRecipe):
                     else :
                         author = None
-
                     if self.verbose > 2 : self.log( "      title: %s" % title)
                     if self.verbose > 2 : self.log( "        url: %s" % url)
                     if self.verbose > 2 : self.log( "     author: %s" % author)
                     if self.verbose > 2 : self.log( "description: %s" % description)

                     if not self.flatPeriodical :
-                        feed = key
+                        self.feed = key

-                    if not articles.has_key(feed):
-                        if self.verbose > 2 : self.log( "adding %s to articles[]" % feed)
-                        articles[feed] = []
-                    if self.verbose > 2 : self.log( " adding: %s to articles[%s]\n" % (title, feed))
-                    articles[feed].append(
+                    # Check for duplicates
+                    duplicateFound = False
+                    if self.flatPeriodical and len(articles[self.feed]) > 1:
+                        #print articles[self.feed]
+                        for article in articles[self.feed] :
+                            #print "comparing %s\n %s\n" % (url, article['url'])
+                            if url == article['url'] :
+                                duplicateFound = True
+                                break
+                        #print
+
+                    if duplicateFound:
+                        # Continue fetching, don't add this article
+                        print " skipping duplicate %s" % article['url']
+                        continue
+
+                    if not articles.has_key(self.feed):
+                        if self.verbose > 2 : self.log( "adding %s to articles[]" % self.feed)
+                        articles[self.feed] = []
+                    if self.verbose > 2 : self.log( " adding: %s to articles[%s]\n" % (title, self.feed))
+                    articles[self.feed].append(
                         dict(title=title, url=url, date=pubdate,
                              description=description, author=author, content=''))

-        ans = self.sort_index_by(ans, {'Top Stories':-1})
-        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
+        self.ans = self.sort_index_by(self.ans, {'Top Stories':-1})
+        self.ans = [(key, articles[key]) for key in self.ans if articles.has_key(key)]
         #sys.exit(1)
-        return ans
+        return self.ans

     def preprocess_html(self, soup):
         refresh = soup.find('meta', {'http-equiv':'refresh'})
@@ -286,17 +360,3 @@ class NYTimes(BasicNewsRecipe):

         return soup

-    def postprocess_book(self, oeb, opts, log) :
-        log( " ********** recipe.postprocess_book ********** ")
-        log( list(oeb.toc) )
-        log( "oeb: %s" % oeb.toc)
-        log( "opts: %s" % opts.verbose)
-        for sections in oeb.toc :
-            log( "section:")
-            for articleTOC in sections:
-                log( "      title: %s" % articleTOC.title)
-                log( "     author: %s" % articleTOC.author)
-                log( "description: %s" % articleTOC.description)
-                log( "       href: %s" % articleTOC.href)
-                log( "    content: %s" % oeb.manifest.hrefs[articleTOC.href])
-        return
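
Two notes on the recipe changes above. First, the index_to_soup override
implements a two-pass decode: parse the page with the recipe's declared
encoding (cp1252), read the charset the page advertises in its Content-Type
meta tag, and rebuild the soup with that charset if the two disagree. A
condensed sketch of the same strategy, with hypothetical names and a regex
standing in for the override's str() slicing; it assumes the advertised
charset is a codec name Python knows:

    import re

    def declared_charset(raw, fallback='cp1252'):
        # charset advertised in <meta http-equiv="Content-Type" ...>, if any
        m = re.search(r'http-equiv=.Content-Type.[^>]*charset=([\w-]+)', raw, re.I)
        return m.group(1) if m else fallback

    def decode_index(raw, recipe_encoding='cp1252'):
        text = raw.decode(recipe_encoding, 'replace')    # first pass
        page_encoding = declared_charset(raw, recipe_encoding)
        if page_encoding.lower() != recipe_encoding.lower():
            text = raw.decode(page_encoding, 'replace')  # second pass wins
        return text
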
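Second, the duplicate check added to parse_index rescans articles[self.feed]
linearly for every candidate URL, which is quadratic over a large front page.
A set of already-seen URLs does the same job with one constant-time lookup per
article; this is an illustrative alternative, not what the patch does:

    seen_urls = set()

    def is_duplicate(url):
        # True if this URL was already queued for the feed
        if url in seen_urls:
            return True
        seen_urls.add(url)
        return False
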