From 8eb3e165a0b03617af5d6d512563291df8be4a37 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Mon, 25 Jan 2010 10:04:30 -0700
Subject: [PATCH] Fix #4539 (Apostrophes not showing up in NYT recipe)

---
Review notes (this section is ignored when the patch is applied):

* nytimes_sub.recipe: ``encoding`` is set to the module-level ``decode``
  callable, which decodes the page as cp1252 when the raw source contains
  the text 'iso-8859-1' and as utf-8 otherwise (undecodable bytes are
  dropped). The recipe also declares requires_version = (0, 6, 36).
* input.py: the version tuple is passed through ``map(str, ...)`` before
  being joined, because ``str.join`` raises TypeError on non-string items
  and requires_version holds integers.
* news.py: documents that ``encoding`` may be a callable.
* simple.py: RecursiveFetcher calls ``self.encoding`` when it is callable
  instead of treating it as a codec name.
* Leading whitespace and blank context lines were reconstructed from a
  whitespace-collapsed copy; verify them against the target tree.

 resources/recipes/nytimes_sub.recipe | 9 ++++++++-
 src/calibre/web/feeds/input.py       | 2 +-
 src/calibre/web/feeds/news.py        | 4 +++-
 src/calibre/web/fetch/simple.py      | 4 +++-
 4 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/resources/recipes/nytimes_sub.recipe b/resources/recipes/nytimes_sub.recipe
index 9944f919be..ebc97b561c 100644
--- a/resources/recipes/nytimes_sub.recipe
+++ b/resources/recipes/nytimes_sub.recipe
@@ -10,11 +10,18 @@ from calibre import strftime
 from calibre.web.feeds.recipes import BasicNewsRecipe
 from calibre.ebooks.BeautifulSoup import BeautifulSoup
 
+def decode(self, src):
+    enc = 'utf-8'
+    if 'iso-8859-1' in src:
+        enc = 'cp1252'
+    return src.decode(enc, 'ignore')
+
 class NYTimes(BasicNewsRecipe):
 
     title = 'The New York Times (subscription)'
     __author__ = 'Kovid Goyal'
     language = 'en'
+    requires_version = (0, 6, 36)
     description = 'Daily news from the New York Times (subscription version)'
     timefmt = ' [%a, %b %d, %Y]'
 
@@ -27,7 +34,7 @@ class NYTimes(BasicNewsRecipe):
                             'side_tool', 'side_index',
                             'relatedArticles', 'relatedTopics', 'adxSponLink']),
                    dict(name=['script', 'noscript', 'style'])]
-    #encoding = 'cp1252'
+    encoding = decode
     no_stylesheets = True
     extra_css = 'h1 {font: sans-serif large;}\n.byline {font:monospace;}'
 
diff --git a/src/calibre/web/feeds/input.py b/src/calibre/web/feeds/input.py
index 5015c1bef2..adb2f13a56 100644
--- a/src/calibre/web/feeds/input.py
+++ b/src/calibre/web/feeds/input.py
@@ -66,7 +66,7 @@ class RecipeInput(InputFormatPlugin):
                     if recipe.requires_version > numeric_version:
                         log.warn(
                         'Downloaded recipe needs calibre version at least: %s' % \
-                        recipe.requires_version)
+                        ('.'.join(map(str, recipe.requires_version))))
                         builtin = True
                 except:
                     log.exception('Failed to compile downloaded recipe. Falling '
diff --git a/src/calibre/web/feeds/news.py b/src/calibre/web/feeds/news.py
index 60b5ad0174..61de20c2d6 100644
--- a/src/calibre/web/feeds/news.py
+++ b/src/calibre/web/feeds/news.py
@@ -111,7 +111,9 @@ class BasicNewsRecipe(Recipe):
 
     #: Specify an override encoding for sites that have an incorrect
     #: charset specification. The most common being specifying ``latin1`` and
-    #: using ``cp1252``. If None, try to detect the encoding.
+    #: using ``cp1252``. If None, try to detect the encoding. If it is a
+    #: callable, the callable is called with two arguments: The recipe object
+    #: and the source to be decoded. It must return the decoded source.
     encoding = None
 
     #: Normally we try to guess if a feed has full articles embedded in it
diff --git a/src/calibre/web/fetch/simple.py b/src/calibre/web/fetch/simple.py
index 620850a762..f97382190d 100644
--- a/src/calibre/web/fetch/simple.py
+++ b/src/calibre/web/fetch/simple.py
@@ -403,7 +403,9 @@ class RecursiveFetcher(object):
                 if len(dsrc) == 0 or \
                    len(re.compile('<!--.*?-->', re.DOTALL).sub('', dsrc).strip()) == 0:
                     raise ValueError('No content at URL %s'%iurl)
-                if self.encoding is not None:
+                if callable(self.encoding):
+                    dsrc = self.encoding(dsrc)
+                elif self.encoding is not None:
                     dsrc = dsrc.decode(self.encoding, 'replace')
                 else:
                     dsrc = xml_to_unicode(dsrc, self.verbose)[0]