Fix #4539 (Apostrophes not showing up in NYT recipe)

This commit is contained in:
Kovid Goyal 2010-01-25 10:04:30 -07:00
parent b542c8a090
commit 8eb3e165a0
4 changed files with 15 additions and 4 deletions

View File

@ -10,11 +10,18 @@ from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
def decode(self, src):
    """Decode the raw source of a downloaded page into unicode text.

    The source is treated as UTF-8 unless it mentions ``iso-8859-1``
    anywhere (in any letter case), in which case it is decoded as
    ``cp1252`` instead.

    :param self: The recipe object. Unused here; it is part of the
                 signature because this function is assigned as the
                 recipe's ``encoding`` callable.
    :param src: The raw, not yet decoded page source (a byte string).
    :return: The decoded source. Bytes that cannot be decoded with the
             chosen codec are dropped (``'ignore'`` error handling).
    """
    enc = 'utf-8'
    # Charset labels are case-insensitive, so ``ISO-8859-1`` must match as
    # well. The needle is a bytes literal because ``src`` is still undecoded.
    if b'iso-8859-1' in src.lower():
        # NOTE(review): presumably such pages declare latin1 but actually
        # use cp1252 punctuation (curly quotes, apostrophes) -- confirm.
        enc = 'cp1252'
    return src.decode(enc, 'ignore')
class NYTimes(BasicNewsRecipe):
title = 'The New York Times (subscription)'
__author__ = 'Kovid Goyal'
language = 'en'
requires_version = (0, 6, 36)
description = 'Daily news from the New York Times (subscription version)'
timefmt = ' [%a, %b %d, %Y]'
@ -27,7 +34,7 @@ class NYTimes(BasicNewsRecipe):
'side_tool', 'side_index',
'relatedArticles', 'relatedTopics', 'adxSponLink']),
dict(name=['script', 'noscript', 'style'])]
#encoding = 'cp1252'
encoding = decode
no_stylesheets = True
extra_css = 'h1 {font: sans-serif large;}\n.byline {font:monospace;}'

View File

@ -66,7 +66,7 @@ class RecipeInput(InputFormatPlugin):
if recipe.requires_version > numeric_version:
log.warn(
'Downloaded recipe needs calibre version at least: %s' % \
recipe.requires_version)
('.'.join(recipe.requires_version)))
builtin = True
except:
log.exception('Failed to compile downloaded recipe. Falling '

View File

@ -111,7 +111,9 @@ class BasicNewsRecipe(Recipe):
#: Specify an override encoding for sites that have an incorrect
#: charset specification. The most common being specifying ``latin1`` and
#: using ``cp1252``. If None, try to detect the encoding.
#: using ``cp1252``. If None, try to detect the encoding. If it is a
#: callable, it is called with two arguments: the recipe object and the
#: source to be decoded. It must return the decoded source.
encoding = None
#: Normally we try to guess if a feed has full articles embedded in it

View File

@ -403,7 +403,9 @@ class RecursiveFetcher(object):
if len(dsrc) == 0 or \
len(re.compile('<!--.*?-->', re.DOTALL).sub('', dsrc).strip()) == 0:
raise ValueError('No content at URL %s'%iurl)
if self.encoding is not None:
if callable(self.encoding):
dsrc = self.encoding(dsrc)
elif self.encoding is not None:
dsrc = dsrc.decode(self.encoding, 'replace')
else:
dsrc = xml_to_unicode(dsrc, self.verbose)[0]