From 3a12b18dc353a7256d30c55267af94f035a97338 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Wed, 2 Jun 2010 10:46:53 -0600
Subject: [PATCH] One more fix for NYTimes

---
 resources/recipes/nytimes.recipe     |  8 ++++++--
 resources/recipes/nytimes_sub.recipe | 16 ++++++----------
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/resources/recipes/nytimes.recipe b/resources/recipes/nytimes.recipe
index 33758e8c47..eba717027e 100644
--- a/resources/recipes/nytimes.recipe
+++ b/resources/recipes/nytimes.recipe
@@ -391,10 +391,14 @@ class NYTimes(BasicNewsRecipe):
         return ans
 
     def preprocess_html(self, soup):
-        # Skip ad pages before actual article
+        # Skip ad pages served before actual article
         skip_tag = soup.find(True, {'name':'skip'})
         if skip_tag is not None:
-            soup = self.index_to_soup(skip_tag.parent['href'])
+            self.log.error("Found forwarding link: %s" % skip_tag.parent['href'])
+            url = 'http://www.nytimes.com' + re.sub(r'\?.*', '', skip_tag.parent['href'])
+            url += '?pagewanted=all'
+            self.log.error("Skipping ad to article at '%s'" % url)
+            soup = self.index_to_soup(url)
         return self.strip_anchors(soup)
 
     def postprocess_html(self,soup, True):
diff --git a/resources/recipes/nytimes_sub.recipe b/resources/recipes/nytimes_sub.recipe
index 79c0d49223..c08b06572d 100644
--- a/resources/recipes/nytimes_sub.recipe
+++ b/resources/recipes/nytimes_sub.recipe
@@ -280,18 +280,14 @@ class NYTimes(BasicNewsRecipe):
         return ans
 
     def preprocess_html(self, soup):
-        '''
-        refresh = soup.find('meta', {'http-equiv':'refresh'})
-        if refresh is None:
-            return soup
-        content = refresh.get('content').partition('=')[2]
-        raw = self.browser.open('http://www.nytimes.com'+content).read()
-        return BeautifulSoup(raw.decode('cp1252', 'replace'))
-        '''
-        # Skip ad pages before actual article
+        # Skip ad pages served before actual article
         skip_tag = soup.find(True, {'name':'skip'})
         if skip_tag is not None:
-            soup = self.index_to_soup(skip_tag.parent['href'])
+            self.log.error("Found forwarding link: %s" % skip_tag.parent['href'])
+            url = 'http://www.nytimes.com' + re.sub(r'\?.*', '', skip_tag.parent['href'])
+            url += '?pagewanted=all'
+            self.log.error("Skipping ad to article at '%s'" % url)
+            soup = self.index_to_soup(url)
         return self.strip_anchors(soup)
 
     def postprocess_html(self,soup, True):