From ebda738c8136455df0dd71cb857e8ca620386982 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Tue, 18 Jan 2011 13:49:12 -0700 Subject: [PATCH] Updated NY Times --- resources/recipes/nytimes_sub.recipe | 275 ++++++++++++++++----------- 1 file changed, 161 insertions(+), 114 deletions(-) diff --git a/resources/recipes/nytimes_sub.recipe b/resources/recipes/nytimes_sub.recipe index cdacc42d92..2424113e31 100644 --- a/resources/recipes/nytimes_sub.recipe +++ b/resources/recipes/nytimes_sub.recipe @@ -159,6 +159,11 @@ class NYTimes(BasicNewsRecipe): 'relatedSearchesModule', 'side_tool', 'singleAd', + 'entry entry-utility', #added for DealBook + 'entry-tags', #added for DealBook + 'footer promos clearfix', #added for DealBook + 'footer links clearfix', #added for DealBook + 'inlineImage module', #added for DealBook re.compile('^subNavigation'), re.compile('^leaderboard'), re.compile('^module'), @@ -192,6 +197,9 @@ class NYTimes(BasicNewsRecipe): 'side_index', 'side_tool', 'toolsRight', + 'skybox', #added for DealBook + 'TopAd', #added for DealBook + 'related-content', #added for DealBook ]), dict(name=['script', 'noscript', 'style','form','hr'])] no_stylesheets = True @@ -246,7 +254,7 @@ class NYTimes(BasicNewsRecipe): def exclude_url(self,url): if not url.startswith("http"): return True - if not url.endswith(".html"): + if not url.endswith(".html") and 'dealbook.nytimes.com' not in url: #added for DealBook return True if 'nytimes.com' not in url: return True @@ -569,7 +577,6 @@ class NYTimes(BasicNewsRecipe): def preprocess_html(self, soup): - if self.webEdition & (self.oldest_article>0): date_tag = soup.find(True,attrs={'class': ['dateline','date']}) if date_tag: @@ -592,128 +599,168 @@ class NYTimes(BasicNewsRecipe): img_div = soup.find('div','inlineImage module') if img_div: img_div.extract() + + return self.strip_anchors(soup) def postprocess_html(self,soup, True): - try: - if self.one_picture_per_article: - # Remove all images after first - largeImg = soup.find(True, {'class':'articleSpanImage'}) - inlineImgs = soup.findAll(True, {'class':'inlineImage module'}) - if largeImg: - for inlineImg in inlineImgs: - inlineImg.extract() - else: - if inlineImgs: - firstImg = inlineImgs[0] - for inlineImg in inlineImgs[1:]: - inlineImg.extract() - # Move firstImg before article body - cgFirst = soup.find(True, {'class':re.compile('columnGroup *first')}) - if cgFirst: - # Strip all sibling NavigableStrings: noise - navstrings = cgFirst.findAll(text=True, recursive=False) - [ns.extract() for ns in navstrings] - headline_found = False - tag = cgFirst.find(True) - insertLoc = 0 - while True: - insertLoc += 1 - if hasattr(tag,'class') and tag['class'] == 'articleHeadline': - headline_found = True - break - tag = tag.nextSibling - if not tag: - headline_found = False - break - if headline_found: - cgFirst.insert(insertLoc,firstImg) - else: - self.log(">>> No class:'columnGroup first' found <<<") - except: - self.log("ERROR: One picture per article in postprocess_html") - try: - # Change captions to italic - for caption in soup.findAll(True, {'class':'caption'}) : - if caption and len(caption) > 0: - cTag = Tag(soup, "p", [("class", "caption")]) - c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip() - mp_off = c.find("More Photos") - if mp_off >= 0: - c = c[:mp_off] - cTag.insert(0, c) - caption.replaceWith(cTag) - except: - self.log("ERROR: Problem in change captions to italic") + try: + if self.one_picture_per_article: + # Remove all images after first + largeImg = soup.find(True, {'class':'articleSpanImage'}) + inlineImgs = soup.findAll(True, {'class':'inlineImage module'}) + if largeImg: + for inlineImg in inlineImgs: + inlineImg.extract() + else: + if inlineImgs: + firstImg = inlineImgs[0] + for inlineImg in inlineImgs[1:]: + inlineImg.extract() + # Move firstImg before article body + cgFirst = soup.find(True, {'class':re.compile('columnGroup *first')}) + if cgFirst: + # Strip all sibling NavigableStrings: noise + navstrings = cgFirst.findAll(text=True, recursive=False) + [ns.extract() for ns in navstrings] + headline_found = False + tag = cgFirst.find(True) + insertLoc = 0 + while True: + insertLoc += 1 + if hasattr(tag,'class') and tag['class'] == 'articleHeadline': + headline_found = True + break + tag = tag.nextSibling + if not tag: + headline_found = False + break + if headline_found: + cgFirst.insert(insertLoc,firstImg) + else: + self.log(">>> No class:'columnGroup first' found <<<") + except: + self.log("ERROR: One picture per article in postprocess_html") - try: - # Change to

- h1 = soup.find('h1') - if h1: - headline = h1.find("nyt_headline") - if headline: - tag = Tag(soup, "h2") - tag['class'] = "headline" - tag.insert(0, self.fixChars(headline.contents[0])) - h1.replaceWith(tag) - else: - # Blog entry - replace headline, remove
tags - headline = soup.find('title') - if headline: - tag = Tag(soup, "h2") - tag['class'] = "headline" - tag.insert(0, self.fixChars(headline.contents[0])) - soup.insert(0, tag) - hrs = soup.findAll('hr') - for hr in hrs: - hr.extract() - except: - self.log("ERROR: Problem in Change to

") + try: + # Change captions to italic + for caption in soup.findAll(True, {'class':'caption'}) : + if caption and len(caption) > 0: + cTag = Tag(soup, "p", [("class", "caption")]) + c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip() + mp_off = c.find("More Photos") + if mp_off >= 0: + c = c[:mp_off] + cTag.insert(0, c) + caption.replaceWith(cTag) + except: + self.log("ERROR: Problem in change captions to italic") - try: - # Change

to

- used in editorial blogs - masthead = soup.find("h1") - if masthead: - # Nuke the href - if masthead.a: - del(masthead.a['href']) - tag = Tag(soup, "h3") - tag.insert(0, self.fixChars(masthead.contents[0])) - masthead.replaceWith(tag) - except: - self.log("ERROR: Problem in Change

to

- used in editorial blogs") + try: + # Change to

+ h1 = soup.find('h1') + blogheadline = str(h1) #added for dealbook + if h1: + headline = h1.find("nyt_headline") + if headline: + tag = Tag(soup, "h2") + tag['class'] = "headline" + tag.insert(0, self.fixChars(headline.contents[0])) + h1.replaceWith(tag) + elif blogheadline.find('entry-title'):#added for dealbook + tag = Tag(soup, "h2")#added for dealbook + tag['class'] = "headline"#added for dealbook + tag.insert(0, self.fixChars(h1.contents[0]))#added for dealbook + h1.replaceWith(tag)#added for dealbook - try: - # Change to - for subhead in soup.findAll(True, {'class':'bold'}) : - if subhead.contents: - bTag = Tag(soup, "b") - bTag.insert(0, subhead.contents[0]) - subhead.replaceWith(bTag) - except: - self.log("ERROR: Problem in Change

to

- used in editorial blogs") + else: + # Blog entry - replace headline, remove
tags - BCC I think this is no longer functional 1-18-2011 + headline = soup.find('title') + if headline: + tag = Tag(soup, "h2") + tag['class'] = "headline" + tag.insert(0, self.fixChars(headline.renderContents())) + soup.insert(0, tag) + hrs = soup.findAll('hr') + for hr in hrs: + hr.extract() + except: + self.log("ERROR: Problem in Change to

") - try: - divTag = soup.find('div',attrs={'id':'articleBody'}) - if divTag: - divTag['class'] = divTag['id'] - except: - self.log("ERROR: Problem in soup.find(div,attrs={id:articleBody})") + try: + #if this is from a blog (dealbook, fix the byline format + bylineauthor = soup.find('address',attrs={'class':'byline author vcard'}) + if bylineauthor: + tag = Tag(soup, "h6") + tag['class'] = "byline" + tag.insert(0, self.fixChars(bylineauthor.renderContents())) + bylineauthor.replaceWith(tag) + except: + self.log("ERROR: fixing byline author format") - try: - # Add class="authorId" to
so we can format with CSS - divTag = soup.find('div',attrs={'id':'authorId'}) - if divTag and divTag.contents[0]: - tag = Tag(soup, "p") - tag['class'] = "authorId" - tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0], - use_alt=False))) - divTag.replaceWith(tag) - except: - self.log("ERROR: Problem in Add class=authorId to
so we can format with CSS") + try: + #if this is a blog (dealbook) fix the credit style for the pictures + blogcredit = soup.find('div',attrs={'class':'credit'}) + if blogcredit: + tag = Tag(soup, "h6") + tag['class'] = "credit" + tag.insert(0, self.fixChars(blogcredit.renderContents())) + blogcredit.replaceWith(tag) + except: + self.log("ERROR: fixing credit format") - return soup + + try: + # Change

to

- used in editorial blogs + masthead = soup.find("h1") + if masthead: + # Nuke the href + if masthead.a: + del(masthead.a['href']) + tag = Tag(soup, "h3") + tag.insert(0, self.fixChars(masthead.contents[0])) + masthead.replaceWith(tag) + except: + self.log("ERROR: Problem in Change

to

- used in editorial blogs") + + try: + # Change to + for subhead in soup.findAll(True, {'class':'bold'}) : + if subhead.contents: + bTag = Tag(soup, "b") + bTag.insert(0, subhead.contents[0]) + subhead.replaceWith(bTag) + except: + self.log("ERROR: Problem in Change

to

- used in editorial blogs") + try: + #remove the update tag + blogupdated = soup.find('span', {'class':'update'}) + if blogupdated: + blogupdated.replaceWith("") + except: + self.log("ERROR: Removing strong tag") + + try: + divTag = soup.find('div',attrs={'id':'articleBody'}) + if divTag: + divTag['class'] = divTag['id'] + except: + self.log("ERROR: Problem in soup.find(div,attrs={id:articleBody})") + + try: + # Add class="authorId" to
so we can format with CSS + divTag = soup.find('div',attrs={'id':'authorId'}) + if divTag and divTag.contents[0]: + tag = Tag(soup, "p") + tag['class'] = "authorId" + tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0], + use_alt=False))) + divTag.replaceWith(tag) + except: + self.log("ERROR: Problem in Add class=authorId to
so we can format with CSS") + + return soup def populate_article_metadata(self, article, soup, first): shortparagraph = "" try: