#!/usr/bin/env python
__license__   = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid@kovidgoyal.net>'
'''
nytimes.com
'''
import re
import time

from calibre import entity_to_unicode
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString, \
    Comment, BeautifulStoneSoup


class NYTimes(BasicNewsRecipe):

    title       = 'New York Times Top Stories'
    __author__  = 'GRiker'
    language    = 'en'
    description = 'Top Stories from the New York Times'

    # List of sections typically included in Top Stories. Use a keyword from the
    # right column in the excludeSectionKeywords[] list to skip downloading that
    # section.
    sections = {
         'arts'             : 'Arts',
         'business'         : 'Business',
         'diningwine'       : 'Dining & Wine',
         'editorials'       : 'Editorials',
         'health'           : 'Health',
         'magazine'         : 'Magazine',
         'mediaadvertising' : 'Media & Advertising',
         'newyorkregion'    : 'New York/Region',
         'oped'             : 'Op-Ed',
         'politics'         : 'Politics',
         'science'          : 'Science',
         'sports'           : 'Sports',
         'technology'       : 'Technology',
         'topstories'       : 'Top Stories',
         'travel'           : 'Travel',
         'us'               : 'U.S.',
         'world'            : 'World'
         }
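    # parse_index() recovers each section's key from HTML comments embedded in
    # the Today's Headlines page (of the form <!-- start(name=world) -->, as
    # the parsing logic below assumes), then maps it through sections{} to a
    # display name, e.g. 'world' -> 'World'.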
    # Add section keywords from the right column above to skip that section.
    # For example, to skip sections containing the word 'Sports' or 'Dining', use:
    # excludeSectionKeywords = ['Sports', 'Dining']
    # To fetch only Business and Technology:
    # excludeSectionKeywords = ['Arts','Dining','Editorials','Health','Magazine','Media','Region','Op-Ed','Politics','Science','Sports','Top Stories','Travel','U.S.','World']
    # To fetch only Top Stories:
    # excludeSectionKeywords = ['Arts','Business','Dining','Editorials','Health','Magazine','Media','Region','Op-Ed','Politics','Science','Sports','Technology','Travel','U.S.','World']
    # By default, no sections are skipped.
    excludeSectionKeywords = []

    # one_picture_per_article specifies that calibre should only use the first image
    # from an article (if one exists). If one_picture_per_article = True, the image
    # will be moved to a location between the headline and the byline.
    # If one_picture_per_article = False, all images from the article will be included
    # and shown in their original location.
    one_picture_per_article = True

    # The maximum number of articles that will be downloaded
    max_articles_per_feed = 40

    timefmt = ''
    needs_subscription = True
    masthead_url = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'

    remove_tags_before = dict(id='article')
    remove_tags_after  = dict(id='article')
    remove_tags = [dict(attrs={'class':[
                            'articleFooter',
                            'articleTools',
                            'columnGroup doubleRule',
                            'columnGroup singleRule',
                            'columnGroup last',
                            'doubleRule',
                            'dottedLine',
                            'entry-meta',
                            'icon enlargeThis',
                            'leftNavTabs',
                            'module box nav',
                            'nextArticleLink',
                            'nextArticleLink clearfix',
                            'post-tools',
                            'relatedSearchesModule',
                            'side_tool',
                            'singleAd',
                            'subNavigation tabContent active',
                            'subNavigation tabContent active clearfix',
                            ]}),
                   dict(id=[
                            'adxLeaderboard',
                            'archive',
                            'articleExtras',
                            'articleInline',
                            'blog_sidebar',
                            'businessSearchBar',
                            'cCol',
                            'entertainmentSearchBar',
                            'footer',
                            'header',
                            'header_search',
                            'login',
                            'masthead',
                            'masthead-nav',
                            'memberTools',
                            'navigation',
                            'portfolioInline',
                            'relatedArticles',
                            'side_search',
                            'side_index',
                            'side_tool',
                            'toolsRight',
                            ]),
                   dict(name=['script', 'noscript', 'style'])]

    no_stylesheets = True
    extra_css = '.headline {text-align: left;}\n \
                 .byline {font-family: monospace; \
                          text-align: left; \
                          margin-top: 0px; \
                          margin-bottom: 0px;}\n \
                 .dateline {font-size: small; \
                            margin-top: 0px; \
                            margin-bottom: 0px;}\n \
                 .timestamp {font-size: small; \
                             margin-top: 0px; \
                             margin-bottom: 0px;}\n \
                 .source {text-align: left;}\n \
                 .image {text-align: center;}\n \
                 .credit {text-align: right; \
                          font-size: small; \
                          margin-top: 0px; \
                          margin-bottom: 0px;}\n \
                 .articleBody {text-align: left;}\n \
                 .authorId {text-align: left; \
                            font-style: italic;}\n '

    def dump_ans(self, ans):
        # Log a summary of what was queued, section by section
        total_article_count = 0
        for section in ans:
            if self.verbose:
                self.log("section %s: %d articles" % (section[0], len(section[1])))
            for article in section[1]:
                total_article_count += 1
                if self.verbose:
                    self.log("\t%-40.40s... \t%-60.60s..." %
                             (article['title'].encode('cp1252','replace'),
                              article['url'].encode('cp1252','replace')))
        self.log("Queued %d articles" % total_article_count)
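    # fixChars() below rewrites stray Windows-1252 control bytes as HTML
    # entities, so curly quotes and dashes survive on readers that mishandle
    # cp1252. A sketch of the intended mapping:
    #
    #   "\x93Hello\x94 \x97 world"  ->  "&#8220;Hello&#8221; &#8212; world"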
    def fixChars(self, string):
        # Replace lsquo (\x91)
        fixed = re.sub("\x91", "&#8216;", string)
        # Replace rsquo (\x92)
        fixed = re.sub("\x92", "&#8217;", fixed)
        # Replace ldquo (\x93)
        fixed = re.sub("\x93", "&#8220;", fixed)
        # Replace rdquo (\x94)
        fixed = re.sub("\x94", "&#8221;", fixed)
        # Replace ndash (\x96)
        fixed = re.sub("\x96", "&#8211;", fixed)
        # Replace mdash (\x97)
        fixed = re.sub("\x97", "&#8212;", fixed)
        return fixed

    def get_browser(self):
        br = BasicNewsRecipe.get_browser()
        if self.username is not None and self.password is not None:
            try:
                br.open('http://www.nytimes.com/auth/login')
                br.select_form(name='login')
                br['USERID']   = self.username
                br['PASSWORD'] = self.password
                br.submit()
            except:
                self.log("\nFailed to login")
        return br

    def get_cover_url(self):
        cover = None
        st = time.localtime()
        year = str(st.tm_year)
        month = "%.2d" % st.tm_mon
        day = "%.2d" % st.tm_mday
        cover = 'http://graphics8.nytimes.com/images/' + year + '/' + month + '/' + \
                day + '/nytfrontpage/scan.jpg'
        br = BasicNewsRecipe.get_browser()
        try:
            br.open(cover)
        except:
            self.log("\nCover unavailable")
            cover = None
        return cover

    def index_to_soup(self, url_or_raw, raw=False):
        '''
        OVERRIDE of class method
        deals with various page encodings between index and articles
        '''
        def get_the_soup(docEncoding, url_or_raw, raw=False):
            if re.match(r'\w+://', url_or_raw):
                f = self.browser.open(url_or_raw)
                _raw = f.read()
                f.close()
                if not _raw:
                    raise RuntimeError('Could not fetch index from %s' % url_or_raw)
            else:
                _raw = url_or_raw
            if raw:
                return _raw

            if not isinstance(_raw, unicode) and self.encoding:
                _raw = _raw.decode(docEncoding, 'replace')
            massage = list(BeautifulSoup.MARKUP_MASSAGE)
            massage.append((re.compile(r'&(\S+?);'), lambda match:
                            entity_to_unicode(match, encoding=self.encoding)))
            return BeautifulSoup(_raw, markupMassage=massage)

        # Entry point: parse with the default encoding first
        soup = get_the_soup(self.encoding, url_or_raw)
        contentType = soup.find(True, attrs={'http-equiv':'Content-Type'})
        docEncoding = str(contentType)[str(contentType).find('charset=') +
                                       len('charset='):str(contentType).rfind('"')]
        if docEncoding == '':
            docEncoding = self.encoding

        if self.verbose > 2:
            self.log(" document encoding: '%s'" % docEncoding)
        # Reparse if the document declares a different encoding
        if docEncoding != self.encoding:
            soup = get_the_soup(docEncoding, url_or_raw)

        return soup

    def massageNCXText(self, description):
        # Kindle TOC descriptions won't render certain characters
        if description:
            massaged = unicode(BeautifulStoneSoup(description,
                               convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
            # Replace '&amp;' with '&'
            massaged = re.sub("&amp;", "&", massaged)
            return self.fixChars(massaged)
        else:
            return description

    def parse_index(self):
        articles = {}
        ans = []

        feed = key = 'All Top Stories'
        articles[key] = []
        ans.append(key)
        self.log("Scanning 1 section ...")

        soup = self.index_to_soup('http://www.nytimes.com/pages/todaysheadlines/')

        # Fetch the outer table
        table = soup.find('table')
        previousTable = table

        # Find the deepest table containing the stories
        while True:
            table = table.find('table')
            if table.find(text=re.compile('top stories start')):
                previousTable = table
                continue
            else:
                table = previousTable
                break

        # There are multiple subtables; find the one containing the stories
        for block in table.findAll('table'):
            if block.find(text=re.compile('top stories start')):
                table = block
                break
            else:
                continue

        # Again there are multiple subtables; find the one containing the stories
        for storyblock in table.findAll('table'):
            if storyblock.find(text=re.compile('top stories start')):
                break
            else:
                continue
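        # The drilling above, and the scanning below, assume the Today's
        # Headlines page's nested-table markup, roughly like this (a
        # hypothetical sketch, not captured from a live page):
        #
        #   <table> <!-- top stories start -->
        #     <tr><td><font face="times new roman, times, sans serif">
        #       <!-- start(name=world) -->
        #       <span class="headlineWrapper"><a href="http://...">Headline</a></span>
        #       By A REPORTER
        #       A one-line description of the article ...
        #     </font></td></tr>
        #   </table>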
        skipThisSection = False
        todays_article_count = 0

        # Within this table are <font face="..."> entries
        self.log("Fetching feed Top Stories")
        for tr in storyblock.findAllNext('tr'):
            if tr.find('span') is not None:

                sectionblock = tr.find(True, attrs={'face':[
                                       'times new roman, times,sans serif',
                                       'times new roman,times, sans serif',
                                       'times new roman, times, sans serif']})
                section = None
                bylines = []
                descriptions = []
                pubdate = None

                # Get the Section title
                for (x,i) in enumerate(sectionblock.contents):
                    skipThisSection = False
                    # Extract the section title from the embedded comment
                    if ('Comment' in str(i.__class__)):
                        if 'start(name=' in i:
                            section = i[i.find('=')+1:-2]

                        if not self.sections.has_key(section):
                            skipThisSection = True
                            break

                        # Check for excluded section
                        if len(self.excludeSectionKeywords):
                            key = self.sections[section]
                            excluded = re.compile('|'.join(self.excludeSectionKeywords))
                            if excluded.search(key) or articles.has_key(key):
                                skipThisSection = True
                                break

                # Get the bylines and descriptions
                if not skipThisSection:
                    lines = sectionblock.contents
                    contentStrings = []

                    for line in lines:
                        # Keep only non-empty text nodes (Comment subclasses
                        # NavigableString, so exclude it explicitly)
                        if isinstance(line, NavigableString) and \
                                not isinstance(line, Comment) and line.strip():
                            contentStrings.append(line.strip())

                    # Gather the byline/description pairs
                    bylines = []
                    descriptions = []
                    for contentString in contentStrings:
                        if contentString[0:3] == 'By ' and contentString[3].isupper():
                            bylines.append(contentString)
                        else:
                            descriptions.append(contentString)

                    # Fetch the article titles and URLs
                    articleCount = len(sectionblock.findAll('span'))
                    todays_article_count += articleCount
                    for (i,span) in enumerate(sectionblock.findAll(attrs={'class':'headlineWrapper'})):
                        a = span.find('a', href=True)
                        url = re.sub(r'\?.*', '', a['href'])
                        url += '?pagewanted=all'

                        title = self.tag_to_string(a, use_alt=True)
                        # prepend the section name
                        title = self.sections[section] + " &middot; " + title
                        if not isinstance(title, unicode):
                            title = title.decode('utf-8', 'replace')

                        # Allow for unattributed, undescribed entries like "Editor's Note"
                        if i >= len(descriptions):
                            description = None
                        else:
                            description = descriptions[i]

                        if len(bylines) == articleCount:
                            author = bylines[i]
                        else:
                            author = None

                        # Check for duplicates
                        duplicateFound = False
                        if articles[feed]:
                            for article in articles[feed]:
                                if url == article['url']:
                                    duplicateFound = True
                                    break

                            if duplicateFound:
                                # Continue fetching, don't add this article
                                todays_article_count -= 1
                                continue

                        if not articles.has_key(feed):
                            articles[feed] = []
                        articles[feed].append(
                            dict(title=title, url=url, date=pubdate,
                                 description=description, author=author,
                                 content=''))
        # self.log("Queuing %d articles from %s" % (todays_article_count, "Top Stories"))

        ans = self.sort_index_by(ans, {'Top Stories':-1})
        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
        self.dump_ans(ans)
        return ans

    def preprocess_html(self, soup):
        # Skip ad pages served before the actual article
        skip_tag = soup.find(True, {'name':'skip'})
        if skip_tag is not None:
            self.log.error("Found forwarding link: %s" % skip_tag.parent['href'])
            url = 'http://www.nytimes.com' + re.sub(r'\?.*', '', skip_tag.parent['href'])
            url += '?pagewanted=all'
            self.log.error("Skipping ad to article at '%s'" % url)
            soup = self.index_to_soup(url)
        return self.strip_anchors(soup)

    def postprocess_html(self, soup, first_fetch):
        if self.one_picture_per_article:
            # Remove all images after the first
            largeImg = soup.find(True, {'class':'articleSpanImage'})
            inlineImgs = soup.findAll(True, {'class':'inlineImage module'})
            if largeImg:
                for inlineImg in inlineImgs:
                    inlineImg.extract()
            else:
                if inlineImgs:
                    firstImg = inlineImgs[0]
                    for inlineImg in inlineImgs[1:]:
                        inlineImg.extract()
                    # Move firstImg after the headline
                    cgFirst = soup.find(True, {'class':'columnGroup first'})
                    if cgFirst:
                        # Strip all sibling NavigableStrings: noise
                        navstrings = cgFirst.findAll(text=True, recursive=False)
                        [ns.extract() for ns in navstrings]
                        headline_found = False
                        tag = cgFirst.find(True)
                        insertLoc = 0
                        while True:
                            insertLoc += 1
                            if hasattr(tag,'class') and tag['class'] == 'articleHeadline':
                                headline_found = True
                                break
                            tag = tag.nextSibling
                            if not tag:
                                headline_found = False
                                break
                        if headline_found:
                            cgFirst.insert(insertLoc,firstImg)
                    else:
                        self.log(">>> No class:'columnGroup first' found <<<")

        # Change class="kicker" to <h3>
        kicker = soup.find(True, {'class':'kicker'})
        if kicker and kicker.contents[0]:
            h3Tag = Tag(soup, "h3")
            h3Tag.insert(0, self.fixChars(self.tag_to_string(kicker, use_alt=False)))
            kicker.replaceWith(h3Tag)

        # Change captions to italic, dropping any trailing "More Photos" link
        for caption in soup.findAll(True, {'class':'caption'}):
            if caption and caption.contents[0]:
                emTag = Tag(soup, "em")
                c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip()
                mp_off = c.find("More Photos")
                if mp_off >= 0:
                    c = c[:mp_off]
                emTag.insert(0, c)
                hrTag = Tag(soup, 'hr')
                #hrTag['style'] = "margin-top:0em;margin-bottom:0em"
                emTag.insert(1, hrTag)
                caption.replaceWith(emTag)

        # Change <nyt_headline> to <h2 class="headline">
        h1 = soup.find('h1')
        if h1:
            headline = h1.find("nyt_headline")
            if headline:
                tag = Tag(soup, "h2")
                tag['class'] = "headline"
                tag.insert(0, self.fixChars(headline.contents[0]))
                h1.replaceWith(tag)
        else:
            # Blog entry - replace headline, remove <hr> tags
            headline = soup.find('title')
            if headline:
                tag = Tag(soup, "h2")
                tag['class'] = "headline"
                tag.insert(0, self.fixChars(headline.contents[0]))
                soup.insert(0, tag)
                hrs = soup.findAll('hr')
                for hr in hrs:
                    hr.extract()

        # Change <h1> to <h3> - used in editorial blogs
        masthead = soup.find("h1")
        if masthead:
            # Nuke the href
            if masthead.a:
                del(masthead.a['href'])
            tag = Tag(soup, "h3")
            tag.insert(0, self.fixChars(masthead.contents[0]))
            masthead.replaceWith(tag)

        # Change <span class="bold"> to <b>
        for subhead in soup.findAll(True, {'class':'bold'}):
            if subhead.contents:
                bTag = Tag(soup, "b")
                bTag.insert(0, subhead.contents[0])
                subhead.replaceWith(bTag)

        # Synthesize a section header from the 'dsk' meta tag
        dsk = soup.find('meta', attrs={'name':'dsk'})
        if dsk and dsk.has_key('content'):
            hTag = Tag(soup,'h3')
            hTag['class'] = 'section'
            hTag.insert(0,NavigableString(dsk['content']))
            articleTag = soup.find(True, attrs={'id':'article'})
            if articleTag:
                articleTag.insert(0,hTag)

        # Add class="articleBody" to <div id="articleBody"> so we can format with CSS
        divTag = soup.find('div',attrs={'id':'articleBody'})
        if divTag:
            divTag['class'] = divTag['id']

        # Add class="authorId" to <div id="authorId"> so we can format with CSS
        divTag = soup.find('div',attrs={'id':'authorId'})
        if divTag and divTag.contents[0]:
            tag = Tag(soup, "p")
            tag['class'] = "authorId"
            tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0],
                                        use_alt=False)))
            divTag.replaceWith(tag)

        return soup

    def strip_anchors(self, soup):
        # Flatten links to plain text; links wrapping images are preserved
        paras = soup.findAll(True)
        for para in paras:
            aTags = para.findAll('a')
            for a in aTags:
                if a.img is None:
                    a.replaceWith(a.renderContents().decode('cp1252','replace'))
        return soup
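# A quick way to exercise this recipe without installing it into calibre (a
# usage sketch; the file name and credentials are placeholders): ebook-convert
# accepts a .recipe file directly, and --test fetches only a couple of
# articles per feed, which is the fastest way to check parse_index() and
# postprocess_html():
#
#   ebook-convert nytimes_top_stories.recipe output.epub \
#       --username you@example.com --password secret --test -vv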