diff --git a/resources/recipes/nytimes.recipe b/resources/recipes/nytimes.recipe
index a2d5135045..c656450990 100644
--- a/resources/recipes/nytimes.recipe
+++ b/resources/recipes/nytimes.recipe
@@ -5,62 +5,59 @@ __copyright__ = '2008, Kovid Goyal <kovid@kovidgoyal.net>'
 '''
 nytimes.com
 '''
-import re
-import time
-from calibre import entity_to_unicode
+import re, string, time
+from calibre import entity_to_unicode, strftime
 from calibre.web.feeds.recipes import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, NavigableString, \
-    Comment, BeautifulStoneSoup
+from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, BeautifulStoneSoup
 
 class NYTimes(BasicNewsRecipe):
 
-    title = 'New York Times Top Stories'
-    __author__ = 'GRiker'
-    language = 'en'
-    requires_version = (0, 7, 5)
-    description = 'Top Stories from the New York Times'
+    # set headlinesOnly to True for the headlines-only version
+    headlinesOnly = True
 
-    # List of sections typically included in Top Stories. Use a keyword from the
-    # right column in the excludeSectionKeywords[] list to skip downloading that section
-    sections = {
-        'arts'             : 'Arts',
-        'business'         : 'Business',
-        'diningwine'       : 'Dining & Wine',
-        'editorials'       : 'Editorials',
-        'health'           : 'Health',
-        'magazine'         : 'Magazine',
-        'mediaadvertising' : 'Media & Advertising',
-        'newyorkregion'    : 'New York/Region',
-        'oped'             : 'Op-Ed',
-        'politics'         : 'Politics',
-        'science'          : 'Science',
-        'sports'           : 'Sports',
-        'technology'       : 'Technology',
-        'topstories'       : 'Top Stories',
-        'travel'           : 'Travel',
-        'us'               : 'U.S.',
-        'world'            : 'World'
-    }
+    # includeSections: List of sections to include. If empty, all sections found will be included.
+    # Otherwise, only the sections named will be included. For example,
+    #
+    #    includeSections = ['Politics','Sports']
+    #
+    # would cause only the Politics and Sports sections to be included.
 
-    # Add section keywords from the right column above to skip that section
-    # For example, to skip sections containing the word 'Sports' or 'Dining', use:
-    # excludeSectionKeywords = ['Sports', 'Dining']
-    # Fetch only Business and Technology
-    # excludeSectionKeywords = ['Arts','Dining','Editorials','Health','Magazine','Media','Region','Op-Ed','Politics','Science','Sports','Top Stories','Travel','U.S.','World']
-    # Fetch only Top Stories
-    # excludeSectionKeywords = ['Arts','Business','Dining','Editorials','Health','Magazine','Media','Region','Op-Ed','Politics','Science','Sports','Technology','Travel','U.S.','World']
-    # By default, no sections are skipped.
-    excludeSectionKeywords = []
+    includeSections = []  # by default, all sections included
+
+    # excludeSections: List of sections to exclude. If empty, all sections found will be included.
+    # Otherwise, the sections named will be excluded. For example,
+    #
+    #    excludeSections = ['Politics','Sports']
+    #
+    # would cause the Politics and Sports sections to be excluded. This parameter can be used
+    # in conjunction with includeSections although in most cases using one or the other, but
+    # not both, is sufficient.
+
+    excludeSections = []
 
     # one_picture_per_article specifies that calibre should only use the first image
     # from an article (if one exists). If one_picture_per_article = True, the image
     # will be moved to a location between the headline and the byline.
     # If one_picture_per_article = False, all images from the article will be included
+    # and shown in their original location.
 
     one_picture_per_article = True
 
     # The maximum number of articles that will be downloaded
-    max_articles_per_feed = 40
+    max_articles_per_feed = 100
+
+
+    if headlinesOnly:
+        title='New York Times Headlines'
+        description = 'Headlines from the New York Times'
+    else:
+        title='New York Times'
+        description = 'Today\'s New York Times'
+
+    __author__ = 'GRiker/Kovid Goyal/Nick Redding'
+    language = 'en'
+    requires_version = (0, 7, 5)
+
+    timefmt = ''
     needs_subscription = True
@@ -82,6 +79,7 @@ class NYTimes(BasicNewsRecipe):
             'entry-response module',
             'icon enlargeThis',
             'leftNavTabs',
+            'metaFootnote',
             'module box nav',
             'nextArticleLink',
             'nextArticleLink clearfix',
@@ -89,12 +87,13 @@ class NYTimes(BasicNewsRecipe):
             'relatedSearchesModule',
             'side_tool',
             'singleAd',
-            'subNavigation clearfix',
-            'subNavigation tabContent active',
-            'subNavigation tabContent active clearfix',
+            re.compile('^subNavigation'),
+            re.compile('^leaderboard'),
+            re.compile('^module'),
             ]}),
         dict(id=[
             'adxLeaderboard',
+            'adxSponLink',
             'archive',
             'articleExtras',
             'articleInline',
@@ -105,87 +104,98 @@ class NYTimes(BasicNewsRecipe):
             'footer',
             'header',
             'header_search',
+            'inlineBox',
             'login',
             'masthead',
             'masthead-nav',
             'memberTools',
             'navigation',
             'portfolioInline',
+            'readerReviews',
+            'readerReviewsCount',
             'relatedArticles',
+            'relatedTopics',
             'respond',
             'side_search',
             'side_index',
             'side_tool',
             'toolsRight',
             ]),
-        dict(name=['script', 'noscript', 'style'])]
-
+        dict(name=['script', 'noscript', 'style','form','hr'])]
 
     no_stylesheets = True
-    extra_css = '.headline    {text-align: left;}\n \
-                .byline       {font-family: monospace; \
-                               text-align: left; \
-                               margin-top: 0px; \
-                               margin-bottom: 0px;}\n \
-                .dateline     {font-size: small; \
-                               margin-top: 0px; \
-                               margin-bottom: 0px;}\n \
-                .timestamp    {font-size: small; \
-                               margin-top: 0px; \
-                               margin-bottom: 0px;}\n \
-                .source       {text-align: left;}\n \
-                .image        {text-align: center;}\n \
-                .credit       {text-align: right; \
-                               font-size: small; \
-                               margin-top: 0px; \
-                               margin-bottom: 0px;}\n \
-                .articleBody  {text-align: left;}\n \
-                .authorId     {text-align: left; \
-                               font-style: italic;}\n '
+    extra_css = '''
+                .articleHeadline { text-align: left; margin-top:0.5em; margin-bottom:0.25em; }
+                .credit { text-align: right; font-size: small; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
+                .byline { text-align: left; font-size: small; line-height:1em; margin-top:10px; margin-left:0; margin-right:0; margin-bottom: 0; }
+                .dateline { text-align: left; font-size: small; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
+                .kicker { font-size: small; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
+                .timestamp { text-align: left; font-size: small; }
+                .caption { font-size: small; font-style:italic; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
+                a:link {text-decoration: none; }
+                .articleBody { }
+                .authorId {text-align: left; }
+                .image {text-align: center;}
+                .source {text-align: left; }'''
 
-    def dump_ans(self, ans) :
+    def filter_ans(self, ans) :
         total_article_count = 0
-        for section in ans :
+        idx = 0
+        idx_max = len(ans)-1
+        while idx <= idx_max:
+            if self.includeSections != []:
+                if ans[idx][0] not in self.includeSections:
+                    print "SECTION NOT INCLUDED: ",ans[idx][0]
+                    del ans[idx]
+                    idx_max = idx_max-1
+                    continue
+            if ans[idx][0] in self.excludeSections:
+                print "SECTION EXCLUDED: ",ans[idx][0]
+                del ans[idx]
+                idx_max = idx_max-1
+                continue
            if self.verbose:
- self.log("section %s: %d articles" % (section[0], len(section[1])) ) - for article in section[1]: + self.log("Section %s: %d articles" % (ans[idx][0], len(ans[idx][1])) ) + for article in ans[idx][1]: total_article_count += 1 if self.verbose: self.log("\t%-40.40s... \t%-60.60s..." % (article['title'].encode('cp1252','replace'), article['url'].encode('cp1252','replace'))) + idx = idx+1 + self.log( "Queued %d articles" % total_article_count ) + return ans def fixChars(self,string): # Replace lsquo (\x91) - fixed = re.sub("\x91","‘",string) + fixed = re.sub("\x91","‘",string) # Replace rsquo (\x92) - fixed = re.sub("\x92","’",fixed) + fixed = re.sub("\x92","’",fixed) # Replace ldquo (\x93) - fixed = re.sub("\x93","“",fixed) + fixed = re.sub("\x93","“",fixed) # Replace rdquo (\x94) - fixed = re.sub("\x94","”",fixed) + fixed = re.sub("\x94","”",fixed) # Replace ndash (\x96) - fixed = re.sub("\x96","–",fixed) + fixed = re.sub("\x96","–",fixed) # Replace mdash (\x97) - fixed = re.sub("\x97","—",fixed) + fixed = re.sub("\x97","—",fixed) return fixed def get_browser(self): br = BasicNewsRecipe.get_browser() if self.username is not None and self.password is not None: - try: - br.open('http://www.nytimes.com/auth/login') - br.select_form(name='login') - br['USERID'] = self.username - br['PASSWORD'] = self.password - br.submit() - except: - self.log("\nFailed to login") + br.open('http://www.nytimes.com/auth/login') + br.select_form(name='login') + br['USERID'] = self.username + br['PASSWORD'] = self.password + raw = br.submit().read() + if 'Please try again' in raw: + raise Exception('Your username and password are incorrect') return br def skip_ad_pages(self, soup): @@ -213,6 +223,9 @@ class NYTimes(BasicNewsRecipe): cover = None return cover + def short_title(self): + return self.title + def index_to_soup(self, url_or_raw, raw=False): ''' OVERRIDE of class method @@ -255,157 +268,184 @@ class NYTimes(BasicNewsRecipe): # Kindle TOC descriptions won't render certain characters if description: massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)) - # Replace '&' with '&' - massaged = re.sub("&","&", massaged) + # Replace '&' with '&' + massaged = re.sub("&","&", massaged) return self.fixChars(massaged) else: return description - def parse_index(self): + def parse_todays_index(self): + + def feed_title(div): + return ''.join(div.findAll(text=True, recursive=True)).strip() + + articles = {} + key = None + ans = [] + url_list = [] + + def handle_article(div): + a = div.find('a', href=True) + if not a: + return + url = re.sub(r'\?.*', '', a['href']) + if not url.startswith("http"): + return + if not url.endswith(".html"): + return + if 'podcast' in url: + return + if '/video/' in url: + return + url += '?pagewanted=all' + if url in url_list: + return + url_list.append(url) + title = self.tag_to_string(a, use_alt=True).strip() + description = '' + pubdate = strftime('%a, %d %b') + summary = div.find(True, attrs={'class':'summary'}) + if summary: + description = self.tag_to_string(summary, use_alt=False) + author = '' + authorAttribution = div.find(True, attrs={'class':'byline'}) + if authorAttribution: + author = self.tag_to_string(authorAttribution, use_alt=False) + else: + authorAttribution = div.find(True, attrs={'class':'byline'}) + if authorAttribution: + author = self.tag_to_string(authorAttribution, use_alt=False) + feed = key if key is not None else 'Uncategorized' + if not articles.has_key(feed): + ans.append(feed) + articles[feed] = [] + 
 
     def fixChars(self,string):
         # Replace lsquo (\x91)
-
         fixed = re.sub("\x91","&#8216;",string)
         # Replace rsquo (\x92)
-
         fixed = re.sub("\x92","&#8217;",fixed)
         # Replace ldquo (\x93)
-
         fixed = re.sub("\x93","&#8220;",fixed)
         # Replace rdquo (\x94)
-
         fixed = re.sub("\x94","&#8221;",fixed)
         # Replace ndash (\x96)
-
         fixed = re.sub("\x96","&#8211;",fixed)
         # Replace mdash (\x97)
-
         fixed = re.sub("\x97","&#8212;",fixed)
         return fixed
 
     def get_browser(self):
         br = BasicNewsRecipe.get_browser()
         if self.username is not None and self.password is not None:
-            try:
-                br.open('http://www.nytimes.com/auth/login')
-                br.select_form(name='login')
-                br['USERID'] = self.username
-                br['PASSWORD'] = self.password
-                br.submit()
-            except:
-                self.log("\nFailed to login")
+            br.open('http://www.nytimes.com/auth/login')
+            br.select_form(name='login')
+            br['USERID'] = self.username
+            br['PASSWORD'] = self.password
+            raw = br.submit().read()
+            if 'Please try again' in raw:
+                raise Exception('Your username and password are incorrect')
         return br
 
     def skip_ad_pages(self, soup):
@@ -213,6 +223,9 @@ class NYTimes(BasicNewsRecipe):
             cover = None
         return cover
 
+    def short_title(self):
+        return self.title
+
     def index_to_soup(self, url_or_raw, raw=False):
         '''
         OVERRIDE of class method
@@ -255,157 +268,184 @@ class NYTimes(BasicNewsRecipe):
         # Kindle TOC descriptions won't render certain characters
         if description:
             massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
-            # Replace '&' with '&#38;'
-            massaged = re.sub("&","&#38;", massaged)
+            # Replace '&' with '&amp;'
+            massaged = re.sub("&","&amp;", massaged)
             return self.fixChars(massaged)
         else:
             return description
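
Note the order of operations in massageNCXText: entities are decoded to unicode first, and bare ampersands are then re-escaped so the Kindle NCX stays well-formed. A rough standalone illustration, using an invented description string (not part of the patch):

    # Sketch: decode HTML entities, then re-escape '&' for the NCX.
    from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
    raw = 'Caf&eacute; review: dining &amp; wine'
    decoded = unicode(BeautifulStoneSoup(raw,
            convertEntities=BeautifulStoneSoup.HTML_ENTITIES))  # u'Café review: dining & wine'
    ncx_safe = decoded.replace('&', '&amp;')                    # u'Café review: dining &amp; wine'
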
 
-    def parse_index(self):
+    def parse_todays_index(self):
+
+        def feed_title(div):
+            return ''.join(div.findAll(text=True, recursive=True)).strip()
+
+        articles = {}
+        key = None
+        ans = []
+        url_list = []
+
+        def handle_article(div):
+            a = div.find('a', href=True)
+            if not a:
+                return
+            url = re.sub(r'\?.*', '', a['href'])
+            if not url.startswith("http"):
+                return
+            if not url.endswith(".html"):
+                return
+            if 'podcast' in url:
+                return
+            if '/video/' in url:
+                return
+            url += '?pagewanted=all'
+            if url in url_list:
+                return
+            url_list.append(url)
+            title = self.tag_to_string(a, use_alt=True).strip()
+            description = ''
+            pubdate = strftime('%a, %d %b')
+            summary = div.find(True, attrs={'class':'summary'})
+            if summary:
+                description = self.tag_to_string(summary, use_alt=False)
+            author = ''
+            authorAttribution = div.find(True, attrs={'class':'byline'})
+            if authorAttribution:
+                author = self.tag_to_string(authorAttribution, use_alt=False)
+            else:
+                authorAttribution = div.find(True, attrs={'class':'byline'})
+                if authorAttribution:
+                    author = self.tag_to_string(authorAttribution, use_alt=False)
+            feed = key if key is not None else 'Uncategorized'
+            if not articles.has_key(feed):
+                ans.append(feed)
+                articles[feed] = []
+            articles[feed].append(
+                dict(title=title, url=url, date=pubdate,
+                    description=description, author=author,
+                    content=''))
+
+
+        soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html')
+
+
+        # Find each article
+        for div in soup.findAll(True,
+            attrs={'class':['section-headline', 'story', 'story headline','sectionHeader','headlinesOnly multiline flush']}):
+
+            if div['class'] in ['section-headline','sectionHeader']:
+                key = string.capwords(feed_title(div))
+                key = key.replace('Op-ed','Op-Ed')
+                key = key.replace('U.s.','U.S.')
+            elif div['class'] in ['story', 'story headline'] :
+                handle_article(div)
+            elif div['class'] == 'headlinesOnly multiline flush':
+                for lidiv in div.findAll('li'):
+                    handle_article(lidiv)
+
+        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
+        return self.filter_ans(ans)
+
+    def parse_headline_index(self):
+
         articles = {}
         ans = []
-
-        feed = key = 'All Top Stories'
-        articles[key] = []
-        ans.append(key)
-        self.log("Scanning 1 section ...")
+        url_list = []
 
         soup = self.index_to_soup('http://www.nytimes.com/pages/todaysheadlines/')
 
-        # Fetch the outer table
-        table = soup.find('table')
-        previousTable = table
+        # Fetch the content table
+        content_table = soup.find('table',{'id':'content'})
+        if content_table is None:
+            self.log("FATAL ERROR: CANNOT FIND CONTENT TABLE")
+            return None
 
-        # Find the deepest table containing the stories
-        while True :
-            table = table.find('table')
-            if table.find(text=re.compile('top stories start')) :
-                previousTable = table
-                continue
-            else :
-                table = previousTable
-                break
+        # Within this table are entries, each containing one or more h6 tags which represent sections
 
-        # There are multiple subtables, find the one containing the stories
-        for block in table.findAll('table') :
-            if block.find(text=re.compile('top stories start')) :
-                table = block
-                break
-            else :
-                continue
+        for td_col in content_table.findAll('td', {'id' : re.compile('Column')}):
+            for div_sec in td_col.findAll('div',recursive=False):
+                for h6_sec_name in div_sec.findAll('h6',{'style' : re.compile('text-transform: *uppercase')}):
+                    section_name = self.tag_to_string(h6_sec_name,use_alt=False)
+                    section_name = re.sub(r'^ *$','',section_name)
+                    if section_name == '':
+                        continue
+                    section_name=string.capwords(section_name)
+                    if section_name == 'U.s.':
+                        section_name = 'U.S.'
+                    elif section_name == 'Op-ed':
+                        section_name = 'Op-Ed'
+                    pubdate = strftime('%a, %d %b')
 
-        # Again there are multiple subtables, find the one containing the stories
-        for storyblock in table.findAll('table') :
-            if storyblock.find(text=re.compile('top stories start')) :
-                break
-            else :
-                continue
-
-        skipThisSection = False
-        todays_article_count = 0
-        # Within this table are entries
-        self.log("Fetching feed Top Stories")
-        for tr in storyblock.findAllNext('tr'):
-            if tr.find('span') is not None :
-
-                sectionblock = tr.find(True, attrs={'face':['times new roman, times,sans serif',
-                    'times new roman,times, sans serif',
-                    'times new roman, times, sans serif']})
-                section = None
-                bylines = []
-                descriptions = []
-                pubdate = None
-
-                # Get the Section title
-                for (x,i) in enumerate(sectionblock.contents) :
-                    skipThisSection = False
-                    # Extract the section title
-                    if ('Comment' in str(i.__class__)) :
-                        if 'start(name=' in i :
-                            section = i[i.find('=')+1:-2]
-
-                            if not self.sections.has_key(section) :
-                                skipThisSection = True
+                    search_div = div_sec
+                    for next_tag in h6_sec_name.findNextSiblings(True):
+                        if next_tag.__class__.__name__ == 'Tag':
+                            if next_tag.name == 'div':
+                                search_div = next_tag
                                 break
 
-                            # Check for excluded section
-                            if len(self.excludeSectionKeywords):
-                                key = self.sections[section]
-                                excluded = re.compile('|'.join(self.excludeSectionKeywords))
-                                if excluded.search(key) or articles.has_key(key):
-                                    skipThisSection = True
-                                    break
-
-                # Get the bylines and descriptions
-                if not skipThisSection :
-                    lines = sectionblock.contents
-                    contentStrings = []
-
-                    for line in lines:
-                        if not isinstance(line, Comment) and line.strip and line.strip() > "":
-                            contentStrings.append(line.strip())
-
-                    # Gather the byline/description pairs
-                    bylines = []
-                    descriptions = []
-                    for contentString in contentStrings:
-                        if contentString[0:3] == 'By ' and contentString[3].isupper() :
-                            bylines.append(contentString)
+                    # Get the articles
+                    for h3_item in search_div.findAll('h3'):
+                        byline = h3_item.h6
+                        if byline is not None:
+                            author = self.tag_to_string(byline,use_alt=False)
                         else:
-                            descriptions.append(contentString)
-
-                # Fetch the article titles and URLs
-                articleCount = len(sectionblock.findAll('span'))
-                todays_article_count += articleCount
-                for (i,span) in enumerate(sectionblock.findAll(attrs={'class':'headlineWrapper'})) :
-                    a = span.find('a', href=True)
+                            author = ''
+                        a = h3_item.find('a', href=True)
+                        if not a:
+                            continue
                         url = re.sub(r'\?.*', '', a['href'])
+                        if not url.startswith("http"):
+                            continue
+                        if not url.endswith(".html"):
+                            continue
+                        if 'podcast' in url:
+                            continue
+                        if 'video' in url:
+                            continue
                         url += '?pagewanted=all'
+                        if url in url_list:
+                            continue
+                        url_list.append(url)
+                        self.log("URL %s" % url)
+                        title = self.tag_to_string(a, use_alt=True).strip()
+                        desc = h3_item.find('p')
+                        if desc is not None:
+                            description = self.tag_to_string(desc,use_alt=False)
+                        else:
+                            description = ''
+                        if not articles.has_key(section_name):
+                            ans.append(section_name)
+                            articles[section_name] = []
+                        articles[section_name].append(dict(title=title, url=url, date=pubdate, description=description, author=author, content=''))
 
-                    title = self.tag_to_string(a, use_alt=True)
-                    # prepend the section name
-                    title = self.sections[section] + " &middot; " + title
-                    if not isinstance(title, unicode):
-                        title = title.decode('utf-8', 'replace')
-
-                    # Allow for unattributed, undescribed entries "Editor's Note"
-                    if i >= len(descriptions) :
-                        description = None
-                    else :
-                        description = descriptions[i]
-
-                    if len(bylines) == articleCount :
-                        author = bylines[i]
-                    else :
-                        author = None
-
-                    # Check for duplicates
-                    duplicateFound = False
-                    if len(articles[feed]) > 1:
-                        for article in articles[feed] :
-                            if url == article['url'] :
-                                duplicateFound = True
-                                break
-
-                    if duplicateFound:
-                        # Continue fetching, don't add this article
-                        todays_article_count -= 1
-                        continue
-
-                    if not articles.has_key(feed):
-                        articles[feed] = []
-                    articles[feed].append(
-                        dict(title=title, url=url, date=pubdate,
-                            description=description, author=author, content=''))
-#        self.log("Queuing %d articles from %s" % (todays_article_count, "Top Stories"))
-
-        ans = self.sort_index_by(ans, {'Top Stories':-1})
         ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
-        self.dump_ans(ans)
-        return ans
+        return self.filter_ans(ans)
+
+    def parse_index(self):
+        if self.headlinesOnly:
+            return self.parse_headline_index()
+        else:
+            return self.parse_todays_index()
+
+    def strip_anchors(self,soup):
+        paras = soup.findAll(True)
+        for para in paras:
+            aTags = para.findAll('a')
+            for a in aTags:
+                if a.img is None:
+                    a.replaceWith(a.renderContents().decode('cp1252','replace'))
+        return soup
+
     def preprocess_html(self, soup):
+
+        kicker_tag = soup.find(attrs={'class':'kicker'})
+        if kicker_tag: # remove Op-Ed author head shots
+            tagline = self.tag_to_string(kicker_tag)
+            if tagline=='Op-Ed Columnist':
+                img_div = soup.find('div','inlineImage module')
+                if img_div:
+                    img_div.extract()
         return self.strip_anchors(soup)
 
     def postprocess_html(self,soup, True):
@@ -422,8 +462,9 @@ class NYTimes(BasicNewsRecipe):
                     firstImg = inlineImgs[0]
                     for inlineImg in inlineImgs[1:]:
                         inlineImg.extract()
-                    # Move firstImg after headline
-                    cgFirst = soup.find(True, {'class':'columnGroup first'})
+                    # Move firstImg before article body
+                    #article_body = soup.find(True, {'id':'articleBody'})
+                    cgFirst = soup.find(True, {'class':re.compile('columnGroup *first')})
                     if cgFirst:
                         # Strip all sibling NavigableStrings: noise
                         navstrings = cgFirst.findAll(text=True, recursive=False)
@@ -443,30 +484,18 @@ class NYTimes(BasicNewsRecipe):
                         if headline_found:
                             cgFirst.insert(insertLoc,firstImg)
                     else:
-                        self.log(">>> No class:'columnGroup first' found <<<")
-        # Change class="kicker" to <h3>
-        kicker = soup.find(True, {'class':'kicker'})
-        if kicker and kicker.contents[0]:
-            h3Tag = Tag(soup, "h3")
-            h3Tag.insert(0, self.fixChars(self.tag_to_string(kicker,
-                use_alt=False)))
-            kicker.replaceWith(h3Tag)
+                        self.log(">>> No class:'columnGroup first' found <<<")
 
-        # Change captions to italic -1
+        # Change captions to italic
         for caption in soup.findAll(True, {'class':'caption'}) :
             if caption and caption.contents[0]:
-                emTag = Tag(soup, "em")
+                cTag = Tag(soup, "p", [("class", "caption")])
                 c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip()
                 mp_off = c.find("More Photos")
                 if mp_off >= 0:
                     c = c[:mp_off]
-                emTag.insert(0, c)
-                #hrTag = Tag(soup, 'hr')
-                #hrTag['class'] = 'caption_divider'
-                hrTag = Tag(soup, 'div')
-                hrTag['class'] = 'divider'
-                emTag.insert(1, hrTag)
-                caption.replaceWith(emTag)
+                cTag.insert(0, c)
+                caption.replaceWith(cTag)
 
         # Change <nyt_headline> to <h2 class="headline">
         h1 = soup.find('h1')
@@ -506,17 +535,6 @@ class NYTimes(BasicNewsRecipe):
             bTag.insert(0, subhead.contents[0])
             subhead.replaceWith(bTag)
 
-        # Synthesize a section header
-        dsk = soup.find('meta', attrs={'name':'dsk'})
-        if dsk and dsk.has_key('content'):
-            hTag = Tag(soup,'h3')
-            hTag['class'] = 'section'
-            hTag.insert(0,NavigableString(dsk['content']))
-            articleTag = soup.find(True, attrs={'id':'article'})
-            if articleTag:
-                articleTag.insert(0,hTag)
-
         # Add class="articleBody" to <div> so we can format with CSS
         divTag = soup.find('div',attrs={'id':'articleBody'})
         if divTag:
             divTag['class'] = divTag['id']
@@ -532,11 +550,3 @@ class NYTimes(BasicNewsRecipe):
 
         return soup
 
-    def strip_anchors(self,soup):
-        paras = soup.findAll(True)
-        for para in paras:
-            aTags = para.findAll('a')
-            for a in aTags:
-                if a.img is None:
-                    a.replaceWith(a.renderContents().decode('cp1252','replace'))
-        return soup
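
The companion nytimes_sub.recipe below receives the same engine; the functional difference is headlinesOnly = False, which sends parse_index() down the todayspaper path and fetches the full paper for subscribers. In both files, the switches a user is expected to edit are the class attributes at the top, for example (illustrative values only):

    headlinesOnly = False                # False: full paper; True: headlines page only
    includeSections = []                 # e.g. ['World','Sports'] to fetch only those
    excludeSections = ['Automobiles']    # drop these sections even when found
    one_picture_per_article = True       # keep just the first image per article
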
diff --git a/resources/recipes/nytimes_sub.recipe b/resources/recipes/nytimes_sub.recipe
index 5452ae1c6e..ed1ba75f0f 100644
--- a/resources/recipes/nytimes_sub.recipe
+++ b/resources/recipes/nytimes_sub.recipe
@@ -5,52 +5,186 @@ __copyright__ = '2008, Kovid Goyal <kovid@kovidgoyal.net>'
 '''
 nytimes.com
 '''
-import string, re, time
-from calibre import strftime
+import re, string, time
+from calibre import entity_to_unicode, strftime
 from calibre.web.feeds.recipes import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
-
-def decode(self, src):
-    enc = 'utf-8'
-    if 'iso-8859-1' in src:
-        enc = 'cp1252'
-    return src.decode(enc, 'ignore')
+from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, BeautifulStoneSoup
 
 class NYTimes(BasicNewsRecipe):
 
-    title = u'New York Times'
-    __author__ = 'Kovid Goyal/Nick Redding'
-    language = 'en'
-    requires_version = (0, 6, 36)
+    # set headlinesOnly to True for the headlines-only version
+    headlinesOnly = False
 
-    description = 'Daily news from the New York Times (subscription version)'
-    timefmt = ' [%b %d]'
+    # includeSections: List of sections to include. If empty, all sections found will be included.
+    # Otherwise, only the sections named will be included. For example,
+    #
+    #    includeSections = ['Politics','Sports']
+    #
+    # would cause only the Politics and Sports sections to be included.
+
+    includeSections = []  # by default, all sections included
+
+    # excludeSections: List of sections to exclude. If empty, all sections found will be included.
+    # Otherwise, the sections named will be excluded. For example,
+    #
+    #    excludeSections = ['Politics','Sports']
+    #
+    # would cause the Politics and Sports sections to be excluded. This parameter can be used
+    # in conjunction with includeSections although in most cases using one or the other, but
+    # not both, is sufficient.
+
+    excludeSections = []
+
+    # one_picture_per_article specifies that calibre should only use the first image
+    # from an article (if one exists). If one_picture_per_article = True, the image
+    # will be moved to a location between the headline and the byline.
+    # If one_picture_per_article = False, all images from the article will be included
+    # and shown in their original location.
+
+    one_picture_per_article = True
+
+    # The maximum number of articles that will be downloaded
+    max_articles_per_feed = 100
+
+
+    if headlinesOnly:
+        title='New York Times Headlines'
+        description = 'Headlines from the New York Times'
+    else:
+        title='New York Times'
+        description = 'Today\'s New York Times'
+
+    __author__ = 'GRiker/Kovid Goyal/Nick Redding'
+    language = 'en'
+    requires_version = (0, 7, 5)
+
+
+    timefmt = ''
     needs_subscription = True
+    masthead_url = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'
+    cover_margins = (18,18,'grey99')
+
     remove_tags_before = dict(id='article')
     remove_tags_after  = dict(id='article')
-    remove_tags = [dict(attrs={'class':['articleTools', 'post-tools', 'side_tool','nextArticleLink',
-        'nextArticleLink clearfix','columnGroup doubleRule','doubleRule','entry-meta',
-        'icon enlargeThis','columnGroup last','relatedSearchesModule']}),
-        dict({'class':re.compile('^subNavigation')}),
-        dict({'class':re.compile('^leaderboard')}),
-        dict({'class':re.compile('^module')}),
-        dict({'class':'metaFootnote'}),
-        dict(id=['inlineBox','footer', 'toolsRight', 'articleInline','login','masthead',
-            'navigation', 'archive', 'side_search', 'blog_sidebar','cCol','portfolioInline',
-            'side_tool', 'side_index','header','readerReviewsCount','readerReviews',
-            'relatedArticles', 'relatedTopics', 'adxSponLink']),
+    remove_tags = [dict(attrs={'class':[
+        'articleFooter',
+        'articleTools',
+        'columnGroup doubleRule',
+        'columnGroup singleRule',
+        'columnGroup last',
+        'columnGroup last',
+        'doubleRule',
+        'dottedLine',
+        'entry-meta',
+        'entry-response module',
+        'icon enlargeThis',
+        'leftNavTabs',
+        'metaFootnote',
+        'module box nav',
+        'nextArticleLink',
+        'nextArticleLink clearfix',
+        'post-tools',
+        'relatedSearchesModule',
+        'side_tool',
+        'singleAd',
+        re.compile('^subNavigation'),
+        re.compile('^leaderboard'),
+        re.compile('^module'),
+        ]}),
+        dict(id=[
+        'adxLeaderboard',
+        'adxSponLink',
+        'archive',
+        'articleExtras',
+        'articleInline',
+        'blog_sidebar',
+        'businessSearchBar',
+        'cCol',
+        'entertainmentSearchBar',
+        'footer',
+        'header',
+        'header_search',
+        'inlineBox',
+        'login',
+        'masthead',
+        'masthead-nav',
+        'memberTools',
+        'navigation',
+        'portfolioInline',
+        'readerReviews',
+        'readerReviewsCount',
+        'relatedArticles',
+        'relatedTopics',
+        'respond',
+        'side_search',
+        'side_index',
+        'side_tool',
+        'toolsRight',
+        ]),
         dict(name=['script', 'noscript', 'style','form','hr'])]
-    encoding = decode
     no_stylesheets = True
     extra_css = '''
-        .articleHeadline { margin-top:0.5em; margin-bottom:0.25em; }
-        .credit { font-size: small; font-style:italic; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
-        .byline { font-size: small; font-style:italic; line-height:1em; margin-top:10px; margin-left:0; margin-right:0; margin-bottom: 0; }
-        .dateline { font-size: small; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
+        .articleHeadline { text-align: left; margin-top:0.5em; margin-bottom:0.25em; }
+        .credit { text-align: right; font-size: small; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
+        .byline { text-align: left; font-size: small; line-height:1em; margin-top:10px; margin-left:0; margin-right:0; margin-bottom: 0; }
+        .dateline { text-align: left; font-size: small; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
         .kicker { font-size: small; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
-        .timestamp { font-size: small; }
-        .caption { font-size: small; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
-        a:link {text-decoration: none; }'''
+        .timestamp { text-align: left; font-size: small; }
+        .caption { font-size: small; font-style:italic; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; }
+        a:link {text-decoration: none; }
+        .articleBody { }
+        .authorId {text-align: left; }
+        .image {text-align: center;}
+        .source {text-align: left; }'''
+
+    def filter_ans(self, ans) :
+        total_article_count = 0
+        idx = 0
+        idx_max = len(ans)-1
+        while idx <= idx_max:
+            if self.includeSections != []:
+                if ans[idx][0] not in self.includeSections:
+                    print "SECTION NOT INCLUDED: ",ans[idx][0]
+                    del ans[idx]
+                    idx_max = idx_max-1
+                    continue
+            if ans[idx][0] in self.excludeSections:
+                print "SECTION EXCLUDED: ",ans[idx][0]
+                del ans[idx]
+                idx_max = idx_max-1
+                continue
+            if self.verbose:
+                self.log("Section %s: %d articles" % (ans[idx][0], len(ans[idx][1])) )
+            for article in ans[idx][1]:
+                total_article_count += 1
+                if self.verbose:
+                    self.log("\t%-40.40s... \t%-60.60s..." % (article['title'].encode('cp1252','replace'),
+                              article['url'].encode('cp1252','replace')))
+            idx = idx+1
+
+        self.log( "Queued %d articles" % total_article_count )
+        return ans
+
+    def fixChars(self,string):
+        # Replace lsquo (\x91)
+        fixed = re.sub("\x91","&#8216;",string)
+
+        # Replace rsquo (\x92)
+        fixed = re.sub("\x92","&#8217;",fixed)
+
+        # Replace ldquo (\x93)
+        fixed = re.sub("\x93","&#8220;",fixed)
+
+        # Replace rdquo (\x94)
+        fixed = re.sub("\x94","&#8221;",fixed)
+
+        # Replace ndash (\x96)
+        fixed = re.sub("\x96","&#8211;",fixed)
+
+        # Replace mdash (\x97)
+        fixed = re.sub("\x97","&#8212;",fixed)
+
+        return fixed
 
     def get_browser(self):
         br = BasicNewsRecipe.get_browser()
@@ -60,22 +194,19 @@ class NYTimes(BasicNewsRecipe):
             br['USERID'] = self.username
             br['PASSWORD'] = self.password
             raw = br.submit().read()
-            if 'Sorry, we could not find the combination you entered. Please try again.' in raw:
+            if 'Please try again' in raw:
                 raise Exception('Your username and password are incorrect')
-            #open('/t/log.html', 'wb').write(raw)
         return br
 
-    def get_masthead_url(self):
-        masthead = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif'
-        #masthead = 'http://members.cox.net/nickredding/nytlogo.gif'
-        br = BasicNewsRecipe.get_browser()
-        try:
-            br.open(masthead)
-        except:
-            self.log("\nMasthead unavailable")
-            masthead = None
-        return masthead
-
+    def skip_ad_pages(self, soup):
+        # Skip ad pages served before actual article
+        skip_tag = soup.find(True, {'name':'skip'})
+        if skip_tag is not None:
+            self.log.warn("Found forwarding link: %s" % skip_tag.parent['href'])
+            url = 'http://www.nytimes.com' + re.sub(r'\?.*', '', skip_tag.parent['href'])
+            url += '?pagewanted=all'
+            self.log.warn("Skipping ad to article at '%s'" % url)
+            return self.index_to_soup(url, raw=True)
 
     def get_cover_url(self):
         cover = None
@@ -93,12 +224,57 @@ class NYTimes(BasicNewsRecipe):
         return cover
 
     def short_title(self):
-        return 'New York Times'
+        return self.title
 
-    def parse_index(self):
-        self.encoding = 'cp1252'
-        soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html')
-        self.encoding = decode
+    def index_to_soup(self, url_or_raw, raw=False):
+        '''
+        OVERRIDE of class method
+        deals with various page encodings between index and articles
+        '''
+        def get_the_soup(docEncoding, url_or_raw, raw=False) :
+            if re.match(r'\w+://', url_or_raw):
+                f = self.browser.open(url_or_raw)
+                _raw = f.read()
+                f.close()
+                if not _raw:
+                    raise RuntimeError('Could not fetch index from %s'%url_or_raw)
+            else:
+                _raw = url_or_raw
+            if raw:
+                return _raw
+
+            if not isinstance(_raw, unicode) and self.encoding:
+                _raw = _raw.decode(docEncoding, 'replace')
+            massage = list(BeautifulSoup.MARKUP_MASSAGE)
+            massage.append((re.compile(r'&(\S+?);'), lambda match: entity_to_unicode(match, encoding=self.encoding)))
+            return BeautifulSoup(_raw, markupMassage=massage)
+
+        # Entry point
+        print "index_to_soup()"
+        soup = get_the_soup( self.encoding, url_or_raw )
+        contentType = soup.find(True,attrs={'http-equiv':'Content-Type'})
+        docEncoding = str(contentType)[str(contentType).find('charset=') + len('charset='):str(contentType).rfind('"')]
+        if docEncoding == '' :
+            docEncoding = self.encoding
+
+        if self.verbose > 2:
+            self.log( "  document encoding: '%s'" % docEncoding)
+        if docEncoding != self.encoding :
+            soup = get_the_soup(docEncoding, url_or_raw)
+
+        return soup
+
+    def massageNCXText(self, description):
+        # Kindle TOC descriptions won't render certain characters
+        if description:
+            massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
+            # Replace '&' with '&amp;'
+            massaged = re.sub("&","&amp;", massaged)
+            return self.fixChars(massaged)
+        else:
+            return description
+
+    def parse_todays_index(self):
 
         def feed_title(div):
             return ''.join(div.findAll(text=True, recursive=True)).strip()
@@ -119,12 +295,13 @@ class NYTimes(BasicNewsRecipe):
                 return
             if 'podcast' in url:
                 return
+            if '/video/' in url:
+                return
             url += '?pagewanted=all'
             if url in url_list:
                 return
             url_list.append(url)
             title = self.tag_to_string(a, use_alt=True).strip()
-            #self.log("Title: %s" % title)
             description = ''
             pubdate = strftime('%a, %d %b')
             summary = div.find(True, attrs={'class':'summary'})
@@ -140,6 +317,7 @@ class NYTimes(BasicNewsRecipe):
                 author = self.tag_to_string(authorAttribution, use_alt=False)
             feed = key if key is not None else 'Uncategorized'
             if not articles.has_key(feed):
+                ans.append(feed)
                 articles[feed] = []
             articles[feed].append(
                 dict(title=title, url=url, date=pubdate,
                     description=description, author=author,
                     content=''))
 
+
         soup = self.index_to_soup('http://www.nytimes.com/pages/todayspaper/index.html')
 
-        # Find each instance of class="section-headline", class="story", class="story headline"
+
+        # Find each article
         for div in soup.findAll(True,
             attrs={'class':['section-headline', 'story', 'story headline','sectionHeader','headlinesOnly multiline flush']}):
 
             if div['class'] in ['section-headline','sectionHeader']:
                 key = string.capwords(feed_title(div))
-                articles[key] = []
-                ans.append(key)
-                #self.log('Section: %s' % key)
-
+                key = key.replace('Op-ed','Op-Ed')
+                key = key.replace('U.s.','U.S.')
             elif div['class'] in ['story', 'story headline'] :
                 handle_article(div)
             elif div['class'] == 'headlinesOnly multiline flush':
                 for lidiv in div.findAll('li'):
                     handle_article(lidiv)
 
-#        ans = self.sort_index_by(ans, {'The Front Page':-1,
-#                'Dining In, Dining Out':1,
-#                'Obituaries':2})
         ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
+        return self.filter_ans(ans)
+
+    def parse_headline_index(self):
+
+        articles = {}
+        ans = []
+        url_list = []
+
+        soup = self.index_to_soup('http://www.nytimes.com/pages/todaysheadlines/')
+
+        # Fetch the content table
+        content_table = soup.find('table',{'id':'content'})
+        if content_table is None:
+            self.log("FATAL ERROR: CANNOT FIND CONTENT TABLE")
+            return None
+
+        # Within this table are entries, each containing one or more h6 tags which represent sections
+
+        for td_col in content_table.findAll('td', {'id' : re.compile('Column')}):
+            for div_sec in td_col.findAll('div',recursive=False):
+                for h6_sec_name in div_sec.findAll('h6',{'style' : re.compile('text-transform: *uppercase')}):
+                    section_name = self.tag_to_string(h6_sec_name,use_alt=False)
+                    section_name = re.sub(r'^ *$','',section_name)
+                    if section_name == '':
+                        continue
+                    section_name=string.capwords(section_name)
+                    if section_name == 'U.s.':
+                        section_name = 'U.S.'
+                    elif section_name == 'Op-ed':
+                        section_name = 'Op-Ed'
+                    pubdate = strftime('%a, %d %b')
+
+                    search_div = div_sec
+                    for next_tag in h6_sec_name.findNextSiblings(True):
+                        if next_tag.__class__.__name__ == 'Tag':
+                            if next_tag.name == 'div':
+                                search_div = next_tag
+                                break
+
+                    # Get the articles
+                    for h3_item in search_div.findAll('h3'):
+                        byline = h3_item.h6
+                        if byline is not None:
+                            author = self.tag_to_string(byline,use_alt=False)
+                        else:
+                            author = ''
+                        a = h3_item.find('a', href=True)
+                        if not a:
+                            continue
+                        url = re.sub(r'\?.*', '', a['href'])
+                        if not url.startswith("http"):
+                            continue
+                        if not url.endswith(".html"):
+                            continue
+                        if 'podcast' in url:
+                            continue
+                        if 'video' in url:
+                            continue
+                        url += '?pagewanted=all'
+                        if url in url_list:
+                            continue
+                        url_list.append(url)
+                        self.log("URL %s" % url)
+                        title = self.tag_to_string(a, use_alt=True).strip()
+                        desc = h3_item.find('p')
+                        if desc is not None:
+                            description = self.tag_to_string(desc,use_alt=False)
+                        else:
+                            description = ''
+                        if not articles.has_key(section_name):
+                            ans.append(section_name)
+                            articles[section_name] = []
+                        articles[section_name].append(dict(title=title, url=url, date=pubdate, description=description, author=author, content=''))
+
+        ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
+        return self.filter_ans(ans)
+
+    def parse_index(self):
+        if self.headlinesOnly:
+            return self.parse_headline_index()
+        else:
+            return self.parse_todays_index()
+
+    def strip_anchors(self,soup):
+        paras = soup.findAll(True)
+        for para in paras:
+            aTags = para.findAll('a')
+            for a in aTags:
+                if a.img is None:
+                    a.replaceWith(a.renderContents().decode('cp1252','replace'))
+        return soup
 
-        return ans
     def preprocess_html(self, soup):
+
         kicker_tag = soup.find(attrs={'class':'kicker'})
-        if kicker_tag:
+        if kicker_tag: # remove Op-Ed author head shots
             tagline = self.tag_to_string(kicker_tag)
-            #self.log("FOUND KICKER %s" % tagline)
             if tagline=='Op-Ed Columnist':
                 img_div = soup.find('div','inlineImage module')
-                #self.log("Searching for photo")
                 if img_div:
                     img_div.extract()
-                    #self.log("Photo deleted")
-        refresh = soup.find('meta', {'http-equiv':'refresh'})
-        if refresh is None:
-            return soup
-        content = refresh.get('content').partition('=')[2]
-        raw = self.browser.open_novisit('http://www.nytimes.com'+content).read()
-        return BeautifulSoup(raw.decode('cp1252', 'replace'))
+        return self.strip_anchors(soup)
 
+    def postprocess_html(self,soup, True):
+
+        if self.one_picture_per_article:
+            # Remove all images after first
+            largeImg = soup.find(True, {'class':'articleSpanImage'})
+            inlineImgs = soup.findAll(True, {'class':'inlineImage module'})
+            if largeImg:
+                for inlineImg in inlineImgs:
+                    inlineImg.extract()
+            else:
+                if inlineImgs:
+                    firstImg = inlineImgs[0]
+                    for inlineImg in inlineImgs[1:]:
+                        inlineImg.extract()
+                    # Move firstImg before article body
+                    #article_body = soup.find(True, {'id':'articleBody'})
+                    cgFirst = soup.find(True, {'class':re.compile('columnGroup *first')})
+                    if cgFirst:
+                        # Strip all sibling NavigableStrings: noise
+                        navstrings = cgFirst.findAll(text=True, recursive=False)
+                        [ns.extract() for ns in navstrings]
+                        headline_found = False
+                        tag = cgFirst.find(True)
+                        insertLoc = 0
+                        while True:
+                            insertLoc += 1
+                            if hasattr(tag,'class') and tag['class'] == 'articleHeadline':
+                                headline_found = True
+                                break
+                            tag = tag.nextSibling
+                            if not tag:
+                                headline_found = False
+                                break
+                        if headline_found:
+                            cgFirst.insert(insertLoc,firstImg)
+                    else:
+                        self.log(">>> No class:'columnGroup first' found <<<")
+
+        # Change captions to italic
+        for caption in soup.findAll(True, {'class':'caption'}) :
+            if caption and caption.contents[0]:
+                cTag = Tag(soup, "p", [("class", "caption")])
+                c = self.fixChars(self.tag_to_string(caption,use_alt=False)).strip()
+                mp_off = c.find("More Photos")
+                if mp_off >= 0:
+                    c = c[:mp_off]
+                cTag.insert(0, c)
+                caption.replaceWith(cTag)
+
+        # Change <nyt_headline> to <h2 class="headline">
+        h1 = soup.find('h1')
+        if h1:
+            headline = h1.find("nyt_headline")
+            if headline:
+                tag = Tag(soup, "h2")
+                tag['class'] = "headline"
+                tag.insert(0, self.fixChars(headline.contents[0]))
+                h1.replaceWith(tag)
+            else:
+                # Blog entry - replace headline, remove <hr> tags
+                hrs = soup.findAll('hr')
+                for hr in hrs:
+                    hr.extract()
+
+        # Change <h1> to <h3> - used in editorial blogs
+        masthead = soup.find("h1")
+        if masthead:
+            # Nuke the href
+            if masthead.a:
+                del(masthead.a['href'])
+            tag = Tag(soup, "h3")
+            tag.insert(0, self.fixChars(masthead.contents[0]))
+            masthead.replaceWith(tag)
+
+        # Change <span class="bold"> to <b>
+        for subhead in soup.findAll(True, {'class':'bold'}) :
+            if subhead.contents:
+                bTag = Tag(soup, "b")
+                bTag.insert(0, subhead.contents[0])
+                subhead.replaceWith(bTag)
+
+        divTag = soup.find('div',attrs={'id':'articleBody'})
+        if divTag:
+            divTag['class'] = divTag['id']
+
+        # Add class="authorId" to <div> so we can format with CSS
+        divTag = soup.find('div',attrs={'id':'authorId'})
+        if divTag and divTag.contents[0]:
+            tag = Tag(soup, "p")
+            tag['class'] = "authorId"
+            tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0],
+                use_alt=False)))
+            divTag.replaceWith(tag)
+
+        return soup
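
Either recipe can be exercised outside the calibre GUI. Assuming a reasonably current calibre, something like the following should work (user name, password and output name are placeholders; --test limits the run to a handful of articles):

    ebook-convert nytimes_sub.recipe nyt.epub --username you@example.com --password secret --test
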