diff --git a/recipes/nytimes.recipe b/recipes/nytimes.recipe index 6afa4318d7..bdf61bc15f 100644 --- a/recipes/nytimes.recipe +++ b/recipes/nytimes.recipe @@ -1,1289 +1,170 @@ -#!/usr/bin/env python2 -# -*- coding: utf-8 -*- -__license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal ' -''' -nytimes.com -''' -import re -import string -import time -from calibre import strftime -from datetime import timedelta, date -from time import sleep -from calibre.web.feeds.recipes import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, BeautifulStoneSoup +#!/usr/bin/env python2 +# vim:fileencoding=utf-8 +# License: GPLv3 Copyright: 2018, Kovid Goyal + +from __future__ import absolute_import, division, print_function, unicode_literals + +from calibre.web.feeds.news import BasicNewsRecipe +from calibre.utils.date import strptime + +is_web_edition = True +# The sections to download when downloading the web edition, comment out +# the section you are not interested in +web_sections = [ + ('World', 'world'), + ('U.S.', 'us'), + ('Politics', 'politics'), + ('New York', 'nyregion'), + ('Business', 'business'), + ('Technology', 'technology'), + ('Sports', 'sports'), + ('Science', 'science'), + ('Health', 'health'), + ('Opinion', 'opinion'), + ('Arts', 'arts'), + ('Books', 'books'), + ('Movies', 'movies'), + ('Music', 'arts/music'), + ('Television', 'arts/television'), + ('Style', 'style'), + ('Dining & Wine', 'dining'), + ('Fashion & Style', 'fashion'), + ('Home & Garden', 'garden'), + ('Travel', 'travel'), + ('Education', 'education'), + ('Multimedia', 'multimedia'), + ('Obituaries', 'obituaries'), + ('Sunday Magazine', 'magazine') +] -class NYTimes(BasicNewsRecipe): +def classes(classes): + q = frozenset(classes.split(' ')) + return dict(attrs={ + 'class': lambda x: x and frozenset(x.split()).intersection(q)}) - recursions = 1 # set this to zero to omit Related articles lists - # speeds up processing by preventing index page links from being followed - match_regexps = [r'/[12][0-9][0-9][0-9]/[0-9]+/'] - # set getTechBlogs to True to include the technology blogs - # set tech_oldest_article to control article age - # set tech_max_articles_per_feed to control article count - getTechBlogs = True - remove_empty_feeds = True - tech_oldest_article = 14 - tech_max_articles_per_feed = 25 +class NewYorkTimes(BasicNewsRecipe): - # set getPopularArticles to False if you don't want the Most E-mailed and Most Viewed articles - # otherwise you will get up to 20 of the most popular e-mailed and viewed articles (in each category) - # This is currently disabled because the NYT is changing this functionality - # on their website to a new "Trending" page - getPopularArticles = False - popularPeriod = '1' # set this to the number of days to include in the measurement - # e.g. 7 will get the most popular measured over the last 7 days - # and 30 will get the most popular measured over 30 days. - # you still only get up to 20 articles in each category - - # set headlinesOnly to True for the headlines-only version. If True, - # webEdition is ignored. - headlinesOnly = True - - # set webEdition to True for the Web edition of the newspaper. Set oldest_web_article to the - # number of days old an article can be for inclusion. If oldest_web_article = None all articles - # will be included. 
Note: oldest_web_article is ignored if webEdition = - # False - webEdition = False - oldest_web_article = None - - # download higher resolution images than the small thumbnails typically included in the article - # the down side of having large beautiful images is the file size is much - # larger, on the order of 7MB per paper - useHighResImages = True + title = 'The New York Times' + if is_web_edition: + description = 'New York Times (Web). You can edit the recipe to remove sections you are not interested in.' + else: + description = 'Today\'s New York Times' + encoding = 'utf-8' + __author__ = 'Kovid Goyal' + language = 'en' + ignore_duplicate_articles = {'title', 'url'} + no_stylesheets = True compress_news_images = True compress_news_images_auto_size = 5 - # replace paid Kindle Version: the name will be changed to "The New York Times" to cause - # previous paid versions of the new york times to best sent to the back - # issues folder on the kindle - replaceKindleVersion = False - - # includeSections: List of sections to include. If empty, all sections found will be included. - # Otherwise, only the sections named will be included. For example, - # - # includeSections = ['Politics','Sports'] - # - # would cause only the Politics and Sports sections to be included. - - includeSections = [] # by default, all sections included - - # excludeSections: List of sections to exclude. If empty, all sections found will be included. - # Otherwise, the sections named will be excluded. For example, - # - # excludeSections = ['Politics','Sports'] - # - # would cause the Politics and Sports sections to be excluded. This parameter can be used - # in conjuction with includeSections although in most cases using one or the other, but - # not both, is sufficient. - - excludeSections = [] - - # one_picture_per_article specifies that calibre should only use the first image - # from an article (if one exists). If one_picture_per_article = True, the image - # will be moved to a location between the headline and the byline. - # If one_picture_per_article = False, all images from the article will be included - # and shown in their original location. - one_picture_per_article = False - - # The maximum number of articles that will be downloaded - max_articles_per_feed = 100 - use_embedded_content = False - - # Whether to omit duplicates of articles (typically arsing when articles are indexed in - # more than one section). If True, only the first occurance will be - # downloaded. - filterDuplicates = True - - # Sections to collect for the Web edition. 
- # Delete any you don't want, or use includeSections or excludeSections - web_sections = [(u'World', u'world'), - (u'U.S.', u'national'), - (u'Politics', u'politics'), - (u'New York', u'nyregion'), - (u'Business', 'business'), - (u'Technology', u'technology'), - (u'Sports', u'sports'), - (u'Science', u'science'), - (u'Health', u'health'), - (u'Opinion', u'opinion'), - (u'Arts', u'arts'), - (u'Books', u'books'), - (u'Movies', u'movies'), - (u'Music', u'arts/music'), - (u'Television', u'arts/television'), - (u'Style', u'style'), - (u'Dining & Wine', u'dining'), - (u'Fashion & Style', u'fashion'), - (u'Home & Garden', u'garden'), - (u'Travel', u'travel'), - ('Education', u'education'), - ('Multimedia', u'multimedia'), - (u'Obituaries', u'obituaries'), - (u'Sunday Magazine', u'magazine') - ] - - tech_feeds = [ - (u'Tech - News', u'http://pogue.blogs.nytimes.com/feed/'), - (u'Tech - Gadgetwise', u'http://gadgetwise.blogs.nytimes.com/feed/'), - (u'Tech - Open', u'http://open.blogs.nytimes.com/feed/') + keep_only_tags = [ + dict(id='story-header'), + classes('story-body-supplemental story-interrupter'), + ] + remove_tags = [ + dict(attrs={'aria-label':'tools'.split()}), + dict(attrs={'data-videoid':True}), + dict(name='button'), + dict(id=lambda x: x and x.startswith('story-ad-')), + dict(name='a', href=lambda x: x and '#story-continues-' in x), + dict(name='a', href=lambda x: x and '#whats-next' in x), + dict(id=lambda x: x and 'sharetools-' in x), + dict(id='newsletter-promo'.split()), ] - if headlinesOnly: - title = 'New York Times Headlines' - description = 'Headlines from the New York Times' - needs_subscription = False - elif webEdition: - title = 'New York Times (Web)' - description = 'New York Times on the Web' - needs_subscription = False - elif replaceKindleVersion: - title = 'The New York Times' - description = 'Today\'s New York Times' - needs_subscription = False - else: - title = 'New York Times' - description = 'Today\'s New York Times' - needs_subscription = False + def read_nyt_metadata(self): + INDEX = 'https://www.nytimes.com/section/todayspaper' + # INDEX = 'file:///t/raw.html' + soup = self.index_to_soup(INDEX) + pdate = soup.find('meta', attrs={'name':'pdate', 'content': True})['content'] + date = strptime(pdate, '%Y%m%d', assume_utc=False, as_utc=False) + self.cover_url = 'https://static01.nyt.com/images/{}/nytfrontpage/scan.jpg'.format(date.strftime('%Y/%m/%d')) + self.timefmt = date.strftime(' [%d %b, %Y]') + return soup - def decode_url_date(self, url): - urlitems = url.split('/') - try: - d = date(int(urlitems[3]), int(urlitems[4]), int(urlitems[5])) - except: - try: - d = date(int(urlitems[4]), int(urlitems[5]), int(urlitems[6])) - except: - return None - return d + def parse_todays_sections(self, container): + for h2 in container.findAll('h2', **classes('headline')): + title = self.tag_to_string(h2) + a = h2.find('a', href=True) + url = a['href'] + if '?' 
in url: + url = url.split('?')[0] + p = h2.findParent(**classes('story-body')) + desc = '' + if p is not None: + s = p.find(**classes('summary')) + if s is not None: + desc = self.tag_to_string(s) + self.log('\t', title, ': ', url) + self.log('\t\t', desc) + yield {'title': title, 'url': url, 'description': desc} - if oldest_web_article is None: - earliest_date = date.today() - else: - earliest_date = date.today() - timedelta(days=oldest_web_article) - oldest_article = 365 # by default, a long time ago - - __author__ = 'GRiker/Kovid Goyal/Nick Redding' - language = 'en' - requires_version = (0, 7, 5) - encoding = 'utf-8' - - timefmt = '' - - # simultaneous_downloads = 1 # no longer required to deal with ads - - cover_margins = (18, 18, 'grey99') - - keep_only_tags = dict(id=['article', 'story', 'content']) - remove_tags = [ - dict(attrs={'class': [ - 'articleFooter', - 'articleTools', - 'rfd', 'story-footer-links', 'page-footer', - 'columnGroup singleRule', - 'columnGroup last', - 'columnGroup last', - 'doubleRule', - 'dottedLine', - 'entry-meta', - 'entry-response module', - 'leftNavTabs', - 'metaFootnote', - 'inside-story', - 'module box nav', - 'nextArticleLink', - 'nextArticleLink clearfix', - 'post-tools', - 'relatedSearchesModule', - 'side_tool', - 'singleAd', - 'postCategory column', - 'refer tagRefer', # added for bits blog post - 'entry entry-utility', # added for DealBook - 'entry-tags', # added for DealBook - 'footer promos clearfix', # added for DealBook - 'footer links clearfix', # added for DealBook - 'tabsContainer', # added for other blog downloads - 'column lastColumn', # added for other blog downloads - 'pageHeaderWithLabel', # added for other gadgetwise downloads - 'column two', # added for other blog downloads - 'column two last', # added for other blog downloads - 'column three', # added for other blog downloads - 'column three last', # added for other blog downloads - 'column four', # added for other blog downloads - 'column four last', # added for other blog downloads - 'column last', # added for other blog downloads - 'entry entry-related', - 'subNavigation tabContent active', # caucus blog navigation - 'mediaOverlay slideshow', - 'wideThumb', - 'video', # added 02-11-2011 - 'videoHeader', # added 02-11-2011 - 'articleInlineVideoHolder', # added 02-11-2011 - 'assetCompanionAd', - 'nytint-sectionHeader', - re.compile('^subNavigation'), - re.compile('^leaderboard'), - re.compile('^module'), - re.compile('commentCount'), - 'lede-container', - 'credit', - 'caption-video', - 'upshot-social' - ]}), - dict( - attrs={'class': lambda x: x and 'related-coverage-marginalia' in x.split()}), - dict(attrs={'class': lambda x: x and 'hidden' in x.split()}), - dict(attrs={'class': lambda x: x and 'interactive' in x.split()}), - dict(attrs={'class': lambda x: x and 'SectionBarShare' in x.split('-')}), - dict(attrs={'class': lambda x: x and 'ResponsiveAd' in x.split('-')}), - dict(attrs={'class': lambda x: x and 'skip-to-text-link' in x.split()}), - dict(attrs={'class': lambda x: x and 'sharetools' in x.split()}), - dict(attrs={'class': lambda x: x and 'ad' in x.split()}), - dict(attrs={'class': lambda x: x and 'video' in x.split()}), - dict(attrs={'class': lambda x: x and 'visually-hidden' in x.split()}), - dict(name='div', attrs={'class': re.compile('toolsList')}), # bits - dict(name='div', attrs={ - 'class': re.compile('postNavigation')}), # bits - dict(name='div', attrs={'class': 'tweet'}), - dict(name='span', attrs={'class': 'commentCount meta'}), - dict(name='div', attrs={'id': 
'header'}), - # bits, pogue, gadgetwise, open - dict(name='div', attrs={'id': re.compile('commentsContainer')}), - # pogue, gadgetwise - dict(name='ul', attrs={'class': re.compile('entry-tools')}), - # pogue, gadgetwise - dict(name='div', attrs={'class': re.compile('nocontent')}), - dict(name='div', attrs={'id': re.compile('respond')}), # open - dict(name='div', attrs={'class': re.compile('entry-tags')}), # pogue - dict(name='h4', attrs={'class': 'headline'}), - dict(id=[ - 'adxLeaderboard', - 'pagelinks', - 'adxSponLink', - 'anchoredAd_module', - 'anchoredAd_spot', - 'archive', - 'articleExtras', - 'articleInline', - 'blog_sidebar', - 'businessSearchBar', - 'cCol', - 'entertainmentSearchBar', - 'footer', - 'header', - 'header_search', - 'inlineBox', - 'login', - 'masthead', - 'masthead-nav', - 'masthead-social', - 'memberTools', - 'navigation', 'navigation-ghost', 'navigation-modal', 'navigation-edge', - 'page-footer', - 'portfolioInline', - 'readerReviews', - 'readerReviewsCount', - 'relatedArticles', - 'relatedTopics', - 'respond', - 'ribbon', - 'side_search', - 'side_index', - 'side_tool', - 'toolsRight', - 'skybox', # added for DealBook - 'TopAd', # added for DealBook - 'related-content', # added for DealBook - 'whats-next', - 'newsletter-promo', - ]), - dict(name=['script', 'noscript', 'style', 'form', 'hr', 'button', 'meta', 'footer'])] - no_stylesheets = True - extra_css = ''' - .articleHeadline { text-align: left; margin-top:0.5em; margin-bottom:0.25em; } - .credit { font-weight: normal; text-align: right; font-size: - 50%; line-height:1em; margin-top:5px; margin-left:0; - margin-right:0; margin-bottom: 0; } - .byline { text-align: left; font-size: 50%; line-height:1em; margin-top:10px; margin-left:0; margin-right:0; margin-bottom: 0; } - .dateline { text-align: left; font-size: 50%; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; } - .kicker { font-size: 50%; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; } - .timestamp { font-weight: normal; text-align: left; font-size: 50%; } - .caption { font-size: 50%; font-style:italic; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; } - a:link {text-decoration: none; } - .date{font-size: 50%; } - .update{font-size: 50%; } - .articleBody { } - .authorId {text-align: left; font-size: 50%; } - .image {text-align: center;} - .aside {color:blue;margin:0px 0px 0px 0px; padding: 0px 0px 0px 0px; font-size:100%;} - .asidenote {color:blue;margin:0px 0px 0px 0px; padding: 0px 0px 0px 0px; font-size:100%;font-weight:bold;} - .source {text-align: left; font-size: x-small; }''' - - articles = {} - key = None - ans = [] - url_list = [] - - def filter_ans(self, ans): - total_article_count = 0 - idx = 0 - idx_max = len(ans) - 1 - while idx <= idx_max: - if self.includeSections != []: - if ans[idx][0] not in self.includeSections: - print "SECTION NOT INCLUDED: ", ans[idx][0] - del ans[idx] - idx_max = idx_max - 1 - continue - if ans[idx][0] in self.excludeSections: - print "SECTION EXCLUDED: ", ans[idx][0] - del ans[idx] - idx_max = idx_max - 1 - continue - if True: # self.verbose - self.log("Section %s: %d articles" % - (ans[idx][0], len(ans[idx][1]))) - for article in ans[idx][1]: - total_article_count += 1 - if True: # self.verbose - self.log("\t%-40.40s... \t%-60.60s..." 
% (article['title'].encode('cp1252', 'replace'), - article['url'].encode('cp1252', 'replace'))) - idx = idx + 1 - - self.log("Queued %d articles" % total_article_count) - return ans - - def exclude_url(self, url): - if not url.startswith("http"): - return True - # added for DealBook - if not url.endswith(".html") and 'dealbook.nytimes.com' not in url: - return True - if 'nytimes.com' not in url: - return True - if 'cn.nytimes.com' in url: - return True - if '/es/' in url: - return True - if 'podcast' in url: - return True - if '/video/' in url: - return True - if '/multimedia/' in url: - return True - if '/slideshow/' in url: - return True - if '/magazine/index' in url: - return True - if '/interactive/' in url: - return True - if '/reference/' in url: - return True - if '/premium/' in url: - return True - if '#comment' in url: - return True - if '#postComment' in url: - return True - if '#postcomment' in url: - return True - if re.search('/\d\d\d\d/\d\d/\d\d/', url) is None: - print("NO DATE IN " + url) - return True - return False - - def fixChars(self, string): - # Replace lsquo (\x91) - fixed = re.sub("\x91", "‘", string) - - # Replace rsquo (\x92) - fixed = re.sub("\x92", "’", fixed) - - # Replace ldquo (\x93) - fixed = re.sub("\x93", "“", fixed) - - # Replace rdquo (\x94) - fixed = re.sub("\x94", "”", fixed) - - # Replace ndash (\x96) - fixed = re.sub("\x96", "–", fixed) - - # Replace mdash (\x97) - fixed = re.sub("\x97", "—", fixed) - - return fixed - - def get_browser(self): - br = BasicNewsRecipe.get_browser(self) - return br - - cover_tag = 'NY_NYT' - - def get_cover_url(self): - from datetime import date - today = date.today() - cover = 'https://static01.nyt.com/images/' \ - + today.strftime('%Y') + '/' + today.strftime('%m') + '/' \ - + today.strftime('%d') + '/nytfrontpage/scan.jpg' - self.log(cover) - br = BasicNewsRecipe.get_browser(self) - try: - br.open(cover) - except: - self.log("\nCover unavailable") - cover = None - return cover - - masthead_url = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif' - - def short_title(self): - return self.title - - def article_to_soup(self, url_or_raw, raw=False): - from contextlib import closing - import copy - from calibre.ebooks.chardet import xml_to_unicode - print("ARTICLE_TO_SOUP " + url_or_raw) - if re.match(r'\w+://', url_or_raw): - br = self.clone_browser(self.browser) - open_func = getattr(br, 'open_novisit', br.open) - with closing(open_func(url_or_raw)) as f: - _raw = f.read() - if not _raw: - raise RuntimeError( - 'Could not fetch index from %s' % url_or_raw) - else: - _raw = url_or_raw - if raw: - return _raw - if not isinstance(_raw, unicode) and self.encoding: - if callable(self.encoding): - _raw = self.encoding(_raw) - else: - _raw = _raw.decode(self.encoding, 'replace') - - nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE) - nmassage.extend(self.preprocess_regexps) - nmassage += [(re.compile(r'<!DOCTYPE .+?>', re.DOTALL), lambda m: '')] - # Some websites have buggy doctype declarations that mess up beautifulsoup - # Remove comments as they can leave detritus when extracting tags leaves - # multiple nested comments - nmassage.append((re.compile(r'<!--.+?-->', re.DOTALL), lambda m: '')) - usrc = xml_to_unicode(_raw, self.verbose, strip_encoding_pats=True)[0] - usrc = self.preprocess_raw_html(usrc, url_or_raw) - return BeautifulSoup(usrc, markupMassage=nmassage) - - def massageNCXText(self, description): - # Kindle TOC descriptions won't render certain characters - if description: - massaged = unicode(BeautifulStoneSoup( 
description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)) - # Replace '&' with '&' - massaged = re.sub("&", "&", massaged) - massaged = re.sub("&", "&", massaged) - return self.fixChars(massaged) - else: - return description - - def feed_title(self, div): - return ''.join(div.findAll(text=True, recursive=True)).strip() - - def handle_article(self, div): - thumbnail = div.find('div', 'thumbnail') - if thumbnail: - thumbnail.extract() - return self.handle_base_article(div) - - # Handle '<article>
' in world, u.s., etc - def handle_article_tag(self, div): - thumbnail = div.find('figure', 'media photo') - if not thumbnail: - thumbnail = div.find('div', 'thumb') - if thumbnail: - thumbnail.extract() - div = div.find('div', 'story-body') - if not div: - return - return self.handle_base_article(div) - - def handle_base_article(self, div): - a = div.find('a', href=True) - if not a: - return - url = re.sub(r'\?.*', '', a['href']) - if self.exclude_url(url): - return - url += '?pagewanted=all' - if self.filterDuplicates: - if url in self.url_list: - return - if self.webEdition: - date_tag = self.decode_url_date(url) - if date_tag is not None: - if self.oldest_web_article is not None: - if date_tag < self.earliest_date: - self.log("Skipping article %s" % url) - return - else: - self.log("Skipping article %s" % url) - return - self.url_list.append(url) - title = self.tag_to_string(a, use_alt=True).strip() - description = '' - pubdate = strftime('%a, %d %b') - summary = div.find(True, attrs={'class': 'summary'}) - if summary: - description = self.tag_to_string(summary, use_alt=False) - author = '' - authorAttribution = div.find(True, attrs={'class': 'byline'}) - if authorAttribution: - author = self.tag_to_string(authorAttribution, use_alt=False) - else: - authorAttribution = div.find(True, attrs={'class': 'byline'}) - if authorAttribution: - author = self.tag_to_string(authorAttribution, use_alt=False) - feed = self.key if self.key is not None else 'Uncategorized' - if feed not in self.articles: - self.ans.append(feed) - self.articles[feed] = [] - self.articles[feed].append( - dict(title=title, url=url, date=pubdate, - description=description, author=author, - content='')) - - def get_popular_articles(self, ans): - if self.getPopularArticles: - popular_articles = {} - key_list = [] - - def handleh3(h3tag): - try: - url = h3tag.a['href'] - except: - return ('', '', '', '') - url = re.sub(r'\?.*', '', url) - if self.exclude_url(url): - return ('', '', '', '') - url += '?pagewanted=all' - title = self.tag_to_string(h3tag.a, False) - h6tag = h3tag.findNextSibling('h6') - if h6tag is not None: - author = self.tag_to_string(h6tag, False) - else: - author = '' - ptag = h3tag.findNextSibling('p') - if ptag is not None: - desc = self.tag_to_string(ptag, False) - else: - desc = '' - return(title, url, author, desc) - - have_emailed = False - emailed_soup = self.index_to_soup( - 'http://www.nytimes.com/most-popular-emailed?period=' + self.popularPeriod) - for h3tag in emailed_soup.findAll('h3'): - (title, url, author, desc) = handleh3(h3tag) - if url == '': - continue - if not have_emailed: - key_list.append('Most E-Mailed') - popular_articles['Most E-Mailed'] = [] - have_emailed = True - popular_articles['Most E-Mailed'].append( - dict(title=title, url=url, date=strftime('%a, %d %b'), - description=desc, author=author, - content='')) - have_viewed = False - viewed_soup = self.index_to_soup( - 'http://www.nytimes.com/most-popular-viewed?period=' + self.popularPeriod) - for h3tag in viewed_soup.findAll('h3'): - (title, url, author, desc) = handleh3(h3tag) - if url == '': - continue - if not have_viewed: - key_list.append('Most Viewed') - popular_articles['Most Viewed'] = [] - have_viewed = True - popular_articles['Most Viewed'].append( - dict(title=title, url=url, date=strftime('%a, %d %b'), - description=desc, author=author, - content='')) - viewed_ans = [(k, popular_articles[k]) - for k in key_list if k in popular_articles] - for x in viewed_ans: - ans.append(x) - return ans - - def 
get_tech_feeds(self, ans): - if self.getTechBlogs: - tech_articles = {} - key_list = [] - save_oldest_article = self.oldest_article - save_max_articles_per_feed = self.max_articles_per_feed - self.oldest_article = self.tech_oldest_article - self.max_articles_per_feed = self.tech_max_articles_per_feed - self.feeds = self.tech_feeds - tech = self.parse_feeds() - self.oldest_article = save_oldest_article - self.max_articles_per_feed = save_max_articles_per_feed - self.feeds = None - for f in tech: - key_list.append(f.title) - tech_articles[f.title] = [] - for a in f.articles: - tech_articles[f.title].append( - dict(title=a.title, url=a.url.partition('?')[0], date=a.date, - description=a.summary, author=a.author, - content=a.content)) - tech_ans = [(k, tech_articles[k]) - for k in key_list if k in tech_articles] - for x in tech_ans: - ans.append(x) - return ans - - def parse_web_edition(self): - - for (sec_title, index_url) in self.web_sections: - if self.includeSections != []: - if sec_title not in self.includeSections: - print "SECTION NOT INCLUDED: ", sec_title - continue - if sec_title in self.excludeSections: - print "SECTION EXCLUDED: ", sec_title - continue - try: - soup = self.index_to_soup( - 'https://www.nytimes.com/pages/' + index_url + '/index.html') - except: - continue - print 'Index URL: ' + 'https://www.nytimes.com/pages/' + index_url + '/index.html' - - self.key = sec_title - # Find each article - for div in soup.findAll('article'): - self.handle_article_tag(div) - for div in soup.findAll(True, attrs={ - 'class': ['section-headline', 'ledeStory', 'story', 'story headline', 'sectionHeader', 'headlinesOnly multiline flush']}): - if div['class'] in ['story', 'story headline', 'storyHeader']: - self.handle_article(div) - elif div['class'] == 'ledeStory': - divsub = div.find('div', 'storyHeader') - if divsub is not None: - self.handle_article(divsub) - ulrefer = div.find('ul', 'refer') - if ulrefer is not None: - for lidiv in ulrefer.findAll('li'): - self.handle_article(lidiv) - elif div['class'] == 'headlinesOnly multiline flush': - for lidiv in div.findAll('li'): - self.handle_article(lidiv) - - self.ans = [(k, self.articles[k]) - for k in self.ans if k in self.articles] - return self.filter_ans(self.get_tech_feeds(self.get_popular_articles(self.ans))) - - def parse_todays_index(self): - - soup = self.index_to_soup( - 'https://www.nytimes.com/pages/todayspaper/index.html') - skipping = False - # Find each article - for div in soup.findAll(True, - attrs={'class': ['section-headline', 'story', 'story headline', 'sectionHeader', 'headlinesOnly multiline flush']}): - if div['class'] in ['section-headline', 'sectionHeader']: - self.key = string.capwords(self.feed_title(div)) - self.key = self.key.replace('Op-ed', 'Op-Ed') - self.key = self.key.replace('U.s.', 'U.S.') - self.key = self.key.replace('N.y.', 'N.Y.') - skipping = False - if self.includeSections != []: - if self.key not in self.includeSections: - print "SECTION NOT INCLUDED: ", self.key - skipping = True - if self.key in self.excludeSections: - print "SECTION EXCLUDED: ", self.key - skipping = True - - elif div['class'] in ['story', 'story headline']: - if not skipping: - self.handle_article(div) - elif div['class'] == 'headlinesOnly multiline flush': - for lidiv in div.findAll('li'): - if not skipping: - self.handle_article(lidiv) - - self.ans = [(k, self.articles[k]) - for k in self.ans if k in self.articles] - return self.filter_ans(self.get_tech_feeds(self.get_popular_articles(self.ans))) - - def 
parse_headline_index(self): - soup = self.index_to_soup( - 'https://www.nytimes.com/pages/todaysheadlines/') - pubdate = strftime('%a, %d %b') - section = None - articles = [] + def parse_todays_page(self): + soup = self.read_nyt_metadata() + section = soup.find(id='collection-todays-new-york-times') feeds = [] - for h6 in soup.findAll('h6'): - section = self.tag_to_string(h6).strip() - articles = [] - table = h6.parent.findNextSibling('table') - if table is None: - continue - for a in table.findAll('a', attrs={'class':'headURL'}): - title = self.tag_to_string(a) - url = a['href'].partition('?')[0] - if self.exclude_url(url) or (self.filterDuplicates and url in self.url_list): - continue - self.url_list.append(url) - desc = '' - h4 = a.findNextSibling('h4') - if h4 is not None: - desc += self.tag_to_string(h4) - p = a.findNextSibling('p') - if p is not None: - desc += ' ' + self.tag_to_string(p) - articles.append({'title':title, 'url':url + '?pagewanted=all', 'date':pubdate, 'description':desc}) + for h1 in section.findAll('h1')[1:]: + section_title = self.tag_to_string(h1) + self.log('Found section:', section_title) + articles = list(self.parse_todays_sections(h1.parent)) if articles: - feeds.append((section, articles)) - self.ans = feeds - return self.filter_ans(self.ans) + feeds.append((section_title, articles)) + return feeds + + def parse_highlights(self, container): + for article in container.findAll('article', **classes('story')): + h2 = article.find('h2') + if h2 is not None: + title = self.tag_to_string(h2) + a = h2.find('a', href=True) + if a is not None: + url = a['href'] + desc = '' + p = article.find(**classes('summary')) + if p is not None: + desc = self.tag_to_string(p) + yield {'title': title, 'url': url, 'description': desc} + + def parse_web_section(self, soup, slug): + + def log(article): + self.log('\t', article['title'], ':', article['url']) + if article.get('description'): + self.log('\t\t', article['description']) + + container = soup.find(itemtype='http://schema.org/CollectionPage') + highlights = container.find('section', **classes('highlights')) + for article in self.parse_highlights(highlights): + log(article) + yield article + extra = container.find('section', attrs={'data-collection-type': True}) + if extra is not None: + title = self.tag_to_string(extra.find('h2')) + for article in self.parse_highlights(extra): + article['title'] = '{}: {}'.format(title, article['title']) + log(article) + yield article + + def parse_web_sections(self): + feeds = [] + for section_title, slug in web_sections: + url = 'https://www.nytimes.com/section/' + slug + try: + soup = self.index_to_soup(url) + except Exception: + self.log.error('Failed to download section:', url) + continue + self.log('Found section:', section_title) + articles = list(self.parse_web_section(soup, slug)) + if articles: + feeds.append((section_title, articles)) + if self.test and len(feeds) >= self.test[0]: + break + return feeds def parse_index(self): - if self.headlinesOnly: - return self.parse_headline_index() - elif self.webEdition: - return self.parse_web_edition() - else: - return self.parse_todays_index() - - def strip_anchors(self, soup, kill_all=False): - paras = soup.findAll(True) - for para in paras: - aTags = para.findAll('a') - for a in aTags: - if a.img is None: - if kill_all or (self.recursions == 0): - a.replaceWith(self.tag_to_string(a, False)) - else: - if 'href' in a: - if a['href'].startswith('http://www.nytimes'): - if not a['href'].endswith('pagewanted=all'): - url = re.sub(r'\?.*', '', 
a['href']) - if self.exclude_url(url): - a.replaceWith( - self.tag_to_string(a, False)) - else: - a['href'] = url + '?pagewanted=all' - elif not (a['href'].startswith('http://pogue') or - a['href'].startswith('http://bits') or - a['href'].startswith('http://travel') or - a['href'].startswith('http://business') or - a['href'].startswith('http://tech') or - a['href'].startswith('http://health') or - a['href'].startswith('http://dealbook') or - a['href'].startswith('http://open')): - a.replaceWith(self.tag_to_string(a, False)) - return soup - - def handle_tags(self, soup): - try: - print("HANDLE TAGS: TITLE = " + self.tag_to_string(soup.title)) - except: - print("HANDLE TAGS: NO TITLE") - if soup is None: - print("ERROR: handle_tags received NoneType") - return None - - if self.keep_only_tags: - body = Tag(soup, 'body') - try: - if isinstance(self.keep_only_tags, dict): - self.keep_only_tags = [self.keep_only_tags] - for spec in self.keep_only_tags: - for tag in soup.find('body').findAll(**spec): - body.insert(len(body.contents), tag) - soup.find('body').replaceWith(body) - except AttributeError: # soup has no body element - pass - - def remove_beyond(tag, next): - while tag is not None and getattr(tag, 'name', None) != 'body': - after = getattr(tag, next) - while after is not None: - ns = getattr(tag, next) - after.extract() - after = ns - tag = tag.parent - - if self.remove_tags_after is not None: - rt = [self.remove_tags_after] if isinstance( - self.remove_tags_after, dict) else self.remove_tags_after - for spec in rt: - tag = soup.find(**spec) - remove_beyond(tag, 'nextSibling') - - if self.remove_tags_before is not None: - tag = soup.find(**self.remove_tags_before) - remove_beyond(tag, 'previousSibling') - - for kwds in self.remove_tags: - for tag in soup.findAll(**kwds): - tag.extract() - - return soup - - def preprocess_html(self, soup): - skip_tag = soup.find(True, {'name': 'skip'}) - if skip_tag is not None: - url = 'http://www.nytimes.com' + skip_tag.parent['href'] - self.log.warn("Skipping ad to article at '%s'" % url) - sleep(5) - soup = self.handle_tags(self.article_to_soup(url)) - - # check if the article is from one of the tech blogs - blog = soup.find( - 'div', attrs={'id': ['pogue', 'bits', 'gadgetwise', 'open']}) - - if blog is not None: - old_body = soup.find('body') - new_body = Tag(soup, 'body') - new_body.append(soup.find('div', attrs={'id': 'content'})) - new_body.find('div', attrs={'id': 'content'})[ - 'id'] = 'blogcontent' # identify for postprocess_html - old_body.replaceWith(new_body) - for divr in soup.findAll('div', attrs={'class': re.compile('w190 right')}): - if divr.find(text=re.compile('Sign up')): - divr.extract() - divr = soup.find( - 'div', attrs={'class': re.compile('^relatedArticlesModule')}) - if divr is not None: - print("PROCESSING RELATED: " + - self.tag_to_string(soup.title, False)) - # handle related articles - rlist = [] - ul = divr.find('ul') - if ul is not None: - for li in ul.findAll('li'): - atag = li.find('a') - if atag is not None: - if atag['href'].startswith('http://pogue') or atag['href'].startswith('http://bits') or \ - atag['href'].startswith('http://open'): - atag.find(text=True).replaceWith( - self.massageNCXText(self.tag_to_string(atag, False))) - rlist.append(atag) - divr.extract() - if rlist != []: - asidediv = Tag(soup, 'div', [('class', 'aside')]) - if soup.find('hr') is None: - asidediv.append(Tag(soup, 'hr')) - h4 = Tag(soup, 'h4', [('class', 'asidenote')]) - h4.insert(0, "Related Posts") - asidediv.append(h4) - ul = Tag(soup, 
'ul') - for r in rlist: - li = Tag(soup, 'li', [('class', 'aside')]) - r['class'] = 'aside' - li.append(r) - ul.append(li) - asidediv.append(ul) - asidediv.append(Tag(soup, 'hr')) - smain = soup.find('body') - smain.append(asidediv) - else: - print("CANNOT FIND RELATED: " + - self.tag_to_string(soup.title, False)) - for atag in soup.findAll('a'): - img = atag.find('img') - if img is not None: - atag.replaceWith(img) - elif 'href' not in atag: - atag.replaceWith( - atag.renderContents().decode('cp1252', 'replace')) - elif not (atag['href'].startswith('http://www.nytimes') or atag['href'].startswith('http://pogue') or - atag['href'].startswith('http://bits') or atag['href'].startswith('http://open')): - atag.replaceWith( - atag.renderContents().decode('cp1252', 'replace')) - hdr = soup.find('address') - if hdr is not None: - hdr.name = 'span' - for span_credit in soup.findAll('span', 'credit'): - sp = Tag(soup, 'span') - span_credit.replaceWith(sp) - sp.append(Tag(soup, 'br')) - sp.append(span_credit) - sp.append(Tag(soup, 'br')) - - else: # nytimes article - - related = [] # these will be the related articles - first_outer = None # first related outer tag - first_related = None # first related tag - for outerdiv in soup.findAll(attrs={'class': re.compile('articleInline runaroundLeft')}): - for rdiv in soup.findAll('div', 'columnGroup doubleRule'): - if rdiv.find('h3') is not None: - if self.tag_to_string(rdiv.h3, False).startswith('Related'): - rdiv.h3.find(text=True).replaceWith( - "Related articles") - rdiv.h3['class'] = 'asidenote' - for litag in rdiv.findAll('li'): - if litag.find('a') is not None: - if litag.find('a')['href'].startswith('http://www.nytimes.com'): - url = re.sub( - r'\?.*', '', litag.find('a')['href']) - litag.find('a')[ - 'href'] = url + '?pagewanted=all' - litag.extract() - related.append(litag) - if first_related is None: - first_related = rdiv - first_outer = outerdiv - else: - litag.extract() - for h6tag in rdiv.findAll('h6'): - if h6tag.find('a') is not None: - if h6tag.find('a')['href'].startswith('http://www.nytimes.com'): - url = re.sub( - r'\?.*', '', h6tag.find('a')['href']) - h6tag.find('a')[ - 'href'] = url + '?pagewanted=all' - h6tag.extract() - related.append(h6tag) - if first_related is None: - first_related = rdiv - first_outer = outerdiv - else: - h6tag.extract() - if related != []: - for r in related: - if r.h6: # don't want the anchor inside a h6 tag - r.h6.replaceWith(r.h6.a) - first_related.ul.append(r) - first_related.insert(0, Tag(soup, 'hr')) - first_related.append(Tag(soup, 'hr')) - first_related['class'] = 'aside' - # replace the outer tag with the related tag - first_outer.replaceWith(first_related) - - for rdiv in soup.findAll(attrs={'class': re.compile('articleInline runaroundLeft')}): - rdiv.extract() - - kicker_tag = soup.find(attrs={'class': 'kicker'}) - if kicker_tag: # remove Op_Ed author head shots - tagline = self.tag_to_string(kicker_tag) - if tagline == 'Op-Ed Columnist': - img_div = soup.find('div', 'inlineImage module') - if img_div: - img_div.extract() - - if self.useHighResImages: - try: - # open up all the "Enlarge this Image" pop-ups and download - # the full resolution jpegs - enlargeThisList = soup.findAll( - 'div', {'class': 'icon enlargeThis'}) - if enlargeThisList: - for popupref in enlargeThisList: - popupreflink = popupref.find('a') - if popupreflink: - reflinkstring = str(popupreflink['href']) - refstart = reflinkstring.find( - "javascript:pop_me_up2('") + len("javascript:pop_me_up2('") - refend = reflinkstring.find( 
- ".html", refstart) + len(".html") - reflinkstring = reflinkstring[refstart:refend] - - popuppage = self.browser.open(reflinkstring) - popuphtml = popuppage.read() - popuppage.close() - if popuphtml: - st = time.localtime() - year = str(st.tm_year) - month = "%.2d" % st.tm_mon - day = "%.2d" % st.tm_mday - imgstartpos = popuphtml.find('http://graphics8.nytimes.com/images/' + year + '/' + month + '/' + day + '/') + \ - len('http://graphics8.nytimes.com/images/' + - year + '/' + month + '/' + day + '/') - highResImageLink = 'http://graphics8.nytimes.com/images/' + year + '/' + \ - month + '/' + day + '/' + \ - popuphtml[imgstartpos:popuphtml.find( - '.jpg', imgstartpos) + 4] - popupSoup = BeautifulSoup(popuphtml) - highResTag = popupSoup.find( - 'img', {'src': highResImageLink}) - if highResTag: - try: - newWidth = highResTag['width'] - newHeight = highResTag['height'] - imageTag = popupref.parent.find( - "img") - except: - self.log( - "Error: finding width and height of img") - popupref.extract() - if imageTag: - try: - imageTag[ - 'src'] = highResImageLink - imageTag['width'] = newWidth - imageTag['height'] = newHeight - except: - self.log( - "Error setting the src width and height parameters") - except Exception: - self.log("Error pulling high resolution images") - - try: - # in case pulling images failed, delete the enlarge this - # text - enlargeThisList = soup.findAll( - 'div', {'class': 'icon enlargeThis'}) - if enlargeThisList: - for popupref in enlargeThisList: - popupref.extract() - except: - self.log("Error removing Enlarge this text") - - return self.strip_anchors(soup, False) - - def postprocess_html(self, soup, first_fetch): - if not first_fetch: # remove Related links - for aside in soup.findAll('div', 'aside'): - aside.extract() - soup = self.strip_anchors(soup, True) - - for t in soup.findAll('time', attrs={'class':'dateline'}): - t.name = 'div' - - if soup.find('div', attrs={'id': 'blogcontent'}) is None: - if first_fetch: - aside = soup.find('div', 'aside') - if aside is not None: # move the related list to the end of the article - art = soup.find('div', attrs={'id': 'article'}) - if art is None: - art = soup.find('div', attrs={'class': 'article'}) - if art is not None: - art.append(aside) - try: - if self.one_picture_per_article: - # Remove all images after first - largeImg = soup.find(True, {'class': 'articleSpanImage'}) - inlineImgs = soup.findAll( - True, {'class': 'inlineImage module'}) - if largeImg: - for inlineImg in inlineImgs: - inlineImg.extract() - else: - if inlineImgs: - firstImg = inlineImgs[0] - for inlineImg in inlineImgs[1:]: - inlineImg.extract() - # Move firstImg before article body - cgFirst = soup.find( - True, {'class': re.compile('columnGroup *first')}) - if cgFirst: - # Strip all sibling NavigableStrings: noise - navstrings = cgFirst.findAll( - text=True, recursive=False) - [ns.extract() for ns in navstrings] - headline_found = False - tag = cgFirst.find(True) - insertLoc = 0 - while True: - insertLoc += 1 - if hasattr(tag, 'class') and tag['class'] == 'articleHeadline': - headline_found = True - break - tag = tag.nextSibling - if not tag: - headline_found = False - break - if headline_found: - cgFirst.insert(insertLoc, firstImg) - else: - self.log( - ">>> No class:'columnGroup first' found <<<") - except: - self.log("ERROR: One picture per article in postprocess_html") - - try: - # Change captions to italic - for caption in soup.findAll(True, {'class': 'caption'}): - if caption and len(caption) > 0: - cTag = Tag(soup, "p", [("class", "caption")]) 
- c = self.fixChars(self.tag_to_string( - caption, use_alt=False)).strip() - mp_off = c.find("More Photos") - if mp_off >= 0: - c = c[:mp_off] - cTag.insert(0, c) - caption.replaceWith(cTag) - except: - self.log("ERROR: Problem in change captions to italic") - - try: - # Change <nyt_headline> to <h2> - h1 = soup.find('h1') - blogheadline = str(h1) # added for dealbook - if h1: - headline = h1.find("nyt_headline") - if headline: - tag = Tag(soup, "h2") - tag['class'] = "headline" - tag.insert(0, self.fixChars(headline.contents[0])) - h1.replaceWith(tag) - elif blogheadline.find('entry-title'): # added for dealbook - tag = Tag(soup, "h2") # added for dealbook - tag['class'] = "headline" # added for dealbook - # added for dealbook - tag.insert(0, self.fixChars(h1.contents[0])) - h1.replaceWith(tag) # added for dealbook - - else: - # Blog entry - replace headline, remove <hr> tags - BCC I - think this is no longer functional 1-18-2011 - headline = soup.find('title') - if headline: - tag = Tag(soup, "h2") - tag['class'] = "headline" - tag.insert(0, self.fixChars( - self.tag_to_string(headline, False))) - soup.insert(0, tag) - hrs = soup.findAll('hr') - for hr in hrs: - hr.extract() - except: - self.log("ERROR: Problem in Change <nyt_headline> to <h2>") - - try: - # if this is from a blog (dealbook, fix the byline format - bylineauthor = soup.find( - 'address', attrs={'class': 'byline author vcard'}) - if bylineauthor: - tag = Tag(soup, "h6") - tag['class'] = "byline" - tag.insert(0, self.fixChars( - self.tag_to_string(bylineauthor, False))) - bylineauthor.replaceWith(tag) - except: - self.log("ERROR: fixing byline author format") - - try: - # if this is a blog (dealbook) fix the credit style for the - # pictures - blogcredit = soup.find('div', attrs={'class': 'credit'}) - if blogcredit: - tag = Tag(soup, "h6") - tag['class'] = "credit" - tag.insert(0, self.fixChars( - self.tag_to_string(blogcredit, False))) - blogcredit.replaceWith(tag) - except: - self.log("ERROR: fixing credit format") - - try: - # Change <h1> to <h3> - used in editorial blogs - masthead = soup.find("h1") - if masthead: - # Nuke the href - if masthead.a: - del(masthead.a['href']) - tag = Tag(soup, "h3") - tag.insert(0, self.fixChars(masthead.contents[0])) - masthead.replaceWith(tag) - except: - self.log( - "ERROR: Problem in Change <h1> to <h3> - used in editorial blogs") - - try: - # Change <span class="bold"> to <b> - for subhead in soup.findAll(True, {'class': 'bold'}): - if subhead.contents: - bTag = Tag(soup, "b") - bTag.insert(0, subhead.contents[0]) - subhead.replaceWith(bTag) - except: - self.log( - "ERROR: Problem in Change <h1> to <h3> - used in editorial blogs") - try: - # remove the update tag - blogupdated = soup.find('span', {'class': 'update'}) - if blogupdated: - blogupdated.replaceWith("") - except: - self.log("ERROR: Removing strong tag") - - try: - divTag = soup.find('div', attrs={'id': 'articleBody'}) - if divTag: - divTag['class'] = divTag['id'] - except: - self.log( - "ERROR: Problem in soup.find(div,attrs={id:articleBody})") - - try: - # Add class="authorId" to <div>
so we can format with CSS - divTag = soup.find('div', attrs={'id': 'authorId'}) - if divTag and divTag.contents[0]: - tag = Tag(soup, "p") - tag['class'] = "authorId" - tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0], - use_alt=False))) - divTag.replaceWith(tag) - except: - self.log( - "ERROR: Problem in Add class=authorId to <div>
so we can format with CSS") - return soup - - def populate_article_metadata(self, article, soup, first): - if not first: - return - idxdiv = soup.find('div', attrs={'class': 'articleSpanImage'}) - if idxdiv is not None: - if idxdiv.img: - self.add_toc_thumbnail(article, re.sub( - r'links\\link\d+\\', '', idxdiv.img['src'])) - else: - img = soup.find('body').find('img') - if img is not None: - self.add_toc_thumbnail(article, re.sub( - r'links\\link\d+\\', '', img['src'])) - shortparagraph = "" - try: - if len(article.text_summary.strip()) == 0: - articlebodies = soup.findAll( - 'div', attrs={'class': 'articleBody'}) - if articlebodies: - for articlebody in articlebodies: - if articlebody: - paras = articlebody.findAll('p') - for p in paras: - refparagraph = self.massageNCXText( - self.tag_to_string(p, use_alt=False)).strip() - # account for blank paragraphs and short - # paragraphs by appending them to longer ones - if len(refparagraph) > 0: - if len(refparagraph) > 70: # approximately one line of text - newpara = shortparagraph + refparagraph - newparaDateline, newparaEm, newparaDesc = newpara.partition( - '—') - if newparaEm == '': - newparaDateline, newparaEm, newparaDesc = newpara.partition( - '—') - if newparaEm == '': - newparaDesc = newparaDateline - article.summary = article.text_summary = newparaDesc.strip() - return - else: - shortparagraph = refparagraph + " " - if shortparagraph.strip().find(" ") == -1 and not shortparagraph.strip().endswith(":"): - shortparagraph = shortparagraph + "- " - else: - article.summary = article.text_summary = self.massageNCXText( - article.text_summary) - except: - self.log("Error creating article descriptions") - return + if is_web_edition: + return self.parse_web_sections() + return self.parse_todays_page() diff --git a/recipes/nytimes_sub.recipe b/recipes/nytimes_sub.recipe index 925c0c1494..cae243517a 100644 --- a/recipes/nytimes_sub.recipe +++ b/recipes/nytimes_sub.recipe @@ -1,1314 +1,170 @@ -#!/usr/bin/env python2 -# -*- coding: utf-8 -*- -__license__ = 'GPL v3' -__copyright__ = '2008, Kovid Goyal ' -''' -nytimes.com -''' -import re -import string -import time -from calibre import strftime -from datetime import timedelta, date -from time import sleep -from calibre.web.feeds.recipes import BasicNewsRecipe -from calibre.ebooks.BeautifulSoup import BeautifulSoup, Tag, BeautifulStoneSoup +#!/usr/bin/env python2 +# vim:fileencoding=utf-8 +# License: GPLv3 Copyright: 2018, Kovid Goyal + +from __future__ import absolute_import, division, print_function, unicode_literals + +from calibre.web.feeds.news import BasicNewsRecipe +from calibre.utils.date import strptime + +is_web_edition = False +# The sections to download when downloading the web edition, comment out +# the section you are not interested in +web_sections = [ + ('World', 'world'), + ('U.S.', 'us'), + ('Politics', 'politics'), + ('New York', 'nyregion'), + ('Business', 'business'), + ('Technology', 'technology'), + ('Sports', 'sports'), + ('Science', 'science'), + ('Health', 'health'), + ('Opinion', 'opinion'), + ('Arts', 'arts'), + ('Books', 'books'), + ('Movies', 'movies'), + ('Music', 'arts/music'), + ('Television', 'arts/television'), + ('Style', 'style'), + ('Dining & Wine', 'dining'), + ('Fashion & Style', 'fashion'), + ('Home & Garden', 'garden'), + ('Travel', 'travel'), + ('Education', 'education'), + ('Multimedia', 'multimedia'), + ('Obituaries', 'obituaries'), + ('Sunday Magazine', 'magazine') +] -class NYTimes(BasicNewsRecipe): +def classes(classes): + q = 
frozenset(classes.split(' ')) + return dict(attrs={ + 'class': lambda x: x and frozenset(x.split()).intersection(q)}) - recursions = 1 # set this to zero to omit Related articles lists - # speeds up processing by preventing index page links from being followed - match_regexps = [r'/[12][0-9][0-9][0-9]/[0-9]+/'] - # set getTechBlogs to True to include the technology blogs - # set tech_oldest_article to control article age - # set tech_max_articles_per_feed to control article count - getTechBlogs = True - remove_empty_feeds = True - tech_oldest_article = 14 - tech_max_articles_per_feed = 25 +class NewYorkTimes(BasicNewsRecipe): - # set getPopularArticles to False if you don't want the Most E-mailed and Most Viewed articles - # otherwise you will get up to 20 of the most popular e-mailed and viewed articles (in each category) - # This is currently disabled because the NYT is changing this functionality - # on their website to a new "Trending" page - getPopularArticles = False - popularPeriod = '1' # set this to the number of days to include in the measurement - # e.g. 7 will get the most popular measured over the last 7 days - # and 30 will get the most popular measured over 30 days. - # you still only get up to 20 articles in each category - - # set headlinesOnly to True for the headlines-only version. If True, - # webEdition is ignored. - headlinesOnly = False - - # set webEdition to True for the Web edition of the newspaper. Set oldest_web_article to the - # number of days old an article can be for inclusion. If oldest_web_article = None all articles - # will be included. Note: oldest_web_article is ignored if webEdition = - # False - webEdition = False - oldest_web_article = None - - # download higher resolution images than the small thumbnails typically included in the article - # the down side of having large beautiful images is the file size is much - # larger, on the order of 7MB per paper - useHighResImages = True + title = 'The New York Times' + if is_web_edition: + description = 'New York Times (Web). You can edit the recipe to remove sections you are not interested in.' + else: + description = 'Today\'s New York Times' + encoding = 'utf-8' + __author__ = 'Kovid Goyal' + language = 'en' + ignore_duplicate_articles = {'title', 'url'} + no_stylesheets = True compress_news_images = True compress_news_images_auto_size = 5 - # replace paid Kindle Version: the name will be changed to "The New York Times" to cause - # previous paid versions of the new york times to best sent to the back - # issues folder on the kindle - replaceKindleVersion = False - - # includeSections: List of sections to include. If empty, all sections found will be included. - # Otherwise, only the sections named will be included. For example, - # - # includeSections = ['Politics','Sports'] - # - # would cause only the Politics and Sports sections to be included. - - includeSections = [] # by default, all sections included - - # excludeSections: List of sections to exclude. If empty, all sections found will be included. - # Otherwise, the sections named will be excluded. For example, - # - # excludeSections = ['Politics','Sports'] - # - # would cause the Politics and Sports sections to be excluded. This parameter can be used - # in conjuction with includeSections although in most cases using one or the other, but - # not both, is sufficient. - - excludeSections = [] - - # one_picture_per_article specifies that calibre should only use the first image - # from an article (if one exists). 
If one_picture_per_article = True, the image - # will be moved to a location between the headline and the byline. - # If one_picture_per_article = False, all images from the article will be included - # and shown in their original location. - one_picture_per_article = False - - # The maximum number of articles that will be downloaded - max_articles_per_feed = 100 - use_embedded_content = False - - # Whether to omit duplicates of articles (typically arsing when articles are indexed in - # more than one section). If True, only the first occurance will be - # downloaded. - filterDuplicates = True - - # Sections to collect for the Web edition. - # Delete any you don't want, or use includeSections or excludeSections - web_sections = [(u'World', u'world'), - (u'U.S.', u'national'), - (u'Politics', u'politics'), - (u'New York', u'nyregion'), - (u'Business', 'business'), - (u'Technology', u'technology'), - (u'Sports', u'sports'), - (u'Science', u'science'), - (u'Health', u'health'), - (u'Opinion', u'opinion'), - (u'Arts', u'arts'), - (u'Books', u'books'), - (u'Movies', u'movies'), - (u'Music', u'arts/music'), - (u'Television', u'arts/television'), - (u'Style', u'style'), - (u'Dining & Wine', u'dining'), - (u'Fashion & Style', u'fashion'), - (u'Home & Garden', u'garden'), - (u'Travel', u'travel'), - ('Education', u'education'), - ('Multimedia', u'multimedia'), - (u'Obituaries', u'obituaries'), - (u'Sunday Magazine', u'magazine') - ] - - tech_feeds = [ - (u'Tech - News', u'http://pogue.blogs.nytimes.com/feed/'), - (u'Tech - Gadgetwise', u'http://gadgetwise.blogs.nytimes.com/feed/'), - (u'Tech - Open', u'http://open.blogs.nytimes.com/feed/') + keep_only_tags = [ + dict(id='story-header'), + classes('story-body-supplemental story-interrupter'), + ] + remove_tags = [ + dict(attrs={'aria-label':'tools'.split()}), + dict(attrs={'data-videoid':True}), + dict(name='button'), + dict(id=lambda x: x and x.startswith('story-ad-')), + dict(name='a', href=lambda x: x and '#story-continues-' in x), + dict(name='a', href=lambda x: x and '#whats-next' in x), + dict(id=lambda x: x and 'sharetools-' in x), + dict(id='newsletter-promo'.split()), ] - if headlinesOnly: - title = 'New York Times Headlines' - description = 'Headlines from the New York Times' - needs_subscription = 'optional' - elif webEdition: - title = 'New York Times (Web)' - description = 'New York Times on the Web' - needs_subscription = 'optional' - elif replaceKindleVersion: - title = 'The New York Times' - description = 'Today\'s New York Times' - needs_subscription = 'optional' - else: - title = 'New York Times' - description = 'Today\'s New York Times' - needs_subscription = 'optional' + def read_nyt_metadata(self): + INDEX = 'https://www.nytimes.com/section/todayspaper' + # INDEX = 'file:///t/raw.html' + soup = self.index_to_soup(INDEX) + pdate = soup.find('meta', attrs={'name':'pdate', 'content': True})['content'] + date = strptime(pdate, '%Y%m%d', assume_utc=False, as_utc=False) + self.cover_url = 'https://static01.nyt.com/images/{}/nytfrontpage/scan.jpg'.format(date.strftime('%Y/%m/%d')) + self.timefmt = date.strftime(' [%d %b, %Y]') + return soup - def decode_url_date(self, url): - urlitems = url.split('/') - try: - d = date(int(urlitems[3]), int(urlitems[4]), int(urlitems[5])) - except: - try: - d = date(int(urlitems[4]), int(urlitems[5]), int(urlitems[6])) - except: - return None - return d + def parse_todays_sections(self, container): + for h2 in container.findAll('h2', **classes('headline')): + title = self.tag_to_string(h2) + a = 
h2.find('a', href=True) + url = a['href'] + if '?' in url: + url = url.split('?')[0] + p = h2.findParent(**classes('story-body')) + desc = '' + if p is not None: + s = p.find(**classes('summary')) + if s is not None: + desc = self.tag_to_string(s) + self.log('\t', title, ': ', url) + self.log('\t\t', desc) + yield {'title': title, 'url': url, 'description': desc} - if oldest_web_article is None: - earliest_date = date.today() - else: - earliest_date = date.today() - timedelta(days=oldest_web_article) - oldest_article = 365 # by default, a long time ago - - __author__ = 'GRiker/Kovid Goyal/Nick Redding' - language = 'en' - requires_version = (0, 7, 5) - encoding = 'utf-8' - - timefmt = '' - - # simultaneous_downloads = 1 # no longer required to deal with ads - - cover_margins = (18, 18, 'grey99') - - keep_only_tags = dict(id=['article', 'story', 'content']) - remove_tags = [ - dict(attrs={'class': [ - 'articleFooter', - 'articleTools', - 'rfd', 'story-footer-links', 'page-footer', - 'columnGroup singleRule', - 'columnGroup last', - 'columnGroup last', - 'doubleRule', - 'dottedLine', - 'entry-meta', - 'entry-response module', - 'leftNavTabs', - 'metaFootnote', - 'inside-story', - 'module box nav', - 'nextArticleLink', - 'nextArticleLink clearfix', - 'post-tools', - 'relatedSearchesModule', - 'side_tool', - 'singleAd', - 'postCategory column', - 'refer tagRefer', # added for bits blog post - 'entry entry-utility', # added for DealBook - 'entry-tags', # added for DealBook - 'footer promos clearfix', # added for DealBook - 'footer links clearfix', # added for DealBook - 'tabsContainer', # added for other blog downloads - 'column lastColumn', # added for other blog downloads - 'pageHeaderWithLabel', # added for other gadgetwise downloads - 'column two', # added for other blog downloads - 'column two last', # added for other blog downloads - 'column three', # added for other blog downloads - 'column three last', # added for other blog downloads - 'column four', # added for other blog downloads - 'column four last', # added for other blog downloads - 'column last', # added for other blog downloads - 'entry entry-related', - 'subNavigation tabContent active', # caucus blog navigation - 'mediaOverlay slideshow', - 'wideThumb', - 'video', # added 02-11-2011 - 'videoHeader', # added 02-11-2011 - 'articleInlineVideoHolder', # added 02-11-2011 - 'assetCompanionAd', - 'nytint-sectionHeader', - re.compile('^subNavigation'), - re.compile('^leaderboard'), - re.compile('^module'), - re.compile('commentCount'), - 'lede-container', - 'credit', - 'caption-video', - 'upshot-social' - ]}), - dict( - attrs={'class': lambda x: x and 'related-coverage-marginalia' in x.split()}), - dict(attrs={'class': lambda x: x and 'hidden' in x.split()}), - dict(attrs={'class': lambda x: x and 'interactive' in x.split()}), - dict(attrs={'class': lambda x: x and 'SectionBarShare' in x.split('-')}), - dict(attrs={'class': lambda x: x and 'ResponsiveAd' in x.split('-')}), - dict(attrs={'class': lambda x: x and 'skip-to-text-link' in x.split()}), - dict(attrs={'class': lambda x: x and 'sharetools' in x.split()}), - dict(attrs={'class': lambda x: x and 'ad' in x.split()}), - dict(attrs={'class': lambda x: x and 'video' in x.split()}), - dict(attrs={'class': lambda x: x and 'visually-hidden' in x.split()}), - dict(name='div', attrs={'class': re.compile('toolsList')}), # bits - dict(name='div', attrs={ - 'class': re.compile('postNavigation')}), # bits - dict(name='div', attrs={'class': 'tweet'}), - dict(name='span', attrs={'class': 
'commentCount meta'}), - dict(name='div', attrs={'id': 'header'}), - # bits, pogue, gadgetwise, open - dict(name='div', attrs={'id': re.compile('commentsContainer')}), - # pogue, gadgetwise - dict(name='ul', attrs={'class': re.compile('entry-tools')}), - # pogue, gadgetwise - dict(name='div', attrs={'class': re.compile('nocontent')}), - dict(name='div', attrs={'id': re.compile('respond')}), # open - dict(name='div', attrs={'class': re.compile('entry-tags')}), # pogue - dict(name='h4', attrs={'class': 'headline'}), - dict(id=[ - 'adxLeaderboard', - 'pagelinks', - 'adxSponLink', - 'anchoredAd_module', - 'anchoredAd_spot', - 'archive', - 'articleExtras', - 'articleInline', - 'blog_sidebar', - 'businessSearchBar', - 'cCol', - 'entertainmentSearchBar', - 'footer', - 'header', - 'header_search', - 'inlineBox', - 'login', - 'masthead', - 'masthead-nav', - 'masthead-social', - 'memberTools', - 'navigation', 'navigation-ghost', 'navigation-modal', 'navigation-edge', - 'page-footer', - 'portfolioInline', - 'readerReviews', - 'readerReviewsCount', - 'relatedArticles', - 'relatedTopics', - 'respond', - 'ribbon', - 'side_search', - 'side_index', - 'side_tool', - 'toolsRight', - 'skybox', # added for DealBook - 'TopAd', # added for DealBook - 'related-content', # added for DealBook - 'whats-next', - 'newsletter-promo', - ]), - dict(name=['script', 'noscript', 'style', 'form', 'hr', 'button', 'meta', 'footer'])] - no_stylesheets = True - extra_css = ''' - .articleHeadline { text-align: left; margin-top:0.5em; margin-bottom:0.25em; } - .credit { font-weight: normal; text-align: right; font-size: - 50%; line-height:1em; margin-top:5px; margin-left:0; - margin-right:0; margin-bottom: 0; } - .byline { text-align: left; font-size: 50%; line-height:1em; margin-top:10px; margin-left:0; margin-right:0; margin-bottom: 0; } - .dateline { text-align: left; font-size: 50%; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; } - .kicker { font-size: 50%; line-height:1em;margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; } - .timestamp { font-weight: normal; text-align: left; font-size: 50%; } - .caption { font-size: 50%; font-style:italic; line-height:1em; margin-top:5px; margin-left:0; margin-right:0; margin-bottom: 0; } - a:link {text-decoration: none; } - .date{font-size: 50%; } - .update{font-size: 50%; } - .articleBody { } - .authorId {text-align: left; font-size: 50%; } - .image {text-align: center;} - .aside {color:blue;margin:0px 0px 0px 0px; padding: 0px 0px 0px 0px; font-size:100%;} - .asidenote {color:blue;margin:0px 0px 0px 0px; padding: 0px 0px 0px 0px; font-size:100%;font-weight:bold;} - .source {text-align: left; font-size: x-small; }''' - - articles = {} - key = None - ans = [] - url_list = [] - - def filter_ans(self, ans): - total_article_count = 0 - idx = 0 - idx_max = len(ans) - 1 - while idx <= idx_max: - if self.includeSections != []: - if ans[idx][0] not in self.includeSections: - print "SECTION NOT INCLUDED: ", ans[idx][0] - del ans[idx] - idx_max = idx_max - 1 - continue - if ans[idx][0] in self.excludeSections: - print "SECTION EXCLUDED: ", ans[idx][0] - del ans[idx] - idx_max = idx_max - 1 - continue - if True: # self.verbose - self.log("Section %s: %d articles" % - (ans[idx][0], len(ans[idx][1]))) - for article in ans[idx][1]: - total_article_count += 1 - if True: # self.verbose - self.log("\t%-40.40s... \t%-60.60s..." 
% (article['title'].encode('cp1252', 'replace'), - article['url'].encode('cp1252', 'replace'))) - idx = idx + 1 - - self.log("Queued %d articles" % total_article_count) - return ans - - def exclude_url(self, url): - if not url.startswith("http"): - return True - # added for DealBook - if not url.endswith(".html") and 'dealbook.nytimes.com' not in url: - return True - if 'nytimes.com' not in url: - return True - if 'cn.nytimes.com' in url: - return True - if '/es/' in url: - return True - if 'podcast' in url: - return True - if '/video/' in url: - return True - if '/multimedia/' in url: - return True - if '/slideshow/' in url: - return True - if '/magazine/index' in url: - return True - if '/interactive/' in url: - return True - if '/reference/' in url: - return True - if '/premium/' in url: - return True - if '#comment' in url: - return True - if '#postComment' in url: - return True - if '#postcomment' in url: - return True - if re.search('/\d\d\d\d/\d\d/\d\d/', url) is None: - print("NO DATE IN " + url) - return True - return False - - def fixChars(self, string): - # Replace lsquo (\x91) - fixed = re.sub("\x91", "‘", string) - - # Replace rsquo (\x92) - fixed = re.sub("\x92", "’", fixed) - - # Replace ldquo (\x93) - fixed = re.sub("\x93", "“", fixed) - - # Replace rdquo (\x94) - fixed = re.sub("\x94", "”", fixed) - - # Replace ndash (\x96) - fixed = re.sub("\x96", "–", fixed) - - # Replace mdash (\x97) - fixed = re.sub("\x97", "—", fixed) - - return fixed - - def get_browser(self): - br = BasicNewsRecipe.get_browser(self) - if False and self.username is not None and self.password is not None: - # disabled because the idiotic nyt has moved to using an all JS - # login process with the further involvement of recaptcha, they - # apparently dont want their subscribing users to access their - # content conveniently - # Sample POST request - # URL: https://myaccount.nytimes.com/svc/account/auth/v1/login - # Origin: https://myaccount.nytimes.com - # Referer: https://myaccount.nytimes.com/auth/login - # Payload: - # auth_token: "H4sIAAAAAAAAAw3LQQqDMBBG4btk7cLJxKbjZST5ZwQpGEkUWmzv3my/x7tdtXaUvdlyfg5zs0Mpr83c0INu1XAuV926d7H30aW5mYI82XvhaXB7uXb0UZjD76uTInFSRCKf05gMPvIqijHALECyRE2RRs9A9jIRkyE+xEx1xR+mstBMkAAAAA==" # noqa - # captcha_result: "03AEHxwuxyNRK8s99A7rMvZOvWGfXUuy9HqbRfC0uSREKhf6lpR5eL0I6hp-PHBN8wshkxSU4piWvyD7n7xESxnwidq19esqPvWiTV7oTUGDkqFltTbq9Dk4xjAaZ7nczevNJR_xeG8tZhF72_EmwL_AGCR83a8aylDXzX7e_PDzh9JC9wA03rEVn66Q7uY29NGUOQd0Ux8frNelwCYKfUJaIdYCjzBnaCCQwp6mufm7rWVdQxX2Togq2-g6MKdpjpyXqjCQxFzYaX1Jcm5XS4bcVlB7F_tt5W-6FqfxsEqGUPhH1QTc8LEb8aRJ39QCMUNWGQ3Uz96ZK1LYOODwM9hv0wNecjUkl7fzdzikHC-o5m1Dg3pzaNlb1vQeejkNzt03QLk7CxhgisnbR-LmClpl-6BULP4un0HVvt0YkgY08osSYDEBVfT2I" # noqa - # password : "asdasd" - # remember_me : "Y" - # username : "sdasd" - # where auth_token is taken from data-auth-options attribute of - # and the captcha result comes from recaptcha - #
- br.open('https://www.nytimes.com/auth/login') - br.form = list(br.forms())[0] - br['userid'] = self.username - br['password'] = self.password - br.submit().read() - # if 'Please try again' in raw: - # raise Exception('Your username and password are incorrect') - return br - - cover_tag = 'NY_NYT' - - def get_cover_url(self): - from datetime import date - today = date.today() - cover = 'https://static01.nyt.com/images/' \ - + today.strftime('%Y') + '/' + today.strftime('%m') + '/' \ - + today.strftime('%d') + '/nytfrontpage/scan.jpg' - self.log(cover) - br = BasicNewsRecipe.get_browser(self) - try: - br.open(cover) - except: - self.log("\nCover unavailable") - cover = None - return cover - - masthead_url = 'http://graphics8.nytimes.com/images/misc/nytlogo379x64.gif' - - def short_title(self): - return self.title - - def article_to_soup(self, url_or_raw, raw=False): - from contextlib import closing - import copy - from calibre.ebooks.chardet import xml_to_unicode - print("ARTICLE_TO_SOUP " + url_or_raw) - if re.match(r'\w+://', url_or_raw): - br = self.clone_browser(self.browser) - open_func = getattr(br, 'open_novisit', br.open) - with closing(open_func(url_or_raw)) as f: - _raw = f.read() - if not _raw: - raise RuntimeError( - 'Could not fetch index from %s' % url_or_raw) - else: - _raw = url_or_raw - if raw: - return _raw - if not isinstance(_raw, unicode) and self.encoding: - if callable(self.encoding): - _raw = self.encoding(_raw) - else: - _raw = _raw.decode(self.encoding, 'replace') - - nmassage = copy.copy(BeautifulSoup.MARKUP_MASSAGE) - nmassage.extend(self.preprocess_regexps) - nmassage += [(re.compile(r'', re.DOTALL), lambda m: '')] - # Some websites have buggy doctype declarations that mess up beautifulsoup - # Remove comments as they can leave detritus when extracting tags leaves - # multiple nested comments - nmassage.append((re.compile(r'', re.DOTALL), lambda m: '')) - usrc = xml_to_unicode(_raw, self.verbose, strip_encoding_pats=True)[0] - usrc = self.preprocess_raw_html(usrc, url_or_raw) - return BeautifulSoup(usrc, markupMassage=nmassage) - - def massageNCXText(self, description): - # Kindle TOC descriptions won't render certain characters - if description: - massaged = unicode(BeautifulStoneSoup( - description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)) - # Replace '&' with '&' - massaged = re.sub("&", "&", massaged) - massaged = re.sub("&", "&", massaged) - return self.fixChars(massaged) - else: - return description - - def feed_title(self, div): - return ''.join(div.findAll(text=True, recursive=True)).strip() - - def handle_article(self, div): - thumbnail = div.find('div', 'thumbnail') - if thumbnail: - thumbnail.extract() - return self.handle_base_article(div) - - # Handle '
<article>' in world, u.s., etc - def handle_article_tag(self, div): - thumbnail = div.find('figure', 'media photo') - if not thumbnail: - thumbnail = div.find('div', 'thumb') - if thumbnail: - thumbnail.extract() - div = div.find('div', 'story-body') - if not div: - return - return self.handle_base_article(div) - - def handle_base_article(self, div): - a = div.find('a', href=True) - if not a: - return - url = re.sub(r'\?.*', '', a['href']) - if self.exclude_url(url): - return - url += '?pagewanted=all' - if self.filterDuplicates: - if url in self.url_list: - return - if self.webEdition: - date_tag = self.decode_url_date(url) - if date_tag is not None: - if self.oldest_web_article is not None: - if date_tag < self.earliest_date: - self.log("Skipping article %s" % url) - return - else: - self.log("Skipping article %s" % url) - return - self.url_list.append(url) - title = self.tag_to_string(a, use_alt=True).strip() - description = '' - pubdate = strftime('%a, %d %b') - summary = div.find(True, attrs={'class': 'summary'}) - if summary: - description = self.tag_to_string(summary, use_alt=False) - author = '' - authorAttribution = div.find(True, attrs={'class': 'byline'}) - if authorAttribution: - author = self.tag_to_string(authorAttribution, use_alt=False) - else: - authorAttribution = div.find(True, attrs={'class': 'byline'}) - if authorAttribution: - author = self.tag_to_string(authorAttribution, use_alt=False) - feed = self.key if self.key is not None else 'Uncategorized' - if feed not in self.articles: - self.ans.append(feed) - self.articles[feed] = [] - self.articles[feed].append( - dict(title=title, url=url, date=pubdate, - description=description, author=author, - content='')) - - def get_popular_articles(self, ans): - if self.getPopularArticles: - popular_articles = {} - key_list = [] - - def handleh3(h3tag): - try: - url = h3tag.a['href'] - except: - return ('', '', '', '') - url = re.sub(r'\?.*', '', url) - if self.exclude_url(url): - return ('', '', '', '') - url += '?pagewanted=all' - title = self.tag_to_string(h3tag.a, False) - h6tag = h3tag.findNextSibling('h6') - if h6tag is not None: - author = self.tag_to_string(h6tag, False) - else: - author = '' - ptag = h3tag.findNextSibling('p') - if ptag is not None: - desc = self.tag_to_string(ptag, False) - else: - desc = '' - return(title, url, author, desc) - - have_emailed = False - emailed_soup = self.index_to_soup( - 'http://www.nytimes.com/most-popular-emailed?period=' + self.popularPeriod) - for h3tag in emailed_soup.findAll('h3'): - (title, url, author, desc) = handleh3(h3tag) - if url == '': - continue - if not have_emailed: - key_list.append('Most E-Mailed') - popular_articles['Most E-Mailed'] = [] - have_emailed = True - popular_articles['Most E-Mailed'].append( - dict(title=title, url=url, date=strftime('%a, %d %b'), - description=desc, author=author, - content='')) - have_viewed = False - viewed_soup = self.index_to_soup( - 'http://www.nytimes.com/most-popular-viewed?period=' + self.popularPeriod) - for h3tag in viewed_soup.findAll('h3'): - (title, url, author, desc) = handleh3(h3tag) - if url == '': - continue - if not have_viewed: - key_list.append('Most Viewed') - popular_articles['Most Viewed'] = [] - have_viewed = True - popular_articles['Most Viewed'].append( - dict(title=title, url=url, date=strftime('%a, %d %b'), - description=desc, author=author, - content='')) - viewed_ans = [(k, popular_articles[k]) - for k in key_list if k in popular_articles] - for x in viewed_ans: - ans.append(x) - return ans - - def 
get_tech_feeds(self, ans): - if self.getTechBlogs: - tech_articles = {} - key_list = [] - save_oldest_article = self.oldest_article - save_max_articles_per_feed = self.max_articles_per_feed - self.oldest_article = self.tech_oldest_article - self.max_articles_per_feed = self.tech_max_articles_per_feed - self.feeds = self.tech_feeds - tech = self.parse_feeds() - self.oldest_article = save_oldest_article - self.max_articles_per_feed = save_max_articles_per_feed - self.feeds = None - for f in tech: - key_list.append(f.title) - tech_articles[f.title] = [] - for a in f.articles: - tech_articles[f.title].append( - dict(title=a.title, url=a.url.partition('?')[0], date=a.date, - description=a.summary, author=a.author, - content=a.content)) - tech_ans = [(k, tech_articles[k]) - for k in key_list if k in tech_articles] - for x in tech_ans: - ans.append(x) - return ans - - def parse_web_edition(self): - - for (sec_title, index_url) in self.web_sections: - if self.includeSections != []: - if sec_title not in self.includeSections: - print "SECTION NOT INCLUDED: ", sec_title - continue - if sec_title in self.excludeSections: - print "SECTION EXCLUDED: ", sec_title - continue - try: - soup = self.index_to_soup( - 'https://www.nytimes.com/pages/' + index_url + '/index.html') - except: - continue - print 'Index URL: ' + 'https://www.nytimes.com/pages/' + index_url + '/index.html' - - self.key = sec_title - # Find each article - for div in soup.findAll('article'): - self.handle_article_tag(div) - for div in soup.findAll(True, attrs={ - 'class': ['section-headline', 'ledeStory', 'story', 'story headline', 'sectionHeader', 'headlinesOnly multiline flush']}): - if div['class'] in ['story', 'story headline', 'storyHeader']: - self.handle_article(div) - elif div['class'] == 'ledeStory': - divsub = div.find('div', 'storyHeader') - if divsub is not None: - self.handle_article(divsub) - ulrefer = div.find('ul', 'refer') - if ulrefer is not None: - for lidiv in ulrefer.findAll('li'): - self.handle_article(lidiv) - elif div['class'] == 'headlinesOnly multiline flush': - for lidiv in div.findAll('li'): - self.handle_article(lidiv) - - self.ans = [(k, self.articles[k]) - for k in self.ans if k in self.articles] - return self.filter_ans(self.get_tech_feeds(self.get_popular_articles(self.ans))) - - def parse_todays_index(self): - - soup = self.index_to_soup( - 'https://www.nytimes.com/pages/todayspaper/index.html') - skipping = False - # Find each article - for div in soup.findAll(True, - attrs={'class': ['section-headline', 'story', 'story headline', 'sectionHeader', 'headlinesOnly multiline flush']}): - if div['class'] in ['section-headline', 'sectionHeader']: - self.key = string.capwords(self.feed_title(div)) - self.key = self.key.replace('Op-ed', 'Op-Ed') - self.key = self.key.replace('U.s.', 'U.S.') - self.key = self.key.replace('N.y.', 'N.Y.') - skipping = False - if self.includeSections != []: - if self.key not in self.includeSections: - print "SECTION NOT INCLUDED: ", self.key - skipping = True - if self.key in self.excludeSections: - print "SECTION EXCLUDED: ", self.key - skipping = True - - elif div['class'] in ['story', 'story headline']: - if not skipping: - self.handle_article(div) - elif div['class'] == 'headlinesOnly multiline flush': - for lidiv in div.findAll('li'): - if not skipping: - self.handle_article(lidiv) - - self.ans = [(k, self.articles[k]) - for k in self.ans if k in self.articles] - return self.filter_ans(self.get_tech_feeds(self.get_popular_articles(self.ans))) - - def 
parse_headline_index(self): - soup = self.index_to_soup( - 'https://www.nytimes.com/pages/todaysheadlines/') - pubdate = strftime('%a, %d %b') - section = None - articles = [] + def parse_todays_page(self): + soup = self.read_nyt_metadata() + section = soup.find(id='collection-todays-new-york-times') feeds = [] - for h6 in soup.findAll('h6'): - section = self.tag_to_string(h6).strip() - articles = [] - table = h6.parent.findNextSibling('table') - if table is None: - continue - for a in table.findAll('a', attrs={'class':'headURL'}): - title = self.tag_to_string(a) - url = a['href'].partition('?')[0] - if self.exclude_url(url) or (self.filterDuplicates and url in self.url_list): - continue - self.url_list.append(url) - desc = '' - h4 = a.findNextSibling('h4') - if h4 is not None: - desc += self.tag_to_string(h4) - p = a.findNextSibling('p') - if p is not None: - desc += ' ' + self.tag_to_string(p) - articles.append({'title':title, 'url':url + '?pagewanted=all', 'date':pubdate, 'description':desc}) + for h1 in section.findAll('h1')[1:]: + section_title = self.tag_to_string(h1) + self.log('Found section:', section_title) + articles = list(self.parse_todays_sections(h1.parent)) if articles: - feeds.append((section, articles)) - self.ans = feeds - return self.filter_ans(self.ans) + feeds.append((section_title, articles)) + return feeds + + def parse_highlights(self, container): + for article in container.findAll('article', **classes('story')): + h2 = article.find('h2') + if h2 is not None: + title = self.tag_to_string(h2) + a = h2.find('a', href=True) + if a is not None: + url = a['href'] + desc = '' + p = article.find(**classes('summary')) + if p is not None: + desc = self.tag_to_string(p) + yield {'title': title, 'url': url, 'description': desc} + + def parse_web_section(self, soup, slug): + + def log(article): + self.log('\t', article['title'], ':', article['url']) + if article.get('description'): + self.log('\t\t', article['description']) + + container = soup.find(itemtype='http://schema.org/CollectionPage') + highlights = container.find('section', **classes('highlights')) + for article in self.parse_highlights(highlights): + log(article) + yield article + extra = container.find('section', attrs={'data-collection-type': True}) + if extra is not None: + title = self.tag_to_string(extra.find('h2')) + for article in self.parse_highlights(extra): + article['title'] = '{}: {}'.format(title, article['title']) + log(article) + yield article + + def parse_web_sections(self): + feeds = [] + for section_title, slug in web_sections: + url = 'https://www.nytimes.com/section/' + slug + try: + soup = self.index_to_soup(url) + except Exception: + self.log.error('Failed to download section:', url) + continue + self.log('Found section:', section_title) + articles = list(self.parse_web_section(soup, slug)) + if articles: + feeds.append((section_title, articles)) + if self.test and len(feeds) >= self.test[0]: + break + return feeds def parse_index(self): - if self.headlinesOnly: - return self.parse_headline_index() - elif self.webEdition: - return self.parse_web_edition() - else: - return self.parse_todays_index() - - def strip_anchors(self, soup, kill_all=False): - paras = soup.findAll(True) - for para in paras: - aTags = para.findAll('a') - for a in aTags: - if a.img is None: - if kill_all or (self.recursions == 0): - a.replaceWith(self.tag_to_string(a, False)) - else: - if 'href' in a: - if a['href'].startswith('http://www.nytimes'): - if not a['href'].endswith('pagewanted=all'): - url = re.sub(r'\?.*', '', 
a['href']) - if self.exclude_url(url): - a.replaceWith( - self.tag_to_string(a, False)) - else: - a['href'] = url + '?pagewanted=all' - elif not (a['href'].startswith('http://pogue') or - a['href'].startswith('http://bits') or - a['href'].startswith('http://travel') or - a['href'].startswith('http://business') or - a['href'].startswith('http://tech') or - a['href'].startswith('http://health') or - a['href'].startswith('http://dealbook') or - a['href'].startswith('http://open')): - a.replaceWith(self.tag_to_string(a, False)) - return soup - - def handle_tags(self, soup): - try: - print("HANDLE TAGS: TITLE = " + self.tag_to_string(soup.title)) - except: - print("HANDLE TAGS: NO TITLE") - if soup is None: - print("ERROR: handle_tags received NoneType") - return None - - if self.keep_only_tags: - body = Tag(soup, 'body') - try: - if isinstance(self.keep_only_tags, dict): - self.keep_only_tags = [self.keep_only_tags] - for spec in self.keep_only_tags: - for tag in soup.find('body').findAll(**spec): - body.insert(len(body.contents), tag) - soup.find('body').replaceWith(body) - except AttributeError: # soup has no body element - pass - - def remove_beyond(tag, next): - while tag is not None and getattr(tag, 'name', None) != 'body': - after = getattr(tag, next) - while after is not None: - ns = getattr(tag, next) - after.extract() - after = ns - tag = tag.parent - - if self.remove_tags_after is not None: - rt = [self.remove_tags_after] if isinstance( - self.remove_tags_after, dict) else self.remove_tags_after - for spec in rt: - tag = soup.find(**spec) - remove_beyond(tag, 'nextSibling') - - if self.remove_tags_before is not None: - tag = soup.find(**self.remove_tags_before) - remove_beyond(tag, 'previousSibling') - - for kwds in self.remove_tags: - for tag in soup.findAll(**kwds): - tag.extract() - - return soup - - def preprocess_html(self, soup): - skip_tag = soup.find(True, {'name': 'skip'}) - if skip_tag is not None: - url = 'http://www.nytimes.com' + skip_tag.parent['href'] - self.log.warn("Skipping ad to article at '%s'" % url) - sleep(5) - soup = self.handle_tags(self.article_to_soup(url)) - - # check if the article is from one of the tech blogs - blog = soup.find( - 'div', attrs={'id': ['pogue', 'bits', 'gadgetwise', 'open']}) - - if blog is not None: - old_body = soup.find('body') - new_body = Tag(soup, 'body') - new_body.append(soup.find('div', attrs={'id': 'content'})) - new_body.find('div', attrs={'id': 'content'})[ - 'id'] = 'blogcontent' # identify for postprocess_html - old_body.replaceWith(new_body) - for divr in soup.findAll('div', attrs={'class': re.compile('w190 right')}): - if divr.find(text=re.compile('Sign up')): - divr.extract() - divr = soup.find( - 'div', attrs={'class': re.compile('^relatedArticlesModule')}) - if divr is not None: - print("PROCESSING RELATED: " + - self.tag_to_string(soup.title, False)) - # handle related articles - rlist = [] - ul = divr.find('ul') - if ul is not None: - for li in ul.findAll('li'): - atag = li.find('a') - if atag is not None: - if atag['href'].startswith('http://pogue') or atag['href'].startswith('http://bits') or \ - atag['href'].startswith('http://open'): - atag.find(text=True).replaceWith( - self.massageNCXText(self.tag_to_string(atag, False))) - rlist.append(atag) - divr.extract() - if rlist != []: - asidediv = Tag(soup, 'div', [('class', 'aside')]) - if soup.find('hr') is None: - asidediv.append(Tag(soup, 'hr')) - h4 = Tag(soup, 'h4', [('class', 'asidenote')]) - h4.insert(0, "Related Posts") - asidediv.append(h4) - ul = Tag(soup, 
'ul') - for r in rlist: - li = Tag(soup, 'li', [('class', 'aside')]) - r['class'] = 'aside' - li.append(r) - ul.append(li) - asidediv.append(ul) - asidediv.append(Tag(soup, 'hr')) - smain = soup.find('body') - smain.append(asidediv) - else: - print("CANNOT FIND RELATED: " + - self.tag_to_string(soup.title, False)) - for atag in soup.findAll('a'): - img = atag.find('img') - if img is not None: - atag.replaceWith(img) - elif 'href' not in atag: - atag.replaceWith( - atag.renderContents().decode('cp1252', 'replace')) - elif not (atag['href'].startswith('http://www.nytimes') or atag['href'].startswith('http://pogue') or - atag['href'].startswith('http://bits') or atag['href'].startswith('http://open')): - atag.replaceWith( - atag.renderContents().decode('cp1252', 'replace')) - hdr = soup.find('address') - if hdr is not None: - hdr.name = 'span' - for span_credit in soup.findAll('span', 'credit'): - sp = Tag(soup, 'span') - span_credit.replaceWith(sp) - sp.append(Tag(soup, 'br')) - sp.append(span_credit) - sp.append(Tag(soup, 'br')) - - else: # nytimes article - - related = [] # these will be the related articles - first_outer = None # first related outer tag - first_related = None # first related tag - for outerdiv in soup.findAll(attrs={'class': re.compile('articleInline runaroundLeft')}): - for rdiv in soup.findAll('div', 'columnGroup doubleRule'): - if rdiv.find('h3') is not None: - if self.tag_to_string(rdiv.h3, False).startswith('Related'): - rdiv.h3.find(text=True).replaceWith( - "Related articles") - rdiv.h3['class'] = 'asidenote' - for litag in rdiv.findAll('li'): - if litag.find('a') is not None: - if litag.find('a')['href'].startswith('http://www.nytimes.com'): - url = re.sub( - r'\?.*', '', litag.find('a')['href']) - litag.find('a')[ - 'href'] = url + '?pagewanted=all' - litag.extract() - related.append(litag) - if first_related is None: - first_related = rdiv - first_outer = outerdiv - else: - litag.extract() - for h6tag in rdiv.findAll('h6'): - if h6tag.find('a') is not None: - if h6tag.find('a')['href'].startswith('http://www.nytimes.com'): - url = re.sub( - r'\?.*', '', h6tag.find('a')['href']) - h6tag.find('a')[ - 'href'] = url + '?pagewanted=all' - h6tag.extract() - related.append(h6tag) - if first_related is None: - first_related = rdiv - first_outer = outerdiv - else: - h6tag.extract() - if related != []: - for r in related: - if r.h6: # don't want the anchor inside a h6 tag - r.h6.replaceWith(r.h6.a) - first_related.ul.append(r) - first_related.insert(0, Tag(soup, 'hr')) - first_related.append(Tag(soup, 'hr')) - first_related['class'] = 'aside' - # replace the outer tag with the related tag - first_outer.replaceWith(first_related) - - for rdiv in soup.findAll(attrs={'class': re.compile('articleInline runaroundLeft')}): - rdiv.extract() - - kicker_tag = soup.find(attrs={'class': 'kicker'}) - if kicker_tag: # remove Op_Ed author head shots - tagline = self.tag_to_string(kicker_tag) - if tagline == 'Op-Ed Columnist': - img_div = soup.find('div', 'inlineImage module') - if img_div: - img_div.extract() - - if self.useHighResImages: - try: - # open up all the "Enlarge this Image" pop-ups and download - # the full resolution jpegs - enlargeThisList = soup.findAll( - 'div', {'class': 'icon enlargeThis'}) - if enlargeThisList: - for popupref in enlargeThisList: - popupreflink = popupref.find('a') - if popupreflink: - reflinkstring = str(popupreflink['href']) - refstart = reflinkstring.find( - "javascript:pop_me_up2('") + len("javascript:pop_me_up2('") - refend = reflinkstring.find( 
- ".html", refstart) + len(".html") - reflinkstring = reflinkstring[refstart:refend] - - popuppage = self.browser.open(reflinkstring) - popuphtml = popuppage.read() - popuppage.close() - if popuphtml: - st = time.localtime() - year = str(st.tm_year) - month = "%.2d" % st.tm_mon - day = "%.2d" % st.tm_mday - imgstartpos = popuphtml.find('http://graphics8.nytimes.com/images/' + year + '/' + month + '/' + day + '/') + \ - len('http://graphics8.nytimes.com/images/' + - year + '/' + month + '/' + day + '/') - highResImageLink = 'http://graphics8.nytimes.com/images/' + year + '/' + \ - month + '/' + day + '/' + \ - popuphtml[imgstartpos:popuphtml.find( - '.jpg', imgstartpos) + 4] - popupSoup = BeautifulSoup(popuphtml) - highResTag = popupSoup.find( - 'img', {'src': highResImageLink}) - if highResTag: - try: - newWidth = highResTag['width'] - newHeight = highResTag['height'] - imageTag = popupref.parent.find( - "img") - except: - self.log( - "Error: finding width and height of img") - popupref.extract() - if imageTag: - try: - imageTag[ - 'src'] = highResImageLink - imageTag['width'] = newWidth - imageTag['height'] = newHeight - except: - self.log( - "Error setting the src width and height parameters") - except Exception: - self.log("Error pulling high resolution images") - - try: - # in case pulling images failed, delete the enlarge this - # text - enlargeThisList = soup.findAll( - 'div', {'class': 'icon enlargeThis'}) - if enlargeThisList: - for popupref in enlargeThisList: - popupref.extract() - except: - self.log("Error removing Enlarge this text") - - return self.strip_anchors(soup, False) - - def postprocess_html(self, soup, first_fetch): - if not first_fetch: # remove Related links - for aside in soup.findAll('div', 'aside'): - aside.extract() - soup = self.strip_anchors(soup, True) - - for t in soup.findAll('time', attrs={'class':'dateline'}): - t.name = 'div' - - if soup.find('div', attrs={'id': 'blogcontent'}) is None: - if first_fetch: - aside = soup.find('div', 'aside') - if aside is not None: # move the related list to the end of the article - art = soup.find('div', attrs={'id': 'article'}) - if art is None: - art = soup.find('div', attrs={'class': 'article'}) - if art is not None: - art.append(aside) - try: - if self.one_picture_per_article: - # Remove all images after first - largeImg = soup.find(True, {'class': 'articleSpanImage'}) - inlineImgs = soup.findAll( - True, {'class': 'inlineImage module'}) - if largeImg: - for inlineImg in inlineImgs: - inlineImg.extract() - else: - if inlineImgs: - firstImg = inlineImgs[0] - for inlineImg in inlineImgs[1:]: - inlineImg.extract() - # Move firstImg before article body - cgFirst = soup.find( - True, {'class': re.compile('columnGroup *first')}) - if cgFirst: - # Strip all sibling NavigableStrings: noise - navstrings = cgFirst.findAll( - text=True, recursive=False) - [ns.extract() for ns in navstrings] - headline_found = False - tag = cgFirst.find(True) - insertLoc = 0 - while True: - insertLoc += 1 - if hasattr(tag, 'class') and tag['class'] == 'articleHeadline': - headline_found = True - break - tag = tag.nextSibling - if not tag: - headline_found = False - break - if headline_found: - cgFirst.insert(insertLoc, firstImg) - else: - self.log( - ">>> No class:'columnGroup first' found <<<") - except: - self.log("ERROR: One picture per article in postprocess_html") - - try: - # Change captions to italic - for caption in soup.findAll(True, {'class': 'caption'}): - if caption and len(caption) > 0: - cTag = Tag(soup, "p", [("class", "caption")]) 
- c = self.fixChars(self.tag_to_string( - caption, use_alt=False)).strip() - mp_off = c.find("More Photos") - if mp_off >= 0: - c = c[:mp_off] - cTag.insert(0, c) - caption.replaceWith(cTag) - except: - self.log("ERROR: Problem in change captions to italic") - - try: - # Change <nyt_headline> to <h2> - h1 = soup.find('h1') - blogheadline = str(h1) # added for dealbook - if h1: - headline = h1.find("nyt_headline") - if headline: - tag = Tag(soup, "h2") - tag['class'] = "headline" - tag.insert(0, self.fixChars(headline.contents[0])) - h1.replaceWith(tag) - elif blogheadline.find('entry-title'): # added for dealbook - tag = Tag(soup, "h2") # added for dealbook - tag['class'] = "headline" # added for dealbook - # added for dealbook - tag.insert(0, self.fixChars(h1.contents[0])) - h1.replaceWith(tag) # added for dealbook - - else: - # Blog entry - replace headline, remove <hr> tags - BCC I - # think this is no longer functional 1-18-2011 - headline = soup.find('title') - if headline: - tag = Tag(soup, "h2") - tag['class'] = "headline" - tag.insert(0, self.fixChars( - self.tag_to_string(headline, False))) - soup.insert(0, tag) - hrs = soup.findAll('hr') - for hr in hrs: - hr.extract() - except: - self.log("ERROR: Problem in Change <nyt_headline> to <h2>") - - try: - # if this is from a blog (dealbook), fix the byline format - bylineauthor = soup.find( - 'address', attrs={'class': 'byline author vcard'}) - if bylineauthor: - tag = Tag(soup, "h6") - tag['class'] = "byline" - tag.insert(0, self.fixChars( - self.tag_to_string(bylineauthor, False))) - bylineauthor.replaceWith(tag) - except: - self.log("ERROR: fixing byline author format") - - try: - # if this is a blog (dealbook) fix the credit style for the - # pictures - blogcredit = soup.find('div', attrs={'class': 'credit'}) - if blogcredit: - tag = Tag(soup, "h6") - tag['class'] = "credit" - tag.insert(0, self.fixChars( - self.tag_to_string(blogcredit, False))) - blogcredit.replaceWith(tag) - except: - self.log("ERROR: fixing credit format") - - try: - # Change <h1> to <h3> - used in editorial blogs - masthead = soup.find("h1") - if masthead: - # Nuke the href - if masthead.a: - del(masthead.a['href']) - tag = Tag(soup, "h3") - tag.insert(0, self.fixChars(masthead.contents[0])) - masthead.replaceWith(tag) - except: - self.log( - "ERROR: Problem in Change <h1> to <h3> - used in editorial blogs") - - try: - # Change <span class="bold"> to <b> - for subhead in soup.findAll(True, {'class': 'bold'}): - if subhead.contents: - bTag = Tag(soup, "b") - bTag.insert(0, subhead.contents[0]) - subhead.replaceWith(bTag) - except: - self.log( - "ERROR: Problem in Change <h1> to <h3> - used in editorial blogs") - try: - # remove the update tag - blogupdated = soup.find('span', {'class': 'update'}) - if blogupdated: - blogupdated.replaceWith("") - except: - self.log("ERROR: Removing strong tag") - - try: - divTag = soup.find('div', attrs={'id': 'articleBody'}) - if divTag: - divTag['class'] = divTag['id'] - except: - self.log( - "ERROR: Problem in soup.find(div,attrs={id:articleBody})") - - try: - # Add class="authorId" to <div> so we can format with CSS - divTag = soup.find('div', attrs={'id': 'authorId'}) - if divTag and divTag.contents[0]: - tag = Tag(soup, "p") - tag['class'] = "authorId" - tag.insert(0, self.fixChars(self.tag_to_string(divTag.contents[0], - use_alt=False))) - divTag.replaceWith(tag) - except: - self.log( - "ERROR: Problem in Add class=authorId to <div> so we can format with CSS") - return soup - - def populate_article_metadata(self, article, soup, first): - if not first: - return - idxdiv = soup.find('div', attrs={'class': 'articleSpanImage'}) - if idxdiv is not None: - if idxdiv.img: - self.add_toc_thumbnail(article, re.sub( - r'links\\link\d+\\', '', idxdiv.img['src'])) - else: - img = soup.find('body').find('img') - if img is not None: - self.add_toc_thumbnail(article, re.sub( - r'links\\link\d+\\', '', img['src'])) - shortparagraph = "" - try: - if len(article.text_summary.strip()) == 0: - articlebodies = soup.findAll( - 'div', attrs={'class': 'articleBody'}) - if articlebodies: - for articlebody in articlebodies: - if articlebody: - paras = articlebody.findAll('p') - for p in paras: - refparagraph = self.massageNCXText( - self.tag_to_string(p, use_alt=False)).strip() - # account for blank paragraphs and short - # paragraphs by appending them to longer ones - if len(refparagraph) > 0: - if len(refparagraph) > 70: # approximately one line of text - newpara = shortparagraph + refparagraph - newparaDateline, newparaEm, newparaDesc = newpara.partition( - '&mdash;') - if newparaEm == '': - newparaDateline, newparaEm, newparaDesc = newpara.partition( - '&#8212;') - if newparaEm == '': - newparaDesc = newparaDateline - article.summary = article.text_summary = newparaDesc.strip() - return - else: - shortparagraph = refparagraph + " " - if shortparagraph.strip().find(" ") == -1 and not shortparagraph.strip().endswith(":"): - shortparagraph = shortparagraph + "- " - else: - article.summary = article.text_summary = self.massageNCXText( - article.text_summary) - except: - self.log("Error creating article descriptions") - return + if is_web_edition: + return self.parse_web_sections() + return self.parse_todays_page()
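
The new keep_only_tags above leans on the classes() helper defined at the top of the rewritten recipe: it builds an attrs matcher that accepts a tag whenever the tag's class list shares at least one name with the query string. A self-contained check of that matching rule, run against stock bs4 purely for illustration (inside calibre the soup comes from index_to_soup):

from bs4 import BeautifulSoup

def classes(classes):
    # same shape as the recipe's helper: match on any shared class name
    q = frozenset(classes.split(' '))
    return dict(attrs={
        'class': lambda x: x and frozenset(x.split()).intersection(q)})

soup = BeautifulSoup(
    '<div class="story-body-supplemental extra">'
    '<p class="summary">Teaser text</p></div>', 'html.parser')
div = soup.find(**classes('story-body-supplemental story-interrupter'))
print(div.find(**classes('summary')).get_text())  # Teaser text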
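The cover logic in read_nyt_metadata() above is easy to verify by hand: the todayspaper page carries a pdate meta tag holding a YYYYMMDD string, and the scan of the print front page lives at a date-derived path. A worked example with plain datetime (calibre.utils.date.strptime only adds timezone handling on top of this); the pdate value is hypothetical:

from datetime import datetime

pdate = '20180615'  # hypothetical content of the pdate meta tag
date = datetime.strptime(pdate, '%Y%m%d')
cover_url = 'https://static01.nyt.com/images/{}/nytfrontpage/scan.jpg'.format(
    date.strftime('%Y/%m/%d'))
print(cover_url)
# https://static01.nyt.com/images/2018/06/15/nytfrontpage/scan.jpg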
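The removed decode_url_date() split the URL on '/' and tried fixed positions, while exclude_url() separately rejected URLs with no /YYYY/MM/DD/ component. Both checks collapse into one regex; this is a sketch of that equivalent, not the recipe's own code:

import re
from datetime import date

def decode_url_date(url):
    # NYT article URLs embed the publication date as /YYYY/MM/DD/
    m = re.search(r'/(\d{4})/(\d{2})/(\d{2})/', url)
    if m is None:
        return None
    return date(*map(int, m.groups()))

print(decode_url_date('https://www.nytimes.com/2018/06/15/world/example.html'))
# 2018-06-15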
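The removed fixChars() undoes stray cp1252 control codes one re.sub at a time; the same mapping can be expressed as a single translation table. A sketch of that alternative, not what the recipe shipped:

CP1252_PUNCTUATION = {
    0x91: u'\u2018',  # left single quotation mark
    0x92: u'\u2019',  # right single quotation mark
    0x93: u'\u201c',  # left double quotation mark
    0x94: u'\u201d',  # right double quotation mark
    0x96: u'\u2013',  # en dash
    0x97: u'\u2014',  # em dash
}

def fix_chars(text):
    # unicode.translate accepts an ordinal-to-string mapping
    return text.translate(CP1252_PUNCTUATION)

print(fix_chars(u'\x93quoted\x94 text'))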
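Both the deleted parse_*_index() methods and the new parse_todays_page()/parse_web_sections() feed the same calibre contract: parse_index() returns a list of (section_title, articles) pairs, where each article is a dict carrying at least a title and a url. A hand-built example of that shape, with hypothetical values:

feeds = [
    ('World', [
        {'title': 'An example headline',
         'url': 'https://www.nytimes.com/2018/06/15/world/example.html',
         'description': 'Summary shown in the generated table of contents.'},
    ]),
    ('Technology', [
        {'title': 'Another example headline',
         'url': 'https://www.nytimes.com/2018/06/15/technology/example.html',
         'description': ''},
    ]),
]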
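The removed postprocess_html() repeats one pattern many times: find a tag, build a replacement with the desired name and class, move the text across, and swap it in with replaceWith(). The recipe used calibre's bundled BeautifulSoup 3, where new tags are built with Tag(soup, name); the sketch below shows the same move in stock bs4, purely for illustration:

from bs4 import BeautifulSoup

soup = BeautifulSoup('<h1><nyt_headline>A headline</nyt_headline></h1>',
                     'html.parser')
h1 = soup.find('h1')
headline = h1.find('nyt_headline')
if headline is not None:
    tag = soup.new_tag('h2')          # bs4 equivalent of Tag(soup, 'h2')
    tag['class'] = 'headline'
    tag.string = headline.get_text()
    h1.replace_with(tag)              # bs4 spelling of replaceWith()
print(soup)  # <h2 class="headline">A headline</h2>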
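Finally, the summary fallback in the removed populate_article_metadata() strips a leading wire-style dateline by partitioning the first long paragraph on an em-dash token ('&mdash;', then '&#8212;' as a fallback) and keeping the remainder. Worked on a hypothetical paragraph:

paragraph = 'WASHINGTON &mdash; The committee approved the measure on Tuesday.'
dateline, sep, rest = paragraph.partition('&mdash;')
summary = rest.strip() if sep else paragraph.strip()
print(summary)  # The committee approved the measure on Tuesday.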