diff --git a/recipes/nytimes.recipe b/recipes/nytimes.recipe index 1f7826de1f..bbbb238a40 100644 --- a/recipes/nytimes.recipe +++ b/recipes/nytimes.recipe @@ -13,8 +13,9 @@ from calibre.ebooks.BeautifulSoup import Tag from calibre.utils.date import strptime from calibre.web.feeds.news import BasicNewsRecipe -is_web_edition = True +is_web_edition = False oldest_web_edition_article = 7 # days +use_wayback_machine = False # The sections to download when downloading the web edition, comment out @@ -92,32 +93,37 @@ class NewYorkTimes(BasicNewsRecipe): remove_attributes = ['style'] conversion_options = {'flow_size': 0} - def preprocess_raw_html(self, raw_html, url): - if not hasattr(self, 'nyt_parser'): + @property + def nyt_parser(self): + ans = getattr(self, '_nyt_parser', None) + if ans is None: from calibre.live import load_module - m = load_module('calibre.web.site_parsers.nytimes') - self.nyt_parser = m + self._nyt_parser = ans = load_module('calibre.web.site_parsers.nytimes') + return ans + + def get_nyt_page(self, url): + if use_wayback_machine: + from calibre import browser + return self.nyt_parser.download_url(url, browser()) + return self.browser.open_novisit(url).read() + + def preprocess_raw_html(self, raw_html, url): html = self.nyt_parser.extract_html(self.index_to_soup(raw_html)) return html + articles_are_obfuscated = use_wayback_machine + + if use_wayback_machine: + def get_obfuscated_article(self, url): + from calibre.ptempfile import PersistentTemporaryFile + with PersistentTemporaryFile() as tf: + tf.write(self.get_nyt_page(url)) + return tf.name + def read_todays_paper(self): INDEX = 'https://www.nytimes.com/section/todayspaper' # INDEX = 'file:///t/raw.html' - try: - soup = self.index_to_soup(INDEX) - except Exception as err: - if getattr(err, 'code', None) == 404: - try: - soup = self.index_to_soup(strftime('https://www.nytimes.com/issue/todayspaper/%Y/%m/%d/todays-new-york-times')) - except Exception as err: - if getattr(err, 'code', None) == 404: - dt = datetime.datetime.today() - datetime.timedelta(days=1) - soup = self.index_to_soup(dt.strftime('https://www.nytimes.com/issue/todayspaper/%Y/%m/%d/todays-new-york-times')) - else: - raise - else: - raise - return soup + return self.index_to_soup(self.get_nyt_page(INDEX)) def read_nyt_metadata(self): soup = self.read_todays_paper() @@ -241,7 +247,7 @@ class NewYorkTimes(BasicNewsRecipe): for section_title, slug in web_sections: url = 'https://www.nytimes.com/section/' + slug try: - soup = self.index_to_soup(url) + soup = self.index_to_soup(self.get_nyt_page(url)) except Exception: self.log.error('Failed to download section:', url) continue diff --git a/recipes/nytimes_sub.recipe b/recipes/nytimes_sub.recipe index 6b42573e88..bbbb238a40 100644 --- a/recipes/nytimes_sub.recipe +++ b/recipes/nytimes_sub.recipe @@ -15,6 +15,7 @@ from calibre.web.feeds.news import BasicNewsRecipe is_web_edition = False oldest_web_edition_article = 7 # days +use_wayback_machine = False # The sections to download when downloading the web edition, comment out @@ -92,32 +93,37 @@ class NewYorkTimes(BasicNewsRecipe): remove_attributes = ['style'] conversion_options = {'flow_size': 0} - def preprocess_raw_html(self, raw_html, url): - if not hasattr(self, 'nyt_parser'): + @property + def nyt_parser(self): + ans = getattr(self, '_nyt_parser', None) + if ans is None: from calibre.live import load_module - m = load_module('calibre.web.site_parsers.nytimes') - self.nyt_parser = m + self._nyt_parser = ans = load_module('calibre.web.site_parsers.nytimes') + return ans + + def get_nyt_page(self, url): + if use_wayback_machine: + from calibre import browser + return self.nyt_parser.download_url(url, browser()) + return self.browser.open_novisit(url).read() + + def preprocess_raw_html(self, raw_html, url): html = self.nyt_parser.extract_html(self.index_to_soup(raw_html)) return html + articles_are_obfuscated = use_wayback_machine + + if use_wayback_machine: + def get_obfuscated_article(self, url): + from calibre.ptempfile import PersistentTemporaryFile + with PersistentTemporaryFile() as tf: + tf.write(self.get_nyt_page(url)) + return tf.name + def read_todays_paper(self): INDEX = 'https://www.nytimes.com/section/todayspaper' # INDEX = 'file:///t/raw.html' - try: - soup = self.index_to_soup(INDEX) - except Exception as err: - if getattr(err, 'code', None) == 404: - try: - soup = self.index_to_soup(strftime('https://www.nytimes.com/issue/todayspaper/%Y/%m/%d/todays-new-york-times')) - except Exception as err: - if getattr(err, 'code', None) == 404: - dt = datetime.datetime.today() - datetime.timedelta(days=1) - soup = self.index_to_soup(dt.strftime('https://www.nytimes.com/issue/todayspaper/%Y/%m/%d/todays-new-york-times')) - else: - raise - else: - raise - return soup + return self.index_to_soup(self.get_nyt_page(INDEX)) def read_nyt_metadata(self): soup = self.read_todays_paper() @@ -241,7 +247,7 @@ class NewYorkTimes(BasicNewsRecipe): for section_title, slug in web_sections: url = 'https://www.nytimes.com/section/' + slug try: - soup = self.index_to_soup(url) + soup = self.index_to_soup(self.get_nyt_page(url)) except Exception: self.log.error('Failed to download section:', url) continue diff --git a/recipes/nytimesbook.recipe b/recipes/nytimesbook.recipe index dfb62ed986..d18846eeef 100644 --- a/recipes/nytimesbook.recipe +++ b/recipes/nytimesbook.recipe @@ -2,14 +2,9 @@ # vim:fileencoding=utf-8 # License: GPLv3 Copyright: 2015, Kovid Goyal -from __future__ import unicode_literals - from calibre.web.feeds.news import BasicNewsRecipe - -def classes(classes): - q = frozenset(classes.split(' ')) - return dict(attrs={'class': lambda x: x and frozenset(x.split()).intersection(q)}) +use_wayback_machine = False def absolutize(url): @@ -28,32 +23,38 @@ class NewYorkTimesBookReview(BasicNewsRecipe): no_javascript = True ignore_duplicate_articles = {'title', 'url'} encoding = 'utf-8' - articles_are_obfuscated = True - delay = 1 - def get_obfuscated_article(self, url): - if not hasattr(self, 'nyt_parser'): + articles_are_obfuscated = use_wayback_machine + + if use_wayback_machine: + def get_obfuscated_article(self, url): + from calibre.ptempfile import PersistentTemporaryFile + with PersistentTemporaryFile() as tf: + tf.write(self.get_nyt_page(url)) + return tf.name + + @property + def nyt_parser(self): + ans = getattr(self, '_nyt_parser', None) + if ans is None: from calibre.live import load_module - m = load_module('calibre.web.site_parsers.nytimes') - self.nyt_parser = m - raw = self.nyt_parser.download_url(url, self.cloned_browser) - from calibre.ptempfile import PersistentTemporaryFile - with PersistentTemporaryFile(suffix='.html') as pt: - pt.write(raw) - return pt.name + self._nyt_parser = ans = load_module('calibre.web.site_parsers.nytimes') + return ans + + def get_nyt_page(self, url): + if use_wayback_machine: + from calibre import browser + return self.nyt_parser.download_url(url, browser()) + return self.browser.open_novisit(url).read() def preprocess_raw_html(self, raw_html, url): - if not hasattr(self, 'nyt_parser'): - from calibre.live import load_module - m = load_module('calibre.web.site_parsers.nytimes') - self.nyt_parser = m html = self.nyt_parser.extract_html(self.index_to_soup(raw_html)) return html def parse_index(self): # return [('Articles', [{'url': 'https://www.nytimes.com/2022/09/08/books/review/karen-armstrong-by-the-book-interview.html', 'title':'test'}])] soup = self.index_to_soup( - 'https://www.nytimes.com/pages/books/review/index.html') + self.get_nyt_page('https://www.nytimes.com/pages/books/review/index.html')) # Find TOC toc = soup.find('section', id='collection-book-review').find('section').find('ol') diff --git a/src/calibre/web/site_parsers/nytimes.py b/src/calibre/web/site_parsers/nytimes.py index f849ac9098..a4e0438518 100644 --- a/src/calibre/web/site_parsers/nytimes.py +++ b/src/calibre/web/site_parsers/nytimes.py @@ -10,7 +10,7 @@ from pprint import pprint from calibre.utils.iso8601 import parse_iso8601 -module_version = 2 # needed for live updates +module_version = 3 # needed for live updates pprint @@ -187,18 +187,15 @@ def extract_html(soup): def download_url(url, br): - # NYT has implemented captcha protection for its article pages, so get - # them from the wayback machine instead. However, wayback machine is - # flaky so god knows how well it will work under load - from calibre.ebooks.metadata.sources.update import search_engines_module - m = search_engines_module() - cu = m.wayback_machine_cached_url(url, br) - raw = m.get_data_for_cached_url(cu) - if raw is None: - raw = br.open_novisit(cu).read() - if not isinstance(raw, bytes): - raw = raw.encode('utf-8') - return raw + # Get the URL from the Wayback machine + from mechanize import Request + rq = Request( + 'http://localhost:8090/nytimes', + data=json.dumps({"url": url}), + headers={'User-Agent': 'calibre', 'Content-Type': 'application/json'} + ) + br.set_handle_gzip(True) + return br.open_novisit(rq, timeout=3 * 60).read() if __name__ == '__main__':