Update NYTimes

This commit is contained in:
Kovid Goyal 2022-09-15 12:24:41 +05:30
parent b37186d3a1
commit 2367d3464c
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
4 changed files with 86 additions and 76 deletions

View File

@ -13,8 +13,9 @@ from calibre.ebooks.BeautifulSoup import Tag
from calibre.utils.date import strptime from calibre.utils.date import strptime
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
is_web_edition = True is_web_edition = False
oldest_web_edition_article = 7 # days oldest_web_edition_article = 7 # days
use_wayback_machine = False
# The sections to download when downloading the web edition, comment out # The sections to download when downloading the web edition, comment out
@ -92,32 +93,37 @@ class NewYorkTimes(BasicNewsRecipe):
remove_attributes = ['style'] remove_attributes = ['style']
conversion_options = {'flow_size': 0} conversion_options = {'flow_size': 0}
def preprocess_raw_html(self, raw_html, url): @property
if not hasattr(self, 'nyt_parser'): def nyt_parser(self):
ans = getattr(self, '_nyt_parser', None)
if ans is None:
from calibre.live import load_module from calibre.live import load_module
m = load_module('calibre.web.site_parsers.nytimes') self._nyt_parser = ans = load_module('calibre.web.site_parsers.nytimes')
self.nyt_parser = m return ans
def get_nyt_page(self, url):
if use_wayback_machine:
from calibre import browser
return self.nyt_parser.download_url(url, browser())
return self.browser.open_novisit(url).read()
def preprocess_raw_html(self, raw_html, url):
html = self.nyt_parser.extract_html(self.index_to_soup(raw_html)) html = self.nyt_parser.extract_html(self.index_to_soup(raw_html))
return html return html
articles_are_obfuscated = use_wayback_machine
if use_wayback_machine:
def get_obfuscated_article(self, url):
from calibre.ptempfile import PersistentTemporaryFile
with PersistentTemporaryFile() as tf:
tf.write(self.get_nyt_page(url))
return tf.name
def read_todays_paper(self): def read_todays_paper(self):
INDEX = 'https://www.nytimes.com/section/todayspaper' INDEX = 'https://www.nytimes.com/section/todayspaper'
# INDEX = 'file:///t/raw.html' # INDEX = 'file:///t/raw.html'
try: return self.index_to_soup(self.get_nyt_page(INDEX))
soup = self.index_to_soup(INDEX)
except Exception as err:
if getattr(err, 'code', None) == 404:
try:
soup = self.index_to_soup(strftime('https://www.nytimes.com/issue/todayspaper/%Y/%m/%d/todays-new-york-times'))
except Exception as err:
if getattr(err, 'code', None) == 404:
dt = datetime.datetime.today() - datetime.timedelta(days=1)
soup = self.index_to_soup(dt.strftime('https://www.nytimes.com/issue/todayspaper/%Y/%m/%d/todays-new-york-times'))
else:
raise
else:
raise
return soup
def read_nyt_metadata(self): def read_nyt_metadata(self):
soup = self.read_todays_paper() soup = self.read_todays_paper()
@ -241,7 +247,7 @@ class NewYorkTimes(BasicNewsRecipe):
for section_title, slug in web_sections: for section_title, slug in web_sections:
url = 'https://www.nytimes.com/section/' + slug url = 'https://www.nytimes.com/section/' + slug
try: try:
soup = self.index_to_soup(url) soup = self.index_to_soup(self.get_nyt_page(url))
except Exception: except Exception:
self.log.error('Failed to download section:', url) self.log.error('Failed to download section:', url)
continue continue

View File

@ -15,6 +15,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
is_web_edition = False is_web_edition = False
oldest_web_edition_article = 7 # days oldest_web_edition_article = 7 # days
use_wayback_machine = False
# The sections to download when downloading the web edition, comment out # The sections to download when downloading the web edition, comment out
@ -92,32 +93,37 @@ class NewYorkTimes(BasicNewsRecipe):
remove_attributes = ['style'] remove_attributes = ['style']
conversion_options = {'flow_size': 0} conversion_options = {'flow_size': 0}
def preprocess_raw_html(self, raw_html, url): @property
if not hasattr(self, 'nyt_parser'): def nyt_parser(self):
ans = getattr(self, '_nyt_parser', None)
if ans is None:
from calibre.live import load_module from calibre.live import load_module
m = load_module('calibre.web.site_parsers.nytimes') self._nyt_parser = ans = load_module('calibre.web.site_parsers.nytimes')
self.nyt_parser = m return ans
def get_nyt_page(self, url):
if use_wayback_machine:
from calibre import browser
return self.nyt_parser.download_url(url, browser())
return self.browser.open_novisit(url).read()
def preprocess_raw_html(self, raw_html, url):
html = self.nyt_parser.extract_html(self.index_to_soup(raw_html)) html = self.nyt_parser.extract_html(self.index_to_soup(raw_html))
return html return html
articles_are_obfuscated = use_wayback_machine
if use_wayback_machine:
def get_obfuscated_article(self, url):
from calibre.ptempfile import PersistentTemporaryFile
with PersistentTemporaryFile() as tf:
tf.write(self.get_nyt_page(url))
return tf.name
def read_todays_paper(self): def read_todays_paper(self):
INDEX = 'https://www.nytimes.com/section/todayspaper' INDEX = 'https://www.nytimes.com/section/todayspaper'
# INDEX = 'file:///t/raw.html' # INDEX = 'file:///t/raw.html'
try: return self.index_to_soup(self.get_nyt_page(INDEX))
soup = self.index_to_soup(INDEX)
except Exception as err:
if getattr(err, 'code', None) == 404:
try:
soup = self.index_to_soup(strftime('https://www.nytimes.com/issue/todayspaper/%Y/%m/%d/todays-new-york-times'))
except Exception as err:
if getattr(err, 'code', None) == 404:
dt = datetime.datetime.today() - datetime.timedelta(days=1)
soup = self.index_to_soup(dt.strftime('https://www.nytimes.com/issue/todayspaper/%Y/%m/%d/todays-new-york-times'))
else:
raise
else:
raise
return soup
def read_nyt_metadata(self): def read_nyt_metadata(self):
soup = self.read_todays_paper() soup = self.read_todays_paper()
@ -241,7 +247,7 @@ class NewYorkTimes(BasicNewsRecipe):
for section_title, slug in web_sections: for section_title, slug in web_sections:
url = 'https://www.nytimes.com/section/' + slug url = 'https://www.nytimes.com/section/' + slug
try: try:
soup = self.index_to_soup(url) soup = self.index_to_soup(self.get_nyt_page(url))
except Exception: except Exception:
self.log.error('Failed to download section:', url) self.log.error('Failed to download section:', url)
continue continue

View File

@ -2,14 +2,9 @@
# vim:fileencoding=utf-8 # vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2015, Kovid Goyal <kovid at kovidgoyal.net> # License: GPLv3 Copyright: 2015, Kovid Goyal <kovid at kovidgoyal.net>
from __future__ import unicode_literals
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
use_wayback_machine = False
def classes(classes):
q = frozenset(classes.split(' '))
return dict(attrs={'class': lambda x: x and frozenset(x.split()).intersection(q)})
def absolutize(url): def absolutize(url):
@ -28,32 +23,38 @@ class NewYorkTimesBookReview(BasicNewsRecipe):
no_javascript = True no_javascript = True
ignore_duplicate_articles = {'title', 'url'} ignore_duplicate_articles = {'title', 'url'}
encoding = 'utf-8' encoding = 'utf-8'
articles_are_obfuscated = True
delay = 1
articles_are_obfuscated = use_wayback_machine
if use_wayback_machine:
def get_obfuscated_article(self, url): def get_obfuscated_article(self, url):
if not hasattr(self, 'nyt_parser'):
from calibre.live import load_module
m = load_module('calibre.web.site_parsers.nytimes')
self.nyt_parser = m
raw = self.nyt_parser.download_url(url, self.cloned_browser)
from calibre.ptempfile import PersistentTemporaryFile from calibre.ptempfile import PersistentTemporaryFile
with PersistentTemporaryFile(suffix='.html') as pt: with PersistentTemporaryFile() as tf:
pt.write(raw) tf.write(self.get_nyt_page(url))
return pt.name return tf.name
@property
def nyt_parser(self):
ans = getattr(self, '_nyt_parser', None)
if ans is None:
from calibre.live import load_module
self._nyt_parser = ans = load_module('calibre.web.site_parsers.nytimes')
return ans
def get_nyt_page(self, url):
if use_wayback_machine:
from calibre import browser
return self.nyt_parser.download_url(url, browser())
return self.browser.open_novisit(url).read()
def preprocess_raw_html(self, raw_html, url): def preprocess_raw_html(self, raw_html, url):
if not hasattr(self, 'nyt_parser'):
from calibre.live import load_module
m = load_module('calibre.web.site_parsers.nytimes')
self.nyt_parser = m
html = self.nyt_parser.extract_html(self.index_to_soup(raw_html)) html = self.nyt_parser.extract_html(self.index_to_soup(raw_html))
return html return html
def parse_index(self): def parse_index(self):
# return [('Articles', [{'url': 'https://www.nytimes.com/2022/09/08/books/review/karen-armstrong-by-the-book-interview.html', 'title':'test'}])] # return [('Articles', [{'url': 'https://www.nytimes.com/2022/09/08/books/review/karen-armstrong-by-the-book-interview.html', 'title':'test'}])]
soup = self.index_to_soup( soup = self.index_to_soup(
'https://www.nytimes.com/pages/books/review/index.html') self.get_nyt_page('https://www.nytimes.com/pages/books/review/index.html'))
# Find TOC # Find TOC
toc = soup.find('section', id='collection-book-review').find('section').find('ol') toc = soup.find('section', id='collection-book-review').find('section').find('ol')

View File

@ -10,7 +10,7 @@ from pprint import pprint
from calibre.utils.iso8601 import parse_iso8601 from calibre.utils.iso8601 import parse_iso8601
module_version = 2 # needed for live updates module_version = 3 # needed for live updates
pprint pprint
@ -187,18 +187,15 @@ def extract_html(soup):
def download_url(url, br): def download_url(url, br):
# NYT has implemented captcha protection for its article pages, so get # Get the URL from the Wayback machine
# them from the wayback machine instead. However, wayback machine is from mechanize import Request
# flaky so god knows how well it will work under load rq = Request(
from calibre.ebooks.metadata.sources.update import search_engines_module 'http://localhost:8090/nytimes',
m = search_engines_module() data=json.dumps({"url": url}),
cu = m.wayback_machine_cached_url(url, br) headers={'User-Agent': 'calibre', 'Content-Type': 'application/json'}
raw = m.get_data_for_cached_url(cu) )
if raw is None: br.set_handle_gzip(True)
raw = br.open_novisit(cu).read() return br.open_novisit(rq, timeout=3 * 60).read()
if not isinstance(raw, bytes):
raw = raw.encode('utf-8')
return raw
if __name__ == '__main__': if __name__ == '__main__':