mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-08 02:34:06 -04:00
Update NYTimes
This commit is contained in:
parent
b37186d3a1
commit
2367d3464c
@ -13,8 +13,9 @@ from calibre.ebooks.BeautifulSoup import Tag
|
||||
from calibre.utils.date import strptime
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
is_web_edition = True
|
||||
is_web_edition = False
|
||||
oldest_web_edition_article = 7 # days
|
||||
use_wayback_machine = False
|
||||
|
||||
|
||||
# The sections to download when downloading the web edition, comment out
|
||||
@ -92,32 +93,37 @@ class NewYorkTimes(BasicNewsRecipe):
|
||||
remove_attributes = ['style']
|
||||
conversion_options = {'flow_size': 0}
|
||||
|
||||
def preprocess_raw_html(self, raw_html, url):
|
||||
if not hasattr(self, 'nyt_parser'):
|
||||
@property
|
||||
def nyt_parser(self):
|
||||
ans = getattr(self, '_nyt_parser', None)
|
||||
if ans is None:
|
||||
from calibre.live import load_module
|
||||
m = load_module('calibre.web.site_parsers.nytimes')
|
||||
self.nyt_parser = m
|
||||
self._nyt_parser = ans = load_module('calibre.web.site_parsers.nytimes')
|
||||
return ans
|
||||
|
||||
def get_nyt_page(self, url):
|
||||
if use_wayback_machine:
|
||||
from calibre import browser
|
||||
return self.nyt_parser.download_url(url, browser())
|
||||
return self.browser.open_novisit(url).read()
|
||||
|
||||
def preprocess_raw_html(self, raw_html, url):
|
||||
html = self.nyt_parser.extract_html(self.index_to_soup(raw_html))
|
||||
return html
|
||||
|
||||
articles_are_obfuscated = use_wayback_machine
|
||||
|
||||
if use_wayback_machine:
|
||||
def get_obfuscated_article(self, url):
|
||||
from calibre.ptempfile import PersistentTemporaryFile
|
||||
with PersistentTemporaryFile() as tf:
|
||||
tf.write(self.get_nyt_page(url))
|
||||
return tf.name
|
||||
|
||||
def read_todays_paper(self):
|
||||
INDEX = 'https://www.nytimes.com/section/todayspaper'
|
||||
# INDEX = 'file:///t/raw.html'
|
||||
try:
|
||||
soup = self.index_to_soup(INDEX)
|
||||
except Exception as err:
|
||||
if getattr(err, 'code', None) == 404:
|
||||
try:
|
||||
soup = self.index_to_soup(strftime('https://www.nytimes.com/issue/todayspaper/%Y/%m/%d/todays-new-york-times'))
|
||||
except Exception as err:
|
||||
if getattr(err, 'code', None) == 404:
|
||||
dt = datetime.datetime.today() - datetime.timedelta(days=1)
|
||||
soup = self.index_to_soup(dt.strftime('https://www.nytimes.com/issue/todayspaper/%Y/%m/%d/todays-new-york-times'))
|
||||
else:
|
||||
raise
|
||||
else:
|
||||
raise
|
||||
return soup
|
||||
return self.index_to_soup(self.get_nyt_page(INDEX))
|
||||
|
||||
def read_nyt_metadata(self):
|
||||
soup = self.read_todays_paper()
|
||||
@ -241,7 +247,7 @@ class NewYorkTimes(BasicNewsRecipe):
|
||||
for section_title, slug in web_sections:
|
||||
url = 'https://www.nytimes.com/section/' + slug
|
||||
try:
|
||||
soup = self.index_to_soup(url)
|
||||
soup = self.index_to_soup(self.get_nyt_page(url))
|
||||
except Exception:
|
||||
self.log.error('Failed to download section:', url)
|
||||
continue
|
||||
|
@ -15,6 +15,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
is_web_edition = False
|
||||
oldest_web_edition_article = 7 # days
|
||||
use_wayback_machine = False
|
||||
|
||||
|
||||
# The sections to download when downloading the web edition, comment out
|
||||
@ -92,32 +93,37 @@ class NewYorkTimes(BasicNewsRecipe):
|
||||
remove_attributes = ['style']
|
||||
conversion_options = {'flow_size': 0}
|
||||
|
||||
def preprocess_raw_html(self, raw_html, url):
|
||||
if not hasattr(self, 'nyt_parser'):
|
||||
@property
|
||||
def nyt_parser(self):
|
||||
ans = getattr(self, '_nyt_parser', None)
|
||||
if ans is None:
|
||||
from calibre.live import load_module
|
||||
m = load_module('calibre.web.site_parsers.nytimes')
|
||||
self.nyt_parser = m
|
||||
self._nyt_parser = ans = load_module('calibre.web.site_parsers.nytimes')
|
||||
return ans
|
||||
|
||||
def get_nyt_page(self, url):
|
||||
if use_wayback_machine:
|
||||
from calibre import browser
|
||||
return self.nyt_parser.download_url(url, browser())
|
||||
return self.browser.open_novisit(url).read()
|
||||
|
||||
def preprocess_raw_html(self, raw_html, url):
|
||||
html = self.nyt_parser.extract_html(self.index_to_soup(raw_html))
|
||||
return html
|
||||
|
||||
articles_are_obfuscated = use_wayback_machine
|
||||
|
||||
if use_wayback_machine:
|
||||
def get_obfuscated_article(self, url):
|
||||
from calibre.ptempfile import PersistentTemporaryFile
|
||||
with PersistentTemporaryFile() as tf:
|
||||
tf.write(self.get_nyt_page(url))
|
||||
return tf.name
|
||||
|
||||
def read_todays_paper(self):
|
||||
INDEX = 'https://www.nytimes.com/section/todayspaper'
|
||||
# INDEX = 'file:///t/raw.html'
|
||||
try:
|
||||
soup = self.index_to_soup(INDEX)
|
||||
except Exception as err:
|
||||
if getattr(err, 'code', None) == 404:
|
||||
try:
|
||||
soup = self.index_to_soup(strftime('https://www.nytimes.com/issue/todayspaper/%Y/%m/%d/todays-new-york-times'))
|
||||
except Exception as err:
|
||||
if getattr(err, 'code', None) == 404:
|
||||
dt = datetime.datetime.today() - datetime.timedelta(days=1)
|
||||
soup = self.index_to_soup(dt.strftime('https://www.nytimes.com/issue/todayspaper/%Y/%m/%d/todays-new-york-times'))
|
||||
else:
|
||||
raise
|
||||
else:
|
||||
raise
|
||||
return soup
|
||||
return self.index_to_soup(self.get_nyt_page(INDEX))
|
||||
|
||||
def read_nyt_metadata(self):
|
||||
soup = self.read_todays_paper()
|
||||
@ -241,7 +247,7 @@ class NewYorkTimes(BasicNewsRecipe):
|
||||
for section_title, slug in web_sections:
|
||||
url = 'https://www.nytimes.com/section/' + slug
|
||||
try:
|
||||
soup = self.index_to_soup(url)
|
||||
soup = self.index_to_soup(self.get_nyt_page(url))
|
||||
except Exception:
|
||||
self.log.error('Failed to download section:', url)
|
||||
continue
|
||||
|
@ -2,14 +2,9 @@
|
||||
# vim:fileencoding=utf-8
|
||||
# License: GPLv3 Copyright: 2015, Kovid Goyal <kovid at kovidgoyal.net>
|
||||
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
|
||||
|
||||
def classes(classes):
|
||||
q = frozenset(classes.split(' '))
|
||||
return dict(attrs={'class': lambda x: x and frozenset(x.split()).intersection(q)})
|
||||
use_wayback_machine = False
|
||||
|
||||
|
||||
def absolutize(url):
|
||||
@ -28,32 +23,38 @@ class NewYorkTimesBookReview(BasicNewsRecipe):
|
||||
no_javascript = True
|
||||
ignore_duplicate_articles = {'title', 'url'}
|
||||
encoding = 'utf-8'
|
||||
articles_are_obfuscated = True
|
||||
delay = 1
|
||||
|
||||
def get_obfuscated_article(self, url):
|
||||
if not hasattr(self, 'nyt_parser'):
|
||||
articles_are_obfuscated = use_wayback_machine
|
||||
|
||||
if use_wayback_machine:
|
||||
def get_obfuscated_article(self, url):
|
||||
from calibre.ptempfile import PersistentTemporaryFile
|
||||
with PersistentTemporaryFile() as tf:
|
||||
tf.write(self.get_nyt_page(url))
|
||||
return tf.name
|
||||
|
||||
@property
|
||||
def nyt_parser(self):
|
||||
ans = getattr(self, '_nyt_parser', None)
|
||||
if ans is None:
|
||||
from calibre.live import load_module
|
||||
m = load_module('calibre.web.site_parsers.nytimes')
|
||||
self.nyt_parser = m
|
||||
raw = self.nyt_parser.download_url(url, self.cloned_browser)
|
||||
from calibre.ptempfile import PersistentTemporaryFile
|
||||
with PersistentTemporaryFile(suffix='.html') as pt:
|
||||
pt.write(raw)
|
||||
return pt.name
|
||||
self._nyt_parser = ans = load_module('calibre.web.site_parsers.nytimes')
|
||||
return ans
|
||||
|
||||
def get_nyt_page(self, url):
|
||||
if use_wayback_machine:
|
||||
from calibre import browser
|
||||
return self.nyt_parser.download_url(url, browser())
|
||||
return self.browser.open_novisit(url).read()
|
||||
|
||||
def preprocess_raw_html(self, raw_html, url):
|
||||
if not hasattr(self, 'nyt_parser'):
|
||||
from calibre.live import load_module
|
||||
m = load_module('calibre.web.site_parsers.nytimes')
|
||||
self.nyt_parser = m
|
||||
html = self.nyt_parser.extract_html(self.index_to_soup(raw_html))
|
||||
return html
|
||||
|
||||
def parse_index(self):
|
||||
# return [('Articles', [{'url': 'https://www.nytimes.com/2022/09/08/books/review/karen-armstrong-by-the-book-interview.html', 'title':'test'}])]
|
||||
soup = self.index_to_soup(
|
||||
'https://www.nytimes.com/pages/books/review/index.html')
|
||||
self.get_nyt_page('https://www.nytimes.com/pages/books/review/index.html'))
|
||||
|
||||
# Find TOC
|
||||
toc = soup.find('section', id='collection-book-review').find('section').find('ol')
|
||||
|
@ -10,7 +10,7 @@ from pprint import pprint
|
||||
from calibre.utils.iso8601 import parse_iso8601
|
||||
|
||||
|
||||
module_version = 2 # needed for live updates
|
||||
module_version = 3 # needed for live updates
|
||||
pprint
|
||||
|
||||
|
||||
@ -187,18 +187,15 @@ def extract_html(soup):
|
||||
|
||||
|
||||
def download_url(url, br):
|
||||
# NYT has implemented captcha protection for its article pages, so get
|
||||
# them from the wayback machine instead. However, wayback machine is
|
||||
# flaky so god knows how well it will work under load
|
||||
from calibre.ebooks.metadata.sources.update import search_engines_module
|
||||
m = search_engines_module()
|
||||
cu = m.wayback_machine_cached_url(url, br)
|
||||
raw = m.get_data_for_cached_url(cu)
|
||||
if raw is None:
|
||||
raw = br.open_novisit(cu).read()
|
||||
if not isinstance(raw, bytes):
|
||||
raw = raw.encode('utf-8')
|
||||
return raw
|
||||
# Get the URL from the Wayback machine
|
||||
from mechanize import Request
|
||||
rq = Request(
|
||||
'http://localhost:8090/nytimes',
|
||||
data=json.dumps({"url": url}),
|
||||
headers={'User-Agent': 'calibre', 'Content-Type': 'application/json'}
|
||||
)
|
||||
br.set_handle_gzip(True)
|
||||
return br.open_novisit(rq, timeout=3 * 60).read()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
Loading…
x
Reference in New Issue
Block a user