mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-08 10:44:09 -04:00
Update NYTimes
This commit is contained in:
parent
b37186d3a1
commit
2367d3464c
@ -13,8 +13,9 @@ from calibre.ebooks.BeautifulSoup import Tag
|
|||||||
from calibre.utils.date import strptime
|
from calibre.utils.date import strptime
|
||||||
from calibre.web.feeds.news import BasicNewsRecipe
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
|
||||||
is_web_edition = True
|
is_web_edition = False
|
||||||
oldest_web_edition_article = 7 # days
|
oldest_web_edition_article = 7 # days
|
||||||
|
use_wayback_machine = False
|
||||||
|
|
||||||
|
|
||||||
# The sections to download when downloading the web edition, comment out
|
# The sections to download when downloading the web edition, comment out
|
||||||
@ -92,32 +93,37 @@ class NewYorkTimes(BasicNewsRecipe):
|
|||||||
remove_attributes = ['style']
|
remove_attributes = ['style']
|
||||||
conversion_options = {'flow_size': 0}
|
conversion_options = {'flow_size': 0}
|
||||||
|
|
||||||
def preprocess_raw_html(self, raw_html, url):
|
@property
|
||||||
if not hasattr(self, 'nyt_parser'):
|
def nyt_parser(self):
|
||||||
|
ans = getattr(self, '_nyt_parser', None)
|
||||||
|
if ans is None:
|
||||||
from calibre.live import load_module
|
from calibre.live import load_module
|
||||||
m = load_module('calibre.web.site_parsers.nytimes')
|
self._nyt_parser = ans = load_module('calibre.web.site_parsers.nytimes')
|
||||||
self.nyt_parser = m
|
return ans
|
||||||
|
|
||||||
|
def get_nyt_page(self, url):
|
||||||
|
if use_wayback_machine:
|
||||||
|
from calibre import browser
|
||||||
|
return self.nyt_parser.download_url(url, browser())
|
||||||
|
return self.browser.open_novisit(url).read()
|
||||||
|
|
||||||
|
def preprocess_raw_html(self, raw_html, url):
|
||||||
html = self.nyt_parser.extract_html(self.index_to_soup(raw_html))
|
html = self.nyt_parser.extract_html(self.index_to_soup(raw_html))
|
||||||
return html
|
return html
|
||||||
|
|
||||||
|
articles_are_obfuscated = use_wayback_machine
|
||||||
|
|
||||||
|
if use_wayback_machine:
|
||||||
|
def get_obfuscated_article(self, url):
|
||||||
|
from calibre.ptempfile import PersistentTemporaryFile
|
||||||
|
with PersistentTemporaryFile() as tf:
|
||||||
|
tf.write(self.get_nyt_page(url))
|
||||||
|
return tf.name
|
||||||
|
|
||||||
def read_todays_paper(self):
|
def read_todays_paper(self):
|
||||||
INDEX = 'https://www.nytimes.com/section/todayspaper'
|
INDEX = 'https://www.nytimes.com/section/todayspaper'
|
||||||
# INDEX = 'file:///t/raw.html'
|
# INDEX = 'file:///t/raw.html'
|
||||||
try:
|
return self.index_to_soup(self.get_nyt_page(INDEX))
|
||||||
soup = self.index_to_soup(INDEX)
|
|
||||||
except Exception as err:
|
|
||||||
if getattr(err, 'code', None) == 404:
|
|
||||||
try:
|
|
||||||
soup = self.index_to_soup(strftime('https://www.nytimes.com/issue/todayspaper/%Y/%m/%d/todays-new-york-times'))
|
|
||||||
except Exception as err:
|
|
||||||
if getattr(err, 'code', None) == 404:
|
|
||||||
dt = datetime.datetime.today() - datetime.timedelta(days=1)
|
|
||||||
soup = self.index_to_soup(dt.strftime('https://www.nytimes.com/issue/todayspaper/%Y/%m/%d/todays-new-york-times'))
|
|
||||||
else:
|
|
||||||
raise
|
|
||||||
else:
|
|
||||||
raise
|
|
||||||
return soup
|
|
||||||
|
|
||||||
def read_nyt_metadata(self):
|
def read_nyt_metadata(self):
|
||||||
soup = self.read_todays_paper()
|
soup = self.read_todays_paper()
|
||||||
@ -241,7 +247,7 @@ class NewYorkTimes(BasicNewsRecipe):
|
|||||||
for section_title, slug in web_sections:
|
for section_title, slug in web_sections:
|
||||||
url = 'https://www.nytimes.com/section/' + slug
|
url = 'https://www.nytimes.com/section/' + slug
|
||||||
try:
|
try:
|
||||||
soup = self.index_to_soup(url)
|
soup = self.index_to_soup(self.get_nyt_page(url))
|
||||||
except Exception:
|
except Exception:
|
||||||
self.log.error('Failed to download section:', url)
|
self.log.error('Failed to download section:', url)
|
||||||
continue
|
continue
|
||||||
|
@ -15,6 +15,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
|
|||||||
|
|
||||||
is_web_edition = False
|
is_web_edition = False
|
||||||
oldest_web_edition_article = 7 # days
|
oldest_web_edition_article = 7 # days
|
||||||
|
use_wayback_machine = False
|
||||||
|
|
||||||
|
|
||||||
# The sections to download when downloading the web edition, comment out
|
# The sections to download when downloading the web edition, comment out
|
||||||
@ -92,32 +93,37 @@ class NewYorkTimes(BasicNewsRecipe):
|
|||||||
remove_attributes = ['style']
|
remove_attributes = ['style']
|
||||||
conversion_options = {'flow_size': 0}
|
conversion_options = {'flow_size': 0}
|
||||||
|
|
||||||
def preprocess_raw_html(self, raw_html, url):
|
@property
|
||||||
if not hasattr(self, 'nyt_parser'):
|
def nyt_parser(self):
|
||||||
|
ans = getattr(self, '_nyt_parser', None)
|
||||||
|
if ans is None:
|
||||||
from calibre.live import load_module
|
from calibre.live import load_module
|
||||||
m = load_module('calibre.web.site_parsers.nytimes')
|
self._nyt_parser = ans = load_module('calibre.web.site_parsers.nytimes')
|
||||||
self.nyt_parser = m
|
return ans
|
||||||
|
|
||||||
|
def get_nyt_page(self, url):
|
||||||
|
if use_wayback_machine:
|
||||||
|
from calibre import browser
|
||||||
|
return self.nyt_parser.download_url(url, browser())
|
||||||
|
return self.browser.open_novisit(url).read()
|
||||||
|
|
||||||
|
def preprocess_raw_html(self, raw_html, url):
|
||||||
html = self.nyt_parser.extract_html(self.index_to_soup(raw_html))
|
html = self.nyt_parser.extract_html(self.index_to_soup(raw_html))
|
||||||
return html
|
return html
|
||||||
|
|
||||||
|
articles_are_obfuscated = use_wayback_machine
|
||||||
|
|
||||||
|
if use_wayback_machine:
|
||||||
|
def get_obfuscated_article(self, url):
|
||||||
|
from calibre.ptempfile import PersistentTemporaryFile
|
||||||
|
with PersistentTemporaryFile() as tf:
|
||||||
|
tf.write(self.get_nyt_page(url))
|
||||||
|
return tf.name
|
||||||
|
|
||||||
def read_todays_paper(self):
|
def read_todays_paper(self):
|
||||||
INDEX = 'https://www.nytimes.com/section/todayspaper'
|
INDEX = 'https://www.nytimes.com/section/todayspaper'
|
||||||
# INDEX = 'file:///t/raw.html'
|
# INDEX = 'file:///t/raw.html'
|
||||||
try:
|
return self.index_to_soup(self.get_nyt_page(INDEX))
|
||||||
soup = self.index_to_soup(INDEX)
|
|
||||||
except Exception as err:
|
|
||||||
if getattr(err, 'code', None) == 404:
|
|
||||||
try:
|
|
||||||
soup = self.index_to_soup(strftime('https://www.nytimes.com/issue/todayspaper/%Y/%m/%d/todays-new-york-times'))
|
|
||||||
except Exception as err:
|
|
||||||
if getattr(err, 'code', None) == 404:
|
|
||||||
dt = datetime.datetime.today() - datetime.timedelta(days=1)
|
|
||||||
soup = self.index_to_soup(dt.strftime('https://www.nytimes.com/issue/todayspaper/%Y/%m/%d/todays-new-york-times'))
|
|
||||||
else:
|
|
||||||
raise
|
|
||||||
else:
|
|
||||||
raise
|
|
||||||
return soup
|
|
||||||
|
|
||||||
def read_nyt_metadata(self):
|
def read_nyt_metadata(self):
|
||||||
soup = self.read_todays_paper()
|
soup = self.read_todays_paper()
|
||||||
@ -241,7 +247,7 @@ class NewYorkTimes(BasicNewsRecipe):
|
|||||||
for section_title, slug in web_sections:
|
for section_title, slug in web_sections:
|
||||||
url = 'https://www.nytimes.com/section/' + slug
|
url = 'https://www.nytimes.com/section/' + slug
|
||||||
try:
|
try:
|
||||||
soup = self.index_to_soup(url)
|
soup = self.index_to_soup(self.get_nyt_page(url))
|
||||||
except Exception:
|
except Exception:
|
||||||
self.log.error('Failed to download section:', url)
|
self.log.error('Failed to download section:', url)
|
||||||
continue
|
continue
|
||||||
|
@ -2,14 +2,9 @@
|
|||||||
# vim:fileencoding=utf-8
|
# vim:fileencoding=utf-8
|
||||||
# License: GPLv3 Copyright: 2015, Kovid Goyal <kovid at kovidgoyal.net>
|
# License: GPLv3 Copyright: 2015, Kovid Goyal <kovid at kovidgoyal.net>
|
||||||
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
from calibre.web.feeds.news import BasicNewsRecipe
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
|
||||||
|
use_wayback_machine = False
|
||||||
def classes(classes):
|
|
||||||
q = frozenset(classes.split(' '))
|
|
||||||
return dict(attrs={'class': lambda x: x and frozenset(x.split()).intersection(q)})
|
|
||||||
|
|
||||||
|
|
||||||
def absolutize(url):
|
def absolutize(url):
|
||||||
@ -28,32 +23,38 @@ class NewYorkTimesBookReview(BasicNewsRecipe):
|
|||||||
no_javascript = True
|
no_javascript = True
|
||||||
ignore_duplicate_articles = {'title', 'url'}
|
ignore_duplicate_articles = {'title', 'url'}
|
||||||
encoding = 'utf-8'
|
encoding = 'utf-8'
|
||||||
articles_are_obfuscated = True
|
|
||||||
delay = 1
|
|
||||||
|
|
||||||
def get_obfuscated_article(self, url):
|
articles_are_obfuscated = use_wayback_machine
|
||||||
if not hasattr(self, 'nyt_parser'):
|
|
||||||
|
if use_wayback_machine:
|
||||||
|
def get_obfuscated_article(self, url):
|
||||||
|
from calibre.ptempfile import PersistentTemporaryFile
|
||||||
|
with PersistentTemporaryFile() as tf:
|
||||||
|
tf.write(self.get_nyt_page(url))
|
||||||
|
return tf.name
|
||||||
|
|
||||||
|
@property
|
||||||
|
def nyt_parser(self):
|
||||||
|
ans = getattr(self, '_nyt_parser', None)
|
||||||
|
if ans is None:
|
||||||
from calibre.live import load_module
|
from calibre.live import load_module
|
||||||
m = load_module('calibre.web.site_parsers.nytimes')
|
self._nyt_parser = ans = load_module('calibre.web.site_parsers.nytimes')
|
||||||
self.nyt_parser = m
|
return ans
|
||||||
raw = self.nyt_parser.download_url(url, self.cloned_browser)
|
|
||||||
from calibre.ptempfile import PersistentTemporaryFile
|
def get_nyt_page(self, url):
|
||||||
with PersistentTemporaryFile(suffix='.html') as pt:
|
if use_wayback_machine:
|
||||||
pt.write(raw)
|
from calibre import browser
|
||||||
return pt.name
|
return self.nyt_parser.download_url(url, browser())
|
||||||
|
return self.browser.open_novisit(url).read()
|
||||||
|
|
||||||
def preprocess_raw_html(self, raw_html, url):
|
def preprocess_raw_html(self, raw_html, url):
|
||||||
if not hasattr(self, 'nyt_parser'):
|
|
||||||
from calibre.live import load_module
|
|
||||||
m = load_module('calibre.web.site_parsers.nytimes')
|
|
||||||
self.nyt_parser = m
|
|
||||||
html = self.nyt_parser.extract_html(self.index_to_soup(raw_html))
|
html = self.nyt_parser.extract_html(self.index_to_soup(raw_html))
|
||||||
return html
|
return html
|
||||||
|
|
||||||
def parse_index(self):
|
def parse_index(self):
|
||||||
# return [('Articles', [{'url': 'https://www.nytimes.com/2022/09/08/books/review/karen-armstrong-by-the-book-interview.html', 'title':'test'}])]
|
# return [('Articles', [{'url': 'https://www.nytimes.com/2022/09/08/books/review/karen-armstrong-by-the-book-interview.html', 'title':'test'}])]
|
||||||
soup = self.index_to_soup(
|
soup = self.index_to_soup(
|
||||||
'https://www.nytimes.com/pages/books/review/index.html')
|
self.get_nyt_page('https://www.nytimes.com/pages/books/review/index.html'))
|
||||||
|
|
||||||
# Find TOC
|
# Find TOC
|
||||||
toc = soup.find('section', id='collection-book-review').find('section').find('ol')
|
toc = soup.find('section', id='collection-book-review').find('section').find('ol')
|
||||||
|
@ -10,7 +10,7 @@ from pprint import pprint
|
|||||||
from calibre.utils.iso8601 import parse_iso8601
|
from calibre.utils.iso8601 import parse_iso8601
|
||||||
|
|
||||||
|
|
||||||
module_version = 2 # needed for live updates
|
module_version = 3 # needed for live updates
|
||||||
pprint
|
pprint
|
||||||
|
|
||||||
|
|
||||||
@ -187,18 +187,15 @@ def extract_html(soup):
|
|||||||
|
|
||||||
|
|
||||||
def download_url(url, br):
|
def download_url(url, br):
|
||||||
# NYT has implemented captcha protection for its article pages, so get
|
# Get the URL from the Wayback machine
|
||||||
# them from the wayback machine instead. However, wayback machine is
|
from mechanize import Request
|
||||||
# flaky so god knows how well it will work under load
|
rq = Request(
|
||||||
from calibre.ebooks.metadata.sources.update import search_engines_module
|
'http://localhost:8090/nytimes',
|
||||||
m = search_engines_module()
|
data=json.dumps({"url": url}),
|
||||||
cu = m.wayback_machine_cached_url(url, br)
|
headers={'User-Agent': 'calibre', 'Content-Type': 'application/json'}
|
||||||
raw = m.get_data_for_cached_url(cu)
|
)
|
||||||
if raw is None:
|
br.set_handle_gzip(True)
|
||||||
raw = br.open_novisit(cu).read()
|
return br.open_novisit(rq, timeout=3 * 60).read()
|
||||||
if not isinstance(raw, bytes):
|
|
||||||
raw = raw.encode('utf-8')
|
|
||||||
return raw
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
Loading…
x
Reference in New Issue
Block a user