Update NYTimes

This commit is contained in:
Kovid Goyal 2022-09-12 07:40:24 +05:30
parent 692fa6d4fc
commit ceafc1b05e
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 24 additions and 200 deletions

View File

@@ -3,20 +3,20 @@
# License: GPLv3 Copyright: 2018, Kovid Goyal <kovid at kovidgoyal.net> # License: GPLv3 Copyright: 2018, Kovid Goyal <kovid at kovidgoyal.net>
from __future__ import absolute_import, division, print_function, unicode_literals from __future__ import absolute_import, division, print_function, unicode_literals
import datetime import datetime
import re
import json import json
import re
from pprint import pprint # noqa from pprint import pprint # noqa
from calibre import strftime from calibre import strftime
from calibre.ebooks.BeautifulSoup import Tag
from calibre.utils.date import strptime from calibre.utils.date import strptime
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
is_web_edition = True is_web_edition = True
oldest_web_edition_article = 7 # days oldest_web_edition_article = 7 # days
# The sections to download when downloading the web edition, comment out # The sections to download when downloading the web edition, comment out
# the section you are not interested in # the section you are not interested in
web_sections = [ web_sections = [
@@ -92,76 +92,15 @@ class NewYorkTimes(BasicNewsRecipe):
remove_attributes = ['style'] remove_attributes = ['style']
conversion_options = {'flow_size': 0} conversion_options = {'flow_size': 0}
remove_tags = [ def preprocess_raw_html(self, raw_html, url):
dict(attrs={'aria-label':'tools'.split()}), if '/live/' in url:
dict(attrs={'aria-label': lambda x: x and 'New York Times Logo' in x}), self.abort_article('Cant be bothered decoding the JSON for NYT live articles')
dict(href='#site-content #site-index'.split()), if not hasattr(self, 'nyt_parser'):
dict(attrs={'aria-hidden':'true'}), from calibre.live import load_module
dict(attrs={'data-videoid':True}), m = load_module('calibre.web.site_parsers.nytimes')
dict(name='button meta link time source'.split()), self.nyt_parser = m
dict(id=lambda x: x and x.startswith('story-ad-')), html = self.nyt_parser.extract_html(self.index_to_soup(raw_html))
dict(name='head'), return html
dict(role='toolbar'),
dict(name='a', href=lambda x: x and '#story-continues-' in x),
dict(name='a', href=lambda x: x and '#whats-next' in x),
dict(id=lambda x: x and 'sharetools-' in x),
dict(id='newsletter-promo supported-by-ad bottom-wrapper top-wrapper sponsor-wrapper'.split()),
classes('story-print-citation supported-by accessibility-ad-header visually-hidden bottom-of-article ad'),
dict(attrs={'class': lambda x: x and (
'SectionBar' in x or 'recirculation' in x or 'ResponsiveAd' in x or 'accessibility-visuallyHidden' in x or 'RelatedCoverage' in x)}),
]
def preprocess_html(self, soup):
article = soup.find(id='story')
if article is None:
keep_only_tags = [dict(attrs={'aria-label': 'Main content'})]
else:
# The NYT is apparently A/B testing a new page layout
has_supplemental = article is not None and article.find(**classes('story-body-supplemental')) is not None
if has_supplemental:
keep_only_tags = [
dict(id='story-header'),
classes('story-body-supplemental story-interrupter'),
]
else:
keep_only_tags = [
dict(id='story'),
]
body = new_tag(soup, 'body')
for spec in keep_only_tags:
for tag in soup.find('body').findAll(**spec):
body.insert(len(body.contents), tag)
soup.find('body').replaceWith(body)
# Add a space to the dateline
t = soup.find(**classes('dateline'))
if t is not None:
t.insert(0, ' ')
# Remove empty li tags
for li in soup.findAll('li', attrs={'class': lambda x: x and x.startswith('css-')}):
if not li.contents and not li.string:
li.extract()
# Ensure the headline is first
h1 = soup.find('h1', itemprop='headline')
if h1 is not None:
h1.extract()
soup.find('body').contents.insert(0, h1)
# Find lazy loaded images
for div in soup.findAll(itemtype='http://schema.org/ImageObject', itemid=True):
if div.find('img') is None:
span = div.find('span')
if span is not None and self.tag_to_string(span).strip().lower() == 'image':
span.name = 'img'
span['src'] = div['itemid']
# Remove live storyline menu
for span in soup.findAll(attrs={'data-storyline-module-name': 'menu'}):
span.parent.extract()
return soup
def read_todays_paper(self): def read_todays_paper(self):
INDEX = 'https://www.nytimes.com/section/todayspaper' INDEX = 'https://www.nytimes.com/section/todayspaper'
@@ -323,30 +262,3 @@ class NewYorkTimes(BasicNewsRecipe):
if is_web_edition: if is_web_edition:
return self.parse_web_sections() return self.parse_web_sections()
return self.parse_todays_page() return self.parse_todays_page()
# The NYT occasionally returns bogus articles for some reason. Just in case
# it is because of cookies, don't store cookies.
def get_browser(self, *args, **kwargs):
return self
def clone_browser(self, *args, **kwargs):
return self.get_browser()
def open_novisit(self, *args, **kwargs):
from calibre import browser, random_user_agent
if not hasattr(self, 'rua_stored'):
self.rua_stored = random_user_agent(allow_ie=False)
br = browser(user_agent=self.rua_stored)
response = br.open_novisit(*args, **kwargs)
# headers = response.info()
# if headers.get('X-PageType') == 'vi-story':
# import tempfile
# with tempfile.NamedTemporaryFile(suffix='.html', dir='/t/n', delete=False) as f:
# f.write(response.read())
# import time
# time.sleep(1)
# br = browser()
# response = br.open_novisit(*args, **kwargs)
return response
open = open_novisit

View File

@@ -3,20 +3,20 @@
# License: GPLv3 Copyright: 2018, Kovid Goyal <kovid at kovidgoyal.net> # License: GPLv3 Copyright: 2018, Kovid Goyal <kovid at kovidgoyal.net>
from __future__ import absolute_import, division, print_function, unicode_literals from __future__ import absolute_import, division, print_function, unicode_literals
import datetime import datetime
import re
import json import json
import re
from pprint import pprint # noqa from pprint import pprint # noqa
from calibre import strftime from calibre import strftime
from calibre.ebooks.BeautifulSoup import Tag
from calibre.utils.date import strptime from calibre.utils.date import strptime
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
is_web_edition = False is_web_edition = False
oldest_web_edition_article = 7 # days oldest_web_edition_article = 7 # days
# The sections to download when downloading the web edition, comment out # The sections to download when downloading the web edition, comment out
# the section you are not interested in # the section you are not interested in
web_sections = [ web_sections = [
@@ -92,76 +92,15 @@ class NewYorkTimes(BasicNewsRecipe):
remove_attributes = ['style'] remove_attributes = ['style']
conversion_options = {'flow_size': 0} conversion_options = {'flow_size': 0}
remove_tags = [ def preprocess_raw_html(self, raw_html, url):
dict(attrs={'aria-label':'tools'.split()}), if '/live/' in url:
dict(attrs={'aria-label': lambda x: x and 'New York Times Logo' in x}), self.abort_article('Cant be bothered decoding the JSON for NYT live articles')
dict(href='#site-content #site-index'.split()), if not hasattr(self, 'nyt_parser'):
dict(attrs={'aria-hidden':'true'}), from calibre.live import load_module
dict(attrs={'data-videoid':True}), m = load_module('calibre.web.site_parsers.nytimes')
dict(name='button meta link time source'.split()), self.nyt_parser = m
dict(id=lambda x: x and x.startswith('story-ad-')), html = self.nyt_parser.extract_html(self.index_to_soup(raw_html))
dict(name='head'), return html
dict(role='toolbar'),
dict(name='a', href=lambda x: x and '#story-continues-' in x),
dict(name='a', href=lambda x: x and '#whats-next' in x),
dict(id=lambda x: x and 'sharetools-' in x),
dict(id='newsletter-promo supported-by-ad bottom-wrapper top-wrapper sponsor-wrapper'.split()),
classes('story-print-citation supported-by accessibility-ad-header visually-hidden bottom-of-article ad'),
dict(attrs={'class': lambda x: x and (
'SectionBar' in x or 'recirculation' in x or 'ResponsiveAd' in x or 'accessibility-visuallyHidden' in x or 'RelatedCoverage' in x)}),
]
def preprocess_html(self, soup):
article = soup.find(id='story')
if article is None:
keep_only_tags = [dict(attrs={'aria-label': 'Main content'})]
else:
# The NYT is apparently A/B testing a new page layout
has_supplemental = article is not None and article.find(**classes('story-body-supplemental')) is not None
if has_supplemental:
keep_only_tags = [
dict(id='story-header'),
classes('story-body-supplemental story-interrupter'),
]
else:
keep_only_tags = [
dict(id='story'),
]
body = new_tag(soup, 'body')
for spec in keep_only_tags:
for tag in soup.find('body').findAll(**spec):
body.insert(len(body.contents), tag)
soup.find('body').replaceWith(body)
# Add a space to the dateline
t = soup.find(**classes('dateline'))
if t is not None:
t.insert(0, ' ')
# Remove empty li tags
for li in soup.findAll('li', attrs={'class': lambda x: x and x.startswith('css-')}):
if not li.contents and not li.string:
li.extract()
# Ensure the headline is first
h1 = soup.find('h1', itemprop='headline')
if h1 is not None:
h1.extract()
soup.find('body').contents.insert(0, h1)
# Find lazy loaded images
for div in soup.findAll(itemtype='http://schema.org/ImageObject', itemid=True):
if div.find('img') is None:
span = div.find('span')
if span is not None and self.tag_to_string(span).strip().lower() == 'image':
span.name = 'img'
span['src'] = div['itemid']
# Remove live storyline menu
for span in soup.findAll(attrs={'data-storyline-module-name': 'menu'}):
span.parent.extract()
return soup
def read_todays_paper(self): def read_todays_paper(self):
INDEX = 'https://www.nytimes.com/section/todayspaper' INDEX = 'https://www.nytimes.com/section/todayspaper'
@@ -323,30 +262,3 @@ class NewYorkTimes(BasicNewsRecipe):
if is_web_edition: if is_web_edition:
return self.parse_web_sections() return self.parse_web_sections()
return self.parse_todays_page() return self.parse_todays_page()
# The NYT occasionally returns bogus articles for some reason. Just in case
# it is because of cookies, don't store cookies.
def get_browser(self, *args, **kwargs):
return self
def clone_browser(self, *args, **kwargs):
return self.get_browser()
def open_novisit(self, *args, **kwargs):
from calibre import browser, random_user_agent
if not hasattr(self, 'rua_stored'):
self.rua_stored = random_user_agent(allow_ie=False)
br = browser(user_agent=self.rua_stored)
response = br.open_novisit(*args, **kwargs)
# headers = response.info()
# if headers.get('X-PageType') == 'vi-story':
# import tempfile
# with tempfile.NamedTemporaryFile(suffix='.html', dir='/t/n', delete=False) as f:
# f.write(response.read())
# import time
# time.sleep(1)
# br = browser()
# response = br.open_novisit(*args, **kwargs)
return response
open = open_novisit