Mirror of https://github.com/kovidgoyal/calibre.git, synced 2025-07-08 02:34:06 -04:00

Update NYTimes

commit ceafc1b05e, parent 692fa6d4fc
@@ -3,20 +3,20 @@
 # License: GPLv3 Copyright: 2018, Kovid Goyal <kovid at kovidgoyal.net>

 from __future__ import absolute_import, division, print_function, unicode_literals

 import datetime
-import re
 import json
+import re
 from pprint import pprint  # noqa

 from calibre import strftime
+from calibre.ebooks.BeautifulSoup import Tag
 from calibre.utils.date import strptime
 from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import Tag

 is_web_edition = True
 oldest_web_edition_article = 7  # days

 # The sections to download when downloading the web edition, comment out
 # the section you are not interested in
 web_sections = [
@@ -92,76 +92,15 @@ class NewYorkTimes(BasicNewsRecipe):
     remove_attributes = ['style']
     conversion_options = {'flow_size': 0}

-    remove_tags = [
-        dict(attrs={'aria-label':'tools'.split()}),
-        dict(attrs={'aria-label': lambda x: x and 'New York Times Logo' in x}),
-        dict(href='#site-content #site-index'.split()),
-        dict(attrs={'aria-hidden':'true'}),
-        dict(attrs={'data-videoid':True}),
-        dict(name='button meta link time source'.split()),
-        dict(id=lambda x: x and x.startswith('story-ad-')),
-        dict(name='head'),
-        dict(role='toolbar'),
-        dict(name='a', href=lambda x: x and '#story-continues-' in x),
-        dict(name='a', href=lambda x: x and '#whats-next' in x),
-        dict(id=lambda x: x and 'sharetools-' in x),
-        dict(id='newsletter-promo supported-by-ad bottom-wrapper top-wrapper sponsor-wrapper'.split()),
-        classes('story-print-citation supported-by accessibility-ad-header visually-hidden bottom-of-article ad'),
-        dict(attrs={'class': lambda x: x and (
-            'SectionBar' in x or 'recirculation' in x or 'ResponsiveAd' in x or 'accessibility-visuallyHidden' in x or 'RelatedCoverage' in x)}),
-    ]
-
-    def preprocess_html(self, soup):
-        article = soup.find(id='story')
-        if article is None:
-            keep_only_tags = [dict(attrs={'aria-label': 'Main content'})]
-        else:
-            # The NYT is apparently A/B testing a new page layout
-            has_supplemental = article is not None and article.find(**classes('story-body-supplemental')) is not None
-            if has_supplemental:
-                keep_only_tags = [
-                    dict(id='story-header'),
-                    classes('story-body-supplemental story-interrupter'),
-                ]
-            else:
-                keep_only_tags = [
-                    dict(id='story'),
-                ]
-        body = new_tag(soup, 'body')
-        for spec in keep_only_tags:
-            for tag in soup.find('body').findAll(**spec):
-                body.insert(len(body.contents), tag)
-        soup.find('body').replaceWith(body)
-
-        # Add a space to the dateline
-        t = soup.find(**classes('dateline'))
-        if t is not None:
-            t.insert(0, ' ')
-
-        # Remove empty li tags
-        for li in soup.findAll('li', attrs={'class': lambda x: x and x.startswith('css-')}):
-            if not li.contents and not li.string:
-                li.extract()
-
-        # Ensure the headline is first
-        h1 = soup.find('h1', itemprop='headline')
-        if h1 is not None:
-            h1.extract()
-            soup.find('body').contents.insert(0, h1)
-
-        # Find lazy loaded images
-        for div in soup.findAll(itemtype='http://schema.org/ImageObject', itemid=True):
-            if div.find('img') is None:
-                span = div.find('span')
-                if span is not None and self.tag_to_string(span).strip().lower() == 'image':
-                    span.name = 'img'
-                    span['src'] = div['itemid']
-
-        # Remove the live storyline menu
-        for span in soup.findAll(attrs={'data-storyline-module-name': 'menu'}):
-            span.parent.extract()
-
-        return soup
-
+    def preprocess_raw_html(self, raw_html, url):
+        if '/live/' in url:
+            self.abort_article('Cant be bothered decoding the JSON for NYT live articles')
+        if not hasattr(self, 'nyt_parser'):
+            from calibre.live import load_module
+            m = load_module('calibre.web.site_parsers.nytimes')
+            self.nyt_parser = m
+        html = self.nyt_parser.extract_html(self.index_to_soup(raw_html))
+        return html

     def read_todays_paper(self):
         INDEX = 'https://www.nytimes.com/section/todayspaper'
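The replacement hook above drops the hand-maintained remove_tags/preprocess_html cleanup and delegates extraction to a parser module fetched through calibre's live-module mechanism, so the NYT parsing logic can be updated without shipping a new calibre release. A minimal sketch of exercising that parser outside a recipe run, assuming calibre's bundled BeautifulSoup and a locally saved page ('article.html' is a hypothetical file name):

from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.live import load_module

# load_module fetches (and caches) the live-updatable parser module
nyt_parser = load_module('calibre.web.site_parsers.nytimes')
with open('article.html') as f:
    soup = BeautifulSoup(f.read())
# extract_html() returns the cleaned article markup
print(nyt_parser.extract_html(soup))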
@@ -323,30 +262,3 @@ class NewYorkTimes(BasicNewsRecipe):
         if is_web_edition:
             return self.parse_web_sections()
         return self.parse_todays_page()
-
-    # The NYT occasionally returns bogus articles for some reason; just in case
-    # it is because of cookies, don't store cookies
-    def get_browser(self, *args, **kwargs):
-        return self
-
-    def clone_browser(self, *args, **kwargs):
-        return self.get_browser()
-
-    def open_novisit(self, *args, **kwargs):
-        from calibre import browser, random_user_agent
-        if not hasattr(self, 'rua_stored'):
-            self.rua_stored = random_user_agent(allow_ie=False)
-        br = browser(user_agent=self.rua_stored)
-        response = br.open_novisit(*args, **kwargs)
-        # headers = response.info()
-        # if headers.get('X-PageType') == 'vi-story':
-        #     import tempfile
-        #     with tempfile.NamedTemporaryFile(suffix='.html', dir='/t/n', delete=False) as f:
-        #         f.write(response.read())
-        #     import time
-        #     time.sleep(1)
-        #     br = browser()
-        #     response = br.open_novisit(*args, **kwargs)
-        return response
-
-    open = open_novisit
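For reference, the block removed above implemented a no-stored-cookies fetch strategy: the recipe answered get_browser() with itself and opened every URL through a brand-new browser, so cookies never persisted between requests, while one random user agent was cached so the UA stayed stable for the whole download. A minimal sketch of that technique, using only the calibre helpers visible in the diff (the mixin name is illustrative):

from calibre import browser, random_user_agent

class FreshBrowserMixin:
    def open_novisit(self, *args, **kwargs):
        # Choose one random (non-IE) user agent and keep it for the session
        if not hasattr(self, 'rua_stored'):
            self.rua_stored = random_user_agent(allow_ie=False)
        # A fresh browser per request means an empty cookie jar every time
        br = browser(user_agent=self.rua_stored)
        return br.open_novisit(*args, **kwargs)

    open = open_novisit

The same changes are applied to the second recipe in this commit, below; the only difference between the two files is the is_web_edition flag.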
@@ -3,20 +3,20 @@
 # License: GPLv3 Copyright: 2018, Kovid Goyal <kovid at kovidgoyal.net>

 from __future__ import absolute_import, division, print_function, unicode_literals

 import datetime
-import re
 import json
+import re
 from pprint import pprint  # noqa

 from calibre import strftime
+from calibre.ebooks.BeautifulSoup import Tag
 from calibre.utils.date import strptime
 from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import Tag

 is_web_edition = False
 oldest_web_edition_article = 7  # days

 # The sections to download when downloading the web edition, comment out
 # the section you are not interested in
 web_sections = [
@@ -92,76 +92,15 @@ class NewYorkTimes(BasicNewsRecipe):
     remove_attributes = ['style']
     conversion_options = {'flow_size': 0}

-    remove_tags = [
-        dict(attrs={'aria-label':'tools'.split()}),
-        dict(attrs={'aria-label': lambda x: x and 'New York Times Logo' in x}),
-        dict(href='#site-content #site-index'.split()),
-        dict(attrs={'aria-hidden':'true'}),
-        dict(attrs={'data-videoid':True}),
-        dict(name='button meta link time source'.split()),
-        dict(id=lambda x: x and x.startswith('story-ad-')),
-        dict(name='head'),
-        dict(role='toolbar'),
-        dict(name='a', href=lambda x: x and '#story-continues-' in x),
-        dict(name='a', href=lambda x: x and '#whats-next' in x),
-        dict(id=lambda x: x and 'sharetools-' in x),
-        dict(id='newsletter-promo supported-by-ad bottom-wrapper top-wrapper sponsor-wrapper'.split()),
-        classes('story-print-citation supported-by accessibility-ad-header visually-hidden bottom-of-article ad'),
-        dict(attrs={'class': lambda x: x and (
-            'SectionBar' in x or 'recirculation' in x or 'ResponsiveAd' in x or 'accessibility-visuallyHidden' in x or 'RelatedCoverage' in x)}),
-    ]
-
-    def preprocess_html(self, soup):
-        article = soup.find(id='story')
-        if article is None:
-            keep_only_tags = [dict(attrs={'aria-label': 'Main content'})]
-        else:
-            # The NYT is apparently A/B testing a new page layout
-            has_supplemental = article is not None and article.find(**classes('story-body-supplemental')) is not None
-            if has_supplemental:
-                keep_only_tags = [
-                    dict(id='story-header'),
-                    classes('story-body-supplemental story-interrupter'),
-                ]
-            else:
-                keep_only_tags = [
-                    dict(id='story'),
-                ]
-        body = new_tag(soup, 'body')
-        for spec in keep_only_tags:
-            for tag in soup.find('body').findAll(**spec):
-                body.insert(len(body.contents), tag)
-        soup.find('body').replaceWith(body)
-
-        # Add a space to the dateline
-        t = soup.find(**classes('dateline'))
-        if t is not None:
-            t.insert(0, ' ')
-
-        # Remove empty li tags
-        for li in soup.findAll('li', attrs={'class': lambda x: x and x.startswith('css-')}):
-            if not li.contents and not li.string:
-                li.extract()
-
-        # Ensure the headline is first
-        h1 = soup.find('h1', itemprop='headline')
-        if h1 is not None:
-            h1.extract()
-            soup.find('body').contents.insert(0, h1)
-
-        # Find lazy loaded images
-        for div in soup.findAll(itemtype='http://schema.org/ImageObject', itemid=True):
-            if div.find('img') is None:
-                span = div.find('span')
-                if span is not None and self.tag_to_string(span).strip().lower() == 'image':
-                    span.name = 'img'
-                    span['src'] = div['itemid']
-
-        # Remove the live storyline menu
-        for span in soup.findAll(attrs={'data-storyline-module-name': 'menu'}):
-            span.parent.extract()
-
-        return soup
-
+    def preprocess_raw_html(self, raw_html, url):
+        if '/live/' in url:
+            self.abort_article('Cant be bothered decoding the JSON for NYT live articles')
+        if not hasattr(self, 'nyt_parser'):
+            from calibre.live import load_module
+            m = load_module('calibre.web.site_parsers.nytimes')
+            self.nyt_parser = m
+        html = self.nyt_parser.extract_html(self.index_to_soup(raw_html))
+        return html

     def read_todays_paper(self):
         INDEX = 'https://www.nytimes.com/section/todayspaper'
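Among the cleanup steps the removed preprocess_html performed was a lazy-image fix: schema.org ImageObject containers whose placeholder <span>Image</span> never got hydrated into a real image were retagged into an <img> pointing at the container's itemid URL. A self-contained sketch of that fix, assuming calibre's bundled BeautifulSoup (the HTML snippet is illustrative, and get_text() stands in for the recipe's tag_to_string()):

from calibre.ebooks.BeautifulSoup import BeautifulSoup

html = ('<div itemtype="http://schema.org/ImageObject" '
        'itemid="https://example.com/pic.jpg"><span>Image</span></div>')
soup = BeautifulSoup(html)
for div in soup.findAll(itemtype='http://schema.org/ImageObject', itemid=True):
    if div.find('img') is None:
        span = div.find('span')
        if span is not None and span.get_text().strip().lower() == 'image':
            span.name = 'img'            # retag the placeholder span
            span['src'] = div['itemid']  # point it at the real image URL
print(soup)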
@@ -323,30 +262,3 @@ class NewYorkTimes(BasicNewsRecipe):
         if is_web_edition:
             return self.parse_web_sections()
         return self.parse_todays_page()
-
-    # The NYT occasionally returns bogus articles for some reason; just in case
-    # it is because of cookies, don't store cookies
-    def get_browser(self, *args, **kwargs):
-        return self
-
-    def clone_browser(self, *args, **kwargs):
-        return self.get_browser()
-
-    def open_novisit(self, *args, **kwargs):
-        from calibre import browser, random_user_agent
-        if not hasattr(self, 'rua_stored'):
-            self.rua_stored = random_user_agent(allow_ie=False)
-        br = browser(user_agent=self.rua_stored)
-        response = br.open_novisit(*args, **kwargs)
-        # headers = response.info()
-        # if headers.get('X-PageType') == 'vi-story':
-        #     import tempfile
-        #     with tempfile.NamedTemporaryFile(suffix='.html', dir='/t/n', delete=False) as f:
-        #         f.write(response.read())
-        #     import time
-        #     time.sleep(1)
-        #     br = browser()
-        #     response = br.open_novisit(*args, **kwargs)
-        return response
-
-    open = open_novisit
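With the custom browser plumbing removed, both recipes fall back to BasicNewsRecipe's standard browser handling, cookies included. Should cookie-related problems resurface, the conventional extension point is a get_browser() override; a hedged sketch (the extra Referer header is illustrative, not something these recipes set):

from calibre.web.feeds.news import BasicNewsRecipe

class Example(BasicNewsRecipe):
    title = 'Example'

    def get_browser(self, *args, **kwargs):
        # Start from the stock configured browser, then customize it
        br = BasicNewsRecipe.get_browser(self, *args, **kwargs)
        br.addheaders += [('Referer', 'https://www.nytimes.com/')]
        return br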