Mirror of https://github.com/kovidgoyal/calibre.git, synced 2025-07-08 10:44:09 -04:00
Commit f08750b33c: Merge branch 'master' of https://github.com/unkn0w7n/calibre
Binary file not shown. Before: 759 B
Binary file not shown. Before: 301 B, After: 416 B
@ -1,30 +0,0 @@
-from calibre.web.feeds.news import BasicNewsRecipe
-
-
-class NYTimesGlobal(BasicNewsRecipe):
-    title = u'NY Times Global'
-    language = 'en'
-    __author__ = 'Krittika Goyal'
-    oldest_article = 1  # days
-    max_articles_per_feed = 25
-    use_embedded_content = False
-
-    no_stylesheets = True
-    auto_cleanup = True
-
-    feeds = [
-        ('NYTimes',
-         'http://www.nytimes.com/services/xml/rss/nyt/HomePage.xml'),
-        ('NYTimes global',
-         'http://www.nytimes.com/services/xml/rss/nyt/GlobalHome.xml'),
-        ('World',
-         'http://www.nytimes.com/services/xml/rss/nyt/World.xml'),
-        ('U.S.',
-         'http://www.nytimes.com/services/xml/rss/nyt/US.xml'),
-        ('Business',
-         'http://feeds.nytimes.com/nyt/rss/Business'),
-        ('Sports',
-         'http://www.nytimes.com/services/xml/rss/nyt/Sports.xml'),
-        ('Technology',
-         'http://feeds.nytimes.com/nyt/rss/Technology'),
-    ]
@ -1,9 +1,8 @@
 #!/usr/bin/env python
 import json
 import re
-import time
-from datetime import datetime, timedelta
 
+from calibre.utils.iso8601 import parse_iso8601
 from calibre.web.feeds.news import BasicNewsRecipe
 
 
@ -66,7 +65,7 @@ def parse_byline(byl):
     yield '</i></b></div>'
 
 def iso_date(x):
-    dt = datetime.fromisoformat(x[:-1]) + timedelta(seconds=time.timezone)
+    dt = parse_iso8601(x, as_utc=False)
     return dt.strftime('%b %d, %Y at %I:%M %p')
 
 def parse_header(h):
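A small sketch of what the simplified iso_date now does (the timestamp below is made up): calibre's parse_iso8601 from calibre.utils.iso8601 parses the ISO 8601 string and, with as_utc=False, returns a datetime in local time, replacing the manual time.timezone arithmetic of the old line.

    from calibre.utils.iso8601 import parse_iso8601

    def iso_date(x):
        dt = parse_iso8601(x, as_utc=False)
        return dt.strftime('%b %d, %Y at %I:%M %p')

    print(iso_date('2025-07-08T14:44:09Z'))  # e.g. 'Jul 08, 2025 at 10:44 AM' on a UTC-4 machine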
@ -138,7 +137,7 @@ def parse_types(x):
     elif x.get('__typename', '') == 'Image':
         yield ''.join(parse_image(x))
     elif x.get('__typename', '') == 'ImageBlock':
-        yield ''.join(parse_image(x['media']))
+        yield ''.join(parse_types(x['media']))
     elif x.get('__typename', '') == 'GridBlock':
         yield ''.join(parse_img_grid(x))
 
@ -265,6 +264,8 @@ class nytFeeds(BasicNewsRecipe):
         'https://rss.nytimes.com/services/xml/rss/nyt/tmagazine.xml',
         'https://rss.nytimes.com/services/xml/rss/nyt/books.xml',
         'https://www.nytimes.com/services/xml/rss/nyt/Travel.xml',
+        'https://rss.nytimes.com/services/xml/rss/nyt/well.xml',
+        'https://rss.nytimes.com/services/xml/rss/nyt/Sports.xml',
         'http://nytimes.com/timeswire/feeds/'
     ]
 
@ -301,5 +302,6 @@ class nytFeeds(BasicNewsRecipe):
     def get_article_url(self, article):
         url = BasicNewsRecipe.get_article_url(self, article)
         # you can remove '|/espanol/' from code below to include spanish articles.
-        if not re.search(r'/video/|/live/|/athletic/|/espanol/', url):
+        if not re.search(r'/video/|/live/|/athletic/|/espanol/|/card/', url):
             return url
+        self.log('\tSkipped URL: ', url)
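For illustration, a minimal sketch (the URLs are made up) of how the widened filter in nytFeeds.get_article_url behaves: any URL matching one of the listed path fragments, now including /card/, is skipped and logged instead of returned.

    import re

    SKIP = r'/video/|/live/|/athletic/|/espanol/|/card/'
    urls = [
        'https://www.nytimes.com/2025/07/08/world/europe/example-story.html',  # hypothetical: kept
        'https://www.nytimes.com/card/2025/07/08/example-briefing',            # hypothetical: skipped
    ]
    for url in urls:
        print(url, '-> kept' if not re.search(SKIP, url) else '-> skipped')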
@ -14,9 +14,7 @@ from calibre.ebooks.BeautifulSoup import Tag
 from calibre.utils.date import strptime
 from calibre.web.feeds.news import BasicNewsRecipe
 
-is_web_edition = True
-oldest_web_edition_article = 7  # days
-use_wayback_machine = True
+use_wayback_machine = False
 
 
 # The sections to download when downloading the web edition, comment out
@ -77,22 +75,28 @@ def new_tag(soup, name, attrs=()):
 
 
 class NewYorkTimes(BasicNewsRecipe):
-    if is_web_edition:
-        title = 'The New York Times (Web)'
-        description = 'New York Times (Web). You can edit the recipe to remove sections you are not interested in.'
-    else:
-        title = 'The New York Times'
-        description = 'Today\'s New York Times'
+    title = 'The New York Times (Web)'
+    description = (
+        'New York Times (Web). You can edit the recipe to remove sections you are not interested in. '
+        'Use advanced menu to make changes to fetch Todays Paper'
+    )
     encoding = 'utf-8'
     __author__ = 'Kovid Goyal'
-    language = 'en'
+    language = 'en_US'
     ignore_duplicate_articles = {'title', 'url'}
     no_stylesheets = True
-    compress_news_images = True
-    compress_news_images_auto_size = 5
-    conversion_options = {'flow_size': 0}
-    delay = 0 if use_wayback_machine else 1
+    is_web_edition = True
+    oldest_web_edition_article = 7  # days
+
+    extra_css = '''
+        .byl, .time { font-size:small; color:#202020; }
+        .cap { font-size:small; text-align:center; }
+        .cred { font-style:italic; font-size:small; }
+        em, blockquote { color: #202020; }
+        .sc { font-variant: small-caps; }
+        .lbl { font-size:small; color:#404040; }
+        img { display:block; margin:0 auto; }
+    '''
 
     @property
     def nyt_parser(self):
@ -106,9 +110,13 @@ class NewYorkTimes(BasicNewsRecipe):
         if use_wayback_machine and not skip_wayback:
             from calibre import browser
             return self.nyt_parser.download_url(url, browser())
-        return self.browser.open_novisit(url).read()
+        return self.index_to_soup(url, raw=True)
 
     def preprocess_raw_html(self, raw_html, url):
+        if '/interactive/' in url:
+            return '<html><body><p><em>'\
+                + 'This is an interactive article, which is supposed to be read in a browser.'\
+                + '</p></em></body></html>'
         html = self.nyt_parser.extract_html(self.index_to_soup(raw_html))
         return html
 
@ -121,9 +129,51 @@ class NewYorkTimes(BasicNewsRecipe):
             tf.write(self.get_nyt_page(url))
             return tf.name
 
+    recipe_specific_options = {
+        'web': {
+            'short': 'Type in yes, if you want Todays Paper',
+            'default': 'Web Edition'
+        },
+        'days': {
+            'short': 'Oldest article to download from this news source. In days ',
+            'long': 'For example, 1, gives you articles from the past 24 hours\n(Works only for Web_Edition)',
+            'default': str(oldest_web_edition_article)
+        },
+        'date': {
+            'short': 'The date of the edition to download (YYYY/MM/DD format)\nUsed to fetch past editions of NYT newspaper',
+            'long': 'For example, 2024/07/16'
+        },
+        'res': {
+            'short': 'For hi-res images, select a resolution from the following\noptions: popup, jumbo, mobileMasterAt3x, superJumbo',
+            'long': 'This is useful for non e-ink devices, and for a lower file size\nthan the default, use articleInline.',
+        },
+        'comp': {
+            'short': 'Compress News Images?',
+            'long': 'enter yes',
+            'default': 'no'
+        }
+    }
+
+    def __init__(self, *args, **kwargs):
+        BasicNewsRecipe.__init__(self, *args, **kwargs)
+        c = self.recipe_specific_options.get('comp')
+        d = self.recipe_specific_options.get('days')
+        w = self.recipe_specific_options.get('web')
+        if w and isinstance(w, str):
+            if w == 'yes':
+                self.is_web_edition = False
+        if d and isinstance(d, str):
+            self.oldest_web_edition_article = float(d)
+        if c and isinstance(c, str):
+            if c.lower() == 'yes':
+                self.compress_news_images = True
+
     def read_todays_paper(self):
         INDEX = 'https://www.nytimes.com/section/todayspaper'
         # INDEX = 'file:///t/raw.html'
+        d = self.recipe_specific_options.get('date')
+        if d and isinstance(d, str):
+            INDEX = 'https://www.nytimes.com/issue/todayspaper/' + d + '/todays-new-york-times'
         return self.index_to_soup(self.get_nyt_page(INDEX, skip_wayback=True))
 
     def read_nyt_metadata(self):
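As a quick illustration of the new 'date' option, using the example value the option text itself gives (2024/07/16), read_todays_paper now rewrites the index URL for a past edition roughly like this:

    d = '2024/07/16'  # hypothetical user input for the 'date' option
    INDEX = 'https://www.nytimes.com/issue/todayspaper/' + d + '/todays-new-york-times'
    # INDEX -> 'https://www.nytimes.com/issue/todayspaper/2024/07/16/todays-new-york-times'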
@ -219,7 +269,7 @@ class NewYorkTimes(BasicNewsRecipe):
             date = format_date(d)
             today = datetime.date.today()
             delta = today - d
-            if delta.days > oldest_web_edition_article:
+            if delta.days > self.oldest_web_edition_article:
                 self.log.debug('\tSkipping article', title, 'as it is too old')
                 continue
             yield {'title': title, 'url': url, 'description': desc, 'date': date}
@ -242,7 +292,7 @@ class NewYorkTimes(BasicNewsRecipe):
             date = format_date(d)
             today = datetime.date.today()
             delta = today - d
-            if delta.days > oldest_web_edition_article:
+            if delta.days > self.oldest_web_edition_article:
                 self.log.debug('\tSkipping article', title, 'as it is too old')
                 continue
             yield {'title': title, 'url': url, 'description': desc, 'date': date}
@ -290,6 +340,34 @@ class NewYorkTimes(BasicNewsRecipe):
         # return [('All articles', [
         # {'title': 'XXXXX', 'url': 'https://www.nytimes.com/2020/11/27/world/americas/coronavirus-migrants-venezuela.html'},
         # ])]
-        if is_web_edition:
+        if self.is_web_edition:
             return self.parse_web_sections()
         return self.parse_todays_page()
+
+    def get_browser(self, *args, **kwargs):
+        kwargs['user_agent'] = 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
+        br = BasicNewsRecipe.get_browser(self, *args, **kwargs)
+        br.addheaders += [
+            ('Referer', 'https://www.google.com/'),
+            ('X-Forwarded-For', '66.249.66.1')
+        ]
+        return br
+
+    def preprocess_html(self, soup):
+        w = self.recipe_specific_options.get('res')
+        if w and isinstance(w, str):
+            res = '-' + w
+            for img in soup.findAll('img', attrs={'src':True}):
+                if '-article' in img['src']:
+                    ext = img['src'].split('?')[0].split('.')[-1]
+                    img['src'] = img['src'].rsplit('-article', 1)[0] + res + '.' + ext
+        for c in soup.findAll('div', attrs={'class':'cap'}):
+            for p in c.findAll(['p', 'div']):
+                p.name = 'span'
+        return soup
+
+    def get_article_url(self, article):
+        url = BasicNewsRecipe.get_article_url(self, article)
+        if not re.search(r'/video/|/athletic/|/card/', url):
+            return url
+        self.log('\tSkipping ', url)
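To make the new image-resolution option concrete, here is a small sketch (the static01.nyt.com URL is made up) of the rewrite preprocess_html applies when the 'res' option is set to, say, superJumbo:

    src = 'https://static01.nyt.com/images/2024/07/16/example-articleLarge.jpg?quality=75&auto=webp'  # hypothetical
    res = '-superJumbo'
    ext = src.split('?')[0].split('.')[-1]                # 'jpg'
    src = src.rsplit('-article', 1)[0] + res + '.' + ext
    # src -> 'https://static01.nyt.com/images/2024/07/16/example-superJumbo.jpg'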
@ -14,8 +14,6 @@ from calibre.ebooks.BeautifulSoup import Tag
 from calibre.utils.date import strptime
 from calibre.web.feeds.news import BasicNewsRecipe
 
-is_web_edition = False
-oldest_web_edition_article = 7  # days
 use_wayback_machine = False
 
 
@ -77,18 +75,18 @@ def new_tag(soup, name, attrs=()):
 
 
 class NewYorkTimes(BasicNewsRecipe):
-    if is_web_edition:
-        title = 'The New York Times (Web)'
-        description = 'New York Times (Web). You can edit the recipe to remove sections you are not interested in.'
-    else:
-        title = 'The New York Times'
-        description = 'Today\'s New York Times'
+    title = 'The New York Times'
+    description = (
+        'New York Times. Todays Paper '
+        'Use advanced menu to make changes to fetch Web Edition'
+    )
     encoding = 'utf-8'
     __author__ = 'Kovid Goyal'
     language = 'en_US'
     ignore_duplicate_articles = {'title', 'url'}
     no_stylesheets = True
+    is_web_edition = False
+    oldest_web_edition_article = 7  # days
+
     extra_css = '''
         .byl, .time { font-size:small; color:#202020; }
@ -132,8 +130,17 @@ class NewYorkTimes(BasicNewsRecipe):
             return tf.name
 
     recipe_specific_options = {
+        'web': {
+            'short': 'Type in yes, if you want Web Edition',
+            'default': 'Todays Paper'
+        },
+        'days': {
+            'short': 'Oldest article to download from this news source. In days ',
+            'long': 'For example, 1, gives you articles from the past 24 hours\n(Works only for Web_Edition)',
+            'default': str(oldest_web_edition_article)
+        },
         'date': {
-            'short': 'The date of the edition to download (YYYY/MM/DD format)',
+            'short': 'The date of the edition to download (YYYY/MM/DD format)\nUsed to fetch past editions of NYT newspaper',
             'long': 'For example, 2024/07/16'
         },
         'res': {
@ -150,6 +157,13 @@ class NewYorkTimes(BasicNewsRecipe):
     def __init__(self, *args, **kwargs):
         BasicNewsRecipe.__init__(self, *args, **kwargs)
         c = self.recipe_specific_options.get('comp')
+        d = self.recipe_specific_options.get('days')
+        w = self.recipe_specific_options.get('web')
+        if w and isinstance(w, str):
+            if w == 'yes':
+                self.is_web_edition = True
+        if d and isinstance(d, str):
+            self.oldest_web_edition_article = float(d)
         if c and isinstance(c, str):
             if c.lower() == 'yes':
                 self.compress_news_images = True
@ -255,7 +269,7 @@ class NewYorkTimes(BasicNewsRecipe):
             date = format_date(d)
             today = datetime.date.today()
             delta = today - d
-            if delta.days > oldest_web_edition_article:
+            if delta.days > self.oldest_web_edition_article:
                 self.log.debug('\tSkipping article', title, 'as it is too old')
                 continue
             yield {'title': title, 'url': url, 'description': desc, 'date': date}
@ -278,7 +292,7 @@ class NewYorkTimes(BasicNewsRecipe):
             date = format_date(d)
             today = datetime.date.today()
             delta = today - d
-            if delta.days > oldest_web_edition_article:
+            if delta.days > self.oldest_web_edition_article:
                 self.log.debug('\tSkipping article', title, 'as it is too old')
                 continue
             yield {'title': title, 'url': url, 'description': desc, 'date': date}
@ -326,7 +340,7 @@ class NewYorkTimes(BasicNewsRecipe):
         # return [('All articles', [
         # {'title': 'XXXXX', 'url': 'https://www.nytimes.com/2020/11/27/world/americas/coronavirus-migrants-venezuela.html'},
         # ])]
-        if is_web_edition:
+        if self.is_web_edition:
             return self.parse_web_sections()
         return self.parse_todays_page()
 
@ -351,3 +365,8 @@ class NewYorkTimes(BasicNewsRecipe):
             for p in c.findAll(['p', 'div']):
                 p.name = 'span'
         return soup
+
+    def get_article_url(self, article):
+        url = BasicNewsRecipe.get_article_url(self, article)
+        if not re.search(r'/video/|/athletic/', url):
+            return url
@ -1876,6 +1876,8 @@ class BasicNewsRecipe(Recipe):
             if articles:
                 arelpath = sorted(articles, key=numeric_sort_key)[0]
                 a.set('href', item.relhref(arelpath))
+                if a.text and len(a) == 0:
+                    a.text = a.text + '`'
                 if url not in seen:
                     log.debug(f'Resolved internal URL: {url} -> {arelpath}')
                     seen.add(url)
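For clarity on this last hunk: the anchor appears to be an lxml element (a.set, a.text, len(a)), and len(a) == 0 means it has no child elements, so only plain-text anchors get the trailing '`' marker once their internal URL is resolved. A minimal standalone sketch (the anchor markup is made up):

    from lxml import etree

    a = etree.fromstring('<a href="article_1/index.html">Related coverage</a>')  # hypothetical resolved link
    if a.text and len(a) == 0:      # has text, no child elements
        a.text = a.text + '`'
    print(etree.tostring(a, encoding='unicode'))
    # <a href="article_1/index.html">Related coverage`</a>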