Update Irish Times

Fixes #1976297 [Irish Times 'Failed to fetch news'](https://bugs.launchpad.net/calibre/+bug/1976297)
This commit is contained in:
Kovid Goyal 2022-05-31 11:48:46 +05:30
parent 46cc7dc6c2
commit 89eb12d8ec
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C

View File

@ -3,18 +3,15 @@ __copyright__ = "2008, Derry FitzGerald. 2009 Modified by Ray Kinsella and David
''' '''
irishtimes.com irishtimes.com
''' '''
import re
import json import json
from uuid import uuid4 from uuid import uuid4
from mechanize import Request from mechanize import Request
try: try:
from urllib.parse import urlencode, urljoin from urllib.parse import urlencode
except ImportError: except ImportError:
from urllib import urlencode from urllib import urlencode
from urlparse import urljoin
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe, classes
from calibre.ptempfile import PersistentTemporaryFile
class IrishTimes(BasicNewsRecipe): class IrishTimes(BasicNewsRecipe):
@ -34,20 +31,47 @@ class IrishTimes(BasicNewsRecipe):
remove_empty_feeds = True remove_empty_feeds = True
no_stylesheets = True no_stylesheets = True
temp_files = [] temp_files = []
articles_are_obfuscated = True keep_only_tags = [
dict(name=['h1', 'h2']),
feeds = [ classes('lead-art-wrapper article-body-wrapper'),
('News', 'https://www.irishtimes.com/cmlink/the-irish-times-news-1.1319192'),
('World', 'https://www.irishtimes.com/cmlink/irishtimesworldfeed-1.1321046'),
('Politics', 'https://www.irishtimes.com/cmlink/irish-times-politics-rss-1.1315953'),
('Business', 'https://www.irishtimes.com/cmlink/the-irish-times-business-1.1319195'),
('Culture', 'https://www.irishtimes.com/cmlink/the-irish-times-culture-1.1319213'),
('Sport', 'https://www.irishtimes.com/cmlink/the-irish-times-sport-1.1319194'),
('Debate', 'https://www.irishtimes.com/cmlink/debate-1.1319211'),
('Life & Style', 'https://www.irishtimes.com/cmlink/the-irish-times-life-style-1.1319214'),
] ]
remove_tags = [
dict(name='button')
]
remove_attributes = ['width', 'height']
def parse_index(self):
    """Build the list of feeds by scraping the Irish Times home page.

    Walks every ``<h3>`` and ``<article>`` element of the home page in
    document order.  Each ``<h3>`` starts a new section (except those
    carrying a ``writer_description`` class, which are skipped); each
    ``<article>`` contributes at most one entry to the current section.

    Returns:
        A list of ``(section_title, articles)`` tuples, where ``articles``
        is a non-empty list of ``{'title': ..., 'url': ...}`` dicts.
    """
    soup = self.index_to_soup('https://www.irishtimes.com/')
    section = 'Home page'
    articles = []
    feeds = []
    for x in soup.findAll(name=['h3', 'article']):
        if x.name == 'h3':
            # Parenthesised on purpose: `in` binds tighter than `or`, so
            # without the parentheses a heading that has no class
            # attribute would raise TypeError instead of being accepted.
            if 'writer_description' in (x.get('class') or ''):
                continue
            # Flush the section collected so far, dropping empty ones.
            if articles:
                feeds.append((section, articles))
            section = self.tag_to_string(x)
            articles = []
            self.log('Section:', section)
            continue

        # Prefer the primary headline link; the filter must actually test
        # for the marker (a bare `x and 'primary-font'` is always truthy
        # for any classed link, which made the fallback below dead code).
        a = x.find('a', attrs={'class': lambda c: c and 'primary-font' in c}, href=True)
        if a is None:
            a = x.find('a', attrs={'class': lambda c: c and 'promo-headline' in c}, href=True)
        if a is None:
            continue

        # The class value may be a list of tokens or a plain string;
        # ''.join() yields a string that supports a substring test in
        # both cases.
        q = ''.join(a['class'])
        if 'secondary-font' in q and section == 'Home page':
            continue

        title = self.tag_to_string(a)
        url = a['href']
        if url.startswith('/'):
            # Site-relative link: make it absolute.
            url = 'https://www.irishtimes.com' + url
        articles.append({'title': title, 'url': url})
        self.log('\t', title)

    # Flush the final section.
    if articles:
        feeds.append((section, articles))
    return feeds
def get_browser(self): def get_browser(self):
return super().get_browser()
# To understand the signin logic read signin javascript from submit button from # To understand the signin logic read signin javascript from submit button from
# https://www.irishtimes.com/signin # https://www.irishtimes.com/signin
@ -89,26 +113,3 @@ class IrishTimes(BasicNewsRecipe):
# br.set_debug_http(False) # br.set_debug_http(False)
return br return br
def get_obfuscated_article(self, url):
    """Fetch the print version of an article and save it to a temp file.

    The print layout is used for the content; if the regular page has an
    image inside an element whose class matches ``image-carousel``, that
    image is moved to the start of the first paragraph of the print
    content.

    Args:
        url: URL of the regular (non-print) article page.

    Returns:
        The filesystem path of the temporary HTML file that was written.
        The file object is also appended to ``self.temp_files``.
    """
    # Insert a pic from the original url, but use content from the print url
    pic = None
    pics = self.index_to_soup(url)
    div = pics.find('div', {'class': re.compile('image-carousel')})
    if div is not None:
        pic = div.img
    if pic is not None:
        try:
            # Make the image source absolute before detaching it from
            # the original document.
            pic['src'] = urljoin(url, pic['src'])
            pic.extract()
        except Exception:
            # Best effort only: an unusable image must not abort the
            # download.  `Exception` (rather than a bare except) lets
            # KeyboardInterrupt / SystemExit propagate.
            pic = None

    content = self.index_to_soup(url + '?mode=print&ot=example.AjaxPageLayout.ot')
    first_para = content.p
    # The print page may contain no <p> at all; skip the image then
    # instead of failing with AttributeError on None.
    if pic is not None and first_para is not None:
        first_para.insert(0, pic)

    tmp = PersistentTemporaryFile('_fa.html')
    self.temp_files.append(tmp)
    try:
        tmp.write(content.prettify().encode('utf-8'))
    finally:
        # Always release the handle, even if serialising/writing fails.
        tmp.close()
    return tmp.name