import json
import re
import time
from datetime import datetime, timedelta

from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.feeds.news import BasicNewsRecipe, classes
from html5_parser import parse
from lxml.etree import tostring

# The past 6 editions are available for download.
# For the available past editions, see the log and set the date to, for example, '20240513'.
past_edition = None
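# Example: to download a back issue listed in the log, use
#   past_edition = '20240513'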


class WSJ(BasicNewsRecipe):
    title = 'The Wall Street Journal'
    __author__ = 'unkn0wn'
    description = (
        'The Wall Street Journal is your source for breaking news, analysis and insights from the U.S. and '
        'around the world, from the world\'s leading business and finance publication.'
    )
    language = 'en_US'
    masthead_url = 'https://s.wsj.net/media/wsj_amp_masthead_lg.png'
    encoding = 'utf-8'
    no_javascript = True
    no_stylesheets = True
    remove_attributes = ['style', 'height', 'width']
    resolve_internal_links = True

    extra_css = '''
        #subhed, em { font-style:italic; color:#202020; }
        #byline, #time-to-read, #orig-pubdate-string, .article-byline, time, #flashline { font-size:small; }
        .figc { font-size:small; text-align:center; }
        img { display:block; margin:0 auto; }
    '''
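
    # Strip site chrome, ads, stock tickers and related-article insets so that
    # only the article body survives conversion.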
    remove_tags = [
        dict(name=['nav', 'svg', 'iframe', 'source']),
        dict(name='panel', attrs={'id':'metadata'}),
        dict(name='panel', attrs={'layout':'inline'}),
        dict(name='panel', attrs={'embed':'inner-article-ad'}),
        dict(name='span', attrs={'embed':'ticker'}),
        classes('lamrelated-articles-inset-panel'),
        dict(name='p', attrs={'id':[
            'keywords', 'orig-pubdate-number', 'type', 'is-custom-flashline', 'grouphed', 'author-ids',
            'body-extract', 'category', 'sub-category', 'socialhed', 'summary', 'deckline', 'article-flashline'
        ]}),
        dict(attrs={'data-inset_type':'dynamic'}),
        dict(attrs={'data-block':'dynamic-inset'}),
    ]

    remove_tags_before = [
        dict(name='p', attrs={'id':'orig-pubdate-string'})
    ]
    remove_tags_after = [
        dict(name='article')
    ]
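
    # The app feed delivers JPML rather than ordinary HTML. Normalise it here:
    # promote the headline to <h1>, merge the byline/date/reading-time block,
    # fix image sources and drop the 'What to Read Next' column.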
    def preprocess_html(self, soup):
        jpml = soup.find('jpml')
        if jpml:
            jpml.name = 'article'
        h1 = soup.find('p', attrs={'id':'headline'})
        if h1:
            h1.name = 'h1'
        for h2 in soup.findAll('h2'):
            h2.name = 'h4'
        dt = soup.find('p', attrs={'id':'orig-pubdate-string'})
        read = soup.find('p', attrs={'id':'time-to-read'})
        byl = soup.find('p', attrs={'id':'byline'})
        if dt and byl and read:
            dt.name = read.name = byl.name = 'div'
            byl.insert(0, dt)
            byl.insert(0, read)
        url = soup.find('p', attrs={'id':'share-link'})
        if url:
            # Stash the share URL in a title attribute; populate_article_metadata
            # reads it back to restore the article's canonical URL.
            url['title'] = self.tag_to_string(url).strip()
            url.string = ''
        for img in soup.findAll('img', attrs={'location':True}):
            img['src'] = img['location']
        for figc in soup.findAll('figcaption'):
            figc['class'] = 'figc'
        col = soup.find('div', text=re.compile('What to Read Next'))
        if col:
            div = col.findParent('div')
            if div:
                div.extract()
        ttag = soup.find('time')  # named ttag so it does not shadow the time module
        if ttag:
            p = ttag.findParent('div')
            if p:
                p.name = 'p'
        for img in soup.findAll('img', src=True):
            # drop the trailing '/OR' suffix from image URLs
            if img['src'].endswith('/OR'):
                img['src'] = img['src'][:-3]
        return soup
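
    # For the current edition, fetch today's front page from frontpages.com to
    # use as the cover; this is skipped for back issues, presumably because
    # only the current day's page is hosted there.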
    if not past_edition:
        def _download_cover(self):
            import os
            from contextlib import closing

            from calibre import browser
            from calibre.utils.img import save_cover_data_to
            br = browser()
            raw = br.open('https://www.frontpages.com/the-wall-street-journal/')
            soup = BeautifulSoup(raw.read())
            cu = 'https://www.frontpages.com' + soup.find('img', attrs={'id':'giornale-img'})['src']
            self.report_progress(1, _('Downloading cover from %s') % cu)
            with closing(br.open(cu, timeout=self.timeout)) as r:
                cdata = r.read()
            cpath = os.path.join(self.output_dir, 'cover.jpg')
            save_cover_data_to(cdata, cpath)
            self.cover_path = cpath
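
    # The catalog endpoints serve the official mobile app, so present the
    # okhttp user agent and send the app's API key (spelled out character by
    # character, apparently to evade automated secret scanners).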
    def get_browser(self, *args, **kw):
        kw['user_agent'] = 'okhttp/4.10.0'
        br = BasicNewsRecipe.get_browser(self, *args, **kw)
        br.addheaders += [
            ('Accept-Encoding', 'gzip'),
            ('cache-control', 'no-cache'),
            ('x-api-key', ('e''b''2''4''0''8''c''d''2''7''f''8''9''1''3''d''4''2''1''f''a''3''d''5''c''3''d''0''7''c''c''f''0''3''4''c''b''4''4''8')),
        ]
        return br
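
    # Build the index from the Dow Jones 'bartender' catalog: pick the
    # requested (or the latest) ITP edition, then walk its manifest to collect
    # one feed per section page.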
    def parse_index(self):
        index = 'https://bartender.mobile.dowjones.io'
        catalog = json.loads(self.index_to_soup(index + '/catalogs/v1/wsj/us/catalog.json', raw=True))
        # the first ITP entry is the current edition, so skip it in the listing
        edit = [itm['key'][3:] for itm in catalog['items'] if itm['type'] == 'ITP'][1:]
        self.log('**Past Editions available:', ', '.join(edit))
        key = manifest = date = None
        for itm in catalog['items']:
            if past_edition:
                if itm['key'] == 'ITP' + past_edition:
                    key = itm['key']
                    manifest = itm['manifest']
                    date = itm['date']
                    break
            elif itm['type'] == 'ITP':
                key = itm['key']
                manifest = itm['manifest']
                date = itm['date']
                break
        if not key:
            # fail with a readable message rather than a NameError below
            self.abort_recipe_processing('Edition %r not found in the catalog' % past_edition)

        dt = datetime.fromisoformat(date[:-1]) + timedelta(seconds=time.timezone)
        dt = dt.strftime('%b %d, %Y')
        self.log('Downloading ', dt)
        self.timefmt = ' [' + dt + ']'

        feeds = []

        manif = json.loads(self.index_to_soup(index + manifest, raw=True))
        for itm in manif['items']:
            for k, v in itm.items():
                if '-pages_' in k:
                    section = k.split('-pages_')[0].replace('_', ' ')
                    self.log(section)

                    articles = []

                    sec_parse = json.loads(self.index_to_soup(index + v, raw=True))
                    data = sec_parse['articles']
                    for art in data:
                        title = data[art]['headline']
                        desc = data[art]['summary']
                        if 'articleWebViewLink' in data[art]:
                            url = data[art]['articleWebViewLink']
                        else:
                            url = index + '/contents/v1/wsj/us/' + key + '/' + data[art]['filename']
                        self.log('          ', title, '\n\t', desc)
                        articles.append({'title': title, 'description': desc, 'url': url})
                    feeds.append((section, articles))
        return feeds
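
    # Articles arrive in two flavours: JPML from the app feed (whose <image>
    # tags must become <img>) and ordinary web-view HTML (where the canonical
    # URL and subheading are marked up for later use).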
    def preprocess_raw_html(self, raw, url):
        if '/webview/' not in url:
            root = parse(raw)
            for x in root.xpath('//image'):
                x.tag = 'img'
            # serialise the modified tree so the <image> rewrite takes effect
            return BeautifulSoup(tostring(root, encoding='unicode')).prettify()
        else:
            soup = BeautifulSoup(raw)
            meta = soup.find('meta', attrs={'property':'og:url'})
            if meta:
                h1 = soup.find('h1')
                if h1:
                    h1['title'] = meta['content']
            h2 = soup.find('h2')
            if h2:
                h2['id'] = 'subhed'
                h2.name = 'p'
            return soup.prettify()
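
    # Point each article at its canonical wsj.com URL, stashed earlier in the
    # share-link paragraph (app feed) or the <h1> title attribute (web view),
    # instead of the API download URL.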
    def populate_article_metadata(self, article, soup, first):
        lnk = soup.find('p', attrs={'id':'share-link'})
        if lnk:
            article.url = lnk['title']
        art = soup.find('h1', attrs={'title':True})
        if art:
            article.url = art['title']
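

# To test changes to this recipe locally, the standard calibre workflow is
# (assuming the file is saved as wsj.recipe; the output name is arbitrary):
#   ebook-convert wsj.recipe wsj.epub --test -vv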