mirror of
				https://github.com/kovidgoyal/calibre.git
				synced 2025-10-24 15:28:53 -04:00 
			
		
		
		
	
		
			
				
	
	
		
			203 lines
		
	
	
		
			7.2 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			203 lines
		
	
	
		
			7.2 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| #!/usr/bin/env python
 | |
| # vim:fileencoding=utf-8
 | |
| import json
 | |
| import re
 | |
| from collections import OrderedDict
 | |
| from urllib.parse import urlparse
 | |
| 
 | |
| from html5_parser import parse
 | |
| from mechanize import Request
 | |
| 
 | |
| from calibre import browser, random_user_agent
 | |
| from calibre.web.feeds.news import BasicNewsRecipe, classes
 | |
| 
 | |
| 
 | |
def absurl(url):
    """Return ``url`` as an absolute URL.

    * Protocol-relative URLs (``//host/path``) are given an ``https:``
      scheme instead of being treated as a path on hbr.org.
    * Root-relative paths (``/path``) are prefixed with
      ``https://www.hbr.org``.
    * Anything else is returned unchanged.
    """
    # Must be tested before the single-slash case, which it also matches.
    if url.startswith('//'):
        return 'https:' + url
    if url.startswith('/'):
        return 'https://www.hbr.org' + url
    return url
 | |
| 
 | |
| 
 | |
class HBR(BasicNewsRecipe):
    """calibre recipe that downloads one issue of Harvard Business Review.

    The issue's table of contents is scraped from hbr.org in
    ``parse_index``. For each article, ``preprocess_raw_html`` reads the
    metadata embedded in the page's ``__NEXT_DATA__`` JSON and fetches the
    article body from the platform.hbr.org content endpoint.
    """

    title = 'Harvard Business Review'
    __author__ = 'unkn0wn'
    description = (
        'Harvard Business Review is the leading destination for smart management thinking. '
        'Through its flagship magazine, books, and digital content and tools published on HBR.org, '
        'Harvard Business Review aims to provide professionals around the world with rigorous insights '
        'and best practices to help lead themselves and their organizations more effectively and to '
        'make a positive impact.'
    )
    language = 'en'
    masthead_url = 'https://hbr.org/resources/css/images/hbr_logo.svg'
    publication_type = 'magazine'
    encoding = 'utf-8'
    remove_javascript = True
    no_stylesheets = True
    compress_news_images = True
    ignore_duplicate_articles = {'url'}
    base_url = 'https://hbr.org'
    remove_attributes = ['height', 'width', 'style']
    resolve_internal_links = True

    extra_css = '''
        .article-summary, .article-ideainbrief,
        .description-text, .link--black, .topic, .auth {font-size:small; color:#202020;}
        .credits--hero-image, .credits--inline-image, .caption--inline-image,
        .calibre-nuked-tag-figcaption {font-size:small; text-align:center;}
        .sub { font-style: italic; }
        .article-byline-list {font-size:small; font-weight:bold;}
        .question {font-weight:bold;}
        .right-rail--container {font-size:small; color:#404040;}
        .article-callout, .slug-content {color:#404040;}
        .article-sidebar {color:#202020;}
    '''

    def preprocess_raw_html(self, raw, url):
        """Build a minimal HTML document for a single article.

        The topic, title, dek, hero image and author list come from the
        ``__NEXT_DATA__`` JSON embedded in ``raw``. The body is obtained by
        POSTing the article's ``contentKey`` to the platform.hbr.org
        endpoint whose path mirrors the path of ``url``.

        :param raw: HTML of the article page as downloaded.
        :param url: URL the page was downloaded from.
        :return: an HTML string wrapping the pieces in ``<article>``.
        """
        root = parse(raw)
        script = root.xpath('//script[@id="__NEXT_DATA__"]')
        data = json.loads(script[0].text)
        data = data['props']['pageProps']['article']
        endpoint_url = (
            'https://platform.hbr.org/hbr/bff/content/article' + urlparse(url).path
        )

        topic = ''
        if data.get('primaryTopic'):
            topic = f'<div class="topic">{data["primaryTopic"]}</div>'
        title = f'<h1>{data["title"]}</h1>'
        # ``or ""`` so a dek that is present but null does not render "None".
        dek = f'<p class="sub">{data.get("dek") or ""}</p>'

        hero = ''
        # Tolerate a hero entry that lacks an image or a default source.
        hero_src = ((data.get('hero') or {}).get('image') or {}).get('defaultSrc')
        if hero_src:
            # Quote the attribute: URLs may contain characters (``=``,
            # spaces) that are not allowed in unquoted attribute values.
            hero = f'<img src="{absurl(hero_src)}">'

        auth = ''
        authors = data.get('authors')
        if authors:
            auth = f'<p class="auth">By {", ".join(x["name"] for x in authors)}</p>'

        key_ = {
            'contentKey': data['contentKey'],
        }
        headers = {
            'User-Agent': random_user_agent(),
            'Pragma': 'no-cache',
            'Cache-Control': 'no-cache',
            'Content-Type': 'application/json',
        }
        # A fresh browser is used for the request (see the note on cookies
        # above get_browser below).
        br = browser()
        req = Request(
            endpoint_url,
            headers=headers,
            data=json.dumps(key_),
            method='POST',
            timeout=self.timeout,
        )
        res = br.open(req)
        body = json.loads(res.read())['content']

        return (
            '<html><body><article>'
            + topic
            + title
            + dek
            + hero
            + auth
            + body
            + '</article></body></html>'
        )

    recipe_specific_options = {
        'issue': {
            'short': 'Enter the Issue Number you want to download ',
            'long': 'For example, 2403',
        }
    }

    def parse_index(self):
        """Return the issue's articles grouped by section.

        When a non-empty string is available under the ``issue`` option,
        that issue's archive page is used (a leading ``BR`` and surrounding
        whitespace in the value are ignored). Otherwise the current issue
        is located via the first ``/archive-toc/`` link on the magazine
        page.

        :return: list of ``(section_title, [article_dict, ...])`` tuples in
            the order sections are first seen; each article dict has
            ``title``, ``url`` and ``description`` keys.
        :raises ValueError: if no issue link is found on the magazine page.
        """
        d = self.recipe_specific_options.get('issue')
        # The class-level definition maps 'issue' to a dict of help texts,
        # so only a string is treated as a user-supplied issue number.
        # NOTE(review): presumably calibre replaces the entry with the
        # user's value at run time -- confirm against BasicNewsRecipe.
        issue = d.strip() if isinstance(d, str) else ''
        if issue.upper().startswith('BR'):
            # Accept "BR2403" as well as "2403"; the prefix is added below.
            issue = issue[2:]

        if not issue:
            soup = self.index_to_soup(f'{self.base_url}/magazine')
            a = soup.find('a', href=lambda x: x and x.startswith('/archive-toc/'))
            if a is None:
                raise ValueError(
                    'Could not find a link to the current issue on '
                    f'{self.base_url}/magazine'
                )
            cover = a.find('img', attrs={'src': True})
            if cover is not None:
                self.cover_url = absurl(cover['src'])
            issue_url = absurl(a['href'])
        else:
            issue_url = 'https://hbr.org/archive-toc/BR' + issue
            mobj = re.search(r'archive-toc/(?P<issue>(BR)?\d+)\b', issue_url)
            if mobj:
                self.cover_url = f'https://hbr.org/resources/images/covers/{mobj.group("issue")}_500.png'

        soup = self.index_to_soup(issue_url)
        issue_title = soup.find('h1')
        if issue_title:
            self.timefmt = f' [{self.tag_to_string(issue_title)}]'

        feeds = OrderedDict()

        for h3 in soup.findAll('h3', attrs={'class': 'hed'}):
            a = h3.find('a')
            if a is None or not a.get('href'):
                # A headline without a link cannot be downloaded.
                continue
            title = self.tag_to_string(a)
            url = absurl(a['href'])

            auth = ''
            div = h3.find_next_sibling('div', attrs={'class': 'stream-item-info'})
            if div:
                aut = self.tag_to_string(div).replace('Magazine Article ', '')
                # Insert ", " before every capital letter that directly
                # follows a word character.
                auth = re.sub(r'(?<=\w)([A-Z])', r', \1', aut)

            des = ''
            dek = h3.find_next_sibling('div', attrs={'class': 'dek'})
            if dek:
                des = self.tag_to_string(dek)
            desc = des + ' |' + auth.title()

            # Walk li -> preceding section label -> h4 one step at a time;
            # any step may be missing, in which case the default is kept.
            section_title = 'Articles'
            li = h3.findParent('li')
            label = None
            if li is not None:
                label = li.find_previous_sibling(
                    'div', **classes('stream-section-label')
                )
            sec = label.find('h4') if label is not None else None
            if sec:
                section_title = self.tag_to_string(sec).title()

            self.log(section_title, '\n\t', title, '\n\t', desc, '\n\t\t', url)
            feeds.setdefault(section_title, []).append(
                {'title': title, 'url': url, 'description': desc}
            )
        return list(feeds.items())

    def preprocess_html(self, soup):
        """Adjust the parsed article markup and return the same soup.

        Drops ``href`` from slug elements, flattens byline lists, turns
        summary/callout/sidebar containers into blockquotes and replaces
        each ``srcset`` with a single ``src``.
        """
        for slug in soup.findAll(**classes('slug-content')):
            del slug['href']
        for dek in soup.findAll(**classes('article-byline')):
            for by in dek.findAll('span', attrs={'class': 'by-prefix'}):
                by.extract()
            for li in dek.findAll('li'):
                li.name = 'span'
        for div in soup.findAll(
            'div', attrs={'class': ['article-summary', 'article-callout']}
        ):
            div.name = 'blockquote'
        # Note: this matches elements by tag name, not by class.
        for sidebar in soup.findAll(('article-sidebar', 'article-ideainbrief')):
            sidebar.name = 'blockquote'
        for img in soup.findAll('img', attrs={'srcset': True}):
            # Use the candidate mentioning "700w" as the image source; if
            # several match, the last one wins.
            for candidate in img['srcset'].split(','):
                if '700w' in candidate:
                    img['src'] = absurl(candidate.split()[0])
            del img['srcset']
        return soup

    # HBR changes the content it delivers based on cookies, so the
    # following ensures that we send no cookies
    def get_browser(self, *args, **kwargs):
        """Return the recipe itself, which acts as the browser object."""
        return self

    def clone_browser(self, *args, **kwargs):
        """Return the same object as ``get_browser``."""
        return self.get_browser()

    def open_novisit(self, *args, **kwargs):
        """Open the request with a newly created browser on every call."""
        br = browser()
        return br.open_novisit(*args, **kwargs)

    open = open_novisit
 |