mirror of
				https://github.com/kovidgoyal/calibre.git
				synced 2025-10-25 15:52:25 -04:00 
			
		
		
		
	
		
			
				
	
	
		
			182 lines
		
	
	
		
			5.6 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			182 lines
		
	
	
		
			5.6 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| #!/usr/bin/env python
 | |
| # vim:fileencoding=utf-8
 | |
| # License: GPLv3 Copyright: 2016, Kovid Goyal <kovid at kovidgoyal.net>
 | |
| 
 | |
| from __future__ import absolute_import, division, print_function, unicode_literals
 | |
| 
 | |
| import json
 | |
| import pprint
 | |
| from datetime import timedelta
 | |
| 
 | |
| from calibre.utils.date import utcnow
 | |
| from calibre.utils.iso8601 import parse_iso8601
 | |
| from calibre.web.feeds.recipes import BasicNewsRecipe
 | |
| 
 | |
# Maximum article age in days: parse_section() skips any article whose
# publish date is older than (now - oldest_article days).
oldest_article = 1  # days, includes articles that were published no more than the specified number of days ago
 | |
| 
 | |
| 
 | |
def classes(classes):
    """Build a tag-matching spec that selects tags having any of the given classes.

    ``classes`` is a string of class names separated by single spaces. The
    returned dict maps ``attrs['class']`` to a predicate: for a falsy class
    value the predicate returns that value unchanged, otherwise it returns the
    (possibly empty) frozenset of wanted names found in the value.
    """
    wanted = frozenset(classes.split(' '))

    def has_wanted_class(value):
        # Tags without a class attribute (None / empty) never match.
        if not value:
            return value
        return frozenset(value.split()).intersection(wanted)

    return {'attrs': {'class': has_wanted_class}}
 | |
| 
 | |
| 
 | |
def class_as_string(x):
    """Return a class attribute value as a single string.

    Lists and tuples are joined with single spaces; any other value is
    returned unchanged.
    """
    return ' '.join(x) if isinstance(x, (list, tuple)) else x
 | |
| 
 | |
| 
 | |
def class_startswith(*prefixes):
    """Build a tag-matching spec for tags whose class string starts with a prefix.

    The returned dict maps ``attrs['class']`` to a predicate that returns True
    when the class value (normalised to a string via class_as_string) starts
    with any of ``prefixes``, and False otherwise, including for a missing or
    empty class value.
    """

    def matches_prefix(value):
        if not value:
            return False
        text = class_as_string(value)
        return any(text.startswith(prefix) for prefix in prefixes)

    return {'attrs': {'class': matches_prefix}}
 | |
| 
 | |
| 
 | |
# Comics to download, as a mapping of display title -> URL slug. Each slug is
# substituted into https://www.bostonglobe.com/games-comics/comics/<slug>/ by
# BostonGlobeSubscription.parse_index(). Commented-out entries are not fetched.
# From: https://www3.bostonglobe.com/lifestyle/comics?arc404=true
comics_to_fetch = {
    'ADAM@HOME': 'ad',
    'ARLO & JANIS': 'aj',
    # "CUL DE SAC": 'cds',
    # "CURTIS": 'kfcrt',
    'DILBERT': 'dt',
    'DOONESBURY': 'db',
    'DUSTIN': 'kfdus',
    'F MINUS': 'fm',
    'FOR BETTER OR WORSE': 'fb',
    # "GET FUZZY": 'gz',
    # "MOTHER GOOSE & GRIMM": 'tmmgg',
    # "JUMPSTART": 'jt',
    'MONTY': 'mt',
    # "POOCH CAFE",  (no slug recorded for this one)
    'RHYMES WITH ORANGE': 'kfrwo',
    # "ROSE IS ROSE": 'rr',
    # "ZIPPY THE PINHEAD": 'kfzpy',
    'ZITS': 'kfzt'
}
 | |
| 
 | |
| 
 | |
def extract_json(raw_html):
    """Return the ``Fusion.contentCache`` object embedded in a page's HTML.

    The page contains a script assignment of the form
    ``Fusion.contentCache={...}``. The JSON object is decoded starting at its
    opening brace and decoding stops at the matching closing brace, so whatever
    follows it (``;Fusion...``, ``</script>``, or nothing at all) is ignored
    and a ``;Fusion`` sequence inside a string value cannot truncate the data.

    :param raw_html: the page HTML as text
    :return: the decoded JSON object
    :raises ValueError: if the marker is absent or the JSON is malformed
        (``json.JSONDecodeError`` is a subclass of ``ValueError``)
    """
    marker = 'Fusion.contentCache={'
    idx = raw_html.find(marker)
    if idx < 0:
        raise ValueError('Fusion.contentCache not found in HTML')
    # The marker's final character is the opening brace of the object.
    start = idx + len(marker) - 1
    # strict=False permits raw control characters inside JSON strings.
    data, _end = json.JSONDecoder(strict=False).raw_decode(raw_html, start)
    return data
 | |
| 
 | |
| 
 | |
def absolutize_url(url):
    """Turn a protocol-relative or site-relative URL into an absolute one.

    ``//host/path`` gets an ``https:`` scheme, ``/path`` is resolved against
    ``https://www.bostonglobe.com``, and anything else is returned unchanged.
    """
    if not url.startswith('/'):
        return url
    prefix = 'https:' if url.startswith('//') else 'https://www.bostonglobe.com'
    return prefix + url
 | |
| 
 | |
| 
 | |
def parse_section(raw_html):
    """Yield one article dict per recent content element of a section page.

    The ``content-feed`` entry of the page's embedded JSON is walked group by
    group. Elements published before ``now - oldest_article`` days are
    skipped, as are elements without a ``canonical_url``. Each yielded dict
    has the keys ``title``, ``url``, ``description`` and ``date``.
    """
    feed_groups = extract_json(raw_html)['content-feed']
    cutoff_date = utcnow() - timedelta(days=oldest_article)

    def pick_text(node):
        # Prefer the 'basic' text, falling back to 'native'; '' for a missing node.
        return (node.get('basic') or node.get('native', '')) if node else ''

    for group in feed_groups.values():
        for elem in group['data']['content_elements']:
            published = parse_iso8601(elem['publish_date'])
            if published < cutoff_date:
                continue
            headline = pick_text(elem['headlines'])
            summary = pick_text(elem.get('description'))
            try:
                link = elem['canonical_url']
            except KeyError:
                # Elements without a canonical URL cannot be downloaded.
                continue
            yield {
                'title': headline,
                'url': absolutize_url(link),
                'description': summary,
                'date': ' ' + str(published.date()),
            }
 | |
| 
 | |
| 
 | |
def main():
    """Debug helper: print the articles parsed from saved section pages.

    Reads ``/t/metro.html`` and ``/t/world.html`` and prints every item that
    parse_section() yields for them.
    """
    for sec in 'metro world'.split():
        # Read inside a context manager so the file handle is always closed.
        with open('/t/{}.html'.format(sec)) as f:
            raw_html = f.read()
        for item in parse_section(raw_html):
            print(item)
 | |
| 
 | |
| 
 | |
| # if __name__ == '__main__':
 | |
| #     main()
 | |
| 
 | |
| 
 | |
class BostonGlobeSubscription(BasicNewsRecipe):
    """Recipe that fetches Boston Globe section articles and a set of comics."""

    title = 'Boston Globe'
    __author__ = 'Kovid Goyal'
    description = 'The Boston Globe'
    language = 'en_US'
    timefmt = ' [%a, %d %b, %Y]'
    keep_only_tags = [
        class_startswith('headline |', 'subheader |', 'byline |', 'image |', 'lead |', 'body |', 'comic-debug'),
    ]
    remove_tags = [
        classes('inline-newsletter ad skip-nav article-footer sharebar arc_ad'),
        dict(id='continue_button'),
        dict(name=['meta', 'link'])
    ]
    # Built with the shared helper so the match also works when the class
    # attribute value arrives as a list/tuple rather than a string.
    remove_tags_after = class_startswith('body |')
    remove_attributes = ['style']
    no_stylesheets = True
    scale_news_images = 1600, 1200
    ignore_duplicate_articles = {'url'}
    # simultaneous_downloads = 1

    def image_url_processor(self, baseurl, url):
        """Return ``url`` made absolute; ``baseurl`` is not used."""
        return absolutize_url(url)

    def parse_index(self):
        """Build the feed list: one feed per non-empty section, plus comics.

        Returns a list of ``(feed_title, articles)`` tuples. When ``self.test``
        is set, each section's article list is truncated to ``self.test[1]``
        entries and section fetching stops once ``self.test[0]`` feeds exist.
        """
        feeds = []
        for sec in 'metro sports nation world business opinion lifestyle arts'.split():
            raw = self.index_to_soup(absolutize_url('/' + sec), raw=True).decode('utf-8')
            articles = list(parse_section(raw))
            if articles:
                self.log(sec.capitalize())
                self.log(pprint.pformat(articles))
                feeds.append((sec.capitalize(), articles))
                if self.test:
                    # Truncates in place, so the list already stored in feeds shrinks too.
                    del articles[self.test[1]:]
                    if len(feeds) >= self.test[0]:
                        break

        comics = [
            {'title': title, 'url': 'https://www.bostonglobe.com/games-comics/comics/{}/'.format(slug)}
            for title, slug in comics_to_fetch.items()
        ]
        if comics:
            feeds.append(('Comics', comics))
        return feeds

    def preprocess_raw_html(self, raw_html, url):
        """Replace a comics page with a minimal headline + image document.

        A page counts as a comic when its ``description`` meta content starts
        with ``'Comics: '``. For such pages the file name of the ``og:image``
        URL is appended to a fixed image host prefix and a small HTML document
        holding only the page title and that image is returned. Any other page,
        or a comic page without an ``og:image`` meta, is returned unchanged.
        """
        soup = self.index_to_soup(raw_html)
        meta = soup.find(attrs={'name': 'description'}, content=True)
        if meta is not None and meta['content'].startswith('Comics: '):
            image_meta = soup.find(property='og:image', content=True)
            if image_meta is None:
                # Nothing to build the comic page from; keep the original
                # markup instead of failing with a TypeError on None.
                self.log('No og:image found for comic page: {}'.format(url))
                return raw_html
            img_url = 'https://cloudfront-us-east-1.images.arcpublishing.com/bostonglobe/' + image_meta['content'].split('/')[-1]
            title = self.tag_to_string(soup.find('title'))
            raw_html = '<html><body><h1 class="headline |">{}</h1><div class="image |"><img src="{}"></div></body></html>'.format(title, img_url)
        return raw_html

    def preprocess_html(self, soup):
        """Copy each image's ``data-src`` attribute, when present, into ``src``."""
        for img in soup.findAll('img'):
            fs = img.get('data-src')
            if fs:
                img['src'] = fs
        return soup
 |