#!/usr/bin/env python
# vim:fileencoding=utf-8
# License: GPLv3 Copyright: 2016, Kovid Goyal
from __future__ import absolute_import, division, print_function, unicode_literals

import json
import pprint
from datetime import timedelta

from calibre.utils.date import utcnow
from calibre.utils.iso8601 import parse_iso8601
from calibre.web.feeds.recipes import BasicNewsRecipe

oldest_article = 1  # days; include only articles published no more than this many days ago


def classes(classes):
    q = frozenset(classes.split(' '))
    return dict(attrs={
        'class': lambda x: x and frozenset(x.split()).intersection(q)})


def class_as_string(x):
    if isinstance(x, (list, tuple)):
        x = ' '.join(x)
    return x


def class_startswith(*prefixes):

    def q(x):
        if x:
            x = class_as_string(x)
            for prefix in prefixes:
                if x.startswith(prefix):
                    return True
        return False

    return dict(attrs={'class': q})


# From: https://www3.bostonglobe.com/lifestyle/comics?arc404=true
comics_to_fetch = {
    "ADAM@HOME": 'ad',
    "ARLO & JANIS": 'aj',
    # "CUL DE SAC": 'cds',
    # "CURTIS": 'kfcrt',
    "DILBERT": 'dt',
    "DOONESBURY": 'db',
    "DUSTIN": 'kfdus',
    "F MINUS": 'fm',
    "FOR BETTER OR WORSE": 'fb',
    # "GET FUZZY": 'gz',
    # "MOTHER GOOSE & GRIMM": 'tmmgg',
    # "JUMPSTART": 'jt',
    "MONTY": 'mt',
    # "POOCH CAFE",
    "RHYMES WITH ORANGE": 'kfrwo',
    # "ROSE IS ROSE": 'rr',
    # "ZIPPY THE PINHEAD": 'kfzpy',
    "ZITS": 'kfzt'
}


def extract_json(raw_html):
    # The Globe is an Arc Publishing site; each section page embeds its
    # article listing as a JSON blob assigned to Fusion.contentCache inside
    # a <script> tag. Slice that blob out and parse it.
    idx = raw_html.find('Fusion.contentCache={')
    close_idx = raw_html.find('</script>', idx)
    raw = raw_html[idx:close_idx]
    raw = raw[raw.find('{'):]
    idx = raw.find(';Fusion')
    raw = raw[:idx]
    # open('/t/raw.html', 'w').write(raw)
    data = json.loads(raw, strict=False)
    # open('/t/raw.json', 'w').write(json.dumps(data, indent=2))
    return data


def absolutize_url(url):
    if url.startswith("//"):
        return "https:" + url
    if url.startswith('/'):
        url = "https://www.bostonglobe.com" + url
    return url


def parse_section(raw_html):
    data = extract_json(raw_html)['content-feed']
    now = utcnow()
    cutoff_date = now - timedelta(days=oldest_article)

    def text(e):
        if not e:
            return ''
        return e.get('basic') or e.get('native', '')

    for group in data.values():
        for elem in group['data']['content_elements']:
            date = parse_iso8601(elem['publish_date'])
            if date < cutoff_date:
                continue
            title = text(elem['headlines'])
            description = text(elem.get('description'))
            url = absolutize_url(elem['canonical_url'])
            yield {'title': title, 'url': url, 'description': description, 'date': ' ' + str(date.date())}


def main():
    # Standalone debugging helper: parse section HTML saved to local files.
    for sec in 'metro world'.split():
        for item in parse_section(open('/t/{}.html'.format(sec)).read()):
            print(item)


# if __name__ == '__main__':
#     main()
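# A minimal sketch of the Fusion.contentCache payload that parse_section()
# consumes. The shape is inferred from the field accesses above; the query
# key and all values are illustrative placeholders, not real Globe data:
#
# {
#     '<query-key>': {
#         'data': {
#             'content_elements': [
#                 {
#                     'publish_date': '2016-01-01T12:00:00Z',
#                     'headlines': {'basic': 'Example headline'},
#                     'description': {'basic': 'Example description'},
#                     'canonical_url': '/metro/example-story.html',
#                 },
#             ],
#         },
#     },
# }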
class BostonGlobeSubscription(BasicNewsRecipe):

    title = "Boston Globe"
    __author__ = 'Kovid Goyal'
    description = 'The Boston Globe'
    language = 'en'
    timefmt = ' [%a, %d %b, %Y]'
    keep_only_tags = [
        class_startswith('headline |', 'subheader |', 'byline |', 'image |', 'lead |', 'body |', 'comic-debug'),
    ]
    remove_tags = [
        classes('inline-newsletter ad skip-nav article-footer sharebar arc_ad'),
        dict(id='continue_button'),
        dict(name=['meta', 'link'])
    ]
    remove_tags_after = dict(attrs={'class': lambda x: x and x.startswith('body |')})
    remove_attributes = ['style']
    no_stylesheets = True
    scale_news_images = 1600, 1200
    ignore_duplicate_articles = {'url'}
    # simultaneous_downloads = 1

    def image_url_processor(self, baseurl, url):
        return absolutize_url(url)

    def parse_index(self):
        feeds = []
        for sec in 'metro sports nation world business opinion lifestyle arts'.split():
            articles = list(parse_section(self.index_to_soup(absolutize_url('/' + sec), raw=True).decode('utf-8')))
            if articles:
                self.log(sec.capitalize())
                self.log(pprint.pformat(articles))
                feeds.append((sec.capitalize(), articles))
            if self.test:
                del articles[self.test[1]:]
                if len(feeds) >= self.test[0]:
                    break
        articles = []
        for title, slug in comics_to_fetch.items():
            articles.append({'title': title, 'url': 'https://www.bostonglobe.com/games-comics/comics/{}/'.format(slug)})
        if articles:
            feeds.append(('Comics', articles))
        return feeds
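    # Comic pages do not use the regular article markup, so the method below
    # replaces them with a small synthetic page built from the page's og:image
    # and <title>. The exact markup is a minimal sketch: all that matters is
    # that it uses classes ('comic-debug', 'image |') that keep_only_tags
    # above retains.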
    def preprocess_raw_html(self, raw_html, url):
        soup = self.index_to_soup(raw_html)
        meta = soup.find(attrs={'name': 'description'}, content=True)
        if meta is not None and meta['content'].startswith('Comics: '):
            meta = soup.find(property='og:image', content=True)
            img_url = 'https://cloudfront-us-east-1.images.arcpublishing.com/bostonglobe/' + meta['content'].split('/')[-1]
            title = self.tag_to_string(soup.find('title'))
            raw_html = '<html><body><div class="comic-debug">{}</div><div class="image |"><img src="{}"></div></body></html>'.format(
                title, img_url)
        return raw_html

    def preprocess_html(self, soup):
        # The site lazy-loads images; copy the real URL from data-src into src
        # so the downloader fetches them.
        for img in soup.findAll('img'):
            fs = img.get('data-src')
            if fs:
                img['src'] = fs
        return soup
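
# To exercise this recipe outside the calibre GUI, save it as, for example,
# boston_globe.recipe and run:
#
#   ebook-convert boston_globe.recipe out.epub --test
#
# --test restricts the download to a couple of feeds and articles each, which
# is what the self.test checks in parse_index() above support.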