mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Update Boston Globe
This commit is contained in:
parent
5832bb74b1
commit
70feb562be
@ -5,10 +5,16 @@
|
|||||||
from __future__ import absolute_import, division, print_function, unicode_literals
|
from __future__ import absolute_import, division, print_function, unicode_literals
|
||||||
import json
|
import json
|
||||||
import pprint
|
import pprint
|
||||||
|
from datetime import timedelta
|
||||||
|
from calibre.utils.date import utcnow
|
||||||
|
from calibre.utils.iso8601 import parse_iso8601
|
||||||
|
|
||||||
from calibre.web.feeds.recipes import BasicNewsRecipe
|
from calibre.web.feeds.recipes import BasicNewsRecipe
|
||||||
|
|
||||||
|
|
||||||
|
oldest_article = 1 # days, includes articles that were published no more than the specified number of days ago
|
||||||
|
|
||||||
|
|
||||||
def classes(classes):
|
def classes(classes):
|
||||||
q = frozenset(classes.split(' '))
|
q = frozenset(classes.split(' '))
|
||||||
return dict(attrs={
|
return dict(attrs={
|
||||||
@ -80,6 +86,8 @@ def absolutize_url(url):
|
|||||||
|
|
||||||
def parse_section(raw_html):
|
def parse_section(raw_html):
|
||||||
data = extract_json(raw_html)['content-feed']
|
data = extract_json(raw_html)['content-feed']
|
||||||
|
now = utcnow()
|
||||||
|
cutoff_date = now - timedelta(days=oldest_article)
|
||||||
|
|
||||||
def text(e):
|
def text(e):
|
||||||
if not e:
|
if not e:
|
||||||
@ -88,10 +96,13 @@ def parse_section(raw_html):
|
|||||||
|
|
||||||
for group in data.values():
|
for group in data.values():
|
||||||
for elem in group['data']['content_elements']:
|
for elem in group['data']['content_elements']:
|
||||||
|
date = parse_iso8601(elem['publish_date'])
|
||||||
|
if date < cutoff_date:
|
||||||
|
continue
|
||||||
title = text(elem['headlines'])
|
title = text(elem['headlines'])
|
||||||
description = text(elem.get('description'))
|
description = text(elem.get('description'))
|
||||||
url = absolutize_url(elem['canonical_url'])
|
url = absolutize_url(elem['canonical_url'])
|
||||||
yield {'title': title, 'url': url, 'description': description}
|
yield {'title': title, 'url': url, 'description': description, 'date': ' ' + str(date.date())}
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
Loading…
x
Reference in New Issue
Block a user