Remove non-working recipe

Fixes #864 (Deleted bloomberg recipe, change in service)
Kovid Goyal 2019-03-29 18:45:40 +05:30
parent e9ef5296e7
commit d4c95b0df6


@@ -1,211 +0,0 @@
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2018, Dale Furrow dkfurrow@gmail.com'
'''
bloomberg.com
'''
from datetime import datetime, timedelta
from lxml import html, etree
import io
from calibre.web.feeds.recipes import BasicNewsRecipe
import urllib2
from collections import OrderedDict
import calendar
from calibre.ebooks.BeautifulSoup import Tag
contributors_url = "https://www.bloomberg.com/view/contributors"
output_date_format = "%d %b, %H:%M"
hdr = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
       'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
       'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
       'Accept-Encoding': 'none',
       'Accept-Language': 'en-US,en;q=0.8',
       'Connection': 'keep-alive'}
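# Fetch a URL with the browser-like request headers above and return the page parsed into an lxml tree.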
def get_article_parsed(this_url):
    req = urllib2.Request(this_url, headers=hdr)
    page = urllib2.urlopen(req)
    content = page.read()
    parser = etree.HTMLParser()
    parsed = html.parse(io.BytesIO(bytes(content)), parser)
    return parsed
class BloombergContributor:
    _name = None
    _url_name = None
    _url_code = None
    _article_list = None  # article is (title, link, summary, date, timestamp)
    date_formats = ["%B %d, %Y %I:%M %p", "%b %d, %Y %I:%M %p"]

    def __init__(self, name, url_name, url_code):
        self._name = name
        self._url_name = url_name
        self._url_code = url_code
        self._article_list = []

    def __str__(self):
        return "{0} ({1}): {2:d} articles".format(self._name, self._url_name, len(self._article_list))

    def populate_article_list(self):
        list_url = "{0}/{1}/{2}/articles".format(
            contributors_url, self._url_code, self._url_name)
        parsed_list = get_article_parsed(list_url)
        articles = parsed_list.xpath("//li[contains(@class, 'item_lwCjl')]")
        for article in articles:
            headline = article.find('a')
            link = headline.attrib['href']
            title = headline.text.strip()
            article_date_eles = article.xpath(
                ".//span[contains(@class, 'time_3qQJR')]")
            if len(article_date_eles) > 0:
                article_date_str = article_date_eles[0].text.strip()
                article_date = self.parse_date_str(article_date_str)
            else:
                article_date = None
            summary_eles = article.xpath(
                ".//p[contains(@class, 'summary_17SO6')]")
            if len(summary_eles) > 0:
                summary = summary_eles[0].text.strip()
            else:
                summary = "No summary..."
            self._article_list.append((title.encode('ascii', 'ignore'), link, summary.encode('ascii', 'ignore'),
                                       article_date, self.get_article_timestamp(article_date)))
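    # get_article_timestamp() converts the parsed Eastern-time date to a Unix
    # timestamp by hand: articles falling inside the US DST window (second
    # Sunday of March, 2:00, through the first Sunday of November, 1:00) are
    # shifted by 4 hours (EDT), everything else by 5 hours (EST).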
    @staticmethod
    def get_article_timestamp(article_date):
        # assume all times Eastern...
        # DST runs from the 2nd Sunday of March to the 1st Sunday of November
        c = calendar.Calendar(firstweekday=calendar.SUNDAY)
        march_cal = c.monthdatescalendar(article_date.year, 3)
        dst_start = [
            day for week in march_cal for day in week
            if day.weekday() == calendar.SUNDAY and day.month == 3
        ][1]
        nov_cal = c.monthdatescalendar(article_date.year, 11)
        dst_end = [day for week in nov_cal for day in week
                   if day.weekday() == calendar.SUNDAY and day.month == 11
                   ][0]
        dst_start = datetime(dst_start.year, dst_start.month, dst_start.day, 2)
        dst_end = datetime(dst_end.year, dst_end.month, dst_end.day, 1)
        if dst_start < article_date < dst_end:
            shift = timedelta(hours=4)
        else:
            shift = timedelta(hours=5)
        return float((article_date + shift - datetime.utcfromtimestamp(0)).total_seconds())
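    # Byline dates presumably end in a timezone abbreviation such as " EDT";
    # parse_date_str() drops those last four characters and tries each known
    # format in turn, returning None if nothing matches.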
    def parse_date_str(self, date_str):
        parsed = None
        for date_format in self.date_formats:
            try:
                parsed = datetime.strptime(date_str[0:-4], date_format)
                break
            except Exception:
                pass
        return parsed

    def get_article_list(self):
        return self._article_list

    def get_ordered_article_feed(self):
        output = OrderedDict()
        for article in self._article_list:
            article_date = article[3]
            article_dict = {'title': article[0], 'url': article[1],
                            'description': "{0}: {1}".format(self.get_name(), article[2]),
                            'author': self.get_name() + ": " + article[3].strftime(output_date_format),
                            'date': self.get_name() + ": " + article[3].strftime(output_date_format),
                            'timestamp': article[4]}
            output[article_date] = article_dict
        return OrderedDict(sorted(output.items(), key=lambda t: t[0], reverse=True))

    def get_name(self):
        return self._name
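# new_tag() handles both BeautifulSoup 3 and 4: it uses soup.new_tag() when the
# soup object provides it, and otherwise constructs a Tag directly (the BS3 API).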
def new_tag(soup, name, attrs=()):
    impl = getattr(soup, 'new_tag', None)
    if impl is not None:
        return impl(name, attrs=dict(attrs))
    return Tag(soup, name, attrs=attrs or None)
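# The recipe itself: scrape the contributor index, collect each contributor's
# article list, and expose everything as a single "Columns" feed.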
class BloombergContributors(BasicNewsRecipe):
    title = u'Bloomberg, Editorial Contributors'
    description = 'Articles from Bloomberg.com contributors'
    __author__ = 'Dale Furrow'
    xpath_contributor_list = """//li[contains(@class, 'item_2zsS8')]/a"""
    language = 'en'
    no_stylesheets = True
    remove_attributes = ['style', 'xmlns']
    keep_only_tags = [dict(name='article', attrs={'data-type': 'article'})]
    # note the trailing space in 'share-article-button '
    remove_tags = [
        dict(name='div', attrs={'class': ['share-article-button ', 'text-to-speech']})]
    oldest_article = 7.0
    ignore_duplicate_articles = {'url'}
    recursions = 0
    category = 'news, USA, world, economy, politics'
    language = 'en'
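    # Contributor links presumably look like /view/contributors/<code>/<slug>,
    # so splitting the href on '/' yields the code at index 3 and the URL slug
    # at index 4; each contributor's article list is then fetched up front.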
    def get_contributors_list(self):
        page_doc = get_article_parsed(contributors_url)
        els = page_doc.xpath(self.xpath_contributor_list)
        contributor_list = []
        for el in els:
            name = el.find("span").text.strip()  # contributor name
            contributor_items = el.attrib['href'].split('/')
            contributor = BloombergContributor(
                name, contributor_items[4], contributor_items[3])
            contributor_list.append(contributor)
        for contributor in contributor_list:
            contributor.populate_article_list()
        return contributor_list
    def postprocess_html(self, soup, first_fetch):
        '''
        :param soup: A `BeautifulSoup
            <https://www.crummy.com/software/BeautifulSoup/bs3/documentation.html>`_
            instance containing the downloaded :term:`HTML`.
        :param first_fetch: True if this is the first page of an article.
        Remember: this is BeautifulSoup 3, whose interface differs considerably from bs4.
        '''
        time_eles = soup.findAll("time", {"class": "article-timestamp"})
        if len(time_eles) > 0:
            time_stamp = time_eles[0].get('datetime')
            try:
                parsed_time = datetime.strptime(time_stamp, "%Y-%m-%dT%H:%M:%S.%fZ")\
                    .strftime("%B %d, %Y %I:%M %p") + " UTC"
            except Exception:
                parsed_time = time_stamp
            insert_tag = new_tag(soup, "p", [("class", "user-inserted")])
            insert_tag.insert(0, parsed_time)
            soup.time.replaceWith(insert_tag)
        return soup
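    # parse_index() merges every contributor's articles into one dictionary keyed
    # by date, sorts it newest-first, and returns it as a single "Columns" feed.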
    def parse_index(self):
        self.timefmt = ' [%a, %d %b, %Y]'
        self.log('starting parse_index: {0}'.format(
            datetime.now().strftime("%B %d, %Y %I:%M %p")))
        feeds = []
        feed_dict = OrderedDict()
        contributor_list = self.get_contributors_list()
        self.log("Found {0:d} contributors on main page".format(
            len(contributor_list)))
        for contributor in contributor_list:
            articles = contributor.get_ordered_article_feed()
            feed_dict.update(articles)
        feed_dict = OrderedDict(
            sorted(feed_dict.items(), key=lambda t: t[0], reverse=True))
        self.log("Found {0:d} linked articles from contributors".format(
            len(feed_dict)))
        feeds.append(("Columns", list(feed_dict.values())))
        # self.log("Total of {0:d} {1} articles".format(len(article_list), cat))
        self.log('finishing parse_index: {0}'.format(
            datetime.now().strftime("%B %d, %Y %I:%M %p")))
        return feeds