From d4c95b0df650f283312adfb1ee273dd0e4a455b0 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Fri, 29 Mar 2019 18:45:40 +0530
Subject: [PATCH] Remove non-working recipe

Fixes #864 (deleted the Bloomberg recipe; the service changed)
---
 recipes/bloomberg_columns.recipe | 211 -------------------------------
 1 file changed, 211 deletions(-)
 delete mode 100644 recipes/bloomberg_columns.recipe

diff --git a/recipes/bloomberg_columns.recipe b/recipes/bloomberg_columns.recipe
deleted file mode 100644
index 301f26349e..0000000000
--- a/recipes/bloomberg_columns.recipe
+++ /dev/null
@@ -1,211 +0,0 @@
-#!/usr/bin/env python2
-# -*- coding: utf-8 -*-
-__license__ = 'GPL v3'
-__copyright__ = '2018, Dale Furrow dkfurrow@gmail.com'
-'''
-bloomberg.com
-'''
-from datetime import datetime, timedelta
-from lxml import html, etree
-import io
-from calibre.web.feeds.recipes import BasicNewsRecipe
-import urllib2
-from collections import OrderedDict
-import calendar
-from calibre.ebooks.BeautifulSoup import Tag
-
-
-contributors_url = "https://www.bloomberg.com/view/contributors"
-output_date_format = "%d %b, %H:%M"
-
-hdr = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
-       'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
-       'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
-       'Accept-Encoding': 'none',
-       'Accept-Language': 'en-US,en;q=0.8',
-       'Connection': 'keep-alive'}
-
-
-def get_article_parsed(this_url):
-    req = urllib2.Request(this_url, headers=hdr)
-    page = urllib2.urlopen(req)
-    content = page.read()
-    parser = etree.HTMLParser()
-    parsed = html.parse(io.BytesIO(bytes(content)), parser)
-    return parsed
-
-
-class BloombergContributor:
-    _name = None
-    _url_name = None
-    _url_code = None
-    _article_list = None  # article tuples: (title, link, summary, date, timestamp)
-    date_formats = ["%B %d, %Y %I:%M %p", "%b %d, %Y %I:%M %p"]
-
-    def __init__(self, name, url_name, url_code):
-        self._name = name
-        self._url_name = url_name
-        self._url_code = url_code
-        self._article_list = []
-
-    def __str__(self):
-        return "{0} ({1}): {2:d} articles".format(self._name, self._url_name, len(self._article_list))
-
-    def populate_article_list(self):
-        list_url = "{0}/{1}/{2}/articles".format(
-            contributors_url, self._url_code, self._url_name)
-        parsed_list = get_article_parsed(list_url)
-        articles = parsed_list.xpath("//li[contains(@class, 'item_lwCjl')]")
-        for article in articles:
-            headline = article.find('a')
-            link = headline.attrib['href']
-            title = headline.text.strip()
-            article_date_eles = article.xpath(
-                ".//span[contains(@class, 'time_3qQJR')]")
-            if len(article_date_eles) > 0:
-                article_date_str = article_date_eles[0].text.strip()
-                article_date = self.parse_date_str(article_date_str)
-            else:
-                article_date = None
-            summary_eles = article.xpath(
-                ".//p[contains(@class, 'summary_17SO6')]")
-            if len(summary_eles) > 0:
-                summary = summary_eles[0].text.strip()
-            else:
-                summary = "No summary..."
-            self._article_list.append((title.encode('ascii', 'ignore'), link, summary.encode('ascii', 'ignore'),
-                                       article_date, self.get_article_timestamp(article_date)))
-
-    @staticmethod
-    def get_article_timestamp(article_date):
-        # Assume all times are US Eastern; DST runs from the
-        # 2nd Sunday of March to the 1st Sunday of November.
-        c = calendar.Calendar(firstweekday=calendar.SUNDAY)
-        march_cal = c.monthdatescalendar(article_date.year, 3)
-        dst_start = [
-            day for week in march_cal for day in week
-            if day.weekday() == calendar.SUNDAY and day.month == 3
-        ][1]
-        nov_cal = c.monthdatescalendar(article_date.year, 11)
-        dst_end = [day for week in nov_cal for day in week
-                   if day.weekday() == calendar.SUNDAY and day.month == 11
-                   ][0]
-        dst_start = datetime(dst_start.year, dst_start.month, dst_start.day, 2)
-        dst_end = datetime(dst_end.year, dst_end.month, dst_end.day, 1)
-        if dst_start < article_date < dst_end:
-            shift = timedelta(hours=4)  # inside the DST window: EDT is UTC-4
-        else:
-            shift = timedelta(hours=5)  # outside it: EST is UTC-5
-        return float((article_date + shift - datetime.utcfromtimestamp(0)).total_seconds())
-
-    def parse_date_str(self, date_str):
-        parsed = None
-        for date_format in self.date_formats:
-            try:
-                # drop the 4-character timezone suffix (e.g. " EDT") before parsing
-                parsed = datetime.strptime(date_str[0:-4], date_format)
-                break
-            except Exception:
-                pass
-        return parsed
-
-    def get_article_list(self):
-        return self._article_list
-
-    def get_ordered_article_feed(self):
-        output = OrderedDict()
-        for article in self._article_list:
-            article_date = article[3]
-            article_dict = {'title': article[0], 'url': article[1],
-                            'description': "{0}: {1}".format(self.get_name(), article[2]),
-                            'author': self.get_name() + ": " + article[3].strftime(output_date_format),
-                            'date': self.get_name() + ": " + article[3].strftime(output_date_format),
-                            'timestamp': article[4]}
-            output[article_date] = article_dict
-        return OrderedDict(sorted(output.items(), key=lambda t: t[0], reverse=True))
-
-    def get_name(self):
-        return self._name
-
-
-def new_tag(soup, name, attrs=()):
-    impl = getattr(soup, 'new_tag', None)
-    if impl is not None:
-        return impl(name, attrs=dict(attrs))
-    return Tag(soup, name, attrs=attrs or None)
-
-
-class BloombergContributors(BasicNewsRecipe):
-    title = u'Bloomberg, Editorial Contributors'
-    description = 'Articles from Bloomberg.com contributors'
-    __author__ = 'Dale Furrow'
-    xpath_contributor_list = """//li[contains(@class, 'item_2zsS8')]/a"""
-    language = 'en'
-    no_stylesheets = True
-    remove_attributes = ['style', 'xmlns']
-    keep_only_tags = [dict(name='article', attrs={'data-type': 'article'})]
-    # note the trailing space in 'share-article-button '
-    remove_tags = [
-        dict(name='div', attrs={'class': ['share-article-button ', 'text-to-speech']})]
-    oldest_article = 7.0
-    ignore_duplicate_articles = {'url'}
-    recursions = 0
-    category = 'news, USA, world, economy, politics'
-
-    def get_contributors_list(self):
-        page_doc = get_article_parsed(contributors_url)
-        els = page_doc.xpath(self.xpath_contributor_list)
-        contributor_list = []
-        for el in els:
-            name = el.find("span").text.strip()  # contributor name
-            contributor_items = el.attrib['href'].split('/')
-            contributor = BloombergContributor(
-                name, contributor_items[4], contributor_items[3])
-            contributor_list.append(contributor)
-        for contributor in contributor_list:
-            contributor.populate_article_list()
-        return contributor_list
-
-    def postprocess_html(self, soup, first_fetch):
-        '''
-        :param soup: A BeautifulSoup instance containing the
-            downloaded :term:`HTML`.
-        :param first_fetch: True if this is the first page of an article.
-
-        Remember: this is BeautifulSoup 3!
-        Its interface is much different from bs4's.
-        '''
-        time_eles = soup.findAll("time", {"class": "article-timestamp"})
-        if len(time_eles) > 0:
-            time_stamp = time_eles[0].get('datetime')
-            try:
-                parsed_time = datetime.strptime(time_stamp, "%Y-%m-%dT%H:%M:%S.%fZ")\
-                    .strftime("%B %d, %Y %I:%M %p") + " UTC"
-            except Exception:
-                parsed_time = time_stamp
-            insert_tag = new_tag(soup, "p", [("class", "user-inserted")])
-            insert_tag.insert(0, parsed_time)
-            soup.time.replaceWith(insert_tag)
-        return soup
-
-    def parse_index(self):
-        self.timefmt = ' [%a, %d %b, %Y]'
-        self.log('starting parse_index: {0}'.format(
-            datetime.now().strftime("%B %d, %Y %I:%M %p")))
-        feeds = []
-        feed_dict = OrderedDict()
-        contributor_list = self.get_contributors_list()
-        self.log("Found {0:d} contributors on main page".format(
-            len(contributor_list)))
-        for contributor in contributor_list:
-            articles = contributor.get_ordered_article_feed()
-            feed_dict.update(articles)
-        feed_dict = OrderedDict(
-            sorted(feed_dict.items(), key=lambda t: t[0], reverse=True))
-        self.log("Found {0:d} linked articles from contributors".format(
-            len(feed_dict)))
-        feeds.append(("Columns", list(feed_dict.values())))
-        # self.log("Total of {0:d} {1} articles".format(len(article_list), cat))
-        self.log('finishing parse_index: {0}'.format(
-            datetime.now().strftime("%B %d, %Y %I:%M %p")))
-        return feeds
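
For reference, below is a minimal standalone sketch of the US Eastern DST window
computation that the deleted get_article_timestamp performed by hand (stdlib
only; the helper name us_eastern_utc_shift and the example dates are
illustrative assumptions, not part of the recipe or of calibre's API):

import calendar
from datetime import datetime, timedelta


def us_eastern_utc_shift(naive_eastern_dt):
    """Hours to add to a naive US Eastern datetime to obtain UTC."""
    c = calendar.Calendar(firstweekday=calendar.SUNDAY)

    def sundays(month):
        # Sundays that actually fall within the given month of that year
        return [day for week in c.monthdatescalendar(naive_eastern_dt.year, month)
                for day in week
                if day.weekday() == calendar.SUNDAY and day.month == month]

    start = sundays(3)[1]   # DST begins on the 2nd Sunday of March, 02:00
    end = sundays(11)[0]    # DST ends on the 1st Sunday of November, 01:00
    dst_start = datetime(start.year, start.month, start.day, 2)
    dst_end = datetime(end.year, end.month, end.day, 1)
    # EDT is UTC-4 inside the window, EST is UTC-5 outside it
    return timedelta(hours=4 if dst_start <= naive_eastern_dt < dst_end else 5)


print(us_eastern_utc_shift(datetime(2019, 7, 1, 12)))  # 4:00:00 (EDT)
print(us_eastern_utc_shift(datetime(2019, 1, 1, 12)))  # 5:00:00 (EST)

Adding this shift to the parsed article time and subtracting the Unix epoch
reproduces the float timestamp the recipe stored for each article.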