Remove non-working recipe

Fixes #864 (Deleted bloomberg recipe, change in service)
Kovid Goyal 2019-03-29 18:45:40 +05:30
parent e9ef5296e7
commit d4c95b0df6


@@ -1,211 +0,0 @@
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2018, Dale Furrow dkfurrow@gmail.com'
'''
bloomberg.com
'''
from datetime import datetime, timedelta
from lxml import html, etree
import io
from calibre.web.feeds.recipes import BasicNewsRecipe
import urllib2
from collections import OrderedDict
import calendar
from calibre.ebooks.BeautifulSoup import Tag
contributors_url = "https://www.bloomberg.com/view/contributors"
output_date_format = "%d %b, %H:%M"
hdr = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
       'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
       'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
       'Accept-Encoding': 'none',
       'Accept-Language': 'en-US,en;q=0.8',
       'Connection': 'keep-alive'}
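# Fetch a URL with the browser-like request headers above and return the page parsed into an lxml tree.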
def get_article_parsed(this_url):
    req = urllib2.Request(this_url, headers=hdr)
    page = urllib2.urlopen(req)
    content = page.read()
    parser = etree.HTMLParser()
    parsed = html.parse(io.BytesIO(bytes(content)), parser)
    return parsed
class BloombergContributor:
    _name = None
    _url_name = None
    _url_code = None
    _article_list = None  # article is (title, link, summary, date, timestamp)
    date_formats = ["%B %d, %Y %I:%M %p", "%b %d, %Y %I:%M %p"]

    def __init__(self, name, url_name, url_code):
        self._name = name
        self._url_name = url_name
        self._url_code = url_code
        self._article_list = []

    def __str__(self):
        return "{0} ({1}): {2:d} articles".format(self._name, self._url_name, len(self._article_list))

    def populate_article_list(self):
        list_url = "{0}/{1}/{2}/articles".format(
            contributors_url, self._url_code, self._url_name)
        parsed_list = get_article_parsed(list_url)
        articles = parsed_list.xpath("//li[contains(@class, 'item_lwCjl')]")
        for article in articles:
            headline = article.find('a')
            link = headline.attrib['href']
            title = headline.text.strip()
            article_date_eles = article.xpath(
                ".//span[contains(@class, 'time_3qQJR')]")
            if len(article_date_eles) > 0:
                article_date_str = article_date_eles[0].text.strip()
                article_date = self.parse_date_str(article_date_str)
            else:
                article_date = None
            summary_eles = article.xpath(
                ".//p[contains(@class, 'summary_17SO6')]")
            if len(summary_eles) > 0:
                summary = summary_eles[0].text.strip()
            else:
                summary = "No summary..."
            self._article_list.append((title.encode('ascii', 'ignore'), link, summary.encode('ascii', 'ignore'),
                                       article_date, self.get_article_timestamp(article_date)))
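    # get_article_timestamp() converts the parsed Eastern-time date to a Unix
    # timestamp by hand: articles falling inside the US DST window (second
    # Sunday of March, 2:00, through the first Sunday of November, 1:00) are
    # shifted by 4 hours (EDT), everything else by 5 hours (EST).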
    @staticmethod
    def get_article_timestamp(article_date):
        # assume all times Eastern...
        # DST runs from the 2nd Sunday of March to the 1st Sunday of November
        c = calendar.Calendar(firstweekday=calendar.SUNDAY)
        march_cal = c.monthdatescalendar(article_date.year, 3)
        dst_start = [
            day for week in march_cal for day in week
            if day.weekday() == calendar.SUNDAY and day.month == 3
        ][1]
        nov_cal = c.monthdatescalendar(article_date.year, 11)
        dst_end = [day for week in nov_cal for day in week
                   if day.weekday() == calendar.SUNDAY and day.month == 11
                   ][0]
        dst_start = datetime(dst_start.year, dst_start.month, dst_start.day, 2)
        dst_end = datetime(dst_end.year, dst_end.month, dst_end.day, 1)
        if dst_start < article_date < dst_end:
            shift = timedelta(hours=4)
        else:
            shift = timedelta(hours=5)
        return float((article_date + shift - datetime.utcfromtimestamp(0)).total_seconds())
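    # Byline dates presumably end in a timezone abbreviation such as " EDT";
    # parse_date_str() drops those last four characters and tries each known
    # format in turn, returning None if nothing matches.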
    def parse_date_str(self, date_str):
        parsed = None
        for date_format in self.date_formats:
            try:
                parsed = datetime.strptime(date_str[0:-4], date_format)
                break
            except Exception:
                pass
        return parsed

    def get_article_list(self):
        return self._article_list

    def get_ordered_article_feed(self):
        output = OrderedDict()
        for article in self._article_list:
            article_date = article[3]
            article_dict = {'title': article[0], 'url': article[1],
                            'description': "{0}: {1}".format(self.get_name(), article[2]),
                            'author': self.get_name() + ": " + article[3].strftime(output_date_format),
                            'date': self.get_name() + ": " + article[3].strftime(output_date_format),
                            'timestamp': article[4]}
            output[article_date] = article_dict
        return OrderedDict(sorted(output.items(), key=lambda t: t[0], reverse=True))

    def get_name(self):
        return self._name
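# new_tag() handles both BeautifulSoup 3 and 4: it uses soup.new_tag() when the
# soup object provides it, and otherwise constructs a Tag directly (the BS3 API).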
def new_tag(soup, name, attrs=()):
    impl = getattr(soup, 'new_tag', None)
    if impl is not None:
        return impl(name, attrs=dict(attrs))
    return Tag(soup, name, attrs=attrs or None)
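# The recipe itself: scrape the contributor index, collect each contributor's
# article list, and expose everything as a single "Columns" feed.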
class BloombergContributors(BasicNewsRecipe):
    title = u'Bloomberg, Editorial Contributors'
    description = 'Articles from Bloomberg.com contributors'
    __author__ = 'Dale Furrow'
    xpath_contributor_list = """//li[contains(@class, 'item_2zsS8')]/a"""
    language = 'en'
    no_stylesheets = True
    remove_attributes = ['style', 'xmlns']
    keep_only_tags = [dict(name='article', attrs={'data-type': 'article'})]
    # note the trailing space in 'share-article-button '
    remove_tags = [
        dict(name='div', attrs={'class': ['share-article-button ', 'text-to-speech']})]
    oldest_article = 7.0
    ignore_duplicate_articles = {'url'}
    recursions = 0
    category = 'news, USA, world, economy, politics'
    language = 'en'
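    # Contributor links presumably look like /view/contributors/<code>/<slug>,
    # so splitting the href on '/' yields the code at index 3 and the URL slug
    # at index 4; each contributor's article list is then fetched up front.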
    def get_contributors_list(self):
        page_doc = get_article_parsed(contributors_url)
        els = page_doc.xpath(self.xpath_contributor_list)
        contributor_list = []
        for el in els:
            name = el.find("span").text.strip()  # contributor name
            contributor_items = el.attrib['href'].split('/')
            contributor = BloombergContributor(
                name, contributor_items[4], contributor_items[3])
            contributor_list.append(contributor)
        for contributor in contributor_list:
            contributor.populate_article_list()
        return contributor_list
    def postprocess_html(self, soup, first_fetch):
        '''
        :param soup: A `BeautifulSoup
            <https://www.crummy.com/software/BeautifulSoup/bs3/documentation.html>`_
            instance containing the downloaded :term:`HTML`.
        :param first_fetch: True if this is the first page of an article.
        Remember: this is BeautifulSoup 3, whose interface differs considerably from bs4.
        '''
        time_eles = soup.findAll("time", {"class": "article-timestamp"})
        if len(time_eles) > 0:
            time_stamp = time_eles[0].get('datetime')
            try:
                parsed_time = datetime.strptime(time_stamp, "%Y-%m-%dT%H:%M:%S.%fZ")\
                    .strftime("%B %d, %Y %I:%M %p") + " UTC"
            except Exception:
                parsed_time = time_stamp
            insert_tag = new_tag(soup, "p", [("class", "user-inserted")])
            insert_tag.insert(0, parsed_time)
            soup.time.replaceWith(insert_tag)
        return soup
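    # parse_index() merges every contributor's articles into one dictionary keyed
    # by date, sorts it newest-first, and returns it as a single "Columns" feed.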
    def parse_index(self):
        self.timefmt = ' [%a, %d %b, %Y]'
        self.log('starting parse_index: {0}'.format(
            datetime.now().strftime("%B %d, %Y %I:%M %p")))
        feeds = []
        feed_dict = OrderedDict()
        contributor_list = self.get_contributors_list()
        self.log("Found {0:d} contributors on main page".format(
            len(contributor_list)))
        for contributor in contributor_list:
            articles = contributor.get_ordered_article_feed()
            feed_dict.update(articles)
        feed_dict = OrderedDict(
            sorted(feed_dict.items(), key=lambda t: t[0], reverse=True))
        self.log("Found {0:d} linked articles from contributors".format(
            len(feed_dict)))
        feeds.append(("Columns", list(feed_dict.values())))
        # self.log("Total of {0:d} {1} articles".format(len(article_list), cat))
        self.log('finishing parse_index: {0}'.format(
            datetime.now().strftime("%B %d, %Y %I:%M %p")))
        return feeds