mirror of https://github.com/kovidgoyal/calibre.git, synced 2025-07-09 03:04:10 -04:00
Remove non-working recipe
Fixes #864 (deleted Bloomberg recipe; change in service)
This commit is contained in: parent e9ef5296e7, commit d4c95b0df6
@@ -1,211 +0,0 @@
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2018, Dale Furrow dkfurrow@gmail.com'
'''
bloomberg.com
'''
import calendar
import io
import urllib2
from collections import OrderedDict
from datetime import datetime, timedelta

from lxml import html, etree

from calibre.ebooks.BeautifulSoup import Tag
from calibre.web.feeds.recipes import BasicNewsRecipe

contributors_url = "https://www.bloomberg.com/view/contributors"
output_date_format = "%d %b, %H:%M"

# Request headers that mimic a desktop Chrome browser; sites often reject
# requests carrying the default urllib2 user agent.
hdr = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
       'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
       'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
       'Accept-Encoding': 'none',
       'Accept-Language': 'en-US,en;q=0.8',
       'Connection': 'keep-alive'}


def get_article_parsed(this_url):
    # Fetch the page with browser-like headers and parse it into an lxml tree.
    req = urllib2.Request(this_url, headers=hdr)
    page = urllib2.urlopen(req)
    content = page.read()
    parser = etree.HTMLParser()
    parsed = html.parse(io.BytesIO(bytes(content)), parser)
    return parsed
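
# Usage sketch: get_article_parsed(contributors_url) returns an lxml
# ElementTree whose root element supports .xpath(); that is how both the
# contributor index and each contributor's article list are scraped below.
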
class BloombergContributor:
    _name = None
    _url_name = None
    _url_code = None
    _article_list = None  # each article is (title, link, summary, date, timestamp)
    date_formats = ["%B %d, %Y %I:%M %p", "%b %d, %Y %I:%M %p"]

    def __init__(self, name, url_name, url_code):
        self._name = name
        self._url_name = url_name
        self._url_code = url_code
        self._article_list = []

    def __str__(self):
        return "{0} ({1}): {2:d} articles".format(self._name, self._url_name, len(self._article_list))

    def populate_article_list(self):
        list_url = "{0}/{1}/{2}/articles".format(
            contributors_url, self._url_code, self._url_name)
        parsed_list = get_article_parsed(list_url)
        articles = parsed_list.xpath("//li[contains(@class, 'item_lwCjl')]")
        for article in articles:
            headline = article.find('a')
            link = headline.attrib['href']
            title = headline.text.strip()
            article_date_eles = article.xpath(
                ".//span[contains(@class, 'time_3qQJR')]")
            if len(article_date_eles) > 0:
                article_date_str = article_date_eles[0].text.strip()
                article_date = self.parse_date_str(article_date_str)
            else:
                article_date = None
            if article_date is None:
                # Skip items without a parseable date; the timestamp below and
                # the feed ordering both require one.
                continue
            summary_eles = article.xpath(
                ".//p[contains(@class, 'summary_17SO6')]")
            if len(summary_eles) > 0:
                summary = summary_eles[0].text.strip()
            else:
                summary = "No summary..."
            self._article_list.append((title.encode('ascii', 'ignore'), link, summary.encode('ascii', 'ignore'),
                                       article_date, self.get_article_timestamp(article_date)))

    @staticmethod
    def get_article_timestamp(article_date):
        # Assume all times are US Eastern; DST runs from the second Sunday of
        # March to the first Sunday of November.
        c = calendar.Calendar(firstweekday=calendar.SUNDAY)
        march_cal = c.monthdatescalendar(article_date.year, 3)
        dst_start = [
            day for week in march_cal for day in week
            if day.weekday() == calendar.SUNDAY and day.month == 3
        ][1]
        nov_cal = c.monthdatescalendar(article_date.year, 11)
        dst_end = [day for week in nov_cal for day in week
                   if day.weekday() == calendar.SUNDAY and day.month == 11
                   ][0]
        dst_start = datetime(dst_start.year, dst_start.month, dst_start.day, 2)
        dst_end = datetime(dst_end.year, dst_end.month, dst_end.day, 1)
        if dst_start < article_date < dst_end:
            shift = timedelta(hours=4)  # EDT is UTC-4
        else:
            shift = timedelta(hours=5)  # EST is UTC-5
        return float((article_date + shift - datetime.utcfromtimestamp(0)).total_seconds())
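
    # Worked example: in 2018, US DST ran from March 11 (second Sunday of
    # March) to November 4 (first Sunday of November), so an article dated
    # 2018-06-15 10:00 Eastern falls in the DST window, gets shift=4, and maps
    # to 2018-06-15 14:00 UTC before conversion to a Unix timestamp.
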
    def parse_date_str(self, date_str):
        # Strip what is presumably a trailing timezone abbreviation
        # (e.g. " EDT") and try each known format in turn.
        parsed = None
        for date_format in self.date_formats:
            try:
                parsed = datetime.strptime(date_str[0:-4], date_format)
                break
            except Exception:
                pass
        return parsed
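
    # Example (hypothetical input): parse_date_str("July 18, 2018 4:00 PM EDT")
    # drops the trailing " EDT" and returns datetime(2018, 7, 18, 16, 0).
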
    def get_article_list(self):
        return self._article_list

    def get_ordered_article_feed(self):
        # Build a dict of article dicts keyed by date (the fields below match
        # the article format calibre's parse_index expects), newest first.
        output = OrderedDict()
        for article in self._article_list:
            article_date = article[3]
            article_dict = {'title': article[0], 'url': article[1],
                            'description': "{0}: {1}".format(self.get_name(), article[2]),
                            'author': self.get_name() + ": " + article[3].strftime(output_date_format),
                            'date': self.get_name() + ": " + article[3].strftime(output_date_format),
                            'timestamp': article[4]}
            output[article_date] = article_dict
        return OrderedDict(sorted(output.items(), key=lambda t: t[0], reverse=True))

    def get_name(self):
        return self._name


def new_tag(soup, name, attrs=()):
    # Compatibility shim: BeautifulSoup 4 provides soup.new_tag(), while the
    # BeautifulSoup 3 interface bundled with calibre does not, so fall back to
    # constructing a Tag directly.
    impl = getattr(soup, 'new_tag', None)
    if impl is not None:
        return impl(name, attrs=dict(attrs))
    return Tag(soup, name, attrs=attrs or None)
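
# Usage sketch: new_tag(soup, "p", [("class", "user-inserted")]) yields an
# empty <p class="user-inserted"> under either BeautifulSoup interface; see
# postprocess_html below.
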
class BloombergContributors(BasicNewsRecipe):
    title = u'Bloomberg, Editorial Contributors'
    description = 'Articles from Bloomberg.com contributors'
    __author__ = 'Dale Furrow'
    xpath_contributor_list = """//li[contains(@class, 'item_2zsS8')]/a"""
    language = 'en'
    no_stylesheets = True
    remove_attributes = ['style', 'xmlns']
    keep_only_tags = [dict(name='article', attrs={'data-type': 'article'})]
    # note the trailing space in 'share-article-button '
    remove_tags = [
        dict(name='div', attrs={'class': ['share-article-button ', 'text-to-speech']})]
    oldest_article = 7.0
    ignore_duplicate_articles = {'url'}
    recursions = 0
    category = 'news, USA, world, economy, politics'

    def get_contributors_list(self):
        page_doc = get_article_parsed(contributors_url)
        els = page_doc.xpath(self.xpath_contributor_list)
        contributor_list = []
        for el in els:
            name = el.find("span").text.strip()
            contributor_items = el.attrib['href'].split('/')
            contributor = BloombergContributor(
                name, contributor_items[4], contributor_items[3])
            contributor_list.append(contributor)
        for contributor in contributor_list:
            contributor.populate_article_list()
        return contributor_list
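
    # Contributor hrefs evidently have the shape
    # /view/contributors/<url_code>/<url_name>, so split('/') puts the code at
    # index 3 and the url-name at index 4, the order BloombergContributor's
    # constructor expects.
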
    def postprocess_html(self, soup, first_fetch):
        '''
        :param soup: A `BeautifulSoup
        <https://www.crummy.com/software/BeautifulSoup/bs3/documentation.html>`_
        instance containing the downloaded :term:`HTML`.
        :param first_fetch: True if this is the first page of an article.
        Remember: this is BeautifulSoup 3, whose interface differs
        considerably from bs4.
        '''
        time_eles = soup.findAll("time", {"class": "article-timestamp"})
        if len(time_eles) > 0:
            time_stamp = time_eles[0].get('datetime')
            try:
                parsed_time = datetime.strptime(time_stamp, "%Y-%m-%dT%H:%M:%S.%fZ")\
                    .strftime("%B %d, %Y %I:%M %p") + " UTC"
            except Exception:
                parsed_time = time_stamp
            # Replace the <time> element with a plain <p> holding the
            # human-readable timestamp.
            insert_tag = new_tag(soup, "p", [("class", "user-inserted")])
            insert_tag.insert(0, parsed_time)
            soup.time.replaceWith(insert_tag)
        return soup
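
    # Example (hypothetical value): a datetime attribute of
    # "2018-07-18T16:00:00.000Z" renders as "July 18, 2018 04:00 PM UTC".
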
    def parse_index(self):
        self.timefmt = ' [%a, %d %b, %Y]'
        self.log('starting parse_index: {0}'.format(
            datetime.now().strftime("%B %d, %Y %I:%M %p")))
        feeds = []
        feed_dict = OrderedDict()
        contributor_list = self.get_contributors_list()
        self.log("Found {0:d} contributors on main page".format(
            len(contributor_list)))
        for contributor in contributor_list:
            articles = contributor.get_ordered_article_feed()
            feed_dict.update(articles)
        feed_dict = OrderedDict(
            sorted(feed_dict.items(), key=lambda t: t[0], reverse=True))
        self.log("Found {0:d} linked articles from contributors".format(
            len(feed_dict)))
        feeds.append(("Columns", list(feed_dict.values())))
        # self.log("Total of {0:d} {1} articles".format(len(article_list), cat))
        self.log('finishing parse_index: {0}'.format(
            datetime.now().strftime("%B %d, %Y %I:%M %p")))
        return feeds
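
# A common way to exercise a recipe during development (assuming this file is
# saved as bloomberg_contributors.recipe) is calibre's conversion tool:
#   ebook-convert bloomberg_contributors.recipe output.epub --test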