diff --git a/recipes/bloomberg_columns.recipe b/recipes/bloomberg_columns.recipe
new file mode 100644
index 0000000000..fef294ab28
--- /dev/null
+++ b/recipes/bloomberg_columns.recipe
@@ -0,0 +1,199 @@
+#!/usr/bin/env python2
+# -*- coding: utf-8 -*-
+__license__ = 'GPL v3'
+__copyright__ = '2018, Dale Furrow dkfurrow@gmail.com'
+'''
+bloomberg.com
+'''
+import re
+import time
+from datetime import datetime, timedelta, date
+from lxml import html, etree
+from StringIO import StringIO
+from calibre.web.feeds.recipes import BasicNewsRecipe
+import urllib2
+import traceback
+from collections import OrderedDict
+import calendar
+import sys
+from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Tag
+
+
+contributors_url = "https://www.bloomberg.com/view/contributors"
+output_date_format = "%d %b, %H:%M"
+
+hdr = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
+       'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+       'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
+       'Accept-Encoding': 'none',
+       'Accept-Language': 'en-US,en;q=0.8',
+       'Connection': 'keep-alive'}
+
+
+def get_article_parsed(this_url):
+    req = urllib2.Request(this_url, headers=hdr)
+    page = urllib2.urlopen(req)
+    content = page.read()
+    parser = etree.HTMLParser()
+    parsed = html.parse(StringIO(content), parser)
+    return parsed
+
+
+class BloombergContributor:
+    _name = None
+    _url_name = None
+    _url_code = None
+    _article_list = None  # article is title, link, date, description
+    date_formats = ["%B %d, %Y %I:%M %p", "%b %d, %Y %I:%M %p"]
+
+    def __init__(self, name, url_name, url_code):
+        self._name = name
+        self._url_name = url_name
+        self._url_code = url_code
+        self._article_list = []
+
+    def __str__(self):
+        return "{0} ({1}): {2:d} articles".format(self._name, self._url_name, len(self._article_list))
+
+    def populate_article_list(self):
+        list_url = "{0}/{1}/{2}/articles".format(contributors_url, self._url_code, self._url_name)
+        parsed_list = get_article_parsed(list_url)
+        articles = parsed_list.xpath("//li[contains(@class, 'item_lwCjl')]")
+        for article in articles:
+            headline = article.find('a')
+            link = headline.attrib['href']
+            title = headline.text.strip()
+            article_date_eles = article.xpath(".//span[contains(@class, 'time_3qQJR')]")
+            if len(article_date_eles) > 0:
+                article_date_str = article_date_eles[0].text.strip()
+                article_date = self.parse_date_str(article_date_str)
+            else:
+                article_date = None
+            summary_eles = article.xpath(".//p[contains(@class, 'summary_17SO6')]")
+            if len(summary_eles) > 0:
+                summary = summary_eles[0].text.strip()
+            else:
+                summary = "No summary..."
+            self._article_list.append((title.encode('ascii', 'ignore'), link, summary.encode('ascii', 'ignore'),
+                                       article_date, self.get_article_timestamp(article_date)))
+
+    @staticmethod
+    def get_article_timestamp(article_date):
+        # assume all times Eastern...
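+        # Convert the naive US Eastern datetime to a UTC epoch timestamp; the DST
+        # window is derived below with calendar.Calendar, and the shift is 4 hours
+        # (EDT) inside that window, 5 hours (EST) outside it.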
+        # 2nd Sunday in March, 1st Sunday in November
+        c = calendar.Calendar(firstweekday=calendar.SUNDAY)
+        march_cal = c.monthdatescalendar(article_date.year, 3)
+        dst_start = [day for week in march_cal for day in week if
+                     day.weekday() == calendar.SUNDAY and
+                     day.month == 3][1]
+        nov_cal = c.monthdatescalendar(article_date.year, 11)
+        dst_end = [day for week in nov_cal for day in week if
+                   day.weekday() == calendar.SUNDAY and
+                   day.month == 11][0]
+        dst_start = datetime(dst_start.year, dst_start.month, dst_start.day, 2)
+        dst_end = datetime(dst_end.year, dst_end.month, dst_end.day, 1)
+        if dst_start < article_date < dst_end:
+            shift = timedelta(hours=4)
+        else:
+            shift = timedelta(hours=5)
+        return float((article_date + shift - datetime.utcfromtimestamp(0)).total_seconds())
+
+    def parse_date_str(self, date_str):
+        parsed = None
+        for date_format in self.date_formats:
+            try:
+                parsed = datetime.strptime(date_str[0:-4], date_format)
+                break
+            except Exception:
+                pass
+        return parsed
+
+    def get_article_list(self):
+        return self._article_list
+
+    def get_ordered_article_feed(self):
+        output = OrderedDict()
+        for article in self._article_list:
+            article_date = article[3]
+            article_dict = {'title': article[0], 'url': article[1],
+                            'description': "{0}: {1}".format(self.get_name(), article[2]),
+                            'author': self.get_name() + ": " + article[3].strftime(output_date_format),
+                            'date': self.get_name() + ": " + article[3].strftime(output_date_format),
+                            'timestamp': article[4]}
+            output[article_date] = article_dict
+        return OrderedDict(sorted(output.items(), key=lambda t: t[0], reverse=True))
+
+    def get_name(self):
+        return self._name
+
+
+class BloombergContributors(BasicNewsRecipe):
+    title = u'Bloomberg, Editorial Contributors'
+    description = 'Articles from Bloomberg.com contributors'
+    __author__ = 'Dale Furrow'
+    xpath_contributor_list = """//li[contains(@class, 'item_2zsS8')]/a"""
+    language = 'en'
+    no_stylesheets = True
+    remove_attributes = ['style', 'xmlns']
+    keep_only_tags = [dict(name='article', attrs={'data-type': 'article'})]
+    remove_tags = [dict(name='div', attrs=
+                        {'class': ['share-article-button ', 'text-to-speech']})]  # note space...
+    oldest_article = 7.0
+    ignore_duplicate_articles = {'url'}
+    recursions = 0
+    category = 'news, USA, world, economy, politics'
+
+    def get_contributors_list(self):
+        page_doc = get_article_parsed(contributors_url)
+        els = page_doc.xpath(self.xpath_contributor_list)
+        contributor_list = []
+        for el in els:
+            name = el.find("span").text.strip()  # name
+            contributor_items = el.attrib['href'].split('/')
+            contributor = BloombergContributor(name, contributor_items[4], contributor_items[3])
+            contributor_list.append(contributor)
+        for contributor in contributor_list:
+            contributor.populate_article_list()
+        return contributor_list
+
+    def postprocess_html(self, soup, first_fetch):
+        '''
+        :param soup: A BeautifulSoup instance containing the downloaded :term:`HTML`.
+        :param first_fetch: True if this is the first page of an article.
+        Remember: BeautifulSoup3!
+        The interface is much different from bs4.
+        '''
+        time_eles = soup.findAll("time", {"class": "article-timestamp"})
+        if len(time_eles) > 0:
+            time_stamp = time_eles[0].get('datetime')
+            try:
+                parsed_time = datetime.strptime(time_stamp, "%Y-%m-%dT%H:%M:%S.%fZ")\
+                    .strftime("%B %d, %Y %I:%M %p") + " UTC"
+            except Exception:
+                parsed_time = time_stamp
+            insert_tag = Tag(soup, "p", [("class", "user-inserted")])
+            insert_tag.insert(0, parsed_time)
+            soup.time.replaceWith(insert_tag)
+
+        return soup
+
+    def parse_index(self):
+        self.timefmt = ' [%a, %d %b, %Y]'
+        self.log('starting parse_index: {0}'.format(datetime.now().strftime("%B %d, %Y %I:%M %p")))
+        feeds = []
+        feed_dict = OrderedDict()
+        contributor_list = self.get_contributors_list()
+        self.log("Found {0:d} contributors on main page".format(len(contributor_list)))
+        for contributor in contributor_list:
+            articles = contributor.get_ordered_article_feed()
+            feed_dict.update(articles)
+        feed_dict = OrderedDict(sorted(feed_dict.items(), key=lambda t: t[0], reverse=True))
+        self.log("Found {0:d} linked articles from contributors".format(len(feed_dict)))
+        feeds.append(("Columns", list(feed_dict.values())))
+        # self.log("Total of {0:d} {1} articles".format(len(article_list), cat))
+        self.log('finishing parse_index: {0}'.format(datetime.now().strftime("%B %d, %Y %I:%M %p")))
+        return feeds
+
diff --git a/recipes/economist.recipe b/recipes/economist.recipe
index a5291bc6cc..a3f912ae1e 100644
--- a/recipes/economist.recipe
+++ b/recipes/economist.recipe
@@ -10,6 +10,7 @@ from collections import OrderedDict
 
 from calibre.ebooks.BeautifulSoup import NavigableString, Tag
 from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.utils.cleantext import clean_ascii_chars
 
 
 def classes(classes):
@@ -146,6 +147,18 @@ class Economist(BasicNewsRecipe):
         raw = etree.tostring(root, encoding=unicode)
         return raw
 
+    def populate_article_metadata(self, article, soup, first):
+        els = soup.findAll(name=['span', 'p'],
+                           attrs={'class': ['flytitle-and-title__title', 'blog-post__rubric']})
+        result = []
+        for el in els[0:2]:
+            if el is not None:
+                for descendant in el.contents:
+                    if isinstance(descendant, NavigableString):
+                        result.append(unicode(descendant))
+        article.summary = u'. '.join(result).encode('utf-8') + '.'
+        article.text_summary = clean_ascii_chars(article.summary)
+
     def parse_index(self):
         # return [('Articles', [{'title':'test',
         # 'url':'http://www.economist.com/news/business/21718916-worlds-biggest-software-firm-has-transformed-its-culture-better-getting-cloud'
diff --git a/recipes/economist_free.recipe b/recipes/economist_free.recipe
index a5291bc6cc..a3f912ae1e 100644
--- a/recipes/economist_free.recipe
+++ b/recipes/economist_free.recipe
@@ -10,6 +10,7 @@ from collections import OrderedDict
 
 from calibre.ebooks.BeautifulSoup import NavigableString, Tag
 from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.utils.cleantext import clean_ascii_chars
 
 
 def classes(classes):
@@ -146,6 +147,18 @@ class Economist(BasicNewsRecipe):
         raw = etree.tostring(root, encoding=unicode)
         return raw
 
+    def populate_article_metadata(self, article, soup, first):
+        els = soup.findAll(name=['span', 'p'],
+                           attrs={'class': ['flytitle-and-title__title', 'blog-post__rubric']})
+        result = []
+        for el in els[0:2]:
+            if el is not None:
+                for descendant in el.contents:
+                    if isinstance(descendant, NavigableString):
+                        result.append(unicode(descendant))
+        article.summary = u'. '.join(result).encode('utf-8') + '.'
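+        # calibre keeps a plain-text copy alongside summary; when summary changes,
+        # text_summary should be kept in sync (here via clean_ascii_chars).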
+        article.text_summary = clean_ascii_chars(article.summary)
+
     def parse_index(self):
         # return [('Articles', [{'title':'test',
         # 'url':'http://www.economist.com/news/business/21718916-worlds-biggest-software-firm-has-transformed-its-culture-better-getting-cloud'
diff --git a/recipes/houston_chronicle.recipe b/recipes/houston_chronicle.recipe
index 1902631367..a203262dbb 100644
--- a/recipes/houston_chronicle.recipe
+++ b/recipes/houston_chronicle.recipe
@@ -1,17 +1,135 @@
 #!/usr/bin/env python2
 # -*- coding: utf-8 -*-
 __license__ = 'GPL v3'
-__copyright__ = '2016, Dale Furrow dkfurrow@gmail.com'
+__copyright__ = '2018, Dale Furrow dkfurrow@gmail.com'
 '''
 chron.com
 '''
 import re
 import time
+import urllib2
+from StringIO import StringIO
+from datetime import datetime, timedelta, date
+import traceback
+import sys
+from collections import OrderedDict
+
 from datetime import datetime, timedelta, date
-from lxml import html
 
 from calibre.web.feeds.recipes import BasicNewsRecipe
+from calibre.web.feeds import Article
+from calibre.utils.cleantext import clean_ascii_chars
+from calibre.ebooks.BeautifulSoup import NavigableString
+from calibre.utils.date import dt_factory, utcnow, local_tz
+from lxml import html
+from lxml import etree
+
+regex_date_only = re.compile("""(?:January|February|March|April|
+        May|June|July|August|September|October|November|
+        December)\s[0-9]{1,2},\s20[01][0-9]""")
+regex_time_only = re.compile("""[0-9]{1,2}:[0-9]{1,2} \w{2}""")
+sentence_regex = re.compile("(\S.+?[.!?])(?=\s+|$)")
+blog_regex = re.compile('post-\d+')
+
+pages = OrderedDict([('news', ('/news/houston-texas/', ['business', 'sports'])),
+                     ('business', ('/business/', ['sports'])),
+                     ('sports', ('/sports/', ['business']))])
+
+base_url = "http://www.chron.com"
+
+# sports has 'core-package sports' class
+xpath_general = """//div[contains(@class, 'centerpiece-tabs') or
+    contains(@class, 'wrapper') or
+    contains(@class, 'contentGroups') or
+    contains(@class, 'headline-list') or
+    contains(@class, 'core-package sports') or
+    contains(@class, 'news')]
+    //a[contains(@class, 'hdn-analytics')]"""
+
+excluded_titles = ["Winning numbers", "TV-radio listings"]
+
+
+def validate_link(page, link, title):
+    other_category = page[1][1]
+    if not title or len(title.strip()) < 5:
+        print("{0} rejected, title too short".format(link))
+        return None
+    if link.split('/')[3] in other_category:
+        print("{0} rejected, covered in other section".format(link))
+        return None
+    for excluded_title in excluded_titles:
+        if title.find(excluded_title) != -1:
+            print("{0} rejected, excluded title".format(link))
+            return None
+    return link, title
+
+
+def get_article_parsed(this_url):
+    page = urllib2.urlopen(this_url)
+    content = page.read()
+    parser = etree.HTMLParser()
+    parsed = html.parse(StringIO(content), parser)
+    return parsed
+
+
+def sort_subject(element_list):
+    # priority of subjects
+    subjects = ['news', 'neighborhood', 'entertainment']
+    subjects.reverse()
+    subject_dict = OrderedDict(zip(subjects, range(len(subjects))))
+    rank_dict = OrderedDict([(rank, []) for rank in range(len(subjects) + 1)])
+    for element in element_list:
+        subj = element[0].split('/')[3]
+        if subject_dict.get(subj) is not None:
+            rank_dict[subject_dict[subj] + 1].append(element)
+        else:
+            rank_dict[0].append(element)
+    # now return in reverse order, sorted
+    combined_list = []
+    for rank in range(len(subjects), -1, -1):
+        article_list = rank_dict[rank]
+        article_list.sort()
+        combined_list.extend(article_list)
+    return combined_list
+
+
+def get_links_from_section_page(page):
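+    # page is one item from the module-level `pages` dict: (section_id,
+    # (section_path, [other_sections])). Returns [section_id, sorted_links],
+    # where each candidate link has been screened by validate_link().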
+    page_doc = get_article_parsed(base_url + page[1][0])
+    els = page_doc.xpath(xpath_general)
+    element_list = []
+    for el in els:
+        link = el.get('href').split('?')[0]
+        title = el.text
+        if title is None or len(title.strip()) < 5:
+            link_id = link.split('/')[-1][:-3].split('-')[:-1]
+            title = ' '.join(link_id)
+        if link[:4] != 'http':
+            link = base_url + link
+        validated_link = validate_link(page=page, link=link, title=title)
+        if validated_link is not None:
+            element_list.append(validated_link)
+    sorted_element_list = sort_subject(element_list)
+    return [page[0], sorted_element_list]
+
+
+def get_all_links_from_sections():
+    all_sections = []
+    article_set = set()
+    final_dict = OrderedDict()
+    for item in pages.items():
+        print "getting links from {0}".format(item[0])
+        all_sections.append(get_links_from_section_page(item))
+    for section in all_sections:
+        section_id = section[0]
+        article_list = section[1]
+        final_dict[section_id] = []
+        for article in article_list:
+            if article[0] not in article_set:
+                article_set.add(article[0])
+                final_dict[section_id].append(article)
+    return final_dict
+
+
+# noinspection PyAbstractClass
 class HoustonChronicle(BasicNewsRecipe):
 
     title = u'The Houston Chronicle'
     description = 'News from Houston, Texas'
@@ -22,196 +140,111 @@ class HoustonChronicle(BasicNewsRecipe):
     remove_empty_feeds = True
     timefmt = '[%a, %d %b %Y]'
     timestampfmt = '%Y%m%d%H%M%S'
-    ignore_duplicate_articles = {'url'}
+    # ignore_duplicate_articles = {'url'}  # defaults to None
     extra_css = '.article_date {display: none}'
+    category = 'news, USA'
+    masthead_url = 'http://www.chron.com/apple-touch-icon-76x76.png'
+    keep_only_tags = [dict(name='div', attrs={'class': ['article-content', 'article-wrap']})]
+    remove_tags = [dict(name='div', attrs={'social-title': True}),
+                   dict(name='div', attrs={'class':
+                        ['control-panel', 'gallery-overlay-inner',
+                         'most-popular', 'asset-media mos-playlist',
+                         'asset_media asset-media']}),
+                   dict(name='li', attrs={'class': ['hst-resgalleryitem taboola-frame hidden',
+                                                    'hst-resgalleryitem hidden']}),
+                   dict(name='ul', attrs={'class': 'clearfix'})]
 
-    oldest_web_article = 7.0
+    # max_articles_per_feed = 5  # for use in testing
 
-    if oldest_web_article is None:
-        earliest_date = date.today()
-    else:
-        earliest_date = date.today() - timedelta(days=oldest_web_article)
-
-    pages = [('news', '/news/houston-texas/'),
-             ('business', '/business/'),
-             ('sports', '/sports/')]
-
-    base_url = "http://www.chron.com"
-
-    xpath_str = """//div[contains(@class, 'news') or contains(@class, 'headline-list')]
-        //*[self::h4 or self::h5]//a[contains(@class, 'hdn-analytics')]"""
-
-    def get_links_from_section_page(self, section_url):
-        page_doc = html.parse(self.base_url + section_url)
-        els = page_doc.xpath(self.xpath_str)
-        element_list = []
-        for el in els:
-            link = el.get('href')
-            title = el.text
-            if link[:4] != 'http':
-                link = self.base_url + link
-            if title is not None:
-                element_list.append((link, el.text))
-        return element_list
-
-    def get_article_description_from_doc(self, page_doc):
+    def get_article_description_from_doc(self, soup):
         description_chars_break = 140
         description_max_chars = 300
-        desc_xpath = """//div[contains(@class, 'article-body') or
-            contains(@class, 'resource-content') or contains(@class, 'post')]//p"""
-        sentence_regex = re.compile("(\S.+?[.!?])(?=\s+|$)")
-
-        def stringify_children(node):
-            return ''.join([x for x in node.itertext()])
-
         try:
-            els = page_doc.xpath(desc_xpath)
-            out_text = ""
-            ellipsis = ""
-            for el in els:
-                sentences = re.findall(sentence_regex, stringify_children(el))
-                for sentence in sentences:
-                    if len(out_text) < description_chars_break:
-                        out_text += sentence + " "
-                    else:
-                        if len(out_text) > description_max_chars:
-                            ellipsis = "..."
-                        return out_text[:description_max_chars] + ellipsis
-            return out_text
-        except:
+            els = soup.findAll('p')
+            if len(els) > 0:
+                out_text = ""
+                this_ellipsis = ""
+                for el in els:
+                    if el is not None:
+                        result = []
+                        for descendant in el.contents:
+                            if isinstance(descendant, NavigableString):
+                                result.append(unicode(descendant).strip())
+                        all_text = u' '.join(result).encode('utf-8')
+                        if len(all_text) > 1:
+                            sentences = re.findall(sentence_regex, all_text)
+                            if sentences is not None and len(sentences) > 0:
+                                for sentence in sentences:
+                                    if len(out_text) < description_chars_break:
+                                        out_text += sentence + " "
+                                    else:
+                                        if len(out_text) > description_max_chars:
+                                            this_ellipsis = "..."
+                                        return out_text[:description_max_chars] + this_ellipsis
+                return out_text
+            else:
+                return "No Article description returned"
+        except Exception as ex:
             self.log('Error on Article Description')
+            traceback.print_exc(file=sys.stdout)
+            print(str(ex))
             return ""
 
-    def get_published_time_from_doc(self, page_doc):
-        regex_date_only = re.compile("""(?:January|February|March|April|
-            May|June|July|August|September|October|November|
-            December)\s[0-9]{1,2},\s20[01][0-9]""")
-        regex_time_only = re.compile("""[0-9]{1,2}:[0-9]{1,2} \w{2}""")
+    @staticmethod
+    def get_published_time_from_doc(page_doc):
 
         def get_regular_timestamp(date_string):
             try:
                 out_date = datetime.strptime(date_string, "%Y-%m-%dT%H:%M:%SZ")
                 return out_date
-            except:
+            except ValueError:
                 return None
 
-        def get_date_from_string(in_text):
-            match = re.findall(regex_date_only, in_text)
-            if match:
-                try:
-                    out_date = datetime.strptime(match[0], "%B %d, %Y")
-                    match = re.findall(regex_time_only, in_text)
-                    if match:
-                        out_time = datetime.strptime(match[0], "%I:%M %p")
-                        return datetime.combine(out_date.date(), out_time.time())
-                    return out_date
-                except:
-                    return None
-
-        el = page_doc.xpath("//*[@class='timestamp'][1]")
+        el = page_doc.findAll(lambda this_tag: this_tag.name == "time" and
+                              ('itemprop', 'datePublished') in this_tag.attrs)
         if len(el) == 1:
-            return get_regular_timestamp(el[0].get('title'))
+            return get_regular_timestamp(el[0].get('datetime'))
         else:
-            el = page_doc.xpath(
-                "//*[@class='entry-date' or @class='post-date'][1]")
-            if len(el) == 1:
-                return get_date_from_string(el[0].text_content())
-            else:
-                return None
+            return None
 
-    def get_all_data_feeds_from_page(self, page):
-        articles = []
-        exclude_titles_with = ['Winning numbers']
+    def populate_article_metadata(self, article, soup, first):
+        """
+        Called when each HTML page belonging to article is downloaded.
+        Intended to be used to get article metadata like author/summary/etc.
+        from the parsed HTML (soup).
-        def title_excluded(title):
-            for text in exclude_titles_with:
-                if title.find(text) != -1:
-                    return True
-            return False
-
-        link_list = self.get_links_from_section_page(page[1])
-        self.log('from section: ', page[0],
-                 " found ", len(link_list), " links")
-        for link in link_list:
-            try:
-                article_doc = html.parse(link[0])
-                description = self.get_article_description_from_doc(
-                    article_doc)
-                parsed_date = self.get_published_time_from_doc(article_doc)
-                if parsed_date is not None and description is not None and \
-                        parsed_date.date() > self.earliest_date and \
-                        not title_excluded(link[1]):
-                    intro_date = parsed_date.strftime('%d %b %H:%M') + " - "
-                    articles.append({'title': link[1], 'url': link[0],
-                                     'description': intro_date + description,
-                                     'date': ""})
-                    self.log(page[0] + ": " + link[1] + ', from ' + intro_date +
-                             " description of " + str(len(description)) + ' characters at ' + link[0])
-                else:
-                    if parsed_date is None:
-                        msg = " No Timestamp Found"
-                    elif title_excluded(link[1]):
-                        msg = " Title Excluded"
-                    else:
-                        msg = " article older than " + \
-                            str(self.oldest_web_article) + ' days...'
-                    self.log("Skipping article: ", link[0], msg)
-            except:
-                print 'error on fetching ' + link[0]
-                continue
-        return articles
+        :param article: An object of class :class:`calibre.web.feeds.Article`.
+            If you change the summary, remember to also change the text_summary.
+        :param soup: Parsed HTML belonging to this article
+        :param first: True iff the parsed HTML is the first page of the article.
+        """
+        summary = self.get_article_description_from_doc(soup)
+        article_date = self.get_published_time_from_doc(soup)
+        if article_date is not None:
+            article_timestamp = float((article_date - datetime.utcfromtimestamp(0)).total_seconds())
+            article.date = article_timestamp
+            article.utctime = dt_factory(article_date.timetuple(), assume_utc=True, as_utc=True)
+            article.localtime = article.utctime.astimezone(local_tz)
+        summary_date = article.localtime.strftime("%Y-%m-%d %H:%M") if article_date is not None else "No Date"
+        article.summary = "{0}: {1}".format(summary_date, summary)
+        article.text_summary = clean_ascii_chars(article.summary)
 
     def parse_index(self):
-        self.timefmt = ' [%a, %d %b, %Y]'
         self.log('starting parse_index: ', time.strftime(self.timestampfmt))
         feeds = []
-        for page in self.pages:
-            articles = self.get_all_data_feeds_from_page(page)
-            if articles:
-                feeds.append((page[0], articles))
+        sections = get_all_links_from_sections()
+        for section_id, article_list in sections.items():
+            self.log("Getting {0} section, {1:d} articles".format(section_id, len(article_list)))
+            articles = []
+            for article_info in article_list:
+                self.log("Adding {0} to feed".format(article_info[0]))
+                articles.append({'title': article_info[1], 'url': article_info[0],
+                                 'description': '', 'date': ""})
+            self.log("Appending {0:d} articles for {1}".format(len(articles), section_id))
+            feeds.append((section_id, articles))
         self.log('finished parse_index: ', time.strftime(self.timestampfmt))
         return feeds
 
     def preprocess_html(self, soup):
-        tags_to_exclude = [('class', "caption staged"),
-                           ('style', "display:none")]
-        story_tag = soup.find(
-            name='div', attrs={'class': ['article-content', 'article-body']})
-        blog_tag = soup.find(name='div', attrs={'id': re.compile('post-\d+')})
-
-        def is_excluded(tag_to_check):
-            for attr in tag_to_check.attrs:
-                if attr in tags_to_exclude:
-                    return True
-            return False
-
-        def get_attr_startswith(attrs, this_key, this_valuestart):
-            starts_with = False
-            for attr in attrs:
-                if attr[0] == this_key:
-                    if attr[1].startswith(this_valuestart):
-                        starts_with = True
-            return starts_with
-
-        base_tags = []
-        if story_tag is not None:
-            base_tags = story_tag.findAll(lambda this_tag: (this_tag.name == "p" and not ('class', 'open') in this_tag.attrs and not ('class', 'close') in this_tag.attrs) or this_tag.name.startswith('h') or this_tag.name == 'table' or (this_tag.name == 'li' and ('class', 'hst-resgalleryitem') in this_tag.attrs))  # noqa
-        if blog_tag is not None:
-            base_tags = blog_tag.findAll(lambda this_tag: (this_tag.name == "p" or this_tag.name.startswith('h')) or (this_tag.name == "span" and get_attr_startswith(this_tag.attrs, 'class', 'post')) or (this_tag.name == 'img' and ('lazy-state', 'loaded') in this_tag.attrs))  # noqa
-
-        self.log('content tags: ' + str(type(base_tags)) + str(len(base_tags)))
-        all_tags = []
-        all_tags.extend(base_tags)
-        if len(base_tags) > 0:
-            for tag in base_tags:
-                all_tags.extend(tag.findAll(True))
-
-        for tag in base_tags:
-            while tag.parent is not None and not is_excluded(tag):
-                all_tags.append(tag)
-                tag = tag.parent
-        for tag in soup.findAll(True):
-            if tag not in all_tags:
-                tag.extract()
         return soup