#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import print_function

__license__ = 'GPL v3'
__copyright__ = '2018, Dale Furrow dkfurrow@gmail.com'
'''
chron.com
'''
import re
import time
from datetime import datetime
import traceback
import sys
from collections import OrderedDict

from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.utils.cleantext import clean_ascii_chars
from calibre.ebooks.BeautifulSoup import NavigableString
from calibre.utils.date import dt_factory, local_tz

regex_date_only = re.compile(r"""(?:January|February|March|April|
        May|June|July|August|September|October|November|
        December)\s[0-9]{1,2},\s20[01][0-9]""")
regex_time_only = re.compile(r"""[0-9]{1,2}:[0-9]{1,2} \w{2}""")
sentence_regex = re.compile(r"(\S.+?[.!?])(?=\s+|$)")
blog_regex = re.compile(r'post-\d+')

# section name -> (section path, other sections whose articles should be skipped here)
pages = OrderedDict([('news', ('/news/houston-texas/', ['business', 'sports'])),
                     ('business', ('/business/', ['sports'])),
                     ('sports', ('/sports/', ['business']))])

base_url = "http://www.chron.com"

# sports has 'core-package sports' class
xpath_general = """//div[contains(@class, 'centerpiece-tabs')
or contains(@class, 'wrapper')
or contains(@class, 'contentGroups')
or contains(@class, 'headline-list')
or contains(@class, 'core-package sports')
or contains(@class, 'news')]
//a[contains(@class, 'hdn-analytics')]"""

excluded_titles = ["Winning numbers", "TV-radio listings"]


def validate_link(page, link, title):
    # Return (link, title) if the link passes the filters, otherwise None:
    # reject short/missing titles, links that belong to another section,
    # and explicitly excluded titles.
    other_category = page[1][1]
    if not title or len(title.strip()) < 5:
        print("{0} rejected, title too short".format(link))
        return None
    parts = link.split('/')
    if len(parts) > 3 and parts[3] in other_category:
        print("{0} rejected, covered in other section".format(link))
        return None
    for excluded_title in excluded_titles:
        if title.find(excluded_title) != -1:
            print("{0} rejected, excluded title".format(link))
            return None
    return link, title


def get_article_parsed(index_to_soup, this_url):
    return index_to_soup(this_url, as_tree=True)


def sort_subject(element_list):
    # priority of subjects
    subjects = ['news', 'neighborhood', 'entertainment']
    subjects.reverse()
    subject_dict = OrderedDict(zip(subjects, range(len(subjects))))
    rank_dict = OrderedDict([(rank, []) for rank in range(len(subjects) + 1)])
    for element in element_list:
        try:
            subj = element[0].split('/')[3]
        except Exception:
            subj = 'unknown'
        if subject_dict.get(subj) is not None:
            rank_dict[subject_dict[subj] + 1].append(element)
        else:
            rank_dict[0].append(element)
    # now return in reverse order, sorted
    combined_list = []
    for rank in range(len(subjects), -1, -1):
        article_list = rank_dict[rank]
        article_list.sort()
        combined_list.extend(article_list)
    return combined_list


def get_links_from_section_page(index_to_soup, page):
    # Collect (link, title) pairs from a section front page, synthesizing a
    # title from the URL slug when the anchor text is missing or too short.
    page_doc = get_article_parsed(index_to_soup, base_url + page[1][0])
    els = page_doc.xpath(xpath_general)
    element_list = []
    for el in els:
        link = el.get('href').split('?')[0]
        title = el.text
        if title is None or len(title.strip()) < 5:
            link_id = link.split('/')[-1][:-3].split('-')[:-1]
            title = ' '.join(link_id)
        if link[:4] != 'http':
            link = base_url + link
        validated_link = validate_link(page=page, link=link, title=title)
        if validated_link is not None:
            element_list.append(validated_link)
    sorted_element_list = sort_subject(element_list)
    return [page[0], sorted_element_list]


def get_all_links_from_sections(index_to_soup):
    # Gather links from every configured section, keeping only the first
    # occurrence of each article URL.
    all_sections = []
    article_set = set()
    final_dict = OrderedDict()
    for item in pages.items():
        print("getting links from {0}".format(item[0]))
        all_sections.append(get_links_from_section_page(index_to_soup, item))
    for section in all_sections:
        section_id = section[0]
        article_list = section[1]
        final_dict[section_id] = []
        for article in article_list:
            if article[0] not in article_set:
                article_set.add(article[0])
                final_dict[section_id].append(article)
    return final_dict


# noinspection PyAbstractClass
class HoustonChronicle(BasicNewsRecipe):
    title = u'The Houston Chronicle'
    description = 'News from Houston, Texas'
    __author__ = 'Dale Furrow'
    language = 'en'
    no_stylesheets = True
    remove_attributes = ['style', 'xmlns']
    remove_empty_feeds = True
    timefmt = '[%a, %d %b %Y]'
    timestampfmt = '%Y%m%d%H%M%S'
    # ignore_duplicate_articles = {'url'}  # defaults to None
    extra_css = '.article_date {display: none}'
    category = 'news, USA'
    masthead_url = 'http://www.chron.com/apple-touch-icon-76x76.png'

    keep_only_tags = [dict(name='div', attrs={'class': ['article-content', 'article-wrap']})]

    remove_tags = [dict(name='div', attrs={'social-title': True}),
                   dict(name='div', attrs={'class': ['control-panel', 'gallery-overlay-inner',
                                                     'most-popular', 'asset-media mos-playlist',
                                                     'asset_media asset-media']}),
                   dict(name='li', attrs={'class': ['hst-resgalleryitem taboola-frame hidden',
                                                    'hst-resgalleryitem hidden']}),
                   dict(name='ul', attrs={'class': 'clearfix'})]

    # max_articles_per_feed = 5  # for use in testing

    def get_article_description_from_doc(self, soup):
        # Build a short summary from the article's paragraphs: keep appending
        # sentences until the text passes description_chars_break, then cap
        # the result at description_max_chars.
        description_chars_break = 140
        description_max_chars = 300
        try:
            els = soup.findAll('p')
            if len(els) > 0:
                out_text = ""
                this_ellipsis = ""
                for el in els:
                    if el is not None:
                        result = []
                        for descendant in el.contents:
                            if isinstance(descendant, NavigableString):
                                result.append(type(u'')(descendant).strip())
                        all_text = u' '.join(result)
                        if len(all_text) > 1:
                            sentences = re.findall(sentence_regex, all_text)
                            if sentences is not None and len(sentences) > 0:
                                for sentence in sentences:
                                    if len(out_text) < description_chars_break:
                                        out_text += sentence + " "
                                    else:
                                        if len(out_text) > description_max_chars:
                                            this_ellipsis = "..."
                                        return out_text[:description_max_chars] + this_ellipsis
                return out_text
            else:
                return "No Article description returned"
        except Exception as ex:
            self.log('Error on Article Description')
            traceback.print_exc(file=sys.stdout)
            print(str(ex))
            return ""

    @staticmethod
    def get_published_time_from_doc(page_doc):
        def get_regular_timestamp(date_string):
            try:
                out_date = datetime.strptime(date_string, "%Y-%m-%dT%H:%M:%SZ")
                return out_date
            except ValueError:
                return None

        # Look for a single <time itemprop="datePublished"> tag and parse its
        # datetime attribute.
        el = page_doc.findAll(lambda this_tag: this_tag.name == "time" and
                              ('itemprop', 'datePublished') in this_tag.attrs)
        if len(el) == 1:
            return get_regular_timestamp(el[0].get('datetime'))
        else:
            return None

    def populate_article_metadata(self, article, soup, first):
        """
        Called when each HTML page belonging to article is downloaded.
        Intended to be used to get article metadata like author/summary/etc.
        from the parsed HTML (soup).

        :param article: An object of class :class:`calibre.web.feeds.Article`.
            If you change the summary, remember to also change the text_summary.
        :param soup: Parsed HTML belonging to this article
        :param first: True iff the parsed HTML is the first page of the article.
""" summary = self.get_article_description_from_doc(soup) article_date = self.get_published_time_from_doc(soup) if article_date is not None: article_timestamp = float((article_date - datetime.utcfromtimestamp(0)).total_seconds()) article.date = article_timestamp article.utctime = dt_factory(article_date.timetuple(), assume_utc=True, as_utc=True) article.localtime = article.utctime.astimezone(local_tz) summary_date = article.localtime.strftime("%Y-%m-%d %H:%M") if article_date is not None else "No Date" article.summary = "{0}: {1}".format(summary_date, summary) article.text_summary = clean_ascii_chars(article.summary) def parse_index(self): self.timefmt = ' [%a, %d %b, %Y]' self.log('starting parse_index: ', time.strftime(self.timestampfmt)) feeds = [] sections = get_all_links_from_sections(self.index_to_soup) for section_id, article_list in sections.items(): self.log("Getting {0} section, {1:d} articles".format(section_id, len(article_list))) articles = [] for article_info in article_list: self.log("Adding {0} to feed".format(article_info[0])) articles.append({'title': article_info[1], 'url': article_info[0], 'description': '', 'date': ""}) self.log("Appending {0:d} articles for {1}".format(len(articles), section_id)) feeds.append((section_id, articles)) self.log('finished parse_index: ', time.strftime(self.timestampfmt)) return feeds def preprocess_html(self, soup): return soup