#!/usr/bin/env python2
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2016, Dale Furrow dkfurrow@gmail.com'
'''
chron.com
'''
import re
import time
from datetime import datetime, timedelta, date

from lxml import html

from calibre.web.feeds.recipes import BasicNewsRecipe


class HoustonChronicle(BasicNewsRecipe):
    title = u'The Houston Chronicle'
    description = 'News from Houston, Texas'
    __author__ = 'Dale Furrow'
    language = 'en'
    no_stylesheets = True
    remove_attributes = ['style', 'xmlns']
    remove_empty_feeds = True
    timefmt = '[%a, %d %b %Y]'
    timestampfmt = '%Y%m%d%H%M%S'
    ignore_duplicate_articles = {'url'}
    extra_css = '.article_date {display: none}'

    oldest_web_article = 7.0

    if oldest_web_article is None:
        earliest_date = date.today()
    else:
        earliest_date = date.today() - timedelta(days=oldest_web_article)

    pages = [('news', '/news/houston-texas/'),
             ('business', '/business/'),
             ('sports', '/sports/')]

    base_url = "http://www.chron.com"

    xpath_str = """//div[contains(@class, 'news') or contains(@class, 'headline-list')]
        //*[self::h4 or self::h5]//a[contains(@class, 'hdn-analytics')]"""

    def get_links_from_section_page(self, section_url):
        # Collect (url, title) pairs for every headline link on a section page.
        page_doc = html.parse(self.base_url + section_url)
        els = page_doc.xpath(self.xpath_str)
        element_list = []
        for el in els:
            link = el.get('href')
            title = el.text
            if link[:4] != 'http':
                link = self.base_url + link
            if title is not None:
                element_list.append((link, el.text))
        return element_list

    def get_article_description_from_doc(self, page_doc):
        # Build a short description from the first few sentences of the body,
        # stopping once roughly 140 characters have been collected and
        # truncating at 300.
        description_chars_break = 140
        description_max_chars = 300
        desc_xpath = """//div[contains(@class, 'article-body') or
        contains(@class, 'resource-content') or contains(@class, 'post')]//p"""
        sentence_regex = re.compile(r"(\S.+?[.!?])(?=\s+|$)")

        def stringify_children(node):
            return ''.join([x for x in node.itertext()])

        try:
            els = page_doc.xpath(desc_xpath)
            out_text = ""
            ellipsis = ""
            for el in els:
                sentences = re.findall(sentence_regex, stringify_children(el))
                for sentence in sentences:
                    if len(out_text) < description_chars_break:
                        out_text += sentence + " "
                    else:
                        if len(out_text) > description_max_chars:
                            ellipsis = "..."
                        return out_text[:description_max_chars] + ellipsis
            return out_text
        except:
            self.log('Error on Article Description')
            return ""

    def get_published_time_from_doc(self, page_doc):
        # Prefer the structured timestamp element; fall back to scraping a
        # human-readable date out of blog-style pages.
        regex_date_only = re.compile(
            r"(?:January|February|March|April|May|June|July|August|"
            r"September|October|November|December)"
            r"\s[0-9]{1,2},\s20[01][0-9]")
        regex_time_only = re.compile(r"[0-9]{1,2}:[0-9]{1,2} \w{2}")

        def get_regular_timestamp(date_string):
            try:
                out_date = datetime.strptime(date_string, "%Y-%m-%dT%H:%M:%SZ")
                return out_date
            except:
                return None

        def get_date_from_string(in_text):
            match = re.findall(regex_date_only, in_text)
            if match:
                try:
                    out_date = datetime.strptime(match[0], "%B %d, %Y")
                    match = re.findall(regex_time_only, in_text)
                    if match:
                        out_time = datetime.strptime(match[0], "%I:%M %p")
                        return datetime.combine(out_date.date(),
                                                out_time.time())
                    return out_date
                except:
                    return None

        el = page_doc.xpath("//*[@class='timestamp'][1]")
        if len(el) == 1:
            return get_regular_timestamp(el[0].get('title'))
        else:
            el = page_doc.xpath(
                "//*[@class='entry-date' or @class='post-date'][1]")
            if len(el) == 1:
                return get_date_from_string(el[0].text_content())
            else:
                return None

    def get_all_data_feeds_from_page(self, page):
        # Fetch every article linked from a section page and build the
        # article dictionaries that parse_index expects.
        articles = []
        exclude_titles_with = ['Winning numbers']

        def title_excluded(title):
            for text in exclude_titles_with:
                if title.find(text) != -1:
                    return True
            return False

        link_list = self.get_links_from_section_page(page[1])
        self.log('from section: ', page[0],
                 " found ", len(link_list), " links")
        for link in link_list:
            try:
                article_doc = html.parse(link[0])
                description = self.get_article_description_from_doc(
                    article_doc)
                parsed_date = self.get_published_time_from_doc(article_doc)
                if parsed_date is not None and description is not None and \
                        parsed_date.date() > self.earliest_date and \
                        not title_excluded(link[1]):
                    intro_date = parsed_date.strftime('%d %b %H:%M') + " - "
                    articles.append({'title': link[1], 'url': link[0],
                                     'description': intro_date + description,
                                     'date': ""})
                    self.log(page[0] + ": " + link[1] + ', from ' +
                             intro_date + " description of " +
                             str(len(description)) + ' characters at ' +
                             link[0])
                else:
                    if parsed_date is None:
                        msg = " No Timestamp Found"
                    elif title_excluded(link[1]):
                        msg = " Title Excluded"
                    else:
                        msg = " article older than " + \
                              str(self.oldest_web_article) + ' days...'
self.log("Skipping article: ", link[0], msg) except: print 'error on fetching ' + link[0] continue return articles def parse_index(self): self.timefmt = ' [%a, %d %b, %Y]' self.log('starting parse_index: ', time.strftime(self.timestampfmt)) feeds = [] for page in self.pages: articles = self.get_all_data_feeds_from_page(page) if articles: feeds.append((page[0], articles)) self.log('finished parse_index: ', time.strftime(self.timestampfmt)) return feeds def preprocess_html(self, soup): tags_to_exclude = [('class', "caption staged"), ('style', "display:none")] story_tag = soup.find( name='div', attrs={'class': ['article-content', 'article-body']}) blog_tag = soup.find(name='div', attrs={'id': re.compile('post-\d+')}) def is_excluded(tag_to_check): for attr in tag_to_check.attrs: if attr in tags_to_exclude: return True return False def get_attr_startswith(attrs, this_key, this_valuestart): starts_with = False for attr in attrs: if attr[0] == this_key: if attr[1].startswith(this_valuestart): starts_with = True return starts_with base_tags = [] if story_tag is not None: base_tags = story_tag.findAll(lambda this_tag: (this_tag.name == "p" and not ('class', 'open') in this_tag.attrs and not ('class', 'close') in this_tag.attrs) or this_tag.name.startswith('h') or this_tag.name == 'table' or (this_tag.name == 'li' and ('class', 'hst-resgalleryitem') in this_tag.attrs)) # noqa if blog_tag is not None: base_tags = blog_tag.findAll(lambda this_tag: (this_tag.name == "p" or this_tag.name.startswith('h')) or (this_tag.name == "span" and get_attr_startswith(this_tag.attrs, 'class', 'post')) or (this_tag.name == 'img' and ('lazy-state', 'loaded') in this_tag.attrs)) # noqa self.log('content tags: ' + str(type(base_tags)) + str(len(base_tags))) all_tags = [] all_tags.extend(base_tags) if len(base_tags) > 0: for tag in base_tags: all_tags.extend(tag.findAll(True)) for tag in base_tags: while tag.parent is not None and not is_excluded(tag): all_tags.append(tag) tag = tag.parent for tag in soup.findAll(True): if tag not in all_tags: tag.extract() return soup