calibre/recipes/houston_chronicle.recipe
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import print_function
__license__ = 'GPL v3'
__copyright__ = '2018, Dale Furrow dkfurrow@gmail.com'
'''
chron.com
'''
import re
import time
from datetime import datetime
import traceback
import sys
from collections import OrderedDict
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.utils.cleantext import clean_ascii_chars
from calibre.ebooks.BeautifulSoup import NavigableString
from calibre.utils.date import dt_factory, local_tz
regex_date_only = re.compile(
    r"(?:January|February|March|April|May|June|July|August|September|"
    r"October|November|December)\s[0-9]{1,2},\s20[01][0-9]")
regex_time_only = re.compile(r"""[0-9]{1,2}:[0-9]{1,2} \w{2}""")
sentence_regex = re.compile(r"(\S.+?[.!?])(?=\s+|$)")
blog_regex = re.compile(r'post-\d+')
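
# Section index pages to crawl: section id -> (index path, other section ids
# whose stories should be skipped here so they are not duplicated across feeds).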
pages = OrderedDict([('news', ('/news/houston-texas/', ['business', 'sports'])),
                     ('business', ('/business/', ['sports'])),
                     ('sports', ('/sports/', ['business']))])
base_url = "http://www.chron.com"
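
# Pull article anchors (class 'hdn-analytics') out of the known headline
# containers on a section index page.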
# sports has 'core-package sports' class
xpath_general = """//div[contains(@class, 'centerpiece-tabs') or
                         contains(@class, 'wrapper') or
                         contains(@class, 'contentGroups') or
                         contains(@class, 'headline-list') or
                         contains(@class, 'core-package sports') or
                         contains(@class, 'news')]
                    //a[contains(@class, 'hdn-analytics')]"""
excluded_titles = ["Winning numbers", "TV-radio listings"]
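

# Return (link, title) if the link should be kept, otherwise None: missing or very
# short titles, stories that belong to another configured section, and explicitly
# excluded titles are all rejected.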
def validate_link(page, link, title):
    other_category = page[1][1]
    if not title or len(title.strip()) < 5:
        print("{0} rejected, title too short".format(link))
        return None
    parts = link.split('/')
    if len(parts) > 3 and parts[3] in other_category:
        print("{0} rejected, covered in other section".format(link))
        return None
    for excluded_title in excluded_titles:
        if title.find(excluded_title) != -1:
            print("{0} rejected, excluded title".format(link))
            return None
    return link, title
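

# Fetch a section index page and parse it into an lxml tree (as_tree=True).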
def get_article_parsed(index_to_soup, this_url):
    return index_to_soup(this_url, as_tree=True)
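

# Rank links by subject: 'news' first, then 'neighborhood', then 'entertainment',
# with everything else last; within a rank, links sort alphabetically by URL.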
def sort_subject(element_list):
    # priority of subjects
    subjects = ['news', 'neighborhood', 'entertainment']
    subjects.reverse()
    subject_dict = OrderedDict(zip(subjects, range(len(subjects))))
    rank_dict = OrderedDict([(rank, []) for rank in range(len(subjects) + 1)])
    for element in element_list:
        try:
            subj = element[0].split('/')[3]
        except Exception:
            subj = 'unknown'
        if subject_dict.get(subj) is not None:
            rank_dict[subject_dict[subj] + 1].append(element)
        else:
            rank_dict[0].append(element)
    # now return in reverse order, sorted
    combined_list = []
    for rank in range(len(subjects), -1, -1):
        article_list = rank_dict[rank]
        article_list.sort()
        combined_list.extend(article_list)
    return combined_list
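

# Collect, validate and rank every article link on one section index page.
# Returns [section id, [(url, title), ...]].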
def get_links_from_section_page(index_to_soup, page):
    page_doc = get_article_parsed(index_to_soup, base_url + page[1][0])
    els = page_doc.xpath(xpath_general)
    element_list = []
    for el in els:
        link = el.get('href').split('?')[0]
        title = el.text
        if title is None or len(title.strip()) < 5:
            link_id = link.split('/')[-1][:-3].split('-')[:-1]
            title = ' '.join(link_id)
        if link[:4] != 'http':
            link = base_url + link
        validated_link = validate_link(page=page, link=link, title=title)
        if validated_link is not None:
            element_list.append(validated_link)
    sorted_element_list = sort_subject(element_list)
    return [page[0], sorted_element_list]
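

# Build an ordered mapping of section id -> article links, keeping only the first
# occurrence of each URL so a story never appears in more than one feed.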
def get_all_links_from_sections(index_to_soup):
    all_sections = []
    article_set = set()
    final_dict = OrderedDict()
    for item in pages.items():
        print("getting links from {0}".format(item[0]))
        all_sections.append(get_links_from_section_page(index_to_soup, item))
    for section in all_sections:
        section_id = section[0]
        article_list = section[1]
        final_dict[section_id] = []
        for article in article_list:
            if article[0] not in article_set:
                article_set.add(article[0])
                final_dict[section_id].append(article)
    return final_dict
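

# The recipe: section indexes are walked by the helpers above, article pages are
# trimmed with keep_only_tags/remove_tags, and per-article metadata is filled in
# by populate_article_metadata below.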
# noinspection PyAbstractClass
class HoustonChronicle(BasicNewsRecipe):
    title = u'The Houston Chronicle'
    description = 'News from Houston, Texas'
    __author__ = 'Dale Furrow'
    language = 'en'
    no_stylesheets = True
    remove_attributes = ['style', 'xmlns']
    remove_empty_feeds = True
    timefmt = '[%a, %d %b %Y]'
    timestampfmt = '%Y%m%d%H%M%S'
    # ignore_duplicate_articles = {'url'}  # defaults to None
    extra_css = '.article_date {display: none}'
    category = 'news, USA'
    masthead_url = 'http://www.chron.com/apple-touch-icon-76x76.png'
    keep_only_tags = [dict(name='div', attrs={'class': ['article-content', 'article-wrap']})]
    remove_tags = [dict(name='div', attrs={'social-title': True}),
                   dict(name='div', attrs={'class':
                                           ['control-panel', 'gallery-overlay-inner',
                                            'most-popular', 'asset-media mos-playlist',
                                            'asset_media asset-media']}),
                   dict(name='li', attrs={'class': ['hst-resgalleryitem taboola-frame hidden',
                                                    'hst-resgalleryitem hidden']}),
                   dict(name='ul', attrs={'class': 'clearfix'})]

    # max_articles_per_feed = 5  # for use in testing
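
    # Build a short plain-text summary from the article's <p> tags: whole sentences
    # are appended until roughly 140 characters, and the result is capped at 300
    # characters plus an ellipsis.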
    def get_article_description_from_doc(self, soup):
        description_chars_break = 140
        description_max_chars = 300
        try:
            els = soup.findAll('p')
            if len(els) > 0:
                out_text = ""
                this_ellipsis = ""
                for el in els:
                    if el is not None:
                        result = []
                        for descendant in el.contents:
                            if isinstance(descendant, NavigableString):
                                result.append(type(u'')(descendant).strip())
                        all_text = u' '.join(result)
                        if len(all_text) > 1:
                            sentences = re.findall(sentence_regex, all_text)
                            if sentences is not None and len(sentences) > 0:
                                for sentence in sentences:
                                    if len(out_text) < description_chars_break:
                                        out_text += sentence + " "
                                    else:
                                        if len(out_text) > description_max_chars:
                                            this_ellipsis = "..."
                                        return out_text[:description_max_chars] + this_ellipsis
                return out_text
            else:
                return "No Article description returned"
        except Exception as ex:
            self.log('Error on Article Description')
            traceback.print_exc(file=sys.stdout)
            print(str(ex))
            return ""
    @staticmethod
    def get_published_time_from_doc(page_doc):
        def get_regular_timestamp(date_string):
            try:
                out_date = datetime.strptime(date_string, "%Y-%m-%dT%H:%M:%SZ")
                return out_date
            except ValueError:
                return None

        # match <time itemprop="datePublished"> (dict-style attrs lookup for BeautifulSoup4)
        el = page_doc.findAll(
            lambda this_tag: this_tag.name == "time" and this_tag.get('itemprop') == 'datePublished')
        if len(el) == 1:
            return get_regular_timestamp(el[0].get('datetime'))
        else:
            return None
    def populate_article_metadata(self, article, soup, first):
        """
        Called when each HTML page belonging to article is downloaded.
        Intended to be used to get article metadata like author/summary/etc.
        from the parsed HTML (soup).
        :param article: An object of class :class:`calibre.web.feeds.Article`.
        If you change the summary, remember to also change the text_summary.
        :param soup: Parsed HTML belonging to this article.
        :param first: True iff the parsed HTML is the first page of the article.
        """
        summary = self.get_article_description_from_doc(soup)
        article_date = self.get_published_time_from_doc(soup)
        if article_date is not None:
            article_timestamp = float((article_date - datetime.utcfromtimestamp(0)).total_seconds())
            article.date = article_timestamp
            article.utctime = dt_factory(article_date.timetuple(), assume_utc=True, as_utc=True)
            article.localtime = article.utctime.astimezone(local_tz)
        summary_date = article.localtime.strftime("%Y-%m-%d %H:%M") if article_date is not None else "No Date"
        article.summary = "{0}: {1}".format(summary_date, summary)
        article.text_summary = clean_ascii_chars(article.summary)
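
    # parse_index drives the download: it walks the configured section index pages
    # and returns (section, article list) feeds for calibre to fetch.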
    def parse_index(self):
        self.timefmt = ' [%a, %d %b, %Y]'
        self.log('starting parse_index: ', time.strftime(self.timestampfmt))
        feeds = []
        sections = get_all_links_from_sections(self.index_to_soup)
        for section_id, article_list in sections.items():
            self.log("Getting {0} section, {1:d} articles".format(section_id, len(article_list)))
            articles = []
            for article_info in article_list:
                self.log("Adding {0} to feed".format(article_info[0]))
                articles.append({'title': article_info[1], 'url': article_info[0],
                                 'description': '', 'date': ""})
            self.log("Appending {0:d} articles for {1}".format(len(articles), section_id))
            feeds.append((section_id, articles))
        self.log('finished parse_index: ', time.strftime(self.timestampfmt))
        return feeds
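
    # No extra per-page cleanup is needed beyond keep_only_tags/remove_tags above.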
    def preprocess_html(self, soup):
        return soup