calibre/recipes/houston_chronicle.recipe
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2018, Dale Furrow dkfurrow@gmail.com'
'''
chron.com
'''
import re
import time
import urllib2
from StringIO import StringIO
from datetime import datetime
import traceback
import sys
from collections import OrderedDict
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.utils.cleantext import clean_ascii_chars
from calibre.ebooks.BeautifulSoup import NavigableString
from calibre.utils.date import dt_factory, local_tz
from lxml import html
from lxml import etree
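
# Regexes, section map, and XPath expression used when scraping chron.com section pages.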
regex_date_only = re.compile(
    r"(?:January|February|March|April|May|June|July|August|September|"
    r"October|November|December)\s[0-9]{1,2},\s20[01][0-9]")
regex_time_only = re.compile(r"""[0-9]{1,2}:[0-9]{1,2} \w{2}""")
sentence_regex = re.compile(r"(\S.+?[.!?])(?=\s+|$)")
blog_regex = re.compile(r'post-\d+')
pages = OrderedDict([('news', ('/news/houston-texas/', ['business', 'sports'])),
                     ('business', ('/business/', ['sports'])),
                     ('sports', ('/sports/', ['business']))])
base_url = "http://www.chron.com"
# sports has 'core-package sports' class
xpath_general = """//div[contains(@class, 'centerpiece-tabs') or
    contains(@class, 'wrapper') or
    contains(@class, 'contentGroups') or
    contains(@class, 'headline-list') or
    contains(@class, 'core-package sports') or
    contains(@class, 'news')]
    //a[contains(@class, 'hdn-analytics')]"""
excluded_titles = ["Winning numbers", "TV-radio listings"]
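

# Reject a candidate link if its title is missing or too short, if it belongs to
# a section already covered elsewhere, or if its title matches an excluded one.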
def validate_link(page, link, title):
    other_category = page[1][1]
    if not title or len(title.strip()) < 5:
        print("{0} rejected, title too short".format(link))
        return None
    if link.split('/')[3] in other_category:
        print("{0} rejected, covered in other section".format(link))
        return None
    for excluded_title in excluded_titles:
        if title.find(excluded_title) != -1:
            print("{0} rejected, excluded title".format(link))
            return None
    return link, title
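

# Fetch a section page and parse it into an lxml document tree.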
def get_article_parsed(this_url):
    page = urllib2.urlopen(this_url)
    content = page.read()
    parser = etree.HTMLParser()
    parsed = html.parse(StringIO(content), parser)
    return parsed
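

# Order (link, title) tuples so that higher-priority subjects come first
# (news, then neighborhood, then entertainment), with unmatched subjects last.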
def sort_subject(element_list):
    # priority of subjects
    subjects = ['news', 'neighborhood', 'entertainment']
    subjects.reverse()
    subject_dict = OrderedDict(zip(subjects, range(len(subjects))))
    rank_dict = OrderedDict([(rank, []) for rank in range(len(subjects) + 1)])
    for element in element_list:
        subj = element[0].split('/')[3]
        if subject_dict.get(subj) is not None:
            rank_dict[subject_dict[subj] + 1].append(element)
        else:
            rank_dict[0].append(element)
    # now return in reverse order, sorted
    combined_list = []
    for rank in range(len(subjects), -1, -1):
        article_list = rank_dict[rank]
        article_list.sort()
        combined_list.extend(article_list)
    return combined_list
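

# Collect article links from one section page: derive a title from the URL slug
# when the anchor text is unusable, validate each link, and sort by subject.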
def get_links_from_section_page(page):
    page_doc = get_article_parsed(base_url + page[1][0])
    els = page_doc.xpath(xpath_general)
    element_list = []
    for el in els:
        link = el.get('href').split('?')[0]
        title = el.text
        if title is None or len(title.strip()) < 5:
            link_id = link.split('/')[-1][:-3].split('-')[:-1]
            title = ' '.join(link_id)
        if link[:4] != 'http':
            link = base_url + link
        validated_link = validate_link(page=page, link=link, title=title)
        if validated_link is not None:
            element_list.append(validated_link)
    sorted_element_list = sort_subject(element_list)
    return [page[0], sorted_element_list]
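

# Gather links for every section, dropping any URL already seen in an earlier
# section so each article appears in only one feed.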
def get_all_links_from_sections():
    all_sections = []
    article_set = set()
    final_dict = OrderedDict()
    for item in pages.items():
        print("getting links from {0}".format(item[0]))
        all_sections.append(get_links_from_section_page(item))
    for section in all_sections:
        section_id = section[0]
        article_list = section[1]
        final_dict[section_id] = []
        for article in article_list:
            if article[0] not in article_set:
                article_set.add(article[0])
                final_dict[section_id].append(article)
    return final_dict


# noinspection PyAbstractClass
class HoustonChronicle(BasicNewsRecipe):
    title = u'The Houston Chronicle'
    description = 'News from Houston, Texas'
    __author__ = 'Dale Furrow'
    language = 'en'
    no_stylesheets = True
    remove_attributes = ['style', 'xmlns']
    remove_empty_feeds = True
    timefmt = '[%a, %d %b %Y]'
    timestampfmt = '%Y%m%d%H%M%S'
    # ignore_duplicate_articles = {'url'}  # defaults to None
    extra_css = '.article_date {display: none}'
    category = 'news, USA'
    masthead_url = 'http://www.chron.com/apple-touch-icon-76x76.png'
    keep_only_tags = [dict(name='div', attrs={'class': ['article-content', 'article-wrap']})]
    remove_tags = [dict(name='div', attrs={'social-title': True}),
                   dict(name='div', attrs={'class':
                        ['control-panel', 'gallery-overlay-inner',
                         'most-popular', 'asset-media mos-playlist',
                         'asset_media asset-media']}),
                   dict(name='li', attrs={'class': ['hst-resgalleryitem taboola-frame hidden',
                                                    'hst-resgalleryitem hidden']}),
                   dict(name='ul', attrs={'class': 'clearfix'})]

    # max_articles_per_feed = 5  # for use in testing
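
    # Build a short plain-text description from the article's <p> tags,
    # accumulating whole sentences until roughly 140 characters and truncating
    # to 300 characters (with an ellipsis when text is cut).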
    def get_article_description_from_doc(self, soup):
        description_chars_break = 140
        description_max_chars = 300
        try:
            els = soup.findAll('p')
            if len(els) > 0:
                out_text = ""
                this_ellipsis = ""
                for el in els:
                    if el is not None:
                        result = []
                        for descendant in el.contents:
                            if isinstance(descendant, NavigableString):
                                result.append(unicode(descendant).strip())
                        all_text = u' '.join(result).encode('utf-8')
                        if len(all_text) > 1:
                            sentences = re.findall(sentence_regex, all_text)
                            if sentences is not None and len(sentences) > 0:
                                for sentence in sentences:
                                    if len(out_text) < description_chars_break:
                                        out_text += sentence + " "
                                    else:
                                        if len(out_text) > description_max_chars:
                                            this_ellipsis = "..."
                                        return out_text[:description_max_chars] + this_ellipsis
                return out_text
            else:
                return "No Article description returned"
        except Exception as ex:
            self.log('Error on Article Description')
            traceback.print_exc(file=sys.stdout)
            print(str(ex))
            return ""
    @staticmethod
    def get_published_time_from_doc(page_doc):
        def get_regular_timestamp(date_string):
            try:
                out_date = datetime.strptime(date_string, "%Y-%m-%dT%H:%M:%SZ")
                return out_date
            except ValueError:
                return None

        el = page_doc.findAll(
            lambda this_tag: this_tag.name == "time" and ('itemprop', 'datePublished') in this_tag.attrs)
        if len(el) == 1:
            return get_regular_timestamp(el[0].get('datetime'))
        else:
            return None

    def populate_article_metadata(self, article, soup, first):
        """
        Called when each HTML page belonging to article is downloaded.
        Intended to be used to get article metadata like author/summary/etc.
        from the parsed HTML (soup).
        :param article: An object of class :class:`calibre.web.feeds.Article`.
        If you change the summary, remember to also change the text_summary.
        :param soup: Parsed HTML belonging to this article
        :param first: True iff the parsed HTML is the first page of the article.
        """
        summary = self.get_article_description_from_doc(soup)
        article_date = self.get_published_time_from_doc(soup)
        if article_date is not None:
            article_timestamp = float((article_date - datetime.utcfromtimestamp(0)).total_seconds())
            article.date = article_timestamp
            article.utctime = dt_factory(article_date.timetuple(), assume_utc=True, as_utc=True)
            article.localtime = article.utctime.astimezone(local_tz)
        summary_date = article.localtime.strftime("%Y-%m-%d %H:%M") if article_date is not None else "No Date"
        article.summary = "{0}: {1}".format(summary_date, summary)
        article.text_summary = clean_ascii_chars(article.summary)
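
    # Build the feed list: one feed per section, each entry carrying the
    # article's URL and title (description and date are filled in later by
    # populate_article_metadata).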
    def parse_index(self):
        self.timefmt = ' [%a, %d %b, %Y]'
        self.log('starting parse_index: ', time.strftime(self.timestampfmt))
        feeds = []
        sections = get_all_links_from_sections()
        for section_id, article_list in sections.items():
            self.log("Getting {0} section, {1:d} articles".format(section_id, len(article_list)))
            articles = []
            for article_info in article_list:
                self.log("Adding {0} to feed".format(article_info[0]))
                articles.append({'title': article_info[1], 'url': article_info[0],
                                 'description': '', 'date': ""})
            self.log("Appending {0:d} articles for {1}".format(len(articles), section_id))
            feeds.append((section_id, articles))
        self.log('finished parse_index: ', time.strftime(self.timestampfmt))
        return feeds
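
    # No extra HTML preprocessing is needed; keep_only_tags and remove_tags
    # already handle the cleanup.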
    def preprocess_html(self, soup):
        return soup