updated houston chronicle recipe

Dale Furrow 2015-07-18 11:06:54 -05:00
parent c3424face0
commit e25dca7651


@@ -1,39 +1,31 @@
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2015, Dale Furrow dkfurrow@gmail.com'
'''
chron.com
'''
import re
import time
from datetime import datetime, timedelta, date
from lxml import html
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.utils.date import dt_factory, local_tz

class HoustonChronicle(BasicNewsRecipe):

    title = u'The Houston Chronicle'
    description = 'News from Houston, Texas'
    __author__ = 'Dale Furrow'
    language = 'en'
    no_stylesheets = True
    remove_attributes = ['style', 'xmlns']
    remove_empty_feeds = True
    timefmt = '[%a, %d %b %Y]'
    timestampfmt = '%Y%m%d%H%M%S'
    ignore_duplicate_articles = {'url'}
    base_url = 'http://www.chron.com'

    oldest_web_article = 7.0
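    # Articles older than oldest_web_article days are skipped during parsing
    # (see get_all_data_feeds_from_page below).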

@@ -42,109 +34,121 @@ class HoustonChronicle(BasicNewsRecipe):
    else:
        earliest_date = date.today() - timedelta(days=oldest_web_article)

    pages = [('news', '/news/houston-texas/'),
             ('business', '/business/'),
             ('opinion', '/opinion/'),
             ('sports', '/sports/')]
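
    # Scrape a section landing page for article links, keeping only anchors
    # with an href, no target, and no child image; returns (url, title)
    # pairs with relative links resolved against base_url.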
    def get_links_from_section_page(self, section_url):
        page_doc = html.parse(section_url)
        els = page_doc.xpath("""//div[contains(@class, 'scp-item')
            or @class='scp-feature' or contains(@class, 'simplelist')
            or contains(@class, 'scp-blogpromo')]
            //a[@href and not(@target) and not(child::img)]""")
        element_list = []
        for el in els:
            link = el.get('href')
            title = el.text
            if link[:4] != 'http':
                link = self.base_url + link
            if title is not None:
                element_list.append((link, title))
        return element_list
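
    # Build a short description by accumulating sentences from the article
    # body: stop once description_chars_break characters are collected, and
    # cap the result at description_max_chars with a trailing ellipsis.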
    def get_article_description_from_doc(self, page_doc):
        description_chars_break = 140
        description_max_chars = 300
        desc_xpath = """//div[contains(@class, 'article-body') or
            contains(@class, 'resource-content') or contains(@class, 'post')]//p"""
        sentence_regex = re.compile(r"(\S.+?[.!?])(?=\s+|$)")

        def stringify_children(node):
            return ''.join([x for x in node.itertext()])

        try:
            els = page_doc.xpath(desc_xpath)
            out_text = ""
            ellipsis = ""
            for el in els:
                sentences = re.findall(sentence_regex, stringify_children(el))
                for sentence in sentences:
                    if len(out_text) < description_chars_break:
                        out_text += sentence + " "
                    else:
                        if len(out_text) > description_max_chars:
                            ellipsis = "..."
                        return out_text[:description_max_chars] + ellipsis
            return out_text
        except:
            self.log('Error on Article Description')
            return ""
    def get_published_time_from_doc(self, page_doc):
        # The month alternation must stay on one line: splitting a
        # non-verbose regex across indented lines embeds the leading
        # whitespace in the pattern and breaks matching.
        regex_date_only = re.compile(
            r"(?:January|February|March|April|May|June|July|August|"
            r"September|October|November|December)"
            r"\s[0-9]{1,2},\s20[01][0-9]")
        regex_time_only = re.compile(r"[0-9]{1,2}:[0-9]{1,2} \w{2}")

        def get_regular_timestamp(date_string):
            try:
                out_date = datetime.strptime(date_string, "%Y-%m-%dT%H:%M:%SZ")
                return out_date
            except:
                return None

        def get_date_from_string(in_text):
            match = re.findall(regex_date_only, in_text)
            if match:
                try:
                    out_date = datetime.strptime(match[0], "%B %d, %Y")
                    match = re.findall(regex_time_only, in_text)
                    if match:
                        out_time = datetime.strptime(match[0], "%I:%M %p")
                        return datetime.combine(out_date.date(), out_time.time())
                    return out_date
                except:
                    return None

        el = page_doc.xpath("//*[@class='timestamp'][1]")
        if len(el) == 1:
            return get_regular_timestamp(el[0].get('title'))
        else:
            el = page_doc.xpath("//*[@class='entry-date' or @class='post-date'][1]")
            if len(el) == 1:
                return get_date_from_string(el[0].text_content())
            else:
                return None
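
    # Fetch every link found on a section page and build calibre's article
    # dicts; the formatted timestamp is stashed in the 'author' field so
    # populate_article_metadata can recover the publication time later.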
    def get_all_data_feeds_from_page(self, page):
        articles = []
        exclude_titles_with = ['Winning numbers']

        def title_excluded(title):
            for text in exclude_titles_with:
                if title.find(text) != -1:
                    return True
            return False

        link_list = self.get_links_from_section_page(self.base_url + page[1])
        self.log('from section: ', page[0], " found ", len(link_list), " links")
        for link in link_list:
            try:
                article_doc = html.parse(link[0])
                description = self.get_article_description_from_doc(article_doc)
                article_date = self.get_published_time_from_doc(article_doc)
                if article_date is not None and description is not None \
                        and article_date.date() > self.earliest_date \
                        and not title_excluded(link[1]):
                    date_text = article_date.strftime('%a, %d %b')
                    author = article_date.strftime(self.timestampfmt)
                    articles.append({'title': link[1], 'url': link[0],
                                     'description': description,
                                     'date': date_text, 'author': author})
                    self.log(page[0] + ": " + link[1] + ', from ' + date_text +
                             " description of " + str(len(description)) +
                             ' characters at ' + link[0])
                else:
                    if article_date is None:
                        msg = " No Timestamp Found"
                    elif title_excluded(link[1]):
                        msg = " Title Excluded"
                    else:
                        msg = " article older than " + str(self.oldest_web_article) + ' days...'
                    self.log("Skipping article: ", link[0], msg)

@@ -156,37 +160,63 @@ class HoustonChronicle(BasicNewsRecipe):
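    # parse_index is the standard calibre recipe entry point; it returns a
    # list of (section title, article list) tuples.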
    def parse_index(self):
        self.timefmt = ' [%a, %d %b, %Y]'
        self.log('starting parse_index: ', time.strftime(self.timestampfmt))
        feeds = []
        for page in self.pages:
            articles = self.get_all_data_feeds_from_page(page)
            if articles:
                feeds.append((page[0], articles))
        self.log('finished parse_index: ', time.strftime(self.timestampfmt))
        return feeds
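
    # Whitelist-style cleanup: collect the tags that make up the story or
    # blog-post body, together with their descendants and ancestors, then
    # extract every other tag from the soup.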
    def preprocess_html(self, soup):
        tags_to_exclude = [('class', "caption staged"), ('style', "display:none")]
        story_tag = soup.find(name='div', attrs={'class': 'article-content'})
        blog_tag = soup.find(name='div', attrs={'id': re.compile(r'post-\d+')})

        def is_excluded(tag_to_check):
            for attr in tag_to_check.attrs:
                if attr in tags_to_exclude:
                    return True
            return False

        def get_attr_startswith(attrs, this_key, this_valuestart):
            starts_with = False
            for attr in attrs:
                if attr[0] == this_key:
                    if attr[1].startswith(this_valuestart):
                        starts_with = True
            return starts_with

        base_tags = []
        if story_tag is not None:
            base_tags = story_tag.findAll(
                lambda this_tag: (this_tag.name == "p"
                                  and ('class', 'open') not in this_tag.attrs
                                  and ('class', 'close') not in this_tag.attrs)
                or this_tag.name.startswith('h') or this_tag.name == 'table'
                or (this_tag.name == 'li'
                    and ('class', 'hst-resgalleryitem') in this_tag.attrs))
        if blog_tag is not None:
            base_tags = blog_tag.findAll(
                lambda this_tag: (this_tag.name == "p"
                                  or this_tag.name.startswith('h'))
                or (this_tag.name == "span"
                    and get_attr_startswith(this_tag.attrs, 'class', 'post'))
                or (this_tag.name == 'img'
                    and ('lazy-state', 'loaded') in this_tag.attrs))
        self.log('content tags: ' + str(type(base_tags)) + str(len(base_tags)))
        all_tags = []
        all_tags.extend(base_tags)
        if len(base_tags) > 0:
            for tag in base_tags:
                all_tags.extend(tag.findAll(True))
            for tag in base_tags:
                while tag.parent is not None and not is_excluded(tag):
                    all_tags.append(tag)
                    tag = tag.parent

        for tag in soup.findAll(True):
            if tag not in all_tags:
                tag.extract()
        return soup
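
    # Recover the timestamp stashed in the 'author' field and convert it to
    # the article's local publication time via calibre's date utilities.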
    def populate_article_metadata(self, article, soup, first):
        if not first:

@@ -195,12 +225,7 @@ class HoustonChronicle(BasicNewsRecipe):
            article.date = time.strptime(article.author, self.timestampfmt)
            article.utctime = dt_factory(article.date, assume_utc=False, as_utc=False)
            article.localtime = article.utctime.astimezone(local_tz)
        except Exception as inst:
            self.log('Exception: ', article.title)
            self.log(type(inst))
            self.log(inst)