Changes to Houston Chronicle Recipe

This commit is contained in:
Dale Furrow 2016-01-28 14:41:33 -06:00
parent e25dca7651
commit 495be4bbc8

View File

@@ -1,7 +1,7 @@
#!/usr/bin/env python2 #!/usr/bin/env python2
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
__license__ = 'GPL v3' __license__ = 'GPL v3'
__copyright__ = '2015, Dale Furrow dkfurrow@gmail.com' __copyright__ = '2016, Dale Furrow dkfurrow@gmail.com'
''' '''
chron.com chron.com
''' '''
@@ -10,7 +10,6 @@ import time
from datetime import datetime, timedelta, date from datetime import datetime, timedelta, date
from lxml import html from lxml import html
from calibre.web.feeds.recipes import BasicNewsRecipe from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.utils.date import dt_factory, local_tz
class HoustonChronicle(BasicNewsRecipe): class HoustonChronicle(BasicNewsRecipe):
@@ -24,8 +23,7 @@ class HoustonChronicle(BasicNewsRecipe):
timefmt = '[%a, %d %b %Y]' timefmt = '[%a, %d %b %Y]'
timestampfmt = '%Y%m%d%H%M%S' timestampfmt = '%Y%m%d%H%M%S'
ignore_duplicate_articles = {'url'} ignore_duplicate_articles = {'url'}
extra_css = '.article_date {display: none}'
base_url = 'http://www.chron.com'
oldest_web_article = 7.0 oldest_web_article = 7.0
@@ -36,15 +34,16 @@ class HoustonChronicle(BasicNewsRecipe):
pages = [('news', '/news/houston-texas/'), pages = [('news', '/news/houston-texas/'),
('business', '/business/'), ('business', '/business/'),
('opinion', '/opinion/'),
('sports', '/sports/')] ('sports', '/sports/')]
base_url = "http://www.chron.com"
xpath_str = """//div[contains(@class, 'news') or contains(@class, 'headline-list')]
//*[self::h4 or self::h5]//a[contains(@class, 'hdn-analytics')]"""
def get_links_from_section_page(self, section_url): def get_links_from_section_page(self, section_url):
page_doc = html.parse(section_url) page_doc = html.parse(self.base_url + section_url)
els = page_doc.xpath("""//div[contains(@class, 'scp-item') els = page_doc.xpath(self.xpath_str)
or @class='scp-feature' or contains(@class, 'simplelist')
or contains(@class, 'scp-blogpromo')]
//a[@href and not(@target) and not(child::img)]""")
element_list = [] element_list = []
for el in els: for el in els:
link = el.get('href') link = el.get('href')
@@ -129,23 +128,25 @@ class HoustonChronicle(BasicNewsRecipe):
return True return True
return False return False
link_list = self.get_links_from_section_page(self.base_url + page[1]) link_list = self.get_links_from_section_page(page[1])
self.log('from section: ', page[0], " found ", len(link_list), " links") self.log('from section: ', page[0], " found ", len(link_list), " links")
for link in link_list: for link in link_list:
try: try:
article_doc = html.parse(link[0]) article_doc = html.parse(link[0])
description = self.get_article_description_from_doc(article_doc) description = self.get_article_description_from_doc(article_doc)
article_date = self.get_published_time_from_doc(article_doc) parsed_date = self.get_published_time_from_doc(article_doc)
if article_date is not None and description is not None and article_date.date() > self.earliest_date \ if parsed_date is not None and description is not None and \
and not title_excluded(link[1]): parsed_date.date() > self.earliest_date and \
date_text = article_date.strftime('%a, %d %b') not title_excluded(link[1]):
author = article_date.strftime(self.timestampfmt) intro_date = parsed_date.strftime('%d %b %H:%M') + " - "
author = parsed_date.strftime(self.timestampfmt)
articles.append({'title': link[1], 'url': link[0], articles.append({'title': link[1], 'url': link[0],
'description': description, 'date': date_text, 'author': author}) 'description': intro_date + description,
self.log(page[0] + ": " + link[1] + ', from ' + date_text + 'date': ""})
self.log(page[0] + ": " + link[1] + ', from ' + intro_date +
" description of " + str(len(description)) + ' characters at ' + link[0]) " description of " + str(len(description)) + ' characters at ' + link[0])
else: else:
if article_date is None: if parsed_date is None:
msg = " No Timestamp Found" msg = " No Timestamp Found"
elif title_excluded(link[1]): elif title_excluded(link[1]):
msg = " Title Excluded" msg = " Title Excluded"
@@ -171,7 +172,7 @@ class HoustonChronicle(BasicNewsRecipe):
def preprocess_html(self, soup): def preprocess_html(self, soup):
tags_to_exclude = [('class', "caption staged"), ('style', "display:none")] tags_to_exclude = [('class', "caption staged"), ('style', "display:none")]
story_tag = soup.find(name='div', attrs={'class': 'article-content'}) story_tag = soup.find(name='div', attrs={'class': ['article-content', 'article-body']})
blog_tag = soup.find(name='div', attrs={'id': re.compile('post-\d+')}) blog_tag = soup.find(name='div', attrs={'id': re.compile('post-\d+')})
def is_excluded(tag_to_check): def is_excluded(tag_to_check):
@@ -200,7 +201,8 @@ class HoustonChronicle(BasicNewsRecipe):
base_tags = blog_tag.findAll(lambda this_tag: (this_tag.name == "p" or this_tag.name.startswith('h')) base_tags = blog_tag.findAll(lambda this_tag: (this_tag.name == "p" or this_tag.name.startswith('h'))
or (this_tag.name == "span" or (this_tag.name == "span"
and get_attr_startswith(this_tag.attrs, 'class', 'post')) and get_attr_startswith(this_tag.attrs, 'class', 'post'))
or (this_tag.name == 'img' and ('lazy-state', 'loaded') in this_tag.attrs)) or (this_tag.name == 'img' and (
'lazy-state', 'loaded') in this_tag.attrs))
self.log('content tags: ' + str(type(base_tags)) + str(len(base_tags))) self.log('content tags: ' + str(type(base_tags)) + str(len(base_tags)))
all_tags = [] all_tags = []
@@ -217,15 +219,3 @@ class HoustonChronicle(BasicNewsRecipe):
if tag not in all_tags: if tag not in all_tags:
tag.extract() tag.extract()
return soup return soup
def populate_article_metadata(self, article, soup, first):
if not first:
return
try:
article.date = time.strptime(article.author, self.timestampfmt)
article.utctime = dt_factory(article.date, assume_utc=False, as_utc=False)
article.localtime = article.utctime.astimezone(local_tz)
except Exception as inst:
self.log('Exception: ', article.title)
self.log(type(inst))
self.log(inst)