mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-07-09 03:04:10 -04:00)
updated houston chronicle recipe

parent c3424face0
commit e25dca7651
@@ -1,39 +1,31 @@
 #!/usr/bin/env python2
 # -*- coding: utf-8 -*-
 __license__ = 'GPL v3'
-__copyright__ = '2013, Dale Furrow dkfurrow@gmail.com'
+__copyright__ = '2015, Dale Furrow dkfurrow@gmail.com'
 '''
 chron.com
 '''
-import re, time
-from calibre.web.feeds.recipes import BasicNewsRecipe
-from calibre.utils.date import dt_factory, local_tz
+import re
+import time
 from datetime import datetime, timedelta, date
 from lxml import html
+from calibre.web.feeds.recipes import BasicNewsRecipe
+from calibre.utils.date import dt_factory, local_tz
 
 
 class HoustonChronicle(BasicNewsRecipe):
-
     title = u'The Houston Chronicle'
     description = 'News from Houston, Texas'
     __author__ = 'Dale Furrow'
     language = 'en'
     no_stylesheets = True
-    # use_embedded_content = False
-    remove_attributes = ['style']
+    remove_attributes = ['style', 'xmlns']
     remove_empty_feeds = True
     timefmt = '[%a, %d %b %Y]'
     timestampfmt = '%Y%m%d%H%M%S'
     ignore_duplicate_articles = {'url'}
-    remove_attributes = ['xmlns']
 
-    remove_tags = [dict(name='div', attrs={'class':'socialBar'}),
-                   dict(name='div', attrs={'class':re.compile('post-commentmeta')}),
-                   dict(name='div', attrs={'class':re.compile('slideshow_wrapper')}),
-                   dict(name='div', attrs={'class':'entry-summary'}),
-                   dict(name='a', attrs={'rel':'item-license'})]
-
-    baseUrl = 'http://www.chron.com'
+    base_url = 'http://www.chron.com'
 
     oldest_web_article = 7.0
 
@@ -42,109 +34,121 @@ class HoustonChronicle(BasicNewsRecipe):
     else:
         earliest_date = date.today() - timedelta(days=oldest_web_article)
 
-    pages = [('news' , '/news/houston-texas/'),
-             ('business' , '/business/'),
+    pages = [('news', '/news/houston-texas/'),
+             ('business', '/business/'),
              ('opinion', '/opinion/'),
              ('sports', '/sports/')]
 
-    def getLinksFromSectionPage(self, sectionUrl):
-        pageDoc = html.parse(sectionUrl)
-        els = pageDoc.xpath("""//div[contains(@class, 'scp-item')
+    def get_links_from_section_page(self, section_url):
+        page_doc = html.parse(section_url)
+        els = page_doc.xpath("""//div[contains(@class, 'scp-item')
         or @class='scp-feature' or contains(@class, 'simplelist')
         or contains(@class, 'scp-blogpromo')]
         //a[@href and not(@target) and not(child::img)]""")
-        elList = []
+        element_list = []
         for el in els:
             link = el.get('href')
             title = el.text
             if link[:4] != 'http':
-                link = self.baseUrl + link
+                link = self.base_url + link
             if title is not None:
-                elList.append((link, el.text))
-        return elList
+                element_list.append((link, el.text))
+        return element_list
 
-    def getArticleDescriptionFromDoc(self, pageDoc):
-        descriptionCharsBreak = 140
-        descriptionMaxChars = 300
-        descXpath = """//div[contains(@class, 'article-body') or
+    def get_article_description_from_doc(self, page_doc):
+        description_chars_break = 140
+        description_max_chars = 300
+        desc_xpath = """//div[contains(@class, 'article-body') or
         contains(@class, 'resource-content') or contains(@class, 'post')]//p"""
-        sentenceRegex = re.compile("(\S.+?[.!?])(?=\s+|$)")
+        sentence_regex = re.compile("(\S.+?[.!?])(?=\s+|$)")
+
         def stringify_children(node):
             return ''.join([x for x in node.itertext()])
 
         try:
-            els = pageDoc.xpath(descXpath)
-            outText = ""
+            els = page_doc.xpath(desc_xpath)
+            out_text = ""
             ellipsis = ""
             for el in els:
-                sentences = re.findall(sentenceRegex, stringify_children(el))
+                sentences = re.findall(sentence_regex, stringify_children(el))
                 for sentence in sentences:
-                    if len(outText) < descriptionCharsBreak:
-                        outText += sentence + " "
+                    if len(out_text) < description_chars_break:
+                        out_text += sentence + " "
                     else:
-                        if len(outText) > descriptionMaxChars:
+                        if len(out_text) > description_max_chars:
                             ellipsis = "..."
-                        return outText[:descriptionMaxChars] + ellipsis
-            return outText
+                        return out_text[:description_max_chars] + ellipsis
+            return out_text
         except:
             self.log('Error on Article Description')
             return ""
 
-    def getPublishedTimeFromDoc(self, pageDoc):
-        regexDateOnly = re.compile("""(?:January|February|March|April|
+    def get_published_time_from_doc(self, page_doc):
+        regex_date_only = re.compile("""(?:January|February|March|April|
         May|June|July|August|September|October|November|
         December)\s[0-9]{1,2},\s20[01][0-9]""")
-        regextTimeOnly = re.compile("""[0-9]{1,2}:[0-9]{1,2} \w{2}""")
-        def getRegularTimestamp(dateString):
+        regex_time_only = re.compile("""[0-9]{1,2}:[0-9]{1,2} \w{2}""")
+
+        def get_regular_timestamp(date_string):
             try:
-                outDate = datetime.strptime(dateString, "%Y-%m-%dT%H:%M:%SZ")
-                return outDate
+                out_date = datetime.strptime(date_string, "%Y-%m-%dT%H:%M:%SZ")
+                return out_date
             except:
                 return None
-        def getDateFromString(inText):
-            match = re.findall(regexDateOnly, inText)
+
+        def get_date_from_string(in_text):
+            match = re.findall(regex_date_only, in_text)
             if match:
                 try:
-                    outDate = datetime.strptime(match[0], "%B %d, %Y")
-                    match = re.findall(regextTimeOnly, inText)
+                    out_date = datetime.strptime(match[0], "%B %d, %Y")
+                    match = re.findall(regex_time_only, in_text)
                     if match:
-                        outTime = datetime.strptime(match[0], "%I:%M %p")
-                        return datetime.combine(outDate.date(), outTime.time())
-                    return outDate
+                        out_time = datetime.strptime(match[0], "%I:%M %p")
+                        return datetime.combine(out_date.date(), out_time.time())
+                    return out_date
                 except:
                     return None
-            else:
-                return None
-        el = pageDoc.xpath("//*[@class='timestamp'][1]")
+
+        el = page_doc.xpath("//*[@class='timestamp'][1]")
         if len(el) == 1:
-            return getRegularTimestamp(el[0].get('title'))
+            return get_regular_timestamp(el[0].get('title'))
         else:
-            el = pageDoc.xpath("//*[@class='entry-date' or @class='post-date'][1]")
+            el = page_doc.xpath("//*[@class='entry-date' or @class='post-date'][1]")
             if len(el) == 1:
-                return getDateFromString(el[0].text_content())
+                return get_date_from_string(el[0].text_content())
             else:
                 return None
 
-    def getAllFeedDataFromPage(self, page):
+    def get_all_data_feeds_from_page(self, page):
         articles = []
-        linkList = self.getLinksFromSectionPage(self.baseUrl + page[1])
-        self.log('from section: ', page[0], " found ", len(linkList), " links")
-        for link in linkList:
+        exclude_titles_with = ['Winning numbers']
+
+        def title_excluded(title):
+            for text in exclude_titles_with:
+                if title.find(text) != -1:
+                    return True
+            return False
+
+        link_list = self.get_links_from_section_page(self.base_url + page[1])
+        self.log('from section: ', page[0], " found ", len(link_list), " links")
+        for link in link_list:
             try:
-                articleDoc = html.parse(link[0])
-                description = self.getArticleDescriptionFromDoc(articleDoc)
-                articleDate = self.getPublishedTimeFromDoc(articleDoc)
-                if articleDate is not None and description is not None and articleDate.date() > self.earliest_date:
-                    dateText = articleDate.strftime('%a, %d %b')
-                    author = articleDate.strftime(self.timestampfmt)
-                    articles.append({'title':link[1], 'url':link[0],
-                                     'description':description, 'date':dateText, 'author':author})
-                    self.log(page[0] + ": " + link[1] + ', from ' + dateText +
-                             " description of " + str(len(description)) + ' characters at ' + link[0])
+                article_doc = html.parse(link[0])
+                description = self.get_article_description_from_doc(article_doc)
+                article_date = self.get_published_time_from_doc(article_doc)
+                if article_date is not None and description is not None and article_date.date() > self.earliest_date \
+                        and not title_excluded(link[1]):
+                    date_text = article_date.strftime('%a, %d %b')
+                    author = article_date.strftime(self.timestampfmt)
+                    articles.append({'title': link[1], 'url': link[0],
+                                     'description': description, 'date': date_text, 'author': author})
+                    self.log(page[0] + ": " + link[1] + ', from ' + date_text +
+                             " description of " + str(len(description)) + ' characters at ' + link[0])
                 else:
-                    msg = ""
-                    if articleDate is None:
+                    if article_date is None:
                         msg = " No Timestamp Found"
+                    elif title_excluded(link[1]):
+                        msg = " Title Excluded"
                     else:
                         msg = " article older than " + str(self.oldest_web_article) + ' days...'
                     self.log("Skipping article: ", link[0], msg)
@@ -156,37 +160,63 @@ class HoustonChronicle(BasicNewsRecipe):
     def parse_index(self):
 
         self.timefmt = ' [%a, %d %b, %Y]'
         self.log('starting parse_index: ', time.strftime(self.timestampfmt))
        feeds = []
         for page in self.pages:
-            articles = []
-            articles = self.getAllFeedDataFromPage(page)
+            articles = self.get_all_data_feeds_from_page(page)
             if articles:
                 feeds.append((page[0], articles))
         self.log('finished parse_index: ', time.strftime(self.timestampfmt))
         return feeds
 
-    def preprocess_html(self, thisSoup):
-        baseTags = []
-        baseTags.extend(thisSoup.findAll(name='div', attrs={'id':re.compile('post-\d+')}))
-        baseTags.extend(thisSoup.findAll(name='div', attrs={'class':'hnews hentry item'}))
-        allTags = []
-        allTags.extend(baseTags)
-        if len(baseTags) > 0:
-            for tag in baseTags:
-                allTags.extend(tag.findAll(True))
-        paragraphs = thisSoup.findAll(name='p')
-        for paragraph in paragraphs:
-            if paragraph not in allTags:
-                allTags.append(paragraph)
-        for tag in baseTags:
-            while tag.parent is not None:
-                allTags.append(tag)
+    def preprocess_html(self, soup):
+        tags_to_exclude = [('class', "caption staged"), ('style', "display:none")]
+        story_tag = soup.find(name='div', attrs={'class': 'article-content'})
+        blog_tag = soup.find(name='div', attrs={'id': re.compile('post-\d+')})
+
+        def is_excluded(tag_to_check):
+            for attr in tag_to_check.attrs:
+                if attr in tags_to_exclude:
+                    return True
+            return False
+
+        def get_attr_startswith(attrs, this_key, this_valuestart):
+            starts_with = False
+            for attr in attrs:
+                if attr[0] == this_key:
+                    if attr[1].startswith(this_valuestart):
+                        starts_with = True
+            return starts_with
+
+        base_tags = []
+        if story_tag is not None:
+            base_tags = story_tag.findAll(lambda this_tag: (this_tag.name == "p"
+                                          and not ('class', 'open') in this_tag.attrs
+                                          and not ('class', 'close') in this_tag.attrs)
+                                          or this_tag.name.startswith('h') or this_tag.name == 'table'
+                                          or (this_tag.name == 'li'
+                                          and ('class', 'hst-resgalleryitem') in this_tag.attrs))
+        if blog_tag is not None:
+            base_tags = blog_tag.findAll(lambda this_tag: (this_tag.name == "p" or this_tag.name.startswith('h'))
+                                         or (this_tag.name == "span"
+                                         and get_attr_startswith(this_tag.attrs, 'class', 'post'))
+                                         or (this_tag.name == 'img' and ('lazy-state', 'loaded') in this_tag.attrs))
+
+        self.log('content tags: ' + str(type(base_tags)) + str(len(base_tags)))
+        all_tags = []
+        all_tags.extend(base_tags)
+        if len(base_tags) > 0:
+            for tag in base_tags:
+                all_tags.extend(tag.findAll(True))
+
+        for tag in base_tags:
+            while tag.parent is not None and not is_excluded(tag):
+                all_tags.append(tag)
                 tag = tag.parent
-        for tag in thisSoup.findAll(True):
-            if tag not in allTags:
+        for tag in soup.findAll(True):
+            if tag not in all_tags:
                 tag.extract()
-        return thisSoup
+        return soup
 
     def populate_article_metadata(self, article, soup, first):
         if not first:
@@ -195,12 +225,7 @@ class HoustonChronicle(BasicNewsRecipe):
             article.date = time.strptime(article.author, self.timestampfmt)
             article.utctime = dt_factory(article.date, assume_utc=False, as_utc=False)
             article.localtime = article.utctime.astimezone(local_tz)
-        except Exception as inst:  # remove after debug
-            self.log('Exception: ', article.title)  # remove after debug
-            self.log(type(inst))  # remove after debug
-            self.log(inst)  # remove after debug
-
-
-
-
-
+        except Exception as inst:
+            self.log('Exception: ', article.title)
+            self.log(type(inst))
+            self.log(inst)
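
After the rename, get_published_time_from_doc still recovers a timestamp either from an ISO-formatted 'timestamp' attribute or, failing that, from a "Month DD, YYYY" date plus an optional "H:MM am/pm" time found in the element text. A minimal standalone sketch of that second path follows, using only the standard library; the condensed one-line regexes and the sample string are illustrative, not taken verbatim from the recipe.

# Standalone sketch only: mirrors the recipe's get_date_from_string helper.
import re
from datetime import datetime

# Condensed one-line versions of the recipe's date and time patterns.
regex_date_only = re.compile(
    r"(?:January|February|March|April|May|June|July|August|September|"
    r"October|November|December)\s[0-9]{1,2},\s20[01][0-9]")
regex_time_only = re.compile(r"[0-9]{1,2}:[0-9]{1,2} \w{2}")


def get_date_from_string(in_text):
    # Find a date such as "March 3, 2015"; without one, give up.
    match = re.findall(regex_date_only, in_text)
    if not match:
        return None
    try:
        out_date = datetime.strptime(match[0], "%B %d, %Y")
        # Fold in a clock time such as "5:30 pm" when one is present.
        match = re.findall(regex_time_only, in_text)
        if match:
            out_time = datetime.strptime(match[0], "%I:%M %p")
            return datetime.combine(out_date.date(), out_time.time())
        return out_date
    except ValueError:
        return None


print(get_date_from_string("Updated 5:30 pm, Tuesday, March 3, 2015"))
# prints: 2015-03-03 17:30:00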
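The description builder behaves the same way before and after the rename: get_article_description_from_doc accumulates whole sentences from the article paragraphs until it passes 140 characters, then caps the result at 300 characters with a trailing ellipsis. Below is a self-contained sketch of that truncation run against an inline HTML snippet; the snippet and the shortened XPath are illustrative assumptions, not chron.com markup.

# Standalone sketch only: the sentence-accumulation logic from the recipe.
import re
from lxml import html

description_chars_break = 140
description_max_chars = 300
sentence_regex = re.compile(r"(\S.+?[.!?])(?=\s+|$)")


def describe(page_doc):
    out_text = ""
    ellipsis = ""
    for el in page_doc.xpath("//div[contains(@class, 'article-body')]//p"):
        # Same sentence splitter as the recipe, applied to the element's text.
        sentences = re.findall(sentence_regex, ''.join(el.itertext()))
        for sentence in sentences:
            if len(out_text) < description_chars_break:
                out_text += sentence + " "
            else:
                if len(out_text) > description_max_chars:
                    ellipsis = "..."
                return out_text[:description_max_chars] + ellipsis
    return out_text


doc = html.fromstring("<div class='article-body'>"
                      "<p>First sentence. Second sentence? Third one!</p></div>")
print(describe(doc))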