#!/usr/bin/env python2
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2016, Dale Furrow dkfurrow@gmail.com'
'''
chron.com
'''
import re
import time
from datetime import datetime, timedelta, date
from lxml import html
from calibre.web.feeds.recipes import BasicNewsRecipe

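# This recipe builds its article list by scraping chron.com section index
# pages directly (news, business, sports) rather than relying on RSS feeds.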
class HoustonChronicle(BasicNewsRecipe):

    title = u'The Houston Chronicle'
    description = 'News from Houston, Texas'
    __author__ = 'Dale Furrow'
    language = 'en'
    no_stylesheets = True
    remove_attributes = ['style', 'xmlns']
    remove_empty_feeds = True
    timefmt = '[%a, %d %b %Y]'
    timestampfmt = '%Y%m%d%H%M%S'
    ignore_duplicate_articles = {'url'}
    extra_css = '.article_date {display: none}'

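    # Maximum age, in days, of articles to keep; anything published before
    # earliest_date is skipped when the article list is assembled.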
    oldest_web_article = 7.0

    if oldest_web_article is None:
        earliest_date = date.today()
    else:
        earliest_date = date.today() - timedelta(days=oldest_web_article)

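    # Section pages to scrape, as (feed title, path relative to base_url) pairs.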
    pages = [('news', '/news/houston-texas/'),
             ('business', '/business/'),
             ('sports', '/sports/')]

    base_url = "http://www.chron.com"

    xpath_str = """//div[contains(@class, 'news') or contains(@class, 'headline-list')]
        //*[self::h4 or self::h5]//a[contains(@class, 'hdn-analytics')]"""

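    # Collect (url, title) pairs from a section index page by applying
    # xpath_str to the parsed page; relative links are made absolute.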
    def get_links_from_section_page(self, section_url):
        page_doc = html.parse(self.base_url + section_url)
        els = page_doc.xpath(self.xpath_str)
        element_list = []
        for el in els:
            link = el.get('href')
            title = el.text
            if link[:4] != 'http':
                link = self.base_url + link
            if title is not None:
                element_list.append((link, el.text))
        return element_list

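    # Build a short plain-text description for an article by joining whole
    # sentences from its body paragraphs until roughly 140 characters are
    # collected, truncating at 300 characters with an ellipsis.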
    def get_article_description_from_doc(self, page_doc):
        description_chars_break = 140
        description_max_chars = 300
        desc_xpath = """//div[contains(@class, 'article-body') or
            contains(@class, 'resource-content') or contains(@class, 'post')]//p"""
        sentence_regex = re.compile("(\S.+?[.!?])(?=\s+|$)")

        def stringify_children(node):
            return ''.join([x for x in node.itertext()])

        try:
            els = page_doc.xpath(desc_xpath)
            out_text = ""
            ellipsis = ""
            for el in els:
                sentences = re.findall(sentence_regex, stringify_children(el))
                for sentence in sentences:
                    if len(out_text) < description_chars_break:
                        out_text += sentence + " "
                    else:
                        if len(out_text) > description_max_chars:
                            ellipsis = "..."
                        return out_text[:description_max_chars] + ellipsis
            return out_text
        except:
            self.log('Error on Article Description')
            return ""

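    # Extract the article's publication time: prefer an ISO timestamp in the
    # 'title' attribute of a .timestamp element, falling back to parsing a
    # human-readable date/time from .entry-date or .post-date text.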
    def get_published_time_from_doc(self, page_doc):
        # Month names are kept in a single-line alternation; splitting the
        # pattern across lines would embed whitespace in the regex and stop
        # some month names from matching.
        regex_date_only = re.compile(
            "(?:January|February|March|April|May|June|July|August|"
            "September|October|November|December)"
            "\s[0-9]{1,2},\s20[01][0-9]")
        regex_time_only = re.compile("[0-9]{1,2}:[0-9]{1,2} \w{2}")

        def get_regular_timestamp(date_string):
            try:
                out_date = datetime.strptime(date_string, "%Y-%m-%dT%H:%M:%SZ")
                return out_date
            except:
                return None

        def get_date_from_string(in_text):
            match = re.findall(regex_date_only, in_text)
            if match:
                try:
                    out_date = datetime.strptime(match[0], "%B %d, %Y")
                    match = re.findall(regex_time_only, in_text)
                    if match:
                        out_time = datetime.strptime(match[0], "%I:%M %p")
                        return datetime.combine(out_date.date(), out_time.time())
                    return out_date
                except:
                    return None

        el = page_doc.xpath("//*[@class='timestamp'][1]")
        if len(el) == 1:
            return get_regular_timestamp(el[0].get('title'))
        else:
            el = page_doc.xpath(
                "//*[@class='entry-date' or @class='post-date'][1]")
            if len(el) == 1:
                return get_date_from_string(el[0].text_content())
            else:
                return None

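    # Assemble the article list for one section: fetch every linked article,
    # build its description and timestamp, and keep it only if it is recent
    # enough and its title is not excluded.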
    def get_all_data_feeds_from_page(self, page):
        articles = []
        exclude_titles_with = ['Winning numbers']

        def title_excluded(title):
            for text in exclude_titles_with:
                if title.find(text) != -1:
                    return True
            return False

        link_list = self.get_links_from_section_page(page[1])
        self.log('from section: ', page[0],
                 " found ", len(link_list), " links")
        for link in link_list:
            try:
                article_doc = html.parse(link[0])
                description = self.get_article_description_from_doc(
                    article_doc)
                parsed_date = self.get_published_time_from_doc(article_doc)
                if parsed_date is not None and description is not None and \
                        parsed_date.date() > self.earliest_date and \
                        not title_excluded(link[1]):
                    intro_date = parsed_date.strftime('%d %b %H:%M') + " - "
                    articles.append({'title': link[1], 'url': link[0],
                                     'description': intro_date + description,
                                     'date': ""})
                    self.log(page[0] + ": " + link[1] + ', from ' + intro_date +
                             " description of " + str(len(description)) + ' characters at ' + link[0])
                else:
                    if parsed_date is None:
                        msg = " No Timestamp Found"
                    elif title_excluded(link[1]):
                        msg = " Title Excluded"
                    else:
                        msg = " article older than " + \
                            str(self.oldest_web_article) + ' days...'
                    self.log("Skipping article: ", link[0], msg)
            except:
                print 'error on fetching ' + link[0]
                continue
        return articles

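    # Standard calibre entry point: build one feed per entry in self.pages.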
    def parse_index(self):
        self.timefmt = ' [%a, %d %b, %Y]'
        self.log('starting parse_index: ', time.strftime(self.timestampfmt))
        feeds = []
        for page in self.pages:
            articles = self.get_all_data_feeds_from_page(page)
            if articles:
                feeds.append((page[0], articles))
        self.log('finished parse_index: ', time.strftime(self.timestampfmt))
        return feeds

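    # Reduce the downloaded page to its story (or blog-post) content: collect
    # the tags that make up the article body plus their descendants and
    # ancestors, then strip everything else from the soup.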
    def preprocess_html(self, soup):
        tags_to_exclude = [('class', "caption staged"),
                           ('style', "display:none")]
        story_tag = soup.find(
            name='div', attrs={'class': ['article-content', 'article-body']})
        blog_tag = soup.find(name='div', attrs={'id': re.compile('post-\d+')})

        def is_excluded(tag_to_check):
            for attr in tag_to_check.attrs:
                if attr in tags_to_exclude:
                    return True
            return False

        def get_attr_startswith(attrs, this_key, this_valuestart):
            starts_with = False
            for attr in attrs:
                if attr[0] == this_key:
                    if attr[1].startswith(this_valuestart):
                        starts_with = True
            return starts_with

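        # Select the content tags: paragraphs, headings, tables and gallery
        # items on a story page, or paragraphs, headings, 'post' spans and
        # loaded images on a blog post.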
        base_tags = []
        if story_tag is not None:
            base_tags = story_tag.findAll(lambda this_tag: (this_tag.name == "p" and not ('class', 'open') in this_tag.attrs and not ('class', 'close') in this_tag.attrs) or this_tag.name.startswith('h') or this_tag.name == 'table' or (this_tag.name == 'li' and ('class', 'hst-resgalleryitem') in this_tag.attrs)) # noqa
        if blog_tag is not None:
            base_tags = blog_tag.findAll(lambda this_tag: (this_tag.name == "p" or this_tag.name.startswith('h')) or (this_tag.name == "span" and get_attr_startswith(this_tag.attrs, 'class', 'post')) or (this_tag.name == 'img' and ('lazy-state', 'loaded') in this_tag.attrs)) # noqa

        self.log('content tags: ' + str(type(base_tags)) + str(len(base_tags)))
        all_tags = []
        all_tags.extend(base_tags)
        if len(base_tags) > 0:
            for tag in base_tags:
                all_tags.extend(tag.findAll(True))

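        # Also keep each content tag's ancestors (unless excluded) so the
        # surrounding document structure survives the extraction pass below.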
        for tag in base_tags:
            while tag.parent is not None and not is_excluded(tag):
                all_tags.append(tag)
                tag = tag.parent
        for tag in soup.findAll(True):
            if tag not in all_tags:
                tag.extract()
        return soup