(1) Revisions to existing Houston Chronicle recipe
(2) Added new recipe that scrapes Bloomberg for columnists' blog postings
(3) Revisions to existing Economist recipe: added description metadata via populate_article_metadata
dale 2018-07-12 18:45:07 -05:00
parent 439da63e81
commit 6f93d75c06
4 changed files with 423 additions and 165 deletions
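For reference, a recipe like the new Bloomberg one below can be smoke-tested from the command line with calibre's converter; the recipe file name used here is only illustrative:

ebook-convert bloomberg_contributors.recipe out.epub --test -vv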


@@ -0,0 +1,199 @@
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2018, Dale Furrow dkfurrow@gmail.com'
'''
bloomberg.com
'''
import re
import time
from datetime import datetime, timedelta, date
from lxml import html, etree
from StringIO import StringIO
from calibre.web.feeds.recipes import BasicNewsRecipe
import urllib2
import traceback
from collections import OrderedDict
import calendar
import sys
from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Tag
contributors_url = "https://www.bloomberg.com/view/contributors"
output_date_format = "%d %b, %H:%M"
hdr = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
       'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
       'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
       'Accept-Encoding': 'none',
       'Accept-Language': 'en-US,en;q=0.8',
       'Connection': 'keep-alive'}


def get_article_parsed(this_url):
    req = urllib2.Request(this_url, headers=hdr)
    page = urllib2.urlopen(req)
    content = page.read()
    parser = etree.HTMLParser()
    parsed = html.parse(StringIO(content), parser)
    return parsed
class BloombergContributor:
    _name = None
    _url_name = None
    _url_code = None
    _article_list = None  # each article is (title, link, summary, date, timestamp)
    date_formats = ["%B %d, %Y %I:%M %p", "%b %d, %Y %I:%M %p"]

    def __init__(self, name, url_name, url_code):
        self._name = name
        self._url_name = url_name
        self._url_code = url_code
        self._article_list = []

    def __str__(self):
        return "{0} ({1}): {2:d} articles".format(self._name, self._url_name, len(self._article_list))
    def populate_article_list(self):
        list_url = "{0}/{1}/{2}/articles".format(contributors_url, self._url_code, self._url_name)
        parsed_list = get_article_parsed(list_url)
        articles = parsed_list.xpath("//li[contains(@class, 'item_lwCjl')]")
        for article in articles:
            headline = article.find('a')
            link = headline.attrib['href']
            title = headline.text.strip()
            article_date_eles = article.xpath(".//span[contains(@class, 'time_3qQJR')]")
            if len(article_date_eles) > 0:
                article_date_str = article_date_eles[0].text.strip()
                article_date = self.parse_date_str(article_date_str)
            else:
                article_date = None
            summary_eles = article.xpath(".//p[contains(@class, 'summary_17SO6')]")
            if len(summary_eles) > 0:
                summary = summary_eles[0].text.strip()
            else:
                summary = "No summary..."
            self._article_list.append((title.encode('ascii', 'ignore'), link, summary.encode('ascii', 'ignore'),
                                       article_date, self.get_article_timestamp(article_date)))
    @staticmethod
    def get_article_timestamp(article_date):
        # assume all times are US Eastern; DST runs from the second Sunday
        # in March to the first Sunday in November
        c = calendar.Calendar(firstweekday=calendar.SUNDAY)
        march_cal = c.monthdatescalendar(article_date.year, 3)
        dst_start = [day for week in march_cal for day in week if
                     day.weekday() == calendar.SUNDAY and day.month == 3][1]
        nov_cal = c.monthdatescalendar(article_date.year, 11)
        dst_end = [day for week in nov_cal for day in week if
                   day.weekday() == calendar.SUNDAY and day.month == 11][0]
        dst_start = datetime(dst_start.year, dst_start.month, dst_start.day, 2)
        dst_end = datetime(dst_end.year, dst_end.month, dst_end.day, 1)
        if dst_start < article_date < dst_end:
            shift = timedelta(hours=4)  # EDT is UTC-4
        else:
            shift = timedelta(hours=5)  # EST is UTC-5
        return float((article_date + shift - datetime.utcfromtimestamp(0)).total_seconds())
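    # For illustration: in 2018 the second Sunday of March is March 11 and the
    # first Sunday of November is November 4, so dst_start/dst_end above work out
    # to 2018-03-11 02:00 and 2018-11-04 01:00 local time, and an article dated in
    # July 2018 gets the 4-hour (EDT) shift to UTC rather than the 5-hour (EST) one.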
    def parse_date_str(self, date_str):
        parsed = None
        for date_format in self.date_formats:
            try:
                # drop the last four characters (e.g. a trailing " EDT") before parsing
                parsed = datetime.strptime(date_str[0:-4], date_format)
                break
            except Exception:
                pass
        return parsed

    def get_article_list(self):
        return self._article_list

    def get_ordered_article_feed(self):
        output = OrderedDict()
        for article in self._article_list:
            article_date = article[3]
            article_dict = {'title': article[0], 'url': article[1],
                            'description': "{0}: {1}".format(self.get_name(), article[2]),
                            'author': self.get_name() + ": " + article[3].strftime(output_date_format),
                            'date': self.get_name() + ": " + article[3].strftime(output_date_format),
                            'timestamp': article[4]}
            output[article_date] = article_dict
        return OrderedDict(sorted(output.items(), key=lambda t: t[0], reverse=True))

    def get_name(self):
        return self._name
class BloombergContributors(BasicNewsRecipe):
    title = u'Bloomberg, Editorial Contributors'
    description = 'Articles from Bloomberg.com contributors'
    __author__ = 'Dale Furrow'
    xpath_contributor_list = """//li[contains(@class, 'item_2zsS8')]/a"""
    language = 'en'
    no_stylesheets = True
    remove_attributes = ['style', 'xmlns']
    keep_only_tags = [dict(name='article', attrs={'data-type': 'article'})]
    remove_tags = [dict(name='div', attrs=
                        {'class': ['share-article-button ', 'text-to-speech']})]  # note trailing space in class name
    oldest_article = 7.0
    ignore_duplicate_articles = {'url'}
    recursions = 0
    category = 'news, USA, world, economy, politics'
    def get_contributors_list(self):
        page_doc = get_article_parsed(contributors_url)
        els = page_doc.xpath(self.xpath_contributor_list)
        contributor_list = []
        for el in els:
            name = el.find("span").text.strip()  # contributor name
            contributor_items = el.attrib['href'].split('/')
            contributor = BloombergContributor(name, contributor_items[4], contributor_items[3])
            contributor_list.append(contributor)
        for contributor in contributor_list:
            contributor.populate_article_list()
        return contributor_list
    def postprocess_html(self, soup, first_fetch):
        '''
        :param soup: A `BeautifulSoup <https://www.crummy.com/software/BeautifulSoup/bs3/documentation.html>`_
            instance containing the downloaded :term:`HTML`.
        :param first_fetch: True if this is the first page of an article.
        Remember: this is BeautifulSoup 3; the interface differs from bs4.
        '''
        time_eles = soup.findAll("time", {"class": "article-timestamp"})
        if len(time_eles) > 0:
            time_stamp = time_eles[0].get('datetime')
            try:
                parsed_time = datetime.strptime(time_stamp, "%Y-%m-%dT%H:%M:%S.%fZ")\
                    .strftime("%B %d, %Y %I:%M %p") + " UTC"
            except Exception:
                parsed_time = time_stamp
            insert_tag = Tag(soup, "p", [("class", "user-inserted")])
            insert_tag.insert(0, parsed_time)
            soup.time.replaceWith(insert_tag)
        return soup
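    # Note: the BeautifulSoup bundled for recipes here is BeautifulSoup 3, hence the
    # Tag(soup, name, attrs) construction above; under bs4 one would instead build
    # the element with soup.new_tag('p'), mentioned only for orientation.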
    def parse_index(self):
        self.timefmt = ' [%a, %d %b, %Y]'
        self.log('starting parse_index: {0}'.format(datetime.now().strftime("%B %d, %Y %I:%M %p")))
        feeds = []
        feed_dict = OrderedDict()
        contributor_list = self.get_contributors_list()
        self.log("Found {0:d} contributors on main page".format(len(contributor_list)))
        for contributor in contributor_list:
            articles = contributor.get_ordered_article_feed()
            feed_dict.update(articles)
        feed_dict = OrderedDict(sorted(feed_dict.items(), key=lambda t: t[0], reverse=True))
        self.log("Found {0:d} linked articles from contributors".format(len(feed_dict)))
        feeds.append(("Columns", list(feed_dict.values())))
        # self.log("Total of {0:d} {1} articles".format(len(article_list), cat))
        self.log('finishing parse_index: {0}'.format(datetime.now().strftime("%B %d, %Y %I:%M %p")))
        return feeds


@@ -10,6 +10,7 @@ from collections import OrderedDict
from calibre.ebooks.BeautifulSoup import NavigableString, Tag
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.utils.cleantext import clean_ascii_chars
def classes(classes):
@@ -146,6 +147,18 @@ class Economist(BasicNewsRecipe):
        raw = etree.tostring(root, encoding=unicode)
        return raw

    def populate_article_metadata(self, article, soup, first):
        els = soup.findAll(name=['span', 'p'],
                           attrs={'class': ['flytitle-and-title__title', 'blog-post__rubric']})
        result = []
        for el in els[0:2]:
            if el is not None:
                for descendant in el.contents:
                    if isinstance(descendant, NavigableString):
                        result.append(unicode(descendant))
        article.summary = u'. '.join(result).encode('utf-8') + '.'
        article.text_summary = clean_ascii_chars(article.summary)

    def parse_index(self):
        # return [('Articles', [{'title':'test',
        #     'url':'http://www.economist.com/news/business/21718916-worlds-biggest-software-firm-has-transformed-its-culture-better-getting-cloud'


@@ -10,6 +10,7 @@ from collections import OrderedDict
from calibre.ebooks.BeautifulSoup import NavigableString, Tag
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.utils.cleantext import clean_ascii_chars
def classes(classes):
@@ -146,6 +147,18 @@ class Economist(BasicNewsRecipe):
        raw = etree.tostring(root, encoding=unicode)
        return raw

    def populate_article_metadata(self, article, soup, first):
        els = soup.findAll(name=['span', 'p'],
                           attrs={'class': ['flytitle-and-title__title', 'blog-post__rubric']})
        result = []
        for el in els[0:2]:
            if el is not None:
                for descendant in el.contents:
                    if isinstance(descendant, NavigableString):
                        result.append(unicode(descendant))
        article.summary = u'. '.join(result).encode('utf-8') + '.'
        article.text_summary = clean_ascii_chars(article.summary)

    def parse_index(self):
        # return [('Articles', [{'title':'test',
        #     'url':'http://www.economist.com/news/business/21718916-worlds-biggest-software-firm-has-transformed-its-culture-better-getting-cloud'


@@ -1,17 +1,135 @@
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2016, Dale Furrow dkfurrow@gmail.com'
__copyright__ = '2018, Dale Furrow dkfurrow@gmail.com'
'''
chron.com
'''
import re
import time
import urllib2
from StringIO import StringIO
from datetime import datetime, timedelta, date
import traceback
import sys
from collections import OrderedDict
from datetime import datetime, timedelta, date
from lxml import html
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.web.feeds import Article
from calibre.utils.cleantext import clean_ascii_chars
from calibre.ebooks.BeautifulSoup import NavigableString
from calibre.utils.date import dt_factory, utcnow, local_tz
from lxml import html
from lxml import etree
regex_date_only = re.compile("""(?:January|February|March|April|
        May|June|July|August|September|October|November|
        December)\s[0-9]{1,2},\s20[01][0-9]""")
regex_time_only = re.compile("""[0-9]{1,2}:[0-9]{1,2} \w{2}""")
sentence_regex = re.compile("(\S.+?[.!?])(?=\s+|$)")
blog_regex = re.compile('post-\d+')
pages = OrderedDict([('news', ('/news/houston-texas/', ['business', 'sports'])),
                     ('business', ('/business/', ['sports'])),
                     ('sports', ('/sports/', ['business']))])
base_url = "http://www.chron.com"
# sports has 'core-package sports' class
xpath_general = """//div[contains(@class, 'centerpiece-tabs') or
    contains(@class, 'wrapper') or
    contains(@class, 'contentGroups') or
    contains(@class, 'headline-list') or
    contains(@class, 'core-package sports') or
    contains(@class, 'news')]
    //a[contains(@class, 'hdn-analytics')]"""
excluded_titles = ["Winning numbers", "TV-radio listings"]
def validate_link(page, link, title):
    other_category = page[1][1]
    if not title or len(title.strip()) < 5:
        print("{0} rejected, title too short".format(link))
        return None
    if link.split('/')[3] in other_category:
        print("{0} rejected, covered in other section".format(link))
        return None
    for excluded_title in excluded_titles:
        if title.find(excluded_title) != -1:
            print("{0} rejected, excluded title".format(link))
            return None
    return link, title


def get_article_parsed(this_url):
    page = urllib2.urlopen(this_url)
    content = page.read()
    parser = etree.HTMLParser()
    parsed = html.parse(StringIO(content), parser)
    return parsed
def sort_subject(element_list):
    # priority of subjects
    subjects = ['news', 'neighborhood', 'entertainment']
    subjects.reverse()
    subject_dict = OrderedDict(zip(subjects, range(len(subjects))))
    rank_dict = OrderedDict([(rank, []) for rank in range(len(subjects) + 1)])
    for element in element_list:
        subj = element[0].split('/')[3]
        if subject_dict.get(subj) is not None:
            rank_dict[subject_dict[subj] + 1].append(element)
        else:
            rank_dict[0].append(element)
    # now return in reverse order, sorted
    combined_list = []
    for rank in range(len(subjects), -1, -1):
        article_list = rank_dict[rank]
        article_list.sort()
        combined_list.extend(article_list)
    return combined_list
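# For illustration: with the priority list above, links under /news/ come back
# first, then /neighborhood/, then /entertainment/, and any other subject
# (rank 0) last, each group sorted alphabetically by URL.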
def get_links_from_section_page(page):
    page_doc = get_article_parsed(base_url + page[1][0])
    els = page_doc.xpath(xpath_general)
    element_list = []
    for el in els:
        link = el.get('href').split('?')[0]
        title = el.text
        if title is None or len(title.strip()) < 5:
            link_id = link.split('/')[-1][:-3].split('-')[:-1]
            title = ' '.join(link_id)
        if link[:4] != 'http':
            link = base_url + link
        validated_link = validate_link(page=page, link=link, title=title)
        if validated_link is not None:
            element_list.append(validated_link)
    sorted_element_list = sort_subject(element_list)
    return [page[0], sorted_element_list]
def get_all_links_from_sections():
    all_sections = []
    article_set = set()
    final_dict = OrderedDict()
    for item in pages.items():
        print "getting links from {0}".format(item[0])
        all_sections.append(get_links_from_section_page(item))
    for section in all_sections:
        section_id = section[0]
        article_list = section[1]
        final_dict[section_id] = []
        for article in article_list:
            if article[0] not in article_set:
                article_set.add(article[0])
                final_dict[section_id].append(article)
    return final_dict
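# For illustration: the OrderedDict returned above maps each section id to its
# de-duplicated (link, title) pairs, roughly
# {'news': [('http://www.chron.com/news/...', 'Some headline'), ...],
#  'business': [...], 'sports': [...]}; the URLs shown are placeholders.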
# noinspection PyAbstractClass
class HoustonChronicle(BasicNewsRecipe):
    title = u'The Houston Chronicle'
    description = 'News from Houston, Texas'
@@ -22,196 +140,111 @@ class HoustonChronicle(BasicNewsRecipe):
    remove_empty_feeds = True
    timefmt = '[%a, %d %b %Y]'
    timestampfmt = '%Y%m%d%H%M%S'
    ignore_duplicate_articles = {'url'}
    # ignore_duplicate_articles = {'url'}  # defaults to None
    extra_css = '.article_date {display: none}'
    category = 'news, USA'
    masthead_url = 'http://www.chron.com/apple-touch-icon-76x76.png'
    keep_only_tags = [dict(name='div', attrs={'class': ['article-content', 'article-wrap']})]
    remove_tags = [dict(name='div', attrs={'social-title': True}),
                   dict(name='div', attrs={'class':
                        ['control-panel', 'gallery-overlay-inner',
                         'most-popular', 'asset-media mos-playlist',
                         'asset_media asset-media']}),
                   dict(name='li', attrs={'class': ['hst-resgalleryitem taboola-frame hidden',
                                                    'hst-resgalleryitem hidden']}),
                   dict(name='ul', attrs={'class': 'clearfix'})]
    oldest_web_article = 7.0
    # max_articles_per_feed = 5  # for use in testing
    if oldest_web_article is None:
        earliest_date = date.today()
    else:
        earliest_date = date.today() - timedelta(days=oldest_web_article)
    pages = [('news', '/news/houston-texas/'),
             ('business', '/business/'),
             ('sports', '/sports/')]
    base_url = "http://www.chron.com"
    xpath_str = """//div[contains(@class, 'news') or contains(@class, 'headline-list')]
        //*[self::h4 or self::h5]//a[contains(@class, 'hdn-analytics')]"""
    def get_links_from_section_page(self, section_url):
        page_doc = html.parse(self.base_url + section_url)
        els = page_doc.xpath(self.xpath_str)
        element_list = []
        for el in els:
            link = el.get('href')
            title = el.text
            if link[:4] != 'http':
                link = self.base_url + link
            if title is not None:
                element_list.append((link, el.text))
        return element_list
    def get_article_description_from_doc(self, page_doc):
    def get_article_description_from_doc(self, soup):
        description_chars_break = 140
        description_max_chars = 300
        desc_xpath = """//div[contains(@class, 'article-body') or
            contains(@class, 'resource-content') or contains(@class, 'post')]//p"""
        sentence_regex = re.compile("(\S.+?[.!?])(?=\s+|$)")

        def stringify_children(node):
            return ''.join([x for x in node.itertext()])

        try:
            els = page_doc.xpath(desc_xpath)
            els = soup.findAll('p')
            if len(els) > 0:
                out_text = ""
                ellipsis = ""
                this_ellipsis = ""
                for el in els:
                    sentences = re.findall(sentence_regex, stringify_children(el))
                    if el is not None:
                        result = []
                        for descendant in el.contents:
                            if isinstance(descendant, NavigableString):
                                result.append(unicode(descendant).strip())
                        all_text = u' '.join(result).encode('utf-8')
                        if len(all_text) > 1:
                            sentences = re.findall(sentence_regex, all_text)
                            if sentences is not None and len(sentences) > 0:
                                for sentence in sentences:
                                    if len(out_text) < description_chars_break:
                                        out_text += sentence + " "
                                    else:
                                        if len(out_text) > description_max_chars:
                                            ellipsis = "..."
                                            return out_text[:description_max_chars] + ellipsis
                                            this_ellipsis = "..."
                                        return out_text[:description_max_chars] + this_ellipsis
                return out_text
        except:
            else:
                return "No Article description returned"
        except Exception as ex:
            self.log('Error on Article Description')
            traceback.print_exc(file=sys.stdout)
            print(str(ex))
            return ""
    def get_published_time_from_doc(self, page_doc):
        regex_date_only = re.compile("""(?:January|February|March|April|
            May|June|July|August|September|October|November|
            December)\s[0-9]{1,2},\s20[01][0-9]""")
        regex_time_only = re.compile("""[0-9]{1,2}:[0-9]{1,2} \w{2}""")
    @staticmethod
    def get_published_time_from_doc(page_doc):
        def get_regular_timestamp(date_string):
            try:
                out_date = datetime.strptime(date_string, "%Y-%m-%dT%H:%M:%SZ")
                return out_date
            except:
            except ValueError:
                return None

        def get_date_from_string(in_text):
            match = re.findall(regex_date_only, in_text)
            if match:
                try:
                    out_date = datetime.strptime(match[0], "%B %d, %Y")
                    match = re.findall(regex_time_only, in_text)
                    if match:
                        out_time = datetime.strptime(match[0], "%I:%M %p")
                        return datetime.combine(out_date.date(), out_time.time())
                    return out_date
                except:
                    return None

        el = page_doc.xpath("//*[@class='timestamp'][1]")
        el = page_doc.findAll(lambda this_tag: this_tag.name == "time" and
                              ('itemprop', 'datePublished') in this_tag.attrs)
        if len(el) == 1:
            return get_regular_timestamp(el[0].get('title'))
        else:
            el = page_doc.xpath(
                "//*[@class='entry-date' or @class='post-date'][1]")
            if len(el) == 1:
                return get_date_from_string(el[0].text_content())
            return get_regular_timestamp(el[0].get('datetime'))
        else:
            return None
    def get_all_data_feeds_from_page(self, page):
        articles = []
        exclude_titles_with = ['Winning numbers']
    def populate_article_metadata(self, article, soup, first):
        """
        Called when each HTML page belonging to article is downloaded.
        Intended to be used to get article metadata like author/summary/etc.
        from the parsed HTML (soup).

        def title_excluded(title):
            for text in exclude_titles_with:
                if title.find(text) != -1:
                    return True
            return False

        link_list = self.get_links_from_section_page(page[1])
        self.log('from section: ', page[0],
                 " found ", len(link_list), " links")
        for link in link_list:
            try:
                article_doc = html.parse(link[0])
                description = self.get_article_description_from_doc(
                    article_doc)
                parsed_date = self.get_published_time_from_doc(article_doc)
                if parsed_date is not None and description is not None and \
                        parsed_date.date() > self.earliest_date and \
                        not title_excluded(link[1]):
                    intro_date = parsed_date.strftime('%d %b %H:%M') + " - "
                    articles.append({'title': link[1], 'url': link[0],
                                     'description': intro_date + description,
                                     'date': ""})
                    self.log(page[0] + ": " + link[1] + ', from ' + intro_date +
                             " description of " + str(len(description)) + ' characters at ' + link[0])
                else:
                    if parsed_date is None:
                        msg = " No Timestamp Found"
                    elif title_excluded(link[1]):
                        msg = " Title Excluded"
                    else:
                        msg = " article older than " + \
                              str(self.oldest_web_article) + ' days...'
                    self.log("Skipping article: ", link[0], msg)
            except:
                print 'error on fetching ' + link[0]
                continue
        return articles

        :param article: An object of class :class:`calibre.web.feeds.Article`.
        If you change the summary, remember to also change the text_summary
        :param soup: Parsed HTML belonging to this article
        :param first: True iff the parsed HTML is the first page of the article.
        """
        summary = self.get_article_description_from_doc(soup)
        article_date = self.get_published_time_from_doc(soup)
        if article_date is not None:
            article_timestamp = float((article_date - datetime.utcfromtimestamp(0)).total_seconds())
            article.date = article_timestamp
            article.utctime = dt_factory(article_date.timetuple(), assume_utc=True, as_utc=True)
            article.localtime = article.utctime.astimezone(local_tz)
        summary_date = article.localtime.strftime("%Y-%m-%d %H:%M") if article_date is not None else "No Date"
        article.summary = "{0}: {1}".format(summary_date, summary)
        article.text_summary = clean_ascii_chars(article.summary)
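    # For illustration, the summary built above ends up looking roughly like
    # "2018-07-12 08:15: First sentence of the story. Second sentence..." where
    # both the date and the text are placeholders.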
    def parse_index(self):
        self.timefmt = ' [%a, %d %b, %Y]'
        self.log('starting parse_index: ', time.strftime(self.timestampfmt))
        feeds = []
        for page in self.pages:
            articles = self.get_all_data_feeds_from_page(page)
            if articles:
                feeds.append((page[0], articles))
        sections = get_all_links_from_sections()
        for section_id, article_list in sections.items():
            self.log("Getting {0} section, {1:d} articles".format(section_id, len(article_list)))
            articles = []
            for article_info in article_list:
                self.log("Adding {0} to feed".format(article_info[0]))
                articles.append({'title': article_info[1], 'url': article_info[0],
                                 'description': '', 'date': ""})
            self.log("Appending {0:d} articles for {1}".format(len(articles), section_id))
            feeds.append((section_id, articles))
        self.log('finished parse_index: ', time.strftime(self.timestampfmt))
        return feeds
    def preprocess_html(self, soup):
        tags_to_exclude = [('class', "caption staged"),
                           ('style', "display:none")]
        story_tag = soup.find(
            name='div', attrs={'class': ['article-content', 'article-body']})
        blog_tag = soup.find(name='div', attrs={'id': re.compile('post-\d+')})

        def is_excluded(tag_to_check):
            for attr in tag_to_check.attrs:
                if attr in tags_to_exclude:
                    return True
            return False

        def get_attr_startswith(attrs, this_key, this_valuestart):
            starts_with = False
            for attr in attrs:
                if attr[0] == this_key:
                    if attr[1].startswith(this_valuestart):
                        starts_with = True
            return starts_with

        base_tags = []
        if story_tag is not None:
            base_tags = story_tag.findAll(lambda this_tag: (this_tag.name == "p" and not ('class', 'open') in this_tag.attrs and not ('class', 'close') in this_tag.attrs) or this_tag.name.startswith('h') or this_tag.name == 'table' or (this_tag.name == 'li' and ('class', 'hst-resgalleryitem') in this_tag.attrs))  # noqa
        if blog_tag is not None:
            base_tags = blog_tag.findAll(lambda this_tag: (this_tag.name == "p" or this_tag.name.startswith('h')) or (this_tag.name == "span" and get_attr_startswith(this_tag.attrs, 'class', 'post')) or (this_tag.name == 'img' and ('lazy-state', 'loaded') in this_tag.attrs))  # noqa
        self.log('content tags: ' + str(type(base_tags)) + str(len(base_tags)))
        all_tags = []
        all_tags.extend(base_tags)
        if len(base_tags) > 0:
            for tag in base_tags:
                all_tags.extend(tag.findAll(True))
            for tag in base_tags:
                while tag.parent is not None and not is_excluded(tag):
                    all_tags.append(tag)
                    tag = tag.parent
        for tag in soup.findAll(True):
            if tag not in all_tags:
                tag.extract()
        return soup