Mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-06-23 15:30:45 -04:00)
(1) Revisions to existing Houston Chronicle recipe
(2) Added new recipe which scrapes Bloomberg for columnists' blog postings
(3) Revisions to existing Economist recipe; added description metadata via populate_article_metadata
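For context on item (3): populate_article_metadata is the hook that BasicNewsRecipe calls for each downloaded article page, and it is where a recipe can attach a summary to the feed entry. A minimal sketch of the pattern, assuming a BasicNewsRecipe subclass — the recipe name and selector below are illustrative only and are not taken from the diffs that follow:

    from calibre.web.feeds.news import BasicNewsRecipe

    class ExampleRecipe(BasicNewsRecipe):
        title = u'Example'

        def populate_article_metadata(self, article, soup, first):
            # soup is the parsed article HTML (BeautifulSoup 3 in calibre recipes);
            # article is the feed entry (calibre.web.feeds.Article) being built
            el = soup.find('p')  # hypothetical selector; real recipes target specific classes
            if el is not None:
                article.summary = self.tag_to_string(el)
                article.text_summary = article.summary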
This commit is contained in:
parent 439da63e81
commit 6f93d75c06
recipes/bloomberg_columns.recipe (new file, 199 lines)
@@ -0,0 +1,199 @@
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2018, Dale Furrow dkfurrow@gmail.com'
'''
chron.com
'''
import re
import time
from datetime import datetime, timedelta, date
from lxml import html, etree
from StringIO import StringIO
from calibre.web.feeds.recipes import BasicNewsRecipe
import urllib2
import traceback
from collections import OrderedDict
import calendar
import sys
from calibre.ebooks.BeautifulSoup import BeautifulSoup, NavigableString, CData, Tag


contributors_url = "https://www.bloomberg.com/view/contributors"
output_date_format = "%d %b, %H:%M"

hdr = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
       'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
       'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
       'Accept-Encoding': 'none',
       'Accept-Language': 'en-US,en;q=0.8',
       'Connection': 'keep-alive'}


def get_article_parsed(this_url):
    req = urllib2.Request(this_url, headers=hdr)
    page = urllib2.urlopen(req)
    content = page.read()
    parser = etree.HTMLParser()
    parsed = html.parse(StringIO(content), parser)
    return parsed


class BloombergContributor:
    _name = None
    _url_name = None
    _url_code = None
    _article_list = None  # article is title, link, date, description
    date_formats = ["%B %d, %Y %I:%M %p", "%b %d, %Y %I:%M %p"]

    def __init__(self, name, url_name, url_code):
        self._name = name
        self._url_name = url_name
        self._url_code = url_code
        self._article_list = []

    def __str__(self):
        return "{0} ({1}): {2:d} articles".format(self._name, self._url_name, len(self._article_list))

    def populate_article_list(self):
        list_url = "{0}/{1}/{2}/articles".format(contributors_url, self._url_code, self._url_name)
        parsed_list = get_article_parsed(list_url)
        articles = parsed_list.xpath("//li[contains(@class, 'item_lwCjl')]")
        for article in articles:
            headline = article.find('a')
            link = headline.attrib['href']
            title = headline.text.strip()
            article_date_eles = article.xpath(".//span[contains(@class, 'time_3qQJR')]")
            if len(article_date_eles) > 0:
                article_date_str = article_date_eles[0].text.strip()
                article_date = self.parse_date_str(article_date_str)
            else:
                article_date = None
            summary_eles = article.xpath(".//p[contains(@class, 'summary_17SO6')]")
            if len(summary_eles) > 0:
                summary = summary_eles[0].text.strip()
            else:
                summary = "No summary..."
            self._article_list.append((title.encode('ascii', 'ignore'), link, summary.encode('ascii', 'ignore'),
                                       article_date, self.get_article_timestamp(article_date)))

    @staticmethod
    def get_article_timestamp(article_date):
        # assume all times Eastern...
        # 2nd Sunday March, 1st Sunday Nov
        c = calendar.Calendar(firstweekday=calendar.SUNDAY)
        march_cal = c.monthdatescalendar(article_date.year, 3)
        dst_start = [day for week in march_cal for day in week if
                     day.weekday() == calendar.SUNDAY and
                     day.month == 3][1]
        nov_cal = c.monthdatescalendar(article_date.year, 11)
        dst_end = [day for week in nov_cal for day in week if
                   day.weekday() == calendar.SUNDAY and
                   day.month == 11][0]
        dst_start = datetime(dst_start.year, dst_start.month, dst_start.day, 2)
        dst_end = datetime(dst_end.year, dst_end.month, dst_end.day, 1)
        # Eastern Daylight Time (UTC-4) applies between the March and November
        # boundaries, Eastern Standard Time (UTC-5) otherwise
        if dst_start < article_date < dst_end:
            shift = timedelta(hours=4)
        else:
            shift = timedelta(hours=5)
        return float((article_date + shift - datetime.utcfromtimestamp(0)).total_seconds())

    def parse_date_str(self, date_str):
        parsed = None
        for date_format in self.date_formats:
            try:
                parsed = datetime.strptime(date_str[0:-4], date_format)
                break
            except Exception:
                pass
        return parsed

    def get_article_list(self):
        return self._article_list

    def get_ordered_article_feed(self):
        output = OrderedDict()
        for article in self._article_list:
            article_date = article[3]
            article_dict = {'title': article[0], 'url': article[1],
                            'description': "{0}: {1}".format(self.get_name(), article[2]),
                            'author': self.get_name() + ": " + article[3].strftime(output_date_format),
                            'date': self.get_name() + ": " + article[3].strftime(output_date_format),
                            'timestamp': article[4]}
            output[article_date] = article_dict
        return OrderedDict(sorted(output.items(), key=lambda t: t[0], reverse=True))

    def get_name(self):
        return self._name


class BloombergContributors(BasicNewsRecipe):
    title = u'Bloomberg, Editorial Contributors'
    description = 'Articles from Bloomberg.com contributors'
    __author__ = 'Dale Furrow'
    xpath_contributor_list = """//li[contains(@class, 'item_2zsS8')]/a"""
    language = 'en'
    no_stylesheets = True
    remove_attributes = ['style', 'xmlns']
    keep_only_tags = [dict(name='article', attrs={'data-type': 'article'})]
    remove_tags = [dict(name='div', attrs=
                        {'class': ['share-article-button ', 'text-to-speech']})]  # note space...
    oldest_article = 7.0
    ignore_duplicate_articles = {'url'}
    recursions = 0
    category = 'news, USA, world, economy, politics'
    language = 'en'

    def get_contributors_list(self):
        page_doc = get_article_parsed(contributors_url)
        els = page_doc.xpath(self.xpath_contributor_list)
        contributor_list = []
        for el in els:
            name = el.find("span").text.strip()  # contributor name
            contributor_items = el.attrib['href'].split('/')
            contributor = BloombergContributor(name, contributor_items[4], contributor_items[3])
            contributor_list.append(contributor)
        for contributor in contributor_list:
            contributor.populate_article_list()
        return contributor_list

    def postprocess_html(self, soup, first_fetch):
        '''
        :param soup: A `BeautifulSoup
         <https://www.crummy.com/software/BeautifulSoup/bs3/documentation.html>`_
         instance containing the downloaded :term:`HTML`.
        :param first_fetch: True if this is the first page of an article.
        Remember: this is BeautifulSoup 3; its interface differs considerably from bs4.
        '''
        time_eles = soup.findAll("time", {"class": "article-timestamp"})
        if len(time_eles) > 0:
            time_stamp = time_eles[0].get('datetime')
            try:
                parsed_time = datetime.strptime(time_stamp, "%Y-%m-%dT%H:%M:%S.%fZ")\
                    .strftime("%B %d, %Y %I:%M %p") + " UTC"
            except:
                parsed_time = time_stamp
            insert_tag = Tag(soup, "p", [("class", "user-inserted")])
            insert_tag.insert(0, parsed_time)
            soup.time.replaceWith(insert_tag)
        return soup

    def parse_index(self):
        self.timefmt = ' [%a, %d %b, %Y]'
        self.log('starting parse_index: {0}'.format(datetime.now().strftime("%B %d, %Y %I:%M %p")))
        feeds = []
        feed_dict = OrderedDict()
        contributor_list = self.get_contributors_list()
        self.log("Found {0:d} contributors on main page".format(len(contributor_list)))
        for contributor in contributor_list:
            articles = contributor.get_ordered_article_feed()
            feed_dict.update(articles)
        feed_dict = OrderedDict(sorted(feed_dict.items(), key=lambda t: t[0], reverse=True))
        self.log("Found {0:d} linked articles from contributors".format(len(feed_dict)))
        feeds.append(("Columns", list(feed_dict.values())))
        # self.log("Total of {0:d} {1} articles".format(len(article_list), cat))
        self.log('finishing parse_index: {0}'.format(datetime.now().strftime("%B %d, %Y %I:%M %p")))
        return feeds
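The recipe's parse_index follows the standard calibre contract: it returns a list of (section title, list of article dicts) pairs, each dict carrying at least 'title' and 'url' (here also 'description', 'date' and 'timestamp'). A minimal sketch of the returned shape, with placeholder values only:

    feeds = [
        ('Columns', [
            {'title': 'Example column',
             'url': 'https://www.bloomberg.com/view/articles/...',
             'description': 'Contributor name: one-line summary',
             'date': 'Contributor name: 01 Jan, 12:00',
             'timestamp': 1514808000.0},
        ]),
    ]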
@@ -10,6 +10,7 @@ from collections import OrderedDict
 
 from calibre.ebooks.BeautifulSoup import NavigableString, Tag
 from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.utils.cleantext import clean_ascii_chars
 
 
 def classes(classes):
@@ -146,6 +147,18 @@ class Economist(BasicNewsRecipe):
         raw = etree.tostring(root, encoding=unicode)
         return raw
 
+    def populate_article_metadata(self, article, soup, first):
+        els = soup.findAll(name=['span', 'p'],
+                           attrs={'class': ['flytitle-and-title__title', 'blog-post__rubric']})
+        result = []
+        for el in els[0:2]:
+            if el is not None:
+                for descendant in el.contents:
+                    if isinstance(descendant, NavigableString):
+                        result.append(unicode(descendant))
+        article.summary = u'. '.join(result).encode('utf-8') + '.'
+        article.text_summary = clean_ascii_chars(article.summary)
+
     def parse_index(self):
         # return [('Articles', [{'title':'test',
         # 'url':'http://www.economist.com/news/business/21718916-worlds-biggest-software-firm-has-transformed-its-culture-better-getting-cloud'
@@ -10,6 +10,7 @@ from collections import OrderedDict
 
 from calibre.ebooks.BeautifulSoup import NavigableString, Tag
 from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.utils.cleantext import clean_ascii_chars
 
 
 def classes(classes):
@@ -146,6 +147,18 @@ class Economist(BasicNewsRecipe):
         raw = etree.tostring(root, encoding=unicode)
         return raw
 
+    def populate_article_metadata(self, article, soup, first):
+        els = soup.findAll(name=['span', 'p'],
+                           attrs={'class': ['flytitle-and-title__title', 'blog-post__rubric']})
+        result = []
+        for el in els[0:2]:
+            if el is not None:
+                for descendant in el.contents:
+                    if isinstance(descendant, NavigableString):
+                        result.append(unicode(descendant))
+        article.summary = u'. '.join(result).encode('utf-8') + '.'
+        article.text_summary = clean_ascii_chars(article.summary)
+
     def parse_index(self):
         # return [('Articles', [{'title':'test',
         # 'url':'http://www.economist.com/news/business/21718916-worlds-biggest-software-firm-has-transformed-its-culture-better-getting-cloud'
@@ -1,17 +1,135 @@
 #!/usr/bin/env python2
 # -*- coding: utf-8 -*-
 __license__ = 'GPL v3'
-__copyright__ = '2016, Dale Furrow dkfurrow@gmail.com'
+__copyright__ = '2018, Dale Furrow dkfurrow@gmail.com'
 '''
 chron.com
 '''
 import re
 import time
+import urllib2
+from StringIO import StringIO
+from datetime import datetime, timedelta, date
+import traceback
+import sys
+from collections import OrderedDict
+
 from datetime import datetime, timedelta, date
-from lxml import html
+
 from calibre.web.feeds.recipes import BasicNewsRecipe
+from calibre.web.feeds import Article
+from calibre.utils.cleantext import clean_ascii_chars
+from calibre.ebooks.BeautifulSoup import NavigableString
+from calibre.utils.date import dt_factory, utcnow, local_tz
+from lxml import html
+from lxml import etree
+
+regex_date_only = re.compile("""(?:January|February|March|April|
+{8}May|June|July|August|September|October|November|
+{8}December)\s[0-9]{1,2},\s20[01][0-9]""")
+regex_time_only = re.compile("""[0-9]{1,2}:[0-9]{1,2} \w{2}""")
+sentence_regex = re.compile("(\S.+?[.!?])(?=\s+|$)")
+blog_regex = re.compile('post-\d+')
+
+pages = OrderedDict([('news', ('/news/houston-texas/', ['business', 'sports'])),
+                     ('business', ('/business/', ['sports'])),
+                     ('sports', ('/sports/', ['business']))])
+
+base_url = "http://www.chron.com"
+
+# sports has 'core-package sports' class
+xpath_general = """//div[contains(@class, 'centerpiece-tabs') or
+contains(@class, 'wrapper') or
+contains(@class, 'contentGroups') or
+contains(@class, 'headline-list') or
+contains(@class, 'core-package sports') or
+contains(@class, 'news')]
+//a[contains(@class, 'hdn-analytics')]"""
+
+excluded_titles = ["Winning numbers", "TV-radio listings"]
+
+
+def validate_link(page, link, title):
+    other_category = page[1][1]
+    if not title or len(title.strip()) < 5:
+        print("{0} rejected, title too short".format(link))
+        return None
+    if link.split('/')[3] in other_category:
+        print("{0} rejected, covered in other section".format(link))
+        return None
+    for excluded_title in excluded_titles:
+        if title.find(excluded_title) != -1:
+            print("{0} rejected, excluded title".format(link))
+            return None
+    return link, title
+
+
+def get_article_parsed(this_url):
+    page = urllib2.urlopen(this_url)
+    content = page.read()
+    parser = etree.HTMLParser()
+    parsed = html.parse(StringIO(content), parser)
+    return parsed
+
+
+def sort_subject(element_list):
+    # priority of subjects
+    subjects = ['news', 'neighborhood', 'entertainment']
+    subjects.reverse()
+    subject_dict = OrderedDict(zip(subjects, range(len(subjects))))
+    rank_dict = OrderedDict([(rank, []) for rank in range(len(subjects) + 1)])
+    for element in element_list:
+        subj = element[0].split('/')[3]
+        if subject_dict.get(subj) is not None:
+            rank_dict[subject_dict[subj] + 1].append(element)
+        else:
+            rank_dict[0].append(element)
+    # now return in reverse order, sorted
+    combined_list = []
+    for rank in range(len(subjects), -1, -1):
+        article_list = rank_dict[rank]
+        article_list.sort()
+        combined_list.extend(article_list)
+    return combined_list
+
+
+def get_links_from_section_page(page):
+    page_doc = get_article_parsed(base_url + page[1][0])
+    els = page_doc.xpath(xpath_general)
+    element_list = []
+    for el in els:
+        link = el.get('href').split('?')[0]
+        title = el.text
+        if title is None or len(title.strip()) < 5:
+            link_id = link.split('/')[-1][:-3].split('-')[:-1]
+            title = ' '.join(link_id)
+        if link[:4] != 'http':
+            link = base_url + link
+        validated_link = validate_link(page=page, link=link, title=title)
+        if validated_link is not None:
+            element_list.append(validated_link)
+    sorted_element_list = sort_subject(element_list)
+    return [page[0], sorted_element_list]
+
+
+def get_all_links_from_sections():
+    all_sections = []
+    article_set = set()
+    final_dict = OrderedDict()
+    for item in pages.items():
+        print "getting links from {0}".format(item[0])
+        all_sections.append(get_links_from_section_page(item))
+    for section in all_sections:
+        section_id = section[0]
+        article_list = section[1]
+        final_dict[section_id] = []
+        for article in article_list:
+            if article[0] not in article_set:
+                article_set.add(article[0])
+                final_dict[section_id].append(article)
+    return final_dict
 
 
+# noinspection PyAbstractClass
 class HoustonChronicle(BasicNewsRecipe):
     title = u'The Houston Chronicle'
     description = 'News from Houston, Texas'
@@ -22,196 +140,111 @@ class HoustonChronicle(BasicNewsRecipe):
     remove_empty_feeds = True
     timefmt = '[%a, %d %b %Y]'
     timestampfmt = '%Y%m%d%H%M%S'
-    ignore_duplicate_articles = {'url'}
+    # ignore_duplicate_articles = {'url'}  # defaults to None
     extra_css = '.article_date {display: none}'
-    oldest_web_article = 7.0
-
-    if oldest_web_article is None:
-        earliest_date = date.today()
-    else:
-        earliest_date = date.today() - timedelta(days=oldest_web_article)
-
-    pages = [('news', '/news/houston-texas/'),
-             ('business', '/business/'),
-             ('sports', '/sports/')]
-
-    base_url = "http://www.chron.com"
-
-    xpath_str = """//div[contains(@class, 'news') or contains(@class, 'headline-list')]
-    //*[self::h4 or self::h5]//a[contains(@class, 'hdn-analytics')]"""
-
-    def get_links_from_section_page(self, section_url):
-        page_doc = html.parse(self.base_url + section_url)
-        els = page_doc.xpath(self.xpath_str)
-        element_list = []
-        for el in els:
-            link = el.get('href')
-            title = el.text
-            if link[:4] != 'http':
-                link = self.base_url + link
-            if title is not None:
-                element_list.append((link, el.text))
-        return element_list
-
-    def get_article_description_from_doc(self, page_doc):
+    category = 'news, USA'
+    masthead_url = 'http://www.chron.com/apple-touch-icon-76x76.png'
+    keep_only_tags = [dict(name='div', attrs={'class': ['article-content', 'article-wrap']})]
+    remove_tags = [dict(name='div', attrs={'social-title': True}),
+                   dict(name='div', attrs={'class':
+                        ['control-panel', 'gallery-overlay-inner',
+                         'most-popular', 'asset-media mos-playlist',
+                         'asset_media asset-media']}),
+                   dict(name='li', attrs={'class': ['hst-resgalleryitem taboola-frame hidden',
+                                                    'hst-resgalleryitem hidden']}),
+                   dict(name='ul', attrs={'class': 'clearfix'})]
+    # max_articles_per_feed = 5  # for use in testing
+
+    def get_article_description_from_doc(self, soup):
         description_chars_break = 140
         description_max_chars = 300
-        desc_xpath = """//div[contains(@class, 'article-body') or
-            contains(@class, 'resource-content') or contains(@class, 'post')]//p"""
-        sentence_regex = re.compile("(\S.+?[.!?])(?=\s+|$)")
-
-        def stringify_children(node):
-            return ''.join([x for x in node.itertext()])
-
         try:
-            els = page_doc.xpath(desc_xpath)
-            out_text = ""
-            ellipsis = ""
-            for el in els:
-                sentences = re.findall(sentence_regex, stringify_children(el))
-                for sentence in sentences:
-                    if len(out_text) < description_chars_break:
-                        out_text += sentence + " "
-                    else:
-                        if len(out_text) > description_max_chars:
-                            ellipsis = "..."
-                        return out_text[:description_max_chars] + ellipsis
-            return out_text
-        except:
+            els = soup.findAll('p')
+            if len(els) > 0:
+                out_text = ""
+                this_ellipsis = ""
+                for el in els:
+                    if el is not None:
+                        result = []
+                        for descendant in el.contents:
+                            if isinstance(descendant, NavigableString):
+                                result.append(unicode(descendant).strip())
+                        all_text = u' '.join(result).encode('utf-8')
+                        if len(all_text) > 1:
+                            sentences = re.findall(sentence_regex, all_text)
+                            if sentences is not None and len(sentences) > 0:
+                                for sentence in sentences:
+                                    if len(out_text) < description_chars_break:
+                                        out_text += sentence + " "
+                                    else:
+                                        if len(out_text) > description_max_chars:
+                                            this_ellipsis = "..."
+                                        return out_text[:description_max_chars] + this_ellipsis
+                return out_text
+            else:
+                return "No Article description returned"
+        except Exception as ex:
             self.log('Error on Article Description')
+            traceback.print_exc(file=sys.stdout)
+            print(str(ex))
             return ""
 
-    def get_published_time_from_doc(self, page_doc):
-        regex_date_only = re.compile("""(?:January|February|March|April|
-May|June|July|August|September|October|November|
-December)\s[0-9]{1,2},\s20[01][0-9]""")
-        regex_time_only = re.compile("""[0-9]{1,2}:[0-9]{1,2} \w{2}""")
-
+    @staticmethod
+    def get_published_time_from_doc(page_doc):
 
         def get_regular_timestamp(date_string):
             try:
                 out_date = datetime.strptime(date_string, "%Y-%m-%dT%H:%M:%SZ")
                 return out_date
-            except:
+            except ValueError:
                 return None
 
-        def get_date_from_string(in_text):
-            match = re.findall(regex_date_only, in_text)
-            if match:
-                try:
-                    out_date = datetime.strptime(match[0], "%B %d, %Y")
-                    match = re.findall(regex_time_only, in_text)
-                    if match:
-                        out_time = datetime.strptime(match[0], "%I:%M %p")
-                        return datetime.combine(out_date.date(), out_time.time())
-                    return out_date
-                except:
-                    return None
-
-        el = page_doc.xpath("//*[@class='timestamp'][1]")
+        el = page_doc.findAll(lambda this_tag: this_tag.name == "time" and
+                              ('itemprop', 'datePublished') in this_tag.attrs)
         if len(el) == 1:
-            return get_regular_timestamp(el[0].get('title'))
+            return get_regular_timestamp(el[0].get('datetime'))
         else:
-            el = page_doc.xpath(
-                "//*[@class='entry-date' or @class='post-date'][1]")
-            if len(el) == 1:
-                return get_date_from_string(el[0].text_content())
-            else:
-                return None
-
-    def get_all_data_feeds_from_page(self, page):
-        articles = []
-        exclude_titles_with = ['Winning numbers']
-
-        def title_excluded(title):
-            for text in exclude_titles_with:
-                if title.find(text) != -1:
-                    return True
-            return False
-
-        link_list = self.get_links_from_section_page(page[1])
-        self.log('from section: ', page[0],
-                 " found ", len(link_list), " links")
-        for link in link_list:
-            try:
-                article_doc = html.parse(link[0])
-                description = self.get_article_description_from_doc(
-                    article_doc)
-                parsed_date = self.get_published_time_from_doc(article_doc)
-                if parsed_date is not None and description is not None and \
-                        parsed_date.date() > self.earliest_date and \
-                        not title_excluded(link[1]):
-                    intro_date = parsed_date.strftime('%d %b %H:%M') + " - "
-                    articles.append({'title': link[1], 'url': link[0],
-                                     'description': intro_date + description,
-                                     'date': ""})
-                    self.log(page[0] + ": " + link[1] + ', from ' + intro_date +
-                             " description of " + str(len(description)) + ' characters at ' + link[0])
-                else:
-                    if parsed_date is None:
-                        msg = " No Timestamp Found"
-                    elif title_excluded(link[1]):
-                        msg = " Title Excluded"
-                    else:
-                        msg = " article older than " + \
-                            str(self.oldest_web_article) + ' days...'
-                    self.log("Skipping article: ", link[0], msg)
-            except:
-                print 'error on fetching ' + link[0]
-                continue
-        return articles
+            return None
+
+    def populate_article_metadata(self, article, soup, first):
+        """
+        Called when each HTML page belonging to article is downloaded.
+        Intended to be used to get article metadata like author/summary/etc.
+        from the parsed HTML (soup).
+
+        :param article: A object of class :class:`calibre.web.feeds.Article`.
+            If you change the summary, remember to also change the text_summary
+        :param soup: Parsed HTML belonging to this article
+        :param first: True iff the parsed HTML is the first page of the article.
+        """
+        summary = self.get_article_description_from_doc(soup)
+        article_date = self.get_published_time_from_doc(soup)
+        if article_date is not None:
+            article_timestamp = float((article_date - datetime.utcfromtimestamp(0)).total_seconds())
+            article.date = article_timestamp
+            article.utctime = dt_factory(article_date.timetuple(), assume_utc=True, as_utc=True)
+            article.localtime = article.utctime.astimezone(local_tz)
+        summary_date = article.localtime.strftime("%Y-%m-%d %H:%M") if article_date is not None else "No Date"
+        article.summary = "{0}: {1}".format(summary_date, summary)
+        article.text_summary = clean_ascii_chars(article.summary)
 
     def parse_index(self):
 
         self.timefmt = ' [%a, %d %b, %Y]'
         self.log('starting parse_index: ', time.strftime(self.timestampfmt))
         feeds = []
-        for page in self.pages:
-            articles = self.get_all_data_feeds_from_page(page)
-            if articles:
-                feeds.append((page[0], articles))
+        sections = get_all_links_from_sections()
+        for section_id, article_list in sections.items():
+            self.log("Getting {0} section, {1:d} articles".format(section_id, len(article_list)))
+            articles = []
+            for article_info in article_list:
+                self.log("Adding {0} to feed".format(article_info[0]))
+                articles.append({'title': article_info[1], 'url': article_info[0],
+                                 'description': '', 'date': ""})
+            self.log("Appending {0:d} articles for {1}".format(len(articles), section_id))
+            feeds.append((section_id, articles))
         self.log('finished parse_index: ', time.strftime(self.timestampfmt))
         return feeds
 
     def preprocess_html(self, soup):
-        tags_to_exclude = [('class', "caption staged"),
-                           ('style', "display:none")]
-        story_tag = soup.find(
-            name='div', attrs={'class': ['article-content', 'article-body']})
-        blog_tag = soup.find(name='div', attrs={'id': re.compile('post-\d+')})
-
-        def is_excluded(tag_to_check):
-            for attr in tag_to_check.attrs:
-                if attr in tags_to_exclude:
-                    return True
-            return False
-
-        def get_attr_startswith(attrs, this_key, this_valuestart):
-            starts_with = False
-            for attr in attrs:
-                if attr[0] == this_key:
-                    if attr[1].startswith(this_valuestart):
-                        starts_with = True
-            return starts_with
-
-        base_tags = []
-        if story_tag is not None:
-            base_tags = story_tag.findAll(lambda this_tag: (this_tag.name == "p" and not ('class', 'open') in this_tag.attrs and not ('class', 'close') in this_tag.attrs) or this_tag.name.startswith('h') or this_tag.name == 'table' or (this_tag.name == 'li' and ('class', 'hst-resgalleryitem') in this_tag.attrs))  # noqa
-        if blog_tag is not None:
-            base_tags = blog_tag.findAll(lambda this_tag: (this_tag.name == "p" or this_tag.name.startswith('h')) or (this_tag.name == "span" and get_attr_startswith(this_tag.attrs, 'class', 'post')) or (this_tag.name == 'img' and ('lazy-state', 'loaded') in this_tag.attrs))  # noqa
-
-        self.log('content tags: ' + str(type(base_tags)) + str(len(base_tags)))
-        all_tags = []
-        all_tags.extend(base_tags)
-        if len(base_tags) > 0:
-            for tag in base_tags:
-                all_tags.extend(tag.findAll(True))
-
-            for tag in base_tags:
-                while tag.parent is not None and not is_excluded(tag):
-                    all_tags.append(tag)
-                    tag = tag.parent
-        for tag in soup.findAll(True):
-            if tag not in all_tags:
-                tag.extract()
         return soup
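Recipes like these can be exercised outside the calibre GUI with the ebook-convert tool, for example: ebook-convert bloomberg_columns.recipe .epub --test -vv. This invocation follows the calibre recipe-development documentation (it fetches a reduced set of articles with verbose logging) and is not part of this commit.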