#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import print_function
__license__ = 'GPL v3'
__copyright__ = '2018, Dale Furrow dkfurrow@gmail.com'
'''
chron.com
'''
import re
import time
from datetime import datetime
import traceback
import sys
from collections import OrderedDict

from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.utils.cleantext import clean_ascii_chars
from calibre.ebooks.BeautifulSoup import NavigableString
from calibre.utils.date import dt_factory, local_tz
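
# Patterns for a spelled-out date ('January 2, 2018'), a clock time
# ('10:30 am'), one sentence at a time, and blog post ids ('post-123');
# of these, only sentence_regex is referenced elsewhere in this file.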
regex_date_only = re.compile(r"""(?:January|February|March|April|
        May|June|July|August|September|October|November|
        December)\s[0-9]{1,2},\s20[01][0-9]""")
regex_time_only = re.compile(r"""[0-9]{1,2}:[0-9]{1,2} \w{2}""")
sentence_regex = re.compile(r"(\S.+?[.!?])(?=\s+|$)")
blog_regex = re.compile(r'post-\d+')
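
# Section map: section id -> (landing-page path, sibling sections whose
# links are skipped when they appear on this section's page).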
pages = OrderedDict([('news', ('/news/houston-texas/', ['business', 'sports'])),
                     ('business', ('/business/', ['sports'])),
                     ('sports', ('/sports/', ['business']))])

base_url = "http://www.chron.com"

# sports has 'core-package sports' class
xpath_general = """//div[contains(@class, 'centerpiece-tabs') or
                   contains(@class, 'wrapper') or
                   contains(@class, 'contentGroups') or
                   contains(@class, 'headline-list') or
                   contains(@class, 'core-package sports') or
                   contains(@class, 'news')]
                   //a[contains(@class, 'hdn-analytics')]"""

excluded_titles = ["Winning numbers", "TV-radio listings"]
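

# Screen a candidate (link, title): reject short titles, titles on the
# excluded list, and links that belong to one of this page's sibling
# sections; return the (link, title) pair when the link is kept.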
def validate_link(page, link, title):
    other_category = page[1][1]
    if not title or len(title.strip()) < 5:
        print("{0} rejected, title too short".format(link))
        return None
    parts = link.split('/')
    if len(parts) > 3 and parts[3] in other_category:
        print("{0} rejected, covered in other section".format(link))
        return None
    for excluded_title in excluded_titles:
        if title.find(excluded_title) != -1:
            print("{0} rejected, excluded title".format(link))
            return None
    return link, title


def get_article_parsed(index_to_soup, this_url):
    return index_to_soup(this_url, as_tree=True)


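# Order (link, title) tuples by the subject in the URL's first path
# segment: 'news' first, then 'neighborhood', then 'entertainment',
# everything else last; each group is sorted by link.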
def sort_subject(element_list):
    # priority of subjects
    subjects = ['news', 'neighborhood', 'entertainment']
    subjects.reverse()
    subject_dict = OrderedDict(zip(subjects, range(len(subjects))))
    rank_dict = OrderedDict([(rank, []) for rank in range(len(subjects) + 1)])
    for element in element_list:
        try:
            subj = element[0].split('/')[3]
        except Exception:
            subj = 'unknown'
        if subject_dict.get(subj) is not None:
            rank_dict[subject_dict[subj] + 1].append(element)
        else:
            rank_dict[0].append(element)
    # now return in reverse order, sorted
    combined_list = []
    for rank in range(len(subjects), -1, -1):
        article_list = rank_dict[rank]
        article_list.sort()
        combined_list.extend(article_list)
    return combined_list


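# Scrape one section landing page: collect headline anchors matching
# xpath_general, synthesize a title from the URL slug when the anchor
# text is too short, validate each link, and return
# [section_id, sorted (link, title) list].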
def get_links_from_section_page(index_to_soup, page):
    page_doc = get_article_parsed(index_to_soup, base_url + page[1][0])
    els = page_doc.xpath(xpath_general)
    element_list = []
    for el in els:
        link = el.get('href').split('?')[0]
        title = el.text
        if title is None or len(title.strip()) < 5:
            link_id = link.split('/')[-1][:-3].split('-')[:-1]
            title = ' '.join(link_id)
        if link[:4] != 'http':
            link = base_url + link
        validated_link = validate_link(page=page, link=link, title=title)
        if validated_link is not None:
            element_list.append(validated_link)
    sorted_element_list = sort_subject(element_list)
    return [page[0], sorted_element_list]


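# Gather links for every section in `pages`, dropping any article URL
# already seen in an earlier section; returns an OrderedDict of
# section id -> list of (link, title) tuples.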
def get_all_links_from_sections(index_to_soup):
    all_sections = []
    article_set = set()
    final_dict = OrderedDict()
    for item in pages.items():
        print("getting links from {0}".format(item[0]))
        all_sections.append(get_links_from_section_page(index_to_soup, item))
    for section in all_sections:
        section_id = section[0]
        article_list = section[1]
        final_dict[section_id] = []
        for article in article_list:
            if article[0] not in article_set:
                article_set.add(article[0])
                final_dict[section_id].append(article)
    return final_dict


# noinspection PyAbstractClass
class HoustonChronicle(BasicNewsRecipe):
    title = u'The Houston Chronicle'
    description = 'News from Houston, Texas'
    __author__ = 'Dale Furrow'
    language = 'en'
    no_stylesheets = True
    remove_attributes = ['style', 'xmlns']
    remove_empty_feeds = True
    timefmt = '[%a, %d %b %Y]'
    timestampfmt = '%Y%m%d%H%M%S'
    # ignore_duplicate_articles = {'url'}  # defaults to None
    extra_css = '.article_date {display: none}'
    category = 'news, USA'
    masthead_url = 'http://www.chron.com/apple-touch-icon-76x76.png'
    keep_only_tags = [dict(name='div', attrs={'class': ['article-content', 'article-wrap']})]
    remove_tags = [dict(name='div', attrs={'social-title': True}),
                   dict(name='div', attrs={'class':
                        ['control-panel', 'gallery-overlay-inner',
                         'most-popular', 'asset-media mos-playlist',
                         'asset_media asset-media']}),
                   dict(name='li', attrs={'class': ['hst-resgalleryitem taboola-frame hidden',
                                                    'hst-resgalleryitem hidden']}),
                   dict(name='ul', attrs={'class': 'clearfix'})]

    # max_articles_per_feed = 5  # for use in testing

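    # Build a plain-text summary from the article's <p> tags: accumulate
    # whole sentences until roughly 140 characters are collected, then
    # truncate at 300 characters with an ellipsis if needed.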
    def get_article_description_from_doc(self, soup):
        description_chars_break = 140
        description_max_chars = 300
        try:
            els = soup.findAll('p')
            if len(els) > 0:
                out_text = ""
                this_ellipsis = ""
                for el in els:
                    if el is not None:
                        result = []
                        for descendant in el.contents:
                            if isinstance(descendant, NavigableString):
                                result.append(type(u'')(descendant).strip())
                        all_text = u' '.join(result)
                        if len(all_text) > 1:
                            sentences = re.findall(sentence_regex, all_text)
                            if sentences is not None and len(sentences) > 0:
                                for sentence in sentences:
                                    if len(out_text) < description_chars_break:
                                        out_text += sentence + " "
                                    else:
                                        if len(out_text) > description_max_chars:
                                            this_ellipsis = "..."
                                        return out_text[:description_max_chars] + this_ellipsis
                return out_text
            else:
                return "No Article description returned"
        except Exception as ex:
            self.log('Error on Article Description')
            traceback.print_exc(file=sys.stdout)
            print(str(ex))
            return ""

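    # Read the publication time from the article's
    # <time itemprop="datePublished" datetime="..."> element, when exactly
    # one such element is present.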
    @staticmethod
    def get_published_time_from_doc(page_doc):

        def get_regular_timestamp(date_string):
            try:
                out_date = datetime.strptime(date_string, "%Y-%m-%dT%H:%M:%SZ")
                return out_date
            except ValueError:
                return None

        el = page_doc.findAll(
            lambda this_tag: this_tag.name == "time" and ('itemprop', 'datePublished') in this_tag.attrs)
        if len(el) == 1:
            return get_regular_timestamp(el[0].get('datetime'))
        else:
            return None

    def populate_article_metadata(self, article, soup, first):
        """
        Called when each HTML page belonging to article is downloaded.
        Intended to be used to get article metadata like author/summary/etc.
        from the parsed HTML (soup).

        :param article: An object of class :class:`calibre.web.feeds.Article`.
            If you change the summary, remember to also change the text_summary.
        :param soup: Parsed HTML belonging to this article
        :param first: True iff the parsed HTML is the first page of the article.
        """
        summary = self.get_article_description_from_doc(soup)
        article_date = self.get_published_time_from_doc(soup)
        if article_date is not None:
            article_timestamp = float((article_date - datetime.utcfromtimestamp(0)).total_seconds())
            article.date = article_timestamp
            article.utctime = dt_factory(article_date.timetuple(), assume_utc=True, as_utc=True)
            article.localtime = article.utctime.astimezone(local_tz)
        summary_date = article.localtime.strftime("%Y-%m-%d %H:%M") if article_date is not None else "No Date"
        article.summary = "{0}: {1}".format(summary_date, summary)
        article.text_summary = clean_ascii_chars(article.summary)

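    # Build calibre's feed structure: one (section title, article dicts)
    # tuple per section discovered by get_all_links_from_sections.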
    def parse_index(self):
        self.timefmt = ' [%a, %d %b, %Y]'
        self.log('starting parse_index: ', time.strftime(self.timestampfmt))
        feeds = []
        sections = get_all_links_from_sections(self.index_to_soup)
        for section_id, article_list in sections.items():
            self.log("Getting {0} section, {1:d} articles".format(section_id, len(article_list)))
            articles = []
            for article_info in article_list:
                self.log("Adding {0} to feed".format(article_info[0]))
                articles.append({'title': article_info[1], 'url': article_info[0],
                                 'description': '', 'date': ""})
            self.log("Appending {0:d} articles for {1}".format(len(articles), section_id))
            feeds.append((section_id, articles))
        self.log('finished parse_index: ', time.strftime(self.timestampfmt))
        return feeds

    def preprocess_html(self, soup):
        return soup
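
# A quick way to try a recipe like this one is calibre's converter, e.g.
#   ebook-convert <this_file>.recipe out.epub --test
# (--test limits the number of articles fetched per feed; the file name
# here is illustrative).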