calibre/recipes/houston_chronicle.recipe
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import print_function
__license__ = 'GPL v3'
__copyright__ = '2018, Dale Furrow dkfurrow@gmail.com'
'''
chron.com
'''
import re
import time
from datetime import datetime
import traceback
import sys
from collections import OrderedDict
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.utils.cleantext import clean_ascii_chars
from calibre.ebooks.BeautifulSoup import NavigableString
from calibre.utils.date import dt_factory, local_tz
regex_date_only = re.compile(
    r"(?:January|February|March|April|May|June|July|August|September|"
    r"October|November|December)\s[0-9]{1,2},\s20[01][0-9]")
regex_time_only = re.compile(r"""[0-9]{1,2}:[0-9]{1,2} \w{2}""")
sentence_regex = re.compile(r"(\S.+?[.!?])(?=\s+|$)")
blog_regex = re.compile(r'post-\d+')
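
# Section index pages to crawl: section id -> (index path, other section ids
# whose stories should be skipped here so they are not duplicated across feeds).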
pages = OrderedDict([('news', ('/news/houston-texas/', ['business', 'sports'])),
                     ('business', ('/business/', ['sports'])),
                     ('sports', ('/sports/', ['business']))])
base_url = "http://www.chron.com"
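
# Pull article anchors (class 'hdn-analytics') out of the known headline
# containers on a section index page.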
# sports has 'core-package sports' class
xpath_general = """//div[contains(@class, 'centerpiece-tabs') or
                         contains(@class, 'wrapper') or
                         contains(@class, 'contentGroups') or
                         contains(@class, 'headline-list') or
                         contains(@class, 'core-package sports') or
                         contains(@class, 'news')]
                    //a[contains(@class, 'hdn-analytics')]"""
excluded_titles = ["Winning numbers", "TV-radio listings"]
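

# Return (link, title) if the link should be kept, otherwise None: missing or very
# short titles, stories that belong to another configured section, and explicitly
# excluded titles are all rejected.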
def validate_link(page, link, title):
    other_category = page[1][1]
    if not title or len(title.strip()) < 5:
        print("{0} rejected, title too short".format(link))
        return None
    parts = link.split('/')
    if len(parts) > 3 and parts[3] in other_category:
        print("{0} rejected, covered in other section".format(link))
        return None
    for excluded_title in excluded_titles:
        if title.find(excluded_title) != -1:
            print("{0} rejected, excluded title".format(link))
            return None
    return link, title
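

# Fetch a section index page and parse it into an lxml tree (as_tree=True).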
def get_article_parsed(index_to_soup, this_url):
    return index_to_soup(this_url, as_tree=True)
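

# Rank links by subject: 'news' first, then 'neighborhood', then 'entertainment',
# with everything else last; within a rank, links sort alphabetically by URL.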
def sort_subject(element_list):
    # priority of subjects
    subjects = ['news', 'neighborhood', 'entertainment']
    subjects.reverse()
    subject_dict = OrderedDict(zip(subjects, range(len(subjects))))
    rank_dict = OrderedDict([(rank, []) for rank in range(len(subjects) + 1)])
    for element in element_list:
        try:
            subj = element[0].split('/')[3]
        except Exception:
            subj = 'unknown'
        if subject_dict.get(subj) is not None:
            rank_dict[subject_dict[subj] + 1].append(element)
        else:
            rank_dict[0].append(element)
    # now return in reverse order, sorted
    combined_list = []
    for rank in range(len(subjects), -1, -1):
        article_list = rank_dict[rank]
        article_list.sort()
        combined_list.extend(article_list)
    return combined_list
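

# Collect, validate and rank every article link on one section index page.
# Returns [section id, [(url, title), ...]].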
def get_links_from_section_page(index_to_soup, page):
    page_doc = get_article_parsed(index_to_soup, base_url + page[1][0])
    els = page_doc.xpath(xpath_general)
    element_list = []
    for el in els:
        link = el.get('href').split('?')[0]
        title = el.text
        if title is None or len(title.strip()) < 5:
            link_id = link.split('/')[-1][:-3].split('-')[:-1]
            title = ' '.join(link_id)
        if link[:4] != 'http':
            link = base_url + link
        validated_link = validate_link(page=page, link=link, title=title)
        if validated_link is not None:
            element_list.append(validated_link)
    sorted_element_list = sort_subject(element_list)
    return [page[0], sorted_element_list]
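

# Build an ordered mapping of section id -> article links, keeping only the first
# occurrence of each URL so a story never appears in more than one feed.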
def get_all_links_from_sections(index_to_soup):
    all_sections = []
    article_set = set()
    final_dict = OrderedDict()
    for item in pages.items():
        print("getting links from {0}".format(item[0]))
        all_sections.append(get_links_from_section_page(index_to_soup, item))
    for section in all_sections:
        section_id = section[0]
        article_list = section[1]
        final_dict[section_id] = []
        for article in article_list:
            if article[0] not in article_set:
                article_set.add(article[0])
                final_dict[section_id].append(article)
    return final_dict
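

# The recipe: section indexes are walked by the helpers above, article pages are
# trimmed with keep_only_tags/remove_tags, and per-article metadata is filled in
# by populate_article_metadata below.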
# noinspection PyAbstractClass
class HoustonChronicle(BasicNewsRecipe):
    title = u'The Houston Chronicle'
    description = 'News from Houston, Texas'
    __author__ = 'Dale Furrow'
    language = 'en'
    no_stylesheets = True
    remove_attributes = ['style', 'xmlns']
    remove_empty_feeds = True
    timefmt = '[%a, %d %b %Y]'
    timestampfmt = '%Y%m%d%H%M%S'
    # ignore_duplicate_articles = {'url'}  # defaults to None
    extra_css = '.article_date {display: none}'
    category = 'news, USA'
    masthead_url = 'http://www.chron.com/apple-touch-icon-76x76.png'
    keep_only_tags = [dict(name='div', attrs={'class': ['article-content', 'article-wrap']})]
    remove_tags = [dict(name='div', attrs={'social-title': True}),
                   dict(name='div', attrs={'class':
                                           ['control-panel', 'gallery-overlay-inner',
                                            'most-popular', 'asset-media mos-playlist',
                                            'asset_media asset-media']}),
                   dict(name='li', attrs={'class': ['hst-resgalleryitem taboola-frame hidden',
                                                    'hst-resgalleryitem hidden']}),
                   dict(name='ul', attrs={'class': 'clearfix'})]

    # max_articles_per_feed = 5  # for use in testing
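
    # Build a short plain-text summary from the article's <p> tags: whole sentences
    # are appended until roughly 140 characters, and the result is capped at 300
    # characters plus an ellipsis.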
    def get_article_description_from_doc(self, soup):
        description_chars_break = 140
        description_max_chars = 300
        try:
            els = soup.findAll('p')
            if len(els) > 0:
                out_text = ""
                this_ellipsis = ""
                for el in els:
                    if el is not None:
                        result = []
                        for descendant in el.contents:
                            if isinstance(descendant, NavigableString):
                                result.append(type(u'')(descendant).strip())
                        all_text = u' '.join(result)
                        if len(all_text) > 1:
                            sentences = re.findall(sentence_regex, all_text)
                            if sentences is not None and len(sentences) > 0:
                                for sentence in sentences:
                                    if len(out_text) < description_chars_break:
                                        out_text += sentence + " "
                                    else:
                                        if len(out_text) > description_max_chars:
                                            this_ellipsis = "..."
                                        return out_text[:description_max_chars] + this_ellipsis
                return out_text
            else:
                return "No Article description returned"
        except Exception as ex:
            self.log('Error on Article Description')
            traceback.print_exc(file=sys.stdout)
            print(str(ex))
            return ""
    @staticmethod
    def get_published_time_from_doc(page_doc):
        def get_regular_timestamp(date_string):
            try:
                out_date = datetime.strptime(date_string, "%Y-%m-%dT%H:%M:%SZ")
                return out_date
            except ValueError:
                return None

        # match <time itemprop="datePublished"> (dict-style attrs lookup for BeautifulSoup4)
        el = page_doc.findAll(
            lambda this_tag: this_tag.name == "time" and this_tag.get('itemprop') == 'datePublished')
        if len(el) == 1:
            return get_regular_timestamp(el[0].get('datetime'))
        else:
            return None
    def populate_article_metadata(self, article, soup, first):
        """
        Called when each HTML page belonging to article is downloaded.
        Intended to be used to get article metadata like author/summary/etc.
        from the parsed HTML (soup).
        :param article: An object of class :class:`calibre.web.feeds.Article`.
        If you change the summary, remember to also change the text_summary.
        :param soup: Parsed HTML belonging to this article.
        :param first: True iff the parsed HTML is the first page of the article.
        """
        summary = self.get_article_description_from_doc(soup)
        article_date = self.get_published_time_from_doc(soup)
        if article_date is not None:
            article_timestamp = float((article_date - datetime.utcfromtimestamp(0)).total_seconds())
            article.date = article_timestamp
            article.utctime = dt_factory(article_date.timetuple(), assume_utc=True, as_utc=True)
            article.localtime = article.utctime.astimezone(local_tz)
        summary_date = article.localtime.strftime("%Y-%m-%d %H:%M") if article_date is not None else "No Date"
        article.summary = "{0}: {1}".format(summary_date, summary)
        article.text_summary = clean_ascii_chars(article.summary)
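
    # parse_index drives the download: it walks the configured section index pages
    # and returns (section, article list) feeds for calibre to fetch.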
    def parse_index(self):
        self.timefmt = ' [%a, %d %b, %Y]'
        self.log('starting parse_index: ', time.strftime(self.timestampfmt))
        feeds = []
        sections = get_all_links_from_sections(self.index_to_soup)
        for section_id, article_list in sections.items():
            self.log("Getting {0} section, {1:d} articles".format(section_id, len(article_list)))
            articles = []
            for article_info in article_list:
                self.log("Adding {0} to feed".format(article_info[0]))
                articles.append({'title': article_info[1], 'url': article_info[0],
                                 'description': '', 'date': ""})
            self.log("Appending {0:d} articles for {1}".format(len(articles), section_id))
            feeds.append((section_id, articles))
        self.log('finished parse_index: ', time.strftime(self.timestampfmt))
        return feeds
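
    # No extra per-page cleanup is needed beyond keep_only_tags/remove_tags above.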
    def preprocess_html(self, soup):
        return soup