calibre/recipes/houston_chronicle.recipe
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2018, Dale Furrow dkfurrow@gmail.com'
'''
chron.com
'''
import re
import time
import urllib2
from StringIO import StringIO
from datetime import datetime
import traceback
import sys
from collections import OrderedDict
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.utils.cleantext import clean_ascii_chars
from calibre.ebooks.BeautifulSoup import NavigableString
from calibre.utils.date import dt_factory, local_tz
from lxml import html
from lxml import etree
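
# Regexes, section map, and XPath expression used when scraping chron.com section pages.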
regex_date_only = re.compile(
    r"(?:January|February|March|April|May|June|July|August|September|"
    r"October|November|December)\s[0-9]{1,2},\s20[01][0-9]")
regex_time_only = re.compile(r"""[0-9]{1,2}:[0-9]{1,2} \w{2}""")
sentence_regex = re.compile(r"(\S.+?[.!?])(?=\s+|$)")
blog_regex = re.compile(r'post-\d+')
pages = OrderedDict([('news', ('/news/houston-texas/', ['business', 'sports'])),
                     ('business', ('/business/', ['sports'])),
                     ('sports', ('/sports/', ['business']))])
base_url = "http://www.chron.com"
# sports has 'core-package sports' class
xpath_general = """//div[contains(@class, 'centerpiece-tabs') or
    contains(@class, 'wrapper') or
    contains(@class, 'contentGroups') or
    contains(@class, 'headline-list') or
    contains(@class, 'core-package sports') or
    contains(@class, 'news')]
    //a[contains(@class, 'hdn-analytics')]"""
excluded_titles = ["Winning numbers", "TV-radio listings"]
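

# Reject a candidate link if its title is missing or too short, if it belongs to
# a section already covered elsewhere, or if its title matches an excluded one.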
def validate_link(page, link, title):
    other_category = page[1][1]
    if not title or len(title.strip()) < 5:
        print("{0} rejected, title too short".format(link))
        return None
    if link.split('/')[3] in other_category:
        print("{0} rejected, covered in other section".format(link))
        return None
    for excluded_title in excluded_titles:
        if title.find(excluded_title) != -1:
            print("{0} rejected, excluded title".format(link))
            return None
    return link, title
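

# Fetch a section page and parse it into an lxml document tree.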
def get_article_parsed(this_url):
    page = urllib2.urlopen(this_url)
    content = page.read()
    parser = etree.HTMLParser()
    parsed = html.parse(StringIO(content), parser)
    return parsed
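

# Order (link, title) tuples so that higher-priority subjects come first
# (news, then neighborhood, then entertainment), with unmatched subjects last.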
def sort_subject(element_list):
    # priority of subjects
    subjects = ['news', 'neighborhood', 'entertainment']
    subjects.reverse()
    subject_dict = OrderedDict(zip(subjects, range(len(subjects))))
    rank_dict = OrderedDict([(rank, []) for rank in range(len(subjects) + 1)])
    for element in element_list:
        subj = element[0].split('/')[3]
        if subject_dict.get(subj) is not None:
            rank_dict[subject_dict[subj] + 1].append(element)
        else:
            rank_dict[0].append(element)
    # now return in reverse order, sorted
    combined_list = []
    for rank in range(len(subjects), -1, -1):
        article_list = rank_dict[rank]
        article_list.sort()
        combined_list.extend(article_list)
    return combined_list
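

# Collect article links from one section page: derive a title from the URL slug
# when the anchor text is unusable, validate each link, and sort by subject.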
def get_links_from_section_page(page):
    page_doc = get_article_parsed(base_url + page[1][0])
    els = page_doc.xpath(xpath_general)
    element_list = []
    for el in els:
        link = el.get('href').split('?')[0]
        title = el.text
        if title is None or len(title.strip()) < 5:
            link_id = link.split('/')[-1][:-3].split('-')[:-1]
            title = ' '.join(link_id)
        if link[:4] != 'http':
            link = base_url + link
        validated_link = validate_link(page=page, link=link, title=title)
        if validated_link is not None:
            element_list.append(validated_link)
    sorted_element_list = sort_subject(element_list)
    return [page[0], sorted_element_list]
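

# Gather links for every section, dropping any URL already seen in an earlier
# section so each article appears in only one feed.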
def get_all_links_from_sections():
    all_sections = []
    article_set = set()
    final_dict = OrderedDict()
    for item in pages.items():
        print("getting links from {0}".format(item[0]))
        all_sections.append(get_links_from_section_page(item))
    for section in all_sections:
        section_id = section[0]
        article_list = section[1]
        final_dict[section_id] = []
        for article in article_list:
            if article[0] not in article_set:
                article_set.add(article[0])
                final_dict[section_id].append(article)
    return final_dict


# noinspection PyAbstractClass
class HoustonChronicle(BasicNewsRecipe):
    title = u'The Houston Chronicle'
    description = 'News from Houston, Texas'
    __author__ = 'Dale Furrow'
    language = 'en'
    no_stylesheets = True
    remove_attributes = ['style', 'xmlns']
    remove_empty_feeds = True
    timefmt = '[%a, %d %b %Y]'
    timestampfmt = '%Y%m%d%H%M%S'
    # ignore_duplicate_articles = {'url'}  # defaults to None
    extra_css = '.article_date {display: none}'
    category = 'news, USA'
    masthead_url = 'http://www.chron.com/apple-touch-icon-76x76.png'
    keep_only_tags = [dict(name='div', attrs={'class': ['article-content', 'article-wrap']})]
    remove_tags = [dict(name='div', attrs={'social-title': True}),
                   dict(name='div', attrs={'class':
                        ['control-panel', 'gallery-overlay-inner',
                         'most-popular', 'asset-media mos-playlist',
                         'asset_media asset-media']}),
                   dict(name='li', attrs={'class': ['hst-resgalleryitem taboola-frame hidden',
                                                    'hst-resgalleryitem hidden']}),
                   dict(name='ul', attrs={'class': 'clearfix'})]

    # max_articles_per_feed = 5  # for use in testing
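
    # Build a short plain-text description from the article's <p> tags,
    # accumulating whole sentences until roughly 140 characters and truncating
    # to 300 characters (with an ellipsis when text is cut).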
    def get_article_description_from_doc(self, soup):
        description_chars_break = 140
        description_max_chars = 300
        try:
            els = soup.findAll('p')
            if len(els) > 0:
                out_text = ""
                this_ellipsis = ""
                for el in els:
                    if el is not None:
                        result = []
                        for descendant in el.contents:
                            if isinstance(descendant, NavigableString):
                                result.append(unicode(descendant).strip())
                        all_text = u' '.join(result).encode('utf-8')
                        if len(all_text) > 1:
                            sentences = re.findall(sentence_regex, all_text)
                            if sentences is not None and len(sentences) > 0:
                                for sentence in sentences:
                                    if len(out_text) < description_chars_break:
                                        out_text += sentence + " "
                                    else:
                                        if len(out_text) > description_max_chars:
                                            this_ellipsis = "..."
                                        return out_text[:description_max_chars] + this_ellipsis
                return out_text
            else:
                return "No Article description returned"
        except Exception as ex:
            self.log('Error on Article Description')
            traceback.print_exc(file=sys.stdout)
            print(str(ex))
            return ""
    @staticmethod
    def get_published_time_from_doc(page_doc):
        def get_regular_timestamp(date_string):
            try:
                out_date = datetime.strptime(date_string, "%Y-%m-%dT%H:%M:%SZ")
                return out_date
            except ValueError:
                return None

        el = page_doc.findAll(
            lambda this_tag: this_tag.name == "time" and ('itemprop', 'datePublished') in this_tag.attrs)
        if len(el) == 1:
            return get_regular_timestamp(el[0].get('datetime'))
        else:
            return None

    def populate_article_metadata(self, article, soup, first):
        """
        Called when each HTML page belonging to article is downloaded.
        Intended to be used to get article metadata like author/summary/etc.
        from the parsed HTML (soup).
        :param article: An object of class :class:`calibre.web.feeds.Article`.
        If you change the summary, remember to also change the text_summary.
        :param soup: Parsed HTML belonging to this article
        :param first: True iff the parsed HTML is the first page of the article.
        """
        summary = self.get_article_description_from_doc(soup)
        article_date = self.get_published_time_from_doc(soup)
        if article_date is not None:
            article_timestamp = float((article_date - datetime.utcfromtimestamp(0)).total_seconds())
            article.date = article_timestamp
            article.utctime = dt_factory(article_date.timetuple(), assume_utc=True, as_utc=True)
            article.localtime = article.utctime.astimezone(local_tz)
        summary_date = article.localtime.strftime("%Y-%m-%d %H:%M") if article_date is not None else "No Date"
        article.summary = "{0}: {1}".format(summary_date, summary)
        article.text_summary = clean_ascii_chars(article.summary)
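
    # Build the feed list: one feed per section, each entry carrying the
    # article's URL and title (description and date are filled in later by
    # populate_article_metadata).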
    def parse_index(self):
        self.timefmt = ' [%a, %d %b, %Y]'
        self.log('starting parse_index: ', time.strftime(self.timestampfmt))
        feeds = []
        sections = get_all_links_from_sections()
        for section_id, article_list in sections.items():
            self.log("Getting {0} section, {1:d} articles".format(section_id, len(article_list)))
            articles = []
            for article_info in article_list:
                self.log("Adding {0} to feed".format(article_info[0]))
                articles.append({'title': article_info[1], 'url': article_info[0],
                                 'description': '', 'date': ""})
            self.log("Appending {0:d} articles for {1}".format(len(articles), section_id))
            feeds.append((section_id, articles))
        self.log('finished parse_index: ', time.strftime(self.timestampfmt))
        return feeds
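
    # No extra HTML preprocessing is needed; keep_only_tags and remove_tags
    # already handle the cleanup.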
    def preprocess_html(self, soup):
        return soup