#!/usr/bin/env python2
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2016, Dale Furrow dkfurrow@gmail.com'
'''
chron.com
'''
import re
import time
from datetime import datetime, timedelta, date
from lxml import html
from calibre.web.feeds.recipes import BasicNewsRecipe

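# This recipe builds its article list by scraping chron.com section index
# pages directly (news, business, sports) rather than relying on RSS feeds.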
class HoustonChronicle(BasicNewsRecipe):

    title = u'The Houston Chronicle'
    description = 'News from Houston, Texas'
    __author__ = 'Dale Furrow'
    language = 'en'
    no_stylesheets = True
    remove_attributes = ['style', 'xmlns']
    remove_empty_feeds = True
    timefmt = '[%a, %d %b %Y]'
    timestampfmt = '%Y%m%d%H%M%S'
    ignore_duplicate_articles = {'url'}
    extra_css = '.article_date {display: none}'

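    # Maximum age, in days, of articles to keep; anything published before
    # earliest_date is skipped when the article list is assembled.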
    oldest_web_article = 7.0

    if oldest_web_article is None:
        earliest_date = date.today()
    else:
        earliest_date = date.today() - timedelta(days=oldest_web_article)

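    # Section pages to scrape, as (feed title, path relative to base_url) pairs.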
    pages = [('news', '/news/houston-texas/'),
             ('business', '/business/'),
             ('sports', '/sports/')]

    base_url = "http://www.chron.com"

    xpath_str = """//div[contains(@class, 'news') or contains(@class, 'headline-list')]
        //*[self::h4 or self::h5]//a[contains(@class, 'hdn-analytics')]"""

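    # Collect (url, title) pairs from a section index page by applying
    # xpath_str to the parsed page; relative links are made absolute.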
    def get_links_from_section_page(self, section_url):
        page_doc = html.parse(self.base_url + section_url)
        els = page_doc.xpath(self.xpath_str)
        element_list = []
        for el in els:
            link = el.get('href')
            title = el.text
            if link[:4] != 'http':
                link = self.base_url + link
            if title is not None:
                element_list.append((link, el.text))
        return element_list

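    # Build a short plain-text description for an article by joining whole
    # sentences from its body paragraphs until roughly 140 characters are
    # collected, truncating at 300 characters with an ellipsis.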
    def get_article_description_from_doc(self, page_doc):
        description_chars_break = 140
        description_max_chars = 300
        desc_xpath = """//div[contains(@class, 'article-body') or
            contains(@class, 'resource-content') or contains(@class, 'post')]//p"""
        sentence_regex = re.compile("(\S.+?[.!?])(?=\s+|$)")

        def stringify_children(node):
            return ''.join([x for x in node.itertext()])

        try:
            els = page_doc.xpath(desc_xpath)
            out_text = ""
            ellipsis = ""
            for el in els:
                sentences = re.findall(sentence_regex, stringify_children(el))
                for sentence in sentences:
                    if len(out_text) < description_chars_break:
                        out_text += sentence + " "
                    else:
                        if len(out_text) > description_max_chars:
                            ellipsis = "..."
                        return out_text[:description_max_chars] + ellipsis
            return out_text
        except:
            self.log('Error on Article Description')
            return ""

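    # Extract the article's publication time: prefer an ISO timestamp in the
    # 'title' attribute of a .timestamp element, falling back to parsing a
    # human-readable date/time from .entry-date or .post-date text.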
    def get_published_time_from_doc(self, page_doc):
        # Month names are kept in a single-line alternation; splitting the
        # pattern across lines would embed whitespace in the regex and stop
        # some month names from matching.
        regex_date_only = re.compile(
            "(?:January|February|March|April|May|June|July|August|"
            "September|October|November|December)"
            "\s[0-9]{1,2},\s20[01][0-9]")
        regex_time_only = re.compile("[0-9]{1,2}:[0-9]{1,2} \w{2}")

        def get_regular_timestamp(date_string):
            try:
                out_date = datetime.strptime(date_string, "%Y-%m-%dT%H:%M:%SZ")
                return out_date
            except:
                return None

        def get_date_from_string(in_text):
            match = re.findall(regex_date_only, in_text)
            if match:
                try:
                    out_date = datetime.strptime(match[0], "%B %d, %Y")
                    match = re.findall(regex_time_only, in_text)
                    if match:
                        out_time = datetime.strptime(match[0], "%I:%M %p")
                        return datetime.combine(out_date.date(), out_time.time())
                    return out_date
                except:
                    return None

        el = page_doc.xpath("//*[@class='timestamp'][1]")
        if len(el) == 1:
            return get_regular_timestamp(el[0].get('title'))
        else:
            el = page_doc.xpath(
                "//*[@class='entry-date' or @class='post-date'][1]")
            if len(el) == 1:
                return get_date_from_string(el[0].text_content())
            else:
                return None

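    # Assemble the article list for one section: fetch every linked article,
    # build its description and timestamp, and keep it only if it is recent
    # enough and its title is not excluded.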
    def get_all_data_feeds_from_page(self, page):
        articles = []
        exclude_titles_with = ['Winning numbers']

        def title_excluded(title):
            for text in exclude_titles_with:
                if title.find(text) != -1:
                    return True
            return False

        link_list = self.get_links_from_section_page(page[1])
        self.log('from section: ', page[0],
                 " found ", len(link_list), " links")
        for link in link_list:
            try:
                article_doc = html.parse(link[0])
                description = self.get_article_description_from_doc(
                    article_doc)
                parsed_date = self.get_published_time_from_doc(article_doc)
                if parsed_date is not None and description is not None and \
                        parsed_date.date() > self.earliest_date and \
                        not title_excluded(link[1]):
                    intro_date = parsed_date.strftime('%d %b %H:%M') + " - "
                    articles.append({'title': link[1], 'url': link[0],
                                     'description': intro_date + description,
                                     'date': ""})
                    self.log(page[0] + ": " + link[1] + ', from ' + intro_date +
                             " description of " + str(len(description)) + ' characters at ' + link[0])
                else:
                    if parsed_date is None:
                        msg = " No Timestamp Found"
                    elif title_excluded(link[1]):
                        msg = " Title Excluded"
                    else:
                        msg = " article older than " + \
                            str(self.oldest_web_article) + ' days...'
                    self.log("Skipping article: ", link[0], msg)
            except:
                print 'error on fetching ' + link[0]
                continue
        return articles

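    # Standard calibre entry point: build one feed per entry in self.pages.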
    def parse_index(self):
        self.timefmt = ' [%a, %d %b, %Y]'
        self.log('starting parse_index: ', time.strftime(self.timestampfmt))
        feeds = []
        for page in self.pages:
            articles = self.get_all_data_feeds_from_page(page)
            if articles:
                feeds.append((page[0], articles))
        self.log('finished parse_index: ', time.strftime(self.timestampfmt))
        return feeds

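    # Reduce the downloaded page to its story (or blog-post) content: collect
    # the tags that make up the article body plus their descendants and
    # ancestors, then strip everything else from the soup.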
    def preprocess_html(self, soup):
        tags_to_exclude = [('class', "caption staged"),
                           ('style', "display:none")]
        story_tag = soup.find(
            name='div', attrs={'class': ['article-content', 'article-body']})
        blog_tag = soup.find(name='div', attrs={'id': re.compile('post-\d+')})

        def is_excluded(tag_to_check):
            for attr in tag_to_check.attrs:
                if attr in tags_to_exclude:
                    return True
            return False

        def get_attr_startswith(attrs, this_key, this_valuestart):
            starts_with = False
            for attr in attrs:
                if attr[0] == this_key:
                    if attr[1].startswith(this_valuestart):
                        starts_with = True
            return starts_with

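        # Select the content tags: paragraphs, headings, tables and gallery
        # items on a story page, or paragraphs, headings, 'post' spans and
        # loaded images on a blog post.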
        base_tags = []
        if story_tag is not None:
            base_tags = story_tag.findAll(lambda this_tag: (this_tag.name == "p" and not ('class', 'open') in this_tag.attrs and not ('class', 'close') in this_tag.attrs) or this_tag.name.startswith('h') or this_tag.name == 'table' or (this_tag.name == 'li' and ('class', 'hst-resgalleryitem') in this_tag.attrs)) # noqa
        if blog_tag is not None:
            base_tags = blog_tag.findAll(lambda this_tag: (this_tag.name == "p" or this_tag.name.startswith('h')) or (this_tag.name == "span" and get_attr_startswith(this_tag.attrs, 'class', 'post')) or (this_tag.name == 'img' and ('lazy-state', 'loaded') in this_tag.attrs)) # noqa

        self.log('content tags: ' + str(type(base_tags)) + str(len(base_tags)))
        all_tags = []
        all_tags.extend(base_tags)
        if len(base_tags) > 0:
            for tag in base_tags:
                all_tags.extend(tag.findAll(True))

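        # Also keep each content tag's ancestors (unless excluded) so the
        # surrounding document structure survives the extraction pass below.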
        for tag in base_tags:
            while tag.parent is not None and not is_excluded(tag):
                all_tags.append(tag)
                tag = tag.parent
        for tag in soup.findAll(True):
            if tag not in all_tags:
                tag.extract()
        return soup