#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2014, spswerling'
'''
http://www.al-monitor.com/
'''
import string
import inspect
import datetime
import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup


class AlMonitor(BasicNewsRecipe):
    title = u'Al Monitor'
    __author__ = u'spswerling'
    description = 'The Pulse of the Middle East'
    no_stylesheets = True
    encoding = 'utf-8'
    category = 'news'
    language = 'en'
    publication_type = 'newspaper'
    cover_img_url = 'http://www.al-monitor.com/modules/almcontent/a-img/elements/logo.png'
    masthead_url = cover_img_url
    remove_empty_feeds = True

    # On Kindle, images can make things kind of fat. Slim them down.
    recursions = 0
    compress_news_images = True
    compress_news_images_max_size = 7
    scale_news_images = (150, 200)  # (kindle touch: 600x800)
    useHighResImages = False
    oldest_article = 1.5
    max_articles_per_section = 15

    sections = [
        (u'egypt', u'http://www.al-monitor.com/pulse/egypt-pulse'),
        (u'gulf', u'http://www.al-monitor.com/pulse/gulf-pulse'),
        (u'iran', u'http://www.al-monitor.com/pulse/iran-pulse'),
        (u'iraq', u'http://www.al-monitor.com/pulse/iraq-pulse'),
        (u'israel', u'http://www.al-monitor.com/pulse/israel-pulse'),
        (u'lebanon', u'http://www.al-monitor.com/pulse/lebanon-pulse'),
        (u'palestine', u'http://www.al-monitor.com/pulse/palestine-pulse'),
        (u'syria', u'http://www.al-monitor.com/pulse/syria-pulse'),
        (u'turkey', u'http://www.al-monitor.com/pulse/turkey-pulse'),
    ]

    # util for creating remove_tags and keep_tags style regex matchers
    def tag_matcher(elt, attr, rgx_str):
        return dict(name=elt, attrs={attr: re.compile(rgx_str, re.IGNORECASE)})

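    # Elements stripped from every article page: fixed-id page chrome plus
    # regex-matched spacer/share/overlay blocks built with tag_matcher above.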
    remove_tags = [
        dict(attrs={'id': [
            'header',
            'pulsebanner',
            'relatedarticles',
            'sidecolumn',
            'disqus',
            'footer',
            'footer2',
            'footer3',
            'mobile-extras',
        ]}),
        tag_matcher('hr', 'id', 'spacer'),
        tag_matcher('a', 'title', 'print this article'),
        tag_matcher('div', 'class', 'extras'),
        tag_matcher('div', 'class', '^clear$'),
        tag_matcher('div', 'class', '^overlay$'),
        tag_matcher('div', 'class', 'shareTag'),
    ]

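    # Crawl state shared across methods: articles maps section name -> list of
    # article dicts; urls_done records every URL already queued so duplicates are skipped.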
    articles = {}
    urls_done = []

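    # Called by calibre to build the index: crawl each section listing page,
    # then return (section title, article list) pairs.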
    def parse_index(self):
        for section in self.sections:
            self.parse_section(section[0], section[1])
        ans = []
        for k in self.articles:
            ans.append((string.capwords(k), self.articles[k]))
        return ans

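    # Fetch one section's listing page and queue every link that points at an
    # original article ('pulse/originals' in the href).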
    def parse_section(self, section, url):
        self.articles[section] = []

        try:
            self._p('process section ' + section + ', url: ' + url)
            soup = self.index_to_soup(url)
        except Exception:
            self._p('Unable to spider section')
            return []

        self._p('Got section. Processing links.')

        for link in soup.findAll('a', href=True):
            href = link.get('href')
            text = self.text(link)
            if text and ('pulse/originals' in href):
                self.process_link(section, link)

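    # Turn a listing link into an article title (truncated to 120 characters)
    # and hand it off to the queue.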
    def process_link(self, section, link):
        title = self.text(link)
        if len(title) > 120:
            title = title[0:120] + '...'
        href = link.get('href')
        if not href:
            self._p("BAD HREF: " + str(link))
            return
        self.queue_article_link(section, href, title)

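    # Skip URLs already queued, then append the article dict calibre expects,
    # honouring max_articles_per_section.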
    def queue_article_link(self, section, url, title):
        full_url = self.abs_url(url)
        if full_url in self.urls_done:
            self._p('Skip (already Qd): ' + ' - '.join([section, title, url]))
            return

        self._p('Q: ' + ' - '.join([section, title, url]))
        self.urls_done.append(full_url)
        if len(self.articles[section]) >= self.max_articles_per_section:
            return
        self.articles[section].append(
            dict(title=title,
                 url=full_url,
                 date='',
                 description='',
                 author='',
                 content=''))

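    # Returning None here tells calibre to drop the article before conversion;
    # used to discard articles flagged by should_skip_article (currently: too old).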
    def preprocess_raw_html(self, raw_html, url):
        reason_to_skip = self.should_skip_article(BeautifulSoup(raw_html))
        if reason_to_skip:
            self._p('Skipping article: ' + reason_to_skip + ', ' + url)
            # Next line will show up as an error in the logs, but ignore, see
            # http://www.mobileread.com/forums/showthread.php?p=2931136
            return None
        else:
            return super(AlMonitor, self).preprocess_raw_html(raw_html, url)

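    # Pull the article summary from the page's 'summary' div, when present.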
    def populate_article_metadata(self, article, soup, first):
        summary_node = soup.find('div', {'id': 'summary'})
        if summary_node:
            summary = self.text(summary_node)
            self._p('Summary: ' + summary)
            article.text_summary = summary
        else:
            self._p('No summary')

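    # Return a reason string when the article should be dropped (currently only
    # 'too old'), or False to keep it.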
    def should_skip_article(self, soup):
        date = self.scrape_article_date(soup)
        if not date:
            return False

        age = (datetime.datetime.now() - date).days
        if age > self.oldest_article:
            return "too old"
        return False

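    # Find a 'Posted <Month> <day>, <year>' span in the article and convert it
    # to a datetime; None if no such span exists.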
    def scrape_article_date(self, soup):
        for span in soup.findAll('span'):
            txt = self.text(span)
            rgx = re.compile(r'Posted ([a-zA-Z]+ \d\d?, \d\d\d\d).*')
            hit = rgx.match(txt)
            if hit:
                return self.date_from_string(txt)

        return None

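    # Parse the 'Posted September 17, 2014' style string scraped above;
    # returns None when parsing fails.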
    def date_from_string(self, datestring):
        try:
            # eg: Posted September 17, 2014
            dt = datetime.datetime.strptime(datestring, "Posted %B %d, %Y")
        except Exception:
            dt = None

        if dt:
            self._p('From string "' + datestring + '", datetime: ' + str(dt))
        else:
            self._p('Could not get datetime from ' + datestring)

        return dt

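    # Turn a site-relative href into an absolute URL, stripping any '#fragment'.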
    def abs_url(self, url):
        if 'www.al-monitor.com' in url:
            abs_url = url
        elif url[0] == '/':
            abs_url = 'http://www.al-monitor.com' + url
        else:
            self._p('Not sure how to make abs_url: ' + url)
            raise ValueError('Cannot build absolute URL from ' + url)

        if '#' in abs_url:
            abs_url = abs_url.split('#')[0]  # drop the fragment

        return abs_url

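    # Convenience wrapper: flatten a tag to stripped text.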
    def text(self, n):
        return self.tag_to_string(n).strip()

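    # Debug helper: summarise a node's class, id and role attributes plus its text.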
    def _dbg_soup_node(self, node):
        s = ' cls: ' + str(node.get('class')).strip() + \
            ' id: ' + str(node.get('id')).strip() + \
            ' role: ' + str(node.get('role')).strip() + \
            ' txt: ' + self.text(node)
        return s

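    # Minimal logger: prefix each message with the calling method's name and
    # truncate it to 100 characters.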
    def _p(self, msg):
        curframe = inspect.currentframe()
        calframe = inspect.getouterframes(curframe, 2)
        calname = calframe[1][3].upper()
        print('[' + calname + '] ' + msg[0:100])