Mirror of https://github.com/kovidgoyal/calibre.git
Al Monitor by spswerling
This commit is contained in:
parent 5ed5dfeb02
commit 2b23b0d342
209 recipes/al_monitor.recipe Normal file
@@ -0,0 +1,209 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
__license__ = 'GPL v3'
__copyright__ = '2014, spswerling'
'''
http://www.al-monitor.com/
'''
import datetime
import inspect
import re
import string

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup

class AlMonitor(BasicNewsRecipe):
    title = u'Al Monitor'
    __author__ = u'spswerling'
    description = 'The Pulse of the Middle East'
    no_stylesheets = True
    encoding = 'utf-8'
    category = 'news'
    language = 'en'
    publication_type = 'newspaper'
    cover_img_url = 'http://www.al-monitor.com/modules/almcontent/a-img/elements/logo.png'
    masthead_url = cover_img_url
    remove_empty_feeds = True

    # on kindle, images can make things kind of fat. Slim them down.
    recursions = 0
    compress_news_images = True
    compress_news_images_max_size = 7
    scale_news_images = (150, 200)  # (kindle touch: 600x800)
    useHighResImages = False
    oldest_article = 1.5
    max_articles_per_section = 15

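    # (section name, section landing page URL) pairs spidered by parse_index().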
    sections = [
        (u'egypt', u'http://www.al-monitor.com/pulse/egypt-pulse'),
        (u'gulf', u'http://www.al-monitor.com/pulse/gulf-pulse'),
        (u'iran', u'http://www.al-monitor.com/pulse/iran-pulse'),
        (u'iraq', u'http://www.al-monitor.com/pulse/iraq-pulse'),
        (u'israel', u'http://www.al-monitor.com/pulse/israel-pulse'),
        (u'lebanon', u'http://www.al-monitor.com/pulse/lebanon-pulse'),
        (u'palestine', u'http://www.al-monitor.com/pulse/palestine-pulse'),
        (u'syria', u'http://www.al-monitor.com/pulse/syria-pulse'),
        (u'turkey', u'http://www.al-monitor.com/pulse/turkey-pulse'),
    ]

    # util for creating remove_tags and keep_tags style regex matchers
    def tag_matcher(elt, attr, rgx_str):
        return dict(name=elt, attrs={attr: re.compile(rgx_str, re.IGNORECASE)})

    remove_tags = [
        dict(attrs={'id': [
            'header',
            'pulsebanner',
            'relatedarticles',
            'sidecolumn',
            'disqus',
            'footer',
            'footer2',
            'footer3',
            'mobile-extras',
        ]}),
        tag_matcher('hr', 'id', 'spacer'),
        tag_matcher('a', 'title', 'print this article'),
        tag_matcher('div', 'class', 'extras'),
        tag_matcher('div', 'class', '^clear$'),
        tag_matcher('div', 'class', '^overlay$'),
        tag_matcher('div', 'class', 'shareTag'),
    ]

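    # Scrape state: articles found per section, and URLs already queued (to skip duplicates).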
    articles = {}
    urls_done = []

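    # Entry point used by calibre: spider every section page, then return a
    # list of (section title, list of article dicts) pairs.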
    def parse_index(self):
        for section in self.sections:
            self.parse_section(section[0], section[1])
        ans = []
        for k in self.articles:
            ans.append((string.capwords(k), self.articles[k]))
        return ans

    def parse_section(self, section, url):
        self.articles[section] = []

        try:
            self._p('process section ' + section + ', url: ' + url)
            soup = self.index_to_soup(url)
        except Exception:
            self._p('Unable to spider section')
            return []

        self._p('Got section. Processing links.')

        for link in soup.findAll('a'):
            href = link.get('href')
            text = self.text(link)
            if href and text and ('pulse/originals' in href):
                self.process_link(section, link)

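    # Turn an <a> tag into a queued article, truncating overly long titles.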
    def process_link(self, section, link):
        title = self.text(link)
        if len(title) > 120:
            title = title[0:120] + '...'
        href = link.get('href')
        if not href:
            self._p("BAD HREF: " + str(link))
            return
        self.queue_article_link(section, href, title)

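    # Queue an article for a section unless it was already queued or the section is full.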
    def queue_article_link(self, section, url, title):
        full_url = self.abs_url(url)
        if full_url in self.urls_done:
            self._p('Skip (already Qd): ' + ' - '.join([section, title, url]))
            return

        self._p('Q: ' + ' - '.join([section, title, url]))
        self.urls_done.append(full_url)
        if len(self.articles[section]) >= self.max_articles_per_section:
            return
        self.articles[section].append(
            dict(title=title,
                 url=full_url,
                 date='',
                 description='',
                 author='',
                 content=''))

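    # Returning None here makes the download of this article fail, which is how
    # stale pieces are skipped; the resulting log error is harmless (see the
    # forum link below).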
    def preprocess_raw_html(self, raw_html, url):
        reason_to_skip = self.should_skip_article(BeautifulSoup(raw_html))
        if reason_to_skip:
            self._p('Skipping article: ' + reason_to_skip + ', ' + url)
            # Next line will show up as an error in the logs, but ignore, see
            # http://www.mobileread.com/forums/showthread.php?p=2931136
            return None
        else:
            return super(AlMonitor, self).preprocess_raw_html(raw_html, url)

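    # Use the article page's summary block (div id="summary") as the description calibre shows.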
    def populate_article_metadata(self, article, soup, first):
        summary_node = soup.find('div', {'id': 'summary'})
        if summary_node:
            summary = self.text(summary_node)
            self._p('Summary: ' + summary)
            article.text_summary = summary
        else:
            self._p('No summary')

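    # Return a reason string if the article should be skipped (currently only "too old"), else False.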
    def should_skip_article(self, soup):
        date = self.scrape_article_date(soup)
        if not date:
            return False

        age = (datetime.datetime.now() - date).days
        if age > self.oldest_article:
            return "too old"
        return False

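    # Look for a "Posted <Month> <day>, <year>" span on the page and parse it into a datetime.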
    def scrape_article_date(self, soup):
        for span in soup.findAll('span'):
            txt = self.text(span)
            rgx = re.compile(r'Posted ([a-zA-Z]+ \d\d?, \d\d\d\d).*')
            hit = rgx.match(txt)
            if hit:
                return self.date_from_string('Posted ' + hit.group(1))

        return None

    def date_from_string(self, datestring):
        try:
            # eg: Posted September 17, 2014
            dt = datetime.datetime.strptime(datestring, "Posted %B %d, %Y")
        except ValueError:
            dt = None

        if dt:
            self._p('From string "' + datestring + '", datetime: ' + str(dt))
        else:
            self._p('Could not get datetime from ' + datestring)

        return dt

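    # Make a link absolute against http://www.al-monitor.com and strip any '#fragment'.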
    def abs_url(self, url):
        if 'www.al-monitor.com' in url:
            abs_url = url
        elif url[0] == '/':
            abs_url = 'http://www.al-monitor.com' + url
        else:
            self._p('Not sure how to make abs_url: ' + url)
            raise ValueError('Not sure how to make abs_url: ' + url)

        if '#' in abs_url:
            abs_url = abs_url.split('#')[0]

        return abs_url

    def text(self, n):
        return self.tag_to_string(n).strip()

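    # Debug helper: summarize a node's class, id, role, and text.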
    def _dbg_soup_node(self, node):
        s = ' cls: ' + str(node.get('class')).strip() + \
            ' id: ' + str(node.get('id')).strip() + \
            ' role: ' + str(node.get('role')).strip() + \
            ' txt: ' + self.text(node)
        return s

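    # Debug print helper: prefix each message with the calling method's name (via inspect).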
    def _p(self, msg):
        curframe = inspect.currentframe()
        calframe = inspect.getouterframes(curframe, 2)
        calname = calframe[1][3].upper()
        print('[' + calname + '] ' + msg[0:100])