#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import print_function, unicode_literals

__license__ = 'GPL v3'
__copyright__ = '2014, spswerling'
'''
http://www.al-monitor.com/
'''
import string
import inspect
import datetime
import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup


class AlMonitor(BasicNewsRecipe):
    title = u'Al Monitor'
    __author__ = u'spswerling'
    description = 'The Pulse of the Middle East'
    no_stylesheets = True
    encoding = 'utf-8'
    category = 'news'
    language = 'en'
    publication_type = 'newspaper'
    cover_img_url = 'http://www.al-monitor.com/modules/almcontent/a-img/elements/logo.png'
    masthead_url = cover_img_url
    remove_empty_feeds = True

    # On Kindle, images can make things kind of fat. Slim them down.
    recursions = 0
    compress_news_images = True
    compress_news_images_max_size = 7
    scale_news_images = (150, 200)  # (kindle touch: 600x800)
    useHighResImages = False
    oldest_article = 1.5
    max_articles_per_section = 15

    sections = [
        (u'egypt', u'http://www.al-monitor.com/pulse/egypt-pulse'),
        (u'gulf', u'http://www.al-monitor.com/pulse/gulf-pulse'),
        (u'iran', u'http://www.al-monitor.com/pulse/iran-pulse'),
        (u'iraq', u'http://www.al-monitor.com/pulse/iraq-pulse'),
        (u'israel', u'http://www.al-monitor.com/pulse/israel-pulse'),
        (u'lebanon', u'http://www.al-monitor.com/pulse/lebanon-pulse'),
        (u'palestine', u'http://www.al-monitor.com/pulse/palestine-pulse'),
        (u'syria', u'http://www.al-monitor.com/pulse/syria-pulse'),
        (u'turkey', u'http://www.al-monitor.com/pulse/turkey-pulse'),
    ]

    # util for creating remove_tags and keep_tags style regex matchers
    def tag_matcher(elt, attr, rgx_str):
        return dict(name=elt,
                    attrs={attr: re.compile(rgx_str, re.IGNORECASE)})

    remove_tags = [
        dict(attrs={'id': [
            'header',
            'pulsebanner',
            'relatedarticles',
            'sidecolumn',
            'disqus',
            'footer', 'footer2', 'footer3',
            'mobile-extras',
        ]}),
        tag_matcher('hr', 'id', 'spacer'),
        tag_matcher('a', 'title', 'print this article'),
        tag_matcher('div', 'class', 'extras'),
        tag_matcher('div', 'class', '^clear$'),
        tag_matcher('div', 'class', '^overlay$'),
        tag_matcher('div', 'class', 'shareTag'),
    ]

    articles = {}
    urls_done = []

    def parse_index(self):
        for section in self.sections:
            self.parse_section(section[0], section[1])

        ans = []
        for k in self.articles:
            ans.append((string.capwords(k), self.articles[k]))
        return ans
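    # For reference, calibre expects parse_index() to return a list of
    # ('Section Title', [article_dict, ...]) tuples, where each article dict
    # has at least 'title' and 'url'; the other keys queued below are
    # optional. A minimal sketch of the shape built above (headline and URL
    # are illustrative placeholders):
    #
    #   [(u'Egypt', [{'title': u'Some headline',
    #                 'url': u'http://www.al-monitor.com/pulse/originals/...',
    #                 'date': '', 'description': '', 'author': '',
    #                 'content': ''}]),
    #    ...]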
    def parse_section(self, section, url):
        self.articles[section] = []
        try:
            self._p('process section ' + section + ', url: ' + url)
            soup = self.index_to_soup(url)
        except Exception:
            self._p('Unable to spider section')
            return []

        self._p('Got section. Processing links.')
        for link in soup.findAll('a', href=True):
            href = link.get('href')
            text = self.text(link)
            if text and ('pulse/originals' in href):
                self.process_link(section, link)

    def process_link(self, section, link):
        title = self.text(link)
        if len(title) > 120:
            title = title[0:120] + '...'
        href = link.get('href')
        if not href:
            self._p('BAD HREF: ' + str(link))
            return
        self.queue_article_link(section, href, title)

    def queue_article_link(self, section, url, title):
        full_url = self.abs_url(url)
        if full_url in self.urls_done:
            self._p('Skip (already Qd): ' + ' - '.join([section, title, url]))
            return
        self._p('Q: ' + ' - '.join([section, title, url]))
        self.urls_done.append(full_url)
        if len(self.articles[section]) >= self.max_articles_per_section:
            return
        self.articles[section].append(
            dict(title=title,
                 url=full_url,
                 date='',
                 description='',
                 author='',
                 content=''))

    def preprocess_raw_html(self, raw_html, url):
        reason_to_skip = self.should_skip_article(BeautifulSoup(raw_html))
        if reason_to_skip:
            self._p('Skipping article: ' + reason_to_skip + ', ' + url)
            # Next line will show up as an error in the logs, but ignore,
            # see http://www.mobileread.com/forums/showthread.php?p=2931136
            return None
        else:
            return super(AlMonitor, self).preprocess_raw_html(raw_html, url)

    def populate_article_metadata(self, article, soup, first):
        summary_node = soup.find('div', {'id': 'summary'})
        if summary_node:
            summary = self.text(summary_node)
            self._p('Summary: ' + summary)
            article.text_summary = summary
        else:
            self._p('No summary')

    def should_skip_article(self, soup):
        date = self.scrape_article_date(soup)
        if not date:
            return False
        age = (datetime.datetime.now() - date).days
        if age > self.oldest_article:
            return 'too old'
        return False

    def scrape_article_date(self, soup):
        for span in soup.findAll('span'):
            txt = self.text(span)
            rgx = re.compile(r'Posted ([a-zA-Z]+ \d\d?, \d\d\d\d).*')
            hit = rgx.match(txt)
            if hit:
                # Pass only the matched date portion; strptime would choke
                # on any trailing text after the year.
                return self.date_from_string('Posted ' + hit.group(1))
        return None

    def date_from_string(self, datestring):
        try:
            # eg: Posted September 17, 2014
            dt = datetime.datetime.strptime(datestring, 'Posted %B %d, %Y')
        except Exception:
            dt = None

        if dt:
            self._p('From string "' + datestring + '", datetime: ' + str(dt))
        else:
            self._p('Could not get datetime from ' + datestring)
        return dt

    def abs_url(self, url):
        if 'www.al-monitor.com' in url:
            abs_url = url
        elif url[0] == '/':
            abs_url = 'http://www.al-monitor.com' + url
        else:
            self._p('Not sure how to make abs_url: ' + url)
            raise ValueError('Cannot make absolute url: ' + url)

        # Strip any fragment identifier.
        if '#' in abs_url:
            abs_url = abs_url.split('#')[0]
        return abs_url

    def text(self, n):
        return self.tag_to_string(n).strip()

    def _dbg_soup_node(self, node):
        s = ' cls: ' + str(node.get('class')).strip() + \
            ' id: ' + str(node.get('id')).strip() + \
            ' role: ' + str(node.get('role')).strip() + \
            ' txt: ' + self.text(node)
        return s

    def _p(self, msg):
        curframe = inspect.currentframe()
        calframe = inspect.getouterframes(curframe, 2)
        calname = calframe[1][3].upper()
        print('[' + calname + '] ' + msg[0:100])
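
# To try the recipe from the command line, the standard calibre workflow
# applies (assuming this file is saved as al_monitor.recipe; '--test'
# limits the fetch to a couple of articles per feed, and repeating -v
# raises verbosity):
#
#   ebook-convert al_monitor.recipe out.epub --test -vv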