From 2b23b0d3427f31a2121cb8ccb606b5f8d65c1a16 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Thu, 25 Sep 2014 00:10:45 +0530
Subject: [PATCH] Al Monitor by spswerling

---
 recipes/al_monitor.recipe | 216 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 216 insertions(+)
 create mode 100644 recipes/al_monitor.recipe

diff --git a/recipes/al_monitor.recipe b/recipes/al_monitor.recipe
new file mode 100644
index 0000000000..1339eaf38c
--- /dev/null
+++ b/recipes/al_monitor.recipe
@@ -0,0 +1,216 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+__license__ = 'GPL v3'
+__copyright__ = '2014, spswerling'
+'''
+http://www.al-monitor.com/
+'''
+import string, inspect, datetime, re
+from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
+
+class AlMonitor(BasicNewsRecipe):
+    title = u'Al Monitor'
+    __author__ = u'spswerling'
+    description = u'The Pulse of the Middle East'
+    no_stylesheets = True
+    encoding = 'utf-8'
+    category = 'news'
+    language = 'en'
+    publication_type = 'newspaper'
+    cover_img_url = 'http://www.al-monitor.com/modules/almcontent/a-img/elements/logo.png'
+    masthead_url = cover_img_url
+    remove_empty_feeds = True
+
+    # On kindle, images can make things kind of fat. Slim them down.
+    recursions = 0
+    compress_news_images = True
+    compress_news_images_max_size = 7
+    scale_news_images = (150, 200)  # (kindle touch: 600x800)
+    useHighResImages = False
+    oldest_article = 1.5
+    max_articles_per_section = 15
+
+    sections = [
+        (u'egypt', u'http://www.al-monitor.com/pulse/egypt-pulse'),
+        (u'gulf', u'http://www.al-monitor.com/pulse/gulf-pulse'),
+        (u'iran', u'http://www.al-monitor.com/pulse/iran-pulse'),
+        (u'iraq', u'http://www.al-monitor.com/pulse/iraq-pulse'),
+        (u'israel', u'http://www.al-monitor.com/pulse/israel-pulse'),
+        (u'lebanon', u'http://www.al-monitor.com/pulse/lebanon-pulse'),
+        (u'palestine', u'http://www.al-monitor.com/pulse/palestine-pulse'),
+        (u'syria', u'http://www.al-monitor.com/pulse/syria-pulse'),
+        (u'turkey', u'http://www.al-monitor.com/pulse/turkey-pulse'),
+    ]
+
+    # Util for creating remove_tags and keep_only_tags style regex matchers.
+    # Runs while the class body executes, so it takes no self argument.
+    def tag_matcher(elt, attr, rgx_str):
+        return dict(name=elt, attrs={attr: re.compile(rgx_str, re.IGNORECASE)})
+
+    remove_tags = [
+        dict(attrs={'id': [
+            'header',
+            'pulsebanner',
+            'relatedarticles',
+            'sidecolumn',
+            'disqus',
+            'footer',
+            'footer2',
+            'footer3',
+            'mobile-extras',
+        ]}),
+        tag_matcher('hr', 'id', 'spacer'),
+        tag_matcher('a', 'title', 'print this article'),
+        tag_matcher('div', 'class', 'extras'),
+        tag_matcher('div', 'class', '^clear$'),
+        tag_matcher('div', 'class', '^overlay$'),
+        tag_matcher('div', 'class', 'shareTag'),
+    ]
+
+    articles = {}
+    urls_done = []
+
+    def parse_index(self):
+        for section in self.sections:
+            self.parse_section(section[0], section[1])
+        # Walk self.sections rather than the articles dict so that the
+        # feed order is deterministic.
+        ans = []
+        for (name, url) in self.sections:
+            if name in self.articles:
+                ans.append((string.capwords(name), self.articles[name]))
+        return ans
+
+    def parse_section(self, section, url):
+        self.articles[section] = []
+
+        try:
+            self._p('process section ' + section + ', url: ' + url)
+            soup = self.index_to_soup(url)
+        except Exception:
+            self._p('Unable to spider section')
+            return []
+
+        self._p('Got section. Processing links.')
+
+        for link in soup.findAll('a'):
+            href = link.get('href')
+            text = self.text(link)
+            if text and href and ('pulse/originals' in href):
+                self.process_link(section, link)
+
+    def process_link(self, section, link):
+        title = self.text(link)
+        if len(title) > 120:
+            title = title[0:120] + '...'
+        href = link.get('href')
+        if not href:
+            self._p('BAD HREF: ' + str(link))
+            return
+        self.queue_article_link(section, href, title)
+
+    def queue_article_link(self, section, url, title):
+        full_url = self.abs_url(url)
+        if full_url in self.urls_done:
+            self._p('Skip (already Qd): ' + ' - '.join([section, title, url]))
+            return
+        if len(self.articles[section]) >= self.max_articles_per_section:
+            return
+
+        self._p('Q: ' + ' - '.join([section, title, url]))
+        self.urls_done.append(full_url)
+        self.articles[section].append(
+            dict(title=title,
+                 url=full_url,
+                 date='',
+                 description='',
+                 author='',
+                 content=''))
+
+    def preprocess_raw_html(self, raw_html, url):
+        reason_to_skip = self.should_skip_article(BeautifulSoup(raw_html))
+        if reason_to_skip:
+            self._p('Skipping article: ' + reason_to_skip + ', ' + url)
+            # Next line will show up as an error in the logs, but ignore, see
+            # http://www.mobileread.com/forums/showthread.php?p=2931136
+            return None
+        else:
+            return super(AlMonitor, self).preprocess_raw_html(raw_html, url)
+
+    def populate_article_metadata(self, article, soup, first):
+        summary_node = soup.find('div', {'id': 'summary'})
+        if summary_node:
+            summary = self.text(summary_node)
+            self._p('Summary: ' + summary)
+            article.text_summary = summary
+        else:
+            self._p('No summary')
+
+    def should_skip_article(self, soup):
+        date = self.scrape_article_date(soup)
+        if not date:
+            return False
+
+        # Use fractional days, so that oldest_article = 1.5 really means
+        # a day and a half.
+        age = (datetime.datetime.now() - date).total_seconds() / 86400.0
+        if age > self.oldest_article:
+            return 'too old'
+        return False
+
+    def scrape_article_date(self, soup):
+        for span in soup.findAll('span'):
+            txt = self.text(span)
+            rgx = re.compile(r'Posted ([a-zA-Z]+ \d\d?, \d\d\d\d).*')
+            hit = rgx.match(txt)
+            if hit:
+                return self.date_from_string(txt)
+
+        return None
+
+    def date_from_string(self, datestring):
+        try:
+            # eg: Posted September 17, 2014
+            dt = datetime.datetime.strptime(datestring, 'Posted %B %d, %Y')
+        except Exception:
+            dt = None
+
+        if dt:
+            self._p('From string "' + datestring + '", datetime: ' + str(dt))
+        else:
+            self._p('Could not get datetime from ' + datestring)
+
+        return dt
+
+    def abs_url(self, url):
+        if 'www.al-monitor.com' in url:
+            abs_url = url
+        elif url[0] == '/':
+            abs_url = 'http://www.al-monitor.com' + url
+        else:
+            self._p('Not sure how to make abs_url: ' + url)
+            raise ValueError('Cannot make an absolute url from: ' + url)
+
+        # Drop any #fragment from the url.
+        if '#' in abs_url:
+            abs_url = abs_url.split('#')[0]
+
+        return abs_url
+
+    def text(self, n):
+        return self.tag_to_string(n).strip()
+
+    def _dbg_soup_node(self, node):
+        s = ' cls: ' + str(node.get('class')).strip() + \
+            ' id: ' + str(node.get('id')).strip() + \
+            ' role: ' + str(node.get('role')).strip() + \
+            ' txt: ' + self.text(node)
+        return s
+
+    def _p(self, msg):
+        # Log msg, prefixed with the name of the calling function.
+        curframe = inspect.currentframe()
+        calframe = inspect.getouterframes(curframe, 2)
+        calname = calframe[1][3].upper()
+        print('[' + calname + '] ' + msg[0:100])
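A note on exercising the recipe (not part of the patch itself): the usual way to smoke-test a calibre recipe is ebook-convert's --test mode, which downloads only a couple of articles per feed. Assuming a calibre checkout with this file in place, something along these lines should work; the flags are the generic recipe-testing options, nothing specific to this recipe:

    ebook-convert recipes/al_monitor.recipe .epub --test -vv

Passing just the .epub extension as the output lets ebook-convert derive the output filename from the recipe file, and -vv makes the download stage verbose so the recipe's print()-based _p() messages are easy to spot in the console output.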