#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import print_function, unicode_literals
__license__ = 'GPL v3'
__copyright__ = '2014, spswerling'
'''
http://www.al-monitor.com/
'''
import string
import inspect
import datetime
import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup


class AlMonitor(BasicNewsRecipe):
    title = u'Al Monitor'
    __author__ = u'spswerling'
    description = 'The Pulse of the Middle East'
    no_stylesheets = True
    encoding = 'utf-8'
    category = 'news'
    language = 'en'
    publication_type = 'newspaper'
    cover_img_url = 'http://www.al-monitor.com/modules/almcontent/a-img/elements/logo.png'
    masthead_url = cover_img_url
    remove_empty_feeds = True

    # On Kindle, images can make things kind of fat. Slim them down.
    recursions = 0
    compress_news_images = True
    compress_news_images_max_size = 7
    scale_news_images = (150, 200)  # (kindle touch: 600x800)
    useHighResImages = False
    oldest_article = 1.5
    max_articles_per_section = 15

    sections = [
        (u'egypt', u'http://www.al-monitor.com/pulse/egypt-pulse'),
        (u'gulf', u'http://www.al-monitor.com/pulse/gulf-pulse'),
        (u'iran', u'http://www.al-monitor.com/pulse/iran-pulse'),
        (u'iraq', u'http://www.al-monitor.com/pulse/iraq-pulse'),
        (u'israel', u'http://www.al-monitor.com/pulse/israel-pulse'),
        (u'lebanon', u'http://www.al-monitor.com/pulse/lebanon-pulse'),
        (u'palestine', u'http://www.al-monitor.com/pulse/palestine-pulse'),
        (u'syria', u'http://www.al-monitor.com/pulse/syria-pulse'),
        (u'turkey', u'http://www.al-monitor.com/pulse/turkey-pulse'),
    ]

    # util for creating remove_tags and keep_tags style regex matchers
    def tag_matcher(elt, attr, rgx_str):
        return dict(name=elt, attrs={attr: re.compile(rgx_str, re.IGNORECASE)})

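    # Elements stripped from every article page: fixed-id page chrome plus
    # regex-matched spacer/share/overlay blocks built with tag_matcher above.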
    remove_tags = [
        dict(attrs={'id': [
            'header',
            'pulsebanner',
            'relatedarticles',
            'sidecolumn',
            'disqus',
            'footer',
            'footer2',
            'footer3',
            'mobile-extras',
        ]}),
        tag_matcher('hr', 'id', 'spacer'),
        tag_matcher('a', 'title', 'print this article'),
        tag_matcher('div', 'class', 'extras'),
        tag_matcher('div', 'class', '^clear$'),
        tag_matcher('div', 'class', '^overlay$'),
        tag_matcher('div', 'class', 'shareTag'),
    ]

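    # Crawl state shared across methods: articles maps section name -> list of
    # article dicts; urls_done records every URL already queued so duplicates are skipped.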
    articles = {}
    urls_done = []

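    # Called by calibre to build the index: crawl each section listing page,
    # then return (section title, article list) pairs.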
    def parse_index(self):
        for section in self.sections:
            self.parse_section(section[0], section[1])
        ans = []
        for k in self.articles:
            ans.append((string.capwords(k), self.articles[k]))
        return ans

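    # Fetch one section's listing page and queue every link that points at an
    # original article ('pulse/originals' in the href).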
    def parse_section(self, section, url):
        self.articles[section] = []

        try:
            self._p('process section ' + section + ', url: ' + url)
            soup = self.index_to_soup(url)
        except Exception:
            self._p('Unable to spider section')
            return []

        self._p('Got section. Processing links.')

        for link in soup.findAll('a', href=True):
            href = link.get('href')
            text = self.text(link)
            if text and ('pulse/originals' in href):
                self.process_link(section, link)

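    # Turn a listing link into an article title (truncated to 120 characters)
    # and hand it off to the queue.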
    def process_link(self, section, link):
        title = self.text(link)
        if len(title) > 120:
            title = title[0:120] + '...'
        href = link.get('href')
        if not href:
            self._p("BAD HREF: " + str(link))
            return
        self.queue_article_link(section, href, title)

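    # Skip URLs already queued, then append the article dict calibre expects,
    # honouring max_articles_per_section.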
    def queue_article_link(self, section, url, title):
        full_url = self.abs_url(url)
        if full_url in self.urls_done:
            self._p('Skip (already Qd): ' + ' - '.join([section, title, url]))
            return

        self._p('Q: ' + ' - '.join([section, title, url]))
        self.urls_done.append(full_url)
        if len(self.articles[section]) >= self.max_articles_per_section:
            return
        self.articles[section].append(
            dict(title=title,
                 url=full_url,
                 date='',
                 description='',
                 author='',
                 content=''))

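    # Returning None here tells calibre to drop the article before conversion;
    # used to discard articles flagged by should_skip_article (currently: too old).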
    def preprocess_raw_html(self, raw_html, url):
        reason_to_skip = self.should_skip_article(BeautifulSoup(raw_html))
        if reason_to_skip:
            self._p('Skipping article: ' + reason_to_skip + ', ' + url)
            # Next line will show up as an error in the logs, but ignore, see
            # http://www.mobileread.com/forums/showthread.php?p=2931136
            return None
        else:
            return super(AlMonitor, self).preprocess_raw_html(raw_html, url)

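    # Pull the article summary from the page's 'summary' div, when present.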
    def populate_article_metadata(self, article, soup, first):
        summary_node = soup.find('div', {'id': 'summary'})
        if summary_node:
            summary = self.text(summary_node)
            self._p('Summary: ' + summary)
            article.text_summary = summary
        else:
            self._p('No summary')

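    # Return a reason string when the article should be dropped (currently only
    # 'too old'), or False to keep it.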
    def should_skip_article(self, soup):
        date = self.scrape_article_date(soup)
        if not date:
            return False

        age = (datetime.datetime.now() - date).days
        if age > self.oldest_article:
            return "too old"
        return False

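    # Find a 'Posted <Month> <day>, <year>' span in the article and convert it
    # to a datetime; None if no such span exists.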
    def scrape_article_date(self, soup):
        for span in soup.findAll('span'):
            txt = self.text(span)
            rgx = re.compile(r'Posted ([a-zA-Z]+ \d\d?, \d\d\d\d).*')
            hit = rgx.match(txt)
            if hit:
                return self.date_from_string(txt)

        return None

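    # Parse the 'Posted September 17, 2014' style string scraped above;
    # returns None when parsing fails.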
    def date_from_string(self, datestring):
        try:
            # eg: Posted September 17, 2014
            dt = datetime.datetime.strptime(datestring, "Posted %B %d, %Y")
        except Exception:
            dt = None

        if dt:
            self._p('From string "' + datestring + '", datetime: ' + str(dt))
        else:
            self._p('Could not get datetime from ' + datestring)

        return dt

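    # Turn a site-relative href into an absolute URL, stripping any '#fragment'.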
    def abs_url(self, url):
        if 'www.al-monitor.com' in url:
            abs_url = url
        elif url[0] == '/':
            abs_url = 'http://www.al-monitor.com' + url
        else:
            self._p('Not sure how to make abs_url: ' + url)
            raise ValueError('Cannot build absolute URL from ' + url)

        if '#' in abs_url:
            abs_url = abs_url.split('#')[0]  # drop the fragment

        return abs_url

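    # Convenience wrapper: flatten a tag to stripped text.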
    def text(self, n):
        return self.tag_to_string(n).strip()

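    # Debug helper: summarise a node's class, id and role attributes plus its text.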
    def _dbg_soup_node(self, node):
        s = ' cls: ' + str(node.get('class')).strip() + \
            ' id: ' + str(node.get('id')).strip() + \
            ' role: ' + str(node.get('role')).strip() + \
            ' txt: ' + self.text(node)
        return s

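    # Minimal logger: prefix each message with the calling method's name and
    # truncate it to 100 characters.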
    def _p(self, msg):
        curframe = inspect.currentframe()
        calframe = inspect.getouterframes(curframe, 2)
        calname = calframe[1][3].upper()
        print('[' + calname + '] ' + msg[0:100])