From 92cfab55a1925b4372bf8fecc4207e57715bde3e Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Thu, 25 Sep 2014 00:13:34 +0530
Subject: [PATCH] Hurriyet Daily News by spswerling

---
 recipes/hurriyet_daily_news.recipe | 256 +++++++++++++++++++++++++++++
 1 file changed, 256 insertions(+)
 create mode 100644 recipes/hurriyet_daily_news.recipe

diff --git a/recipes/hurriyet_daily_news.recipe b/recipes/hurriyet_daily_news.recipe
new file mode 100644
index 0000000000..1c9d1b32e1
--- /dev/null
+++ b/recipes/hurriyet_daily_news.recipe
@@ -0,0 +1,256 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+__license__ = 'GPL v3'
+__copyright__ = '2014, spswerling'
+'''
+www.hurriyetdailynews.com
+'''
+import os, string, inspect, datetime, re
+from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
+
+
+class HurriyetDailyNews_en(BasicNewsRecipe):
+    title = u'Hurriyet Daily News'
+    __author__ = u'spswerling'
+    description = u'English edition of the Turkish daily "Hurriyet"'
+    no_stylesheets = True
+    encoding = 'utf-8'
+    category = 'news'
+    language = 'en_TR'
+    publication_type = 'newspaper'
+    cover_img_url = 'http://www.hurriyetdailynews.com/images/design/logo-hurriyet-daily-news.png'
+    masthead_url = cover_img_url
+    remove_empty_feeds = True
+
+    # On kindle, images can make things kind of fat. Slim them down.
+    recursions = 0
+    compress_news_images = True
+    compress_news_images_max_size = 7
+    scale_news_images = (150, 200)  # (kindle touch: 600x800)
+    useHighResImages = False
+    oldest_article = 1.5
+    max_articles_per_section = 25
+    max_articles_per_subsection = 7
+
+    sections = [
+        u'turkey',
+        u'economy',
+        u'world',
+        u'sports',
+        # u'life',
+        u'opinion',
+        # u'arts/culture'
+    ]
+
+    # Util for creating remove_tags and keep_only_tags style regex
+    # matchers. Runs at class definition time, hence no self argument.
+    def tag_matcher(elt, attr, pattern):
+        return dict(name=elt, attrs={attr: re.compile(pattern, re.IGNORECASE)})
+
+    keep_only_tags = [tag_matcher('div', 'class', 'NewsDetail')]
+
+    remove_tags = [
+        tag_matcher('div', 'class', 'Carousel'),
+        tag_matcher('div', 'class', 'ShareIt'),
+        tag_matcher('div', 'class', 'tmz'),
+        tag_matcher('span', 'id', 'comment'),
+        tag_matcher('h2', 'class', 'NewSpot'),
+        tag_matcher('h2', 'class', 'pv-gallery'),
+    ]
+
+    articles = {}
+    subsection_links = {}
+    urls_done = []
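+
+    # calibre's parse_index contract: return a list of (section title,
+    # article list) tuples, each article a dict with title, url, date,
+    # description and content keys, e.g.:
+    #   [('Turkey', [{'title': t, 'url': u, 'date': '', ...}, ...]), ...]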
+    def parse_index(self):
+        section_links = self.section_links_from_home_page()
+        for section_link in section_links:
+            self.articles[self.section_name(section_link)] = []
+            subsection_links = self.find_subsection_links(section_link)
+            for subsection_link in subsection_links:
+                sub_name = self.subsection_name(subsection_link)
+                self.subsection_links[sub_name] = []
+                self.parse_subsection(section_link, subsection_link)
+        ans = []
+        for k in self.articles:
+            ans.append((string.capwords(k), self.articles[k]))
+        return ans
+
+    def section_links_from_home_page(self):
+
+        def include_link(link):
+            return self.text(link).lower() in self.sections
+
+        url = 'http://www.hurriyetdailynews.com/'
+        try:
+            self._p('hitting home page ' + url)
+            soup = self.index_to_soup(url)
+        except Exception:
+            self._p('Unable to spider home page')
+            return []
+
+        self._p('Got home page. Hunt down section links.')
+
+        regex = re.compile('rmRootLink', re.IGNORECASE)
+        links = soup.findAll('a', {'class': regex})
+
+        filtered_links = [link for link in links if include_link(link)]
+        self._p(' all sections: ' + ', '.join(map(self.text, links)))
+        self._p(' filtered sections: ' +
+                ', '.join(map(self.text, filtered_links)))
+
+        return filtered_links
+
+    def find_subsection_links(self, section_link):
+        self._p('find subsection links for section ' + str(section_link))
+        url = self.abs_url(section_link['href'])
+        try:
+            self._p('hitting ' + url)
+            soup = self.index_to_soup(url)
+        except Exception:
+            self._p('Unable to spider subsection')
+            return []
+        self._p('Got ' + url)
+
+        div = soup.find('div', {'class': 'SeffafLink'})
+        if not div:
+            self._p('could not find any subsections')
+            return [section_link]
+        links = div.findAll('a')
+        self._p(' subsection links: ' + ', '.join(map(self.text, links)))
+        return links
+
+    def parse_subsection(self, section_link, subsection_link):
+        section = self.section_name(section_link)
+        if len(self.articles[section]) > self.max_articles_per_section:
+            return
+
+        self._p('hit section ' + section +
+                ', subsect ' + self.subsection_name(subsection_link))
+        url = self.abs_url(subsection_link['href'])
+        try:
+            self._p('hitting ' + url)
+            soup = self.index_to_soup(url)
+        except Exception:
+            self._p('Unable to spider section')
+            return
+
+        self._p('Process links')
+        for link in soup.findAll('a'):
+            if 'NewsDetail' in str(link.get('id')):
+                self.process_link(section_link, subsection_link, link)
+
+    def process_link(self, section_link, subsection_link, link):
+        section = self.section_name(section_link)
+        subsection = self.subsection_name(subsection_link)
+        title = link.get('title') or self.text(link)
+        href = link.get('href')
+        if not href:
+            self._p('BAD HREF: ' + str(link))
+            return
+        self.queue_article_link(section, subsection, href, title)
+
+    def queue_article_link(self, section, subsection, url, title):
+        full_url = self.abs_url(url)
+        if full_url in self.urls_done:
+            return
+
+        self.urls_done.append(full_url)
+        if len(self.articles[section]) >= self.max_articles_per_section:
+            return
+        if len(self.subsection_links[subsection]) >= \
+                self.max_articles_per_subsection:
+            return
+        self._p('Q: ' + ' - '.join([section, subsection, title, url]))
+        full_title = string.capwords(subsection + ' - ' + title)
+        self.subsection_links[subsection].append(url)
+        self.articles[section].append(
+            dict(title=full_title,
+                 url=full_url,
+                 date='',
+                 description='',
+                 author='',
+                 content=''))
+
+    def text(self, n):
+        return self.tag_to_string(n).strip()
+
+    def abs_url(self, url):
+        if 'www.hurriyetdailynews.com' in url:
+            abs_url = url
+        elif url[0] == '/':
+            abs_url = 'http://www.hurriyetdailynews.com' + url
+        else:
+            abs_url = 'http://www.hurriyetdailynews.com/' + url
+        # Strip any fragment identifier.
+        return abs_url.split('#')[0]
+
+    def section_name(self, link):
+        return self.text(link).lower()
+
+    def subsection_name(self, link):
+        return str(os.path.splitext(link['href'])[0]).split('/')[-1]
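+
+    # The index pages expose no article dates, so age filtering happens at
+    # download time: returning None below makes calibre drop the article
+    # (logged as an error, but harmlessly; see the forum link in the code).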
+    def preprocess_raw_html(self, raw_html, url):
+        reason_to_skip = self.should_skip_article(BeautifulSoup(raw_html))
+        if reason_to_skip:
+            self._p('Skipping article: ' + reason_to_skip + ', ' + url)
+            # Next line will show up as an error in the logs, but ignore,
+            # see http://www.mobileread.com/forums/showthread.php?p=2931136
+            return None
+        else:
+            return super(HurriyetDailyNews_en, self).preprocess_raw_html(
+                raw_html, url)
+
+    def should_skip_article(self, soup):
+        date = self.scrape_article_date(soup)
+        if not date:
+            return False
+
+        age = (datetime.datetime.now() - date).days
+        if age > self.oldest_article:
+            return 'too old'
+        return False
+
+    def date_from_string(self, datestring):
+        try:
+            # eg: September/17/2014
+            dt = datetime.datetime.strptime(datestring, '%B/%d/%Y')
+        except ValueError:
+            try:
+                # eg: September 17/2014
+                dt = datetime.datetime.strptime(datestring, '%B %d/%Y')
+            except ValueError:
+                dt = None
+        if dt:
+            self._p('From string "' + datestring + '", datetime: ' + str(dt))
+        else:
+            self._p('Could not get datetime from ' + datestring)
+        return dt
+
+    def scrape_article_date(self, soup):
+        dnode = soup.find('p', {'class': 'dateagency'}) or \
+            soup.find('p', {'class': 'Tarih'})
+        if dnode:
+            return self.date_from_string(self.text(dnode))
+        return None
+
+    def _dbg_soup_node(self, node):
+        return ' cls: ' + str(node.get('class')).strip() + \
+               ' id: ' + str(node.get('id')).strip() + \
+               ' txt: ' + self.text(node)
+
+    def _p(self, msg):
+        # Log, prefixed with the name of the calling function.
+        curframe = inspect.currentframe()
+        calframe = inspect.getouterframes(curframe, 2)
+        calname = calframe[1][3].upper()
+        print('[' + calname + '] ' + msg[:120])
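
To try the recipe before merging, calibre's ebook-convert can run a recipe
file directly; a quick smoke test (assuming a local calibre install, with an
arbitrary output filename) is:

    ebook-convert recipes/hurriyet_daily_news.recipe output.epub --test -vv

The --test flag keeps the download small, and -vv surfaces the recipe's _p()
logging, which makes it easy to check that the section and subsection
spidering behaves as expected.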