From f0d694a58d0b19e59ed1a212521632275bebf3c3 Mon Sep 17 00:00:00 2001
From: Kovid Goyal
Date: Sun, 9 Jun 2019 13:25:36 +0530
Subject: [PATCH] Remove non-working recipe

---
 recipes/hurriyet_daily_news.recipe | 266 -----------------------------
 1 file changed, 266 deletions(-)
 delete mode 100644 recipes/hurriyet_daily_news.recipe

diff --git a/recipes/hurriyet_daily_news.recipe b/recipes/hurriyet_daily_news.recipe
deleted file mode 100644
index c7ea9582e4..0000000000
--- a/recipes/hurriyet_daily_news.recipe
+++ /dev/null
@@ -1,266 +0,0 @@
-#!/usr/bin/env python2
-# -*- coding: utf-8 -*-
-from __future__ import print_function
-__license__ = 'GPL v3'
-__copyright__ = '2014, spswerling'
-'''
-www.hurriyetdailynews.com
-'''
-import os
-import string
-import inspect
-import datetime
-import re
-from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
-
-
-class HurriyetDailyNews_en(BasicNewsRecipe):
-    title = u'Hurriyet Daily News'
-    __author__ = u'spswerling'
-    description = 'a Turkey based daily in english'
-    description = 'English version of Turkish Daily "Hurriyet"'
-    no_stylesheets = True
-    encoding = 'utf-8'
-    category = 'news'
-    language = 'en_TR'
-    publication_type = 'newspaper'
-    cover_img_url = 'http://www.hurriyetdailynews.com/images/design/logo-hurriyet-daily-news.png'
-    masthead_url = cover_img_url
-    remove_empty_feeds = True
-
-    # on kindle, images can make things kind of fat. Slim them down.
-    recursions = 0
-    oldest_article = 1
-    compress_news_images = True
-    compress_news_images_max_size = 7
-    scale_news_images = (150, 200)  # (kindle touch: 600x800)
-    useHighResImages = False
-    oldest_article = 1.5
-    max_articles_per_section = 25
-    max_articles_per_subsection = 7
-
-    sections = [
-        u'turkey',
-        u'economy',
-        u'world',
-        u'sports',
-        # u'life',
-        u'opinion',
-        # u'arts/culture'
-    ]
-
-    # util for creating remove_tags and keep_tags style regex matchers
-    def tag_matcher(elt, attr, str):
-        return dict(name=elt, attrs={attr: re.compile(str, re.IGNORECASE)})
-
-    keep_only_tags = [tag_matcher('div', 'class', 'NewsDetail')]
-
-    remove_tags = [
-        tag_matcher('div', 'class', 'Carousel'),
-        tag_matcher('div', 'class', 'ShareIt'),
-        tag_matcher('div', 'class', 'tmz'),
-        tag_matcher('span', 'id', 'comment'),
-        tag_matcher('h2', 'class', 'NewSpot'),
-        tag_matcher('h2', 'class', 'pv-gallery'),
-    ]
-
-    articles = {}
-    subsection_links = {}
-    urls_done = []
-    links_per_section = {}
-
-    def parse_index(self):
-        section_links = self.section_links_from_home_page()
-        for section_link in section_links:
-            self.articles[self.section_name(section_link)] = []
-            subsection_links = self.find_subsection_links(section_link)
-            for subsection_link in subsection_links:
-                sub_name = self.subsection_name(subsection_link)
-                self.subsection_links[sub_name] = []
-                self.parse_subsection(section_link, subsection_link)
-        ans = []
-        for k in self.articles:
-            ans.append((string.capwords(k), self.articles[k]))
-        return ans
-
-    def section_links_from_home_page(self):
-
-        def include_link(link):
-            return self.text(link).lower() in self.sections
-
-        url = 'http://www.hurriyetdailynews.com/'
-        try:
-            self._p('hitting home page ' + url)
-            soup = self.index_to_soup(url)
-        except:
-            self._p('Unable to spider home page')
-            return []
-
-        self._p('Got home page. hunt down section links.')
-
-        regex = re.compile('rmRootLink', re.IGNORECASE)
-        links = soup.findAll('a', {'class': regex})
-
-        filtered_links = list(filter(include_link, links))
-        self._p(' all sections: ' + ', '.join(map(self.text, links)))
-        self._p(' filtered sections: ' +
-                ', '.join(map(self.text, filtered_links)))
-
-        return filtered_links
-
-    def find_subsection_links(self, section_link):
-        self._p('find subsection links for section ' + str(section_link))
-        url = self.abs_url(section_link['href'])
-        try:
-            self._p('hitting ' + url)
-            soup = self.index_to_soup(url)
-        except:
-            self._p('Unable to spider subsection')
-            return []
-        self._p('Got ' + url)
-
-        div = soup.find('div', {'class': 'SeffafLink'})
-        if not div:
-            self._p('could not find any subsections')
-            return [section_link]
-        links = div.findAll('a')
-        self._p(' subsection links: ' + ', '.join(map(self.text, links)))
-        return links
-
-    def parse_subsection(self, section_link, subsection_link):
-
-        section = self.section_name(section_link)
-        if len(self.articles[section]) > self.max_articles_per_section:
-            return
-
-        # tmp dbg
-        # if not self.subsection_name(subsection_link) == 'arts':
-        #     return
-
-        self._p('hit section ' + section +
-                ', subsect ' + self.subsection_name(subsection_link))
-        url = self.abs_url(subsection_link['href'])
-        try:
-            self._p('hitting ' + url)
-            soup = self.index_to_soup(url)
-        except:
-            self._p('Unable to spider section')
-            return []
-
-        self._p('Process links ')
-        for link in soup.findAll('a'):
-            if 'NewsDetail' in str(link.get('id')):
-                self.process_link(section_link, subsection_link, link)
-
-    def process_link(self, section_link, subsection_link, link):
-        section = self.section_name(section_link)
-        subsection = self.subsection_name(subsection_link)
-        title = link['title'] or self.text(link)
-        href = link.get('href')
-        if not href:
-            self._p("BAD HREF: " + str(link))
-            return
-        self.queue_article_link(section, subsection, href, title)
-
-    def queue_article_link(self, section, subsection, url, title):
-        full_url = self.abs_url(url)
-        if full_url in self.urls_done:
-            # self._p('Skip (already Qd): ' + ' - '.join([section, subsection, title, url]))
-            return
-
-        self.urls_done.append(full_url)
-        if len(self.articles[section]) >= self.max_articles_per_section:
-            return
-        if len(self.subsection_links[subsection]) >= \
-                self.max_articles_per_subsection:
-            return
-        self._p('Q: ' + ' - '.join([section, subsection, title, url]))
-        full_title = string.capwords(subsection + ' - ' + title)
-        self.subsection_links[subsection].append(url)
-        self.articles[section].append(
-            dict(title=full_title,
-                 url=full_url,
-                 date='',
-                 description='',
-                 author='',
-                 content=''))
-
-    def text(self, n):
-        return self.tag_to_string(n).strip()
-
-    def abs_url(self, url):
-        if 'www.hurriyetdailynews.com' in url:
-            abs_url = url
-        elif url[0] == '/':
-            abs_url = 'http://www.hurriyetdailynews.com' + url
-        else:
-            abs_url = 'http://www.hurriyetdailynews.com/' + url
-        if '#' in abs_url:
-            abs_url = ''.join(abs_url.split('#')[0:-1])
-
-        return abs_url
-
-    def section_name(self, link):
-        return self.text(link).lower()
-
-    def subsection_name(self, link):
-        from_fn = str(os.path.splitext(link['href'])[0]).split('/')[-1]
-        return from_fn
-
-    def preprocess_raw_html(self, raw_html, url):
-        reason_to_skip = self.should_skip_article(BeautifulSoup(raw_html))
-        if reason_to_skip:
-            self._p('Skipping article: ' + reason_to_skip + ', ' + url)
-            # Next line will show up as an error in the logs, but ignore, see
-            # http://www.mobileread.com/forums/showthread.php?p=2931136
-            return None
-        else:
-            return super(self.__class__, self).preprocess_raw_html(raw_html, url)
-
-    def should_skip_article(self, soup):
-        date = self.scrape_article_date(soup)
-        if not date:
-            return False
-
-        age = (datetime.datetime.now() - date).days
-        if (age > self.oldest_article):
-            return "too old"
-        return False
-
-    def date_from_string(self, datestring):
-        try:
-            # eg: September/17/2014
-            dt = datetime.datetime.strptime(datestring, "%B/%d/%Y")
-        except:
-            try:
-                # eg: September 17/2014
-                dt = datetime.datetime.strptime(datestring, "%B %d/%Y")
-            except:
-                dt = None
-        if dt:
-            self._p('From string "' + datestring + '", datetime: ' + str(dt))
-        else:
-            self._p('Could not get datetime from ' + datestring)
-        return dt
-
-    def scrape_article_date(self, soup):
-        dnode = soup.find('p', {'class': 'dateagency'}) or \
-            soup.find('p', {'class': 'Tarih'})
-        if dnode:
-            dstring = self.text(dnode)
-            return self.date_from_string(dstring)
-        else:
-            return None
-
-    def _dbg_soup_node(self, node):
-        s = ' cls: ' + str(node.get('class')).strip() + \
-            ' id: ' + str(node.get('id')).strip() + \
-            ' txt: ' + self.text(node)
-        return s
-
-    def _p(self, msg):
-        curframe = inspect.currentframe()
-        calframe = inspect.getouterframes(curframe, 2)
-        calname = calframe[1][3].upper()
-        print('[' + calname + '] ' + msg[0:120])
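
Note: should anyone want to resurrect this, a minimal feed-based sketch of a
replacement recipe follows. It is an illustration only: the RSS URL below is
an assumption (the removed recipe scraped section pages directly, and the
site's current feed layout has not been verified). The remaining attributes
are standard BasicNewsRecipe options, several carried over from the recipe
removed above.

    #!/usr/bin/env python
    # Hypothetical feed-driven replacement sketch; confirm the feed URL
    # actually exists before relying on it.
    from calibre.web.feeds.news import BasicNewsRecipe


    class HurriyetDailyNews(BasicNewsRecipe):
        title = 'Hurriyet Daily News'
        description = 'English version of the Turkish daily "Hurriyet"'
        language = 'en_TR'
        oldest_article = 1.5           # days, as in the removed recipe
        max_articles_per_feed = 25
        no_stylesheets = True
        compress_news_images = True
        auto_cleanup = True            # let calibre extract the article body
        # Assumed feed URL -- not verified against the live site.
        feeds = [('Turkey', 'https://www.hurriyetdailynews.com/rss/turkey')]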