Hurriyet Daily News by spswerling

2025-12-22 12:57:21 -05:00 · 2014-09-25 00:13:34 +05:30 · 2014-09-25 00:13:34 +05:30 · 92cfab55a1
commit 92cfab55a1
parent 2b23b0d342
1 changed files with 260 additions and 0 deletions
--- a/recipes/hurriyet_daily_news.recipe
+++ b/recipes/hurriyet_daily_news.recipe
@@ -0,0 +1,260 @@
+#!/usr/bin/env  python
+# -*- coding: utf-8 -*-
+__license__   = 'GPL v3'
+__copyright__ = '2014, spswerling'
+'''
+www.hurriyetdailynews.com
+'''
+import os, string, inspect, datetime, re
+from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
+
+class HurriyetDailyNews_en(BasicNewsRecipe):
+    title          = u'Hurriyet Daily News'
+    __author__            = u'spswerling'
+    description            = 'a Turkey based daily in english'
+    description = 'English version of Turkish Daily "Hurriyet"'
+    no_stylesheets         = True
+    encoding               = 'utf-8'
+    category               = 'news'
+    language               = 'en_TR'
+    publication_type = 'newspaper'
+    cover_img_url = 'http://www.hurriyetdailynews.com/images/design/logo-hurriyet-daily-news.png'
+    masthead_url = cover_img_url
+    remove_empty_feeds = True
+
+    # on kindle, images can make things kind of fat. Slim them down.
+    recursions = 0
+    oldest_article = 1
+    compress_news_images = True
+    compress_news_images_max_size = 7
+    scale_news_images = (150,200)  # (kindle touch: 600x800)
+    useHighResImages = False
+    oldest_article = 1.5
+    max_articles_per_section = 25
+    max_articles_per_subsection = 7
+
+    sections = [
+                 u'turkey',
+                 u'economy',
+                 u'world',
+                 u'sports',
+                 # u'life',
+                 u'opinion',
+                 # u'arts/culture'
+               ]
+
+    # util for creating remove_tags and keep_tags style regex matchers
+    def tag_matcher(elt, attr, str):
+        return dict(name=elt, attrs={attr:re.compile(str, re.IGNORECASE)})
+
+    # Only the article container is kept: a <div> whose class matches
+    # 'NewsDetail' (case-insensitive regex, see tag_matcher).
+    keep_only_tags = [tag_matcher('div', 'class', 'NewsDetail')]
+
+    # Elements stripped from the kept content; each entry is a
+    # case-insensitive regex match on the given attribute.
+    remove_tags = [
+        tag_matcher('div', 'class', 'Carousel'),
+        tag_matcher('div', 'class', 'ShareIt'),
+        tag_matcher('div', 'class', 'tmz'),
+        tag_matcher('span', 'id', 'comment'),
+        tag_matcher('h2', 'class', 'NewSpot'),
+        tag_matcher('h2', 'class', 'pv-gallery'),
+        ]
+
+    # Crawl state filled in by parse_index() and its helpers.
+    # NOTE(review): these are class-level mutable objects, so they are shared
+    # by every instance of the recipe -- confirm only one instance is ever
+    # used per run.
+    # section name -> list of article dicts queued for that section
+    articles = {}
+    # subsection name -> list of article urls queued for that subsection
+    subsection_links = {}
+    # absolute urls already seen, used to skip duplicates
+    urls_done = []
+    # not referenced anywhere else in the visible recipe code
+    links_per_section = {}
+
+    def parse_index(self):
+        section_links = self.section_links_from_home_page()
+        for section_link in section_links:
+            self.articles[self.section_name(section_link)] = []
+            subsection_links = self.find_subsection_links(section_link)
+            for subsection_link in subsection_links:
+                sub_name = self.subsection_name(subsection_link)
+                self.subsection_links[sub_name] = []
+                self.parse_subsection(section_link, subsection_link)
+        ans = []
+        for k in self.articles:
+            ans.append((string.capwords(k), self.articles[k]))
+        return ans
+
+    def section_links_from_home_page(self):
+
+        def include_link(link):
+            return self.text(link).lower() in self.sections
+
+        url = 'http://www.hurriyetdailynews.com/'
+        try:
+            self._p('hitting home page ' + url)
+            soup = self.index_to_soup(url)
+        except:
+            self._p('Unable to spider home page')
+            return []
+
+        self._p('Got home page. hunt down section links.')
+
+        regex = re.compile('rmRootLink', re.IGNORECASE)
+        links = soup.findAll('a', {'class':regex})
+
+        filtered_links = filter(include_link, links)
+        self._p(' all sections: ' + ', '.join(map(self.text, links)))
+        self._p(' filtered sections: ' +
+                ', '.join(map(self.text, filtered_links)))
+
+        return filtered_links
+
+    def find_subsection_links(self, section_link):
+        self._p('find subsection links for section  ' + str(section_link))
+        url = self.abs_url(section_link['href'])
+        try:
+            self._p('hitting ' + url)
+            soup = self.index_to_soup(url)
+        except:
+            self._p('Unable to spider subsection')
+            return []
+        self._p('Got ' + url)
+
+        div = soup.find('div', {'class':'SeffafLink'})
+        if not div:
+            self._p('could not find any subsections')
+            return [section_link]
+        links = div.findAll('a')
+        self._p(' subsection links: ' + ', '.join(map(self.text, links)))
+        return links
+
+    def parse_subsection(self, section_link, subsection_link):
+
+        section = self.section_name(section_link)
+        if len(self.articles[section]) > self.max_articles_per_section:
+            return
+
+        # tmp dbg
+        # if not self.subsection_name(subsection_link) == 'arts':
+        #    return
+
+        self._p('hit section  ' + section +
+           ', subsect ' +  self.subsection_name(subsection_link))
+        url = self.abs_url(subsection_link['href'])
+        try:
+            self._p('hitting ' + url)
+            soup = self.index_to_soup(url)
+        except:
+            self._p('Unable to spider section')
+            return []
+
+        self._p('Process  links ')
+        for link in soup.findAll('a'):
+            if 'NewsDetail' in str(link.get('id')):
+                self.process_link(section_link, subsection_link, link)
+
+    def process_link(self, section_link, subsection_link, link):
+        section = self.section_name(section_link)
+        subsection = self.subsection_name(subsection_link)
+        title = link['title'] or self.text(link)
+        href = link.get('href')
+        if not href:
+            self._p("BAD HREF: " + str(link))
+            return
+        self.queue_article_link(section, subsection, href, title)
+
+    def queue_article_link(self, section, subsection, url, title):
+        full_url = self.abs_url(url)
+        if full_url in self.urls_done:
+            # self._p('Skip (already Qd): ' + ' - '.join([section, subsection, title, url]))
+            return
+
+        self.urls_done.append(full_url)
+        if len(self.articles[section]) >= self.max_articles_per_section:
+            return
+        if len(self.subsection_links[subsection]) >= \
+            self.max_articles_per_subsection:
+            return
+        self._p('Q: ' + ' - '.join([section, subsection, title, url]))
+        full_title = string.capwords(subsection + ' - ' + title)
+        self.subsection_links[subsection].append(url)
+        self.articles[section].append(
+                        dict(title=full_title,
+                            url=full_url,
+                            date='',
+                            description='',
+                            author='',
+                            content=''))
+
+    def text(self,n):
+        return self.tag_to_string(n).strip()
+
+    def abs_url(self, url):
+        if 'www.hurriyetdailynews.com' in url:
+            abs_url = url
+        elif url[0] == '/':
+            abs_url = 'http://www.hurriyetdailynews.com' + url
+        else:
+            abs_url = 'http://www.hurriyetdailynews.com/' + url
+        if '#' in abs_url:
+            abs_url = ''.join(abs_url.split('#')[0:-1])
+
+        return abs_url
+
+    def section_name(self,link):
+        return self.text(link).lower()
+
+    def subsection_name(self,link):
+        from_fn = str(os.path.splitext(link['href'])[0]).split('/')[-1]
+        return from_fn
+
+    def preprocess_raw_html(self, raw_html, url):
+        reason_to_skip = self.should_skip_article(BeautifulSoup(raw_html))
+        if reason_to_skip:
+            self._p('Skipping article: ' + reason_to_skip + ', ' + url)
+            # Next line will show up as an error in the logs, but ignore, see
+            #   http://www.mobileread.com/forums/showthread.php?p=2931136
+            return None
+        else:
+            return super(self.__class__, self).preprocess_raw_html(raw_html, url)
+
+    def should_skip_article(self, soup):
+        date = self.scrape_article_date(soup)
+        if not date:
+            return False
+
+        age = (datetime.datetime.now() - date).days
+        if (age > self.oldest_article):
+            return "too old"
+        return False
+
+    def date_from_string(self, datestring):
+        try:
+            # eg: September/17/2014
+            dt = datetime.datetime.strptime(datestring,"%B/%d/%Y")
+        except:
+            try:
+                # eg: September 17/2014
+                dt = datetime.datetime.strptime(datestring,"%B %d/%Y")
+            except:
+                dt = None
+        if dt:
+            self._p('From string "' + datestring + '", datetime: ' + str(dt))
+        else:
+            self._p('Could not get datetime from ' + datestring)
+        return dt
+
+    def scrape_article_date(self, soup):
+        dnode =  soup.find('p', {'class':'dateagency'}) or \
+                 soup.find('p', {'class':'Tarih'})
+        if dnode:
+            dstring = self.text(dnode)
+            return self.date_from_string(dstring)
+        else:
+            return None
+
+    def _dbg_soup_node(self, node):
+        s = '   cls: ' + str(node.get('class')).strip() + \
+              '  id: ' + str(node.get('id')).strip() + \
+              ' txt: ' + self.text(node)
+        return s
+
+    def _p(self, msg):
+        curframe = inspect.currentframe()
+        calframe = inspect.getouterframes(curframe, 2)
+        calname = calframe[1][3].upper()
+        print('[' + calname + '] ' + msg[0:120])