Mirror of https://github.com/kovidgoyal/calibre.git, synced 2025-06-23 15:30:45 -04:00
Hurriyet Daily News by spswerling

parent 2b23b0d342
commit 92cfab55a1
recipes/hurriyet_daily_news.recipe (new file, 260 lines)

@@ -0,0 +1,260 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

__license__ = 'GPL v3'
__copyright__ = '2014, spswerling'
'''
www.hurriyetdailynews.com
'''

import os, string, inspect, datetime, re

from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup

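# To try this recipe from the command line, something along these lines
# usually works (exact flags depend on the installed calibre version):
#   ebook-convert hurriyet_daily_news.recipe output.epub --test -vv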
class HurriyetDailyNews_en(BasicNewsRecipe):
    title = u'Hurriyet Daily News'
    __author__ = u'spswerling'
    description = 'English version of the Turkish daily "Hurriyet"'
    no_stylesheets = True
    encoding = 'utf-8'
    category = 'news'
    language = 'en_TR'
    publication_type = 'newspaper'
    cover_img_url = 'http://www.hurriyetdailynews.com/images/design/logo-hurriyet-daily-news.png'
    masthead_url = cover_img_url
    remove_empty_feeds = True

    # On Kindle, images can make things kind of fat. Slim them down.
    recursions = 0
    compress_news_images = True
    compress_news_images_max_size = 7
    scale_news_images = (150, 200)  # (Kindle Touch: 600x800)
    useHighResImages = False
    oldest_article = 1.5
    max_articles_per_section = 25
    max_articles_per_subsection = 7

    sections = [
        u'turkey',
        u'economy',
        u'world',
        u'sports',
        # u'life',
        u'opinion',
        # u'arts/culture'
    ]

    # Utility for creating remove_tags / keep_only_tags style regex matchers.
    def tag_matcher(elt, attr, pattern):
        return dict(name=elt, attrs={attr: re.compile(pattern, re.IGNORECASE)})

    keep_only_tags = [tag_matcher('div', 'class', 'NewsDetail')]

    remove_tags = [
        tag_matcher('div', 'class', 'Carousel'),
        tag_matcher('div', 'class', 'ShareIt'),
        tag_matcher('div', 'class', 'tmz'),
        tag_matcher('span', 'id', 'comment'),
        tag_matcher('h2', 'class', 'NewSpot'),
        tag_matcher('h2', 'class', 'pv-gallery'),
    ]

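    # Crawl state shared by the methods below: 'articles' maps a section name
    # to its queued article dicts, while 'subsection_links' and 'urls_done'
    # enforce the per-subsection cap and de-duplicate queued URLs.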
    articles = {}
    subsection_links = {}
    urls_done = []
    links_per_section = {}

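    # Entry point for calibre's download: find section links on the home page,
    # expand each into its subsections, queue article links, then return
    # (section title, article list) pairs.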
    def parse_index(self):
        section_links = self.section_links_from_home_page()
        for section_link in section_links:
            self.articles[self.section_name(section_link)] = []
            subsection_links = self.find_subsection_links(section_link)
            for subsection_link in subsection_links:
                sub_name = self.subsection_name(subsection_link)
                self.subsection_links[sub_name] = []
                self.parse_subsection(section_link, subsection_link)
        ans = []
        for k in self.articles:
            ans.append((string.capwords(k), self.articles[k]))
        return ans

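    # Section links on the home page carry the 'rmRootLink' CSS class; anything
    # whose link text is not listed in self.sections is filtered out.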
    def section_links_from_home_page(self):

        def include_link(link):
            return self.text(link).lower() in self.sections

        url = 'http://www.hurriyetdailynews.com/'
        try:
            self._p('hitting home page ' + url)
            soup = self.index_to_soup(url)
        except Exception:
            self._p('Unable to spider home page')
            return []

        self._p('Got home page. Hunt down section links.')

        regex = re.compile('rmRootLink', re.IGNORECASE)
        links = soup.findAll('a', {'class': regex})

        filtered_links = [link for link in links if include_link(link)]
        self._p(' all sections: ' + ', '.join(map(self.text, links)))
        self._p(' filtered sections: ' +
                ', '.join(map(self.text, filtered_links)))

        return filtered_links

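    # Subsection links are scraped from the section page's 'SeffafLink' div;
    # if that div is missing, the section page itself is used as the only
    # "subsection".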
    def find_subsection_links(self, section_link):
        self._p('find subsection links for section ' + str(section_link))
        url = self.abs_url(section_link['href'])
        try:
            self._p('hitting ' + url)
            soup = self.index_to_soup(url)
        except Exception:
            self._p('Unable to spider subsection')
            return []
        self._p('Got ' + url)

        div = soup.find('div', {'class': 'SeffafLink'})
        if not div:
            self._p('could not find any subsections')
            return [section_link]
        links = div.findAll('a')
        self._p(' subsection links: ' + ', '.join(map(self.text, links)))
        return links

    def parse_subsection(self, section_link, subsection_link):

        section = self.section_name(section_link)
        if len(self.articles[section]) > self.max_articles_per_section:
            return

        # tmp dbg
        # if not self.subsection_name(subsection_link) == 'arts':
        #     return

        self._p('hit section ' + section +
                ', subsect ' + self.subsection_name(subsection_link))
        url = self.abs_url(subsection_link['href'])
        try:
            self._p('hitting ' + url)
            soup = self.index_to_soup(url)
        except Exception:
            self._p('Unable to spider section')
            return

        self._p('Process links')
        for link in soup.findAll('a'):
            if 'NewsDetail' in str(link.get('id')):
                self.process_link(section_link, subsection_link, link)

    def process_link(self, section_link, subsection_link, link):
        section = self.section_name(section_link)
        subsection = self.subsection_name(subsection_link)
        title = link.get('title') or self.text(link)
        href = link.get('href')
        if not href:
            self._p('BAD HREF: ' + str(link))
            return
        self.queue_article_link(section, subsection, href, title)

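    # Queue a single article, skipping URLs that have already been queued and
    # enforcing the per-section and per-subsection caps.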
    def queue_article_link(self, section, subsection, url, title):
        full_url = self.abs_url(url)
        if full_url in self.urls_done:
            # self._p('Skip (already Qd): ' + ' - '.join([section, subsection, title, url]))
            return

        self.urls_done.append(full_url)
        if len(self.articles[section]) >= self.max_articles_per_section:
            return
        if len(self.subsection_links[subsection]) >= \
                self.max_articles_per_subsection:
            return
        self._p('Q: ' + ' - '.join([section, subsection, title, url]))
        full_title = string.capwords(subsection + ' - ' + title)
        self.subsection_links[subsection].append(url)
        self.articles[section].append(
            dict(title=full_title,
                 url=full_url,
                 date='',
                 description='',
                 author='',
                 content=''))

    def text(self, n):
        return self.tag_to_string(n).strip()

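    # Normalize relative links to absolute http://www.hurriyetdailynews.com
    # URLs and strip any '#fragment' so duplicate detection works on the
    # canonical form.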
    def abs_url(self, url):
        if 'www.hurriyetdailynews.com' in url:
            abs_url = url
        elif url.startswith('/'):
            abs_url = 'http://www.hurriyetdailynews.com' + url
        else:
            abs_url = 'http://www.hurriyetdailynews.com/' + url
        if '#' in abs_url:
            abs_url = abs_url.split('#')[0]

        return abs_url

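    # Section names come from the link text; subsection names come from the
    # last path component of the subsection URL, with the extension stripped.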
    def section_name(self, link):
        return self.text(link).lower()

    def subsection_name(self, link):
        from_fn = str(os.path.splitext(link['href'])[0]).split('/')[-1]
        return from_fn

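    # Articles older than oldest_article are dropped at download time:
    # returning None from preprocess_raw_html makes calibre log an error for
    # that article and skip it (see the mobileread link below).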
    def preprocess_raw_html(self, raw_html, url):
        reason_to_skip = self.should_skip_article(BeautifulSoup(raw_html))
        if reason_to_skip:
            self._p('Skipping article: ' + reason_to_skip + ', ' + url)
            # Next line will show up as an error in the logs, but ignore it, see
            # http://www.mobileread.com/forums/showthread.php?p=2931136
            return None
        else:
            return super(HurriyetDailyNews_en, self).preprocess_raw_html(raw_html, url)

    def should_skip_article(self, soup):
        date = self.scrape_article_date(soup)
        if not date:
            return False

        age = (datetime.datetime.now() - date).days
        if age > self.oldest_article:
            return 'too old'
        return False

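    # The site renders article dates in a couple of formats (for example
    # 'September/17/2014' or 'September 17/2014'); try each in turn and fall
    # back to None.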
    def date_from_string(self, datestring):
        try:
            # eg: September/17/2014
            dt = datetime.datetime.strptime(datestring, "%B/%d/%Y")
        except ValueError:
            try:
                # eg: September 17/2014
                dt = datetime.datetime.strptime(datestring, "%B %d/%Y")
            except ValueError:
                dt = None
        if dt:
            self._p('From string "' + datestring + '", datetime: ' + str(dt))
        else:
            self._p('Could not get datetime from ' + datestring)
        return dt

    def scrape_article_date(self, soup):
        dnode = soup.find('p', {'class': 'dateagency'}) or \
            soup.find('p', {'class': 'Tarih'})
        if dnode:
            dstring = self.text(dnode)
            return self.date_from_string(dstring)
        else:
            return None

    def _dbg_soup_node(self, node):
        s = ' cls: ' + str(node.get('class')).strip() + \
            ' id: ' + str(node.get('id')).strip() + \
            ' txt: ' + self.text(node)
        return s

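    # Tiny debug logger: prefix each message with the upper-cased name of the
    # calling method (looked up via inspect) and truncate long messages.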
    def _p(self, msg):
        curframe = inspect.currentframe()
        calframe = inspect.getouterframes(curframe, 2)
        calname = calframe[1][3].upper()
        print('[' + calname + '] ' + msg[0:120])