#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import print_function

__license__ = 'GPL v3'
__copyright__ = '2014, spswerling'
'''
www.todayszaman.com
'''

import re

from calibre.web.feeds.recipes import BasicNewsRecipe

# urljoin moved to urllib.parse in python 3; fall back for python 2
try:
    from urllib.parse import urljoin
except ImportError:
    from urlparse import urljoin


class TodaysZaman(BasicNewsRecipe):

    title = u'Todays Zaman'
    __author__ = u'spswerling'
    description = 'English version of the Turkish daily "Zaman"'
    max_articles_per_feed = 100
    encoding = 'utf-8'
    category = 'news'
    language = 'en_TR'
    publication_type = 'newspaper'
    cover_img_url = 'http://medya.todayszaman.com/todayszaman/images/logo/todays_yenilogo.bmp'  # yep, bmp
    masthead_url = cover_img_url
    remove_empty_feeds = True

    # on kindle, images can make things kind of fat. Slim them down.
    recursions = 0
    oldest_article = 1.5
    compress_news_images = True
    compress_news_images_max_size = 7
    scale_news_images = (150, 200)  # (kindle touch: 600x800)
    useHighResImages = False

    # (feed title, path under www.todayszaman.com) pairs; see parse_index()
    sections = [
        (u'Columnists', u'columnists'),
        (u'Opinion', u'op-ed'),
        (u'World', u'world'),
        (u'National', u'national'),
        (u'Diplomacy', u'diplomacy'),
        (u'Business', u'business'),
    ]

    # util for creating keep_only_tags and remove_tags style regex matchers
    def tag_matcher(elt, attr, pattern):
        return dict(name=elt, attrs={attr: re.compile(pattern, re.IGNORECASE)})

    keep_only_tags = [
        tag_matcher('div', 'class', '^pageNewsDetailContainer$'),
        tag_matcher('div', 'class', '^pageColumnistDetailContainer$'),
    ]
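
    # For illustration, tag_matcher('div', 'class', '^logo$') evaluates to
    #   {'name': 'div', 'attrs': {'class': re.compile('^logo$', re.IGNORECASE)}},
    # the dict form BasicNewsRecipe accepts in keep_only_tags / remove_tags.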

    remove_tags = [
        tag_matcher('div', 'class', 'DetailKeyword'),
        tag_matcher('div', 'class', 'MainContentSocial'),
        tag_matcher('div', 'class', 'SocialNetwork'),
        tag_matcher('div', 'class', 'DetailLeftOther'),
        tag_matcher('div', 'class', 'RelatedNews'),
        tag_matcher('div', 'class', '^topMenuWrapper$'),
        tag_matcher('div', 'class', '^logo$'),
        tag_matcher('a', 'class', 'cf_email'),
    ]

    # section title -> list of article dicts, filled in by parse_section()
    articles = {}

    def parse_index(self):
        for (sect_title, sect_uri) in self.sections:
            self.parse_section(sect_title, sect_uri)

        ans = []
        for k in self.articles:
            ans.append((k, self.articles[k]))
        return ans
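
    # parse_index() returns the (feed title, article list) pairs that
    # BasicNewsRecipe expects; sections that stayed empty are dropped later
    # because remove_empty_feeds is set above.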

    def parse_section(self, sect_title, sect_uri):
        url = 'http://www.todayszaman.com/' + sect_uri
        print('Start section ' + sect_title + ', ' + url)
        try:
            soup = self.index_to_soup(url)
        except Exception:
            return

        # Find each article link in the section's listing page
        for div in soup.findAll('div'):
            div_class = div.get('class')
            if div_class:
                if div_class in ['pageColumnistsMainContent',
                                 'pageCategoryContainer']:
                    for link in div.findAll('a', href=True):
                        self.process_link(sect_title, div_class, link)

        print('Finished section: ' + sect_title)
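
    # The two listing layouts are handled differently below: columnist pages
    # put the headline in an <h2> inside the link, while category pages use a
    # 'pageCategoryTopTitle' node, sibling <h3>/<h4> nodes, or the link text.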

    def process_link(self, section_title, layout, link):
        def p(s):
            print('[PROCESS LINK] ' + s[0:80])

        href = link['href']
        full_href = urljoin('http://www.todayszaman.com/', href)
        next_sib = link.nextSibling
        child_h2 = link.find('h2')
        link_text = self.tag_to_string(link).strip()
        title_node = None

        if layout in ['pageColumnistsMainContent']:
            if child_h2:
                title_node = child_h2
            else:
                return
        elif layout in ['pageCategoryContainer']:
            top_title = link.find(attrs={'class': 'pageCategoryTopTitle'})
            if top_title:
                title_node = top_title
            elif (not link_text) and (next_sib and next_sib.find('h4')):
                title_node = next_sib.find('h4')
            elif (not link_text) and (next_sib and next_sib.find('h3')):
                title_node = next_sib.find('h3')
            elif link_text:
                title_node = link

        if title_node:
            title = self.tag_to_string(title_node)
            self.queue_article_link(section_title, full_href, title)

    def queue_article_link(self, section, url, title):
        if section not in self.articles:
            self.articles[section] = []
        self.articles[section].append(
            dict(title=title,
                 url=url,
                 date='',
                 description='',
                 author='',
                 content=''))
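
    # The dict above uses the standard calibre article keys (title, url,
    # date, description, content); parse_index() hands these lists back to
    # the download framework unchanged.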

    def populate_article_metadata(self, article, soup, first):

        def p(s):
            print('[POPULATE METADATA] ' + s[0:80])

        tnode = soup.find('title')
        if tnode:
            tstring = self.tag_to_string(tnode)
            if ' - ' in tstring:
                author = tstring.split('-')[0]
                if author:
                    article.author = author
                    article.title = author + ' - ' + article.title.strip()
                    p('Add author to title: ' + author)

        # known matches: pageNewsDetailDate, pageColumnistDetailLeftDate
        regex = re.compile('(DetailDate|DetailLeftDate)$', re.IGNORECASE)
        date_node = soup.find('div', {'class': regex})
        if date_node:
            date = self.tag_to_string(date_node).split('/')[0]
            date = ','.join(date.split(',')[:2]).strip()
            article.title = date + ' - ' + article.title.strip()
            article.date = date
            p('Add date to title: ' + date)

        strong = soup.find('strong')
        if strong:
            article.text_summary = self.tag_to_string(strong)
            p('Summary: ' + article.text_summary)
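
    # populate_article_metadata() is the BasicNewsRecipe hook calibre calls
    # for each downloaded article page; `first` is True for the first page
    # of a multi-page article.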

    # Debug helper: render a node's class and text for log messages.
    def _dbg_soup_node(self, node):
        s = ' cls: ' + node.get('class').__str__().strip() + \
            ' txt: ' + self.tag_to_string(node).strip()
        return s
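
# A quick way to test changes to this recipe, assuming a calibre install and
# that this file is saved as todays_zaman.recipe:
#   ebook-convert todays_zaman.recipe output.epub --test -vv
# The --test flag limits the download to a couple of articles per feed.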