calibre/recipes/ihned.recipe

import re
import time
from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe


class IHNed(BasicNewsRecipe):

    stahnout_vsechny = True
    # True   = stahuje vsechny z homepage
    # False  = stahuje pouze dnesni clanky (ze dne, kdy je skript spusten)

    title = 'iHNed'
    __author__ = 'Karel Bílek'
    language = 'cs'
    description = 'Zprávy z iHNed.cz'
    timefmt = ' [%a, %d %b, %Y]'
    needs_subscription = False
    remove_tags = [dict(attrs={'class': ['borderbottom', 'web', 'foot', 'reklama', 'd-elm d-rellinks', 'd-elm']}),
                   dict(style=['text-align: center;']),
                   dict(id=['r-bfull']),
                   dict(name=['script', 'noscript', 'style'])]
    encoding = 'windows-1250'
    no_stylesheets = True
    remove_tags_before = dict(attrs={'class': 'd-nadtit'})
    remove_tags_after = dict(attrs={'class': 'like'})

    conversion_options = {
        'linearize_tables': True,
    }

    def preprocess_html(self, soup):

        def makeurl(wat):
            return "http://ihned.cz" + wat

        for h1 in soup.findAll('h1'):
            a = h1.find('a')
            if a:
                string = a.string
                if string:
                    soup.a.replaceWith(string)
        for a in soup.findAll('a',  href=True):
            cil = str(a['href'])
            if cil.startswith("/") or cil.startswith("index"):
                a['href'] = makeurl(cil)
        return soup

    def parse_index(self):

        def makeurl(wat):
            if wat.startswith("/") or wat.startswith("index"):
                return "http://ihned.cz" + wat
            else:
                return wat

        articles = {}  # vysledek, asi
        ans = []  # vsechny sekce

        articles["Hlavní"] = []
        ans.append("Hlavní")

        was = {}

        def parse_subpage(url, name):
            articles[name] = []
            ans.append(name)

            soup = self.index_to_soup(url)
            otvirak = soup.find(True, attrs={'class': ['otv']})
            if otvirak:

                # the code is copypasted here because I don't know python.
                # simple as that.
                a = otvirak.find('a', href=True)
                title = self.tag_to_string(a, use_alt=True).strip()
                txt = otvirak.find(True, attrs={'class': ['txt']})
                description = ''
                if txt:
                    match = re.match(
                        r'<div class="txt">\s*([^<]*)\s*<a', str(txt), re.L)
                    if match:
                        description = match.group(1)

                pubdate = strftime('%d. %m.')
                if title not in was:
                    articles[name].append(
                        dict(title=title, url=makeurl(a['href']), date=pubdate,
                             description=description,
                             content=''))

            otv234 = soup.find(True, attrs={'class': ['otv234', 'col2a']})
            if otv234:
                for ow in otv234.findAll(True, attrs={'class': ['ow']}):
                    a = ow.find('a', href=True)
                    title = self.tag_to_string(a, use_alt=True).strip()
                    description = ''
                    prx = ow.find(True, attrs={'class': ['prx']})
                    if prx:
                        description = str(prx.string)
                    nfo = ow.find(True, attrs={'class': ['nfo']})
                    pubdate = ''
                    if nfo:
                        dtime = time.localtime()
                        day = dtime[2]
                        month = dtime[1]

                        pubdate = strftime('%d. %m.')

                        match = re.search(r'([0-9]*)\.([0-9]*)\.', str(nfo))

                        if self.stahnout_vsechny or (int(day) == int(match.group(1)) and int(month) == int(match.group(2))):
                            if title not in was:
                                articles[name].append(
                                    dict(title=title, url=makeurl(a['href']), date=pubdate,
                                         description=description,
                                         content=''))

        soup = self.index_to_soup('http://ihned.cz/')
        otvirak = soup.find(True, attrs={'class': ['otv']})
        if otvirak:
            a = otvirak.find('a', href=True)
            title = self.tag_to_string(a, use_alt=True).strip()
            txt = otvirak.find(True, attrs={'class': ['txt']})
            description = ''
            if txt:
                match = re.match(
                    r'<div class="txt">\s*([^<]*)\s*<a', str(txt), re.L)
                if match:
                    description = match.group(1)

            pubdate = strftime('%d. %m.')
            feed = "Hlavní"
            articles[feed].append(
                dict(title=title, url=(a['href']), date=pubdate,
                     description=description,
                     content=''))
            was[title] = 1

        otvirak2345 = soup.find(True, attrs={'class': ['otv2345']})
        if otvirak2345:
            for otv2 in otvirak2345.findAll(True, attrs={'class': ['otv2-5']}):
                a = otv2.find('a', attrs={'class': ['tit2']}, href=True)
                title = self.tag_to_string(a, use_alt=True).strip()
                description = ''
                span = otv2.find('span')
                if span:
                    match = re.match(r'<span>\s*([^<]*)\s*<a', str(span), re.L)
                    if match:
                        description = match.group(1)
                feed = "Hlavní"
                pubdate = strftime('%d. %m.')
                articles[feed].append(
                    dict(title=title, url=(a['href']), date=pubdate,
                         description=description,
                         content=''))
                was[title] = 1

        parse_subpage("http://komentare.ihned.cz/", "Komentáře")
        parse_subpage("http://domaci.ihned.cz", "Domácí")
        parse_subpage("http://ekonomika.ihned.cz", "Ekonomika")
        parse_subpage("http://zahranicni.ihned.cz/", "Zahraničí")
        parse_subpage("http://finweb.ihned.cz/", "Finance")
        parse_subpage("http://digiweb.ihned.cz/", "DigiWeb")
        parse_subpage("http://kultura.ihned.cz/", "Kultura")
        parse_subpage("http://sport.ihned.cz/", "Sport")

        # seradi kategorie
        ans = self.sort_index_by(ans, {'Hlavni': 1, 'Domácí': 2, 'Ekonomika': 5, 'Zahraničí': 3,
                                       'Finance': 6, 'DigiWeb': 7, 'Kultura': 8, 'Sport': 9, 'Komentáře': 4})

        # vrati, ale pouze, kdyz je v kategoriich...
        ans = [(key, articles[key]) for key in ans if key in articles]
        return ans