# mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-07-09)
# "iHNed" recipe contributed by Karel Bilek
# resources/recipes/ihned.recipe — new file, 182 lines
import re, time

from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe

class IHNed(BasicNewsRecipe):
|
||||
|
||||
|
||||
stahnout_vsechny = False
|
||||
#True = stahuje vsechny z homepage
|
||||
#False = stahuje pouze dnesni clanky (ze dne, kdy je skript spusten)
|
||||
|
||||
title = 'iHNed'
|
||||
__author__ = 'Karel Bílek'
|
||||
language = 'cs'
|
||||
description = 'Zprávy z iHNed.cz'
|
||||
timefmt = ' [%a, %d %b, %Y]'
|
||||
needs_subscription = False
|
||||
remove_tags = [dict(attrs={'class':['borderbottom', 'web', 'foot', 'reklama', 'd-elm d-rellinks', 'd-elm']}),
|
||||
dict(style=['text-align: center;']),
|
||||
dict(id=['r-bfull']),
|
||||
dict(name=['script', 'noscript', 'style'])]
|
||||
encoding = 'windows-1250'
|
||||
no_stylesheets = True
|
||||
remove_tags_before = dict(attrs={'class':'d-nadtit'})
|
||||
remove_tags_after = dict(attrs={'class':'like'})
|
||||
|
||||
conversion_options = {
|
||||
'linearize_tables' : True,
|
||||
}
|
||||
|
||||
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
|
||||
def makeurl(wat):
|
||||
return "http://ihned.cz"+wat;
|
||||
|
||||
for h1 in soup.findAll('h1'):
|
||||
a = h1.find('a')
|
||||
if a:
|
||||
string = a.string
|
||||
if string:
|
||||
soup.a.replaceWith(string)
|
||||
for a in soup.findAll('a', href=True) :
|
||||
cil = str(a['href'])
|
||||
if cil.startswith("/") or cil.startswith("index"):
|
||||
a['href'] = makeurl(cil)
|
||||
return soup
|
||||
|
||||
|
||||
def parse_index(self):
|
||||
|
||||
def makeurl(wat):
|
||||
if wat.startswith("/") or wat.startswith("index"):
|
||||
return "http://ihned.cz"+wat;
|
||||
else:
|
||||
return wat
|
||||
|
||||
|
||||
articles = {} #vysledek, asi
|
||||
key = None #soucasna sekce
|
||||
ans = [] #vsechny sekce
|
||||
|
||||
articles["Hlavní"] = []
|
||||
ans.append("Hlavní")
|
||||
|
||||
was = {}
|
||||
|
||||
def parse_subpage(url, name):
|
||||
articles[name] = []
|
||||
ans.append(name)
|
||||
|
||||
|
||||
soup = self.index_to_soup(url)
|
||||
otvirak = soup.find(True, attrs={'class':['otv']})
|
||||
if otvirak:
|
||||
|
||||
#the code is copypasted here because I don't know python. simple as that.
|
||||
a = otvirak.find('a', href=True)
|
||||
title = self.tag_to_string(a, use_alt=True).strip()
|
||||
txt = otvirak.find(True, attrs={'class':['txt']})
|
||||
description = ''
|
||||
if txt:
|
||||
match = re.match(r'<div class="txt">\s*([^<]*)\s*<a', str(txt), re.L)
|
||||
if match:
|
||||
description = match.group(1)
|
||||
|
||||
pubdate = strftime('%d. %m.')
|
||||
if not title in was:
|
||||
articles[name].append(
|
||||
dict(title=title, url=makeurl(a['href']), date=pubdate,
|
||||
description=description,
|
||||
content=''))
|
||||
|
||||
otv234 = soup.find(True, attrs={'class':['otv234', 'col2a']})
|
||||
if otv234:
|
||||
for ow in otv234.findAll(True, attrs={'class':['ow']}):
|
||||
a = ow.find('a', href=True)
|
||||
title = self.tag_to_string(a, use_alt=True).strip()
|
||||
description=''
|
||||
prx = ow.find(True, attrs={'class':['prx']});
|
||||
if prx:
|
||||
description = str(prx.string)
|
||||
nfo = ow.find(True, attrs={'class':['nfo']});
|
||||
pubdate = ''
|
||||
if nfo:
|
||||
dtime = time.localtime();
|
||||
day = dtime[2]
|
||||
month = dtime[1]
|
||||
|
||||
pubdate = strftime('%d. %m.')
|
||||
|
||||
match = re.search(r'([0-9]*)\.([0-9]*)\.', str(nfo))
|
||||
|
||||
if self.stahnout_vsechny or (int(day) == int(match.group(1)) and int(month) == int(match.group(2))):
|
||||
if not title in was:
|
||||
articles[name].append(
|
||||
dict(title=title, url=makeurl(a['href']), date=pubdate,
|
||||
description=description,
|
||||
content=''))
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
soup = self.index_to_soup('http://ihned.cz/')
|
||||
otvirak = soup.find(True, attrs={'class':['otv']})
|
||||
if otvirak:
|
||||
a = otvirak.find('a', href=True)
|
||||
title = self.tag_to_string(a, use_alt=True).strip()
|
||||
txt = otvirak.find(True, attrs={'class':['txt']})
|
||||
description = ''
|
||||
if txt:
|
||||
match = re.match(r'<div class="txt">\s*([^<]*)\s*<a', str(txt), re.L)
|
||||
if match:
|
||||
description = match.group(1)
|
||||
|
||||
pubdate = strftime('%d. %m.')
|
||||
feed = "Hlavní"
|
||||
articles[feed].append(
|
||||
dict(title=title, url=(a['href']), date=pubdate,
|
||||
description=description,
|
||||
content=''))
|
||||
was[title]=1
|
||||
|
||||
otvirak2345 = soup.find(True, attrs={'class':['otv2345']})
|
||||
if otvirak2345:
|
||||
for otv2 in otvirak2345.findAll(True, attrs={'class':['otv2-5']}):
|
||||
a = otv2.find('a', attrs={'class':['tit2']}, href=True)
|
||||
title = self.tag_to_string(a, use_alt=True).strip()
|
||||
description=''
|
||||
span = otv2.find('span');
|
||||
if span:
|
||||
match = re.match(r'<span>\s*([^<]*)\s*<a', str(span), re.L)
|
||||
if match:
|
||||
description = match.group(1)
|
||||
feed = "Hlavní"
|
||||
pubdate = strftime('%d. %m.')
|
||||
articles[feed].append(
|
||||
dict(title=title, url=(a['href']), date=pubdate,
|
||||
description=description,
|
||||
content=''))
|
||||
was[title]=1
|
||||
|
||||
|
||||
parse_subpage("http://komentare.ihned.cz/", "Komentáře")
|
||||
parse_subpage("http://domaci.ihned.cz", "Domácí")
|
||||
parse_subpage("http://ekonomika.ihned.cz", "Ekonomika")
|
||||
parse_subpage("http://zahranicni.ihned.cz/", "Zahraničí");
|
||||
parse_subpage("http://finweb.ihned.cz/", "Finance");
|
||||
parse_subpage("http://digiweb.ihned.cz/", "DigiWeb");
|
||||
parse_subpage("http://kultura.ihned.cz/", "Kultura")
|
||||
parse_subpage("http://sport.ihned.cz/", "Sport");
|
||||
|
||||
#seradi kategorie
|
||||
ans = self.sort_index_by(ans, {'Hlavni':1, 'Domácí':2, 'Ekonomika':5, 'Zahraničí':3, 'Finance':6, 'DigiWeb':7, 'Kultura':8, 'Sport':9, 'Komentáře':4})
|
||||
|
||||
#vrati, ale pouze, kdyz je v kategoriich...
|
||||
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
|
||||
return ans
|
||||
|
# (trailing web-viewer residue removed)