def preprocess_html(self, soup):
    """Tidy a downloaded article page before it is converted.

    Two passes are made over the parsed document:

    1. For every ``<h1>`` that contains a link with a single text child,
       that link is replaced by its text, so the headline is no longer a
       hyperlink.
    2. Every ``href`` that starts with ``/`` or ``index`` is rewritten as
       an absolute URL under ``http://ihned.cz``.

    :param soup: parsed document exposing the BeautifulSoup-style
        ``findAll`` / ``find`` / ``replaceWith`` API.
    :return: the same ``soup`` object, modified in place.
    """
    site_root = "http://ihned.cz"

    for heading in soup.findAll('h1'):
        link = heading.find('a')
        if not link:
            continue
        text = link.string
        if text:
            # Replace the link that lives inside *this* heading.  The
            # previous code used ``soup.a``, i.e. the first link of the
            # whole document, which is usually an unrelated one.
            link.replaceWith(text)

    for link in soup.findAll('a', href=True):
        target = str(link['href'])
        if target.startswith('/'):
            link['href'] = site_root + target
        elif target.startswith('index'):
            # "index..." has no leading slash; add one so the host name
            # and the path do not run together.
            link['href'] = site_root + '/' + target
    return soup
python. simple as that. + a = otvirak.find('a', href=True) + title = self.tag_to_string(a, use_alt=True).strip() + txt = otvirak.find(True, attrs={'class':['txt']}) + description = '' + if txt: + match = re.match(r'
\s*([^<]*)\s*\s*([^<]*)\s*\s*([^<]*)\s*