import re import time from calibre import strftime from calibre.web.feeds.recipes import BasicNewsRecipe class IHNed(BasicNewsRecipe): stahnout_vsechny = True # True = stahuje vsechny z homepage # False = stahuje pouze dnesni clanky (ze dne, kdy je skript spusten) title = 'iHNed' __author__ = 'Karel Bílek' language = 'cs' description = 'Zprávy z iHNed.cz' timefmt = ' [%a, %d %b, %Y]' needs_subscription = False remove_tags = [dict(attrs={'class': ['borderbottom', 'web', 'foot', 'reklama', 'd-elm d-rellinks', 'd-elm']}), dict(style=['text-align: center;']), dict(id=['r-bfull']), dict(name=['script', 'noscript', 'style'])] encoding = 'windows-1250' no_stylesheets = True remove_tags_before = dict(attrs={'class': 'd-nadtit'}) remove_tags_after = dict(attrs={'class': 'like'}) conversion_options = { 'linearize_tables': True, } def preprocess_html(self, soup): def makeurl(wat): return "http://ihned.cz" + wat for h1 in soup.findAll('h1'): a = h1.find('a') if a: string = a.string if string: soup.a.replaceWith(string) for a in soup.findAll('a', href=True): cil = str(a['href']) if cil.startswith("/") or cil.startswith("index"): a['href'] = makeurl(cil) return soup def parse_index(self): def makeurl(wat): if wat.startswith("/") or wat.startswith("index"): return "http://ihned.cz" + wat else: return wat articles = {} # vysledek, asi ans = [] # vsechny sekce articles["Hlavní"] = [] ans.append("Hlavní") was = {} def parse_subpage(url, name): articles[name] = [] ans.append(name) soup = self.index_to_soup(url) otvirak = soup.find(True, attrs={'class': ['otv']}) if otvirak: # the code is copypasted here because I don't know python. # simple as that. a = otvirak.find('a', href=True) title = self.tag_to_string(a, use_alt=True).strip() txt = otvirak.find(True, attrs={'class': ['txt']}) description = '' if txt: match = re.match( r'
\s*([^<]*)\s*\s*([^<]*)\s*\s*([^<]*)\s*