# mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-07-09)
# "iHNed" recipe contributed by Karel Bilek
# resources/recipes/ihned.recipe — new file, 182 lines
import re, time

from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe

class IHNed(BasicNewsRecipe):
|
||||
|
||||
|
||||
stahnout_vsechny = False
|
||||
#True = stahuje vsechny z homepage
|
||||
#False = stahuje pouze dnesni clanky (ze dne, kdy je skript spusten)
|
||||
|
||||
title = 'iHNed'
|
||||
__author__ = 'Karel Bílek'
|
||||
language = 'cs'
|
||||
description = 'Zprávy z iHNed.cz'
|
||||
timefmt = ' [%a, %d %b, %Y]'
|
||||
needs_subscription = False
|
||||
remove_tags = [dict(attrs={'class':['borderbottom', 'web', 'foot', 'reklama', 'd-elm d-rellinks', 'd-elm']}),
|
||||
dict(style=['text-align: center;']),
|
||||
dict(id=['r-bfull']),
|
||||
dict(name=['script', 'noscript', 'style'])]
|
||||
encoding = 'windows-1250'
|
||||
no_stylesheets = True
|
||||
remove_tags_before = dict(attrs={'class':'d-nadtit'})
|
||||
remove_tags_after = dict(attrs={'class':'like'})
|
||||
|
||||
conversion_options = {
|
||||
'linearize_tables' : True,
|
||||
}
|
||||
|
||||
|
||||
|
||||
def preprocess_html(self, soup):
|
||||
|
||||
def makeurl(wat):
|
||||
return "http://ihned.cz"+wat;
|
||||
|
||||
for h1 in soup.findAll('h1'):
|
||||
a = h1.find('a')
|
||||
if a:
|
||||
string = a.string
|
||||
if string:
|
||||
soup.a.replaceWith(string)
|
||||
for a in soup.findAll('a', href=True) :
|
||||
cil = str(a['href'])
|
||||
if cil.startswith("/") or cil.startswith("index"):
|
||||
a['href'] = makeurl(cil)
|
||||
return soup
|
||||
|
||||
|
||||
def parse_index(self):
|
||||
|
||||
def makeurl(wat):
|
||||
if wat.startswith("/") or wat.startswith("index"):
|
||||
return "http://ihned.cz"+wat;
|
||||
else:
|
||||
return wat
|
||||
|
||||
|
||||
articles = {} #vysledek, asi
|
||||
key = None #soucasna sekce
|
||||
ans = [] #vsechny sekce
|
||||
|
||||
articles["Hlavní"] = []
|
||||
ans.append("Hlavní")
|
||||
|
||||
was = {}
|
||||
|
||||
def parse_subpage(url, name):
|
||||
articles[name] = []
|
||||
ans.append(name)
|
||||
|
||||
|
||||
soup = self.index_to_soup(url)
|
||||
otvirak = soup.find(True, attrs={'class':['otv']})
|
||||
if otvirak:
|
||||
|
||||
#the code is copypasted here because I don't know python. simple as that.
|
||||
a = otvirak.find('a', href=True)
|
||||
title = self.tag_to_string(a, use_alt=True).strip()
|
||||
txt = otvirak.find(True, attrs={'class':['txt']})
|
||||
description = ''
|
||||
if txt:
|
||||
match = re.match(r'<div class="txt">\s*([^<]*)\s*<a', str(txt), re.L)
|
||||
if match:
|
||||
description = match.group(1)
|
||||
|
||||
pubdate = strftime('%d. %m.')
|
||||
if not title in was:
|
||||
articles[name].append(
|
||||
dict(title=title, url=makeurl(a['href']), date=pubdate,
|
||||
description=description,
|
||||
content=''))
|
||||
|
||||
otv234 = soup.find(True, attrs={'class':['otv234', 'col2a']})
|
||||
if otv234:
|
||||
for ow in otv234.findAll(True, attrs={'class':['ow']}):
|
||||
a = ow.find('a', href=True)
|
||||
title = self.tag_to_string(a, use_alt=True).strip()
|
||||
description=''
|
||||
prx = ow.find(True, attrs={'class':['prx']});
|
||||
if prx:
|
||||
description = str(prx.string)
|
||||
nfo = ow.find(True, attrs={'class':['nfo']});
|
||||
pubdate = ''
|
||||
if nfo:
|
||||
dtime = time.localtime();
|
||||
day = dtime[2]
|
||||
month = dtime[1]
|
||||
|
||||
pubdate = strftime('%d. %m.')
|
||||
|
||||
match = re.search(r'([0-9]*)\.([0-9]*)\.', str(nfo))
|
||||
|
||||
if self.stahnout_vsechny or (int(day) == int(match.group(1)) and int(month) == int(match.group(2))):
|
||||
if not title in was:
|
||||
articles[name].append(
|
||||
dict(title=title, url=makeurl(a['href']), date=pubdate,
|
||||
description=description,
|
||||
content=''))
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
soup = self.index_to_soup('http://ihned.cz/')
|
||||
otvirak = soup.find(True, attrs={'class':['otv']})
|
||||
if otvirak:
|
||||
a = otvirak.find('a', href=True)
|
||||
title = self.tag_to_string(a, use_alt=True).strip()
|
||||
txt = otvirak.find(True, attrs={'class':['txt']})
|
||||
description = ''
|
||||
if txt:
|
||||
match = re.match(r'<div class="txt">\s*([^<]*)\s*<a', str(txt), re.L)
|
||||
if match:
|
||||
description = match.group(1)
|
||||
|
||||
pubdate = strftime('%d. %m.')
|
||||
feed = "Hlavní"
|
||||
articles[feed].append(
|
||||
dict(title=title, url=(a['href']), date=pubdate,
|
||||
description=description,
|
||||
content=''))
|
||||
was[title]=1
|
||||
|
||||
otvirak2345 = soup.find(True, attrs={'class':['otv2345']})
|
||||
if otvirak2345:
|
||||
for otv2 in otvirak2345.findAll(True, attrs={'class':['otv2-5']}):
|
||||
a = otv2.find('a', attrs={'class':['tit2']}, href=True)
|
||||
title = self.tag_to_string(a, use_alt=True).strip()
|
||||
description=''
|
||||
span = otv2.find('span');
|
||||
if span:
|
||||
match = re.match(r'<span>\s*([^<]*)\s*<a', str(span), re.L)
|
||||
if match:
|
||||
description = match.group(1)
|
||||
feed = "Hlavní"
|
||||
pubdate = strftime('%d. %m.')
|
||||
articles[feed].append(
|
||||
dict(title=title, url=(a['href']), date=pubdate,
|
||||
description=description,
|
||||
content=''))
|
||||
was[title]=1
|
||||
|
||||
|
||||
parse_subpage("http://komentare.ihned.cz/", "Komentáře")
|
||||
parse_subpage("http://domaci.ihned.cz", "Domácí")
|
||||
parse_subpage("http://ekonomika.ihned.cz", "Ekonomika")
|
||||
parse_subpage("http://zahranicni.ihned.cz/", "Zahraničí");
|
||||
parse_subpage("http://finweb.ihned.cz/", "Finance");
|
||||
parse_subpage("http://digiweb.ihned.cz/", "DigiWeb");
|
||||
parse_subpage("http://kultura.ihned.cz/", "Kultura")
|
||||
parse_subpage("http://sport.ihned.cz/", "Sport");
|
||||
|
||||
#seradi kategorie
|
||||
ans = self.sort_index_by(ans, {'Hlavni':1, 'Domácí':2, 'Ekonomika':5, 'Zahraničí':3, 'Finance':6, 'DigiWeb':7, 'Kultura':8, 'Sport':9, 'Komentáře':4})
|
||||
|
||||
#vrati, ale pouze, kdyz je v kategoriich...
|
||||
ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
|
||||
return ans
|
||||
|
# (trailing web-viewer residue removed)