mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-10-23 23:08:55 -04:00
183 lines
6.6 KiB
Plaintext
183 lines
6.6 KiB
Plaintext
import re, time
|
|
from calibre import strftime
|
|
from calibre.web.feeds.recipes import BasicNewsRecipe
|
|
|
|
class IHNed(BasicNewsRecipe):
    """Calibre recipe that builds an e-book from the iHNed.cz news site.

    ``parse_index`` scrapes the homepage into the main section and then one
    section per thematic sub-site; ``preprocess_html`` tidies each downloaded
    article page.
    """

    # True  = take every article listed on the section pages.
    # False = take only listed articles whose date stamp is today's date
    #         (the day the recipe is run).
    stahnout_vsechny = True

    title = 'iHNed'
    __author__ = 'Karel Bílek'
    language = 'cs'
    description = 'Zprávy z iHNed.cz'
    timefmt = ' [%a, %d %b, %Y]'
    needs_subscription = False
    remove_tags = [
        dict(attrs={'class': ['borderbottom', 'web', 'foot', 'reklama',
                              'd-elm d-rellinks', 'd-elm']}),
        dict(style=['text-align: center;']),
        dict(id=['r-bfull']),
        dict(name=['script', 'noscript', 'style']),
    ]
    encoding = 'windows-1250'
    no_stylesheets = True
    remove_tags_before = dict(attrs={'class': 'd-nadtit'})
    remove_tags_after = dict(attrs={'class': 'like'})

    conversion_options = {
        'linearize_tables': True,
    }

    # Prefix applied to site-relative links.
    _SITE_ROOT = 'http://ihned.cz'

    # Name of the section filled from the homepage.
    _MAIN_SECTION = 'Hlavní'

    # (index page URL, section name) pairs, in download order.
    _SECTION_PAGES = (
        ("http://komentare.ihned.cz/", "Komentáře"),
        ("http://domaci.ihned.cz", "Domácí"),
        ("http://ekonomika.ihned.cz", "Ekonomika"),
        ("http://zahranicni.ihned.cz/", "Zahraničí"),
        ("http://finweb.ihned.cz/", "Finance"),
        ("http://digiweb.ihned.cz/", "DigiWeb"),
        ("http://kultura.ihned.cz/", "Kultura"),
        ("http://sport.ihned.cz/", "Sport"),
    )

    # Ordering weights handed to sort_index_by (lower sorts earlier).
    _SECTION_WEIGHTS = {
        'Hlavní': 1, 'Domácí': 2, 'Zahraničí': 3, 'Komentáře': 4,
        'Ekonomika': 5, 'Finance': 6, 'DigiWeb': 7, 'Kultura': 8, 'Sport': 9,
    }

    # Teaser text that precedes the first link inside the lead article box.
    # No re.LOCALE flag: Python 3 rejects it for str patterns.
    _LEAD_TEASER_RE = re.compile(r'<div class="txt">\s*([^<]*)\s*<a')
    # Teaser text that precedes the first link in a secondary homepage box.
    _SPAN_TEASER_RE = re.compile(r'<span>\s*([^<]*)\s*<a')
    # "day.month." stamp; '+' (not '*') so both groups always hold digits.
    _DATE_RE = re.compile(r'([0-9]+)\.([0-9]+)\.')

    def _absolute_url(self, href):
        """Return *href* with the site root prepended when it is relative.

        Only hrefs starting with "/" or "index" are rewritten; anything else
        is returned unchanged.
        """
        href = str(href)
        if href.startswith("/"):
            return self._SITE_ROOT + href
        if href.startswith("index"):
            # Insert the separator, otherwise host and path run together.
            return self._SITE_ROOT + "/" + href
        return href

    def _teaser_from_markup(self, tag, pattern):
        """Return group 1 of *pattern* matched against *tag*'s markup, or ''."""
        if tag is None:
            return ''
        found = pattern.match(str(tag))
        if found is None:
            return ''
        return found.group(1)

    def _make_article(self, anchor, description):
        """Build the article dict for *anchor*; the date is always today's."""
        return dict(
            title=self.tag_to_string(anchor, use_alt=True).strip(),
            url=self._absolute_url(anchor['href']),
            date=strftime('%d. %m.'),
            description=description,
            content='',
        )

    def _lead_article(self, soup):
        """Return the article from the 'otv' lead box, or None if absent."""
        box = soup.find(True, attrs={'class': ['otv']})
        if box is None:
            return None
        anchor = box.find('a', href=True)
        if anchor is None:
            # No link to follow, so there is nothing to download.
            return None
        teaser_tag = box.find(True, attrs={'class': ['txt']})
        teaser = self._teaser_from_markup(teaser_tag, self._LEAD_TEASER_RE)
        return self._make_article(anchor, teaser)

    def _homepage_articles(self, soup):
        """Return the lead article plus the 'otv2-5' boxes of the homepage."""
        found = []
        lead = self._lead_article(soup)
        if lead is not None:
            found.append(lead)

        group = soup.find(True, attrs={'class': ['otv2345']})
        if group is not None:
            for box in group.findAll(True, attrs={'class': ['otv2-5']}):
                anchor = box.find('a', attrs={'class': ['tit2']}, href=True)
                if anchor is None:
                    continue
                teaser = self._teaser_from_markup(
                    box.find('span'), self._SPAN_TEASER_RE)
                found.append(self._make_article(anchor, teaser))
        return found

    def _is_dated_today(self, stamp_tag):
        """Return True when *stamp_tag* carries today's "day.month." stamp."""
        found = self._DATE_RE.search(str(stamp_tag))
        if found is None:
            # Cannot prove the entry is from today.
            return False
        now = time.localtime()
        return (now.tm_mday == int(found.group(1))
                and now.tm_mon == int(found.group(2)))

    def _section_articles(self, soup):
        """Return the lead article plus the listed entries of a section page."""
        found = []
        lead = self._lead_article(soup)
        if lead is not None:
            found.append(lead)

        listing = soup.find(True, attrs={'class': ['otv234', 'col2a']})
        if listing is None:
            return found

        for entry in listing.findAll(True, attrs={'class': ['ow']}):
            anchor = entry.find('a', href=True)
            stamp = entry.find(True, attrs={'class': ['nfo']})
            # Entries without a link or without a date stamp are skipped.
            if anchor is None or stamp is None:
                continue
            if not (self.stahnout_vsechny or self._is_dated_today(stamp)):
                continue
            teaser = ''
            teaser_tag = entry.find(True, attrs={'class': ['prx']})
            # .string is None for nested markup; avoid a literal 'None'.
            if teaser_tag is not None and teaser_tag.string is not None:
                teaser = str(teaser_tag.string)
            found.append(self._make_article(anchor, teaser))
        return found

    def preprocess_html(self, soup):
        """Clean up a downloaded article page and return the same soup.

        Headline links inside <h1> are replaced by their plain text, and
        site-relative hrefs are rewritten to absolute URLs.
        """
        for heading in soup.findAll('h1'):
            link = heading.find('a')
            if link:
                text = link.string
                if text:
                    # Replace the headline's own link, not the first <a>
                    # of the whole document.
                    link.replaceWith(text)

        for link in soup.findAll('a', href=True):
            target = str(link['href'])
            if target.startswith("/") or target.startswith("index"):
                link['href'] = self._absolute_url(target)
        return soup

    def parse_index(self):
        """Scrape the homepage and section pages into the recipe index.

        Returns a list of ``(section name, [article dict, ...])`` tuples
        ordered by ``_SECTION_WEIGHTS``. Section-page articles whose title
        already appeared on the homepage are left out.
        """
        main = self._MAIN_SECTION
        sections = [main]          # section names, in discovery order
        articles = {main: []}      # section name -> list of article dicts
        homepage_titles = set()    # titles already taken from the homepage

        homepage = self.index_to_soup('http://ihned.cz/')
        for article in self._homepage_articles(homepage):
            articles[main].append(article)
            homepage_titles.add(article['title'])

        for url, name in self._SECTION_PAGES:
            sections.append(name)
            page = self.index_to_soup(url)
            articles[name] = [
                article for article in self._section_articles(page)
                if article['title'] not in homepage_titles
            ]

        # Sort the sections into the desired reading order.
        ordered = self.sort_index_by(sections, self._SECTION_WEIGHTS)

        # Return only sections that actually have an entry in the mapping.
        return [(name, articles[name]) for name in ordered if name in articles]
|
|
|