import re
import time

from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe


class IHNed(BasicNewsRecipe):

    stahnout_vsechny = True
    # True  = download all articles linked from the homepage
    # False = download only today's articles (from the day the script is run)

    title = 'iHNed'
    __author__ = 'Karel Bílek'
    language = 'cs'
    description = 'Zprávy z iHNed.cz'
    timefmt = ' [%a, %d %b, %Y]'
    needs_subscription = False
    remove_tags = [dict(attrs={'class': ['borderbottom', 'web', 'foot', 'reklama', 'd-elm d-rellinks', 'd-elm']}),
                   dict(style=['text-align: center;']),
                   dict(id=['r-bfull']),
                   dict(name=['script', 'noscript', 'style'])]
    encoding = 'windows-1250'
    no_stylesheets = True
    remove_tags_before = dict(attrs={'class': 'd-nadtit'})
    remove_tags_after = dict(attrs={'class': 'like'})

    conversion_options = {
        'linearize_tables': True,
    }

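    # Clean up article pages: drop links wrapped around headlines and make
    # relative article URLs absolute.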
    def preprocess_html(self, soup):

        def makeurl(wat):
            return "http://ihned.cz" + wat

        for h1 in soup.findAll('h1'):
            a = h1.find('a')
            if a:
                string = a.string
                if string:
                    # replace the headline's anchor with its plain text
                    a.replaceWith(string)
        for a in soup.findAll('a', href=True):
            cil = str(a['href'])
            if cil.startswith("/") or cil.startswith("index"):
                a['href'] = makeurl(cil)
        return soup

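    # Build the index: collect sections and their article lists from the
    # homepage and from each section subpage.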
    def parse_index(self):

        def makeurl(wat):
            if wat.startswith("/") or wat.startswith("index"):
                return "http://ihned.cz" + wat
            else:
                return wat

        articles = {}  # the result: section name -> list of articles
        ans = []  # all sections

        articles["Hlavní"] = []
        ans.append("Hlavní")

        was = {}

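        # Parse a single section subpage: the opening ("otvirak") article,
        # then the regular article listing.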
        def parse_subpage(url, name):
            articles[name] = []
            ans.append(name)

            soup = self.index_to_soup(url)
            otvirak = soup.find(True, attrs={'class': ['otv']})
            if otvirak:

                # the code is copy-pasted here because I don't know Python.
                # simple as that.
                a = otvirak.find('a', href=True)
                title = self.tag_to_string(a, use_alt=True).strip()
                txt = otvirak.find(True, attrs={'class': ['txt']})
                description = ''
                if txt:
                    match = re.match(
                        r'<div class="txt">\s*([^<]*)\s*<a', str(txt))
                    if match:
                        description = match.group(1)

                pubdate = strftime('%d. %m.')
                if title not in was:
                    articles[name].append(
                        dict(title=title, url=makeurl(a['href']), date=pubdate,
                             description=description,
                             content=''))

            otv234 = soup.find(True, attrs={'class': ['otv234', 'col2a']})
            if otv234:
                for ow in otv234.findAll(True, attrs={'class': ['ow']}):
                    a = ow.find('a', href=True)
                    title = self.tag_to_string(a, use_alt=True).strip()
                    description = ''
                    prx = ow.find(True, attrs={'class': ['prx']})
                    if prx:
                        description = str(prx.string)
                    nfo = ow.find(True, attrs={'class': ['nfo']})
                    pubdate = ''
                    if nfo:
                        dtime = time.localtime()
                        day = dtime[2]
                        month = dtime[1]

                        pubdate = strftime('%d. %m.')

                        match = re.search(r'([0-9]*)\.([0-9]*)\.', str(nfo))

                        if self.stahnout_vsechny or (match and int(day) == int(match.group(1)) and int(month) == int(match.group(2))):
                            if title not in was:
                                articles[name].append(
                                    dict(title=title, url=makeurl(a['href']), date=pubdate,
                                         description=description,
                                         content=''))

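        # Homepage: the main opener article goes into the "Hlavní" section.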
        soup = self.index_to_soup('http://ihned.cz/')
        otvirak = soup.find(True, attrs={'class': ['otv']})
        if otvirak:
            a = otvirak.find('a', href=True)
            title = self.tag_to_string(a, use_alt=True).strip()
            txt = otvirak.find(True, attrs={'class': ['txt']})
            description = ''
            if txt:
                match = re.match(
                    r'<div class="txt">\s*([^<]*)\s*<a', str(txt))
                if match:
                    description = match.group(1)

            pubdate = strftime('%d. %m.')
            feed = "Hlavní"
            articles[feed].append(
                dict(title=title, url=a['href'], date=pubdate,
                     description=description,
                     content=''))
            was[title] = 1

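        # Secondary homepage openers ("otv2-5") also go into "Hlavní".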
        otvirak2345 = soup.find(True, attrs={'class': ['otv2345']})
        if otvirak2345:
            for otv2 in otvirak2345.findAll(True, attrs={'class': ['otv2-5']}):
                a = otv2.find('a', attrs={'class': ['tit2']}, href=True)
                title = self.tag_to_string(a, use_alt=True).strip()
                description = ''
                span = otv2.find('span')
                if span:
                    match = re.match(r'<span>\s*([^<]*)\s*<a', str(span))
                    if match:
                        description = match.group(1)
                feed = "Hlavní"
                pubdate = strftime('%d. %m.')
                articles[feed].append(
                    dict(title=title, url=a['href'], date=pubdate,
                         description=description,
                         content=''))
                was[title] = 1

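        # Individual section subpages.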
parse_subpage("http://komentare.ihned.cz/", "Komentáře")
|
|
parse_subpage("http://domaci.ihned.cz", "Domácí")
|
|
parse_subpage("http://ekonomika.ihned.cz", "Ekonomika")
|
|
parse_subpage("http://zahranicni.ihned.cz/", "Zahraničí")
|
|
parse_subpage("http://finweb.ihned.cz/", "Finance")
|
|
parse_subpage("http://digiweb.ihned.cz/", "DigiWeb")
|
|
parse_subpage("http://kultura.ihned.cz/", "Kultura")
|
|
parse_subpage("http://sport.ihned.cz/", "Sport")
|
|
|
|
        # sort the sections
        ans = self.sort_index_by(ans, {'Hlavní': 1, 'Domácí': 2, 'Ekonomika': 5, 'Zahraničí': 3,
                                       'Finance': 6, 'DigiWeb': 7, 'Kultura': 8, 'Sport': 9, 'Komentáře': 4})

        # return only the sections that actually have articles
        ans = [(key, articles[key]) for key in ans if key in articles]
        return ans