calibre/recipes/ihned.recipe
import re
import time

from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe


class IHNed(BasicNewsRecipe):

    stahnout_vsechny = True
    # True  = download every article linked from the homepage
    # False = download only today's articles (from the day the recipe is run)

    title = 'iHNed'
    __author__ = 'Karel Bílek'
    language = 'cs'
    description = 'Zprávy z iHNed.cz'
    timefmt = ' [%a, %d %b, %Y]'
    needs_subscription = False
    remove_tags = [dict(attrs={'class': ['borderbottom', 'web', 'foot', 'reklama', 'd-elm d-rellinks', 'd-elm']}),
                   dict(style=['text-align: center;']),
                   dict(id=['r-bfull']),
                   dict(name=['script', 'noscript', 'style'])]
    encoding = 'windows-1250'
    no_stylesheets = True
    remove_tags_before = dict(attrs={'class': 'd-nadtit'})
    remove_tags_after = dict(attrs={'class': 'like'})
    conversion_options = {
        'linearize_tables': True,
    }

    def preprocess_html(self, soup):
        def makeurl(wat):
            return "http://ihned.cz" + wat

        # Replace each headline link with its plain text so the heading is
        # not a clickable link in the generated e-book.
        for h1 in soup.findAll('h1'):
            a = h1.find('a')
            if a:
                string = a.string
                if string:
                    a.replaceWith(string)

        # Turn site-relative links into absolute ones.
        for a in soup.findAll('a', href=True):
            cil = str(a['href'])
            if cil.startswith("/") or cil.startswith("index"):
                a['href'] = makeurl(cil)

        return soup

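    # Note on the calibre API used below: parse_index() must return a list of
    # (section title, article list) tuples, where each article is a dict with
    # at least 'title' and 'url' (this recipe also fills 'date', 'description'
    # and 'content'). A purely illustrative example of the shape this recipe
    # builds (values made up):
    #
    #   [('Hlavní',    [{'title': 'Some article', 'url': 'http://ihned.cz/...',
    #                    'date': '29. 07.', 'description': '...', 'content': ''}]),
    #    ('Komentáře', [...])]
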
    def parse_index(self):

        def makeurl(wat):
            if wat.startswith("/") or wat.startswith("index"):
                return "http://ihned.cz" + wat
            else:
                return wat

        articles = {}  # maps section name -> list of article dicts
        ans = []       # all section names

        articles["Hlavní"] = []
        ans.append("Hlavní")

        was = {}  # titles already added, to avoid duplicates

        def parse_subpage(url, name):
            articles[name] = []
            ans.append(name)

            soup = self.index_to_soup(url)

            # Opening (lead) story block of the section page.
            otvirak = soup.find(True, attrs={'class': ['otv']})
            if otvirak:
                # the code is copypasted here because I don't know python.
                # simple as that.
                a = otvirak.find('a', href=True)
                title = self.tag_to_string(a, use_alt=True).strip()
                txt = otvirak.find(True, attrs={'class': ['txt']})
                description = ''
                if txt:
                    match = re.match(
                        r'<div class="txt">\s*([^<]*)\s*<a', str(txt))
                    if match:
                        description = match.group(1)

                pubdate = strftime('%d. %m.')
                if title not in was:
                    articles[name].append(
                        dict(title=title, url=makeurl(a['href']), date=pubdate,
                             description=description,
                             content=''))

            # Remaining articles of the section page.
            otv234 = soup.find(True, attrs={'class': ['otv234', 'col2a']})
            if otv234:
                for ow in otv234.findAll(True, attrs={'class': ['ow']}):
                    a = ow.find('a', href=True)
                    title = self.tag_to_string(a, use_alt=True).strip()
                    description = ''
                    prx = ow.find(True, attrs={'class': ['prx']})
                    if prx:
                        description = str(prx.string)
                    nfo = ow.find(True, attrs={'class': ['nfo']})
                    pubdate = ''
                    if nfo:
                        dtime = time.localtime()
                        day = dtime[2]
                        month = dtime[1]

                        pubdate = strftime('%d. %m.')

                        # Keep the article if everything is being downloaded,
                        # or if its date matches today's date.
                        match = re.search(r'([0-9]*)\.([0-9]*)\.', str(nfo))
                        if self.stahnout_vsechny or (
                                match and
                                int(day) == int(match.group(1)) and
                                int(month) == int(match.group(2))):
                            if title not in was:
                                articles[name].append(
                                    dict(title=title, url=makeurl(a['href']), date=pubdate,
                                         description=description,
                                         content=''))

        soup = self.index_to_soup('http://ihned.cz/')

        # Opening (lead) article on the homepage.
        otvirak = soup.find(True, attrs={'class': ['otv']})
        if otvirak:
            a = otvirak.find('a', href=True)
            title = self.tag_to_string(a, use_alt=True).strip()
            txt = otvirak.find(True, attrs={'class': ['txt']})

            description = ''
            if txt:
                match = re.match(
                    r'<div class="txt">\s*([^<]*)\s*<a', str(txt))
                if match:
                    description = match.group(1)

            pubdate = strftime('%d. %m.')
            feed = "Hlavní"
            articles[feed].append(
                dict(title=title, url=a['href'], date=pubdate,
                     description=description,
                     content=''))
            was[title] = 1

        # Secondary front-page articles.
        otvirak2345 = soup.find(True, attrs={'class': ['otv2345']})
        if otvirak2345:
            for otv2 in otvirak2345.findAll(True, attrs={'class': ['otv2-5']}):
                a = otv2.find('a', attrs={'class': ['tit2']}, href=True)
                title = self.tag_to_string(a, use_alt=True).strip()
                description = ''
                span = otv2.find('span')
                if span:
                    match = re.match(r'<span>\s*([^<]*)\s*<a', str(span))
                    if match:
                        description = match.group(1)

                feed = "Hlavní"
                pubdate = strftime('%d. %m.')
                articles[feed].append(
                    dict(title=title, url=a['href'], date=pubdate,
                         description=description,
                         content=''))
                was[title] = 1

parse_subpage("http://komentare.ihned.cz/", "Komentáře")
parse_subpage("http://domaci.ihned.cz", "Domácí")
parse_subpage("http://ekonomika.ihned.cz", "Ekonomika")
parse_subpage("http://zahranicni.ihned.cz/", "Zahraničí")
parse_subpage("http://finweb.ihned.cz/", "Finance")
parse_subpage("http://digiweb.ihned.cz/", "DigiWeb")
parse_subpage("http://kultura.ihned.cz/", "Kultura")
parse_subpage("http://sport.ihned.cz/", "Sport")
# seradi kategorie
ans = self.sort_index_by(ans, {'Hlavni': 1, 'Domácí': 2, 'Ekonomika': 5, 'Zahraničí': 3,
'Finance': 6, 'DigiWeb': 7, 'Kultura': 8, 'Sport': 9, 'Komentáře': 4})
# vrati, ale pouze, kdyz je v kategoriich...
ans = [(key, articles[key]) for key in ans if key in articles]
return ans
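
# A quick way to test this recipe during development (assuming a working
# calibre installation) is to feed it straight to ebook-convert, e.g.:
#
#   ebook-convert ihned.recipe ihned.epub --test -vv
#
# --test restricts the download to a couple of articles per feed, which keeps
# the feedback loop short while adjusting the CSS-class based selectors above.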