import re
import time
from calibre import strftime
from calibre.web.feeds.recipes import BasicNewsRecipe


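# Recipe for iHNed.cz, the news portal of the Czech daily Hospodářské noviny.
# parse_index() below scrapes the homepage plus eight section subdomains
# (Komentáře, Domácí, Ekonomika, Zahraničí, Finance, DigiWeb, Kultura, Sport),
# builds one calibre section per page and de-duplicates articles by title.

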
class IHNed(BasicNewsRecipe):

    stahnout_vsechny = True
    # True   = download everything linked from the homepage
    # False  = download only today's articles (from the day the script is run)

    title = 'iHNed'
    __author__ = 'Karel Bílek'
    language = 'cs'
    description = 'Zprávy z iHNed.cz'
    timefmt = ' [%a, %d %b, %Y]'
    needs_subscription = False
    remove_tags = [dict(attrs={'class': ['borderbottom', 'web', 'foot', 'reklama', 'd-elm d-rellinks', 'd-elm']}),
                   dict(style=['text-align: center;']),
                   dict(id=['r-bfull']),
                   dict(name=['script', 'noscript', 'style'])]
    encoding = 'windows-1250'
    no_stylesheets = True
    # keep only the markup between these two markers
    remove_tags_before = dict(attrs={'class': 'd-nadtit'})
    remove_tags_after = dict(attrs={'class': 'like'})

    conversion_options = {
        'linearize_tables': True,  # flatten table-based layout
    }

    def preprocess_html(self, soup):

        def makeurl(wat):
            return "http://ihned.cz" + wat

        # turn linked <h1> headlines into plain text
        for h1 in soup.findAll('h1'):
            a = h1.find('a')
            if a:
                string = a.string
                if string:
                    a.replaceWith(string)
        # make relative links absolute
        for a in soup.findAll('a', href=True):
            cil = str(a['href'])
            if cil.startswith("/") or cil.startswith("index"):
                a['href'] = makeurl(cil)
        return soup

    def parse_index(self):
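        # calibre expects parse_index() to return a list of
        # (section title, list of article dicts) tuples, each dict carrying
        # 'title', 'url', 'date', 'description' and 'content' keys; that is
        # the structure assembled in `articles` and `ans` below.
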
        def makeurl(wat):
            if wat.startswith("/") or wat.startswith("index"):
                return "http://ihned.cz" + wat
            else:
                return wat

        articles = {}  # the result: section name -> list of article dicts
        ans = []  # all sections

        articles["Hlavní"] = []
        ans.append("Hlavní")

        was = {}  # titles already seen, to avoid duplicates

        def parse_subpage(url, name):
            articles[name] = []
            ans.append(name)

            soup = self.index_to_soup(url)
            otvirak = soup.find(True, attrs={'class': ['otv']})
            if otvirak:

                # the code is copy-pasted here because I don't know Python.
                # Simple as that.
                a = otvirak.find('a', href=True)
                title = self.tag_to_string(a, use_alt=True).strip()
                txt = otvirak.find(True, attrs={'class': ['txt']})
                description = ''
                if txt:
                    match = re.match(
                        r'<div class="txt">\s*([^<]*)\s*<a', str(txt))
                    if match:
                        description = match.group(1)

                pubdate = strftime('%d. %m.')
                if title not in was:
                    articles[name].append(
                        dict(title=title, url=makeurl(a['href']), date=pubdate,
                             description=description,
                             content=''))

            otv234 = soup.find(True, attrs={'class': ['otv234', 'col2a']})
            if otv234:
                for ow in otv234.findAll(True, attrs={'class': ['ow']}):
                    a = ow.find('a', href=True)
                    title = self.tag_to_string(a, use_alt=True).strip()
                    description = ''
                    prx = ow.find(True, attrs={'class': ['prx']})
                    if prx:
                        description = str(prx.string)
                    nfo = ow.find(True, attrs={'class': ['nfo']})
                    pubdate = ''
                    if nfo:
                        dtime = time.localtime()
                        day = dtime[2]
                        month = dtime[1]

                        pubdate = strftime('%d. %m.')

                        # the article's day and month, e.g. "31.10."
                        match = re.search(r'([0-9]*)\.([0-9]*)\.', str(nfo))

                        if self.stahnout_vsechny or (match and int(day) == int(match.group(1)) and int(month) == int(match.group(2))):
                            if title not in was:
                                articles[name].append(
                                    dict(title=title, url=makeurl(a['href']), date=pubdate,
                                         description=description,
                                         content=''))

        soup = self.index_to_soup('http://ihned.cz/')
        otvirak = soup.find(True, attrs={'class': ['otv']})
        if otvirak:
            a = otvirak.find('a', href=True)
            title = self.tag_to_string(a, use_alt=True).strip()
            txt = otvirak.find(True, attrs={'class': ['txt']})
            description = ''
            if txt:
                match = re.match(
                    r'<div class="txt">\s*([^<]*)\s*<a', str(txt))
                if match:
                    description = match.group(1)

            pubdate = strftime('%d. %m.')
            feed = "Hlavní"
            articles[feed].append(
                dict(title=title, url=makeurl(a['href']), date=pubdate,
                     description=description,
                     content=''))
            was[title] = 1

        otvirak2345 = soup.find(True, attrs={'class': ['otv2345']})
        if otvirak2345:
            for otv2 in otvirak2345.findAll(True, attrs={'class': ['otv2-5']}):
                a = otv2.find('a', attrs={'class': ['tit2']}, href=True)
                title = self.tag_to_string(a, use_alt=True).strip()
                description = ''
                span = otv2.find('span')
                if span:
                    match = re.match(r'<span>\s*([^<]*)\s*<a', str(span))
                    if match:
                        description = match.group(1)
                feed = "Hlavní"
                pubdate = strftime('%d. %m.')
                articles[feed].append(
                    dict(title=title, url=makeurl(a['href']), date=pubdate,
                         description=description,
                         content=''))
                was[title] = 1

        parse_subpage("http://komentare.ihned.cz/", "Komentáře")
        parse_subpage("http://domaci.ihned.cz", "Domácí")
        parse_subpage("http://ekonomika.ihned.cz", "Ekonomika")
        parse_subpage("http://zahranicni.ihned.cz/", "Zahraničí")
        parse_subpage("http://finweb.ihned.cz/", "Finance")
        parse_subpage("http://digiweb.ihned.cz/", "DigiWeb")
        parse_subpage("http://kultura.ihned.cz/", "Kultura")
        parse_subpage("http://sport.ihned.cz/", "Sport")

        # put the sections into a fixed order
        ans = self.sort_index_by(ans, {'Hlavní': 1, 'Domácí': 2, 'Ekonomika': 5, 'Zahraničí': 3,
                                       'Finance': 6, 'DigiWeb': 7, 'Kultura': 8, 'Sport': 9, 'Komentáře': 4})

        # return only the sections that are actually present in articles
        ans = [(key, articles[key]) for key in ans if key in articles]
        return ans
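
# A quick way to test this recipe during development, assuming it is saved
# as ihned.recipe (the file and output names here are just examples), is
# calibre's ebook-convert tool:
#
#   ebook-convert ihned.recipe ihned.epub --test
#
# where --test restricts the download to a couple of articles per section.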