remove fronda, as its new pages are impossible to parse

The first problem (solvable, but resulting in monstrous overhead) is that articles don't have dates in the feed or category pages.
The second (not solvable for me) is that multipage articles link to the next page using relative links.
This commit is contained in:
Tomasz Długosz 2014-05-04 19:32:16 +02:00
parent 8b567ce66e
commit d390b4d361

View File

@ -1,95 +0,0 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = u'2010-2014, Tomasz Dlugosz <tomek3d@gmail.com>'
'''
fronda.pl
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
from datetime import timedelta, date
class Fronda(BasicNewsRecipe):
    """Calibre news recipe for fronda.pl.

    Scrapes category listing pages (there is no usable RSS date info) and
    filters out articles older than ``oldest_article`` days.
    """

    title = u'Fronda.pl'
    publisher = u'Fronda.pl'
    description = u'Portal po\u015bwi\u0119cony - Informacje'
    language = 'pl'
    __author__ = u'Tomasz D\u0142ugosz'
    oldest_article = 7
    max_articles_per_feed = 100
    use_embedded_content = False
    no_stylesheets = True

    extra_css = '''
        h1 {font-size:150%}
        .body {text-align:left;}
        div#featured-image {font-style:italic; font-size:70%}
    '''

    # Oldest acceptable publication date, fixed at class-definition time.
    earliest_date = date.today() - timedelta(days=oldest_article)

    def date_cut(self, datestr):
        """Return True if *datestr* is older than the allowed window.

        *datestr* is the site's timestamp format, e.g. ``5.11.2012, 12:07``
        (day.month.year, time).  Raises ValueError/IndexError on malformed
        input, like the original implementation.
        """
        timestamp = datestr.split(',')[0]
        parts = timestamp.split('.')
        art_date = date(int(parts[2]), int(parts[1]), int(parts[0]))
        # Return the comparison directly instead of 'True if ... else False'.
        return art_date < self.earliest_date

    def parse_index(self):
        """Build the feed list from the site's category pages.

        Returns a list of ``(feed_name, [article_dict, ...])`` tuples as
        calibre expects from parse_index().
        """
        genres = [
            ('ekonomia,4.html', 'Ekonomia'),
            ('filozofia,15.html', 'Filozofia'),
            ('historia,6.html', 'Historia'),
            ('kosciol,8.html', 'Kościół'),
            ('kultura,5.html', 'Kultura'),
            ('media,10.html', 'Media'),
            ('nauka,9.html', 'Nauka'),
            ('polityka,11.html', 'Polityka'),
            ('polska,12.html', 'Polska'),
            ('prolife,3.html', 'Prolife'),
            ('religia,7.html', 'Religia'),
            ('rodzina,13.html', 'Rodzina'),
            ('swiat,14.html', 'Świat'),
            ('wydarzenie,16.html', 'Wydarzenie')
        ]
        feeds = []
        articles = {}
        for url, genName in genres:
            try:
                soup = self.index_to_soup('http://www.fronda.pl/c/' + url)
            except Exception:
                # Best-effort: a category page that fails to download is
                # skipped rather than aborting the whole fetch.  (Was a bare
                # 'except:', which also swallowed KeyboardInterrupt.)
                continue
            articles[genName] = []
            for item in soup.findAll('article', attrs={'class': 'article article-wide'}):
                article_a = item.find('a')
                article_url = 'http://www.fronda.pl' + article_a['href']
                article_title = self.tag_to_string(article_a)
                articles[genName].append({'title': article_title, 'url': article_url})
            if articles[genName]:
                feeds.append((genName, articles[genName]))
        return feeds

    def preprocess_html(self, soup):
        """Drop articles older than the allowed window.

        The article date is only available on the article page itself (see
        the category pages' lack of dates), inside the first <small> tag.
        """
        r = soup.find('small')
        # str(r.contents) looks like "[u' 5.11.2012, 12:07']"; [3:] strips
        # the leading "[u'" so the date can be parsed out.
        timestamp = str(r.contents)[3:].split(',')[0]
        parts = timestamp.split('.')
        art_date = date(int(parts[2]), int(parts[1]), int(parts[0]))
        if self.earliest_date < art_date:
            return soup
        # NOTE: strict comparison — an article dated exactly earliest_date is
        # dropped here but kept by date_cut(); preserved as-is.  Falls through
        # to an implicit None for stale articles, presumably so calibre
        # discards them — TODO confirm against calibre's recipe API.

    keep_only_tags = [
        dict(name='div', attrs={'class': 'content content-70 phone-100'})
    ]

    remove_tags = [
        dict(name='div', attrs={'class': ['clearfix', 'last-articles clearfix', 'comments clearfix', 'related-articles', 'social-buttons clearfix']}),
        dict(name='span', attrs={'class': 'small-info'}),
        dict(name='ul', attrs={'class': 'nav nav-tags clearfix'}),
        dict(name='h3', attrs={'class': 'section-header'}),
        dict(name='article', attrs={'class': ['slided-article hidden-phone', 'article article-wide hidden-phone']})
    ]

    # Strip the "komentarzy: ..." (comment-count) tail from headers.
    preprocess_regexps = [
        (re.compile(r'komentarzy: .*?</h6>', re.IGNORECASE | re.DOTALL | re.M), lambda match: '</h6>')]