mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
remove fronda, as its new pages are impossible to parse
The first problem (solvable, but only at the cost of monstrous overhead) is that articles have dates in neither the feed nor the category pages. The second (which I am not able to solve) is that multipage articles link to the next page using relative links.
This commit is contained in:
parent
8b567ce66e
commit
d390b4d361
@ -1,95 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
__license__ = 'GPL v3'
|
||||
__copyright__ = u'2010-2014, Tomasz Dlugosz <tomek3d@gmail.com>'
|
||||
'''
|
||||
fronda.pl
|
||||
'''
|
||||
|
||||
import re
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
from datetime import timedelta, date
|
||||
|
||||
class Fronda(BasicNewsRecipe):
    """Recipe for fronda.pl.

    The index is built by scraping a fixed list of category pages; articles
    older than ``oldest_article`` days are filtered out in
    ``preprocess_html`` using the date found in the article page itself.
    """

    title = u'Fronda.pl'
    publisher = u'Fronda.pl'
    description = u'Portal po\u015bwi\u0119cony - Informacje'
    language = 'pl'
    __author__ = u'Tomasz D\u0142ugosz'
    oldest_article = 7
    max_articles_per_feed = 100
    use_embedded_content = False
    no_stylesheets = True

    extra_css = '''
        h1 {font-size:150%}
        .body {text-align:left;}
        div#featured-image {font-style:italic; font-size:70%}
    '''

    # Articles dated on or before this day are considered too old.
    # Computed once, when the class body is executed.
    earliest_date = date.today() - timedelta(days=oldest_article)

    @staticmethod
    def _parse_date(datestr):
        """Parse a timestamp such as ``5.11.2012, 12:07`` into a ``date``.

        Only the ``day.month.year`` part before the first comma is used.
        Raises IndexError or ValueError when the text is not in that format.
        """
        parts = datestr.split(',')[0].split('.')
        return date(int(parts[2]), int(parts[1]), int(parts[0]))

    def date_cut(self, datestr):
        """Return True when ``datestr`` is earlier than ``earliest_date``."""
        return self._parse_date(datestr) < self.earliest_date

    def parse_index(self):
        """Build the feed list by scraping every category page.

        Returns a list of ``(category title, articles)`` tuples, where each
        article is a dict with ``title`` and ``url`` keys. Categories that
        cannot be fetched or that yield no articles are left out.
        """
        genres = [
            ('ekonomia,4.html', 'Ekonomia'),
            ('filozofia,15.html', 'Filozofia'),
            ('historia,6.html', 'Historia'),
            ('kosciol,8.html', 'Kościół'),
            ('kultura,5.html', 'Kultura'),
            ('media,10.html', 'Media'),
            ('nauka,9.html', 'Nauka'),
            ('polityka,11.html', 'Polityka'),
            ('polska,12.html', 'Polska'),
            ('prolife,3.html', 'Prolife'),
            ('religia,7.html', 'Religia'),
            ('rodzina,13.html', 'Rodzina'),
            ('swiat,14.html', 'Świat'),
            ('wydarzenie,16.html', 'Wydarzenie')
        ]
        feeds = []

        for url, genName in genres:
            try:
                soup = self.index_to_soup('http://www.fronda.pl/c/' + url)
            except Exception:
                # Best effort: one unreachable category must not abort the
                # whole download. ``Exception`` (rather than a bare except)
                # lets KeyboardInterrupt/SystemExit propagate.
                continue

            articles = []
            for item in soup.findAll('article', attrs={'class': 'article article-wide'}):
                article_a = item.find('a')
                # Skip teasers that carry no usable link instead of crashing.
                if article_a is None or not article_a.get('href'):
                    continue
                articles.append({
                    'title': self.tag_to_string(article_a),
                    'url': 'http://www.fronda.pl' + article_a['href'],
                })

            if articles:
                feeds.append((genName, articles))
        return feeds

    def preprocess_html(self, soup):
        """Keep the article only when it is newer than ``earliest_date``.

        The date is read from the first ``<small>`` tag of the page. When no
        date can be found or parsed, the article is kept, since its age is
        unknown. Returns ``soup`` for kept articles and ``None`` otherwise.
        """
        stamp = soup.find('small')
        if stamp is None:
            return soup
        try:
            # Read the tag's text directly; the previous ``str(contents)[3:]``
            # slicing depended on the Python 2 ``[u'`` list-repr prefix.
            art_date = self._parse_date(self.tag_to_string(stamp))
        except (IndexError, ValueError):
            return soup
        if self.earliest_date < art_date:
            return soup
        # NOTE(review): returning None for old articles matches the original
        # implicit behaviour; presumably this makes calibre drop the
        # article - confirm against the BasicNewsRecipe contract.
        return None

    keep_only_tags = [
        dict(name='div', attrs={'class':'content content-70 phone-100'})
    ]

    remove_tags = [
        dict(name='div', attrs={'class':['clearfix','last-articles clearfix','comments clearfix','related-articles','social-buttons clearfix']}),
        dict(name='span', attrs={'class':'small-info'}),
        dict(name='ul', attrs={'class':'nav nav-tags clearfix'}),
        dict(name='h3', attrs={'class':'section-header'}),
        dict(name='article', attrs={'class':['slided-article hidden-phone', 'article article-wide hidden-phone']})
    ]

    # Strip the comment counter that precedes the closing </h6>.
    preprocess_regexps = [
        (re.compile(r'komentarzy: .*?</h6>', re.IGNORECASE | re.DOTALL | re.M ), lambda match: '</h6>')]
|
Loading…
x
Reference in New Issue
Block a user