From 73ed9f1268e0c2ff147f85cda5d4bf9d675e2606 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tomasz=20D=C5=82ugosz?= Date: Mon, 28 Apr 2014 00:03:03 +0200 Subject: [PATCH 1/4] quick fix for fronda - just populate article list and get the first page of article content --- recipes/fronda.recipe | 27 +++++++++------------------ 1 file changed, 9 insertions(+), 18 deletions(-) diff --git a/recipes/fronda.recipe b/recipes/fronda.recipe index 8372bb4d81..f677e38499 100644 --- a/recipes/fronda.recipe +++ b/recipes/fronda.recipe @@ -62,34 +62,25 @@ class Fronda(BasicNewsRecipe): except: continue articles[genName] = [] - for item in soup.findAll('li'): - article_h = item.find('h2') - if not article_h: - continue - article_date = self.tag_to_string(item.find('b')) - if self.date_cut(article_date): - continue - article_a = article_h.find('a') + for item in soup.findAll('article',attrs={'class':'article article-wide'}): + article_a = item.find('a') article_url = 'http://www.fronda.pl' + article_a['href'] article_title = self.tag_to_string(article_a) - articles[genName].append( { 'title' : article_title, 'url' : article_url, 'date' : article_date }) + articles[genName].append( { 'title' : article_title, 'url' : article_url }) if articles[genName]: feeds.append((genName, articles[genName])) return feeds keep_only_tags = [ - dict(name='div', attrs={'class':'yui-g'}) + dict(name='div', attrs={'class':'content content-70 phone-100'}) ] remove_tags = [ - dict(name='div', attrs={'class':['related-articles','button right','pagination','related-articles content']}), - dict(name='h3', attrs={'class':'block-header article comments'}), - dict(name='ul', attrs={'class':['comment-list','category','tag-list']}), - dict(name='p', attrs={'id':'comments-disclaimer'}), - dict(name='div', attrs={'style':'text-align: left; margin-bottom: 15px;'}), - dict(name='div', attrs={'style':'text-align: left; margin-top: 15px; margin-bottom: 30px;'}), - dict(name='div', attrs={'id':'comment-form'}), - 
dict(name='span', attrs={'class':'separator'}) + dict(name='div', attrs={'class':['clearfix','last-articles clearfix','comments clearfix','related-articles','social-buttons clearfix']}), + dict(name='span', attrs={'class':'small-info'}), + dict(name='ul', attrs={'class':'nav nav-tags clearfix'}), + dict(name='h3', attrs={'class':'section-header'}), + dict(name='article', attrs={'class':['slided-article hidden-phone', 'article article-wide hidden-phone']}) ] preprocess_regexps = [ From 8b567ce66e8cb8d1ccc75551c67aea4fedc7f9a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tomasz=20D=C5=82ugosz?= Date: Sun, 4 May 2014 14:47:55 +0200 Subject: [PATCH 2/4] drop too old articles --- recipes/fronda.recipe | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/recipes/fronda.recipe b/recipes/fronda.recipe index f677e38499..43d566bbc4 100644 --- a/recipes/fronda.recipe +++ b/recipes/fronda.recipe @@ -1,7 +1,7 @@ #!/usr/bin/env python __license__ = 'GPL v3' -__copyright__ = u'2010-2013, Tomasz Dlugosz ' +__copyright__ = u'2010-2014, Tomasz Dlugosz ' ''' fronda.pl ''' @@ -71,6 +71,14 @@ class Fronda(BasicNewsRecipe): feeds.append((genName, articles[genName])) return feeds + def preprocess_html(self, soup): + r = soup.find('small') + timestamp = str(r.contents)[3:].split(',')[0] + parts = timestamp.split('.') + art_date = date(int(parts[2]),int(parts[1]),int(parts[0])) + if self.earliest_date < art_date : + return soup + keep_only_tags = [ dict(name='div', attrs={'class':'content content-70 phone-100'}) ] From d390b4d361c2cbe5a7f064722ceebe4d9213607c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tomasz=20D=C5=82ugosz?= Date: Sun, 4 May 2014 19:32:16 +0200 Subject: [PATCH 3/4] remove fronda, as its new pages are impossible to parse The first problem (solvable, but resulting in monstrous overhead) is that articles don't have dates in the feed or on category pages. The second (not solvable for me) is that multipage articles link to the next page using relative links. 
--- recipes/fronda.recipe | 95 ------------------------------------------- 1 file changed, 95 deletions(-) delete mode 100644 recipes/fronda.recipe diff --git a/recipes/fronda.recipe b/recipes/fronda.recipe deleted file mode 100644 index 43d566bbc4..0000000000 --- a/recipes/fronda.recipe +++ /dev/null @@ -1,95 +0,0 @@ -#!/usr/bin/env python - -__license__ = 'GPL v3' -__copyright__ = u'2010-2014, Tomasz Dlugosz ' -''' -fronda.pl -''' - -import re -from calibre.web.feeds.news import BasicNewsRecipe -from datetime import timedelta, date - -class Fronda(BasicNewsRecipe): - title = u'Fronda.pl' - publisher = u'Fronda.pl' - description = u'Portal po\u015bwi\u0119cony - Informacje' - language = 'pl' - __author__ = u'Tomasz D\u0142ugosz' - oldest_article = 7 - max_articles_per_feed = 100 - use_embedded_content = False - no_stylesheets = True - - extra_css = ''' - h1 {font-size:150%} - .body {text-align:left;} - div#featured-image {font-style:italic; font-size:70%} - ''' - - earliest_date = date.today() - timedelta(days=oldest_article) - - def date_cut(self,datestr): - # eg. 
5.11.2012, 12:07 - timestamp = datestr.split(',')[0] - parts = timestamp.split('.') - art_date = date(int(parts[2]),int(parts[1]),int(parts[0])) - return True if art_date < self.earliest_date else False - - def parse_index(self): - genres = [ - ('ekonomia,4.html', 'Ekonomia'), - ('filozofia,15.html', 'Filozofia'), - ('historia,6.html', 'Historia'), - ('kosciol,8.html', 'Kościół'), - ('kultura,5.html', 'Kultura'), - ('media,10.html', 'Media'), - ('nauka,9.html', 'Nauka'), - ('polityka,11.html', 'Polityka'), - ('polska,12.html', 'Polska'), - ('prolife,3.html', 'Prolife'), - ('religia,7.html', 'Religia'), - ('rodzina,13.html', 'Rodzina'), - ('swiat,14.html', 'Świat'), - ('wydarzenie,16.html', 'Wydarzenie') - ] - feeds = [] - articles = {} - - for url, genName in genres: - try: - soup = self.index_to_soup('http://www.fronda.pl/c/'+ url) - except: - continue - articles[genName] = [] - for item in soup.findAll('article',attrs={'class':'article article-wide'}): - article_a = item.find('a') - article_url = 'http://www.fronda.pl' + article_a['href'] - article_title = self.tag_to_string(article_a) - articles[genName].append( { 'title' : article_title, 'url' : article_url }) - if articles[genName]: - feeds.append((genName, articles[genName])) - return feeds - - def preprocess_html(self, soup): - r = soup.find('small') - timestamp = str(r.contents)[3:].split(',')[0] - parts = timestamp.split('.') - art_date = date(int(parts[2]),int(parts[1]),int(parts[0])) - if self.earliest_date < art_date : - return soup - - keep_only_tags = [ - dict(name='div', attrs={'class':'content content-70 phone-100'}) - ] - - remove_tags = [ - dict(name='div', attrs={'class':['clearfix','last-articles clearfix','comments clearfix','related-articles','social-buttons clearfix']}), - dict(name='span', attrs={'class':'small-info'}), - dict(name='ul', attrs={'class':'nav nav-tags clearfix'}), - dict(name='h3', attrs={'class':'section-header'}), - dict(name='article', attrs={'class':['slided-article 
hidden-phone', 'article article-wide hidden-phone']}) - ] - - preprocess_regexps = [ - (re.compile(r'komentarzy: .*?', re.IGNORECASE | re.DOTALL | re.M ), lambda match: '')] From aa46bace019441a48d7d9ffc783005cfec77440a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tomasz=20D=C5=82ugosz?= Date: Sun, 4 May 2014 19:39:28 +0200 Subject: [PATCH 4/4] remove fornda's icon --- recipes/icons/fronda.png | Bin 646 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 recipes/icons/fronda.png diff --git a/recipes/icons/fronda.png b/recipes/icons/fronda.png deleted file mode 100644 index c332bbda497ebea167e6db230b2000e094538d63..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 646 zcmeAS@N?(olHy`uVBq!ia0vp^0wB!61|;P_|4#%`Y)RhkE)4%caKYZ?lYt_f1s;*b zK-vS0-A-oPfdtD69Mgd`SU*F|v9*U87#P=kx;TbdoSr(#HbcZwqHX^B^z?Ma34YSv z2i*>O1^haooH8=FOQs=WFg;E1RsEcV|t6Ls7m8!=+j4b&}ZkT@jdR{$6gHbLi1C2F7>( z9=U$j(z-T4twZg>yF^P7L6;>f4XA%s<=mco`x4nNr}{S@ox$8R^V6GKm-p@7{4MXV z=z(i<&Hqk#%3^*;)Y$f>lMWY)W5~y6r*8R1Dx8)nK7J<9mQA2VNvzbtw}bU0pGV&!)%d%k(q?5&ZI)f7i3tFY90ZV><00t5;^lpA3vg)e_f;l9a@fRIB8oR3OD*WMF8ZYhbBsWE5gx uX=P|(Wo)5qU}j}t@Ww)1A4NlMeoAIqC2kFKQ`du%5re0zpUXO@geCwyI|fbw