Fix #4964 (Massive recipe update for serbian news feeds)

This commit is contained in:
Kovid Goyal 2010-02-20 16:09:00 -07:00
parent 331dcc787d
commit c04d4a1db7
11 changed files with 140 additions and 147 deletions

View File

@ -1,7 +1,6 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
__copyright__ = '2009-2010, Darko Miletic <darko.miletic at gmail.com>'
'''
24sata.rs
@ -9,7 +8,6 @@ __copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
class Ser24Sata(BasicNewsRecipe):
title = '24 Sata - Sr'
@ -17,22 +15,20 @@ class Ser24Sata(BasicNewsRecipe):
description = '24 sata portal vesti iz Srbije'
publisher = 'Ringier d.o.o.'
category = 'news, politics, entertainment, Serbia'
oldest_article = 7
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = True
encoding = 'utf-8'
use_embedded_content = False
language = 'sr'
lang = 'sr-Latn-RS'
extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: serif1, serif}'
language = 'sr'
extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: serif1, serif}'
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : lang
, 'pretty_print' : True
, 'language' : language
, 'linearize_tables' : True
}
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
@ -40,25 +36,6 @@ class Ser24Sata(BasicNewsRecipe):
feeds = [(u'Vesti Dana', u'http://www.24sata.rs/rss.php')]
def preprocess_html(self, soup):
soup.html['xml:lang'] = self.lang
soup.html['lang'] = self.lang
attribs = [ 'style','font','valign'
,'colspan','width','height'
,'rowspan','summary','align'
,'cellspacing','cellpadding'
,'frames','rules','border'
]
for item in soup.body.findAll(name=['table','td','tr','th','caption','thead','tfoot','tbody','colgroup','col']):
item.name = 'div'
for attrib in attribs:
if item.has_key(attrib):
del item[attrib]
mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
soup.head.insert(0,mlang)
soup.head.insert(1,mcharset)
return self.adeify_images(soup)
def print_version(self, url):

View File

@ -1,7 +1,6 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
__copyright__ = '2008-2010, Darko Miletic <darko.miletic at gmail.com>'
'''
b92.net
'''
@ -19,16 +18,15 @@ class B92(BasicNewsRecipe):
no_stylesheets = True
use_embedded_content = False
encoding = 'cp1250'
language = 'sr'
lang = 'sr-Latn-RS'
extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: serif1, serif}'
language = 'sr'
extra_css = ' @font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: serif1, serif} '
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : lang
, 'language' : language
, 'linearize_tables' : True
}
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
@ -50,20 +48,5 @@ class B92(BasicNewsRecipe):
return url + '&version=print'
def preprocess_html(self, soup):
del soup.body['onload']
for item in soup.findAll('font'):
item.name='div'
if item.has_key('size'):
del item['size']
attribs = [ 'style','font','valign'
,'colspan','width','height'
,'rowspan','summary','align'
,'cellspacing','cellpadding'
,'frames','rules','border'
]
for item in soup.body.findAll(name=['table','td','tr','th','caption','thead','tfoot','tbody','colgroup','col']):
item.name = 'div'
for attrib in attribs:
if item.has_key(attrib):
del item[attrib]
return soup
return self.adeify_images(soup)

View File

@ -1,13 +1,11 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
__copyright__ = '2009-2010, Darko Miletic <darko.miletic at gmail.com>'
'''
beta.rs
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
class Danas(BasicNewsRecipe):
title = 'BETA'
@ -19,18 +17,14 @@ class Danas(BasicNewsRecipe):
max_articles_per_feed = 100
no_stylesheets = False
use_embedded_content = True
language = 'sr'
lang = 'sr-Latn-RS'
direction = 'ltr'
extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: sans1, sans-serif}'
language = 'sr'
extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: sans1, sans-serif} img{margin-bottom: 0.8em} '
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : lang
, 'pretty_print' : True
, 'language' : language
}
@ -43,9 +37,4 @@ class Danas(BasicNewsRecipe):
]
def preprocess_html(self, soup):
soup.html['lang'] = self.lang
mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
mcharset = Tag(soup,'meta',[("http-equiv","Content-Type"),("content","text/html; charset=utf-8")])
soup.head.insert(0,mlang)
soup.head.insert(1,mcharset)
return self.adeify_images(soup)

View File

@ -14,14 +14,13 @@ class Blic(BasicNewsRecipe):
description = 'Blic.rs online verzija najtiraznije novine u Srbiji donosi najnovije vesti iz Srbije i sveta, komentare, politicke analize, poslovne i ekonomske vesti, vesti iz regiona, intervjue, informacije iz kulture, reportaze, pokriva sve sportske dogadjaje, detaljan tv program, nagradne igre, zabavu, fenomenalni Blic strip, dnevni horoskop, arhivu svih dogadjaja'
publisher = 'RINGIER d.o.o.'
category = 'news, politics, Serbia'
delay = 1
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = True
use_embedded_content = False
masthead_url = 'http://www.blic.rs/resources/images/header/header_back.png'
language = 'sr'
extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: sans1, sans-serif} '
extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{font-family: Georgia, serif1, serif} .article_description{font-family: Arial, sans1, sans-serif} .img_full{float: none} img{margin-bottom: 0.8em} '
conversion_options = {
'comment' : description
@ -31,13 +30,15 @@ class Blic(BasicNewsRecipe):
}
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
remove_tags_before = dict(name='div', attrs={'id':'article_info'})
remove_tags = [dict(name=['object','link'])]
remove_attributes = ['width','height']
feeds = [(u'Danasnje Vesti', u'http://www.blic.rs/rss/danasnje-vesti')]
remove_tags = [dict(name=['object','link'])]
def print_version(self, url):
return url + '/print'
def preprocess_html(self, soup):
return self.adeify_images(soup)

View File

@ -0,0 +1,36 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
chetnixploitation.blogspot.com
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
class Chetnixploitation(BasicNewsRecipe):
title = 'Chetnixploitation'
__author__ = 'Darko Miletic'
description = 'Filmski blog'
oldest_article = 7
max_articles_per_feed = 100
language = 'sr'
encoding = 'utf-8'
no_stylesheets = True
use_embedded_content = True
extra_css = ' @font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{font-family: "Trebuchet MS",Trebuchet,Verdana,sans1,sans-serif} .article_description{font-family: sans1, sans-serif} img{margin-bottom: 0.8em; border: 1px solid #333333; padding: 4px } '
conversion_options = {
'comment' : description
, 'tags' : 'film, blog, cetnici, srbija, ex-yu'
, 'publisher': 'Son of Man'
, 'language' : language
}
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
feeds = [(u'Posts', u'http://chetnixploitation.blogspot.com/feeds/posts/default')]
def preprocess_html(self, soup):
return self.adeify_images(soup)

View File

@ -20,7 +20,7 @@ class Danas(BasicNewsRecipe):
encoding = 'utf-8'
masthead_url = 'http://www.danas.rs/images/basic/danas.gif'
language = 'sr'
extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} .article_description,body,.lokacija{font-family: Tahoma,Arial,Helvetica,sans1,sans-serif} .nadNaslov,h1,.preamble{font-family: Georgia,"Times New Roman",Times,serif1,serif} .antrfileText{border-left: 2px solid #999999; color:#666666; margin-left: 0.8em; padding-left: 1.2em; margin-bottom: 0; margin-top: 0} h2,.datum,.lokacija,.autor{font-size: small} .antrfileNaslov{border-left: 2px solid #999999; color:#666666; margin-left: 0.8em; padding-left: 1.2em; font-weight:bold; margin-bottom: 0; margin-top: 0} img{margin-bottom: 0.8em} '
extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} .article_description,body,.lokacija{font-family: Tahoma,Arial,Helvetica,sans1,sans-serif} .nadNaslov,h1,.preamble{font-family: Georgia,"Times New Roman",Times,serif1,serif} .antrfileText{border-left: 2px solid #999999; margin-left: 0.8em; padding-left: 1.2em; margin-bottom: 0; margin-top: 0} h2,.datum,.lokacija,.autor{font-size: small} .antrfileNaslov{border-left: 2px solid #999999; margin-left: 0.8em; padding-left: 1.2em; font-weight:bold; margin-bottom: 0; margin-top: 0} img{margin-bottom: 0.8em} '
conversion_options = {
'comment' : description
@ -38,7 +38,7 @@ class Danas(BasicNewsRecipe):
,dict(name=['object','link','iframe'])
]
feeds = [
feeds = [
(u'Politika' , u'http://www.danas.rs/rss/rss.asp?column_id=27')
,(u'Hronika' , u'http://www.danas.rs/rss/rss.asp?column_id=2' )
,(u'Drustvo' , u'http://www.danas.rs/rss/rss.asp?column_id=24')
@ -60,4 +60,4 @@ class Danas(BasicNewsRecipe):
def print_version(self, url):
return url + '&action=print'

View File

@ -1,7 +1,5 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
__copyright__ = '2009-2010, Darko Miletic <darko.miletic at gmail.com>'
'''
e-novine.com
@ -9,7 +7,6 @@ e-novine.com
import re
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
class E_novine(BasicNewsRecipe):
title = 'E-Novine'
@ -20,40 +17,38 @@ class E_novine(BasicNewsRecipe):
oldest_article = 2
max_articles_per_feed = 100
no_stylesheets = True
encoding = 'cp1250'
encoding = 'utf-8'
use_embedded_content = False
language = 'sr'
lang = 'sr'
extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{text-align: justify; font-family: serif1, serif} .article_description{font-family: sans1, sans-serif}'
language = 'sr'
masthead_url = 'http://www.e-novine.com/themes/e_novine/img/logo.gif'
extra_css = ' @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} .article_description,body{font-family: Arial,Helvetica,sans1,sans-serif} img{float: none; margin-bottom: 0.8em} '
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : lang
, 'pretty_print' : True
, 'language' : language
}
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
keep_only_tags = [dict(name='div', attrs={'id':['css_47_0_2844H']})]
keep_only_tags = [
dict(name='div', attrs={'class':'article_head'})
,dict(name='div', attrs={'id':'article_body'})
]
remove_tags = [dict(name=['object','link','embed','iframe'])]
remove_tags = [
dict(name=['object','link','embed','iframe'])
,dict(attrs={'id':'box_article_tools'})
]
remove_attributes = ['height','width','lang']
feeds = [(u'Sve vesti', u'http://www.e-novine.com/rss/e-novine.xml' )]
feeds = [(u'Sve vesti', u'http://www.e-novine.com/feed/index.1.rss' )]
def preprocess_html(self, soup):
soup.html['xml:lang'] = self.lang
soup.html['lang'] = self.lang
mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
soup.head.insert(0,mlang)
for item in soup.findAll(style=True):
del item['style']
ftag = soup.find('div', attrs={'id':'css_47_0_2844H'})
if ftag:
it = ftag.div
it.extract()
ftag.div.extract()
ftag.insert(0,it)
return soup
return self.adeify_images(soup)
def print_version(self, url):
return url + '?print'

View File

@ -1,7 +1,6 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
__copyright__ = '2009-2010, Darko Miletic <darko.miletic at gmail.com>'
'''
glassrpske.com
@ -9,7 +8,6 @@ glassrpske.com
import re
from calibre.web.feeds.recipes import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import Tag
class GlasSrpske(BasicNewsRecipe):
title = 'Glas Srpske'
@ -22,20 +20,16 @@ class GlasSrpske(BasicNewsRecipe):
no_stylesheets = True
encoding = 'utf-8'
use_embedded_content = False
cover_url = 'http://www.glassrpske.com/var/slike/glassrpske-logo.png'
lang = 'sr-BA'
language = 'sr'
masthead_url = 'http://www.glassrpske.com/var/slike/glassrpske-logo.png'
language = 'sr'
INDEX = 'http://www.glassrpske.com'
extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: serif1, serif}'
extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: serif1, serif} img{margin-bottom: 0.8em} '
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : lang
, 'pretty_print' : True
, 'language' : language
}
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
@ -63,11 +57,7 @@ class GlasSrpske(BasicNewsRecipe):
]
def preprocess_html(self, soup):
soup.html['xml:lang'] = self.lang
soup.html['lang'] = self.lang
mlang = Tag(soup,'meta',[("http-equiv","Content-Language"),("content",self.lang)])
soup.head.insert(0,mlang)
return soup
return self.adeify_images(soup)
def parse_index(self):
totalfeeds = []

View File

@ -1,7 +1,6 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>'
__copyright__ = '2009-2010, Darko Miletic <darko.miletic at gmail.com>'
'''
www.glas-javnosti.rs
'''
@ -18,18 +17,14 @@ class GlasJavnosti(BasicNewsRecipe):
max_articles_per_feed = 100
no_stylesheets = False
use_embedded_content = False
language = 'sr'
lang = 'sr-Latn-RS'
direction = 'ltr'
extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: sans1, sans-serif}'
language = 'sr'
extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{font-family: serif1, serif} .article_description{font-family: sans1, sans-serif} img{margin-bottom: 0.8em} '
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : lang
, 'pretty_print' : True
, 'language' : language
}

View File

@ -1,7 +1,5 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = '2008-2009, Darko Miletic <darko.miletic at gmail.com>'
__copyright__ = '2008-2010, Darko Miletic <darko.miletic at gmail.com>'
'''
nspm.rs
'''
@ -21,17 +19,16 @@ class Nspm(BasicNewsRecipe):
use_embedded_content = False
INDEX = 'http://www.nspm.rs/?alphabet=l'
encoding = 'utf-8'
language = 'sr'
lang = 'sr-Latn-RS'
language = 'sr'
masthead_url = 'http://www.nspm.rs/templates/jsn_epic_pro/images/logol.jpg'
extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{text-align: justify; font-family: serif1, serif} .article_description{font-family: sans1, sans-serif}'
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher' : publisher
, 'language' : lang
, 'pretty_print' : True
, 'language' : language
, 'linearize_tables' : True
}
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
@ -39,6 +36,8 @@ class Nspm(BasicNewsRecipe):
dict(name=['link','object','embed'])
,dict(name='td', attrs={'class':'buttonheading'})
]
remove_tags_after = dict(attrs={'class':'article_separator'})
remove_attributes = ['width','height']
def get_browser(self):
br = BasicNewsRecipe.get_browser()
@ -51,17 +50,6 @@ class Nspm(BasicNewsRecipe):
return url.replace('.html','/stampa.html')
def preprocess_html(self, soup):
soup.html['xml:lang'] = self.lang
soup.html['lang'] = self.lang
attribs = [ 'style','font','valign'
,'colspan','width','height'
,'rowspan','summary','align'
,'cellspacing','cellpadding'
,'frames','rules','border'
]
for item in soup.body.findAll(name=['table','td','tr','th','caption','thead','tfoot','tbody','colgroup','col']):
item.name = 'div'
for attrib in attribs:
if item.has_key(attrib):
del item[attrib]
for item in soup.body.findAll(style=True):
del item['style']
return self.adeify_images(soup)

View File

@ -0,0 +1,39 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
cultofghoul.blogspot.com
'''
import re
from calibre.web.feeds.news import BasicNewsRecipe
class TheCultOfGhoul(BasicNewsRecipe):
title = 'The Cult of Ghoul'
__author__ = 'Darko Miletic'
description = 'Filmski blog'
oldest_article = 7
max_articles_per_feed = 100
language = 'sr'
encoding = 'utf-8'
no_stylesheets = True
use_embedded_content = True
extra_css = ' @font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{font-family: "Trebuchet MS",Trebuchet,Verdana,sans1,sans-serif} .article_description{font-family: sans1, sans-serif} img{margin-bottom: 0.8em; border: 1px solid #333333; padding: 4px } '
conversion_options = {
'comment' : description
, 'tags' : 'film, blog, srbija, strava, uzas'
, 'publisher': 'Dejan Ognjanovic'
, 'language' : language
}
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
feeds = [(u'Posts', u'http://cultofghoul.blogspot.com/feeds/posts/default')]
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
return self.adeify_images(soup)