KG changes

This commit is contained in:
GRiker 2010-03-16 04:08:33 -07:00
commit f3e2b9f726
37 changed files with 5343 additions and 4080 deletions

Binary file not shown.

After

Width:  |  Height:  |  Size: 733 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 401 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 475 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 626 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 626 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 808 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.4 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.8 KiB

View File

@ -2,18 +2,22 @@
__license__ = 'GPL v3' __license__ = 'GPL v3'
__author__ = 'Lorenzo Vigentini, based on Darko Miletic' __author__ = 'Lorenzo Vigentini, based on Darko Miletic'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>, Lorenzo Vigentini <l.vigentini at gmail.com>' __copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>, Lorenzo Vigentini <l.vigentini at gmail.com>'
__version__ = 'v1.01' __version__ = 'v1.02'
__date__ = '10, January 2010' __date__ = '14, March 2010'
__description__ = 'Italian daily newspaper (english version)' __description__ = 'Italian daily newspaper (english version)'
# NOTE: the feed URLs are broken on the main site because the permalink structure was changed erroneously, i.e.:
# actual link in feed http://www.corriere.it/english/10_marzo_11/legitimate_impediment_approved_de9ba480-2cfd-11df-a00c-00144f02aabe.shtml
# this needs to be changed to
# real feed URL http://www.corriere.it/International/english/articoli/2010/03/11/legitimate_impediment_approved.shtml
''' '''
http://www.corriere.it/ http://www.corriere.it/
''' '''
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class ilCorriere(BasicNewsRecipe): class ilCorriereEn(BasicNewsRecipe):
__author__ = 'Lorenzo Vigentini, based on Darko Miletic' author = 'Lorenzo Vigentini, based on Darko Miletic'
description = 'Italian daily newspaper (english version)' description = 'Italian daily newspaper (english version)'
cover_url = 'http://images.corriereobjects.it/images/static/common/logo_home.gif?v=200709121520' cover_url = 'http://images.corriereobjects.it/images/static/common/logo_home.gif?v=200709121520'
title = u'Il Corriere della sera (english) ' title = u'Il Corriere della sera (english) '
@ -23,7 +27,7 @@ class ilCorriere(BasicNewsRecipe):
language = 'en' language = 'en'
timefmt = '[%a, %d %b, %Y]' timefmt = '[%a, %d %b, %Y]'
oldest_article = 1 oldest_article = 5
max_articles_per_feed = 100 max_articles_per_feed = 100
use_embedded_content = False use_embedded_content = False
recursion = 10 recursion = 10
@ -31,14 +35,30 @@ class ilCorriere(BasicNewsRecipe):
remove_javascript = True remove_javascript = True
no_stylesheets = True no_stylesheets = True
def get_article_url(self, article):
    """Rewrite a feed permalink into the article URL that actually resolves.

    The feed publishes links shaped like
    ``http://host/english/10_marzo_11/some_title_de9ba480-2cfd-....shtml``
    whereas the working address is
    ``http://host/International/english/articoli/2010/03/11/some_title.shtml``.

    article: feed entry; only its ``link`` value is read, via ``.get``.

    Returns the rewritten URL.  When the link is missing or does not have
    the expected shape (too few path segments, malformed date segment,
    unknown month name) the feed's own link is returned unchanged instead
    of raising.
    """
    # Italian month names in calendar order; all twelve are mapped so that
    # December links are translated as well.
    month_names = ('gennaio', 'febbraio', 'marzo', 'aprile', 'maggio',
                   'giugno', 'luglio', 'agosto', 'settembre', 'ottobre',
                   'novembre', 'dicembre')
    month_numbers = dict(
        (name, '%02d' % (index + 1)) for index, name in enumerate(month_names)
    )

    link = article.get('link')
    if not link:
        return link

    # 'http://host/english/<date>/<file>' splits into
    # ['http:', '', host, 'english', <date>, <file>].
    segments = link.split('/')
    if len(segments) < 6:
        return link

    # The date segment is '<yy>_<italian month name>_<dd>'.
    date_parts = segments[4].split('_')
    if len(date_parts) < 3 or date_parts[1] not in month_numbers:
        return link
    date_path = ('20' + date_parts[0] + '/' + month_numbers[date_parts[1]]
                 + '/' + date_parts[2] + '/')

    # Clean the article title: keep the text before the first '-' and drop
    # its last nine characters ('_de9ba480' in the example above).
    slug = segments[5].split('-')[0][:-9]

    base = '/'.join(segments[:3]) + '/' + 'International/english/articoli/'
    return base + date_path + slug + '.shtml'
keep_only_tags = [dict(name='div', attrs={'class':['news-dettaglio article','article']})] keep_only_tags = [dict(name='div', attrs={'class':['news-dettaglio article','article']})]

View File

@ -15,42 +15,42 @@ class DziennikInternautowRecipe(BasicNewsRecipe):
language = 'pl' language = 'pl'
title = u'Dziennik Internautow' title = u'Dziennik Internautow'
publisher = u'Dziennik Internaut\xc3\xb3w Sp. z o.o.' publisher = u'Dziennik Internaut\u00f3w Sp. z o.o.'
description =u'Internet w \xc5\xbcyciu i biznesie. Porady, wywiady, interwencje, bezpiecze\xc5\x84stwo w Sieci, technologia.' description = u'Internet w \u017cyciu i biznesie. Porady, wywiady, interwencje, bezpiecze\u0144stwo w Sieci, technologia.'
max_articles_per_feed = 100 max_articles_per_feed = 100
oldest_article = 7 oldest_article = 7
cover_url = 'http://di.com.pl/pic/logo_di_norm.gif' cover_url = 'http://di.com.pl/pic/logo_di_norm.gif'
no_stylesheets = True no_stylesheets = True
remove_javascript = True remove_javascript = True
encoding = 'utf-8' encoding = 'utf-8'
extra_css = ''' extra_css = '''
.fotodesc{font-size: 75%;} .fotodesc{font-size: 75%;}
.pub_data{font-size: 75%;} .pub_data{font-size: 75%;}
.fotonews{clear: both; padding-top: 10px; padding-bottom: 10px;} .fotonews{clear: both; padding-top: 10px; padding-bottom: 10px;}
#pub_foto{font-size: 75%; float: left; padding-right: 10px;} #pub_foto{font-size: 75%; float: left; padding-right: 10px;}
''' '''
feeds = [ feeds = [
(u'Dziennik Internautów', u'http://feeds.feedburner.com/glowny-di') (u'Dziennik Internaut\u00f3w', u'http://feeds.feedburner.com/glowny-di')
] ]
keep_only_tags = [ keep_only_tags = [
dict(name = 'div', attrs = {'id' : 'pub_head'}), dict(name = 'div', attrs = {'id' : 'pub_head'}),
dict(name = 'div', attrs = {'id' : 'pub_content'}) dict(name = 'div', attrs = {'id' : 'pub_content'})
] ]
remove_tags = [ remove_tags = [
dict(name = 'div', attrs = {'class' : 'poradniki_context'}), dict(name = 'div', attrs = {'class' : 'poradniki_context'}),
dict(name = 'div', attrs = {'class' : 'uniBox'}), dict(name = 'div', attrs = {'class' : 'uniBox'}),
dict(name = 'object', attrs = {}), dict(name = 'object', attrs = {}),
dict(name = 'h3', attrs = {}) dict(name = 'h3', attrs = {})
] ]
preprocess_regexps = [ preprocess_regexps = [
(re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
[ [
(r', <a href="http://di.com.pl/komentarze,.*?</div>', lambda match: '</div>'), (r', <a href="http://di.com.pl/komentarze,.*?</div>', lambda match: '</div>'),
(r'<div class="fotonews".*?">', lambda match: '<div class="fotonews">'), (r'<div class="fotonews".*?">', lambda match: '<div class="fotonews">'),

View File

@ -1,6 +1,6 @@
#!/usr/bin/env python #!/usr/bin/env python
__license__ = 'GPL v3' __license__ = 'GPL v3'
__author__ = 'Mori' __author__ = 'Mori'
__version__ = 'v. 0.1' __version__ = 'v. 0.1'
''' '''
@ -11,39 +11,39 @@ from calibre.web.feeds.news import BasicNewsRecipe
import re import re
class BlogeClictoRecipe(BasicNewsRecipe): class BlogeClictoRecipe(BasicNewsRecipe):
__author__ = 'Mori' __author__ = 'Mori'
language = 'pl' language = 'pl'
title = u'Blog eClicto' title = u'Blog eClicto'
publisher = u'Blog eClicto' publisher = u'Blog eClicto'
description = u'Blog o e-papierze i e-bookach' description = u'Blog o e-papierze i e-bookach'
max_articles_per_feed = 100 max_articles_per_feed = 100
cover_url = 'http://blog.eclicto.pl/wordpress/wp-content/themes/blog_eclicto/g/logo.gif' cover_url = 'http://blog.eclicto.pl/wordpress/wp-content/themes/blog_eclicto/g/logo.gif'
no_stylesheets = True no_stylesheets = True
remove_javascript = True remove_javascript = True
encoding = 'utf-8' encoding = 'utf-8'
extra_css = ''' extra_css = '''
img{float: left; padding-right: 10px; padding-bottom: 5px;} img{float: left; padding-right: 10px; padding-bottom: 5px;}
''' '''
feeds = [ feeds = [
(u'Blog eClicto', u'http://blog.eclicto.pl/feed/') (u'Blog eClicto', u'http://blog.eclicto.pl/feed/')
] ]
remove_tags = [ remove_tags = [
dict(name = 'span', attrs = {'id' : 'tags'}) dict(name = 'span', attrs = {'id' : 'tags'})
] ]
remove_tags_after = [ remove_tags_after = [
dict(name = 'div', attrs = {'class' : 'post'}) dict(name = 'div', attrs = {'class' : 'post'})
] ]
preprocess_regexps = [ preprocess_regexps = [
(re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
[ [
(r'\s*</', lambda match: '</'), (r'\s*</', lambda match: '</'),
] ]
] ]

View File

@ -11,7 +11,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
class eksiazki(BasicNewsRecipe): class eksiazki(BasicNewsRecipe):
title = u'eKsiazki.org' title = u'eKsiazki.org'
desciption = u'Twoje centrum wiedzy o ePapierze i eBookach' description = u'Twoje centrum wiedzy o ePapierze i eBookach'
language = 'pl' language = 'pl'
__author__ = u'Tomasz D\u0142ugosz' __author__ = u'Tomasz D\u0142ugosz'
no_stylesheets = True no_stylesheets = True

View File

@ -0,0 +1,34 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = u'2010, Tomasz Dlugosz <tomek3d@gmail.com>'
'''
fronda.pl
'''
from calibre.web.feeds.news import BasicNewsRecipe
import re
class Fronda(BasicNewsRecipe):
    """Calibre news recipe for the fronda.pl news feed."""

    title = u'Fronda.pl'
    publisher = u'Fronda.pl'
    description = u'Portal po\u015bwi\u0119cony - Infformacje'
    language = 'pl'
    __author__ = u'Tomasz D\u0142ugosz'
    oldest_article = 7
    max_articles_per_feed = 100
    use_embedded_content = False

    # (feed title, feed URL) pairs.
    feeds = [(u'Infformacje', u'http://fronda.pl/news/feed')]

    # Tag selectors for the headline, the "about" list and the content div.
    keep_only_tags = [
        dict(name='h1', attrs={'class': 'big'}),
        dict(name='ul', attrs={'class': 'about clear'}),
        dict(name='div', attrs={'class': 'content'}),
    ]

    # (compiled pattern, replacement callable) pairs.  Every pattern is
    # compiled case-insensitively and with DOTALL, so '.' also matches
    # newlines.
    preprocess_regexps = [
        (re.compile(pattern, re.IGNORECASE | re.DOTALL), replacement)
        for pattern, replacement in
        [
            (r'<a href="#" class="print">Drukuj</a>', lambda match: ''),
            # Non-greedy on purpose: under DOTALL a greedy '.*' would run to
            # the LAST '</a></p>' in the page and swallow everything between.
            (r'<p><a href="http://fronda.pl/sklepy">.*?</a></p>', lambda match: ''),
            (r'<p><a href="http://fronda.pl/pasaz">.*?</a></p>', lambda match: ''),
            # NOTE(review): this pattern is still greedy and therefore spans
            # to the last '</a></p></div>' in the page - confirm that
            # removing everything up to there is intended.
            (r'<h3><strong>W.* lektury.*</a></p></div>', lambda match: '</div>'),
            (r'<h3>Zobacz t.*?</div>', lambda match: '</div>'),
        ]
    ]

View File

@ -10,6 +10,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
class InteriaFakty(BasicNewsRecipe): class InteriaFakty(BasicNewsRecipe):
title = u'Interia.pl - Fakty' title = u'Interia.pl - Fakty'
description = u'Fakty ze strony interia.pl'
language = 'pl' language = 'pl'
oldest_article = 7 oldest_article = 7
__author__ = u'Tomasz D\u0142ugosz' __author__ = u'Tomasz D\u0142ugosz'

View File

@ -11,6 +11,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
class InteriaSport(BasicNewsRecipe): class InteriaSport(BasicNewsRecipe):
title = u'Interia.pl - Sport' title = u'Interia.pl - Sport'
description = u'Sport ze strony interia.pl'
language = 'pl' language = 'pl'
oldest_article = 7 oldest_article = 7
__author__ = u'Tomasz D\u0142ugosz' __author__ = u'Tomasz D\u0142ugosz'
@ -30,7 +31,8 @@ class InteriaSport(BasicNewsRecipe):
keep_only_tags = [dict(name='div', attrs={'id':'article'})] keep_only_tags = [dict(name='div', attrs={'id':'article'})]
remove_tags = [dict(name='div', attrs={'class':'object gallery'})] remove_tags = [dict(name='div', attrs={'class':'object gallery'}),
dict(name='div', attrs={'class':'box fontSizeSwitch'})]
extra_css = ''' extra_css = '''
.articleDate { .articleDate {

View File

@ -1,6 +1,6 @@
#!/usr/bin/env python #!/usr/bin/env python
__license__ = 'GPL v3' __license__ = 'GPL v3'
__author__ = 'Mori' __author__ = 'Mori'
__version__ = 'v. 0.1' __version__ = 'v. 0.1'
''' '''
@ -10,34 +10,34 @@ olgierd.bblog.pl
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class LegeArtisRecipe(BasicNewsRecipe): class LegeArtisRecipe(BasicNewsRecipe):
__author__ = 'Mori' __author__ = 'Mori'
language = 'pl' language = 'pl'
title = u'Lege Artis' title = u'Lege Artis'
publisher = u'Olgierd Rudak' publisher = u'Olgierd Rudak'
description = u'Wszystko, co chcieliby\xc5\x9bcie wiedzie\xc4\x87 o prawie, ale wstydzicie si\xc4\x99 zapyta\xc4\x87' description = u'Wszystko, co chcieliby\u015bcie wiedzie\u0107 o prawie, ale wstydzicie si\u0119 zapyta\u0107'
max_articles_per_feed = 100 max_articles_per_feed = 100
no_stylesheets = True no_stylesheets = True
remove_javascript = True remove_javascript = True
extra_css = ''' extra_css = '''
img{clear: both;} img{clear: both;}
''' '''
feeds = [ feeds = [
(u'Lege Artis', u'http://olgierd.bblog.pl/rss/rss20.xml') (u'Lege Artis', u'http://olgierd.bblog.pl/rss/rss20.xml')
] ]
keep_only_tags = [ keep_only_tags = [
dict(name = 'div', attrs = {'class' : 'post_title'}), dict(name = 'div', attrs = {'class' : 'post_title'}),
dict(name = 'div', attrs = {'class' : 'post_date'}), dict(name = 'div', attrs = {'class' : 'post_date'}),
dict(name = 'div', attrs = {'class' : 'post_content'}) dict(name = 'div', attrs = {'class' : 'post_content'})
] ]
remove_tags = [ remove_tags = [
dict(name = 'div', attrs = {'id' : 'bb_tools'}), dict(name = 'div', attrs = {'id' : 'bb_tools'}),
dict(name = 'div', attrs = {'class' : 'post_comments'}), dict(name = 'div', attrs = {'class' : 'post_comments'}),
dict(name = 'object', attrs = {}) dict(name = 'object', attrs = {})
] ]

View File

@ -10,6 +10,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
class Legitymizm(BasicNewsRecipe): class Legitymizm(BasicNewsRecipe):
title = u'Organizacja Monarchist\xf3w Polskich' title = u'Organizacja Monarchist\xf3w Polskich'
description = u'Portal legitymistyczny'
language = 'pl' language = 'pl'
oldest_article = 7 oldest_article = 7
__author__ = u'Tomasz D\u0142ugosz' __author__ = u'Tomasz D\u0142ugosz'

View File

@ -12,7 +12,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
class michalkiewicz(BasicNewsRecipe): class michalkiewicz(BasicNewsRecipe):
title = u'Stanis\u0142aw Michalkiewicz' title = u'Stanis\u0142aw Michalkiewicz'
desciption = u'Strona autorska * felietony * artyku\u0142y * komentarze' description = u'Strona autorska * felietony * artyku\u0142y * komentarze'
__author__ = u'Tomasz D\u0142ugosz' __author__ = u'Tomasz D\u0142ugosz'
language = 'pl' language = 'pl'
oldest_article = 7 oldest_article = 7

View File

@ -12,7 +12,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
class NCzas(BasicNewsRecipe): class NCzas(BasicNewsRecipe):
title = u'Najwy\u017cszy Czas!' title = u'Najwy\u017cszy Czas!'
desciption = u'Najwy\u017cszy Czas!\nwydanie internetowe' description = u'Najwy\u017cszy Czas!\nwydanie internetowe'
__author__ = u'Tomasz D\u0142ugosz' __author__ = u'Tomasz D\u0142ugosz'
language = 'pl' language = 'pl'
oldest_article = 7 oldest_article = 7

View File

@ -0,0 +1,50 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
nrc.nl
'''
from calibre.web.feeds.news import BasicNewsRecipe
class Pagina12(BasicNewsRecipe):
    """Calibre news recipe for nrc.nl.

    NOTE(review): the class name does not match the site this recipe
    targets; it looks carried over from another recipe. It is kept so the
    public name stays unchanged.
    """

    # Descriptive metadata.
    title = 'NRC'
    description = 'News from Netherlands'
    publisher = 'nrc.nl'
    category = 'news, politics, Netherlands'
    language = 'nl'
    country = 'NL'
    __author__ = 'Darko Miletic'

    # Download settings.
    oldest_article = 2
    max_articles_per_feed = 200
    use_embedded_content = False
    remove_empty_feeds = True
    encoding = 'cp1252'

    # Presentation.
    no_stylesheets = True
    masthead_url = 'http://www.nrc.nl/nrc.nl/images/logo_nrc.png'
    extra_css = ' body{font-family: Verdana,Arial,Helvetica,sans-serif } img{margin-bottom: 0.4em} h1,h2,h3{text-align:left} '

    # Built from the metadata attributes defined above.
    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
    }

    keep_only_tags = [dict(name='div', attrs={'class': 'article clearfix'})]

    # (feed title, feed URL) pairs.
    feeds = [
        (u'Voorpagina', u'http://feeds.feedburner.com/NRCHandelsbladVoorpagina'),
        (u'Binnenland', u'http://feeds.feedburner.com/NRCHandelsbladBinnenland'),
        (u'Buitenland', u'http://feeds.feedburner.com/NRCHandelsbladBuitenland'),
        (u'Economie', u'http://feeds.feedburner.com/NRCHandelsbladEconomie'),
        (u'Kunst & Film', u'http://feeds.feedburner.com/nrc/NRCHandelsbladKunstEnFilm'),
        (u'Sport', u'http://feeds.feedburner.com/NRCHandelsbladSport'),
        (u'Wetenschap ', u'http://www.nrc.nl/rss/wetenschap'),
    ]

    def print_version(self, url):
        """Return *url* with the print-service query string appended."""
        print_query = '?service=Print'
        return url + print_query

    def preprocess_html(self, soup):
        """Pass *soup* through ``self.adeify_images`` and return the result."""
        processed = self.adeify_images(soup)
        return processed

View File

@ -0,0 +1,52 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__author__ = 'Mori'
__version__ = 'v. 0.1'
'''
www.runa.pl/blog
'''
from calibre.web.feeds.news import BasicNewsRecipe
import re
class FantazmatyRecipe(BasicNewsRecipe):
    """Calibre news recipe for the Fantazmaty blog (www.runa.pl/blog)."""

    # Descriptive metadata.
    title = u'Fantazmaty'
    description = u'Blog Agencji Wydawniczej Runa'
    publisher = u'Agencja Wydawnicza Runa'
    language = 'pl'
    __author__ = 'Mori'

    # Download settings.
    encoding = 'utf-8'
    oldest_article = 100
    max_articles_per_feed = 100

    # Presentation.
    no_stylesheets = True
    remove_javascript = True
    extra_css = '\nimg{float: left; padding-right: 10px; padding-bottom: 5px;}\n'

    # (feed title, feed URL) pairs.
    feeds = [(u'Fantazmaty', u'http://www.runa.pl/blog/rss.xml')]

    # Selectors for the 'path', 'drdot' and 'picture' divs.
    remove_tags = [
        dict(name='div', attrs={'class': css_class})
        for css_class in ('path', 'drdot', 'picture')
    ]

    remove_tags_after = [dict(name='div', attrs={'class': 'content'})]

    # (compiled pattern, replacement callable) pairs; both patterns are
    # case-insensitive and let '.' match newlines.
    preprocess_regexps = [
        # Collapse everything between <body> and the "primary" div.
        (
            re.compile(r'<body>.*?<div id="primary"', re.IGNORECASE | re.DOTALL),
            lambda match: '<body><div id="primary"',
        ),
        # Remove HTML comments.
        (
            re.compile(r'<!--.*?-->', re.IGNORECASE | re.DOTALL),
            lambda match: '',
        ),
    ]

View File

@ -1,35 +1,25 @@
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
class SanFranciscoBayGuardian(BasicNewsRecipe): class SanFranciscoBayGuardian(BasicNewsRecipe):
title = u'San Francisco Bay Guardian' title = u'San Francisco Bay Guardian'
language = 'en' language = 'en'
__author__ = 'Krittika Goyal' __author__ = 'Krittika Goyal'
oldest_article = 31 #days oldest_article = 31 #days
max_articles_per_feed = 25 max_articles_per_feed = 25
#encoding = 'latin1'
no_stylesheets = True no_stylesheets = True
#remove_tags_before = dict(name='div', attrs={'id':'story_header'})
#remove_tags_after = dict(name='div', attrs={'id':'shirttail'})
remove_tags = [ remove_tags = [
dict(name='iframe'), dict(name='iframe'),
#dict(name='div', attrs={'class':'related-articles'}),
#dict(name='div', attrs={'id':['story_tools', 'toolbox', 'shirttail', 'comment_widget']}),
#dict(name='ul', attrs={'class':'article-tools'}),
#dict(name='ul', attrs={'id':'story_tabs'}),
] ]
feeds = [ feeds = [
('sfbg', 'http://www.sfbg.com/rss.xml'), ('sfbg', 'http://www.sfbg.com/rss.xml'),
('politics', 'http://www.sfbg.com/politics/rss.xml'),
('blogs', 'http://www.sfbg.com/blog/rss.xml'),
('pixel_vision', 'http://www.sfbg.com/pixel_vision/rss.xml'),
('bruce', 'http://www.sfbg.com/bruce/rss.xml'),
] ]
#def preprocess_html(self, soup):
#story = soup.find(name='div', attrs={'id':'story_body'})
#td = heading.findParent(name='td')
#td.extract()
#soup = BeautifulSoup('<html><head><title>t</title></head><body></body></html>')
#body = soup.find(name='body')
#body.insert(0, story)
#return soup

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff