KG changes

This commit is contained in:
GRiker 2010-03-16 04:08:33 -07:00
commit f3e2b9f726
37 changed files with 5343 additions and 4080 deletions

Binary file not shown.

After

Width:  |  Height:  |  Size: 733 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 401 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 475 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 626 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 626 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 808 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.4 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.8 KiB

View File

@ -2,18 +2,22 @@
__license__ = 'GPL v3'
__author__ = 'Lorenzo Vigentini, based on Darko Miletic'
__copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>, Lorenzo Vigentini <l.vigentini at gmail.com>'
__version__ = 'v1.01'
__date__ = '10, January 2010'
__version__ = 'v1.02'
__date__ = '14, March 2010'
__description__ = 'Italian daily newspaper (english version)'
# NOTE: the article URLs in the feeds are broken on the main site because the permalink structure has been changed erroneously, e.g.:
# actual link in feed: http://www.corriere.it/english/10_marzo_11/legitimate_impediment_approved_de9ba480-2cfd-11df-a00c-00144f02aabe.shtml
# this needs to be changed to
# real article URL: http://www.corriere.it/International/english/articoli/2010/03/11/legitimate_impediment_approved.shtml
'''
http://www.corriere.it/
'''
from calibre.web.feeds.news import BasicNewsRecipe
class ilCorriere(BasicNewsRecipe):
__author__ = 'Lorenzo Vigentini, based on Darko Miletic'
description = 'Italian daily newspaper (english version)'
class ilCorriereEn(BasicNewsRecipe):
author = 'Lorenzo Vigentini, based on Darko Miletic'
description = 'Italian daily newspaper (english version)'
cover_url = 'http://images.corriereobjects.it/images/static/common/logo_home.gif?v=200709121520'
title = u'Il Corriere della sera (english) '
@ -23,7 +27,7 @@ class ilCorriere(BasicNewsRecipe):
language = 'en'
timefmt = '[%a, %d %b, %Y]'
oldest_article = 1
oldest_article = 5
max_articles_per_feed = 100
use_embedded_content = False
recursion = 10
@ -31,14 +35,30 @@ class ilCorriere(BasicNewsRecipe):
remove_javascript = True
no_stylesheets = True
html2lrf_options = [
'--comment', description
, '--category', category
, '--publisher', publisher
, '--ignore-tables'
]
def get_article_url(self, article):
    """Rewrite a feed link into the article's working permalink.

    The feed publishes links shaped like
    ``http://host/english/YY_month_DD/title_<hash>-....shtml`` (month is
    the Italian month name), while the article is served from
    ``http://host/International/english/articoli/20YY/MM/DD/title.shtml``.

    :param article: feed entry exposing ``get('link')``.
    :return: the rewritten URL. A link that does not have the expected
             layout (too few path segments, malformed date, unknown month
             name) is returned unchanged; a missing link is returned as is.
    """
    # Italian month names in calendar order; position + 1 is the month number.
    months = ('gennaio', 'febbraio', 'marzo', 'aprile', 'maggio', 'giugno',
              'luglio', 'agosto', 'settembre', 'ottobre', 'novembre',
              'dicembre')

    link = article.get('link')
    if not link:
        return link

    segments = link.split('/')
    if len(segments) < 6:
        return link

    # segments[4] looks like "10_marzo_11" -> year, month name, day
    date_parts = segments[4].split('_')
    if len(date_parts) < 3 or date_parts[1] not in months:
        return link
    month_number = '%02d' % (months.index(date_parts[1]) + 1)

    # scheme + host, followed by the fixed path of the english section
    base = '/'.join(segments[:3]) + '/' + 'International/english/articoli/'
    date_path = '20' + date_parts[0] + '/' + month_number + '/' + date_parts[2] + '/'

    # The first dash-separated piece is "<title>_<8 chars>"; strip that
    # 9-character suffix to recover the bare title.
    slug = segments[5].split('-')[0][:-9]

    return base + date_path + slug + '.shtml'
keep_only_tags = [dict(name='div', attrs={'class':['news-dettaglio article','article']})]

View File

@ -15,8 +15,8 @@ class DziennikInternautowRecipe(BasicNewsRecipe):
language = 'pl'
title = u'Dziennik Internautow'
publisher = u'Dziennik Internaut\xc3\xb3w Sp. z o.o.'
description =u'Internet w \xc5\xbcyciu i biznesie. Porady, wywiady, interwencje, bezpiecze\xc5\x84stwo w Sieci, technologia.'
publisher = u'Dziennik Internaut\u00f3w Sp. z o.o.'
description = u'Internet w \u017cyciu i biznesie. Porady, wywiady, interwencje, bezpiecze\u0144stwo w Sieci, technologia.'
max_articles_per_feed = 100
oldest_article = 7
@ -34,7 +34,7 @@ class DziennikInternautowRecipe(BasicNewsRecipe):
'''
feeds = [
(u'Dziennik Internautów', u'http://feeds.feedburner.com/glowny-di')
(u'Dziennik Internaut\u00f3w', u'http://feeds.feedburner.com/glowny-di')
]
keep_only_tags = [

View File

@ -1,6 +1,6 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__license__ = 'GPL v3'
__author__ = 'Mori'
__version__ = 'v. 0.1'
'''
@ -11,39 +11,39 @@ from calibre.web.feeds.news import BasicNewsRecipe
import re
class BlogeClictoRecipe(BasicNewsRecipe):
__author__ = 'Mori'
language = 'pl'
__author__ = 'Mori'
language = 'pl'
title = u'Blog eClicto'
publisher = u'Blog eClicto'
description = u'Blog o e-papierze i e-bookach'
title = u'Blog eClicto'
publisher = u'Blog eClicto'
description = u'Blog o e-papierze i e-bookach'
max_articles_per_feed = 100
cover_url = 'http://blog.eclicto.pl/wordpress/wp-content/themes/blog_eclicto/g/logo.gif'
max_articles_per_feed = 100
cover_url = 'http://blog.eclicto.pl/wordpress/wp-content/themes/blog_eclicto/g/logo.gif'
no_stylesheets = True
remove_javascript = True
encoding = 'utf-8'
no_stylesheets = True
remove_javascript = True
encoding = 'utf-8'
extra_css = '''
img{float: left; padding-right: 10px; padding-bottom: 5px;}
'''
extra_css = '''
img{float: left; padding-right: 10px; padding-bottom: 5px;}
'''
feeds = [
(u'Blog eClicto', u'http://blog.eclicto.pl/feed/')
]
feeds = [
(u'Blog eClicto', u'http://blog.eclicto.pl/feed/')
]
remove_tags = [
dict(name = 'span', attrs = {'id' : 'tags'})
]
remove_tags = [
dict(name = 'span', attrs = {'id' : 'tags'})
]
remove_tags_after = [
dict(name = 'div', attrs = {'class' : 'post'})
]
remove_tags_after = [
dict(name = 'div', attrs = {'class' : 'post'})
]
preprocess_regexps = [
(re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
[
(r'\s*</', lambda match: '</'),
]
]
preprocess_regexps = [
(re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
[
(r'\s*</', lambda match: '</'),
]
]

View File

@ -11,7 +11,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
class eksiazki(BasicNewsRecipe):
title = u'eKsiazki.org'
desciption = u'Twoje centrum wiedzy o ePapierze i eBookach'
description = u'Twoje centrum wiedzy o ePapierze i eBookach'
language = 'pl'
__author__ = u'Tomasz D\u0142ugosz'
no_stylesheets = True

View File

@ -0,0 +1,34 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__copyright__ = u'2010, Tomasz Dlugosz <tomek3d@gmail.com>'
'''
fronda.pl
'''
from calibre.web.feeds.news import BasicNewsRecipe
import re
class Fronda(BasicNewsRecipe):
    """News recipe for fronda.pl, built on calibre's BasicNewsRecipe."""

    # --- recipe metadata -------------------------------------------------
    title = u'Fronda.pl'
    publisher = u'Fronda.pl'
    description = u'Portal po\u015bwi\u0119cony - Infformacje'
    language = 'pl'
    __author__ = u'Tomasz D\u0142ugosz'

    # --- fetch limits ----------------------------------------------------
    oldest_article = 7
    max_articles_per_feed = 100
    use_embedded_content = False

    # (feed title, feed URL) pairs
    feeds = [
        (u'Infformacje', u'http://fronda.pl/news/feed'),
    ]

    # Page elements selected for the article body.
    keep_only_tags = [
        dict(name='h1', attrs={'class': 'big'}),
        dict(name='ul', attrs={'class': 'about clear'}),
        dict(name='div', attrs={'class': 'content'}),
    ]

    # Each (pattern, replacement) pair below is compiled with
    # IGNORECASE | DOTALL, so '.' also matches newlines.
    # NOTE(review): the "sklepy", "pasaz" and "lektury" patterns use a greedy
    # '.*'; under DOTALL they run to the last possible closing tag. Confirm
    # this is intended (only the final pattern is non-greedy).
    preprocess_regexps = [
        (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
        [
            (r'<a href="#" class="print">Drukuj</a>', lambda match: ''),
            (r'<p><a href="http://fronda.pl/sklepy">.*</a></p>', lambda match: ''),
            (r'<p><a href="http://fronda.pl/pasaz">.*</a></p>', lambda match: ''),
            (r'<h3><strong>W.* lektury.*</a></p></div>', lambda match: '</div>'),
            (r'<h3>Zobacz t.*?</div>', lambda match: '</div>'),
        ]
    ]

View File

@ -10,6 +10,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
class InteriaFakty(BasicNewsRecipe):
title = u'Interia.pl - Fakty'
description = u'Fakty ze strony interia.pl'
language = 'pl'
oldest_article = 7
__author__ = u'Tomasz D\u0142ugosz'

View File

@ -11,6 +11,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
class InteriaSport(BasicNewsRecipe):
title = u'Interia.pl - Sport'
description = u'Sport ze strony interia.pl'
language = 'pl'
oldest_article = 7
__author__ = u'Tomasz D\u0142ugosz'
@ -30,7 +31,8 @@ class InteriaSport(BasicNewsRecipe):
keep_only_tags = [dict(name='div', attrs={'id':'article'})]
remove_tags = [dict(name='div', attrs={'class':'object gallery'})]
remove_tags = [dict(name='div', attrs={'class':'object gallery'}),
dict(name='div', attrs={'class':'box fontSizeSwitch'})]
extra_css = '''
.articleDate {

View File

@ -1,6 +1,6 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__license__ = 'GPL v3'
__author__ = 'Mori'
__version__ = 'v. 0.1'
'''
@ -10,34 +10,34 @@ olgierd.bblog.pl
from calibre.web.feeds.news import BasicNewsRecipe
class LegeArtisRecipe(BasicNewsRecipe):
__author__ = 'Mori'
language = 'pl'
__author__ = 'Mori'
language = 'pl'
title = u'Lege Artis'
publisher = u'Olgierd Rudak'
description = u'Wszystko, co chcieliby\xc5\x9bcie wiedzie\xc4\x87 o prawie, ale wstydzicie si\xc4\x99 zapyta\xc4\x87'
title = u'Lege Artis'
publisher = u'Olgierd Rudak'
description = u'Wszystko, co chcieliby\u015bcie wiedzie\u0107 o prawie, ale wstydzicie si\u0119 zapyta\u0107'
max_articles_per_feed = 100
max_articles_per_feed = 100
no_stylesheets = True
remove_javascript = True
no_stylesheets = True
remove_javascript = True
extra_css = '''
img{clear: both;}
'''
extra_css = '''
img{clear: both;}
'''
feeds = [
(u'Lege Artis', u'http://olgierd.bblog.pl/rss/rss20.xml')
]
feeds = [
(u'Lege Artis', u'http://olgierd.bblog.pl/rss/rss20.xml')
]
keep_only_tags = [
dict(name = 'div', attrs = {'class' : 'post_title'}),
dict(name = 'div', attrs = {'class' : 'post_date'}),
dict(name = 'div', attrs = {'class' : 'post_content'})
]
keep_only_tags = [
dict(name = 'div', attrs = {'class' : 'post_title'}),
dict(name = 'div', attrs = {'class' : 'post_date'}),
dict(name = 'div', attrs = {'class' : 'post_content'})
]
remove_tags = [
dict(name = 'div', attrs = {'id' : 'bb_tools'}),
dict(name = 'div', attrs = {'class' : 'post_comments'}),
dict(name = 'object', attrs = {})
]
remove_tags = [
dict(name = 'div', attrs = {'id' : 'bb_tools'}),
dict(name = 'div', attrs = {'class' : 'post_comments'}),
dict(name = 'object', attrs = {})
]

View File

@ -10,6 +10,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
class Legitymizm(BasicNewsRecipe):
title = u'Organizacja Monarchist\xf3w Polskich'
description = u'Portal legitymistyczny'
language = 'pl'
oldest_article = 7
__author__ = u'Tomasz D\u0142ugosz'

View File

@ -12,7 +12,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
class michalkiewicz(BasicNewsRecipe):
title = u'Stanis\u0142aw Michalkiewicz'
desciption = u'Strona autorska * felietony * artyku\u0142y * komentarze'
description = u'Strona autorska * felietony * artyku\u0142y * komentarze'
__author__ = u'Tomasz D\u0142ugosz'
language = 'pl'
oldest_article = 7

View File

@ -12,7 +12,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
class NCzas(BasicNewsRecipe):
title = u'Najwy\u017cszy Czas!'
desciption = u'Najwy\u017cszy Czas!\nwydanie internetowe'
description = u'Najwy\u017cszy Czas!\nwydanie internetowe'
__author__ = u'Tomasz D\u0142ugosz'
language = 'pl'
oldest_article = 7

View File

@ -0,0 +1,50 @@
__license__ = 'GPL v3'
__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
'''
nrc.nl
'''
from calibre.web.feeds.news import BasicNewsRecipe
class Pagina12(BasicNewsRecipe):
    """News recipe for nrc.nl, built on calibre's BasicNewsRecipe."""

    # NOTE(review): the class is called Pagina12 although every setting below
    # targets nrc.nl; the name may be left over from another recipe. It is
    # kept unchanged because it is the block's public name.

    # --- recipe metadata -------------------------------------------------
    title = 'NRC'
    __author__ = 'Darko Miletic'
    description = 'News from Netherlands'
    publisher = 'nrc.nl'
    category = 'news, politics, Netherlands'
    language = 'nl'
    country = 'NL'
    masthead_url = 'http://www.nrc.nl/nrc.nl/images/logo_nrc.png'

    # --- fetch and parsing settings ---------------------------------------
    oldest_article = 2
    max_articles_per_feed = 200
    no_stylesheets = True
    encoding = 'cp1252'
    use_embedded_content = False
    remove_empty_feeds = True

    extra_css = ' body{font-family: Verdana,Arial,Helvetica,sans-serif } img{margin-bottom: 0.4em} h1,h2,h3{text-align:left} '

    # Conversion metadata, taken from the attributes defined above.
    conversion_options = {
        'comment': description,
        'tags': category,
        'publisher': publisher,
        'language': language,
    }

    keep_only_tags = [
        dict(name='div', attrs={'class': 'article clearfix'}),
    ]

    # (feed title, feed URL) pairs
    feeds = [
        (u'Voorpagina', u'http://feeds.feedburner.com/NRCHandelsbladVoorpagina'),
        (u'Binnenland', u'http://feeds.feedburner.com/NRCHandelsbladBinnenland'),
        (u'Buitenland', u'http://feeds.feedburner.com/NRCHandelsbladBuitenland'),
        (u'Economie', u'http://feeds.feedburner.com/NRCHandelsbladEconomie'),
        (u'Kunst & Film', u'http://feeds.feedburner.com/nrc/NRCHandelsbladKunstEnFilm'),
        (u'Sport', u'http://feeds.feedburner.com/NRCHandelsbladSport'),
        (u'Wetenschap ', u'http://www.nrc.nl/rss/wetenschap'),
    ]

    def print_version(self, url):
        """Return *url* with the ``?service=Print`` query string appended."""
        suffix = '?service=Print'
        return url + suffix

    def preprocess_html(self, soup):
        """Pass *soup* through ``self.adeify_images`` and return the result."""
        processed = self.adeify_images(soup)
        return processed

View File

@ -0,0 +1,52 @@
#!/usr/bin/env python
__license__ = 'GPL v3'
__author__ = 'Mori'
__version__ = 'v. 0.1'
'''
www.runa.pl/blog
'''
from calibre.web.feeds.news import BasicNewsRecipe
import re
class FantazmatyRecipe(BasicNewsRecipe):
    """News recipe for the blog at www.runa.pl/blog, built on BasicNewsRecipe."""

    # --- recipe metadata -------------------------------------------------
    __author__ = 'Mori'
    language = 'pl'
    title = u'Fantazmaty'
    publisher = u'Agencja Wydawnicza Runa'
    description = u'Blog Agencji Wydawniczej Runa'

    # --- fetch and parsing settings ---------------------------------------
    no_stylesheets = True
    remove_javascript = True
    encoding = 'utf-8'
    oldest_article = 100
    max_articles_per_feed = 100

    # Same text as the original triple-quoted literal, written on one line.
    extra_css = '\nimg{float: left; padding-right: 10px; padding-bottom: 5px;}\n'

    # (feed title, feed URL) pairs
    feeds = [
        (u'Fantazmaty', u'http://www.runa.pl/blog/rss.xml'),
    ]

    remove_tags = [
        dict(name='div', attrs={'class': 'path'}),
        dict(name='div', attrs={'class': 'drdot'}),
        dict(name='div', attrs={'class': 'picture'}),
    ]

    remove_tags_after = [
        dict(name='div', attrs={'class': 'content'}),
    ]

    # Each (pattern, replacement) pair is compiled with IGNORECASE | DOTALL:
    # the first collapses everything between <body> and the "primary" div,
    # the second removes HTML comments.
    preprocess_regexps = [
        (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
        [
            (r'<body>.*?<div id="primary"', lambda match: '<body><div id="primary"'),
            (r'<!--.*?-->', lambda match: ''),
        ]
    ]

View File

@ -1,35 +1,25 @@
from calibre.web.feeds.news import BasicNewsRecipe
class SanFranciscoBayGuardian(BasicNewsRecipe):
title = u'San Francisco Bay Guardian'
language = 'en'
__author__ = 'Krittika Goyal'
title = u'San Francisco Bay Guardian'
language = 'en'
__author__ = 'Krittika Goyal'
oldest_article = 31 #days
max_articles_per_feed = 25
#encoding = 'latin1'
no_stylesheets = True
#remove_tags_before = dict(name='div', attrs={'id':'story_header'})
#remove_tags_after = dict(name='div', attrs={'id':'shirttail'})
remove_tags = [
dict(name='iframe'),
#dict(name='div', attrs={'class':'related-articles'}),
#dict(name='div', attrs={'id':['story_tools', 'toolbox', 'shirttail', 'comment_widget']}),
#dict(name='ul', attrs={'class':'article-tools'}),
#dict(name='ul', attrs={'id':'story_tabs'}),
dict(name='iframe'),
]
feeds = [
('sfbg', 'http://www.sfbg.com/rss.xml'),
('politics', 'http://www.sfbg.com/politics/rss.xml'),
('blogs', 'http://www.sfbg.com/blog/rss.xml'),
('pixel_vision', 'http://www.sfbg.com/pixel_vision/rss.xml'),
('bruce', 'http://www.sfbg.com/bruce/rss.xml'),
]
#def preprocess_html(self, soup):
#story = soup.find(name='div', attrs={'id':'story_body'})
#td = heading.findParent(name='td')
#td.extract()
#soup = BeautifulSoup('<html><head><title>t</title></head><body></body></html>')
#body = soup.find(name='body')
#body.insert(0, story)
#return soup

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff