KG changes

2025-07-07 18:24:30 -04:00 · 2010-03-16 04:08:33 -07:00 · 2010-03-16 04:08:33 -07:00 · f3e2b9f726
commit f3e2b9f726
parent 667b4c7913 6fd9ed3be3
37 changed files with 5343 additions and 4080 deletions
--- a/resources/images/news/di.png
+++ b/resources/images/news/di.png
--- a/resources/images/news/eclicto.png
+++ b/resources/images/news/eclicto.png
--- a/resources/images/news/eksiazki.png
+++ b/resources/images/news/eksiazki.png
--- a/resources/images/news/interia_fakty.png
+++ b/resources/images/news/interia_fakty.png
--- a/resources/images/news/interia_sport.png
+++ b/resources/images/news/interia_sport.png
--- a/resources/images/news/legitymizm.png
+++ b/resources/images/news/legitymizm.png
--- a/resources/images/news/michalkiewicz.png
+++ b/resources/images/news/michalkiewicz.png
--- a/resources/images/news/nrc.nl.png
+++ b/resources/images/news/nrc.nl.png
--- a/resources/recipes/corriere_della_sera_en.recipe
+++ b/resources/recipes/corriere_della_sera_en.recipe
@ -2,18 +2,22 @@
 __license__   = 'GPL v3'
 __author__    = 'Lorenzo Vigentini, based on Darko Miletic'
 __copyright__ = '2009, Darko Miletic <darko.miletic at gmail.com>, Lorenzo Vigentini <l.vigentini at gmail.com>'
-__version__     = 'v1.01'
-__date__        = '10, January 2010'
+__version__     = 'v1.02'
+__date__        = '14, March 2010'
 __description__ = 'Italian daily newspaper (english version)'
+# NOTE: the feed URLs on the main site are broken because the permalink structure has been changed erroneously, i.e.:
+# actual link in feed   http://www.corriere.it/english/10_marzo_11/legitimate_impediment_approved_de9ba480-2cfd-11df-a00c-00144f02aabe.shtml
+# this needs to be changed to
+# real feed URL http://www.corriere.it/International/english/articoli/2010/03/11/legitimate_impediment_approved.shtml
 '''
 http://www.corriere.it/
 '''

 from calibre.web.feeds.news import BasicNewsRecipe

-class ilCorriere(BasicNewsRecipe):
-    __author__     = 'Lorenzo Vigentini, based on Darko Miletic'
-    description    = 'Italian daily newspaper (english version)'
+class ilCorriereEn(BasicNewsRecipe):
+    author        = 'Lorenzo Vigentini, based on Darko Miletic'
+    description   = 'Italian daily newspaper (english version)'

    cover_url      = 'http://images.corriereobjects.it/images/static/common/logo_home.gif?v=200709121520'
    title          = u'Il Corriere della sera (english) '
@ -23,7 +27,7 @@ class ilCorriere(BasicNewsRecipe):
    language       = 'en'
    timefmt        = '[%a, %d %b, %Y]'

-    oldest_article = 1
+    oldest_article = 5
    max_articles_per_feed = 100
    use_embedded_content  = False
    recursion             = 10
@ -31,14 +35,30 @@ class ilCorriere(BasicNewsRecipe):
    remove_javascript = True
    no_stylesheets = True

-    html2lrf_options = [
-                          '--comment', description
-                        , '--category', category
-                        , '--publisher', publisher
-                        , '--ignore-tables'
-                        ]
+    def get_article_url(self, article):
+        # Rebuild the article URL: the feed still publishes the old permalink layout.
+        articleUrl = article.get('link')
+        segments = articleUrl.split('/')

-    html2epub_options = 'publisher="' + publisher + '"\ncomments="' + description + '"\ntags="' + category + '"\nlinearize_tables=True'
+        basename = '/'.join(segments[:3]) + '/' + 'International/english/articoli/'
+
+        # Feed date segment looks like 'YY_<italian month name>_DD'; the site wants YYYY/MM/DD/.
+        months = ['gennaio','febbraio','marzo','aprile','maggio','giugno',
+                  'luglio','agosto','settembre','ottobre','novembre','dicembre']
+        myDate = segments[4].split('_') if len(segments) > 5 else []
+        if len(myDate) < 3 or myDate[1] not in months:
+            # Not the layout this rewrite understands: hand back the feed link untouched.
+            return articleUrl
+        # index() searches all twelve names; the previous range(11) loop never reached 'dicembre'.
+        noMonth = '%02d' % (months.index(myDate[1]) + 1)
+        newDateUrl = '20' + myDate[0] + '/' + noMonth + '/' + myDate[2] + '/'
+
+        # Slug is '<title>_<id>-<rest>.shtml' (id is 8 chars in the known example): drop '_' + id.
+        articleURLseg = segments[5].split('-')
+        myArticle = (articleURLseg[0])[:-9] + '.shtml'
+
+        myURL = basename + newDateUrl + myArticle
+        return myURL

    keep_only_tags = [dict(name='div', attrs={'class':['news-dettaglio article','article']})]

--- a/resources/recipes/di.recipe
+++ b/resources/recipes/di.recipe
@ -15,8 +15,8 @@ class DziennikInternautowRecipe(BasicNewsRecipe):
 	language = 'pl'

 	title = u'Dziennik Internautow'
-	publisher = u'Dziennik Internaut\xc3\xb3w Sp. z o.o.'
-	description =u'Internet w \xc5\xbcyciu i biznesie. Porady, wywiady, interwencje, bezpiecze\xc5\x84stwo w Sieci, technologia.'
+	publisher = u'Dziennik Internaut\u00f3w Sp. z o.o.'
+	description = u'Internet w \u017cyciu i biznesie. Porady, wywiady, interwencje, bezpiecze\u0144stwo w Sieci, technologia.'

 	max_articles_per_feed = 100
 	oldest_article = 7
@ -34,7 +34,7 @@ class DziennikInternautowRecipe(BasicNewsRecipe):
 	'''
 	
 	feeds = [
-		(u'Dziennik Internautów', u'http://feeds.feedburner.com/glowny-di')
+		(u'Dziennik Internaut\u00f3w', u'http://feeds.feedburner.com/glowny-di')
 	]
 	
 	keep_only_tags = [
--- a/resources/recipes/eclicto.recipe
+++ b/resources/recipes/eclicto.recipe
@ -1,6 +1,6 @@
 #!/usr/bin/env  python

-__license__ = 'GPL v3'
+__license__	= 'GPL v3'
 __author__ = 'Mori'
 __version__ = 'v. 0.1'
 '''
@ -11,39 +11,39 @@ from calibre.web.feeds.news import BasicNewsRecipe
 import re

 class BlogeClictoRecipe(BasicNewsRecipe):
-    __author__ = 'Mori'
-    language = 'pl'
+	__author__ = 'Mori'
+	language = 'pl'

-    title = u'Blog eClicto'
-    publisher = u'Blog eClicto'
-    description = u'Blog o e-papierze i e-bookach'
+	title = u'Blog eClicto'
+	publisher = u'Blog eClicto'
+	description = u'Blog o e-papierze i e-bookach'

-    max_articles_per_feed = 100
-    cover_url = 'http://blog.eclicto.pl/wordpress/wp-content/themes/blog_eclicto/g/logo.gif'
+	max_articles_per_feed = 100
+	cover_url = 'http://blog.eclicto.pl/wordpress/wp-content/themes/blog_eclicto/g/logo.gif'
 	
-    no_stylesheets = True
-    remove_javascript = True
-    encoding = 'utf-8'
+	no_stylesheets = True
+	remove_javascript = True
+	encoding = 'utf-8'
 	
-    extra_css = '''
-        img{float: left; padding-right: 10px; padding-bottom: 5px;}
-    '''
+	extra_css = '''
+		img{float: left; padding-right: 10px; padding-bottom: 5px;}
+	'''
 	
-    feeds = [
-        (u'Blog eClicto', u'http://blog.eclicto.pl/feed/')
-    ]
+	feeds = [
+		(u'Blog eClicto', u'http://blog.eclicto.pl/feed/')
+	]
 	
-    remove_tags = [
-        dict(name = 'span', attrs = {'id' : 'tags'})
-    ]
+	remove_tags = [
+		dict(name = 'span', attrs = {'id' : 'tags'})
+	]
 	
-    remove_tags_after = [
-        dict(name = 'div', attrs = {'class' : 'post'})
-    ]
+	remove_tags_after = [
+		dict(name = 'div', attrs = {'class' : 'post'})
+	]
 	
-    preprocess_regexps = [
-        (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in
-        [
-            (r'\s*</', lambda match: '</'),
-        ]
-    ]
+	preprocess_regexps = [
+		(re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in 
+		[
+			(r'\s*</', lambda match: '</'),
+		]
+	]
--- a/resources/recipes/eksiazki.recipe
+++ b/resources/recipes/eksiazki.recipe
@ -11,7 +11,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
 class eksiazki(BasicNewsRecipe):

    title          = u'eKsiazki.org'
-    desciption     = u'Twoje centrum wiedzy o ePapierze i eBookach'
+    description    = u'Twoje centrum wiedzy o ePapierze i eBookach'
    language = 'pl'
    __author__ = u'Tomasz D\u0142ugosz'
    no_stylesheets = True
--- a/resources/recipes/fronda.recipe
+++ b/resources/recipes/fronda.recipe
@ -0,0 +1,34 @@
+#!/usr/bin/env  python
+
+__license__   = 'GPL v3'
+__copyright__ = u'2010, Tomasz Dlugosz <tomek3d@gmail.com>'
+'''
+fronda.pl
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+import re
+
+class Fronda(BasicNewsRecipe):  # calibre news recipe for the fronda.pl feed listed in `feeds`
+    title          = u'Fronda.pl'
+    publisher      = u'Fronda.pl'
+    description    = u'Portal po\u015bwi\u0119cony - Infformacje'
+    language = 'pl'
+    __author__ = u'Tomasz D\u0142ugosz'
+    oldest_article = 7  # presumably a number of days -- confirm against BasicNewsRecipe
+    max_articles_per_feed = 100
+    use_embedded_content = False
+
+    feeds          = [(u'Infformacje', u'http://fronda.pl/news/feed')]  # (feed title, feed URL) pairs
+
+    keep_only_tags = [dict(name='h1', attrs={'class':'big'}),  # tag name + attribute filters
+                      dict(name='ul', attrs={'class':'about clear'}),
+                      dict(name='div', attrs={'class':'content'})]
+    preprocess_regexps = [  # list of (compiled pattern, replacement callable) pairs
+        (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in   # every pattern is case-insensitive and '.' also matches newlines
+            [ (r'<a href="#" class="print">Drukuj</a>', lambda match: ''),  # replace the "Drukuj" link with nothing
+              (r'<p><a href="http://fronda.pl/sklepy">.*</a></p>', lambda match: ''),  # NOTE(review): greedy .* under DOTALL extends to the LAST </a></p> in the input -- confirm intended
+              (r'<p><a href="http://fronda.pl/pasaz">.*</a></p>', lambda match: ''),  # NOTE(review): same greedy-match concern as the line above
+              (r'<h3><strong>W.* lektury.*</a></p></div>', lambda match: '</div>'),  # match is replaced by a bare </div>; greedy as well
+              (r'<h3>Zobacz t.*?</div>', lambda match: '</div>') ]  # non-greedy: stops at the first </div>
+    ]
--- a/resources/recipes/interia_fakty.recipe
+++ b/resources/recipes/interia_fakty.recipe
@ -10,6 +10,7 @@ from calibre.web.feeds.news import BasicNewsRecipe

 class InteriaFakty(BasicNewsRecipe):
    title          = u'Interia.pl - Fakty'
+    description    = u'Fakty ze strony interia.pl'
    language = 'pl'
    oldest_article = 7
    __author__ = u'Tomasz D\u0142ugosz'
--- a/resources/recipes/interia_sport.recipe
+++ b/resources/recipes/interia_sport.recipe
@ -11,6 +11,7 @@ from calibre.web.feeds.news import BasicNewsRecipe

 class InteriaSport(BasicNewsRecipe):
    title          = u'Interia.pl - Sport'
+    description    = u'Sport ze strony interia.pl'
    language = 'pl'
    oldest_article = 7
    __author__ = u'Tomasz D\u0142ugosz'
@ -30,7 +31,8 @@ class InteriaSport(BasicNewsRecipe):

    keep_only_tags = [dict(name='div', attrs={'id':'article'})]

-    remove_tags = [dict(name='div', attrs={'class':'object gallery'})]
+    remove_tags = [dict(name='div', attrs={'class':'object gallery'}),
+                   dict(name='div', attrs={'class':'box fontSizeSwitch'})]

    extra_css = '''
        .articleDate {
--- a/resources/recipes/legeartis.recipe
+++ b/resources/recipes/legeartis.recipe
@ -1,6 +1,6 @@
 #!/usr/bin/env  python

-__license__ = 'GPL v3'
+__license__	= 'GPL v3'
 __author__ = 'Mori'
 __version__ = 'v. 0.1'
 '''
@ -10,34 +10,34 @@ olgierd.bblog.pl
 from calibre.web.feeds.news import BasicNewsRecipe

 class LegeArtisRecipe(BasicNewsRecipe):
-    __author__ = 'Mori'
-    language = 'pl'
+	__author__ = 'Mori'
+	language = 'pl'

-    title = u'Lege Artis'
-    publisher = u'Olgierd Rudak'
-    description = u'Wszystko, co chcieliby\xc5\x9bcie wiedzie\xc4\x87 o prawie, ale wstydzicie si\xc4\x99 zapyta\xc4\x87'
+	title = u'Lege Artis'
+	publisher = u'Olgierd Rudak'
+	description = u'Wszystko, co chcieliby\u015bcie wiedzie\u0107 o prawie, ale wstydzicie si\u0119 zapyta\u0107'

-    max_articles_per_feed = 100
+	max_articles_per_feed = 100
 	
-    no_stylesheets = True
-    remove_javascript = True
+	no_stylesheets = True
+	remove_javascript = True
 	
-    extra_css = '''
-        img{clear: both;}
-    '''
+	extra_css = '''
+		img{clear: both;}
+	'''
 	
-    feeds = [
-        (u'Lege Artis', u'http://olgierd.bblog.pl/rss/rss20.xml')
-    ]
+	feeds = [
+		(u'Lege Artis', u'http://olgierd.bblog.pl/rss/rss20.xml')
+	]
 	
-    keep_only_tags = [
-        dict(name = 'div', attrs = {'class' : 'post_title'}),
-        dict(name = 'div', attrs = {'class' : 'post_date'}),
-        dict(name = 'div', attrs = {'class' : 'post_content'})
-    ]
+	keep_only_tags = [
+		dict(name = 'div', attrs = {'class' : 'post_title'}),
+		dict(name = 'div', attrs = {'class' : 'post_date'}),
+		dict(name = 'div', attrs = {'class' : 'post_content'})
+	]
 	
-    remove_tags = [
-        dict(name = 'div', attrs = {'id' : 'bb_tools'}),
-        dict(name = 'div', attrs = {'class' : 'post_comments'}),
-        dict(name = 'object', attrs = {})
-    ]
+	remove_tags = [
+		dict(name = 'div', attrs = {'id' : 'bb_tools'}),
+		dict(name = 'div', attrs = {'class' : 'post_comments'}),
+		dict(name = 'object', attrs = {})
+	]
--- a/resources/recipes/legitymizm.recipe
+++ b/resources/recipes/legitymizm.recipe
@ -10,6 +10,7 @@ from calibre.web.feeds.news import BasicNewsRecipe

 class Legitymizm(BasicNewsRecipe):
    title          = u'Organizacja Monarchist\xf3w Polskich'
+    description    = u'Portal legitymistyczny'
    language = 'pl'
    oldest_article = 7
    __author__ = u'Tomasz D\u0142ugosz'
--- a/resources/recipes/michalkiewicz.recipe
+++ b/resources/recipes/michalkiewicz.recipe
@ -12,7 +12,7 @@ from calibre.web.feeds.news import BasicNewsRecipe

 class michalkiewicz(BasicNewsRecipe):
    title          = u'Stanis\u0142aw Michalkiewicz'
-    desciption     = u'Strona autorska * felietony * artyku\u0142y * komentarze'
+    description    = u'Strona autorska * felietony * artyku\u0142y * komentarze'
    __author__     = u'Tomasz D\u0142ugosz'
    language       = 'pl'
    oldest_article = 7
--- a/resources/recipes/nczas.recipe
+++ b/resources/recipes/nczas.recipe
@ -12,7 +12,7 @@ from calibre.web.feeds.news import BasicNewsRecipe

 class NCzas(BasicNewsRecipe):
    title          = u'Najwy\u017cszy Czas!'
-    desciption     = u'Najwy\u017cszy Czas!\nwydanie internetowe'
+    description    = u'Najwy\u017cszy Czas!\nwydanie internetowe'
    __author__     = u'Tomasz D\u0142ugosz'
    language       = 'pl'
    oldest_article = 7
--- a/resources/recipes/nrc.nl.recipe
+++ b/resources/recipes/nrc.nl.recipe
@ -0,0 +1,50 @@
+__license__   = 'GPL v3'
+__copyright__ = '2010, Darko Miletic <darko.miletic at gmail.com>'
+'''
+nrc.nl
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class Pagina12(BasicNewsRecipe):  # NOTE(review): class name does not match the 'NRC' title below; possibly copied from another recipe -- confirm
+    title                 = 'NRC'
+    __author__            = 'Darko Miletic'
+    description           = 'News from Netherlands'
+    publisher             = 'nrc.nl'
+    category              = 'news, politics, Netherlands'
+    oldest_article        = 2  # presumably a number of days -- confirm against BasicNewsRecipe
+    max_articles_per_feed = 200
+    no_stylesheets        = True
+    encoding              = 'cp1252'
+    use_embedded_content  = False
+    language              = 'nl'
+    country               = 'NL'
+    remove_empty_feeds    = True
+    masthead_url          = 'http://www.nrc.nl/nrc.nl/images/logo_nrc.png'
+    extra_css             = ' body{font-family: Verdana,Arial,Helvetica,sans-serif } img{margin-bottom: 0.4em} h1,h2,h3{text-align:left} '
+
+    conversion_options = {  # values reuse the class attributes assigned above
+                          'comment'   : description
+                        , 'tags'      : category
+                        , 'publisher' : publisher
+                        , 'language'  : language
+                        }
+
+    keep_only_tags = [dict(name='div',attrs={'class':'article clearfix'})]  # tag name + attribute filter
+
+
+    feeds = [  # (feed title, feed URL) pairs
+              (u'Voorpagina'   , u'http://feeds.feedburner.com/NRCHandelsbladVoorpagina'     )
+             ,(u'Binnenland'   , u'http://feeds.feedburner.com/NRCHandelsbladBinnenland'     )
+             ,(u'Buitenland'   , u'http://feeds.feedburner.com/NRCHandelsbladBuitenland'     )
+             ,(u'Economie'     , u'http://feeds.feedburner.com/NRCHandelsbladEconomie'       )
+             ,(u'Kunst & Film' , u'http://feeds.feedburner.com/nrc/NRCHandelsbladKunstEnFilm')
+             ,(u'Sport'        , u'http://feeds.feedburner.com/NRCHandelsbladSport'          )
+             ,(u'Wetenschap '  , u'http://www.nrc.nl/rss/wetenschap'                         )
+            ]
+
+    def print_version(self, url):  # returns the given URL with '?service=Print' appended
+        return url + '?service=Print'
+
+    def preprocess_html(self, soup):  # returns whatever self.adeify_images(soup) returns (helper not defined in this file)
+        return self.adeify_images(soup)
--- a/resources/recipes/runa.recipe
+++ b/resources/recipes/runa.recipe
@ -0,0 +1,52 @@
+#!/usr/bin/env  python
+
+__license__ = 'GPL v3'
+__author__ = 'Mori'
+__version__ = 'v. 0.1'
+'''
+www.runa.pl/blog
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+import re
+
+class FantazmatyRecipe(BasicNewsRecipe):  # calibre news recipe for the runa.pl blog feed listed in `feeds`
+	__author__ = 'Mori'
+	language = 'pl'
+
+	title = u'Fantazmaty'
+	publisher = u'Agencja Wydawnicza Runa'
+	description = u'Blog Agencji Wydawniczej Runa'
+	
+	no_stylesheets = True
+	remove_javascript = True
+	encoding = 'utf-8'
+	
+	oldest_article = 100  # presumably a number of days -- confirm against BasicNewsRecipe
+	max_articles_per_feed = 100
+	
+	extra_css = '''
+		img{float: left; padding-right: 10px; padding-bottom: 5px;}
+	'''
+	
+	feeds = [  # (feed title, feed URL) pairs
+		(u'Fantazmaty', u'http://www.runa.pl/blog/rss.xml')
+	]
+	
+	remove_tags = [  # tag name + attribute filters
+		dict(name = 'div', attrs = {'class' : 'path'}),
+		dict(name = 'div', attrs = {'class' : 'drdot'}),
+		dict(name = 'div', attrs = {'class' : 'picture'})
+	]
+	
+	remove_tags_after = [
+		dict(name = 'div', attrs = {'class' : 'content'})
+	]
+	
+	preprocess_regexps = [  # list of (compiled pattern, replacement callable) pairs
+		(re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in   # every pattern is case-insensitive and '.' also matches newlines
+		[
+			(r'<body>.*?<div id="primary"', lambda match: '<body><div id="primary"'),  # drops the text between <body> and the first <div id="primary"
+			(r'<!--.*?-->', lambda match: '')  # replaces each <!-- ... --> span with nothing
+		]
+	]
--- a/resources/recipes/sfbg.recipe
+++ b/resources/recipes/sfbg.recipe
@ -1,35 +1,25 @@
 from calibre.web.feeds.news import BasicNewsRecipe

 class SanFranciscoBayGuardian(BasicNewsRecipe):
-    title          = u'San Francisco Bay Guardian'
-    language       = 'en'
-    __author__     = 'Krittika Goyal'
+    title = u'San Francisco Bay Guardian'
+    language = 'en'
+    __author__ = 'Krittika Goyal'
    oldest_article = 31 #days
    max_articles_per_feed = 25
-    #encoding = 'latin1'

    no_stylesheets = True
-    #remove_tags_before = dict(name='div', attrs={'id':'story_header'})
-    #remove_tags_after  = dict(name='div', attrs={'id':'shirttail'})
    remove_tags = [
-       dict(name='iframe'),
-       #dict(name='div', attrs={'class':'related-articles'}),
-        #dict(name='div', attrs={'id':['story_tools', 'toolbox', 'shirttail', 'comment_widget']}),
-       #dict(name='ul', attrs={'class':'article-tools'}),
-       #dict(name='ul', attrs={'id':'story_tabs'}),
+        dict(name='iframe'),
    ]


    feeds = [
        ('sfbg', 'http://www.sfbg.com/rss.xml'),
+        ('politics', 'http://www.sfbg.com/politics/rss.xml'),
+        ('blogs', 'http://www.sfbg.com/blog/rss.xml'),
+        ('pixel_vision', 'http://www.sfbg.com/pixel_vision/rss.xml'),
+        ('bruce', 'http://www.sfbg.com/bruce/rss.xml'),
    ]


-    #def preprocess_html(self, soup):
-        #story = soup.find(name='div', attrs={'id':'story_body'})
-        #td = heading.findParent(name='td')
-        #td.extract()
-        #soup = BeautifulSoup('<html><head><title>t</title></head><body></body></html>')
-        #body = soup.find(name='body')
-        #body.insert(0, story)
-        #return soup
+
--- a/src/calibre/translations/ar.po
+++ b/src/calibre/translations/ar.po
--- a/src/calibre/translations/cs.po
+++ b/src/calibre/translations/cs.po
--- a/src/calibre/translations/de.po
+++ b/src/calibre/translations/de.po
--- a/src/calibre/translations/es.po
+++ b/src/calibre/translations/es.po
--- a/src/calibre/translations/fi.po
+++ b/src/calibre/translations/fi.po
--- a/src/calibre/translations/fr.po
+++ b/src/calibre/translations/fr.po
--- a/src/calibre/translations/gl.po
+++ b/src/calibre/translations/gl.po
--- a/src/calibre/translations/it.po
+++ b/src/calibre/translations/it.po
--- a/src/calibre/translations/lv.po
+++ b/src/calibre/translations/lv.po
--- a/src/calibre/translations/nb.po
+++ b/src/calibre/translations/nb.po
--- a/src/calibre/translations/ru.po
+++ b/src/calibre/translations/ru.po
--- a/src/calibre/translations/sq.po
+++ b/src/calibre/translations/sq.po
--- a/src/calibre/translations/sr.po
+++ b/src/calibre/translations/sr.po
--- a/src/calibre/translations/zh_CN.po
+++ b/src/calibre/translations/zh_CN.po
--- a/src/calibre/translations/zh_TW.po
+++ b/src/calibre/translations/zh_TW.po