diff --git a/COPYRIGHT b/COPYRIGHT
index cb01503bd8..129b0b0536 100644
--- a/COPYRIGHT
+++ b/COPYRIGHT
@@ -9,6 +9,12 @@ License: GPL-2 or later
The full text of the GPL is distributed as in
/usr/share/common-licenses/GPL-2 on Debian systems.
+Files: setup/iso_639/*
+Copyright: Various
+License: LGPL 2.1
+ The full text of the LGPL is distributed as in
+ /usr/share/common-licenses/LGPL-2.1 on Debian systems.
+
Files: src/calibre/ebooks/BeautifulSoup.py
Copyright: Copyright (c) 2004-2007, Leonard Richardson
License: BSD
@@ -28,6 +34,12 @@ License: other
are permitted in any medium without royalty provided the copyright
notice and this notice are preserved.
+Files: src/calibre/ebooks/readability/*
+Copyright: Unknown
+License: Apache 2.0
+ The full text of the Apache 2.0 license is available at:
+ http://www.apache.org/licenses/LICENSE-2.0
+
Files: /src/cherrypy/*
Copyright: Copyright (c) 2004-2007, CherryPy Team (team@cherrypy.org)
Copyright: Copyright (C) 2005, Tiago Cogumbreiro tags"
+
+ - title: "When adding books that have no language specified, do not automatically set the language to calibre's interface language."
+ tickets: [830092]
+
+ - title: "Fix use of tag browser to search for languages when calibre is translated to a non English language"
+ tickets: [830078]
+
+ - title: "When downloading news, set the language field correctly"
+
+ - title: "Fix languages field in the Edit metadata dialog too wide"
+ tickets: [829912]
+
+ - title: "Fix setting of languages that have commas in their names broken"
+
+ - title: "FB2 Input: When converting FB2 files, read the cover from the FB2 file correctly."
+ tickets: [829240]
+
+ improved recipes:
+ - Politifact
+ - Reuters
+ - Sueddeutsche
+ - CNN
+ - Financial Times UK
+ - MSDN Magazine
+ - Houston Chronicle
+ - Harvard Business Review
+
+ new recipes:
+ - title: CBN News and Fairbanks Daily
+ author: Roger
+
+ - title: Hacker News
+ author: Tom Scholl
+
+ - title: Various Turkish news sources
+ author: thomass
+
+ - title: Cvece Zla
+ author: Darko Miletic
+
+ - title: Various Polish news sources
+ author: fenuks
+
+ - title: Fluter
+ author: Armin Geller
+
+ - title: Brasil de Fato
+ author: Alex Mitrani
+
- version: 0.8.15
date: 2011-08-19
diff --git a/recipes/android_com_pl.recipe b/recipes/android_com_pl.recipe
new file mode 100644
index 0000000000..a44d5e560a
--- /dev/null
+++ b/recipes/android_com_pl.recipe
@@ -0,0 +1,12 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class Android_com_pl(BasicNewsRecipe):
+ title = u'Android.com.pl'
+ __author__ = 'fenuks'
+ description = 'Android.com.pl - biggest polish Android site'
+ category = 'Android, mobile'
+ language = 'pl'
+ cover_url =u'http://upload.wikimedia.org/wikipedia/commons/thumb/d/d7/Android_robot.svg/220px-Android_robot.svg.png'
+ oldest_article = 8
+ max_articles_per_feed = 100
+ feeds = [(u'Android', u'http://android.com.pl/component/content/frontpage/frontpage.feed?type=rss')]
diff --git a/recipes/bash_org_pl.recipe b/recipes/bash_org_pl.recipe
new file mode 100644
index 0000000000..037870ed6c
--- /dev/null
+++ b/recipes/bash_org_pl.recipe
@@ -0,0 +1,15 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+
+class Bash_org_pl(BasicNewsRecipe):
+ title = u'Bash.org.pl'
+ __author__ = 'fenuks'
+ description = 'Bash.org.pl - funny quotations from IRC discussions'
+ category = 'funny quotations, humour'
+ language = 'pl'
+ oldest_article = 15
+ cover_url = u'http://userlogos.org/files/logos/dzikiosiol/none_0.png'
+ max_articles_per_feed = 100
+ no_stylesheets= True
+ keep_only_tags= [dict(name='div', attrs={'class':'quote post-content post-body'})]
+ feeds = [(u'Cytaty', u'http://bash.org.pl/rss')]
diff --git a/recipes/bbc.recipe b/recipes/bbc.recipe
index 9c8b92f25c..2bccbaf4ae 100644
--- a/recipes/bbc.recipe
+++ b/recipes/bbc.recipe
@@ -36,8 +36,9 @@ class BBC(BasicNewsRecipe):
]
remove_tags = [
- dict(name='div', attrs={'class':['story-feature related narrow', 'share-help', 'embedded-hyper', \
- 'story-feature wide ', 'story-feature narrow']})
+ dict(name='div', attrs={'class':['story-feature related narrow', 'share-help', 'embedded-hyper',
+ 'story-feature wide ', 'story-feature narrow']}),
+ dict(id=['hypertab', 'comment-form']),
]
remove_attributes = ['width','height']
diff --git a/recipes/brasil_de_fato.recipe b/recipes/brasil_de_fato.recipe
new file mode 100644
index 0000000000..d060544ece
--- /dev/null
+++ b/recipes/brasil_de_fato.recipe
@@ -0,0 +1,31 @@
+# -*- coding: utf-8 -*-
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class BrasilDeFato(BasicNewsRecipe):
+ news = True
+ title = u'Brasil de Fato'
+ __author__ = 'Alex Mitrani'
+ description = u'Uma visão popular do Brasil e do mundo.'
+ publisher = u'SOCIEDADE EDITORIAL BRASIL DE FATO'
+ category = 'news, politics, Brazil, rss, Portuguese'
+ oldest_article = 10
+ max_articles_per_feed = 100
+ summary_length = 1000
+ language = 'pt_BR'
+
+ remove_javascript = True
+ no_stylesheets = True
+ use_embedded_content = False
+ remove_empty_feeds = True
+ masthead_url = 'http://www.brasildefato.com.br/sites/default/files/zeropoint_logo.jpg'
+ keep_only_tags = [dict(name='div', attrs={'id':'main'})]
+ remove_tags = [dict(name='div', attrs={'class':'links'})]
+ remove_tags_after = [dict(name='div', attrs={'class':'links'})]
+
+ feeds = [(u'Nacional', u'http://www.brasildefato.com.br/rss_nacional')
+ ,(u'Internacional', u'http://www.brasildefato.com.br/rss_internacional')
+ ,(u'Entrevista', u'http://www.brasildefato.com.br/rss_entrevista')
+ ,(u'Cultura', u'http://www.brasildefato.com.br/rss_cultura')
+ ,(u'Análise', u'http://www.brasildefato.com.br/rss_analise')
+ ]
diff --git a/recipes/bugun_gazetesi.recipe b/recipes/bugun_gazetesi.recipe
new file mode 100644
index 0000000000..0a1d27f517
--- /dev/null
+++ b/recipes/bugun_gazetesi.recipe
@@ -0,0 +1,57 @@
+# -*- coding: utf-8 -*-
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class Bugun (BasicNewsRecipe):
+
+ title = u'BUGÜN Gazetesi'
+ __author__ = u'thomass'
+ oldest_article = 2
+ max_articles_per_feed =100
+ #no_stylesheets = True
+ #delay = 1
+ use_embedded_content = False
+ encoding = 'UTF-8'
+ publisher = 'thomass'
+ category = 'news, haberler,TR,gazete'
+ language = 'tr'
+ publication_type = 'newspaper '
+ extra_css = ' div{font-size: small} h2{font-size: small;font-weight: bold} #ctl00_ortayer_haberBaslik{font-size:20px;font-weight: bold} '#h1{ font-size:10%;font-weight: bold} '#ctl00_ortayer_haberBaslik{ 'font-size:10%;font-weight: bold'}
+ #introduction{} .story-feature{display: block; padding: 0; border: 1px solid; width: 40%; font-size: small} .story-feature h2{text-align: center; text-transform: uppercase} '
+ conversion_options = {
+ 'tags' : category
+ ,'language' : language
+ ,'publisher' : publisher
+ ,'linearize_tables': True
+ }
+ cover_img_url = 'http://www.bugun.com.tr/images/bugunLogo2011.png'
+ masthead_url = 'http://www.bugun.com.tr/images/bugunLogo2011.png'
+
+ keep_only_tags = [dict(name='h1', attrs={'class':[ 'haberBaslik']}),dict(name='h2', attrs={'class':[ 'haberOzet']}), dict(name='div', attrs={'class':['haberGriDivvvv']}), dict(name='div', attrs={'id':[ 'haberTextDiv']}), ]
+
+ #keep_only_tags = [dict(name='div', attrs={'id':[ 'news-detail-content']}), dict(name='td', attrs={'class':['columnist-detail','columnist_head']}) ]
+ #remove_tags = [ dict(name='div', attrs={'id':['news-detail-news-text-font-size','news-detail-gallery','news-detail-news-bottom-social']}),dict(name='div', attrs={'class':['radioEmbedBg','radyoProgramAdi']}),dict(name='a', attrs={'class':['webkit-html-attribute-value webkit-html-external-link']}),dict(name='table', attrs={'id':['yaziYorumTablosu']}),dict(name='img', attrs={'src':['http://medya.zaman.com.tr/pics/paylas.gif','http://medya.zaman.com.tr/extentions/zaman.com.tr/img/columnist/ma-16.png']})]
+
+
+ #remove_attributes = ['width','height']
+ remove_empty_feeds= True
+
+ feeds = [
+ ( u'Son Dakika', u'http://www.bugun.com.tr/haberler.xml'),
+ ( u'Yazarlar', u'http://www.bugun.com.tr/rss/yazarlar.xml'),
+ ( u'Gündem', u'http://www.bugun.com.tr/rss/gundem.xml'),
+ ( u'Ekonomi', u'http://www.bugun.com.tr/rss/ekonomi.xml'),
+ ( u'Spor', u'http://www.bugun.com.tr/rss/spor.xml'),
+ ( u'Magazin', u'http://www.bugun.com.tr/rss/magazin.xml'),
+ ( u'Teknoloji', u'http://www.bugun.com.tr/rss/teknoloji.xml'),
+ ( u'Yaşam', u'http://www.bugun.com.tr/rss/yasam.xml'),
+ ( u'Medya', u'http://www.bugun.com.tr/rss/medya.xml'),
+ ( u'Dünya', u'http://www.bugun.com.tr/rss/dunya.xml'),
+ ( u'Politika', u'http://www.bugun.com.tr/rss/politika.xml'),
+ ( u'Sağlık', u'http://www.bugun.com.tr/rss/saglik.xml'),
+ ( u'Tarifler', u'http://www.bugun.com.tr/rss/yemek-tarifi.xml'),
+
+
+
+
+ ]
diff --git a/recipes/cbn.recipe b/recipes/cbn.recipe
new file mode 100644
index 0000000000..d2ce8dc885
--- /dev/null
+++ b/recipes/cbn.recipe
@@ -0,0 +1,73 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+
+class CBN(BasicNewsRecipe):
+ title = u'CBN News'
+ __author__ = 'Roger'
+ # TODO: This currently downloads 25+ articles, while the online
+ # site only publishes at most 7 articles daily. This needs to be
+ # fixed so that it only downloads at most 7 articles.
+ oldest_article = 7
+ max_articles_per_feed = 100
+
+ description = 'The Christian Broadcasting Network'
+ publisher = 'http://www.cbn.com/'
+ category = 'news, religion, spiritual, christian'
+ language = 'en'
+
+ # Make article titles, author and date bold, italic or small font.
+ # TODO: Could use a smaller title text
+ # TODO: Italicize Author and Publisher?
+ #
+ # http://www.cbn.com/App_Themes/Common/base.css,
+ # http://www.cbn.com/App_Themes/CBNNews/article.css,
+ # ... and many more style sheets.
+ #extra_css = '''
+ # .story_item_headline { font-size: medium; font-weight: bold; }
+ # .story_item_author { font-size: small; font-style:italic; }
+ # .signature_line { font-size: small; }
+ # '''
+
+ remove_javascript = True
+ use_embedded_content = False
+ no_stylesheets = True
+ language = 'en'
+ encoding = 'iso-8859-1'
+ conversion_options = {'linearize_tables':True}
+
+ # TODO: No masthead_url for CBN, using one I grepped from a news article
+ # (There's a better/higher contrast blue on white background image, but
+ # can't get it or it's too big -- embedded into a larger jpeg?)
+ masthead_url = 'http://www.cbn.com/templates/images/cbn_com_logo.jpg'
+
+ keep_only_tags = [
+ dict(name='h1', attrs={'id':'articleTitle'}),
+ dict(name='div', attrs={'class':'articleAuthor'}),
+ dict(name='div', attrs={'class':'articleDate'}),
+ dict(name='div', attrs={'class':'articleText'}),
+ ]
+
+ remove_tags = [
+ # The article image is usually an Adobe Flash Player image.
+ # The snapshot .jpg image files of the video are found
+ # within a URL folder named "PageFiles_Files".
+ # Filter this out for now.
+ # (The majority of images seem to be Adobe Flash.)
+ dict(name='div', attrs={'class':'articleImage'}),
+ ]
+
+
+ # Comment-out or uncomment any of the following RSS feeds according to your
+ # liking.
+ # A full list can be found here: http://www.cbn.com/rss.aspx
+
+ feeds = [
+ (u'World', u'http://www.cbn.com/cbnnews/world/feed/'),
+ (u'US', u'http://www.cbn.com/cbnnews/us/feed/'),
+ (u'Inside Israel', u'http://www.cbn.com/cbnnews/insideisrael/feed/'),
+ (u'Politics', u'http://www.cbn.com/cbnnews/politics/feed/'),
+ (u'Christian World News', u'http://www.cbn.com/cbnnews/shows/cwn/feed/'),
+ (u'Health and Science', u'http://www.cbn.com/cbnnews/healthscience/feed/'),
+ (u'Finance', u'http://www.cbn.com/cbnnews/finance/feed/'),
+ ]
+
diff --git a/recipes/cd_action.recipe b/recipes/cd_action.recipe
new file mode 100644
index 0000000000..b4cf6b326c
--- /dev/null
+++ b/recipes/cd_action.recipe
@@ -0,0 +1,16 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+
+class CD_Action(BasicNewsRecipe):
+ title = u'CD-Action'
+ __author__ = 'fenuks'
+ description = 'cdaction.pl - polish magazine about games site'
+ category = 'games'
+ language = 'pl'
+ oldest_article = 8
+ max_articles_per_feed = 100
+ no_stylesheets= True
+ cover_url =u'http://s.cdaction.pl/obrazki/logo-CD-Action_172k9.JPG'
+ keep_only_tags= dict(id='news_content')
+ remove_tags_after= dict(name='div', attrs={'class':'tresc'})
+ feeds = [(u'Newsy', u'http://www.cdaction.pl/rss_newsy.xml')]
diff --git a/recipes/cnn.recipe b/recipes/cnn.recipe
index ccf47e26d8..096c370706 100644
--- a/recipes/cnn.recipe
+++ b/recipes/cnn.recipe
@@ -28,11 +28,12 @@ class CNN(BasicNewsRecipe):
(re.compile(r'
'
+ else:
+ content = self.get_readable_content(url)
+
+ self.temp_files.append(PersistentTemporaryFile('_fa.html'))
+ self.temp_files[-1].write(content)
+ self.temp_files[-1].close()
+ return self.temp_files[-1].name
+
+ def is_link_wanted(self, url, tag):
+ if url.endswith('.pdf'):
+ return False
+ return True
+
+ def prettyify_url(self, url):
+ return urlparse(url).hostname
+
+ def populate_article_metadata(self, article, soup, first):
+ article.text_summary = self.prettyify_url(article.url)
+ article.summary = article.text_summary
+
+
diff --git a/recipes/hbr.recipe b/recipes/hbr.recipe
index 1152a48784..214ae14f33 100644
--- a/recipes/hbr.recipe
+++ b/recipes/hbr.recipe
@@ -13,6 +13,8 @@ class HBR(BasicNewsRecipe):
no_stylesheets = True
LOGIN_URL = 'http://hbr.org/login?request_url=/'
+ LOGOUT_URL = 'http://hbr.org/logout?request_url=/'
+
INDEX = 'http://hbr.org/archive-toc/BR'
keep_only_tags = [dict(name='div', id='pageContainer')]
@@ -34,6 +36,9 @@ class HBR(BasicNewsRecipe):
def get_browser(self):
br = BasicNewsRecipe.get_browser(self)
+ self.logout_url = None
+
+ #'''
br.open(self.LOGIN_URL)
br.select_form(name='signin-form')
br['signin-form:username'] = self.username
@@ -41,10 +46,13 @@ class HBR(BasicNewsRecipe):
raw = br.submit().read()
if 'My Account' not in raw:
raise Exception('Failed to login, are you sure your username and password are correct?')
- self.logout_url = None
- link = br.find_link(text='Sign out')
- if link:
- self.logout_url = link.absolute_url
+ try:
+ link = br.find_link(text='Sign out')
+ if link:
+ self.logout_url = link.absolute_url
+ except:
+ self.logout_url = self.LOGOUT_URL
+ #'''
return br
def cleanup(self):
@@ -57,6 +65,8 @@ class HBR(BasicNewsRecipe):
def hbr_get_toc(self):
+ #return self.index_to_soup(open('/t/hbr.html').read())
+
today = date.today()
future = today + timedelta(days=30)
for x in [x.strftime('%y%m') for x in (future, today)]:
@@ -66,53 +76,43 @@ class HBR(BasicNewsRecipe):
return soup
raise Exception('Could not find current issue')
- def hbr_parse_section(self, container, feeds):
- current_section = None
- current_articles = []
- for x in container.findAll(name=['li', 'h3', 'h4']):
- if x.name in ['h3', 'h4'] and not x.findAll(True):
- if current_section and current_articles:
- feeds.append((current_section, current_articles))
- current_section = self.tag_to_string(x)
- current_articles = []
- self.log('\tFound section:', current_section)
- if x.name == 'li':
- a = x.find('a', href=True)
- if a is not None:
- title = self.tag_to_string(a)
- url = a.get('href')
- if '/ar/' not in url:
- continue
- if url.startswith('/'):
- url = 'http://hbr.org'+url
- url = self.map_url(url)
- p = x.find('p')
- desc = ''
- if p is not None:
- desc = self.tag_to_string(p)
- if not title or not url:
- continue
- self.log('\t\tFound article:', title)
- self.log('\t\t\t', url)
- self.log('\t\t\t', desc)
- current_articles.append({'title':title, 'url':url,
- 'description':desc, 'date':''})
- if current_section and current_articles:
- feeds.append((current_section, current_articles))
-
-
-
def hbr_parse_toc(self, soup):
feeds = []
- features = soup.find(id='issueFeaturesContent')
- self.hbr_parse_section(features, feeds)
- departments = soup.find(id='issueDepartments')
- self.hbr_parse_section(departments, feeds)
+ current_section = None
+ articles = []
+ for x in soup.find(id='archiveToc').findAll(['h3', 'h4']):
+ if x.name == 'h3':
+ if current_section is not None and articles:
+ feeds.append((current_section, articles))
+ current_section = self.tag_to_string(x).capitalize()
+ articles = []
+ self.log('\tFound section:', current_section)
+ else:
+ a = x.find('a', href=True)
+ if a is None: continue
+ title = self.tag_to_string(a)
+ url = a['href']
+ if '/ar/' not in url:
+ continue
+ if url.startswith('/'):
+ url = 'http://hbr.org' + url
+ url = self.map_url(url)
+ p = x.parent.find('p')
+ desc = ''
+ if p is not None:
+ desc = self.tag_to_string(p)
+ self.log('\t\tFound article:', title)
+ self.log('\t\t\t', url)
+ self.log('\t\t\t', desc)
+
+ articles.append({'title':title, 'url':url, 'description':desc,
+ 'date':''})
return feeds
def parse_index(self):
soup = self.hbr_get_toc()
+ #open('/t/hbr.html', 'wb').write(unicode(soup).encode('utf-8'))
feeds = self.hbr_parse_toc(soup)
return feeds
diff --git a/recipes/hbr_blogs.recipe b/recipes/hbr_blogs.recipe
index acee567d8d..0deaef7a73 100644
--- a/recipes/hbr_blogs.recipe
+++ b/recipes/hbr_blogs.recipe
@@ -6,33 +6,21 @@ class HBR(BasicNewsRecipe):
title = 'Harvard Business Review Blogs'
description = 'To subscribe go to http://hbr.harvardbusiness.org'
needs_subscription = True
- __author__ = 'Kovid Goyal, enhanced by BrianG'
+ __author__ = 'Kovid Goyal'
language = 'en'
no_stylesheets = True
LOGIN_URL = 'http://hbr.org/login?request_url=/'
+ LOGOUT_URL = 'http://hbr.org/logout?request_url=/'
+
INDEX = 'http://hbr.org/current'
- #
- # Blog Stuff
- #
-
-
- INCLUDE_BLOGS = True
- INCLUDE_ARTICLES = False
-
- # option-specific settings.
-
- if INCLUDE_BLOGS == True:
- remove_tags_after = dict(id='articleBody')
- remove_tags_before = dict(id='pageFeature')
- feeds = [('Blog','http://feeds.harvardbusiness.org/harvardbusiness')]
- oldest_article = 30
- max_articles_per_feed = 100
- use_embedded_content = False
- else:
- timefmt = ' [%B %Y]'
-
+ remove_tags_after = dict(id='articleBody')
+ remove_tags_before = dict(id='pageFeature')
+ feeds = [('Blog','http://feeds.harvardbusiness.org/harvardbusiness')]
+ oldest_article = 30
+ max_articles_per_feed = 100
+ use_embedded_content = False
keep_only_tags = [ dict(name='div', id='pageContainer')
]
@@ -41,21 +29,15 @@ class HBR(BasicNewsRecipe):
'articleToolbarTopRD', 'pageRightSubColumn', 'pageRightColumn',
'todayOnHBRListWidget', 'mostWidget', 'keepUpWithHBR',
'articleToolbarTop','articleToolbarBottom', 'articleToolbarRD',
- 'mailingListTout', 'partnerCenter', 'pageFooter']),
- dict(name='iframe')]
+ 'mailingListTout', 'partnerCenter', 'pageFooter', 'shareWidgetTop']),
+ dict(name=['iframe', 'style'])]
- extra_css = '''
- a {font-family:Georgia,"Times New Roman",Times,serif; font-style:italic; color:#000000; }
- .article{font-family:Georgia,"Times New Roman",Times,serif; font-size: xx-small;}
- h2{font-family:Georgia,"Times New Roman",Times,serif; font-weight:bold; font-size:large; }
- h4{font-family:Georgia,"Times New Roman",Times,serif; font-weight:bold; font-size:small; }
- #articleBody{font-family:Georgia,"Times New Roman",Times,serif; font-style:italic; color:#000000;font-size:x-small;}
- #summaryText{font-family:Georgia,"Times New Roman",Times,serif; font-weight:bold; font-size:x-small;}
- '''
-#-------------------------------------------------------------------------------------------------
def get_browser(self):
br = BasicNewsRecipe.get_browser(self)
+ self.logout_url = None
+
+ #'''
br.open(self.LOGIN_URL)
br.select_form(name='signin-form')
br['signin-form:username'] = self.username
@@ -63,11 +45,15 @@ class HBR(BasicNewsRecipe):
raw = br.submit().read()
if 'My Account' not in raw:
raise Exception('Failed to login, are you sure your username and password are correct?')
- self.logout_url = None
- link = br.find_link(text='Sign out')
- if link:
- self.logout_url = link.absolute_url
+ try:
+ link = br.find_link(text='Sign out')
+ if link:
+ self.logout_url = link.absolute_url
+ except:
+ self.logout_url = self.LOGOUT_URL
+ #'''
return br
+
#-------------------------------------------------------------------------------------------------
def cleanup(self):
if self.logout_url is not None:
@@ -76,99 +62,7 @@ class HBR(BasicNewsRecipe):
def map_url(self, url):
if url.endswith('/ar/1'):
return url[:-1]+'pr'
-#-------------------------------------------------------------------------------------------------
- def hbr_get_toc(self):
- soup = self.index_to_soup(self.INDEX)
- url = soup.find('a', text=lambda t:'Full Table of Contents' in t).parent.get('href')
- return self.index_to_soup('http://hbr.org'+url)
-
-#-------------------------------------------------------------------------------------------------
-
- def hbr_parse_section(self, container, feeds):
- current_section = None
- current_articles = []
- for x in container.findAll(name=['li', 'h3', 'h4']):
- if x.name in ['h3', 'h4'] and not x.findAll(True):
- if current_section and current_articles:
- feeds.append((current_section, current_articles))
- current_section = self.tag_to_string(x)
- current_articles = []
- self.log('\tFound section:', current_section)
- if x.name == 'li':
- a = x.find('a', href=True)
- if a is not None:
- title = self.tag_to_string(a)
- url = a.get('href')
- if '/ar/' not in url:
- continue
- if url.startswith('/'):
- url = 'http://hbr.org'+url
- url = self.map_url(url)
- p = x.find('p')
- desc = ''
- if p is not None:
- desc = self.tag_to_string(p)
- if not title or not url:
- continue
- self.log('\t\tFound article:', title)
- self.log('\t\t\t', url)
- self.log('\t\t\t', desc)
- current_articles.append({'title':title, 'url':url,
- 'description':desc, 'date':''})
- if current_section and current_articles:
- feeds.append((current_section, current_articles))
-
-#-------------------------------------------------------------------------------------------------
-
- def hbr_parse_toc(self, soup):
- feeds = []
- features = soup.find(id='issueFeaturesContent')
- self.hbr_parse_section(features, feeds)
- departments = soup.find(id='issueDepartments')
- self.hbr_parse_section(departments, feeds)
- return feeds
-#-------------------------------------------------------------------------------------------------
- def feed_to_index_append(self, feedObject, masterFeed):
- # Loop thru the feed object and build the correct type of article list
- for feed in feedObject:
- # build the correct structure from the feed object
- newArticles = []
- for article in feed.articles:
- newArt = {
- 'title' : article.title,
- 'url' : article.url,
- 'date' : article.date,
- 'description' : article.text_summary
- }
- newArticles.append(newArt)
-
- # Append the earliest/latest dates of the feed to the feed title
- startDate, endDate = self.get_feed_dates(feed, '%d-%b')
- newFeedTitle = feed.title + ' (' + startDate + ' thru ' + endDate + ')'
-
- # append the newly-built list object to the index object passed in
- # as masterFeed.
- masterFeed.append( (newFeedTitle,newArticles) )
-
-#-------------------------------------------------------------------------------------------------
- def get_feed_dates(self, feedObject, dateMask):
- startDate = feedObject.articles[len(feedObject.articles)-1].localtime.strftime(dateMask)
- endDate = feedObject.articles[0].localtime.strftime(dateMask)
-
- return startDate, endDate
-
-#-------------------------------------------------------------------------------------------------
-
- def parse_index(self):
- if self.INCLUDE_ARTICLES == True:
- soup = self.hbr_get_toc()
- feeds = self.hbr_parse_toc(soup)
- else:
- return BasicNewsRecipe.parse_index(self)
-
- return feeds
-#-------------------------------------------------------------------------------------------------
def get_cover_url(self):
cover_url = None
index = 'http://hbr.org/current'
diff --git a/recipes/houston_chronicle.recipe b/recipes/houston_chronicle.recipe
index 3390228455..8d231dac16 100644
--- a/recipes/houston_chronicle.recipe
+++ b/recipes/houston_chronicle.recipe
@@ -1,8 +1,6 @@
#!/usr/bin/env python
# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
-import string, pprint
-
from calibre.web.feeds.news import BasicNewsRecipe
class HoustonChronicle(BasicNewsRecipe):
@@ -13,53 +11,28 @@ class HoustonChronicle(BasicNewsRecipe):
language = 'en'
timefmt = ' [%a, %d %b, %Y]'
no_stylesheets = True
+ use_embedded_content = False
+ remove_attributes = ['style']
- keep_only_tags = [
- dict(id=['story-head', 'story'])
- ]
-
- remove_tags = [
- dict(id=['share-module', 'resource-box',
- 'resource-box-header'])
- ]
-
- extra_css = '''
- h1{font-family :Arial,Helvetica,sans-serif; font-size:large;}
- h2{font-family :Arial,Helvetica,sans-serif; font-size:medium; color:#666666;}
- h3{font-family :Arial,Helvetica,sans-serif; font-size:medium; color:#000000;}
- h4{font-family :Arial,Helvetica,sans-serif; font-size: x-small;}
- p{font-family :Arial,Helvetica,sans-serif; font-size:x-small;}
- #story-head h1{font-family :Arial,Helvetica,sans-serif; font-size: xx-large;}
- #story-head h2{font-family :Arial,Helvetica,sans-serif; font-size: small; color:#000000;}
- #story-head h3{font-family :Arial,Helvetica,sans-serif; font-size: xx-small;}
- #story-head h4{font-family :Arial,Helvetica,sans-serif; font-size: xx-small;}
- #story{font-family :Arial,Helvetica,sans-serif; font-size:xx-small;}
- #Text-TextSubhed BoldCond PoynterAgateZero h3{color:#444444;font-family :Arial,Helvetica,sans-serif; font-size:small;}
- .p260x p{font-family :Arial,Helvetica,serif; font-size:x-small;font-style:italic;}
- .p260x h6{color:#777777;font-family :Arial,Helvetica,sans-serif; font-size:xx-small;}
- '''
-
-
- def parse_index(self):
- categories = ['news', 'sports', 'business', 'entertainment', 'life',
- 'travel']
- feeds = []
- for cat in categories:
- articles = []
- soup = self.index_to_soup('http://www.chron.com/%s/'%cat)
- for elem in soup.findAll(comptype='story', storyid=True):
- a = elem.find('a', href=True)
- if a is None: continue
- url = a['href']
- if not url.startswith('http://'):
- url = 'http://www.chron.com'+url
- articles.append({'title':self.tag_to_string(a), 'url':url,
- 'description':'', 'date':''})
- pprint.pprint(articles[-1])
- if articles:
- feeds.append((string.capwords(cat), articles))
- return feeds
+ oldest_article = 2.0
+ keep_only_tags = {'class':lambda x: x and ('hst-articletitle' in x or
+ 'hst-articletext' in x or 'hst-galleryitem' in x)}
+ feeds = [
+ ('News', "http://www.chron.com/rss/feed/News-270.php"),
+ ('Sports',
+ 'http://www.chron.com/sports/headlines/collectionRss/Sports-Headlines-Staff-Stories-10767.php'),
+ ('Neighborhood',
+ 'http://www.chron.com/rss/feed/Neighborhood-305.php'),
+ ('Business', 'http://www.chron.com/rss/feed/Business-287.php'),
+ ('Entertainment',
+ 'http://www.chron.com/rss/feed/Entertainment-293.php'),
+ ('Editorials',
+ 'http://www.chron.com/opinion/editorials/collectionRss/Opinion-Editorials-Headline-List-10567.php'),
+ ('Life', 'http://www.chron.com/rss/feed/Life-297.php'),
+ ('Science & Tech',
+ 'http://www.chron.com/rss/feed/AP-Technology-and-Science-266.php'),
+ ]
diff --git a/recipes/icons/android_com_pl.png b/recipes/icons/android_com_pl.png
new file mode 100644
index 0000000000..d68bac8810
Binary files /dev/null and b/recipes/icons/android_com_pl.png differ
diff --git a/recipes/icons/bash_org_pl.png b/recipes/icons/bash_org_pl.png
new file mode 100644
index 0000000000..5fc18a38e0
Binary files /dev/null and b/recipes/icons/bash_org_pl.png differ
diff --git a/recipes/icons/cd_action.png b/recipes/icons/cd_action.png
new file mode 100644
index 0000000000..823e09a43e
Binary files /dev/null and b/recipes/icons/cd_action.png differ
diff --git a/recipes/icons/dobreprogamy.png b/recipes/icons/dobreprogamy.png
new file mode 100644
index 0000000000..fcb658cfe1
Binary files /dev/null and b/recipes/icons/dobreprogamy.png differ
diff --git a/recipes/icons/film_web.png b/recipes/icons/film_web.png
new file mode 100644
index 0000000000..3ddcdf1cde
Binary files /dev/null and b/recipes/icons/film_web.png differ
diff --git a/recipes/icons/gram_pl.png b/recipes/icons/gram_pl.png
new file mode 100644
index 0000000000..0a87f28825
Binary files /dev/null and b/recipes/icons/gram_pl.png differ
diff --git a/recipes/icons/niebezpiecznik.png b/recipes/icons/niebezpiecznik.png
new file mode 100644
index 0000000000..4188d91d36
Binary files /dev/null and b/recipes/icons/niebezpiecznik.png differ
diff --git a/recipes/icons/wnp.png b/recipes/icons/wnp.png
new file mode 100644
index 0000000000..3781f671cd
Binary files /dev/null and b/recipes/icons/wnp.png differ
diff --git a/recipes/msdnmag_en.recipe b/recipes/msdnmag_en.recipe
index 77b8da17a8..cf9cfc4f6a 100644
--- a/recipes/msdnmag_en.recipe
+++ b/recipes/msdnmag_en.recipe
@@ -6,11 +6,13 @@ __copyright__ = '2009, Darko Miletic