.*?', re.DOTALL|re.IGNORECASE),lambda match: '')
- ]
+ #preprocess_regexps = [
+ # (re.compile(r'
.*?', re.DOTALL|re.IGNORECASE),lambda match: '')
+ # ]
keep_only_tags = [dict(name='div', attrs={'id':['story','etc-story']})]
@@ -42,7 +48,7 @@ class ArsTechnica2(BasicNewsRecipe):
dict(name=['object','link','embed'])
,dict(name='div', attrs={'class':'read-more-link'})
]
- remove_attributes=['width','height']
+ #remove_attributes=['width','height']
feeds = [
(u'Infinite Loop (Apple content)' , u'http://feeds.arstechnica.com/arstechnica/apple/' )
@@ -56,6 +62,7 @@ class ArsTechnica2(BasicNewsRecipe):
,(u'Law & Disorder (Tech policy content)' , u'http://feeds.arstechnica.com/arstechnica/tech-policy/')
]
+ # This deals with multi-page stories
def append_page(self, soup, appendtag, position):
pager = soup.find('div',attrs={'class':'pager'})
if pager:
@@ -81,6 +88,7 @@ class ArsTechnica2(BasicNewsRecipe):
def preprocess_html(self, soup):
+ # Adds line breaks near the byline (not sure why this is needed)
ftag = soup.find('div', attrs={'class':'byline'})
if ftag:
brtag = Tag(soup,'br')
@@ -88,12 +96,33 @@ class ArsTechnica2(BasicNewsRecipe):
ftag.insert(4,brtag)
ftag.insert(5,brtag2)
+ # Remove style items
for item in soup.findAll(style=True):
del item['style']
+ # Remove id
+ for item in soup.findAll(id=True):
+ del item['id']
+
+ # For some reason, links to authors don't include the domain name
+ a_author = soup.find('a',{'href':re.compile("^/author")})
+ if a_author:
+ a_author['href'] = 'http://arstechnica.com'+a_author['href']
+
+ # TODO: within div class news-item-figure, we still need to grab images
+
+ # Deal with multi-page stories
self.append_page(soup, soup.body, 3)
return soup
def get_article_url(self, article):
+ # When ignoreEtcArticles is set, skip (return None for) articles whose title starts with 'Etc: '
+ if self.ignoreEtcArticles:
+ article_title = article.get('title', None) or ''  # a missing title must not crash the check
+ if article_title.startswith('Etc: '):
+ return None
+
+ # The actual article URL is in the guid tag; take the part before its last '?'
return article.get('guid', None).rpartition('?')[0]
+
diff --git a/resources/recipes/blic.recipe b/resources/recipes/blic.recipe
index 0c955bebde..384518ec13 100644
--- a/resources/recipes/blic.recipe
+++ b/resources/recipes/blic.recipe
@@ -1,6 +1,6 @@
__license__ = 'GPL v3'
-__copyright__ = '2008-2010, Darko Miletic '
+__copyright__ = '2008-2011, Darko Miletic '
'''
blic.rs
'''
@@ -21,21 +21,53 @@ class Blic(BasicNewsRecipe):
masthead_url = 'http://www.blic.rs/resources/images/header/header_back.png'
language = 'sr'
publication_type = 'newspaper'
- extra_css = '@font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)} @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)} body{font-family: Georgia, serif1, serif} .article_description{font-family: Arial, sans1, sans-serif} .img_full{float: none} img{margin-bottom: 0.8em} '
+ extra_css = """
+ @font-face {font-family: "serif1";src:url(res:///opt/sony/ebook/FONT/tt0011m_.ttf)}
+ @font-face {font-family: "sans1";src:url(res:///opt/sony/ebook/FONT/tt0003m_.ttf)}
+ body{font-family: Georgia, serif1, serif}
+ .articledescription,#nadnaslov,.article_info{font-family: Arial, sans1, sans-serif}
+ .img_full{float: none}
+ #nadnaslov{font-size: small}
+ #article_lead{font-size: 1.5em}
+ h1{color: red}
+ .potpis{font-size: x-small; color: gray}
+ .article_info{font-size: small}
+ img{margin-bottom: 0.8em; margin-top: 0.8em; display: block}
+ """
conversion_options = {
'comment' : description
, 'tags' : category
, 'publisher': publisher
, 'language' : language
+ , 'linearize_tables' : True
}
preprocess_regexps = [(re.compile(u'\u0110'), lambda match: u'\u00D0')]
remove_tags_before = dict(name='div', attrs={'id':'article_info'})
- remove_tags = [dict(name=['object','link'])]
- remove_attributes = ['width','height']
+ remove_tags = [dict(name=['object','link','meta','base','object','embed'])]
+ remove_attributes = ['width','height','m_id','m_ext','mlg_id','poll_id','v_id']
- feeds = [(u'Danasnje Vesti', u'http://www.blic.rs/rss/danasnje-vesti')]
+ feeds = [
+ (u'Politika' , u'http://www.blic.rs/rss/Vesti/Politika')
+ ,(u'Tema Dana' , u'http://www.blic.rs/rss/Vesti/Tema-Dana')
+ ,(u'Svet' , u'http://www.blic.rs/rss/Vesti/Svet')
+ ,(u'Drustvo' , u'http://www.blic.rs/rss/Vesti/Drustvo')
+ ,(u'Ekonomija' , u'http://www.blic.rs/rss/Vesti/Ekonomija')
+ ,(u'Hronika' , u'http://www.blic.rs/rss/Vesti/Hronika')
+ ,(u'Beograd' , u'http://www.blic.rs/rss/Vesti/Beograd')
+ ,(u'Srbija' , u'http://www.blic.rs/rss/Vesti/Srbija')
+ ,(u'Vojvodina' , u'http://www.blic.rs/rss/Vesti/Vojvodina')
+ ,(u'Republika Srpska' , u'http://www.blic.rs/rss/Vesti/Republika-Srpska')
+ ,(u'Reportaza' , u'http://www.blic.rs/rss/Vesti/Reportaza')
+ ,(u'Dodatak' , u'http://www.blic.rs/rss/Vesti/Dodatak')
+ ,(u'Zabava' , u'http://www.blic.rs/rss/Zabava')
+ ,(u'Kultura' , u'http://www.blic.rs/rss/Kultura')
+ ,(u'Slobodno Vreme' , u'http://www.blic.rs/rss/Slobodno-vreme')
+ ,(u'IT' , u'http://www.blic.rs/rss/IT')
+ ,(u'Komentar' , u'http://www.blic.rs/rss/Komentar')
+ ,(u'Intervju' , u'http://www.blic.rs/rss/Intervju')
+ ]
def print_version(self, url):
@@ -44,4 +76,4 @@ class Blic(BasicNewsRecipe):
def preprocess_html(self, soup):
for item in soup.findAll(style=True):
del item['style']
- return self.adeify_images(soup)
+ return soup
diff --git a/resources/recipes/cicero.recipe b/resources/recipes/cicero.recipe
new file mode 100644
index 0000000000..2df6b68000
--- /dev/null
+++ b/resources/recipes/cicero.recipe
@@ -0,0 +1,35 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class Cicero(BasicNewsRecipe):
+ timefmt = ' [%Y-%m-%d]'
+ title = u'Cicero'
+ __author__ = 'mad@sharktooth.de'
+ description = u'Magazin f\xfcr politische Kultur'
+ oldest_article = 7
+ language = 'de'
+ max_articles_per_feed = 100
+ no_stylesheets = True
+ use_embedded_content = False
+ publisher = 'Ringier Publishing'
+ category = 'news, politics, Germany'
+ encoding = 'iso-8859-1'
+ publication_type = 'magazine'
+ masthead_url = 'http://www.cicero.de/img2/cicero_logo_rss.gif'
+ feeds = [
+(u'Das gesamte Portfolio', u'http://www.cicero.de/rss/rss.php?ress_id='),
+#(u'Alle Heft-Inhalte', u'http://www.cicero.de/rss/rss.php?ress_id=heft'),
+#(u'Alle Online-Inhalte', u'http://www.cicero.de/rss/rss.php?ress_id=online'),
+#(u'Berliner Republik', u'http://www.cicero.de/rss/rss.php?ress_id=4'),
+#(u'Weltb\xfchne', u'http://www.cicero.de/rss/rss.php?ress_id=1'),
+#(u'Salon', u'http://www.cicero.de/rss/rss.php?ress_id=7'),
+#(u'Kapital', u'http://www.cicero.de/rss/rss.php?ress_id=6'),
+#(u'Netzst\xfccke', u'http://www.cicero.de/rss/rss.php?ress_id=9'),
+#(u'Leinwand', u'http://www.cicero.de/rss/rss.php?ress_id=12'),
+#(u'Bibliothek', u'http://www.cicero.de/rss/rss.php?ress_id=15'),
+(u'Kolumne - Alle Kolumnen', u'http://www.cicero.de/rss/rss2.php?ress_id='),
+#(u'Kolumne - Schreiber, Berlin', u'http://www.cicero.de/rss/rss2.php?ress_id=35'),
+#(u'Kolumne - TV Kritik', u'http://www.cicero.de/rss/rss2.php?ress_id=34')
+]
+
+ def print_version(self, url):
+ return 'http://www.cicero.de/page_print.php?' + url.rpartition('?')[2]
diff --git a/resources/recipes/cnetjapan.recipe b/resources/recipes/cnetjapan.recipe
index 1058b90401..b57bce5b97 100644
--- a/resources/recipes/cnetjapan.recipe
+++ b/resources/recipes/cnetjapan.recipe
@@ -11,7 +11,7 @@ class CNetJapan(BasicNewsRecipe):
(u'CNet Blog', u'http://feed.japan.cnet.com/rss/blog/index.rdf')
]
language = 'ja'
- encoding = 'Shift_JIS'
+ encoding = 'utf-8'
remove_javascript = True
preprocess_regexps = [
diff --git a/resources/recipes/dallas.recipe b/resources/recipes/dallas.recipe
index 8666fbef30..d46427caa9 100644
--- a/resources/recipes/dallas.recipe
+++ b/resources/recipes/dallas.recipe
@@ -7,22 +7,29 @@ class DallasNews(BasicNewsRecipe):
max_articles_per_feed = 25
no_stylesheets = True
- remove_tags_before = dict(name='h2', attrs={'class':'vitstoryheadline'})
- remove_tags_after = dict(name='div', attrs={'style':'width: 100%; clear: right'})
- remove_tags_after = dict(name='div', attrs={'id':'article_tools_bottom'})
+ use_embedded_content = False
+ remove_tags_before = dict(name='h1')
+ keep_only_tags = {'class':lambda x: x and 'article' in x}
remove_tags = [
- dict(name='iframe'),
- dict(name='div', attrs={'class':'biblockmore'}),
- dict(name='div', attrs={'style':'width: 100%; clear: right'}),
- dict(name='div', attrs={'id':'article_tools_bottom'}),
- #dict(name='ul', attrs={'class':'articleTools'}),
+ {'class':['DMNSocialTools', 'article ', 'article first ', 'article premium']},
]
feeds = [
- ('Latest News', 'http://www.dallasnews.com/newskiosk/rss/dallasnewslatestnews.xml'),
- ('Local News', 'http://www.dallasnews.com/newskiosk/rss/dallasnewslocalnews.xml'),
- ('Nation and World', 'http://www.dallasnews.com/newskiosk/rss/dallasnewsnationworld.xml'),
- ('Politics', 'http://www.dallasnews.com/newskiosk/rss/dallasnewsnationalpolitics.xml'),
- ('Science', 'http://www.dallasnews.com/newskiosk/rss/dallasnewsscience.xml'),
+ ('Local News',
+ 'http://www.dallasnews.com/news/politics/local-politics/?rss'),
+ ('National Politics',
+ 'http://www.dallasnews.com/news/politics/national-politic/?rss'),
+ ('State Politics',
+ 'http://www.dallasnews.com/news/politics/state-politics/?rss'),
+ ('Religion',
+ 'http://www.dallasnews.com/news/religion/?rss'),
+ ('Crime',
+ 'http://www.dallasnews.com/news/crime/headlines/?rss'),
+ ('Celebrity News',
+ 'http://www.dallasnews.com/entertainment/celebrity-news/?rss&listname=TopStories'),
+ ('Nation',
+ 'http://www.dallasnews.com/news/nation-world/nation/?rss'),
+ ('World',
+ 'http://www.dallasnews.com/news/nation-world/world/?rss'),
]
diff --git a/resources/recipes/deia.recipe b/resources/recipes/deia.recipe
index 980d59d3d1..5d39be9a10 100644
--- a/resources/recipes/deia.recipe
+++ b/resources/recipes/deia.recipe
@@ -22,7 +22,7 @@ class Deia(BasicNewsRecipe):
cover_url ='http://2.bp.blogspot.com/_RjrWzC6tI14/TM6jrPLaBZI/AAAAAAAAFaI/ayffwxidFEY/s1600/2009-10-13-logo-deia.jpg'
timefmt ='[%a, %d %b, %Y]'
encoding ='utf8'
- language ='es_ES'
+ language ='es'
remove_javascript =True
remove_tags_after =dict(id='Texto')
remove_tags_before =dict(id='Texto')
diff --git a/resources/recipes/dilbert.recipe b/resources/recipes/dilbert.recipe
index 2c3268da2f..56aa4af8c9 100644
--- a/resources/recipes/dilbert.recipe
+++ b/resources/recipes/dilbert.recipe
@@ -28,7 +28,7 @@ class DilbertBig(BasicNewsRecipe):
,'publisher' : publisher
}
- feeds = [(u'Dilbert', u'http://feeds.dilbert.com/DilbertDailyStrip' )]
+ feeds = [(u'Dilbert', u'http://feed.dilbert.com/dilbert/daily_strip' )]
def get_article_url(self, article):
return article.get('feedburner_origlink', None)
diff --git a/resources/recipes/economist.recipe b/resources/recipes/economist.recipe
index 01ee8e0baf..95b4a2ae05 100644
--- a/resources/recipes/economist.recipe
+++ b/resources/recipes/economist.recipe
@@ -9,7 +9,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.ebooks.BeautifulSoup import Tag, NavigableString
-import mechanize, string, urllib, time, re
+import string, time, re
class Economist(BasicNewsRecipe):
@@ -18,19 +18,19 @@ class Economist(BasicNewsRecipe):
__author__ = "Kovid Goyal"
INDEX = 'http://www.economist.com/printedition'
- description = ('Global news and current affairs from a European perspective.'
- ' Needs a subscription from ')+INDEX
+ description = 'Global news and current affairs from a European perspective.'
oldest_article = 7.0
cover_url = 'http://www.economist.com/images/covers/currentcoverus_large.jpg'
remove_tags = [dict(name=['script', 'noscript', 'title', 'iframe', 'cf_floatingcontent']),
dict(attrs={'class':['dblClkTrk', 'ec-article-info']})]
keep_only_tags = [dict(id='ec-article-body')]
- needs_subscription = True
+ needs_subscription = False
no_stylesheets = True
preprocess_regexps = [(re.compile('