diff --git a/resources/recipes/20_minutos.recipe b/resources/recipes/20_minutos.recipe
new file mode 100644
index 0000000000..8205c918f5
--- /dev/null
+++ b/resources/recipes/20_minutos.recipe
@@ -0,0 +1,17 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class AdvancedUserRecipe1295310874(BasicNewsRecipe):
+    """Calibre news recipe for the 20 Minutos bulletin (Spanish-language feeds)."""
+    title = u'20 Minutos (Boletin)'
+    description = 'Periódico gratuito en español'
+    __author__ = 'Luis Hernandez'
+    language = 'es'
+    cover_url = 'http://estaticos.20minutos.es/mmedia/especiales/corporativo/css/img/logotipos_grupo20minutos.gif'
+    oldest_article = 2
+    max_articles_per_feed = 50
+    feeds = [
+        (u'VESPERTINO', u'http://20minutos.feedsportal.com/c/32489/f/478284/index.rss'),
+        (u'DEPORTES', u'http://20minutos.feedsportal.com/c/32489/f/478286/index.rss'),
+        (u'CULTURA', u'http://www.20minutos.es/rss/ocio/'),
+        (u'TV', u'http://20minutos.feedsportal.com/c/32489/f/490877/index.rss'),
+    ]
diff --git a/resources/recipes/abc.recipe b/resources/recipes/abc.recipe
new file mode 100644
index 0000000000..c4ae0aa308
--- /dev/null
+++ b/resources/recipes/abc.recipe
@@ -0,0 +1,43 @@
+import re
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class ABCRecipe(BasicNewsRecipe):
+ """Calibre news recipe for ABC Linuxu (language code 'cs')."""
+ title = u'ABC Linuxu'
+ oldest_article = 5
+ max_articles_per_feed = 3#5
+ __author__ = 'Funthomas'
+ language = 'cs'
+
+ # (section title, RSS URL) pairs; the 'Blogy' feed is commented out.
+ feeds = [
+ #(u'Blogy', u'http://www.abclinuxu.cz/auto/blogDigest.rss'),
+ (u'Články', u'http://www.abclinuxu.cz/auto/abc.rss'),
+ (u'Zprávičky','http://www.abclinuxu.cz/auto/zpravicky.rss')
+ ]
+
+ remove_javascript = True
+ no_stylesheets = True
+ remove_attributes = ['width','height']
+
+ # Tag matchers handed to the recipe framework: content before the first
+ # <h1> and the listed CSS classes are the parts targeted for removal.
+ remove_tags_before = dict(name='h1')
+ remove_tags = [
+ dict(attrs={'class':['meta-vypis','page_tools','cl_perex']}),
+ dict(attrs={'class':['cl_nadpis-link','komix-nav']})
+ ]
+
+ # NOTE(review): the last matcher has an empty tag name and an empty
+ # attribute key/value; it looks like a placeholder - confirm it is intended.
+ remove_tags_after = [
+ dict(name='div',attrs={'class':['cl_perex','komix-nav']}),
+ dict(attrs={'class':['meta-vypis','page_tools']}),
+ dict(name='',attrs={'':''}),
+ ]
+
+
+ # NOTE(review): the pattern and replacement literals below span several
+ # lines and appear truncated (markup probably lost in transit) - restore
+ # them from the original recipe before applying this patch.
+ preprocess_regexps = [
+ (re.compile(r'.*
', re.DOTALL),lambda match: '
')
+ ]
+ def print_version(self, url):
+ """Return the print variant of *url* by appending '?varianta=print&noDiz'."""
+ return url + '?varianta=print&noDiz'
+
+ # Extra stylesheet text: heading sizes for h1 and h3.
+ extra_css = '''
+ h1 {font-size:130%; font-weight:bold}
+ h3 {font-size:111%; font-weight:bold}
+ '''
diff --git a/resources/recipes/idnes.recipe b/resources/recipes/idnes.recipe
new file mode 100644
index 0000000000..0bd4de2327
--- /dev/null
+++ b/resources/recipes/idnes.recipe
@@ -0,0 +1,54 @@
+from calibre.web.feeds.recipes import BasicNewsRecipe
+
+class iHeuteRecipe(BasicNewsRecipe):
+    """Calibre news recipe for iDnes.cz (news, Technet and comics; language 'cs')."""
+    __author__ = 'FunThomas'
+    title = u'iDnes.cz'
+    publisher = u'MAFRA a.s.'
+    description = 'iDNES.cz Zprávy, Technet, Komiksy a další'
+    oldest_article = 3
+    max_articles_per_feed = 2
+
+    feeds = [
+        (u'Zprávy', u'http://servis.idnes.cz/rss.asp?c=zpravodaj'),
+        (u'Sport', u'http://servis.idnes.cz/rss.asp?c=sport'),
+        (u'Technet', u'http://servis.idnes.cz/rss.asp?c=technet'),
+        (u'Mobil', u'http://servis.idnes.cz/rss.asp?c=mobil'),
+        (u'Ekonomika', u'http://servis.idnes.cz/rss.asp?c=ekonomikah'),
+        #(u'Kultura', u'http://servis.idnes.cz/rss.asp?c=kultura'),
+        (u'Cestování', u'http://servis.idnes.cz/rss.asp?c=iglobe'),
+        #(u'Kavárna', u'http://servis.idnes.cz/rss.asp?r=kavarna'),
+        (u'Komixy', u'http://servis.idnes.cz/rss.asp?c=komiksy')
+    ]
+
+    encoding = 'cp1250'
+    language = 'cs'
+    cover_url = 'http://g.idnes.cz/u/loga-n4/idnes.gif'
+    remove_javascript = True
+    no_stylesheets = True
+
+    remove_attributes = ['width', 'height']
+    remove_tags = [dict(name='div', attrs={'id': ['zooming']}),
+                   dict(name='div', attrs={'class': ['related', 'mapa-wrapper']}),
+                   dict(name='table', attrs={'id': ['opener-img', 'portal']}),
+                   dict(name='table', attrs={'class': ['video-16ku9']})]
+    remove_tags_after = [dict(name='div', attrs={'id': ['related', 'related2']})]
+
+    keep_only_tags = [dict(name='div', attrs={'class': ['art-full adwords-text', 'dil-day']}),
+                      dict(name='table', attrs={'class': ['kemel-box']})]
+
+    def print_version(self, url):
+        """Return the printer-friendly URL for *url*, or *url* itself if none applies."""
+        parts = url.split('?')
+        if 'dilbert.asp' in parts[0]:  # Dilbert comic: rewrite to the .gif/.aspx form
+            return url.replace('.htm', '.gif&tisk=1').replace('.asp', '.aspx')
+        # Kemel's print page doesn't work, and a URL without a query string has
+        # nothing to hand to tiskni.asp (this used to raise IndexError).
+        if 'kemel.asp' in parts[0] or len(parts) < 2:
+            return url
+        return 'http://zpravy.idnes.cz/tiskni.asp?' + parts[1]
+
+    extra_css = '''
+ h1 {font-size:125%; font-weight:bold}
+ h3 {font-size:110%; font-weight:bold}
+ '''
diff --git a/resources/recipes/la_tribuna.recipe b/resources/recipes/la_tribuna.recipe
new file mode 100644
index 0000000000..11bdda8f3e
--- /dev/null
+++ b/resources/recipes/la_tribuna.recipe
@@ -0,0 +1,29 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class AdvancedUserRecipe1294946868(BasicNewsRecipe):
+    """Calibre news recipe for La Tribuna de Talavera (language code 'es')."""
+    title = u'La Tribuna de Talavera'
+    __author__ = 'Luis Hernández'
+    description = 'Diario de Talavera de la Reina'
+    cover_url = 'http://www.latribunadetalavera.es/entorno/mancheta.gif'
+    language = 'es'
+    encoding = 'utf-8'
+    timefmt = '[%a, %d %b, %Y]'
+
+    oldest_article = 5
+    max_articles_per_feed = 50
+
+    use_embedded_content = False
+    no_stylesheets = True
+    remove_javascript = True
+
+    feeds = [(u'Portada', u'http://www.latribunadetalavera.es/rss.html')]
+
+    # Tag matchers selecting the article container, its photo and its text.
+    keep_only_tags = [
+        dict(name='div', attrs={'id': ['articulo']}),
+        dict(name='div', attrs={'class': ['foto']}),
+        dict(name='p', attrs={'id': ['texto']}),
+    ]
+    remove_tags_before = dict(name='div', attrs={'class': ['comparte']})
+    remove_tags_after = dict(name='div', attrs={'id': ['relacionadas']})
diff --git a/resources/recipes/nytimes_sub.recipe b/resources/recipes/nytimes_sub.recipe
index 2424113e31..863e4b22ba 100644
--- a/resources/recipes/nytimes_sub.recipe
+++ b/resources/recipes/nytimes_sub.recipe
@@ -1,6 +1,5 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
-
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal '
'''
@@ -28,6 +27,10 @@ class NYTimes(BasicNewsRecipe):
# previous paid versions of the new york times to best sent to the back issues folder on the kindle
replaceKindleVersion = False
+ # Download the full-resolution images instead of the small thumbnails normally embedded in the article.
+ # The trade-off is a much larger output file, on the order of 7MB per paper.
+ useHighResImages = True
+
# includeSections: List of sections to include. If empty, all sections found will be included.
# Otherwise, only the sections named will be included. For example,
#
@@ -90,7 +93,6 @@ class NYTimes(BasicNewsRecipe):
(u'Sunday Magazine',u'magazine'),
(u'Week in Review',u'weekinreview')]
-
if headlinesOnly:
title='New York Times Headlines'
description = 'Headlines from the New York Times'
@@ -127,7 +129,7 @@ class NYTimes(BasicNewsRecipe):
earliest_date = date.today() - timedelta(days=oldest_article)
- __author__ = 'GRiker/Kovid Goyal/Nick Redding'
+ __author__ = 'GRiker/Kovid Goyal/Nick Redding/Ben Collier'
language = 'en'
requires_version = (0, 7, 5)
@@ -149,7 +151,7 @@ class NYTimes(BasicNewsRecipe):
'dottedLine',
'entry-meta',
'entry-response module',
- 'icon enlargeThis',
+ #'icon enlargeThis', #removed to provide option for high res images
'leftNavTabs',
'metaFootnote',
'module box nav',
@@ -163,7 +165,23 @@ class NYTimes(BasicNewsRecipe):
'entry-tags', #added for DealBook
'footer promos clearfix', #added for DealBook
'footer links clearfix', #added for DealBook
- 'inlineImage module', #added for DealBook
+ 'tabsContainer', #added for other blog downloads
+ 'column lastColumn', #added for other blog downloads
+ 'pageHeaderWithLabel', #added for other gadgetwise downloads
+ 'column two', #added for other blog downloads
+ 'column two last', #added for other blog downloads
+ 'column three', #added for other blog downloads
+ 'column three last', #added for other blog downloads
+ 'column four',#added for other blog downloads
+ 'column four last',#added for other blog downloads
+ 'column last', #added for other blog downloads
+ 'timestamp published', #added for other blog downloads
+ 'entry entry-related',
+ 'subNavigation tabContent active', #caucus blog navigation
+ 'columnGroup doubleRule',
+ 'mediaOverlay slideshow',
+ 'headlinesOnly multiline flush',
+ 'wideThumb',
re.compile('^subNavigation'),
re.compile('^leaderboard'),
re.compile('^module'),
@@ -254,7 +272,7 @@ class NYTimes(BasicNewsRecipe):
def exclude_url(self,url):
if not url.startswith("http"):
return True
- if not url.endswith(".html") and 'dealbook.nytimes.com' not in url: #added for DealBook
+ if not url.endswith(".html") and 'dealbook.nytimes.com' not in url and 'blogs.nytimes.com' not in url: #added for DealBook
return True
if 'nytimes.com' not in url:
return True
@@ -480,7 +498,7 @@ class NYTimes(BasicNewsRecipe):
for lidiv in div.findAll('li'):
if not skipping:
self.handle_article(lidiv)
-
+
self.ans = [(k, self.articles[k]) for k in self.ans if self.articles.has_key(k)]
return self.filter_ans(self.ans)
@@ -591,20 +609,85 @@ class NYTimes(BasicNewsRecipe):
if article_date < self.earliest_date:
self.log("Skipping article dated %s" % date_str)
return None
+
+ #all articles are from today, no need to print the date on every page
+ try:
+ if not self.webEdition:
+ date_tag = soup.find(True,attrs={'class': ['dateline','date']})
+ if date_tag:
+ date_tag.extract()
+ except:
+ self.log("Error removing the published date")
- kicker_tag = soup.find(attrs={'class':'kicker'})
- if kicker_tag: # remove Op_Ed author head shots
- tagline = self.tag_to_string(kicker_tag)
- if tagline=='Op-Ed Columnist':
- img_div = soup.find('div','inlineImage module')
- if img_div:
- img_div.extract()
-
+ if self.useHighResImages:
+ try:
+ #open up all the "Enlarge this Image" pop-ups and download the full resolution jpegs
+ enlargeThisList = soup.findAll('div',{'class':'icon enlargeThis'})
+ if enlargeThisList:
+ for popupref in enlargeThisList:
+ popupreflink = popupref.find('a')
+ if popupreflink:
+ reflinkstring = str(popupreflink['href'])
+ refstart = reflinkstring.find("javascript:pop_me_up2('") + len("javascript:pop_me_up2('")
+ refend = reflinkstring.find(".html", refstart) + len(".html")
+ reflinkstring = reflinkstring[refstart:refend]
+
+ popuppage = self.browser.open(reflinkstring)
+ popuphtml = popuppage.read()
+ popuppage.close()
+ if popuphtml:
+ st = time.localtime()
+ year = str(st.tm_year)
+ month = "%.2d" % st.tm_mon
+ day = "%.2d" % st.tm_mday
+ imgstartpos = popuphtml.find('http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/') + len('http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/')
+ highResImageLink = 'http://graphics8.nytimes.com/images/' + year + '/' + month +'/' + day +'/' + popuphtml[imgstartpos:popuphtml.find('.jpg',imgstartpos)+4]
+ popupSoup = BeautifulSoup(popuphtml)
+ highResTag = popupSoup.find('img', {'src':highResImageLink})
+ if highResTag:
+ try:
+ newWidth = highResTag['width']
+ newHeight = highResTag['height']
+ imageTag = popupref.parent.find("img")
+ except:
+ self.log("Error: finding width and height of img")
+ popupref.extract()
+ if imageTag:
+ try:
+ imageTag['src'] = highResImageLink
+ imageTag['width'] = newWidth
+ imageTag['height'] = newHeight
+ except:
+ self.log("Error setting the src width and height parameters")
+ except Exception as e:
+ self.log("Error pulling high resolution images")
+
+ try:
+ #remove "Related content" bar
+ runAroundsFound = soup.findAll('div',{'class':['articleInline runaroundLeft','articleInline doubleRule runaroundLeft','articleInline runaroundLeft firstArticleInline']})
+ if runAroundsFound:
+ for runAround in runAroundsFound:
+ #find all section headers
+ hlines = runAround.findAll(True ,{'class':['sectionHeader','sectionHeader flushBottom']})
+ if hlines:
+ for hline in hlines:
+ hline.extract()
+ except:
+ self.log("Error removing related content bar")
+
+
+ try:
+ #in case pulling images failed, delete the enlarge this text
+ enlargeThisList = soup.findAll('div',{'class':'icon enlargeThis'})
+ if enlargeThisList:
+ for popupref in enlargeThisList:
+ popupref.extract()
+ except:
+ self.log("Error removing Enlarge this text")
return self.strip_anchors(soup)
def postprocess_html(self,soup, True):
-
try:
if self.one_picture_per_article:
# Remove all images after first
@@ -766,6 +849,8 @@ class NYTimes(BasicNewsRecipe):
try:
if len(article.text_summary.strip()) == 0:
articlebodies = soup.findAll('div',attrs={'class':'articleBody'})
+ if not articlebodies: #added to account for blog formats
+ articlebodies = soup.findAll('div', attrs={'class':'entry-content'}) #added to account for blog formats
if articlebodies:
for articlebody in articlebodies:
if articlebody:
@@ -774,13 +859,14 @@ class NYTimes(BasicNewsRecipe):
refparagraph = self.massageNCXText(self.tag_to_string(p,use_alt=False)).strip()
#account for blank paragraphs and short paragraphs by appending them to longer ones
if len(refparagraph) > 0:
- if len(refparagraph) > 70: #approximately one line of text
+ if len(refparagraph) > 140: #approximately two lines of text
article.summary = article.text_summary = shortparagraph + refparagraph
return
else:
shortparagraph = refparagraph + " "
if shortparagraph.strip().find(" ") == -1 and not shortparagraph.strip().endswith(":"):
shortparagraph = shortparagraph + "- "
+
except:
self.log("Error creating article descriptions")
return
diff --git a/resources/recipes/root.recipe b/resources/recipes/root.recipe
new file mode 100644
index 0000000000..da065829a7
--- /dev/null
+++ b/resources/recipes/root.recipe
@@ -0,0 +1,39 @@
+import re
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class AdvancedUserRecipe1289939440(BasicNewsRecipe):
+ __author__ = 'FunThomas'
+ title = u'Root.cz'
+ description = u'Zprávičky a články z Root.cz'
+ publisher = u'Internet Info, s.r.o'
+ oldest_article = 2 #max stari clanku ve dnech
+ max_articles_per_feed = 50 #max pocet clanku na feed
+
+ feeds = [
+ (u'Články', u'http://www.root.cz/rss/clanky/'),
+ (u'Zprávičky', u'http://www.root.cz/rss/zpravicky/')
+ ]
+
+ publication_type = u'magazine'
+ language = u'cs'
+ no_stylesheets = True
+ remove_javascript = True
+ cover_url = u'http://i.iinfo.cz/urs/logo-root-bila-oranzova-cerna-111089527143118.gif'
+
+ remove_attributes = ['width','height','href'] #,'href'
+ keep_only_tags = [
+ dict(name='h1'),
+ dict(name='a',attrs={'class':'author'}),
+ dict(name='p', attrs={'class':'intro'}),
+ dict(name='div',attrs={'class':'urs'})
+ ]
+
+ preprocess_regexps = [
+ (re.compile(u'[^<]*
]*>', re.DOTALL),lambda match: '
'),
+ (re.compile(u'