diff --git a/Changelog.yaml b/Changelog.yaml
index 43eb775233..17f3ebcf97 100644
--- a/Changelog.yaml
+++ b/Changelog.yaml
@@ -19,6 +19,51 @@
# new recipes:
# - title:
+- version: 0.8.47
+ date: 2012-04-13
+
+ new features:
+ - title: "Conversion pipeline: Add support for all the named entities in the HTML 5 spec."
+ tickets: [976056]
+
+ - title: "Support for viewing and converting the Haodoo PDB ebook format"
+ tickets: [976478]
+
+ - title: "Device driver for Laser EB720"
+
+ bug fixes:
+ - title: "Fix regression in automatic adding in 0.8.46 that broke automatic adding if adding of duplicates is enabled and auto convert is also enabled"
+ tickets: [976336]
+
+ - title: 'Fix "Tags" field in advanced search does not obey regex setting'
+ tickets: [980221]
+
+ - title: "EPUB Input: Automatically extract cover image from simple HTML title page that consists of only a single <img> tag, instead of rendering the page"
+
+ - title: "Prevent errors when both author and author_sort are used in a template for reading metadata from filenames for files on a device"
+
+ - title: "Amazon metadata download: Handle books whose titles start with a bracket."
+ tickets: [976365]
+
+ - title: "Get Books: Fix downloading of purchased books from Baen"
+ tickets: [975929]
+
+
+ improved recipes:
+ - Forbes
+ - Caros Amigos
+ - Trouw
+ - Sun UK
+ - Metro
+ - Daily Mirror
+
+ new recipes:
+ - title: "Melbourne Herald Sun"
+ author: Ray Hartley
+
+ - title: "Editoriali and Zerocalcare"
+ author: faber1971
+
- version: 0.8.46
date: 2012-04-06
diff --git a/recipes/adventure_zone_pl.recipe b/recipes/adventure_zone_pl.recipe
index 65f4e3e52d..bb311606ac 100644
--- a/recipes/adventure_zone_pl.recipe
+++ b/recipes/adventure_zone_pl.recipe
@@ -9,6 +9,7 @@ class Adventure_zone(BasicNewsRecipe):
no_stylesheets = True
oldest_article = 20
max_articles_per_feed = 100
+ index='http://www.adventure-zone.info/fusion/'
use_embedded_content=False
preprocess_regexps = [(re.compile(r"
Komentarze | ", re.IGNORECASE), lambda m: '')]
remove_tags_before= dict(name='td', attrs={'class':'main-bg'})
@@ -45,6 +46,19 @@ class Adventure_zone(BasicNewsRecipe):
skip_tag = skip_tag.findAll(name='a')
for r in skip_tag:
if r.strong:
- word=r.strong.string
- if word and (('zapowied' in word) or ('recenzj' in word) or ('solucj' in word)):
- return self.index_to_soup('http://www.adventure-zone.info/fusion/print.php?type=A&item'+r['href'][r['href'].find('article_id')+7:], raw=True)
\ No newline at end of file
+ word=r.strong.string.lower()
+ if word and (('zapowied' in word) or ('recenzj' in word) or ('solucj' in word) or ('poradnik' in word)):
+ return self.index_to_soup('http://www.adventure-zone.info/fusion/print.php?type=A&item'+r['href'][r['href'].find('article_id')+7:], raw=True)
+
+ def preprocess_html(self, soup):
+ footer=soup.find(attrs={'class':'news-footer middle-border'})
+ if footer and len(footer('a'))>=2:
+ footer('a')[1].extract()
+ for item in soup.findAll(style=True):
+ del item['style']
+ for a in soup('a'):
+ if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
+ a['href']=self.index + a['href']
+ return soup
+
+
\ No newline at end of file
diff --git a/recipes/benchmark_pl.recipe b/recipes/benchmark_pl.recipe
index cc74cc9128..00eea1be68 100644
--- a/recipes/benchmark_pl.recipe
+++ b/recipes/benchmark_pl.recipe
@@ -68,4 +68,7 @@ class Benchmark_pl(BasicNewsRecipe):
self.image_article(soup, soup.body)
else:
self.append_page(soup, soup.body)
+ for a in soup('a'):
+ if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
+ a['href']=self.INDEX + a['href']
return soup
diff --git a/recipes/calgary_herald.recipe b/recipes/calgary_herald.recipe
index dc919a76f8..12134bc9a4 100644
--- a/recipes/calgary_herald.recipe
+++ b/recipes/calgary_herald.recipe
@@ -1,220 +1,35 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-__license__ = 'GPL v3'
-
-'''
-www.canada.com
-'''
-
-import re
-from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
-
-
-class CanWestPaper(BasicNewsRecipe):
-
- # un-comment the following four lines for the Victoria Times Colonist
-## title = u'Victoria Times Colonist'
-## url_prefix = 'http://www.timescolonist.com'
-## description = u'News from Victoria, BC'
-## fp_tag = 'CAN_TC'
-
- # un-comment the following four lines for the Vancouver Province
-## title = u'Vancouver Province'
-## url_prefix = 'http://www.theprovince.com'
-## description = u'News from Vancouver, BC'
-## fp_tag = 'CAN_VP'
-
- # un-comment the following four lines for the Vancouver Sun
-## title = u'Vancouver Sun'
-## url_prefix = 'http://www.vancouversun.com'
-## description = u'News from Vancouver, BC'
-## fp_tag = 'CAN_VS'
-
- # un-comment the following four lines for the Edmonton Journal
-## title = u'Edmonton Journal'
-## url_prefix = 'http://www.edmontonjournal.com'
-## description = u'News from Edmonton, AB'
-## fp_tag = 'CAN_EJ'
-
- # un-comment the following four lines for the Calgary Herald
- title = u'Calgary Herald'
- url_prefix = 'http://www.calgaryherald.com'
- description = u'News from Calgary, AB'
- fp_tag = 'CAN_CH'
-
- # un-comment the following four lines for the Regina Leader-Post
-## title = u'Regina Leader-Post'
-## url_prefix = 'http://www.leaderpost.com'
-## description = u'News from Regina, SK'
-## fp_tag = ''
-
- # un-comment the following four lines for the Saskatoon Star-Phoenix
-## title = u'Saskatoon Star-Phoenix'
-## url_prefix = 'http://www.thestarphoenix.com'
-## description = u'News from Saskatoon, SK'
-## fp_tag = ''
-
- # un-comment the following four lines for the Windsor Star
-## title = u'Windsor Star'
-## url_prefix = 'http://www.windsorstar.com'
-## description = u'News from Windsor, ON'
-## fp_tag = 'CAN_'
-
- # un-comment the following four lines for the Ottawa Citizen
-## title = u'Ottawa Citizen'
-## url_prefix = 'http://www.ottawacitizen.com'
-## description = u'News from Ottawa, ON'
-## fp_tag = 'CAN_OC'
-
- # un-comment the following four lines for the Montreal Gazette
-## title = u'Montreal Gazette'
-## url_prefix = 'http://www.montrealgazette.com'
-## description = u'News from Montreal, QC'
-## fp_tag = 'CAN_MG'
-
-
- language = 'en_CA'
- __author__ = 'Nick Redding'
- no_stylesheets = True
- timefmt = ' [%b %d]'
- extra_css = '''
- .timestamp { font-size:xx-small; display: block; }
- #storyheader { font-size: medium; }
- #storyheader h1 { font-size: x-large; }
- #storyheader h2 { font-size: large; font-style: italic; }
- .byline { font-size:xx-small; }
- #photocaption { font-size: small; font-style: italic }
- #photocredit { font-size: xx-small; }'''
- keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})]
- remove_tags = [{'class':'comments'},
- dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
- dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
- dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
- dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
- dict(name='div', attrs={'class':'rule_grey_solid'}),
- dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
-
- def get_cover_url(self):
- from datetime import timedelta, date
- if self.fp_tag=='':
- return None
- cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.fp_tag+'.jpg'
- br = BasicNewsRecipe.get_browser()
- daysback=1
- try:
- br.open(cover)
- except:
- while daysback<7:
- cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str((date.today() - timedelta(days=daysback)).day)+'/lg/'+self.fp_tag+'.jpg'
- br = BasicNewsRecipe.get_browser()
- try:
- br.open(cover)
- except:
- daysback = daysback+1
- continue
- break
- if daysback==7:
- self.log("\nCover unavailable")
- cover = None
- return cover
-
- def fixChars(self,string):
- # Replace lsquo (\x91)
- fixed = re.sub("\x91","‘",string)
- # Replace rsquo (\x92)
- fixed = re.sub("\x92","’",fixed)
- # Replace ldquo (\x93)
- fixed = re.sub("\x93","“",fixed)
- # Replace rdquo (\x94)
- fixed = re.sub("\x94","”",fixed)
- # Replace ndash (\x96)
- fixed = re.sub("\x96","–",fixed)
- # Replace mdash (\x97)
- fixed = re.sub("\x97","—",fixed)
- fixed = re.sub("’","’",fixed)
- return fixed
-
- def massageNCXText(self, description):
- # Kindle TOC descriptions won't render certain characters
- if description:
- massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
- # Replace '&' with '&'
- massaged = re.sub("&","&", massaged)
- return self.fixChars(massaged)
- else:
- return description
-
- def populate_article_metadata(self, article, soup, first):
- if first:
- picdiv = soup.find('body').find('img')
- if picdiv is not None:
- self.add_toc_thumbnail(article,re.sub(r'links\\link\d+\\','',picdiv['src']))
- xtitle = article.text_summary.strip()
- if len(xtitle) == 0:
- desc = soup.find('meta',attrs={'property':'og:description'})
- if desc is not None:
- article.summary = article.text_summary = desc['content']
-
- def strip_anchors(self,soup):
- paras = soup.findAll(True)
- for para in paras:
- aTags = para.findAll('a')
- for a in aTags:
- if a.img is None:
- a.replaceWith(a.renderContents().decode('cp1252','replace'))
- return soup
-
- def preprocess_html(self, soup):
- return self.strip_anchors(soup)
-
-
-
- def parse_index(self):
- soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')
-
- articles = {}
- key = 'News'
- ans = ['News']
-
- # Find each instance of class="sectiontitle", class="featurecontent"
- for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}):
- #self.log(" div class = %s" % divtag['class'])
- if divtag['class'].startswith('section_title'):
- # div contains section title
- if not divtag.h3:
- continue
- key = self.tag_to_string(divtag.h3,False)
- ans.append(key)
- self.log("Section name %s" % key)
- continue
- # div contains article data
- h1tag = divtag.find('h1')
- if not h1tag:
- continue
- atag = h1tag.find('a',href=True)
- if not atag:
- continue
- url = self.url_prefix+'/news/todays-paper/'+atag['href']
- #self.log("Section %s" % key)
- #self.log("url %s" % url)
- title = self.tag_to_string(atag,False)
- #self.log("title %s" % title)
- pubdate = ''
- description = ''
- ptag = divtag.find('p');
- if ptag:
- description = self.tag_to_string(ptag,False)
- #self.log("description %s" % description)
- author = ''
- autag = divtag.find('h4')
- if autag:
- author = self.tag_to_string(autag,False)
- #self.log("author %s" % author)
- if not articles.has_key(key):
- articles[key] = []
- articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content=''))
-
- ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
- return ans
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class CalgaryHerald(BasicNewsRecipe):
+ title = u'Calgary Herald'
+ oldest_article = 3
+ max_articles_per_feed = 100
+
+ feeds = [
+ (u'News', u'http://rss.canada.com/get/?F233'),
+ (u'Calgary', u'http://www.calgaryherald.com/scripts/sp6query.aspx?catalog=cahr&tags=keyword|calgary&output=rss?link=http%3a%2f%2fwww.calgaryherald'),
+ (u'Alberta', u'http://www.calgaryherald.com/scripts/Sp6Query.aspx?catalog=CAHR&tags=Keyword|Alberta&output=rss?link=http%3A%2F%2Fwww.calgaryherald.com%2Fnews%2Falberta%2Findex.html'),
+ (u'Politics', u'http://rss.canada.com/get/?F7551'),
+ (u'National', u'http://rss.canada.com/get/?F7552'),
+ (u'World', u'http://rss.canada.com/get/?F7553'),
+ ]
+ __author__ = 'rty'
+ publisher = 'Calgary Herald'
+ description = 'Calgary, Alberta, Canada'
+ category = 'News, Calgary, Alberta, Canada'
+
+
+ remove_javascript = True
+ use_embedded_content = False
+ no_stylesheets = True
+ language = 'en_CA'
+ encoding = 'utf-8'
+ conversion_options = {'linearize_tables':True}
+ ##masthead_url = 'http://www.calgaryherald.com/index.html'
+ keep_only_tags = [
+ dict(name='div', attrs={'id':'storyheader'}),
+ dict(name='div', attrs={'id':'storycontent'})
+
+ ]
+ remove_tags_after = {'class':"story_tool_hr"}
+
diff --git a/recipes/camera_di_commercio_di_bari.recipe b/recipes/camera_di_commercio_di_bari.recipe
new file mode 100644
index 0000000000..c80a825883
--- /dev/null
+++ b/recipes/camera_di_commercio_di_bari.recipe
@@ -0,0 +1,17 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class AdvancedUserRecipe1331729727(BasicNewsRecipe):
+ title = u'Camera di Commercio di Bari'
+ oldest_article = 7
+ __author__ = 'faber1971'
+ description = 'News from the Chamber of Commerce of Bari'
+ language = 'it'
+ max_articles_per_feed = 100
+ auto_cleanup = True
+ masthead_url = 'http://www.ba.camcom.it/grafica/layout-bordo/logo_camcom_bari.png'
+ feeds = [(u'Camera di Commercio di Bari', u'http://feed43.com/4715147488845101.xml')]
+
+__license__ = 'GPL v3'
+__copyright__ = '2012, faber1971'
+__version__ = 'v1.00'
+__date__ = '17, April 2012'
diff --git a/recipes/cd_action.recipe b/recipes/cd_action.recipe
index ff46774dc9..4e19fbc6c1 100644
--- a/recipes/cd_action.recipe
+++ b/recipes/cd_action.recipe
@@ -6,6 +6,7 @@ class CD_Action(BasicNewsRecipe):
description = 'cdaction.pl - polish games magazine site'
category = 'games'
language = 'pl'
+ index='http://www.cdaction.pl'
oldest_article = 8
max_articles_per_feed = 100
no_stylesheets= True
@@ -17,4 +18,10 @@ class CD_Action(BasicNewsRecipe):
def get_cover_url(self):
soup = self.index_to_soup('http://www.cdaction.pl/magazyn/')
self.cover_url='http://www.cdaction.pl'+ soup.find(id='wspolnik').div.a['href']
- return getattr(self, 'cover_url', self.cover_url)
\ No newline at end of file
+ return getattr(self, 'cover_url', self.cover_url)
+
+ def preprocess_html(self, soup):
+ for a in soup('a'):
+ if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
+ a['href']=self.index + a['href']
+ return soup
\ No newline at end of file
diff --git a/recipes/countryfile.recipe b/recipes/countryfile.recipe
index 7a41b5b905..0502129791 100644
--- a/recipes/countryfile.recipe
+++ b/recipes/countryfile.recipe
@@ -1,11 +1,12 @@
+from calibre import browser
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1325006965(BasicNewsRecipe):
title = u'Countryfile.com'
- cover_url = 'http://www.buysubscriptions.com/static_content/the-immediate/en/images/covers/CFIL_maxi.jpg'
+ #cover_url = 'http://www.countryfile.com/sites/default/files/imagecache/160px_wide/cover/2_1.jpg'
__author__ = 'Dave Asbury'
description = 'The official website of Countryfile Magazine'
- # last updated 29/1/12
+ # last updated 15/4/12
language = 'en_GB'
oldest_article = 30
max_articles_per_feed = 25
@@ -13,7 +14,23 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe):
no_stylesheets = True
auto_cleanup = True
#articles_are_obfuscated = True
+ def get_cover_url(self):
+ soup = self.index_to_soup('http://www.countryfile.com/')
+ cov = soup.find(attrs={'class' : 'imagecache imagecache-160px_wide imagecache-linked imagecache-160px_wide_linked'})
+ #print '******** ',cov,' ***'
+ cov2 = str(cov)
+ cov2=cov2[124:-90]
+ #print '******** ',cov2,' ***'
+ # try to get cover - if can't get known cover
+ br = browser()
+ br.set_handle_redirect(False)
+ try:
+ br.open_novisit(cov2)
+ cover_url = cov2
+ except:
+ cover_url = 'http://www.countryfile.com/sites/default/files/imagecache/160px_wide/cover/2_1.jpg'
+ return cover_url
remove_tags = [
# dict(attrs={'class' : ['player']}),
diff --git a/recipes/daily_mirror.recipe b/recipes/daily_mirror.recipe
index d6794b1d97..8bac57951c 100644
--- a/recipes/daily_mirror.recipe
+++ b/recipes/daily_mirror.recipe
@@ -1,20 +1,21 @@
+
from calibre.web.feeds.news import BasicNewsRecipe
import re
+import mechanize
class AdvancedUserRecipe1306061239(BasicNewsRecipe):
title = u'The Daily Mirror'
description = 'News as provide by The Daily Mirror -UK'
__author__ = 'Dave Asbury'
- # last updated 11/2/12
+ # last updated 7/4/12
language = 'en_GB'
-
- cover_url = 'http://yookeo.com/screens/m/i/mirror.co.uk.jpg'
+ #cover_url = 'http://yookeo.com/screens/m/i/mirror.co.uk.jpg'
masthead_url = 'http://www.nmauk.co.uk/nma/images/daily_mirror.gif'
oldest_article = 1
- max_articles_per_feed = 5
+ max_articles_per_feed = 10
remove_empty_feeds = True
remove_javascript = True
no_stylesheets = True
@@ -75,3 +76,28 @@ class AdvancedUserRecipe1306061239(BasicNewsRecipe):
img { display:block}
'''
+ def get_cover_url(self):
+ soup = self.index_to_soup('http://www.politicshome.com/uk/latest_frontpage.html')
+ # look for the block containing the mirror button and url
+ cov = soup.find(attrs={'style' : 'background-image: url(http://www.politicshome.com/images/sources/source_frontpage_button_92.gif);'})
+ cov2 = str(cov)
+ cov2='http://www.politicshome.com'+cov2[9:-142]
+ #cov2 now contains url of the page containing pic
+ soup = self.index_to_soup(cov2)
+ cov = soup.find(attrs={'id' : 'large'})
+ cov2 = str(cov)
+ cov2=cov2[27:-18]
+ #cov2 now is pic url, now go back to original function
+ br = mechanize.Browser()
+ br.set_handle_redirect(False)
+ try:
+ br.open_novisit(cov2)
+ cover_url = cov2
+ except:
+ cover_url = 'http://yookeo.com/screens/m/i/mirror.co.uk.jpg'
+
+ #cover_url = cov2
+ #cover_url = 'http://www.thesun.co.uk/img/global/new-masthead-logo.png'
+ return cover_url
+
+
diff --git a/recipes/dobreprogamy.recipe b/recipes/dobreprogamy.recipe
index a27a9b0877..0614cf98ee 100644
--- a/recipes/dobreprogamy.recipe
+++ b/recipes/dobreprogamy.recipe
@@ -11,6 +11,7 @@ class Dobreprogramy_pl(BasicNewsRecipe):
cover_url = 'http://userlogos.org/files/logos/Karmody/dobreprogramy_01.png'
description = u'Aktualności i blogi z dobreprogramy.pl'
encoding = 'utf-8'
+ index='http://www.dobreprogramy.pl/'
no_stylesheets = True
language = 'pl'
extra_css = '.title {font-size:22px;}'
@@ -22,3 +23,10 @@ class Dobreprogramy_pl(BasicNewsRecipe):
#remove_tags = [dict(name='div', attrs={'class':['komentarze', 'block', 'portalInfo', 'menuBar', 'topBar']})]
feeds = [(u'Aktualności', 'http://feeds.feedburner.com/dobreprogramy/Aktualnosci'),
('Blogi', 'http://feeds.feedburner.com/dobreprogramy/BlogCzytelnikow')]
+
+
+ def preprocess_html(self, soup):
+ for a in soup('a'):
+ if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
+ a['href']=self.index + a['href']
+ return soup
\ No newline at end of file
diff --git a/recipes/dzieje_pl.recipe b/recipes/dzieje_pl.recipe
index d80161e71a..4c583e4815 100644
--- a/recipes/dzieje_pl.recipe
+++ b/recipes/dzieje_pl.recipe
@@ -7,6 +7,7 @@ class Dzieje(BasicNewsRecipe):
cover_url = 'http://www.dzieje.pl/sites/default/files/dzieje_logo.png'
category = 'history'
language = 'pl'
+ index='http://dzieje.pl'
oldest_article = 8
max_articles_per_feed = 100
remove_javascript=True
@@ -15,3 +16,10 @@ class Dzieje(BasicNewsRecipe):
remove_tags_after= dict(id='dogory')
remove_tags=[dict(id='dogory')]
feeds = [(u'Dzieje', u'http://dzieje.pl/rss.xml')]
+
+
+ def preprocess_html(self, soup):
+ for a in soup('a'):
+ if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
+ a['href']=self.index + a['href']
+ return soup
\ No newline at end of file
diff --git a/recipes/eioba.recipe b/recipes/eioba.recipe
index 14256c5811..1df79d64bd 100644
--- a/recipes/eioba.recipe
+++ b/recipes/eioba.recipe
@@ -21,3 +21,8 @@ class eioba(BasicNewsRecipe):
(u'Rozrywka', u'http://www.eioba.pl/feed/categories/10.xml'),
(u'Rożne', u'http://www.eioba.pl/feed/categories/9.xml')
]
+
+ def preprocess_html(self, soup):
+ for item in soup.findAll(style=True):
+ del item['style']
+ return soup
diff --git a/recipes/emuzica_pl.recipe b/recipes/emuzica_pl.recipe
index 75271c510a..2fbf9ff514 100644
--- a/recipes/emuzica_pl.recipe
+++ b/recipes/emuzica_pl.recipe
@@ -7,6 +7,7 @@ class eMuzyka(BasicNewsRecipe):
description = u'Emuzyka to największa i najpopularniejsza strona o muzyce w Polsce'
category = 'music'
language = 'pl'
+ index='http://www.emuzyka.pl'
cover_url='http://s.emuzyka.pl/img/emuzyka_invert_small.jpg'
no_stylesheets = True
oldest_article = 7
@@ -14,3 +15,9 @@ class eMuzyka(BasicNewsRecipe):
keep_only_tags=[dict(name='div', attrs={'id':'news_container'}), dict(name='h3'), dict(name='div', attrs={'class':'review_text'})]
remove_tags=[dict(name='span', attrs={'id':'date'})]
feeds = [(u'Aktualno\u015bci', u'http://www.emuzyka.pl/rss.php?f=1'), (u'Recenzje', u'http://www.emuzyka.pl/rss.php?f=2')]
+
+ def preprocess_html(self, soup):
+ for a in soup('a'):
+ if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
+ a['href']=self.index + a['href']
+ return soup
\ No newline at end of file
diff --git a/recipes/fhm_uk.recipe b/recipes/fhm_uk.recipe
index 0e2d5c1ebe..07f2b4b64e 100644
--- a/recipes/fhm_uk.recipe
+++ b/recipes/fhm_uk.recipe
@@ -7,7 +7,7 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe):
# cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/373529_38324934806_64930243_n.jpg'
masthead_url = 'http://www.fhm.com/App_Resources/Images/Site/re-design/logo.gif'
__author__ = 'Dave Asbury'
- # last updated 17/3/12
+ # last updated 14/4/12
language = 'en_GB'
oldest_article = 28
max_articles_per_feed = 12
@@ -28,7 +28,8 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe):
#]
feeds = [
- (u'From the Homepage',u'http://feed43.com/8053226782885416.xml'),
+ (u'From the Homepage',u'http://feed43.com/0032328550253453.xml'),
+ #http://feed43.com/8053226782885416.xml'),
(u'Funny - The Very Best Of The Internet',u'http://feed43.com/4538510106331565.xml'),
(u'Upgrade',u'http://feed43.com/0877305847443234.xml'),
#(u'The Final Countdown', u'http://feed43.com/3576106158530118.xml'),
diff --git a/recipes/film_web.recipe b/recipes/film_web.recipe
index 877d4472bc..2a6e00d501 100644
--- a/recipes/film_web.recipe
+++ b/recipes/film_web.recipe
@@ -7,6 +7,7 @@ class Filmweb_pl(BasicNewsRecipe):
cover_url = 'http://userlogos.org/files/logos/crudus/filmweb.png'
category = 'movies'
language = 'pl'
+ index='http://www.filmweb.pl'
oldest_article = 8
max_articles_per_feed = 100
no_stylesheets= True
@@ -39,3 +40,9 @@ class Filmweb_pl(BasicNewsRecipe):
self.log.warn(skip_tag)
return self.index_to_soup(skip_tag['href'], raw=True)
+
+ def preprocess_html(self, soup):
+ for a in soup('a'):
+ if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
+ a['href']=self.index + a['href']
+ return soup
\ No newline at end of file
diff --git a/recipes/forbes.recipe b/recipes/forbes.recipe
index a633d0f543..fe72fda536 100644
--- a/recipes/forbes.recipe
+++ b/recipes/forbes.recipe
@@ -1,39 +1,49 @@
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
+import re
from calibre.web.feeds.news import BasicNewsRecipe
class Forbes(BasicNewsRecipe):
title = u'Forbes'
description = 'Business and Financial News'
- __author__ = 'Darko Miletic'
+ __author__ = 'Kovid Goyal'
oldest_article = 30
- max_articles_per_feed = 100
+ max_articles_per_feed = 20
language = 'en'
+ encoding = 'utf-8'
+ recursions = 1
no_stylesheets = True
- html2lrf_options = ['--base-font-size', '10']
cover_url = u'http://www.forbes.com/media/current_covers/forbes_120_160.gif'
-
- feeds = [(u'Latest', u'http://www.forbes.com/news/index.xml'),
- (u'Most Popular', u'http://www.forbes.com/feeds/popstories.xml'),
- (u'Most Emailed', u'http://www.forbes.com/feeds/mostemailed.xml'),
- (u'Faces', u'http://www.forbes.com/facesscan/index.xml'),
- (u'Technology', u'http://www.forbes.com/technology/index.xml'),
- (u'Personal Tech', u'http://www.forbes.com/personaltech/index.xml'),
- (u'Wireless', u'http://www.forbes.com/wireless/index.xml'),
- (u'Business', u'http://www.forbes.com/business/index.xml'),
- (u'Sports Money', u'http://www.forbes.com/sportsmoney/index.xml'),
- (u'Sports', u'http://www.forbes.com/forbeslife/sports/index.xml'),
- (u'Vehicles', u'http://www.forbes.com/forbeslife/vehicles/index.xml'),
- (u'Leadership', u'http://www.forbes.com/leadership/index.xml'),
- (u'Careers', u'http://www.forbes.com/leadership/careers/index.xml'),
- (u'Compensation', u'http://www.forbes.com/leadership/compensation/index.xml'),
- (u'Managing', u'http://www.forbes.com/leadership/managing/index.xml')]
- def print_version(self, url):
- raw = self.browser.open(url).read()
- soup = BeautifulSoup(raw.decode('latin1', 'replace'))
- print_link = soup.find('a', {'onclick':"s_linkTrackVars='prop18';s_linkType='o';s_linkName='Print';if(typeof(globalPageName)!='undefined')s_prop18=globalPageName;s_lnk=s_co(this);s_gs(s_account);"})
- if print_link is None:
- return ''
- return 'http://www.forbes.com' + print_link['href']
\ No newline at end of file
+ feeds = [(u'Latest', u'http://www.forbes.com/news/index.xml'),
+ (u'Most Popular', u'http://www.forbes.com/feeds/popstories.xml'),
+ (u'Technology', u'http://www.forbes.com/technology/index.xml'),
+ (u'Business', u'http://www.forbes.com/business/index.xml'),
+ (u'Sports Money', u'http://www.forbes.com/sportsmoney/index.xml'),
+ (u'Leadership', u'http://www.forbes.com/leadership/index.xml'),]
+
+ keep_only_tags = \
+ {'class':lambda x: x and (set(x.split()) & {'body', 'pagination',
+ 'articleHead', 'article_head'})}
+ remove_tags_before = {'name':'h1'}
+ remove_tags = [
+ {'class':['comment_bug', 'engagement_block',
+ 'video_promo_block', 'article_actions']},
+ {'id':'comments'}
+ ]
+
+ def is_link_wanted(self, url, tag):
+ ans = re.match(r'http://.*/[2-9]/', url) is not None
+ if ans:
+ self.log('Following multipage link: %s'%url)
+ return ans
+
+ def postprocess_html(self, soup, first_fetch):
+ for pag in soup.findAll(True, 'pagination'):
+ pag.extract()
+ if not first_fetch:
+ h1 = soup.find('h1')
+ if h1 is not None:
+ h1.extract()
+ return soup
+
diff --git a/recipes/fotoblogia_pl.recipe b/recipes/fotoblogia_pl.recipe
new file mode 100644
index 0000000000..99df46419a
--- /dev/null
+++ b/recipes/fotoblogia_pl.recipe
@@ -0,0 +1,16 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class Fotoblogia_pl(BasicNewsRecipe):
+ title = u'Fotoblogia.pl'
+ __author__ = 'fenuks'
+ category = 'photography'
+ language = 'pl'
+ masthead_url = 'http://img.interia.pl/komputery/nimg/u/0/fotoblogia21.jpg'
+ cover_url= 'http://fotoblogia.pl/images/2009/03/fotoblogia2.jpg'
+ oldest_article = 7
+ max_articles_per_feed = 100
+ no_stylesheets = True
+ use_embedded_content = False
+ keep_only_tags=[dict(name='div', attrs={'class':'post-view post-standard'})]
+ remove_tags=[dict(attrs={'class':['external fotoblogia', 'categories', 'tags']})]
+ feeds = [(u'Wszystko', u'http://fotoblogia.pl/feed/rss2')]
diff --git a/recipes/gameplay_pl.recipe b/recipes/gameplay_pl.recipe
index f3384263d6..7b0ccb4f55 100644
--- a/recipes/gameplay_pl.recipe
+++ b/recipes/gameplay_pl.recipe
@@ -6,16 +6,24 @@ class Gameplay_pl(BasicNewsRecipe):
description = u'gameplay.pl - serwis o naszych zainteresowaniach, grach, filmach, książkach, muzyce, fotografii i konsolach.'
category = 'games, movies, books, music'
language = 'pl'
+ index='http://gameplay.pl'
masthead_url= 'http://gameplay.pl/img/gpy_top_logo.png'
cover_url= 'http://gameplay.pl/img/gpy_top_logo.png'
max_articles_per_feed = 100
+ remove_javascript= True
no_stylesheets= True
keep_only_tags=[dict(name='div', attrs={'class':['news_endpage_tit', 'news']})]
- remove_tags=[dict(name='div', attrs={'class':['galeria', 'noedit center im']})]
+ remove_tags=[dict(name='div', attrs={'class':['galeria', 'noedit center im', 'news_list', 'news_list_autor', 'stop_bot', 'tagi']}), dict(attrs={'usemap':'#map'})]
feeds = [(u'Wiadomo\u015bci', u'http://gameplay.pl/rss/')]
def image_url_processor(self, baseurl, url):
if 'http' not in url:
return 'http://gameplay.pl'+ url[2:]
else:
- return url
+ return url
+
+ def preprocess_html(self, soup):
+ for a in soup('a'):
+ if a.has_key('href') and '../' in a['href']:
+ a['href']=self.index + a['href'][2:]
+ return soup
\ No newline at end of file
diff --git a/recipes/gildia_pl.recipe b/recipes/gildia_pl.recipe
index 042902b5fc..36d3ef4da2 100644
--- a/recipes/gildia_pl.recipe
+++ b/recipes/gildia_pl.recipe
@@ -9,6 +9,7 @@ class Gildia(BasicNewsRecipe):
language = 'pl'
oldest_article = 8
max_articles_per_feed = 100
+ remove_empty_feeds=True
no_stylesheets=True
remove_tags=[dict(name='div', attrs={'class':'backlink'}), dict(name='div', attrs={'class':'im_img'}), dict(name='div', attrs={'class':'addthis_toolbox addthis_default_style'})]
keep_only_tags=dict(name='div', attrs={'class':'widetext'})
@@ -24,3 +25,16 @@ class Gildia(BasicNewsRecipe):
self.log.warn('odnosnik')
self.log.warn(link['href'])
return self.index_to_soup(link['href'], raw=True)
+
+ def preprocess_html(self, soup):
+ for a in soup('a'):
+ if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
+ if '/gry/' in a['href']:
+ a['href']='http://www.gry.gildia.pl' + a['href']
+ elif u'książk' in soup.title.string.lower() or u'komiks' in soup.title.string.lower():
+ a['href']='http://www.literatura.gildia.pl' + a['href']
+ elif u'komiks' in soup.title.string.lower():
+ a['href']='http://www.literatura.gildia.pl' + a['href']
+ else:
+ a['href']='http://www.gildia.pl' + a['href']
+ return soup
diff --git a/recipes/gram_pl.recipe b/recipes/gram_pl.recipe
index 07927796c0..1f8147ba3d 100644
--- a/recipes/gram_pl.recipe
+++ b/recipes/gram_pl.recipe
@@ -7,6 +7,7 @@ class Gram_pl(BasicNewsRecipe):
category = 'games'
language = 'pl'
oldest_article = 8
+ index='http://www.gram.pl'
max_articles_per_feed = 100
no_stylesheets= True
extra_css = 'h2 {font-style: italic; font-size:20px;} .picbox div {float: left;}'
@@ -52,4 +53,7 @@ class Gram_pl(BasicNewsRecipe):
tag=soup.findAll(name='div', attrs={'class':'picbox'})
for t in tag:
t['style']='float: left;'
+ for a in soup('a'):
+ if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
+ a['href']=self.index + a['href']
return soup
\ No newline at end of file
diff --git a/recipes/heise.recipe b/recipes/heise.recipe
index 56d5516656..ba93ea96ce 100644
--- a/recipes/heise.recipe
+++ b/recipes/heise.recipe
@@ -59,6 +59,7 @@ class heiseDe(BasicNewsRecipe):
dict(name='span', attrs={'class':'rsaquo'}),
dict(name='div', attrs={'class':'news_logo'}),
dict(name='div', attrs={'class':'bcadv ISI_IGNORE'}),
+ dict(name='div', attrs={'class':'navi_top_container'}),
dict(name='p', attrs={'class':'news_option'}),
dict(name='p', attrs={'class':'news_navi'}),
dict(name='div', attrs={'class':'news_foren'})]
@@ -69,3 +70,5 @@ class heiseDe(BasicNewsRecipe):
+
+
diff --git a/recipes/historia_news.recipe b/recipes/historia_news.recipe
new file mode 100644
index 0000000000..4eca8ade91
--- /dev/null
+++ b/recipes/historia_news.recipe
@@ -0,0 +1,20 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class historia_news(BasicNewsRecipe):
+ title = u'historia-news'
+ __author__ = 'fenuks'
+ description = u'Historia-news to portal dla ludzi kochających historię. Najnowsze newsy z historii bliższej i dalszej, archeologii, paleontologii oraz ciekawostki i podcasty z historii kultury, sportu, motoryzacji i inne.'
+ masthead_url = 'http://historia-news.pl/templates/hajak4/images/header.jpg'
+ cover_url= 'http://www.historia-news.pl/templates/hajak4/images/header.jpg'
+ category = 'history'
+ language = 'pl'
+ oldest_article = 7
+ max_articles_per_feed = 100
+ no_stylesheets = True
+ remove_empty_feeds = True
+ remove_tags=[dict(name='form'), dict(name='img', attrs={'alt':'Print'}), dict(attrs={'class':['commbutt', 'cpr']}), dict(id=['plusone', 'facebook'])]
+ feeds = [(u'Wiadomo\u015bci', u'http://historia-news.pl/wiadomoci.feed?type=rss'), (u'Artyku\u0142y', u'http://historia-news.pl/artykuy.feed?type=rss')]
+
+
+ def print_version(self, url):
+ return url + '?tmpl=component&print=1&layout=default&page='
diff --git a/recipes/icons/fotoblogia_pl.png b/recipes/icons/fotoblogia_pl.png
new file mode 100644
index 0000000000..0204a04e62
Binary files /dev/null and b/recipes/icons/fotoblogia_pl.png differ
diff --git a/recipes/icons/historia_news.png b/recipes/icons/historia_news.png
new file mode 100644
index 0000000000..79b1b52859
Binary files /dev/null and b/recipes/icons/historia_news.png differ
diff --git a/recipes/icons/swiat_obrazu.png b/recipes/icons/swiat_obrazu.png
new file mode 100644
index 0000000000..a61662a864
Binary files /dev/null and b/recipes/icons/swiat_obrazu.png differ
diff --git a/recipes/in4_pl.recipe b/recipes/in4_pl.recipe
index 16ad622b46..e385522714 100644
--- a/recipes/in4_pl.recipe
+++ b/recipes/in4_pl.recipe
@@ -8,6 +8,7 @@ class in4(BasicNewsRecipe):
description = u'Serwis Informacyjny - Aktualnosci, recenzje'
category = 'IT'
language = 'pl'
+ index='http://www.in4.pl/'
#cover_url= 'http://www.in4.pl/recenzje/337/in4pl.jpg'
no_stylesheets = True
remove_empty_feeds = True
@@ -39,6 +40,7 @@ class in4(BasicNewsRecipe):
def preprocess_html(self, soup):
self.append_page(soup, soup.body)
+ for a in soup('a'):
+ if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
+ a['href']=self.index + a['href']
return soup
-
-
diff --git a/recipes/infra_pl.recipe b/recipes/infra_pl.recipe
index 0e035e0980..e021fa0c17 100644
--- a/recipes/infra_pl.recipe
+++ b/recipes/infra_pl.recipe
@@ -8,6 +8,7 @@ class INFRA(BasicNewsRecipe):
description = u'Serwis Informacyjny INFRA - UFO, Zjawiska Paranormalne, Duchy, Tajemnice świata.'
cover_url = 'http://npn.nazwa.pl/templates/ja_teline_ii/images/logo.jpg'
category = 'UFO'
+ index='http://infra.org.pl'
language = 'pl'
max_articles_per_feed = 100
no_stylesheers=True
@@ -15,3 +16,11 @@ class INFRA(BasicNewsRecipe):
remove_tags_after=dict(attrs={'class':'pagenav'})
remove_tags=[dict(attrs={'class':'pagenav'})]
feeds = [(u'Najnowsze wiadomo\u015bci', u'http://www.infra.org.pl/index.php?option=com_rd_rss&id=1')]
+
+ def preprocess_html(self, soup):
+ for item in soup.findAll(style=True):
+ del item['style']
+ for a in soup('a'):
+ if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
+ a['href']=self.index + a['href']
+ return soup
\ No newline at end of file
diff --git a/recipes/jakarta_globe.recipe b/recipes/jakarta_globe.recipe
new file mode 100644
index 0000000000..1414ac6e5b
--- /dev/null
+++ b/recipes/jakarta_globe.recipe
@@ -0,0 +1,34 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class JakartaGlobe(BasicNewsRecipe):
+ title = u'Jakarta Globe'
+ oldest_article = 3
+ max_articles_per_feed = 100
+
+ feeds = [
+ (u'News', u'http://www.thejakartaglobe.com/pages/getrss/getrss-news.php'),
+ (u'Business', u'http://www.thejakartaglobe.com/pages/getrss/getrss-business.php'),
+ (u'Technology', u'http://www.thejakartaglobe.com/pages/getrss/getrss-tech.php'),
+ (u'My Jakarta', u'http://www.thejakartaglobe.com/pages/getrss/getrss-myjakarta.php'),
+ (u'International', u'http://www.thejakartaglobe.com/pages/getrss/getrss-international.php'),
+ (u'Life and Times', u'http://www.thejakartaglobe.com/pages/getrss/getrss-lifeandtimes.php'),
+ ]
+ __author__ = 'rty'
+ publisher = 'JakartaGlobe.com'
+ description = 'JakartaGlobe, Indonesia, Newspaper'
+ category = 'News, Indonesia'
+
+
+ remove_javascript = True
+ use_embedded_content = False
+ no_stylesheets = True
+ language = 'en_ID'
+ encoding = 'utf-8'
+ conversion_options = {'linearize_tables':True}
+ masthead_url = 'http://www.thejakartaglobe.com/pages/2010/images/jak-globe-logo.jpg'
+ keep_only_tags = [
+ dict(name='div', attrs={'class':'story'}),
+ dict(name='span', attrs={'class':'headline'}),
+ dict(name='div', attrs={'class':'story'}),
+ dict(name='p', attrs={'id':'bodytext'})
+ ]
diff --git a/recipes/konflikty_zbrojne.recipe b/recipes/konflikty_zbrojne.recipe
index 7921e98f48..e8b28b49bf 100644
--- a/recipes/konflikty_zbrojne.recipe
+++ b/recipes/konflikty_zbrojne.recipe
@@ -1,5 +1,6 @@
# -*- coding: utf-8 -*-
from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
class Konflikty(BasicNewsRecipe):
title = u'Konflikty Zbrojne'
@@ -10,6 +11,23 @@ class Konflikty(BasicNewsRecipe):
category='military, history'
oldest_article = 7
max_articles_per_feed = 100
- auto_cleanup = True
+ no_stylesheets = True
+ keep_only_tags=[dict(attrs={'class':['title1', 'image']}), dict(id='body')]
- feeds = [(u'Aktualności', u'http://www.konflikty.pl/rss_aktualnosci_10.xml'), (u'Artyku\u0142y', u'http://www.konflikty.pl/rss_artykuly_10.xml'), (u'Relacje', u'http://www.konflikty.pl/rss_relacje_10.xml'), (u'Recenzje', u'http://www.konflikty.pl/rss_recenzje_10.xml')]
+ feeds = [(u'Aktualności', u'http://www.konflikty.pl/rss_aktualnosci_10.xml'),
+ (u'Artyku\u0142y', u'http://www.konflikty.pl/rss_artykuly_10.xml'),
+ (u'Historia', u'http://www.konflikty.pl/rss_historia_10.xml'),
+ (u'Militaria', u'http://www.konflikty.pl/rss_militaria_10.xml'),
+ (u'Relacje', u'http://www.konflikty.pl/rss_relacje_10.xml'),
+ (u'Recenzje', u'http://www.konflikty.pl/rss_recenzje_10.xml'),
+ (u'Teksty źródłowe', u'http://www.konflikty.pl/rss_tekstyzrodlowe_10.xml')]
+
+ def preprocess_html(self, soup):
+ for item in soup.findAll(style=True):
+ del item['style']
+ for image in soup.findAll(name='a', attrs={'class':'image'}):
+ if image.img and image.img.has_key('alt'):
+ image.name='div'
+ pos = len(image.contents)
+ image.insert(pos, BeautifulSoup(''+image.img['alt']+'
'))
+ return soup
diff --git a/recipes/liberatorio_politico.recipe b/recipes/liberatorio_politico.recipe
new file mode 100644
index 0000000000..bbffcd89b1
--- /dev/null
+++ b/recipes/liberatorio_politico.recipe
@@ -0,0 +1,12 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class AdvancedUserRecipe1334649829(BasicNewsRecipe):
+ title = u'Liberatorio Politico'
+ oldest_article = 7
+ max_articles_per_feed = 100
+ auto_cleanup = True
+ masthead_url = 'http://liberatorio.altervista.org/wp-content/uploads/2012/01/Testata-LIBERATORIO-Altervista1.jpg'
+ feeds = [(u'Liberatorio Politico', u'http://liberatorio.altervista.org/feed/')]
+ __author__ = 'faber1971'
+ description = 'Inquiry journalism - a blog on Molfetta, Land of Bari, Apulia and Italy - v1.00 (07, April 2012)'
+ language = 'it'
diff --git a/recipes/limes.recipe b/recipes/limes.recipe
new file mode 100644
index 0000000000..2290b7099e
--- /dev/null
+++ b/recipes/limes.recipe
@@ -0,0 +1,50 @@
+#!/usr/bin/env python
+__license__ = 'GPL v3'
+__copyright__ = '2012, faber1971'
+__version__ = 'v1.00'
+__date__ = '16, April 2012'
+__description__ = 'Geopolitical Italian magazine'
+
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class Limes(BasicNewsRecipe):
+ description = 'Italian weekly magazine'
+ __author__ = 'faber1971'
+
+ cover_url = 'http://temi.repubblica.it/UserFiles/limes/Image/Loghi/logo-limes.gif'
+ title = 'Limes'
+ category = 'Geopolitical news'
+
+ language = 'it'
+# encoding = 'cp1252'
+ timefmt = '[%a, %d %b, %Y]'
+
+ oldest_article = 16
+ max_articles_per_feed = 100
+ use_embedded_content = False
+ recursion = 10
+
+ remove_javascript = True
+ no_stylesheets = True
+ masthead_url = 'http://temi.repubblica.it/UserFiles/limes/Image/Loghi/logo-limes.gif'
+
+ feeds = [
+ (u'Limes', u'http://temi.repubblica.it/limes/feed/')
+ ]
+
+
+
+ keep_only_tags = [
+ dict(name='div', attrs={'class':['testo','copertina','occhiello','firma','didascalia','content-second-right','detail-articles','titolo-local','generic-articles']}),
+ dict(name='div', attrs={'class':['generic-articles','summary','detail-articles']}),
+ dict(name='div', attrs={'id':['content-second-right','content2']})
+ ]
+
+ remove_tags = [
+ dict(name='div',attrs={'class':['servizi','aggiungi','label-web','bottom-mobile','box-abbonamenti','box-cerca','big','little','stampaweb']}),
+ dict(name='div',attrs={'id':['topheader','header','navigation-new','navigation','content-second-left','menutext']}),
+ dict(name='ul',attrs={'id':'user-utility'}),
+ dict(name=['script','noscript','iframe'])
+ ]
+
diff --git a/recipes/metro_news_nl.recipe b/recipes/metro_news_nl.recipe
index ac3e23869b..d95f9bdfd7 100644
--- a/recipes/metro_news_nl.recipe
+++ b/recipes/metro_news_nl.recipe
@@ -3,25 +3,6 @@ from calibre.web.feeds.news import BasicNewsRecipe
import re
from calibre.utils.magick import Image
from BeautifulSoup import BeautifulSoup
-try:
- from calibre_plugins.drMerry.debug import debuglogger as mlog
- print 'drMerry debuglogger found, debug options can be used'
- from calibre_plugins.drMerry.stats import statslogger as mstat
- print 'drMerry stats tracker found, stat can be tracked'
- mlog.setLoglevel(1) #-1 == no log; 0 for normal output
- mstat.calculateStats(False) #track stats (to track stats loglevel must be > 0
- KEEPSTATS = mstat.keepmystats()
- SHOWDEBUG0 = mlog.showdebuglevel(0)
- SHOWDEBUG1 = mlog.showdebuglevel(1)
- SHOWDEBUG2 = mlog.showdebuglevel(2)
-except:
- #print 'drMerry debuglogger not found, skipping debug options'
- SHOWDEBUG0 = False
- SHOWDEBUG1 = False
- SHOWDEBUG2 = False
- KEEPSTATS = False
-
-#print ('level0: %s\nlevel1: %s\nlevel2: %s' % (SHOWDEBUG0,SHOWDEBUG1,SHOWDEBUG2))
''' Version 1.2, updated cover image to match the changed website.
added info date on title
@@ -43,6 +24,9 @@ except:
extended timeout from 2 to 10
changed oldest article from 10 to 1.2
changed max articles from 15 to 25
+ Version 1.9.1 18-04-2012
+ removed some debug settings
+ updated code to match new metro-layout
'''
class AdvancedUserRecipe1306097511(BasicNewsRecipe):
@@ -70,34 +54,40 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):
'author_sort' : 'Metro Nederland & calibre & DrMerry',
'publisher' : 'DrMerry/Metro Nederland'
}
- extra_css = 'body {padding:5px 0px; background:#fff;font-size: 13px;}\
- #date, div.share-and-byline div.byline div.text div.title, div.share-and-byline div.byline div.text div.name {clear: both;margin-bottom: 10px;font-size:0.5em; color: #616262;}\
- .article-box-fact.module-title {clear:both;padding: 8px 0;color: #24763b;font-family: arial, sans-serif;font-size: 14px;font-weight: bold;}\
- h1.title {color: #000000;font-size: 44px;padding-bottom: 10px;font-weight: 300;} h2.subtitle {font-size: 13px;font-weight: 700;padding-bottom: 10px;}\
- .article-body p{padding-bottom:10px;}div.column-1-3{margin-left: 19px;padding-right: 9px;}\
- div.column-1-2 {display: inline;padding-right: 7px;}\
- p.article-image-caption {font-size: 12px;font-weight: 300;color: #616262;margin-top: 5px;} \
- p.article-image-caption .credits {font-style: italic;font-size: 10px;}\
- div.article-image-caption {width: 246px;margin-bottom: 5px;margin-left: 10px;}\
- div.article-image-caption-2column {margin-bottom: 10px;width: 373px;} div.article-image-caption-3column {}\
- img {border:0px; padding:2px;} hr.merryhr {width:30%; border-width:0px; color:green; margin-left:5px; background-color: green} div.column-3 {background-color:#eee; width:50%; margin:2px; float:right; padding:2px;} div.column-3 module-title {border: 1px solid #aaa} div.article-box-fact div.subtitle {font-weight:bold; color:green;}'
+ extra_css = 'body {padding:5px 0; background-color:#fff;font-size: 1em}\
+ #date, div.share-and-byline div.byline div.text div.title, div.share-and-byline div.byline div.text div.name {margin-bottom: 10px}\
+ #date, div.share-and-byline div.byline div.text div.title, div.share-and-byline div.byline div.text div.name, p.article-image-caption .credits {font-size:0.5em}\
+ .article-box-fact.module-title, #date, div.share-and-byline div.byline div.text div.title, div.share-and-byline div.byline div.text div.name {clear:both}\
+ .article-box-fact.module-title {padding: 8px 0}\
+ h1.title {color: #000;font-size: 1.4em}\
+ .article-box-fact.module-title, h2.subtitle {font-size: 1.2em}\
+ h1.title, h2.subtitle, .article-body p{padding-bottom:10px}\
+ h1.title, p.article-image-caption {font-weight: 300}\
+ div.column-1-3{margin-left: 19px;padding-right: 9px}\
+ div.column-1-2 {display: inline;padding-right: 7px}\
+ p.article-image-caption {font-size: 0.6em;margin-top: 5px}\
+ p.article-image-caption, #date, div.share-and-byline div.byline div.text div.title, div.share-and-byline div.byline div.text div.name {color: #616262}\
+ p.article-image-caption .credits {font-style: italic}\
+ div.article-image-caption {width: 246px;margin: 5px}\
+ div.article-image-caption-2column {width: 373px}\
+ div.article-image-caption-2column, div.article-image-caption-3column {margin-bottom: 5px}\
+ img {border:0}\
+ img, div.column-3 {padding:2px}\
+ hr.merryhr {width:30%; border-width:0; margin-left:5px; background-color: #24763b}\
+ div.column-3 {background-color:#eee; width:50%; margin:2px; float:right}\
+ div.column-3 module-title {border: 1px solid #aaa}\
+ div.article-box-fact div.subtitle, .article-box-fact.module-title, h2.subtitle {font-weight:bold}\
+ div.article-box-fact div.subtitle, hr.merryhr, .article-box-fact.module-title {color: #24763b}'
+
preprocess_regexps = [
(re.compile(r'
]+top-line[^>]+>', re.DOTALL|re.IGNORECASE),
lambda match: '
'),
- (re.compile(r'(
]+metronieuws\.nl/[^>]+/templates/[^>]+jpe?g[^>]+>|metronieuws\.nl/internal\-roxen\-unit\.gif)', re.DOTALL|re.IGNORECASE),
+ (re.compile(r'
]+(metronieuws\.nl/[^>]+/templates/[^>]+jpe?g|metronieuws\.nl/internal\-roxen\-unit\.gif)[^>]+>', re.DOTALL|re.IGNORECASE),
lambda match: ''),
]
def preprocess_html(self, soup):
- if SHOWDEBUG0 == True:
- mlog.setdefaults()
- mlog.addTextAndTag(['Show debug = on with level'], [str(mlog.debuglevel)])
- if KEEPSTATS == True:
- mlog.addDebug('Stats will be calculated')
- else:
- mlog.addTextAndTag(['Stats won\'t be calculated\nTo be enabled, stats must be true, currently','and debug level must be 1 or higher, currently'],[mstat.dokeepmystats, mlog.debuglevel])
- mlog.showDebug()
myProcess = MerryProcess()
myProcess.removeUnwantedTags(soup)
return soup
@@ -105,18 +95,6 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):
def postprocess_html(self, soup, first):
myProcess = MerryProcess()
myProcess.optimizeLayout(soup)
- if SHOWDEBUG0 == True:
- if KEEPSTATS == True:
- statinfo = 'generated stats:'
- statinfo += str(mstat.stats(mstat.statslist))
- print statinfo
- statinfo = 'generated stats (for removed tags):'
- statinfo += str(mstat.stats(mstat.removedtagslist))
- print statinfo
- #show all Debug info we forgot to report
- #Using print to be sure that this text will not be added at the end of the log.
- print '\n!!!!!unreported messages:\n(should be empty)\n'
- mlog.showDebug()
return soup
feeds = [
@@ -142,44 +120,24 @@ class MerryPreProcess():
return soup
def optimizePicture(self,soup):
- if SHOWDEBUG0 == True:
- mlog.addDebug('start image optimize')
for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
iurl = tag['src']
img = Image()
img.open(iurl)
img.trim(0)
img.save(iurl)
- if SHOWDEBUG0 == True:
- mlog.addDebug('Images optimized')
- mlog.showDebug()
return soup
class MerryExtract():
def safeRemovePart(self, killingSoup, soupIsArray):
if killingSoup and not killingSoup == None:
- if SHOWDEBUG2 == True:
- mlog.addTextAndTag(['items to remove'],[killingSoup])
try:
if soupIsArray == True:
for killer in killingSoup:
killer.extract()
else:
killingSoup.extract()
- if SHOWDEBUG1 == True:
- mlog.addDebug('tag extracted')
- mlog.showDebug()
- if KEEPSTATS == True:
- try:
- mstat.addstat(mstat.removedtagslist,str(killingSoup.name))
- except:
- mstat.addstat(mstat.removedtagslist,'unknown')
except:
- if SHOWDEBUG1 == True:
- mlog.addDebug('tag extraction failed')
- mlog.showDebug()
- if KEEPSTATS == True:
- mstat.addstat(mstat.removedtagslist,'exception')
return False
else:
return False
@@ -230,60 +188,26 @@ class MerryProcess(BeautifulSoup):
def optimizeLayout(self,soup):
self.myPrepare.optimizePicture(soup)
- if SHOWDEBUG0 == True:
- mlog.addDebug('End of Optimize Layout')
- mlog.showDebug()
return soup
def insertFacts(self, soup):
allfacts = soup.findAll('div', {'class':re.compile('^article-box-fact.*$')})
- if SHOWDEBUG0 == True:
- mlog.addTextAndTag(['allfacts'],[allfacts])
- mlog.showDebug()
if allfacts and not allfacts == None:
allfactsparent = soup.find('div', {'class':re.compile('^article-box-fact.*$')}).parent
- if SHOWDEBUG0 == True:
- mlog.addTextAndTag(['allfactsparent'],[allfactsparent])
- mlog.showDebug()
for part in allfactsparent:
if not part in allfacts:
- if SHOWDEBUG0 == True:
- mlog.addTextAndTag(['FOUND A non-fact'],[part])
- mlog.showDebug()
self.myKiller.safeRemovePart(part, True)
- if SHOWDEBUG1 == True:
- mlog.addTextAndTag(['New All Facts'],[allfacts])
- mlog.showDebug()
articlefacts = soup.find('div', {'class':'article-box-fact column'})
- errorOccured=False
if (articlefacts and not articlefacts==None):
try:
contenttag = soup.find('div', {'class':'article-body'})
- if SHOWDEBUG0 == True:
- mlog.addTextAndTag(['curcontag'],[contenttag])
- mlog.showDebug()
foundrighttag = False
if contenttag and not contenttag == None:
foundrighttag = True
- if SHOWDEBUG0 == True:
- if errorOccured == False:
- mlog.addTextAndTag(['type','curcontag (in while)'],[type(contenttag),contenttag])
- else:
- mlog.addDebug('Could not find right parent tag. Error Occured')
- mlog.showDebug()
if foundrighttag == True:
contenttag.insert(0, allfactsparent)
- if SHOWDEBUG2 == True:
- mlog.addTextAndTag(['added parent'],[soup.prettify()])
- mlog.showDebug()
except:
- errorOccured=True
- mlog.addTrace()
- else:
- errorOccured=True
- if SHOWDEBUG0 == True and errorOccured == True:
- mlog.addTextAndTag(['no articlefacts'],[articlefacts])
- mlog.showDebug()
+ pass
return soup
def previousNextSibRemover(self, soup, previous=True, soupIsArray=False):
@@ -300,71 +224,38 @@ class MerryProcess(BeautifulSoup):
sibs = findsibsof.nextSiblingGenerator()
for sib in sibs:
self.myKiller.safeRemovePart(sib, True)
- else:
- if SHOWDEBUG1 == True:
- mlog.addDebug('Not any sib found')
return
def removeUnwantedTags(self,soup):
- if SHOWDEBUG1 == True:
- mlog.addTextAndTag(['Len of Soup before RemoveTagsByName'],[len(str(soup))])
- mlog.showDebug()
self.removeTagsByName(soup)
- if SHOWDEBUG1 == True:
- mlog.addDebug('Len of Soup before firstandlastpart: %s' % len(str(soup)))
- mlog.showDebug()
self.insertFacts(soup)
self.removeFirstAndLastPart(soup)
- if SHOWDEBUG1 == True:
- mlog.addDebug('Len of Soup before unwantedpart: %s' % len(str(soup)))
- mlog.showDebug()
self.removeUnwantedParts(soup)
- if SHOWDEBUG1 == True:
- mlog.addDebug('Len of Soup before EmptyParts: %s' % len(str(soup)))
- mlog.showDebug()
self.removeEmptyTags(soup)
- if SHOWDEBUG1 == True:
- mlog.addDebug('Len of Soup after EmptyParts: %s' % len(str(soup)))
- mlog.showDebug()
self.myReplacer.replaceATag(soup)
return soup
def removeUnwantedParts(self, soup):
- if SHOWDEBUG1 == True:
- mlog.addDebug('Len of Soup before UnwantedID: %s' % len(str(soup)))
- mlog.showDebug()
self.removeUnwantedTagsByID(soup)
- if SHOWDEBUG1 == True:
- mlog.addDebug('Len of Soup before Class: %s' % len(str(soup)))
- mlog.showDebug()
self.removeUnwantedTagsByClass(soup)
- if SHOWDEBUG1 == True:
- mlog.addDebug('Len of Soup before Style: %s' % len(str(soup)))
- mlog.showDebug()
self.removeUnwantedTagsByStyle(soup)
return soup
def removeUnwantedTagsByStyle(self,soup):
- self.removeArrayOfTags(soup.findAll(attrs={'style' : re.compile("^(.*(display\s?:\s?none|img-mask|white)\s?;?.*)$")}))
- if SHOWDEBUG0 == True:
- mlog.addDebug('end remove by style')
+ self.removeArrayOfTags(soup.findAll(attrs={'style':re.compile("^(.*(display\s?:\s?none|img-mask|white)\s?;?.*)$")}))
+ self.removeArrayOfTags(soup.findAll(attrs={'title':'volledig scherm'}))
return soup
def removeArrayOfTags(self,souparray):
return self.myKiller.safeRemovePart(souparray, True)
def removeUnwantedTagsByClass(self,soup):
- if SHOWDEBUG0 == True:
- mlog.addDebug('start remove by class')
- self.removeArrayOfTags(soup.findAll("div", { "class" :re.compile('^(promo.*?|article-tools-below-title|metroCommentFormWrap|ad|share-tools|tools|header-links|related-links|padding-top-15)$')}))
+ self.removeArrayOfTags(soup.findAll("div", { "class" :re.compile('^(promo.*?|share-tools-top|share-tools-bottom|article-tools-below-title|metroCommentFormWrap|ad|share-tools|tools|header-links|related-links|padding-top-15|footer-[a-zA-Z0-9]+)$')}))
return soup
def removeUnwantedTagsByID(self,soup):
- defaultids = ['footer-extra',re.compile('^ad(\d+|adcomp.*?)?$'),'column-4-5','navigation','header',re.compile('^column-1-5-(top|bottom)$'),'footer','hidden_div','sidebar',re.compile('^article-\d$'),'comments','footer']
+ defaultids = ['footer-extra',re.compile('^ad(\d+|adcomp.*?)?$'),'column-4-5','navigation','header',re.compile('^column-1-5-(top|bottom)$'),'footer','hidden_div','sidebar',re.compile('^article-\d$'),'comments','footer','gallery-1']
for removeid in defaultids:
- if SHOWDEBUG1 == True:
- mlog.addDebug('RemoveTagByID, tag: %s, Len of Soup: %s' % (str(removeid), len(str(soup))))
- mlog.showDebug()
self.removeArrayOfTags(soup.findAll(id=removeid))
return soup
@@ -380,33 +271,12 @@ class MerryProcess(BeautifulSoup):
return soup
def removeEmptyTags(self,soup,run=0):
- if SHOWDEBUG0 == True:
- mlog.addDebug('starting removeEmptyTags')
- if SHOWDEBUG1 == True:
- run += 1
- mlog.addDebug(run)
- if SHOWDEBUG2 == True:
- mlog.addDebug(str(soup.prettify()))
- mlog.showDebug()
emptymatches = re.compile('^( |\s|\n|\r|\t)*$')
emptytags = soup.findAll(lambda tag: tag.find(True) is None and (tag.string is None or tag.string.strip()=="" or tag.string.strip()==emptymatches) and not tag.isSelfClosing)
if emptytags and not (emptytags == None or emptytags == []):
- if SHOWDEBUG1 == True:
- mlog.addDebug('tags found')
- mlog.addDebug(str(emptytags))
self.removeArrayOfTags(emptytags)
#recursive in case removing empty tag creates new empty tag
self.removeEmptyTags(soup, run=run)
- else:
- if SHOWDEBUG1 == True:
- mlog.addDebug('no empty tags found')
- mlog.showDebug()
- if SHOWDEBUG0 == True:
- if SHOWDEBUG2 == True:
- mlog.addDebug('new soup:')
- mlog.addDebug(str(soup.prettify()))
- mlog.addDebug('RemoveEmptyTags Completed')
- mlog.showDebug()
return soup
def removeFirstAndLastPart(self,soup):
diff --git a/recipes/metro_uk.recipe b/recipes/metro_uk.recipe
index 8dc7008a68..c30f81c019 100644
--- a/recipes/metro_uk.recipe
+++ b/recipes/metro_uk.recipe
@@ -1,52 +1,30 @@
-import re
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1306097511(BasicNewsRecipe):
title = u'Metro UK'
description = 'News as provide by The Metro -UK'
-
+ #timefmt = ''
__author__ = 'Dave Asbury'
- #last update 3/12/11
cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/276636_117118184990145_2132092232_n.jpg'
- no_stylesheets = True
+ #no_stylesheets = True
oldest_article = 1
- max_articles_per_feed = 20
+ max_articles_per_feed = 10
remove_empty_feeds = True
remove_javascript = True
+ auto_cleanup = True
- #preprocess_regexps = [(re.compile(r'Tweet'), lambda a : '')]
- preprocess_regexps = [
- (re.compile(r'', re.IGNORECASE | re.DOTALL), lambda match: ' ')]
- preprocess_regexps = [
- (re.compile(r'tweet', re.IGNORECASE | re.DOTALL), lambda match: '')]
language = 'en_GB'
-
-
masthead_url = 'http://e-edition.metro.co.uk/images/metro_logo.gif'
-
-
keep_only_tags = [
- dict(name='h1'),dict(name='h2', attrs={'class':'h2'}),
- dict(attrs={'class':['img-cnt figure']}),
- dict(attrs={'class':['art-img']}),
- dict(name='div', attrs={'class':'art-lft'}),
- dict(name='p')
+
]
remove_tags = [
- dict(name = 'div',attrs={'id' : ['comments-news','formSubmission']}),
- dict(name='div', attrs={'class':[ 'news m12 clrd clr-b p5t shareBtm', 'commentForm', 'metroCommentInnerWrap',
- 'art-rgt','pluck-app pluck-comm','news m12 clrd clr-l p5t', 'flt-r','username','clrd' ]}),
- dict(attrs={'class':['username', 'metroCommentFormWrap','commentText','commentsNav','avatar','submDateAndTime','addYourComment','displayName']})
- ,dict(name='div', attrs={'class' : 'clrd art-fd fd-gr1-b'})
+
]
+
+
feeds = [
(u'News', u'http://www.metro.co.uk/rss/news/'), (u'Money', u'http://www.metro.co.uk/rss/money/'), (u'Sport', u'http://www.metro.co.uk/rss/sport/'), (u'Film', u'http://www.metro.co.uk/rss/metrolife/film/'), (u'Music', u'http://www.metro.co.uk/rss/metrolife/music/'), (u'TV', u'http://www.metro.co.uk/rss/tv/'), (u'Showbiz', u'http://www.metro.co.uk/rss/showbiz/'), (u'Weird News', u'http://www.metro.co.uk/rss/weird/'), (u'Travel', u'http://www.metro.co.uk/rss/travel/'), (u'Lifestyle', u'http://www.metro.co.uk/rss/lifestyle/'), (u'Books', u'http://www.metro.co.uk/rss/lifestyle/books/'), (u'Food', u'http://www.metro.co.uk/rss/lifestyle/restaurants/')]
-
extra_css = '''
- body {font: sans-serif medium;}'
- h1 {text-align : center; font-family:Arial,Helvetica,sans-serif; font-size:20px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold;}
- h2 {text-align : center;color:#4D4D4D;font-family:Arial,Helvetica,sans-serif; font-size:15px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold; }
- span{ font-size:9.5px; font-weight:bold;font-style:italic}
- p { text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;}
-
- '''
+ body{ text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;}
+ '''
diff --git a/recipes/national_geographic_pl.recipe b/recipes/national_geographic_pl.recipe
index a2f759e878..07fc0da666 100644
--- a/recipes/national_geographic_pl.recipe
+++ b/recipes/national_geographic_pl.recipe
@@ -9,8 +9,9 @@ from calibre.web.feeds.recipes import BasicNewsRecipe
class recipeMagic(BasicNewsRecipe):
title = 'National Geographic PL'
__author__ = 'Marcin Urban 2011'
+ __modified_by__ = 'fenuks'
description = 'legenda wśród magazynów z historią sięgającą 120 lat'
- cover_url = 'http://www.guj.pl/var/guj/storage/images/media/nasze_magazyny/national_geographic/logo/ng_logo/2606-1-pol-PL/ng_logo.jpg'
+ #cover_url = 'http://www.guj.pl/var/guj/storage/images/media/nasze_magazyny/national_geographic/logo/ng_logo/2606-1-pol-PL/ng_logo.jpg'
oldest_article = 7
max_articles_per_feed = 100
no_stylesheets = True
@@ -42,11 +43,43 @@ class recipeMagic(BasicNewsRecipe):
]
remove_attributes = ['width','height']
+ feeds=[]
- feeds = [
- ('National Geographic PL', 'http://www.national-geographic.pl/rss/'),
- ]
+ def find_articles(self, url):
+ articles = []
+ soup=self.index_to_soup(url)
+ tag=soup.find(attrs={'class':'arl'})
+ art=tag.ul.findAll('li')
+ for i in art:
+ title=i.a['title']
+ url=i.a['href']
+ #date=soup.find(id='footer').ul.li.string[41:-1]
+ desc=i.div.p.string
+ articles.append({'title' : title,
+ 'url' : url,
+ 'date' : '',
+ 'description' : desc
+ })
+ return articles
+
+ def parse_index(self):
+ feeds = []
+ feeds.append((u"Aktualności", self.find_articles('http://www.national-geographic.pl/aktualnosci/')))
+ feeds.append((u"Artykuły", self.find_articles('http://www.national-geographic.pl/artykuly/')))
+
+ return feeds
def print_version(self, url):
- return url.replace('artykuly0Cpokaz', 'drukuj-artykul')
+ if 'artykuly' in url:
+ return url.replace('artykuly/pokaz', 'drukuj-artykul')
+ elif 'aktualnosci' in url:
+ return url.replace('aktualnosci/pokaz', 'drukuj-artykul')
+ else:
+ return url
+
+ def get_cover_url(self):
+ soup = self.index_to_soup('http://www.national-geographic.pl/biezace-wydania/')
+ tag=soup.find(attrs={'class':'txt jus'})
+ self.cover_url=tag.img['src']
+ return getattr(self, 'cover_url', self.cover_url)
diff --git a/recipes/nowa_fantastyka.recipe b/recipes/nowa_fantastyka.recipe
index ec556da5fa..0371cb1f58 100644
--- a/recipes/nowa_fantastyka.recipe
+++ b/recipes/nowa_fantastyka.recipe
@@ -81,5 +81,7 @@ class Nowa_Fantastyka(BasicNewsRecipe):
title=soup.find(attrs={'class':'tytul'})
if title:
title['style']='font-size: 20px; font-weight: bold;'
- self.log.warn(soup)
+ for a in soup('a'):
+ if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
+ a['href']=self.INDEX + a['href']
return soup
diff --git a/recipes/orlando_sentinel.recipe b/recipes/orlando_sentinel.recipe
index 7a59f6f6ba..b327bc2b74 100644
--- a/recipes/orlando_sentinel.recipe
+++ b/recipes/orlando_sentinel.recipe
@@ -1,3 +1,4 @@
+import urllib, re
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1279258912(BasicNewsRecipe):
@@ -27,12 +28,30 @@ class AdvancedUserRecipe1279258912(BasicNewsRecipe):
encoding = 'utf-8'
conversion_options = {'linearize_tables':True}
masthead_url = 'http://www.orlandosentinel.com/media/graphic/2009-07/46844851.gif'
- keep_only_tags = [
- dict(name='div', attrs={'class':'story'})
- ]
- remove_tags = [
- dict(name='div', attrs={'class':['articlerail','tools','comment-group','clearfix']}),
- ]
- remove_tags_after = [
- dict(name='p', attrs={'class':'copyright'}),
- ]
+
+ auto_cleanup = True
+
+ def get_article_url(self, article):
+ ans = None
+ try:
+ s = article.summary
+ ans = urllib.unquote(
+ re.search(r'href=".+?bookmark.cfm.+?link=(.+?)"', s).group(1))
+ except:
+ pass
+ if ans is None:
+ link = article.get('feedburner_origlink', None)
+ if link and link.split('/')[-1]=="story01.htm":
+ link=link.split('/')[-2]
+ encoding = {'0B': '.', '0C': '/', '0A': '0', '0F': '=', '0G': '&',
+ '0D': '?', '0E': '-', '0N': '.com', '0L': 'http:',
+ '0S':'//'}
+ for k, v in encoding.iteritems():
+ link = link.replace(k, v)
+ ans = link
+ elif link:
+ ans = link
+ if ans is not None:
+ return ans.replace('?track=rss', '')
+
+
diff --git a/recipes/pc_arena.recipe b/recipes/pc_arena.recipe
index 952db30c3e..56bb601f70 100644
--- a/recipes/pc_arena.recipe
+++ b/recipes/pc_arena.recipe
@@ -7,6 +7,7 @@ class PC_Arena(BasicNewsRecipe):
description = u'Najnowsze informacje z branży IT - testy, recenzje, aktualności, rankingi, wywiady. Twoje źródło informacji o sprzęcie komputerowym.'
category = 'IT'
language = 'pl'
+ index='http://pcarena.pl'
masthead_url='http://pcarena.pl/pcarena/img/logo.png'
cover_url= 'http://pcarena.pl/pcarena/img/logo.png'
no_stylesheets = True
@@ -22,4 +23,10 @@ class PC_Arena(BasicNewsRecipe):
if 'http' not in url:
return 'http://pcarena.pl' + url
else:
- return url
\ No newline at end of file
+ return url
+
+ def preprocess_html(self, soup):
+ for a in soup('a'):
+ if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
+ a['href']=self.index + a['href']
+ return soup
\ No newline at end of file
diff --git a/recipes/readitlater.recipe b/recipes/readitlater.recipe
index 38f7ec1a9a..92c9aaf9d6 100644
--- a/recipes/readitlater.recipe
+++ b/recipes/readitlater.recipe
@@ -1,5 +1,5 @@
"""
-readitlaterlist.com
+Pocket Calibre Recipe v1.0
"""
__license__ = 'GPL v3'
__copyright__ = '''
@@ -12,22 +12,23 @@ from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
-class Readitlater(BasicNewsRecipe):
- title = 'ReadItLater'
+class Pocket(BasicNewsRecipe):
+ title = 'Pocket'
__author__ = 'Darko Miletic, Przemyslaw Kryger, Keith Callenberg, tBunnyMan'
- description = '''Personalized news feeds. Go to readitlaterlist.com to setup \
- up your news. This version displays pages of articles from \
+ description = '''Personalized news feeds. Go to getpocket.com to setup up \
+ your news. This version displays pages of articles from \
oldest to newest, with max & minimum counts, and marks articles \
read after downloading.'''
- publisher = 'readitlaterlist.com'
+ publisher = 'getpocket.com'
category = 'news, custom'
oldest_article = 7
max_articles_per_feed = 50
- minimum_articles = 1
+ minimum_articles = 10
+ mark_as_read_after_dl = True
no_stylesheets = True
use_embedded_content = False
needs_subscription = True
- INDEX = u'http://readitlaterlist.com'
+ INDEX = u'http://getpocket.com'
LOGIN = INDEX + u'/l'
readList = []
@@ -100,9 +101,31 @@ class Readitlater(BasicNewsRecipe):
br = self.get_browser()
for link in markList:
url = self.INDEX + link
+ print 'Marking read: ', url
response = br.open(url)
- response
+ print response.info()
def cleanup(self):
- self.mark_as_read(self.readList)
+ if self.mark_as_read_after_dl:
+ self.mark_as_read(self.readList)
+ else:
+ pass
+ def default_cover(self, cover_file):
+ '''
+ Create a generic cover for recipes that don't have a cover
+ This override adds time to the cover
+ '''
+ try:
+ from calibre.ebooks import calibre_cover
+ title = self.title if isinstance(self.title, unicode) else \
+ self.title.decode('utf-8', 'replace')
+ date = strftime(self.timefmt)
+ time = strftime('[%I:%M %p]')
+ img_data = calibre_cover(title, date, time)
+ cover_file.write(img_data)
+ cover_file.flush()
+ except:
+ self.log.exception('Failed to generate default cover')
+ return False
+ return True
diff --git a/recipes/swiat_obrazu.recipe b/recipes/swiat_obrazu.recipe
new file mode 100644
index 0000000000..68740fa4dd
--- /dev/null
+++ b/recipes/swiat_obrazu.recipe
@@ -0,0 +1,25 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class Swiat_Obrazu(BasicNewsRecipe):
+ title = u'Swiat Obrazu'
+ __author__ = 'fenuks'
+ description = u'Internetowy Dziennik o Fotografii i Wideo www.SwiatObrazu.pl to źródło informacji o technice fotografii i wideo, o sprzęcie najbardziej znanych i uznanych firm: Canon, Nikon, Sony, Hasselblad i wielu innych. Znajdziecie tu programy do obróbki zdjęć, forum foto i forum wideo i galerie zdjęć. Codziennie najświeższe informacje: aktualności, testy, poradniki, wywiady, felietony. Swiatobrazu.pl stale organizuje konkursy oraz warsztaty fotograficzne i wideo.'
+ category = 'photography'
+ masthead_url = 'http://www.swiatobrazu.pl/img/logo.jpg'
+ cover_url = 'http://www.swiatobrazu.pl/img/logo.jpg'
+ language = 'pl'
+ oldest_article = 7
+ max_articles_per_feed = 100
+ no_stylesheets = True
+ remove_javascript= True
+ use_embedded_content = False
+ feeds = [(u'Wszystko', u'http://www.swiatobrazu.pl/rss')]
+
+ def print_version(self, url):
+ return url + ',drukuj'
+
+ def image_url_processor(self, baseurl, url):
+ if 'http://' not in url and 'https://' not in url:
+ return 'http://www.swiatobrazu.pl' + url[5:]
+ else:
+ return url
diff --git a/recipes/tanuki.recipe b/recipes/tanuki.recipe
index 666cb8aa77..a615763307 100644
--- a/recipes/tanuki.recipe
+++ b/recipes/tanuki.recipe
@@ -34,4 +34,12 @@ class tanuki(BasicNewsRecipe):
def preprocess_html(self, soup):
self.append_page(soup, soup.body)
+ for a in soup('a'):
+ if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
+ if 'tanuki-anime' in soup.title.string.lower():
+ a['href']='http://anime.tanuki.pl' + a['href']
+ elif 'tanuki-manga' in soup.title.string.lower():
+ a['href']='http://manga.tanuki.pl' + a['href']
+ elif 'tanuki-czytelnia' in soup.title.string.lower():
+ a['href']='http://czytelnia.tanuki.pl' + a['href']
return soup
\ No newline at end of file
diff --git a/recipes/the_sun.recipe b/recipes/the_sun.recipe
index 9285c0b2c2..db74e003a0 100644
--- a/recipes/the_sun.recipe
+++ b/recipes/the_sun.recipe
@@ -1,9 +1,8 @@
-import re
+import re, mechanize
from calibre.web.feeds.recipes import BasicNewsRecipe
class AdvancedUserRecipe1325006965(BasicNewsRecipe):
title = u'The Sun UK'
- cover_url = 'http://www.thesun.co.uk/img/global/new-masthead-logo.png'
description = 'A Recipe for The Sun tabloid UK'
__author__ = 'Dave Asbury'
@@ -24,37 +23,69 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe):
no_stylesheets = True
extra_css = '''
- body{ text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;}
- '''
+ body{ text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;}
+ '''
preprocess_regexps = [
- (re.compile(r'