diff --git a/Changelog.yaml b/Changelog.yaml
index 43eb775233..452744ba94 100644
--- a/Changelog.yaml
+++ b/Changelog.yaml
@@ -19,6 +19,158 @@
# new recipes:
# - title:
+- version: 0.8.49
+ date: 2012-04-27
+
+ new features:
+ - title: "Experimental support for generating Amazon's new KF8 format MOBI files"
+ description: "calibre can now generate Amazon's new KF8 format MOBI files.
+ To turn on this feature, go to Preferences->Tweaks and click Plugin Tweaks. In the box add:
+ test_mobi_output_type = 'both'
+ calibre will now produce MOBI files that have both the old MOBI format and the new KF8 format in them.
+ To learn more about KF8, see: http://www.amazon.com/gp/feature.html?docId=1000729511
+ Note that calibre support for KF8 is still experimental and there will likely be bugs."
+
+ - title: "Upgrade to using cssutils 0.9.9 for CSS parsing. Improved speed and robustness."
+
+ - title: "Show cover size in a tooltip in the conversion dialog"
+ tickets: [986958]
+
+ - title: "Driver for Nook Simple Touch with Glow Light"
+ tickets: [989264]
+
+ bug fixes:
+ - title: "Heuristics: When italicizing words do not operate on words not in between HTML tags."
+ tickets: [986298]
+
+ - title: "Fix (I hope) the bulk metadata download process crashing for some people on OS X when clicking the Yes button to apply the updates."
+ tickets: [986658]
+
+ - title: "Fix tooltip not being updated in the book details panel when pasting in a new cover"
+ tickets: [986958]
+
+ - title: "Cover Browser: Wrap the title on space only, not in between words."
+ tickets: [986516]
+
+ - title: "Edit metadata dialog: If a permission denied error occurs when clicking the next or prev buttons, stay on the current book."
+ tickets: [986903]
+
+ - title: "Fix heuristics not removing unnecessary hyphens from the end of lines."
+ tickets: [822744]
+
+ improved recipes:
+ - Metro Nieuws NL
+ - Der Tagesspiegel
+
+ new recipes:
+ - title: Berria
+ author: Alayn Gortazar
+
+ - title: Sol Haber
+ author: Onur Gungor
+
+ - title: Telam
+ author: Darko Miletic
+
+ - title: Richmond Times-Dispatch
+ author: jde
+
+- version: 0.8.48
+ date: 2012-04-20
+
+ new features:
+ - title: "Conversion: The search and replace feature has been completely revamped."
+ description: "You can now use any number of search and replace
+ expressions, not just three. You can also store and load frequently used
+ sets of search and replace expressions. Also, the wizard generates its
+ preview in a separate process to protect against crashes/memory leaks."
+ tickets: [983476,983484,983478]
+
+ - title: "Support for the new '.azw3' files that Amazon recently started generating. calibre will now detect them as ebooks. It can also view/convert them, if they are DRM free."
+
+ - title: "Drivers for Samsung Galaxy ACE GT-S5830L and HTC One X"
+ tickets: [981185]
+
+ bug fixes:
+ - title: "Get Books: Support the new website design of Barnes & Noble"
+
+ - title: "T1 driver: Fix books sent to SD card sometimes resulting in problems when deleted."
+ tickets: [943586]
+
+ - title: "Do not allow author names to be set to blank via the Manage authors function. Blank authors are now automatically set to 'Unknown'"
+
+ - title: "MOBI Output: Handle background color specified on <body>
+ and <td>|<tr> in addition to <table> tags."
+ tickets: [980813]
+
+ - title: "MOBI Output: Fix underline style applied to parent element not getting inherited by children."
+ tickets: [985711]
+
+ improved recipes:
+ - xkcd
+ - Metro Nieuws
+ - Calgary Herald
+ - Orlando Sentinel
+ - countryfile
+ - Heise
+
+ new recipes:
+ - title: Various new Polish news sources
+ author: fenuks
+
+ - title: Various Italian news sources
+ author: faber1971
+
+ - title: Jakarta Globe
+ author: rty
+
+ - title: Acim Bilim Dergisi
+ author: thomass
+
+- version: 0.8.47
+ date: 2012-04-13
+
+ new features:
+ - title: "Conversion pipeline: Add support for all the named entities in the HTML 5 spec."
+ tickets: [976056]
+
+ - title: "Support for viewing and converting the Haodoo PDB ebook format"
+ tickets: [976478]
+
+ - title: "Device driver for Laser EB720"
+
+ bug fixes:
+ - title: "Fix regression in automatic adding in 0.8.46 that broke automatic adding if adding of duplicates is enabled and auto convert is also enabled"
+ tickets: [976336]
+
+ - title: 'Fix "Tags" field in advanced search does not obey regex setting'
+ tickets: [980221]
+
+ - title: "EPUB Input: Automatically extract cover image from simple HTML title page that consists of only a single
+ <img> tag, instead of rendering the page"
+
+ - title: "Prevent errors when both author and author_sort are used in a template for reading metadata from filenames for files on a device"
+
+ - title: "Amazon metadata download: Handle books whose titles start with a bracket."
+ tickets: [976365]
+
+ - title: "Get Books: Fix downloading of purchased books from Baen"
+ tickets: [975929]
+
+
+ improved recipes:
+ - Forbes
+ - Caros Amigos
+ - Trouw
+ - Sun UK
+ - Metro
+ - Daily Mirror
+
+ new recipes:
+ - title: "Melbourne Herald Sun"
+ author: Ray Hartley
+
+ - title: "Editoriali and Zerocalcare"
+ author: faber1971
+
- version: 0.8.46
date: 2012-04-06
diff --git a/recipes/acim_bilim_dergisi.recipe b/recipes/acim_bilim_dergisi.recipe
new file mode 100644
index 0000000000..5d674fe93a
--- /dev/null
+++ b/recipes/acim_bilim_dergisi.recipe
@@ -0,0 +1,27 @@
+# -*- coding: utf-8 -*-
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class AdvancedUserRecipe1334868409(BasicNewsRecipe):
+ title = u'AÇIK BİLİM DERGİSİ'
+ description = ' Aylık çevrimiçi bilim dergisi'
+ __author__ = u'thomass'
+ oldest_article = 30
+ max_articles_per_feed = 300
+ auto_cleanup = True
+ encoding = 'UTF-8'
+ publisher = 'açık bilim'
+ category = 'haber, bilim,TR,dergi'
+ language = 'tr'
+ publication_type = 'magazine '
+ conversion_options = {
+ 'tags' : category
+ ,'language' : language
+ ,'publisher' : publisher
+ ,'linearize_tables': True
+ }
+ cover_img_url = 'http://www.acikbilim.com/wp-content/themes/Equilibrium/images/logodene.jpg'
+ masthead_url = 'http://www.acikbilim.com/wp-content/themes/Equilibrium/images/logodene.jpg'
+
+
+ feeds = [(u'Tüm Yayınlar', u'http://www.acikbilim.com/feed')]
diff --git a/recipes/adventure_zone_pl.recipe b/recipes/adventure_zone_pl.recipe
index 65f4e3e52d..bb311606ac 100644
--- a/recipes/adventure_zone_pl.recipe
+++ b/recipes/adventure_zone_pl.recipe
@@ -9,6 +9,7 @@ class Adventure_zone(BasicNewsRecipe):
no_stylesheets = True
oldest_article = 20
max_articles_per_feed = 100
+ index='http://www.adventure-zone.info/fusion/'
use_embedded_content=False
preprocess_regexps = [(re.compile(r"Komentarze | ", re.IGNORECASE), lambda m: '')]
remove_tags_before= dict(name='td', attrs={'class':'main-bg'})
@@ -45,6 +46,19 @@ class Adventure_zone(BasicNewsRecipe):
skip_tag = skip_tag.findAll(name='a')
for r in skip_tag:
if r.strong:
- word=r.strong.string
- if word and (('zapowied' in word) or ('recenzj' in word) or ('solucj' in word)):
- return self.index_to_soup('http://www.adventure-zone.info/fusion/print.php?type=A&item'+r['href'][r['href'].find('article_id')+7:], raw=True)
\ No newline at end of file
+ word=r.strong.string.lower()
+ if word and (('zapowied' in word) or ('recenzj' in word) or ('solucj' in word) or ('poradnik' in word)):
+ return self.index_to_soup('http://www.adventure-zone.info/fusion/print.php?type=A&item'+r['href'][r['href'].find('article_id')+7:], raw=True)
+
+ def preprocess_html(self, soup):
+ footer=soup.find(attrs={'class':'news-footer middle-border'})
+ if footer and len(footer('a'))>=2:
+ footer('a')[1].extract()
+ for item in soup.findAll(style=True):
+ del item['style']
+ for a in soup('a'):
+ if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
+ a['href']=self.index + a['href']
+ return soup
+
+
\ No newline at end of file
diff --git a/recipes/benchmark_pl.recipe b/recipes/benchmark_pl.recipe
index cc74cc9128..00eea1be68 100644
--- a/recipes/benchmark_pl.recipe
+++ b/recipes/benchmark_pl.recipe
@@ -68,4 +68,7 @@ class Benchmark_pl(BasicNewsRecipe):
self.image_article(soup, soup.body)
else:
self.append_page(soup, soup.body)
+ for a in soup('a'):
+ if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
+ a['href']=self.INDEX + a['href']
return soup
diff --git a/recipes/berria.recipe b/recipes/berria.recipe
new file mode 100644
index 0000000000..06f8344988
--- /dev/null
+++ b/recipes/berria.recipe
@@ -0,0 +1,44 @@
+__license__ = 'GPL v3'
+__copyright__ = '2012, Alayn Gortazar '
+'''
+www.berria.info
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+
+class Berria(BasicNewsRecipe):
+ title = 'Berria'
+ __author__ = 'Alayn Gortazar'
+ description = 'Euskal Herriko euskarazko egunkaria'
+ publisher = 'Berria'
+ category = 'news, politics, sports, Basque Country'
+ oldest_article = 2
+ max_articles_per_feed = 100
+ no_stylesheets = True
+ use_embedded_content = False
+ language = 'eu'
+ remove_empty_feeds = True
+ masthead_url = 'http://upload.wikimedia.org/wikipedia/commons/thumb/6/6a/Berria_Logo.svg/400px-Berria_Logo.svg.png'
+
+ keep_only_tags = [
+ dict(id='goiburua'),
+ dict(name='div', attrs={'class':['ber_ikus']}),
+ dict(name='section', attrs={'class':'ber_ikus'})
+ ]
+ remove_tags = [
+ dict(name='a', attrs={'class':'iruzkinak'}),
+ dict(name='div', attrs={'class':'laguntzaileak'})
+ ]
+
+ extra_css = '#goiburua{font-weight: bold} .zintiloa{font-size: small} .sarrera{color:#666} .titularra{font-size: x-large} .sarrera{font-weight: bold} .argazoin{color:#666; font-size: small}'
+
+ feeds = [
+ (u'Edizioa jarraia', u'http://berria.info/rss/ediziojarraia.xml'),
+ (u'Iritzia', u'http://berria.info/rss/iritzia.xml'),
+ (u'Euskal Herria', u'http://berria.info/rss/euskalherria.xml'),
+ (u'Ekonomia', u'http://berria.info/rss/ekonomia.xml'),
+ (u'Mundua', u'http://berria.info/rss/mundua.xml'),
+ (u'Kirola', u'http://berria.info/rss/kirola.xml'),
+ (u'Plaza', u'http://berria.info/rss/plaza.xml')
+ ]
diff --git a/recipes/calgary_herald.recipe b/recipes/calgary_herald.recipe
index dc919a76f8..12134bc9a4 100644
--- a/recipes/calgary_herald.recipe
+++ b/recipes/calgary_herald.recipe
@@ -1,220 +1,35 @@
-#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-
-__license__ = 'GPL v3'
-
-'''
-www.canada.com
-'''
-
-import re
-from calibre.web.feeds.news import BasicNewsRecipe
-from calibre.ebooks.BeautifulSoup import BeautifulStoneSoup
-
-
-class CanWestPaper(BasicNewsRecipe):
-
- # un-comment the following four lines for the Victoria Times Colonist
-## title = u'Victoria Times Colonist'
-## url_prefix = 'http://www.timescolonist.com'
-## description = u'News from Victoria, BC'
-## fp_tag = 'CAN_TC'
-
- # un-comment the following four lines for the Vancouver Province
-## title = u'Vancouver Province'
-## url_prefix = 'http://www.theprovince.com'
-## description = u'News from Vancouver, BC'
-## fp_tag = 'CAN_VP'
-
- # un-comment the following four lines for the Vancouver Sun
-## title = u'Vancouver Sun'
-## url_prefix = 'http://www.vancouversun.com'
-## description = u'News from Vancouver, BC'
-## fp_tag = 'CAN_VS'
-
- # un-comment the following four lines for the Edmonton Journal
-## title = u'Edmonton Journal'
-## url_prefix = 'http://www.edmontonjournal.com'
-## description = u'News from Edmonton, AB'
-## fp_tag = 'CAN_EJ'
-
- # un-comment the following four lines for the Calgary Herald
- title = u'Calgary Herald'
- url_prefix = 'http://www.calgaryherald.com'
- description = u'News from Calgary, AB'
- fp_tag = 'CAN_CH'
-
- # un-comment the following four lines for the Regina Leader-Post
-## title = u'Regina Leader-Post'
-## url_prefix = 'http://www.leaderpost.com'
-## description = u'News from Regina, SK'
-## fp_tag = ''
-
- # un-comment the following four lines for the Saskatoon Star-Phoenix
-## title = u'Saskatoon Star-Phoenix'
-## url_prefix = 'http://www.thestarphoenix.com'
-## description = u'News from Saskatoon, SK'
-## fp_tag = ''
-
- # un-comment the following four lines for the Windsor Star
-## title = u'Windsor Star'
-## url_prefix = 'http://www.windsorstar.com'
-## description = u'News from Windsor, ON'
-## fp_tag = 'CAN_'
-
- # un-comment the following four lines for the Ottawa Citizen
-## title = u'Ottawa Citizen'
-## url_prefix = 'http://www.ottawacitizen.com'
-## description = u'News from Ottawa, ON'
-## fp_tag = 'CAN_OC'
-
- # un-comment the following four lines for the Montreal Gazette
-## title = u'Montreal Gazette'
-## url_prefix = 'http://www.montrealgazette.com'
-## description = u'News from Montreal, QC'
-## fp_tag = 'CAN_MG'
-
-
- language = 'en_CA'
- __author__ = 'Nick Redding'
- no_stylesheets = True
- timefmt = ' [%b %d]'
- extra_css = '''
- .timestamp { font-size:xx-small; display: block; }
- #storyheader { font-size: medium; }
- #storyheader h1 { font-size: x-large; }
- #storyheader h2 { font-size: large; font-style: italic; }
- .byline { font-size:xx-small; }
- #photocaption { font-size: small; font-style: italic }
- #photocredit { font-size: xx-small; }'''
- keep_only_tags = [dict(name='div', attrs={'id':'storyheader'}),dict(name='div', attrs={'id':'storycontent'})]
- remove_tags = [{'class':'comments'},
- dict(name='div', attrs={'class':'navbar'}),dict(name='div', attrs={'class':'morelinks'}),
- dict(name='div', attrs={'class':'viewmore'}),dict(name='li', attrs={'class':'email'}),
- dict(name='div', attrs={'class':'story_tool_hr'}),dict(name='div', attrs={'class':'clear'}),
- dict(name='div', attrs={'class':'story_tool'}),dict(name='div', attrs={'class':'copyright'}),
- dict(name='div', attrs={'class':'rule_grey_solid'}),
- dict(name='li', attrs={'class':'print'}),dict(name='li', attrs={'class':'share'}),dict(name='ul', attrs={'class':'bullet'})]
-
- def get_cover_url(self):
- from datetime import timedelta, date
- if self.fp_tag=='':
- return None
- cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str(date.today().day)+'/lg/'+self.fp_tag+'.jpg'
- br = BasicNewsRecipe.get_browser()
- daysback=1
- try:
- br.open(cover)
- except:
- while daysback<7:
- cover = 'http://webmedia.newseum.org/newseum-multimedia/dfp/jpg'+str((date.today() - timedelta(days=daysback)).day)+'/lg/'+self.fp_tag+'.jpg'
- br = BasicNewsRecipe.get_browser()
- try:
- br.open(cover)
- except:
- daysback = daysback+1
- continue
- break
- if daysback==7:
- self.log("\nCover unavailable")
- cover = None
- return cover
-
- def fixChars(self,string):
- # Replace lsquo (\x91)
- fixed = re.sub("\x91","‘",string)
- # Replace rsquo (\x92)
- fixed = re.sub("\x92","’",fixed)
- # Replace ldquo (\x93)
- fixed = re.sub("\x93","“",fixed)
- # Replace rdquo (\x94)
- fixed = re.sub("\x94","”",fixed)
- # Replace ndash (\x96)
- fixed = re.sub("\x96","–",fixed)
- # Replace mdash (\x97)
- fixed = re.sub("\x97","—",fixed)
- fixed = re.sub("’","’",fixed)
- return fixed
-
- def massageNCXText(self, description):
- # Kindle TOC descriptions won't render certain characters
- if description:
- massaged = unicode(BeautifulStoneSoup(description, convertEntities=BeautifulStoneSoup.HTML_ENTITIES))
- # Replace '&' with '&'
- massaged = re.sub("&","&", massaged)
- return self.fixChars(massaged)
- else:
- return description
-
- def populate_article_metadata(self, article, soup, first):
- if first:
- picdiv = soup.find('body').find('img')
- if picdiv is not None:
- self.add_toc_thumbnail(article,re.sub(r'links\\link\d+\\','',picdiv['src']))
- xtitle = article.text_summary.strip()
- if len(xtitle) == 0:
- desc = soup.find('meta',attrs={'property':'og:description'})
- if desc is not None:
- article.summary = article.text_summary = desc['content']
-
- def strip_anchors(self,soup):
- paras = soup.findAll(True)
- for para in paras:
- aTags = para.findAll('a')
- for a in aTags:
- if a.img is None:
- a.replaceWith(a.renderContents().decode('cp1252','replace'))
- return soup
-
- def preprocess_html(self, soup):
- return self.strip_anchors(soup)
-
-
-
- def parse_index(self):
- soup = self.index_to_soup(self.url_prefix+'/news/todays-paper/index.html')
-
- articles = {}
- key = 'News'
- ans = ['News']
-
- # Find each instance of class="sectiontitle", class="featurecontent"
- for divtag in soup.findAll('div',attrs={'class' : ["section_title02","featurecontent"]}):
- #self.log(" div class = %s" % divtag['class'])
- if divtag['class'].startswith('section_title'):
- # div contains section title
- if not divtag.h3:
- continue
- key = self.tag_to_string(divtag.h3,False)
- ans.append(key)
- self.log("Section name %s" % key)
- continue
- # div contains article data
- h1tag = divtag.find('h1')
- if not h1tag:
- continue
- atag = h1tag.find('a',href=True)
- if not atag:
- continue
- url = self.url_prefix+'/news/todays-paper/'+atag['href']
- #self.log("Section %s" % key)
- #self.log("url %s" % url)
- title = self.tag_to_string(atag,False)
- #self.log("title %s" % title)
- pubdate = ''
- description = ''
- ptag = divtag.find('p');
- if ptag:
- description = self.tag_to_string(ptag,False)
- #self.log("description %s" % description)
- author = ''
- autag = divtag.find('h4')
- if autag:
- author = self.tag_to_string(autag,False)
- #self.log("author %s" % author)
- if not articles.has_key(key):
- articles[key] = []
- articles[key].append(dict(title=title,url=url,date=pubdate,description=description,author=author,content=''))
-
- ans = [(key, articles[key]) for key in ans if articles.has_key(key)]
- return ans
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class CalgaryHerald(BasicNewsRecipe):
+ title = u'Calgary Herald'
+ oldest_article = 3
+ max_articles_per_feed = 100
+
+ feeds = [
+ (u'News', u'http://rss.canada.com/get/?F233'),
+ (u'Calgary', u'http://www.calgaryherald.com/scripts/sp6query.aspx?catalog=cahr&tags=keyword|calgary&output=rss?link=http%3a%2f%2fwww.calgaryherald'),
+ (u'Alberta', u'http://www.calgaryherald.com/scripts/Sp6Query.aspx?catalog=CAHR&tags=Keyword|Alberta&output=rss?link=http%3A%2F%2Fwww.calgaryherald.com%2Fnews%2Falberta%2Findex.html'),
+ (u'Politics', u'http://rss.canada.com/get/?F7551'),
+ (u'National', u'http://rss.canada.com/get/?F7552'),
+ (u'World', u'http://rss.canada.com/get/?F7553'),
+ ]
+ __author__ = 'rty'
+ pubisher = 'Calgary Herald'
+ description = 'Calgary, Alberta, Canada'
+ category = 'News, Calgary, Alberta, Canada'
+
+
+ remove_javascript = True
+ use_embedded_content = False
+ no_stylesheets = True
+ language = 'en_CA'
+ encoding = 'utf-8'
+ conversion_options = {'linearize_tables':True}
+ ##masthead_url = 'http://www.calgaryherald.com/index.html'
+ keep_only_tags = [
+ dict(name='div', attrs={'id':'storyheader'}),
+ dict(name='div', attrs={'id':'storycontent'})
+
+ ]
+ remove_tags_after = {'class':"story_tool_hr"}
+
diff --git a/recipes/camera_di_commercio_di_bari.recipe b/recipes/camera_di_commercio_di_bari.recipe
new file mode 100644
index 0000000000..c80a825883
--- /dev/null
+++ b/recipes/camera_di_commercio_di_bari.recipe
@@ -0,0 +1,17 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class AdvancedUserRecipe1331729727(BasicNewsRecipe):
+ title = u'Camera di Commercio di Bari'
+ oldest_article = 7
+ __author__ = 'faber1971'
+ description = 'News from the Chamber of Commerce of Bari'
+ language = 'it'
+ max_articles_per_feed = 100
+ auto_cleanup = True
+ masthead_url = 'http://www.ba.camcom.it/grafica/layout-bordo/logo_camcom_bari.png'
+ feeds = [(u'Camera di Commercio di Bari', u'http://feed43.com/4715147488845101.xml')]
+
+__license__ = 'GPL v3'
+__copyright__ = '2012, faber1971'
+__version__ = 'v1.00'
+__date__ = '17, April 2012'
diff --git a/recipes/cd_action.recipe b/recipes/cd_action.recipe
index ff46774dc9..4e19fbc6c1 100644
--- a/recipes/cd_action.recipe
+++ b/recipes/cd_action.recipe
@@ -6,6 +6,7 @@ class CD_Action(BasicNewsRecipe):
description = 'cdaction.pl - polish games magazine site'
category = 'games'
language = 'pl'
+ index='http://www.cdaction.pl'
oldest_article = 8
max_articles_per_feed = 100
no_stylesheets= True
@@ -17,4 +18,10 @@ class CD_Action(BasicNewsRecipe):
def get_cover_url(self):
soup = self.index_to_soup('http://www.cdaction.pl/magazyn/')
self.cover_url='http://www.cdaction.pl'+ soup.find(id='wspolnik').div.a['href']
- return getattr(self, 'cover_url', self.cover_url)
\ No newline at end of file
+ return getattr(self, 'cover_url', self.cover_url)
+
+ def preprocess_html(self, soup):
+ for a in soup('a'):
+ if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
+ a['href']=self.index + a['href']
+ return soup
\ No newline at end of file
diff --git a/recipes/countryfile.recipe b/recipes/countryfile.recipe
index 7a41b5b905..0502129791 100644
--- a/recipes/countryfile.recipe
+++ b/recipes/countryfile.recipe
@@ -1,11 +1,12 @@
+from calibre import browser
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1325006965(BasicNewsRecipe):
title = u'Countryfile.com'
- cover_url = 'http://www.buysubscriptions.com/static_content/the-immediate/en/images/covers/CFIL_maxi.jpg'
+ #cover_url = 'http://www.countryfile.com/sites/default/files/imagecache/160px_wide/cover/2_1.jpg'
__author__ = 'Dave Asbury'
description = 'The official website of Countryfile Magazine'
- # last updated 29/1/12
+ # last updated 15/4/12
language = 'en_GB'
oldest_article = 30
max_articles_per_feed = 25
@@ -13,7 +14,23 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe):
no_stylesheets = True
auto_cleanup = True
#articles_are_obfuscated = True
+ def get_cover_url(self):
+ soup = self.index_to_soup('http://www.countryfile.com/')
+ cov = soup.find(attrs={'class' : 'imagecache imagecache-160px_wide imagecache-linked imagecache-160px_wide_linked'})
+ #print '******** ',cov,' ***'
+ cov2 = str(cov)
+ cov2=cov2[124:-90]
+ #print '******** ',cov2,' ***'
+ # try to get cover - if can't get known cover
+ br = browser()
+ br.set_handle_redirect(False)
+ try:
+ br.open_novisit(cov2)
+ cover_url = cov2
+ except:
+ cover_url = 'http://www.countryfile.com/sites/default/files/imagecache/160px_wide/cover/2_1.jpg'
+ return cover_url
remove_tags = [
# dict(attrs={'class' : ['player']}),
diff --git a/recipes/daily_mirror.recipe b/recipes/daily_mirror.recipe
index d6794b1d97..8bac57951c 100644
--- a/recipes/daily_mirror.recipe
+++ b/recipes/daily_mirror.recipe
@@ -1,20 +1,21 @@
+
from calibre.web.feeds.news import BasicNewsRecipe
import re
+import mechanize
class AdvancedUserRecipe1306061239(BasicNewsRecipe):
title = u'The Daily Mirror'
description = 'News as provide by The Daily Mirror -UK'
__author__ = 'Dave Asbury'
- # last updated 11/2/12
+ # last updated 7/4/12
language = 'en_GB'
-
- cover_url = 'http://yookeo.com/screens/m/i/mirror.co.uk.jpg'
+ #cover_url = 'http://yookeo.com/screens/m/i/mirror.co.uk.jpg'
masthead_url = 'http://www.nmauk.co.uk/nma/images/daily_mirror.gif'
oldest_article = 1
- max_articles_per_feed = 5
+ max_articles_per_feed = 10
remove_empty_feeds = True
remove_javascript = True
no_stylesheets = True
@@ -75,3 +76,28 @@ class AdvancedUserRecipe1306061239(BasicNewsRecipe):
img { display:block}
'''
+ def get_cover_url(self):
+ soup = self.index_to_soup('http://www.politicshome.com/uk/latest_frontpage.html')
+ # look for the block containing the mirror button and url
+ cov = soup.find(attrs={'style' : 'background-image: url(http://www.politicshome.com/images/sources/source_frontpage_button_92.gif);'})
+ cov2 = str(cov)
+ cov2='http://www.politicshome.com'+cov2[9:-142]
+ #cov2 now contains url of the page containing pic
+ soup = self.index_to_soup(cov2)
+ cov = soup.find(attrs={'id' : 'large'})
+ cov2 = str(cov)
+ cov2=cov2[27:-18]
+ #cov2 now is pic url, now go back to original function
+ br = mechanize.Browser()
+ br.set_handle_redirect(False)
+ try:
+ br.open_novisit(cov2)
+ cover_url = cov2
+ except:
+ cover_url = 'http://yookeo.com/screens/m/i/mirror.co.uk.jpg'
+
+ #cover_url = cov2
+ #cover_url = 'http://www.thesun.co.uk/img/global/new-masthead-logo.png'
+ return cover_url
+
+
diff --git a/recipes/dobreprogamy.recipe b/recipes/dobreprogamy.recipe
index a27a9b0877..0614cf98ee 100644
--- a/recipes/dobreprogamy.recipe
+++ b/recipes/dobreprogamy.recipe
@@ -11,6 +11,7 @@ class Dobreprogramy_pl(BasicNewsRecipe):
cover_url = 'http://userlogos.org/files/logos/Karmody/dobreprogramy_01.png'
description = u'Aktualności i blogi z dobreprogramy.pl'
encoding = 'utf-8'
+ index='http://www.dobreprogramy.pl/'
no_stylesheets = True
language = 'pl'
extra_css = '.title {font-size:22px;}'
@@ -22,3 +23,10 @@ class Dobreprogramy_pl(BasicNewsRecipe):
#remove_tags = [dict(name='div', attrs={'class':['komentarze', 'block', 'portalInfo', 'menuBar', 'topBar']})]
feeds = [(u'Aktualności', 'http://feeds.feedburner.com/dobreprogramy/Aktualnosci'),
('Blogi', 'http://feeds.feedburner.com/dobreprogramy/BlogCzytelnikow')]
+
+
+ def preprocess_html(self, soup):
+ for a in soup('a'):
+ if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
+ a['href']=self.index + a['href']
+ return soup
\ No newline at end of file
diff --git a/recipes/dzieje_pl.recipe b/recipes/dzieje_pl.recipe
index d80161e71a..4c583e4815 100644
--- a/recipes/dzieje_pl.recipe
+++ b/recipes/dzieje_pl.recipe
@@ -7,6 +7,7 @@ class Dzieje(BasicNewsRecipe):
cover_url = 'http://www.dzieje.pl/sites/default/files/dzieje_logo.png'
category = 'history'
language = 'pl'
+ index='http://dzieje.pl'
oldest_article = 8
max_articles_per_feed = 100
remove_javascript=True
@@ -15,3 +16,10 @@ class Dzieje(BasicNewsRecipe):
remove_tags_after= dict(id='dogory')
remove_tags=[dict(id='dogory')]
feeds = [(u'Dzieje', u'http://dzieje.pl/rss.xml')]
+
+
+ def preprocess_html(self, soup):
+ for a in soup('a'):
+ if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
+ a['href']=self.index + a['href']
+ return soup
\ No newline at end of file
diff --git a/recipes/eioba.recipe b/recipes/eioba.recipe
index 14256c5811..1df79d64bd 100644
--- a/recipes/eioba.recipe
+++ b/recipes/eioba.recipe
@@ -21,3 +21,8 @@ class eioba(BasicNewsRecipe):
(u'Rozrywka', u'http://www.eioba.pl/feed/categories/10.xml'),
(u'Rożne', u'http://www.eioba.pl/feed/categories/9.xml')
]
+
+ def preprocess_html(self, soup):
+ for item in soup.findAll(style=True):
+ del item['style']
+ return soup
diff --git a/recipes/emuzica_pl.recipe b/recipes/emuzica_pl.recipe
index 75271c510a..2fbf9ff514 100644
--- a/recipes/emuzica_pl.recipe
+++ b/recipes/emuzica_pl.recipe
@@ -7,6 +7,7 @@ class eMuzyka(BasicNewsRecipe):
description = u'Emuzyka to największa i najpopularniejsza strona o muzyce w Polsce'
category = 'music'
language = 'pl'
+ index='http://www.emuzyka.pl'
cover_url='http://s.emuzyka.pl/img/emuzyka_invert_small.jpg'
no_stylesheets = True
oldest_article = 7
@@ -14,3 +15,9 @@ class eMuzyka(BasicNewsRecipe):
keep_only_tags=[dict(name='div', attrs={'id':'news_container'}), dict(name='h3'), dict(name='div', attrs={'class':'review_text'})]
remove_tags=[dict(name='span', attrs={'id':'date'})]
feeds = [(u'Aktualno\u015bci', u'http://www.emuzyka.pl/rss.php?f=1'), (u'Recenzje', u'http://www.emuzyka.pl/rss.php?f=2')]
+
+ def preprocess_html(self, soup):
+ for a in soup('a'):
+ if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
+ a['href']=self.index + a['href']
+ return soup
\ No newline at end of file
diff --git a/recipes/fhm_uk.recipe b/recipes/fhm_uk.recipe
index 0e2d5c1ebe..07f2b4b64e 100644
--- a/recipes/fhm_uk.recipe
+++ b/recipes/fhm_uk.recipe
@@ -7,7 +7,7 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe):
# cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/373529_38324934806_64930243_n.jpg'
masthead_url = 'http://www.fhm.com/App_Resources/Images/Site/re-design/logo.gif'
__author__ = 'Dave Asbury'
- # last updated 17/3/12
+ # last updated 14/4/12
language = 'en_GB'
oldest_article = 28
max_articles_per_feed = 12
@@ -28,7 +28,8 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe):
#]
feeds = [
- (u'From the Homepage',u'http://feed43.com/8053226782885416.xml'),
+ (u'From the Homepage',u'http://feed43.com/0032328550253453.xml'),
+ #http://feed43.com/8053226782885416.xml'),
(u'Funny - The Very Best Of The Internet',u'http://feed43.com/4538510106331565.xml'),
(u'Upgrade',u'http://feed43.com/0877305847443234.xml'),
#(u'The Final Countdown', u'http://feed43.com/3576106158530118.xml'),
diff --git a/recipes/film_web.recipe b/recipes/film_web.recipe
index 877d4472bc..2a6e00d501 100644
--- a/recipes/film_web.recipe
+++ b/recipes/film_web.recipe
@@ -7,6 +7,7 @@ class Filmweb_pl(BasicNewsRecipe):
cover_url = 'http://userlogos.org/files/logos/crudus/filmweb.png'
category = 'movies'
language = 'pl'
+ index='http://www.filmweb.pl'
oldest_article = 8
max_articles_per_feed = 100
no_stylesheets= True
@@ -39,3 +40,9 @@ class Filmweb_pl(BasicNewsRecipe):
self.log.warn(skip_tag)
return self.index_to_soup(skip_tag['href'], raw=True)
+
+ def preprocess_html(self, soup):
+ for a in soup('a'):
+ if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
+ a['href']=self.index + a['href']
+ return soup
\ No newline at end of file
diff --git a/recipes/forbes.recipe b/recipes/forbes.recipe
index a633d0f543..fe72fda536 100644
--- a/recipes/forbes.recipe
+++ b/recipes/forbes.recipe
@@ -1,39 +1,49 @@
-from calibre.ebooks.BeautifulSoup import BeautifulSoup
+import re
from calibre.web.feeds.news import BasicNewsRecipe
class Forbes(BasicNewsRecipe):
title = u'Forbes'
description = 'Business and Financial News'
- __author__ = 'Darko Miletic'
+ __author__ = 'Kovid Goyal'
oldest_article = 30
- max_articles_per_feed = 100
+ max_articles_per_feed = 20
language = 'en'
+ encoding = 'utf-8'
+ recursions = 1
no_stylesheets = True
- html2lrf_options = ['--base-font-size', '10']
cover_url = u'http://www.forbes.com/media/current_covers/forbes_120_160.gif'
-
- feeds = [(u'Latest', u'http://www.forbes.com/news/index.xml'),
- (u'Most Popular', u'http://www.forbes.com/feeds/popstories.xml'),
- (u'Most Emailed', u'http://www.forbes.com/feeds/mostemailed.xml'),
- (u'Faces', u'http://www.forbes.com/facesscan/index.xml'),
- (u'Technology', u'http://www.forbes.com/technology/index.xml'),
- (u'Personal Tech', u'http://www.forbes.com/personaltech/index.xml'),
- (u'Wireless', u'http://www.forbes.com/wireless/index.xml'),
- (u'Business', u'http://www.forbes.com/business/index.xml'),
- (u'Sports Money', u'http://www.forbes.com/sportsmoney/index.xml'),
- (u'Sports', u'http://www.forbes.com/forbeslife/sports/index.xml'),
- (u'Vehicles', u'http://www.forbes.com/forbeslife/vehicles/index.xml'),
- (u'Leadership', u'http://www.forbes.com/leadership/index.xml'),
- (u'Careers', u'http://www.forbes.com/leadership/careers/index.xml'),
- (u'Compensation', u'http://www.forbes.com/leadership/compensation/index.xml'),
- (u'Managing', u'http://www.forbes.com/leadership/managing/index.xml')]
- def print_version(self, url):
- raw = self.browser.open(url).read()
- soup = BeautifulSoup(raw.decode('latin1', 'replace'))
- print_link = soup.find('a', {'onclick':"s_linkTrackVars='prop18';s_linkType='o';s_linkName='Print';if(typeof(globalPageName)!='undefined')s_prop18=globalPageName;s_lnk=s_co(this);s_gs(s_account);"})
- if print_link is None:
- return ''
- return 'http://www.forbes.com' + print_link['href']
\ No newline at end of file
+ feeds = [(u'Latest', u'http://www.forbes.com/news/index.xml'),
+ (u'Most Popular', u'http://www.forbes.com/feeds/popstories.xml'),
+ (u'Technology', u'http://www.forbes.com/technology/index.xml'),
+ (u'Business', u'http://www.forbes.com/business/index.xml'),
+ (u'Sports Money', u'http://www.forbes.com/sportsmoney/index.xml'),
+ (u'Leadership', u'http://www.forbes.com/leadership/index.xml'),]
+
+ keep_only_tags = \
+ {'class':lambda x: x and (set(x.split()) & {'body', 'pagination',
+ 'articleHead', 'article_head'})}
+ remove_tags_before = {'name':'h1'}
+ remove_tags = [
+ {'class':['comment_bug', 'engagement_block',
+ 'video_promo_block', 'article_actions']},
+ {'id':'comments'}
+ ]
+
+ def is_link_wanted(self, url, tag):
+ ans = re.match(r'http://.*/[2-9]/', url) is not None
+ if ans:
+ self.log('Following multipage link: %s'%url)
+ return ans
+
+ def postprocess_html(self, soup, first_fetch):
+ for pag in soup.findAll(True, 'pagination'):
+ pag.extract()
+ if not first_fetch:
+ h1 = soup.find('h1')
+ if h1 is not None:
+ h1.extract()
+ return soup
+
diff --git a/recipes/fotoblogia_pl.recipe b/recipes/fotoblogia_pl.recipe
new file mode 100644
index 0000000000..99df46419a
--- /dev/null
+++ b/recipes/fotoblogia_pl.recipe
@@ -0,0 +1,16 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class Fotoblogia_pl(BasicNewsRecipe):
+ title = u'Fotoblogia.pl'
+ __author__ = 'fenuks'
+ category = 'photography'
+ language = 'pl'
+ masthead_url = 'http://img.interia.pl/komputery/nimg/u/0/fotoblogia21.jpg'
+ cover_url= 'http://fotoblogia.pl/images/2009/03/fotoblogia2.jpg'
+ oldest_article = 7
+ max_articles_per_feed = 100
+ no_stylesheets = True
+ use_embedded_content = False
+ keep_only_tags=[dict(name='div', attrs={'class':'post-view post-standard'})]
+ remove_tags=[dict(attrs={'class':['external fotoblogia', 'categories', 'tags']})]
+ feeds = [(u'Wszystko', u'http://fotoblogia.pl/feed/rss2')]
diff --git a/recipes/gameplay_pl.recipe b/recipes/gameplay_pl.recipe
index f3384263d6..7b0ccb4f55 100644
--- a/recipes/gameplay_pl.recipe
+++ b/recipes/gameplay_pl.recipe
@@ -6,16 +6,24 @@ class Gameplay_pl(BasicNewsRecipe):
description = u'gameplay.pl - serwis o naszych zainteresowaniach, grach, filmach, książkach, muzyce, fotografii i konsolach.'
category = 'games, movies, books, music'
language = 'pl'
+ index='http://gameplay.pl'
masthead_url= 'http://gameplay.pl/img/gpy_top_logo.png'
cover_url= 'http://gameplay.pl/img/gpy_top_logo.png'
max_articles_per_feed = 100
+ remove_javascript= True
no_stylesheets= True
keep_only_tags=[dict(name='div', attrs={'class':['news_endpage_tit', 'news']})]
- remove_tags=[dict(name='div', attrs={'class':['galeria', 'noedit center im']})]
+ remove_tags=[dict(name='div', attrs={'class':['galeria', 'noedit center im', 'news_list', 'news_list_autor', 'stop_bot', 'tagi']}), dict(attrs={'usemap':'#map'})]
feeds = [(u'Wiadomo\u015bci', u'http://gameplay.pl/rss/')]
def image_url_processor(self, baseurl, url):
if 'http' not in url:
return 'http://gameplay.pl'+ url[2:]
else:
- return url
+ return url
+
+ def preprocess_html(self, soup):
+ for a in soup('a'):
+ if a.has_key('href') and '../' in a['href']:
+ a['href']=self.index + a['href'][2:]
+ return soup
\ No newline at end of file
diff --git a/recipes/gildia_pl.recipe b/recipes/gildia_pl.recipe
index 042902b5fc..36d3ef4da2 100644
--- a/recipes/gildia_pl.recipe
+++ b/recipes/gildia_pl.recipe
@@ -9,6 +9,7 @@ class Gildia(BasicNewsRecipe):
language = 'pl'
oldest_article = 8
max_articles_per_feed = 100
+ remove_empty_feeds=True
no_stylesheets=True
remove_tags=[dict(name='div', attrs={'class':'backlink'}), dict(name='div', attrs={'class':'im_img'}), dict(name='div', attrs={'class':'addthis_toolbox addthis_default_style'})]
keep_only_tags=dict(name='div', attrs={'class':'widetext'})
@@ -24,3 +25,16 @@ class Gildia(BasicNewsRecipe):
self.log.warn('odnosnik')
self.log.warn(link['href'])
return self.index_to_soup(link['href'], raw=True)
+
+ def preprocess_html(self, soup):
+ for a in soup('a'):
+ if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
+ if '/gry/' in a['href']:
+ a['href']='http://www.gry.gildia.pl' + a['href']
+ elif u'książk' in soup.title.string.lower() or u'komiks' in soup.title.string.lower():
+ a['href']='http://www.literatura.gildia.pl' + a['href']
+ elif u'komiks' in soup.title.string.lower():
+ a['href']='http://www.literatura.gildia.pl' + a['href']
+ else:
+ a['href']='http://www.gildia.pl' + a['href']
+ return soup
diff --git a/recipes/gram_pl.recipe b/recipes/gram_pl.recipe
index 07927796c0..1f8147ba3d 100644
--- a/recipes/gram_pl.recipe
+++ b/recipes/gram_pl.recipe
@@ -7,6 +7,7 @@ class Gram_pl(BasicNewsRecipe):
category = 'games'
language = 'pl'
oldest_article = 8
+ index='http://www.gram.pl'
max_articles_per_feed = 100
no_stylesheets= True
extra_css = 'h2 {font-style: italic; font-size:20px;} .picbox div {float: left;}'
@@ -52,4 +53,7 @@ class Gram_pl(BasicNewsRecipe):
tag=soup.findAll(name='div', attrs={'class':'picbox'})
for t in tag:
t['style']='float: left;'
+ for a in soup('a'):
+ if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
+ a['href']=self.index + a['href']
return soup
\ No newline at end of file
diff --git a/recipes/heise.recipe b/recipes/heise.recipe
index 56d5516656..ba93ea96ce 100644
--- a/recipes/heise.recipe
+++ b/recipes/heise.recipe
@@ -59,6 +59,7 @@ class heiseDe(BasicNewsRecipe):
dict(name='span', attrs={'class':'rsaquo'}),
dict(name='div', attrs={'class':'news_logo'}),
dict(name='div', attrs={'class':'bcadv ISI_IGNORE'}),
+ dict(name='div', attrs={'class':'navi_top_container'}),
dict(name='p', attrs={'class':'news_option'}),
dict(name='p', attrs={'class':'news_navi'}),
dict(name='div', attrs={'class':'news_foren'})]
@@ -69,3 +70,5 @@ class heiseDe(BasicNewsRecipe):
+
+
diff --git a/recipes/historia_news.recipe b/recipes/historia_news.recipe
new file mode 100644
index 0000000000..4eca8ade91
--- /dev/null
+++ b/recipes/historia_news.recipe
@@ -0,0 +1,20 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class historia_news(BasicNewsRecipe):
+ title = u'historia-news'
+ __author__ = 'fenuks'
+ description = u'Historia-news to portal dla ludzi kochających historię. Najnowsze newsy z historii bliższej i dalszej, archeologii, paleontologii oraz ciekawostki i podcasty z historii kultury, sportu, motoryzacji i inne.'
+ masthead_url = 'http://historia-news.pl/templates/hajak4/images/header.jpg'
+ cover_url= 'http://www.historia-news.pl/templates/hajak4/images/header.jpg'
+ category = 'history'
+ language = 'pl'
+ oldest_article = 7
+ max_articles_per_feed = 100
+ no_stylesheets = True
+ remove_empty_feeds = True
+ remove_tags=[dict(name='form'), dict(name='img', attrs={'alt':'Print'}), dict(attrs={'class':['commbutt', 'cpr']}), dict(id=['plusone', 'facebook'])]
+ feeds = [(u'Wiadomo\u015bci', u'http://historia-news.pl/wiadomoci.feed?type=rss'), (u'Artyku\u0142y', u'http://historia-news.pl/artykuy.feed?type=rss')]
+
+
+ def print_version(self, url):
+ return url + '?tmpl=component&print=1&layout=default&page='
diff --git a/recipes/icons/fotoblogia_pl.png b/recipes/icons/fotoblogia_pl.png
new file mode 100644
index 0000000000..0204a04e62
Binary files /dev/null and b/recipes/icons/fotoblogia_pl.png differ
diff --git a/recipes/icons/historia_news.png b/recipes/icons/historia_news.png
new file mode 100644
index 0000000000..79b1b52859
Binary files /dev/null and b/recipes/icons/historia_news.png differ
diff --git a/recipes/icons/swiat_obrazu.png b/recipes/icons/swiat_obrazu.png
new file mode 100644
index 0000000000..a61662a864
Binary files /dev/null and b/recipes/icons/swiat_obrazu.png differ
diff --git a/recipes/icons/telam.png b/recipes/icons/telam.png
new file mode 100644
index 0000000000..f86dcc1dbf
Binary files /dev/null and b/recipes/icons/telam.png differ
diff --git a/recipes/in4_pl.recipe b/recipes/in4_pl.recipe
index 16ad622b46..e385522714 100644
--- a/recipes/in4_pl.recipe
+++ b/recipes/in4_pl.recipe
@@ -8,6 +8,7 @@ class in4(BasicNewsRecipe):
description = u'Serwis Informacyjny - Aktualnosci, recenzje'
category = 'IT'
language = 'pl'
+ index='http://www.in4.pl/'
#cover_url= 'http://www.in4.pl/recenzje/337/in4pl.jpg'
no_stylesheets = True
remove_empty_feeds = True
@@ -39,6 +40,7 @@ class in4(BasicNewsRecipe):
def preprocess_html(self, soup):
self.append_page(soup, soup.body)
+ for a in soup('a'):
+ if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
+ a['href']=self.index + a['href']
return soup
-
-
diff --git a/recipes/infra_pl.recipe b/recipes/infra_pl.recipe
index 0e035e0980..e021fa0c17 100644
--- a/recipes/infra_pl.recipe
+++ b/recipes/infra_pl.recipe
@@ -8,6 +8,7 @@ class INFRA(BasicNewsRecipe):
description = u'Serwis Informacyjny INFRA - UFO, Zjawiska Paranormalne, Duchy, Tajemnice świata.'
cover_url = 'http://npn.nazwa.pl/templates/ja_teline_ii/images/logo.jpg'
category = 'UFO'
+ index='http://infra.org.pl'
language = 'pl'
max_articles_per_feed = 100
no_stylesheers=True
@@ -15,3 +16,11 @@ class INFRA(BasicNewsRecipe):
remove_tags_after=dict(attrs={'class':'pagenav'})
remove_tags=[dict(attrs={'class':'pagenav'})]
feeds = [(u'Najnowsze wiadomo\u015bci', u'http://www.infra.org.pl/index.php?option=com_rd_rss&id=1')]
+
+ def preprocess_html(self, soup):
+ for item in soup.findAll(style=True):
+ del item['style']
+ for a in soup('a'):
+ if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
+ a['href']=self.index + a['href']
+ return soup
\ No newline at end of file
diff --git a/recipes/jakarta_globe.recipe b/recipes/jakarta_globe.recipe
new file mode 100644
index 0000000000..1414ac6e5b
--- /dev/null
+++ b/recipes/jakarta_globe.recipe
@@ -0,0 +1,34 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class JakartaGlobe(BasicNewsRecipe):
+ title = u'Jakarta Globe'
+ oldest_article = 3
+ max_articles_per_feed = 100
+
+ feeds = [
+ (u'News', u'http://www.thejakartaglobe.com/pages/getrss/getrss-news.php'),
+ (u'Business', u'http://www.thejakartaglobe.com/pages/getrss/getrss-business.php'),
+ (u'Technology', u'http://www.thejakartaglobe.com/pages/getrss/getrss-tech.php'),
+ (u'My Jakarta', u'http://www.thejakartaglobe.com/pages/getrss/getrss-myjakarta.php'),
+ (u'International', u'http://www.thejakartaglobe.com/pages/getrss/getrss-international.php'),
+ (u'Life and Times', u'http://www.thejakartaglobe.com/pages/getrss/getrss-lifeandtimes.php'),
+ ]
+ __author__ = 'rty'
+ pubisher = 'JakartaGlobe.com'
+ description = 'JakartaGlobe, Indonesia, Newspaper'
+ category = 'News, Indonesia'
+
+
+ remove_javascript = True
+ use_embedded_content = False
+ no_stylesheets = True
+ language = 'en_ID'
+ encoding = 'utf-8'
+ conversion_options = {'linearize_tables':True}
+ masthead_url = 'http://www.thejakartaglobe.com/pages/2010/images/jak-globe-logo.jpg'
+ keep_only_tags = [
+ dict(name='div', attrs={'class':'story'}),
+ dict(name='span', attrs={'class':'headline'}),
+ dict(name='div', attrs={'class':'story'}),
+ dict(name='p', attrs={'id':'bodytext'})
+ ]
diff --git a/recipes/konflikty_zbrojne.recipe b/recipes/konflikty_zbrojne.recipe
index 7921e98f48..e8b28b49bf 100644
--- a/recipes/konflikty_zbrojne.recipe
+++ b/recipes/konflikty_zbrojne.recipe
@@ -1,5 +1,6 @@
# -*- coding: utf-8 -*-
from calibre.web.feeds.news import BasicNewsRecipe
+from calibre.ebooks.BeautifulSoup import BeautifulSoup
class Konflikty(BasicNewsRecipe):
title = u'Konflikty Zbrojne'
@@ -10,6 +11,23 @@ class Konflikty(BasicNewsRecipe):
category='military, history'
oldest_article = 7
max_articles_per_feed = 100
- auto_cleanup = True
+ no_stylesheets = True
+ keep_only_tags=[dict(attrs={'class':['title1', 'image']}), dict(id='body')]
- feeds = [(u'Aktualności', u'http://www.konflikty.pl/rss_aktualnosci_10.xml'), (u'Artyku\u0142y', u'http://www.konflikty.pl/rss_artykuly_10.xml'), (u'Relacje', u'http://www.konflikty.pl/rss_relacje_10.xml'), (u'Recenzje', u'http://www.konflikty.pl/rss_recenzje_10.xml')]
+ feeds = [(u'Aktualności', u'http://www.konflikty.pl/rss_aktualnosci_10.xml'),
+ (u'Artyku\u0142y', u'http://www.konflikty.pl/rss_artykuly_10.xml'),
+ (u'Historia', u'http://www.konflikty.pl/rss_historia_10.xml'),
+ (u'Militaria', u'http://www.konflikty.pl/rss_militaria_10.xml'),
+ (u'Relacje', u'http://www.konflikty.pl/rss_relacje_10.xml'),
+ (u'Recenzje', u'http://www.konflikty.pl/rss_recenzje_10.xml'),
+ (u'Teksty źródłowe', u'http://www.konflikty.pl/rss_tekstyzrodlowe_10.xml')]
+
+ def preprocess_html(self, soup):
+ for item in soup.findAll(style=True):
+ del item['style']
+ for image in soup.findAll(name='a', attrs={'class':'image'}):
+ if image.img and image.img.has_key('alt'):
+ image.name='div'
+ pos = len(image.contents)
+ image.insert(pos, BeautifulSoup(''+image.img['alt']+'
'))
+ return soup
diff --git a/recipes/liberatorio_politico.recipe b/recipes/liberatorio_politico.recipe
new file mode 100644
index 0000000000..bbffcd89b1
--- /dev/null
+++ b/recipes/liberatorio_politico.recipe
@@ -0,0 +1,12 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class AdvancedUserRecipe1334649829(BasicNewsRecipe):
+ title = u'Liberatorio Politico'
+ oldest_article = 7
+ max_articles_per_feed = 100
+ auto_cleanup = True
+ masthead_url = 'http://liberatorio.altervista.org/wp-content/uploads/2012/01/Testata-LIBERATORIO-Altervista1.jpg'
+ feeds = [(u'Liberatorio Politico', u'http://liberatorio.altervista.org/feed/')]
+ __author__ = 'faber1971'
+ description = 'Inquiry journalism - a blog on Molfetta, Land of Bari, Apulia and Italy - v1.00 (07, April 2012)'
+ language = 'it'
diff --git a/recipes/limes.recipe b/recipes/limes.recipe
new file mode 100644
index 0000000000..2290b7099e
--- /dev/null
+++ b/recipes/limes.recipe
@@ -0,0 +1,50 @@
+#!/usr/bin/env python
+__license__ = 'GPL v3'
+__copyright__ = '2012, faber1971'
+__version__ = 'v1.00'
+__date__ = '16, April 2012'
+__description__ = 'Geopolitical Italian magazine'
+
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class Limes(BasicNewsRecipe):
+ description = 'Italian weekly magazine'
+ __author__ = 'faber1971'
+
+ cover_url = 'http://temi.repubblica.it/UserFiles/limes/Image/Loghi/logo-limes.gif'
+ title = 'Limes'
+ category = 'Geopolitical news'
+
+ language = 'it'
+# encoding = 'cp1252'
+ timefmt = '[%a, %d %b, %Y]'
+
+ oldest_article = 16
+ max_articles_per_feed = 100
+ use_embedded_content = False
+ recursion = 10
+
+ remove_javascript = True
+ no_stylesheets = True
+ masthead_url = 'http://temi.repubblica.it/UserFiles/limes/Image/Loghi/logo-limes.gif'
+
+ feeds = [
+ (u'Limes', u'http://temi.repubblica.it/limes/feed/')
+ ]
+
+
+
+ keep_only_tags = [
+ dict(name='div', attrs={'class':['testo','copertina','occhiello','firma','didascalia','content-second-right','detail-articles','titolo-local','generic-articles']}),
+ dict(name='div', attrs={'class':['generic-articles','summary','detail-articles']}),
+ dict(name='div', attrs={'id':['content-second-right','content2']})
+ ]
+
+ remove_tags = [
+ dict(name='div',attrs={'class':['servizi','aggiungi','label-web','bottom-mobile','box-abbonamenti','box-cerca','big','little','stampaweb']}),
+ dict(name='div',attrs={'id':['topheader','header','navigation-new','navigation','content-second-left','menutext']}),
+ dict(name='ul',attrs={'id':'user-utility'}),
+ dict(name=['script','noscript','iframe'])
+ ]
+
diff --git a/recipes/marketing_magazine.recipe b/recipes/marketing_magazine.recipe
index bdec85a0ce..d004f274af 100644
--- a/recipes/marketing_magazine.recipe
+++ b/recipes/marketing_magazine.recipe
@@ -1,11 +1,13 @@
__license__ = 'GPL v3'
__author__ = 'faber1971'
-description = 'Collection of Italian marketing websites - v1.04 (17, March 2012)'
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1327062445(BasicNewsRecipe):
title = u'Marketing Magazine'
+ description = 'Collection of Italian marketing websites'
+ language = 'it'
+ __author__ = 'faber1971'
oldest_article = 7
max_articles_per_feed = 100
auto_cleanup = True
@@ -16,4 +18,4 @@ class AdvancedUserRecipe1327062445(BasicNewsRecipe):
dict(name='ul', attrs={'id':'ads0'})
]
masthead_url = 'http://www.simrendeogun.com/wp-content/uploads/2011/06/New-Marketing-Magazine-Logo.jpg'
- feeds = [(u'My Marketing', u'http://feed43.com/0537744466058428.xml'), (u'My Marketing_', u'http://feed43.com/8126723074604845.xml'), (u'Venturini', u'http://robertoventurini.blogspot.com/feeds/posts/default?alt=rss'), (u'Ninja Marketing', u'http://feeds.feedburner.com/NinjaMarketing'), (u'Comunitàzione', u'http://www.comunitazione.it/feed/novita.asp'), (u'Brandforum news', u'http://www.brandforum.it/rss/news'), (u'Brandforum papers', u'http://www.brandforum.it/rss/papers'), (u'MarketingArena', u'http://feeds.feedburner.com/marketingarena'), (u'minimarketing', u'http://feeds.feedburner.com/minimarketingit'), (u'Marketing Journal', u'http://feeds.feedburner.com/marketingjournal/jPwA'), (u'Disambiguando', u'http://giovannacosenza.wordpress.com/feed/')]
+ feeds = [(u'My Marketing', u'http://feed43.com/0537744466058428.xml'), (u'My Marketing_', u'http://feed43.com/8126723074604845.xml'), (u'MarketingArena', u'http://feeds.feedburner.com/marketingarena'), (u'Marketing Journal', u'http://feeds.feedburner.com/marketingjournal/jPwA'), (u'Venturini', u'http://robertoventurini.blogspot.com/feeds/posts/default?alt=rss'), (u'Brandforum news', u'http://www.brandforum.it/rss/news'), (u'Brandforum papers', u'http://www.brandforum.it/rss/papers'), (u'minimarketing', u'http://feeds.feedburner.com/minimarketingit'), (u'[4]marketing.biz', u'http://feeds.feedburner.com/4marketing'), (u'Ninja Marketing', u'http://feeds.feedburner.com/NinjaMarketing'), (u'Bloguerrilla', u'http://feeds.feedburner.com/Bloguerrilla'), (u'Nonconvenzionale', u'http://feeds.feedburner.com/nonconvenzionale'), (u'Comunitàzione', u'http://www.comunitazione.it/feed/novita.asp'), (u'Disambiguando', u'http://giovannacosenza.wordpress.com/feed/')]
diff --git a/recipes/metro_news_nl.recipe b/recipes/metro_news_nl.recipe
index ac3e23869b..9191f7caec 100644
--- a/recipes/metro_news_nl.recipe
+++ b/recipes/metro_news_nl.recipe
@@ -3,25 +3,6 @@ from calibre.web.feeds.news import BasicNewsRecipe
import re
from calibre.utils.magick import Image
from BeautifulSoup import BeautifulSoup
-try:
- from calibre_plugins.drMerry.debug import debuglogger as mlog
- print 'drMerry debuglogger found, debug options can be used'
- from calibre_plugins.drMerry.stats import statslogger as mstat
- print 'drMerry stats tracker found, stat can be tracked'
- mlog.setLoglevel(1) #-1 == no log; 0 for normal output
- mstat.calculateStats(False) #track stats (to track stats loglevel must be > 0
- KEEPSTATS = mstat.keepmystats()
- SHOWDEBUG0 = mlog.showdebuglevel(0)
- SHOWDEBUG1 = mlog.showdebuglevel(1)
- SHOWDEBUG2 = mlog.showdebuglevel(2)
-except:
- #print 'drMerry debuglogger not found, skipping debug options'
- SHOWDEBUG0 = False
- SHOWDEBUG1 = False
- SHOWDEBUG2 = False
- KEEPSTATS = False
-
-#print ('level0: %s\nlevel1: %s\nlevel2: %s' % (SHOWDEBUG0,SHOWDEBUG1,SHOWDEBUG2))
''' Version 1.2, updated cover image to match the changed website.
added info date on title
@@ -43,80 +24,75 @@ except:
extended timeout from 2 to 10
changed oldest article from 10 to 1.2
changed max articles from 15 to 25
+ Version 1.9.1 18-04-2012
+ removed some debug settings
+ updated code to match new metro-layout
+ Version 1.9.2 24-04-2012
+ updated code to match new metro-layout
+ Version 1.9.3 25-04-2012
+ Changed a lot of custom code into calibre code as the default code of calibre has become much faster since the first version fo this recipe
+ Added new feeds
+ Updated css
+ Changed order of regex to speedup proces
'''
class AdvancedUserRecipe1306097511(BasicNewsRecipe):
title = u'Metro Nieuws NL'
oldest_article = 1.2
max_articles_per_feed = 25
- __author__ = u'DrMerry'
- description = u'Metro Nederland'
- language = u'nl'
- simultaneous_downloads = 3
+ __author__ = u'DrMerry'
+ description = u'Metro Nederland'
+ language = u'nl'
+ simultaneous_downloads = 5
masthead_url = 'http://blog.metronieuws.nl/wp-content/themes/metro/images/header.gif'
timeout = 10
- center_navbar = True
- timefmt = ' [%A, %d %b %Y]'
+ center_navbar = True
+ timefmt = ' [%A, %d %b %Y]'
no_stylesheets = True
remove_javascript = True
remove_empty_feeds = True
- cover_url = 'http://www.oldreadmetro.com/img/en/metroholland/last/1/small.jpg'
+ cover_url = 'http://www.oldreadmetro.com/img/en/metroholland/last/1/small.jpg'
publication_type = 'newspaper'
- encoding = 'utf-8'
- remove_attributes = ['style', 'font', 'width', 'height']
+ encoding = 'utf-8'
+ remove_attributes = ['style', 'font', 'width', 'height', 'itemtype', 'itemprop', 'itemscope']#, 'href']
use_embedded_content = False
- conversion_options = {
- 'authors' : 'Metro Nederland & calibre & DrMerry',
- 'author_sort' : 'Metro Nederland & calibre & DrMerry',
- 'publisher' : 'DrMerry/Metro Nederland'
- }
- extra_css = 'body {padding:5px 0px; background:#fff;font-size: 13px;}\
- #date, div.share-and-byline div.byline div.text div.title, div.share-and-byline div.byline div.text div.name {clear: both;margin-bottom: 10px;font-size:0.5em; color: #616262;}\
- .article-box-fact.module-title {clear:both;padding: 8px 0;color: #24763b;font-family: arial, sans-serif;font-size: 14px;font-weight: bold;}\
- h1.title {color: #000000;font-size: 44px;padding-bottom: 10px;font-weight: 300;} h2.subtitle {font-size: 13px;font-weight: 700;padding-bottom: 10px;}\
- .article-body p{padding-bottom:10px;}div.column-1-3{margin-left: 19px;padding-right: 9px;}\
- div.column-1-2 {display: inline;padding-right: 7px;}\
- p.article-image-caption {font-size: 12px;font-weight: 300;color: #616262;margin-top: 5px;} \
- p.article-image-caption .credits {font-style: italic;font-size: 10px;}\
- div.article-image-caption {width: 246px;margin-bottom: 5px;margin-left: 10px;}\
- div.article-image-caption-2column {margin-bottom: 10px;width: 373px;} div.article-image-caption-3column {}\
- img {border:0px; padding:2px;} hr.merryhr {width:30%; border-width:0px; color:green; margin-left:5px; background-color: green} div.column-3 {background-color:#eee; width:50%; margin:2px; float:right; padding:2px;} div.column-3 module-title {border: 1px solid #aaa} div.article-box-fact div.subtitle {font-weight:bold; color:green;}'
+ extra_css = 'body{font-size:1em;padding:5px 0}body,a,h2{background-color:#fff;text-decoration:none;color:#000}#date,div.byline,p.article-image-caption .credits,.calibrenavbar{font-size:.5em}.article-box-fact.module-title,#date,div.byline{clear:both}.article-box-fact.module-title{margin:8px 0}.article-box-fact.module-title,h2{font-size:1.1em}h1.title{font-size:1.4em}h1.title,.article-body p,div.article-image-caption-2column,div.article-image-caption-3column,#date,div.byline{margin-bottom:.6em}div.article-box-fact div.subtitle,.article-box-fact.module-title,h1.title,p.article-image-caption{font-weight:700}div.column-1-3{margin-left:19px}div.column-1-2{display:inline}div.column-1-2,div.column-1-3{margin-right:7px}p.article-image-caption{font-size:.6em;margin-top:5px}p.article-image-caption,#date,div.byline{color:#616262}p.article-image-caption .credits{font-style:italic}div.article-image-caption{width:246px}div.article-image-caption-2column{width:373px}div.column-3{background-color:#eee;float:right;width:50%}div.column-3 module-title{border:1px solid #aaa}div.article-box-fact div.subtitle,.article-box-fact.module-title{color:#24763b}div.byline{border-top:2px solid #24763b}div.column-3,img,div.column-3,p.small,div.article-image-caption{margin:.5em}img,p.small,.column1,h2{border:0;padding:0}.column1,h1,h2{margin:0}'
+
preprocess_regexps = [
- (re.compile(r'
]+top-line[^>]+>', re.DOTALL|re.IGNORECASE),
- lambda match: '
'),
- (re.compile(r'(
]+metronieuws\.nl/[^>]+/templates/[^>]+jpe?g[^>]+>|metronieuws\.nl/internal\-roxen\-unit\.gif)', re.DOTALL|re.IGNORECASE),
- lambda match: ''),
+ (re.compile(r'( |\s|
]+metronieuws\.nl/([^>]+/templates/[^>]+\.jpe?g|internal\-roxen\-unit\.gif)[^>]+>)', re.DOTALL|re.IGNORECASE),lambda match: ' '),
+ #(re.compile(r'( |\s)+', re.DOTALL|re.IGNORECASE),lambda match:' '),
+ #(re.compile(r'<(a |/a)[^>]*>', re.DOTALL|re.IGNORECASE),lambda match:'')
+ #(re.compile('(?)h2', re.DOTALL|re.IGNORECASE),lambda match:'\1em')
]
+
+ remove_tags_before= dict(id='date')
+ remove_tags_after = [dict(name='div', attrs={'class':['column-1-3','gallery-text']})]#id='share-and-byline')]
+ remove_tags = [
+ dict(name=['iframe','script','noscript','style']),
+ dict(name='div', attrs={'class':[re.compile('column-[14]-5'),'col-179 ','col-373 ','clear','ad','navigation',re.compile('share-tools(-top)?'),'tools','metroCommentFormWrap','article-tools-below-title','related-links','padding-top-15',re.compile('^promo.*?$'),'teaser-component',re.compile('fb(-comments|_iframe_widget)')]}),
+ dict(id=['column-1-5-bottom','column-4-5',re.compile('^ad(\d+|adcomp.*?)?$'),'sidebar',re.compile('^article-\d'),'comments','gallery-1']),
+ dict(name='a', attrs={'name':'comments'}),
+ #dict(name='div', attrs={'data-href'}),
+ dict(name='img', attrs={'class':'top-line'}),
+ dict(attrs={'style':re.compile('^(.*(display\s?:\s?none|img-mask|white)\s?;?.*)$'),'title':'volledig scherm'})]
+
+ '''removed by before/after:
+ id:
+ column-1-5-top,'hidden_div','footer',
+ class:
+ 'header',re.compile('^footer-[a-zA-Z0-9]+$),'header-links',
+ '''
def preprocess_html(self, soup):
- if SHOWDEBUG0 == True:
- mlog.setdefaults()
- mlog.addTextAndTag(['Show debug = on with level'], [str(mlog.debuglevel)])
- if KEEPSTATS == True:
- mlog.addDebug('Stats will be calculated')
- else:
- mlog.addTextAndTag(['Stats won\'t be calculated\nTo be enabled, stats must be true, currently','and debug level must be 1 or higher, currently'],[mstat.dokeepmystats, mlog.debuglevel])
- mlog.showDebug()
myProcess = MerryProcess()
+ myProcess.moveTitleAndAuthor(soup)
myProcess.removeUnwantedTags(soup)
return soup
def postprocess_html(self, soup, first):
myProcess = MerryProcess()
myProcess.optimizeLayout(soup)
- if SHOWDEBUG0 == True:
- if KEEPSTATS == True:
- statinfo = 'generated stats:'
- statinfo += str(mstat.stats(mstat.statslist))
- print statinfo
- statinfo = 'generated stats (for removed tags):'
- statinfo += str(mstat.stats(mstat.removedtagslist))
- print statinfo
- #show all Debug info we forgot to report
- #Using print to be sure that this text will not be added at the end of the log.
- print '\n!!!!!unreported messages:\n(should be empty)\n'
- mlog.showDebug()
return soup
feeds = [
@@ -128,295 +104,109 @@ class AdvancedUserRecipe1306097511(BasicNewsRecipe):
(u'Buitenland', u'http://www.metronieuws.nl/rss.xml?c=1277377288-4'),
(u'Columns', u'http://www.metronieuws.nl/rss.xml?c=1277377288-17'),
(u'Entertainment', u'http://www.metronieuws.nl/rss.xml?c=1277377288-2'),
- (u'Dot', u'http://www.metronieuws.nl/rss.xml?c=1283166782-12'),
+ (u'Strips',u'http://www.metronieuws.nl/rss.xml?c=1325037714-0'),
+ (u'Tech', u'http://www.metronieuws.nl/rss.xml?c=1283166782-12'),
(u'Familie', u'http://www.metronieuws.nl/rss.xml?c=1283166782-9'),
(u'Blogs', u'http://www.metronieuws.nl/rss.xml?c=1295586825-6'),
(u'Reizen', u'http://www.metronieuws.nl/rss.xml?c=1277377288-13'),
- (u'Carrière', u'http://www.metronieuws.nl/rss.xml?c=1278070988-1'),
+ (u'Carrière', u'http://www.metronieuws.nl/rss.xml?c=1278070988-1'),
+ (u'Wetenschap',u'http://www.metronieuws.nl/rss.xml?c=1303088437-0'),
+ (u'Planeet',u'http://www.metronieuws.nl/rss.xml?c=1277377288-14'),
+ (u'Gezondheid',u'http://www.metronieuws.nl/rss.xml?c=1277377288-15'),
(u'Sport', u'http://www.metronieuws.nl/rss.xml?c=1277377288-12')
]
class MerryPreProcess():
- def replacePictures(self, soup):
- #to be implemented
- return soup
-
def optimizePicture(self,soup):
- if SHOWDEBUG0 == True:
- mlog.addDebug('start image optimize')
for tag in soup.findAll(lambda tag: tag.name.lower()=='img' and tag.has_key('src')):
- iurl = tag['src']
- img = Image()
- img.open(iurl)
- img.trim(0)
- img.save(iurl)
- if SHOWDEBUG0 == True:
- mlog.addDebug('Images optimized')
- mlog.showDebug()
+ try:
+ iurl = tag['src']
+ img = Image()
+ img.open(iurl)
+ img.trim(0)
+ img.save(iurl)
+ except:
+ print '\n!!image optimize failed!!\n'
+ continue
return soup
class MerryExtract():
def safeRemovePart(self, killingSoup, soupIsArray):
if killingSoup and not killingSoup == None:
- if SHOWDEBUG2 == True:
- mlog.addTextAndTag(['items to remove'],[killingSoup])
try:
if soupIsArray == True:
for killer in killingSoup:
killer.extract()
else:
killingSoup.extract()
- if SHOWDEBUG1 == True:
- mlog.addDebug('tag extracted')
- mlog.showDebug()
- if KEEPSTATS == True:
- try:
- mstat.addstat(mstat.removedtagslist,str(killingSoup.name))
- except:
- mstat.addstat(mstat.removedtagslist,'unknown')
except:
- if SHOWDEBUG1 == True:
- mlog.addDebug('tag extraction failed')
- mlog.showDebug()
- if KEEPSTATS == True:
- mstat.addstat(mstat.removedtagslist,'exception')
return False
else:
return False
return killingSoup
-class MerryReplace():
- myKiller = MerryExtract()
- def replaceATag(self, soup):
- anchors = []
- anchors = soup.findAll('a')
- if anchors and not (anchors == None or anchors == []):
- try:
- for link in anchors:
- # print str(link)
- if link and not link == None:
- # print ('type: %s'%(str(type(link))))
- # print ('link: %s' % (link))
- myParent = link.parent
- # print str('parent: %s'%(myParent))
- try:
- myIndex = link.parent.index(link)
- hasIndex = True
- except:
- myIndex = 0
- hasIndex = False
- # print str('index %s'%(myIndex))
- if not link.string == None:
- # print 'link=notnone'
- if hasIndex == True:
- myParent.insert(myIndex, link.string)
- else:
- myParent.append(link.string)
- else:
- # print 'link=none'
- myParent.insert(myIndex, link.contents)
- self.myKiller.safeRemovePart(link, False)
- else:
- notshown = 'tag received is empty' # print
- except:
- notshown = 'tag received is empty' # print
- notshown
- return soup
-
class MerryProcess(BeautifulSoup):
myKiller = MerryExtract()
- myReplacer = MerryReplace()
myPrepare = MerryPreProcess()
def optimizeLayout(self,soup):
self.myPrepare.optimizePicture(soup)
- if SHOWDEBUG0 == True:
- mlog.addDebug('End of Optimize Layout')
- mlog.showDebug()
return soup
def insertFacts(self, soup):
- allfacts = soup.findAll('div', {'class':re.compile('^article-box-fact.*$')})
- if SHOWDEBUG0 == True:
- mlog.addTextAndTag(['allfacts'],[allfacts])
- mlog.showDebug()
+ thefactpart = re.compile('^article-box-fact.*$')
+ allfacts = soup.findAll('div', {'class':thefactpart})
if allfacts and not allfacts == None:
- allfactsparent = soup.find('div', {'class':re.compile('^article-box-fact.*$')}).parent
- if SHOWDEBUG0 == True:
- mlog.addTextAndTag(['allfactsparent'],[allfactsparent])
- mlog.showDebug()
+ allfactsparent = soup.find('div', {'class':thefactpart}).parent
for part in allfactsparent:
if not part in allfacts:
- if SHOWDEBUG0 == True:
- mlog.addTextAndTag(['FOUND A non-fact'],[part])
- mlog.showDebug()
self.myKiller.safeRemovePart(part, True)
- if SHOWDEBUG1 == True:
- mlog.addTextAndTag(['New All Facts'],[allfacts])
- mlog.showDebug()
articlefacts = soup.find('div', {'class':'article-box-fact column'})
- errorOccured=False
if (articlefacts and not articlefacts==None):
try:
contenttag = soup.find('div', {'class':'article-body'})
- if SHOWDEBUG0 == True:
- mlog.addTextAndTag(['curcontag'],[contenttag])
- mlog.showDebug()
foundrighttag = False
if contenttag and not contenttag == None:
foundrighttag = True
- if SHOWDEBUG0 == True:
- if errorOccured == False:
- mlog.addTextAndTag(['type','curcontag (in while)'],[type(contenttag),contenttag])
- else:
- mlog.addDebug('Could not find right parent tag. Error Occured')
- mlog.showDebug()
if foundrighttag == True:
contenttag.insert(0, allfactsparent)
- if SHOWDEBUG2 == True:
- mlog.addTextAndTag(['added parent'],[soup.prettify()])
- mlog.showDebug()
except:
- errorOccured=True
- mlog.addTrace()
- else:
- errorOccured=True
- if SHOWDEBUG0 == True and errorOccured == True:
- mlog.addTextAndTag(['no articlefacts'],[articlefacts])
- mlog.showDebug()
+ pass
+ return soup
+
+ def moveTitleAndAuthor(self, soup):
+ moveitem = soup.h1
+ pubdate = soup.find(id="date")
+ if moveitem and not moveitem == None and pubdate and not pubdate == None:
+ try:
+ pubdate.parent.insert(0, moveitem)
+ except:
+ print '\n!!error in moving title!!\n'
+ pass
+ moveitem = None
+ moveitem = soup.find('div', {'class':'byline'})
+ if moveitem and not moveitem == None:
+ try:
+ moveitem.parent.parent.insert(-1, moveitem)
+ except:
+ print '\n!!error in moving byline!!\n'
+ pass
return soup
-
- def previousNextSibRemover(self, soup, previous=True, soupIsArray=False):
- findsibsof = soup
- firstpart = previous
- if findsibsof and not findsibsof == None:
- if soupIsArray == True:
- for foundsib in findsibsof:
- self.previousNextSibRemover(foundsib, firstpart, soupIsArray=False)
- else:
- if firstpart == True and soupIsArray == False:
- sibs = findsibsof.previousSiblingGenerator()
- else:
- sibs = findsibsof.nextSiblingGenerator()
- for sib in sibs:
- self.myKiller.safeRemovePart(sib, True)
- else:
- if SHOWDEBUG1 == True:
- mlog.addDebug('Not any sib found')
- return
def removeUnwantedTags(self,soup):
- if SHOWDEBUG1 == True:
- mlog.addTextAndTag(['Len of Soup before RemoveTagsByName'],[len(str(soup))])
- mlog.showDebug()
- self.removeTagsByName(soup)
- if SHOWDEBUG1 == True:
- mlog.addDebug('Len of Soup before firstandlastpart: %s' % len(str(soup)))
- mlog.showDebug()
self.insertFacts(soup)
- self.removeFirstAndLastPart(soup)
- if SHOWDEBUG1 == True:
- mlog.addDebug('Len of Soup before unwantedpart: %s' % len(str(soup)))
- mlog.showDebug()
- self.removeUnwantedParts(soup)
- if SHOWDEBUG1 == True:
- mlog.addDebug('Len of Soup before EmptyParts: %s' % len(str(soup)))
- mlog.showDebug()
self.removeEmptyTags(soup)
- if SHOWDEBUG1 == True:
- mlog.addDebug('Len of Soup after EmptyParts: %s' % len(str(soup)))
- mlog.showDebug()
- self.myReplacer.replaceATag(soup)
- return soup
-
- def removeUnwantedParts(self, soup):
- if SHOWDEBUG1 == True:
- mlog.addDebug('Len of Soup before UnwantedID: %s' % len(str(soup)))
- mlog.showDebug()
- self.removeUnwantedTagsByID(soup)
- if SHOWDEBUG1 == True:
- mlog.addDebug('Len of Soup before Class: %s' % len(str(soup)))
- mlog.showDebug()
- self.removeUnwantedTagsByClass(soup)
- if SHOWDEBUG1 == True:
- mlog.addDebug('Len of Soup before Style: %s' % len(str(soup)))
- mlog.showDebug()
- self.removeUnwantedTagsByStyle(soup)
- return soup
-
- def removeUnwantedTagsByStyle(self,soup):
- self.removeArrayOfTags(soup.findAll(attrs={'style' : re.compile("^(.*(display\s?:\s?none|img-mask|white)\s?;?.*)$")}))
- if SHOWDEBUG0 == True:
- mlog.addDebug('end remove by style')
+ self.removeArrayOfTags(soup.findAll(attrs={'class': 'share-tools-bottom'})) # at end to keep author
return soup
def removeArrayOfTags(self,souparray):
return self.myKiller.safeRemovePart(souparray, True)
- def removeUnwantedTagsByClass(self,soup):
- if SHOWDEBUG0 == True:
- mlog.addDebug('start remove by class')
- self.removeArrayOfTags(soup.findAll("div", { "class" :re.compile('^(promo.*?|article-tools-below-title|metroCommentFormWrap|ad|share-tools|tools|header-links|related-links|padding-top-15)$')}))
- return soup
-
- def removeUnwantedTagsByID(self,soup):
- defaultids = ['footer-extra',re.compile('^ad(\d+|adcomp.*?)?$'),'column-4-5','navigation','header',re.compile('^column-1-5-(top|bottom)$'),'footer','hidden_div','sidebar',re.compile('^article-\d$'),'comments','footer']
- for removeid in defaultids:
- if SHOWDEBUG1 == True:
- mlog.addDebug('RemoveTagByID, tag: %s, Len of Soup: %s' % (str(removeid), len(str(soup))))
- mlog.showDebug()
- self.removeArrayOfTags(soup.findAll(id=removeid))
- return soup
-
- # def safeRemoveTag(self, subtree):
- # return self.myKiller.safeRemovePart(subtree, True)
-
-
- def removeTagsByName(self, soup):
- self.myKiller.safeRemovePart(soup.script, True)
- self.myKiller.safeRemovePart(soup.iframe, True)
- self.myKiller.safeRemovePart(soup.style, True)
- self.myKiller.safeRemovePart(soup.noscript, True)
- return soup
-
def removeEmptyTags(self,soup,run=0):
- if SHOWDEBUG0 == True:
- mlog.addDebug('starting removeEmptyTags')
- if SHOWDEBUG1 == True:
- run += 1
- mlog.addDebug(run)
- if SHOWDEBUG2 == True:
- mlog.addDebug(str(soup.prettify()))
- mlog.showDebug()
- emptymatches = re.compile('^( |\s|\n|\r|\t)*$')
+ emptymatches = re.compile('^[ \s\n\r\t ]*$')
emptytags = soup.findAll(lambda tag: tag.find(True) is None and (tag.string is None or tag.string.strip()=="" or tag.string.strip()==emptymatches) and not tag.isSelfClosing)
if emptytags and not (emptytags == None or emptytags == []):
- if SHOWDEBUG1 == True:
- mlog.addDebug('tags found')
- mlog.addDebug(str(emptytags))
self.removeArrayOfTags(emptytags)
#recursive in case removing empty tag creates new empty tag
self.removeEmptyTags(soup, run=run)
- else:
- if SHOWDEBUG1 == True:
- mlog.addDebug('no empty tags found')
- mlog.showDebug()
- if SHOWDEBUG0 == True:
- if SHOWDEBUG2 == True:
- mlog.addDebug('new soup:')
- mlog.addDebug(str(soup.prettify()))
- mlog.addDebug('RemoveEmptyTags Completed')
- mlog.showDebug()
- return soup
-
- def removeFirstAndLastPart(self,soup):
- def findparenttag(lookuptag):
- if lookuptag and not lookuptag == None:
- return lookuptag.findParents()
- findtag = soup.find(id="date")
- self.previousNextSibRemover(findtag, previous=True, soupIsArray=False)
- self.previousNextSibRemover(findparenttag(findtag), previous=True, soupIsArray=True)
- for endtag in [soup.find(id="share-and-byline"), soup.find("div", { "class" : "gallery-text" })]:
- self.previousNextSibRemover(endtag, previous=False, soupIsArray=False)
- self.previousNextSibRemover(findparenttag(endtag), previous=False, soupIsArray=True)
- return soup
+ return soup
\ No newline at end of file
diff --git a/recipes/metro_uk.recipe b/recipes/metro_uk.recipe
index 8dc7008a68..c30f81c019 100644
--- a/recipes/metro_uk.recipe
+++ b/recipes/metro_uk.recipe
@@ -1,52 +1,30 @@
-import re
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1306097511(BasicNewsRecipe):
title = u'Metro UK'
description = 'News as provide by The Metro -UK'
-
+ #timefmt = ''
__author__ = 'Dave Asbury'
- #last update 3/12/11
cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/276636_117118184990145_2132092232_n.jpg'
- no_stylesheets = True
+ #no_stylesheets = True
oldest_article = 1
- max_articles_per_feed = 20
+ max_articles_per_feed = 10
remove_empty_feeds = True
remove_javascript = True
+ auto_cleanup = True
- #preprocess_regexps = [(re.compile(r'Tweet'), lambda a : '')]
- preprocess_regexps = [
- (re.compile(r'', re.IGNORECASE | re.DOTALL), lambda match: ' ')]
- preprocess_regexps = [
- (re.compile(r'tweet', re.IGNORECASE | re.DOTALL), lambda match: '')]
language = 'en_GB'
-
-
masthead_url = 'http://e-edition.metro.co.uk/images/metro_logo.gif'
-
-
keep_only_tags = [
- dict(name='h1'),dict(name='h2', attrs={'class':'h2'}),
- dict(attrs={'class':['img-cnt figure']}),
- dict(attrs={'class':['art-img']}),
- dict(name='div', attrs={'class':'art-lft'}),
- dict(name='p')
+
]
remove_tags = [
- dict(name = 'div',attrs={'id' : ['comments-news','formSubmission']}),
- dict(name='div', attrs={'class':[ 'news m12 clrd clr-b p5t shareBtm', 'commentForm', 'metroCommentInnerWrap',
- 'art-rgt','pluck-app pluck-comm','news m12 clrd clr-l p5t', 'flt-r','username','clrd' ]}),
- dict(attrs={'class':['username', 'metroCommentFormWrap','commentText','commentsNav','avatar','submDateAndTime','addYourComment','displayName']})
- ,dict(name='div', attrs={'class' : 'clrd art-fd fd-gr1-b'})
+
]
+
+
feeds = [
(u'News', u'http://www.metro.co.uk/rss/news/'), (u'Money', u'http://www.metro.co.uk/rss/money/'), (u'Sport', u'http://www.metro.co.uk/rss/sport/'), (u'Film', u'http://www.metro.co.uk/rss/metrolife/film/'), (u'Music', u'http://www.metro.co.uk/rss/metrolife/music/'), (u'TV', u'http://www.metro.co.uk/rss/tv/'), (u'Showbiz', u'http://www.metro.co.uk/rss/showbiz/'), (u'Weird News', u'http://www.metro.co.uk/rss/weird/'), (u'Travel', u'http://www.metro.co.uk/rss/travel/'), (u'Lifestyle', u'http://www.metro.co.uk/rss/lifestyle/'), (u'Books', u'http://www.metro.co.uk/rss/lifestyle/books/'), (u'Food', u'http://www.metro.co.uk/rss/lifestyle/restaurants/')]
-
extra_css = '''
- body {font: sans-serif medium;}'
- h1 {text-align : center; font-family:Arial,Helvetica,sans-serif; font-size:20px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold;}
- h2 {text-align : center;color:#4D4D4D;font-family:Arial,Helvetica,sans-serif; font-size:15px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold; }
- span{ font-size:9.5px; font-weight:bold;font-style:italic}
- p { text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;}
-
- '''
+ body{ text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;}
+ '''
diff --git a/recipes/national_geographic_pl.recipe b/recipes/national_geographic_pl.recipe
index a2f759e878..07fc0da666 100644
--- a/recipes/national_geographic_pl.recipe
+++ b/recipes/national_geographic_pl.recipe
@@ -9,8 +9,9 @@ from calibre.web.feeds.recipes import BasicNewsRecipe
class recipeMagic(BasicNewsRecipe):
title = 'National Geographic PL'
__author__ = 'Marcin Urban 2011'
+ __modified_by__ = 'fenuks'
description = 'legenda wśród magazynów z historią sięgającą 120 lat'
- cover_url = 'http://www.guj.pl/var/guj/storage/images/media/nasze_magazyny/national_geographic/logo/ng_logo/2606-1-pol-PL/ng_logo.jpg'
+ #cover_url = 'http://www.guj.pl/var/guj/storage/images/media/nasze_magazyny/national_geographic/logo/ng_logo/2606-1-pol-PL/ng_logo.jpg'
oldest_article = 7
max_articles_per_feed = 100
no_stylesheets = True
@@ -42,11 +43,43 @@ class recipeMagic(BasicNewsRecipe):
]
remove_attributes = ['width','height']
+ feeds=[]
- feeds = [
- ('National Geographic PL', 'http://www.national-geographic.pl/rss/'),
- ]
+ def find_articles(self, url):
+ articles = []
+ soup=self.index_to_soup(url)
+ tag=soup.find(attrs={'class':'arl'})
+ art=tag.ul.findAll('li')
+ for i in art:
+ title=i.a['title']
+ url=i.a['href']
+ #date=soup.find(id='footer').ul.li.string[41:-1]
+ desc=i.div.p.string
+ articles.append({'title' : title,
+ 'url' : url,
+ 'date' : '',
+ 'description' : desc
+ })
+ return articles
+
+ def parse_index(self):
+ feeds = []
+ feeds.append((u"Aktualności", self.find_articles('http://www.national-geographic.pl/aktualnosci/')))
+ feeds.append((u"Artykuły", self.find_articles('http://www.national-geographic.pl/artykuly/')))
+
+ return feeds
def print_version(self, url):
- return url.replace('artykuly0Cpokaz', 'drukuj-artykul')
+ if 'artykuly' in url:
+ return url.replace('artykuly/pokaz', 'drukuj-artykul')
+ elif 'aktualnosci' in url:
+ return url.replace('aktualnosci/pokaz', 'drukuj-artykul')
+ else:
+ return url
+
+ def get_cover_url(self):
+ soup = self.index_to_soup('http://www.national-geographic.pl/biezace-wydania/')
+ tag=soup.find(attrs={'class':'txt jus'})
+ self.cover_url=tag.img['src']
+ return getattr(self, 'cover_url', self.cover_url)
diff --git a/recipes/non_leggerlo.recipe b/recipes/non_leggerlo.recipe
new file mode 100644
index 0000000000..90bb76c0ef
--- /dev/null
+++ b/recipes/non_leggerlo.recipe
@@ -0,0 +1,16 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class AdvancedUserRecipe1335362999(BasicNewsRecipe):
+ title = u'Non leggerlo'
+ oldest_article = 7
+ max_articles_per_feed = 100
+ auto_cleanup = False
+ keep_only_tags = [
+ dict(name='div', attrs={'class':'post hentry'})
+ ]
+ feeds = [(u'Non leggerlo', u'http://nonleggerlo.blogspot.com/feeds/posts/default')]
+ description = 'An Italian satirical blog'
+ language = 'it'
+ __author__ = 'faber1971'
+__version__ = 'v1.0'
+__date__ = '24, April 2012'
diff --git a/recipes/nowa_fantastyka.recipe b/recipes/nowa_fantastyka.recipe
index ec556da5fa..0371cb1f58 100644
--- a/recipes/nowa_fantastyka.recipe
+++ b/recipes/nowa_fantastyka.recipe
@@ -81,5 +81,7 @@ class Nowa_Fantastyka(BasicNewsRecipe):
title=soup.find(attrs={'class':'tytul'})
if title:
title['style']='font-size: 20px; font-weight: bold;'
- self.log.warn(soup)
+ for a in soup('a'):
+ if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
+ a['href']=self.INDEX + a['href']
return soup
diff --git a/recipes/orlando_sentinel.recipe b/recipes/orlando_sentinel.recipe
index 7a59f6f6ba..b327bc2b74 100644
--- a/recipes/orlando_sentinel.recipe
+++ b/recipes/orlando_sentinel.recipe
@@ -1,3 +1,4 @@
+import urllib, re
from calibre.web.feeds.news import BasicNewsRecipe
class AdvancedUserRecipe1279258912(BasicNewsRecipe):
@@ -27,12 +28,30 @@ class AdvancedUserRecipe1279258912(BasicNewsRecipe):
encoding = 'utf-8'
conversion_options = {'linearize_tables':True}
masthead_url = 'http://www.orlandosentinel.com/media/graphic/2009-07/46844851.gif'
- keep_only_tags = [
- dict(name='div', attrs={'class':'story'})
- ]
- remove_tags = [
- dict(name='div', attrs={'class':['articlerail','tools','comment-group','clearfix']}),
- ]
- remove_tags_after = [
- dict(name='p', attrs={'class':'copyright'}),
- ]
+
+ auto_cleanup = True
+
+ def get_article_url(self, article):
+ ans = None
+ try:
+ s = article.summary
+ ans = urllib.unquote(
+ re.search(r'href=".+?bookmark.cfm.+?link=(.+?)"', s).group(1))
+ except:
+ pass
+ if ans is None:
+ link = article.get('feedburner_origlink', None)
+ if link and link.split('/')[-1]=="story01.htm":
+ link=link.split('/')[-2]
+ encoding = {'0B': '.', '0C': '/', '0A': '0', '0F': '=', '0G': '&',
+ '0D': '?', '0E': '-', '0N': '.com', '0L': 'http:',
+ '0S':'//'}
+ for k, v in encoding.iteritems():
+ link = link.replace(k, v)
+ ans = link
+ elif link:
+ ans = link
+ if ans is not None:
+ return ans.replace('?track=rss', '')
+
+
diff --git a/recipes/pc_arena.recipe b/recipes/pc_arena.recipe
index 952db30c3e..56bb601f70 100644
--- a/recipes/pc_arena.recipe
+++ b/recipes/pc_arena.recipe
@@ -7,6 +7,7 @@ class PC_Arena(BasicNewsRecipe):
description = u'Najnowsze informacje z branży IT - testy, recenzje, aktualności, rankingi, wywiady. Twoje źródło informacji o sprzęcie komputerowym.'
category = 'IT'
language = 'pl'
+ index='http://pcarena.pl'
masthead_url='http://pcarena.pl/pcarena/img/logo.png'
cover_url= 'http://pcarena.pl/pcarena/img/logo.png'
no_stylesheets = True
@@ -22,4 +23,10 @@ class PC_Arena(BasicNewsRecipe):
if 'http' not in url:
return 'http://pcarena.pl' + url
else:
- return url
\ No newline at end of file
+ return url
+
+ def preprocess_html(self, soup):
+ for a in soup('a'):
+ if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
+ a['href']=self.index + a['href']
+ return soup
\ No newline at end of file
diff --git a/recipes/readitlater.recipe b/recipes/readitlater.recipe
index 38f7ec1a9a..92c9aaf9d6 100644
--- a/recipes/readitlater.recipe
+++ b/recipes/readitlater.recipe
@@ -1,5 +1,5 @@
"""
-readitlaterlist.com
+Pocket Calibre Recipe v1.0
"""
__license__ = 'GPL v3'
__copyright__ = '''
@@ -12,22 +12,23 @@ from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
-class Readitlater(BasicNewsRecipe):
- title = 'ReadItLater'
+class Pocket(BasicNewsRecipe):
+ title = 'Pocket'
__author__ = 'Darko Miletic, Przemyslaw Kryger, Keith Callenberg, tBunnyMan'
- description = '''Personalized news feeds. Go to readitlaterlist.com to setup \
- up your news. This version displays pages of articles from \
+ description = '''Personalized news feeds. Go to getpocket.com to set up \
+ your news. This version displays pages of articles from \
oldest to newest, with max & minimum counts, and marks articles \
read after downloading.'''
- publisher = 'readitlaterlist.com'
+ publisher = 'getpocket.com'
category = 'news, custom'
oldest_article = 7
max_articles_per_feed = 50
- minimum_articles = 1
+ minimum_articles = 10
+ mark_as_read_after_dl = True
no_stylesheets = True
use_embedded_content = False
needs_subscription = True
- INDEX = u'http://readitlaterlist.com'
+ INDEX = u'http://getpocket.com'
LOGIN = INDEX + u'/l'
readList = []
@@ -100,9 +101,31 @@ class Readitlater(BasicNewsRecipe):
br = self.get_browser()
for link in markList:
url = self.INDEX + link
+ print 'Marking read: ', url
response = br.open(url)
- response
+ print response.info()
def cleanup(self):
- self.mark_as_read(self.readList)
+ if self.mark_as_read_after_dl:
+ self.mark_as_read(self.readList)
+ else:
+ pass
+ def default_cover(self, cover_file):
+ '''
+ Create a generic cover for recipes that don't have a cover
+ This override adds time to the cover
+ '''
+ try:
+ from calibre.ebooks import calibre_cover
+ title = self.title if isinstance(self.title, unicode) else \
+ self.title.decode('utf-8', 'replace')
+ date = strftime(self.timefmt)
+ time = strftime('[%I:%M %p]')
+ img_data = calibre_cover(title, date, time)
+ cover_file.write(img_data)
+ cover_file.flush()
+ except:
+ self.log.exception('Failed to generate default cover')
+ return False
+ return True
diff --git a/recipes/richmond_times_dispatch.recipe b/recipes/richmond_times_dispatch.recipe
new file mode 100644
index 0000000000..163a6317ff
--- /dev/null
+++ b/recipes/richmond_times_dispatch.recipe
@@ -0,0 +1,59 @@
+from calibre.web.feeds.recipes import BasicNewsRecipe
+
+class AdvancedUserRecipe1335532466(BasicNewsRecipe):
+ title = u'Richmond Times-Dispatch'
+ description = 'News from Richmond, Virginia, USA'
+ __author__ = 'jde'
+ cover_url = 'http://static2.dukecms.com/va_tn/timesdispatch_com/site-media/img/icons/logo252x97.png'
+ language = 'en'
+ encoding = 'utf8'
+ oldest_article = 1 #days
+ max_articles_per_feed = 25
+ needs_subscription = False
+ remove_javascript = True
+ recursions = 0
+ use_embedded_content = False
+ no_stylesheets = True
+ auto_cleanup = True
+
+ feeds = [
+
+('News',
+'http://www2.timesdispatch.com/list/feed/rss/news-archive'),
+('Breaking News',
+'http://www2.timesdispatch.com/list/feed/rss/breaking-news'),
+('National News',
+'http://www2.timesdispatch.com/list/feed/rss/national-news'),
+('Local News',
+'http://www2.timesdispatch.com/list/feed/rss/local-news'),
+('Business',
+'http://www2.timesdispatch.com/list/feed/rss/business'),
+('Local Business',
+'http://www2.timesdispatch.com/list/feed/rss/local-business'),
+('Politics',
+'http://www2.timesdispatch.com/list/feed/rss/politics'),
+('Virginia Politics',
+'http://www2.timesdispatch.com/list/feed/rss/virginia-politics'),
+('Editorials',
+'http://www2.timesdispatch.com/list/feed/rss/editorial-desk'),
+('Columnists and Blogs',
+'http://www2.timesdispatch.com/list/feed/rss/news-columnists-blogs'),
+('Opinion Columnists',
+'http://www2.timesdispatch.com/list/feed/rss/opinion-editorial-columnists'),
+('Letters to the Editor',
+'http://www2.timesdispatch.com/list/feed/rss/opinion-letters'),
+('Traffic',
+'http://www2.timesdispatch.com/list/feed/rss/traffic'),
+('Sports',
+'http://www2.timesdispatch.com/list/feed/rss/sports2'),
+('Entertainment/Life',
+'http://www2.timesdispatch.com/list/feed/rss/entertainment'),
+('Movies',
+'http://www2.timesdispatch.com/list/feed/rss/movies'),
+('Music',
+'http://www2.timesdispatch.com/list/feed/rss/music'),
+('Dining & Food',
+'http://www2.timesdispatch.com/list/feed/rss/dining'),
+
+ ]
+
diff --git a/recipes/sol_haber.recipe b/recipes/sol_haber.recipe
new file mode 100644
index 0000000000..29db88019c
--- /dev/null
+++ b/recipes/sol_haber.recipe
@@ -0,0 +1,141 @@
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+from __future__ import unicode_literals
+
+__license__ = 'GPL v3'
+__copyright__ = '2012, Onur Gungor onurgu@gmail.com'
+__docformat__ = 'restructuredtext en'
+
+'''
+www.sol.org.tr
+'''
+
+import datetime
+
+import re
+
+from calibre.web.feeds.recipes import BasicNewsRecipe
+
+class SolHaberRecipe(BasicNewsRecipe):
+ title = u'soL Haber'
+ oldest_article = 7
+ max_articles_per_feed = 100
+
+ language = 'tr'
+ __author__ = 'Onur Güngör'
+ description = 'Hayata soL\'dan bakın..'
+ publisher = 'soL Haber'
+ tags = 'news, haberler, siyaset, türkiye, turkey, politics'
+
+
+ conversion_options = {
+ 'comment' : description
+ , 'tags' : tags
+ , 'publisher' : publisher
+ , 'language' : language
+ }
+
+ category_dict = { 'sonuncu-kavga':'Sonuncu Kavga',
+ 'devlet-ve-siyaset':'Devlet ve Siyaset',
+ 'ekonomi':'Ekonomi',
+ 'enternasyonal-gundem':'Enternasyonel Gündem',
+ 'kent-gundemleri':'Kent Gündemleri',
+ 'kultur-sanat':'Kültür Sanat',
+ 'dunyadan':'Dünyadan',
+ 'serbest-kursu':'Serbest Kürsü',
+ 'medya':'Medya',
+ 'liseliler':'Liseliler',
+ 'yazarlar':'Köşe Yazıları'}
+
+ end_date = datetime.date.today().isoformat()
+ start_date = (datetime.date.today()-datetime.timedelta(days=1)).isoformat()
+
+
+ section_tuples = [['Köşe Yazıları', 'http://haber.sol.org.tr/arsiv?icerik=kose_yazisi&tarih%%5Bmin%%5D%%5Bdate%%5D=%s&tarih%%5Bmax%%5D%%5Bdate%%5D=%s' % (start_date, end_date)],
+ ['Haberler', 'http://haber.sol.org.tr/arsiv?icerik=haber&tarih%%5Bmin%%5D%%5Bdate%%5D=%s&tarih%%5Bmax%%5D%%5Bdate%%5D=%s' % (start_date, end_date)],
+ ['soL postal', 'http://haber.sol.org.tr/arsiv?icerik=postal&tarih%%5Bmin%%5D%%5Bdate%%5D=%s&tarih%%5Bmax%%5D%%5Bdate%%5D=%s' % (start_date, end_date)],
+ ['Bizim Amerika', 'http://haber.sol.org.tr/arsiv?icerik=bizim_amerika&tarih%%5Bmin%%5D%%5Bdate%%5D=%s&tarih%%5Bmax%%5D%%5Bdate%%5D=%s' % (start_date, end_date)]]
+
+
+ # Disable stylesheets from site.
+ no_stylesheets = True
+
+ cover_margins = (20, 20, '#ffffff')
+
+ storybody_reg_exp = '^\s*(haber|kose)\s*$'
+
+ comments_reg_exp = '^\s*makale-elestiri\s*$'
+
+ remove_tags = [dict(name='div', attrs={'class':re.compile(comments_reg_exp, re.IGNORECASE)})]
+
+ keep_only_tags = [dict(name='div', attrs={'class':re.compile(storybody_reg_exp, re.IGNORECASE)})]
+
+ def get_masthead_title(self):
+ return self.title + "(" + self.end_date + ")"
+
+ def parse_index(self):
+
+ result = []
+ articles_dict = dict()
+
+ author_regexp = re.compile('^http://.*?/yazarlar/(.*?)/.*$')
+ category_regexp = re.compile('^http://.*?/(.+?)/.*$')
+
+ for section_tuple in self.section_tuples:
+
+ section_title = section_tuple[0]
+ section_index_url = section_tuple[1]
+
+ self.log('Bölüm:', section_title, 'URL:', section_index_url)
+
+ soup = self.index_to_soup(section_index_url)
+
+ logo = soup.find('div', id='logo').find('img', src=True)
+ if logo is not None:
+ self.cover_url = logo['src']
+ if self.cover_url.startswith('/'):
+ self.cover_url = 'http://haber.sol.org.tr'+self.cover_url
+
+ view_content = soup.find('div', id='ana-icerik').find('div', attrs={'class':'view-content'})
+ if view_content == None:
+ break
+ rows = view_content.find('tbody').findAll('tr')
+
+ self.log('Row sayısı', len(rows))
+ for row in rows:
+ cells = row.findAll('td')
+
+ a = cells[1].find('a', href=True)
+
+ url = a['href']
+ title = self.tag_to_string(a)
+
+ if url.startswith('/'):
+ url = 'http://haber.sol.org.tr'+url
+
+ category = section_title
+ category_match_result = category_regexp.match(url)
+ if category_match_result:
+ category = category_match_result.group(1)
+
+ date = self.tag_to_string(cells[2])
+
+ author = 'soL haber'
+
+ author_match_result = author_regexp.match(url)
+ if author_match_result:
+ author = author_match_result.group(1)
+
+ self.log('\tFound article:', title, 'at', url, 'published at ', date, 'by', author)
+ article = {'title':title, 'url':url, 'description':None, 'date':date, 'author':author}
+ if category in articles_dict:
+ articles_dict[category].append(article)
+ else:
+ articles_dict[category] = [article]
+
+ for category in articles_dict.keys():
+ if category in self.category_dict:
+ result.append((self.category_dict[category], articles_dict[category]))
+ else:
+ result.append((category, articles_dict[category]))
+
+ return result
diff --git a/recipes/swiat_obrazu.recipe b/recipes/swiat_obrazu.recipe
new file mode 100644
index 0000000000..68740fa4dd
--- /dev/null
+++ b/recipes/swiat_obrazu.recipe
@@ -0,0 +1,25 @@
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class Swiat_Obrazu(BasicNewsRecipe):
+ title = u'Swiat Obrazu'
+ __author__ = 'fenuks'
+ description = u'Internetowy Dziennik o Fotografii i Wideo www.SwiatObrazu.pl to źródło informacji o technice fotografii i wideo, o sprzęcie najbardziej znanych i uznanych firm: Canon, Nikon, Sony, Hasselblad i wielu innych. Znajdziecie tu programy do obróbki zdjęć, forum foto i forum wideo i galerie zdjęć. Codziennie najświeższe informacje: aktualności, testy, poradniki, wywiady, felietony. Swiatobrazu.pl stale organizuje konkursy oraz warsztaty fotograficzne i wideo.'
+ category = 'photography'
+ masthead_url = 'http://www.swiatobrazu.pl/img/logo.jpg'
+ cover_url = 'http://www.swiatobrazu.pl/img/logo.jpg'
+ language = 'pl'
+ oldest_article = 7
+ max_articles_per_feed = 100
+ no_stylesheets = True
+ remove_javascript= True
+ use_embedded_content = False
+ feeds = [(u'Wszystko', u'http://www.swiatobrazu.pl/rss')]
+
+ def print_version(self, url):
+ return url + ',drukuj'
+
+ def image_url_processor(self, baseurl, url):
+ if 'http://' not in url or 'https://' not in url:
+ return 'http://www.swiatobrazu.pl' + url[5:]
+ else:
+ return url
diff --git a/recipes/tagesspiegel.recipe b/recipes/tagesspiegel.recipe
index 92d88d56ae..71191065f1 100644
--- a/recipes/tagesspiegel.recipe
+++ b/recipes/tagesspiegel.recipe
@@ -34,7 +34,7 @@ class TagesspiegelRSS(BasicNewsRecipe):
no_javascript = True
remove_empty_feeds = True
encoding = 'utf-8'
- remove_tags = [{'class':'hcf-header'}, {'class':'hcf-atlas'}, {'class':'hcf-date hcf-separate'}]
+ remove_tags = [{'class':'hcf-header'}, {'class':'hcf-atlas'}, {'class':'hcf-colon'}, {'class':'hcf-date hcf-separate'}]
def print_version(self, url):
url = url.split('/')
@@ -51,6 +51,7 @@ class TagesspiegelRSS(BasicNewsRecipe):
return ''.join(div.findAll(text=True, recursive=False)).strip() if div is not None else None
articles = {}
+ links = set()
key = None
ans = []
maincol = soup.find('div', attrs={'class':re.compile('hcf-main-col')})
@@ -59,7 +60,7 @@ class TagesspiegelRSS(BasicNewsRecipe):
if div['class'] == 'hcf-header':
try:
- key = string.capwords(feed_title(div.em.a))
+ key = string.capwords(feed_title(div.em))
articles[key] = []
ans.append(key)
except:
@@ -70,6 +71,12 @@ class TagesspiegelRSS(BasicNewsRecipe):
if not a:
continue
url = 'http://www.tagesspiegel.de' + a['href']
+
+ # check for duplicates
+ if url in links:
+ continue
+ links.add(url)
+
title = self.tag_to_string(a, use_alt=True).strip()
description = ''
pubdate = strftime('%a, %d %b')
diff --git a/recipes/tanuki.recipe b/recipes/tanuki.recipe
index 666cb8aa77..a615763307 100644
--- a/recipes/tanuki.recipe
+++ b/recipes/tanuki.recipe
@@ -34,4 +34,12 @@ class tanuki(BasicNewsRecipe):
def preprocess_html(self, soup):
self.append_page(soup, soup.body)
+ for a in soup('a'):
+ if a.has_key('href') and 'http://' not in a['href'] and 'https://' not in a['href']:
+ if 'tanuki-anime' in soup.title.string.lower():
+ a['href']='http://anime.tanuki.pl' + a['href']
+ elif 'tanuki-manga' in soup.title.string.lower():
+ a['href']='http://manga.tanuki.pl' + a['href']
+ elif 'tanuki-czytelnia' in soup.title.string.lower():
+ a['href']='http://czytelnia.tanuki.pl' + a['href']
return soup
\ No newline at end of file
diff --git a/recipes/telam.recipe b/recipes/telam.recipe
new file mode 100644
index 0000000000..c2dbfee1d7
--- /dev/null
+++ b/recipes/telam.recipe
@@ -0,0 +1,62 @@
+__license__ = 'GPL v3'
+__copyright__ = '2012, Darko Miletic '
+'''
+www.telam.com.ar
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class Telam(BasicNewsRecipe):
+ title = 'Telam'
+ __author__ = 'Darko Miletic'
+ description = 'AGENCIA DE NOTICIAS DE LA REPUBLICA ARGENTINA'
+ publisher = 'Telam S.E.'
+ category = 'news, politics, Argentina'
+ oldest_article = 2
+ max_articles_per_feed = 200
+ no_stylesheets = True
+ encoding = 'utf8'
+ use_embedded_content = False
+ language = 'es_AR'
+ remove_empty_feeds = True
+ publication_type = 'newsportal'
+ masthead_url = 'http://www.telam.com.ar/front/imagenes/encabezado/logotelam.jpg'
+ extra_css = """
+ body{font-family: Arial,Helvetica,sans-serif }
+ img{margin-bottom: 0.4em; display:block}
+ """
+
+ conversion_options = {
+ 'comment' : description
+ , 'tags' : category
+ , 'publisher' : publisher
+ , 'language' : language
+ }
+
+ remove_tags = [dict(name=['meta','link'])]
+ remove_tags_before = dict(attrs={'class':'nota_fecha'})
+ remove_tags_after = dict(attrs={'class':'nota_completa'})
+ remove_attributes = ['lang']
+
+
+ feeds = [
+ (u'Ultimas noticias', u'http://www.telam.com.ar/xml/rss/' )
+ ,(u'Politica' , u'http://www.telam.com.ar/xml/rss/1')
+ ,(u'Economia' , u'http://www.telam.com.ar/xml/rss/2')
+ ,(u'Sociedad' , u'http://www.telam.com.ar/xml/rss/3')
+ ,(u'Policiales' , u'http://www.telam.com.ar/xml/rss/4')
+ ,(u'Internacionales' , u'http://www.telam.com.ar/xml/rss/6')
+ ,(u'Espectaculos' , u'http://www.telam.com.ar/xml/rss/7')
+ ,(u'Cultura' , u'http://www.telam.com.ar/xml/rss/8')
+ ,(u'Deportes' , u'http://www.telam.com.ar/xml/rss/9')
+ ,(u'Telam Investiga' , u'http://www.telam.com.ar/xml/rss/5')
+ ]
+
+ def print_version(self, url):
+ artid = url.rpartition('/')[2]
+ return 'http://www.telam.com.ar/?codProg=imprimir-nota&id=' + artid
+
+ def preprocess_html(self, soup):
+ for item in soup.findAll(style=True):
+ del item['style']
+ return soup
diff --git a/recipes/the_sun.recipe b/recipes/the_sun.recipe
index 9285c0b2c2..db74e003a0 100644
--- a/recipes/the_sun.recipe
+++ b/recipes/the_sun.recipe
@@ -1,9 +1,8 @@
-import re
+import re, mechanize
from calibre.web.feeds.recipes import BasicNewsRecipe
class AdvancedUserRecipe1325006965(BasicNewsRecipe):
title = u'The Sun UK'
- cover_url = 'http://www.thesun.co.uk/img/global/new-masthead-logo.png'
description = 'A Recipe for The Sun tabloid UK'
__author__ = 'Dave Asbury'
@@ -24,37 +23,69 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe):
no_stylesheets = True
extra_css = '''
- body{ text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;}
- '''
+ body{ text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;}
+ '''
preprocess_regexps = [
- (re.compile(r'