diff --git a/.bzrignore b/.bzrignore
index 4dc2b4b9d4..88fc9188fc 100644
--- a/.bzrignore
+++ b/.bzrignore
@@ -2,6 +2,7 @@
.check-cache.pickle
src/calibre/plugins
resources/images.qrc
+src/calibre/ebooks/oeb/display/test/*.js
src/calibre/manual/.build/
src/calibre/manual/cli/
src/calibre/manual/template_ref.rst
@@ -15,6 +16,7 @@ resources/ebook-convert-complete.pickle
resources/builtin_recipes.xml
resources/builtin_recipes.zip
resources/template-functions.json
+resources/display/*.js
setup/installer/windows/calibre/build.log
src/calibre/translations/.errors
src/cssutils/.svn/
diff --git a/Changelog.yaml b/Changelog.yaml
index 7f43887264..c2b9843036 100644
--- a/Changelog.yaml
+++ b/Changelog.yaml
@@ -19,6 +19,125 @@
# new recipes:
# - title:
+- version: 0.8.31
+ date: 2011-12-16
+
+ new features:
+ - title: "Conversion engine: When parsing invalid XHTML use the HTML 5 algorithm, for greater robustness."
+ tickets: [901466]
+
+ - title: "Driver for PocketBook 611 and Lenovo IdeaPad"
+
+ - title: "Allow customization of the order in which custom column editing is performed in the edit metadata dialog. Setting is available via Preferences->Tweaks."
+ tickets: [902731]
+
+ - title: "MOBI news download: Allow recipes to set a thumbnail for entries in the periodical table of contents. Currently used by the NYTimes, WSJ, Independent, Guardian and Globe and Mail recipes"
+ tickets: [900130]
+
+ - title: "E-book viewer: Add an option to the right click menu to search for the currently selected word"
+
+ - title: "Automatically hide the no internet connection available error message if the connection is restored before the user clicks OK"
+
+ bug fixes:
+ - title: "Fix comments not hidden in Book details panel when they are turned off via Preferences->Look & Feel->Book Details"
+
+ - title: "E-book viewer: Do not pop up an error message if the user tries to use the mouse wheel to scroll before a document is loaded."
+ tickets: [903449]
+
+ - title: "Add docx to the list of ebook extensions."
+ tickets: [903452]
+
+ - title: "When downloading metadata from non-English Amazon websites, do not correct the case of book titles."
+
+ - title: "Fix regression in 0.8.30 that broke bulk conversion of a single book."
+ tickets: [902506]
+
+ - title: "When minimized to system tray do not display the no internet connection error as a dialog box, instead use a system tray notification"
+
+ - title: "Catalog generation: Include the series_index field for custom series columns as well"
+
+ - title: "Comic Input: Do not rescale images when using the Tablet output profile (or any output profile with a screen size larger than 3000x3000)"
+
+ - title: "HTML Input: Ignore unparseable URLs instead of crashing on them."
+ tickets: [902372]
+
+
+ improved recipes:
+ - La Republica
+ - CND
+ - Berliner Zeitung
+ - Zaman Gazetesi
+
+ new recipes:
+ - title: CND Weekly
+ author: Derek Liang
+
+ - title: descopera.org
+ author: Marius Ignatescu
+
+ - title: Rynek Zdrowia
+ author: spi630
+
+- version: 0.8.30
+ date: 2011-12-09
+
+ new features:
+ - title: "Get Books: Add amazon.es and amazon.it"
+
+ - title: "Bulk convert dialog: Disable the Use saved conversion settings checkbox when none of the books being converted has saved conversion settings"
+
+ - title: "ebook-viewer: Add a command line switch to specify the position at which the file should be opened."
+ tickets: [899325]
+
+ - title: "Distribute calibre source code compressed with xz instead of gzip for a 40% reduction in size"
+
+ bug fixes:
+ - title: "Get Books: Fix ebooks.com and amazon.fr. Fix cover display in Diesel ebooks store."
+
+ - title: "HTML Input: Fix regression that broke processing of a small fraction of HTML files encoded in a multi-byte character encoding."
+ tickets: [899691]
+
+ - title: "Greatly reduce the delay at the end of a bulk metadata edit operation that operates on a very large number (thousands) of books"
+
+ - title: "Template language: Fix the subitems formatter function to split only when the period is surrounded by non-white space and not another period"
+
+ - title: "Fix ampersands in titles not displaying in the Cover Browser"
+
+ - title: "MOBI Output: Do not ignore an empty anchor at the end of a block element."
+
+ - title: "MOBI Output: Handle links to inline anchors placed inside large blocks of text correctly, i.e. the link should not point to the start of the block."
+ tickets: [899831]
+
+ - title: "E-book viewer: Fix searching for text that is represented as entities in the underlying HTML."
+ tickets: [899573]
+
+ - title: "Have the Esc shortcut perform exactly the same set of actions as clicking the clear button."
+ tickets: [900048]
+
+ - title: "Prevent the adding books dialog from becoming too wide"
+
+ - title: "Fix custom column editing not behaving correctly with the Previous button in the edit metadata dialog."
+ tickets: [899836]
+
+ - title: "T1 driver. More fixes to datetime handling to try to convince the T1's buggy firmware to not rescan metadata."
+ tickets: [899514]
+
+ - title: "Only allow searching via non accented author names if the user interface language in calibre is set to English."
+ tickets: [899227]
+
+ improved recipes:
+ - Die Zeit subscription
+ - Metro UK
+ - suedeutsche.de
+
+ new recipes:
+ - title: Blues News
+ author: Oskar Kunicki
+
+ - title: "TVXS"
+ author: Hargikas
+
+
- version: 0.8.29
date: 2011-12-02
diff --git a/recipes/adventure_zone_pl.recipe b/recipes/adventure_zone_pl.recipe
index 366b1ccf5a..65f4e3e52d 100644
--- a/recipes/adventure_zone_pl.recipe
+++ b/recipes/adventure_zone_pl.recipe
@@ -1,19 +1,38 @@
from calibre.web.feeds.news import BasicNewsRecipe
-
+import re
class Adventure_zone(BasicNewsRecipe):
title = u'Adventure Zone'
__author__ = 'fenuks'
description = 'Adventure zone - adventure games from A to Z'
category = 'games'
language = 'pl'
- oldest_article = 15
- max_articles_per_feed = 100
no_stylesheets = True
+ oldest_article = 20
+ max_articles_per_feed = 100
+ use_embedded_content=False
+ preprocess_regexps = [(re.compile(r"
Komentarze | ", re.IGNORECASE), lambda m: '')]
remove_tags_before= dict(name='td', attrs={'class':'main-bg'})
- remove_tags_after= dict(name='td', attrs={'class':'main-body middle-border'})
+ remove_tags= [dict(name='img', attrs={'alt':'Drukuj'})]
+ remove_tags_after= dict(id='comments')
extra_css = '.main-bg{text-align: left;} td.capmain{ font-size: 22px; }'
feeds = [(u'Nowinki', u'http://www.adventure-zone.info/fusion/feeds/news.php')]
+ def parse_feeds (self):
+ feeds = BasicNewsRecipe.parse_feeds(self)
+ soup=self.index_to_soup(u'http://www.adventure-zone.info/fusion/feeds/news.php')
+ tag=soup.find(name='channel')
+ titles=[]
+ for r in tag.findAll(name='image'):
+ r.extract()
+ art=tag.findAll(name='item')
+ for i in art:
+ titles.append(i.title.string)
+ for feed in feeds:
+ for article in feed.articles[:]:
+ article.title=titles[feed.articles.index(article)]
+ return feeds
+
+
def get_cover_url(self):
soup = self.index_to_soup('http://www.adventure-zone.info/fusion/news.php')
cover=soup.find(id='box_OstatninumerAZ')
@@ -22,17 +41,10 @@ class Adventure_zone(BasicNewsRecipe):
def skip_ad_pages(self, soup):
- skip_tag = soup.body.findAll(name='a')
- if skip_tag is not None:
- for r in skip_tag:
- if 'articles.php?' in r['href']:
- if r.strong is not None:
- word=r.strong.string
- if ('zapowied' or 'recenzj') in word:
- return self.index_to_soup('http://www.adventure-zone.info/fusion/print.php?type=A&item_id'+r['href'][r['href'].find('_id')+3:], raw=True)
- else:
- None
-
- def print_version(self, url):
- return url.replace('news.php?readmore', 'print.php?type=N&item_id')
-
+ skip_tag = soup.body.find(name='td', attrs={'class':'main-bg'})
+ skip_tag = skip_tag.findAll(name='a')
+ for r in skip_tag:
+ if r.strong:
+ word=r.strong.string
+ if word and (('zapowied' in word) or ('recenzj' in word) or ('solucj' in word)):
+ return self.index_to_soup('http://www.adventure-zone.info/fusion/print.php?type=A&item'+r['href'][r['href'].find('article_id')+7:], raw=True)
\ No newline at end of file
diff --git a/recipes/astro_news_pl.recipe b/recipes/astro_news_pl.recipe
index e5561fc98d..2808fed6e1 100644
--- a/recipes/astro_news_pl.recipe
+++ b/recipes/astro_news_pl.recipe
@@ -1,5 +1,4 @@
from calibre.web.feeds.news import BasicNewsRecipe
-
class AstroNEWS(BasicNewsRecipe):
title = u'AstroNEWS'
__author__ = 'fenuks'
@@ -8,11 +7,16 @@ class AstroNEWS(BasicNewsRecipe):
language = 'pl'
oldest_article = 8
max_articles_per_feed = 100
- auto_cleanup = True
+ #extra_css= 'table {text-align: left;}'
+ no_stylesheets=True
cover_url='http://news.astronet.pl/img/logo_news.jpg'
- # no_stylesheets= True
+ remove_tags=[dict(name='hr')]
feeds = [(u'Wiadomości', u'http://news.astronet.pl/rss.cgi')]
def print_version(self, url):
return url.replace('astronet.pl/', 'astronet.pl/print.cgi?')
+ def preprocess_html(self, soup):
+ for item in soup.findAll(align=True):
+ del item['align']
+ return soup
diff --git a/recipes/berliner_zeitung.recipe b/recipes/berliner_zeitung.recipe
index 6df88835eb..c4190439c7 100644
--- a/recipes/berliner_zeitung.recipe
+++ b/recipes/berliner_zeitung.recipe
@@ -1,61 +1,44 @@
from calibre.web.feeds.recipes import BasicNewsRecipe
-import re
+
+'''Calibre recipe to convert the RSS feeds of the Berliner Zeitung to an ebook.'''
class SportsIllustratedRecipe(BasicNewsRecipe) :
- __author__ = 'ape'
- __copyright__ = 'ape'
+ __author__ = 'a.peter'
+ __copyright__ = 'a.peter'
__license__ = 'GPL v3'
language = 'de'
- description = 'Berliner Zeitung'
- version = 2
+ description = 'Berliner Zeitung RSS'
+ version = 4
title = u'Berliner Zeitung'
timefmt = ' [%d.%m.%Y]'
+ #oldest_article = 7.0
no_stylesheets = True
remove_javascript = True
use_embedded_content = False
publication_type = 'newspaper'
- keep_only_tags = [dict(name='div', attrs={'class':'teaser t_split t_artikel'})]
+ remove_tags_before = dict(name='div', attrs={'class':'newstype'})
+ remove_tags_after = [dict(id='article_text')]
- INDEX = 'http://www.berlinonline.de/berliner-zeitung/'
-
- def parse_index(self):
- base = 'http://www.berlinonline.de'
- answer = []
- articles = {}
- more = 1
-
- soup = self.index_to_soup(self.INDEX)
-
- # Get list of links to ressorts from index page
- ressort_list = soup.findAll('ul', attrs={'class': re.compile('ressortlist')})
- for ressort in ressort_list[0].findAll('a'):
- feed_title = ressort.string
- print 'Analyzing', feed_title
- if not articles.has_key(feed_title):
- articles[feed_title] = []
- answer.append(feed_title)
- # Load ressort page.
- feed = self.index_to_soup('http://www.berlinonline.de' + ressort['href'])
- # find mainbar div which contains the list of all articles
- for article_container in feed.findAll('div', attrs={'class': re.compile('mainbar')}):
- # iterate over all articles
- for article_teaser in article_container.findAll('div', attrs={'class': re.compile('teaser')}):
- # extract title of article
- if article_teaser.h3 != None:
- article = {'title' : article_teaser.h3.a.string, 'date' : u'', 'url' : base + article_teaser.h3.a['href'], 'description' : u''}
- articles[feed_title].append(article)
- else:
- # Skip teasers for missing photos
- if article_teaser.div.p.contents[0].find('Foto:') > -1:
- continue
- article = {'title': 'Weitere Artikel ' + str(more), 'date': u'', 'url': base + article_teaser.div.p.a['href'], 'description': u''}
- articles[feed_title].append(article)
- more += 1
- answer = [[key, articles[key]] for key in answer if articles.has_key(key)]
- return answer
+ feeds = [(u'Startseite', u'http://www.berliner-zeitung.de/home/10808950,10808950,view,asFeed.xml'),
+ (u'Politik', u'http://www.berliner-zeitung.de/home/10808018,10808018,view,asFeed.xml'),
+ (u'Wirtschaft', u'http://www.berliner-zeitung.de/home/10808230,10808230,view,asFeed.xml'),
+ (u'Berlin', u'http://www.berliner-zeitung.de/home/10809148,10809148,view,asFeed.xml'),
+ (u'Brandenburg', u'http://www.berliner-zeitung.de/home/10809312,10809312,view,asFeed.xml'),
+ (u'Wissenschaft', u'http://www.berliner-zeitung.de/home/10808894,10808894,view,asFeed.xml'),
+ (u'Digital', u'http://www.berliner-zeitung.de/home/10808718,10808718,view,asFeed.xml'),
+ (u'Kultur', u'http://www.berliner-zeitung.de/home/10809150,10809150,view,asFeed.xml'),
+ (u'Panorama', u'http://www.berliner-zeitung.de/home/10808334,10808334,view,asFeed.xml'),
+ (u'Sport', u'http://www.berliner-zeitung.de/home/10808794,10808794,view,asFeed.xml'),
+ (u'Hertha', u'http://www.berliner-zeitung.de/home/10808800,10808800,view,asFeed.xml'),
+ (u'Union', u'http://www.berliner-zeitung.de/home/10808802,10808802,view,asFeed.xml'),
+ (u'Verkehr', u'http://www.berliner-zeitung.de/home/10809298,10809298,view,asFeed.xml'),
+ (u'Polizei', u'http://www.berliner-zeitung.de/home/10809296,10809296,view,asFeed.xml'),
+ (u'Meinung', u'http://www.berliner-zeitung.de/home/10808020,10808020,view,asFeed.xml')]
def get_masthead_url(self):
- return 'http://www.berlinonline.de/.img/berliner-zeitung/blz_logo.gif'
+ return 'http://www.berliner-zeitung.de/image/view/10810244,7040611,data,logo.png'
+ def print_version(self, url):
+ return url.replace('.html', ',view,printVersion.html')
diff --git a/recipes/biolog_pl.recipe b/recipes/biolog_pl.recipe
new file mode 100644
index 0000000000..af9ad77e44
--- /dev/null
+++ b/recipes/biolog_pl.recipe
@@ -0,0 +1,19 @@
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+
+from calibre.web.feeds.news import BasicNewsRecipe
+class Biolog_pl(BasicNewsRecipe):
+ title = u'Biolog.pl'
+ oldest_article = 7
+ max_articles_per_feed = 100
+ remove_empty_feeds=True
+ __author__ = 'fenuks'
+ description = u'Przyrodnicze aktualności ze świata nauki (codziennie aktualizowane), kurs biologii, testy i sprawdziany, forum dyskusyjne.'
+ category = 'biology'
+ language = 'pl'
+ cover_url='http://www.biolog.pl/naukowy,portal,biolog.png'
+ no_stylesheets = True
+ #keeps_only_tags=[dict(id='main')]
+ remove_tags_before=dict(id='main')
+ remove_tags_after=dict(name='a', attrs={'name':'komentarze'})
+ remove_tags=[dict(name='img', attrs={'alt':'Komentarze'})]
+ feeds = [(u'Wszystkie', u'http://www.biolog.pl/backend.php'), (u'Medycyna', u'http://www.biolog.pl/medycyna-rss.php'), (u'Ekologia', u'http://www.biolog.pl/rss-ekologia.php'), (u'Genetyka i biotechnologia', u'http://www.biolog.pl/rss-biotechnologia.php'), (u'Botanika', u'http://www.biolog.pl/rss-botanika.php'), (u'Le\u015bnictwo', u'http://www.biolog.pl/rss-lesnictwo.php'), (u'Zoologia', u'http://www.biolog.pl/rss-zoologia.php')]
diff --git a/recipes/blues.recipe b/recipes/blues.recipe
new file mode 100644
index 0000000000..a7db8375f8
--- /dev/null
+++ b/recipes/blues.recipe
@@ -0,0 +1,26 @@
+__license__ = 'GPL v3'
+__copyright__ = '2011, Oskar Kunicki '
+'''
+Changelog:
+2011-11-27
+News from BluesRSS.info
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class BluesRSS(BasicNewsRecipe):
+ title = 'Blues News'
+ __author__ = 'Oskar Kunicki'
+ description ='Blues news from around the world'
+ publisher = 'BluesRSS.info'
+ category = 'news, blues, USA,UK'
+ oldest_article = 5
+ max_articles_per_feed = 100
+ language = 'en'
+ cover_url = 'http://bluesrss.info/cover.jpg'
+ masthead_url = 'http://bluesrss.info/cover.jpg'
+ no_stylesheets = True
+
+ remove_tags = [dict(name='div', attrs={'class':'wp-pagenavi'})]
+
+ feeds = [(u'News', u'http://bluesrss.info/feed/')]
diff --git a/recipes/cnd.recipe b/recipes/cnd.recipe
index 0e8206d07a..c99166730c 100644
--- a/recipes/cnd.recipe
+++ b/recipes/cnd.recipe
@@ -23,7 +23,9 @@ class TheCND(BasicNewsRecipe):
remove_tags = [dict(name='table', attrs={'align':'right'}), dict(name='img', attrs={'src':'http://my.cnd.org/images/logo.gif'}), dict(name='hr', attrs={}), dict(name='small', attrs={})]
no_stylesheets = True
- preprocess_regexps = [(re.compile(r'', re.DOTALL), lambda m: '')]
+ preprocess_regexps = [ (re.compile(r'', re.DOTALL), lambda m: ''),
+ (re.compile('', re.DOTALL), lambda m: ''),
+ ]
def print_version(self, url):
if url.find('news/article.php') >= 0:
@@ -46,16 +48,18 @@ class TheCND(BasicNewsRecipe):
title = self.tag_to_string(a)
self.log('\tFound article: ', title, 'at', url)
date = a.nextSibling
+ if re.search('cm', date):
+ continue
if (date is not None) and len(date)>2:
if not articles.has_key(date):
articles[date] = []
articles[date].append({'title':title, 'url':url, 'description': '', 'date':''})
self.log('\t\tAppend to : ', date)
- self.log('log articles', articles)
+ #self.log('log articles', articles)
mostCurrent = sorted(articles).pop()
- self.title = 'CND ' + mostCurrent
-
+ self.title = 'CND ' + mostCurrent
+
feeds.append((self.title, articles[mostCurrent]))
return feeds
diff --git a/recipes/cnd_weekly.recipe b/recipes/cnd_weekly.recipe
new file mode 100644
index 0000000000..21839ae110
--- /dev/null
+++ b/recipes/cnd_weekly.recipe
@@ -0,0 +1,72 @@
+#!/usr/bin/env python
+
+__license__ = 'GPL v3'
+__copyright__ = '2010, Derek Liang '
+'''
+cnd.org
+'''
+import re
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class TheCND(BasicNewsRecipe):
+
+ title = 'CND Weekly'
+ __author__ = 'Derek Liang'
+ description = ''
+ INDEX = 'http://cnd.org'
+ language = 'zh'
+ conversion_options = {'linearize_tables':True}
+
+ remove_tags_before = dict(name='div', id='articleHead')
+ remove_tags_after = dict(id='copyright')
+ remove_tags = [dict(name='table', attrs={'align':'right'}), dict(name='img', attrs={'src':'http://my.cnd.org/images/logo.gif'}), dict(name='hr', attrs={}), dict(name='small', attrs={})]
+ no_stylesheets = True
+
+ preprocess_regexps = [ (re.compile(r'', re.DOTALL), lambda m: ''),
+ (re.compile('', re.DOTALL), lambda m: ''),
+ ]
+
+ def print_version(self, url):
+ if url.find('news/article.php') >= 0:
+ return re.sub("^[^=]*", "http://my.cnd.org/modules/news/print.php?storyid", url)
+ else:
+ return re.sub("^[^=]*", "http://my.cnd.org/modules/wfsection/print.php?articleid", url)
+
+ def parse_index(self):
+ soup = self.index_to_soup(self.INDEX)
+
+ feeds = []
+ articles = {}
+
+ for a in soup.findAll('a', attrs={'target':'_cnd'}):
+ url = a['href']
+ if url.find('article.php') < 0 :
+ continue
+ if url.startswith('/'):
+ url = 'http://cnd.org'+url
+ title = self.tag_to_string(a)
+ date = a.nextSibling
+ if not re.search('cm', date):
+ continue
+ self.log('\tFound article: ', title, 'at', url, '@', date)
+ if (date is not None) and len(date)>2:
+ if not articles.has_key(date):
+ articles[date] = []
+ articles[date].append({'title':title, 'url':url, 'description': '', 'date':''})
+ self.log('\t\tAppend to : ', date)
+
+
+ sorted_articles = sorted(articles)
+ while sorted_articles:
+ mostCurrent = sorted_articles.pop()
+ self.title = 'CND ' + mostCurrent
+ feeds.append((self.title, articles[mostCurrent]))
+
+ return feeds
+
+ def populate_article_metadata(self, article, soup, first):
+ header = soup.find('h3')
+ self.log('header: ' + self.tag_to_string(header))
+ pass
+
diff --git a/recipes/computerworld_pl.recipe b/recipes/computerworld_pl.recipe
new file mode 100644
index 0000000000..90b7d63c56
--- /dev/null
+++ b/recipes/computerworld_pl.recipe
@@ -0,0 +1,22 @@
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+
+from calibre.web.feeds.news import BasicNewsRecipe
+class Computerworld_pl(BasicNewsRecipe):
+ title = u'Computerworld.pl'
+ __author__ = 'fenuks'
+ description = u'Serwis o IT w przemyśle, finansach, handlu, administracji oraz rynku IT i telekomunikacyjnym - wiadomości, opinie, analizy, porady prawne'
+ category = 'IT'
+ language = 'pl'
+ no_stylesheets=True
+ oldest_article = 7
+ max_articles_per_feed = 100
+ keep_only_tags=[dict(name='div', attrs={'id':'s'})]
+ remove_tags_after=dict(name='div', attrs={'class':'rMobi'})
+ remove_tags=[dict(name='div', attrs={'class':['nnav', 'rMobi']}), dict(name='table', attrs={'class':'ramka_slx'})]
+ feeds = [(u'Wiadomo\u015bci', u'http://rssout.idg.pl/cw/news_iso.xml')]
+
+ def get_cover_url(self):
+ soup = self.index_to_soup('http://www.computerworld.pl/')
+ cover=soup.find(name='img', attrs={'class':'prawo'})
+ self.cover_url=cover['src']
+ return getattr(self, 'cover_url', self.cover_url)
diff --git a/recipes/datasport.recipe b/recipes/datasport.recipe
new file mode 100644
index 0000000000..235b262d48
--- /dev/null
+++ b/recipes/datasport.recipe
@@ -0,0 +1,15 @@
+__license__ = 'GPL v3'
+__author__ = 'faber1971'
+description = 'Italian soccer news website - v1.00 (17, December 2011)'
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class AdvancedUserRecipe1324114272(BasicNewsRecipe):
+ title = u'Datasport'
+ language = 'it'
+ __author__ = 'faber1971'
+ oldest_article = 1
+ max_articles_per_feed = 100
+ auto_cleanup = True
+
+ feeds = [(u'Datasport', u'http://www.datasport.it/calcio/rss.xml')]
diff --git a/recipes/descopera_org.recipe b/recipes/descopera_org.recipe
new file mode 100644
index 0000000000..c24046da12
--- /dev/null
+++ b/recipes/descopera_org.recipe
@@ -0,0 +1,27 @@
+# -*- coding: utf-8 -*-
+'''
+descopera.org
+'''
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class Descopera(BasicNewsRecipe):
+ title = u'Descoperă.org'
+ __author__ = 'Marius Ignătescu'
+ description = 'Descoperă. Placerea de a cunoaște'
+ publisher = 'descopera.org'
+ category = 'science, technology, culture, history, earth'
+ language = 'ro'
+ oldest_article = 14
+ max_articles_per_feed = 100
+ encoding = 'utf8'
+ no_stylesheets = True
+ extra_css = ' body{ font-family: Verdana,Helvetica,Arial,sans-serif } .introduction{font-weight: bold} .story-feature{display: block; padding: 0; border: 1px solid; width: 40%; font-size: small} .story-feature h2{text-align: center; text-transform: uppercase} '
+ keep_only_tags = [dict(name='div', attrs={'class':['post']})]
+ remove_tags = [dict(name='div', attrs={'class':['topnav', 'box_a', 'shr-bookmarks shr-bookmarks-expand shr-bookmarks-center shr-bookmarks-bg-knowledge']})]
+ remove_attributes = ['width','height']
+ cover_url = 'http://www.descopera.org/wp-content/themes/dorg/styles/default/img/b_top.png?width=400'
+ feeds = [(u'Articles', u'http://www.descopera.org/feed/')]
+
+ def preprocess_html(self, soup):
+ return self.adeify_images(soup)
diff --git a/recipes/dziennik_pl.recipe b/recipes/dziennik_pl.recipe
new file mode 100644
index 0000000000..b5453659ef
--- /dev/null
+++ b/recipes/dziennik_pl.recipe
@@ -0,0 +1,58 @@
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+
+from calibre.web.feeds.news import BasicNewsRecipe
+import re
+class Dziennik_pl(BasicNewsRecipe):
+ title = u'Dziennik.pl'
+ __author__ = 'fenuks'
+ description = u'Wiadomości z kraju i ze świata. Wiadomości gospodarcze. Znajdziesz u nas informacje, wydarzenia, komentarze, opinie.'
+ category = 'newspaper'
+ language = 'pl'
+ cover_url='http://6.s.dziennik.pl/images/og_dziennik.jpg'
+ no_stylesheets = True
+ oldest_article = 7
+ max_articles_per_feed = 100
+ remove_javascript=True
+ remove_empty_feeds=True
+ preprocess_regexps = [(re.compile("Komentarze:"), lambda m: '')]
+ keep_only_tags=[dict(id='article')]
+ remove_tags=[dict(name='div', attrs={'class':['art_box_dodatki', 'new_facebook_icons2', 'leftArt', 'article_print', 'quiz-widget']}), dict(name='a', attrs={'class':'komentarz'})]
+ feeds = [(u'Wszystko', u'http://rss.dziennik.pl/Dziennik-PL/'),
+ (u'Wiadomości', u'http://rss.dziennik.pl/Dziennik-Wiadomosci'),
+ (u'Gospodarka', u'http://rss.dziennik.pl/Dziennik-Gospodarka'),
+ (u'Kobieta', u'http://rss.dziennik.pl/Dziennik-Kobieta'),
+ (u'Auto', u'http://rss.dziennik.pl/Dziennik-Auto'),
+ (u'Rozrywka', u'http://rss.dziennik.pl/Dziennik-Rozrywka'),
+ (u'Film', u'http://rss.dziennik.pl/Dziennik-Film'),
+ (u'Muzyka' , u'http://rss.dziennik.pl/Dziennik-Muzyka'),
+ (u'Kultura', u'http://rss.dziennik.pl/Dziennik-Kultura'),
+ (u'Nauka', u'http://rss.dziennik.pl/Dziennik-Nauka'),
+ (u'Podróże', u'http://rss.dziennik.pl/Dziennik-Podroze/'),
+ (u'Nieruchomości', u'http://rss.dziennik.pl/Dziennik-Nieruchomosci')]
+
+ def append_page(self, soup, appendtag):
+ tag=soup.find('a', attrs={'class':'page_next'})
+ if tag:
+ appendtag.find('div', attrs={'class':'article_paginator'}).extract()
+ while tag:
+ soup2= self.index_to_soup(tag['href'])
+ tag=soup2.find('a', attrs={'class':'page_next'})
+ if not tag:
+ for r in appendtag.findAll('div', attrs={'class':'art_src'}):
+ r.extract()
+ pagetext = soup2.find(name='div', attrs={'class':'article_body'})
+ for dictionary in self.remove_tags:
+ v=pagetext.findAll(name=dictionary['name'], attrs=dictionary['attrs'])
+ for delete in v:
+ delete.extract()
+ pos = len(appendtag.contents)
+ appendtag.insert(pos, pagetext)
+ if appendtag.find('div', attrs={'class':'article_paginator'}):
+ appendtag.find('div', attrs={'class':'article_paginator'}).extract()
+
+
+
+
+ def preprocess_html(self, soup):
+ self.append_page(soup, soup.body)
+ return soup
diff --git a/recipes/emuzica_pl.recipe b/recipes/emuzica_pl.recipe
new file mode 100644
index 0000000000..75271c510a
--- /dev/null
+++ b/recipes/emuzica_pl.recipe
@@ -0,0 +1,16 @@
+# vim:fileencoding=UTF-8:ts=4:sw=4:sta:et:sts=4:ai
+
+from calibre.web.feeds.news import BasicNewsRecipe
+class eMuzyka(BasicNewsRecipe):
+ title = u'eMuzyka'
+ __author__ = 'fenuks'
+ description = u'Emuzyka to największa i najpopularniejsza strona o muzyce w Polsce'
+ category = 'music'
+ language = 'pl'
+ cover_url='http://s.emuzyka.pl/img/emuzyka_invert_small.jpg'
+ no_stylesheets = True
+ oldest_article = 7
+ max_articles_per_feed = 100
+ keep_only_tags=[dict(name='div', attrs={'id':'news_container'}), dict(name='h3'), dict(name='div', attrs={'class':'review_text'})]
+ remove_tags=[dict(name='span', attrs={'id':'date'})]
+ feeds = [(u'Aktualno\u015bci', u'http://www.emuzyka.pl/rss.php?f=1'), (u'Recenzje', u'http://www.emuzyka.pl/rss.php?f=2')]
diff --git a/recipes/fisco_oggi.recipe b/recipes/fisco_oggi.recipe
new file mode 100644
index 0000000000..2b99441a89
--- /dev/null
+++ b/recipes/fisco_oggi.recipe
@@ -0,0 +1,18 @@
+__license__ = 'GPL v3'
+__author__ = 'faber1971'
+description = 'Website of Italian Governament Income Agency (about revenue, taxation, taxes)- v1.00 (17, December 2011)'
+
+from calibre.web.feeds.news import BasicNewsRecipe
+
+class AdvancedUserRecipe1324112023(BasicNewsRecipe):
+ title = u'Fisco Oggi'
+ language = 'it'
+ __author__ = 'faber1971'
+ oldest_article = 7
+ max_articles_per_feed = 100
+ auto_cleanup = True
+ remove_javascript = True
+ no_stylesheets = True
+
+ feeds = [(u'Attualit\xe0', u'http://www.fiscooggi.it/taxonomy/term/1/feed'), (u'Normativa', u'http://www.fiscooggi.it/taxonomy/term/5/feed'), (u'Giurisprudenza', u'http://www.fiscooggi.it/taxonomy/term/8/feed'), (u'Dati e statistiche', u'http://www.fiscooggi.it/taxonomy/term/12/feed'), (u'Analisi e commenti', u'http://www.fiscooggi.it/taxonomy/term/13/feed'), (u'Bilancio e contabilit\xe0', u'http://www.fiscooggi.it/taxonomy/term/576/feed'), (u'Dalle regioni', u'http://www.fiscooggi.it/taxonomy/term/16/feed'), (u'Dal mondo', u'http://www.fiscooggi.it/taxonomy/term/17/feed')]
+
diff --git a/recipes/focus_pl.recipe b/recipes/focus_pl.recipe
index d63af135bc..342aa0d2db 100644
--- a/recipes/focus_pl.recipe
+++ b/recipes/focus_pl.recipe
@@ -1,57 +1,68 @@
-# -*- coding: utf-8 -*-
+import re
+
from calibre.web.feeds.news import BasicNewsRecipe
-class Focus_pl(BasicNewsRecipe):
- title = u'Focus.pl'
- oldest_article = 15
- max_articles_per_feed = 100
- __author__ = 'fenuks'
- language = 'pl'
- description ='polish scientific monthly magazine'
+class FocusRecipe(BasicNewsRecipe):
+ __license__ = 'GPL v3'
+ __author__ = u'intromatyk '
+ language = 'pl'
+ version = 1
+
+ title = u'Focus'
+ publisher = u'Gruner + Jahr Polska'
+ category = u'News'
+ description = u'Newspaper'
category='magazine'
cover_url=''
remove_empty_feeds= True
no_stylesheets=True
- remove_tags_before=dict(name='div', attrs={'class':'h2 h2f'})
- remove_tags_after=dict(name='div', attrs={'class':'clear'})
- feeds = [(u'Wszystkie kategorie', u'http://focus.pl.feedsportal.com/c/32992/f/532692/index.rss'),
- (u'Nauka', u'http://focus.pl.feedsportal.com/c/32992/f/532693/index.rss'),
- (u'Historia', u'http://focus.pl.feedsportal.com/c/32992/f/532694/index.rss'),
- (u'Cywilizacja', u'http://focus.pl.feedsportal.com/c/32992/f/532695/index.rss'),
- (u'Sport', u'http://focus.pl.feedsportal.com/c/32992/f/532696/index.rss'),
- (u'Technika', u'http://focus.pl.feedsportal.com/c/32992/f/532697/index.rss'),
- (u'Przyroda', u'http://focus.pl.feedsportal.com/c/32992/f/532698/index.rss'),
- (u'Technologie', u'http://focus.pl.feedsportal.com/c/32992/f/532699/index.rss'),
- (u'Warto wiedzieć', u'http://focus.pl.feedsportal.com/c/32992/f/532700/index.rss'),
+ oldest_article = 7
+ max_articles_per_feed = 100000
+ recursions = 0
+
+ no_stylesheets = True
+ remove_javascript = True
+ encoding = 'utf-8'
+ # Seems to work best, but YMMV
+ simultaneous_downloads = 5
+
+ r = re.compile('.*(?Phttp:\/\/(www.focus.pl)|(rss.feedsportal.com\/c)\/.*\.html?).*')
+ keep_only_tags =[]
+ keep_only_tags.append(dict(name = 'div', attrs = {'id' : 'cll'}))
+
+ remove_tags =[]
+ remove_tags.append(dict(name = 'div', attrs = {'class' : 'ulm noprint'}))
+ remove_tags.append(dict(name = 'div', attrs = {'class' : 'txb'}))
+ remove_tags.append(dict(name = 'div', attrs = {'class' : 'h2'}))
+ remove_tags.append(dict(name = 'ul', attrs = {'class' : 'txu'}))
+ remove_tags.append(dict(name = 'div', attrs = {'class' : 'ulc'}))
+
+ extra_css = '''
+ body {font-family: verdana, arial, helvetica, geneva, sans-serif ;}
+ h1{text-align: left;}
+ h2{font-size: medium; font-weight: bold;}
+ p.lead {font-weight: bold; text-align: left;}
+ .authordate {font-size: small; color: #696969;}
+ .fot{font-size: x-small; color: #666666;}
+ '''
-
-]
+ feeds = [
+ ('Nauka', 'http://focus.pl.feedsportal.com/c/32992/f/532693/index.rss'),
+ ('Historia', 'http://focus.pl.feedsportal.com/c/32992/f/532694/index.rss'),
+ ('Cywilizacja', 'http://focus.pl.feedsportal.com/c/32992/f/532695/index.rss'),
+ ('Sport', 'http://focus.pl.feedsportal.com/c/32992/f/532696/index.rss'),
+ ('Technika', 'http://focus.pl.feedsportal.com/c/32992/f/532697/index.rss'),
+ ('Przyroda', 'http://focus.pl.feedsportal.com/c/32992/f/532698/index.rss'),
+ ('Technologie', 'http://focus.pl.feedsportal.com/c/32992/f/532699/index.rss'),
+ ]
def skip_ad_pages(self, soup):
- tag=soup.find(name='a')
- if tag:
- new_soup=self.index_to_soup(tag['href']+ 'do-druku/1/', raw=True)
- return new_soup
-
- def append_page(self, appendtag):
- tag=appendtag.find(name='div', attrs={'class':'arrows'})
- if tag:
- nexturl='http://www.focus.pl/'+tag.a['href']
- for rem in appendtag.findAll(name='div', attrs={'class':'klik-nav'}):
- rem.extract()
- while nexturl:
- soup2=self.index_to_soup(nexturl)
- nexturl=None
- pagetext=soup2.find(name='div', attrs={'class':'txt'})
- tag=pagetext.find(name='div', attrs={'class':'arrows'})
- for r in tag.findAll(name='a'):
- if u'Następne' in r.string:
- nexturl='http://www.focus.pl/'+r['href']
- for rem in pagetext.findAll(name='div', attrs={'class':'klik-nav'}):
- rem.extract()
- pos = len(appendtag.contents)
- appendtag.insert(pos, pagetext)
+ if ('advertisement' in soup.find('title').string.lower()):
+ href = soup.find('a').get('href')
+ return self.index_to_soup(href, raw=True)
+ else:
+ return None
def get_cover_url(self):
soup=self.index_to_soup('http://www.focus.pl/magazyn/')
@@ -60,7 +71,14 @@ class Focus_pl(BasicNewsRecipe):
self.cover_url='http://www.focus.pl/' + tag.a['href']
return getattr(self, 'cover_url', self.cover_url)
-
- def preprocess_html(self, soup):
- self.append_page(soup.body)
- return soup
+ def print_version(self, url):
+ if url.count ('focus.pl.feedsportal.com'):
+ u = url.find('focus0Bpl')
+ u = 'http://www.focus.pl/' + url[u + 11:]
+ u = u.replace('0C', '/')
+ u = u.replace('A', '')
+ u = u.replace ('0E','-')
+ u = u.replace('/nc/1//story01.htm', '/do-druku/1')
+ else:
+ u = url.replace('/nc/1','/do-druku/1')
+ return u
\ No newline at end of file
diff --git a/recipes/globe_and_mail.recipe b/recipes/globe_and_mail.recipe
index 03061a2329..a7c78887c5 100644
--- a/recipes/globe_and_mail.recipe
+++ b/recipes/globe_and_mail.recipe
@@ -51,6 +51,13 @@ class AdvancedUserRecipe1287083651(BasicNewsRecipe):
{'class':['articleTools', 'pagination', 'Ads', 'topad',
'breadcrumbs', 'footerNav', 'footerUtil', 'downloadlinks']}]
+ def populate_article_metadata(self, article, soup, first):
+ if first and hasattr(self, 'add_toc_thumbnail'):
+ picdiv = soup.find('img')
+ if picdiv is not None:
+ self.add_toc_thumbnail(article,picdiv['src'])
+
+
#Use the mobile version rather than the web version
def print_version(self, url):
return url.rpartition('?')[0] + '?service=mobile'
diff --git a/recipes/guardian.recipe b/recipes/guardian.recipe
index 840e8302af..8bff4f9be8 100644
--- a/recipes/guardian.recipe
+++ b/recipes/guardian.recipe
@@ -79,6 +79,12 @@ class Guardian(BasicNewsRecipe):
url = None
return url
+ def populate_article_metadata(self, article, soup, first):
+ if first and hasattr(self, 'add_toc_thumbnail'):
+ picdiv = soup.find('img')
+ if picdiv is not None:
+ self.add_toc_thumbnail(article,picdiv['src'])
+
def preprocess_html(self, soup):
# multiple html sections in soup, useful stuff in the first
diff --git a/recipes/hackernews.recipe b/recipes/hackernews.recipe
index 8d80008467..dca984eae5 100644
--- a/recipes/hackernews.recipe
+++ b/recipes/hackernews.recipe
@@ -9,9 +9,9 @@ from calibre.ptempfile import PersistentTemporaryFile
from urlparse import urlparse
import re
-class HackerNews(BasicNewsRecipe):
- title = 'Hacker News'
- __author__ = 'Tom Scholl'
+class HNWithCommentsLink(BasicNewsRecipe):
+ title = 'HN With Comments Link'
+ __author__ = 'Tom Scholl & David Kerschner'
description = u'Hacker News, run by Y Combinator. Anything that good hackers would find interesting, with a focus on programming and startups.'
publisher = 'Y Combinator'
category = 'news, programming, it, technology'
@@ -80,6 +80,11 @@ class HackerNews(BasicNewsRecipe):
body = body + comments
return u'' + title + u'' + body + ''
+ def parse_feeds(self):
+ a = super(HNWithCommentsLink, self).parse_feeds()
+ self.hn_articles = a[0].articles
+ return a
+
def get_obfuscated_article(self, url):
if url.startswith('http://news.ycombinator.com'):
content = self.get_hn_content(url)
@@ -97,6 +102,13 @@ class HackerNews(BasicNewsRecipe):
else:
content = self.get_readable_content(url)
+ article = 0
+ for a in self.hn_articles:
+ if a.url == url:
+ article = a
+
+ content = re.sub(r'